This article collects typical usage examples of the Python method pyspark.sql.types.StringType. If you are unsure what types.StringType does or how to use it, the curated code samples below may help; you can also explore other members of the pyspark.sql.types module.
The following shows 15 code examples that use types.StringType, ordered by popularity by default.
Example 1: main
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StringType [as alias]
def main():
    temp_schema = StructType([
        StructField('StationID', StringType(), False),
        StructField('DateTime', StringType(), False),
        StructField('Observation', StringType(), False),
        StructField('DataValue', DoubleType(), False),
        StructField('MFlag', StringType(), True),
        StructField('QFlag', StringType(), True),
        StructField('SFlag', StringType(), True),
        StructField('OBSTime', StringType(), True),
    ])
    df = sqlContext.read.format('com.databricks.spark.csv').options(header='false').load(inputs1, schema=temp_schema)
    df = df.filter(df.QFlag == '')
    dfrange = get_range(df)
    result = dfrange.rdd.map(lambda r: str(r.DateTime) + ' ' + str(r.StationID) + ' ' + str(r.MaxRange))
    outdata = result.sortBy(lambda r: r[0]).coalesce(1)
    outdata.saveAsTextFile(output)
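The example references several module-level names (sqlContext, inputs1, output, and a get_range helper) that are defined elsewhere in the original script. A minimal sketch of the assumed surrounding setup, purely illustrative:

# Assumed setup (not shown in the original example): input/output locations come
# from the command line and a SQLContext is created at module level.
import sys
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

inputs1 = sys.argv[1]   # hypothetical: directory of weather-observation CSV files
output = sys.argv[2]    # hypothetical: output directory
sc = SparkContext()
sqlContext = SQLContext(sc)
# get_range(df) is assumed to return a DataFrame with DateTime, StationID and MaxRange columns.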
Example 2: _decodeOutputAsPredictions
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StringType [as alias]
def _decodeOutputAsPredictions(self, df):
    # If we start having different weights than imagenet, we'll need to
    # move this logic to individual model building in NamedImageTransformer.
    # Also, we could put the computation directly in the main computation
    # graph or use a scala UDF for potentially better performance.
    topK = self.getOrDefault(self.topK)

    def decode(predictions):
        pred_arr = np.expand_dims(np.array(predictions), axis=0)
        decoded = decode_predictions(pred_arr, top=topK)[0]
        # convert numpy dtypes to python native types
        return [(t[0], t[1], t[2].item()) for t in decoded]

    decodedSchema = ArrayType(
        StructType([
            StructField("class", StringType(), False),
            StructField("description", StringType(), False),
            StructField("probability", FloatType(), False)
        ]))
    decodeUDF = udf(decode, decodedSchema)
    interim_output = self._getIntermediateOutputCol()
    return df \
        .withColumn(self.getOutputCol(), decodeUDF(df[interim_output])) \
        .drop(interim_output)
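The snippet assumes names defined elsewhere in its module: np, decode_predictions, udf, and the pyspark type classes. A hedged sketch of the imports it likely relies on (treating decode_predictions as the Keras ImageNet helper is an assumption):

import numpy as np
from keras.applications.imagenet_utils import decode_predictions  # assumption: Keras ImageNet decoder
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, FloatType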
Example 3: filesToDF
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StringType [as alias]
def filesToDF(sc, path, numPartitions=None):
    """
    Read files from a directory to a DataFrame.

    :param sc: SparkContext.
    :param path: str, path to files.
    :param numPartitions: int, number of partitions to use for reading files.
    :return: DataFrame, with columns: (filePath: str, fileData: BinaryType)
    """
    numPartitions = numPartitions or sc.defaultParallelism
    schema = StructType([StructField("filePath", StringType(), False),
                         StructField("fileData", BinaryType(), False)])
    rdd = sc.binaryFiles(
        path, minPartitions=numPartitions).repartition(numPartitions)
    rdd = rdd.map(lambda x: (x[0], bytearray(x[1])))
    return rdd.toDF(schema)
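A brief usage sketch, assuming an active SparkContext named sc and a hypothetical directory of binary files:

# Hypothetical usage: load raw file bytes into a two-column DataFrame.
binary_df = filesToDF(sc, "/data/images", numPartitions=16)
binary_df.printSchema()  # filePath: string, fileData: binary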
Example 4: test_as_spark_schema
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StringType [as alias]
def test_as_spark_schema():
    """Try using 'as_spark_schema' function"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('string_field_implicit', np.string_, ()),
    ])

    spark_schema = TestSchema.as_spark_schema()
    assert spark_schema.fields[0].name == 'int_field'
    assert spark_schema.fields[1].name == 'string_field'
    assert spark_schema.fields[1].dataType == StringType()
    assert spark_schema.fields[2].name == 'string_field_implicit'
    assert spark_schema.fields[2].dataType == StringType()

    assert TestSchema.fields['int_field'].name == 'int_field'
    assert TestSchema.fields['string_field'].name == 'string_field'
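This test comes from a codebase built on petastorm; a hedged sketch of the imports it assumes:

import numpy as np
from pyspark.sql.types import IntegerType, StringType
from petastorm.codecs import ScalarCodec            # assumption: petastorm's scalar codec
from petastorm.unischema import Unischema, UnischemaField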
Example 5: main
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StringType [as alias]
def main():
    schema = StructType([
        StructField('subreddit', StringType(), False),
        StructField('score', IntegerType(), False),
    ])
    inputs = sqlContext.read.json(inputs1, schema=schema)

    # Uncomment this when the schema is not provided
    # inputs = sqlContext.read.json(inputs1)

    # Uncomment these when there are 2 input dirs
    # comments_input1 = sqlContext.read.json(inputs1, schema=schema)
    # comments_input2 = sqlContext.read.json(inputs2, schema=schema)
    # inputs = comments_input1.unionAll(comments_input2)

    df = get_avg(inputs)
    df.write.save(output, format='json', mode='overwrite')
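get_avg is defined elsewhere in the original script. A minimal sketch of what it plausibly computes (the average score per subreddit); the implementation below is an assumption, not the author's code:

from pyspark.sql import functions as F

def get_avg(comments):
    # Hypothetical implementation: mean score grouped by subreddit.
    return comments.groupBy('subreddit').agg(F.avg('score').alias('average_score'))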
Example 6: to_date
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StringType [as alias]
def to_date(col, format=None):
    """Converts a :class:`Column` of :class:`pyspark.sql.types.StringType` or
    :class:`pyspark.sql.types.TimestampType` into :class:`pyspark.sql.types.DateType`
    using the optionally specified format. Specify formats according to
    `SimpleDateFormats <http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html>`_.
    By default, it follows casting rules to :class:`pyspark.sql.types.DateType` if the format
    is omitted (equivalent to ``col.cast("date")``).

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_date(df.t).alias('date')).collect()
    [Row(date=datetime.date(1997, 2, 28))]

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_date(df.t, 'yyyy-MM-dd HH:mm:ss').alias('date')).collect()
    [Row(date=datetime.date(1997, 2, 28))]
    """
    sc = SparkContext._active_spark_context
    if format is None:
        jc = sc._jvm.functions.to_date(_to_java_column(col))
    else:
        jc = sc._jvm.functions.to_date(_to_java_column(col), format)
    return Column(jc)
Example 7: to_timestamp
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StringType [as alias]
def to_timestamp(col, format=None):
    """Converts a :class:`Column` of :class:`pyspark.sql.types.StringType` or
    :class:`pyspark.sql.types.TimestampType` into :class:`pyspark.sql.types.TimestampType`
    using the optionally specified format. Specify formats according to
    `SimpleDateFormats <http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html>`_.
    By default, it follows casting rules to :class:`pyspark.sql.types.TimestampType` if the format
    is omitted (equivalent to ``col.cast("timestamp")``).

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_timestamp(df.t).alias('dt')).collect()
    [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_timestamp(df.t, 'yyyy-MM-dd HH:mm:ss').alias('dt')).collect()
    [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]
    """
    sc = SparkContext._active_spark_context
    if format is None:
        jc = sc._jvm.functions.to_timestamp(_to_java_column(col))
    else:
        jc = sc._jvm.functions.to_timestamp(_to_java_column(col), format)
    return Column(jc)
Example 8: locate
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StringType [as alias]
def locate(substr, str, pos=1):
    """
    Locate the position of the first occurrence of substr in a string column, after position pos.

    .. note:: The position is not zero based, but 1 based index. Returns 0 if substr
        could not be found in str.

    :param substr: a string
    :param str: a Column of :class:`pyspark.sql.types.StringType`
    :param pos: start position (1 based)

    >>> df = spark.createDataFrame([('abcd',)], ['s',])
    >>> df.select(locate('b', df.s, 1).alias('s')).collect()
    [Row(s=2)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.locate(substr, _to_java_column(str), pos))
Example 9: test_smvArrayFlatten
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StringType [as alias]
def test_smvArrayFlatten(self):
    df = self.createDF('a:String;b:String;c:String', ',,;1,2,;2,3,4')
    df1 = df.select(F.array(
        F.array(F.lit(None), F.col('a')),
        F.array(F.col('a'), F.col('b'), F.col('c'))
    ).alias('aa'))

    res1 = df1.select(F.col('aa').smvArrayFlatten(StringType()).alias('a'))\
        .select(SF.smvArrayCat('|', F.col('a')).alias('k'))

    exp = self.createDF("k: String",
                        """||||;
                        |1|1|2|;
                        |2|2|3|4""")

    res2 = df1.select(F.col('aa').smvArrayFlatten(df1).alias('a'))\
        .select(SF.smvArrayCat('|', F.col('a')).alias('k'))

    self.should_be_same(res1, exp)
    self.should_be_same(res2, exp)
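createDF and should_be_same come from the SMV test harness, while SF is the project's own column-function module; only the pyspark names are standard. A hedged sketch of the imports the test body uses directly (an assumption about the surrounding module):

from pyspark.sql import functions as F
from pyspark.sql.types import StringType
# SF is assumed to be the project's own functions module, which provides
# smvArrayCat and registers the smvArrayFlatten Column extension used above.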
Example 10: register_udfs
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StringType [as alias]
def register_udfs(self, sess, sc):
    """Register UDFs to be used in SQL queries.

    :type sess: `pyspark.sql.SparkSession`
    :param sess: Session used in Spark for SQL queries.

    :type sc: `pyspark.SparkContext`
    :param sc: Spark Context to run Spark jobs.
    """
    sess.udf.register("SQUARED", self.squared, returnType=(
        stypes.ArrayType(stypes.StructType(
            fields=[stypes.StructField('sku0', stypes.StringType()),
                    stypes.StructField('norm', stypes.FloatType())]))))

    sess.udf.register('INTERSECTIONS', self.process_intersections,
                      returnType=stypes.ArrayType(stypes.StructType(fields=[
                          stypes.StructField('sku0', stypes.StringType()),
                          stypes.StructField('sku1', stypes.StringType()),
                          stypes.StructField('cor', stypes.FloatType())])))
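Once registered, the UDFs can be called by name from SQL. A hedged usage sketch; the table name, column names, and UDF arguments are hypothetical, since the original methods' signatures are not shown:

# Hypothetical query invoking a registered UDF from SQL.
pairs = sess.sql("SELECT INTERSECTIONS(skus_a, skus_b) AS pairs FROM sessions")
pairs.show(truncate=False)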
Example 11: read_groundtruth
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StringType [as alias]
def read_groundtruth(self):
    """
    Create a dataframe from the ground truth csv file.

    Uses the full path name of the csv file and the
    spark_session held by this object.
    """
    filereader = Reader(self.spark_session)

    groundtruth_schema = StructType([
        StructField("tid", IntegerType(), False),
        StructField("attr_name", StringType(), False),
        StructField("attr_val", StringType(), False)])

    self.ground_truth_flat = filereader.read(self.path_to_grand_truth, 0,
                                             groundtruth_schema).\
        drop(GlobalVariables.index_name)

    self.dataengine.add_db_table(
        'Groundtruth', self.ground_truth_flat, self.dataset)
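Reader and GlobalVariables are project-specific classes; only the schema types come from pyspark. A hedged sketch of the standard imports this method needs (the rest is an assumption about the surrounding project):

from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Reader and GlobalVariables are assumed to come from the surrounding project's
# I/O and configuration modules, respectively.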
Example 12: _join_results_multi
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StringType [as alias]
def _join_results_multi(self, scaffolds_df, sampled_df):
    def _join_scaffold(scaff, dec):
        mol = usc.join(scaff, dec)
        if mol:
            return usc.to_smiles(mol)

    def _format_attachment_point(smi, num):
        smi = usc.add_first_attachment_point_number(smi, num)
        return usc.to_smiles(uc.to_mol(smi))  # canonicalize

    join_scaffold_udf = psf.udf(_join_scaffold, pst.StringType())
    format_attachment_point_udf = psf.udf(_format_attachment_point, pst.StringType())

    return sampled_df.join(scaffolds_df, on="id")\
        .withColumn("decoration", format_attachment_point_udf("decoration_smi", psf.col("attachment_points")[0]))\
        .select(
            join_scaffold_udf("smiles", "decoration").alias("smiles"),
            psf.map_concat(
                psf.create_map(psf.col("attachment_points")[0],
                               SampleScaffolds.cleanup_decoration_udf("decoration")),
                "decorations",
            ).alias("decorations"),
            "scaffold")
Example 13: _join_results_single
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StringType [as alias]
def _join_results_single(self, scaffolds_df, sampled_df):
    def _join_scaffold(scaff, decs):
        mol = usc.join_joined_attachments(scaff, decs)
        if mol:
            return usc.to_smiles(mol)
    join_scaffold_udf = psf.udf(_join_scaffold, pst.StringType())

    def _create_decorations_map(decorations_smi, attachment_points):
        decorations = decorations_smi.split(usc.ATTACHMENT_SEPARATOR_TOKEN)
        return {idx: _cleanup_decoration(dec) for dec, idx in zip(decorations, attachment_points)}
    create_decorations_map_udf = psf.udf(_create_decorations_map, pst.MapType(pst.IntegerType(), pst.StringType()))

    return sampled_df.join(scaffolds_df, on="id")\
        .select(
            join_scaffold_udf("randomized_scaffold", "decoration_smi").alias("smiles"),
            create_decorations_map_udf("decoration_smi", "attachment_points").alias("decorations"),
            "scaffold")
Example 14: format_to_file_path
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StringType [as alias]
def format_to_file_path(spark_session):
    rows = [
        Row(8, 32, "bat"),
        Row(64, 40, "mouse"),
        Row(-27, 55, "horse")
    ]
    schema = StructType([
        StructField("number2", IntegerType()),
        StructField("number1", IntegerType()),
        StructField("word", StringType())
    ])
    rdd = spark_session.sparkContext.parallelize(rows)
    df = spark_session.createDataFrame(rdd, schema)
    res = {}
    tempdir = tempfile.mkdtemp()
    for data_format in ["csv", "parquet", "json"]:
        res[data_format] = os.path.join(tempdir, "test-data-%s" % data_format)
    for data_format, file_path in res.items():
        df.write.option("header", "true").format(data_format).save(file_path)
    yield res
    shutil.rmtree(tempdir)
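The yield followed by cleanup suggests this function is a pytest fixture. A hedged sketch of a consuming test, assuming the function is decorated with @pytest.fixture and that a spark_session fixture exists in the original test module:

# Hypothetical consumer: the fixture yields a dict mapping format name -> file path.
def test_roundtrip(format_to_file_path, spark_session):
    for data_format, file_path in format_to_file_path.items():
        df = spark_session.read.option("header", "true").format(data_format).load(file_path)
        assert df.count() == 3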
Example 15: getSampleImagePathsDF
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StringType [as alias]
def getSampleImagePathsDF(sqlContext, colName):
    files = getSampleImagePaths()
    return sqlContext.createDataFrame(files, StringType()).toDF(colName)
# Methods for making comparisons between the outputs of different frameworks.
# For ImageNet.
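A brief usage sketch, assuming an existing SQLContext (getSampleImagePaths is defined elsewhere in the original test utilities):

# Hypothetical usage: build a single-column DataFrame of sample image paths.
image_paths_df = getSampleImagePathsDF(sqlContext, "filePath")
image_paths_df.show(truncate=False)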