This page collects typical usage examples of pyspark.sql.types.StructField in Python. If you are unsure what types.StructField does or how to use it, the curated code examples below should help. You can also read further about the module it belongs to, pyspark.sql.types.
The 15 code examples of types.StructField shown below are ordered by popularity.
Example 1: filesToDF
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructField [as alias]
def filesToDF(sc, path, numPartitions=None):
    """
    Read files from a directory to a DataFrame.

    :param sc: SparkContext.
    :param path: str, path to files.
    :param numPartitions: int, number of partitions to use for reading files.
    :return: DataFrame, with columns: (filePath: str, fileData: BinaryType)
    """
    numPartitions = numPartitions or sc.defaultParallelism
    schema = StructType([StructField("filePath", StringType(), False),
                         StructField("fileData", BinaryType(), False)])
    rdd = sc.binaryFiles(
        path, minPartitions=numPartitions).repartition(numPartitions)
    rdd = rdd.map(lambda x: (x[0], bytearray(x[1])))
    return rdd.toDF(schema)
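A minimal usage sketch for the function above, assuming a local Spark session; the input path is a placeholder, not part of the original example:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BinaryType

spark = SparkSession.builder.master("local[2]").getOrCreate()
# Point the placeholder path at any directory of files.
df = filesToDF(spark.sparkContext, "/tmp/input_files", numPartitions=4)
df.printSchema()  # filePath: string (non-nullable), fileData: binary (non-nullable)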
Example 2: _merge_schemas
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructField [as alias]
def _merge_schemas(*schemas: T.StructType):
    """Merge one or more spark schemas into a new schema"""
    fields = cast(Dict[str, T.StructField], {})
    errors = []
    for schema in schemas:
        for field in schema:
            if field.name not in fields:
                fields[field.name] = field
            elif field != fields[field.name]:
                errors.append('Incompatible fields: {} != {}'.format(field, fields[field.name]))
    if errors:
        raise Exception('\n'.join(errors))
    return T.StructType(list(fields.values()))
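A small sketch of calling _merge_schemas; the two schemas below are made up for illustration:

from typing import Dict, cast
from pyspark.sql import types as T

left = T.StructType([T.StructField('id', T.LongType(), False)])
right = T.StructType([T.StructField('id', T.LongType(), False),
                      T.StructField('label', T.StringType(), True)])
merged = _merge_schemas(left, right)
# merged keeps each field name once; a field that reappears with a different
# type or nullability would be reported as "Incompatible fields" instead.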
Example 3: as_spark_schema
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructField [as alias]
def as_spark_schema(self):
    """Returns an object derived from the unischema as spark schema.

    Example:

    >>> spark.createDataFrame(dataset_rows,
    >>>                       SomeSchema.as_spark_schema())
    """
    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader)
    import pyspark.sql.types as sql_types

    schema_entries = []
    for field in self._fields.values():
        spark_type = _field_spark_dtype(field)
        schema_entries.append(sql_types.StructField(field.name, spark_type, field.nullable))
    return sql_types.StructType(schema_entries)
Example 4: register_udfs
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructField [as alias]
def register_udfs(self, sess, sc):
    """Register UDFs to be used in SQL queries.

    :type sess: `pyspark.sql.SparkSession`
    :param sess: Session used in Spark for SQL queries.

    :type sc: `pyspark.SparkContext`
    :param sc: Spark Context to run Spark jobs.
    """
    sess.udf.register("SQUARED", self.squared, returnType=(
        stypes.ArrayType(stypes.StructType(
            fields=[stypes.StructField('sku0', stypes.StringType()),
                    stypes.StructField('norm', stypes.FloatType())]))))

    sess.udf.register('INTERSECTIONS', self.process_intersections,
        returnType=stypes.ArrayType(stypes.StructType(fields=[
            stypes.StructField('sku0', stypes.StringType()),
            stypes.StructField('sku1', stypes.StringType()),
            stypes.StructField('cor', stypes.FloatType())])))
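The same registration pattern outside the class, with a stand-in function in place of self.squared; the function body and names below are illustrative, not from the original project:

from pyspark.sql import SparkSession, types as stypes

sess = SparkSession.builder.master("local[2]").getOrCreate()

def squared_fn(sku, value):
    # Return a one-element array of (sku0, norm) structs as plain tuples.
    return [(sku, float(value) ** 2)]

sess.udf.register("SQUARED", squared_fn, returnType=stypes.ArrayType(
    stypes.StructType(fields=[stypes.StructField('sku0', stypes.StringType()),
                              stypes.StructField('norm', stypes.FloatType())])))
sess.sql("SELECT SQUARED('a', 3.0) AS result").show(truncate=False)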
Example 5: transform
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructField [as alias]
def transform(df, url_list=None, brokers=None, **kwargs):
    if brokers and url_list:
        raise ValueError('cannot specify brokers and url_list')
    if brokers:
        rdd = transform_from_kafka(df, brokers, **kwargs)
    else:
        rdd = transform_from_elasticsearch(df, url_list, **kwargs)
    return df.sql_ctx.createDataFrame(rdd, T.StructType([
        df.schema['wikiid'],
        df.schema['query'],
        df.schema['norm_query'],
        T.StructField('hit_page_ids', T.ArrayType(T.IntegerType()), nullable=False),
    ]))
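Note how the snippet above pulls existing StructField objects out of df.schema by name and mixes them with a newly built field. A self-contained sketch of that pattern, with illustrative column names:

from pyspark.sql import SparkSession, types as T

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.createDataFrame([("enwiki", "some query")], ["wikiid", "query"])
new_schema = T.StructType([
    df.schema['wikiid'],   # existing StructField, reused unchanged
    df.schema['query'],
    T.StructField('hit_page_ids', T.ArrayType(T.IntegerType()), nullable=False),
])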
Example 6: cluster_within_norm_query_groups
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructField [as alias]
def cluster_within_norm_query_groups(df: DataFrame) -> DataFrame:
    make_groups = F.udf(_make_query_groups, T.ArrayType(T.StructType([
        T.StructField('query', T.StringType(), nullable=False),
        T.StructField('norm_query_group_id', T.IntegerType(), nullable=False),
    ])))
    return (
        df
        .groupBy('wikiid', 'norm_query')
        .agg(F.collect_list(F.struct('query', 'hit_page_ids')).alias('source'))
        .select(
            'wikiid', 'norm_query',
            F.explode(make_groups('source')).alias('group'))
        .select('wikiid', 'norm_query', 'group.query', 'group.norm_query_group_id'))
Example 7: test_dataframe_with_schema
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructField [as alias]
def test_dataframe_with_schema(dataset, spark):
    schema = StructType([StructField("foo", IntegerType(), True)])
    df = dataset.dataframe(spark, decode=decode, schema=schema, table_name='bar')
    assert type(df) == DataFrame
    assert df.columns == ['foo']
    assert df.orderBy(["foo"]).collect() == [Row(foo=1), Row(foo=2)]
Example 8: test_dataframe_bad_schema
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructField [as alias]
def test_dataframe_bad_schema(dataset, spark):
    spark.catalog.dropTempView('bar')
    schema = StructType([StructField("name", StringType(), True)])
    df = dataset.dataframe(spark, decode=decode, schema=schema, table_name='bar')
    assert type(df) == DataFrame
    assert df.collect() == [Row(name=None), Row(name=None)]
Example 9: ibis_struct_dtype_to_spark_dtype
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructField [as alias]
def ibis_struct_dtype_to_spark_dtype(ibis_dtype_obj):
    fields = [
        pt.StructField(n, spark_dtype(t), t.nullable)
        for n, t in zip(ibis_dtype_obj.names, ibis_dtype_obj.types)
    ]
    return pt.StructType(fields)
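The same zip-then-wrap pattern works with plain Spark types, without the ibis-to-Spark type dispatch used above; the field names and types below are illustrative:

from pyspark.sql import types as pt

names = ['id', 'score']
dtypes = [pt.LongType(), pt.DoubleType()]
schema = pt.StructType([pt.StructField(n, t, True) for n, t in zip(names, dtypes)])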
Example 10: _transform
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructField [as alias]
def _transform(self, df):
    """Private transform method of a Transformer. This serves as a batch-prediction method for our purposes.
    """
    output_col = self.getOutputCol()
    label_col = self.getLabelCol()
    new_schema = copy.deepcopy(df.schema)
    new_schema.add(StructField(output_col, StringType(), True))
    rdd = df.rdd.coalesce(1)
    features = np.asarray(
        rdd.map(lambda x: from_vector(x.features)).collect())
    # Note that we collect, since executing this on the rdd would require model serialization once again
    model = model_from_yaml(self.get_keras_model_config())
    model.set_weights(self.weights.value)
    predictions = rdd.ctx.parallelize(
        model.predict_classes(features)).coalesce(1)
    predictions = predictions.map(lambda x: tuple(str(x)))
    results_rdd = rdd.zip(predictions).map(lambda x: x[0] + x[1])
    results_df = df.sql_ctx.createDataFrame(results_rdd, new_schema)
    results_df = results_df.withColumn(
        output_col, results_df[output_col].cast(DoubleType()))
    results_df = results_df.withColumn(
        label_col, results_df[label_col].cast(DoubleType()))
    return results_df
Example 11: transformSchema
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructField [as alias]
def transformSchema(self, schema):
    inputType = schema[self.getInputCol()].dataType
    self.validateInputType(inputType)
    if self.getOutputCol() in schema.names:
        raise ValueError("Output column %s already exists." % self.getOutputCol())
    outputFields = copy.copy(schema.fields)
    outputFields.append(StructField(self.getOutputCol(),
                                    self.outputDataType(),
                                    nullable=False))
    return StructType(outputFields)
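A minimal sketch of the copy-and-append pattern used above, outside the transformer class; the column names are illustrative:

import copy
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

in_schema = StructType([StructField('text', StringType(), True)])
out_fields = copy.copy(in_schema.fields)
out_fields.append(StructField('tokens', ArrayType(StringType()), nullable=False))
out_schema = StructType(out_fields)  # 'text' plus a non-nullable 'tokens' column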
Example 12: _scala_to_python_field_type
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructField [as alias]
def _scala_to_python_field_type(self, scala_field_type):
    """create a python FieldType from the scala field type"""
    col_name = str(scala_field_type.name())
    col_type_name = str(scala_field_type.dataType())
    # map the string "IntegerType" to the actual class IntegerType
    col_type_class = getattr(sql_types, col_type_name)
    return sql_types.StructField(col_name, col_type_class())
Example 13: _toStructType
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructField [as alias]
def _toStructType(self):
    """return equivalent Spark schema (StructType) from this smv schema"""
    # ss is the raw scala spark schema (Scala StructType). This has no
    # iterator defined on the python side, so we use an old-school for loop.
    ss = self.j_smv_schema.toStructType()
    spark_schema = sql_types.StructType()
    for i in range(ss.length()):
        # use "apply" to get the nth StructField item in the StructType
        ft = self._scala_to_python_field_type(ss.apply(i))
        spark_schema = spark_schema.add(ft)
    return spark_schema
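The incremental StructType.add pattern used above also works with fields built directly in Python, without the Scala bridge; the fields below are illustrative:

from pyspark.sql import types as sql_types

spark_schema = sql_types.StructType()
for name, dtype in [('a', sql_types.StringType()), ('b', sql_types.DoubleType())]:
    spark_schema = spark_schema.add(sql_types.StructField(name, dtype))
# spark_schema now holds non-null-constrained fields 'a' and 'b' in insertion order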
Example 14: test_fromString
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructField [as alias]
def test_fromString(self):
    s = SmvSchema.fromString("a:string; b:double")
    fields = s.spark_schema.fields
    assert(len(fields) == 2)
    assert(fields[0] == st.StructField('a', st.StringType()))
    assert(fields[1] == st.StructField('b', st.DoubleType()))
Example 15: test_fromFile
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructField [as alias]
def test_fromFile(self):
    f = os.path.join(SmvSchemaTest.resourceTestDir(), "data/a.schema")
    s = SmvSchema.fromFile(f)
    fields = s.spark_schema.fields
    assert(len(fields) == 2)
    assert(fields[0] == st.StructField('a', st.StringType()))
    assert(fields[1] == st.StructField('b', st.FloatType()))
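The equality checks in the two tests above rely on StructField comparing name, data type, and nullability; since nullable defaults to True, the following small sketch also holds:

from pyspark.sql import types as st

assert st.StructField('a', st.StringType()) == st.StructField('a', st.StringType(), True)
assert st.StructField('a', st.StringType()) != st.StructField('a', st.StringType(), False)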