This article collects typical usage examples of the Python method pyspark.sql.types.StructType. If you have been wondering what types.StructType does and how it is used in practice, the curated code examples below should help. You can also read further about the containing module, pyspark.sql.types.

The 15 code examples of types.StructType shown below are ordered by popularity.
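As a quick refresher before the examples: a StructType is an ordered collection of StructField entries that describes a DataFrame schema. A minimal sketch, assuming an active SparkSession named `spark`:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
    StructField('name', StringType(), nullable=False),
    StructField('age', IntegerType(), nullable=True),
])
df = spark.createDataFrame([('ada', 36)], schema=schema)
df.printSchema()  # name: string (nullable = false), age: integer (nullable = true)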
Example 1: main
# Required module import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def main():
    temp_schema = StructType([
        StructField('StationID', StringType(), False),
        StructField('DateTime', StringType(), False),
        StructField('Observation', StringType(), False),
        StructField('DataValue', DoubleType(), False),
        StructField('MFlag', StringType(), True),
        StructField('QFlag', StringType(), True),
        StructField('SFlag', StringType(), True),
        StructField('OBSTime', StringType(), True),
    ])
    df = sqlContext.read.format('com.databricks.spark.csv').options(header='false').load(inputs1, schema=temp_schema)
    df = df.filter(df.QFlag == '')
    dfrange = get_range(df)
    result = dfrange.rdd.map(lambda r: str(r.DateTime) + ' ' + str(r.StationID) + ' ' + str(r.MaxRange))
    outdata = result.sortBy(lambda r: r[0]).coalesce(1)
    outdata.saveAsTextFile(output)
Example 2: filesToDF
# Required module import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def filesToDF(sc, path, numPartitions=None):
    """
    Read files from a directory into a DataFrame.

    :param sc: SparkContext.
    :param path: str, path to files.
    :param numPartitions: int, number of partitions to use for reading files.
    :return: DataFrame, with columns: (filePath: str, fileData: BinaryType)
    """
    numPartitions = numPartitions or sc.defaultParallelism
    schema = StructType([StructField("filePath", StringType(), False),
                         StructField("fileData", BinaryType(), False)])
    rdd = sc.binaryFiles(
        path, minPartitions=numPartitions).repartition(numPartitions)
    rdd = rdd.map(lambda x: (x[0], bytearray(x[1])))
    return rdd.toDF(schema)
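For illustration, a call might look like the following sketch (the path is hypothetical and a live SparkContext named `sc` is assumed):

binary_df = filesToDF(sc, 'hdfs:///data/raw-files', numPartitions=32)
binary_df.printSchema()  # filePath: string, fileData: binary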
Example 3: _simplify_data_type
# Required module import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def _simplify_data_type(data_type: T.DataType) -> Tuple:
    """Simplify a datatype into a tuple of the equality information we care about.

    Most notably this ignores nullability, because hive is not able to
    represent NOT NULL in its schemas.
    """
    try:
        # Normalize a UDT into its sql form. Allows comparison of schemas
        # from hive and spark.
        sql_type = data_type.sqlType()  # type: ignore
    except AttributeError:
        sql_type = data_type

    if isinstance(sql_type, T.StructType):
        return ('StructType', [(field.name, _simplify_data_type(field.dataType)) for field in sql_type])
    elif isinstance(sql_type, T.ArrayType):
        return ('ArrayType', _simplify_data_type(sql_type.elementType))
    else:
        return (type(sql_type).__name__,)
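To make the return shape of the helper above concrete, a small sketch (assuming `pyspark.sql.types` is imported as `T`, as in the snippet):

nested = T.StructType([
    T.StructField('id', T.IntegerType(), nullable=False),
    T.StructField('tags', T.ArrayType(T.StringType()), nullable=True),
])
# Nullability is intentionally dropped; the result is:
# ('StructType', [('id', ('IntegerType',)), ('tags', ('ArrayType', ('StringType',)))])
print(_simplify_data_type(nested))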
Example 4: _verify_schema_compatability
# Required module import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def _verify_schema_compatability(expect: T.StructType, have: T.StructType) -> List[str]:
    """Verify all expected fields and types are present.

    Allows additional columns in the `have` schema, and additionally
    allows relaxing nullability.
    """
    errors = []
    for expect_field in expect:
        try:
            have_field = have[expect_field.name]
        except KeyError:
            errors.append('Field {} missing. Have: {}'.format(expect_field.name, ','.join(have.names)))
            continue
        expect_type = _simplify_data_type(expect_field.dataType)
        have_type = _simplify_data_type(have_field.dataType)
        if expect_type != have_type:
            errors.append('Field {} has incompatible data types: expect {} != have {}'.format(
                expect_field.name, expect_type, have_type))
    return errors
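A minimal sketch of how this check behaves, assuming `pyspark.sql.types` is imported as `T`: extra columns and relaxed nullability in `have` are accepted, so the call below returns an empty error list.

expect = T.StructType([T.StructField('query', T.StringType(), False)])
have = T.StructType([
    T.StructField('query', T.StringType(), True),   # relaxed nullability is allowed
    T.StructField('extra', T.IntegerType(), True),  # extra columns are allowed
])
assert _verify_schema_compatability(expect, have) == []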
Example 5: _merge_schemas
# Required module import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def _merge_schemas(*schemas: T.StructType):
    """Merge one or more spark schemas into a new schema."""
    fields = cast(Dict[str, T.StructField], {})
    errors = []
    for schema in schemas:
        for field in schema:
            if field.name not in fields:
                fields[field.name] = field
            elif field != fields[field.name]:
                errors.append('Incompatible fields: {} != {}'.format(field, fields[field.name]))
    if errors:
        raise Exception('\n'.join(errors))
    return T.StructType(list(fields.values()))

# Primary input schema from which most everything else is derived
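For reference, a small sketch of merging two overlapping schemas with the helper above (field names here are illustrative, and `pyspark.sql.types` is assumed to be imported as `T`):

left = T.StructType([T.StructField('query', T.StringType()), T.StructField('wikiid', T.StringType())])
right = T.StructType([T.StructField('wikiid', T.StringType()), T.StructField('hits', T.IntegerType())])
merged = _merge_schemas(left, right)
# merged.names == ['query', 'wikiid', 'hits']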
Example 6: as_spark_schema
# Required module import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def as_spark_schema(self):
    """Returns an object derived from the unischema as a spark schema.

    Example:

    >>> spark.createDataFrame(dataset_rows,
    >>>                       SomeSchema.as_spark_schema())
    """
    # Lazily load pyspark to avoid creating a pyspark dependency on the data reading code path
    # (currently works only with make_batch_reader)
    import pyspark.sql.types as sql_types

    schema_entries = []
    for field in self._fields.values():
        spark_type = _field_spark_dtype(field)
        schema_entries.append(sql_types.StructField(field.name, spark_type, field.nullable))
    return sql_types.StructType(schema_entries)
Example 7: main
# Required module import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def main():
    schema = StructType([
        StructField('subreddit', StringType(), False),
        StructField('score', IntegerType(), False),
    ])
    inputs = sqlContext.read.json(inputs1, schema=schema)

    # Uncomment this when no schema is supplied
    # inputs = sqlContext.read.json(inputs1)

    # Uncomment these when there are 2 input dirs
    # comments_input1 = sqlContext.read.json(inputs1, schema=schema)
    # comments_input2 = sqlContext.read.json(inputs2, schema=schema)
    # inputs = comments_input1.unionAll(comments_input2)

    df = get_avg(inputs)
    df.write.save(output, format='json', mode='overwrite')
Example 8: read_groundtruth
# Required module import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def read_groundtruth(self):
    """
    Create a dataframe from the ground truth csv file.

    Takes as arguments the full path name of the csv file
    and the spark_session.
    """
    filereader = Reader(self.spark_session)

    groundtruth_schema = StructType([
        StructField("tid", IntegerType(), False),
        StructField("attr_name", StringType(), False),
        StructField("attr_val", StringType(), False)])

    self.ground_truth_flat = filereader.read(self.path_to_grand_truth, 0,
                                             groundtruth_schema).\
        drop(GlobalVariables.index_name)

    self.dataengine.add_db_table(
        'Groundtruth', self.ground_truth_flat, self.dataset)
Example 9: format_to_file_path
# Required module import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def format_to_file_path(spark_session):
    rows = [
        Row(8, 32, "bat"),
        Row(64, 40, "mouse"),
        Row(-27, 55, "horse")
    ]
    schema = StructType([
        StructField("number2", IntegerType()),
        StructField("number1", IntegerType()),
        StructField("word", StringType())
    ])
    rdd = spark_session.sparkContext.parallelize(rows)
    df = spark_session.createDataFrame(rdd, schema)
    res = {}
    tempdir = tempfile.mkdtemp()
    for data_format in ["csv", "parquet", "json"]:
        res[data_format] = os.path.join(tempdir, "test-data-%s" % data_format)
    for data_format, file_path in res.items():
        df.write.option("header", "true").format(data_format).save(file_path)
    yield res
    shutil.rmtree(tempdir)
Example 10: dstream
# Required module import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def dstream(self):
    '''
    Return the underlying :class:`DStream`, with each record
    run through ``_analyzer``.
    '''
    return self.__dstream\
        .map(lambda x: x[1])\
        .flatMap(lambda x: x)\
        .map(lambda x: _analyzer(x))
Example 11: transform
# Required module import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def transform(df, url_list=None, brokers=None, **kwargs):
    if brokers and url_list:
        raise ValueError('cannot specify brokers and url_list')
    if brokers:
        rdd = transform_from_kafka(df, brokers, **kwargs)
    else:
        rdd = transform_from_elasticsearch(df, url_list, **kwargs)
    return df.sql_ctx.createDataFrame(rdd, T.StructType([
        df.schema['wikiid'],
        df.schema['query'],
        df.schema['norm_query'],
        T.StructField('hit_page_ids', T.ArrayType(T.IntegerType()), nullable=False),
    ]))
Example 12: _verify_schema_equality
# Required module import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def _verify_schema_equality(expect: T.StructType, have: T.StructType) -> List[str]:
    """Verify the dataframe and table have equal schemas."""
    def resolve(schema, field_name) -> Optional[Tuple]:
        try:
            field = schema[field_name]
        except KeyError:
            return None
        return _simplify_data_type(field.dataType)

    errors = []
    for field_name in set(expect.names).union(have.names):
        expect_type = resolve(expect, field_name)
        if expect_type is None:
            errors.append('Extra field in provided schema: {}'.format(field_name))
            continue
        have_type = resolve(have, field_name)
        if have_type is None:
            errors.append('Missing field in provided schema: {}'.format(field_name))
            continue
        if expect_type != have_type:
            fmt = 'Column {} of type {} does not match expected {}'
            errors.append(fmt.format(field_name, have_type, expect_type))
            continue
    # TODO: Test nullability? But hive doesn't track nullability, everything is nullable.
    return errors
Example 13: read_partition
# Required module import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def read_partition(
    spark: SparkSession,
    table: str,
    partition_spec: Mapping[str, str],
    schema: Optional[T.StructType] = None,
    direct_parquet_read: bool = False
) -> DataFrame:
    """Read a single partition from a hive table.

    Verifies the partition specification describes a complete partition,
    that the partition exists, and optionally that the table is compatible
    with an expected schema. The partition could still be empty.
    """
    # We don't need to do anything with the result; the goal is to
    # trigger an AnalysisException when the arguments are invalid.
    spark.sql(_describe_partition_ql(table, partition_spec)).collect()

    partition_cond = F.lit(True)
    for k, v in partition_spec.items():
        partition_cond &= F.col(k) == v
    df = spark.read.table(table).where(partition_cond)
    # The df we have now has types defined by the hive table, but this downgrades
    # non-standard types like VectorUDT() to their sql equivalents. Use the first
    # df to find the files, then read them directly.
    if direct_parquet_read:
        input_files = list(df._jdf.inputFiles())  # type: ignore
        input_dirs = set(os.path.dirname(path) for path in input_files)
        if len(input_dirs) != 1:
            raise Exception('Expected single directory containing partition data: [{}]'.format(
                '],['.join(input_files)))
        df = spark.read.parquet(list(input_dirs)[0])
    if schema is not None:
        # TODO: This only allows extra top level columns, anything
        # nested must be exactly the same. Fine for now.
        _verify_schema_compatability(schema, df.schema)
        df = df.select(*(field.name for field in schema))
    # Drop partitioning columns. These are not part of the mjolnir transformations, and
    # are only an implementation detail of putting them on disk and tracking history.
    return df.drop(*partition_spec.keys())
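A hypothetical invocation (the table name and partition values are illustrative only), assuming an active SparkSession named `spark`:

df = read_partition(
    spark,
    'mydb.events',                    # hypothetical Hive table
    {'year': '2024', 'month': '01'},  # must describe a complete partition
    direct_parquet_read=True,
)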
Example 14: typed_transformer
# Required module import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def typed_transformer(
    schema_in: Optional[T.StructType] = None,
    schema_out: Optional[T.StructType] = None,
    context: Optional[str] = None
) -> Callable[[Callable[..., Transformer]], Callable[..., Transformer]]:
    """Decorate a transformer factory with schema validation.

    An idiom in transform is calling a function to return a Transformer. This
    decorator can be applied to those factory functions to return transformers
    that apply runtime schema validation.
    """
    def decorate(fn: Callable[..., Transformer]) -> Callable[..., Transformer]:
        def error_context(kind: str) -> str:
            return 'While checking {} {}:'.format(fn.__name__ if context is None else context, kind)

        @functools.wraps(fn)
        def factory(*args, **kwargs) -> Transformer:
            transformer = fn(*args, **kwargs)

            @functools.wraps(transformer)
            def transform(df_in: DataFrame) -> DataFrame:
                if schema_in is not None:
                    check_schema(df_in, schema_in, error_context('schema_in'))
                    df_in = df_in.select(*schema_in.names)
                df_out = transformer(df_in)
                if schema_out is not None:
                    check_schema(df_out, schema_out, error_context('schema_out'))
                    df_out = df_out.select(*schema_out.names)
                return df_out
            return transform
        return factory
    return decorate

# Shared schemas between the primary mjolnir transformations. Transformations
# may accept a schema with slightly more columns than they strictly require,
# to keep the total number of schemas low.
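A minimal sketch of how the decorator above might be wired up; the schemas and factory below are illustrative only, and `pyspark.sql.types` as `T` plus `pyspark.sql.functions` as `F` are assumed imports:

_schema_in = T.StructType([T.StructField('query', T.StringType())])
_schema_out = T.StructType([
    T.StructField('query', T.StringType()),
    T.StructField('norm_query', T.StringType()),
])

@typed_transformer(schema_in=_schema_in, schema_out=_schema_out, context='normalize_query')
def normalize_query_transformer():
    # Hypothetical factory: returns a callable taking and returning a DataFrame.
    def transform(df_in):
        return df_in.withColumn('norm_query', F.lower(F.trim(F.col('query'))))
    return transform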
Example 15: test_schema_comparison
# Required module import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def test_schema_comparison(expect: T.StructType, have: T.StructType, compatible: bool, equal: bool) -> None:
    if equal and not compatible:
        raise Exception('Invalid constraint, can not be equal but not compatible')
    # The verification functions return a list of errors rather than a bool;
    # bool() of that list is False exactly when everything is ok.
    assert compatible is not bool(mt._verify_schema_compatability(expect, have))
    assert equal is not bool(mt._verify_schema_equality(expect, have))