

Python types.StructType Method Code Examples

This article collects typical usage examples of the pyspark.sql.types.StructType method in Python. If you are wondering what types.StructType does, how to use it, or are looking for working examples, the curated code samples below may help. You can also explore further usage examples from the pyspark.sql.types module in which this method lives.


The following presents 15 code examples of the types.StructType method, sorted by popularity by default.

Example 1: main

# Required imports: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def main():
    temp_schema = StructType([
        StructField('StationID', StringType(), False),
        StructField('DateTime', StringType(), False),
        StructField('Observation', StringType(), False),
        StructField('DataValue', DoubleType(), False),
        StructField('MFlag', StringType(), True),
        StructField('QFlag', StringType(), True),
        StructField('SFlag', StringType(), True),
        StructField('OBSTime', StringType(), True),
    ])

    df = sqlContext.read.format('com.databricks.spark.csv').options(header='false').load(inputs1, schema=temp_schema)
    df = df.filter(df.QFlag == '')

    dfrange = get_range(df)
    result = dfrange.rdd.map(lambda r: str(r.DateTime)+' '+str(r.StationID)+' '+str(r.MaxRange))
    outdata = result.sortBy(lambda r: r[0]).coalesce(1)
    outdata.saveAsTextFile(output) 
Developer: hanhanwu, Project: Hanhan-Spark-Python, Lines: 21, Source: temp_range_sql.py
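
The snippet above relies on module-level names (sqlContext, inputs1, output) and a get_range helper defined elsewhere in temp_range_sql.py. A minimal sketch of the kind of setup it assumes, with hypothetical paths, could look like this (the com.databricks.spark.csv reader additionally needs the spark-csv package available):

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Hypothetical setup; the real script presumably takes these from its arguments.
sc = SparkContext(appName='temp range sql')
sqlContext = SQLContext(sc)
inputs1 = 'weather-data/'      # hypothetical input directory of CSV files
output = 'output-temp-range'   # hypothetical output directory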

Example 2: filesToDF

# Required imports: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def filesToDF(sc, path, numPartitions=None):
    """
    Read files from a directory to a DataFrame.

    :param sc: SparkContext.
    :param path: str, path to files.
    :param numPartitions: int, number of partitions to use for reading files.
    :return: DataFrame, with columns: (filePath: str, fileData: BinaryType)
    """
    numPartitions = numPartitions or sc.defaultParallelism
    schema = StructType([StructField("filePath", StringType(), False),
                         StructField("fileData", BinaryType(), False)])
    rdd = sc.binaryFiles(
        path, minPartitions=numPartitions).repartition(numPartitions)
    rdd = rdd.map(lambda x: (x[0], bytearray(x[1])))
    return rdd.toDF(schema) 
Developer: databricks, Project: spark-deep-learning, Lines: 18, Source: imageIO.py
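
A brief usage sketch, assuming the types referenced inside filesToDF are imported and an active SparkSession exists (rdd.toDF() needs one); the 'images/' path is hypothetical:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BinaryType

spark = SparkSession.builder.appName('filesToDF example').getOrCreate()
image_df = filesToDF(spark.sparkContext, 'images/', numPartitions=4)
image_df.printSchema()
# root
#  |-- filePath: string (nullable = false)
#  |-- fileData: binary (nullable = false)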

Example 3: _simplify_data_type

# Required imports: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def _simplify_data_type(data_type: T.DataType) -> Tuple:
    """Simplify datatype into a tuple of equality information we care about

    Most notably this ignores nullability concerns due to hive not
    being able to represent not null in its schemas.
    """
    try:
        # Normalize UDT into its sql form. Allows comparison of schemas
        # from hive and spark.
        sql_type = data_type.sqlType()  # type: ignore
    except AttributeError:
        sql_type = data_type

    if isinstance(sql_type, T.StructType):
        return ('StructType', [(field.name, _simplify_data_type(field.dataType)) for field in sql_type])
    elif isinstance(sql_type, T.ArrayType):
        return ('ArrayType', _simplify_data_type(sql_type.elementType))
    else:
        return (type(sql_type).__name__,) 
Developer: wikimedia, Project: search-MjoLniR, Lines: 21, Source: transform.py
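
A small sketch of what the simplification yields, assuming _simplify_data_type is in scope; note how nullability differences disappear:

from pyspark.sql import types as T

a = T.StructType([T.StructField('id', T.IntegerType(), nullable=False)])
b = T.StructType([T.StructField('id', T.IntegerType(), nullable=True)])
# Both simplify to ('StructType', [('id', ('IntegerType',))]),
# so schemas that differ only in nullability compare equal.
assert _simplify_data_type(a) == _simplify_data_type(b)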

Example 4: _verify_schema_compatability

# Required imports: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def _verify_schema_compatability(expect: T.StructType, have: T.StructType) -> List[str]:
    """Verify all expected fields and types are present

    Allows additional columns in the `have` schema. Additionally
    allows relaxing nullability """
    errors = []
    for expect_field in expect:
        try:
            have_field = have[expect_field.name]
        except KeyError:
            errors.append('Field {} missing. Have: {}'.format(expect_field.name, ','.join(have.names)))
            continue
        expect_type = _simplify_data_type(expect_field.dataType)
        have_type = _simplify_data_type(have_field.dataType)
        if expect_type != have_type:
            errors.append('Field {} has incompatible data types: expect {} != have {}'.format(
                          expect_field.name, expect_type, have_type))
    return errors 
Developer: wikimedia, Project: search-MjoLniR, Lines: 20, Source: transform.py
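
A usage sketch, assuming both helpers from transform.py are in scope; extra columns in have are tolerated, while type mismatches are reported:

from pyspark.sql import types as T

expect = T.StructType([T.StructField('page_id', T.IntegerType())])
have = T.StructType([
    T.StructField('page_id', T.LongType()),
    T.StructField('extra', T.StringType()),
])
for error in _verify_schema_compatability(expect, have):
    print(error)
# Field page_id has incompatible data types: expect ('IntegerType',) != have ('LongType',)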

Example 5: _merge_schemas

# Required imports: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def _merge_schemas(*schemas: T.StructType):
    """Merge one or more spark schemas into a new schema"""
    fields = cast(Dict[str, T.StructField], {})
    errors = []
    for schema in schemas:
        for field in schema:
            if field.name not in fields:
                fields[field.name] = field
            elif field != fields[field.name]:
                errors.append('Incompatible fields: {} != {}'.format(field, fields[field.name]))
    if errors:
        raise Exception('\n'.join(errors))
    return T.StructType(list(fields.values()))


# Primary input schema from which most everything else is derived 
Developer: wikimedia, Project: search-MjoLniR, Lines: 18, Source: transform.py
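
A short sketch of merging two overlapping schemas, assuming _merge_schemas is in scope; identical fields are deduplicated, while conflicting definitions of the same field would raise:

from pyspark.sql import types as T

left = T.StructType([
    T.StructField('wikiid', T.StringType()),
    T.StructField('query', T.StringType()),
])
right = T.StructType([
    T.StructField('query', T.StringType()),
    T.StructField('hit_page_ids', T.ArrayType(T.IntegerType())),
])
merged = _merge_schemas(left, right)
print(merged.names)  # ['wikiid', 'query', 'hit_page_ids']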

Example 6: as_spark_schema

# Required imports: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def as_spark_schema(self):
        """Returns an object derived from the unischema as spark schema.

        Example:

        >>> spark.createDataFrame(dataset_rows,
        >>>                       SomeSchema.as_spark_schema())
        """
        # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
        # (currently works only with make_batch_reader)
        import pyspark.sql.types as sql_types

        schema_entries = []
        for field in self._fields.values():
            spark_type = _field_spark_dtype(field)
            schema_entries.append(sql_types.StructField(field.name, spark_type, field.nullable))

        return sql_types.StructType(schema_entries) 
Developer: uber, Project: petastorm, Lines: 20, Source: unischema.py
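
A hedged sketch of defining a Unischema and converting it, based on petastorm's documented Unischema, UnischemaField and ScalarCodec API; the field names here are hypothetical:

import numpy as np
from petastorm.codecs import ScalarCodec
from petastorm.unischema import Unischema, UnischemaField
from pyspark.sql.types import IntegerType, StringType

# Hypothetical two-field schema; each codec carries the Spark SQL type of the field.
SomeSchema = Unischema('SomeSchema', [
    UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
    UnischemaField('name', np.unicode_, (), ScalarCodec(StringType()), True),
])
spark_schema = SomeSchema.as_spark_schema()  # a StructType with an id and a name field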

Example 7: main

# Required imports: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def main():
    schema = StructType([
        StructField('subreddit', StringType(), False),
        StructField('score', IntegerType(), False),
    ])
    inputs = sqlContext.read.json(inputs1, schema=schema)

    # Uncomment this when the schema is not provided
    # inputs = sqlContext.read.json(inputs1)

    # Uncomment these when there are 2 input dirs
    # comments_input1 = sqlContext.read.json(inputs1, schema=schema)
    # comments_input2 = sqlContext.read.json(inputs2, schema=schema)
    # inputs = comments_input1.unionAll(comments_input2)

    df = get_avg(inputs)
    df.write.save(output, format='json', mode='overwrite') 
Developer: hanhanwu, Project: Hanhan-Spark-Python, Lines: 19, Source: reddit_average_sql.py
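
get_avg is defined elsewhere in reddit_average_sql.py and is not shown here. A hypothetical stand-in consistent with how it is used above (average comment score per subreddit) might be:

from pyspark.sql import functions as F

def get_avg(comments):
    # Hypothetical implementation: average score per subreddit.
    return comments.groupBy('subreddit').agg(F.avg('score').alias('average_score'))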

Example 8: read_groundtruth

# Required imports: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def read_groundtruth(self):

        """
        Create a dataframe from the ground truth csv file

        Takes as argument the full path name of the csv file
        and the spark_session
        """
        filereader = Reader(self.spark_session)

        groundtruth_schema = StructType([
            StructField("tid", IntegerType(), False),
            StructField("attr_name", StringType(), False),
            StructField("attr_val", StringType(), False)])

        self.ground_truth_flat = filereader.read(self.path_to_grand_truth, 0,
                                                 groundtruth_schema).\
            drop(GlobalVariables.index_name)

        self.dataengine.add_db_table(
            'Groundtruth', self.ground_truth_flat, self.dataset) 
Developer: HoloClean, Project: HoloClean-Legacy-deprecated, Lines: 23, Source: accuracy.py
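
Reader and GlobalVariables are HoloClean internals; outside that project the same schema can be applied with plain Spark. A minimal sketch, assuming a SparkSession and a hypothetical groundtruth.csv path:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder.appName('groundtruth example').getOrCreate()
groundtruth_schema = StructType([
    StructField("tid", IntegerType(), False),
    StructField("attr_name", StringType(), False),
    StructField("attr_val", StringType(), False)])
ground_truth = spark.read.csv('groundtruth.csv', schema=groundtruth_schema)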

Example 9: format_to_file_path

# Required imports: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def format_to_file_path(spark_session):
    rows = [
        Row(8, 32, "bat"),
        Row(64, 40, "mouse"),
        Row(-27, 55, "horse")
    ]
    schema = StructType([
        StructField("number2", IntegerType()),
        StructField("number1", IntegerType()),
        StructField("word", StringType())
    ])
    rdd = spark_session.sparkContext.parallelize(rows)
    df = spark_session.createDataFrame(rdd, schema)
    res = {}
    tempdir = tempfile.mkdtemp()
    for data_format in ["csv", "parquet", "json"]:
        res[data_format] = os.path.join(tempdir, "test-data-%s" % data_format)

    for data_format, file_path in res.items():
        df.write.option("header", "true").format(data_format).save(file_path)
    yield res
    shutil.rmtree(tempdir) 
Developer: mlflow, Project: mlflow, Lines: 24, Source: utils.py
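
In the mlflow test suite this generator function serves as a pytest fixture (the decorator is not shown in the excerpt). A sketch of how a test might consume it, assuming it is registered with @pytest.fixture and that the spark_session fixture it depends on exists:

def test_saved_paths(format_to_file_path):
    # The fixture yields a dict mapping each data format to the path it was saved under.
    assert set(format_to_file_path) == {'csv', 'parquet', 'json'}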

Example 10: dstream

# Required imports: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def dstream(self):
        '''
            Return the wrapped DStream with the value of each (key, value)
        record flattened and passed through the analyzer.
        '''
        return self.__dstream\
            .map(lambda x: x[1])\
            .flatMap(lambda x: x)\
            .map(lambda x: _analyzer(x)) 
Developer: apache, Project: incubator-spot, Lines: 11, Source: streaming.py

Example 11: transform

# Required imports: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def transform(df, url_list=None, brokers=None, **kwargs):
    if brokers and url_list:
        raise ValueError('cannot specify brokers and url_list')
    if brokers:
        rdd = transform_from_kafka(df, brokers, **kwargs)
    else:
        rdd = transform_from_elasticsearch(df, url_list, **kwargs)
    return df.sql_ctx.createDataFrame(rdd, T.StructType([
        df.schema['wikiid'],
        df.schema['query'],
        df.schema['norm_query'],
        T.StructField('hit_page_ids', T.ArrayType(T.IntegerType()), nullable=False),
    ])) 
Developer: wikimedia, Project: search-MjoLniR, Lines: 15, Source: es_hits.py
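
The StructType here is built by borrowing three StructField objects straight from the incoming DataFrame's schema and appending one new array column. A standalone sketch of that pattern, with a hypothetical input DataFrame:

from pyspark.sql import SparkSession, types as T

spark = SparkSession.builder.getOrCreate()
in_schema = T.StructType([
    T.StructField('wikiid', T.StringType()),
    T.StructField('query', T.StringType()),
    T.StructField('norm_query', T.StringType()),
])
df = spark.createDataFrame([('enwiki', 'some query', 'some query')], in_schema)
# Reuse existing StructFields from df.schema and append a new column definition.
schema = T.StructType([
    df.schema['wikiid'],
    df.schema['query'],
    df.schema['norm_query'],
    T.StructField('hit_page_ids', T.ArrayType(T.IntegerType()), nullable=False),
])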

Example 12: _verify_schema_equality

# Required imports: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def _verify_schema_equality(expect: T.StructType, have: T.StructType) -> List[str]:
    """Verify the dataframe and table have equal schemas"""
    def resolve(schema, field_name) -> Optional[Tuple]:
        try:
            field = schema[field_name]
        except KeyError:
            return None
        return _simplify_data_type(field.dataType)

    errors = []
    for field_name in set(expect.names).union(have.names):
        expect_type = resolve(expect, field_name)
        if expect_type is None:
            errors.append('Extra field in provided schema: {}'.format(field_name))
            continue

        have_type = resolve(have, field_name)
        if have_type is None:
            errors.append('Missing field in provided schema: {}'.format(field_name))
            continue

        if expect_type != have_type:
            fmt = 'Column {} of type {} does not match expected {}'
            errors.append(fmt.format(field_name, have_type, expect_type))
            continue
        # TODO: Test nullability? But hive doesn't track nullability, everything is nullable.
    return errors 
Developer: wikimedia, Project: search-MjoLniR, Lines: 29, Source: transform.py
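
A short sketch, assuming the helpers above are in scope; unlike the compatibility check in Example 4, a column present in only one of the two schemas is reported:

from pyspark.sql import types as T

expect = T.StructType([T.StructField('query', T.StringType())])
have = T.StructType([
    T.StructField('query', T.StringType()),
    T.StructField('hit_page_ids', T.ArrayType(T.IntegerType())),
])
print(_verify_schema_equality(expect, have))
# ['Extra field in provided schema: hit_page_ids']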

Example 13: read_partition

# Required imports: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def read_partition(
    spark: SparkSession,
    table: str,
    partition_spec: Mapping[str, str],
    schema: Optional[T.StructType] = None,
    direct_parquet_read: bool = False
) -> DataFrame:
    """Read a single partition from a hive table.

    Verifies the partition specification describes a complete partition,
    that the partition exists, and optionally that the table is compatible
    with an expected schema. The partition could still be empty.
    """
    # We don't need to do anything with the result, our goal is to
    # trigger AnalysisException when the arguments are invalid.
    spark.sql(_describe_partition_ql(table, partition_spec)).collect()

    partition_cond = F.lit(True)
    for k, v in partition_spec.items():
        partition_cond &= F.col(k) == v
    df = spark.read.table(table).where(partition_cond)
    # The df we have now has types defined by the hive table, but this downgrades
    # non-standard types like VectorUDT() to its sql equivalent. Use the first
    # df to find the files, then read them directly.
    if direct_parquet_read:
        input_files = list(df._jdf.inputFiles())  # type: ignore
        input_dirs = set(os.path.dirname(path) for path in input_files)
        if len(input_dirs) != 1:
            raise Exception('Expected single directory containing partition data: [{}]'.format(
                '],['.join(input_files)))
        df = spark.read.parquet(list(input_dirs)[0])
    if schema is not None:
        # TODO: This only allows extra top level columns, anything
        # nested must be exactly the same. Fine for now.
        _verify_schema_compatability(schema, df.schema)
        df = df.select(*(field.name for field in schema))
    # Drop partitioning columns. These are not part of the mjolnir transformations, and
    # are only an implementation detail of putting them on disk and tracking history.
    return df.drop(*partition_spec.keys()) 
Developer: wikimedia, Project: search-MjoLniR, Lines: 41, Source: transform.py
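
A hedged usage sketch with a hypothetical Hive table name and partition spec; when a schema is passed, it is checked with the compatibility helper from Example 4 and only its columns are selected:

from pyspark.sql import SparkSession, types as T

spark = SparkSession.builder.enableHiveSupport().getOrCreate()
expected = T.StructType([
    T.StructField('wikiid', T.StringType()),
    T.StructField('query', T.StringType()),
])
# 'discovery.query_clicks' and the date value are hypothetical.
df = read_partition(
    spark, 'discovery.query_clicks', {'date': '20200101'}, schema=expected)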

Example 14: typed_transformer

# Required imports: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def typed_transformer(
    schema_in: Optional[T.StructType] = None,
    schema_out: Optional[T.StructType] = None,
    context: Optional[str] = None
) -> Callable[[Callable[..., Transformer]], Callable[..., Transformer]]:
    """Decorates a transformer factory with schema validation

    An idiom in transform is calling a factory function to return a Transformer. This
    decorator can be applied to those factory functions to return transformers
    that apply runtime schema validation.
    """
    def decorate(fn: Callable[..., Transformer]) -> Callable[..., Transformer]:
        def error_context(kind: str) -> str:
            return 'While checking {} {}:'.format(fn.__name__ if context is None else context, kind)

        @functools.wraps(fn)
        def factory(*args, **kwargs) -> Transformer:
            transformer = fn(*args, **kwargs)

            @functools.wraps(transformer)
            def transform(df_in: DataFrame) -> DataFrame:
                if schema_in is not None:
                    check_schema(df_in, schema_in, error_context('schema_in'))
                    df_in = df_in.select(*schema_in.names)
                df_out = transformer(df_in)
                if schema_out is not None:
                    check_schema(df_out, schema_out, error_context('schema_out'))
                    df_out = df_out.select(*schema_out.names)
                return df_out
            return transform
        return factory
    return decorate


# Shared schemas between the primary mjolnir transformations. A transformation
# may accept a schema with slightly more columns than it strictly needs, to keep
# the total number of schemas low.
Developer: wikimedia, Project: search-MjoLniR, Lines: 39, Source: transform.py
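
A sketch of decorating a transformer factory, assuming check_schema and the Transformer alias used above are defined elsewhere in transform.py; the schemas and column names here are hypothetical:

from pyspark.sql import DataFrame, functions as F, types as T

SCHEMA_IN = T.StructType([T.StructField('query', T.StringType())])
SCHEMA_OUT = T.StructType([
    T.StructField('query', T.StringType()),
    T.StructField('norm_query', T.StringType()),
])

@typed_transformer(schema_in=SCHEMA_IN, schema_out=SCHEMA_OUT, context='norm_query')
def norm_query_transformer():
    # Hypothetical transformer: adds a lower-cased copy of the query column.
    def transform(df: DataFrame) -> DataFrame:
        return df.withColumn('norm_query', F.lower(F.col('query')))
    return transform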

Example 15: test_schema_comparison

# Required imports: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import StructType [as alias]
def test_schema_comparison(expect: T.StructType, have: T.StructType, compatible: bool, equal: bool) -> None:
    if equal and not compatible:
        raise Exception('Invalid constraint, can not be equal but not compatible')
    # The helpers return a list of errors; an empty (falsy) list means everything is ok
    assert compatible is not bool(mt._verify_schema_compatability(expect, have))
    assert equal is not bool(mt._verify_schema_equality(expect, have)) 
Developer: wikimedia, Project: search-MjoLniR, Lines: 8, Source: test_transform.py
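
The pytest.mark.parametrize decorator that supplies (expect, have, compatible, equal) is not included in the excerpt. A hypothetical parametrization consistent with the helpers shown in Examples 4 and 12 could be:

import pytest
from pyspark.sql import types as T

@pytest.mark.parametrize('expect,have,compatible,equal', [
    # Identical schemas: compatible and equal.
    (T.StructType([T.StructField('q', T.StringType())]),
     T.StructType([T.StructField('q', T.StringType())]),
     True, True),
    # Extra column in have: compatible but not equal.
    (T.StructType([T.StructField('q', T.StringType())]),
     T.StructType([T.StructField('q', T.StringType()),
                   T.StructField('x', T.IntegerType())]),
     True, False),
])
def test_schema_comparison(expect, have, compatible, equal):
    ...  # assertions as in the excerpt above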


Note: The pyspark.sql.types.StructType method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors. For distribution and use, please refer to the corresponding project's license; do not reproduce without permission.