当前位置: 首页>>代码示例>>Python>>正文


Python pyspark.Row方法代码示例

本文整理汇总了Python中pyspark.Row方法的典型用法代码示例。如果您正苦于以下问题:Python pyspark.Row方法的具体用法?Python pyspark.Row怎么用?Python pyspark.Row使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pyspark的用法示例。


在下文中一共展示了pyspark.Row方法的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: _convertOutputToImage

# 需要导入模块: import pyspark [as 别名]
# 或者: from pyspark import Row [as 别名]
def _convertOutputToImage(self, df, tfs_output_col, output_shape):
        """
        Turn the numeric output column of ``df`` into an image-struct column.

        :param df: DataFrame containing the input image column (``self.getInputCol()``)
                   and the numeric column ``tfs_output_col``.
        :param tfs_output_col: str, name of the numeric column; dropped from the result.
        :param output_shape: shape with exactly 4 entries; entries 1 and 2 are used as
                             the height and width of the produced images.
        :return: DataFrame with the image stored in ``self.getOutputCol()``.
        """
        assert len(output_shape) == 4, str(output_shape) + " does not have 4 dimensions"
        height, width = int(output_shape[1]), int(output_shape[2])

        def to_image(orig_image, numeric_data):
            # The result stores float32 pixels and keeps the channel count of the input image.
            channel_count = orig_image.nChannels
            ocv_type = imageIO.imageTypeByName('CV_32FC%d' % channel_count)
            float_pixels = np.array(numeric_data).astype(np.float32)
            return Row(
                origin="",
                mode=ocv_type.ord,
                height=height,
                width=width,
                nChannels=channel_count,
                data=bytearray(float_pixels.tobytes()))

        image_struct_type = ImageSchema.imageSchema['image'].dataType
        to_image_udf = udf(to_image, image_struct_type)
        image_col = to_image_udf(df[self.getInputCol()], df[tfs_output_col])
        with_images = df.withColumn(self.getOutputCol(), image_col)
        return with_images.drop(tfs_output_col)
开发者ID:databricks,项目名称:spark-deep-learning,代码行数:24,代码来源:tf_image.py

示例2: imageArrayToStruct

# 需要导入模块: import pyspark [as 别名]
# 或者: from pyspark import Row [as 别名]
def imageArrayToStruct(imgArray, origin=""):
    """
    Build the Row representation of an image from an image array.

    :param imgArray: ndarray, image data; a 4-d array must have a first dimension of 1.
    :param origin: str, value stored in the ``origin`` field of the row.
    :return: Row, image as a DataFrame Row with schema==ImageSchema.
    :raises ValueError: if a 4-d array has a first dimension other than 1.
    """
    dims = imgArray.shape
    # A 4-d array is taken to carry a leading "batch-size" axis, which must be 1 and is dropped.
    if len(dims) == 4:
        if dims[0] != 1:
            raise ValueError(
                "The first dimension of a 4-d image array is expected to be 1.")
        imgArray = imgArray.reshape(dims[1:])
    ocv_type = _arrayToOcvMode(imgArray)
    height, width, nChannels = imgArray.shape
    raw = bytearray(imgArray.tobytes())
    return Row(
        origin=origin,
        mode=ocv_type.ord,
        height=height,
        width=width,
        nChannels=nChannels,
        data=raw,
    )
开发者ID:databricks,项目名称:spark-deep-learning,代码行数:20,代码来源:imageIO.py

示例3: imageStructToPIL

# 需要导入模块: import pyspark [as 别名]
# 或者: from pyspark import Row [as 别名]
def imageStructToPIL(imageRow):
    """
    Convert an image from the image schema struct to a PIL image.

    :param imageRow: Row, must have ImageSchema
    :return: PIL image
    :raises ValueError: if the image type is not 'uint8', or its channel count is
                        not 1, 3 or 4.
    """
    imgType = imageTypeByOrdinal(imageRow.mode)
    if imgType.dtype != 'uint8':
        raise ValueError("Can not convert image of type " +
                         imgType.dtype + " to PIL, can only deal with 8U format")
    pixels = imageStructToArray(imageRow)
    channels = imgType.nChannels

    # Single-channel images have no channel order to fix.
    if channels == 1:
        return Image.fromarray(obj=pixels, mode='L')

    # PIL expects RGB order while the image schema is BGR, so flip the channels.
    pixels = _reverseChannels(pixels)
    pil_mode = {3: 'RGB', 4: 'RGBA'}.get(channels)
    if pil_mode is None:
        raise ValueError("don't know how to convert " +
                         imgType.name + " to PIL")
    return Image.fromarray(obj=pixels, mode=pil_mode)
开发者ID:databricks,项目名称:spark-deep-learning,代码行数:27,代码来源:imageIO.py

示例4: process

# 需要导入模块: import pyspark [as 别名]
# 或者: from pyspark import Row [as 别名]
def process(timestamp, rdd):
    """
    Count the words held in ``rdd`` with Spark SQL and print the result.

    Processing is best-effort: any ordinary exception is logged and swallowed so
    the caller keeps running. KeyboardInterrupt and SystemExit are no longer
    swallowed.

    :param timestamp: value supplied by the caller (presumably the batch time);
                      only used in the log message.
    :param rdd: RDD whose elements are lists of words (RDD[List[String]]).
    :return: None
    """
    # Local import: this snippet has no module-level import block to extend.
    import logging

    try:
        # Nothing to count. This also avoids createDataFrame on an empty RDD,
        # where schema inference has no row to look at.
        if rdd.isEmpty():
            return

        # Get the singleton instance of SparkSession
        spark = get_session(rdd.context.getConf())

        # Convert RDD[List[String]] to RDD[Row] to DataFrame
        rows = rdd.flatMap(lambda a: a).map(lambda w: Row(word=w))

        words_df = spark.createDataFrame(rows)

        # Creates a temporary view using the DataFrame
        words_df.createOrReplaceTempView('words')

        # Do word count on table using SQL and print it
        sql = "SELECT word, COUNT(1) AS total FROM words GROUP BY word"
        word_count_df = spark.sql(sql)
        word_count_df.show()
    except Exception:
        # Keep the best-effort contract, but leave a trace instead of failing silently.
        logging.getLogger(__name__).exception(
            "Failed to process batch at %s", timestamp)
开发者ID:ksindi,项目名称:kafka-compose,代码行数:21,代码来源:process.py

示例5: imageStructToArray

# 需要导入模块: import pyspark [as 别名]
# 或者: from pyspark import Row [as 别名]
def imageStructToArray(imageRow):
    """
    Expose the pixel data of an image row as a numpy array.

    :param imageRow: Row, must use imageSchema.
    :return: ndarray of shape (height, width, nChannels), image data.
    """
    ocv_type = imageTypeByOrdinal(imageRow.mode)
    dims = (imageRow.height, imageRow.width, imageRow.nChannels)
    # The array is built directly on top of the row's data buffer.
    return np.ndarray(shape=dims, dtype=ocv_type.dtype, buffer=imageRow.data)
开发者ID:databricks,项目名称:spark-deep-learning,代码行数:12,代码来源:imageIO.py

示例6: build_vocabularies

# 需要导入模块: import pyspark [as 别名]
# 或者: from pyspark import Row [as 别名]
def build_vocabularies(self, rows: RDD):
        """
        Process rows to gather values and paths with their frequencies.

        :param rows: row structure is ((key, doc), val) where:
            * key: str with the path context
            * doc: file name
            * val: number of occurrences of key in doc
        :return: tuple (value2index, path2index, value2freq, path2freq); string keys
                 are treated as values and tuple keys as paths.
        """

        def _flatten_row(row: Row):
            # Each parsed path context contributes one occurrence of each of its three parts.
            k = Vocabulary2Id._unstringify_path_context(row)
            return [(k[0], 1), (k[1], 1), (k[2], 1)]

        counts = rows \
            .flatMap(_flatten_row) \
            .reduceByKey(operator.add) \
            .persist()

        # Release the persisted RDD even when one of the collects fails.
        try:
            values = counts.filter(lambda x: isinstance(x[0], str)).collect()
            paths = counts.filter(lambda x: isinstance(x[0], tuple)).collect()
        finally:
            counts.unpersist()

        # Indices follow the order in which the entries were collected.
        value2index = {w: index for index, (w, _) in enumerate(values)}
        path2index = {w: index for index, (w, _) in enumerate(paths)}
        # The collected entries are already (key, frequency) pairs.
        value2freq = dict(values)
        path2freq = dict(paths)

        return value2index, path2index, value2freq, path2freq
开发者ID:src-d,项目名称:code2vec,代码行数:32,代码来源:vocabulary2id.py

示例7: build_doc2pc

# 需要导入模块: import pyspark [as 别名]
# 或者: from pyspark import Row [as 别名]
def build_doc2pc(self, value2index: dict, path2index: dict, rows: RDD):
        """
        Process rows and build elements (doc, [path_context_1, path_context_2, ...])

        :param value2index: value -> id
        :param path2index: path -> id
        :param rows: RDD of rows; ``row[0][1]`` is read as the doc.
        :return: RDD of (doc, list of (value id, path id, value id)) elements.
        """

        bc_value2index = self.sc.broadcast(value2index)
        bc_path2index = self.sc.broadcast(path2index)

        def _doc2pc(row: Row):
            context = Vocabulary2Id._unstringify_path_context(row)
            doc = row[0][1]
            start, path, end = context
            value_ids = bc_value2index.value
            path_ids = bc_path2index.value
            return doc, (value_ids[start], path_ids[path], value_ids[end])

        def _new_list(value):
            return [value]

        def _append(acc, value):
            return acc + [value]

        def _merge(left, right):
            return left + right

        # distinct() removes repeated (doc, context) pairs before grouping them per doc.
        unique_pairs = rows.map(_doc2pc).distinct()
        grouped = unique_pairs.combineByKey(_new_list, _append, _merge)

        bc_value2index.unpersist(blocking=True)
        bc_path2index.unpersist(blocking=True)

        return grouped
开发者ID:src-d,项目名称:code2vec,代码行数:29,代码来源:vocabulary2id.py

示例8: align_type

# 需要导入模块: import pyspark [as 别名]
# 或者: from pyspark import Row [as 别名]
def align_type(init_condition: dict):
    """
    Build a converter that casts dict entries to the types found in ``init_condition``.

    :param init_condition: dict whose values serve as type exemplars for their keys.
    :return: function taking a dict ``d``; it casts ``d[key]`` in place to the type of
             ``init_condition[key]`` for every key of ``init_condition`` and returns
             ``Row(**d)``.
    """
    def f(d):
        for key, exemplar in init_condition.items():
            cast = type(exemplar)
            d[key] = cast(d[key])
        return Row(**d)

    return f

### Typeful Conversion: to Spark
# rdd -> spark
### Typeless Conversion: to Spark
# (rdd -> pandas) -> spark 
开发者ID:cadCAD-org,项目名称:cadCAD,代码行数:13,代码来源:sys_exec.py

示例9: test_dict_to_spark_row_field_validation_scalar_types

# 需要导入模块: import pyspark [as 别名]
# 或者: from pyspark import Row [as 别名]
def test_dict_to_spark_row_field_validation_scalar_types():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    # np.bytes_ is the same type that np.string_ aliased; the alias was removed in NumPy 2.0.
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_field', np.bytes_, (), ScalarCodec(StringType()), False),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'string_field': 'abc'}), Row)

    # Not a nullable field
    with pytest.raises(ValueError):
        dict_to_spark_row(TestSchema, {'string_field': None})

    # Wrong field type
    with pytest.raises(TypeError):
        dict_to_spark_row(TestSchema, {'string_field': []})
开发者ID:uber,项目名称:petastorm,代码行数:17,代码来源:test_unischema.py

示例10: test_dict_to_spark_row_field_validation_scalar_nullable

# 需要导入模块: import pyspark [as 别名]
# 或者: from pyspark import Row [as 别名]
def test_dict_to_spark_row_field_validation_scalar_nullable():
    """Test that a None value is accepted for a nullable field when converting a dictionary to a spark row"""
    # np.bytes_ is the same type that np.string_ aliased; the alias was removed in NumPy 2.0.
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_field', np.bytes_, (), ScalarCodec(StringType()), True),
        UnischemaField('nullable_implicitly_set', np.bytes_, (), ScalarCodec(StringType()), True),
    ])

    # 'nullable_implicitly_set' is deliberately left out of the input dictionary.
    assert isinstance(dict_to_spark_row(TestSchema, {'string_field': None}), Row)
开发者ID:uber,项目名称:petastorm,代码行数:10,代码来源:test_unischema.py

示例11: test_dict_to_spark_row_field_validation_ndarrays

# 需要导入模块: import pyspark [as 别名]
# 或者: from pyspark import Row [as 别名]
def test_dict_to_spark_row_field_validation_ndarrays():
    """Test various validations done on ndarray fields when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('tensor3d', np.float32, (10, 20, 30), NdarrayCodec(), False),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'tensor3d': np.zeros((10, 20, 30), dtype=np.float32)}), Row)

    # The negative cases must use the schema's own field name; with an unknown key the
    # ValueError would come from the field-name check and not from the validation under test.

    # Null value into not nullable field
    with pytest.raises(ValueError):
        dict_to_spark_row(TestSchema, {'tensor3d': None})

    # Wrong dimensions (expects the codec to reject the shape with a ValueError)
    with pytest.raises(ValueError):
        dict_to_spark_row(TestSchema, {'tensor3d': np.zeros((1, 2, 3), dtype=np.float32)})
开发者ID:uber,项目名称:petastorm,代码行数:17,代码来源:test_unischema.py

示例12: ping_to_row

# 需要导入模块: import pyspark [as 别名]
# 或者: from pyspark import Row [as 别名]
def ping_to_row(ping):
    """Build a Row holding the client id and OS name read from ``ping``."""
    client_id = ping["clientId"]
    os_name = ping["environment/system/os/name"]
    return Row(client_id=client_id, os=os_name)
开发者ID:mozilla,项目名称:python_mozetl,代码行数:4,代码来源:main.py

示例13: dict_to_spark_row

# 需要导入模块: import pyspark [as 别名]
# 或者: from pyspark import Row [as 别名]
def dict_to_spark_row(unischema, row_dict):
    """Converts a single row into a spark Row object.

    Verifies that the data conforms with unischema definition types and encodes the data using the codec specified
    by the unischema.

    The parameters are keywords to allow use of functools.partial.

    :param unischema: an instance of Unischema object
    :param row_dict: a dictionary where the keys match name of fields in the unischema.
    :return: a single pyspark.Row object
    :raises ValueError: if the dictionary keys do not match the schema fields, or a None value is given for a
        field that is not nullable.
    """

    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader)
    import pyspark

    assert isinstance(unischema, Unischema)
    # Add null fields. Be careful not to mutate the input dictionary - that would be an unexpected side effect
    copy_row_dict = copy.copy(row_dict)
    insert_explicit_nulls(unischema, copy_row_dict)

    if set(copy_row_dict.keys()) != set(unischema.fields.keys()):
        raise ValueError('Dictionary fields \n{}\n do not match schema fields \n{}'.format(
            '\n'.join(sorted(copy_row_dict.keys())), '\n'.join(unischema.fields.keys())))

    encoded_dict = {}
    for field_name, value in copy_row_dict.items():
        schema_field = unischema.fields[field_name]
        if value is None:
            if not schema_field.nullable:
                raise ValueError(
                    'Field {} is not "nullable", but got passed a None value'.format(field_name))
            # Nulls are stored as-is; codecs are never asked to encode None.
            encoded_dict[field_name] = None
        elif schema_field.codec:
            encoded_dict[field_name] = schema_field.codec.encode(schema_field, value)
        elif isinstance(value, (np.generic,)):
            # Without a codec, numpy scalars are converted to plain Python values.
            encoded_dict[field_name] = value.tolist()
        else:
            encoded_dict[field_name] = value

    field_list = list(unischema.fields.keys())
    # generate a value list which match the schema column order.
    value_list = [encoded_dict[name] for name in field_list]
    # create a row by value list
    row = pyspark.Row(*value_list)
    # set row fields
    row.__fields__ = field_list
    return row
开发者ID:uber,项目名称:petastorm,代码行数:50,代码来源:unischema.py

示例14: create_many_columns_non_petastorm_dataset

# 需要导入模块: import pyspark [as 别名]
# 或者: from pyspark import Row [as 别名]
def create_many_columns_non_petastorm_dataset(output_url, num_rows, num_columns=1000, num_files=4, spark=None):
    """Creates a dataset with the following properties (used in tests)

    1. Has ``num_columns`` columns (1000 by default)
    2. Each column is an int32 integer
    3. Parquet store consists of 4 files (controlled by ``num_files`` argument)

    :param output_url: The dataset is written to this url (e.g. ``file:///tmp/some_directory``)
    :param num_rows: Number of rows in the generated dataset
    :param num_columns: Number of columns (1000 is the default)
    :param num_files: Number of parquet files that will be created in the store
    :param spark: An instance of SparkSession object. A new instance will be created if non specified
    :return: A list with one dictionary (column name -> value) per generated row
    """
    shutdown = False
    if not spark:
        spark_session = SparkSession \
            .builder \
            .appName('petastorm_end_to_end_test') \
            .master('local[*]')

        spark = spark_session.getOrCreate()
        shutdown = True

    # A session created here is stopped even when generating or writing the data fails.
    try:
        column_names = ['col_{}'.format(col_id) for col_id in range(num_columns)]

        def generate_row(i):
            # Every column of row ``i`` holds the same value.
            return {column_name: i * 10000 for column_name in column_names}

        expected_data = [generate_row(row_number) for row_number in range(num_rows)]

        rows = [Row(**row) for row in expected_data]

        # WARNING: surprisingly, schema fields and row fields are matched only by order and not name.
        schema = StructType([StructField(column_name, IntegerType(), False) for column_name in column_names])

        dataframe = spark.createDataFrame(rows, schema)
        dataframe. \
            coalesce(num_files). \
            write.option('compression', 'none'). \
            mode('overwrite'). \
            parquet(output_url)
    finally:
        if shutdown:
            spark.stop()

    return expected_data
开发者ID:uber,项目名称:petastorm,代码行数:49,代码来源:test_common.py


注:本文中的pyspark.Row方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。