

Python pyspark.Row code examples

This article collects typical usage examples of pyspark.Row in Python. If you are wondering what pyspark.Row does, how to use it, or what it looks like in real code, the curated examples below should help. You can also explore other usage examples from the pyspark package.


The following presents 14 code examples of pyspark.Row, collected from open-source projects and sorted by popularity by default.
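
Before the project excerpts, a minimal sketch of basic Row usage may help frame them. It uses only the public PySpark API (the canonical import path is pyspark.sql.Row); the names person and Person are illustrative.

# Minimal sketch of basic pyspark Row usage (illustrative, not taken from the examples below).
from pyspark.sql import Row

# Rows behave like named tuples: fields are set by keyword and read by attribute, index, or name.
person = Row(name="Alice", age=5)
print(person.name)       # 'Alice'
print(person["age"])     # 5
print(person.asDict())   # {'name': 'Alice', 'age': 5}

# A Row "class" can also be declared first and then instantiated with positional values.
Person = Row("name", "age")
print(Person("Bob", 7))  # Row(name='Bob', age=7)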

Example 1: _convertOutputToImage

# Required import: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def _convertOutputToImage(self, df, tfs_output_col, output_shape):
        assert len(output_shape) == 4, str(output_shape) + " does not have 4 dimensions"
        height = int(output_shape[1])
        width = int(output_shape[2])

        def to_image(orig_image, numeric_data):
            # Assume the returned image has float pixels but same #channels as input
            mode = imageIO.imageTypeByName('CV_32FC%d' % orig_image.nChannels)
            data = bytearray(np.array(numeric_data).astype(np.float32).tobytes())
            nChannels = orig_image.nChannels
            return Row(
                origin="",
                mode=mode.ord,
                height=height,
                width=width,
                nChannels=nChannels,
                data=data)

        to_image_udf = udf(to_image, ImageSchema.imageSchema['image'].dataType)
        resDf = df.withColumn(self.getOutputCol(),
                              to_image_udf(df[self.getInputCol()], df[tfs_output_col]))
        return resDf.drop(tfs_output_col) 
Developer: databricks, Project: spark-deep-learning, Lines: 24, Source: tf_image.py
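
Example 1 depends on helpers from the spark-deep-learning project (imageIO, ImageSchema). The underlying pattern, a Python UDF that returns Row objects matching a declared struct return type, can be sketched with plain PySpark; the struct below is a simplified, hypothetical stand-in for the image schema.

# Standalone sketch of the "UDF returning Rows" pattern used above (simplified schema, not ImageSchema).
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

spark = SparkSession.builder.master("local[*]").appName("row_udf_sketch").getOrCreate()

out_type = StructType([
    StructField("origin", StringType()),
    StructField("height", IntegerType()),
    StructField("width", IntegerType()),
])

def to_struct(h, w):
    # The Row field names match the declared struct fields above.
    return Row(origin="", height=h, width=w)

to_struct_udf = udf(to_struct, out_type)

df = spark.createDataFrame([(3, 4), (5, 6)], ["h", "w"])
df.withColumn("img", to_struct_udf(df["h"], df["w"])).show(truncate=False)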

Example 2: imageArrayToStruct

# Required import: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def imageArrayToStruct(imgArray, origin=""):
    """
    Create a row representation of an image from an image array.

    :param imgArray: ndarray, image data.
    :param origin: str, optional origin URI to store in the row.
    :return: Row, image as a DataFrame Row with schema==ImageSchema.
    """
    # Sometimes tensors have a leading "batch-size" dimension. Assume to be 1 if it exists.
    if len(imgArray.shape) == 4:
        if imgArray.shape[0] != 1:
            raise ValueError(
                "The first dimension of a 4-d image array is expected to be 1.")
        imgArray = imgArray.reshape(imgArray.shape[1:])
    imageType = _arrayToOcvMode(imgArray)
    height, width, nChannels = imgArray.shape
    data = bytearray(imgArray.tobytes())
    return Row(origin=origin, mode=imageType.ord, height=height,
               width=width, nChannels=nChannels, data=data) 
Developer: databricks, Project: spark-deep-learning, Lines: 20, Source: imageIO.py
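
A possible way to call this helper, assuming it is imported from the project's imageIO module (the import path and the made-up origin string are assumptions):

# Hypothetical usage sketch for imageArrayToStruct.
import numpy as np
from sparkdl.image.imageIO import imageArrayToStruct  # assumed import path

img = (np.random.rand(32, 64, 3) * 255).astype(np.uint8)  # HxWxC uint8 image
row = imageArrayToStruct(img, origin="memory://example")
print(row.height, row.width, row.nChannels)  # 32 64 3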

Example 3: imageStructToPIL

# Required import: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def imageStructToPIL(imageRow):
    """
    Convert the image from an image schema struct to a PIL image.

    :param imageRow: Row, must have ImageSchema
    :return: PIL image
    """
    imgType = imageTypeByOrdinal(imageRow.mode)
    if imgType.dtype != 'uint8':
        raise ValueError("Can not convert image of type " +
                         imgType.dtype + " to PIL, can only deal with 8U format")
    ary = imageStructToArray(imageRow)
    # PIL expects RGB order, image schema is BGR
    # => we need to flip the order unless there is only one channel
    if imgType.nChannels != 1:
        ary = _reverseChannels(ary)
    if imgType.nChannels == 1:
        return Image.fromarray(obj=ary, mode='L')
    elif imgType.nChannels == 3:
        return Image.fromarray(obj=ary, mode='RGB')
    elif imgType.nChannels == 4:
        return Image.fromarray(obj=ary, mode='RGBA')
    else:
        raise ValueError("don't know how to convert " +
                         imgType.name + " to PIL") 
Developer: databricks, Project: spark-deep-learning, Lines: 27, Source: imageIO.py
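
A hypothetical round trip from a numpy array to a PIL image via the two helpers above (import path assumed; Pillow must be installed):

# Hypothetical sketch: build an 8-bit image Row and convert it to a PIL image.
import numpy as np
from sparkdl.image.imageIO import imageArrayToStruct, imageStructToPIL  # assumed import path

bgr = (np.random.rand(16, 16, 3) * 255).astype(np.uint8)  # BGR channel order, as in the image schema
row = imageArrayToStruct(bgr)
pil_img = imageStructToPIL(row)  # channels are flipped back to RGB internally
pil_img.save('/tmp/example.png')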

Example 4: process

# Required import: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def process(timestamp, rdd):
    try:
        # Get the singleton instance of SparkSession
        spark = get_session(rdd.context.getConf())

        # Convert RDD[List[String]] to RDD[Row] to DataFrame
        rows = rdd.flatMap(lambda a: a).map(lambda w: Row(word=w))

        words_df = spark.createDataFrame(rows)

        # Creates a temporary view using the DataFrame
        words_df.createOrReplaceTempView('words')

        # Do word count on table using SQL and print it
        sql = "SELECT word, COUNT(1) AS total FROM words GROUP BY word"
        word_count_df = spark.sql(sql)
        word_count_df.show()
    except Exception:
        # Ignore failures (e.g. empty micro-batches) so the streaming job keeps running.
        pass 
Developer: ksindi, Project: kafka-compose, Lines: 21, Source: process.py
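
This callback is designed to be passed to foreachRDD on a DStream; process() also relies on the project's get_session helper. A minimal sketch of the wiring, with an illustrative socket source standing in for the project's Kafka stream:

# Illustrative wiring sketch; the socket source and host/port are placeholders.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName="word_count_stream")
ssc = StreamingContext(sc, batchDuration=10)

lines = ssc.socketTextStream("localhost", 9999)
words = lines.map(lambda line: line.split())  # DStream[List[str]], as process() expects

# process(timestamp, rdd) is invoked once per micro-batch.
words.foreachRDD(process)

ssc.start()
ssc.awaitTermination()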

Example 5: imageStructToArray

# Required import: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def imageStructToArray(imageRow):
    """
    Convert an image to a numpy array.

    :param imageRow: Row, must use imageSchema.
    :return: ndarray, image data.
    """
    imType = imageTypeByOrdinal(imageRow.mode)
    shape = (imageRow.height, imageRow.width, imageRow.nChannels)
    return np.ndarray(shape, imType.dtype, imageRow.data) 
Developer: databricks, Project: spark-deep-learning, Lines: 12, Source: imageIO.py
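
Together with imageArrayToStruct from Example 2, this gives a simple round trip between numpy arrays and image Rows, which should be lossless for 8-bit data (import path assumed):

# Hypothetical round-trip sketch between an ndarray and an image Row.
import numpy as np
from sparkdl.image.imageIO import imageArrayToStruct, imageStructToArray  # assumed import path

original = (np.random.rand(8, 8, 3) * 255).astype(np.uint8)
row = imageArrayToStruct(original)
restored = imageStructToArray(row)
assert np.array_equal(original, restored)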

Example 6: build_vocabularies

# Required import: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def build_vocabularies(self, rows: RDD):
        """
        Process rows to gather values and paths with their frequencies.

        :param rows: RDD of rows with structure ((key, doc), val) where:
            * key: str with the path context
            * doc: file name
            * val: number of occurrences of key in doc
        :return: value2index, path2index, value2freq, path2freq dictionaries
        """

        def _flatten_row(row: Row):
            # 2: removes the namespace v. from the string to parse it as tuple
            k = Vocabulary2Id._unstringify_path_context(row)
            return [(k[0], 1), (k[1], 1), (k[2], 1)]

        rows = rows \
            .flatMap(_flatten_row) \
            .reduceByKey(operator.add) \
            .persist()

        values = rows.filter(lambda x: type(x[0]) == str).collect()
        paths = rows.filter(lambda x: type(x[0]) == tuple).collect()

        value2index = {w: id for id, (w, _) in enumerate(values)}
        path2index = {w: id for id, (w, _) in enumerate(paths)}
        value2freq = {w: freq for _, (w, freq) in enumerate(values)}
        path2freq = {w: freq for _, (w, freq) in enumerate(paths)}

        rows.unpersist()

        return value2index, path2index, value2freq, path2freq 
Developer: src-d, Project: code2vec, Lines: 32, Source: vocabulary2id.py

Example 7: build_doc2pc

# Required import: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def build_doc2pc(self, value2index: dict, path2index: dict, rows: RDD):
        """
        Process rows and build elements (doc, [path_context_1, path_context_2, ...]).

        :param value2index: value -> id mapping
        :param path2index: path -> id mapping
        :param rows: RDD of ((key, doc), val) rows, as in build_vocabularies
        """

        bc_value2index = self.sc.broadcast(value2index)
        bc_path2index = self.sc.broadcast(path2index)

        def _doc2pc(row: Row):
            (u, path, v), doc = Vocabulary2Id._unstringify_path_context(row), row[0][1]

            return doc, (bc_value2index.value[u], bc_path2index.value[path],
                         bc_value2index.value[v])

        rows = rows \
            .map(_doc2pc) \
            .distinct() \
            .combineByKey(lambda value: [value],
                          lambda x, value: x + [value],
                          lambda x, y: x + y)

        bc_value2index.unpersist(blocking=True)
        bc_path2index.unpersist(blocking=True)

        return rows 
Developer: src-d, Project: code2vec, Lines: 29, Source: vocabulary2id.py

Example 8: align_type

# Required import: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def align_type(init_condition: dict):
    def f(d):
        for y, x in init_condition.items():
            d[y] = type(x)(d[y])
        return Row(**d)
    return f

### Typeful Conversion: to Spark
# rdd -> spark
### Typeless Conversion: to Spark
# (rdd -> pandas) -> spark 
Developer: cadCAD-org, Project: cadCAD, Lines: 13, Source: sys_exec.py
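
A small usage sketch for align_type: it casts each field of a record back to the type it has in the initial condition before wrapping the record in a Row, so an RDD of loosely typed dicts can become a typed DataFrame. The initial condition and records below are made up for illustration.

# Illustrative sketch: align record types to an initial condition, then build Rows.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("align_type_sketch").getOrCreate()

init_condition = {"step": 0, "value": 0.0}      # defines the target types (int, float)
records = [{"step": "1", "value": "3.5"},       # values arrive as strings
           {"step": "2", "value": "4.0"}]

rows = spark.sparkContext.parallelize(records).map(align_type(init_condition))
spark.createDataFrame(rows).show()  # step is cast to int, value to float, before Row(**d)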

Example 9: test_dict_to_spark_row_field_validation_scalar_types

# Required import: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def test_dict_to_spark_row_field_validation_scalar_types():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'string_field': 'abc'}), Row)

    # Not a nullable field
    with pytest.raises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'string_field': None}), Row)

    # Wrong field type
    with pytest.raises(TypeError):
        isinstance(dict_to_spark_row(TestSchema, {'string_field': []}), Row) 
Developer: uber, Project: petastorm, Lines: 17, Source: test_unischema.py

Example 10: test_dict_to_spark_row_field_validation_scalar_nullable

# Required import: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def test_dict_to_spark_row_field_validation_scalar_nullable():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), True),
        UnischemaField('nullable_implicitly_set', np.string_, (), ScalarCodec(StringType()), True),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'string_field': None}), Row) 
Developer: uber, Project: petastorm, Lines: 10, Source: test_unischema.py

Example 11: test_dict_to_spark_row_field_validation_ndarrays

# Required import: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def test_dict_to_spark_row_field_validation_ndarrays():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('tensor3d', np.float32, (10, 20, 30), NdarrayCodec(), False),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'tensor3d': np.zeros((10, 20, 30), dtype=np.float32)}), Row)

    # Null value into not nullable field
    with pytest.raises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'string_field': None}), Row)

    # Wrong dimensions
    with pytest.raises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'string_field': np.zeros((1, 2, 3), dtype=np.float32)}), Row) 
Developer: uber, Project: petastorm, Lines: 17, Source: test_unischema.py

Example 12: ping_to_row

# Required import: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def ping_to_row(ping):
    return Row(client_id=ping["clientId"], os=ping["environment/system/os/name"]) 
Developer: mozilla, Project: python_mozetl, Lines: 4, Source: main.py
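
A minimal sketch of how such a mapper is typically applied; the ping payloads are made-up stand-ins for Telemetry pings.

# Illustrative sketch: turn raw ping dicts into Rows, then into a DataFrame.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("ping_to_row_sketch").getOrCreate()

pings = [
    {"clientId": "abc-123", "environment/system/os/name": "Windows_NT"},
    {"clientId": "def-456", "environment/system/os/name": "Darwin"},
]

rows = spark.sparkContext.parallelize(pings).map(ping_to_row)
spark.createDataFrame(rows).show()  # columns: client_id, os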

Example 13: dict_to_spark_row

# Required import: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def dict_to_spark_row(unischema, row_dict):
    """Converts a single row into a spark Row object.

    Verifies that the data conforms to the unischema definition types and encodes the data using the codec specified
    by the unischema.

    The parameters are keywords to allow use of functools.partial.

    :param unischema: an instance of Unischema object
    :param row_dict: a dictionary where the keys match name of fields in the unischema.
    :return: a single pyspark.Row object
    """

    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader)
    import pyspark

    assert isinstance(unischema, Unischema)
    # Add null fields. Be careful not to mutate the input dictionary - that would be an unexpected side effect
    copy_row_dict = copy.copy(row_dict)
    insert_explicit_nulls(unischema, copy_row_dict)

    if set(copy_row_dict.keys()) != set(unischema.fields.keys()):
        raise ValueError('Dictionary fields \n{}\n do not match schema fields \n{}'.format(
            '\n'.join(sorted(copy_row_dict.keys())), '\n'.join(unischema.fields.keys())))

    encoded_dict = {}
    for field_name, value in copy_row_dict.items():
        schema_field = unischema.fields[field_name]
        if value is None:
            if not schema_field.nullable:
                raise ValueError('Field {} is not "nullable", but got passed a None value'.format(field_name))
        if schema_field.codec:
            encoded_dict[field_name] = schema_field.codec.encode(schema_field, value) if value is not None else None
        else:
            if isinstance(value, (np.generic,)):
                encoded_dict[field_name] = value.tolist()
            else:
                encoded_dict[field_name] = value

    field_list = list(unischema.fields.keys())
    # Generate a value list that matches the schema column order.
    value_list = [encoded_dict[name] for name in field_list]
    # Create a Row from the value list.
    row = pyspark.Row(*value_list)
    # Set the field names so the Row keeps the schema's column names.
    row.__fields__ = field_list
    return row 
Developer: uber, Project: petastorm, Lines: 50, Source: unischema.py
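
Examples 9-11 above already exercise this function; for completeness, here is a compact usage sketch that combines a Unischema definition with dict_to_spark_row. The schema name and fields are made up, but the imports follow petastorm's public API as used in those tests.

# Usage sketch based on the petastorm test code in Examples 9-11; schema and values are illustrative.
import numpy as np
from pyspark.sql.types import StringType
from petastorm.codecs import NdarrayCodec, ScalarCodec
from petastorm.unischema import Unischema, UnischemaField, dict_to_spark_row

MySchema = Unischema('MySchema', [
    UnischemaField('id', np.string_, (), ScalarCodec(StringType()), False),
    UnischemaField('image', np.uint8, (2, 2), NdarrayCodec(), False),
])

row = dict_to_spark_row(MySchema, {'id': 'sample-1',
                                   'image': np.zeros((2, 2), dtype=np.uint8)})
print(row)  # a pyspark Row whose field order follows the schema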

Example 14: create_many_columns_non_petastorm_dataset

# Required import: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def create_many_columns_non_petastorm_dataset(output_url, num_rows, num_columns=1000, num_files=4, spark=None):
    """Creates a dataset with the following properties (used in tests)

    1. Has ``num_columns`` columns (1000 by default)
    2. Each column is an int32 integer
    3. The Parquet store consists of ``num_files`` files (4 by default)

    :param output_url: The dataset is written to this url (e.g. ``file:///tmp/some_directory``)
    :param num_rows: Number of rows in the generated dataset
    :param num_columns: Number of columns (1000 is the default)
    :param num_files: Number of parquet files that will be created in the store
    :param spark: An instance of SparkSession object. A new instance will be created if none is specified
    :return: A list of row dictionaries holding the expected data
    """
    shutdown = False
    if not spark:
        spark_session = SparkSession \
            .builder \
            .appName('petastorm_end_to_end_test') \
            .master('local[*]')

        spark = spark_session.getOrCreate()
        shutdown = True

    column_names = ['col_{}'.format(col_id) for col_id in range(num_columns)]

    def generate_row(i):
        return {'col_{}'.format(col_id): i * 10000 for col_id, col_name in enumerate(column_names)}

    expected_data = [generate_row(row_number) for row_number in range(num_rows)]

    rows = [Row(**row) for row in expected_data]

    # WARNING: surprisingly, schema fields and row fields are matched only by order and not name.
    schema = StructType([StructField(column_name, IntegerType(), False) for column_name in column_names])

    dataframe = spark.createDataFrame(rows, schema)
    dataframe. \
        coalesce(num_files). \
        write.option('compression', 'none'). \
        mode('overwrite'). \
        parquet(output_url)

    if shutdown:
        spark.stop()

    return expected_data 
Developer: uber, Project: petastorm, Lines: 49, Source: test_common.py
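
A sketch of calling this helper from a test; the output path and sizes are arbitrary examples.

# Illustrative call; writes a small Parquet dataset and returns the expected row dicts.
expected = create_many_columns_non_petastorm_dataset(
    'file:///tmp/many_columns_dataset', num_rows=10, num_columns=5, num_files=2)
print(len(expected), 'rows generated')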


Note: The pyspark.Row examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by various developers, and copyright remains with the original authors. For redistribution and use, please follow the corresponding project's License; do not reproduce without permission.