This article collects typical usage examples of the Python pyspark.Row method. If you have been wondering what exactly pyspark.Row does, how to call it, or what real-world usage looks like, the curated code examples below should help. You can also explore the containing module, pyspark, for further usage examples.
The following presents 14 code examples of pyspark.Row, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
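Before diving into the examples, here is a minimal sketch of how a pyspark.Row is usually constructed and read back; the field names and values are made up purely for illustration:
from pyspark.sql import Row

# Build a Row with keyword arguments; each keyword becomes a named field.
person = Row(name="Alice", age=11)
print(person.name, person["age"])  # fields are accessible as attributes or by key

# Alternatively, create a reusable Row "class" from field names and instantiate it
# positionally; such rows can be passed to spark.createDataFrame().
Person = Row("name", "age")
people = [Person("Alice", 11), Person("Bob", 12)]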
Example 1: _convertOutputToImage
# Required module: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def _convertOutputToImage(self, df, tfs_output_col, output_shape):
    assert len(output_shape) == 4, str(output_shape) + " does not have 4 dimensions"
    height = int(output_shape[1])
    width = int(output_shape[2])

    def to_image(orig_image, numeric_data):
        # Assume the returned image has float pixels but the same #channels as the input
        mode = imageIO.imageTypeByName('CV_32FC%d' % orig_image.nChannels)
        data = bytearray(np.array(numeric_data).astype(np.float32).tobytes())
        nChannels = orig_image.nChannels
        return Row(
            origin="",
            mode=mode.ord,
            height=height,
            width=width,
            nChannels=nChannels,
            data=data)

    to_image_udf = udf(to_image, ImageSchema.imageSchema['image'].dataType)
    resDf = df.withColumn(self.getOutputCol(),
                          to_image_udf(df[self.getInputCol()], df[tfs_output_col]))
    return resDf.drop(tfs_output_col)
Example 2: imageArrayToStruct
# Required module: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def imageArrayToStruct(imgArray, origin=""):
    """
    Create a row representation of an image from an image array.

    :param imgArray: ndarray, image data.
    :param origin: str, optional origin (e.g. source file URI) stored in the row.
    :return: Row, image as a DataFrame Row with schema==ImageSchema.
    """
    # Sometimes tensors have a leading "batch-size" dimension. Assume it to be 1 if it exists.
    if len(imgArray.shape) == 4:
        if imgArray.shape[0] != 1:
            raise ValueError(
                "The first dimension of a 4-d image array is expected to be 1.")
        imgArray = imgArray.reshape(imgArray.shape[1:])
    imageType = _arrayToOcvMode(imgArray)
    height, width, nChannels = imgArray.shape
    data = bytearray(imgArray.tobytes())
    return Row(origin=origin, mode=imageType.ord, height=height,
               width=width, nChannels=nChannels, data=data)
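A possible usage sketch for the function above, assuming it and the sparkdl image helpers it calls (such as _arrayToOcvMode) are importable; the array contents below are made up:
import numpy as np

# A fake 4x4, 3-channel (BGR) uint8 image, purely for illustration.
img = np.zeros((4, 4, 3), dtype=np.uint8)
row = imageArrayToStruct(img, origin="file:/tmp/fake.png")
print(row.height, row.width, row.nChannels)  # 4 4 3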
Example 3: imageStructToPIL
# Required module: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def imageStructToPIL(imageRow):
    """
    Convert the image from the image schema struct to a PIL image.

    :param imageRow: Row, must have ImageSchema
    :return: PIL image
    """
    imgType = imageTypeByOrdinal(imageRow.mode)
    if imgType.dtype != 'uint8':
        raise ValueError("Can not convert image of type " +
                         imgType.dtype + " to PIL, can only deal with 8U format")
    ary = imageStructToArray(imageRow)
    # PIL expects RGB order, image schema is BGR
    # => we need to flip the order unless there is only one channel
    if imgType.nChannels != 1:
        ary = _reverseChannels(ary)
    if imgType.nChannels == 1:
        return Image.fromarray(obj=ary, mode='L')
    elif imgType.nChannels == 3:
        return Image.fromarray(obj=ary, mode='RGB')
    elif imgType.nChannels == 4:
        return Image.fromarray(obj=ary, mode='RGBA')
    else:
        raise ValueError("don't know how to convert " +
                         imgType.name + " to PIL")
Example 4: process
# Required module: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def process(timestamp, rdd):
    try:
        # Get the singleton instance of SparkSession
        spark = get_session(rdd.context.getConf())
        # Convert RDD[List[String]] to RDD[Row] to DataFrame
        rows = rdd.flatMap(lambda a: a).map(lambda w: Row(word=w))
        words_df = spark.createDataFrame(rows)
        # Creates a temporary view using the DataFrame
        words_df.createOrReplaceTempView('words')
        # Do word count on the table using SQL and print it
        sql = "SELECT word, COUNT(1) AS total FROM words GROUP BY word"
        word_count_df = spark.sql(sql)
        word_count_df.show()
    except:
        pass
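For context, here is a sketch of how a handler like process is commonly wired into a Spark Streaming job with foreachRDD; the socket source, port, and batch interval are assumptions, not part of the original snippet:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName="word_count_stream")
ssc = StreamingContext(sc, batchDuration=5)

# Each line is split into a list of words; process() then receives (batch time, RDD) per batch.
lines = ssc.socketTextStream("localhost", 9999)
words = lines.map(lambda line: line.split())
words.foreachRDD(process)

ssc.start()
ssc.awaitTermination()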
Example 5: imageStructToArray
# Required module: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def imageStructToArray(imageRow):
    """
    Convert an image to a numpy array.

    :param imageRow: Row, must use imageSchema.
    :return: ndarray, image data.
    """
    imType = imageTypeByOrdinal(imageRow.mode)
    shape = (imageRow.height, imageRow.width, imageRow.nChannels)
    return np.ndarray(shape, imType.dtype, imageRow.data)
Example 6: build_vocabularies
# Required module: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def build_vocabularies(self, rows: RDD):
    """
    Process rows to gather values and paths with their frequencies.

    :param rows: row structure is ((key, doc), val) where:
        * key: str with the path context
        * doc: file name
        * val: number of occurrences of key in doc
    :return: tuple of dicts (value2index, path2index, value2freq, path2freq)
    """
    def _flatten_row(row: Row):
        # removes the namespace "v." from the string to parse it as a tuple
        k = Vocabulary2Id._unstringify_path_context(row)
        return [(k[0], 1), (k[1], 1), (k[2], 1)]

    rows = rows \
        .flatMap(_flatten_row) \
        .reduceByKey(operator.add) \
        .persist()

    values = rows.filter(lambda x: type(x[0]) == str).collect()
    paths = rows.filter(lambda x: type(x[0]) == tuple).collect()

    value2index = {w: id for id, (w, _) in enumerate(values)}
    path2index = {w: id for id, (w, _) in enumerate(paths)}
    value2freq = {w: freq for _, (w, freq) in enumerate(values)}
    path2freq = {w: freq for _, (w, freq) in enumerate(paths)}

    rows.unpersist()

    return value2index, path2index, value2freq, path2freq
Example 7: build_doc2pc
# Required module: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def build_doc2pc(self, value2index: dict, path2index: dict, rows: RDD):
    """
    Process rows and build elements (doc, [path_context_1, path_context_2, ...])

    :param value2index: value -> id mapping
    :param path2index: path -> id mapping
    :param rows: RDD of path-context rows
    :return: RDD of (doc, [path_context_ids, ...]) elements
    """
    bc_value2index = self.sc.broadcast(value2index)
    bc_path2index = self.sc.broadcast(path2index)

    def _doc2pc(row: Row):
        (u, path, v), doc = Vocabulary2Id._unstringify_path_context(row), row[0][1]
        return doc, (bc_value2index.value[u], bc_path2index.value[path],
                     bc_value2index.value[v])

    rows = rows \
        .map(_doc2pc) \
        .distinct() \
        .combineByKey(lambda value: [value],
                      lambda x, value: x + [value],
                      lambda x, y: x + y)

    bc_value2index.unpersist(blocking=True)
    bc_path2index.unpersist(blocking=True)

    return rows
Example 8: align_type
# Required module: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def align_type(init_condition: dict):
    def f(d):
        for y, x in init_condition.items():
            d[y] = type(x)(d[y])
        return Row(**d)
    return f


### Typeful Conversion: to Spark
# rdd -> spark

### Typeless Conversion: to Spark
# (rdd -> pandas) -> spark
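The two comments above name conversion paths without showing them; a rough sketch of both, assuming an existing SparkSession spark, an RDD of dicts rdd, and an illustrative init_condition:
import pandas as pd

# Typeful conversion: rdd -> spark
# Coerce each record to the types found in init_condition, then build Rows directly.
init_condition = {"id": 0, "score": 0.0}  # hypothetical reference record
typed_df = spark.createDataFrame(rdd.map(align_type(init_condition)))

# Typeless conversion: (rdd -> pandas) -> spark
# Collect to the driver, let pandas infer column types, then hand the frame to Spark.
pdf = pd.DataFrame(rdd.collect())
untyped_df = spark.createDataFrame(pdf)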
Example 9: test_dict_to_spark_row_field_validation_scalar_types
# Required module: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def test_dict_to_spark_row_field_validation_scalar_types():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'string_field': 'abc'}), Row)

    # Not a nullable field
    with pytest.raises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'string_field': None}), Row)

    # Wrong field type
    with pytest.raises(TypeError):
        isinstance(dict_to_spark_row(TestSchema, {'string_field': []}), Row)
Example 10: test_dict_to_spark_row_field_validation_scalar_nullable
# Required module: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def test_dict_to_spark_row_field_validation_scalar_nullable():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), True),
        UnischemaField('nullable_implicitly_set', np.string_, (), ScalarCodec(StringType()), True),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'string_field': None}), Row)
Example 11: test_dict_to_spark_row_field_validation_ndarrays
# Required module: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def test_dict_to_spark_row_field_validation_ndarrays():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('tensor3d', np.float32, (10, 20, 30), NdarrayCodec(), False),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'tensor3d': np.zeros((10, 20, 30), dtype=np.float32)}), Row)

    # Null value into a non-nullable field
    with pytest.raises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'string_field': None}), Row)

    # Wrong dimensions
    with pytest.raises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'string_field': np.zeros((1, 2, 3), dtype=np.float32)}), Row)
Example 12: ping_to_row
# Required module: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def ping_to_row(ping):
    return Row(client_id=ping["clientId"], os=ping["environment/system/os/name"])
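A minimal usage sketch, mapping an RDD of raw ping dictionaries through ping_to_row and building a DataFrame; the SparkContext/SparkSession objects and ping contents are assumed for illustration:
pings = sc.parallelize([
    {"clientId": "abc-123", "environment/system/os/name": "Linux"},
    {"clientId": "def-456", "environment/system/os/name": "Windows_NT"},
])
df = spark.createDataFrame(pings.map(ping_to_row))
df.show()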
Example 13: dict_to_spark_row
# Required module: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def dict_to_spark_row(unischema, row_dict):
    """Converts a single row into a spark Row object.

    Verifies that the data conforms to the unischema definition types and encodes the data using the codec specified
    by the unischema.

    The parameters are keywords to allow use of functools.partial.

    :param unischema: an instance of Unischema object
    :param row_dict: a dictionary where the keys match names of fields in the unischema.
    :return: a single pyspark.Row object
    """
    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader)
    import pyspark

    assert isinstance(unischema, Unischema)
    # Add null fields. Be careful not to mutate the input dictionary - that would be an unexpected side effect
    copy_row_dict = copy.copy(row_dict)
    insert_explicit_nulls(unischema, copy_row_dict)

    if set(copy_row_dict.keys()) != set(unischema.fields.keys()):
        raise ValueError('Dictionary fields \n{}\n do not match schema fields \n{}'.format(
            '\n'.join(sorted(copy_row_dict.keys())), '\n'.join(unischema.fields.keys())))

    encoded_dict = {}
    for field_name, value in copy_row_dict.items():
        schema_field = unischema.fields[field_name]
        if value is None:
            if not schema_field.nullable:
                raise ValueError('Field {} is not "nullable", but got passed a None value'.format(field_name))
        if schema_field.codec:
            encoded_dict[field_name] = schema_field.codec.encode(schema_field, value) if value is not None else None
        else:
            if isinstance(value, (np.generic,)):
                encoded_dict[field_name] = value.tolist()
            else:
                encoded_dict[field_name] = value

    field_list = list(unischema.fields.keys())
    # generate a value list which matches the schema column order
    value_list = [encoded_dict[name] for name in field_list]
    # create a row from the value list
    row = pyspark.Row(*value_list)
    # set row fields
    row.__fields__ = field_list
    return row
Example 14: create_many_columns_non_petastorm_dataset
# Required module: import pyspark [as alias]
# Or: from pyspark import Row [as alias]
def create_many_columns_non_petastorm_dataset(output_url, num_rows, num_columns=1000, num_files=4, spark=None):
    """Creates a dataset with the following properties (used in tests)

    1. Has ``num_columns`` columns (1000 by default)
    2. Each column is an int32 integer
    3. The Parquet store consists of ``num_files`` files (4 by default)

    :param output_url: The dataset is written to this url (e.g. ``file:///tmp/some_directory``)
    :param num_rows: Number of rows in the generated dataset
    :param num_columns: Number of columns (1000 is the default)
    :param num_files: Number of parquet files that will be created in the store
    :param spark: An instance of SparkSession object. A new instance will be created if none is specified
    :return: A list of dictionaries with the expected data
    """
    shutdown = False
    if not spark:
        spark_session = SparkSession \
            .builder \
            .appName('petastorm_end_to_end_test') \
            .master('local[*]')
        spark = spark_session.getOrCreate()
        shutdown = True

    column_names = ['col_{}'.format(col_id) for col_id in range(num_columns)]

    def generate_row(i):
        return {'col_{}'.format(col_id): i * 10000 for col_id, col_name in enumerate(column_names)}

    expected_data = [generate_row(row_number) for row_number in range(num_rows)]
    rows = [Row(**row) for row in expected_data]

    # WARNING: surprisingly, schema fields and row fields are matched only by order and not name.
    schema = StructType([StructField(column_name, IntegerType(), False) for column_name in column_names])
    dataframe = spark.createDataFrame(rows, schema)

    dataframe. \
        coalesce(num_files). \
        write.option('compression', 'none'). \
        mode('overwrite'). \
        parquet(output_url)

    if shutdown:
        spark.stop()

    return expected_data
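A possible sanity check for the helper above: write a small store, read it back with plain Spark, and compare row counts; the output path and sizes are illustrative:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[*]').appName('read_back_check').getOrCreate()
expected = create_many_columns_non_petastorm_dataset(
    'file:///tmp/many_columns', num_rows=10, num_columns=5, num_files=2, spark=spark)

df = spark.read.parquet('file:///tmp/many_columns')
assert df.count() == len(expected)
spark.stop()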