This page collects typical usage examples of pyspark.sql.types.BinaryType in Python. If you have been wondering what types.BinaryType does, how to use it, or what real code that uses it looks like, the curated examples below may help. You can also explore further usage examples from the module it lives in, pyspark.sql.types.
The following presents 8 code examples of types.BinaryType, sorted by popularity by default.
Example 1: filesToDF

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import BinaryType [as alias]
def filesToDF(sc, path, numPartitions=None):
    """
    Read files from a directory to a DataFrame.

    :param sc: SparkContext.
    :param path: str, path to files.
    :param numPartitions: int, number of partitions to use for reading files.
    :return: DataFrame, with columns: (filePath: str, fileData: BinaryType)
    """
    numPartitions = numPartitions or sc.defaultParallelism
    schema = StructType([StructField("filePath", StringType(), False),
                         StructField("fileData", BinaryType(), False)])
    rdd = sc.binaryFiles(
        path, minPartitions=numPartitions).repartition(numPartitions)
    rdd = rdd.map(lambda x: (x[0], bytearray(x[1])))
    return rdd.toDF(schema)
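A minimal usage sketch, assuming an active SparkContext and a directory of files on disk (the path below is illustrative):

# Hypothetical usage: load every file under /tmp/images as (path, bytes) rows.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("filesToDF-demo").getOrCreate()
df = filesToDF(spark.sparkContext, "/tmp/images", numPartitions=8)
df.printSchema()  # filePath: string, fileData: binary
print(df.count(), "files read")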
Example 2: test_filesTODF

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import BinaryType [as alias]
def test_filesTODF(self):
    df = imageIO.filesToDF(self.binaryFilesMock, "path", 217)
    self.assertEqual(df.rdd.getNumPartitions(), 217)
    self.assertEqual(df.schema.fields[0].dataType, StringType())
    self.assertEqual(df.schema.fields[1].dataType, BinaryType())
    first = df.first()
    self.assertTrue(hasattr(first, "filePath"))
    self.assertEqual(type(first.fileData), bytearray)

# TODO: make unit tests for arrayToImageRow on arrays of varying shapes, channels, dtypes.
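The test relies on `self.binaryFilesMock`, a stand-in for a SparkContext. A hedged sketch of what such a mock might look like (the class name and fixture layout are hypothetical):

# Hypothetical mock: mimics the two SparkContext members filesToDF touches,
# binaryFiles() and defaultParallelism, backed by a real SparkContext.
class BinaryFilesMock(object):
    defaultParallelism = 4

    def __init__(self, sc, files):
        self._sc = sc        # real SparkContext used to build test RDDs
        self._files = files  # list of (filePath, rawBytes) tuples

    def binaryFiles(self, path, minPartitions=None):
        return self._sc.parallelize(self._files, minPartitions or 2)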
Example 3: spark_dtype

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import BinaryType [as alias]
def spark_dtype(self):
    # Lazily import pyspark to avoid creating a pyspark dependency on the data
    # reading code path (currently works only with make_batch_reader). We should
    # move all pyspark related code into a separate module.
    import pyspark.sql.types as sql_types
    return sql_types.BinaryType()
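The point of the function-local import is that the surrounding reader code can be used without pyspark installed; the dependency is only pulled in when someone asks for the Spark type. A minimal sketch of the same pattern (class and method context are illustrative):

# Illustrative: importing this module never imports pyspark; only calling
# spark_dtype() does.
class BinaryField(object):
    def spark_dtype(self):
        import pyspark.sql.types as sql_types  # deferred, optional dependency
        return sql_types.BinaryType()

print(BinaryField().spark_dtype())  # a BinaryType instance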
Example 4: as_spark_type

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import BinaryType [as alias]
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
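A few illustrative calls, assuming the module-level imports the function relies on (numpy as np, datetime, and pyspark.sql.types as types):

# Each call returns a pyspark DataType instance.
print(as_spark_type(int))            # IntegerType
print(as_spark_type(bytes))          # BinaryType
print(as_spark_type("double"))       # DoubleType
print(as_spark_type(datetime.date))  # DateType
try:
    as_spark_type(complex)           # unsupported -> TypeError
except TypeError as e:
    print(e)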
Example 5: __init__

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import BinaryType [as alias]
def __init__(self, series: "ks.Series"):
    if not isinstance(series.spark.data_type, (StringType, BinaryType, ArrayType)):
        raise ValueError("Cannot call StringMethods on type {}".format(series.spark.data_type))
    self._data = series
    self.name = self._data.name

# Methods
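This constructor appears to back a pandas-style `.str` accessor on a Koalas Series; a hedged usage sketch, assuming `databricks.koalas` is available:

# The type check in __init__ above is what makes the second access fail.
import databricks.koalas as ks

s = ks.Series(["spark", "binary"])
print(s.str.upper().to_pandas().tolist())  # ['SPARK', 'BINARY']

nums = ks.Series([1, 2, 3])
try:
    nums.str  # IntegerType is not String/Binary/ArrayType -> ValueError
except ValueError as e:
    print(e)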
Example 6: loadTFRecords

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import BinaryType [as alias]
def loadTFRecords(sc, input_dir, binary_features=[]):
    """Load TFRecords from disk into a Spark DataFrame.

    This will attempt to automatically convert the tf.train.Example features into Spark DataFrame columns of equivalent types.

    Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
    disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a "hint"
    from the caller in the ``binary_features`` argument.

    Args:
      :sc: SparkContext
      :input_dir: location of TFRecords on disk.
      :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

    Returns:
      A Spark DataFrame mirroring the tf.train.Example schema.
    """
    import tensorflow as tf

    tfr_rdd = sc.newAPIHadoopFile(input_dir, "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
                                  keyClass="org.apache.hadoop.io.BytesWritable",
                                  valueClass="org.apache.hadoop.io.NullWritable")

    # infer Spark SQL types from tf.Example
    record = tfr_rdd.take(1)[0]
    example = tf.train.Example()
    example.ParseFromString(bytes(record[0]))
    schema = infer_schema(example, binary_features)

    # convert serialized protobuf to tf.Example to Row
    example_rdd = tfr_rdd.mapPartitions(lambda x: fromTFExample(x, binary_features))

    # create a Spark DataFrame from RDD[Row]
    df = example_rdd.toDF(schema)

    # save reference of this dataframe
    loadedDF[df] = input_dir
    return df
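A hedged usage sketch, assuming TFRecords whose `image_raw` feature holds raw bytes and the tensorflow-hadoop connector on the classpath (the path and feature name are illustrative):

# Pass the binary hint so image_raw is decoded as BinaryType, not StringType.
df = loadTFRecords(sc, "hdfs:///data/mnist/tfr/train", binary_features=["image_raw"])
df.printSchema()  # e.g. image_raw: binary, label: bigint
df.show(3)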
Example 7: infer_schema

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import BinaryType [as alias]
def infer_schema(example, binary_features=[]):
    """Given a tf.train.Example, infer the Spark DataFrame schema (StructFields).

    Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
    disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a "hint"
    from the caller in the ``binary_features`` argument.

    Args:
      :example: a tf.train.Example
      :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

    Returns:
      A DataFrame StructType schema
    """
    def _infer_sql_type(k, v):
        # special handling for binary features
        if k in binary_features:
            return BinaryType()

        if v.int64_list.value:
            result = v.int64_list.value
            sql_type = LongType()
        elif v.float_list.value:
            result = v.float_list.value
            sql_type = DoubleType()
        else:
            result = v.bytes_list.value
            sql_type = StringType()

        if len(result) > 1:  # represent multi-item tensors as Spark SQL ArrayType() of base types
            return ArrayType(sql_type)
        else:                # represent everything else as base types (and empty tensors as StringType())
            return sql_type

    return StructType([StructField(k, _infer_sql_type(k, v), True) for k, v in sorted(example.features.feature.items())])
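A small demonstration of the hint, building a tf.train.Example by hand (the feature names are illustrative):

# One bytes feature and one int64 feature; infer the schema with and
# without the binary hint and compare the resulting image_raw type.
import tensorflow as tf

example = tf.train.Example(features=tf.train.Features(feature={
    "image_raw": tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"\x00\x01"])),
    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[7])),
}))

print(infer_schema(example))                                 # image_raw -> StringType
print(infer_schema(example, binary_features=["image_raw"]))  # image_raw -> BinaryType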
Example 8: fromTFExample

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import BinaryType [as alias]
def fromTFExample(iter, binary_features=[]):
    """mapPartitions function to convert an RDD of serialized tf.train.Example bytestrings into an RDD of Rows.

    Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
    disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a "hint"
    from the caller in the ``binary_features`` argument.

    Args:
      :iter: the RDD partition iterator
      :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

    Returns:
      An array/iterator of DataFrame Rows with features converted into columns.
    """
    import tensorflow as tf

    # convert from protobuf-like dict to DataFrame-friendly dict
    def _get_value(k, v):
        if v.int64_list.value:
            result = v.int64_list.value
        elif v.float_list.value:
            result = v.float_list.value
        else:  # string or bytearray
            if k in binary_features:
                return bytearray(v.bytes_list.value[0])
            else:
                return v.bytes_list.value[0].decode('utf-8')

        if len(result) > 1:     # represent multi-item tensors as python lists
            return list(result)
        elif len(result) == 1:  # extract scalars from single-item tensors
            return result[0]
        else:                   # represent empty tensors as python None
            return None

    results = []
    for record in iter:
        example = tf.train.Example()
        example.ParseFromString(bytes(record[0]))  # record is (bytestr, None)
        d = {k: _get_value(k, v) for k, v in sorted(example.features.feature.items())}
        row = Row(**d)
        results.append(row)
    return results
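To see the conversion end to end, the function can be driven by hand with a single serialized record, mimicking the (bytestring, None) pairs that TFRecordFileInputFormat produces (this reuses the example built in the previous snippet and assumes pyspark's Row is imported at module level):

# fromTFExample accepts any iterable of (serialized_bytes, _) pairs.
serialized = example.SerializeToString()
rows = fromTFExample(iter([(serialized, None)]), binary_features=["image_raw"])
print(rows[0].image_raw)  # bytearray(b'\x00\x01')
print(rows[0].label)      # 7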