This page collects typical usage examples of pyspark.sql.types.BinaryType in Python. If you have been wondering what types.BinaryType does, how to use it, or what real code that uses it looks like, the curated examples below may help. You can also explore further usage examples from the module it lives in, pyspark.sql.types.
The following presents 8 code examples of types.BinaryType, sorted by popularity by default.
Example 1: filesToDF

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import BinaryType [as alias]
def filesToDF(sc, path, numPartitions=None):
    """
    Read files from a directory to a DataFrame.

    :param sc: SparkContext.
    :param path: str, path to files.
    :param numPartitions: int, number of partitions to use for reading files.
    :return: DataFrame, with columns: (filePath: str, fileData: BinaryType)
    """
    numPartitions = numPartitions or sc.defaultParallelism
    schema = StructType([StructField("filePath", StringType(), False),
                         StructField("fileData", BinaryType(), False)])
    rdd = sc.binaryFiles(
        path, minPartitions=numPartitions).repartition(numPartitions)
    rdd = rdd.map(lambda x: (x[0], bytearray(x[1])))
    return rdd.toDF(schema)
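A minimal usage sketch, assuming an active SparkContext and a directory of files on disk (the path below is illustrative):

# Hypothetical usage: load every file under /tmp/images as (path, bytes) rows.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("filesToDF-demo").getOrCreate()
df = filesToDF(spark.sparkContext, "/tmp/images", numPartitions=8)
df.printSchema()  # filePath: string, fileData: binary
print(df.count(), "files read")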
Example 2: test_filesTODF

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import BinaryType [as alias]
def test_filesTODF(self):
    df = imageIO.filesToDF(self.binaryFilesMock, "path", 217)
    self.assertEqual(df.rdd.getNumPartitions(), 217)
    self.assertEqual(df.schema.fields[0].dataType, StringType())
    self.assertEqual(df.schema.fields[1].dataType, BinaryType())
    first = df.first()
    self.assertTrue(hasattr(first, "filePath"))
    self.assertEqual(type(first.fileData), bytearray)

# TODO: make unit tests for arrayToImageRow on arrays of varying shapes, channels, dtypes.
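The test relies on `self.binaryFilesMock`, a stand-in for a SparkContext. A hedged sketch of what such a mock might look like (the class name and fixture layout are hypothetical):

# Hypothetical mock: mimics the two SparkContext members filesToDF touches,
# binaryFiles() and defaultParallelism, backed by a real SparkContext.
class BinaryFilesMock(object):
    defaultParallelism = 4

    def __init__(self, sc, files):
        self._sc = sc        # real SparkContext used to build test RDDs
        self._files = files  # list of (filePath, rawBytes) tuples

    def binaryFiles(self, path, minPartitions=None):
        return self._sc.parallelize(self._files, minPartitions or 2)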
Example 3: spark_dtype

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import BinaryType [as alias]
def spark_dtype(self):
    # Lazily import pyspark to avoid creating a pyspark dependency on the data
    # reading code path (currently works only with make_batch_reader). We should
    # move all pyspark related code into a separate module.
    import pyspark.sql.types as sql_types
    return sql_types.BinaryType()
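The point of the function-local import is that the surrounding reader code can be used without pyspark installed; the dependency is only pulled in when someone asks for the Spark type. A minimal sketch of the same pattern (class and method context are illustrative):

# Illustrative: importing this module never imports pyspark; only calling
# spark_dtype() does.
class BinaryField(object):
    def spark_dtype(self):
        import pyspark.sql.types as sql_types  # deferred, optional dependency
        return sql_types.BinaryType()

print(BinaryField().spark_dtype())  # a BinaryType instance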
Example 4: as_spark_type

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import BinaryType [as alias]
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
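A few illustrative calls, assuming the module-level imports the function relies on (numpy as np, datetime, and pyspark.sql.types as types):

# Each call returns a pyspark DataType instance.
print(as_spark_type(int))            # IntegerType
print(as_spark_type(bytes))          # BinaryType
print(as_spark_type("double"))       # DoubleType
print(as_spark_type(datetime.date))  # DateType
try:
    as_spark_type(complex)           # unsupported -> TypeError
except TypeError as e:
    print(e)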
Example 5: __init__

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import BinaryType [as alias]
def __init__(self, series: "ks.Series"):
    if not isinstance(series.spark.data_type, (StringType, BinaryType, ArrayType)):
        raise ValueError("Cannot call StringMethods on type {}".format(series.spark.data_type))
    self._data = series
    self.name = self._data.name

# Methods
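This constructor appears to back a pandas-style `.str` accessor on a Koalas Series; a hedged usage sketch, assuming `databricks.koalas` is available:

# The type check in __init__ above is what makes the second access fail.
import databricks.koalas as ks

s = ks.Series(["spark", "binary"])
print(s.str.upper().to_pandas().tolist())  # ['SPARK', 'BINARY']

nums = ks.Series([1, 2, 3])
try:
    nums.str  # IntegerType is not String/Binary/ArrayType -> ValueError
except ValueError as e:
    print(e)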
Example 6: loadTFRecords

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import BinaryType [as alias]
def loadTFRecords(sc, input_dir, binary_features=[]):
    """Load TFRecords from disk into a Spark DataFrame.

    This will attempt to automatically convert the tf.train.Example features into Spark DataFrame columns of equivalent types.

    Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
    disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a "hint"
    from the caller in the ``binary_features`` argument.

    Args:
      :sc: SparkContext
      :input_dir: location of TFRecords on disk.
      :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

    Returns:
      A Spark DataFrame mirroring the tf.train.Example schema.
    """
    import tensorflow as tf

    tfr_rdd = sc.newAPIHadoopFile(input_dir, "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
                                  keyClass="org.apache.hadoop.io.BytesWritable",
                                  valueClass="org.apache.hadoop.io.NullWritable")

    # infer Spark SQL types from tf.Example
    record = tfr_rdd.take(1)[0]
    example = tf.train.Example()
    example.ParseFromString(bytes(record[0]))
    schema = infer_schema(example, binary_features)

    # convert serialized protobuf to tf.Example to Row
    example_rdd = tfr_rdd.mapPartitions(lambda x: fromTFExample(x, binary_features))

    # create a Spark DataFrame from RDD[Row]
    df = example_rdd.toDF(schema)

    # save reference of this dataframe
    loadedDF[df] = input_dir
    return df
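A hedged usage sketch, assuming TFRecords whose `image_raw` feature holds raw bytes and the tensorflow-hadoop connector on the classpath (the path and feature name are illustrative):

# Pass the binary hint so image_raw is decoded as BinaryType, not StringType.
df = loadTFRecords(sc, "hdfs:///data/mnist/tfr/train", binary_features=["image_raw"])
df.printSchema()  # e.g. image_raw: binary, label: bigint
df.show(3)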
Example 7: infer_schema

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import BinaryType [as alias]
def infer_schema(example, binary_features=[]):
    """Given a tf.train.Example, infer the Spark DataFrame schema (StructFields).

    Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
    disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a "hint"
    from the caller in the ``binary_features`` argument.

    Args:
      :example: a tf.train.Example
      :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

    Returns:
      A DataFrame StructType schema
    """
    def _infer_sql_type(k, v):
        # special handling for binary features
        if k in binary_features:
            return BinaryType()

        if v.int64_list.value:
            result = v.int64_list.value
            sql_type = LongType()
        elif v.float_list.value:
            result = v.float_list.value
            sql_type = DoubleType()
        else:
            result = v.bytes_list.value
            sql_type = StringType()

        if len(result) > 1:  # represent multi-item tensors as Spark SQL ArrayType() of base types
            return ArrayType(sql_type)
        else:                # represent everything else as base types (and empty tensors as StringType())
            return sql_type

    return StructType([StructField(k, _infer_sql_type(k, v), True) for k, v in sorted(example.features.feature.items())])
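A small demonstration of the hint, building a tf.train.Example by hand (the feature names are illustrative):

# One bytes feature and one int64 feature; infer the schema with and
# without the binary hint and compare the resulting image_raw type.
import tensorflow as tf

example = tf.train.Example(features=tf.train.Features(feature={
    "image_raw": tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"\x00\x01"])),
    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[7])),
}))

print(infer_schema(example))                                 # image_raw -> StringType
print(infer_schema(example, binary_features=["image_raw"]))  # image_raw -> BinaryType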
Example 8: fromTFExample

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import BinaryType [as alias]
def fromTFExample(iter, binary_features=[]):
    """mapPartitions function to convert an RDD of serialized tf.train.Example bytestrings into an RDD of Rows.

    Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
    disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a "hint"
    from the caller in the ``binary_features`` argument.

    Args:
      :iter: the RDD partition iterator
      :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

    Returns:
      An array/iterator of DataFrame Rows with features converted into columns.
    """
    import tensorflow as tf

    # convert from protobuf-like dict to DataFrame-friendly dict
    def _get_value(k, v):
        if v.int64_list.value:
            result = v.int64_list.value
        elif v.float_list.value:
            result = v.float_list.value
        else:  # string or bytearray
            if k in binary_features:
                return bytearray(v.bytes_list.value[0])
            else:
                return v.bytes_list.value[0].decode('utf-8')

        if len(result) > 1:     # represent multi-item tensors as python lists
            return list(result)
        elif len(result) == 1:  # extract scalars from single-item tensors
            return result[0]
        else:                   # represent empty tensors as python None
            return None

    results = []
    for record in iter:
        example = tf.train.Example()
        example.ParseFromString(bytes(record[0]))  # record is (bytestr, None)
        d = {k: _get_value(k, v) for k, v in sorted(example.features.feature.items())}
        row = Row(**d)
        results.append(row)
    return results
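To see the conversion end to end, the function can be driven by hand with a single serialized record, mimicking the (bytestring, None) pairs that TFRecordFileInputFormat produces (this reuses the example built in the previous snippet and assumes pyspark's Row is imported at module level):

# fromTFExample accepts any iterable of (serialized_bytes, _) pairs.
serialized = example.SerializeToString()
rows = fromTFExample(iter([(serialized, None)]), binary_features=["image_raw"])
print(rows[0].image_raw)  # bytearray(b'\x00\x01')
print(rows[0].label)      # 7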