

Python types.BinaryType Code Examples

This article collects typical usage examples of pyspark.sql.types.BinaryType in Python. If you are wondering what types.BinaryType does, how to use it, or what it looks like in real code, the curated snippets below may help. You can also explore other usage examples from the pyspark.sql.types module.


The sections below show 8 code examples of types.BinaryType, sorted by popularity by default.

Example 1: filesToDF

# Required import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import BinaryType [as alias]
def filesToDF(sc, path, numPartitions=None):
    """
    Read files from a directory to a DataFrame.

    :param sc: SparkContext.
    :param path: str, path to files.
    :param numPartitions: int, number of partitions to use for reading files.
    :return: DataFrame, with columns: (filePath: str, fileData: BinaryType)
    """
    numPartitions = numPartitions or sc.defaultParallelism
    schema = StructType([StructField("filePath", StringType(), False),
                         StructField("fileData", BinaryType(), False)])
    rdd = sc.binaryFiles(
        path, minPartitions=numPartitions).repartition(numPartitions)
    rdd = rdd.map(lambda x: (x[0], bytearray(x[1])))
    return rdd.toDF(schema) 
Source: databricks/spark-deep-learning, imageIO.py (18 lines)
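
A quick usage sketch (not part of the original spark-deep-learning source; the directory path is hypothetical, and it assumes filesToDF and its pyspark.sql.types imports are available in the same session):

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BinaryType

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
# One row per file: filePath (StringType) and the raw file bytes as fileData (BinaryType).
df = filesToDF(sc, "/tmp/images", numPartitions=4)  # "/tmp/images" is a hypothetical path
df.printSchema()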

Example 2: test_filesTODF

# Required import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import BinaryType [as alias]
def test_filesTODF(self):
        df = imageIO.filesToDF(self.binaryFilesMock, "path", 217)
        self.assertEqual(df.rdd.getNumPartitions(), 217)
        self.assertEqual(df.schema.fields[0].dataType, StringType())
        self.assertEqual(df.schema.fields[1].dataType, BinaryType())
        first = df.first()
        self.assertTrue(hasattr(first, "filePath"))
        self.assertEqual(type(first.fileData), bytearray)


# TODO: make unit tests for arrayToImageRow on arrays of varying shapes, channels, dtypes. 
Source: databricks/spark-deep-learning, test_imageIO.py (13 lines)

Example 3: spark_dtype

# Required import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import BinaryType [as alias]
def spark_dtype(self):
        # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
        # (currently works only with make_batch_reader). We should move all pyspark related code into a separate module
        import pyspark.sql.types as sql_types

        return sql_types.BinaryType() 
Source: uber/petastorm, codecs.py (8 lines)
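
For context, here is a minimal sketch of how such a codec's spark_dtype() can feed a Spark schema. The RawBytesCodec class and the "image_bytes" field name are hypothetical stand-ins, not the actual petastorm codec API:

import pyspark.sql.types as sql_types
from pyspark.sql.types import StructType, StructField

class RawBytesCodec(object):  # hypothetical stand-in for a petastorm codec
    def spark_dtype(self):
        return sql_types.BinaryType()

codec = RawBytesCodec()
# Binary payloads (e.g. encoded images) are declared with the codec-provided dtype.
schema = StructType([StructField("image_bytes", codec.spark_dtype(), True)])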

Example 4: as_spark_type

# Required import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import BinaryType [as alias]
def as_spark_type(tpe) -> types.DataType:
    """
    Given a Python type, returns the equivalent Spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in NumPy
    - lists of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python 3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe) 
Source: databricks/koalas, typehints.py (39 lines)
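
A few sanity checks of the mapping above (assuming as_spark_type and numpy are in scope; pyspark DataType instances compare equal by value):

import numpy as np
from pyspark.sql import types

assert as_spark_type(bytes) == types.BinaryType()
assert as_spark_type(int) == types.IntegerType()
assert as_spark_type(np.float64) == types.DoubleType()
assert as_spark_type("bigint") == types.LongType()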

Example 5: __init__

# Required import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import BinaryType [as alias]
def __init__(self, series: "ks.Series"):
        if not isinstance(series.spark.data_type, (StringType, BinaryType, ArrayType)):
            raise ValueError("Cannot call StringMethods on type {}".format(series.spark.data_type))
        self._data = series
        self.name = self._data.name

    # Methods 
Source: databricks/koalas, strings.py (9 lines)
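
A hedged usage sketch (assumes the databricks koalas package and a working Spark session): the type check above means the .str accessor is available on StringType/BinaryType/ArrayType Series but raises ValueError on numeric ones.

import databricks.koalas as ks

s = ks.Series(["spark", "binary", "type"])
print(s.str.upper())          # StringType data: the accessor is allowed

try:
    ks.Series([1, 2, 3]).str  # IntegerType data: rejected by __init__ above
except ValueError as e:
    print(e)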

Example 6: loadTFRecords

# Required import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import BinaryType [as alias]
def loadTFRecords(sc, input_dir, binary_features=[]):
  """Load TFRecords from disk into a Spark DataFrame.

  This will attempt to automatically convert the tf.train.Example features into Spark DataFrame columns of equivalent types.

  Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
  disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a "hint"
  from the caller in the ``binary_features`` argument.

  Args:
    :sc: SparkContext
    :input_dir: location of TFRecords on disk.
    :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

  Returns:
    A Spark DataFrame mirroring the tf.train.Example schema.
  """
  import tensorflow as tf

  tfr_rdd = sc.newAPIHadoopFile(input_dir, "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
                                keyClass="org.apache.hadoop.io.BytesWritable",
                                valueClass="org.apache.hadoop.io.NullWritable")

  # infer Spark SQL types from tf.Example
  record = tfr_rdd.take(1)[0]
  example = tf.train.Example()
  example.ParseFromString(bytes(record[0]))
  schema = infer_schema(example, binary_features)

  # convert serialized protobuf to tf.Example to Row
  example_rdd = tfr_rdd.mapPartitions(lambda x: fromTFExample(x, binary_features))

  # create a Spark DataFrame from RDD[Row]
  df = example_rdd.toDF(schema)

  # save reference of this dataframe
  loadedDF[df] = input_dir
  return df 
Source: yahoo/TensorFlowOnSpark, dfutil.py (40 lines)
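
A hedged call sketch (the path and feature name are hypothetical, and it assumes an active SparkContext sc and that dfutil ships in the tensorflowonspark package); the binary_features hint tells the loader which BytesList features become BinaryType instead of StringType columns:

from tensorflowonspark import dfutil

# Requires the tensorflow-hadoop input format jar on the Spark classpath.
df = dfutil.loadTFRecords(sc, "hdfs:///data/tfrecords", binary_features=["image_raw"])
df.printSchema()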

Example 7: infer_schema

# Required import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import BinaryType [as alias]
def infer_schema(example, binary_features=[]):
  """Given a tf.train.Example, infer the Spark DataFrame schema (StructFields).

  Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
  disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a "hint"
  from the caller in the ``binary_features`` argument.

  Args:
    :example: a tf.train.Example
    :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

  Returns:
    A DataFrame StructType schema
  """
  def _infer_sql_type(k, v):
    # special handling for binary features
    if k in binary_features:
      return BinaryType()

    if v.int64_list.value:
      result = v.int64_list.value
      sql_type = LongType()
    elif v.float_list.value:
      result = v.float_list.value
      sql_type = DoubleType()
    else:
      result = v.bytes_list.value
      sql_type = StringType()

    if len(result) > 1:             # represent multi-item tensors as Spark SQL ArrayType() of base types
      return ArrayType(sql_type)
    else:                           # represent everything else as base types (and empty tensors as StringType())
      return sql_type

  return StructType([StructField(k, _infer_sql_type(k, v), True) for k, v in sorted(example.features.feature.items())]) 
Source: yahoo/TensorFlowOnSpark, dfutil.py (37 lines)
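
A small inference sketch (assumes tensorflow and the infer_schema above are importable; the feature names are hypothetical); a bytes feature listed in binary_features maps to BinaryType, while int64 features map to LongType:

import tensorflow as tf

example = tf.train.Example(features=tf.train.Features(feature={
    "image_raw": tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"\x00\x01"])),
    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[7])),
}))
schema = infer_schema(example, binary_features=["image_raw"])
print(schema)  # expected: image_raw -> BinaryType, label -> LongType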

Example 8: fromTFExample

# Required import: from pyspark.sql import types [as alias]
# Alternatively: from pyspark.sql.types import BinaryType [as alias]
def fromTFExample(iter, binary_features=[]):
  """mapPartition function to convert an RDD of serialized tf.train.Example bytestring into an RDD of Row.

  Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
  disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a "hint"
  from the caller in the ``binary_features`` argument.

  Args:
    :iter: the RDD partition iterator
    :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

  Returns:
    An array/iterator of DataFrame Row with features converted into columns.
  """
  # convert from protobuf-like dict to DataFrame-friendly dict
  def _get_value(k, v):
    if v.int64_list.value:
      result = v.int64_list.value
    elif v.float_list.value:
      result = v.float_list.value
    else:  # string or bytearray
      if k in binary_features:
        return bytearray(v.bytes_list.value[0])
      else:
        return v.bytes_list.value[0].decode('utf-8')

    if len(result) > 1:         # represent multi-item tensors as python lists
      return list(result)
    elif len(result) == 1:      # extract scalars from single-item tensors
      return result[0]
    else:                       # represent empty tensors as python None
      return None

  results = []
  for record in iter:
    example = tf.train.Example()
    example.ParseFromString(bytes(record[0]))       # record is (bytestr, None)
    d = {k: _get_value(k, v) for k, v in sorted(example.features.feature.items())}
    row = Row(**d)
    results.append(row)

  return results 
Source: yahoo/TensorFlowOnSpark, dfutil.py (44 lines)
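
A hedged local check (assumes tensorflow, pyspark.sql.Row and the fromTFExample above are in scope, as they are in dfutil.py; feature names are hypothetical); the hand-built (bytestr, None) pair mimics the records produced by newAPIHadoopFile:

import tensorflow as tf
from pyspark.sql import Row

example = tf.train.Example(features=tf.train.Features(feature={
    "image_raw": tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"\xff\xd8"])),
    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[7])),
}))
records = [(example.SerializeToString(), None)]
rows = fromTFExample(records, binary_features=["image_raw"])
print(rows[0].image_raw)  # bytearray(b'\xff\xd8')
print(rows[0].label)      # 7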


Note: The pyspark.sql.types.BinaryType examples in this article were collected by 纯净天空 (VimSky) from open-source code and documentation platforms such as GitHub and MSDocs. The snippets come from open-source projects contributed by their respective authors, and copyright remains with the original authors; consult each project's license before redistributing or reusing the code. Please do not republish without permission.