Python pyarrow.RecordBatch Code Examples

This article collects typical usage examples of pyarrow.RecordBatch in Python (RecordBatch is a class in the pyarrow package). If you are unsure what pyarrow.RecordBatch is for or how to use it, the curated examples below may help. You can also explore further usage examples from the pyarrow package.


The following presents 15 code examples of pyarrow.RecordBatch, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.

Example 1: anomalies_slicer

# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def anomalies_slicer(
    unused_example: pa.RecordBatch,
    anomalies: anomalies_pb2.Anomalies) -> types.SliceKeysList:
  """Returns slice keys for an example based on the given Anomalies proto.

  This slicer will generate a slice key for each anomaly reason in the proto.

  Args:
    unused_example: The example for which to generate slice keys.
    anomalies: An Anomalies proto from which to generate the list of slice
      keys.

  Returns:
    A list of slice keys.
  """
  slice_keys = []
  for feature_name, anomaly_info in anomalies.anomaly_info.items():
    for anomaly_reason in anomaly_info.reason:
      slice_keys.append(
          feature_name + '_' +
          anomalies_pb2.AnomalyInfo.Type.Name(anomaly_reason.type))
  return slice_keys 
Developer: tensorflow, Project: data-validation, Lines: 24, Source: anomalies_util.py
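A minimal usage sketch (not part of the original source; the feature name and anomaly type below are made up for illustration):

from tensorflow_metadata.proto.v0 import anomalies_pb2

anomalies = anomalies_pb2.Anomalies()
# Record one hypothetical anomaly reason for a feature named 'age'.
reason = anomalies.anomaly_info['age'].reason.add()
reason.type = anomalies_pb2.AnomalyInfo.ENUM_TYPE_UNEXPECTED_STRING_VALUES
print(anomalies_slicer(None, anomalies))
# ['age_ENUM_TYPE_UNEXPECTED_STRING_VALUES']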

Example 2: _filter_features

# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def _filter_features(
    record_batch: pa.RecordBatch,
    feature_whitelist: List[types.FeatureName]) -> pa.RecordBatch:
  """Removes features that are not whitelisted.

  Args:
    record_batch: Input Arrow RecordBatch.
    feature_whitelist: A set of feature names to whitelist.

  Returns:
    An Arrow RecordBatch containing only the whitelisted features of the input.
  """
  schema = record_batch.schema
  column_names = set(schema.names)
  columns_to_select = []
  column_names_to_select = []
  for feature_name in feature_whitelist:
    if feature_name in column_names:
      columns_to_select.append(
          record_batch.column(schema.get_field_index(feature_name)))
      column_names_to_select.append(feature_name)
  return pa.RecordBatch.from_arrays(columns_to_select, column_names_to_select) 
Developer: tensorflow, Project: data-validation, Lines: 24, Source: stats_impl.py
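A quick illustrative call (not from the original project), showing that non-whitelisted columns are dropped:

import pyarrow as pa

batch = pa.RecordBatch.from_arrays(
    [pa.array([[1], [2]]), pa.array([['a'], ['b']])], ['f1', 'f2'])
filtered = _filter_features(batch, ['f1'])
print(filtered.schema.names)  # ['f1']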

Example 3: generate_statistics_in_memory

# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def generate_statistics_in_memory(
    record_batch: pa.RecordBatch,
    options: stats_options.StatsOptions = stats_options.StatsOptions()
) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Generates statistics for an in-memory list of examples.

  Args:
    record_batch: Arrow RecordBatch.
    options: Options for generating data statistics.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  stats_generators = get_generators(options, in_memory=True)  # type: List[stats_generator.CombinerStatsGenerator]
  partial_stats = generate_partial_statistics_in_memory(record_batch, options,
                                                        stats_generators)
  return extract_statistics_output(partial_stats, stats_generators) 
Developer: tensorflow, Project: data-validation, Lines: 19, Source: stats_impl.py
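An illustrative call with default StatsOptions (a sketch, not from the original source); TFDV expects each column to be a list array, one list per example:

import pyarrow as pa

batch = pa.RecordBatch.from_arrays(
    [pa.array([[1.0], [2.0], [3.0]])], ['f1'])
stats = generate_statistics_in_memory(batch)
print(stats.datasets[0].num_examples)  # 3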

Example 4: convert_to_tfxio_api_inputs

# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def convert_to_tfxio_api_inputs(
      self, legacy_input_data, legacy_input_metadata, label='input_data'):
    """Converts from the legacy TFT API inputs to TFXIO-based inputs.

    Args:
      legacy_input_data: a PCollection of instance dicts.
      legacy_input_metadata: a tft.DatasetMetadata.
      label: label for the PTransform that translates `legacy_input_data` into
        the TFXIO input data. Set to different values if this method is called
        multiple times in a beam Pipeline.
    Returns:
      A tuple of a PCollection of `pyarrow.RecordBatch` and a
      `tensor_adapter.TensorAdapterConfig`. This tuple can be fed directly to
      TFT's `{Analyze,Transform,AnalyzeAndTransform}Dataset` APIs.
    """
    tfxio_impl = _LegacyCompatibilityTFXIO(legacy_input_metadata.schema)
    input_data = (
        legacy_input_data |
        ('LegacyFormatToTfxio[%s]' % label >> tfxio_impl.BeamSource(
            beam_impl.Context.get_desired_batch_size())))
    return input_data, tfxio_impl.TensorAdapterConfig() 
Developer: tensorflow, Project: transform, Lines: 23, Source: tft_unit.py

Example 5: download_arrow_tabledata_list

# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def download_arrow_tabledata_list(pages, bq_schema):
    """Use tabledata.list to construct an iterable of RecordBatches.

    Args:
        pages (Iterator[:class:`google.api_core.page_iterator.Page`]):
            An iterator over the result pages.
        bq_schema (Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            A description of the fields in result pages.
    Yields:
        :class:`pyarrow.RecordBatch`
        The next page of records as a ``pyarrow`` record batch.
    """
    bq_schema = schema._to_schema_fields(bq_schema)
    column_names = bq_to_arrow_schema(bq_schema) or [field.name for field in bq_schema]
    arrow_types = [bq_to_arrow_data_type(field) for field in bq_schema]

    for page in pages:
        yield _tabledata_list_page_to_arrow(page, column_names, arrow_types) 
Developer: googleapis, Project: python-bigquery, Lines: 23, Source: _pandas_helpers.py
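A rough consumption sketch (assumed, not from the original module): the yielded batches can be combined into a single pyarrow.Table, assuming pages is a page iterator from tabledata.list and bq_schema is a sequence of SchemaField:

import pyarrow as pa

record_batches = list(download_arrow_tabledata_list(pages, bq_schema))
table = pa.Table.from_batches(record_batches)
print(table.num_rows)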

Example 6: _tabledata_list_page_columns

# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def _tabledata_list_page_columns(schema, response):
    """Make a generator of all the columns in a page from tabledata.list.

    This enables creating a :class:`pandas.DataFrame` and other
    column-oriented data structures such as :class:`pyarrow.RecordBatch`
    """
    columns = []
    rows = response.get("rows", [])

    def get_column_data(field_index, field):
        for row in rows:
            yield _helpers._field_from_json(row["f"][field_index]["v"], field)

    for field_index, field in enumerate(schema):
        columns.append(get_column_data(field_index, field))

    return columns


# pylint: disable=unused-argument 
Developer: googleapis, Project: python-bigquery, Lines: 22, Source: table.py
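A hedged sketch of how this private helper might be exercised directly; the response dict below mimics the tabledata.list JSON shape and is hypothetical:

from google.cloud.bigquery import SchemaField

schema = [SchemaField("id", "INTEGER")]
# Each row carries its cells under "f"; each cell carries its value under "v".
response = {"rows": [{"f": [{"v": "1"}]}, {"f": [{"v": "2"}]}]}
columns = _tabledata_list_page_columns(schema, response)
print([list(column) for column in columns])  # [[1, 2]]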

Example 7: MergeRecordBatches

# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def MergeRecordBatches(record_batches: List[pa.RecordBatch]) -> pa.RecordBatch:
  """Merges a list of arrow RecordBatches into one. Similar to MergeTables."""
  if not record_batches:
    return _EMPTY_RECORD_BATCH
  first_schema = record_batches[0].schema
  assert any([r.num_rows > 0 for r in record_batches]), (
      "Unable to merge empty RecordBatches.")
  if all([r.schema.equals(first_schema) for r in record_batches[1:]]):
    one_chunk_table = pa.Table.from_batches(record_batches).combine_chunks()
    batches = one_chunk_table.to_batches(max_chunksize=None)
    assert len(batches) == 1
    return batches[0]
  else:
    # TODO(zhuo, b/158335158): switch to pa.Table.concat_tables(promote=True)
    # once the upstream bug is fixed:
    # https://jira.apache.org/jira/browse/ARROW-9071
    return _MergeRecordBatches(record_batches) 
Developer: tensorflow, Project: tfx-bsl, Lines: 19, Source: table_util.py
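An illustrative merge of two same-schema batches (not from the original source):

import pyarrow as pa

b1 = pa.RecordBatch.from_arrays([pa.array([[1]])], ['f1'])
b2 = pa.RecordBatch.from_arrays([pa.array([[2], [3]])], ['f1'])
merged = MergeRecordBatches([b1, b2])
print(merged.num_rows)  # 3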

Example 8: DataFrameToRecordBatch

# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def DataFrameToRecordBatch(
    dataframe: pd.DataFrame) -> pa.RecordBatch:
  """Convert pandas.DataFrame to a pyarrow.RecordBatch with primitive arrays.

  Args:
    dataframe: A pandas.DataFrame, where rows correspond to examples and columns
      correspond to features.

  Returns:
    A pa.RecordBatch containing the same values as the input data in primitive
    array format.
  """

  arrow_fields = []
  for col_name, col_type in zip(dataframe.columns, dataframe.dtypes):
    arrow_type = NumpyKindToArrowType(col_type.kind)
    if not arrow_type:
      logging.warning("Ignoring feature %s of type %s", col_name, col_type)
      continue
    arrow_fields.append(pa.field(col_name, arrow_type))
  return pa.RecordBatch.from_pandas(dataframe, schema=pa.schema(arrow_fields)) 
Developer: tensorflow, Project: tfx-bsl, Lines: 23, Source: table_util.py
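A usage sketch (assumed, not from the original module); columns whose numpy dtype kind has no Arrow mapping are dropped with a warning:

import pandas as pd

df = pd.DataFrame({'age': [30, 40], 'score': [0.5, 0.7]})
batch = DataFrameToRecordBatch(df)
print(batch.schema.names)  # ['age', 'score']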

Example 9: CanonicalizeRecordBatch

# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def CanonicalizeRecordBatch(
    record_batch_with_primitive_arrays: pa.RecordBatch,) -> pa.RecordBatch:
  """Converts primitive arrays in a pyarrow.RecordBatch to SingletonListArrays.

  Args:
    record_batch_with_primitive_arrays: A pyarrow.RecordBatch where values are
      stored in primitive arrays or singleton list arrays.

  Returns:
    A pyarrow.RecordBatch in SingletonListArray format.
  """
  arrays = []
  for column_array in record_batch_with_primitive_arrays.columns:
    arr_type = column_array.type
    if not (pa.types.is_list(arr_type) or pa.types.is_large_list(arr_type)):
      arrays.append(array_util.ToSingletonListArray(column_array))
    else:
      arrays.append(column_array)
  # TODO(pachristopher): Consider using a list of record batches instead of a
  # single record batch to avoid having list arrays larger than 2^31 elements.
  return pa.RecordBatch.from_arrays(
      arrays, record_batch_with_primitive_arrays.schema.names) 
Developer: tensorflow, Project: tfx-bsl, Lines: 24, Source: table_util.py
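An illustrative call (not from the original source) showing a primitive int64 column being wrapped into singleton lists:

import pyarrow as pa

primitive = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], ['f1'])
canonical = CanonicalizeRecordBatch(primitive)
print(canonical.column(0).to_pylist())  # [[1], [2], [3]]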

Example 10: _RawRecordToRecordBatchInternal

# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def _RawRecordToRecordBatchInternal(self,
                                      batch_size: Optional[int] = None
                                     ) -> beam.PTransform:

    @beam.typehints.with_input_types(bytes)
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
      return (raw_records_pcoll
              | "Batch" >> beam.BatchElements(
                  **batch_util.GetBatchElementsKwargs(batch_size))
              | "Decode" >> beam.ParDo(
                  _DecodeBatchExamplesDoFn(self._schema,
                                           self.raw_record_column_name,
                                           self._can_produce_large_types)))

    return beam.ptransform_fn(_PTransformFn)() 
Developer: tensorflow, Project: tfx-bsl, Lines: 18, Source: tf_sequence_example_record.py

Example 11: convert

# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def convert(self, tensors: Dict[Text, TensorAlike]) -> pa.RecordBatch:
    """Converts a dict of tensors to a RecordBatch.

    Args:
      tensors: must contain the same keys as the dict passed to the initializer,
        and each TensorAlike must be compatible with the corresponding TypeSpec.

    Returns:
      a RecordBatch whose schema equals self.arrow_schema().
    """
    assert len(self._handlers) == len(tensors)
    arrays = []
    for tensor_name, handler in self._handlers:
      arrays.extend(handler.convert(tensors[tensor_name]))

    return pa.record_batch(arrays, schema=self._arrow_schema) 
Developer: tensorflow, Project: tfx-bsl, Lines: 18, Source: tensor_to_arrow.py

Example 12: BeamSource

# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def BeamSource(self, batch_size: Optional[int] = None) -> beam.PTransform:
    """Returns a beam `PTransform` that produces `PCollection[pa.RecordBatch]`.

    May NOT raise an error if the TFMD schema was not provided at construction
    time.

    If a TFMD schema was provided at construction time, all the
    `pa.RecordBatch`es in the result `PCollection` must be of the same schema
    returned by `self.ArrowSchema`. If a TFMD schema was not provided, the
    `pa.RecordBatch`es might not be of the same schema (they may contain
    different numbers of columns).

    Args:
      batch_size: if not None, the `pa.RecordBatch` produced will be of the
        specified size. Otherwise it's automatically tuned by Beam.
    """ 
Developer: tensorflow, Project: tfx-bsl, Lines: 18, Source: tfxio.py
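An illustrative pipeline using one concrete TFXIO implementation (a sketch; the file pattern is a placeholder, and schema is assumed to be a TFMD Schema proto built elsewhere):

import apache_beam as beam
from tfx_bsl.tfxio import tf_example_record

tfxio = tf_example_record.TFExampleRecord(
    file_pattern='/path/to/examples.tfrecord', schema=schema)
with beam.Pipeline() as p:
  record_batches = p | 'ReadAndDecode' >> tfxio.BeamSource(batch_size=1000)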

Example 13: GetTensor

# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def GetTensor(self, record_batch: pa.RecordBatch,
                produce_eager_tensors: bool) -> Any:
    """Converts the RecordBatch to Tensor or CompositeTensor.

    The result must be of the same (not only compatible) TypeSpec as
    self.type_spec.

    Args:
      record_batch: a RecordBatch that is of the same Schema as what was
        passed at initialization time.
      produce_eager_tensors: if True, returns Eager Tensors, otherwise returns
        ndarrays or Tensor value objects.

    Returns:
      A Tensor or a CompositeTensor. Note that their types may vary depending
      on whether the TF eager mode is on.
    """ 
Developer: tensorflow, Project: tfx-bsl, Lines: 19, Source: tensor_adapter.py
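GetTensor above is a handler-level hook; callers usually go through TensorAdapter instead. A hedged sketch, assuming arrow_schema, tensor_representations, and record_batch are built elsewhere:

from tfx_bsl.tfxio import tensor_adapter

config = tensor_adapter.TensorAdapterConfig(arrow_schema, tensor_representations)
adapter = tensor_adapter.TensorAdapter(config)
tensors = adapter.ToBatchTensors(record_batch, produce_eager_tensors=False)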

Example 14: _ValidateRecordBatch

# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def _ValidateRecordBatch(
      self, tfxio, record_batch, raw_record_column_name=None):
    self.assertIsInstance(record_batch, pa.RecordBatch)
    self.assertEqual(record_batch.num_rows, 3)
    expected_column_values = GetExpectedColumnValues(tfxio)
    for i, field in enumerate(record_batch.schema):
      if field.name == raw_record_column_name:
        continue
      self.assertTrue(record_batch.column(i).equals(
          expected_column_values[field.name]),
                      "Column {} did not match ({} vs {})."
                      .format(field.name, record_batch.column(i),
                              expected_column_values[field.name]))

    if raw_record_column_name is not None:
      if tfxio._can_produce_large_types:
        raw_record_column_type = pa.large_list(pa.large_binary())
      else:
        raw_record_column_type = pa.list_(pa.binary())
      self.assertEqual(record_batch.schema.names[-1], raw_record_column_name)
      self.assertTrue(
          record_batch.columns[-1].type.equals(raw_record_column_type))
      self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                       _SERIALIZED_EXAMPLES) 
Developer: tensorflow, Project: tfx-bsl, Lines: 26, Source: tf_example_record_test.py

Example 15: test_decode

# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def test_decode(self, schema_text_proto, examples_text_proto,
                  create_expected):
    serialized_examples = [
        text_format.Parse(pbtxt, tf.train.Example()).SerializeToString()
        for pbtxt in examples_text_proto
    ]
    serialized_schema = None
    if schema_text_proto is not None:
      serialized_schema = text_format.Parse(
          schema_text_proto, schema_pb2.Schema()).SerializeToString()

    if serialized_schema:
      coder = example_coder.ExamplesToRecordBatchDecoder(serialized_schema)
    else:
      coder = example_coder.ExamplesToRecordBatchDecoder()

    result = coder.DecodeBatch(serialized_examples)
    self.assertIsInstance(result, pa.RecordBatch)
    expected = create_expected(pa.list_, pa.binary())
    self.assertTrue(
        result.equals(expected),
        "actual: {}\n expected:{}".format(result, expected))
    if serialized_schema:
      self.assertTrue(expected.schema.equals(coder.ArrowSchema())) 
Developer: tensorflow, Project: tfx-bsl, Lines: 26, Source: example_coder_test.py
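A minimal schema-less decoding sketch (not part of the original test):

import tensorflow as tf
from tfx_bsl.coders import example_coder

example = tf.train.Example()
example.features.feature['f1'].int64_list.value.append(1)
coder = example_coder.ExamplesToRecordBatchDecoder()
record_batch = coder.DecodeBatch([example.SerializeToString()])
print(record_batch.schema)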


Note: The pyarrow.RecordBatch examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors. Consult each project's License before distributing or using the code. Do not reproduce without permission.