This article collects and summarizes typical usage examples of the pyarrow.RecordBatch method in Python. If you are wondering what pyarrow.RecordBatch is for, how to call it, or what real uses of it look like, the curated code examples below may help. You can also explore further usage examples from the pyarrow module in which this method lives.
The following presents 15 code examples of the pyarrow.RecordBatch method, sorted by popularity by default.
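Before the examples, here is a minimal sketch of constructing and inspecting a pa.RecordBatch directly; the column names and data are made up for illustration:

import pyarrow as pa

# Build a RecordBatch from column arrays (hypothetical sample data).
batch = pa.RecordBatch.from_arrays(
    [pa.array([1, 2, 3]), pa.array(["a", "b", "c"])],
    names=["id", "label"])

print(batch.num_rows)     # 3
print(batch.schema)       # id: int64, label: string
print(batch.column(0))    # the "id" column as a pyarrow Array
print(batch.to_pydict())  # {'id': [1, 2, 3], 'label': ['a', 'b', 'c']}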
Example 1: anomalies_slicer
# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def anomalies_slicer(
    unused_example: pa.RecordBatch,
    anomalies: anomalies_pb2.Anomalies) -> types.SliceKeysList:
  """Returns slice keys for an example based on the given Anomalies proto.

  This slicer will generate a slice key for each anomaly reason in the proto.

  Args:
    unused_example: The example for which to generate slice keys.
    anomalies: An Anomalies proto from which to generate the list of slice
      keys.

  Returns:
    A list of slice keys.
  """
  slice_keys = []
  for feature_name, anomaly_info in anomalies.anomaly_info.items():
    for anomaly_reason in anomaly_info.reason:
      slice_keys.append(
          feature_name + '_' +
          anomalies_pb2.AnomalyInfo.Type.Name(anomaly_reason.type))
  return slice_keys
Example 2: _filter_features
# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def _filter_features(
    record_batch: pa.RecordBatch,
    feature_whitelist: List[types.FeatureName]) -> pa.RecordBatch:
  """Removes features that are not whitelisted.

  Args:
    record_batch: Input Arrow RecordBatch.
    feature_whitelist: A set of feature names to whitelist.

  Returns:
    An Arrow RecordBatch containing only the whitelisted features of the input.
  """
  schema = record_batch.schema
  column_names = set(schema.names)
  columns_to_select = []
  column_names_to_select = []
  for feature_name in feature_whitelist:
    if feature_name in column_names:
      columns_to_select.append(
          record_batch.column(schema.get_field_index(feature_name)))
      column_names_to_select.append(feature_name)
  return pa.RecordBatch.from_arrays(columns_to_select, column_names_to_select)
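A usage sketch of the same column-selection idea using only pyarrow; the batch contents and feature names below are invented for illustration:

import pyarrow as pa

batch = pa.RecordBatch.from_arrays(
    [pa.array([1, 2]), pa.array([0.5, 1.5]), pa.array(["x", "y"])],
    names=["f1", "f2", "f3"])

whitelist = ["f1", "f3"]
# Keep only whitelisted columns, preserving the whitelist order.
indices = [batch.schema.get_field_index(name) for name in whitelist
           if name in batch.schema.names]
filtered = pa.RecordBatch.from_arrays(
    [batch.column(i) for i in indices],
    [batch.schema.names[i] for i in indices])
print(filtered.schema.names)  # ['f1', 'f3']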
Example 3: generate_statistics_in_memory
# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def generate_statistics_in_memory(
    record_batch: pa.RecordBatch,
    options: stats_options.StatsOptions = stats_options.StatsOptions()
) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Generates statistics for an in-memory list of examples.

  Args:
    record_batch: Arrow RecordBatch.
    options: Options for generating data statistics.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  stats_generators = get_generators(options, in_memory=True)  # type: List[stats_generator.CombinerStatsGenerator]
  partial_stats = generate_partial_statistics_in_memory(record_batch, options,
                                                        stats_generators)
  return extract_statistics_output(partial_stats, stats_generators)
Example 4: convert_to_tfxio_api_inputs
# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def convert_to_tfxio_api_inputs(
    self, legacy_input_data, legacy_input_metadata, label='input_data'):
  """Converts from the legacy TFT API inputs to TFXIO-based inputs.

  Args:
    legacy_input_data: a PCollection of instance dicts.
    legacy_input_metadata: a tft.DatasetMetadata.
    label: label for the PTransform that translates `legacy_input_data` into
      the TFXIO input data. Set to different values if this method is called
      multiple times in a beam Pipeline.

  Returns:
    A tuple of a PCollection of `pyarrow.RecordBatch` and a
    `tensor_adapter.TensorAdapterConfig`. This tuple can be fed directly to
    TFT's `{Analyze,Transform,AnalyzeAndTransform}Dataset` APIs.
  """
  tfxio_impl = _LegacyCompatibilityTFXIO(legacy_input_metadata.schema)
  input_data = (
      legacy_input_data |
      ('LegacyFormatToTfxio[%s]' % label >> tfxio_impl.BeamSource(
          beam_impl.Context.get_desired_batch_size())))
  return input_data, tfxio_impl.TensorAdapterConfig()
Example 5: download_arrow_tabledata_list
# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def download_arrow_tabledata_list(pages, bq_schema):
    """Use tabledata.list to construct an iterable of RecordBatches.

    Args:
        pages (Iterator[:class:`google.api_core.page_iterator.Page`]):
            An iterator over the result pages.
        bq_schema (Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            A description of the fields in result pages.

    Yields:
        :class:`pyarrow.RecordBatch`
            The next page of records as a ``pyarrow`` record batch.
    """
    bq_schema = schema._to_schema_fields(bq_schema)
    column_names = bq_to_arrow_schema(bq_schema) or [field.name for field in bq_schema]
    arrow_types = [bq_to_arrow_data_type(field) for field in bq_schema]

    for page in pages:
        yield _tabledata_list_page_to_arrow(page, column_names, arrow_types)
Example 6: _tabledata_list_page_columns
# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def _tabledata_list_page_columns(schema, response):
    """Make a generator of all the columns in a page from tabledata.list.

    This enables creating a :class:`pandas.DataFrame` and other
    column-oriented data structures such as :class:`pyarrow.RecordBatch`.
    """
    columns = []
    rows = response.get("rows", [])

    def get_column_data(field_index, field):
        for row in rows:
            yield _helpers._field_from_json(row["f"][field_index]["v"], field)

    for field_index, field in enumerate(schema):
        columns.append(get_column_data(field_index, field))

    return columns


# pylint: disable=unused-argument
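The BigQuery helpers above are specific to tabledata.list responses, but the underlying idea of transposing row-oriented data into per-field columns that pyarrow can consume can be sketched with plain Python; the rows and field names here are hypothetical:

import pyarrow as pa

rows = [{"name": "a", "value": 1}, {"name": "b", "value": 2}]
fields = ["name", "value"]

# Transpose rows into per-field columns, then materialize pyarrow arrays.
columns = [[row[field] for row in rows] for field in fields]
arrays = [pa.array(column) for column in columns]
batch = pa.RecordBatch.from_arrays(arrays, names=fields)
print(batch.to_pydict())  # {'name': ['a', 'b'], 'value': [1, 2]}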
Example 7: MergeRecordBatches
# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def MergeRecordBatches(record_batches: List[pa.RecordBatch]) -> pa.RecordBatch:
  """Merges a list of arrow RecordBatches into one. Similar to MergeTables."""
  if not record_batches:
    return _EMPTY_RECORD_BATCH
  first_schema = record_batches[0].schema
  assert any([r.num_rows > 0 for r in record_batches]), (
      "Unable to merge empty RecordBatches.")
  if all([r.schema.equals(first_schema) for r in record_batches[1:]]):
    one_chunk_table = pa.Table.from_batches(record_batches).combine_chunks()
    batches = one_chunk_table.to_batches(max_chunksize=None)
    assert len(batches) == 1
    return batches[0]
  else:
    # TODO(zhuo, b/158335158): switch to pa.Table.concat_tables(promote=True)
    # once the upstream bug is fixed:
    # https://jira.apache.org/jira/browse/ARROW-9071
    return _MergeRecordBatches(record_batches)
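A minimal sketch of the same-schema branch above, using only public pyarrow APIs; the two batches are hypothetical:

import pyarrow as pa

b1 = pa.RecordBatch.from_arrays([pa.array([1, 2])], names=["x"])
b2 = pa.RecordBatch.from_arrays([pa.array([3])], names=["x"])

# Concatenate via a Table, collapse the chunks, and convert back to one batch.
merged = pa.Table.from_batches([b1, b2]).combine_chunks().to_batches(
    max_chunksize=None)
assert len(merged) == 1
print(merged[0].to_pydict())  # {'x': [1, 2, 3]}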
Example 8: DataFrameToRecordBatch
# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def DataFrameToRecordBatch(
    dataframe: pd.DataFrame) -> pa.RecordBatch:
  """Convert pandas.DataFrame to a pyarrow.RecordBatch with primitive arrays.

  Args:
    dataframe: A pandas.DataFrame, where rows correspond to examples and
      columns correspond to features.

  Returns:
    A pa.RecordBatch containing the same values as the input data in primitive
    array format.
  """
  arrow_fields = []
  for col_name, col_type in zip(dataframe.columns, dataframe.dtypes):
    arrow_type = NumpyKindToArrowType(col_type.kind)
    if not arrow_type:
      logging.warning("Ignoring feature %s of type %s", col_name, col_type)
      continue
    arrow_fields.append(pa.field(col_name, arrow_type))
  return pa.RecordBatch.from_pandas(dataframe, schema=pa.schema(arrow_fields))
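NumpyKindToArrowType is project-internal, but the core mechanism of passing an explicit, possibly partial schema to RecordBatch.from_pandas so that undeclared columns are dropped can be sketched as follows; the DataFrame is made up and this assumes pyarrow's documented behavior of ignoring DataFrame columns not listed in the schema:

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"a": [1, 2], "b": [0.1, 0.2], "c": ["skip", "me"]})
# Only "a" and "b" are declared, so "c" is left out of the resulting batch.
schema = pa.schema([pa.field("a", pa.int64()), pa.field("b", pa.float64())])
batch = pa.RecordBatch.from_pandas(df, schema=schema)
print(batch.schema.names)  # ['a', 'b']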
Example 9: CanonicalizeRecordBatch
# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def CanonicalizeRecordBatch(
    record_batch_with_primitive_arrays: pa.RecordBatch,) -> pa.RecordBatch:
  """Converts primitive arrays in a pyarrow.RecordBatch to SingletonListArrays.

  Args:
    record_batch_with_primitive_arrays: A pyarrow.RecordBatch where values are
      stored in primitive arrays or singleton list arrays.

  Returns:
    A pyarrow.RecordBatch in SingletonListArray format.
  """
  arrays = []
  for column_array in record_batch_with_primitive_arrays.columns:
    arr_type = column_array.type
    if not (pa.types.is_list(arr_type) or pa.types.is_large_list(arr_type)):
      arrays.append(array_util.ToSingletonListArray(column_array))
    else:
      arrays.append(column_array)
  # TODO(pachristopher): Consider using a list of record batches instead of a
  # single record batch to avoid having list arrays larger than 2^31 elements.
  return pa.RecordBatch.from_arrays(
      arrays, record_batch_with_primitive_arrays.schema.names)
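array_util.ToSingletonListArray comes from tfx_bsl; a rough pure-pyarrow equivalent, ignoring null handling, wraps each primitive value in a one-element list via ListArray.from_arrays. The values below are hypothetical:

import pyarrow as pa

values = pa.array([10, 20, 30])
# Offsets 0..N stepping by 1 produce one-element lists.
offsets = pa.array([0, 1, 2, 3], type=pa.int32())
singleton_lists = pa.ListArray.from_arrays(offsets, values)
print(singleton_lists.to_pylist())  # [[10], [20], [30]]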
Example 10: _RawRecordToRecordBatchInternal
# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def _RawRecordToRecordBatchInternal(self,
                                    batch_size: Optional[int] = None
                                   ) -> beam.PTransform:

  @beam.typehints.with_input_types(bytes)
  @beam.typehints.with_output_types(pa.RecordBatch)
  def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
    return (raw_records_pcoll
            | "Batch" >> beam.BatchElements(
                **batch_util.GetBatchElementsKwargs(batch_size))
            | "Decode" >> beam.ParDo(
                _DecodeBatchExamplesDoFn(self._schema,
                                         self.raw_record_column_name,
                                         self._can_produce_large_types)))

  return beam.ptransform_fn(_PTransformFn)()
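The DoFn and batching kwargs above are tfx_bsl internals. A self-contained sketch of the same shape, batching raw elements with beam.BatchElements and converting each batch into a pa.RecordBatch, might look like this; the element format and batch sizes are invented for illustration:

import apache_beam as beam
import pyarrow as pa

def _dicts_to_record_batch(batch_of_dicts):
    # Transpose a list of dicts into columns and build a RecordBatch.
    names = sorted(batch_of_dicts[0])
    return pa.RecordBatch.from_arrays(
        [pa.array([d[n] for d in batch_of_dicts]) for n in names], names=names)

with beam.Pipeline() as p:
    _ = (p
         | beam.Create([{"x": 1}, {"x": 2}, {"x": 3}, {"x": 4}])
         | beam.BatchElements(min_batch_size=2, max_batch_size=2)
         | beam.Map(_dicts_to_record_batch)
         | beam.Map(print))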
Example 11: convert
# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def convert(self, tensors: Dict[Text, TensorAlike]) -> pa.RecordBatch:
  """Converts a dict of tensors to a RecordBatch.

  Args:
    tensors: must contain the same keys as the dict passed to the initializer,
      and each TensorAlike must be compatible with the corresponding TypeSpec.

  Returns:
    A RecordBatch whose schema equals self.arrow_schema().
  """
  assert len(self._handlers) == len(tensors)
  arrays = []
  for tensor_name, handler in self._handlers:
    arrays.extend(handler.convert(tensors[tensor_name]))

  return pa.record_batch(arrays, schema=self._arrow_schema)
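A small sketch of the pa.record_batch factory used on the last line, with a hand-written schema standing in for self._arrow_schema; all names and types here are assumptions:

import pyarrow as pa

schema = pa.schema([("f1", pa.list_(pa.int64())),
                    ("f2", pa.list_(pa.binary()))])
arrays = [
    pa.array([[1], [2, 3]], type=pa.list_(pa.int64())),
    pa.array([[b"a"], [b"b"]], type=pa.list_(pa.binary())),
]
batch = pa.record_batch(arrays, schema=schema)
print(batch.schema.equals(schema))  # True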
Example 12: BeamSource
# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def BeamSource(self, batch_size: Optional[int] = None) -> beam.PTransform:
  """Returns a beam `PTransform` that produces `PCollection[pa.RecordBatch]`.

  May NOT raise an error if the TFMD schema was not provided at construction
  time.

  If a TFMD schema was provided at construction time, all the
  `pa.RecordBatch`es in the result `PCollection` must be of the same schema
  returned by `self.ArrowSchema`. If a TFMD schema was not provided, the
  `pa.RecordBatch`es might not be of the same schema (they may contain
  different numbers of columns).

  Args:
    batch_size: if not None, the `pa.RecordBatch` produced will be of the
      specified size. Otherwise it's automatically tuned by Beam.
  """
Example 13: GetTensor
# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def GetTensor(self, record_batch: pa.RecordBatch,
              produce_eager_tensors: bool) -> Any:
  """Converts the RecordBatch to a Tensor or CompositeTensor.

  The result must be of the same (not only compatible) TypeSpec as
  self.type_spec.

  Args:
    record_batch: a RecordBatch that is of the same Schema as what was
      passed at initialization time.
    produce_eager_tensors: if True, returns Eager Tensors, otherwise returns
      ndarrays or Tensor value objects.

  Returns:
    A Tensor or a CompositeTensor. Note that their types may vary depending
    on whether the TF eager mode is on.
  """
Example 14: _ValidateRecordBatch
# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def _ValidateRecordBatch(
    self, tfxio, record_batch, raw_record_column_name=None):
  self.assertIsInstance(record_batch, pa.RecordBatch)
  self.assertEqual(record_batch.num_rows, 3)
  expected_column_values = GetExpectedColumnValues(tfxio)
  for i, field in enumerate(record_batch.schema):
    if field.name == raw_record_column_name:
      continue
    self.assertTrue(
        record_batch.column(i).equals(expected_column_values[field.name]),
        "Column {} did not match ({} vs {}).".format(
            field.name, record_batch.column(i),
            expected_column_values[field.name]))

  if raw_record_column_name is not None:
    if tfxio._can_produce_large_types:
      raw_record_column_type = pa.large_list(pa.large_binary())
    else:
      raw_record_column_type = pa.list_(pa.binary())
    self.assertEqual(record_batch.schema.names[-1], raw_record_column_name)
    self.assertTrue(
        record_batch.columns[-1].type.equals(raw_record_column_type))
    self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                     _SERIALIZED_EXAMPLES)
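Outside of a TFXIO test fixture, the same kinds of checks can be sketched with plain assertions against a hand-built batch; the column names, values, and raw-record column below are all hypothetical:

import pyarrow as pa

batch = pa.RecordBatch.from_arrays(
    [pa.array([[1], [2], [3]]),
     pa.array([[b"raw1"], [b"raw2"], [b"raw3"]])],
    names=["feature", "__raw_record__"])

assert batch.column(0).equals(pa.array([[1], [2], [3]]))
assert batch.schema.names[-1] == "__raw_record__"
assert batch.columns[-1].type.equals(pa.list_(pa.binary()))
assert batch.columns[-1].flatten().to_pylist() == [b"raw1", b"raw2", b"raw3"]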
Example 15: test_decode
# Required import: import pyarrow [as alias]
# Or: from pyarrow import RecordBatch [as alias]
def test_decode(self, schema_text_proto, examples_text_proto,
                create_expected):
  serialized_examples = [
      text_format.Parse(pbtxt, tf.train.Example()).SerializeToString()
      for pbtxt in examples_text_proto
  ]
  serialized_schema = None
  if schema_text_proto is not None:
    serialized_schema = text_format.Parse(
        schema_text_proto, schema_pb2.Schema()).SerializeToString()

  if serialized_schema:
    coder = example_coder.ExamplesToRecordBatchDecoder(serialized_schema)
  else:
    coder = example_coder.ExamplesToRecordBatchDecoder()

  result = coder.DecodeBatch(serialized_examples)
  self.assertIsInstance(result, pa.RecordBatch)
  expected = create_expected(pa.list_, pa.binary())
  self.assertTrue(
      result.equals(expected),
      "actual: {}\n expected:{}".format(result, expected))
  if serialized_schema:
    self.assertTrue(expected.schema.equals(coder.ArrowSchema()))