本文整理汇总了Python中apache_beam.ptransform_fn方法的典型用法代码示例。如果您正苦于以下问题：Python apache_beam.ptransform_fn方法的具体用法？Python apache_beam.ptransform_fn怎么用？Python apache_beam.ptransform_fn使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块apache_beam的用法示例。
在下文中一共展示了apache_beam.ptransform_fn方法的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: GetInputSourceToExamplePTransform
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import ptransform_fn [as 别名]
def GetInputSourceToExamplePTransform(self) -> beam.PTransform:
    """Return the PTransform that converts an input source into records.

    Records are assumed to be tf.train.Example protos by default. Subclasses
    may instead serialize any protocol buffer to bytes in the output
    PCollection, as long as the downstream component can consume it.

    Each input split is transformed by this function separately. For complex
    use cases, consider overriding 'GenerateExamplesByBeam' instead.

    Example PTransform:

        @beam.ptransform_fn
        @beam.typehints.with_input_types(beam.Pipeline)
        @beam.typehints.with_output_types(Union[tf.train.Example,
                                                tf.train.SequenceExample,
                                                bytes])
        def ExamplePTransform(
            pipeline: beam.Pipeline,
            exec_properties: Dict[Text, Any],
            split_pattern: Text) -> beam.pvalue.PCollection
    """
示例2: _RawRecordToRecordBatchInternal
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import ptransform_fn [as 别名]
def _RawRecordToRecordBatchInternal(self,
                                    batch_size: Optional[int] = None
                                    ) -> beam.PTransform:
    """Build the PTransform that batches raw records and decodes each batch.

    Args:
      batch_size: forwarded to batch_util.GetBatchElementsKwargs, whose result
        configures beam.BatchElements.

    Returns:
      A PTransform declared to take bytes elements and produce
      pa.RecordBatch elements.
    """

    @beam.ptransform_fn
    @beam.typehints.with_input_types(bytes)
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
        # Step 1: group the incoming records into batches.
        batching_kwargs = batch_util.GetBatchElementsKwargs(batch_size)
        batched_records = (
            raw_records_pcoll
            | "Batch" >> beam.BatchElements(**batching_kwargs))
        # Step 2: run the decoding DoFn over every batch. The DoFn is built
        # only after the batching step has been applied, as in a single
        # chained pipe expression.
        decode_do_fn = _DecodeBatchExamplesDoFn(
            self._GetSchemaForDecoding(),
            self.raw_record_column_name,
            self._can_produce_large_types)
        return batched_records | "Decode" >> beam.ParDo(decode_do_fn)

    return _PTransformFn()  # pylint: disable=no-value-for-parameter
示例3: _RawRecordToRecordBatchInternal
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import ptransform_fn [as 别名]
def _RawRecordToRecordBatchInternal(self,
                                    batch_size: Optional[int] = None
                                    ) -> beam.PTransform:
    """Return a PTransform that batches raw records, then decodes the batches.

    Decoding uses _DecodeBatchExamplesDoFn constructed from self._schema,
    self.raw_record_column_name and self._can_produce_large_types.

    Args:
      batch_size: passed to batch_util.GetBatchElementsKwargs to derive the
        keyword arguments for beam.BatchElements.

    Returns:
      A PTransform declared as bytes -> pa.RecordBatch.
    """

    @beam.ptransform_fn
    @beam.typehints.with_input_types(bytes)
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
        batcher = beam.BatchElements(
            **batch_util.GetBatchElementsKwargs(batch_size))
        batches = raw_records_pcoll | "Batch" >> batcher
        # The decoder is created after batching has been applied, keeping the
        # same construction order as one chained pipe expression.
        decoder = beam.ParDo(
            _DecodeBatchExamplesDoFn(self._schema,
                                     self.raw_record_column_name,
                                     self._can_produce_large_types))
        return batches | "Decode" >> decoder

    return _PTransformFn()  # pylint: disable=no-value-for-parameter
示例4: RawRecordToRecordBatch
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import ptransform_fn [as 别名]
def RawRecordToRecordBatch(self,
                           batch_size: Optional[int] = None
                           ) -> beam.PTransform:
    """Return a PTransform converting raw records to Arrow RecordBatches.

    The input PCollection must come from self.RawRecordBeamSource() (see the
    documentation of that method as well).

    The conversion itself is delegated to
    self._RawRecordToRecordBatchInternal; the resulting batches are then
    passed through telemetry.ProfileRecordBatches.

    Args:
      batch_size: if not None, the `pa.RecordBatch` produced will be of the
        specified size. Otherwise it's automatically tuned by Beam.
    """

    @beam.ptransform_fn
    @beam.typehints.with_input_types(bytes)
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _PTransformFn(pcoll: beam.pvalue.PCollection):
        record_batches = (
            pcoll
            | "RawRecordToRecordBatch" >>
            self._RawRecordToRecordBatchInternal(batch_size))
        # Telemetry is attached after the conversion step has been applied.
        profiler = telemetry.ProfileRecordBatches(
            self._telemetry_descriptors,
            self._logical_format,
            self._physical_format)
        return record_batches | "CollectRecordBatchTelemetry" >> profiler

    return _PTransformFn()  # pylint: disable=no-value-for-parameter
示例5: _RawRecordBeamSourceInternal
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import ptransform_fn [as 别名]
def _RawRecordBeamSourceInternal(self):
    """Return a PTransform that encodes each input element.

    Every element of the input PCollection is mapped through
    example_proto_coder.ExampleProtoCoder(self._schema).encode.
    """
    # NOTE(review): the declared output type is pa.RecordBatch, yet the only
    # step here is a Map over ExampleProtoCoder.encode -- confirm the type
    # hint matches what encode() actually returns.

    @beam.typehints.with_output_types(pa.RecordBatch)
    def _ptransform_fn(instances):
        # The coder is built when the transform is expanded, not when this
        # method is called.
        encoder = example_proto_coder.ExampleProtoCoder(self._schema)
        return instances | 'EncodeToTfExamples' >> beam.Map(encoder.encode)

    return beam.ptransform_fn(_ptransform_fn)()
# TODO(b/156761358): deprecated; remove after tfx-bsl 0.23 release.
示例6: _RawRecordBeamSourceInternal
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import ptransform_fn [as 别名]
def _RawRecordBeamSourceInternal(self) -> beam.PTransform:
    """Return a pass-through PTransform.

    The wrapped function hands back the PCollection it receives unchanged;
    both the declared input and output element types are bytes.
    """

    @beam.ptransform_fn
    @beam.typehints.with_input_types(bytes)
    @beam.typehints.with_output_types(bytes)
    def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
        return raw_records_pcoll

    return _PTransformFn()  # pylint: disable=no-value-for-parameter
示例7: _RawRecordToRecordBatchInternal
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import ptransform_fn [as 别名]
def _RawRecordToRecordBatchInternal(self,
                                    batch_size: Optional[int] = None
                                    ) -> beam.PTransform:
    """Return a PTransform that batches raw records and converts each batch.

    The input PCollection is first grouped with beam.BatchElements, then every
    batch is mapped through _BatchedRecordsToArrow together with
    self.raw_record_column_name and self._can_produce_large_types.

    Args:
      batch_size: forwarded to batch_util.GetBatchElementsKwargs to derive the
        keyword arguments for beam.BatchElements.

    Returns:
      A PTransform declared as bytes -> pa.RecordBatch.
    """

    # The input is a PCollection of raw records (it is fed straight into
    # BatchElements), so the element type hint is `bytes`, not beam.Pipeline.
    # This matches the other implementations of this method and the bytes
    # output declared by the raw-record sources.
    @beam.typehints.with_input_types(bytes)
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _PTransformFn(raw_record_pcoll: beam.pvalue.PCollection):
        return (raw_record_pcoll
                | "Batch" >> beam.BatchElements(
                    **batch_util.GetBatchElementsKwargs(batch_size))
                | "ToRecordBatch" >>
                beam.Map(_BatchedRecordsToArrow, self.raw_record_column_name,
                         self._can_produce_large_types))

    return beam.ptransform_fn(_PTransformFn)()
示例8: _RawRecordBeamSourceInternal
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import ptransform_fn [as 别名]
def _RawRecordBeamSourceInternal(self) -> beam.PTransform:
    """Return a PTransform that reads records from self._file_pattern.

    Reading is done with beam.io.ReadFromTFRecord using a BytesCoder; the
    declared output element type is bytes.
    """

    # NOTE(review): the input type hint is beam.Pipeline while the parameter
    # is annotated as a PCollection -- confirm which one is intended.
    @beam.ptransform_fn
    @beam.typehints.with_input_types(beam.Pipeline)
    @beam.typehints.with_output_types(bytes)
    def _PTransformFn(pipeline: beam.pvalue.PCollection):
        reader = beam.io.ReadFromTFRecord(
            self._file_pattern,
            coder=beam.coders.BytesCoder(),
            # TODO(b/114938612): Eventually remove this override.
            validate=False)
        return pipeline | "ReadFromTFRecord" >> reader

    return _PTransformFn()  # pylint: disable=no-value-for-parameter
示例9: BeamSource
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import ptransform_fn [as 别名]
def BeamSource(self, batch_size: Optional[int] = None) -> beam.PTransform:
    """Return a PTransform chaining the raw-record source and the converter.

    The input is piped through self.RawRecordBeamSource() and then through
    self.RawRecordToRecordBatch(batch_size).

    Args:
      batch_size: forwarded unchanged to self.RawRecordToRecordBatch.
    """

    @beam.ptransform_fn
    @beam.typehints.with_input_types(beam.Pipeline)
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _PTransformFn(pipeline: beam.pvalue.PCollection):
        """Converts raw records to RecordBatches."""
        raw_records = (
            pipeline | "RawRecordBeamSource" >> self.RawRecordBeamSource())
        # The converter is created after the source has been applied, the
        # same order a single chained expression would use.
        to_record_batch = self.RawRecordToRecordBatch(batch_size)
        return raw_records | "RawRecordToRecordBatch" >> to_record_batch

    return _PTransformFn()  # pylint: disable=no-value-for-parameter
示例10: _RawRecordToRecordBatchInternal
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import ptransform_fn [as 别名]
def _RawRecordToRecordBatchInternal(self,
                                    batch_size: Optional[int] = None
                                    ) -> beam.PTransform:
    """Return a PTransform that applies csv_decoder.CSVToRecordBatch.

    The decoder is configured from this object's CSV settings (column names,
    delimiters, schema, and so on).

    Args:
      batch_size: passed to the decoder as `desired_batch_size`.

    Returns:
      A PTransform declared as List[bytes] -> pa.RecordBatch.
    """

    @beam.ptransform_fn
    @beam.typehints.with_input_types(List[bytes])
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
        """Returns RecordBatch of csv lines."""
        # Configure the decoder that turns raw csv lines into record batches.
        csv_to_record_batch = csv_decoder.CSVToRecordBatch(
            column_names=self._column_names,
            delimiter=self._delimiter,
            skip_blank_lines=self._skip_blank_lines,
            schema=self._schema,
            desired_batch_size=batch_size,
            multivalent_columns=self._multivalent_columns,
            secondary_delimiter=self._secondary_delimiter,
            produce_large_types=self._can_produce_large_types,
            raw_record_column_name=self._raw_record_column_name)
        return raw_records_pcoll | "CSVToRecordBatch" >> csv_to_record_batch

    return _PTransformFn()  # pylint: disable=no-value-for-parameter
示例11: GetInputSourceToExamplePTransform
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import ptransform_fn [as 别名]
def GetInputSourceToExamplePTransform(self) -> beam.PTransform:
    """Return the (uncalled) PTransform factory used to import records."""

    @beam.ptransform_fn
    @beam.typehints.with_input_types(beam.Pipeline)
    @beam.typehints.with_output_types(Union[tf.train.Example,
                                            tf.train.SequenceExample, bytes])
    def ImportRecord(pipeline: beam.Pipeline,
                     exec_properties: Dict[Text, Any],
                     split_pattern: Text) -> beam.pvalue.PCollection:
        """Import records and convert them to the configured payload format.

        The records are tf.train.Example, tf.train.SequenceExample,
        or serialized proto.

        Args:
          pipeline: Beam pipeline.
          exec_properties: A dict of execution properties.
            - input_base: input dir that contains input data.
          split_pattern: Split.pattern in Input config, glob relative file
            pattern that maps to input files with root directory given by
            input_base.

        Returns:
          PCollection of records (tf.Example, tf.SequenceExample, or bytes).

        Raises:
          ValueError: if the configured output payload format is not one of
            the three supported formats.
        """
        payload_format = exec_properties.get(utils.OUTPUT_DATA_FORMAT_KEY)

        # The import step is applied before the format is checked.
        # pylint: disable=no-value-for-parameter
        serialized_records = pipeline | _ImportSerializedRecord(
            exec_properties, split_pattern)
        # pylint: enable=no-value-for-parameter

        formats = example_gen_pb2.PayloadFormat
        if payload_format == formats.FORMAT_PROTO:
            # Serialized protos are passed through untouched.
            return serialized_records
        if payload_format == formats.FORMAT_TF_EXAMPLE:
            parse_example = beam.Map(tf.train.Example.FromString)
            return serialized_records | 'ToTFExample' >> parse_example
        if payload_format == formats.FORMAT_TF_SEQUENCE_EXAMPLE:
            parse_sequence_example = beam.Map(
                tf.train.SequenceExample.FromString)
            return (serialized_records
                    | 'ToTFSequenceExample' >> parse_sequence_example)

        raise ValueError(
            'output_payload_format must be one of FORMAT_TF_EXAMPLE,'
            ' FORMAT_TF_SEQUENCE_EXAMPLE or FORMAT_PROTO')

    return ImportRecord