This article collects typical usage examples of apache_beam.PTransform in Python. If you are wondering what apache_beam.PTransform is for, how it is used in practice, and what real code that uses it looks like, the curated examples below may help. You can also explore the apache_beam package that it belongs to.
The 15 code examples of apache_beam.PTransform shown below are sorted by popularity by default.
Example 1: test_generate_statistics_in_memory_invalid_custom_generator
# Required import: import apache_beam [as alias]
# Or: from apache_beam import PTransform [as alias]
def test_generate_statistics_in_memory_invalid_custom_generator(self):

  # Dummy PTransform that does nothing.
  class CustomPTransform(beam.PTransform):

    def expand(self, pcoll):
      pass

  record_batch = pa.RecordBatch.from_arrays([pa.array([[1.0]])], ['a'])
  custom_generator = stats_generator.TransformStatsGenerator(
      name='CustomStatsGenerator', ptransform=CustomPTransform())
  options = stats_options.StatsOptions(
      generators=[custom_generator], enable_semantic_domain_stats=True)
  with self.assertRaisesRegexp(
      TypeError, 'Statistics generator.* found object of type '
      'TransformStatsGenerator.'):
    stats_impl.generate_statistics_in_memory(record_batch, options)
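For comparison with the deliberately empty CustomPTransform above, here is a minimal sketch (not taken from any of the projects quoted on this page) of a PTransform whose expand() actually returns a transformed PCollection and which is applied with the | operator:

import apache_beam as beam


class MultiplyByTen(beam.PTransform):
  """A composite transform that scales every element by ten."""

  def expand(self, pcoll):
    # expand() receives the input PCollection and must return the output.
    return pcoll | 'TimesTen' >> beam.Map(lambda x: x * 10)


with beam.Pipeline() as pipeline:
  _ = (pipeline
       | beam.Create([1, 2, 3])
       | MultiplyByTen()
       | beam.Map(print))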
Example 2: __init__
# Required import: import apache_beam [as alias]
# Or: from apache_beam import PTransform [as alias]
def __init__(self, pipeline, cache_base_dir, dataset_keys=None, sink=None):
  """Init method.

  Args:
    pipeline: A beam Pipeline.
    cache_base_dir: A str, the path that the cache should be stored in.
    dataset_keys: (Optional) An iterable of strings.
    sink: (Optional) A PTransform class that takes a path in its constructor
      and is used to write the cache. If not provided, a gzipped TFRecord
      sink is used.
  """
  self.pipeline = pipeline
  self._cache_base_dir = cache_base_dir
  if dataset_keys is None:
    self._sorted_dataset_keys = None
  else:
    self._sorted_dataset_keys = sorted(dataset_keys)
  self._sink = sink
  if self._sink is None:
    # TODO(b/37788560): Possibly use Riegeli as a default file format once
    # possible.
    self._sink = _WriteToTFRecordGzip
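The default sink assigned above is not shown in this example. As a hedged sketch, a gzipped TFRecord sink could be as small as a WriteToTFRecord subclass that forces a .gz suffix (with the default CompressionTypes.AUTO, Beam then gzip-compresses each shard); the class below is an illustration, not the actual implementation:

import apache_beam as beam


class _WriteToTFRecordGzip(beam.io.WriteToTFRecord):
  """Illustrative sketch of a gzipped TFRecord cache sink."""

  def __init__(self, path):
    # The '.gz' suffix triggers gzip compression under CompressionTypes.AUTO.
    super().__init__(path, file_name_suffix='.gz')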
Example 3: __init__
# Required import: import apache_beam [as alias]
# Or: from apache_beam import PTransform [as alias]
def __init__(self,
             input_cache_dir: Text,
             output_cache_dir: Text,
             analyze_data_list: List[_Dataset],
             feature_spec_or_typespec: Mapping[Text, Any],
             preprocessing_fn: Any,
             cache_source: beam.PTransform):
  # pyformat: enable
  self._input_cache_dir = input_cache_dir
  self._output_cache_dir = output_cache_dir
  self._analyze_data_list = analyze_data_list
  self._feature_spec_or_typespec = feature_spec_or_typespec
  self._preprocessing_fn = preprocessing_fn
  self._cache_source = cache_source

  # TODO(zoy): Remove this method once Beam no longer pickles PTransforms,
  # i.e. once https://issues.apache.org/jira/browse/BEAM-3812 is resolved.
Example 4: GetInputSourceToExamplePTransform
# Required import: import apache_beam [as alias]
# Or: from apache_beam import PTransform [as alias]
def GetInputSourceToExamplePTransform(self) -> beam.PTransform:
  """Returns a PTransform for converting the input source to records.

  The records are by default assumed to be tf.train.Example protos; subclasses
  can serialize any protocol buffer into bytes as the output PCollection,
  so long as the downstream component can consume it.

  Note that each input split will be transformed by this function separately.
  For complex use cases, consider overriding 'GenerateExamplesByBeam' instead.

  Here is an example PTransform:

    @beam.ptransform_fn
    @beam.typehints.with_input_types(beam.Pipeline)
    @beam.typehints.with_output_types(Union[tf.train.Example,
                                            tf.train.SequenceExample,
                                            bytes])
    def ExamplePTransform(
        pipeline: beam.Pipeline,
        exec_properties: Dict[Text, Any],
        split_pattern: Text) -> beam.pvalue.PCollection:
      ...
  """
  pass
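Following the pattern sketched in the docstring, a hedged, illustrative ptransform_fn might look like the code below; the 'input_base' exec property and the CSV-to-Example parser are assumptions made for this sketch, not part of any quoted project:

import os
from typing import Any, Dict, Text, Union

import apache_beam as beam
import tensorflow as tf


def _line_to_example(line: Text) -> tf.train.Example:
  # Hypothetical parser: stores each CSV line as a single bytes feature.
  feature = {
      'csv_line': tf.train.Feature(
          bytes_list=tf.train.BytesList(value=[line.encode('utf-8')]))
  }
  return tf.train.Example(features=tf.train.Features(feature=feature))


@beam.ptransform_fn
@beam.typehints.with_input_types(beam.Pipeline)
@beam.typehints.with_output_types(Union[tf.train.Example,
                                        tf.train.SequenceExample,
                                        bytes])
def _CsvToExample(pipeline: beam.Pipeline,
                  exec_properties: Dict[Text, Any],
                  split_pattern: Text) -> beam.pvalue.PCollection:
  """Reads CSV files matching split_pattern and emits tf.train.Example."""
  csv_pattern = os.path.join(exec_properties['input_base'], split_pattern)
  return (pipeline
          | 'ReadCsv' >> beam.io.ReadFromText(csv_pattern, skip_header_lines=1)
          | 'ToExample' >> beam.Map(_line_to_example))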
Example 5: _RawRecordToRecordBatchInternal
# Required import: import apache_beam [as alias]
# Or: from apache_beam import PTransform [as alias]
def _RawRecordToRecordBatchInternal(self,
                                    batch_size: Optional[int] = None
                                   ) -> beam.PTransform:

  @beam.typehints.with_input_types(bytes)
  @beam.typehints.with_output_types(pa.RecordBatch)
  def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
    return (raw_records_pcoll
            | "Batch" >> beam.BatchElements(
                **batch_util.GetBatchElementsKwargs(batch_size))
            | "Decode" >> beam.ParDo(
                _DecodeBatchExamplesDoFn(self._GetSchemaForDecoding(),
                                         self.raw_record_column_name,
                                         self._can_produce_large_types)))

  return beam.ptransform_fn(_PTransformFn)()
Example 6: BeamSource
# Required import: import apache_beam [as alias]
# Or: from apache_beam import PTransform [as alias]
def BeamSource(self, batch_size: Optional[int] = None) -> beam.PTransform:
  """Returns a beam `PTransform` that produces `PCollection[pa.RecordBatch]`.

  May NOT raise an error if the TFMD schema was not provided at construction
  time.

  If a TFMD schema was provided at construction time, all the
  `pa.RecordBatch`es in the result `PCollection` must be of the same schema
  returned by `self.ArrowSchema`. If a TFMD schema was not provided, the
  `pa.RecordBatch`es might not be of the same schema (they may contain
  different numbers of columns).

  Args:
    batch_size: if not None, the `pa.RecordBatch` produced will be of the
      specified size. Otherwise it's automatically tuned by Beam.
  """
Example 7: RawRecordToRecordBatch
# Required import: import apache_beam [as alias]
# Or: from apache_beam import PTransform [as alias]
def RawRecordToRecordBatch(self,
                           batch_size: Optional[int] = None
                          ) -> beam.PTransform:
  """Returns a PTransform that converts raw records to Arrow RecordBatches.

  The input PCollection must come from self.RawRecordBeamSource() (also see
  the documentation for that method).

  Args:
    batch_size: if not None, the `pa.RecordBatch` produced will be of the
      specified size. Otherwise it's automatically tuned by Beam.
  """

  @beam.typehints.with_input_types(bytes)
  @beam.typehints.with_output_types(pa.RecordBatch)
  def _PTransformFn(pcoll: beam.pvalue.PCollection):
    return (pcoll
            | "RawRecordToRecordBatch" >>
            self._RawRecordToRecordBatchInternal(batch_size)
            | "CollectRecordBatchTelemetry" >>
            telemetry.ProfileRecordBatches(self._telemetry_descriptors,
                                           self._logical_format,
                                           self._physical_format))

  return beam.ptransform_fn(_PTransformFn)()
Example 8: Write
# Required import: import apache_beam [as alias]
# Or: from apache_beam import PTransform [as alias]
def Write(evaluation_or_validation: Union[evaluator.Evaluation,
                                          validator.Validation], key: Text,
          ptransform: beam.PTransform) -> beam.pvalue.PDone:
  """Writes the given Evaluation or Validation data using the given PTransform.

  Args:
    evaluation_or_validation: Evaluation or Validation data.
    key: Key for the Evaluation or Validation output to write. It is valid for
      the key to not exist in the dict (in which case the write is a no-op).
    ptransform: PTransform to use for writing.

  Raises:
    ValueError: If the Evaluation or Validation is empty. The key does not
      need to exist in the Evaluation or Validation, but the dict must not be
      empty.

  Returns:
    beam.pvalue.PDone.
  """
  if not evaluation_or_validation:
    raise ValueError('Evaluations and Validations cannot be empty')
  if key in evaluation_or_validation:
    return evaluation_or_validation[key] | ptransform
  return beam.pvalue.PDone(list(evaluation_or_validation.values())[0].pipeline)
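A hedged usage sketch of Write; here an Evaluation is treated simply as a dict from output key to PCollection, and the output path is illustrative:

import apache_beam as beam

with beam.Pipeline() as pipeline:
  # Fabricated single-entry Evaluation-like dict, for illustration only.
  evaluation = {'metrics': pipeline | beam.Create(['accuracy: 0.9'])}
  _ = Write(
      evaluation_or_validation=evaluation,
      key='metrics',
      ptransform=beam.io.WriteToText('/tmp/metrics'))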
Example 9: expand
# Required import: import apache_beam [as alias]
# Or: from apache_beam import PTransform [as alias]
def expand(self, examples):
  """Runs the linters on the data and writes out the results.

  The order in which the linters run is unspecified.

  Args:
    examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

  Returns:
    A pipeline containing the `DataLinter` `PTransform`s.
  """
  coders = (beam.coders.coders.StrUtf8Coder(),
            beam.coders.coders.ProtoCoder(lint_result_pb2.LintResult))
  return (
      [examples | linter for linter in self._linters if linter.should_run()]
      | 'MergeResults' >> beam.Flatten()
      # The Python 2 tuple-unpacking lambda in the original is rewritten for
      # Python 3: each element is a (name, result) pair.
      | 'DropEmpty' >> beam.Filter(lambda kv: kv[1] and len(kv[1].warnings))
      | 'ToDict' >> beam.combiners.ToDict()
      | 'WriteResults' >> beam.io.textio.WriteToText(
          self._results_path,
          coder=beam.coders.coders.PickleCoder(),
          shard_name_template=''))
Example 10: expand
# Required import: import apache_beam [as alias]
# Or: from apache_beam import PTransform [as alias]
def expand(self, examples):
  """Implements the interface required by `PTransform`.

  Args:
    examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

  Returns:
    A `PCollection` containing at most one tuple in which the first element
    is the `LintDetector` name and the second is the `LintResult`.
  """
  result = self._lint(examples)
  if not isinstance(result,
                    (beam.pvalue.PCollection, beam.transforms.PTransform)):
    result_pcoll = beam.Create([result] if result else [])
    result = examples.pipeline | 'Materialize' >> result_pcoll
  return result | 'PairWithName' >> beam.Map(
      lambda r: (type(self).__name__, r))
Example 11: _lint
# Required import: import apache_beam [as alias]
# Or: from apache_beam import PTransform [as alias]
def _lint(self, examples):
  """Returns the result of the `TokenizableStringDetector` linter.

  Args:
    examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

  Returns:
    A `LintResult` of the format
      warnings: [feature names]
      lint_samples: [{strings: [vals...]} for each warning]
  """
  result = self._make_result()
  string_features = utils.get_string_features(self._stats)
  for feature in self._stats.features:
    if feature.name not in string_features:
      continue
    str_stats = feature.string_stats
    if (str_stats.avg_length > self._length_threshold and
        str_stats.unique > self._enum_threshold):
      result.warnings.append(feature.name)
      samples = [bucket.label for bucket in str_stats.rank_histogram.buckets
                 if len(bucket.label) > self._length_threshold]
      result.lint_samples.add(strings=samples[:self.N_LINT_SAMPLES])
  return result
Example 12: __init__
# Required import: import apache_beam [as alias]
# Or: from apache_beam import PTransform [as alias]
def __init__(self,
             file_path,
             num_shards=1,
             compression_type=CompressionTypes.AUTO,
             headers=None):
  # type: (str, int, str, List[str]) -> None
  """Initialize a WriteToVcf PTransform.

  Args:
    file_path: The file path to write to. The files written will begin with
      this prefix, followed by a shard identifier (see num_shards). The file
      path should include the file extension (e.g. ".vcf", ".vcf.gz", etc.).
    num_shards: The number of files (shards) used for output. If not set, the
      service will decide on the optimal number of shards. Constraining the
      number of shards is likely to reduce the performance of a pipeline.
      Setting this value is not recommended unless you require a specific
      number of output files.
    compression_type: Used to handle compressed output files. The typical
      value for VCF files is CompressionTypes.UNCOMPRESSED. If set to
      CompressionTypes.AUTO, file_path's extension will be used to detect the
      compression.
    headers: A list of VCF meta-information lines describing at least the
      INFO and FORMAT entries in each record, plus a header line describing
      the column names. These lines will be written at the beginning of the
      file.
  """
  self._file_path = file_path
  self._num_shards = num_shards
  self._compression_type = compression_type
  self._header = headers and '\n'.join([h.strip() for h in headers]) + '\n'
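A hedged usage sketch of the parameters documented above; variant_pcoll stands in for a PCollection of variant records, and the transform's expand() and sink are not shown in this example:

_ = (variant_pcoll
     | 'WriteToVcf' >> WriteToVcf(
         file_path='/tmp/output.vcf',  # illustrative output path
         num_shards=1,
         compression_type=CompressionTypes.UNCOMPRESSED,
         headers=['##fileformat=VCFv4.2',
                  '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO']))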
Example 13: normalize_inputs
# Required import: import apache_beam [as alias]
# Or: from apache_beam import PTransform [as alias]
def normalize_inputs(inputs):
  """Preprocessing function for tf.Transform (full-pass transformations).

  Here we do any preprocessing that requires a full pass over the dataset. It
  takes as input the preprocessed data from the `PTransform` we specify, in
  this case `SimpleFeatureExtraction`.

  Common operations are scaling values to 0-1, getting the minimum or maximum
  value of a certain field, or creating a vocabulary for a string field.

  There are two main types of transformations supported by tf.Transform; for
  more information, check the following modules:
    - analyzers: tensorflow_transform.analyzers.py
    - mappers: tensorflow_transform.mappers.py

  Any transformation done in tf.Transform will be embedded into the
  TensorFlow model itself.
  """
  return {
      # Scale the input features for normalization.
      'NormalizedC': tft.scale_to_0_1(inputs['TotalC']),
      'NormalizedH': tft.scale_to_0_1(inputs['TotalH']),
      'NormalizedO': tft.scale_to_0_1(inputs['TotalO']),
      'NormalizedN': tft.scale_to_0_1(inputs['TotalN']),
      # Do not scale the label since we want the absolute number for
      # prediction.
      'Energy': inputs['Energy'],
  }
# [END dataflow_molecules_normalize_inputs]
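A hedged usage sketch showing how a preprocessing function like normalize_inputs is handed to tf.Transform on Beam; the feature spec and the two data points below are fabricated for illustration:

import apache_beam as beam
import tensorflow as tf
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils

# Hypothetical metadata matching the feature names used in normalize_inputs.
RAW_METADATA = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec({
        'TotalC': tf.io.FixedLenFeature([], tf.float32),
        'TotalH': tf.io.FixedLenFeature([], tf.float32),
        'TotalO': tf.io.FixedLenFeature([], tf.float32),
        'TotalN': tf.io.FixedLenFeature([], tf.float32),
        'Energy': tf.io.FixedLenFeature([], tf.float32),
    }))

with beam.Pipeline() as pipeline, tft_beam.Context(temp_dir='/tmp/tft'):
  # Fabricated example rows, for illustration only.
  raw_data = pipeline | beam.Create([
      {'TotalC': 6.0, 'TotalH': 12.0, 'TotalO': 6.0, 'TotalN': 0.0,
       'Energy': -75.3},
      {'TotalC': 2.0, 'TotalH': 6.0, 'TotalO': 1.0, 'TotalN': 0.0,
       'Energy': -40.5},
  ])
  transformed_dataset, transform_fn = (
      (raw_data, RAW_METADATA)
      | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
          normalize_inputs))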
Example 14: _filter_class_attributes
# Required import: import apache_beam [as alias]
# Or: from apache_beam import PTransform [as alias]
def _filter_class_attributes(path, parent, children):
  """Filters out class attributes that are part of the PTransform API."""
  del path
  skip_class_attributes = {
      "expand", "label", "from_runner_api", "register_urn", "side_inputs"
  }
  if inspect.isclass(parent):
    children = [(name, child)
                for (name, child) in children
                if name not in skip_class_attributes]
  return children
Example 15: main
# Required import: import apache_beam [as alias]
# Or: from apache_beam import PTransform [as alias]
def main(args):
  if args[1:]:
    raise ValueError("Unrecognized Command line args", args[1:])

  for obj in supress_docs_for:
    doc_controls.do_not_generate_docs(obj)

  for name, value in inspect.getmembers(tfdv):
    if inspect.ismodule(value):
      doc_controls.do_not_generate_docs(value)

  for name, value in inspect.getmembers(beam.PTransform):
    # This ensures that the methods of PTransform are not documented in any
    # derived classes.
    if name == "__init__":
      continue
    try:
      doc_controls.do_not_doc_inheritable(value)
    except (TypeError, AttributeError):
      pass

  doc_generator = generate_lib.DocGenerator(
      root_title="TensorFlow Data Validation",
      py_modules=[("tfdv", tfdv)],
      code_url_prefix=FLAGS.code_url_prefix,
      search_hints=FLAGS.search_hints,
      site_path=FLAGS.site_path,
      # Use private_map to exclude doc locations by name if excluding by
      # object is insufficient.
      private_map={},
      # local_definitions_filter ensures that shared modules are only
      # documented in the location that defines them, instead of every
      # location that imports them.
      callbacks=[public_api.local_definitions_filter,
                 _filter_class_attributes])

  return doc_generator.build(output_dir=FLAGS.output_dir)