This page collects typical usage examples of Python's apache_beam.PTransform. If you are wondering what apache_beam.PTransform does in practice, how to use it, or want to see real examples of it, the curated code samples below may help. You can also explore further usage examples from the containing module, apache_beam.
The following shows 15 code examples involving apache_beam.PTransform, sorted by popularity by default.
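Before diving into the examples, here is a minimal, self-contained sketch of the pattern they all build on: a custom transform subclasses beam.PTransform, overrides expand() to compose other transforms, and is then applied to a PCollection with the | operator. The names used here (FilterAndCount, the label strings) are illustrative and do not appear in the examples below.

import apache_beam as beam


class FilterAndCount(beam.PTransform):
  """Drops falsy elements and counts the remainder."""

  def expand(self, pcoll):
    return (pcoll
            | 'DropFalsy' >> beam.Filter(bool)
            | 'Count' >> beam.combiners.Count.Globally())


with beam.Pipeline() as p:
  _ = (p
       | beam.Create(['a', '', 'b'])
       | 'FilterAndCount' >> FilterAndCount()
       | beam.Map(print))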
Example 1: test_generate_statistics_in_memory_invalid_custom_generator
# Required module: import apache_beam [as alias]
# Alternatively: from apache_beam import PTransform [as alias]

def test_generate_statistics_in_memory_invalid_custom_generator(self):

  # Dummy PTransform that does nothing.
  class CustomPTransform(beam.PTransform):

    def expand(self, pcoll):
      pass

  record_batch = pa.RecordBatch.from_arrays([pa.array([[1.0]])], ['a'])
  custom_generator = stats_generator.TransformStatsGenerator(
      name='CustomStatsGenerator', ptransform=CustomPTransform())
  options = stats_options.StatsOptions(
      generators=[custom_generator], enable_semantic_domain_stats=True)
  with self.assertRaisesRegexp(
      TypeError, 'Statistics generator.* found object of type '
      'TransformStatsGenerator.'):
    stats_impl.generate_statistics_in_memory(record_batch, options)
Example 2: __init__
# Required module: import apache_beam [as alias]
# Alternatively: from apache_beam import PTransform [as alias]

def __init__(self, pipeline, cache_base_dir, dataset_keys=None, sink=None):
  """Init method.

  Args:
    pipeline: A beam Pipeline.
    cache_base_dir: A str, the path that the cache should be stored in.
    dataset_keys: (Optional) An iterable of strings.
    sink: (Optional) A PTransform class that takes a path in its constructor,
      and is used to write the cache. If not provided this uses a GZipped
      TFRecord sink.
  """
  self.pipeline = pipeline
  self._cache_base_dir = cache_base_dir
  if dataset_keys is None:
    self._sorted_dataset_keys = None
  else:
    self._sorted_dataset_keys = sorted(dataset_keys)
  self._sink = sink
  if self._sink is None:
    # TODO(b/37788560): Possibly use Riegeli as a default file format once
    # possible.
    self._sink = _WriteToTFRecordGzip
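The default sink `_WriteToTFRecordGzip` is referenced but not shown above. A hedged sketch of what such a sink could look like, assuming it simply wraps `beam.io.WriteToTFRecord` with a `.gz` file suffix (which should trigger gzip compression under AUTO compression detection):

import apache_beam as beam


class _WriteToTFRecordGzip(beam.io.WriteToTFRecord):
  """Sketch of a gzipped TFRecord sink: takes a path, appends a .gz suffix."""

  def __init__(self, base_path):
    super().__init__(base_path, file_name_suffix='.gz')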
Example 3: __init__
# Required module: import apache_beam [as alias]
# Alternatively: from apache_beam import PTransform [as alias]

def __init__(self,
             input_cache_dir: Text,
             output_cache_dir: Text,
             analyze_data_list: List[_Dataset],
             feature_spec_or_typespec: Mapping[Text, Any],
             preprocessing_fn: Any,
             cache_source: beam.PTransform):
  # pyformat: enable
  self._input_cache_dir = input_cache_dir
  self._output_cache_dir = output_cache_dir
  self._analyze_data_list = analyze_data_list
  self._feature_spec_or_typespec = feature_spec_or_typespec
  self._preprocessing_fn = preprocessing_fn
  self._cache_source = cache_source

# TODO(zoy): Remove this method once beam no longer pickles PTransforms,
# i.e. once https://issues.apache.org/jira/browse/BEAM-3812 is resolved.
Example 4: GetInputSourceToExamplePTransform
# Required module: import apache_beam [as alias]
# Alternatively: from apache_beam import PTransform [as alias]

def GetInputSourceToExamplePTransform(self) -> beam.PTransform:
  """Returns PTransform for converting input source to records.

  The record is by default assumed to be tf.train.Example protos; subclasses
  can serialize any protocol buffer into bytes as the output PCollection,
  so long as the downstream component can consume it.

  Note that each input split will be transformed by this function separately.
  For complex use cases, consider overriding 'GenerateExamplesByBeam' instead.

  Here is an example PTransform:
    @beam.ptransform_fn
    @beam.typehints.with_input_types(beam.Pipeline)
    @beam.typehints.with_output_types(Union[tf.train.Example,
                                            tf.train.SequenceExample,
                                            bytes])
    def ExamplePTransform(
        pipeline: beam.Pipeline,
        exec_properties: Dict[Text, Any],
        split_pattern: Text) -> beam.pvalue.PCollection
  """
  pass
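For concreteness, here is a hedged sketch of an implementation matching the signature in the docstring above, assuming the input split is a pattern of TFRecord files holding serialized tf.train.Example protos; the function name _TFRecordToExample is illustrative.

from typing import Any, Dict, Text

import apache_beam as beam
import tensorflow as tf


@beam.ptransform_fn
@beam.typehints.with_input_types(beam.Pipeline)
@beam.typehints.with_output_types(tf.train.Example)
def _TFRecordToExample(pipeline: beam.Pipeline,
                       exec_properties: Dict[Text, Any],
                       split_pattern: Text) -> beam.pvalue.PCollection:
  """Reads TFRecord files matching split_pattern and parses each record."""
  del exec_properties  # Unused in this sketch.
  return (pipeline
          | 'ReadFromTFRecord' >> beam.io.ReadFromTFRecord(split_pattern)
          | 'ParseExample' >> beam.Map(tf.train.Example.FromString))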
Example 5: _RawRecordToRecordBatchInternal
# Required module: import apache_beam [as alias]
# Alternatively: from apache_beam import PTransform [as alias]

def _RawRecordToRecordBatchInternal(self,
                                    batch_size: Optional[int] = None
                                   ) -> beam.PTransform:

  @beam.typehints.with_input_types(bytes)
  @beam.typehints.with_output_types(pa.RecordBatch)
  def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
    return (raw_records_pcoll
            | "Batch" >> beam.BatchElements(
                **batch_util.GetBatchElementsKwargs(batch_size))
            | "Decode" >> beam.ParDo(
                _DecodeBatchExamplesDoFn(self._GetSchemaForDecoding(),
                                         self.raw_record_column_name,
                                         self._can_produce_large_types)))

  return beam.ptransform_fn(_PTransformFn)()
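The final line above, beam.ptransform_fn(_PTransformFn)(), wraps the inner function into a PTransform and instantiates it; it is equivalent to decorating the function with @beam.ptransform_fn and then calling it. A minimal standalone sketch of that pattern (the names here are illustrative):

import apache_beam as beam


@beam.ptransform_fn
def _PairWithLength(pcoll):
  """A function-style PTransform that pairs each string with its length."""
  return pcoll | beam.Map(lambda s: (s, len(s)))


with beam.Pipeline() as p:
  _ = (p
       | beam.Create(['abc', 'de'])
       | 'PairWithLength' >> _PairWithLength()  # Calling it yields the PTransform.
       | beam.Map(print))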
Example 6: BeamSource
# Required module: import apache_beam [as alias]
# Alternatively: from apache_beam import PTransform [as alias]

def BeamSource(self, batch_size: Optional[int] = None) -> beam.PTransform:
  """Returns a beam `PTransform` that produces `PCollection[pa.RecordBatch]`.

  May NOT raise an error if the TFMD schema was not provided at construction
  time.

  If a TFMD schema was provided at construction time, all the
  `pa.RecordBatch`es in the result `PCollection` must be of the same schema
  returned by `self.ArrowSchema`. If a TFMD schema was not provided, the
  `pa.RecordBatch`es might not be of the same schema (they may contain
  different numbers of columns).

  Args:
    batch_size: if not None, the `pa.RecordBatch` produced will be of the
      specified size. Otherwise it's automatically tuned by Beam.
  """
Example 7: RawRecordToRecordBatch
# Required module: import apache_beam [as alias]
# Alternatively: from apache_beam import PTransform [as alias]

def RawRecordToRecordBatch(self,
                           batch_size: Optional[int] = None
                          ) -> beam.PTransform:
  """Returns a PTransform that converts raw records to Arrow RecordBatches.

  The input PCollection must be from self.RawRecordBeamSource() (also see
  the documentation for that method).

  Args:
    batch_size: if not None, the `pa.RecordBatch` produced will be of the
      specified size. Otherwise it's automatically tuned by Beam.
  """

  @beam.typehints.with_input_types(bytes)
  @beam.typehints.with_output_types(pa.RecordBatch)
  def _PTransformFn(pcoll: beam.pvalue.PCollection):
    return (pcoll
            | "RawRecordToRecordBatch" >>
            self._RawRecordToRecordBatchInternal(batch_size)
            | "CollectRecordBatchTelemetry" >>
            telemetry.ProfileRecordBatches(self._telemetry_descriptors,
                                           self._logical_format,
                                           self._physical_format))

  return beam.ptransform_fn(_PTransformFn)()
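And a hedged sketch of how this pairs with RawRecordBeamSource, per the docstring's requirement that the input come from that method; `tfxio` is again assumed to be a concrete TFXIO implementation such as the TFExampleRecord constructed in the sketch after Example 6.

with beam.Pipeline() as p:
  record_batches = (
      p
      | 'ReadRawRecords' >> tfxio.RawRecordBeamSource()
      | 'ToRecordBatch' >> tfxio.RawRecordToRecordBatch(batch_size=100))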
Example 8: Write
# Required module: import apache_beam [as alias]
# Alternatively: from apache_beam import PTransform [as alias]

def Write(evaluation_or_validation: Union[evaluator.Evaluation,
                                          validator.Validation], key: Text,
          ptransform: beam.PTransform) -> beam.pvalue.PDone:
  """Writes given Evaluation or Validation data using given writer PTransform.

  Args:
    evaluation_or_validation: Evaluation or Validation data.
    key: Key for Evaluation or Validation output to write. It is valid for the
      key to not exist in the dict (in which case the write is a no-op).
    ptransform: PTransform to use for writing.

  Raises:
    ValueError: If Evaluation or Validation is empty. The key does not need to
      exist in the Evaluation or Validation, but the dict must not be empty.

  Returns:
    beam.pvalue.PDone.
  """
  if not evaluation_or_validation:
    raise ValueError('Evaluations and Validations cannot be empty')
  if key in evaluation_or_validation:
    return evaluation_or_validation[key] | ptransform
  return beam.pvalue.PDone(list(evaluation_or_validation.values())[0].pipeline)
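A hedged usage sketch of Write, assuming an Evaluation is a dict mapping output names to PCollections; the key, the contents, and the output path are illustrative.

import apache_beam as beam

with beam.Pipeline() as p:
  evaluation = {'metrics': p | beam.Create(['accuracy: 0.9'])}
  # Writes the 'metrics' PCollection as text; a missing key would make the
  # write a no-op and just return PDone.
  _ = Write(evaluation, key='metrics',
            ptransform=beam.io.WriteToText('/tmp/metrics'))  # Illustrative path.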
Example 9: expand
# Required module: import apache_beam [as alias]
# Alternatively: from apache_beam import PTransform [as alias]

def expand(self, examples):
  """Runs the linters on the data and writes out the results.

  The order in which the linters run is unspecified.

  Args:
    examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

  Returns:
    A pipeline containing the `DataLinter` `PTransform`s.
  """
  coders = (beam.coders.coders.StrUtf8Coder(),
            beam.coders.coders.ProtoCoder(lint_result_pb2.LintResult))
  return (
      [examples | linter for linter in self._linters if linter.should_run()]
      | 'MergeResults' >> beam.Flatten()
      | 'DropEmpty' >> beam.Filter(
          lambda name_result: name_result[1] and len(name_result[1].warnings))
      | 'ToDict' >> beam.combiners.ToDict()
      | 'WriteResults' >> beam.io.textio.WriteToText(
          self._results_path,
          coder=beam.coders.coders.PickleCoder(),
          shard_name_template=''))
Example 10: expand
# Required module: import apache_beam [as alias]
# Alternatively: from apache_beam import PTransform [as alias]

def expand(self, examples):
  """Implements the interface required by `PTransform`.

  Args:
    examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

  Returns:
    A `PTransform` that yields a `PCollection` containing at most one tuple in
    which the first element is the `LintDetector` name and the second is the
    `LintResult`.
  """
  result = self._lint(examples)
  if not isinstance(result,
                    (beam.pvalue.PCollection, beam.transforms.PTransform)):
    result_pcoll = beam.Create([result] if result else [])
    result = examples.pipeline | 'Materialize' >> result_pcoll
  return result | 'PairWithName' >> beam.Map(
      lambda r: (type(self).__name__, r))
Example 11: _lint
# Required module: import apache_beam [as alias]
# Alternatively: from apache_beam import PTransform [as alias]

def _lint(self, examples):
  """Returns the result of the `TokenizableStringDetector` linter.

  Args:
    examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

  Returns:
    A `LintResult` of the format
      warnings: [feature names]
      lint_samples: [{ strings=[vals..] } for each warning]
  """
  result = self._make_result()
  string_features = utils.get_string_features(self._stats)
  for feature in self._stats.features:
    if feature.name not in string_features:
      continue
    str_stats = feature.string_stats
    if (str_stats.avg_length > self._length_threshold and
        str_stats.unique > self._enum_threshold):
      result.warnings.append(feature.name)
      samples = [bucket.label for bucket in str_stats.rank_histogram.buckets
                 if len(bucket.label) > self._length_threshold]
      result.lint_samples.add(strings=samples[:self.N_LINT_SAMPLES])
  return result
Example 12: __init__
# Required module: import apache_beam [as alias]
# Alternatively: from apache_beam import PTransform [as alias]

def __init__(self,
             file_path,
             num_shards=1,
             compression_type=CompressionTypes.AUTO,
             headers=None):
  # type: (str, int, str, List[str]) -> None
  """Initialize a WriteToVcf PTransform.

  Args:
    file_path: The file path to write to. The files written will begin
      with this prefix, followed by a shard identifier (see num_shards). The
      file path should include the file extension (i.e. ".vcf", ".vcf.gz",
      etc).
    num_shards: The number of files (shards) used for output. If not set, the
      service will decide on the optimal number of shards.
      Constraining the number of shards is likely to reduce
      the performance of a pipeline. Setting this value is not recommended
      unless you require a specific number of output files.
    compression_type: Used to handle compressed output files. Typical value
      for VCF files is CompressionTypes.UNCOMPRESSED. If set to
      CompressionTypes.AUTO, file_path's extension will be used to detect
      compression.
    headers: A list of VCF meta-information lines describing at least the
      INFO and FORMAT entries in each record, and a header line describing the
      column names. These lines will be written at the beginning of the file.
  """
  self._file_path = file_path
  self._num_shards = num_shards
  self._compression_type = compression_type
  self._header = headers and '\n'.join([h.strip() for h in headers]) + '\n'
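A hedged usage sketch: `variants` is assumed to be a PCollection of the variant records this sink's expand() (not shown above) knows how to serialize; the output path and headers are illustrative.

_ = (variants  # A PCollection of variant records produced upstream.
     | 'WriteToVcf' >> WriteToVcf(
         file_path='/tmp/output.vcf',  # Illustrative output path.
         num_shards=1,
         headers=['##fileformat=VCFv4.3',
                  '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO']))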
Example 13: normalize_inputs
# Required module: import apache_beam [as alias]
# Alternatively: from apache_beam import PTransform [as alias]

def normalize_inputs(inputs):
  """Preprocessing function for tf.Transform (full-pass transformations).

  Here we will do any preprocessing that requires a full pass over the dataset.
  It takes as input the preprocessed data from the `PTransform` we specify, in
  this case `SimpleFeatureExtraction`.

  Common operations might be scaling values to 0-1, getting the minimum or
  maximum value of a certain field, or creating a vocabulary for a string
  field.

  There are two main types of transformations supported by tf.Transform; for
  more information, check the following modules:
    - analyzers: tensorflow_transform.analyzers.py
    - mappers: tensorflow_transform.mappers.py

  Any transformation done in tf.Transform will be embedded into the TensorFlow
  model itself.
  """
  return {
      # Scale the input features for normalization
      'NormalizedC': tft.scale_to_0_1(inputs['TotalC']),
      'NormalizedH': tft.scale_to_0_1(inputs['TotalH']),
      'NormalizedO': tft.scale_to_0_1(inputs['TotalO']),
      'NormalizedN': tft.scale_to_0_1(inputs['TotalN']),
      # Do not scale the label since we want the absolute number for prediction
      'Energy': inputs['Energy'],
  }
# [END dataflow_molecules_normalize_inputs]
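A hedged sketch of how a preprocessing function like this is typically wired into a tf.Transform Beam pipeline with AnalyzeAndTransformDataset; the feature spec, temp dir, and sample row are illustrative and simplified from the molecules example.

import apache_beam as beam
import tensorflow as tf
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils

_FEATURE_SPEC = {
    name: tf.io.FixedLenFeature([], tf.float32)
    for name in ['TotalC', 'TotalH', 'TotalO', 'TotalN', 'Energy']
}
raw_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(_FEATURE_SPEC))

with beam.Pipeline() as p, tft_beam.Context(temp_dir='/tmp/tft'):
  raw_data = p | beam.Create([
      {'TotalC': 6.0, 'TotalH': 12.0, 'TotalO': 6.0, 'TotalN': 0.0,
       'Energy': -275.0}])  # One illustrative row.
  (transformed_data, transformed_metadata), transform_fn = (
      (raw_data, raw_metadata)
      | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
          normalize_inputs))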
Example 14: _filter_class_attributes
# Required module: import apache_beam [as alias]
# Alternatively: from apache_beam import PTransform [as alias]

def _filter_class_attributes(path, parent, children):
  """Filter out class attributes that are part of the PTransform API."""
  del path
  skip_class_attributes = {
      "expand", "label", "from_runner_api", "register_urn", "side_inputs"
  }
  if inspect.isclass(parent):
    children = [(name, child)
                for (name, child) in children
                if name not in skip_class_attributes]
  return children
Example 15: main
# Required module: import apache_beam [as alias]
# Alternatively: from apache_beam import PTransform [as alias]

def main(args):
  if args[1:]:
    raise ValueError("Unrecognized Command line args", args[1:])

  for obj in supress_docs_for:
    doc_controls.do_not_generate_docs(obj)

  for name, value in inspect.getmembers(tfdv):
    if inspect.ismodule(value):
      doc_controls.do_not_generate_docs(value)

  for name, value in inspect.getmembers(beam.PTransform):
    # This ensures that the methods of PTransform are not documented in any
    # derived classes.
    if name == "__init__":
      continue
    try:
      doc_controls.do_not_doc_inheritable(value)
    except (TypeError, AttributeError):
      pass

  doc_generator = generate_lib.DocGenerator(
      root_title="TensorFlow Data Validation",
      py_modules=[("tfdv", tfdv)],
      code_url_prefix=FLAGS.code_url_prefix,
      search_hints=FLAGS.search_hints,
      site_path=FLAGS.site_path,
      # Use private_map to exclude doc locations by name if excluding by object
      # is insufficient.
      private_map={},
      # local_definitions_filter ensures that shared modules are only
      # documented in the location that defines them, instead of every location
      # that imports them.
      callbacks=[public_api.local_definitions_filter,
                 _filter_class_attributes])

  return doc_generator.build(output_dir=FLAGS.output_dir)