

Python apache_beam.Pipeline Method Code Examples

This article collects typical usage examples of the apache_beam.Pipeline method in Python. If you are wondering how apache_beam.Pipeline is used in practice, the curated code examples below may help. You can also explore further usage examples from the apache_beam module.


The following presents 15 code examples of the apache_beam.Pipeline method, sorted by popularity by default.

Example 1: read_headers

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Pipeline [as alias]
def read_headers(
    pipeline,  # type: beam.Pipeline
    pipeline_mode,  # type: int
    all_patterns  # type: List[str]
    ):
  # type: (...) -> pvalue.PCollection
  """Creates an initial PCollection by reading the VCF file headers."""
  compression_type = get_compression_type(all_patterns)
  if pipeline_mode == PipelineModes.LARGE:
    headers = (pipeline
               | beam.Create(all_patterns)
               | vcf_header_io.ReadAllVcfHeaders(
                   compression_type=compression_type))
  else:
    headers = pipeline | vcf_header_io.ReadVcfHeaders(
        all_patterns[0],
        compression_type=compression_type)

  return headers 
Developer: googlegenomics | Project: gcp-variant-transforms | Lines: 21 | Source: pipeline_common.py
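
The branching between a single direct read and a Create-plus-ReadAll fan-out is a general Beam idiom, not specific to VCF. A minimal sketch of the same pattern using Beam's built-in text IO (ReadFromText and ReadAllFromText are real Beam transforms; the large_mode flag here is an illustrative stand-in for the PipelineModes.LARGE check):

import apache_beam as beam
from apache_beam.io.textio import ReadAllFromText, ReadFromText

def read_lines(pipeline, all_patterns, large_mode):
  if large_mode:
    # Many patterns: materialize them as elements and let workers
    # expand and read them in parallel via ReadAllFromText.
    return (pipeline
            | beam.Create(all_patterns)
            | ReadAllFromText())
  # A single pattern: read it directly at pipeline-construction time.
  return pipeline | ReadFromText(all_patterns[0])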

Example 2: _read_variants

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Pipeline [as alias]
def _read_variants(all_patterns,  # type: List[str]
                   pipeline,  # type: beam.Pipeline
                   known_args,  # type: argparse.Namespace
                   pipeline_mode,  # type: int
                   pre_infer_headers=False,  # type: bool
                   keep_raw_sample_names=False
                  ):
  # type: (...) -> pvalue.PCollection
  """Helper method for returning a PCollection of Variants from VCFs."""
  representative_header_lines = None
  if known_args.representative_header_file:
    representative_header_lines = vcf_header_parser.get_metadata_header_lines(
        known_args.representative_header_file)
  return pipeline_common.read_variants(
      pipeline,
      all_patterns,
      pipeline_mode,
      known_args.allow_malformed_records,
      representative_header_lines,
      pre_infer_headers=pre_infer_headers,
      sample_name_encoding=(
          SampleNameEncoding.NONE if keep_raw_sample_names else
          SampleNameEncoding[known_args.sample_name_encoding]),
      use_1_based_coordinate=known_args.use_1_based_coordinate) 
Developer: googlegenomics | Project: gcp-variant-transforms | Lines: 26 | Source: vcf_to_bq.py

Example 3: GetPipelineRoot

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Pipeline [as alias]
def GetPipelineRoot(options=None):
  """Return the root of the beam pipeline.

  Typical usage looks like:

    with GetPipelineRoot() as root:
      _ = (root | beam.ParDo() | ...)

  In this example, the pipeline is automatically executed when the context is
  exited, though one can manually run the pipeline built from the root object as
  well.

  Args:
    options: A beam.options.pipeline_options.PipelineOptions object.

  Returns:
    A beam.Pipeline root object.
  """
  return beam.Pipeline(options=options) 
Developer: tensorflow | Project: lingvo | Lines: 21 | Source: beam_utils.py
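
A hypothetical use of GetPipelineRoot, assuming a local DirectRunner; since beam.Pipeline is a context manager, the pipeline executes automatically when the with block exits:

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions(['--runner=DirectRunner'])
with GetPipelineRoot(options=options) as root:
  # Runs on context exit: squares each element and prints it.
  _ = (root
       | beam.Create([1, 2, 3])
       | beam.Map(lambda x: x * x)
       | beam.Map(print))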

Example 4: test_stats_pipeline_with_zero_examples

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Pipeline [as alias]
def test_stats_pipeline_with_zero_examples(self):
    expected_result = text_format.Parse(
        """
        datasets {
          num_examples: 0
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
    with beam.Pipeline() as p:
      options = stats_options.StatsOptions(
          num_top_values=1,
          num_rank_histogram_buckets=1,
          num_values_histogram_buckets=2,
          num_histogram_buckets=1,
          num_quantiles_histogram_buckets=1,
          epsilon=0.001)
      result = (p | beam.Create([]) | stats_api.GenerateStatistics(options))
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, expected_result)) 
Developer: tensorflow | Project: data-validation | Lines: 22 | Source: stats_api_test.py
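
The "with beam.Pipeline() as p" plus assert_that idiom used here (and in the following test examples) comes from Beam's own testing utilities and works on any pipeline, independent of TensorFlow Data Validation. A minimal self-contained sketch:

import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to

with beam.Pipeline() as p:
  # assert_that attaches a verification transform; it fails the
  # pipeline at run time (on context exit) if the PCollection's
  # contents don't match the expectation.
  result = p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x + 1)
  assert_that(result, equal_to([2, 3, 4]))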

Example 5: test_stats_pipeline_with_sample_rate

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Pipeline [as alias]
def test_stats_pipeline_with_sample_rate(self):
    record_batches = [
        pa.RecordBatch.from_arrays(
            [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c']),
    ]

    with beam.Pipeline() as p:
      options = stats_options.StatsOptions(
          sample_rate=1.0,
          num_top_values=2,
          num_rank_histogram_buckets=2,
          num_values_histogram_buckets=2,
          num_histogram_buckets=2,
          num_quantiles_histogram_buckets=2,
          epsilon=0.001)
      result = (
          p | beam.Create(record_batches)
          | stats_api.GenerateStatistics(options))
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, self._sampling_test_expected_result)) 
Developer: tensorflow | Project: data-validation | Lines: 24 | Source: stats_api_test.py

Example 6: test_write_stats_to_text

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Pipeline [as alias]
def test_write_stats_to_text(self):
    stats = text_format.Parse(
        """
        datasets {
          name: 'x'
          num_examples: 100
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
    output_path = os.path.join(self._get_temp_dir(), 'stats')
    with beam.Pipeline() as p:
      _ = (p | beam.Create([stats]) | stats_api.WriteStatisticsToText(
          output_path))
    stats_from_file = statistics_pb2.DatasetFeatureStatisticsList()
    serialized_stats = io_util.read_file_to_string(
        output_path, binary_mode=True)
    stats_from_file.ParseFromString(serialized_stats)
    self.assertLen(stats_from_file.datasets, 1)
    test_util.assert_dataset_feature_stats_proto_equal(
        self, stats_from_file.datasets[0], stats.datasets[0]) 
Developer: tensorflow | Project: data-validation | Lines: 21 | Source: stats_api_test.py

Example 7: test_stats_impl

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Pipeline [as alias]
def test_stats_impl(self,
                      record_batches,
                      options,
                      expected_result_proto_text,
                      schema=None):
    expected_result = text_format.Parse(
        expected_result_proto_text,
        statistics_pb2.DatasetFeatureStatisticsList())
    if schema is not None:
      options.schema = schema
    with beam.Pipeline() as p:
      result = (
          p | beam.Create(record_batches, reshuffle=False)
          | stats_impl.GenerateStatisticsImpl(options))
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, expected_result)) 
Developer: tensorflow | Project: data-validation | Lines: 20 | Source: stats_impl_test.py

Example 8: test_csv_decoder

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Pipeline [as alias]
def test_csv_decoder(self,
                       input_lines,
                       expected_result,
                       column_names,
                       delimiter=',',
                       skip_blank_lines=True,
                       schema=None,
                       multivalent_columns=None,
                       secondary_delimiter=None):
    with beam.Pipeline() as p:
      result = (
          p | beam.Create(input_lines, reshuffle=False)
          | csv_decoder.DecodeCSV(
              column_names=column_names,
              delimiter=delimiter,
              skip_blank_lines=skip_blank_lines,
              schema=schema,
              multivalent_columns=multivalent_columns,
              secondary_delimiter=secondary_delimiter))
      util.assert_that(
          result,
          test_util.make_arrow_record_batches_equal_fn(self, expected_result)) 
Developer: tensorflow | Project: data-validation | Lines: 24 | Source: csv_decoder_test.py

Example 9: preprocess

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Pipeline [as alias]
def preprocess(train_dataset, output_dir, eval_dataset, checkpoint):
    """Preprocess data locally."""

    import apache_beam as beam
    from google.datalab.utils import LambdaJob
    from . import _preprocess

    if checkpoint is None:
      checkpoint = _util._DEFAULT_CHECKPOINT_GSURL
    job_id = ('preprocess-image-classification-' +
              datetime.datetime.now().strftime('%y%m%d-%H%M%S'))
    # Project is needed for bigquery data source, even in local run.
    options = {
        'project': _util.default_project(),
    }
    opts = beam.pipeline.PipelineOptions(flags=[], **options)
    p = beam.Pipeline('DirectRunner', options=opts)
    _preprocess.configure_pipeline(p, train_dataset, eval_dataset, checkpoint, output_dir, job_id)
    job = LambdaJob(lambda: p.run().wait_until_finish(), job_id)
    return job 
Developer: googledatalab | Project: pydatalab | Lines: 22 | Source: _local.py
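
The construction pattern above (runner name passed positionally, options built from keyword arguments, an explicit run-and-wait instead of a context manager) reduces to this minimal local sketch; 'my-project' is a placeholder value:

import apache_beam as beam

# Keyword arguments to PipelineOptions become pipeline options.
opts = beam.pipeline.PipelineOptions(flags=[], project='my-project')
p = beam.Pipeline('DirectRunner', options=opts)
_ = p | beam.Create(['a', 'b']) | beam.Map(print)
# Run explicitly and block until the pipeline finishes.
p.run().wait_until_finish()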

Example 10: batch_predict

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Pipeline [as alias]
def batch_predict(dataset, model_dir, output_csv, output_bq_table):
    """Batch predict running locally."""

    import apache_beam as beam
    from google.datalab.utils import LambdaJob
    from . import _predictor

    if output_csv is None and output_bq_table is None:
      raise ValueError('output_csv and output_bq_table cannot both be None.')

    job_id = ('batch-predict-image-classification-' +
              datetime.datetime.now().strftime('%y%m%d-%H%M%S'))

    # Project is needed for bigquery data source, even in local run.
    options = {
        'project': _util.default_project(),
    }
    opts = beam.pipeline.PipelineOptions(flags=[], **options)
    p = beam.Pipeline('DirectRunner', options=opts)
    _predictor.configure_pipeline(p, dataset, model_dir, output_csv, output_bq_table)
    job = LambdaJob(lambda: p.run().wait_until_finish(), job_id)
    return job 
Developer: googledatalab | Project: pydatalab | Lines: 24 | Source: _local.py

Example 11: create_glyphazzn_dataset

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Pipeline [as alias]
def create_glyphazzn_dataset(filepattern, output_path):
  """Creates a glyphazzn dataset, from raw Parquetio to TFRecords."""
  def pipeline(root):
    """Pipeline for creating glyphazzn dataset."""
    attrs = ['uni', 'width', 'vwidth', 'sfd', 'id', 'binary_fp']

    examples = root | 'Read' >> beam.io.parquetio.ReadFromParquet(
        file_pattern=filepattern, columns=attrs)

    examples = examples | 'FilterBadIcons' >> beam.Filter(_is_valid_glyph)
    examples = examples | 'ConvertToPath' >> beam.Map(_convert_to_path)
    examples = examples | 'FilterBadPathLengths' >> beam.Filter(_is_valid_path)
    examples = examples | 'ProcessAndConvert' >> beam.Map(_create_example)
    (examples | 'WriteToTFRecord' >> beam.io.tfrecordio.WriteToTFRecord(
        output_path, num_shards=90))
  return pipeline 
Developer: magenta | Project: magenta | Lines: 18 | Source: datagen_beam.py
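
Since create_glyphazzn_dataset returns a function over a pipeline root rather than running anything itself, a caller would apply it under a pipeline context. A hypothetical invocation (the paths are placeholders):

import apache_beam as beam

pipeline_fn = create_glyphazzn_dataset(
    'gs://my-bucket/glyphs/*.parquet',  # placeholder input pattern
    'gs://my-bucket/output/records')    # placeholder output path
with beam.Pipeline() as root:
  # The returned closure wires its transforms onto this root; the
  # pipeline runs when the `with` block exits.
  pipeline_fn(root)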

Example 12: convert_to_tfxio_api_inputs

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Pipeline [as alias]
def convert_to_tfxio_api_inputs(
      self, legacy_input_data, legacy_input_metadata, label='input_data'):
    """Converts from the legacy TFT API inputs to TFXIO-based inputs.

    Args:
      legacy_input_data: a PCollection of instance dicts.
      legacy_input_metadata: a tft.DatasetMetadata.
      label: label for the PTransform that translates `legacy_input_data` into
        the TFXIO input data. Set to different values if this method is called
        multiple times in a beam Pipeline.
    Returns:
      A tuple of a PCollection of `pyarrow.RecordBatch` and a
      `tensor_adapter.TensorAdapterConfig`. This tuple can be fed directly to
      TFT's `{Analyze,Transform,AnalyzeAndTransform}Dataset` APIs.
    """
    tfxio_impl = _LegacyCompatibilityTFXIO(legacy_input_metadata.schema)
    input_data = (
        legacy_input_data |
        ('LegacyFormatToTfxio[%s]' % label >> tfxio_impl.BeamSource(
            beam_impl.Context.get_desired_batch_size())))
    return input_data, tfxio_impl.TensorAdapterConfig() 
Developer: tensorflow | Project: transform | Lines: 23 | Source: tft_unit.py

Example 13: _clear_shared_state_after_barrier

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Pipeline [as alias]
def _clear_shared_state_after_barrier(pipeline, input_barrier):
  """Clears any shared state from within a pipeline context.

  This will only be cleared once input_barrier becomes available.

  Args:
    pipeline: A `beam.Pipeline` object.
    input_barrier: A `PCollection` which the pipeline should wait for.

  Returns:
    An empty `PCollection`.
  """
  empty_pcoll = input_barrier | 'MakeCheapBarrier' >> beam.FlatMap(
      lambda x: None)
  return (pipeline
          | 'PrepareToClearSharedKeepAlives' >> beam.Create([None])
          | 'WaitAndClearSharedKeepAlives' >> beam.Map(
              lambda x, empty_side_input: shared.Shared().acquire(lambda: None),
              beam.pvalue.AsIter(empty_pcoll))) 
Developer: tensorflow | Project: transform | Lines: 21 | Source: impl.py
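
The barrier trick relies on Beam side-input semantics: a transform that takes a PCollection as an AsIter side input cannot start until that PCollection is fully computed. A stripped-down sketch of the same mechanism:

import apache_beam as beam

with beam.Pipeline() as p:
  # FlatMap returning None emits nothing, yielding an empty PCollection.
  barrier = (p
             | 'Upstream' >> beam.Create([1, 2, 3])
             | 'MakeBarrier' >> beam.FlatMap(lambda x: None))
  # The side input forces 'Main' to wait for `barrier` to complete.
  _ = (p
       | 'Main' >> beam.Create([None])
       | beam.Map(lambda x, _side: 'ran after barrier',
                  beam.pvalue.AsIter(barrier))
       | beam.Map(print))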

Example 14: testReadTransformFn

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Pipeline [as alias]
def testReadTransformFn(self):
    path = self.get_temp_dir()
    # NOTE: we don't need to create or write to the transform_fn directory since
    # ReadTransformFn never inspects this directory.
    transform_fn_dir = os.path.join(
        path, tft.TFTransformOutput.TRANSFORM_FN_DIR)
    transformed_metadata_dir = os.path.join(
        path, tft.TFTransformOutput.TRANSFORMED_METADATA_DIR)
    metadata_io.write_metadata(test_metadata.COMPLETE_METADATA,
                               transformed_metadata_dir)

    with beam.Pipeline() as pipeline:
      saved_model_dir_pcoll, metadata = (
          pipeline | transform_fn_io.ReadTransformFn(path))
      beam_test_util.assert_that(
          saved_model_dir_pcoll,
          beam_test_util.equal_to([transform_fn_dir]),
          label='AssertSavedModelDir')
      # NOTE: metadata is currently read in a non-deferred manner.
      self.assertEqual(metadata, test_metadata.COMPLETE_METADATA) 
Developer: tensorflow | Project: transform | Lines: 22 | Source: transform_fn_io_test.py

Example 15: testWriteTransformFnIsIdempotent

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Pipeline [as alias]
def testWriteTransformFnIsIdempotent(self):
    transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

    def mock_write_metadata_expand(unused_self, unused_metadata):
      raise ArithmeticError('Some error')

    with beam.Pipeline() as pipeline:
      # Create an empty directory for the source saved model dir.
      saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
      saved_model_dir_pcoll = (
          pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))

      with mock.patch.object(transform_fn_io.beam_metadata_io.WriteMetadata,
                             'expand', mock_write_metadata_expand):
        with self.assertRaisesRegexp(ArithmeticError, 'Some error'):
          _ = ((saved_model_dir_pcoll, object())
               | transform_fn_io.WriteTransformFn(transform_output_dir))

    self.assertFalse(file_io.file_exists(transform_output_dir)) 
Developer: tensorflow | Project: transform | Lines: 21 | Source: transform_fn_io_test.py


Note: The apache_beam.Pipeline examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by many developers, and copyright in the source code remains with the original authors. For redistribution and use, please refer to the corresponding project's license; do not republish without permission.