

Python apache_beam.Create Method Code Examples

This article collects typical usage examples of the Python apache_beam.Create method, drawn from open-source code. If you are wondering what apache_beam.Create does, how to call it, or how it is used in practice, the curated examples below should help. You can also browse the other apache_beam usage examples documented on this site.


The 15 code examples of the apache_beam.Create method below are sorted by popularity by default.
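Before the project-specific examples, here is a minimal, self-contained sketch of what beam.Create does: it materializes an in-memory Python iterable as a PCollection, which is why it appears both in tests and as the seed for ReadAll-style transforms throughout the examples below. (This snippet is ours, not taken from any of the listed projects.)

import apache_beam as beam

with beam.Pipeline() as pipeline:
  # Turn an in-memory list into a PCollection, then transform it.
  _ = (pipeline
       | 'Create' >> beam.Create([1, 2, 3])
       | 'Square' >> beam.Map(lambda x: x * x)
       | 'Print' >> beam.Map(print))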

Example 1: test_pipeline_read_all_file_pattern

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def test_pipeline_read_all_file_pattern(self):
    with temp_dir.TempDir() as tempdir:
      headers_1 = [self.lines[1], self.lines[-1]]
      headers_2 = [self.lines[2], self.lines[3], self.lines[-1]]
      headers_3 = [self.lines[4], self.lines[-1]]

      file_name_1 = tempdir.create_temp_file(suffix='.vcf', lines=headers_1)
      file_name_2 = tempdir.create_temp_file(suffix='.vcf', lines=headers_2)
      file_name_3 = tempdir.create_temp_file(suffix='.vcf', lines=headers_3)

      pipeline = TestPipeline()
      pcoll = (pipeline
               | 'Create' >> beam.Create(
                   [os.path.join(tempdir.get_path(), '*.vcf')])
               | 'ReadHeaders' >> ReadAllVcfHeaders())

      expected = [_get_vcf_header_from_lines(h, file_name=file_name)
                  for h, file_name in [(headers_1, file_name_1),
                                       (headers_2, file_name_2),
                                       (headers_3, file_name_3)]]
      assert_that(pcoll, asserts.header_vars_equal(expected))
      pipeline.run() 
Developer: googlegenomics, Project: gcp-variant-transforms, Lines: 24, Source: vcf_header_io_test.py

Example 2: _assert_pipeline_read_files_record_count_equal

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def _assert_pipeline_read_files_record_count_equal(
      self, input_pattern, expected_count, use_read_all=False):
    """Helper method for verifying total records read.

    Args:
      input_pattern (str): Input file pattern to read.
      expected_count (int): Expected number of records read.
      use_read_all (bool): Whether to use the scalable ReadAllFromVcf transform
        instead of ReadFromVcf.
    """
    pipeline = TestPipeline()
    if use_read_all:
      pcoll = (pipeline
               | 'Create' >> beam.Create([input_pattern])
               | 'Read' >> ReadAllFromVcf())
    else:
      pcoll = pipeline | 'Read' >> ReadFromVcf(input_pattern)
    assert_that(pcoll, asserts.count_equals_to(expected_count))
    pipeline.run() 
Developer: googlegenomics, Project: gcp-variant-transforms, Lines: 21, Source: vcfio_test.py

Example 3: test_pipeline_read_all_file_pattern

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def test_pipeline_read_all_file_pattern(self):
    with temp_dir.TempDir() as tempdir:
      lines_1 = self.headers[1:2] + self.headers[-1:] + self.records[:2]
      lines_2 = self.headers[2:4] + self.headers[-1:] + self.records[2:4]
      lines_3 = self.headers[4:5] + self.headers[-1:] + self.records[4:]
      file_name_1 = tempdir.create_temp_file(suffix='.vcf', lines=lines_1)
      file_name_2 = tempdir.create_temp_file(suffix='.vcf', lines=lines_2)
      file_name_3 = tempdir.create_temp_file(suffix='.vcf', lines=lines_3)

      pipeline = TestPipeline()
      pcoll = (pipeline
               | 'Create' >> beam.Create(
                   [os.path.join(tempdir.get_path(), '*.vcf')])
               | 'GetAllEstimates' >> GetAllEstimates())

      expected = [_get_estimate_from_lines(lines, file_name=file_name)
                  for lines, file_name in [(lines_1, file_name_1),
                                           (lines_2, file_name_2),
                                           (lines_3, file_name_3)]]
      assert_that(pcoll, asserts.header_vars_equal(expected))
      pipeline.run() 
Developer: googlegenomics, Project: gcp-variant-transforms, Lines: 25, Source: vcf_estimate_io_test.py

Example 4: read_headers

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def read_headers(
    pipeline,  # type: beam.Pipeline
    pipeline_mode,  # type: int
    all_patterns  # type: List[str]
    ):
  # type: (...) -> pvalue.PCollection
  """Creates an initial PCollection by reading the VCF file headers."""
  compression_type = get_compression_type(all_patterns)
  if pipeline_mode == PipelineModes.LARGE:
    headers = (pipeline
               | beam.Create(all_patterns)
               | vcf_header_io.ReadAllVcfHeaders(
                   compression_type=compression_type))
  else:
    headers = pipeline | vcf_header_io.ReadVcfHeaders(
        all_patterns[0],
        compression_type=compression_type)

  return headers 
Developer: googlegenomics, Project: gcp-variant-transforms, Lines: 21, Source: pipeline_common.py
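
The branch above illustrates a common Beam scaling idiom: ReadVcfHeaders expands a single pattern when the pipeline graph is built, while beam.Create(all_patterns) followed by a ReadAll transform defers file expansion to execution time and parallelizes across many patterns. A minimal sketch of the same idiom using Beam's built-in text IO (illustrative only; not part of pipeline_common.py):

import apache_beam as beam
from apache_beam.io import ReadAllFromText, ReadFromText

def read_lines(pipeline, patterns, large_mode):
  # Same shape as read_headers above: fan out over many file patterns
  # in large mode; read a single pattern directly otherwise.
  if large_mode:
    return (pipeline
            | beam.Create(patterns)
            | ReadAllFromText())
  return pipeline | ReadFromText(patterns[0])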

Example 5: add_annotation_headers

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def add_annotation_headers(pipeline, known_args, pipeline_mode,
                           merged_header,
                           annotated_vcf_pattern):
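  """Reads the annotated VCF's headers and merges them into merged_header."""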
  if pipeline_mode == PipelineModes.LARGE:
    annotation_headers = (pipeline
                          | 'ReadAnnotatedVCF'
                          >> beam.Create([annotated_vcf_pattern])
                          | 'ReadHeaders' >> vcf_header_io.ReadAllVcfHeaders())
  else:
    annotation_headers = (
        pipeline
        | 'ReadHeaders'
        >> vcf_header_io.ReadVcfHeaders(annotated_vcf_pattern))
  merged_header = (
      (merged_header, annotation_headers)
      | beam.Flatten()
      | 'MergeWithOriginalHeaders' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header 
Developer: googlegenomics, Project: gcp-variant-transforms, Lines: 22, Source: pipeline_common.py

Example 6: test_stats_pipeline_with_zero_examples

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def test_stats_pipeline_with_zero_examples(self):
    expected_result = text_format.Parse(
        """
        datasets {
          num_examples: 0
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
    with beam.Pipeline() as p:
      options = stats_options.StatsOptions(
          num_top_values=1,
          num_rank_histogram_buckets=1,
          num_values_histogram_buckets=2,
          num_histogram_buckets=1,
          num_quantiles_histogram_buckets=1,
          epsilon=0.001)
      result = (p | beam.Create([]) | stats_api.GenerateStatistics(options))
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, expected_result)) 
Developer: tensorflow, Project: data-validation, Lines: 22, Source: stats_api_test.py

Example 7: test_stats_pipeline_with_sample_rate

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def test_stats_pipeline_with_sample_rate(self):
    record_batches = [
        pa.RecordBatch.from_arrays(
            [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c']),
    ]

    with beam.Pipeline() as p:
      options = stats_options.StatsOptions(
          sample_rate=1.0,
          num_top_values=2,
          num_rank_histogram_buckets=2,
          num_values_histogram_buckets=2,
          num_histogram_buckets=2,
          num_quantiles_histogram_buckets=2,
          epsilon=0.001)
      result = (
          p | beam.Create(record_batches)
          | stats_api.GenerateStatistics(options))
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, self._sampling_test_expected_result)) 
Developer: tensorflow, Project: data-validation, Lines: 24, Source: stats_api_test.py

Example 8: test_write_stats_to_text

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def test_write_stats_to_text(self):
    stats = text_format.Parse(
        """
        datasets {
          name: 'x'
          num_examples: 100
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
    output_path = os.path.join(self._get_temp_dir(), 'stats')
    with beam.Pipeline() as p:
      _ = (p | beam.Create([stats]) | stats_api.WriteStatisticsToText(
          output_path))
    stats_from_file = statistics_pb2.DatasetFeatureStatisticsList()
    serialized_stats = io_util.read_file_to_string(
        output_path, binary_mode=True)
    stats_from_file.ParseFromString(serialized_stats)
    self.assertLen(stats_from_file.datasets, 1)
    test_util.assert_dataset_feature_stats_proto_equal(
        self, stats_from_file.datasets[0], stats.datasets[0]) 
Developer: tensorflow, Project: data-validation, Lines: 21, Source: stats_api_test.py

Example 9: test_stats_impl

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def test_stats_impl(self,
                      record_batches,
                      options,
                      expected_result_proto_text,
                      schema=None):
    expected_result = text_format.Parse(
        expected_result_proto_text,
        statistics_pb2.DatasetFeatureStatisticsList())
    if schema is not None:
      options.schema = schema
    with beam.Pipeline() as p:
      result = (
          p | beam.Create(record_batches, reshuffle=False)
          | stats_impl.GenerateStatisticsImpl(options))
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, expected_result)) 
Developer: tensorflow, Project: data-validation, Lines: 20, Source: stats_impl_test.py
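
A note on the reshuffle=False argument used here and in the next example: beam.Create reshuffles its elements by default so that downstream work spreads evenly across workers, which also changes how elements are bundled. Tests whose assertions are sensitive to bundling, such as these batch-oriented ones, commonly disable it; this reading is our interpretation, not stated in the test itself. A tiny sketch:

import apache_beam as beam

with beam.Pipeline() as p:
  # reshuffle=False keeps Create's output in its original bundling,
  # trading load balancing for deterministic bundles in tests.
  _ = (p
       | beam.Create(['a', 'b', 'c'], reshuffle=False)
       | beam.Map(print))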

Example 10: test_csv_decoder

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def test_csv_decoder(self,
                       input_lines,
                       expected_result,
                       column_names,
                       delimiter=',',
                       skip_blank_lines=True,
                       schema=None,
                       multivalent_columns=None,
                       secondary_delimiter=None):
    with beam.Pipeline() as p:
      result = (
          p | beam.Create(input_lines, reshuffle=False)
          | csv_decoder.DecodeCSV(
              column_names=column_names,
              delimiter=delimiter,
              skip_blank_lines=skip_blank_lines,
              schema=schema,
              multivalent_columns=multivalent_columns,
              secondary_delimiter=secondary_delimiter))
      util.assert_that(
          result,
          test_util.make_arrow_record_batches_equal_fn(self, expected_result)) 
Developer: tensorflow, Project: data-validation, Lines: 24, Source: csv_decoder_test.py

Example 11: testHandleBatchError

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def testHandleBatchError(self):
    if self._UseTFXIO():
      return

    def preprocessing_fn(inputs):
      return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

    metadata = tft_unit.metadata_from_feature_spec({
        'x': tf.io.FixedLenFeature([], tf.float32),
    })
    pipeline = self._makeTestPipeline()
    input_data = pipeline | 'CreateTrainingData' >> beam.Create([{
        'x': 1
    }, {
        'x': [4, 1]
    }])
    with beam_impl.Context(temp_dir=self.get_temp_dir()):
      _ = ((input_data, metadata)
           | 'AnalyzeDataset' >> beam_impl.AnalyzeDataset(preprocessing_fn))
    # Exception type depends on the runner being used. The misspelled
    # 'occured' in the pattern below matches the library's actual message.
    with self.assertRaisesRegexp(
        (RuntimeError, ValueError),
        'An error occured while trying to apply the transformation:'):
      pipeline.run() 
Developer: tensorflow, Project: transform, Lines: 26, Source: impl_test.py

Example 12: _clear_shared_state_after_barrier

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def _clear_shared_state_after_barrier(pipeline, input_barrier):
  """Clears any shared state from within a pipeline context.

  This will only be cleared once input_barrier becomes available.

  Args:
    pipeline: A `beam.Pipeline` object.
    input_barrier: A `PCollection` which the pipeline should wait for.

  Returns:
    An empty `PCollection`.
  """
  empty_pcoll = input_barrier | 'MakeCheapBarrier' >> beam.FlatMap(
      lambda x: None)
  return (pipeline
          | 'PrepareToClearSharedKeepAlives' >> beam.Create([None])
          | 'WaitAndClearSharedKeepAlives' >> beam.Map(
              lambda x, empty_side_input: shared.Shared().acquire(lambda: None),
              beam.pvalue.AsIter(empty_pcoll))) 
Developer: tensorflow, Project: transform, Lines: 21, Source: impl.py
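
The helper above relies on a side-input barrier: the Map over the one-element Create cannot run until its side input, and therefore input_barrier, has been fully computed. A stripped-down sketch of the same barrier idiom (the names are ours, not from impl.py; the FlatMap returns an empty list rather than None to make the empty output explicit):

import apache_beam as beam

def run_after(pipeline, barrier_pcoll, fn):
  """Calls fn once, but only after barrier_pcoll is fully computed."""
  # An empty PCollection that still depends on every barrier element.
  empty = barrier_pcoll | 'Drain' >> beam.FlatMap(lambda _: [])
  return (pipeline
          | 'Seed' >> beam.Create([None])
          | 'RunAfterBarrier' >> beam.Map(
              lambda _, unused_side: fn(), beam.pvalue.AsIter(empty)))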

Example 13: testWriteTransformFnIsIdempotent

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def testWriteTransformFnIsIdempotent(self):
    transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

    def mock_write_metadata_expand(unused_self, unused_metadata):
      raise ArithmeticError('Some error')

    with beam.Pipeline() as pipeline:
      # Create an empty directory for the source saved model dir.
      saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
      saved_model_dir_pcoll = (
          pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))

      with mock.patch.object(transform_fn_io.beam_metadata_io.WriteMetadata,
                             'expand', mock_write_metadata_expand):
        with self.assertRaisesRegexp(ArithmeticError, 'Some error'):
          _ = ((saved_model_dir_pcoll, object())
               | transform_fn_io.WriteTransformFn(transform_output_dir))

    self.assertFalse(file_io.file_exists(transform_output_dir)) 
Developer: tensorflow, Project: transform, Lines: 21, Source: transform_fn_io_test.py

Example 14: testTwoLangs

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def testTwoLangs(self):
    with TestPipeline() as p:
      tokens = p | 'CreateInput' >> beam.Create(self.sample_input)
      result = tokens | beam.ParDo(utils.CompileTokenizationInfo())
      assert_that(result, equal_to([{
          'lang': 'en',
          'count': 1,
          'num_preserved_chars': 13,
          'num_dropped_chars': 2,
          'num_non_unk_wordpieces': 4,
          'preserved_ratio': [13/4],
          'dropped_ratio': [2/15],
          'wordpieces': collections.Counter(['the', 'app', '##le', 'sauce'])
      }, {
          'lang': 'fr',
          'count': 1,
          'num_preserved_chars': 14,
          'num_dropped_chars': 0,
          'num_non_unk_wordpieces': 5,
          'preserved_ratio': [14/5],
          'dropped_ratio': [0],
          'wordpieces': collections.Counter(['bon', '##jour', 'bon', '##soir'])
      }])) 
Developer: tensorflow, Project: text, Lines: 25, Source: utils_test.py

Example 15: _create_row

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def _create_row(stats, now, extra_columns=tuple()):
  """Create a BigQuery row from the given stats."""
  row = {'true_positives': stats.true_positives,
         'false_positives': stats.false_positives,
         'false_negatives': stats.false_negatives}
  if not math.isnan(stats.precision):
    row['precision'] = stats.precision
  if not math.isnan(stats.recall):
    row['recall'] = stats.recall
  if not math.isnan(stats.f_score):
    row['f_score'] = stats.f_score

  row['timestamp'] = now

  for column_name, val in extra_columns:
    row[column_name] = val

  return row 
Developer: GoogleCloudPlatform, Project: healthcare-deid, Lines: 20, Source: run_pipeline_lib.py
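
For context, _create_row only needs an object exposing the six stat attributes it reads; a hedged usage sketch, with a namedtuple standing in for whatever stats type the pipeline actually uses:

import collections

# Hypothetical stand-in for the pipeline's real stats object.
Stats = collections.namedtuple(
    'Stats', ['true_positives', 'false_positives', 'false_negatives',
              'precision', 'recall', 'f_score'])

row = _create_row(Stats(10, 2, 1, 0.83, 0.91, 0.87),
                  now='2020-01-01T00:00:00Z',
                  extra_columns=[('note', 'demo')])
# row now maps each stat name to its value, plus 'timestamp' and 'note'.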


Note: The apache_beam.Create examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets come from open-source projects contributed by their respective authors, and copyright remains with them; consult each project's license before redistributing, and do not republish without permission.