This article collects typical usage examples of the Python method apache_beam.ParDo. If you are wondering exactly what apache_beam.ParDo does or how to use it, the curated code examples below may help. You can also explore further usage examples from the apache_beam module itself.
The following presents 15 code examples of the apache_beam.ParDo method, sorted by popularity by default.
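All of the examples share the same core pattern: a beam.DoFn subclass whose process() method yields zero or more outputs per input element, applied to a PCollection with beam.ParDo. As a minimal, self-contained sketch of that pattern (the DoFn and the element values here are illustrative only, not taken from any of the projects cited below):

import apache_beam as beam

class SplitWordsFn(beam.DoFn):
  """Splits each input line into individual words."""

  def process(self, element):
    # process() may yield zero or more outputs per input element.
    for word in element.split():
      yield word

with beam.Pipeline() as p:
  _ = (p
       | 'Create' >> beam.Create(['hello world', 'apache beam'])
       | 'SplitWords' >> beam.ParDo(SplitWordsFn())
       | 'Print' >> beam.Map(print))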
Example 1: expand
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def expand(self, pcoll):
  return (pcoll
          | 'InferHeaderFields' >> beam.ParDo(
              _InferHeaderFields(self._infer_headers,
                                 self._annotation_fields_to_infer),
              self._defined_headers)
          # TODO(nmousavi): Modify the MergeHeaders to resolve 1 vs '.'
          # mismatch for headers extracted from variants.
          #
          # Note: argument `split_alternate_allele_info_fields` is not
          # relevant here since no fields with `Number=A` will be extracted
          # from variants, therefore we let the default value (True) for it
          # be used. Should this change, we should modify the default value.
          | 'MergeHeaders' >> merge_headers.MergeHeaders(
              split_alternate_allele_info_fields=True,
              allow_incompatible_records=(
                  self._allow_incompatible_records or
                  bool(self._annotation_fields_to_infer))))
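_InferHeaderFields is specific to the project above, but the snippet illustrates a general ParDo feature: extra arguments passed to beam.ParDo after the DoFn instance (here self._defined_headers) are forwarded to process() as additional arguments. A hedged sketch of that mechanism, with hypothetical names:

import apache_beam as beam

class DropDefinedHeadersFn(beam.DoFn):
  """Hypothetical DoFn: keeps only headers that are not already defined."""

  def process(self, inferred_header, defined_headers):
    # `defined_headers` is bound to the second argument given to beam.ParDo.
    if inferred_header not in (defined_headers or []):
      yield inferred_header

# inferred | beam.ParDo(DropDefinedHeadersFn(), ['CHROM', 'POS'])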
Example 2: test_convert_variant_to_bigquery_row
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def test_convert_variant_to_bigquery_row(self):
  variant_1, row_1, header_num_dict_1 = self._get_sample_variant_1()
  variant_2, row_2, header_num_dict_2 = self._get_sample_variant_2()
  variant_3, row_3, header_num_dict_3 = self._get_sample_variant_3()
  header_num_dict = header_num_dict_1.copy()
  header_num_dict.update(header_num_dict_2)
  header_num_dict.update(header_num_dict_3)
  header_fields = vcf_header_util.make_header(header_num_dict)
  proc_var_1 = processed_variant.ProcessedVariantFactory(
      header_fields).create_processed_variant(variant_1)
  proc_var_2 = processed_variant.ProcessedVariantFactory(
      header_fields).create_processed_variant(variant_2)
  proc_var_3 = processed_variant.ProcessedVariantFactory(
      header_fields).create_processed_variant(variant_3)
  pipeline = TestPipeline(blocking=True)
  bigquery_rows = (
      pipeline
      | Create([proc_var_1, proc_var_2, proc_var_3])
      | 'ConvertToRow' >> beam.ParDo(ConvertVariantToRow(
          self._row_generator)))
  assert_that(bigquery_rows, equal_to([row_1, row_2, row_3]))
  pipeline.run()
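This test relies on Beam's standard testing utilities; assuming the usual import paths, they come from:

from apache_beam import Create
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

assert_that attaches a pipeline-level assertion, so the expected rows are only checked once pipeline.run() executes.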
Example 3: configure_pipeline
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def configure_pipeline(p, opt):
  """Specify PCollection and transformations in pipeline."""
  read_input_source = beam.io.ReadFromText(
      opt.input_path, strip_trailing_newlines=True)
  read_label_source = beam.io.ReadFromText(
      opt.input_dict, strip_trailing_newlines=True)
  labels = (p | 'Read dictionary' >> read_label_source)
  _ = (p
       | 'Read input' >> read_input_source
       # Parse one CSV row per input line.
       | 'Parse input' >> beam.Map(lambda line: next(csv.reader([line])))
       | 'Extract label ids' >> beam.ParDo(ExtractLabelIdsDoFn(),
                                           beam.pvalue.AsIter(labels))
       | 'Read and convert to JPEG'
       >> beam.ParDo(ReadImageAndConvertToJpegDoFn())
       | 'Embed and make TFExample' >> beam.ParDo(TFExampleFromImageDoFn())
       # TODO(b/35133536): Get rid of this Map and instead use
       # coder=beam.coders.ProtoCoder(tf.train.Example) in WriteToTFRecord
       # below.
       | 'SerializeToString' >> beam.Map(lambda x: x.SerializeToString())
       | 'Save to disk'
       >> beam.io.WriteToTFRecord(opt.output_path,
                                  file_name_suffix='.tfrecord.gz'))
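The 'Extract label ids' step passes the labels PCollection as a side input via beam.pvalue.AsIter, which arrives in the DoFn as an extra iterable argument. ExtractLabelIdsDoFn itself is not shown above; a hypothetical stand-in illustrating the shape of such a DoFn:

import apache_beam as beam

class ExtractLabelIdsSketchDoFn(beam.DoFn):
  """Hypothetical stand-in: maps a CSV row's label name to a numeric id."""

  def process(self, row, all_labels):
    # `all_labels` is the materialized iterable from beam.pvalue.AsIter(labels).
    label_to_id = {label: i for i, label in enumerate(sorted(all_labels))}
    uri, label = row[0], row[1]
    yield uri, label_to_id.get(label)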
Example 4: run
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def run(p, input_path, output_directory, train_fraction=0.8):
  """Runs the pipeline."""
  raw_data = (p | "ReadTrainData" >> beam.io.Read(CsvFileSource(
      input_path, column_names=constants.CSV_COLUMNS)))
  train_data, eval_data = split_data(raw_data, train_fraction)
  (train_data | "PrepareCSV_train" >> beam.ParDo(
      ConvertDictToCSV(ordered_fieldnames=constants.CSV_COLUMNS))
   | "Write_train" >> beam.io.WriteToText(
       os.path.join(output_directory, "output_data", "train"),
       file_name_suffix=".csv"))
  (eval_data | "PrepareCSV_eval" >> beam.ParDo(
      ConvertDictToCSV(ordered_fieldnames=constants.CSV_COLUMNS))
   | "Write_eval" >> beam.io.WriteToText(
       os.path.join(output_directory, "output_data", "eval"),
       file_name_suffix=".csv"))
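split_data is defined elsewhere in that project; a rough sketch of one way such a helper could split a PCollection by fraction (purely illustrative, using a simple non-deterministic random partition):

import random
import apache_beam as beam

def split_data_sketch(examples, train_fraction):
  """Hypothetical helper: randomly partitions a PCollection into train/eval."""
  # beam.Partition routes each element to the partition whose index the
  # function returns: 0 -> train (probability train_fraction), 1 -> eval.
  train, eval_ = (
      examples
      | 'TrainEvalPartition' >> beam.Partition(
          lambda example, n: int(random.random() >= train_fraction), 2))
  return train, eval_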
Example 5: main
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def main(_):
  beam_utils.BeamInit()
  if not FLAGS.output_file_pattern:
    raise ValueError('Must provide an output_file_pattern')
  reader = beam.io.ReadFromTFRecord(
      FLAGS.input_file_pattern, coder=beam.coders.ProtoCoder(tf.train.Example))
  model_name = FLAGS.model_name
  split = FLAGS.split
  run_preprocessors = FLAGS.run_preprocessors
  with beam_utils.GetPipelineRoot() as root:
    _ = (
        root
        | 'Read' >> reader
        | 'ToTFExample' >> beam.ParDo(
            _ProcessShard(model_name, split, run_preprocessors))
        | 'Reshuffle' >> beam.Reshuffle()
        | 'Write' >> beam.io.WriteToTFRecord(
            FLAGS.output_file_pattern,
            coder=beam.coders.ProtoCoder(tf.train.Example)))
Example 6: GetPipelineRoot
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def GetPipelineRoot(options=None):
  """Return the root of the beam pipeline.

  Typical usage looks like:

    with GetPipelineRoot() as root:
      _ = (root | beam.ParDo() | ...)

  In this example, the pipeline is automatically executed when the context is
  exited, though one can manually run the pipeline built from the root object
  as well.

  Args:
    options: A beam.options.pipeline_options.PipelineOptions object.

  Returns:
    A beam.Pipeline root object.
  """
  return beam.Pipeline(options=options)
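Since GetPipelineRoot simply returns a beam.Pipeline, the context-manager usage from its docstring can be combined with explicit pipeline options. A small hedged sketch of what a caller might look like:

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions(['--runner=DirectRunner'])
with GetPipelineRoot(options=options) as root:
  _ = (root
       | 'Create' >> beam.Create(range(10))
       | 'Double' >> beam.Map(lambda x: x * 2))
# The pipeline runs automatically when the `with` block exits.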
Example 7: BatchSerializedExamplesToArrowRecordBatches
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def BatchSerializedExamplesToArrowRecordBatches(
    examples: beam.pvalue.PCollection,
    desired_batch_size: Optional[int] = constants
    .DEFAULT_DESIRED_INPUT_BATCH_SIZE
) -> beam.pvalue.PCollection:
  """Batches serialized examples into Arrow record batches.

  Args:
    examples: A PCollection of serialized tf.Examples.
    desired_batch_size: Batch size. The output Arrow record batches will have
      as many rows as the `desired_batch_size`.

  Returns:
    A PCollection of Arrow record batches.
  """
  return (examples
          | "BatchSerializedExamples" >> beam.BatchElements(
              **batch_util.GetBatchElementsKwargs(desired_batch_size))
          | "BatchDecodeExamples" >> beam.ParDo(_BatchDecodeExamplesDoFn()))
Example 8: make_prediction_pipeline
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def make_prediction_pipeline(pipeline, args):
  """Builds the prediction pipeline.

  Reads the csv files, prepends a ',' if the target column is missing, runs
  prediction, and then prints the formatted results to a file.

  Args:
    pipeline: the pipeline
    args: command line args
  """
  # DF bug: DF does not work with unicode strings
  predicted_values, errors = (
      pipeline |
      'Read CSV Files' >>
      beam.io.ReadFromText(str(args.predict_data),
                           strip_trailing_newlines=True) |
      'Batch Input' >>
      beam.ParDo(EmitAsBatchDoFn(args.batch_size)) |
      'Run TF Graph on Batches' >>
      beam.ParDo(RunGraphDoFn(args.trained_model_dir)).with_outputs(
          'errors', main='main'))
  ((predicted_values, errors) |
   'Format and Save' >>
   FormatAndSave(args))
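The 'Run TF Graph on Batches' step uses .with_outputs('errors', main='main'), so RunGraphDoFn must emit failures to a tagged output. A hedged sketch of that multi-output pattern, using a hypothetical DoFn:

import apache_beam as beam
from apache_beam import pvalue

class SafeParseDoFn(beam.DoFn):
  """Hypothetical DoFn: good results go to 'main', failures to 'errors'."""

  def process(self, line):
    try:
      yield float(line)  # untagged outputs go to the main output
    except ValueError:
      yield pvalue.TaggedOutput('errors', line)

# results = lines | beam.ParDo(SafeParseDoFn()).with_outputs('errors', main='main')
# values, errors = results['main'], results['errors']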
Example 9: expand
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def expand(self, inputs):
  saved_model_dir_pcol, input_values_pcol = inputs
  # We don't deep_copy pcollections used for the first phase, or when
  # the user defined `Context` disables it.
  if self._phase > 0 and Context.get_use_deep_copy_optimization():
    # Obviates unnecessary data materialization when the input data source is
    # safe to read more than once.
    tf.compat.v1.logging.info('Deep copying inputs for phase: %d',
                              self._phase)
    input_values_pcol = deep_copy.deep_copy(input_values_pcol)
  if not self._use_tfxio:
    input_values_pcol |= 'BatchInputs' >> _BatchElements()
  return (input_values_pcol | 'ApplySavedModel' >> beam.ParDo(
      _RunMetaGraphDoFn(
          self._tf_config,
          use_tfxio=self._use_tfxio,
          input_schema=self._input_schema,
          input_tensor_adapter_config=self._input_tensor_adapter_config,
          shared_graph_state_handle=shared.Shared(),
          passthrough_keys=Context.get_passthrough_keys()),
      saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir_pcol)))
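Note how the saved model directory is supplied as a keyword argument to beam.ParDo, wrapped in beam.pvalue.AsSingleton: keyword side inputs surface as keyword arguments of process(). A minimal hypothetical sketch of a DoFn written against that calling convention:

import apache_beam as beam

class ApplyModelSketchDoFn(beam.DoFn):
  """Hypothetical DoFn: receives a singleton side input as a keyword arg."""

  def process(self, batch, saved_model_dir=None):
    # `saved_model_dir` holds the single value from beam.pvalue.AsSingleton(...).
    yield (saved_model_dir, len(batch))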
Example 10: expand
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def expand(self, pcoll):
  return (
      pcoll
      # Assigns window info to each Pub/Sub message based on its
      # publish timestamp.
      | "Window into Fixed Intervals"
      >> beam.WindowInto(window.FixedWindows(self.window_size))
      | "Add timestamps to messages" >> beam.ParDo(AddTimestamps())
      # Use a dummy key to group the elements in the same window.
      # Note that all the elements in one window must fit into memory
      # for this. If the windowed elements do not fit into memory,
      # please consider using `beam.util.BatchElements`.
      # https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.util.html#apache_beam.transforms.util.BatchElements
      | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
      | "Groupby" >> beam.GroupByKey()
      | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
  )
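AddTimestamps is not shown above; judging from the pipeline, it reads each Pub/Sub message's event timestamp (the publish time). One plausible sketch, assuming the conventional DoFn.TimestampParam approach rather than the project's exact implementation:

import datetime
import apache_beam as beam

class AddTimestampsSketch(beam.DoFn):
  """Hypothetical DoFn: attaches the element's publish time as a string."""

  def process(self, element, publish_time=beam.DoFn.TimestampParam):
    yield {
        'message_body': element.decode('utf-8'),
        'publish_time': datetime.datetime.utcfromtimestamp(
            float(publish_time)).strftime('%Y-%m-%d %H:%M:%S.%f'),
    }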
Example 11: testTwoLangs
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def testTwoLangs(self):
  with TestPipeline() as p:
    tokens = p | 'CreateInput' >> beam.Create(self.sample_input)
    result = tokens | beam.ParDo(utils.CompileTokenizationInfo())
    assert_that(result, equal_to([{
        'lang': 'en',
        'count': 1,
        'num_preserved_chars': 13,
        'num_dropped_chars': 2,
        'num_non_unk_wordpieces': 4,
        'preserved_ratio': [13/4],
        'dropped_ratio': [2/15],
        'wordpieces': collections.Counter(['the', 'app', '##le', 'sauce'])
    }, {
        'lang': 'fr',
        'count': 1,
        'num_preserved_chars': 14,
        'num_dropped_chars': 0,
        'num_non_unk_wordpieces': 5,
        'preserved_ratio': [14/5],
        'dropped_ratio': [0],
        'wordpieces': collections.Counter(['bon', '##jour', 'bon', '##soir'])
    }]))
Example 12: _PrestoToExample
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def _PrestoToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    exec_properties: Dict[Text, Any],
    split_pattern: Text) -> beam.pvalue.PCollection:
  """Read from Presto and transform to TF examples.

  Args:
    pipeline: beam pipeline.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, a Presto sql string.

  Returns:
    PCollection of TF examples.
  """
  conn_config = example_gen_pb2.CustomConfig()
  json_format.Parse(exec_properties['custom_config'], conn_config)
  presto_config = presto_config_pb2.PrestoConnConfig()
  conn_config.custom_config.Unpack(presto_config)
  client = _deserialize_conn_config(presto_config)
  return (pipeline
          | 'Query' >> beam.Create([split_pattern])
          | 'QueryTable' >> beam.ParDo(_ReadPrestoDoFn(client))
          | 'ToTFExample' >> beam.Map(_row_to_example))
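_ReadPrestoDoFn turns a single SQL string element into many result rows; its implementation is not shown here. Assuming the client follows the usual DB-API cursor interface, its shape is roughly:

import apache_beam as beam

class ReadPrestoSketchDoFn(beam.DoFn):
  """Hypothetical sketch: one SQL query in, many result rows out."""

  def __init__(self, client):
    self._client = client  # assumed to expose a DB-API style cursor()

  def process(self, query):
    cursor = self._client.cursor()
    cursor.execute(query)
    for row in cursor.fetchall():
      yield row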
Example 13: _ToArrowRecordBatches
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def _ToArrowRecordBatches(
    pcoll: beam.pvalue.PCollection,
    schema: Optional[schema_pb2.Schema]) -> beam.pvalue.PCollection:
  """Converts serialized examples to Arrow RecordBatches.

  Args:
    pcoll: PCollection of Transformed data.
    schema: schema.

  Returns:
    PCollection of Arrow RecordBatches.
  """
  kwargs = tfdv.utils.batch_util.GetBeamBatchKwargs(
      tft_beam.Context.get_desired_batch_size())
  return (
      pcoll
      | 'Values' >> beam.Values()
      | 'BatchElements' >> beam.BatchElements(**kwargs)
      | 'ToArrowRecordBatches' >> beam.ParDo(
          Executor._ToArrowRecordBatchesFn(schema)))
Example 14: shuffle
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def shuffle(p):
  """Shuffles data from PCollection.

  Args:
    p: PCollection.

  Returns:
    PCollection of shuffled data.
  """

  class _AddRandomKey(beam.DoFn):

    def process(self, element):
      yield random.random(), element

  shuffled_data = (
      p
      | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
      | 'GroupByRandom' >> beam.GroupByKey()
      # Drop the random key and emit the grouped values (kv = (key, values)).
      | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
  return shuffled_data
Example 15: shuffle_data
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def shuffle_data(p):
  """Shuffles data from PCollection.

  Args:
    p: PCollection.

  Returns:
    PCollection of shuffled data.
  """

  class _AddRandomKey(beam.DoFn):

    def process(self, element):
      yield (random.random(), element)

  shuffled_data = (
      p
      | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
      | 'GroupByRandom' >> beam.GroupByKey()
      # Drop the random key and emit the grouped values (kv = (key, values)).
      | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
  return shuffled_data
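The last two snippets implement the shuffle by hand with a random key plus GroupByKey. Recent Beam SDKs also ship a built-in beam.Reshuffle() transform (it already appears in Example 5) that internally redistributes elements with random keys, so, assuming a reasonably current SDK, a similar though not identical effect can be written as:

shuffled_data = p | 'Shuffle' >> beam.Reshuffle()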