

Python apache_beam.Map Method Code Examples

This article collects typical usage examples of the apache_beam.Map method in Python. If you are wondering what apache_beam.Map does, how to call it, or what real-world uses of it look like, the curated examples below should help. You can also explore further usage examples from the apache_beam package that the method belongs to.


Fifteen code examples of the apache_beam.Map method are shown below, sorted by popularity by default.
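
Before the project-specific examples, here is a minimal, self-contained sketch of beam.Map on an in-memory PCollection. The element values and step labels are illustrative only; the pipeline runs on the default DirectRunner.

import apache_beam as beam

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'Create' >> beam.Create(['1,alice', '2,bob'])
      # beam.Map applies a one-to-one function to every element.
      | 'SplitCsvLine' >> beam.Map(lambda line: line.split(','))
      | 'ToKeyValue' >> beam.Map(lambda fields: (int(fields[0]), fields[1]))
      | 'Print' >> beam.Map(print))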

Example 1: configure_pipeline

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def configure_pipeline(p, opt):
  """Specify PCollection and transformations in pipeline."""
  read_input_source = beam.io.ReadFromText(
      opt.input_path, strip_trailing_newlines=True)
  read_label_source = beam.io.ReadFromText(
      opt.input_dict, strip_trailing_newlines=True)
  labels = (p | 'Read dictionary' >> read_label_source)
  _ = (p
       | 'Read input' >> read_input_source
       | 'Parse input' >> beam.Map(lambda line: csv.reader([line]).next())
       | 'Extract label ids' >> beam.ParDo(ExtractLabelIdsDoFn(),
                                           beam.pvalue.AsIter(labels))
       | 'Read and convert to JPEG'
       >> beam.ParDo(ReadImageAndConvertToJpegDoFn())
       | 'Embed and make TFExample' >> beam.ParDo(TFExampleFromImageDoFn())
       # TODO(b/35133536): Get rid of this Map and instead use
       # coder=beam.coders.ProtoCoder(tf.train.Example) in WriteToTFRecord
       # below.
       | 'SerializeToString' >> beam.Map(lambda x: x.SerializeToString())
       | 'Save to disk'
       >> beam.io.WriteToTFRecord(opt.output_path,
                                  file_name_suffix='.tfrecord.gz')) 
Developer: GoogleCloudPlatform, Project: cloudml-edge-automation, Lines: 24, Source: preprocess.py
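
The TODO in the example above points at passing a coder to WriteToTFRecord instead of serializing by hand. A hedged sketch of that alternative is shown below; the helper name write_examples is hypothetical, and it assumes the upstream step yields tf.train.Example protos.

import apache_beam as beam
import tensorflow as tf

def write_examples(examples, output_path):
  # Sketch only: lets WriteToTFRecord serialize the protos via ProtoCoder,
  # replacing the explicit 'SerializeToString' Map shown above.
  return (examples
          | 'Save to disk' >> beam.io.WriteToTFRecord(
              output_path,
              coder=beam.coders.ProtoCoder(tf.train.Example),
              file_name_suffix='.tfrecord.gz'))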

Example 2: expand

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def expand(self, pvalue):
    if self._handle.endswith('.csv'):
      # The input is CSV file(s).
      schema = reddit.make_input_schema(mode=self._mode)
      csv_coder = reddit.make_csv_coder(schema, mode=self._mode)
      return (pvalue.pipeline
              | 'ReadFromText' >> beam.io.ReadFromText(
                  self._handle,
                  # TODO(b/35653662): Obviate the need for setting this.
                  coder=beam.coders.BytesCoder())
              | 'ParseCSV' >> beam.Map(csv_coder.decode))
    else:
      # The input is BigQuery table name(s).
      query = reddit.make_standard_sql(self._handle, mode=self._mode)
      return (pvalue.pipeline
              | 'ReadFromBigQuery' >> beam.io.Read(
                  beam.io.BigQuerySource(query=query, use_standard_sql=True)))


# TODO: Perhaps use Reshuffle (https://issues.apache.org/jira/browse/BEAM-1872)? 
Developer: GoogleCloudPlatform, Project: cloudml-samples, Lines: 22, Source: preprocess.py

Example 3: BatchExamplesToArrowRecordBatches

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def BatchExamplesToArrowRecordBatches(
    examples: beam.pvalue.PCollection,
    desired_batch_size: Optional[int] = constants
    .DEFAULT_DESIRED_INPUT_BATCH_SIZE
) -> beam.pvalue.PCollection:
  """Batches example dicts into Arrow record batches.

  Args:
    examples: A PCollection of example dicts.
    desired_batch_size: Batch size. The output Arrow record batches will have as
      many rows as the `desired_batch_size`.

  Returns:
    A PCollection of Arrow record batches.
  """
  return (
      examples
      | "BatchBeamExamples" >> beam.BatchElements(
          **batch_util.GetBatchElementsKwargs(desired_batch_size))
      | "DecodeExamplesToRecordBatch" >> beam.Map(
          # pylint: disable=unnecessary-lambda
          lambda x: decoded_examples_to_arrow.DecodedExamplesToRecordBatch(x)))
          # pylint: enable=unnecessary-lambda 
Developer: tensorflow, Project: data-validation, Lines: 25, Source: batch_util.py
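
For context, beam.BatchElements groups individual elements into Python lists before they reach the Map step. A minimal, self-contained illustration of that batching pattern (toy data, batch-size values chosen arbitrarily) is:

import apache_beam as beam

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'Create' >> beam.Create(range(10))
      # BatchElements emits lists of elements; the size bounds are hints.
      | 'Batch' >> beam.BatchElements(min_batch_size=3, max_batch_size=3)
      | 'BatchSizes' >> beam.Map(len)
      | 'Print' >> beam.Map(print))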

Example 4: get_sources_from_dataset

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def get_sources_from_dataset(p, dataset, mode):
  """get pcollection from dataset."""

  import apache_beam as beam
  import csv
  from google.datalab.ml import CsvDataSet, BigQueryDataSet

  check_dataset(dataset, mode)
  if type(dataset) is CsvDataSet:
    source_list = []
    for ii, input_path in enumerate(dataset.files):
      source_list.append(p | 'Read from Csv %d (%s)' % (ii, mode) >>
                         beam.io.ReadFromText(input_path, strip_trailing_newlines=True))
    return (source_list |
            'Flatten Sources (%s)' % mode >>
            beam.Flatten() |
            'Create Dict from Csv (%s)' % mode >>
            beam.Map(lambda line: csv.DictReader([line], fieldnames=['image_url',
                                                                     'label']).next()))
  elif type(dataset) is BigQueryDataSet:
    bq_source = (beam.io.BigQuerySource(table=dataset.table) if dataset.table is not None else
                 beam.io.BigQuerySource(query=dataset.query))
    return p | 'Read source from BigQuery (%s)' % mode >> beam.io.Read(bq_source)
  else:
    raise ValueError('Invalid DataSet. Expect CsvDataSet or BigQueryDataSet') 
Developer: googledatalab, Project: pydatalab, Lines: 27, Source: _util.py

Example 5: create_glyphazzn_dataset

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def create_glyphazzn_dataset(filepattern, output_path):
  """Creates a glyphazzn dataset, from raw Parquetio to TFRecords."""
  def pipeline(root):
    """Pipeline for creating glyphazzn dataset."""
    attrs = ['uni', 'width', 'vwidth', 'sfd', 'id', 'binary_fp']

    examples = root | 'Read' >> beam.io.parquetio.ReadFromParquet(
        file_pattern=filepattern, columns=attrs)

    examples = examples | 'FilterBadIcons' >> beam.Filter(_is_valid_glyph)
    examples = examples | 'ConvertToPath' >> beam.Map(_convert_to_path)
    examples = examples | 'FilterBadPathLenghts' >> beam.Filter(_is_valid_path)
    examples = examples | 'ProcessAndConvert' >> beam.Map(_create_example)
    (examples | 'WriteToTFRecord' >> beam.io.tfrecordio.WriteToTFRecord(
        output_path, num_shards=90))
  return pipeline 
Developer: magenta, Project: magenta, Lines: 18, Source: datagen_beam.py

Example 6: expand

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def expand(self, pipeline):

    def _make_and_increment_counters(unused_element, analyzer_counter,
                                     mapper_counter):
      del unused_element
      for counter_prefix, counter in (('tft_analyzer_{}', analyzer_counter),
                                      ('tft_mapper_{}', mapper_counter)):
        for name, count in counter.items():
          beam.metrics.Metrics.counter(beam_common.METRICS_NAMESPACE,
                                       counter_prefix.format(name)).inc(count)

    _ = (
        pipeline
        | 'CreateSoleAPIUse' >> beam.Create([None])
        | 'CountAPIUse' >>
        beam.Map(_make_and_increment_counters, self._analyzer_use_counter,
                 self._mapper_use_counter)) 
Developer: tensorflow, Project: transform, Lines: 19, Source: impl.py

Example 7: expand

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def expand(self, pcoll):
        return (
            pcoll
            # Assigns window info to each Pub/Sub message based on its
            # publish timestamp.
            | "Window into Fixed Intervals"
            >> beam.WindowInto(window.FixedWindows(self.window_size))
            | "Add timestamps to messages" >> beam.ParDo(AddTimestamps())
            # Use a dummy key to group the elements in the same window.
            # Note that all the elements in one window must fit into memory
            # for this. If the windowed elements do not fit into memory,
            # please consider using `beam.util.BatchElements`.
            # https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.util.html#apache_beam.transforms.util.BatchElements
            | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
            | "Groupby" >> beam.GroupByKey()
            | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
        ) 
Developer: GoogleCloudPlatform, Project: python-docs-samples, Lines: 19, Source: PubSubToGCS.py
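
AddTimestamps is defined elsewhere in this sample. A hedged sketch of what such a DoFn typically looks like is shown below; the output field names ('message_body', 'publish_time') and the timestamp formatting are assumptions for illustration, not the sample's exact code.

import apache_beam as beam

class AddTimestamps(beam.DoFn):
  # Sketch only: pairs each Pub/Sub payload with its element timestamp.
  def process(self, element, timestamp=beam.DoFn.TimestampParam):
    yield {
        'message_body': element.decode('utf-8'),
        'publish_time': timestamp.to_utc_datetime().strftime(
            '%Y-%m-%d %H:%M:%S.%f'),
    }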

Example 8: run

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def run(args, input_subscription, output_table, window_interval):
    """Build and run the pipeline."""
    options = PipelineOptions(args, save_main_session=True, streaming=True)

    with beam.Pipeline(options=options) as pipeline:

        # Read the messages from PubSub and process them.
        messages = (
            pipeline
            | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
                subscription=input_subscription).with_output_types(bytes)
            | 'UTF-8 bytes to string' >> beam.Map(lambda msg: msg.decode('utf-8'))
            | 'Parse JSON messages' >> beam.Map(parse_json_message)
            | 'Fixed-size windows' >> beam.WindowInto(
                window.FixedWindows(int(window_interval), 0))
            | 'Add URL keys' >> beam.Map(lambda msg: (msg['url'], msg))
            | 'Group by URLs' >> beam.GroupByKey()
            | 'Get statistics' >> beam.Map(get_statistics))

        # Output the results into BigQuery table.
        _ = messages | 'Write to Big Query' >> beam.io.WriteToBigQuery(
            output_table, schema=SCHEMA) 
Developer: GoogleCloudPlatform, Project: python-docs-samples, Lines: 24, Source: streaming_beam.py
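
parse_json_message and get_statistics are defined elsewhere in the sample. Hedged sketches of functions with those shapes are given below; the summary field names are assumptions and would need to match the BigQuery SCHEMA used by the real pipeline.

import json

def parse_json_message(message):
  # Sketch only: each payload is assumed to be a JSON object that contains
  # at least a 'url' field, as implied by the 'Add URL keys' step above.
  return json.loads(message)

def get_statistics(url_messages):
  # Sketch only: reduces (url, iterable-of-messages) to one summary row.
  url, messages = url_messages
  messages = list(messages)
  return {'url': url, 'num_reviews': len(messages)}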

Example 9: compare_bq_row

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def compare_bq_row(row, types_to_ignore):
  """Compare the findings in the given BigQuery row.

  Args:
    row: BQ row: Map containing (findings_record_id, findings_xml, golden_xml).
    types_to_ignore: List of strings representing types that should be excluded
      from the analysis.
  Returns:
    (IndividualResult, IndividualResult), where the first is for strict entity
    matching and the second is for binary token matching.
  Raises:
    Exception: If golden_xml doesn't exist.
  """
  findings, note_text = get_findings_from_text(row['findings_xml'],
                                               types_to_ignore)
  if 'golden_xml' not in row or row['golden_xml'] is None:
    raise Exception(
        'No golden found for record %s.' % row['findings_record_id'])
  golden_findings, golden_note_text = get_findings_from_text(row['golden_xml'],
                                                             types_to_ignore)
  record_id = row['findings_record_id']

  return compare_findings(findings, golden_findings, record_id, note_text,
                          golden_note_text) 
Developer: GoogleCloudPlatform, Project: healthcare-deid, Lines: 26, Source: run_pipeline_lib.py

Example 10: _PrestoToExample

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def _PrestoToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    exec_properties: Dict[Text, Any],
    split_pattern: Text) -> beam.pvalue.PCollection:
  """Read from Presto and transform to TF examples.

  Args:
    pipeline: beam pipeline.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, a Presto sql string.

  Returns:
    PCollection of TF examples.
  """
  conn_config = example_gen_pb2.CustomConfig()
  json_format.Parse(exec_properties['custom_config'], conn_config)
  presto_config = presto_config_pb2.PrestoConnConfig()
  conn_config.custom_config.Unpack(presto_config)

  client = _deserialize_conn_config(presto_config)
  return (pipeline
          | 'Query' >> beam.Create([split_pattern])
          | 'QueryTable' >> beam.ParDo(_ReadPrestoDoFn(client))
          | 'ToTFExample' >> beam.Map(_row_to_example)) 
Developer: tensorflow, Project: tfx, Lines: 26, Source: executor.py

Example 11: _ReadExamples

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def _ReadExamples(
      pipeline: beam.Pipeline, dataset: _Dataset,
      input_dataset_metadata: dataset_metadata.DatasetMetadata
  ) -> beam.pvalue.PCollection:
    """Reads examples from the given `dataset`.

    Args:
      pipeline: beam pipeline.
      dataset: A `_Dataset` object that represents the data to read.
      input_dataset_metadata: A `dataset_metadata.DatasetMetadata`. Not used.

    Returns:
      A PCollection containing KV pairs of bytes.
    """
    del input_dataset_metadata
    assert dataset.file_format == labels.FORMAT_TFRECORD, dataset.file_format

    return (
        pipeline
        | 'Read' >> beam.io.ReadFromTFRecord(
            dataset.file_pattern,
            coder=beam.coders.BytesCoder(),
            # TODO(b/114938612): Eventually remove this override.
            validate=False)
        | 'AddKey' >> beam.Map(lambda x: (None, x))) 
Developer: tensorflow, Project: tfx, Lines: 27, Source: executor.py

Example 12: _DecodeInputs

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def _DecodeInputs(pcoll: beam.pvalue.PCollection,
                    decode_fn: Any) -> beam.pvalue.PCollection:
    """Decodes the given PCollection while handling KV data.

    Args:
      pcoll: PCollection of data.
      decode_fn: Function used to decode data.

    Returns:
      PCollection of decoded data.
    """

    def decode_example(kv: Tuple[Optional[bytes], bytes]) -> Dict[Text, Any]:  # pylint: disable=invalid-name
      """Decodes a single example."""
      (key, value) = kv
      result = decode_fn(value)
      if _TRANSFORM_INTERNAL_FEATURE_FOR_KEY in result:
        raise ValueError('"{}" is a reserved feature name, '
                         'it should not be present in the dataset.'.format(
                             _TRANSFORM_INTERNAL_FEATURE_FOR_KEY))
      result[_TRANSFORM_INTERNAL_FEATURE_FOR_KEY] = key
      return result

    return pcoll | 'ApplyDecodeFn' >> beam.Map(decode_example) 
Developer: tensorflow, Project: tfx, Lines: 26, Source: executor.py

Example 13: _AvroToExample

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def _AvroToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline, exec_properties: Dict[Text, Any],
    split_pattern: Text) -> beam.pvalue.PCollection:
  """Read Avro files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    exec_properties: A dict of execution properties.
      - input_base: input dir that contains Avro data.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.
  """
  input_base_uri = exec_properties[utils.INPUT_BASE_KEY]
  avro_pattern = os.path.join(input_base_uri, split_pattern)
  logging.info('Processing input avro data %s to TFExample.', avro_pattern)

  return (pipeline
          | 'ReadFromAvro' >> beam.io.ReadFromAvro(avro_pattern)
          | 'ToTFExample' >> beam.Map(utils.dict_to_example)) 
Developer: tensorflow, Project: tfx, Lines: 26, Source: avro_executor.py
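
utils.dict_to_example belongs to the TFX example-gen utilities. A simplified, hedged sketch of such a dict-to-tf.train.Example conversion is shown below; it handles single int, float, bytes/str and None values only, which is narrower than the real helper.

import tensorflow as tf

def dict_to_example(instance):
  # Sketch only: wraps each scalar value in the matching tf.train.Feature kind.
  feature = {}
  for key, value in instance.items():
    if value is None:
      feature[key] = tf.train.Feature()
    elif isinstance(value, int):
      feature[key] = tf.train.Feature(
          int64_list=tf.train.Int64List(value=[value]))
    elif isinstance(value, float):
      feature[key] = tf.train.Feature(
          float_list=tf.train.FloatList(value=[value]))
    else:
      if isinstance(value, str):
        value = value.encode('utf-8')
      feature[key] = tf.train.Feature(
          bytes_list=tf.train.BytesList(value=[value]))
  return tf.train.Example(features=tf.train.Features(feature=feature))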

Example 14: testImportExample

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def testImportExample(self):
    with beam.Pipeline() as pipeline:
      examples = (
          pipeline
          | 'ToSerializedRecord' >> executor._ImportSerializedRecord(
              exec_properties={utils.INPUT_BASE_KEY: self._input_data_dir},
              split_pattern='tfrecord/*')
          | 'ToTFExample' >> beam.Map(tf.train.Example.FromString))

      def check_result(got):
        # We use Python assertion here to avoid Beam serialization error in
        # pickling tf.test.TestCase.
        assert (15000 == len(got)), 'Unexpected example count'
        assert (18 == len(got[0].features.feature)), 'Example not match'

      util.assert_that(examples, check_result) 
Developer: tensorflow, Project: tfx, Lines: 18, Source: executor_test.py

Example 15: expand

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def expand(self, pcoll):
    return (pcoll
            | 'ExtractIdNameTuples' >> beam.Map(self._extract_id_name)
            | 'CombineToDict' >> beam.combiners.ToDict()) 
Developer: googlegenomics, Project: gcp-variant-transforms, Lines: 6, Source: sample_mapping_table.py
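
For context, beam.combiners.ToDict collapses a PCollection of (key, value) tuples into a single dict. A minimal, self-contained sketch of the same Map + ToDict pattern, with toy records and hypothetical field names, is:

import apache_beam as beam

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'Create' >> beam.Create([
          {'id': 'S1', 'name': 'sample_1'},
          {'id': 'S2', 'name': 'sample_2'},
      ])
      # Each record becomes an (id, name) tuple; ToDict then yields a
      # single-element PCollection holding {'S1': 'sample_1', 'S2': 'sample_2'}.
      | 'ExtractIdNameTuples' >> beam.Map(lambda r: (r['id'], r['name']))
      | 'CombineToDict' >> beam.combiners.ToDict()
      | 'Print' >> beam.Map(print))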


Note: The apache_beam.Map examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective developers, and copyright of the source code belongs to the original authors; please consult each project's License before redistributing or reusing it. Do not reproduce this article without permission.