This page collects typical usage examples of the Python apache_beam.Map method. If you are wondering what apache_beam.Map does or how to use it in practice, the curated code samples below should help. You can also explore further usage examples from the apache_beam module that this method belongs to.
The following 15 code examples of apache_beam.Map are shown, sorted by popularity by default.
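Before diving into the examples: beam.Map applies a one-to-one function to every element of a PCollection. Below is a minimal, self-contained sketch of that idea; the input strings and step labels are invented for illustration, and the pipeline runs locally on the default DirectRunner.

import apache_beam as beam

# Build and run a tiny local pipeline that exercises beam.Map twice:
# once to split a CSV-like line, once to turn it into a (key, int) pair.
with beam.Pipeline() as p:
  _ = (p
       | 'Create' >> beam.Create(['a,1', 'b,2', 'c,3'])
       | 'SplitLine' >> beam.Map(lambda line: line.split(','))
       | 'ToKV' >> beam.Map(lambda fields: (fields[0], int(fields[1])))
       | 'Print' >> beam.Map(print))

Map is strictly one-to-one; for one-to-many or side-input-driven processing, the examples below reach for beam.ParDo or beam.FlatMap instead.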
Example 1: configure_pipeline
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def configure_pipeline(p, opt):
  """Specify PCollection and transformations in pipeline."""
  read_input_source = beam.io.ReadFromText(
      opt.input_path, strip_trailing_newlines=True)
  read_label_source = beam.io.ReadFromText(
      opt.input_dict, strip_trailing_newlines=True)
  labels = (p | 'Read dictionary' >> read_label_source)
  _ = (p
       | 'Read input' >> read_input_source
       | 'Parse input' >> beam.Map(lambda line: csv.reader([line]).next())
       | 'Extract label ids' >> beam.ParDo(ExtractLabelIdsDoFn(),
                                           beam.pvalue.AsIter(labels))
       | 'Read and convert to JPEG'
       >> beam.ParDo(ReadImageAndConvertToJpegDoFn())
       | 'Embed and make TFExample' >> beam.ParDo(TFExampleFromImageDoFn())
       # TODO(b/35133536): Get rid of this Map and instead use
       # coder=beam.coders.ProtoCoder(tf.train.Example) in WriteToTFRecord
       # below.
       | 'SerializeToString' >> beam.Map(lambda x: x.SerializeToString())
       | 'Save to disk'
       >> beam.io.WriteToTFRecord(opt.output_path,
                                  file_name_suffix='.tfrecord.gz'))
Example 2: expand
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def expand(self, pvalue):
  if self._handle.endswith('.csv'):
    # The input is CSV file(s).
    schema = reddit.make_input_schema(mode=self._mode)
    csv_coder = reddit.make_csv_coder(schema, mode=self._mode)
    return (pvalue.pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(
                self._handle,
                # TODO(b/35653662): Obviate the need for setting this.
                coder=beam.coders.BytesCoder())
            | 'ParseCSV' >> beam.Map(csv_coder.decode))
  else:
    # The input is BigQuery table name(s).
    query = reddit.make_standard_sql(self._handle, mode=self._mode)
    return (pvalue.pipeline
            | 'ReadFromBigQuery' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))

# TODO: Perhaps use Reshuffle (https://issues.apache.org/jira/browse/BEAM-1872)?
Example 3: BatchExamplesToArrowRecordBatches
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def BatchExamplesToArrowRecordBatches(
    examples: beam.pvalue.PCollection,
    desired_batch_size: Optional[int] = constants
    .DEFAULT_DESIRED_INPUT_BATCH_SIZE
) -> beam.pvalue.PCollection:
  """Batches example dicts into Arrow record batches.

  Args:
    examples: A PCollection of example dicts.
    desired_batch_size: Batch size. The output Arrow record batches will have
      as many rows as the `desired_batch_size`.

  Returns:
    A PCollection of Arrow record batches.
  """
  return (
      examples
      | "BatchBeamExamples" >> beam.BatchElements(
          **batch_util.GetBatchElementsKwargs(desired_batch_size))
      | "DecodeExamplesToRecordBatch" >> beam.Map(
          # pylint: disable=unnecessary-lambda
          lambda x: decoded_examples_to_arrow.DecodedExamplesToRecordBatch(x)))
  # pylint: enable=unnecessary-lambda
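The batching step in Example 3 is driven by beam.BatchElements, which buffers individual elements into lists before they are converted to Arrow record batches. Here is a standalone sketch of that primitive with invented element values and an explicit batch size, rather than the tuned kwargs used above.

import apache_beam as beam

# Group single dicts into fixed-size lists, then print each batch's length.
with beam.Pipeline() as p:
  _ = (p
       | 'Create' >> beam.Create([{'f': i} for i in range(10)])
       | 'Batch' >> beam.BatchElements(min_batch_size=4, max_batch_size=4)
       | 'BatchSize' >> beam.Map(len)
       | 'Print' >> beam.Map(print))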
Example 4: get_sources_from_dataset
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def get_sources_from_dataset(p, dataset, mode):
  """get pcollection from dataset."""
  import apache_beam as beam
  import csv
  from google.datalab.ml import CsvDataSet, BigQueryDataSet

  check_dataset(dataset, mode)
  if type(dataset) is CsvDataSet:
    source_list = []
    for ii, input_path in enumerate(dataset.files):
      source_list.append(p | 'Read from Csv %d (%s)' % (ii, mode) >>
                         beam.io.ReadFromText(input_path, strip_trailing_newlines=True))
    return (source_list |
            'Flatten Sources (%s)' % mode >>
            beam.Flatten() |
            'Create Dict from Csv (%s)' % mode >>
            beam.Map(lambda line: csv.DictReader([line], fieldnames=['image_url',
                                                                     'label']).next()))
  elif type(dataset) is BigQueryDataSet:
    bq_source = (beam.io.BigQuerySource(table=dataset.table) if dataset.table is not None else
                 beam.io.BigQuerySource(query=dataset.query))
    return p | 'Read source from BigQuery (%s)' % mode >> beam.io.Read(bq_source)
  else:
    raise ValueError('Invalid DataSet. Expect CsvDataSet or BigQueryDataSet')
Example 5: create_glyphazzn_dataset
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def create_glyphazzn_dataset(filepattern, output_path):
  """Creates a glyphazzn dataset, from raw Parquetio to TFRecords."""
  def pipeline(root):
    """Pipeline for creating glyphazzn dataset."""
    attrs = ['uni', 'width', 'vwidth', 'sfd', 'id', 'binary_fp']
    examples = root | 'Read' >> beam.io.parquetio.ReadFromParquet(
        file_pattern=filepattern, columns=attrs)
    examples = examples | 'FilterBadIcons' >> beam.Filter(_is_valid_glyph)
    examples = examples | 'ConvertToPath' >> beam.Map(_convert_to_path)
    examples = examples | 'FilterBadPathLenghts' >> beam.Filter(_is_valid_path)
    examples = examples | 'ProcessAndConvert' >> beam.Map(_create_example)
    (examples | 'WriteToTFRecord' >> beam.io.tfrecordio.WriteToTFRecord(
        output_path, num_shards=90))
  return pipeline
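Example 5 returns a pipeline-building function rather than running one. One way to execute such a builder, an assumption on my part rather than something shown in the source, is to hand it the root of a locally constructed Pipeline:

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

def run_glyphazzn(filepattern, output_path):
  # create_glyphazzn_dataset is defined in Example 5 above; the default
  # options (i.e. the local DirectRunner) are illustrative only.
  pipeline_fn = create_glyphazzn_dataset(filepattern, output_path)
  with beam.Pipeline(options=PipelineOptions()) as root:
    pipeline_fn(root)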
Example 6: expand
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def expand(self, pipeline):
  def _make_and_increment_counters(unused_element, analyzer_counter,
                                   mapper_counter):
    del unused_element
    for counter_prefix, counter in (('tft_analyzer_{}', analyzer_counter),
                                    ('tft_mapper_{}', mapper_counter)):
      for name, count in counter.items():
        beam.metrics.Metrics.counter(beam_common.METRICS_NAMESPACE,
                                     counter_prefix.format(name)).inc(count)

  _ = (
      pipeline
      | 'CreateSoleAPIUse' >> beam.Create([None])
      | 'CountAPIUse' >>
      beam.Map(_make_and_increment_counters, self._analyzer_use_counter,
               self._mapper_use_counter))
Example 7: expand
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def expand(self, pcoll):
  return (
      pcoll
      # Assigns window info to each Pub/Sub message based on its
      # publish timestamp.
      | "Window into Fixed Intervals"
      >> beam.WindowInto(window.FixedWindows(self.window_size))
      | "Add timestamps to messages" >> beam.ParDo(AddTimestamps())
      # Use a dummy key to group the elements in the same window.
      # Note that all the elements in one window must fit into memory
      # for this. If the windowed elements do not fit into memory,
      # please consider using `beam.util.BatchElements`.
      # https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.util.html#apache_beam.transforms.util.BatchElements
      | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
      | "Groupby" >> beam.GroupByKey()
      | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
  )
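Example 7 is the expand() of a composite PTransform whose surrounding class is not shown. Assuming it is something like GroupMessagesByFixedWindows(window_size) — the class name, subscription argument, and option values below are assumptions for illustration — a streaming pipeline could apply it like this:

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

def run(input_subscription, window_size=60):
  options = PipelineOptions(streaming=True, save_main_session=True)
  with beam.Pipeline(options=options) as pipeline:
    _ = (pipeline
         # Read raw bytes from Pub/Sub, then window and group them using the
         # transform whose expand() is shown in Example 7.
         | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(subscription=input_subscription)
         | 'WindowAndGroup' >> GroupMessagesByFixedWindows(window_size)
         | 'PrintBatch' >> beam.Map(print))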
Example 8: run
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def run(args, input_subscription, output_table, window_interval):
  """Build and run the pipeline."""
  options = PipelineOptions(args, save_main_session=True, streaming=True)
  with beam.Pipeline(options=options) as pipeline:
    # Read the messages from PubSub and process them.
    messages = (
        pipeline
        | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
            subscription=input_subscription).with_output_types(bytes)
        | 'UTF-8 bytes to string' >> beam.Map(lambda msg: msg.decode('utf-8'))
        | 'Parse JSON messages' >> beam.Map(parse_json_message)
        | 'Fixed-size windows' >> beam.WindowInto(
            window.FixedWindows(int(window_interval), 0))
        | 'Add URL keys' >> beam.Map(lambda msg: (msg['url'], msg))
        | 'Group by URLs' >> beam.GroupByKey()
        | 'Get statistics' >> beam.Map(get_statistics))

    # Output the results into BigQuery table.
    _ = messages | 'Write to Big Query' >> beam.io.WriteToBigQuery(
        output_table, schema=SCHEMA)
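Example 8 references three helpers that are not shown here: parse_json_message, get_statistics, and SCHEMA. The sketch below is purely an assumption about what they might look like, reconstructed from how the pipeline uses them; only the 'url' field is actually required by the code above.

import json

# Hypothetical BigQuery schema string matching the rows produced below.
SCHEMA = 'url:STRING,num_reviews:INTEGER'

def parse_json_message(message):
  # Decode one JSON message (already a UTF-8 string at this point in the
  # pipeline) into a dict; the pipeline later keys on msg['url'].
  return json.loads(message)

def get_statistics(url_messages):
  # After GroupByKey the element is (url, iterable of messages); reduce it
  # to a single row per URL.
  url, messages = url_messages
  return {'url': url, 'num_reviews': len(list(messages))}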
Example 9: compare_bq_row
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def compare_bq_row(row, types_to_ignore):
  """Compare the findings in the given BigQuery row.

  Args:
    row: BQ row: Map containing (findings_record_id, findings_xml, golden_xml).
    types_to_ignore: List of strings representing types that should be excluded
      from the analysis.

  Returns:
    (IndividualResult, IndividualResult), where the first is for strict entity
    matching and the second is for binary token matching.

  Raises:
    Exception: If golden_xml doesn't exist.
  """
  findings, note_text = get_findings_from_text(row['findings_xml'],
                                               types_to_ignore)
  if 'golden_xml' not in row or row['golden_xml'] is None:
    raise Exception(
        'No golden found for record %s.' % row['findings_record_id'])
  golden_findings, golden_note_text = get_findings_from_text(row['golden_xml'],
                                                             types_to_ignore)
  record_id = row['findings_record_id']
  return compare_findings(findings, golden_findings, record_id, note_text,
                          golden_note_text)
Example 10: _PrestoToExample
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def _PrestoToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    exec_properties: Dict[Text, Any],
    split_pattern: Text) -> beam.pvalue.PCollection:
  """Read from Presto and transform to TF examples.

  Args:
    pipeline: beam pipeline.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, a Presto sql string.

  Returns:
    PCollection of TF examples.
  """
  conn_config = example_gen_pb2.CustomConfig()
  json_format.Parse(exec_properties['custom_config'], conn_config)
  presto_config = presto_config_pb2.PrestoConnConfig()
  conn_config.custom_config.Unpack(presto_config)

  client = _deserialize_conn_config(presto_config)
  return (pipeline
          | 'Query' >> beam.Create([split_pattern])
          | 'QueryTable' >> beam.ParDo(_ReadPrestoDoFn(client))
          | 'ToTFExample' >> beam.Map(_row_to_example))
Example 11: _ReadExamples
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def _ReadExamples(
    pipeline: beam.Pipeline, dataset: _Dataset,
    input_dataset_metadata: dataset_metadata.DatasetMetadata
) -> beam.pvalue.PCollection:
  """Reads examples from the given `dataset`.

  Args:
    pipeline: beam pipeline.
    dataset: A `_Dataset` object that represents the data to read.
    input_dataset_metadata: A `dataset_metadata.DatasetMetadata`. Not used.

  Returns:
    A PCollection containing KV pairs of bytes.
  """
  del input_dataset_metadata
  assert dataset.file_format == labels.FORMAT_TFRECORD, dataset.file_format

  return (
      pipeline
      | 'Read' >> beam.io.ReadFromTFRecord(
          dataset.file_pattern,
          coder=beam.coders.BytesCoder(),
          # TODO(b/114938612): Eventually remove this override.
          validate=False)
      | 'AddKey' >> beam.Map(lambda x: (None, x)))
Example 12: _DecodeInputs
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def _DecodeInputs(pcoll: beam.pvalue.PCollection,
                  decode_fn: Any) -> beam.pvalue.PCollection:
  """Decodes the given PCollection while handling KV data.

  Args:
    pcoll: PCollection of data.
    decode_fn: Function used to decode data.

  Returns:
    PCollection of decoded data.
  """

  def decode_example(kv: Tuple[Optional[bytes], bytes]) -> Dict[Text, Any]:  # pylint: disable=invalid-name
    """Decodes a single example."""
    (key, value) = kv
    result = decode_fn(value)
    if _TRANSFORM_INTERNAL_FEATURE_FOR_KEY in result:
      raise ValueError('"{}" is a reserved feature name, '
                       'it should not be present in the dataset.'.format(
                           _TRANSFORM_INTERNAL_FEATURE_FOR_KEY))
    result[_TRANSFORM_INTERNAL_FEATURE_FOR_KEY] = key
    return result

  return pcoll | 'ApplyDecodeFn' >> beam.Map(decode_example)
Example 13: _AvroToExample
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def _AvroToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline, exec_properties: Dict[Text, Any],
    split_pattern: Text) -> beam.pvalue.PCollection:
  """Read Avro files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    exec_properties: A dict of execution properties.
      - input_base: input dir that contains Avro data.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.
  """
  input_base_uri = exec_properties[utils.INPUT_BASE_KEY]
  avro_pattern = os.path.join(input_base_uri, split_pattern)
  logging.info('Processing input avro data %s to TFExample.', avro_pattern)

  return (pipeline
          | 'ReadFromAvro' >> beam.io.ReadFromAvro(avro_pattern)
          | 'ToTFExample' >> beam.Map(utils.dict_to_example))
Example 14: testImportExample
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def testImportExample(self):
  with beam.Pipeline() as pipeline:
    examples = (
        pipeline
        | 'ToSerializedRecord' >> executor._ImportSerializedRecord(
            exec_properties={utils.INPUT_BASE_KEY: self._input_data_dir},
            split_pattern='tfrecord/*')
        | 'ToTFExample' >> beam.Map(tf.train.Example.FromString))

    def check_result(got):
      # We use Python assertion here to avoid Beam serialization error in
      # pickling tf.test.TestCase.
      assert (15000 == len(got)), 'Unexpected example count'
      assert (18 == len(got[0].features.feature)), 'Example not match'

    util.assert_that(examples, check_result)
Example 15: expand
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Map [as alias]
def expand(self, pcoll):
  return (pcoll
          | 'ExtractIdNameTuples' >> beam.Map(self._extract_id_name)
          | 'CombineToDict' >> beam.combiners.ToDict())
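The ToDict combiner used in Example 15 folds an entire PCollection of (key, value) pairs into a single dict. A minimal standalone sketch with invented pairs:

import apache_beam as beam

# Combine three (id, name) tuples into one dict and print it.
with beam.Pipeline() as p:
  _ = (p
       | 'CreatePairs' >> beam.Create([(1, 'alpha'), (2, 'beta'), (3, 'gamma')])
       | 'CombineToDict' >> beam.combiners.ToDict()
       | 'Print' >> beam.Map(print))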