This article collects typical usage examples of the Python method apache_beam.ParDo. If you are wondering exactly what apache_beam.ParDo does or how to use it, the curated code examples below may help. You can also explore further usage examples from the apache_beam module itself.
The following presents 15 code examples of the apache_beam.ParDo method, sorted by popularity by default.
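All of the examples share the same core pattern: a beam.DoFn subclass whose process() method yields zero or more outputs per input element, applied to a PCollection with beam.ParDo. As a minimal, self-contained sketch of that pattern (the DoFn and the element values here are illustrative only, not taken from any of the projects cited below):

import apache_beam as beam

class SplitWordsFn(beam.DoFn):
  """Splits each input line into individual words."""

  def process(self, element):
    # process() may yield zero or more outputs per input element.
    for word in element.split():
      yield word

with beam.Pipeline() as p:
  _ = (p
       | 'Create' >> beam.Create(['hello world', 'apache beam'])
       | 'SplitWords' >> beam.ParDo(SplitWordsFn())
       | 'Print' >> beam.Map(print))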
Example 1: expand
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def expand(self, pcoll):
  return (pcoll
          | 'InferHeaderFields' >> beam.ParDo(
              _InferHeaderFields(self._infer_headers,
                                 self._annotation_fields_to_infer),
              self._defined_headers)
          # TODO(nmousavi): Modify the MergeHeaders to resolve 1 vs '.'
          # mismatch for headers extracted from variants.
          #
          # Note: argument `split_alternate_allele_info_fields` is not
          # relevant here since no fields with `Number=A` will be extracted
          # from variants, therefore we let the default value (True) for it
          # be used. Should this change, we should modify the default value.
          | 'MergeHeaders' >> merge_headers.MergeHeaders(
              split_alternate_allele_info_fields=True,
              allow_incompatible_records=(
                  self._allow_incompatible_records or
                  bool(self._annotation_fields_to_infer))))
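_InferHeaderFields is specific to the project above, but the snippet illustrates a general ParDo feature: extra arguments passed to beam.ParDo after the DoFn instance (here self._defined_headers) are forwarded to process() as additional arguments. A hedged sketch of that mechanism, with hypothetical names:

import apache_beam as beam

class DropDefinedHeadersFn(beam.DoFn):
  """Hypothetical DoFn: keeps only headers that are not already defined."""

  def process(self, inferred_header, defined_headers):
    # `defined_headers` is bound to the second argument given to beam.ParDo.
    if inferred_header not in (defined_headers or []):
      yield inferred_header

# inferred | beam.ParDo(DropDefinedHeadersFn(), ['CHROM', 'POS'])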
Example 2: test_convert_variant_to_bigquery_row
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def test_convert_variant_to_bigquery_row(self):
  variant_1, row_1, header_num_dict_1 = self._get_sample_variant_1()
  variant_2, row_2, header_num_dict_2 = self._get_sample_variant_2()
  variant_3, row_3, header_num_dict_3 = self._get_sample_variant_3()
  header_num_dict = header_num_dict_1.copy()
  header_num_dict.update(header_num_dict_2)
  header_num_dict.update(header_num_dict_3)
  header_fields = vcf_header_util.make_header(header_num_dict)
  proc_var_1 = processed_variant.ProcessedVariantFactory(
      header_fields).create_processed_variant(variant_1)
  proc_var_2 = processed_variant.ProcessedVariantFactory(
      header_fields).create_processed_variant(variant_2)
  proc_var_3 = processed_variant.ProcessedVariantFactory(
      header_fields).create_processed_variant(variant_3)
  pipeline = TestPipeline(blocking=True)
  bigquery_rows = (
      pipeline
      | Create([proc_var_1, proc_var_2, proc_var_3])
      | 'ConvertToRow' >> beam.ParDo(ConvertVariantToRow(
          self._row_generator)))
  assert_that(bigquery_rows, equal_to([row_1, row_2, row_3]))
  pipeline.run()
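This test relies on Beam's standard testing utilities; assuming the usual import paths, they come from:

from apache_beam import Create
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

assert_that attaches a pipeline-level assertion, so the expected rows are only checked once pipeline.run() executes.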
Example 3: configure_pipeline
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def configure_pipeline(p, opt):
  """Specify PCollection and transformations in pipeline."""
  read_input_source = beam.io.ReadFromText(
      opt.input_path, strip_trailing_newlines=True)
  read_label_source = beam.io.ReadFromText(
      opt.input_dict, strip_trailing_newlines=True)
  labels = (p | 'Read dictionary' >> read_label_source)
  _ = (p
       | 'Read input' >> read_input_source
       # Parse one CSV row per input line.
       | 'Parse input' >> beam.Map(lambda line: next(csv.reader([line])))
       | 'Extract label ids' >> beam.ParDo(ExtractLabelIdsDoFn(),
                                           beam.pvalue.AsIter(labels))
       | 'Read and convert to JPEG'
       >> beam.ParDo(ReadImageAndConvertToJpegDoFn())
       | 'Embed and make TFExample' >> beam.ParDo(TFExampleFromImageDoFn())
       # TODO(b/35133536): Get rid of this Map and instead use
       # coder=beam.coders.ProtoCoder(tf.train.Example) in WriteToTFRecord
       # below.
       | 'SerializeToString' >> beam.Map(lambda x: x.SerializeToString())
       | 'Save to disk'
       >> beam.io.WriteToTFRecord(opt.output_path,
                                  file_name_suffix='.tfrecord.gz'))
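The 'Extract label ids' step passes the labels PCollection as a side input via beam.pvalue.AsIter, which arrives in the DoFn as an extra iterable argument. ExtractLabelIdsDoFn itself is not shown above; a hypothetical stand-in illustrating the shape of such a DoFn:

import apache_beam as beam

class ExtractLabelIdsSketchDoFn(beam.DoFn):
  """Hypothetical stand-in: maps a CSV row's label name to a numeric id."""

  def process(self, row, all_labels):
    # `all_labels` is the materialized iterable from beam.pvalue.AsIter(labels).
    label_to_id = {label: i for i, label in enumerate(sorted(all_labels))}
    uri, label = row[0], row[1]
    yield uri, label_to_id.get(label)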
Example 4: run
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def run(p, input_path, output_directory, train_fraction=0.8):
  """Runs the pipeline."""
  raw_data = (p | "ReadTrainData" >> beam.io.Read(CsvFileSource(
      input_path, column_names=constants.CSV_COLUMNS)))
  train_data, eval_data = split_data(raw_data, train_fraction)
  (train_data | "PrepareCSV_train" >> beam.ParDo(
      ConvertDictToCSV(ordered_fieldnames=constants.CSV_COLUMNS))
   | "Write_train" >> beam.io.WriteToText(
       os.path.join(output_directory, "output_data", "train"),
       file_name_suffix=".csv"))
  (eval_data | "PrepareCSV_eval" >> beam.ParDo(
      ConvertDictToCSV(ordered_fieldnames=constants.CSV_COLUMNS))
   | "Write_eval" >> beam.io.WriteToText(
       os.path.join(output_directory, "output_data", "eval"),
       file_name_suffix=".csv"))
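split_data is defined elsewhere in that project; a rough sketch of one way such a helper could split a PCollection by fraction (purely illustrative, using a simple non-deterministic random partition):

import random
import apache_beam as beam

def split_data_sketch(examples, train_fraction):
  """Hypothetical helper: randomly partitions a PCollection into train/eval."""
  # beam.Partition routes each element to the partition whose index the
  # function returns: 0 -> train (probability train_fraction), 1 -> eval.
  train, eval_ = (
      examples
      | 'TrainEvalPartition' >> beam.Partition(
          lambda example, n: int(random.random() >= train_fraction), 2))
  return train, eval_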
Example 5: main
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def main(_):
  beam_utils.BeamInit()
  if not FLAGS.output_file_pattern:
    raise ValueError('Must provide an output_file_pattern')
  reader = beam.io.ReadFromTFRecord(
      FLAGS.input_file_pattern, coder=beam.coders.ProtoCoder(tf.train.Example))
  model_name = FLAGS.model_name
  split = FLAGS.split
  run_preprocessors = FLAGS.run_preprocessors
  with beam_utils.GetPipelineRoot() as root:
    _ = (
        root
        | 'Read' >> reader
        | 'ToTFExample' >> beam.ParDo(
            _ProcessShard(model_name, split, run_preprocessors))
        | 'Reshuffle' >> beam.Reshuffle()
        | 'Write' >> beam.io.WriteToTFRecord(
            FLAGS.output_file_pattern,
            coder=beam.coders.ProtoCoder(tf.train.Example)))
Example 6: GetPipelineRoot
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def GetPipelineRoot(options=None):
  """Return the root of the beam pipeline.

  Typical usage looks like:

    with GetPipelineRoot() as root:
      _ = (root | beam.ParDo() | ...)

  In this example, the pipeline is automatically executed when the context is
  exited, though one can manually run the pipeline built from the root object
  as well.

  Args:
    options: A beam.options.pipeline_options.PipelineOptions object.

  Returns:
    A beam.Pipeline root object.
  """
  return beam.Pipeline(options=options)
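Since GetPipelineRoot simply returns a beam.Pipeline, the context-manager usage from its docstring can be combined with explicit pipeline options. A small hedged sketch of what a caller might look like:

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions(['--runner=DirectRunner'])
with GetPipelineRoot(options=options) as root:
  _ = (root
       | 'Create' >> beam.Create(range(10))
       | 'Double' >> beam.Map(lambda x: x * 2))
# The pipeline runs automatically when the `with` block exits.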
Example 7: BatchSerializedExamplesToArrowRecordBatches
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def BatchSerializedExamplesToArrowRecordBatches(
    examples: beam.pvalue.PCollection,
    desired_batch_size: Optional[int] = constants
    .DEFAULT_DESIRED_INPUT_BATCH_SIZE
) -> beam.pvalue.PCollection:
  """Batches serialized examples into Arrow record batches.

  Args:
    examples: A PCollection of serialized tf.Examples.
    desired_batch_size: Batch size. The output Arrow record batches will have
      as many rows as the `desired_batch_size`.

  Returns:
    A PCollection of Arrow record batches.
  """
  return (examples
          | "BatchSerializedExamples" >> beam.BatchElements(
              **batch_util.GetBatchElementsKwargs(desired_batch_size))
          | "BatchDecodeExamples" >> beam.ParDo(_BatchDecodeExamplesDoFn()))
Example 8: make_prediction_pipeline
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def make_prediction_pipeline(pipeline, args):
  """Builds the prediction pipeline.

  Reads the csv files, prepends a ',' if the target column is missing, runs
  prediction, and then prints the formatted results to a file.

  Args:
    pipeline: the pipeline
    args: command line args
  """
  # DF bug: DF does not work with unicode strings
  predicted_values, errors = (
      pipeline |
      'Read CSV Files' >>
      beam.io.ReadFromText(str(args.predict_data),
                           strip_trailing_newlines=True) |
      'Batch Input' >>
      beam.ParDo(EmitAsBatchDoFn(args.batch_size)) |
      'Run TF Graph on Batches' >>
      beam.ParDo(RunGraphDoFn(args.trained_model_dir)).with_outputs(
          'errors', main='main'))
  ((predicted_values, errors) |
   'Format and Save' >>
   FormatAndSave(args))
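The 'Run TF Graph on Batches' step uses .with_outputs('errors', main='main'), so RunGraphDoFn must emit failures to a tagged output. A hedged sketch of that multi-output pattern, using a hypothetical DoFn:

import apache_beam as beam
from apache_beam import pvalue

class SafeParseDoFn(beam.DoFn):
  """Hypothetical DoFn: good results go to 'main', failures to 'errors'."""

  def process(self, line):
    try:
      yield float(line)  # untagged outputs go to the main output
    except ValueError:
      yield pvalue.TaggedOutput('errors', line)

# results = lines | beam.ParDo(SafeParseDoFn()).with_outputs('errors', main='main')
# values, errors = results['main'], results['errors']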
Example 9: expand
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def expand(self, inputs):
  saved_model_dir_pcol, input_values_pcol = inputs
  # We don't deep_copy pcollections used for the first phase, or when
  # the user defined `Context` disables it.
  if self._phase > 0 and Context.get_use_deep_copy_optimization():
    # Obviates unnecessary data materialization when the input data source is
    # safe to read more than once.
    tf.compat.v1.logging.info('Deep copying inputs for phase: %d',
                              self._phase)
    input_values_pcol = deep_copy.deep_copy(input_values_pcol)
  if not self._use_tfxio:
    input_values_pcol |= 'BatchInputs' >> _BatchElements()
  return (input_values_pcol | 'ApplySavedModel' >> beam.ParDo(
      _RunMetaGraphDoFn(
          self._tf_config,
          use_tfxio=self._use_tfxio,
          input_schema=self._input_schema,
          input_tensor_adapter_config=self._input_tensor_adapter_config,
          shared_graph_state_handle=shared.Shared(),
          passthrough_keys=Context.get_passthrough_keys()),
      saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir_pcol)))
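Note how the saved model directory is supplied as a keyword argument to beam.ParDo, wrapped in beam.pvalue.AsSingleton: keyword side inputs surface as keyword arguments of process(). A minimal hypothetical sketch of a DoFn written against that calling convention:

import apache_beam as beam

class ApplyModelSketchDoFn(beam.DoFn):
  """Hypothetical DoFn: receives a singleton side input as a keyword arg."""

  def process(self, batch, saved_model_dir=None):
    # `saved_model_dir` holds the single value from beam.pvalue.AsSingleton(...).
    yield (saved_model_dir, len(batch))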
Example 10: expand
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def expand(self, pcoll):
  return (
      pcoll
      # Assigns window info to each Pub/Sub message based on its
      # publish timestamp.
      | "Window into Fixed Intervals"
      >> beam.WindowInto(window.FixedWindows(self.window_size))
      | "Add timestamps to messages" >> beam.ParDo(AddTimestamps())
      # Use a dummy key to group the elements in the same window.
      # Note that all the elements in one window must fit into memory
      # for this. If the windowed elements do not fit into memory,
      # please consider using `beam.util.BatchElements`.
      # https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.util.html#apache_beam.transforms.util.BatchElements
      | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
      | "Groupby" >> beam.GroupByKey()
      | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
  )
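AddTimestamps is not shown above; judging from the pipeline, it reads each Pub/Sub message's event timestamp (the publish time). One plausible sketch, assuming the conventional DoFn.TimestampParam approach rather than the project's exact implementation:

import datetime
import apache_beam as beam

class AddTimestampsSketch(beam.DoFn):
  """Hypothetical DoFn: attaches the element's publish time as a string."""

  def process(self, element, publish_time=beam.DoFn.TimestampParam):
    yield {
        'message_body': element.decode('utf-8'),
        'publish_time': datetime.datetime.utcfromtimestamp(
            float(publish_time)).strftime('%Y-%m-%d %H:%M:%S.%f'),
    }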
Example 11: testTwoLangs
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def testTwoLangs(self):
  with TestPipeline() as p:
    tokens = p | 'CreateInput' >> beam.Create(self.sample_input)
    result = tokens | beam.ParDo(utils.CompileTokenizationInfo())
    assert_that(result, equal_to([{
        'lang': 'en',
        'count': 1,
        'num_preserved_chars': 13,
        'num_dropped_chars': 2,
        'num_non_unk_wordpieces': 4,
        'preserved_ratio': [13/4],
        'dropped_ratio': [2/15],
        'wordpieces': collections.Counter(['the', 'app', '##le', 'sauce'])
    }, {
        'lang': 'fr',
        'count': 1,
        'num_preserved_chars': 14,
        'num_dropped_chars': 0,
        'num_non_unk_wordpieces': 5,
        'preserved_ratio': [14/5],
        'dropped_ratio': [0],
        'wordpieces': collections.Counter(['bon', '##jour', 'bon', '##soir'])
    }]))
Example 12: _PrestoToExample
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def _PrestoToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    exec_properties: Dict[Text, Any],
    split_pattern: Text) -> beam.pvalue.PCollection:
  """Read from Presto and transform to TF examples.

  Args:
    pipeline: beam pipeline.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, a Presto sql string.

  Returns:
    PCollection of TF examples.
  """
  conn_config = example_gen_pb2.CustomConfig()
  json_format.Parse(exec_properties['custom_config'], conn_config)
  presto_config = presto_config_pb2.PrestoConnConfig()
  conn_config.custom_config.Unpack(presto_config)
  client = _deserialize_conn_config(presto_config)
  return (pipeline
          | 'Query' >> beam.Create([split_pattern])
          | 'QueryTable' >> beam.ParDo(_ReadPrestoDoFn(client))
          | 'ToTFExample' >> beam.Map(_row_to_example))
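_ReadPrestoDoFn turns a single SQL string element into many result rows; its implementation is not shown here. Assuming the client follows the usual DB-API cursor interface, its shape is roughly:

import apache_beam as beam

class ReadPrestoSketchDoFn(beam.DoFn):
  """Hypothetical sketch: one SQL query in, many result rows out."""

  def __init__(self, client):
    self._client = client  # assumed to expose a DB-API style cursor()

  def process(self, query):
    cursor = self._client.cursor()
    cursor.execute(query)
    for row in cursor.fetchall():
      yield row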
Example 13: _ToArrowRecordBatches
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def _ToArrowRecordBatches(
    pcoll: beam.pvalue.PCollection,
    schema: Optional[schema_pb2.Schema]) -> beam.pvalue.PCollection:
  """Converts serialized examples to Arrow RecordBatches.

  Args:
    pcoll: PCollection of Transformed data.
    schema: schema.

  Returns:
    PCollection of Arrow RecordBatches.
  """
  kwargs = tfdv.utils.batch_util.GetBeamBatchKwargs(
      tft_beam.Context.get_desired_batch_size())
  return (
      pcoll
      | 'Values' >> beam.Values()
      | 'BatchElements' >> beam.BatchElements(**kwargs)
      | 'ToArrowRecordBatches' >> beam.ParDo(
          Executor._ToArrowRecordBatchesFn(schema)))
Example 14: shuffle
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def shuffle(p):
  """Shuffles data from PCollection.

  Args:
    p: PCollection.

  Returns:
    PCollection of shuffled data.
  """

  class _AddRandomKey(beam.DoFn):

    def process(self, element):
      yield random.random(), element

  shuffled_data = (
      p
      | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
      | 'GroupByRandom' >> beam.GroupByKey()
      # Drop the random key and emit the grouped values (kv = (key, values)).
      | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
  return shuffled_data
Example 15: shuffle_data
# Module to import: import apache_beam [as alias]
# Or: from apache_beam import ParDo [as alias]
def shuffle_data(p):
  """Shuffles data from PCollection.

  Args:
    p: PCollection.

  Returns:
    PCollection of shuffled data.
  """

  class _AddRandomKey(beam.DoFn):

    def process(self, element):
      yield (random.random(), element)

  shuffled_data = (
      p
      | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
      | 'GroupByRandom' >> beam.GroupByKey()
      # Drop the random key and emit the grouped values (kv = (key, values)).
      | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
  return shuffled_data
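The last two snippets implement the shuffle by hand with a random key plus GroupByKey. Recent Beam SDKs also ship a built-in beam.Reshuffle() transform (it already appears in Example 5) that internally redistributes elements with random keys, so, assuming a reasonably current SDK, a similar though not identical effect can be written as:

shuffled_data = p | 'Shuffle' >> beam.Reshuffle()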