This article collects typical code examples showing how the apache_beam.Flatten method is used in Python. If you are wondering what apache_beam.Flatten does, how to call it, or where it is useful, the curated code examples below may help. You can also explore other usage examples from the apache_beam module.
A total of 15 code examples of apache_beam.Flatten are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
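Before the project-specific examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the pipeline and step labels are illustrative) of what beam.Flatten does: it merges several PCollections of the same element type into a single PCollection.

import apache_beam as beam

with beam.Pipeline() as p:
  evens = p | 'CreateEvens' >> beam.Create([2, 4, 6])
  odds = p | 'CreateOdds' >> beam.Create([1, 3, 5])
  # Flatten takes a tuple (or list) of PCollections and emits one merged PCollection.
  merged = (
      (evens, odds)
      | 'MergeEvensAndOdds' >> beam.Flatten()
      | 'PrintElements' >> beam.Map(print))

Most of the examples that follow use the same pattern: build two or more PCollections, group them in a tuple or list, and pipe that collection into beam.Flatten() before further processing.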
Example 1: _get_inferred_headers
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Flatten [as alias]
def _get_inferred_headers(variants,  # type: pvalue.PCollection
                          merged_header  # type: pvalue.PCollection
                         ):
  # type: (...) -> (pvalue.PCollection, pvalue.PCollection)
  inferred_headers = (variants
                      | 'FilterVariants' >> filter_variants.FilterVariants()
                      | 'InferHeaderFields' >>
                      infer_headers.InferHeaderFields(
                          pvalue.AsSingleton(merged_header),
                          allow_incompatible_records=True,
                          infer_headers=True))
  merged_header = (
      (inferred_headers, merged_header)
      | beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          allow_incompatible_records=True))
  return inferred_headers, merged_header
Example 2: add_annotation_headers
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Flatten [as alias]
def add_annotation_headers(pipeline, known_args, pipeline_mode,
                           merged_header,
                           annotated_vcf_pattern):
  if pipeline_mode == PipelineModes.LARGE:
    annotation_headers = (pipeline
                          | 'ReadAnnotatedVCF'
                          >> beam.Create([annotated_vcf_pattern])
                          | 'ReadHeaders' >> vcf_header_io.ReadAllVcfHeaders())
  else:
    annotation_headers = (
        pipeline
        | 'ReadHeaders'
        >> vcf_header_io.ReadVcfHeaders(annotated_vcf_pattern))
  merged_header = (
      (merged_header, annotation_headers)
      | beam.Flatten()
      | 'MergeWithOriginalHeaders' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header
Example 3: expand
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Flatten [as alias]
def expand(
    self,
    sliced_record_batchs: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
  unweighted_protos = (
      sliced_record_batchs
      | 'ComputeUnweightedLift' >> self._unweighted_generator)
  if not self._weight_column_name:
    # If no weight column name is given, only compute unweighted lift.
    return unweighted_protos

  weighted_protos = (
      sliced_record_batchs
      | 'ComputeWeightedLift' >> self._weighted_generator)
  return ((unweighted_protos, weighted_protos)
          | 'MergeUnweightedAndWeightedProtos' >> beam.Flatten())
Example 4: get_sources_from_dataset
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Flatten [as alias]
def get_sources_from_dataset(p, dataset, mode):
  """get pcollection from dataset."""

  import apache_beam as beam
  import csv
  from google.datalab.ml import CsvDataSet, BigQueryDataSet

  check_dataset(dataset, mode)
  if type(dataset) is CsvDataSet:
    source_list = []
    for ii, input_path in enumerate(dataset.files):
      source_list.append(p | 'Read from Csv %d (%s)' % (ii, mode) >>
                         beam.io.ReadFromText(input_path, strip_trailing_newlines=True))
    return (source_list |
            'Flatten Sources (%s)' % mode >>
            beam.Flatten() |
            'Create Dict from Csv (%s)' % mode >>
            beam.Map(lambda line: csv.DictReader([line], fieldnames=['image_url',
                                                                     'label']).next()))
  elif type(dataset) is BigQueryDataSet:
    bq_source = (beam.io.BigQuerySource(table=dataset.table) if dataset.table is not None else
                 beam.io.BigQuerySource(query=dataset.query))
    return p | 'Read source from BigQuery (%s)' % mode >> beam.io.Read(bq_source)
  else:
    raise ValueError('Invalid DataSet. Expect CsvDataSet or BigQueryDataSet')
Example 5: expand
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Flatten [as alias]
def expand(self, examples):
  """Runs the linters on the data and writes out the results.

  The order in which the linters run is unspecified.

  Args:
    examples: A `PTransform` that yields a `PCollection` of `tf.Examples`.

  Returns:
    A pipeline containing the `DataLinter` `PTransform`s.
  """
  coders = (beam.coders.coders.StrUtf8Coder(),
            beam.coders.coders.ProtoCoder(lint_result_pb2.LintResult))
  return (
      [examples | linter for linter in self._linters if linter.should_run()]
      | 'MergeResults' >> beam.Flatten()
      | 'DropEmpty' >> beam.Filter(lambda (_, r): r and len(r.warnings))
      | 'ToDict' >> beam.combiners.ToDict()
      | 'WriteResults' >> beam.io.textio.WriteToText(
          self._results_path,
          coder=beam.coders.coders.PickleCoder(),
          shard_name_template=''))
Example 6: _add_inferred_headers
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Flatten [as alias]
def _add_inferred_headers(all_patterns,  # type: List[str]
                          pipeline,  # type: beam.Pipeline
                          known_args,  # type: argparse.Namespace
                          merged_header,  # type: pvalue.PCollection
                          pipeline_mode  # type: int
                         ):
  # type: (...) -> pvalue.PCollection
  annotation_fields_to_infer = (known_args.annotation_fields if
                                known_args.infer_annotation_types else [])
  inferred_headers = (
      _read_variants(all_patterns,
                     pipeline,
                     known_args,
                     pipeline_mode,
                     pre_infer_headers=known_args.infer_headers)
      | 'FilterVariants' >> filter_variants.FilterVariants(
          reference_names=known_args.reference_names)
      | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
          pvalue.AsSingleton(merged_header),
          known_args.allow_incompatible_records,
          known_args.infer_headers,
          annotation_fields_to_infer))
  merged_header = (
      (inferred_headers, merged_header)
      | 'FlattenHeaders' >> beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header
Example 7: main
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Flatten [as alias]
def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  def pipeline(root):
    """Beam pipeline for preprocessing open images."""
    assert FLAGS.input_file_patterns
    assert FLAGS.output_dir
    assert FLAGS.output_name
    assert FLAGS.num_shards

    # Create Pipeline.
    tfrecords = []
    for i, file_pattern in enumerate(FLAGS.input_file_patterns.split(",")):
      logging.info("Reading TFRecords from %s", file_pattern)
      stage_name = "read_tfrecords_{}".format(i)
      tfrecords.append(root | stage_name >> beam.io.tfrecordio.ReadFromTFRecord(
          file_pattern, coder=beam.coders.ProtoCoder(tf.train.Example)))

    # pylint: disable=expression-not-assigned
    (tfrecords
     | "flatten" >> beam.Flatten()
     | "count_labels" >> beam.ParDo(CountLabelsDoFn())
     | "reshuffle" >> beam.Reshuffle()
     | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(FLAGS.output_dir, FLAGS.output_name),
         coder=beam.coders.ProtoCoder(tf.train.Example),
         num_shards=FLAGS.num_shards))
    # pylint: enable=expression-not-assigned

  pipeline.run()
  logging.info("Processing complete.")
Example 8: _labels_pipeline
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Flatten [as alias]
def _labels_pipeline(sources):
  labels = (sources |
            'Flatten Sources for labels' >> beam.Flatten() |
            'Parse input for labels' >> beam.Map(lambda x: str(x['label'])) |
            'Combine labels' >> beam.transforms.combiners.Count.PerElement() |
            'Get labels' >> beam.Map(lambda label_count: label_count[0]))
  return labels
Example 9: configure_pipeline
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Flatten [as alias]
def configure_pipeline(p, dataset_train, dataset_eval, checkpoint_path, output_dir, job_id):
  source_train = _util.get_sources_from_dataset(p, dataset_train, 'train')
  labels_source = [source_train]
  if dataset_eval is not None:
    source_eval = _util.get_sources_from_dataset(p, dataset_eval, 'eval')
    labels_source.append(source_eval)

  labels = _labels_pipeline(labels_source)
  train_preprocessed = _transformation_pipeline(source_train, checkpoint_path, labels, 'train')
  if dataset_eval is not None:
    # explicit eval data.
    eval_preprocessed = _transformation_pipeline(source_eval, checkpoint_path, labels, 'eval')
  else:
    # Split train/eval.
    train_preprocessed, eval_preprocessed = (train_preprocessed |
                                             'Random Partition' >>
                                             beam.Partition(TrainEvalSplitPartitionFn(), 2))

  output_train_path = os.path.join(output_dir, job_id, 'train')
  output_eval_path = os.path.join(output_dir, job_id, 'eval')
  labels_file = os.path.join(output_dir, job_id, 'labels')
  labels_save = (labels |
                 'Write labels' >>
                 beam.io.textio.WriteToText(labels_file, shard_name_template=''))
  train_save = train_preprocessed | 'Save train to disk' >> SaveFeatures(output_train_path)
  eval_save = eval_preprocessed | 'Save eval to disk' >> SaveFeatures(output_eval_path)

  # Make sure we write "latest" file after train and eval data are successfully written.
  output_latest_file = os.path.join(output_dir, 'latest')
  ([eval_save, train_save, labels_save] | 'Wait for train eval saving' >> beam.Flatten() |
   'Fixed One' >> beam.transforms.combiners.Sample.FixedSizeGlobally(1) |
   beam.Map(lambda path: job_id) |
   'WriteLatest' >> beam.io.textio.WriteToText(output_latest_file, shard_name_template=''))
Example 10: ReadAndShuffleData
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Flatten [as alias]
def ReadAndShuffleData(pcoll, filepatterns):
  """Read a train or test dataset from disk and shuffle it."""
  # NOTE: we pass filepatterns as a tuple instead of two args, as the current
  # version of beam assumes that if the first arg to a ptransform_fn is a
  # string, then that string is the label.
  neg_filepattern, pos_filepattern = filepatterns

  # Read from each file pattern and create a tuple of the review text and the
  # correct label.
  negative_examples = (
      pcoll
      | 'ReadNegativeExamples' >> beam.io.ReadFromText(neg_filepattern)
      | 'PairWithZero' >> beam.Map(lambda review: (review, 0)))
  positive_examples = (
      pcoll
      | 'ReadPositiveExamples' >> beam.io.ReadFromText(pos_filepattern)
      | 'PairWithOne' >> beam.Map(lambda review: (review, 1)))
  all_examples = (
      [negative_examples, positive_examples] | 'Merge' >> beam.Flatten())

  # Shuffle the data. Note that the data does in fact contain duplicate reviews
  # for reasons that are unclear. This means that NUM_TRAIN_INSTANCES and
  # NUM_TEST_INSTANCES are slightly wrong for the preprocessed data.
  # pylint: disable=no-value-for-parameter
  shuffled_examples = (
      all_examples
      | 'Distinct' >> beam.Distinct()
      | 'Shuffle' >> Shuffle())

  # Put the data in the format that can be accepted directly by tf.Transform.
  return shuffled_examples | 'MakeInstances' >> beam.Map(
      lambda p: {REVIEW_KEY: p[0], LABEL_KEY: p[1]})
Example 11: expand
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Flatten [as alias]
def expand(self, inputs):
  if self._top_k is not None and self._top_k < 0:
    raise ValueError('top_k for VocabularyImpl should be >= 0 or None, got '
                     '{}.'.format(self._top_k))
  if self._frequency_threshold is not None and self._frequency_threshold < 0:
    raise ValueError(
        'frequency_threshold for VocabularyImpl should be >= 0 or None, '
        'got {}.'.format(self._frequency_threshold))
  if self._coverage_top_k is not None and self._coverage_top_k < 0:
    raise ValueError('coverage_top_k for VocabularyImpl should be >= 0 or '
                     'None, got {}.'.format(self._coverage_top_k))
  if (self._coverage_frequency_threshold is not None and
      self._coverage_frequency_threshold < 0):
    raise ValueError(
        'coverage_frequency_threshold for VocabularyImpl should be >= 0 or '
        'None, got {}.'.format(self._coverage_frequency_threshold))

  pcoll, = inputs

  result = (
      pcoll | 'ApplyThresholdsAndTopK' >> (
          _ApplyThresholdsAndTopK(  # pylint: disable=no-value-for-parameter
              self._frequency_threshold, self._top_k,
              self._informativeness_threshold, None)))

  if self._key_fn:
    # Note: current APIs do not allow for specifying a coverage
    # informativeness threshold.
    coverage_counts = (
        pcoll | 'ApplyCoverageThresholdAndTopK' >> (
            _ApplyThresholdsAndTopK(  # pylint: disable=no-value-for-parameter
                self._coverage_frequency_threshold, self._coverage_top_k,
                self._coverage_informativeness_threshold, self._key_fn)))

    result = ((result, coverage_counts)
              | 'MergeStandardAndCoverageArms' >> beam.Flatten()
              | 'RemoveDuplicates' >> beam.RemoveDuplicates())

  return result
Example 12: expand
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Flatten [as alias]
def expand(self, pbegin):
  # TODO(b/151921205): we have to do an identity map for unmodified
  # PCollections below because otherwise we get an error from beam.
  identity_map = 'Identity' >> beam.Map(lambda x: x)
  if self._dataset_key.is_flattened_dataset_key():
    if self._flat_pcollection:
      return self._flat_pcollection | identity_map
    else:
      return (
          list(self._pcollection_dict.values())
          | 'FlattenAnalysisInputs' >> beam.Flatten(pipeline=pbegin.pipeline))
  else:
    return self._pcollection_dict[self._dataset_key] | identity_map
Example 13: expand
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Flatten [as alias]
def expand(self, pcoll):
  to_dict = lambda x: {x[0]: x[1]}
  example_counts = (
      pcoll
      | "count_examples" >> beam.combiners.Count.Globally()
      | "key_example_counts" >> beam.Map(
          lambda x: ("examples", x))
      | "example_count_dict" >> beam.Map(to_dict))

  def _count_tokens(pcoll, feat):
    return (
        pcoll
        | "key_%s_toks" % feat >> beam.Map(
            lambda x:  # pylint:disable=g-long-lambda
            ("%s_tokens" % feat, int(sum(x[feat] > 1)) if feat in x else 0)))

  token_counts = (
      [_count_tokens(pcoll, feat)
       for feat in self._output_features]
      | "flatten_tokens" >> beam.Flatten()
      | "count_tokens" >> beam.CombinePerKey(sum)
      | "token_count_dict" >> beam.Map(to_dict))

  def _merge_dicts(dicts):
    merged_dict = {}
    for d in dicts:
      assert not set(merged_dict).intersection(d)
      merged_dict.update(d)
    return merged_dict

  return (
      [example_counts, token_counts]
      | "flatten_counts" >> beam.Flatten()
      | "merge_stats" >> beam.CombineGlobally(_merge_dicts))
Example 14: expand
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Flatten [as alias]
def expand(self, sliced_extracts):

  def partition_fn(_, num_partitions):
    return self._random_state.randint(num_partitions)

  # Partition the data.
  # List[PCollection[Tuple[slicer.SliceKeyType, types.Extracts]]]
  partitions = (
      sliced_extracts
      | 'Partition' >> beam.Partition(partition_fn,
                                      self._num_jackknife_samples))

  def add_partition_index(slice_key,
                          accumulator_and_size,
                          partition_index=None):
    accumulator, size = accumulator_and_size
    return slice_key, _PartitionInfo(accumulator, size, partition_index)

  # Within each partition, partially combine per slice key to get accumulators
  # and partition sizes; add partition_id for determinism.
  # List[PCollection[slicer.SliceKeyType, _PartitionInfo]]
  partition_accumulators = []
  for i, partition in enumerate(partitions):
    partition_accumulators.append(
        partition
        | 'CombinePartition[{}]'.format(i) >> beam.CombinePerKey(
            beam.transforms.combiners.SingleInputTupleCombineFn(
                _AccumulateOnlyCombiner(combiner=self._combiner),
                beam.transforms.combiners.CountCombineFn()))
        | 'AddPartitionId[{}]'.format(i) >> beam.MapTuple(
            add_partition_index, i))

  # Group partitions for the same slice, compute LOO metrics, and flatten back
  # into per-partition LOO metrics.
  # (slicer.SliceKeyType, Tuple[metric_types.MetricsDict])
  return (partition_accumulators
          | 'FlattenPartitionAccumulators' >> beam.Flatten()
          | 'CollectPerSlicePartitions' >> beam.GroupByKey()
          | 'MakeJackknifeSamples' >> beam.FlatMap(
              _make_jackknife_samples, combiner=self._combiner))
Example 15: combine_dict_based_evaluations
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Flatten [as alias]
def combine_dict_based_evaluations(
    evaluations: Dict[Text, List[beam.pvalue.PCollection]]) -> Evaluation:
  """Combines multiple evaluation outputs together when the outputs are dicts.

  Note that the dict here refers to the output in the PCollection. The
  evaluations themselves are dicts of PCollections keyed by category
  ('metrics', 'plots', 'analysis', etc). This util is used to group the
  outputs of one or more of these evaluations where the PCollections
  themselves must be dicts. For example, a 'metrics' evaluation might store
  its output in a PCollection of dicts containing metric keys and metric
  values. This util would be used to group the outputs from running two or
  more independent metrics evaluations together into a single PCollection.

  Args:
    evaluations: Dict of lists of PCollections of outputs from different
      evaluators keyed by type of output ('metrics', 'plots', 'analysis', etc).

  Returns:
    Dict of consolidated PCollections of outputs keyed by type of output.
  """
  result = {}
  for k, v in evaluations.items():
    if len(v) == 1:
      result[k] = v[0]
      continue

    result[k] = (
        v
        | 'FlattenEvaluationOutput(%s)' % k >> beam.Flatten()
        | 'CombineEvaluationOutput(%s)' % k >> beam.CombinePerKey(
            _CombineEvaluationDictionariesFn()))
  return result