This page collects typical usage examples of the Python method apache_beam.CombineGlobally. If you have been wondering what apache_beam.CombineGlobally does, or how to use it, the curated code samples below may help. You can also explore further usage examples from the apache_beam module.
Below are 15 code examples of apache_beam.CombineGlobally, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python samples.
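Before the examples, here is a minimal, self-contained sketch (not drawn from any of the projects below) of what beam.CombineGlobally does: it applies a combiner across an entire PCollection and produces a PCollection with a single element.

import apache_beam as beam

with beam.Pipeline() as p:
  total = (
      p
      | 'Create' >> beam.Create([1, 2, 3, 4])
      | 'Sum' >> beam.CombineGlobally(sum))
  # 'total' is a PCollection holding the single element 10.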
Example 1: expand
# Required module: import apache_beam [as alias]
# Or: from apache_beam import CombineGlobally [as alias]
def expand(self, inputs):
  pcoll, = inputs
  # We specify a fanout so that the packed combiner doesn't exhibit stragglers
  # during the 'reduce' phase when we have a lot of combine analyzers packed.
  fanout = int(math.ceil(math.sqrt(len(self._combiners))))
  # TODO(b/34792459): Don't set with_defaults.
  return (
      pcoll
      | 'InitialPackedCombineGlobally' >> beam.CombineGlobally(
          _PackedCombinerWrapper(
              self._combiners,
              self._tf_config,
              is_combining_accumulators=False
          )
      ).with_fanout(fanout).with_defaults(False)
      | 'Count' >>
      common.IncrementCounter('num_packed_accumulate_combiners'))
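The chained calls above are standard methods of the transform CombineGlobally returns: with_fanout(n) inserts an intermediate combining stage so no single worker has to reduce everything, and with_defaults(False), equivalent to without_defaults(), suppresses the default output for an empty input. A minimal sketch, independent of the TFT-specific wrapper above:

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(range(100))
      # Pre-combine on 10 intermediate keys, and emit nothing if the
      # input collection is empty.
      | beam.CombineGlobally(sum).with_fanout(10).with_defaults(False))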
Example 2: _lint
# Required module: import apache_beam [as alias]
# Or: from apache_beam import CombineGlobally [as alias]
def _lint(self, examples):
  """Returns the `PTransform` for the EmptyExampleDetector linter.

  Args:
    examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

  Returns:
    A `PTransform` that yields a `LintResult` of the format
      warnings: [num empties]
      lint_sample: None
  """
  n_empties = (
      examples
      | 'DetectEmpties' >> beam.Map(self._example_is_empty)
      | 'Count' >> beam.CombineGlobally(sum)
      | 'NoZero' >> beam.Filter(bool)
      | 'ToResult' >> beam.Map(
          lambda w: self._make_result(warnings=[str(w)])))
  return n_empties
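self._example_is_empty is not shown in this excerpt. A plausible sketch (hypothetical, not the linter's actual code) maps each tf.Example to 1 when it carries no feature values and 0 otherwise, so that CombineGlobally(sum) yields the number of empties:

import tensorflow as tf

def _example_is_empty(example):
  # 1 if no feature in the example holds any values, else 0 (hypothetical).
  return int(not any(
      f.bytes_list.value or f.float_list.value or f.int64_list.value
      for f in example.features.feature.values()))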
Example 3: expand
# Required module: import apache_beam [as alias]
# Or: from apache_beam import CombineGlobally [as alias]
def expand(self, pcoll):
  return pcoll | 'MergeHeaders' >> beam.CombineGlobally(
      _MergeHeadersFn(self._header_merger)).without_defaults()
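_MergeHeadersFn is defined elsewhere in the project; whatever its internals, anything passed to CombineGlobally like this must be a beam.CombineFn. A hedged skeleton, assuming the injected merger exposes a merge(accumulated, element) method:

import apache_beam as beam

class _MergeFnSketch(beam.CombineFn):
  """Hypothetical merge-style CombineFn; not the real _MergeHeadersFn."""

  def __init__(self, merger):
    self._merger = merger  # assumed API: merger.merge(acc, element) -> acc

  def create_accumulator(self):
    return None

  def add_input(self, acc, element):
    return element if acc is None else self._merger.merge(acc, element)

  def merge_accumulators(self, accumulators):
    merged = None
    for acc in accumulators:
      if acc is not None:
        merged = acc if merged is None else self._merger.merge(merged, acc)
    return merged

  def extract_output(self, acc):
    return acc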
Example 4: expand
# Required module: import apache_beam [as alias]
# Or: from apache_beam import CombineGlobally [as alias]
def expand(self, estimates):
  return (estimates
          | 'ExtractFileSize' >> beam.Map(
              lambda estimate: estimate.size_in_bytes)
          | 'SumFileSizes' >> beam.CombineGlobally(sum))
Example 5: expand
# Required module: import apache_beam [as alias]
# Or: from apache_beam import CombineGlobally [as alias]
def expand(self, pcoll):
  return pcoll | beam.CombineGlobally(
      _MergeDefinitionsFn(self._definitions_merger)).without_defaults()
Example 6: MakeSummary
# Required module: import apache_beam [as alias]
# Or: from apache_beam import CombineGlobally [as alias]
def MakeSummary(pcoll, metric_fn, metric_keys):  # pylint: disable=invalid-name
  """Summary PTransform used in Dataflow."""
  return (
      pcoll |
      "ApplyMetricFnPerInstance" >> beam.Map(metric_fn) |
      "PairWith1" >> beam.Map(lambda tup: tup + (1,)) |
      "SumTuple" >> beam.CombineGlobally(beam.combiners.TupleCombineFn(
          *([sum] * (len(metric_keys) + 1)))) |
      "AverageAndMakeDict" >> beam.Map(
          lambda tup: dict(
              [(name, tup[i] / tup[-1]) for i, name in enumerate(metric_keys)] +
              [("count", tup[-1])])))
Example 7: get_stats_of_glyphazzn
# Required module: import apache_beam [as alias]
# Or: from apache_beam import CombineGlobally [as alias]
def get_stats_of_glyphazzn(filepattern, output_path):
  """Computes the Mean and Std across examples in glyphazzn dataset."""
  def pipeline(root):
    """Pipeline for computing means/std from dataset."""
    examples = root | 'Read' >> beam.io.tfrecordio.ReadFromTFRecord(filepattern)
    examples = examples | 'Deserialize' >> beam.Map(_decode_tfexample)
    examples = examples | 'GetMeanStdev' >> beam.CombineGlobally(MeanStddev())
    examples = examples | 'MeanStdevToSerializedTFRecord' >> beam.Map(
        _mean_to_example)
    (examples | 'WriteToTFRecord' >> beam.io.tfrecordio.WriteToTFRecord(
        output_path, coder=beam.coders.ProtoCoder(tf.train.Example)))
  return pipeline
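MeanStddev is defined in the surrounding project and not shown here. A hedged sketch of such a CombineFn, written over scalar values for simplicity, accumulates (sum, sum of squares, count) and emits (mean, stddev):

import math
import apache_beam as beam

class MeanStddevSketch(beam.CombineFn):
  """Hypothetical stand-in for MeanStddev."""

  def create_accumulator(self):
    return (0.0, 0.0, 0)

  def add_input(self, acc, x):
    s, ss, n = acc
    return (s + x, ss + x * x, n + 1)

  def merge_accumulators(self, accumulators):
    sums, squares, counts = zip(*accumulators)
    return (sum(sums), sum(squares), sum(counts))

  def extract_output(self, acc):
    s, ss, n = acc
    if n == 0:
      return (float('NaN'), float('NaN'))
    mean = s / n
    return (mean, math.sqrt(max(ss / n - mean * mean, 0.0)))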
Example 8: expand
# Required module: import apache_beam [as alias]
# Or: from apache_beam import CombineGlobally [as alias]
def expand(self, pcoll):
  output_tuple = (
      pcoll
      | beam.FlatMap(self._flatten_fn)
      | beam.CombineGlobally(self._sum_fn)
      | beam.FlatMap(self._extract_outputs).with_outputs('0', '1'))
  return (output_tuple['0'], output_tuple['1'])
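_flatten_fn, _sum_fn, and _extract_outputs are not shown. The .with_outputs('0', '1') call is Beam's standard multi-output tagging, which implies _extract_outputs yields tagged values; a hypothetical sketch:

import apache_beam as beam

def _extract_outputs_sketch(summed):
  # Route the two halves of the combined result to the tagged outputs read
  # back as output_tuple['0'] and output_tuple['1'] above (hypothetical).
  yield beam.pvalue.TaggedOutput('0', summed[0])
  yield beam.pvalue.TaggedOutput('1', summed[1])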
Example 9: testEqual
# Required module: import apache_beam [as alias]
# Or: from apache_beam import CombineGlobally [as alias]
def testEqual(self):
  with TestPipeline() as p:
    tokens = p | beam.Create(self.sample_input)
    result = tokens | beam.CombineGlobally(utils.CalculateCoefficients(0.5))
    assert_that(result, equal_to([{'en': 1.0, 'fr': 1.0}]))
Example 10: testNotEqual
# Required module: import apache_beam [as alias]
# Or: from apache_beam import CombineGlobally [as alias]
def testNotEqual(self):
  with TestPipeline() as p:
    sample_input = [('I', 'en'), ('kind', 'en'), ('of', 'en'), ('like', 'en'),
                    ('to', 'en'), ('eat', 'en'), ('pie', 'en'), ('!', 'en'),
                    ('Je', 'fr'), ('suis', 'fr'), ('une', 'fr'),
                    ('fille', 'fr'), ('.', 'fr')]
    tokens = p | beam.Create(sample_input)
    result = (tokens
              | beam.CombineGlobally(utils.CalculateCoefficients(0.5))
              | beam.ParDo(CompareValues()))
    assert_that(result, equal_to([True]))
Example 11: testUnsorted
# Required module: import apache_beam [as alias]
# Or: from apache_beam import CombineGlobally [as alias]
def testUnsorted(self):
  with TestPipeline() as p:
    tokens = p | 'CreateInput' >> beam.Create(self.sample_input)
    result = tokens | beam.CombineGlobally(utils.SortByCount())
    assert_that(result, equal_to([[('c', 9), ('a', 5), ('d', 4), ('b', 2)]]))
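utils.SortByCount is not shown, but the expected output pins down its behavior: gather all (key, count) pairs and emit a single list sorted by count, descending. A hedged sketch with the same observable behavior:

import apache_beam as beam

class SortByCountSketch(beam.CombineFn):
  """Hypothetical stand-in for utils.SortByCount."""

  def create_accumulator(self):
    return []

  def add_input(self, acc, element):
    acc.append(element)  # element is a (key, count) pair
    return acc

  def merge_accumulators(self, accumulators):
    return [pair for acc in accumulators for pair in acc]

  def extract_output(self, acc):
    return sorted(acc, key=lambda pair: pair[1], reverse=True)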
Example 12: check_size
# Required module: import apache_beam [as alias]
# Or: from apache_beam import CombineGlobally [as alias]
def check_size(p, name, path):
  """Performs checks on the input pipeline and stores stats in specified path.

  Checks performed: counts rows and derives class distribution.

  Args:
    p: PCollection, input pipeline.
    name: string, unique identifier for the beam step.
    path: string, path to store stats.

  Returns:
    PCollection
  """

  class _Combine(beam.CombineFn):
    """Counts and takes the average of positive classes in the pipeline."""

    def create_accumulator(self):
      return (0.0, 0.0)

    def add_input(self, sum_count, inputs):
      (s, count) = sum_count
      return s + inputs, count + 1

    def merge_accumulators(self, accumulators):
      sums, counts = zip(*accumulators)
      return sum(sums), sum(counts)

    # We should not consider the case count == 0 as an error (class initialized
    # with count == 0).
    def extract_output(self, sum_count):
      (s, count) = sum_count
      return count, (1.0 * s / count) if count else float('NaN')

  return (p
          | 'CheckMapTo_1_{}'.format(name) >>
          beam.Map(lambda x: x[constants.LABEL_COLUMN])
          | 'CheckSum_{}'.format(name) >> beam.CombineGlobally(_Combine())
          | 'CheckRecord_{}'.format(name) >> beam.io.WriteToText(
              '{}.txt'.format(path)))
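A quick hedged walk-through of _Combine, imagining the class lifted to module scope so it can be exercised directly: for labels [1, 0, 1] the accumulator ends at (2.0, 3.0), and extract_output returns the row count and the positive-class rate:

combiner = _Combine()
acc = combiner.create_accumulator()
for label in [1, 0, 1]:
  acc = combiner.add_input(acc, label)
print(combiner.extract_output(acc))  # -> (3.0, 0.666...)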
Example 13: expand
# Required module: import apache_beam [as alias]
# Or: from apache_beam import CombineGlobally [as alias]
def expand(self, pcoll):
  to_dict = lambda x: {x[0]: x[1]}
  example_counts = (
      pcoll
      | "count_examples" >> beam.combiners.Count.Globally()
      | "key_example_counts" >> beam.Map(
          lambda x: ("examples", x))
      | "example_count_dict" >> beam.Map(to_dict))

  def _count_tokens(pcoll, feat):
    return (
        pcoll
        | "key_%s_toks" % feat >> beam.Map(
            lambda x:  # pylint:disable=g-long-lambda
            ("%s_tokens" % feat, int(sum(x[feat] > 1)) if feat in x else 0)))

  token_counts = (
      [_count_tokens(pcoll, feat)
       for feat in self._output_features]
      | "flatten_tokens" >> beam.Flatten()
      | "count_tokens" >> beam.CombinePerKey(sum)
      | "token_count_dict" >> beam.Map(to_dict))

  def _merge_dicts(dicts):
    merged_dict = {}
    for d in dicts:
      assert not set(merged_dict).intersection(d)
      merged_dict.update(d)
    return merged_dict

  return (
      [example_counts, token_counts]
      | "flatten_counts" >> beam.Flatten()
      | "merge_stats" >> beam.CombineGlobally(_merge_dicts))
Example 14: test_invalid_row
# Required module: import apache_beam [as alias]
# Or: from apache_beam import CombineGlobally [as alias]
def test_invalid_row(self):
  input_lines = ['1,2.0,hello', '5,12.34']
  column_names = ['int_feature', 'float_feature', 'str_feature']
  with self.assertRaisesRegex(  # pylint: disable=g-error-prone-assert-raises
      ValueError, '.*Columns do not match specified csv headers.*'):
    with beam.Pipeline() as p:
      result = (
          p | beam.Create(input_lines, reshuffle=False)
          | beam.ParDo(csv_decoder.ParseCSVLine(delimiter=','))
          | beam.Keys()
          | beam.CombineGlobally(
              csv_decoder.ColumnTypeInferrer(
                  column_names, skip_blank_lines=False)))
      beam_test_util.assert_that(result, lambda _: None)
Example 15: word_count
# Required module: import apache_beam [as alias]
# Or: from apache_beam import CombineGlobally [as alias]
def word_count(input_path, output_path, raw_metadata, min_token_frequency=2):
  """Returns a pipeline counting words and writing the output.

  Args:
    input_path: recordio file to read
    output_path: path in which to write the output
    raw_metadata: metadata of input tf.Examples
    min_token_frequency: the min frequency for a token to be included
  """
  lang_set = set(FLAGS.lang_set.split(','))

  # Create pipeline.
  pipeline = beam.Pipeline()

  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    converter = tft.coders.ExampleProtoCoder(
        raw_metadata.schema, serialized=False)

    # Read raw data and convert to TF Transform encoded dict.
    raw_data = (
        pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            input_path, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(converter.decode))

    # Apply TF Transform.
    (transformed_data, _), _ = (
        (raw_data, raw_metadata)
        | 'FilterLangAndExtractToken' >> tft_beam.AnalyzeAndTransformDataset(
            utils.count_preprocessing_fn(FLAGS.text_key,
                                         FLAGS.language_code_key)))

    # Filter by languages.
    tokens = (
        transformed_data
        | 'FilterByLang' >> beam.ParDo(utils.FilterTokensByLang(lang_set)))

    # Calculate smoothing coefficients.
    coeffs = (
        tokens
        | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
            utils.CalculateCoefficients(FLAGS.smoothing_exponent)))

    # Apply smoothing, aggregate counts, and sort words by count.
    _ = (
        tokens
        | 'ApplyExponentialSmoothing' >> beam.ParDo(
            utils.ExponentialSmoothing(), beam.pvalue.AsSingleton(coeffs))
        | 'SumCounts' >> beam.CombinePerKey(sum)
        | 'FilterLowCounts' >> beam.ParDo(utils.FilterByCount(
            FLAGS.max_word_length, min_token_frequency))
        | 'MergeAndSortCounts' >> beam.CombineGlobally(utils.SortByCount())
        | 'Flatten' >> beam.FlatMap(lambda x: x)
        | 'FormatCounts' >> beam.Map(lambda tc: '%s\t%s' % (tc[0], tc[1]))
        | 'WriteSortedCount' >> beam.io.WriteToText(
            output_path, shard_name_template=''))

  return pipeline
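word_count builds the pipeline but does not run it. A typical caller (the paths here are placeholders) would do:

pipeline = word_count('/tmp/tokens.tfrecord', '/tmp/word_counts.txt',
                      raw_metadata)
result = pipeline.run()
result.wait_until_finish()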