This page collects typical usage examples of the apache_beam.CombinePerKey method in Python. If you are wondering what apache_beam.CombinePerKey does, or how to use it, the hand-picked code examples below may help. You can also browse further usage examples from the apache_beam module, where this method is defined.
The following presents 15 code examples of apache_beam.CombinePerKey, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
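
Before the project-specific examples, a minimal, self-contained sketch (not drawn from any of the projects below) may help fix the basic shape of CombinePerKey: it takes a PCollection of (key, value) pairs and reduces the values of each key with the supplied combining function or CombineFn.

import apache_beam as beam

# Minimal illustration only; the data and step names here are made up.
with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'Create' >> beam.Create([('a', 1), ('b', 2), ('a', 3)])
      # Sums the values sharing a key, yielding ('a', 4) and ('b', 2).
      | 'SumPerKey' >> beam.CombinePerKey(sum)
      | 'Print' >> beam.Map(print))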

Example 1: expand

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def expand(self, pcoll: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
  """Estimates the user defined statistic."""
  return (
      pcoll
      | 'AssignBatchToPartition' >> beam.Map(
          _assign_to_partition, num_partitions=self._num_partitions)
      | 'GroupPartitionsIntoList' >> beam.CombinePerKey(
          beam.combiners.SampleCombineFn(self._max_batches_per_partition))
      | 'ProcessPartition' >> beam.Map(_process_partition,
                                       stats_fn=self._stats_fn)
      | 'ComputeMetaStats' >> beam.CombinePerKey(
          PartitionedStatisticsAnalyzer(min_partitions_stat_presence=self
                                        ._min_partitions_stat_presence)))

Example 2: expand

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def expand(self, sliced_record_batchs):
  # slice, example_count
  example_counts = (
      sliced_record_batchs
      | 'ToExampleCounts' >> beam.MapTuple(lambda k, v: (k, v.num_rows))
      | 'SumExampleCounts' >> beam.CombinePerKey(sum))

  def move_y_to_value(slice_and_y, y_count):
    slice_key, y = slice_and_y
    return slice_key, (y, y_count)

  # slice, (y, y_count)
  y_counts = (
      sliced_record_batchs
      | 'ToPartialYCounts' >>
      beam.FlatMap(_to_partial_counts, self._y_path, self._y_boundaries,
                   self._weight_column_name)
      | 'SumYCounts' >> beam.CombinePerKey(sum)
      | 'MoveYToValue' >> beam.MapTuple(move_y_to_value))

  # _SlicedYKey(slice, y), _YRate(y_count, example_count)
  return ({
      'y_counts': y_counts,
      'example_count': example_counts
  }
          | 'CoGroupByForYRates' >> beam.CoGroupByKey()
          | 'JoinExampleCounts' >> beam.FlatMap(_join_example_counts))
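
The example above joins two per-key aggregations with beam.CoGroupByKey. Stripped of the project-specific helpers, the join pattern looks roughly like the following sketch; the data and step names are invented for illustration.

import apache_beam as beam

with beam.Pipeline() as pipeline:
  example_counts = (
      pipeline
      | 'CreateExamples' >> beam.Create([('us', 1), ('us', 1), ('ca', 1)])
      | 'SumExamples' >> beam.CombinePerKey(sum))
  y_counts = (
      pipeline
      | 'CreateYs' >> beam.Create([('us', 1), ('ca', 1)])
      | 'SumYs' >> beam.CombinePerKey(sum))
  _ = (
      {'example_count': example_counts, 'y_count': y_counts}
      # Emits (key, {'example_count': [...], 'y_count': [...]}) per key.
      | 'Join' >> beam.CoGroupByKey()
      | 'Print' >> beam.Map(print))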

Example 3: expand

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def expand(self, inputs):
  pcoll, = inputs
  # Create a PCollection of (count, element) pairs, then iterate over
  # this to create a single element PCollection containing this list of
  # pairs in sorted order by decreasing counts (and by values for equal
  # counts).
  # TODO(b/112916494): Unify the graph in both cases once possible.
  if (self._vocab_ordering_type ==
      _VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
    flatten_map_fn = _flatten_to_key_and_means_accumulator_list
    combine_transform = _MutualInformationTransformAccumulate()  # pylint: disable=no-value-for-parameter
  elif self._vocab_ordering_type == _VocabOrderingType.WEIGHTED_FREQUENCY:
    flatten_map_fn = _flatten_value_and_weights_to_list_of_tuples
    combine_transform = beam.CombinePerKey(sum)
  elif self._vocab_ordering_type == _VocabOrderingType.WEIGHTED_LABELS:
    flatten_map_fn = _flatten_value_and_labeled_weights_to_list_of_tuples
    combine_transform = beam.CombinePerKey(sum_labeled_weights)
  else:
    flatten_map_fn = _flatten_value_to_list
    combine_transform = beam.combiners.Count.PerElement()

  result = (
      pcoll
      | 'FlattenTokensAndMaybeWeightsLabels' >> beam.FlatMap(flatten_map_fn)
      | 'CountPerToken' >> combine_transform)

  if self._input_dtype == tf.string:
    # TODO(b/62379925) Filter empty strings or strings containing the \n or \r
    # tokens since index_table_from_file doesn't allow empty rows.
    def is_problematic_string(kv):
      string, _ = kv  # Ignore counts.
      return string and b'\n' not in string and b'\r' not in string

    result |= 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)

  return result
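
Note that the fallback branch above uses beam.combiners.Count.PerElement() rather than beam.CombinePerKey. Conceptually the two are close: counting elements is equivalent to pairing each element with 1 and summing per key, as this assumed sketch (toy data, not from the project) illustrates.

import apache_beam as beam

with beam.Pipeline() as pipeline:
  tokens = pipeline | 'CreateTokens' >> beam.Create(['a', 'b', 'a'])
  # Built-in counter: emits ('a', 2) and ('b', 1).
  _ = tokens | 'CountPerElement' >> beam.combiners.Count.PerElement()
  # The same result spelled out with CombinePerKey.
  _ = (
      tokens
      | 'PairWithOne' >> beam.Map(lambda token: (token, 1))
      | 'SumPerKey' >> beam.CombinePerKey(sum))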

Example 4: _MutualInformationTransformAccumulate

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def _MutualInformationTransformAccumulate(pcol):  # pylint: disable=invalid-name
  """Accumulates information needed for mutual information computation."""
  return (pcol | 'VocabCountPerLabelPerTokenAccumulate' >> beam.CombinePerKey(
      _WeightedMeanCombineFn(output_shape=(None,))))
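
_WeightedMeanCombineFn is defined elsewhere in the project and is not shown here. As a rough sketch of the general shape of a custom CombineFn passed to beam.CombinePerKey (an illustration, not the actual tensorflow_transform implementation), a plain per-key mean could be written as follows.

import apache_beam as beam

class IllustrativeMeanFn(beam.CombineFn):
  """Toy per-key mean combiner; stands in for a real accumulator class."""

  def create_accumulator(self):
    return 0.0, 0  # (running sum, count)

  def add_input(self, accumulator, value):
    total, count = accumulator
    return total + value, count + 1

  def merge_accumulators(self, accumulators):
    totals, counts = zip(*accumulators)
    return sum(totals), sum(counts)

  def extract_output(self, accumulator):
    total, count = accumulator
    return total / count if count else float('nan')

# Usage: keyed_pcoll | 'MeanPerKey' >> beam.CombinePerKey(IllustrativeMeanFn())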

Example 5: _MutualInformationTransformMerge

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def _MutualInformationTransformMerge(  # pylint: disable=invalid-name
    pcol, use_adjusted_mutual_info, min_diff_from_avg):
  """Computes mutual information for each key using the given accumulators."""
  feature_accumulator_pcol = (
      pcol | 'VocabCountPerLabelPerTokenMerge' >> beam.CombinePerKey(
          _WeightedMeanCombineFn(output_shape=(None,))))

  accumulators_by_feature, global_accumulator = (
      feature_accumulator_pcol
      | 'ExtractSentinels' >> beam.FlatMap(_extract_sentinels).with_outputs(
          'feature', 'global'))

  if min_diff_from_avg is None:
    min_diff_from_avg = (
        global_accumulator | 'AutoMinDiffFromAvg' >>
        beam.Map(lambda acc: analyzers.calculate_recommended_min_diff_from_avg(  # pylint: disable=g-long-lambda
            acc.count * acc.weight)))
  min_diff_from_avg = beam.pvalue.AsSingleton(min_diff_from_avg)

  def _extract_merged_values(term, results):
    """Returns the key and tuple of (mutual information, frequency)."""
    # Ignore the second value, which is the Expected Mutual Info.
    (mi, _, frequency) = results
    return term, (mi, frequency)

  return (accumulators_by_feature
          | 'CalculateMutualInformationPerToken' >> beam.Map(
              _calculate_mutual_information_for_feature_value,
              beam.pvalue.AsSingleton(global_accumulator),
              use_adjusted_mutual_info=use_adjusted_mutual_info,
              min_diff_from_avg=min_diff_from_avg)
          | beam.MapTuple(_extract_merged_values))

Example 6: expand

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def expand(self, pcoll):
  to_dict = lambda x: {x[0]: x[1]}
  example_counts = (
      pcoll
      | "count_examples" >> beam.combiners.Count.Globally()
      | "key_example_counts" >> beam.Map(
          lambda x: ("examples", x))
      | "example_count_dict" >> beam.Map(to_dict))

  def _count_tokens(pcoll, feat):
    return (
        pcoll
        | "key_%s_toks" % feat >> beam.Map(
            lambda x:  # pylint:disable=g-long-lambda
            ("%s_tokens" % feat, int(sum(x[feat] > 1)) if feat in x else 0)))

  token_counts = (
      [_count_tokens(pcoll, feat)
       for feat in self._output_features]
      | "flatten_tokens" >> beam.Flatten()
      | "count_tokens" >> beam.CombinePerKey(sum)
      | "token_count_dict" >> beam.Map(to_dict))

  def _merge_dicts(dicts):
    merged_dict = {}
    for d in dicts:
      assert not set(merged_dict).intersection(d)
      merged_dict.update(d)
    return merged_dict

  return (
      [example_counts, token_counts]
      | "flatten_counts" >> beam.Flatten()
      | "merge_stats" >> beam.CombineGlobally(_merge_dicts))

Example 7: testTjurDiscriminationMetricsWithNan

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def testTjurDiscriminationMetricsWithNan(self, metric):
  computations = metric.computations()
  shared_metrics = computations[0]
  metric = computations[1]

  example = {
      'labels': np.array([0.0]),
      'predictions': np.array([1.0]),
      'example_weights': np.array([1.0]),
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeWeightedTotals' >> beam.CombinePerKey(
            shared_metrics.combiner)
        | 'ComputeMetric' >> beam.Map(lambda x: (x[0], metric.result(x[1]))))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        key = metric.keys[0]
        self.assertIn(key, got_metrics)
        self.assertTrue(math.isnan(got_metrics[key]))
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')

Example 8: testRaisesErrorWhenExampleWeightsDiffer

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def testRaisesErrorWhenExampleWeightsDiffer(self):
  with self.assertRaises(ValueError):
    metric = min_label_position.MinLabelPosition().computations(
        query_key='query')[0]

    query1_example1 = {
        'labels': np.array([0.0]),
        'predictions': np.array([0.2]),
        'example_weights': np.array([1.0]),
        'features': {
            'query': np.array(['query1'])
        }
    }
    query1_example2 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.8]),
        'example_weights': np.array([0.5]),
        'features': {
            'query': np.array(['query1'])
        }
    }

    def to_standard_metric_inputs_list(list_of_extracts):
      return [
          metric_util.to_standard_metric_inputs(e, True)
          for e in list_of_extracts
      ]

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      _ = (
          pipeline
          | 'Create' >> beam.Create([[query1_example1, query1_example2]])
          | 'Process' >> beam.Map(to_standard_metric_inputs_list)
          | 'AddSlice' >> beam.Map(lambda x: ((), x))
          | 'Combine' >> beam.CombinePerKey(metric.combiner))

Example 9: testSimpleMetric

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def testSimpleMetric(self):
  computation = tf_metric_wrapper.tf_metric_computations(
      [tf.keras.metrics.MeanSquaredError(name='mse')])[0]

  example = {
      'labels': [0, 0, 1, 1],
      'predictions': [0, 0.5, 0.3, 0.9],
      'example_weights': [1.0]
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'Combine' >> beam.CombinePerKey(computation.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        mse_key = metric_types.MetricKey(name='mse')
        self.assertDictElementsAlmostEqual(got_metrics, {mse_key: 0.1875})
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')

Example 10: testSparseMetric

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def testSparseMetric(self):
  computation = tf_metric_wrapper.tf_metric_computations([
      tf.keras.metrics.SparseCategoricalCrossentropy(
          name='sparse_categorical_crossentropy')
  ])[0]

  # Simulate a multi-class problem with 3 labels.
  example = {
      'labels': [1],
      'predictions': [0.3, 0.6, 0.1],
      'example_weights': [1.0]
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'Combine' >> beam.CombinePerKey(computation.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        key = metric_types.MetricKey(name='sparse_categorical_crossentropy')
        # -0*log(0.3) - 1*log(0.6) - 0*log(0.1) = 0.51083
        self.assertDictElementsAlmostEqual(got_metrics, {key: 0.51083})
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')

Example 11: testCustomTFMetric

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def testCustomTFMetric(self):
  metric = tf_metric_wrapper.tf_metric_computations([_CustomMetric()])[0]

  example1 = {'labels': [0.0], 'predictions': [0.2], 'example_weights': [1.0]}
  example2 = {'labels': [0.0], 'predictions': [0.8], 'example_weights': [1.0]}
  example3 = {'labels': [0.0], 'predictions': [0.5], 'example_weights': [2.0]}

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2, example3])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'Combine' >> beam.CombinePerKey(metric.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        custom_key = metric_types.MetricKey(name='custom')
        self.assertDictElementsAlmostEqual(
            got_metrics,
            {custom_key: (0.2 + 0.8 + 2 * 0.5) / (1.0 + 1.0 + 2.0)})
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')

Example 12: testExampleCount

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def testExampleCount(self):
  metric = example_count.ExampleCount().computations()[0]

  example1 = {}
  example2 = {}

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2])
        | 'Process' >> beam.ParDo(metric.preprocessor)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeMetric' >> beam.CombinePerKey(metric.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        example_count_key = metric_types.MetricKey(name='example_count')
        self.assertDictElementsAlmostEqual(got_metrics,
                                           {example_count_key: 2})
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')

Example 13: testSquaredPearsonCorrelationMetricsWithNan

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def testSquaredPearsonCorrelationMetricsWithNan(self):
  computations = (
      squared_pearson_correlation.SquaredPearsonCorrelation().computations())
  metric = computations[0]

  example = {
      'labels': np.array([0.0]),
      'predictions': np.array([1.0]),
      'example_weights': np.array([1.0]),
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeMetric' >> beam.CombinePerKey(metric.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        key = metric.keys[0]
        self.assertIn(key, got_metrics)
        self.assertTrue(math.isnan(got_metrics[key]))
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')

Example 14: _GroupByQueryKey

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def _GroupByQueryKey(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    query_key: Text,
) -> beam.pvalue.PCollection:
  """PTransform for grouping extracts by a query key.

  Args:
    extracts: Incoming PCollection consisting of extracts.
    query_key: Query key to group extracts by. Must be a member of the dict of
      features stored under tfma.FEATURES_KEY.

  Returns:
    PCollection of lists of extracts where each list is associated with the
    same query key.
  """
  missing_query_key_counter = beam.metrics.Metrics.counter(
      constants.METRICS_NAMESPACE, 'missing_query_key')

  def key_by_query_key(extracts: types.Extracts,
                       query_key: Text) -> Tuple[Text, types.Extracts]:
    """Extract the query key from the extract and key by that."""
    value = metric_util.to_scalar(
        util.get_by_keys(
            extracts, [constants.FEATURES_KEY, query_key], optional=True),
        tensor_name=query_key)
    if value is None:
      missing_query_key_counter.inc()
      return ('', extracts)
    return ('{}'.format(value), extracts)

  # pylint: disable=no-value-for-parameter
  return (extracts
          | 'KeyByQueryId' >> beam.Map(key_by_query_key, query_key)
          | 'GroupByKey' >> beam.CombinePerKey(beam.combiners.ToListCombineFn())
          | 'DropQueryId' >> beam.Map(lambda kv: kv[1]))
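
Despite the 'GroupByKey' step name, the transform above groups values by using beam.CombinePerKey with beam.combiners.ToListCombineFn(), which collects all values for a key into a single list. A minimal sketch of that trick, with made-up data:

import apache_beam as beam

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'Create' >> beam.Create([('q1', 'a'), ('q1', 'b'), ('q2', 'c')])
      # Yields ('q1', ['a', 'b']) and ('q2', ['c']); order within each
      # list is not guaranteed.
      | 'ToLists' >> beam.CombinePerKey(beam.combiners.ToListCombineFn())
      | 'Print' >> beam.Map(print))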

Example 15: expand

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def expand(self, sliced_extracts):

  def partition_fn(_, num_partitions):
    return self._random_state.randint(num_partitions)

  # Partition the data
  # List[PCollection[Tuple[slicer.SliceKeyType, types.Extracts]]]
  partitions = (
      sliced_extracts
      | 'Partition' >> beam.Partition(partition_fn,
                                      self._num_jackknife_samples))

  def add_partition_index(slice_key,
                          accumulator_and_size,
                          partition_index=None):
    accumulator, size = accumulator_and_size
    return slice_key, _PartitionInfo(accumulator, size, partition_index)

  # Within each partition, partially combine per slice key to get accumulators
  # and partition sizes; add partition_id for determinism.
  # List[PCollection[slicer.SliceKeyType, _PartitionInfo]]
  partition_accumulators = []
  for i, partition in enumerate(partitions):
    partition_accumulators.append(
        partition
        | 'CombinePartition[{}]'.format(i) >> beam.CombinePerKey(
            beam.transforms.combiners.SingleInputTupleCombineFn(
                _AccumulateOnlyCombiner(combiner=self._combiner),
                beam.transforms.combiners.CountCombineFn()))
        | 'AddPartitionId[{}]'.format(i) >> beam.MapTuple(
            add_partition_index, i))

  # Group partitions for the same slice, compute LOO metrics, and flatten back
  # into per-partition LOO metrics.
  # (slicer.SliceKeyType, Tuple[metric_types.MetricsDict])
  return (partition_accumulators
          | 'FlattenPartitionAccumulators' >> beam.Flatten()
          | 'CollectPerSlicePartitions' >> beam.GroupByKey()
          | 'MakeJackknifeSamples' >> beam.FlatMap(
              _make_jackknife_samples, combiner=self._combiner))
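
The SingleInputTupleCombineFn used above applies several combiners to the same per-key input and emits their results as a tuple. Outside the jackknife context, the pattern can be sketched like this (illustrative data and step names, not the actual tensorflow_model_analysis code):

import apache_beam as beam
from apache_beam.transforms import combiners

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'Create' >> beam.Create([('a', 1.0), ('a', 3.0), ('b', 5.0)])
      # Emits ('a', (2.0, 2)) and ('b', (5.0, 1)): a (mean, count) per key.
      | 'MeanAndCount' >> beam.CombinePerKey(
          combiners.SingleInputTupleCombineFn(
              combiners.MeanCombineFn(), combiners.CountCombineFn()))
      | 'Print' >> beam.Map(print))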