本文整理汇总了Python中apache_beam.CoGroupByKey方法的典型用法代码示例。如果您正苦于以下问题:Python apache_beam.CoGroupByKey方法的具体用法?Python apache_beam.CoGroupByKey怎么用?Python apache_beam.CoGroupByKey使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块apache_beam的用法示例。
在下文中一共展示了apache_beam.CoGroupByKey方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _join_x_counts
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import CoGroupByKey [as 别名]
def _join_x_counts(
    join_info: Tuple[_SlicedXKey, Dict[Text, Sequence[Any]]]
    # TODO(b/147153346) update dict value list element type annotation to:
    #   Union[_CountType, Tuple[_YType, _CountType]]
) -> Iterator[Tuple[_SlicedYKey, _ConditionalYRate]]:
  """Joins x_count with all xy_counts for that x.

  This function expects the result of a CoGroupByKey keyed by a
  (slice_key, x_path, x) tuple. One grouped stream ('x_count') holds at most
  one element: the number of examples in the slice for which x is present in
  x_path. The other stream ('xy_counts') holds every (y, count) pair for that
  x, where count is the number of examples containing both x and y in their
  respective paths. Schematically, join_info looks like:

    (slice, x_path, x), {'x_count': [x_count],
                         'xy_counts': [(y_1, xy_1_count), ...,
                                       (y_k, xy_k_count)]}

  If x_count is absent (e.g. x was filtered out for falling below
  min_x_count), nothing is yielded.

  Args:
    join_info: A CoGroupByKey result.

  Yields:
    Per-(slice, x_path, y, x) tuples of the form (_SlicedYKey(slice, y),
    _ConditionalYRate(x_path, x, xy_count, x_count)).
  """
  x_key, grouped = join_info
  x_count_stream = grouped['x_count']
  if not x_count_stream:
    # x was dropped upstream; there is no denominator, so emit nothing.
    return
  x_count = x_count_stream[0]
  for y, xy_count in grouped['xy_counts']:
    sliced_y = _SlicedYKey(x_key.slice_key, y)
    conditional_rate = _ConditionalYRate(
        x_path=x_key.x_path, x=x_key.x, xy_count=xy_count, x_count=x_count)
    yield sliced_y, conditional_rate
示例2: _join_example_counts
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import CoGroupByKey [as 别名]
def _join_example_counts(
    join_info: Tuple[types.SliceKey, Dict[Text, Sequence[Any]]]
    # TODO(b/147153346) update dict value list element type annotation to:
    #   Union[_CountType, Tuple[_YType, _CountType]]
) -> Iterator[Tuple[_SlicedYKey, _YRate]]:
  """Joins slice example count with all values of y within that slice.

  This function expects the result of a CoGroupByKey keyed by slice_key. One
  grouped stream ('example_count') holds at most one element: the total number
  of examples within the slice. The other stream ('y_counts') holds every
  (y, count) pair for the slice, where count is the number of examples in
  which that y value appears. Schematically, join_info looks like:

    slice_key, {'example_count': [example_count],
                'y_counts': [(y_1, y_1_count), ..., (y_k, y_k_count)]}

  If the slice has no example count (a key present only on the 'y_counts'
  side of the join), nothing is yielded.

  Args:
    join_info: A CoGroupByKey result.

  Yields:
    Per-(slice, y) tuples (_SlicedYKey(slice, y),
    _YRate(y_count, example_count)).
  """
  slice_key, join_inputs = join_info
  # Guard against an empty 'example_count' stream, consistent with
  # _join_x_counts: without this, a y-only key raises IndexError below.
  if not join_inputs['example_count']:
    return
  example_count = join_inputs['example_count'][0]
  for y, y_count in join_inputs['y_counts']:
    yield _SlicedYKey(slice_key, y), _YRate(y_count, example_count)
示例3: _compute_lifts
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import CoGroupByKey [as 别名]
def _compute_lifts(
    join_info: Tuple[_SlicedYKey, Dict[Text, Sequence[Any]]]
    # TODO(b/147153346) update dict value list element type annotation to:
    #   Sequence[Union[_YRate, _ConditionalYRate]]
) -> Iterator[Tuple[_SlicedFeatureKey, _LiftInfo]]:
  """Joins y_counts with all x-y pairs for that y and computes lift.

  This function expects the result of a CoGroupByKey keyed by a
  (slice_key, y) tuple. One grouped stream ('y_rate') holds a single element,
  the y_rate for that value of y; the other ('conditional_y_rate') holds
  every conditional_y_rate for that same y. Schematically, join_info looks
  like:

    (slice_key, y), {'y_rate': [y_count, example_count],
                     'conditional_y_rate': [
                         (x_path_1, x_1, x_1_y_count, x_1_count), ...,
                         (x_path_1, x_k, x_k_y_count, x_k_count),
                         ...
                         (x_path_m, x_1, x_1_y_count, x_1_count), ...,
                         (x_path_m, x_k, x_k_y_count, x_k_count)]}

  Args:
    join_info: A CoGroupByKey result.

  Yields:
    Per-(slice, x_path) tuples of the form ((slice_key, x_path),
    _LiftInfo(x, y, lift, xy_count, x_count, y_count)).
  """
  (slice_key, y), grouped = join_info
  y_rate = grouped['y_rate'][0]
  # P(y): rate of y across the whole slice. Loop-invariant, so hoisted.
  overall_y_rate = float(y_rate.y_count) / y_rate.example_count
  for cond_rate in grouped['conditional_y_rate']:
    # lift = P(y|x) / P(y).
    y_rate_given_x = float(cond_rate.xy_count) / cond_rate.x_count
    lift = y_rate_given_x / overall_y_rate
    feature_key = _SlicedFeatureKey(slice_key, cond_rate.x_path)
    yield feature_key, _LiftInfo(
        x=cond_rate.x,
        y=y,
        lift=lift,
        xy_count=cond_rate.xy_count,
        x_count=cond_rate.x_count,
        y_count=y_rate.y_count)
示例4: expand
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import CoGroupByKey [as 别名]
def expand(self, x_counts_and_ys: Tuple[Tuple[_SlicedXKey, _CountType],
                                        _SlicedYKey]):
  """Cross-joins every y in a slice with every (x_path, x) in that slice.

  Both inputs are re-keyed by slice_key alone, co-grouped, and then crossed
  to produce one placeholder entry per (slice, x_path, x, y) combination.

  Args:
    x_counts_and_ys: A pair of PCollections: per-(slice, x_path, x) counts,
      and the set of _SlicedYKeys.

  Returns:
    A PCollection of (_SlicedXYKey(slice, x_path, x, y), 0) placeholders.
  """
  x_counts, y_keys = x_counts_and_ys
  # slice, y
  ys_by_slice = (
      y_keys
      | 'MoveYToValue_YKey' >> beam.Map(lambda key: (key.slice_key, key.y)))
  # slice, (x_path, x, x_count)
  counts_by_slice = (
      x_counts
      | 'MoveXToValue_XCountsKey' >> beam.MapTuple(
          lambda key, count: (key.slice_key, (key.x_path, key.x, count))))
  joined = {'y_keys': ys_by_slice, 'x_counts': counts_by_slice}
  # _SlicedXYKey(slice, x_path, x, y), 0
  return (joined
          | 'CoGroupByForPlaceholderYRates' >> beam.CoGroupByKey()
          | 'CrossXYValues' >> beam.FlatMap(_cross_join_y_keys))
# No typehint for input, since it's a multi-input PTransform for which Beam
# doesn't yet support typehints (BEAM-3280).
示例5: _lint
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import CoGroupByKey [as 别名]
def _lint(self, examples):
  """Returns the result of the TailedDistributionDetector linter.

  Flattens feature values out of the examples, computes trimmed means of the
  per-feature minima and maxima, co-groups the two, and maps the merged list
  to a lint result.

  Args:
    examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

  Returns:
    A `PTransform` that yields a `LintResult` of the format
      warnings: [feature names]
      lint_samples: [
        [stats: {min: feature_min if outlying, max: feature_max if outlying}]
        for each warning
      ]
  """
  flattened_values = (
      examples
      | 'FlattenFeatureValue' >> beam.FlatMap(
          self._flatten_feature_vals(self.numeric_features)))
  min_trimmed_means = flattened_values | self._make_trimmed_averager(
      self._MIN)
  max_trimmed_means = flattened_values | self._make_trimmed_averager(
      self._MAX)
  merged = ((min_trimmed_means, max_trimmed_means)
            | 'MergeTrimmedMeans' >> beam.CoGroupByKey())
  return (merged
          | 'AsList' >> beam.combiners.ToList()
          | 'ToResult' >> beam.Map(self._to_result))