This article collects typical usage examples of the apache_beam.GroupByKey method in Python. If you are unsure what apache_beam.GroupByKey does, how to call it, or what it looks like in practice, the curated examples below may help. You can also explore further usage examples from the apache_beam module in which the method lives.
The following presents 15 code examples of apache_beam.GroupByKey, sorted by popularity.
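For orientation before the examples: beam.GroupByKey consumes a PCollection of (key, value) pairs and emits one (key, iterable-of-values) pair per distinct key. A minimal, self-contained sketch (the input data is made up for illustration):

import apache_beam as beam

with beam.Pipeline() as pipeline:
    _ = (
        pipeline
        | 'Create pairs' >> beam.Create([('a', 1), ('b', 2), ('a', 3)])
        # Yields ('a', [1, 3]) and ('b', [2]); value order is not guaranteed.
        | 'Group by key' >> beam.GroupByKey()
        | 'Print' >> beam.Map(print))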
Example 1: expand
# Required module: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def expand(self, pcoll):
    return (
        pcoll
        # Assigns window info to each Pub/Sub message based on its
        # publish timestamp.
        | "Window into Fixed Intervals"
        >> beam.WindowInto(window.FixedWindows(self.window_size))
        | "Add timestamps to messages" >> beam.ParDo(AddTimestamps())
        # Use a dummy key to group the elements in the same window.
        # Note that all the elements in one window must fit into memory
        # for this. If the windowed elements do not fit into memory,
        # please consider using `beam.util.BatchElements`.
        # https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.util.html#apache_beam.transforms.util.BatchElements
        | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
        | "Groupby" >> beam.GroupByKey()
        | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
    )
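Example 1 shows only the expand method; in its source it presumably sits inside a beam.PTransform subclass. A hedged sketch of such a wrapper, with the AddTimestamps step omitted because that DoFn is not shown here (the class name BatchElementsPerWindow is an assumption):

import apache_beam as beam
from apache_beam.transforms import window

class BatchElementsPerWindow(beam.PTransform):  # hypothetical wrapper
    def __init__(self, window_size):
        self.window_size = window_size  # window size in seconds

    def expand(self, pcoll):
        return (
            pcoll
            | beam.WindowInto(window.FixedWindows(self.window_size))
            | beam.Map(lambda elem: (None, elem))  # one dummy key per window
            | beam.GroupByKey()                    # one batch per window pane
            | beam.MapTuple(lambda _, vals: vals))

Applied as `pcoll | BatchElementsPerWindow(60)`, it emits one batch of elements per 60-second window.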
Example 2: run
# Required module: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def run(args, input_subscription, output_table, window_interval):
    """Builds and runs the pipeline."""
    options = PipelineOptions(args, save_main_session=True, streaming=True)
    with beam.Pipeline(options=options) as pipeline:
        # Read the messages from Pub/Sub and process them.
        messages = (
            pipeline
            | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
                subscription=input_subscription).with_output_types(bytes)
            | 'UTF-8 bytes to string' >> beam.Map(lambda msg: msg.decode('utf-8'))
            | 'Parse JSON messages' >> beam.Map(parse_json_message)
            | 'Fixed-size windows' >> beam.WindowInto(
                window.FixedWindows(int(window_interval), 0))
            | 'Add URL keys' >> beam.Map(lambda msg: (msg['url'], msg))
            | 'Group by URLs' >> beam.GroupByKey()
            | 'Get statistics' >> beam.Map(get_statistics))

        # Output the results to a BigQuery table.
        _ = messages | 'Write to Big Query' >> beam.io.WriteToBigQuery(
            output_table, schema=SCHEMA)
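Example 2 relies on parse_json_message, get_statistics, and SCHEMA being defined elsewhere in its source file. A hedged sketch of plausible definitions (the field names are illustrative assumptions, not the original code):

import json

SCHEMA = 'url:STRING, num_messages:INTEGER'  # hypothetical BigQuery schema

def parse_json_message(message):
    # Assumes each message is a JSON object with at least a 'url' field.
    return json.loads(message)

def get_statistics(url_messages):
    # Receives the (url, messages) pair produced by GroupByKey.
    url, messages = url_messages
    return {'url': url, 'num_messages': len(list(messages))}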
Example 3: shuffle
# Required module: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def shuffle(p):
    """Shuffles data from a PCollection.

    Args:
        p: PCollection.

    Returns:
        PCollection of shuffled data.
    """

    class _AddRandomKey(beam.DoFn):
        def process(self, element):
            yield random.random(), element

    shuffled_data = (
        p
        | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
        | 'GroupByRandom' >> beam.GroupByKey()
        # Python 3 removed tuple-unpacking lambdas, so index into the pair.
        | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
    return shuffled_data
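A quick usage sketch for the helper above (assumes the shuffle definition from Example 3 is in scope; the input range is illustrative):

import apache_beam as beam

with beam.Pipeline() as pipeline:
    shuffled = shuffle(pipeline | 'Create' >> beam.Create(range(10)))
    _ = shuffled | 'Print' >> beam.Map(print)

Note the design choice: each element gets its own random key, so GroupByKey mostly forms single-element groups; the shuffling effect comes from the runner redistributing elements by key during the group-by shuffle phase.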
Example 4: shuffle_data
# Required module: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def shuffle_data(p):
    """Shuffles data from a PCollection.

    Args:
        p: PCollection.

    Returns:
        PCollection of shuffled data.
    """

    class _AddRandomKey(beam.DoFn):
        def process(self, element):
            yield (random.random(), element)

    shuffled_data = (
        p
        | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
        | 'GroupByRandom' >> beam.GroupByKey()
        | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
    return shuffled_data
Example 5: get_enriched_events
# Required module: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def get_enriched_events(
        salesevent: beam.pvalue.PCollection,
        sideinput_collections: Dict[str, beam.pvalue.PCollection]
) -> beam.pvalue.PCollection:
    """Gets enriched events by:

    a) Calling a transform that combines the primary event with the
       corresponding side-input values.
    b) Grouping events by a dummy key to combine all events in a window
       into one shard.
    c) Discarding the dummy key.

    Args:
        salesevent: Event representing a sales transaction.
        sideinput_collections: Set of side-input collections.
    """
    # yapf: disable
    return (salesevent
            | "Enrich event" >> beam.Map(transforms.enrich_event,
                                         AsDict(sideinput_collections["bonuspoints"]),
                                         AsDict(sideinput_collections["discountpct"]),
                                         AsDict(sideinput_collections["category"]))
            | "Group events by dummy Key" >> beam.GroupByKey()
            | "Discard dummy Key" >> beam.Values()
           )
    # yapf: enable
Example 6: _lint
# Required module: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def _lint(self, examples):
    feature_val_w_counts = (
        examples
        | 'Tuplize' >> beam.FlatMap(
            utils.example_tuplizer(self._counted_features))
        | 'FlattenFeatureVals' >> beam.FlatMap(self._flatten_feature_vals)
        | 'CountFeatureVals' >> beam.combiners.Count.PerElement())

    if hasattr(self, '_count_transformer'):
        feature_val_w_counts |= 'TransformCounts' >> self._count_transformer

    return (
        feature_val_w_counts
        | 'PairValWithCount' >> beam.Map(self._shift_key)
        | 'GroupByFeature' >> beam.GroupByKey()
        | 'ValCountsToDict' >> beam.Map(self._val_counts_as_dict)
        | 'GenResults' >> beam.Map(self._check_feature)
        | 'DropUnwarned' >> beam.Filter(bool)
        | 'AsList' >> beam.combiners.ToList()
        | 'ToResult' >> beam.Map(self._to_result))
Example 7: expand
# Required module: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def expand(self, pcoll):
    return (pcoll
            | 'MapVariantsByKey' >> beam.FlatMap(self._map_by_variant_keys)
            | 'GroupVariantsByKey' >> beam.GroupByKey()
            | 'MergeVariantsByKey' >> beam.FlatMap(self._merge_variants_by_key))
Example 8: expand
# Required module: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def expand(self, estimates):
    return (estimates
            | 'MapSamplesToValueCount' >> beam.FlatMap(self._get_sample_ids)
            | 'GroupAllSamples' >> beam.GroupByKey())
Example 9: expand
# Required module: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def expand(self, pcoll):
    return (pcoll
            | beam.ParDo(_RoundRobinKeyFn(self._count))
            | beam.GroupByKey()
            | beam.FlatMap(lambda kv: kv[1]))
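Example 9 references a _RoundRobinKeyFn that is not shown. A hedged reconstruction of what such a DoFn might look like (this implementation is an assumption, not the original):

import random
import apache_beam as beam

class _RoundRobinKeyFn(beam.DoFn):
    """Hypothetical: assigns keys 0..count-1 to elements in round-robin order."""

    def __init__(self, count):
        self._count = count

    def start_bundle(self):
        # Start each bundle at a random offset so keys stay balanced.
        self._counter = random.randrange(self._count)

    def process(self, element):
        self._counter = (self._counter + 1) % self._count
        yield self._counter, element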
Example 10: _Shuffle
# Required module: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def _Shuffle(pcoll):  # pylint: disable=invalid-name
    import random
    return (pcoll
            | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
            | 'GroupByRandom' >> beam.GroupByKey()
            | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
Example 11: _Shuffle
# Required module: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def _Shuffle(pcoll):  # pylint: disable=invalid-name
    """Shuffles a PCollection."""
    import random
    return (pcoll
            | 'PairWithRand' >> beam.Map(lambda x: (random.random(), x))
            | 'GroupByRand' >> beam.GroupByKey()
            | 'DropRand' >> beam.FlatMap(lambda kv: kv[1]))
Example 12: preprocess
# Required module: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def preprocess(pipeline, args):
    """Runs the pre-processing step as a pipeline.

    Args:
        pipeline: beam pipeline.
        args: parsed command line arguments.
    """
    from preproc import movielens  # pylint: disable=g-import-not-at-top

    # 1) Read the data into pcollections.
    movies_coder = tft_coders.CsvCoder(movielens.MOVIE_COLUMNS,
                                       movielens.make_movies_schema(),
                                       secondary_delimiter='|',
                                       multivalent_columns=['genres'])
    movies_data = (pipeline
                   | 'ReadMoviesData' >> beam.io.ReadFromText(
                       os.path.join(args.input_dir, 'movies.csv'),
                       coder=beam.coders.BytesCoder(),
                       # TODO(b/35653662): Obviate the need for setting this.
                       skip_header_lines=args.skip_header_lines)
                   | 'DecodeMovies' >> beam.Map(movies_coder.decode)
                   | 'KeyByMovie' >> beam.Map(lambda x: (x['movie_id'], x)))

    ratings_coder = tft_coders.CsvCoder(movielens.RATING_COLUMNS,
                                        movielens.make_ratings_schema())
    ratings_data = (pipeline
                    | 'ReadRatingsData' >> beam.io.ReadFromText(
                        os.path.join(args.input_dir, 'ratings*'),
                        skip_header_lines=args.skip_header_lines)
                    | 'DecodeRatings' >> beam.Map(ratings_coder.decode)
                    | 'KeyByUser' >> beam.Map(lambda x: (x['user_id'], x))
                    | 'GroupByUser' >> beam.GroupByKey())

    # Python 3 removed tuple-parameter unpacking in function signatures,
    # so unpack the (user_id, data) pair inside the function body.
    def train_eval_partition_fn(keyed_ratings, unused_num_partitions):
        user_id, _ = keyed_ratings
        return movielens.partition_fn(
            user_id, args.partition_random_seed, args.percent_eval)

    # Split train/eval data based on the integer user id.
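The snippet ends before the partition step itself. A hedged sketch of how train_eval_partition_fn would typically be wired up (this continuation is an assumption):

# Hypothetical continuation: split the grouped ratings into two partitions.
train_data, eval_data = (
    ratings_data
    | 'TrainEvalPartition' >> beam.Partition(train_eval_partition_fn, 2))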
Example 13: expand
# Required module: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def expand(
        self,
        sliced_record_batchs: beam.pvalue.PCollection
) -> beam.pvalue.PCollection:
    # Compute P(Y=y)
    # _SlicedYKey(slice, y), _YRate(y_count, example_count)
    y_rates = sliced_record_batchs | 'GetYRates' >> _GetYRates(
        self._y_path, self._y_boundaries, self._weight_column_name)
    y_keys = y_rates | 'ExtractYKeys' >> beam.Keys()

    # Compute P(Y=y | X=x)
    # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count)
    conditional_y_rates = ((sliced_record_batchs, y_keys)
                           | 'GetConditionalYRates' >> _GetConditionalYRates(
                               self._y_path, self._y_boundaries, self._x_paths,
                               self._min_x_count, self._weight_column_name))

    return (
        {
            'conditional_y_rate': conditional_y_rates,
            'y_rate': y_rates
        }
        | 'CoGroupByForLift' >> beam.CoGroupByKey()
        | 'ComputeLifts' >> beam.FlatMap(_compute_lifts)
        | 'FilterLifts' >> _FilterLifts(self._top_k_per_y, self._bottom_k_per_y)
        | 'GroupLiftsForOutput' >> beam.GroupByKey()
        | 'MakeProtos' >> beam.Map(_make_dataset_feature_stats_proto,
                                   self._y_path, self._y_boundaries,
                                   self._weight_column_name is not None,
                                   self._output_custom_stats))
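Example 13 pairs beam.GroupByKey with beam.CoGroupByKey: the latter joins several keyed PCollections on their common keys, yielding one result per key with an iterable of values per input. A minimal sketch with made-up data:

import apache_beam as beam

with beam.Pipeline() as pipeline:
    emails = pipeline | 'Emails' >> beam.Create([('amy', 'amy@example.com')])
    phones = pipeline | 'Phones' >> beam.Create([('amy', '555-0100')])
    _ = (
        {'emails': emails, 'phones': phones}
        # Yields ('amy', {'emails': ['amy@example.com'], 'phones': ['555-0100']}).
        | 'Join' >> beam.CoGroupByKey()
        | 'Print' >> beam.Map(print))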
Example 14: shuffle
# Required module: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def shuffle(pcoll):  # pylint: disable=invalid-name
    import random
    return (pcoll
            | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
            | 'GroupByRandom' >> beam.GroupByKey()
            | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
Example 15: Shuffle
# Required module: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def Shuffle(pcoll):
    """Shuffles a PCollection. Collection should not contain duplicates."""
    return (pcoll
            | 'PairWithHash' >> beam.Map(lambda x: (hash(x), x))
            | 'GroupByHash' >> beam.GroupByKey()
            | 'DropHash' >> beam.FlatMap(
                lambda hash_and_values: hash_and_values[1]))
# pylint: disable=invalid-name