

Python apache_beam.GroupByKey Method Code Examples

This article collects typical usage examples of the apache_beam.GroupByKey method in Python. If you are wondering what apache_beam.GroupByKey does, how to use it, or what it looks like in practice, the curated examples below may help. You can also explore further usage examples from the apache_beam package.


The following presents 15 code examples of the apache_beam.GroupByKey method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.

Example 1: expand

# Required import: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def expand(self, pcoll):
        return (
            pcoll
            # Assigns window info to each Pub/Sub message based on its
            # publish timestamp.
            | "Window into Fixed Intervals"
            >> beam.WindowInto(window.FixedWindows(self.window_size))
            | "Add timestamps to messages" >> beam.ParDo(AddTimestamps())
            # Use a dummy key to group the elements in the same window.
            # Note that all the elements in one window must fit into memory
            # for this. If the windowed elements do not fit into memory,
            # please consider using `beam.util.BatchElements`.
            # https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.util.html#apache_beam.transforms.util.BatchElements
            | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
            | "Groupby" >> beam.GroupByKey()
            | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
        ) 
Developer: GoogleCloudPlatform, Project: python-docs-samples, Lines: 19, Source: PubSubToGCS.py
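
As the comment in the example notes, a single dummy key forces an entire window into memory at once. A minimal, self-contained sketch of the BatchElements alternative it links to (element values are illustrative):

import apache_beam as beam

with beam.Pipeline() as pipeline:
    _ = (
        pipeline
        | beam.Create(range(100))
        # Emits lists whose size adapts between the given bounds, so no
        # single key has to hold a whole window's worth of elements.
        | beam.BatchElements(min_batch_size=10, max_batch_size=50)
        | beam.Map(print))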

Example 2: run

# Required import: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def run(args, input_subscription, output_table, window_interval):
    """Build and run the pipeline."""
    options = PipelineOptions(args, save_main_session=True, streaming=True)

    with beam.Pipeline(options=options) as pipeline:

        # Read the messages from PubSub and process them.
        messages = (
            pipeline
            | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
                subscription=input_subscription).with_output_types(bytes)
            | 'UTF-8 bytes to string' >> beam.Map(lambda msg: msg.decode('utf-8'))
            | 'Parse JSON messages' >> beam.Map(parse_json_message)
            | 'Fixed-size windows' >> beam.WindowInto(
                window.FixedWindows(int(window_interval), 0))
            | 'Add URL keys' >> beam.Map(lambda msg: (msg['url'], msg))
            | 'Group by URLs' >> beam.GroupByKey()
            | 'Get statistics' >> beam.Map(get_statistics))

        # Output the results into BigQuery table.
        _ = messages | 'Write to Big Query' >> beam.io.WriteToBigQuery(
            output_table, schema=SCHEMA) 
Developer: GoogleCloudPlatform, Project: python-docs-samples, Lines: 24, Source: streaming_beam.py
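
The pipeline above relies on parse_json_message, get_statistics, and SCHEMA defined elsewhere in streaming_beam.py. The following are hypothetical sketches of minimal versions, not the project's actual definitions:

import json

# run() above also assumes these imports:
# from apache_beam.options.pipeline_options import PipelineOptions
# from apache_beam.transforms import window

# Hypothetical BigQuery schema string; the real SCHEMA may list other columns.
SCHEMA = 'url:STRING,num_reviews:INTEGER'

def parse_json_message(message):
    """Parses one JSON-encoded Pub/Sub message into a dict."""
    return json.loads(message)

def get_statistics(url_and_messages):
    """Aggregates the grouped messages for one URL into a result row."""
    url, messages = url_and_messages
    return {'url': url, 'num_reviews': len(list(messages))}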

Example 3: shuffle

# Required import: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
import random


def shuffle(p):
  """Shuffles data from PCollection.

  Args:
    p: PCollection.

  Returns:
    PCollection of shuffled data.
  """

  class _AddRandomKey(beam.DoFn):

    def process(self, element):
      yield random.random(), element

  shuffled_data = (
      p
      | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
      | 'GroupByRandom' >> beam.GroupByKey()
      | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
  return shuffled_data 
Developer: GoogleCloudPlatform, Project: professional-services, Lines: 23, Source: preprocess.py
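
A minimal usage sketch for shuffle (input values are illustrative):

import apache_beam as beam

with beam.Pipeline() as pipeline:
    shuffled = shuffle(pipeline | beam.Create(['a', 'b', 'c', 'd']))
    _ = shuffled | beam.Map(print)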

Example 4: shuffle_data

# Required import: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
import random


def shuffle_data(p):
  """Shuffles data from PCollection.

  Args:
    p: PCollection.

  Returns:
    PCollection of shuffled data.
  """

  class _AddRandomKey(beam.DoFn):

    def process(self, element):
      yield (random.random(), element)

  shuffled_data = (
      p
      | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
      | 'GroupByRandom' >> beam.GroupByKey()
      | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
  return shuffled_data 
Developer: GoogleCloudPlatform, Project: professional-services, Lines: 23, Source: preprocess.py

Example 5: get_enriched_events

# Required import: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def get_enriched_events(salesevent: beam.pvalue.PCollection,
                        sideinput_collections: Dict[str, beam.pvalue.PCollection]) \
        -> beam.pvalue.PCollection:
    """Gets enriched events by:
        a) Calling a transform that combines the primary event with the
           corresponding side input values.
        b) Grouping events by a dummy key to combine all events in a window
           into one shard.
        c) Discarding the dummy key.

     Args:
        salesevent: Event representing a sales transaction.
        sideinput_collections: Set of side input collections.
    """
    # yapf: disable
    return (salesevent
             | "Enrich event" >> beam.Map(transforms.enrich_event,
                                       AsDict(sideinput_collections["bonuspoints"]),
                                       AsDict(sideinput_collections["discountpct"]),
                                       AsDict(sideinput_collections["category"]))
             | "Group events by dummy Key" >> beam.GroupByKey()
             | "Discard dummy Key" >> beam.Values()
          )
    # yapf: enable 
Developer: GoogleCloudPlatform, Project: professional-services, Lines: 23, Source: main.py
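
The example assumes transforms.enrich_event from elsewhere in the project. Because a GroupByKey follows it directly, it must emit (key, value) pairs; a hypothetical sketch (the join key and field names are illustrative):

from typing import Dict, Tuple

def enrich_event(event: Dict, bonuspoints: Dict, discountpct: Dict,
                 category: Dict) -> Tuple[None, Dict]:
    """Joins one sales event with its side-input lookup values."""
    enriched = dict(event)
    sku = event.get('sku')  # hypothetical join key
    enriched['bonuspoints'] = bonuspoints.get(sku)
    enriched['discountpct'] = discountpct.get(sku)
    enriched['category'] = category.get(sku)
    # Dummy key: all events in a window group into a single shard.
    return (None, enriched)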

Example 6: _lint

# Required import: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def _lint(self, examples):
    feature_val_w_counts = (
        examples
        | 'Tuplize' >> beam.FlatMap(
            utils.example_tuplizer(self._counted_features))
        | 'FlattenFeatureVals' >> beam.FlatMap(self._flatten_feature_vals)
        | 'CountFeatureVals' >> beam.combiners.Count.PerElement())

    if hasattr(self, '_count_transformer'):
      feature_val_w_counts |= 'TransformCounts' >> self._count_transformer

    return (
        feature_val_w_counts
        | 'PairValWithCount' >> beam.Map(self._shift_key)
        | 'GroupByFeature' >> beam.GroupByKey()
        | 'ValCountsToDict' >> beam.Map(self._val_counts_as_dict)
        | 'GenResults' >> beam.Map(self._check_feature)
        | 'DropUnwarned' >> beam.Filter(bool)
        | 'AsList' >> beam.combiners.ToList()
        | 'ToResult' >> beam.Map(self._to_result)) 
Developer: brain-research, Project: data-linter, Lines: 22, Source: linters.py
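
_shift_key is a method on the linter class that is not shown here. Given the upstream Count.PerElement output of ((feature, value), count) pairs, a hypothetical sketch of what it does, written as a standalone function:

def shift_key(feature_val_and_count):
    """Rekeys ((feature, value), count) as (feature, (value, count))."""
    (feature, value), count = feature_val_and_count
    return feature, (value, count)

print(shift_key((('age', 42), 7)))  # ('age', (42, 7))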

Example 7: expand

# Required import: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def expand(self, pcoll):
    return (pcoll
            | 'MapVariantsByKey' >> beam.FlatMap(self._map_by_variant_keys)
            | 'GroupVariantsByKey' >> beam.GroupByKey()
            | 'MergeVariantsByKey' >> beam.FlatMap(self._merge_variants_by_key)) 
Developer: googlegenomics, Project: gcp-variant-transforms, Lines: 7, Source: merge_variants.py

Example 8: expand

# Required import: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def expand(self, estimates):
    return (estimates
            | 'MapSamplesToValueCount' >> beam.FlatMap(
                self._get_sample_ids)
            | 'GroupAllSamples' >> beam.GroupByKey()) 
Developer: googlegenomics, Project: gcp-variant-transforms, Lines: 7, Source: extract_input_size.py

Example 9: expand

# Required import: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def expand(self, pcoll):
    return (pcoll
            | beam.ParDo(_RoundRobinKeyFn(self._count))
            | beam.GroupByKey()
            | beam.FlatMap(lambda kv: kv[1])) 
Developer: googlegenomics, Project: gcp-variant-transforms, Lines: 7, Source: limit_write.py
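
_RoundRobinKeyFn is defined elsewhere in limit_write.py; a hypothetical sketch consistent with how the transform uses it, cycling keys so the GroupByKey fans elements into `count` roughly equal shards:

import apache_beam as beam

class _RoundRobinKeyFn(beam.DoFn):
  """Assigns keys 0..count-1 in rotation (hypothetical sketch)."""

  def __init__(self, count):
    self._count = count

  def start_bundle(self):
    self._counter = 0

  def process(self, element):
    self._counter = (self._counter + 1) % self._count
    yield self._counter, element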

Example 10: _Shuffle

# Required import: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def _Shuffle(pcoll):  # pylint: disable=invalid-name
  import random
  return (pcoll
          | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
          | 'GroupByRandom' >> beam.GroupByKey()
          | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
Developer: GoogleCloudPlatform, Project: cloudml-samples, Lines: 8, Source: preprocess.py

Example 11: _Shuffle

# Required import: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def _Shuffle(pcoll):  # pylint: disable=invalid-name
  """Shuffles a PCollection."""
  import random
  return (pcoll
          | 'PairWithRand' >> beam.Map(lambda x: (random.random(), x))
          | 'GroupByRand' >> beam.GroupByKey()
          | 'DropRand' >> beam.FlatMap(lambda kv: kv[1]))
Developer: GoogleCloudPlatform, Project: cloudml-samples, Lines: 9, Source: preprocess.py

Example 12: preprocess

# Required import: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def preprocess(pipeline, args):
  """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline.
    args: parsed command line arguments.
  """
  from preproc import movielens  # pylint: disable=g-import-not-at-top

  # 1) Read the data into pcollections.
  movies_coder = tft_coders.CsvCoder(movielens.MOVIE_COLUMNS,
                                     movielens.make_movies_schema(),
                                     secondary_delimiter='|',
                                     multivalent_columns=['genres'])
  movies_data = (pipeline
                 | 'ReadMoviesData' >> beam.io.ReadFromText(
                     os.path.join(args.input_dir, 'movies.csv'),
                     coder=beam.coders.BytesCoder(),
                     # TODO(b/35653662): Obviate the need for setting this.
                     skip_header_lines=args.skip_header_lines)
                 | 'DecodeMovies' >> beam.Map(movies_coder.decode)
                 | 'KeyByMovie' >> beam.Map(lambda x: (x['movie_id'], x)))
  ratings_coder = tft_coders.CsvCoder(movielens.RATING_COLUMNS,
                                      movielens.make_ratings_schema())
  ratings_data = (pipeline
                  | 'ReadRatingsData' >> beam.io.ReadFromText(
                      os.path.join(args.input_dir, 'ratings*'),
                      skip_header_lines=args.skip_header_lines)
                  | 'DecodeRatings' >> beam.Map(ratings_coder.decode)
                  | 'KeyByUser' >> beam.Map(lambda x: (x['user_id'], x))
                  | 'GroupByUser' >> beam.GroupByKey())
  def train_eval_partition_fn(user_id_and_data, unused_num_partitions):
    user_id, _ = user_id_and_data
    return movielens.partition_fn(
        user_id, args.partition_random_seed, args.percent_eval)

  # Split train/eval data based on the integer user id. 
Developer: GoogleCloudPlatform, Project: cloudml-samples, Lines: 38, Source: preprocess.py
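
The snippet ends where the train/eval split would begin. A hypothetical continuation showing how train_eval_partition_fn would typically be applied with beam.Partition (the real preprocess.py may differ):

  train_data, eval_data = (
      ratings_data
      | 'TrainEvalPartition' >> beam.Partition(train_eval_partition_fn, 2))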

Example 13: expand

# Required import: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def expand(
      self,
      sliced_record_batchs: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
    # Compute P(Y=y)
    # _SlicedYKey(slice, y), _YRate(y_count, example_count)
    y_rates = sliced_record_batchs | 'GetYRates' >> _GetYRates(
        self._y_path, self._y_boundaries, self._weight_column_name)
    y_keys = y_rates | 'ExtractYKeys' >> beam.Keys()

    # Compute P(Y=y | X=x)
    # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count)
    conditional_y_rates = ((sliced_record_batchs, y_keys)
                           | 'GetConditionalYRates' >> _GetConditionalYRates(
                               self._y_path, self._y_boundaries, self._x_paths,
                               self._min_x_count, self._weight_column_name))

    return (
        {
            'conditional_y_rate': conditional_y_rates,
            'y_rate': y_rates
        }
        | 'CoGroupByForLift' >> beam.CoGroupByKey()
        | 'ComputeLifts' >> beam.FlatMap(_compute_lifts)
        | 'FilterLifts' >> _FilterLifts(self._top_k_per_y, self._bottom_k_per_y)
        | 'GroupLiftsForOutput' >> beam.GroupByKey()
        | 'MakeProtos' >> beam.Map(_make_dataset_feature_stats_proto,
                                   self._y_path, self._y_boundaries,
                                   self._weight_column_name is not None,
                                   self._output_custom_stats)) 
Developer: tensorflow, Project: data-validation, Lines: 31, Source: lift_stats_generator.py
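
The dict-of-PCollections form of CoGroupByKey used above joins keyed streams under named tags. A minimal, self-contained sketch of that pattern (data is illustrative):

import apache_beam as beam

with beam.Pipeline() as pipeline:
    y_rates = pipeline | 'Rates' >> beam.Create([('y1', 0.5)])
    cond_rates = pipeline | 'Cond' >> beam.Create([('y1', 0.25), ('y1', 0.75)])
    # Prints ('y1', {'y_rate': [0.5], 'conditional_y_rate': [0.25, 0.75]})
    _ = ({'y_rate': y_rates, 'conditional_y_rate': cond_rates}
         | beam.CoGroupByKey()
         | beam.Map(print))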

Example 14: shuffle

# Required import: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def shuffle(pcoll):  # pylint: disable=invalid-name
  import random
  return (pcoll
          | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
          | 'GroupByRandom' >> beam.GroupByKey()
          | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
Developer: googledatalab, Project: pydatalab, Lines: 8, Source: transform.py

Example 15: Shuffle

# Required import: import apache_beam [as alias]
# Or: from apache_beam import GroupByKey [as alias]
def Shuffle(pcoll):
  """Shuffles a PCollection.  Collection should not contain duplicates."""
  return (pcoll
          | 'PairWithHash' >> beam.Map(lambda x: (hash(x), x))
          | 'GroupByHash' >> beam.GroupByKey()
          | 'DropHash' >> beam.FlatMap(
              lambda hash_and_values: hash_and_values[1]))


# pylint: disable=invalid-name 
Developer: tensorflow, Project: transform, Lines: 12, Source: sentiment_example.py
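
A minimal usage sketch for Shuffle as defined above (input values are illustrative):

import apache_beam as beam

with beam.Pipeline() as pipeline:
    shuffled = Shuffle(pipeline | beam.Create(['pos', 'neg', 'neutral']))
    _ = shuffled | beam.Map(print)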


Note: The apache_beam.GroupByKey method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects and copyright remains with their original authors; for distribution and use, refer to each project's License. Do not reproduce without permission.