This page collects typical usage examples of the apache_beam.CombinePerKey method in Python. If you are wondering what apache_beam.CombinePerKey does, or how to use it, the hand-picked code examples below may help. You can also browse further usage examples from the apache_beam module, where this method is defined.
The following presents 15 code examples of apache_beam.CombinePerKey, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
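
Before the project-specific examples, a minimal, self-contained sketch (not drawn from any of the projects below) may help fix the basic shape of CombinePerKey: it takes a PCollection of (key, value) pairs and reduces the values of each key with the supplied combining function or CombineFn.

import apache_beam as beam

# Minimal illustration only; the data and step names here are made up.
with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'Create' >> beam.Create([('a', 1), ('b', 2), ('a', 3)])
      # Sums the values sharing a key, yielding ('a', 4) and ('b', 2).
      | 'SumPerKey' >> beam.CombinePerKey(sum)
      | 'Print' >> beam.Map(print))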

Example 1: expand

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def expand(self, pcoll: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
  """Estimates the user defined statistic."""
  return (
      pcoll
      | 'AssignBatchToPartition' >> beam.Map(
          _assign_to_partition, num_partitions=self._num_partitions)
      | 'GroupPartitionsIntoList' >> beam.CombinePerKey(
          beam.combiners.SampleCombineFn(self._max_batches_per_partition))
      | 'ProcessPartition' >> beam.Map(_process_partition,
                                       stats_fn=self._stats_fn)
      | 'ComputeMetaStats' >> beam.CombinePerKey(
          PartitionedStatisticsAnalyzer(min_partitions_stat_presence=self
                                        ._min_partitions_stat_presence)))

Example 2: expand

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def expand(self, sliced_record_batchs):
  # slice, example_count
  example_counts = (
      sliced_record_batchs
      | 'ToExampleCounts' >> beam.MapTuple(lambda k, v: (k, v.num_rows))
      | 'SumExampleCounts' >> beam.CombinePerKey(sum))

  def move_y_to_value(slice_and_y, y_count):
    slice_key, y = slice_and_y
    return slice_key, (y, y_count)

  # slice, (y, y_count)
  y_counts = (
      sliced_record_batchs
      | 'ToPartialYCounts' >>
      beam.FlatMap(_to_partial_counts, self._y_path, self._y_boundaries,
                   self._weight_column_name)
      | 'SumYCounts' >> beam.CombinePerKey(sum)
      | 'MoveYToValue' >> beam.MapTuple(move_y_to_value))

  # _SlicedYKey(slice, y), _YRate(y_count, example_count)
  return ({
      'y_counts': y_counts,
      'example_count': example_counts
  }
          | 'CoGroupByForYRates' >> beam.CoGroupByKey()
          | 'JoinExampleCounts' >> beam.FlatMap(_join_example_counts))
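
The example above joins two per-key aggregations with beam.CoGroupByKey. Stripped of the project-specific helpers, the join pattern looks roughly like the following sketch; the data and step names are invented for illustration.

import apache_beam as beam

with beam.Pipeline() as pipeline:
  example_counts = (
      pipeline
      | 'CreateExamples' >> beam.Create([('us', 1), ('us', 1), ('ca', 1)])
      | 'SumExamples' >> beam.CombinePerKey(sum))
  y_counts = (
      pipeline
      | 'CreateYs' >> beam.Create([('us', 1), ('ca', 1)])
      | 'SumYs' >> beam.CombinePerKey(sum))
  _ = (
      {'example_count': example_counts, 'y_count': y_counts}
      # Emits (key, {'example_count': [...], 'y_count': [...]}) per key.
      | 'Join' >> beam.CoGroupByKey()
      | 'Print' >> beam.Map(print))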

Example 3: expand

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def expand(self, inputs):
  pcoll, = inputs
  # Create a PCollection of (count, element) pairs, then iterate over
  # this to create a single element PCollection containing this list of
  # pairs in sorted order by decreasing counts (and by values for equal
  # counts).
  # TODO(b/112916494): Unify the graph in both cases once possible.
  if (self._vocab_ordering_type ==
      _VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
    flatten_map_fn = _flatten_to_key_and_means_accumulator_list
    combine_transform = _MutualInformationTransformAccumulate()  # pylint: disable=no-value-for-parameter
  elif self._vocab_ordering_type == _VocabOrderingType.WEIGHTED_FREQUENCY:
    flatten_map_fn = _flatten_value_and_weights_to_list_of_tuples
    combine_transform = beam.CombinePerKey(sum)
  elif self._vocab_ordering_type == _VocabOrderingType.WEIGHTED_LABELS:
    flatten_map_fn = _flatten_value_and_labeled_weights_to_list_of_tuples
    combine_transform = beam.CombinePerKey(sum_labeled_weights)
  else:
    flatten_map_fn = _flatten_value_to_list
    combine_transform = beam.combiners.Count.PerElement()

  result = (
      pcoll
      | 'FlattenTokensAndMaybeWeightsLabels' >> beam.FlatMap(flatten_map_fn)
      | 'CountPerToken' >> combine_transform)

  if self._input_dtype == tf.string:
    # TODO(b/62379925) Filter empty strings or strings containing the \n or \r
    # tokens since index_table_from_file doesn't allow empty rows.
    def is_problematic_string(kv):
      string, _ = kv  # Ignore counts.
      return string and b'\n' not in string and b'\r' not in string

    result |= 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)

  return result
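
Note that the fallback branch above uses beam.combiners.Count.PerElement() rather than beam.CombinePerKey. Conceptually the two are close: counting elements is equivalent to pairing each element with 1 and summing per key, as this assumed sketch (toy data, not from the project) illustrates.

import apache_beam as beam

with beam.Pipeline() as pipeline:
  tokens = pipeline | 'CreateTokens' >> beam.Create(['a', 'b', 'a'])
  # Built-in counter: emits ('a', 2) and ('b', 1).
  _ = tokens | 'CountPerElement' >> beam.combiners.Count.PerElement()
  # The same result spelled out with CombinePerKey.
  _ = (
      tokens
      | 'PairWithOne' >> beam.Map(lambda token: (token, 1))
      | 'SumPerKey' >> beam.CombinePerKey(sum))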

Example 4: _MutualInformationTransformAccumulate

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def _MutualInformationTransformAccumulate(pcol):  # pylint: disable=invalid-name
  """Accumulates information needed for mutual information computation."""
  return (pcol | 'VocabCountPerLabelPerTokenAccumulate' >> beam.CombinePerKey(
      _WeightedMeanCombineFn(output_shape=(None,))))
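
_WeightedMeanCombineFn is defined elsewhere in the project and is not shown here. As a rough sketch of the general shape of a custom CombineFn passed to beam.CombinePerKey (an illustration, not the actual tensorflow_transform implementation), a plain per-key mean could be written as follows.

import apache_beam as beam

class IllustrativeMeanFn(beam.CombineFn):
  """Toy per-key mean combiner; stands in for a real accumulator class."""

  def create_accumulator(self):
    return 0.0, 0  # (running sum, count)

  def add_input(self, accumulator, value):
    total, count = accumulator
    return total + value, count + 1

  def merge_accumulators(self, accumulators):
    totals, counts = zip(*accumulators)
    return sum(totals), sum(counts)

  def extract_output(self, accumulator):
    total, count = accumulator
    return total / count if count else float('nan')

# Usage: keyed_pcoll | 'MeanPerKey' >> beam.CombinePerKey(IllustrativeMeanFn())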

Example 5: _MutualInformationTransformMerge

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def _MutualInformationTransformMerge(  # pylint: disable=invalid-name
    pcol, use_adjusted_mutual_info, min_diff_from_avg):
  """Computes mutual information for each key using the given accumulators."""
  feature_accumulator_pcol = (
      pcol | 'VocabCountPerLabelPerTokenMerge' >> beam.CombinePerKey(
          _WeightedMeanCombineFn(output_shape=(None,))))

  accumulators_by_feature, global_accumulator = (
      feature_accumulator_pcol
      | 'ExtractSentinels' >> beam.FlatMap(_extract_sentinels).with_outputs(
          'feature', 'global'))

  if min_diff_from_avg is None:
    min_diff_from_avg = (
        global_accumulator | 'AutoMinDiffFromAvg' >>
        beam.Map(lambda acc: analyzers.calculate_recommended_min_diff_from_avg(  # pylint: disable=g-long-lambda
            acc.count * acc.weight)))
  min_diff_from_avg = beam.pvalue.AsSingleton(min_diff_from_avg)

  def _extract_merged_values(term, results):
    """Returns the key and tuple of (mutual information, frequency)."""
    # Ignore the second value, which is the Expected Mutual Info.
    (mi, _, frequency) = results
    return term, (mi, frequency)

  return (accumulators_by_feature
          | 'CalculateMutualInformationPerToken' >> beam.Map(
              _calculate_mutual_information_for_feature_value,
              beam.pvalue.AsSingleton(global_accumulator),
              use_adjusted_mutual_info=use_adjusted_mutual_info,
              min_diff_from_avg=min_diff_from_avg)
          | beam.MapTuple(_extract_merged_values))

Example 6: expand

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def expand(self, pcoll):
  to_dict = lambda x: {x[0]: x[1]}
  example_counts = (
      pcoll
      | "count_examples" >> beam.combiners.Count.Globally()
      | "key_example_counts" >> beam.Map(
          lambda x: ("examples", x))
      | "example_count_dict" >> beam.Map(to_dict))

  def _count_tokens(pcoll, feat):
    return (
        pcoll
        | "key_%s_toks" % feat >> beam.Map(
            lambda x:  # pylint:disable=g-long-lambda
            ("%s_tokens" % feat, int(sum(x[feat] > 1)) if feat in x else 0)))

  token_counts = (
      [_count_tokens(pcoll, feat)
       for feat in self._output_features]
      | "flatten_tokens" >> beam.Flatten()
      | "count_tokens" >> beam.CombinePerKey(sum)
      | "token_count_dict" >> beam.Map(to_dict))

  def _merge_dicts(dicts):
    merged_dict = {}
    for d in dicts:
      assert not set(merged_dict).intersection(d)
      merged_dict.update(d)
    return merged_dict

  return (
      [example_counts, token_counts]
      | "flatten_counts" >> beam.Flatten()
      | "merge_stats" >> beam.CombineGlobally(_merge_dicts))

Example 7: testTjurDiscriminationMetricsWithNan

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def testTjurDiscriminationMetricsWithNan(self, metric):
  computations = metric.computations()
  shared_metrics = computations[0]
  metric = computations[1]

  example = {
      'labels': np.array([0.0]),
      'predictions': np.array([1.0]),
      'example_weights': np.array([1.0]),
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeWeightedTotals' >> beam.CombinePerKey(
            shared_metrics.combiner)
        | 'ComputeMetric' >> beam.Map(lambda x: (x[0], metric.result(x[1]))))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        key = metric.keys[0]
        self.assertIn(key, got_metrics)
        self.assertTrue(math.isnan(got_metrics[key]))
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')

Example 8: testRaisesErrorWhenExampleWeightsDiffer

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def testRaisesErrorWhenExampleWeightsDiffer(self):
  with self.assertRaises(ValueError):
    metric = min_label_position.MinLabelPosition().computations(
        query_key='query')[0]

    query1_example1 = {
        'labels': np.array([0.0]),
        'predictions': np.array([0.2]),
        'example_weights': np.array([1.0]),
        'features': {
            'query': np.array(['query1'])
        }
    }
    query1_example2 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.8]),
        'example_weights': np.array([0.5]),
        'features': {
            'query': np.array(['query1'])
        }
    }

    def to_standard_metric_inputs_list(list_of_extracts):
      return [
          metric_util.to_standard_metric_inputs(e, True)
          for e in list_of_extracts
      ]

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      _ = (
          pipeline
          | 'Create' >> beam.Create([[query1_example1, query1_example2]])
          | 'Process' >> beam.Map(to_standard_metric_inputs_list)
          | 'AddSlice' >> beam.Map(lambda x: ((), x))
          | 'Combine' >> beam.CombinePerKey(metric.combiner))

Example 9: testSimpleMetric

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def testSimpleMetric(self):
  computation = tf_metric_wrapper.tf_metric_computations(
      [tf.keras.metrics.MeanSquaredError(name='mse')])[0]

  example = {
      'labels': [0, 0, 1, 1],
      'predictions': [0, 0.5, 0.3, 0.9],
      'example_weights': [1.0]
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'Combine' >> beam.CombinePerKey(computation.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        mse_key = metric_types.MetricKey(name='mse')
        self.assertDictElementsAlmostEqual(got_metrics, {mse_key: 0.1875})
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')

Example 10: testSparseMetric

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def testSparseMetric(self):
  computation = tf_metric_wrapper.tf_metric_computations([
      tf.keras.metrics.SparseCategoricalCrossentropy(
          name='sparse_categorical_crossentropy')
  ])[0]

  # Simulate a multi-class problem with 3 labels.
  example = {
      'labels': [1],
      'predictions': [0.3, 0.6, 0.1],
      'example_weights': [1.0]
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'Combine' >> beam.CombinePerKey(computation.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        key = metric_types.MetricKey(name='sparse_categorical_crossentropy')
        # -0*log(0.3) - 1*log(0.6) - 0*log(0.1) = 0.51083
        self.assertDictElementsAlmostEqual(got_metrics, {key: 0.51083})
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')

Example 11: testCustomTFMetric

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def testCustomTFMetric(self):
  metric = tf_metric_wrapper.tf_metric_computations([_CustomMetric()])[0]

  example1 = {'labels': [0.0], 'predictions': [0.2], 'example_weights': [1.0]}
  example2 = {'labels': [0.0], 'predictions': [0.8], 'example_weights': [1.0]}
  example3 = {'labels': [0.0], 'predictions': [0.5], 'example_weights': [2.0]}

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2, example3])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'Combine' >> beam.CombinePerKey(metric.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        custom_key = metric_types.MetricKey(name='custom')
        self.assertDictElementsAlmostEqual(
            got_metrics,
            {custom_key: (0.2 + 0.8 + 2 * 0.5) / (1.0 + 1.0 + 2.0)})
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')

Example 12: testExampleCount

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def testExampleCount(self):
  metric = example_count.ExampleCount().computations()[0]

  example1 = {}
  example2 = {}

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2])
        | 'Process' >> beam.ParDo(metric.preprocessor)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeMetric' >> beam.CombinePerKey(metric.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        example_count_key = metric_types.MetricKey(name='example_count')
        self.assertDictElementsAlmostEqual(got_metrics,
                                           {example_count_key: 2})
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')

Example 13: testSquaredPearsonCorrelationMetricsWithNan

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def testSquaredPearsonCorrelationMetricsWithNan(self):
  computations = (
      squared_pearson_correlation.SquaredPearsonCorrelation().computations())
  metric = computations[0]

  example = {
      'labels': np.array([0.0]),
      'predictions': np.array([1.0]),
      'example_weights': np.array([1.0]),
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeMetric' >> beam.CombinePerKey(metric.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        key = metric.keys[0]
        self.assertIn(key, got_metrics)
        self.assertTrue(math.isnan(got_metrics[key]))
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')

Example 14: _GroupByQueryKey

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def _GroupByQueryKey(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    query_key: Text,
) -> beam.pvalue.PCollection:
  """PTransform for grouping extracts by a query key.

  Args:
    extracts: Incoming PCollection consisting of extracts.
    query_key: Query key to group extracts by. Must be a member of the dict of
      features stored under tfma.FEATURES_KEY.

  Returns:
    PCollection of lists of extracts where each list is associated with the
    same query key.
  """
  missing_query_key_counter = beam.metrics.Metrics.counter(
      constants.METRICS_NAMESPACE, 'missing_query_key')

  def key_by_query_key(extracts: types.Extracts,
                       query_key: Text) -> Tuple[Text, types.Extracts]:
    """Extract the query key from the extract and key by that."""
    value = metric_util.to_scalar(
        util.get_by_keys(
            extracts, [constants.FEATURES_KEY, query_key], optional=True),
        tensor_name=query_key)
    if value is None:
      missing_query_key_counter.inc()
      return ('', extracts)
    return ('{}'.format(value), extracts)

  # pylint: disable=no-value-for-parameter
  return (extracts
          | 'KeyByQueryId' >> beam.Map(key_by_query_key, query_key)
          | 'GroupByKey' >> beam.CombinePerKey(beam.combiners.ToListCombineFn())
          | 'DropQueryId' >> beam.Map(lambda kv: kv[1]))
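
Despite the 'GroupByKey' step name, the transform above groups values by using beam.CombinePerKey with beam.combiners.ToListCombineFn(), which collects all values for a key into a single list. A minimal sketch of that trick, with made-up data:

import apache_beam as beam

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'Create' >> beam.Create([('q1', 'a'), ('q1', 'b'), ('q2', 'c')])
      # Yields ('q1', ['a', 'b']) and ('q2', ['c']); order within each
      # list is not guaranteed.
      | 'ToLists' >> beam.CombinePerKey(beam.combiners.ToListCombineFn())
      | 'Print' >> beam.Map(print))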

Example 15: expand

# Required imports: import apache_beam [as alias]
# Or: from apache_beam import CombinePerKey [as alias]

def expand(self, sliced_extracts):

  def partition_fn(_, num_partitions):
    return self._random_state.randint(num_partitions)

  # Partition the data
  # List[PCollection[Tuple[slicer.SliceKeyType, types.Extracts]]]
  partitions = (
      sliced_extracts
      | 'Partition' >> beam.Partition(partition_fn,
                                      self._num_jackknife_samples))

  def add_partition_index(slice_key,
                          accumulator_and_size,
                          partition_index=None):
    accumulator, size = accumulator_and_size
    return slice_key, _PartitionInfo(accumulator, size, partition_index)

  # Within each partition, partially combine per slice key to get accumulators
  # and partition sizes; add partition_id for determinism.
  # List[PCollection[slicer.SliceKeyType, _PartitionInfo]]
  partition_accumulators = []
  for i, partition in enumerate(partitions):
    partition_accumulators.append(
        partition
        | 'CombinePartition[{}]'.format(i) >> beam.CombinePerKey(
            beam.transforms.combiners.SingleInputTupleCombineFn(
                _AccumulateOnlyCombiner(combiner=self._combiner),
                beam.transforms.combiners.CountCombineFn()))
        | 'AddPartitionId[{}]'.format(i) >> beam.MapTuple(
            add_partition_index, i))

  # Group partitions for the same slice, compute LOO metrics, and flatten back
  # into per-partition LOO metrics.
  # (slicer.SliceKeyType, Tuple[metric_types.MetricsDict])
  return (partition_accumulators
          | 'FlattenPartitionAccumulators' >> beam.Flatten()
          | 'CollectPerSlicePartitions' >> beam.GroupByKey()
          | 'MakeJackknifeSamples' >> beam.FlatMap(
              _make_jackknife_samples, combiner=self._combiner))
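
The SingleInputTupleCombineFn used above applies several combiners to the same per-key input and emits their results as a tuple. Outside the jackknife context, the pattern can be sketched like this (illustrative data and step names, not the actual tensorflow_model_analysis code):

import apache_beam as beam
from apache_beam.transforms import combiners

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'Create' >> beam.Create([('a', 1.0), ('a', 3.0), ('b', 5.0)])
      # Emits ('a', (2.0, 2)) and ('b', (5.0, 1)): a (mean, count) per key.
      | 'MeanAndCount' >> beam.CombinePerKey(
          combiners.SingleInputTupleCombineFn(
              combiners.MeanCombineFn(), combiners.CountCombineFn()))
      | 'Print' >> beam.Map(print))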