This article collects typical usage examples of the apache_beam.Keys method in Python. If you have been wondering what apache_beam.Keys does, how to call it, or what it looks like in practice, the curated code samples here may help; you can also explore further usage examples from the apache_beam module itself.
Seven code examples of the apache_beam.Keys method are shown below, sorted by popularity by default.
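Before the examples, here is a minimal, self-contained sketch (not taken from any of the examples below) of what apache_beam.Keys does: applied to a PCollection of key/value pairs, it keeps only the first element of each pair.

import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to

with beam.Pipeline() as p:
  keys = (p
          | beam.Create([('a', 1), ('b', 2), ('a', 3)])
          | beam.Keys())  # drops the values, keeping 'a', 'b', 'a'
  assert_that(keys, equal_to(['a', 'b', 'a']))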
Example 1: expand
# Required module: import apache_beam [as alias]
# Or: from apache_beam import Keys [as alias]
def expand(self, sample_map):
return (sample_map
            | 'GetListOfSamples' >> beam.Keys()
| 'CountAllUniqueSamples' >> beam.combiners.Count.Globally())
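A hedged usage sketch for Example 1: the expand above presumably lives in a beam.PTransform subclass (the wrapper class name and sample IDs below are hypothetical, not taken from the original project). Because the input is a map keyed by sample, counting its keys yields the number of samples.

class CountSamples(beam.PTransform):  # hypothetical wrapper class
  def expand(self, sample_map):
    return (sample_map
            | 'GetListOfSamples' >> beam.Keys()
            | 'CountAllUniqueSamples' >> beam.combiners.Count.Globally())

with beam.Pipeline() as p:
  _ = (p
       | beam.Create([('SAMPLE_1', ['call_a']), ('SAMPLE_2', ['call_b'])])
       | CountSamples()
       | beam.Map(print))  # prints 2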
Example 2: expand
# Required module: import apache_beam [as alias]
# Or: from apache_beam import Keys [as alias]
def expand(
self,
sliced_record_batchs: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
# Compute P(Y=y)
# _SlicedYKey(slice, y), _YRate(y_count, example_count)
y_rates = sliced_record_batchs | 'GetYRates' >> _GetYRates(
self._y_path, self._y_boundaries, self._weight_column_name)
y_keys = y_rates | 'ExtractYKeys' >> beam.Keys()
# Compute P(Y=y | X=x)
# _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count)
conditional_y_rates = ((sliced_record_batchs, y_keys)
| 'GetConditionalYRates' >> _GetConditionalYRates(
self._y_path, self._y_boundaries, self._x_paths,
self._min_x_count, self._weight_column_name))
return (
{
'conditional_y_rate': conditional_y_rates,
'y_rate': y_rates
}
| 'CoGroupByForLift' >> beam.CoGroupByKey()
| 'ComputeLifts' >> beam.FlatMap(_compute_lifts)
| 'FilterLifts' >> _FilterLifts(self._top_k_per_y, self._bottom_k_per_y)
| 'GroupLiftsForOutput' >> beam.GroupByKey()
| 'MakeProtos' >> beam.Map(_make_dataset_feature_stats_proto,
self._y_path, self._y_boundaries,
self._weight_column_name is not None,
self._output_custom_stats))
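Example 2 is part of TFDV's lift-statistics computation; its core is the CoGroupByKey join of two PCollections that share the same keys. A toy, hedged sketch of that join pattern, with plain string keys and illustrative rates in place of the _SlicedYKey and rate tuples above:

with beam.Pipeline() as p:
  y_rate = p | 'YRates' >> beam.Create([('y=1', 0.4)])
  conditional_y_rate = p | 'CondYRates' >> beam.Create([('y=1', 0.7), ('y=1', 0.2)])
  _ = ({'conditional_y_rate': conditional_y_rate, 'y_rate': y_rate}
       | beam.CoGroupByKey()  # -> ('y=1', {'conditional_y_rate': [0.7, 0.2], 'y_rate': [0.4]})
       | beam.Map(print))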
Example 3: test_invalid_row
# Required module: import apache_beam [as alias]
# Or: from apache_beam import Keys [as alias]
def test_invalid_row(self):
input_lines = ['1,2.0,hello', '5,12.34']
column_names = ['int_feature', 'float_feature', 'str_feature']
with self.assertRaisesRegex( # pylint: disable=g-error-prone-assert-raises
ValueError, '.*Columns do not match specified csv headers.*'):
with beam.Pipeline() as p:
result = (
p | beam.Create(input_lines, reshuffle=False)
| beam.ParDo(csv_decoder.ParseCSVLine(delimiter=','))
| beam.Keys()
| beam.CombineGlobally(
csv_decoder.ColumnTypeInferrer(
column_names, skip_blank_lines=False)))
beam_test_util.assert_that(result, lambda _: None)
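For contrast, a hedged happy-path sketch (reusing csv_decoder and beam_test_util from the test above; the row values are illustrative): when every row has as many columns as the declared headers, the same chain completes without raising.

def test_valid_rows(self):  # hypothetical companion test
  input_lines = ['1,2.0,hello', '5,12.34,world']
  column_names = ['int_feature', 'float_feature', 'str_feature']
  with beam.Pipeline() as p:
    result = (
        p | beam.Create(input_lines, reshuffle=False)
        | beam.ParDo(csv_decoder.ParseCSVLine(delimiter=','))
        | beam.Keys()
        | beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(column_names, skip_blank_lines=False)))
    beam_test_util.assert_that(result, lambda _: None)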
Example 4: _TrackDistinctSliceKeys
# Required module: import apache_beam [as alias]
# Or: from apache_beam import Keys [as alias]
def _TrackDistinctSliceKeys( # pylint: disable=invalid-name
slice_keys_and_values: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
"""Gathers slice key telemetry post slicing."""
def increment_counter(element): # pylint: disable=invalid-name
num_distinct_slice_keys = beam.metrics.Metrics.counter(
constants.METRICS_NAMESPACE, 'num_distinct_slice_keys')
num_distinct_slice_keys.inc(element)
return element
return (slice_keys_and_values
| 'ExtractSliceKeys' >> beam.Keys()
| 'RemoveDuplicates' >> beam.Distinct()
| 'Size' >> beam.combiners.Count.Globally()
| 'IncrementCounter' >> beam.Map(increment_counter))
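A hedged follow-up sketch: once the pipeline has run, the num_distinct_slice_keys counter incremented above can be read back from the PipelineResult (the pipeline variable is illustrative, and committed values may be unavailable on some runners).

from apache_beam.metrics.metric import MetricsFilter

result = pipeline.run()
result.wait_until_finish()
query = result.metrics().query(
    MetricsFilter().with_name('num_distinct_slice_keys'))
for counter in query['counters']:
  print(counter.key.metric.namespace, counter.key.metric.name, counter.committed)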
Example 5: IncrementSliceSpecCounters
# Required module: import apache_beam [as alias]
# Or: from apache_beam import Keys [as alias]
def IncrementSliceSpecCounters(pipeline: beam.Pipeline):
"""To track count of all slicing spec computed using TFMA."""
def _MakeAndIncrementCounters(slice_list):
for slice_key, slice_value in slice_list:
# LINT.IfChange
slice_name = 'slice_computed_%s_%s' % (slice_key, slice_value)
# LINT.ThenChange(../../../../learning/fairness/infra/plx/scripts/tfma_metrics_computed_tracker_macros.sql)
slice_counter = beam.metrics.Metrics.counter(constants.METRICS_NAMESPACE,
slice_name)
slice_counter.inc(1)
return (pipeline
| 'GetSliceCountKeys' >> beam.Keys()
| 'Count' >> beam.Map(_MakeAndIncrementCounters))
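Because the counter names in Example 5 are built dynamically from each slice key and value, a hedged sketch for reading them back filters by namespace rather than by a fixed name (building on the previous sketch; variable names are illustrative):

from apache_beam.metrics.metric import MetricsFilter

result = pipeline.run()
result.wait_until_finish()
query = result.metrics().query(
    MetricsFilter().with_namespace(constants.METRICS_NAMESPACE))
for counter in query['counters']:
  if counter.key.metric.name.startswith('slice_computed_'):
    print(counter.key.metric.name, counter.committed)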
Example 6: convert_csv_to_tf_examples
# Required module: import apache_beam [as alias]
# Or: from apache_beam import Keys [as alias]
def convert_csv_to_tf_examples(self, csv_path, tfrecords_output_path):
"""Runs a Beam pipeline to convert the CSV file into a TFRecords file.
This is needed because the conversion is orders of magnitude more
time-consuming than the functions we want to benchmark, so instead of
doing the conversion each time, we do it once to generate a converted
dataset and use that for the benchmark instead.
Args:
csv_path: Path to CSV file containing examples.
tfrecords_output_path: Path to output TFRecords file containing parsed
examples.
"""
# Copied from CSV example gen.
fp = open(csv_path, "r")
column_names = next(fp).strip().split(",")
fp.close()
with beam.Pipeline() as p:
parsed_csv_lines = (
p
| "ReadFromText" >> beam.io.ReadFromText(
file_pattern=csv_path, skip_header_lines=1)
|
"ParseCSVLine" >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=",")))
# TODO(b/155997704) clean this up once tfx_bsl makes a release.
if getattr(csv_decoder, "PARSE_CSV_LINE_YIELDS_RAW_RECORDS", False):
# parsed_csv_lines is the following tuple (parsed_lines, raw_records)
# we only want the parsed_lines.
parsed_csv_lines |= "ExtractParsedCSVLines" >> beam.Keys()
column_infos = beam.pvalue.AsSingleton(
parsed_csv_lines
| "InferColumnTypes" >> beam.CombineGlobally(
csv_decoder.ColumnTypeInferrer(
column_names, skip_blank_lines=True)))
_ = (
parsed_csv_lines
| "ToTFExample" >> beam.ParDo(
csv_exgen._ParsedCsvToTfExample(), # pylint: disable=protected-access
column_infos)
| "Serialize" >> beam.Map(lambda x: x.SerializeToString())
| "WriteToTFRecord" >> beam.io.tfrecordio.WriteToTFRecord(
file_path_prefix=tfrecords_output_path,
shard_name_template="",
compression_type=beam.io.filesystem.CompressionTypes.GZIP))
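A hedged sketch of consuming the benchmark dataset written above: the GZIP-compressed TFRecords can be read back and parsed into tf.train.Example protos (reusing tfrecords_output_path; the number of records printed is illustrative).

import tensorflow as tf

dataset = tf.data.TFRecordDataset(tfrecords_output_path, compression_type="GZIP")
for serialized in dataset.take(2):
  example = tf.train.Example.FromString(serialized.numpy())
  print(example)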
Example 7: _CsvToExample
# Required module: import apache_beam [as alias]
# Or: from apache_beam import Keys [as alias]
def _CsvToExample( # pylint: disable=invalid-name
pipeline: beam.Pipeline, exec_properties: Dict[Text, Any],
split_pattern: Text) -> beam.pvalue.PCollection:
"""Read CSV files and transform to TF examples.
Note that each input split will be transformed by this function separately.
Args:
pipeline: beam pipeline.
exec_properties: A dict of execution properties.
      - input_base: input dir that contains CSV data. CSV files must have a header line.
split_pattern: Split.pattern in Input config, glob relative file pattern
that maps to input files with root directory given by input_base.
Returns:
PCollection of TF examples.
Raises:
    RuntimeError: if the split matches no files or the CSV headers differ across files.
"""
input_base_uri = exec_properties[utils.INPUT_BASE_KEY]
csv_pattern = os.path.join(input_base_uri, split_pattern)
logging.info('Processing input csv data %s to TFExample.', csv_pattern)
csv_files = tf.io.gfile.glob(csv_pattern)
if not csv_files:
raise RuntimeError(
'Split pattern {} does not match any files.'.format(csv_pattern))
column_names = io_utils.load_csv_column_names(csv_files[0])
for csv_file in csv_files[1:]:
if io_utils.load_csv_column_names(csv_file) != column_names:
raise RuntimeError(
'Files in same split {} have different header.'.format(csv_pattern))
parsed_csv_lines = (
pipeline
| 'ReadFromText' >> beam.io.ReadFromText(
file_pattern=csv_pattern, skip_header_lines=1)
| 'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=',')))
# TODO(b/155997704) clean this up once tfx_bsl makes a release.
if getattr(csv_decoder, 'PARSE_CSV_LINE_YIELDS_RAW_RECORDS', False):
# parsed_csv_lines is the following tuple (parsed_lines, raw_records)
# we only want the parsed_lines.
parsed_csv_lines |= 'ExtractParsedCSVLines' >> beam.Keys()
column_infos = beam.pvalue.AsSingleton(
parsed_csv_lines
| 'InferColumnTypes' >> beam.CombineGlobally(
csv_decoder.ColumnTypeInferrer(column_names, skip_blank_lines=True)))
return (parsed_csv_lines
| 'ToTFExample' >> beam.ParDo(_ParsedCsvToTfExample(), column_infos))
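A hedged follow-up sketch: the PCollection of tf.train.Example protos returned by _CsvToExample could be serialized and persisted the same way Example 6 does (assuming examples holds the returned PCollection; the output path is illustrative).

_ = (examples
     | 'SerializeToString' >> beam.Map(lambda x: x.SerializeToString())
     | 'WriteToTFRecord' >> beam.io.tfrecordio.WriteToTFRecord(
         file_path_prefix='/tmp/csv_examples.tfrecord',
         shard_name_template='',
         compression_type=beam.io.filesystem.CompressionTypes.GZIP))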