This article collects typical usage examples of the apache_beam.Keys method in Python. If you have been wondering what apache_beam.Keys does, how to call it, or what it looks like in practice, the curated code samples here may help; you can also explore further usage examples from the apache_beam module itself.
Seven code examples of the apache_beam.Keys method are shown below, sorted by popularity by default.
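Before the examples, here is a minimal, self-contained sketch (not taken from any of the examples below) of what apache_beam.Keys does: applied to a PCollection of key/value pairs, it keeps only the first element of each pair.

import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to

with beam.Pipeline() as p:
  keys = (p
          | beam.Create([('a', 1), ('b', 2), ('a', 3)])
          | beam.Keys())  # drops the values, keeping 'a', 'b', 'a'
  assert_that(keys, equal_to(['a', 'b', 'a']))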
Example 1: expand
# Required module: import apache_beam [as alias]
# Or: from apache_beam import Keys [as alias]
def expand(self, sample_map):
return (sample_map
            | 'GetListOfSamples' >> beam.Keys()
| 'CountAllUniqueSamples' >> beam.combiners.Count.Globally())
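A hedged usage sketch for Example 1: the expand above presumably lives in a beam.PTransform subclass (the wrapper class name and sample IDs below are hypothetical, not taken from the original project). Because the input is a map keyed by sample, counting its keys yields the number of samples.

class CountSamples(beam.PTransform):  # hypothetical wrapper class
  def expand(self, sample_map):
    return (sample_map
            | 'GetListOfSamples' >> beam.Keys()
            | 'CountAllUniqueSamples' >> beam.combiners.Count.Globally())

with beam.Pipeline() as p:
  _ = (p
       | beam.Create([('SAMPLE_1', ['call_a']), ('SAMPLE_2', ['call_b'])])
       | CountSamples()
       | beam.Map(print))  # prints 2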
Example 2: expand
# Required module: import apache_beam [as alias]
# Or: from apache_beam import Keys [as alias]
def expand(
self,
sliced_record_batchs: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
# Compute P(Y=y)
# _SlicedYKey(slice, y), _YRate(y_count, example_count)
y_rates = sliced_record_batchs | 'GetYRates' >> _GetYRates(
self._y_path, self._y_boundaries, self._weight_column_name)
y_keys = y_rates | 'ExtractYKeys' >> beam.Keys()
# Compute P(Y=y | X=x)
# _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count)
conditional_y_rates = ((sliced_record_batchs, y_keys)
| 'GetConditionalYRates' >> _GetConditionalYRates(
self._y_path, self._y_boundaries, self._x_paths,
self._min_x_count, self._weight_column_name))
return (
{
'conditional_y_rate': conditional_y_rates,
'y_rate': y_rates
}
| 'CoGroupByForLift' >> beam.CoGroupByKey()
| 'ComputeLifts' >> beam.FlatMap(_compute_lifts)
| 'FilterLifts' >> _FilterLifts(self._top_k_per_y, self._bottom_k_per_y)
| 'GroupLiftsForOutput' >> beam.GroupByKey()
| 'MakeProtos' >> beam.Map(_make_dataset_feature_stats_proto,
self._y_path, self._y_boundaries,
self._weight_column_name is not None,
self._output_custom_stats))
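Example 2 is part of TFDV's lift-statistics computation; its core is the CoGroupByKey join of two PCollections that share the same keys. A toy, hedged sketch of that join pattern, with plain string keys and illustrative rates in place of the _SlicedYKey and rate tuples above:

with beam.Pipeline() as p:
  y_rate = p | 'YRates' >> beam.Create([('y=1', 0.4)])
  conditional_y_rate = p | 'CondYRates' >> beam.Create([('y=1', 0.7), ('y=1', 0.2)])
  _ = ({'conditional_y_rate': conditional_y_rate, 'y_rate': y_rate}
       | beam.CoGroupByKey()  # -> ('y=1', {'conditional_y_rate': [0.7, 0.2], 'y_rate': [0.4]})
       | beam.Map(print))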
Example 3: test_invalid_row
# Required module: import apache_beam [as alias]
# Or: from apache_beam import Keys [as alias]
def test_invalid_row(self):
input_lines = ['1,2.0,hello', '5,12.34']
column_names = ['int_feature', 'float_feature', 'str_feature']
with self.assertRaisesRegex( # pylint: disable=g-error-prone-assert-raises
ValueError, '.*Columns do not match specified csv headers.*'):
with beam.Pipeline() as p:
result = (
p | beam.Create(input_lines, reshuffle=False)
| beam.ParDo(csv_decoder.ParseCSVLine(delimiter=','))
| beam.Keys()
| beam.CombineGlobally(
csv_decoder.ColumnTypeInferrer(
column_names, skip_blank_lines=False)))
beam_test_util.assert_that(result, lambda _: None)
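For contrast, a hedged happy-path sketch (reusing csv_decoder and beam_test_util from the test above; the row values are illustrative): when every row has as many columns as the declared headers, the same chain completes without raising.

def test_valid_rows(self):  # hypothetical companion test
  input_lines = ['1,2.0,hello', '5,12.34,world']
  column_names = ['int_feature', 'float_feature', 'str_feature']
  with beam.Pipeline() as p:
    result = (
        p | beam.Create(input_lines, reshuffle=False)
        | beam.ParDo(csv_decoder.ParseCSVLine(delimiter=','))
        | beam.Keys()
        | beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(column_names, skip_blank_lines=False)))
    beam_test_util.assert_that(result, lambda _: None)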
Example 4: _TrackDistinctSliceKeys
# Required module: import apache_beam [as alias]
# Or: from apache_beam import Keys [as alias]
def _TrackDistinctSliceKeys( # pylint: disable=invalid-name
slice_keys_and_values: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
"""Gathers slice key telemetry post slicing."""
def increment_counter(element): # pylint: disable=invalid-name
num_distinct_slice_keys = beam.metrics.Metrics.counter(
constants.METRICS_NAMESPACE, 'num_distinct_slice_keys')
num_distinct_slice_keys.inc(element)
return element
return (slice_keys_and_values
| 'ExtractSliceKeys' >> beam.Keys()
| 'RemoveDuplicates' >> beam.Distinct()
| 'Size' >> beam.combiners.Count.Globally()
| 'IncrementCounter' >> beam.Map(increment_counter))
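A hedged follow-up sketch: once the pipeline has run, the num_distinct_slice_keys counter incremented above can be read back from the PipelineResult (the pipeline variable is illustrative, and committed values may be unavailable on some runners).

from apache_beam.metrics.metric import MetricsFilter

result = pipeline.run()
result.wait_until_finish()
query = result.metrics().query(
    MetricsFilter().with_name('num_distinct_slice_keys'))
for counter in query['counters']:
  print(counter.key.metric.namespace, counter.key.metric.name, counter.committed)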
Example 5: IncrementSliceSpecCounters
# Required module: import apache_beam [as alias]
# Or: from apache_beam import Keys [as alias]
def IncrementSliceSpecCounters(pipeline: beam.Pipeline):
"""To track count of all slicing spec computed using TFMA."""
def _MakeAndIncrementCounters(slice_list):
for slice_key, slice_value in slice_list:
# LINT.IfChange
slice_name = 'slice_computed_%s_%s' % (slice_key, slice_value)
# LINT.ThenChange(../../../../learning/fairness/infra/plx/scripts/tfma_metrics_computed_tracker_macros.sql)
slice_counter = beam.metrics.Metrics.counter(constants.METRICS_NAMESPACE,
slice_name)
slice_counter.inc(1)
return (pipeline
| 'GetSliceCountKeys' >> beam.Keys()
| 'Count' >> beam.Map(_MakeAndIncrementCounters))
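Because the counter names in Example 5 are built dynamically from each slice key and value, a hedged sketch for reading them back filters by namespace rather than by a fixed name (building on the previous sketch; variable names are illustrative):

from apache_beam.metrics.metric import MetricsFilter

result = pipeline.run()
result.wait_until_finish()
query = result.metrics().query(
    MetricsFilter().with_namespace(constants.METRICS_NAMESPACE))
for counter in query['counters']:
  if counter.key.metric.name.startswith('slice_computed_'):
    print(counter.key.metric.name, counter.committed)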
Example 6: convert_csv_to_tf_examples
# Required module: import apache_beam [as alias]
# Or: from apache_beam import Keys [as alias]
def convert_csv_to_tf_examples(self, csv_path, tfrecords_output_path):
"""Runs a Beam pipeline to convert the CSV file into a TFRecords file.
This is needed because the conversion is orders of magnitude more
time-consuming than the functions we want to benchmark, so instead of
doing the conversion each time, we do it once to generate a converted
dataset and use that for the benchmark instead.
Args:
csv_path: Path to CSV file containing examples.
tfrecords_output_path: Path to output TFRecords file containing parsed
examples.
"""
# Copied from CSV example gen.
fp = open(csv_path, "r")
column_names = next(fp).strip().split(",")
fp.close()
with beam.Pipeline() as p:
parsed_csv_lines = (
p
| "ReadFromText" >> beam.io.ReadFromText(
file_pattern=csv_path, skip_header_lines=1)
|
"ParseCSVLine" >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=",")))
# TODO(b/155997704) clean this up once tfx_bsl makes a release.
if getattr(csv_decoder, "PARSE_CSV_LINE_YIELDS_RAW_RECORDS", False):
# parsed_csv_lines is the following tuple (parsed_lines, raw_records)
# we only want the parsed_lines.
parsed_csv_lines |= "ExtractParsedCSVLines" >> beam.Keys()
column_infos = beam.pvalue.AsSingleton(
parsed_csv_lines
| "InferColumnTypes" >> beam.CombineGlobally(
csv_decoder.ColumnTypeInferrer(
column_names, skip_blank_lines=True)))
_ = (
parsed_csv_lines
| "ToTFExample" >> beam.ParDo(
csv_exgen._ParsedCsvToTfExample(), # pylint: disable=protected-access
column_infos)
| "Serialize" >> beam.Map(lambda x: x.SerializeToString())
| "WriteToTFRecord" >> beam.io.tfrecordio.WriteToTFRecord(
file_path_prefix=tfrecords_output_path,
shard_name_template="",
compression_type=beam.io.filesystem.CompressionTypes.GZIP))
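A hedged sketch of consuming the benchmark dataset written above: the GZIP-compressed TFRecords can be read back and parsed into tf.train.Example protos (reusing tfrecords_output_path; the number of records printed is illustrative).

import tensorflow as tf

dataset = tf.data.TFRecordDataset(tfrecords_output_path, compression_type="GZIP")
for serialized in dataset.take(2):
  example = tf.train.Example.FromString(serialized.numpy())
  print(example)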
Example 7: _CsvToExample
# Required module: import apache_beam [as alias]
# Or: from apache_beam import Keys [as alias]
def _CsvToExample( # pylint: disable=invalid-name
pipeline: beam.Pipeline, exec_properties: Dict[Text, Any],
split_pattern: Text) -> beam.pvalue.PCollection:
"""Read CSV files and transform to TF examples.
Note that each input split will be transformed by this function separately.
Args:
pipeline: beam pipeline.
exec_properties: A dict of execution properties.
      - input_base: input dir that contains CSV data. CSV files must have a header line.
split_pattern: Split.pattern in Input config, glob relative file pattern
that maps to input files with root directory given by input_base.
Returns:
PCollection of TF examples.
Raises:
    RuntimeError: if the split matches no files or the CSV headers differ across files.
"""
input_base_uri = exec_properties[utils.INPUT_BASE_KEY]
csv_pattern = os.path.join(input_base_uri, split_pattern)
logging.info('Processing input csv data %s to TFExample.', csv_pattern)
csv_files = tf.io.gfile.glob(csv_pattern)
if not csv_files:
raise RuntimeError(
'Split pattern {} does not match any files.'.format(csv_pattern))
column_names = io_utils.load_csv_column_names(csv_files[0])
for csv_file in csv_files[1:]:
if io_utils.load_csv_column_names(csv_file) != column_names:
raise RuntimeError(
'Files in same split {} have different header.'.format(csv_pattern))
parsed_csv_lines = (
pipeline
| 'ReadFromText' >> beam.io.ReadFromText(
file_pattern=csv_pattern, skip_header_lines=1)
| 'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=',')))
# TODO(b/155997704) clean this up once tfx_bsl makes a release.
if getattr(csv_decoder, 'PARSE_CSV_LINE_YIELDS_RAW_RECORDS', False):
# parsed_csv_lines is the following tuple (parsed_lines, raw_records)
# we only want the parsed_lines.
parsed_csv_lines |= 'ExtractParsedCSVLines' >> beam.Keys()
column_infos = beam.pvalue.AsSingleton(
parsed_csv_lines
| 'InferColumnTypes' >> beam.CombineGlobally(
csv_decoder.ColumnTypeInferrer(column_names, skip_blank_lines=True)))
return (parsed_csv_lines
| 'ToTFExample' >> beam.ParDo(_ParsedCsvToTfExample(), column_infos))
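A hedged follow-up sketch: the PCollection of tf.train.Example protos returned by _CsvToExample could be serialized and persisted the same way Example 6 does (assuming examples holds the returned PCollection; the output path is illustrative).

_ = (examples
     | 'SerializeToString' >> beam.Map(lambda x: x.SerializeToString())
     | 'WriteToTFRecord' >> beam.io.tfrecordio.WriteToTFRecord(
         file_path_prefix='/tmp/csv_examples.tfrecord',
         shard_name_template='',
         compression_type=beam.io.filesystem.CompressionTypes.GZIP))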