This article collects typical usage examples of the apache_beam.Partition method in Python. If you have been wondering what apache_beam.Partition does, how to call it, or what real uses of it look like, the curated code samples below may help. You can also explore further usage examples from the apache_beam module.
The following presents 9 code examples of the apache_beam.Partition method, ordered by popularity by default.
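Before diving in, here is a minimal, self-contained sketch (not drawn from the examples below; the element values and step labels are illustrative) of how beam.Partition is invoked: the partition function receives each element plus the number of partitions and must return an integer index in the range [0, num_partitions).

import apache_beam as beam

def even_odd_partition_fn(element, num_partitions):
  # Route even numbers to partition 0 and odd numbers to partition 1.
  return element % num_partitions

with beam.Pipeline() as pipeline:
  numbers = pipeline | beam.Create([1, 2, 3, 4, 5])
  evens, odds = numbers | beam.Partition(even_odd_partition_fn, 2)
  _ = evens | 'PrintEvens' >> beam.Map(lambda x: print('even:', x))
  _ = odds | 'PrintOdds' >> beam.Map(lambda x: print('odd:', x))

beam.Partition returns a tuple of PCollections, one per partition index, which is why it can be unpacked directly as shown above.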
Example 1: test_shard_variants

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Partition [as alias]
def test_shard_variants(self):
  expected_shards = self._get_expected_variant_shards()
  variants = [variant
              for variant_list in expected_shards.values()
              for variant in variant_list]
  sharding = variant_sharding.VariantSharding(
      'gcp_variant_transforms/data/sharding_configs/'
      'homo_sapiens_default.yaml')
  pipeline = TestPipeline()
  shards = (
      pipeline
      | Create(variants, reshuffle=False)
      | 'ShardVariants' >> beam.Partition(
          shard_variants.ShardVariants(sharding),
          sharding.get_num_shards()))
  for i in range(sharding.get_num_shards()):
    assert_that(shards[i], equal_to(expected_shards.get(i, [])),
                label=str(i))
  pipeline.run()
Example 2: split_data

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Partition [as alias]
def split_data(examples, train_fraction):
  """Splits the data into train/eval.

  Args:
    examples: A PCollection.
    train_fraction: fraction of examples to keep in the train set (float).
  """
  def partition_fn(data, n_partition):
    random_value = random.random()
    if random_value < train_fraction:
      return 0
    return 1

  examples_split = (examples
                    | "SplitData" >> beam.Partition(partition_fn, 2))
  return examples_split
Example 3: _PartitionFn

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Partition [as alias]
def _PartitionFn(
    record: Union[tf.train.Example, tf.train.SequenceExample, bytes],
    num_partitions: int,
    buckets: List[int],
    split_config: example_gen_pb2.SplitConfig,
) -> int:
  """Partition function for the ExampleGen's output splits."""
  assert num_partitions == len(
      buckets), 'Partitions do not match bucket number.'
  partition_str = _GeneratePartitionKey(record, split_config)
  bucket = int(hashlib.sha256(partition_str).hexdigest(), 16) % buckets[-1]
  # For example, if buckets is [10, 50, 80], there will be 3 splits:
  #   bucket >= 0 and < 10, returns 0
  #   bucket >= 10 and < 50, returns 1
  #   bucket >= 50 and < 80, returns 2
  return bisect.bisect(buckets, bucket)
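To make the bucketing comment in this example concrete, here is a small standalone check (bucket values chosen purely for illustration) of how bisect.bisect maps a hash-derived bucket onto a split index when buckets is [10, 50, 80]:

import bisect

buckets = [10, 50, 80]
for bucket in (3, 9, 10, 49, 50, 79):
  # bisect.bisect returns the number of boundaries <= bucket, i.e. the split index.
  print(bucket, '->', bisect.bisect(buckets, bucket))
# Prints: 3 -> 0, 9 -> 0, 10 -> 1, 49 -> 1, 50 -> 2, 79 -> 2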
Example 4: expand

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Partition [as alias]
def expand(self, pcoll):
  bq_rows = pcoll | 'ConvertToBigQueryTableRow' >> beam.ParDo(
      ConvertVariantToRow(
          self._bigquery_row_generator,
          self._allow_incompatible_records,
          self._omit_empty_sample_calls))
  if self._num_bigquery_write_shards > 1:
    # We split data into self._num_bigquery_write_shards random partitions
    # and then write each part to the final BQ table by appending them together.
    # Combined with the LimitWrite transform, this avoids the BQ failure.
    bq_row_partitions = bq_rows | beam.Partition(
        lambda _, n: random.randint(0, n - 1),
        self._num_bigquery_write_shards)
    bq_writes = []
    for i in range(self._num_bigquery_write_shards):
      bq_rows = (bq_row_partitions[i] | 'LimitWrite' + str(i) >>
                 limit_write.LimitWrite(_WRITE_SHARDS_LIMIT))
      bq_writes.append(
          bq_rows | 'WriteToBigQuery' + str(i) >>
          beam.io.Write(beam.io.BigQuerySink(
              self._output_table,
              schema=self._schema,
              create_disposition=(
                  beam.io.BigQueryDisposition.CREATE_NEVER),
              write_disposition=(
                  beam.io.BigQueryDisposition.WRITE_APPEND))))
    return bq_writes
  else:
    return (bq_rows
            | 'WriteToBigQuery' >> beam.io.Write(beam.io.BigQuerySink(
                self._output_table,
                schema=self._schema,
                create_disposition=(
                    beam.io.BigQueryDisposition.CREATE_NEVER),
                write_disposition=(
                    beam.io.BigQueryDisposition.WRITE_APPEND
                    if self._append
                    else beam.io.BigQueryDisposition.WRITE_EMPTY))))
Example 5: configure_pipeline

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Partition [as alias]
def configure_pipeline(p, dataset_train, dataset_eval, checkpoint_path, output_dir, job_id):
  source_train = _util.get_sources_from_dataset(p, dataset_train, 'train')
  labels_source = [source_train]
  if dataset_eval is not None:
    source_eval = _util.get_sources_from_dataset(p, dataset_eval, 'eval')
    labels_source.append(source_eval)
  labels = _labels_pipeline(labels_source)
  train_preprocessed = _transformation_pipeline(source_train, checkpoint_path, labels, 'train')
  if dataset_eval is not None:
    # Explicit eval data.
    eval_preprocessed = _transformation_pipeline(source_eval, checkpoint_path, labels, 'eval')
  else:
    # Split train/eval.
    train_preprocessed, eval_preprocessed = (train_preprocessed |
                                             'Random Partition' >>
                                             beam.Partition(TrainEvalSplitPartitionFn(), 2))
  output_train_path = os.path.join(output_dir, job_id, 'train')
  output_eval_path = os.path.join(output_dir, job_id, 'eval')
  labels_file = os.path.join(output_dir, job_id, 'labels')
  labels_save = (labels |
                 'Write labels' >>
                 beam.io.textio.WriteToText(labels_file, shard_name_template=''))
  train_save = train_preprocessed | 'Save train to disk' >> SaveFeatures(output_train_path)
  eval_save = eval_preprocessed | 'Save eval to disk' >> SaveFeatures(output_eval_path)
  # Make sure we write the "latest" file after train and eval data are successfully written.
  output_latest_file = os.path.join(output_dir, 'latest')
  ([eval_save, train_save, labels_save] | 'Wait for train eval saving' >> beam.Flatten() |
   'Fixed One' >> beam.transforms.combiners.Sample.FixedSizeGlobally(1) |
   beam.Map(lambda path: job_id) |
   'WriteLatest' >> beam.io.textio.WriteToText(output_latest_file, shard_name_template=''))
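TrainEvalSplitPartitionFn is not defined in this excerpt. Purely as a sketch of the shape such a class could take (the body and the 0.7 train fraction below are assumptions, not the original implementation), a Beam PartitionFn used this way might look like:

import random
import apache_beam as beam

class TrainEvalSplitPartitionFn(beam.PartitionFn):
  """Hypothetical sketch: randomly routes elements to train (0) or eval (1)."""

  def partition_for(self, element, num_partitions):
    # Assumed 70/30 train/eval split; the real fraction may differ.
    return 0 if random.random() < 0.7 else 1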
Example 6: testEachPTransformCopiedOnce

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Partition [as alias]
def testEachPTransformCopiedOnce(self):
  with beam.Pipeline() as p:
    created = p | 'Create1' >> beam.Create([(1, 'a'), (2, 'b')])
    modified1 = (created
                 | 'Transform1' >> beam.Map(
                     lambda x: DeepCopyTest._CountingIdentityFn(
                         'Transform1', x)))
    partition_fn = lambda element, partitions: element[0] % partitions
    p1, p2 = (modified1
              | 'Partition' >> beam.Partition(partition_fn, 2))
    merged = (p1, p2) | 'Flatten1' >> beam.Flatten()
    modified2 = (merged
                 | 'Transform2' >> beam.Map(
                     lambda x: DeepCopyTest._CountingIdentityFn(
                         'Transform2', x)))
    copied = deep_copy.deep_copy(modified2)

    # Check that deep copy was performed.
    self.assertIsNot(copied.producer.inputs[0], modified2.producer.inputs[0])
    self.assertIsNot(copied.producer.inputs[0].producer.inputs[0],
                     modified2.producer.inputs[0].producer.inputs[0])
    self.assertIsNot(copied.producer.inputs[0].producer.inputs[1],
                     modified2.producer.inputs[0].producer.inputs[1])

  # Check counts of processed items.
  self.assertEqual(DeepCopyTest._counts['Transform1'], 4)
  self.assertEqual(DeepCopyTest._counts['Transform2'], 4)
Example 7: _GeneratePartitionKey

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Partition [as alias]
def _GeneratePartitionKey(record: Union[tf.train.Example,
                                        tf.train.SequenceExample, bytes],
                          split_config: example_gen_pb2.SplitConfig) -> bytes:
  """Generates key for partition."""
  if not split_config.HasField('partition_feature_name'):
    if isinstance(record, bytes):
      return record
    return record.SerializeToString(deterministic=True)

  if isinstance(record, tf.train.Example):
    features = record.features.feature  # pytype: disable=attribute-error
  elif isinstance(record, tf.train.SequenceExample):
    features = record.context.feature  # pytype: disable=attribute-error
  else:
    raise RuntimeError('Split by `partition_feature_name` is only supported '
                       'for FORMAT_TF_EXAMPLE and FORMAT_TF_SEQUENCE_EXAMPLE '
                       'payload format.')

  # Use a feature for partitioning the examples.
  feature_name = split_config.partition_feature_name
  if feature_name not in features:
    raise RuntimeError('Feature name `{}` does not exist.'.format(feature_name))
  feature = features[feature_name]
  if not feature.HasField('kind'):
    raise RuntimeError('Partition feature does not contain any value.')
  if (not feature.HasField('bytes_list') and
      not feature.HasField('int64_list')):
    raise RuntimeError('Only `bytes_list` and `int64_list` features are '
                       'supported for partition.')
  return feature.SerializeToString(deterministic=True)
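For context, the snippet below (illustrative only; the feature name 'label' is an assumed example) shows the kind of per-record key this function produces when split_config.partition_feature_name points at an int64 feature:

import tensorflow as tf

example = tf.train.Example(features=tf.train.Features(feature={
    'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[1])),
}))
# Serializing only the selected feature yields a stable, deterministic partition
# key, analogous to the final return statement above.
partition_key = example.features.feature['label'].SerializeToString(deterministic=True)
print(partition_key)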
Example 8: _split_data

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Partition [as alias]
def _split_data(examples, train_fraction=constants.TRAIN_SIZE,
                val_fraction=constants.VAL_SIZE):
  """Splits the data into train/validation/test."""
  def partition_fn(*_):
    random_value = np.random.random()
    if random_value < train_fraction:
      return 0
    if random_value < train_fraction + val_fraction:
      return 1
    return 2

  examples_split = examples | "SplitData" >> beam.Partition(partition_fn, 3)
  return zip([constants.TRAIN, constants.VAL, constants.TEST], examples_split)
Example 9: expand

# Required module: import apache_beam [as alias]
# Or: from apache_beam import Partition [as alias]
def expand(self, sliced_extracts):

  def partition_fn(_, num_partitions):
    return self._random_state.randint(num_partitions)

  # Partition the data.
  # List[PCollection[Tuple[slicer.SliceKeyType, types.Extracts]]]
  partitions = (
      sliced_extracts
      | 'Partition' >> beam.Partition(partition_fn,
                                      self._num_jackknife_samples))

  def add_partition_index(slice_key,
                          accumulator_and_size,
                          partition_index=None):
    accumulator, size = accumulator_and_size
    return slice_key, _PartitionInfo(accumulator, size, partition_index)

  # Within each partition, partially combine per slice key to get accumulators
  # and partition sizes; add partition_id for determinism.
  # List[PCollection[slicer.SliceKeyType, _PartitionInfo]]
  partition_accumulators = []
  for i, partition in enumerate(partitions):
    partition_accumulators.append(
        partition
        | 'CombinePartition[{}]'.format(i) >> beam.CombinePerKey(
            beam.transforms.combiners.SingleInputTupleCombineFn(
                _AccumulateOnlyCombiner(combiner=self._combiner),
                beam.transforms.combiners.CountCombineFn()))
        | 'AddPartitionId[{}]'.format(i) >> beam.MapTuple(
            add_partition_index, i))

  # Group partitions for the same slice, compute LOO metrics, and flatten back
  # into per-partition LOO metrics.
  # (slicer.SliceKeyType, Tuple[metric_types.MetricsDict])
  return (partition_accumulators
          | 'FlattenPartitionAccumulators' >> beam.Flatten()
          | 'CollectPerSlicePartitions' >> beam.GroupByKey()
          | 'MakeJackknifeSamples' >> beam.FlatMap(
              _make_jackknife_samples, combiner=self._combiner))