This article collects typical usage examples of the apache_beam.Reshuffle method in Python. If you are wondering how apache_beam.Reshuffle is used in practice, or what real code that calls it looks like, the curated examples below may help. You can also explore further usage examples from the module it belongs to, apache_beam.
The following 13 code examples of apache_beam.Reshuffle are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
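Before the examples, a quick orientation: beam.Reshuffle redistributes the elements of a PCollection across workers and acts as a fusion barrier, preventing the runner from fusing an expensive downstream step onto whichever workers produced a skewed upstream output. A minimal, self-contained sketch (the numbers are illustrative; this runs on the default DirectRunner):

import apache_beam as beam

with beam.Pipeline() as p:  # DirectRunner by default.
  _ = (
      p
      | 'Create' >> beam.Create(range(10))             # 10 elements...
      | 'FanOut' >> beam.FlatMap(lambda n: [n] * 100)  # ...fan out to 1000.
      | 'Reshuffle' >> beam.Reshuffle()                # Fusion break; rebalance.
      | 'Count' >> beam.combiners.Count.Globally()
      | 'Print' >> beam.Map(print))                    # Prints 1000.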
Example 1: main
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def main(_):
  beam_utils.BeamInit()
  if not FLAGS.output_file_pattern:
    raise ValueError('Must provide an output_file_pattern')
  reader = beam.io.ReadFromTFRecord(
      FLAGS.input_file_pattern, coder=beam.coders.ProtoCoder(tf.train.Example))
  model_name = FLAGS.model_name
  split = FLAGS.split
  run_preprocessors = FLAGS.run_preprocessors
  with beam_utils.GetPipelineRoot() as root:
    _ = (
        root
        | 'Read' >> reader
        | 'ToTFExample' >> beam.ParDo(
            _ProcessShard(model_name, split, run_preprocessors))
        | 'Reshuffle' >> beam.Reshuffle()
        | 'Write' >> beam.io.WriteToTFRecord(
            FLAGS.output_file_pattern,
            coder=beam.coders.ProtoCoder(tf.train.Example)))
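The Reshuffle between the ParDo and the write is the classic fusion-break pattern: without it, a runner such as Dataflow may fuse _ProcessShard with the write, so the output inherits the input's sharding and a few large input shards can bottleneck the whole write.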
Example 2: construct_pipeline
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def construct_pipeline(pipeline, input_tfrecord, output_tfrecord, model_dir,
                       confidence_threshold, num_shards):
  """Returns a Beam pipeline to run object detection inference.

  Args:
    pipeline: Initialized beam pipeline.
    input_tfrecord: A TFRecord of tf.train.Example protos containing images.
    output_tfrecord: A TFRecord of tf.train.Example protos that contain images
      in the input TFRecord and the detections from the model.
    model_dir: Path to `saved_model` to use for inference.
    confidence_threshold: Threshold to use when keeping detection results.
    num_shards: The number of output shards.
  """
  input_collection = (
      pipeline | 'ReadInputTFRecord' >> beam.io.tfrecordio.ReadFromTFRecord(
          input_tfrecord,
          coder=beam.coders.BytesCoder()))
  output_collection = input_collection | 'RunInference' >> beam.ParDo(
      GenerateDetectionDataFn(model_dir, confidence_threshold))
  output_collection = output_collection | 'Reshuffle' >> beam.Reshuffle()
  _ = output_collection | 'WritetoDisk' >> beam.io.tfrecordio.WriteToTFRecord(
      output_tfrecord,
      num_shards=num_shards,
      coder=beam.coders.ProtoCoder(tf.train.Example))
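For reference, a construct_pipeline of this shape is typically driven by a small main that builds the pipeline and runs it. A sketch under assumed values (the paths and threshold below are placeholders, not part of the example above):

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

p = beam.Pipeline(options=PipelineOptions())  # Populate options from argv in a real run.
construct_pipeline(
    p,
    input_tfrecord='/tmp/images.tfrecord',       # placeholder path
    output_tfrecord='/tmp/detections.tfrecord',  # placeholder path
    model_dir='/tmp/saved_model',                # placeholder path
    confidence_threshold=0.5,
    num_shards=10)
p.run().wait_until_finish()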
Example 3: expand
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def expand(self, pcoll):
  return (pcoll
          | 'InputFiles' >> beam.Create(self._input_files)
          | 'SplitSource' >> beam.FlatMap(bgzf_io.split_bgzf)
          | 'Reshuffle' >> beam.Reshuffle()
          | 'ReadBlock' >> beam.ParDo(self._read_records))
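Here bgzf_io.split_bgzf appears to emit many independent BGZF chunks per input file; the Reshuffle in the middle spreads those chunks across workers rather than leaving them on the worker that produced them, so the per-block reads in _read_records can run in parallel.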
Example 4: main
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  def pipeline(root):
    """Beam pipeline for preprocessing open images."""
    assert FLAGS.input_file_patterns
    assert FLAGS.output_dir
    assert FLAGS.output_name
    assert FLAGS.num_shards
    # Create Pipeline.
    tfrecords = []
    for i, file_pattern in enumerate(FLAGS.input_file_patterns.split(",")):
      logging.info("Reading TFRecords from %s", file_pattern)
      stage_name = "read_tfrecords_{}".format(i)
      tfrecords.append(
          root | stage_name >> beam.io.tfrecordio.ReadFromTFRecord(
              file_pattern, coder=beam.coders.ProtoCoder(tf.train.Example)))
    # pylint: disable=expression-not-assigned
    (tfrecords
     | "flatten" >> beam.Flatten()
     | "count_labels" >> beam.ParDo(CountLabelsDoFn())
     | "reshuffle" >> beam.Reshuffle()
     | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(FLAGS.output_dir, FLAGS.output_name),
         coder=beam.coders.ProtoCoder(tf.train.Example),
         num_shards=FLAGS.num_shards))
    # pylint: enable=expression-not-assigned

  pipeline.run()
  logging.info("Processing complete.")
Example 5: main
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  def pipeline(root):
    """Beam pipeline for preprocessing open images."""
    assert FLAGS.input_file_pattern
    assert FLAGS.output_dir
    assert FLAGS.output_name
    assert FLAGS.num_shards
    assert FLAGS.kepid_whitelist
    # Read label whitelist.
    kepid_whitelist = [int(kepid) for kepid in FLAGS.kepid_whitelist.split(",")]
    logging.info("Read Kepid whitelist with %d labels", len(kepid_whitelist))
    # Initialize DoFn.
    process_example = ProcessExampleDoFn(kepid_whitelist)
    # Create Pipeline.
    # pylint: disable=expression-not-assigned
    (root
     | "read_tfrecord" >> beam.io.tfrecordio.ReadFromTFRecord(
         FLAGS.input_file_pattern,
         coder=beam.coders.ProtoCoder(tf.train.Example))
     | "process_examples" >> beam.ParDo(process_example)
     | "reshuffle" >> beam.Reshuffle()
     | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(FLAGS.output_dir, FLAGS.output_name),
         coder=beam.coders.ProtoCoder(tf.train.Example),
         num_shards=FLAGS.num_shards))
    # pylint: enable=expression-not-assigned

  pipeline.run()
  logging.info("Processing complete.")
Example 6: wiki_pipeline
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def wiki_pipeline():
  """Read WikiText103 filenames and create Beam pipeline."""
  train_files = FLAGS.input_file + "/wiki.train.raw"
  dev_files = FLAGS.input_file + "/wiki.valid.raw"
  test_files = FLAGS.input_file + "/wiki.test.raw"

  def pipeline(root):
    """Beam pipeline for converting WikiText103 files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create([test_files])
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cpc." + FLAGS.format + ".test.tfrecord",
            num_shards=10))
    _ = (
        root | "Create dev files" >> beam.Create([dev_files])
        | "Read dev files" >> beam.FlatMap(read_file)
        | "dev Shuffle" >> beam.Reshuffle()
        | "Preproc dev docs" >> beam.FlatMap(preproc_doc)
        | "record dev Shuffle" >> beam.Reshuffle()
        | "Write to dev tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cpc." + FLAGS.format + ".dev.tfrecord",
            num_shards=10))
    _ = (
        root | "Create train files" >> beam.Create([train_files])
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cpc." + FLAGS.format + ".train.tfrecord",
            num_shards=100))
    return

  return pipeline
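Each split runs the same two-level shuffle: a Reshuffle right after reading decouples the read from the expensive preproc_doc step, and a second Reshuffle at the record level redistributes the preprocessed examples so the output shards are filled evenly.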
Example 7: ccnews_pipeline
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def ccnews_pipeline():
  """Read CCNews filenames and create Beam pipeline."""
  if FLAGS.dataset == "ccnews":
    data_filename = "ccnews.txt-%05d-of-01000"
    datasize = 1000
    testsize = 100
  else:
    data_filename = "wikipedia.txt-%05d-of-00500"
    datasize = 500
    testsize = 50
  train_files = [
      FLAGS.input_file + data_filename % i for i in range(datasize - testsize)
  ]
  # The last `testsize` shards are held out as the test set.
  test_files = [
      FLAGS.input_file + data_filename % i
      for i in range(datasize - testsize, datasize)
  ]

  def pipeline(root):
    """Beam pipeline for converting CCNews files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create(test_files)
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.test.tfrecord", num_shards=testsize))
    _ = (
        root | "Create train files" >> beam.Create(train_files)
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.train.tfrecord",
            num_shards=datasize - testsize))
    return

  return pipeline
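The train/test split here is by shard index: the last testsize input shards form the test set and the first datasize - testsize shards form the training set, which also matches the num_shards values passed to the two writes.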
Example 8: wiki_pipeline
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def wiki_pipeline():
  """Read WikiText103 filenames and create Beam pipeline."""
  train_files = FLAGS.input_file + "/wiki.train.raw"
  dev_files = FLAGS.input_file + "/wiki.valid.raw"
  test_files = FLAGS.input_file + "/wiki.test.raw"

  def pipeline(root):
    """Beam pipeline for converting WikiText103 files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create([test_files])
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + "." + FLAGS.format + ".test.tfrecord",
            num_shards=10))
    _ = (
        root | "Create dev files" >> beam.Create([dev_files])
        | "Read dev files" >> beam.FlatMap(read_file)
        | "dev Shuffle" >> beam.Reshuffle()
        | "Preproc dev docs" >> beam.FlatMap(preproc_doc)
        | "record dev Shuffle" >> beam.Reshuffle()
        | "Write to dev tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + "." + FLAGS.format + ".dev.tfrecord",
            num_shards=10))
    _ = (
        root | "Create train files" >> beam.Create([train_files])
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + "." + FLAGS.format + ".train.tfrecord",
            num_shards=100))
    return

  return pipeline
Example 9: expand
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def expand(self, pipeline):
  return (
      pipeline
      | beam.Create(self.files)
      | beam.FlatMap(self._emit_tokenized_examples)
      | beam.Reshuffle())  # Allows for additional parallelization.
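The expand above is a method of a beam.PTransform subclass. A minimal sketch of what the surrounding class might look like, with self.files and a stand-in whitespace tokenizer as assumed placeholders:

import apache_beam as beam

class TokenizeFiles(beam.PTransform):
  """Hypothetical transform wrapping the expand() shown above."""

  def __init__(self, files):
    super().__init__()
    self.files = files  # List of local file paths.

  def _emit_tokenized_examples(self, path):
    # Stand-in tokenizer: emit whitespace-separated tokens from each file.
    with open(path) as f:
      for line in f:
        yield from line.split()

  def expand(self, pipeline):
    return (
        pipeline
        | beam.Create(self.files)
        | beam.FlatMap(self._emit_tokenized_examples)
        | beam.Reshuffle())  # Allows for additional parallelization.

It is applied like any other transform: tokens = p | TokenizeFiles(['a.txt', 'b.txt']).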
Example 10: create_pipeline
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def create_pipeline(pipeline,
                    image_directory,
                    input_annotations_file,
                    output_tfrecord_prefix=None,
                    num_images_per_shard=200,
                    keep_bboxes=True):
  """Creates a beam pipeline for producing a COCO-CameraTraps Image dataset.

  Args:
    pipeline: Initialized beam pipeline.
    image_directory: Path to image directory.
    input_annotations_file: Path to a coco-cameratraps annotation file.
    output_tfrecord_prefix: Absolute path for tfrecord outputs. Final files
      will be named {output_tfrecord_prefix}@N.
    num_images_per_shard: The number of images to store in each shard.
    keep_bboxes: Whether to keep any bounding boxes that exist in the json
      file.
  """
  logging.info('Reading data from COCO-CameraTraps Dataset.')
  data = load_json_data(input_annotations_file)
  num_shards = int(np.ceil(float(len(data['images'])) / num_images_per_shard))
  image_examples = (
      pipeline
      | ('CreateCollections') >> beam.Create(
          [im['id'] for im in data['images']])
      | ('ParseImage') >> beam.ParDo(ParseImage(
          image_directory, data['images'], data['annotations'],
          data['categories'], keep_bboxes=keep_bboxes)))
  _ = (image_examples
       | ('Reshuffle') >> beam.Reshuffle()
       | ('WriteTfImageExample') >> beam.io.tfrecordio.WriteToTFRecord(
           output_tfrecord_prefix,
           num_shards=num_shards,
           coder=beam.coders.ProtoCoder(tf.train.Example)))
Example 11: construct_pipeline
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def construct_pipeline(pipeline, input_tfrecord, output_tfrecord, model_dir,
                       top_k_embedding_count, bottom_k_embedding_count,
                       num_shards):
  """Returns a beam pipeline to run object detection inference.

  Args:
    pipeline: Initialized beam pipeline.
    input_tfrecord: A TFRecord of tf.train.Example protos containing images.
    output_tfrecord: A TFRecord of tf.train.Example protos that contain images
      in the input TFRecord and the detections from the model.
    model_dir: Path to `saved_model` to use for inference.
    top_k_embedding_count: The number of high-confidence embeddings to store.
    bottom_k_embedding_count: The number of low-confidence embeddings to store.
    num_shards: The number of output shards.
  """
  input_collection = (
      pipeline | 'ReadInputTFRecord' >> beam.io.tfrecordio.ReadFromTFRecord(
          input_tfrecord,
          coder=beam.coders.BytesCoder()))
  output_collection = input_collection | 'ExtractEmbedding' >> beam.ParDo(
      GenerateEmbeddingDataFn(model_dir, top_k_embedding_count,
                              bottom_k_embedding_count))
  output_collection = output_collection | 'Reshuffle' >> beam.Reshuffle()
  _ = output_collection | 'WritetoDisk' >> beam.io.tfrecordio.WriteToTFRecord(
      output_tfrecord,
      num_shards=num_shards,
      coder=beam.coders.ProtoCoder(tf.train.Example))
Example 12: main
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def main(argv):
  del argv  # Unused.
  logging.set_verbosity(logging.INFO)

  def pipeline(root):
    """Beam pipeline for preprocessing Kepler events."""
    # Separately process and write each TCE dataset, and gather all the
    # results.
    configs = _parse_configs()
    subsets = {
        "train": [],
        "val": [],
        "test": [],
    }
    for config in configs:
      output_dir = os.path.join(FLAGS.output_dir, config.name)
      # Write the config.
      config_json = json.dumps(config, indent=2)
      logging.info(config_json)
      (root
       | "{}-create-config".format(config.name) >> beam.Create([config_json])
       | "{}-write_config".format(config.name) >> beam.io.WriteToText(
           os.path.join(output_dir, "config.json"),
           num_shards=1,
           shard_name_template=""))
      # Process TCEs and write each subset.
      results = _process_tces(root, config)
      for subset_name, subset_values in results:
        _write_subset(config.name, subset_name, subset_values)
        subsets[subset_name].append(subset_values)

    # Create one dataset comprising all TCE datasets.
    for subset_name, subset_values in subsets.items():
      combined_subset_values = (
          subset_values
          | "combined-{}-flatten".format(subset_name) >> beam.Flatten()
          | "combined-{}-count_labels".format(subset_name) >> beam.ParDo(
              _CountLabelsDoFn(prefix="combined-{}".format(subset_name)))
          | "combined-{}-reshuffle".format(subset_name) >> beam.Reshuffle())
      _write_subset("combined", subset_name, combined_subset_values)

  pipeline.run()
  logging.info("Preprocessing complete.")
Example 13: ccnews_pipeline
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def ccnews_pipeline():
  """Read BooksCorpus filenames and create Beam pipeline."""
  # Set a random seed for reproducibility.
  rng = random.Random(FLAGS.random_seed)
  # BooksCorpus is organized into directories of genre and files of books.
  # adventure-all.txt seems to contain all the adventure books in one file;
  # romance-all.txt is the same. None of the other directories have this,
  # so we skip both to avoid double counting those books.
  file_name_set = set()
  input_files_by_genre = collections.defaultdict(list)
  for path, _, fnames in tf.gfile.Walk(FLAGS.input_file):
    genre = path.split("/")[-1]
    for fname in fnames:
      if fname == "adventure-all.txt" or fname == "romance-all.txt":
        continue
      if fname in file_name_set:
        continue
      file_name_set.add(fname)
      input_files_by_genre[genre].append(path + "/" + fname)
  # Sort genres and iterate in order for reproducibility.
  train_files, test_files = [], []
  for genre, file_list in sorted(input_files_by_genre.items()):
    rng.shuffle(file_list)
    genre_size = len(file_list)
    test_size = int(FLAGS.test_size * genre_size)
    test_files.extend(file_list[:test_size])
    train_files.extend(file_list[test_size:])
    assert (len(file_list[:test_size]) + len(file_list[test_size:])
            == len(file_list))
  # Make sure there is no test/train overlap.
  for filename in train_files:
    assert filename not in test_files
  rng.shuffle(train_files)
  rng.shuffle(test_files)

  def pipeline(root):
    """Beam pipeline for converting BooksCorpus files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create(test_files)
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.test.tfrecord", num_shards=50))
    _ = (
        root | "Create train files" >> beam.Create(train_files)
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.train.tfrecord", num_shards=450))
    return

  return pipeline
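Several of the examples above (6, 7, 8, and 13) return a pipeline(root) function instead of running anything themselves. A sketch of how such a factory is typically handed to a runner; the direct-runner driver below is an assumption, not part of the original projects:

import apache_beam as beam

pipeline_fn = ccnews_pipeline()  # Build the factory; reads FLAGS up front.
with beam.Pipeline() as root:    # DirectRunner by default.
  pipeline_fn(root)              # Attach all stages to the pipeline.
# Leaving the context runs the pipeline and waits for it to finish.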