

Python apache_beam.Reshuffle Method Code Examples

This article collects typical usage examples of the apache_beam.Reshuffle method in Python. If you are wondering what apache_beam.Reshuffle does or how to use it, the curated code examples below may help. You can also explore other usage examples from the apache_beam module.


The following presents 13 code examples of the apache_beam.Reshuffle method, sorted by popularity.
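
Before diving into the examples, here is a minimal, self-contained sketch of the pattern they all share: a processing step followed by beam.Reshuffle(), which breaks fusion so the runner can redistribute elements across workers before the next stage. The element values and step names below are illustrative, not taken from any of the projects.

import apache_beam as beam

def expensive_transform(x):
  # Stand-in for real per-element work (decoding, inference, ...).
  return x * x

# Runs locally on the DirectRunner.
with beam.Pipeline() as p:
  _ = (
      p
      | 'Create' >> beam.Create(range(10))
      | 'Process' >> beam.Map(expensive_transform)
      | 'Reshuffle' >> beam.Reshuffle()  # Break fusion; rebalance elements.
      | 'Print' >> beam.Map(print))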

Example 1: main

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def main(_):
  beam_utils.BeamInit()

  if not FLAGS.output_file_pattern:
    raise ValueError('Must provide an output_file_pattern')

  reader = beam.io.ReadFromTFRecord(
      FLAGS.input_file_pattern, coder=beam.coders.ProtoCoder(tf.train.Example))

  model_name = FLAGS.model_name
  split = FLAGS.split
  run_preprocessors = FLAGS.run_preprocessors

  with beam_utils.GetPipelineRoot() as root:
    _ = (
        root
        | 'Read' >> reader
        | 'ToTFExample' >> beam.ParDo(
            _ProcessShard(model_name, split, run_preprocessors))
        | 'Reshuffle' >> beam.Reshuffle()
        | 'Write' >> beam.io.WriteToTFRecord(
            FLAGS.output_file_pattern,
            coder=beam.coders.ProtoCoder(tf.train.Example))) 
Author: tensorflow | Project: lingvo | Lines: 25 | Source: create_kitti_crop_dataset.py
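
_ProcessShard is defined elsewhere in create_kitti_crop_dataset.py and is not shown on this page. As rough orientation, a DoFn constructed this way usually looks like the following skeleton; the class shape is an assumption, and the pass-through body stands in for the real cropping logic.

import apache_beam as beam

class _ProcessShard(beam.DoFn):  # Hypothetical skeleton, not the lingvo implementation.

  def __init__(self, model_name, split, run_preprocessors):
    self._model_name = model_name
    self._split = split
    self._run_preprocessors = run_preprocessors

  def process(self, example):
    # `example` is a tf.train.Example decoded by the upstream ProtoCoder.
    # The real DoFn crops and preprocesses KITTI examples; this skeleton
    # simply passes the example through.
    yield example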

Example 2: construct_pipeline

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def construct_pipeline(pipeline, input_tfrecord, output_tfrecord, model_dir,
                       confidence_threshold, num_shards):
  """Returns a Beam pipeline to run object detection inference.

  Args:
    pipeline: Initialized beam pipeline.
    input_tfrecord: A TFRecord of tf.train.Example protos containing images.
    output_tfrecord: A TFRecord of tf.train.Example protos that contain images
      in the input TFRecord and the detections from the model.
    model_dir: Path to `saved_model` to use for inference.
    confidence_threshold: Threshold to use when keeping detection results.
    num_shards: The number of output shards.
  """
  input_collection = (
      pipeline | 'ReadInputTFRecord' >> beam.io.tfrecordio.ReadFromTFRecord(
          input_tfrecord,
          coder=beam.coders.BytesCoder()))
  output_collection = input_collection | 'RunInference' >> beam.ParDo(
      GenerateDetectionDataFn(model_dir, confidence_threshold))
  output_collection = output_collection | 'Reshuffle' >> beam.Reshuffle()
  _ = output_collection | 'WritetoDisk' >> beam.io.tfrecordio.WriteToTFRecord(
      output_tfrecord,
      num_shards=num_shards,
      coder=beam.coders.ProtoCoder(tf.train.Example)) 
Author: tensorflow | Project: models | Lines: 26 | Source: generate_detection_data.py
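
GenerateDetectionDataFn is defined elsewhere in generate_detection_data.py. An inference DoFn of this shape typically loads the model once per worker in setup() and filters results in process(); the sketch below assumes a TF2 SavedModel and passes elements through instead of running real inference.

import apache_beam as beam
import tensorflow as tf

class GenerateDetectionDataFn(beam.DoFn):  # Sketch only; see tensorflow/models for the real class.

  def __init__(self, model_dir, confidence_threshold):
    self._model_dir = model_dir
    self._threshold = confidence_threshold
    self._model = None

  def setup(self):
    # Load the SavedModel once per worker rather than once per element.
    self._model = tf.saved_model.load(self._model_dir)

  def process(self, serialized_example):
    example = tf.train.Example.FromString(serialized_example)
    # The real DoFn runs detection on the decoded image and keeps boxes whose
    # score exceeds self._threshold; this skeleton yields the example as-is.
    yield example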

Example 3: expand

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def expand(self, pcoll):
    return (pcoll
            | 'InputFiles' >> beam.Create(self._input_files)
            | 'SplitSource' >> beam.FlatMap(bgzf_io.split_bgzf)
            | 'Reshuffle' >> beam.Reshuffle()
            | 'ReadBlock' >> beam.ParDo(self._read_records)) 
Author: googlegenomics | Project: gcp-variant-transforms | Lines: 8 | Source: vcfio.py
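
This expand belongs to a composite PTransform applied at the pipeline root: files are split into BGZF block ranges, reshuffled so the ranges spread across workers, and only then read. A generic sketch of the same split-shuffle-read pattern (the class and parameter names are illustrative, not the gcp-variant-transforms API):

import apache_beam as beam

class ReadShardedRecords(beam.PTransform):  # Illustrative; not the gcp-variant-transforms class.

  def __init__(self, input_files, split_fn, read_fn):
    self._input_files = input_files
    self._split_fn = split_fn  # filename -> iterable of byte ranges
    self._read_fn = read_fn    # byte range -> iterable of records

  def expand(self, pbegin):
    # Applied at the pipeline root (a PBegin), mirroring the example above.
    return (pbegin
            | 'InputFiles' >> beam.Create(self._input_files)
            | 'SplitSource' >> beam.FlatMap(self._split_fn)
            | 'Reshuffle' >> beam.Reshuffle()  # Spread ranges across workers.
            | 'ReadBlock' >> beam.FlatMap(self._read_fn))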

Example 4: main

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  def pipeline(root):
    """Beam pipeline for preprocessing open images."""
    assert FLAGS.input_file_patterns
    assert FLAGS.output_dir
    assert FLAGS.output_name
    assert FLAGS.num_shards

    # Create Pipeline.
    tfrecords = []
    for i, file_pattern in enumerate(FLAGS.input_file_patterns.split(",")):
      logging.info("Reading TFRecords from %s", file_pattern)
      stage_name = "read_tfrecords_{}".format(i)
      tfrecords.append(root | stage_name >> beam.io.tfrecordio.ReadFromTFRecord(
          file_pattern, coder=beam.coders.ProtoCoder(tf.train.Example)))

    # pylint: disable=expression-not-assigned
    (tfrecords
     | "flatten" >> beam.Flatten()
     | "count_labels" >> beam.ParDo(CountLabelsDoFn())
     | "reshuffle" >> beam.Reshuffle()
     | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(FLAGS.output_dir, FLAGS.output_name),
         coder=beam.coders.ProtoCoder(tf.train.Example),
         num_shards=FLAGS.num_shards))
    # pylint: enable=expression-not-assigned

  # The nested `pipeline` function expects a pipeline root; run it locally.
  with beam.Pipeline() as root:
    pipeline(root)
  logging.info("Processing complete.")
Author: google-research | Project: exoplanet-ml | Lines: 34 | Source: beam_reshuffle.py
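
CountLabelsDoFn is not shown here. A plausible pass-through implementation counts labels with Beam metrics while yielding each example unchanged; the 'label' feature name below is an assumption for illustration.

import apache_beam as beam
from apache_beam.metrics import Metrics

class CountLabelsDoFn(beam.DoFn):  # Sketch; the real DoFn lives in exoplanet-ml.

  def process(self, example):
    # `example` is a tf.train.Example; 'label' is an assumed feature name.
    values = example.features.feature['label'].int64_list.value
    label = values[0] if values else 'missing'
    Metrics.counter('count_labels', 'label_{}'.format(label)).inc()
    yield example  # Pass through unchanged so the pipeline can keep writing.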

Example 5: main

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  def pipeline(root):
    """Beam pipeline for preprocessing open images."""
    assert FLAGS.input_file_pattern
    assert FLAGS.output_dir
    assert FLAGS.output_name
    assert FLAGS.num_shards
    assert FLAGS.kepid_whitelist

    # Read label whitelist.
    kepid_whitelist = [int(kepid) for kepid in FLAGS.kepid_whitelist.split(",")]
    logging.info("Read Kepid whitelist with %d labels", len(kepid_whitelist))

    # Initialize DoFn.
    process_example = ProcessExampleDoFn(kepid_whitelist)

    # Create Pipeline.
    # pylint: disable=expression-not-assigned
    (root
     | "read_tfrecord" >> beam.io.tfrecordio.ReadFromTFRecord(
         FLAGS.input_file_pattern,
         coder=beam.coders.ProtoCoder(tf.train.Example))
     | "process_examples" >> beam.ParDo(process_example)
     | "reshuffle" >> beam.Reshuffle()
     | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(FLAGS.output_dir, FLAGS.output_name),
         coder=beam.coders.ProtoCoder(tf.train.Example),
         num_shards=FLAGS.num_shards))
    # pylint: enable=expression-not-assigned

  # The nested `pipeline` function expects a pipeline root; run it locally.
  with beam.Pipeline() as root:
    pipeline(root)
  logging.info("Processing complete.")
Author: google-research | Project: exoplanet-ml | Lines: 37 | Source: beam_sample_tfrecord.py
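
ProcessExampleDoFn receives the whitelist at construction time. A hedged sketch of a filtering DoFn, assuming each tf.train.Example stores its Kepler ID in an int64 feature named 'kepid' (the feature name is an assumption):

import apache_beam as beam

class ProcessExampleDoFn(beam.DoFn):  # Sketch; not the exoplanet-ml implementation.

  def __init__(self, kepid_whitelist):
    self._kepid_whitelist = set(kepid_whitelist)

  def process(self, example):
    # 'kepid' is an assumed feature name for the Kepler ID.
    kepid = example.features.feature['kepid'].int64_list.value[0]
    if kepid in self._kepid_whitelist:
      yield example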

Example 6: wiki_pipeline

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def wiki_pipeline():
  """Read WikiText103 filenames and create Beam pipeline."""

  train_files = FLAGS.input_file + "/wiki.train.raw"
  dev_files = FLAGS.input_file + "/wiki.valid.raw"
  test_files = FLAGS.input_file + "/wiki.test.raw"

  def pipeline(root):
    """Beam pipeline for converting WikiText103 files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create([test_files])
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cpc." + FLAGS.format + ".test.tfrecord",
            num_shards=10))
    _ = (
        root | "Create dev files" >> beam.Create([dev_files])
        | "Read dev files" >> beam.FlatMap(read_file)
        | "dev Shuffle" >> beam.Reshuffle()
        | "Preproc dev docs" >> beam.FlatMap(preproc_doc)
        | "record dev Shuffle" >> beam.Reshuffle()
        | "Write to dev tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cpc." + FLAGS.format + ".dev.tfrecord",
            num_shards=10))
    _ = (
        root | "Create train files" >> beam.Create([train_files])
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cpc." + FLAGS.format + ".train.tfrecord",
            num_shards=100))
    return

  return pipeline 
Author: google-research | Project: language | Lines: 41 | Source: wiki_preproc_pipeline.py
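
read_file and preproc_doc are helpers defined elsewhere in wiki_preproc_pipeline.py. Note that the two Reshuffles serve different purposes: the first spreads whole files across workers before the expensive preprocessing, and the second randomizes record order before the sharded write. A minimal sketch of what a read_file-style helper might look like (its exact contract is an assumption):

import tensorflow as tf

def read_file(filename):  # Sketch; assumed to yield one raw document line at a time.
  with tf.io.gfile.GFile(filename) as f:
    for line in f:
      line = line.strip()
      if line:
        yield line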

Example 7: ccnews_pipeline

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def ccnews_pipeline():
  """Read CCNews filenames and create Beam pipeline."""

  if FLAGS.dataset == "ccnews":
    data_filename = "ccnews.txt-%05d-of-01000"
    datasize = 1000
    testsize = 100
  else:
    data_filename = "wikipedia.txt-%05d-of-00500"
    datasize = 500
    testsize = 50
  train_files = [
      FLAGS.input_file + data_filename % i for i in range(datasize - testsize)
  ]
  test_files = [
      FLAGS.input_file + data_filename % i
      for i in range(datasize - testsize, datasize)  # Last `testsize` files.
  ]

  def pipeline(root):
    """Beam pipeline for converting CCNews files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create(test_files)
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.test.tfrecord", num_shards=testsize))
    _ = (
        root | "Create train files" >> beam.Create(train_files)
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.train.tfrecord",
            num_shards=datasize - testsize))
    return

  return pipeline 
Author: google-research | Project: language | Lines: 43 | Source: ccnews_preproc_pipeline.py
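
The test_files comprehension originally iterated over range(datasize - testsize, testsize), which is empty for both datasets (for example, range(900, 100)); it is corrected above to range(datasize - testsize, datasize). A quick sanity check of the corrected split:

datasize, testsize = 1000, 100  # The ccnews branch above.
train_ids = list(range(datasize - testsize))           # Shards 0..899.
test_ids = list(range(datasize - testsize, datasize))  # Shards 900..999.
assert len(test_ids) == testsize
assert not set(train_ids) & set(test_ids)  # No train/test overlap.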

Example 8: wiki_pipeline

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def wiki_pipeline():
  """Read WikiText103 filenames and create Beam pipeline."""

  train_files = FLAGS.input_file + "/wiki.train.raw"
  dev_files = FLAGS.input_file + "/wiki.valid.raw"
  test_files = FLAGS.input_file + "/wiki.test.raw"

  def pipeline(root):
    """Beam pipeline for converting WikiText103 files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create([test_files])
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + "." + FLAGS.format + ".test.tfrecord",
            num_shards=10))
    _ = (
        root | "Create dev files" >> beam.Create([dev_files])
        | "Read dev files" >> beam.FlatMap(read_file)
        | "dev Shuffle" >> beam.Reshuffle()
        | "Preproc dev docs" >> beam.FlatMap(preproc_doc)
        | "record dev Shuffle" >> beam.Reshuffle()
        | "Write to dev tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + "." + FLAGS.format + ".dev.tfrecord",
            num_shards=10))
    _ = (
        root | "Create train files" >> beam.Create([train_files])
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + "." + FLAGS.format + ".train.tfrecord",
            num_shards=100))
    return

  return pipeline 
Author: google-research | Project: language | Lines: 41 | Source: wiki_preproc_pipeline.py

Example 9: expand

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def expand(self, pipeline):
    return (
        pipeline
        | beam.Create(self.files)
        | beam.FlatMap(self._emit_tokenized_examples)
        | beam.Reshuffle())  # Allows for additional parallelization. 
Author: google-research | Project: text-to-text-transfer-transformer | Lines: 8 | Source: cache_tasks_main.py
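
This expand comes from a composite PTransform in cache_tasks_main.py; the enclosing class is not shown. A sketch of how such a transform could be declared and applied (the class name and the trivial _emit_tokenized_examples body are assumptions):

import apache_beam as beam

class EmitTokenizedExamples(beam.PTransform):  # Hypothetical name for the enclosing class.

  def __init__(self, files):
    self.files = files

  def _emit_tokenized_examples(self, path):
    # The real helper tokenizes the file contents; this sketch yields the path.
    yield path

  def expand(self, pipeline):
    return (
        pipeline
        | beam.Create(self.files)
        | beam.FlatMap(self._emit_tokenized_examples)
        | beam.Reshuffle())  # Allows for additional parallelization.

# Applied at the pipeline root:
with beam.Pipeline() as p:
  _ = p | 'CacheTask' >> EmitTokenizedExamples(['a.txt', 'b.txt'])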

Example 10: create_pipeline

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def create_pipeline(pipeline,
                    image_directory,
                    input_annotations_file,
                    output_tfrecord_prefix=None,
                    num_images_per_shard=200,
                    keep_bboxes=True):
  """Creates a beam pipeline for producing a COCO-CameraTraps Image dataset.

  Args:
    pipeline: Initialized beam pipeline.
    image_directory: Path to the image directory.
    input_annotations_file: Path to a COCO-CameraTraps annotation file.
    output_tfrecord_prefix: Absolute path for tfrecord outputs. Final files will
      be named {output_tfrecord_prefix}@N.
    num_images_per_shard: The number of images to store in each shard.
    keep_bboxes: Whether to keep any bounding boxes that exist in the JSON file.
  """

  logging.info('Reading data from COCO-CameraTraps Dataset.')

  data = load_json_data(input_annotations_file)

  num_shards = int(np.ceil(float(len(data['images']))/num_images_per_shard))

  image_examples = (
      pipeline | ('CreateCollections') >> beam.Create(
          [im['id'] for im in data['images']])
      | ('ParseImage') >> beam.ParDo(ParseImage(
          image_directory, data['images'], data['annotations'],
          data['categories'], keep_bboxes=keep_bboxes)))
  _ = (image_examples
       | ('Reshuffle') >> beam.Reshuffle()
       | ('WriteTfImageExample') >> beam.io.tfrecordio.WriteToTFRecord(
           output_tfrecord_prefix,
           num_shards=num_shards,
           coder=beam.coders.ProtoCoder(tf.train.Example))) 
Author: tensorflow | Project: models | Lines: 38 | Source: create_cococameratraps_tfexample_main.py
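
A worked instance of the shard computation above, for concreteness:

import numpy as np

# 1050 images at 200 images per shard -> 6 shards (5 full, 1 partial).
num_images = 1050
num_images_per_shard = 200
num_shards = int(np.ceil(float(num_images) / num_images_per_shard))
assert num_shards == 6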

Example 11: construct_pipeline

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def construct_pipeline(pipeline, input_tfrecord, output_tfrecord, model_dir,
                       top_k_embedding_count, bottom_k_embedding_count,
                       num_shards):
  """Returns a beam pipeline to run object detection inference.

  Args:
    pipeline: Initialized beam pipeline.
    input_tfrecord: A TFRecord of tf.train.Example protos containing images.
    output_tfrecord: A TFRecord of tf.train.Example protos that contain images
      in the input TFRecord and the embeddings extracted by the model.
    model_dir: Path to `saved_model` to use for inference.
    top_k_embedding_count: The number of high-confidence embeddings to store.
    bottom_k_embedding_count: The number of low-confidence embeddings to store.
    num_shards: The number of output shards.
  """
  input_collection = (
      pipeline | 'ReadInputTFRecord' >> beam.io.tfrecordio.ReadFromTFRecord(
          input_tfrecord,
          coder=beam.coders.BytesCoder()))
  output_collection = input_collection | 'ExtractEmbedding' >> beam.ParDo(
      GenerateEmbeddingDataFn(model_dir, top_k_embedding_count,
                              bottom_k_embedding_count))
  output_collection = output_collection | 'Reshuffle' >> beam.Reshuffle()
  _ = output_collection | 'WritetoDisk' >> beam.io.tfrecordio.WriteToTFRecord(
      output_tfrecord,
      num_shards=num_shards,
      coder=beam.coders.ProtoCoder(tf.train.Example)) 
Author: tensorflow | Project: models | Lines: 29 | Source: generate_embedding_data.py

Example 12: main

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def main(argv):
  del argv  # Unused.
  logging.set_verbosity(logging.INFO)

  def pipeline(root):
    """Beam pipeline for preprocessing Kepler events."""
    # Separately process and write each TCE dataset, and gather all the results.
    configs = _parse_configs()
    subsets = {
        "train": [],
        "val": [],
        "test": [],
    }
    for config in configs:
      output_dir = os.path.join(FLAGS.output_dir, config.name)
      # Write the config.
      config_json = json.dumps(config, indent=2)
      logging.info(config_json)
      (root
       | "{}-create-config".format(config.name) >> beam.Create([config_json])
       | "{}-write_config".format(config.name) >> beam.io.WriteToText(
           os.path.join(output_dir, "config.json"),
           num_shards=1,
           shard_name_template=""))
      # Process TCEs and write each subset.
      results = _process_tces(root, config)
      for subset_name, subset_values in results:
        _write_subset(config.name, subset_name, subset_values)
        subsets[subset_name].append(subset_values)

    # Create one dataset comprising all TCE datasets.
    for subset_name, subset_values in subsets.items():
      combined_subset_values = (
          subset_values
          | "combined-{}-flatten".format(subset_name) >> beam.Flatten()
          | "combined-{}-count_labels".format(subset_name) >> beam.ParDo(
              _CountLabelsDoFn(prefix="combined-{}".format(subset_name)))
          | "combined-{}-reshuffle".format(subset_name) >> beam.Reshuffle())
      _write_subset("combined", subset_name, combined_subset_values)

  # The nested `pipeline` function expects a pipeline root; run it locally.
  with beam.Pipeline() as root:
    pipeline(root)
  logging.info("Preprocessing complete.")
Author: google-research | Project: exoplanet-ml | Lines: 44 | Source: beam_prepare_embedding_inputs.py
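
The combined dataset works because a Python list of PCollections can be flattened into one before the Reshuffle. A minimal demonstration of that pattern (values are illustrative):

import apache_beam as beam

with beam.Pipeline() as p:
  part_a = p | 'A' >> beam.Create([1, 2])
  part_b = p | 'B' >> beam.Create([3, 4])
  merged = [part_a, part_b] | 'FlattenParts' >> beam.Flatten()
  _ = merged | 'Reshuffle' >> beam.Reshuffle() | beam.Map(print)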

Example 13: ccnews_pipeline

# Required import: import apache_beam [as alias]
# Or: from apache_beam import Reshuffle [as alias]
def ccnews_pipeline():
  """Read Books Corpus filenames and create Beam pipeline."""

  # Set a random seed for reproducibility.
  rng = random.Random(FLAGS.random_seed)

  # BooksCorpus is organized into directories by genre, with one file per book.
  # adventure-all.txt appears to contain all the adventure books in one file,
  # and romance-all.txt is the same. No other directory has such a file, so we
  # skip both to avoid double-counting those books.
  file_name_set = set()
  input_files_by_genre = collections.defaultdict(list)
  for path, _, fnames in tf.gfile.Walk(FLAGS.input_file):
    genre = path.split("/")[-1]
    for fname in fnames:
      if fname == "adventure-all.txt" or fname == "romance-all.txt":
        continue
      if fname in file_name_set:
        continue
      file_name_set.add(fname)
      input_files_by_genre[genre].append(path + "/" + fname)

  # Sort genres and iterate in order for reproducibility.
  train_files, test_files = [], []
  for genre, file_list in sorted(input_files_by_genre.items()):
    rng.shuffle(file_list)
    genre_size = len(file_list)
    test_size = int(FLAGS.test_size * genre_size)
    test_files.extend(file_list[:test_size])
    train_files.extend(file_list[test_size:])
    assert len(file_list[:test_size]) + \
        len(file_list[test_size:]) == len(file_list)

  # make sure there is no test train overlap
  for filename in train_files:
    assert filename not in test_files

  rng.shuffle(train_files)
  rng.shuffle(test_files)

  def pipeline(root):
    """Beam pipeline for converting CCNews files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create(test_files)
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.test.tfrecord", num_shards=50))
    _ = (
        root | "Create train files" >> beam.Create(train_files)
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.train.tfrecord", num_shards=450))
    return

  return pipeline 
Author: google-research | Project: language | Lines: 63 | Source: raw_books_preproc_pipeline.py
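
The seeded random.Random above is what makes the per-genre train/test split reproducible across runs. A small demonstration of that property (file names are made up):

import random

files_a = ['b1.txt', 'b2.txt', 'b3.txt', 'b4.txt']
files_b = list(files_a)
random.Random(42).shuffle(files_a)
random.Random(42).shuffle(files_b)
assert files_a == files_b  # Same seed -> same shuffled order on every run.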


Note: The apache_beam.Reshuffle method examples in this article were compiled by 纯净天空 from open-source code hosted on platforms such as GitHub and MSDocs. The code snippets are drawn from open-source projects contributed by their respective authors; copyright in the source code remains with the original authors, and distribution or reuse should follow each project's License. Do not reproduce this article without permission.