Python pipeline_options.PipelineOptions Method Code Examples

This article collects typical usage examples of the Python method apache_beam.options.pipeline_options.PipelineOptions. If you are wondering what pipeline_options.PipelineOptions does, how to call it, or simply want to see it in use, the curated code examples below should help. You can also explore other usage examples from its containing module, apache_beam.options.pipeline_options.


The following shows 15 code examples of the pipeline_options.PipelineOptions method, sorted by popularity by default.
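Before the individual examples, here is a minimal sketch of the two patterns that recur throughout them: reading every parsed flag as a dict with get_all_options(), and accessing typed option groups with view_as(). The flag values are placeholders, not taken from any example below.

# Minimal sketch of the two recurring PipelineOptions patterns (placeholder flag values).
from apache_beam.options.pipeline_options import (
    PipelineOptions, GoogleCloudOptions, StandardOptions)

options = PipelineOptions(['--project=my-project', '--runner=DirectRunner'])

# Pattern 1: read all parsed flags as a plain dict.
flags_dict = options.get_all_options()
print(flags_dict['project'])                        # 'my-project'

# Pattern 2: view the same options through a typed option group.
print(options.view_as(GoogleCloudOptions).project)  # 'my-project'
print(options.view_as(StandardOptions).runner)      # 'DirectRunner'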

Example 1: _process_pipeline_args

# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def _process_pipeline_args(self, pipeline_args):
    # type: (List[str]) -> None
    flags_dict = pipeline_options.PipelineOptions(
        pipeline_args).get_all_options()
    self._project = self._get_flag(flags_dict, 'project')
    self._region = self._get_flag(flags_dict, 'region')
    # TODO(bahsir2): Fix the error messages of _check_flag since
    # --worker_machine_type has dest='machine_type'.
    try:
      self._machine_type = self._get_flag(flags_dict, 'machine_type')
    except ValueError:
      self._machine_type = self._get_machine_type_from_fork()
    self._max_num_workers = self._get_flag(
        flags_dict, 'max_num_workers', 'num_workers')
    if self._max_num_workers <= 0:
      raise ValueError(
          '--max_num_workers and --num_workers should be positive numbers, '
          'got: {}'.format(self._max_num_workers)) 
Developer: googlegenomics, Project: gcp-variant-transforms, Lines: 20, Source file: vep_runner.py

Example 2: run

# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def run(args, input_subscription, output_table, window_interval):
    """Build and run the pipeline."""
    options = PipelineOptions(args, save_main_session=True, streaming=True)

    with beam.Pipeline(options=options) as pipeline:

        # Read the messages from PubSub and process them.
        messages = (
            pipeline
            | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
                subscription=input_subscription).with_output_types(bytes)
            | 'UTF-8 bytes to string' >> beam.Map(lambda msg: msg.decode('utf-8'))
            | 'Parse JSON messages' >> beam.Map(parse_json_message)
            | 'Fixed-size windows' >> beam.WindowInto(
                window.FixedWindows(int(window_interval), 0))
            | 'Add URL keys' >> beam.Map(lambda msg: (msg['url'], msg))
            | 'Group by URLs' >> beam.GroupByKey()
            | 'Get statistics' >> beam.Map(get_statistics))

        # Output the results into BigQuery table.
        _ = messages | 'Write to Big Query' >> beam.io.WriteToBigQuery(
            output_table, schema=SCHEMA) 
Developer: GoogleCloudPlatform, Project: python-docs-samples, Lines: 24, Source file: streaming_beam.py
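For context, a command-line entry point for run() above might look like the sketch below; the flag names and the argparse wiring are assumptions for illustration, not taken from the snippet.

# Hypothetical entry point for run() (flag names are assumptions).
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_subscription',
                        help='Pub/Sub subscription, e.g. projects/PROJECT/subscriptions/SUB')
    parser.add_argument('--output_table',
                        help='BigQuery table, e.g. PROJECT:DATASET.TABLE')
    parser.add_argument('--window_interval', default=60,
                        help='Window size in seconds')
    known_args, beam_args = parser.parse_known_args()
    # Unrecognized flags (e.g. --runner, --project) are forwarded to PipelineOptions.
    run(beam_args, known_args.input_subscription,
        known_args.output_table, known_args.window_interval)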

Example 3: _make_beam_pipeline

# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def _make_beam_pipeline(self) -> beam.Pipeline:
    """Makes beam pipeline."""
    pipeline_options = PipelineOptions(self._beam_pipeline_args)
    if pipeline_options.view_as(StandardOptions).runner:
      return beam.Pipeline(argv=self._beam_pipeline_args)

    # TODO(b/159468583): move this warning to Beam.
    direct_running_mode = pipeline_options.view_as(
        DirectOptions).direct_running_mode
    direct_num_workers = pipeline_options.view_as(
        DirectOptions).direct_num_workers
    if direct_running_mode == 'in_memory' and direct_num_workers != 1:
      absl.logging.warning(
          'If direct_num_workers is not equal to 1, direct_running_mode should '
          'be `multi_processing` or `multi_threading` instead of `in_memory` '
          'in order for it to have the desired worker parallelism effect.')

    return beam.Pipeline(
        options=pipeline_options, runner=fn_api_runner.FnApiRunner()) 
Developer: tensorflow, Project: tfx, Lines: 21, Source file: base_executor.py
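As a usage note, the in_memory warning above is driven by the direct-runner flags; a flags list such as the hypothetical one below (an assumption, not part of the snippet) would instead select multi-process execution with four workers.

# Hypothetical beam_pipeline_args (assumption for illustration): with no --runner
# flag set, _make_beam_pipeline falls through to the FnApiRunner branch, and the
# DirectOptions flags below request multi-process parallelism instead of in_memory.
beam_pipeline_args = [
    '--direct_running_mode=multi_processing',
    '--direct_num_workers=4',
]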

Example 4: run

# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def run():
    """Run Apache Beam pipeline to generate TFRecords for Survival Analysis."""

    flags = parse_arguments(sys.argv[1:])
    pipeline_args = get_pipeline_args(flags)

    options = pipeline_options.PipelineOptions(flags=[], **pipeline_args)
    options.view_as(pipeline_options.WorkerOptions).machine_type = (
        flags.machine_type)

    temp_dir = os.path.join(flags.output_dir, 'tmp')

    runner = 'DataflowRunner' if flags.cloud else 'DirectRunner'

    with beam.Pipeline(runner, options=options) as p:
        with tft_beam.Context(temp_dir=temp_dir):
            build_pipeline(p, flags) 
Developer: GoogleCloudPlatform, Project: professional-services, Lines: 19, Source file: preprocess.py
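Note that this example constructs PipelineOptions from keyword arguments rather than a flags list. A hypothetical pipeline_args dict for that kwargs-style construction (values are placeholders, not from the original project) could look like this:

# Hypothetical pipeline_args for the kwargs-style construction above.
pipeline_args = {
    'project': 'my-project',
    'region': 'us-central1',
    'temp_location': 'gs://my-bucket/tmp',
    'staging_location': 'gs://my-bucket/staging',
    'job_name': 'survival-analysis-preprocess',
}
options = pipeline_options.PipelineOptions(flags=[], **pipeline_args)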

Example 5: main

# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        source_config = relational_db.SourceConfiguration(
            drivername=db_args.drivername,
            host=db_args.host,
            port=db_args.port,
            database=db_args.database,
            username=db_args.username,
            password=db_args.password,
        )

        months = p | "Reading records from db" >> relational_db.ReadFromDB(
            source_config=source_config,
            table_name=db_args.table
        )
        months | 'Writing to stdout' >> beam.Map(print) 
Developer: mohaseeb, Project: beam-nuggets, Lines: 24, Source file: read_from_relational_db.py

Example 6: __init__

# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def __init__(self, inference_spec_type: model_spec_pb2.InferenceSpecType,
               pipeline_options: PipelineOptions):
    super(_RemotePredictDoFn, self).__init__(inference_spec_type)
    self._api_client = None

    project_id = (
        inference_spec_type.ai_platform_prediction_model_spec.project_id or
        pipeline_options.view_as(GoogleCloudOptions).project)
    if not project_id:
      raise ValueError('Either a non-empty project id or a project flag in '
                       'beam pipeline options needs to be provided.')

    model_name = (
        inference_spec_type.ai_platform_prediction_model_spec.model_name)
    if not model_name:
      raise ValueError('A non-empty model name must be provided.')

    version_name = (
        inference_spec_type.ai_platform_prediction_model_spec.version_name)
    name_spec = 'projects/{}/models/{}'
    # If version is not specified, the default version for a model is used.
    if version_name:
      name_spec += '/versions/{}'
    self._full_model_name = name_spec.format(project_id, model_name,
                                             version_name) 
Developer: tensorflow, Project: tfx-bsl, Lines: 27, Source file: run_inference.py

Example 7: _shard_variants

# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def _shard_variants(known_args, pipeline_args, pipeline_mode):
  # type: (argparse.Namespace, List[str], int) -> List[str]
  """Reads the variants and writes them to VCF shards.

  Returns:
   The VCF shards directory.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  shard_files_job_name = pipeline_common.generate_unique_name(
      _SHARD_VCF_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
  vcf_shards_output_dir = filesystems.FileSystems.join(
      known_args.annotation_output_dir, _SHARDS_FOLDER)
  with beam.Pipeline(options=options) as p:
    variants = _read_variants(known_args.all_patterns,
                              p,
                              known_args,
                              pipeline_mode,
                              pre_infer_headers=False,
                              keep_raw_sample_names=True)
    sample_ids = (variants
                  | 'CombineSampleIds' >>
                  combine_sample_ids.SampleIdsCombiner()
                  | 'CombineToList' >> beam.combiners.ToList())
    # TODO(tneymanov): Annotation pipeline currently stores sample IDs instead
    # of sample names in the sharded VCF files, which would lead to double
    # hashing of samples. Needs to be fixed ASAP.
    _ = (variants
         | 'DensifyVariants' >> densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(sample_ids))
         | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
             vcf_shards_output_dir,
             beam.pvalue.AsSingleton(sample_ids),
             known_args.number_of_variants_per_shard))

  return [vep_runner_util.format_dir_path(vcf_shards_output_dir) +
          _GCS_RECURSIVE_WILDCARD] 
Developer: googlegenomics, Project: gcp-variant-transforms, Lines: 40, Source file: vcf_to_bq.py

Example 8: _validate_annotation_pipeline_args

# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def _validate_annotation_pipeline_args(known_args, pipeline_args):
  match_results = filesystems.FileSystems.match(['{}*'.format(
      vep_runner_util.format_dir_path(known_args.annotation_output_dir))])
  if match_results and match_results[0].metadata_list:
    raise ValueError('Output directory {} already exists.'.format(
        known_args.annotation_output_dir))

  flags_dict = pipeline_options.PipelineOptions(pipeline_args).get_all_options()
  expected_flags = ['max_num_workers', 'num_workers']
  for flag in expected_flags:
    if flag in flags_dict and flags_dict[flag] > 0:
      return
  raise ValueError('Could not find any of {} with a valid value among pipeline '
                   'flags {}'.format(expected_flags, flags_dict)) 
Developer: googlegenomics, Project: gcp-variant-transforms, Lines: 16, Source file: vcf_to_bq.py

Example 9: generate_statistics_from_tfrecord

# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def generate_statistics_from_tfrecord(pipeline_args,  # type: List[str]
                                      data_location,  # type: str
                                      output_path,    # type: str
                                      stats_options   # type: StatsOptions
                                      ):
    # type: (...) ->  statistics_pb2.DatasetFeatureStatisticsList
    """
    Generate stats file from a tfrecord dataset using TFDV

    :param pipeline_args: un-parsed Dataflow arguments
    :param data_location: input data dir containing tfrecord files
    :param output_path: output path for the stats file
    :param stats_options: tfdv.StatsOptions for statistics generation settings
    :return: a DatasetFeatureStatisticsList proto.
    """
    assert_not_empty_string(data_location)
    assert_not_empty_string(output_path)

    args_in_snake_case = clean_up_pipeline_args(pipeline_args)
    pipeline_options = PipelineOptions(flags=args_in_snake_case)

    all_options = pipeline_options.get_all_options()

    if all_options["job_name"] is None:
        gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
        gcloud_options.job_name = "generatestats-%s" % str(int(time.time()))

    if all_options["setup_file"] is None:
        setup_file_path = create_setup_file()
        setup_options = pipeline_options.view_as(SetupOptions)
        setup_options.setup_file = setup_file_path

    input_files = os.path.join(data_location, "*.tfrecords*")
    return tfdv.generate_statistics_from_tfrecord(data_location=input_files,
                                                  output_path=output_path,
                                                  stats_options=stats_options,
                                                  pipeline_options=pipeline_options) 
Developer: spotify, Project: spotify-tensorflow, Lines: 39, Source file: tfdv.py

Example 10: run

# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def run(input_topic, output_path, window_size=1.0, pipeline_args=None):
    # `save_main_session` is set to true because some DoFns rely on
    # globally imported modules.
    pipeline_options = PipelineOptions(
        pipeline_args, streaming=True, save_main_session=True
    )

    with beam.Pipeline(options=pipeline_options) as pipeline:
        (
            pipeline
            | "Read PubSub Messages"
            >> beam.io.ReadFromPubSub(topic=input_topic)
            | "Window into" >> GroupWindowsIntoBatches(window_size)
            | "Write to GCS" >> beam.ParDo(WriteBatchesToGCS(output_path))
        ) 
Developer: GoogleCloudPlatform, Project: python-docs-samples, Lines: 17, Source file: PubSubToGCS.py

Example 11: main

# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def main(argv=None):
  known_args, pipeline_args = get_args(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  setup_options = pipeline_options.view_as(SetupOptions)
  setup_options.save_main_session = True
  pipeline.run(pipeline_options, known_args) 
Developer: GoogleCloudPlatform, Project: realtime-embeddings-matching, Lines: 8, Source file: run.py

Example 12: run_pipeline

# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def run_pipeline(input_pattern, output_table, pipeline_args):
  """Read the records from GCS and write them to BigQuery."""
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p |
       'match_files' >> beam.Create(f2pn.match_files(input_pattern)) |
       'to_records' >> beam.FlatMap(f2pn.map_file_to_records) |
       'parse_physionet_record' >> beam.Map(f2pn.parse_physionet_record) |
       'write' >> beam.io.Write(beam.io.BigQuerySink(
           output_table,
           schema='patient_id:INTEGER, record_number:INTEGER, note:STRING',
           write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
  result = p.run().wait_until_finish()
  logging.info('GCS to BigQuery result: %s', result) 
Developer: GoogleCloudPlatform, Project: healthcare-deid, Lines: 15, Source file: gcs_to_bigquery_lib.py

Example 13: run_pipeline

# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def run_pipeline(input_query, output_file, pipeline_args):
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p
       | 'read' >> beam.io.Read(beam.io.BigQuerySource(query=input_query))
       | 'to_physionet' >> beam.Map(map_to_physionet_record)
       | 'write' >> beam.io.WriteToText(output_file))
  result = p.run().wait_until_finish()

  logging.info('BigQuery to GCS result: %s', result) 
Developer: GoogleCloudPlatform, Project: healthcare-deid, Lines: 11, Source file: bigquery_to_gcs_lib.py

Example 14: run_pipeline

# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def run_pipeline(input_pattern, output_dir, mae_task_name, project,
                 pipeline_args):
  """Read the physionet records from GCS and write them out as MAE."""
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p |
       'match_files' >> beam.Create(f2pn.match_files(input_pattern)) |
       'to_records' >> beam.FlatMap(f2pn.map_phi_to_findings) |
       'generate_mae' >> beam.Map(mae.generate_mae, mae_task_name, {},
                                  ['patient_id', 'record_number']) |
       'write_mae' >> beam.Map(write_mae, project, output_dir)
      )
  result = p.run().wait_until_finish()
  logging.info('GCS to MAE result: %s', result)
Developer: GoogleCloudPlatform, Project: healthcare-deid, Lines: 15, Source file: physionet_to_mae_lib.py

Example 15: run_pipeline

# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def run_pipeline(input_pattern, output_table, pipeline_args):
  """Read the records from GCS and write them to BigQuery."""
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p |
       'match_files' >> beam.Create(f2pn.match_files(input_pattern)) |
       'to_records' >> beam.FlatMap(map_file_to_records) |
       'map_to_bq_inputs' >> beam.Map(map_to_bq_inputs) |
       'write' >> beam.io.Write(beam.io.BigQuerySink(
           output_table,
           schema='patient_id:INTEGER, note:STRING',
           write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
  result = p.run().wait_until_finish()
  logging.info('GCS to BigQuery result: %s', result) 
Developer: GoogleCloudPlatform, Project: healthcare-deid, Lines: 15, Source file: gcs_to_bigquery_lib.py


Note: The apache_beam.options.pipeline_options.PipelineOptions examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by their respective authors, and copyright remains with those authors; refer to each project's License before distributing or using the code. Do not reproduce without permission.