This article collects typical usage examples of the Python method apache_beam.options.pipeline_options.PipelineOptions. If you are wondering what pipeline_options.PipelineOptions does, how to call it, or want to see it used in real code, the curated examples below should help. You can also explore the containing module, apache_beam.options.pipeline_options, for further usage examples.
The section below presents 15 code examples of pipeline_options.PipelineOptions, sorted by popularity by default.
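Before the examples, here is a minimal, self-contained sketch (not taken from any of the projects quoted below) of the typical PipelineOptions workflow: build the options from command-line style flags, then inspect them either as a flat dictionary via get_all_options() or through a typed view via view_as():

import apache_beam as beam
from apache_beam.options.pipeline_options import (
    GoogleCloudOptions, PipelineOptions, StandardOptions)

# The flag values below are placeholders for illustration only.
flags = ['--project=my-project', '--region=us-central1', '--runner=DirectRunner']
options = PipelineOptions(flags)

# get_all_options() flattens every registered option into a dict;
# flags that were not set come back as None.
all_options = options.get_all_options()
print(all_options['project'], all_options['region'])

# view_as() exposes the same values through a typed options class.
print(options.view_as(GoogleCloudOptions).project)
print(options.view_as(StandardOptions).runner)

# The options object is then passed to the pipeline.
with beam.Pipeline(options=options) as p:
    _ = p | beam.Create([1, 2, 3]) | 'Print' >> beam.Map(print)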
Example 1: _process_pipeline_args
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def _process_pipeline_args(self, pipeline_args):
  # type: (List[str]) -> None
  flags_dict = pipeline_options.PipelineOptions(
      pipeline_args).get_all_options()
  self._project = self._get_flag(flags_dict, 'project')
  self._region = self._get_flag(flags_dict, 'region')
  # TODO(bahsir2): Fix the error messages of _check_flag since
  # --worker_machine_type has dest='machine_type'.
  try:
    self._machine_type = self._get_flag(flags_dict, 'machine_type')
  except ValueError:
    self._machine_type = self._get_machine_type_from_fork()
  self._max_num_workers = self._get_flag(
      flags_dict, 'max_num_workers', 'num_workers')
  if self._max_num_workers <= 0:
    raise ValueError(
        '--max_num_workers and --num_workers should be positive numbers, '
        'got: {}'.format(self._max_num_workers))
Example 2: run
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def run(args, input_subscription, output_table, window_interval):
    """Build and run the pipeline."""
    options = PipelineOptions(args, save_main_session=True, streaming=True)

    with beam.Pipeline(options=options) as pipeline:
        # Read the messages from PubSub and process them.
        messages = (
            pipeline
            | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
                subscription=input_subscription).with_output_types(bytes)
            | 'UTF-8 bytes to string' >> beam.Map(lambda msg: msg.decode('utf-8'))
            | 'Parse JSON messages' >> beam.Map(parse_json_message)
            | 'Fixed-size windows' >> beam.WindowInto(
                window.FixedWindows(int(window_interval), 0))
            | 'Add URL keys' >> beam.Map(lambda msg: (msg['url'], msg))
            | 'Group by URLs' >> beam.GroupByKey()
            | 'Get statistics' >> beam.Map(get_statistics))

        # Output the results into BigQuery table.
        _ = messages | 'Write to Big Query' >> beam.io.WriteToBigQuery(
            output_table, schema=SCHEMA)
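Several of these examples forward extra keyword arguments (save_main_session, streaming) and leave application-specific flags to the caller. A common companion pattern, sketched below with made-up flag names that are not part of the example above, is to register custom flags on a PipelineOptions subclass via _add_argparse_args and read them back with view_as():

from apache_beam.options.pipeline_options import PipelineOptions

class CustomJobOptions(PipelineOptions):
    """Custom flags registered alongside the standard Beam options."""

    @classmethod
    def _add_argparse_args(cls, parser):
        # Hypothetical flags, named only for illustration.
        parser.add_argument('--input_subscription', default=None,
                            help='Hypothetical flag, for illustration only.')
        parser.add_argument('--output_table', default=None,
                            help='Hypothetical flag, for illustration only.')

options = PipelineOptions(
    ['--input_subscription=projects/p/subscriptions/s',
     '--output_table=dataset.table'],
    streaming=True, save_main_session=True)
custom = options.view_as(CustomJobOptions)
print(custom.input_subscription, custom.output_table)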
Example 3: _make_beam_pipeline
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def _make_beam_pipeline(self) -> beam.Pipeline:
  """Makes beam pipeline."""
  pipeline_options = PipelineOptions(self._beam_pipeline_args)
  if pipeline_options.view_as(StandardOptions).runner:
    return beam.Pipeline(argv=self._beam_pipeline_args)
  # TODO(b/159468583): move this warning to Beam.
  direct_running_mode = pipeline_options.view_as(
      DirectOptions).direct_running_mode
  direct_num_workers = pipeline_options.view_as(
      DirectOptions).direct_num_workers
  if direct_running_mode == 'in_memory' and direct_num_workers != 1:
    absl.logging.warning(
        'If direct_num_workers is not equal to 1, direct_running_mode should '
        'be `multi_processing` or `multi_threading` instead of `in_memory` '
        'in order for it to have the desired worker parallelism effect.')
  return beam.Pipeline(
      options=pipeline_options, runner=fn_api_runner.FnApiRunner())
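The DirectOptions values checked above are ordinary pipeline options, so the combination the warning refers to can be set (or corrected) through flags. A minimal sketch, independent of the TFX code above:

from apache_beam.options.pipeline_options import DirectOptions, PipelineOptions

options = PipelineOptions([
    '--direct_running_mode=multi_processing',
    '--direct_num_workers=4',
])
direct_options = options.view_as(DirectOptions)
# multi_processing / multi_threading let direct_num_workers > 1 take effect;
# in_memory always runs with a single worker.
print(direct_options.direct_running_mode, direct_options.direct_num_workers)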
Example 4: run
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def run():
  """Run Apache Beam pipeline to generate TFRecords for Survival Analysis."""
  flags = parse_arguments(sys.argv[1:])
  pipeline_args = get_pipeline_args(flags)

  options = pipeline_options.PipelineOptions(flags=[], **pipeline_args)
  options.view_as(pipeline_options.WorkerOptions).machine_type = (
      flags.machine_type)

  temp_dir = os.path.join(flags.output_dir, 'tmp')
  runner = 'DataflowRunner' if flags.cloud else 'DirectRunner'

  with beam.Pipeline(runner, options=options) as p:
    with tft_beam.Context(temp_dir=temp_dir):
      build_pipeline(p, flags)
Example 5: main
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        source_config = relational_db.SourceConfiguration(
            drivername=db_args.drivername,
            host=db_args.host,
            port=db_args.port,
            database=db_args.database,
            username=db_args.username,
            password=db_args.password,
        )

        months = p | "Reading records from db" >> relational_db.ReadFromDB(
            source_config=source_config,
            table_name=db_args.table
        )
        months | 'Writing to stdout' >> beam.Map(print)
Example 6: __init__
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def __init__(self, inference_spec_type: model_spec_pb2.InferenceSpecType,
             pipeline_options: PipelineOptions):
  super(_RemotePredictDoFn, self).__init__(inference_spec_type)
  self._api_client = None

  project_id = (
      inference_spec_type.ai_platform_prediction_model_spec.project_id or
      pipeline_options.view_as(GoogleCloudOptions).project)
  if not project_id:
    raise ValueError('Either a non-empty project id or project flag in '
                     ' beam pipeline options needs be provided.')

  model_name = (
      inference_spec_type.ai_platform_prediction_model_spec.model_name)
  if not model_name:
    raise ValueError('A non-empty model name must be provided.')

  version_name = (
      inference_spec_type.ai_platform_prediction_model_spec.version_name)
  name_spec = 'projects/{}/models/{}'
  # If version is not specified, the default version for a model is used.
  if version_name:
    name_spec += '/versions/{}'
  self._full_model_name = name_spec.format(project_id, model_name,
                                           version_name)
Example 7: _shard_variants
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def _shard_variants(known_args, pipeline_args, pipeline_mode):
  # type: (argparse.Namespace, List[str], int) -> List[str]
  """Reads the variants and writes them to VCF shards.

  Returns:
    The VCF shards directory.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  shard_files_job_name = pipeline_common.generate_unique_name(
      _SHARD_VCF_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
  vcf_shards_output_dir = filesystems.FileSystems.join(
      known_args.annotation_output_dir, _SHARDS_FOLDER)
  with beam.Pipeline(options=options) as p:
    variants = _read_variants(known_args.all_patterns,
                              p,
                              known_args,
                              pipeline_mode,
                              pre_infer_headers=False,
                              keep_raw_sample_names=True)
    sample_ids = (variants
                  | 'CombineSampleIds' >>
                  combine_sample_ids.SampleIdsCombiner()
                  | 'CombineToList' >> beam.combiners.ToList())
    # TODO(tneymanov): Annotation pipeline currently stores sample IDs instead
    # of sample names in the sharded VCF files, which would lead to double
    # hashing of samples. Needs to be fixed ASAP.
    _ = (variants
         | 'DensifyVariants' >> densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(sample_ids))
         | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
             vcf_shards_output_dir,
             beam.pvalue.AsSingleton(sample_ids),
             known_args.number_of_variants_per_shard))
  return [vep_runner_util.format_dir_path(vcf_shards_output_dir) +
          _GCS_RECURSIVE_WILDCARD]
Example 8: _validate_annotation_pipeline_args
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def _validate_annotation_pipeline_args(known_args, pipeline_args):
  match_results = filesystems.FileSystems.match(['{}*'.format(
      vep_runner_util.format_dir_path(known_args.annotation_output_dir))])
  if match_results and match_results[0].metadata_list:
    raise ValueError('Output directory {} already exists.'.format(
        known_args.annotation_output_dir))

  flags_dict = pipeline_options.PipelineOptions(pipeline_args).get_all_options()
  expected_flags = ['max_num_workers', 'num_workers']
  for flag in expected_flags:
    if flag in flags_dict and flags_dict[flag] > 0:
      return
  raise ValueError('Could not find any of {} with a valid value among pipeline '
                   'flags {}'.format(expected_flags, flags_dict))
Example 9: generate_statistics_from_tfrecord
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def generate_statistics_from_tfrecord(pipeline_args,   # type: List[str]
                                      data_location,   # type: str
                                      output_path,     # type: str
                                      stats_options    # type: StatsOptions
                                      ):
    # type: (...) -> statistics_pb2.DatasetFeatureStatisticsList
    """
    Generate stats file from a tfrecord dataset using TFDV

    :param pipeline_args: un-parsed Dataflow arguments
    :param data_location: input data dir containing tfrecord files
    :param output_path: output path for the stats file
    :param stats_options: tfdv.StatsOptions for statistics generation settings
    :return: a DatasetFeatureStatisticsList proto.
    """
    assert_not_empty_string(data_location)
    assert_not_empty_string(output_path)

    args_in_snake_case = clean_up_pipeline_args(pipeline_args)
    pipeline_options = PipelineOptions(flags=args_in_snake_case)

    all_options = pipeline_options.get_all_options()

    if all_options["job_name"] is None:
        gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
        gcloud_options.job_name = "generatestats-%s" % str(int(time.time()))

    if all_options["setup_file"] is None:
        setup_file_path = create_setup_file()
        setup_options = pipeline_options.view_as(SetupOptions)
        setup_options.setup_file = setup_file_path

    input_files = os.path.join(data_location, "*.tfrecords*")
    return tfdv.generate_statistics_from_tfrecord(data_location=input_files,
                                                  output_path=output_path,
                                                  stats_options=stats_options,
                                                  pipeline_options=pipeline_options)
Example 10: run
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def run(input_topic, output_path, window_size=1.0, pipeline_args=None):
    # `save_main_session` is set to true because some DoFn's rely on
    # globally imported modules.
    pipeline_options = PipelineOptions(
        pipeline_args, streaming=True, save_main_session=True
    )

    with beam.Pipeline(options=pipeline_options) as pipeline:
        (
            pipeline
            | "Read PubSub Messages"
            >> beam.io.ReadFromPubSub(topic=input_topic)
            | "Window into" >> GroupWindowsIntoBatches(window_size)
            | "Write to GCS" >> beam.ParDo(WriteBatchesToGCS(output_path))
        )
Example 11: main
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def main(argv=None):
  known_args, pipeline_args = get_args(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  setup_options = pipeline_options.view_as(SetupOptions)
  setup_options.save_main_session = True
  pipeline.run(pipeline_options, known_args)
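This example, like Examples 7 and 8, works with a known_args / pipeline_args split. A common way to implement such a get_args helper, sketched here with hypothetical application flags rather than the project's actual implementation, is argparse.ArgumentParser.parse_known_args, which returns the unrecognized flags so they can be forwarded to PipelineOptions:

import argparse
from apache_beam.options.pipeline_options import PipelineOptions

def get_args(argv=None):
    """Split application flags from Beam/Dataflow flags (illustrative only)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True)   # hypothetical app flag
    parser.add_argument('--output', required=True)  # hypothetical app flag
    # Anything argparse does not recognize (e.g. --runner, --project)
    # is returned untouched and forwarded to PipelineOptions.
    known_args, pipeline_args = parser.parse_known_args(argv)
    return known_args, pipeline_args

known_args, pipeline_args = get_args(
    ['--input=in.txt', '--output=out.txt', '--runner=DirectRunner'])
options = PipelineOptions(pipeline_args)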
Example 12: run_pipeline
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def run_pipeline(input_pattern, output_table, pipeline_args):
  """Read the records from GCS and write them to BigQuery."""
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p |
       'match_files' >> beam.Create(f2pn.match_files(input_pattern)) |
       'to_records' >> beam.FlatMap(f2pn.map_file_to_records) |
       'parse_physionet_record' >> beam.Map(f2pn.parse_physionet_record) |
       'write' >> beam.io.Write(beam.io.BigQuerySink(
           output_table,
           schema='patient_id:INTEGER, record_number:INTEGER, note:STRING',
           write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
  result = p.run().wait_until_finish()
  logging.info('GCS to BigQuery result: %s', result)
Example 13: run_pipeline
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def run_pipeline(input_query, output_file, pipeline_args):
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p
       | 'read' >> beam.io.Read(beam.io.BigQuerySource(query=input_query))
       | 'to_physionet' >> beam.Map(map_to_physionet_record)
       | 'write' >> beam.io.WriteToText(output_file))
  result = p.run().wait_until_finish()
  logging.info('BigQuery to GCS result: %s', result)
Example 14: run_pipeline
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def run_pipeline(input_pattern, output_dir, mae_task_name, project,
                 pipeline_args):
  """Read the physionet records from GCS and write them out as MAE."""
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p |
       'match_files' >> beam.Create(f2pn.match_files(input_pattern)) |
       'to_records' >> beam.FlatMap(f2pn.map_phi_to_findings) |
       'generate_mae' >> beam.Map(mae.generate_mae, mae_task_name, {},
                                  ['patient_id', 'record_number']) |
       'write_mae' >> beam.Map(write_mae, project, output_dir)
      )
  result = p.run().wait_until_finish()
  logging.info('GCS to BigQuery result: %s', result)
Example 15: run_pipeline
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import PipelineOptions [as alias]
def run_pipeline(input_pattern, output_table, pipeline_args):
  """Read the records from GCS and write them to BigQuery."""
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p |
       'match_files' >> beam.Create(f2pn.match_files(input_pattern)) |
       'to_records' >> beam.FlatMap(map_file_to_records) |
       'map_to_bq_inputs' >> beam.Map(map_to_bq_inputs) |
       'write' >> beam.io.Write(beam.io.BigQuerySink(
           output_table,
           schema='patient_id:INTEGER, note:STRING',
           write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
  result = p.run().wait_until_finish()
  logging.info('GCS to BigQuery result: %s', result)