This article compiles typical usage examples of apache_beam.options.pipeline_options.GoogleCloudOptions in Python. If you are wondering what pipeline_options.GoogleCloudOptions is for and how it is used in practice, the curated code examples below may help; you can also dig further into the containing module, apache_beam.options.pipeline_options.
The following shows 8 code examples of pipeline_options.GoogleCloudOptions, sorted by popularity by default.
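Before the examples, here is a minimal standalone sketch of the pattern they all share: view a PipelineOptions object as GoogleCloudOptions to read or set Dataflow-specific settings. The project, bucket, and job name below are placeholders, not values from any example on this page.

from apache_beam.options.pipeline_options import GoogleCloudOptions, PipelineOptions

options = PipelineOptions(['--runner=DataflowRunner'])
gcloud_options = options.view_as(GoogleCloudOptions)
# view_as() returns a view over the same underlying flags, so assignments here
# are visible to any pipeline constructed with `options`.
gcloud_options.project = 'my-project'                 # placeholder
gcloud_options.temp_location = 'gs://my-bucket/tmp'   # placeholder
gcloud_options.job_name = 'example-job'               # placeholder
print(gcloud_options.project, gcloud_options.temp_location)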
Example 1: __init__
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import GoogleCloudOptions [as alias]
def __init__(self, inference_spec_type: model_spec_pb2.InferenceSpecType,
             pipeline_options: PipelineOptions):
  super(_RemotePredictDoFn, self).__init__(inference_spec_type)
  self._api_client = None

  project_id = (
      inference_spec_type.ai_platform_prediction_model_spec.project_id or
      pipeline_options.view_as(GoogleCloudOptions).project)
  if not project_id:
    raise ValueError('Either a non-empty project id or the project flag in '
                     'beam pipeline options needs to be provided.')

  model_name = (
      inference_spec_type.ai_platform_prediction_model_spec.model_name)
  if not model_name:
    raise ValueError('A non-empty model name must be provided.')

  version_name = (
      inference_spec_type.ai_platform_prediction_model_spec.version_name)
  name_spec = 'projects/{}/models/{}'
  # If version is not specified, the default version for a model is used.
  if version_name:
    name_spec += '/versions/{}'
  self._full_model_name = name_spec.format(project_id, model_name,
                                           version_name)
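A quick illustration of the resource-name logic above, with placeholder values rather than anything from tfx-bsl: the versions segment is appended only when a version name is supplied, and str.format simply ignores the unused third argument otherwise.

# Placeholder values; mirrors the name_spec logic in Example 1.
project_id, model_name, version_name = 'my-project', 'my-model', 'v1'
name_spec = 'projects/{}/models/{}'
if version_name:
  name_spec += '/versions/{}'
full_model_name = name_spec.format(project_id, model_name, version_name)
# -> 'projects/my-project/models/my-model/versions/v1'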
Example 2: _shard_variants
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import GoogleCloudOptions [as alias]
def _shard_variants(known_args, pipeline_args, pipeline_mode):
  # type: (argparse.Namespace, List[str], int) -> List[str]
  """Reads the variants and writes them to VCF shards.

  Returns:
    The VCF shards directory.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  shard_files_job_name = pipeline_common.generate_unique_name(
      _SHARD_VCF_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
  vcf_shards_output_dir = filesystems.FileSystems.join(
      known_args.annotation_output_dir, _SHARDS_FOLDER)
  with beam.Pipeline(options=options) as p:
    variants = _read_variants(known_args.all_patterns,
                              p,
                              known_args,
                              pipeline_mode,
                              pre_infer_headers=False,
                              keep_raw_sample_names=True)
    sample_ids = (variants
                  | 'CombineSampleIds' >>
                  combine_sample_ids.SampleIdsCombiner()
                  | 'CombineToList' >> beam.combiners.ToList())
    # TODO(tneymanov): The annotation pipeline currently stores sample IDs
    # instead of sample names in the sharded VCF files, which would lead to
    # double hashing of samples. Needs to be fixed ASAP.
    _ = (variants
         | 'DensifyVariants' >> densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(sample_ids))
         | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
             vcf_shards_output_dir,
             beam.pvalue.AsSingleton(sample_ids),
             known_args.number_of_variants_per_shard))
  return [vep_runner_util.format_dir_path(vcf_shards_output_dir) +
          _GCS_RECURSIVE_WILDCARD]
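_update_google_cloud_job_name is a project helper that is not shown on this page. Judging from the inline job-name handling in Examples 6 and 7, it most likely appends the unique suffix to an existing job name or uses the suffix as the job name; the sketch below is an assumption, not the project's actual code.

# Assumed behaviour of _update_google_cloud_job_name, inferred from Examples 6-7.
def _update_google_cloud_job_name(google_cloud_options, job_name):
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + job_name
  else:
    google_cloud_options.job_name = job_name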
Example 3: _annotate_vcf_files
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import GoogleCloudOptions [as alias]
def _annotate_vcf_files(all_patterns, known_args, pipeline_args):
  # type: (List[str], argparse.Namespace, List[str]) -> str
  """Annotates the VCF files using VEP.

  Returns:
    The annotated VCF files directory.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  annotate_files_job_name = pipeline_common.generate_unique_name(
      _ANNOTATE_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, annotate_files_job_name)
  with beam.Pipeline(options=options) as p:
    _ = (p
         | beam.Create(all_patterns)
         | 'AnnotateShards' >> beam.ParDo(
             annotate_files.AnnotateFile(known_args, pipeline_args)))
  if known_args.annotation_fields:
    known_args.annotation_fields.append(known_args.vep_info_field)
  else:
    known_args.annotation_fields = [known_args.vep_info_field]
  # TODO(bashir2): The VEP runner by default runs VEP with --allele_number,
  # hence we turn on this feature here. However, this might be inconsistent
  # with other annotation fields that are originally present in input files,
  # if they do not have the ALLELE_NUM annotation. The fix is to make
  # annotation ALT matching smarter so it falls back on other matching methods
  # if ALLELE_NUM is not present. When this is implemented, we may even
  # consider removing the use_allele_num flag and always start by checking if
  # ALLELE_NUM is present.
  known_args.use_allele_num = True
  return vep_runner_util.get_output_pattern(known_args.annotation_output_dir)
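Note that mutating google_cloud_options before constructing beam.Pipeline(options=options) works because view_as() does not copy anything; it exposes the same underlying flags. A tiny self-contained check of that behaviour, using placeholder values:

from apache_beam.options.pipeline_options import GoogleCloudOptions, PipelineOptions

options = PipelineOptions(['--job_name=annotate'])                    # placeholder name
options.view_as(GoogleCloudOptions).job_name += '-20200101-000000'    # placeholder suffix
assert options.view_as(GoogleCloudOptions).job_name == 'annotate-20200101-000000'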
Example 4: generate_statistics_from_tfrecord
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import GoogleCloudOptions [as alias]
def generate_statistics_from_tfrecord(pipeline_args,  # type: List[str]
                                      data_location,  # type: str
                                      output_path,    # type: str
                                      stats_options   # type: StatsOptions
                                      ):
    # type: (...) -> statistics_pb2.DatasetFeatureStatisticsList
    """
    Generate a stats file from a TFRecord dataset using TFDV.

    :param pipeline_args: un-parsed Dataflow arguments
    :param data_location: input data dir containing tfrecord files
    :param output_path: output path for the stats file
    :param stats_options: tfdv.StatsOptions for statistics generation settings
    :return: a DatasetFeatureStatisticsList proto.
    """
    assert_not_empty_string(data_location)
    assert_not_empty_string(output_path)

    args_in_snake_case = clean_up_pipeline_args(pipeline_args)
    pipeline_options = PipelineOptions(flags=args_in_snake_case)

    all_options = pipeline_options.get_all_options()

    if all_options["job_name"] is None:
        gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
        gcloud_options.job_name = "generatestats-%s" % str(int(time.time()))

    if all_options["setup_file"] is None:
        setup_file_path = create_setup_file()
        setup_options = pipeline_options.view_as(SetupOptions)
        setup_options.setup_file = setup_file_path

    input_files = os.path.join(data_location, "*.tfrecords*")
    return tfdv.generate_statistics_from_tfrecord(data_location=input_files,
                                                  output_path=output_path,
                                                  stats_options=stats_options,
                                                  pipeline_options=pipeline_options)
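A hypothetical invocation of generate_statistics_from_tfrecord for context; the GCS paths, project, and region are placeholders, and the flags shown are standard Beam/Dataflow options rather than anything specific to this helper.

import tensorflow_data_validation as tfdv

stats = generate_statistics_from_tfrecord(
    pipeline_args=[
        '--runner=DataflowRunner',
        '--project=my-project',                # placeholder
        '--temp_location=gs://my-bucket/tmp',  # placeholder
        '--region=us-central1',                # placeholder
    ],
    data_location='gs://my-bucket/data/train',       # placeholder; must hold *.tfrecords* files
    output_path='gs://my-bucket/stats/train.stats',  # placeholder
    stats_options=tfdv.StatsOptions())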
Example 5: run
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import GoogleCloudOptions [as alias]
def run(argv=None):
  # type: (List[str]) -> None
  """Runs the BigQuery to VCF pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(argv,
                                                         _COMMAND_LINE_OPTIONS)
  options = pipeline_options.PipelineOptions(pipeline_args)
  is_direct_runner = pipeline_common.is_pipeline_direct_runner(
      beam.Pipeline(options=options))
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  if not google_cloud_options.project:
    raise ValueError('project must be set.')
  if not is_direct_runner and not known_args.output_file.startswith('gs://'):
    raise ValueError('Please set the output file {} to GCS when running with '
                     'DataflowRunner.'.format(known_args.output_file))
  if is_direct_runner:
    known_args.number_of_bases_per_shard = sys.maxsize

  temp_folder = google_cloud_options.temp_location or tempfile.mkdtemp()
  unique_temp_id = pipeline_common.generate_unique_name(
      google_cloud_options.job_name or _BQ_TO_VCF_SHARDS_JOB_NAME)
  vcf_data_temp_folder = filesystems.FileSystems.join(
      temp_folder,
      '{}_data_temp_files'.format(unique_temp_id))
  # Create the directory manually. FileSystems cannot create a file if the
  # directory does not exist when using the Direct Runner.
  filesystems.FileSystems.mkdirs(vcf_data_temp_folder)
  vcf_header_file_path = filesystems.FileSystems.join(
      temp_folder,
      '{}_header_with_sample_ids.vcf'.format(unique_temp_id))

  if not known_args.representative_header_file:
    known_args.representative_header_file = filesystems.FileSystems.join(
        temp_folder,
        '{}_meta_info.vcf'.format(unique_temp_id))
    _write_vcf_meta_info(known_args.input_table,
                         known_args.representative_header_file,
                         known_args.allow_incompatible_schema)

  _bigquery_to_vcf_shards(known_args,
                          options,
                          vcf_data_temp_folder,
                          vcf_header_file_path)
  if is_direct_runner:
    vcf_file_composer.compose_local_vcf_shards(vcf_header_file_path,
                                               vcf_data_temp_folder,
                                               known_args.output_file)
  else:
    vcf_file_composer.compose_gcs_vcf_shards(google_cloud_options.project,
                                             vcf_header_file_path,
                                             vcf_data_temp_folder,
                                             known_args.output_file)
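pipeline_common.is_pipeline_direct_runner is project-specific and not shown here. One plain-Beam way to get similar information, offered only as a sketch under that assumption, is to inspect StandardOptions.runner, which is None when unset (Beam then defaults to the DirectRunner):

# Sketch of a runner check using only Beam options; this is not the
# implementation of pipeline_common.is_pipeline_direct_runner.
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions

def _looks_like_direct_runner(options):
  runner = options.view_as(StandardOptions).runner
  return runner is None or 'DirectRunner' in runner

print(_looks_like_direct_runner(PipelineOptions([])))                           # True
print(_looks_like_direct_runner(PipelineOptions(['--runner=DataflowRunner'])))  # False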
Example 6: _get_input_dimensions
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import GoogleCloudOptions [as alias]
def _get_input_dimensions(known_args, pipeline_args):
  pipeline_mode = pipeline_common.get_pipeline_mode(known_args.all_patterns)
  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = beam_pipeline_options.view_as(
      pipeline_options.GoogleCloudOptions)

  estimate_sizes_job_name = pipeline_common.generate_unique_name(
      _ESTIMATE_SIZES_JOB_NAME)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + estimate_sizes_job_name
  else:
    google_cloud_options.job_name = estimate_sizes_job_name
  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  temp_estimated_input_size_file_name = '-'.join(
      [google_cloud_options.job_name,
       _ESTIMATE_SIZES_FILE_NAME])
  temp_estimated_input_size_file_path = filesystems.FileSystems.join(
      temp_directory, temp_estimated_input_size_file_name)

  with beam.Pipeline(options=beam_pipeline_options) as p:
    estimates = pipeline_common.get_estimates(
        p, pipeline_mode, known_args.all_patterns)

    files_size = (estimates
                  | 'GetFilesSize' >> extract_input_size.GetFilesSize())
    file_count = (estimates
                  | 'CountAllFiles' >> beam.combiners.Count.Globally())
    sample_map = (estimates
                  | 'ExtractSampleMap' >> extract_input_size.GetSampleMap())
    estimated_value_count = (sample_map
                             | extract_input_size.GetEstimatedValueCount())
    estimated_sample_count = (sample_map
                              | extract_input_size.GetEstimatedSampleCount())
    estimated_variant_count = (estimates
                               | 'GetEstimatedVariantCount'
                               >> extract_input_size.GetEstimatedVariantCount())
    _ = (estimated_variant_count
         | beam.ParDo(extract_input_size.print_estimates_to_file,
                      beam.pvalue.AsSingleton(estimated_sample_count),
                      beam.pvalue.AsSingleton(estimated_value_count),
                      beam.pvalue.AsSingleton(files_size),
                      beam.pvalue.AsSingleton(file_count),
                      temp_estimated_input_size_file_path))

  with filesystems.FileSystems.open(temp_estimated_input_size_file_path) as f:
    estimates = f.readlines()
  if len(estimates) != 5:
    raise ValueError('Exactly 5 estimates were expected in {}.'.format(
        temp_estimated_input_size_file_path))

  known_args.estimated_variant_count = int(estimates[0].strip())
  known_args.estimated_sample_count = int(estimates[1].strip())
  known_args.estimated_value_count = int(estimates[2].strip())
  known_args.files_size = int(estimates[3].strip())
  known_args.file_count = int(estimates[4].strip())
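The single-element estimates above are passed to print_estimates_to_file as singleton side inputs. A minimal, self-contained illustration of the beam.pvalue.AsSingleton pattern, unrelated to the variant-transforms code itself:

# Runs locally on the DirectRunner; prints 'sum=10 count=4'.
import apache_beam as beam

def _report(total, count):
  print('sum=%d count=%d' % (total, count))

with beam.Pipeline() as p:
  nums = p | beam.Create([1, 2, 3, 4])
  total = nums | 'Sum' >> beam.CombineGlobally(sum)
  count = nums | 'Count' >> beam.combiners.Count.Globally()
  _ = total | beam.Map(_report, beam.pvalue.AsSingleton(count))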
Example 7: _merge_headers
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import GoogleCloudOptions [as alias]
def _merge_headers(known_args, pipeline_args,
                   pipeline_mode, avro_root_path, annotated_vcf_pattern=None):
  # type: (argparse.Namespace, List[str], int, str, str) -> None
  """Merges VCF headers using beam based on pipeline_mode."""
  options = pipeline_options.PipelineOptions(pipeline_args)

  # Always run the pipeline locally if data is small.
  if (pipeline_mode == pipeline_common.PipelineModes.SMALL and
      not known_args.infer_headers and not known_args.infer_annotation_types):
    options.view_as(pipeline_options.StandardOptions).runner = 'DirectRunner'

  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  merge_headers_job_name = pipeline_common.generate_unique_name(
      _MERGE_HEADERS_JOB_NAME)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + merge_headers_job_name
  else:
    google_cloud_options.job_name = merge_headers_job_name

  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  temp_merged_headers_file_name = '-'.join([google_cloud_options.job_name,
                                            _MERGE_HEADERS_FILE_NAME])
  temp_merged_headers_file_path = filesystems.FileSystems.join(
      temp_directory, temp_merged_headers_file_name)

  with beam.Pipeline(options=options) as p:
    headers = pipeline_common.read_headers(
        p, pipeline_mode,
        known_args.all_patterns)
    _ = (headers
         | 'SampleInfoToAvro'
         >> sample_info_to_avro.SampleInfoToAvro(
             avro_root_path +
             sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX,
             SampleNameEncoding[known_args.sample_name_encoding]))
    if known_args.representative_header_file:
      return
    merged_header = pipeline_common.get_merged_headers(
        headers,
        known_args.split_alternate_allele_info_fields,
        known_args.allow_incompatible_records)
    if annotated_vcf_pattern:
      merged_header = pipeline_common.add_annotation_headers(
          p, known_args, pipeline_mode, merged_header,
          annotated_vcf_pattern)
    if known_args.infer_headers or known_args.infer_annotation_types:
      infer_headers_input_pattern = (
          [annotated_vcf_pattern] if
          annotated_vcf_pattern else known_args.all_patterns)
      merged_header = _add_inferred_headers(infer_headers_input_pattern, p,
                                            known_args, merged_header,
                                            pipeline_mode)

    pipeline_common.write_headers(merged_header, temp_merged_headers_file_path)
    known_args.representative_header_file = temp_merged_headers_file_path
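Worth noting in Example 7: the function can force a local run by writing to StandardOptions.runner on the same options object before the pipeline is built. The override pattern in isolation, with a placeholder condition standing in for the project's small-data check:

from apache_beam.options import pipeline_options

options = pipeline_options.PipelineOptions(['--runner=DataflowRunner'])
is_small_input = True  # placeholder condition
if is_small_input:
  options.view_as(pipeline_options.StandardOptions).runner = 'DirectRunner'
assert options.view_as(pipeline_options.StandardOptions).runner == 'DirectRunner'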
Example 8: run
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import GoogleCloudOptions [as alias]
def run(argv=None):
  """Runs the revise preprocessed data pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """
  pipeline_options = PipelineOptions(flags=argv)
  revise_options = pipeline_options.view_as(ReviseOptions)
  cloud_options = pipeline_options.view_as(GoogleCloudOptions)
  output_dir = os.path.join(revise_options.output,
                            datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(
      WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
  cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging')
  cloud_options.temp_location = os.path.join(output_dir, 'tmp')
  cloud_options.job_name = 'relabel-examples-%s' % (
      datetime.datetime.now().strftime('%y%m%d-%H%M%S'))

  metadata_query = str(
      Template(open(revise_options.metadata, 'r').read()).render(
          METADATA_QUERY_REPLACEMENTS))
  logging.info('metadata query : %s', metadata_query)

  with beam.Pipeline(options=pipeline_options) as p:
    # Gather our sample metadata into a python dictionary.
    samples_metadata = (
        p
        | 'ReadSampleMetadata' >> beam.io.Read(
            beam.io.BigQuerySource(query=metadata_query, use_standard_sql=True))
        | 'TableToDictionary' >> beam.CombineGlobally(
            util.TableToDictCombineFn(key_column=encoder.KEY_COLUMN)))

    # Read the tf.Example protos into a PCollection.
    examples = p | 'ReadExamples' >> tfrecordio.ReadFromTFRecord(
        file_pattern=revise_options.input,
        compression_type=CompressionTypes.GZIP)

    # Filter the TensorFlow Example Protocol Buffers.
    filtered_examples = (examples | 'ReviseExamples' >> beam.FlatMap(
        lambda example, samples_metadata:
        filter_and_revise_example(example, samples_metadata),
        beam.pvalue.AsSingleton(samples_metadata)))

    # Write the subset of tf.Example protos to Cloud Storage.
    _ = (filtered_examples
         | 'SerializeExamples' >>
         beam.Map(lambda example: example.SerializeToString())
         | 'WriteExamples' >> tfrecordio.WriteToTFRecord(
             file_path_prefix=os.path.join(output_dir, 'examples'),
             compression_type=CompressionTypes.GZIP,
             file_name_suffix='.tfrecord.gz'))
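ReviseOptions in Example 8 is a project-specific options class that is not shown on this page. Custom options like it are normally defined by subclassing PipelineOptions and registering flags in _add_argparse_args; the sketch below follows that pattern with flag names (input, output, metadata) inferred from the attributes used above, so treat them as assumptions.

# Hypothetical definition of ReviseOptions; flag names are inferred, not confirmed.
from apache_beam.options.pipeline_options import PipelineOptions

class ReviseOptions(PipelineOptions):

  @classmethod
  def _add_argparse_args(cls, parser):
    parser.add_argument('--input', help='TFRecord file pattern to read.')
    parser.add_argument('--output', help='Root of the output directory.')
    parser.add_argument('--metadata', help='Path to the metadata query template.')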