This article collects typical usage examples of apache_beam.options.pipeline_options.SetupOptions in Python. SetupOptions is a PipelineOptions view class (not a method) that controls how a pipeline's dependencies are staged for remote workers. If you have been wondering what SetupOptions does or how to use it in practice, the curated code samples below may help. You can also read more about the module in which it is defined, apache_beam.options.pipeline_options.
Seven code examples of pipeline_options.SetupOptions are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
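Before the examples themselves, here is a minimal, self-contained sketch of the canonical SetupOptions pattern. It is a sketch under stated assumptions: it runs on the local DirectRunner with no extra dependencies, and the commented-out setup_file line shows how a local setup.py would be shipped to remote workers.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions

options = PipelineOptions([])  # empty flag list -> DirectRunner defaults
setup_options = options.view_as(SetupOptions)
setup_options.save_main_session = True  # pickle __main__ globals for workers
# setup_options.setup_file = './setup.py'  # uncomment to ship a local package

with beam.Pipeline(options=options) as p:
    _ = p | beam.Create([1, 2, 3]) | beam.Map(print)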
Example 1: main
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import SetupOptions [as alias]
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        source_config = relational_db.SourceConfiguration(
            drivername=db_args.drivername,
            host=db_args.host,
            port=db_args.port,
            database=db_args.database,
            username=db_args.username,
            password=db_args.password,
        )

        months = p | "Reading records from db" >> relational_db.ReadFromDB(
            source_config=source_config,
            table_name=db_args.table
        )
        months | 'Writing to stdout' >> beam.Map(print)
Example 2: generate_statistics_from_tfrecord
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import SetupOptions [as alias]
def generate_statistics_from_tfrecord(pipeline_args,  # type: List[str]
                                      data_location,  # type: str
                                      output_path,    # type: str
                                      stats_options   # type: StatsOptions
                                      ):
    # type: (...) -> statistics_pb2.DatasetFeatureStatisticsList
    """
    Generate a stats file from a tfrecord dataset using TFDV.

    :param pipeline_args: un-parsed Dataflow arguments
    :param data_location: input data dir containing tfrecord files
    :param output_path: output path for the stats file
    :param stats_options: tfdv.StatsOptions for statistics generation settings
    :return: a DatasetFeatureStatisticsList proto.
    """
    assert_not_empty_string(data_location)
    assert_not_empty_string(output_path)

    args_in_snake_case = clean_up_pipeline_args(pipeline_args)
    pipeline_options = PipelineOptions(flags=args_in_snake_case)

    all_options = pipeline_options.get_all_options()
    if all_options["job_name"] is None:
        gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
        gcloud_options.job_name = "generatestats-%s" % str(int(time.time()))
    if all_options["setup_file"] is None:
        setup_file_path = create_setup_file()
        setup_options = pipeline_options.view_as(SetupOptions)
        setup_options.setup_file = setup_file_path

    input_files = os.path.join(data_location, "*.tfrecords*")
    return tfdv.generate_statistics_from_tfrecord(data_location=input_files,
                                                  output_path=output_path,
                                                  stats_options=stats_options,
                                                  pipeline_options=pipeline_options)
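The two is-None checks above work because flags that were never set surface as None in get_all_options(). A quick way to confirm that behavior locally (a sketch, assuming a stock Beam installation):

from apache_beam.options.pipeline_options import PipelineOptions

opts = PipelineOptions([])
all_options = opts.get_all_options()
print(all_options["job_name"])    # None unless set explicitly or via --job_name
print(all_options["setup_file"])  # None unless --setup_file is passed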
Example 3: main
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import SetupOptions [as alias]
def main(argv=None):
    known_args, pipeline_args = get_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True
    pipeline.run(pipeline_options, known_args)
Example 4: main
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import SetupOptions [as alias]
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Target database instance
    source_config = relational_db.SourceConfiguration(
        drivername=db_args.drivername,
        host=db_args.host,
        port=db_args.port,
        database=db_args.database,
        username=db_args.username,
        password=db_args.password,
        create_if_missing=db_args.create_if_missing
    )

    # The data to be written
    records = [
        {'name': 'Jan', 'num': 1},
        {'name': 'Feb', 'num': 2},
        {'name': 'Mar', 'num': 3},
        {'name': 'Apr', 'num': 4},
        {'name': 'May', 'num': 5},
        {'name': 'Jun', 'num': 6},
    ]

    # Target database table
    table_config = relational_db.TableConfiguration(
        name='months',
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['num']  # and use 'num' column as a primary key
    )

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        months = p | "Reading records" >> beam.Create(records)
        months | 'Writing to DB' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config
        )
Example 5: run
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import SetupOptions [as alias]
def run(argv=None):
    """Run the beam pipeline."""
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    sentence_files_match = FileSystems.match([args.sentence_files])[0]
    sentence_files = [
        file_metadata.path
        for file_metadata in sentence_files_match.metadata_list]
    logging.info("Reading %i files from %s.",
                 len(sentence_files), args.sentence_files)
    assert len(sentence_files) > 0
    sentence_files = p | beam.Create(sentence_files)
    examples = sentence_files | "create examples" >> beam.FlatMap(
        partial(_create_examples_from_file,
                min_length=args.min_length,
                max_length=args.max_length,
                num_extra_contexts=args.num_extra_contexts)
    )

    examples = _shuffle_examples(examples)

    examples |= "split train and test" >> beam.ParDo(
        _TrainTestSplitFn(args.train_split)).with_outputs(
            _TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps
    else:
        assert args.dataset_format == _TF_FORMAT
        write_sink = WriteToTFRecord
        file_name_suffix = ".tfrecord"
        serialize_fn = _features_to_serialized_tf_example

    for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG),
                      ("test", _TrainTestSplitFn.TEST_TAG)]:
        serialized_examples = examples[tag] | (
            "serialize {} examples".format(name) >> beam.Map(serialize_fn))
        (
            serialized_examples | ("write " + name)
            >> write_sink(
                os.path.join(args.output_dir, name),
                file_name_suffix=file_name_suffix,
                num_shards=args.num_shards_train,
            )
        )

    result = p.run()
    result.wait_until_finish()
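The train/test split above relies on Beam's tagged outputs. The stand-alone sketch below illustrates the same with_outputs pattern; SplitFn is a hypothetical stand-in for the _TrainTestSplitFn defined elsewhere in the source:

import random

import apache_beam as beam


class SplitFn(beam.DoFn):
    TRAIN_TAG = "train"
    TEST_TAG = "test"

    def __init__(self, train_split=0.9):
        self._train_split = train_split

    def process(self, element):
        # Route each element to exactly one tagged output.
        if random.random() < self._train_split:
            yield beam.pvalue.TaggedOutput(self.TRAIN_TAG, element)
        else:
            yield beam.pvalue.TaggedOutput(self.TEST_TAG, element)


with beam.Pipeline() as p:
    split = (p
             | beam.Create(range(100))
             | beam.ParDo(SplitFn(0.9)).with_outputs(SplitFn.TEST_TAG,
                                                     SplitFn.TRAIN_TAG))
    # Each tag is then read like a dictionary entry, as in the example above.
    _ = (split[SplitFn.TRAIN_TAG]
         | beam.combiners.Count.Globally()
         | beam.Map(print))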
Example 6: run
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import SetupOptions [as alias]
def run(argv=None):
    """Run the beam pipeline."""
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    lines = p | "read qa files" >> ReadFromText(args.file_pattern)

    # The lines are not JSON, but the string representation of python
    # dictionary objects. Parse them with ast.literal_eval.
    json_objects = lines | "parsing dictionaries" >> beam.Map(ast.literal_eval)
    qa_tuples = json_objects | "create tuples" >> beam.FlatMap(
        partial(
            _create_tuples,
            min_words=args.min_words, max_words=args.max_words)
    )

    # Remove duplicate examples.
    qa_tuples |= "key by QA" >> beam.Map(lambda v: (v[1:], v))
    qa_tuples |= "group duplicates" >> beam.GroupByKey()
    qa_tuples |= "remove duplicates" >> beam.Map(lambda v: sorted(v[1])[0])

    # Create the examples.
    examples = qa_tuples | "create examples" >> beam.Map(
        lambda args: _create_example(*args)
    )
    examples = _shuffle_examples(examples)

    examples |= "split train and test" >> beam.ParDo(
        _TrainTestSplitFn(args.train_split)
    ).with_outputs(_TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps
    else:
        assert args.dataset_format == _TF_FORMAT
        write_sink = WriteToTFRecord
        file_name_suffix = ".tfrecord"
        serialize_fn = _features_to_serialized_tf_example

    for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG),
                      ("test", _TrainTestSplitFn.TEST_TAG)]:
        serialized_examples = examples[tag] | (
            "serialize {} examples".format(name) >> beam.Map(serialize_fn))
        (
            serialized_examples | ("write " + name)
            >> write_sink(
                os.path.join(args.output_dir, name),
                file_name_suffix=file_name_suffix,
                num_shards=args.num_shards_train,
            )
        )

    result = p.run()
    result.wait_until_finish()
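One step worth pausing on is the literal_eval parse: as the source's own comment says, the input lines are Python dict literals with single-quoted strings, which json.loads would reject. A quick illustration (toy line, not real input data):

import ast

line = "{'question': 'what is the capital of france', 'answer': 'paris'}"
record = ast.literal_eval(line)  # safely evaluates the Python literal
print(record["answer"])          # -> paris
# json.loads(line) would raise a decode error: JSON requires double quotes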
Example 7: run
# Required import: from apache_beam.options import pipeline_options [as alias]
# Or: from apache_beam.options.pipeline_options import SetupOptions [as alias]
def run(argv=None):
    """Runs the revise preprocessed data pipeline.

    Args:
      argv: Pipeline options as a list of arguments.
    """
    pipeline_options = PipelineOptions(flags=argv)
    revise_options = pipeline_options.view_as(ReviseOptions)
    cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    output_dir = os.path.join(revise_options.output,
                              datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(
        WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
    cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging')
    cloud_options.temp_location = os.path.join(output_dir, 'tmp')
    cloud_options.job_name = 'relabel-examples-%s' % (
        datetime.datetime.now().strftime('%y%m%d-%H%M%S'))

    metadata_query = str(
        Template(open(revise_options.metadata, 'r').read()).render(
            METADATA_QUERY_REPLACEMENTS))
    logging.info('metadata query : %s', metadata_query)

    with beam.Pipeline(options=pipeline_options) as p:
        # Gather our sample metadata into a python dictionary.
        samples_metadata = (
            p
            | 'ReadSampleMetadata' >> beam.io.Read(
                beam.io.BigQuerySource(query=metadata_query,
                                       use_standard_sql=True))
            | 'TableToDictionary' >> beam.CombineGlobally(
                util.TableToDictCombineFn(key_column=encoder.KEY_COLUMN)))

        # Read the tf.Example protos into a PCollection.
        examples = p | 'ReadExamples' >> tfrecordio.ReadFromTFRecord(
            file_pattern=revise_options.input,
            compression_type=CompressionTypes.GZIP)

        # Filter the TensorFlow Example Protocol Buffers.
        filtered_examples = (examples | 'ReviseExamples' >> beam.FlatMap(
            lambda example, samples_metadata:
            filter_and_revise_example(example, samples_metadata),
            beam.pvalue.AsSingleton(samples_metadata)))

        # Write the subset of tf.Example protos to Cloud Storage.
        _ = (filtered_examples
             | 'SerializeExamples' >>
             beam.Map(lambda example: example.SerializeToString())
             | 'WriteExamples' >> tfrecordio.WriteToTFRecord(
                 file_path_prefix=os.path.join(output_dir, 'examples'),
                 compression_type=CompressionTypes.GZIP,
                 file_name_suffix='.tfrecord.gz'))
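Finally, a note on the AsSingleton side input used in the ReviseExamples step: it materializes a one-element PCollection and passes its value to every invocation of the main transform. A minimal sketch of that pattern (toy data, not the sample-metadata dictionary from the example):

import apache_beam as beam

with beam.Pipeline() as p:
    # A one-element PCollection used as a broadcast lookup table.
    metadata = p | "metadata" >> beam.Create([{"a": 1, "b": 2}])
    keys = p | "keys" >> beam.Create(["a", "b", "a"])
    _ = (keys
         | "lookup" >> beam.Map(lambda k, meta: (k, meta[k]),
                                meta=beam.pvalue.AsSingleton(metadata))
         | beam.Map(print))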