This article collects typical usage examples of the Python class apache_beam.utils.pipeline_options.PipelineOptions. If you are wondering what PipelineOptions is for, how to use it, or what working code looks like, the selected examples below should help.
The sections that follow show 15 code examples of the PipelineOptions class, ordered by popularity.
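Before the examples, here is a minimal sketch of the pattern most of them share: subclass PipelineOptions to register custom flags, build the options from command-line arguments, and read typed views of them with view_as(). It assumes the pre-Beam-2.0 import path apache_beam.utils.pipeline_options used throughout these snippets; the class name MyCustomOptions and the flag --my_flag are purely illustrative.

import apache_beam as beam
from apache_beam.utils.pipeline_options import PipelineOptions
from apache_beam.utils.pipeline_options import StandardOptions


class MyCustomOptions(PipelineOptions):
  """Hypothetical options subclass used only to illustrate the pattern."""

  @classmethod
  def _add_argparse_args(cls, parser):
    # Custom flags are registered on the argparse parser Beam passes in.
    parser.add_argument('--my_flag', default='some-default',
                        help='Illustrative custom option.')


# In a real pipeline the flags usually come from sys.argv[1:]; they are
# hard-coded here to keep the sketch self-contained.
options = PipelineOptions(['--runner=DirectRunner', '--my_flag=value'])
my_flag_value = options.view_as(MyCustomOptions).my_flag  # 'value'
runner_name = options.view_as(StandardOptions).runner     # 'DirectRunner'

# The same options object is then handed to the pipeline.
p = beam.Pipeline(options=options)
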
Example 1: model_pcollection
def model_pcollection(argv):
  """Creating a PCollection from data in local memory."""
  from apache_beam.utils.pipeline_options import PipelineOptions

  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--output',
                          dest='output',
                          required=True,
                          help='Output file to write results to.')

  pipeline_options = PipelineOptions(argv)
  my_options = pipeline_options.view_as(MyOptions)

  # [START model_pcollection]
  p = beam.Pipeline(options=pipeline_options)

  (p
   | beam.Create([
       'To be, or not to be: that is the question: ',
       'Whether \'tis nobler in the mind to suffer ',
       'The slings and arrows of outrageous fortune, ',
       'Or to take arms against a sea of troubles, '])
   | beam.io.WriteToText(my_options.output))

  result = p.run()
  # [END model_pcollection]
  result.wait_until_finish()

Example 2: test_with_setup_file
def test_with_setup_file(self):
  staging_dir = tempfile.mkdtemp()
  source_dir = tempfile.mkdtemp()
  self.create_temp_file(
      os.path.join(source_dir, 'setup.py'), 'notused')
  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).setup_file = os.path.join(
      source_dir, 'setup.py')

  self.assertEqual(
      [dependency.WORKFLOW_TARBALL_FILE],
      dependency.stage_job_resources(
          options,
          # We replace the build setup command because a realistic one would
          # require the setuptools package to be installed. Note that we can't
          # use "touch" here to create the expected output tarball file, since
          # touch is not available on Windows, so we invoke python to produce
          # equivalent behavior.
          build_setup_args=[
              'python', '-c', 'open(__import__("sys").argv[1], "a")',
              os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE)],
          temp_dir=source_dir))
  self.assertTrue(
      os.path.isfile(
          os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))

Example 3: run
def run(argv=None):
  """Runs the Wikipedia top edits pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/wikipedia_edits/*.json',
      help='Input specified as a GCS path containing a BigQuery table exported '
      'as json.')
  parser.add_argument('--output',
                      required=True,
                      help='Output file to write results to.')
  parser.add_argument('--sampling_threshold',
                      type=float,
                      default=0.1,
                      help='Fraction of entries used for session tracking')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  (p  # pylint: disable=expression-not-assigned
   | ReadFromText(known_args.input)
   | ComputeTopSessions(known_args.sampling_threshold)
   | WriteToText(known_args.output))

  p.run()

Example 4: test_with_requirements_file
def test_with_requirements_file(self):
  try:
    staging_dir = tempfile.mkdtemp()
    requirements_cache_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_cache = requirements_cache_dir
    options.view_as(SetupOptions).requirements_file = os.path.join(
        source_dir, dependency.REQUIREMENTS_FILE)
    self.create_temp_file(
        os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
    self.assertEqual(
        sorted([dependency.REQUIREMENTS_FILE,
                'abc.txt', 'def.txt']),
        sorted(dependency.stage_job_resources(
            options,
            populate_requirements_cache=self.populate_requirements_cache)))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
    self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'abc.txt')))
    self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'def.txt')))
  finally:
    shutil.rmtree(staging_dir)
    shutil.rmtree(requirements_cache_dir)
    shutil.rmtree(source_dir)

Example 5: run
def run():
  parser = argparse.ArgumentParser()
  parser.add_argument('--run_locally', dest='run_locally', default='',
                      help='Run data subset and do not save.')
  known_args, pipeline_args = parser.parse_known_args()
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
  delete_from_datastore('dancedeets-hrd', gcloud_options, known_args.run_locally)

Example 6: test_default_resources
def test_default_resources(self):
  staging_dir = tempfile.mkdtemp()
  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)

  self.assertEqual(
      [],
      dependency.stage_job_resources(options))

Example 7: run
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      # CHANGE 1/5: The Google Cloud Storage path is required
                      # for outputting the results.
                      default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_args.extend([
      # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to
      # run your pipeline on the Google Cloud Dataflow Service.
      '--runner=DirectRunner',
      # CHANGE 3/5: Your project ID is required in order to run your pipeline on
      # the Google Cloud Dataflow Service.
      '--project=SET_YOUR_PROJECT_ID_HERE',
      # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
      # files.
      '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
      # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
      # files.
      '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
      '--job_name=your-wordcount-job',
  ])

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(known_args.input)

  # Count the occurrences of each word.
  counts = (lines
            | 'split' >> (beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
                          .with_output_types(unicode))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones))))

  # Format the counts into a PCollection of strings.
  output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c))

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(known_args.output)

  # Actually run the pipeline (all operations above are deferred).
  p.run().wait_until_finish()

Example 8: examples_wordcount_minimal
def examples_wordcount_minimal(renames):
  """MinimalWordCount example snippets."""
  import re

  import apache_beam as beam
  from apache_beam.utils.pipeline_options import GoogleCloudOptions
  from apache_beam.utils.pipeline_options import StandardOptions
  from apache_beam.utils.pipeline_options import PipelineOptions

  # [START examples_wordcount_minimal_options]
  options = PipelineOptions()
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging'
  google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'
  # [END examples_wordcount_minimal_options]

  # Run it locally for testing.
  options = PipelineOptions()

  # [START examples_wordcount_minimal_create]
  p = beam.Pipeline(options=options)
  # [END examples_wordcount_minimal_create]

  (
      # [START examples_wordcount_minimal_read]
      p | beam.io.ReadFromText(
          'gs://dataflow-samples/shakespeare/kinglear.txt')
      # [END examples_wordcount_minimal_read]

      # [START examples_wordcount_minimal_pardo]
      | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
      # [END examples_wordcount_minimal_pardo]

      # [START examples_wordcount_minimal_count]
      | beam.combiners.Count.PerElement()
      # [END examples_wordcount_minimal_count]

      # [START examples_wordcount_minimal_map]
      | beam.Map(lambda (word, count): '%s: %s' % (word, count))
      # [END examples_wordcount_minimal_map]

      # [START examples_wordcount_minimal_write]
      | beam.io.WriteToText('gs://my-bucket/counts.txt')
      # [END examples_wordcount_minimal_write]
  )

  p.visit(SnippetUtils.RenameFiles(renames))

  # [START examples_wordcount_minimal_run]
  result = p.run()
  # [END examples_wordcount_minimal_run]
  result.wait_until_finish()

Example 9: test_get_all_options
def test_get_all_options(self):
  for case in PipelineOptionsTest.TEST_CASES:
    options = PipelineOptions(flags=case['flags'])
    self.assertDictContainsSubset(case['expected'], options.get_all_options())
    self.assertEqual(options.view_as(
        PipelineOptionsTest.MockOptions).mock_flag,
                     case['expected']['mock_flag'])
    self.assertEqual(options.view_as(
        PipelineOptionsTest.MockOptions).mock_option,
                     case['expected']['mock_option'])

Example 10: test_extra_package
def test_extra_package(self):
  options = PipelineOptions(['--extra_package', 'abc',
                             '--extra_packages', 'def',
                             '--extra_packages', 'ghi'])
  self.assertEqual(
      sorted(options.get_all_options()['extra_packages']),
      ['abc', 'def', 'ghi'])

  options = PipelineOptions(flags=[''])
  self.assertEqual(options.get_all_options()['extra_packages'], None)

Example 11: test_option_with_space
def test_option_with_space(self):
  options = PipelineOptions(flags=['--option with space= value with space'])
  self.assertEqual(
      getattr(options.view_as(PipelineOptionsTest.MockOptions),
              'option with space'), ' value with space')
  options_from_dict = PipelineOptions.from_dictionary(
      options.get_all_options())
  self.assertEqual(
      getattr(options_from_dict.view_as(PipelineOptionsTest.MockOptions),
              'option with space'), ' value with space')

Example 12: run
def run(argv=None):
  """Main entry point; defines and runs the hourly_team_score pipeline."""
  parser = argparse.ArgumentParser()

  # The default maps to two large Google Cloud Storage files (each ~12GB)
  # holding two subsequent day's worth (roughly) of data.
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/game/gaming_data*.csv',
                      help='Path to the data file(s) containing game data.')
  parser.add_argument('--dataset',
                      dest='dataset',
                      required=True,
                      help='BigQuery Dataset to write tables to. '
                      'Must already exist.')
  parser.add_argument('--table_name',
                      dest='table_name',
                      default='hourly_team_score',
                      help='The BigQuery table name. Should not already exist.')
  parser.add_argument('--window_duration',
                      type=int,
                      default=60,
                      help='Numeric value of fixed window duration, in minutes')
  parser.add_argument('--start_min',
                      dest='start_min',
                      default='1970-01-01-00-00',
                      help='String representation of the first minute after '
                      'which to generate results in the format: '
                      'yyyy-MM-dd-HH-mm. Any input data timestamped '
                      'prior to that minute won\'t be included in the '
                      'sums.')
  parser.add_argument('--stop_min',
                      dest='stop_min',
                      default='2100-01-01-00-00',
                      help='String representation of the first minute for '
                      'which to generate results in the format: '
                      'yyyy-MM-dd-HH-mm. Any input data timestamped '
                      'after that minute won\'t be included in the '
                      'sums.')

  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  p = beam.Pipeline(options=pipeline_options)
  pipeline_options.view_as(SetupOptions).save_main_session = True

  (p  # pylint: disable=expression-not-assigned
   | ReadFromText(known_args.input)
   | HourlyTeamScore(
       known_args.start_min, known_args.stop_min, known_args.window_duration)
   | WriteWindowedToBigQuery(
       known_args.table_name, known_args.dataset, configure_bigquery_write()))

  result = p.run()
  result.wait_until_finish()

Example 13: test_no_main_session
def test_no_main_session(self):
  staging_dir = tempfile.mkdtemp()
  options = PipelineOptions()

  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  options.view_as(SetupOptions).save_main_session = False
  self.update_options(options)

  self.assertEqual(
      [],
      dependency.stage_job_resources(options))

Example 14: test_from_dictionary
def test_from_dictionary(self):
  for case in PipelineOptionsTest.TEST_CASES:
    options = PipelineOptions(flags=case['flags'])
    all_options_dict = options.get_all_options()
    options_from_dict = PipelineOptions.from_dictionary(all_options_dict)
    self.assertEqual(options_from_dict.view_as(
        PipelineOptionsTest.MockOptions).mock_flag,
                     case['expected']['mock_flag'])
    self.assertEqual(options.view_as(
        PipelineOptionsTest.MockOptions).mock_option,
                     case['expected']['mock_option'])

Example 15: test_no_temp_location
def test_no_temp_location(self):
  staging_dir = tempfile.mkdtemp()
  options = PipelineOptions()
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.staging_location = staging_dir
  self.update_options(options)
  google_cloud_options.temp_location = None
  with self.assertRaises(RuntimeError) as cm:
    dependency.stage_job_resources(options)
  self.assertEqual('The --temp_location option must be specified.',
                   cm.exception.message)