This article collects typical usage examples of the PipelineOptions.view_as method from the Python module apache_beam.utils.pipeline_options. If you are wondering what PipelineOptions.view_as does, how to call it, or what it looks like in real code, the selected examples below should help. You can also read further about the containing class, apache_beam.utils.pipeline_options.PipelineOptions.
Below are 15 code examples of PipelineOptions.view_as, sorted by popularity.
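Before the individual examples, here is a minimal sketch of the pattern they all share: PipelineOptions parses the pipeline's command-line flags once, and each view_as(SomeOptionsClass) call returns a typed view onto that same underlying options object, so a value set through one view (for example GoogleCloudOptions) is visible through every other view. The snippet below is only an illustrative sketch, not taken from any of the examples; the project ID and bucket path are placeholders, and it uses the older apache_beam.utils.pipeline_options import path that these examples were written against (newer Beam releases expose the same classes under apache_beam.options.pipeline_options).

import apache_beam as beam
from apache_beam.utils.pipeline_options import GoogleCloudOptions
from apache_beam.utils.pipeline_options import PipelineOptions
from apache_beam.utils.pipeline_options import SetupOptions
from apache_beam.utils.pipeline_options import StandardOptions

# Parse flags once; the values here are placeholders standing in for real
# command-line arguments.
options = PipelineOptions(['--project', 'my-project-id'])

# Each view_as() call exposes the same options object through a typed view,
# so a field set on one view is seen by all other views of that object.
options.view_as(StandardOptions).runner = 'DirectRunner'
options.view_as(SetupOptions).save_main_session = True
options.view_as(GoogleCloudOptions).temp_location = 'gs://my-bucket/temp'

# The pipeline is then constructed from the fully configured options.
p = beam.Pipeline(options=options)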
Example 1: test_with_setup_file
# Required import: from apache_beam.utils.pipeline_options import PipelineOptions [as alias]
# Or alternatively: from apache_beam.utils.pipeline_options.PipelineOptions import view_as [as alias]
def test_with_setup_file(self):
  staging_dir = tempfile.mkdtemp()
  source_dir = tempfile.mkdtemp()

  self.create_temp_file(
      os.path.join(source_dir, 'setup.py'), 'notused')
  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).setup_file = os.path.join(
      source_dir, 'setup.py')

  self.assertEqual(
      [dependency.WORKFLOW_TARBALL_FILE],
      dependency.stage_job_resources(
          options,
          # We replace the build setup command because a realistic one would
          # require the setuptools package to be installed. Note that we can't
          # use "touch" here to create the expected output tarball file, since
          # touch is not available on Windows, so we invoke python to produce
          # equivalent behavior.
          build_setup_args=[
              'python', '-c', 'open(__import__("sys").argv[1], "a")',
              os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE)],
          temp_dir=source_dir))
  self.assertTrue(
      os.path.isfile(
          os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))
Example 2: test_with_requirements_file
# Required import: from apache_beam.utils.pipeline_options import PipelineOptions [as alias]
# Or alternatively: from apache_beam.utils.pipeline_options.PipelineOptions import view_as [as alias]
def test_with_requirements_file(self):
  try:
    staging_dir = tempfile.mkdtemp()
    requirements_cache_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_cache = requirements_cache_dir
    options.view_as(SetupOptions).requirements_file = os.path.join(
        source_dir, dependency.REQUIREMENTS_FILE)
    self.create_temp_file(
        os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
    self.assertEqual(
        sorted([dependency.REQUIREMENTS_FILE,
                'abc.txt', 'def.txt']),
        sorted(dependency.stage_job_resources(
            options,
            populate_requirements_cache=self.populate_requirements_cache)))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
    self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'abc.txt')))
    self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'def.txt')))
  finally:
    shutil.rmtree(staging_dir)
    shutil.rmtree(requirements_cache_dir)
    shutil.rmtree(source_dir)
Example 3: run
# Required import: from apache_beam.utils.pipeline_options import PipelineOptions [as alias]
# Or alternatively: from apache_beam.utils.pipeline_options.PipelineOptions import view_as [as alias]
def run():
  parser = argparse.ArgumentParser()
  parser.add_argument('--run_locally', dest='run_locally', default='',
                      help='Run data subset and do not save.')
  known_args, pipeline_args = parser.parse_known_args()

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
  delete_from_datastore('dancedeets-hrd', gcloud_options, known_args.run_locally)
Example 4: examples_wordcount_minimal
# Required import: from apache_beam.utils.pipeline_options import PipelineOptions [as alias]
# Or alternatively: from apache_beam.utils.pipeline_options.PipelineOptions import view_as [as alias]
def examples_wordcount_minimal(renames):
  """MinimalWordCount example snippets."""
  import re

  import apache_beam as beam

  from apache_beam.utils.pipeline_options import GoogleCloudOptions
  from apache_beam.utils.pipeline_options import StandardOptions
  from apache_beam.utils.pipeline_options import PipelineOptions

  # [START examples_wordcount_minimal_options]
  options = PipelineOptions()
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging'
  google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'
  # [END examples_wordcount_minimal_options]

  # Run it locally for testing.
  options = PipelineOptions()

  # [START examples_wordcount_minimal_create]
  p = beam.Pipeline(options=options)
  # [END examples_wordcount_minimal_create]

  (
      # [START examples_wordcount_minimal_read]
      p | beam.io.ReadFromText(
          'gs://dataflow-samples/shakespeare/kinglear.txt')
      # [END examples_wordcount_minimal_read]

      # [START examples_wordcount_minimal_pardo]
      | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
      # [END examples_wordcount_minimal_pardo]

      # [START examples_wordcount_minimal_count]
      | beam.combiners.Count.PerElement()
      # [END examples_wordcount_minimal_count]

      # [START examples_wordcount_minimal_map]
      | beam.Map(lambda (word, count): '%s: %s' % (word, count))
      # [END examples_wordcount_minimal_map]

      # [START examples_wordcount_minimal_write]
      | beam.io.WriteToText('gs://my-bucket/counts.txt')
      # [END examples_wordcount_minimal_write]
  )

  p.visit(SnippetUtils.RenameFiles(renames))

  # [START examples_wordcount_minimal_run]
  result = p.run()
  # [END examples_wordcount_minimal_run]
  result.wait_until_finish()
Example 5: test_get_all_options
# Required import: from apache_beam.utils.pipeline_options import PipelineOptions [as alias]
# Or alternatively: from apache_beam.utils.pipeline_options.PipelineOptions import view_as [as alias]
def test_get_all_options(self):
  for case in PipelineOptionsTest.TEST_CASES:
    options = PipelineOptions(flags=case['flags'])
    self.assertDictContainsSubset(case['expected'], options.get_all_options())
    self.assertEqual(options.view_as(
        PipelineOptionsTest.MockOptions).mock_flag,
                     case['expected']['mock_flag'])
    self.assertEqual(options.view_as(
        PipelineOptionsTest.MockOptions).mock_option,
                     case['expected']['mock_option'])
Example 6: test_no_main_session
# Required import: from apache_beam.utils.pipeline_options import PipelineOptions [as alias]
# Or alternatively: from apache_beam.utils.pipeline_options.PipelineOptions import view_as [as alias]
def test_no_main_session(self):
  staging_dir = tempfile.mkdtemp()
  options = PipelineOptions()

  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  options.view_as(SetupOptions).save_main_session = False
  self.update_options(options)

  self.assertEqual(
      [],
      dependency.stage_job_resources(options))
Example 7: run
# Required import: from apache_beam.utils.pipeline_options import PipelineOptions [as alias]
# Or alternatively: from apache_beam.utils.pipeline_options.PipelineOptions import view_as [as alias]
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--kind',
                      dest='kind',
                      required=True,
                      help='Datastore Kind')
  parser.add_argument('--namespace',
                      dest='namespace',
                      help='Datastore Namespace')
  parser.add_argument('--ancestor',
                      dest='ancestor',
                      default='root',
                      help='The ancestor key name for all entities.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  parser.add_argument('--read_only',
                      action='store_true',
                      help='Read an existing dataset, do not write first')
  parser.add_argument('--num_shards',
                      dest='num_shards',
                      type=int,
                      # If the system should choose automatically.
                      default=0,
                      help='Number of output shards')

  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  gcloud_options = pipeline_options.view_as(GoogleCloudOptions)

  # Write to Datastore if the `read_only` option is not specified.
  if not known_args.read_only:
    write_to_datastore(gcloud_options.project, known_args, pipeline_options)

  # Read entities from Datastore.
  result = read_from_datastore(gcloud_options.project, known_args,
                               pipeline_options)

  empty_lines_filter = MetricsFilter().with_name('empty_lines')
  query_result = result.metrics().query(empty_lines_filter)
  if query_result['counters']:
    empty_lines_counter = query_result['counters'][0]
    logging.info('number of empty lines: %d', empty_lines_counter.committed)
Example 8: test_sdk_location_gcs
# Required import: from apache_beam.utils.pipeline_options import PipelineOptions [as alias]
# Or alternatively: from apache_beam.utils.pipeline_options.PipelineOptions import view_as [as alias]
def test_sdk_location_gcs(self):
  staging_dir = tempfile.mkdtemp()
  sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
  self.override_file_copy(sdk_location, staging_dir)

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).sdk_location = sdk_location

  self.assertEqual(
      [names.DATAFLOW_SDK_TARBALL_FILE],
      dependency.stage_job_resources(options))
Example 9: test_requirements_file_not_present
# Required import: from apache_beam.utils.pipeline_options import PipelineOptions [as alias]
# Or alternatively: from apache_beam.utils.pipeline_options.PipelineOptions import view_as [as alias]
def test_requirements_file_not_present(self):
  staging_dir = tempfile.mkdtemp()
  with self.assertRaises(RuntimeError) as cm:
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_file = 'nosuchfile'
    dependency.stage_job_resources(
        options, populate_requirements_cache=self.populate_requirements_cache)
  self.assertEqual(
      cm.exception.message,
      'The file %s cannot be found. It was specified in the '
      '--requirements_file command line option.' % 'nosuchfile')
Example 10: test_with_main_session
# Required import: from apache_beam.utils.pipeline_options import PipelineOptions [as alias]
# Or alternatively: from apache_beam.utils.pipeline_options.PipelineOptions import view_as [as alias]
def test_with_main_session(self):
  staging_dir = tempfile.mkdtemp()
  options = PipelineOptions()

  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  options.view_as(SetupOptions).save_main_session = True
  self.update_options(options)

  self.assertEqual(
      [names.PICKLED_MAIN_SESSION_FILE],
      dependency.stage_job_resources(options))
  self.assertTrue(
      os.path.isfile(
          os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)))
Example 11: test_with_extra_packages_missing_files
# Required import: from apache_beam.utils.pipeline_options import PipelineOptions [as alias]
# Or alternatively: from apache_beam.utils.pipeline_options.PipelineOptions import view_as [as alias]
def test_with_extra_packages_missing_files(self):
  staging_dir = tempfile.mkdtemp()
  with self.assertRaises(RuntimeError) as cm:
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = ['nosuchfile.tar.gz']
    dependency.stage_job_resources(options)
  self.assertEqual(
      cm.exception.message,
      'The file %s cannot be found. It was specified in the '
      '--extra_packages command line option.' % 'nosuchfile.tar.gz')
Example 12: run
# Required import: from apache_beam.utils.pipeline_options import PipelineOptions [as alias]
# Or alternatively: from apache_beam.utils.pipeline_options.PipelineOptions import view_as [as alias]
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""

  class WordcountOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_value_provider_argument(
          '--input',
          dest='input',
          default='gs://dataflow-samples/shakespeare/kinglear.txt',
          help='Input file to process.')
      parser.add_value_provider_argument(
          '--output',
          dest='output',
          required=True,
          help='Output file to write results to.')

  pipeline_options = PipelineOptions(argv)
  wordcount_options = pipeline_options.view_as(WordcountOptions)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(wordcount_options.input)

  # Count the occurrences of each word.
  counts = (lines
            | 'split' >> (beam.ParDo(WordExtractingDoFn())
                          .with_output_types(unicode))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones))))

  # Format the counts into a PCollection of strings.
  output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c))

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(wordcount_options.output)

  # Actually run the pipeline (all operations above are deferred).
  result = p.run()
  result.wait_until_finish()

  empty_lines_filter = MetricsFilter().with_name('empty_lines')
  query_result = result.metrics().query(empty_lines_filter)
  if query_result['counters']:
    empty_lines_counter = query_result['counters'][0]
    logging.info('number of empty lines: %d', empty_lines_counter.committed)
Example 13: pipeline_options_remote
# Required import: from apache_beam.utils.pipeline_options import PipelineOptions [as alias]
# Or alternatively: from apache_beam.utils.pipeline_options.PipelineOptions import view_as [as alias]
def pipeline_options_remote(argv):
  """Creating a Pipeline using a PipelineOptions object for remote execution."""

  from apache_beam import Pipeline
  from apache_beam.utils.pipeline_options import PipelineOptions

  # [START pipeline_options_create]
  options = PipelineOptions(flags=argv)
  # [END pipeline_options_create]

  # [START pipeline_options_define_custom]
  class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input')
      parser.add_argument('--output')
  # [END pipeline_options_define_custom]

  from apache_beam.utils.pipeline_options import GoogleCloudOptions
  from apache_beam.utils.pipeline_options import StandardOptions

  # [START pipeline_options_dataflow_service]
  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=argv)

  # For Cloud execution, set the Cloud Platform project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://my-bucket/binaries'
  google_cloud_options.temp_location = 'gs://my-bucket/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)
  # [END pipeline_options_dataflow_service]

  my_options = options.view_as(MyOptions)
  my_input = my_options.input
  my_output = my_options.output

  p = TestPipeline()  # Use TestPipeline for testing.

  lines = p | beam.io.ReadFromText(my_input)
  lines | beam.io.WriteToText(my_output)

  p.run()
Example 14: test_sdk_location_local_not_present
# Required import: from apache_beam.utils.pipeline_options import PipelineOptions [as alias]
# Or alternatively: from apache_beam.utils.pipeline_options.PipelineOptions import view_as [as alias]
def test_sdk_location_local_not_present(self):
  staging_dir = tempfile.mkdtemp()
  sdk_location = 'nosuchdir'
  with self.assertRaises(RuntimeError) as cm:
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    dependency.stage_job_resources(options)
  self.assertEqual(
      'The file "%s" cannot be found. Its '
      'location was specified by the --sdk_location command-line option.' %
      sdk_location,
      cm.exception.message)
Example 15: test_with_extra_packages_invalid_file_name
# Required import: from apache_beam.utils.pipeline_options import PipelineOptions [as alias]
# Or alternatively: from apache_beam.utils.pipeline_options.PipelineOptions import view_as [as alias]
def test_with_extra_packages_invalid_file_name(self):
  staging_dir = tempfile.mkdtemp()
  source_dir = tempfile.mkdtemp()
  self.create_temp_file(
      os.path.join(source_dir, 'abc.tgz'), 'nothing')
  with self.assertRaises(RuntimeError) as cm:
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tgz')]
    dependency.stage_job_resources(options)
  self.assertEqual(
      cm.exception.message,
      'The --extra_package option expects a full path ending with ".tar" or '
      '".tar.gz" instead of %s' % os.path.join(source_dir, 'abc.tgz'))