This article collects typical usage examples of the Python class apache_beam.pipeline.Pipeline. If you are wondering what the Pipeline class does, or how and where to use it, the curated code examples below may help.
The following presents 15 code examples of the Pipeline class, sorted by popularity by default.
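Before the test-oriented examples, here is a minimal sketch of the basic Pipeline lifecycle (construct, apply transforms, run, block on completion); the labels and element values are illustrative only:

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# Build a pipeline, apply a small chain of transforms, then run it and
# wait for it to finish; the same pattern the tests below exercise.
p = beam.Pipeline(options=PipelineOptions())
_ = (p
     | 'Create' >> beam.Create([1, 2, 3])
     | 'Square' >> beam.Map(lambda x: x * x))
result = p.run()
result.wait_until_finish()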
Example 1: test_full_completion
def test_full_completion(self):
  # Create dummy file and close it. Note that we need to do this because
  # Windows does not allow NamedTemporaryFiles to be reopened elsewhere
  # before the temporary file is closed.
  dummy_file = tempfile.NamedTemporaryFile(delete=False)
  dummy_file_name = dummy_file.name
  dummy_file.close()

  dummy_dir = tempfile.mkdtemp()

  remote_runner = DataflowRunner()
  pipeline = Pipeline(remote_runner,
                      options=PipelineOptions([
                          '--dataflow_endpoint=ignored',
                          '--sdk_location=' + dummy_file_name,
                          '--job_name=test-job',
                          '--project=test-project',
                          '--staging_location=' + dummy_dir,
                          '--temp_location=/dev/null',
                          '--template_location=' + dummy_file_name,
                          '--no_auth=True']))

  pipeline | beam.Create([1, 2, 3]) | beam.Map(lambda x: x)  # pylint: disable=expression-not-assigned
  pipeline.run().wait_until_finish()
  with open(dummy_file_name) as template_file:
    saved_job_dict = json.load(template_file)
    self.assertEqual(
        saved_job_dict['environment']['sdkPipelineOptions']
        ['options']['project'], 'test-project')
    self.assertEqual(
        saved_job_dict['environment']['sdkPipelineOptions']
        ['options']['job_name'], 'test-job')
Example 2: test_direct_runner_metrics
def test_direct_runner_metrics(self):

  class MyDoFn(beam.DoFn):
    def start_bundle(self):
      count = Metrics.counter(self.__class__, 'bundles')
      count.inc()

    def finish_bundle(self):
      count = Metrics.counter(self.__class__, 'finished_bundles')
      count.inc()

    def process(self, element):
      gauge = Metrics.gauge(self.__class__, 'latest_element')
      gauge.set(element)
      count = Metrics.counter(self.__class__, 'elements')
      count.inc()
      distro = Metrics.distribution(self.__class__, 'element_dist')
      distro.update(element)
      return [element]

  runner = DirectRunner()
  p = Pipeline(runner,
               options=PipelineOptions(self.default_properties))
  pcoll = (p | ptransform.Create([1, 2, 3, 4, 5])
           | 'Do' >> beam.ParDo(MyDoFn()))
  assert_that(pcoll, equal_to([1, 2, 3, 4, 5]))
  result = p.run()
  result.wait_until_finish()
  metrics = result.metrics().query()
  namespace = '{}.{}'.format(MyDoFn.__module__,
                             MyDoFn.__name__)

  hc.assert_that(
      metrics['counters'],
      hc.contains_inanyorder(
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'elements')),
              5, 5),
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'bundles')),
              1, 1),
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'finished_bundles')),
              1, 1)))
  hc.assert_that(
      metrics['distributions'],
      hc.contains_inanyorder(
          MetricResult(
              MetricKey('Do', MetricName(namespace, 'element_dist')),
              DistributionResult(DistributionData(15, 5, 1, 5)),
              DistributionResult(DistributionData(15, 5, 1, 5)))))

  gauge_result = metrics['gauges'][0]
  hc.assert_that(
      gauge_result.key,
      hc.equal_to(MetricKey('Do', MetricName(namespace, 'latest_element'))))
  hc.assert_that(gauge_result.committed.value, hc.equal_to(5))
  hc.assert_that(gauge_result.attempted.value, hc.equal_to(5))
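When only specific metrics are of interest, the query can be narrowed instead of fetching everything; a minimal follow-up sketch, assuming the MetricsFilter helper from apache_beam.metrics.metric and the result object from the test above:

from apache_beam.metrics.metric import MetricsFilter

# Query just the 'elements' counter declared by MyDoFn above.
filtered = result.metrics().query(MetricsFilter().with_name('elements'))
counter = filtered['counters'][0]
print(counter.committed, counter.attempted)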
Example 3: test_biqquery_read_streaming_fail
def test_biqquery_read_streaming_fail(self):
  remote_runner = DataflowRunner()
  self.default_properties.append("--streaming")
  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  _ = p | beam.io.Read(beam.io.BigQuerySource('some.table'))
  with self.assertRaisesRegexp(ValueError,
                               r'source is not currently available'):
    p.run()
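In streaming mode an unbounded source such as Pub/Sub would be used instead of BigQuery; a sketch, assuming beam.io.ReadFromPubSub is available in the installed SDK and using a placeholder subscription path:

import apache_beam as beam

# Unbounded read that is valid under --streaming, unlike BigQuerySource.
# The subscription path below is hypothetical.
lines = p | beam.io.ReadFromPubSub(
    subscription='projects/test-project/subscriptions/some-subscription')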
Example 4: test_remote_runner_translation
def test_remote_runner_translation(self):
  remote_runner = DataflowRunner()
  p = Pipeline(remote_runner,
               options=PipelineOptions(self.default_properties))

  (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
   | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
   | ptransform.GroupByKey())
  p.run()
Example 5: run
def run(self, transform, options=None):
  """Run the given transform with this runner."""
  # Imported here to avoid circular dependencies.
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam.pipeline import Pipeline
  p = Pipeline(runner=self, options=options)
  p | transform
  return p.run()
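For context, a transform passed to this helper is applied to a fresh pipeline, so it must expand from the pipeline's begin marker; a sketch of such a composite, where CountWords is a hypothetical name:

import apache_beam as beam

class CountWords(beam.PTransform):
  # Expands from PBegin, so an instance can be handed straight to
  # runner.run(transform) as defined above.
  def expand(self, pbegin):
    return (pbegin
            | beam.Create(['to be', 'or not to be'])
            | beam.FlatMap(lambda line: line.split())
            | beam.combiners.Count.PerElement())

result = DirectRunner().run(CountWords())  # assuming the runner exposes run()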
Example 6: test_parent_pointer
def test_parent_pointer(self):
  class MyPTransform(beam.PTransform):
    def expand(self, p):
      self.p = p
      return p | beam.Create([None])

  p = beam.Pipeline()
  p | MyPTransform()  # pylint: disable=expression-not-assigned
  p = Pipeline.from_runner_api(Pipeline.to_runner_api(p), None, None)
  self.assertIsNotNone(p.transforms_stack[0].parts[0].parent)
  self.assertEqual(p.transforms_stack[0].parts[0].parent,
                   p.transforms_stack[0])
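The to_runner_api/from_runner_api pair used here round-trips the pipeline through the portable Runner API proto; a minimal standalone sketch of that round trip:

import apache_beam as beam
from apache_beam.pipeline import Pipeline

p = beam.Pipeline()
_ = p | beam.Create([1, 2, 3])

proto = p.to_runner_api()                            # Pipeline -> proto
clone = Pipeline.from_runner_api(proto, None, None)  # proto -> Pipeline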
Example 7: test_remote_runner_display_data
def test_remote_runner_display_data(self):
  remote_runner = DataflowRunner()
  p = Pipeline(remote_runner,
               options=PipelineOptions(self.default_properties))

  # TODO: Should not subclass ParDo. Switch to PTransform as soon as
  # composite transforms support display data.
  class SpecialParDo(beam.ParDo):
    def __init__(self, fn, now):
      super(SpecialParDo, self).__init__(fn)
      self.fn = fn
      self.now = now

    def display_data(self):
      return {'asubcomponent': self.fn,
              'a_class': SpecialParDo,
              'a_time': self.now}

  class SpecialDoFn(beam.DoFn):
    def display_data(self):
      return {'dofn_value': 42}

    def process(self):
      pass

  now = datetime.now()
  # pylint: disable=expression-not-assigned
  (p | ptransform.Create([1, 2, 3, 4, 5])
   | 'Do' >> SpecialParDo(SpecialDoFn(), now))

  p.run()
  job_dict = json.loads(str(remote_runner.job))
  steps = [step
           for step in job_dict['steps']
           if len(step['properties'].get('display_data', [])) > 0]
  step = steps[1]
  disp_data = step['properties']['display_data']
  disp_data = sorted(disp_data, key=lambda x: x['namespace'] + x['key'])
  nspace = SpecialParDo.__module__ + '.'
  expected_data = [{'type': 'TIMESTAMP', 'namespace': nspace + 'SpecialParDo',
                    'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
                    'key': 'a_time'},
                   {'type': 'STRING', 'namespace': nspace + 'SpecialParDo',
                    'value': nspace + 'SpecialParDo', 'key': 'a_class',
                    'shortValue': 'SpecialParDo'},
                   {'type': 'INTEGER', 'namespace': nspace + 'SpecialDoFn',
                    'value': 42, 'key': 'dofn_value'}]
  expected_data = sorted(expected_data, key=lambda x: x['namespace'] + x['key'])
  self.assertEqual(len(disp_data), 3)
  self.assertEqual(disp_data, expected_data)
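Display data can also be inspected directly, without constructing a Dataflow job; a small sketch, assuming the DisplayData helper from apache_beam.transforms.display:

from apache_beam.transforms.display import DisplayData

# Collect the items declared by SpecialDoFn.display_data() above.
for item in DisplayData.create_from(SpecialDoFn()).items:
  print(item.key, item.value)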
Example 8: run
def run(self, transform, options=None):
  """Run the given transform or callable with this runner."""
  # Imported here to avoid circular dependencies.
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam import PTransform
  from apache_beam.pvalue import PBegin
  from apache_beam.pipeline import Pipeline
  p = Pipeline(runner=self, options=options)
  if isinstance(transform, PTransform):
    p | transform
  else:
    transform(PBegin(p))
  return p.run()
Example 9: test_reuse_custom_transform_instance
def test_reuse_custom_transform_instance(self):
  pipeline = Pipeline()
  pcoll1 = pipeline | 'pcoll1' >> Create([1, 2, 3])
  pcoll2 = pipeline | 'pcoll2' >> Create([4, 5, 6])
  transform = PipelineTest.CustomTransform()
  pcoll1 | transform
  with self.assertRaises(RuntimeError) as cm:
    pipeline.apply(transform, pcoll2)
  self.assertEqual(
      cm.exception.args[0],
      'Transform "CustomTransform" does not have a stable unique label. '
      'This will prevent updating of pipelines. '
      'To apply a transform with a specified label write '
      'pvalue | "label" >> transform')
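The fix the error message suggests is to give each application of the instance its own stable label; a minimal sketch:

# Reusing one transform instance is fine once every application
# carries a unique label via the >> operator.
pcoll1 | 'ApplyFirst' >> transform
pcoll2 | 'ApplySecond' >> transform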
Example 10: test_streaming_create_translation
def test_streaming_create_translation(self):
  remote_runner = DataflowRunner()
  self.default_properties.append("--streaming")
  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  p.run()
  job_dict = json.loads(str(remote_runner.job))
  self.assertEqual(len(job_dict[u'steps']), 2)

  self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
  self.assertEqual(
      job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
      '_starting_signal/')
  self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
Example 11: setUp
def setUp(self):
  self.pipeline = Pipeline(DirectRunner())
  self.visitor = ConsumerTrackingPipelineVisitor()
  try:                    # Python 2
    self.assertCountEqual = self.assertItemsEqual
  except AttributeError:  # Python 3
    pass
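A typical use of this fixture walks a small graph and then inspects what the visitor recorded; a sketch, assuming the visitor exposes root_transforms and value_to_consumers attributes as in the DirectRunner's consumer-tracking visitor:

# Build a trivial graph, visit it, then read back what was tracked.
_ = self.pipeline | 'Create' >> Create([1, 2, 3])
self.pipeline.visit(self.visitor)
root_transforms = self.visitor.root_transforms  # transforms with no inputs
consumers = self.visitor.value_to_consumers     # PCollection -> consumers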
Example 12: run
def run(self, request, context):
  # Note: uuid4().get_hex() is Python 2 only; on Python 3 this would be
  # uuid.uuid4().hex.
  job_id = uuid.uuid4().get_hex()
  pipeline_result = Pipeline.from_runner_api(
      request.pipeline,
      'DirectRunner',
      PipelineOptions()).run()
  self.jobs[job_id] = pipeline_result
  return beam_job_api_pb2.SubmitJobResponse(jobId=job_id)
Example 13: test_streaming_create_translation
def test_streaming_create_translation(self):
  remote_runner = DataflowRunner()
  self.default_properties.append("--streaming")
  p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
  p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
  remote_runner.job = apiclient.Job(p._options)
  # Performing configured PTransform overrides here.
  p.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)
  super(DataflowRunner, remote_runner).run(p)
  job_dict = json.loads(str(remote_runner.job))
  self.assertEqual(len(job_dict[u'steps']), 2)

  self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
  self.assertEqual(
      job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
      '_starting_signal/')
  self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
Example 14: test_bad_path
def test_bad_path(self):
  dummy_sdk_file = tempfile.NamedTemporaryFile()
  remote_runner = DataflowRunner()
  pipeline = Pipeline(remote_runner,
                      options=PipelineOptions([
                          '--dataflow_endpoint=ignored',
                          '--sdk_location=' + dummy_sdk_file.name,
                          '--job_name=test-job',
                          '--project=test-project',
                          '--staging_location=ignored',
                          '--temp_location=/dev/null',
                          '--template_location=/bad/path',
                          '--no_auth=True']))
  remote_runner.job = apiclient.Job(pipeline._options)
  with self.assertRaises(IOError):
    pipeline.run().wait_until_finish()
Example 15: test_visit_entire_graph
def test_visit_entire_graph(self):
  pipeline = Pipeline()
  pcoll1 = pipeline | 'pcoll' >> Create([1, 2, 3])
  pcoll2 = pcoll1 | 'do1' >> FlatMap(lambda x: [x + 1])
  pcoll3 = pcoll2 | 'do2' >> FlatMap(lambda x: [x + 1])
  pcoll4 = pcoll2 | 'do3' >> FlatMap(lambda x: [x + 1])
  transform = PipelineTest.CustomTransform()
  pcoll5 = pcoll4 | transform

  visitor = PipelineTest.Visitor(visited=[])
  pipeline.visit(visitor)
  self.assertEqual(set([pcoll1, pcoll2, pcoll3, pcoll4, pcoll5]),
                   set(visitor.visited))
  self.assertEqual(set(visitor.enter_composite),
                   set(visitor.leave_composite))
  self.assertEqual(3, len(visitor.enter_composite))
  self.assertEqual(visitor.enter_composite[2].transform, transform)
  self.assertEqual(visitor.leave_composite[1].transform, transform)
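The test depends on helpers defined elsewhere in PipelineTest; a plausible sketch of the Visitor helper, assuming the PipelineVisitor base class exported by apache_beam.pipeline:

from apache_beam.pipeline import PipelineVisitor

class Visitor(PipelineVisitor):
  # Records every PCollection visited, plus each composite transform
  # entered and left during traversal.
  def __init__(self, visited):
    self.visited = visited
    self.enter_composite = []
    self.leave_composite = []

  def visit_value(self, value, producer_node):
    self.visited.append(value)

  def enter_composite_transform(self, transform_node):
    self.enter_composite.append(transform_node)

  def leave_composite_transform(self, transform_node):
    self.leave_composite.append(transform_node)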