本文整理匯總了Python中apache_beam.runners.DataflowRunner類的典型用法代碼示例。如果您正苦於以下問題:Python DataflowRunner類的具體用法?Python DataflowRunner怎麼用?Python DataflowRunner使用的例子?那麼,這裡精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了DataflowRunner類的13個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: test_group_by_key_input_visitor_for_non_gbk_transforms
def test_group_by_key_input_visitor_for_non_gbk_transforms(self):
    """Non-GroupByKey transforms keep their input element type.

    Runs the group-by-key input visitor over a Flatten and a Map and checks
    that the input PCollection's element type is still ``typehints.Any``
    afterwards.
    """
    pipeline = TestPipeline()
    pcoll = PCollection(pipeline)
    non_gbk_transforms = [beam.Flatten(), beam.Map(lambda x: x)]
    for transform in non_gbk_transforms:
        # Reset the type before every visit so each transform starts from
        # the same state.
        pcoll.element_type = typehints.Any
        visitor = DataflowRunner.group_by_key_input_visitor()
        applied = AppliedPTransform(None, transform, "label", [pcoll])
        visitor.visit_transform(applied)
        self.assertEqual(pcoll.element_type, typehints.Any)
示例2: test_serialize_windowing_strategy
def test_serialize_windowing_strategy(self):
    """A windowing strategy survives a serialize/deserialize round trip."""
    # Only the basic path is covered here; the more complete tests live in
    # window_test.py.
    original = Windowing(window.FixedWindows(10))
    serialized = DataflowRunner.serialize_windowing_strategy(original)
    restored = DataflowRunner.deserialize_windowing_strategy(serialized)
    self.assertEqual(original, restored)
示例3: test_remote_runner_translation
def test_remote_runner_translation(self):
    """Run a Create -> FlatMap -> GroupByKey pipeline through the base run().

    A DataflowRunner is given an ``apiclient.Job`` built from the pipeline
    options, and the pipeline is then executed via the ``run`` method of
    DataflowRunner's parent class. The test makes no assertions; it passes
    as long as nothing raises.
    """
    runner = DataflowRunner()
    options = PipelineOptions(self.default_properties)
    pipeline = Pipeline(runner, options=options)

    created = pipeline | ptransform.Create([1, 2, 3])
    keyed = created | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
    keyed | ptransform.GroupByKey()  # pylint: disable=expression-not-assigned

    runner.job = apiclient.Job(pipeline.options)
    # Deliberately skip DataflowRunner.run and call the parent implementation.
    super(DataflowRunner, runner).run(pipeline)
示例4: test_group_by_key_input_visitor_with_invalid_inputs
def test_group_by_key_input_visitor_with_invalid_inputs(self):
    """GroupByKey inputs with non-tuple element types are rejected.

    For both ``_GroupByKeyOnly`` and ``beam.GroupByKey``, visiting an input
    typed as ``TupleSequenceConstraint`` or ``Set`` must raise a
    ``ValueError`` whose message matches the expected text.
    """
    expected_error = "Input to GroupByKey must be of Tuple or Any type"
    pipeline = TestPipeline()
    tuple_sequence_pcoll = PCollection(pipeline)
    set_pcoll = PCollection(pipeline)

    for transform in (_GroupByKeyOnly(), beam.GroupByKey()):
        # Re-assign the invalid types for every transform under test.
        tuple_sequence_pcoll.element_type = typehints.TupleSequenceConstraint
        set_pcoll.element_type = typehints.Set
        for invalid_input in (tuple_sequence_pcoll, set_pcoll):
            # NOTE(review): assertRaisesRegexp is the deprecated alias of
            # assertRaisesRegex; kept as-is for Python 2-era compatibility.
            with self.assertRaisesRegexp(ValueError, expected_error):
                visitor = DataflowRunner.group_by_key_input_visitor()
                applied = AppliedPTransform(
                    None, transform, "label", [invalid_input])
                visitor.visit_transform(applied)
示例5: test_remote_runner_display_data
def test_remote_runner_display_data(self):
    """Display data of a ParDo and its DoFn shows up in the job's steps.

    Builds a pipeline with a ParDo subclass and a DoFn that both expose
    ``display_data``, runs it through the parent class ``run`` of
    DataflowRunner, and compares the ``display_data`` property of the first
    step that has any against the expected three items.
    """
    runner = DataflowRunner()
    pipeline = Pipeline(
        runner, options=PipelineOptions(self.default_properties))

    # TODO: Should not subclass ParDo. Switch to PTransform as soon as
    # composite transforms support display data.
    class SpecialParDo(beam.ParDo):
        def __init__(self, fn, now):
            super(SpecialParDo, self).__init__(fn)
            self.fn = fn
            self.now = now

        def display_data(self):
            return {'asubcomponent': self.fn,
                    'a_class': SpecialParDo,
                    'a_time': self.now}

    class SpecialDoFn(beam.DoFn):
        def display_data(self):
            return {'dofn_value': 42}

        def process(self):
            pass

    now = datetime.now()
    # pylint: disable=expression-not-assigned
    (pipeline
     | ptransform.Create([1, 2, 3, 4, 5])
     | 'Do' >> SpecialParDo(SpecialDoFn(), now))

    runner.job = apiclient.Job(pipeline.options)
    super(DataflowRunner, runner).run(pipeline)
    job_dict = json.loads(str(runner.job))

    def by_namespace_and_key(item):
        # Both lists are ordered the same way so they compare element-wise.
        return item['namespace'] + item['key']

    steps_with_display_data = [
        candidate
        for candidate in job_dict['steps']
        if len(candidate['properties'].get('display_data', [])) > 0]
    first_step = steps_with_display_data[0]
    actual_items = sorted(
        first_step['properties']['display_data'], key=by_namespace_and_key)

    module_prefix = SpecialParDo.__module__ + '.'
    pardo_namespace = module_prefix + 'SpecialParDo'
    dofn_namespace = module_prefix + 'SpecialDoFn'
    time_item = {
        'type': 'TIMESTAMP',
        'namespace': pardo_namespace,
        'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
        'key': 'a_time',
    }
    class_item = {
        'type': 'STRING',
        'namespace': pardo_namespace,
        'value': pardo_namespace,
        'key': 'a_class',
        'shortValue': 'SpecialParDo',
    }
    dofn_item = {
        'type': 'INTEGER',
        'namespace': dofn_namespace,
        'value': 42,
        'key': 'dofn_value',
    }
    expected_items = sorted(
        [time_item, class_item, dofn_item], key=by_namespace_and_key)

    self.assertEqual(len(actual_items), 3)
    self.assertEqual(actual_items, expected_items)
示例6: test_group_by_key_input_visitor_with_valid_inputs
def test_group_by_key_input_visitor_with_valid_inputs(self):
    """Valid GroupByKey inputs are normalised to ``KV[Any, Any]``.

    For both ``_GroupByKeyOnly`` and ``beam.GroupByKey``, inputs whose
    element type starts as ``None``, ``Any`` or ``KV[Any, Any]`` must all
    end up typed ``KV[Any, Any]`` after the visitor has run.
    """
    pipeline = TestPipeline()
    pcolls = [PCollection(pipeline) for _ in range(3)]

    for transform in (_GroupByKeyOnly(), beam.GroupByKey()):
        starting_types = (
            None,
            typehints.Any,
            typehints.KV[typehints.Any, typehints.Any],
        )
        # Assign all starting types first, then visit each input in turn.
        for pcoll, starting_type in zip(pcolls, starting_types):
            pcoll.element_type = starting_type
        for pcoll in pcolls:
            visitor = DataflowRunner.group_by_key_input_visitor()
            applied = AppliedPTransform(None, transform, "label", [pcoll])
            visitor.visit_transform(applied)
            self.assertEqual(
                pcoll.element_type,
                typehints.KV[typehints.Any, typehints.Any])
示例7: test_group_by_key_input_visitor_with_invalid_inputs
def test_group_by_key_input_visitor_with_invalid_inputs(self):
    """GroupByKey inputs incompatible with ``KV[Any, Any]`` are rejected.

    For both ``_GroupByKeyOnly`` and ``beam.GroupByKey``, visiting an input
    typed as ``str`` or ``typehints.Set`` must raise a ``ValueError`` whose
    message matches the expected pattern.
    """
    expected_error = (
        r"Input to 'label' must be compatible with KV\[Any, Any\]. "
        "Found .*")
    pipeline = TestPipeline()
    str_pcoll = PCollection(pipeline)
    set_pcoll = PCollection(pipeline)

    for transform in (_GroupByKeyOnly(), beam.GroupByKey()):
        # Re-assign the invalid types for every transform under test.
        str_pcoll.element_type = str
        set_pcoll.element_type = typehints.Set
        for invalid_input in (str_pcoll, set_pcoll):
            # NOTE(review): assertRaisesRegexp is the deprecated alias of
            # assertRaisesRegex; kept as-is for Python 2-era compatibility.
            with self.assertRaisesRegexp(ValueError, expected_error):
                visitor = DataflowRunner.group_by_key_input_visitor()
                applied = AppliedPTransform(
                    None, transform, "label", [invalid_input])
                visitor.visit_transform(applied)
示例8: _test_flatten_input_visitor
def _test_flatten_input_visitor(self, input_type, output_type, num_inputs):
    """Check that the flatten input visitor aligns input types to the output.

    Builds a Flatten applied to ``num_inputs`` PCollections typed
    ``input_type`` with a single output typed ``output_type``, runs
    ``DataflowRunner.flatten_input_visitor()`` over it, and asserts that
    every input now carries ``output_type``.

    Args:
      input_type: element type assigned to each input before the visit.
      output_type: element type assigned to the Flatten output, and the
        type each input is expected to have after the visit.
      num_inputs: number of input PCollections to create.
    """
    p = TestPipeline()
    inputs = []
    for _ in range(num_inputs):
        input_pcoll = PCollection(p)
        input_pcoll.element_type = input_type
        inputs.append(input_pcoll)
    output_pcoll = PCollection(p)
    output_pcoll.element_type = output_type

    flatten = AppliedPTransform(None, beam.Flatten(), "label", inputs)
    flatten.add_output(output_pcoll, None)
    DataflowRunner.flatten_input_visitor().visit_transform(flatten)

    # Check every input, not just the first one: the previous loop asserted
    # inputs[0] on each iteration and so never looked at the other inputs.
    for input_pcoll in inputs:
        self.assertEqual(input_pcoll.element_type, output_type)
示例9: test_side_input_visitor
def test_side_input_visitor(self):
    """The side input visitor rewrites side inputs to the multimap pattern.

    A Map with an ``AsSingleton`` and an ``AsMultiMap`` side input is
    visited; afterwards the applied transform must still have two side
    inputs, each reporting the Dataflow multimap URN as access pattern.
    """
    pipeline = TestPipeline()
    source = pipeline | beam.Create([])
    singleton_view = beam.pvalue.AsSingleton(source)
    multimap_view = beam.pvalue.AsMultiMap(source)
    transform = beam.Map(
        lambda x, y, z: (x, y, z), singleton_view, multimap_view)
    applied_transform = AppliedPTransform(None, transform, "label", [source])

    visitor = DataflowRunner.side_input_visitor()
    visitor.visit_transform(applied_transform)

    visited_side_inputs = applied_transform.side_inputs
    self.assertEqual(2, len(visited_side_inputs))
    for side_input in visited_side_inputs:
        access_pattern = side_input._side_input_data().access_pattern
        self.assertEqual(
            dataflow_runner._DataflowSideInput.DATAFLOW_MULTIMAP_URN,
            access_pattern)
示例10: test_streaming_create_translation
def test_streaming_create_translation(self):
    """A streaming Create translates to a pubsub read followed by a ParDo.

    Runs a one-element Create with ``--streaming`` through the configured
    PTransform overrides and the parent class ``run`` of DataflowRunner,
    then checks the resulting job has exactly two steps: a ``ParallelRead``
    on the ``_starting_signal/`` pubsub subscription and a ``ParallelDo``.
    """
    remote_runner = DataflowRunner()
    # Build a per-test copy of the flags instead of appending to
    # self.default_properties in place, so "--streaming" cannot leak into
    # anything else that shares that list (e.g. other tests, if the list is
    # defined at class level).
    streaming_properties = list(self.default_properties) + ["--streaming"]
    p = Pipeline(remote_runner, PipelineOptions(streaming_properties))
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
    remote_runner.job = apiclient.Job(p._options)
    # Performing configured PTransform overrides here.
    p.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)
    super(DataflowRunner, remote_runner).run(p)

    job_dict = json.loads(str(remote_runner.job))
    self.assertEqual(len(job_dict[u'steps']), 2)
    self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
    self.assertEqual(
        job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
        '_starting_signal/')
    self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
示例11: test_gbk_then_flatten_input_visitor
def test_gbk_then_flatten_input_visitor(self):
    """GBK and flatten visitors together yield consistent tuple types.

    Two Create outputs are flattened and fed to a GroupByKey. After running
    the group-by-key input visitor followed by the flatten input visitor,
    the flattened PCollection must be tuple-typed and both of its inputs
    must carry the same element type.
    """
    runner = DataflowRunner()
    options = PipelineOptions(self.default_properties)
    pipeline = TestPipeline(runner=runner, options=options)

    none_str_pc = pipeline | 'c1' >> beam.Create({None: 'a'})
    none_int_pc = pipeline | 'c2' >> beam.Create({None: 3})
    flat = (none_str_pc, none_int_pc) | beam.Flatten()
    _ = flat | beam.GroupByKey()

    # This may change if type inference changes, but we assert it here
    # to make sure the check below is not vacuous.
    self.assertNotIsInstance(flat.element_type, typehints.TupleConstraint)

    pipeline.visit(DataflowRunner.group_by_key_input_visitor())
    pipeline.visit(DataflowRunner.flatten_input_visitor())

    # The dataflow runner requires gbk input to be tuples *and* flatten
    # inputs to be equal to their outputs. Assert both hold.
    self.assertIsInstance(flat.element_type, typehints.TupleConstraint)
    for flatten_input in (none_str_pc, none_int_pc):
        self.assertEqual(flat.element_type, flatten_input.element_type)
示例12: _get_coder
def _get_coder(self, pvalue, windowed=True):
    """Return the coder for ``pvalue`` via ``DataflowRunner._get_coder``.

    Args:
      pvalue: value whose ``element_type`` (falling back to
        ``typehints.Any`` when falsy) selects the element coder.
      windowed: when true, the window coder of the pvalue's window function
        is passed along; otherwise ``None`` is passed in its place.

    Returns:
      Whatever ``DataflowRunner._get_coder`` returns for those arguments.
    """
    # TODO(robertwb): This should be an attribute of the pvalue itself.
    element_type = pvalue.element_type or typehints.Any
    if windowed:
        window_coder = pvalue.windowing.windowfn.get_window_coder()
    else:
        window_coder = None
    return DataflowRunner._get_coder(element_type, window_coder)
示例13: run_ParDo
def run_ParDo(self, transform_node):
    """Append a ``WorkerDoFn`` operation for a ParDo to a map task.

    The operation is normally added to the map task that produces the
    ParDo's main input. If any side input is produced by a map task that is
    reachable from that one through ``self.dependencies``, the current map
    task is ended with an in-memory write and a new map task, starting with
    a read of that buffer, is created for the ParDo instead.

    Side effects: updates ``self.map_tasks``, ``self.outputs`` (one entry
    for the main output and one per tagged output) and
    ``self.dependencies`` (the chosen map task depends on every map task
    producing one of its side inputs).

    Args:
      transform_node: the applied ParDo transform being translated.
    """
    transform = transform_node.transform
    main_output = transform_node.outputs[None]
    element_coder = self._get_coder(main_output)
    main_input = transform_node.inputs[0]
    map_task_index, producer_index, output_index = self.outputs[main_input]

    # If any of this ParDo's side inputs depend on outputs from this map_task,
    # we can't continue growing this map task.
    def is_reachable(leaf, root):
        # A map task reaches itself; otherwise search its dependencies.
        return leaf == root or any(
            is_reachable(dependency, root)
            for dependency in self.dependencies[leaf])

    needs_fusion_break = any(
        is_reachable(self.outputs[side_input.pvalue][0], map_task_index)
        for side_input in transform_node.side_inputs)

    if needs_fusion_break:
        # Start a new map task: write the main input to an in-memory buffer
        # at the end of the current task and read it back in a fresh one.
        input_element_coder = self._get_coder(main_input)
        output_buffer = OutputBuffer(input_element_coder)

        fusion_break_write = operation_specs.WorkerInMemoryWrite(
            output_buffer=output_buffer,
            write_windowed_values=True,
            input=(producer_index, output_index),
            output_coders=[input_element_coder])
        self.map_tasks[map_task_index].append(
            (transform_node.full_label + '/Write', fusion_break_write))

        previous_map_task_index = map_task_index
        map_task_index = len(self.map_tasks)
        producer_index = 0
        output_index = 0

        fusion_break_read = operation_specs.WorkerRead(
            output_buffer.source_bundle(),
            output_coders=[input_element_coder])
        self.map_tasks.append(
            [(transform_node.full_label + '/Read', fusion_break_read)])
        self.dependencies[map_task_index].add(previous_map_task_index)

    def create_side_read(side_input):
        # Materialise the side input and wrap its buffer as a side source.
        label = self.side_input_labels[side_input]
        side_buffer = self.run_side_write(
            side_input.pvalue, '%s/%s' % (transform_node.full_label, label))
        return operation_specs.WorkerSideInputSource(
            side_buffer.source(), label)

    # The arguments are computed in the same order in which the original
    # call evaluated them, since the side reads have side effects.
    serialized_fn = pickler.dumps(DataflowRunner._pardo_fn_data(
        transform_node,
        lambda side_input: self.side_input_labels[side_input]))
    tagged_output_names = [
        '%s_%s' % (PropertyNames.OUT, tag) for tag in transform.output_tags]
    output_tags = [PropertyNames.OUT] + tagged_output_names
    # Same assumption that DataflowRunner has about coders being compatible
    # across outputs.
    output_coders = [element_coder] * (len(transform.output_tags) + 1)
    do_input = (producer_index, output_index)
    side_reads = [
        create_side_read(side_input)
        for side_input in transform_node.side_inputs]
    do_op = operation_specs.WorkerDoFn(
        serialized_fn=serialized_fn,
        output_tags=output_tags,
        output_coders=output_coders,
        input=do_input,
        side_inputs=side_reads)

    # The DoFn becomes the next operation of the map task; record where each
    # of its outputs can be found (main output at 0, tagged ones from 1).
    producer_index = len(self.map_tasks[map_task_index])
    self.outputs[main_output] = (map_task_index, producer_index, 0)
    for tag_position, tag in enumerate(transform.output_tags, 1):
        tagged_output = transform_node.outputs[tag]
        self.outputs[tagged_output] = (
            map_task_index, producer_index, tag_position)

    self.map_tasks[map_task_index].append((transform_node.full_label, do_op))

    for side_input in transform_node.side_inputs:
        side_producer_task = self.outputs[side_input.pvalue][0]
        self.dependencies[map_task_index].add(side_producer_task)