當前位置: 首頁>>代碼示例>>Python>>正文


Python runners.DataflowRunner類代碼示例

本文整理匯總了Python中apache_beam.runners.DataflowRunner的典型用法代碼示例。如果您正苦於以下問題:Python DataflowRunner類的具體用法?Python DataflowRunner怎麼用?Python DataflowRunner使用的例子?那麼,這裡精選的類代碼示例或許可以為您提供幫助。


在下文中一共展示了DataflowRunner類的13個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: test_group_by_key_input_visitor_for_non_gbk_transforms

 def test_group_by_key_input_visitor_for_non_gbk_transforms(self):
   """Check the GBK input visitor on transforms that are not GroupByKey.

   For a Flatten and a Map transform, the input's element type is set to
   ``typehints.Any`` and must still be ``typehints.Any`` after the visit.
   """
   pipeline = TestPipeline()
   input_pcoll = PCollection(pipeline)
   non_gbk_transforms = [beam.Flatten(), beam.Map(lambda x: x)]
   for non_gbk in non_gbk_transforms:
     # Reset the type on every iteration so each transform starts from Any.
     input_pcoll.element_type = typehints.Any
     visitor = DataflowRunner.group_by_key_input_visitor()
     applied = AppliedPTransform(None, non_gbk, "label", [input_pcoll])
     visitor.visit_transform(applied)
     self.assertEqual(input_pcoll.element_type, typehints.Any)
開發者ID:aaltay,項目名稱:incubator-beam,代碼行數:8,代碼來源:dataflow_runner_test.py

示例2: test_serialize_windowing_strategy

 def test_serialize_windowing_strategy(self):
   """Round-trip a FixedWindows(10) strategy through serialize/deserialize."""
   # Only the basic path is exercised here; the more complete tests
   # live in window_test.py.
   original = Windowing(window.FixedWindows(10))
   serialized = DataflowRunner.serialize_windowing_strategy(original)
   round_tripped = DataflowRunner.deserialize_windowing_strategy(serialized)
   self.assertEqual(original, round_tripped)
開發者ID:aaltay,項目名稱:incubator-beam,代碼行數:8,代碼來源:dataflow_runner_test.py

示例3: test_remote_runner_translation

  def test_remote_runner_translation(self):
    """Run a Create -> FlatMap -> GroupByKey pipeline with a job attached.

    The pipeline is run through the ``run`` found after ``DataflowRunner``
    in the runner's MRO, not through ``DataflowRunner.run`` itself.
    """
    runner = DataflowRunner()
    options = PipelineOptions(self.default_properties)
    pipeline = Pipeline(runner, options=options)

    # Build the pipeline step by step; only the construction matters here.
    numbers = pipeline | ptransform.Create([1, 2, 3])
    pairs = numbers | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
    _ = pairs | ptransform.GroupByKey()

    runner.job = apiclient.Job(pipeline.options)
    super(DataflowRunner, runner).run(pipeline)
開發者ID:jasonkuster,項目名稱:incubator-beam,代碼行數:10,代碼來源:runner_test.py

示例4: test_group_by_key_input_visitor_with_invalid_inputs

 def test_group_by_key_input_visitor_with_invalid_inputs(self):
   """GBK inputs typed TupleSequenceConstraint or Set must raise ValueError."""
   pipeline = TestPipeline()
   seq_pcoll = PCollection(pipeline)
   set_pcoll = PCollection(pipeline)
   expected_error = "Input to GroupByKey must be of Tuple or Any type"
   gbk_transforms = [_GroupByKeyOnly(), beam.GroupByKey()]
   for gbk in gbk_transforms:
     # Re-assign the invalid element types before each transform is tried.
     seq_pcoll.element_type = typehints.TupleSequenceConstraint
     set_pcoll.element_type = typehints.Set
     for bad_input in (seq_pcoll, set_pcoll):
       with self.assertRaisesRegexp(ValueError, expected_error):
         visitor = DataflowRunner.group_by_key_input_visitor()
         visitor.visit_transform(
             AppliedPTransform(None, gbk, "label", [bad_input]))
開發者ID:aaltay,項目名稱:incubator-beam,代碼行數:12,代碼來源:dataflow_runner_test.py

示例5: test_remote_runner_display_data

  def test_remote_runner_display_data(self):
    """Check the display data recorded in the job for a ParDo and its DoFn.

    After running the pipeline, the first job step that carries display data
    must hold exactly three items: a TIMESTAMP ``a_time`` and a STRING
    ``a_class`` in the SpecialParDo namespace, and an INTEGER ``dofn_value``
    in the SpecialDoFn namespace.
    """
    runner = DataflowRunner()
    pipeline = Pipeline(
        runner, options=PipelineOptions(self.default_properties))

    # TODO: Should not subclass ParDo. Switch to PTransform as soon as
    # composite transforms support display data.
    class SpecialParDo(beam.ParDo):
      def __init__(self, fn, now):
        super(SpecialParDo, self).__init__(fn)
        self.fn = fn
        self.now = now

      def display_data(self):
        return {'asubcomponent': self.fn,
                'a_class': SpecialParDo,
                'a_time': self.now}

    class SpecialDoFn(beam.DoFn):
      def display_data(self):
        return {'dofn_value': 42}

      def process(self):
        pass

    now = datetime.now()
    _ = (pipeline
         | ptransform.Create([1, 2, 3, 4, 5])
         | 'Do' >> SpecialParDo(SpecialDoFn(), now))

    runner.job = apiclient.Job(pipeline.options)
    super(DataflowRunner, runner).run(pipeline)
    job_dict = json.loads(str(runner.job))

    def by_namespace_and_key(item):
      # Both lists are ordered the same way so they can be compared directly.
      return item['namespace']+item['key']

    steps_with_display_data = [
        candidate
        for candidate in job_dict['steps']
        if len(candidate['properties'].get('display_data', [])) > 0]
    first_step = steps_with_display_data[0]
    actual = sorted(first_step['properties']['display_data'],
                    key=by_namespace_and_key)

    module_prefix = SpecialParDo.__module__+ '.'
    pardo_namespace = module_prefix+'SpecialParDo'
    dofn_namespace = module_prefix+'SpecialDoFn'
    expected = sorted(
        [{'type': 'TIMESTAMP', 'namespace': pardo_namespace,
          'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
          'key': 'a_time'},
         {'type': 'STRING', 'namespace': pardo_namespace,
          'value': module_prefix+'SpecialParDo', 'key': 'a_class',
          'shortValue': 'SpecialParDo'},
         {'type': 'INTEGER', 'namespace': dofn_namespace,
          'value': 42, 'key': 'dofn_value'}],
        key=by_namespace_and_key)

    self.assertEqual(len(actual), 3)
    self.assertEqual(actual, expected)
開發者ID:jasonkuster,項目名稱:incubator-beam,代碼行數:52,代碼來源:runner_test.py

示例6: test_group_by_key_input_visitor_with_valid_inputs

 def test_group_by_key_input_visitor_with_valid_inputs(self):
   """Inputs typed None, Any or KV[Any, Any] must end up as KV[Any, Any]."""
   pipeline = TestPipeline()
   untyped_pcoll = PCollection(pipeline)
   any_pcoll = PCollection(pipeline)
   kv_pcoll = PCollection(pipeline)
   gbk_transforms = [_GroupByKeyOnly(), beam.GroupByKey()]
   for gbk in gbk_transforms:
     # Restore the starting element types before each transform is tried.
     untyped_pcoll.element_type = None
     any_pcoll.element_type = typehints.Any
     kv_pcoll.element_type = typehints.KV[typehints.Any, typehints.Any]
     for valid_input in (untyped_pcoll, any_pcoll, kv_pcoll):
       visitor = DataflowRunner.group_by_key_input_visitor()
       applied = AppliedPTransform(None, gbk, "label", [valid_input])
       visitor.visit_transform(applied)
       expected_type = typehints.KV[typehints.Any, typehints.Any]
       self.assertEqual(valid_input.element_type, expected_type)
開發者ID:aaltay,項目名稱:incubator-beam,代碼行數:14,代碼來源:dataflow_runner_test.py

示例7: test_group_by_key_input_visitor_with_invalid_inputs

 def test_group_by_key_input_visitor_with_invalid_inputs(self):
   """GBK inputs typed ``str`` or ``typehints.Set`` must raise ValueError."""
   pipeline = TestPipeline()
   str_pcoll = PCollection(pipeline)
   set_pcoll = PCollection(pipeline)
   gbk_transforms = [_GroupByKeyOnly(), beam.GroupByKey()]
   for gbk in gbk_transforms:
     # Re-assign the invalid element types before each transform is tried.
     str_pcoll.element_type = str
     set_pcoll.element_type = typehints.Set
     expected_error = (
         r"Input to 'label' must be compatible with KV\[Any, Any\]. "
         "Found .*")
     for bad_input in (str_pcoll, set_pcoll):
       with self.assertRaisesRegexp(ValueError, expected_error):
         visitor = DataflowRunner.group_by_key_input_visitor()
         visitor.visit_transform(
             AppliedPTransform(None, gbk, "label", [bad_input]))
開發者ID:charlesccychen,項目名稱:incubator-beam,代碼行數:14,代碼來源:dataflow_runner_test.py

示例8: _test_flatten_input_visitor

  def _test_flatten_input_visitor(self, input_type, output_type, num_inputs):
    """Run the flatten input visitor and check every input's element type.

    Builds ``num_inputs`` input PCollections and one output PCollection,
    wires them to a Flatten, visits it with
    ``DataflowRunner.flatten_input_visitor()`` and asserts that each input's
    element type equals ``output_type`` afterwards.

    Args:
      input_type: element type assigned to each input PCollection.
      output_type: element type assigned to the Flatten output PCollection,
        and the type every input is expected to have after the visit.
      num_inputs: number of input PCollections to create.
    """
    p = TestPipeline()
    inputs = []
    for _ in range(num_inputs):
      input_pcoll = PCollection(p)
      input_pcoll.element_type = input_type
      inputs.append(input_pcoll)
    output_pcoll = PCollection(p)
    output_pcoll.element_type = output_type

    flatten = AppliedPTransform(None, beam.Flatten(), "label", inputs)
    flatten.add_output(output_pcoll, None)
    DataflowRunner.flatten_input_visitor().visit_transform(flatten)
    # Verify each input individually; checking inputs[0] repeatedly would
    # leave every input after the first unverified.
    for input_pcoll in inputs:
      self.assertEqual(input_pcoll.element_type, output_type)
開發者ID:aaltay,項目名稱:incubator-beam,代碼行數:15,代碼來源:dataflow_runner_test.py

示例9: test_side_input_visitor

  def test_side_input_visitor(self):
    """Both side inputs must report the Dataflow multimap access pattern.

    A Map with an AsSingleton and an AsMultiMap side input is visited by
    ``DataflowRunner.side_input_visitor()``; afterwards the applied transform
    must have two side inputs whose access pattern is
    ``DATAFLOW_MULTIMAP_URN``.
    """
    pipeline = TestPipeline()
    empty_pc = pipeline | beam.Create([])

    singleton_view = beam.pvalue.AsSingleton(empty_pc)
    multimap_view = beam.pvalue.AsMultiMap(empty_pc)
    mapper = beam.Map(
        lambda x, y, z: (x, y, z), singleton_view, multimap_view)

    applied = AppliedPTransform(None, mapper, "label", [empty_pc])
    DataflowRunner.side_input_visitor().visit_transform(applied)

    self.assertEqual(2, len(applied.side_inputs))
    for view in applied.side_inputs:
      access_pattern = view._side_input_data().access_pattern
      self.assertEqual(
          dataflow_runner._DataflowSideInput.DATAFLOW_MULTIMAP_URN,
          access_pattern)
開發者ID:charlesccychen,項目名稱:incubator-beam,代碼行數:15,代碼來源:dataflow_runner_test.py

示例10: test_streaming_create_translation

  def test_streaming_create_translation(self):
    """Check how a streaming Create is translated into job steps.

    With ``--streaming`` set and the runner's PTransform overrides applied,
    the job must contain two steps: a ``ParallelRead`` whose
    ``pubsub_subscription`` is ``'_starting_signal/'``, followed by a
    ``ParallelDo``.
    """
    remote_runner = DataflowRunner()
    # Build a per-test argument list rather than appending to
    # self.default_properties in place: other tests read that same attribute,
    # and an in-place append would leak "--streaming" into them if the list
    # is shared between tests.
    streaming_properties = list(self.default_properties) + ["--streaming"]
    p = Pipeline(remote_runner, PipelineOptions(streaming_properties))
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
    remote_runner.job = apiclient.Job(p._options)
    # Performing configured PTransform overrides here.
    p.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)
    super(DataflowRunner, remote_runner).run(p)
    job_dict = json.loads(str(remote_runner.job))
    self.assertEqual(len(job_dict[u'steps']), 2)

    self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
    self.assertEqual(
        job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
        '_starting_signal/')
    self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
開發者ID:amarouni,項目名稱:incubator-beam,代碼行數:17,代碼來源:dataflow_runner_test.py

示例11: test_gbk_then_flatten_input_visitor

  def test_gbk_then_flatten_input_visitor(self):
    """Apply the GBK and flatten input visitors to Flatten -> GroupByKey.

    After both visitors run, the flattened PCollection's element type must be
    a TupleConstraint and must equal the element type of each Flatten input.
    """
    options = PipelineOptions(self.default_properties)
    pipeline = TestPipeline(runner=DataflowRunner(), options=options)
    str_values_pc = pipeline | 'c1' >> beam.Create({None: 'a'})
    int_values_pc = pipeline | 'c2' >> beam.Create({None: 3})
    flattened = (str_values_pc, int_values_pc) | beam.Flatten()
    _ = flattened | beam.GroupByKey()

    # Guard against a vacuous check below: before the visitors run, the
    # flattened type is not yet a tuple. Type inference changes may alter this.
    self.assertNotIsInstance(
        flattened.element_type, typehints.TupleConstraint)

    for visitor in (DataflowRunner.group_by_key_input_visitor(),
                    DataflowRunner.flatten_input_visitor()):
      pipeline.visit(visitor)

    # The dataflow runner needs GBK input to be tuples *and* flatten inputs
    # to match their outputs, so both properties are asserted.
    self.assertIsInstance(flattened.element_type, typehints.TupleConstraint)
    for flatten_input in (str_values_pc, int_values_pc):
      self.assertEqual(flattened.element_type, flatten_input.element_type)
開發者ID:aaltay,項目名稱:incubator-beam,代碼行數:21,代碼來源:dataflow_runner_test.py

示例12: _get_coder

 def _get_coder(self, pvalue, windowed=True):
   """Return the coder ``DataflowRunner._get_coder`` picks for ``pvalue``.

   Args:
     pvalue: value whose ``element_type`` (``typehints.Any`` when falsy) and,
       if ``windowed``, window coder are passed on.
     windowed: when false, ``None`` is passed in place of the window coder.
   """
   # TODO(robertwb): This should be an attribute of the pvalue itself.
   element_type = pvalue.element_type or typehints.Any
   if windowed:
     window_coder = pvalue.windowing.windowfn.get_window_coder()
   else:
     window_coder = None
   return DataflowRunner._get_coder(element_type, window_coder)
開發者ID:aaltay,項目名稱:incubator-beam,代碼行數:5,代碼來源:maptask_executor_runner.py

示例13: run_ParDo

  def run_ParDo(self, transform_node):
    """Append a WorkerDoFn operation for a ParDo node to a map task.

    The operation normally goes into the map task that produces the node's
    first input. If the map task producing any of the node's side inputs is
    that same map task, or transitively depends on it per
    ``self.dependencies``, the input is instead written to an in-memory
    buffer and a new map task is started that reads it back.

    Updates ``self.map_tasks``, ``self.outputs`` and ``self.dependencies``.

    Args:
      transform_node: the applied ParDo; this method reads its ``transform``,
        ``inputs``, ``outputs``, ``side_inputs`` and ``full_label``.
    """
    transform = transform_node.transform
    # The output stored under the None key; tagged outputs are handled below.
    output = transform_node.outputs[None]
    element_coder = self._get_coder(output)
    # self.outputs maps a pvalue to the (map task, operation, output) indices
    # of whatever produces it; look up the producer of the first input.
    map_task_index, producer_index, output_index = self.outputs[
        transform_node.inputs[0]]

    # If any of this ParDo's side inputs depend on outputs from this map_task,
    # we can't continue growing this map task.
    def is_reachable(leaf, root):
      """Return True if root is leaf or is reached via self.dependencies."""
      if leaf == root:
        return True
      else:
        return any(is_reachable(x, root) for x in self.dependencies[leaf])

    if any(is_reachable(self.outputs[side_input.pvalue][0], map_task_index)
           for side_input in transform_node.side_inputs):
      # Start a new map task.
      input_element_coder = self._get_coder(transform_node.inputs[0])

      output_buffer = OutputBuffer(input_element_coder)

      # End the current map task with a write of the ParDo's input into the
      # buffer.
      fusion_break_write = operation_specs.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          write_windowed_values=True,
          input=(producer_index, output_index),
          output_coders=[input_element_coder])
      self.map_tasks[map_task_index].append(
          (transform_node.full_label + '/Write', fusion_break_write))

      original_map_task_index = map_task_index
      # The new map task is appended just below, so its index is the current
      # length of self.map_tasks; its first operation (index 0) is the read.
      map_task_index, producer_index, output_index = len(self.map_tasks), 0, 0

      fusion_break_read = operation_specs.WorkerRead(
          output_buffer.source_bundle(),
          output_coders=[input_element_coder])
      self.map_tasks.append(
          [(transform_node.full_label + '/Read', fusion_break_read)])

      # The new map task reads what the original one wrote.
      self.dependencies[map_task_index].add(original_map_task_index)

    def create_side_read(side_input):
      """Write the side input via run_side_write and return a source for it."""
      label = self.side_input_labels[side_input]
      output_buffer = self.run_side_write(
          side_input.pvalue, '%s/%s' % (transform_node.full_label, label))
      return operation_specs.WorkerSideInputSource(
          output_buffer.source(), label)

    # Output tags are PropertyNames.OUT first, then one "<OUT>_<tag>" entry
    # per tag in transform.output_tags.
    do_op = operation_specs.WorkerDoFn(  #
        serialized_fn=pickler.dumps(DataflowRunner._pardo_fn_data(
            transform_node,
            lambda side_input: self.side_input_labels[side_input])),
        output_tags=[PropertyNames.OUT] + ['%s_%s' % (PropertyNames.OUT, tag)
                                           for tag in transform.output_tags
                                          ],
        # Same assumption that DataflowRunner has about coders being compatible
        # across outputs.
        output_coders=[element_coder] * (len(transform.output_tags) + 1),
        input=(producer_index, output_index),
        side_inputs=[create_side_read(side_input)
                     for side_input in transform_node.side_inputs])

    # Index do_op will occupy in the map task once it is appended below.
    producer_index = len(self.map_tasks[map_task_index])
    # Register the outputs: the None-keyed output at output index 0, then each
    # tagged output at ix + 1, matching the order of output_tags above.
    self.outputs[transform_node.outputs[None]] = (
        map_task_index, producer_index, 0)
    for ix, tag in enumerate(transform.output_tags):
      self.outputs[transform_node.outputs[
          tag]] = map_task_index, producer_index, ix + 1
    self.map_tasks[map_task_index].append((transform_node.full_label, do_op))

    # Record that this map task depends on every map task that produces one
    # of its side inputs.
    for side_input in transform_node.side_inputs:
      self.dependencies[map_task_index].add(self.outputs[side_input.pvalue][0])
開發者ID:aaltay,項目名稱:incubator-beam,代碼行數:72,代碼來源:maptask_executor_runner.py


注:本文中的apache_beam.runners.DataflowRunner類示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。