This article collects typical usage examples of the Python class apache_beam.test_pipeline.TestPipeline. If you are unsure what the TestPipeline class is for or how to use it, the curated class code examples below may help.
The following sections present 15 code examples of the TestPipeline class, ordered by popularity by default.
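Most of the snippets below are excerpted from Beam's own test modules and rely on a handful of module-level imports that are not repeated in each excerpt. A minimal sketch of the typical preamble is shown here; the exact module paths depend on the Beam release (in newer releases TestPipeline lives in apache_beam.testing.test_pipeline and assert_that/equal_to in apache_beam.testing.util):

import apache_beam as beam
from apache_beam.test_pipeline import TestPipeline
# In older releases assert_that/equal_to were exposed via the transforms
# utilities; adjust the import to match your installed Beam version.
from apache_beam.transforms.util import assert_that, equal_to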
Example 1: run_pipeline
def run_pipeline(self, count_implementation, factor=1):
  p = TestPipeline()
  words = p | beam.Create(['CAT', 'DOG', 'CAT', 'CAT', 'DOG'])
  result = words | count_implementation
  assert_that(
      result, equal_to([('CAT', (3 * factor)), ('DOG', (2 * factor))]))
  p.run()
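A hedged usage sketch for this helper: any counting transform can be passed as count_implementation; the built-in beam.combiners.Count.PerElement() matches the default factor=1 expectation, since the created words yield ('CAT', 3) and ('DOG', 2).

# Illustrative call from within the same test class; Count.PerElement()
# counts how many times each element occurs in the PCollection.
self.run_pipeline(beam.combiners.Count.PerElement())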
Example 2: test_dataflow_single_file
def test_dataflow_single_file(self):
  file_name, expected_data = write_data(5)
  assert len(expected_data) == 5
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(file_name)
  assert_that(pcoll, equal_to(expected_data))
  pipeline.run()
Example 3: model_multiple_pcollections_partition
def model_multiple_pcollections_partition(contents, output_path):
  """Splitting a PCollection with Partition."""
  some_hash_fn = lambda s: ord(s[0])

  def get_percentile(i):
    """Assume i in [0,100)."""
    return i

  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.

  students = p | beam.Create(contents)

  # [START model_multiple_pcollections_partition]
  def partition_fn(student, num_partitions):
    return int(get_percentile(student) * num_partitions / 100)

  by_decile = students | beam.Partition(partition_fn, 10)
  # [END model_multiple_pcollections_partition]
  # [START model_multiple_pcollections_partition_40th]
  fortieth_percentile = by_decile[4]
  # [END model_multiple_pcollections_partition_40th]

  ([by_decile[d] for d in xrange(10) if d != 4] + [fortieth_percentile]
   | beam.Flatten()
   | beam.io.WriteToText(output_path))
  p.run()
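To make the decile arithmetic concrete, a small worked sketch (assuming get_percentile simply returns the student's percentile, as the stub above does):

# A student at the 42nd percentile maps to int(42 * 10 / 100) == 4,
# so it lands in the by_decile[4] PCollection (the fifth decile).
assert int(42 * 10 / 100) == 4
# Percentiles 0-9 map to partition 0, 90-99 to partition 9.
assert int(7 * 10 / 100) == 0 and int(99 * 10 / 100) == 9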
Example 4: pipeline_logging
def pipeline_logging(lines, output):
  """Logging Pipeline Messages."""
  import re
  import apache_beam as beam

  # [START pipeline_logging]
  # import Python logging module.
  import logging

  class ExtractWordsFn(beam.DoFn):

    def process(self, element):
      words = re.findall(r'[A-Za-z\']+', element)
      for word in words:
        yield word

        if word.lower() == 'love':
          # Log using the root logger at info or higher levels
          logging.info('Found : %s', word.lower())

  # Remaining WordCount example code ...
  # [END pipeline_logging]

  p = TestPipeline()  # Use TestPipeline for testing.
  (p
   | beam.Create(lines)
   | beam.ParDo(ExtractWordsFn())
   | beam.io.WriteToText(output))
  p.run()
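The logging.info call goes through the standard Python root logger, which prints only WARNING and above by default, so the 'Found' messages are easy to miss when running locally. A minimal sketch of surfacing them (plain Python logging, not a Beam-specific API; the input lines and output path are illustrative):

import logging
logging.getLogger().setLevel(logging.INFO)  # show INFO records from the DoFn
pipeline_logging(['I love cats', 'dogs are fine'], '/tmp/words')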
Example 5: test_compute_top_sessions
def test_compute_top_sessions(self):
  p = TestPipeline()
  edits = p | beam.Create(self.EDITS)
  result = edits | top_wikipedia_sessions.ComputeTopSessions(1.0)
  beam.assert_that(result, beam.equal_to(self.EXPECTED))
  p.run()
Example 6: test_timestamped_with_combiners
def test_timestamped_with_combiners(self):
  p = TestPipeline()
  result = (p
            # Create some initial test values.
            | 'start' >> Create([(k, k) for k in range(10)])
            # The purpose of the WindowInto transform is to establish a
            # FixedWindows windowing function for the PCollection.
            # It does not bucket elements into windows since the timestamps
            # from Create are not spaced 5 ms apart and very likely they all
            # fall into the same window.
            | 'w' >> WindowInto(FixedWindows(5))
            # Generate timestamped values using the values as timestamps.
            # Now there are values 5 ms apart and since Map propagates the
            # windowing function from input to output the output PCollection
            # will have elements falling into different 5ms windows.
            | Map(lambda (x, t): TimestampedValue(x, t))
            # We add a 'key' to each value representing the index of the
            # window. This is important since there is no guarantee of
            # order for the elements of a PCollection.
            | Map(lambda v: (v / 5, v)))

  # Sum all elements associated with a key and window. Although it
  # is called CombinePerKey it is really CombinePerKeyAndWindow the
  # same way GroupByKey is really GroupByKeyAndWindow.
  sum_per_window = result | CombinePerKey(sum)
  # Compute mean per key and window.
  mean_per_window = result | combiners.Mean.PerKey()

  assert_that(sum_per_window, equal_to([(0, 10), (1, 35)]),
              label='assert:sum')
  assert_that(mean_per_window, equal_to([(0, 2.0), (1, 7.0)]),
              label='assert:mean')
  p.run()
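A quick sanity check of the asserted numbers, outside of Beam: the values 0 through 9 receive timestamps equal to themselves, so FixedWindows(5) places 0-4 in window [0, 5) under key 0 and 5-9 in window [5, 10) under key 1:

window_0 = [v for v in range(10) if v // 5 == 0]  # [0, 1, 2, 3, 4]
window_1 = [v for v in range(10) if v // 5 == 1]  # [5, 6, 7, 8, 9]
assert sum(window_0) == 10 and sum(window_1) == 35
assert sum(window_0) / 5.0 == 2.0 and sum(window_1) / 5.0 == 7.0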
Example 7: model_co_group_by_key_tuple
def model_co_group_by_key_tuple(email_list, phone_list, output_path):
  """Applying a CoGroupByKey Transform to a tuple."""
  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.

  # [START model_group_by_key_cogroupbykey_tuple]
  # Each data set is represented by key-value pairs in separate PCollections.
  # Both data sets share a common key type (in this example str).
  # The email_list contains values such as: ('joe', 'joe@example.com') with
  # multiple possible values for each key.
  # The phone_list contains values such as: ('mary': '111-222-3333') with
  # multiple possible values for each key.
  emails = p | 'email' >> beam.Create(email_list)
  phones = p | 'phone' >> beam.Create(phone_list)
  # The result PCollection contains one key-value element for each key in the
  # input PCollections. The key of the pair will be the key from the input and
  # the value will be a dictionary with two entries: 'emails' - an iterable of
  # all values for the current key in the emails PCollection and 'phones': an
  # iterable of all values for the current key in the phones PCollection.
  # For instance, if 'emails' contained ('joe', 'joe@example.com') and
  # ('joe', 'joe@example.org'), then 'result' will contain the element
  # ('joe', {'emails': ['joe@example.com', 'joe@example.org'], 'phones': ...})
  result = {'emails': emails, 'phones': phones} | beam.CoGroupByKey()

  def join_info((name, info)):
    return '; '.join(['%s' % name,
                      '%s' % ','.join(info['emails']),
                      '%s' % ','.join(info['phones'])])

  contact_lines = result | beam.Map(join_info)
  # [END model_group_by_key_cogroupbykey_tuple]
  contact_lines | beam.io.WriteToText(output_path)
  p.run()
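A hedged usage sketch with made-up contact data (the names, addresses, and output path are illustrative only):

email_list = [('joe', 'joe@example.com'), ('mary', 'mary@example.org')]
phone_list = [('mary', '111-222-3333')]
model_co_group_by_key_tuple(email_list, phone_list, '/tmp/contacts')
# The output file then contains lines such as:
#   mary; mary@example.org; 111-222-3333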
Example 8: test_bad_types
def test_bad_types(self):
  p = TestPipeline()
  evens = None  # pylint: disable=unused-variable

  # [START type_hints_missing_define_numbers]
  numbers = p | beam.Create(['1', '2', '3'])
  # [END type_hints_missing_define_numbers]

  # Consider the following code.
  # pylint: disable=expression-not-assigned
  # pylint: disable=unused-variable
  # [START type_hints_missing_apply]
  evens = numbers | beam.Filter(lambda x: x % 2 == 0)
  # [END type_hints_missing_apply]

  # Now suppose numbers was defined as [snippet above].
  # When running this pipeline, you'd get a runtime error,
  # possibly on a remote machine, possibly very late.
  with self.assertRaises(TypeError):
    p.run()

  # To catch this early, we can assert what types we expect.
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_takes]
    p.options.view_as(TypeOptions).pipeline_type_check = True
    evens = numbers | beam.Filter(lambda x: x % 2 == 0).with_input_types(int)
    # [END type_hints_takes]

  # Type hints can be declared on DoFns and callables as well, rather
  # than where they're used, to be more self contained.
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_do_fn]
    @beam.typehints.with_input_types(int)
    class FilterEvensDoFn(beam.DoFn):
      def process(self, element):
        if element % 2 == 0:
          yield element
    evens = numbers | beam.ParDo(FilterEvensDoFn())
    # [END type_hints_do_fn]

  words = p | 'words' >> beam.Create(['a', 'bb', 'c'])
  # One can assert outputs and apply them to transforms as well.
  # Helps document the contract and checks it at pipeline construction time.
  # [START type_hints_transform]
  T = beam.typehints.TypeVariable('T')

  @beam.typehints.with_input_types(T)
  @beam.typehints.with_output_types(beam.typehints.Tuple[int, T])
  class MyTransform(beam.PTransform):
    def expand(self, pcoll):
      return pcoll | beam.Map(lambda x: (len(x), x))

  words_with_lens = words | MyTransform()
  # [END type_hints_transform]

  # pylint: disable=expression-not-assigned
  with self.assertRaises(typehints.TypeCheckError):
    words_with_lens | beam.Map(lambda x: x).with_input_types(
        beam.typehints.Tuple[int, int])
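For contrast, a minimal sketch (my own variation, not part of the original test) of the same filter with consistent types, which type-checks cleanly at construction time:

p = TestPipeline()
numbers = p | beam.Create([1, 2, 3])  # ints this time, not strings
evens = numbers | beam.Filter(lambda x: x % 2 == 0).with_input_types(int)
assert_that(evens, equal_to([2]))
p.run()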
Example 9: model_multiple_pcollections_flatten
def model_multiple_pcollections_flatten(contents, output_path):
  """Merging a PCollection with Flatten."""
  some_hash_fn = lambda s: ord(s[0])
  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.
  partition_fn = lambda element, partitions: some_hash_fn(element) % partitions

  # Partition into deciles
  partitioned = p | beam.Create(contents) | beam.Partition(partition_fn, 3)
  pcoll1 = partitioned[0]
  pcoll2 = partitioned[1]
  pcoll3 = partitioned[2]

  # Flatten them back into 1

  # A collection of PCollection objects can be represented simply
  # as a tuple (or list) of PCollections.
  # (The SDK for Python has no separate type to store multiple
  # PCollection objects, whether containing the same or different
  # types.)
  # [START model_multiple_pcollections_flatten]
  merged = (
      # [START model_multiple_pcollections_tuple]
      (pcoll1, pcoll2, pcoll3)
      # [END model_multiple_pcollections_tuple]
      # A list of tuples can be "piped" directly into a Flatten transform.
      | beam.Flatten())
  # [END model_multiple_pcollections_flatten]
  merged | beam.io.WriteToText(output_path)
  p.run()
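To see where elements end up, a small worked sketch of partition_fn, which is simply ord of the first character modulo the number of partitions (three here, despite the "deciles" wording in the comment):

# 'apple'  -> ord('a') == 97 -> 97 % 3 == 1 -> partitioned[1]
# 'cherry' -> ord('c') == 99 -> 99 % 3 == 0 -> partitioned[0]
assert ord('a') % 3 == 1 and ord('c') % 3 == 0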
Example 10: test_run_direct
def test_run_direct(self):
  file_name = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')
  pipeline = TestPipeline()
  pcoll = pipeline | beam.io.Read(LineSource(file_name))
  assert_that(pcoll, equal_to(['aaaa', 'bbbb', 'cccc', 'dddd']))
  pipeline.run()
Example 11: test_dataflow_file_pattern
def test_dataflow_file_pattern(self):
  pattern, expected_data = write_pattern([5, 3, 12, 8, 8, 4])
  assert len(expected_data) == 40
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(pattern)
  assert_that(pcoll, equal_to(expected_data))
  pipeline.run()
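A quick arithmetic check of the asserted length, assuming write_pattern creates one temporary file per entry with that many lines:

assert 5 + 3 + 12 + 8 + 8 + 4 == 40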
Example 12: model_composite_transform_example
def model_composite_transform_example(contents, output_path):
  """Example of a composite transform.

  To declare a composite transform, define a subclass of PTransform.

  To define the transform's behavior, override the expand method, which
  takes a PCollection as its only parameter and returns a PCollection.
  """
  import re
  import apache_beam as beam

  # [START composite_transform_example]
  # [START composite_ptransform_apply_method]
  # [START composite_ptransform_declare]
  class CountWords(beam.PTransform):
    # [END composite_ptransform_declare]

    def expand(self, pcoll):
      return (pcoll
              | beam.FlatMap(lambda x: re.findall(r'\w+', x))
              | beam.combiners.Count.PerElement()
              | beam.Map(lambda (word, c): '%s: %s' % (word, c)))
  # [END composite_ptransform_apply_method]
  # [END composite_transform_example]

  p = TestPipeline()  # Use TestPipeline for testing.
  (p
   | beam.Create(contents)
   | CountWords()
   | beam.io.WriteToText(output_path))
  p.run()
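A hedged usage sketch of the composite transform (the input line and output path are illustrative):

model_composite_transform_example(['to be or not to be'], '/tmp/word_counts')
# The output file then contains lines such as 'to: 2' and 'be: 2'.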
Example 13: test_runtime_checks_on
def test_runtime_checks_on(self):
  # pylint: disable=expression-not-assigned
  p = TestPipeline()
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_runtime_on]
    p.options.view_as(TypeOptions).runtime_type_check = True
    p | beam.Create(['a']) | beam.Map(lambda x: 3).with_output_types(str)
    p.run()
Example 14: test_basics
def test_basics(self):
  p = TestPipeline()
  result = p | 'Estimate' >> estimate_pi.EstimatePiTransform(5000)

  # Note: Probabilistically speaking this test can fail with a probability
  # that is very small (VERY) given that we run at least 500 thousand trials.
  assert_that(result, in_between(3.125, 3.155))
  p.run()
Example 15: test_timestamp_param
def test_timestamp_param(self):
  class TestDoFn(DoFn):
    def process(self, element, timestamp=DoFn.TimestampParam):
      yield timestamp

  pipeline = TestPipeline()
  pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
  assert_that(pcoll, equal_to([MIN_TIMESTAMP, MIN_TIMESTAMP]))
  pipeline.run()
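The two MIN_TIMESTAMP values appear because Create assigns no timestamps of its own, so every element carries the minimum representable timestamp until a transform assigns one (as the TimestampedValue pattern in Example 6 does). A tiny sketch of the constant being compared against (import path assumed to be apache_beam.utils.timestamp):

from apache_beam.utils.timestamp import MIN_TIMESTAMP, Timestamp
# MIN_TIMESTAMP is simply the smallest Timestamp Beam can represent.
assert MIN_TIMESTAMP < Timestamp(0)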