This article collects typical usage examples of the apache_beam.Create method in Python. If you are unsure what apache_beam.Create does or how to use it, the curated code examples below may help. You can also explore further usage examples of the apache_beam module in which this method is defined.
The following presents 15 code examples of the apache_beam.Create method, ordered by popularity by default.
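Before the examples, a minimal self-contained sketch may help orient readers: beam.Create turns an in-memory Python iterable into a PCollection that downstream transforms can consume. The element values and transform labels below are illustrative placeholders, not taken from the examples that follow.

import apache_beam as beam

# Build a PCollection from an in-memory list, then transform and print it.
with beam.Pipeline() as pipeline:
  _ = (pipeline
       | 'CreateNumbers' >> beam.Create([1, 2, 3])
       | 'Square' >> beam.Map(lambda x: x * x)
       | 'Print' >> beam.Map(print))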
Example 1: test_pipeline_read_all_file_pattern
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def test_pipeline_read_all_file_pattern(self):
  with temp_dir.TempDir() as tempdir:
    headers_1 = [self.lines[1], self.lines[-1]]
    headers_2 = [self.lines[2], self.lines[3], self.lines[-1]]
    headers_3 = [self.lines[4], self.lines[-1]]

    file_name_1 = tempdir.create_temp_file(suffix='.vcf', lines=headers_1)
    file_name_2 = tempdir.create_temp_file(suffix='.vcf', lines=headers_2)
    file_name_3 = tempdir.create_temp_file(suffix='.vcf', lines=headers_3)

    pipeline = TestPipeline()
    pcoll = (pipeline
             | 'Create' >> beam.Create(
                 [os.path.join(tempdir.get_path(), '*.vcf')])
             | 'ReadHeaders' >> ReadAllVcfHeaders())

    expected = [_get_vcf_header_from_lines(h, file_name=file_name)
                for h, file_name in [(headers_1, file_name_1),
                                     (headers_2, file_name_2),
                                     (headers_3, file_name_3)]]
    assert_that(pcoll, asserts.header_vars_equal(expected))
    pipeline.run()
Example 2: _assert_pipeline_read_files_record_count_equal
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def _assert_pipeline_read_files_record_count_equal(
    self, input_pattern, expected_count, use_read_all=False):
  """Helper method for verifying total records read.

  Args:
    input_pattern (str): Input file pattern to read.
    expected_count (int): Expected number of records that were read.
    use_read_all (bool): Whether to use the scalable ReadAllFromVcf transform
      instead of ReadFromVcf.
  """
  pipeline = TestPipeline()
  if use_read_all:
    pcoll = (pipeline
             | 'Create' >> beam.Create([input_pattern])
             | 'Read' >> ReadAllFromVcf())
  else:
    pcoll = pipeline | 'Read' >> ReadFromVcf(input_pattern)
  assert_that(pcoll, asserts.count_equals_to(expected_count))
  pipeline.run()
Example 3: test_pipeline_read_all_file_pattern
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def test_pipeline_read_all_file_pattern(self):
  with temp_dir.TempDir() as tempdir:
    lines_1 = self.headers[1:2] + self.headers[-1:] + self.records[:2]
    lines_2 = self.headers[2:4] + self.headers[-1:] + self.records[2:4]
    lines_3 = self.headers[4:5] + self.headers[-1:] + self.records[4:]

    file_name_1 = tempdir.create_temp_file(suffix='.vcf', lines=lines_1)
    file_name_2 = tempdir.create_temp_file(suffix='.vcf', lines=lines_2)
    file_name_3 = tempdir.create_temp_file(suffix='.vcf', lines=lines_3)

    pipeline = TestPipeline()
    pcoll = (pipeline
             | 'Create' >> beam.Create(
                 [os.path.join(tempdir.get_path(), '*.vcf')])
             | 'GetAllEstimates' >> GetAllEstimates())

    expected = [_get_estimate_from_lines(lines, file_name=file_name)
                for lines, file_name in [(lines_1, file_name_1),
                                         (lines_2, file_name_2),
                                         (lines_3, file_name_3)]]
    assert_that(pcoll, asserts.header_vars_equal(expected))
    pipeline.run()
Example 4: read_headers
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def read_headers(
    pipeline,  # type: beam.Pipeline
    pipeline_mode,  # type: int
    all_patterns  # type: List[str]
    ):
  # type: (...) -> pvalue.PCollection
  """Creates an initial PCollection by reading the VCF file headers."""
  compression_type = get_compression_type(all_patterns)
  if pipeline_mode == PipelineModes.LARGE:
    headers = (pipeline
               | beam.Create(all_patterns)
               | vcf_header_io.ReadAllVcfHeaders(
                   compression_type=compression_type))
  else:
    headers = pipeline | vcf_header_io.ReadVcfHeaders(
        all_patterns[0],
        compression_type=compression_type)
  return headers
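A rough usage sketch for a helper like read_headers is shown below; the file pattern is a placeholder, and it assumes read_headers and PipelineModes are importable from the surrounding project, so treat it as an illustration rather than the project's actual invocation.

import apache_beam as beam

with beam.Pipeline() as p:
  # Hypothetical call; the GCS pattern below is a placeholder.
  headers = read_headers(
      p,
      PipelineModes.LARGE,
      ['gs://my-bucket/vcfs/*.vcf'])
  # 'headers' is a PCollection of VCF headers for downstream merging.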
Example 5: add_annotation_headers
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def add_annotation_headers(pipeline, known_args, pipeline_mode,
                           merged_header,
                           annotated_vcf_pattern):
  if pipeline_mode == PipelineModes.LARGE:
    annotation_headers = (pipeline
                          | 'ReadAnnotatedVCF'
                          >> beam.Create([annotated_vcf_pattern])
                          | 'ReadHeaders' >> vcf_header_io.ReadAllVcfHeaders())
  else:
    annotation_headers = (
        pipeline
        | 'ReadHeaders'
        >> vcf_header_io.ReadVcfHeaders(annotated_vcf_pattern))
  merged_header = (
      (merged_header, annotation_headers)
      | beam.Flatten()
      | 'MergeWithOriginalHeaders' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header
Example 6: test_stats_pipeline_with_zero_examples
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def test_stats_pipeline_with_zero_examples(self):
  expected_result = text_format.Parse(
      """
      datasets {
        num_examples: 0
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())
  with beam.Pipeline() as p:
    options = stats_options.StatsOptions(
        num_top_values=1,
        num_rank_histogram_buckets=1,
        num_values_histogram_buckets=2,
        num_histogram_buckets=1,
        num_quantiles_histogram_buckets=1,
        epsilon=0.001)
    result = (p | beam.Create([]) | stats_api.GenerateStatistics(options))
    util.assert_that(
        result,
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_result))
Example 7: test_stats_pipeline_with_sample_rate
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def test_stats_pipeline_with_sample_rate(self):
  record_batches = [
      pa.RecordBatch.from_arrays(
          [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c']),
  ]
  with beam.Pipeline() as p:
    options = stats_options.StatsOptions(
        sample_rate=1.0,
        num_top_values=2,
        num_rank_histogram_buckets=2,
        num_values_histogram_buckets=2,
        num_histogram_buckets=2,
        num_quantiles_histogram_buckets=2,
        epsilon=0.001)
    result = (
        p | beam.Create(record_batches)
        | stats_api.GenerateStatistics(options))
    util.assert_that(
        result,
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, self._sampling_test_expected_result))
Example 8: test_write_stats_to_text
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def test_write_stats_to_text(self):
  stats = text_format.Parse(
      """
      datasets {
        name: 'x'
        num_examples: 100
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())
  output_path = os.path.join(self._get_temp_dir(), 'stats')
  with beam.Pipeline() as p:
    _ = (p | beam.Create([stats]) | stats_api.WriteStatisticsToText(
        output_path))
  stats_from_file = statistics_pb2.DatasetFeatureStatisticsList()
  serialized_stats = io_util.read_file_to_string(
      output_path, binary_mode=True)
  stats_from_file.ParseFromString(serialized_stats)
  self.assertLen(stats_from_file.datasets, 1)
  test_util.assert_dataset_feature_stats_proto_equal(
      self, stats_from_file.datasets[0], stats.datasets[0])
Example 9: test_stats_impl
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def test_stats_impl(self,
                    record_batches,
                    options,
                    expected_result_proto_text,
                    schema=None):
  expected_result = text_format.Parse(
      expected_result_proto_text,
      statistics_pb2.DatasetFeatureStatisticsList())
  if schema is not None:
    options.schema = schema
  with beam.Pipeline() as p:
    result = (
        p | beam.Create(record_batches, reshuffle=False)
        | stats_impl.GenerateStatisticsImpl(options))
    util.assert_that(
        result,
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_result))
Example 10: test_csv_decoder
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def test_csv_decoder(self,
                     input_lines,
                     expected_result,
                     column_names,
                     delimiter=',',
                     skip_blank_lines=True,
                     schema=None,
                     multivalent_columns=None,
                     secondary_delimiter=None):
  with beam.Pipeline() as p:
    result = (
        p | beam.Create(input_lines, reshuffle=False)
        | csv_decoder.DecodeCSV(
            column_names=column_names,
            delimiter=delimiter,
            skip_blank_lines=skip_blank_lines,
            schema=schema,
            multivalent_columns=multivalent_columns,
            secondary_delimiter=secondary_delimiter))
    util.assert_that(
        result,
        test_util.make_arrow_record_batches_equal_fn(self, expected_result))
Example 11: testHandleBatchError
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def testHandleBatchError(self):
  if self._UseTFXIO():
    return

  def preprocessing_fn(inputs):
    return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

  metadata = tft_unit.metadata_from_feature_spec({
      'x': tf.io.FixedLenFeature([], tf.float32),
  })
  pipeline = self._makeTestPipeline()
  input_data = pipeline | 'CreateTrainingData' >> beam.Create([{
      'x': 1
  }, {
      'x': [4, 1]
  }])
  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    _ = ((input_data, metadata)
         | 'AnalyzeDataset' >> beam_impl.AnalyzeDataset(preprocessing_fn))
  # Exception type depends on the runner being used.
  with self.assertRaisesRegexp(
      (RuntimeError, ValueError),
      'An error occured while trying to apply the transformation:'):
    pipeline.run()
Example 12: _clear_shared_state_after_barrier
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def _clear_shared_state_after_barrier(pipeline, input_barrier):
  """Clears any shared state from within a pipeline context.

  This will only be cleared once input_barrier becomes available.

  Args:
    pipeline: A `beam.Pipeline` object.
    input_barrier: A `PCollection` which the pipeline should wait for.

  Returns:
    An empty `PCollection`.
  """
  empty_pcoll = input_barrier | 'MakeCheapBarrier' >> beam.FlatMap(
      lambda x: None)
  return (pipeline
          | 'PrepareToClearSharedKeepAlives' >> beam.Create([None])
          | 'WaitAndClearSharedKeepAlives' >> beam.Map(
              lambda x, empty_side_input: shared.Shared().acquire(lambda: None),
              beam.pvalue.AsIter(empty_pcoll)))
Example 13: testWriteTransformFnIsIdempotent
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def testWriteTransformFnIsIdempotent(self):
  transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

  def mock_write_metadata_expand(unused_self, unused_metadata):
    raise ArithmeticError('Some error')

  with beam.Pipeline() as pipeline:
    # Create an empty directory for the source saved model dir.
    saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
    saved_model_dir_pcoll = (
        pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))

    with mock.patch.object(transform_fn_io.beam_metadata_io.WriteMetadata,
                           'expand', mock_write_metadata_expand):
      with self.assertRaisesRegexp(ArithmeticError, 'Some error'):
        _ = ((saved_model_dir_pcoll, object())
             | transform_fn_io.WriteTransformFn(transform_output_dir))

  self.assertFalse(file_io.file_exists(transform_output_dir))
Example 14: testTwoLangs
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def testTwoLangs(self):
  with TestPipeline() as p:
    tokens = p | 'CreateInput' >> beam.Create(self.sample_input)
    result = tokens | beam.ParDo(utils.CompileTokenizationInfo())
    assert_that(result, equal_to([{
        'lang': 'en',
        'count': 1,
        'num_preserved_chars': 13,
        'num_dropped_chars': 2,
        'num_non_unk_wordpieces': 4,
        'preserved_ratio': [13/4],
        'dropped_ratio': [2/15],
        'wordpieces': collections.Counter(['the', 'app', '##le', 'sauce'])
    }, {
        'lang': 'fr',
        'count': 1,
        'num_preserved_chars': 14,
        'num_dropped_chars': 0,
        'num_non_unk_wordpieces': 5,
        'preserved_ratio': [14/5],
        'dropped_ratio': [0],
        'wordpieces': collections.Counter(['bon', '##jour', 'bon', '##soir'])
    }]))
Example 15: _create_row
# Required import: import apache_beam [as alias]
# Or: from apache_beam import Create [as alias]
def _create_row(stats, now, extra_columns=tuple()):
  """Create a BigQuery row from the given stats."""
  row = {'true_positives': stats.true_positives,
         'false_positives': stats.false_positives,
         'false_negatives': stats.false_negatives}
  if not math.isnan(stats.precision):
    row['precision'] = stats.precision
  if not math.isnan(stats.recall):
    row['recall'] = stats.recall
  if not math.isnan(stats.f_score):
    row['f_score'] = stats.f_score
  row['timestamp'] = now
  for column_name, val in extra_columns:
    row[column_name] = val
  return row