本文整理汇总了Python中apache_beam.Pipeline方法的典型用法代码示例。如果您正苦于以下问题：Python apache_beam.Pipeline方法的具体用法？Python apache_beam.Pipeline怎么用？Python apache_beam.Pipeline使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块apache_beam的用法示例。
在下文中一共展示了apache_beam.Pipeline方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: read_headers
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import Pipeline [as 别名]
def read_headers(
        pipeline,  # type: beam.Pipeline
        pipeline_mode,  # type: int
        all_patterns  # type: List[str]
        ):
    # type: (...) -> pvalue.PCollection
    """Builds the starting PCollection from the headers of the VCF inputs.

    When ``pipeline_mode`` equals ``PipelineModes.LARGE`` every entry of
    ``all_patterns`` is materialized with ``beam.Create`` and piped into
    ``vcf_header_io.ReadAllVcfHeaders``. In any other mode only the first
    pattern is handed to ``vcf_header_io.ReadVcfHeaders``.

    Args:
      pipeline: The pipeline the read transforms are applied to.
      pipeline_mode: Mode compared against ``PipelineModes.LARGE``.
      all_patterns: Input file patterns; also used to derive the compression
        type via ``get_compression_type``.

    Returns:
      The PCollection produced by the selected header-reading transform.
    """
    compression = get_compression_type(all_patterns)

    # Every mode other than LARGE reads headers from the first pattern only.
    if pipeline_mode != PipelineModes.LARGE:
        return pipeline | vcf_header_io.ReadVcfHeaders(
            all_patterns[0],
            compression_type=compression)

    patterns = pipeline | beam.Create(all_patterns)
    return patterns | vcf_header_io.ReadAllVcfHeaders(
        compression_type=compression)
示例2: _read_variants
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import Pipeline [as 别名]
def _read_variants(all_patterns,  # type: List[str]
                   pipeline,  # type: beam.Pipeline
                   known_args,  # type: argparse.Namespace
                   pipeline_mode,  # type: int
                   pre_infer_headers=False,  # type: bool
                   keep_raw_sample_names=False  # type: bool
                   ):
    # type: (...) -> pvalue.PCollection
    """Delegates to ``pipeline_common.read_variants`` using parsed flags.

    Args:
      all_patterns: Input file patterns forwarded to the reader.
      pipeline: The pipeline forwarded to the reader.
      known_args: Parsed arguments. The attributes read here are
        ``representative_header_file``, ``allow_malformed_records``,
        ``sample_name_encoding`` and ``use_1_based_coordinate``.
      pipeline_mode: Pipeline mode forwarded to the reader.
      pre_infer_headers: Forwarded unchanged to the reader.
      keep_raw_sample_names: When truthy, ``SampleNameEncoding.NONE`` is used
        instead of the encoding named by ``known_args.sample_name_encoding``.

    Returns:
      Whatever ``pipeline_common.read_variants`` returns.
    """
    # Header lines are only loaded when a representative header file is set.
    header_lines = None
    if known_args.representative_header_file:
        header_lines = vcf_header_parser.get_metadata_header_lines(
            known_args.representative_header_file)

    allow_malformed = known_args.allow_malformed_records

    if keep_raw_sample_names:
        encoding = SampleNameEncoding.NONE
    else:
        encoding = SampleNameEncoding[known_args.sample_name_encoding]

    return pipeline_common.read_variants(
        pipeline,
        all_patterns,
        pipeline_mode,
        allow_malformed,
        header_lines,
        pre_infer_headers=pre_infer_headers,
        sample_name_encoding=encoding,
        use_1_based_coordinate=known_args.use_1_based_coordinate)
示例3: GetPipelineRoot
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import Pipeline [as 别名]
def GetPipelineRoot(options=None):
    """Construct a ``beam.Pipeline`` to use as the root for transforms.

    The returned object is typically used as a context manager::

        with GetPipelineRoot() as root:
            _ = (root | beam.ParDo() | ...)

    In that form the pipeline is executed automatically when the ``with``
    block is exited; the pipeline built from the root object can also be run
    manually.

    Args:
      options: A beam.options.pipeline_options.PipelineOptions object, passed
        through as the ``options`` keyword. Defaults to None.

    Returns:
      A beam.Pipeline root object.
    """
    pipeline_root = beam.Pipeline(options=options)
    return pipeline_root
示例4: test_stats_pipeline_with_zero_examples
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import Pipeline [as 别名]
def test_stats_pipeline_with_zero_examples(self):
    """Runs GenerateStatistics on an empty input and checks the result.

    The expected proto contains a single dataset with ``num_examples: 0``.
    """
    empty_dataset_stats = text_format.Parse(
        """
datasets {
num_examples: 0
}
""", statistics_pb2.DatasetFeatureStatisticsList())
    stats_opts = stats_options.StatsOptions(
        num_top_values=1,
        num_rank_histogram_buckets=1,
        num_values_histogram_buckets=2,
        num_histogram_buckets=1,
        num_quantiles_histogram_buckets=1,
        epsilon=0.001)

    with beam.Pipeline() as root:
        no_examples = root | beam.Create([])
        generated = no_examples | stats_api.GenerateStatistics(stats_opts)
        # The assertion is attached as a pipeline step, so it has to be
        # registered before the `with` block exits.
        util.assert_that(
            generated,
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, empty_dataset_stats))
示例5: test_stats_pipeline_with_sample_rate
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import Pipeline [as 别名]
def test_stats_pipeline_with_sample_rate(self):
    """Runs GenerateStatistics with ``sample_rate=1.0`` and checks the result.

    The input is one record batch with a single column ``'c'`` built from
    ``np.linspace(1, 3000, 3000, dtype=np.int32)`` wrapped in a one-element
    list. The output is compared to ``self._sampling_test_expected_result``.
    """
    column_values = np.linspace(1, 3000, 3000, dtype=np.int32)
    batches = [
        pa.RecordBatch.from_arrays([pa.array([column_values])], ['c']),
    ]
    stats_opts = stats_options.StatsOptions(
        sample_rate=1.0,
        num_top_values=2,
        num_rank_histogram_buckets=2,
        num_values_histogram_buckets=2,
        num_histogram_buckets=2,
        num_quantiles_histogram_buckets=2,
        epsilon=0.001)

    with beam.Pipeline() as root:
        source = root | beam.Create(batches)
        generated = source | stats_api.GenerateStatistics(stats_opts)
        util.assert_that(
            generated,
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, self._sampling_test_expected_result))
示例6: test_write_stats_to_text
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import Pipeline [as 别名]
def test_write_stats_to_text(self):
    """Writes a stats proto through a pipeline and reads it back.

    The file written by ``WriteStatisticsToText`` is read in binary mode,
    parsed with ``ParseFromString`` and compared to the proto that was
    written.
    """
    written_stats = text_format.Parse(
        """
datasets {
name: 'x'
num_examples: 100
}
""", statistics_pb2.DatasetFeatureStatisticsList())
    destination = os.path.join(self._get_temp_dir(), 'stats')

    with beam.Pipeline() as root:
        source = root | beam.Create([written_stats])
        _ = source | stats_api.WriteStatisticsToText(destination)

    # The file is read only after the `with` block above has exited.
    raw_bytes = io_util.read_file_to_string(destination, binary_mode=True)
    loaded_stats = statistics_pb2.DatasetFeatureStatisticsList()
    loaded_stats.ParseFromString(raw_bytes)

    self.assertLen(loaded_stats.datasets, 1)
    test_util.assert_dataset_feature_stats_proto_equal(
        self, loaded_stats.datasets[0], written_stats.datasets[0])
示例7: test_stats_impl
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import Pipeline [as 别名]
def test_stats_impl(self,
                    record_batches,
                    options,
                    expected_result_proto_text,
                    schema=None):
    """Runs GenerateStatisticsImpl and compares against an expected proto.

    Args:
      record_batches: Elements passed to ``beam.Create`` as pipeline input.
      options: Options object passed to ``GenerateStatisticsImpl``. Its
        ``schema`` attribute is overwritten when ``schema`` is given.
      expected_result_proto_text: Text-format proto parsed into a
        ``DatasetFeatureStatisticsList`` and used as the expected result.
      schema: Optional schema to assign to ``options.schema``.
    """
    expected_stats = text_format.Parse(
        expected_result_proto_text,
        statistics_pb2.DatasetFeatureStatisticsList())

    # NOTE: this mutates the caller's options object in place.
    if schema is not None:
        options.schema = schema

    with beam.Pipeline() as root:
        source = root | beam.Create(record_batches, reshuffle=False)
        generated = source | stats_impl.GenerateStatisticsImpl(options)
        util.assert_that(
            generated,
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, expected_stats))
示例8: test_csv_decoder
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import Pipeline [as 别名]
def test_csv_decoder(self,
                     input_lines,
                     expected_result,
                     column_names,
                     delimiter=',',
                     skip_blank_lines=True,
                     schema=None,
                     multivalent_columns=None,
                     secondary_delimiter=None):
    """Decodes CSV lines in a pipeline and checks the resulting batches.

    Args:
      input_lines: Elements passed to ``beam.Create`` as pipeline input.
      expected_result: Expected value handed to
        ``test_util.make_arrow_record_batches_equal_fn``.
      column_names: Forwarded to ``csv_decoder.DecodeCSV``.
      delimiter: Forwarded to ``csv_decoder.DecodeCSV``.
      skip_blank_lines: Forwarded to ``csv_decoder.DecodeCSV``.
      schema: Forwarded to ``csv_decoder.DecodeCSV``.
      multivalent_columns: Forwarded to ``csv_decoder.DecodeCSV``.
      secondary_delimiter: Forwarded to ``csv_decoder.DecodeCSV``.
    """
    decoder_kwargs = dict(
        column_names=column_names,
        delimiter=delimiter,
        skip_blank_lines=skip_blank_lines,
        schema=schema,
        multivalent_columns=multivalent_columns,
        secondary_delimiter=secondary_delimiter)

    with beam.Pipeline() as root:
        lines = root | beam.Create(input_lines, reshuffle=False)
        decoded = lines | csv_decoder.DecodeCSV(**decoder_kwargs)
        util.assert_that(
            decoded,
            test_util.make_arrow_record_batches_equal_fn(
                self, expected_result))
示例9: preprocess
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import Pipeline [as 别名]
def preprocess(train_dataset, output_dir, eval_dataset, checkpoint):
    """Set up a local (DirectRunner) preprocessing pipeline as a job.

    Args:
      train_dataset: Forwarded to ``_preprocess.configure_pipeline``.
      output_dir: Forwarded to ``_preprocess.configure_pipeline``.
      eval_dataset: Forwarded to ``_preprocess.configure_pipeline``.
      checkpoint: Forwarded to ``_preprocess.configure_pipeline``; when None,
        ``_util._DEFAULT_CHECKPOINT_GSURL`` is used instead.

    Returns:
      A ``LambdaJob`` built from a callable that runs the pipeline and waits
      until it finishes, together with the generated job id.
    """
    import apache_beam as beam
    from google.datalab.utils import LambdaJob
    from . import _preprocess

    if checkpoint is None:
        checkpoint = _util._DEFAULT_CHECKPOINT_GSURL

    timestamp = datetime.datetime.now().strftime('%y%m%d-%H%M%S')
    job_id = 'preprocess-image-classification-' + timestamp

    # Project is needed for bigquery data source, even in local run.
    pipeline_opts = beam.pipeline.PipelineOptions(
        flags=[], project=_util.default_project())
    local_pipeline = beam.Pipeline('DirectRunner', options=pipeline_opts)
    _preprocess.configure_pipeline(
        local_pipeline, train_dataset, eval_dataset, checkpoint, output_dir,
        job_id)

    return LambdaJob(
        lambda: local_pipeline.run().wait_until_finish(), job_id)
示例10: batch_predict
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import Pipeline [as 别名]
def batch_predict(dataset, model_dir, output_csv, output_bq_table):
    """Set up a local (DirectRunner) batch prediction pipeline as a job.

    Args:
      dataset: Forwarded to ``_predictor.configure_pipeline``.
      model_dir: Forwarded to ``_predictor.configure_pipeline``.
      output_csv: Forwarded to ``_predictor.configure_pipeline``.
      output_bq_table: Forwarded to ``_predictor.configure_pipeline``.

    Returns:
      A ``LambdaJob`` built from a callable that runs the pipeline and waits
      until it finishes, together with the generated job id.

    Raises:
      ValueError: If both ``output_csv`` and ``output_bq_table`` are None.
    """
    import apache_beam as beam
    from google.datalab.utils import LambdaJob
    from . import _predictor

    # At least one output destination has to be supplied.
    no_destination = output_csv is None and output_bq_table is None
    if no_destination:
        raise ValueError('output_csv and output_bq_table cannot both be None.')

    timestamp = datetime.datetime.now().strftime('%y%m%d-%H%M%S')
    job_id = 'batch-predict-image-classification-' + timestamp

    # Project is needed for bigquery data source, even in local run.
    pipeline_opts = beam.pipeline.PipelineOptions(
        flags=[], project=_util.default_project())
    local_pipeline = beam.Pipeline('DirectRunner', options=pipeline_opts)
    _predictor.configure_pipeline(
        local_pipeline, dataset, model_dir, output_csv, output_bq_table)

    return LambdaJob(
        lambda: local_pipeline.run().wait_until_finish(), job_id)
示例11: create_glyphazzn_dataset
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import Pipeline [as 别名]
def create_glyphazzn_dataset(filepattern, output_path):
    """Returns a pipeline-building function for the glyphazzn dataset.

    Nothing is read or written when this function is called; the work is
    described by the returned callable, which expects a pipeline root.

    Args:
      filepattern: File pattern passed to ``ReadFromParquet``.
      output_path: Output location passed to ``WriteToTFRecord``.

    Returns:
      A one-argument function that attaches the read/filter/convert/write
      steps to the root it is given and returns None.
    """
    def pipeline(root):
        """Attaches the Parquet-to-TFRecord steps to ``root``."""
        columns = ['uni', 'width', 'vwidth', 'sfd', 'id', 'binary_fp']
        _ = (
            root
            | 'Read' >> beam.io.parquetio.ReadFromParquet(
                file_pattern=filepattern, columns=columns)
            | 'FilterBadIcons' >> beam.Filter(_is_valid_glyph)
            | 'ConvertToPath' >> beam.Map(_convert_to_path)
            | 'FilterBadPathLenghts' >> beam.Filter(_is_valid_path)
            | 'ProcessAndConvert' >> beam.Map(_create_example)
            | 'WriteToTFRecord' >> beam.io.tfrecordio.WriteToTFRecord(
                output_path, num_shards=90))

    return pipeline
示例12: convert_to_tfxio_api_inputs
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import Pipeline [as 别名]
def convert_to_tfxio_api_inputs(
        self, legacy_input_data, legacy_input_metadata, label='input_data'):
    """Adapts legacy TFT API inputs into TFXIO-based inputs.

    Args:
      legacy_input_data: a PCollection of instance dicts.
      legacy_input_metadata: a tft.DatasetMetadata; only its ``schema`` is
        read here.
      label: label for the PTransform that translates `legacy_input_data`
        into the TFXIO input data. Use distinct values when this method is
        called more than once in a beam Pipeline.

    Returns:
      A tuple of a PCollection of `pyarrow.RecordBatch` and a
      `tensor_adapter.TensorAdapterConfig`. This tuple can be fed directly to
      TFT's `{Analyze,Transform,AnalyzeAndTransform}Dataset` APIs.
    """
    compat_tfxio = _LegacyCompatibilityTFXIO(legacy_input_metadata.schema)
    stage_name = 'LegacyFormatToTfxio[%s]' % label
    batch_size = beam_impl.Context.get_desired_batch_size()
    record_batches = (
        legacy_input_data
        | stage_name >> compat_tfxio.BeamSource(batch_size))
    adapter_config = compat_tfxio.TensorAdapterConfig()
    return record_batches, adapter_config
示例13: _clear_shared_state_after_barrier
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import Pipeline [as 别名]
def _clear_shared_state_after_barrier(pipeline, input_barrier):
    """Adds a step that touches `shared.Shared` once a barrier is ready.

    `input_barrier` is reduced with a FlatMap whose callable always returns
    None, and the result is passed as an `AsIter` side input to a Map over a
    single `None` element. That Map calls
    `shared.Shared().acquire(lambda: None)`.

    Args:
      pipeline: A `beam.Pipeline` object.
      input_barrier: A `PCollection` which the pipeline should wait for.

    Returns:
      The `PCollection` produced by the 'WaitAndClearSharedKeepAlives' step.
    """
    barrier_signal = input_barrier | 'MakeCheapBarrier' >> beam.FlatMap(
        lambda x: None)
    single_trigger = (
        pipeline | 'PrepareToClearSharedKeepAlives' >> beam.Create([None]))
    # The side input is unused by the callable; it is only passed along with
    # the single trigger element.
    return single_trigger | 'WaitAndClearSharedKeepAlives' >> beam.Map(
        lambda x, empty_side_input: shared.Shared().acquire(lambda: None),
        beam.pvalue.AsIter(barrier_signal))
示例14: testReadTransformFn
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import Pipeline [as 别名]
def testReadTransformFn(self):
    """Checks both outputs of ReadTransformFn for a temp directory.

    Metadata is written under the temp directory first; the transform is
    then expected to yield the transform_fn directory path and the metadata
    that was written.
    """
    base_dir = self.get_temp_dir()
    # NOTE: we don't need to create or write to the transform_fn directory
    # since ReadTransformFn never inspects this directory.
    expected_fn_dir = os.path.join(
        base_dir, tft.TFTransformOutput.TRANSFORM_FN_DIR)
    metadata_dir = os.path.join(
        base_dir, tft.TFTransformOutput.TRANSFORMED_METADATA_DIR)
    metadata_io.write_metadata(test_metadata.COMPLETE_METADATA, metadata_dir)

    with beam.Pipeline() as pipeline:
        read_result = pipeline | transform_fn_io.ReadTransformFn(base_dir)
        saved_model_dir_pcoll, metadata = read_result
        beam_test_util.assert_that(
            saved_model_dir_pcoll,
            beam_test_util.equal_to([expected_fn_dir]),
            label='AssertSavedModelDir')
        # NOTE: metadata is currently read in a non-deferred manner.
        self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)
示例15: testWriteTransformFnIsIdempotent
# 需要导入模块: import apache_beam [as 别名]
# 或者: from apache_beam import Pipeline [as 别名]
def testWriteTransformFnIsIdempotent(self):
    """Checks that a failing WriteTransformFn leaves no output directory.

    `WriteMetadata.expand` is patched to raise `ArithmeticError`, so applying
    `WriteTransformFn` is expected to fail with that error. Afterwards the
    requested output directory must not exist.
    """
    transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

    def mock_write_metadata_expand(unused_self, unused_metadata):
        raise ArithmeticError('Some error')

    with beam.Pipeline() as pipeline:
        # Create an empty directory for the source saved model dir.
        saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
        saved_model_dir_pcoll = (
            pipeline
            | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))

        with mock.patch.object(
                transform_fn_io.beam_metadata_io.WriteMetadata,
                'expand', mock_write_metadata_expand):
            # assertRaisesRegex replaces the deprecated assertRaisesRegexp
            # alias, which was removed from unittest in Python 3.12.
            with self.assertRaisesRegex(ArithmeticError, 'Some error'):
                _ = ((saved_model_dir_pcoll, object())
                     | transform_fn_io.WriteTransformFn(transform_output_dir))

        self.assertFalse(file_io.file_exists(transform_output_dir))