This article collects typical usage examples of Python's apache_beam.DoFn. If you are wondering what apache_beam.DoFn does or how to use it in practice, the curated code examples below may help. You can also explore further usage examples from the apache_beam module.
The following 15 code examples of apache_beam.DoFn are shown, ordered by popularity by default.
Example 1: shuffle
# Required module: import apache_beam [as alias]
# Or: from apache_beam import DoFn [as alias]
def shuffle(p):
  """Shuffles data from PCollection.

  Args:
    p: PCollection.

  Returns:
    PCollection of shuffled data.
  """

  class _AddRandomKey(beam.DoFn):

    def process(self, element):
      yield random.random(), element

  shuffled_data = (
      p
      | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
      | 'GroupByRandom' >> beam.GroupByKey()
      # Tuple-parameter lambdas (lambda (k, vs): vs) are Python 2 only; unpack
      # explicitly instead.
      | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
  return shuffled_data
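For context, here is a minimal usage sketch of this helper; the input elements and the in-process runner invocation are illustrative assumptions, not part of the original example.

# Usage sketch (assumed context): apply shuffle() to an in-memory PCollection.
import random

import apache_beam as beam

with beam.Pipeline() as p:  # DirectRunner by default
  examples = p | 'CreateExamples' >> beam.Create([{'id': i} for i in range(100)])
  shuffled = shuffle(examples)
  shuffled | 'PrintSample' >> beam.Map(print)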
Example 2: shuffle_data
# Required module: import apache_beam [as alias]
# Or: from apache_beam import DoFn [as alias]
def shuffle_data(p):
  """Shuffles data from PCollection.

  Args:
    p: PCollection.

  Returns:
    PCollection of shuffled data.
  """

  class _AddRandomKey(beam.DoFn):

    def process(self, element):
      yield (random.random(), element)

  shuffled_data = (
      p
      | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
      | 'GroupByRandom' >> beam.GroupByKey()
      # As in Example 1, avoid the Python 2-only tuple-parameter lambda.
      | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
  return shuffled_data
Example 3: process
# Required module: import apache_beam [as alias]
# Or: from apache_beam import DoFn [as alias]
def process(self,
            element,
            timestamp=beam.DoFn.TimestampParam,
            window=beam.DoFn.WindowParam,
            pane_info=beam.DoFn.PaneInfoParam):
  # Audit log for the side-input refresh process. This statement is logged only
  # when a Pub/Sub notification triggers a refresh (normally once every x hours).
  if isinstance(window, beam.transforms.window.GlobalWindow):
    logging.info(
        f"(Re)loading side input data from basepath {element.decode()} for global window: {timestamp} - {window}"
    )
  else:
    logging.info(
        f"(Re)loading side input data from basepath {element.decode()} for window: {util.get_formatted_time(window.start)} - {util.get_formatted_time(window.end)}"
    )

  for sideinput_type in self.sideinput_types:
    yield beam.pvalue.TaggedOutput(
        sideinput_type,
        FileSystems.join(element.decode(), sideinput_type, self.file_prefix))
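A hedged sketch of how a DoFn with this process method might be wired into a pipeline; the class name SideInputPathsDoFn, its constructor arguments, the Pub/Sub topic, and the side-input type names are assumptions for illustration only.

# Illustrative wiring only; all names below are assumed, not from the original code.
refresh_paths = (
    p
    | 'ReadRefreshNotifications' >> beam.io.ReadFromPubSub(topic=refresh_topic)
    | 'BuildSideInputPaths' >> beam.ParDo(
        SideInputPathsDoFn(sideinput_types=['products', 'stores'],
                           file_prefix='*.csv')).with_outputs('products', 'stores'))

# Each tagged output is a PCollection of file patterns for one side-input type.
product_paths = refresh_paths.products
store_paths = refresh_paths.stores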
Example 4: process
# Required module: import apache_beam [as alias]
# Or: from apache_beam import DoFn [as alias]
def process(self, element, *args, **kwargs):
  """Creates random records based on the properties of the passed
  DataGenerator object for each element in the prior PCollection.

  Args:
    element: A single element of the PCollection.
  """
  faker_schema = self.data_gen.get_faker_schema()

  try:
    # Here the element is treated as the dictionary representing a single row
    # of the histogram table.
    frequency = element.get('frequency')
    # TODO: Make this a splittable DoFn to avoid hanging on large frequency
    # values.
    for _ in range(int(frequency)):
      row = self.generate_fake(fschema=faker_schema, key_dict=element)
      yield row
  except AttributeError:
    # The contents of this element are ignored if they are a string.
    row = self.generate_fake(fschema=faker_schema, key_dict=element)
    yield row
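A hedged usage sketch for this process method; the wrapping class name FakeRowGenDoFn, the DataGenerator instance, and the BigQuery query are all assumptions, since only the process body is shown above.

# Illustrative only; class, variable, and query names are assumed.
histogram_rows = (
    p
    | 'ReadHistogramTable' >> beam.io.ReadFromBigQuery(
        query='SELECT * FROM `project.dataset.histogram_table`',
        use_standard_sql=True))

fake_rows = histogram_rows | 'GenerateFakeRows' >> beam.ParDo(
    FakeRowGenDoFn(data_gen))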
Example 5: process
# Required module: import apache_beam [as alias]
# Or: from apache_beam import DoFn [as alias]
def process(self, inputs):
  # Create a session for every worker only once. The session is not
  # pickleable, so it can't be created in the DoFn constructor.
  if not self.session:
    self.graph = ops.Graph()
    with self.graph.as_default():
      self.session = tf.Session()
      metagraph_def = tf.compat.v1.saved_model.load(
          self.session, {self.meta_tag}, self.model_dir)
    signature_def = metagraph_def.signature_def[self.meta_signature]

    # inputs
    self.feed_tensors = {
        k: self.graph.get_tensor_by_name(v.name)
        for k, v in signature_def.inputs.items()
    }

    # outputs/predictions
    self.fetch_tensors = {
        k: self.graph.get_tensor_by_name(v.name)
        for k, v in signature_def.outputs.items()
    }

  # Create a feed_dict for a single element.
  feed_dict = {
      tensor: [inputs[key]]
      for key, tensor in self.feed_tensors.items()
      if key in inputs
  }
  results = self.session.run(self.fetch_tensors, feed_dict)

  yield {
      'id': inputs[self.id_key],
      'predictions': results[self.meta_predictions][0].tolist()
  }
# [START dataflow_molecules_run_definition]
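The process method above relies on state set up elsewhere in the class; below is a minimal constructor sketch consistent with it. The attribute names are inferred from the code above, while the class name, defaults, and signature are assumptions.

# Assumed constructor; only the attribute names are taken from the process method above.
class Predict(beam.DoFn):

  def __init__(self, model_dir, id_key, meta_tag='serve',
               meta_signature='predict', meta_predictions='predictions'):
    super().__init__()
    self.model_dir = model_dir
    self.id_key = id_key
    self.meta_tag = meta_tag
    self.meta_signature = meta_signature
    self.meta_predictions = meta_predictions
    # The TF graph and session are created lazily in process(), once per
    # worker, because they cannot be pickled along with the DoFn.
    self.graph = None
    self.session = None
    self.feed_tensors = None
    self.fetch_tensors = None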
Example 6: __init__
# Required module: import apache_beam [as alias]
# Or: from apache_beam import DoFn [as alias]
def __init__(self, all_periods, all_nbins, weight_min_factor,
             duration_density_min, duration_min_days, duration_density_max,
             duration_min_fraction):
  """Initializes the DoFn."""
  self.all_periods = all_periods
  self.all_nbins = all_nbins
  self.max_nbins = max(self.all_nbins)
  self.weight_min_factor = weight_min_factor
  self.duration_density_min = duration_density_min
  self.duration_min_days = duration_min_days
  self.duration_density_max = duration_density_max
  self.duration_min_fraction = duration_min_fraction
Example 7: __init__
# Required module: import apache_beam [as alias]
# Or: from apache_beam import DoFn [as alias]
def __init__(self, model_name, model_dir, config_name=None):
  """Initializes the DoFn.

  Args:
    model_name: Name of the model class.
    model_dir: Directory containing a model checkpoint.
    config_name: Optional name of the model configuration. If not specified,
      the file 'config.json' in model_dir is used.
  """
  # Look up the model class.
  model_class = models.get_model_class(model_name)

  # Find the latest checkpoint.
  checkpoint_file = tf.train.latest_checkpoint(model_dir)
  if not checkpoint_file:
    raise ValueError("No checkpoint file found in: {}".format(model_dir))

  # Get the model configuration.
  if config_name:
    config = models.get_model_config(model_name, config_name)
  else:
    with tf.gfile.Open(os.path.join(model_dir, "config.json")) as f:
      config = json.load(f)
  config = configdict.ConfigDict(config)

  self.model_class = model_class
  self.checkpoint_file = checkpoint_file
  self.config = config
Example 8: __init__
# Required module: import apache_beam [as alias]
# Or: from apache_beam import DoFn [as alias]
def __init__(self,
             gap_width,
             normalize_method,
             normalize_args,
             upward_outlier_sigma_cut=None,
             downward_outlier_sigma_cut=None,
             remove_events_width_factor=1.5,
             output_name="light_curve"):
  """Initializes the DoFn.

  Args:
    gap_width: Minimum gap size (in time units) to split the light curve
      before fitting the normalization curve.
    normalize_method: Method for fitting the normalization curve.
    normalize_args: Arguments passed to the function that computes the
      normalization curve.
    upward_outlier_sigma_cut: Number of standard deviations from the median
      flux value above which upward outliers are removed.
    downward_outlier_sigma_cut: Number of standard deviations from the median
      flux value beyond which downward outliers are removed.
    remove_events_width_factor: Fraction of the duration to remove when
      removing periodic events.
    output_name: Name of the processed light curve in the output dict.
  """
  self.remove_events_width_factor = remove_events_width_factor
  self.gap_width = gap_width
  self.normalize_method = normalize_method
  self.normalize_args = normalize_args
  self.upward_outlier_sigma_cut = upward_outlier_sigma_cut
  self.downward_outlier_sigma_cut = downward_outlier_sigma_cut
  self.output_name = output_name
Example 9: main
# Required module: import apache_beam [as alias]
# Or: from apache_beam import DoFn [as alias]
def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  def pipeline(root):
    """Beam pipeline for preprocessing open images."""
    assert FLAGS.input_file_pattern
    assert FLAGS.output_dir
    assert FLAGS.output_name
    assert FLAGS.num_shards
    assert FLAGS.kepid_whitelist

    # Read label whitelist.
    kepid_whitelist = [int(kepid) for kepid in FLAGS.kepid_whitelist.split(",")]
    logging.info("Read Kepid whitelist with %d labels", len(kepid_whitelist))

    # Initialize DoFn.
    process_example = ProcessExampleDoFn(kepid_whitelist)

    # Create pipeline.
    # pylint: disable=expression-not-assigned
    (root
     | "read_tfrecord" >> beam.io.tfrecordio.ReadFromTFRecord(
         FLAGS.input_file_pattern,
         coder=beam.coders.ProtoCoder(tf.train.Example))
     | "process_examples" >> beam.ParDo(process_example)
     | "reshuffle" >> beam.Reshuffle()
     | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(FLAGS.output_dir, FLAGS.output_name),
         coder=beam.coders.ProtoCoder(tf.train.Example),
         num_shards=FLAGS.num_shards))
    # pylint: enable=expression-not-assigned

  pipeline.run()
  logging.info("Processing complete.")
Example 10: __init__
# Required module: import apache_beam [as alias]
# Or: from apache_beam import DoFn [as alias]
def __init__(self,
             kepler_data_dir,
             flux_column="PDCSAP_FLUX",
             injected_group=None,
             scramble_type=None,
             invert_light_curves=False,
             upward_outlier_clipping=None,
             downward_outlier_clipping=None,
             clip_lowest_n_values=None,
             normalize_stddev=False):
  """Initializes the DoFn.

  Args:
    kepler_data_dir: Base directory containing Kepler data.
    flux_column: Name of the flux column to extract.
    injected_group: Optional string specifying the injected group. One of
      {'inj1', 'inj2', 'inj3'}.
    scramble_type: Optional string specifying the scramble order. One of
      {'SCR1', 'SCR2', 'SCR3'}.
    invert_light_curves: Whether to reflect light curves around the median
      flux value.
    upward_outlier_clipping: If specified, clip upward flux values to this
      number of multiples of the standard deviation.
    downward_outlier_clipping: If specified, clip downward flux values to this
      number of multiples of the standard deviation.
    clip_lowest_n_values: If specified, clip the lowest flux values to the
      value of the nth lowest value.
    normalize_stddev: Whether to divide the flux by the standard deviation.
  """
  self.kepler_data_dir = kepler_data_dir
  self.flux_column = flux_column
  self.injected_group = injected_group
  self.extension = "INJECTED LIGHTCURVE" if injected_group else "LIGHTCURVE"
  self.scramble_type = scramble_type
  self.invert_light_curves = invert_light_curves
  self.upward_outlier_clipping = upward_outlier_clipping
  self.downward_outlier_clipping = downward_outlier_clipping
  self.clip_lowest_n_values = clip_lowest_n_values
  self.normalize_stddev = normalize_stddev
Example 11: __init__
# Required module: import apache_beam [as alias]
# Or: from apache_beam import DoFn [as alias]
def __init__(self, hparams, dataset_overrides):
  """Initializes the DoFn."""
  self.hparams = hparams
  self.dataset_overrides = dataset_overrides
Example 12: __init__
# Required module: import apache_beam [as alias]
# Or: from apache_beam import DoFn [as alias]
def __init__(self, batch_size):
  """Constructor of the EmitAsBatchDoFn beam.DoFn class.

  Args:
    batch_size: Maximum number of records to buffer before emitting a batch.
  """
  self._batch_size = batch_size
  self._cached = []
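Only the constructor is shown above; the sketch below fills in the rest of such a batching DoFn following the common Beam pattern (buffer in process, flush in finish_bundle). The process and finish_bundle bodies are assumptions, not the original implementation.

# Assumed completion of the batching pattern; not the original implementation.
import apache_beam as beam
from apache_beam.transforms.window import GlobalWindow
from apache_beam.utils.timestamp import MIN_TIMESTAMP
from apache_beam.utils.windowed_value import WindowedValue


class EmitAsBatchDoFn(beam.DoFn):
  """Buffers elements and emits them as lists of at most batch_size items."""

  def __init__(self, batch_size):
    self._batch_size = batch_size
    self._cached = []

  def process(self, element):
    self._cached.append(element)
    if len(self._cached) >= self._batch_size:
      emit = self._cached
      self._cached = []
      yield emit

  def finish_bundle(self):
    # Flush any remaining buffered elements at the end of the bundle.
    # finish_bundle must yield WindowedValue instances.
    if self._cached:
      yield WindowedValue(self._cached, MIN_TIMESTAMP, [GlobalWindow()])
      self._cached = []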
Example 13: process
# Required module: import apache_beam [as alias]
# Or: from apache_beam import DoFn [as alias]
def process(self, batch, saved_model_dir):
  """Runs the given graph to realize the output `Tensor` or `SparseTensor`s.

  Runs the graph in a TF session for computing the output values of the
  `Tensor` or `SparseTensor`s, given an input row of data (input `Tensor` or
  `SparseTensor`s).

  Args:
    batch: The batch of elements being processed by the DoFn.
    saved_model_dir: Directory containing the saved model.

  Yields:
    A representation of output features as a dict mapping keys (logical column
    names) to values.
  """
  if self._graph_state is None:
    # If available, acquire will return a cached _GraphState, since calling
    # _make_graph_state is expensive.
    self._graph_state = self._shared_graph_state_handle.acquire(
        lambda: self._make_graph_state(saved_model_dir))

  # This should remain true throughout the lifetime of this DoFn, regardless
  # of whether or not self._graph_state was cached.
  assert self._graph_state.saved_model_dir == saved_model_dir

  yield self._handle_batch(batch)
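In libraries that use this pattern (for example TensorFlow Transform-style batch inference), the saved_model_dir argument of process is typically supplied as a deferred side input rather than hard-coded; a hedged sketch of that wiring, where the DoFn instance and the PCollections are assumptions:

# Illustrative only: passing saved_model_dir to process() as a singleton side input.
transformed_batches = batches | 'RunGraph' >> beam.ParDo(
    run_graph_dofn,  # an instance of the DoFn whose process method is shown above
    saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir_pcoll))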
Example 14: process
# Required module: import apache_beam [as alias]
# Or: from apache_beam import DoFn [as alias]
def process(self, element, publish_time=beam.DoFn.TimestampParam):
  """Processes each incoming windowed element by extracting the Pub/Sub
  message and its publish timestamp into a dictionary. `publish_time`
  defaults to the publish timestamp returned by the Pub/Sub server. It
  is bound to each element by Beam at runtime.
  """
  yield {
      "message_body": element.decode("utf-8"),
      "publish_time": datetime.datetime.utcfromtimestamp(
          float(publish_time)
      ).strftime("%Y-%m-%d %H:%M:%S.%f"),
  }
Example 15: run
# Required module: import apache_beam [as alias]
# Or: from apache_beam import DoFn [as alias]
def run(input_topic, output_path, window_size=1.0, pipeline_args=None):
  # `save_main_session` is set to True because some DoFns rely on
  # globally imported modules.
  pipeline_options = PipelineOptions(
      pipeline_args, streaming=True, save_main_session=True
  )

  with beam.Pipeline(options=pipeline_options) as pipeline:
    (
        pipeline
        | "Read PubSub Messages"
        >> beam.io.ReadFromPubSub(topic=input_topic)
        | "Window into" >> GroupWindowsIntoBatches(window_size)
        | "Write to GCS" >> beam.ParDo(WriteBatchesToGCS(output_path))
    )
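A hedged sketch of how such a run() function is typically invoked from the command line; the flag names mirror the parameters above, but the argparse wiring itself is an assumption rather than part of the excerpt.

# Assumed command-line entry point for the run() function above.
import argparse
import logging

if __name__ == "__main__":
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()
  parser.add_argument("--input_topic",
                      help="Pub/Sub topic to read from, in the form "
                           "projects/<PROJECT>/topics/<TOPIC>.")
  parser.add_argument("--output_path",
                      help="GCS path prefix for the output files.")
  parser.add_argument("--window_size",
                      type=float,
                      default=1.0,
                      help="Window size in minutes.")
  known_args, pipeline_args = parser.parse_known_args()

  run(known_args.input_topic, known_args.output_path, known_args.window_size,
      pipeline_args)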