This article collects typical usage examples of the apache_beam.Filter method in Python. If you are wondering what exactly apache_beam.Filter does, how to call it, or what real code that uses it looks like, the hand-picked examples below may help. You can also explore further usage examples from the apache_beam module.
Six code examples of the apache_beam.Filter method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
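Before diving into the examples, here is a minimal, self-contained sketch of what beam.Filter does: it applies a predicate to every element of a PCollection and keeps only the elements for which the predicate returns a truthy value. The data and transform labels below are illustrative only:

import apache_beam as beam

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'Create' >> beam.Create([1, 2, 3, 4, 5])
      | 'KeepEven' >> beam.Filter(lambda x: x % 2 == 0)
      | 'Print' >> beam.Map(print))  # prints 2 and 4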
Example 1: create_glyphazzn_dataset
# Required module: import apache_beam as beam
# Alternative import: from apache_beam import Filter
def create_glyphazzn_dataset(filepattern, output_path):
  """Creates a glyphazzn dataset, from raw ParquetIO to TFRecords."""

  def pipeline(root):
    """Pipeline for creating the glyphazzn dataset."""
    attrs = ['uni', 'width', 'vwidth', 'sfd', 'id', 'binary_fp']
    examples = root | 'Read' >> beam.io.parquetio.ReadFromParquet(
        file_pattern=filepattern, columns=attrs)
    examples = examples | 'FilterBadIcons' >> beam.Filter(_is_valid_glyph)
    examples = examples | 'ConvertToPath' >> beam.Map(_convert_to_path)
    examples = examples | 'FilterBadPathLengths' >> beam.Filter(_is_valid_path)
    examples = examples | 'ProcessAndConvert' >> beam.Map(_create_example)
    (examples | 'WriteToTFRecord' >> beam.io.tfrecordio.WriteToTFRecord(
        output_path, num_shards=90))

  return pipeline
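The predicates _is_valid_glyph and _is_valid_path are helpers defined elsewhere in the project and are not shown here. As a rough, hypothetical sketch of the shape such a predicate takes (the checks below are assumptions for illustration, not the project's actual validation logic):

def _is_valid_glyph(example):
  # Hypothetical check: ReadFromParquet yields dicts keyed by column name,
  # so a predicate can inspect fields directly. The real project applies
  # its own validation rules.
  return bool(example['sfd']) and example['width'] > 0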
Example 2: expand
# Required module: import apache_beam as beam
# Alternative import: from apache_beam import Filter
def expand(self, examples):
  """Runs the linters on the data and writes out the results.

  The order in which the linters run is unspecified.

  Args:
    examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

  Returns:
    A pipeline containing the `DataLinter` `PTransform`s.
  """
  coders = (beam.coders.coders.StrUtf8Coder(),
            beam.coders.coders.ProtoCoder(lint_result_pb2.LintResult))
  return (
      [examples | linter for linter in self._linters if linter.should_run()]
      | 'MergeResults' >> beam.Flatten()
      # Keep only (name, result) pairs whose result carries warnings.
      | 'DropEmpty' >> beam.Filter(lambda kv: kv[1] and len(kv[1].warnings))
      | 'ToDict' >> beam.combiners.ToDict()
      | 'WriteResults' >> beam.io.textio.WriteToText(
          self._results_path,
          coder=beam.coders.coders.PickleCoder(),
          shard_name_template=''))
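The 'DropEmpty' step filters keyed pairs by their value: a pair survives only if its result is non-empty and has at least one warning. A standalone illustration of the same pattern, using a namedtuple as a stand-in for the real LintResult proto:

import collections

import apache_beam as beam

# Stand-in for lint_result_pb2.LintResult, for illustration only.
FakeResult = collections.namedtuple('FakeResult', ['warnings'])

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'Create' >> beam.Create([('LinterA', FakeResult(warnings=['w'])),
                                 ('LinterB', FakeResult(warnings=[])),
                                 ('LinterC', None)])
      | 'DropEmpty' >> beam.Filter(lambda kv: kv[1] and len(kv[1].warnings))
      | 'Print' >> beam.Map(print))  # only ('LinterA', ...) survives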
Example 3: _lint
# Required module: import apache_beam as beam
# Alternative import: from apache_beam import Filter
def _lint(self, examples):
  feature_val_w_counts = (
      examples
      | 'Tuplize' >> beam.FlatMap(
          utils.example_tuplizer(self._counted_features))
      | 'FlattenFeatureVals' >> beam.FlatMap(self._flatten_feature_vals)
      | 'CountFeatureVals' >> beam.combiners.Count.PerElement())

  if hasattr(self, '_count_transformer'):
    feature_val_w_counts |= 'TransformCounts' >> self._count_transformer

  return (
      feature_val_w_counts
      | 'PairValWithCount' >> beam.Map(self._shift_key)
      | 'GroupByFeature' >> beam.GroupByKey()
      | 'ValCountsToDict' >> beam.Map(self._val_counts_as_dict)
      | 'GenResults' >> beam.Map(self._check_feature)
      | 'DropUnwarned' >> beam.Filter(bool)
      | 'AsList' >> beam.combiners.ToList()
      | 'ToResult' >> beam.Map(self._to_result))
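The 'DropUnwarned' step shows that any callable can serve as the predicate: beam.Filter(bool) simply discards falsy elements (None, empty tuples, and so on). A tiny illustration with made-up values:

import apache_beam as beam

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'Create' >> beam.Create([None, (), ('feature', 'warning'), 0])
      | 'DropUnwarned' >> beam.Filter(bool)
      | 'Print' >> beam.Map(print))  # only ('feature', 'warning') survives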
Example 4: expand
# Required module: import apache_beam as beam
# Alternative import: from apache_beam import Filter
def expand(self, inputs):
  pcoll, = inputs
  # Create a PCollection of (count, element) pairs, then iterate over it to
  # create a single-element PCollection containing this list of pairs in
  # sorted order by decreasing counts (and by values for equal counts).
  # TODO(b/112916494): Unify the graph in both cases once possible.
  if (self._vocab_ordering_type ==
      _VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
    flatten_map_fn = _flatten_to_key_and_means_accumulator_list
    combine_transform = _MutualInformationTransformAccumulate()  # pylint: disable=no-value-for-parameter
  elif self._vocab_ordering_type == _VocabOrderingType.WEIGHTED_FREQUENCY:
    flatten_map_fn = _flatten_value_and_weights_to_list_of_tuples
    combine_transform = beam.CombinePerKey(sum)
  elif self._vocab_ordering_type == _VocabOrderingType.WEIGHTED_LABELS:
    flatten_map_fn = _flatten_value_and_labeled_weights_to_list_of_tuples
    combine_transform = beam.CombinePerKey(sum_labeled_weights)
  else:
    flatten_map_fn = _flatten_value_to_list
    combine_transform = beam.combiners.Count.PerElement()

  result = (
      pcoll
      | 'FlattenTokensAndMaybeWeightsLabels' >> beam.FlatMap(flatten_map_fn)
      | 'CountPerToken' >> combine_transform)

  if self._input_dtype == tf.string:
    # TODO(b/62379925): Filter empty strings or strings containing the \n or
    # \r tokens, since index_table_from_file doesn't allow empty rows.
    def is_problematic_string(kv):
      string, _ = kv  # Ignore counts.
      return string and b'\n' not in string and b'\r' not in string

    result |= 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)

  return result
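Note that, despite its name, is_problematic_string returns True for the strings that should be kept: non-empty byte strings containing no newline or carriage-return characters. A quick standalone check of the filter's behavior on made-up (token, count) pairs:

import apache_beam as beam

def is_problematic_string(kv):
  string, _ = kv  # Ignore counts.
  return string and b'\n' not in string and b'\r' not in string

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'Create' >> beam.Create([(b'token', 3), (b'', 1), (b'bad\ntoken', 2)])
      | 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)
      | 'Print' >> beam.Map(print))  # only (b'token', 3) survives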
Example 5: _count_transformer
# Required module: import apache_beam as beam
# Alternative import: from apache_beam import Filter
def _count_transformer(self):
  # Elements are ((feature, value), count) pairs. The original tuple-unpacking
  # lambdas were Python 2-only syntax; explicit indexing works on Python 3.
  return (
      'DropNaN' >> beam.Filter(lambda kv: not np.isnan(kv[0][1]))
      | 'IsIntegral' >> beam.Map(
          lambda kv: ((kv[0][0], kv[0][1] % 1 == 0), kv[1]))
      | 'Count' >> beam.CombinePerKey(sum))
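Because this method returns a label-prefixed chain of transforms rather than a PCollection, it can be applied on its own, as Example 3 does under the 'TransformCounts' label. A hedged standalone check of the same chain, fed hand-built ((feature, value), count) pairs made up for illustration:

import apache_beam as beam
import numpy as np

count_transformer = (
    'DropNaN' >> beam.Filter(lambda kv: not np.isnan(kv[0][1]))
    | 'IsIntegral' >> beam.Map(
        lambda kv: ((kv[0][0], kv[0][1] % 1 == 0), kv[1]))
    | 'Count' >> beam.CombinePerKey(sum))

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'Create' >> beam.Create([(('age', 3.0), 2), (('age', 2.5), 1),
                                 (('age', float('nan')), 4)])
      | 'TransformCounts' >> count_transformer
      | 'Print' >> beam.Map(print))  # (('age', True), 2), (('age', False), 1)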
Example 6: expand
# Required module: import apache_beam as beam
# Alternative import: from apache_beam import Filter
def expand(self, sliced_record_batchs_and_ys: Tuple[types.SlicedRecordBatch,
                                                    _SlicedYKey]):
  sliced_record_batchs, y_keys = sliced_record_batchs_and_ys

  # _SlicedXYKey(slice, x_path, x, y), xy_count
  partial_copresence_counts = (
      sliced_record_batchs
      | 'ToPartialCopresenceCounts' >> beam.FlatMap(
          _to_partial_copresence_counts, self._y_path, self._x_paths,
          self._y_boundaries, self._weight_column_name))

  # Compute placeholder copresence counts.
  # partial_copresence_counts will only include x-y pairs that are present,
  # but we would also like to keep track of x-y pairs that never appear, as
  # long as x and y independently occur in the slice.

  # _SlicedXKey(slice, x_path, x), x_count
  x_counts = (
      sliced_record_batchs
      | 'ToPartialXCounts' >> beam.FlatMap(
          _to_partial_x_counts, self._x_paths, self._weight_column_name)
      | 'SumXCounts' >> beam.CombinePerKey(sum))
  if self._min_x_count:
    x_counts = x_counts | 'FilterXCounts' >> beam.Filter(
        lambda kv: kv[1] > self._min_x_count)

  # _SlicedXYKey(slice, x_path, x, y), 0
  placeholder_copresence_counts = (
      (x_counts, y_keys)
      | 'GetPlaceholderCopresenceCounts' >> _GetPlaceholderCopresenceCounts(
          self._x_paths, self._min_x_count))

  def move_y_to_value(key, xy_count):
    return _SlicedXKey(key.slice_key, key.x_path, key.x), (key.y, xy_count)

  # _SlicedXKey(slice, x_path, x), (y, xy_count)
  copresence_counts = (
      (placeholder_copresence_counts, partial_copresence_counts)
      | 'FlattenCopresenceCounts' >> beam.Flatten()
      | 'SumCopresencePairs' >> beam.CombinePerKey(sum)
      | 'MoveYToValue' >> beam.MapTuple(move_y_to_value))

  # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count)
  return ({
      'x_count': x_counts,
      'xy_counts': copresence_counts
  }
          | 'CoGroupByForConditionalYRates' >> beam.CoGroupByKey()
          | 'JoinXCounts' >> beam.FlatMap(_join_x_counts))
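The only beam.Filter in this example is the optional 'FilterXCounts' step, which keeps (key, count) pairs whose count exceeds self._min_x_count. In isolation, with made-up keys and a threshold of 1, the same pattern looks like this:

import apache_beam as beam

min_x_count = 1  # illustrative threshold

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'Create' >> beam.Create([('x=a', 3), ('x=b', 1), ('x=c', 5)])
      | 'FilterXCounts' >> beam.Filter(lambda kv: kv[1] > min_x_count)
      | 'Print' >> beam.Map(print))  # keeps ('x=a', 3) and ('x=c', 5)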