This page collects typical usage examples of the Python method tensorflow_transform.compute_and_apply_vocabulary. If you are wondering what tensorflow_transform.compute_and_apply_vocabulary does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore further usage examples from the tensorflow_transform module itself.
Below are 15 code examples of tensorflow_transform.compute_and_apply_vocabulary, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
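Before diving into the examples, here is a minimal, self-contained sketch of what compute_and_apply_vocabulary does: during analysis it builds a frequency-ordered vocabulary over a string (or integer) feature, and at transform time it maps each value to its integer index, with out-of-vocabulary values going to default_value (-1 by default) or into num_oov_buckets hash buckets. The sketch mirrors the pattern in Example 4 below; exact imports can vary slightly across tensorflow_transform versions.

import pprint
import tempfile

import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils


def preprocessing_fn(inputs):
  # 'hello' is the most frequent token, so it gets index 0; 'world' gets 1.
  return {'s_integerized': tft.compute_and_apply_vocabulary(inputs['s'])}


raw_data = [{'s': 'hello'}, {'s': 'world'}, {'s': 'hello'}]
raw_data_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(
        {'s': tf.io.FixedLenFeature([], tf.string)}))

with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
  transformed_dataset, transform_fn = (
      (raw_data, raw_data_metadata)
      | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

transformed_data, _ = transformed_dataset
pprint.pprint(transformed_data)
# Expected (frequency-ordered vocabulary):
# [{'s_integerized': 0}, {'s_integerized': 1}, {'s_integerized': 0}]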
Example 1: testTFIDFNoData
# Required imports: import tensorflow_transform [as alias]
# Or: from tensorflow_transform import compute_and_apply_vocabulary [as alias]
def testTFIDFNoData(self):
  def preprocessing_fn(inputs):
    inputs_as_ints = tft.compute_and_apply_vocabulary(
        tf.compat.v1.strings.split(inputs['a']))
    out_index, out_values = tft.tfidf(inputs_as_ints, 6)
    return {
        'tf_idf': out_values,
        'index': out_index
    }

  input_data = [{'a': ''}]
  input_metadata = tft_unit.metadata_from_feature_spec(
      {'a': tf.io.FixedLenFeature([], tf.string)})
  expected_transformed_data = [{'tf_idf': [], 'index': []}]
  expected_metadata = tft_unit.metadata_from_feature_spec({
      'tf_idf': tf.io.VarLenFeature(tf.float32),
      'index': tf.io.VarLenFeature(tf.int64)
  })
  self.assertAnalyzeAndTransformResults(
      input_data, input_metadata, preprocessing_fn, expected_transformed_data,
      expected_metadata)
Example 2: testVocabularyAnalyzerWithTokenization
# Required imports: import tensorflow_transform [as alias]
# Or: from tensorflow_transform import compute_and_apply_vocabulary [as alias]
def testVocabularyAnalyzerWithTokenization(self):
  def preprocessing_fn(inputs):
    return {
        'index':
            tft.compute_and_apply_vocabulary(
                tf.compat.v1.strings.split(inputs['a']))
    }

  input_data = [{'a': 'hello hello world'}, {'a': 'hello goodbye world'}]
  input_metadata = tft_unit.metadata_from_feature_spec(
      {'a': tf.io.FixedLenFeature([], tf.string)})
  expected_data = [{'index': [0, 0, 1]}, {'index': [0, 2, 1]}]
  expected_metadata = tft_unit.metadata_from_feature_spec({
      'index': tf.io.VarLenFeature(tf.int64),
  }, {
      'index': schema_pb2.IntDomain(min=-1, max=2, is_categorical=True),
  })
  self.assertAnalyzeAndTransformResults(input_data, input_metadata,
                                        preprocessing_fn, expected_data,
                                        expected_metadata)
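The IntDomain(min=-1, max=2) in the expected metadata reflects the default OOV behavior: 'hello' (frequency 3), 'world' (2), and 'goodbye' (1) get indices 0 through 2, and any token unseen during analysis maps to default_value, which defaults to -1. A hedged variant sketch of the alternative, hashing unseen tokens into OOV buckets instead (num_oov_buckets=1 is illustrative):

def preprocessing_fn_with_oov(inputs):
  return {
      'index': tft.compute_and_apply_vocabulary(
          tf.compat.v1.strings.split(inputs['a']), num_oov_buckets=1)
  }
# With OOV buckets the output is always non-negative, so the expected
# IntDomain would become min=0, max=3 (vocab size 3 plus one bucket).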
Example 3: preprocessing_fn
# Required imports: import tensorflow_transform [as alias]
# Or: from tensorflow_transform import compute_and_apply_vocabulary [as alias]
def preprocessing_fn(inputs):
  out = dict()
  for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    out[taxi.transformed_name(key)] = tft.scale_to_z_score(
        taxi.fill_in_missing(inputs[key]))

  for key in taxi.VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    out[taxi.transformed_name(key)] = tft.compute_and_apply_vocabulary(
        taxi.fill_in_missing(inputs[key]), top_k=10, num_oov_buckets=10)

  for key in taxi.BUCKET_FEATURE_KEYS:
    out[taxi.transformed_name(key)] = tft.bucketize(
        taxi.fill_in_missing(inputs[key]), num_buckets=10)

  for key in taxi.CATEGORICAL_FEATURE_KEYS:
    out[taxi.transformed_name(key)] = taxi.fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = taxi.fill_in_missing(inputs[taxi.FARE_KEY])
  tips = taxi.fill_in_missing(inputs[taxi.LABEL_KEY])
  out[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
          tf.int64))
  return out
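Example 3's taxi.fill_in_missing, and the _fill_in_missing helper used by the later taxi examples, is not shown on this page. In the TFX taxi examples it densifies a sparse (batch_size, 1) column, filling missing entries with '' or 0; the sketch below is a representative definition, though the exact code may differ between repos.

def _fill_in_missing(x):
  """Replaces missing values in a SparseTensor and converts it to dense.

  Fills in missing values of `x` with '' or 0, and converts the
  (batch_size, 1) sparse tensor to a dense (batch_size,) tensor.
  """
  default_value = '' if x.dtype == tf.string else 0
  return tf.squeeze(
      tf.sparse.to_dense(
          tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
          default_value),
      axis=1)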
Example 4: main
# Required imports: import tensorflow_transform [as alias]
# Or: from tensorflow_transform import compute_and_apply_vocabulary [as alias]
def main():
  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    x = inputs['x']
    y = inputs['y']
    s = inputs['s']
    x_centered = x - tft.mean(x)
    y_normalized = tft.scale_to_0_1(y)
    s_integerized = tft.compute_and_apply_vocabulary(s)
    x_centered_times_y_normalized = (x_centered * y_normalized)
    return {
        'x_centered': x_centered,
        'y_normalized': y_normalized,
        'x_centered_times_y_normalized': x_centered_times_y_normalized,
        's_integerized': s_integerized
    }

  raw_data = [
      {'x': 1, 'y': 1, 's': 'hello'},
      {'x': 2, 'y': 2, 's': 'world'},
      {'x': 3, 'y': 3, 's': 'hello'}
  ]
  raw_data_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec({
          's': tf.io.FixedLenFeature([], tf.string),
          'y': tf.io.FixedLenFeature([], tf.float32),
          'x': tf.io.FixedLenFeature([], tf.float32),
      }))

  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
        (raw_data, raw_data_metadata) | tft_beam.AnalyzeAndTransformDataset(
            preprocessing_fn))

  transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable
  pprint.pprint(transformed_data)
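For reference, the analyzers resolve to concrete values on this tiny dataset: tft.mean(x) is 2, scale_to_0_1 maps y to [0, 0.5, 1], and the vocabulary assigns 'hello' (frequency 2) index 0 and 'world' index 1. The printed result should therefore look roughly like:

# [{'s_integerized': 0, 'x_centered': -1.0,
#   'x_centered_times_y_normalized': -0.0, 'y_normalized': 0.0},
#  {'s_integerized': 1, 'x_centered': 0.0,
#   'x_centered_times_y_normalized': 0.0, 'y_normalized': 0.5},
#  {'s_integerized': 0, 'x_centered': 1.0,
#   'x_centered_times_y_normalized': 1.0, 'y_normalized': 1.0}]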
Example 5: testWithMoreThanDesiredBatchSize
# Required imports: import tensorflow_transform [as alias]
# Or: from tensorflow_transform import compute_and_apply_vocabulary [as alias]
def testWithMoreThanDesiredBatchSize(self):
  def preprocessing_fn(inputs):
    return {
        'ab': tf.multiply(inputs['a'], inputs['b']),
        'i': tft.compute_and_apply_vocabulary(inputs['c'])
    }

  batch_size = 100
  num_instances = batch_size + 1
  input_data = [{
      'a': 2,
      'b': i,
      'c': '%.10i' % i,  # Front-padded to facilitate lexicographic sorting.
  } for i in range(num_instances)]
  input_metadata = tft_unit.metadata_from_feature_spec({
      'a': tf.io.FixedLenFeature([], tf.float32),
      'b': tf.io.FixedLenFeature([], tf.float32),
      'c': tf.io.FixedLenFeature([], tf.string)
  })
  expected_data = [{
      'ab': 2 * i,
      'i': (len(input_data) - 1) - i,  # Due to reverse lexicographic sorting.
  } for i in range(len(input_data))]
  expected_metadata = tft_unit.metadata_from_feature_spec({
      'ab': tf.io.FixedLenFeature([], tf.float32),
      'i': tf.io.FixedLenFeature([], tf.int64),
  }, {
      'i':
          schema_pb2.IntDomain(
              min=-1, max=num_instances - 1, is_categorical=True)
  })
  self.assertAnalyzeAndTransformResults(
      input_data,
      input_metadata,
      preprocessing_fn,
      expected_data,
      expected_metadata,
      desired_batch_size=batch_size)
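The "reverse lexicographic sorting" comment is the key to the expected ids: every string in 'c' occurs exactly once, and this test asserts that when frequencies tie, the vocabulary orders entries by descending byte value, so the largest zero-padded number gets index 0. A hypothetical three-row illustration:

# All frequencies equal -> ties broken in reverse lexicographic order:
#   input 'c' column: '0000000000', '0000000001', '0000000002'
#   vocabulary:       {'0000000002': 0, '0000000001': 1, '0000000000': 2}
# i.e. row i maps to (num_instances - 1) - i, exactly as asserted above.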
Example 6: testStringToTFIDF
# Required imports: import tensorflow_transform [as alias]
# Or: from tensorflow_transform import compute_and_apply_vocabulary [as alias]
def testStringToTFIDF(self):
  def preprocessing_fn(inputs):
    inputs_as_ints = tft.compute_and_apply_vocabulary(
        tf.compat.v1.strings.split(inputs['a']))
    out_index, out_values = tft.tfidf(inputs_as_ints, 6)
    return {
        'tf_idf': out_values,
        'index': out_index
    }

  input_data = [{'a': 'hello hello world'},
                {'a': 'hello goodbye hello world'},
                {'a': 'I like pie pie pie'}]
  input_metadata = tft_unit.metadata_from_feature_spec(
      {'a': tf.io.FixedLenFeature([], tf.string)})

  # IDFs. The constants below match a smoothed, add-one IDF over the 3
  # documents, idf = 1 + log((N + 1) / (df + 1)):
  # hello, world appear in 2 docs     -> idf = 1 + log(4/3)
  # goodbye, I, like, pie in 1 doc    -> idf = 1 + log(4/2)
  log_4_over_2 = 1.69314718056  # 1 + log(4/2)
  log_4_over_3 = 1.28768207245  # 1 + log(4/3)
  expected_transformed_data = [{
      'tf_idf': [(2/3)*log_4_over_3, (1/3)*log_4_over_3],
      'index': [0, 2]
  }, {
      'tf_idf': [(2/4)*log_4_over_3, (1/4)*log_4_over_3, (1/4)*log_4_over_2],
      'index': [0, 2, 4]
  }, {
      'tf_idf': [(3/5)*log_4_over_2, (1/5)*log_4_over_2, (1/5)*log_4_over_2],
      'index': [1, 3, 5]
  }]
  expected_metadata = tft_unit.metadata_from_feature_spec({
      'tf_idf': tf.io.VarLenFeature(tf.float32),
      'index': tf.io.VarLenFeature(tf.int64)
  })
  self.assertAnalyzeAndTransformResults(
      input_data, input_metadata, preprocessing_fn,
      expected_transformed_data, expected_metadata)
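As a sanity check on the first expected row: 'hello hello world' has 3 tokens, so hello's term frequency is 2/3 and world's is 1/3, and both carry idf = 1 + log(4/3):

import math

idf_hello = 1 + math.log(4 / 3)  # 1.28768207245, the log_4_over_3 above
print((2 / 3) * idf_hello)       # ~0.85845 -> first 'tf_idf' entry (hello)
print((1 / 3) * idf_hello)       # ~0.42923 -> second 'tf_idf' entry (world)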
Example 7: testStringToTFIDFEmptyDoc
# Required imports: import tensorflow_transform [as alias]
# Or: from tensorflow_transform import compute_and_apply_vocabulary [as alias]
def testStringToTFIDFEmptyDoc(self):
  def preprocessing_fn(inputs):
    inputs_as_ints = tft.compute_and_apply_vocabulary(
        tf.compat.v1.strings.split(inputs['a']))
    out_index, out_values = tft.tfidf(inputs_as_ints, 6)
    return {
        'tf_idf': out_values,
        'index': out_index
    }

  input_data = [{'a': 'hello hello world'},
                {'a': ''},
                {'a': 'hello goodbye hello world'},
                {'a': 'I like pie pie pie'}]
  input_metadata = tft_unit.metadata_from_feature_spec(
      {'a': tf.io.FixedLenFeature([], tf.string)})

  # As in the previous example, but the empty document counts toward the
  # corpus size of 4, so the smoothed numerator becomes 5.
  log_5_over_2 = 1.91629073187  # 1 + log(5/2)
  log_5_over_3 = 1.51082562376  # 1 + log(5/3)
  expected_transformed_data = [{
      'tf_idf': [(2/3)*log_5_over_3, (1/3)*log_5_over_3],
      'index': [0, 2]
  }, {
      'tf_idf': [],
      'index': []
  }, {
      'tf_idf': [(2/4)*log_5_over_3, (1/4)*log_5_over_3, (1/4)*log_5_over_2],
      'index': [0, 2, 4]
  }, {
      'tf_idf': [(3/5)*log_5_over_2, (1/5)*log_5_over_2, (1/5)*log_5_over_2],
      'index': [1, 3, 5]
  }]
  expected_metadata = tft_unit.metadata_from_feature_spec({
      'tf_idf': tf.io.VarLenFeature(tf.float32),
      'index': tf.io.VarLenFeature(tf.int64)
  })
  self.assertAnalyzeAndTransformResults(
      input_data, input_metadata, preprocessing_fn,
      expected_transformed_data, expected_metadata)
Example 8: testPipelineAPICounters
# Required imports: import tensorflow_transform [as alias]
# Or: from tensorflow_transform import compute_and_apply_vocabulary [as alias]
def testPipelineAPICounters(self):
  def preprocessing_fn(inputs):
    _ = tft.vocabulary(inputs['a'])
    return {
        'a_int': tft.compute_and_apply_vocabulary(inputs['a']),
        'x_scaled': tft.scale_to_0_1(inputs['x']),
        'y_scaled': tft.scale_to_0_1(inputs['y'])
    }

  with self._makeTestPipeline() as pipeline:
    input_data = pipeline | 'CreateTrainingData' >> beam.Create([{
        'x': 4,
        'y': 5,
        'a': 'hello'
    }, {
        'x': 1,
        'y': 3,
        'a': 'world'
    }])
    metadata = tft_unit.metadata_from_feature_spec({
        'x': tf.io.FixedLenFeature([], tf.float32),
        'y': tf.io.FixedLenFeature([], tf.float32),
        'a': tf.io.FixedLenFeature([], tf.string)
    })
    with beam_impl.Context(temp_dir=self.get_temp_dir()):
      input_data, metadata = self._MaybeConvertInputsToTFXIO(
          input_data, metadata)
      _ = ((input_data, metadata)
           | 'AnalyzeDataset' >> beam_impl.AnalyzeDataset(preprocessing_fn))

  metrics = pipeline.metrics
  self.assertMetricsCounterEqual(metrics, 'tft_analyzer_vocabulary', 1)
  self.assertMetricsCounterEqual(metrics, 'tft_mapper_scale_to_0_1', 2)
  self.assertMetricsCounterEqual(metrics,
                                 'tft_mapper_compute_and_apply_vocabulary', 1)
  # compute_and_apply_vocabulary implicitly calls apply_vocabulary; we check
  # that the implicit call is not counted separately.
  self.assertMetricsCounterEqual(metrics, 'tft_mapper_apply_vocabulary', 0)
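Outside this test harness, the same counters can be read from a plain Beam pipeline result. A hedged sketch, assuming `result = pipeline.run()` has completed:

from apache_beam.metrics.metric import MetricsFilter

query = result.metrics().query(
    MetricsFilter().with_name('tft_mapper_compute_and_apply_vocabulary'))
for counter in query['counters']:
  # Prints the counter name and its committed value (expected to be 1 here).
  print(counter.key.metric.name, counter.committed)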
Example 9: preprocessing_fn
# Required imports: import tensorflow_transform [as alias]
# Or: from tensorflow_transform import compute_and_apply_vocabulary [as alias]
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT,
        always_return_num_quantiles=False)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
          tf.int64))
  return outputs
Example 10: _tokenize_review
# Required imports: import tensorflow_transform [as alias]
# Or: from tensorflow_transform import compute_and_apply_vocabulary [as alias]
def _tokenize_review(review):
  """Tokenizes the reviews by splitting them into words.

  Constructs a vocabulary and maps each word to its frequency-rank index in
  that vocabulary.

  Args:
    review: tensors containing the reviews. (batch_size/None, 1)

  Returns:
    Tokenized and padded review tensors. (batch_size/None, _MAX_LEN)
  """
  review_sparse = tf.strings.split(tf.reshape(review, [-1])).to_sparse()
  # tft.apply_vocabulary doesn't reserve 0 for oov words. In order to comply
  # with convention and use mask_zero in keras.embedding layer, set oov value
  # to _VOCAB_SIZE and padding value to -1. Then add 1 to all the tokens.
  review_indices = tft.compute_and_apply_vocabulary(
      review_sparse, default_value=_VOCAB_SIZE, top_k=_VOCAB_SIZE)
  dense = tf.sparse.to_dense(review_indices, default_value=-1)
  # TFX transform expects the transform result to be FixedLenFeature.
  padding_config = [[0, 0], [0, _MAX_LEN]]
  dense = tf.pad(dense, padding_config, 'CONSTANT', -1)
  padded = tf.slice(dense, [0, 0], [-1, _MAX_LEN])
  padded += 1
  return padded


# TFX Transform will call this function.
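The pad-then-slice dance at the end is worth tracing with toy values: densifying the sparse ids uses -1 for padding, tf.pad guarantees at least _MAX_LEN columns, tf.slice truncates to exactly _MAX_LEN, and the final +1 shifts padding to 0 (so Keras mask_zero works) and the OOV id to _VOCAB_SIZE + 1. A self-contained sketch with illustrative constants (the real _MAX_LEN and _VOCAB_SIZE live in the module above):

import tensorflow as tf

_MAX_LEN = 5
_VOCAB_SIZE = 10

# Toy stand-in for the output of tft.compute_and_apply_vocabulary: a ragged
# batch of token ids, where _VOCAB_SIZE marks an OOV token.
review_indices = tf.ragged.constant([[0, 2, 2], [_VOCAB_SIZE]]).to_sparse()

dense = tf.sparse.to_dense(review_indices, default_value=-1)
dense = tf.pad(dense, [[0, 0], [0, _MAX_LEN]], 'CONSTANT', -1)
padded = tf.slice(dense, [0, 0], [-1, _MAX_LEN]) + 1
print(padded)
# [[ 1  3  3  0  0]
#  [11  0  0  0  0]]  <- padding is 0 (mask_zero-friendly), OOV is 11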
Example 11: preprocessing_fn
# Required imports: import tensorflow_transform [as alias]
# Or: from tensorflow_transform import compute_and_apply_vocabulary [as alias]
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
          tf.int64))
  return outputs
Example 12: preprocessing_fn
# Required imports: import tensorflow_transform [as alias]
# Or: from tensorflow_transform import compute_and_apply_vocabulary [as alias]
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]),
        _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # TODO(b/157064428): Support label transformation for Keras.
  # Do not apply label transformation as it will result in wrong evaluation.
  outputs[_transformed_name(_LABEL_KEY)] = inputs[_LABEL_KEY]
  return outputs


# TFX Trainer will call this function.
Example 13: preprocessing_fn
# Required imports: import tensorflow_transform [as alias]
# Or: from tensorflow_transform import compute_and_apply_vocabulary [as alias]
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in features.DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[features.transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in features.VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[features.transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=features.VOCAB_SIZE,
        num_oov_buckets=features.OOV_SIZE)

  for key, num_buckets in zip(features.BUCKET_FEATURE_KEYS,
                              features.BUCKET_FEATURE_BUCKET_COUNT):
    outputs[features.transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]),
        num_buckets)

  for key in features.CATEGORICAL_FEATURE_KEYS:
    outputs[features.transformed_name(key)] = _fill_in_missing(inputs[key])

  # TODO(b/157064428): Support label transformation for Keras.
  # Do not apply label transformation as it will result in wrong evaluation.
  outputs[features.transformed_name(
      features.LABEL_KEY)] = inputs[features.LABEL_KEY]
  return outputs
Example 14: preprocessing_fn
# Required imports: import tensorflow_transform [as alias]
# Or: from tensorflow_transform import compute_and_apply_vocabulary [as alias]
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(_identity(inputs[key])))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]),
        _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
          tf.int64))
  return outputs
Example 15: _preprocess_tft
# Required imports: import tensorflow_transform [as alias]
# Or: from tensorflow_transform import compute_and_apply_vocabulary [as alias]
def _preprocess_tft(raw_data, user_freq, item_freq):
  """Creates vocabularies for users and items and maps their ids to ints.

  Args:
    raw_data: a dict of shape {$user_key: tensor, $item_key: tensor, ...}.
    user_freq: minimum frequency of a user to include it in the user vocab.
    item_freq: minimum frequency of an item to include it in the item vocab.

  Returns:
    A dict containing int ids corresponding to a user_id and item_id and
    other features: {$user_key: $user_id, $item_key: $item_id, ...}.
  """
  features = {feature: raw_data[feature] for feature in constants.BQ_FEATURES}
  tft_features = {
      constants.TFT_USER_KEY: tft.compute_and_apply_vocabulary(
          raw_data[constants.USER_KEY],
          vocab_filename=constants.USER_VOCAB_NAME,
          frequency_threshold=user_freq,
          default_value=constants.TFT_DEFAULT_ID),
      constants.TFT_ITEM_KEY: tft.compute_and_apply_vocabulary(
          raw_data[constants.ITEM_KEY],
          vocab_filename=constants.ITEM_VOCAB_NAME,
          frequency_threshold=item_freq,
          default_value=constants.TFT_DEFAULT_ID),
      constants.TFT_ARTIST_KEY: tft.compute_and_apply_vocabulary(
          raw_data[constants.ARTIST_KEY],
          vocab_filename=constants.ARTIST_VOCAB_NAME,
          default_value=constants.TFT_DEFAULT_ID),
      constants.TFT_TAGS_KEY: tft.compute_and_apply_vocabulary(
          raw_data[constants.TAGS_KEY],
          vocab_filename=constants.TAG_VOCAB_NAME,
          default_value=constants.TFT_DEFAULT_ID),
  }
  features.update(tft_features)
  return features
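This last example uses two parameters the earlier ones didn't: vocab_filename, which fixes the on-disk name of the exported vocabulary file (useful for serving-time lookups), and frequency_threshold, which drops values seen fewer than that many times during analysis; dropped values then map to default_value at transform time, just like other OOV values. A minimal hedged sketch of the threshold's effect (all names here are illustrative):

# Analysis data for a user column: ['alice', 'alice', 'bob']
# With frequency_threshold=2, only 'alice' makes it into the vocabulary:
#   'alice' -> 0
#   'bob'   -> default_value (-1 here), since its frequency (1) < 2
out = tft.compute_and_apply_vocabulary(
    users,                        # hypothetical tensor of user-id strings
    vocab_filename='user_vocab',  # illustrative exported filename
    frequency_threshold=2,
    default_value=-1)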