This article collects typical usage examples of the Python class tensor2tensor.data_generators.text_encoder.SubwordTextEncoder. If you are unsure what text_encoder.SubwordTextEncoder is for or how to call it, the curated examples below should help; for more detail you can also look at the module it is defined in, tensor2tensor.data_generators.text_encoder.
The 13 code examples below show text_encoder.SubwordTextEncoder in use, ordered roughly by popularity.
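Before diving into the examples, here is a minimal sketch of the typical workflow: count tokens, build a subword vocabulary to an approximate target size, then encode and decode. The corpus and the size parameters are illustrative choices, not values required by any example below.

import collections

from tensor2tensor.data_generators import text_encoder

# Toy corpus; whitespace-token counts drive the subword vocabulary build.
corpus = "the quick brown fox jumps over the lazy dog"
token_counts = collections.Counter(corpus.split(" "))

# Build a vocabulary of roughly 100 subtokens; 2 and 10 bound the binary
# search over the minimum token count used by build_to_target_size.
encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
    100, token_counts, 2, 10)

# encode() maps a string to subtoken ids; decode() inverts it exactly.
ids = encoder.encode(corpus)
assert encoder.decode(ids) == corpus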
Example 1: test_custom_reserved_tokens
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder [as alias]
def test_custom_reserved_tokens(self):
  """Test that we can pass custom reserved tokens to SubwordTextEncoder."""
  corpus = "The quick brown fox jumps over the lazy dog"
  token_counts = collections.Counter(corpus.split(" "))

  start_symbol = "<S>"
  end_symbol = "<E>"
  reserved_tokens = text_encoder.RESERVED_TOKENS + [start_symbol, end_symbol]
  encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
      10, token_counts, 2, 10, reserved_tokens=reserved_tokens)

  # Make sure that reserved tokens appear in the right places.
  self.assertEqual(encoder.decode([2]), start_symbol)
  self.assertEqual(encoder.decode([3]), end_symbol)

  # Make sure that we haven't messed up the ability to reconstruct.
  reconstructed_corpus = encoder.decode(encoder.encode(corpus))
  self.assertEqual(corpus, reconstructed_corpus)
Example 2: test_encodable_when_not_in_alphabet
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder [as alias]
def test_encodable_when_not_in_alphabet(self):
  corpus = "the quick brown fox jumps over the lazy dog"
  token_counts = collections.Counter(corpus.split(" "))
  encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
      100, token_counts, 2, 10)
  original = "This has UPPER CASE letters that are out of alphabet"

  # Early versions could have an infinite loop when breaking into subtokens
  # if there were any out-of-alphabet characters in the encoded string.
  encoded = encoder.encode(original)

  decoded = encoder.decode(encoded)
  self.assertEqual(original, decoded)
  encoded_str = "".join(encoder.all_subtoken_strings[i] for i in encoded)
  self.assertIn("\\84;", encoded_str)
Example 3: test_reserved_token_chars_not_in_alphabet
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder [as alias]
def test_reserved_token_chars_not_in_alphabet(self):
  corpus = "dog"
  token_counts = collections.Counter(corpus.split(" "))
  encoder1 = text_encoder.SubwordTextEncoder.build_to_target_size(
      100, token_counts, 2, 100)
  filename = os.path.join(self.test_temp_dir, "out.voc")
  encoder1.store_to_file(filename)
  encoder2 = text_encoder.SubwordTextEncoder(filename=filename)

  self.assertEqual(encoder1._alphabet, encoder2._alphabet)

  for t in text_encoder.RESERVED_TOKENS:
    for c in t:
      # Verify that encoders can encode all reserved token chars.
      encoder1.encode(c)
      encoder2.encode(c)
Example 4: test_save_and_reload
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder [as alias]
def test_save_and_reload(self):
  corpus = "the quick brown fox jumps over the lazy dog"
  token_counts = collections.Counter(corpus.split(" "))

  # Deliberately exclude some required encoding chars from the alphabet
  # and token list, making some strings unencodable.
  encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
      100, token_counts, 2, 10)

  filename = os.path.join(self.test_temp_dir, "out.voc")
  encoder.store_to_file(filename)
  new_encoder = text_encoder.SubwordTextEncoder(filename)

  self.assertEqual(encoder._alphabet, new_encoder._alphabet)
  self.assertEqual(encoder.all_subtoken_strings,
                   new_encoder.all_subtoken_strings)
  self.assertEqual(encoder._subtoken_string_to_id,
                   new_encoder._subtoken_string_to_id)
  self.assertEqual(encoder._max_subtoken_len, new_encoder._max_subtoken_len)
Example 5: test_save_and_reload_no_single_quotes
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder [as alias]
def test_save_and_reload_no_single_quotes(self):
  corpus = "the quick brown fox jumps over the lazy dog"
  token_counts = collections.Counter(corpus.split(" "))

  # Deliberately exclude some required encoding chars from the alphabet
  # and token list, making some strings unencodable.
  encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
      100, token_counts, 2, 10)

  filename = os.path.join(self.test_temp_dir, "out.voc")
  encoder.store_to_file(filename, add_single_quotes=False)
  new_encoder = text_encoder.SubwordTextEncoder(filename)

  self.assertEqual(encoder._alphabet, new_encoder._alphabet)
  self.assertEqual(encoder.all_subtoken_strings,
                   new_encoder.all_subtoken_strings)
  self.assertEqual(encoder._subtoken_string_to_id,
                   new_encoder._subtoken_string_to_id)
  self.assertEqual(encoder._max_subtoken_len, new_encoder._max_subtoken_len)
Example 6: test_build_from_generator
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder [as alias]
def test_build_from_generator(self):
  corpus = "The quick brown fox jumps over the lazy dog"

  def gen():
    for _ in range(3):
      yield corpus

  start_symbol = "<S>"
  end_symbol = "<E>"
  reserved_tokens = text_encoder.RESERVED_TOKENS + [start_symbol, end_symbol]
  encoder = text_encoder.SubwordTextEncoder.build_from_generator(
      gen(), 10, reserved_tokens=reserved_tokens)

  # Make sure that reserved tokens appear in the right places.
  self.assertEqual(encoder.decode([2]), start_symbol)
  self.assertEqual(encoder.decode([3]), end_symbol)
  self.assertEqual("hi%s" % start_symbol,
                   encoder.decode(encoder.encode("hi") + [2]))

  # Make sure that we haven't messed up the ability to reconstruct.
  reconstructed_corpus = encoder.decode(encoder.encode(corpus))
  self.assertEqual(corpus, reconstructed_corpus)
Example 7: _get_or_generate_vocab
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder [as alias]
def _get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
  """Read or create vocabulary."""
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  print('Vocab file written to: ' + vocab_filepath)

  if tf.gfile.Exists(vocab_filepath):
    gs = text_encoder.SubwordTextEncoder(vocab_filepath)
    return gs
  example_file = os.path.join(tmp_dir, _EXAMPLES_FILE)
  gs = text_encoder.SubwordTextEncoder()
  token_counts = tokenizer.corpus_token_counts(
      example_file, corpus_max_lines=1000000)
  gs = gs.build_to_target_size(
      vocab_size, token_counts, min_val=1, max_val=1e3)
  gs.store_to_file(vocab_filepath)
  return gs
Example 8: vocab_type
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder [as alias]
@property
def vocab_type(self):
  """What kind of vocabulary to use.

  `VocabType`s:
    * `SUBWORD`: `SubwordTextEncoder`, an invertible wordpiece vocabulary.
      Must provide `self.approx_vocab_size`. Generates the vocabulary based on
      the training data. To limit the number of samples the vocab generation
      looks at, override `self.max_samples_for_vocab`. Recommended and
      default.
    * `CHARACTER`: `ByteTextEncoder`, encode raw bytes.
    * `TOKEN`: `TokenTextEncoder`, vocabulary based on a file. Must provide a
      vocabulary file yourself (`TokenTextEncoder.store_to_file`) because one
      will not be generated for you. The vocab file should be stored in
      `data_dir/` with the name specified by `self.vocab_filename`.

  Returns:
    VocabType constant
  """
  return VocabType.SUBWORD
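As a hedged illustration of how this property is typically overridden (the subclass name and vocab filename below are made up; Text2TextProblem and VocabType are assumed to come from tensor2tensor.data_generators.text_problems), a problem that ships its own token vocabulary instead of generating subwords could look like this:

from tensor2tensor.data_generators import text_problems


class MyTokenVocabProblem(text_problems.Text2TextProblem):  # hypothetical subclass
  """Sketch of a problem that uses a pre-built token vocabulary."""

  @property
  def vocab_type(self):
    # Switch from the SUBWORD default to a file-based TOKEN vocabulary.
    return text_problems.VocabType.TOKEN

  @property
  def vocab_filename(self):
    # Illustrative name; the file must already exist under data_dir.
    return "vocab.my_problem.tokens"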
Example 9: get_or_create_vocab
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder [as alias]
def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
  if self.vocab_type == VocabType.CHARACTER:
    encoder = text_encoder.ByteTextEncoder()
  elif self.vocab_type == VocabType.SUBWORD:
    if force_get:
      vocab_filepath = os.path.join(data_dir, self.vocab_filename)
      encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
    else:
      encoder = generator_utils.get_or_generate_vocab_inner(
          data_dir, self.vocab_filename, self.approx_vocab_size,
          self.generate_text_for_vocab(data_dir, tmp_dir),
          max_subtoken_length=self.max_subtoken_length,
          reserved_tokens=(
              text_encoder.RESERVED_TOKENS + self.additional_reserved_tokens))
  elif self.vocab_type == VocabType.TOKEN:
    vocab_filename = os.path.join(data_dir, self.vocab_filename)
    encoder = text_encoder.TokenTextEncoder(vocab_filename,
                                            replace_oov=self.oov_token)
  else:
    raise ValueError(
        "Unrecognized VocabType: %s" % str(self.vocab_type))
  return encoder
Example 10: main
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder [as alias]
def main(unused_argv):
  if FLAGS.corpus_filepattern and FLAGS.vocab_filepattern:
    raise ValueError(
        'Must only provide one of --corpus_filepattern or --vocab_filepattern')
  elif FLAGS.corpus_filepattern:
    token_counts = tokenizer.corpus_token_counts(
        FLAGS.corpus_filepattern,
        FLAGS.corpus_max_lines,
        split_on_newlines=FLAGS.split_on_newlines)
  elif FLAGS.vocab_filepattern:
    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
                                                FLAGS.corpus_max_lines)
  else:
    raise ValueError(
        'Must provide one of --corpus_filepattern or --vocab_filepattern')

  encoder = text_encoder.SubwordTextEncoder()
  encoder.build_from_token_counts(token_counts, FLAGS.min_count,
                                  FLAGS.num_iterations)
  encoder.store_to_file(FLAGS.output_filename)
Example 11: get_encoder_from_vocab
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder [as alias]
def get_encoder_from_vocab(vocab_filepath):
  """Get encoder from vocab file.

  If vocab is not found in output dir, it will be copied there by
  copy_vocab_to_output_dir to clarify the vocab used to generate the data.

  Args:
    vocab_filepath: path to vocab, either local or cns

  Returns:
    A SubwordTextEncoder vocabulary object. None if the output_parallel_text
    is set.
  """
  if not tf.gfile.Exists(vocab_filepath):
    raise ValueError("Vocab file does not exist: {}.".format(vocab_filepath))

  tf.logging.info("Found vocab file: %s", vocab_filepath)
  encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
  return encoder
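A minimal usage sketch for this helper; the path below is illustrative and assumes a vocab file previously written with SubwordTextEncoder.store_to_file:

# Hypothetical path; replace with the vocab file produced for your dataset.
encoder = get_encoder_from_vocab("/tmp/t2t_data/vocab.subwords")
ids = encoder.encode("the quick brown fox")
assert encoder.decode(ids) == "the quick brown fox"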
Example 12: test_load_from_file
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder [as alias]
def test_load_from_file(self):
  # Test a vocab file with words not wrapped with single quotes
  encoder = text_encoder.SubwordTextEncoder()
  correct_vocab = ["the", "and", "of"]
  vocab = io.StringIO("the\n"
                      "and\n"
                      "of\n")
  encoder._load_from_file_object(vocab)
  self.assertAllEqual(encoder.all_subtoken_strings, correct_vocab)

  # Test a vocab file with words wrapped in single quotes
  encoder = text_encoder.SubwordTextEncoder()
  vocab = io.StringIO("\"the\"\n"
                      "\"and\"\n"
                      "\"of\"\n")
  encoder._load_from_file_object(vocab)
  self.assertAllEqual(encoder.all_subtoken_strings, correct_vocab)
Example 13: get_or_create_vocab
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder [as alias]
def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
  if self.vocab_type == VocabType.CHARACTER:
    encoder = text_encoder.ByteTextEncoder()
  elif self.vocab_type == VocabType.SUBWORD:
    if force_get:
      vocab_filepath = os.path.join(data_dir, self.vocab_filename)
      encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
    else:
      other_problem = self.use_vocab_from_other_problem
      if other_problem:
        return other_problem.get_or_create_vocab(data_dir, tmp_dir, force_get)
      encoder = generator_utils.get_or_generate_vocab_inner(
          data_dir, self.vocab_filename, self.approx_vocab_size,
          self.generate_text_for_vocab(data_dir, tmp_dir),
          max_subtoken_length=self.max_subtoken_length,
          reserved_tokens=(
              text_encoder.RESERVED_TOKENS + self.additional_reserved_tokens))
  elif self.vocab_type == VocabType.TOKEN:
    vocab_filename = os.path.join(data_dir, self.vocab_filename)
    encoder = text_encoder.TokenTextEncoder(vocab_filename,
                                            replace_oov=self.oov_token)
  else:
    raise ValueError(
        "Unrecognized VocabType: %s" % str(self.vocab_type))
  return encoder