本文整理汇总了Python中tensor2tensor.data_generators.text_encoder.RESERVED_TOKENS属性的典型用法代码示例。如果您正苦于以下问题:Python text_encoder.RESERVED_TOKENS属性的具体用法?Python text_encoder.RESERVED_TOKENS怎么用?Python text_encoder.RESERVED_TOKENS使用的例子?那么恭喜您, 这里精选的属性代码示例或许可以为您提供帮助。您也可以进一步了解该属性所在模块 tensor2tensor.data_generators.text_encoder 的用法示例。
在下文中一共展示了text_encoder.RESERVED_TOKENS属性的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_custom_reserved_tokens
# 需要导入模块: from tensor2tensor.data_generators import text_encoder [as 别名]
# 或者: from tensor2tensor.data_generators.text_encoder import RESERVED_TOKENS [as 别名]
def test_custom_reserved_tokens(self):
  """Test that we can pass custom reserved tokens to SubwordTextEncoder."""
  text = "The quick brown fox jumps over the lazy dog"
  counts = collections.Counter(text.split(" "))
  extra_symbols = ["<S>", "<E>"]
  all_reserved = text_encoder.RESERVED_TOKENS + extra_symbols
  encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
      10, counts, 2, 10, reserved_tokens=all_reserved)
  # Custom reserved tokens should occupy the ids right after the defaults.
  self.assertEqual(encoder.decode([2]), extra_symbols[0])
  self.assertEqual(encoder.decode([3]), extra_symbols[1])
  # Encoding followed by decoding must still reproduce the corpus exactly.
  round_tripped = encoder.decode(encoder.encode(text))
  self.assertEqual(text, round_tripped)
示例2: test_reserved_token_chars_not_in_alphabet
# 需要导入模块: from tensor2tensor.data_generators import text_encoder [as 别名]
# 或者: from tensor2tensor.data_generators.text_encoder import RESERVED_TOKENS [as 别名]
def test_reserved_token_chars_not_in_alphabet(self):
  """Reserved-token chars stay encodable after a store/load round trip."""
  counts = collections.Counter("dog".split(" "))
  built = text_encoder.SubwordTextEncoder.build_to_target_size(
      100, counts, 2, 100)
  vocab_path = os.path.join(self.test_temp_dir, "out.voc")
  built.store_to_file(vocab_path)
  restored = text_encoder.SubwordTextEncoder(filename=vocab_path)
  self.assertEqual(built._alphabet, restored._alphabet)
  # Every character of every reserved token must be encodable by both
  # the freshly built encoder and the one restored from disk.
  for token in text_encoder.RESERVED_TOKENS:
    for char in token:
      built.encode(char)
      restored.encode(char)
示例3: test_build_from_generator
# 需要导入模块: from tensor2tensor.data_generators import text_encoder [as 别名]
# 或者: from tensor2tensor.data_generators.text_encoder import RESERVED_TOKENS [as 别名]
def test_build_from_generator(self):
  """build_from_generator honors custom reserved tokens and round trips."""
  text = "The quick brown fox jumps over the lazy dog"

  def repeated_corpus():
    for _ in range(3):
      yield text

  extra_symbols = ["<S>", "<E>"]
  all_reserved = text_encoder.RESERVED_TOKENS + extra_symbols
  encoder = text_encoder.SubwordTextEncoder.build_from_generator(
      repeated_corpus(), 10, reserved_tokens=all_reserved)
  # Custom reserved tokens should occupy the ids right after the defaults.
  self.assertEqual(encoder.decode([2]), extra_symbols[0])
  self.assertEqual(encoder.decode([3]), extra_symbols[1])
  self.assertEqual("hi%s" % extra_symbols[0],
                   encoder.decode(encoder.encode("hi") + [2]))
  # Encoding followed by decoding must still reproduce the corpus exactly.
  round_tripped = encoder.decode(encoder.encode(text))
  self.assertEqual(text, round_tripped)
示例4: get_or_create_vocab
# 需要导入模块: from tensor2tensor.data_generators import text_encoder [as 别名]
# 或者: from tensor2tensor.data_generators.text_encoder import RESERVED_TOKENS [as 别名]
def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
  """Return the encoder for this problem's vocab, generating it if needed.

  Args:
    data_dir: directory where the vocab file lives (or will be written).
    tmp_dir: scratch directory passed to generate_text_for_vocab.
    force_get: if True, load an existing subword vocab file rather than
      (re)generating one.

  Returns:
    A text_encoder encoder instance matching self.vocab_type.

  Raises:
    ValueError: if self.vocab_type is not a recognized VocabType.
  """
  if self.vocab_type == VocabType.CHARACTER:
    return text_encoder.ByteTextEncoder()

  if self.vocab_type == VocabType.SUBWORD:
    if force_get:
      vocab_filepath = os.path.join(data_dir, self.vocab_filename)
      return text_encoder.SubwordTextEncoder(vocab_filepath)
    reserved = text_encoder.RESERVED_TOKENS + self.additional_reserved_tokens
    return generator_utils.get_or_generate_vocab_inner(
        data_dir, self.vocab_filename, self.approx_vocab_size,
        self.generate_text_for_vocab(data_dir, tmp_dir),
        max_subtoken_length=self.max_subtoken_length,
        reserved_tokens=reserved)

  if self.vocab_type == VocabType.TOKEN:
    vocab_filename = os.path.join(data_dir, self.vocab_filename)
    return text_encoder.TokenTextEncoder(vocab_filename,
                                         replace_oov=self.oov_token)

  raise ValueError("Unrecognized VocabType: %s" % str(self.vocab_type))
示例5: get_or_create_vocab
# 需要导入模块: from tensor2tensor.data_generators import text_encoder [as 别名]
# 或者: from tensor2tensor.data_generators.text_encoder import RESERVED_TOKENS [as 别名]
def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
  """Return the encoder for this problem's vocab, generating it if needed.

  Args:
    data_dir: directory where the vocab file lives (or will be written).
    tmp_dir: scratch directory passed to generate_text_for_vocab.
    force_get: if True, load an existing subword vocab file rather than
      (re)generating one.

  Returns:
    A text_encoder encoder instance matching self.vocab_type.

  Raises:
    ValueError: if self.vocab_type is not a recognized VocabType.
  """
  if self.vocab_type == VocabType.CHARACTER:
    return text_encoder.ByteTextEncoder()

  if self.vocab_type == VocabType.SUBWORD:
    if force_get:
      vocab_filepath = os.path.join(data_dir, self.vocab_filename)
      return text_encoder.SubwordTextEncoder(vocab_filepath)
    delegate = self.use_vocab_from_other_problem
    if delegate:
      # Reuse another problem's vocabulary instead of building our own.
      return delegate.get_or_create_vocab(data_dir, tmp_dir, force_get)
    reserved = text_encoder.RESERVED_TOKENS + self.additional_reserved_tokens
    return generator_utils.get_or_generate_vocab_inner(
        data_dir, self.vocab_filename, self.approx_vocab_size,
        self.generate_text_for_vocab(data_dir, tmp_dir),
        max_subtoken_length=self.max_subtoken_length,
        reserved_tokens=reserved)

  if self.vocab_type == VocabType.TOKEN:
    vocab_filename = os.path.join(data_dir, self.vocab_filename)
    return text_encoder.TokenTextEncoder(vocab_filename,
                                         replace_oov=self.oov_token)

  raise ValueError("Unrecognized VocabType: %s" % str(self.vocab_type))
示例6: _init_vocab_from_list
# 需要导入模块: from tensor2tensor.data_generators import text_encoder [as 别名]
# 或者: from tensor2tensor.data_generators.text_encoder import RESERVED_TOKENS [as 别名]
def _init_vocab_from_list(self, vocab_list):
  """Initialize symbols from a list of symbols.

  It is ok if reserved symbols appear in the vocab list. They will be
  removed. The set of symbols in vocab_list should be unique.

  Args:
    vocab_list: A list of symbols.
  """
  # Lazily filter out reserved symbols; _init_vocab prepends them itself.
  non_reserved = (sym for sym in vocab_list
                  if sym not in text_encoder.RESERVED_TOKENS)
  self._init_vocab(non_reserved)
示例7: decode
# 需要导入模块: from tensor2tensor.data_generators import text_encoder [as 别名]
# 或者: from tensor2tensor.data_generators.text_encoder import RESERVED_TOKENS [as 别名]
def decode(self, ids, strip_extraneous=False):
  """Convert a sequence of ids back into a string of bases.

  Args:
    ids: iterable of integer token ids.
    strip_extraneous: if True, reserved ids are dropped from the output
      instead of being rendered as their reserved-token text.

  Returns:
    The decoded string.
  """
  pieces = []
  for token_id in ids:
    if token_id < self._num_reserved_ids:
      # Reserved id: render its token text unless the caller strips it.
      if strip_extraneous:
        continue
      pieces.append(text_encoder.RESERVED_TOKENS[token_id])
    else:
      symbols = self._ids_to_tokens[token_id]
      # PAD marks the end of meaningful bases within a chunk.
      if self.PAD in symbols:
        symbols = symbols[:symbols.index(self.PAD)]
      pieces.extend(symbols)
  return "".join(pieces)
示例8: _init_vocab
# 需要导入模块: from tensor2tensor.data_generators import text_encoder [as 别名]
# 或者: from tensor2tensor.data_generators.text_encoder import RESERVED_TOKENS [as 别名]
def _init_vocab(self, sym_generator, add_reserved_symbols=True):
"""Initialize vocabulary with sym from sym_generator."""
self._id_to_sym = {}
non_reserved_start_index = 0
if add_reserved_symbols:
self._id_to_sym.update(enumerate(text_encoder.RESERVED_TOKENS))
non_reserved_start_index = len(text_encoder.RESERVED_TOKENS)
self._id_to_sym.update(
enumerate(sym_generator, start=non_reserved_start_index))
# _sym_to_id is the reverse of _id_to_sym
self._sym_to_id = dict((v, k) for k, v in six.iteritems(self._id_to_sym))
示例9: get_tag_id
# 需要导入模块: from tensor2tensor.data_generators import text_encoder [as 别名]
# 或者: from tensor2tensor.data_generators.text_encoder import RESERVED_TOKENS [as 别名]
def get_tag_id(tag):
  """Given the tag string, returns its index in the vocabulary."""
  # Shift past the ids that text_encoder reserves at the front of the vocab.
  return LANG_TAGS.index(tag) + len(text_encoder.RESERVED_TOKENS)