

Python text_encoder.RESERVED_TOKENS Attribute Code Examples

This article collects typical usage examples of the Python attribute tensor2tensor.data_generators.text_encoder.RESERVED_TOKENS, drawn from open-source projects. If you are unsure what RESERVED_TOKENS is for or how to use it, the curated examples below should help; for broader context, see the other members of the tensor2tensor.data_generators.text_encoder module.


The following presents 9 code examples of text_encoder.RESERVED_TOKENS, sorted by popularity.
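For reference, RESERVED_TOKENS holds the special symbols that occupy the lowest ids in every encoder. A minimal sketch of its definition, matching the upstream tensor2tensor source at the time of writing:

# From tensor2tensor/data_generators/text_encoder.py (abridged).
PAD = "<pad>"
EOS = "<EOS>"
RESERVED_TOKENS = [PAD, EOS]
NUM_RESERVED_TOKENS = len(RESERVED_TOKENS)  # 2
PAD_ID = RESERVED_TOKENS.index(PAD)  # Normally 0.
EOS_ID = RESERVED_TOKENS.index(EOS)  # Normally 1.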

Example 1: test_custom_reserved_tokens

# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import RESERVED_TOKENS [as alias]
def test_custom_reserved_tokens(self):
    """Test that we can pass custom reserved tokens to SubwordTextEncoder."""
    corpus = "The quick brown fox jumps over the lazy dog"
    token_counts = collections.Counter(corpus.split(" "))

    start_symbol = "<S>"
    end_symbol = "<E>"
    reserved_tokens = text_encoder.RESERVED_TOKENS + [start_symbol,
                                                      end_symbol]
    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        10, token_counts, 2, 10, reserved_tokens=reserved_tokens)

    # Make sure that reserved tokens appear in the right places.
    self.assertEqual(encoder.decode([2]), start_symbol)
    self.assertEqual(encoder.decode([3]), end_symbol)

    # Make sure that we haven't messed up the ability to reconstruct.
    reconstructed_corpus = encoder.decode(encoder.encode(corpus))
    self.assertEqual(corpus, reconstructed_corpus) 
Author: akzaidi, Project: fine-lm, Lines: 21, Source: text_encoder_test.py
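Since the two default reserved tokens take ids 0 and 1, the custom symbols appended to RESERVED_TOKENS land at ids 2 and 3, which is exactly what the decode assertions above check. A minimal standalone illustration, assuming the default RESERVED_TOKENS:

# Custom tokens simply follow the defaults in the id space.
reserved = ["<pad>", "<EOS>", "<S>", "<E>"]
assert reserved.index("<S>") == 2
assert reserved.index("<E>") == 3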

Example 2: test_reserved_token_chars_not_in_alphabet

# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import RESERVED_TOKENS [as alias]
def test_reserved_token_chars_not_in_alphabet(self):
    corpus = "dog"
    token_counts = collections.Counter(corpus.split(" "))
    encoder1 = text_encoder.SubwordTextEncoder.build_to_target_size(
        100, token_counts, 2, 100)
    filename = os.path.join(self.test_temp_dir, "out.voc")
    encoder1.store_to_file(filename)
    encoder2 = text_encoder.SubwordTextEncoder(filename=filename)

    self.assertEqual(encoder1._alphabet, encoder2._alphabet)

    for t in text_encoder.RESERVED_TOKENS:
      for c in t:
        # Verify that encoders can encode all reserved token chars.
        encoder1.encode(c)
        encoder2.encode(c) 
Author: akzaidi, Project: fine-lm, Lines: 18, Source: text_encoder_test.py
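The round trip through store_to_file is what makes this test interesting: characters that appear only in the reserved tokens (such as "<" and ">") are not in the corpus alphabet, so the test verifies that both the original and the reloaded encoder can still encode them.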

Example 3: test_build_from_generator

# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import RESERVED_TOKENS [as alias]
def test_build_from_generator(self):

    corpus = "The quick brown fox jumps over the lazy dog"

    def gen():
      for _ in range(3):
        yield corpus

    start_symbol = "<S>"
    end_symbol = "<E>"
    reserved_tokens = text_encoder.RESERVED_TOKENS + [start_symbol,
                                                      end_symbol]
    encoder = text_encoder.SubwordTextEncoder.build_from_generator(
        gen(), 10, reserved_tokens=reserved_tokens)

    # Make sure that reserved tokens appear in the right places.
    self.assertEqual(encoder.decode([2]), start_symbol)
    self.assertEqual(encoder.decode([3]), end_symbol)

    self.assertEqual("hi%s" % start_symbol,
                     encoder.decode(encoder.encode("hi") + [2]))

    # Make sure that we haven't messed up the ability to reconstruct.
    reconstructed_corpus = encoder.decode(encoder.encode(corpus))
    self.assertEqual(corpus, reconstructed_corpus) 
Author: akzaidi, Project: fine-lm, Lines: 27, Source: text_encoder_test.py
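Note the assertion on encoder.encode("hi") + [2]: appending a reserved id directly to an encoded sequence is a common way to splice control symbols such as <S> into an id stream before decoding.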

Example 4: get_or_create_vocab

# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import RESERVED_TOKENS [as alias]
def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
    if self.vocab_type == VocabType.CHARACTER:
      encoder = text_encoder.ByteTextEncoder()
    elif self.vocab_type == VocabType.SUBWORD:
      if force_get:
        vocab_filepath = os.path.join(data_dir, self.vocab_filename)
        encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
      else:
        encoder = generator_utils.get_or_generate_vocab_inner(
            data_dir, self.vocab_filename, self.approx_vocab_size,
            self.generate_text_for_vocab(data_dir, tmp_dir),
            max_subtoken_length=self.max_subtoken_length,
            reserved_tokens=(
                text_encoder.RESERVED_TOKENS + self.additional_reserved_tokens))
    elif self.vocab_type == VocabType.TOKEN:
      vocab_filename = os.path.join(data_dir, self.vocab_filename)
      encoder = text_encoder.TokenTextEncoder(vocab_filename,
                                              replace_oov=self.oov_token)
    else:
      raise ValueError(
          "Unrecognized VocabType: %s" % str(self.vocab_type))
    return encoder 
Author: akzaidi, Project: fine-lm, Lines: 24, Source: text_problems.py
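Here additional_reserved_tokens is a property of the Text2TextProblem class (it defaults to an empty list). A minimal sketch of a hypothetical subclass that reserves two extra control symbols, assuming the standard Text2TextProblem API:

from tensor2tensor.data_generators import text_problems

class MyProblem(text_problems.Text2TextProblem):
  """Hypothetical problem reserving two extra control symbols."""

  @property
  def additional_reserved_tokens(self):
    # These follow <pad> and <EOS>, so they receive ids 2 and 3.
    return ["<S>", "<E>"]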

Example 5: get_or_create_vocab

# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import RESERVED_TOKENS [as alias]
def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
    if self.vocab_type == VocabType.CHARACTER:
      encoder = text_encoder.ByteTextEncoder()
    elif self.vocab_type == VocabType.SUBWORD:
      if force_get:
        vocab_filepath = os.path.join(data_dir, self.vocab_filename)
        encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
      else:
        other_problem = self.use_vocab_from_other_problem
        if other_problem:
          return other_problem.get_or_create_vocab(data_dir, tmp_dir, force_get)
        encoder = generator_utils.get_or_generate_vocab_inner(
            data_dir, self.vocab_filename, self.approx_vocab_size,
            self.generate_text_for_vocab(data_dir, tmp_dir),
            max_subtoken_length=self.max_subtoken_length,
            reserved_tokens=(
                text_encoder.RESERVED_TOKENS + self.additional_reserved_tokens))
    elif self.vocab_type == VocabType.TOKEN:
      vocab_filename = os.path.join(data_dir, self.vocab_filename)
      encoder = text_encoder.TokenTextEncoder(vocab_filename,
                                              replace_oov=self.oov_token)
    else:
      raise ValueError(
          "Unrecognized VocabType: %s" % str(self.vocab_type))
    return encoder 
Author: tensorflow, Project: tensor2tensor, Lines: 27, Source: text_problems.py
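Compared with Example 4, this later upstream version first consults use_vocab_from_other_problem, so a problem can delegate vocabulary creation and share one SubwordTextEncoder with another problem instead of generating a second vocab file.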

Example 6: _init_vocab_from_list

# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import RESERVED_TOKENS [as alias]
def _init_vocab_from_list(self, vocab_list):
    """Initialize symbols from a list of symbols.

    It is ok if reserved symbols appear in the vocab list. They will be
    removed. The set of symbols in vocab_list should be unique.

    Args:
      vocab_list: A list of symbols.
    """
    def sym_gen():
      """Symbols generator for vocab initializer from list."""
      for sym in vocab_list:
        if sym not in text_encoder.RESERVED_TOKENS:
          yield sym

    self._init_vocab(sym_gen()) 
Author: steveash, Project: NETransliteration-COLING2018, Lines: 18, Source: g2p_encoder.py
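The generator filters reserved symbols out of vocab_list because _init_vocab (Example 8 below) adds them back at the front; without the filter, <pad> and <EOS> would appear twice. A standalone illustration of the filtering, assuming the default RESERVED_TOKENS:

vocab_list = ["<pad>", "<EOS>", "AA", "AE"]
reserved = ["<pad>", "<EOS>"]
filtered = [sym for sym in vocab_list if sym not in reserved]
print(filtered)  # ['AA', 'AE'] -- the reserved symbols are re-added later.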

Example 7: decode

# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import RESERVED_TOKENS [as alias]
def decode(self, ids, strip_extraneous=False):
    bases = []
    for idx in ids:
      if idx >= self._num_reserved_ids:
        chunk = self._ids_to_tokens[idx]
        if self.PAD in chunk:
          chunk = chunk[:chunk.index(self.PAD)]
      else:
        if strip_extraneous:
          continue
        chunk = [text_encoder.RESERVED_TOKENS[idx]]
      bases.extend(chunk)
    return "".join(bases) 
Author: akzaidi, Project: fine-lm, Lines: 15, Source: dna_encoder.py
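Ids below _num_reserved_ids have no entry in _ids_to_tokens; they decode to the reserved strings themselves, or are skipped entirely when strip_extraneous is set.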

Example 8: _init_vocab

# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import RESERVED_TOKENS [as alias]
def _init_vocab(self, sym_generator, add_reserved_symbols=True):
    """Initialize vocabulary with sym from sym_generator."""

    self._id_to_sym = {}
    non_reserved_start_index = 0

    if add_reserved_symbols:
      self._id_to_sym.update(enumerate(text_encoder.RESERVED_TOKENS))
      non_reserved_start_index = len(text_encoder.RESERVED_TOKENS)

    self._id_to_sym.update(
        enumerate(sym_generator, start=non_reserved_start_index))

    # _sym_to_id is the reverse of _id_to_sym
    self._sym_to_id = dict((v, k) for k, v in six.iteritems(self._id_to_sym)) 
Author: steveash, Project: NETransliteration-COLING2018, Lines: 17, Source: g2p_encoder.py
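With add_reserved_symbols=True and the default two reserved tokens, the reserved symbols occupy ids 0 and 1 and generated symbols start at id 2. A minimal standalone sketch of the resulting mappings:

id_to_sym = {}
id_to_sym.update(enumerate(["<pad>", "<EOS>"]))
id_to_sym.update(enumerate(["AA", "AE"], start=2))
sym_to_id = {v: k for k, v in id_to_sym.items()}
print(id_to_sym)  # {0: '<pad>', 1: '<EOS>', 2: 'AA', 3: 'AE'}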

Example 9: get_tag_id

# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import RESERVED_TOKENS [as alias]
def get_tag_id(tag):
  """Given the tag string, returns its index in the vocabulary."""
  index = LANG_TAGS.index(tag)
  # Adjust index to account for the tokens reserved by text_encoder.
  index += len(text_encoder.RESERVED_TOKENS)
  return index 
Author: google-research, Project: language, Lines: 8, Source: translate_multilingual.py
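LANG_TAGS is a module-level list defined elsewhere in translate_multilingual.py. A standalone sketch of the offset arithmetic, using a hypothetical tag list and the default two reserved tokens:

# Hypothetical stand-ins for the module-level constants.
LANG_TAGS = ["<en>", "<fr>", "<de>"]
RESERVED_TOKENS = ["<pad>", "<EOS>"]

def get_tag_id(tag):
  return LANG_TAGS.index(tag) + len(RESERVED_TOKENS)

print(get_tag_id("<fr>"))  # 3 -- ids 0 and 1 are reserved, so tags start at 2.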


Note: The tensor2tensor.data_generators.text_encoder.RESERVED_TOKENS examples in this article were compiled from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from community open-source projects; copyright remains with the original authors, and any use or redistribution should follow the corresponding project's license. Do not reproduce without permission.