This article collects typical usage examples of the Python method allennlp.data.Vocabulary.get_token_from_index. If you are unsure what Vocabulary.get_token_from_index does, how to call it, or where to find examples, the curated code samples below may help. You can also read more about its containing class, allennlp.data.Vocabulary.
The following shows 2 code examples of Vocabulary.get_token_from_index, ordered by popularity by default.
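Before the examples, here is a minimal stand-alone sketch (not taken from either example below) of the round trip between get_token_index and get_token_from_index, assuming only the standard allennlp.data.Vocabulary API:
# Minimal sketch, assuming the standard allennlp.data.Vocabulary API.
from allennlp.data import Vocabulary

vocab = Vocabulary()
index = vocab.add_token_to_namespace("hello", namespace="tokens")
assert vocab.get_token_from_index(index, namespace="tokens") == "hello"
assert vocab.get_token_index("hello", namespace="tokens") == index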
Example 1: _get_vocab_index_mapping
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_token_from_index [as alias]
# Also needed for the type annotations: from typing import List, Tuple
def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
    vocab_index_mapping: List[Tuple[int, int]] = []
    for index in range(self.vocab.get_vocab_size(namespace='tokens')):
        token = self.vocab.get_token_from_index(index=index, namespace='tokens')
        archived_token_index = archived_vocab.get_token_index(token, namespace='tokens')
        # Checking if we got the UNK token index, because we don't want all new token
        # representations initialized to UNK token's representation. We do that by checking if
        # the two tokens are the same. They will not be if the token at the archived index is
        # UNK.
        if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token:
            vocab_index_mapping.append((index, archived_token_index))
    return vocab_index_mapping
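The returned (new_index, archived_index) pairs are typically used to copy rows of an archived model's embedding weight into a freshly initialized one. A hypothetical sketch of that step (copy_overlapping_rows, new_weight, and archived_weight are illustrative names, not from the source):
# Hypothetical illustration only: reuse archived embedding rows for tokens that
# exist (as non-UNK entries) in both vocabularies.
import torch

def copy_overlapping_rows(new_weight: torch.Tensor,
                          archived_weight: torch.Tensor,
                          vocab_index_mapping) -> torch.Tensor:
    for new_index, archived_index in vocab_index_mapping:
        new_weight[new_index] = archived_weight[archived_index]
    return new_weight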
Example 2: _read_pretrained_word2vec_format_embedding_file
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_token_from_index [as alias]
# Also needed: gzip, numpy, torch, a module-level logger, plus
# from allennlp.common.file_utils import cached_path
# from allennlp.common.checks import ConfigurationError
def _read_pretrained_word2vec_format_embedding_file(embeddings_filename: str, # pylint: disable=invalid-name
                                                    embedding_dim: int,
                                                    vocab: Vocabulary,
                                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read from a gzipped-word2vec format file. The embeddings file is assumed to be gzipped and
    space delimited, e.g. [word] [dim 1] [dim 2] ...

    The remainder of the docstring is identical to ``_read_pretrained_embedding_file``.
    """
    words_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file")
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            if len(fields) - 1 != embedding_dim:
                # Sometimes there are funny unicode parsing problems that lead to different
                # fields lengths (e.g., a word with a unicode space character that splits
                # into more than one column). We skip those lines. Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped. It's hard to check for that here; you just have to look in the
                # embedding_misses_file and at the model summary to make sure things look
                # like they are supposed to.
                logger.warning("Found line with wrong number of dimensions (expected %d, was %d): %s",
                               embedding_dim, len(fields) - 1, line)
                continue
            word = fields[0]
            if word in words_to_keep:
                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[word] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)

    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug("Word %s was not found in the embedding file. Initialising randomly.", word)

    # The weight matrix is initialized, so we construct and return the actual Embedding.
    return embedding_matrix
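A hedged usage sketch for the function above; the file name and embedding_dim are placeholder values, and vocab is assumed to be an already-populated Vocabulary:
# Placeholder values: "glove.word2vec.txt.gz" and embedding_dim=300 are illustrative.
weight = _read_pretrained_word2vec_format_embedding_file("glove.word2vec.txt.gz",
                                                         embedding_dim=300,
                                                         vocab=vocab,
                                                         namespace="tokens")
embedding_layer = torch.nn.Embedding(num_embeddings=weight.size(0),
                                     embedding_dim=weight.size(1))
embedding_layer.weight.data.copy_(weight)  # load the pre-trained rows into the layer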