This article collects typical usage examples of the Python method allennlp.data.Vocabulary.get_index_to_token_vocabulary. If you are unsure what Vocabulary.get_index_to_token_vocabulary does, how to call it, or what it looks like in practice, the hand-picked code examples of the method below should help. You can also look further into usage examples of its enclosing class, allennlp.data.Vocabulary.
The following shows 2 code examples of the Vocabulary.get_index_to_token_vocabulary method, sorted by popularity by default.
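Before the full examples, here is a minimal sketch (not taken from the examples below, so the tokens and the "tokens" namespace are purely illustrative) of what get_index_to_token_vocabulary returns: a dict mapping each integer index in a namespace back to its token string.

from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_token_to_namespace("cat", namespace="tokens")
vocab.add_token_to_namespace("dog", namespace="tokens")

# The default "tokens" namespace reserves index 0 for padding and index 1 for the
# unknown-word token, so this prints something like:
#   {0: '@@PADDING@@', 1: '@@UNKNOWN@@', 2: 'cat', 3: 'dog'}
print(vocab.get_index_to_token_vocabulary("tokens"))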
Example 1: _read_embeddings_from_text_file
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_index_to_token_vocabulary [as alias]
# Additional imports needed to make this snippet self-contained
# (module paths assume the usual AllenNLP source layout this example is taken from):
import logging

import numpy
import torch

from allennlp.common.checks import ConfigurationError
from allennlp.common.tqdm import Tqdm
from allennlp.data import Vocabulary
from allennlp.modules.token_embedders.embedding import EmbeddingsTextFile

logger = logging.getLogger(__name__)


def _read_embeddings_from_text_file(file_uri: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read pre-trained word vectors from an eventually compressed text file, possibly contained
    inside an archive with multiple files. The text file is assumed to be utf-8 encoded with
    space-separated fields: [word] [dim 1] [dim 2] ...

    Lines that contain more numerical tokens than ``embedding_dim`` raise a warning and are skipped.

    The remainder of the docstring is identical to ``_read_pretrained_embeddings_file``.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")
    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(' ', 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column). We skip those lines. Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped. It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning("Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                                   embedding_dim, len(fields) - 1, line)
                    continue

                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))

    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug("Token %s was not found in the embedding file. Initialising randomly.", token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    return embedding_matrix
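For orientation, a hedged usage sketch of the function above; the GloVe-style file name, the dimension, and the tiny hand-built vocabulary are hypothetical stand-ins for whatever your model actually uses.

# Hypothetical inputs: point the file argument at your own embedding file and match its dimension.
vocab = Vocabulary()
for word in ["cat", "dog", "fish"]:
    vocab.add_token_to_namespace(word, namespace="tokens")

weight = _read_embeddings_from_text_file("glove.6B.100d.txt",
                                         embedding_dim=100,
                                         vocab=vocab,
                                         namespace="tokens")
# One row per vocabulary entry (including padding/OOV), each of size embedding_dim.
assert weight.shape == (vocab.get_vocab_size("tokens"), 100)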
Example 2: _read_pretrained_word2vec_format_embedding_file
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_index_to_token_vocabulary [as alias]
# Additional imports needed to make this snippet self-contained
# (module paths assume the usual AllenNLP source layout this example is taken from):
import gzip
import logging

import numpy
import torch

from allennlp.common.checks import ConfigurationError
from allennlp.common.file_utils import cached_path
from allennlp.data import Vocabulary

logger = logging.getLogger(__name__)


def _read_pretrained_word2vec_format_embedding_file(embeddings_filename: str,  # pylint: disable=invalid-name
                                                    embedding_dim: int,
                                                    vocab: Vocabulary,
                                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read from a gzipped-word2vec format file. The embeddings file is assumed to be gzipped and
    space delimited, e.g. [word] [dim 1] [dim 2] ...

    The remainder of the docstring is identical to ``_read_pretrained_embedding_file``.
    """
    words_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading embeddings from file")
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            if len(fields) - 1 != embedding_dim:
                # Sometimes there are funny unicode parsing problems that lead to different
                # fields lengths (e.g., a word with a unicode space character that splits
                # into more than one column). We skip those lines. Note that if you have
                # some kind of long header, this could result in all of your lines getting
                # skipped. It's hard to check for that here; you just have to look in the
                # embedding_misses_file and at the model summary to make sure things look
                # like they are supposed to.
                logger.warning("Found line with wrong number of dimensions (expected %d, was %d): %s",
                               embedding_dim, len(fields) - 1, line)
                continue

            word = fields[0]
            if word in words_to_keep:
                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[word] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))

    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)

    for i in range(0, vocab_size):
        word = vocab.get_token_from_index(i, namespace)

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if word in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[word])
        else:
            logger.debug("Word %s was not found in the embedding file. Initialising randomly.", word)

    # The weight matrix is now fully initialized, so we can return it.
    return embedding_matrix
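A small design difference worth noticing between the two examples: Example 1 fetches the whole index-to-token mapping once with get_index_to_token_vocabulary and then indexes into that dict while filling the matrix, whereas Example 2 calls vocab.get_token_from_index(i, namespace) on every loop iteration. Both recover the same token for each row; the first form simply avoids repeated method calls when the vocabulary is large.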