This article collects typical usage examples of the Python method allennlp.data.Vocabulary.get_token_index. If you are wondering how Vocabulary.get_token_index is used in practice, the hand-picked examples below may help. You can also look further into other usage examples of the class it belongs to, allennlp.data.Vocabulary.
The following shows 6 code examples of the Vocabulary.get_token_index method.
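Before the examples, here is a minimal sketch of what get_token_index does, assuming the AllenNLP 0.x-style API that the examples below use; the namespace name 'tokens' and the OOV token "@@UNKNOWN@@" are the library defaults:

from allennlp.data import Vocabulary

vocab = Vocabulary()
word_index = vocab.add_token_to_namespace("word", namespace='tokens')

# get_token_index returns the integer id assigned to a token within a namespace.
assert vocab.get_token_index("word", namespace='tokens') == word_index

# A token that was never added does not raise; it maps to the index of the
# OOV token ("@@UNKNOWN@@" by default), which Example 6 below relies on.
oov_index = vocab.get_token_index("random OOV string", namespace='tokens')
assert oov_index == vocab.get_token_index("@@UNKNOWN@@", namespace='tokens')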
Example 1: test_embedding_layer_actually_initializes_word_vectors_correctly
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_token_index [as alias]
def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    embeddings_filename = self.TEST_DIR + "embeddings.gz"
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
    params = Params({
            'pretrained_file': embeddings_filename,
            'embedding_dim': 3,
            })
    embedding_layer = Embedding.from_params(vocab, params)
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word")]
    assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
    assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
Example 2: test_read_embedding_file_inside_archive
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_token_index [as alias]
def test_read_embedding_file_inside_archive(self):
    token2vec = {
            "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
            "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
            "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
            "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
            }
    vocab = Vocabulary()
    for token in token2vec:
        vocab.add_token_to_namespace(token)

    params = Params({
            'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
            'embedding_dim': 5
            })
    with pytest.raises(ValueError, message="No ValueError when pretrained_file is a multi-file archive"):
        Embedding.from_params(vocab, params)

    for ext in ['.zip', '.tar.gz']:
        archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
        file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
        params = Params({
                'pretrained_file': file_uri,
                'embedding_dim': 5
                })
        embeddings = Embedding.from_params(vocab, params).weight.data
        for tok, vec in token2vec.items():
            i = vocab.get_token_index(tok)
            assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
Example 3: tokens_to_indices
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_token_index [as alias]
def tokens_to_indices(self, tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
    return {
            "token_ids": ([10, 15] +
                          [vocabulary.get_token_index(token.text, 'words') for token in tokens] +
                          [25]),
            "additional_key": [22, 29]
            }
Example 4: _get_vocab_index_mapping
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_token_index [as alias]
def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
    vocab_index_mapping: List[Tuple[int, int]] = []
    for index in range(self.vocab.get_vocab_size(namespace='tokens')):
        token = self.vocab.get_token_from_index(index=index, namespace='tokens')
        archived_token_index = archived_vocab.get_token_index(token, namespace='tokens')
        # Checking if we got the UNK token index, because we don't want all new token
        # representations initialized to UNK token's representation. We do that by checking if
        # the two tokens are the same. They will not be if the token at the archived index is
        # UNK.
        if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token:
            vocab_index_mapping.append((index, archived_token_index))
    return vocab_index_mapping
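The (new_index, archived_index) pairs returned above are typically used to copy parameter rows from an archived model's embedding matrix into a freshly constructed one, leaving genuinely new tokens at their fresh initialization. The helper below is a hypothetical sketch of that step, not code from the surrounding model; the tensor names and shapes are assumptions:

import torch
from typing import List, Tuple

def copy_mapped_rows(new_weight: torch.Tensor,
                     archived_weight: torch.Tensor,
                     vocab_index_mapping: List[Tuple[int, int]]) -> torch.Tensor:
    # Both tensors are assumed to be (vocab_size, embedding_dim); only tokens that
    # exist (as non-UNK entries) in both vocabularies have their rows copied.
    for new_index, archived_index in vocab_index_mapping:
        new_weight[new_index] = archived_weight[archived_index]
    return new_weight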
Example 5: test_blank_pos_tag
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_token_index [as alias]
def test_blank_pos_tag(self):
    tokens = [Token(token) for token in "allennlp is awesome .".split(" ")]
    for token in tokens:
        token.pos_ = ""
    indexer = PosTagIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    # spaCy uses an empty string to indicate "no POS tag"; we convert it to "NONE".
    assert counter["pos_tokens"]["NONE"] == 4
    vocab = Vocabulary(counter)
    none_index = vocab.get_token_index('NONE', 'pos_tokens')
    # should raise no exception
    indices = indexer.tokens_to_indices(tokens, vocab, index_name="pos")
    assert {"pos": [none_index, none_index, none_index, none_index]} == indices
Example 6: KnowledgeGraphFieldTest
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_token_index [as alias]
class KnowledgeGraphFieldTest(AllenNlpTestCase):
    def setUp(self):
        self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))
        self.utterance = self.tokenizer.tokenize("where is mersin?")
        self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}
        json = {
                'question': self.utterance,
                'columns': ['Name in English', 'Location in English'],
                'cells': [['Paradeniz', 'Mersin'],
                          ['Lake Gala', 'Edirne']]
                }
        self.graph = TableQuestionKnowledgeGraph.read_from_json(json)
        self.vocab = Vocabulary()
        self.name_index = self.vocab.add_token_to_namespace("name", namespace='tokens')
        self.in_index = self.vocab.add_token_to_namespace("in", namespace='tokens')
        self.english_index = self.vocab.add_token_to_namespace("english", namespace='tokens')
        self.location_index = self.vocab.add_token_to_namespace("location", namespace='tokens')
        self.paradeniz_index = self.vocab.add_token_to_namespace("paradeniz", namespace='tokens')
        self.mersin_index = self.vocab.add_token_to_namespace("mersin", namespace='tokens')
        self.lake_index = self.vocab.add_token_to_namespace("lake", namespace='tokens')
        self.gala_index = self.vocab.add_token_to_namespace("gala", namespace='tokens')
        self.negative_one_index = self.vocab.add_token_to_namespace("-1", namespace='tokens')
        self.zero_index = self.vocab.add_token_to_namespace("0", namespace='tokens')
        self.one_index = self.vocab.add_token_to_namespace("1", namespace='tokens')

        self.oov_index = self.vocab.get_token_index('random OOV string', namespace='tokens')
        self.edirne_index = self.oov_index
        self.field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer)

        super(KnowledgeGraphFieldTest, self).setUp()

    def test_count_vocab_items(self):
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        self.field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["tokens"] == {
                '-1': 1,
                '0': 1,
                '1': 1,
                'name': 1,
                'in': 2,
                'english': 2,
                'location': 1,
                'paradeniz': 1,
                'mersin': 1,
                'lake': 1,
                'gala': 1,
                'edirne': 1,
                }

    def test_index_converts_field_correctly(self):
        # pylint: disable=protected-access
        self.field.index(self.vocab)
        assert self.field._indexed_entity_texts.keys() == {'tokens'}
        # Note that these are sorted by their _identifiers_, not their cell text, so the
        # `fb:row.row` entities show up after the `fb:cell` entities.
        expected_array = [[self.negative_one_index],
                          [self.zero_index],
                          [self.one_index],
                          [self.edirne_index],
                          [self.lake_index, self.gala_index],
                          [self.mersin_index],
                          [self.paradeniz_index],
                          [self.location_index, self.in_index, self.english_index],
                          [self.name_index, self.in_index, self.english_index]]
        assert self.field._indexed_entity_texts['tokens'] == expected_array

    def test_get_padding_lengths_raises_if_not_indexed(self):
        with pytest.raises(AssertionError):
            self.field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        # pylint: disable=protected-access
        self.field.index(self.vocab)
        assert self.field.get_padding_lengths() == {'num_entities': 9, 'num_entity_tokens': 3,
                                                    'num_utterance_tokens': 4}

        self.field._token_indexers['token_characters'] = TokenCharactersIndexer()
        self.field.index(self.vocab)
        assert self.field.get_padding_lengths() == {'num_entities': 9, 'num_entity_tokens': 3,
                                                    'num_utterance_tokens': 4,
                                                    'num_token_characters': 9}

    def test_as_tensor_produces_correct_output(self):
        self.field.index(self.vocab)
        padding_lengths = self.field.get_padding_lengths()
        padding_lengths['num_utterance_tokens'] += 1
        padding_lengths['num_entities'] += 1
        tensor_dict = self.field.as_tensor(padding_lengths)
        assert tensor_dict.keys() == {'text', 'linking'}
        expected_text_tensor = [[self.negative_one_index, 0, 0],
                                [self.zero_index, 0, 0],
                                [self.one_index, 0, 0],
                                [self.edirne_index, 0, 0],
                                [self.lake_index, self.gala_index, 0],
                                [self.mersin_index, 0, 0],
                                [self.paradeniz_index, 0, 0],
                                [self.location_index, self.in_index, self.english_index],
                                [self.name_index, self.in_index, self.english_index],
                                [0, 0, 0]]
#......... part of the code has been omitted here .........