This article collects typical usage examples of the Python class allennlp.data.Vocabulary. If you are wondering what the Vocabulary class does, how to use it, or what real code that uses it looks like, the curated examples here should help.
The sections below show 15 code examples of the Vocabulary class, ordered by popularity by default.
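Before the examples, here is a minimal sketch (not taken from any of the examples below) of the core Vocabulary API they all rely on: tokens are mapped to integer indices inside named namespaces.

from allennlp.data import Vocabulary

vocab = Vocabulary()
# add_token_to_namespace returns the index assigned to the token
index = vocab.add_token_to_namespace("sentence", namespace="tokens")
assert vocab.get_token_index("sentence", namespace="tokens") == index
assert vocab.get_token_from_index(index, namespace="tokens") == "sentence"
# One added token plus the default padding and OOV entries (see Example 5 below)
assert vocab.get_vocab_size("tokens") == 3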
Example 1: test_read_embedding_file_inside_archive
def test_read_embedding_file_inside_archive(self):
    token2vec = {
        "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
        "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
        "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
        "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
    }
    vocab = Vocabulary()
    for token in token2vec:
        vocab.add_token_to_namespace(token)

    params = Params({
        'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
        'embedding_dim': 5
    })
    with pytest.raises(ValueError, message="No ValueError when pretrained_file is a multi-file archive"):
        Embedding.from_params(vocab, params)

    for ext in ['.zip', '.tar.gz']:
        archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
        file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
        params = Params({
            'pretrained_file': file_uri,
            'embedding_dim': 5
        })
        embeddings = Embedding.from_params(vocab, params).weight.data
        for tok, vec in token2vec.items():
            i = vocab.get_token_index(tok)
            assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
Example 2: get_vocab
def get_vocab(word2freq, max_v_sizes):
    '''Build vocabulary'''
    vocab = Vocabulary(counter=None, max_vocab_size=max_v_sizes['word'])
    words_by_freq = [(word, freq) for word, freq in word2freq.items()]
    words_by_freq.sort(key=lambda x: x[1], reverse=True)
    for word, _ in words_by_freq[:max_v_sizes['word']]:
        vocab.add_token_to_namespace(word, 'tokens')
    log.info("\tFinished building vocab. Using %d words", vocab.get_vocab_size('tokens'))
    return vocab
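A rough usage sketch of the helper above. The word2freq counts and the max_v_sizes cap are made-up values, and the snippet assumes the log object and imports from the example are already in scope.

word2freq = {"the": 120, "cat": 17, "sat": 9, "mat": 3}  # hypothetical frequency counts
max_v_sizes = {'word': 3}                                # keep only the 3 most frequent words
vocab = get_vocab(word2freq, max_v_sizes)
# 3 kept words plus the default padding and OOV entries (see Example 5 below)
assert vocab.get_vocab_size('tokens') == 5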
Example 3: test_tokens_to_indices_uses_pos_tags
def test_tokens_to_indices_uses_pos_tags(self):
    tokens = self.tokenizer.split_words("This is a sentence.")
    tokens = [t for t in tokens] + [Token("</S>")]
    vocab = Vocabulary()
    root_index = vocab.add_token_to_namespace('ROOT', namespace='dep_labels')
    none_index = vocab.add_token_to_namespace('NONE', namespace='dep_labels')
    indexer = DepLabelIndexer()
    assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {"tokens1": [root_index]}
    assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {"tokens-1": [none_index]}
Example 4: test_token_to_indices_uses_ner_tags
def test_token_to_indices_uses_ner_tags(self):
    tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
    tokens = [t for t in tokens] + [Token("</S>")]
    vocab = Vocabulary()
    person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags')
    none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
    vocab.add_token_to_namespace('ORG', namespace='ner_tags')
    indexer = NerTagIndexer()
    assert indexer.token_to_indices(tokens[1], vocab) == person_index
    assert indexer.token_to_indices(tokens[-1], vocab) == none_index
Example 5: test_get_embedding_layer_uses_correct_embedding_dim
def test_get_embedding_layer_uses_correct_embedding_dim(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace('word1')
    vocab.add_token_to_namespace('word2')
    embeddings_filename = self.TEST_DIR + "embeddings.gz"
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
        embeddings_file.write("word2 0.1 0.4 -4.0\n".encode('utf-8'))
    embedding_weights = _read_pretrained_embedding_file(embeddings_filename, 3, vocab)
    assert tuple(embedding_weights.size()) == (4, 3)  # 4 because of padding and OOV
    with pytest.raises(ConfigurationError):
        _read_pretrained_embedding_file(embeddings_filename, 4, vocab)
Example 6: test_token_to_indices_uses_pos_tags
def test_token_to_indices_uses_pos_tags(self):
    tokens = self.tokenizer.split_words("This is a sentence.")
    tokens = [t for t in tokens] + [Token("</S>")]
    vocab = Vocabulary()
    verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
    cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
    none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
    indexer = PosTagIndexer(coarse_tags=True)
    assert indexer.token_to_indices(tokens[1], vocab) == verb_index
    assert indexer.token_to_indices(tokens[-1], vocab) == none_index
    indexer._coarse_tags = False  # pylint: disable=protected-access
    assert indexer.token_to_indices(tokens[1], vocab) == cop_index
Example 7: _get_vocab_index_mapping
def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
    vocab_index_mapping: List[Tuple[int, int]] = []
    for index in range(self.vocab.get_vocab_size(namespace='tokens')):
        token = self.vocab.get_token_from_index(index=index, namespace='tokens')
        archived_token_index = archived_vocab.get_token_index(token, namespace='tokens')
        # Checking if we got the UNK token index, because we don't want all new token
        # representations initialized to UNK token's representation. We do that by checking if
        # the two tokens are the same. They will not be if the token at the archived index is
        # UNK.
        if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token:
            vocab_index_mapping.append((index, archived_token_index))
    return vocab_index_mapping
Example 8: test_as_tensor_produces_integer_targets
def test_as_tensor_produces_integer_targets(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("B", namespace='*labels')
    vocab.add_token_to_namespace("I", namespace='*labels')
    vocab.add_token_to_namespace("O", namespace='*labels')
    tags = ["B", "I", "O", "O", "O"]
    sequence_label_field = SequenceLabelField(tags, self.text, label_namespace="*labels")
    sequence_label_field.index(vocab)
    padding_lengths = sequence_label_field.get_padding_lengths()
    tensor = sequence_label_field.as_tensor(padding_lengths).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 1, 2, 2, 2]))
Example 9: test_index_converts_field_correctly
def test_index_converts_field_correctly(self):
    vocab = Vocabulary()
    b_index = vocab.add_token_to_namespace("B", namespace='*labels')
    i_index = vocab.add_token_to_namespace("I", namespace='*labels')
    o_index = vocab.add_token_to_namespace("O", namespace='*labels')
    tags = ["B", "I", "O", "O", "O"]
    sequence_label_field = SequenceLabelField(tags, self.text, label_namespace="*labels")
    sequence_label_field.index(vocab)
    # pylint: disable=protected-access
    assert sequence_label_field._indexed_labels == [b_index, i_index, o_index, o_index, o_index]
Example 10: TestDataset
class TestDataset(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this")
        self.vocab.add_token_to_namespace("is")
        self.vocab.add_token_to_namespace("a")
        self.vocab.add_token_to_namespace("sentence")
        self.vocab.add_token_to_namespace(".")
        self.token_indexer = {"tokens": SingleIdTokenIndexer()}
        self.instances = self.get_instances()
        super(TestDataset, self).setUp()

    def test_instances_must_have_homogeneous_fields(self):
        instance1 = Instance({"tag": (LabelField(1, skip_indexing=True))})
        instance2 = Instance({"words": TextField([Token("hello")], {})})
        with pytest.raises(ConfigurationError):
            _ = Batch([instance1, instance2])

    def test_padding_lengths_uses_max_instance_lengths(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        assert padding_lengths == {"text1": {"num_tokens": 5, "tokens_length": 5},
                                   "text2": {"num_tokens": 6, "tokens_length": 6}}

    def test_as_tensor_dict(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        tensors = dataset.as_tensor_dict(padding_lengths)
        text1 = tensors["text1"]["tokens"].detach().cpu().numpy()
        text2 = tensors["text2"]["tokens"].detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(text1, numpy.array([[2, 3, 4, 5, 6],
                                                                    [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(text2, numpy.array([[2, 3, 4, 1, 5, 6],
                                                                    [2, 3, 1, 0, 0, 0]]))

    def get_instances(self):
        field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]],
                           self.token_indexer)
        field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
                           self.token_indexer)
        field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]],
                           self.token_indexer)
        field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                           self.token_indexer)
        instances = [Instance({"text1": field1, "text2": field2}),
                     Instance({"text1": field3, "text2": field4})]
        return instances
Example 11: setUp
def setUp(self):
    super(TestTokenCharactersEncoder, self).setUp()
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("1", "token_characters")
    self.vocab.add_token_to_namespace("2", "token_characters")
    self.vocab.add_token_to_namespace("3", "token_characters")
    self.vocab.add_token_to_namespace("4", "token_characters")
    params = Params({
        "embedding": {
            "embedding_dim": 2,
            "vocab_namespace": "token_characters"
        },
        "encoder": {
            "type": "cnn",
            "embedding_dim": 2,
            "num_filters": 4,
            "ngram_filter_sizes": [1, 2],
            "output_dim": 3
        }
    })
    self.encoder = TokenCharactersEncoder.from_params(vocab=self.vocab, params=deepcopy(params))
    self.embedding = Embedding.from_params(vocab=self.vocab, params=params["embedding"])
    self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
    constant_init = Initializer.from_params(Params({"type": "constant", "val": 1.}))
    initializer = InitializerApplicator([(".*", constant_init)])
    initializer(self.encoder)
    initializer(self.embedding)
    initializer(self.inner_encoder)
Example 12: test_adjacency_field_can_index_with_vocab
def test_adjacency_field_can_index_with_vocab(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("a", namespace="labels")
    vocab.add_token_to_namespace("b", namespace="labels")
    vocab.add_token_to_namespace("c", namespace="labels")
    labels = ["a", "b"]
    indices = [(0, 1), (2, 1)]
    adjacency_field = AdjacencyField(indices, self.text, labels)
    adjacency_field.index(vocab)
    tensor = adjacency_field.as_tensor(adjacency_field.get_padding_lengths())
    numpy.testing.assert_equal(tensor.numpy(), numpy.array([[-1, 0, -1, -1, -1],
                                                            [-1, -1, -1, -1, -1],
                                                            [-1, 1, -1, -1, -1],
                                                            [-1, -1, -1, -1, -1],
                                                            [-1, -1, -1, -1, -1]]))
Example 13: setUp
def setUp(self):
    self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))
    self.utterance = self.tokenizer.tokenize("where is mersin?")
    self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}

    json = {
        'question': self.utterance,
        'columns': ['Name in English', 'Location in English'],
        'cells': [['Paradeniz', 'Mersin'],
                  ['Lake Gala', 'Edirne']]
    }
    self.graph = TableQuestionKnowledgeGraph.read_from_json(json)

    self.vocab = Vocabulary()
    self.name_index = self.vocab.add_token_to_namespace("name", namespace='tokens')
    self.in_index = self.vocab.add_token_to_namespace("in", namespace='tokens')
    self.english_index = self.vocab.add_token_to_namespace("english", namespace='tokens')
    self.location_index = self.vocab.add_token_to_namespace("location", namespace='tokens')
    self.paradeniz_index = self.vocab.add_token_to_namespace("paradeniz", namespace='tokens')
    self.mersin_index = self.vocab.add_token_to_namespace("mersin", namespace='tokens')
    self.lake_index = self.vocab.add_token_to_namespace("lake", namespace='tokens')
    self.gala_index = self.vocab.add_token_to_namespace("gala", namespace='tokens')
    self.negative_one_index = self.vocab.add_token_to_namespace("-1", namespace='tokens')
    self.zero_index = self.vocab.add_token_to_namespace("0", namespace='tokens')
    self.one_index = self.vocab.add_token_to_namespace("1", namespace='tokens')
    self.oov_index = self.vocab.get_token_index('random OOV string', namespace='tokens')
    self.edirne_index = self.oov_index
    self.field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer)

    super(KnowledgeGraphFieldTest, self).setUp()
Example 14: from_params
@classmethod
def from_params(cls, vocab: Vocabulary, params: Params) -> 'ElmoTokenEmbedder':  # type: ignore
    # pylint: disable=arguments-differ
    params.add_file_to_archive('options_file')
    params.add_file_to_archive('weight_file')
    options_file = params.pop('options_file')
    weight_file = params.pop('weight_file')
    requires_grad = params.pop('requires_grad', False)
    do_layer_norm = params.pop_bool('do_layer_norm', False)
    dropout = params.pop_float("dropout", 0.5)
    namespace_to_cache = params.pop("namespace_to_cache", None)
    if namespace_to_cache is not None:
        vocab_to_cache = list(vocab.get_token_to_index_vocabulary(namespace_to_cache).keys())
    else:
        vocab_to_cache = None
    projection_dim = params.pop_int("projection_dim", None)
    scalar_mix_parameters = params.pop('scalar_mix_parameters', None)
    params.assert_empty(cls.__name__)
    return cls(options_file=options_file,
               weight_file=weight_file,
               do_layer_norm=do_layer_norm,
               dropout=dropout,
               requires_grad=requires_grad,
               projection_dim=projection_dim,
               vocab_to_cache=vocab_to_cache,
               scalar_mix_parameters=scalar_mix_parameters)
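A hypothetical invocation of the from_params method above, assuming a Vocabulary instance named vocab is already in scope. The options and weights paths are placeholders, not real files, so this only illustrates the expected Params shape rather than something that runs as-is.

params = Params({
    'options_file': '/path/to/elmo_options.json',  # placeholder path
    'weight_file': '/path/to/elmo_weights.hdf5',   # placeholder path
    'dropout': 0.0
})
elmo_embedder = ElmoTokenEmbedder.from_params(vocab=vocab, params=params)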
Example 15: setUp
def setUp(self):
    super(IteratorTest, self).setUp()
    self.token_indexers = {"tokens": SingleIdTokenIndexer()}
    self.vocab = Vocabulary()
    self.this_index = self.vocab.add_token_to_namespace('this')
    self.is_index = self.vocab.add_token_to_namespace('is')
    self.a_index = self.vocab.add_token_to_namespace('a')
    self.sentence_index = self.vocab.add_token_to_namespace('sentence')
    self.another_index = self.vocab.add_token_to_namespace('another')
    self.yet_index = self.vocab.add_token_to_namespace('yet')
    self.very_index = self.vocab.add_token_to_namespace('very')
    self.long_index = self.vocab.add_token_to_namespace('long')
    instances = [
        self.create_instance(["this", "is", "a", "sentence"]),
        self.create_instance(["this", "is", "another", "sentence"]),
        self.create_instance(["yet", "another", "sentence"]),
        self.create_instance(["this", "is", "a", "very", "very", "very", "very", "long", "sentence"]),
        self.create_instance(["sentence"]),
    ]

    class LazyIterable:
        def __iter__(self):
            return (instance for instance in instances)

    self.instances = instances
    self.lazy_instances = LazyIterable()