This article collects typical usage examples of the Python method allennlp.data.Vocabulary.add_token_to_namespace. If you are wondering what Vocabulary.add_token_to_namespace does, how to call it, or what it looks like in practice, the curated code examples below may help. You can also explore further usage examples of the containing class, allennlp.data.Vocabulary.
The following shows 15 code examples of Vocabulary.add_token_to_namespace, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
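Before the examples, here is a minimal sketch of the basic call pattern, consistent with how the method is used in the snippets below (the token strings and namespace names are illustrative only): add_token_to_namespace registers a token in a named namespace and returns the integer index assigned to it, and get_token_index performs the reverse lookup.

from allennlp.data import Vocabulary

# Minimal sketch; token strings and namespace names are illustrative only.
vocab = Vocabulary()
token_index = vocab.add_token_to_namespace("hello")                        # default namespace is 'tokens'
label_index = vocab.add_token_to_namespace("POSITIVE", namespace="labels")

# The returned index can be recovered later via get_token_index.
assert vocab.get_token_index("hello") == token_index
assert vocab.get_token_index("POSITIVE", namespace="labels") == label_index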
Example 1: test_read_embedding_file_inside_archive
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
def test_read_embedding_file_inside_archive(self):
    token2vec = {
            "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
            "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
            "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
            "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
    }
    vocab = Vocabulary()
    for token in token2vec:
        vocab.add_token_to_namespace(token)

    params = Params({
            'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
            'embedding_dim': 5
            })
    with pytest.raises(ValueError, message="No ValueError when pretrained_file is a multi-file archive"):
        Embedding.from_params(vocab, params)

    for ext in ['.zip', '.tar.gz']:
        archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
        file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
        params = Params({
                'pretrained_file': file_uri,
                'embedding_dim': 5
                })
        embeddings = Embedding.from_params(vocab, params).weight.data
        for tok, vec in token2vec.items():
            i = vocab.get_token_index(tok)
            assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
Example 2: test_dry_run_without_extension
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
def test_dry_run_without_extension(self):
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    # If extend is False, it is the user's responsibility to make sure that dataset instances
    # will be indexable by the provided vocabulary. At least @@UNKNOWN@@ should be present in
    # every namespace that could see OOV entries during indexing.
    # The `tokens` namespace will see new words, but it already has the @@UNKNOWN@@ token;
    # the 'labels' namespace has no @@UNKNOWN@@, so 'N' and 'V' must be added upfront.
    vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
    vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
    vocab.add_token_to_namespace('N', namespace='labels')
    vocab.add_token_to_namespace('V', namespace='labels')
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = existing_vocab_path
    self.params['vocabulary']['extend'] = False
    dry_run_from_params(self.params, extended_serialization_dir)

    with open(extended_vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]

    assert tokens[0] == '@@UNKNOWN@@'
    assert tokens[1] == 'some_weird_token_1'
    assert tokens[2] == 'some_weird_token_2'
    assert len(tokens) == 3
Example 3: get_vocab
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
def get_vocab(word2freq, max_v_sizes):
    '''Build vocabulary'''
    vocab = Vocabulary(counter=None, max_vocab_size=max_v_sizes['word'])
    words_by_freq = [(word, freq) for word, freq in word2freq.items()]
    words_by_freq.sort(key=lambda x: x[1], reverse=True)
    for word, _ in words_by_freq[:max_v_sizes['word']]:
        vocab.add_token_to_namespace(word, 'tokens')
    log.info("\tFinished building vocab. Using %d words", vocab.get_vocab_size('tokens'))
    return vocab
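A hypothetical call to the helper above, with an invented frequency dictionary just to show the expected argument shapes (note that get_vocab also relies on a module-level log object from its original source):

word2freq = {"the": 120, "cat": 4, "sat": 2}            # invented counts for illustration
vocab = get_vocab(word2freq, max_v_sizes={"word": 2})   # keeps only the 2 most frequent words
assert vocab.get_vocab_size("tokens") >= 2              # "the" and "cat", plus padding/OOV entries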
Example 4: test_token_to_indices_uses_ner_tags
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
def test_token_to_indices_uses_ner_tags(self):
    tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
    tokens = [t for t in tokens] + [Token("</S>")]
    vocab = Vocabulary()
    person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags')
    none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
    vocab.add_token_to_namespace('ORG', namespace='ner_tags')
    indexer = NerTagIndexer()
    assert indexer.token_to_indices(tokens[1], vocab) == person_index
    assert indexer.token_to_indices(tokens[-1], vocab) == none_index
Example 5: test_get_embedding_layer_uses_correct_embedding_dim
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
def test_get_embedding_layer_uses_correct_embedding_dim(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace('word1')
    vocab.add_token_to_namespace('word2')
    embeddings_filename = self.TEST_DIR + "embeddings.gz"
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
        embeddings_file.write("word2 0.1 0.4 -4.0\n".encode('utf-8'))
    embedding_weights = _read_pretrained_embedding_file(embeddings_filename, 3, vocab)
    assert tuple(embedding_weights.size()) == (4, 3)  # 4 because of padding and OOV
    with pytest.raises(ConfigurationError):
        _read_pretrained_embedding_file(embeddings_filename, 4, vocab)
Example 6: test_index_converts_field_correctly
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
def test_index_converts_field_correctly(self):
    vocab = Vocabulary()
    b_index = vocab.add_token_to_namespace("B", namespace='*labels')
    i_index = vocab.add_token_to_namespace("I", namespace='*labels')
    o_index = vocab.add_token_to_namespace("O", namespace='*labels')

    tags = ["B", "I", "O", "O", "O"]
    sequence_label_field = SequenceLabelField(tags, self.text, label_namespace="*labels")
    sequence_label_field.index(vocab)

    # pylint: disable=protected-access
    assert sequence_label_field._indexed_labels == [b_index, i_index, o_index, o_index, o_index]
Example 7: test_as_tensor_produces_integer_targets
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
def test_as_tensor_produces_integer_targets(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("B", namespace='*labels')
    vocab.add_token_to_namespace("I", namespace='*labels')
    vocab.add_token_to_namespace("O", namespace='*labels')

    tags = ["B", "I", "O", "O", "O"]
    sequence_label_field = SequenceLabelField(tags, self.text, label_namespace="*labels")
    sequence_label_field.index(vocab)
    padding_lengths = sequence_label_field.get_padding_lengths()
    tensor = sequence_label_field.as_tensor(padding_lengths).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 1, 2, 2, 2]))
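The expected tensor [0, 1, 2, 2, 2] in Examples 6 and 7 follows from how AllenNLP treats label namespaces: by default, any namespace matching *labels or *tags is non-padded, so no padding or @@UNKNOWN@@ entries are reserved and the first label added gets index 0. A minimal sketch of the difference, assuming the default non_padded_namespaces setting:

vocab = Vocabulary()
# Non-padded namespace: indices start at 0.
assert vocab.add_token_to_namespace("B", namespace="*labels") == 0
# Padded namespace: index 0 is padding and index 1 is @@UNKNOWN@@, so the first real token gets 2.
assert vocab.add_token_to_namespace("word", namespace="tokens") == 2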
Example 8: test_token_to_indices_uses_pos_tags
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
def test_token_to_indices_uses_pos_tags(self):
    tokens = self.tokenizer.split_words("This is a sentence.")
    tokens = [t for t in tokens] + [Token("</S>")]
    vocab = Vocabulary()
    verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
    cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
    none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
    indexer = PosTagIndexer(coarse_tags=True)
    assert indexer.token_to_indices(tokens[1], vocab) == verb_index
    assert indexer.token_to_indices(tokens[-1], vocab) == none_index
    indexer._coarse_tags = False  # pylint: disable=protected-access
    assert indexer.token_to_indices(tokens[1], vocab) == cop_index
Example 9: test_get_embedding_layer_initializes_unseen_words_randomly_not_zero
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    embeddings_filename = self.TEST_DIR + "embeddings.gz"
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
    params = Params({
            'pretrained_file': embeddings_filename,
            'embedding_dim': 3,
            })
    embedding_layer = Embedding.from_params(vocab, params)
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
    assert not numpy.allclose(word_vector.numpy(), numpy.array([0.0, 0.0, 0.0]))
Example 10: TestDataset
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
class TestDataset(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this")
        self.vocab.add_token_to_namespace("is")
        self.vocab.add_token_to_namespace("a")
        self.vocab.add_token_to_namespace("sentence")
        self.vocab.add_token_to_namespace(".")
        self.token_indexer = {"tokens": SingleIdTokenIndexer()}
        self.instances = self.get_instances()
        super(TestDataset, self).setUp()

    def test_instances_must_have_homogeneous_fields(self):
        instance1 = Instance({"tag": (LabelField(1, skip_indexing=True))})
        instance2 = Instance({"words": TextField([Token("hello")], {})})
        with pytest.raises(ConfigurationError):
            _ = Batch([instance1, instance2])

    def test_padding_lengths_uses_max_instance_lengths(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        assert padding_lengths == {"text1": {"num_tokens": 5, "tokens_length": 5},
                                   "text2": {"num_tokens": 6, "tokens_length": 6}}

    def test_as_tensor_dict(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        tensors = dataset.as_tensor_dict(padding_lengths)
        text1 = tensors["text1"]["tokens"].detach().cpu().numpy()
        text2 = tensors["text2"]["tokens"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(text1, numpy.array([[2, 3, 4, 5, 6],
                                                                    [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(text2, numpy.array([[2, 3, 4, 1, 5, 6],
                                                                    [2, 3, 1, 0, 0, 0]]))

    def get_instances(self):
        field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]],
                           self.token_indexer)
        field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
                           self.token_indexer)
        field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]],
                           self.token_indexer)
        field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                           self.token_indexer)
        instances = [Instance({"text1": field1, "text2": field2}),
                     Instance({"text1": field3, "text2": field4})]
        return instances
Example 11: test_read_hdf5_raises_on_invalid_shape
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
def test_read_hdf5_raises_on_invalid_shape(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    embeddings_filename = self.TEST_DIR + "embeddings.hdf5"
    embeddings = numpy.random.rand(vocab.get_vocab_size(), 10)
    with h5py.File(embeddings_filename, 'w') as fout:
        _ = fout.create_dataset(
                'embedding', embeddings.shape, dtype='float32', data=embeddings
        )

    params = Params({
            'pretrained_file': embeddings_filename,
            'embedding_dim': 5,
            })
    with pytest.raises(ConfigurationError):
        _ = Embedding.from_params(vocab, params)
Example 12: test_adjacency_field_can_index_with_vocab
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
def test_adjacency_field_can_index_with_vocab(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("a", namespace="labels")
    vocab.add_token_to_namespace("b", namespace="labels")
    vocab.add_token_to_namespace("c", namespace="labels")

    labels = ["a", "b"]
    indices = [(0, 1), (2, 1)]
    adjacency_field = AdjacencyField(indices, self.text, labels)
    adjacency_field.index(vocab)
    tensor = adjacency_field.as_tensor(adjacency_field.get_padding_lengths())
    numpy.testing.assert_equal(tensor.numpy(), numpy.array([[-1, 0, -1, -1, -1],
                                                            [-1, -1, -1, -1, -1],
                                                            [-1, 1, -1, -1, -1],
                                                            [-1, -1, -1, -1, -1],
                                                            [-1, -1, -1, -1, -1]]))
Example 13: test_forward_works_with_projection_layer
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
def test_forward_works_with_projection_layer(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace('the')
    vocab.add_token_to_namespace('a')
    params = Params({
            'pretrained_file': 'tests/fixtures/glove.6B.300d.sample.txt.gz',
            'embedding_dim': 300,
            'projection_dim': 20
            })
    embedding_layer = Embedding.from_params(vocab, params)
    input_tensor = Variable(torch.LongTensor([[3, 2, 1, 0]]))
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 4, 20)

    input_tensor = Variable(torch.LongTensor([[[3, 2, 1, 0]]]))
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 1, 4, 20)
Example 14: test_read_hdf5_format_file
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
def test_read_hdf5_format_file(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    embeddings_filename = self.TEST_DIR + "embeddings.hdf5"
    embeddings = numpy.random.rand(vocab.get_vocab_size(), 5)
    with h5py.File(embeddings_filename, 'w') as fout:
        _ = fout.create_dataset(
                'embedding', embeddings.shape, dtype='float32', data=embeddings
        )

    params = Params({
            'pretrained_file': embeddings_filename,
            'embedding_dim': 5,
            })
    embedding_layer = Embedding.from_params(vocab, params)
    assert numpy.allclose(embedding_layer.weight.data.numpy(), embeddings)
Example 15: test_start_and_end_tokens
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
def test_start_and_end_tokens(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("A", namespace='characters')  # 2
    vocab.add_token_to_namespace("s", namespace='characters')  # 3
    vocab.add_token_to_namespace("e", namespace='characters')  # 4
    vocab.add_token_to_namespace("n", namespace='characters')  # 5
    vocab.add_token_to_namespace("t", namespace='characters')  # 6
    vocab.add_token_to_namespace("c", namespace='characters')  # 7
    vocab.add_token_to_namespace("<", namespace='characters')  # 8
    vocab.add_token_to_namespace(">", namespace='characters')  # 9
    vocab.add_token_to_namespace("/", namespace='characters')  # 10

    indexer = TokenCharactersIndexer("characters", start_tokens=["<s>"], end_tokens=["</s>"])
    indices = indexer.tokens_to_indices([Token("sentential")], vocab, "char")
    assert indices == {"char": [[8, 3, 9],
                                [3, 4, 5, 6, 4, 5, 6, 1, 1, 1],
                                [8, 10, 3, 9]]}
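Following the index comments in the vocabulary above (index 0 is padding and index 1 is @@UNKNOWN@@ in the padded 'characters' namespace), the characters of "sentential" that were never added, such as 'i', 'a' and 'l', all resolve to index 1, which is why the middle list ends in three 1s. A hypothetical spot check against the same vocabulary:

# Hypothetical follow-up checks using the vocabulary built in the test above.
assert vocab.get_token_index("s", namespace="characters") == 3   # explicitly added
assert vocab.get_token_index("i", namespace="characters") == 1   # unseen -> @@UNKNOWN@@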