This article collects typical usage examples of the Python method allennlp.data.vocabulary.Vocabulary.from_params: what it does, how to call it, and how it is used in practice. If you are unsure how to use Vocabulary.from_params, the curated examples below should help; you can also explore the containing class, allennlp.data.vocabulary.Vocabulary, for further usage examples.
The following shows 11 code examples of the Vocabulary.from_params method, sorted by popularity by default.
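Before the individual examples, here is a minimal sketch of the two usual call patterns of Vocabulary.from_params, assuming an AllenNLP 0.x environment; the toy instance and the temporary save directory are illustrative placeholders, not taken from the examples below.

import tempfile
from allennlp.common import Params
from allennlp.data import Instance, Token
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.vocabulary import Vocabulary

# Build a toy instance so from_params has tokens to count.
field = TextField([Token(t) for t in ["a", "b", "c"]],
                  {"tokens": SingleIdTokenIndexer("tokens")})
instances = [Instance({"text": field})]

# Pattern 1: build a vocabulary from instances (empty Params means defaults).
vocab = Vocabulary.from_params(Params({}), instances)

# Pattern 2: save it, then reload it purely from the saved directory.
vocab_dir = tempfile.mkdtemp()
vocab.save_to_files(vocab_dir)
vocab2 = Vocabulary.from_params(Params({"directory_path": vocab_dir}))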
Example 1: test_from_params
# Imports needed for this example:
import pytest
from allennlp.common import Params
from allennlp.common.checks import ConfigurationError
from allennlp.data.vocabulary import Vocabulary
def test_from_params(self):
    # Save a vocab to check we can load it from_params.
    vocab_dir = self.TEST_DIR / 'vocab_save'
    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
    vocab.add_token_to_namespace("b3", namespace="b")
    vocab.save_to_files(vocab_dir)

    params = Params({"directory_path": vocab_dir})
    vocab2 = Vocabulary.from_params(params)
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

    # Test the case where we build a vocab from a dataset.
    vocab2 = Vocabulary.from_params(Params({}), self.dataset)
    assert vocab2.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                              1: '@@UNKNOWN@@',
                                                              2: 'a', 3: 'c', 4: 'b'}

    # Test that from_params raises when we have neither instances nor a directory_path.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({}))

    # Test that from_params raises when any dict keys other than 'directory_path'
    # are present and we aren't building from a dataset.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({"directory_path": vocab_dir,
                                           "min_count": {'tokens': 2}}))
Example 2: test_invalid_vocab_extension
# Imports needed for this example:
import copy
import pytest
from allennlp.common import Params
from allennlp.common.checks import ConfigurationError
from allennlp.data import Instance, Token
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.vocabulary import Vocabulary
def test_invalid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
    original_vocab.add_token_to_namespace("a", namespace="tokens1")
    original_vocab.add_token_to_namespace("b", namespace="tokens1")
    original_vocab.add_token_to_namespace("p", namespace="tokens2")
    original_vocab.save_to_files(vocab_dir)
    text_field1 = TextField([Token(t) for t in ["a", "c"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                            {"tokens2": SingleIdTokenIndexer("tokens2")})
    instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

    # The following three calls should raise: tokens1 is non-padded in
    # original_vocab but padded in the requested extension.
    params = Params({"directory_path": vocab_dir, "extend": True,
                     "non_padded_namespaces": []})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": []})
        extended_vocab.extend_from_instances(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=[],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

    # The following three calls should not raise: overlapping namespaces
    # have the same padding setting.
    params = Params({"directory_path": vocab_dir, "extend": True,
                     "non_padded_namespaces": ["tokens1"]})
    Vocabulary.from_params(params, instances)
    extended_vocab = copy.copy(original_vocab)
    params = Params({"non_padded_namespaces": ["tokens1"]})
    extended_vocab.extend_from_instances(params, instances)
    extended_vocab = copy.copy(original_vocab)
    extended_vocab._extend(non_padded_namespaces=["tokens1"],
                           tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

    # The following three calls should raise: tokens2 is padded in
    # original_vocab but non-padded in the requested extension.
    params = Params({"directory_path": vocab_dir, "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens2"]})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": ["tokens1", "tokens2"]})
        extended_vocab.extend_from_instances(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=["tokens1", "tokens2"],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})
Example 3: setUp
# Imports needed for this example:
from allennlp.common import Params
from allennlp.common.util import ensure_list
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.vocabulary import Vocabulary
def setUp(self):
    super(TestCopyNetReader, self).setUp()
    params = Params.from_file(self.FIXTURES_ROOT / "encoder_decoder" /
                              "copynet_seq2seq" / "experiment.json")
    self.reader = DatasetReader.from_params(params["dataset_reader"])
    instances = self.reader.read(self.FIXTURES_ROOT / "data" / "copynet" / "copyover.tsv")
    self.instances = ensure_list(instances)
    self.vocab = Vocabulary.from_params(params=params["vocabulary"], instances=instances)
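For context, the params above come from an experiment file read with Params.from_file. A hypothetical sketch of the relevant structure follows; the namespace names and values are illustrative assumptions, not the fixture's actual contents, but min_count and max_vocab_size are real Vocabulary.from_params options used elsewhere on this page.

from allennlp.common import Params

# Hypothetical stand-in for Params.from_file(...); only the "vocabulary"
# section is what setUp passes to Vocabulary.from_params.
params = Params({
    "vocabulary": {
        "min_count": {"source_tokens": 2},        # drop rare source-side tokens
        "max_vocab_size": {"source_tokens": 50000}
    }
})
vocab_params = params["vocabulary"]  # indexing a Params returns the sub-Params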
Example 4: test_min_pretrained_embeddings
# Imports needed for this example:
from allennlp.common import Params
from allennlp.data.vocabulary import Vocabulary
def test_min_pretrained_embeddings(self):
    params = Params({
        "pretrained_files": {
            "tokens": str(self.FIXTURES_ROOT / "embeddings/glove.6B.100d.sample.txt.gz")
        },
        "min_pretrained_embeddings": {"tokens": 50},
    })
    vocab = Vocabulary.from_params(params=params, instances=self.dataset)
    assert vocab.get_vocab_size() >= 50
    assert vocab.get_token_index("his") > 1  # present as a real token, not @@UNKNOWN@@
Example 5: test_max_vocab_size_dict
# Imports needed for this example:
from allennlp.common import Params
from allennlp.data.vocabulary import Vocabulary
def test_max_vocab_size_dict(self):
    params = Params({
        "max_vocab_size": {
            "tokens": 1,
            "characters": 20
        }
    })
    vocab = Vocabulary.from_params(params=params, instances=self.dataset)
    words = vocab.get_index_to_token_vocabulary().values()
    # The 2 additional entries are '@@PADDING@@' and '@@UNKNOWN@@', added by default.
    assert len(words) == 3
Example 6: test_from_params_extend_config
# Imports needed for this example:
import pytest
from allennlp.common import Params
from allennlp.common.checks import ConfigurationError
from allennlp.data import Instance, Token
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.vocabulary import Vocabulary
def test_from_params_extend_config(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
    original_vocab.add_token_to_namespace("a", namespace="tokens")
    original_vocab.save_to_files(vocab_dir)

    text_field = TextField([Token(t) for t in ["a", "b"]],
                           {"tokens": SingleIdTokenIndexer("tokens")})
    instances = Batch([Instance({"text": text_field})])

    # If you ask to extend the vocab from `directory_path`, instances must be
    # passed to Vocabulary.from_params, or else there is nothing to extend with.
    params = Params({"directory_path": vocab_dir, "extend": True})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params)

    # If you ask to extend the vocab, the `directory_path` key must be present
    # in params, or else there is nothing to extend from.
    params = Params({"extend": True})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
Example 7: test_max_vocab_size_partial_dict
# Imports needed for this example:
from allennlp.common import Params
from allennlp.data import Instance, Token
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data.vocabulary import Vocabulary
def test_max_vocab_size_partial_dict(self):
    indexers = {"tokens": SingleIdTokenIndexer(),
                "token_characters": TokenCharactersIndexer()}
    instance = Instance({
        'text': TextField([Token(w) for w in 'Abc def ghi jkl mno pqr stu vwx yz'.split(' ')],
                          indexers)
    })
    dataset = Batch([instance])
    params = Params({
        "max_vocab_size": {
            "tokens": 1
        }
    })
    vocab = Vocabulary.from_params(params=params, instances=dataset)
    # "tokens" is capped at 1, plus the 2 default entries (padding and unknown).
    assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3  # 1 + 2
    # "token_characters" is uncapped: 26 distinct characters plus the 2 defaults.
    assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28  # 26 + 2
Example 8: test_registrability
# Imports needed for this example:
from allennlp.common import Params
from allennlp.data import Instance
from allennlp.data.vocabulary import Vocabulary
def test_registrability(self):
    @Vocabulary.register('my-vocabulary')
    class MyVocabulary:
        @classmethod
        def from_params(cls, params, instances=None):
            # pylint: disable=unused-argument
            return MyVocabulary()

    params = Params({'type': 'my-vocabulary'})
    instance = Instance(fields={})
    vocab = Vocabulary.from_params(params=params, instances=[instance])
    assert isinstance(vocab, MyVocabulary)
Example 9: test_from_params_valid_vocab_extension_thoroughly
# Imports needed for this example:
from allennlp.common import Params
from allennlp.data import Instance, Token
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.vocabulary import Vocabulary
def test_from_params_valid_vocab_extension_thoroughly(self):
    '''
    Tests valid vocab extension thoroughly: vocab extension is valid when
    overlapping namespaces have the same padding behaviour (padded/non-padded).
    Summary of namespace paddings in this test:
        original_vocab namespaces:
            tokens0  padded
            tokens1  non-padded
            tokens2  padded
            tokens3  non-padded
        instances namespaces:
            tokens0  padded
            tokens1  non-padded
            tokens4  padded
            tokens5  non-padded
    Typical extension example (for the tokens1 namespace):
        original_vocab index2token:
            0 -> apple
            1 -> bat
            2 -> cat
        tokens to extend with: cat, an, apple, banana, atom, bat
        extended_vocab index2token:
            0 -> apple
            1 -> bat
            2 -> cat
            3 -> an
            4 -> atom
            5 -> banana
    '''
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
    original_vocab.add_token_to_namespace("apple", namespace="tokens0")  # index:2 (padded)
    original_vocab.add_token_to_namespace("bat", namespace="tokens0")    # index:3
    original_vocab.add_token_to_namespace("cat", namespace="tokens0")    # index:4
    original_vocab.add_token_to_namespace("apple", namespace="tokens1")  # index:0 (non-padded)
    original_vocab.add_token_to_namespace("bat", namespace="tokens1")    # index:1
    original_vocab.add_token_to_namespace("cat", namespace="tokens1")    # index:2
    original_vocab.add_token_to_namespace("a", namespace="tokens2")      # index:2 (padded)
    original_vocab.add_token_to_namespace("b", namespace="tokens2")      # index:3
    original_vocab.add_token_to_namespace("c", namespace="tokens2")      # index:4
    original_vocab.add_token_to_namespace("p", namespace="tokens3")      # index:0 (non-padded)
    original_vocab.add_token_to_namespace("q", namespace="tokens3")      # index:1
    original_vocab.save_to_files(vocab_dir)

    text_field0 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens0": SingleIdTokenIndexer("tokens0")})
    text_field1 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                            {"tokens4": SingleIdTokenIndexer("tokens4")})
    text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                            {"tokens5": SingleIdTokenIndexer("tokens5")})
    instances = Batch([Instance({"text0": text_field0, "text1": text_field1,
                                 "text4": text_field4, "text5": text_field5})])
    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens5"]})
    extended_vocab = Vocabulary.from_params(params, instances)

    # Namespaces tokens0 and tokens1 are common; tokens2 and tokens3 occur only
    # in the saved vocab, tokens4 and tokens5 only in the instances.
    extended_namespaces = {*extended_vocab._token_to_index}
    assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

    # Check that _non_padded_namespaces is consistent after extension.
    assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

    # original_vocab's "tokens1" has 3 tokens; the instances' "tokens1" namespace
    # has 6 tokens, 3 of which overlap, so the extended size is 6.
    assert extended_vocab.get_vocab_size("tokens1") == 6
    assert extended_vocab.get_vocab_size("tokens0") == 8  # same 6 tokens + padding + unknown

    # tokens2 and tokens3 were only in original_vocab,
    # so their token counts should be unchanged in extended_vocab.
    assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
    assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

    # tokens4 and tokens5 were only in the instances.
    assert extended_vocab.get_vocab_size("tokens4") == 6  # l, m, n, o + padding + unknown
    assert extended_vocab.get_vocab_size("tokens5") == 3  # x, y, z

    # The token-to-index mapping of every word in every namespace of
    # original_vocab should be preserved in extended_vocab ...
    for namespace, token2index in original_vocab._token_to_index.items():
        for token, _ in token2index.items():
            vocab_index = original_vocab.get_token_index(token, namespace)
            extended_vocab_index = extended_vocab.get_token_index(token, namespace)
            assert vocab_index == extended_vocab_index
    # ... and likewise for the index-to-token mapping.
    for namespace, index2token in original_vocab._index_to_token.items():
        for index, _ in index2token.items():
            vocab_token = original_vocab.get_token_from_index(index, namespace)
            extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
            assert vocab_token == extended_vocab_token
Example 10: test_valid_vocab_extension
# Imports needed for this example:
import copy
import shutil
from allennlp.common import Params
from allennlp.data import Instance, Token
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.vocabulary import Vocabulary
def test_valid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    extension_ways = ["from_params", "extend_from_instances"]

    # Test: padded/non-padded common namespaces are extended appropriately.
    non_padded_namespaces_list = [[], ["tokens"]]
    for non_padded_namespaces in non_padded_namespaces_list:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_token_to_namespace("d", namespace="tokens")
        original_vocab.add_token_to_namespace("a", namespace="tokens")
        original_vocab.add_token_to_namespace("b", namespace="tokens")
        text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]],
                               {"tokens": SingleIdTokenIndexer("tokens")})
        instances = Batch([Instance({"text": text_field})])
        for way in extension_ways:
            if way == "extend_from_instances":
                extended_vocab = copy.copy(original_vocab)
                params = Params({"non_padded_namespaces": non_padded_namespaces})
                extended_vocab.extend_from_instances(params, instances)
            else:
                shutil.rmtree(vocab_dir, ignore_errors=True)
                original_vocab.save_to_files(vocab_dir)
                params = Params({"directory_path": vocab_dir, "extend": True,
                                 "non_padded_namespaces": non_padded_namespaces})
                extended_vocab = Vocabulary.from_params(params, instances)

            extra_count = 2 if extended_vocab.is_padded("tokens") else 0
            assert extended_vocab.get_token_index("d", "tokens") == 0 + extra_count
            assert extended_vocab.get_token_index("a", "tokens") == 1 + extra_count
            assert extended_vocab.get_token_index("b", "tokens") == 2 + extra_count
            assert extended_vocab.get_token_index("c", "tokens")  # should be present
            assert extended_vocab.get_token_index("e", "tokens")  # should be present
            assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

    # Test: padded/non-padded non-common namespaces are extended appropriately.
    non_padded_namespaces_list = [[],
                                  ["tokens1"],
                                  ["tokens1", "tokens2"]]
    for non_padded_namespaces in non_padded_namespaces_list:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_token_to_namespace("a", namespace="tokens1")  # index 2 if padded, else 0
        text_field = TextField([Token(t) for t in ["b"]],
                               {"tokens2": SingleIdTokenIndexer("tokens2")})
        instances = Batch([Instance({"text": text_field})])
        for way in extension_ways:
            if way == "extend_from_instances":
                extended_vocab = copy.copy(original_vocab)
                params = Params({"non_padded_namespaces": non_padded_namespaces})
                extended_vocab.extend_from_instances(params, instances)
            else:
                shutil.rmtree(vocab_dir, ignore_errors=True)
                original_vocab.save_to_files(vocab_dir)
                params = Params({"directory_path": vocab_dir, "extend": True,
                                 "non_padded_namespaces": non_padded_namespaces})
                extended_vocab = Vocabulary.from_params(params, instances)

            # Should have two namespaces.
            assert len(extended_vocab._token_to_index) == 2
            extra_count = 2 if extended_vocab.is_padded("tokens1") else 0
            assert extended_vocab.get_vocab_size("tokens1") == 1 + extra_count
            extra_count = 2 if extended_vocab.is_padded("tokens2") else 0
            assert extended_vocab.get_vocab_size("tokens2") == 1 + extra_count
Example 11: test_from_params_adds_tokens_to_vocab
# Imports needed for this example:
from allennlp.common import Params
from allennlp.data.vocabulary import Vocabulary
def test_from_params_adds_tokens_to_vocab(self):
    vocab = Vocabulary.from_params(Params({'tokens_to_add': {'tokens': ['q', 'x', 'z']}}),
                                   self.dataset)
    assert vocab.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                             1: '@@UNKNOWN@@',
                                                             2: 'a', 3: 'c', 4: 'b',
                                                             5: 'q', 6: 'x', 7: 'z'}