This page collects typical usage examples of the Python method allennlp.data.vocabulary.Vocabulary.from_instances. If you have been wondering what Vocabulary.from_instances does, how to call it, or what it looks like in real code, the hand-picked examples below should help. You can also explore further usage examples of the containing class, allennlp.data.vocabulary.Vocabulary.
Eight code examples of the Vocabulary.from_instances method are shown below, sorted by popularity by default.
Example 1: test_from_instances_exclusive_embeddings_file_inside_archive
# Required import: from allennlp.data.vocabulary import Vocabulary [as alias]
# Or: from allennlp.data.vocabulary.Vocabulary import from_instances [as alias]
def test_from_instances_exclusive_embeddings_file_inside_archive(self):
    """ Just for ensuring there are no problems when reading pretrained tokens from an archive """
    # Read embeddings file from archive
    archive_path = str(self.TEST_DIR / "embeddings-archive.zip")

    with zipfile.ZipFile(archive_path, 'w') as archive:
        file_path = 'embedding.3d.vec'
        with archive.open(file_path, 'w') as embeddings_file:
            embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
            embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))
        with archive.open('dummy.vec', 'w') as dummy_file:
            dummy_file.write("c 1.0 2.3 -1.0 3.0\n".encode('utf-8'))

    embeddings_file_uri = format_embeddings_file_uri(archive_path, file_path)
    vocab = Vocabulary.from_instances(self.dataset,
                                      min_count={'tokens': 4},
                                      pretrained_files={'tokens': embeddings_file_uri},
                                      only_include_pretrained_words=True)
    words = set(vocab.get_index_to_token_vocabulary().values())
    assert 'a' in words
    assert 'b' not in words
    assert 'c' not in words

    vocab = Vocabulary.from_instances(self.dataset,
                                      pretrained_files={'tokens': embeddings_file_uri},
                                      only_include_pretrained_words=True)
    words = set(vocab.get_index_to_token_vocabulary().values())
    assert 'a' in words
    assert 'b' in words
    assert 'c' not in words
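For reference, a minimal sketch of how the archive URI used above can be built; this assumes the helper lives in allennlp.modules.token_embedders.embedding (AllenNLP 0.x) and the resulting string format is inferred from that source.
from allennlp.modules.token_embedders.embedding import format_embeddings_file_uri

# Builds a "(archive)#member" style URI pointing at one file inside the archive,
# e.g. "(embeddings-archive.zip)#embedding.3d.vec" (format assumed from the 0.x source).
uri = format_embeddings_file_uri("embeddings-archive.zip", "embedding.3d.vec")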
Example 2: test_from_dataset_respects_max_vocab_size_single_int
# Required import: from allennlp.data.vocabulary import Vocabulary [as alias]
# Or: from allennlp.data.vocabulary.Vocabulary import from_instances [as alias]
def test_from_dataset_respects_max_vocab_size_single_int(self):
    max_vocab_size = 1
    vocab = Vocabulary.from_instances(self.dataset, max_vocab_size=max_vocab_size)
    words = vocab.get_index_to_token_vocabulary().values()
    # Additional 2 tokens are '@@PADDING@@' and '@@UNKNOWN@@' by default
    assert len(words) == max_vocab_size + 2

    vocab = Vocabulary.from_instances(self.dataset, min_count=None)
    words = vocab.get_index_to_token_vocabulary().values()
    assert len(words) == 5
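max_vocab_size can also be given per namespace; a hedged sketch (the dict form is assumed from the AllenNLP 0.x API, and the sizes are illustrative):
# Hypothetical: cap 'tokens' at 1000 entries and 'labels' at 50, leaving any
# other namespace unlimited.
vocab = Vocabulary.from_instances(self.dataset,
                                  max_vocab_size={'tokens': 1000, 'labels': 50})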
Example 3: test_from_dataset_respects_min_count
# Required import: from allennlp.data.vocabulary import Vocabulary [as alias]
# Or: from allennlp.data.vocabulary.Vocabulary import from_instances [as alias]
def test_from_dataset_respects_min_count(self):
    vocab = Vocabulary.from_instances(self.dataset, min_count={'tokens': 4})
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' not in words
    assert 'c' not in words

    vocab = Vocabulary.from_instances(self.dataset, min_count=None)
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' in words
    assert 'c' in words
Example 4: test_vocab_from_instances_namespaces
# Required import: from allennlp.data.vocabulary import Vocabulary [as alias]
# Or: from allennlp.data.vocabulary.Vocabulary import from_instances [as alias]
def test_vocab_from_instances_namespaces(self):
    reader = CcgBankDatasetReader(feature_labels=['modified_pos', 'original_pos', 'predicate_arg'])
    instances = ensure_list(reader.read(self.FIXTURES_ROOT / 'data' / 'ccgbank.txt'))
    # check that we didn't clobber the labels namespace
    vocab = Vocabulary.from_instances(instances)
    self.assertSetEqual(
            set(vocab._token_to_index.keys()),  # pylint: disable=protected-access
            {'tokens', 'labels', 'modified_pos_tags', 'original_pos_tags',
             'predicate_arg_tags'}
    )
Example 5: test_from_dataset_respects_inclusive_embedding_file
# Required import: from allennlp.data.vocabulary import Vocabulary [as alias]
# Or: from allennlp.data.vocabulary.Vocabulary import from_instances [as alias]
def test_from_dataset_respects_inclusive_embedding_file(self):
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
        embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))

    vocab = Vocabulary.from_instances(self.dataset,
                                      min_count={'tokens': 4},
                                      pretrained_files={'tokens': embeddings_filename},
                                      only_include_pretrained_words=False)
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' in words
    assert 'c' not in words

    vocab = Vocabulary.from_instances(self.dataset,
                                      pretrained_files={'tokens': embeddings_filename},
                                      only_include_pretrained_words=False)
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' in words
    assert 'c' in words
Example 6: setUp
# Required import: from allennlp.data.vocabulary import Vocabulary [as alias]
# Or: from allennlp.data.vocabulary.Vocabulary import from_instances [as alias]
def setUp(self):
    super().setUp()
    self.base_reader = SequenceTaggingDatasetReader(lazy=True)
    base_file_path = AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'

    # Make 100 copies of the data
    raw_data = open(base_file_path).read()
    for i in range(100):
        file_path = self.TEST_DIR / f'sequence_tagging_{i}.tsv'
        with open(file_path, 'w') as f:
            f.write(raw_data)

    self.glob = str(self.TEST_DIR / 'sequence_tagging_*.tsv')

    # For some of the tests we need a vocab, we'll just use the base_reader for that.
    self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))
Example 7: test_saving_and_loading_works_with_byte_encoding
# Required import: from allennlp.data.vocabulary import Vocabulary [as alias]
# Or: from allennlp.data.vocabulary.Vocabulary import from_instances [as alias]
def test_saving_and_loading_works_with_byte_encoding(self):
    # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
    # vocab, load the vocab, then index the text field again, and make sure we get the same
    # result.
    tokenizer = CharacterTokenizer(byte_encoding='utf-8')
    token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
    tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
    text_field = TextField(tokens, {"characters": token_indexer})
    dataset = Batch([Instance({"sentence": text_field})])
    vocab = Vocabulary.from_instances(dataset)
    text_field.index(vocab)
    indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

    vocab_dir = self.TEST_DIR / 'vocab_save'
    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)

    text_field2 = TextField(tokens, {"characters": token_indexer})
    text_field2.index(vocab2)
    indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
    assert indexed_tokens == indexed_tokens2
Example 8: get_metrics
# Required import: from allennlp.data.vocabulary import Vocabulary [as alias]
# Or: from allennlp.data.vocabulary.Vocabulary import from_instances [as alias]
def get_metrics(self, reset: bool = False) -> Dict[str, float]:
    return {"accuracy": self.accuracy.get_metric(reset)}
#### Now that we've implemented a <code>DatasetReader</code> and <code>Model</code>, we're ready to train. We first need an instance of our dataset reader.
reader = PosDatasetReader()
#### Which we can use to read in the training and validation data. Here we read them from a URL, but you could just as well read them from local files. We use <code>cached_path</code> to cache the files locally (and to hand <code>reader.read</code> the path to the cached local copy).
train_dataset = reader.read(cached_path(
    'https://raw.githubusercontent.com/allenai/allennlp'
    '/master/tutorials/tagger/training.txt'))
validation_dataset = reader.read(cached_path(
    'https://raw.githubusercontent.com/allenai/allennlp'
    '/master/tutorials/tagger/validation.txt'))
#### Once we've read in the datasets, we use them to create our <code>Vocabulary</code> (that is, the mapping[s] from tokens / labels to ids).
vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
#### Now we need to construct the model. We'll choose a size for our embedding layer and for the hidden layer of our LSTM.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6
#### For embedding the tokens we'll just use the <code>BasicTextFieldEmbedder</code> which takes a mapping from index names to embeddings. If you go back to where we defined our <code>DatasetReader</code>, the default parameters included a single index called "tokens", so our mapping just needs an embedding corresponding to that index. We use the <code>Vocabulary</code> to find how many embeddings we need and our <code>EMBEDDING_DIM</code> parameter to specify the output dimension. It's also possible to start with pre-trained embeddings (for example, GloVe vectors), but there's no need to do that on this tiny toy dataset.
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
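#### As an aside that is not in the original tutorial: if you did want to start from pre-trained vectors, the sketch below shows one way under the assumption of the AllenNLP 0.x API (<code>Embedding.from_params</code> with a <code>pretrained_file</code> key); the GloVe path is a placeholder, not a real file here.
from allennlp.common import Params
# Hypothetical sketch: load 100-dimensional GloVe vectors for the 'tokens' namespace.
# With these vectors EMBEDDING_DIM would have to be 100 so the LSTM input size matches.
glove_embedding = Embedding.from_params(vocab=vocab, params=Params({
        "pretrained_file": "/path/to/glove.6B.100d.txt.gz",  # placeholder path
        "embedding_dim": 100,
        "trainable": True}))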
#### We next need to specify the sequence encoder. The need for <code>PytorchSeq2SeqWrapper</code> here is slightly unfortunate (and if you use <a href = "https://github.com/allenai/allennlp/blob/master/tutorials/tagger/README.md#using-config-files">configuration files</a> you won't need to worry about it) but here it's required to add some extra functionality (and a cleaner interface) to the built-in PyTorch module. In AllenNLP we do everything batch first, so we specify that as well.
lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
#### Finally, we can instantiate the model.
model = LstmTagger(word_embeddings, lstm, vocab)
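#### The original tutorial goes on to train this model with an optimizer, a data iterator, and AllenNLP's <code>Trainer</code>. The sketch below follows that pattern, assuming the AllenNLP 0.x API; the hyperparameters are illustrative.
import torch.optim as optim
from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer

optimizer = optim.SGD(model.parameters(), lr=0.1)
# Batch sentences of similar length together; the iterator needs the vocab to index instances.
iterator = BucketIterator(batch_size=2, sorting_keys=[("sentence", "num_tokens")])
iterator.index_with(vocab)
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=1000)
trainer.train()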