This article collects typical usage examples of the Python method allennlp.data.Vocabulary.from_instances. If you are unsure what Vocabulary.from_instances does or how to call it, the curated examples below should help; for more background, see the containing class, allennlp.data.Vocabulary.
The following shows 9 code examples of Vocabulary.from_instances, ordered by popularity by default.
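Before the individual examples, here is a minimal, self-contained sketch of the basic pattern. It is illustrative rather than drawn from the examples below, and the fixture path is hypothetical:
from allennlp.data import Vocabulary
from allennlp.data.dataset_readers import SequenceTaggingDatasetReader

# Read a small tagged-sentence fixture (hypothetical path).
instances = SequenceTaggingDatasetReader().read("fixtures/sequence_tagging.tsv")

# Count tokens and labels across all instances to build the vocabulary.
vocab = Vocabulary.from_instances(instances)

# Token indexers write into separate namespaces, typically "tokens" and "labels".
print(vocab.get_vocab_size("tokens"))
print(vocab.get_vocab_size("labels"))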
Example 1: set_up_model
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_instances [as alias]
def set_up_model(self, param_file, dataset_file):
    self.param_file = param_file
    params = Params.from_file(self.param_file)
    reader = DatasetReader.from_params(params["dataset_reader"])
    # The dataset reader might be lazy, but a lazy list here breaks some of our tests.
    instances = reader.read(str(dataset_file))
    # Use parameters for vocabulary if they are present in the config file, so that choices like
    # "non_padded_namespaces", "min_count" etc. can be set if needed.
    if "vocabulary" in params:
        vocab_params = params["vocabulary"]
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    self.vocab = vocab
    self.instances = instances
    self.instances.index_with(vocab)
    self.model = Model.from_params(vocab=self.vocab, params=params["model"])
    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(list(self.instances))
    self.dataset.index_instances(self.vocab)
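The "vocabulary" branch above lets the config file control vocabulary construction. As an illustrative sketch (the option values are hypothetical, but min_count and non_padded_namespaces are the options named in the comment above, and the from_params call matches the one in the example):
from allennlp.common import Params
from allennlp.data import Vocabulary

# Hypothetical "vocabulary" section of a config file: keep only tokens seen
# at least twice, and skip padding/OOV tokens for the "labels" namespace.
vocab_params = Params({
    "min_count": {"tokens": 2},
    "non_padded_namespaces": ["labels"],
})
vocab = Vocabulary.from_params(params=vocab_params, instances=instances)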
Example 2: setup_method
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_instances [as alias]
def setup_method(self):
    super().setup_method()
    self.instances = SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"
    )
    self.vocab = Vocabulary.from_instances(self.instances)
    self.model_params = Params(
        {
            "text_field_embedder": {
                "token_embedders": {
                    "tokens": {"type": "embedding", "embedding_dim": 5, "sparse": True}
                }
            },
            "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
        }
    )
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
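As a hedged aside (not part of the original test), the vocabulary built here is what sizes the model's layers: the embedder reads the "tokens" namespace and SimpleTagger reads "labels" for its output projection. A quick check, continuing in the same context:
# "tokens" sizes the embedding matrix; "labels" sizes the tag projection layer.
num_tokens = self.vocab.get_vocab_size("tokens")
num_labels = self.vocab.get_vocab_size("labels")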
Example 3: get_vocab_and_both_elmo_indexed_ids
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_instances [as alias]
def get_vocab_and_both_elmo_indexed_ids(batch):
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    indexer2 = SingleIdTokenIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens,
                          {u'character_ids': indexer,
                           u'tokens': indexer2})
        instance = Instance({u"elmo": field})
        instances.append(instance)
    dataset = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    dataset.index_instances(vocab)
    return vocab, dataset.as_tensor_dict()[u"elmo"]
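A hypothetical call (the sentences are made up) showing the expected input and output:
batch = [["The", "cat", "sat"], ["Dogs", "bark"]]
vocab, tensors = get_vocab_and_both_elmo_indexed_ids(batch)
# tensors is the tensor dict for the "elmo" field: "character_ids" holds the
# ELMo character encoding and "tokens" holds single-id word indices, both
# padded to the longest sentence in the batch.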
Example 4: setUp
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_instances [as alias]
def setUp(self):
    super(TestOptimizer, self).setUp()
    self.instances = SequenceTaggingDatasetReader().read(self.FIXTURES_ROOT / u'data' / u'sequence_tagging.tsv')
    vocab = Vocabulary.from_instances(self.instances)
    self.model_params = Params({
        u"text_field_embedder": {
            u"tokens": {
                u"type": u"embedding",
                u"embedding_dim": 5
            }
        },
        u"encoder": {
            u"type": u"lstm",
            u"input_size": 5,
            u"hidden_size": 7,
            u"num_layers": 2
        }
    })
    self.model = SimpleTagger.from_params(vocab=vocab, params=self.model_params)
Example 5: setUp
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_instances [as alias]
def setUp(self):
    super(TestTrainer, self).setUp()
    self.instances = SequenceTaggingDatasetReader().read(self.FIXTURES_ROOT / u'data' / u'sequence_tagging.tsv')
    vocab = Vocabulary.from_instances(self.instances)
    self.vocab = vocab
    self.model_params = Params({
        u"text_field_embedder": {
            u"tokens": {
                u"type": u"embedding",
                u"embedding_dim": 5
            }
        },
        u"encoder": {
            u"type": u"lstm",
            u"input_size": 5,
            u"hidden_size": 7,
            u"num_layers": 2
        }
    })
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01)
    self.iterator = BasicIterator(batch_size=2)
    self.iterator.index_with(vocab)
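For context, a sketch of how such a fixture is typically consumed under the pre-1.0 AllenNLP API (this loop is an illustration, not part of the original setUp):
# One pass over the fixture in batches of 2; the iterator yields padded tensor dicts.
for batch in self.iterator(self.instances, num_epochs=1):
    loss = self.model(**batch)["loss"]  # SimpleTagger returns a loss when tags are given
    loss.backward()
    self.optimizer.step()
    self.optimizer.zero_grad()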
Example 6: set_up_model
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_instances [as alias]
def set_up_model(self, param_file, dataset_file):
    # pylint: disable=attribute-defined-outside-init
    self.param_file = param_file
    params = Params.from_file(self.param_file)
    reader = DatasetReader.from_params(params['dataset_reader'])
    # The dataset reader might be lazy, but a lazy list here breaks some of our tests.
    instances = list(reader.read(str(dataset_file)))
    # Use parameters for vocabulary if they are present in the config file, so that choices like
    # "non_padded_namespaces", "min_count" etc. can be set if needed.
    if 'vocabulary' in params:
        vocab_params = params['vocabulary']
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    self.vocab = vocab
    self.instances = instances
    self.model = Model.from_params(vocab=self.vocab, params=params['model'])
    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(self.instances)
    self.dataset.index_instances(self.vocab)
Example 7: get_vocab_and_both_elmo_indexed_ids
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_instances [as alias]
# Also needed for the type annotation: from typing import List
def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    indexer2 = SingleIdTokenIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {"character_ids": indexer, "tokens": indexer2})
        instance = Instance({"elmo": field})
        instances.append(instance)
    dataset = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    dataset.index_instances(vocab)
    return vocab, dataset.as_tensor_dict()["elmo"]
Example 8: setup_method
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_instances [as alias]
def setup_method(self):
    super().setup_method()
    self.instances = SequenceTaggingDatasetReader().read(
        self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"
    )
    self.instances_lazy = SequenceTaggingDatasetReader(lazy=True).read(
        self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"
    )
    vocab = Vocabulary.from_instances(self.instances)
    self.vocab = vocab
    self.model_params = Params(
        {
            "text_field_embedder": {
                "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
            },
            "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
        }
    )
    self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
    self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
    self.data_loader = DataLoader(self.instances, batch_size=2, collate_fn=allennlp_collate)
    self.data_loader_lazy = DataLoader(
        self.instances_lazy, batch_size=2, collate_fn=allennlp_collate
    )
    self.validation_data_loader = DataLoader(
        self.instances, batch_size=2, collate_fn=allennlp_collate
    )
    self.instances.index_with(vocab)
    self.instances_lazy.index_with(vocab)
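For context (a sketch under the 1.x-style API above, not part of the original setup_method), the DataLoader yields batches that allennlp_collate has already padded and tensorized, so a minimal training loop is just:
# Minimal training loop over the fixture data, continuing in the same context.
for batch in self.data_loader:
    loss = self.model(**batch)["loss"]
    loss.backward()
    self.optimizer.step()
    self.optimizer.zero_grad()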
Example 9: test_batch_predictions_are_consistent
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_instances [as alias]
def test_batch_predictions_are_consistent(self):
    # The CNN encoder has problems with this kind of test - it's not properly masked yet, so
    # changing the amount of padding in the batch will result in small differences in the
    # output of the encoder. Because BiDAF is so deep, these differences get magnified through
    # the network and make this test impossible. So, we'll remove the CNN encoder entirely
    # from the model for this test. If/when we fix the CNN encoder to work correctly with
    # masking, we can change this back to how the other models run this test, with just a
    # single line.
    # pylint: disable=protected-access,attribute-defined-outside-init

    # Save some state.
    saved_model = self.model
    saved_instances = self.instances

    # Modify the state, run the test with modified state.
    params = Params.from_file(self.param_file)
    reader = DatasetReader.from_params(params[u'dataset_reader'])
    reader._token_indexers = {u'tokens': reader._token_indexers[u'tokens']}
    self.instances = reader.read(self.FIXTURES_ROOT / u'data' / u'squad.json')
    vocab = Vocabulary.from_instances(self.instances)
    for instance in self.instances:
        instance.index_fields(vocab)
    del params[u'model'][u'text_field_embedder'][u'token_embedders'][u'token_characters']
    params[u'model'][u'phrase_layer'][u'input_size'] = 2
    self.model = Model.from_params(vocab=vocab, params=params[u'model'])

    self.ensure_batch_predictions_are_consistent()

    # Restore the state.
    self.model = saved_model
    self.instances = saved_instances