This article collects typical usage examples of the Vocabulary.from_params method from the Python allennlp.data module. If you are unsure exactly what Vocabulary.from_params does or how to call it, the curated code samples below may help. You can also explore further usage examples of the containing class, allennlp.data.Vocabulary.
The following shows 15 code examples of the Vocabulary.from_params method, sorted by popularity by default.
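Before the examples, here is a minimal sketch (written for this article, not taken from any example below) of the two call patterns that recur throughout: building a vocabulary from a collection of instances, and loading one from a previously saved directory. The directory path is a placeholder, and the exact parameter set accepted by Vocabulary.from_params varies across AllenNLP versions.

from allennlp.common import Params
from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

# Pattern 1: count tokens in an iterable of Instances; config keys such as
# "min_count" or "non_padded_namespaces" are applied if present.
instances = [Instance({"tokens": TextField([Token("hello"), Token("world")],
                                           {"tokens": SingleIdTokenIndexer()})})]
vocab = Vocabulary.from_params(Params({"min_count": {"tokens": 1}}), instances=instances)

# Pattern 2: load a vocabulary previously written with vocab.save_to_files().
# "path/to/vocabulary" is a placeholder directory, so this line is left commented out.
# vocab = Vocabulary.from_params(Params({"directory_path": "path/to/vocabulary"}))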
Example 1: set_up_model
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def set_up_model(self, param_file, dataset_file):
    self.param_file = param_file
    params = Params.from_file(self.param_file)

    reader = DatasetReader.from_params(params["dataset_reader"])
    # The dataset reader might be lazy, but a lazy list here breaks some of our tests.
    instances = reader.read(str(dataset_file))
    # Use parameters for vocabulary if they are present in the config file, so that choices like
    # "non_padded_namespaces", "min_count" etc. can be set if needed.
    if "vocabulary" in params:
        vocab_params = params["vocabulary"]
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    self.vocab = vocab
    self.instances = instances
    self.instances.index_with(vocab)
    self.model = Model.from_params(vocab=self.vocab, params=params["model"])

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(list(self.instances))
    self.dataset.index_instances(self.vocab)
Example 2: set_up_model
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def set_up_model(self, param_file, dataset_file):
    # pylint: disable=attribute-defined-outside-init
    self.param_file = param_file
    params = Params.from_file(self.param_file)

    reader = DatasetReader.from_params(params[u'dataset_reader'])
    instances = reader.read(dataset_file)
    # Use parameters for vocabulary if they are present in the config file, so that choices like
    # "non_padded_namespaces", "min_count" etc. can be set if needed.
    if u'vocabulary' in params:
        vocab_params = params[u'vocabulary']
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    self.vocab = vocab
    self.instances = instances
    self.model = Model.from_params(vocab=self.vocab, params=params[u'model'])

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(self.instances)
    self.dataset.index_instances(self.vocab)
Example 3: get_bert_test_fixture
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def get_bert_test_fixture():
    embedder_params = {
        "type": "bert-pretrained",
        "pretrained_model": "tests/fixtures/bert/bert_test_fixture.tar.gz",
        "requires_grad": True,
        "top_layer_only": True,
    }
    embedder_params_copy = dict(embedder_params)
    embedder = TokenEmbedder.from_params(Params(embedder_params))

    indexer_params = {
        "type": "bert-pretrained",
        "pretrained_model": "tests/fixtures/bert/vocab.txt",
        "do_lowercase": True,
        "use_starting_offsets": True,
        "max_pieces": 512,
    }
    indexer_params_copy = dict(indexer_params)
    indexer = TokenIndexer.from_params(Params(indexer_params))

    return {'embedder': embedder, 'embedder_params': embedder_params_copy,
            'indexer': indexer, 'indexer_params': indexer_params_copy}
Example 4: test_kg_reader_with_eval
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def test_kg_reader_with_eval(self):
    train_file = 'tests/fixtures/kg_embeddings/wn18rr_train.txt'
    dev_file = 'tests/fixtures/kg_embeddings/wn18rr_dev.txt'

    train_instances = KGTupleReader().read(train_file)

    reader = KGTupleReader(extra_files_for_gold_pairs=[train_file])
    instances = reader.read(dev_file)
    self.assertEqual(len(instances), 2)

    vocab = Vocabulary.from_params(Params({}), train_instances + instances)
    iterator = BasicIterator(batch_size=32)
    iterator.index_with(vocab)
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        pass

    expected_entity = [1, 5]
    expected_relation = ['_hypernym', '_hypernym_reverse']
    expected_entity2 = [[5, 2, 3], [1, 4]]
    self._check_batch(batch, vocab,
                      expected_entity, expected_relation, expected_entity2)
Example 5: test_read
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def test_read(self):
    params, file_paths = get_dataset_params_paths(["ner"])

    multitask_reader = DatasetReader.from_params(params)
    dataset = multitask_reader.read(file_paths)

    # check all the instances -- "original_pos_tags" should be absent
    # from the NER instances and present in the CCG instances
    for name, instances in dataset.datasets.items():
        self.assertTrue(name in ('ner', 'ccg'))
        for instance in instances:
            if name == 'ner':
                self.assertTrue("original_pos_tags" not in instance.fields)
            else:
                self.assertTrue("original_pos_tags" in instance.fields)

    # when iterating directly, only get 'ner'
    for instance in dataset:
        self.assertTrue("original_pos_tags" not in instance.fields)
Example 6: test_knowbert_wiki_wordnet
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def test_knowbert_wiki_wordnet(self):
    from kb.testing import get_bert_pretraining_reader_with_kg

    reader = get_bert_pretraining_reader_with_kg(
        mask_candidate_strategy='full_mask', masked_lm_prob=0.35, include_wiki=True)
    instances = reader.read("tests/fixtures/bert_pretraining/shard1.txt")

    vocab = Vocabulary.from_params(Params({
        "directory_path": "tests/fixtures/wordnet_wiki_vocab",
    }))
    iterator = BasicIterator()
    iterator.index_with(vocab)

    for batch in iterator(instances, num_epochs=1, shuffle=False):
        pass

    # hack, incompatible fixtures...
    batch['tokens']['tokens'] = torch.min(batch['tokens']['tokens'], torch.tensor([17]))
    batch['lm_label_ids']['lm_labels'] = torch.min(batch['lm_label_ids']['lm_labels'], torch.tensor([17]))

    model = get_knowbert(vocab, None, include_wiki=True)
    output = model(**batch)

    loss = output['loss']
    loss.backward()
    self.assertTrue(True)
Example 7: set_up_model
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def set_up_model(self, param_file, dataset_file):
    # pylint: disable=attribute-defined-outside-init
    self.param_file = param_file
    params = Params.from_file(self.param_file)

    reader = DatasetReader.from_params(params['dataset_reader'])
    # The dataset reader might be lazy, but a lazy list here breaks some of our tests.
    instances = list(reader.read(str(dataset_file)))
    # Use parameters for vocabulary if they are present in the config file, so that choices like
    # "non_padded_namespaces", "min_count" etc. can be set if needed.
    if 'vocabulary' in params:
        vocab_params = params['vocabulary']
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    self.vocab = vocab
    self.instances = instances
    self.model = Model.from_params(vocab=self.vocab, params=params['model'])

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(self.instances)
    self.dataset.index_instances(self.vocab)
Example 8: from_params
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
@classmethod
def from_params(cls, params: Params) -> 'WordSplitter':
    params.assert_empty(cls.__name__)
    return cls()
Example 9: setup_method
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def setup_method(self):
    super().setup_method()
    params = Params(
        {
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
                },
                "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
            },
            "dataset_reader": {"type": "sequence_tagging"},
            "train_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
            "validation_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
            "data_loader": {"batch_size": 2},
            "trainer": {"cuda_device": -1, "num_epochs": 2, "optimizer": "adam"},
        }
    )
    all_datasets = datasets_from_params(params)
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        instances=(instance for dataset in all_datasets.values() for instance in dataset),
    )
    model = Model.from_params(vocab=vocab, params=params.pop("model"))
    train_data = all_datasets["train"]
    train_data.index_with(vocab)

    data_loader = DataLoader.from_params(dataset=train_data, params=params.pop("data_loader"))
    trainer_params = params.pop("trainer")
    serialization_dir = os.path.join(self.TEST_DIR, "test_search_learning_rate")
    self.trainer = Trainer.from_params(
        model=model,
        serialization_dir=serialization_dir,
        data_loader=data_loader,
        train_data=train_data,
        params=trainer_params,
        validation_data=None,
        validation_iterator=None,
    )
Example 10: make_vocab_from_params
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def make_vocab_from_params(params, serialization_dir):
    prepare_environment(params)

    vocab_params = params.pop(u"vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, u"vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError(u"The 'vocabulary' directory in the provided "
                                 u"serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop(u"datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(u"invalid 'dataset_for_vocab_creation' {}".format(dataset))

    logger.info(u"From dataset instances, %s will be considered for vocabulary creation.",
                u", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in list(all_datasets.items())
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)

    logger.info(u"writing the vocabulary to %s.", vocab_dir)
    vocab.save_to_files(vocab_dir)
    logger.info(u"done creating vocab")
Example 11: datasets_from_params
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def datasets_from_params(params):
    u"""
    Load all the datasets specified by the config.
    """
    dataset_reader = DatasetReader.from_params(params.pop(u'dataset_reader'))
    validation_dataset_reader_params = params.pop(u"validation_dataset_reader", None)

    validation_and_test_dataset_reader = dataset_reader
    if validation_dataset_reader_params is not None:
        logger.info(u"Using a separate dataset reader to load validation and test data.")
        validation_and_test_dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)

    train_data_path = params.pop(u'train_data_path')
    logger.info(u"Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    datasets = {u"train": train_data}

    validation_data_path = params.pop(u'validation_data_path', None)
    if validation_data_path is not None:
        logger.info(u"Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(validation_data_path)
        datasets[u"validation"] = validation_data

    test_data_path = params.pop(u"test_data_path", None)
    if test_data_path is not None:
        logger.info(u"Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets[u"test"] = test_data

    return datasets
Example 12: write_for_official_eval
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def write_for_official_eval(model_archive_file, test_file, output_file,
                            label_ids_to_label):
    archive = load_archive(model_archive_file)
    model = archive.model

    reader = DatasetReader.from_params(archive.config['dataset_reader'])
    iterator = DataIterator.from_params(Params({"type": "basic", "batch_size": 4}))
    vocab = Vocabulary.from_params(archive.config['vocabulary'])
    iterator.index_with(vocab)

    model.cuda()
    model.eval()

    instances = reader.read(test_file)
    predictions = []
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        batch = move_to_device(batch, cuda_device=0)
        output = model(**batch)
        batch_labels = [
            label_ids_to_label[i]
            for i in output['predictions'].cpu().numpy().tolist()
        ]
        predictions.extend(batch_labels)

    to_write = ''.join(["{}\t{}\n".format(i + 8001, e) for i, e in enumerate(model.metrics[0].pred)])
    with open(output_file, 'w') as fout:
        fout.write(to_write)
Example 13: write_for_official_eval
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def write_for_official_eval(model_archive_file, test_file, output_file):
    archive = load_archive(model_archive_file)
    model = archive.model

    reader = DatasetReader.from_params(archive.config['dataset_reader'])
    iterator = DataIterator.from_params(Params({"type": "basic", "batch_size": 32}))
    vocab = Vocabulary.from_params(archive.config['vocabulary'])
    iterator.index_with(vocab)

    model.cuda()
    model.eval()

    label_ids_to_label = {0: 'F', 1: 'T'}

    instances = reader.read(test_file)
    predictions = []
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        batch = move_to_device(batch, cuda_device=0)
        output = model(**batch)
        batch_labels = [
            label_ids_to_label[i]
            for i in output['predictions'].cpu().numpy().tolist()
        ]
        predictions.extend(batch_labels)

    assert len(predictions) == 1400

    with open(output_file, 'w') as fout:
        for p in predictions:
            fout.write("{}\n".format(p))
Example 14: write_for_official_eval
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def write_for_official_eval(model_archive_file, test_file, output_file,
                            label_ids_to_label):
    archive = load_archive(model_archive_file)
    model = archive.model

    reader = DatasetReader.from_params(archive.config['dataset_reader'])
    iterator = DataIterator.from_params(Params({"type": "basic", "batch_size": 4}))
    vocab = Vocabulary.from_params(archive.config['vocabulary'])
    iterator.index_with(vocab)

    model.cuda()
    model.eval()

    instances = reader.read(test_file)
    predictions = []
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        batch = move_to_device(batch, cuda_device=0)
        output = model(**batch)
        batch_labels = [
            label_ids_to_label[i]
            for i in output['predictions'].cpu().numpy().tolist()
        ]
        predictions.extend(batch_labels)

    with open(output_file, 'w') as fout:
        for p in predictions:
            fout.write("{}\n".format(p))
Example 15: __init__
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def __init__(self, model_archive, batch_size=32,
             masking_strategy=None,
             wordnet_entity_file=None, vocab_dir=None):
    # get bert_tokenizer_and_candidate_generator
    config = _extract_config_from_archive(cached_path(model_archive))

    # look for the bert_tokenizers and candidate_generator
    candidate_generator_params = _find_key(
        config['dataset_reader'].as_dict(), 'tokenizer_and_candidate_generator'
    )

    if wordnet_entity_file is not None:
        candidate_generator_params['entity_candidate_generators']['wordnet']['entity_file'] = wordnet_entity_file

    self.tokenizer_and_candidate_generator = TokenizerAndCandidateGenerator.\
        from_params(Params(candidate_generator_params))
    self.tokenizer_and_candidate_generator.whitespace_tokenize = False

    assert masking_strategy is None or masking_strategy == 'full_mask'
    self.masking_strategy = masking_strategy

    # need bert_tokenizer_and_candidate_generator
    if vocab_dir is not None:
        vocab_params = Params({"directory_path": vocab_dir})
    else:
        vocab_params = config['vocabulary']
    self.vocab = Vocabulary.from_params(vocab_params)

    self.iterator = DataIterator.from_params(
        Params({"type": "basic", "batch_size": batch_size})
    )
    self.iterator.index_with(self.vocab)