This article collects typical usage examples of the Python method allennlp.data.Vocabulary.from_files. If you have been wondering what Vocabulary.from_files does and how to call it, the curated examples below should help; you can also read further about its containing class, allennlp.data.Vocabulary.
The following shows 4 code examples of Vocabulary.from_files, ordered by popularity by default.
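Before the full examples, here is a minimal sketch of the round trip they all rely on: a Vocabulary is persisted to a directory with save_to_files and restored later with Vocabulary.from_files. The directory name my_vocabulary and the tokens are illustrative placeholders, not taken from the examples below.

from allennlp.data import Vocabulary

# Build a tiny vocabulary by hand (in real code it usually comes from dataset instances).
vocab = Vocabulary()
vocab.add_token_to_namespace("hello", namespace="tokens")
vocab.add_token_to_namespace("world", namespace="tokens")

# Persist it as a directory of plain-text files ...
vocab.save_to_files("my_vocabulary")

# ... and restore it later, e.g. when loading a trained model.
restored = Vocabulary.from_files("my_vocabulary")
print(restored.get_vocab_size("tokens"))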
Example 1: _load
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_files [as alias]
def _load(cls,
          config: Params,
          serialization_dir: str,
          weights_file: str = None,
          cuda_device: int = -1) -> 'Model':
    """
    Instantiates an already-trained model, based on the experiment
    configuration and some optional overrides.
    """
    weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, 'vocabulary')
    vocab = Vocabulary.from_files(vocab_dir)

    model_params = config.get('model')

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from. We're now _loading_ the model, so those embeddings will already be
    # stored in our weights. We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)
    model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
    model.load_state_dict(model_state)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model
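In AllenNLP this _load classmethod is rarely called by hand; a trained model is usually restored from its archive, which ends up invoking _load (and therefore Vocabulary.from_files) internally. A minimal usage sketch, with model.tar.gz as a placeholder path:

from allennlp.models.archival import load_archive

archive = load_archive("model.tar.gz", cuda_device=-1)  # placeholder archive path
model = archive.model
vocab = model.vocab  # the Vocabulary that was restored via Vocabulary.from_files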
Example 2: load
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_files [as alias]
def load(cls,
         config: Params,
         serialization_dir: str,
         weights_file: str = None,
         cuda_device: int = -1) -> 'Model':
    """
    Instantiates an already-trained model, based on the experiment
    configuration and some optional overrides.

    Parameters
    ----------
    config: Params
        The configuration that was used to train the model. It should definitely
        have a `model` section, and should probably have a `trainer` section
        as well.
    serialization_dir: str = None
        The directory containing the serialized weights, parameters, and vocabulary
        of the model.
    weights_file: str = None
        By default we load the weights from `best.th` in the serialization
        directory, but you can override that value here.
    cuda_device: int = -1
        By default we load the model on the CPU, but if you want to load it
        for GPU usage you can specify the id of your GPU here

    Returns
    -------
    model: Model
        The model specified in the configuration, loaded with the serialized
        vocabulary and the trained weights.
    """
    weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, 'vocabulary')
    vocab = Vocabulary.from_files(vocab_dir)

    model_params = config.get('model')

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from. We're now _loading_ the model, so those embeddings will already be
    # stored in our weights. We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    _remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab, model_params)
    model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
    model.load_state_dict(model_state)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model
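As a usage sketch (not part of the original snippet), the public load above is typically driven from a serialization directory produced by training. The paths below are placeholders, and config.json is assumed to be the saved training configuration:

import os
from allennlp.common import Params
from allennlp.models import Model

serialization_dir = "trained_model"  # placeholder output directory from training
config = Params.from_file(os.path.join(serialization_dir, "config.json"))
model = Model.load(config,
                   serialization_dir=serialization_dir,
                   weights_file=os.path.join(serialization_dir, "best.th"),
                   cuda_device=-1)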
Example 3: train_model
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_files [as alias]
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during the middle
        of a run. For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    if recover and os.path.exists(os.path.join(serialization_dir, "vocabulary")):
        vocab = Vocabulary.from_files(os.path.join(serialization_dir, "vocabulary"))
    else:
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
        )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())

    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    #......... part of the code omitted here .........
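For completeness, a hedged sketch of how a training entry point like this is typically invoked; the config path and output directory are placeholders, and inside AllenNLP this logic normally runs behind the allennlp train command rather than being called directly:

from allennlp.common import Params

params = Params.from_file("experiments/my_experiment.jsonnet")  # placeholder config path
best_model = train_model(params,
                         serialization_dir="output/my_experiment",
                         file_friendly_logging=True,
                         recover=False,
                         force=False)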
Example 4: build_tasks
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_files [as alias]
def build_tasks(args):
    '''Prepare tasks'''

    def parse_tasks(task_list):
        '''parse string of tasks'''
        if task_list == 'all':
            tasks = ALL_TASKS
        elif task_list == 'none':
            tasks = []
        else:
            tasks = task_list.split(',')
        return tasks

    train_task_names = parse_tasks(args.train_tasks)
    eval_task_names = parse_tasks(args.eval_tasks)
    all_task_names = list(set(train_task_names + eval_task_names))
    tasks = get_tasks(all_task_names, args.max_seq_len, args.load_tasks)

    max_v_sizes = {'word': args.max_word_v_size}
    token_indexer = {}
    if args.elmo:
        token_indexer["elmo"] = ELMoTokenCharactersIndexer("elmo")
        if not args.elmo_no_glove:
            token_indexer["words"] = SingleIdTokenIndexer()
    else:
        token_indexer["words"] = SingleIdTokenIndexer()

    vocab_path = os.path.join(args.exp_dir, 'vocab')
    preproc_file = os.path.join(args.exp_dir, args.preproc_file)
    if args.load_preproc and os.path.exists(preproc_file):
        preproc = pkl.load(open(preproc_file, 'rb'))
        vocab = Vocabulary.from_files(vocab_path)
        word_embs = preproc['word_embs']
        for task in tasks:
            train, val, test = preproc[task.name]
            task.train_data = train
            task.val_data = val
            task.test_data = test
        log.info("\tFinished building vocab. Using %d words",
                 vocab.get_vocab_size('tokens'))
        log.info("\tLoaded data from %s", preproc_file)
    else:
        log.info("\tProcessing tasks from scratch")
        word2freq = get_words(tasks)
        vocab = get_vocab(word2freq, max_v_sizes)
        word_embs = get_embeddings(vocab, args.word_embs_file, args.d_word)
        preproc = {'word_embs': word_embs}
        for task in tasks:
            train, val, test = process_task(task, token_indexer, vocab)
            task.train_data = train
            task.val_data = val
            task.test_data = test
            del_field_tokens(task)
            preproc[task.name] = (train, val, test)
        log.info("\tFinished indexing tasks")
        pkl.dump(preproc, open(preproc_file, 'wb'))
        vocab.save_to_files(vocab_path)
        log.info("\tSaved data to %s", preproc_file)
        del word2freq
    del preproc

    train_tasks = [task for task in tasks if task.name in train_task_names]
    eval_tasks = [task for task in tasks if task.name in eval_task_names]
    log.info('\t Training on %s', ', '.join([task.name for task in train_tasks]))
    log.info('\t Evaluating on %s', ', '.join([task.name for task in eval_tasks]))
    return train_tasks, eval_tasks, vocab, word_embs
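The Vocabulary.from_files usage in this example follows a load-or-build caching pattern: reuse a previously saved vocabulary directory if it exists, otherwise build the vocabulary and save it for the next run. A minimal sketch of that pattern in isolation, where load_or_build_vocab, the path handling, and the use of Vocabulary.from_instances are illustrative assumptions rather than part of the original code:

import os
from allennlp.data import Vocabulary

def load_or_build_vocab(vocab_path, instances):
    """Reuse a cached vocabulary directory if present; otherwise build one and cache it."""
    if os.path.exists(vocab_path):
        return Vocabulary.from_files(vocab_path)
    vocab = Vocabulary.from_instances(instances)
    vocab.save_to_files(vocab_path)
    return vocab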