This article collects typical usage examples of the Vocabulary.from_params method from the Python package allennlp.data. If you are wondering what Vocabulary.from_params does and how to use it, the curated examples below should help. You can also look further into the containing class, allennlp.data.Vocabulary, for more usage examples.
The following shows 8 code examples of Vocabulary.from_params, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
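Before the full examples, here is a minimal, self-contained sketch of the core call pattern they all share: pass the (possibly empty) "vocabulary" sub-config as a Params object together with an iterable of instances. This is only an illustrative sketch, assuming an AllenNLP 0.x installation; the toy instances are made up for demonstration.

from allennlp.common import Params
from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

# Two toy instances so that from_params has token counts to work with.
indexers = {"tokens": SingleIdTokenIndexer()}
instances = [
    Instance({"tokens": TextField([Token(w) for w in "the cat sat".split()], indexers)}),
    Instance({"tokens": TextField([Token(w) for w in "the dog barked".split()], indexers)}),
]

# The pattern used throughout the examples below: the (possibly empty)
# "vocabulary" sub-config plus an iterable of instances.
vocab = Vocabulary.from_params(Params({}), instances)
print(vocab.get_vocab_size("tokens"))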
Example 1: make_vocab_from_params
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def make_vocab_from_params(params: Params, serialization_dir: str):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
Example 2: make_vocab_from_params
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def make_vocab_from_params(params: Params):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    vocab_dir = vocab_params.get('directory_path')
    if vocab_dir is None:
        raise ConfigurationError("To use `make-vocab` your configuration must contain a value "
                                 "at vocabulary.directory_path")

    os.makedirs(vocab_dir, exist_ok=True)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(Params({}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
Example 3: setUp
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def setUp(self):
    super().setUp()
    params = Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "token_embedders": {
                    "tokens": {
                        "type": "embedding",
                        "embedding_dim": 5
                    }
                }
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        },
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
        "validation_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {
            "cuda_device": -1,
            "num_epochs": 2,
            "optimizer": "adam"
        }
    })
    all_datasets = datasets_from_params(params)
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for dataset in all_datasets.values()
             for instance in dataset)
    )
    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    serialization_dir = os.path.join(self.TEST_DIR, 'test_search_learning_rate')

    self.trainer = Trainer.from_params(model,
                                       serialization_dir,
                                       iterator,
                                       train_data,
                                       params=trainer_params,
                                       validation_data=None,
                                       validation_iterator=None)
Example 4: dry_run_from_params
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)
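The no_grad handling at the end of this example (also used in Examples 7 and 8) is plain PyTorch: any parameter whose name matches one of the configured regexes has its gradient turned off. Here is a small standalone sketch of the same pattern; the module and regex are made up for illustration.

import re

import torch

model = torch.nn.Sequential(torch.nn.Embedding(10, 4), torch.nn.Linear(4, 2))
no_grad_regexes = [r"0\.weight"]  # hypothetical pattern: freeze the embedding weights

for name, parameter in model.named_parameters():
    if any(re.search(regex, name) for regex in no_grad_regexes):
        parameter.requires_grad_(False)

# Only "0.weight" (the embedding) should be reported as frozen here.
print([name for name, p in model.named_parameters() if not p.requires_grad])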
Example 5: set_up_model
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def set_up_model(self, param_file, dataset_file):
    # pylint: disable=attribute-defined-outside-init
    self.param_file = param_file
    params = Params.from_file(self.param_file)

    reader = DatasetReader.from_params(params['dataset_reader'])
    instances = reader.read(dataset_file)
    # Use parameters for vocabulary if they are present in the config file, so that choices like
    # "non_padded_namespaces", "min_count" etc. can be set if needed.
    if 'vocabulary' in params:
        vocab_params = params['vocabulary']
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    self.vocab = vocab
    self.instances = instances
    self.model = Model.from_params(vocab=self.vocab, params=params['model'])

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(self.instances)
    self.dataset.index_instances(self.vocab)
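When the config has no "vocabulary" block, this example falls back to Vocabulary.from_instances, which should behave the same as calling from_params with an empty Params object. A minimal sketch of that equivalence with toy instances, assuming an AllenNLP 0.x install:

from allennlp.common import Params
from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

indexers = {"tokens": SingleIdTokenIndexer()}
instances = [Instance({"tokens": TextField([Token(w) for w in text.split()], indexers)})
             for text in ("a b c", "a d e")]

vocab_default = Vocabulary.from_instances(instances)
vocab_from_params = Vocabulary.from_params(Params({}), instances)
assert vocab_default.get_vocab_size("tokens") == vocab_from_params.get_vocab_size("tokens")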
Example 6: train_model
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during the middle
        of a run. For continuing training a model on new data, see the ``fine-tune`` command.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator, cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
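A rough usage sketch for this train_model variant; the config path and serialization directory below are hypothetical placeholders, and the config is expected to contain the usual "dataset_reader", "model", "iterator", and "trainer" blocks.

from allennlp.common import Params

params = Params.from_file("experiment.json")     # hypothetical experiment config
model = train_model(params, "/tmp/tagger_run")   # hypothetical serialization directory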
Example 7: train_model
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during the middle
        of a run. For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(model=model,
                                                          serialization_dir=serialization_dir,
                                                          iterator=iterator,
                                                          train_data=train_data,
                                                          validation_data=validation_data,
                                                          params=trainer_params,
# ... the remainder of this example is omitted here ...
Example 8: find_learning_rate_model
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import from_params [as alias]
def find_learning_rate_model(params: Params, serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs a learning rate search over the given ``num_batches`` and saves the results in
    ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        Learning rate at which to start the search.
    end_lr: ``float``
        Learning rate up to which the search is done.
    num_batches: ``int``
        Number of mini-batches to run the learning rate finder for.
    linear_steps: ``bool``
        If ``True``, increase the learning rate linearly; otherwise increase it exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded by a multiple of the
        stopping factor. If ``None``, the search proceeds until ``end_lr``.
    force: ``bool``
        If ``True`` and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    else:
        os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  params=trainer_params,
                                  validation_data=None,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, losses = search_learning_rate(trainer,
                                                  start_lr=start_lr,
                                                  end_lr=end_lr,
                                                  num_batches=num_batches,
                                                  linear_steps=linear_steps,
                                                  stopping_factor=stopping_factor)
    logger.info('Finished learning rate search.')
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses, os.path.join(serialization_dir, 'lr-losses.png'))
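Several of the examples above persist the vocabulary with save_to_files; the counterpart for reading it back is Vocabulary.from_files, or, through from_params, a "directory_path" entry in the vocabulary config. Here is a minimal round-trip sketch, assuming an AllenNLP 0.x install; the temporary directory and the "hello" token are purely illustrative.

import tempfile

from allennlp.common import Params
from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_token_to_namespace("hello", namespace="tokens")

with tempfile.TemporaryDirectory() as vocab_dir:
    vocab.save_to_files(vocab_dir)

    # Load it back directly ...
    restored = Vocabulary.from_files(vocab_dir)

    # ... or via from_params, which the training commands above would do if the
    # config contained a "vocabulary": {"directory_path": ...} block.
    restored_via_params = Vocabulary.from_params(Params({"directory_path": vocab_dir}))

assert "hello" in restored.get_token_to_index_vocabulary("tokens")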