This article collects typical usage examples of the Python method allennlp.data.vocabulary.Vocabulary.from_files. If you are asking yourself what Vocabulary.from_files does, how to call it, or what real-world uses look like, the curated examples below should help. You can also explore further usage examples of the containing class, allennlp.data.vocabulary.Vocabulary.
The sections below present 7 code examples of Vocabulary.from_files, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
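As a quick orientation before the examples, here is a minimal, self-contained sketch of the round trip that from_files completes: a Vocabulary is serialized with save_to_files and later restored from the same directory. The directory name "my_vocab" is a hypothetical placeholder.

from allennlp.data.vocabulary import Vocabulary

# build a small vocabulary in memory and register a token in the default namespace
vocab = Vocabulary()
vocab.add_token_to_namespace("hello", namespace="tokens")

# serialize it to a directory (one .txt file per namespace plus metadata) ...
vocab.save_to_files("my_vocab")

# ... and load it back with the method this article covers
restored = Vocabulary.from_files("my_vocab")
assert restored.get_token_index("hello") == vocab.get_token_index("hello")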
Example 1: get_generator

# Required import: from allennlp.data.vocabulary import Vocabulary [as alias]
# Or: from allennlp.data.vocabulary.Vocabulary import from_files [as alias]
def get_generator(self,
                  model_path: str,
                  token_vocab_path: str,
                  stress_vocab_dump_path: str) -> Generator:
    if self.generator is None:
        assert os.path.isdir(model_path) and os.path.isdir(token_vocab_path)
        vocabulary = Vocabulary.from_files(token_vocab_path)
        stress_vocabulary = StressVocabulary()
        if not os.path.isfile(stress_vocab_dump_path):
            stress_vocabulary = inflate_stress_vocabulary(vocabulary, self.get_stress_predictor())
            stress_vocabulary.save(stress_vocab_dump_path)
        else:
            stress_vocabulary.load(stress_vocab_dump_path)
        eos_index = vocabulary.get_token_index(END_SYMBOL)
        unk_index = vocabulary.get_token_index(DEFAULT_OOV_TOKEN)
        exclude_transform = ExcludeTransform((unk_index, eos_index))
        model = LanguageModel.load(model_path, vocabulary_dir=token_vocab_path,
                                   transforms=[exclude_transform, ])
        self.generator = Generator(model, vocabulary, stress_vocabulary, eos_index)
    return self.generator
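The special symbols looked up in this example are constants shipped with AllenNLP. A short sketch of where they live and of the OOV fallback behavior of get_token_index, assuming a hypothetical vocabulary directory called "token_vocab":

from allennlp.common.util import END_SYMBOL                         # "@end@"
from allennlp.data.vocabulary import DEFAULT_OOV_TOKEN, Vocabulary  # "@@UNKNOWN@@"

vocab = Vocabulary.from_files("token_vocab")  # hypothetical directory
eos_index = vocab.get_token_index(END_SYMBOL)
unk_index = vocab.get_token_index(DEFAULT_OOV_TOKEN)
# in a padded namespace, unseen tokens fall back to the OOV index
assert vocab.get_token_index("definitely-not-in-vocab") == unk_index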
Example 2: multiprocess_training_loader

# Required import: from allennlp.data.vocabulary import Vocabulary [as alias]
# Or: from allennlp.data.vocabulary.Vocabulary import from_files [as alias]
def multiprocess_training_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event,
                                 _local_file, _fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data):

    # workflow: we tokenize the data files with the costly spacy tokenizer in a preprocessing step before training
    # (and concatenate the tokens with single whitespaces), so here we only split on the whitespaces
    _tokenizer = None
    if _config["preprocessed_tokenized"]:
        _tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    if _config["token_embedder_type"] == "embedding":
        _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        _vocab = Vocabulary.from_files(_config["vocab_directory"])
    elif _config["token_embedder_type"] == "fasttext":
        _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
        _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data, _config["fasttext_max_subwords"])
    elif _config["token_embedder_type"] == "elmo":
        _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
        _vocab = None

    _triple_loader = IrTripleDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                           max_doc_length=_config["max_doc_length"], max_query_length=_config["max_query_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_train"]),
                               sorting_keys=[("doc_pos_tokens", "num_tokens"), ("doc_neg_tokens", "num_tokens")])
    _iterator.index_with(_vocab)

    for training_batch in _iterator(_triple_loader.read(_local_file), num_epochs=1):
        _queue.put(training_batch)  # this moves the tensors into shared memory

    _queue.close()          # indicate this local thread is done
    _wait_for_exit.wait()   # keep this process alive until all the shared memory is used and no longer needed

#
# validation instance generator
# - fills the _queue with ready-to-run validation batches
# - everything is defined thread-locally
#
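Example 2's workflow comment mentions that the data files are tokenized with the costly spaCy tokenizer offline, with tokens re-joined by single whitespaces. A hypothetical sketch of such a preprocessing step using AllenNLP's SpacyWordSplitter (the file names and tab-separated layout are assumptions, not taken from the source project):

from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

splitter = SpacyWordSplitter()  # the costly tokenizer, run once before training

with open("triples.raw.tsv") as fin, open("triples.tokenized.tsv", "w") as fout:
    for line in fin:
        columns = line.rstrip("\n").split("\t")
        # join tokens with single whitespaces so JustSpacesWordSplitter can re-split them cheaply
        fout.write("\t".join(" ".join(t.text for t in splitter.split_words(col)) for col in columns) + "\n")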
Example 3: multiprocess_validation_loader

# Required import: from allennlp.data.vocabulary import Vocabulary [as alias]
# Or: from allennlp.data.vocabulary.Vocabulary import from_files [as alias]
def multiprocess_validation_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event,
                                   _local_file, _fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data):

    # workflow: we tokenize the data files with the costly spacy tokenizer in a preprocessing step before training
    # (and concatenate the tokens with single whitespaces), so here we only split on the whitespaces
    _tokenizer = None
    if _config and _config["preprocessed_tokenized"]:
        _tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    if _config["token_embedder_type"] == "embedding":
        _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        _vocab = Vocabulary.from_files(_config["vocab_directory"])
    elif _config["token_embedder_type"] == "fasttext":
        _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
        _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data, _config["fasttext_max_subwords"])
    elif _config["token_embedder_type"] == "elmo":
        _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
        _vocab = None

    _tuple_loader = IrLabeledTupleDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                                max_doc_length=_config["max_doc_length"], max_query_length=_config["max_query_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                               sorting_keys=[("doc_tokens", "num_tokens"), ("query_tokens", "num_tokens")])
    _iterator.index_with(_vocab)

    for training_batch in _iterator(_tuple_loader.read(_local_file), num_epochs=1):
        _queue.put(training_batch)  # this moves the tensors into shared memory

    _queue.close()          # indicate this local thread is done
    _wait_for_exit.wait()   # keep this process alive until all the shared memory is used and no longer needed
Example 4: __init__

# Required import: from allennlp.data.vocabulary import Vocabulary [as alias]
# Or: from allennlp.data.vocabulary.Vocabulary import from_files [as alias]
def __init__(self, vocab_path=None, model_paths=None,
             weigths=None,
             max_len=50,
             min_len=3,
             lowercase_tokens=False,
             log=False,
             iterations=3,
             min_probability=0.0,
             model_name='roberta',
             special_tokens_fix=1,
             is_ensemble=True,
             min_error_probability=0.0,
             confidence=0,
             resolve_cycles=False,
             ):
    self.model_weights = list(map(float, weigths)) if weigths else [1] * len(model_paths)
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.max_len = max_len
    self.min_len = min_len
    self.lowercase_tokens = lowercase_tokens
    self.min_probability = min_probability
    self.min_error_probability = min_error_probability
    self.vocab = Vocabulary.from_files(vocab_path)
    self.log = log
    self.iterations = iterations
    self.confidence = confidence
    self.resolve_cycles = resolve_cycles
    # set training parameters and operations

    self.indexers = []
    self.models = []
    print("Model paths:", model_paths)
    for model_path in model_paths:
        print("Model path:", model_path)
        if is_ensemble:
            model_name, special_tokens_fix = self._get_model_data(model_path)
        weights_name = get_weights_name(model_name, lowercase_tokens)
        self.indexers.append(self._get_indexer(weights_name, special_tokens_fix))
        model = Seq2Labels(vocab=self.vocab,
                           text_field_embedder=self._get_embbeder(weights_name, special_tokens_fix),
                           confidence=self.confidence
                           ).to(self.device)
        if torch.cuda.is_available():
            model.load_state_dict(torch.load(model_path))
        else:
            model.load_state_dict(torch.load(model_path,
                                             map_location=torch.device('cpu')))
        model.eval()
        self.models.append(model)
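The CPU/GPU branch at the end can be collapsed by always passing map_location; torch.load accepts a device and relocates CUDA-saved tensors accordingly. A minimal sketch with a hypothetical checkpoint path:

import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# map_location moves tensors saved on GPU to the CPU when no CUDA device is present
state_dict = torch.load("model.th", map_location=device)  # "model.th" is a placeholder path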
Example 5: multiprocess_training_loader

# Required import: from allennlp.data.vocabulary import Vocabulary [as alias]
# Or: from allennlp.data.vocabulary.Vocabulary import from_files [as alias]
def multiprocess_training_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event,
                                 _local_file, _fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data):

    torch.manual_seed(_config["random_seed"])
    numpy.random.seed(_config["random_seed"])
    random.seed(_config["random_seed"])

    if _config["token_embedder_type"] == "bert_cls":
        _tokenizer = BlingFireTokenizer()
        _ind = PretrainedBertIndexer(pretrained_model=_config["bert_pretrained_model"], do_lowercase=True)
        _token_indexers = {"tokens": _ind}

        _triple_loader = BertTripleDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                                 max_doc_length=_config["max_doc_length"], max_query_length=_config["max_query_length"],
                                                 min_doc_length=_config["min_doc_length"], min_query_length=_config["min_query_length"])

        _iterator = BucketIterator(batch_size=int(_config["batch_size_train"]),
                                   sorting_keys=[("doc_pos_tokens", "num_tokens"), ("doc_neg_tokens", "num_tokens")])

        _iterator.index_with(Vocabulary())  # .from_files(_config["vocab_directory"]))
    else:
        _tokenizer = BlingFireTokenizer()

        if _config["token_embedder_type"] == "embedding":
            _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
            _vocab = Vocabulary.from_files(_config["vocab_directory"])
        elif _config["token_embedder_type"] == "fasttext":
            _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
            _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data, _config["fasttext_max_subwords"])
        elif _config["token_embedder_type"] == "elmo":
            _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
            _vocab = None

        _triple_loader = IrTripleDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                               max_doc_length=_config["max_doc_length"], max_query_length=_config["max_query_length"],
                                               min_doc_length=_config["min_doc_length"], min_query_length=_config["min_query_length"])

        _iterator = BucketIterator(batch_size=int(_config["batch_size_train"]),
                                   sorting_keys=[("doc_pos_tokens", "num_tokens"), ("doc_neg_tokens", "num_tokens")])
        _iterator.index_with(_vocab)

    for training_batch in _iterator(_triple_loader.read(_local_file), num_epochs=1):
        _queue.put(training_batch)  # this moves the tensors into shared memory

    _queue.put(None)        # signal the end of the queue
    _queue.close()          # indicate this local thread is done
    _wait_for_exit.wait()   # keep this process alive until all the shared memory is used and no longer needed

#
# validation instance generator
# - fills the _queue with ready-to-run validation batches
# - everything is defined thread-locally
#

Author: sebastian-hofstaetter, Project: transformer-kernel-ranking, Lines of code: 60, Source file: multiprocess_input_pipeline.py
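Unlike example 2, this loader enqueues a None sentinel before closing the queue. A hypothetical consumer-side generator that drains such a queue until the sentinel arrives:

def consume_batches(queue):
    """Yield training batches until the producer's None sentinel is received."""
    while True:
        batch = queue.get()
        if batch is None:
            break
        yield batch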
Example 6: multiprocess_validation_loader

# Required import: from allennlp.data.vocabulary import Vocabulary [as alias]
# Or: from allennlp.data.vocabulary.Vocabulary import from_files [as alias]
def multiprocess_validation_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event,
                                   _local_file, _fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data):

    torch.manual_seed(_config["random_seed"])
    numpy.random.seed(_config["random_seed"])
    random.seed(_config["random_seed"])

    if _config["token_embedder_type"] == "bert_cls":
        _tokenizer = BlingFireTokenizer()
        _ind = PretrainedBertIndexer(pretrained_model=_config["bert_pretrained_model"], do_lowercase=True)
        _token_indexers = {"tokens": _ind}

        _tuple_loader = BertLabeledTupleDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                                      max_doc_length=_config["max_doc_length"], max_query_length=_config["max_query_length"],
                                                      min_doc_length=_config["min_doc_length"], min_query_length=_config["min_query_length"])

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("doc_tokens", "num_tokens")])

        _iterator.index_with(Vocabulary.from_files(_config["vocab_directory"]))
    else:
        _tokenizer = BlingFireTokenizer()

        if _config["token_embedder_type"] == "embedding":
            _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
            _vocab = Vocabulary.from_files(_config["vocab_directory"])
        elif _config["token_embedder_type"] == "fasttext":
            _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
            _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data, _config["fasttext_max_subwords"])
        elif _config["token_embedder_type"] == "elmo":
            _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
            _vocab = None

        _tuple_loader = IrLabeledTupleDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                                    max_doc_length=_config["max_doc_length"], max_query_length=_config["max_query_length"],
                                                    min_doc_length=_config["min_doc_length"], min_query_length=_config["min_query_length"])

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("doc_tokens", "num_tokens"), ("query_tokens", "num_tokens")])
        _iterator.index_with(_vocab)

    for training_batch in _iterator(_tuple_loader.read(_local_file), num_epochs=1):
        _queue.put(training_batch)  # this moves the tensors into shared memory

    _queue.put(None)        # signal the end of the queue
    _queue.close()          # indicate this local thread is done
    _wait_for_exit.wait()   # keep this process alive until all the shared memory is used and no longer needed

#
# single sequence loader from multiple files
#

Author: sebastian-hofstaetter, Project: transformer-kernel-ranking, Lines of code: 58, Source file: multiprocess_input_pipeline.py
Example 7: multiprocess_single_sequence_loader

# Required import: from allennlp.data.vocabulary import Vocabulary [as alias]
# Or: from allennlp.data.vocabulary.Vocabulary import from_files [as alias]
def multiprocess_single_sequence_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event,
                                        _local_file, _fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data):

    torch.manual_seed(_config["random_seed"])
    numpy.random.seed(_config["random_seed"])
    random.seed(_config["random_seed"])

    if _config["token_embedder_type"] == "bert_cls":
        _tokenizer = BlingFireTokenizer()
        _ind = PretrainedBertIndexer(pretrained_model=_config["bert_pretrained_model"], do_lowercase=True)
        _token_indexers = {"tokens": _ind}

        _tuple_loader = IrSingleSequenceDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                                      max_seq_length=_config["max_doc_length"], min_seq_length=_config["min_doc_length"])

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("seq_tokens", "num_tokens")])

        _iterator.index_with(Vocabulary.from_files(_config["vocab_directory"]))
    else:
        _tokenizer = BlingFireTokenizer()

        if _config["token_embedder_type"] == "embedding":
            _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
            _vocab = Vocabulary.from_files(_config["vocab_directory"])
        elif _config["token_embedder_type"] == "fasttext":
            _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
            _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data, _config["fasttext_max_subwords"])
        elif _config["token_embedder_type"] == "elmo":
            _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
            _vocab = None

        _tuple_loader = IrSingleSequenceDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                                      max_seq_length=_config["max_doc_length"], min_seq_length=_config["min_doc_length"])

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("seq_tokens", "num_tokens")])
        _iterator.index_with(_vocab)

    for training_batch in _iterator(_tuple_loader.read(_local_file), num_epochs=1):
        _queue.put(training_batch)  # this moves the tensors into shared memory

    _queue.put(None)        # signal the end of the queue
    _queue.close()          # indicate this local thread is done
    _wait_for_exit.wait()   # keep this process alive until all the shared memory is used and no longer needed

Author: sebastian-hofstaetter, Project: transformer-kernel-ranking, Lines of code: 52, Source file: multiprocess_input_pipeline.py
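Finally, a hypothetical driver showing how one of these loader functions could be wired to a process, queue, and exit event. The config dict and shard file name are placeholders; the real project centralizes this logic in multiprocess_input_pipeline.py.

import torch.multiprocessing as mp

if __name__ == "__main__":
    config = {}  # fill with the keys the loader reads: token_embedder_type, batch_size_eval, ...
    queue = mp.Queue(30)  # bounded, so producers cannot outrun the consumer
    exit_event = mp.Event()

    worker = mp.Process(target=multiprocess_single_sequence_loader,
                        args=(0, config, queue, exit_event, "shard-0.tsv", None, None))
    worker.start()

    for batch in iter(queue.get, None):  # stops at the None sentinel
        pass  # feed the batch to the model here

    exit_event.set()  # allow the producer to exit once shared memory is no longer needed
    worker.join()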