This article collects typical usage examples of the Python method allennlp.data.iterators.BucketIterator. If you are wondering what exactly iterators.BucketIterator does, how to call it, or want to see it used in context, the curated code samples below may help. You can also read further about the module it belongs to, allennlp.data.iterators.
The following shows 6 code examples of iterators.BucketIterator, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
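Before the repository-specific examples, here is a minimal, self-contained sketch of the usual call pattern, assuming the pre-1.0 AllenNLP API that all of the examples below target (the toy instances and batch size are made up for illustration). BucketIterator sorts instances by the given length keys before batching, so each batch pads to roughly the same length and little computation is wasted on padding:

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.iterators import BucketIterator
from allennlp.data.token_indexers import SingleIdTokenIndexer

# build a few toy instances of different lengths
indexers = {"tokens": SingleIdTokenIndexer()}
instances = [
    Instance({"tokens": TextField([Token("a")] * n, token_indexers=indexers)})
    for n in (3, 7, 50)
]

# bucket by length so each batch pads to a similar size
iterator = BucketIterator(batch_size=2, sorting_keys=[("tokens", "num_tokens")])
iterator.index_with(Vocabulary.from_instances(instances))

for batch in iterator(instances, num_epochs=1):
    print(batch["tokens"]["tokens"].shape)  # padded (batch, max_len) tensor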
Example 1: multiprocess_training_loader
# Required import: from allennlp.data import iterators [as alias]
# Or: from allennlp.data.iterators import BucketIterator [as alias]
def multiprocess_training_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event, _local_file,
                                 _fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data):

    # workflow: we tokenize the data files with the costly spaCy pipeline in a preprocessing step before training
    # (and join the tokens with single whitespaces), so here we only split on the whitespaces
    _tokenizer = None
    if _config["preprocessed_tokenized"]:
        _tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    if _config["token_embedder_type"] == "embedding":
        _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        _vocab = Vocabulary.from_files(_config["vocab_directory"])
    elif _config["token_embedder_type"] == "fasttext":
        _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
        _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data,
                               _config["fasttext_max_subwords"])
    elif _config["token_embedder_type"] == "elmo":
        _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
        _vocab = None

    _triple_loader = IrTripleDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                           max_doc_length=_config["max_doc_length"],
                                           max_query_length=_config["max_query_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_train"]),
                               sorting_keys=[("doc_pos_tokens", "num_tokens"), ("doc_neg_tokens", "num_tokens")])
    _iterator.index_with(_vocab)

    for training_batch in _iterator(_triple_loader.read(_local_file), num_epochs=1):
        _queue.put(training_batch)  # this moves the tensors into shared memory

    _queue.close()         # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore
#
# validation instance generator
# - filling the _queue with ready to run validation batches
# - everything is defined thread local
#
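These loader functions are designed to run as producer processes: the training side spawns them and then pulls ready-made batches off the shared queue. Below is a minimal consumer-side sketch; the config values, file name, and batch count are placeholders, not taken from the repository:

import torch.multiprocessing as mp

# hypothetical config; the real dict is assembled elsewhere in the project
config = {"preprocessed_tokenized": True, "token_embedder_type": "embedding",
          "vocab_directory": "vocab_dir", "max_doc_length": 180,
          "max_query_length": 30, "batch_size_train": 32}

queue = mp.Queue(maxsize=8)  # bounded, so the producer cannot run far ahead
exit_event = mp.Event()

producer = mp.Process(target=multiprocess_training_loader,
                      args=(0, config, queue, exit_event, "train_triples.tsv", None, None))
producer.start()

# this variant sends no end-of-queue sentinel, so the consumer has to know
# how many batches to expect (placeholder value below) or poll with a timeout
expected_batches = 1000
for _ in range(expected_batches):
    training_batch = queue.get()  # tensors are already in shared memory
    ...                           # run one training step

exit_event.set()  # release the producer so it can exit
producer.join()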
Example 2: multiprocess_validation_loader
# Required import: from allennlp.data import iterators [as alias]
# Or: from allennlp.data.iterators import BucketIterator [as alias]
def multiprocess_validation_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event, _local_file,
                                   _fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data):

    # workflow: we tokenize the data files with the costly spaCy pipeline in a preprocessing step before training
    # (and join the tokens with single whitespaces), so here we only split on the whitespaces
    _tokenizer = None
    if _config and _config["preprocessed_tokenized"]:
        _tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    if _config["token_embedder_type"] == "embedding":
        _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        _vocab = Vocabulary.from_files(_config["vocab_directory"])
    elif _config["token_embedder_type"] == "fasttext":
        _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
        _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data,
                               _config["fasttext_max_subwords"])
    elif _config["token_embedder_type"] == "elmo":
        _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
        _vocab = None

    _tuple_loader = IrLabeledTupleDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                                max_doc_length=_config["max_doc_length"],
                                                max_query_length=_config["max_query_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                               sorting_keys=[("doc_tokens", "num_tokens"), ("query_tokens", "num_tokens")])
    _iterator.index_with(_vocab)

    for training_batch in _iterator(_tuple_loader.read(_local_file), num_epochs=1):
        _queue.put(training_batch)  # this moves the tensors into shared memory

    _queue.close()         # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore
Example 3: test_self_attn_iterator
# Required import: from allennlp.data import iterators [as alias]
# Or: from allennlp.data.iterators import BucketIterator [as alias]
def test_self_attn_iterator(self):
    indexer = {'tokens': SingleIdTokenIndexer()}

    # make 100 instances with random lengths of up to 500 tokens
    instances = []
    for k in range(100):
        length = max(int(torch.rand(1).item() * 500), 1)
        instances.append(Instance(
            {'tokens': TextField(
                [Token('a') for _ in range(length)], token_indexers=indexer)})
        )

    schedule = [[16, 128], [8, 256], [4, 512]]

    sub_iterator = BucketIterator(
        batch_size=16,
        sorting_keys=[['tokens', 'num_tokens']],
        padding_noise=0.0
    )

    it = SelfAttnBucketIterator(schedule, sub_iterator)
    it.index_with(Vocabulary())

    batches = [batch for batch in it(instances, num_epochs=1)]

    n_instances = 0
    for batch in batches:
        batch_size = batch['tokens']['tokens'].shape[0]
        n_instances += batch_size

        timesteps = batch['tokens']['tokens'].shape[1]
        if timesteps <= 128:
            expected_batch_size = 16
        elif timesteps <= 256:
            expected_batch_size = 8
        else:
            expected_batch_size = 4
        # batch might be smaller than expected if we split a larger batch
        # and the sequence length for the shorter segment falls into a lower
        # bucket
        self.assertTrue(batch_size <= expected_batch_size)

    self.assertEqual(n_instances, 100)
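The schedule pairs a batch size with the maximum padded length of its bucket: up to 128 timesteps the iterator emits batches of 16, up to 256 batches of 8, and anything longer batches of 4. The lookup rule that the assertions above encode can be written as a small helper (illustrative only, not part of the library):

from typing import List

def expected_batch_size(timesteps: int, schedule: List[List[int]]) -> int:
    """Return the batch size of the first bucket whose max length fits."""
    for batch_size, max_timesteps in schedule:
        if timesteps <= max_timesteps:
            return batch_size
    return schedule[-1][0]  # longest sequences fall into the last bucket

assert expected_batch_size(100, [[16, 128], [8, 256], [4, 512]]) == 16
assert expected_batch_size(300, [[16, 128], [8, 256], [4, 512]]) == 4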
Example 4: multiprocess_training_loader
# Required import: from allennlp.data import iterators [as alias]
# Or: from allennlp.data.iterators import BucketIterator [as alias]
def multiprocess_training_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event, _local_file,
                                 _fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data):

    torch.manual_seed(_config["random_seed"])
    numpy.random.seed(_config["random_seed"])
    random.seed(_config["random_seed"])

    if _config["token_embedder_type"] == "bert_cls":
        _tokenizer = BlingFireTokenizer()
        _ind = PretrainedBertIndexer(pretrained_model=_config["bert_pretrained_model"], do_lowercase=True)
        _token_indexers = {"tokens": _ind}

        _triple_loader = BertTripleDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                                 max_doc_length=_config["max_doc_length"],
                                                 max_query_length=_config["max_query_length"],
                                                 min_doc_length=_config["min_doc_length"],
                                                 min_query_length=_config["min_query_length"])

        _iterator = BucketIterator(batch_size=int(_config["batch_size_train"]),
                                   sorting_keys=[("doc_pos_tokens", "num_tokens"), ("doc_neg_tokens", "num_tokens")])
        _iterator.index_with(Vocabulary())  # .from_files(_config["vocab_directory"]))
    else:
        _tokenizer = BlingFireTokenizer()

        if _config["token_embedder_type"] == "embedding":
            _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
            _vocab = Vocabulary.from_files(_config["vocab_directory"])
        elif _config["token_embedder_type"] == "fasttext":
            _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
            _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data,
                                   _config["fasttext_max_subwords"])
        elif _config["token_embedder_type"] == "elmo":
            _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
            _vocab = None

        _triple_loader = IrTripleDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                               max_doc_length=_config["max_doc_length"],
                                               max_query_length=_config["max_query_length"],
                                               min_doc_length=_config["min_doc_length"],
                                               min_query_length=_config["min_query_length"])

        _iterator = BucketIterator(batch_size=int(_config["batch_size_train"]),
                                   sorting_keys=[("doc_pos_tokens", "num_tokens"), ("doc_neg_tokens", "num_tokens")])
        _iterator.index_with(_vocab)

    for training_batch in _iterator(_triple_loader.read(_local_file), num_epochs=1):
        _queue.put(training_batch)  # this moves the tensors into shared memory

    _queue.put(None)       # signal end of queue
    _queue.close()         # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore
#
# validation instance generator
# - filling the _queue with ready to run validation batches
# - everything is defined thread local
#
Developer: sebastian-hofstaetter, Project: transformer-kernel-ranking, Lines: 60, Source file: multiprocess_input_pipeline.py
Example 5: multiprocess_validation_loader
# Required import: from allennlp.data import iterators [as alias]
# Or: from allennlp.data.iterators import BucketIterator [as alias]
def multiprocess_validation_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event, _local_file,
                                   _fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data):

    torch.manual_seed(_config["random_seed"])
    numpy.random.seed(_config["random_seed"])
    random.seed(_config["random_seed"])

    if _config["token_embedder_type"] == "bert_cls":
        _tokenizer = BlingFireTokenizer()
        _ind = PretrainedBertIndexer(pretrained_model=_config["bert_pretrained_model"], do_lowercase=True)
        _token_indexers = {"tokens": _ind}

        _tuple_loader = BertLabeledTupleDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                                      max_doc_length=_config["max_doc_length"],
                                                      max_query_length=_config["max_query_length"],
                                                      min_doc_length=_config["min_doc_length"],
                                                      min_query_length=_config["min_query_length"])

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("doc_tokens", "num_tokens")])
        _iterator.index_with(Vocabulary.from_files(_config["vocab_directory"]))
    else:
        _tokenizer = BlingFireTokenizer()

        if _config["token_embedder_type"] == "embedding":
            _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
            _vocab = Vocabulary.from_files(_config["vocab_directory"])
        elif _config["token_embedder_type"] == "fasttext":
            _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
            _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data,
                                   _config["fasttext_max_subwords"])
        elif _config["token_embedder_type"] == "elmo":
            _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
            _vocab = None

        _tuple_loader = IrLabeledTupleDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                                    max_doc_length=_config["max_doc_length"],
                                                    max_query_length=_config["max_query_length"],
                                                    min_doc_length=_config["min_doc_length"],
                                                    min_query_length=_config["min_query_length"])

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("doc_tokens", "num_tokens"), ("query_tokens", "num_tokens")])
        _iterator.index_with(_vocab)

    for training_batch in _iterator(_tuple_loader.read(_local_file), num_epochs=1):
        _queue.put(training_batch)  # this moves the tensors into shared memory

    _queue.put(None)       # signal end of queue
    _queue.close()         # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore
#
# single sequence loader from multiple files
#
Developer: sebastian-hofstaetter, Project: transformer-kernel-ranking, Lines: 58, Source file: multiprocess_input_pipeline.py
Example 6: multiprocess_single_sequence_loader
# Required import: from allennlp.data import iterators [as alias]
# Or: from allennlp.data.iterators import BucketIterator [as alias]
def multiprocess_single_sequence_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event, _local_file,
                                        _fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data):

    torch.manual_seed(_config["random_seed"])
    numpy.random.seed(_config["random_seed"])
    random.seed(_config["random_seed"])

    if _config["token_embedder_type"] == "bert_cls":
        _tokenizer = BlingFireTokenizer()
        _ind = PretrainedBertIndexer(pretrained_model=_config["bert_pretrained_model"], do_lowercase=True)
        _token_indexers = {"tokens": _ind}

        _tuple_loader = IrSingleSequenceDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                                      max_seq_length=_config["max_doc_length"],
                                                      min_seq_length=_config["min_doc_length"])

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("seq_tokens", "num_tokens")])
        _iterator.index_with(Vocabulary.from_files(_config["vocab_directory"]))
    else:
        _tokenizer = BlingFireTokenizer()

        if _config["token_embedder_type"] == "embedding":
            _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
            _vocab = Vocabulary.from_files(_config["vocab_directory"])
        elif _config["token_embedder_type"] == "fasttext":
            _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
            _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data,
                                   _config["fasttext_max_subwords"])
        elif _config["token_embedder_type"] == "elmo":
            _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
            _vocab = None

        _tuple_loader = IrSingleSequenceDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                                      max_seq_length=_config["max_doc_length"],
                                                      min_seq_length=_config["min_doc_length"])

        _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                                   sorting_keys=[("seq_tokens", "num_tokens")])
        _iterator.index_with(_vocab)

    for training_batch in _iterator(_tuple_loader.read(_local_file), num_epochs=1):
        _queue.put(training_batch)  # this moves the tensors into shared memory

    _queue.put(None)       # signal end of queue
    _queue.close()         # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore
Developer: sebastian-hofstaetter, Project: transformer-kernel-ranking, Lines: 52, Source file: multiprocess_input_pipeline.py
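One detail worth noting across the examples: unlike Example 1, Examples 4 to 6 put a None sentinel on the queue after the last batch, so a consumer can drain the queue without knowing the batch count in advance. A minimal sketch of that loop, reusing the placeholder queue and exit_event names from the consumer sketch after Example 1:

while True:
    training_batch = queue.get()
    if training_batch is None:  # end-of-queue sentinel from the loader
        break
    ...                         # run one training / evaluation step

exit_event.set()  # let the loader process shut down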