當前位置: 首頁>>代碼示例>>Python>>正文


Python iterators.BucketIterator方法代碼示例

本文整理匯總了Python中allennlp.data.iterators.BucketIterator方法的典型用法代碼示例。如果您正苦於以下問題:Python iterators.BucketIterator方法的具體用法?Python iterators.BucketIterator怎麼用?Python iterators.BucketIterator使用的例子?那麼, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在allennlp.data.iterators的用法示例。


在下文中一共展示了iterators.BucketIterator方法的6個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: multiprocess_training_loader

# 需要導入模塊: from allennlp.data import iterators [as 別名]
# 或者: from allennlp.data.iterators import BucketIterator [as 別名]
def multiprocess_training_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event, _local_file,_fasttext_vocab_cached_mapping,_fasttext_vocab_cached_data):
    """Batch the training triples read from ``_local_file`` and put them on ``_queue``.

    The function builds its own tokenizer, token indexers, vocabulary, dataset
    reader and ``BucketIterator`` from ``_config``, makes one pass
    (``num_epochs=1``) over the data and puts every batch on ``_queue``.
    Afterwards it closes the queue and blocks on ``_wait_for_exit``.

    Args:
        process_number: Not used inside this function.
        _config: Mapping read for ``preprocessed_tokenized``,
            ``token_embedder_type``, ``max_doc_length``, ``max_query_length``,
            ``batch_size_train`` and, depending on the embedder type,
            ``vocab_directory`` or ``fasttext_max_subwords``.
        _queue: Queue that receives each batch; closed after the last one.
        _wait_for_exit: Event this function waits on before returning.
        _local_file: Passed to the dataset reader's ``read`` (presumably a
            file path -- confirm against the caller).
        _fasttext_vocab_cached_mapping: Forwarded to ``FastTextVocab``; only
            used for the ``"fasttext"`` embedder type.
        _fasttext_vocab_cached_data: Forwarded to ``FastTextVocab``; only
            used for the ``"fasttext"`` embedder type.

    Raises:
        ValueError: If ``_config["token_embedder_type"]`` is not one of
            ``"embedding"``, ``"fasttext"`` or ``"elmo"``.
    """

    # workflow: we tokenize the data files with the costly spacy before training in a preprocessing step
    # (and concat the tokens with single whitespaces), so here we only split on the whitespaces
    _tokenizer = None
    # equality (not truthiness) is kept so the set of accepted config values stays unchanged
    if _config["preprocessed_tokenized"] == True:
        _tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    _embedder_type = _config["token_embedder_type"]

    if _embedder_type == "embedding":
        _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        _vocab = Vocabulary.from_files(_config["vocab_directory"])

    elif _embedder_type == "fasttext":
        _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
        _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data, _config["fasttext_max_subwords"])

    elif _embedder_type == "elmo":
        _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
        _vocab = None

    else:
        # Without this branch an unknown type left _token_indexers/_vocab unbound and
        # surfaced later as a confusing UnboundLocalError.
        raise ValueError("Unsupported token_embedder_type: {!r}".format(_embedder_type))

    _triple_loader = IrTripleDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                           max_doc_length=_config["max_doc_length"], max_query_length=_config["max_query_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_train"]),
                               sorting_keys=[("doc_pos_tokens", "num_tokens"), ("doc_neg_tokens", "num_tokens")])

    _iterator.index_with(_vocab)

    for training_batch in _iterator(_triple_loader.read(_local_file), num_epochs=1):

        _queue.put(training_batch)  # this moves the tensors in to shared memory

    _queue.close()  # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore

#
# validation instance generator
#   - filling the _queue with ready to run validation batches
#   - everything is defined thread local
# 
開發者ID:sebastian-hofstaetter,項目名稱:sigir19-neural-ir,代碼行數:41,代碼來源:multiprocess_input_pipeline.py

示例2: multiprocess_validation_loader

# 需要導入模塊: from allennlp.data import iterators [as 別名]
# 或者: from allennlp.data.iterators import BucketIterator [as 別名]
def multiprocess_validation_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event, _local_file,_fasttext_vocab_cached_mapping,_fasttext_vocab_cached_data):
    """Batch the labeled tuples read from ``_local_file`` and put them on ``_queue``.

    The function builds its own tokenizer, token indexers, vocabulary, dataset
    reader and ``BucketIterator`` from ``_config``, makes one pass
    (``num_epochs=1``) over the data and puts every batch on ``_queue``.
    Afterwards it closes the queue and blocks on ``_wait_for_exit``.

    Args:
        process_number: Not used inside this function.
        _config: Mapping read for ``preprocessed_tokenized``,
            ``token_embedder_type``, ``max_doc_length``, ``max_query_length``,
            ``batch_size_eval`` and, depending on the embedder type,
            ``vocab_directory`` or ``fasttext_max_subwords``.
        _queue: Queue that receives each batch; closed after the last one.
        _wait_for_exit: Event this function waits on before returning.
        _local_file: Passed to the dataset reader's ``read`` (presumably a
            file path -- confirm against the caller).
        _fasttext_vocab_cached_mapping: Forwarded to ``FastTextVocab``; only
            used for the ``"fasttext"`` embedder type.
        _fasttext_vocab_cached_data: Forwarded to ``FastTextVocab``; only
            used for the ``"fasttext"`` embedder type.

    Raises:
        ValueError: If ``_config["token_embedder_type"]`` is not one of
            ``"embedding"``, ``"fasttext"`` or ``"elmo"``.
    """

    # workflow: we tokenize the data files with the costly spacy before training in a preprocessing step
    # (and concat the tokens with single whitespaces), so here we only split on the whitespaces
    _tokenizer = None
    # the `_config and` guard and the `== True` comparison are kept as in the original so
    # the accepted config values do not change
    if _config and _config["preprocessed_tokenized"] == True:
        _tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    _embedder_type = _config["token_embedder_type"]

    if _embedder_type == "embedding":
        _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        _vocab = Vocabulary.from_files(_config["vocab_directory"])

    elif _embedder_type == "fasttext":
        _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
        _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data, _config["fasttext_max_subwords"])

    elif _embedder_type == "elmo":
        _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
        _vocab = None

    else:
        # Without this branch an unknown type left _token_indexers/_vocab unbound and
        # surfaced later as a confusing UnboundLocalError.
        raise ValueError("Unsupported token_embedder_type: {!r}".format(_embedder_type))

    _tuple_loader = IrLabeledTupleDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                                max_doc_length=_config["max_doc_length"], max_query_length=_config["max_query_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                               sorting_keys=[("doc_tokens", "num_tokens"), ("query_tokens", "num_tokens")])

    _iterator.index_with(_vocab)

    for validation_batch in _iterator(_tuple_loader.read(_local_file), num_epochs=1):

        _queue.put(validation_batch)  # this moves the tensors in to shared memory

    _queue.close()  # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore
開發者ID:sebastian-hofstaetter,項目名稱:sigir19-neural-ir,代碼行數:35,代碼來源:multiprocess_input_pipeline.py

示例3: test_self_attn_iterator

# 需要導入模塊: from allennlp.data import iterators [as 別名]
# 或者: from allennlp.data.iterators import BucketIterator [as 別名]
def test_self_attn_iterator(self):
    """Check batch sizes produced by ``SelfAttnBucketIterator`` against its schedule.

    Builds 100 single-field instances of random length (1 to 499 tokens),
    iterates them for one epoch and asserts that every batch is no larger
    than the batch size the schedule pairs with its padded length, and that
    all 100 instances come out.
    """
    token_indexers = {'tokens': SingleIdTokenIndexer()}

    # make some instances
    instances = []
    for _ in range(100):
        length = max(int(torch.rand(1).item() * 500), 1)
        tokens = [Token('a') for _ in range(length)]
        text_field = TextField(tokens, token_indexers=token_indexers)
        instances.append(Instance({'tokens': text_field}))

    # each entry is read below as [batch size, largest number of timesteps]
    schedule = [[16, 128], [8, 256], [4, 512]]

    bucket_iterator = BucketIterator(
        batch_size=16,
        sorting_keys=[['tokens', 'num_tokens']],
        padding_noise=0.0
    )

    iterator = SelfAttnBucketIterator(schedule, bucket_iterator)
    iterator.index_with(Vocabulary())

    batches = list(iterator(instances, num_epochs=1))

    total_instances = 0
    for batch in batches:
        token_ids = batch['tokens']['tokens']
        batch_size = token_ids.shape[0]
        timesteps = token_ids.shape[1]
        total_instances += batch_size

        # first schedule entry whose length limit fits; anything longer than the
        # last-but-one limit falls back to the smallest batch size
        expected_batch_size = next(
            (size for size, max_timesteps in schedule[:-1] if timesteps <= max_timesteps),
            schedule[-1][0],
        )

        # batch might be smaller than expected if we split a larger batch
        # and the sequence length for the shorter segment falls into a lower
        # bucket
        self.assertTrue(batch_size <= expected_batch_size)

    self.assertEqual(total_instances, 100)
開發者ID:allenai,項目名稱:kb,代碼行數:44,代碼來源:test_self_attn_iterator.py

示例4: multiprocess_training_loader

# 需要導入模塊: from allennlp.data import iterators [as 別名]
# 或者: from allennlp.data.iterators import BucketIterator [as 別名]
def multiprocess_training_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event, _local_file,_fasttext_vocab_cached_mapping,_fasttext_vocab_cached_data):
    """Batch the training triples read from ``_local_file`` and put them on ``_queue``.

    After validating the embedder type, the function seeds ``torch``,
    ``numpy`` and ``random`` from ``_config["random_seed"]``, builds a
    tokenizer, token indexers, vocabulary, dataset reader and
    ``BucketIterator``, makes one pass (``num_epochs=1``) over the data and
    puts every batch on ``_queue``. A final ``None`` is put on the queue as an
    end marker; then the queue is closed and the function blocks on
    ``_wait_for_exit``.

    Args:
        process_number: Not used inside this function.
        _config: Mapping read for ``token_embedder_type``, ``random_seed``,
            ``max_doc_length``, ``max_query_length``, ``min_doc_length``,
            ``min_query_length``, ``batch_size_train`` and, depending on the
            embedder type, ``bert_pretrained_model``, ``vocab_directory`` or
            ``fasttext_max_subwords``.
        _queue: Queue that receives each batch followed by ``None``.
        _wait_for_exit: Event this function waits on before returning.
        _local_file: Passed to the dataset reader's ``read`` (presumably a
            file path -- confirm against the caller).
        _fasttext_vocab_cached_mapping: Forwarded to ``FastTextVocab``; only
            used for the ``"fasttext"`` embedder type.
        _fasttext_vocab_cached_data: Forwarded to ``FastTextVocab``; only
            used for the ``"fasttext"`` embedder type.

    Raises:
        ValueError: If ``_config["token_embedder_type"]`` is not one of
            ``"bert_cls"``, ``"embedding"``, ``"fasttext"`` or ``"elmo"``.
    """

    # Fail fast, before touching the global RNGs: an unknown type used to leave
    # _token_indexers/_vocab unbound and surfaced as an UnboundLocalError.
    _embedder_type = _config["token_embedder_type"]
    if _embedder_type not in ("bert_cls", "embedding", "fasttext", "elmo"):
        raise ValueError("Unsupported token_embedder_type: {!r}".format(_embedder_type))

    torch.manual_seed(_config["random_seed"])
    numpy.random.seed(_config["random_seed"])
    random.seed(_config["random_seed"])

    _tokenizer = BlingFireTokenizer()

    if _embedder_type == "bert_cls":
        _ind = PretrainedBertIndexer(pretrained_model=_config["bert_pretrained_model"], do_lowercase=True)
        _token_indexers = {"tokens": _ind}
        # the BERT path indexes with an empty Vocabulary (vocab_directory is not loaded here)
        _vocab = Vocabulary()
        _reader_class = BertTripleDatasetReader

    else:
        if _embedder_type == "embedding":
            _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
            _vocab = Vocabulary.from_files(_config["vocab_directory"])

        elif _embedder_type == "fasttext":
            _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
            _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data, _config["fasttext_max_subwords"])

        elif _embedder_type == "elmo":
            _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
            _vocab = None

        _reader_class = IrTripleDatasetReader

    # both reader classes are constructed with the same keyword arguments
    _triple_loader = _reader_class(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                   max_doc_length=_config["max_doc_length"], max_query_length=_config["max_query_length"],
                                   min_doc_length=_config["min_doc_length"], min_query_length=_config["min_query_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_train"]),
                               sorting_keys=[("doc_pos_tokens", "num_tokens"), ("doc_neg_tokens", "num_tokens")])

    _iterator.index_with(_vocab)

    for training_batch in _iterator(_triple_loader.read(_local_file), num_epochs=1):

        _queue.put(training_batch)  # this moves the tensors in to shared memory

    _queue.put(None)  # signal end of queue

    _queue.close()  # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore

#
# validation instance generator
#   - filling the _queue with ready to run validation batches
#   - everything is defined thread local
# 
開發者ID:sebastian-hofstaetter,項目名稱:transformer-kernel-ranking,代碼行數:60,代碼來源:multiprocess_input_pipeline.py

示例5: multiprocess_validation_loader

# 需要導入模塊: from allennlp.data import iterators [as 別名]
# 或者: from allennlp.data.iterators import BucketIterator [as 別名]
def multiprocess_validation_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event, _local_file,_fasttext_vocab_cached_mapping,_fasttext_vocab_cached_data):
    """Batch the labeled tuples read from ``_local_file`` and put them on ``_queue``.

    After validating the embedder type, the function seeds ``torch``,
    ``numpy`` and ``random`` from ``_config["random_seed"]``, builds a
    tokenizer, token indexers, vocabulary, dataset reader and
    ``BucketIterator``, makes one pass (``num_epochs=1``) over the data and
    puts every batch on ``_queue``. A final ``None`` is put on the queue as an
    end marker; then the queue is closed and the function blocks on
    ``_wait_for_exit``.

    Args:
        process_number: Not used inside this function.
        _config: Mapping read for ``token_embedder_type``, ``random_seed``,
            ``max_doc_length``, ``max_query_length``, ``min_doc_length``,
            ``min_query_length``, ``batch_size_eval`` and, depending on the
            embedder type, ``bert_pretrained_model``, ``vocab_directory`` or
            ``fasttext_max_subwords``.
        _queue: Queue that receives each batch followed by ``None``.
        _wait_for_exit: Event this function waits on before returning.
        _local_file: Passed to the dataset reader's ``read`` (presumably a
            file path -- confirm against the caller).
        _fasttext_vocab_cached_mapping: Forwarded to ``FastTextVocab``; only
            used for the ``"fasttext"`` embedder type.
        _fasttext_vocab_cached_data: Forwarded to ``FastTextVocab``; only
            used for the ``"fasttext"`` embedder type.

    Raises:
        ValueError: If ``_config["token_embedder_type"]`` is not one of
            ``"bert_cls"``, ``"embedding"``, ``"fasttext"`` or ``"elmo"``.
    """

    # Fail fast, before touching the global RNGs: an unknown type used to leave
    # _token_indexers/_vocab unbound and surfaced as an UnboundLocalError.
    _embedder_type = _config["token_embedder_type"]
    if _embedder_type not in ("bert_cls", "embedding", "fasttext", "elmo"):
        raise ValueError("Unsupported token_embedder_type: {!r}".format(_embedder_type))

    torch.manual_seed(_config["random_seed"])
    numpy.random.seed(_config["random_seed"])
    random.seed(_config["random_seed"])

    _tokenizer = BlingFireTokenizer()

    if _embedder_type == "bert_cls":
        _ind = PretrainedBertIndexer(pretrained_model=_config["bert_pretrained_model"], do_lowercase=True)
        _token_indexers = {"tokens": _ind}
        _vocab = Vocabulary.from_files(_config["vocab_directory"])
        _reader_class = BertLabeledTupleDatasetReader
        # the BERT path buckets on the document length only
        _sorting_keys = [("doc_tokens", "num_tokens")]

    else:
        if _embedder_type == "embedding":
            _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
            _vocab = Vocabulary.from_files(_config["vocab_directory"])

        elif _embedder_type == "fasttext":
            _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
            _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data, _config["fasttext_max_subwords"])

        elif _embedder_type == "elmo":
            _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
            _vocab = None

        _reader_class = IrLabeledTupleDatasetReader
        _sorting_keys = [("doc_tokens", "num_tokens"), ("query_tokens", "num_tokens")]

    # both reader classes are constructed with the same keyword arguments
    _tuple_loader = _reader_class(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                  max_doc_length=_config["max_doc_length"], max_query_length=_config["max_query_length"],
                                  min_doc_length=_config["min_doc_length"], min_query_length=_config["min_query_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                               sorting_keys=_sorting_keys)

    _iterator.index_with(_vocab)

    for validation_batch in _iterator(_tuple_loader.read(_local_file), num_epochs=1):

        _queue.put(validation_batch)  # this moves the tensors in to shared memory

    _queue.put(None)  # signal end of queue

    _queue.close()  # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore

#
# single sequence loader from multiple files 
# 
開發者ID:sebastian-hofstaetter,項目名稱:transformer-kernel-ranking,代碼行數:58,代碼來源:multiprocess_input_pipeline.py

示例6: multiprocess_single_sequence_loader

# 需要導入模塊: from allennlp.data import iterators [as 別名]
# 或者: from allennlp.data.iterators import BucketIterator [as 別名]
def multiprocess_single_sequence_loader(process_number: int, _config, _queue: mp.Queue, _wait_for_exit: mp.Event, _local_file,_fasttext_vocab_cached_mapping,_fasttext_vocab_cached_data):
    """Batch the single sequences read from ``_local_file`` and put them on ``_queue``.

    After validating the embedder type, the function seeds ``torch``,
    ``numpy`` and ``random`` from ``_config["random_seed"]``, builds a
    tokenizer, token indexers, vocabulary, ``IrSingleSequenceDatasetReader``
    and ``BucketIterator``, makes one pass (``num_epochs=1``) over the data
    and puts every batch on ``_queue``. A final ``None`` is put on the queue
    as an end marker; then the queue is closed and the function blocks on
    ``_wait_for_exit``.

    Args:
        process_number: Not used inside this function.
        _config: Mapping read for ``token_embedder_type``, ``random_seed``,
            ``max_doc_length``, ``min_doc_length``, ``batch_size_eval`` and,
            depending on the embedder type, ``bert_pretrained_model``,
            ``vocab_directory`` or ``fasttext_max_subwords``.
        _queue: Queue that receives each batch followed by ``None``.
        _wait_for_exit: Event this function waits on before returning.
        _local_file: Passed to the dataset reader's ``read`` (presumably a
            file path -- confirm against the caller).
        _fasttext_vocab_cached_mapping: Forwarded to ``FastTextVocab``; only
            used for the ``"fasttext"`` embedder type.
        _fasttext_vocab_cached_data: Forwarded to ``FastTextVocab``; only
            used for the ``"fasttext"`` embedder type.

    Raises:
        ValueError: If ``_config["token_embedder_type"]`` is not one of
            ``"bert_cls"``, ``"embedding"``, ``"fasttext"`` or ``"elmo"``.
    """

    # Fail fast, before touching the global RNGs: an unknown type used to leave
    # _token_indexers/_vocab unbound and surfaced as an UnboundLocalError.
    _embedder_type = _config["token_embedder_type"]
    if _embedder_type not in ("bert_cls", "embedding", "fasttext", "elmo"):
        raise ValueError("Unsupported token_embedder_type: {!r}".format(_embedder_type))

    torch.manual_seed(_config["random_seed"])
    numpy.random.seed(_config["random_seed"])
    random.seed(_config["random_seed"])

    _tokenizer = BlingFireTokenizer()

    # only the token indexers and the vocabulary depend on the embedder type
    if _embedder_type == "bert_cls":
        _ind = PretrainedBertIndexer(pretrained_model=_config["bert_pretrained_model"], do_lowercase=True)
        _token_indexers = {"tokens": _ind}
        _vocab = Vocabulary.from_files(_config["vocab_directory"])

    elif _embedder_type == "embedding":
        _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        _vocab = Vocabulary.from_files(_config["vocab_directory"])

    elif _embedder_type == "fasttext":
        _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
        _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data, _config["fasttext_max_subwords"])

    elif _embedder_type == "elmo":
        _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
        _vocab = None

    # the sequence length limits are taken from the *_doc_length config entries
    _tuple_loader = IrSingleSequenceDatasetReader(lazy=True, tokenizer=_tokenizer, token_indexers=_token_indexers,
                                                  max_seq_length=_config["max_doc_length"], min_seq_length=_config["min_doc_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_eval"]),
                               sorting_keys=[("seq_tokens", "num_tokens")])

    _iterator.index_with(_vocab)

    for sequence_batch in _iterator(_tuple_loader.read(_local_file), num_epochs=1):

        _queue.put(sequence_batch)  # this moves the tensors in to shared memory

    _queue.put(None)  # signal end of queue

    _queue.close()  # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore
開發者ID:sebastian-hofstaetter,項目名稱:transformer-kernel-ranking,代碼行數:52,代碼來源:multiprocess_input_pipeline.py


注:本文中的allennlp.data.iterators.BucketIterator方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。