This article collects typical usage examples of allennlp.data.dataset.Batch in Python. If you have been wondering what dataset.Batch does, how to use it, or what it looks like in real code, the curated examples below may help. You can also explore the containing module, allennlp.data.dataset, for further usage.
The following presents 15 code examples of dataset.Batch, sorted by popularity by default.
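Before turning to the individual examples, here is a minimal sketch of the pattern that nearly all of them share: wrap a list of Instance objects in a Batch, index it against a Vocabulary, and convert it to a padded tensor dictionary. This sketch assumes the AllenNLP 0.x API (where Batch lives in allennlp.data.dataset); the token text and field names are invented for illustration.

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

tokens = [Token(t) for t in ["a", "simple", "example"]]
instance = Instance({"tokens": TextField(tokens, {"tokens": SingleIdTokenIndexer()})})

batch = Batch([instance])                      # wrap Instances in a Batch
vocab = Vocabulary.from_instances([instance])  # build a vocabulary from the same data
batch.index_instances(vocab)                   # map tokens to integer ids
tensors = batch.as_tensor_dict()               # pad and convert to tensors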
Example 1: preprocess
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def preprocess(self, token_batch):
seq_lens = [len(sequence) for sequence in token_batch if sequence]
if not seq_lens:
return []
max_len = min(max(seq_lens), self.max_len)
batches = []
for indexer in self.indexers:
batch = []
for sequence in token_batch:
tokens = sequence[:max_len]
tokens = [Token(token) for token in ['$START'] + tokens]
batch.append(Instance({'tokens': TextField(tokens, indexer)}))
batch = Batch(batch)
batch.index_instances(self.vocab)
batches.append(batch)
return batches
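A hypothetical call to this method might look like the following; preprocessor stands in for whatever object defines preprocess() (something holding the indexers, vocab, and max_len attributes used above), and the sentences are invented for illustration.

token_batch = [["She", "are", "happy"], ["He", "go", "home", "now"]]
batches = preprocessor.preprocess(token_batch)   # one indexed Batch per indexer
for batch in batches:
    tensors = batch.as_tensor_dict()             # padded tensors ready for the model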
Example 2: test_saving_and_loading_works_with_byte_encoding
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def test_saving_and_loading_works_with_byte_encoding(self):
# We're going to set a vocabulary from a TextField using byte encoding, index it, save the
# vocab, load the vocab, then index the text field again, and make sure we get the same
# result.
tokenizer = CharacterTokenizer(byte_encoding=u'utf-8')
token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
tokens = [Token(t) for t in [u"Øyvind", u"für", u"汉字"]]
text_field = TextField(tokens, {u"characters": token_indexer})
dataset = Batch([Instance({u"sentence": text_field})])
vocab = Vocabulary.from_instances(dataset)
text_field.index(vocab)
indexed_tokens = deepcopy(text_field._indexed_tokens) # pylint: disable=protected-access
vocab_dir = self.TEST_DIR / u'vocab_save'
vocab.save_to_files(vocab_dir)
vocab2 = Vocabulary.from_files(vocab_dir)
text_field2 = TextField(tokens, {u"characters": token_indexer})
text_field2.index(vocab2)
indexed_tokens2 = deepcopy(text_field2._indexed_tokens) # pylint: disable=protected-access
assert indexed_tokens == indexed_tokens2
Example 3: test_from_params_extend_config
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def test_from_params_extend_config(self):
vocab_dir = self.TEST_DIR / u'vocab_save'
original_vocab = Vocabulary(non_padded_namespaces=[u"tokens"])
original_vocab.add_token_to_namespace(u"a", namespace=u"tokens")
original_vocab.save_to_files(vocab_dir)
text_field = TextField([Token(t) for t in [u"a", u"b"]],
{u"tokens": SingleIdTokenIndexer(u"tokens")})
instances = Batch([Instance({u"text": text_field})])
# If you ask to extend vocab from `directory_path`, instances must be passed
# in Vocabulary constructor, or else there is nothing to extend to.
params = Params({u"directory_path": vocab_dir, u"extend": True})
with pytest.raises(ConfigurationError):
_ = Vocabulary.from_params(params)
# If you ask to extend vocab, `directory_path` key must be present in params,
# or else there is nothing to extend from.
params = Params({u"extend": True})
with pytest.raises(ConfigurationError):
_ = Vocabulary.from_params(params, instances)
Example 4: set_up_model
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def set_up_model(self, param_file, dataset_file):
# pylint: disable=attribute-defined-outside-init
self.param_file = param_file
params = Params.from_file(self.param_file)
reader = DatasetReader.from_params(params[u'dataset_reader'])
instances = reader.read(dataset_file)
# Use parameters for vocabulary if they are present in the config file, so that choices like
# "non_padded_namespaces", "min_count" etc. can be set if needed.
if u'vocabulary' in params:
vocab_params = params[u'vocabulary']
vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
else:
vocab = Vocabulary.from_instances(instances)
self.vocab = vocab
self.instances = instances
self.model = Model.from_params(vocab=self.vocab, params=params[u'model'])
# TODO(joelgrus) get rid of these
# (a lot of the model tests use them, so they'll have to be changed)
self.dataset = Batch(self.instances)
self.dataset.index_instances(self.vocab)
Example 5: _create_batches
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
for batch in self.iterator._create_batches(instances, shuffle):
# split after shuffling so smaller batches are kept together
batch_instances = batch.instances
# split if needed
batch_length = -1
for instance in batch_instances:
instance.index_fields(self.vocab)
field_lengths = instance.get_padding_lengths()
batch_length = max(batch_length, field_lengths['tokens']['num_tokens'])
# get the required batch size
index = bisect.bisect_left(self._schedule_lengths, batch_length)
if index == len(self._schedule_lengths):
# this batch exceeds the maximum allowed, just skip it
continue
batch_size = self._schedule_batch_sizes[index]
start = 0
while start < len(batch_instances):
end = start + batch_size
yield Batch(batch_instances[start:end])
start = end
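The bisect lookup above is the core of the length-based schedule: a batch's maximum token length selects the allowed batch size, and batches longer than every schedule entry are skipped. The standalone snippet below illustrates that lookup with made-up schedule values.

import bisect

schedule_lengths = [64, 128, 256]    # upper length bound of each bucket
schedule_batch_sizes = [32, 16, 8]   # batch size allowed for each bucket

for batch_length in (50, 200, 300):
    index = bisect.bisect_left(schedule_lengths, batch_length)
    if index == len(schedule_lengths):
        print(batch_length, "-> skipped, exceeds the maximum allowed length")
    else:
        print(batch_length, "->", schedule_batch_sizes[index])
# prints: 50 -> 32, 200 -> 8, 300 -> skipped, exceeds the maximum allowed length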
Example 6: set_up_model
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def set_up_model(self, param_file, dataset_file):
# pylint: disable=attribute-defined-outside-init
self.param_file = param_file
params = Params.from_file(self.param_file)
reader = DatasetReader.from_params(params['dataset_reader'])
# The dataset reader might be lazy, but a lazy list here breaks some of our tests.
instances = list(reader.read(str(dataset_file)))
# Use parameters for vocabulary if they are present in the config file, so that choices like
# "non_padded_namespaces", "min_count" etc. can be set if needed.
if 'vocabulary' in params:
vocab_params = params['vocabulary']
vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
else:
vocab = Vocabulary.from_instances(instances)
self.vocab = vocab
self.instances = instances
self.model = Model.from_params(vocab=self.vocab, params=params['model'])
# TODO(joelgrus) get rid of these
# (a lot of the model tests use them, so they'll have to be changed)
self.dataset = Batch(self.instances)
self.dataset.index_instances(self.vocab)
Example 7: predict_batch_instance
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
model = self._model
with torch.no_grad():
cuda_device = model._get_prediction_device()
dataset = Batch(instances)
dataset.index_instances(model.vocab)
model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device)
outputs = model.decode(model(**model_input))
return sanitize(outputs)
Example 8: collate_fn
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def collate_fn(data, to_gpu=False):
"""Creates mini-batch tensors
"""
images, instances = zip(*data)
images = torch.stack(images, 0)
batch = Batch(instances)
td = batch.as_tensor_dict()
if 'question' in td:
td['question_mask'] = get_text_field_mask(td['question'], num_wrapping_dims=1)
td['question_tags'][td['question_mask'] == 0] = -2 # Padding
td['answer_mask'] = get_text_field_mask(td['answers'], num_wrapping_dims=1)
td['answer_tags'][td['answer_mask'] == 0] = -2
td['box_mask'] = torch.all(td['boxes'] >= 0, -1).long()
td['images'] = images
# Deprecated
# if to_gpu:
# for k in td:
# if k != 'metadata':
# td[k] = {k2: v.cuda(non_blocking=True) for k2, v in td[k].items()} if isinstance(td[k], dict) else td[k].cuda(
# non_blocking=True)
# # No nested dicts
# for k in sorted(td.keys()):
# if isinstance(td[k], dict):
# for k2 in sorted(td[k].keys()):
# td['{}_{}'.format(k, k2)] = td[k].pop(k2)
# td.pop(k)
return td
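Because collate_fn takes a list of (image, Instance) pairs and returns a tensor dictionary, it plugs directly into a PyTorch DataLoader. The snippet below is a hypothetical usage sketch: vcr_dataset stands in for a Dataset whose __getitem__ returns such a pair.

from functools import partial
from torch.utils.data import DataLoader

loader = DataLoader(vcr_dataset, batch_size=32, shuffle=True,
                    collate_fn=partial(collate_fn, to_gpu=False))
for tensor_dict in loader:
    images = tensor_dict['images']   # stacked image tensors
    # the remaining keys are the padded AllenNLP fields plus the computed masks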
Example 9: forward_on_instances
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def forward_on_instances(self,
instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
"""
Takes a list of :class:`~allennlp.data.instance.Instance`s, converts that text into
arrays using this model's :class:`Vocabulary`, passes those arrays through
:func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
and returns the result. Before returning the result, we convert any
``torch.Tensors`` into numpy arrays and separate the
batched output into a list of individual dicts per instance. Note that typically
this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
:func:`forward_on_instance`.
Parameters
----------
instances : List[Instance], required
The instances to run the model on.
Returns
-------
A list of the model's output for each instance.
"""
with torch.no_grad():
dataset = Batch(instances)
dataset.index_instances(self.vocab)
model_input = dataset.as_tensor_dict()
outputs = self.decode(self(**model_input))
instance_separated_output = []
metadata = [x.fields["metadata"].metadata for x in dataset.instances]
for res in export_output_data_arc_multi_choice_json(metadata, outputs):
instance_separated_output.append(res)
return instance_separated_output
Example 10: _sentences_to_ids
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def _sentences_to_ids(sentences):
indexer = ELMoTokenCharactersIndexer()
# For each sentence, first create a TextField, then create an instance
instances = []
for sentence in sentences:
tokens = [Token(token) for token in sentence]
field = TextField(tokens, {'character_ids': indexer})
instance = Instance({'elmo': field})
instances.append(instance)
dataset = Batch(instances)
vocab = Vocabulary()
dataset.index_instances(vocab)
return dataset.as_tensor_dict()['elmo']['character_ids']
Example 11: batch_to_ids
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def batch_to_ids(batch):
u"""
Converts a batch of tokenized sentences to a tensor representing the sentences with encoded characters
(len(batch), max sentence length, max word length).
Parameters
----------
batch : ``List[List[str]]``, required
A list of tokenized sentences.
Returns
-------
A tensor of padded character ids.
"""
instances = []
indexer = ELMoTokenCharactersIndexer()
for sentence in batch:
tokens = [Token(token) for token in sentence]
field = TextField(tokens,
{u'character_ids': indexer})
instance = Instance({u"elmo": field})
instances.append(instance)
dataset = Batch(instances)
vocab = Vocabulary()
dataset.index_instances(vocab)
return dataset.as_tensor_dict()[u'elmo'][u'character_ids']
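A quick usage sketch of batch_to_ids; the shape noted in the comment assumes ELMo's default 50-character-per-token representation.

sentences = [["First", "sentence", "."], ["Another", "one"]]
character_ids = batch_to_ids(sentences)
print(character_ids.shape)   # e.g. torch.Size([2, 3, 50]): (batch, max sentence length, max word length)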
Example 12: _create_batches
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def _create_batches(self, instances, shuffle):
# First break the dataset into memory-sized lists:
for instance_list in self._memory_sized_lists(instances):
if shuffle:
random.shuffle(instance_list)
iterator = iter(instance_list)
# Then break each memory-sized list into batches.
for batch_instances in lazy_groups_of(iterator, self._batch_size):
for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(batch_instances):
batch = Batch(possibly_smaller_batches)
yield batch
Example 13: _create_batches
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def _create_batches(self, instances, shuffle):
for instance_list in self._memory_sized_lists(instances):
instance_list = sort_by_padding(instance_list,
self._sorting_keys,
self.vocab,
self._padding_noise)
batches = []
for batch_instances in lazy_groups_of(iter(instance_list), self._batch_size):
for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(batch_instances):
batches.append(Batch(possibly_smaller_batches))
move_to_front = self._biggest_batch_first and len(batches) > 1
if move_to_front:
# We'll actually pop the last _two_ batches, because the last one might not be full.
last_batch = batches.pop()
penultimate_batch = batches.pop()
if shuffle:
random.shuffle(batches)
else:
logger.warning(u"shuffle parameter is set to False,"
u" while bucket iterators by definition change the order of your data.")
if move_to_front:
batches.insert(0, penultimate_batch)
batches.insert(0, last_batch)
for batch in batches:
    yield batch
Example 14: _sentences_to_ids
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def _sentences_to_ids(self, sentences):
indexer = ELMoTokenCharactersIndexer()
# For each sentence, first create a TextField, then create an instance
instances = []
for sentence in sentences:
tokens = [Token(token) for token in sentence]
field = TextField(tokens, {u'character_ids': indexer})
instance = Instance({u'elmo': field})
instances.append(instance)
dataset = Batch(instances)
vocab = Vocabulary()
dataset.index_instances(vocab)
return dataset.as_tensor_dict()[u'elmo'][u'character_ids']
Example 15: test_tagger_with_elmo_token_embedder_forward_pass_runs_correctly
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def test_tagger_with_elmo_token_embedder_forward_pass_runs_correctly(self):
dataset = Batch(self.instances)
dataset.index_instances(self.vocab)
training_tensors = dataset.as_tensor_dict()
output_dict = self.model(**training_tensors)
tags = output_dict[u'tags']
assert len(tags) == 2
assert len(tags[0]) == 7
assert len(tags[1]) == 7
for example_tags in tags:
for tag_id in example_tags:
tag = self.model.vocab.get_token_from_index(tag_id, namespace=u"labels")
assert tag in set([u'O', u'I-ORG', u'I-PER', u'I-LOC'])