

Python data.TokenBlockDataset Code Examples

This article collects typical usage examples of fairseq.data.TokenBlockDataset in Python. If you are wondering what data.TokenBlockDataset does, how to call it, or what working examples look like, the curated snippets below should help. You can also browse further usage examples from the containing module, fairseq.data.


Below are 6 code examples of data.TokenBlockDataset, drawn from open-source projects and ordered roughly by popularity.
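Before diving into the examples, note that TokenBlockDataset is a dataset class rather than a method: it wraps an indexable corpus of 1-D token tensors and re-chunks the concatenated token stream into blocks. The break_mode argument controls the chunking: 'none' cuts fixed-size blocks, 'complete' and 'complete_doc' break only at sentence or document boundaries, and 'eos' emits one sentence per block. The following is a minimal sketch assuming a recent fairseq release; the exact constructor signature has varied across versions, so treat it as illustrative rather than definitive.

import torch
from fairseq.data import TokenBlockDataset

# Three "sentences" as 1-D LongTensors; index 2 acts as eos, index 1 as pad.
sents = [torch.LongTensor([4, 5, 2]),
         torch.LongTensor([6, 2]),
         torch.LongTensor([7, 8, 9, 2])]
sizes = [s.numel() for s in sents]

# break_mode='none' concatenates the stream and cuts fixed 4-token blocks;
# break_mode='eos' would instead emit one sentence per block.
ds = TokenBlockDataset(sents, sizes, block_size=4, pad=1, eos=2, break_mode='none')
print(len(ds), ds[0].tolist())  # expected under these assumptions: 3 [4, 5, 2, 6]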

Example 1: make_batches

# Required import: from fairseq import data [as alias]
# Alternatively: from fairseq.data import TokenBlockDataset [as alias]
def make_batches(self, lines):
        # Encode each input line into a 1-D LongTensor of token indices.
        token_lst = [self.task.source_dictionary.encode_line(line, add_if_not_exist=False).long()
                     for line in lines]
        length_lst = torch.LongTensor([tokens.numel() for tokens in token_lst])

        # break_mode='eos' keeps one sentence per block; include_targets=True
        # also yields the shifted target tokens needed for LM scoring.
        ds = data.TokenBlockDataset(token_lst, length_lst, self.args.tokens_per_sample,
                                    pad=self.task.dictionary.pad(),
                                    eos=self.task.dictionary.eos(),
                                    break_mode='eos', include_targets=True)
        add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none'
        itr = self.task.get_batch_iterator(
            dataset=data.MonolingualDataset(ds, ds.sizes, self.task.dictionary,
                                            self.task.target_dictionary,
                                            add_eos_for_other_targets, shuffle=False,
                                            targets=self.task.targets),
            max_tokens=self.args.max_tokens or 3000,
            max_sentences=self.args.max_sentences,
            max_positions=utils.resolve_max_positions(*[
                model.max_positions() for model in self.models
            ]),
            num_shards=self.args.num_shards,
            shard_id=self.args.shard_id,
            ignore_invalid_inputs=True,
            num_workers=self.args.num_workers,
        ).next_epoch_itr(shuffle=False)

        return itr
Author: kakaobrain | Project: helo_word | Lines: 26 | Source file: lm_scorer.py
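The method returns a standard fairseq epoch batch iterator. A hypothetical call site (scorer stands for an instance of the class that defines make_batches; the batch keys follow MonolingualDataset's collater) might look like:

# Hypothetical usage: 'scorer' is an instance of the class defining make_batches.
itr = scorer.make_batches(["the cat sat on the mat", "a quick brown fox"])
for batch in itr:
    src = batch['net_input']['src_tokens']  # collated source tokens
    tgt = batch['target']                   # present because include_targets=True
    print(src.shape, tgt.shape)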

Example 2: get_trainer_and_epoch_itr

# Required import: from fairseq import data [as alias]
# Alternatively: from fairseq.data import TokenBlockDataset [as alias]
def get_trainer_and_epoch_itr(epoch, epoch_size, num_updates, iterations_in_epoch):
    # block_size=1 makes each token its own block, i.e. one-token "sentences".
    tokens = torch.LongTensor(list(range(epoch_size)))
    tokens_ds = data.TokenBlockDataset(tokens, [len(tokens)], 1, include_targets=False)
    trainer = mock_trainer(epoch, num_updates, iterations_in_epoch)
    epoch_itr = data.EpochBatchIterator(
        dataset=data.LanguagePairDataset(tokens_ds, tokens_ds.sizes, mock_dict(), shuffle=False),
        max_tokens=1,
    )
    return trainer, epoch_itr
Author: nusnlp | Project: crosentgec | Lines: 11 | Source file: test_train.py
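This snippet targets an older fairseq release, in which TokenBlockDataset accepted positional arguments without pad/eos and EpochBatchIterator took max_tokens directly; Example 5 below is the later upstream version of the same test. The helpers mock_trainer and mock_dict are defined elsewhere in the test file; one plausible minimal mock_dict, assuming the dataset only queries the special-symbol indices (the real helper may differ), is:

from unittest import mock

def mock_dict():
    # Stand-in dictionary: LanguagePairDataset only asks for these indices.
    d = mock.MagicMock()
    d.pad.return_value = 1
    d.eos.return_value = 2
    d.unk.return_value = 3
    return d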

Example 3: setUp

# Required import: from fairseq import data [as alias]
# Alternatively: from fairseq.data import TokenBlockDataset [as alias]
def setUp(self):
        d = mock_dict()
        # Two single-sentence corpora (one token each), used to test sampling
        # across multiple datasets.
        tokens_1 = torch.LongTensor([1]).view(1, -1)
        tokens_ds1 = TokenBlockDataset(
            tokens_1,
            sizes=[tokens_1.size(-1)],
            block_size=1,
            pad=0,
            eos=1,
            include_targets=False,
        )
        self.dataset_1 = LanguagePairDataset(
            tokens_ds1, tokens_ds1.sizes, d, shuffle=False
        )
        tokens_2 = torch.LongTensor([2]).view(1, -1)
        tokens_ds2 = TokenBlockDataset(
            tokens_2,
            sizes=[tokens_2.size(-1)],
            block_size=1,
            pad=0,
            eos=1,
            include_targets=False,
        )
        self.dataset_2 = LanguagePairDataset(
            tokens_ds2, tokens_ds2.sizes, d, shuffle=False
        ) 
Author: pytorch | Project: fairseq | Lines: 28 | Source file: test_multi_corpus_sampled_dataset.py
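The file name suggests the two datasets feed a MultiCorpusSampledDataset in the tests that follow. A hedged sketch of how they might be combined, assuming the constructor accepts an ordered mapping of named datasets (with uniform sampling when no sampling_func is given):

from collections import OrderedDict
from fairseq.data import MultiCorpusSampledDataset

# dataset_1 and dataset_2 come from setUp above; each __getitem__ call
# samples one of the named corpora.
combined = MultiCorpusSampledDataset(
    OrderedDict([("corpus_1", self.dataset_1), ("corpus_2", self.dataset_2)])
)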

Example 4: _build_dataset

# Required import: from fairseq import data [as alias]
# Alternatively: from fairseq.data import TokenBlockDataset [as alias]
def _build_dataset(self, data, **kwargs):
        # Wrap raw token tensors in a test dataset, then block them.
        sizes = [len(x) for x in data]
        underlying_ds = test_utils.TestDataset(data)
        return TokenBlockDataset(underlying_ds, sizes, **kwargs)
Author: pytorch | Project: fairseq | Lines: 6 | Source file: test_token_block_dataset.py
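A sketch of how such a helper might be exercised, with illustrative values (eos=1 ends each sentence; break_mode='none' cuts 3-token blocks straight across sentence boundaries):

import torch

# Inside the same TestCase: 11 tokens total yield blocks of 3, 3, 3 and 2.
data = [
    torch.tensor([5, 4, 3, 2, 1], dtype=torch.long),
    torch.tensor([8, 7, 6, 1], dtype=torch.long),
    torch.tensor([9, 1], dtype=torch.long),
]
ds = self._build_dataset(data, block_size=3, pad=0, eos=1, break_mode='none')
assert ds[0].tolist() == [5, 4, 3]
assert ds[1].tolist() == [2, 1, 8]
assert ds[3].tolist() == [9, 1]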

Example 5: get_trainer_and_epoch_itr

# Required import: from fairseq import data [as alias]
# Alternatively: from fairseq.data import TokenBlockDataset [as alias]
def get_trainer_and_epoch_itr(epoch, epoch_size, num_updates, iterations_in_epoch):
    # block_size=1 gives one block per token; the explicit batch_sampler below
    # then places each block in its own batch.
    tokens = torch.LongTensor(list(range(epoch_size))).view(1, -1)
    tokens_ds = data.TokenBlockDataset(
        tokens, sizes=[tokens.size(-1)], block_size=1, pad=0, eos=1, include_targets=False,
    )
    trainer = mock_trainer(epoch, num_updates, iterations_in_epoch)
    dataset = data.LanguagePairDataset(tokens_ds, tokens_ds.sizes, mock_dict(), shuffle=False)
    epoch_itr = data.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=[[i] for i in range(epoch_size)],
    )
    return trainer, epoch_itr 
Author: pytorch | Project: fairseq | Lines: 15 | Source file: test_train.py
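A hypothetical driver that walks one epoch of the resulting iterator (the batch keys follow LanguagePairDataset's collater):

# Hypothetical usage of the helper above.
trainer, epoch_itr = get_trainer_and_epoch_itr(
    epoch=1, epoch_size=10, num_updates=0, iterations_in_epoch=0)
itr = epoch_itr.next_epoch_itr(shuffle=False)
for batch in itr:
    # Each batch holds exactly one sentence, per the explicit batch_sampler.
    print(batch['id'])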

Example 6: score_sentence

# Required import: from fairseq import data [as alias]
# Alternatively: from fairseq.data import TokenBlockDataset [as alias]
def score_sentence(self, line):
        # Tokenize the input sentence into a batch of size one.
        tokens = tokenizer.Tokenizer.tokenize(line, self.task.dictionary, add_if_not_exist=False).long()
        lengths = np.array([tokens.numel()])
        ds = data.TokenBlockDataset(
            tokens, lengths, self.args.tokens_per_sample,
            pad=self.task.dictionary.pad(), eos=self.task.dictionary.eos(),
            break_mode=self.args.sample_break_mode, include_targets=True,
        )

        # Create a batch iterator to wrap the data.
        add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none'
        itr = self.task.get_batch_iterator(
            dataset=data.MonolingualDataset(ds, ds.sizes, self.task.dictionary,
                                            self.task.target_dictionary,
                                            add_eos_for_other_targets=add_eos_for_other_targets,
                                            shuffle=False, targets=self.task.targets),
            max_tokens=self.args.max_tokens or 3000,
            max_sentences=self.args.max_sentences,
            max_positions=utils.resolve_max_positions(*[
                model.max_positions() for model in self.models 
            ]),
            num_shards=self.args.num_shards,
            shard_id=self.args.shard_id,
            ignore_invalid_inputs=True,
        ).next_epoch_itr(shuffle=False)
        
        # Evaluate the sentence and return the fluency score.
        results = self.scorer.score_batched_itr(itr, cuda=self.use_cuda)
        for _, _, _, hypos in results:
            for hypo in hypos:
                # Ignore words with infinite probability. This can happen when
                # running low-precision inference on the GPU. 
                pos_scores = hypo['positional_scores']
                word_prob = [score for score in pos_scores if score != float('-inf') and score != float('inf')]
                return self._fluency_score(word_prob)
        return 0.0 
Author: rgcottrell | Project: pytorch-human-performance-gec | Lines: 32 | Source file: fluency_scorer.py
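_fluency_score is not shown in the snippet. One plausible reconstruction, assuming the score maps average token cross-entropy into a bounded (0, 1] range (a hypothetical definition, not necessarily the project's exact formula):

def _fluency_score(self, word_prob):
    # word_prob holds per-token log-probabilities (negative values).
    if len(word_prob) == 0:
        return 0.0
    cross_entropy = -sum(float(p) for p in word_prob) / len(word_prob)
    # Higher average log-probability -> lower cross-entropy -> score nearer 1.
    return 1.0 / (1.0 + cross_entropy)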


Note: the fairseq.data.TokenBlockDataset examples above are collected from open-source projects hosted on GitHub and similar platforms. The code remains the property of its original authors; consult each project's license before reusing or redistributing it.