

Python utils.strip_pad Method Code Examples

This article collects typical usage examples of the utils.strip_pad method from the Python fairseq.utils module. If you are unsure exactly what utils.strip_pad does or how to call it, the curated examples below should help; they can also serve as a starting point for exploring other parts of fairseq.utils.


The following sections present 15 code examples of the utils.strip_pad method, ordered roughly by popularity.
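
Before the examples, a quick note on what the method actually does: strip_pad filters the padding symbol out of a 1-D token tensor, and is roughly equivalent to tensor[tensor.ne(pad)]. The minimal sketch below assumes fairseq is installed and uses pad index 1 (fairseq's default); the examples that follow always read the pad index from a dictionary or from self.pad instead.

import torch
from fairseq import utils

pad = 1  # assumption: fairseq's default padding index
tokens = torch.LongTensor([4, 5, 6, pad, pad])

# strip_pad keeps only the entries that differ from the pad symbol
print(utils.strip_pad(tokens, pad))  # -> tensor([4, 5, 6])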

Example 1: generate_batched_itr

# Required import: from fairseq import utils
# Alternatively: from fairseq.utils import strip_pad
def generate_batched_itr(
        self, data_itr, beam_size=None, maxlen_a=0.0, maxlen_b=None,
        cuda=False, timer=None, prefix_size=0,
    ):
        """Iterate over a batched dataset and yield individual translations.
        Args:
            maxlen_a/b: generate sequences of maximum length ax + b,
                where x is the source sentence length.
            cuda: use GPU for generation
            timer: StopwatchMeter for timing generations.
        """
        if maxlen_b is None:
            maxlen_b = self.maxlen

        for sample in data_itr:
            s = utils.move_to_cuda(sample) if cuda else sample
            if 'net_input' not in s:
                continue
            input = s['net_input']
            srclen = input['src_tokens'].size(1)
            if timer is not None:
                timer.start()
            with torch.no_grad():
                hypos = self.generate(
                    input['src_tokens'],
                    input['src_lengths'],
                    beam_size=beam_size,
                    maxlen=int(maxlen_a*srclen + maxlen_b),
                    prefix_tokens=s['target'][:, :prefix_size] if prefix_size > 0 else None,
                )
            if timer is not None:
                timer.stop(sum(len(h[0]['tokens']) for h in hypos))
            for i, id in enumerate(s['id'].data):
                # remove padding
                src = utils.strip_pad(input['src_tokens'].data[i, :], self.pad)
                ref = utils.strip_pad(s['target'].data[i, :], self.pad) if s['target'] is not None else None
                yield id, src, ref, hypos[i] 
Developer: nusnlp, Project: crosentgec, Lines: 39, Source: sequence_generator.py

Example 2: score_batched_itr

# Required import: from fairseq import utils
# Alternatively: from fairseq.utils import strip_pad
def score_batched_itr(self, data_itr, cuda=False, timer=None):
        """Iterate over a batched dataset and yield scored translations."""
        for sample in data_itr:
            s = utils.move_to_cuda(sample) if cuda else sample
            if timer is not None:
                timer.start()
            pos_scores, attn = self.score(s)
            for i, id in enumerate(s['id'].data):
                # remove padding from ref
                src = utils.strip_pad(s['net_input']['src_tokens'].data[i, :], self.pad)
                ref = utils.strip_pad(s['target'].data[i, :], self.pad) if s['target'] is not None else None
                tgt_len = ref.numel()
                pos_scores_i = pos_scores[i][:tgt_len]
                score_i = pos_scores_i.sum() / tgt_len
                if attn is not None:
                    attn_i = attn[i]
                    _, alignment = attn_i.max(dim=0)
                else:
                    attn_i = alignment = None
                hypos = [{
                    'tokens': ref,
                    'score': score_i,
                    'attention': attn_i,
                    'alignment': alignment,
                    'positional_scores': pos_scores_i,
                }]
                if timer is not None:
                    timer.stop(s['ntokens'])
                # return results in the same format as SequenceGenerator
                yield id, src, ref, hypos 
Developer: nusnlp, Project: crosentgec, Lines: 32, Source: sequence_scorer.py

Example 3: test_noising_dataset_with_eos

# Required import: from fairseq import utils
# Alternatively: from fairseq.utils import strip_pad
def test_noising_dataset_with_eos(self):
        src_dict, src_tokens, _ = self._get_test_data_with_bpe_cont_marker(
            append_eos=True
        )

        # Format data for src_dataset
        src_tokens = torch.t(src_tokens)
        src_tokens_no_pad = []
        for src_sentence in src_tokens:
            src_tokens_no_pad.append(
                utils.strip_pad(tensor=src_sentence, pad=src_dict.pad())
            )
        denoising_batch_result = self._get_noising_dataset_batch(
            src_tokens_no_pad=src_tokens_no_pad, src_dict=src_dict
        )

        eos, pad = src_dict.eos(), src_dict.pad()

        # Generated noisy source as source
        expected_src = torch.LongTensor(
            [[4, 5, 10, 11, 8, 12, 13, eos], [pad, pad, pad, 6, 8, 9, 7, eos]]
        )
        # Original clean source as target (right-padded)
        expected_tgt = torch.LongTensor(
            [[4, 5, 10, 11, 8, 12, 13, eos], [6, 7, 8, 9, eos, pad, pad, pad]]
        )
        generated_src = denoising_batch_result["net_input"]["src_tokens"]
        tgt_tokens = denoising_batch_result["target"]

        self.assertTensorEqual(expected_src, generated_src)
        self.assertTensorEqual(expected_tgt, tgt_tokens) 
Developer: pytorch, Project: fairseq, Lines: 33, Source: test_noising.py

Example 4: test_noising_dataset_without_eos

# Required import: from fairseq import utils
# Alternatively: from fairseq.utils import strip_pad
def test_noising_dataset_without_eos(self):
        """
        Similar to test_noising_dataset_with_eos, except that we have to set
        *append_eos_to_tgt* to ``True``.
        """

        src_dict, src_tokens, _ = self._get_test_data_with_bpe_cont_marker(
            append_eos=False
        )

        # Format data for src_dataset
        src_tokens = torch.t(src_tokens)
        src_tokens_no_pad = []
        for src_sentence in src_tokens:
            src_tokens_no_pad.append(
                utils.strip_pad(tensor=src_sentence, pad=src_dict.pad())
            )
        denoising_batch_result = self._get_noising_dataset_batch(
            src_tokens_no_pad=src_tokens_no_pad,
            src_dict=src_dict,
            append_eos_to_tgt=True,
        )

        eos, pad = src_dict.eos(), src_dict.pad()

        # Generated noisy source as source
        expected_src = torch.LongTensor(
            [[4, 5, 10, 11, 8, 12, 13], [pad, pad, pad, 6, 8, 9, 7]]
        )
        # Original clean source as target (right-padded)
        expected_tgt = torch.LongTensor(
            [[4, 5, 10, 11, 8, 12, 13, eos], [6, 7, 8, 9, eos, pad, pad, pad]]
        )

        generated_src = denoising_batch_result["net_input"]["src_tokens"]
        tgt_tokens = denoising_batch_result["target"]

        self.assertTensorEqual(expected_src, generated_src)
        self.assertTensorEqual(expected_tgt, tgt_tokens) 
Developer: pytorch, Project: fairseq, Lines: 41, Source: test_noising.py

Example 5: generate_batched_itr

# Required import: from fairseq import utils
# Alternatively: from fairseq.utils import strip_pad
def generate_batched_itr(
        self,
        data_itr,
        maxlen_a=None,
        maxlen_b=None,
        cuda=False,
        timer=None,
        prefix_size=0,
    ):
        """Iterate over a batched dataset and yield individual translations.

        Args:
            maxlen_a/b: generate sequences of maximum length ax + b,
                where x is the source sentence length.
            cuda: use GPU for generation
            timer: StopwatchMeter for timing generations.
        """

        for sample in data_itr:
            if "net_input" not in sample:
                continue
            if timer is not None:
                timer.start()
            with torch.no_grad():
                hypos = self.generate(
                    self.models,
                    sample,
                    prefix_tokens=sample["target"][:, :prefix_size]
                    if prefix_size > 0
                    else None,
                )
            if timer is not None:
                timer.stop(sample["ntokens"])
            for i, id in enumerate(sample["id"]):
                # remove padding
                src = utils.strip_pad(sample["net_input"]["src_tokens"][i, :], self.pad)
                ref = utils.strip_pad(sample["target"][i, :], self.pad)
                yield id, src, ref, hypos[i] 
Developer: pytorch, Project: fairseq, Lines: 40, Source: iterative_refinement_generator.py

Example 6: _inference_with_bleu

# Required import: from fairseq import utils
# Alternatively: from fairseq.utils import strip_pad
def _inference_with_bleu(self, generator, sample, model):
        import sacrebleu

        def decode(toks, escape_unk=False):
            s = self.tgt_dict.string(
                toks.int().cpu(),
                self.args.eval_bleu_remove_bpe,
                # The default unknown string in fairseq is `<unk>`, but
                # this is tokenized by sacrebleu as `< unk >`, inflating
                # BLEU scores. Instead, we use a somewhat more verbose
                # alternative that is unlikely to appear in the real
                # reference, but doesn't get split into multiple tokens.
                unk_string=(
                    "UNKNOWNTOKENINREF" if escape_unk else "UNKNOWNTOKENINHYP"
                ),
            )
            if self.tokenizer:
                s = self.tokenizer.decode(s)
            return s

        gen_out = self.inference_step(generator, [model], sample, None)
        hyps, refs = [], []
        for i in range(len(gen_out)):
            hyps.append(decode(gen_out[i][0]['tokens']))
            refs.append(decode(
                utils.strip_pad(sample['target'][i], self.tgt_dict.pad()),
                escape_unk=True,  # don't count <unk> as matches to the hypo
            ))
        if self.args.eval_bleu_print_samples:
            logger.info('example hypothesis: ' + hyps[0])
            logger.info('example reference: ' + refs[0])
        if self.args.eval_tokenized_bleu:
            return sacrebleu.corpus_bleu(hyps, [refs], tokenize='none')
        else:
            return sacrebleu.corpus_bleu(hyps, [refs]) 
Developer: pytorch, Project: fairseq, Lines: 37, Source: translation.py

Example 7: _generate_translation

# Required import: from fairseq import utils
# Alternatively: from fairseq.utils import strip_pad
def _generate_translation(self, model, tgt_dict, sample, beam_size, **kwargs):
        translator_class = beam_decode.SequenceGenerator
        translator = translator_class(models=[model], tgt_dict=tgt_dict, **kwargs)
        translator.cuda()
        s = utils.move_to_cuda(sample)

        # TODO: nbest
        input = s["net_input"]
        srclen = input["src_tokens"].size(1)
        if self.task.use_char_source:
            encoder_input = {
                k: v
                for k, v in input.items()
                if k in ["src_tokens", "src_lengths", "char_inds", "word_lengths"]
            }
        else:
            encoder_input = {
                k: v for k, v in input.items() if k in ["src_tokens", "src_lengths"]
            }
        with torch.no_grad():
            hypos = translator.generate(
                encoder_input=encoder_input,
                beam_size=beam_size,
                maxlen=int(self.args.max_len_a * srclen + self.args.max_len_b),
            )
            for i, id in enumerate(s["id"]):
                # remove padding
                src = utils.strip_pad(input["src_tokens"][i, :], tgt_dict.pad())
                yield id, src, hypos[i] 
Developer: pytorch, Project: translate, Lines: 31, Source: dual_learning_criterion.py

Example 8: generate_batched_itr

# Required import: from fairseq import utils
# Alternatively: from fairseq.utils import strip_pad
def generate_batched_itr(self, data_itr, beam_size=None, maxlen_a=0.0, maxlen_b=None,
                             cuda=False, timer=None, prefix_size=0):
        """Iterate over a batched dataset and yield individual translations.

        Args:
            maxlen_a/b: generate sequences of maximum length ax + b,
                where x is the source sentence length.
            cuda: use GPU for generation
            timer: StopwatchMeter for timing generations.
        """
        if maxlen_b is None:
            maxlen_b = self.maxlen

        for sample in data_itr:
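            # NOTE: make_variable(..., volatile=True) and maybe_no_grad() below
            # are legacy pre-PyTorch-0.4 APIs; the newer examples above use
            # utils.move_to_cuda() and torch.no_grad() instead.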
            s = utils.make_variable(sample, volatile=True, cuda=cuda)
            input = s['net_input']
            srclen = input['src_tokens'].size(1)
            if timer is not None:
                timer.start()
            with utils.maybe_no_grad():
                hypos = self.generate(
                    input['src_tokens'],
                    input['src_lengths'],
                    input['src_doctopic'],
                    input['src_wordtopics'],
                    beam_size=beam_size,
                    maxlen=int(maxlen_a*srclen + maxlen_b),
                    prefix_tokens=s['target'][:, :prefix_size] if prefix_size > 0 else None,
                )
            if timer is not None:
                timer.stop(sum([len(h[0]['tokens']) for h in hypos]))
            for i, id in enumerate(s['id'].data):
                src = input['src_tokens'].data[i, :]
                # remove padding from ref
                ref = utils.strip_pad(s['target'].data[i, :], self.pad) if s['target'] is not None else None
                yield id, src, ref, hypos[i] 
Developer: EdinburghNLP, Project: XSum, Lines: 38, Source: sequence_generator.py

Example 9: score_batched_itr

# Required import: from fairseq import utils
# Alternatively: from fairseq.utils import strip_pad
def score_batched_itr(self, data_itr, cuda=False, timer=None):
        """Iterate over a batched dataset and yield scored translations."""
        for sample in data_itr:
            s = utils.make_variable(sample, volatile=True, cuda=cuda)
            if timer is not None:
                timer.start()
            pos_scores, attn = self.score(s)
            if timer is not None:
                timer.stop(s['ntokens'])
            for i, id in enumerate(s['id'].data):
                src = s['net_input']['src_tokens'].data[i, :]
                # remove padding from ref
                ref = utils.strip_pad(s['target'].data[i, :], self.pad)
                tgt_len = ref.numel()
                pos_scores_i = pos_scores[i][:tgt_len]
                score_i = pos_scores_i.sum() / tgt_len
                attn_i = attn[i]
                _, alignment = attn_i.max(dim=0)
                hypos = [{
                    'tokens': ref,
                    'score': score_i,
                    'attention': attn_i,
                    'alignment': alignment,
                    'positional_scores': pos_scores_i,
                }]
                # return results in the same format as SequenceGenerator
                yield id, src, ref, hypos 
Developer: EdinburghNLP, Project: XSum, Lines: 29, Source: sequence_scorer.py

Example 10: generate_batched_itr

# Required import: from fairseq import utils
# Alternatively: from fairseq.utils import strip_pad
def generate_batched_itr(self, data_itr, beam_size=None, maxlen_a=0.0, maxlen_b=None,
                             cuda=False, timer=None, prefix_size=0):
        """Iterate over a batched dataset and yield individual translations.

        Args:
            maxlen_a/b: generate sequences of maximum length ax + b,
                where x is the source sentence length.
            cuda: use GPU for generation
            timer: StopwatchMeter for timing generations.
        """
        if maxlen_b is None:
            maxlen_b = self.maxlen

        for sample in data_itr:
            s = utils.make_variable(sample, volatile=True, cuda=cuda)
            input = s['net_input']
            srclen = input['src_tokens'].size(1)
            if timer is not None:
                timer.start()
            with utils.maybe_no_grad():
                hypos = self.generate(
                    input['src_tokens'],
                    input['src_lengths'],
                    beam_size=beam_size,
                    maxlen=int(maxlen_a*srclen + maxlen_b),
                    prefix_tokens=s['target'][:, :prefix_size] if prefix_size > 0 else None,
                )
            if timer is not None:
                timer.stop(sum([len(h[0]['tokens']) for h in hypos]))
            for i, id in enumerate(s['id'].data):
                src = input['src_tokens'].data[i, :]
                # remove padding from ref
                ref = utils.strip_pad(s['target'].data[i, :], self.pad) if s['target'] is not None else None
                yield id, src, ref, hypos[i] 
Developer: EdinburghNLP, Project: XSum, Lines: 36, Source: sequence_generator.py

Example 11: generate_batched_itr

# Required import: from fairseq import utils
# Alternatively: from fairseq.utils import strip_pad
def generate_batched_itr(self, data_itr, beam_size=None, cuda=False, timer=None):
        """Iterate over a batched dataset and yield individual translations.
        Args:
            cuda (bool, optional): use GPU for generation
            timer (StopwatchMeter, optional): time generations
        """
        for sample in data_itr:
            s = utils.move_to_cuda(sample) if cuda else sample
            if "net_input" not in s:
                continue
            input = s["net_input"]
            # model.forward normally channels prev_output_tokens into the decoder
            # separately, but SequenceGenerator directly calls model.encoder
            encoder_input = {
                k: v for k, v in input.items() if k != "prev_output_tokens"
            }
            if timer is not None:
                timer.start()
            with torch.no_grad():
                hypos = self.generate(encoder_input)
            if timer is not None:
                timer.stop(sum(len(h[0]["tokens"]) for h in hypos))
            for i, id in enumerate(s["id"].data):
                # remove padding
                src = utils.strip_pad(input["src_tokens"].data[i, :], self.pad)
                ref = (
                    utils.strip_pad(s["target"].data[i, :], self.pad)
                    if s["target"] is not None
                    else None
                )
                yield id, src, ref, hypos[i] 
Developer: elbayadm, Project: attn2d, Lines: 33, Source: waitk_sequence_generator.py

Example 12: _inference_with_bleu

# Required import: from fairseq import utils
# Alternatively: from fairseq.utils import strip_pad
def _inference_with_bleu(self, generator, sample, model):
        import sacrebleu

        def decode(toks, escape_unk=False):
            s = self.tgt_dict.string(
                toks.int().cpu(),
                self.args.eval_bleu_remove_bpe,
                escape_unk=escape_unk,
            )
            if self.tokenizer:
                s = self.tokenizer.decode(s)
            return s

        gen_out = self.inference_step(generator, [model], sample, None)
        hyps, refs = [], []
        for i in range(len(gen_out)):
            hyps.append(decode(gen_out[i][0]['tokens']))
            refs.append(decode(
                utils.strip_pad(sample['target'][i], self.tgt_dict.pad()),
                escape_unk=True,  # don't count <unk> as matches to the hypo
            ))
        if self.args.eval_bleu_print_samples:
            logger.info('example hypothesis: ' + hyps[0])
            logger.info('example reference: ' + refs[0])
        if self.args.eval_tokenized_bleu:
            return sacrebleu.corpus_bleu(hyps, [refs], tokenize='none')
        else:
            return sacrebleu.corpus_bleu(hyps, [refs]) 
Developer: elbayadm, Project: attn2d, Lines: 30, Source: translation.py

Example 13: forward

# Required import: from fairseq import utils
# Alternatively: from fairseq.utils import strip_pad
def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """

        assert hasattr(model.decoder, 'adaptive_softmax') and model.decoder.adaptive_softmax is not None
        adaptive_softmax = model.decoder.adaptive_softmax

        net_output = model(**sample['net_input'])
        orig_target = model.get_targets(sample, net_output)

        nsentences = orig_target.size(0)
        orig_target = orig_target.view(-1)

        bsz = orig_target.size(0)

        logits, target = adaptive_softmax(net_output[0], orig_target)
        assert len(target) == len(logits)

        loss = net_output[0].new(1 if reduce else bsz).zero_()

        for i in range(len(target)):
            if target[i] is not None:
                assert (target[i].min() >= 0 and target[i].max() <= logits[i].size(1))
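                # NOTE: size_average= and reduce= are the legacy (pre-1.0)
                # PyTorch reduction flags; Example 14 below is the same
                # criterion updated to the newer reduction= argument.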
                loss += F.cross_entropy(logits[i], target[i], size_average=False, ignore_index=self.padding_idx,
                                        reduce=reduce)

        orig = utils.strip_pad(orig_target, self.padding_idx)
        ntokens = orig.numel()
        sample_size = sample['target'].size(0) if self.args.sentence_avg else ntokens
        logging_output = {
            'loss': utils.item(loss.data) if reduce else loss.data,
            'ntokens': ntokens,
            'nsentences': nsentences,
            'sample_size': sample_size,
        }
        return loss, sample_size, logging_output 
Developer: kakaobrain, Project: helo_word, Lines: 43, Source: adaptive_loss.py

Example 14: forward

# Required import: from fairseq import utils
# Alternatively: from fairseq.utils import strip_pad
def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """

        assert hasattr(model.decoder, 'adaptive_softmax') and model.decoder.adaptive_softmax is not None
        adaptive_softmax = model.decoder.adaptive_softmax

        net_output = model(**sample['net_input'])
        orig_target = model.get_targets(sample, net_output)

        nsentences = orig_target.size(0)
        orig_target = orig_target.view(-1)

        bsz = orig_target.size(0)

        logits, target = adaptive_softmax(net_output[0], orig_target)
        assert len(target) == len(logits)

        loss = net_output[0].new(1 if reduce else bsz).zero_()

        for i in range(len(target)):
            if target[i] is not None:
                assert (target[i].min() >= 0 and target[i].max() <= logits[i].size(1))
                loss += F.cross_entropy(
                    logits[i],
                    target[i],
                    ignore_index=self.padding_idx,
                    reduction='sum' if reduce else 'none',
                )

        orig = utils.strip_pad(orig_target, self.padding_idx)
        ntokens = orig.numel()
        sample_size = sample['target'].size(0) if self.sentence_avg else ntokens
        logging_output = {
            'loss': loss.data,
            'ntokens': ntokens,
            'nsentences': nsentences,
            'sample_size': sample_size,
        }
        return loss, sample_size, logging_output 
Developer: pytorch, Project: fairseq, Lines: 47, Source: adaptive_loss.py

Example 15: generate_batched_itr

# Required import: from fairseq import utils
# Alternatively: from fairseq.utils import strip_pad
def generate_batched_itr(
        self,
        data_itr,
        beam_size=None,
        maxlen_a=0.0,
        maxlen_b=None,
        cuda=False,
        timer=None,
        prefix_size=0,
    ):
        """Iterate over a batched dataset and yield individual translations.

        Args:
            maxlen_a/b: generate sequences of maximum length ax + b,
                where x is the source sentence length.
            cuda: use GPU for generation
            timer: StopwatchMeter for timing generations.
        """
        if maxlen_b is None:
            maxlen_b = self.maxlen

        for sample in data_itr:
            s = utils.move_to_cuda(sample) if cuda else sample
            input = s["net_input"]
            # Take the max source length to compute the max target length
            srclen = input["src_tokens"].size(1)
            # FIXME: handle characters properly
            if self.use_char_source:
                raise ValueError(
                    "Character level encoder is not supported yet for "
                    "multisource sentences."
                )
            encoder_inputs = (input["src_tokens"], input["src_lengths"])
            if timer is not None:
                timer.start()
            with torch.no_grad():
                hypos = self.generate(
                    encoder_inputs,
                    srcs_ids=input["src_ids"],
                    beam_size=beam_size,
                    maxlen=int(maxlen_a * srclen + maxlen_b),
                    prefix_tokens=s["target"][:, :prefix_size]
                    if prefix_size > 0
                    else None,
                )
            if timer is not None:
                timer.stop(s["ntokens"])
            for i, id in enumerate(s["id"]):
                src = input["src_tokens"].index_select(
                    0, input["src_ids"][self.align_to]
                )
                # remove padding from ref
                ref = utils.strip_pad(s["target"][i, :], self.pad)
                yield id, src, ref, hypos[i] 
Developer: pytorch, Project: translate, Lines: 57, Source: multisource_decode.py
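
All fifteen examples follow the same pattern: take row i of a batched, padded tensor, call strip_pad on it, and only then score, compare, or print it. The sketch below is a minimal illustration of the typical final step, turning the stripped tokens back into a string; it is not taken from any of the projects above, and it assumes tgt_dict is a fairseq Dictionary and sample is a batch in the format used throughout.

from fairseq import utils

def iter_target_strings(sample, tgt_dict):
    """Yield each target sentence in the batch as a detokenized string."""
    for i in range(sample["target"].size(0)):
        # remove padding before converting token IDs to a string
        ref = utils.strip_pad(sample["target"][i, :], tgt_dict.pad())
        yield tgt_dict.string(ref.int().cpu())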


Note: the fairseq.utils.strip_pad examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from community-contributed open-source projects; copyright in the source code remains with the original authors, and any distribution or use should follow each project's license. Please do not repost without permission.