This article collects typical usage examples of the Python method fairseq.utils.strip_pad. If you are wondering what utils.strip_pad is for, how to call it, or simply want to see it used in real code, the curated samples below should help. You can also explore the enclosing fairseq.utils module for more context.
The following shows 15 code examples of utils.strip_pad, sorted by popularity by default.
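Before the examples, here is a minimal sketch of what strip_pad itself does: it removes every occurrence of the padding index from a token tensor, leaving only real tokens. The padding index of 1 below is an assumption (it is the default in fairseq dictionaries); in the examples that follow it is always obtained from the dictionary via pad() or stored on the generator/criterion (self.pad, self.padding_idx).

import torch
from fairseq import utils

pad_idx = 1  # assumed padding index (fairseq Dictionary default)
tokens = torch.LongTensor([4, 5, 10, 11, 2, pad_idx, pad_idx])  # toy ids with trailing padding
print(utils.strip_pad(tokens, pad_idx))  # tensor([ 4,  5, 10, 11,  2])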
Example 1: generate_batched_itr
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import strip_pad [as alias]
def generate_batched_itr(
    self, data_itr, beam_size=None, maxlen_a=0.0, maxlen_b=None,
    cuda=False, timer=None, prefix_size=0,
):
    """Iterate over a batched dataset and yield individual translations.

    Args:
        maxlen_a/b: generate sequences of maximum length ax + b,
            where x is the source sentence length.
        cuda: use GPU for generation
        timer: StopwatchMeter for timing generations.
    """
    if maxlen_b is None:
        maxlen_b = self.maxlen

    for sample in data_itr:
        s = utils.move_to_cuda(sample) if cuda else sample
        if 'net_input' not in s:
            continue
        input = s['net_input']
        srclen = input['src_tokens'].size(1)
        if timer is not None:
            timer.start()
        with torch.no_grad():
            hypos = self.generate(
                input['src_tokens'],
                input['src_lengths'],
                beam_size=beam_size,
                maxlen=int(maxlen_a*srclen + maxlen_b),
                prefix_tokens=s['target'][:, :prefix_size] if prefix_size > 0 else None,
            )
        if timer is not None:
            timer.stop(sum(len(h[0]['tokens']) for h in hypos))
        for i, id in enumerate(s['id'].data):
            # remove padding
            src = utils.strip_pad(input['src_tokens'].data[i, :], self.pad)
            ref = utils.strip_pad(s['target'].data[i, :], self.pad) if s['target'] is not None else None
            yield id, src, ref, hypos[i]
Example 2: score_batched_itr
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import strip_pad [as alias]
def score_batched_itr(self, data_itr, cuda=False, timer=None):
    """Iterate over a batched dataset and yield scored translations."""
    for sample in data_itr:
        s = utils.move_to_cuda(sample) if cuda else sample
        if timer is not None:
            timer.start()
        pos_scores, attn = self.score(s)
        for i, id in enumerate(s['id'].data):
            # remove padding from ref
            src = utils.strip_pad(s['net_input']['src_tokens'].data[i, :], self.pad)
            ref = utils.strip_pad(s['target'].data[i, :], self.pad) if s['target'] is not None else None
            tgt_len = ref.numel()
            pos_scores_i = pos_scores[i][:tgt_len]
            score_i = pos_scores_i.sum() / tgt_len
            if attn is not None:
                attn_i = attn[i]
                _, alignment = attn_i.max(dim=0)
            else:
                attn_i = alignment = None
            hypos = [{
                'tokens': ref,
                'score': score_i,
                'attention': attn_i,
                'alignment': alignment,
                'positional_scores': pos_scores_i,
            }]
            if timer is not None:
                timer.stop(s['ntokens'])
            # return results in the same format as SequenceGenerator
            yield id, src, ref, hypos
Example 3: test_noising_dataset_with_eos
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import strip_pad [as alias]
def test_noising_dataset_with_eos(self):
    src_dict, src_tokens, _ = self._get_test_data_with_bpe_cont_marker(
        append_eos=True
    )

    # Format data for src_dataset
    src_tokens = torch.t(src_tokens)
    src_tokens_no_pad = []
    for src_sentence in src_tokens:
        src_tokens_no_pad.append(
            utils.strip_pad(tensor=src_sentence, pad=src_dict.pad())
        )
    denoising_batch_result = self._get_noising_dataset_batch(
        src_tokens_no_pad=src_tokens_no_pad, src_dict=src_dict
    )

    eos, pad = src_dict.eos(), src_dict.pad()

    # Generated noisy source as source
    expected_src = torch.LongTensor(
        [[4, 5, 10, 11, 8, 12, 13, eos], [pad, pad, pad, 6, 8, 9, 7, eos]]
    )
    # Original clean source as target (right-padded)
    expected_tgt = torch.LongTensor(
        [[4, 5, 10, 11, 8, 12, 13, eos], [6, 7, 8, 9, eos, pad, pad, pad]]
    )
    generated_src = denoising_batch_result["net_input"]["src_tokens"]
    tgt_tokens = denoising_batch_result["target"]

    self.assertTensorEqual(expected_src, generated_src)
    self.assertTensorEqual(expected_tgt, tgt_tokens)
Example 4: test_noising_dataset_without_eos
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import strip_pad [as alias]
def test_noising_dataset_without_eos(self):
    """
    Similar to test noising dataset with eos except that we have to set
    *append_eos_to_tgt* to ``True``.
    """
    src_dict, src_tokens, _ = self._get_test_data_with_bpe_cont_marker(
        append_eos=False
    )

    # Format data for src_dataset
    src_tokens = torch.t(src_tokens)
    src_tokens_no_pad = []
    for src_sentence in src_tokens:
        src_tokens_no_pad.append(
            utils.strip_pad(tensor=src_sentence, pad=src_dict.pad())
        )
    denoising_batch_result = self._get_noising_dataset_batch(
        src_tokens_no_pad=src_tokens_no_pad,
        src_dict=src_dict,
        append_eos_to_tgt=True,
    )

    eos, pad = src_dict.eos(), src_dict.pad()

    # Generated noisy source as source
    expected_src = torch.LongTensor(
        [[4, 5, 10, 11, 8, 12, 13], [pad, pad, pad, 6, 8, 9, 7]]
    )
    # Original clean source as target (right-padded)
    expected_tgt = torch.LongTensor(
        [[4, 5, 10, 11, 8, 12, 13, eos], [6, 7, 8, 9, eos, pad, pad, pad]]
    )
    generated_src = denoising_batch_result["net_input"]["src_tokens"]
    tgt_tokens = denoising_batch_result["target"]

    self.assertTensorEqual(expected_src, generated_src)
    self.assertTensorEqual(expected_tgt, tgt_tokens)
Example 5: generate_batched_itr
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import strip_pad [as alias]
def generate_batched_itr(
    self,
    data_itr,
    maxlen_a=None,
    maxlen_b=None,
    cuda=False,
    timer=None,
    prefix_size=0,
):
    """Iterate over a batched dataset and yield individual translations.

    Args:
        maxlen_a/b: generate sequences of maximum length ax + b,
            where x is the source sentence length.
        cuda: use GPU for generation
        timer: StopwatchMeter for timing generations.
    """
    for sample in data_itr:
        if "net_input" not in sample:
            continue
        if timer is not None:
            timer.start()
        with torch.no_grad():
            hypos = self.generate(
                self.models,
                sample,
                prefix_tokens=sample["target"][:, :prefix_size]
                if prefix_size > 0
                else None,
            )
        if timer is not None:
            timer.stop(sample["ntokens"])
        for i, id in enumerate(sample["id"]):
            # remove padding
            src = utils.strip_pad(sample["net_input"]["src_tokens"][i, :], self.pad)
            ref = utils.strip_pad(sample["target"][i, :], self.pad)
            yield id, src, ref, hypos[i]
Example 6: _inference_with_bleu
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import strip_pad [as alias]
def _inference_with_bleu(self, generator, sample, model):
    import sacrebleu

    def decode(toks, escape_unk=False):
        s = self.tgt_dict.string(
            toks.int().cpu(),
            self.args.eval_bleu_remove_bpe,
            # The default unknown string in fairseq is `<unk>`, but
            # this is tokenized by sacrebleu as `< unk >`, inflating
            # BLEU scores. Instead, we use a somewhat more verbose
            # alternative that is unlikely to appear in the real
            # reference, but doesn't get split into multiple tokens.
            unk_string=(
                "UNKNOWNTOKENINREF" if escape_unk else "UNKNOWNTOKENINHYP"
            ),
        )
        if self.tokenizer:
            s = self.tokenizer.decode(s)
        return s

    gen_out = self.inference_step(generator, [model], sample, None)
    hyps, refs = [], []
    for i in range(len(gen_out)):
        hyps.append(decode(gen_out[i][0]['tokens']))
        refs.append(decode(
            utils.strip_pad(sample['target'][i], self.tgt_dict.pad()),
            escape_unk=True,  # don't count <unk> as matches to the hypo
        ))
    if self.args.eval_bleu_print_samples:
        logger.info('example hypothesis: ' + hyps[0])
        logger.info('example reference: ' + refs[0])
    if self.args.eval_tokenized_bleu:
        return sacrebleu.corpus_bleu(hyps, [refs], tokenize='none')
    else:
        return sacrebleu.corpus_bleu(hyps, [refs])
Example 7: _generate_translation
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import strip_pad [as alias]
def _generate_translation(self, model, tgt_dict, sample, beam_size, **kwargs):
    translator_class = beam_decode.SequenceGenerator
    translator = translator_class(models=[model], tgt_dict=tgt_dict, **kwargs)
    translator.cuda()
    s = utils.move_to_cuda(sample)

    # TODO: nbest
    input = s["net_input"]
    srclen = input["src_tokens"].size(1)
    if self.task.use_char_source:
        encoder_input = {
            k: v
            for k, v in input.items()
            if k in ["src_tokens", "src_lengths", "char_inds", "word_lengths"]
        }
    else:
        encoder_input = {
            k: v for k, v in input.items() if k in ["src_tokens", "src_lengths"]
        }
    with torch.no_grad():
        hypos = translator.generate(
            encoder_input=encoder_input,
            beam_size=beam_size,
            maxlen=int(self.args.max_len_a * srclen + self.args.max_len_b),
        )
    for i, id in enumerate(s["id"]):
        # remove padding
        src = utils.strip_pad(input["src_tokens"][i, :], tgt_dict.pad())
        yield id, src, hypos[i]
Example 8: generate_batched_itr
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import strip_pad [as alias]
def generate_batched_itr(self, data_itr, beam_size=None, maxlen_a=0.0, maxlen_b=None,
                         cuda=False, timer=None, prefix_size=0):
    """Iterate over a batched dataset and yield individual translations.

    Args:
        maxlen_a/b: generate sequences of maximum length ax + b,
            where x is the source sentence length.
        cuda: use GPU for generation
        timer: StopwatchMeter for timing generations.
    """
    if maxlen_b is None:
        maxlen_b = self.maxlen

    for sample in data_itr:
        s = utils.make_variable(sample, volatile=True, cuda=cuda)
        input = s['net_input']
        srclen = input['src_tokens'].size(1)
        if timer is not None:
            timer.start()
        with utils.maybe_no_grad():
            hypos = self.generate(
                input['src_tokens'],
                input['src_lengths'],
                input['src_doctopic'],
                input['src_wordtopics'],
                beam_size=beam_size,
                maxlen=int(maxlen_a*srclen + maxlen_b),
                prefix_tokens=s['target'][:, :prefix_size] if prefix_size > 0 else None,
            )
        if timer is not None:
            timer.stop(sum([len(h[0]['tokens']) for h in hypos]))
        for i, id in enumerate(s['id'].data):
            src = input['src_tokens'].data[i, :]
            # remove padding from ref
            ref = utils.strip_pad(s['target'].data[i, :], self.pad) if s['target'] is not None else None
            yield id, src, ref, hypos[i]
Example 9: score_batched_itr
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import strip_pad [as alias]
def score_batched_itr(self, data_itr, cuda=False, timer=None):
    """Iterate over a batched dataset and yield scored translations."""
    for sample in data_itr:
        s = utils.make_variable(sample, volatile=True, cuda=cuda)
        if timer is not None:
            timer.start()
        pos_scores, attn = self.score(s)
        if timer is not None:
            timer.stop(s['ntokens'])
        for i, id in enumerate(s['id'].data):
            src = s['net_input']['src_tokens'].data[i, :]
            # remove padding from ref
            ref = utils.strip_pad(s['target'].data[i, :], self.pad)
            tgt_len = ref.numel()
            pos_scores_i = pos_scores[i][:tgt_len]
            score_i = pos_scores_i.sum() / tgt_len
            attn_i = attn[i]
            _, alignment = attn_i.max(dim=0)
            hypos = [{
                'tokens': ref,
                'score': score_i,
                'attention': attn_i,
                'alignment': alignment,
                'positional_scores': pos_scores_i,
            }]
            # return results in the same format as SequenceGenerator
            yield id, src, ref, hypos
Example 10: generate_batched_itr
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import strip_pad [as alias]
def generate_batched_itr(self, data_itr, beam_size=None, maxlen_a=0.0, maxlen_b=None,
                         cuda=False, timer=None, prefix_size=0):
    """Iterate over a batched dataset and yield individual translations.

    Args:
        maxlen_a/b: generate sequences of maximum length ax + b,
            where x is the source sentence length.
        cuda: use GPU for generation
        timer: StopwatchMeter for timing generations.
    """
    if maxlen_b is None:
        maxlen_b = self.maxlen

    for sample in data_itr:
        s = utils.make_variable(sample, volatile=True, cuda=cuda)
        input = s['net_input']
        srclen = input['src_tokens'].size(1)
        if timer is not None:
            timer.start()
        with utils.maybe_no_grad():
            hypos = self.generate(
                input['src_tokens'],
                input['src_lengths'],
                beam_size=beam_size,
                maxlen=int(maxlen_a*srclen + maxlen_b),
                prefix_tokens=s['target'][:, :prefix_size] if prefix_size > 0 else None,
            )
        if timer is not None:
            timer.stop(sum([len(h[0]['tokens']) for h in hypos]))
        for i, id in enumerate(s['id'].data):
            src = input['src_tokens'].data[i, :]
            # remove padding from ref
            ref = utils.strip_pad(s['target'].data[i, :], self.pad) if s['target'] is not None else None
            yield id, src, ref, hypos[i]
Example 11: generate_batched_itr
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import strip_pad [as alias]
def generate_batched_itr(self, data_itr, beam_size=None, cuda=False, timer=None):
    """Iterate over a batched dataset and yield individual translations.

    Args:
        cuda (bool, optional): use GPU for generation
        timer (StopwatchMeter, optional): time generations
    """
    for sample in data_itr:
        s = utils.move_to_cuda(sample) if cuda else sample
        if "net_input" not in s:
            continue
        input = s["net_input"]
        # model.forward normally channels prev_output_tokens into the decoder
        # separately, but SequenceGenerator directly calls model.encoder
        encoder_input = {
            k: v for k, v in input.items() if k != "prev_output_tokens"
        }
        if timer is not None:
            timer.start()
        with torch.no_grad():
            hypos = self.generate(encoder_input)
        if timer is not None:
            timer.stop(sum(len(h[0]["tokens"]) for h in hypos))
        for i, id in enumerate(s["id"].data):
            # remove padding
            src = utils.strip_pad(input["src_tokens"].data[i, :], self.pad)
            ref = (
                utils.strip_pad(s["target"].data[i, :], self.pad)
                if s["target"] is not None
                else None
            )
            yield id, src, ref, hypos[i]
Example 12: _inference_with_bleu
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import strip_pad [as alias]
def _inference_with_bleu(self, generator, sample, model):
    import sacrebleu

    def decode(toks, escape_unk=False):
        s = self.tgt_dict.string(
            toks.int().cpu(),
            self.args.eval_bleu_remove_bpe,
            escape_unk=escape_unk,
        )
        if self.tokenizer:
            s = self.tokenizer.decode(s)
        return s

    gen_out = self.inference_step(generator, [model], sample, None)
    hyps, refs = [], []
    for i in range(len(gen_out)):
        hyps.append(decode(gen_out[i][0]['tokens']))
        refs.append(decode(
            utils.strip_pad(sample['target'][i], self.tgt_dict.pad()),
            escape_unk=True,  # don't count <unk> as matches to the hypo
        ))
    if self.args.eval_bleu_print_samples:
        logger.info('example hypothesis: ' + hyps[0])
        logger.info('example reference: ' + refs[0])
    if self.args.eval_tokenized_bleu:
        return sacrebleu.corpus_bleu(hyps, [refs], tokenize='none')
    else:
        return sacrebleu.corpus_bleu(hyps, [refs])
Example 13: forward
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import strip_pad [as alias]
def forward(self, model, sample, reduce=True):
    """Compute the loss for the given sample.

    Returns a tuple with three elements:
    1) the loss
    2) the sample size, which is used as the denominator for the gradient
    3) logging outputs to display while training
    """
    assert hasattr(model.decoder, 'adaptive_softmax') and model.decoder.adaptive_softmax is not None
    adaptive_softmax = model.decoder.adaptive_softmax

    net_output = model(**sample['net_input'])
    orig_target = model.get_targets(sample, net_output)

    nsentences = orig_target.size(0)
    orig_target = orig_target.view(-1)

    bsz = orig_target.size(0)

    logits, target = adaptive_softmax(net_output[0], orig_target)
    assert len(target) == len(logits)

    loss = net_output[0].new(1 if reduce else bsz).zero_()

    for i in range(len(target)):
        if target[i] is not None:
            assert (target[i].min() >= 0 and target[i].max() <= logits[i].size(1))
            loss += F.cross_entropy(logits[i], target[i], size_average=False, ignore_index=self.padding_idx,
                                    reduce=reduce)

    orig = utils.strip_pad(orig_target, self.padding_idx)
    ntokens = orig.numel()
    sample_size = sample['target'].size(0) if self.args.sentence_avg else ntokens
    logging_output = {
        'loss': utils.item(loss.data) if reduce else loss.data,
        'ntokens': ntokens,
        'nsentences': nsentences,
        'sample_size': sample_size,
    }
    return loss, sample_size, logging_output
Example 14: forward
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import strip_pad [as alias]
def forward(self, model, sample, reduce=True):
    """Compute the loss for the given sample.

    Returns a tuple with three elements:
    1) the loss
    2) the sample size, which is used as the denominator for the gradient
    3) logging outputs to display while training
    """
    assert hasattr(model.decoder, 'adaptive_softmax') and model.decoder.adaptive_softmax is not None
    adaptive_softmax = model.decoder.adaptive_softmax

    net_output = model(**sample['net_input'])
    orig_target = model.get_targets(sample, net_output)

    nsentences = orig_target.size(0)
    orig_target = orig_target.view(-1)

    bsz = orig_target.size(0)

    logits, target = adaptive_softmax(net_output[0], orig_target)
    assert len(target) == len(logits)

    loss = net_output[0].new(1 if reduce else bsz).zero_()

    for i in range(len(target)):
        if target[i] is not None:
            assert (target[i].min() >= 0 and target[i].max() <= logits[i].size(1))
            loss += F.cross_entropy(
                logits[i],
                target[i],
                ignore_index=self.padding_idx,
                reduction='sum' if reduce else 'none',
            )

    orig = utils.strip_pad(orig_target, self.padding_idx)
    ntokens = orig.numel()
    sample_size = sample['target'].size(0) if self.sentence_avg else ntokens
    logging_output = {
        'loss': loss.data,
        'ntokens': ntokens,
        'nsentences': nsentences,
        'sample_size': sample_size,
    }
    return loss, sample_size, logging_output
Example 15: generate_batched_itr
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import strip_pad [as alias]
def generate_batched_itr(
    self,
    data_itr,
    beam_size=None,
    maxlen_a=0.0,
    maxlen_b=None,
    cuda=False,
    timer=None,
    prefix_size=0,
):
    """Iterate over a batched dataset and yield individual translations.

    Args:
        maxlen_a/b: generate sequences of maximum length ax + b,
            where x is the source sentence length.
        cuda: use GPU for generation
        timer: StopwatchMeter for timing generations.
    """
    if maxlen_b is None:
        maxlen_b = self.maxlen

    for sample in data_itr:
        s = utils.move_to_cuda(sample) if cuda else sample
        input = s["net_input"]
        # Take the max source length to compute the max target length
        srclen = input["src_tokens"].size(1)
        # FIXME: handle characters properly
        if self.use_char_source:
            raise ValueError(
                "Character level encoder is not supported yet for "
                "multisource sentences."
            )
        encoder_inputs = (input["src_tokens"], input["src_lengths"])
        if timer is not None:
            timer.start()
        with torch.no_grad():
            hypos = self.generate(
                encoder_inputs,
                srcs_ids=input["src_ids"],
                beam_size=beam_size,
                maxlen=int(maxlen_a * srclen + maxlen_b),
                prefix_tokens=s["target"][:, :prefix_size]
                if prefix_size > 0
                else None,
            )
        if timer is not None:
            timer.stop(s["ntokens"])
        for i, id in enumerate(s["id"]):
            src = input["src_tokens"].index_select(
                0, input["src_ids"][self.align_to]
            )
            # remove padding from ref
            ref = utils.strip_pad(s["target"][i, :], self.pad)
            yield id, src, ref, hypos[i]