This article collects typical usage examples of the Python method fairseq.data.TokenBlockDataset. If you have been wondering how data.TokenBlockDataset is used in practice, what its arguments look like, or want working examples, the curated code samples below may help. You can also explore further usage examples for the containing module, fairseq.data.
Six code examples of data.TokenBlockDataset are shown below, sorted by popularity by default.
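Before diving into the examples, here is a minimal, self-contained sketch of how a TokenBlockDataset is typically constructed. The constructor signature has varied across fairseq releases (older versions accept tokens plus per-item sizes, newer ones an underlying indexed dataset), so treat the argument layout and the toy token ids below as assumptions for orientation rather than a definitive reference:

import torch
from fairseq.data import TokenBlockDataset

# Toy corpus: one LongTensor of token ids per sentence, each ending with EOS (id 2).
sentences = [torch.LongTensor([5, 6, 7, 2]), torch.LongTensor([8, 9, 2])]
sizes = [t.numel() for t in sentences]

ds = TokenBlockDataset(
    sentences,              # token source (a list of tensors here; the accepted type depends on the fairseq version)
    sizes,                  # length of each input item
    4,                      # block size, usually args.tokens_per_sample
    pad=1,                  # padding index from the dictionary
    eos=2,                  # end-of-sentence index from the dictionary
    break_mode='eos',       # split the token stream at sentence boundaries
    include_targets=True,   # also return shifted targets for language modeling
)
print(len(ds), ds.sizes)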
Example 1: make_batches
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import TokenBlockDataset [as alias]
def make_batches(self, lines):
    token_lst = [
        self.task.source_dictionary.encode_line(line, add_if_not_exist=False).long()
        for line in lines
    ]
    length_lst = torch.LongTensor([tokens.numel() for tokens in token_lst])
    ds = data.TokenBlockDataset(
        token_lst, length_lst, self.args.tokens_per_sample,
        pad=self.task.dictionary.pad(),
        eos=self.task.dictionary.eos(),
        break_mode='eos', include_targets=True,
    )
    add_eos_for_other_targets = (
        self.args.sample_break_mode is not None
        and self.args.sample_break_mode != 'none'
    )
    itr = self.task.get_batch_iterator(
        dataset=data.MonolingualDataset(
            ds, ds.sizes, self.task.dictionary, self.task.target_dictionary,
            add_eos_for_other_targets, shuffle=False, targets=self.task.targets,
        ),
        max_tokens=self.args.max_tokens or 3000,
        max_sentences=self.args.max_sentences,
        max_positions=utils.resolve_max_positions(
            *[model.max_positions() for model in self.models]
        ),
        num_shards=self.args.num_shards,
        shard_id=self.args.shard_id,
        ignore_invalid_inputs=True,
        num_workers=self.args.num_workers,
    ).next_epoch_itr(shuffle=False)
    return itr
Example 2: get_trainer_and_epoch_itr
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import TokenBlockDataset [as alias]
def get_trainer_and_epoch_itr(epoch, epoch_size, num_updates, iterations_in_epoch):
    tokens = torch.LongTensor(list(range(epoch_size)))
    tokens_ds = data.TokenBlockDataset(tokens, [len(tokens)], 1, include_targets=False)
    trainer = mock_trainer(epoch, num_updates, iterations_in_epoch)
    epoch_itr = data.EpochBatchIterator(
        dataset=data.LanguagePairDataset(tokens_ds, tokens_ds.sizes, mock_dict(), shuffle=False),
        max_tokens=1,
    )
    return trainer, epoch_itr
Example 3: setUp
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import TokenBlockDataset [as alias]
def setUp(self):
    d = mock_dict()
    tokens_1 = torch.LongTensor([1]).view(1, -1)
    tokens_ds1 = TokenBlockDataset(
        tokens_1,
        sizes=[tokens_1.size(-1)],
        block_size=1,
        pad=0,
        eos=1,
        include_targets=False,
    )
    self.dataset_1 = LanguagePairDataset(
        tokens_ds1, tokens_ds1.sizes, d, shuffle=False
    )
    tokens_2 = torch.LongTensor([2]).view(1, -1)
    tokens_ds2 = TokenBlockDataset(
        tokens_2,
        sizes=[tokens_2.size(-1)],
        block_size=1,
        pad=0,
        eos=1,
        include_targets=False,
    )
    self.dataset_2 = LanguagePairDataset(
        tokens_ds2, tokens_ds2.sizes, d, shuffle=False
    )
Example 4: _build_dataset
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import TokenBlockDataset [as alias]
def _build_dataset(self, data, **kwargs):
    sizes = [len(x) for x in data]
    underlying_ds = test_utils.TestDataset(data)
    return TokenBlockDataset(underlying_ds, sizes, **kwargs)
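For orientation, a call to this helper from another method of the same test case might look like the sketch below. The concrete token values and the block_size/pad/eos/break_mode arguments are illustrative assumptions, not taken from the surrounding test file:

def test_none_break_mode(self):
    # Hypothetical test method; assumes it lives in the same TestCase as _build_dataset
    # and that torch is imported at module level.
    data = [torch.LongTensor([1, 2, 3, 4]), torch.LongTensor([5, 6])]
    ds = self._build_dataset(data, block_size=3, pad=0, eos=1, break_mode='none', include_targets=False)
    # With break_mode='none' the stream is chopped into fixed-size blocks,
    # so the block sizes should add up to the total number of input tokens.
    self.assertEqual(sum(ds.sizes), sum(len(x) for x in data))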
Example 5: get_trainer_and_epoch_itr
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import TokenBlockDataset [as alias]
def get_trainer_and_epoch_itr(epoch, epoch_size, num_updates, iterations_in_epoch):
    tokens = torch.LongTensor(list(range(epoch_size))).view(1, -1)
    tokens_ds = data.TokenBlockDataset(
        tokens, sizes=[tokens.size(-1)], block_size=1, pad=0, eos=1, include_targets=False,
    )
    trainer = mock_trainer(epoch, num_updates, iterations_in_epoch)
    dataset = data.LanguagePairDataset(tokens_ds, tokens_ds.sizes, mock_dict(), shuffle=False)
    epoch_itr = data.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=[[i] for i in range(epoch_size)],
    )
    return trainer, epoch_itr
Example 6: score_sentence
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import TokenBlockDataset [as alias]
def score_sentence(self, line):
    # Tokenize the input sentence into a batch of size one.
    tokens = tokenizer.Tokenizer.tokenize(
        line, self.task.dictionary, add_if_not_exist=False
    ).long()
    lengths = np.array([tokens.numel()])
    ds = data.TokenBlockDataset(
        tokens, lengths, self.args.tokens_per_sample,
        pad=self.task.dictionary.pad(), eos=self.task.dictionary.eos(),
        break_mode=self.args.sample_break_mode, include_targets=True,
    )
    # Create a batch iterator to wrap the data.
    add_eos_for_other_targets = (
        self.args.sample_break_mode is not None
        and self.args.sample_break_mode != 'none'
    )
    itr = self.task.get_batch_iterator(
        dataset=data.MonolingualDataset(
            ds, ds.sizes, self.task.dictionary, self.task.target_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets,
            shuffle=False, targets=self.task.targets,
        ),
        max_tokens=self.args.max_tokens or 3000,
        max_sentences=self.args.max_sentences,
        max_positions=utils.resolve_max_positions(
            *[model.max_positions() for model in self.models]
        ),
        num_shards=self.args.num_shards,
        shard_id=self.args.shard_id,
        ignore_invalid_inputs=True,
    ).next_epoch_itr(shuffle=False)
    # Evaluate the sentence and return the fluency score.
    results = self.scorer.score_batched_itr(itr, cuda=self.use_cuda)
    for _, _, _, hypos in results:
        for hypo in hypos:
            # Ignore words with infinite probability. This can happen when
            # running low-precision inference on the GPU.
            pos_scores = hypo['positional_scores']
            word_prob = [
                score for score in pos_scores
                if score != float('-inf') and score != float('inf')
            ]
            return self._fluency_score(word_prob)
    return 0.0
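The _fluency_score helper used on the last line is not shown in this example. Purely as a hypothetical illustration (not the original implementation), one simple definition would average the per-token log-probabilities and map the mean through exp() so the score falls in (0, 1]:

import math

def _fluency_score(self, word_prob):
    # Hypothetical sketch only; the original helper is not shown above.
    # word_prob holds per-token log-probabilities, so their mean is <= 0 and
    # exp(mean) yields a score in (0, 1], with higher meaning more fluent.
    if not word_prob:
        return 0.0
    mean_logprob = sum(float(p) for p in word_prob) / len(word_prob)
    return math.exp(mean_logprob)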