This article collects typical usage examples of fairseq.data.LanguagePairDataset in Python. If you are wondering what data.LanguagePairDataset does, how to call it, or want to see it used in real code, the curated examples below should help. You can also explore further usage examples from the fairseq.data module.
Eleven code examples of data.LanguagePairDataset are shown below, sorted by popularity by default.
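Before diving into the examples, here is a minimal, self-contained sketch of building a LanguagePairDataset directly from in-memory tensors. The toy dictionary and sentences are invented for illustration, and constructor defaults may differ slightly across fairseq versions; the argument order (src, src_sizes, src_dict) simply mirrors what the examples below use.

# Minimal illustrative sketch; all data below is made up.
import torch
from fairseq.data import Dictionary, LanguagePairDataset

src_dict = Dictionary()                     # toy dictionary; real code usually calls Dictionary.load(...)
hello = src_dict.add_symbol("hello")        # add_symbol returns the index of the added symbol
world = src_dict.add_symbol("world")

# Two toy source sentences encoded as LongTensors of token indices.
src_tokens = [
    torch.LongTensor([hello, world, src_dict.eos()]),
    torch.LongTensor([world, src_dict.eos()]),
]
src_sizes = [t.numel() for t in src_tokens]

# Source-only (monolingual) usage, as in the inference examples further down.
dataset = LanguagePairDataset(src_tokens, src_sizes, src_dict)
print(len(dataset), dataset[0]["source"])   # each item is a dict with 'id', 'source', and (optional) 'target'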
Example 1: make_batches
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import LanguagePairDataset [as alias]
def make_batches(lines, args, src_dict, max_positions):
    tokens = [
        tokenizer.Tokenizer.tokenize(src_str, src_dict, add_if_not_exist=False).long()
        for src_str in lines
    ]
    lengths = np.array([t.numel() for t in tokens])
    itr = data.EpochBatchIterator(
        dataset=data.LanguagePairDataset(tokens, lengths, src_dict),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    for batch in itr:
        yield Batch(
            srcs=[lines[i] for i in batch['id']],
            tokens=batch['net_input']['src_tokens'],
            lengths=batch['net_input']['src_lengths'],
        ), batch['id']
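A rough sketch of how Example 1's make_batches might be driven in an interactive-translation loop. The args namespace, the sentences, src_dict, and max_positions below are placeholders invented for illustration, not part of the original snippet:

# Hypothetical driver for make_batches; every value here is illustrative.
from argparse import Namespace

args = Namespace(max_tokens=1024, max_sentences=8)   # batching limits expected by the snippet
lines = ["hello world", "how are you"]               # raw source sentences to translate
max_positions = 1024                                 # assumed maximum source length for the model

for batch, batch_ids in make_batches(lines, args, src_dict, max_positions):
    # batch.tokens / batch.lengths feed the model; batch_ids map outputs back to input order
    print(batch_ids, batch.tokens.shape)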
Example 2: make_batches
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import LanguagePairDataset [as alias]
def make_batches(lines, args, task, max_positions):
    tokens = [
        tokenizer.Tokenizer.tokenize(src_str, task.source_dictionary, add_if_not_exist=False).long()
        for src_str in lines
    ]
    lengths = np.array([t.numel() for t in tokens])
    itr = task.get_batch_iterator(
        dataset=data.LanguagePairDataset(tokens, lengths, task.source_dictionary),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    for batch in itr:
        yield Batch(
            srcs=[lines[i] for i in batch['id']],
            tokens=batch['net_input']['src_tokens'],
            lengths=batch['net_input']['src_lengths'],
        ), batch['id']
Example 3: get_trainer_and_epoch_itr
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import LanguagePairDataset [as alias]
def get_trainer_and_epoch_itr(epoch, epoch_size, num_updates, iterations_in_epoch):
    tokens = torch.LongTensor(list(range(epoch_size)))
    tokens_ds = data.TokenBlockDataset(tokens, [len(tokens)], 1, include_targets=False)
    trainer = mock_trainer(epoch, num_updates, iterations_in_epoch)
    epoch_itr = data.EpochBatchIterator(
        dataset=data.LanguagePairDataset(tokens_ds, tokens_ds.sizes, mock_dict(), shuffle=False),
        max_tokens=1,
    )
    return trainer, epoch_itr
Example 4: setUp
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import LanguagePairDataset [as alias]
def setUp(self):
    d = mock_dict()
    tokens_1 = torch.LongTensor([1]).view(1, -1)
    tokens_ds1 = TokenBlockDataset(
        tokens_1,
        sizes=[tokens_1.size(-1)],
        block_size=1,
        pad=0,
        eos=1,
        include_targets=False,
    )
    self.dataset_1 = LanguagePairDataset(
        tokens_ds1, tokens_ds1.sizes, d, shuffle=False
    )
    tokens_2 = torch.LongTensor([2]).view(1, -1)
    tokens_ds2 = TokenBlockDataset(
        tokens_2,
        sizes=[tokens_2.size(-1)],
        block_size=1,
        pad=0,
        eos=1,
        include_targets=False,
    )
    self.dataset_2 = LanguagePairDataset(
        tokens_ds2, tokens_ds2.sizes, d, shuffle=False
    )
Example 5: get_trainer_and_epoch_itr
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import LanguagePairDataset [as alias]
def get_trainer_and_epoch_itr(epoch, epoch_size, num_updates, iterations_in_epoch):
    tokens = torch.LongTensor(list(range(epoch_size))).view(1, -1)
    tokens_ds = data.TokenBlockDataset(
        tokens, sizes=[tokens.size(-1)], block_size=1, pad=0, eos=1, include_targets=False,
    )
    trainer = mock_trainer(epoch, num_updates, iterations_in_epoch)
    dataset = data.LanguagePairDataset(tokens_ds, tokens_ds.sizes, mock_dict(), shuffle=False)
    epoch_itr = data.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=[[i] for i in range(epoch_size)],
    )
    return trainer, epoch_itr
Example 6: build_dataset_for_inference
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import LanguagePairDataset [as alias]
def build_dataset_for_inference(self, src_tokens, src_lengths):
    src_lang_id = self.source_dictionary.index('[{}]'.format(self.args.source_lang))
    source_tokens = []
    for s_t in src_tokens:
        s_t = torch.cat([s_t, s_t.new(1).fill_(src_lang_id)])
        source_tokens.append(s_t)
    dataset = LanguagePairDataset(source_tokens, src_lengths, self.source_dictionary)
    return dataset
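A hedged sketch of how a dataset built this way could be consumed. Here task, src_tokens, and src_lengths are assumed to already exist in the caller's scope, and max_tokens is an arbitrary placeholder:

# Hypothetical consumption of the inference dataset from Example 6.
dataset = task.build_dataset_for_inference(src_tokens, src_lengths)
itr = task.get_batch_iterator(dataset=dataset, max_tokens=4096).next_epoch_itr(shuffle=False)
for batch in itr:
    net_input = batch['net_input']
    # src_tokens in net_input now end with the appended [source_lang] id
    print(net_input['src_tokens'].shape, net_input['src_lengths'])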
Example 7: load_dataset_from_text_multilingual
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import LanguagePairDataset [as alias]
def load_dataset_from_text_multilingual(
    self,
    split: str,
    source_text_file: str,
    target_text_file: str,
    source_lang_id: int,
    target_lang_id: int,
    append_eos: Optional[bool] = False,
    reverse_source: Optional[bool] = True,
):
    src_dataset = pytorch_translate_data.IndexedRawTextDatasetWithLangId(
        path=source_text_file,
        dictionary=self.source_dictionary,
        lang_id=source_lang_id,
        append_eos=append_eos,
        reverse_order=reverse_source,
        prepend_language_id=False,
    )
    tgt_dataset = pytorch_translate_data.IndexedRawTextDatasetWithLangId(
        path=target_text_file,
        dictionary=self.target_dictionary,
        lang_id=target_lang_id,
        append_eos=True,
        reverse_order=False,
        prepend_language_id=True,
    )
    self.datasets[split] = data.LanguagePairDataset(
        src=src_dataset,
        src_sizes=src_dataset.sizes,
        src_dict=self.source_dictionary,
        tgt=tgt_dataset,
        tgt_sizes=tgt_dataset.sizes,
        tgt_dict=self.target_dictionary,
    )
    print(f"| {split} {len(self.datasets[split])} examples")
Example 8: test_load_data_single_path
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import LanguagePairDataset [as alias]
def test_load_data_single_path(self):
    test_args = test_utils.ModelParamsDict()
    test_args.source_lang = "en"
    test_args.target_lang = "fr"
    test_args.log_verbose = False
    src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
    src_text_file, tgt_text_file = test_utils.create_test_text_files()
    src_bin_path = preprocess.binarize_text_file(
        text_file=src_text_file,
        dictionary=src_dict,
        output_path=tempfile.NamedTemporaryFile().name,
        append_eos=True,
        reverse_order=False,
    )
    tgt_bin_path = preprocess.binarize_text_file(
        text_file=tgt_text_file,
        dictionary=tgt_dict,
        output_path=tempfile.NamedTemporaryFile().name,
        append_eos=True,
        reverse_order=False,
    )
    task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
    split = "0"
    task.load_dataset(split, src_bin_path, tgt_bin_path)
    self.assertEqual(len(task.datasets[split]), 4)
    self.assertIsInstance(task.datasets[split], LanguagePairDataset)
Example 9: test_subsample_pair_dataset
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import LanguagePairDataset [as alias]
def test_subsample_pair_dataset(self):
    src_dataset = data.InMemoryIndexedDataset()
    trg_dataset = data.InMemoryIndexedDataset()
    for _ in range(5):
        src_dataset.parse(
            self.src_txt, self.d, reverse_order=True, append_eos=False
        )
        trg_dataset.parse(
            self.trg_txt, self.d, reverse_order=False, append_eos=True
        )
    pair_dataset = LanguagePairDataset(
        src=src_dataset,
        src_sizes=src_dataset.sizes,
        src_dict=self.d,
        tgt=trg_dataset,
        tgt_sizes=trg_dataset.sizes,
        tgt_dict=self.d,
        left_pad_source=False,
    )
    data.subsample_pair_dataset(pair_dataset, 2)
    self.assertEqual(len(pair_dataset.src), 2)
    self.assertEqual(pair_dataset.src_sizes.size, 2)
    self.assertEqual(len(pair_dataset.tgt), 2)
    self.assertEqual(pair_dataset.tgt_sizes.size, 2)
Example 10: build_dataset_for_inference
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import LanguagePairDataset [as alias]
def build_dataset_for_inference(self, src_tokens, src_lengths):
    src_lang_id = self.source_dictionary.index('[{}]'.format(self.args.source_lang))
    source_tokens = []
    for s_t in src_tokens:
        s_t = torch.cat([s_t, s_t.new(1).fill_(src_lang_id)])
        source_tokens.append(s_t)
    dataset = LanguagePairDataset(source_tokens, src_lengths, self.source_dictionary)
    return dataset
Example 11: test_load_data_single_path_idx_bin
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import LanguagePairDataset [as alias]
def test_load_data_single_path_idx_bin(self):
    test_args = test_utils.ModelParamsDict()
    test_args.source_lang = "en"
    test_args.target_lang = "fr"
    test_args.log_verbose = False
    src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
    src_text_file, tgt_text_file = test_utils.create_test_text_files()
    task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
    with tempfile.TemporaryDirectory() as destdir:
        preprocess_args = [
            "--source-lang",
            test_args.source_lang,
            "--target-lang",
            test_args.target_lang,
            "--destdir",
            destdir,
        ]
        preproc_parser = preprocess_options.get_preprocessing_parser()
        preproc_args = preproc_parser.parse_args(preprocess_args)
        preproc_args.dataset_impl = "mmap"
        split = "train"
        binarize(
            preproc_args,
            src_text_file,
            src_dict,
            split,
            test_args.source_lang,
            offset=0,
            end=-1,
        )
        binarize(
            preproc_args,
            tgt_text_file,
            tgt_dict,
            split,
            test_args.target_lang,
            offset=0,
            end=-1,
        )
        src_path = dataset_dest_prefix(preproc_args, split, test_args.source_lang)
        tgt_path = dataset_dest_prefix(preproc_args, split, test_args.target_lang)
        task.load_dataset(split, src_path, tgt_path, is_npz=False)
        self.assertEqual(len(task.datasets[split]), 4)
        self.assertIsInstance(task.datasets[split], LanguagePairDataset)