

Python data.LanguagePairDataset Usage Examples

This article collects typical usage examples of fairseq.data.LanguagePairDataset in Python. If you have been wondering what data.LanguagePairDataset does, how to call it, or what idiomatic usage looks like, the curated examples below should help. You can also explore further usage examples from its containing module, fairseq.data.


The following presents 11 code examples of data.LanguagePairDataset, sorted by popularity by default.
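Before the examples, here is a minimal, self-contained sketch of the pattern most of them share: wrap a list of tokenized source sentences in a LanguagePairDataset and collate them into padded batches. The constructor signature and batch layout vary across fairseq versions, so treat this as illustrative rather than canonical; it follows the positional (src, src_sizes, src_dict) form used in the examples below.

import numpy as np
import torch
from fairseq.data import Dictionary, LanguagePairDataset

# Build a toy source dictionary; a real one would be loaded from disk.
src_dict = Dictionary()
for word in ["hello", "world"]:
    src_dict.add_symbol(word)

# Each sentence becomes a LongTensor of token indices ending in EOS.
tokens = [
    torch.LongTensor([src_dict.index("hello"), src_dict.index("world"), src_dict.eos()]),
    torch.LongTensor([src_dict.index("world"), src_dict.eos()]),
]
lengths = np.array([t.numel() for t in tokens])

# Source-only dataset: no target side is given, as in the inference examples.
dataset = LanguagePairDataset(tokens, lengths, src_dict)

# collater() pads the samples into a single batch whose 'net_input' tensors
# are what the make_batches examples below iterate over.
batch = dataset.collater([dataset[i] for i in range(len(dataset))])
print(batch["net_input"]["src_tokens"].shape)  # e.g. torch.Size([2, 3])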

Example 1: make_batches

# Required import: from fairseq import data [as alias]
# Alternatively: from fairseq.data import LanguagePairDataset [as alias]
def make_batches(lines, args, src_dict, max_positions):
    # Tokenize each input line into a LongTensor of indices from src_dict.
    tokens = [
        tokenizer.Tokenizer.tokenize(src_str, src_dict, add_if_not_exist=False).long()
        for src_str in lines
    ]
    lengths = np.array([t.numel() for t in tokens])
    # Wrap the source-only sentences in a LanguagePairDataset and batch them
    # under the token/sentence budgets from args.
    itr = data.EpochBatchIterator(
        dataset=data.LanguagePairDataset(tokens, lengths, src_dict),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    for batch in itr:
        yield Batch(
            srcs=[lines[i] for i in batch['id']],
            tokens=batch['net_input']['src_tokens'],
            lengths=batch['net_input']['src_lengths'],
        ), batch['id'] 
Developer: nusnlp, Project: crosentgec, Lines: 20, Source: interactive.py

Example 2: make_batches

# Required import: from fairseq import data [as alias]
# Alternatively: from fairseq.data import LanguagePairDataset [as alias]
def make_batches(lines, args, task, max_positions):
    tokens = [
        tokenizer.Tokenizer.tokenize(src_str, task.source_dictionary, add_if_not_exist=False).long()
        for src_str in lines
    ]
    lengths = np.array([t.numel() for t in tokens])
    itr = task.get_batch_iterator(
        dataset=data.LanguagePairDataset(tokens, lengths, task.source_dictionary),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    for batch in itr:
        yield Batch(
            srcs=[lines[i] for i in batch['id']],
            tokens=batch['net_input']['src_tokens'],
            lengths=batch['net_input']['src_lengths'],
        ), batch['id'] 
Developer: rgcottrell, Project: pytorch-human-performance-gec, Lines: 20, Source: interactive.py

Example 3: get_trainer_and_epoch_itr

# Required import: from fairseq import data [as alias]
# Alternatively: from fairseq.data import LanguagePairDataset [as alias]
def get_trainer_and_epoch_itr(epoch, epoch_size, num_updates, iterations_in_epoch):
    tokens = torch.LongTensor(list(range(epoch_size)))
    tokens_ds = data.TokenBlockDataset(tokens, [len(tokens)], 1, include_targets=False)
    trainer = mock_trainer(epoch, num_updates, iterations_in_epoch)
    epoch_itr = data.EpochBatchIterator(
        dataset=data.LanguagePairDataset(tokens_ds, tokens_ds.sizes, mock_dict(), shuffle=False),
        max_tokens=1,
    )
    return trainer, epoch_itr 
Developer: nusnlp, Project: crosentgec, Lines: 11, Source: test_train.py

Example 4: setUp

# Required import: from fairseq import data [as alias]
# Alternatively: from fairseq.data import LanguagePairDataset [as alias]
def setUp(self):
        d = mock_dict()
        tokens_1 = torch.LongTensor([1]).view(1, -1)
        tokens_ds1 = TokenBlockDataset(
            tokens_1,
            sizes=[tokens_1.size(-1)],
            block_size=1,
            pad=0,
            eos=1,
            include_targets=False,
        )
        self.dataset_1 = LanguagePairDataset(
            tokens_ds1, tokens_ds1.sizes, d, shuffle=False
        )
        tokens_2 = torch.LongTensor([2]).view(1, -1)
        tokens_ds2 = TokenBlockDataset(
            tokens_2,
            sizes=[tokens_2.size(-1)],
            block_size=1,
            pad=0,
            eos=1,
            include_targets=False,
        )
        self.dataset_2 = LanguagePairDataset(
            tokens_ds2, tokens_ds2.sizes, d, shuffle=False
        ) 
Developer: pytorch, Project: fairseq, Lines: 28, Source: test_multi_corpus_sampled_dataset.py

Example 5: get_trainer_and_epoch_itr

# Required import: from fairseq import data [as alias]
# Alternatively: from fairseq.data import LanguagePairDataset [as alias]
def get_trainer_and_epoch_itr(epoch, epoch_size, num_updates, iterations_in_epoch):
    tokens = torch.LongTensor(list(range(epoch_size))).view(1, -1)
    tokens_ds = data.TokenBlockDataset(
        tokens, sizes=[tokens.size(-1)], block_size=1, pad=0, eos=1, include_targets=False,
    )
    trainer = mock_trainer(epoch, num_updates, iterations_in_epoch)
    dataset = data.LanguagePairDataset(tokens_ds, tokens_ds.sizes, mock_dict(), shuffle=False)
    epoch_itr = data.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=[[i] for i in range(epoch_size)],
    )
    return trainer, epoch_itr 
Developer: pytorch, Project: fairseq, Lines: 15, Source: test_train.py

Example 6: build_dataset_for_inference

# Required import: from fairseq import data [as alias]
# Alternatively: from fairseq.data import LanguagePairDataset [as alias]
def build_dataset_for_inference(self, src_tokens, src_lengths):
        src_lang_id = self.source_dictionary.index('[{}]'.format(self.args.source_lang))
        source_tokens = []
        for s_t in src_tokens:
            # Append the source-language id token (e.g. '[en_XX]') to each sentence.
            s_t = torch.cat([s_t, s_t.new(1).fill_(src_lang_id)])
            source_tokens.append(s_t)
        dataset = LanguagePairDataset(source_tokens, src_lengths, self.source_dictionary)
        return dataset 
Developer: pytorch, Project: fairseq, Lines: 10, Source: translation_from_pretrained_bart.py

Example 7: load_dataset_from_text_multilingual

# Required import: from fairseq import data [as alias]
# Alternatively: from fairseq.data import LanguagePairDataset [as alias]
def load_dataset_from_text_multilingual(
        self,
        split: str,
        source_text_file: str,
        target_text_file: str,
        source_lang_id: int,
        target_lang_id: int,
        append_eos: Optional[bool] = False,
        reverse_source: Optional[bool] = True,
    ):
        src_dataset = pytorch_translate_data.IndexedRawTextDatasetWithLangId(
            path=source_text_file,
            dictionary=self.source_dictionary,
            lang_id=source_lang_id,
            append_eos=append_eos,
            reverse_order=reverse_source,
            prepend_language_id=False,
        )
        tgt_dataset = pytorch_translate_data.IndexedRawTextDatasetWithLangId(
            path=target_text_file,
            dictionary=self.target_dictionary,
            lang_id=target_lang_id,
            append_eos=True,
            reverse_order=False,
            prepend_language_id=True,
        )
        self.datasets[split] = data.LanguagePairDataset(
            src=src_dataset,
            src_sizes=src_dataset.sizes,
            src_dict=self.source_dictionary,
            tgt=tgt_dataset,
            tgt_sizes=tgt_dataset.sizes,
            tgt_dict=self.target_dictionary,
        )
        print(f"| {split} {len(self.datasets[split])} examples") 
Developer: pytorch, Project: translate, Lines: 37, Source: pytorch_translate_task.py

Example 8: test_load_data_single_path

# Required import: from fairseq import data [as alias]
# Alternatively: from fairseq.data import LanguagePairDataset [as alias]
def test_load_data_single_path(self):
        test_args = test_utils.ModelParamsDict()
        test_args.source_lang = "en"
        test_args.target_lang = "fr"
        test_args.log_verbose = False
        src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
        src_text_file, tgt_text_file = test_utils.create_test_text_files()
        src_bin_path = preprocess.binarize_text_file(
            text_file=src_text_file,
            dictionary=src_dict,
            output_path=tempfile.NamedTemporaryFile().name,
            append_eos=True,
            reverse_order=False,
        )
        tgt_bin_path = preprocess.binarize_text_file(
            text_file=tgt_text_file,
            dictionary=tgt_dict,
            output_path=tempfile.NamedTemporaryFile().name,
            append_eos=True,
            reverse_order=False,
        )
        task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
        split = "0"
        task.load_dataset(split, src_bin_path, tgt_bin_path)
        self.assertEqual(len(task.datasets[split]), 4)
        self.assertIsInstance(task.datasets[split], LanguagePairDataset) 
Developer: pytorch, Project: translate, Lines: 28, Source: test_data.py

Example 9: test_subsample_pair_dataset

# Required import: from fairseq import data [as alias]
# Alternatively: from fairseq.data import LanguagePairDataset [as alias]
def test_subsample_pair_dataset(self):
        src_dataset = data.InMemoryIndexedDataset()
        trg_dataset = data.InMemoryIndexedDataset()
        for _ in range(5):
            src_dataset.parse(
                self.src_txt, self.d, reverse_order=True, append_eos=False
            )
            trg_dataset.parse(
                self.trg_txt, self.d, reverse_order=False, append_eos=True
            )

        pair_dataset = LanguagePairDataset(
            src=src_dataset,
            src_sizes=src_dataset.sizes,
            src_dict=self.d,
            tgt=trg_dataset,
            tgt_sizes=trg_dataset.sizes,
            tgt_dict=self.d,
            left_pad_source=False,
        )

        data.subsample_pair_dataset(pair_dataset, 2)
        self.assertEqual(len(pair_dataset.src), 2)
        self.assertEqual(pair_dataset.src_sizes.size, 2)
        self.assertEqual(len(pair_dataset.tgt), 2)
        self.assertEqual(pair_dataset.tgt_sizes.size, 2) 
Developer: pytorch, Project: translate, Lines: 28, Source: test_data.py

Example 10: build_dataset_for_inference

# Required import: from fairseq import data [as alias]
# Alternatively: from fairseq.data import LanguagePairDataset [as alias]
def build_dataset_for_inference(self, src_tokens, src_lengths):
        src_lang_id = self.source_dictionary.index('[{}]'.format(self.args.source_lang))
        source_tokens = []
        for s_t in src_tokens:
            s_t = torch.cat([s_t, s_t.new(1).fill_(src_lang_id)])
            source_tokens.append(s_t)
        # Build the dataset from the tokens with the appended language id,
        # not the raw src_tokens, so the loop above takes effect.
        dataset = LanguagePairDataset(source_tokens, src_lengths, self.source_dictionary)
        return dataset 
Developer: elbayadm, Project: attn2d, Lines: 10, Source: translation_from_pretrained_bart.py

Example 11: test_load_data_single_path_idx_bin

# Required import: from fairseq import data [as alias]
# Alternatively: from fairseq.data import LanguagePairDataset [as alias]
def test_load_data_single_path_idx_bin(self):
        test_args = test_utils.ModelParamsDict()
        test_args.source_lang = "en"
        test_args.target_lang = "fr"
        test_args.log_verbose = False
        src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
        src_text_file, tgt_text_file = test_utils.create_test_text_files()
        task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
        with tempfile.TemporaryDirectory() as destdir:
            preprocess_args = [
                "--source-lang",
                test_args.source_lang,
                "--target-lang",
                test_args.target_lang,
                "--destdir",
                destdir,
            ]
            preproc_parser = preprocess_options.get_preprocessing_parser()
            preproc_args = preproc_parser.parse_args(preprocess_args)
            preproc_args.dataset_impl = "mmap"
            split = "train"
            binarize(
                preproc_args,
                src_text_file,
                src_dict,
                split,
                test_args.source_lang,
                offset=0,
                end=-1,
            )
            binarize(
                preproc_args,
                tgt_text_file,
                tgt_dict,
                split,
                test_args.target_lang,
                offset=0,
                end=-1,
            )
            src_path = dataset_dest_prefix(preproc_args, split, test_args.source_lang)
            tgt_path = dataset_dest_prefix(preproc_args, split, test_args.target_lang)
            task.load_dataset(split, src_path, tgt_path, is_npz=False)
            self.assertEqual(len(task.datasets[split]), 4)
            self.assertIsInstance(task.datasets[split], LanguagePairDataset) 
Developer: pytorch, Project: translate, Lines: 46, Source: test_data.py


Note: The fairseq.data.LanguagePairDataset examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers, and copyright in the source code remains with the original authors. For distribution and use, please refer to the corresponding projects' licenses. Do not reproduce without permission.