Python vocab.Vocab方法代碼示例

本文整理匯總了Python中torchtext.vocab.Vocab方法的典型用法代碼示例。如果您正苦於以下問題：Python vocab.Vocab方法的具體用法？Python vocab.Vocab怎麽用？Python vocab.Vocab使用的例子？那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類torchtext.vocab的用法示例。

在下文中一共展示了vocab.Vocab方法的15個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於係統推薦出更棒的Python代碼示例。

示例1: _dynamic_dict

# 需要導入模塊: from torchtext import vocab [as 別名]
# 或者: from torchtext.vocab import Vocab [as 別名]
def _dynamic_dict(self, example, unk, pad):
        # it would not be necessary to pass unk and pad if the method were
        # called after fields becomes an attribute of self
        src = example["src"]
        src_vocab = Vocab(Counter(src), specials=[unk, pad])
        self.src_vocabs.append(src_vocab)
        # Map source tokens to indices in the dynamic dict.
        src_map = torch.LongTensor([src_vocab.stoi[w] for w in src])
        example["src_map"] = src_map

        if "tgt" in example:
            tgt = example["tgt"]
            mask = torch.LongTensor(
                [0] + [src_vocab.stoi[w] for w in tgt] + [0])
            example["alignment"] = mask
        return example

開發者ID:lizekang，項目名稱:ITDD，代碼行數:18，代碼來源:dataset_base.py

示例2: test_imdb

# 需要導入模塊: from torchtext import vocab [as 別名]
# 或者: from torchtext.vocab import Vocab [as 別名]
def test_imdb(self):
        from torchtext.experimental.datasets import IMDB
        from torchtext.vocab import Vocab
        # smoke test to ensure imdb works properly
        train_dataset, test_dataset = IMDB()
        self.assertEqual(len(train_dataset), 25000)
        self.assertEqual(len(test_dataset), 25000)
        self.assertEqual(train_dataset[0][1][:10],
                         torch.tensor([13, 1568, 13, 246, 35468, 43, 64, 398, 1135, 92]).long())
        self.assertEqual(train_dataset[-1][1][:10],
                         torch.tensor([2, 71, 4555, 194, 3328, 15144, 42, 227, 148, 8]).long())
        self.assertEqual(test_dataset[0][1][:10],
                         torch.tensor([13, 125, 1051, 5, 246, 1652, 8, 277, 66, 20]).long())
        self.assertEqual(test_dataset[-1][1][:10],
                         torch.tensor([13, 1035, 14, 21, 28, 2, 1051, 1275, 1008, 3]).long())

        # Test API with a vocab input object
        old_vocab = train_dataset.get_vocab()
        new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500)
        new_train_data, new_test_data = IMDB(vocab=new_vocab)

開發者ID:pytorch，項目名稱:text，代碼行數:22，代碼來源:test_builtin_datasets.py

示例3: test_squad1

# 需要導入模塊: from torchtext import vocab [as 別名]
# 或者: from torchtext.vocab import Vocab [as 別名]
def test_squad1(self):
        from torchtext.experimental.datasets import SQuAD1
        from torchtext.vocab import Vocab
        # smoke test to ensure imdb works properly
        train_dataset, dev_dataset = SQuAD1()
        self.assertEqual(len(train_dataset), 87599)
        self.assertEqual(len(dev_dataset), 10570)
        self.assertEqual(train_dataset[100]['question'],
                         torch.tensor([7, 24, 86, 52, 2, 373, 887, 18, 12797, 11090, 1356, 2, 1788, 3273, 16]).long())
        self.assertEqual(train_dataset[100]['ans_pos'][0],
                         torch.tensor([72, 72]).long())
        self.assertEqual(dev_dataset[100]['question'],
                         torch.tensor([42, 27, 669, 7438, 17, 2, 1950, 3273, 17252, 389, 16]).long())
        self.assertEqual(dev_dataset[100]['ans_pos'][0],
                         torch.tensor([45, 48]).long())

        # Test API with a vocab input object
        old_vocab = train_dataset.get_vocab()
        new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500)
        new_train_data, new_test_data = SQuAD1(vocab=new_vocab)

開發者ID:pytorch，項目名稱:text，代碼行數:22，代碼來源:test_builtin_datasets.py

示例4: test_squad2

# 需要導入模塊: from torchtext import vocab [as 別名]
# 或者: from torchtext.vocab import Vocab [as 別名]
def test_squad2(self):
        from torchtext.experimental.datasets import SQuAD2
        from torchtext.vocab import Vocab
        # smoke test to ensure imdb works properly
        train_dataset, dev_dataset = SQuAD2()
        self.assertEqual(len(train_dataset), 130319)
        self.assertEqual(len(dev_dataset), 11873)
        self.assertEqual(train_dataset[200]['question'],
                         torch.tensor([84, 50, 1421, 12, 5439, 4569, 17, 30, 2, 15202, 4754, 1421, 16]).long())
        self.assertEqual(train_dataset[200]['ans_pos'][0],
                         torch.tensor([9, 9]).long())
        self.assertEqual(dev_dataset[200]['question'],
                         torch.tensor([41, 29, 2, 66, 17016, 30, 0, 1955, 16]).long())
        self.assertEqual(dev_dataset[200]['ans_pos'][0],
                         torch.tensor([40, 46]).long())

        # Test API with a vocab input object
        old_vocab = train_dataset.get_vocab()
        new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500)
        new_train_data, new_test_data = SQuAD2(vocab=new_vocab)

開發者ID:pytorch，項目名稱:text，代碼行數:22，代碼來源:test_builtin_datasets.py

示例5: test_vocab_specials_first

# 需要導入模塊: from torchtext import vocab [as 別名]
# 或者: from torchtext.vocab import Vocab [as 別名]
def test_vocab_specials_first(self):
        c = Counter("a a b b c c".split())

        # add specials into vocabulary at first
        v = vocab.Vocab(c, max_size=2, specials=['<pad>', '<eos>'])
        expected_itos = ['<pad>', '<eos>', 'a', 'b']
        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
        self.assertEqual(v.itos, expected_itos)
        self.assertEqual(dict(v.stoi), expected_stoi)

        # add specials into vocabulary at last
        v = vocab.Vocab(c, max_size=2, specials=['<pad>', '<eos>'], specials_first=False)
        expected_itos = ['a', 'b', '<pad>', '<eos>']
        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
        self.assertEqual(v.itos, expected_itos)
        self.assertEqual(dict(v.stoi), expected_stoi)

開發者ID:pytorch，項目名稱:text，代碼行數:18，代碼來源:test_vocab.py

示例6: test_vocab_download_custom_vectors

# 需要導入模塊: from torchtext import vocab [as 別名]
# 或者: from torchtext.vocab import Vocab [as 別名]
def test_vocab_download_custom_vectors(self):
        c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
        # Build a vocab and get vectors twice to test caching.
        for i in range(2):
            v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                            vectors=Vectors('wiki.simple.vec',
                                            url=FastText.url_base.format('simple')))

            self.assertEqual(v.itos, ['<unk>', '<pad>', '<bos>',
                                      'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'])
            vectors = v.vectors.numpy()

            # The first 5 entries in each vector.
            expected_fasttext_simple_en = {
                'hello': [0.39567, 0.21454, -0.035389, -0.24299, -0.095645],
                'world': [0.10444, -0.10858, 0.27212, 0.13299, -0.33165],
            }

            for word in expected_fasttext_simple_en:
                assert_allclose(vectors[v.stoi[word], :5],
                                expected_fasttext_simple_en[word])

            assert_allclose(vectors[v.stoi['<unk>']], np.zeros(300))
        # Delete the vectors after we're done to save disk space on CI
        if os.environ.get("TRAVIS") == "true":
            vec_file = os.path.join(self.project_root, ".vector_cache", "wiki.simple.vec")
            conditional_remove(vec_file)

開發者ID:salesforce，項目名稱:decaNLP，代碼行數:29，代碼來源:test_vocab.py

示例7: build_data

# 需要導入模塊: from torchtext import vocab [as 別名]
# 或者: from torchtext.vocab import Vocab [as 別名]
def build_data(all_tokens, all_seqs):
    vocab = Vocab.Vocab(collections.Counter(all_tokens),
                        specials=[PAD, BOS, EOS])
    indices = [[vocab.stoi[w] for w in seq] for seq in all_seqs]
    return vocab, torch.tensor(indices)

開發者ID:wdxtub，項目名稱:deep-learning-note，代碼行數:7，代碼來源:53_machine_translation.py

示例8: test_vocab_basic

# 需要導入模塊: from torchtext import vocab [as 別名]
# 或者: from torchtext.vocab import Vocab [as 別名]
def test_vocab_basic(self):
        c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
        v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'])

        expected_itos = ['<unk>', '<pad>', '<bos>',
                         'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world']
        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
        self.assertEqual(v.itos, expected_itos)
        self.assertEqual(dict(v.stoi), expected_stoi)

開發者ID:salesforce，項目名稱:decaNLP，代碼行數:11，代碼來源:test_vocab.py

示例9: test_vocab_set_vectors

# 需要導入模塊: from torchtext import vocab [as 別名]
# 或者: from torchtext.vocab import Vocab [as 別名]
def test_vocab_set_vectors(self):
        c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5,
                     'ｔｅｓｔ': 4, 'freq_too_low': 2})
        v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'])
        stoi = {"hello": 0, "world": 1, "ｔｅｓｔ": 2}
        vectors = torch.FloatTensor([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
        dim = 2
        v.set_vectors(stoi, vectors, dim)
        expected_vectors = np.array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0],
                                     [0.0, 0.0], [0.1, 0.2], [0.5, 0.6],
                                     [0.3, 0.4]])
        assert_allclose(v.vectors.numpy(), expected_vectors)

開發者ID:salesforce，項目名稱:decaNLP，代碼行數:14，代碼來源:test_vocab.py

示例10: test_vocab_download_fasttext_vectors

# 需要導入模塊: from torchtext import vocab [as 別名]
# 或者: from torchtext.vocab import Vocab [as 別名]
def test_vocab_download_fasttext_vectors(self):
        c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
        # Build a vocab and get vectors twice to test caching, then once more
        # to test string aliases.
        for i in range(3):
            if i == 2:
                vectors = str("fasttext.simple.300d")  # must handle str on Py2
            else:
                vectors = FastText(language='simple')

            v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                            vectors=vectors)

            expected_itos = ['<unk>', '<pad>', '<bos>',
                             'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world']
            expected_stoi = {x: index for index, x in enumerate(expected_itos)}
            self.assertEqual(v.itos, expected_itos)
            self.assertEqual(dict(v.stoi), expected_stoi)
            vectors = v.vectors.numpy()

            # The first 5 entries in each vector.
            expected_fasttext_simple_en = {
                'hello': [0.39567, 0.21454, -0.035389, -0.24299, -0.095645],
                'world': [0.10444, -0.10858, 0.27212, 0.13299, -0.33165],
            }

            for word in expected_fasttext_simple_en:
                assert_allclose(vectors[v.stoi[word], :5],
                                expected_fasttext_simple_en[word])

            assert_allclose(vectors[v.stoi['<unk>']], np.zeros(300))
            assert_allclose(vectors[v.stoi['OOV token']], np.zeros(300))
        # Delete the vectors after we're done to save disk space on CI
        if os.environ.get("TRAVIS") == "true":
            vec_file = os.path.join(self.project_root, ".vector_cache", "wiki.simple.vec")
            conditional_remove(vec_file)

開發者ID:salesforce，項目名稱:decaNLP，代碼行數:38，代碼來源:test_vocab.py

示例11: test_vocab_extend

# 需要導入模塊: from torchtext import vocab [as 別名]
# 或者: from torchtext.vocab import Vocab [as 別名]
def test_vocab_extend(self):
        c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
        # Build a vocab and get vectors twice to test caching.
        for i in range(2):
            f = FastText(language='simple')
            v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                            vectors=f)
            n_vocab = len(v)
            v.extend(f)  # extend the vocab with the words contained in f.itos
            self.assertGreater(len(v), n_vocab)

            self.assertEqual(v.itos[:6], ['<unk>', '<pad>', '<bos>',
                             'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'])
            vectors = v.vectors.numpy()

            # The first 5 entries in each vector.
            expected_fasttext_simple_en = {
                'hello': [0.39567, 0.21454, -0.035389, -0.24299, -0.095645],
                'world': [0.10444, -0.10858, 0.27212, 0.13299, -0.33165],
            }

            for word in expected_fasttext_simple_en:
                assert_allclose(vectors[v.stoi[word], :5],
                                expected_fasttext_simple_en[word])

            assert_allclose(vectors[v.stoi['<unk>']], np.zeros(300))
        # Delete the vectors after we're done to save disk space on CI
        if os.environ.get("TRAVIS") == "true":
            vec_file = os.path.join(self.project_root, ".vector_cache", "wiki.simple.vec")
            conditional_remove(vec_file)

開發者ID:salesforce，項目名稱:decaNLP，代碼行數:32，代碼來源:test_vocab.py

示例12: test_vocab_download_charngram_vectors

# 需要導入模塊: from torchtext import vocab [as 別名]
# 或者: from torchtext.vocab import Vocab [as 別名]
def test_vocab_download_charngram_vectors(self):
        c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
        # Build a vocab and get vectors twice to test caching, then once more
        # to test string aliases.
        for i in range(3):
            if i == 2:
                vectors = "charngram.100d"
            else:
                vectors = CharNGram()
            v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                            vectors=vectors)
            expected_itos = ['<unk>', '<pad>', '<bos>',
                             'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world']
            expected_stoi = {x: index for index, x in enumerate(expected_itos)}
            self.assertEqual(v.itos, expected_itos)
            self.assertEqual(dict(v.stoi), expected_stoi)
            vectors = v.vectors.numpy()

            # The first 5 entries in each vector.
            expected_charngram = {
                'hello': [-0.44782442, -0.08937783, -0.34227219,
                          -0.16233221, -0.39343098],
                'world': [-0.29590717, -0.05275926, -0.37334684, 0.27117205, -0.3868292],
            }

            for word in expected_charngram:
                assert_allclose(vectors[v.stoi[word], :5],
                                expected_charngram[word])

            assert_allclose(vectors[v.stoi['<unk>']], np.zeros(100))
            assert_allclose(vectors[v.stoi['OOV token']], np.zeros(100))
        # Delete the vectors after we're done to save disk space on CI
        if os.environ.get("TRAVIS") == "true":
            conditional_remove(
                os.path.join(self.project_root, ".vector_cache", "charNgram.txt"))
            conditional_remove(
                os.path.join(self.project_root, ".vector_cache",
                             "jmt_pre-trained_embeddings.tar.gz"))

開發者ID:salesforce，項目名稱:decaNLP，代碼行數:40，代碼來源:test_vocab.py

示例13: test_errors

# 需要導入模塊: from torchtext import vocab [as 別名]
# 或者: from torchtext.vocab import Vocab [as 別名]
def test_errors(self):
        c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
        with self.assertRaises(ValueError):
            # Test proper error raised when using unknown string alias
            vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                        vectors=["fasttext.english.300d"])
            vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                        vectors="fasttext.english.300d")
        with self.assertRaises(ValueError):
            # Test proper error is raised when vectors argument is
            # non-string or non-Vectors
            vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                        vectors={"word": [1, 2, 3]})

開發者ID:salesforce，項目名稱:decaNLP，代碼行數:15，代碼來源:test_vocab.py

示例14: test_serialization

# 需要導入模塊: from torchtext import vocab [as 別名]
# 或者: from torchtext.vocab import Vocab [as 別名]
def test_serialization(self):
        c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
        v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'])
        pickle_path = os.path.join(self.test_dir, "vocab.pkl")
        pickle.dump(v, open(pickle_path, "wb"))
        v_loaded = pickle.load(open(pickle_path, "rb"))
        assert v == v_loaded

開發者ID:salesforce，項目名稱:decaNLP，代碼行數:9，代碼來源:test_vocab.py

示例15: fit

# 需要導入模塊: from torchtext import vocab [as 別名]
# 或者: from torchtext.vocab import Vocab [as 別名]
def fit(self, sents, **kwargs):
        """Builds a vocabulary object based on the tokens in the input.

        Args:
            sents: A list of lists of tokens (representing sentences)

        Vocab kwargs include:
            max_size
            min_freq
            specials
            unk_init
        """
        tokens = list(itertools.chain.from_iterable(sents))
        counter = Counter(tokens)
        self.vocab = self.build_vocab(counter, **kwargs)

開發者ID:HazyResearch，項目名稱:metal，代碼行數:17，代碼來源:embedding_featurizer.py

注：本文中的torchtext.vocab.Vocab方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台，相關代碼片段篩選自各路編程大神貢獻的開源項目，源碼版權歸原作者所有，傳播和使用請參考對應項目的License；未經允許，請勿轉載。