

Python Vocabulary.add_token_to_namespace method code examples

This article collects typical usage examples of the Python method allennlp.data.Vocabulary.add_token_to_namespace. If you are unsure what this method does, how to call it, or what it looks like in real code, the curated examples below should help. For broader context, see the other usage examples for allennlp.data.Vocabulary.


Below are 15 code examples of the Vocabulary.add_token_to_namespace method, ordered by popularity.
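
Before the examples, here is a minimal sketch (not taken from any of the projects below) of the method's basic contract: add_token_to_namespace adds a token to the given namespace and returns that token's integer index; re-adding an existing token returns the same index, and get_token_index / get_token_from_index recover the mapping.

# Minimal sketch of the basic behavior (assumes the AllenNLP 0.x-era API
# used throughout the examples below).
from allennlp.data import Vocabulary

vocab = Vocabulary()
index = vocab.add_token_to_namespace("hello", namespace="tokens")
# Adding the same token a second time is a no-op that returns the existing index.
assert vocab.add_token_to_namespace("hello", namespace="tokens") == index
assert vocab.get_token_index("hello", namespace="tokens") == index
assert vocab.get_token_from_index(index, namespace="tokens") == "hello"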

Example 1: test_read_embedding_file_inside_archive

# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
    def test_read_embedding_file_inside_archive(self):
        token2vec = {
                "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
                "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
                "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
                "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
                }
        vocab = Vocabulary()
        for token in token2vec:
            vocab.add_token_to_namespace(token)

        params = Params({
                'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
                'embedding_dim': 5
                })
        with pytest.raises(ValueError, message="No ValueError when pretrained_file is a multi-file archive"):
            Embedding.from_params(vocab, params)

        for ext in ['.zip', '.tar.gz']:
            archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
            file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
            params = Params({
                    'pretrained_file': file_uri,
                    'embedding_dim': 5
                    })
            embeddings = Embedding.from_params(vocab, params).weight.data
            for tok, vec in token2vec.items():
                i = vocab.get_token_index(tok)
                assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
Author: apmoore1, Project: allennlp, Lines: 31, Source: embedding_test.py
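
A note on the helper used in this test: assuming the AllenNLP 0.x implementation of format_embeddings_file_uri, the returned URI wraps the archive path in parentheses and appends the member path after '#'. A sketch:

# Sketch of the URI format (an assumption about the 0.x helper; the exact
# string below is illustrative, not part of the original test).
from allennlp.modules.token_embedders.embedding import format_embeddings_file_uri

uri = format_embeddings_file_uri('/tmp/archive.zip', 'folder/fake_embeddings.5d.txt')
assert uri == '(/tmp/archive.zip)#folder/fake_embeddings.5d.txt'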

Example 2: test_dry_run_without_extension

# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
    def test_dry_run_without_extension(self):
        existing_serialization_dir = self.TEST_DIR / 'existing'
        extended_serialization_dir = self.TEST_DIR / 'extended'
        existing_vocab_path = existing_serialization_dir / 'vocabulary'
        extended_vocab_path = extended_serialization_dir / 'vocabulary'

        vocab = Vocabulary()
        # If extend is False, it is the user's responsibility to make sure that dataset
        # instances will be indexable by the provided vocabulary. At a minimum, @@UNKNOWN@@
        # should be present in any namespace that may see OOV entries during indexing.
        # New words will be seen in the `tokens` namespace, but `tokens` has the @@UNKNOWN@@
        # token; the 'labels' namespace has no @@UNKNOWN@@, so 'N' and 'V' must be added upfront.
        vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
        vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
        vocab.add_token_to_namespace('N', namespace='labels')
        vocab.add_token_to_namespace('V', namespace='labels')
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params['vocabulary'] = {}
        self.params['vocabulary']['directory_path'] = existing_vocab_path
        self.params['vocabulary']['extend'] = False
        dry_run_from_params(self.params, extended_serialization_dir)

        with open(extended_vocab_path / 'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == '@@UNKNOWN@@'
        assert tokens[1] == 'some_weird_token_1'
        assert tokens[2] == 'some_weird_token_2'
        assert len(tokens) == 3
Author: pyknife, Project: allennlp, Lines: 33, Source: dry_run_test.py
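
The comments in Example 2 hinge on which namespaces carry an @@UNKNOWN@@ entry. A short sketch of that behavior (an illustration under AllenNLP 0.x defaults, not part of the original test): the padded 'tokens' namespace falls back to @@UNKNOWN@@ for unseen words, while a non-padded namespace such as 'labels' raises on an unseen entry.

# Sketch of the OOV fallback described above (assumes 0.x defaults: 'tokens'
# is padded, with 0 = @@PADDING@@ and 1 = @@UNKNOWN@@, while namespaces
# matching '*labels' are non-padded).
from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_token_to_namespace('N', namespace='labels')
assert vocab.get_token_index('never_seen', namespace='tokens') == 1  # @@UNKNOWN@@
try:
    vocab.get_token_index('unseen_label', namespace='labels')
except KeyError:
    pass  # expected: no @@UNKNOWN@@ fallback in a non-padded namespace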

Example 3: get_vocab

# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
def get_vocab(word2freq, max_v_sizes):
    '''Build vocabulary'''
    vocab = Vocabulary(counter=None, max_vocab_size=max_v_sizes['word'])
    words_by_freq = [(word, freq) for word, freq in word2freq.items()]
    words_by_freq.sort(key=lambda x: x[1], reverse=True)
    for word, _ in words_by_freq[:max_v_sizes['word']]:
        vocab.add_token_to_namespace(word, 'tokens')
    log.info("\tFinished building vocab. Using %d words", vocab.get_vocab_size('tokens'))
    return vocab
Author: cyzhangAThit, Project: GLUE-baselines, Lines: 11, Source: preprocess.py
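
A hypothetical call to get_vocab (the counts and cap are invented for illustration, and the function assumes a module-level `log`):

# Hypothetical usage of get_vocab above: with a cap of 2, only the two most
# frequent words survive, plus the default padding and OOV entries.
import logging
log = logging.getLogger(__name__)  # get_vocab expects a module-level `log`

word2freq = {'the': 100, 'sat': 5, 'cat': 3}
vocab = get_vocab(word2freq, {'word': 2})
assert vocab.get_vocab_size('tokens') == 4  # 'the', 'sat', @@PADDING@@, @@UNKNOWN@@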

Example 4: test_token_to_indices_uses_ner_tags

# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
    def test_token_to_indices_uses_ner_tags(self):
        tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
        vocab.add_token_to_namespace('ORG', namespace='ner_tags')
        indexer = NerTagIndexer()
        assert indexer.token_to_indices(tokens[1], vocab) == person_index
        assert indexer.token_to_indices(tokens[-1], vocab) == none_index
Author: Jordan-Sauchuk, Project: allennlp, Lines: 12, Source: ner_tag_indexer_test.py

Example 5: test_get_embedding_layer_uses_correct_embedding_dim

# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
    def test_get_embedding_layer_uses_correct_embedding_dim(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace('word1')
        vocab.add_token_to_namespace('word2')
        embeddings_filename = self.TEST_DIR + "embeddings.gz"
        with gzip.open(embeddings_filename, 'wb') as embeddings_file:
            embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
            embeddings_file.write("word2 0.1 0.4 -4.0\n".encode('utf-8'))
        embedding_weights = _read_pretrained_embedding_file(embeddings_filename, 3, vocab)
        assert tuple(embedding_weights.size()) == (4, 3)  # 4 because of padding and OOV
        with pytest.raises(ConfigurationError):
            _read_pretrained_embedding_file(embeddings_filename, 4, vocab)
Author: Jordan-Sauchuk, Project: allennlp, Lines: 14, Source: embedding_test.py

Example 6: test_index_converts_field_correctly

# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        b_index = vocab.add_token_to_namespace("B", namespace='*labels')
        i_index = vocab.add_token_to_namespace("I", namespace='*labels')
        o_index = vocab.add_token_to_namespace("O", namespace='*labels')

        tags = ["B", "I", "O", "O", "O"]
        sequence_label_field = SequenceLabelField(tags, self.text, label_namespace="*labels")
        sequence_label_field.index(vocab)

        # pylint: disable=protected-access
        assert sequence_label_field._indexed_labels == [b_index, i_index, o_index, o_index, o_index]
Author: pyknife, Project: allennlp, Lines: 14, Source: sequence_label_field_test.py

Example 7: test_as_tensor_produces_integer_targets

# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
    def test_as_tensor_produces_integer_targets(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("B", namespace='*labels')
        vocab.add_token_to_namespace("I", namespace='*labels')
        vocab.add_token_to_namespace("O", namespace='*labels')

        tags = ["B", "I", "O", "O", "O"]
        sequence_label_field = SequenceLabelField(tags, self.text, label_namespace="*labels")
        sequence_label_field.index(vocab)
        padding_lengths = sequence_label_field.get_padding_lengths()
        tensor = sequence_label_field.as_tensor(padding_lengths).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 1, 2, 2, 2]))
Author: pyknife, Project: allennlp, Lines: 14, Source: sequence_label_field_test.py

Example 8: test_token_to_indices_uses_pos_tags

# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
    def test_token_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
        cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
        indexer = PosTagIndexer(coarse_tags=True)
        assert indexer.token_to_indices(tokens[1], vocab) == verb_index
        assert indexer.token_to_indices(tokens[-1], vocab) == none_index
        indexer._coarse_tags = False  # pylint: disable=protected-access
        assert indexer.token_to_indices(tokens[1], vocab) == cop_index
Author: Jordan-Sauchuk, Project: allennlp, Lines: 14, Source: pos_tag_indexer_test.py

Example 9: test_get_embedding_layer_initializes_unseen_words_randomly_not_zero

# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
    def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        vocab.add_token_to_namespace("word2")
        embeddings_filename = self.TEST_DIR + "embeddings.gz"
        with gzip.open(embeddings_filename, 'wb') as embeddings_file:
            embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
        params = Params({
                'pretrained_file': embeddings_filename,
                'embedding_dim': 3,
                })
        embedding_layer = Embedding.from_params(vocab, params)
        word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
        assert not numpy.allclose(word_vector.numpy(), numpy.array([0.0, 0.0, 0.0]))
Author: Jordan-Sauchuk, Project: allennlp, Lines: 16, Source: embedding_test.py

Example 10: TestDataset

# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
class TestDataset(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this")
        self.vocab.add_token_to_namespace("is")
        self.vocab.add_token_to_namespace("a")
        self.vocab.add_token_to_namespace("sentence")
        self.vocab.add_token_to_namespace(".")
        self.token_indexer = {"tokens": SingleIdTokenIndexer()}
        self.instances = self.get_instances()
        super(TestDataset, self).setUp()

    def test_instances_must_have_homogeneous_fields(self):
        instance1 = Instance({"tag": (LabelField(1, skip_indexing=True))})
        instance2 = Instance({"words": TextField([Token("hello")], {})})
        with pytest.raises(ConfigurationError):
            _ = Batch([instance1, instance2])

    def test_padding_lengths_uses_max_instance_lengths(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        assert padding_lengths == {"text1": {"num_tokens": 5, "tokens_length": 5},
                                   "text2": {"num_tokens": 6, "tokens_length": 6}}

    def test_as_tensor_dict(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        tensors = dataset.as_tensor_dict(padding_lengths)
        text1 = tensors["text1"]["tokens"].detach().cpu().numpy()
        text2 = tensors["text2"]["tokens"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(text1, numpy.array([[2, 3, 4, 5, 6],
                                                                    [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(text2, numpy.array([[2, 3, 4, 1, 5, 6],
                                                                    [2, 3, 1, 0, 0, 0]]))

    def get_instances(self):
        field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]],
                           self.token_indexer)
        field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
                           self.token_indexer)
        field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]],
                           self.token_indexer)
        field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                           self.token_indexer)
        instances = [Instance({"text1": field1, "text2": field2}),
                     Instance({"text1": field3, "text2": field4})]
        return instances
Author: apmoore1, Project: allennlp, Lines: 52, Source: dataset_test.py
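
A sketch of where Example 10's expected index arrays come from (an illustration under 0.x defaults, not part of the original test): index 0 is @@PADDING@@, index 1 is @@UNKNOWN@@, and added tokens follow in insertion order, so words never added to the vocabulary, such as "here" and "different", map to 1.

# Index layout assumed by the assertions above (0.x defaults).
from allennlp.data import Vocabulary

vocab = Vocabulary()
for word in ["this", "is", "a", "sentence", "."]:
    vocab.add_token_to_namespace(word)
assert vocab.get_token_index("this") == 2  # first added token, after padding/OOV
assert vocab.get_token_index("here") == 1  # unseen -> @@UNKNOWN@@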

Example 11: test_read_hdf5_raises_on_invalid_shape

# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
    def test_read_hdf5_raises_on_invalid_shape(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        embeddings_filename = self.TEST_DIR + "embeddings.hdf5"
        embeddings = numpy.random.rand(vocab.get_vocab_size(), 10)
        with h5py.File(embeddings_filename, 'w') as fout:
            _ = fout.create_dataset(
                    'embedding', embeddings.shape, dtype='float32', data=embeddings
            )

        params = Params({
                'pretrained_file': embeddings_filename,
                'embedding_dim': 5,
                })
        with pytest.raises(ConfigurationError):
            _ = Embedding.from_params(vocab, params)
Author: Jordan-Sauchuk, Project: allennlp, Lines: 18, Source: embedding_test.py

Example 12: test_adjacency_field_can_index_with_vocab

# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
    def test_adjacency_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("a", namespace="labels")
        vocab.add_token_to_namespace("b", namespace="labels")
        vocab.add_token_to_namespace("c", namespace="labels")

        labels = ["a", "b"]
        indices = [(0, 1), (2, 1)]
        adjacency_field = AdjacencyField(indices, self.text, labels)
        adjacency_field.index(vocab)
        tensor = adjacency_field.as_tensor(adjacency_field.get_padding_lengths())
        numpy.testing.assert_equal(tensor.numpy(), numpy.array([[-1, 0, -1, -1, -1],
                                                                [-1, -1, -1, -1, -1],
                                                                [-1, 1, -1, -1, -1],
                                                                [-1, -1, -1, -1, -1],
                                                                [-1, -1, -1, -1, -1]]))
Author: apmoore1, Project: allennlp, Lines: 18, Source: adjacency_field_test.py

Example 13: test_forward_works_with_projection_layer

# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
    def test_forward_works_with_projection_layer(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace('the')
        vocab.add_token_to_namespace('a')
        params = Params({
                'pretrained_file': 'tests/fixtures/glove.6B.300d.sample.txt.gz',
                'embedding_dim': 300,
                'projection_dim': 20
                })
        embedding_layer = Embedding.from_params(vocab, params)
        input_tensor = Variable(torch.LongTensor([[3, 2, 1, 0]]))
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 4, 20)

        input_tensor = Variable(torch.LongTensor([[[3, 2, 1, 0]]]))
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 1, 4, 20)
Author: Jordan-Sauchuk, Project: allennlp, Lines: 19, Source: embedding_test.py

Example 14: test_read_hdf5_format_file

# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
    def test_read_hdf5_format_file(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        vocab.add_token_to_namespace("word2")
        embeddings_filename = self.TEST_DIR + "embeddings.hdf5"
        embeddings = numpy.random.rand(vocab.get_vocab_size(), 5)
        with h5py.File(embeddings_filename, 'w') as fout:
            _ = fout.create_dataset(
                    'embedding', embeddings.shape, dtype='float32', data=embeddings
            )

        params = Params({
                'pretrained_file': embeddings_filename,
                'embedding_dim': 5,
                })
        embedding_layer = Embedding.from_params(vocab, params)
        assert numpy.allclose(embedding_layer.weight.data.numpy(), embeddings)
Author: Jordan-Sauchuk, Project: allennlp, Lines: 19, Source: embedding_test.py

Example 15: test_start_and_end_tokens

# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import add_token_to_namespace [as alias]
    def test_start_and_end_tokens(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace='characters')  # 2
        vocab.add_token_to_namespace("s", namespace='characters')  # 3
        vocab.add_token_to_namespace("e", namespace='characters')  # 4
        vocab.add_token_to_namespace("n", namespace='characters')  # 5
        vocab.add_token_to_namespace("t", namespace='characters')  # 6
        vocab.add_token_to_namespace("c", namespace='characters')  # 7
        vocab.add_token_to_namespace("<", namespace='characters')  # 8
        vocab.add_token_to_namespace(">", namespace='characters')  # 9
        vocab.add_token_to_namespace("/", namespace='characters')  # 10

        indexer = TokenCharactersIndexer("characters", start_tokens=["<s>"], end_tokens=["</s>"])
        indices = indexer.tokens_to_indices([Token("sentential")], vocab, "char")
        assert indices == {"char": [[8, 3, 9],
                                    [3, 4, 5, 6, 4, 5, 6, 1, 1, 1],
                                    [8, 10, 3, 9]]}
Author: apmoore1, Project: allennlp, Lines: 19, Source: character_token_indexer_test.py
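
The expected indices in Example 15 follow the same layout (a reading of the test under the assumed 0.x defaults): 0 is padding and 1 is @@UNKNOWN@@ in the padded 'characters' namespace, so the added characters start at 2, as the inline comments note. "<s>" therefore becomes [8, 3, 9], "</s>" becomes [8, 10, 3, 9], and the characters 'i', 'a', 'l' of "sentential", never added to the vocabulary, all map to 1.

# Spot-check of that mapping, continuing the vocab built in Example 15
# (a sketch, not part of the original test).
assert vocab.get_token_index('<', namespace='characters') == 8
assert vocab.get_token_index('i', namespace='characters') == 1  # unseen -> @@UNKNOWN@@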


Note: The allennlp.data.Vocabulary.add_token_to_namespace examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective developers, and copyright remains with the original authors. Consult each project's license before using or redistributing the code; do not republish without permission.