本文整理汇总了Python中allennlp.data.Vocabulary方法的典型用法代码示例。如果您正苦于以下问题:Python data.Vocabulary方法的具体用法?Python data.Vocabulary怎么用?Python data.Vocabulary使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类allennlp.data
的用法示例。
在下文中一共展示了data.Vocabulary方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def __init__(self, vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             verbose_metrics: bool = False,
             dropout: float = 0.2,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             ) -> None:
    """Build the classifier: text embedder -> dropout -> linear projection over labels.

    Tracks overall categorical accuracy plus a per-label F1 metric.
    """
    super(TextClassifier, self).__init__(vocab, regularizer)
    self.text_field_embedder = text_field_embedder
    self.dropout = torch.nn.Dropout(dropout)
    self.num_classes = self.vocab.get_vocab_size("labels")
    # A single linear layer maps the embedded text to label logits.
    self.classifier_feedforward = torch.nn.Linear(
        self.text_field_embedder.get_output_dim(), self.num_classes
    )
    self.label_accuracy = CategoricalAccuracy()
    self.verbose_metrics = verbose_metrics
    # One F1 metric per label, keyed by the label's surface string.
    self.label_f1_metrics = {
        vocab.get_token_from_index(index=label_index, namespace="labels"):
            F1Measure(positive_label=label_index)
        for label_index in range(self.num_classes)
    }
    self.loss = torch.nn.CrossEntropyLoss()
    initializer(self)
示例2: from_params
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def from_params(cls, vocab: Vocabulary, params: Params) -> 'CrfTagger':
    """Construct a ``CrfTagger`` from a ``Params`` configuration block.

    Pops every recognised key off ``params`` and then asserts the block is
    empty, so unrecognised configuration keys fail loudly.
    """
    # Dict literal evaluation order preserves the original pop order.
    kwargs = dict(
        vocab=vocab,
        text_field_embedder=TextFieldEmbedder.from_params(
            vocab, params.pop("text_field_embedder")
        ),
        encoder=Seq2SeqEncoder.from_params(params.pop("encoder")),
        label_namespace=params.pop("label_namespace", "labels"),
        constraint_type=params.pop("constraint_type", None),
        dropout=params.pop("dropout", None),
        include_start_end_transitions=params.pop("include_start_end_transitions", True),
        initializer=InitializerApplicator.from_params(params.pop('initializer', [])),
        regularizer=RegularizerApplicator.from_params(params.pop('regularizer', [])),
    )
    params.assert_empty(cls.__name__)
    return cls(**kwargs)
示例3: _read_embeddings_from_hdf5
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def _read_embeddings_from_hdf5(
    embeddings_filename: str, embedding_dim: int, vocab: Vocabulary, namespace: str = "tokens"
) -> torch.FloatTensor:
    """
    Reads from a hdf5 formatted file. The embedding matrix is assumed to
    be keyed by 'embedding' and of size `(num_tokens, embedding_dim)`.
    """
    # `[...]` materialises the dataset as an in-memory array before the file closes.
    with h5py.File(embeddings_filename, "r") as fin:
        matrix = fin["embedding"][...]
    expected_shape = [vocab.get_vocab_size(namespace), embedding_dim]
    if list(matrix.shape) != expected_shape:
        raise ConfigurationError(
            "Read shape {0} embeddings from the file, but expected {1}".format(
                list(matrix.shape), expected_shape
            )
        )
    return torch.FloatTensor(matrix)
示例4: __init__
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def __init__(
    self,
    vocab: Vocabulary,
    vocab_namespace: str = "tokens",
    projection_dim: int = None,
    ignore_oov: bool = False,
) -> None:
    """Bag-of-word-counts embedder over one vocabulary namespace.

    Output dimension is ``projection_dim`` when a projection layer is
    requested, otherwise the namespace's vocabulary size.
    """
    super().__init__()
    self.vocab = vocab
    self.vocab_size = vocab.get_vocab_size(vocab_namespace)
    # Optional linear projection from the count vector down to projection_dim.
    self._projection = (
        torch.nn.Linear(self.vocab_size, projection_dim) if projection_dim else None
    )
    self._ignore_oov = ignore_oov
    # Resolve the OOV token's index up front; missing OOV is a config error.
    oov_index = vocab.get_token_to_index_vocabulary(vocab_namespace).get(vocab._oov_token)
    if oov_index is None:
        raise ConfigurationError(
            "OOV token does not exist in vocabulary namespace {}".format(vocab_namespace)
        )
    self._oov_idx = oov_index
    self.output_dim = projection_dim or self.vocab_size
示例5: test_forward_works_with_projection_layer
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_forward_works_with_projection_layer(self):
    """An Embedding with projection_dim=20 should emit 20-dim vectors for any input rank."""
    vocab = Vocabulary()
    for token in ("the", "a"):
        vocab.add_token_to_namespace(token)
    params = Params(
        {
            "pretrained_file": str(
                self.FIXTURES_ROOT / "embeddings/glove.6B.300d.sample.txt.gz"
            ),
            "embedding_dim": 300,
            "projection_dim": 20,
        }
    )
    layer = Embedding.from_params(params, vocab=vocab)
    # (batch, seq) ids -> (batch, seq, projection_dim)
    output = layer(torch.LongTensor([[3, 2, 1, 0]])).data.numpy()
    assert output.shape == (1, 4, 20)
    # Extra leading dimensions are preserved.
    output = layer(torch.LongTensor([[[3, 2, 1, 0]]])).data.numpy()
    assert output.shape == (1, 1, 4, 20)
示例6: test_embedding_layer_actually_initializes_word_vectors_correctly
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
    """Rows for tokens found in the pretrained file must match the file's vectors."""
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
    vocab.add_token_to_namespace(unicode_space)
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    # Write a tiny gzip'd pretrained file covering "word" and the unicode token.
    with gzip.open(embeddings_filename, "wb") as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))
        embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode("utf-8"))
    layer = Embedding.from_params(
        Params({"pretrained_file": embeddings_filename, "embedding_dim": 3}), vocab=vocab
    )
    weight = layer.weight.data
    assert numpy.allclose(
        weight[vocab.get_token_index("word")].numpy(), numpy.array([1.0, 2.3, -1.0])
    )
    assert numpy.allclose(
        weight[vocab.get_token_index(unicode_space)].numpy(), numpy.array([3.4, 3.3, 5.0])
    )
    # "word2" is absent from the file, so its row is randomly initialised.
    assert not numpy.allclose(
        weight[vocab.get_token_index("word2")].numpy(), numpy.array([1.0, 2.3, -1.0])
    )
示例7: test_embedding_vocab_extension_with_specified_namespace
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_embedding_vocab_extension_with_specified_namespace(self):
    """Extending vocab in an explicitly named namespace grows the weight by one row."""
    vocab = Vocabulary()
    for word in ("word1", "word2"):
        vocab.add_token_to_namespace(word, "tokens_a")
    embedder = Embedding.from_params(
        Params({"vocab_namespace": "tokens_a", "embedding_dim": 10}), vocab=vocab
    )
    weight_before = embedder.weight
    # 2 tokens plus padding and OOV entries.
    assert weight_before.shape[0] == 4
    vocab._extend({"tokens_a": {"word3": 1}})
    embedder.extend_vocab(vocab, "tokens_a")  # specified namespace
    weight_after = embedder.weight
    assert weight_after.shape[0] == 5
    # Pre-existing rows are left untouched by the extension.
    assert torch.all(weight_after[:4, :] == weight_before[:4, :])
示例8: test_embedding_vocab_extension_with_default_namespace
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_embedding_vocab_extension_with_default_namespace(self):
    """Extending the default "tokens" namespace grows the weight by one row."""
    vocab = Vocabulary()
    for word in ("word1", "word2"):
        vocab.add_token_to_namespace(word)
    embedder = Embedding.from_params(
        Params({"vocab_namespace": "tokens", "embedding_dim": 10}), vocab=vocab
    )
    weight_before = embedder.weight
    # 2 tokens plus padding and OOV entries.
    assert weight_before.shape[0] == 4
    vocab._extend({"tokens": {"word3": 1}})
    embedder.extend_vocab(vocab)  # default namespace
    weight_after = embedder.weight
    assert weight_after.shape[0] == 5
    # Pre-existing rows are left untouched by the extension.
    assert torch.all(weight_after[:4, :] == weight_before[:4, :])
示例9: test_embedding_vocab_extension_without_stored_namespace
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_embedding_vocab_extension_without_stored_namespace(self):
    """Extension still works when the embedder has no stored _vocab_namespace."""
    vocab = Vocabulary()
    for word in ("word1", "word2"):
        vocab.add_token_to_namespace(word, "tokens_a")
    embedder = Embedding.from_params(
        Params({"vocab_namespace": "tokens_a", "embedding_dim": 10}), vocab=vocab
    )
    # Previous models won't have _vocab_namespace attribute. Force it to be None
    embedder._vocab_namespace = None
    weight_before = embedder.weight
    # 2 tokens plus padding and OOV entries.
    assert weight_before.shape[0] == 4
    vocab._extend({"tokens_a": {"word3": 1}})
    embedder.extend_vocab(vocab, "tokens_a")  # specified namespace
    weight_after = embedder.weight
    assert weight_after.shape[0] == 5
    # Pre-existing rows are left untouched by the extension.
    assert torch.all(weight_after[:4, :] == weight_before[:4, :])
示例10: test_embedding_constructed_directly_with_pretrained_file
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_embedding_constructed_directly_with_pretrained_file(self):
    """Constructing Embedding directly (not via from_params) also loads pretrained rows."""
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
    vocab.add_token_to_namespace(unicode_space)
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    # Write a tiny gzip'd pretrained file covering "word" and the unicode token.
    with gzip.open(embeddings_filename, "wb") as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))
        embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode("utf-8"))
    layer = Embedding(
        embedding_dim=3,
        num_embeddings=vocab.get_vocab_size(),
        pretrained_file=embeddings_filename,
        vocab=vocab,
    )
    weight = layer.weight.data
    assert numpy.allclose(
        weight[vocab.get_token_index("word")].numpy(), numpy.array([1.0, 2.3, -1.0])
    )
    assert numpy.allclose(
        weight[vocab.get_token_index(unicode_space)].numpy(), numpy.array([3.4, 3.3, 5.0])
    )
    # "word2" is absent from the file, so its row is randomly initialised.
    assert not numpy.allclose(
        weight[vocab.get_token_index("word2")].numpy(), numpy.array([1.0, 2.3, -1.0])
    )
示例11: test_start_and_end_tokens
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_start_and_end_tokens(self):
    """start_tokens/end_tokens wrap the character sequences with <s> and </s>."""
    vocab = Vocabulary()
    # Characters get indices 2..10 in insertion order (0/1 are padding and OOV).
    for character in "Asentc<>/":
        vocab.add_token_to_namespace(character, namespace="characters")
    indexer = TokenCharactersIndexer(
        "characters", start_tokens=["<s>"], end_tokens=["</s>"], min_padding_length=1
    )
    indices = indexer.tokens_to_indices([Token("sentential")], vocab)
    # "i"/"a"/"l" are unseen, so they map to the OOV index 1.
    assert indices == {
        "token_characters": [[8, 3, 9], [3, 4, 5, 6, 4, 5, 6, 1, 1, 1], [8, 10, 3, 9]]
    }
示例12: test_as_array_produces_token_sequence_bert_cased_sentence_pair
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_as_array_produces_token_sequence_bert_cased_sentence_pair(self):
    """AllenNLP sentence-pair special-token handling matches raw HuggingFace output."""
    tokenizer = cached_transformers.get_tokenizer("bert-base-cased")
    allennlp_tokenizer = PretrainedTransformerTokenizer(
        "bert-base-cased", add_special_tokens=False
    )
    indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
    # Reference: tokenize the fully formatted pair directly with HuggingFace.
    default_format = "[CLS] AllenNLP is great! [SEP] Really it is! [SEP]"
    expected_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(default_format))
    first_sentence = allennlp_tokenizer.tokenize("AllenNLP is great!")
    second_sentence = allennlp_tokenizer.tokenize("Really it is!")
    allennlp_tokens = allennlp_tokenizer.add_special_tokens(first_sentence, second_sentence)
    indexed = indexer.tokens_to_indices(allennlp_tokens, Vocabulary())
    assert indexed["token_ids"] == expected_ids
示例13: test_transformers_vocab_sizes
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_transformers_vocab_sizes(self):
    """Indexing copies the full transformer vocab into the AllenNLP namespace."""
    def check_vocab_size(model_name: str):
        namespace = "tags"
        tokenizer = cached_transformers.get_tokenizer(model_name)
        allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
        indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace)
        tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
        vocab = Vocabulary()
        # Indexing has the side effect of copying the transformer vocab over;
        # the indexed result itself is irrelevant here.
        indexer.tokens_to_indices(tokens, vocab)
        assert vocab.get_vocab_size(namespace=namespace) == tokenizer.vocab_size

    for model_name in ("roberta-base", "bert-base-cased", "xlm-mlm-ende-1024"):
        check_vocab_size(model_name)
示例14: test_long_sequence_splitting
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_long_sequence_splitting(self):
    """With max_length=4, the indexer splits segments and re-inserts [SEP]/[CLS] pairs."""
    tokenizer = cached_transformers.get_tokenizer("bert-base-uncased")
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased", max_length=4)
    string_specials = "[CLS] AllenNLP is great [SEP]"
    string_no_specials = "AllenNLP is great"
    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(string_specials))
    assert len(ids) == 7  # just to make sure it's what we're expecting
    cls_id, sep_id = ids[0], ids[-1]
    # Each segment boundary gains a trailing [SEP] and a leading [CLS].
    expected_ids = ids[:3] + [sep_id, cls_id] + ids[3:5] + [sep_id, cls_id] + ids[5:]
    indexed = indexer.tokens_to_indices(
        allennlp_tokenizer.tokenize(string_no_specials), Vocabulary()
    )
    assert indexed["token_ids"] == expected_ids
    assert indexed["segment_concat_mask"] == [True] * len(expected_ids)
    assert indexed["mask"] == [True] * 7  # original length
示例15: test_indices_to_tokens
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_indices_to_tokens(self):
    """indices_to_tokens inverts tokens_to_indices (modulo max_length specials)."""
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
    indexer_max_length = PretrainedTransformerIndexer(
        model_name="bert-base-uncased", max_length=4
    )
    indexer_no_max_length = PretrainedTransformerIndexer(model_name="bert-base-uncased")
    tokens = allennlp_tokenizer.tokenize("AllenNLP is great")
    vocab = Vocabulary()
    # Without max_length the round trip is exact.
    round_tripped = indexer_no_max_length.indices_to_tokens(
        indexer_no_max_length.tokens_to_indices(tokens, vocab), vocab
    )
    self._assert_tokens_equal(tokens, round_tripped)
    round_tripped = indexer_max_length.indices_to_tokens(
        indexer_max_length.tokens_to_indices(tokens, vocab), vocab
    )
    # For now we are not removing special tokens introduced from max_length
    sep_cls = [tokens[-1], tokens[0]]
    expected = tokens[:3] + sep_cls + tokens[3:5] + sep_cls + tokens[5:]
    self._assert_tokens_equal(expected, round_tripped)