

Python vocabulary.Vocabulary Code Examples

This article collects typical usage examples of allennlp.data.vocabulary.Vocabulary in Python. If you are wondering what exactly vocabulary.Vocabulary does and how to use it, the curated examples below may help. You can also explore other usage examples from the allennlp.data.vocabulary module.


The following presents 15 code examples of vocabulary.Vocabulary, sorted by popularity by default.

Example 1: __init__

# Required imports: from allennlp.data import vocabulary [as alias]
# or: from allennlp.data.vocabulary import Vocabulary [as alias]
# This example also uses: from typing import Dict, List; from collections import defaultdict
def __init__(self,
                 vocabulary: Vocabulary,
                 namespace: str = "intent_labels",
                 ignore_classes: List[str] = None,
                 coarse: bool = True) -> None:
        """
        Parameters
        ----------
        vocabulary : ``Vocabulary``, required.
            A vocabulary containing the label namespace.
        namespace : str, optional (default = "intent_labels").
            The vocabulary namespace for labels.
        ignore_classes : List[str], optional.
            Labels which will be ignored when computing metrics.
        coarse : bool, optional (default = True).
            Whether to additionally aggregate counts coarsely across all labels.
        """
        self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(namespace)
        self._ignore_classes: List[str] = ignore_classes or []
        self._coarse = coarse

        # These will hold per label span counts.
        self._true_positives: Dict[str, int] = defaultdict(int)
        self._false_positives: Dict[str, int] = defaultdict(int)
        self._false_negatives: Dict[str, int] = defaultdict(int) 
Developer: ConvLab, Project: ConvLab, Lines: 25, Source: multilabel_f1_measure.py

Example 2: __init__

# Required imports: from allennlp.data import vocabulary [as alias]
# or: from allennlp.data.vocabulary import Vocabulary [as alias]
def __init__(self,
                 embedder: TextFieldEmbedder,
                 vocab: Vocabulary,
                 lm_head: LanguageModelHead = None,
                 clf_head: ClassificationHead = None,
                 language_model_weight: float = 0.5) -> None:
        
        super().__init__(vocab)
        
        # At least one of the two heads must be provided.
        assert not (lm_head is None and clf_head is None)
        
        self.embedder = embedder
        self.clf_head = clf_head
        self.lm_head = lm_head
        self.language_model_weight = language_model_weight
        self.vocab = vocab 
Developer: DFKI-NLP, Project: DISTRE, Lines: 18, Source: model.py

Example 3: tokens_to_indices

# Required imports: from allennlp.data import vocabulary [as alias]
# or: from allennlp.data.vocabulary import Vocabulary [as alias]
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
        self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)

        wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize([t.text for t in tokens])

        # For tokens that don't correspond to any word pieces, we put (-1, -1) into the offsets.
        # That causes the embedding for the token to be all zeros.
        offsets = [x if x is not None else (-1, -1) for x in offsets]

        output: IndexedTokenList = {
            "token_ids": [t.text_id for t in wordpieces],
            "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
            "type_ids": [t.type_id for t in wordpieces],
            "offsets": offsets,
            "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
        }

        return self._matched_indexer._postprocess_output(output) 
Developer: allenai, Project: allennlp, Lines: 20, Source: pretrained_transformer_mismatched_indexer.py
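
The (-1, -1) convention only matters downstream, when the mismatched embedder pools the wordpiece vectors back into one vector per original token. Below is a minimal, self-contained sketch of that pooling step (the function pool_wordpieces and the choice of mean pooling are illustrative assumptions; the real pooling happens inside AllenNLP's mismatched embedder):

import torch

def pool_wordpieces(wordpiece_embeddings: torch.Tensor, offsets: list) -> torch.Tensor:
    """Average the wordpiece vectors belonging to each original token.

    wordpiece_embeddings is (num_wordpieces, dim); offsets holds one
    (start, end) pair per original token, inclusive, with (-1, -1)
    marking tokens that produced no wordpieces.
    """
    dim = wordpiece_embeddings.size(-1)
    pooled = []
    for start, end in offsets:
        if start == -1:
            # No wordpieces for this token: its embedding is all zeros.
            pooled.append(torch.zeros(dim))
        else:
            pooled.append(wordpiece_embeddings[start : end + 1].mean(dim=0))
    return torch.stack(pooled)

# Three wordpieces, two original tokens: the first token spans wordpieces
# 0-1, the second produced no wordpieces at all.
embeddings = torch.randn(3, 4)
print(pool_wordpieces(embeddings, [(0, 1), (-1, -1)]))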

Example 4: _add_encoding_to_vocabulary_if_needed

# Required imports: from allennlp.data import vocabulary [as alias]
# or: from allennlp.data.vocabulary import Vocabulary [as alias]
def _add_encoding_to_vocabulary_if_needed(self, vocab: Vocabulary) -> None:
        """
        Copies tokens from the ``transformers`` model's vocab to the specified namespace.
        """
        if self._added_to_vocabulary:
            return

        try:
            vocab_items = self._tokenizer.get_vocab().items()
        except NotImplementedError:
            vocab_items = (
                (self._tokenizer.convert_ids_to_tokens(idx), idx)
                for idx in range(self._tokenizer.vocab_size)
            )
        for word, idx in vocab_items:
            vocab._token_to_index[self._namespace][word] = idx
            vocab._index_to_token[self._namespace][idx] = word

        self._added_to_vocabulary = True 
Developer: allenai, Project: allennlp, Lines: 21, Source: pretrained_transformer_indexer.py
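
To see what this copy produces, here is a self-contained sketch of the same pattern with plain dictionaries standing in for Vocabulary's internal _token_to_index / _index_to_token mappings (StubTokenizer and the namespace name are illustrative assumptions):

from collections import defaultdict

class StubTokenizer:
    """Stand-in for a transformers tokenizer with a tiny fixed vocab."""
    def get_vocab(self):
        return {"[PAD]": 0, "[CLS]": 1, "hello": 2, "world": 3}

# Minimal namespace dicts mimicking Vocabulary's internal mappings.
token_to_index = defaultdict(dict)
index_to_token = defaultdict(dict)

namespace = "tags"  # hypothetical namespace name
for word, idx in StubTokenizer().get_vocab().items():
    token_to_index[namespace][word] = idx
    index_to_token[namespace][idx] = word

assert index_to_token[namespace][2] == "hello"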

Example 5: tokens_to_indices

# Required imports: from allennlp.data import vocabulary [as alias]
# or: from allennlp.data.vocabulary import Vocabulary [as alias]
# This example also uses: import itertools; from typing import Dict, List
def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary
    ) -> Dict[str, List[int]]:
        indices: List[int] = []

        for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
            text = self._get_feature_value(token)
            if self.namespace is None:
                # We could have a check here that `text` is an int; not sure it's worth it.
                indices.append(text)  # type: ignore
            else:
                if self.lowercase_tokens:
                    text = text.lower()
                indices.append(vocabulary.get_token_index(text, self.namespace))

        return {"tokens": indices} 
Developer: allenai, Project: allennlp, Lines: 18, Source: single_id_token_indexer.py

Example 6: tokens_to_indices

# Required imports: from allennlp.data import vocabulary [as alias]
# or: from allennlp.data.vocabulary import Vocabulary [as alias]
# This example also uses: import itertools; from typing import Dict, List; from allennlp.common.checks import ConfigurationError
def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary
    ) -> Dict[str, List[List[int]]]:
        indices: List[List[int]] = []
        for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
            token_indices: List[int] = []
            if token.text is None:
                raise ConfigurationError(
                    "TokenCharactersIndexer needs a tokenizer that retains text"
                )
            for character in self._character_tokenizer.tokenize(token.text):
                if getattr(character, "text_id", None) is not None:
                    # `text_id` being set on the token means that we aren't using the vocab, we just
                    # use this id instead.
                    index = character.text_id
                else:
                    index = vocabulary.get_token_index(character.text, self._namespace)
                token_indices.append(index)
            indices.append(token_indices)
        return {"token_characters": indices} 
Developer: allenai, Project: allennlp, Lines: 22, Source: token_characters_indexer.py

Example 7: count_vocab_items

# Required imports: from allennlp.data import vocabulary [as alias]
# or: from allennlp.data.vocabulary import Vocabulary [as alias]
# This example also uses: from typing import Dict
def count_vocab_items(self, counter: Dict[str, Dict[str, int]]):
        """
        If there are strings in this field that need to be converted into integers through a
        :class:`Vocabulary`, here is where we count them, to determine which tokens are in or out
        of the vocabulary.

        If your `Field` does not have any strings that need to be converted into indices, you do
        not need to implement this method.

        A note on this `counter`: because `Fields` can represent conceptually different things,
        we separate the vocabulary items by `namespaces`.  This way, we can use a single shared
        mechanism to handle all mappings from strings to integers in all fields, while keeping
        words in a `TextField` from sharing the same ids with labels in a `LabelField` (e.g.,
        "entailment" or "contradiction" are labels in an entailment task)

        Additionally, a single `Field` might want to use multiple namespaces - `TextFields` can
        be represented as a combination of word ids and character ids, and you don't want words and
        characters to share the same vocabulary - "a" as a word should get a different id from "a"
        as a character, and the vocabulary sizes of words and characters are very different.

        Because of this, the first key in the `counter` object is a `namespace`, like "tokens",
        "token_characters", "tags", or "labels", and the second key is the actual vocabulary item.
        """
        pass 
Developer: allenai, Project: allennlp, Lines: 26, Source: field.py
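
To make the namespace convention described above concrete, here is a minimal sketch of how a toy field could implement count_vocab_items, counting words and characters into separate namespaces (ToyTextField is an illustrative stand-in, not AllenNLP's actual TextField):

from collections import defaultdict
from typing import Dict

class ToyTextField:
    """Hypothetical field holding raw token strings, for illustration only."""
    def __init__(self, tokens):
        self.tokens = tokens

    def count_vocab_items(self, counter: Dict[str, Dict[str, int]]):
        # Words and characters go into separate namespaces, so "a" the
        # word and "a" the character get independent ids later on.
        for token in self.tokens:
            counter["tokens"][token] += 1
            for char in token:
                counter["token_characters"][char] += 1

counter: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
ToyTextField(["a", "cat"]).count_vocab_items(counter)
assert counter["tokens"]["a"] == 1
assert counter["token_characters"]["a"] == 2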

Example 8: test_multilabel_field_empty_field_works

# Required imports: from allennlp.data import vocabulary [as alias]
# or: from allennlp.data.vocabulary import Vocabulary [as alias]
# This example also uses: import numpy; from allennlp.data.fields import MultiLabelField
def test_multilabel_field_empty_field_works(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
        vocab.add_token_to_namespace("label2", namespace="test_empty_labels")

        f = MultiLabelField([], label_namespace="test_empty_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))
        g = f.empty_field()
        g.index(vocab)
        tensor = g.as_tensor(g.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))

        h = MultiLabelField(
            [0, 0, 1], label_namespace="test_empty_labels", num_labels=3, skip_indexing=True
        )
        tensor = h.empty_field().as_tensor(None).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0, 0])) 
Developer: allenai, Project: allennlp, Lines: 21, Source: multilabel_field_test.py

Example 9: __init__

# Required imports: from allennlp.data import vocabulary [as alias]
# or: from allennlp.data.vocabulary import Vocabulary [as alias]
# This example also uses: import torch.nn as nn
def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 vocab: Vocabulary,
                 lstm_hidden_dim: int,
                 top_k: int,
                 cuda_device: int) -> None:
        super().__init__(vocab)

        self.word_embeddings = word_embeddings

        self.query_rep = nn.LSTM(self.word_embeddings.get_output_dim(), lstm_hidden_dim, batch_first=True, bidirectional=True)
        self.doc_rep = nn.LSTM(self.word_embeddings.get_output_dim(), lstm_hidden_dim, batch_first=True, bidirectional=True)

        # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights) 
        self.cosine_module = CosineMatrixAttention()

        self.top_k = top_k

        self.dense = nn.Linear(top_k, out_features=20, bias=True)
        self.dense2 = nn.Linear(20, out_features=20, bias=True)
        self.dense3 = nn.Linear(20, out_features=1, bias=False) 
Developer: sebastian-hofstaetter, Project: transformer-kernel-ranking, Lines: 23, Source: mv_lstm.py
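
As the comment notes, CosineMatrixAttention is just a cosine similarity matrix with no learnable weights. A minimal sketch of that computation in plain PyTorch (the batch and sequence shapes are illustrative assumptions):

import torch
import torch.nn.functional as F

# One cosine similarity per (query position, document position) pair.
query = torch.randn(2, 5, 8)  # (batch, query_len, dim)
doc = torch.randn(2, 7, 8)    # (batch, doc_len, dim)
sim = torch.bmm(F.normalize(query, dim=-1),
                F.normalize(doc, dim=-1).transpose(1, 2))
print(sim.shape)  # torch.Size([2, 5, 7])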

Example 10: __init__

# Required imports: from allennlp.data import vocabulary [as alias]
# or: from allennlp.data.vocabulary import Vocabulary [as alias]
# This example also uses: import torch
def __init__(self,
                 vocab: Vocabulary,
                 word_embedder: TextFieldEmbedder,
                 character_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 character_encoder: Seq2VecEncoder) -> None:
        super().__init__(vocab)

        self._word_embedder = word_embedder
        self._character_embedder = character_embedder
        self._character_encoder = character_encoder
        self._encoder = encoder
        self._classifier = torch.nn.Linear(
            in_features=encoder.get_output_dim(),
            out_features=vocab.get_vocab_size('labels')
        )

        self._f1 = SpanBasedF1Measure(vocab, 'labels') 
Developer: jbarrow, Project: allennlp_tutorial, Lines: 20, Source: lstm_character.py

Example 11: __init__

# Required imports: from allennlp.data import vocabulary [as alias]
# or: from allennlp.data.vocabulary import Vocabulary [as alias]
# This example also uses: import torch
def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder) -> None:
        super().__init__(vocab)

        self._embedder = embedder
        self._encoder = encoder
        self._classifier = torch.nn.Linear(
            in_features=encoder.get_output_dim(),
            out_features=vocab.get_vocab_size('labels')
        )
        self._crf = ConditionalRandomField(
            vocab.get_vocab_size('labels')
        )

        self._f1 = SpanBasedF1Measure(vocab, 'labels') 
Developer: jbarrow, Project: allennlp_tutorial, Lines: 19, Source: lstm_crf.py

Example 12: _add_encoding_to_vocabulary

# Required imports: from allennlp.data import vocabulary [as alias]
# or: from allennlp.data.vocabulary import Vocabulary [as alias]
def _add_encoding_to_vocabulary(self, vocabulary: Vocabulary) -> None:
        # pylint: disable=protected-access
        for word, idx in self.encoder.items():
            vocabulary._token_to_index[self._namespace][word] = idx
            vocabulary._index_to_token[self._namespace][idx] = word 
Developer: DFKI-NLP, Project: DISTRE, Lines: 7, Source: byte_pair_indexer.py

Example 13: tokens_to_indices

# Required imports: from allennlp.data import vocabulary [as alias]
# or: from allennlp.data.vocabulary import Vocabulary [as alias]
def tokens_to_indices(self,
                          tokens: List[Token],
                          vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        if not self._added_to_vocabulary:
            self._add_encoding_to_vocabulary(vocabulary)
            self._added_to_vocabulary = True

        text_tokens = []
        offsets = []
        offset = -1

        for token in tokens:
            bpe_tokens = [self.encoder.get(t, 0) for t in self.byte_pair_encode(token) if self.encoder.get(t, 0) != 0]

            if bpe_tokens:
                offset += len(bpe_tokens)
                offsets.append(offset)
                text_tokens.extend(bpe_tokens)

        num_tokens = len(text_tokens)

        # If there are too many tokens, that's going to cause problems.
        if num_tokens >= self.n_ctx:
            print('Sequence too long. Pruning!')
            text_tokens = text_tokens[:self.n_ctx]
            text_tokens[-2] = self.encoder['__clf__</w>']
            text_tokens[-1] = 0
        else:
            text_tokens.append(0)

        return {
                index_name: text_tokens,
                f"{index_name}-offsets": offsets,
                # add mask here according to the original tokens,
                # because calling util.get_text_field_mask on the
                # "byte pair" tokens will produce the wrong shape
                "mask": [1 for _ in offsets]
        } 
Developer: DFKI-NLP, Project: DISTRE, Lines: 41, Source: byte_pair_indexer.py
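
The shape issue mentioned in the final comment is easy to see with concrete numbers: the mask must align with the original tokens (one entry per offset), not with the expanded byte-pair sequence. A small illustration with made-up ids:

# Suppose 3 original tokens expand to 5 byte-pair ids; offsets records the
# position of each token's last byte pair, so its length tracks the
# original (word-level) sequence, not the expanded one.
text_tokens = [17, 4, 9, 23, 8]  # 5 byte-pair ids (illustrative values)
offsets = [1, 2, 4]              # last byte pair of each of the 3 tokens
mask = [1 for _ in offsets]      # one mask entry per original token
assert len(mask) == 3            # aligned with original tokens
assert len(text_tokens) == 5     # not with the expanded byte pairs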

Example 14: _add_encoding_to_vocabulary

# Required imports: from allennlp.data import vocabulary [as alias]
# or: from allennlp.data.vocabulary import Vocabulary [as alias]
def _add_encoding_to_vocabulary(self, vocabulary: Vocabulary) -> None:
        # pylint: disable=protected-access
        for word, idx in self.vocab.items():
            vocabulary._token_to_index[self._namespace][word] = idx
            vocabulary._index_to_token[self._namespace][idx] = word 
Developer: plkmo, Project: NLP_Toolkit, Lines: 7, Source: wordpiece_indexer.py

Example 15: tokens_to_indices

# Required imports: from allennlp.data import vocabulary [as alias]
# or: from allennlp.data.vocabulary import Vocabulary [as alias]
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
        self._add_encoding_to_vocabulary_if_needed(vocabulary)

        indices, type_ids = self._extract_token_and_type_ids(tokens)
        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        output: IndexedTokenList = {
            "token_ids": indices,
            "mask": [True] * len(indices),
            "type_ids": type_ids,
        }

        return self._postprocess_output(output) 
Developer: allenai, Project: allennlp, Lines: 14, Source: pretrained_transformer_indexer.py


Note: The allennlp.data.vocabulary.Vocabulary examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by their original authors, who retain the copyright; please consult each project's license before redistributing or using the code. Do not reproduce without permission.