This page collects typical usage examples of the Python class allennlp.data.vocabulary.Vocabulary. If you have been wondering how to use vocabulary.Vocabulary in Python, or what it is good for, the curated method examples below may help. You can also explore further usage of its containing module, allennlp.data.vocabulary.
The following 15 code examples of vocabulary.Vocabulary are sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
Example 1: __init__
# Required module import: from allennlp.data import vocabulary [as alias]
# Or: from allennlp.data.vocabulary import Vocabulary [as alias]
def __init__(self,
             vocabulary: Vocabulary,
             namespace: str = "intent_labels",
             ignore_classes: List[str] = None,
             coarse: bool = True) -> None:
    """
    Parameters
    ----------
    vocabulary : ``Vocabulary``, required.
        A vocabulary containing the label namespace.
    namespace : str, optional (default = "intent_labels").
        The vocabulary namespace for labels.
    ignore_classes : List[str], optional.
        Labels which will be ignored when computing metrics.
    """
    self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(namespace)
    self._ignore_classes: List[str] = ignore_classes or []
    self._coarse = coarse
    # These will hold per-label span counts.
    self._true_positives: Dict[str, int] = defaultdict(int)
    self._false_positives: Dict[str, int] = defaultdict(int)
    self._false_negatives: Dict[str, int] = defaultdict(int)
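The three defaultdicts above accumulate span counts per label. As a minimal sketch (not part of the original class, using a hypothetical "book_flight" intent label), this is how such counts typically turn into per-label precision, recall, and F1:

from collections import defaultdict
from typing import Dict, Tuple

def precision_recall_f1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]:
    # Guard against zero denominators for labels that were never predicted or never seen.
    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    return precision, recall, f1

true_positives: Dict[str, int] = defaultdict(int, {"book_flight": 8})
false_positives: Dict[str, int] = defaultdict(int, {"book_flight": 2})
false_negatives: Dict[str, int] = defaultdict(int, {"book_flight": 4})

print(precision_recall_f1(true_positives["book_flight"],
                          false_positives["book_flight"],
                          false_negatives["book_flight"]))
# (0.8, 0.666..., 0.727...)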
Example 2: __init__
# Required module import: from allennlp.data import vocabulary [as alias]
# Or: from allennlp.data.vocabulary import Vocabulary [as alias]
def __init__(self,
             embedder: TextFieldEmbedder,
             vocab: Vocabulary,
             lm_head: LanguageModelHead = None,
             clf_head: ClassificationHead = None,
             language_model_weight: float = 0.5) -> None:
    super().__init__(vocab)
    # At least one of the two heads must be provided.
    assert not (lm_head is None and clf_head is None)
    self.embedder = embedder
    self.clf_head = clf_head
    self.lm_head = lm_head
    self.language_model_weight = language_model_weight
    self.vocab = vocab
Example 3: tokens_to_indices
# Required module import: from allennlp.data import vocabulary [as alias]
# Or: from allennlp.data.vocabulary import Vocabulary [as alias]
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
    self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)
    wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize([t.text for t in tokens])
    # For tokens that don't correspond to any word pieces, we put (-1, -1) into the offsets.
    # That results in the embedding for the token being all zeros.
    offsets = [x if x is not None else (-1, -1) for x in offsets]
    output: IndexedTokenList = {
        "token_ids": [t.text_id for t in wordpieces],
        "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
        "type_ids": [t.type_id for t in wordpieces],
        "offsets": offsets,
        "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
    }
    return self._matched_indexer._postprocess_output(output)
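This snippet matches the tokens_to_indices method of AllenNLP's PretrainedTransformerMismatchedIndexer. A minimal usage sketch, assuming that class and a bert-base-uncased checkpoint are available:

from allennlp.data import Token, Vocabulary
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer

# Word-level tokens go in; wordpiece ids plus per-token offsets come out.
indexer = PretrainedTransformerMismatchedIndexer(model_name="bert-base-uncased")
vocab = Vocabulary()

output = indexer.tokens_to_indices([Token("unaffable"), Token("cat")], vocab)
print(output["token_ids"])  # wordpiece ids, including special tokens like [CLS]/[SEP]
print(output["offsets"])    # (start, end) wordpiece span for each original token
print(output["mask"])       # one entry per original word-level token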
Example 4: _add_encoding_to_vocabulary_if_needed
# Required module import: from allennlp.data import vocabulary [as alias]
# Or: from allennlp.data.vocabulary import Vocabulary [as alias]
def _add_encoding_to_vocabulary_if_needed(self, vocab: Vocabulary) -> None:
    """
    Copies tokens from the ``transformers`` model's vocab to the specified namespace.
    """
    if self._added_to_vocabulary:
        return
    try:
        vocab_items = self._tokenizer.get_vocab().items()
    except NotImplementedError:
        # Fall back to reconstructing the vocab id by id when get_vocab() is not implemented.
        vocab_items = (
            (self._tokenizer.convert_ids_to_tokens(idx), idx)
            for idx in range(self._tokenizer.vocab_size)
        )
    for word, idx in vocab_items:
        vocab._token_to_index[self._namespace][word] = idx
        vocab._index_to_token[self._namespace][idx] = word
    self._added_to_vocabulary = True
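The get_vocab() call is the standard Hugging Face transformers tokenizer API; a quick illustration of the token-to-id mapping that gets copied into the AllenNLP namespace:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# get_vocab() returns the full {token: id} dict used by the model.
print(tokenizer.get_vocab()["hello"])  # the id that would be copied into the namespace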
Example 5: tokens_to_indices
# Required module import: from allennlp.data import vocabulary [as alias]
# Or: from allennlp.data.vocabulary import Vocabulary [as alias]
def tokens_to_indices(
    self, tokens: List[Token], vocabulary: Vocabulary
) -> Dict[str, List[int]]:
    indices: List[int] = []
    for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
        text = self._get_feature_value(token)
        if self.namespace is None:
            # We could have a check here that `text` is an int; not sure it's worth it.
            indices.append(text)  # type: ignore
        else:
            if self.lowercase_tokens:
                text = text.lower()
            indices.append(vocabulary.get_token_index(text, self.namespace))
    return {"tokens": indices}
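This looks like the tokens_to_indices method of AllenNLP's SingleIdTokenIndexer; a small usage sketch under that assumption:

from allennlp.data import Token, Vocabulary
from allennlp.data.token_indexers import SingleIdTokenIndexer

vocab = Vocabulary()
vocab.add_token_to_namespace("hello", namespace="tokens")
vocab.add_token_to_namespace("world", namespace="tokens")

indexer = SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
print(indexer.tokens_to_indices([Token("Hello"), Token("world")], vocab))
# {'tokens': [2, 3]} -- exact ids depend on the vocabulary's padding/OOV slots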
Example 6: tokens_to_indices
# Required module import: from allennlp.data import vocabulary [as alias]
# Or: from allennlp.data.vocabulary import Vocabulary [as alias]
def tokens_to_indices(
    self, tokens: List[Token], vocabulary: Vocabulary
) -> Dict[str, List[List[int]]]:
    indices: List[List[int]] = []
    for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
        token_indices: List[int] = []
        if token.text is None:
            raise ConfigurationError(
                "TokenCharactersIndexer needs a tokenizer that retains text"
            )
        for character in self._character_tokenizer.tokenize(token.text):
            if getattr(character, "text_id", None) is not None:
                # `text_id` being set on the token means that we aren't using the vocab;
                # we just use this id instead.
                index = character.text_id
            else:
                index = vocabulary.get_token_index(character.text, self._namespace)
            token_indices.append(index)
        indices.append(token_indices)
    return {"token_characters": indices}
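A short usage sketch for TokenCharactersIndexer, assuming a vocabulary that already contains the individual characters:

from allennlp.data import Token, Vocabulary
from allennlp.data.token_indexers import TokenCharactersIndexer

vocab = Vocabulary()
for ch in "cat":
    vocab.add_token_to_namespace(ch, namespace="token_characters")

indexer = TokenCharactersIndexer(namespace="token_characters", min_padding_length=1)
print(indexer.tokens_to_indices([Token("cat")], vocab))
# {'token_characters': [[2, 3, 4]]} -- one list of character ids per token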
Example 7: count_vocab_items
# Required module import: from allennlp.data import vocabulary [as alias]
# Or: from allennlp.data.vocabulary import Vocabulary [as alias]
def count_vocab_items(self, counter: Dict[str, Dict[str, int]]):
    """
    If there are strings in this field that need to be converted into integers through a
    :class:`Vocabulary`, here is where we count them, to determine which tokens are in or
    out of the vocabulary.

    If your `Field` does not have any strings that need to be converted into indices, you
    do not need to implement this method.

    A note on this `counter`: because `Fields` can represent conceptually different things,
    we separate the vocabulary items by `namespaces`. This way, we can use a single shared
    mechanism to handle all mappings from strings to integers in all fields, while keeping
    words in a `TextField` from sharing the same ids with labels in a `LabelField` (e.g.,
    "entailment" or "contradiction" are labels in an entailment task).

    Additionally, a single `Field` might want to use multiple namespaces - `TextFields` can
    be represented as a combination of word ids and character ids, and you don't want words
    and characters to share the same vocabulary - "a" as a word should get a different id
    from "a" as a character, and the vocabulary sizes of words and characters are very
    different.

    Because of this, the first key in the `counter` object is a `namespace`, like "tokens",
    "token_characters", "tags", or "labels", and the second key is the actual vocabulary
    item.
    """
    pass
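The base method is a no-op. As an illustration of the namespace convention the docstring describes, here is a hypothetical LabelField-style override (the class and names below are made up for the example):

from collections import defaultdict
from typing import Dict

class MyLabelField:
    """Toy stand-in for an AllenNLP Field holding a single string label."""

    def __init__(self, label: str, label_namespace: str = "labels") -> None:
        self.label = label
        self._label_namespace = label_namespace

    def count_vocab_items(self, counter: Dict[str, Dict[str, int]]) -> None:
        # First key: the namespace; second key: the vocabulary item itself.
        counter[self._label_namespace][self.label] += 1

counter: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
MyLabelField("entailment").count_vocab_items(counter)
print(dict(counter["labels"]))  # {'entailment': 1}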
Example 8: test_multilabel_field_empty_field_works
# Required module import: from allennlp.data import vocabulary [as alias]
# Or: from allennlp.data.vocabulary import Vocabulary [as alias]
def test_multilabel_field_empty_field_works(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
    vocab.add_token_to_namespace("label2", namespace="test_empty_labels")

    f = MultiLabelField([], label_namespace="test_empty_labels")
    f.index(vocab)
    tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))

    g = f.empty_field()
    g.index(vocab)
    tensor = g.as_tensor(g.get_padding_lengths()).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))

    h = MultiLabelField(
        [0, 0, 1], label_namespace="test_empty_labels", num_labels=3, skip_indexing=True
    )
    tensor = h.empty_field().as_tensor(None).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0, 0]))
Example 9: __init__
# Required module import: from allennlp.data import vocabulary [as alias]
# Or: from allennlp.data.vocabulary import Vocabulary [as alias]
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             vocab: Vocabulary,
             lstm_hidden_dim: int,
             top_k: int,
             cuda_device: int) -> None:
    super().__init__(vocab)
    self.word_embeddings = word_embeddings
    self.query_rep = nn.LSTM(self.word_embeddings.get_output_dim(), lstm_hidden_dim,
                             batch_first=True, bidirectional=True)
    self.doc_rep = nn.LSTM(self.word_embeddings.get_output_dim(), lstm_hidden_dim,
                           batch_first=True, bidirectional=True)
    # This does not really do "attention" - it is just a plain cosine matrix
    # calculation (without learnable weights).
    self.cosine_module = CosineMatrixAttention()
    self.top_k = top_k
    self.dense = nn.Linear(top_k, out_features=20, bias=True)
    self.dense2 = nn.Linear(20, out_features=20, bias=True)
    self.dense3 = nn.Linear(20, out_features=1, bias=False)
Example 10: __init__
# Required module import: from allennlp.data import vocabulary [as alias]
# Or: from allennlp.data.vocabulary import Vocabulary [as alias]
def __init__(self,
             vocab: Vocabulary,
             word_embedder: TextFieldEmbedder,
             character_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             character_encoder: Seq2VecEncoder) -> None:
    super().__init__(vocab)
    self._word_embedder = word_embedder
    self._character_embedder = character_embedder
    self._character_encoder = character_encoder
    self._encoder = encoder
    self._classifier = torch.nn.Linear(
        in_features=encoder.get_output_dim(),
        out_features=vocab.get_vocab_size('labels')
    )
    self._f1 = SpanBasedF1Measure(vocab, 'labels')
Example 11: __init__
# Required module import: from allennlp.data import vocabulary [as alias]
# Or: from allennlp.data.vocabulary import Vocabulary [as alias]
def __init__(self,
             vocab: Vocabulary,
             embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder) -> None:
    super().__init__(vocab)
    self._embedder = embedder
    self._encoder = encoder
    self._classifier = torch.nn.Linear(
        in_features=encoder.get_output_dim(),
        out_features=vocab.get_vocab_size('labels')
    )
    # A CRF over the label space scores whole tag sequences instead of per-token choices.
    self._crf = ConditionalRandomField(
        vocab.get_vocab_size('labels')
    )
    self._f1 = SpanBasedF1Measure(vocab, 'labels')
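The ConditionalRandomField layer typically supplies both the training loss and the decoding step. A hedged sketch (dimensions and names assumed, not taken from the original model) of the usual forward-pass pattern:

import torch
from allennlp.modules import ConditionalRandomField

num_tags = 5
crf = ConditionalRandomField(num_tags)

logits = torch.randn(2, 7, num_tags)           # (batch, seq_len, num_tags)
tags = torch.randint(0, num_tags, (2, 7))      # gold tag ids
mask = torch.ones(2, 7, dtype=torch.bool)

loss = -crf(logits, tags, mask)                # negative log-likelihood over sequences
best_paths = crf.viterbi_tags(logits, mask)    # [(tag_sequence, viterbi_score), ...]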
Example 12: _add_encoding_to_vocabulary
# Required module import: from allennlp.data import vocabulary [as alias]
# Or: from allennlp.data.vocabulary import Vocabulary [as alias]
def _add_encoding_to_vocabulary(self, vocabulary: Vocabulary) -> None:
    # pylint: disable=protected-access
    # Copy the byte-pair encoder's token-to-id mapping directly into the namespace.
    for word, idx in self.encoder.items():
        vocabulary._token_to_index[self._namespace][word] = idx
        vocabulary._index_to_token[self._namespace][idx] = word
Example 13: tokens_to_indices
# Required module import: from allennlp.data import vocabulary [as alias]
# Or: from allennlp.data.vocabulary import Vocabulary [as alias]
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:
    if not self._added_to_vocabulary:
        self._add_encoding_to_vocabulary(vocabulary)
        self._added_to_vocabulary = True

    text_tokens = []
    offsets = []
    offset = -1
    for token in tokens:
        bpe_tokens = [self.encoder.get(t, 0) for t in self.byte_pair_encode(token)
                      if self.encoder.get(t, 0) != 0]
        if bpe_tokens:
            offset += len(bpe_tokens)
            offsets.append(offset)
            text_tokens.extend(bpe_tokens)

    num_tokens = len(text_tokens)
    # If there are too many tokens, that's going to cause problems.
    if num_tokens >= self.n_ctx:
        print('Sequence too long. Pruning!')
        text_tokens = text_tokens[:self.n_ctx]
        text_tokens[-2] = self.encoder['__clf__</w>']
        text_tokens[-1] = 0
    else:
        text_tokens.append(0)

    return {
        index_name: text_tokens,
        f"{index_name}-offsets": offsets,
        # Add the mask here according to the original tokens, because calling
        # util.get_text_field_mask on the "byte pair" tokens would produce the wrong shape.
        "mask": [1 for _ in offsets]
    }
Example 14: _add_encoding_to_vocabulary
# Required module import: from allennlp.data import vocabulary [as alias]
# Or: from allennlp.data.vocabulary import Vocabulary [as alias]
def _add_encoding_to_vocabulary(self, vocabulary: Vocabulary) -> None:
    # pylint: disable=protected-access
    # Same pattern as Example 12, but sourced from the wrapped tokenizer's vocab dict.
    for word, idx in self.vocab.items():
        vocabulary._token_to_index[self._namespace][word] = idx
        vocabulary._index_to_token[self._namespace][idx] = word
Example 15: tokens_to_indices
# Required module import: from allennlp.data import vocabulary [as alias]
# Or: from allennlp.data.vocabulary import Vocabulary [as alias]
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
    self._add_encoding_to_vocabulary_if_needed(vocabulary)
    indices, type_ids = self._extract_token_and_type_ids(tokens)
    # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
    output: IndexedTokenList = {
        "token_ids": indices,
        "mask": [True] * len(indices),
        "type_ids": type_ids,
    }
    return self._postprocess_output(output)
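This final snippet matches the tokens_to_indices method of AllenNLP's PretrainedTransformerIndexer. A closing usage sketch, assuming tokens produced by the matching PretrainedTransformerTokenizer:

from allennlp.data import Vocabulary
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer

tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")
vocab = Vocabulary()

tokens = tokenizer.tokenize("Hello world")
output = indexer.tokens_to_indices(tokens, vocab)
print(sorted(output.keys()))  # ['mask', 'token_ids', 'type_ids']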