本文整理汇总了Python中allennlp.data.Vocabulary方法的典型用法代码示例。如果您正苦于以下问题:Python data.Vocabulary方法的具体用法?Python data.Vocabulary怎么用?Python data.Vocabulary使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类allennlp.data
的用法示例。
在下文中一共展示了data.Vocabulary方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def __init__(self, vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             verbose_metrics: bool = False,
             dropout: float = 0.2,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             ) -> None:
    """Build the classifier: text embedder -> dropout -> linear projection over labels.

    Tracks overall categorical accuracy plus a per-label F1 metric.
    """
    super(TextClassifier, self).__init__(vocab, regularizer)
    self.text_field_embedder = text_field_embedder
    self.dropout = torch.nn.Dropout(dropout)
    self.num_classes = self.vocab.get_vocab_size("labels")
    # A single linear layer maps the embedded text to label logits.
    self.classifier_feedforward = torch.nn.Linear(
        self.text_field_embedder.get_output_dim(), self.num_classes
    )
    self.label_accuracy = CategoricalAccuracy()
    self.verbose_metrics = verbose_metrics
    # One F1 metric per label, keyed by the label's surface string.
    self.label_f1_metrics = {
        vocab.get_token_from_index(index=label_index, namespace="labels"):
            F1Measure(positive_label=label_index)
        for label_index in range(self.num_classes)
    }
    self.loss = torch.nn.CrossEntropyLoss()
    initializer(self)
示例2: from_params
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def from_params(cls, vocab: Vocabulary, params: Params) -> 'CrfTagger':
    """Construct a ``CrfTagger`` from a ``Params`` configuration block.

    Pops every recognised key off ``params`` and then asserts the block is
    empty, so unrecognised configuration keys fail loudly.
    """
    # Dict literal evaluation order preserves the original pop order.
    kwargs = dict(
        vocab=vocab,
        text_field_embedder=TextFieldEmbedder.from_params(
            vocab, params.pop("text_field_embedder")
        ),
        encoder=Seq2SeqEncoder.from_params(params.pop("encoder")),
        label_namespace=params.pop("label_namespace", "labels"),
        constraint_type=params.pop("constraint_type", None),
        dropout=params.pop("dropout", None),
        include_start_end_transitions=params.pop("include_start_end_transitions", True),
        initializer=InitializerApplicator.from_params(params.pop('initializer', [])),
        regularizer=RegularizerApplicator.from_params(params.pop('regularizer', [])),
    )
    params.assert_empty(cls.__name__)
    return cls(**kwargs)
示例3: _read_embeddings_from_hdf5
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def _read_embeddings_from_hdf5(
    embeddings_filename: str, embedding_dim: int, vocab: Vocabulary, namespace: str = "tokens"
) -> torch.FloatTensor:
    """
    Reads from a hdf5 formatted file. The embedding matrix is assumed to
    be keyed by 'embedding' and of size `(num_tokens, embedding_dim)`.
    """
    # `[...]` materialises the dataset as an in-memory array before the file closes.
    with h5py.File(embeddings_filename, "r") as fin:
        matrix = fin["embedding"][...]
    expected_shape = [vocab.get_vocab_size(namespace), embedding_dim]
    if list(matrix.shape) != expected_shape:
        raise ConfigurationError(
            "Read shape {0} embeddings from the file, but expected {1}".format(
                list(matrix.shape), expected_shape
            )
        )
    return torch.FloatTensor(matrix)
示例4: __init__
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def __init__(
    self,
    vocab: Vocabulary,
    vocab_namespace: str = "tokens",
    projection_dim: int = None,
    ignore_oov: bool = False,
) -> None:
    """Bag-of-word-counts embedder over one vocabulary namespace.

    Output dimension is ``projection_dim`` when a projection layer is
    requested, otherwise the namespace's vocabulary size.
    """
    super().__init__()
    self.vocab = vocab
    self.vocab_size = vocab.get_vocab_size(vocab_namespace)
    # Optional linear projection from the count vector down to projection_dim.
    self._projection = (
        torch.nn.Linear(self.vocab_size, projection_dim) if projection_dim else None
    )
    self._ignore_oov = ignore_oov
    # Resolve the OOV token's index up front; missing OOV is a config error.
    oov_index = vocab.get_token_to_index_vocabulary(vocab_namespace).get(vocab._oov_token)
    if oov_index is None:
        raise ConfigurationError(
            "OOV token does not exist in vocabulary namespace {}".format(vocab_namespace)
        )
    self._oov_idx = oov_index
    self.output_dim = projection_dim or self.vocab_size
示例5: test_forward_works_with_projection_layer
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_forward_works_with_projection_layer(self):
    """An Embedding with projection_dim=20 should emit 20-dim vectors for any input rank."""
    vocab = Vocabulary()
    for token in ("the", "a"):
        vocab.add_token_to_namespace(token)
    params = Params(
        {
            "pretrained_file": str(
                self.FIXTURES_ROOT / "embeddings/glove.6B.300d.sample.txt.gz"
            ),
            "embedding_dim": 300,
            "projection_dim": 20,
        }
    )
    layer = Embedding.from_params(params, vocab=vocab)
    # (batch, seq) ids -> (batch, seq, projection_dim)
    output = layer(torch.LongTensor([[3, 2, 1, 0]])).data.numpy()
    assert output.shape == (1, 4, 20)
    # Extra leading dimensions are preserved.
    output = layer(torch.LongTensor([[[3, 2, 1, 0]]])).data.numpy()
    assert output.shape == (1, 1, 4, 20)
示例6: test_embedding_layer_actually_initializes_word_vectors_correctly
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
    """Rows for tokens found in the pretrained file must match the file's vectors."""
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
    vocab.add_token_to_namespace(unicode_space)
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    # Write a tiny gzip'd pretrained file covering "word" and the unicode token.
    with gzip.open(embeddings_filename, "wb") as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))
        embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode("utf-8"))
    layer = Embedding.from_params(
        Params({"pretrained_file": embeddings_filename, "embedding_dim": 3}), vocab=vocab
    )
    weight = layer.weight.data
    assert numpy.allclose(
        weight[vocab.get_token_index("word")].numpy(), numpy.array([1.0, 2.3, -1.0])
    )
    assert numpy.allclose(
        weight[vocab.get_token_index(unicode_space)].numpy(), numpy.array([3.4, 3.3, 5.0])
    )
    # "word2" is absent from the file, so its row is randomly initialised.
    assert not numpy.allclose(
        weight[vocab.get_token_index("word2")].numpy(), numpy.array([1.0, 2.3, -1.0])
    )
示例7: test_embedding_vocab_extension_with_specified_namespace
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_embedding_vocab_extension_with_specified_namespace(self):
    """Extending vocab in an explicitly named namespace grows the weight by one row."""
    vocab = Vocabulary()
    for word in ("word1", "word2"):
        vocab.add_token_to_namespace(word, "tokens_a")
    embedder = Embedding.from_params(
        Params({"vocab_namespace": "tokens_a", "embedding_dim": 10}), vocab=vocab
    )
    weight_before = embedder.weight
    # 2 tokens plus padding and OOV entries.
    assert weight_before.shape[0] == 4
    vocab._extend({"tokens_a": {"word3": 1}})
    embedder.extend_vocab(vocab, "tokens_a")  # specified namespace
    weight_after = embedder.weight
    assert weight_after.shape[0] == 5
    # Pre-existing rows are left untouched by the extension.
    assert torch.all(weight_after[:4, :] == weight_before[:4, :])
示例8: test_embedding_vocab_extension_with_default_namespace
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_embedding_vocab_extension_with_default_namespace(self):
    """Extending the default "tokens" namespace grows the weight by one row."""
    vocab = Vocabulary()
    for word in ("word1", "word2"):
        vocab.add_token_to_namespace(word)
    embedder = Embedding.from_params(
        Params({"vocab_namespace": "tokens", "embedding_dim": 10}), vocab=vocab
    )
    weight_before = embedder.weight
    # 2 tokens plus padding and OOV entries.
    assert weight_before.shape[0] == 4
    vocab._extend({"tokens": {"word3": 1}})
    embedder.extend_vocab(vocab)  # default namespace
    weight_after = embedder.weight
    assert weight_after.shape[0] == 5
    # Pre-existing rows are left untouched by the extension.
    assert torch.all(weight_after[:4, :] == weight_before[:4, :])
示例9: test_embedding_vocab_extension_without_stored_namespace
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_embedding_vocab_extension_without_stored_namespace(self):
    """Extension still works when the embedder has no stored _vocab_namespace."""
    vocab = Vocabulary()
    for word in ("word1", "word2"):
        vocab.add_token_to_namespace(word, "tokens_a")
    embedder = Embedding.from_params(
        Params({"vocab_namespace": "tokens_a", "embedding_dim": 10}), vocab=vocab
    )
    # Previous models won't have _vocab_namespace attribute. Force it to be None
    embedder._vocab_namespace = None
    weight_before = embedder.weight
    # 2 tokens plus padding and OOV entries.
    assert weight_before.shape[0] == 4
    vocab._extend({"tokens_a": {"word3": 1}})
    embedder.extend_vocab(vocab, "tokens_a")  # specified namespace
    weight_after = embedder.weight
    assert weight_after.shape[0] == 5
    # Pre-existing rows are left untouched by the extension.
    assert torch.all(weight_after[:4, :] == weight_before[:4, :])
示例10: test_embedding_constructed_directly_with_pretrained_file
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_embedding_constructed_directly_with_pretrained_file(self):
    """Constructing Embedding directly (not via from_params) also loads pretrained rows."""
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
    vocab.add_token_to_namespace(unicode_space)
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    # Write a tiny gzip'd pretrained file covering "word" and the unicode token.
    with gzip.open(embeddings_filename, "wb") as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))
        embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode("utf-8"))
    layer = Embedding(
        embedding_dim=3,
        num_embeddings=vocab.get_vocab_size(),
        pretrained_file=embeddings_filename,
        vocab=vocab,
    )
    weight = layer.weight.data
    assert numpy.allclose(
        weight[vocab.get_token_index("word")].numpy(), numpy.array([1.0, 2.3, -1.0])
    )
    assert numpy.allclose(
        weight[vocab.get_token_index(unicode_space)].numpy(), numpy.array([3.4, 3.3, 5.0])
    )
    # "word2" is absent from the file, so its row is randomly initialised.
    assert not numpy.allclose(
        weight[vocab.get_token_index("word2")].numpy(), numpy.array([1.0, 2.3, -1.0])
    )
示例11: test_start_and_end_tokens
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_start_and_end_tokens(self):
    """start_tokens/end_tokens wrap the character sequences with <s> and </s>."""
    vocab = Vocabulary()
    # Characters get indices 2..10 in insertion order (0/1 are padding and OOV).
    for character in "Asentc<>/":
        vocab.add_token_to_namespace(character, namespace="characters")
    indexer = TokenCharactersIndexer(
        "characters", start_tokens=["<s>"], end_tokens=["</s>"], min_padding_length=1
    )
    indices = indexer.tokens_to_indices([Token("sentential")], vocab)
    # "i"/"a"/"l" are unseen, so they map to the OOV index 1.
    assert indices == {
        "token_characters": [[8, 3, 9], [3, 4, 5, 6, 4, 5, 6, 1, 1, 1], [8, 10, 3, 9]]
    }
示例12: test_as_array_produces_token_sequence_bert_cased_sentence_pair
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_as_array_produces_token_sequence_bert_cased_sentence_pair(self):
    """AllenNLP sentence-pair special-token handling matches raw HuggingFace output."""
    tokenizer = cached_transformers.get_tokenizer("bert-base-cased")
    allennlp_tokenizer = PretrainedTransformerTokenizer(
        "bert-base-cased", add_special_tokens=False
    )
    indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
    # Reference: tokenize the fully formatted pair directly with HuggingFace.
    default_format = "[CLS] AllenNLP is great! [SEP] Really it is! [SEP]"
    expected_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(default_format))
    first_sentence = allennlp_tokenizer.tokenize("AllenNLP is great!")
    second_sentence = allennlp_tokenizer.tokenize("Really it is!")
    allennlp_tokens = allennlp_tokenizer.add_special_tokens(first_sentence, second_sentence)
    indexed = indexer.tokens_to_indices(allennlp_tokens, Vocabulary())
    assert indexed["token_ids"] == expected_ids
示例13: test_transformers_vocab_sizes
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_transformers_vocab_sizes(self):
    """Indexing copies the full transformer vocab into the AllenNLP namespace."""
    def check_vocab_size(model_name: str):
        namespace = "tags"
        tokenizer = cached_transformers.get_tokenizer(model_name)
        allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
        indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace)
        tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
        vocab = Vocabulary()
        # Indexing has the side effect of copying the transformer vocab over;
        # the indexed result itself is irrelevant here.
        indexer.tokens_to_indices(tokens, vocab)
        assert vocab.get_vocab_size(namespace=namespace) == tokenizer.vocab_size

    for model_name in ("roberta-base", "bert-base-cased", "xlm-mlm-ende-1024"):
        check_vocab_size(model_name)
示例14: test_long_sequence_splitting
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_long_sequence_splitting(self):
    """With max_length=4, the indexer splits segments and re-inserts [SEP]/[CLS] pairs."""
    tokenizer = cached_transformers.get_tokenizer("bert-base-uncased")
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased", max_length=4)
    string_specials = "[CLS] AllenNLP is great [SEP]"
    string_no_specials = "AllenNLP is great"
    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(string_specials))
    assert len(ids) == 7  # just to make sure it's what we're expecting
    cls_id, sep_id = ids[0], ids[-1]
    # Each segment boundary gains a trailing [SEP] and a leading [CLS].
    expected_ids = ids[:3] + [sep_id, cls_id] + ids[3:5] + [sep_id, cls_id] + ids[5:]
    indexed = indexer.tokens_to_indices(
        allennlp_tokenizer.tokenize(string_no_specials), Vocabulary()
    )
    assert indexed["token_ids"] == expected_ids
    assert indexed["segment_concat_mask"] == [True] * len(expected_ids)
    assert indexed["mask"] == [True] * 7  # original length
示例15: test_indices_to_tokens
# 需要导入模块: from allennlp import data [as 别名]
# 或者: from allennlp.data import Vocabulary [as 别名]
def test_indices_to_tokens(self):
    """indices_to_tokens inverts tokens_to_indices (modulo max_length specials)."""
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
    indexer_max_length = PretrainedTransformerIndexer(
        model_name="bert-base-uncased", max_length=4
    )
    indexer_no_max_length = PretrainedTransformerIndexer(model_name="bert-base-uncased")
    tokens = allennlp_tokenizer.tokenize("AllenNLP is great")
    vocab = Vocabulary()
    # Without max_length the round trip is exact.
    round_tripped = indexer_no_max_length.indices_to_tokens(
        indexer_no_max_length.tokens_to_indices(tokens, vocab), vocab
    )
    self._assert_tokens_equal(tokens, round_tripped)
    round_tripped = indexer_max_length.indices_to_tokens(
        indexer_max_length.tokens_to_indices(tokens, vocab), vocab
    )
    # For now we are not removing special tokens introduced from max_length
    sep_cls = [tokens[-1], tokens[0]]
    expected = tokens[:3] + sep_cls + tokens[3:5] + sep_cls + tokens[5:]
    self._assert_tokens_equal(expected, round_tripped)