This article collects typical usage examples of the Python class allennlp.data.vocabulary.Vocabulary. If you are unsure what Vocabulary is for, or how and where to use it, the curated class code examples below may help.
The following 15 code examples of the Vocabulary class are shown, ordered by popularity by default.
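Before diving into the examples, here is a minimal sketch of the core Vocabulary API that most of them rely on (one token-to-index mapping per namespace). This assumes the AllenNLP 0.x API that these examples were written against:

from allennlp.data.vocabulary import Vocabulary

# A Vocabulary keeps one token <-> index mapping per namespace.
vocab = Vocabulary()
index = vocab.add_token_to_namespace("hello", namespace="tokens")
assert vocab.get_token_index("hello", namespace="tokens") == index
assert vocab.get_token_from_index(index, namespace="tokens") == "hello"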
Example 1: test_from_instances_exclusive_embeddings_file_inside_archive
def test_from_instances_exclusive_embeddings_file_inside_archive(self):
    """Just for ensuring there are no problems when reading pretrained tokens from an archive."""
    # Write an embeddings file into an archive.
    archive_path = str(self.TEST_DIR / "embeddings-archive.zip")

    with zipfile.ZipFile(archive_path, 'w') as archive:
        file_path = 'embedding.3d.vec'
        with archive.open(file_path, 'w') as embeddings_file:
            embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
            embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))
        with archive.open('dummy.vec', 'w') as dummy_file:
            dummy_file.write("c 1.0 2.3 -1.0 3.0\n".encode('utf-8'))

    embeddings_file_uri = format_embeddings_file_uri(archive_path, file_path)

    vocab = Vocabulary.from_instances(self.dataset,
                                      min_count={'tokens': 4},
                                      pretrained_files={'tokens': embeddings_file_uri},
                                      only_include_pretrained_words=True)
    words = set(vocab.get_index_to_token_vocabulary().values())
    assert 'a' in words
    assert 'b' not in words
    assert 'c' not in words

    vocab = Vocabulary.from_instances(self.dataset,
                                      pretrained_files={'tokens': embeddings_file_uri},
                                      only_include_pretrained_words=True)
    words = set(vocab.get_index_to_token_vocabulary().values())
    assert 'a' in words
    assert 'b' in words
    assert 'c' not in words
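Several tests in this collection reference a self.dataset fixture that the excerpt does not show. A purely hypothetical reconstruction, consistent with the assertions here and in Examples 3 and 7 ('a' occurs at least four times, 'b' and 'c' fewer, five vocabulary entries in total), could be:

from allennlp.data import Instance, Token
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

# Hypothetical fixture: 'a' x4, 'b' x2, 'c' x3, so min_count={'tokens': 4}
# keeps only 'a', and the full vocabulary is a, b, c plus the two default
# special tokens (five entries).
tokens = [Token(t) for t in ["a", "a", "a", "a", "b", "b", "c", "c", "c"]]
text_field = TextField(tokens, {"tokens": SingleIdTokenIndexer("tokens")})
dataset = [Instance({"text": text_field})]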
Example 2: test_multilabel_field_empty_field_works
def test_multilabel_field_empty_field_works(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
    vocab.add_token_to_namespace("label2", namespace="test_empty_labels")
    f = MultiLabelField([], label_namespace="test_empty_labels")
    f.index(vocab)
    tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))
Example 3: test_from_dataset_respects_max_vocab_size_single_int
def test_from_dataset_respects_max_vocab_size_single_int(self):
    max_vocab_size = 1
    vocab = Vocabulary.from_instances(self.dataset, max_vocab_size=max_vocab_size)
    words = vocab.get_index_to_token_vocabulary().values()
    # The 2 additional tokens are the default '@@PADDING@@' and '@@UNKNOWN@@'.
    assert len(words) == max_vocab_size + 2

    vocab = Vocabulary.from_instances(self.dataset, min_count=None)
    words = vocab.get_index_to_token_vocabulary().values()
    assert len(words) == 5
Example 4: test_multilabel_field_can_index_with_vocab
def test_multilabel_field_can_index_with_vocab(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("rel0", namespace="rel_labels")
    vocab.add_token_to_namespace("rel1", namespace="rel_labels")
    vocab.add_token_to_namespace("rel2", namespace="rel_labels")
    f = MultiLabelField(["rel1", "rel0"], label_namespace="rel_labels")
    f.index(vocab)
    tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([1, 1, 0]))
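The resulting tensor is a multi-hot vector over the rel_labels namespace: rel0 and rel1 were added first (indices 0 and 1) so their positions are set, while rel2 (index 2) stays 0, which is exactly what the [1, 1, 0] assertion checks.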
Example 5: test_unknown_token
def test_unknown_token(self):
    # pylint: disable=protected-access
    # We're putting this behavior in a test so that the behavior is documented. There is
    # solver code that depends in a small way on how we treat the unknown token, so any
    # breaking change to this behavior should break a test, so you know you've done something
    # that needs more consideration.
    vocab = Vocabulary()
    oov_token = vocab._oov_token
    oov_index = vocab.get_token_index(oov_token)
    assert oov_index == 1
    assert vocab.get_token_index("unseen word") == oov_index
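As a companion to this test, a quick sketch of both default special tokens; this assumes the AllenNLP 0.x defaults, where the padding token sits at index 0 and the OOV token at index 1:

vocab = Vocabulary()
# Index 0 holds the default padding token, index 1 the default OOV token.
assert vocab.get_token_from_index(0) == "@@PADDING@@"
assert vocab.get_token_from_index(1) == "@@UNKNOWN@@"
# Any unseen word maps to the OOV index.
assert vocab.get_token_index("never seen") == 1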
Example 6: _get_vocab_index_mapping
def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
    vocab_index_mapping: List[Tuple[int, int]] = []
    for index in range(self.vocab.get_vocab_size(namespace='tokens')):
        token = self.vocab.get_token_from_index(index=index, namespace='tokens')
        archived_token_index = archived_vocab.get_token_index(token, namespace='tokens')
        # Checking if we got the UNK token index, because we don't want all new token
        # representations initialized to UNK token's representation. We do that by checking if
        # the two tokens are the same. They will not be if the token at the archived index is
        # UNK.
        if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token:
            vocab_index_mapping.append((index, archived_token_index))
    return vocab_index_mapping
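A hedged sketch of how such a mapping is typically consumed when warm-starting from an archived model. new_weight and archived_weight are hypothetical embedding weight tensors of shape (vocab_size, embedding_dim), not names from the source:

# Copy pretrained rows for every token the two vocabularies share,
# leaving genuinely new tokens at their fresh initialization.
for new_index, archived_index in vocab_index_mapping:
    new_weight[new_index] = archived_weight[archived_index]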
Example 7: test_from_dataset_respects_min_count
def test_from_dataset_respects_min_count(self):
    vocab = Vocabulary.from_instances(self.dataset, min_count={'tokens': 4})
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' not in words
    assert 'c' not in words

    vocab = Vocabulary.from_instances(self.dataset, min_count=None)
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' in words
    assert 'c' in words
Example 8: __init__
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             contextualizer: Seq2SeqEncoder,
             dropout: float = None,
             num_samples: int = None,
             sparse_embeddings: bool = False,
             bidirectional: bool = False,
             initializer: InitializerApplicator = None) -> None:
    super().__init__(vocab)
    self._text_field_embedder = text_field_embedder

    if contextualizer.is_bidirectional() is not bidirectional:
        raise ConfigurationError(
                "Bidirectionality of contextualizer must match bidirectionality of "
                "language model. "
                f"Contextualizer bidirectional: {contextualizer.is_bidirectional()}, "
                f"language model bidirectional: {bidirectional}")

    self._contextualizer = contextualizer
    self._bidirectional = bidirectional

    # The dimension for making predictions just in the forward
    # (or backward) direction.
    if self._bidirectional:
        self._forward_dim = contextualizer.get_output_dim() // 2
    else:
        self._forward_dim = contextualizer.get_output_dim()

    # TODO(joelgrus): more sampled softmax configuration options, as needed.
    if num_samples is not None:
        self._softmax_loss = SampledSoftmaxLoss(num_words=vocab.get_vocab_size(),
                                                embedding_dim=self._forward_dim,
                                                num_samples=num_samples,
                                                sparse=sparse_embeddings)
    else:
        self._softmax_loss = _SoftmaxLoss(num_words=vocab.get_vocab_size(),
                                          embedding_dim=self._forward_dim)

    # TODO(brendanr): Output perplexity here. e^loss
    self.register_buffer('_last_average_loss', torch.zeros(1))

    if dropout:
        self._dropout = torch.nn.Dropout(dropout)
    else:
        self._dropout = lambda x: x

    if initializer is not None:
        initializer(self)
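Note the pattern used for dropout here: when no dropout is configured, self._dropout is bound to an identity lambda, so the forward pass can call self._dropout(...) unconditionally instead of branching on the configuration. Example 10 below uses the same trick.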
Example 9: tokens_to_indices
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:
    dep_labels = [token.dep_ or 'NONE' for token in tokens]
    return {index_name: [vocabulary.get_token_index(dep_label, self.namespace)
                         for dep_label in dep_labels]}
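Note that token.dep_ is the dependency relation assigned by spaCy, so this indexer assumes the tokens came from a spaCy pipeline with the parser enabled; tokens without a dependency label fall back to the 'NONE' placeholder.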
Example 10: __init__
def __init__(self,
             vocab: Vocabulary,
             sentence_embedder: TextFieldEmbedder,
             action_embedding_dim: int,
             encoder: Seq2SeqEncoder,
             dropout: float = 0.0,
             rule_namespace: str = 'rule_labels') -> None:
    super(NlvrSemanticParser, self).__init__(vocab=vocab)
    self._sentence_embedder = sentence_embedder
    self._denotation_accuracy = Average()
    self._consistency = Average()
    self._encoder = encoder
    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
    self._rule_namespace = rule_namespace

    self._action_embedder = Embedding(num_embeddings=vocab.get_vocab_size(self._rule_namespace),
                                      embedding_dim=action_embedding_dim)

    # This is what we pass as input in the first step of decoding, when we don't have a
    # previous action.
    self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
    torch.nn.init.normal_(self._first_action_embedding)
Example 11: tokens_to_indices
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:
    tags = ['NONE' if not token.ent_type_ else token.ent_type_ for token in tokens]
    return {index_name: [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}
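As in Example 9, token.ent_type_ is populated by spaCy's named-entity recognizer, so tokens from a pipeline without NER, or tokens outside any entity, map to the 'NONE' tag.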
Example 12: __init__
def __init__(self,
             vocabulary: Vocabulary,
             tag_namespace: str = "tags",
             ignore_classes: List[str] = None) -> None:
    """
    Parameters
    ----------
    vocabulary : ``Vocabulary``, required.
        A vocabulary containing the tag namespace.
    tag_namespace : str, required.
        This metric assumes that a BIO format is used in which the
        labels are of the format: ["B-LABEL", "I-LABEL"].
    ignore_classes : List[str], optional.
        Span labels which will be ignored when computing span metrics.
        A "span label" is the part that comes after the BIO label, so it
        would be "ARG1" for the tag "B-ARG1". For example, by passing:

        ``ignore_classes=["V"]``

        the following sequence would not consider the "V" span at index (2, 3)
        when computing the precision, recall and F1 metrics:

        ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

        This is helpful, for instance, to avoid computing metrics for "V"
        spans in a BIO tagging scheme which are typically not included.
    """
    self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace)
    self._ignore_classes: List[str] = ignore_classes or []

    # These will hold per label span counts.
    self._true_positives: Dict[str, int] = defaultdict(int)
    self._false_positives: Dict[str, int] = defaultdict(int)
    self._false_negatives: Dict[str, int] = defaultdict(int)
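To make the ignore_classes behavior concrete, here is a small sketch using AllenNLP's BIO span utility; the import path assumes the 0.x package layout:

from allennlp.data.dataset_readers.dataset_utils.span_utils import bio_tags_to_spans

tags = ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]
# With "V" ignored, only the ARG1 span (inclusive indices 4-5) survives.
spans = bio_tags_to_spans(tags, classes_to_ignore=["V"])
assert spans == [("ARG1", (4, 5))]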
Example 13: setUp
def setUp(self):
    super(TestCopyNetReader, self).setUp()
    params = Params.from_file(self.FIXTURES_ROOT / "encoder_decoder" / "copynet_seq2seq" / "experiment.json")
    self.reader = DatasetReader.from_params(params["dataset_reader"])
    instances = self.reader.read(self.FIXTURES_ROOT / "data" / "copynet" / "copyover.tsv")
    self.instances = ensure_list(instances)
    self.vocab = Vocabulary.from_params(params=params["vocabulary"], instances=instances)
Example 14: token_to_indices
def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> int:
    if self._coarse_tags:
        tag = token.pos_
    else:
        tag = token.tag_
    if tag is None:
        tag = 'NONE'
    return vocabulary.get_token_index(tag, self._namespace)
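Here token.pos_ is spaCy's coarse-grained universal POS tag (e.g. NOUN) and token.tag_ is the fine-grained treebank tag (e.g. NN), so the _coarse_tags flag simply selects which granularity gets indexed.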
Example 15: __init__
def __init__(self,
             vocabulary: Vocabulary,
             tag_namespace: str = "tags",
             ignore_classes: List[str] = None,
             label_encoding: Optional[str] = "BIO",
             tags_to_spans_function: Optional[TAGS_TO_SPANS_FUNCTION_TYPE] = None) -> None:
    """
    Parameters
    ----------
    vocabulary : ``Vocabulary``, required.
        A vocabulary containing the tag namespace.
    tag_namespace : str, required.
        This metric assumes that a BIO format is used in which the
        labels are of the format: ["B-LABEL", "I-LABEL"].
    ignore_classes : List[str], optional.
        Span labels which will be ignored when computing span metrics.
        A "span label" is the part that comes after the BIO label, so it
        would be "ARG1" for the tag "B-ARG1". For example, by passing:

        ``ignore_classes=["V"]``

        the following sequence would not consider the "V" span at index (2, 3)
        when computing the precision, recall and F1 metrics:

        ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

        This is helpful, for instance, to avoid computing metrics for "V"
        spans in a BIO tagging scheme which are typically not included.
    label_encoding : ``str``, optional (default = "BIO")
        The encoding used to specify label span endpoints in the sequence.
        Valid options are "BIO", "IOB1", "BIOUL" or "BMES".
    tags_to_spans_function : ``Callable``, optional (default = ``None``)
        If ``label_encoding`` is ``None``, ``tags_to_spans_function`` will be
        used to generate spans.
    """
    if label_encoding and tags_to_spans_function:
        raise ConfigurationError(
                'Both label_encoding and tags_to_spans_function are provided. '
                'Set "label_encoding=None" explicitly to enable tags_to_spans_function.'
        )
    if label_encoding:
        if label_encoding not in ["BIO", "IOB1", "BIOUL", "BMES"]:
            raise ConfigurationError("Unknown label encoding - expected 'BIO', 'IOB1', 'BIOUL', 'BMES'.")
    elif tags_to_spans_function is None:
        raise ConfigurationError(
                'At least one of the (label_encoding, tags_to_spans_function) should be provided.'
        )
    self._label_encoding = label_encoding
    self._tags_to_spans_function = tags_to_spans_function
    self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace)
    self._ignore_classes: List[str] = ignore_classes or []

    # These will hold per label span counts.
    self._true_positives: Dict[str, int] = defaultdict(int)
    self._false_positives: Dict[str, int] = defaultdict(int)
    self._false_negatives: Dict[str, int] = defaultdict(int)
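This signature matches AllenNLP's span-based F1 metric. Assuming the class is SpanBasedF1Measure (a guess from the signature, not stated in this excerpt), construction would look like:

from allennlp.training.metrics import SpanBasedF1Measure  # assumed class name

vocab = Vocabulary()
for tag in ["O", "B-ARG1", "I-ARG1", "B-V", "I-V"]:
    vocab.add_token_to_namespace(tag, namespace="tags")
# Ignore "V" spans, as in the docstring's example.
metric = SpanBasedF1Measure(vocab, tag_namespace="tags", ignore_classes=["V"])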