

Python vocabulary.Vocabulary Class Code Examples

This article collects typical usage examples of the Python class allennlp.data.vocabulary.Vocabulary. If you are wondering what the Vocabulary class does, or how and where to use it, the curated examples below should help.


The 15 Vocabulary class code examples below are sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
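
Before diving into the examples, here is a minimal quick-start sketch written for this article (it is not one of the collected examples) showing the basic round trip between tokens and indices:

    from allennlp.data.vocabulary import Vocabulary

    # Build an empty vocabulary and register a token under the default
    # "tokens" namespace; add_token_to_namespace returns the token's index.
    vocab = Vocabulary()
    index = vocab.add_token_to_namespace("hello", namespace="tokens")

    # Map back and forth between tokens and indices.
    assert vocab.get_token_index("hello", namespace="tokens") == index
    assert vocab.get_token_from_index(index, namespace="tokens") == "hello"

    # The size includes the default padding and unknown tokens as well.
    print(vocab.get_vocab_size("tokens"))  # 3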

Example 1: test_from_instances_exclusive_embeddings_file_inside_archive

    def test_from_instances_exclusive_embeddings_file_inside_archive(self):
        """ Just for ensuring there are no problems when reading pretrained tokens from an archive """
        # Read embeddings file from archive
        archive_path = str(self.TEST_DIR / "embeddings-archive.zip")

        with zipfile.ZipFile(archive_path, 'w') as archive:
            file_path = 'embedding.3d.vec'
            with archive.open(file_path, 'w') as embeddings_file:
                embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
                embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))

            with archive.open('dummy.vec', 'w') as dummy_file:
                dummy_file.write("c 1.0 2.3 -1.0 3.0\n".encode('utf-8'))

        embeddings_file_uri = format_embeddings_file_uri(archive_path, file_path)
        vocab = Vocabulary.from_instances(self.dataset,
                                          min_count={'tokens': 4},
                                          pretrained_files={'tokens': embeddings_file_uri},
                                          only_include_pretrained_words=True)

        words = set(vocab.get_index_to_token_vocabulary().values())
        assert 'a' in words
        assert 'b' not in words
        assert 'c' not in words

        vocab = Vocabulary.from_instances(self.dataset,
                                          pretrained_files={'tokens': embeddings_file_uri},
                                          only_include_pretrained_words=True)
        words = set(vocab.get_index_to_token_vocabulary().values())
        assert 'a' in words
        assert 'b' in words
        assert 'c' not in words
Author: apmoore1, Project: allennlp, Lines: 32, Source: vocabulary_test.py
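
A side note on the helper used above: format_embeddings_file_uri packs the archive path and the member path into a single URI that the embedding loader can later resolve. Here is a tiny sketch of the expected shape; the import path and the "(archive)#member" format are assumptions based on AllenNLP 0.x, not something shown in the example itself:

    from allennlp.modules.token_embedders.embedding import format_embeddings_file_uri

    # Presumably yields something like "(/tmp/embeddings-archive.zip)#embedding.3d.vec".
    uri = format_embeddings_file_uri("/tmp/embeddings-archive.zip", "embedding.3d.vec")
    print(uri)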

Example 2: test_multilabel_field_empty_field_works

    def test_multilabel_field_empty_field_works(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
        vocab.add_token_to_namespace("label2", namespace="test_empty_labels")

        f = MultiLabelField([], label_namespace="test_empty_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))
Author: apmoore1, Project: allennlp, Lines: 9, Source: multilabel_field_test.py

Example 3: test_from_dataset_respects_max_vocab_size_single_int

    def test_from_dataset_respects_max_vocab_size_single_int(self):
        max_vocab_size = 1
        vocab = Vocabulary.from_instances(self.dataset, max_vocab_size=max_vocab_size)
        words = vocab.get_index_to_token_vocabulary().values()
        # Additional 2 tokens are '@@PADDING@@' and '@@UNKNOWN@@' by default
        assert len(words) == max_vocab_size + 2

        vocab = Vocabulary.from_instances(self.dataset, min_count=None)
        words = vocab.get_index_to_token_vocabulary().values()
        assert len(words) == 5
Author: apmoore1, Project: allennlp, Lines: 10, Source: vocabulary_test.py

Example 4: test_multilabel_field_can_index_with_vocab

    def test_multilabel_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("rel0", namespace="rel_labels")
        vocab.add_token_to_namespace("rel1", namespace="rel_labels")
        vocab.add_token_to_namespace("rel2", namespace="rel_labels")

        f = MultiLabelField(["rel1", "rel0"], label_namespace="rel_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([1, 1, 0]))
Author: apmoore1, Project: allennlp, Lines: 10, Source: multilabel_field_test.py

Example 5: test_unknown_token

    def test_unknown_token(self):
        # pylint: disable=protected-access
        # We're putting this behavior in a test so that the behavior is documented.  There is
        # solver code that depends in a small way on how we treat the unknown token, so any
        # breaking change to this behavior should break a test, so you know you've done something
        # that needs more consideration.
        vocab = Vocabulary()
        oov_token = vocab._oov_token
        oov_index = vocab.get_token_index(oov_token)
        assert oov_index == 1
        assert vocab.get_token_index("unseen word") == oov_index
Author: apmoore1, Project: allennlp, Lines: 11, Source: vocabulary_test.py
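
To complement the test above: a freshly constructed Vocabulary places its padding token at index 0 and its OOV token at index 1, which is why the assertion expects 1. A quick sketch using the same private attributes as the test:

    from allennlp.data.vocabulary import Vocabulary

    # pylint: disable=protected-access
    vocab = Vocabulary()
    print(vocab.get_token_index(vocab._padding_token))  # 0
    print(vocab.get_token_index(vocab._oov_token))      # 1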

Example 6: _get_vocab_index_mapping

    def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
        vocab_index_mapping: List[Tuple[int, int]] = []
        for index in range(self.vocab.get_vocab_size(namespace='tokens')):
            token = self.vocab.get_token_from_index(index=index, namespace='tokens')
            archived_token_index = archived_vocab.get_token_index(token, namespace='tokens')
            # Checking if we got the UNK token index, because we don't want all new token
            # representations initialized to UNK token's representation. We do that by checking if
            # the two tokens are the same. They will not be if the token at the archived index is
            # UNK.
            if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token:
                vocab_index_mapping.append((index, archived_token_index))
        return vocab_index_mapping
Author: pyknife, Project: allennlp, Lines: 12, Source: nlvr_coverage_semantic_parser.py
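
The mapping returned above is typically consumed by copying embedding rows for tokens the two vocabularies share, so that only genuinely new tokens fall back to a fresh initialization. A hypothetical sketch of that consumption (copy_shared_rows is an illustrative helper written for this article, not part of the parser):

    from typing import List, Tuple
    import torch

    def copy_shared_rows(new_weight: torch.Tensor,
                         archived_weight: torch.Tensor,
                         mapping: List[Tuple[int, int]]) -> None:
        # For every (new_index, archived_index) pair, reuse the archived row.
        with torch.no_grad():
            for new_index, archived_index in mapping:
                new_weight[new_index] = archived_weight[archived_index]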

Example 7: test_from_dataset_respects_min_count

    def test_from_dataset_respects_min_count(self):
        vocab = Vocabulary.from_instances(self.dataset, min_count={'tokens': 4})
        words = vocab.get_index_to_token_vocabulary().values()
        assert 'a' in words
        assert 'b' not in words
        assert 'c' not in words

        vocab = Vocabulary.from_instances(self.dataset, min_count=None)
        words = vocab.get_index_to_token_vocabulary().values()
        assert 'a' in words
        assert 'b' in words
        assert 'c' in words
Author: apmoore1, Project: allennlp, Lines: 12, Source: vocabulary_test.py

Example 8: __init__

    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 contextualizer: Seq2SeqEncoder,
                 dropout: float = None,
                 num_samples: int = None,
                 sparse_embeddings: bool = False,
                 bidirectional: bool = False,
                 initializer: InitializerApplicator = None) -> None:
        super().__init__(vocab)
        self._text_field_embedder = text_field_embedder

        if contextualizer.is_bidirectional() is not bidirectional:
            raise ConfigurationError(
                    "Bidirectionality of contextualizer must match bidirectionality of "
                    "language model. "
                    f"Contextualizer bidirectional: {contextualizer.is_bidirectional()}, "
                    f"language model bidirectional: {bidirectional}")

        self._contextualizer = contextualizer
        self._bidirectional = bidirectional

        # The dimension for making predictions just in the forward
        # (or backward) direction.
        if self._bidirectional:
            self._forward_dim = contextualizer.get_output_dim() // 2
        else:
            self._forward_dim = contextualizer.get_output_dim()

        # TODO(joelgrus): more sampled softmax configuration options, as needed.
        if num_samples is not None:
            self._softmax_loss = SampledSoftmaxLoss(num_words=vocab.get_vocab_size(),
                                                    embedding_dim=self._forward_dim,
                                                    num_samples=num_samples,
                                                    sparse=sparse_embeddings)
        else:
            self._softmax_loss = _SoftmaxLoss(num_words=vocab.get_vocab_size(),
                                              embedding_dim=self._forward_dim)

        # TODO(brendanr): Output perplexity here. e^loss
        self.register_buffer('_last_average_loss', torch.zeros(1))

        if dropout:
            self._dropout = torch.nn.Dropout(dropout)
        else:
            self._dropout = lambda x: x

        if initializer is not None:
            initializer(self)
Author: apmoore1, Project: allennlp, Lines: 49, Source: language_model.py

Example 9: tokens_to_indices

    def tokens_to_indices(self,
                          tokens: List[Token],
                          vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        dep_labels = [token.dep_ or 'NONE' for token in tokens]

        return {index_name: [vocabulary.get_token_index(dep_label, self.namespace) for dep_label in dep_labels]}
Author: apmoore1, Project: allennlp, Lines: 7, Source: dep_label_indexer.py
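
A small end-to-end sketch of how this indexer meets the Vocabulary; the DepLabelIndexer import path and its "dep_labels" default namespace are assumptions based on AllenNLP 0.x:

    from allennlp.data.token_indexers import DepLabelIndexer
    from allennlp.data.tokenizers import Token
    from allennlp.data.vocabulary import Vocabulary

    vocab = Vocabulary()
    vocab.add_token_to_namespace("nsubj", namespace="dep_labels")
    vocab.add_token_to_namespace("ROOT", namespace="dep_labels")

    # Token accepts a dep_ field directly, so no spaCy pipeline is needed here.
    tokens = [Token("I", dep_="nsubj"), Token("run", dep_="ROOT")]
    indexer = DepLabelIndexer()
    print(indexer.tokens_to_indices(tokens, vocab, "dep_tags"))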

Example 10: __init__

    def __init__(self,
                 vocab: Vocabulary,
                 sentence_embedder: TextFieldEmbedder,
                 action_embedding_dim: int,
                 encoder: Seq2SeqEncoder,
                 dropout: float = 0.0,
                 rule_namespace: str = 'rule_labels') -> None:
        super(NlvrSemanticParser, self).__init__(vocab=vocab)

        self._sentence_embedder = sentence_embedder
        self._denotation_accuracy = Average()
        self._consistency = Average()
        self._encoder = encoder
        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
        self._rule_namespace = rule_namespace

        self._action_embedder = Embedding(num_embeddings=vocab.get_vocab_size(self._rule_namespace),
                                          embedding_dim=action_embedding_dim)

        # This is what we pass as input in the first step of decoding, when we don't have a
        # previous action.
        self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
        torch.nn.init.normal_(self._first_action_embedding)
Author: ziaridoy20, Project: allennlp, Lines: 26, Source: nlvr_semantic_parser.py

Example 11: tokens_to_indices

    def tokens_to_indices(self,
                          tokens: List[Token],
                          vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        tags = ['NONE' if not token.ent_type_ else token.ent_type_ for token in tokens]

        return {index_name: [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}
Author: apmoore1, Project: allennlp, Lines: 7, Source: ner_tag_indexer.py

Example 12: __init__

    def __init__(self,
                 vocabulary: Vocabulary,
                 tag_namespace: str = "tags",
                 ignore_classes: List[str] = None) -> None:
        """
        Parameters
        ----------
        vocabulary : ``Vocabulary``, required.
            A vocabulary containing the tag namespace.
        tag_namespace : str, optional (default = "tags").
            This metric assumes that a BIO format is used in which the
            labels are of the format: ["B-LABEL", "I-LABEL"].
        ignore_classes : List[str], optional.
            Span labels which will be ignored when computing span metrics.
            A "span label" is the part that comes after the BIO label, so it
            would be "ARG1" for the tag "B-ARG1". For example, by passing

            ``ignore_classes=["V"]``

            the following sequence would not consider the "V" span at index (2, 3)
            when computing the precision, recall and F1 metrics:

            ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

            This is helpful, for instance, to avoid computing metrics for "V"
            spans in a BIO tagging scheme, which are typically not included.
        """
        self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace)
        self._ignore_classes: List[str] = ignore_classes or []

        # These will hold per label span counts.
        self._true_positives: Dict[str, int] = defaultdict(int)
        self._false_positives: Dict[str, int] = defaultdict(int)
        self._false_negatives: Dict[str, int] = defaultdict(int)
Author: Jordan-Sauchuk, Project: allennlp, Lines: 33, Source: span_based_f1_measure.py

Example 13: setUp

    def setUp(self):
        super(TestCopyNetReader, self).setUp()
        params = Params.from_file(self.FIXTURES_ROOT / "encoder_decoder" / "copynet_seq2seq" / "experiment.json")
        self.reader = DatasetReader.from_params(params["dataset_reader"])
        instances = self.reader.read(self.FIXTURES_ROOT / "data" / "copynet" / "copyover.tsv")
        self.instances = ensure_list(instances)
        self.vocab = Vocabulary.from_params(params=params["vocabulary"], instances=instances)
Author: apmoore1, Project: allennlp, Lines: 7, Source: copynet_test.py

Example 14: token_to_indices

    def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> int:
        if self._coarse_tags:
            tag = token.pos_
        else:
            tag = token.tag_
        if tag is None:
            tag = 'NONE'
        return vocabulary.get_token_index(tag, self._namespace)
Author: Jordan-Sauchuk, Project: allennlp, Lines: 8, Source: pos_tag_indexer.py

Example 15: __init__

    def __init__(self,
                 vocabulary: Vocabulary,
                 tag_namespace: str = "tags",
                 ignore_classes: List[str] = None,
                 label_encoding: Optional[str] = "BIO",
                 tags_to_spans_function: Optional[TAGS_TO_SPANS_FUNCTION_TYPE] = None) -> None:
        """
        Parameters
        ----------
        vocabulary : ``Vocabulary``, required.
            A vocabulary containing the tag namespace.
        tag_namespace : str, optional (default = "tags").
            This metric assumes that a BIO format is used in which the
            labels are of the format: ["B-LABEL", "I-LABEL"].
        ignore_classes : List[str], optional.
            Span labels which will be ignored when computing span metrics.
            A "span label" is the part that comes after the BIO label, so it
            would be "ARG1" for the tag "B-ARG1". For example, by passing

            ``ignore_classes=["V"]``

            the following sequence would not consider the "V" span at index (2, 3)
            when computing the precision, recall and F1 metrics:

            ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

            This is helpful, for instance, to avoid computing metrics for "V"
            spans in a BIO tagging scheme, which are typically not included.
        label_encoding : ``str``, optional (default = "BIO")
            The encoding used to specify label span endpoints in the sequence.
            Valid options are "BIO", "IOB1", "BIOUL" or "BMES".
        tags_to_spans_function : ``Callable``, optional (default = ``None``)
            If ``label_encoding`` is ``None``, ``tags_to_spans_function`` will be
            used to generate spans.
        """
        if label_encoding and tags_to_spans_function:
            raise ConfigurationError(
                    'Both label_encoding and tags_to_spans_function are provided. '
                    'Set "label_encoding=None" explicitly to enable tags_to_spans_function.'
                    )
        if label_encoding:
            if label_encoding not in ["BIO", "IOB1", "BIOUL", "BMES"]:
                raise ConfigurationError("Unknown label encoding - expected 'BIO', 'IOB1', 'BIOUL', 'BMES'.")
        elif tags_to_spans_function is None:
            raise ConfigurationError(
                    'At least one of the (label_encoding, tags_to_spans_function) should be provided.'
                    )

        self._label_encoding = label_encoding
        self._tags_to_spans_function = tags_to_spans_function
        self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace)
        self._ignore_classes: List[str] = ignore_classes or []

        # These will hold per label span counts.
        self._true_positives: Dict[str, int] = defaultdict(int)
        self._false_positives: Dict[str, int] = defaultdict(int)
        self._false_negatives: Dict[str, int] = defaultdict(int)
Author: apmoore1, Project: allennlp, Lines: 56, Source: span_based_f1_measure.py
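
To make the constructor above concrete, here is a hedged usage sketch: build a vocabulary over a BIO tag namespace, feed the metric one perfectly predicted sequence, and read the scores back. The one-hot trick merely fakes model logits, and torch.nn.functional.one_hot assumes a reasonably recent PyTorch:

    import torch
    from allennlp.data.vocabulary import Vocabulary
    from allennlp.training.metrics import SpanBasedF1Measure

    vocab = Vocabulary()
    for tag in ["O", "B-ARG1", "I-ARG1", "B-V", "I-V"]:
        vocab.add_token_to_namespace(tag, namespace="tags")

    metric = SpanBasedF1Measure(vocab, tag_namespace="tags", ignore_classes=["V"])

    # One batch of gold tags; the "logits" are one-hot vectors matching gold exactly.
    gold = torch.tensor([[vocab.get_token_index(t, "tags")
                          for t in ["O", "B-ARG1", "I-ARG1", "O", "B-V", "I-V"]]])
    predictions = torch.nn.functional.one_hot(
            gold, num_classes=vocab.get_vocab_size("tags")).float()

    metric(predictions, gold)
    # Expect perfect precision/recall/F1 for ARG1, with "V" spans ignored.
    print(metric.get_metric())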


Note: The allennlp.data.vocabulary.Vocabulary class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs; the snippets were selected from open-source projects contributed by many developers. Copyright in the source code remains with the original authors; before redistributing or using it, consult the corresponding project's License. Do not reproduce without permission.