當前位置: 首頁>>代碼示例>>Python>>正文


Python util.remove_sentence_boundaries方法代碼示例

本文整理匯總了Python中allennlp.nn.util.remove_sentence_boundaries方法的典型用法代碼示例。如果您正苦於以下問題:Python util.remove_sentence_boundaries方法的具體用法?Python util.remove_sentence_boundaries怎麼用?Python util.remove_sentence_boundaries使用的例子?那麼,這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在模塊allennlp.nn.util的用法示例。


在下文中一共展示了util.remove_sentence_boundaries方法的9個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: test_remove_sentence_boundaries

# 需要導入模塊: from allennlp.nn import util [as 別名]
# 或者: from allennlp.nn.util import remove_sentence_boundaries [as 別名]
def test_remove_sentence_boundaries(self):
        """Verify ``util.remove_sentence_boundaries`` on a boolean mask.

        The first mask row has only two active positions, i.e. just
        "<S> </S>", which exercises the corner case of an empty sequence
        once both boundary tokens are removed.
        """
        boundary_mask = torch.from_numpy(
            numpy.array([[1, 1, 0, 0, 0], [1, 1, 1, 1, 1], [1, 1, 1, 1, 0]])
        ).bool()
        padded_input = torch.from_numpy(numpy.random.rand(3, 5, 7))

        stripped, stripped_mask = util.remove_sentence_boundaries(padded_input, boundary_mask)

        # Build the expected output: row 0 stays all zeros (empty sequence),
        # rows 1 and 2 keep their 3 and 2 interior positions respectively.
        wanted = torch.zeros(3, 3, 7)
        for row, kept in ((1, 3), (2, 2)):
            wanted[row, 0:kept, :] = padded_input[row, 1 : kept + 1, :]
        assert_array_almost_equal(stripped.data.numpy(), wanted.data.numpy())

        wanted_mask = torch.from_numpy(
            numpy.array([[0, 0, 0], [1, 1, 1], [1, 1, 0]])
        ).bool()
        mask_agreement = stripped_mask.data.numpy() == wanted_mask.data.numpy()
        assert mask_agreement.all()
開發者ID:allenai,項目名稱:allennlp,代碼行數:19,代碼來源:util_test.py

示例2: test_remove_sentence_boundaries

# 需要導入模塊: from allennlp.nn import util [as 別名]
# 或者: from allennlp.nn.util import remove_sentence_boundaries [as 別名]
def test_remove_sentence_boundaries(self):
        """Verify ``util.remove_sentence_boundaries`` on an integer (long) mask.

        The two-element mask row covers the corner case of an empty
        sequence: boundaries are being removed from "<S> </S>".
        """
        source = torch.from_numpy(numpy.random.rand(3, 5, 7))
        mask_rows = [[1, 1, 0, 0, 0],
                     [1, 1, 1, 1, 1],
                     [1, 1, 1, 1, 0]]
        source_mask = torch.from_numpy(numpy.array(mask_rows)).long()

        result, result_mask = util.remove_sentence_boundaries(source, source_mask)

        # Row 0 is expected to remain zero; the other rows keep only the
        # positions between their first and last active entries.
        expected = torch.zeros(3, 3, 7)
        expected[1, :3, :] = source[1, 1:4, :]
        expected[2, :2, :] = source[2, 1:3, :]
        assert_array_almost_equal(result.data.numpy(), expected.data.numpy())

        expected_mask_rows = [[0, 0, 0],
                              [1, 1, 1],
                              [1, 1, 0]]
        expected_mask = torch.from_numpy(numpy.array(expected_mask_rows)).long()
        mask_matches = result_mask.data.numpy() == expected_mask.data.numpy()
        assert mask_matches.all()
開發者ID:plasticityai,項目名稱:magnitude,代碼行數:23,代碼來源:util_test.py

示例3: test_elmo_token_representation

# 需要導入模塊: from allennlp.nn import util [as 別名]
# 或者: from allennlp.nn.util import remove_sentence_boundaries [as 別名]
def test_elmo_token_representation(self):
        """Compare the character encoder's token embeddings with the stored fixture.

        Words from ``vocab_test.txt`` are indexed, cut into 10 padded rows
        of 50 tokens, encoded, stripped of <S>/</S>, and checked against
        ``elmo_token_embeddings.hdf5``.
        """
        # Load the test words and turn them into tokens.
        vocab_path = os.path.join(self.elmo_fixtures_path, "vocab_test.txt")
        with open(vocab_path, "r") as vocab_file:
            words = vocab_file.read().strip().split("\n")
        tokens = [Token(word) for word in words]

        vocab = Vocabulary()
        indexer = ELMoTokenCharactersIndexer()
        char_ids = indexer.tokens_to_indices(tokens, vocab)["elmo_tokens"]

        # There are 457 tokens. Reshape into 10 batches of 50 tokens.
        padded_rows = [
            indexer.as_padded_tensor_dict(
                {"elmo_tokens": char_ids[start : start + 50]},
                padding_lengths={"elmo_tokens": 50},
            )["elmo_tokens"]
            for start in range(0, 500, 50)
        ]
        batch = torch.stack(padded_rows)

        encoder = _ElmoCharacterEncoder(self.options_file, self.weight_file)
        encoded = encoder(batch)

        # Drop <S> and </S>, then flatten back to one row per word so the
        # result lines up with the ground-truth matrix.
        without_boundaries, _ = remove_sentence_boundaries(
            encoded["token_embedding"], encoded["mask"]
        )
        actual = without_boundaries.data.numpy()
        actual = actual.reshape(-1, actual.shape[-1])

        fixture_path = os.path.join(self.elmo_fixtures_path, "elmo_token_embeddings.hdf5")
        with h5py.File(fixture_path, "r") as fixture:
            expected = fixture["embedding"][...]

        # Rows past len(tokens) come from padding and are ignored.
        assert numpy.allclose(actual[: len(tokens)], expected, atol=1e-6)
開發者ID:allenai,項目名稱:allennlp,代碼行數:38,代碼來源:elmo_test.py

示例4: batch_to_embeddings

# 需要導入模塊: from allennlp.nn import util [as 別名]
# 或者: from allennlp.nn.util import remove_sentence_boundaries [as 別名]
def batch_to_embeddings(self, batch):
        u"""
        Run the biLM over a batch of tokenized sentences and return its
        activations with the sentence-boundary positions removed.

        Parameters
        ----------
        batch : ``List[List[str]]``, required
            A list of tokenized sentences.

        Returns
        -------
        A tuple of tensors, the first representing activations
        (batch_size, 3, num_timesteps, 1024) and the second a mask
        (batch_size, num_timesteps).
        """
        character_ids = batch_to_ids(batch)
        if self.cuda_device >= 0:
            character_ids = character_ids.cuda(device=self.cuda_device)

        bilm_output = self.elmo_bilm(character_ids)
        layer_activations = bilm_output[u'activations']
        mask_with_bos_eos = bilm_output[u'mask']

        # Strip the boundary positions from every layer, collecting the
        # (batch_size, num_timesteps, dim) activations and the matching
        # (batch_size, num_timesteps) masks side by side.
        stripped_layers = []
        stripped_masks = []
        for layer in layer_activations:
            layer_without, mask_without = remove_sentence_boundaries(layer, mask_with_bos_eos)
            stripped_layers.append(layer_without)
            stripped_masks.append(mask_without)

        # Stack the per-layer activations along a new dimension 1.
        activations = torch.stack(stripped_layers, dim=1)
        # The mask is the same for each ELMo vector, so just take the first.
        mask = stripped_masks[0]

        return activations, mask
開發者ID:plasticityai,項目名稱:magnitude,代碼行數:33,代碼來源:elmo.py

示例5: test_elmo_token_representation

# 需要導入模塊: from allennlp.nn import util [as 別名]
# 或者: from allennlp.nn.util import remove_sentence_boundaries [as 別名]
def test_elmo_token_representation(self):
        """Compare the character encoder's token embeddings with the stored fixture.

        Words from ``vocab_test.txt`` are indexed, cut into 10 padded rows
        of 50 tokens, encoded, stripped of <S>/</S>, and checked against
        ``elmo_token_embeddings.hdf5``.
        """
        # Load the test words and turn them into tokens.
        vocab_path = os.path.join(self.elmo_fixtures_path, u'vocab_test.txt')
        with open(vocab_path, u'r') as vocab_file:
            words = vocab_file.read().strip().split(u'\n')
        tokens = [Token(word) for word in words]

        vocab = Vocabulary()
        indexer = ELMoTokenCharactersIndexer()
        char_ids = indexer.tokens_to_indices(tokens, vocab, u"elmo")[u"elmo"]

        # There are 457 tokens. Reshape into 10 batches of 50 tokens.
        padded_rows = [
            indexer.pad_token_sequence(
                {u'key': char_ids[start:start + 50]},
                desired_num_tokens={u'key': 50},
                padding_lengths={},
            )[u'key']
            for start in range(0, 500, 50)
        ]
        batch = torch.from_numpy(numpy.array(padded_rows))

        encoder = _ElmoCharacterEncoder(self.options_file, self.weight_file)
        encoded = encoder(batch)

        # Drop <S> and </S>, then flatten back to one row per word so the
        # result lines up with the ground-truth matrix.
        without_boundaries, _ = remove_sentence_boundaries(
            encoded[u'token_embedding'],
            encoded[u'mask'],
        )
        actual = without_boundaries.data.numpy()
        actual = actual.reshape(-1, actual.shape[-1])

        fixture_path = os.path.join(self.elmo_fixtures_path, u'elmo_token_embeddings.hdf5')
        with h5py.File(fixture_path, u'r') as fixture:
            expected = fixture[u'embedding'][...]

        # Rows past len(tokens) come from padding and are ignored.
        assert numpy.allclose(actual[:len(tokens)], expected, atol=1e-6)
開發者ID:plasticityai,項目名稱:magnitude,代碼行數:39,代碼來源:elmo_test.py

示例6: test_elmo_bilm

# 需要導入模塊: from allennlp.nn import util [as 別名]
# 或者: from allennlp.nn.util import remove_sentence_boundaries [as 別名]
def test_elmo_bilm(self):
        """Check the biLM's top-layer output against the stored reference embeddings.

        Batches of three sentences are fed through ``_ElmoBiLm``; after
        the sentence boundaries are removed, both the mask lengths and
        the top-layer activations must match the fixture data.
        """
        # Raw sentences and their reference embeddings.
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # The model under test.
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

        indexer = ELMoTokenCharactersIndexer()

        # One Instance (wrapping a single TextField) per sentence, taken
        # in the order produced by zipping the sentence groups together.
        instances = [
            Instance(
                {
                    "elmo": TextField(
                        [Token(word) for word in sentence.split()],
                        {"character_ids": indexer},
                    )
                }
            )
            for group in zip(*sentences)
            for sentence in group
        ]

        dataset = AllennlpDataset(instances, Vocabulary())
        # Iterate in batches of three instances.
        for batch_index, batch in enumerate(PyTorchDataLoader(dataset, 3)):
            lm_embeddings = elmo_bilm(batch["elmo"]["character_ids"]["elmo_tokens"])
            top_layer, mask = remove_sentence_boundaries(
                lm_embeddings["activations"][2], lm_embeddings["mask"]
            )

            # The mask row sums must equal the sentences' token counts.
            lengths = mask.data.numpy().sum(axis=1)
            expected_lengths = [
                len(sentences[k][batch_index].split()) for k in range(3)
            ]
            assert lengths.tolist() == expected_lengths

            # Compare the unpadded part of each row with its reference.
            references = [expected_lm_embeddings[k][batch_index] for k in range(3)]
            for k, reference in enumerate(references):
                actual = top_layer[k, : lengths[k], :].data.numpy()
                assert numpy.allclose(actual, reference, atol=1.0e-6)
開發者ID:allenai,項目名稱:allennlp,代碼行數:45,代碼來源:elmo_test.py

示例7: create_cached_cnn_embeddings

# 需要導入模塊: from allennlp.nn import util [as 別名]
# 或者: from allennlp.nn.util import remove_sentence_boundaries [as 別名]
def create_cached_cnn_embeddings(self, tokens):
        u"""
        Precompute word representations for a list of tokens by running
        only the character convolutions and highway layers of elmo, which
        yields uncontextual word vectors. On subsequent forward passes the
        word ids can then be looked up from an embedding instead of being
        computed on the fly via the CNN encoder.

        This function sets 3 attributes:

        _word_embedding : ``torch.Tensor``
            The word embedding for each word in the tokens passed to this method.
        _bos_embedding : ``torch.Tensor``
            The embedding for the BOS token.
        _eos_embedding : ``torch.Tensor``
            The embedding for the EOS token.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            A list of tokens to precompute character convolutions for.
        """
        # BOS and EOS go first so their embeddings end up in rows 0 and 1.
        all_tokens = [ELMoCharacterMapper.bos_token, ELMoCharacterMapper.eos_token] + tokens
        num_tokens = len(all_tokens)
        timesteps = 32
        batch_size = 32
        token_chunks = lazy_groups_of(iter(all_tokens), timesteps)

        device = get_device_of(next(self.parameters()))
        embedded_batches = []
        for chunk_batch in lazy_groups_of(token_chunks, batch_size):
            # Shape (batch_size, timesteps, 50)
            char_ids = batch_to_ids(chunk_batch)
            # NOTE: This device check is for when a user calls this method having
            # already placed the model on a device. If this is called in the
            # constructor, it will probably happen on the CPU. This isn't too bad,
            # because it's only a few convolutions and will likely be very fast.
            if device >= 0:
                char_ids = char_ids.cuda(device)
            encoded = self._token_embedder(char_ids)
            stripped, _ = remove_sentence_boundaries(
                encoded[u"token_embedding"], encoded[u"mask"]
            )
            # Flatten to one row per token position.
            embedded_batches.append(stripped.view(-1, stripped.size(-1)))

        # We might have some trailing embeddings from padding in the batch, so
        # we clip the embedding and lookup to the right size.
        full_embedding = torch.cat(embedded_batches, 0)[:num_tokens, :]
        # Everything after the BOS/EOS rows belongs to the caller's tokens.
        embedding = full_embedding[2:, :]
        vocab_size, embedding_dim = embedding.size()

        from allennlp.modules.token_embedders import Embedding # type: ignore
        self._bos_embedding = full_embedding[0, :]
        self._eos_embedding = full_embedding[1, :]
        self._word_embedding = Embedding(vocab_size, # type: ignore
                                         embedding_dim,
                                         weight=embedding.data,
                                         trainable=self._requires_grad,
                                         padding_index=0)
開發者ID:plasticityai,項目名稱:magnitude,代碼行數:61,代碼來源:elmo.py

示例8: test_elmo_bilm

# 需要導入模塊: from allennlp.nn import util [as 別名]
# 或者: from allennlp.nn.util import remove_sentence_boundaries [as 別名]
def test_elmo_bilm(self):
        """Check the biLM's top-layer output against the stored reference embeddings.

        Batches of three sentences are fed through ``_ElmoBiLm``; after
        the sentence boundaries are removed, both the mask lengths and
        the top-layer activations must match the fixture data.
        """
        # Raw sentences and their reference embeddings.
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # The model under test.
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

        indexer = ELMoTokenCharactersIndexer()

        # One Instance (wrapping a single TextField) per sentence, taken
        # in the order produced by zipping the sentence groups together.
        instances = [
            Instance({
                u"elmo": TextField(
                    [Token(word) for word in sentence.split()],
                    {u'character_ids': indexer},
                )
            })
            for group in izip(*sentences)
            for sentence in group
        ]

        vocab = Vocabulary()

        # Iterate once, unshuffled, in batches of three instances.
        iterator = BasicIterator(3)
        iterator.index_with(vocab)
        batches = iterator(instances, num_epochs=1, shuffle=False)
        for batch_index, batch in enumerate(batches):
            lm_embeddings = elmo_bilm(batch[u'elmo'][u'character_ids'])
            top_layer, mask = remove_sentence_boundaries(
                lm_embeddings[u'activations'][2],
                lm_embeddings[u'mask'],
            )

            # The mask row sums must equal the sentences' token counts.
            lengths = mask.data.numpy().sum(axis=1)
            expected_lengths = [
                len(sentences[k][batch_index].split()) for k in range(3)
            ]
            self.assertEqual(lengths.tolist(), expected_lengths)

            # Compare the unpadded part of each row with its reference.
            references = [expected_lm_embeddings[k][batch_index] for k in range(3)]
            for k, reference in enumerate(references):
                actual = top_layer[k, :lengths[k], :].data.numpy()
                self.assertTrue(numpy.allclose(actual, reference, atol=1.0e-6))
開發者ID:plasticityai,項目名稱:magnitude,代碼行數:51,代碼來源:elmo_test.py

示例9: forward

# 需要導入模塊: from allennlp.nn import util [as 別名]
# 或者: from allennlp.nn.util import remove_sentence_boundaries [as 別名]
def forward(self, inputs):
        """
        Run the biLM over a batch of character ids and return each layer's
        activations with the sentence-boundary positions removed.

        Parameters
        ----------
        inputs: ``torch.Tensor``, required.
            Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch.
            An input with more than three dimensions is flattened to
            ``(-1, timesteps, num_characters)`` before the biLM is run.

        Returns
        -------
        Dict with keys:
        ``'elmo_representations'``: ``List[torch.Tensor]``
            One tensor per biLM layer, each the ``self.output_dim``-wide slice of the
            concatenated activations after boundary removal.
        ``'mask'``: ``torch.Tensor``
            The mask returned by ``remove_sentence_boundaries``.
        """
        # reshape the input if needed
        original_shape = inputs.size()
        if len(original_shape) > 3:
            timesteps, num_characters = original_shape[-2:]
            reshaped_inputs = inputs.view(-1, timesteps, num_characters)
        else:
            reshaped_inputs = inputs

        # run the biLM
        bilm_output = self._elmo_lstm(reshaped_inputs, None)
        layer_activations = bilm_output['activations']
        mask_with_bos_eos = bilm_output['mask']
        num_layers = len(layer_activations)

        # Concatenate all layers on the last dimension so the boundaries
        # only have to be removed once for the whole stack.
        word_embedding_and_hiddens = torch.cat(layer_activations, dim=-1)
        assert self.output_dim * num_layers == word_embedding_and_hiddens.size(-1)

        processed_representation, processed_mask = remove_sentence_boundaries(
            word_embedding_and_hiddens, mask_with_bos_eos
        )

        # Split the concatenated tensor back into one slice per layer.
        # NOTE(review): when ``inputs`` had more than three dimensions, the
        # flattened leading dimensions are not restored on the outputs --
        # confirm that callers expect this.
        width = self.output_dim
        out_representations = [
            processed_representation[:, :, width * i : width * (i + 1)]
            for i in range(num_layers)
        ]

        return {'elmo_representations': out_representations, 'mask': processed_mask}
開發者ID:sz128,項目名稱:slot_filling_and_intent_detection_of_SLU,代碼行數:45,代碼來源:get_ELMo_word_embedding_for_a_dataset.py


注:本文中的allennlp.nn.util.remove_sentence_boundaries方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。