

Python util.END_SYMBOL attribute: code examples

This article collects typical usage examples of the allennlp.common.util.END_SYMBOL attribute in Python. If you are wondering what util.END_SYMBOL does, how to use it, or what real code that uses it looks like, the curated examples below should help. You can also explore further usage examples from the containing module, allennlp.common.util.


The following shows 13 code examples of the util.END_SYMBOL attribute, sorted by popularity by default.
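All of the examples below share the same basic pattern: a dataset reader appends END_SYMBOL (the string "@end@", together with START_SYMBOL, "@start@") to a tokenized sequence, and a model later resolves it to a vocabulary index so the decoder knows where a sequence ends. Here is a minimal sketch of that pattern (the sentence and vocabulary are illustrative, not taken from any one example):

from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data import Token, Vocabulary
from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter

# Wrap a tokenized sentence with the start/end markers, as most readers below do.
tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
tokens = tokenizer.tokenize("a short example sentence")
tokens.insert(0, Token(START_SYMBOL))
tokens.append(Token(END_SYMBOL))

# Models resolve END_SYMBOL to an index in the target vocabulary to detect the end
# of a decoded sequence; in practice the vocabulary is built from the dataset.
vocab = Vocabulary()
end_index = vocab.add_token_to_namespace(END_SYMBOL)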

Example 1: get_generator

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def get_generator(self,
                      model_path: str,
                      token_vocab_path: str,
                      stress_vocab_dump_path: str) -> Generator:
        if self.generator is None:
            assert os.path.isdir(model_path) and os.path.isdir(token_vocab_path)
            vocabulary = Vocabulary.from_files(token_vocab_path)
            stress_vocabulary = StressVocabulary()
            if not os.path.isfile(stress_vocab_dump_path):
                stress_vocabulary = inflate_stress_vocabulary(vocabulary, self.get_stress_predictor())
                stress_vocabulary.save(stress_vocab_dump_path)
            else:
                stress_vocabulary.load(stress_vocab_dump_path)

            eos_index = vocabulary.get_token_index(END_SYMBOL)
            unk_index = vocabulary.get_token_index(DEFAULT_OOV_TOKEN)
            exclude_transform = ExcludeTransform((unk_index, eos_index))

            model = LanguageModel.load(model_path, vocabulary_dir=token_vocab_path,
                                       transforms=[exclude_transform, ])
            self.generator = Generator(model, vocabulary, stress_vocabulary, eos_index)
        return self.generator 
Author: IlyaGusev | Project: rupo | Lines: 24 | Source file: api.py

Example 2: __init__

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 words_per_instance: int = 35
                ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer(
            start_tokens=[START_SYMBOL],
            end_tokens=[END_SYMBOL]
        )
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
        }

        self._words_per_instance = words_per_instance 
Author: dangitstam | Project: topic-rnn | Lines: 18 | Source file: imdb_review_reader.py

Example 3: text_to_instance

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def text_to_instance(self, text: str, sentences: List[str] = None, tags: List[int] = None) -> Instance:
        if sentences is None:
            if self._language == "ru":
                sentences = [s.text for s in razdel.sentenize(text)]
            else:
                sentences = nltk.tokenize.sent_tokenize(text)
        sentences_tokens = []
        for sentence in sentences[:self._max_sentences_count]:
            sentence = sentence.lower() if self._lowercase else sentence
            tokens = self._tokenizer.tokenize(sentence)[:self._sentence_max_tokens]
            tokens.insert(0, Token(START_SYMBOL))
            tokens.append(Token(END_SYMBOL))
            indexed_tokens = TextField(tokens, self._source_token_indexers)
            sentences_tokens.append(indexed_tokens)

        sentences_tokens_indexed = ListField(sentences_tokens)
        result = {'source_sentences': sentences_tokens_indexed}

        if tags:
            result["sentences_tags"] = SequenceLabelField(tags[:self._max_sentences_count], sentences_tokens_indexed)
        return Instance(result) 
Author: IlyaGusev | Project: summarus | Lines: 23 | Source file: summarization_sentence_tagger_reader.py

Example 4: detokenize

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def detokenize(array, vocab):
    """
    Given an array of ints, we'll turn this into a string or a list of strings.
    :param array: possibly multidimensional numpy array
    :return:
    """
    if array.ndim > 1:
        return [detokenize(x, vocab) for x in array]
    tokenized = [vocab.get_token_from_index(v) for v in array]
    return ' '.join([x for x in tokenized if x not in (vocab._padding_token, START_SYMBOL, END_SYMBOL)]) 
Author: yuweijiang | Project: HGL-pytorch | Lines: 12 | Source file: pytorch_misc.py
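A quick usage sketch for the detokenize helper above (the vocabulary contents are hypothetical; an AllenNLP 0.x-style API is assumed):

import numpy as np
from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data import Vocabulary

vocab = Vocabulary()
indices = [vocab.add_token_to_namespace(t) for t in (START_SYMBOL, "hello", "world", END_SYMBOL)]
# Padding, START_SYMBOL and END_SYMBOL are filtered out of the joined string.
print(detokenize(np.array(indices), vocab))  # -> "hello world"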

Example 5: text_to_instance

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def text_to_instance(self, source_string: str, target_string: str = None) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        tokenized_source = self._source_tokenizer.tokenize(source_string)
        if self._source_add_start_token:
            tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)
        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target, self._target_token_indexers)
            return Instance({u"source_tokens": source_field, u"target_tokens": target_field})
        else:
            return Instance({u'source_tokens': source_field}) 
Author: plasticityai | Project: magnitude | Lines: 17 | Source file: seq2seq.py

Example 6: __init__

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 target_namespace: str = "tokens",
                 target_embedding_dim: int = None,
                 attention_function: SimilarityFunction = None,
                 scheduled_sampling_ratio: float = 0.0) -> None:
        super(SimpleSeq2Seq, self).__init__(vocab)
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._attention_function = attention_function
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
        # we're using attention with ``DotProductSimilarity``, this is needed.
        self._decoder_output_dim = self._encoder.get_output_dim()
        target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
        self._target_embedder = Embedding(num_classes, target_embedding_dim)
        if self._attention_function:
            self._decoder_attention = LegacyAttention(self._attention_function)
            # The output of attention, a weighted average over encoder outputs, will be
            # concatenated to the input vector of the decoder at each time step.
            self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
        else:
            self._decoder_input_dim = target_embedding_dim
        # TODO (pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
        self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)

    #overrides 
Author: plasticityai | Project: magnitude | Lines: 41 | Source file: simple_seq2seq.py

Example 7: text_to_instance

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def text_to_instance(self, query_id:str, doc_id:str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ

        query_id_field = LabelField(int(query_id), skip_indexing=True)
        doc_id_field = LabelField(int(doc_id), skip_indexing=True)

        query_tokenized = self._tokenizer.tokenize(query_sequence)
        #if self._source_add_start_token:
        #    query_tokenized.insert(0, Token(START_SYMBOL))
        #query_tokenized.append(Token(END_SYMBOL))
        if self.max_query_length > -1:
            query_tokenized = query_tokenized[:self.max_query_length]

        query_field = TextField(query_tokenized, self._token_indexers)
        
        doc_tokenized = self._tokenizer.tokenize(doc_sequence)
        #doc_tokenized.insert(0, Token(START_SYMBOL))
        #doc_tokenized.append(Token(END_SYMBOL))
        if self.max_doc_length > -1:
            doc_tokenized = doc_tokenized[:self.max_doc_length]

        doc_field = TextField(doc_tokenized, self._token_indexers)

        query_length = LabelField(len(query_tokenized), skip_indexing=True)
        doc_length = LabelField(len(doc_tokenized), skip_indexing=True)

        return Instance({
            "query_id":query_id_field,
            "doc_id":doc_id_field,
            "query_tokens":query_field,
            "doc_tokens":doc_field,
            "query_length":query_length,
            "doc_length":doc_length}) 
Author: sebastian-hofstaetter | Project: sigir19-neural-ir | Lines: 35 | Source file: ir_labeled_tuple_loader.py

Example 8: text_to_instance

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def text_to_instance(self, query_sequence: str, doc_pos_sequence: str, doc_neg_sequence: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        query_tokenized = self._tokenizer.tokenize(query_sequence)
        #if self._source_add_start_token:
        #    query_tokenized.insert(0, Token(START_SYMBOL))
        #query_tokenized.append(Token(END_SYMBOL))
        if self.max_query_length > -1:
            query_tokenized = query_tokenized[:self.max_query_length]

        query_field = TextField(query_tokenized, self._token_indexers)
        
        doc_pos_tokenized = self._tokenizer.tokenize(doc_pos_sequence)
        #doc_pos_tokenized.insert(0, Token(START_SYMBOL))
        #doc_pos_tokenized.append(Token(END_SYMBOL))
        if self.max_doc_length > -1:
            doc_pos_tokenized = doc_pos_tokenized[:self.max_doc_length]

        doc_pos_field = TextField(doc_pos_tokenized, self._token_indexers)

        doc_neg_tokenized = self._tokenizer.tokenize(doc_neg_sequence)
        #doc_neg_tokenized.insert(0, Token(START_SYMBOL))
        #doc_neg_tokenized.append(Token(END_SYMBOL))
        if self.max_doc_length > -1:
            doc_neg_tokenized = doc_neg_tokenized[:self.max_doc_length]

        doc_neg_field = TextField(doc_neg_tokenized, self._token_indexers)

        query_length = LabelField(len(query_tokenized), skip_indexing=True)
        doc_pos_length = LabelField(len(doc_pos_tokenized), skip_indexing=True)
        doc_neg_length = LabelField(len(doc_neg_tokenized), skip_indexing=True)

        return Instance({
            "query_tokens":query_field,
            "doc_pos_tokens":doc_pos_field,
            "doc_neg_tokens": doc_neg_field,
            "query_length":query_length,
            "doc_pos_length":doc_pos_length,
            "doc_neg_length":doc_neg_length}) 
Author: sebastian-hofstaetter | Project: sigir19-neural-ir | Lines: 40 | Source file: ir_triple_loader.py

Example 9: text_to_instance

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def text_to_instance(self, query_id:str, doc_id:str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ

        query_id_field = MetadataField(query_id)
        doc_id_field = MetadataField(doc_id)

        query_tokenized = self._tokenizer.tokenize(query_sequence)
        #if self._source_add_start_token:
        #    query_tokenized.insert(0, Token(START_SYMBOL))
        #query_tokenized.append(Token(END_SYMBOL))
        if self.max_query_length > -1:
            query_tokenized = query_tokenized[:self.max_query_length]
        if self.min_query_length > -1 and len(query_tokenized) < self.min_query_length:
            query_tokenized = query_tokenized + [self.padding_value] * (self.min_query_length - len(query_tokenized))

        query_field = TextField(query_tokenized, self._token_indexers)
        
        doc_tokenized = self._tokenizer.tokenize(doc_sequence)
        #doc_tokenized.insert(0, Token(START_SYMBOL))
        #doc_tokenized.append(Token(END_SYMBOL))
        if self.max_doc_length > -1:
            doc_tokenized = doc_tokenized[:self.max_doc_length]
        if self.min_doc_length > -1 and len(doc_tokenized) < self.min_doc_length:
            doc_tokenized = doc_tokenized + [self.padding_value] * (self.min_doc_length - len(doc_tokenized))

        doc_field = TextField(doc_tokenized, self._token_indexers)

        return Instance({
            "query_id":query_id_field,
            "doc_id":doc_id_field,
            "query_tokens":query_field,
            "doc_tokens":doc_field}) 
Author: sebastian-hofstaetter | Project: transformer-kernel-ranking | Lines: 34 | Source file: ir_labeled_tuple_loader.py

Example 10: test_cnn_dailymail_reader

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def test_cnn_dailymail_reader(self):
        tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
        reader = CNNDailyMailReader(tokenizer, cnn_tokenized_dir=TEST_STORIES_DIR, separate_namespaces=False)
        dataset = reader.read(TEST_URLS_FILE)
        for sample in dataset:
            self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["source_tokens"]), 2)

            self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["target_tokens"]), 2) 
Author: IlyaGusev | Project: summarus | Lines: 14 | Source file: test_readers.py

Example 11: test_ria_reader

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def test_ria_reader(self):
        tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
        reader = RIAReader(tokenizer)
        dataset = reader.read(RIA_EXAMPLE_FILE)
        for sample in dataset:
            self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["source_tokens"]), 2)

            self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["target_tokens"]), 2) 
Author: IlyaGusev | Project: summarus | Lines: 14 | Source file: test_readers.py

Example 12: __init__

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def __init__(
            self,
            vocab: Vocabulary,
            decoder_net: DecoderNet,
            max_decoding_steps: int,
            target_embedder: Embedding,
            target_namespace: str = "tokens",
            tie_output_embedding: bool = False,
            scheduled_sampling_ratio: float = 0,
            label_smoothing_ratio: Optional[float] = None,
            beam_size: int = 4,
            tensor_based_metric: Metric = None,
            token_based_metric: Metric = None,
    ) -> None:
        super().__init__(target_embedder)

        self._vocab = vocab

        self._decoder_net = decoder_net
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._label_smoothing_ratio = label_smoothing_ratio

        self._start_index = self._vocab.get_token_index(START_SYMBOL, self._target_namespace)
        self._end_index = self._vocab.get_token_index(END_SYMBOL, self._target_namespace)
        self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)

        target_vocab_size = self._vocab.get_vocab_size(self._target_namespace)

        if self.target_embedder.get_output_dim() != self._decoder_net.target_embedding_dim:
            raise ConfigurationError("Target Embedder output_dim doesn't match decoder module's input.")

        self._output_projection_layer = Linear(self._decoder_net.get_output_dim(), target_vocab_size)

        if tie_output_embedding:
            if self._output_projection_layer.weight.shape != self.target_embedder.weight.shape:
                raise ConfigurationError("Can't tie embeddings with output linear layer, due to shape mismatch")
            self._output_projection_layer.weight = self.target_embedder.weight

        self._tensor_based_metric = tensor_based_metric
        self._token_based_metric = token_based_metric
        self._scheduled_sampling_ratio = scheduled_sampling_ratio 
Author: IlyaGusev | Project: summarus | Lines: 44 | Source file: custom_autoregressive_seq2seq_decoder.py

Example 13: text_to_instance

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def text_to_instance(self, source: str, target: str = None) -> Instance:
        def prepare_text(text, max_tokens):
            text = text.lower() if self._lowercase else text
            tokens = self._tokenizer.tokenize(text)[:max_tokens]
            tokens.insert(0, Token(START_SYMBOL))
            tokens.append(Token(END_SYMBOL))
            return tokens

        source_tokens = prepare_text(source, self._source_max_tokens)
        source_tokens_indexed = TextField(source_tokens, self._source_token_indexers)
        result = {'source_tokens': source_tokens_indexed}
        meta_fields = {}

        if self._save_copy_fields:
            source_to_target_field = NamespaceSwappingField(source_tokens[1:-1], self._target_namespace)
            result["source_to_target"] = source_to_target_field
            meta_fields["source_tokens"] = [x.text for x in source_tokens[1:-1]]

        if self._save_pgn_fields:
            source_to_target_field = NamespaceSwappingField(source_tokens, self._target_namespace)
            result["source_to_target"] = source_to_target_field
            meta_fields["source_tokens"] = [x.text for x in source_tokens]

        if target:
            target_tokens = prepare_text(target, self._target_max_tokens)
            target_tokens_indexed = TextField(target_tokens, self._target_token_indexers)
            result['target_tokens'] = target_tokens_indexed

            if self._save_pgn_fields:
                meta_fields["target_tokens"] = [y.text for y in target_tokens]
                source_and_target_token_ids = self._tokens_to_ids(source_tokens + target_tokens, self._lowercase)
                source_token_ids = source_and_target_token_ids[:len(source_tokens)]
                result["source_token_ids"] = ArrayField(np.array(source_token_ids, dtype='long'))
                target_token_ids = source_and_target_token_ids[len(source_tokens):]
                result["target_token_ids"] = ArrayField(np.array(target_token_ids, dtype='long'))

            if self._save_copy_fields:
                meta_fields["target_tokens"] = [y.text for y in target_tokens[1:-1]]
                source_and_target_token_ids = self._tokens_to_ids(source_tokens[1:-1] + target_tokens, self._lowercase)
                source_token_ids = source_and_target_token_ids[:len(source_tokens)-2]
                result["source_token_ids"] = ArrayField(np.array(source_token_ids))
                target_token_ids = source_and_target_token_ids[len(source_tokens)-2:]
                result["target_token_ids"] = ArrayField(np.array(target_token_ids))

        elif self._save_copy_fields:
            source_token_ids = self._tokens_to_ids(source_tokens[1:-1], self._lowercase)
            result["source_token_ids"] = ArrayField(np.array(source_token_ids))
        elif self._save_pgn_fields:
            source_token_ids = self._tokens_to_ids(source_tokens, self._lowercase)
            result["source_token_ids"] = ArrayField(np.array(source_token_ids))
        if self._save_copy_fields or self._save_pgn_fields:
            result["metadata"] = MetadataField(meta_fields)
        return Instance(result) 
Author: IlyaGusev | Project: summarus | Lines: 55 | Source file: summarization_reader.py


Note: The allennlp.common.util.END_SYMBOL examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by their original authors, who retain the copyright; consult each project's License before redistributing or reusing the code. Do not reproduce this article without permission.