当前位置: 首页>>代码示例>>Python>>正文


Python token.Token类代码示例

本文整理汇总了Python中allennlp.data.tokenizers.token.Token类的典型用法代码示例。如果您正苦于以下问题:Python token.Token类的具体用法?Python token.Token怎么用?Python token.Token使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块allennlp.data.tokenizers.token的用法示例。


在下文中一共展示了token.Token类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: text_to_instance

# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def text_to_instance(self,  # type: ignore
                         tokens: List[str],
                         entity_1: Tuple[int],
                         entity_2: Tuple[int],
                         label: str = None) -> Instance:
        """
        Build an `Instance` whose text is the two entity mentions followed by the sentence.

        Every token is first passed through `OpenAISplitter._standardize`. The
        resulting sequence is laid out as: `__start__`, the first entity's tokens,
        `__del1__`, the second entity's tokens, `__del2__`, the whole sentence,
        `__clf__`.

        # Parameters

        tokens : `List[str]`
            The words of the sentence.
        entity_1 : `Tuple[int]`
            Positions 0 and 1 are read as the start and (inclusive) end index of the
            first entity within `tokens`.
        entity_2 : `Tuple[int]`
            Same as `entity_1`, for the second entity.
        label : `str`, optional
            When truthy, it is stored in a `LabelField` under the key `label`.

        # Returns

        An `Instance` holding a `sentence` text field and, if given, a `label` field.
        """
        # pylint: disable=arguments-differ
        standardized = [OpenAISplitter._standardize(word) for word in tokens]

        # Entity end indices are inclusive, hence the +1 on each slice bound.
        first_mention = standardized[entity_1[0]:entity_1[1] + 1]
        second_mention = standardized[entity_2[0]:entity_2[1] + 1]

        sequence = [
            '__start__',
            *first_mention,
            '__del1__',
            *second_mention,
            '__del2__',
            *standardized,
            '__clf__',
        ]

        sentence = TextField([Token(text=piece) for piece in sequence], self._token_indexers)
        fields: Dict[str, Field] = {'sentence': sentence}

        if label:
            fields['label'] = LabelField(label)

        return Instance(fields)
开发者ID:DFKI-NLP,项目名称:DISTRE,代码行数:22,代码来源:semeval_2010_task_8_reader.py

示例2: tokens_to_indices

# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
        """
        Index word-level tokens by splitting each of them into wordpieces.

        # Parameters

        tokens : `List[Token]`
            Word-level tokens; only their `text` is read.
        vocabulary : `Vocabulary`
            Handed to the matched indexer's `_add_encoding_to_vocabulary_if_needed`.

        # Returns

        The dictionary of wordpiece ids, type ids, word-level offsets and the two
        masks, after it has gone through the matched indexer's `_postprocess_output`.
        """
        matched_indexer = self._matched_indexer
        matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)

        words = [token.text for token in tokens]
        wordpieces, raw_offsets = self._allennlp_tokenizer.intra_word_tokenize(words)

        # A word that produced no wordpieces has no offsets (None). It is mapped to
        # (-1, -1), which results in an all-zero embedding for that token.
        offsets = [(-1, -1) if span is None else span for span in raw_offsets]

        word_level_mask = [True] * len(tokens)
        wordpiece_level_mask = [True] * len(wordpieces)

        output: IndexedTokenList = {
            "token_ids": [piece.text_id for piece in wordpieces],
            "mask": word_level_mask,
            "type_ids": [piece.type_id for piece in wordpieces],
            "offsets": offsets,
            "wordpiece_mask": wordpiece_level_mask,
        }

        return matched_indexer._postprocess_output(output)
开发者ID:allenai,项目名称:allennlp,代码行数:20,代码来源:pretrained_transformer_mismatched_indexer.py

示例3: __init__

# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def __init__(
        self,
        namespace: Optional[str] = "tokens",
        lowercase_tokens: bool = False,
        start_tokens: List[str] = None,
        end_tokens: List[str] = None,
        feature_name: str = "text",
        default_value: str = _DEFAULT_VALUE,
        token_min_padding_length: int = 0,
    ) -> None:
        """
        Store the indexer configuration.

        # Parameters

        namespace : `Optional[str]`
            Kept on `self.namespace`.
        lowercase_tokens : `bool`
            Kept on `self.lowercase_tokens`.
        start_tokens : `List[str]`, optional
            Each string is wrapped in a `Token`; `None` (or an empty list) means none.
        end_tokens : `List[str]`, optional
            Handled like `start_tokens`.
        feature_name : `str`
            Kept on `self._feature_name`.
        default_value : `str`
            Kept on `self._default_value`.
        token_min_padding_length : `int`
            Forwarded to the parent constructor.
        """
        super().__init__(token_min_padding_length)

        # Values that are stored exactly as they were passed in.
        self.namespace = namespace
        self.lowercase_tokens = lowercase_tokens
        self._feature_name = feature_name
        self._default_value = default_value

        # Boundary markers arrive as plain strings and are kept as Token objects.
        self._start_tokens = list(map(Token, start_tokens or []))
        self._end_tokens = list(map(Token, end_tokens or []))
开发者ID:allenai,项目名称:allennlp,代码行数:20,代码来源:single_id_token_indexer.py

示例4: __init__

# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def __init__(
        self,
        namespace: str = "token_characters",
        character_tokenizer: CharacterTokenizer = CharacterTokenizer(),
        start_tokens: List[str] = None,
        end_tokens: List[str] = None,
        min_padding_length: int = 0,
        token_min_padding_length: int = 0,
    ) -> None:
        """
        Store the indexer configuration, warning when `min_padding_length` is left at 0.

        # Parameters

        namespace : `str`
            Kept on `self._namespace`.
        character_tokenizer : `CharacterTokenizer`
            Kept on `self._character_tokenizer`.
        start_tokens : `List[str]`, optional
            Each string is wrapped in a `Token`; `None` (or an empty list) means none.
        end_tokens : `List[str]`, optional
            Handled like `start_tokens`.
        min_padding_length : `int`
            Kept on `self._min_padding_length`. A value of 0 triggers a `UserWarning`.
        token_min_padding_length : `int`
            Forwarded to the parent constructor.
        """
        super().__init__(token_min_padding_length)

        if min_padding_length == 0:
            issue_url = "https://github.com/allenai/allennlp/issues/1954"
            message = (
                "You are using the default value (0) of `min_padding_length`, "
                f"which can cause some subtle bugs (more info see {issue_url}). "
                "Strongly recommend to set a value, usually the maximum size "
                "of the convolutional layer size when using CnnEncoder."
            )
            warnings.warn(message, UserWarning)

        self._namespace = namespace
        self._character_tokenizer = character_tokenizer
        self._min_padding_length = min_padding_length

        # Boundary markers arrive as plain strings and are kept as Token objects.
        self._start_tokens = list(map(Token, start_tokens or []))
        self._end_tokens = list(map(Token, end_tokens or []))
开发者ID:allenai,项目名称:allennlp,代码行数:27,代码来源:token_characters_indexer.py

示例5: tokens_to_indices

# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary
    ) -> Dict[str, List[List[int]]]:
        """
        Turn every token into the list of indices of its characters.

        The configured start tokens are processed first, then `tokens`, then the
        configured end tokens.

        # Parameters

        tokens : `List[Token]`
            Tokens whose `text` is split by the character tokenizer.
        vocabulary : `Vocabulary`
            Used to look up characters that carry no `text_id` of their own.

        # Returns

        A dictionary with the single key `token_characters`, holding one list of
        character indices per token.

        # Raises

        ConfigurationError
            If a token has no text.
        """

        def index_of(character):
            # A `text_id` set on the character means the vocabulary is bypassed and
            # that id is used directly.
            if getattr(character, "text_id", None) is None:
                return vocabulary.get_token_index(character.text, self._namespace)
            return character.text_id

        per_token_indices: List[List[int]] = []
        for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
            if token.text is None:
                raise ConfigurationError(
                    "TokenCharactersIndexer needs a tokenizer that retains text"
                )
            characters = self._character_tokenizer.tokenize(token.text)
            per_token_indices.append([index_of(character) for character in characters])

        return {"token_characters": per_token_indices}
开发者ID:allenai,项目名称:allennlp,代码行数:22,代码来源:token_characters_indexer.py

示例6: tokenize

# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def tokenize(self, text: str) -> List[Token]:
        """
        Split `text` into one token per character, or per byte when a byte encoding is set.

        Start tokens are inserted at the front one after another, so they end up in
        the reverse of their stored order; end tokens are appended in stored order.
        Integer markers become tokens with a `text_id`, all other markers become
        tokens with a `text`.
        """

        def as_boundary_token(marker):
            if isinstance(marker, int):
                return Token(text_id=marker, idx=0)
            return Token(text=marker, idx=0)

        if self._lowercase_characters:
            text = text.lower()

        if self._byte_encoding is None:
            body = [Token(character) for character in text]
        else:
            # Byte values are shifted up by one so that 0 stays available for
            # masking, whatever bytes the encoding produces.
            body = [Token(text_id=byte + 1) for byte in text.encode(self._byte_encoding)]

        leading = [as_boundary_token(marker) for marker in self._start_tokens]
        leading.reverse()
        trailing = [as_boundary_token(marker) for marker in self._end_tokens]

        return leading + body + trailing
开发者ID:allenai,项目名称:allennlp,代码行数:24,代码来源:character_tokenizer.py

示例7: _intra_word_tokenize

# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def _intra_word_tokenize(
        self, string_tokens: List[str]
    ) -> Tuple[List[Token], List[Optional[Tuple[int, int]]]]:
        """
        Split each word into wordpieces on its own and record where its pieces land.

        No special tokens are requested from the underlying tokenizer.

        # Returns

        tokens : `List[Token]`
            All wordpieces, in order, each carrying its text and its id.
        offsets : `List[Optional[Tuple[int, int]]]`
            One entry per input word: the inclusive (first, last) positions of its
            wordpieces in `tokens`, or `None` if the word produced no wordpieces.
        """
        wordpiece_tokens: List[Token] = []
        word_spans: List[Optional[Tuple[int, int]]] = []

        for word in string_tokens:
            encoding = self.tokenizer.encode_plus(
                word,
                add_special_tokens=False,
                return_tensors=None,
                return_offsets_mapping=False,
                return_attention_mask=False,
                return_token_type_ids=False,
            )
            piece_ids = encoding["input_ids"]

            if len(piece_ids) == 0:
                word_spans.append(None)
                continue

            # The span is computed before extending, so it starts at the current end.
            first_position = len(wordpiece_tokens)
            word_spans.append((first_position, first_position + len(piece_ids) - 1))

            piece_texts = self.tokenizer.convert_ids_to_tokens(piece_ids)
            for piece_id, piece_text in zip(piece_ids, piece_texts):
                wordpiece_tokens.append(Token(text=piece_text, text_id=piece_id))

        return wordpiece_tokens, word_spans
开发者ID:allenai,项目名称:allennlp,代码行数:27,代码来源:pretrained_transformer_tokenizer.py

示例8: intra_word_tokenize_sentence_pair

# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def intra_word_tokenize_sentence_pair(
        self, string_tokens_a: List[str], string_tokens_b: List[str]
    ) -> Tuple[List[Token], List[Tuple[int, int]], List[Tuple[int, int]]]:
        """
        Tokenize two word sequences into wordpieces, word by word, and join them
        into one sequence with special tokens inserted.

        The offsets are computed so that
        wordpieces[offsets[i][0]:offsets[i][1] + 1] corresponds to the i-th
        original word of the respective input sequence.

        # Returns

        The combined wordpiece list, the offsets for the first sequence and the
        offsets for the second sequence.
        """
        pieces_a, spans_a = self._intra_word_tokenize(string_tokens_a)
        pieces_b, spans_b = self._intra_word_tokenize(string_tokens_b)

        # In the combined sequence the first input sits behind the pair start
        # tokens; the second one additionally sits behind the first input's
        # wordpieces and the mid tokens.
        shift_a = len(self.sequence_pair_start_tokens)
        shift_b = shift_a + len(pieces_a) + len(self.sequence_pair_mid_tokens)

        spans_b = self._increment_offsets(spans_b, shift_b)
        combined = self.add_special_tokens(pieces_a, pieces_b)
        spans_a = self._increment_offsets(spans_a, shift_a)

        return combined, spans_a, spans_b
开发者ID:allenai,项目名称:allennlp,代码行数:26,代码来源:pretrained_transformer_tokenizer.py

示例9: add_special_tokens

# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def add_special_tokens(
        self, tokens1: List[Token], tokens2: Optional[List[Token]] = None
    ) -> List[Token]:
        """
        Surround one sequence, or a pair of sequences, with the configured special
        tokens and set the token type ids.

        The inputs are deep-copied first, so the type ids are written to the copies
        and the caller's tokens are left untouched.

        # Parameters

        tokens1 : `List[Token]`
            The first (or only) sequence.
        tokens2 : `Optional[List[Token]]`
            The second sequence; `None` selects the single-sequence layout.

        # Returns

        A new list: start tokens, the copied sequence(s) with mid tokens between
        them for a pair, and end tokens.
        """
        first = copy.deepcopy(tokens1)

        if tokens2 is None:
            for token in first:
                token.type_id = self.single_sequence_token_type_id
            return self.single_sequence_start_tokens + first + self.single_sequence_end_tokens

        second = copy.deepcopy(tokens2)
        for token in first:
            token.type_id = self.sequence_pair_first_token_type_id
        for token in second:
            token.type_id = self.sequence_pair_second_token_type_id

        combined = self.sequence_pair_start_tokens + first
        combined = combined + self.sequence_pair_mid_tokens
        combined = combined + second
        return combined + self.sequence_pair_end_tokens
开发者ID:allenai,项目名称:allennlp,代码行数:26,代码来源:pretrained_transformer_tokenizer.py

示例10: add_special_tokens

# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def add_special_tokens(
        self, tokens1: List[Token], tokens2: Optional[List[Token]] = None
    ) -> List[Token]:
        """
        Hook for inserting special tokens (such as [CLS] or [SEP]) into tokenized text.

        This default implementation adds nothing: it only concatenates the two
        inputs. Tokenizers that need special tokens provide their own version.

        # Parameters

        tokens1 : `List[Token]`
            The tokens to which special tokens would be added.
        tokens2 : `Optional[List[Token]]`
            An optional second token list, appended after `tokens1`.

        # Returns

        tokens : `List[Token]`
            A new list with the tokens of `tokens1` followed by those of `tokens2`.
        """
        second_part = tokens2 if tokens2 else []
        return tokens1 + second_part
开发者ID:allenai,项目名称:allennlp,代码行数:23,代码来源:tokenizer.py

示例11: get_padding_lengths

# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def get_padding_lengths(self) -> Dict[str, int]:
        """
        Collect the padding lengths reported by every token indexer of this field.

        Each indexer is asked for the padding lengths of its own indexed tokens, and
        each reported key is prefixed with the indexer's name and three underscores
        so that keys from different indexers stay apart.

        # Raises

        ConfigurationError
            If the field has not been indexed yet.
        """
        if self._indexed_tokens is None:
            raise ConfigurationError(
                "You must call .index(vocabulary) on a field before determining padding lengths."
            )

        return {
            f"{indexer_name}___{key}": length
            for indexer_name, indexer in self._token_indexers.items()
            for key, length in indexer.get_padding_lengths(
                self._indexed_tokens[indexer_name]
            ).items()
        }
开发者ID:allenai,项目名称:allennlp,代码行数:19,代码来源:text_field.py

示例12: tokenize

# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def tokenize(self, text):
        """
        Split `text` into one token per character, or per byte when a byte encoding is set.

        Each start token is inserted at position 0 in turn and each end token is
        appended. Integer markers become tokens with a `text_id`, all other markers
        become tokens with a `text`.
        """
        source = text.lower() if self._lowercase_characters else text

        if self._byte_encoding is None:
            result = [Token(character) for character in source]
        else:
            # One is added to every byte value so that 0 remains usable for masking,
            # regardless of which bytes come out of the encoding.
            encoded = source.encode(self._byte_encoding)
            result = [Token(text_id=byte + 1) for byte in encoded]

        for marker in self._start_tokens:
            if isinstance(marker, int):
                result.insert(0, Token(text_id=marker, idx=0))
            else:
                result.insert(0, Token(text=marker, idx=0))

        for marker in self._end_tokens:
            if isinstance(marker, int):
                result.append(Token(text_id=marker, idx=0))
            else:
                result.append(Token(text=marker, idx=0))

        return result
开发者ID:plasticityai,项目名称:magnitude,代码行数:24,代码来源:character_tokenizer.py

示例13: text_to_instance

# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def text_to_instance(self,  # type: ignore
                         sentence_tokens: List[str],
                         verb_vector: List[int],
                         entity_vector: List[int],
                         state_change_types: Optional[List[str]] = None,
                         state_change_tags: Optional[List[str]] = None) -> Instance:
        """
        Build an `Instance` from a tokenized sentence plus verb and entity indicators.

        # Parameters

        sentence_tokens : `List[str]`
            The words of the sentence; each one is wrapped in a `Token`.
        verb_vector : `List[int]`
            Per-token labels stored as `verb_span` in the `indicator_tags` namespace.
        entity_vector : `List[int]`
            Per-token labels stored as `entity_span` in the `indicator_tags` namespace.
        state_change_types : `Optional[List[str]]`
            When truthy, stored as a `LabelField` under `state_change_type_labels`.
        state_change_tags : `Optional[List[str]]`
            When truthy, stored as per-token labels under `state_change_tags`.

        # Returns

        An `Instance` with the input fields and whichever output fields were given.
        """
        # pylint: disable=arguments-differ

        # Inputs: the text field also serves as the sequence for the label fields.
        token_field = TextField(list(map(Token, sentence_tokens)), self._token_indexers)
        fields: Dict[str, Field] = {
            'tokens': token_field,
            'verb_span': SequenceLabelField(verb_vector, token_field, 'indicator_tags'),
            'entity_span': SequenceLabelField(entity_vector, token_field, 'indicator_tags'),
        }

        # Outputs: only present when gold annotations were supplied.
        if state_change_types:
            fields['state_change_type_labels'] = LabelField(
                state_change_types, 'state_change_type_labels'
            )
        if state_change_tags:
            fields['state_change_tags'] = SequenceLabelField(
                state_change_tags, token_field, 'state_change_tags'
            )

        return Instance(fields)
开发者ID:allenai,项目名称:propara,代码行数:24,代码来源:prolocal_dataset_reader.py

示例14: test_find_span

# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def test_find_span(self):
        """Check the span `_find_span` reports for a range of queries against one sentence."""

        def as_tokens(*words):
            return [Token(word) for word in words]

        sentence = as_tokens("My", "car", "is", "-", "grey", "?")

        # (query words, expected span), checked in the order listed.
        expectations = [
            (("car",), (1, 1)),  # single token
            (("My", "car"), (0, 1)),  # multi token
            (("my", "car"), (0, 1)),  # case insensitive
            (("my", "truck"), (-1, -1)),  # not in sentence
            (("?",), (-2, -2)),  # unknown
            (("-",), (-3, -3)),  # absent
        ]

        for words, expected in expectations:
            assert _find_span(as_tokens(*words), sentence) == expected
开发者ID:allenai,项目名称:propara,代码行数:22,代码来源:propara_dataset_reader_test.py

示例15: _number_token_match

# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def _number_token_match(self,
                            entity: str,
                            entity_text: List[Token],
                            token: Token,
                            token_index: int,
                            tokens: List[Token]) -> float:
        """
        Exact-token-match feature that only fires for number entities.

        Returns 0.0 for any entity whose name starts with 'fb:'; otherwise returns
        whatever `_contains_exact_token_match` returns for the same arguments.
        """
        # Background: PNP had a "spanFeatures" function saying whether an entity was
        # known a priori to link to a token or set of tokens in the question. It was
        # only used for numbers, and how it overlapped with the token match features
        # of the original implementation is not entirely clear (probably identical in
        # most cases, except for things like "four million": the token match is
        # derived from the entity name, which would be 4000000 and so would not match
        # "four million").
        #
        # This implementation simply adds a duplicate token match feature restricted
        # to numbers. It breaks in some rare cases (e.g., "Which four had four
        # million ..."), but those should not be a big deal.
        #
        # Numbers are the only entities that don't start with "fb:", which is why the
        # prefix test below is enough to single them out.
        if not entity.startswith('fb:'):
            return self._contains_exact_token_match(entity, entity_text, token, token_index, tokens)
        return 0.0
开发者ID:jcyk,项目名称:gtos,代码行数:22,代码来源:knowledge_graph_field.py


注:本文中的allennlp.data.tokenizers.token.Token方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。