本文整理汇总了 Python 中 allennlp.data.tokenizers.token.Token 类的典型用法代码示例。如果您正苦于以下问题:Python token.Token 类的具体用法?Python token.Token 怎么用?Python token.Token 使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块 allennlp.data.tokenizers.token 的用法示例。
在下文中一共展示了 token.Token 类的 15 个代码示例,这些示例默认按受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: text_to_instance
# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     entity_1: Tuple[int, int],
                     entity_2: Tuple[int, int],
                     label: str = None) -> Instance:
    """
    Build an `Instance` for a sentence with two marked entity spans.

    The `sentence` field is laid out as:
    `__start__`, entity-1 tokens, `__del1__`, entity-2 tokens, `__del2__`,
    the full standardized sentence, `__clf__`.

    # Parameters

    tokens : `List[str]`
        The sentence tokens; each one is passed through
        `OpenAISplitter._standardize` before use.
    entity_1 : `Tuple[int, int]`
        `(start, end)` token indices of the first entity; `end` is inclusive.
    entity_2 : `Tuple[int, int]`
        `(start, end)` token indices of the second entity; `end` is inclusive.
    label : `str`, optional
        When truthy, stored in the `label` field.

    # Returns

    An `Instance` with a `sentence` field and, if a label was given, a
    `label` field.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}

    tokens = [OpenAISplitter._standardize(token) for token in tokens]

    # Both span ends are inclusive, hence the `+ 1` on the slice bound.
    entity_1_tokens = tokens[entity_1[0]:entity_1[1] + 1]
    entity_2_tokens = tokens[entity_2[0]:entity_2[1] + 1]

    tokens = (['__start__']
              + entity_1_tokens + ['__del1__']
              + entity_2_tokens + ['__del2__']
              + tokens + ['__clf__'])

    sentence = TextField([Token(text=t) for t in tokens], self._token_indexers)
    fields['sentence'] = sentence
    #fields['entity1'] = SpanField(*entity_1, sequence_field=sentence)
    #fields['entity2'] = SpanField(*entity_2, sequence_field=sentence)

    if label:
        fields['label'] = LabelField(label)

    return Instance(fields)
示例2: tokens_to_indices
# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
    """
    Index word-level `tokens` as wordpieces, keeping word-to-wordpiece offsets.

    # Parameters

    tokens : `List[Token]`
        Word-level tokens; only their `text` is used.
    vocabulary : `Vocabulary`
        Handed to the matched indexer so it can add its encoding if needed.

    # Returns

    The result of the matched indexer's `_postprocess_output` applied to a
    dict with `token_ids`, `mask`, `type_ids`, `offsets` and `wordpiece_mask`.
    """
    self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)

    words = [word.text for word in tokens]
    wordpieces, raw_offsets = self._allennlp_tokenizer.intra_word_tokenize(words)

    # A word that produced no wordpieces has no offsets; mark it with
    # (-1, -1), which results in an all-zero embedding for that word.
    offsets = [(-1, -1) if span is None else span for span in raw_offsets]

    output: IndexedTokenList = {}
    output["token_ids"] = [piece.text_id for piece in wordpieces]
    # Word-level mask: one entry per original token.
    output["mask"] = [True] * len(tokens)
    output["type_ids"] = [piece.type_id for piece in wordpieces]
    output["offsets"] = offsets
    # Subword-level mask: one entry per wordpiece.
    output["wordpiece_mask"] = [True] * len(wordpieces)

    return self._matched_indexer._postprocess_output(output)
示例3: __init__
# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def __init__(
    self,
    namespace: Optional[str] = "tokens",
    lowercase_tokens: bool = False,
    start_tokens: List[str] = None,
    end_tokens: List[str] = None,
    feature_name: str = "text",
    default_value: str = _DEFAULT_VALUE,
    token_min_padding_length: int = 0,
) -> None:
    """
    Store the indexer configuration.

    # Parameters

    namespace : `Optional[str]`, optional (default = `"tokens"`)
        Stored as `self.namespace`.
    lowercase_tokens : `bool`, optional (default = `False`)
        Stored as `self.lowercase_tokens`.
    start_tokens : `List[str]`, optional
        Strings wrapped in `Token` and stored as `self._start_tokens`;
        `None` or an empty list yields no start tokens.
    end_tokens : `List[str]`, optional
        Same as `start_tokens`, stored as `self._end_tokens`.
    feature_name : `str`, optional (default = `"text"`)
        Stored as `self._feature_name`.
    default_value : `str`, optional
        Stored as `self._default_value`.
    token_min_padding_length : `int`, optional (default = `0`)
        Forwarded to the parent class constructor.
    """
    super().__init__(token_min_padding_length)

    self.namespace = namespace
    self.lowercase_tokens = lowercase_tokens
    self._feature_name = feature_name
    self._default_value = default_value

    # `None` and `[]` both mean "no boundary tokens".
    if start_tokens:
        self._start_tokens = [Token(text) for text in start_tokens]
    else:
        self._start_tokens = []

    if end_tokens:
        self._end_tokens = [Token(text) for text in end_tokens]
    else:
        self._end_tokens = []
示例4: __init__
# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def __init__(
    self,
    namespace: str = "token_characters",
    character_tokenizer: CharacterTokenizer = CharacterTokenizer(),
    start_tokens: List[str] = None,
    end_tokens: List[str] = None,
    min_padding_length: int = 0,
    token_min_padding_length: int = 0,
) -> None:
    """
    Store the character-indexer configuration.

    Emits a `UserWarning` when `min_padding_length` is left at 0.

    # Parameters

    namespace : `str`, optional (default = `"token_characters"`)
        Stored as `self._namespace`.
    character_tokenizer : `CharacterTokenizer`, optional
        Stored as `self._character_tokenizer`. The default instance is
        created once, when the function is defined, and is therefore shared
        by every caller that relies on the default.
    start_tokens : `List[str]`, optional
        Strings wrapped in `Token` and stored as `self._start_tokens`.
    end_tokens : `List[str]`, optional
        Strings wrapped in `Token` and stored as `self._end_tokens`.
    min_padding_length : `int`, optional (default = `0`)
        Stored as `self._min_padding_length`.
    token_min_padding_length : `int`, optional (default = `0`)
        Forwarded to the parent class constructor.
    """
    super().__init__(token_min_padding_length)

    using_default_padding = min_padding_length == 0
    if using_default_padding:
        url = "https://github.com/allenai/allennlp/issues/1954"
        message = (
            "You are using the default value (0) of `min_padding_length`, "
            f"which can cause some subtle bugs (more info see {url}). "
            "Strongly recommend to set a value, usually the maximum size "
            "of the convolutional layer size when using CnnEncoder."
        )
        warnings.warn(message, UserWarning)

    self._min_padding_length = min_padding_length
    self._namespace = namespace
    self._character_tokenizer = character_tokenizer

    # `None` and `[]` both mean "no boundary tokens".
    start_texts = start_tokens if start_tokens else []
    end_texts = end_tokens if end_tokens else []
    self._start_tokens = [Token(text) for text in start_texts]
    self._end_tokens = [Token(text) for text in end_texts]
示例5: tokens_to_indices
# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def tokens_to_indices(
    self, tokens: List[Token], vocabulary: Vocabulary
) -> Dict[str, List[List[int]]]:
    """
    Convert every token into the list of its character indices.

    The configured start tokens are processed before `tokens` and the
    configured end tokens after them.

    # Parameters

    tokens : `List[Token]`
        Tokens to index; each must have a non-`None` `text`.
    vocabulary : `Vocabulary`
        Used to look up characters that carry no `text_id` of their own.

    # Returns

    `{"token_characters": [...]}` with one list of character indices per
    token (boundary tokens included).

    # Raises

    ConfigurationError
        If any token has `text` set to `None`.
    """
    per_token_ids: List[List[int]] = []

    for tok in itertools.chain(self._start_tokens, tokens, self._end_tokens):
        char_ids: List[int] = []
        if tok.text is None:
            raise ConfigurationError(
                "TokenCharactersIndexer needs a tokenizer that retains text"
            )

        for char in self._character_tokenizer.tokenize(tok.text):
            has_own_id = getattr(char, "text_id", None) is not None
            if has_own_id:
                # The character already carries an id, so the vocabulary is
                # bypassed and that id is used directly.
                char_ids.append(char.text_id)
            else:
                char_ids.append(vocabulary.get_token_index(char.text, self._namespace))

        per_token_ids.append(char_ids)

    return {"token_characters": per_token_ids}
示例6: tokenize
# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def tokenize(self, text: str) -> List[Token]:
    """
    Split `text` into one `Token` per character (or per encoded byte).

    # Parameters

    text : `str`
        The string to split; lowercased first when
        `self._lowercase_characters` is set.

    # Returns

    The character tokens, with the configured start tokens placed in front
    and the configured end tokens appended.
    """
    if self._lowercase_characters:
        text = text.lower()

    if self._byte_encoding is None:
        tokens = [Token(char) for char in text]
    else:
        # Byte values are shifted up by one so that 0 stays free for masking,
        # whatever bytes the encoding produces.
        encoded = text.encode(self._byte_encoding)
        tokens = [Token(text_id=byte + 1) for byte in encoded]

    # Each start token is inserted at position 0, so the LAST entry of
    # `self._start_tokens` ends up first in the output.
    # NOTE(review): presumably `self._start_tokens` is stored reversed to
    # compensate - confirm against the constructor.
    for marker in self._start_tokens:
        if isinstance(marker, int):
            boundary = Token(text_id=marker, idx=0)
        else:
            boundary = Token(text=marker, idx=0)
        tokens.insert(0, boundary)

    for marker in self._end_tokens:
        if isinstance(marker, int):
            boundary = Token(text_id=marker, idx=0)
        else:
            boundary = Token(text=marker, idx=0)
        tokens.append(boundary)

    return tokens
示例7: _intra_word_tokenize
# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def _intra_word_tokenize(
    self, string_tokens: List[str]
) -> Tuple[List[Token], List[Optional[Tuple[int, int]]]]:
    """
    Split each word into wordpieces independently and record where they land.

    # Parameters

    string_tokens : `List[str]`
        The words to split, one string per word.

    # Returns

    A pair `(wordpieces, offsets)`. `offsets[i]` is the inclusive
    `(first, last)` index range of word `i` inside `wordpieces`, or `None`
    when the word produced no wordpieces. No special tokens are added.
    """
    wordpieces: List[Token] = []
    spans: List[Optional[Tuple[int, int]]] = []

    for word in string_tokens:
        encoded = self.tokenizer.encode_plus(
            word,
            add_special_tokens=False,
            return_tensors=None,
            return_offsets_mapping=False,
            return_attention_mask=False,
            return_token_type_ids=False,
        )
        piece_ids = encoded["input_ids"]

        if len(piece_ids) == 0:
            # The word vanished during wordpiece tokenization.
            spans.append(None)
            continue

        # The span must be taken before the new pieces are appended.
        first = len(wordpieces)
        spans.append((first, first + len(piece_ids) - 1))

        piece_texts = self.tokenizer.convert_ids_to_tokens(piece_ids)
        for piece_id, piece_text in zip(piece_ids, piece_texts):
            wordpieces.append(Token(text=piece_text, text_id=piece_id))

    return wordpieces, spans
示例8: intra_word_tokenize_sentence_pair
# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def intra_word_tokenize_sentence_pair(
    self, string_tokens_a: List[str], string_tokens_b: List[str]
) -> Tuple[List[Token], List[Tuple[int, int]], List[Tuple[int, int]]]:
    """
    Wordpiece-tokenize two pre-split sentences and join them into one sequence.

    Every word is split into wordpieces on its own. Offsets are computed so
    that `wordpieces[offsets[i][0]:offsets[i][1] + 1]` corresponds to the
    original i-th token of the respective sentence. Special tokens are
    inserted via `add_special_tokens`.

    # Parameters

    string_tokens_a : `List[str]`
        Words of the first sentence.
    string_tokens_b : `List[str]`
        Words of the second sentence.

    # Returns

    `(wordpieces, offsets_a, offsets_b)`: the combined wordpiece sequence and
    the offsets of each sentence's words within it.
    """
    pieces_a, offsets_a = self._intra_word_tokenize(string_tokens_a)
    pieces_b, offsets_b = self._intra_word_tokenize(string_tokens_b)

    # Sentence B's offsets are shifted by the pair-start tokens, all of
    # sentence A's wordpieces, and the pair-mid tokens.
    shift_b = (
        len(self.sequence_pair_start_tokens)
        + len(pieces_a)
        + len(self.sequence_pair_mid_tokens)
    )
    offsets_b = self._increment_offsets(offsets_b, shift_b)

    combined = self.add_special_tokens(pieces_a, pieces_b)

    # Sentence A's offsets are shifted by the pair-start tokens only.
    shift_a = len(self.sequence_pair_start_tokens)
    offsets_a = self._increment_offsets(offsets_a, shift_a)

    return combined, offsets_a, offsets_b
示例9: add_special_tokens
# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def add_special_tokens(
    self, tokens1: List[Token], tokens2: Optional[List[Token]] = None
) -> List[Token]:
    """
    Wrap one or two token sequences in special tokens and set their type ids.

    The inputs are deep-copied first, so the caller's tokens are never
    modified.

    # Parameters

    tokens1 : `List[Token]`
        The first (or only) sequence.
    tokens2 : `Optional[List[Token]]`, optional (default = `None`)
        The second sequence. An empty list still selects the
        sequence-pair layout; only `None` selects the single-sequence one.

    # Returns

    Start tokens + `tokens1` + end tokens for a single sequence, or
    start + `tokens1` + mid + `tokens2` + end tokens for a pair.
    """
    first = copy.deepcopy(tokens1)

    if tokens2 is None:
        for piece in first:
            piece.type_id = self.single_sequence_token_type_id
        return self.single_sequence_start_tokens + first + self.single_sequence_end_tokens

    second = copy.deepcopy(tokens2)
    for piece in first:
        piece.type_id = self.sequence_pair_first_token_type_id
    for piece in second:
        piece.type_id = self.sequence_pair_second_token_type_id

    combined = self.sequence_pair_start_tokens + first
    combined = combined + self.sequence_pair_mid_tokens
    combined = combined + second
    return combined + self.sequence_pair_end_tokens
示例10: add_special_tokens
# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def add_special_tokens(
    self, tokens1: List[Token], tokens2: Optional[List[Token]] = None
) -> List[Token]:
    """
    Default hook for adding special tokens (such as [CLS] or [SEP]).

    This default adds nothing: it only concatenates the two sequences.

    # Parameters

    tokens1 : `List[Token]`
        The first list of tokens.
    tokens2 : `Optional[List[Token]]`
        An optional second list of tokens, appended after `tokens1`.

    # Returns

    tokens : `List[Token]`
        A new list holding `tokens1` followed by `tokens2` (if any).
    """
    # `None` and an empty list are treated alike: nothing to append.
    second = tokens2 if tokens2 else []
    return tokens1 + second
示例11: get_padding_lengths
# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def get_padding_lengths(self) -> Dict[str, int]:
    """
    Collect the padding lengths reported by every token indexer.

    Each indexer is asked for the padding lengths of its own indexed tokens;
    the results are merged into one dict keyed `"<indexer_name>___<key>"`.

    # Returns

    A flat mapping from `"<indexer_name>___<key>"` to the reported length.

    # Raises

    ConfigurationError
        If the field has not been indexed yet.
    """
    if self._indexed_tokens is None:
        raise ConfigurationError(
            "You must call .index(vocabulary) on a field before determining padding lengths."
        )

    return {
        f"{indexer_name}___{key}": length
        for indexer_name, indexer in self._token_indexers.items()
        for key, length in indexer.get_padding_lengths(
            self._indexed_tokens[indexer_name]
        ).items()
    }
示例12: tokenize
# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def tokenize(self, text):
    """
    Split `text` into one `Token` per character (or per encoded byte).

    The text is lowercased first when `self._lowercase_characters` is set.
    The configured start tokens are placed in front of the result and the
    configured end tokens are appended to it.
    """

    def boundary_token(marker):
        # Integer markers are ids; anything else is treated as text.
        if isinstance(marker, int):
            return Token(text_id=marker, idx=0)
        return Token(text=marker, idx=0)

    if self._lowercase_characters:
        text = text.lower()

    if self._byte_encoding is None:
        tokens = [Token(char) for char in text]
    else:
        # Byte values are shifted up by one so that 0 stays free for masking,
        # whatever bytes the encoding produces.
        tokens = [Token(text_id=byte + 1) for byte in text.encode(self._byte_encoding)]

    # Inserting at position 0 every time puts the LAST start token first.
    # NOTE(review): presumably `self._start_tokens` is stored reversed to
    # compensate - confirm against the constructor.
    for marker in self._start_tokens:
        tokens.insert(0, boundary_token(marker))

    for marker in self._end_tokens:
        tokens.append(boundary_token(marker))

    return tokens
示例13: text_to_instance
# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def text_to_instance(self,  # type: ignore
                     sentence_tokens: List[str],
                     verb_vector: List[int],
                     entity_vector: List[int],
                     state_change_types: Optional[List[str]] = None,
                     state_change_tags: Optional[List[str]] = None) -> Instance:
    """
    Build an `Instance` from a tokenized sentence and its indicator vectors.

    # Parameters

    sentence_tokens : `List[str]`
        The words of the sentence; each is wrapped in a `Token`.
    verb_vector : `List[int]`
        Per-token values stored in the `verb_span` field.
    entity_vector : `List[int]`
        Per-token values stored in the `entity_span` field.
    state_change_types : `Optional[List[str]]`
        When truthy, stored in the `state_change_type_labels` field.
    state_change_tags : `Optional[List[str]]`
        When truthy, stored per token in the `state_change_tags` field.

    # Returns

    An `Instance` with `tokens`, `verb_span` and `entity_span` fields, plus
    the output fields for whichever labels were supplied.
    """
    # pylint: disable=arguments-differ
    # Input fields.
    token_field = TextField([Token(text) for text in sentence_tokens], self._token_indexers)
    fields: Dict[str, Field] = {
        'tokens': token_field,
        'verb_span': SequenceLabelField(verb_vector, token_field, 'indicator_tags'),
        'entity_span': SequenceLabelField(entity_vector, token_field, 'indicator_tags'),
    }

    # Output fields, only present when gold labels were supplied.
    if state_change_types:
        # NOTE(review): `state_change_types` is annotated as a list but is
        # passed to `LabelField` as a single label - confirm this is intended.
        fields['state_change_type_labels'] = LabelField(state_change_types, 'state_change_type_labels')
    if state_change_tags:
        fields['state_change_tags'] = SequenceLabelField(state_change_tags, token_field, 'state_change_tags')

    return Instance(fields)
示例14: test_find_span
# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def test_find_span(self):
    """Check `_find_span` against a fixed sentence for each kind of query."""
    sentence = [Token(word) for word in ["My", "car", "is", "-", "grey", "?"]]

    # (query words, expected span), checked in this order.
    cases = [
        (["car"], (1, 1)),           # Single token
        (["My", "car"], (0, 1)),     # Multi token
        (["my", "car"], (0, 1)),     # Case insensitive
        (["my", "truck"], (-1, -1)),  # Not in sentence
        (["?"], (-2, -2)),           # Unknown
        (["-"], (-3, -3)),           # Absent
    ]

    for query_words, expected_span in cases:
        query = [Token(word) for word in query_words]
        assert _find_span(query, sentence) == expected_span
示例15: _number_token_match
# 需要导入模块: from allennlp.data.tokenizers import token [as 别名]
# 或者: from allennlp.data.tokenizers.token import Token [as 别名]
def _number_token_match(self,
entity: str,
entity_text: List[Token],
token: Token,
token_index: int,
tokens: List[Token]) -> float:
# PNP had a "spanFeatures" function that said whether an entity was a-priori known to link
# to a token or set of tokens in the question. This was only used for numbers, and it's
# not totally clear to me how this number feature overlapped with the token match features
# in the original implementation (I think in most cases it was the same, except for things
# like "four million", because the token match is derived from the entity name, which would
# be 4000000, and wouldn't match "four million").
#
# Our implementation basically just adds a duplicate token match feature that's specific to
# numbers. It'll break in some rare cases (e.g., "Which four had four million ..."), but
# those shouldn't be a big deal.
if entity.startswith('fb:'):
# This check works because numbers are the only entities that don't start with "fb:".
return 0.0
return self._contains_exact_token_match(entity, entity_text, token, token_index, tokens)