本文整理汇总了Python中allennlp.data.tokenizers.Token类的典型用法代码示例。如果您正苦于以下问题:Python tokenizers.Token类的具体用法?Python tokenizers.Token怎么用?Python tokenizers.Token使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块allennlp.data.tokenizers的用法示例。
在下文中一共展示了tokenizers.Token类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: text_to_instance
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def text_to_instance(self, context_tokens: List[Token], tokens: List[Token], tags: List[str] = None,
                     intents: List[str] = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    Build an ``Instance`` from pre-tokenized context and utterance tokens.

    No tokenization happens here; both token lists are wrapped in ``TextField``
    objects exactly as they are passed in.

    Parameters
    ----------
    context_tokens : ``List[Token]``
        Tokens stored under the ``"context_tokens"`` field.
    tokens : ``List[Token]``
        Tokens stored under the ``"tokens"`` field; their ``text`` values are also
        recorded under ``"words"`` in the metadata field.
    tags : ``List[str]``, optional
        When not ``None``, stored under ``"tags"`` as a ``SequenceLabelField``
        attached to the ``"tokens"`` field.
    intents : ``List[str]``, optional
        When not ``None``, stored under ``"intents"`` as a ``MultiLabelField`` in the
        ``"intent_labels"`` namespace.
    dialog_act : ``Dict[str, Any]``, optional
        Stored under ``"dialog_act"`` in the metadata field; an empty dict is stored
        when this is ``None``.

    Returns
    -------
    ``Instance`` holding the fields described above.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    fields["context_tokens"] = TextField(context_tokens, self._token_indexers)
    fields["tokens"] = TextField(tokens, self._token_indexers)
    # The metadata is built exactly once. Previously a first version without
    # "dialog_act" was created here and then always overwritten further down.
    fields["metadata"] = MetadataField({
        "words": [x.text for x in tokens],
        'dialog_act': dialog_act if dialog_act is not None else {},
    })
    if tags is not None:
        fields["tags"] = SequenceLabelField(tags, fields["tokens"])
    if intents is not None:
        fields["intents"] = MultiLabelField(intents, label_namespace="intent_labels")
    return Instance(fields)
示例2: text_to_instance
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def text_to_instance(self, tokens: List[Token], tags: List[str] = None, domain: str = None,
                     intent: str = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    Turn an already-tokenized utterance into an ``Instance``.

    The caller supplies ``Token`` objects directly; nothing is tokenized here.
    ``tags``, ``domain`` and ``intent`` are each added only when truthy. The
    metadata field always carries the token texts under ``"words"`` and the dialog
    act under ``"dialog_act"`` (an empty dict when ``dialog_act`` is ``None``).
    """
    # pylint: disable=arguments-differ
    token_field = TextField(tokens, self._token_indexers)
    fields: Dict[str, Field] = {"tokens": token_field}

    if tags:
        fields["tags"] = SequenceLabelField(tags, token_field)
    if domain:
        fields["domain"] = LabelField(domain, label_namespace="domain_labels")
    if intent:
        fields["intent"] = LabelField(intent, label_namespace="intent_labels")

    words = [x.text for x in tokens]
    act = {} if dialog_act is None else dialog_act
    fields["metadata"] = MetadataField({"words": words, 'dialog_act': act})
    return Instance(fields)
示例3: _read
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def _read(self, file_path: str) -> Iterable[Instance]:
    """
    Yield one instance per sentence from a whitespace-separated, four-column file.

    Lines are grouped with ``_is_divider``; divider groups are skipped and every
    other group is treated as the rows of one sentence. The first column supplies
    the tokens and the fourth the PICO tags; the two middle columns are ignored.
    """
    # A URL is swapped for its locally cached copy.
    file_path = cached_path(file_path)
    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            if is_divider:
                # Divider chunks carry no words.
                continue
            rows = [line.strip().split() for line in lines]
            # Transpose rows into columns; zip yields tuples, the fields need lists.
            words, _, _, pico_tags = map(list, zip(*rows))
            # TextField requires ``Token`` objects rather than plain strings.
            yield self.text_to_instance([Token(word) for word in words], pico_tags)
示例4: preprocess
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def preprocess(self, token_batch):
    """
    Convert a batch of token-string sequences into one indexed ``Batch`` per indexer.

    Every sequence is cut to the length of the longest non-empty sequence, capped
    at ``self.max_len``, and prefixed with the literal ``'$START'`` token. Returns
    an empty list when the batch has no non-empty sequence.
    """
    lengths = [len(seq) for seq in token_batch if seq]
    if len(lengths) == 0:
        return []
    limit = min(max(lengths), self.max_len)

    indexed_batches = []
    for indexer in self.indexers:
        instances = [
            Instance({
                'tokens': TextField(
                    [Token(text) for text in ['$START'] + seq[:limit]],
                    indexer,
                )
            })
            for seq in token_batch
        ]
        indexed = Batch(instances)
        indexed.index_instances(self.vocab)
        indexed_batches.append(indexed)
    return indexed_batches
示例5: _read
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def _read(self, file_path):
    """
    Yield one instance per non-blank line of a word/tag file.

    Each line is split on ``self._token_delimiter`` into pairs, and each pair is
    split once from the right on ``self._word_tag_delimiter`` into a word and its
    tag. A pair without the delimiter raises ``ValueError`` during unpacking.
    """
    # A URL is swapped for its locally cached copy.
    file_path = cached_path(file_path)
    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for raw_line in data_file:
            line = raw_line.strip("\n")
            if line:
                pairs = [
                    chunk.rsplit(self._word_tag_delimiter, 1)
                    for chunk in line.split(self._token_delimiter)
                ]
                tokens = []
                tags = []
                for word, tag in pairs:
                    tokens.append(Token(word))
                    tags.append(tag)
                yield self.text_to_instance(tokens, tags)
示例6: _read
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def _read(self, file_path: str) -> Iterable[Instance]:
    """
    Yield one instance per sentence from a whitespace-separated, four-column file.

    Lines are grouped with ``_is_divider``; only the non-divider groups are used,
    each being the rows of a single sentence. The columns are read, in order, as
    tokens, POS tags, chunk tags and NER tags.
    """
    # A URL is swapped for its locally cached copy.
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)

        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            if is_divider:
                continue

            rows = [line.strip().split() for line in lines]
            # Transpose rows to columns; zip gives tuples but the fields need lists.
            columns = [list(column) for column in zip(*rows)]
            words, pos_tags, chunk_tags, ner_tags = columns

            # TextField requires `Token` objects.
            sentence_tokens = [Token(word) for word in words]
            yield self.text_to_instance(sentence_tokens, pos_tags, chunk_tags, ner_tags)
示例7: _read
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def _read(self, file_path):
    """
    Yield one instance per non-blank line of a word/tag file.

    A line is split on ``self._token_delimiter``; each resulting piece is split
    once from the right on ``self._word_tag_delimiter`` to get the word and tag.
    """
    # A URL is swapped for its locally cached copy.
    file_path = cached_path(file_path)
    with open(file_path, u"r") as data_file:
        logger.info(u"Reading instances from lines in file at: %s", file_path)
        for line in data_file:
            stripped = line.strip(u"\n")
            if not stripped:
                # Nothing to read on a blank line.
                continue
            words = []
            tags = []
            for piece in stripped.split(self._token_delimiter):
                word, tag = piece.rsplit(self._word_tag_delimiter, 1)
                words.append(Token(word))
                tags.append(tag)
            yield self.text_to_instance(words, tags)
示例8: _read
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def _read(self, file_path):
    """
    Yield one instance per sentence from a whitespace-separated, four-column file.

    ``_is_divider`` partitions the lines into divider and sentence groups; divider
    groups are dropped. The four columns of a sentence group are read, in order,
    as tokens, POS tags, chunk tags and NER tags.
    """
    # A URL is swapped for its locally cached copy.
    file_path = cached_path(file_path)
    with open(file_path, u"r") as data_file:
        logger.info(u"Reading instances from lines in file at: %s", file_path)
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            if is_divider:
                continue
            rows = [line.strip().split() for line in lines]
            # Transposing yields tuples, so each column is copied into a list.
            columns = [list(column) for column in izip(*rows)]
            words, pos_tags, chunk_tags, ner_tags = columns
            # TextField requires ``Token`` objects.
            wrapped = [Token(word) for word in words]
            yield self.text_to_instance(wrapped, pos_tags, chunk_tags, ner_tags)
示例9: _read
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def _read(self, file_path):
    """
    Yield SRL instances for the sentences returned by ``self._ontonotes_subset``.

    A sentence with no SRL frames produces a single instance with all-``"O"`` tags
    and an all-zero verb indicator. Otherwise one instance is produced per frame,
    with the indicator set to 1 wherever the frame's label ends in ``"-V"``.
    """
    # A URL is swapped for its locally cached copy.
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info(u"Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info(u"Filtering to only include file paths containing the %s domain", self._domain_identifier)

    sentences = self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier)
    for sentence in sentences:
        tokens = [Token(word) for word in sentence.words]
        if not sentence.srl_frames:
            # No predicates in this sentence: emit one all-"O" instance.
            outside_tags = [u"O"] * len(tokens)
            no_verb = [0] * len(tokens)
            yield self.text_to_instance(tokens, no_verb, outside_tags)
            continue
        for _, frame_tags in sentence.srl_frames:
            verb_indicator = [int(label[-2:] == u"-V") for label in frame_tags]
            yield self.text_to_instance(tokens, verb_indicator, frame_tags)
示例10: test_world_adds_numbers_from_question
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def test_world_adds_numbers_from_question(self):
    """Numbers mentioned in the question must appear among the world's ``n`` actions."""
    words = [u'what', u'2007', u'2,107', u'0.2', u'1800s', u'1950s', u'?']
    question_tokens = [Token(word) for word in words]
    table_path = self.FIXTURES_ROOT / u"data" / u"wikitables" / u"sample_table.tsv"
    table_kg = TableQuestionKnowledgeGraph.read_from_file(table_path, question_tokens)
    valid_actions = WikiTablesWorld(table_kg).get_valid_actions()

    expected_actions = [
        u'n -> 2007',
        u'n -> 2107',
        # It appears that sempre normalizes floating point numbers.
        u'n -> 0.200',
        # Decade/century tokens such as "1800s" contribute both end-points.
        u'n -> 1800',
        u'n -> 1900',
        u'n -> 1950',
        u'n -> 1960',
    ]
    for action in expected_actions:
        assert action in valid_actions[u'n']
示例11: text_to_instance
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def text_to_instance(self, tokens: List[Token], tags: List[str]=None) -> Instance:
    """
    Build an ``Instance`` from tokens and optional tags, truncating over-long input.

    When there are more than ``self._max_token_len`` tokens, the tokens (and the
    tags, if truthy) are cut to that length and a notice is printed. The ``"tags"``
    field is added only when ``tags`` is truthy.
    """
    limit = self._max_token_len
    too_long = len(tokens) > limit
    if too_long:
        tokens = tokens[:limit]
        print(f'Length of tokens exceeded the limit {self._max_token_len}. Truncating...')
    if too_long and tags:
        tags = tags[:limit]

    token_field = TextField(tokens, self._token_indexers)
    fields = {'tokens': token_field}
    if tags:
        fields['tags'] = SequenceLabelField(tags, token_field)
    return Instance(fields)
示例12: split_tokens_by_hyphen
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def split_tokens_by_hyphen(tokens: List[Token]) -> List[Token]:
    """
    Split every token whose text contains ``-``, ``–`` or ``~`` on those delimiters.

    Tokens containing none of the delimiters are passed through unchanged. For the
    others, the delimiters found in the token's text are applied one after another,
    in the fixed order above, via ``split_token_by_delimiter``.
    """
    hyphens = ["-", "–", "~"]
    result: List[Token] = []
    for token in tokens:
        # NOTE(review): presence is tested on the ORIGINAL token's text, so every
        # intermediate piece is handed to the helper for each delimiter found
        # here, even pieces that no longer contain it. Kept as-is.
        present = [hyphen for hyphen in hyphens if hyphen in token.text]
        if not present:
            result.append(token)
            continue

        pieces = [token]
        for hyphen in present:
            next_pieces: List[Token] = []
            for piece in pieces:
                next_pieces += split_token_by_delimiter(piece, hyphen)
            pieces = next_pieces
        result += pieces
    return result
示例13: get_strings_from_utterance
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def get_strings_from_utterance(tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    """
    Link strings from ``atis_tables.ATIS_TRIGGER_DICT`` to token positions.

    The lower-cased unigrams, bigrams and trigrams of the utterance are looked up
    in the trigger dictionary. Every string triggered by an n-gram receives the
    indices of all tokens in that n-gram. A trigram whose first word is ``"st"``
    is looked up as ``"st. <third word>"`` instead of the space-joined trigram.

    Returns a mapping from triggered string to the list of linked token indices.
    """
    linking: Dict[str, List[int]] = defaultdict(list)
    words = [token.text for token in tokenized_utterance]

    # Single tokens.
    for position, word in enumerate(words):
        for matched in atis_tables.ATIS_TRIGGER_DICT.get(word.lower(), []):
            linking[matched].append(position)

    # Adjacent pairs.
    for position, pair in enumerate(bigrams(words)):
        pair_key = " ".join(pair).lower()
        for matched in atis_tables.ATIS_TRIGGER_DICT.get(pair_key, []):
            linking[matched].extend([position, position + 1])

    # Adjacent triples, with the special "st" rewrite.
    for position, triple in enumerate(ngrams(words, 3)):
        raw_key = f"st. {triple[2]}" if triple[0] == "st" else " ".join(triple)
        for matched in atis_tables.ATIS_TRIGGER_DICT.get(raw_key.lower(), []):
            linking[matched].extend([position, position + 1, position + 2])

    return linking
示例14: get_time_range_end_from_utterance
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def get_time_range_end_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    """
    Link time-range end values from ``TIME_RANGE_END_DICT`` to token positions.

    Single tokens are looked up unless the token directly before them is
    ``"early"``; bigrams are looked up unconditionally. Keys of the result are the
    stringified times, values the linked token indices. ``utterance`` is not used
    in the body.
    """
    words = [token.text for token in tokenized_utterance]
    early_positions = {position for position, word in enumerate(words) if word == "early"}

    linking: Dict[str, List[int]] = defaultdict(list)
    for position, word in enumerate(words):
        if position - 1 in early_positions:
            # A token right after "early" is never linked on its own.
            continue
        for time in TIME_RANGE_END_DICT.get(word, []):
            linking[str(time)].append(position)

    for start, pair in enumerate(ngrams(words, 2)):
        for time in TIME_RANGE_END_DICT.get(" ".join(pair), []):
            linking[str(time)].extend([start, start + 1])

    return linking
示例15: get_flight_numbers_from_utterance
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import Token [as 别名]
def get_flight_numbers_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    """
    Link all-digit tokens that look like flight numbers to their positions.

    A digit token is linked when the previous token is ``"flight"``, ``"number"``,
    an entry of ``AIRLINE_CODE_LIST`` (upper-cased) or a key of ``AIRLINE_CODES``
    (lower-cased), and again when the next token is ``"flight"``. A token meeting
    both conditions has its index recorded twice. ``utterance`` is not used in
    the body.
    """
    leading_positions = set()
    trailing_positions = set()
    for position, token in enumerate(tokenized_utterance):
        word = token.text
        if (
            word in {"flight", "number"}
            or word.upper() in AIRLINE_CODE_LIST
            or word.lower() in AIRLINE_CODES.keys()
        ):
            leading_positions.add(position)
        if word == "flight":
            trailing_positions.add(position)

    linking: Dict[str, List[int]] = defaultdict(list)
    for position, token in enumerate(tokenized_utterance):
        if not token.text.isdigit():
            continue
        if position - 1 in leading_positions:
            linking[token.text].append(position)
        if position + 1 in trailing_positions:
            linking[token.text].append(position)
    return linking