本文整理汇总了Python中spacy.tokens.doc.Doc类的典型用法代码示例。如果您正苦于以下问题：Python doc.Doc类的具体用法？Python doc.Doc怎么用？Python doc.Doc使用的例子？那么恭喜您，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块spacy.tokens.doc的用法示例。
在下文中一共展示了doc.Doc类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: convert_to_flair_format
# 需要导入模块: from spacy.tokens import doc [as 别名]
# 或者: from spacy.tokens.doc import Doc [as 别名]
def convert_to_flair_format(spacy_model: Language, data: List[Tuple[str, List[Offset]]]) -> List[str]:
    """
    Convert annotated texts to "token tag" lines using BIOES tags
    :param spacy_model: callable applied to each text to obtain its spaCy Doc
    :param data: list of tuple (text, offsets of the entities in that text)
    :return: one "<token> <tag>" line per token, plus a blank line after each text
    """
    # Flair uses BIOES and Spacy BILUO
    # BILUO for Begin, Inside, Last, Unit, Out
    # BIOES for Begin, Inside, Outside, End, Single
    biluo_to_bioes = {"L-": "E-", "U-": "S-"}
    result: List[str] = list()
    for text, offsets in data:
        doc: Doc = spacy_model(text)
        # remove duplicated offsets
        offsets = normalize_offsets(offsets=offsets)
        offset_tuples = list({offset.to_tuple() for offset in offsets})
        gold_annotations = GoldParse(doc, entities=offset_tuples)
        annotations: List[str] = gold_annotations.ner
        assert len(annotations) == len(doc)
        for word, tag in zip(doc, annotations):
            if tag == "-":
                tag = "O"  # replace unknown
            else:
                # Rewrite the scheme prefix only: a plain str.replace would
                # also alter an "L-" or "U-" occurring inside the entity label.
                prefix = tag[:2]
                tag = biluo_to_bioes.get(prefix, prefix) + tag[2:]
            result.append(f"{word} {tag}\n")
        # blank line separating this text from the next one
        result.append('\n')
    return result
示例2: convert_offsets_to_spacy_docs
# 需要导入模块: from spacy.tokens import doc [as 别名]
# 或者: from spacy.tokens.doc import Doc [as 别名]
def convert_offsets_to_spacy_docs(doc_annotated: List[Tuple[str, str, List[Offset]]]) -> List[Doc]:
    """
    Build one Spacy doc per annotated text, with its entities set from the offsets.
    An offset for which no span can be built is printed and left out.
    :param doc_annotated: list of tuple (case id, text, array of offsets)
    :return: list of spacy doc, in input order
    """
    nlp = get_empty_model(load_labels_for_training=False)
    converted: List[Doc] = []
    for position, (case_id, text, tags) in enumerate(doc_annotated):
        current: Doc = nlp.make_doc(text)
        spans = []
        for tag in tags:
            span = current.char_span(tag.start, tag.end, label=tag.type)
            if span is None:
                # char_span gave nothing for these characters: report and skip
                print("Issue in offset", "Index: " + str(position), "case: " + case_id,
                      text[tag.start:tag.end], text, sep="|")
                continue
            spans.append(span)
        current.ents = spans
        converted.append(current)
    return converted
示例3: __call__
# 需要导入模块: from spacy.tokens import doc [as 别名]
# 或者: from spacy.tokens.doc import Doc [as 别名]
def __call__(self, doc: Doc):
    """
    The spacy pipeline caller: run both matchers and add every match to the entities
    :param doc: The Doc token.
    :return: the same doc, with its ents extended
    """
    # phrase matches first, then the matches of the second matcher
    found = self.phrase_matcher(doc) + self.matcher(doc)
    for match_id, start, end in found:
        # each match is appended to the entities as a (match_id, start, end) tuple
        doc.ents += ((match_id, start, end),)
    return doc
# add factories
示例4: tokenize
# 需要导入模块: from spacy.tokens import doc [as 别名]
# 或者: from spacy.tokens.doc import Doc [as 别名]
def tokenize(self, doc: 'Doc') -> typing.List[Token]:
    """Wrap every element of *doc* in a Token built from its text and its idx."""
    tokens = []
    for item in doc:
        tokens.append(Token(item.text, item.idx))
    return tokens
示例5: doc_for_text
# 需要导入模块: from spacy.tokens import doc [as 别名]
# 或者: from spacy.tokens.doc import Doc [as 别名]
def doc_for_text(self, text: Text) -> 'Doc':
    """
    Run self.nlp on the text, lower-cased first unless the component
    config enables "case_sensitive".
    :param text: text to process
    :return: whatever self.nlp returns for the (possibly lower-cased) text
    """
    keep_case = self.component_config.get("case_sensitive")
    return self.nlp(text if keep_case else text.lower())
示例6: extract_entities
# 需要导入模块: from spacy.tokens import doc [as 别名]
# 或者: from spacy.tokens.doc import Doc [as 别名]
def extract_entities(doc: 'Doc') -> List[Dict[Text, Any]]:
    """
    Describe every entity of the doc as a dictionary
    :param doc: object exposing its entities through .ents
    :return: one dict per entity with its label, text and character span;
        "confidence" is always None
    """
    entities = []
    for ent in doc.ents:
        entities.append({
            "entity": ent.label_,
            "value": ent.text,
            "start": ent.start_char,
            "confidence": None,
            "end": ent.end_char,
        })
    return entities
示例7: __call__
# 需要导入模块: from spacy.tokens import doc [as 别名]
# 或者: from spacy.tokens.doc import Doc [as 别名]
def __call__(self, doc: Doc):
    """
    Attach a Wordnet object to every token of the doc
    :param doc: the doc whose tokens are annotated
    :return: the same doc
    """
    for token in doc:
        # one Wordnet per token, built with this annotator's language
        token_wordnet = Wordnet(token=token, lang=self.__lang)
        # stored on the token under the annotator's field name
        token._.set(WordnetAnnotator.__FIELD, token_wordnet)
    return doc
示例8: doc_to_fixed_tokens
# 需要导入模块: from spacy.tokens import doc [as 别名]
# 或者: from spacy.tokens.doc import Doc [as 别名]
def doc_to_fixed_tokens(doc: SpacyDoc) -> List[str]:
    """Apply fix_token to every token of the document, in order."""
    return list(map(fix_token, doc))
示例9: convert_bilou_with_missing_action
# 需要导入模块: from spacy.tokens import doc [as 别名]
# 或者: from spacy.tokens.doc import Doc [as 别名]
def convert_bilou_with_missing_action(doc: Doc, offsets: List[Tuple[int, int, str]]) -> List[Optional[str]]:
    """
    Compute the BILOU tags of the doc and swap every tag mentioning the
    unknown type for the missing value, so that no Loss is applied to these tokens
    https://spacy.io/api/goldparse#biluo_tags_from_offsets
    :param doc: text tokenized by Spacy
    :param offsets: original offsets
    :return: list of BILOU types
    """
    converted: List[Optional[str]] = []
    for tag in biluo_tags_from_offsets(doc, offsets):
        # substring test: the tag is "<prefix>-<type name>"
        is_unknown = unknown_type_name in tag
        converted.append(no_action_bilou if is_unknown else tag)
    return converted
示例10: convert_unknown_bilou
# 需要导入模块: from spacy.tokens import doc [as 别名]
# 或者: from spacy.tokens.doc import Doc [as 别名]
def convert_unknown_bilou(doc: Doc, offsets: List[Offset]) -> GoldParse:
    """
    Convert entity offsets to BILOU annotations, with the UNKNOWN label
    converted to Spacy missing values, and wrap them in a GoldParse
    https://spacy.io/api/goldparse#biluo_tags_from_offsets
    :param doc: spacy tokenized text
    :param offsets: discovered offsets
    :return: GoldParse built from the doc and its BILOU annotations
    """
    entity_tuples = [entity.to_tuple() for entity in offsets]
    tags = convert_bilou_with_missing_action(doc=doc, offsets=entity_tuples)
    return GoldParse(doc, entities=tags)
示例11: convert_unknown_bilou_bulk
# 需要导入模块: from spacy.tokens import doc [as 别名]
# 或者: from spacy.tokens.doc import Doc [as 别名]
def convert_unknown_bilou_bulk(docs: List[Doc], offsets: List[List[Offset]]) -> List[GoldParse]:
    """
    Apply convert_unknown_bilou to each doc paired with its own offsets.
    Docs and offsets are paired with zip, so extra items of the longer list are ignored.
    https://spacy.io/api/goldparse#biluo_tags_from_offsets
    :param docs: spacy tokenized text
    :param offsets: discovered offsets, one list per doc
    :return: list of GoldParse, one per (doc, offsets) pair
    """
    return [convert_unknown_bilou(doc=doc, offsets=doc_offsets)
            for doc, doc_offsets in zip(docs, offsets)]
示例12: test_bilou_conv
# 需要导入模块: from spacy.tokens import doc [as 别名]
# 或者: from spacy.tokens.doc import Doc [as 别名]
def test_bilou_conv():
    """Tokens under an UNKNOWN offset get the '-' tag; a PERS offset gets a regular tag."""
    doc: Doc = pytest.nlp.make_doc("Ceci est un test.")

    # UNKNOWN on the second token, through both the single and the bulk converter
    unknown_second = [Offset(5, 8, "UNKNOWN")]
    second_missing = ['O', '-', 'O', 'O', 'O']
    assert convert_unknown_bilou(doc, offsets=unknown_second).ner == second_missing
    assert convert_unknown_bilou_bulk([doc], [unknown_second])[0].ner == second_missing

    # same characters with a known type
    pers_second = [Offset(5, 8, "PERS")]
    assert convert_unknown_bilou(doc, offsets=pers_second).ner == ['O', 'U-PERS', 'O', 'O', 'O']

    # UNKNOWN on the first token
    unknown_first = [Offset(0, 4, "UNKNOWN")]
    assert convert_unknown_bilou(doc, offsets=unknown_first).ner == ['-', 'O', 'O', 'O', 'O']
示例13: test_score
# 需要导入模块: from spacy.tokens import doc [as 别名]
# 或者: from spacy.tokens.doc import Doc [as 别名]
def test_score():
    """A prediction shorter than the expected span scores 0, the exact span scores 100."""
    text = "Le Président, Le Commis-Greffier, Jean-Paul I FFELLI Nelly DUBAS"
    doc: Doc = pytest.nlp.make_doc(text)
    gold: GoldParse = GoldParse(doc, entities=[(34, 64, "PERS")])

    # prediction stopping at character 58 instead of 64
    partial_span = doc.char_span(34, 58, "PERS")
    doc.ents = [partial_span]
    scorer: Scorer = Scorer()
    scorer.score(doc, gold)
    assert scorer.ents_per_type == {'PERS': {'p': 0.0, 'r': 0.0, 'f': 0.0}}

    # prediction covering exactly the expected characters, scored with a fresh Scorer
    exact_span = doc.char_span(34, 64, "PERS")
    doc.ents = [exact_span]
    scorer = Scorer()
    scorer.score(doc, gold)
    assert scorer.ents_per_type == {'PERS': {'p': 100.0, 'r': 100.0, 'f': 100.0}}
示例14: test_set_span
# 需要导入模块: from spacy.tokens import doc [as 别名]
# 或者: from spacy.tokens.doc import Doc [as 别名]
def test_set_span():
    """Spans taken at the same position of two separate docs share their text but stay distinct set members."""
    text = "Le Président, Le Commis-Greffier, Jean-Paul I FFELLI Nelly DUBAS"
    first_doc: Doc = pytest.nlp.make_doc(text)
    second_doc: Doc = pytest.nlp.make_doc(text)
    first_span = first_doc.char_span(34, 58, "PERS")
    second_span = second_doc.char_span(34, 58, "PERS")

    # identical text: nothing remains in the symmetric difference
    text_difference = {first_span.text} ^ {second_span.text}
    assert text_difference == set()

    # the span objects themselves do not collapse into one set member
    span_difference = {first_span} ^ {second_span}
    assert len(span_difference) > 0
示例15: tokenize
# 需要导入模块: from spacy.tokens import doc [as 别名]
# 或者: from spacy.tokens.doc import Doc [as 别名]
def tokenize(self, doc):
    # type: (Doc) -> List[Token]
    """Wrap every element of *doc* in a Token built from its text and its idx."""
    tokens = []
    for item in doc:
        tokens.append(Token(item.text, item.idx))
    return tokens