This article collects typical usage examples of the spacy.gold.GoldParse method in Python. If you are wondering what gold.GoldParse does, how to call it, or what working examples look like, the curated code samples below may help. You can also explore further usage examples from the module it belongs to, spacy.gold.
The following presents 11 code examples of the gold.GoldParse method, sorted by popularity by default.
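Before the examples, here is a minimal sketch of the API they all rely on. It assumes spaCy 2.x (the spacy.gold module and GoldParse were removed in spaCy 3.0); the model name and sample sentence are only for illustration:

import spacy
from spacy.gold import GoldParse  # spaCy 2.x only; removed in spaCy 3.0

nlp = spacy.load("en_core_web_sm")  # assumed model; any English pipeline works
doc = nlp("Alice moved to Berlin.")

# Character-level entity offsets: (start_char, end_char, label)
entity_offsets = [(0, 5, "PERSON"), (15, 21, "GPE")]
gold = GoldParse(doc, entities=entity_offsets)

# gold.ner holds one BILUO tag per token,
# e.g. ['U-PERSON', 'O', 'O', 'U-GPE', 'O'] for this sentence
print(gold.ner)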
Example 1: jsonToCrf
# Required module: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def jsonToCrf(self, json_eg, spacy_nlp):
    entity_offsets = []
    for sentence in json_eg['text']:
        doc = spacy_nlp(sentence)
        for i in json_eg['entities']:
            entity_offsets.append(tuple((i['rangeFrom'], i['rangeTo'], i['entity'])))
        gold = GoldParse(doc, entities=entity_offsets)
        ents = [l[5] for l in gold.orig_annot]
        crf_format = [(doc[i].text, doc[i].tag_, ents[i]) for i in range(len(doc))]
    return crf_format
Example 2: convert_to_flair_format
# Required module: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def convert_to_flair_format(spacy_model: Language, data: List[Tuple[str, List[Offset]]]) -> List[str]:
    result: List[str] = list()
    for text, offsets in data:
        doc: Doc = spacy_model(text)
        # remove duplicated offsets
        offsets = normalize_offsets(offsets=offsets)
        offset_tuples = list(set([offset.to_tuple() for offset in offsets]))
        gold_annotations = GoldParse(doc, entities=offset_tuples)
        annotations: List[str] = gold_annotations.ner
        assert len(annotations) == len(doc)
        # Flair uses BIOES and Spacy BILUO
        # BILUO for Begin, Inside, Last, Unit, Out
        # BIOES for Begin, Inside, Outside, End, Single
        annotations = [a.replace('L-', 'E-') for a in annotations]
        annotations = [a.replace('U-', 'S-') for a in annotations]
        annotations = ["O" if a == "-" else a for a in annotations]  # replace unknown
        result += [f"{word} {tag}\n" for word, tag in zip(doc, annotations)]
        result.append('\n')
    return result
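To make the tag-scheme conversion above concrete, here is a tiny self-contained sketch of the same BILUO-to-BIOES relabelling; the input tag sequence is invented for illustration:

# Hypothetical BILUO tags, as GoldParse(...).ner would return them
biluo_tags = ["B-PER", "L-PER", "O", "U-LOC", "-"]

# Same substitutions as in convert_to_flair_format above
bioes_tags = ["O" if t == "-" else t.replace("L-", "E-").replace("U-", "S-")
              for t in biluo_tags]
print(bioes_tags)  # ['B-PER', 'E-PER', 'O', 'S-LOC', 'O']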
Example 3: _from_json_to_crf
# Required module: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def _from_json_to_crf(self,
                      message,  # type: Message
                      entity_offsets  # type: List[Tuple[int, int, Text]]
                      ):
    # type: (...) -> List[Tuple[Text, Text, Text, Text]]
    """Convert json examples to format of underlying crfsuite."""
    from spacy.gold import GoldParse

    doc = message.get("spacy_doc")
    gold = GoldParse(doc, entities=entity_offsets)
    ents = [l[5] for l in gold.orig_annot]
    if '-' in ents:
        logger.warn("Misaligned entity annotation in sentence '{}'. "
                    "Make sure the start and end values of the "
                    "annotated training examples end at token "
                    "boundaries (e.g. don't include trailing "
                    "whitespaces).".format(doc.text))
    if not self.component_config["BILOU_flag"]:
        for i, label in enumerate(ents):
            if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                # removes BILOU prefix from label
                ents[i] = self._entity_from_label(label)
    return self._from_text_to_crf(message, ents)
Example 4: evaluate
# Required module: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def evaluate(tokenizer, nlp, valid_data, labels):
    """Evaluate model performance on a test dataset."""
    texts, cats = zip(*valid_data)
    golds = []
    # Use the model's ops module
    # to make sure this is compatible with GPU (cupy array)
    # or without (numpy array)
    scores = np.zeros((len(cats), len(labels)), dtype="f")
    if is_transformer(nlp):
        textcat = nlp.get_pipe(PIPES.textcat)
    else:
        textcat = nlp.get_pipe("textcat")
    scores = textcat.model.ops.asarray(scores)
    num_correct = 0
    for i, doc in enumerate(nlp.pipe(texts)):
        gold_cats = cats[i]["cats"]
        for j, (label, score) in enumerate(doc.cats.items()):
            if label not in gold_cats:
                raise ValueError(f"Prediction for unexpected label: {label}")
            scores[i, j] = score
            doc_prediction = score > 0.5
            if doc_prediction == bool(gold_cats[label]):
                num_correct += 1
        golds.append(GoldParse(doc, cats=gold_cats))
    accuracy = num_correct / ((len(texts) * len(labels)) + 1e-8)
    loss, _ = textcat.get_loss(texts, golds, scores)
    return accuracy, loss
Example 5: _from_json_to_crf
# Required module: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def _from_json_to_crf(self,
                      message: Message,
                      entity_offsets: List[Tuple[int, int, Text]]
                      ) -> List[Tuple[Text, Text, Text, Text]]:
    """Convert json examples to format of underlying crfsuite."""
    if self.pos_features:
        from spacy.gold import GoldParse

        doc = message.get("spacy_doc")
        gold = GoldParse(doc, entities=entity_offsets)
        ents = [l[5] for l in gold.orig_annot]
    else:
        tokens = message.get("tokens")
        ents = self._bilou_tags_from_offsets(tokens, entity_offsets)

    if '-' in ents:
        logger.warning("Misaligned entity annotation in sentence '{}'. "
                       "Make sure the start and end values of the "
                       "annotated training examples end at token "
                       "boundaries (e.g. don't include trailing "
                       "whitespaces or punctuation)."
                       "".format(message.text))
    if not self.component_config["BILOU_flag"]:
        for i, label in enumerate(ents):
            if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                # removes BILOU prefix from label
                ents[i] = self._entity_from_label(label)
    return self._from_text_to_crf(message, ents)
Example 6: _bilou_tags_from_offsets
# Required module: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def _bilou_tags_from_offsets(tokens, entities, missing='O'):
    # From spacy.gold.GoldParse, under MIT License
    starts = {token.offset: i for i, token in enumerate(tokens)}
    ends = {token.end: i for i, token in enumerate(tokens)}
    bilou = ['-' for _ in tokens]
    # Handle entity cases
    for start_char, end_char, label in entities:
        start_token = starts.get(start_char)
        end_token = ends.get(end_char)
        # Only interested if the tokenization is correct
        if start_token is not None and end_token is not None:
            if start_token == end_token:
                bilou[start_token] = 'U-%s' % label
            else:
                bilou[start_token] = 'B-%s' % label
                for i in range(start_token + 1, end_token):
                    bilou[i] = 'I-%s' % label
                bilou[end_token] = 'L-%s' % label
    # Now distinguish the O cases from ones where we miss the tokenization
    entity_chars = set()
    for start_char, end_char, label in entities:
        for i in range(start_char, end_char):
            entity_chars.add(i)
    for n, token in enumerate(tokens):
        for i in range(token.offset, token.end):
            if i in entity_chars:
                break
        else:
            bilou[n] = missing
    return bilou
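A quick way to see what this helper produces (a standalone sketch: Tok is a hypothetical stand-in that only mimics the .offset/.end attributes the real tokens are assumed to expose, and the function above is assumed to be in scope):

from collections import namedtuple

Tok = namedtuple("Tok", ["offset", "end"])  # hypothetical token stand-in

# "Berlin is nice" -> character offsets per token
tokens = [Tok(0, 6), Tok(7, 9), Tok(10, 14)]

# Aligned entity: ['U-LOC', 'O', 'O']
print(_bilou_tags_from_offsets(tokens, [(0, 6, "LOC")]))

# Misaligned entity (ends mid-token): ['-', 'O', 'O'] -- the mismatch is kept as '-'
print(_bilou_tags_from_offsets(tokens, [(0, 4, "LOC")]))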
Example 7: createDataset
# Required module: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def createDataset(self, intents, spacy_nlp):
    dataset = []
    entity_offsets = []
    intentCounter = 0
    for intent in intents:
        sentenceCounter = 0
        for sentence in intent['text']:
            doc = spacy_nlp(sentence)
            print(doc.text)
            for entity in intent['entities'][sentenceCounter]:
                entity_offsets.append(tuple((entity['rangeFrom'], entity['rangeTo'], entity['entity'])))
            gold = GoldParse(doc, entities=entity_offsets)
            ents = [l[5] for l in gold.orig_annot]
            # one (token text, POS tag, entity tag) triple per token
            crf_format = [(doc[i].text, doc[i].tag_, ents[i]) for i in range(len(doc))]
            dataset.append(crf_format)
            sentenceCounter = sentenceCounter + 1
        intentCounter = intentCounter + 1
    return dataset
Example 8: convert_unknown_bilou
# Required module: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def convert_unknown_bilou(doc: Doc, offsets: List[Offset]) -> GoldParse:
    """
    Convert entity offsets to a list of BILOU annotations
    and convert the UNKNOWN label to Spacy missing values
    https://spacy.io/api/goldparse#biluo_tags_from_offsets
    :param doc: spacy tokenized text
    :param offsets: discovered offsets
    :return: GoldParse built from the BILOU annotations
    """
    tupple_offset = [offset.to_tuple() for offset in offsets]
    bilou_annotations = convert_bilou_with_missing_action(doc=doc, offsets=tupple_offset)
    return GoldParse(doc, entities=bilou_annotations)
Example 9: convert_unknown_bilou_bulk
# Required module: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def convert_unknown_bilou_bulk(docs: List[Doc], offsets: List[List[Offset]]) -> List[GoldParse]:
    """
    Convert lists of entity offsets to BILOU annotations
    and convert the UNKNOWN label to Spacy missing values
    https://spacy.io/api/goldparse#biluo_tags_from_offsets
    :param docs: spacy tokenized texts
    :param offsets: discovered offsets
    :return: list of GoldParse, one per doc
    """
    list_of_gold_parse = list()
    for doc, current_offsets in zip(docs, offsets):
        bilou_annotations = convert_unknown_bilou(doc=doc,
                                                  offsets=current_offsets)
        list_of_gold_parse.append(bilou_annotations)
    return list_of_gold_parse
Example 10: test_tokenizer
# Required module: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def test_tokenizer():
    doc: Doc = pytest.nlp.make_doc("Ceci est un test.")
    offsets = [(0, 4, "PERS"), (9, 11, "PERS")]
    gold: GoldParse = GoldParse(doc, entities=offsets)
    word_extracted = [doc.char_span(o[0], o[1]) for o in offsets]
    count_ent = sum([1 for item in gold.ner if item != "O"])
    assert count_ent == len(word_extracted)

    offsets = [(0, 4, "PERS"), (9, 12, "PERS")]
    gold: GoldParse = GoldParse(doc, entities=offsets)
    word_extracted = [doc.char_span(o[0], o[1]) for o in offsets if doc.char_span(o[0], o[1]) is not None]
    count_ent = sum([1 for item in gold.ner if item != "O"])
    assert count_ent > len(word_extracted)
Example 11: test_score
# Required module: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def test_score():
    s = "Le Président, Le Commis-Greffier, Jean-Paul I FFELLI Nelly DUBAS"
    doc: Doc = pytest.nlp.make_doc(s)
    expected_span: GoldParse = GoldParse(doc, entities=[(34, 64, "PERS")])
    predicted_span = doc.char_span(34, 58, "PERS")
    doc.ents = [predicted_span]
    score: Scorer = Scorer()
    score.score(doc, expected_span)
    assert score.ents_per_type == dict([('PERS', {'p': 0.0, 'r': 0.0, 'f': 0.0})])

    predicted_span = doc.char_span(34, 64, "PERS")
    doc.ents = [predicted_span]
    score: Scorer = Scorer()
    score.score(doc, expected_span)
    assert score.ents_per_type == dict([('PERS', {'p': 100.0, 'r': 100.0, 'f': 100.0})])