

Python gold.GoldParse Method Code Examples

This article collects typical usage examples of the Python method spacy.gold.GoldParse. If you are wondering what spacy.gold.GoldParse does, how to call it, or what it looks like in real projects, the curated code examples below should help. You can also explore further usage examples from the spacy.gold module.


The following presents 11 code examples of gold.GoldParse, drawn from open-source projects and ordered by popularity.
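As background, GoldParse belongs to the spaCy 2.x training API (in spaCy 3.x it was replaced by spacy.training.Example). Below is a minimal sketch of the call pattern used throughout the examples, with illustrative text and labels:

# Minimal sketch, assuming spaCy 2.x where spacy.gold.GoldParse is available.
import spacy
from spacy.gold import GoldParse

nlp = spacy.blank("en")
doc = nlp("John lives in Berlin")

# Entities are character-level offsets: (start, end, label)
gold = GoldParse(doc, entities=[(0, 4, "PERSON"), (14, 20, "GPE")])

# gold.ner holds one BILUO tag per token
print(gold.ner)  # ['U-PERSON', 'O', 'O', 'U-GPE']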

Example 1: jsonToCrf

# Required import: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def jsonToCrf(self, json_eg, spacy_nlp):

        # Character offsets are defined once per training example,
        # so collect them before iterating over the sentences
        entity_offsets = []

        for i in json_eg['entities']:

            entity_offsets.append((i['rangeFrom'], i['rangeTo'], i['entity']))

        for sentence in json_eg['text']:

            doc = spacy_nlp(sentence)

            gold = GoldParse(doc, entities=entity_offsets)
            # column 5 of gold.orig_annot holds the BILOU entity tag
            ents = [l[5] for l in gold.orig_annot]
            crf_format = [(doc[i].text, doc[i].tag_, ents[i]) for i in range(len(doc))]

        return crf_format 
Author: GeniSysAI, Project: NLU, Lines: 19, Source: CRF.py

Example 2: convert_to_flair_format

# Required import: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def convert_to_flair_format(spacy_model: Language, data: List[Tuple[str, List[Offset]]]) -> List[str]:
    result: List[str] = list()
    for text, offsets in data:
        doc: Doc = spacy_model(text)
        # remove duplicated offsets
        offsets = normalize_offsets(offsets=offsets)
        offset_tuples = list(set([offset.to_tuple() for offset in offsets]))
        gold_annotations = GoldParse(doc, entities=offset_tuples)
        annotations: List[str] = gold_annotations.ner
        assert len(annotations) == len(doc)
        # Flair uses BIOES and Spacy BILUO
        # BILUO for Begin, Inside, Last, Unit, Out
        # BIOES for Begin, Inside, Outside, End, Single
        annotations = [a.replace('L-', 'E-') for a in annotations]
        annotations = [a.replace('U-', 'S-') for a in annotations]
        annotations = ["O" if a == "-" else a for a in annotations]  # replace unknown
        result += [f"{word} {tag}\n" for word, tag in zip(doc, annotations)]
        result.append('\n')
    return result 
Author: ELS-RD, Project: anonymisation, Lines: 21, Source: import_annotations.py

Example 3: _from_json_to_crf

# Required import: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def _from_json_to_crf(self,
                          message,  # type: Message
                          entity_offsets  # type: List[Tuple[int, int, Text]]
                          ):
        # type: (...) -> List[Tuple[Text, Text, Text, Text]]
        """Convert json examples to format of underlying crfsuite."""
        from spacy.gold import GoldParse

        doc = message.get("spacy_doc")
        gold = GoldParse(doc, entities=entity_offsets)
        ents = [l[5] for l in gold.orig_annot]
        if '-' in ents:
            logger.warn("Misaligned entity annotation in sentence '{}'. "
                        "Make sure the start and end values of the "
                        "annotated training examples end at token "
                        "boundaries (e.g. don't include trailing "
                        "whitespaces).".format(doc.text))
        if not self.component_config["BILOU_flag"]:
            for i, label in enumerate(ents):
                if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                    # removes BILOU prefix from label
                    ents[i] = self._entity_from_label(label)

        return self._from_text_to_crf(message, ents) 
Author: crownpku, Project: Rasa_NLU_Chi, Lines: 26, Source: crf_entity_extractor.py
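The '-' check above guards against annotations whose character offsets do not line up with spaCy's tokenization. A small sketch of how such a misalignment surfaces, assuming spaCy 2.x and made-up offsets:

# Sketch: an offset that ends mid-token yields "-" instead of a BILOU tag.
import spacy
from spacy.gold import GoldParse

nlp = spacy.blank("en")
doc = nlp("hello world")

gold = GoldParse(doc, entities=[(0, 3, "X")])  # "hel" does not cover a full token
print(gold.ner)  # ['-', 'O']  -> misaligned entity annotation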

Example 4: evaluate

# Required import: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def evaluate(tokenizer, nlp, valid_data, labels):
    """Evaluate model performance on a test dataset."""
    texts, cats = zip(*valid_data)

    golds = []
    # Use the model's ops module
    # to make sure this is compatible with GPU (cupy array)
    # or without (numpy array)
    scores = np.zeros((len(cats), len(labels)), dtype="f")
    if is_transformer(nlp):
        textcat = nlp.get_pipe(PIPES.textcat)
    else:
        textcat = nlp.get_pipe("textcat")
    scores = textcat.model.ops.asarray(scores)

    num_correct = 0
    for i, doc in enumerate(nlp.pipe(texts)):
        gold_cats = cats[i]["cats"]
        for j, (label, score) in enumerate(doc.cats.items()):
            if label not in gold_cats:
                raise ValueError(f"Prediction for unexpected label: {label}")

            scores[i, j] = score

            doc_prediction = score > 0.5
            if doc_prediction == bool(gold_cats[label]):
                num_correct += 1

        golds.append(GoldParse(doc, cats=gold_cats))

    accuracy = num_correct / ((len(texts) * len(labels)) + 1e-8)
    loss, _ = textcat.get_loss(texts, golds, scores)

    return accuracy, loss 
Author: RTIInternational, Project: gobbli, Lines: 36, Source: run_spacy.py
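For the text-classification case above, GoldParse is built with cats rather than entities. A minimal sketch of that form, assuming spaCy 2.x and illustrative labels:

# Sketch: GoldParse for text classification takes a dict of label -> 0/1 (or float) values.
import spacy
from spacy.gold import GoldParse

nlp = spacy.blank("en")
doc = nlp("This movie was great")

gold = GoldParse(doc, cats={"POSITIVE": 1.0, "NEGATIVE": 0.0})
print(gold.cats)  # {'POSITIVE': 1.0, 'NEGATIVE': 0.0}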

Example 5: _from_json_to_crf

# Required import: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def _from_json_to_crf(self,
                          message: Message,
                          entity_offsets: List[Tuple[int, int, Text]]
                          ) -> List[Tuple[Text, Text, Text, Text]]:
        """Convert json examples to format of underlying crfsuite."""

        if self.pos_features:
            from spacy.gold import GoldParse

            doc = message.get("spacy_doc")
            gold = GoldParse(doc, entities=entity_offsets)
            ents = [l[5] for l in gold.orig_annot]
        else:
            tokens = message.get("tokens")
            ents = self._bilou_tags_from_offsets(tokens, entity_offsets)

        if '-' in ents:
            logger.warning("Misaligned entity annotation in sentence '{}'. "
                           "Make sure the start and end values of the "
                           "annotated training examples end at token "
                           "boundaries (e.g. don't include trailing "
                           "whitespaces or punctuation)."
                           "".format(message.text))
        if not self.component_config["BILOU_flag"]:
            for i, label in enumerate(ents):
                if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                    # removes BILOU prefix from label
                    ents[i] = self._entity_from_label(label)

        return self._from_text_to_crf(message, ents) 
Author: weizhenzhao, Project: rasa_nlu, Lines: 32, Source: crf_entity_extractor.py

Example 6: _bilou_tags_from_offsets

# Required import: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def _bilou_tags_from_offsets(tokens, entities, missing='O'):
        # From spacy.spacy.GoldParse, under MIT License
        starts = {token.offset: i for i, token in enumerate(tokens)}
        ends = {token.end: i for i, token in enumerate(tokens)}
        bilou = ['-' for _ in tokens]
        # Handle entity cases
        for start_char, end_char, label in entities:
            start_token = starts.get(start_char)
            end_token = ends.get(end_char)
            # Only interested if the tokenization is correct
            if start_token is not None and end_token is not None:
                if start_token == end_token:
                    bilou[start_token] = 'U-%s' % label
                else:
                    bilou[start_token] = 'B-%s' % label
                    for i in range(start_token + 1, end_token):
                        bilou[i] = 'I-%s' % label
                    bilou[end_token] = 'L-%s' % label
        # Now distinguish the O cases from ones where we miss the tokenization
        entity_chars = set()
        for start_char, end_char, label in entities:
            for i in range(start_char, end_char):
                entity_chars.add(i)
        for n, token in enumerate(tokens):
            for i in range(token.offset, token.end):
                if i in entity_chars:
                    break
            else:
                bilou[n] = missing

        return bilou 
Author: weizhenzhao, Project: rasa_nlu, Lines: 33, Source: crf_entity_extractor.py
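The hand-rolled helper above mirrors logic that spaCy 2.x also exposes directly as biluo_tags_from_offsets. A short sketch of that built-in, with illustrative text and offsets:

# Sketch: spaCy's own offset-to-BILOU conversion (spaCy 2.x).
import spacy
from spacy.gold import biluo_tags_from_offsets

nlp = spacy.blank("en")
doc = nlp("John lives in Berlin")

tags = biluo_tags_from_offsets(doc, [(14, 20, "GPE")])
print(tags)  # ['O', 'O', 'O', 'U-GPE']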

Example 7: createDataset

# Required import: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def createDataset(self, intents, spacy_nlp):
        
        dataset = []
        entity_offsets = []
        
        intentCounter = 0
        for intent in intents:
            
            sentenceCounter = 0
            for sentence in intent['text']:
                
                doc = spacy_nlp(sentence)
                print(doc.text)

                # Collect the offsets for the current sentence only, so offsets
                # from earlier sentences are not applied to this doc
                entity_offsets = []
                for entity in intent['entities'][sentenceCounter]:

                    entity_offsets.append((entity['rangeFrom'], entity['rangeTo'], entity['entity']))

                gold = GoldParse(doc, entities=entity_offsets)
                ents = [l[5] for l in gold.orig_annot]
                crf_format = [(doc[i].text, doc[i].tag_, ents[i]) for i in range(len(doc))]
                dataset.append(crf_format)
                sentenceCounter = sentenceCounter + 1
            
            intentCounter = intentCounter + 1

        return dataset 
Author: GeniSysAI, Project: NLU, Lines: 29, Source: CRF.py

Example 8: convert_unknown_bilou

# Required import: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def convert_unknown_bilou(doc: Doc, offsets: List[Offset]) -> GoldParse:
    """
    Convert entity offsets to list of BILOU annotations
    and convert UNKNOWN label to Spacy missing values
    https://spacy.io/api/goldparse#biluo_tags_from_offsets
    :param doc: spacy tokenized text
    :param offsets: discovered offsets
    :return: GoldParse built from the BILOU annotations
    """
    tupple_offset = [offset.to_tuple() for offset in offsets]
    bilou_annotations = convert_bilou_with_missing_action(doc=doc, offsets=tupple_offset)
    return GoldParse(doc, entities=bilou_annotations) 
Author: ELS-RD, Project: anonymisation, Lines: 14, Source: convert_to_bilou.py

Example 9: convert_unknown_bilou_bulk

# Required import: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def convert_unknown_bilou_bulk(docs: List[Doc], offsets: List[List[Offset]]) -> List[GoldParse]:
    """
    Convert list of entity offsets to list of BILOU annotations
    and convert UNKNOWN label to Spacy missing values
    https://spacy.io/api/goldparse#biluo_tags_from_offsets
    :param docs: spacy tokenized text
    :param offsets: discovered offsets
    :return: list of GoldParse objects, one per doc
    """
    list_of_gold_parse = list()
    for doc, current_offsets in zip(docs, offsets):
        bilou_annotations = convert_unknown_bilou(doc=doc,
                                                  offsets=current_offsets)
        list_of_gold_parse.append(bilou_annotations)
    return list_of_gold_parse 
Author: ELS-RD, Project: anonymisation, Lines: 17, Source: convert_to_bilou.py

Example 10: test_tokenizer

# Required import: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def test_tokenizer():
    doc: Doc = pytest.nlp.make_doc("Ceci est un test.")
    offsets = [(0, 4, "PERS"), (9, 11, "PERS")]
    gold: GoldParse = GoldParse(doc, entities=offsets)
    word_extracted = [doc.char_span(o[0], o[1]) for o in offsets]
    count_ent = sum([1 for item in gold.ner if item != "O"])
    assert count_ent == len(word_extracted)

    offsets = [(0, 4, "PERS"), (9, 12, "PERS")]
    gold: GoldParse = GoldParse(doc, entities=offsets)
    word_extracted = [doc.char_span(o[0], o[1]) for o in offsets if doc.char_span(o[0], o[1]) is not None]
    count_ent = sum([1 for item in gold.ner if item != "O"])
    assert count_ent > len(word_extracted) 
Author: ELS-RD, Project: anonymisation, Lines: 15, Source: spacy_annotations_test.py

Example 11: test_score

# Required import: from spacy import gold [as alias]
# Or: from spacy.gold import GoldParse [as alias]
def test_score():
    s = "Le Président, Le Commis-Greffier, Jean-Paul I FFELLI Nelly DUBAS"
    doc: Doc = pytest.nlp.make_doc(s)
    expected_span: GoldParse = GoldParse(doc, entities=[(34, 64, "PERS")])
    predicted_span = doc.char_span(34, 58, "PERS")
    doc.ents = [predicted_span]
    score: Scorer = Scorer()
    score.score(doc, expected_span)
    assert score.ents_per_type == dict([('PERS', {'p': 0.0, 'r': 0.0, 'f': 0.0})])

    predicted_span = doc.char_span(34, 64, "PERS")
    doc.ents = [predicted_span]
    score: Scorer = Scorer()
    score.score(doc, expected_span)
    assert score.ents_per_type == dict([('PERS', {'p': 100.0, 'r': 100.0, 'f': 100.0})]) 
Author: ELS-RD, Project: anonymisation, Lines: 17, Source: spacy_annotations_test.py


Note: the spacy.gold.GoldParse examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by the community; copyright of the source code remains with the original authors. Please consult each project's License before distributing or reusing the code, and do not reproduce this article without permission.