This article collects typical usage examples of the Python method spacy.tokens.Doc.retokenize. If you have been wondering how exactly Doc.retokenize is used, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples for the containing class, spacy.tokens.Doc.
Below are 12 code examples of the Doc.retokenize method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
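Before the test examples, here is a minimal, self-contained sketch of the basic pattern: Doc.retokenize() returns a context-managed retokenizer, and the merge and split calls queued inside the with block are applied in bulk when the block exits. The words and lemma value below are illustrative only, not taken from the examples that follow.

from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["New", "York", "is", "big"])
with doc.retokenize() as retokenizer:
    # Merge the first two tokens into one; the change is applied when the
    # context manager exits.
    retokenizer.merge(doc[0:2], attrs={"lemma": "new york"})
assert len(doc) == 3
assert doc[0].text == "New York"
assert doc[0].lemma_ == "new york"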
Example 1: test_doc_retokenize_spans_entity_merge_iob
# Required imports:
from spacy.tokens import Doc
from spacy.vocab import Vocab
def test_doc_retokenize_spans_entity_merge_iob():
    # Test entity IOB stays consistent after merging
    words = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [
        (doc.vocab.strings.add("ent-abc"), 0, 3),
        (doc.vocab.strings.add("ent-d"), 3, 4),
    ]
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    assert doc[2].ent_iob_ == "I"
    assert doc[3].ent_iob_ == "B"
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:1])
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [
        (doc.vocab.strings.add("ent-de"), 3, 5),
        (doc.vocab.strings.add("ent-fg"), 5, 7),
    ]
    assert doc[3].ent_iob_ == "B"
    assert doc[4].ent_iob_ == "I"
    assert doc[5].ent_iob_ == "B"
    assert doc[6].ent_iob_ == "I"
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[2:4])
        retokenizer.merge(doc[4:6])
        retokenizer.merge(doc[7:9])
    assert len(doc) == 6
    assert doc[3].ent_iob_ == "B"
    assert doc[4].ent_iob_ == "I"
Example 2: test_doc_retokenize_split_heads_error
# Required imports (`en_vocab` is a pytest fixture from spaCy's test suite conftest):
import pytest
from spacy.tokens import Doc
def test_doc_retokenize_split_heads_error(en_vocab):
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    # Not enough heads
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1]])
    # Too many heads
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1], doc[1], doc[1]])
Example 3: test_issue1547
# Required imports:
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
def test_issue1547():
    """Test that entity labels still match after merging tokens."""
    words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[5:7])
    assert [ent.text for ent in doc.ents]
Example 4: test_doc_retokenize_merge_extension_attrs_invalid
# Required imports (`underscore_attrs` is supplied by a pytest.mark.parametrize
# decorator in the original test suite, omitted here):
import pytest
from spacy.tokens import Doc, Token
def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[0:2], attrs=attrs)
Example 5: test_doc_retokenize_split_extension_attrs_invalid
# Required imports (`underscore_attrs` is again supplied by a parametrize decorator):
import pytest
from spacy.tokens import Doc, Token
def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("x", default=False, force=True)
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            heads = [(doc[0], 1), doc[1]]
            retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
Example 6: test_doc_retokenize_merge_extension_attrs
# Required imports:
from spacy.tokens import Doc, Token
def test_doc_retokenize_merge_extension_attrs(en_vocab):
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    # Test regular merging
    with doc.retokenize() as retokenizer:
        attrs = {"lemma": "hello world", "_": {"a": True, "b": "1"}}
        retokenizer.merge(doc[0:2], attrs=attrs)
    assert doc[0].lemma_ == "hello world"
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    # Test bulk merging
    doc = Doc(en_vocab, words=["hello", "world", "!", "!"])
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"_": {"a": True, "b": "1"}})
        retokenizer.merge(doc[2:4], attrs={"_": {"a": None, "b": "2"}})
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1]._.a is None
    assert doc[1]._.b == "2"
Example 7: test_doc_retokenize_split_orths_mismatch
# Required imports:
import pytest
from spacy.tokens import Doc
def test_doc_retokenize_split_orths_mismatch(en_vocab):
    """Test that the regular retokenizer.split raises an error if the orths
    don't match the original token text. There might still be a method that
    allows this, but for the default use cases, merging and splitting should
    always conform with spaCy's non-destructive tokenization policy. Otherwise,
    it can lead to very confusing and unexpected results.
    """
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["L", "A"], [(doc[0], 0), (doc[0], 0)])
Example 8: test_doc_retokenize_spans_entity_split_iob
# Required imports:
from spacy.tokens import Doc
from spacy.vocab import Vocab
def test_doc_retokenize_spans_entity_split_iob():
    # Test entity IOB stays consistent after splitting
    words = ["abc", "d", "e"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [(doc.vocab.strings.add("ent-abcd"), 0, 2)]
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["a", "b", "c"], [(doc[0], 1), (doc[0], 2), doc[1]])
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    assert doc[2].ent_iob_ == "I"
    assert doc[3].ent_iob_ == "I"
Example 9: test_doc_retokenize_split_dependencies
# Required imports:
from spacy.tokens import Doc
def test_doc_retokenize_split_dependencies(en_vocab):
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    dep1 = doc.vocab.strings.add("amod")
    dep2 = doc.vocab.strings.add("subject")
    with doc.retokenize() as retokenizer:
        retokenizer.split(
            doc[0],
            ["Los", "Angeles"],
            [(doc[0], 1), doc[1]],
            attrs={"dep": [dep1, dep2]},
        )
    assert doc[0].dep == dep1
    assert doc[1].dep == dep2
Example 10: test_doc_retokenizer_merge_lex_attrs
# Required imports:
from spacy.tokens import Doc
def test_doc_retokenizer_merge_lex_attrs(en_vocab):
    """Test that retokenization also sets attributes on the lexeme if they're
    lexical attributes. For example, if a user sets IS_STOP, it should mean that
    "all tokens with that lexeme" are marked as a stop word, so the ambiguity
    here is acceptable. Also see #2390.
    """
    # Test regular merging
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    assert not any(t.is_stop for t in doc)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"lemma": "hello world", "is_stop": True})
    assert doc[0].lemma_ == "hello world"
    assert doc[0].is_stop
    # Test bulk merging
    doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"])
    assert not any(t.like_num for t in doc)
    assert not any(t.is_stop for t in doc)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"like_num": True})
        retokenizer.merge(doc[2:4], attrs={"is_stop": True})
    assert doc[0].like_num
    assert doc[1].is_stop
    assert not doc[0].is_stop
    assert not doc[1].like_num
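Because IS_STOP is a lexical attribute, the flag set during merging lives on the shared lexeme rather than only on the merged token, as the docstring above explains. A minimal sketch of that consequence, with an illustrative vocabulary and words (the lexeme-level check is an assumption based on the behavior the docstring describes):

from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
doc = Doc(vocab, words=["hello", "world", "!"])
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:2], attrs={"is_stop": True})
# The flag is stored on the lexeme for the merged text, so it is visible
# through the shared vocabulary, not just on this one token.
assert doc[0].is_stop
assert vocab["hello world"].is_stop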
Example 11: test_doc_retokenize_split_extension_attrs
# Required imports:
from spacy.tokens import Doc, Token
def test_doc_retokenize_split_extension_attrs(en_vocab):
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    with doc.retokenize() as retokenizer:
        heads = [(doc[0], 1), doc[1]]
        underscore = [{"a": True, "b": "1"}, {"b": "2"}]
        attrs = {"lemma": ["los", "angeles"], "_": underscore}
        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
    assert doc[0].lemma_ == "los"
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1].lemma_ == "angeles"
    assert doc[1]._.a is False
    assert doc[1]._.b == "2"
Example 12: test_doc_retokenizer_split_lex_attrs
# Required imports:
from spacy.tokens import Doc
def test_doc_retokenizer_split_lex_attrs(en_vocab):
    """Test that retokenization also sets attributes on the lexeme if they're
    lexical attributes. For example, if a user sets IS_STOP, it should mean that
    "all tokens with that lexeme" are marked as a stop word, so the ambiguity
    here is acceptable. Also see #2390.
    """
    assert not Doc(en_vocab, words=["Los"])[0].is_stop
    assert not Doc(en_vocab, words=["Angeles"])[0].is_stop
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    assert not doc[0].is_stop
    with doc.retokenize() as retokenizer:
        attrs = {"is_stop": [True, False]}
        heads = [(doc[0], 1), doc[1]]
        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
    assert doc[0].is_stop
    assert not doc[1].is_stop
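In practice the same pattern is applied to documents produced by a full pipeline rather than bare Doc objects. A hedged sketch, assuming the en_core_web_sm model is installed locally; the sentence and lemma value are illustrative only:

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Los Angeles is in California")
with doc.retokenize() as retokenizer:
    # Merge the two-token city name into a single token and assign it a lemma.
    retokenizer.merge(doc[0:2], attrs={"lemma": "Los Angeles"})
print([(t.text, t.lemma_) for t in doc])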