本文整理匯總了Python中spacy.matcher.Matcher類的典型用法代碼示例。如果您正苦於以下問題:Python Matcher類的具體用法?Python Matcher怎麽用?Python Matcher使用的例子?那麽, 這裏精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了Matcher類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: RussianTokenizer
class RussianTokenizer(object):
name = 'russian_tokenizer'
def __init__(self, nlp, merge_patterns=None, terminal_patterns=None):
self.matcher = Matcher(nlp.vocab)
self.token_merge = nlp.vocab.strings['pattern']
self.sentence_terminal = nlp.vocab.strings['sentence_terminal']
if merge_patterns:
self.matcher.add(self.token_merge, None, *merge_patterns)
if terminal_patterns:
self.matcher.add(self.sentence_terminal, None, *terminal_patterns)
def __call__(self, doc):
spans = []
for id, start, end in self.matcher(doc):
if id == self.token_merge:
spans.append(doc[start:end])
elif id == self.sentence_terminal:
# remove all sentence start marks from span that match pattern
for token in doc[start:end]:
if token.sent_start:
token.sent_start = False
if spans:
for span in spans:
span.merge()
return doc
示例2: write_conllu
def write_conllu(docs, file_):
merger = Matcher(docs[0].vocab)
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
for i, doc in enumerate(docs):
matches = merger(doc)
spans = [doc[start : end + 1] for _, start, end in matches]
with doc.retokenize() as retokenizer:
for span in spans:
retokenizer.merge(span)
file_.write("# newdoc id = {i}\n".format(i=i))
for j, sent in enumerate(doc.sents):
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
file_.write("# text = {text}\n".format(text=sent.text))
for k, token in enumerate(sent):
if token.head.i > sent[-1].i or token.head.i < sent[0].i:
for word in doc[sent[0].i - 10 : sent[0].i]:
print(word.i, word.head.i, word.text, word.dep_)
for word in sent:
print(word.i, word.head.i, word.text, word.dep_)
for word in doc[sent[-1].i : sent[-1].i + 10]:
print(word.i, word.head.i, word.text, word.dep_)
raise ValueError(
"Invalid parse: head outside sentence (%s)" % token.text
)
file_.write(token._.get_conllu_lines(k) + "\n")
file_.write("\n")
示例3: test_operator_combos
def test_operator_combos(en_vocab):
cases = [
("aaab", "a a a b", True),
("aaab", "a+ b", True),
("aaab", "a+ a+ b", True),
("aaab", "a+ a+ a b", True),
("aaab", "a+ a+ a+ b", True),
("aaab", "a+ a a b", True),
("aaab", "a+ a a", True),
("aaab", "a+", True),
("aaa", "a+ b", False),
("aaa", "a+ a+ b", False),
("aaa", "a+ a+ a+ b", False),
("aaa", "a+ a b", False),
("aaa", "a+ a a b", False),
("aaab", "a+ a a", True),
("aaab", "a+", True),
("aaab", "a+ a b", True),
]
for string, pattern_str, result in cases:
matcher = Matcher(en_vocab)
doc = Doc(matcher.vocab, words=list(string))
pattern = []
for part in pattern_str.split():
if part.endswith("+"):
pattern.append({"ORTH": part[0], "OP": "+"})
else:
pattern.append({"ORTH": part})
matcher.add("PATTERN", None, pattern)
matches = matcher(doc)
if result:
assert matches, (string, pattern_str)
else:
assert not matches, (string, pattern_str)
示例4: test_issue615
def test_issue615(en_tokenizer):
def merge_phrases(matcher, doc, i, matches):
"""Merge a phrase. We have to be careful here because we'll change the
token indices. To avoid problems, merge all the phrases once we're called
on the last match."""
if i != len(matches) - 1:
return None
spans = [Span(doc, start, end, label=label) for label, start, end in matches]
with doc.retokenize() as retokenizer:
for span in spans:
tag = "NNP" if span.label_ else span.root.tag_
attrs = {"tag": tag, "lemma": span.text}
retokenizer.merge(span, attrs=attrs)
doc.ents = doc.ents + (span,)
text = "The golf club is broken"
pattern = [{"ORTH": "golf"}, {"ORTH": "club"}]
label = "Sport_Equipment"
doc = en_tokenizer(text)
matcher = Matcher(doc.vocab)
matcher.add(label, merge_phrases, pattern)
matcher(doc)
entities = list(doc.ents)
assert entities != []
assert entities[0].label != 0
示例5: test_matcher_match_zero_plus
def test_matcher_match_zero_plus(matcher):
words = 'He said , " some words " ...'.split()
pattern = [{"ORTH": '"'}, {"OP": "*", "IS_PUNCT": False}, {"ORTH": '"'}]
matcher = Matcher(matcher.vocab)
matcher.add("Quote", None, pattern)
doc = Doc(matcher.vocab, words=words)
assert len(matcher(doc)) == 1
示例6: write_conllu
def write_conllu(docs, file_):
merger = Matcher(docs[0].vocab)
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
for i, doc in enumerate(docs):
matches = merger(doc)
spans = [doc[start : end + 1] for _, start, end in matches]
with doc.retokenize() as retokenizer:
for span in spans:
retokenizer.merge(span)
# TODO: This shouldn't be necessary? Should be handled in merge
for word in doc:
if word.i == word.head.i:
word.dep_ = "ROOT"
file_.write("# newdoc id = {i}\n".format(i=i))
for j, sent in enumerate(doc.sents):
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
file_.write("# text = {text}\n".format(text=sent.text))
for k, token in enumerate(sent):
file_.write(_get_token_conllu(token, k, len(sent)) + "\n")
file_.write("\n")
for word in sent:
if word.head.i == word.i and word.dep_ == "ROOT":
break
else:
print("Rootless sentence!")
print(sent)
print(i)
for w in sent:
print(w.i, w.text, w.head.text, w.head.i, w.dep_)
raise ValueError
示例7: test_issue3555
def test_issue3555(en_vocab):
"""Test that custom extensions with default None don't break matcher."""
Token.set_extension("issue3555", default=None)
matcher = Matcher(en_vocab)
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
matcher.add("TEST", None, pattern)
doc = Doc(en_vocab, words=["have", "apple"])
matcher(doc)
示例8: test_issue1883
def test_issue1883():
matcher = Matcher(Vocab())
matcher.add("pat1", None, [{"orth": "hello"}])
doc = Doc(matcher.vocab, words=["hello"])
assert len(matcher(doc)) == 1
new_matcher = copy.deepcopy(matcher)
new_doc = Doc(new_matcher.vocab, words=["hello"])
assert len(new_matcher(new_doc)) == 1
示例9: test_matcher_operator_shadow
def test_matcher_operator_shadow(en_vocab):
matcher = Matcher(en_vocab)
doc = Doc(matcher.vocab, words=["a", "b", "c"])
pattern = [{"ORTH": "a"}, {"IS_ALPHA": True, "OP": "+"}, {"ORTH": "c"}]
matcher.add("A.C", None, pattern)
matches = matcher(doc)
assert len(matches) == 1
assert matches[0][1:] == (0, 3)
示例10: test_match_consuming
def test_match_consuming(doc, text, pattern, re_pattern):
"""Test that matcher.__call__ consumes tokens on a match similar to
re.findall."""
matcher = Matcher(doc.vocab)
matcher.add(re_pattern, None, pattern)
matches = matcher(doc)
re_matches = [m.span() for m in re.finditer(re_pattern, text)]
assert len(matches) == len(re_matches)
示例11: test_issue_1971_2
def test_issue_1971_2(en_vocab):
matcher = Matcher(en_vocab)
pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}] # {"IN": ["EUR"]}}]
doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
matcher.add("TEST1", None, pattern1, pattern2)
matches = matcher(doc)
assert len(matches) == 2
示例12: test_greedy_matching
def test_greedy_matching(doc, text, pattern, re_pattern):
"""Test that the greedy matching behavior of the * op is consistant with
other re implementations."""
matcher = Matcher(doc.vocab)
matcher.add(re_pattern, None, pattern)
matches = matcher(doc)
re_matches = [m.span() for m in re.finditer(re_pattern, text)]
for match, re_match in zip(matches, re_matches):
assert match[1:] == re_match
示例13: test_issue1945
def test_issue1945():
"""Test regression in Matcher introduced in v2.0.6."""
matcher = Matcher(Vocab())
matcher.add("MWE", None, [{"orth": "a"}, {"orth": "a"}])
doc = Doc(matcher.vocab, words=["a", "a", "a"])
matches = matcher(doc) # we should see two overlapping matches here
assert len(matches) == 2
assert matches[0][1:] == (0, 2)
assert matches[1][1:] == (1, 3)
示例14: test_matcher_compare_length
def test_matcher_compare_length(en_vocab):
matcher = Matcher(en_vocab)
pattern = [{"LENGTH": {">=": 2}}]
matcher.add("LENGTH_COMPARE", None, pattern)
doc = Doc(en_vocab, words=["a", "aa", "aaa"])
matches = matcher(doc)
assert len(matches) == 2
doc = Doc(en_vocab, words=["a"])
matches = matcher(doc)
assert len(matches) == 0
示例15: test_matcher_regex_shape
def test_matcher_regex_shape(en_vocab):
matcher = Matcher(en_vocab)
pattern = [{"SHAPE": {"REGEX": r"^[^x]+$"}}]
matcher.add("NON_ALPHA", None, pattern)
doc = Doc(en_vocab, words=["99", "problems", "!"])
matches = matcher(doc)
assert len(matches) == 2
doc = Doc(en_vocab, words=["bye"])
matches = matcher(doc)
assert len(matches) == 0