This article collects typical usage examples of the Python spacy.tokens.Doc class. If you are wondering how the Doc class is used in practice, or are looking for concrete Doc code samples, the curated examples below may help.
The following presents 15 code examples of the Doc class, sorted by popularity by default.
Example 1: test_doc_array_to_from_string_attrs
def test_doc_array_to_from_string_attrs(en_vocab, attrs):
    """Test that both Doc.to_array and Doc.from_array accept string attrs,
    as well as single attrs and sequences of attrs.
    """
    words = ["An", "example", "sentence"]
    doc = Doc(en_vocab, words=words)
    Doc(en_vocab, words=words).from_array(attrs, doc.to_array(attrs))
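To make the string/ID equivalence this test exercises more concrete, here is a minimal standalone sketch (assuming spaCy v2.1+ and a blank English pipeline from spacy.blank; the test itself receives en_vocab and attrs as pytest fixtures):

import spacy
from spacy.attrs import LEMMA, ORTH
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc = Doc(nlp.vocab, words=["An", "example", "sentence"])

# String names and integer attribute IDs address the same columns
assert (doc.to_array(["ORTH", "LEMMA"]) == doc.to_array([ORTH, LEMMA])).all()

# A single attribute (not wrapped in a sequence) is also accepted
orth_column = doc.to_array("ORTH")
new_doc = Doc(nlp.vocab, words=["An", "example", "sentence"]).from_array("ORTH", orth_column)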
Example 2: __init__
def __init__(self, nlp, label='GPE'):
    """Initialise the pipeline component. The shared nlp instance is used
    to initialise the matcher with the shared vocab, get the label ID and
    generate Doc objects as phrase match patterns.
    """
    # Make request once on initialisation and store the data
    r = requests.get('https://restcountries.eu/rest/v2/all')
    r.raise_for_status()  # make sure requests raises an error if it fails
    countries = r.json()
    # Convert API response to dict keyed by country name for easy lookup
    # This could also be extended using the alternative and foreign language
    # names provided by the API
    self.countries = {c['name']: c for c in countries}
    self.label = nlp.vocab.strings[label]  # get entity label ID
    # Set up the PhraseMatcher with Doc patterns for each country name
    patterns = [nlp(c) for c in self.countries.keys()]
    self.matcher = PhraseMatcher(nlp.vocab)
    self.matcher.add('COUNTRIES', None, *patterns)
    # Register attribute on the Token. We'll be overwriting this based on
    # the matches, so we're only setting a default value, not a getter.
    # If no default value is set, it defaults to None.
    Token.set_extension('is_country', default=False)
    Token.set_extension('country_capital')
    Token.set_extension('country_latlng')
    Token.set_extension('country_flag')
    # Register attributes on Doc and Span via a getter that checks if one of
    # the contained tokens is set to is_country == True.
    Doc.set_extension('has_country', getter=self.has_country)
    Span.set_extension('has_country', getter=self.has_country)
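The excerpt above only shows the constructor of the pipeline component. For context, a minimal __call__ sketch (hypothetical here, not part of the excerpt) would run the PhraseMatcher over the Doc, create entity Spans for the matches and fill in the Token extensions from the stored country data:

def __call__(self, doc):
    # Apply the matcher to the Doc and collect the matched spans as entities
    matches = self.matcher(doc)
    spans = []
    for _, start, end in matches:
        entity = Span(doc, start, end, label=self.label)
        spans.append(entity)
        for token in entity:
            token._.set('is_country', True)
            token._.set('country_capital', self.countries[entity.text]['capital'])
            token._.set('country_latlng', self.countries[entity.text]['latlng'])
            token._.set('country_flag', self.countries[entity.text]['flag'])
    # Overwrite doc.ents with the existing entities plus the new matches
    doc.ents = list(doc.ents) + spans
    return doc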
Example 3: get_doc
def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
    """Create Doc object from given vocab, words and annotations."""
    pos = pos or [""] * len(words)
    tags = tags or [""] * len(words)
    heads = heads or [0] * len(words)
    deps = deps or [""] * len(words)
    for value in deps + tags + pos:
        vocab.strings.add(value)
    doc = Doc(vocab, words=words)
    attrs = doc.to_array([POS, HEAD, DEP])
    for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
        attrs[i, 0] = doc.vocab.strings[p]
        attrs[i, 1] = head
        attrs[i, 2] = doc.vocab.strings[dep]
    doc.from_array([POS, HEAD, DEP], attrs)
    if ents:
        doc.ents = [
            Span(doc, start, end, label=doc.vocab.strings[label])
            for start, end, label in ents
        ]
    if tags:
        for token in doc:
            token.tag_ = tags[token.i]
    return doc
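A hypothetical usage of this helper (assuming en_vocab is an English Vocab, e.g. from a pytest fixture). Note that the heads values are written straight into the HEAD column, so they are relative offsets to the head token rather than absolute indices:

words = ["I", "like", "cities"]
heads = [1, 0, -1]                 # "I" -> "like", "like" -> root, "cities" -> "like"
deps = ["nsubj", "ROOT", "dobj"]
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
assert doc[0].head.text == "like"
assert doc[2].dep_ == "dobj"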
Example 4: test_doc_array_attr_of_token
def test_doc_array_attr_of_token(en_vocab):
    doc = Doc(en_vocab, words=["An", "example", "sentence"])
    example = doc.vocab["example"]
    assert example.orth != example.shape
    feats_array = doc.to_array((ORTH, SHAPE))
    assert feats_array[0][0] != feats_array[0][1]
    assert feats_array[0][0] != feats_array[0][1]
Example 5: test_doc_api_similarity_match
def test_doc_api_similarity_match():
    doc = Doc(Vocab(), words=["a"])
    assert doc.similarity(doc[0]) == 1.0
    assert doc.similarity(doc.vocab["a"]) == 1.0
    doc2 = Doc(doc.vocab, words=["a", "b", "c"])
    with pytest.warns(ModelsWarning):
        assert doc.similarity(doc2[:1]) == 1.0
        assert doc.similarity(doc2) == 0.0
Example 6: test_doc_retokenize_merge_extension_attrs_invalid
def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[0:2], attrs=attrs)
Example 7: test_issue1547
def test_issue1547():
    """Test that entity labels still match after merging tokens."""
    words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[5:7])
    assert [ent.text for ent in doc.ents]
Example 8: test_serialize_empty_doc
def test_serialize_empty_doc(en_vocab):
    doc = Doc(en_vocab)
    data = doc.to_bytes()
    doc2 = Doc(en_vocab)
    doc2.from_bytes(data)
    assert len(doc) == len(doc2)
    for token1, token2 in zip(doc, doc2):
        assert token1.text == token2.text
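The same to_bytes/from_bytes round trip also works for a non-empty Doc. A minimal sketch, assuming a blank English pipeline from spacy.blank:

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc = nlp("hello world")
restored = Doc(nlp.vocab).from_bytes(doc.to_bytes())
assert [t.text for t in restored] == ["hello", "world"]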
Example 9: test_underscore_dir
def test_underscore_dir(en_vocab):
    """Test that dir() correctly returns extension attributes. This enables
    things like tab-completion for the attributes in doc._."""
    Doc.set_extension("test_dir", default=None)
    doc = Doc(en_vocab, words=["hello", "world"])
    assert "_" in dir(doc)
    assert "test_dir" in dir(doc._)
    assert "test_dir" not in dir(doc[0]._)
    assert "test_dir" not in dir(doc[0:2]._)
Example 10: test_doc_to_json_underscore
def test_doc_to_json_underscore(doc):
    Doc.set_extension("json_test1", default=False)
    Doc.set_extension("json_test2", default=False)
    doc._.json_test1 = "hello world"
    doc._.json_test2 = [1, 2, 3]
    json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
    assert "_" in json_doc
    assert json_doc["_"]["json_test1"] == "hello world"
    assert json_doc["_"]["json_test2"] == [1, 2, 3]
Example 11: test_doc_retokenize_split_extension_attrs_invalid
def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("x", default=False, force=True)
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            heads = [(doc[0], 1), doc[1]]
            retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
Example 12: test_doc_retokenize_split_heads_error
def test_doc_retokenize_split_heads_error(en_vocab):
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    # Not enough heads
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1]])
    # Too many heads
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1], doc[1], doc[1]])
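For contrast, a split with the right number of heads, one entry per new subtoken as in Example 11, succeeds. A minimal sketch, again assuming an en_vocab fixture:

doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
with doc.retokenize() as retokenizer:
    # (doc[0], 1) means: the head is the subtoken at index 1, i.e. "Angeles"
    retokenizer.split(doc[0], ["Los", "Angeles"], heads=[(doc[0], 1), doc[1]])
assert [t.text for t in doc] == ["Los", "Angeles", "start", "."]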
Example 13: test_doc_add_entities_set_ents_iob
def test_doc_add_entities_set_ents_iob(en_vocab):
    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
    ner = EntityRecognizer(en_vocab)
    ner.begin_training([])
    ner(doc)
    assert len(list(doc.ents)) == 0
    assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
    doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
    assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
    doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
    assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
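The empty strings in the expected values reflect that, when doc.ents is assigned directly, tokens outside the given spans are treated as having missing entity annotation rather than being marked explicitly as "O".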
Example 14: test_doc_retokenize_split_orths_mismatch
def test_doc_retokenize_split_orths_mismatch(en_vocab):
    """Test that the regular retokenizer.split raises an error if the orths
    don't match the original token text. There might still be a method that
    allows this, but for the default use cases, merging and splitting should
    always conform with spaCy's non-destructive tokenization policy. Otherwise,
    it can lead to very confusing and unexpected results.
    """
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["L", "A"], [(doc[0], 0), (doc[0], 0)])
Example 15: test_spans_override_sentiment
def test_spans_override_sentiment(en_tokenizer):
    """Test that doc.user_span_hooks can override the span.sentiment property."""
    text = "good stuff bad stuff"
    tokens = en_tokenizer(text)
    tokens.vocab[tokens[0].text].sentiment = 3.0
    tokens.vocab[tokens[2].text].sentiment = -2.0
    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
    doc.user_span_hooks["sentiment"] = lambda span: 10.0
    assert doc[:2].sentiment == 10.0
    assert doc[-2:].sentiment == 10.0
    assert doc[:-1].sentiment == 10.0
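As a design note, doc.user_span_hooks lets you replace computed Span properties such as sentiment, vector or similarity with your own callables; the analogous doc.user_hooks and doc.user_token_hooks dictionaries do the same for Doc- and Token-level properties.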