当前位置: 首页>>代码示例>>Python>>正文


Python spacy.tokens方法代码示例

本文整理汇总了 Python 中 spacy.tokens 模块的典型用法代码示例。如果您正苦于以下问题:Python spacy.tokens 模块的具体用法?Python spacy.tokens 怎么用?Python spacy.tokens 使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该模块所在的 spacy 库的用法示例。


下文一共展示了 15 个使用 spacy.tokens 模块的代码示例,这些示例默认按受欢迎程度排序。您可以为喜欢或者觉得有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。

示例1: __new__

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def __new__(
        cls,
        span,
        mention_index,
        utterance_index,
        utterance_start_sent,
        speaker=None,
        gold_label=None,
        *args,
        **kwargs,
    ):
        """Allocate the instance through ``spacy.tokens.Span.__new__``.

        Only ``span`` (its ``doc``, ``start`` and ``end``) plus any extra
        positional/keyword arguments are forwarded to ``Span.__new__``. The
        remaining parameters (``mention_index``, ``utterance_index``,
        ``utterance_start_sent``, ``speaker``, ``gold_label``) are accepted
        here but are not used during allocation.
        """
        # We need to override __new__, see
        # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
        span_bounds = (span.doc, span.start, span.end)
        return spacy.tokens.Span.__new__(cls, *span_bounds, *args, **kwargs)
开发者ID:huggingface,项目名称:neuralcoref,代码行数:18,代码来源:document.py

示例2: __init__

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def __init__(
        self,
        language: str = "en_core_web_sm",
        pos_tags: bool = False,
        parse: bool = False,
        ner: bool = False,
        keep_spacy_tokens: bool = False,
        split_on_spaces: bool = False,
        start_tokens: Optional[List[str]] = None,
        end_tokens: Optional[List[str]] = None,
    ) -> None:
        """Set up the spaCy-backed tokenizer.

        Parameters
        ----------
        language : str
            Name of the spaCy model passed to ``get_spacy_model``.
        pos_tags, parse, ner : bool
            Flags forwarded unchanged to ``get_spacy_model``.
        keep_spacy_tokens : bool
            Stored on the instance as ``_keep_spacy_tokens``.
        split_on_spaces : bool
            When true, the model's tokenizer is replaced by a
            ``_WhitespaceSpacyTokenizer`` built from the model's vocab.
        start_tokens, end_tokens : list of str, optional
            Extra token texts kept on the instance; ``None`` means no extra
            tokens. The caller's lists are copied, never modified.
        """
        self.spacy = get_spacy_model(language, pos_tags, parse, ner)
        if split_on_spaces:
            self.spacy.tokenizer = _WhitespaceSpacyTokenizer(self.spacy.vocab)

        self._keep_spacy_tokens = keep_spacy_tokens
        # Work on a private copy: reversing the caller's list in place would
        # flip its order again on every construction that shares the list.
        self._start_tokens = list(start_tokens or [])
        # We reverse the tokens here because we're going to insert them with `insert(0)` later;
        # this makes sure they show up in the right order.
        self._start_tokens.reverse()
        # Copied as well so later changes to the caller's list do not leak in.
        self._end_tokens = list(end_tokens or [])
开发者ID:allenai,项目名称:allennlp,代码行数:23,代码来源:spacy_tokenizer.py

示例3: _sanitize

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def _sanitize(self, tokens: List[spacy.tokens.Token]) -> List[Token]:
        """
        Turn spaCy tokens into allennlp ``Token`` objects, unless
        ``keep_spacy_tokens`` is set, in which case the given tokens are kept
        as they are. The configured start tokens are then inserted at the
        front and the configured end tokens appended at the back.
        """
        if not self._keep_spacy_tokens:
            converted = []
            for spacy_token in tokens:
                converted.append(
                    Token(
                        spacy_token.text,
                        spacy_token.idx,
                        spacy_token.idx + len(spacy_token.text),
                        spacy_token.lemma_,
                        spacy_token.pos_,
                        spacy_token.tag_,
                        spacy_token.dep_,
                        spacy_token.ent_type_,
                    )
                )
            tokens = converted

        # Each start token goes to position 0, so the last one handled ends up first.
        for start_text in self._start_tokens:
            tokens.insert(0, Token(start_text, 0))
        for end_text in self._end_tokens:
            tokens.append(Token(end_text, -1))
        return tokens
开发者ID:allenai,项目名称:allennlp,代码行数:26,代码来源:spacy_tokenizer.py

示例4: __init__

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def __init__(self, skip_download_check: bool = False, spacy_model="en_core_web_sm"):
        """Prepare the WordNet and spaCy resources used by this object.

        Imports nltk and spaCy lazily, raising ``ImportError`` with an
        explanatory message if either is missing. Unless
        ``skip_download_check`` is true, ``nltk.download("wordnet")`` is
        called. Finally the spaCy model named by ``spacy_model`` is loaded and
        a ``replacement`` extension (default ``None``) is registered on
        ``Token``.
        """
        try:
            import nltk
            from nltk.corpus import wordnet
        except ImportError:
            nltk_missing = "WordNet-based data augmentation requires nltk to be installed."
            raise ImportError(nltk_missing)

        self.wn = wordnet

        try:
            import spacy
            from spacy.tokens import Token
        except ImportError:
            spacy_missing = (
                "WordNet-based data augmentation requires spaCy and a language "
                "model to be installed (for part of speech tagging)."
            )
            raise ImportError(spacy_missing)

        if not skip_download_check:
            nltk.download("wordnet")

        load_options = {"parser": False, "tagger": True, "entity": False}
        self.nlp = spacy.load(spacy_model, **load_options)
        # force=True overwrites an already registered extension of the same name.
        Token.set_extension("replacement", default=None, force=True)
开发者ID:RTIInternational,项目名称:gobbli,代码行数:27,代码来源:wordnet.py

示例5: get_sentence_tokens

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def get_sentence_tokens(texts, charoffsets):
    """Cut tokens out of the concatenated text, sentence by sentence.

    ``texts`` is joined into one string. ``charoffsets`` holds one list of
    ``(start, end)`` character pairs per sentence, indexing into that joined
    string. Empty slices are skipped.

    Returns ``(tokens, sentence_offsets)``: the flat list of token strings and,
    for every sentence, the half-open ``(first, last)`` range of its tokens
    within that flat list.
    """
    joined = "".join(texts)
    tokens = []
    sentence_offsets = []

    for spans in charoffsets:
        # The next sentence starts where the flat token list currently ends.
        first = len(tokens)
        pieces = (joined[begin:stop] for begin, stop in spans)
        tokens.extend(piece for piece in pieces if piece)
        sentence_offsets.append((first, len(tokens)))
    return tokens, sentence_offsets
开发者ID:easonnie,项目名称:semanticRetrievalMRS,代码行数:19,代码来源:rindex.py

示例6: remove_stopwords

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def remove_stopwords(self, sentence_str: str=None, tokens: List[Token]=None, use_lemma: bool=True) -> str:
        """Return the sentence with punctuation and stop words dropped.

        Keyword Arguments:
            sentence_str {str} -- input sentence; only processed with
                ``self.model`` when no tokens are supplied (default: {None})
            tokens {List[Token]} -- pre-computed tokens, used as-is when
                non-empty (default: {None})
            use_lemma {bool} -- join the lemmas if true, the raw texts
                otherwise (default: {True})

        Returns:
            str -- the kept tokens joined by single spaces
        """
        if not tokens:
            tokens = self.model(sentence_str) if sentence_str else []

        attr = 'lemma_' if use_lemma else 'text'  # which token field to emit
        kept = []
        for token in tokens:
            # A token is dropped if it is punctuation, or if either its text
            # or its lemma is a stop word.
            if token.is_punct or token.text in STOP_WORDS or token.lemma_ in STOP_WORDS:
                continue
            kept.append(getattr(token, attr))
        return ' '.join(kept)
开发者ID:uwdata,项目名称:errudite,代码行数:22,代码来源:spacy_annotator.py

示例7: _generate_partly_censored_word

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def _generate_partly_censored_word(self, word: Union[str, spacy.tokens.Token], profane_word: str) -> str:
        """Censor only the part of ``word`` that corresponds to ``profane_word``.

        ``word`` may be a plain string or an object exposing ``.text``. The
        lower-cased word is compared with ``profane_word`` through
        ``Levenshtein.opcodes``; a leading or trailing delete/insert operation
        trims the slice of the word that is treated as profane. Every
        case-insensitive occurrence of that slice in the word is then replaced
        with its fully censored form.
        """
        edit_tags = ('delete', 'insert')

        # Objects without a ``.text`` attribute (e.g. plain strings) are used as they are.
        word = getattr(word, 'text', word)
        lowered = word.lower()

        opcodes = Levenshtein.opcodes(lowered, profane_word)
        leading, trailing = opcodes[0], opcodes[-1]
        # presumably field 2 / field 1 of an opcode are the end / start
        # position in the first string — TODO confirm against Levenshtein docs
        part_start = leading[2] if leading[0] in edit_tags else 0
        part_end = trailing[1] if trailing[0] in edit_tags else len(lowered)
        word_part_for_censoring = lowered[part_start:part_end]

        censored_part = self._generate_fully_censored_word(word=word_part_for_censoring)
        return regex.sub(pattern=re.escape(word_part_for_censoring),
                         repl=censored_part,
                         string=word,
                         flags=regex.IGNORECASE)
开发者ID:rominf,项目名称:profanity-filter,代码行数:23,代码来源:profanity_filter.py

示例8: test_spacy_training_sample_alignment

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def test_spacy_training_sample_alignment(spacy_nlp_component):
    """Docs for training data stay aligned with the messages, empty text included."""
    from spacy.tokens import Doc

    raw_texts = ["I have a feeling", "", "I am the last message"]
    messages = [Message.build(text=raw, intent="feeling") for raw in raw_texts]
    td = TrainingData(training_examples=messages)

    attribute_docs = spacy_nlp_component.docs_for_training_data(td)
    text_docs = attribute_docs["text"]

    # One Doc per message, including the message with empty text.
    for position in range(3):
        assert isinstance(text_docs[position], Doc)

    first_words = [t.text for t in text_docs[0]]
    assert first_words == ["i", "have", "a", "feeling"]

    empty_words = [t.text for t in text_docs[1]]
    assert empty_words == []

    last_words = [t.text for t in text_docs[2]]
    assert last_words == ["i", "am", "the", "last", "message"]
开发者ID:botfront,项目名称:rasa-for-botfront,代码行数:25,代码来源:test_spacy_featurizer.py

示例9: create_nlp_instance

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def create_nlp_instance():
    """Build the spaCy pipeline: emoji component first, hashtag merging afterwards.

    Loads the ``'en'`` model, adds a spacymoji ``Emoji`` component with
    ``first=True``, then adds the local ``hashtag_pipe`` component and returns
    the resulting ``nlp`` object.
    """
    import spacy
    from spacymoji import Emoji

    nlp = spacy.load('en')
    emoji_pipe = Emoji(nlp)
    nlp.add_pipe(emoji_pipe, first=True)

    # Merge hashtag tokens which were split by spacy
    def hashtag_pipe(doc):
        """Merge each '#' token with the text that follows it, then return ``doc``.

        The document is rescanned from the start after every successful merge
        and the loop ends once a full pass performs no merge.
        """
        merged_hashtag = False
        while True:
            # token_index is not used; only the token itself is inspected.
            for token_index, token in enumerate(doc):
                if token.text == '#':
                    # NOTE(review): presumably token.head is never None in spaCy,
                    # which would make this check always true — confirm.
                    if token.head is not None:
                        # The merged range starts at the '#' token's idx and is
                        # one character plus the length of the head's text long.
                        # assumes the head of '#' is the word right after it — TODO confirm
                        start_index = token.idx
                        end_index = start_index + len(token.head.text) + 1
                        # NOTE(review): Doc.merge looks like the legacy merging API
                        # (superseded by Doc.retokenize) — confirm it exists in the
                        # installed spaCy version.
                        if doc.merge(start_index, end_index) is not None:
                            merged_hashtag = True
                            # Leaves only the for loop; the while loop then rescans the doc.
                            break
            if not merged_hashtag:
                break
            merged_hashtag = False
        return doc

    nlp.add_pipe(hashtag_pipe)
    return nlp
开发者ID:csvance,项目名称:armchair-expert,代码行数:29,代码来源:nlp.py

示例10: _form_ann_line

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def _form_ann_line(
        idx: str,
        char_offset: Tuple[int, int, str],
        tag_name: str,
        doc: spacy.tokens.doc.Doc,
    ):
        """ Forms a ann line that can be used to write the ANN files for CoNLL format

        Parameters
        ----------
        idx : int
            The index for the entity being written
        char_offset : int
            THe start, end, tag for the line
        tag_name : str
            The tag to be used and is one of ``[Task, Process, Material]``
        doc : str
            Spacy doc to query the appropriate characters

        Returns
        -------
        str
            An ANN line that is formed.

        """
        start_offset, end_offset, entity_type = char_offset
        surface_form = doc.char_span(start_offset, end_offset).text
        start_offset = str(start_offset)
        end_offset = str(end_offset)
        ann_line = " ".join([start_offset, end_offset])
        ann_line = "\t".join([ann_line, surface_form])
        ann_line = " ".join([tag_name, ann_line])
        ann_line = "\t".join([f"T{idx}", ann_line])
        return ann_line 
开发者ID:abhinavkashyap,项目名称:sciwing,代码行数:36,代码来源:science_ie_data_utils.py

示例11: batch_tokenize

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
        """Tokenize every text through ``self.spacy.pipe``.

        For each result, whitespace tokens are dropped with ``_remove_spaces``
        and the remainder is passed through ``self._sanitize``. Returns one
        token list per input text.
        """
        batches = []
        for doc in self.spacy.pipe(texts, n_threads=-1):
            without_spaces = _remove_spaces(doc)
            batches.append(self._sanitize(without_spaces))
        return batches
开发者ID:allenai,项目名称:allennlp,代码行数:7,代码来源:spacy_tokenizer.py

示例12: _remove_spaces

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def _remove_spaces(tokens: List[spacy.tokens.Token]) -> List[spacy.tokens.Token]:
    return [token for token in tokens if not token.is_space] 
开发者ID:allenai,项目名称:allennlp,代码行数:4,代码来源:spacy_tokenizer.py

示例13: __init__

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def __init__(self, add_unk=True):
        # init dictionaries
        self.item2idx: Dict[str, int] = {}
        self.idx2item: List[str] = []

        # in order to deal with unknown tokens, add <unk>
        if add_unk:
            self.add_item('<unk>') 
开发者ID:DFKI-NLP,项目名称:TRE,代码行数:10,代码来源:text_utils.py

示例14: spacy_get_pos

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def spacy_get_pos(tokens):
    """Return the ``pos_`` value of every token in ``tokens``.

    A ``Doc`` is built directly from the given words using ``nlp.vocab``, and
    each component in ``nlp.pipeline`` is then called on it in order.
    """
    doc = spacy.tokens.doc.Doc(nlp.vocab, words=tokens)

    # Each pipeline entry is a (name, component) pair; only the component is needed.
    for _name, component in nlp.pipeline:
        component(doc)

    pos_tags = []
    for token in doc:
        pos_tags.append(token.pos_)
    return pos_tags
开发者ID:easonnie,项目名称:semanticRetrievalMRS,代码行数:10,代码来源:rindex.py

示例15: iterative_abs

# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def iterative_abs(debug_num=None):
    """Walk the abstract wiki file, printing tokens and sentence offsets per line.

    Every line of ``config.ABS_WIKI_FILE`` is parsed as JSON; its ``'text'``
    and ``'charoffset'`` fields are passed to ``get_sentence_tokens`` and the
    tokens are tagged with ``spacy_get_pos``. ``debug_num``, when given,
    replaces ``init_inspect.TOTAL_NUM_DOC`` as the ``total`` handed to
    ``tqdm``.
    """
    if debug_num is None:
        total_doc_num = init_inspect.TOTAL_NUM_DOC
    else:
        total_doc_num = debug_num

    with open(config.ABS_WIKI_FILE, 'rb') as abs_file:
        progress = tqdm(abs_file, total=total_doc_num)
        for raw_line in progress:
            record = json.loads(raw_line)
            tokens, sent_offset = get_sentence_tokens(record['text'], record['charoffset'])
            pos_tags = spacy_get_pos(tokens)
            # One tag is expected for every token.
            assert len(tokens) == len(pos_tags)
            print(tokens)
            print(sent_offset)
开发者ID:easonnie,项目名称:semanticRetrievalMRS,代码行数:17,代码来源:rindex.py


注:本文中的spacy.tokens方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。