This article collects typical usage examples of the Python class spacy.tokens.Doc. If you are wondering how tokens.Doc is used in practice, or what it can do, the curated code examples below may help. You can also explore further usage examples from the containing module, spacy.tokens.
The following presents 15 code examples of tokens.Doc, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
Example 1: generate
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def generate(self, message: str, doc: Doc = None) -> Optional[str]:
    reply = ConnectorReplyGenerator.generate(self, message, doc,
                                             ignore_topics=[DISCORD_USERNAME.split('#')[0]])
    if reply is None:
        return None
    if DISCORD_REMOVE_URL:
        # Remove URLs
        reply = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                       '', reply)
    reply = reply.strip()
    if len(reply) > 0:
        return reply
    else:
        return None
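As a quick reference, here is what the URL-stripping regex above does in isolation (a stand-alone sketch; the pattern is copied verbatim from the snippet, and re is the standard-library module):

import re

URL_RE = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
reply = "check this out https://example.com/page and tell me"
# The URL is removed, leaving a doubled space behind; strip() only trims the ends
print(re.sub(URL_RE, '', reply).strip())  # 'check this out  and tell me'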
Example 2: generate
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def generate(self, message: str, doc: Doc = None) -> Optional[str]:
    reply = ConnectorReplyGenerator.generate(self, message, doc)
    if reply is None:
        return None
    # TODO: Validate URLs before sending to twitter instead of discarding them
    if TWITTER_REMOVE_URL:
        # Remove URLs
        reply = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                       '', reply)
    reply = reply.strip()
    if len(reply) > 0:
        return reply
    else:
        return None
Example 3: pipe
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def pipe(
    self,
    texts: Union[
        Iterable[str],
        Iterable[List[str]],
        Iterable[List[List[str]]]
    ],
    n_process: Optional[int] = 1
) -> Iterable[Doc]:
    """Tokenize a stream of texts.

    texts: A sequence of unicode texts (raw, presegmented or pretokenized).
    n_process: Number of processes to use; -1 means one per CPU core.
    RETURNS: A sequence of Doc objects, in order.
    """
    n_process = mp.cpu_count() if n_process == -1 else n_process
    with mp.Pool(processes=n_process) as pool:
        return pool.map(self.__call__, texts)
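A minimal usage sketch (assuming mp is the multiprocessing module, as the mp.cpu_count() call suggests, and that SpacyTokenizer is a hypothetical class exposing the pipe method above):

tokenizer = SpacyTokenizer()  # hypothetical class defining pipe() as above

texts = ["First document.", "Second document.", "Third document."]
# n_process=-1 expands to mp.cpu_count() inside pipe()
for doc in tokenizer.pipe(texts, n_process=-1):
    print([token.text for token in doc])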
Example 4: __call__
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def __call__(self, text):
    """
    Parameters
    ----------
    text : str
        The text that should be tokenized

    Returns
    -------
    spacy.tokens.Doc
        spaCy document holding the tokenized text and, for each token,
        whether it is followed by a space.
    """
    words = text.split(" ")
    # All tokens 'own' a subsequent space character in this tokenizer
    spaces = [True] * len(words)
    return Doc(self.vocab, words=words, spaces=spaces)
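This __call__ is the classic custom-tokenizer hook from the spaCy documentation. A self-contained sketch of how such a tokenizer is wired into a pipeline (class and variable names are illustrative):

import spacy
from spacy.tokens import Doc

class WhitespaceTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(" ")
        # Every token is marked as being followed by a space
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

nlp = spacy.blank("en")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought.")
print([token.text for token in doc])
# ["What's", 'happened', 'to', 'me?', 'he', 'thought.']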
Example 5: set_custom_boundary
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def set_custom_boundary(doc: Doc) -> Doc:
    """Set sentence boundaries.

    Set the sentence boundaries based on the already separated sentences.

    :param doc: doc.user_data should hold a list of Sentence objects.
    :return: the doc, with is_sent_start set on its tokens.
    """
    if doc.user_data == {}:
        raise AttributeError("A list of Sentence is not attached to doc.user_data.")
    # Set token.is_sent_start to False everywhere, since it defaults to True
    for token_nr, token in enumerate(doc):
        doc[token_nr].is_sent_start = False
    # Then set is_sent_start to True for the first token of each Sentence
    token_nr = 0
    for sentence in doc.user_data:
        doc[token_nr].is_sent_start = True
        token_nr += len(sentence.words)
    return doc
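Such a boundary setter is normally registered as a pipeline component that runs before the parser, so the parser respects the precomputed splits. A rough sketch in spaCy v3 style (in v2 the bare function could be passed to nlp.add_pipe directly); it assumes doc.user_data has been populated with the Sentence list upstream:

import spacy
from spacy.language import Language

@Language.component("set_custom_boundary")
def set_custom_boundary_component(doc):
    return set_custom_boundary(doc)  # the function defined above

nlp = spacy.load("en_core_web_sm")
# Boundaries must be fixed before the parser runs
nlp.add_pipe("set_custom_boundary", before="parser")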
Example 6: __init__
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def __init__(self,
             qid: str,
             text: str,
             vid: int = 0,
             annotator: SpacyAnnotator = None,
             metas: Dict[str, any] = {}) -> None:
    self.qid: str = qid
    self.vid: int = vid
    # self.doc is a spacy.tokens.Doc instance
    if text is not None:
        if not annotator:
            annotator = spacy_annotator
        self.doc: Doc = annotator.model(text)
    else:
        self.doc = None
    self.metas = metas
Example 7: get_sentence
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def get_sentence(self, sid: Union[int, List[int]] = 0, doc: Doc = None) -> Union[Span, List[Span]]:
    """Query a sentence in a paragraph.

    Keyword Arguments:
        sid {Union[int, List[int]]} -- the sentence id, or a list of sentence ids (default: {0})
        doc {Doc} -- the doc to query; falls back to self.doc if not given (default: {None})

    Returns:
        Union[Span, List[Span]] -- the queried sentence(s)
    """
    if doc:
        sentences = list(doc.sents)
    else:
        sentences = list(self.doc.sents)
    if isinstance(sid, (int, float)):
        if 0 <= int(sid) < len(sentences):
            return sentences[int(sid)]
    else:
        # sid is a list of sentence ids
        sid = [int(s) for s in sid if 0 <= s < len(sentences)]
        if len(sid) > 0:
            filtered = [sentences[s] for s in sid]
            return filtered[0] if len(filtered) == 1 else filtered
    if sentences:
        return sentences[0]
    return None
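get_sentence is a thin wrapper around doc.sents. For reference, a stand-alone sketch of the underlying call (assuming a model with a parser or sentencizer, such as en_core_web_sm, is installed):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("First sentence. Second sentence. Third sentence.")
sentences = list(doc.sents)  # the same call get_sentence builds on
print(sentences[1].text)     # 'Second sentence.'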
Example 8: convert_doc
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def convert_doc(doc: Union[Doc, Span, 'Target'], strict_format: str = None):
    def _strict_doc(doc):
        if not doc:
            return None
        if type(doc) == str:
            return doc
        if strict_format == 'doc':
            return doc if type(doc) == Doc else doc.as_doc()
        if strict_format == 'span':
            return doc if type(doc) == Span else doc[:]
        return doc

    def _convert(doc):
        if type(doc) == str:
            return doc
        if type(doc) == Doc or type(doc) == Span:
            return _strict_doc(doc)
        else:
            # 'Target'-like objects carry the underlying doc in a .doc attribute
            return _strict_doc(getattr(doc, 'doc', None))

    if not doc:
        return None
    if type(doc) == list:
        return [_convert(d) for d in doc]
    else:
        return _convert(doc)
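The two conversions it relies on are part of spaCy's public API: Span.as_doc() copies a span into a new Doc, and doc[:] gives a Span over the whole Doc. A minimal sketch, assuming convert_doc as defined above is in scope:

import spacy

nlp = spacy.blank("en")
doc = nlp("A short example text")
span = doc[0:3]

print(type(convert_doc(span, strict_format='doc')).__name__)  # Doc (via Span.as_doc())
print(type(convert_doc(doc, strict_format='span')).__name__)  # Span (via doc[:])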
Example 9: _merge_noun_chunks
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def _merge_noun_chunks(self, doc: Doc) -> Doc:
    """Merge the parse tree for the noun chunks. Used for detecting structural templates.

    Arguments:
        doc {Doc} -- processed query text, in the form of a spacy doc.

    Returns:
        Doc -- the doc, modified in place
    """
    for noun_phrase in list(doc.noun_chunks):
        if any([t.tag_ in WHs or
                t.tag_ in ['JJR', 'JJS', 'CD', 'MD'] or
                t.ent_type_ != noun_phrase.root.ent_type_ or
                t.pos_ in ['NUM'] or t.text in ['can'] for t in list(noun_phrase)]):
            continue
        noun_phrase.merge(noun_phrase.root.tag_, noun_phrase.root.lemma_, noun_phrase.root.ent_type_)
    return doc
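Note that Span.merge(), used above, is a spaCy v1/v2 API that was removed in v3; the current equivalent is the doc.retokenize() context manager. A sketch of the same merge with the modern API (assuming en_core_web_sm is installed):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog.")

with doc.retokenize() as retokenizer:
    for noun_phrase in list(doc.noun_chunks):
        retokenizer.merge(noun_phrase,
                          attrs={"TAG": noun_phrase.root.tag_,
                                 "LEMMA": noun_phrase.root.lemma_,
                                 "ENT_TYPE": noun_phrase.root.ent_type_})

print([t.text for t in doc])
# noun chunks merged into single tokens, e.g.
# ['The quick brown fox', 'jumps', 'over', 'the lazy dog', '.']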
Example 10: _get_rewrite_ops
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def _get_rewrite_ops(self, adoc: Doc, bdoc: Doc, key: str = 'text',
                     merge: bool = True, use_natural: bool = True) -> List[OpcodeMeta]:
    """Compare two docs and compute the rewriting ops.

    Arguments:
        adoc {Doc} -- The original doc to rewrite from.
        bdoc {Doc} -- The rewritten-to doc.

    Keyword Arguments:
        key {str} -- the linguistic feature to compare on (default: {'text'})
        merge {bool} -- merge contiguous rewrite ops (default: {True})
        use_natural {bool} -- use the self-implemented function rather than the difflib
            library, which does not handle changes of prepositions well (default: {True})

    Returns:
        List[OpcodeMeta] -- list of rewriting operations
    """
    return self._get_rewrite_ops_text(
        list(map(lambda p: get_token_feature(p, key), adoc)),
        list(map(lambda p: get_token_feature(p, key), bdoc)),
        merge=merge, use_natural=use_natural)
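The docstring points at difflib as the baseline diffing mechanism. For reference, a stand-alone sketch of how the standard library produces such opcodes over two token lists (OpcodeMeta and _get_rewrite_ops_text are project-specific and not shown):

import difflib

a = ["the", "cat", "sat", "on", "the", "mat"]
b = ["the", "cat", "lay", "on", "a", "mat"]

matcher = difflib.SequenceMatcher(None, a, b)
for op, a0, a1, b0, b1 in matcher.get_opcodes():
    print(op, a[a0:a1], "->", b[b0:b1])
# equal ['the', 'cat'] -> ['the', 'cat']
# replace ['sat'] -> ['lay']
# equal ['on'] -> ['on']
# replace ['the'] -> ['a']
# equal ['mat'] -> ['mat']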
Example 11: _get_target
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def _get_target(self, instance: Instance) -> Doc:
    """Get the target to be rewritten from the instance.

    Parameters
    ----------
    instance : Instance
        The instance to be rewritten.

    Returns
    -------
    Doc
        The target doc to be rewritten.
    """
    if not instance:
        return None
    data = self.bbw.test_instances([{instance.rid: instance}])
    if instance.key() in data:
        key = data[instance.key()]
        if type(key) == str and instance.get_entry(key) is not None:
            return convert_doc(instance.get_entry(key), strict_format='doc')
        return convert_doc(key, strict_format='doc')
    return None
Example 12: process_content_bearing_samples
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def process_content_bearing_samples(
    self, samples_to_pipe: List[Tuple[int, Text]]
) -> List[Tuple[int, "Doc"]]:
    """Sends content bearing training samples to spaCy's pipe."""
    # nlp.pipe preserves input order, so each Doc can be zipped
    # back to the index of the sample it came from
    docs = [
        (to_pipe_sample[0], doc)
        for to_pipe_sample, doc in zip(
            samples_to_pipe,
            self.nlp.pipe([txt for _, txt in samples_to_pipe], batch_size=50),
        )
    ]
    return docs
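A stand-alone sketch of the same index-preserving pattern (assuming en_core_web_sm is installed); it is safe because nlp.pipe yields Docs in the order of its inputs:

import spacy

nlp = spacy.load("en_core_web_sm")
samples = [(0, "book me a flight"), (1, "what is the weather"), (2, "hello there")]

docs = list(zip((i for i, _ in samples),
                nlp.pipe((text for _, text in samples), batch_size=50)))
print([(i, len(doc)) for i, doc in docs])  # [(0, 4), (1, 4), (2, 2)]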
Example 13: preprocess
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def preprocess(self, doc: Doc) -> bool:
    self.data.append(doc)
    return True
Example 14: preprocess
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def preprocess(self, doc: Doc) -> bool:
    pass
Example 15: generate
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def generate(self, message: str, doc: Doc = None) -> str:
    return self._reply_generator.generate(message, doc)