This article collects typical usage examples of the Python class spacy.tokens.Doc. If you are wondering how tokens.Doc is used in practice, or what it can do, the curated code examples below may help. You can also explore further usage examples from the containing module, spacy.tokens.
The following presents 15 code examples of tokens.Doc, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
Example 1: generate
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def generate(self, message: str, doc: Doc = None) -> Optional[str]:
    reply = ConnectorReplyGenerator.generate(self, message, doc,
                                             ignore_topics=[DISCORD_USERNAME.split('#')[0]])
    if reply is None:
        return None
    if DISCORD_REMOVE_URL:
        # Remove URLs
        reply = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                       '', reply)
    reply = reply.strip()
    if len(reply) > 0:
        return reply
    else:
        return None
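As a quick reference, here is what the URL-stripping regex above does in isolation (a stand-alone sketch; the pattern is copied verbatim from the snippet, and re is the standard-library module):

import re

URL_RE = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
reply = "check this out https://example.com/page and tell me"
# The URL is removed, leaving a doubled space behind; strip() only trims the ends
print(re.sub(URL_RE, '', reply).strip())  # 'check this out  and tell me'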
Example 2: generate
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def generate(self, message: str, doc: Doc = None) -> Optional[str]:
    reply = ConnectorReplyGenerator.generate(self, message, doc)
    if reply is None:
        return None
    # TODO: Validate URLs before sending to twitter instead of discarding them
    if TWITTER_REMOVE_URL:
        # Remove URLs
        reply = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                       '', reply)
    reply = reply.strip()
    if len(reply) > 0:
        return reply
    else:
        return None
Example 3: pipe
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def pipe(
    self,
    texts: Union[
        Iterable[str],
        Iterable[List[str]],
        Iterable[List[List[str]]]
    ],
    n_process: Optional[int] = 1
) -> Iterable[Doc]:
    """Tokenize a stream of texts.

    texts: A sequence of unicode texts (raw, presegmented or pretokenized).
    n_process: Number of processes to use; -1 means one per CPU core.
    RETURNS: A sequence of Doc objects, in order.
    """
    n_process = mp.cpu_count() if n_process == -1 else n_process
    with mp.Pool(processes=n_process) as pool:
        return pool.map(self.__call__, texts)
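A minimal usage sketch (assuming mp is the multiprocessing module, as the mp.cpu_count() call suggests, and that SpacyTokenizer is a hypothetical class exposing the pipe method above):

tokenizer = SpacyTokenizer()  # hypothetical class defining pipe() as above

texts = ["First document.", "Second document.", "Third document."]
# n_process=-1 expands to mp.cpu_count() inside pipe()
for doc in tokenizer.pipe(texts, n_process=-1):
    print([token.text for token in doc])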
Example 4: __call__
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def __call__(self, text):
    """
    Parameters
    ----------
    text : str
        The text that should be tokenized

    Returns
    -------
    spacy.tokens.Doc
        spaCy document holding the tokenized text and, for each token,
        whether it is followed by a space.
    """
    words = text.split(" ")
    # All tokens 'own' a subsequent space character in this tokenizer
    spaces = [True] * len(words)
    return Doc(self.vocab, words=words, spaces=spaces)
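This __call__ is the classic custom-tokenizer hook from the spaCy documentation. A self-contained sketch of how such a tokenizer is wired into a pipeline (class and variable names are illustrative):

import spacy
from spacy.tokens import Doc

class WhitespaceTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(" ")
        # Every token is marked as being followed by a space
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

nlp = spacy.blank("en")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought.")
print([token.text for token in doc])
# ["What's", 'happened', 'to', 'me?', 'he', 'thought.']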
Example 5: set_custom_boundary
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def set_custom_boundary(doc: Doc) -> Doc:
    """Set sentence boundaries.

    Set the sentence boundaries based on the already separated sentences.

    :param doc: doc.user_data should hold a list of Sentence objects.
    :return: the doc, with is_sent_start set on its tokens.
    """
    if doc.user_data == {}:
        raise AttributeError("A list of Sentence is not attached to doc.user_data.")
    # Set token.is_sent_start to False everywhere, since it defaults to True
    for token_nr, token in enumerate(doc):
        doc[token_nr].is_sent_start = False
    # Then set is_sent_start to True for the first token of each Sentence
    token_nr = 0
    for sentence in doc.user_data:
        doc[token_nr].is_sent_start = True
        token_nr += len(sentence.words)
    return doc
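Such a boundary setter is normally registered as a pipeline component that runs before the parser, so the parser respects the precomputed splits. A rough sketch in spaCy v3 style (in v2 the bare function could be passed to nlp.add_pipe directly); it assumes doc.user_data has been populated with the Sentence list upstream:

import spacy
from spacy.language import Language

@Language.component("set_custom_boundary")
def set_custom_boundary_component(doc):
    return set_custom_boundary(doc)  # the function defined above

nlp = spacy.load("en_core_web_sm")
# Boundaries must be fixed before the parser runs
nlp.add_pipe("set_custom_boundary", before="parser")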
Example 6: __init__
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def __init__(self,
             qid: str,
             text: str,
             vid: int = 0,
             annotator: SpacyAnnotator = None,
             metas: Dict[str, any] = {}) -> None:
    self.qid: str = qid
    self.vid: int = vid
    # self.doc is a spacy.tokens.Doc instance
    if text is not None:
        if not annotator:
            annotator = spacy_annotator
        self.doc: Doc = annotator.model(text)
    else:
        self.doc = None
    self.metas = metas
Example 7: get_sentence
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def get_sentence(self, sid: Union[int, List[int]] = 0, doc: Doc = None) -> Union[Span, List[Span]]:
    """Query a sentence in a paragraph.

    Keyword Arguments:
        sid {Union[int, List[int]]} -- the sentence id, or a list of sentence ids (default: {0})
        doc {Doc} -- the doc to query; falls back to self.doc if not given (default: {None})

    Returns:
        Union[Span, List[Span]] -- the queried sentence(s)
    """
    if doc:
        sentences = list(doc.sents)
    else:
        sentences = list(self.doc.sents)
    if isinstance(sid, (int, float)):
        if 0 <= int(sid) < len(sentences):
            return sentences[int(sid)]
    else:
        # sid is a list of sentence ids
        sid = [int(s) for s in sid if 0 <= s < len(sentences)]
        if len(sid) > 0:
            filtered = [sentences[s] for s in sid]
            return filtered[0] if len(filtered) == 1 else filtered
    if sentences:
        return sentences[0]
    return None
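get_sentence is a thin wrapper around doc.sents. For reference, a stand-alone sketch of the underlying call (assuming a model with a parser or sentencizer, such as en_core_web_sm, is installed):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("First sentence. Second sentence. Third sentence.")
sentences = list(doc.sents)  # the same call get_sentence builds on
print(sentences[1].text)     # 'Second sentence.'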
Example 8: convert_doc
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def convert_doc(doc: Union[Doc, Span, 'Target'], strict_format: str = None):
    def _strict_doc(doc):
        if not doc:
            return None
        if type(doc) == str:
            return doc
        if strict_format == 'doc':
            return doc if type(doc) == Doc else doc.as_doc()
        if strict_format == 'span':
            return doc if type(doc) == Span else doc[:]
        return doc

    def _convert(doc):
        if type(doc) == str:
            return doc
        if type(doc) == Doc or type(doc) == Span:
            return _strict_doc(doc)
        else:
            # 'Target'-like objects carry the underlying doc in a .doc attribute
            return _strict_doc(getattr(doc, 'doc', None))

    if not doc:
        return None
    if type(doc) == list:
        return [_convert(d) for d in doc]
    else:
        return _convert(doc)
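The two conversions it relies on are part of spaCy's public API: Span.as_doc() copies a span into a new Doc, and doc[:] gives a Span over the whole Doc. A minimal sketch, assuming convert_doc as defined above is in scope:

import spacy

nlp = spacy.blank("en")
doc = nlp("A short example text")
span = doc[0:3]

print(type(convert_doc(span, strict_format='doc')).__name__)  # Doc (via Span.as_doc())
print(type(convert_doc(doc, strict_format='span')).__name__)  # Span (via doc[:])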
Example 9: _merge_noun_chunks
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def _merge_noun_chunks(self, doc: Doc) -> Doc:
    """Merge the parse tree for the noun chunks. Used for detecting structural templates.

    Arguments:
        doc {Doc} -- processed query text, in the form of a spacy doc.

    Returns:
        Doc -- the doc, modified in place
    """
    for noun_phrase in list(doc.noun_chunks):
        if any([t.tag_ in WHs or
                t.tag_ in ['JJR', 'JJS', 'CD', 'MD'] or
                t.ent_type_ != noun_phrase.root.ent_type_ or
                t.pos_ in ['NUM'] or t.text in ['can'] for t in list(noun_phrase)]):
            continue
        noun_phrase.merge(noun_phrase.root.tag_, noun_phrase.root.lemma_, noun_phrase.root.ent_type_)
    return doc
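Note that Span.merge(), used above, is a spaCy v1/v2 API that was removed in v3; the current equivalent is the doc.retokenize() context manager. A sketch of the same merge with the modern API (assuming en_core_web_sm is installed):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog.")

with doc.retokenize() as retokenizer:
    for noun_phrase in list(doc.noun_chunks):
        retokenizer.merge(noun_phrase,
                          attrs={"TAG": noun_phrase.root.tag_,
                                 "LEMMA": noun_phrase.root.lemma_,
                                 "ENT_TYPE": noun_phrase.root.ent_type_})

print([t.text for t in doc])
# noun chunks merged into single tokens, e.g.
# ['The quick brown fox', 'jumps', 'over', 'the lazy dog', '.']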
Example 10: _get_rewrite_ops
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def _get_rewrite_ops(self, adoc: Doc, bdoc: Doc, key: str = 'text',
                     merge: bool = True, use_natural: bool = True) -> List[OpcodeMeta]:
    """Compare two docs and compute the rewriting ops.

    Arguments:
        adoc {Doc} -- The original doc to rewrite from.
        bdoc {Doc} -- The rewritten-to doc.

    Keyword Arguments:
        key {str} -- the linguistic feature to compare on (default: {'text'})
        merge {bool} -- merge contiguous rewrite ops (default: {True})
        use_natural {bool} -- use the self-implemented function rather than the difflib
            library, which does not handle changes of prepositions well (default: {True})

    Returns:
        List[OpcodeMeta] -- list of rewriting operations
    """
    return self._get_rewrite_ops_text(
        list(map(lambda p: get_token_feature(p, key), adoc)),
        list(map(lambda p: get_token_feature(p, key), bdoc)),
        merge=merge, use_natural=use_natural)
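The docstring points at difflib as the baseline diffing mechanism. For reference, a stand-alone sketch of how the standard library produces such opcodes over two token lists (OpcodeMeta and _get_rewrite_ops_text are project-specific and not shown):

import difflib

a = ["the", "cat", "sat", "on", "the", "mat"]
b = ["the", "cat", "lay", "on", "a", "mat"]

matcher = difflib.SequenceMatcher(None, a, b)
for op, a0, a1, b0, b1 in matcher.get_opcodes():
    print(op, a[a0:a1], "->", b[b0:b1])
# equal ['the', 'cat'] -> ['the', 'cat']
# replace ['sat'] -> ['lay']
# equal ['on'] -> ['on']
# replace ['the'] -> ['a']
# equal ['mat'] -> ['mat']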
Example 11: _get_target
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def _get_target(self, instance: Instance) -> Doc:
    """Get the target to be rewritten from the instance.

    Parameters
    ----------
    instance : Instance
        The instance to be rewritten.

    Returns
    -------
    Doc
        The target doc to be rewritten.
    """
    if not instance:
        return None
    data = self.bbw.test_instances([{instance.rid: instance}])
    if instance.key() in data:
        key = data[instance.key()]
        if type(key) == str and instance.get_entry(key) is not None:
            return convert_doc(instance.get_entry(key), strict_format='doc')
        return convert_doc(key, strict_format='doc')
    return None
Example 12: process_content_bearing_samples
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def process_content_bearing_samples(
    self, samples_to_pipe: List[Tuple[int, Text]]
) -> List[Tuple[int, "Doc"]]:
    """Sends content bearing training samples to spaCy's pipe."""
    # nlp.pipe preserves input order, so each Doc can be zipped
    # back to the index of the sample it came from
    docs = [
        (to_pipe_sample[0], doc)
        for to_pipe_sample, doc in zip(
            samples_to_pipe,
            self.nlp.pipe([txt for _, txt in samples_to_pipe], batch_size=50),
        )
    ]
    return docs
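A stand-alone sketch of the same index-preserving pattern (assuming en_core_web_sm is installed); it is safe because nlp.pipe yields Docs in the order of its inputs:

import spacy

nlp = spacy.load("en_core_web_sm")
samples = [(0, "book me a flight"), (1, "what is the weather"), (2, "hello there")]

docs = list(zip((i for i, _ in samples),
                nlp.pipe((text for _, text in samples), batch_size=50)))
print([(i, len(doc)) for i, doc in docs])  # [(0, 4), (1, 4), (2, 2)]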
Example 13: preprocess
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def preprocess(self, doc: Doc) -> bool:
    self.data.append(doc)
    return True
Example 14: preprocess
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def preprocess(self, doc: Doc) -> bool:
    pass
Example 15: generate
# Required import: from spacy import tokens [as alias]
# Or: from spacy.tokens import Doc [as alias]
def generate(self, message: str, doc: Doc = None) -> str:
    return self._reply_generator.generate(message, doc)