This article collects typical usage examples of the Python method pattern.en.parse. If you have been wondering how to use en.parse or what it is good for, the curated code examples below may help. You can also explore further usage of pattern.en, the module the method belongs to.
Two code examples of en.parse are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
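Before the examples, here is a minimal sketch of what pattern.en.parse returns when called the way the snippets below call it, i.e. with lemmata=True and collapse=False: a list of sentences, where each sentence is a list of [token, POS tag, chunk, role, lemma] entries. The tags shown in the trailing comment are approximate, not verified output.

from pattern.en import parse

parsed = parse('The study ranks high.', lemmata=True, collapse=False)
for sentence in parsed:
    for token, tag, chunk, role, lemma in sentence:
        print(token, tag, lemma)

# Roughly:
#   The    DT   the
#   study  NN   study
#   ranks  VBZ  rank
#   high   JJ   high
#   .      .    .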
Example 1: lemmatize
# Required module: from pattern import en [as alias]
# Or: from pattern.en import parse [as alias]
import re

# Assumed imports: this snippet appears to be excerpted from gensim.utils,
# where `tokenize` is defined and `u` is the unicode-literal helper from six.
from six import u
from gensim.utils import tokenize
from pattern.en import parse


def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False, stopwords=frozenset()):
    """
    This function is only available when the optional 'pattern' package is installed.

    Use the English lemmatizer from `pattern` to extract tokens in
    their base form (lemma), e.g. "are, is, being" -> "be" etc.
    This is a smarter version of stemming, taking word context into account.

    Only considers nouns, verbs, adjectives and adverbs by default (all other lemmas are discarded).

    >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
    ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']
    >>> lemmatize('The study ranks high.')
    ['study/NN', 'rank/VB', 'high/JJ']
    >>> lemmatize('The ranks study hard.')
    ['rank/NN', 'study/VB', 'hard/RB']
    """
    if light:
        import warnings
        warnings.warn("The light flag is no longer supported by pattern.")

    # Tokenization in `pattern` is weird; it gets thrown off by non-letters,
    # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little.
    # FIXME this throws away all fancy parsing cues, including sentence structure,
    # abbreviations etc.
    content = u(' ').join(tokenize(content, lower=True, errors='ignore'))

    parsed = parse(content, lemmata=True, collapse=False)
    result = []
    for sentence in parsed:
        for token, tag, _, _, lemma in sentence:
            if 2 <= len(lemma) <= 15 and not lemma.startswith('_') and lemma not in stopwords:
                if allowed_tags.match(tag):
                    lemma += "/" + tag[:2]
                    result.append(lemma.encode('utf8'))
    return result
#endif HAS_PATTERN
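A quick usage sketch for the function above. The expected outputs follow the function's own doctests; the stopword set passed in the second call is purely illustrative:

print(lemmatize('The study ranks high.'))
# ['study/NN', 'rank/VB', 'high/JJ']

# Lemmas contained in `stopwords` are dropped before the POS-tag filter runs:
print(lemmatize('The study ranks high.', stopwords=frozenset(['high'])))
# ['study/NN', 'rank/VB']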
Example 2: lemmatize
# Required module: from pattern import en [as alias]
# Or: from pattern.en import parse [as alias]
import re

# Assumed imports, as in Example 1 (`tokenize` from gensim.utils, `u` from six).
from six import u
from gensim.utils import tokenize
from pattern.en import parse


def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False):
    """
    This function is only available when the optional 'pattern' package is installed.

    Use the English lemmatizer from `pattern` to extract tokens in
    their base form (lemma), e.g. "are, is, being" -> "be" etc.
    This is a smarter version of stemming, taking word context into account.

    Only considers nouns, verbs, adjectives and adverbs by default (all other lemmas are discarded).

    >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
    ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']
    >>> lemmatize('The study ranks high.')
    ['study/NN', 'rank/VB', 'high/JJ']
    >>> lemmatize('The ranks study hard.')
    ['rank/NN', 'study/VB', 'hard/RB']
    """
    if light:
        import warnings
        warnings.warn("The light flag is no longer supported by pattern.")

    # Tokenization in `pattern` is weird; it gets thrown off by non-letters,
    # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little.
    # FIXME this throws away all fancy parsing cues, including sentence structure,
    # abbreviations etc.
    content = u(' ').join(tokenize(content, lower=True, errors='ignore'))

    parsed = parse(content, lemmata=True, collapse=False)
    result = []
    for sentence in parsed:
        for token, tag, _, _, lemma in sentence:
            if 2 <= len(lemma) <= 15 and not lemma.startswith('_'):
                if allowed_tags.match(tag):
                    lemma += "/" + tag[:2]
                    result.append(lemma.encode('utf8'))
    return result
#endif HAS_PATTERN
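This variant is identical to Example 1 except that it accepts no stopwords argument: every lemma that passes the length and POS-tag checks is kept. Example 1 simply adds `lemma not in stopwords` to the same filtering condition, which lets callers exclude unwanted lemmas up front.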