This page collects typical usage examples of the Python method pattern.en.parse. If you are wondering what en.parse does, how to use it, or what it looks like in practice, the curated code examples below may help. You can also explore the containing module pattern.en for more usage examples.
Below are 2 code examples of the en.parse method, sorted by popularity by default. You can upvote examples you like or find useful; your ratings help the system recommend better Python code examples.
Example 1: lemmatize

# Required module import: from pattern import en [as alias]
# or: from pattern.en import parse [as alias]
import re

from pattern.en import parse

def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False, stopwords=frozenset()):
    """
    This function is only available when the optional 'pattern' package is installed.

    Use the English lemmatizer from `pattern` to extract tokens in their base
    form (lemma), e.g. "are, is, being" -> "be" etc. This is a smarter version
    of stemming, which takes word context into account.

    Only considers nouns, verbs, adjectives and adverbs by default (all other
    lemmas are discarded).

    >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
    ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']
    >>> lemmatize('The study ranks high.')
    ['study/NN', 'rank/VB', 'high/JJ']
    >>> lemmatize('The ranks study hard.')
    ['rank/NN', 'study/VB', 'hard/RB']
    """
    if light:
        import warnings
        warnings.warn("The light flag is no longer supported by pattern.")

    # tokenization in `pattern` is weird; it gets thrown off by non-letters,
    # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
    # FIXME this throws away all fancy parsing cues, including sentence structure,
    # abbreviations etc.
    # NOTE: `tokenize` and `u` are helpers defined elsewhere in the host module
    # (gensim.utils in the original source), not part of this snippet.
    content = u(' ').join(tokenize(content, lower=True, errors='ignore'))

    parsed = parse(content, lemmata=True, collapse=False)
    result = []
    for sentence in parsed:
        for token, tag, _, _, lemma in sentence:
            if 2 <= len(lemma) <= 15 and not lemma.startswith('_') and lemma not in stopwords:
                if allowed_tags.match(tag):
                    lemma += "/" + tag[:2]
                    result.append(lemma.encode('utf8'))
    return result
#endif HAS_PATTERN
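For context, here is a minimal sketch of what pattern.en.parse itself returns when called the way lemmatize does above (lemmata=True, collapse=False). The sample sentence is taken from the doctest; the tag shown in the trailing comment is indicative rather than guaranteed:

from pattern.en import parse

# With collapse=False, parse() yields one list per sentence; with lemmata=True,
# each token is a [word, POS tag, chunk tag, PNP tag, lemma] slice -- the same
# five fields that lemmatize() unpacks above.
for sentence in parse('The study ranks high.', lemmata=True, collapse=False):
    for word, tag, chunk, pnp, lemma in sentence:
        print(word, tag, lemma)  # e.g. "ranks VBZ rank"

This is why lemmatize can filter on the POS tag (second field) and build its output from the lemma (fifth field).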
Example 2: lemmatize (identical to Example 1 except that it does not take a stopwords argument)

# Required module import: from pattern import en [as alias]
# or: from pattern.en import parse [as alias]
import re

from pattern.en import parse

def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False):
    """
    This function is only available when the optional 'pattern' package is installed.

    Use the English lemmatizer from `pattern` to extract tokens in their base
    form (lemma), e.g. "are, is, being" -> "be" etc. This is a smarter version
    of stemming, which takes word context into account.

    Only considers nouns, verbs, adjectives and adverbs by default (all other
    lemmas are discarded).

    >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
    ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']
    >>> lemmatize('The study ranks high.')
    ['study/NN', 'rank/VB', 'high/JJ']
    >>> lemmatize('The ranks study hard.')
    ['rank/NN', 'study/VB', 'hard/RB']
    """
    if light:
        import warnings
        warnings.warn("The light flag is no longer supported by pattern.")

    # tokenization in `pattern` is weird; it gets thrown off by non-letters,
    # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
    # FIXME this throws away all fancy parsing cues, including sentence structure,
    # abbreviations etc.
    # NOTE: `tokenize` and `u` are helpers defined elsewhere in the host module
    # (gensim.utils in the original source), not part of this snippet.
    content = u(' ').join(tokenize(content, lower=True, errors='ignore'))

    parsed = parse(content, lemmata=True, collapse=False)
    result = []
    for sentence in parsed:
        for token, tag, _, _, lemma in sentence:
            if 2 <= len(lemma) <= 15 and not lemma.startswith('_'):
                if allowed_tags.match(tag):
                    lemma += "/" + tag[:2]
                    result.append(lemma.encode('utf8'))
    return result
#endif HAS_PATTERN
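The only behavioral difference from Example 1 is the stopwords filter. As a hedged usage sketch of that difference (expected outputs are inferred from the doctest above, not verified against a live pattern install; 'world' is an arbitrary stopword chosen for illustration):

# Stopwords are matched against the bare lemma, before the '/TAG' suffix is
# appended. The second call assumes Example 1's variant, which accepts a
# stopwords set; Example 2's variant has no such parameter.
print(lemmatize('Hello World! How is it going?!'))
# expected: ['world/NN', 'be/VB', 'go/VB']
print(lemmatize('Hello World! How is it going?!', stopwords=frozenset(['world'])))
# expected: ['be/VB', 'go/VB']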