本文整理汇总了Python中nltk.stem.PorterStemmer方法的典型用法代码示例。如果您正苦于以下问题:Python stem.PorterStemmer方法的具体用法?Python stem.PorterStemmer怎么用?Python stem.PorterStemmer使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块nltk.stem的用法示例。
在下文中一共展示了stem.PorterStemmer方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __repr__
# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def __repr__(self):
    """Return the fixed debug representation of this stemmer."""
    return "<PorterStemmer>"
## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
# p = PorterStemmer()
# if len(sys.argv) > 1:
# for f in sys.argv[1:]:
# with open(f, 'r') as infile:
# while 1:
# w = infile.readline()
# if w == '':
# break
# w = w[:-1]
# print(p.stem(w))
##--NLTK--
## Added a demo() function
示例2: __repr__
# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def __repr__(self):
    """Debug representation; always the same literal tag."""
    return "<PorterStemmer>"
## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
# p = PorterStemmer()
# if len(sys.argv) > 1:
# for f in sys.argv[1:]:
# infile = open(f, 'r')
# while 1:
# w = infile.readline()
# if w == '':
# break
# w = w[:-1]
# print p.stem(w)
##--NLTK--
## Added a demo() function
示例3: plot_term_kdes
# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def plot_term_kdes(self, words, **kwargs):
    """
    Overlay kernel density estimates for several words on one figure.

    Args:
        words (list): A list of unstemmed terms.
        **kwargs: Passed through to ``self.kde``.
    """
    stemmer = PorterStemmer()
    for term in words:
        # Stem the surface form before looking up its density estimate.
        plt.plot(self.kde(stemmer.stem(term), **kwargs))
    plt.show()
示例4: tokenize
# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def tokenize(text):
    """
    Yield token records for every lowercase alphabetic run in *text*.

    Args:
        text (str): The original text.

    Yields:
        dict: The next token, with keys ``stemmed``, ``unstemmed``
        and ``offset`` (0-based position among the matched tokens).
    """
    stemmer = PorterStemmer()
    matches = re.finditer('[a-z]+', text.lower())
    for offset, match in enumerate(matches):
        raw = match.group(0)
        yield {
            'stemmed': stemmer.stem(raw),
            'unstemmed': raw,
            'offset': offset,
        }
示例5: preprocess
# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def preprocess(string):
    """
    Normalize a raw string: drop punctuation characters, remove
    stopwords, and Porter-stem the surviving lowercased words.
    """
    stemmer = PorterStemmer()
    # Remove every punctuation character before splitting on spaces.
    depunctuated = ''.join(ch for ch in string if ch not in punctuation)
    # NOTE: stopword membership is tested on the original-case token,
    # matching the original implementation.
    return ' '.join(
        stemmer.stem(token.lower())
        for token in depunctuated.split(' ')
        if token not in stops
    )
# Shuffle
示例6: _create_frequency_table
# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def _create_frequency_table(text_string) -> dict:
    """
    Create a word-frequency table over the Porter-stemmed words of
    *text_string*, excluding English stopwords.

    :param text_string: Raw input text to tokenize and count.
    :rtype: dict
    :return: Mapping of stemmed word -> occurrence count.
    """
    stop_words = set(stopwords.words("english"))
    ps = PorterStemmer()
    freq_table = dict()
    for word in word_tokenize(text_string):
        # Filter stopwords on the surface form BEFORE stemming: stemming
        # can mangle stopwords (e.g. "this" -> "thi") so they would slip
        # past a post-stemming check, contradicting the intent above.
        if word.lower() in stop_words:
            continue
        stem = ps.stem(word)
        freq_table[stem] = freq_table.get(stem, 0) + 1
    return freq_table
示例7: tagFilterAndStemming
# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def tagFilterAndStemming(originalTag):
    """
    Normalize a tag string into a list of stemmed tokens.

    Non-alphanumeric characters become spaces, runs of spaces collapse,
    each token is Porter-stemmed, and stemmed tokens that are English
    stopwords are dropped.

    :param originalTag: Raw tag string.
    :return: List of stemmed, non-stopword tokens.
    """
    # Remove non-alphanumeric characters and collapse whitespace runs.
    cleaned = re.sub("[^a-zA-Z0-9]", " ", originalTag)
    # strip() prevents empty tokens from leading/trailing spaces, which
    # previously ended up in the result as '' entries.
    cleaned = re.sub(" +", " ", cleaned).strip()
    stopwords_set = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    result = []
    for tag in cleaned.split(" "):
        if not tag:
            continue  # ''.split(' ') still yields one empty token
        tag_stemmed = stemmer.stem(tag)
        if tag_stemmed not in stopwords_set:
            result.append(tag_stemmed)
    return result
示例8: demo
# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def demo():
    """
    Demonstrate the Porter stemmer on a sample from the Penn Treebank
    corpus, printing the original and stemmed text side by side.
    """
    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()
    originals = []
    stems = []
    for fileid in treebank.files()[:3]:
        for word, _tag in treebank.tagged_words(fileid):
            originals.append(word)
            stems.append(stemmer.stem(word))

    def _wrap(words):
        # Join into one string and word-wrap at roughly 70 columns.
        return re.sub(r"(.{,70})\s", r'\1\n', ' '.join(words) + ' ').rstrip()

    # Print the results with starred banner headings.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(_wrap(originals))
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(_wrap(stems))
    print('*' * 70)
##--NLTK--
示例9: preprocessing
# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def preprocessing(text):
    """
    Clean and normalize *text* for downstream modelling.

    Steps: replace punctuation with spaces, sentence/word tokenize,
    lowercase, drop stopwords and words shorter than 3 characters,
    Porter-stem, POS-tag, then lemmatize (verbs as verbs, everything
    else as nouns).

    :param text: Raw input string.
    :return: Space-joined preprocessed text.
    """
    # Punctuation -> spaces, then collapse repeated whitespace.
    no_punct = "".join(" " if ch in string.punctuation else ch for ch in text)
    normalized = " ".join(no_punct.split())

    stop_list = stopwords.words('english')
    kept = []
    for sentence in nltk.sent_tokenize(normalized):
        for token in nltk.word_tokenize(sentence):
            token = token.lower()
            if token not in stop_list and len(token) >= 3:
                kept.append(token)

    stemmer = PorterStemmer()
    stems = [stemmer.stem(w) for w in kept]

    verb_tags = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
    lemmatizer = WordNetLemmatizer()

    def _lemmatize(token, tag):
        # Verbs are lemmatized as verbs; all other tags (nouns included)
        # fall back to the noun lemma, as in the original branching.
        return lemmatizer.lemmatize(token, 'v' if tag in verb_tags else 'n')

    return " ".join(_lemmatize(tok, tag) for tok, tag in pos_tag(stems))
开发者ID:PacktPublishing,项目名称:Natural-Language-Processing-with-Python-Cookbook,代码行数:36,代码来源:9.5 Skipgram_Keras.py
示例10: preprocessing
# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def preprocessing(text):
    """
    Clean and normalize raw text for email classification.

    Steps: replace punctuation with spaces, tokenize, lowercase, drop
    stopwords and words shorter than 3 characters, Porter-stem
    (best-effort), POS-tag, then lemmatize (verbs as verbs, everything
    else as nouns).

    :param text: Raw input string.
    :return: Space-joined preprocessed text.
    """
    # Punctuation -> spaces, then collapse repeated whitespace.
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    tokens = [word for word in tokens if len(word) >= 3]
    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]
    except Exception:
        # Narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit. On stemmer failure, keep the
        # unstemmed tokens (the original `tokens = tokens` was a no-op).
        pass
    tagged_corpus = pos_tag(tokens)
    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        # Nouns and verbs get their proper POS; everything else is
        # lemmatized as a noun.
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])
    return pre_proc_text
开发者ID:PacktPublishing,项目名称:Natural-Language-Processing-with-Python-Cookbook,代码行数:40,代码来源:9.2 Email_Classification.py
示例11: __init__
# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def __init__(self):
    """Create the wrapper around an NLTK Porter stemmer instance."""
    # NltkPorterStemmer is presumably an import alias for
    # nltk.stem.PorterStemmer — confirm against the file's imports.
    self.stemmer = NltkPorterStemmer()
#overrides
示例12: __init__
# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def __init__(self):
    """Instantiate the underlying NLTK Porter stemmer."""
    # NltkPorterStemmer is presumably an import alias for
    # nltk.stem.PorterStemmer — confirm against the file's imports.
    self.stemmer = NltkPorterStemmer()
示例13: demo
# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus, printing the original and stemmed
    text side by side.
    """
    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()
    orig = []
    stemmed = []
    for item in treebank.files()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results + ' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original + ' ').rstrip()

    # Print the results. Uses print() — the Python 2 print statements
    # here were a syntax error under Python 3 and inconsistent with the
    # other demo() in this file.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*' * 70)
示例14: __init__
# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def __init__(self, pos_tagged, lang, stem=False, min_word_len=3):
    """
    :param pos_tagged: List of lists: text POS-tagged as a list of
        sentences where each sentence is a list of (word, TAG) tuples.
    :param lang: Language code; for 'fr'/'de' tags are converted
        (NC/NE -> NN, ADJA -> JJ — see convert()).
    :param stem: If we want to apply stemming on the text.
    :param min_word_len: Tokens shorter than this are re-tagged 'LESS'
        so they never become candidates.
    """
    self.min_word_len = min_word_len
    self.considered_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'JJ'}
    self.filtered_pos_tagged = []
    self.isStemmed = stem
    self.lang = lang

    # Normalize tokens: stem them, or (when not stemming) lowercase them.
    if stem:
        porter = PorterStemmer()
        self.pos_tagged = [
            [(porter.stem(tok), tag) for tok, tag in sent] for sent in pos_tagged
        ]
    else:
        self.pos_tagged = [
            [(tok.lower(), tag) for tok, tag in sent] for sent in pos_tagged
        ]

    # Re-tag too-short tokens as 'LESS' so candidate filtering skips them.
    self.pos_tagged = [
        [(tok, 'LESS') if len(tok) < min_word_len else (tok, tag)
         for tok, tag in sent]
        for sent in self.pos_tagged
    ]

    # Convert some language-specific tags (NC, NE to NN; ADJA -> JJ).
    if lang in ['fr', 'de']:
        self.pos_tagged = [
            [(tok, convert(tag)) for tok, tag in sent]
            for sent in self.pos_tagged
        ]

    self.filtered_pos_tagged = [
        [(t[0].lower(), t[1]) for t in sent if self.is_candidate(t)]
        for sent in self.pos_tagged
    ]
示例15: __init__
# 需要导入模块: from nltk import stem [as 别名]
# 或者: from nltk.stem import PorterStemmer [as 别名]
def __init__(self, mode=NLTK_EXTENSIONS):
    """
    Initialise the stemmer in one of its three modes.

    :param mode: One of the class constants NLTK_EXTENSIONS,
        MARTIN_EXTENSIONS or ORIGINAL_ALGORITHM.
    :raises ValueError: If *mode* is not one of the three constants.
    """
    if mode not in (
        self.NLTK_EXTENSIONS,
        self.MARTIN_EXTENSIONS,
        self.ORIGINAL_ALGORITHM,
    ):
        raise ValueError(
            "Mode must be one of PorterStemmer.NLTK_EXTENSIONS, "
            "PorterStemmer.MARTIN_EXTENSIONS, or "
            "PorterStemmer.ORIGINAL_ALGORITHM"
        )

    self.mode = mode

    if self.mode == self.NLTK_EXTENSIONS:
        # This is a table of irregular forms. It is quite short,
        # but still reflects the errors actually drawn to Martin
        # Porter's attention over a 20 year period!
        irregular_forms = {
            "sky": ["sky", "skies"],
            "die": ["dying"],
            "lie": ["lying"],
            "tie": ["tying"],
            "news": ["news"],
            "inning": ["innings", "inning"],
            "outing": ["outings", "outing"],
            "canning": ["cannings", "canning"],
            "howe": ["howe"],
            "proceed": ["proceed"],
            "exceed": ["exceed"],
            "succeed": ["succeed"],
        }

        # Invert the table: map each surface form to its fixed stem.
        self.pool = {}
        for key in irregular_forms:
            for val in irregular_forms[key]:
                self.pool[val] = key

    # Vowel set used by the measure/condition helpers of the algorithm.
    self.vowels = frozenset(['a', 'e', 'i', 'o', 'u'])