This article collects typical usage examples of the Python method nltk.PorterStemmer.stem_word. If you have been wondering what PorterStemmer.stem_word does, how to call it, or what real code using it looks like, the curated examples below should help. You can also explore further usage examples of the containing class, nltk.PorterStemmer.
The following lists 8 code examples of PorterStemmer.stem_word, ordered by popularity by default.
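Before the examples, a minimal sketch of the method itself: stem_word takes a single token and returns its Porter stem. Note that stem_word belongs to older NLTK releases; newer versions expose the equivalent stem method instead.
from nltk import PorterStemmer

stemmer = PorterStemmer()
# stem_word reduces a single token to its Porter stem
print(stemmer.stem_word('running'))   # -> 'run'
print(stemmer.stem_word('stemming'))  # -> 'stem'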
Example 1: stemming
# Required import: from nltk import PorterStemmer [as alias]
# Or: from nltk.PorterStemmer import stem_word [as alias]
def stemming(line_list):
    """
    Input: line_list (list of strings (sentences/documents)), e.g. dataset.data
    Iterates over all terms in each line and stems them.
    Return: stemmed_list (list of strings of stemmed terms)
    """
    stemmed_list = []
    stemmer = PorterStemmer()
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()
        # remove punctuation
        # the method below would simply strip punctuation, but it merges
        # tokens by mistake, e.g. amazon.com => amazoncom:
        # nopunct_line = ''.join([c for c in line
        #                         if re.match("[a-z\-\' \n\t]", c)])
        # this solves the problem above:
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)
        # tokenize (wt is assumed to be nltk.tokenize.word_tokenize)
        line_token = wt(nopunct_line)
        # list to store stemmed terms
        stemmed_line = []
        for term in line_token:
            term = stemmer.stem_word(term)
            stemmed_line.append(term)
        # back to a sentence as a single string
        stemmed_sentence = ' '.join(stemmed_line)
        stemmed_list.append(stemmed_sentence)
    return stemmed_list
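A quick usage sketch, assuming the imports the snippet relies on (import re and from nltk.tokenize import word_tokenize as wt, in addition to PorterStemmer):
docs = ['The cats are running quickly.', 'Visit amazon.com for deals!']
print(stemming(docs))
# -> ['the cat are run quickli', 'visit amazon com for deal']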
Example 2: make_tags
# Required import: from nltk import PorterStemmer [as alias]
# Or: from nltk.PorterStemmer import stem_word [as alias]
def make_tags(title_string):
    stemmer = PorterStemmer()
    ret = []
    for word in title_string.split():
        # stop_words is assumed to be defined elsewhere in the module
        if word not in stop_words:
            ret.append(stemmer.stem_word(word.lower()))
    return ret
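For example, with a hypothetical stop_words set:
stop_words = {'the', 'of', 'a'}
print(make_tags('the Taming of the Shrew'))
# -> ['tame', 'shrew']
# (note: the stop-word check runs before lowercasing, so a capitalized
# 'The' would slip through the filter and be stemmed)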
Example 3: __process_email
# Required import: from nltk import PorterStemmer [as alias]
# Or: from nltk.PorterStemmer import stem_word [as alias]
def __process_email(self, email_contents, vocab):
    '''
    Preprocess the body of an email and return a
    list of word indices.
    Arguments:
        email_contents (str): Email body.
        vocab (dict): Word-to-index dictionary.
    Return:
        (int list): Indices into vocab of the words that survive processing.
    '''
    # Lower case.
    email_contents = email_contents.lower()
    # Strip all HTML:
    # look for any expression that starts with < and ends with >,
    # contains no < or > inside, and replace it with a space.
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)
    # Handle numbers: look for one or more characters between 0-9.
    email_contents = re.sub(r'[0-9]+', 'number', email_contents)
    # Handle URLs: look for strings starting with http:// or https://.
    email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_contents)
    # Handle email addresses: look for strings with @ in the middle.
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)
    # Handle $ sign.
    email_contents = re.sub(r'[$]+', 'dollar', email_contents)
    # Tokenize and also get rid of any punctuation. A character class is
    # used so that (, ), [ and ] act as literal delimiters rather than
    # regex metacharacters.
    word_list = re.split(r'[ @$/#.\-:&*+=\[\]?!(){},\'">_<;%]',
                         email_contents)
    # Remove empty strings and skip words that are too short.
    word_list = [s for s in word_list if s and len(s) > 1]
    # Remove any non-alphanumeric characters.
    word_list = [re.sub('[^a-zA-Z0-9]', '', s) for s in word_list]
    # Remove empty strings and short words again, since the previous
    # step may have shortened some words.
    word_list = [s for s in word_list if s and len(s) > 1]
    # Stem each word.
    ps = PorterStemmer()
    word_list = [ps.stem_word(s) for s in word_list]
    word_indices = []
    # Find each word's index in the vocab dict.
    for w in word_list:
        if w in vocab:
            word_indices.append(vocab[w])
    return word_indices
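Tracing a short input through these steps (the vocab entries here are hypothetical):
# input:   'Best deals at http://example.com today!'
# after lowercasing and the substitutions above:
#   'best deals at httpaddr today!'
# after splitting, filtering, and stemming:
#   ['best', 'deal', 'at', 'httpaddr', 'todai']
# with vocab = {'best': 10, 'deal': 11, 'httpaddr': 12}, the method returns:
#   [10, 11, 12]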
Example 4: getStemmedWords
# Required import: from nltk import PorterStemmer [as alias]
# Or: from nltk.PorterStemmer import stem_word [as alias]
def getStemmedWords(self, html):
    # 'html' is expected to be an iterable of tokens extracted from a page
    stemmed_words = []
    # stemmer = SnowballStemmer("english")
    stemmer = PorterStemmer()
    for token in html:
        stemmed_words.append(stemmer.stem_word(token))
    return ' '.join(stemmed_words)
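Since the loop body only maps stem_word over the tokens, the same method can be collapsed into a single join over a generator expression; a stylistic alternative, not a behavior change:
return ' '.join(stemmer.stem_word(token) for token in html)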
Example 5: __init__
# Required import: from nltk import PorterStemmer [as alias]
# Or: from nltk.PorterStemmer import stem_word [as alias]
def __init__(self, text):
    lmtzr = WordNetLemmatizer()
    porter_stem = PorterStemmer()
    # map Penn Treebank tag prefixes to WordNet POS tags
    wordnet_tag = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r'}
    data = text.lower()
    tokens = nltk.word_tokenize(data)
    tagged = nltk.pos_tag(tokens)
    word_list = []
    for t in tagged:
        try:
            word_list.append(lmtzr.lemmatize(t[0], wordnet_tag[t[1][:2]]))
        except KeyError:
            # tag prefix not in the map above: fall back to the Porter stemmer
            word_list.append(porter_stem.stem_word(t[0]))
    self.filtered_words = [w for w in word_list
                           if w not in stopwords.words('english')]
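A usage sketch, assuming the class is named TextFilter (hypothetical) and the usual imports (import nltk, from nltk import PorterStemmer, from nltk.stem import WordNetLemmatizer, from nltk.corpus import stopwords):
tf = TextFilter('The children were running through the leaves')
print(tf.filtered_words)
# -> ['child', 'run', 'leaf']
# (exact output may vary with the tagger version; words whose tag prefix
# is not in wordnet_tag fall back to the Porter stemmer)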
Example 6: run_cv
# Required import: from nltk import PorterStemmer [as alias]
# Or: from nltk.PorterStemmer import stem_word [as alias]
# (tail of run_cv: pick the best cross-validation result and aggregate it)
df_to_append = tmp_df_agg.sort(('AUC test mean'), ascending=False).head(1)  # pick the row with the maximum AUC
df_to_append = df_to_append[['C', 'AUC test mean']]  # keep only the needed columns
df_to_append.columns = ['C', 'CV AUC']  # rename the selected columns
df_to_append.insert(0, 'Vectorizer', train_data_ind)  # add columns
df_to_append.insert(1, 'Stemming', stemming)  # add columns
df_auc_agg = df_auc_agg.append(df_to_append, ignore_index=True)  # append the row to the aggregate dataframe
# (DataFrame.sort and DataFrame.append are old pandas APIs; modern pandas
# uses sort_values and pd.concat instead)
for x in vectorized_text:
    run_cv(x, stemming=False)
# Stem the source text
from nltk import PorterStemmer
ps = PorterStemmer()
import re
stemmed_train_text = [' '.join([ps.stem_word(x) for x in re.findall(r"[\w']+", y)]) for y in twenty_train.data]
# Dictionary of vectorizers for the stemmed text
vectorizers_stem = {'CountVect': CountVectorizer(binary=False), 'CountVectBin': CountVectorizer(binary=True), 'TFIDFVect': TfidfVectorizer()}
# Vectorize the stemmed text
vectorized_stemmed_text_train = {}
for i in vectorizers_stem:
    vectorized_stemmed_text_train[i] = vectorizers_stem[i].fit_transform(stemmed_train_text)
vectorized_stemmed_text_train  # (notebook-style display of the dict)
for x in vectorized_stemmed_text_train:
    run_cv(x, stemming=True)
print('Best models')
df_auc_agg.sort(('CV AUC'), ascending=False)
# Train the best model on the full training set and compute AUC on the training and test sets
best_model = LogisticRegression(class_weight='balanced', penalty='l1', C=6.0).fit(vectorized_stemmed_text_train['TFIDFVect'], train_labels)
train_auc = calc_auc(y_labels=train_labels, y_predicted=best_model.predict_proba(vectorized_stemmed_text_train['TFIDFVect'])[:, 1])
print('Train AUC = ' + str(train_auc))
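calc_auc is not defined in this excerpt; a minimal sketch of what it might look like, assuming it wraps sklearn.metrics.roc_auc_score (the body is a guess that merely matches the call site above):
from sklearn.metrics import roc_auc_score

def calc_auc(y_labels, y_predicted):
    # ROC AUC from true labels and predicted positive-class probabilities
    return roc_auc_score(y_labels, y_predicted)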
Example 7: stem_word
# Required import: from nltk import PorterStemmer [as alias]
# Or: from nltk.PorterStemmer import stem_word [as alias]
def stem_word(word):
    # instantiate the stemmer: calling stem_word on the class itself
    # would fail, since it is an instance method
    return PorterStemmer().stem_word(word)
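For instance:
print(stem_word('caresses'))  # -> 'caress'
Note that this wrapper builds a new PorterStemmer on every call; in a tight loop it is cheaper to hoist the instance out, as the other examples on this page do.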
Example 8: len
# Required import: from nltk import PorterStemmer [as alias]
# Or: from nltk.PorterStemmer import stem_word [as alias]
# (Python 2 code: cjson is a Python 2 JSON codec, and print is a statement here)
import cjson
from nltk import PorterStemmer
infile = './stop_word_list_new'
x = PorterStemmer()
f = open(infile, 'r')
listt = cjson.decode(f.readline())
nw = list(set(listt))
new_list = []
for word in nw:
    word1 = x.stem_word(word)
    if word1 not in new_list:
        new_list.append(word1)
newlist = list(set(new_list))  # unused: new_list is already deduplicated by the check above
print new_list
print len(new_list)
outfile = './stop_word_porter_stems'
o = open(outfile, 'w')
o.write(cjson.encode(new_list))
outfile1 = './stop_word_list_new'
o1 = open(outfile1, 'w')
o1.write(cjson.encode(nw))
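For comparison, a Python 3 sketch of the same routine, assuming the standard-library json module in place of cjson and context managers for the file handling (not part of the original):
import json
from nltk import PorterStemmer

stemmer = PorterStemmer()
with open('./stop_word_list_new') as f:
    words = list(set(json.loads(f.readline())))
# collect unique Porter stems of the stop words, preserving insertion order
stems = []
for word in words:
    stem = stemmer.stem_word(word)  # on modern NLTK: stemmer.stem(word)
    if stem not in stems:
        stems.append(stem)
print(stems)
print(len(stems))
with open('./stop_word_porter_stems', 'w') as out:
    out.write(json.dumps(stems))
with open('./stop_word_list_new', 'w') as out:
    out.write(json.dumps(words))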