

Python PorterStemmer.stem_word Method Code Examples

This article collects typical code examples of the nltk.PorterStemmer.stem_word method in Python. If you have been wondering how exactly to use PorterStemmer.stem_word, what it does, or want real-world examples of it, the hand-picked snippets below may help. You can also explore further usage examples of the containing class, nltk.PorterStemmer.


The following presents 8 code examples of the PorterStemmer.stem_word method, sorted by popularity by default.
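A note on API versions: stem_word belonged to older NLTK releases; recent NLTK exposes only PorterStemmer.stem, which takes the same single-word argument. A minimal usage sketch, assuming a reasonably current NLTK install:

from nltk import PorterStemmer

stemmer = PorterStemmer()
print(stemmer.stem('caresses'))  # -> 'caress'
print(stemmer.stem('ponies'))    # -> 'poni'
# On the old NLTK versions used below, the equivalent call was
# stemmer.stem_word('ponies').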

Example 1: stemming

# Required import: from nltk import PorterStemmer [as alias]
# Or: from nltk.PorterStemmer import stem_word [as alias]
def stemming(line_list):
    """
    Input: line_list (list of strings(sentences/documents)) - e.g. dataset.data

    Iterates over all terms in lines, stem them

    Return: stemmed_list (list of strings(terms that stemmed))
    """
    stemmed_list = []
    stemmer = PorterStemmer()
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()
        # Remove punctuation. A plain character filter would merge tokens
        # around removed punctuation, e.g. amazon.com => amazoncom:
        # nopunct_line = ''.join([c for c in line
        #                         if re.match("[a-z\-\' \n\t]", c)])
        # Replacing non-alphanumeric runs with a space avoids that problem:
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)
        # tokenize
        line_token = wt(nopunct_line)
        # list to store stemmed terms
        stemmed_line = []
        for term in line_token:
            term = stemmer.stem_word(term)
            stemmed_line.append(term)
        # back to sentence as a string
        stemmed_sentence = ' '.join(stemmed_line)
        stemmed_list.append(stemmed_sentence)
    return stemmed_list
Developer: YuanhaoSun, Project: PPLearn, Lines: 32, Source: ml_feature_engineering.py
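A hypothetical call of the stemming function above; it assumes, as in the source file, that wt is bound to nltk.word_tokenize and that re is imported:

docs = ['Stemming reduces words.', 'Visit amazon.com today!']
print(stemming(docs))
# expected: ['stem reduc word', 'visit amazon com today']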

Example 2: make_tags

# Required import: from nltk import PorterStemmer [as alias]
# Or: from nltk.PorterStemmer import stem_word [as alias]
def make_tags(title_string):
    stemmer = PorterStemmer()
    ret = []
    for word in title_string.split():
        if word not in stop_words:
            ret.append(stemmer.stem_word(word.lower()))
    return ret
Developer: abhijat, Project: RedditSearch, Lines: 9, Source: tagger.py
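make_tags relies on a module-level stop_words collection defined elsewhere in the project. A usage sketch with an assumed stop-word set; note the membership test runs before lower(), so a capitalized stop word such as 'The' would slip through:

stop_words = {'the', 'of', 'a'}  # assumed; defined elsewhere in the project
print(make_tags('the meaning of stemming'))
# expected: ['mean', 'stem']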

Example 3: __process_email

# Required import: from nltk import PorterStemmer [as alias]
# Or: from nltk.PorterStemmer import stem_word [as alias]
  def __process_email(self, email_contents, vocab):
    '''
    Preprocesses the body of an email and returns a list of word indices.

    Arguments:
      email_contents (str): Email body.
      vocab (dict): Word-to-index dictionary.

    Return:
      (int list): Indices into vocab for the processed tokens.
    '''
    # Lower case.
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with >, contains
    # no < or > inside the tag, and replaces it with a space.
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLs
    # Look for strings starting with http:// or https://
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # Tokenize and also get rid of any punctuation (split on any delimiter
    # in the character class below).
    word_list = re.split(r'[ @$/#.\-:&*+=\[\]?!(){},\'">_<;%]', email_contents)

    # Remove empty string and skip the word if it is too short.
    word_list = [s for s in word_list if s and len(s) > 1]

    # Remove any non alphanumeric characters
    word_list = [re.sub('[^a-zA-Z0-9]', '', s) for s in word_list]

    # Remove empty string and skip the word if it is too short.
    word_list = [s for s in word_list if s and len(s) > 1]

    # Stem the word
    ps = PorterStemmer() 
    word_list = [ps.stem_word(s) for s in word_list]
    word_indices = []

    # Find index in vocab list.
    for w in word_list:
      if w in vocab:
        word_indices.append(vocab[w])
    return word_indices
Developer: farjan, Project: MachineLearning, Lines: 60, Source: ex6.py
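The regex pipeline is easiest to see on a concrete string. A minimal trace of the same substitutions, with an assumed email body:

import re

body = '<p>Win $1000 at http://spam.example or mail win@spam.example</p>'.lower()
body = re.sub('<[^<>]+>', ' ', body)                       # strip HTML tags
body = re.sub('[0-9]+', 'number', body)                    # normalize digits
body = re.sub('(http|https)://[^\s]*', 'httpaddr', body)   # normalize URLs
body = re.sub('[^\s]+@[^\s]+', 'emailaddr', body)          # normalize emails
body = re.sub('[$]+', 'dollar', body)                      # normalize $ signs
print(body)  # ' win dollarnumber at httpaddr or mail emailaddr '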

Example 4: getStemmedWords

# Required import: from nltk import PorterStemmer [as alias]
# Or: from nltk.PorterStemmer import stem_word [as alias]
 def getStemmedWords(self, html):
     stemmed_words = []
     # stemmer = SnowballStemmer("english")
     stemmer = PorterStemmer()
     for token in html:
         stemmed_words.append(stemmer.stem_word(token))
     return ' '.join(stemmed_words)
Developer: usc-isi-i2, Project: dig-classifier, Lines: 11, Source: preprocessor.py
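Despite the parameter name, html is iterated token by token, so it is presumably an already-tokenized list. A usage sketch; the enclosing class name is not shown in the excerpt and is assumed here:

pre = Preprocessor()  # hypothetical class name
print(pre.getStemmedWords(['running', 'caresses']))
# expected: 'run caress'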

Example 5: __init__

# Required import: from nltk import PorterStemmer [as alias]
# Or: from nltk.PorterStemmer import stem_word [as alias]
 def __init__(self,text):
     lmtzr = WordNetLemmatizer()
     porter_stem = PorterStemmer()
     wordnet_tag ={'NN':'n','JJ':'a','VB':'v','RB':'r'}
     data = text.lower()
     tokens = nltk.word_tokenize(data)
     tagged = nltk.pos_tag(tokens)
     word_list = []
     for t in tagged:
         try:
             word_list.append(lmtzr.lemmatize(t[0], wordnet_tag[t[1][:2]]))
         except KeyError:
             # No WordNet POS mapping for this tag: fall back to the Porter stemmer.
             word_list.append(porter_stem.stem_word(t[0]))
     self.filtered_words = [w for w in word_list if w not in stopwords.words('english')]
Developer: pankajksharma, Project: Opinion-Mining, Lines: 16, Source: datacleaner.py
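A usage sketch; the enclosing class name is not shown in the excerpt and is assumed here. With a typical pos_tag model, 'the' (tagged DT) has no WordNet mapping and falls back to the Porter stemmer, while 'were' lemmatizes to 'be' and is then dropped as a stop word:

dc = DataCleaner('The cats were running quickly')  # hypothetical class name
print(dc.filtered_words)
# likely: ['cat', 'run', 'quickly']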

Example 6: run_cv

# Required import: from nltk import PorterStemmer [as alias]
# Or: from nltk.PorterStemmer import stem_word [as alias]
    df_to_append = tmp_df_agg.sort(('AUC test mean'), ascending=False).head(1) # pick the row with the maximum AUC
    df_to_append = df_to_append[['C', 'AUC test mean']] # keep only the needed columns
    df_to_append.columns = ['C', 'CV AUC'] # rename the selected columns
    df_to_append.insert(0, 'Vectorizer', train_data_ind) # add a column
    df_to_append.insert(1, 'Stemming', stemming) # add a column
    df_auc_agg = df_auc_agg.append(df_to_append, ignore_index=True) # append the row to the aggregate dataframe

for x in vectorized_text:
    run_cv(x, stemming = False)

# Stem the source text
from nltk import PorterStemmer
ps = PorterStemmer()
import re
stemmed_train_text =  [' '.join([ps.stem_word(x) for x in re.findall(r"[\w']+", y)]) for y in twenty_train.data]
# Dictionary of vectorizers for the stemmed text
vectorizers_stem = {'CountVect': CountVectorizer(binary = False), 'CountVectBin': CountVectorizer(binary = True), 'TFIDFVect': TfidfVectorizer()}
# Vectorize the stemmed text
vectorized_stemmed_text_train = {}
for i in vectorizers_stem:
    vectorized_stemmed_text_train[i] = vectorizers_stem[i].fit_transform(stemmed_train_text)
vectorized_stemmed_text_train
for x in vectorized_stemmed_text_train:
    run_cv(x, stemming = True)
print('Best models')
df_auc_agg.sort(('CV AUC'), ascending=False)
# Fit the best model on the full training set and compute AUC on the training and test sets
best_model = LogisticRegression(class_weight = 'balanced', penalty = 'l1', C = 6.0).fit(vectorized_stemmed_text_train['TFIDFVect'], train_labels)
train_auc = calc_auc(y_labels = train_labels, y_predicted = best_model.predict_proba(vectorized_stemmed_text_train['TFIDFVect'])[:, 1])
print('Train AUC = ' + str(train_auc))
Developer: Nakols, Project: Machine_Learning, Lines: 32, Source: MachineLearning1.py
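Side note: this snippet targets an older pandas; DataFrame.sort was removed in pandas 0.20 and DataFrame.append in pandas 2.0. A sketch of the equivalent aggregation step on current pandas, reusing the snippet's own names and assuming pandas is imported as pd:

df_to_append = tmp_df_agg.sort_values('AUC test mean', ascending=False).head(1)
df_auc_agg = pd.concat([df_auc_agg, df_to_append], ignore_index=True)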

Example 7: stem_word

# Required import: from nltk import PorterStemmer [as alias]
# Or: from nltk.PorterStemmer import stem_word [as alias]
def stem_word(word):
    # stem_word is an instance method, so instantiate the stemmer first.
    return PorterStemmer().stem_word(word)
Developer: raethlein, Project: TrendAnalysis, Lines: 4, Source: cleaner.py
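Constructing a PorterStemmer on every call is wasteful; a variant that reuses a single module-level instance (a sketch):

from nltk import PorterStemmer

_stemmer = PorterStemmer()

def stem_word(word):
    return _stemmer.stem_word(word)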

Example 8: len

# Required import: from nltk import PorterStemmer [as alias]
# Or: from nltk.PorterStemmer import stem_word [as alias]
import cjson
from nltk import PorterStemmer
infile='./stop_word_list_new'
x=PorterStemmer()
f=open(infile,'r')
listt=cjson.decode(f.readline())
nw=list(set(listt))
new_list=[]
for word in nw:
    word1=x.stem_word(word)
    if word1 not in new_list:
        new_list.append(word1)
newlist=list(set(new_list))
print new_list
print len(new_list)

outfile='./stop_word_porter_stems'
o=open(outfile,'w')
o.write(cjson.encode(new_list))
outfile1='./stop_word_list_new'
o1=open(outfile1,'w')
o1.write(cjson.encode(nw))
Developer: WeiNiu, Project: lsfolk, Lines: 24, Source: get_stop_word_list1.py
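For reference, cjson is a third-party Python 2 JSON codec. A rough Python 3 equivalent of the deduplication step, using the standard-library json module and the modern stem method (a sketch; it assumes the input file holds one JSON array of words):

import json
from nltk import PorterStemmer

stemmer = PorterStemmer()
with open('./stop_word_list_new') as f:
    words = set(json.load(f))                         # unique stop words
stems = sorted({stemmer.stem(w) for w in words})      # unique Porter stems
print(stems, len(stems))
with open('./stop_word_porter_stems', 'w') as out:
    json.dump(stems, out)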


Note: The nltk.PorterStemmer.stem_word examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright in the source code remains with the original authors, and any use or distribution must follow the corresponding project's license. Please do not reproduce without permission.