This article collects typical usage examples of the nltk.stem.PorterStemmer class in Python. If you have been wondering what the PorterStemmer class is for, how to use it, or what real code that uses it looks like, the curated class examples below should help.
The following shows 15 code examples of the PorterStemmer class, ordered roughly by popularity.
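Before diving into the examples, here is a minimal sketch of the class used on its own; the sample words are illustrative, and the stems in the comment are what the Porter algorithm typically produces.

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
for word in ["running", "flies", "easily", "cats"]:
    print(word, "->", stemmer.stem(word))  # e.g. running -> run, flies -> fli, easily -> easili, cats -> cat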
Example 1: _stemmatize
def _stemmatize(self, word):
    lmtzr = WordNetLemmatizer()  # the lemmatizer won't stem words ending in '-ing' unless told the word is a verb
    stemmer = PorterStemmer()
    if word.endswith('ing'):
        return stemmer.stem(word)
    return lmtzr.lemmatize(word)
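The same idea as a standalone function, for quick experimentation outside the original class; the helper name and the sample words below are chosen purely for illustration.

from nltk.stem import PorterStemmer, WordNetLemmatizer

def stemmatize(word):
    # Porter stemmer for '-ing' forms, WordNet lemmatizer for everything else
    if word.endswith('ing'):
        return PorterStemmer().stem(word)
    return WordNetLemmatizer().lemmatize(word)

print(stemmatize('running'))  # e.g. 'run'
print(stemmatize('geese'))    # e.g. 'goose'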
Example 2: porter_list1
def porter_list1(lista):
    stemmer = PorterStemmer()
    newlist = []
    for b in lista:
        b = stemmer.stem(b)
        newlist.append(b)
    return newlist
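A quick usage sketch for porter_list1, assuming the function above and its PorterStemmer import are in scope; the input list and the expected stems are illustrative.

print(porter_list1(["running", "flies", "easily"]))  # e.g. ['run', 'fli', 'easili']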
Example 3: splitAndStem
def splitAndStem(inputfilename, outputfilename):
    '''
    For each ingredient, split it into words, stem each word, and construct a new recipe from those words.
    :param inputfilename: file containing a Python-literal list of recipe dicts
    :param outputfilename: file the stemmed recipes are written to
    '''
    with open(outputfilename, 'w') as ff:
        ff.write('[\n')
    with open(inputfilename) as f:
        d = eval(f.read())
    stemmer = PorterStemmer()
    with open(outputfilename, 'a') as ff:
        for i in d:
            # print(i)
            new_item = {}
            new_ingredients = []
            for ingredient in i['ingredients']:
                tokens = word_tokenize(ingredient)
                clean_tokens = [re.subn('[^A-Za-z]', '', token)[0] for token in tokens]
                new_ingredients += [stemmer.stem(w).lower() for w in clean_tokens]
            new_item['cuisine'] = i['cuisine']
            new_item['id'] = i['id']
            new_item['ingredients'] = new_ingredients
            json_recipe = json.dumps(new_item)
            ff.write('%s,\n' % str(json_recipe))
Example 4: parseReviews
def parseReviews(mypath):
    filelist = os.listdir(mypath)
    wordDict = {}
    negationList = ["no", "not", "never", "can't", "won't", "cannot", "didn't", "couldn't"]
    negationFlag = False
    stopwordList = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    for file in filelist:
        with open(mypath + "/" + file, "r") as f:
            word_list = word_tokenize(f.read())
        for word in word_list:
            if word in negationList:
                # a second negation cancels the first (double negative)
                if negationFlag:
                    negationFlag = False
                else:
                    negationFlag = True
                continue
            if not word.isalnum():
                negationFlag = False
            if word.isalnum() and word not in stopwordList:
                word = stemmer.stem(word)
                if negationFlag:
                    word = "!" + word
                    negationFlag = False
                if word not in wordDict:
                    wordDict[word] = 1
                else:
                    wordDict[word] += 1
    return wordDict
Example 5: tokenizeTags
def tokenizeTags(str, dict_items):
    # temp map (for getting the local term frequency) for a sentence
    str = str.decode('ascii', 'ignore')
    # tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
    # tokens = tokenizer.tokenize(str)
    tokens = str.split()
    stemmer = PorterStemmer()
    # small set of stopwords passed in via dict_items (removes you, are, and, I -- those kinds of words)
    last = []
    # bigram_list = []
    for d in tokens:
        d = d.split('-')
        for c in d:
            # regular expression -> strip punctuation
            c = re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
            if c != '' and c not in dict_items:
                try:
                    if int(c):
                        # collapse numbers to a NUM token, but keep 4-digit years
                        if len(c) != 4 and (int(c) > 2015 or int(c) < 1900):
                            c = stemmer.stem('NUM')
                except Exception:
                    c = stemmer.stem(c.lower())
                last.append(c)
                # bigram generation
                # index = len(last)
                # if index > 1:
                #     bigram = last[index-2] + ' ' + last[index-1]
                #     bigram_list.append(bigram)
    return last
Example 6: tokenize2_bigram
def tokenize2_bigram(str, df_freq):
    temp_map = {}
    # for a sentence
    str = str.decode('ascii', 'ignore')
    tokens = str.split()
    stemmer = PorterStemmer()
    last = []
    bigram_list = []
    for d in tokens:
        d = d.split('-')
        for c in d:
            # regular expression -> strip punctuation
            c = re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
            if c != '':
                try:
                    if int(c):
                        # collapse numbers to a NUM token, but keep 4-digit years
                        if len(c) != 4 and (int(c) > 2015 or int(c) < 1900):
                            c = stemmer.stem('NUM')
                except Exception:
                    c = stemmer.stem(c.lower())
                last.append(c)
                # bigram generation
                index = len(last)
                if index > 1:
                    bigram = last[index-2] + ' ' + last[index-1]
                    bigram_list.append(bigram)
                    updateDF(temp_map, df_freq, bigram)
    return bigram_list
Example 7: openfile
def openfile(filename, output):
    print(filename)
    # start run time
    start = timeit.default_timer()
    ps = PorterStemmer()
    file = open(filename, "r")
    tokens = []
    # used for removing punctuation from the documents
    translate_table = dict((ord(char), None) for char in string.punctuation)
    start2 = timeit.default_timer()
    # split the lines into words and remove the punctuation
    for line in file:
        tokens += word_tokenize(line.translate(translate_table))
    start3 = timeit.default_timer()
    print("tokenize")
    print(start3 - start2)
    # create a set of stop words to be removed later
    stop_words = set(stopwords.words("english"))
    start6 = timeit.default_timer()
    # if a word is not a stop word, add it to a list
    filtered_sentence = []
    for w in tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    start7 = timeit.default_timer()
    print("stop word removal")
    print(start7 - start6)
    startw = timeit.default_timer()
    # stem each word and add it to the output file in CSV form
    f = open(output, 'w')
    iterFilSen = iter(filtered_sentence)
    if output == "documents.csv":
        for w in filtered_sentence:
            if w == "I":
                f.write("\n")
            f.write(ps.stem(w))
            f.write(",")
    else:
        for w in iterFilSen:
            if w == "I":
                f.write("\n")
                # skip the "I <number> W" marker
                next(iterFilSen)
                next(iterFilSen)
            else:
                f.write(ps.stem(w))
                f.write(",")
    # end run time
    stop = timeit.default_timer()
    print("writing")
    print(stop - startw)
    print("total: " + output)
    print(stop - start)
Example 8: testing
def testing():
    # tokenize on sentence and word
    ex_txt = "hello there Mr. Bartuska, How are you? The weather is great and I enjoy Python. cheers!"
    print(sent_tokenize(ex_txt))
    print(word_tokenize(ex_txt, language='english'))
    # stop words (pre-defined by nltk)
    stop_words = set(stopwords.words('english'))
    print(stop_words)
    words = word_tokenize(ex_txt)
    print(words)
    filtered_sent = []
    for w in words:
        if w not in stop_words:
            filtered_sent.append(w)
    print(filtered_sent)
    filtered_sent = [w for w in words if w not in stop_words]
    print(filtered_sent)
    # stemming
    ps = PorterStemmer()
    example_words = ['python', 'pythoner', 'pythoning', 'pythoned', 'pythonly']
    # for w in example_words:
    #     print(ps.stem(w))
    new_text = "it is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
    words = word_tokenize(new_text)
    for w in words:
        print(ps.stem(w))
Example 9: prepare_data
def prepare_data(reviews):
    # run the Porter stemmer on every word
    stemmer = PorterStemmer()
    stem_text = lambda x: {'class': x['class'],
                           'text': stemmer.stem(x['text'])}
    # clean text and remove empty items
    reviews = filter(lambda x: x != {}, reviews)
    reviews = map(stem_text, reviews)
    print('classification: ' + reviews[observed_element]['class'] + '\n\n------------------------------------\n\n')
    print('stemming: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')
    # remove stopwords
    reviews = map(remove_stop_words, reviews)
    print('stopwords: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')
    # remove undesired patterns
    reviews = map(clean_text, reviews)
    print('elementos inuteis: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')
    return reviews
Example 10: extract_clean_sentences
def extract_clean_sentences(self):
    """
    Extracts sentences from plain text. Also applies the following cleaning
    operations:
    - Exclude all characters not recognized by 'utf-8' encoding
    - Exclude all characters not contained in [a-zA-Z0-9 '-]
    - Exclude common stopwords
    """
    text = self.raw_text
    exclude = re.compile(r"[^a-zA-Z0-9 '-]")
    linebreaks = re.compile(r'\s')
    excess_space = re.compile(r'\s+')
    stemmer = PorterStemmer()
    sentences = sent_tokenize(text)
    out = []
    for sentence in sentences:
        sentence = linebreaks.sub(' ', sentence)
        sentence = exclude.sub(' ', sentence)
        sentence = excess_space.sub(' ', sentence)
        tokens = word_tokenize(sentence)
        tokens = [stemmer.stem(t.lower()) for t in tokens]
        out.append(tokens)
    return out
Example 11: preprocess_document
def preprocess_document(doc):
    stopset = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    tokens = wordpunct_tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    return final
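A usage sketch for preprocess_document, assuming the imports used above (stopwords, wordpunct_tokenize, PorterStemmer) are available; the sentence and the exact output are illustrative.

print(preprocess_document("The quick brown foxes are jumping over the lazy dogs."))
# e.g. ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']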
Example 12: preprocess
def preprocess(text):
    stemmer = PorterStemmer()
    stop = stopwords.words('english')
    tokens = [tok for tok in word_tokenize(text.lower())
              if tok not in stop]
    tokens_stemmed = [stemmer.stem(tok) for tok in tokens]
    return tokens_stemmed
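For comparison, a similar sketch for preprocess; note that word_tokenize keeps punctuation as separate tokens and this version does not filter them out, so the expected output (illustrative) still contains the '!'.

print(preprocess("Cats are running faster than dogs!"))
# e.g. ['cat', 'run', 'faster', 'dog', '!']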
Example 13: preprocessing
def preprocessing(text, debug=False):
    if debug:
        print text
    # lower case
    text = text.lower()
    if debug:
        print text
    # can't -> cannot, bya's -> bya is
    text = replacers.RegexpReplacer().replace(text)
    if debug:
        print text
    # word tokenize
    words = word_tokenize(text)
    if debug:
        print words
    # remove stopwords
    english_stops = set(stopwords.words('english'))
    english_stops_added = english_stops | {'.', ',', ':', ';'}
    words = [word for word in words if word not in english_stops_added]
    if debug:
        print words
    # stem words
    stemmer = PorterStemmer()
    words_stemmed = list(map(lambda word: stemmer.stem(word), words))
    if debug:
        print words_stemmed
    return words, words_stemmed
Example 14: buildVocab
def buildVocab(self):
    '''Build a vocabulary for the selected documents (from dir database).'''
    # Note: The source of text should be Lucene-processed field values. Lucene tokenized the text,
    # removed stop words, and may have taken other unknown steps.
    # Right now the vocabulary is built on the raw text with NLTK-based stopword removal and
    # tokenization. This should be improved.
    # collect contents from /database/ for each of these docs
    for pmid in self.pmidList:  # self.pmidList includes the query and the 99 most similar articles selected by BM25
        self.corpus.append(file(os.path.join(self.dbDir, pmid)).read())  # corpus contains raw text (MH, title*2, abstract)
    for text in self.corpus:
        sent_tokenize_list = sent_tokenize(text.strip().lower(), "english")  # tokenize an article text
        stemmed_text = []
        if sent_tokenize_list:  # if sent_tokenize_list is not empty
            porter_stemmer = PorterStemmer()
            for sent in sent_tokenize_list:
                words = TreebankWordTokenizer().tokenize(sent)  # tokenize the sentence
                words = [word.strip(string.punctuation) for word in words]
                words = [word for word in words if word not in stopwords.words("english")]
                words = [word for word in words if len(word) > 1]  # remove single letters and non-alphabetic tokens
                words = [word for word in words if re.search('[a-zA-Z]', word)]
                words = [porter_stemmer.stem(word) for word in words]  # apply the Porter stemmer
                stemmed_text.append(" ".join(words))
                self.vocab += words
        self.stemmed_corpus.append(". ".join(stemmed_text))  # append a stemmed article text
    # save stemmed corpus
    pickle.dump(self.stemmed_corpus, file(os.path.join(self.stemmed_corpusDir, str(self.pmidList[0])), "w"))
    # remove low-frequency tokens and redundant tokens
    tokenDist = Counter(self.vocab)
    lowFreqList = []
    for token, count in tokenDist.iteritems():
        if count < 2:
            lowFreqList.append(token)
    self.vocab = list(set(self.vocab) - set(lowFreqList))
    # save vocabulary
    pickle.dump(self.vocab, file(os.path.join(self.vocabDir, str(self.pmidList[0])), "w"))
Example 15: StemmedBagOfWordsFeatureGenerator
class StemmedBagOfWordsFeatureGenerator(EdgeFeatureGenerator):
    """
    Generates a stemmed bag-of-words representation for each sentence that contains
    an edge, using the function given in the argument.
    By default it uses the Porter stemmer.

    :type feature_set: nala.structures.data.FeatureDictionary
    :type stemmer: nltk.stem.PorterStemmer
    :type stop_words: list[str]
    :type training_mode: bool
    """

    def __init__(self, feature_set, stop_words=[], training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.stemmer = PorterStemmer()
        """an instance of the PorterStemmer"""
        self.stop_words = stop_words
        """a list of stop words"""

    def generate(self, dataset):
        for edge in dataset.edges():
            sentence = edge.part.sentences[edge.sentence_id]
            if self.training_mode:
                for token in sentence:
                    if self.stemmer.stem(token.word) not in self.stop_words and not token.features['is_punct']:
                        feature_name = '4_bow_stem_' + self.stemmer.stem(token.word) + '_[0]'
                        self.add_to_feature_set(edge, feature_name)