当前位置: 首页>>代码示例>>Python>>正文


Python nltk.PorterStemmer类代码示例

本文整理汇总了Python中nltk.PorterStemmer的典型用法代码示例。如果您正苦于以下问题:Python PorterStemmer类的具体用法?Python PorterStemmer怎么用?Python PorterStemmer使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了PorterStemmer类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: process_email

def process_email(filename):
  """Read an email file and return its normalized, stemmed tokens.

  The text is lowercased, then HTML tags, digit runs, URLs, email
  addresses and dollar signs are replaced by a space or by the placeholder
  words "number", "httpaddr", "emailaddr" and "dollar". The result is split
  on every character that is not a lowercase letter or digit, and each
  non-empty piece is reduced to its Porter stem.

  Arguments:
    filename: path of the text file to read.

  Return:
    list of stemmed tokens, in order of appearance.
  """
  # The context manager closes the file even when reading raises.
  with open(filename, 'r') as f:
    text = f.read()

  text = text.lower()

  # Replace HTML tags with a space.
  text = re.sub(r'<[^<>]+>', ' ', text)

  # Replace every run of digits with the word "number".
  text = re.sub(r'[0-9]+', 'number', text)

  # Replace URLs with the word "httpaddr".
  text = re.sub(r'(http|https)://[^\s]*', 'httpaddr', text)

  # Replace email addresses (non-space run, "@", non-space run) with the
  # word "emailaddr".
  text = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', text)

  # Replace runs of dollar signs with the word "dollar".
  text = re.sub(r'[$]+', 'dollar', text)

  # Split on punctuation and other non-word characters, drop the empty
  # pieces produced by adjacent separators, and stem what remains.
  stemmer = PorterStemmer()
  return [stemmer.stem(word)
          for word in re.split('[^a-z0-9]| ', text)
          if word != '']
开发者ID:aidad,项目名称:MachineLearning,代码行数:34,代码来源:spam_classifier.py

示例2: stemming

def stemming(line_list):
    """
    Lowercase, de-punctuate, tokenize and stem every line.

    Input: line_list (list of strings (sentences/documents)) - e.g. dataset.data

    Return: stemmed_list (list of strings) - one entry per input line, in
    input order, holding that line's stemmed terms joined by single spaces.
    """
    stemmer = PorterStemmer()
    stemmed_list = []
    for line in line_list:
        # Replace each run of non-alphanumeric characters with one space, so
        # "amazon.com" becomes "amazon com" instead of "amazoncom".
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line.lower())
        # `stem` is the stemmer's public entry point; the older `stem_word`
        # helper is not available in current NLTK releases.
        stemmed_terms = [stemmer.stem(term) for term in wt(nopunct_line)]
        # Back to a sentence as a single string.
        stemmed_list.append(' '.join(stemmed_terms))
    return stemmed_list
开发者ID:YuanhaoSun,项目名称:PPLearn,代码行数:30,代码来源:ml_feature_engineering.py

示例3: make_tags

def make_tags(title_string):
    """Return the stemmed, lowercased words of a title that are not stop words.

    The title is split on whitespace. Words found in the module-level
    `stop_words` collection are skipped; every other word is lowercased and
    reduced to its Porter stem.
    """
    stemmer = PorterStemmer()
    # NOTE(review): the stop-word test uses the word's original casing, so a
    # capitalised stop word passes through if `stop_words` holds only
    # lowercase entries - confirm against where `stop_words` is defined.
    # `stem` is used instead of `stem_word`, which current NLTK releases do
    # not provide.
    return [stemmer.stem(word.lower())
            for word in title_string.split()
            if word not in stop_words]
开发者ID:abhijat,项目名称:RedditSearch,代码行数:7,代码来源:tagger.py

示例4: _log_likelihood

    def _log_likelihood(answer_text, stemmed_vocabulary, distrib_matrix):
        LL = 0
        if answer_text is not '':
            tokens = word_tokenize(str(answer_text), language='english')
            porter_stemmer = PorterStemmer()
            unique_wordcount = len(stemmed_vocabulary)
            """
            per ogni w unica print_function words
                Cw = conta w in answer_text
                PwM = self.distrib_matrix[stemmer(w)]
                unique_wordcount = len(tokenize(answer_text)
            """
            for w in tokens:
                _w = w.strip().lower()
                Cw = 0
                for _ in answer_text.split():
                    if _w == _.strip().lower():
                        Cw += 1

                try:
                    w_stem = porter_stemmer.stem(_w.decode('utf-8', 'replace').encode('ascii', 'replace'))
                except AttributeError:
                    w_stem = porter_stemmer.stem(_w)
                try:
                    PwM = distrib_matrix[w_stem]
                except KeyError:  # key error means frequency is equal to cutoff point 1
                    PwM = 1
                LL += (Cw * log(float(PwM)))

            try:
                LL = "{0:.2f}".format(LL / float(unique_wordcount))
            except ZeroDivisionError:
                LL = 0 

        return LL
开发者ID:piercolella,项目名称:qa-scrapers,代码行数:35,代码来源:discretizer.py

示例5: openAndProcessingFiles

def openAndProcessingFiles(path,resultDict):  # Main Function
    """Stem the words of every file in a directory and record them.

    `path` is appended to the current working directory to locate the
    directory. For each file in it: whitespace is collapsed, HTML tags are
    removed, every word is passed through `removeUnnecessaryCharacters`
    (results of None are dropped), stop words are filtered out, and each
    remaining word is stemmed and handed to `storeToResultDict` together
    with `resultDict`.
    """
    directory = os.getcwd() + path

    # The stop-word set and the stemmer do not depend on the file being
    # processed, so they are built once instead of once per file.
    stop_words = set(stopwords.words('english'))
    # Extra stop words added after analysing earlier result sets.
    stop_words.update(['texthtml', 'html', 'server', "email", 'date', 'gmt', 'www'])
    stemmer = PorterStemmer()

    for filename in os.listdir(directory):
        # The context manager closes the file even if processing fails.
        with open(directory + '/' + filename, 'r') as thisFile:
            # Collapse all whitespace so tags spanning lines can be matched.
            currentTextString = " ".join(thisFile.read().split())

        # Remove HTML tags.
        textWithoutTags = re.sub('<[^>]*>', '', currentTextString)

        cleanedWords = [removeUnnecessaryCharacters(word)
                        for word in textWithoutTags.split()]

        keptWords = [word for word in cleanedWords
                     if word is not None and word not in stop_words]

        for eachWord in keptWords:
            storeToResultDict(stemmer.stem(eachWord), resultDict)
开发者ID:abryu,项目名称:WebMining-Python,代码行数:29,代码来源:v1.py

示例6: review_to_words

    def review_to_words(raw_review, remove_stopwords = False):
        """Turn a raw HTML review into a list of stemmed, lowercase words.

        Markup is stripped with BeautifulSoup, digit runs become the word
        "number", every non-letter becomes a space, and the text is
        lowercased and split on whitespace. When `remove_stopwords` is true
        the English stop words are dropped before stemming.
        """
        # Strip HTML tags and markup, keeping only the text content.
        plain = BeautifulSoup(raw_review).get_text()

        # Replace each run of digits with the word "number".
        plain = re.sub(r'[0-9]+', 'number', plain)

        # Blank out punctuation (it could be analysed for better results),
        # then lowercase and split into words.
        tokens = re.sub(r'[^a-zA-Z]', ' ', plain).lower().split()

        # The stop-word data must already be available locally
        # (it can be fetched with nltk.download()).
        if remove_stopwords:
            # Membership tests on a set are faster than on a list.
            english_stops = set(stopwords.words('english'))
            tokens = [tok for tok in tokens if tok not in english_stops]

        # Reduce every remaining word to its stem.
        porter = PorterStemmer()
        return list(map(porter.stem, tokens))
开发者ID:aidad,项目名称:MachineLearning,代码行数:29,代码来源:ReviewParsing.py

示例7: main

def main():
    """Build one list of stemmed words per line of "sentiment.txt".

    The first whitespace-separated field of each line (the polarity label)
    is skipped. Punctuation tokens and tokens for which `has_stop_list`
    is true are left out. A line that raises UnicodeDecodeError is
    discarded entirely.
    """
    punctuation = [".", ",", ":", "?", "!"]

    with open("sentiment.txt", 'r') as _file:
        stemmer = PorterStemmer()
        features = []

        for line in _file:
            feature = []

            # Skip the polarity label at the start of the line.
            for token in line.split()[1:]:
                try:
                    # NOTE(review): `decode` exists on Python 2 byte strings
                    # only; Python 3 `str` has no such method.
                    token = token.decode("utf-8")
                    if token in punctuation or has_stop_list(token):
                        continue
                    feature.append(stemmer.stem(token))
                except UnicodeDecodeError:
                    # Garbled text: give up on the whole line.
                    break
            else:
                # Reached only when no token aborted the line.
                features.append(feature)

    return features
开发者ID:N4CL,项目名称:NLP100,代码行数:26,代码来源:ch8_72.py

示例8: stemm

    def stemm(cls, tokens):
        """Replace every token by its Porter stem, in place.

        The same `tokens` object is returned after its items have been
        overwritten one by one.
        """
        stem = PorterStemmer().stem

        for position, token in enumerate(tokens):
            tokens[position] = stem(token)

        return tokens
开发者ID:devjyotip,项目名称:twitter-analytics-dashboard,代码行数:7,代码来源:textprocessing.py

示例9: normalize

def normalize(word):
    '''
    Normalize the word for query or indexing.

    :param word: unicode string
    :return: unicode string of the normalized term; the empty string when
             the word is empty or does not start with an alphabetic character
    '''
    # Slicing instead of indexing keeps an empty word from raising IndexError.
    if not word[:1].isalpha():
        return ''
    # The stemmer is only built for words that are actually stemmed.
    return PorterStemmer().stem(word)
开发者ID:genehwung,项目名称:gutenberg_indexer,代码行数:8,代码来源:utilities.py

示例10: get_ngram_features

    def get_ngram_features(self):
        """Populate `self.ngram_features` with position-tagged stems.

        Each token of `self.top_text`, `self.bottom_text` and
        `self.all_text` is stemmed and suffixed with "__TOP__",
        "__BOTTOM__" or "__ALL__" respectively. The resulting strings are
        the keys of a dict whose values are all True.
        """
        porter = PorterStemmer()

        tagged_sources = (
            (self.top_text, "__TOP__"),
            (self.bottom_text, "__BOTTOM__"),
            (self.all_text, "__ALL__"),
        )

        features = {}
        for tokens, suffix in tagged_sources:
            for token in tokens:
                features[porter.stem(token) + suffix] = True

        self.ngram_features = features
开发者ID:jayhack,项目名称:automeme,代码行数:8,代码来源:Meme.py

示例11: __process_email

  def __process_email(self, email_contents, vocab):
    '''
    Preprocess the body of an email and return a list of word indices.

    Arguments:
      email_contents (str): Email body.
      vocab (dict): Words dictionary mapping a stemmed word to its index.

    Return:
      (list): vocab[word] for every stemmed word of the email that is
      present in vocab, in order of appearance.
    '''
    # Lower case.
    email_contents = email_contents.lower()

    # Strip all HTML: anything that starts with <, ends with > and has no
    # < or > in between is replaced with a space.
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)

    # Handle numbers: one or more characters between 0-9.
    email_contents = re.sub(r'[0-9]+', 'number', email_contents)

    # Handle URLs: strings starting with http:// or https://.
    email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Handle email addresses: strings with @ in the middle.
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign.
    email_contents = re.sub(r'[$]+', 'dollar', email_contents)

    # Tokenize on whitespace and punctuation. A character class is used
    # because a pattern with empty alternatives splits between every single
    # character on Python 3.7+. Any whitespace (not only the space
    # character) separates words, so words on adjacent lines stay apart.
    word_list = re.split(r'''[\s@$/#.\-:&*+=\[\]?!(){},'">_<;%]''',
                         email_contents)

    # Remove any remaining non-alphanumeric characters.
    word_list = [re.sub(r'[^a-zA-Z0-9]', '', s) for s in word_list]

    # Drop empty strings and words that are too short.
    word_list = [s for s in word_list if len(s) > 1]

    # Stem the words. `stem` is used because current NLTK releases do not
    # provide `stem_word`.
    ps = PorterStemmer()
    word_list = [ps.stem(s) for s in word_list]

    # Keep the vocabulary index of every word that is in the vocabulary.
    return [vocab[w] for w in word_list if w in vocab]
开发者ID:farjan,项目名称:MachineLearning,代码行数:58,代码来源:ex6.py

示例12: processContent

 def processContent(self, content):
     """Return a FreqDist of the stemmed, non-stop-word tokens of `content`.

     Tokens of 20 or more characters and tokens that are not purely
     alphanumeric are dropped. The rest are lowercased and stemmed, and
     stems that are English stop words are removed.
     """
     stemmer = PorterStemmer()
     # Load the stop-word list once; calling stopwords.words() inside the
     # filter reloaded it for every single token.
     english_stopwords = set(stopwords.words('english'))
     stems = [stemmer.stem(token.lower())
              for token in word_tokenize(content)
              if len(token) < 20 and token.isalnum()]
     tokens = [str(stem) for stem in stems if stem not in english_stopwords]
     return FreqDist(tokens)
开发者ID:danmerl,项目名称:jobbot,代码行数:9,代码来源:job_spider.py

示例13: getStemmedWords

 def getStemmedWords(self,html):
     """Return the Porter stems of the items of `html`, joined by spaces.

     `html` is iterated item by item and every item is passed to the stemmer.
     NOTE(review): presumably `html` is a list of tokens; iterating a plain
     string would stem it character by character - verify against the caller.
     """
     # `stem` is used because current NLTK releases do not provide
     # `stem_word`.
     stemmer = PorterStemmer()
     return ' '.join(stemmer.stem(token) for token in html)
开发者ID:usc-isi-i2,项目名称:dig-classifier,代码行数:9,代码来源:preprocessor.py

示例14: main

def main():
    """Annotate every passage of a BioC document with its stemmed tokens.

    The input path is the first command-line argument when one is given,
    otherwise BIOC_IN. Each token of each passage is stemmed, and one
    annotation per stem is added to the passage, with ids counting up from
    1 across the whole collection. The result is written to standard
    output and through the writer created for BIOC_OUT.
    """
    # Use the file named by BIOC_IN unless another one is given.
    bioc_in = sys.argv[1] if len(sys.argv) >= 2 else BIOC_IN

    # The reader holds the BioC XML document and is given DTD_FILE as its
    # dtd_valid_file; the writer emits the annotated data.
    reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE)
    writer = BioCWriter(BIOC_OUT)

    # The NLTK Porter stemmer does the stemming.
    stemmer = PorterStemmer()

    # Load the input, then hand the collection over to the writer.
    reader.read()
    writer.collection = reader.collection

    # Running annotation id, shared by all documents and passages.
    last_id = 0
    for document in writer.collection.documents:
        for passage in document:
            # Stem all tokens of the passage first, keeping their order.
            stems = [stemmer.stem(token)
                     for token in wordpunct_tokenize(passage.text)]

            # One annotation per stem, carrying the surface form of a
            # 'stemmed token'; ids continue from the previous passage.
            for last_id, stem in enumerate(stems, last_id + 1):
                annotation = BioCAnnotation()
                annotation.text = stem
                annotation.id = str(last_id)
                annotation.put_infon('surface form', 'stemmed token')
                passage.add_annotation(annotation)

    # Print to screen without a trailing newline
    # (can be redirected into a file, e.g. output_bioc.xml).
    sys.stdout.write(str(writer))

    # Write to disk.
    writer.write()
开发者ID:2mh,项目名称:PyBioC,代码行数:57,代码来源:stemmer.py

示例15: stemmingword

 def stemmingword(word_list, stemtype='porter'):
     """Stem every token of `word_list` and encode the stems as latin-1.

     Arguments:
       word_list: iterable of string tokens.
       stemtype: 'porter' selects PorterStemmer; any other value selects
         LancasterStemmer.

     Return:
       list of stems encoded as latin-1 (characters that cannot be encoded
       are dropped). When a UnicodeDecodeError occurs, a message is printed
       and an empty list is returned, i.e. the text is discarded.
     """
     if stemtype == 'porter':
         stemengine = PorterStemmer()
     else:
         stemengine = LancasterStemmer()
     try:
         # The stems are returned to the caller; previously the list was
         # built and then thrown away.
         return [stemengine.stem(token).encode('latin-1', errors='ignore') for token in word_list]
     except UnicodeDecodeError:
         # Call-style print with a single argument is valid on both
         # Python 2 and Python 3.
         print('Error en el tipo de caracteres descartando texto "{}"'.format(' '.join(word_list)))
         return []
开发者ID:ARGHZ,项目名称:ClassifTweets,代码行数:9,代码来源:execute_xperiment.py


注:本文中的nltk.PorterStemmer类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。