当前位置: 首页>>代码示例>>Python>>正文


Python PorterStemmer.stem方法代码示例

本文整理汇总了Python中nltk.PorterStemmer.stem方法的典型用法代码示例。如果您正苦于以下问题:Python PorterStemmer.stem方法的具体用法?Python PorterStemmer.stem怎么用?Python PorterStemmer.stem使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类nltk.PorterStemmer的用法示例。


在下文中一共展示了PorterStemmer.stem方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: _log_likelihood

# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 用法: PorterStemmer().stem(word)(stem 是 PorterStemmer 的实例方法,不能通过 import 单独导入)
    def _log_likelihood(answer_text, stemmed_vocabulary, distrib_matrix):
        LL = 0
        if answer_text is not '':
            tokens = word_tokenize(str(answer_text), language='english')
            porter_stemmer = PorterStemmer()
            unique_wordcount = len(stemmed_vocabulary)
            """
            per ogni w unica print_function words
                Cw = conta w in answer_text
                PwM = self.distrib_matrix[stemmer(w)]
                unique_wordcount = len(tokenize(answer_text)
            """
            for w in tokens:
                _w = w.strip().lower()
                Cw = 0
                for _ in answer_text.split():
                    if _w == _.strip().lower():
                        Cw += 1

                try:
                    w_stem = porter_stemmer.stem(_w.decode('utf-8', 'replace').encode('ascii', 'replace'))
                except AttributeError:
                    w_stem = porter_stemmer.stem(_w)
                try:
                    PwM = distrib_matrix[w_stem]
                except KeyError:  # key error means frequency is equal to cutoff point 1
                    PwM = 1
                LL += (Cw * log(float(PwM)))

            try:
                LL = "{0:.2f}".format(LL / float(unique_wordcount))
            except ZeroDivisionError:
                LL = 0 

        return LL
开发者ID:piercolella,项目名称:qa-scrapers,代码行数:37,代码来源:discretizer.py

示例2: stemming

# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 用法: PorterStemmer().stem(word)(stem 是 PorterStemmer 的实例方法,不能通过 import 单独导入)
def stemming(words_l, type="PorterStemmer", lang="english", encoding="utf8"):
    """Stem or lemmatize every word of ``words_l`` and encode the results.

    :param words_l: iterable of words
    :param type: one of ``"PorterStemmer"``, ``"SnowballStemmer"``,
        ``"LancasterStemmer"`` or ``"WordNetLemmatizer"``; ``False`` or any
        other value returns ``words_l`` untouched
    :param lang: language handed to ``SnowballStemmer``
    :param encoding: encoding applied to every processed word
    :return: ``words_l`` itself when ``type`` is unsupported, otherwise a new
        list holding the encoded result for each word
    """
    known_types = (
        "PorterStemmer", "SnowballStemmer",
        "LancasterStemmer", "WordNetLemmatizer",
    )
    if type is False or type not in known_types:
        return words_l

    # Build only the tool that was asked for, then apply it uniformly.
    if type == "PorterStemmer":
        transform = PorterStemmer().stem
    elif type == "SnowballStemmer":
        transform = SnowballStemmer(lang).stem
    elif type == "LancasterStemmer":
        transform = LancasterStemmer().stem
    elif type == "WordNetLemmatizer":  # TODO: context
        transform = WordNetLemmatizer().lemmatize

    return [transform(word).encode(encoding) for word in words_l]
开发者ID:LewkowskiArkadiusz,项目名称:magisterka,代码行数:27,代码来源:preprocessing.py

示例3: get_ngram_features

# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 用法: PorterStemmer().stem(word)(stem 是 PorterStemmer 的实例方法,不能通过 import 单独导入)
    def get_ngram_features(self):
        """Populate ``self.ngram_features`` with stemmed, position-tagged tokens.

        Every token of ``self.top_text``, ``self.bottom_text`` and
        ``self.all_text`` is Porter-stemmed, suffixed with ``__TOP__``,
        ``__BOTTOM__`` or ``__ALL__`` respectively, and mapped to ``True``.
        """
        porter = PorterStemmer()
        tagged_sources = (
            ("__TOP__", self.top_text),
            ("__BOTTOM__", self.bottom_text),
            ("__ALL__", self.all_text),
        )

        features = {}
        for tag, tokens in tagged_sources:
            for token in tokens:
                features[porter.stem(token) + tag] = True
        self.ngram_features = features
开发者ID:jayhack,项目名称:automeme,代码行数:10,代码来源:Meme.py

示例4: stem

# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 用法: PorterStemmer().stem(word)(stem 是 PorterStemmer 的实例方法,不能通过 import 单独导入)
def stem(input):
    """Return a copy of the data set with every example Porter-stemmed.

    :param input: mapping with the keys ``'training'``, ``'testing'``
        (iterables of strings), ``'training_labels'`` and ``'testing_labels'``
    :return: dict with the same four keys; the examples are split on
        whitespace, stemmed word by word and rejoined with single spaces,
        while the label entries are passed through unchanged
    """
    from nltk import PorterStemmer

    porter = PorterStemmer()

    def stem_examples(examples):
        # Stem each whitespace-separated word and glue the stems back together.
        return [
            ' '.join([porter.stem(word) for word in example.split()])
            for example in examples
        ]

    stemmed_training = stem_examples(input['training'])
    stemmed_testing = stem_examples(input['testing'])

    return {
        'training': stemmed_training,
        'training_labels': input['training_labels'],
        'testing': stemmed_testing,
        'testing_labels': input['testing_labels'],
    }
开发者ID:JT17,项目名称:445Project,代码行数:17,代码来源:featurizer.py

示例5: process_email

# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 用法: PorterStemmer().stem(word)(stem 是 PorterStemmer 的实例方法,不能通过 import 单独导入)
def process_email(filename):
  """Read an e-mail file and return its normalised, stemmed word list.

  The text is lower-cased; HTML tags become spaces; digit runs, URLs,
  e-mail addresses and dollar signs are replaced by the placeholder words
  ``number``, ``httpaddr``, ``emailaddr`` and ``dollar``; the result is split
  on every character that is not ``a-z``/``0-9`` and each remaining word is
  Porter-stemmed.

  :param filename: path of the e-mail file to read
  :return: list of stemmed words
  """
  # ``with`` closes the handle even when reading raises.
  with open(filename, 'r') as f:
    text = f.read()

  text = text.lower()

  # replaces html tags by space
  text = re.sub(r'<[^<>]+>', ' ', text)

  # replaces numbers by word number
  text = re.sub(r'[0-9]+', 'number', text)

  # replaces URLs by word httpaddr
  text = re.sub(r'(http|https)://[^\s]*', 'httpaddr', text)

  # replaces email addresses by word emailaddr: a run of non-space
  # characters on both sides of an '@'. (The previous pattern contained the
  # literal text "[email protected]" and could not match a real address.)
  text = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', text)

  # replaces dollar signs with word dollar
  text = re.sub(r'[$]+', 'dollar', text)

  # removes punctuation and non-words and separates words
  words = re.split('[^a-z0-9]| ', text)

  # drops the empty strings produced by adjacent separators
  words = [word for word in words if word != '']

  # reduces words to their stems
  stemmer = PorterStemmer()
  return [stemmer.stem(word) for word in words]
开发者ID:aidad,项目名称:MachineLearning,代码行数:36,代码来源:spam_classifier.py

示例6: Model

# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 用法: PorterStemmer().stem(word)(stem 是 PorterStemmer 的实例方法,不能通过 import 单独导入)
class Model(FileIO):
  """FileIO subclass holding a Porter stemmer plus scoring containers."""

  def __init__(self, *args, **kwargs):
    """Forward all arguments to ``FileIO`` and set up empty state."""
    FileIO.__init__(self, *args, **kwargs)
    self.data_list = []
    self.stemmer = PorterStemmer()
    # NOTE(review): the right-hand sides of the next two assignments were
    # missing in this copy of the code; empty dicts are assumed -- confirm
    # against the original project.
    self.score_map = {}
    self.ranges = {}

  def isInt(self, val):
    """Return True when ``int(val)`` succeeds, False on ``ValueError``."""
    try:
      int(val)
    except ValueError:
      return False
    return True

  def cleanString(self, word):
    """Return the stem of the lower-cased ``word``, or None if it is filtered.

    A word is filtered out when it is in ``stopwords``, is a single space, or
    parses as an integer. The stop-word test runs on the word as given, i.e.
    before it is lower-cased.
    """
    # Value comparison: ``is not " "`` would test object identity instead.
    if word in stopwords or word == " " or self.isInt(word):
      return None
    return self.stemmer.stem(word.lower())

  def makeScoreList(self):
    '''Initialize a new array of 0s for each range'''
    # NOTE(review): this copy of the method ends after the assignment below;
    # whatever followed in the original project is not visible here.
    s_list = [0] * len(self.ranges)
开发者ID:USStateDept,项目名称:dn-predict,代码行数:27,代码来源:dn_predict.py

示例7: main

# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 用法: PorterStemmer().stem(word)(stem 是 PorterStemmer 的实例方法,不能通过 import 单独导入)
def main():
    """Build one list of stems per line of ``sentiment.txt``.

    The first whitespace-separated field of each line is skipped. Every other
    field is decoded as UTF-8 and, unless it is one of ``. , : ? !`` or
    ``has_stop_list`` accepts it, stemmed and collected. A line that raises
    ``UnicodeDecodeError`` is dropped entirely.

    :return: list with one list of stems for every kept line
    """
    punctuation = [".", ",", ":", "?", "!"]
    collected = []

    with open("sentiment.txt", 'r') as source:
        porter = PorterStemmer()

        for line in source:
            stems = []

            # Skip the leading polarity label.
            for raw in line.split()[1:]:
                try:
                    token = raw.decode("utf-8")
                    if token not in punctuation \
                            and not has_stop_list(token):
                        stems.append(porter.stem(token))
                except UnicodeDecodeError:
                    # Garbled text: give up on this line.
                    break
            else:
                # Reached only when no field of the line failed to decode.
                collected.append(stems)

    return collected
开发者ID:N4CL,项目名称:NLP100,代码行数:28,代码来源:ch8_72.py

示例8: openAndProcessingFiles

# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 用法: PorterStemmer().stem(word)(stem 是 PorterStemmer 的实例方法,不能通过 import 单独导入)
def openAndProcessingFiles(path,resultDict):  # Main Function
    """Stem the words of every file in a directory and record them.

    For each file in ``os.getcwd() + path`` the content is whitespace
    normalised, stripped of ``<...>`` tags, split into words, passed through
    ``removeUnnecessaryCharacters`` (``None`` results are dropped), filtered
    against the stop-word set, Porter-stemmed and handed to
    ``storeToResultDict`` together with ``resultDict``.

    :param path: directory path appended verbatim to the current working
        directory
    :param resultDict: object passed as second argument to
        ``storeToResultDict`` for every stemmed word
    """
    directory = os.getcwd() + path
    filenames = os.listdir(directory)

    # The stop-word set and the stemmer do not depend on the file, so they are
    # built once instead of once per file.
    stop_words = set(stopwords.words('english'))
    # By analysing the previous result set, continually adding new stopwords
    stop_words.update(['texthtml', 'html', 'server', "email", 'date', 'gmt', 'www'])
    stemmer = PorterStemmer()

    for filename in filenames:
        # ``with`` closes the file even if reading raises.
        with open(directory + '/' + filename, 'r') as thisFile:
            # Collapse all whitespace runs so tag removal sees a single line.
            currentTextString = " ".join(thisFile.read().split())

        # remove HTML tags
        textWithoutTags = re.sub('<[^>]*>', '', currentTextString)

        cleanedWords = [removeUnnecessaryCharacters(word) for word in textWithoutTags.split()]
        cleanedWords = [word for word in cleanedWords if word is not None]

        # remove stopwords
        contentWords = [word for word in cleanedWords if word not in stop_words]

        for eachWord in contentWords:
            storeToResultDict(stemmer.stem(eachWord), resultDict)
开发者ID:abryu,项目名称:WebMining-Python,代码行数:31,代码来源:v1.py

示例9: review_to_words

# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 用法: PorterStemmer().stem(word)(stem 是 PorterStemmer 的实例方法,不能通过 import 单独导入)
    def review_to_words(raw_review, remove_stopwords = False):
        """Turn a raw review into a list of Porter stems.

        The review is passed through ``BeautifulSoup(...).get_text()``, digit
        runs are replaced by the word ``number``, every non-letter becomes a
        space, and the lower-cased text is split on whitespace.

        :param raw_review: review text handed to ``BeautifulSoup``
        :param remove_stopwords: when true, words found in
            ``stopwords.words('english')`` are dropped before stemming
        :return: list of stemmed words
        """
        plain = BeautifulSoup(raw_review).get_text()

        # Digits first, so they survive the letters-only pass as "number".
        plain = re.sub(r'[0-9]+', 'number', plain)
        plain = re.sub(r'[^a-zA-Z]', ' ', plain).lower()

        tokens = plain.split()

        if remove_stopwords:
            # A set gives constant-time membership tests.
            english_stops = set(stopwords.words('english'))
            tokens = [token for token in tokens if token not in english_stops]

        porter = PorterStemmer()
        return [porter.stem(token) for token in tokens]
开发者ID:aidad,项目名称:MachineLearning,代码行数:31,代码来源:ReviewParsing.py

示例10: stemm

# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 用法: PorterStemmer().stem(word)(stem 是 PorterStemmer 的实例方法,不能通过 import 单独导入)
    def stemm(cls, tokens):
        """Replace every element of ``tokens`` by its Porter stem, in place.

        :param tokens: mutable, index-assignable sequence of words
        :return: the same ``tokens`` object after modification
        """
        stem_of = PorterStemmer().stem

        for position, token in enumerate(tokens):
            tokens[position] = stem_of(token)

        return tokens
开发者ID:devjyotip,项目名称:twitter-analytics-dashboard,代码行数:9,代码来源:textprocessing.py

示例11: normalize

# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 用法: PorterStemmer().stem(word)(stem 是 PorterStemmer 的实例方法,不能通过 import 单独导入)
def normalize(word):
    '''
    Normalize a word for querying or indexing.

    :param word: unicode string
    :return: the Porter stem of ``word`` when its first character is
        alphabetic; an empty string when ``word`` is empty or starts with a
        non-alphabetic character
    '''
    # Guard first: indexing ``word[0]`` on an empty word would raise
    # IndexError, and no stemmer is needed for words that are rejected.
    if not word or not word[0].isalpha():
        return ''
    return PorterStemmer().stem(word)
开发者ID:genehwung,项目名称:gutenberg_indexer,代码行数:10,代码来源:utilities.py

示例12: Tokenizer

# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 用法: PorterStemmer().stem(word)(stem 是 PorterStemmer 的实例方法,不能通过 import 单独导入)
class Tokenizer(object):
    """Callable tokenizer that lower-cases tokens and drops punctuation."""

    def __init__(self):
        """Create the stemmer and the set of tokens treated as punctuation."""
        self.stem = PorterStemmer()
        dot_runs = ['·™','..','...','....','.....','......']
        typographic = ["``", "·", "–", "--", "”","—","•","—"]
        self.punct = set(string.punctuation).union(dot_runs, typographic)

    def _kept_tokens(self, doc):
        # Tokens of ``doc`` that are not listed as punctuation.
        return (token for token in word_tokenize(doc) if token not in self.punct)

    def __call__(self, doc):
        """Return the lower-cased non-punctuation tokens of ``doc``."""
        return [token.lower() for token in self._kept_tokens(doc)]

    def stem_toke(self, doc):
        """Return the Porter stems of the lower-cased non-punctuation tokens."""
        return [self.stem.stem(token.lower()) for token in self._kept_tokens(doc)]
开发者ID:JoshCason,项目名称:LING573,代码行数:11,代码来源:util.py

示例13: processContent

# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 用法: PorterStemmer().stem(word)(stem 是 PorterStemmer 的实例方法,不能通过 import 单独导入)
 def processContent(self, content):
     """Return a frequency distribution of the stemmed words of ``content``.

     Tokens are kept when they are shorter than 20 characters and
     alphanumeric; they are lower-cased and Porter-stemmed, stems that appear
     in the English stop-word list are dropped, and the remaining stems are
     converted with ``str`` and counted.

     :param content: text to tokenize
     :return: ``FreqDist`` built from the remaining stems
     """
     stemmer = PorterStemmer()
     # Load the stop words once; the previous lambda re-evaluated
     # ``stopwords.words('english')`` for every single token.
     stop_words = set(stopwords.words('english'))

     tokens = [token for token in word_tokenize(content)
               if len(token) < 20 and token.isalnum()]
     stems = [stemmer.stem(token.lower()) for token in tokens]
     # The stop-word test is applied to the stems, not to the raw tokens.
     kept = [str(stem) for stem in stems if stem not in stop_words]
     return FreqDist(kept)
开发者ID:danmerl,项目名称:jobbot,代码行数:11,代码来源:job_spider.py

示例14: main

# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 用法: PorterStemmer().stem(word)(stem 是 PorterStemmer 的实例方法,不能通过 import 单独导入)
def main():
    """Annotate every passage of a BioC document with its stemmed tokens.

    The input file is ``BIOC_IN`` unless a path is given as the first
    command-line argument. The collection read from it is handed to a
    ``BioCWriter`` targeting ``BIOC_OUT``; for each token of each passage an
    annotation carrying the token's Porter stem is added. The result is
    written to standard output and then to disk.
    """
    # The first command-line argument, when present, overrides the default.
    bioc_in = sys.argv[1] if len(sys.argv) >= 2 else BIOC_IN

    # Reader for the input XML, validated against the BioC DTD, and writer
    # for the annotated output.
    reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE)
    writer = BioCWriter(BIOC_OUT)

    porter = PorterStemmer()

    reader.read()

    # The writer works on the very collection the reader produced.
    writer.collection = reader.collection

    # Annotation ids run consecutively across all documents and passages.
    annotation_id = 0
    for document in writer.collection.documents:
        for passage in document:
            stemmed_tokens = [
                porter.stem(token)
                for token in wordpunct_tokenize(passage.text)
            ]

            # Continue numbering where the previous passage stopped; the loop
            # leaves ``annotation_id`` at the last id it handed out.
            first_id = annotation_id + 1
            for annotation_id, stemmed in enumerate(stemmed_tokens, first_id):
                # One annotation per token, holding the surface form of the
                # stemmed token; they are attached to the passage in order.
                annotation = BioCAnnotation()
                annotation.text = stemmed
                annotation.id = str(annotation_id)
                annotation.put_infon('surface form',
                                     'stemmed token')
                passage.add_annotation(annotation)

    # Print to screen without a trailing newline (can be redirected into a
    # file, e.g. output_bioc.xml).
    sys.stdout.write(str(writer))

    # Write to disk
    writer.write()
开发者ID:2mh,项目名称:PyBioC,代码行数:59,代码来源:stemmer.py

示例15: stemmingword

# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 用法: PorterStemmer().stem(word)(stem 是 PorterStemmer 的实例方法,不能通过 import 单独导入)
 def stemmingword(word_list, stemtype='porter'):
     """Stem every word of ``word_list`` and encode the stems as latin-1.

     :param word_list: iterable of words
     :param stemtype: ``'porter'`` selects ``PorterStemmer``; any other value
         selects ``LancasterStemmer``
     :return: list of encoded stems (characters that cannot be encoded are
         ignored), or ``None`` after printing a message when a
         ``UnicodeDecodeError`` occurs
     """
     if stemtype == 'porter':
         stemengine = PorterStemmer()
     else:
         stemengine = LancasterStemmer()
     try:
         filtered_words = [stemengine.stem(token).encode('latin-1', errors='ignore') for token in word_list]
     except UnicodeDecodeError:
         # Call-style print with a single argument behaves the same on
         # Python 2 and Python 3; the old ``except X, e`` form was
         # Python 2 only.
         print('Error en el tipo de caracteres descartando texto "{}"'.format(' '.join(word_list)))
         return None
     # NOTE(review): the visible code computed this list but never returned
     # it; returning it is assumed to be the intent -- confirm against the
     # original project.
     return filtered_words
开发者ID:ARGHZ,项目名称:ClassifTweets,代码行数:11,代码来源:execute_xperiment.py


注:本文中的nltk.PorterStemmer.stem方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。