当前位置: 首页>>代码示例>>Python>>正文


Python PorterStemmer.PorterStemmer类代码示例

本文整理汇总了Python中PorterStemmer.PorterStemmer的典型用法代码示例。如果您正苦于以下问题:Python PorterStemmer类的具体用法?Python PorterStemmer怎么用?Python PorterStemmer使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了PorterStemmer类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: process_word

def process_word(token):
    """Normalize a single token: lowercase it and, when stemming is
    enabled via ``constants.STEM``, reduce it to its Porter stem."""
    lowered = token.lower()
    if constants.STEM is not True:
        return lowered
    stemmer = PorterStemmer()
    return stemmer.stem(lowered, 0, len(lowered) - 1)
开发者ID:aimannajjar,项目名称:columbiau-set-movie-review-classifier,代码行数:7,代码来源:common.py

示例2: __init__

    def __init__(self, parent, docno, doc, terms):
        """Build a document-viewer dialog rendering *doc* with query terms
        highlighted.

        *terms* is a comma-separated string; a word is highlighted (bold
        red) when either its lowercased, punctuation-stripped form or its
        Porter stem matches one of the terms / term stems.
        """
        QtGui.QDialog.__init__(self, parent)

        self.setupUi(self)

        # Set fields
        self.labelDocumentNo.setText(docno)

        textDocument = self.textEdit.document()
        textCursor = QtGui.QTextCursor(textDocument)

        # Two character formats: plain text, and bold red for matched terms.
        normalFormat = QtGui.QTextCharFormat()
        termFormat = QtGui.QTextCharFormat()
        termFormat.setForeground(QtGui.QBrush(QtGui.QColor("red")))
        termFormat.setFontWeight(QtGui.QFont.Bold)

        # NOTE(review): beginEditBlock() is never paired with a matching
        # endEditBlock() in this method — confirm whether that is intended.
        textCursor.beginEditBlock()

        stemmer = PorterStemmer()
        terms = terms.split(",")
        stemmed_terms = [stemmer.stem(term, 0, len(term)-1) for term in terms]

        # Re-insert the document word by word, picking a format per word.
        for line in unicode(doc).split("\n"):
            for word in line.split(" "):
                # Compare both the normalized word and its stem to the terms.
                nword = word.lower().strip(punctuation)
                sword = stemmer.stem(nword, 0, len(nword)-1)
                if nword in terms or sword in stemmed_terms:
                    textCursor.insertText(word, termFormat)
                else:
                    textCursor.insertText(word, normalFormat)
                textCursor.insertText(" ", normalFormat)

            textCursor.insertText("\n", normalFormat)

        # Scroll back to the top of the rendered document.
        self.textEdit.moveCursor(QtGui.QTextCursor.Start)
开发者ID:ozancaglayan,项目名称:SearchEngine,代码行数:35,代码来源:documentwindow.py

示例3: stemWords

def stemWords(inList):
    """Stem every token with the Porter algorithm.

    Name: stemWords; input: list (of tokens); output: list (of stemmed tokens)
    """
    stemmer = PorterStemmer()
    return [stemmer.stem(token, 0, len(token) - 1) for token in inList]
开发者ID:yiwuxie15,项目名称:A1_Language_Identifier,代码行数:8,代码来源:preprocess.py

示例4: load_dictionary

def load_dictionary(filename, stem=True):
    """Loads line separated dictionary into a list.

    Each line of ``dictionaries/<filename>`` becomes one lowercased entry;
    when *stem* is True the entry is reduced to its Porter stem.

    Fixes vs. the original:
    - the trailing newline is stripped before stemming (the stemmer was
      previously fed "word\\n", which defeats suffix matching and leaked
      the newline into the returned entries)
    - the PorterStemmer is built once instead of once per word
    - the file handle is closed deterministically via ``with``
    """
    out = []
    stemmer = PorterStemmer() if stem is True else None
    with open("dictionaries/%s" % filename, "r") as f:
        for line in f:
            # strip() also removes any stray surrounding whitespace/\r
            word = line.strip().lower()
            if stemmer is not None:
                word = stemmer.stem(word, 0, len(word) - 1)
            out.append(word)
    return out
开发者ID:aimannajjar,项目名称:columbiau-set-movie-review-classifier,代码行数:10,代码来源:common.py

示例5: format_description

def format_description(text, stop_words):
    """Lowercase, strip punctuation from, and stem every non-stop word of
    *text*; return the surviving tokens joined by single spaces."""
    stemmer = PorterStemmer()
    kept = [
        stemmer.stem(remove_punctuation(token).lower())
        for token in text.split()
        if token not in stop_words
    ]
    return ' '.join(kept)
开发者ID:casawa,项目名称:multiclass-multilabel-course-labeling,代码行数:12,代码来源:format_data.py

示例6: stem_text

 def stem_text(text):
     """Porter-stem each alphabetic run of *text*, lowercasing as it goes.

     Spaces between words are kept; every other non-alphabetic character
     is dropped from the output.
     """
     stemmer = PorterStemmer()
     pieces = []
     current = ''
     last = len(text) - 1
     for index, ch in enumerate(text):
         if ch.isalpha():
             current += ch.lower()
         # flush the accumulated word at a separator or at end of input
         if not ch.isalpha() or index == last:
             if current:
                 pieces.append(stemmer.stem(current, 0, len(current) - 1))
                 current = ''
             if ch.lower() == ' ':
                 pieces.append(ch.lower())
     return ''.join(pieces)
开发者ID:pawelrychly,项目名称:ezi,代码行数:14,代码来源:stemmer_helper.py

示例7: remove_porterstemmer

def remove_porterstemmer(input_file, noise_words_set):
    """Tokenize and Porter-stem each line of *input_file*.

    Every line is lowercased and split on the pattern "\\W*\\d*"; each
    non-empty chunk is stemmed, and stems longer than two characters that
    are not in *noise_words_set* are kept.

    Returns (word_weight, questions): a per-line Counter of kept stems,
    and the per-line lists of kept stems themselves.
    """
    stemmer = PorterStemmer()
    questions = []
    word_weight = []
    for line in input_file:
        tokens = filter(None, re.split("\W*\d*", line.lower()))
        kept = []
        for token in tokens:
            stem = stemmer.stem(token, 0, len(token) - 1)
            if stem not in noise_words_set and len(stem) > 2:
                kept.append(stem)
        questions.append(kept)
        word_weight.append(Counter(kept))
    return word_weight, questions
开发者ID:jializhou,项目名称:machine-learning-projects,代码行数:15,代码来源:preprocess.py

示例8: getStemWords

def getStemWords(query_line, stopwords):
    """Split a query line into (query_id, raw_terms, stemmed_terms).

    Punctuation is removed, hyphens become spaces, and any term found in
    *stopwords* is dropped.  Index 0 of the split line is the query id,
    so stemming starts at index 1.
    """
    raw_data = query_line.replace(".", "").replace(",", "").replace('"', "").replace("\n", "").replace("-", " ") \
        .replace("(", "").replace(")", "").split(" ")

    for stopword in stopwords:
        while stopword in raw_data:
            raw_data.remove(stopword)

    # BUG FIX: the original did `stemmedArray = raw_data`, which aliases the
    # same list object — stemming then mutated raw_data too, so callers never
    # actually received the unstemmed terms.  Work on a copy instead.
    stemmedArray = list(raw_data)
    p = PorterStemmer()

    # stem() is applied repeatedly until the term reaches a fixed point
    for i in range(1, len(stemmedArray)):
        while stemmedArray[i] != p.stem(stemmedArray[i], 0, len(stemmedArray[i]) - 1):
            stemmedArray[i] = p.stem(stemmedArray[i], 0, len(stemmedArray[i]) - 1)

    return raw_data[0], raw_data[1:], stemmedArray[1:]
开发者ID:pankajtripathi,项目名称:Information-Retrieval,代码行数:16,代码来源:query_processing.py

示例9: __init__

class Parser:
    """Tokenizer that lowercases input, strips simple punctuation, removes
    stop words, and Porter-stems whitespace-separated words."""

    # A processor for removing the commoner morphological and inflexional
    # endings from words in English
    stemmer = None

    stopwords = []

    def __init__(self,):
        self.stemmer = PorterStemmer()

        # English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
        # self.stopwords = open('data/english.stop', 'r').read().split()

    def clean(self, string):
        """ remove any nasty grammar tokens from string """
        # NOTE(review): the second replace targets the literal two-character
        # sequence "\s+" — it is NOT a regex, so whitespace runs are kept.
        return string.replace(".", "").replace("\s+", " ").lower()

    def removeStopWords(self, list):
        """ Remove common words which have no search value """
        return [word for word in list if word not in self.stopwords]

    def tokenise(self, string):
        """ break string up into tokens and stem words """
        cleaned = self.clean(string)
        return [self.stemmer.stem(word, 0, len(word) - 1)
                for word in cleaned.split(" ")]

示例10: dict_qryid_terms

def dict_qryid_terms(is_stopping):
    """Build a {query_id: [stemmed terms]} dict from QUERY_TEXT_FILE.

    The first word-token of each line is the query id; the rest are
    lowercased and Porter-stemmed.  When *is_stopping* is true, terms
    appearing in the stopword list are skipped.
    """
    global STOPWORDS_FILE
    stoplist = stopwords(STOPWORDS_FILE)  # build the stopword list once
    stemmer = PorterStemmer()             # one stemmer instance for all terms
    dictquery = defaultdict(lambda: [])   # query id -> list of stemmed terms
    with open(QUERY_TEXT_FILE, 'r') as f:
        for line in f:
            tokens = re.findall(r"[\w]+", line)
            query_id = tokens[0]
            for raw in tokens[1:]:
                term = raw.lower()
                if is_stopping and term in stoplist:
                    continue
                dictquery[query_id].append(stemmer.stem(term, 0, len(term) - 1))
    return dictquery
开发者ID:priya-sudarshanam,项目名称:Information-Retrieval---CS6200,代码行数:17,代码来源:project-3.py

示例11: stem_string

def stem_string(line):
    """Porter-stem each alphabetic run in *line*.

    Alphabetic characters are lowercased and accumulated into words that
    get stemmed; every non-alphabetic character is passed through
    lowercased.  Returns "" for the empty string.
    """
    if line == "":
        return ""
    stemmer = PorterStemmer()
    pieces = []
    current = ""
    for ch in line:
        if ch.isalpha():
            current += ch.lower()
        else:
            # separator reached: flush the pending word, then the separator
            if current:
                pieces.append(stemmer.stem(current, 0, len(current) - 1))
                current = ''
            pieces.append(ch.lower())
    # flush a word that runs to the end of the line
    if current:
        pieces.append(stemmer.stem(current, 0, len(current) - 1))
    return ''.join(pieces)
开发者ID:CoderChang,项目名称:nlp_project,代码行数:17,代码来源:util.py

示例12: __init__

 def __init__(self):
     """Initialize empty corpus containers and text pre-processing tools."""
     # For holding the data - initialized in read_data()
     self.titles = []
     self.docs = []
     self.vocab = []
     # For the text pre-processing.
     self.alphanum = re.compile('[^a-zA-Z0-9]')  # matches non-alphanumerics
     self.p = PorterStemmer()
开发者ID:lyl209,项目名称:nlp,代码行数:8,代码来源:IRSystem.py

示例13: getQuestionKeywords

def getQuestionKeywords(question):
    """Return the keywords from a question.

    The logic is: remove the stop words and punctuations from question, stem the keywords and remove duplicates
    Currently there are still issues with
    1. stop words list is not complete: eg "recommend" etc is not a stop word.
    2. stemmer issue: The current stemmer utility has an issue eg "restaurant" is stemmed to "restau"

    >>> getQuestionKeywords('what is the best preschool in Potomac?')
    ['potomac', 'preschool']

    >>> getQuestionKeywords('Can someone help with a preschool around potomac?')
    ['potomac', 'preschool']

    >>> getQuestionKeywords('What is the best cafeteria around potomac?')
    ['potomac', 'restaurant']

    """
    stemmer = PorterStemmer()
    keywords = set()
    for token in question.split():
        # strip punctuation and lowercase, then drop stop words
        word = token.strip(PUNCTUATION).lower()
        if word in stopWords:
            continue
        # stem, then map through the synonym table; the set removes duplicates
        stem = stemmer.stem(word, 0, len(word) - 1)
        keywords.add(synonyms[stem] if stem in synonyms else stem)
    # return the unique keywords in sorted order
    return sorted(keywords)
开发者ID:zmao,项目名称:jmao_python,代码行数:45,代码来源:keywordUtil.py

示例14: parse

 def parse(self):
     """Filter self.dataList in place: drop stop words, Porter-stem the
     remaining words, then reorder them by descending frequency of
     occurrence (most frequent first).  Python 2 code (uses iteritems)."""
     #remove stop words
     self.dataList = [w for w in self.dataList if not w in self.stopWords]
     #get the stem of the words
     st = PorterStemmer()
     self.dataList = [st.stem(w, 0, len(w)-1) for w in self.dataList]
     # add to list based on frequency of occurrence
     wordFreq = {}
     for word in self.dataList:
         if word in wordFreq:
             wordFreq[word] = wordFreq[word] + 1
         else:
             # NOTE(review): a first occurrence is counted as 0, so every
             # count is one too low; the relative ordering below (and hence
             # the final result) is unaffected.
             wordFreq[word] = 0
     # sort ascending by count, then insert at the front so the final list
     # ends up most-frequent-first
     wordList = sorted(wordFreq.iteritems(), key = operator.itemgetter(1))
     newList = []
     for w in wordList:
         newList.insert(0,w[0])
     self.dataList = newList
开发者ID:ben444422,项目名称:Recommendit,代码行数:18,代码来源:Parser.py

示例15: __init__

    def __init__(self, path, num_records):
        """Create a preprocessor: build the stopword set from
        stop.words.dat and, when given a non-empty path and a non-zero
        record count, immediately run process()."""
        self.porter = PorterStemmer()
        with open("stop.words.dat", "r") as sw:
            # one stopword per line; line[:-1] drops the trailing newline
            self.stop = set(line[:-1] for line in sw)

        if path != "" and num_records != 0:
            self.process(path, num_records)
开发者ID:iamkeyur,项目名称:task-identification,代码行数:9,代码来源:preprocess.py


注:本文中的PorterStemmer.PorterStemmer类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。