本文整理汇总了Python中PorterStemmer.PorterStemmer.stem方法的典型用法代码示例。如果您正苦于以下问题:Python PorterStemmer.stem方法的具体用法?Python PorterStemmer.stem怎么用?Python PorterStemmer.stem使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类PorterStemmer.PorterStemmer
的用法示例。
在下文中一共展示了PorterStemmer.stem方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from PorterStemmer import PorterStemmer [as 别名]
# 或者: from PorterStemmer.PorterStemmer import stem [as 别名]
def __init__(self, parent, docno, doc, terms):
    """Build the dialog and render *doc*, highlighting query terms.

    A word is shown bold/red when either its normalised form
    (lower-cased, punctuation-stripped) or its Porter stem matches one
    of the comma-separated *terms*; everything else uses the default
    format.
    """
    QtGui.QDialog.__init__(self, parent)
    self.setupUi(self)
    # Set fields
    self.labelDocumentNo.setText(docno)
    cursor = QtGui.QTextCursor(self.textEdit.document())

    plain_fmt = QtGui.QTextCharFormat()
    hit_fmt = QtGui.QTextCharFormat()
    hit_fmt.setForeground(QtGui.QBrush(QtGui.QColor("red")))
    hit_fmt.setFontWeight(QtGui.QFont.Bold)

    cursor.beginEditBlock()
    stemmer = PorterStemmer()
    term_list = terms.split(",")
    term_stems = [stemmer.stem(t, 0, len(t) - 1) for t in term_list]
    for line in unicode(doc).split("\n"):
        for word in line.split(" "):
            normalised = word.lower().strip(punctuation)
            stem = stemmer.stem(normalised, 0, len(normalised) - 1)
            if normalised in term_list or stem in term_stems:
                cursor.insertText(word, hit_fmt)
            else:
                cursor.insertText(word, plain_fmt)
            cursor.insertText(" ", plain_fmt)
        cursor.insertText("\n", plain_fmt)
    self.textEdit.moveCursor(QtGui.QTextCursor.Start)
示例2: getStemWords
# 需要导入模块: from PorterStemmer import PorterStemmer [as 别名]
# 或者: from PorterStemmer.PorterStemmer import stem [as 别名]
def getStemWords(query_line, stopwords):
    """Split a query line into (query_id, raw_terms, stemmed_terms).

    Punctuation is stripped, the words listed in *stopwords* are removed,
    and every remaining term (except the leading query id at index 0) is
    Porter-stemmed to a fixed point.
    """
    raw_data = query_line.replace(".", "").replace(",", "").replace('"', "").replace("\n", "").replace("-", " ") \
        .replace("(", "").replace(")", "").split(" ")
    for i in stopwords:
        while i in raw_data:
            raw_data.remove(i)
    # BUG FIX: the original did `stemmedArray = raw_data`, which aliases the
    # same list object — stemming then mutated raw_data too, so the "raw"
    # terms returned were in fact already stemmed.  Copy the list instead.
    stemmedArray = raw_data[:]
    p = PorterStemmer()
    for i in range(1, len(stemmedArray)):
        # Re-stem until the stemmer reaches a fixed point for this term.
        while stemmedArray[i] != p.stem(stemmedArray[i], 0, len(stemmedArray[i]) - 1):
            stemmedArray[i] = p.stem(stemmedArray[i], 0, len(stemmedArray[i]) - 1)
    return raw_data[0], raw_data[1:], stemmedArray[1:]
示例3: process_word
# 需要导入模块: from PorterStemmer import PorterStemmer [as 别名]
# 或者: from PorterStemmer.PorterStemmer import stem [as 别名]
def process_word(token):
    """Lower-case *token*, Porter-stemming it when constants.STEM is enabled."""
    lowered = token.lower()
    if constants.STEM is True:
        lowered = PorterStemmer().stem(lowered, 0, len(lowered) - 1)
    return lowered
示例4: __init__
# 需要导入模块: from PorterStemmer import PorterStemmer [as 别名]
# 或者: from PorterStemmer.PorterStemmer import stem [as 别名]
class Parser:
    """Lower-cases, cleans and Porter-stems English text, optionally
    removing stop words (a processor for the commoner morphological and
    inflexional endings of English words)."""

    stemmer = None   # PorterStemmer instance, created in __init__
    stopwords = []   # words dropped by removeStopWords (empty by default)

    def __init__(self):
        self.stemmer = PorterStemmer()
        # English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
        # self.stopwords = open('data/english.stop', 'r').read().split()

    def clean(self, string):
        """Remove any nasty grammar tokens from string and normalise spacing."""
        import re  # local import keeps this snippet self-contained
        string = string.replace(".", "")
        # BUG FIX: the original called string.replace("\s+", " "), which
        # replaces the *literal* three characters backslash-s-plus (a no-op
        # on normal text) instead of collapsing whitespace runs as the
        # regex-looking pattern clearly intended.
        string = re.sub(r"\s+", " ", string)
        string = string.lower()
        return string

    def removeStopWords(self, list):
        """ Remove common words which have no search value """
        return [word for word in list if word not in self.stopwords]

    def tokenise(self, string):
        """ break string up into tokens and stem words """
        string = self.clean(string)
        words = string.split(" ")
        return [self.stemmer.stem(word, 0, len(word) - 1) for word in words]
示例5: dict_qryid_terms
# 需要导入模块: from PorterStemmer import PorterStemmer [as 别名]
# 或者: from PorterStemmer.PorterStemmer import stem [as 别名]
def dict_qryid_terms(is_stopping):
    """Map each query id in QUERY_TEXT_FILE to its list of lower-cased,
    Porter-stemmed terms; when *is_stopping* is true, stop words are
    skipped before stemming."""
    global STOPWORDS_FILE
    stopword_list = stopwords(STOPWORDS_FILE)  # create stopwords list
    stemmer = PorterStemmer()                  # one Porter Stemmer instance
    query_dict = defaultdict(lambda: [])       # target dictionary
    with open(QUERY_TEXT_FILE, 'r') as handle:
        for line in handle:
            tokens = re.findall(r"[\w]+", line)
            query_id = tokens[0]
            for raw_term in tokens[1:]:
                raw_term = raw_term.lower()
                # Stop-word filtering only applies when is_stopping is set.
                if is_stopping and raw_term in stopword_list:
                    continue
                query_dict[query_id].append(stemmer.stem(raw_term, 0, len(raw_term) - 1))
    return query_dict
示例6: stem_string
# 需要导入模块: from PorterStemmer import PorterStemmer [as 别名]
# 或者: from PorterStemmer.PorterStemmer import stem [as 别名]
def stem_string(line):
    """Return *line* with every alphabetic run lower-cased and
    Porter-stemmed; all other characters are passed through lower-cased."""
    if line == "":
        return ""
    stemmer = PorterStemmer()
    pieces = []
    current = ""
    for ch in line:
        if ch.isalpha():
            current += ch.lower()
        else:
            if current:
                pieces.append(stemmer.stem(current, 0, len(current) - 1))
                current = ''
            pieces.append(ch.lower())
    # Flush a trailing word that ran to the end of the line.
    if current:
        pieces.append(stemmer.stem(current, 0, len(current) - 1))
    return "".join(pieces)
示例7: stemWords
# 需要导入模块: from PorterStemmer import PorterStemmer [as 别名]
# 或者: from PorterStemmer.PorterStemmer import stem [as 别名]
def stemWords(inList):
    """Function that stems the words.

    Name: stemWords; input: list (of tokens); output: list (of stemmed tokens)
    """
    stemmer = PorterStemmer()
    return [stemmer.stem(token, 0, len(token) - 1) for token in inList]
示例8: getTopTerms
# 需要导入模块: from PorterStemmer import PorterStemmer [as 别名]
# 或者: from PorterStemmer.PorterStemmer import stem [as 别名]
def getTopTerms(currentQuery, weightsMap, topX):
    """Pick up to *topX* expansion terms from *weightsMap*, highest weight
    first, skipping terms in constants.QUERY_SKIP_TERMS and any term whose
    stem already occurs in *currentQuery*.  Pass topX='ALL' for no limit."""
    stemmer = PorterStemmer()
    seen_stems = []
    for word in currentQuery.split():
        seen_stems.append(stemmer.stem(word.lower(), 0, len(word) - 1))
    picked = []
    for candidate in sorted(weightsMap, key=weightsMap.get, reverse=True):
        stem = stemmer.stem(candidate.lower(), 0, len(candidate) - 1)
        if candidate in constants.QUERY_SKIP_TERMS or stem in seen_stems:
            continue
        picked.append(candidate)
        seen_stems.append(stem)
        if topX != 'ALL' and len(picked) >= topX:
            break
    return picked
示例9: load_dictionary
# 需要导入模块: from PorterStemmer import PorterStemmer [as 别名]
# 或者: from PorterStemmer.PorterStemmer import stem [as 别名]
def load_dictionary(filename, stem=True):
    """Loads line separated dictionary into a list.

    Each line (including its trailing newline, as in the original) is
    lower-cased and, when *stem* is True, Porter-stemmed.
    """
    # Hoisted out of the loop: the original built a new PorterStemmer for
    # every word.
    p = PorterStemmer() if stem is True else None
    out = []
    # `with` guarantees the file handle is closed (the original leaked it).
    with open("dictionaries/%s" % filename, "r") as f:
        for word in f:
            word = word.lower()
            if p is not None:
                word = p.stem(word, 0, len(word) - 1)
            out.append(word)
    return out
示例10: format_description
# 需要导入模块: from PorterStemmer import PorterStemmer [as 别名]
# 或者: from PorterStemmer.PorterStemmer import stem [as 别名]
def format_description(text, stop_words):
    """Lower-case, strip punctuation from and stem every non-stop word of
    *text*, returning the kept words joined by single spaces.

    NOTE(review): stop words are matched against the *raw* token, before
    lowering and punctuation removal — assumed intentional, confirm with
    callers.  This example's stemmer takes the word as its only argument.
    """
    stemmer = PorterStemmer()
    kept = [
        stemmer.stem(remove_punctuation(word).lower())
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(kept)
示例11: stem_text
# 需要导入模块: from PorterStemmer import PorterStemmer [as 别名]
# 或者: from PorterStemmer.PorterStemmer import stem [as 别名]
def stem_text(text):
    """Porter-stem the alphabetic runs of *text*.

    The output contains only stemmed words and spaces: digits and
    punctuation are dropped entirely (only ' ' is re-emitted).
    """
    stemmer = PorterStemmer()
    pieces = []
    buf = ''
    last = len(text) - 1
    for idx, ch in enumerate(text):
        if ch.isalpha():
            buf += ch.lower()
        # Flush the buffered word at any non-letter and at end of input.
        if not ch.isalpha() or idx == last:
            if buf:
                pieces.append(stemmer.stem(buf, 0, len(buf) - 1))
                buf = ''
            if ch.lower() == ' ':
                pieces.append(ch.lower())
    return ''.join(pieces)
示例12: remove_porterstemmer
# 需要导入模块: from PorterStemmer import PorterStemmer [as 别名]
# 或者: from PorterStemmer.PorterStemmer import stem [as 别名]
def remove_porterstemmer(input_file, noise_words_set):
    """For each line of *input_file*, collect the Porter stems that are not
    in *noise_words_set* and are longer than two characters.

    Returns (word_weight, questions): a per-line Counter of the kept
    stems, and the per-line lists of kept stems.
    """
    questions = list()
    word_weight = []
    stemmer = PorterStemmer()
    for raw_line in input_file:
        lowered = raw_line.lower()
        # Split on non-word/digit runs; filter(None, ...) drops the empty
        # strings the pattern produces.
        tokens = filter(None, re.split("\W*\d*", lowered))
        kept = []
        for token in tokens:
            stem = stemmer.stem(token, 0, len(token) - 1)
            if stem not in noise_words_set and len(stem) > 2:
                kept.append(stem)
        questions.append(kept)
        word_weight.append(Counter(kept))
    return word_weight, questions
示例13: getQuestionKeywords
# 需要导入模块: from PorterStemmer import PorterStemmer [as 别名]
# 或者: from PorterStemmer.PorterStemmer import stem [as 别名]
def getQuestionKeywords(question):
    """Return the keywords from a question.

    The logic is: remove the stop words and punctuations from question, stem the keywords and remove duplicates
    Currently there are still issues with
    1. stop words list is not complete: eg "recommend" etc is not a stop word.
    2. stemmer issue: The current stemmer utility has an issue eg "restaurant" is stemmed to "restau"
    >>> getQuestionKeywords('what is the best preschool in Potomac?')
    ['potomac', 'preschool']
    >>> getQuestionKeywords('Can someone help with a preschool around potomac?')
    ['potomac', 'preschool']
    >>> getQuestionKeywords('What is the best cafeteria around potomac?')
    ['potomac', 'restaurant']
    """
    stemmer = PorterStemmer()
    unique_keywords = set()
    for raw in question.split():
        # strip punctuation, then lower-case
        keyword = raw.strip(PUNCTUATION).lower()
        # drop stop words
        if keyword in stopWords:
            continue
        # stem, then map through the synonyms table if present
        keyword = stemmer.stem(keyword, 0, len(keyword) - 1)
        if keyword in synonyms:
            keyword = synonyms[keyword]
        unique_keywords.add(keyword)
    # set membership already removed duplicates; return them sorted
    return sorted(unique_keywords)
示例14: __init__
# 需要导入模块: from PorterStemmer import PorterStemmer [as 别名]
# 或者: from PorterStemmer.PorterStemmer import stem [as 别名]
class Tokenizer:
    """ Helper class for tokenizing document space and removing stop words """

    corpus = None     # raw corpus string currently being processed
    terms = []        # tokens produced by tokenize()
    stop_words = []   # words filtered out before stemming
    stemmer = None    # PorterStemmer instance

    def __init__(self):
        # read stop words from file
        self.stop_words = open('stop_words.txt', 'r').read().split()
        self.stemmer = PorterStemmer()

    def tokenize(self, docs_string):
        """ Tokenizer's most important method.
        It separates the whole corpus string in tokens and
        removes stop words.
        """
        self.corpus = docs_string
        self.clean()
        self.terms = self.corpus.split(" ")
        self.remove_stop_words()
        self.remove_duplicates()
        return self.terms

    def clean(self):
        """ get rid of punctuation signs, convert to lower case, standardize spacing """
        self.corpus = self.corpus.replace(".", " ")
        self.corpus = self.corpus.replace(",", " ")
        self.corpus = self.corpus.lower()
        # NOTE(review): this replaces the *literal* text "\s+" (a no-op on
        # normal input), not a whitespace regex; left unchanged to preserve
        # behaviour — use re.sub(r"\s+", " ", ...) to really collapse spacing.
        self.corpus = self.corpus.replace("\s+", " ")

    def remove_stop_words(self):
        """ stem each remaining term, dropping stop words """
        self.terms = [self.stemmer.stem(term, 0, len(term) - 1)
                      for term in self.terms if term not in self.stop_words]

    def remove_duplicates(self):
        """ remove duplicated terms in the list """
        # BUG FIX: `from sets import Set` was deprecated in Python 2.6 and
        # removed in Python 3; the built-in set type is a drop-in
        # replacement for this use.
        self.terms = set(self.terms)
示例15: parse
# 需要导入模块: from PorterStemmer import PorterStemmer [as 别名]
# 或者: from PorterStemmer.PorterStemmer import stem [as 别名]
def parse(self):
    """Filter self.dataList in place: drop stop words, Porter-stem the
    remaining words, and reorder the stems from most to least frequent."""
    # remove stop words
    self.dataList = [w for w in self.dataList if not w in self.stopWords]
    # get the stem of the words
    st = PorterStemmer()
    self.dataList = [st.stem(w, 0, len(w) - 1) for w in self.dataList]
    # count occurrences.  BUG FIX: the original initialised a new word's
    # count to 0, so every frequency was off by one (harmless for the
    # ordering below, but wrong as a count).
    wordFreq = {}
    for word in self.dataList:
        wordFreq[word] = wordFreq.get(word, 0) + 1
    # Most frequent first: sort ascending (stable, matching the original)
    # and walk it backwards.  This replaces the original O(n^2)
    # insert(0, ...) loop with an O(n log n) sort plus a linear pass.
    ordered = sorted(wordFreq.items(), key=operator.itemgetter(1))
    self.dataList = [pair[0] for pair in reversed(ordered)]