本文整理汇总了Python中PorterStemmer.PorterStemmer类的典型用法代码示例。如果您正苦于以下问题:Python PorterStemmer类的具体用法?Python PorterStemmer怎么用?Python PorterStemmer使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了PorterStemmer类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: process_word
def process_word(token):
    """Lower-case a token and, when stemming is enabled, reduce it to its stem."""
    token = token.lower()
    if constants.STEM is True:
        # Porter stemmer API: stem(word, first_index, last_index).
        token = PorterStemmer().stem(token, 0, len(token) - 1)
    return token
示例2: __init__
def __init__(self, parent, docno, doc, terms):
    """Build a dialog showing *doc*, with every query term highlighted in bold red.

    A word is highlighted when either its lowered/punctuation-stripped form or
    its Porter stem matches one of the comma-separated *terms*.
    """
    QtGui.QDialog.__init__(self, parent)
    self.setupUi(self)
    self.labelDocumentNo.setText(docno)

    # Two character formats: plain text vs. bold red for matched terms.
    plain_fmt = QtGui.QTextCharFormat()
    hit_fmt = QtGui.QTextCharFormat()
    hit_fmt.setForeground(QtGui.QBrush(QtGui.QColor("red")))
    hit_fmt.setFontWeight(QtGui.QFont.Bold)

    cursor = QtGui.QTextCursor(self.textEdit.document())
    cursor.beginEditBlock()

    stemmer = PorterStemmer()
    terms = terms.split(",")
    stemmed_terms = [stemmer.stem(t, 0, len(t) - 1) for t in terms]

    for line in unicode(doc).split("\n"):
        for word in line.split(" "):
            cleaned = word.lower().strip(punctuation)
            stemmed = stemmer.stem(cleaned, 0, len(cleaned) - 1)
            # Match on either the cleaned surface form or its stem.
            if cleaned in terms or stemmed in stemmed_terms:
                cursor.insertText(word, hit_fmt)
            else:
                cursor.insertText(word, plain_fmt)
            cursor.insertText(" ", plain_fmt)
        cursor.insertText("\n", plain_fmt)
    self.textEdit.moveCursor(QtGui.QTextCursor.Start)
示例3: stemWords
def stemWords(inList):
    """Stem the words.

    Name: stemWords; input: list (of tokens); output: list (of stemmed tokens)
    """
    stemmer = PorterStemmer()
    return [stemmer.stem(tok, 0, len(tok) - 1) for tok in inList]
示例4: load_dictionary
def load_dictionary(filename, stem=True):
"""Loads line separated dictionary into a list"""
out = []
for word in open("dictionaries/%s" % filename, "r"):
word = word.lower()
if stem is True:
p = PorterStemmer()
word = p.stem(word, 0,len(word)-1)
out.append(word)
return out
示例5: format_description
def format_description(text, stop_words):
    """Drop stop words from *text*, then lower, de-punctuate and stem the rest.

    Note: the stop-word test runs on the raw token, before lowering.
    """
    stemmer = PorterStemmer()
    kept = [
        stemmer.stem(remove_punctuation(word).lower())
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(kept)
示例6: stem_text
def stem_text(text):
    """Return *text* with each alphabetic run Porter-stemmed.

    Non-alphabetic characters are dropped, except spaces, which are kept
    as separators.
    """
    stemmer = PorterStemmer()
    pieces = []
    pending = ''
    last = len(text) - 1
    for idx, ch in enumerate(text):
        if ch.isalpha():
            pending += ch.lower()
        # Flush the pending word at a non-letter or at the very end of text.
        if not ch.isalpha() or idx == last:
            if pending:
                pieces.append(stemmer.stem(pending, 0, len(pending) - 1))
                pending = ''
        if ch.lower() == ' ':
            pieces.append(' ')
    return ''.join(pieces)
示例7: remove_porterstemmer
def remove_porterstemmer(input_file, noise_words_set):
    """Tokenise, stem and de-noise each line of *input_file*.

    Returns (word_weight, questions): questions is a list of stemmed-token
    lists, one per line; word_weight holds the matching per-line Counters.
    """
    stemmer = PorterStemmer()
    questions = []
    word_weight = []
    for raw_line in input_file:
        lowered = raw_line.lower()
        tokens = filter(None, re.split("\W*\d*", lowered))
        kept = []
        for token in tokens:
            stemmed = stemmer.stem(token, 0, len(token) - 1)
            # Keep only non-noise stems longer than two characters.
            if stemmed not in noise_words_set and len(stemmed) > 2:
                kept.append(stemmed)
        questions.append(kept)
        word_weight.append(Counter(kept))
    return word_weight, questions
示例8: getStemWords
def getStemWords(query_line, stopwords):
    """Split a query line into its id, raw terms and stemmed terms.

    The first whitespace token is treated as the query id.  Each remaining
    term is re-stemmed until it reaches a fixed point.  Note stemmedArray
    aliases raw_data, so raw_data[1:] also reflects the stemming (this
    mirrors the original behaviour).
    """
    cleaned = query_line.replace(".", "").replace(",", "").replace('"', "") \
        .replace("\n", "").replace("-", " ").replace("(", "").replace(")", "")
    raw_data = cleaned.split(" ")
    # Delete every occurrence of each stop word.
    for stop in stopwords:
        while stop in raw_data:
            raw_data.remove(stop)
    stemmedArray = raw_data
    stemmer = PorterStemmer()
    for idx in range(1, len(stemmedArray)):
        # Iterate stemming until the stem stops changing.
        while stemmedArray[idx] != stemmer.stem(stemmedArray[idx], 0, len(stemmedArray[idx]) - 1):
            stemmedArray[idx] = stemmer.stem(stemmedArray[idx], 0, len(stemmedArray[idx]) - 1)
    return raw_data[0], raw_data[1:], stemmedArray[1:]
示例9: __init__
class Parser:
    """Tokeniser that removes stop words and Porter-stems English words."""

    # A processor for removing the commoner morphological and inflexional
    # endings from words in English.
    stemmer = None
    stopwords = []

    def __init__(self,):
        self.stemmer = PorterStemmer()
        # English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
        # self.stopwords = open('data/english.stop', 'r').read().split()

    def clean(self, string):
        """ remove any nasty grammar tokens from string """
        string = string.replace(".", "")
        # BUG FIX: the original called string.replace("\s+", " "), which
        # searches for the literal characters backslash-s-plus (str.replace
        # takes no regex).  Collapse whitespace runs to single spaces instead.
        string = " ".join(string.split())
        string = string.lower()
        return string

    def removeStopWords(self, list):
        """ Remove common words which have no search value """
        return [word for word in list if word not in self.stopwords]

    def tokenise(self, string):
        """ break string up into tokens and stem words """
        string = self.clean(string)
        words = string.split(" ")
        return [self.stemmer.stem(word, 0, len(word) - 1) for word in words]
示例10: dict_qryid_terms
def dict_qryid_terms(is_stopping):
    """Map each query id to its list of Porter-stemmed terms.

    Reads QUERY_TEXT_FILE; the first \\w+ token on each line is the query id.
    When *is_stopping* is true, terms in the stopword list are skipped.
    """
    global STOPWORDS_FILE
    stopword_list = stopwords(STOPWORDS_FILE)  # create stopwords list
    stemmer = PorterStemmer()  # create a Porter Stemmer instance
    dictquery = defaultdict(list)  # the target dictionary
    with open(QUERY_TEXT_FILE, 'r') as handle:
        for line in handle:
            tokens = re.findall(r"[\w]+", line)
            query_id = tokens[0]
            for raw_term in tokens[1:]:
                term = raw_term.lower()
                # Only filter stop words when stopping is requested.
                if is_stopping and term in stopword_list:
                    continue
                dictquery[query_id].append(stemmer.stem(term, 0, len(term) - 1))
    return dictquery
示例11: stem_string
def stem_string(line):
    """Return *line* with alphabetic runs stemmed and all other chars lowered."""
    if line == "":
        return ""
    stemmer = PorterStemmer()
    pieces = []
    pending = ""
    for ch in line:
        if ch.isalpha():
            pending += ch.lower()
            continue
        # Non-letter: flush any pending word, then keep the char (lowered).
        if pending:
            pieces.append(stemmer.stem(pending, 0, len(pending) - 1))
            pending = ''
        pieces.append(ch.lower())
    # Flush a word that runs to the end of the line.
    if pending:
        pieces.append(stemmer.stem(pending, 0, len(pending) - 1))
    return "".join(pieces)
示例12: __init__
def __init__(self):
    """Initialise empty corpus containers and the text pre-processing tools."""
    # For holding the data - initialized in read_data()
    self.titles = []
    self.docs = []
    self.vocab = []
    # For the text pre-processing: matches every non-alphanumeric character,
    # plus a Porter stemmer instance.
    self.alphanum = re.compile('[^a-zA-Z0-9]')
    self.p = PorterStemmer()
示例13: getQuestionKeywords
def getQuestionKeywords(question):
    """Return the keywords from a question.

    The logic is: remove the stop words and punctuations from question, stem
    the keywords and remove duplicates.
    Currently there are still issues with
    1. stop words list is not complete: eg "recommend" etc is not a stop word.
    2. stemmer issue: The current stemmer utility has an issue eg "restaurant" is stemmed to "restau"

    >>> getQuestionKeywords('what is the best preschool in Potomac?')
    ['potomac', 'preschool']

    >>> getQuestionKeywords('Can someone help with a preschool around potomac?')
    ['potomac', 'preschool']

    >>> getQuestionKeywords('What is the best cafeteria around potomac?')
    ['potomac', 'restaurant']
    """
    stemmer = PorterStemmer()
    keywords = set()  # set gives de-duplication for free
    for token in question.split():
        # strip punctuation and lower-case before the stop-word check
        token = token.strip(PUNCTUATION).lower()
        if token in stopWords:
            continue
        stemmed = stemmer.stem(token, 0, len(token) - 1)
        # replace the stem with its canonical synonym when one is defined
        keywords.add(synonyms[stemmed] if stemmed in synonyms else stemmed)
    return sorted(keywords)
示例14: parse
def parse(self):
    """Filter stop words out of self.dataList, stem it, and reorder by frequency.

    self.dataList is replaced with the unique stems, most frequent first.
    """
    stemmer = PorterStemmer()
    # Drop stop words, then stem what remains.
    stems = [stemmer.stem(w, 0, len(w) - 1)
             for w in self.dataList if w not in self.stopWords]
    # Count occurrences (a first sighting scores 0; only the relative
    # ordering of the counts matters below).
    freq = {}
    for stem in stems:
        freq[stem] = freq[stem] + 1 if stem in freq else 0
    # Ascending sort by count, then reverse => descending by count, with the
    # same tie order the original produced via repeated insert(0, ...).
    ordered = sorted(freq.iteritems(), key=operator.itemgetter(1))
    self.dataList = [pair[0] for pair in reversed(ordered)]
示例15: __init__
def __init__(self, path, num_records):
    """Set up the stemmer and stop-word set, then optionally process a file.

    Stop words are read one per line from "stop.words.dat".  Processing is
    skipped when *path* is empty or *num_records* is 0.
    """
    self.porter = PorterStemmer()
    with open("stop.words.dat", "r") as stopword_file:
        # line[:-1] drops the trailing newline of each stop word
        self.stop = set(line[:-1] for line in stopword_file)
    if path != "" and num_records != 0:
        self.process(path, num_records)