This article collects typical usage examples of the nltk.stem.PorterStemmer class in Python. If you have been wondering what the PorterStemmer class is for, how to use it, or what real code that uses it looks like, the curated class examples below should help.
The following shows 15 code examples of the PorterStemmer class, ordered roughly by popularity.
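Before diving into the examples, here is a minimal sketch of the class used on its own; the sample words are illustrative, and the stems in the comment are what the Porter algorithm typically produces.

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
for word in ["running", "flies", "easily", "cats"]:
    print(word, "->", stemmer.stem(word))  # e.g. running -> run, flies -> fli, easily -> easili, cats -> cat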
Example 1: _stemmatize
def _stemmatize(self, word):
    lmtzr = WordNetLemmatizer()  # the lemmatizer won't stem words ending in '-ing' unless told the word is a verb
    stemmer = PorterStemmer()
    if word.endswith('ing'):
        return stemmer.stem(word)
    return lmtzr.lemmatize(word)
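The same idea as a standalone function, for quick experimentation outside the original class; the helper name and the sample words below are chosen purely for illustration.

from nltk.stem import PorterStemmer, WordNetLemmatizer

def stemmatize(word):
    # Porter stemmer for '-ing' forms, WordNet lemmatizer for everything else
    if word.endswith('ing'):
        return PorterStemmer().stem(word)
    return WordNetLemmatizer().lemmatize(word)

print(stemmatize('running'))  # e.g. 'run'
print(stemmatize('geese'))    # e.g. 'goose'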
Example 2: porter_list1
def porter_list1(lista):
    stemmer = PorterStemmer()
    newlist = []
    for b in lista:
        b = stemmer.stem(b)
        newlist.append(b)
    return newlist
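A quick usage sketch for porter_list1, assuming the function above and its PorterStemmer import are in scope; the input list and the expected stems are illustrative.

print(porter_list1(["running", "flies", "easily"]))  # e.g. ['run', 'fli', 'easili']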
Example 3: splitAndStem
def splitAndStem(inputfilename, outputfilename):
    '''
    For each ingredient, split it into words, stem each word, and construct a new recipe from those words.
    :param inputfilename: file containing a Python-literal list of recipe dicts
    :param outputfilename: file the stemmed recipes are written to
    '''
    with open(outputfilename, 'w') as ff:
        ff.write('[\n')
    with open(inputfilename) as f:
        d = eval(f.read())
    stemmer = PorterStemmer()
    with open(outputfilename, 'a') as ff:
        for i in d:
            # print(i)
            new_item = {}
            new_ingredients = []
            for ingredient in i['ingredients']:
                tokens = word_tokenize(ingredient)
                clean_tokens = [re.subn('[^A-Za-z]', '', token)[0] for token in tokens]
                new_ingredients += [stemmer.stem(w).lower() for w in clean_tokens]
            new_item['cuisine'] = i['cuisine']
            new_item['id'] = i['id']
            new_item['ingredients'] = new_ingredients
            json_recipe = json.dumps(new_item)
            ff.write('%s,\n' % str(json_recipe))
Example 4: parseReviews
def parseReviews(mypath):
    filelist = os.listdir(mypath)
    wordDict = {}
    negationList = ["no", "not", "never", "can't", "won't", "cannot", "didn't", "couldn't"]
    negationFlag = False
    stopwordList = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    for file in filelist:
        with open(mypath + "/" + file, "r") as f:
            word_list = word_tokenize(f.read())
        for word in word_list:
            if word in negationList:
                # a second negation cancels the first (double negative)
                if negationFlag:
                    negationFlag = False
                else:
                    negationFlag = True
                continue
            if not word.isalnum():
                negationFlag = False
            if word.isalnum() and word not in stopwordList:
                word = stemmer.stem(word)
                if negationFlag:
                    word = "!" + word
                    negationFlag = False
                if word not in wordDict:
                    wordDict[word] = 1
                else:
                    wordDict[word] += 1
    return wordDict
Example 5: tokenizeTags
def tokenizeTags(str, dict_items):
    # temp map (for getting the local term frequency) for a sentence
    str = str.decode('ascii', 'ignore')
    # tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
    # tokens = tokenizer.tokenize(str)
    tokens = str.split()
    stemmer = PorterStemmer()
    # small set of stopwords passed in via dict_items (removes you, are, and, I -- those kinds of words)
    last = []
    # bigram_list = []
    for d in tokens:
        d = d.split('-')
        for c in d:
            # regular expression -> strip punctuation
            c = re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
            if c != '' and c not in dict_items:
                try:
                    if int(c):
                        # collapse numbers to a NUM token, but keep 4-digit years
                        if len(c) != 4 and (int(c) > 2015 or int(c) < 1900):
                            c = stemmer.stem('NUM')
                except Exception:
                    c = stemmer.stem(c.lower())
                last.append(c)
                # bigram generation
                # index = len(last)
                # if index > 1:
                #     bigram = last[index-2] + ' ' + last[index-1]
                #     bigram_list.append(bigram)
    return last
Example 6: tokenize2_bigram
def tokenize2_bigram(str, df_freq):
    temp_map = {}
    # for a sentence
    str = str.decode('ascii', 'ignore')
    tokens = str.split()
    stemmer = PorterStemmer()
    last = []
    bigram_list = []
    for d in tokens:
        d = d.split('-')
        for c in d:
            # regular expression -> strip punctuation
            c = re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
            if c != '':
                try:
                    if int(c):
                        # collapse numbers to a NUM token, but keep 4-digit years
                        if len(c) != 4 and (int(c) > 2015 or int(c) < 1900):
                            c = stemmer.stem('NUM')
                except Exception:
                    c = stemmer.stem(c.lower())
                last.append(c)
                # bigram generation
                index = len(last)
                if index > 1:
                    bigram = last[index-2] + ' ' + last[index-1]
                    bigram_list.append(bigram)
                    updateDF(temp_map, df_freq, bigram)
    return bigram_list
Example 7: openfile
def openfile(filename, output):
    print(filename)
    # start run time
    start = timeit.default_timer()
    ps = PorterStemmer()
    file = open(filename, "r")
    tokens = []
    # used for removing punctuation from the documents
    translate_table = dict((ord(char), None) for char in string.punctuation)
    start2 = timeit.default_timer()
    # split the lines into words and remove the punctuation
    for line in file:
        tokens += word_tokenize(line.translate(translate_table))
    start3 = timeit.default_timer()
    print("tokenize")
    print(start3 - start2)
    # create a set of stop words to be removed later
    stop_words = set(stopwords.words("english"))
    start6 = timeit.default_timer()
    # if a word is not a stop word, add it to a list
    filtered_sentence = []
    for w in tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    start7 = timeit.default_timer()
    print("stop word removal")
    print(start7 - start6)
    startw = timeit.default_timer()
    # stem each word and add it to the output file in CSV form
    f = open(output, 'w')
    iterFilSen = iter(filtered_sentence)
    if output == "documents.csv":
        for w in filtered_sentence:
            if w == "I":
                f.write("\n")
            f.write(ps.stem(w))
            f.write(",")
    else:
        for w in iterFilSen:
            if w == "I":
                f.write("\n")
                # skip the "I <number> W" marker
                next(iterFilSen)
                next(iterFilSen)
            else:
                f.write(ps.stem(w))
                f.write(",")
    # end run time
    stop = timeit.default_timer()
    print("writing")
    print(stop - startw)
    print("total: " + output)
    print(stop - start)
Example 8: testing
def testing():
    # tokenize on sentence and word
    ex_txt = "hello there Mr. Bartuska, How are you? The weather is great and I enjoy Python. cheers!"
    print(sent_tokenize(ex_txt))
    print(word_tokenize(ex_txt, language='english'))
    # stop words (pre-defined by nltk)
    stop_words = set(stopwords.words('english'))
    print(stop_words)
    words = word_tokenize(ex_txt)
    print(words)
    filtered_sent = []
    for w in words:
        if w not in stop_words:
            filtered_sent.append(w)
    print(filtered_sent)
    filtered_sent = [w for w in words if w not in stop_words]
    print(filtered_sent)
    # stemming
    ps = PorterStemmer()
    example_words = ['python', 'pythoner', 'pythoning', 'pythoned', 'pythonly']
    # for w in example_words:
    #     print(ps.stem(w))
    new_text = "it is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
    words = word_tokenize(new_text)
    for w in words:
        print(ps.stem(w))
Example 9: prepare_data
def prepare_data(reviews):
    # run the Porter stemmer on every word
    stemmer = PorterStemmer()
    stem_text = lambda x: {'class': x['class'],
                           'text': stemmer.stem(x['text'])}
    # clean text and remove empty items
    reviews = filter(lambda x: x != {}, reviews)
    reviews = map(stem_text, reviews)
    print('classification: ' + reviews[observed_element]['class'] + '\n\n------------------------------------\n\n')
    print('stemming: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')
    # remove stopwords
    reviews = map(remove_stop_words, reviews)
    print('stopwords: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')
    # remove undesired patterns
    reviews = map(clean_text, reviews)
    print('elementos inuteis: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')
    return reviews
Example 10: extract_clean_sentences
def extract_clean_sentences(self):
    """
    Extracts sentences from plain text. Also applies the following cleaning
    operations:
    - Exclude all characters not recognized by 'utf-8' encoding
    - Exclude all characters not contained in [a-zA-Z0-9 '-]
    - Exclude common stopwords
    """
    text = self.raw_text
    exclude = re.compile(r"[^a-zA-Z0-9 '-]")
    linebreaks = re.compile(r'\s')
    excess_space = re.compile(r'\s+')
    stemmer = PorterStemmer()
    sentences = sent_tokenize(text)
    out = []
    for sentence in sentences:
        sentence = linebreaks.sub(' ', sentence)
        sentence = exclude.sub(' ', sentence)
        sentence = excess_space.sub(' ', sentence)
        tokens = word_tokenize(sentence)
        tokens = [stemmer.stem(t.lower()) for t in tokens]
        out.append(tokens)
    return out
Example 11: preprocess_document
def preprocess_document(doc):
    stopset = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    tokens = wordpunct_tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    return final
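A usage sketch for preprocess_document, assuming the imports used above (stopwords, wordpunct_tokenize, PorterStemmer) are available; the sentence and the exact output are illustrative.

print(preprocess_document("The quick brown foxes are jumping over the lazy dogs."))
# e.g. ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']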
Example 12: preprocess
def preprocess(text):
    stemmer = PorterStemmer()
    stop = stopwords.words('english')
    tokens = [tok for tok in word_tokenize(text.lower())
              if tok not in stop]
    tokens_stemmed = [stemmer.stem(tok) for tok in tokens]
    return tokens_stemmed
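For comparison, a similar sketch for preprocess; note that word_tokenize keeps punctuation as separate tokens and this version does not filter them out, so the expected output (illustrative) still contains the '!'.

print(preprocess("Cats are running faster than dogs!"))
# e.g. ['cat', 'run', 'faster', 'dog', '!']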
Example 13: preprocessing
def preprocessing(text, debug=False):
    if debug:
        print text
    # lower case
    text = text.lower()
    if debug:
        print text
    # can't -> cannot, bya's -> bya is
    text = replacers.RegexpReplacer().replace(text)
    if debug:
        print text
    # word tokenize
    words = word_tokenize(text)
    if debug:
        print words
    # remove stopwords
    english_stops = set(stopwords.words('english'))
    english_stops_added = english_stops | {'.', ',', ':', ';'}
    words = [word for word in words if word not in english_stops_added]
    if debug:
        print words
    # stem words
    stemmer = PorterStemmer()
    words_stemmed = list(map(lambda word: stemmer.stem(word), words))
    if debug:
        print words_stemmed
    return words, words_stemmed
Example 14: buildVocab
def buildVocab(self):
    '''Build a vocabulary for the selected documents (from dir database).'''
    # Note: The source of text should be Lucene-processed field values. Lucene tokenized the text,
    # removed stop words, and may have taken other unknown steps.
    # Right now the vocabulary is built on the raw text with NLTK-based stopword removal and
    # tokenization. This should be improved.
    # collect contents from /database/ for each of these docs
    for pmid in self.pmidList:  # self.pmidList includes the query and the 99 most similar articles selected by BM25
        self.corpus.append(file(os.path.join(self.dbDir, pmid)).read())  # corpus contains raw text (MH, title*2, abstract)
    for text in self.corpus:
        sent_tokenize_list = sent_tokenize(text.strip().lower(), "english")  # tokenize an article text
        stemmed_text = []
        if sent_tokenize_list:  # if sent_tokenize_list is not empty
            porter_stemmer = PorterStemmer()
            for sent in sent_tokenize_list:
                words = TreebankWordTokenizer().tokenize(sent)  # tokenize the sentence
                words = [word.strip(string.punctuation) for word in words]
                words = [word for word in words if word not in stopwords.words("english")]
                words = [word for word in words if len(word) > 1]  # remove single letters and non-alphabetic tokens
                words = [word for word in words if re.search('[a-zA-Z]', word)]
                words = [porter_stemmer.stem(word) for word in words]  # apply the Porter stemmer
                stemmed_text.append(" ".join(words))
                self.vocab += words
        self.stemmed_corpus.append(". ".join(stemmed_text))  # append a stemmed article text
    # save stemmed corpus
    pickle.dump(self.stemmed_corpus, file(os.path.join(self.stemmed_corpusDir, str(self.pmidList[0])), "w"))
    # remove low-frequency tokens and redundant tokens
    tokenDist = Counter(self.vocab)
    lowFreqList = []
    for token, count in tokenDist.iteritems():
        if count < 2:
            lowFreqList.append(token)
    self.vocab = list(set(self.vocab) - set(lowFreqList))
    # save vocabulary
    pickle.dump(self.vocab, file(os.path.join(self.vocabDir, str(self.pmidList[0])), "w"))
Example 15: StemmedBagOfWordsFeatureGenerator
class StemmedBagOfWordsFeatureGenerator(EdgeFeatureGenerator):
    """
    Generates a stemmed bag-of-words representation for each sentence that contains
    an edge, using the function given in the argument.
    By default it uses the Porter stemmer.

    :type feature_set: nala.structures.data.FeatureDictionary
    :type stemmer: nltk.stem.PorterStemmer
    :type stop_words: list[str]
    :type training_mode: bool
    """

    def __init__(self, feature_set, stop_words=[], training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.stemmer = PorterStemmer()
        """an instance of the PorterStemmer"""
        self.stop_words = stop_words
        """a list of stop words"""

    def generate(self, dataset):
        for edge in dataset.edges():
            sentence = edge.part.sentences[edge.sentence_id]
            if self.training_mode:
                for token in sentence:
                    if self.stemmer.stem(token.word) not in self.stop_words and not token.features['is_punct']:
                        feature_name = '4_bow_stem_' + self.stemmer.stem(token.word) + '_[0]'
                        self.add_to_feature_set(edge, feature_name)