本文整理汇总了 Python 中 nltk.PorterStemmer.stem 方法的典型用法代码示例。如果您正苦于以下问题:Python PorterStemmer.stem 方法的具体用法?Python PorterStemmer.stem 怎么用?Python PorterStemmer.stem 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 nltk.PorterStemmer 的用法示例。
在下文中一共展示了PorterStemmer.stem方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _log_likelihood
# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 或者: from nltk.PorterStemmer import stem [as 别名]
def _log_likelihood(answer_text, stemmed_vocabulary, distrib_matrix):
LL = 0
if answer_text is not '':
tokens = word_tokenize(str(answer_text), language='english')
porter_stemmer = PorterStemmer()
unique_wordcount = len(stemmed_vocabulary)
"""
per ogni w unica print_function words
Cw = conta w in answer_text
PwM = self.distrib_matrix[stemmer(w)]
unique_wordcount = len(tokenize(answer_text)
"""
for w in tokens:
_w = w.strip().lower()
Cw = 0
for _ in answer_text.split():
if _w == _.strip().lower():
Cw += 1
try:
w_stem = porter_stemmer.stem(_w.decode('utf-8', 'replace').encode('ascii', 'replace'))
except AttributeError:
w_stem = porter_stemmer.stem(_w)
try:
PwM = distrib_matrix[w_stem]
except KeyError: # key error means frequency is equal to cutoff point 1
PwM = 1
LL += (Cw * log(float(PwM)))
try:
LL = "{0:.2f}".format(LL / float(unique_wordcount))
except ZeroDivisionError:
LL = 0
return LL
示例2: stemming
# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 或者: from nltk.PorterStemmer import stem [as 别名]
def stemming(words_l, type="PorterStemmer", lang="english", encoding="utf8"):
    """Stem or lemmatize every word of ``words_l`` with the chosen tool.

    :param words_l: sequence of words to process.
    :param type: one of "PorterStemmer", "SnowballStemmer",
        "LancasterStemmer" or "WordNetLemmatizer"; ``False`` or any other
        value returns ``words_l`` untouched.
    :param lang: language passed to ``SnowballStemmer``.
    :param encoding: codec used to encode each resulting word.
    :return: ``words_l`` itself when no supported tool is selected, otherwise
        a new list holding the encoded result for each word.
    """
    supported = ("PorterStemmer", "SnowballStemmer",
                 "LancasterStemmer", "WordNetLemmatizer")
    if type is False or type not in supported:
        return words_l

    # Only the selected tool is instantiated.
    if type == "PorterStemmer":
        transform = PorterStemmer().stem
    elif type == "SnowballStemmer":
        transform = SnowballStemmer(lang).stem
    elif type == "LancasterStemmer":
        transform = LancasterStemmer().stem
    else:  # "WordNetLemmatizer"  TODO: context
        transform = WordNetLemmatizer().lemmatize

    return [transform(word).encode(encoding) for word in words_l]
示例3: get_ngram_features
# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 或者: from nltk.PorterStemmer import stem [as 别名]
def get_ngram_features(self):
    """Build ``self.ngram_features`` from the stemmed top/bottom/all tokens.

    Each token of ``self.top_text``, ``self.bottom_text`` and
    ``self.all_text`` is stemmed, suffixed with ``__TOP__``, ``__BOTTOM__``
    or ``__ALL__`` respectively, and stored as a key mapped to ``True``.
    """
    stemmer = PorterStemmer()
    sources = (
        ("top_text", "__TOP__"),
        ("bottom_text", "__BOTTOM__"),
        ("all_text", "__ALL__"),
    )
    features = {}
    for attribute, suffix in sources:
        for token in getattr(self, attribute):
            features[stemmer.stem(token) + suffix] = True
    self.ngram_features = features
示例4: stem
# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 或者: from nltk.PorterStemmer import stem [as 别名]
def stem(input):
    """Return a copy of the data set with every example stemmed word by word.

    :param input: mapping with the keys 'training', 'testing' (iterables of
        strings), 'training_labels' and 'testing_labels'.
    :return: dict with the same four keys; the examples are rebuilt by
        splitting on whitespace, stemming each word and joining with single
        spaces, while the labels are passed through unchanged.
    """
    from nltk import PorterStemmer

    stem_word = PorterStemmer().stem

    def stem_examples(examples):
        # Stem each whitespace-separated word and glue the example back together.
        stemmed = []
        for example in examples:
            stemmed.append(' '.join(stem_word(word) for word in example.split()))
        return stemmed

    stemmed_training = stem_examples(input['training'])
    stemmed_testing = stem_examples(input['testing'])
    return {
        'training': stemmed_training,
        'training_labels': input['training_labels'],
        'testing': stemmed_testing,
        'testing_labels': input['testing_labels'],
    }
示例5: process_email
# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 或者: from nltk.PorterStemmer import stem [as 别名]
def process_email(filename):
    """Read an e-mail file and return its normalised, stemmed words.

    The text is lower-cased, then HTML tags become spaces, digit runs become
    ``number``, URLs become ``httpaddr``, e-mail addresses become
    ``emailaddr`` and dollar signs become ``dollar``.  The result is split on
    anything that is not ``a-z``/``0-9`` and every non-empty piece is stemmed.

    :param filename: path of the e-mail file to read.
    :return: list of stemmed words.
    """
    # The context manager releases the handle even if read() fails.
    with open(filename, 'r') as f:
        text = f.read()
    text = text.lower()
    # Replace HTML tags with a space.
    text = re.sub(r'<[^<>]+>', ' ', text)
    # Replace numbers with the word "number".
    text = re.sub(r'[0-9]+', 'number', text)
    # Replace URLs with the word "httpaddr".
    text = re.sub(r'(http|https)://[^\s]*', 'httpaddr', text)
    # Replace e-mail addresses with the word "emailaddr".  The pattern
    # previously contained the literal text "[email protected]" (an address
    # obfuscation artefact) and therefore never matched a real address.
    text = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', text)
    # Replace dollar signs with the word "dollar".
    text = re.sub(r'[$]+', 'dollar', text)
    # Split on punctuation/non-word characters and drop the empty pieces.
    words = [word for word in re.split('[^a-z0-9]| ', text) if word != '']
    # Reduce the words to their stems.
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in words]
示例6: Model
# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 或者: from nltk.PorterStemmer import stem [as 别名]
class Model(FileIO):
    """Scoring model built on ``FileIO`` that keeps a stemmer and score tables."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``FileIO`` and create empty model state."""
        FileIO.__init__(self, *args, **kwargs)
        self.data_list = []
        self.stemmer = PorterStemmer()
        # NOTE(review): the right-hand sides of the next two assignments were
        # missing from the source; empty containers are assumed - confirm
        # against the original project.
        self.score_map = {}
        self.ranges = []

    def isInt(self, val):
        """Return True when ``int(val)`` succeeds, False on ``ValueError``."""
        try:
            int(val)
        except ValueError:
            return False
        return True

    def cleanString(self, word):
        """Return the stem of the lower-cased ``word``, or None if it is noise.

        A word is rejected when it is in ``stopwords``, is a single space, or
        parses as an integer.
        """
        # Compare the space by value; an identity test on a string literal is
        # not guaranteed to hold for equal strings.
        if word in stopwords or word == " " or self.isInt(word):
            return None
        return self.stemmer.stem(word.lower())

    def makeScoreList(self):
        '''Initialize a new array of 0s for each range'''
        s_list = [0] * len(self.ranges)
        # NOTE(review): the source snippet is cut off after the line above;
        # returning the list is the presumed intent - confirm.
        return s_list
示例7: main
# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 或者: from nltk.PorterStemmer import stem [as 别名]
def main():
    """Read ``sentiment.txt`` and return one list of stems per usable line.

    The first whitespace-separated field of every line is skipped.  The
    remaining words are decoded as UTF-8; punctuation marks and words for
    which ``has_stop_list`` is true are dropped and the rest are stemmed.  A
    line containing a word that cannot be decoded is discarded entirely.

    :return: list of lists of stemmed words.
    """
    punctuation = [".", ",", ":", "?", "!"]
    features = []
    with open("sentiment.txt", 'r') as _file:
        stemmer = PorterStemmer()
        for line in _file:
            stems = []
            # Skip the leading polarity label.
            for raw in line.split()[1:]:
                try:
                    token = raw.decode("utf-8")
                    if token in punctuation or has_stop_list(token):
                        continue
                    stems.append(stemmer.stem(token))
                except UnicodeDecodeError:
                    # Garbled text: ignore the whole sentence.
                    break
            else:
                # Reached only when no word failed to decode.
                features.append(stems)
    return features
示例8: openAndProcessingFiles
# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 或者: from nltk.PorterStemmer import stem [as 别名]
def openAndProcessingFiles(path,resultDict): # Main Function
    """Stem the words of every file in a directory and record them.

    For each file under ``os.getcwd() + path`` the text is whitespace-
    normalised, HTML tags are removed, each word goes through
    ``removeUnnecessaryCharacters`` (``None`` results are dropped), stop words
    are filtered out, and every remaining word is stemmed and handed to
    ``storeToResultDict`` together with ``resultDict``.

    :param path: directory path appended directly to the current working
        directory (no separator is inserted between the two).
    :param resultDict: object passed through to ``storeToResultDict``.
    """
    directory = os.getcwd() + path

    # Built once: neither the stop-word set nor the stemmer depends on the file.
    stop_words = set(stopwords.words('english'))
    # By analysing earlier result sets, new stop words keep being added here.
    stop_words.update(['texthtml', 'html', 'server', "email", 'date', 'gmt', 'www'])
    stemmer = PorterStemmer()

    for filename in os.listdir(directory):
        # The context manager closes the file even if reading fails.
        with open(directory + '/' + filename, 'r') as thisFile:
            # Collapse all whitespace so tags spanning lines can be removed.
            currentTextString = " ".join(thisFile.read().split())
        # Remove HTML tags.
        textAfterHtmlRemovingString = re.sub('<[^>]*>', '', currentTextString)
        cleanedWords = (
            removeUnnecessaryCharacters(word)
            for word in textAfterHtmlRemovingString.split()
        )
        for eachWord in cleanedWords:
            if eachWord is None or eachWord in stop_words:
                continue
            storeToResultDict(stemmer.stem(eachWord), resultDict)
示例9: review_to_words
# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 或者: from nltk.PorterStemmer import stem [as 别名]
def review_to_words(raw_review, remove_stopwords = False):
    """Turn a raw HTML review into a list of stemmed, lower-case words.

    :param raw_review: review markup handed to ``BeautifulSoup``.
    :param remove_stopwords: when true, English stop words are removed
        before stemming.
    :return: list of stemmed words.
    """
    # Strip HTML tags and markup, keeping only the text.
    plain = BeautifulSoup(raw_review).get_text()
    # Digit runs become the word "number"; every other non-letter becomes a
    # space (punctuation could be analysed separately for better results).
    plain = re.sub(r'[0-9]+', 'number', plain)
    plain = re.sub(r'[^a-zA-Z]', ' ', plain)
    candidates = plain.lower().split()

    if remove_stopwords:
        # A set makes the membership test cheaper than a list would.
        stops = set(stopwords.words('english'))
        candidates = [candidate for candidate in candidates if candidate not in stops]

    # Reduce the remaining words to their stems.
    stem_word = PorterStemmer().stem
    return [stem_word(candidate) for candidate in candidates]
示例10: stemm
# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 或者: from nltk.PorterStemmer import stem [as 别名]
def stemm(cls, tokens):
    """Replace every entry of ``tokens`` with its Porter stem, in place.

    :param tokens: mutable sequence of words; it is modified and returned.
    :return: the same ``tokens`` object.
    """
    stem_word = PorterStemmer().stem
    for position, token in enumerate(tokens):
        tokens[position] = stem_word(token)
    return tokens
示例11: normalize
# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 或者: from nltk.PorterStemmer import stem [as 别名]
def normalize(word):
    '''
    Normalize a word for querying or indexing.

    :param word: unicode string
    :return: unicode string holding the Porter stem of the word, or '' when
        the word is empty or does not start with a letter
    '''
    # Guard the empty case first: indexing word[0] on '' raised IndexError.
    if not word or not word[0].isalpha():
        return ''
    # The stemmer is only built for words that are actually stemmed.
    return PorterStemmer().stem(word)
示例12: Tokenizer
# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 或者: from nltk.PorterStemmer import stem [as 别名]
class Tokenizer(object):
    """Callable tokenizer that drops punctuation and can optionally stem."""

    def __init__(self):
        """Create the stemmer and the set of tokens treated as punctuation."""
        self.stem = PorterStemmer()
        marks = set(string.punctuation)
        marks.update(['·™','..','...','....','.....','......'])
        marks.update(["``", "·", "–", "--", "”","—","•","—"])
        self.punct = marks

    def _kept_tokens(self, doc):
        # Tokens of ``doc`` that are not listed as punctuation, lower-cased.
        kept = []
        for token in word_tokenize(doc):
            if token not in self.punct:
                kept.append(token.lower())
        return kept

    def __call__(self, doc):
        """Return the lower-cased, non-punctuation tokens of ``doc``."""
        return self._kept_tokens(doc)

    def stem_toke(self, doc):
        """Return the stems of the lower-cased, non-punctuation tokens of ``doc``."""
        return [self.stem.stem(token) for token in self._kept_tokens(doc)]
示例13: processContent
# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 或者: from nltk.PorterStemmer import stem [as 别名]
def processContent(self, content):
    """Build a frequency distribution of the stemmed words in ``content``.

    Tokens shorter than 20 characters that are alphanumeric are lower-cased
    and stemmed; stems found in the English stop-word list are discarded and
    the remainder is converted with ``str`` and counted.

    :param content: text to tokenize.
    :return: ``FreqDist`` over the remaining stems.
    """
    stemmer = PorterStemmer()
    stems = [
        stemmer.stem(token.lower())
        for token in word_tokenize(content)
        if len(token) < 20 and token.isalnum()
    ]
    # Load the stop words once; the list used to be re-read for every token.
    stop_words = set(stopwords.words('english'))
    kept = [str(stem) for stem in stems if stem not in stop_words]
    return FreqDist(kept)
示例14: main
# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 或者: from nltk.PorterStemmer import stem [as 别名]
def main():
    """Annotate a BioC collection with the Porter stem of every token.

    The input file is ``sys.argv[1]`` when given, otherwise ``BIOC_IN``.  The
    annotated collection is written to standard output and saved through a
    ``BioCWriter`` created for ``BIOC_OUT``.
    """
    # Use the file named on the command line, falling back to BIOC_IN.
    bioc_in = sys.argv[1] if len(sys.argv) >= 2 else BIOC_IN

    # The reader holds the example BioC XML document (it is given the BioC
    # DTD file for validation); the writer emits the annotated data.
    reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE)
    writer = BioCWriter(BIOC_OUT)
    # The NLTK Porter stemmer does the stemming.
    stemmer = PorterStemmer()

    reader.read()
    # Pass the basic data over to the writer and work on its documents.
    writer.collection = reader.collection

    next_id = 0
    for document in writer.collection.documents:
        for passage in document:
            # Stem every token of the passage before annotating it.
            stemmed_tokens = [
                stemmer.stem(token)
                for token in wordpunct_tokenize(passage.text)
            ]
            # One annotation per token, in the given order, carrying the
            # surface form of the stemmed token.  (The annotations are added
            # collectively after the passage's <text> tag.)
            for stemmed in stemmed_tokens:
                next_id += 1
                annotation = BioCAnnotation()
                annotation.text = stemmed
                annotation.id = str(next_id)
                annotation.put_infon('surface form', 'stemmed token')
                passage.add_annotation(annotation)

    # Print to the screen without a trailing newline (this can be redirected
    # into a file, e.g. output_bioc.xml), then write to disk.
    sys.stdout.write(str(writer))
    writer.write()
示例15: stemmingword
# 需要导入模块: from nltk import PorterStemmer [as 别名]
# 或者: from nltk.PorterStemmer import stem [as 别名]
def stemmingword(word_list, stemtype='porter'):
    """Stem every token of ``word_list`` and encode each stem as latin-1.

    :param word_list: iterable of tokens to stem.
    :param stemtype: 'porter' selects ``PorterStemmer``; any other value
        selects ``LancasterStemmer``.
    """
    if stemtype == 'porter':
        stemengine = PorterStemmer()
    else:
        stemengine = LancasterStemmer()
    try:
        # errors='ignore' drops characters that latin-1 cannot represent.
        filtered_words = [stemengine.stem(token).encode('latin-1', errors='ignore') for token in word_list]
    except UnicodeDecodeError, e:
        # The text is discarded and reported; the message is Spanish for
        # "character-type error, discarding text".
        print 'Error en el tipo de caracteres descartando texto "{}"'.format(' '.join(word_list))
    # NOTE(review): no return statement is visible in this excerpt - the
    # snippet may be cut off here; confirm what the function returns.