This page collects typical usage examples of the Python method nltk.stem.PorterStemmer.stem_word. If you have been wondering what exactly PorterStemmer.stem_word does, how to use it, or where to find working examples, the curated code samples below should help. You can also explore further usage examples of the class it belongs to, nltk.stem.PorterStemmer.
Nine code examples of PorterStemmer.stem_word are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
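Before the examples, here is a minimal sketch of calling the stemmer directly. Note that stem_word comes from older NLTK releases; newer versions expose the equivalent PorterStemmer.stem method instead, so this sketch assumes a version where stem_word is still available.

# Minimal sketch, assuming an older NLTK that still exposes stem_word
# (on recent NLTK releases, use the equivalent stem method instead).
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
for word in ["running", "cats", "easily"]:
    # each word is reduced to its Porter stem, e.g. "running" -> "run"
    print(stemmer.stem_word(word))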
Example 1: StemmerTokenizer
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem_word [as alias]
from nltk.tokenize import word_tokenize

class StemmerTokenizer():
    def __init__(self):
        self.lem = PorterStemmer()

    def __call__(self, string):
        # lowercase, tokenize, then Porter-stem each token
        tokens = word_tokenize(string.lower())
        return [self.lem.stem_word(t) for t in tokens]
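Because StemmerTokenizer implements __call__, an instance can be passed wherever a tokenizer callable is expected. The following is a hypothetical usage sketch (not part of the original example) that plugs it into scikit-learn's CountVectorizer; it assumes scikit-learn is installed and the same older NLTK as above.

# Hypothetical usage sketch: the callable instance serves as a custom tokenizer.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=StemmerTokenizer())
X = vectorizer.fit_transform(["The runners were running quickly."])
print(sorted(vectorizer.vocabulary_))  # stemmed vocabulary terms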
Example 2: preprocess
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem_word [as alias]
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess(text):
    stemmer = PorterStemmer()
    stop = stopwords.words("english")
    result = word_tokenize(text)
    # drop stopwords, punctuation, and digit tokens, then lowercase and stem the rest
    result = [stemmer.stem_word(word.lower()) for word in result if
              word not in stop and
              word not in string.punctuation and
              word not in string.digits]
    return result
Example 3: dialogue_act_features
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem_word [as alias]
import nltk
from collections import defaultdict

def dialogue_act_features(post):
    words = nltk.word_tokenize(post)
    sentences = nltk.sent_tokenize(post)
    features = {
        'word_diversity': len(words) / len(set(words)),
    }
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem_word(w) for w in words]
    # words
    for word in set(stemmed_words):
        features['contains(%s)' % word.lower()] = True
    # check for presence/absence of specific words
    check_words = [
        'who', 'what', 'where', 'why', 'how',  # question words
        'love', 'hate', 'despis',  # emotional words (?)
    ]
    for word in check_words:
        features['contains(%s)' % word] = word in stemmed_words
    # punctuation
    for punctuation in ['?', '!', '!!', '?!', '"', '...', '.']:
        features['punctuation_count(%s)' % punctuation] = post.count(punctuation)
    # skip parts of speech for now - slow, not helping much
    return features

    # get counts for parts of speech (unreachable while the early return above is in place)
    pos_count = defaultdict(int)
    for sentence in sentences:
        # tokenize the sentence into words and tag parts of speech
        sentence_words = nltk.word_tokenize(sentence)
        # - using the nltk parts-of-speech tagger for now
        #   (other options may be faster/more accurate)
        pos_sentence = nltk.pos_tag(sentence_words)
        for word, pos in pos_sentence:
            pos_count['pos_%s' % pos] += 1
    # include final counts by part of speech in the features
    features.update(pos_count)
    return features
Example 4: __init__
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem_word [as alias]
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

class PreProcess:
    def __init__(self):
        self.tokenizer = word_tokenize
        self.stemmer = PorterStemmer()
        self.punct = string.punctuation
        self.digits = string.digits
        self.stop = stopwords.words("english")

    def process_sent(self, snt):
        # tokenize, drop stopwords/digits/punctuation, lowercase and stem the rest
        snt = self.tokenizer(snt)
        snt = [self.stemmer.stem_word(wrd.lower()) for wrd in snt if
               wrd not in self.stop and
               wrd not in self.digits and
               wrd not in self.punct]
        return snt

    def process(self, snts):
        return [self.process_sent(snt) for snt in snts]
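A hypothetical usage sketch for PreProcess follows (not part of the original example; the sentences are illustrative, and the NLTK punkt and stopwords data are assumed to be downloaded):

# Hypothetical usage sketch (assumes the nltk punkt and stopwords data are available).
pp = PreProcess()
sentences = ["The cats were chasing 3 mice!", "Stemming reduces words to their roots."]
print(pp.process(sentences))  # one list of stemmed, filtered tokens per sentence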
Example 5: stem
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem_word [as alias]
# Method excerpt: relies on the enclosing class's self.stopword() helper, and on
# RegexpTokenizer, WordNetLemmatizer, and PorterStemmer being imported from nltk.
def stem(self, input_text):
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    stemmed_text = []
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    text = tokenizer.tokenize(str(input_text))
    filtered_text = self.stopword(text)
    for word in filtered_text:
        if word.isalpha():
            if len(word) > 4:
                stemmed_text.append(stemmer.stem_word(word).lower())
            else:
                stemmed_text.append(word.lower())
    # drop words shorter than 3 characters (the original removed items while
    # iterating over the same list, which can skip elements)
    stemmed_text = [word for word in stemmed_text if len(word) >= 3]
    return stemmed_text
Example 6: ambiguous
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem_word [as alias]
""" General approach: search for control-type structures which may be ambiguous (with raising),
then search for those verbs to see if they exist in "There[ex] VERB" contexts.
e.g. we find "John seems to be beside himself today",
so we search for "/[tT]here/ . (/VB/ < /^(seem)/)";
if this returns any results, "seem" must be a raising verb.
"""
from pdb import set_trace
import runTregex as tx
from nltk.stem import PorterStemmer

ps = PorterStemmer()
treebank_dir = "/home/chase/CompLing/stanford-tregex-2012-03-09/treebank_3/parsed/mrg/wsj/"
unfiltered = set()
# `trees` is assumed to be defined earlier in the original script
for t in trees:
    unfiltered.add(ps.stem_word(t.matchTree.leaves()[0]).lower())
# this takes forever and isn't really too effective...
for word in unfiltered:
    pat = "(/[Tt]here/ > EX) . /^%s/" % word
    reload(tx)
    trees = tx.Treebank(treebank_dir, pat)
    trees.run()
    if len(trees) > 0:
        print word
Example 7: Counter
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem_word [as alias]
# Fragment of a loop over the lines of an XML dump; the enclosing `for line in ...:`
# and the names iD, inPage, inText, stem, pattern, list, stopwords, re, nltk,
# and Counter are defined/imported earlier in the original script.
        True_iD = True
        continue
    if inPage and line.find("<text") != -1:
        inText = True
        continue
    if inPage and True_iD and line.find("<id>") != -1:
        iD.append(line[len("<id>"):-len("</id>")])
        True_iD = False
    if inPage and line.find("/text") != -1:
        inText = False
        text = ' '.join(list)
        # Tokenizing text for each XML page
        temp = text.decode("utf-8", "ignore")
        temp = temp.replace(u'\ufeff', ' ')
        temp_1 = re.sub(pattern, " ", temp)
        temp_1 = temp_1.lower()
        res = []
        for x in temp_1.split():
            if x not in stopwords.words('english'):
                res.append(x)
        clean_text = " ".join(stem.stem_word(word) for word in res)
        tokens = nltk.word_tokenize(clean_text)
        cnt = Counter(tokens)
        print("[[%s]]\t[[%.0f]]") % (dict(cnt), int(iD[0]))
        list = []
        continue
Example 8: set
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem_word [as alias]
# Excerpt from a tutorial script: example_sentence is defined earlier, and
# stopwords, word_tokenize, state_union, PunktSentenceTokenizer, and nltk
# are imported from their usual NLTK locations.
stop_words = set(stopwords.words("english"))
words = word_tokenize(example_sentence)
filter_sentence = [w for w in words if w not in stop_words]
print(filter_sentence)

##### STEMMER EXAMPLE #####
ps = PorterStemmer()
example_words = ["pythone", "pythoner", "pythoning", "pythoned", "pythonly"]
for w in example_words:
    print(ps.stem_word(w))

##### SENTENCE TOKENIZER EXAMPLE #####
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
custom_some_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_some_tokenizer.tokenize(sample_text)

def proce_content():
    try:
        for w in tokenized:
            words = nltk.word_tokenize(w)
            # (the remainder of this function is not shown in the excerpt)
Example 9: __init__
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem_word [as alias]
import csv
import json
import sys

from sklearn.ensemble import RandomForestClassifier

class TitleSim:
    def __init__(self, features_conf, features_deleted):
        print 'Start initialization'
        # initial model training
        features = features_deleted + features_conf
        target = [0 for x in range(len(features_deleted))] + \
                 [1 for x in range(len(features_conf))]
        self.classifier = RandomForestClassifier(n_estimators=50,
                                                 verbose=2,
                                                 n_jobs=1,
                                                 min_samples_split=10,
                                                 random_state=1)
        self.classifier.fit(features, target)
        # loading relational data which will be used
        paths = json.loads(open("SETTINGS.json").read())
        paper_doc = paths["paper_doc"]
        self.paper = dict([(entry[0], entry[1]) for entry in csv.reader(open(paper_doc))])
        # loading setting file
        self.paths = json.loads(open("SETTINGS.json").read())
        # loading word map of titles
        self.wordmap = self.load_titlemap()
        # do other initializations
        self.stemmer = PorterStemmer()
        print 'End initialization'

    def label_predict(self, fea_dict):
        # fea_dict is a dictionary whose key is 'user id'
        prob_dict = {}
        for key in fea_dict:
            features = [feature[1:] for feature in fea_dict[key]]
            predictions = self.classifier.predict_proba(features)[:, 1]
            prob_dict[key] = [(item[0], prob) for item, prob in zip(fea_dict[key], predictions)]
        return prob_dict

    def load_titlemap(self):
        return dict([(entry[0], entry[1]) for entry in
                     csv.reader(open(self.paths["title_wordmap"]))])

    def calsim(self, author_doc, pairs):
        # calculate the similarity between titles
        title_features = []
        for pair in pairs:
            if pair[0] not in author_doc:
                print 'Key error.'
                sys.exit(1)
            title_features.append(self.calpairsim(author_doc[pair[0]], pair[1]))
        return title_features

    def calpairsim(self, doclist, target_doc):
        author_words = {}
        for doc in doclist:
            words = self.paper[doc].lower().split(' ')
            for word in words:
                stemmed_word = self.stemmer.stem_word(word)
                if stemmed_word in self.wordmap:
                    if stemmed_word in author_words:
                        author_words[stemmed_word] += 1
                    else:
                        author_words[stemmed_word] = 1
        doc_words = {}
        words = self.paper[target_doc].lower().split(' ')
        for word in words:
            stemmed_word = self.stemmer.stem_word(word)
            if stemmed_word in self.wordmap:
                if stemmed_word in doc_words:
                    doc_words[stemmed_word] += 1
                else:
                    doc_words[stemmed_word] = 1
        # number of common words
        comm_num = len(set(author_words.keys()) & set(doc_words.keys()))
        # pearson coefficient
        if (len(set(author_words.keys())) + len(set(doc_words.keys()))) != 0:
            pearson = comm_num * 1.0 / (len(set(author_words.keys())) + len(set(doc_words.keys())))
        else:
            pearson = 0.0
        return [comm_num, pearson]