This article collects typical usage examples of the Python method nltk.stem.porter.PorterStemmer.stem. If you have been wondering what PorterStemmer.stem does, how to call it, or what real-world uses look like, the curated examples below should help. You may also want to read more about the class it belongs to, nltk.stem.porter.PorterStemmer.
The following 15 code examples of PorterStemmer.stem are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
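Before the examples, here is a minimal sketch of the method itself; the words are illustrative and the printed stems are typical Porter outputs, not guaranteed for every NLTK version:

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
print(stemmer.stem("running"))   # usually 'run'
print([stemmer.stem(w) for w in ["ponies", "caresses", "relational"]])
# usually ['poni', 'caress', 'relat']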
Example 1: parse_questions
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Alternatively: from nltk.stem.porter.PorterStemmer import stem [as alias]
def parse_questions(self):
    stemmer = PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    for questions_key in self.rawSamples:
        # Stem the question text
        question_text = self.rawSamples[questions_key][0]
        words_array = tokenizer.tokenize(question_text)
        question_text = ""
        for word in words_array:
            if word.isnumeric():
                continue
            # 'text' is presumably sklearn.feature_extraction.text, which provides ENGLISH_STOP_WORDS
            if word not in text.ENGLISH_STOP_WORDS:
                word = stemmer.stem(word)
                question_text += (word + " ")
        self.rawSamples[questions_key][0] = question_text
        # Stem the topic names
        topics_text = self.rawSamples[questions_key][2]
        words_array = tokenizer.tokenize(topics_text)
        topics_text = ""
        for word in words_array:
            if word.isnumeric():
                continue
            if word not in text.ENGLISH_STOP_WORDS:
                word = stemmer.stem(word)
                topics_text += (word + " ")
        self.rawSamples[questions_key][2] = topics_text
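The same filter-and-stem pattern works on any plain string; a small standalone sketch (assuming, as above, that text refers to sklearn.feature_extraction.text):

from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction import text

tokenizer = RegexpTokenizer(r'\w+')
stemmer = PorterStemmer()
sentence = "learning 2 new programming languages this year"
cleaned = " ".join(
    stemmer.stem(w)
    for w in tokenizer.tokenize(sentence)
    if not w.isnumeric() and w not in text.ENGLISH_STOP_WORDS
)
print(cleaned)  # digits and stop words dropped, remaining tokens stemmed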
Example 2: extract_entities
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Alternatively: from nltk.stem.porter.PorterStemmer import stem [as alias]
def extract_entities(doc):
    print 'extracting entities from %s...' % doc.getFilename()
    nps = list(set([re.sub(' \.', '', re.sub(' -[A-Z]{3}-', '', np).lower()) for np in doc.getAllNodesOfType('NP')]))
    p = PorterStemmer()
    entities = []
    for np in nps:
        try:
            response = json.loads(requests.get(host+'select', params={'q': 'wam:[50 TO 100] AND iscontent:true AND lang:en AND (title_en:"%s" OR redirect_titles_mv_en:"%s")' % (np, np), 'fl': 'title_en,redirect_titles_mv_en', 'wt': 'json'}).content)
        except requests.exceptions.ConnectionError:
            while True:
                time.sleep(15)
                print 'retrying connection...'
                try:
                    response = json.loads(requests.get(host+'select', params={'q': 'wam:[50 TO 100] AND iscontent:true AND lang:en AND (title_en:"%s" OR redirect_titles_mv_en:"%s")' % (np, np), 'fl': 'title_en,redirect_titles_mv_en', 'wt': 'json'}).content)
                    break
                except requests.exceptions.ConnectionError:
                    continue
        docs = response[u'response'][u'docs']
        if len(docs) > 0:
            titles = [docs[0][u'title_en']] + docs[0].get(u'redirect_titles_mv_en', [])
        else:
            titles = []
        if len(titles) > 0:
            titles = [' '.join([p.stem(w.lower()) for w in t.split(' ')]) for t in titles]
            stem_np = ' '.join([p.stem(w) for w in np.split(' ')])
            for title in titles:
                if stem_np == title:
                    entities.append(np)
                    print np
                    break
    #print doc.getFilename(), entities
    return (doc.getFilename(), entities)
Example 3: AddTopicUnigram
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Alternatively: from nltk.stem.porter.PorterStemmer import stem [as alias]
def AddTopicUnigram(self, feaName, comName, data=None):
    # need mapping first
    if data is None:
        data = self._data
    for i in range(len(data)):
        t_bigram = self.getEssayCollocation(data, i)
        t_uni = list()
        for (a, b) in t_bigram:
            t_uni.append(a)
            t_uni.append(b)
        t_uni = set(t_uni)
        comment = data[i][comName]
        tokens = nltk.wordpunct_tokenize(comment)
        tokens = [word.lower() for word in tokens]
        # stemming
        if self._stemoption == True:
            st = PorterStemmer()
            tokens = [st.stem(t) for t in tokens]
            t_uni = set([st.stem(t) for t in list(t_uni)])
        shared = [w for w in tokens if w in t_uni]
        # normalized
        data[i][feaName] = float(len(shared))/(len(tokens)+0.00001)
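The normalised feature is simply the fraction of comment tokens that also occur among the topic unigrams, with a small constant added to avoid division by zero: for instance, 3 shared tokens in a 20-token comment give 3 / (20 + 0.00001) ≈ 0.15.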
Example 4: compare_english_simple
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Alternatively: from nltk.stem.porter.PorterStemmer import stem [as alias]
def compare_english_simple(article_title):
    """Given the title of an article, returns the number of tokens, types, and stems
    in both the English version and the Simple English version."""
    english = extract_wikipedia_page(article_title, "en")
    simple = extract_wikipedia_page(article_title, "simple")
    num_tokens_english = len(english)
    num_tokens_simple = len(simple)
    types_english = count_words(get_words(english))
    types_simple = count_words(get_words(simple))
    porter_stemmer = PorterStemmer()
    stem_english = defaultdict(int)
    stem_simple = defaultdict(int)
    for key in types_english.keys():
        stem_english[porter_stemmer.stem(key)] += 1
    for key in types_simple.keys():
        stem_simple[porter_stemmer.stem(key)] += 1
    print("Number of Tokens in English " + article_title + ": %d" % num_tokens_english)
    print("Number of Tokens in Simple English " + article_title + ": %d" % num_tokens_simple)
    print("Number of Types in English " + article_title + ": %d" % len(types_english))
    print("Number of Types in Simple English " + article_title + ": %d" % len(types_simple))
    print("Number of Stems in English " + article_title + ": %d" % len(stem_english))
    print("Number of Stems in Simple English " + article_title + ": %d" % len(stem_simple))
Example 5: getDomainUnigram
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Alternatively: from nltk.stem.porter.PorterStemmer import stem [as alias]
def getDomainUnigram(self, directory=None):
    collocations = set()   # collocation items
    ewordlists = list()    # list of lists of words
    # extract words from essays
    if directory is not None:
        doclist = os.listdir(directory)
        for essay in doclist:
            dir_essay = directory + '/' + essay
            etext = open(dir_essay, 'r').read()
            tokens = nltk.wordpunct_tokenize(etext)
            tokens = [word.lower() for word in tokens]
            # stemming
            if self._stemoption == True:
                st = PorterStemmer()
                tokens = [st.stem(t) for t in tokens]
            # extract the collocations for the given essay
            e_bigram = set(Mytext(tokens).collocations())
            collocations = collocations | e_bigram
            ewordlists.append(tokens)
    else:  # use the mapped essays to calculate the candidate bigrams
        # need to call the mapessay function first
        for ins in self._data:
            if ins['essay'] is not None:
                etext = open(ins['essay'], 'r').read()
                tokens = nltk.wordpunct_tokenize(etext)
                tokens = [word.lower() for word in tokens]
                # stemming
                if self._stemoption == True:
                    st = PorterStemmer()
                    tokens = [st.stem(t) for t in tokens]
                # extract the collocations for the given essay
                e_bigram = set(Mytext(tokens).collocations())
                collocations = collocations | e_bigram
                ewordlists.append(tokens)
    # get the collection of all essays under the specified directory / associated essays
    collection_text = TextCollection(ewordlists)
    itemlist = list()
    for (a, b) in collocations:
        itemlist.append(a)
        itemlist.append(b)
    itemlist = list(set(itemlist))
    word_idf = []
    for i in range(len(itemlist)):
        word_idf.append((collection_text.idf(itemlist[i]), itemlist[i]))
    word_idf = sorted(word_idf, key=operator.itemgetter(0))
    ave = 0
    if len(word_idf) != 0:
        ave = sum(map(operator.itemgetter(0), word_idf)) / len(word_idf)
    wlist = [j for (i, j) in word_idf if i < ave]
    return wlist
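The selection rule at the end keeps only the words whose IDF is below the average IDF, i.e. the terms that occur in comparatively many essays. As a small worked example: if three collocation words have IDF values 0.5, 1.2 and 2.0, the average is about 1.23, so only the words with IDF 0.5 and 1.2 survive into wlist.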
Example 6: __init__
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Alternatively: from nltk.stem.porter.PorterStemmer import stem [as alias]
class PostProcessor:
    def __init__(self):
        """Loads in Ed and Olivier's domainRules.json file, now converted to a big (7k+ entry) dict object"""
        # import domainRules.json
        from domain_rules import domain_rules
        from tldextract.tldextract import extract
        self.extract = extract
        from nltk.stem.porter import PorterStemmer as PorterStemmer
        self.domain_rules = domain_rules
        # create stemmer
        self.Stemmer = PorterStemmer()

    def rerank(self, url, text, results):
        """Processes classified results"""
        # check whether the domain exists in domain_rules
        domain = self.extract(url)
        domain = domain.domain + "." + domain.suffix
        print "Extracted domain: {0}".format(domain)
        if domain in self.domain_rules:
            print "found domain"
            if "__ANY" in self.domain_rules[domain]:
                categories = self.domain_rules[domain]['__ANY']
                for cat in categories:
                    # stem it
                    matchers = [self.Stemmer.stem(cat)]
                    if "-" in matchers[0]:
                        matchers.append(matchers[0].replace("-", "_"))
                    for matcher in matchers:
                        for x in range(len(results)):
                            print "comparing {0} to {1}".format(matcher, results[x][0])
                            if matcher.lower() in results[x][0].lower():
                                print "{0} with score {1} contains {2}".format(results[x][0], results[x][1], matcher)
                                results[x][1] = results[x][1] + 1
                                print "score is now {0}".format(results[x][1])
        else:
            print "augmenting common words"
            # check for common words
            words = defaultdict(int)
            for result in results:
                tokens = re.findall("[a-z]+", result[0].lower())
                for token in tokens:
                    words[token] += 1
            # skip single entries
            for k, v in words.iteritems():
                if v > 1:
                    for x in range(len(results)):
                        matchers = [self.Stemmer.stem(k)]
                        if "-" in matchers[0]:
                            matchers.append(matchers[0].replace("-", "_"))
                        for matcher in matchers:
                            if matcher.lower() in results[x][0].lower():
                                print "{0} with score {1} contains {2} which has score {3}".format(results[x][0], results[x][1], matcher, v)
                                results[x][1] = results[x][1] + v
                                print "score is now {0}".format(results[x][1])
        return sorted(results, key=lambda x: x[1], reverse=True)
Example 7: extractFeatures
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Alternatively: from nltk.stem.porter.PorterStemmer import stem [as alias]
def extractFeatures(dataSet):
    vector1, vector2 = list(), list()
    stemmer = PorterStemmer()
    # Produces a list of all unique word stems in the titles in the dataset
    wordBag = list({stemmer.stem(word) for entry in dataSet for word in entry[2].strip().split(" ") if word not in stopwords.words('english')})
    for entry in dataSet:
        genre, isbn, title, authors = entry[0], entry[1].strip(), entry[2].strip(), entry[3].strip()
        wordList, authorList = [word for word in title.split(" ")], [author.strip() for author in authors.split(";")]
        sortedWords = sorted(wordList, key=lambda x: -1*len(x))
        nonStopWords = [word for word in sortedWords if word not in stopwords.words('english')]
        stemmedWords = [stemmer.stem(word) for word in nonStopWords]
        # Quantitative data about the title
        shortestWord = len(nonStopWords[-1])
        longestWord = len(nonStopWords[0])
        meanWord = sum([len(word) for word in nonStopWords])/len(nonStopWords)
        wordSD = (sum([(len(word)-meanWord)**2 for word in nonStopWords])/len(nonStopWords))**.5
        vector1.append([(len(authorList), len(wordList), longestWord, shortestWord, meanWord, wordSD), genre])
        # Creates a vector storing whether each word in the dataset occurred in the title
        occurrences = tuple(1 if word in stemmedWords else 0 for word in wordBag)
        vector2.append([occurrences, genre])
    return (vector1, vector2)
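The second feature vector is a plain bag-of-words membership vector over the stemmed title words; an illustrative sketch with made-up stems:

# Illustrative only: how each entry of vector2 is built
wordBag = ["dragon", "histori", "love"]   # stems collected from all titles in the dataset
stemmedWords = ["histori", "rome"]        # stems of one particular title
occurrences = tuple(1 if word in stemmedWords else 0 for word in wordBag)
print(occurrences)  # (0, 1, 0)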
Example 8: search
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Alternatively: from nltk.stem.porter.PorterStemmer import stem [as alias]
def search(ngrams, index, path, counts, id):
    print 'Searching {}'.format(path.split('/')[-1])
    # If the 'Graph!' button was hit with nothing in the box
    if ngrams == '':
        return None
    if len(ngrams) > 1:
        ngrams = ngrams.replace(', ', ',').encode('utf-8').lower().split(',')
    else:
        ngrams = ngrams.encode('utf-8').lower()
    ngram_count = {ngram: defaultdict(int) for ngram in ngrams}
    stemmer = PorterStemmer()
    for ngram in ngrams:
        transcripts = list()
        for word in ngram.split():
            # Get the stem of the word
            word = stemmer.stem(word)
            try:
                # Get the set of books the word appears in
                transcripts.append(set([posting[0] for posting in index[word]]))
            except:
                # The word is not in the index
                pass
        # Get the set of transcripts in which all words in the ngram appear
        transcripts = set.intersection(*transcripts) if len(transcripts) > 0 else set()
        for transcript in transcripts:
            year = int(transcript.split('-')[1])
            month = int(transcript.split('-')[2])
            day = int(transcript.split('-')[3])
            date = datetime(year, month, day)
            locs = []
            # For each transcript, get all of the locations where the words in the ngram appear
            for word in ngram.split():
                word = stemmer.stem(word)
                locs.extend([posting[1] for posting in index[word] if posting[0] == transcript])
            # Check if the words are next to each other
            # e.g. ngram = 'very high profit margin' and the positions of the words are [[2,10], [3], [4,8,12,29], [5]]
            # This line of code shifts the position of each word over by its distance from the
            # beginning of the ngram to produce new positions [[2,10], [2], [2,6,10,27], [2]]
            # Then I take the intersection of these positions -- if it's not empty,
            # the ngram appears in the transcript
            locs = [set([int(pos) - i for pos in loc]) for i, loc in enumerate(locs)]
            ngram_count[ngram][date] += len(set.intersection(*locs))
    counts[id] = ngram_count
    print 'Finished searching {}'.format(path.split('/')[-1])
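The shift-and-intersect trick described in the comment above can be checked in isolation; this standalone sketch uses made-up postings, not data from search():

# Hypothetical positions of the four words of 'very high profit margin' in one transcript
locs = [[2, 10], [3], [4, 8, 12, 29], [5]]
# Shift each word's positions back by its offset within the ngram
shifted = [set(int(pos) - i for pos in loc) for i, loc in enumerate(locs)]
print(shifted)                           # [{2, 10}, {2}, {2, 6, 10, 27}, {2}]
print(set.intersection(*shifted))        # {2}: the full ngram starts at token position 2
print(len(set.intersection(*shifted)))   # 1 occurrence in this transcript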
Example 9: stem
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Alternatively: from nltk.stem.porter.PorterStemmer import stem [as alias]
def stem(ts):
    global stemmer
    if stemmer is None:
        stemmer = PorterStemmer()
    if type(ts) is list:
        return [stemmer.stem(x) for x in ts]
    else:
        return stemmer.stem(ts)
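A quick usage sketch of this helper; note that it relies on a module-level stemmer variable being defined before the first call:

stemmer = None  # module-level cache used by stem()

print(stem("stemming"))                        # a single string -> a single stem
print(stem(["stemming", "stemmed", "stems"]))  # a list -> a list of stems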
Example 10: get_bleu_similarity
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Alternatively: from nltk.stem.porter.PorterStemmer import stem [as alias]
def get_bleu_similarity(reference_answers, student_answer):
    porter_stemmer = PorterStemmer()
    reference_answers_tokens = []
    for answer in reference_answers:
        reference_answers_tokens.append(map(lambda x: str(porter_stemmer.stem(x)), answer.split()))
    student_answer = map(lambda x: str(porter_stemmer.stem(x)), student_answer.split())
    weights = [0.25, 0.25]
    return bleu(student_answer, reference_answers_tokens, weights)
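A possible call, assuming bleu is bound to an NLTK BLEU scorer that accepts (hypothesis, references, weights) in the order used above:

references = ["the cat sat on the mat", "a cat was sitting on the mat"]
student = "the cats sit on the mats"
score = get_bleu_similarity(references, student)
print(score)  # a float in [0, 1]; stemming lets 'cats'/'cat' and 'mats'/'mat' count as matches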
Example 11: PropertyFinder
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Alternatively: from nltk.stem.porter.PorterStemmer import stem [as alias]
class PropertyFinder(object):
    def __init__(self):
        self._stemmer = PorterStemmer()

    def __get_property_string_forms(self, property_subtree):
        words = stopwords.words('english')
        property_string_forms = set()
        property_string_forms.add((' '.join(property_subtree.leaves())).lower())
        property_string_forms.add((' '.join([self._stemmer.stem(word) for word in property_subtree.leaves()])).lower())
        property_string_forms.add((' '.join([word for word in property_subtree.leaves() if word not in words])).lower())
        property_string_forms.add((' '.join([self._stemmer.stem(word) for word in property_subtree.leaves() if word not in words])).lower())
        return property_string_forms

    def __fetch_from_wikibase(self, property_string):
        labels = DataBase().search_properties_name(property_string)
        if labels is None:
            return []
        return [label.lower() for label in labels]

    def __fetch_synonyms_and_hypernyms(self, property_string):
        words = set()
        synsets = wordnet.synsets(property_string)
        for synset in synsets:
            words.update([lemma.replace('_', ' ').lower() for lemma in synset.lemma_names()])
            for hypernym in synset.hypernyms():
                words.update([lemma.replace('_', ' ').lower() for lemma in hypernym.lemma_names()])
        return words

    def find_candidates(self, property_subtree):
        if not isinstance(property_subtree, ParentedTree):
            raise AttributeError
        candidates = set(self.__get_property_string_forms(property_subtree))
        new_candidates = set()
        for candidate in candidates:
            for label in self.__fetch_from_wikibase(candidate):
                new_candidates.add(label)
        candidates.update(new_candidates)
        new_candidates = set()
        for candidate in candidates:
            new_candidates.update(self.__fetch_synonyms_and_hypernyms(candidate))
        candidates.update(new_candidates)
        new_candidates = set()
        for candidate in candidates:
            for POS in [wordnet.ADJ, wordnet.ADV, wordnet.NOUN, wordnet.VERB]:
                morphy = wordnet.morphy(candidate, POS)
                if morphy is not None:
                    new_candidates.add(morphy)
        candidates.update(new_candidates)
        return candidates
Example 12: __weight_tokens
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Alternatively: from nltk.stem.porter.PorterStemmer import stem [as alias]
def __weight_tokens(self, mid, nps, sentences, sent_id):
    st = PorterStemmer()
    sent_target = sentences[sent_id]
    token_id = [idx for idx, token in enumerate(sent_target.strip().split(" ")) if mid in token][0]
    sent_lengths = [len(s.split(" ")) for s in sentences]
    nps_base = {np: " ".join(st.stem(token) for token in np.split(" ")) for np in nps}
    nps_proc = {}
    for sent_idx, sent in enumerate(sentences):
        sent_stem = " ".join(st.stem(token) for token in sent.split(" "))
        for np_ori, np in nps_base.iteritems():
            if np_ori not in nps_proc:
                nps_proc[np_ori] = {}
            if "dist_sent" not in nps_proc[np_ori] or abs(sent_idx - sent_id) < nps_proc[np_ori]["dist_sent"]:
                # always update the info
                if np not in sent_stem:
                    continue
                np_idx = sent_stem.rindex(np)
                np_token_idx = len(sent_target[:np_idx].strip().split(" "))
                dist_start = len(sent_stem[:np_idx].strip().split(" "))
                dist_end = len(sent_stem[np_idx+len(np):].strip().split(" "))
                dist_sent = abs(sent_idx - sent_id)
                dist_token = -1
                if dist_sent == 0:
                    if mid in np_ori:
                        dist_token = 0
                    elif np_token_idx < token_id:
                        dist_token = token_id - np_token_idx - (len(np.split(" ")) - 1) - 1
                    elif np_token_idx > token_id:
                        dist_token = np_token_idx - token_id - 1
                elif sent_idx < sent_id:
                    dist_token = dist_end + sum(sent_lengths[sent_idx+1:sent_id]) + token_id
                elif sent_idx > sent_id:
                    dist_token = (len(sent_target.strip().split(" ")) - 1 - token_id) + sum(sent_lengths[sent_id+1:sent_idx]) + dist_start
                nps_proc[np_ori]["dist_sent"] = dist_sent
                nps_proc[np_ori]["dist_token"] = dist_token
            np_count = sent_stem.count(np)
            nps_proc[np_ori]["tf"] = (nps_proc[np_ori].get("tf") or 0) + np_count
    nps_weight = {}
    for np, vals in nps_proc.iteritems():
        term1 = self.__alpha * self.__gaussian_weight(vals["dist_token"], self.__var_d)
        term2 = self.__beta * self.__gaussian_weight(vals["dist_sent"], self.__var_s)
        term3 = self.__gamma * vals["tf"]
        nps_weight[np] = (term1 + term2 + term3) / (self.__alpha + self.__beta + self.__gamma)
    return nps_weight
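The helpers __gaussian_weight, __alpha, __beta, __gamma, __var_d and __var_s belong to the surrounding class and are not shown in this excerpt. A plausible stand-in for the kernel, assuming an unnormalised Gaussian decay over the distance, would be:

import math

def gaussian_weight(dist, var):
    # Hypothetical equivalent of self.__gaussian_weight: 1.0 at distance 0, decaying as |dist| grows
    return math.exp(-(dist ** 2) / (2.0 * var))

Each candidate noun phrase is then scored as a weighted average of token distance, sentence distance and term frequency: (alpha * G(dist_token, var_d) + beta * G(dist_sent, var_s) + gamma * tf) / (alpha + beta + gamma).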
Example 13: preProcessing
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Alternatively: from nltk.stem.porter.PorterStemmer import stem [as alias]
def preProcessing(self, raw, fileName):
    cachedStopWords = stopwords.words("english")
    stemmer = PorterStemmer()
    text = ' '.join([word for word in raw.split() if word not in cachedStopWords])
    tokens = nltk.word_tokenize(text.lower())
    stemmed = []
    directory = os.getcwd() + "/pre-process/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    test = open(directory + re.sub('\.htm$', '', fileName) + ".txt", "w")
    for item in tokens:
        stemmed.append(stemmer.stem(item))
        test.write(stemmer.stem(item) + ' ')
    test.close()
    return stemmed
Example 14: main
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Alternatively: from nltk.stem.porter.PorterStemmer import stem [as alias]
def main():
    rake = RAKE.Rake('SmartStoplist.txt')
    fp = open(input_file, 'r')
    text = fp.read()
    text = text_clean(text)
    """wnl = WordNetLemmatizer()
    text = ' '.join([wnl.lemmatize(i.strip()) for i in nltk.word_tokenize(text)])"""
    porter_stemmer = PorterStemmer()
    text = ' '.join([porter_stemmer.stem(i.strip()) for i in nltk.word_tokenize(text)])
    keywords = rake.run(text)
    # print keywords
    with open(key_score_file, 'wb') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['KEYWORD', 'SCORE'])
        for row in keywords:
            if row[1] > 0:
                csv_out.writerow(row)
    unibitrigram_list = []
    unibitrigram_list = generate_unibitrigrams(key_score_file)
    # print unibitrigram_list
    # ngram_freq = []
    ngram_freq = Counter(unibitrigram_list)
    sorted_ngram_freq = sorted(ngram_freq.items(), key=lambda x: x[1], reverse=True)
    print ngram_freq
    with open('bcom_ngramfr_stem.csv', 'wb') as nf_csv:
        csv_wr = csv.writer(nf_csv)
        for item in sorted_ngram_freq:
            if item[0] != '':
                csv_wr.writerow(item)
Example 15: StemmerTokenizer
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Alternatively: from nltk.stem.porter.PorterStemmer import stem [as alias]
class StemmerTokenizer(object):
    def __init__(self):
        self.stemmer = PorterStemmer()
    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]
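A callable class like this is typically passed to a scikit-learn vectorizer as its tokenizer; a minimal sketch, assuming scikit-learn is installed and NLTK's punkt data is available for word_tokenize:

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=StemmerTokenizer())
X = vectorizer.fit_transform(["The cats are running", "A cat runs"])
print(sorted(vectorizer.vocabulary_))  # stemmed features: 'cat' and 'run' each become a single column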