This article collects typical usage examples of the Python class nltk.stem.porter.PorterStemmer. If you have been wondering what the PorterStemmer class is for, how to use it, or what real code that uses it looks like, the curated examples below should help.
A total of 15 code examples of the PorterStemmer class are shown, ordered by popularity by default.
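Before the examples, here is a minimal sketch of the basic API (not taken from any example below): create a PorterStemmer instance and call its stem() method on individual tokens. The sample tokens are purely illustrative.

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
for token in ["running", "flies", "happily"]:
    # stem() maps a single word to its stem, e.g. "running" -> "run", "flies" -> "fli"
    print(stemmer.stem(token))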
Example 1: AddTopicUnigram
def AddTopicUnigram(self, feaName, comName, data=None):
    # need mapping first
    if data is None:
        data = self._data

    for i in range(len(data)):
        t_bigram = self.getEssayCollocation(data, i)
        t_uni = list()
        for (a, b) in t_bigram:
            t_uni.append(a)
            t_uni.append(b)
        t_uni = set(t_uni)

        comment = data[i][comName]
        tokens = nltk.wordpunct_tokenize(comment)
        tokens = [word.lower() for word in tokens]

        # stemming
        if self._stemoption == True:
            st = PorterStemmer()
            tokens = [st.stem(t) for t in tokens]
            t_uni = set([st.stem(t) for t in list(t_uni)])

        shared = [w for w in tokens if w in t_uni]

        # normalized
        data[i][feaName] = float(len(shared)) / (len(tokens) + 0.00001)
Example 2: lda
def lda(data):
    data = get_only_text(data)
    only_tweet = data

    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = PorterStemmer()

    length = min(20, len(only_tweet))
    total_texts = []
    for i in xrange(0, length):
        print only_tweet[i]
        print

        to_lower = only_tweet[i].lower()
        tokens = tokenizer.tokenize(to_lower)
        stopped_tokens = [k for k in tokens if not k in en_stop]
        texts = [p_stemmer.stem(k) for k in stopped_tokens]
        total_texts.append(texts)

    dictionary = corpora.Dictionary(total_texts)
    corpus = [dictionary.doc2bow(text) for text in total_texts]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20)
    result = ldamodel.print_topics(num_topics=2, num_words=1)
    for i in result:
        print i
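Example 2 relies on several names defined outside the snippet. Assuming the usual packages (gensim, NLTK, and the stop-words package on PyPI), the imports would look roughly like this; get_only_text is a project-specific helper and is not shown here:

import gensim                               # gensim.models.ldamodel.LdaModel
from gensim import corpora                  # corpora.Dictionary / doc2bow
from nltk.tokenize import RegexpTokenizer   # regex-based tokenizer used above
from nltk.stem.porter import PorterStemmer  # the stemmer this page is about
from stop_words import get_stop_words       # English stop-word list ('en')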
Example 3: cleanData
def cleanData(doc_list):
    # tokenize
    tokens = []
    for doc in doc_list:
        text_l = []
        ws_split = re.split(split_on, doc)
        for w in ws_split:
            # remove URLs and empty strings
            if not (url_pat.match(w) or w == u''):
                text_l.append(w)

        # rejoin text and 'properly' tokenize
        text = " ".join(text_l)
        text_l = nltk.word_tokenize(text)

        # stop words
        text_l = [w.lower() for w in text_l if w.lower() not in stops]

        # stemming
        p_stemmer = PorterStemmer()
        text_l = [p_stemmer.stem(t) for t in text_l]

        # append cleaned text to list
        tokens.append(text_l)
    return tokens
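cleanData refers to three module-level names that the snippet does not define: split_on, url_pat, and stops. A plausible sketch of those globals, assuming NLTK's stop-word corpus (these are guesses, not the original author's definitions):

import re
from nltk.corpus import stopwords

# Assumed globals for cleanData (hypothetical; not from the original source)
split_on = r'\s+'                               # split raw documents on whitespace
url_pat = re.compile(r'https?://\S+|www\.\S+')  # URL matcher used to drop links
stops = set(stopwords.words('english'))         # NLTK English stop-word set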
Example 4: get_stemmed_separate
def get_stemmed_separate(indeed_reviews_db, glassdoor_reviews_db):
    separate = get_separate_reviews(indeed_reviews_db, glassdoor_reviews_db)
    stemmer = PorterStemmer()
    stemmed_reviews = []
    for review in separate:
        stemmed_reviews.append(' '.join([stemmer.stem(word)
                                         for sent in sent_tokenize(review)
                                         for word in word_tokenize(sent.lower())]))
    return stemmed_reviews
Example 5: main
def main():
    rake = RAKE.Rake('SmartStoplist.txt')
    fp = open(input_file, 'r')
    text = fp.read()
    text = text_clean(text)

    """wnl=WordNetLemmatizer()
    text=' '.join([wnl.lemmatize(i.strip()) for i in nltk.word_tokenize(text)])"""
    porter_stemmer = PorterStemmer()
    text = ' '.join([porter_stemmer.stem(i.strip()) for i in nltk.word_tokenize(text)])

    keywords = rake.run(text)
    # print keywords
    with open(key_score_file, 'wb') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['KEYWORD', 'SCORE'])
        for row in keywords:
            if row[1] > 0:
                csv_out.writerow(row)

    unibitrigram_list = generate_unibitrigrams(key_score_file)
    # print unibitrigram_list

    ngram_freq = Counter(unibitrigram_list)
    sorted_ngram_freq = sorted(ngram_freq.items(), key=lambda x: x[1], reverse=True)
    print ngram_freq

    with open('bcom_ngramfr_stem.csv', 'wb') as nf_csv:
        csv_wr = csv.writer(nf_csv)
        for item in sorted_ngram_freq:
            if item[0] != '':
                csv_wr.writerow(item)
Example 6: parse_questions
def parse_questions(self):
    stemmer = PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')

    for questions_key in self.rawSamples:
        # Stem the question text
        question_text = self.rawSamples[questions_key][0]
        words_array = tokenizer.tokenize(question_text)
        question_text = ""
        for word in words_array:
            if word.isnumeric():
                continue
            if word not in text.ENGLISH_STOP_WORDS:
                word = stemmer.stem(word)
                question_text += (word + " ")
        self.rawSamples[questions_key][0] = question_text

        # Stem the topic names
        topics_text = self.rawSamples[questions_key][2]
        words_array = tokenizer.tokenize(topics_text)
        topics_text = ""
        for word in words_array:
            if word.isnumeric():
                continue
            if word not in text.ENGLISH_STOP_WORDS:
                word = stemmer.stem(word)
                topics_text += (word + " ")
        self.rawSamples[questions_key][2] = topics_text
Example 7: evaluate
def evaluate(query):
    global DICTIONARY
    word_score = {}
    seek_pos = open(postings_file, 'r')
    seek_pos.seek(0, 0)

    words = query.split()
    stemmer = PorterStemmer()
    words = [element.lower() for element in words]

    for item in words:
        word = stemmer.stem(item)
        if word not in word_score:
            if word in DICTIONARY:
                seek_pointer = DICTIONARY[word]
                seek_pos.seek(int(seek_pointer))
                line = seek_pos.readline()
                seek_pos.seek(0, 0)
                post_list = line.split()
                score = score_documents(post_list)
                word_score[word] = score
            else:
                # not encountered, score of 0
                word_score[word] = []
        # else duplicate, skip word

    result = score_query(word_score)
    return result
Example 8: issue_analysis
def issue_analysis(df):
    df_sub = df[['Issue']]
    df_sub.insert(0, 'count', 1)

    Issue_List = []
    for i in range(0, 50):
        Issue_List.append(df_sub.groupby(['Issue']).sum().sort_index(by='count', ascending=False).ix[i].name)

    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')  # set tokenizer regex
    en_stop = get_stop_words('en')                  # create English stop words list
    p_stemmer = PorterStemmer()                     # create p_stemmer of class PorterStemmer
    texts = []                                      # list for tokenized documents in loop
    text_view = ''

    # loop through document list
    for i in Issue_List:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)

        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]

        # stem tokens and add them to list
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)

        #print ' '.join(stemmed_tokens)
        text_view += ' '.join(stemmed_tokens)
        text_view += ' '

    wordcloud = WordCloud().generate(text_view)
    fig = plt.figure(figsize=(8, 6))
    fig1 = fig.add_subplot(1, 1, 1)
    fig1.set_title("Top issued words", fontdict={'fontsize': 25})
    fig1.imshow(wordcloud)
    fig1.axis("off")
    #plt.savefig('ComplainCount_WC.png')
    plt.savefig('ComplainCount_WC_2016.png')

    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word=dictionary)
    LDAText = ldamodel.print_topics(num_topics=5, num_words=3)
    #print "\n Topic analysis result for top 25 issues with LDA"
    #print(LDAText)

    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    #pyLDAvis.show(vis_data)
    #pyLDAvis.save_html(vis_data, "issue_lda.html")
    #pyLDAvis.save_json(vis_data, "issue_lda.json")
    pyLDAvis.save_html(vis_data, "issue_lda_2016.html")
    pyLDAvis.save_json(vis_data, "issue_lda_2016.json")

    return 0
Example 9: destem
def destem(self, stemmed_term, corpus):
    '''
    Given a stemmed term, we look through the text of every document
    in corpus, determine the most common "parent" version of the
    given stemmed term, and return it.
    '''
    destemmed_term = ""
    min_num_terms = 5000
    min_percentage = 0.20
    candidates = {}
    stemmer = PorterStemmer()
    num_terms_checked = 0
    num_docs_checked = 0
    total_matches = 0

    for doc in corpus:
        # matches is the list of all terms in the current text that are
        # "ancestor" versions of the stemmed term.
        matches = ([term for term in doc.split_text
                    if stemmer.stem(term) == stemmed_term])
        num_terms_checked += len(doc.split_text)
        num_docs_checked += 1
        total_matches += len(matches)
        if not matches:
            continue

        # we keep a tally of the number of times each "ancestor"
        # appears in our text
        for match in matches:
            if match in candidates:
                candidates[match] += 1
            else:
                candidates[match] = 1

        # sort potential destemmed versions in descending order
        # by frequency
        sorted_candidates = sorted(candidates.keys(),
                                   key=lambda term: candidates[term],
                                   reverse=True)

        if num_docs_checked == self.num_corpus_docs:
            # we've run through every doc, so the most frequent
            # ancestor of the stemmed term is the best destemmed
            # result.
            destemmed_term = sorted_candidates[0]
            break

        # if we've reviewed enough total words, we can start trying
        # to find a suitable destemmed term from what we have so far
        if min_num_terms <= num_terms_checked:
            # this is the most frequent ancestor of the stemmed term
            possible_match = sorted_candidates[0]
            test_percentage = candidates[possible_match] \
                / float(total_matches)

            # if the potential destemmed version accounts for a
            # sufficient percentage of the total matches, we can
            # decide that it's a suitable destemmed result.
            if min_percentage <= test_percentage:
                destemmed_term = possible_match
                break

    print("Destemmed: {0} --> {1}".format(stemmed_term, destemmed_term))
    return destemmed_term
Example 10: processing
def processing(raw_review):
    word1 = []

    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text()

    # 2. Remove punctuation
    letters_only = remove_punctuations(review_text)

    # 3. Convert to lower case, split into individual words
    for words in letters_only:
        wordset = [word.lower() for word in words]
        word1.append(wordset)

    # 4. Handle double negation
    negated_words = negation_handling(word1)

    # 5. Keep only verbs, adjectives, adverbs, interjections (descriptive words)
    meaningful_words = descriptive_words(negated_words)

    # 6. Remove Time, Location, Organization, Person, Money, Percent, Date using NER
    #removed_words = remove_names(meaningful_words)

    # 7. Remove stop words
    stops = open(r'C:\Users\PSarka\Desktop\sentimentanalysis\stopwords.txt', 'r')
    stops = set([word[:-1] for word in stops])
    meaningful_words_new = [w for w in meaningful_words if not w in stops]

    # 8. Stemming with PorterStemmer; lemmatization could also be used -- check which is more effective
    st = PorterStemmer()
    stemmed_words = [st.stem(words) for words in meaningful_words_new]

    # 9. Join the words back into one string separated by space, and return the result
    print stemmed_words
    return " ".join(stemmed_words)
Example 11: tweet_stemming
def tweet_stemming(tweet, token_freqs):
    """
    Stems tweet words and counts diversity

    :param tweet: the tweet to analyze
    :type tweet: str or unicode
    :param token_freqs: counter of word frequencies
    :type token_freqs: Counter
    :returns: words added to token_freqs
    :rtype: int
    """
    pattern_url = '((https?:\/\/)|www\.)([\da-z\.-]+)\.([\/\w \.-]*)( |$)'
    regex_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    porter = PorterStemmer()

    counter_tokens = 0
    tweet_url_removed = re.sub(pattern_url, '', tweet, flags=re.MULTILINE)  # remove URLs
    tweet_url_removed_tokenized = word_tokenize(tweet_url_removed)          # tokenize tweet
    tweet_url_removed_tokenized_cleaned_stemming = []                       # cleaned of URLs and hashes, then stemmed

    for token in tweet_url_removed_tokenized:
        new_token = regex_punctuation.sub(u'', token)  # remove punctuation and hash
        if not new_token == u'':
            new_token_stemming = porter.stem(new_token)
            tweet_url_removed_tokenized_cleaned_stemming.append(new_token_stemming)
            token_freqs[new_token_stemming] += 1
            counter_tokens += 1

    return counter_tokens
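A quick usage sketch for tweet_stemming (the sample tweet is invented): pass in a Counter that accumulates stem frequencies across calls; the return value is the number of stems counted from that tweet.

from collections import Counter

token_freqs = Counter()
added = tweet_stemming("Loving the new release! Details at http://example.com", token_freqs)
print(added)                        # how many stems were counted from this tweet
print(token_freqs.most_common(3))   # most frequent stems accumulated so far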
Example 12: compare_english_simple
def compare_english_simple(article_title):
    """Given a title of an article, returns the number of tokens, types, and stems
    in both the English version and the simple English version."""
    english = extract_wikipedia_page(article_title, "en")
    simple = extract_wikipedia_page(article_title, "simple")
    num_tokens_english = len(english)
    num_tokens_simple = len(simple)
    types_english = count_words(get_words(english))
    types_simple = count_words(get_words(simple))

    porter_stemmer = PorterStemmer()
    stem_english = defaultdict(int)
    stem_simple = defaultdict(int)
    for key in types_english.keys():
        stem_english[porter_stemmer.stem(key)] += 1
    for key in types_simple.keys():
        stem_simple[porter_stemmer.stem(key)] += 1

    print("Number of Tokens in English " + article_title + ": %d" % num_tokens_english)
    print("Number of Tokens in Simple English " + article_title + ": %d" % num_tokens_simple)
    print("Number of Types in English " + article_title + ": %d" % len(types_english))
    print("Number of Types in Simple English " + article_title + ": %d" % len(types_simple))
    print("Number of Stems in English " + article_title + ": %d" % len(stem_english))
    print("Number of Stems in Simple English " + article_title + ": %d" % len(stem_simple))
Example 13: query
def query(new_doc, doc_topic, topic_word, dictionary, LSH, num_topic):
    tokens = []
    token = get_tokens(new_doc)
    stopped_tokens = [i for i in token if not i in en_stop]

    p_stemmer = PorterStemmer()
    stemed_tokens = []
    for i in stopped_tokens:
        try:
            temp_token = str(p_stemmer.stem(i))
            stemed_tokens.append(temp_token)
        except IndexError:
            pass
    tokens = stemed_tokens

    new_corpus = dictionary.doc2bow(tokens)
    new_corpus = to_gibbs_corpus([new_corpus])[0]  # convert
    new_topic_vector = np.zeros(num_topic)

    for t in new_corpus:
        mult_par = topic_word[:, t[0]] + 1
        mult_par = mult_par / np.sum(mult_par)
        new_topic_vector += np.random.multinomial(t[1], mult_par)
        #print mult_par
        #print topic_word[:,t[0]]

    new_topic_vector = new_topic_vector / np.sum(new_topic_vector)
    dist, indices = LSH.kneighbors(new_topic_vector, n_neighbors=20)
    print indices + 1
Example 14: read_class_data
def read_class_data(path, label=None):
    '''
    Label may come from the data itself, or may be assigned at run time
    '''
    if os.path.exists(path):
        if os.path.isdir(path):
            paths = [os.path.join(path, f) for f in os.listdir(path)]
        else:
            paths = [path]
    else:
        print 'Given path does not exist.'
        return

    doc = doc_file()
    stemmer = PorterStemmer()
    instances = []
    for p in paths:
        doc.path = p
        for raw_record in doc:
            record = unpack(raw_record, ',')
            text = record[3].strip('"')
            inst = {'tokens': [], 'label': ''}
            for t in wordpunct_tokenize(text):
                stem_t = stemmer.stem(t.lower())
                if stem_t[0].islower():
                    inst['tokens'].append(stem_t)
                else:
                    continue
            inst['label'] = label
            instances.append(inst)
    return instances
Example 15: extract_entities
def extract_entities(doc):
    print 'extracting entities from %s...' % doc.getFilename()
    nps = list(set([re.sub(' \.', '', re.sub(' -[A-Z]{3}-', '', np).lower()) for np in doc.getAllNodesOfType('NP')]))
    p = PorterStemmer()
    entities = []

    for np in nps:
        try:
            response = json.loads(requests.get(host + 'select', params={'q': 'wam:[50 TO 100] AND iscontent:true AND lang:en AND (title_en:"%s" OR redirect_titles_mv_en:"%s")' % (np, np), 'fl': 'title_en,redirect_titles_mv_en', 'wt': 'json'}).content)
        except requests.exceptions.ConnectionError:
            while True:
                time.sleep(15)
                print 'retrying connection...'
                try:
                    response = json.loads(requests.get(host + 'select', params={'q': 'wam:[50 TO 100] AND iscontent:true AND lang:en AND (title_en:"%s" OR redirect_titles_mv_en:"%s")' % (np, np), 'fl': 'title_en,redirect_titles_mv_en', 'wt': 'json'}).content)
                    break
                except requests.exceptions.ConnectionError:
                    continue

        docs = response[u'response'][u'docs']
        if len(docs) > 0:
            titles = [docs[0][u'title_en']] + docs[0].get(u'redirect_titles_mv_en', [])
        else:
            titles = []

        if len(titles) > 0:
            titles = [' '.join([p.stem(w.lower()) for w in t.split(' ')]) for t in titles]
            stem_np = ' '.join([p.stem(w) for w in np.split(' ')])
            for title in titles:
                if stem_np == title:
                    entities.append(np)
                    print np
                    break

    #print doc.getFilename(), entities
    return (doc.getFilename(), entities)