This article collects and summarizes typical usage examples of the stem function from the Python module stemming.porter2. If you have been wondering how exactly stem is used in Python, or what it looks like in practice, the hand-picked code examples below should help.
A total of 15 code examples of the stem function are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
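All of the examples assume the function has been imported from the stemming package, typically with "from stemming.porter2 import stem" (Example 1 instead imports the porter2 module and calls porter2.stem). As a minimal sketch of the call pattern, with illustrative words of our own choosing rather than taken from any example:

from stemming.porter2 import stem

print(stem("running"))   # "run"
print(stem("stemming"))  # "stem"
print(stem("cats"))      # "cat"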
Example 1: get_pmi
def get_pmi(self, word0, word1):
    """Return the pointwise mutual information, a measure of word
    association within a window, for two words. This is normalized
    using Bouma (2009) to avoid infinite values for OOV terms.
    """
    word0 = word0.lower()
    word1 = word1.lower()
    if self.stemming:
        word0 = porter2.stem(word0)
        word1 = porter2.stem(word1)
    if word0 not in self.word_counts or word1 not in self.word_counts:
        return -1
    # Pair counts are assumed to be stored keyed by the lexicographically
    # smaller word first.
    if word0 < word1:
        pair_counts = self.word_pair_counts[word0][word1]
    else:
        pair_counts = self.word_pair_counts[word1][word0]
    if pair_counts == 0:
        return -1
    # 'anyword' is presumably a sentinel key (defined elsewhere in the class)
    # under which the total token count is stored.
    num_words = self.word_counts[anyword]
    # TODO: confirm normalization. Currently assuming words are
    # normalized by num_words and pairs by num_words^2.
    ratio = pair_counts / (self.word_counts[word0] *
                           self.word_counts[word1])
    pmi = np.log(ratio)
    normalized_pmi = -pmi / np.log(pair_counts / (num_words * num_words))
    return normalized_pmi
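For reference, the Bouma (2009) normalization mentioned in the docstring divides PMI by the negative log of the joint probability, which bounds the score in [-1, 1]:

    npmi(x, y) = pmi(x, y) / (-log p(x, y)),  where pmi(x, y) = log(p(x, y) / (p(x) * p(y)))

In the snippet, p(x, y) is estimated as pair_counts / num_words^2, which is exactly the argument passed to np.log in the final normalization line.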
Example 2: find_collocations
def find_collocations(file_name, data, popular_word):
    text_file = open(file_name, 'r')
    file_content = text_file.read()
    most_common_words = find_most_common_words(file_content, popular_word)
    second_word = None
    third_word = None
    collocations = data
    text_file.seek(0)
    for line in text_file:
        for word in line.split():
            first_word = second_word
            second_word = third_word
            third_word = trim_word(word)
            if (first_word not in most_common_words and second_word not in most_common_words) and \
                    (first_word and first_word[0].islower() and second_word and second_word[0].islower()):
                count_collocations(collocations, stem(first_word.lower()), stem(second_word.lower()))
    # extra iteration for the last word
    first_word = second_word
    second_word = third_word
    count_collocations(collocations, first_word, second_word)
    collocations = find_whole_collocations_from_stems(collocations, file_content)
    return collocations, most_common_words, file_content
Example 3: calculateScore
def calculateScore(query, qID):
    sfile = open('../AP_DATA/stoplist.txt', 'r')
    sList = sfile.read().split('\n')
    query = query.lower()
    qList = re.findall("\w+[\.?\w+]*", query)
    temp = list()
    for term in qList:
        if term.endswith('.') and term.count('.') == 1 and len(term) > 1:
            term = term.replace('.', '')
        if term.startswith('_') and term.count('_') == 1 and len(term) > 1:
            term = term.replace('_', '')
        temp.append(term)
    qList = temp
    #print index_num
    if index_num == '4':
        #print 123
        qList = [i for i in temp if i not in sList]
        temp = list()
        for term in qList:
            term = stem(term)
            temp.append(term)
        qList = temp
    if index_num == '3':
        temp = list()
        for term in qList:
            term = stem(term)
            temp.append(term)
        qList = temp
    if index_num == '2':
        qList = [i for i in temp if i not in sList]
Example 4: find_collocations_tri
def find_collocations_tri(filename):
    text_file = open(filename, 'r')
    most_common_words = find_most_common_words(text_file, 100)
    second_word = None
    third_word = None
    fourth_word = None
    collocations = dict()
    text_file.seek(0)
    for line in text_file:
        for word in line.split():
            first_word = second_word
            second_word = third_word
            third_word = fourth_word
            fourth_word = trim_word(word)
            if (first_word not in most_common_words and second_word not in most_common_words and third_word not in most_common_words) and \
                    (first_word and first_word[0].islower() and second_word and second_word[0].islower() and third_word and third_word[0].islower()):
                count_collocations_tri(collocations, stem(first_word.lower()), stem(second_word.lower()), stem(third_word.lower()))
    # extra iteration for the last word
    first_word = second_word
    second_word = third_word
    third_word = fourth_word
    count_collocations_tri(collocations, first_word, second_word, third_word)
    sort_collocations_tri(collocations)
Example 5: find_collocations_penta
def find_collocations_penta(text, data, popular_word):
    most_common_words = find_most_common_words(text, popular_word)
    second_word = None
    third_word = None
    fourth_word = None
    fifth_word = None
    sixth_word = None
    collocations = data
    for word in text.split():
        first_word = second_word
        second_word = third_word
        third_word = fourth_word
        fourth_word = fifth_word
        fifth_word = sixth_word
        sixth_word = trim_word(word)
        if (first_word not in most_common_words and second_word not in most_common_words and third_word not in most_common_words and fourth_word not in most_common_words and fifth_word not in most_common_words) and \
                (first_word and first_word[0].islower() and second_word and second_word[0].islower() and third_word and third_word[0].islower() and fourth_word and fourth_word[0].islower() and fifth_word and fifth_word[0].islower()):
            count_collocations_penta(collocations, stem(first_word.lower()), stem(second_word.lower()), stem(third_word.lower()), stem(fourth_word.lower()), stem(fifth_word.lower()))
    # extra iteration for the last word
    first_word = second_word
    second_word = third_word
    third_word = fourth_word
    fourth_word = fifth_word
    fifth_word = sixth_word
    count_collocations_penta(collocations, first_word, second_word, third_word, fourth_word, fifth_word)
    return collocations, most_common_words
Example 6: tokenize_porter
def tokenize_porter(title, body):
    """Break text into words and stem them using the Porter stemmer."""
    # break up words & remove stopwords
    title_break = stopWords(nltk.word_tokenize(title), lower_case=True)
    body_break = stopWords(nltk.word_tokenize(body), lower_case=True)
    # print title_break
    return ["title:" + stem(title) for title in title_break] + ["body:" + stem(body) for body in body_break]
Example 7: ngram_in_collection
def ngram_in_collection(ngram, coll):
    """
    Check whether any of the ngram's stemmed components appear in the (stemmed) collection.
    """
    s1 = set([stem(word) for word in ngram.split(' ')])
    s2 = set([stem(word) for word in coll])
    return (len(s1.intersection(s2)) > 0)
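A quick illustration of this helper; the ngram and collection below are made-up inputs, and the result depends only on whether any stems overlap:

print(ngram_in_collection("running dogs", ["dog", "cat"]))  # True  ("dogs" stems to "dog")
print(ngram_in_collection("blue sky", ["dog", "cat"]))      # False (no shared stems)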
Example 8: tokenize
def tokenize(self):
    punc = """\\.!?,(){}[]"'"""
    wordarray = []
    for c in self.document.lower().split():
        # Note: the stop-word check stems the raw token, while the stored
        # stem has surrounding punctuation stripped first.
        if stem(c.strip()) not in self.corpus.stopwords:
            wordarray.append(stem(c.strip(punc)))
    return wordarray
Example 9: cleanText
def cleanText(text, entities, category):
    cleanText = text
    hashtags = entities.get('hashtags', [])
    ranges = []
    for hashtag in hashtags:
        if hashtag.get('text', '').lower() == category:
            indices = hashtag.get('indices')
            ranges.append(indices)
    urls = entities.get('urls', [])
    urls.reverse()
    ranges.extend([v for url in urls for k, v in url.iteritems() if k == 'indices'])
    media = entities.get('media', [])
    media.reverse()
    ranges.extend([v for medium in media for k, v in medium.iteritems() if k == 'indices'])
    ranges = sorted(ranges, key=lambda x: x[0], reverse=True)
    for r in ranges:
        cleanText = cleanText[:r[0]] + cleanText[r[1] + 1:]
    category_stem = stem(category).lower()
    cleanTextList = cleanText.split(' ')
    cleanText = []
    for word in cleanTextList:
        if category_stem not in stem(word).lower() and stem(word).lower() not in category_stem:
            cleanText.append(word)
    cleanText = " ".join(cleanText)
    return cleanText
Example 10: read
def read(self, publication_keyword, publication_data):
    words = open(publication_keyword, 'r').readlines()
    for i in range(0, self.topic_number):
        s = stem(words[i].split('\t')[0])
        self.topics[s] = dict()
        self.stemword_dict[s] = words[i].split('\t')[0]
    content = open(publication_data, 'r').readlines()
    counter = 0
    year = ''
    for i in content:
        # three lines together represent one publication
        if counter % 3000 == 0:
            print(counter / 3)
        # record the year of this publication
        if counter % 4 == 1:
            year = int(i.strip())
        # parse the keywords of this publication
        elif counter % 4 == 3:
            keywords = i.strip().split(' ')
            for j in keywords:
                j = stem(j)
                if j in self.topics:
                    if year in self.topics[j]:
                        self.topics[j][year] += 1
                    else:
                        self.topics[j][year] = 1
        counter = counter + 1
Example 11: makeFreqDictionaryOfSentenceWords
def makeFreqDictionaryOfSentenceWords(s1):
    words1 = s1.split()
    dt1 = {}
    for w in words1:
        if w.lower() not in stopwords:
            dt1[stem(w.lower())] = dt1.get(stem(w.lower()), 0) + 1
    return dt1
Example 12: sentence_matches
def sentence_matches(self, sentence_text):
    """Returns true iff the sentence contains this mention's upstream
    and downstream participants, and if one of the stemmed verbs in
    the sentence is the same as the stemmed action type."""
    has_upstream = False
    has_downstream = False
    has_verb = False
    # Get the first word of the action type and assume this is the verb
    # (e.g. get "depends" for "depends on")
    actiontype_words = word_tokenize(self.mention.actiontype)
    actiontype_verb_stemmed = stem(actiontype_words[0])
    words = word_tokenize(sentence_text)
    if self.string_matches_sans_whitespace(sentence_text.lower(),
                                           self.mention.upstream.lower()):
        has_upstream = True
    if self.string_matches_sans_whitespace(sentence_text.lower(),
                                           self.mention.downstream.lower()):
        has_downstream = True
    for word in words:
        if actiontype_verb_stemmed == stem(word):
            has_verb = True
    return has_upstream and has_downstream and has_verb
Example 13: getVocabularyStem
def getVocabularyStem(content):
    vocabulary = {}
    index = 0
    for i in range(len(content)):
        if stem(content[i]) not in vocabulary:
            vocabulary[stem(content[i])] = index
            index = index + 1
    return vocabulary
Example 14: main
def main():
    nlp_file = open(sys.argv[1], "r")
    for line in nlp_file:
        words = line.strip().split(" ")
        for word in words:
            print(stem(word))
    nlp_file.close()
Example 15: getSentTf
def getSentTf(sent, stopwords):
    doc = dict()
    for word in re.split("[^a-zA-Z0-9]", sent):
        word = word.lower()
        if word != "" and word != "'" and stem(word) not in stopwords:
            if doc.get(stem(word), 0) == 0:
                doc[stem(word)] = 1
            else:
                doc[stem(word)] = doc[stem(word)] + 1
    return doc
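A small, hypothetical call to illustrate the behaviour; the sentence and the stop-word set are made up, and note that stop words are matched against stems:

print(getSentTf("The cats see the running cat", {"the"}))
# expected: {'cat': 2, 'see': 1, 'run': 1}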