This article collects typical usage examples of the Python method nltk.stem.porter.PorterStemmer.stem_word. If you are wondering how to use PorterStemmer.stem_word, what it does, or where to find concrete examples, the curated code samples below may help. You can also explore further usage examples of its containing class, nltk.stem.porter.PorterStemmer.
The following shows 15 code examples of PorterStemmer.stem_word, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
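Before the collected examples, a minimal usage sketch for orientation. It assumes an NLTK 2.x installation, where PorterStemmer still exposes stem_word; on newer NLTK 3.x releases the equivalent method is stem:

from nltk.stem.porter import PorterStemmer

# Minimal sketch (NLTK 2.x API); on NLTK 3.x use stemmer.stem(w) instead.
stemmer = PorterStemmer()
for w in ['running', 'flies', 'happily']:
    print w, '->', stemmer.stem_word(w)  # e.g. 'running' -> 'run'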
Example 1: __init__
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
def __init__(self, words, is_binary=False):
    self._keywords = words
    self._stemmed_keywords = []
    stemmer = PorterStemmer()
    for word in words:
        self._stemmed_keywords.append(stemmer.stem_word(word))
    self._is_binary = is_binary
Example 2: extract
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
def extract(self, line):
    """
    find word pairs that co-occur and extract # of minimum distance word pairs in the line
    """
    words = self.tokenize(line.lower())
    count = 0.0
    stemmer = PorterStemmer()
    bad_indices = []
    you_indices = []
    for i in range(len(words)):
        word = words[i]
        if word in self._youwords:
            you_indices.append(i)
        word = stemmer.stem_word(word)
        if word in self._stemmed_badwords or self.isWordPartOf(word, self._badwords):
            bad_indices.append(i)
    if not bad_indices or not you_indices:
        return [-1]
    else:
        #print line
        #print bad_indices
        #print you_indices
        distances = []
        for bindex in bad_indices:
            for yindex in you_indices:
                distances.append(abs(bindex - yindex))
        #print distances
        mn = min(distances)
        count = sum([1 for d in distances if d == mn])
        #return [(count *1.0)* mn/len(line)]
        return [1]
Example 3: get_stemmed_terms_list
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
def get_stemmed_terms_list(doc, stem_words_map = None, stem_bigrams_map = None):
    ps = PorterStemmer()
    local_map = dict()
    word_list = []
    clean_doc = [(w.strip()).lower() for w in doc.split() if len(w) in range(3,16)]
    filtered_words = [w.strip('.,;?!:)(#') for w in clean_doc if not w.strip('.,;?!:)(#') in stopwords.words('english')]
    for w in filtered_words:
        if w.isalpha():
            w_temp = ps.stem_word(w)
            if stem_words_map is not None:
                if w_temp not in stem_words_map:
                    stem_words_map[w_temp] = dict()
                stem_words_map[w_temp][w] = stem_words_map[w_temp].get(w, 0)+1
            local_map[w_temp] = w
            word_list.append(w_temp)
    bigrams = nltk.bigrams(word_list)
    for b in bigrams:
        bigram_org = (local_map[b[0]], local_map[b[1]])
        if stem_bigrams_map is not None:
            if b not in stem_bigrams_map:
                stem_bigrams_map[b] = dict()
            stem_bigrams_map[b][bigram_org] = stem_bigrams_map[b].get(bigram_org, 0)+1
    return word_list, bigrams
Example 4: _getFeatures
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
def _getFeatures(self, corpus):
    stemmer = PorterStemmer()
    tokens = corpus.split(" ")
    features = filter(lambda x: len(x) > 1, tokens)
    finalList = []
    for feature in features:
        feature = re.sub("[^a-zA-Z0-9']", "", feature.lower())
        finalList.append(stemmer.stem_word(feature))
    return finalList
Example 5: extract
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
def extract(self, line):
    words = self.tokenize(line.lower())
    count = 0.0
    stemmer = PorterStemmer()
    for word in words:
        word = stemmer.stem_word(word)
        if word in self._stemmed_keywords:
            count += 1
    if self._is_binary:
        return [1] if count > 0 else [0]
    else:
        return [count]
Example 6: getFeatures
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
def getFeatures(self, corpus):
    stemmer = PorterStemmer()
    stems = FreqDist()
    onlyLettersNumbers = re.compile('[^a-zA-Z0-9%!]')
    corpus = onlyLettersNumbers.sub(' ', corpus.lower())
    corpus = TreebankWordTokenizer().tokenize(corpus)
    count = 0
    for word in corpus:
        if not stopwords.STOP_WORDS.get(word) and len(word.strip()) > 1:
            stems.inc(stemmer.stem_word(word))
            count += 1
            if self.__maxFeatures > 0 and count >= self.__maxFeatures:
                break
    features = stems.samples()
    return features
Example 7: __init__
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
#......... part of the code omitted here .........
            if self.isVariable(tag_frame[x]):
                if tag_frame[x] in groundings.keys():
                    output[x] = groundings[tag_frame[x]]
        return output

    def matchTemplate(self, rule_set, callback):
        output_stack = []
        for key, value in rule_set.items():
            match_rule = callback(value)
            if match_rule[0]:
                output = self.replaceOutput(match_rule, value)
                output_stack.append((key, output))
        self.reset()
        return output_stack

    def matchRuleSet(self, ruleSet):
        output_stack = []
        for key, value in ruleSet.items():
            match_list = self.matchRule(value)
            if match_list:
                output = self.replaceOutput(match_list, value)
                output_stack.append((key, output))
        self.reset()
        return output_stack

    def lemma(self, word, grounding):
        if self.isVariable(word) and self.Grounds.isGround(word):
            groundedWord = self.Grounds.getGrounding(word)
            stemmed = self.stemmer.stem_word(groundedWord)
            self.Grounds.groundVariable(grounding, stemmed)
        elif not self.isVariable(word):
            stemmed = self.stemmer.stem_word(word)
            self.Grounds.groundVariable(grounding, stemmed)

    def compareRule(self, tag, var_1, var_2):
        ## check to see if we are ground and it is a var
        if self.isVariable(var_1) and self.Grounds.isGround(var_1):
            out = self.compareGround(var_1, 0)
            if not out:
                return False
        ## otherwise we need to ground it
        elif self.isVariable(var_1) and not self.Grounds.isGround(var_1):
            self.Grounds.groundVariable(var_1, self.current[0])
        if self.isVariable(var_2) and self.Grounds.isGround(var_2):
            out = self.compareGround(var_2, 2)
            if not out:
                return False
        elif self.isVariable(var_2) and not self.Grounds.isGround(var_2):
            self.Grounds.groundVariable(var_2, self.current[2])
        if tag == '$prep':
            ## XXX: bad way of doing this
            return True
        if self.isVariable(tag) and self.Grounds.isGround(tag):
            self.compareGround(tag, 1)
Example 8: get_word_counts
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
def get_word_counts( solr, fq, query, num_words, field='sentence' ) :
    print query
    print str(time.asctime())
    start_time = time.time()
    function_start_time = start_time
    results = fetch_all( solr, fq, query, 'sentence' )
    print "got " + query
    print len( results )
    print time.asctime()
    end_time = time.time()
    print "time {}".format( str(end_time - start_time) )
    start_time = end_time
    print 'converting to utf8 and lowercasing'
    sentences = [ result['sentence'].lower() for result in results ]
    results = None
    end_time = time.time()
    print "time {}".format( str(end_time - start_time) )
    start_time = end_time
    print 'calculating non_stemmed_wordcounts'
    term_counts = non_stemmed_word_count( sentences )
    if '' in term_counts:
        del term_counts['']
    print "Returned from non_stemmed_word_count"
    print time.asctime()
    end_time = time.time()
    print "time {}".format( str(end_time - start_time) )
    start_time = end_time
    print "freeing sentences"
    sentences = None
    end_time = time.time()
    print "time {}".format( str(end_time - start_time) )
    start_time = end_time
    print 'stemming and counting'
    stem_counts = collections.Counter()
    st = PorterStemmer()
    for term in term_counts.keys():
        #ipdb.set_trace()
        stem = st.stem_word( term )
        stem_counts[ stem ] += term_counts[ term ]
    end_time = time.time()
    print "done stemming and counting"
    print "time {}".format( str(end_time - start_time) )
    start_time = end_time
    print 'calculating stem to term map'
    stem_to_terms = {}
    for term in term_counts.keys():
        stem = st.stem_word( term )
        if stem not in stem_to_terms:
            stem_to_terms[ stem ] = []
        stem_to_terms[stem].append( term )
    print "done calculating stem to term map"
    print "time {}".format( str(end_time - start_time) )
    counts = stem_counts.most_common( num_words )
    ret = []
    for stem, count in counts:
        if len( stem_to_terms[ stem ] ) < 2:
            term = stem_to_terms[ stem ][0]
        else:
            best_count = 0
            for possible_best in stem_to_terms[ stem ]:
                if term_counts[ possible_best ] > best_count:
                    term = possible_best
                    best_count = term_counts[ possible_best ]
        ret.append(
            { 'stem': stem,
              'term': term,
              'count': count
              } )
#......... part of the code omitted here .........
Example 9: Namespace(
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
#......... part of the code omitted here .........
                compdict[uri] = label.toPython().strip().lower()
        if (None, URIRef("http://www.w3.org/2000/01/rdf-schema#label"), None) in graph:
            for uri, label in graph.subject_objects(
                    URIRef("http://www.w3.org/2000/01/rdf-schema#label")):
                compdict[uri] = label.toPython().strip().lower()
        return compdict

    def removeDiacritics(self, label):
        """
        This method uses unicodedata() to remove diacritics from a string
        TODO: Does this work without unicodedata?
        param: string
        return: string
        """
        label = ''.join((c for c in unicodedata.normalize('NFD', unicode(label)) if unicodedata.category(c) != 'Mn'))
        return label

    def removePunctuation(self, label):
        """This method removes punctuation. Right now, it will remove
        '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        param: string
        return: string
        """
        for punct in string.punctuation:
            label = label.replace(punct, " ")
        return label

    def stemWords(self, label):
        """This method stems a single word or a phrase
        param: string
        return: string
        """
        label = " ".join(self.porter.stem_word(word) for word in label.split(" "))
        return label

    def __broaders(self, uri, graph):
        """This "private" method is a generator over broaderTerms of a given URI in
        a given ConjunctiveGraph. Use getParents() to obtain the list of broaderTerms
        """
        for n in graph.transitive_objects(URIRef(uri), URIRef('http://www.w3.org/2004/02/skos/core#broader')):
            if (uri == n):
                continue
            yield n

    def __narrowers(self, uri, graph):
        """This "private" method is a generator over narrowerTerms of a given URI in
        a given ConjunctiveGraph. Use getChildren() to obtain the list of narrowerTerms
        """
        for n in graph.transitive_objects(URIRef(uri), URIRef('http://www.w3.org/2004/02/skos/core#narrower')):
            if (uri == n):
                continue
            yield n

    def getParents(self, uri, graph):
        list = []
        for n in self.__broaders(uri, graph):
            for label in graph.objects(n, URIRef("http://www.w3.org/2004/02/skos/core#prefLabel")):
                list.append(label.toPython().strip().lower())
        return list

    def getChildren(self, uri, graph):
        list = []
        for n in self.__narrowers(uri, graph):
            for label in graph.objects(n, URIRef("http://www.w3.org/2004/02/skos/core#prefLabel")):
                list.append(label.toPython().strip().lower())
Example 10: set(
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
wouldn't
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves""".translate(table,punc).split())
for x in nltk.corpus.words.words():
    stopwords.add(x.translate(table, punc))
stopwords = set([stemm.stem_word(x) for x in stopwords])

#get a list of the normalized words from the tweet
def split_tweet(m):
    text = str(m['text']).translate(table, punc)
    #repattern.sub('',
    return set([x for x in [stemm.stem_word(x.lower()) for x in text.split() if x.lower().isalpha() and len(x) > 2 and not '#' in x and not 'http' in x] ])

#get hashtags
def get_tags(m):
    return [stemm.stem_word(x['text'].lower()) for x in m['entities']['hashtags']]
english_vocab = set(stemm.stem_word(w.lower()) for w in nltk.corpus.words.words())
print 'english parsed'
#text_vocab = set(w.lower() for w in text if w.lower().isalpha())
#loop over all tweets found in files on @path
Example 11: DateExtractor(
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
#......... part of the code omitted here .........
        return result

    def _extend_to_left(self, expression, tagged_sentence):
        """Try to complete the first numeral. It's a healing method for the next scenario:
        seventy two (72) days.
        """
        num_text = self._extract_number_from_expression(expression)
        if not num_text:
            return expression
        if expression.startswith(num_text):
            # Expression is correctly formatted
            return expression
        # If the text had contained '-', the expression would have been correct.
        num_words = num_text.split('-')
        if not expression.lower().startswith(num_words[-1]):
            print 'Expression %s does not start with the expected word %s' % (expression,
                                                                              num_words[-1])
            return expression
        sentence = join_sentence([t[0] for t in tagged_sentence])
        # Last word is in the expression, the other words we expect to find in the subsentence.
        # The fact that there can be more such expressions in a sentence was considered but we
        # should not do anything special for them because agreements have a pseudo structure which
        # leads to consistency in terms of formats.
        idx = sentence.find(expression)
        subsentence = sentence[:idx].strip().split()
        subsentence.reverse()
        num_words = num_words[:-1]
        num_words.reverse()
        idx = 0
        wc = len(num_words)
        while idx < wc and num_words[idx] == subsentence[idx]:
            expression = '%s %s' % (num_words[idx], expression)
            idx += 1
        return expression

    def _extract_number_from_expression(self, expression):
        number_search = re.search('\(([0-9]+)\)', expression)
        if not number_search:
            return None
        try:
            number = int(number_search.groups()[0])
        except:
            print "Could not extract number from expression: %s" % expression
            return None
        return num2words(number)

    def _extract_data_from_tree(self, tree):
        expressions = []
        for subtree in tree.subtrees():
            if not subtree.label() == 'DATE':
                continue
            expressions.append(join_sentence([t[0] for t in subtree.leaves()]))
        return expressions

    def _get_sentences(self, text):
        sentences = nltk.sent_tokenize(text)
        # Remove new lines
        sentences = [s.replace('\r\n', ' ') for s in sentences]
        # Collapse whitespaces
        rex = re.compile(r'[ \t]+')
        sentences = [rex.sub(' ', s) for s in sentences]
        sentences = [nltk.word_tokenize(sent) for sent in sentences]
        return sentences

    def _is_false_positive(self, expression, tagged_sentence):
        # The last token should be either a time unit or 'period'
        time_unit = expression.split()[-1]
        stem = self.stemmer.stem_word(time_unit)
        if stem not in ALLOWED_STEMS:
            return True
        if stem == YEAR_STEM:
            # Check if the sentence represents an age expression (it is followed by "of age"
            # or "old").
            sentence = join_sentence([t[0] for t in tagged_sentence])
            idx = sentence.find(expression) + len(expression)
            subsentence = sentence[idx:].strip()
            # Note: This check may fail if in the same sentence there are both age expressions and
            # date expressions in years. This should not be a problem since no reviewed document
            # has this case (it also doesn't make sense in the pseudo-structure of agreements).
            if any(subsentence.startswith(expr) for expr in AGE_EXPRESSIONS):
                return True
        return False
Example 12: PorterStemmer(
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
if(word!=False):
    fullSearchURL=baseSearchURL+word
    f = urllib.urlopen(fullSearchURL)
    for line in f.read().split("\n"):
        found=False
        if("video src" in line):
            endURL=line[12:]
            endURL=endURL[:endURL.index("\"")]
            fullURL=baseURL+endURL
            print 'getting video from'+fullURL
            found=True
            break
    if(found):
        urllib.urlretrieve(fullURL,word+'.mp4')
else:
    p = PorterStemmer()
    word=PorterStemmer.stem_word(p,origword)
    fullSearchURL=baseSearchURL+word
    f = urllib.urlopen(fullSearchURL)
    for line in f.read().split("\n"):
        found=False
        if("video src" in line):
            endURL=line[12:]
            endURL=endURL[:endURL.index("\"")]
            fullURL=baseURL+endURL
            print 'getting video from'+fullURL
            found=True
            break
    if(found):
        urllib.urlretrieve(fullURL,word+'.mp4')
Example 13: glossOverlap
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
def glossOverlap(gloss1, gloss2):
    # stopws = stopwords.words('english')
    stopws = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
              'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
              'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs',
              'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
              'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as',
              'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
              'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
              'in', 'out', 'on', 'off', 'over', 'under', 'then',
              'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
              'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
              'than', 's', 't', 'just', 'don']
    stemmer = PorterStemmer()
    # print "-"*50
    # for x in stopws:
    #     if stemmer.stem_word(x) != x:
    #         print x, stemmer.stem_word(x)

    def longestOverlap(a, b):
        now = [0]*len(b)
        bestOverlap = 0
        aStart = 0
        bStart = 0
        nextNonStopWord = [-1]*(len(a)+1)
        for i in range(len(a)-1, 0, -1):
            if a[i] not in stopws:
                nextNonStopWord[i] = i
            else:
                nextNonStopWord[i] = nextNonStopWord[i+1]
        for i in range(1, len(a)):
            prev = now
            now = [0]*len(b)
            if a[i] == '#':
                continue
            for j in range(1, len(b)):
                if b[j] == '#':
                    continue
                if a[i] == b[j]:
                    now[j] = max(now[j], prev[j-1] + 1)
                if a[i] in stopws:
                    continue
                overlap = now[j]
                start = i - overlap + 1
                start = nextNonStopWord[start]
                overlap = i - start + 1
                if bestOverlap < overlap:
                    bestOverlap = overlap
                    aStart = i - overlap + 1
                    bStart = j - overlap + 1
        return (bestOverlap, aStart, bStart)

    regex = ',|\.|\s|\?|\'|\"|!|;|-'
    #maybe check what happens if we don't stem the glosses
    a1 = ['#'] + [stemmer.stem_word(x.lower()) for x in re.split(regex, gloss1) if x]
    a2 = ['#'] + [stemmer.stem_word(x.lower()) for x in re.split(regex, gloss2) if x]
    score = 0
    (overlap, start1, start2) = longestOverlap(a1, a2)
    while overlap > 0:
        # print overlap
        # print a1[start1:start1+overlap]
        # print a2[start2:start2+overlap]
        a1[start1:start1+overlap] = ['#']
        a2[start2:start2+overlap] = ['#']
        score += overlap**2
        (overlap, start1, start2) = longestOverlap(a1, a2)
    return score
Example 14: BooleanSearch(
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
class BooleanSearch(object):
    """
    This class handles the parsing and execution of queries from a provided query file based on
    the index and postings file provided upon initialization. Results are saved into an output file.
    Parsing of queries is handled by converting the boolean expressions into an equivalent python
    expression that is executable.
    """
    eval_index_local = "index"
    eval_globals = {"__builtins__": None}
    replacements = [("AND NOT", "-"), ("AND", "&"), ("OR", "|"), ("NOT", "~"), ("(", " ( "), (")", " ) ")]
    exprs = set(["-", "&", "|", "~", "(", ")"])
    expr_postings_ref = "index[\"%s\"]"

    def __init__(self, index_filename, postings_filename):
        """
        index_filename refers to the dictionary file.
        postings_filename refers to the postings file.
        """
        self.stemmer = PorterStemmer()
        self.index = Index(index_filename, postings_filename)
        self.eval_locals = {self.eval_index_local: self.index}

    def _to_python_expression(self, query):
        """
        Parses a boolean expression by converting the boolean operator keywords into python's bitwise operators,
        and converts the terms into their respective index calls that return SkipList objects.
        The resulting expression is an executable python expression.
        WARNING: NOT SAFE FOR PRODUCTION SYSTEMS. FOR ACADEMIC PURPOSES ONLY.
        """
        query = reduce(lambda q, args: q.replace(*args), self.replacements, query)
        query_list = [x not in self.exprs and self.expr_postings_ref % self.stemmer.stem_word(x.lower()) or x for x in query.split()]
        return " ".join(query_list)

    def _execute_query(self, query):
        """
        Executes the provided query and returns the result.
        WARNING: NOT SAFE FOR PRODUCTION SYSTEMS. FOR ACADEMIC PURPOSES ONLY.
        """
        expression = self._to_python_expression(query)
        try:
            result = eval(expression, self.eval_globals, self.eval_locals)
        except SyntaxError as se:
            return "Syntax Error occurred, possible malformed expression during conversion: %s" % expression
        except NameError as ne:
            return "Name Error occurred, possible invalid object reference in query: %s" % expression
        else:
            return result

    def process_queries(self, query_filename, output_filename):
        """
        This method takes in a query filename and output filename.
        For every query, it writes the output into a new line.
        """
        try:
            with open(query_filename, 'r') as query_file, open(output_filename, 'w') as output_file:
                for row in query_file:
                    result = self._execute_query(row)
                    output_file.write(str(result) + "\n")
        except IOError as error:
            print "IO Error occurred while attempting to run BooleanSearch"
            sys.exit(error.args[1])
Example 15: len(
# Required import: from nltk.stem.porter import PorterStemmer [as alias]
# Or: from nltk.stem.porter.PorterStemmer import stem_word [as alias]
db = c['haiku']
# Open connection to Twitter's public timeline, build haikus
failed = True
while failed:
    failed = False
    try:
        with tweetstream.SampleStream(USER, PASS) as stream:
            for tweet in stream:
                if 'text' in tweet and len(tweet['text']) > 0:
                    screen_name = tweet['user']['screen_name']
                    hashes = [j for j in set([i for i in tweet['text'].split() if i.startswith('#')])]
                    # Strip out urls, punctuation, RTs, and @'s
                    tweet_stripped = urlre.sub('', tweet['text'])
                    tweet_stripped = punctre.sub('', tweet_stripped)
                    tweet_stemmed = [porter_stemmer.stem_word(i.lower()) for i in tweet_stripped.split()]
                    # Keep unstemmed, stripped tweet for either storage or retweeting
                    tweet_outgoing = [i.lower() for i in tweet_stripped.split()]
                    # hack to make sure that only coherent tweets are passed through
                    temp_tweet = [i.lower() for i in tweet_stemmed if not i.lower().startswith('rt')]
                    tweet_for_topic = [i.lower() for i in tweet_stemmed if not i.lower().startswith('rt') and nsyl(i) > 0 and i.lower() not in stopwords]
                    if tweet_for_topic == temp_tweet and len(tweet_for_topic) > 0:
                        print 'Iteration ' + str(counter)
                        # Assign this tweet a topic
                        docset = []
                        docset.append(' '.join(i for i in tweet_for_topic))
                        print 'Tweet: ' + docset[0]
                        (gamma, bound) = olda.update_lambda(docset)
                        counter += 1
                        if (counter % 100 == 0):
                            numpy.savetxt('lambdas/lambda-%d.dat' % counter, olda._lambda)