This page collects typical usage examples of the nltk.stem.PorterStemmer.stem method in Python. If you are wondering how to use PorterStemmer.stem, what it does, or what calling it looks like in practice, the curated code examples below should help. You can also explore further usage examples of its containing class, nltk.stem.PorterStemmer.
The following shows 15 code examples of the PorterStemmer.stem method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
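Before the collected examples, here is a minimal, self-contained sketch of the pattern they all share: create one PorterStemmer instance and call .stem() on individual lowercase tokens (exact outputs can vary slightly across NLTK versions).

from nltk.stem import PorterStemmer

ps = PorterStemmer()
for word in ["pythoning", "stemming", "caresses", "running"]:
    # .stem() maps each surface form to its Porter stem,
    # e.g. "stemming" -> "stem", "caresses" -> "caress"
    print(word, "->", ps.stem(word))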
Example 1: new_lesk
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
# (Also assumes: from nltk.corpus import wordnet as wn; from itertools import chain.
#  Note: this snippet targets older NLTK; in NLTK >= 3.0, pos, definition and
#  lemma_names are methods, e.g. ss.pos(), ss.definition(), ss.lemma_names().)
def new_lesk(context_sentence, ambiguous_word, pos=None, stem=True, hyperhypo=True):
    ps = PorterStemmer()
    max_overlaps = 0
    lesk_sense = None
    context_sentence = context_sentence.split()
    for ss in wn.synsets(ambiguous_word):
        # If POS is specified, skip synsets with a different POS.
        if pos and ss.pos != pos:
            continue
        lesk_dictionary = []
        # Includes definition.
        lesk_dictionary += ss.definition.split()
        # Includes lemma_names.
        lesk_dictionary += ss.lemma_names
        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo:
            lesk_dictionary += list(chain(*[i.lemma_names for i in ss.hypernyms() + ss.hyponyms()]))
        if stem:  # Matching exact words causes sparsity, so let's match stems.
            lesk_dictionary = [ps.stem(i) for i in lesk_dictionary]
            context_sentence = [ps.stem(i) for i in context_sentence]
        overlaps = set(lesk_dictionary).intersection(context_sentence)
        if len(overlaps) > max_overlaps:
            lesk_sense = ss
            max_overlaps = len(overlaps)
    return lesk_sense
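A hypothetical call, assuming the WordNet data has been downloaded (nltk.download('wordnet')) and the imports noted above are in scope; the sentence and the word "bank" are purely illustrative:

# Hypothetical usage of the function above.
sense = new_lesk("I went to the bank to deposit my money", "bank", pos="n")
if sense is not None:
    print(sense)  # the synset whose gloss/lemmas overlap most with the context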
Example 2: stem
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def stem(string):
    """Stem a phrase"""
    # Relies on module-level globals defined elsewhere: stemmer, stemdict,
    # unstemdict, bad_unicode() and remove_non_unicode().
    global stemmer
    if not stemmer:
        stemmer = Stemmer()
    #words = string.split()
    #for i in range(len(words)):
    #    words[i] = self.stemmer.stem(words[i])
    # stemming last word only
    #string = self._reGlue(words)
    #
    #string2 = stemmer.stem(string)
    #if string2 not in stemdict:
    #    stemdict[string2] = string
    # FIX ME
    if string not in stemdict:
        if bad_unicode(string):
            ## added A. Meyers 8/28/15
            temp = stemmer.stem(remove_non_unicode(string))
        else:
            temp = stemmer.stem(string)
        if temp:
            stemdict[string] = temp
        if not temp:
            pass
        elif temp not in unstemdict:
            unstemdict[temp] = [string]
        elif string not in unstemdict[temp]:
            unstemdict[temp].append(string)
    else:
        temp = stemdict[string]
    return temp
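The function above depends on module-level state from its original project. For reference, a self-contained sketch of the same memoized-stemming idea, with hypothetical names, could look like this:

from nltk.stem import PorterStemmer

# Hypothetical, self-contained version of the caching pattern above:
# stem each phrase once and remember the mapping in both directions.
_stemmer = PorterStemmer()
_stemdict = {}     # phrase -> stem
_unstemdict = {}   # stem -> list of phrases that produced it

def cached_stem(phrase):
    if phrase in _stemdict:
        return _stemdict[phrase]
    stemmed = _stemmer.stem(phrase)
    _stemdict[phrase] = stemmed
    _unstemdict.setdefault(stemmed, [])
    if phrase not in _unstemdict[stemmed]:
        _unstemdict[stemmed].append(phrase)
    return stemmed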
Example 3: tokenize2
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def tokenize2(str, df_freq):
    # Python 2 code (str.decode); also assumes: import re, string and an updateDF() helper.
    # temp map (for getting the local term frequency)
    temp_map = {}
    # for a sentence
    str = str.decode('ascii', 'ignore')
    #tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
    #tokens = tokenizer.tokenize(str)
    tokens = str.split()
    #print tokens
    stemmer = PorterStemmer()
    # small set of stopwords (remove you, are, and, I, those kinds of words)
    last = []
    #bigram_list = []
    for d in tokens:
        d = d.split('-')
        for c in d:
            c = re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
            # regular expression -> strip punctuation
            if c != '':
                try:
                    if int(c):
                        if len(c) != 4 and (c > 2015 or c < 1900):  # keep years
                            c = stemmer.stem('NUM')
                except Exception:
                    c = stemmer.stem(c.lower())
                last.append(c)
                updateDF(temp_map, df_freq, c)
Example 4: tokenizeTags
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def tokenizeTags(str, dict_items):
    # Python 2 code (str.decode); also assumes: import re, string.
    # for a sentence
    str = str.decode('ascii', 'ignore')
    #tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
    #tokens = tokenizer.tokenize(str)
    tokens = str.split()
    #print tokens
    stemmer = PorterStemmer()
    # small set of stopwords (remove you, are, and, I, those kinds of words)
    last = []
    #bigram_list = []
    for d in tokens:
        d = d.split('-')
        for c in d:
            c = re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
            # regular expression -> strip punctuation
            if c != '' and c not in dict_items:
                try:
                    if int(c):
                        if len(c) != 4 and (c > 2015 or c < 1900):  # keep years
                            c = stemmer.stem('NUM')
                except Exception:
                    c = stemmer.stem(c.lower())
                #c = stemmer.stem(c.lower())
                last.append(c)
                # bigram generation
                #index = len(last)
                #if index > 1:
                #    bigram = last[index-2] + ' ' + last[index-1]
                #    bigram_list.append(bigram)
    return last
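The two tokenizers above are Python 2 code and lean on an external updateDF() helper. For reference, a minimal Python 3 sketch of the shared strip-punctuation-then-stem step (hypothetical function name, no NUM/year handling) would be:

import re
import string
from nltk.stem import PorterStemmer

_punct = re.compile('[%s]' % re.escape(string.punctuation))

def stem_tokens(text):
    """Split on whitespace and hyphens, strip punctuation, and Porter-stem each piece."""
    ps = PorterStemmer()
    out = []
    for token in text.split():
        for piece in token.split('-'):
            piece = _punct.sub('', piece)
            if piece:
                out.append(ps.stem(piece.lower()))
    return out

# stem_tokens("Deep-learning models, tokenized!")
# -> ['deep', 'learn', 'model', 'token']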
Example 5: tokenize2_bigram
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def tokenize2_bigram(str, df_freq):
    # Python 2 code (str.decode); also assumes: import re, string and an updateDF() helper.
    temp_map = {}
    # for a sentence
    str = str.decode('ascii', 'ignore')
    tokens = str.split()
    #print tokens
    stemmer = PorterStemmer()
    last = []
    bigram_list = []
    for d in tokens:
        d = d.split('-')
        for c in d:
            c = re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
            # regular expression -> strip punctuation
            if c != '':
                try:
                    if int(c):
                        if len(c) != 4 and (c > 2015 or c < 1900):  # keep years
                            c = stemmer.stem('NUM')
                except Exception:
                    c = stemmer.stem(c.lower())
                #c = stemmer.stem(c.lower())
                last.append(c)
                # bigram generation: pair the current stem with the previous one
                index = len(last)
                if index > 1:
                    bigram = last[index-2] + ' ' + last[index-1]
                    bigram_list.append(bigram)
                    updateDF(temp_map, df_freq, bigram)
    return bigram_list
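As a side-by-side reference, a self-contained way to build stemmed bigrams (hypothetical helper, Python 3) is to stem first and then zip adjacent stems:

from nltk.stem import PorterStemmer

def stemmed_bigrams(tokens):
    # Stem every token once, then pair each stem with its successor.
    ps = PorterStemmer()
    stems = [ps.stem(t.lower()) for t in tokens]
    return [a + ' ' + b for a, b in zip(stems, stems[1:])]

# stemmed_bigrams("the weather is great".split())
# -> ['the weather', 'weather is', 'is great']  (with NLTK's default settings)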
Example 6: openfile
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def openfile(filename, output):
    # Also assumes: import string, timeit; from nltk.tokenize import word_tokenize;
    # from nltk.corpus import stopwords.
    print(filename)
    # starts run time
    start = timeit.default_timer()
    ps = PorterStemmer()
    file = open(filename, "r")
    tokens = []
    # Used for removing punctuation from the documents
    translate_table = dict((ord(char), None) for char in string.punctuation)
    start2 = timeit.default_timer()
    # splits the lines into words and removes the punctuation
    for line in file:
        tokens += word_tokenize(line.translate(translate_table))
    start3 = timeit.default_timer()
    print("tokenize")
    print(start3 - start2)
    # creates a set of stop words to be removed later
    stop_words = set(stopwords.words("english"))
    start6 = timeit.default_timer()
    # if a word is not a stop word it adds it to a list
    filtered_sentence = []
    for w in tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    start7 = timeit.default_timer()
    print("stop word removal")
    print(start7 - start6)
    startw = timeit.default_timer()
    # stems each word and adds it to the output file in csv form
    f = open(output, 'w')
    iterFilSen = iter(filtered_sentence)
    if output == "documents.csv":
        for w in filtered_sentence:
            if w == "I":
                f.write("\n")
            f.write(ps.stem(w))
            f.write(",")
    else:
        for w in iterFilSen:
            if w == "I":
                f.write("\n")
                # removes the I number W
                next(iterFilSen)
                next(iterFilSen)
            else:
                f.write(ps.stem(w))
                f.write(",")
    # ends run time
    stop = timeit.default_timer()
    print("writing")
    print(stop - startw)
    print("total: " + output)
    print(stop - start)
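A hypothetical invocation; the file names are illustrative and the NLTK punkt tokenizer and stopword lists are assumed to be installed:

# Hypothetical usage; "corpus.txt" and "documents.csv" are illustrative names.
# Requires: nltk.download('punkt'); nltk.download('stopwords')
openfile("corpus.txt", "documents.csv")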
Example 7: StemmedBagOfWordsFeatureGenerator
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
class StemmedBagOfWordsFeatureGenerator(EdgeFeatureGenerator):
    """
    Generates a stemmed bag-of-words representation for each sentence that
    contains an edge, using the function given in the argument. By default it
    uses the Porter stemmer.

    :type feature_set: nala.structures.data.FeatureDictionary
    :type stemmer: nltk.stem.PorterStemmer
    :type stop_words: list[str]
    :type training_mode: bool
    """

    def __init__(self, feature_set, stop_words=[], training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.stemmer = PorterStemmer()
        """an instance of the PorterStemmer"""
        self.stop_words = stop_words
        """a list of stop words"""

    def generate(self, dataset):
        for edge in dataset.edges():
            sentence = edge.part.sentences[edge.sentence_id]
            if self.training_mode:
                for token in sentence:
                    if self.stemmer.stem(token.word) not in self.stop_words \
                            and not token.features['is_punct']:
                        feature_name = '4_bow_stem_' + self.stemmer.stem(token.word) + '_[0]'
                        self.add_to_feature_set(edge, feature_name)
Example 8: IntermediateTokensFeatureGenerator
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
class IntermediateTokensFeatureGenerator(EdgeFeatureGenerator):
    """
    Generate the bag-of-words representation, masked text, stemmed text and
    part-of-speech tag for each of the tokens present between the two entities
    in an edge.

    :param feature_set: the feature set for the dataset
    :type feature_set: nala.structures.data.FeatureDictionary
    :param training_mode: indicates whether the mode is training or testing
    :type training_mode: bool
    """

    def __init__(self, feature_set, training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.stemmer = PorterStemmer()
        """an instance of PorterStemmer"""

    def generate(self, dataset):
        for edge in dataset.edges():
            sentence = edge.part.sentences[edge.sentence_id]
            if edge.entity1.head_token.features['id'] < edge.entity2.head_token.features['id']:
                first = edge.entity1.head_token.features['id']
                second = edge.entity2.head_token.features['id']
                for i in range(first+1, second):
                    token = sentence[i]
                    feature_name = '33_fwd_bow_intermediate_' + token.word + '_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '34_fwd_bow_intermediate_masked_' + token.masked_text(edge.part) + '_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '35_fwd_stem_intermediate_' + self.stemmer.stem(token.word) + '_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '36_fwd_pos_intermediate_' + token.features['pos'] + '_[0]'
                    self.add_to_feature_set(edge, feature_name)
            else:
                first = edge.entity2.head_token.features['id']
                second = edge.entity1.head_token.features['id']
                for i in range(first+1, second):
                    token = sentence[i]
                    feature_name = '37_bkd_bow_intermediate_' + token.word + '_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '38_bkd_bow_intermediate_masked_' + token.masked_text(edge.part) + '_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '39_bkd_stem_intermediate_' + self.stemmer.stem(token.word) + '_[0]'
                    self.add_to_feature_set(edge, feature_name)
                    feature_name = '40_bkd_pos_intermediate_' + token.features['pos'] + '_[0]'
                    self.add_to_feature_set(edge, feature_name)
            # direction-independent features for the same intermediate tokens
            for i in range(first+1, second):
                token = sentence[i]
                feature_name = '41_bow_intermediate_' + token.word + '_[0]'
                self.add_to_feature_set(edge, feature_name)
                feature_name = '42_bow_intermediate_masked_' + token.masked_text(edge.part) + '_[0]'
                self.add_to_feature_set(edge, feature_name)
                feature_name = '43_stem_intermediate_' + self.stemmer.stem(token.word) + '_[0]'
                self.add_to_feature_set(edge, feature_name)
                feature_name = '44_pos_intermediate_' + token.features['pos'] + '_[0]'
                self.add_to_feature_set(edge, feature_name)
Example 9: EntityHeadTokenFeatureGenerator
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
class EntityHeadTokenFeatureGenerator(EdgeFeatureGenerator):
    """
    Calculate the head token for each entity, using a simple heuristic - the
    distance to the root of the sentence.

    If the entity has just one token, that token is the head token. If the
    entity has multiple tokens, the token closest to the root of the sentence
    is the entity head.

    :param feature_set: the feature set for the dataset
    :type feature_set: nala.structures.data.FeatureDictionary
    :param training_mode: whether the mode is training or testing, default True
    :type training_mode: bool
    """

    def __init__(self, feature_set, training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.stemmer = PorterStemmer()
        """an instance of the PorterStemmer"""

    def generate(self, dataset):
        for edge in dataset.edges():
            entity1 = edge.entity1
            entity2 = edge.entity2
            self.named_entity_count('entity1_', entity1.class_id, edge)
            self.named_entity_count('entity2_', entity2.class_id, edge)
            # head-token text, its stem, and the suffix left over after stemming
            entity1_stem = self.stemmer.stem(entity1.head_token.word)
            entity1_non_stem = entity1.head_token.word[len(entity1_stem):]
            entity2_stem = self.stemmer.stem(entity2.head_token.word)
            entity2_non_stem = entity2.head_token.word[len(entity2_stem):]
            feature_name_1_1 = '7_entity1_txt_' + entity1.head_token.word + '_[0]'
            feature_name_2_1 = '7_entity2_txt_' + entity2.head_token.word + '_[0]'
            feature_name_1_2 = '8_entity1_pos_' + entity1.head_token.features['pos'] + '_[0]'
            feature_name_2_2 = '8_entity2_pos_' + entity2.head_token.features['pos'] + '_[0]'
            feature_name_1_3 = '9_entity1_stem_' + entity1_stem + '_[0]'
            feature_name_2_3 = '9_entity2_stem_' + entity2_stem + '_[0]'
            feature_name_1_4 = '10_entity1_nonstem_' + entity1_non_stem + '_[0]'
            feature_name_2_4 = '10_entity2_nonstem_' + entity2_non_stem + '_[0]'
            self.add_to_feature_set(edge, feature_name_1_1)
            self.add_to_feature_set(edge, feature_name_2_1)
            self.add_to_feature_set(edge, feature_name_1_2)
            self.add_to_feature_set(edge, feature_name_2_2)
            self.add_to_feature_set(edge, feature_name_1_3)
            self.add_to_feature_set(edge, feature_name_2_3)
            self.add_to_feature_set(edge, feature_name_1_4)
            self.add_to_feature_set(edge, feature_name_2_4)

    def named_entity_count(self, prefix, entity_type, edge):
        entities = edge.part.get_entities_in_sentence(edge.sentence_id, entity_type)
        feature_name = '1_' + prefix + entity_type + '_count_[' + str(len(entities)) + ']'
        self.add_to_feature_set(edge, feature_name)
Example 10: Indexer
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
class Indexer():
    # Python 2 code (unicode, iteritems); also assumes: import string;
    # from nltk.corpus import stopwords; from nltk.tokenize import word_tokenize.
    def __init__(self, rem_punc=True, rem_stop=True):
        self.rem_punc = rem_punc
        self.rem_stop = rem_stop
        self.stoplist = stopwords.words('english')
        self.punctuation = list(string.punctuation)
        self.token_dict = dict()
        self.pst = PorterStemmer()
        self.postings_list = dict()

    def get_pages(self):
        with open('./data/ucl', 'r') as ifile:
            contents = ifile.read()
            for page in contents.split('visited:'):
                self.parse_page(page)

    def parse_page(self, page):
        page = unicode(page, errors='ignore')
        lines = page.strip().split()
        if len(lines) > 2:
            title = lines[1]
            # tokenize and make lowercase
            tokens = [word.lower() for word in word_tokenize(str(lines[2:]))]
            # remove punctuation
            if self.rem_punc:
                tokens = [word for word in tokens if word not in self.punctuation]
            # remove stopwords
            if self.rem_stop:
                tokens = [word for word in tokens if word not in self.stoplist]
            # stem (Porter stemmer)
            tokens = [self.pst.stem(word) for word in tokens]
            # add to dictionary
            self.add_to_token_dict(title, tokens[3:])

    def add_to_token_dict(self, title, tokens):
        if tokens:
            words = dict()
            for token in tokens[1:]:
                key = self.pst.stem(token.lower())
                if key in self.token_dict:
                    self.token_dict[key] += 1
                else:
                    self.token_dict[key] = 1
                if key in words:
                    words[key] += 1
                else:
                    words[key] = 1
            self.postings_list[title] = [(k, v) for k, v in words.iteritems()]
Example 11: testing
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def testing():
    # - tokenize on sentence and word
    ex_txt = "hello there Mr. Bartuska, How are you? The weather is great and I enjoy Python. cheers!"
    print(sent_tokenize(ex_txt))
    print(word_tokenize(ex_txt, language='english'))
    # - stop words (pre-defined by nltk)
    stop_words = set(stopwords.words('english'))
    print(stop_words)
    words = word_tokenize(ex_txt)
    print(words)
    filtered_sent = []
    for w in words:
        if w not in stop_words:
            filtered_sent.append(w)
    print(filtered_sent)
    filtered_sent = [w for w in words if w not in stop_words]
    print(filtered_sent)
    # - stemming
    ps = PorterStemmer()
    example_words = ['python', 'pythoner', 'pythoning', 'pythoned', 'pythonly']
    # for w in example_words:
    #     print(ps.stem(w))
    new_text = "it is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
    words = word_tokenize(new_text)
    for w in words:
        print(ps.stem(w))
Example 12: Stemmer
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
class Stemmer(SentenceProcesser):
    def __init__(self):
        self.stemmer = PorterStemmer()

    def process(self, sentence):
        # attach the Porter stem of each word's surface form to the word object
        for word in sentence.words:
            word.stem = self.stemmer.stem(word.content)
        return sentence
Example 13: parseTranscript
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def parseTranscript(transcript):
    # Also assumes: from nltk.tokenize import wordpunct_tokenize and a Transcript namedtuple.
    assert isinstance(transcript, Transcript), \
        "transcript must be stored in custom namedtuple, not {}".format(type(transcript))
    text = transcript.prepared.append(transcript.QandA)
    id = "{ticker}-{year}-{month}-{day}".format(ticker=transcript.ticker.split(':')[-1],
                                                year=transcript.date.year,
                                                month=transcript.date.month,
                                                day=transcript.date.day)
    tokenizer = wordpunct_tokenize
    stemmer = PorterStemmer()
    index = dict()
    pos = 0
    for row in text:
        # map each stemmed token to the document id and its positions
        for i, token in enumerate(tokenizer(row.lower())):
            token = stemmer.stem(token)
            if token not in index and '|' not in token:
                index[token] = [id, [str(pos + i)]]
            elif '|' not in token:
                index[token][-1].append(str(pos + i))
        try:
            pos += (i + 1)
        except:
            pass
    return index
Example 14: preprocessing
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def preprocessing(text, debug=False):
    # Python 2 code (print statements); also assumes a local `replacers` module
    # providing RegexpReplacer for contraction expansion.
    if debug:
        print text
    # lower case
    text = text.lower()
    if debug:
        print text
    # can't -> cannot, bya's -> bya is
    text = replacers.RegexpReplacer().replace(text)
    if debug:
        print text
    # word tokenize
    words = word_tokenize(text)
    if debug:
        print words
    # removing stopwords
    english_stops = set(stopwords.words('english'))
    english_stops_added = english_stops | {'.', ',', ':', ';'}
    words = [word for word in words if word not in english_stops_added]
    if debug:
        print words
    # stemming words
    stemmer = PorterStemmer()
    words_stemmed = list(map(lambda word: stemmer.stem(word), words))
    if debug:
        print words_stemmed
    return words, words_stemmed
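A hypothetical call, shown as comments because the snippet above is Python 2 and needs the local `replacers` module plus the NLTK punkt and stopword data; the input string is purely illustrative:

# Hypothetical usage (Python 2, matching the snippet above):
# words, stems = preprocessing("I can't believe it isn't stemming correctly!", debug=True)
# `words` holds the filtered tokens, `stems` their Porter stems.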
Example 15: parseReviews
# Required import: from nltk.stem import PorterStemmer [as alias]
# Or: from nltk.stem.PorterStemmer import stem [as alias]
def parseReviews(mypath):
    # Also assumes: import os; from nltk.corpus import stopwords;
    # from nltk.tokenize import word_tokenize.
    filelist = os.listdir(mypath)
    wordDict = {}
    negationList = ["no", "not", "never", "can't", "won't", "cannot", "didn't", "couldn't"]
    negationFlag = False
    stopwordList = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    for file in filelist:
        with open(mypath + "/" + file, "r") as f:
            word_list = word_tokenize(f.read())
            for word in word_list:
                if word in negationList:
                    # double negative
                    if negationFlag:
                        negationFlag = False
                    else:
                        negationFlag = True
                    continue
                if not word.isalnum():
                    negationFlag = False
                if word.isalnum() and word not in stopwordList:
                    word = stemmer.stem(word)
                    if negationFlag:
                        word = "!" + word
                        negationFlag = False
                    if word not in wordDict:
                        wordDict[word] = 1
                    else:
                        wordDict[word] += 1
    return wordDict
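A hypothetical call, with an illustrative directory of plain-text review files (NLTK punkt and stopword data assumed installed):

# Hypothetical usage; "reviews/pos" is an illustrative path containing text files.
counts = parseReviews("reviews/pos")
# counts maps each (possibly negation-prefixed) stem to its frequency,
# e.g. {'great': 12, '!good': 3, ...} -- values depend entirely on the input files.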