This article collects typical usage examples of the Python method nltk.stem.snowball.SnowballStemmer.stem. If you have been wondering what SnowballStemmer.stem does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore further usage examples of the containing class, nltk.stem.snowball.SnowballStemmer.
The following shows 15 code examples of SnowballStemmer.stem, drawn from open-source projects and ordered by popularity.
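Before the examples, a minimal self-contained sketch of the basic API may help orient readers (it assumes only that NLTK is installed; exact stems can vary slightly across NLTK versions):

from nltk.stem.snowball import SnowballStemmer

# the class exposes the tuple of supported Snowball languages
print(SnowballStemmer.languages)

stemmer = SnowballStemmer("english")
for word in ("running", "generously", "cats"):
    print(word, "->", stemmer.stem(word))  # e.g. running -> run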
Example 1: test_spanish
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def test_spanish(self):
    stemmer = SnowballStemmer('spanish')
    assert stemmer.stem("Visionado") == 'vision'
    # The word 'algue' used to raise an IndexError
    assert stemmer.stem("algue") == 'algu'
Example 2: classify
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def classify(self, sText):
    """Given a target string sText, this function returns the most likely document
    class to which the target string belongs (i.e., positive or negative).
    """
    tokens = self.tokenize(sText)
    posProbability, negProbability = 0, 0
    posNum, negNum = float(sum(self.pos_dic.values())), float(sum(self.neg_dic.values()))
    stemmer = SnowballStemmer("english")
    for i in range(len(tokens) - 1):
        if not isPunctuationMark(tokens[i]):
            unigram = stemmer.stem(tokens[i])
            second_word = stemmer.stem(tokens[i + 1])
            try:
                bigram = unigram + " " + second_word
            except UnicodeDecodeError:
                continue
            # add-one smoothing; taking logs avoids underflow
            posProbability += math.log(float(self.pos_dic.get(bigram, 0) + 1) / posNum)
            posProbability += math.log(float(self.pos_dic.get(unigram, 0) + 1) / posNum)
            negProbability += math.log(float(self.neg_dic.get(bigram, 0) + 1) / negNum)
            negProbability += math.log(float(self.neg_dic.get(unigram, 0) + 1) / negNum)
    if tokens:
        # the loop stops one token short, so score the final token separately
        posProbability += math.log(float(self.pos_dic.get(tokens[-1], 0) + 1) / posNum)
        negProbability += math.log(float(self.neg_dic.get(tokens[-1], 0) + 1) / negNum)
    if posProbability > negProbability:
        return "positive"
    else:
        return "negative"
Example 3: main
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def main():
    parser = argparse.ArgumentParser(description='Evaluate translation hypotheses.')
    parser.add_argument('-i', '--input', default=baseline_path + 'data/hyp1-hyp2-ref',
                        help='input file (default data/hyp1-hyp2-ref)')
    parser.add_argument('-n', '--num_sentences', default=None, type=int,
                        help='Number of hypothesis pairs to evaluate')
    # note that if x == [1, 2, 3], then x[:None] == x[:] == x (copy); no need for sys.maxint
    opts = parser.parse_args()

    # we create a generator and avoid loading all sentences into a list
    def sentences():
        with open(opts.input) as f:
            for pair in f:
                yield [sentence.strip().split() for sentence in pair.split(' ||| ')]

    english_stemmer = SnowballStemmer("english")
    # note: the -n option does not work in the original code
    for h1, h2, ref in islice(sentences(), opts.num_sentences):
        # perform morphological stemming before calculating the METEOR score
        h1 = [english_stemmer.stem(word) for word in h1]
        h2 = [english_stemmer.stem(word) for word in h2]
        ref = [english_stemmer.stem(word) for word in ref]
        rset = set(ref)
        h1_match = meteor(h1, rset)
        h2_match = meteor(h2, rset)
        print(1 if h1_match > h2_match else
              (0 if h1_match == h2_match else -1))
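The meteor function is defined elsewhere in the original script. As a stand-in, a crude unigram-recall matcher over the stemmed reference set might look like this (an illustrative sketch, not the real METEOR metric):

from nltk.stem.snowball import SnowballStemmer

def unigram_recall(hypothesis, ref_set):
    # fraction of reference tokens covered by the hypothesis
    return len(ref_set & set(hypothesis)) / max(len(ref_set), 1)

stemmer = SnowballStemmer("english")
ref = [stemmer.stem(w) for w in "the cats are running".split()]
h1 = [stemmer.stem(w) for w in "a cat runs".split()]
print(unigram_recall(h1, set(ref)))  # 0.5: 'cat' and 'run' match after stemming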
Example 4: pos_tokenizer
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def pos_tokenizer(s):  # define a tokenizer that uses POS tagging
    texts = nltk.word_tokenize(s)
    texts = [word for word in texts if len(word) > 2]
    # pull out noun and verb phrases
    chunktext = nltk.pos_tag(texts)
    patterns = """
        VP:{<V.*><DT>?<JJ.*>?<NN.*>}
        NP:{<DT>?<JJ>*<NN.*>}
        N:{<NN.*>}
        """
    NPchunker = nltk.RegexpParser(patterns)
    from nltk.stem.snowball import SnowballStemmer
    st = SnowballStemmer('english')
    temp = []
    result = NPchunker.parse(chunktext)
    for phrase in result:
        try:
            phrase.label()  # only chunked subtrees have a label
            string = ''
            m = 0
            for word in phrase:
                if m == 0:
                    string += st.stem(word[0])
                    m += 1
                else:
                    string += ' ' + st.stem(word[0])
            temp.append(string)
        except AttributeError:
            pass  # plain (word, tag) leaves are skipped
    return temp
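Running pos_tokenizer requires the NLTK tokenizer and tagger models. A usage sketch (the download names are the standard NLTK data packages, though requirements can vary by NLTK version, and the output shown is illustrative):

import nltk
nltk.download('punkt')                       # word_tokenize model
nltk.download('averaged_perceptron_tagger')  # pos_tag model

print(pos_tokenizer("The hungry dogs chased the squirrels"))
# e.g. ['the hungri dog', 'chase the squirrel']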
Example 5: __init__
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
class WordCount:
    def __init__(self, language):
        self.stopwords = self.load_stopwords(language)
        self.parse_regexp = re.compile(r"([0-9]*[\w][\w0-9]+)", re.UNICODE)
        self.current_stemmer = SnowballStemmer(language)

    @staticmethod
    def load_stopwords(language):
        stoplist = []
        if language == 'english':
            with codecs.open('geomedia' + os.sep + 'en_stoplist.txt', "r", "utf-8") as f:
                stoplist = [line.rstrip() for line in f]
        else:
            # download('stopwords')
            stoplist = stopwords.words(language)
        return stoplist

    def parse_text(self, text, wordcount_dictionary=None):
        """
        >>> wordcount = WordCount() #doctest: +ELLIPSIS
        [nltk_data] ...
        >>> wordcount.parse_text("a1a ma kota")
        {'ma': 1, 'a1a': 1, 'kota': 1}
        >>> wordcount.parse_text("a1a ma kota", {'a1a': 2, 'kota': 1})
        {'ma': 1, 'a1a': 3, 'kota': 2}
        """
        if wordcount_dictionary is None:
            wordcount_dictionary = {}
        words = self.parse_regexp.findall(text)
        for word in words:
            new_word = self.current_stemmer.stem(word.lower())
            if word not in self.stopwords and new_word not in self.stopwords:
                if new_word in wordcount_dictionary:
                    wordcount_dictionary[new_word] += 1
                else:
                    wordcount_dictionary[new_word] = 1
        return wordcount_dictionary

    def parse_text_extra(self, text, wordcount_dictionary=None, extras=None):
        if wordcount_dictionary is None:
            wordcount_dictionary = {}
        if extras is None:  # guard extras separately from wordcount_dictionary
            extras = {}
        words = self.parse_regexp.findall(text)
        for word in words:
            new_word = self.current_stemmer.stem(word.lower())
            word = word.lower()
            if word not in self.stopwords and new_word not in self.stopwords:
                if new_word in wordcount_dictionary:
                    wordcount_dictionary[new_word] += 1
                    if word in extras[new_word]:
                        extras[new_word][word] += 1
                    else:
                        extras[new_word][word] = 1
                else:
                    wordcount_dictionary[new_word] = 1
                    extras[new_word] = {}
                    extras[new_word][word] = 1
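The stem-and-count core of parse_text can be exercised without the class or its stoplist file; a standalone sketch with a toy stoplist (the real class loads en_stoplist.txt or the NLTK stopwords corpus):

import re
from nltk.stem.snowball import SnowballStemmer

stoplist = {"the", "a"}  # toy stand-in for the class's stoplist
stemmer = SnowballStemmer("english")
counts = {}
for word in re.findall(r"([0-9]*[\w][\w0-9]+)", "The cat chases the cats", re.UNICODE):
    stem = stemmer.stem(word.lower())
    if word.lower() not in stoplist and stem not in stoplist:
        counts[stem] = counts.get(stem, 0) + 1
print(counts)  # {'cat': 2, 'chase': 1}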
Example 6: stem_snowball
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def stem_snowball(tokens):
    stemmer = SnowballStemmer("russian")
    if isinstance(tokens, str):  # a single token (str replaces Python 2's basestring)
        return stemmer.stem(tokens)
    else:
        stemmed = [stemmer.stem(token) for token in tokens]
        return stemmed
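A quick usage check (the stems shown in the comments are typical Snowball output for Russian, but treat them as illustrative):

print(stem_snowball("бегущие"))            # e.g. 'бегущ'
print(stem_snowball(["кошки", "собаки"]))  # e.g. ['кошк', 'собак']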
Example 7: stem
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def stem(tokens):  # takes an iterable of tokens (avoids shadowing the built-in list)
    stemmer = SnowballStemmer('english')
    stemmed_tokens = []
    for x in tokens:
        stemmed_tokens.append(stemmer.stem(x))
        # build the dictionary mapping stems to the original terms
        terms_dictionary.update_terms_dictionary(stemmer.stem(x), x)
    return stemmed_tokens
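The terms_dictionary object lives elsewhere in the original project. The underlying idea, a reverse map from each stem to its original surface forms, can be sketched with a plain defaultdict (a hypothetical stand-in for update_terms_dictionary):

from collections import defaultdict
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
stem_to_terms = defaultdict(set)  # stand-in for terms_dictionary
for token in ["running", "runs", "runner"]:
    stem_to_terms[stemmer.stem(token)].add(token)
print(dict(stem_to_terms))  # e.g. {'run': {'running', 'runs'}, 'runner': {'runner'}}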
Example 8: test_german
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def test_german(self):
    stemmer_german = SnowballStemmer("german")
    stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)
    assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
    assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'
    assert stemmer_german.stem("keinen") == 'kein'
    # with ignore_stopwords=True, German stopwords such as "keinen" are left unstemmed
    assert stemmer_german2.stem("keinen") == 'keinen'
Example 9: extract_bigrams
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def extract_bigrams(articleList, commentCount):
    featureMatrix = np.zeros([commentCount, 100])

    index = 0
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    bagOfWords = []
    for art in articleList.items():
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # remove stopwords
            filtered_words = [w for w in mywords if w not in stopwords.words('english')]
            # stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bagOfWords += stemmed_words
            bagOfWords.append("\n")

    tempVector = dict()

    # create the bigrams and keep the 100 most frequent as features
    bgs = nltk.bigrams(bagOfWords)
    fdist = nltk.FreqDist(bgs)
    for k, _ in fdist.most_common(100):
        tempVector[k] = 0

    theKeys = list(tempVector.keys())

    for art in articleList.items():
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # remove stopwords
            filtered_words = [w for w in mywords if w not in stopwords.words('english')]
            # stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]

            bgs = nltk.bigrams(stemmed_words)
            for word in (w for w in bgs if w in tempVector):
                keyInd = theKeys.index(word)
                featureMatrix[index][keyInd] += 1

            index += 1
            if index % 100 == 0:
                print("extracted", index, "features")
            if index >= commentCount:
                break

    print("non-zero", np.count_nonzero(featureMatrix))
    print("Percentage filled: %.2f" % (float(np.count_nonzero(featureMatrix)) / (featureMatrix.shape[0] * featureMatrix.shape[1])))
    return featureMatrix
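The bigram-counting core of the function, isolated into a small runnable sketch (the input text is made up):

import nltk
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
stems = [stemmer.stem(t) for t in "dogs chased cats dogs chased cats".split()]
fdist = nltk.FreqDist(nltk.bigrams(stems))
print(fdist.most_common(2))  # [(('dog', 'chase'), 2), (('chase', 'cat'), 2)]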
Example 10: get_unigram_feats
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def get_unigram_feats(document):
    document_words = set(document.split())
    s = SnowballStemmer("english")
    stemmed_words = [s.stem(word) for word in document_words]
    features = {}
    # features['count'] = len(document_words)
    for word in data.wordlist:
        word = s.stem(word)
        features['contains({})'.format(word)] = (word in stemmed_words)
    return features
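data.wordlist comes from elsewhere in that project; with a toy wordlist, the same feature construction can be run standalone:

from nltk.stem.snowball import SnowballStemmer

s = SnowballStemmer("english")
wordlist = ["run", "jumping"]  # toy stand-in for data.wordlist
stemmed = {s.stem(w) for w in "the cats were running".split()}
features = {'contains({})'.format(s.stem(w)): s.stem(w) in stemmed for w in wordlist}
print(features)  # {'contains(run)': True, 'contains(jump)': False}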
Example 11: highestFrequency
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def highestFrequency(quesWords, sentWords):
    stemmer = SnowballStemmer("english")
    match = 0
    nonMatch = 0
    for qw in quesWords:
        for aw in sentWords:
            if stemmer.stem(qw) == stemmer.stem(aw):
                match += 1
            else:
                nonMatch += 1
    return match
Example 12: jaccardDistance
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def jaccardDistance(quesWords, sentWords):
    # note: despite the name, this returns the raw match count, not a Jaccard distance
    stemmer = SnowballStemmer("english")
    match = 0
    nonMatch = 0
    for qw in quesWords:
        for aw in sentWords:
            if stemmer.stem(qw) == stemmer.stem(aw):
                match += 1
            else:
                nonMatch += 1
    return match
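An actual Jaccard similarity over stemmed token sets would look something like this (a sketch, not the original project's code):

from nltk.stem.snowball import SnowballStemmer

def jaccard_similarity(ques_words, sent_words):
    stemmer = SnowballStemmer("english")
    q = {stemmer.stem(w) for w in ques_words}
    s = {stemmer.stem(w) for w in sent_words}
    # intersection over union of the stemmed sets
    return len(q & s) / len(q | s) if q | s else 0.0

print(jaccard_similarity(["cats", "running"], ["cat", "runs", "fast"]))  # 0.666...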
Example 13: preProcessing
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def preProcessing(bitext):
    # transfer to lower case (the comprehension rebinds the local name)
    bitext = [[[x.lower() for x in sent] for sent in bisent] for bisent in bitext]
    # one stemmer per side of the bitext
    e_stemmer = SnowballStemmer("german")
    f_stemmer = SnowballStemmer("english")
    for (n, (f, e)) in enumerate(bitext):
        for idx, f_i in enumerate(f):
            f[idx] = f_stemmer.stem(f_i)
        for idx, e_i in enumerate(e):
            e[idx] = e_stemmer.stem(e_i)
    return bitext  # callers need the return value, since the input lists are not mutated
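A toy run (the nested-list bitext format is inferred from how the function indexes it, and the stems in the comment are illustrative):

bitext = [[["houses", "running"], ["Häuser", "laufen"]]]
print(preProcessing(bitext))  # e.g. [[['hous', 'run'], ['haus', 'lauf']]]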
Example 14: process_missing
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def process_missing(missing, sec):
    st = SnowballStemmer('english')
    morphological_errors = 0
    for m in missing:
        ind = sec['incorrect'].index(m)
        prediction = sec['predicted'][ind]
        if st.stem(m[3]) == st.stem(prediction[0]):
            morphological_errors += 1
            print('the correct sequence is: ' + str(m) + ' but predicted: ' + str(prediction))
    print('morphological errors: ' + str(morphological_errors))
    if len(missing):
        print('percentage: ' + str(morphological_errors / len(missing)))
Example 15: trigram
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def trigram(self, term):
    x, y, z = term
    stemmer = SnowballStemmer("english")
    x = stemmer.stem(x)
    y = stemmer.stem(y)
    z = stemmer.stem(z)
    label = x + y + z
    new_column = []
    for words_stem in self.stemwords:
        if x in words_stem and y in words_stem and z in words_stem:
            new_column.append('True')
        else:
            new_column.append('False')
    self.dataframegenerator(new_column, label)