This article collects typical usage examples of the Python method nltk.stem.snowball.SnowballStemmer.stem. If you have been wondering what SnowballStemmer.stem does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore further usage examples of the containing class, nltk.stem.snowball.SnowballStemmer.
The following shows 15 code examples of SnowballStemmer.stem, drawn from open-source projects and ordered by popularity.
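Before the examples, a minimal self-contained sketch of the basic API may help orient readers (it assumes only that NLTK is installed; exact stems can vary slightly across NLTK versions):

from nltk.stem.snowball import SnowballStemmer

# the class exposes the tuple of supported Snowball languages
print(SnowballStemmer.languages)

stemmer = SnowballStemmer("english")
for word in ("running", "generously", "cats"):
    print(word, "->", stemmer.stem(word))  # e.g. running -> run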
Example 1: test_spanish
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def test_spanish(self):
    stemmer = SnowballStemmer('spanish')
    assert stemmer.stem("Visionado") == 'vision'
    # The word 'algue' used to raise an IndexError
    assert stemmer.stem("algue") == 'algu'
Example 2: classify
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def classify(self, sText):
    """Given a target string sText, this function returns the most likely document
    class to which the target string belongs (i.e., positive or negative).
    """
    tokens = self.tokenize(sText)
    posProbability, negProbability = 0, 0
    posNum, negNum = float(sum(self.pos_dic.values())), float(sum(self.neg_dic.values()))
    stemmer = SnowballStemmer("english")
    for i in range(len(tokens) - 1):
        if not isPunctuationMark(tokens[i]):
            unigram = stemmer.stem(tokens[i])
            second_word = stemmer.stem(tokens[i + 1])
            try:
                bigram = unigram + " " + second_word
            except UnicodeDecodeError:
                continue
            # add-one smoothing; taking logs avoids underflow
            posProbability += math.log(float(self.pos_dic.get(bigram, 0) + 1) / posNum)
            posProbability += math.log(float(self.pos_dic.get(unigram, 0) + 1) / posNum)
            negProbability += math.log(float(self.neg_dic.get(bigram, 0) + 1) / negNum)
            negProbability += math.log(float(self.neg_dic.get(unigram, 0) + 1) / negNum)
    if tokens:
        # the loop stops one token short, so score the final token separately
        posProbability += math.log(float(self.pos_dic.get(tokens[-1], 0) + 1) / posNum)
        negProbability += math.log(float(self.neg_dic.get(tokens[-1], 0) + 1) / negNum)
    if posProbability > negProbability:
        return "positive"
    else:
        return "negative"
Example 3: main
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def main():
    parser = argparse.ArgumentParser(description='Evaluate translation hypotheses.')
    parser.add_argument('-i', '--input', default=baseline_path + 'data/hyp1-hyp2-ref',
                        help='input file (default data/hyp1-hyp2-ref)')
    parser.add_argument('-n', '--num_sentences', default=None, type=int,
                        help='Number of hypothesis pairs to evaluate')
    # note that if x == [1, 2, 3], then x[:None] == x[:] == x (copy); no need for sys.maxint
    opts = parser.parse_args()

    # we create a generator and avoid loading all sentences into a list
    def sentences():
        with open(opts.input) as f:
            for pair in f:
                yield [sentence.strip().split() for sentence in pair.split(' ||| ')]

    english_stemmer = SnowballStemmer("english")
    # note: the -n option does not work in the original code
    for h1, h2, ref in islice(sentences(), opts.num_sentences):
        # perform morphological stemming before calculating the METEOR score
        h1 = [english_stemmer.stem(word) for word in h1]
        h2 = [english_stemmer.stem(word) for word in h2]
        ref = [english_stemmer.stem(word) for word in ref]
        rset = set(ref)
        h1_match = meteor(h1, rset)
        h2_match = meteor(h2, rset)
        print(1 if h1_match > h2_match else
              (0 if h1_match == h2_match else -1))
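The meteor function is defined elsewhere in the original script. As a stand-in, a crude unigram-recall matcher over the stemmed reference set might look like this (an illustrative sketch, not the real METEOR metric):

from nltk.stem.snowball import SnowballStemmer

def unigram_recall(hypothesis, ref_set):
    # fraction of reference tokens covered by the hypothesis
    return len(ref_set & set(hypothesis)) / max(len(ref_set), 1)

stemmer = SnowballStemmer("english")
ref = [stemmer.stem(w) for w in "the cats are running".split()]
h1 = [stemmer.stem(w) for w in "a cat runs".split()]
print(unigram_recall(h1, set(ref)))  # 0.5: 'cat' and 'run' match after stemming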
Example 4: pos_tokenizer
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def pos_tokenizer(s):  # define a tokenizer that uses POS tagging
    texts = nltk.word_tokenize(s)
    texts = [word for word in texts if len(word) > 2]
    # pull out noun and verb phrases
    chunktext = nltk.pos_tag(texts)
    patterns = """
        VP:{<V.*><DT>?<JJ.*>?<NN.*>}
        NP:{<DT>?<JJ>*<NN.*>}
        N:{<NN.*>}
        """
    NPchunker = nltk.RegexpParser(patterns)
    from nltk.stem.snowball import SnowballStemmer
    st = SnowballStemmer('english')
    temp = []
    result = NPchunker.parse(chunktext)
    for phrase in result:
        try:
            phrase.label()  # only chunked subtrees have a label
            string = ''
            m = 0
            for word in phrase:
                if m == 0:
                    string += st.stem(word[0])
                    m += 1
                else:
                    string += ' ' + st.stem(word[0])
            temp.append(string)
        except AttributeError:
            pass  # plain (word, tag) leaves are skipped
    return temp
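Running pos_tokenizer requires the NLTK tokenizer and tagger models. A usage sketch (the download names are the standard NLTK data packages, though requirements can vary by NLTK version, and the output shown is illustrative):

import nltk
nltk.download('punkt')                       # word_tokenize model
nltk.download('averaged_perceptron_tagger')  # pos_tag model

print(pos_tokenizer("The hungry dogs chased the squirrels"))
# e.g. ['the hungri dog', 'chase the squirrel']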
Example 5: __init__
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
class WordCount:
    def __init__(self, language):
        self.stopwords = self.load_stopwords(language)
        self.parse_regexp = re.compile(r"([0-9]*[\w][\w0-9]+)", re.UNICODE)
        self.current_stemmer = SnowballStemmer(language)

    @staticmethod
    def load_stopwords(language):
        stoplist = []
        if language == 'english':
            with codecs.open('geomedia' + os.sep + 'en_stoplist.txt', "r", "utf-8") as f:
                stoplist = [line.rstrip() for line in f]
        else:
            # download('stopwords')
            stoplist = stopwords.words(language)
        return stoplist

    def parse_text(self, text, wordcount_dictionary=None):
        """
        >>> wordcount = WordCount() #doctest: +ELLIPSIS
        [nltk_data] ...
        >>> wordcount.parse_text("a1a ma kota")
        {'ma': 1, 'a1a': 1, 'kota': 1}
        >>> wordcount.parse_text("a1a ma kota", {'a1a': 2, 'kota': 1})
        {'ma': 1, 'a1a': 3, 'kota': 2}
        """
        if wordcount_dictionary is None:
            wordcount_dictionary = {}
        words = self.parse_regexp.findall(text)
        for word in words:
            new_word = self.current_stemmer.stem(word.lower())
            if word not in self.stopwords and new_word not in self.stopwords:
                if new_word in wordcount_dictionary:
                    wordcount_dictionary[new_word] += 1
                else:
                    wordcount_dictionary[new_word] = 1
        return wordcount_dictionary

    def parse_text_extra(self, text, wordcount_dictionary=None, extras=None):
        if wordcount_dictionary is None:
            wordcount_dictionary = {}
        if extras is None:  # guard extras separately from wordcount_dictionary
            extras = {}
        words = self.parse_regexp.findall(text)
        for word in words:
            new_word = self.current_stemmer.stem(word.lower())
            word = word.lower()
            if word not in self.stopwords and new_word not in self.stopwords:
                if new_word in wordcount_dictionary:
                    wordcount_dictionary[new_word] += 1
                    if word in extras[new_word]:
                        extras[new_word][word] += 1
                    else:
                        extras[new_word][word] = 1
                else:
                    wordcount_dictionary[new_word] = 1
                    extras[new_word] = {}
                    extras[new_word][word] = 1
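The stem-and-count core of parse_text can be exercised without the class or its stoplist file; a standalone sketch with a toy stoplist (the real class loads en_stoplist.txt or the NLTK stopwords corpus):

import re
from nltk.stem.snowball import SnowballStemmer

stoplist = {"the", "a"}  # toy stand-in for the class's stoplist
stemmer = SnowballStemmer("english")
counts = {}
for word in re.findall(r"([0-9]*[\w][\w0-9]+)", "The cat chases the cats", re.UNICODE):
    stem = stemmer.stem(word.lower())
    if word.lower() not in stoplist and stem not in stoplist:
        counts[stem] = counts.get(stem, 0) + 1
print(counts)  # {'cat': 2, 'chase': 1}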
Example 6: stem_snowball
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def stem_snowball(tokens):
    stemmer = SnowballStemmer("russian")
    if isinstance(tokens, str):  # a single token (str replaces Python 2's basestring)
        return stemmer.stem(tokens)
    else:
        stemmed = [stemmer.stem(token) for token in tokens]
        return stemmed
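A quick usage check (the stems shown in the comments are typical Snowball output for Russian, but treat them as illustrative):

print(stem_snowball("бегущие"))            # e.g. 'бегущ'
print(stem_snowball(["кошки", "собаки"]))  # e.g. ['кошк', 'собак']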
Example 7: stem
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def stem(tokens):  # takes an iterable of tokens (avoids shadowing the built-in list)
    stemmer = SnowballStemmer('english')
    stemmed_tokens = []
    for x in tokens:
        stemmed_tokens.append(stemmer.stem(x))
        # build the dictionary mapping stems to the original terms
        terms_dictionary.update_terms_dictionary(stemmer.stem(x), x)
    return stemmed_tokens
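The terms_dictionary object lives elsewhere in the original project. The underlying idea, a reverse map from each stem to its original surface forms, can be sketched with a plain defaultdict (a hypothetical stand-in for update_terms_dictionary):

from collections import defaultdict
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
stem_to_terms = defaultdict(set)  # stand-in for terms_dictionary
for token in ["running", "runs", "runner"]:
    stem_to_terms[stemmer.stem(token)].add(token)
print(dict(stem_to_terms))  # e.g. {'run': {'running', 'runs'}, 'runner': {'runner'}}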
Example 8: test_german
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def test_german(self):
    stemmer_german = SnowballStemmer("german")
    stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)
    assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
    assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'
    assert stemmer_german.stem("keinen") == 'kein'
    # with ignore_stopwords=True, German stopwords such as "keinen" are left unstemmed
    assert stemmer_german2.stem("keinen") == 'keinen'
Example 9: extract_bigrams
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def extract_bigrams(articleList, commentCount):
    featureMatrix = np.zeros([commentCount, 100])

    index = 0
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    bagOfWords = []
    for art in articleList.items():
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # remove stopwords
            filtered_words = [w for w in mywords if w not in stopwords.words('english')]
            # stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bagOfWords += stemmed_words
            bagOfWords.append("\n")

    tempVector = dict()

    # create the bigrams and keep the 100 most frequent as features
    bgs = nltk.bigrams(bagOfWords)
    fdist = nltk.FreqDist(bgs)
    for k, _ in fdist.most_common(100):
        tempVector[k] = 0

    theKeys = list(tempVector.keys())

    for art in articleList.items():
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # remove stopwords
            filtered_words = [w for w in mywords if w not in stopwords.words('english')]
            # stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]

            bgs = nltk.bigrams(stemmed_words)
            for word in (w for w in bgs if w in tempVector):
                keyInd = theKeys.index(word)
                featureMatrix[index][keyInd] += 1

            index += 1
            if index % 100 == 0:
                print("extracted", index, "features")
            if index >= commentCount:
                break

    print("non-zero", np.count_nonzero(featureMatrix))
    print("Percentage filled: %.2f" % (float(np.count_nonzero(featureMatrix)) / (featureMatrix.shape[0] * featureMatrix.shape[1])))
    return featureMatrix
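The bigram-counting core of the function, isolated into a small runnable sketch (the input text is made up):

import nltk
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
stems = [stemmer.stem(t) for t in "dogs chased cats dogs chased cats".split()]
fdist = nltk.FreqDist(nltk.bigrams(stems))
print(fdist.most_common(2))  # [(('dog', 'chase'), 2), (('chase', 'cat'), 2)]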
Example 10: get_unigram_feats
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def get_unigram_feats(document):
    document_words = set(document.split())
    s = SnowballStemmer("english")
    stemmed_words = [s.stem(word) for word in document_words]
    features = {}
    # features['count'] = len(document_words)
    for word in data.wordlist:
        word = s.stem(word)
        features['contains({})'.format(word)] = (word in stemmed_words)
    return features
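data.wordlist comes from elsewhere in that project; with a toy wordlist, the same feature construction can be run standalone:

from nltk.stem.snowball import SnowballStemmer

s = SnowballStemmer("english")
wordlist = ["run", "jumping"]  # toy stand-in for data.wordlist
stemmed = {s.stem(w) for w in "the cats were running".split()}
features = {'contains({})'.format(s.stem(w)): s.stem(w) in stemmed for w in wordlist}
print(features)  # {'contains(run)': True, 'contains(jump)': False}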
Example 11: highestFrequency
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def highestFrequency(quesWords, sentWords):
    stemmer = SnowballStemmer("english")
    match = 0
    nonMatch = 0
    for qw in quesWords:
        for aw in sentWords:
            if stemmer.stem(qw) == stemmer.stem(aw):
                match += 1
            else:
                nonMatch += 1
    return match
Example 12: jaccardDistance
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def jaccardDistance(quesWords, sentWords):
    # note: despite the name, this returns the raw match count, not a Jaccard distance
    stemmer = SnowballStemmer("english")
    match = 0
    nonMatch = 0
    for qw in quesWords:
        for aw in sentWords:
            if stemmer.stem(qw) == stemmer.stem(aw):
                match += 1
            else:
                nonMatch += 1
    return match
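An actual Jaccard similarity over stemmed token sets would look something like this (a sketch, not the original project's code):

from nltk.stem.snowball import SnowballStemmer

def jaccard_similarity(ques_words, sent_words):
    stemmer = SnowballStemmer("english")
    q = {stemmer.stem(w) for w in ques_words}
    s = {stemmer.stem(w) for w in sent_words}
    # intersection over union of the stemmed sets
    return len(q & s) / len(q | s) if q | s else 0.0

print(jaccard_similarity(["cats", "running"], ["cat", "runs", "fast"]))  # 0.666...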
Example 13: preProcessing
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def preProcessing(bitext):
    # transfer to lower case (the comprehension rebinds the local name)
    bitext = [[[x.lower() for x in sent] for sent in bisent] for bisent in bitext]
    # one stemmer per side of the bitext
    e_stemmer = SnowballStemmer("german")
    f_stemmer = SnowballStemmer("english")
    for (n, (f, e)) in enumerate(bitext):
        for idx, f_i in enumerate(f):
            f[idx] = f_stemmer.stem(f_i)
        for idx, e_i in enumerate(e):
            e[idx] = e_stemmer.stem(e_i)
    return bitext  # callers need the return value, since the input lists are not mutated
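A toy run (the nested-list bitext format is inferred from how the function indexes it, and the stems in the comment are illustrative):

bitext = [[["houses", "running"], ["Häuser", "laufen"]]]
print(preProcessing(bitext))  # e.g. [[['hous', 'run'], ['haus', 'lauf']]]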
Example 14: process_missing
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def process_missing(missing, sec):
    st = SnowballStemmer('english')
    morphological_errors = 0
    for m in missing:
        ind = sec['incorrect'].index(m)
        prediction = sec['predicted'][ind]
        if st.stem(m[3]) == st.stem(prediction[0]):
            morphological_errors += 1
            print('the correct sequence is: ' + str(m) + ' but predicted: ' + str(prediction))
    print('morphological errors: ' + str(morphological_errors))
    if len(missing):
        print('percentage: ' + str(morphological_errors / len(missing)))
Example 15: trigram
# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def trigram(self, term):
    x, y, z = term
    stemmer = SnowballStemmer("english")
    x = stemmer.stem(x)
    y = stemmer.stem(y)
    z = stemmer.stem(z)
    label = x + y + z
    new_column = []
    for words_stem in self.stemwords:
        if x in words_stem and y in words_stem and z in words_stem:
            new_column.append('True')
        else:
            new_column.append('False')
    self.dataframegenerator(new_column, label)