

Python SnowballStemmer.stem Method Code Examples

This article collects typical usage examples of the Python method nltk.stem.snowball.SnowballStemmer.stem. If you have been wondering what exactly SnowballStemmer.stem does, how to call it, or how it is used in practice, the curated examples below should help. You can also explore further usage examples of the containing class, nltk.stem.snowball.SnowballStemmer.


The following shows 15 code examples of SnowballStemmer.stem, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
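Before working through the examples, here is a minimal usage sketch of the method itself (the outputs shown match standard NLTK Snowball behavior):

from nltk.stem.snowball import SnowballStemmer

# Snowball supports many languages; the full list is in SnowballStemmer.languages
stemmer = SnowballStemmer("english")

print(stemmer.stem("running"))     # 'run'
print(stemmer.stem("generously"))  # 'generous'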

Example 1: test_spanish

# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
    def test_spanish(self):
        stemmer = SnowballStemmer('spanish')

        assert stemmer.stem("Visionado") == 'vision'

        # The word 'algue' was raising an IndexError
        assert stemmer.stem("algue") == 'algu'
Author: Journo-App, Project: flask-by-example, Lines: 9, Source: test_stem.py

Example 2: classify

# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
 def classify(self, sText):
    """Given a target string sText, this function returns the most likely document
    class to which the target string belongs (i.e., positive, negative or neutral).
    """
    tokens = self.tokenize(sText)
    posProbability, negProbability = 0, 0
    posNum, negNum = float(sum(self.pos_dic.values())), float(sum(self.neg_dic.values()))
    stemmer = SnowballStemmer("english")
    for i in range(len(tokens) - 1):
        if not isPunctuationMark(tokens[i]):
            unigram = stemmer.stem(tokens[i])
            second_word = stemmer.stem(tokens[i + 1])
            try:
                bigram = unigram + " " + second_word
            except UnicodeDecodeError:
                continue
            # add-one smoothing; take logs to avoid numerical underflow
            posProbability += math.log(float((self.pos_dic.get(bigram, 0) + 1)) / posNum)
            posProbability += math.log(float((self.pos_dic.get(unigram, 0) + 1)) / posNum)
            negProbability += math.log(float((self.neg_dic.get(bigram, 0) + 1)) / negNum)
            negProbability += math.log(float((self.neg_dic.get(unigram, 0) + 1)) / negNum)
    if tokens:
        posProbability += math.log(float((self.pos_dic.get(tokens[-1], 0) + 1)) / posNum)
        negProbability += math.log(float((self.neg_dic.get(tokens[-1], 0) + 1)) / negNum)
    if posProbability > negProbability:
        return "positive"
    else:
        return "negative"
Author: luofei11, Project: SentimentAnalysis, Lines: 30, Source: freq_bigram.py
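The add-one (Laplace) smoothing used above prevents unseen unigrams and bigrams from contributing a zero probability, and summing logs avoids numerical underflow. A self-contained sketch of the same accumulation, using a hypothetical toy count dictionary:

import math

pos_dic = {"good": 3, "good movi": 1}  # hypothetical counts for illustration
pos_total = float(sum(pos_dic.values()))

log_p = 0.0
for token in ["good", "bad"]:
    # unseen tokens get count 0 + 1 instead of a hard zero
    log_p += math.log((pos_dic.get(token, 0) + 1) / pos_total)
print(log_p)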

Example 3: main

# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def main():
    parser = argparse.ArgumentParser(description='Evaluate translation hypotheses.')
    parser.add_argument('-i', '--input', default=baseline_path+'data/hyp1-hyp2-ref',
            help='input file (default data/hyp1-hyp2-ref)')
    parser.add_argument('-n', '--num_sentences', default=None, type=int,
            help='Number of hypothesis pairs to evaluate')
    # note that if x == [1, 2, 3], then x[:None] == x[:] == x (copy); no need for sys.maxint
    opts = parser.parse_args()

    # we create a generator and avoid loading all sentences into a list
    def sentences():
        with open(opts.input) as f:
            for pair in f:
                yield [sentence.strip().split() for sentence in pair.split(' ||| ')]

    english_stemmer = SnowballStemmer("english")

    # note: the -n option does not work in the original code
    for h1, h2, ref in islice(sentences(), opts.num_sentences):
        # Perform morphological stemming before calculating METEOR score
        h1 = [english_stemmer.stem(word) for word in h1]
        h2 = [english_stemmer.stem(word) for word in h2]
        ref = [english_stemmer.stem(word) for word in ref]

        rset = set(ref)
        h1_match = meteor(h1, rset)
        # print "meteor is h1_match ", h1_match
        h2_match = meteor(h2, rset)
        # print "meteor is h2_match ", h2_match
        print(1 if h1_match > h2_match else # \begin{cases}
                (0 if h1_match == h2_match
                    else -1)) # \end{cases}
Author: stothe2, Project: 468-MT, Lines: 34, Source: evaluate_stemming.py
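The meteor() helper is not shown in this snippet. A minimal stand-in, assuming it simply scores unigram matches of the hypothesis against the reference set (the real METEOR metric additionally weighs recall and a chunk penalty), might look like this:

def meteor(hypothesis, ref_set):
    # crude stand-in: unigram precision against the reference word set
    if not hypothesis:
        return 0.0
    matches = sum(1 for word in hypothesis if word in ref_set)
    return matches / float(len(hypothesis))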

Example 4: pos_tokenizer

# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def pos_tokenizer(s): #define a tokenizer that uses POS tagging
    texts=nltk.word_tokenize(s)

    texts=[word for word in texts if len(word)>2]

    # PULL OUT NOUN AND VERB PHRASES
    chunktext=nltk.pos_tag(texts)
    patterns="""
                VP:{<V.*><DT>?<JJ.*>?<NN.*>}
                NP:{<DT>?<JJ>*<NN.*>}
                N:{<NN.*>}
    """
    NPchunker=nltk.RegexpParser(patterns)

    from nltk.stem.snowball import SnowballStemmer
    st=SnowballStemmer('english')

    #print text
    temp=[]
    result=NPchunker.parse(chunktext)
    #print result
    for phrase in result:
        try:
            phrase.label()
            string=''
            m=0
            for word in phrase:
                if m==0:
                    string+=st.stem(word[0])
                    m+=1
                else: string+=' '+st.stem(word[0])
            temp.append(string)
        except AttributeError:
            # leaf tokens are plain (word, tag) tuples with no label(); skip them
            pass
    return temp
Author: ecpaulson, Project: Intuitive-CMS, Lines: 36, Source: SBA_tweet_sklearn.py
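A quick call illustrates the kind of output pos_tokenizer produces, assuming the required NLTK data (the punkt tokenizer and a POS tagger model) is installed; the exact phrases depend on the tagger:

phrases = pos_tokenizer("The quick brown fox jumped over the lazy dog")
print(phrases)  # stemmed noun/verb phrases, e.g. something like ['the quick brown fox', 'the lazi dog']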

Example 5: __init__

# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
class WordCount:
	def __init__(self, language):
		self.stopwords = self.load_stopwords(language)
		self.parse_regexp = re.compile(r"([0-9]*[\w][\w0-9]+)", re.UNICODE)
		self.current_stemmer = SnowballStemmer(language)

	@staticmethod
	def load_stopwords(language):
		stoplist = []
		if language == 'english':
			with codecs.open('geomedia'+ os.sep +'en_stoplist.txt', "r", "utf-8") as f:
				stoplist = [line.rstrip() for line in f]
		else:
			#download('stopwords')
			stoplist = stopwords.words(language)

		return stoplist

	def parse_text(self, text, wordcount_dictionary=None):
		"""
		>>> wordcount = WordCount('english') #doctest: +ELLIPSIS
		[nltk_data] ...
		>>> wordcount.parse_text("a1a ma kota")
		{'ma': 1, 'a1a': 1, 'kota': 1}
		>>> wordcount.parse_text("a1a ma kota", {'a1a': 2, 'kota': 1})
		{'ma': 1, 'a1a': 3, 'kota': 2}
		"""
		if wordcount_dictionary is None:
			wordcount_dictionary = {}
		words = self.parse_regexp.findall(text)
		for word in words:
			new_word = self.current_stemmer.stem(word.lower())
			if word not in self.stopwords and new_word not in self.stopwords:
				if new_word in wordcount_dictionary:
					wordcount_dictionary[new_word] += 1
				else:
					wordcount_dictionary[new_word] = 1
		return wordcount_dictionary
		
	def parse_text_extra(self, text, wordcount_dictionary=None, extras=None):
		if wordcount_dictionary is None:
			wordcount_dictionary = {}
		if extras is None:
			extras = {}
		words = self.parse_regexp.findall(text)
		for word in words:
			new_word = self.current_stemmer.stem(word.lower())
			word = word.lower()
			if word not in self.stopwords and new_word not in self.stopwords:
				if new_word in wordcount_dictionary:
					wordcount_dictionary[new_word] += 1
					if word in extras[new_word]:
						extras[new_word][word] += 1
					else:
						extras[new_word][word] = 1
				else:
					wordcount_dictionary[new_word] = 1
					extras[new_word] = {}
					extras[new_word][word] = 1
Author: lszyman, Project: PressNotePropagation, Lines: 61, Source: wordcount.py
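Usage follows the doctest above; a short sketch, assuming the geomedia/en_stoplist.txt stop list (or an nltk.corpus.stopwords language) is available:

wc = WordCount('english')
counts = wc.parse_text("Cats like other cats")
print(counts)  # stems are counted and stopwords dropped, e.g. {'cat': 2, 'like': 1}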

Example 6: stem_snowball

# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def stem_snowball(tokens):
    stemmer = SnowballStemmer("russian")

    if isinstance(tokens, str):  # Python 3; the original Python 2 code used basestring
        return stemmer.stem(tokens)
    else:
        stemmed = [stemmer.stem(token) for token in tokens]
        return stemmed
Author: i-Hun, Project: thesis-code, Lines: 10, Source: preprocess.py
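The helper accepts either a single string or a sequence of tokens; for example (the Russian stems shown are indicative):

print(stem_snowball("книги"))              # e.g. 'книг'
print(stem_snowball(["книги", "читают"]))  # e.g. ['книг', 'чита']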

Example 7: stem

# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def stem(tokens):
    stemmer = SnowballStemmer('english')
    stemmed_tokens = []

    for x in tokens:
        stemmed = stemmer.stem(x)
        stemmed_tokens.append(stemmed)
        # build the dictionary mapping stemmed tokens to their original terms
        terms_dictionary.update_terms_dictionary(stemmed, x)
    return stemmed_tokens
Author: alemidori, Project: ProgettoIR, Lines: 10, Source: processing.py

Example 8: test_german

# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
    def test_german(self):
        stemmer_german = SnowballStemmer("german")
        stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)

        assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
        assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'

        assert stemmer_german.stem("keinen") == 'kein'
        assert stemmer_german2.stem("keinen") == 'keinen'
Author: Journo-App, Project: flask-by-example, Lines: 11, Source: test_stem.py
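The ignore_stopwords=True flag demonstrates a deliberate design choice: stopwords of the given language are passed through unstemmed, as the 'keinen' assertion shows. The same behavior in English (assuming the NLTK stopwords corpus is installed, since ignore_stopwords relies on it):

from nltk.stem.snowball import SnowballStemmer

plain = SnowballStemmer("english")
keep_stops = SnowballStemmer("english", ignore_stopwords=True)

print(plain.stem("having"))       # 'have'
print(keep_stops.stem("having"))  # 'having' -- an English stopword, left untouched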

Example 9: extract_bigrams

# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def extract_bigrams(articleList, commentCount):
    featureMatrix = np.zeros([commentCount,100])

    index = 0
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    bagOfWords = []
    for art in articleList.items():        
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove Stops
            filtered_words = [w for w in mywords if not w in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bagOfWords += stemmed_words
            bagOfWords.append("\n")
            
    tempVector = dict()
        
    #Create your bigrams
    bgs = nltk.bigrams(bagOfWords)

    fdist = nltk.FreqDist(bgs)   
    
    # keep the 100 most frequent bigrams (old Python 2 NLTK returned FreqDist.keys()
    # sorted by frequency; most_common(100) preserves that intent in Python 3)
    for k, _ in fdist.most_common(100):
        tempVector[k] = 0

    theKeys = list(tempVector.keys())
    
    for art in articleList.items():        
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove Stops
            filtered_words = [w for w in mywords if not w in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bgs = nltk.bigrams(stemmed_words)
            for word in (w for w in bgs if w in tempVector):
                keyInd = theKeys.index(word)      
                featureMatrix[index][keyInd] += 1
                           
            index += 1
            if index % 100 == 0:
                print("extracted", index, "features")
        
            if index >= commentCount:
                break            
    print "non-zero",np.count_nonzero(featureMatrix)
    print "Percentage filled:%.2f" %(float(np.count_nonzero(featureMatrix))/(featureMatrix.shape[0]*featureMatrix.shape[1]))
    return featureMatrix
Author: DirkBrand, Project: Comment-Classification, Lines: 58, Source: mainExtractor.py

Example 10: get_unigram_feats

# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def get_unigram_feats(document):
	document_words = set(document.split())
	s = SnowballStemmer("english")
	stemmed_words = [ s.stem(word) for word in document_words ]
	features = {}
	#features['count'] = len(document_words)
	for word in data.wordlist:
		word = s.stem(word)
		features['contains({})'.format(word)] = (word in stemmed_words)
	return features
Author: suclike, Project: FlipkartHackthon, Lines: 12, Source: classify.py

Example 11: highestFrequency

# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def highestFrequency(quesWords,sentWords):
    stemmer = SnowballStemmer("english");
    match = 0
    nonMatch = 0
    for qw in quesWords:
        for aw in sentWords:
            if stemmer.stem(qw) == stemmer.stem(aw) :
                match += 1
            else:
                nonMatch += 1
    return (match)
Author: vathsava, Project: NLP_PROJ, Lines: 13, Source: jaccard.py

Example 12: jaccardDistance

# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def jaccardDistance(quesWords,sentWords):
    stemmer = SnowballStemmer("english");
    match = 0
    nonMatch = 0
    for qw in quesWords:
        for aw in sentWords:
            if stemmer.stem(qw) == stemmer.stem(aw) :
                match += 1
            else:
                nonMatch += 1
    return (match)
Author: vathsava, Project: NLP_PROJ, Lines: 13, Source: jaccard.py
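Despite its name, jaccardDistance returns only the raw match count and is identical to highestFrequency in Example 11. For comparison, a sketch of an actual Jaccard similarity over the stemmed word sets:

from nltk.stem.snowball import SnowballStemmer

def jaccard_similarity(quesWords, sentWords):
    stemmer = SnowballStemmer("english")
    q = {stemmer.stem(w) for w in quesWords}
    s = {stemmer.stem(w) for w in sentWords}
    if not (q | s):
        return 0.0
    return len(q & s) / float(len(q | s))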

Example 13: preProcessing

# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def preProcessing(bitext):
    # convert everything to lower case
    bitext = [[[x.lower() for x in sent ] for sent in bisent] for bisent in bitext]
    # stemmer
    e_stemmer = SnowballStemmer("german")
    f_stemmer = SnowballStemmer("english")
    for (n, (f,e)) in enumerate(bitext):
        for idx, f_i in enumerate(f):
            f[idx] = f_stemmer.stem(f_i)
        for idx, e_i in enumerate(e):
            e[idx] = e_stemmer.stem(e_i)
Author: frederick0329, Project: sp2016.11-731, Lines: 13, Source: HMM_compound_back.py

Example 14: process_missing

# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
def process_missing(missing, sec):
    st = SnowballStemmer('english')
    morphological_errors = 0
    for m in missing:
        ind = sec['incorrect'].index(m)
        prediction = sec['predicted'][ind]
        if(st.stem(m[3]) == st.stem(prediction[0])):
            morphological_errors += 1        
        print('the correct sequence is: '+str(m)+' but predicted: '+str(prediction))
    print('morphological errors:' + str(morphological_errors))
    if len(missing):
        print('percentage:' + str(morphological_errors/len(missing)))
Author: jvalansi, Project: word2vec_exp, Lines: 14, Source: w2v.py

Example 15: trigram

# Required import: from nltk.stem.snowball import SnowballStemmer [as alias]
# Or: from nltk.stem.snowball.SnowballStemmer import stem [as alias]
 def trigram(self, term):
     x, y, z = term
     stemmer = SnowballStemmer("english")
     x = stemmer.stem(x)
     y = stemmer.stem(y)
     z = stemmer.stem(z)
     label = x + y + z
     new_column = []
     for words_stem in self.stemwords:
         if x in words_stem and y in words_stem and z in words_stem:
             new_column.append('True')
         else:
             new_column.append('False')
     self.dataframegenerator(new_column, label)
Author: omedranoc, Project: ThesisPreprocessing, Lines: 16, Source: join.py


Note: The nltk.stem.snowball.SnowballStemmer.stem examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets are drawn from open-source projects contributed by various developers; copyright in the source code remains with the original authors, and any distribution or use should follow the corresponding project's license. Please do not reproduce without permission.