This article collects typical usage examples of the Python method nltk.stem.SnowballStemmer.stem. If you have been wondering what SnowballStemmer.stem does, how to call it, or what working code looks like, the hand-picked examples below may help. You can also read more about the containing class, nltk.stem.SnowballStemmer.
15 code examples of the SnowballStemmer.stem method are shown below, ordered by popularity by default.
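Before the project-specific examples, here is a minimal, self-contained sketch of the method itself (standard NLTK usage, not drawn from any of the examples below):

from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")
print(stemmer.stem("running"))     # -> "run"
print(stemmer.stem("generously"))  # -> "generous"
# Language-specific stemmers are available as well, e.g. SnowballStemmer("spanish").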
Example 1: ModelBuilder
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
class ModelBuilder():
    def __init__(self):
        self.model = {}
        self.stemmer = SnowballStemmer('english')

    def build(self):
        with open('data/candidate_synonyms.txt') as f:
            all_words = f.read().split('\n')
            for words in all_words:
                if words:
                    word, similar = words.split(',')
                    word, similar = self.stemmer.stem(word), self.stemmer.stem(similar)
                    if word not in self.model: self.model[word] = {}
                    self.model[word][similar] = 1
        return self

    def condense(self):
        # Keep only word pairs whose similarity holds in both directions.
        condensed_model = {}
        for word, similars in self.model.items():
            for similar in similars:
                if word in self.model.get(similar, {}):
                    if word in condensed_model:
                        condensed_model[word].append(similar)
                    else:
                        condensed_model[word] = [similar]
        self.model = condensed_model
        return self
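A minimal usage sketch for Example 1 (the data/candidate_synonyms.txt path and its "word,synonym" line format are assumptions read off the code above):

# Hypothetical call: build the stem-level synonym model, then keep only
# pairs that list each other (i.e. the relation is symmetric).
builder = ModelBuilder().build().condense()
print(builder.model.get('run', []))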
Example 2: text_token_data_generator
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def text_token_data_generator():
    global id_text_index_map
    # NOTE: Python 2 string API; on Python 3 use str.maketrans together with
    # string.ascii_uppercase / string.ascii_lowercase instead.
    translation_table = string.maketrans(
        string.punctuation + string.uppercase, " " * len(string.punctuation) + string.lowercase
    )
    snowball_stemmer = SnowballStemmer("english")
    for f in glob.glob("json/text/*.json"):
        for line in open(f).readlines():
            extract_row = json.loads(line)
            id_text_index_map[extract_row["file_id"]] = len(id_text_index_map)

            visible_text = extract_row["visible_text"].encode("ascii", "ignore")
            visible_text = visible_text.translate(translation_table)
            visible_text = [
                snowball_stemmer.stem(word)
                for word in visible_text.split()
                if word not in ENGLISH_STOP_WORDS and len(word) > 1
            ]

            title = extract_row["title"].encode("ascii", "ignore")
            title = title.translate(translation_table)
            title = [
                "t^{}".format(snowball_stemmer.stem(word))
                for word in title.split()
                if word not in ENGLISH_STOP_WORDS and len(word) > 1
            ]

            visible_text.extend(title)
            yield " ".join(visible_text)
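One plausible way to consume this generator (an assumption, not part of the original project) is to feed it directly into a scikit-learn vectorizer, which is also the usual source of the ENGLISH_STOP_WORDS constant used above:

# Hypothetical downstream usage of Example 2.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer

id_text_index_map = {}
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text_token_data_generator())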
Example 3: stemWordMatch2
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def stemWordMatch2(question, sentence):

    question_tokens = set(nltk.word_tokenize(question))
    sentence_tokens = set(nltk.word_tokenize(sentence))

    # Finding the match between two words from the same root using the Lancaster stemmer
    '''stemmer=LancasterStemmer()

    for i in sentence_tokens:
        stem_words_list.append(stemmer.stem(i))

    for i in question_tokens:
        question_words_list.append(stemmer.stem(i))

    #print 'Stem word list',stem_words_list
    #print 'Question word list', question_words_list

    stem_count=0
    for i in stem_words_list:
        #Finding the exact word match
        if i.lower() in [x.lower() for x in question_words_list]:
            #print 'Question word is',x
            #print 'Sentence word stem is :',i
            #print 'Match'
            stem_count=stem_count+6
            stem_word_match_counter.append(count)'''

    stem_word_match_counter = []
    stem_words_list = []
    question_words_list = []

    # Finding the match between two words from the same root using the Snowball stemmer
    snowball_stemmer = SnowballStemmer('english')

    for i in sentence_tokens:
        stem_words_list.append(snowball_stemmer.stem(i))

    for i in question_tokens:
        question_words_list.append(snowball_stemmer.stem(i))

    #print 'Stem word list',stem_words_list
    #print 'Question word list', question_words_list

    stem_count = 0
    for i in stem_words_list:
        # Finding the exact word match
        if i.lower() in [x.lower() for x in question_words_list]:
            #print 'Question word is',x
            #print 'Sentence word stem is :',i
            #print 'Match'
            stem_count = stem_count + 6

    #print 'Stem word count match score is :', stem_count
    return stem_count
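A small hypothetical call for Example 3 (requires nltk and its punkt tokenizer data); each sentence stem that also appears among the question stems adds 6 to the score:

score = stemWordMatch2("Who discovered penicillin?",
                       "Penicillin was discovered by Alexander Fleming.")
print(score)  # shared stems such as "penicillin" and "discov" each contribute 6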
Example 4: wordnet_sim
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def wordnet_sim(query, db):
    """
    This function implements a simple WordNet definition lookup and compares it
    with a different block of text. For every word match between a definition
    token and a text token the doc receives +1.

    INPUT:
    query -- string that represents the user query expanded with WordNet defs
    db    -- dict representation of the database xml file

    OUTPUT:
    maxdoc -- the document with the highest score
    """
    # print('QUERY:', query)
    # initializing SnowballStemmer from nltk
    sst = SnowballStemmer("english")
    # taking stopwords from nltk
    stop = stopwords.words("english")
    # creating translation table to remove punctuation
    transnone = {ord(c): None for c in string.punctuation}
    # first we remove any punctuation and concatenate specific nodes into one
    query_nopunct = query.lower().translate(transnone)
    query_stems = [sst.stem(token) for token in query_nopunct.split() if token not in stop]
    doc_scores = defaultdict(float)
    for doc in db:
        for block, text in db[doc].items():
            # normalize block text
            if not text:
                continue
            text_nopunct = text.lower().translate(transnone)
            text = [sst.stem(t) for t in text_nopunct.split() if t not in stop]
            if len(text) == 0:
                text += " "  # guard against division by zero below
            # here we can fine-tune the block score multiplicators:
            # some blocks are more important than the others
            if block == "description":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text) * 2
            elif block == "trivia":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text) * 0.5
            elif block == "history":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text) * 0.5
            elif block == "comments":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text)
    maxdoc = max(doc_scores, key=lambda x: doc_scores[x])
    debug = sorted([(k, v) for k, v in doc_scores.items()], key=lambda x: x[1])
    return (debug, maxdoc)
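A toy call for Example 4 under assumed data shapes: db maps document ids to {block_name: text} dicts, as the inner loop implies.

# Hypothetical data, only to illustrate the expected shapes.
db = {
    "doc1": {"description": "A castle on a hill", "trivia": ""},
    "doc2": {"description": "A museum of modern art", "comments": "great paintings"},
}
scores, best = wordnet_sim("famous art museum", db)
print(best)  # "doc2" overlaps more with the query stems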
Example 5: des_extrect
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def des_extrect():
    filename_list = []
    file_stopwords = open('stopwords.txt', "r")
    stopwords = [line.strip() for line in file_stopwords.readlines()]

    for file_name in os.listdir(DESCRIPTION_DIR):
        filename_list.append(file_name)

    for filename in filename_list:
        path = os.path.join(DESCRIPTION_DIR, filename)
        fr = open(path, 'r')
        fw = open(filename + '.des', 'w')
        soup = BeautifulSoup(fr.read())
        docs = soup.findAll('doc')
        for doc in docs:
            content = str(doc['title'] + doc.snippet.text)
            # strip punctuation (the .decode("utf8") calls are a Python 2 idiom)
            content = re.sub("[\.\@\,\:\;\!\?\(\)]".decode("utf8"), "".decode("utf8"), content)
            stemmer = SnowballStemmer('english')
            content = content.split()
            pro_content = ''
            for w in content:
                w = stemmer.stem(w)
                # skip stopwords
                if w not in stopwords:
                    pro_content += w + ' '
            fw.write(doc['rank'] + ' ' + pro_content + '\n')
        fw.close()
        fr.close()
Example 6: text_to_wordlist
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]

    text = " ".join(text)

    # Remove special characters (module-level precompiled regex)
    text = special_character_removal.sub('', text)

    # Replace numbers (module-level precompiled regex)
    text = replace_numbers.sub('n', text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text as a single string
    return(text)
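Example 6 references special_character_removal and replace_numbers, precompiled regexes defined elsewhere in the original script; the sketch below supplies plausible stand-ins (assumptions) so the function can be called:

import re

# Assumed stand-ins for the module-level patterns used above.
special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)
replace_numbers = re.compile(r'\d+')

print(text_to_wordlist("Running 3 miles daily!", remove_stopwords=True, stem_words=True))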
Example 7: StemmedCorpus
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
class StemmedCorpus(DocumentCorpus):
    def __init__(self, documents=None, language="german"):
        DocumentCorpus.__init__(self, documents)
        with codecs.open("stopwords/" + language, "r", encoding=my_encoding) as f:
            self._stopwords = [sw.strip() for sw in f.readlines()]
        self._stemmer = SnowballStemmer(language)
        self._lemmatizer = WordNetLemmatizer()
        self._stemmed_documents = []

    def preprocess_documents(self, lemmatize=False, remove_stopwords=True):
        _highest_func = self._lemmatize_tokens if lemmatize else self._stemm_tokens
        _second_highest_func = self._remove_stopword if remove_stopwords else lambda x: x
        self._stemmed_documents = [
            (_highest_func(_second_highest_func(self._tokenize_document(doc[0].lower()))), doc[1])
            for doc in self._documents
        ]

    def _tokenize_document(self, document):
        return regexp_tokenize(document, pattern_words)

    def _remove_stopword(self, tokens):
        return [token for token in tokens if token not in self._stopwords]

    def _stemm_tokens(self, tokens):
        return [self._stemmer.stem(token) for token in tokens]

    def _lemmatize_tokens(self, tokens):
        return [self._lemmatizer.lemmatize(token, trans_tag(tag)) for token, tag in pos_tag(tokens)]
Example 8: norm_corpus
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def norm_corpus(document_list):
    norm_doc_list = []

    # lowercase
    document_list = [word.lower() for word in document_list]

    # remove symbols in text
    symbols = ",.?!"
    for sym in symbols:
        document_list = [word.replace(sym, '') for word in document_list]

    # loop through each string i.e. review in the column
    for doc in document_list:
        doc = nltk.word_tokenize(doc)

        # remove stopwords
        doc = [word for word in doc if word not in stopwords.words('english')]

        # stem words
        stemmer = SnowballStemmer("english")
        doc = [stemmer.stem(word) for word in doc]

        # make tokenised text one string
        norm_doc = " ".join(doc)
        norm_doc_list.append(norm_doc)

    return norm_doc_list
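A short usage sketch for Example 8 (hypothetical reviews; requires the nltk punkt and stopwords data):

reviews = ["The food was great, but the service was slow!",
           "Absolutely loved the desserts."]
print(norm_corpus(reviews))
# e.g. ['food great servic slow', 'absolut love dessert']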
Example 9: frequency_analysis
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def frequency_analysis(input_path, output_path, stopwords=None, n_most_common=50):
    recipes = []
    with open(input_path, 'r') as f:
        for i, line in enumerate(f):
            if line == '\n':
                break
            if i == 0:
                continue  # skip header
            fields = line.split('\t')
            recipes.append(fields[1].replace("\n", ""))

    recipe_text = re.sub(r"[^a-z ]", "", ' '.join(recipes))
    recipe_words = re.split(r"\s+", recipe_text)
    stemmer = SnowballStemmer("english")
    recipe_stems = [stemmer.stem(w) for w in recipe_words]
    if stopwords is not None:
        recipe_stems = filter(None, [s for s in recipe_stems if s not in stopwords])
    top_words = Counter(recipe_stems).most_common(n_most_common)

    # write to a file;
    # do a second pass over the recipes to determine how many documents each term is in
    freq_table = open(output_path, 'w')
    for elt in top_words:
        doc_freq = sum([elt[0] in recipe for recipe in recipes])
        freq_table.write(','.join([str(e) for e in elt]) + ',' + str(doc_freq) + '\n')
    freq_table.close()
Example 10: normalized_token
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def normalized_token(token):
    """
    Use stemmer to normalize the token.
    Call this when building the graph, instead of storing the changed
    word form in file_text.
    """
    stemmer = SnowballStemmer("english")
    return stemmer.stem(token.lower())
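Example 10 constructs a new SnowballStemmer on every call; reusing a module-level instance is a cheap optimization. A possible variant (not from the original code):

# Hypothetical variant that reuses one stemmer instance across calls.
_STEMMER = SnowballStemmer("english")

def normalized_token_fast(token):
    return _STEMMER.stem(token.lower())

print(normalized_token("Running"))       # -> "run"
print(normalized_token_fast("Running"))  # same result, less per-call overhead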
Example 11: VocKeyworder
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
class VocKeyworder(BaseKeyworder):
    # NOTE: Python 2 style code (print statement, str/unicode handling).
    def __init__(self):
        super(VocKeyworder, self).__init__()
        self._vocs = engvoc.voc2000
        self._lemmatizer = WordNetLemmatizer()
        self._stemmer1 = LancasterStemmer()
        self._stemmer2 = SnowballStemmer('english')

    def add_keyword(self, gag_id, title):
        tokens = re.split(' |\.|,|;|=', title)
        for token in tokens:
            token = re.sub(r"\W+$", '', token)
            token = re.sub(r"^\W+", '', token)
            vocs = []
            try:
                token = token.encode('utf8')
                vocs.append(re.sub(r"'\w+", '', token).lower())
                vocs.append(self._lemmatizer.lemmatize(vocs[0]))
                vocs.append(self._stemmer1.stem(vocs[0]))
                vocs.append(self._stemmer2.stem(vocs[0]))
            except UnicodeDecodeError:
                continue
            if vocs[0] == '':
                continue
            try:
                float(vocs[0])
                continue
            except ValueError:
                pass
            if not any([voc in self._vocs for voc in vocs]):
                print 'voc', vocs, token
                self._add_keyword(gag_id, token)
Example 12: preprocessing
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def preprocessing(doc):  # stop word as optional
    x = re.sub("[^a-zA-Z]", " ", doc)  # only words
    x = x.lower().split()
    stemmer = SnowballStemmer("english")  # use snowball
    stops = set(stopwords.words("english"))  # set is faster than list
    x = [stemmer.stem(word) for word in x if word not in stops]
    return(x)
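A quick hypothetical call for Example 12:

print(preprocessing("The cats are running faster than 2 dogs!"))
# e.g. ['cat', 'run', 'faster', 'dog']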
Example 13: procesar
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def procesar(request, identificador):
    lmtzr = WordNetLemmatizer()
    d = Documento.objects.get(id=identificador)
    #nltk.corpus.cess_esp.words()
    tokens = nltk.word_tokenize(d.contenido.replace('.', ' . '))
    #print tokens
    #scentence = d.contenido
    #scentence = scentence.lower()
    words = tokens
    spanish_stemmer = SnowballStemmer('spanish')

    # This is the simple way to remove stop words
    important_words = []
    for word in words:
        if word not in stopwords.words('spanish'):
            important_words.append([word, lmtzr.lemmatize(word), spanish_stemmer.stem(word)])

    return render_to_response('templates/documentoProcesado.html',
                              {
                                  'original': d.contenido,
                                  'tokens': tokens,
                                  'important_words': important_words,
                                  #'pos_tags': pos_tags,
                                  #'ne_chunks': ne_chunks.subtrees(),
                              })
Example 14: Cleaner
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
class Cleaner(object):
    """
    The SQL query in get_reviews needs to be customized.
    """
    def __init__(self):
        self.sbstem = SnowballStemmer("english")
        # NOTE: Python 2 string API; on Python 3 use str.maketrans instead.
        replace = string.punctuation + string.digits
        self.replace_punctuation = string.maketrans(replace, ' ' * len(replace))
        self.locations = []
        self.cached_stopwords = stopwords.words("english")

    def clean(self, txt):
        # removes stopwords and punctuation
        txt = txt.encode('ascii', 'ignore')
        nopunct = txt.translate(self.replace_punctuation)
        no_locs = [x for x in nopunct.split() if x.lower() not in self.cached_stopwords]
        stemmed = [self.sbstem.stem(x) for x in no_locs]
        return " ".join(stemmed)

    def make_loclist(self, locations):
        locations = list(locations)
        removelist = ['Ho Chi Minh City', 'Phu Quoc Island', 'Halong Bay']
        locations = [x.lower() for x in locations if x not in removelist]
        locations.extend(['ho chi minh', 'hoan', 'kiem', 'phu quoc', 'halong', 'vietnam', 'dong', 'vnd', 'vdn'])
        locations.extend(['vietnames', 'nhatrang', 'saigon', 'america', 'maryland', 'york'])
        loc_wordlist = [f.split() for f in locations]
        loc_wordlist = list(itertools.chain(*loc_wordlist))
        self.cached_stopwords.extend(loc_wordlist)
        return loc_wordlist
Example 15: stemmed
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def stemmed(text, language):
    stemmer = SnowballStemmer(language)
    tas = text.split()
    text = ""
    for word in tas:
        text = " ".join((text, stemmer.stem(word)))
    return text.lstrip()
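A short hypothetical call for Example 15; the language argument can be any language SnowballStemmer supports (e.g. "english", "spanish", "german"):

print(stemmed("the children were playing games", "english"))
# e.g. "the children were play game"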