本文整理汇总了Python中nltk.stem.snowball.SnowballStemmer方法的典型用法代码示例。如果您正苦于以下问题：Python snowball.SnowballStemmer方法的具体用法？Python snowball.SnowballStemmer怎么用？Python snowball.SnowballStemmer使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块nltk.stem.snowball的用法示例。
在下文中一共展示了snowball.SnowballStemmer方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: clean_resume
# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def clean_resume(resume_text):
    """Normalize raw resume text into a lower-case, space-separated string.

    Tabs, newlines and every character of ``string.punctuation`` are replaced
    by spaces, the text is split on whitespace, and tokens that are purely
    numeric or English stop words are dropped.  The surviving tokens are
    lower-cased.  Stemming is not applied (it was disabled in the original
    code), so no stemmer is created here.

    :param resume_text: the raw resume text
    :return: the cleaned tokens joined by single spaces
    """
    # Replace tabs, newlines and punctuation with spaces so they act as
    # token separators for the split below.
    resume_text = resume_text.replace('\t', ' ').replace('\n', ' ')
    for punctuation in string.punctuation:
        resume_text = resume_text.replace(punctuation, ' ')

    # Load the stop-word list once (not once per word); a set gives O(1)
    # membership tests.
    stop_words = set(stopwords.words('english'))

    cleaned_resume = []
    for word in resume_text.split():
        if word.isdigit():
            continue
        # Compare the lower-cased form so capitalised stop words such as
        # "The" are removed as well.
        lowered = word.lower()
        if lowered not in stop_words:
            cleaned_resume.append(lowered)
    return ' '.join(cleaned_resume)
示例2: conversion
# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def conversion(source, dest):
    """
    Look up the conversion entry for going from ``source`` to ``dest``.

    Both unit names are stemmed with the English Snowball stemmer before they
    are looked up in ``conv_dict``.

    :param source: the unit of measure you have
    :param dest: the unit of measure need to convert to
    :return: a tuple ``(units, source, dest)``; ``units`` is the item of the
        source entry's ``'Units'`` sequence at the position where ``dest``
        appears in its ``'Destination'`` sequence, or ``None`` when the lookup
        fails.  ``source`` and ``dest`` are returned in stemmed form.
    """
    stemmer = SnowballStemmer('english')
    source = stemmer.stem(source)
    dest = stemmer.stem(dest)
    try:
        entry = conv_dict.get(source)
        units = entry.get('Units')[entry.get('Destination').index(dest)]
    except (AttributeError, TypeError, ValueError, IndexError, KeyError):
        # Unknown source unit, missing 'Units'/'Destination' data, or a
        # destination that is not listed: report "no conversion" instead of
        # swallowing every possible exception with a bare ``except``.
        units = None
    return units, source, dest
示例3: tokenize
# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def tokenize(text):
    """
    Tokenizes sequences of text and stems the tokens.

    The text is split on whitespace, stripped of non-letter characters,
    de-duplicated, filtered against the English stop-word list and finally
    stemmed with the English Snowball stemmer.  Because the tokens pass
    through sets, the order of the result is not tied to the input order.

    :param text: String to tokenize
    :return: List with stemmed tokens
    """
    # Load the stop words once instead of once per token.
    stop_words = set(stopwords.words('english'))
    tokens = nltk.WhitespaceTokenizer().tokenize(text)
    # First pass keeps apostrophes for the stop-word comparison.
    candidates = {re.sub(r"[^a-zA-Z']", "", token) for token in tokens}
    # Second pass strips the apostrophes from the surviving tokens.
    candidates = {
        re.sub(r"[^a-zA-Z]", "", token)
        for token in candidates
        if token not in stop_words
    }
    stemmer = SnowballStemmer("english")
    stems = (stemmer.stem(token) for token in candidates)
    return [stem for stem in stems if stem != ""]
示例4: stemming_message_snowball
# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def stemming_message_snowball(message, stemmings_to_words=None):
    """Stem every token of ``message`` with the Finnish Snowball stemmer.

    :param message: the text to stem; ``None`` is treated as an empty message
    :param stemmings_to_words: optional dict mapping stem -> original token.
        It is updated in place, so it can be passed from call to call to
        accumulate a mapping.  A fresh dict is created when omitted.
    :return: a tuple ``(stemmed_message, stemmings_to_words)`` where
        ``stemmed_message`` is the stems joined by single spaces
    """
    # A mutable default would be shared between unrelated calls.
    if stemmings_to_words is None:
        stemmings_to_words = {}
    # ``type(message) == None`` is never true; test for None directly, and do
    # it before the NLTK imports so the empty case needs no NLTK at all.
    if message is None:
        return '', stemmings_to_words

    from nltk.stem.snowball import SnowballStemmer
    from nltk.tokenize import casual_tokenize

    stemmer = SnowballStemmer('finnish')
    # str.replace returns a new string; keep the result so '#' is removed.
    message = message.replace('#', '')
    stemmed_words = []
    for word in casual_tokenize(message):
        stemmed_word = stemmer.stem(word.lower())
        stemmed_words.append(stemmed_word)
        stemmings_to_words[stemmed_word] = word
    return ' '.join(stemmed_words), stemmings_to_words
示例5: process
# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def process(input_text):
    """Tokenize, remove English stop words from, and stem ``input_text``.

    :param input_text: the string to process
    :return: list of stemmed tokens, in the order they occur in the text
    """
    # Split the lower-cased text into runs of word characters.
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = SnowballStemmer('english')
    # A set gives O(1) membership tests instead of scanning a list per token.
    stop_words = set(stopwords.words('english'))
    tokens = tokenizer.tokenize(input_text.lower())
    # Drop the stop words and stem what is left.
    return [stemmer.stem(token) for token in tokens if token not in stop_words]
示例6: __init__
# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def __init__(self, input_directory, language):
    """Initialise empty bookkeeping containers and the language resources.

    Args:
        input_directory (str): the directory from which text documents to
            be summarized are loaded.
        language (str): language name handed to ``stopwords.words`` and
            ``SnowballStemmer``.
    """
    self.input_directory = input_directory
    self.LANGUAGE = language  # type: str

    # Containers that start out empty.
    self.sentences = []
    self.weights = {}
    self.c2s = defaultdict(set)
    self.concept_sets = defaultdict(frozenset)
    self.word_frequencies = defaultdict(int)
    self.w2s = defaultdict(set)

    # Language-dependent helpers: stop-word set and stemmer.
    self.stoplist = set(stopwords.words(language))
    self.stemmer = SnowballStemmer(language)
示例7: test_russian
# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def test_russian(self):
    """Russian words in Cyrillic as well as Roman letters can be stemmed."""
    russian = SnowballStemmer("russian")
    expected_stems = (
        ("авантненькая", "авантненьк"),
        ("avenantnen'kai^a", "avenantnen'k"),
    )
    for word, stem in expected_stems:
        assert russian.stem(word) == stem
示例8: test_german
# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def test_german(self):
    """Check German stemming with and without ``ignore_stopwords``."""
    default_stemmer = SnowballStemmer("german")
    stopword_aware = SnowballStemmer("german", ignore_stopwords=True)
    # Both stemmers give the same stem for this word.
    for stemmer in (default_stemmer, stopword_aware):
        assert stemmer.stem("Schr\xe4nke") == 'schrank'
    # "keinen" is stemmed by default but returned unchanged when
    # ignore_stopwords=True.
    assert default_stemmer.stem("keinen") == 'kein'
    assert stopword_aware.stem("keinen") == 'keinen'
示例9: test_spanish
# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def test_spanish(self):
    """Check Spanish stemming, including a word that used to fail."""
    spanish = SnowballStemmer('spanish')
    visionado_stem = spanish.stem("Visionado")
    assert visionado_stem == 'vision'
    # Regression check: the word 'algue' was raising an IndexError.
    algue_stem = spanish.stem("algue")
    assert algue_stem == 'algu'
示例10: test_short_strings_bug
# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def test_short_strings_bug(self):
    """The English stemmer reduces the short string "y's" to 'y'."""
    stem = SnowballStemmer('english').stem("y's")
    assert stem == 'y'
示例11: __init__
# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def __init__(self, bigrams=True, min_df=3, stemming=True, tfidf=True):
    """Build the text-cleaning helpers and store the options as given."""
    # Helpers: pattern matching anything except ASCII letters and spaces,
    # the English stop-word set, and an English Snowball stemmer.
    non_letter_pattern = '[^a-zA-Z ]'
    self.regex = re.compile(non_letter_pattern)
    self.stop = set(stopwords.words('english'))
    self.stemmer = SnowballStemmer("english")
    # Options are kept unchanged on the instance.
    self.bigrams, self.min_df = bigrams, min_df
    self.stemming, self.tfidf = stemming, tfidf
示例12: cleaned_tokens
# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def cleaned_tokens(tokens):
    """Lower-case the tokens and drop numeric tokens and English stop words.

    No stemming is performed.

    :param tokens: iterable of string tokens
    :return: list of the remaining lower-cased tokens, in input order
    """
    stop_words = set(stopwords.words('english'))
    lowered = (token.lower() for token in tokens if not token.isdigit())
    # Return a real list rather than a lazy ``filter`` object, so the result
    # can be indexed, measured and iterated more than once.
    return [token for token in lowered if token not in stop_words]
示例13: build_analyzer
# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def build_analyzer(self):
    """Return a callable that stems every token the parent analyzer yields.

    The parent class's analyzer is built first; its output for a document is
    then passed token by token through an English Snowball stemmer created
    with ``ignore_stopwords=True``.
    """
    base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()
    snowball = SnowballStemmer("english", ignore_stopwords=True)

    def stemmed_analyzer(doc):
        return [snowball.stem(token) for token in base_analyzer(doc)]

    return stemmed_analyzer
示例14: __init__
# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def __init__(self):
    """Create the tokenizer, the English stop-word list and the stemmer."""
    # Tokenizer that extracts runs of word characters.
    word_pattern = r'\w+'
    self.tokenizer = RegexpTokenizer(word_pattern)
    # English stop words, kept as the list returned by NLTK.
    self.stop_words_english = stopwords.words('english')
    # English Snowball stemmer.
    self.stemmer = SnowballStemmer('english')
# Tokenizing, stop word removal, and stemming
开发者ID:PacktPublishing,项目名称:Python-Machine-Learning-Cookbook-Second-Edition,代码行数:13,代码来源:topic_modeling.py
示例15: tokenize_and_stem
# 需要导入模块: from nltk.stem import snowball [as 别名]
# 或者: from nltk.stem.snowball import SnowballStemmer [as 别名]
def tokenize_and_stem(text):
    """Tokenize ``text`` and return the Snowball stems of its word tokens.

    :param text: the string to process
    :return: list of English Snowball stems, one for each token that contains
        at least one ASCII letter
    """
    stemmer = SnowballStemmer("english")
    # Replace whitespace-delimited digit runs with a space.  The pattern is a
    # raw string because "\d" and "\s" are invalid escape sequences in a
    # normal string literal.
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text)
    # first tokenize by sentence, then by word to ensure that punctuation is
    # caught as its own token
    tokens = [
        word
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    ]
    # filter out any tokens not containing letters (e.g., numeric tokens,
    # raw punctuation) and stem the rest
    return [stemmer.stem(token) for token in tokens if re.search('[a-zA-Z]', token)]
################################################################################