This article collects typical usage examples of the Python method nltk.stem.SnowballStemmer.stem. If you have been wondering what SnowballStemmer.stem does, how to call it, or what working code looks like, the hand-picked examples below may help. You can also read more about the containing class, nltk.stem.SnowballStemmer.
15 code examples of the SnowballStemmer.stem method are shown below, ordered by popularity by default.
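Before the project-specific examples, here is a minimal, self-contained sketch of the method itself (standard NLTK usage, not drawn from any of the examples below):

from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")
print(stemmer.stem("running"))     # -> "run"
print(stemmer.stem("generously"))  # -> "generous"
# Language-specific stemmers are available as well, e.g. SnowballStemmer("spanish").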
Example 1: ModelBuilder
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
class ModelBuilder():
    def __init__(self):
        self.model = {}
        self.stemmer = SnowballStemmer('english')

    def build(self):
        with open('data/candidate_synonyms.txt') as f:
            all_words = f.read().split('\n')
            for words in all_words:
                if words:
                    word, similar = words.split(',')
                    word, similar = self.stemmer.stem(word), self.stemmer.stem(similar)
                    if word not in self.model: self.model[word] = {}
                    self.model[word][similar] = 1
        return self

    def condense(self):
        # Keep only word pairs whose similarity holds in both directions.
        condensed_model = {}
        for word, similars in self.model.items():
            for similar in similars:
                if word in self.model.get(similar, {}):
                    if word in condensed_model:
                        condensed_model[word].append(similar)
                    else:
                        condensed_model[word] = [similar]
        self.model = condensed_model
        return self
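A minimal usage sketch for Example 1 (the data/candidate_synonyms.txt path and its "word,synonym" line format are assumptions read off the code above):

# Hypothetical call: build the stem-level synonym model, then keep only
# pairs that list each other (i.e. the relation is symmetric).
builder = ModelBuilder().build().condense()
print(builder.model.get('run', []))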
Example 2: text_token_data_generator
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def text_token_data_generator():
    global id_text_index_map
    # NOTE: Python 2 string API; on Python 3 use str.maketrans together with
    # string.ascii_uppercase / string.ascii_lowercase instead.
    translation_table = string.maketrans(
        string.punctuation + string.uppercase, " " * len(string.punctuation) + string.lowercase
    )
    snowball_stemmer = SnowballStemmer("english")
    for f in glob.glob("json/text/*.json"):
        for line in open(f).readlines():
            extract_row = json.loads(line)
            id_text_index_map[extract_row["file_id"]] = len(id_text_index_map)

            visible_text = extract_row["visible_text"].encode("ascii", "ignore")
            visible_text = visible_text.translate(translation_table)
            visible_text = [
                snowball_stemmer.stem(word)
                for word in visible_text.split()
                if word not in ENGLISH_STOP_WORDS and len(word) > 1
            ]

            title = extract_row["title"].encode("ascii", "ignore")
            title = title.translate(translation_table)
            title = [
                "t^{}".format(snowball_stemmer.stem(word))
                for word in title.split()
                if word not in ENGLISH_STOP_WORDS and len(word) > 1
            ]

            visible_text.extend(title)
            yield " ".join(visible_text)
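One plausible way to consume this generator (an assumption, not part of the original project) is to feed it directly into a scikit-learn vectorizer, which is also the usual source of the ENGLISH_STOP_WORDS constant used above:

# Hypothetical downstream usage of Example 2.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer

id_text_index_map = {}
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text_token_data_generator())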
Example 3: stemWordMatch2
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def stemWordMatch2(question, sentence):

    question_tokens = set(nltk.word_tokenize(question))
    sentence_tokens = set(nltk.word_tokenize(sentence))

    # Finding the match between two words from the same root using the Lancaster stemmer
    '''stemmer=LancasterStemmer()

    for i in sentence_tokens:
        stem_words_list.append(stemmer.stem(i))

    for i in question_tokens:
        question_words_list.append(stemmer.stem(i))

    #print 'Stem word list',stem_words_list
    #print 'Question word list', question_words_list

    stem_count=0
    for i in stem_words_list:
        #Finding the exact word match
        if i.lower() in [x.lower() for x in question_words_list]:
            #print 'Question word is',x
            #print 'Sentence word stem is :',i
            #print 'Match'
            stem_count=stem_count+6
            stem_word_match_counter.append(count)'''

    stem_word_match_counter = []
    stem_words_list = []
    question_words_list = []

    # Finding the match between two words from the same root using the Snowball stemmer
    snowball_stemmer = SnowballStemmer('english')

    for i in sentence_tokens:
        stem_words_list.append(snowball_stemmer.stem(i))

    for i in question_tokens:
        question_words_list.append(snowball_stemmer.stem(i))

    #print 'Stem word list',stem_words_list
    #print 'Question word list', question_words_list

    stem_count = 0
    for i in stem_words_list:
        # Finding the exact word match
        if i.lower() in [x.lower() for x in question_words_list]:
            #print 'Question word is',x
            #print 'Sentence word stem is :',i
            #print 'Match'
            stem_count = stem_count + 6

    #print 'Stem word count match score is :', stem_count
    return stem_count
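A small hypothetical call for Example 3 (requires nltk and its punkt tokenizer data); each sentence stem that also appears among the question stems adds 6 to the score:

score = stemWordMatch2("Who discovered penicillin?",
                       "Penicillin was discovered by Alexander Fleming.")
print(score)  # shared stems such as "penicillin" and "discov" each contribute 6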
Example 4: wordnet_sim
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def wordnet_sim(query, db):
    """
    This function implements a simple WordNet definition lookup and compares it
    with a different block of text. For every word match between a definition
    token and a text token the doc receives +1.

    INPUT:
    query -- string that represents the user query expanded with WordNet defs
    db    -- dict representation of the database xml file

    OUTPUT:
    maxdoc -- the document with the highest score
    """
    # print('QUERY:', query)
    # initializing SnowballStemmer from nltk
    sst = SnowballStemmer("english")
    # taking stopwords from nltk
    stop = stopwords.words("english")
    # creating translation table to remove punctuation
    transnone = {ord(c): None for c in string.punctuation}
    # first we remove any punctuation and concatenate specific nodes into one
    query_nopunct = query.lower().translate(transnone)
    query_stems = [sst.stem(token) for token in query_nopunct.split() if token not in stop]
    doc_scores = defaultdict(float)
    for doc in db:
        for block, text in db[doc].items():
            # normalize block text
            if not text:
                continue
            text_nopunct = text.lower().translate(transnone)
            text = [sst.stem(t) for t in text_nopunct.split() if t not in stop]
            if len(text) == 0:
                text += " "  # guard against division by zero below
            # here we can fine-tune the block score multiplicators:
            # some blocks are more important than the others
            if block == "description":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text) * 2
            elif block == "trivia":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text) * 0.5
            elif block == "history":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text) * 0.5
            elif block == "comments":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text)
    maxdoc = max(doc_scores, key=lambda x: doc_scores[x])
    debug = sorted([(k, v) for k, v in doc_scores.items()], key=lambda x: x[1])
    return (debug, maxdoc)
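A toy call for Example 4 under assumed data shapes: db maps document ids to {block_name: text} dicts, as the inner loop implies.

# Hypothetical data, only to illustrate the expected shapes.
db = {
    "doc1": {"description": "A castle on a hill", "trivia": ""},
    "doc2": {"description": "A museum of modern art", "comments": "great paintings"},
}
scores, best = wordnet_sim("famous art museum", db)
print(best)  # "doc2" overlaps more with the query stems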
Example 5: des_extrect
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def des_extrect():
    filename_list = []
    file_stopwords = open('stopwords.txt', "r")
    stopwords = [line.strip() for line in file_stopwords.readlines()]

    for file_name in os.listdir(DESCRIPTION_DIR):
        filename_list.append(file_name)

    for filename in filename_list:
        path = os.path.join(DESCRIPTION_DIR, filename)
        fr = open(path, 'r')
        fw = open(filename + '.des', 'w')
        soup = BeautifulSoup(fr.read())
        docs = soup.findAll('doc')
        for doc in docs:
            content = str(doc['title'] + doc.snippet.text)
            # strip punctuation (the .decode("utf8") calls are a Python 2 idiom)
            content = re.sub("[\.\@\,\:\;\!\?\(\)]".decode("utf8"), "".decode("utf8"), content)
            stemmer = SnowballStemmer('english')
            content = content.split()
            pro_content = ''
            for w in content:
                w = stemmer.stem(w)
                # skip stopwords
                if w not in stopwords:
                    pro_content += w + ' '
            fw.write(doc['rank'] + ' ' + pro_content + '\n')
        fw.close()
        fr.close()
Example 6: text_to_wordlist
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]

    text = " ".join(text)

    # Remove special characters (module-level precompiled regex)
    text = special_character_removal.sub('', text)

    # Replace numbers (module-level precompiled regex)
    text = replace_numbers.sub('n', text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text as a single string
    return(text)
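Example 6 references special_character_removal and replace_numbers, precompiled regexes defined elsewhere in the original script; the sketch below supplies plausible stand-ins (assumptions) so the function can be called:

import re

# Assumed stand-ins for the module-level patterns used above.
special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)
replace_numbers = re.compile(r'\d+')

print(text_to_wordlist("Running 3 miles daily!", remove_stopwords=True, stem_words=True))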
Example 7: StemmedCorpus
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
class StemmedCorpus(DocumentCorpus):
    def __init__(self, documents=None, language="german"):
        DocumentCorpus.__init__(self, documents)
        with codecs.open("stopwords/" + language, "r", encoding=my_encoding) as f:
            self._stopwords = [sw.strip() for sw in f.readlines()]
        self._stemmer = SnowballStemmer(language)
        self._lemmatizer = WordNetLemmatizer()
        self._stemmed_documents = []

    def preprocess_documents(self, lemmatize=False, remove_stopwords=True):
        _highest_func = self._lemmatize_tokens if lemmatize else self._stemm_tokens
        _second_highest_func = self._remove_stopword if remove_stopwords else lambda x: x
        self._stemmed_documents = [
            (_highest_func(_second_highest_func(self._tokenize_document(doc[0].lower()))), doc[1])
            for doc in self._documents
        ]

    def _tokenize_document(self, document):
        return regexp_tokenize(document, pattern_words)

    def _remove_stopword(self, tokens):
        return [token for token in tokens if token not in self._stopwords]

    def _stemm_tokens(self, tokens):
        return [self._stemmer.stem(token) for token in tokens]

    def _lemmatize_tokens(self, tokens):
        return [self._lemmatizer.lemmatize(token, trans_tag(tag)) for token, tag in pos_tag(tokens)]
Example 8: norm_corpus
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def norm_corpus(document_list):
    norm_doc_list = []

    # lowercase
    document_list = [word.lower() for word in document_list]

    # remove symbols in text
    symbols = ",.?!"
    for sym in symbols:
        document_list = [word.replace(sym, '') for word in document_list]

    # loop through each string i.e. review in the column
    for doc in document_list:
        doc = nltk.word_tokenize(doc)

        # remove stopwords
        doc = [word for word in doc if word not in stopwords.words('english')]

        # stem words
        stemmer = SnowballStemmer("english")
        doc = [stemmer.stem(word) for word in doc]

        # make tokenised text one string
        norm_doc = " ".join(doc)
        norm_doc_list.append(norm_doc)

    return norm_doc_list
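A short usage sketch for Example 8 (hypothetical reviews; requires the nltk punkt and stopwords data):

reviews = ["The food was great, but the service was slow!",
           "Absolutely loved the desserts."]
print(norm_corpus(reviews))
# e.g. ['food great servic slow', 'absolut love dessert']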
Example 9: frequency_analysis
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def frequency_analysis(input_path, output_path, stopwords=None, n_most_common=50):
    recipes = []
    with open(input_path, 'r') as f:
        for i, line in enumerate(f):
            if line == '\n':
                break
            if i == 0:
                continue  # skip header
            fields = line.split('\t')
            recipes.append(fields[1].replace("\n", ""))

    recipe_text = re.sub(r"[^a-z ]", "", ' '.join(recipes))
    recipe_words = re.split(r"\s+", recipe_text)
    stemmer = SnowballStemmer("english")
    recipe_stems = [stemmer.stem(w) for w in recipe_words]
    if stopwords is not None:
        recipe_stems = filter(None, [s for s in recipe_stems if s not in stopwords])
    top_words = Counter(recipe_stems).most_common(n_most_common)

    # write to a file;
    # do a second pass over the recipes to determine how many documents each term is in
    freq_table = open(output_path, 'w')
    for elt in top_words:
        doc_freq = sum([elt[0] in recipe for recipe in recipes])
        freq_table.write(','.join([str(e) for e in elt]) + ',' + str(doc_freq) + '\n')
    freq_table.close()
Example 10: normalized_token
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def normalized_token(token):
    """
    Use stemmer to normalize the token.
    Call this when building the graph, instead of storing the changed
    word form in file_text.
    """
    stemmer = SnowballStemmer("english")
    return stemmer.stem(token.lower())
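Example 10 constructs a new SnowballStemmer on every call; reusing a module-level instance is a cheap optimization. A possible variant (not from the original code):

# Hypothetical variant that reuses one stemmer instance across calls.
_STEMMER = SnowballStemmer("english")

def normalized_token_fast(token):
    return _STEMMER.stem(token.lower())

print(normalized_token("Running"))       # -> "run"
print(normalized_token_fast("Running"))  # same result, less per-call overhead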
Example 11: VocKeyworder
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
class VocKeyworder(BaseKeyworder):
    # NOTE: Python 2 style code (print statement, str/unicode handling).
    def __init__(self):
        super(VocKeyworder, self).__init__()
        self._vocs = engvoc.voc2000
        self._lemmatizer = WordNetLemmatizer()
        self._stemmer1 = LancasterStemmer()
        self._stemmer2 = SnowballStemmer('english')

    def add_keyword(self, gag_id, title):
        tokens = re.split(' |\.|,|;|=', title)
        for token in tokens:
            token = re.sub(r"\W+$", '', token)
            token = re.sub(r"^\W+", '', token)
            vocs = []
            try:
                token = token.encode('utf8')
                vocs.append(re.sub(r"'\w+", '', token).lower())
                vocs.append(self._lemmatizer.lemmatize(vocs[0]))
                vocs.append(self._stemmer1.stem(vocs[0]))
                vocs.append(self._stemmer2.stem(vocs[0]))
            except UnicodeDecodeError:
                continue
            if vocs[0] == '':
                continue
            try:
                float(vocs[0])
                continue
            except ValueError:
                pass
            if not any([voc in self._vocs for voc in vocs]):
                print 'voc', vocs, token
                self._add_keyword(gag_id, token)
Example 12: preprocessing
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def preprocessing(doc):  # stop word as optional
    x = re.sub("[^a-zA-Z]", " ", doc)  # only words
    x = x.lower().split()
    stemmer = SnowballStemmer("english")  # use snowball
    stops = set(stopwords.words("english"))  # set is faster than list
    x = [stemmer.stem(word) for word in x if word not in stops]
    return(x)
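A quick hypothetical call for Example 12:

print(preprocessing("The cats are running faster than 2 dogs!"))
# e.g. ['cat', 'run', 'faster', 'dog']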
Example 13: procesar
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def procesar(request, identificador):
    lmtzr = WordNetLemmatizer()
    d = Documento.objects.get(id=identificador)
    #nltk.corpus.cess_esp.words()
    tokens = nltk.word_tokenize(d.contenido.replace('.', ' . '))
    #print tokens
    #scentence = d.contenido
    #scentence = scentence.lower()
    words = tokens
    spanish_stemmer = SnowballStemmer('spanish')

    # This is the simple way to remove stop words
    important_words = []
    for word in words:
        if word not in stopwords.words('spanish'):
            important_words.append([word, lmtzr.lemmatize(word), spanish_stemmer.stem(word)])

    return render_to_response('templates/documentoProcesado.html',
                              {
                                  'original': d.contenido,
                                  'tokens': tokens,
                                  'important_words': important_words,
                                  #'pos_tags': pos_tags,
                                  #'ne_chunks': ne_chunks.subtrees(),
                              })
Example 14: Cleaner
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
class Cleaner(object):
    """
    The SQL query in get_reviews needs to be customized.
    """
    def __init__(self):
        self.sbstem = SnowballStemmer("english")
        # NOTE: Python 2 string API; on Python 3 use str.maketrans instead.
        replace = string.punctuation + string.digits
        self.replace_punctuation = string.maketrans(replace, ' ' * len(replace))
        self.locations = []
        self.cached_stopwords = stopwords.words("english")

    def clean(self, txt):
        # removes stopwords and punctuation
        txt = txt.encode('ascii', 'ignore')
        nopunct = txt.translate(self.replace_punctuation)
        no_locs = [x for x in nopunct.split() if x.lower() not in self.cached_stopwords]
        stemmed = [self.sbstem.stem(x) for x in no_locs]
        return " ".join(stemmed)

    def make_loclist(self, locations):
        locations = list(locations)
        removelist = ['Ho Chi Minh City', 'Phu Quoc Island', 'Halong Bay']
        locations = [x.lower() for x in locations if x not in removelist]
        locations.extend(['ho chi minh', 'hoan', 'kiem', 'phu quoc', 'halong', 'vietnam', 'dong', 'vnd', 'vdn'])
        locations.extend(['vietnames', 'nhatrang', 'saigon', 'america', 'maryland', 'york'])
        loc_wordlist = [f.split() for f in locations]
        loc_wordlist = list(itertools.chain(*loc_wordlist))
        self.cached_stopwords.extend(loc_wordlist)
        return loc_wordlist
Example 15: stemmed
# Required import: from nltk.stem import SnowballStemmer [as alias]
# Or: from nltk.stem.SnowballStemmer import stem [as alias]
def stemmed(text, language):
    stemmer = SnowballStemmer(language)
    tas = text.split()
    text = ""
    for word in tas:
        text = " ".join((text, stemmer.stem(word)))
    return text.lstrip()
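A short hypothetical call for Example 15; the language argument can be any language SnowballStemmer supports (e.g. "english", "spanish", "german"):

print(stemmed("the children were playing games", "english"))
# e.g. "the children were play game"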