This article collects typical usage examples of the Python method nltk.stem.lancaster.LancasterStemmer.stem. If you are wondering what LancasterStemmer.stem does, or how to use it, the curated code samples below may help. You can also read more about its containing class, nltk.stem.lancaster.LancasterStemmer.
The following shows 15 code examples of LancasterStemmer.stem, ordered by popularity by default.
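Before the individual examples, here is a minimal, self-contained sketch of the method itself (the sample words are ours, chosen for illustration):

from nltk.stem.lancaster import LancasterStemmer

st = LancasterStemmer()
for w in ['maximum', 'running', 'women', 'presumably']:
    print("%s => %s" % (w, st.stem(w)))
# Lancaster is an aggressive stemmer; expect short roots such as 'maxim' and 'wom'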
Example 1: simplify_old
# Required import: from nltk.stem.lancaster import LancasterStemmer [as alias]
# Or: from nltk.stem.lancaster.LancasterStemmer import stem [as alias]
def simplify_old(s):
    res = ''
    st = LancasterStemmer()
    text = nltk.word_tokenize(s)
    tags = nltk.pos_tag(text)
    for tag in tags:
        word = tag[0]
        if f.checkPos(tag[1]):
            if word in model:
                word_stem = st.stem(word)
                top_words = model.most_similar(positive=[word], topn=20)
                candidate_list = [w[0] for w in top_words]
                freq_list = [fdist[w] for w in candidate_list]
                c_f_list = zip(candidate_list, freq_list)
                ordered_list = sorted(c_f_list, key=lambda pair: pair[1], reverse=True)
                word_freq = fdist[word]
                # synonmys = f.getSynonmys(word)  ## get synonyms from wordnet
                # print synonmys
                for w in ordered_list:
                    if not f.freq_diff(word_freq, w[1]):
                        ## stop once a candidate's frequency no longer exceeds the word's frequency by the threshold
                        break
                    if st.stem(w[0]) != word_stem and f.samePos(word, w[0]):
                        ## exclude morphological derivations of the same stem; require the same POS
                        word = w[0]  ### do not use wordnet
                        # if w[0] in synonmys:
                        #     word = w[0]
                        # else:
                        #     for syn in synonmys:
                        #         if st.stem(w[0]) == st.stem(syn):
                        #             word = w[0]
        res = res + word + ' '
    return res
Example 2: getstems
# Required import: from nltk.stem.lancaster import LancasterStemmer [as alias]
# Or: from nltk.stem.lancaster.LancasterStemmer import stem [as alias]
def getstems(dict):
    # note: the parameter name shadows the built-in dict
    l = LancasterStemmer()
    stems = {}
    for word in dict:
        if word in dicts.irregforms:
            # stem irregular forms via their base form
            stems[word] = l.stem(dicts.irregforms[word])
        else:
            stems[word] = l.stem(word)
    return stems
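A quick usage sketch for getstems; the dicts class below is a hypothetical stand-in for the module the original project imports:

# Hypothetical stand-in for the project's dicts module
class dicts:
    irregforms = {'went': 'go', 'mice': 'mouse'}

print(getstems(['went', 'mice', 'walking']))
# irregular forms are stemmed via their base form; all other words are stemmed directly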
Example 3: mapper
# Required import: from nltk.stem.lancaster import LancasterStemmer [as alias]
# Or: from nltk.stem.lancaster.LancasterStemmer import stem [as alias]
def mapper(shard, doc_counter):
    st = LancasterStemmer()
    with open(shard, "r") as f:
        ohsu = json.JSONDecoder().decode(f.read())
    output_values = []
    doc_counter.add(len(ohsu))
    for article in ohsu:
        output_values += [(w, (article[".I"], 'a')) for w in article[".A"]]
        output_values += [(st.stem(w), (article[".I"], 't')) for w in alphabet.findall(article[".T"].lower())]
        if article.get('.W') is not None:
            body_words = (w for w in alphabet.findall(article[".W"].lower()))
            output_values += [(st.stem(w), (article[".I"], 'w')) for w in body_words]
    return output_values
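The shard argument is read as JSON; judging from the field accesses above, each record looks roughly like the sketch below, and alphabet is presumably a compiled pattern such as re.compile(r'[a-z]+'):

# Assumed shape of one shard (field names taken from the code above)
sample_shard = [
    {".I": "1", ".A": ["author-keyword"], ".T": "Sample Title", ".W": "sample abstract text"},
]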
Example 4: poss_train
# Required import: from nltk.stem.lancaster import LancasterStemmer [as alias]
# Or: from nltk.stem.lancaster.LancasterStemmer import stem [as alias]
def poss_train(train_file, train_write, sw_file):
    """
    Tokenize, stopword-filter, and Lancaster-stem a training CSV.

    Arguments:
    - `train_file`: input CSV; row[1] is the title, row[2] the HTML body
    - `train_write`: path of the output CSV
    - `sw_file`: stopword file, one word per line
    """
    a = 0
    f = open(train_file)
    reader = csv.reader(f)
    t = open(train_write, "w")
    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    #stopwords = sw  # use the file-based stopword list instead
    stopwords = nltk.corpus.stopwords.words('english')  # use nltk stopwords
    print "stopword list length", len(stopwords)
    stopwords = set(stopwords)
    g = lambda x: x not in stopwords
    for row in reader:
        if a % 10000 == 0:
            print a
        a += 1
        title = row[1].lower()
        # clean html (nltk.clean_html was removed in NLTK 3; use BeautifulSoup there)
        body = nltk.clean_html(row[2].lower())
        # word tokenize
        pattern = r"([a-z])\w+"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)
        # remove stopwords
        body = filter(g, body)
        title = filter(g, title)
        # light stem
        st = LancasterStemmer()
        title = set([st.stem(word) for word in title])
        body = set(body)
        body = set([st.stem(word) for word in body])
        # list to string
        body = ' '.join(body)
        title = ' '.join(title)
        t.write('"%s","%s","%s","%s"\n' % (row[0], title, body, row[3]))
Example 5: stemming
# Required import: from nltk.stem.lancaster import LancasterStemmer [as alias]
# Or: from nltk.stem.lancaster.LancasterStemmer import stem [as alias]
def stemming(words):
    wordsAfterStemming = []
    st = LancasterStemmer()
    for x in words:
        y = st.stem(x)
        wordsAfterStemming.append(y)
    return wordsAfterStemming
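A direct call, assuming the usual import:

from nltk.stem.lancaster import LancasterStemmer

print(stemming(['friendship', 'friendly', 'friends']))
# returns the Lancaster stem of each input word, in order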
Example 6: score_sentence
# Required import: from nltk.stem.lancaster import LancasterStemmer [as alias]
# Or: from nltk.stem.lancaster.LancasterStemmer import stem [as alias]
def score_sentence(sentence, weights, stop_words):
    """
    Score a sentence as the sum of the weights of its non-stopword roots.

    Parameters: sentence: string, weights: Counter keyed by root, stop_words: words to ignore
    """
    lemmatizer = WordNetLemmatizer()
    stemmer = LancasterStemmer()
    sentence = strip_punc(sentence)
    tokens = word_tokenize(sentence)
    score = 0
    for token in tokens:
        # lemmatize first, then stem, so inflected forms collapse to one root
        root = stemmer.stem(lemmatizer.lemmatize(token))
        if token not in stop_words and root not in stop_words:
            score += weights[root]
    return score
Example 7: readText
# Required import: from nltk.stem.lancaster import LancasterStemmer [as alias]
# Or: from nltk.stem.lancaster.LancasterStemmer import stem [as alias]
def readText(textFile):
    examples = []
    count = 0
    lexicon_en = {}
    lexicon_ge = {}
    stem_en = LancasterStemmer()
    stem_ge = nltk.stem.snowball.GermanStemmer()
    for line in open(textFile):
        count += 1
        if count % 1000 == 0:
            print count
        lans = line.lower().strip().split("|||")
        #german = [stem_ge.stem(x.decode('utf-8')) for x in lans[0].strip().split(" ")]
        german = lans[0].strip().split(" ")
        german = process(german)
        for wordx in german:
            for word in wordx:
                if word not in lexicon_ge:
                    lexicon_ge[word] = 1
                else:
                    lexicon_ge[word] += 1
        eng = [stem_en.stem(x.decode('utf-8')) for x in lans[1].strip().split(" ")]
        #parse_en = pattern.en.parse(" ".join(eng))
        eng = lans[1].strip().split(" ")  # note: this overwrites the stemmed tokens above
        for word in eng:
            if word not in lexicon_en:
                lexicon_en[word] = 1
            else:
                lexicon_en[word] += 1
        examples.append(Example(german, eng))
    return examples, lexicon_en, lexicon_ge
Example 8: word_stem_example
# Required import: from nltk.stem.lancaster import LancasterStemmer [as alias]
# Or: from nltk.stem.lancaster.LancasterStemmer import stem [as alias]
def word_stem_example(word="Amevive"):
    """
    See http://www.nltk.org/book/ch03.html, 3.6 Normalizing Text.
    Per the NLTK book, the Porter algorithm is the more robust choice and is recommended.
    """
    stemmer = LancasterStemmer()
    print("Lancaster [%s => %s]" % (word, stemmer.stem(word)))
    stemmer = PorterStemmer()  # <=== recommended algorithm
    print("Porter [%s => %s]" % (word, stemmer.stem(word)))
    stemmer = RegexpStemmer('ing$|s$|e$', min=4)
    print("Regexp [%s => %s]" % (word, stemmer.stem(word)))
    stemmer = SnowballStemmer('english')  # choose a language
    print("Snowball [%s => %s]" % (word, stemmer.stem(word)))
Example 9: preprocess
# Required import: from nltk.stem.lancaster import LancasterStemmer [as alias]
# Or: from nltk.stem.lancaster.LancasterStemmer import stem [as alias]
def preprocess(reviews):
    import nltk
    from nltk.tokenize import word_tokenize
    review_tokenized = [[word.lower() for word in word_tokenize(review.decode('utf-8'))] for review in reviews]
    #print "review tokenize done"
    # remove stop words
    from nltk.corpus import stopwords
    english_stopwords = stopwords.words('english')
    review_filterd_stopwords = [[word for word in review if not word in english_stopwords] for review in review_tokenized]
    #print 'remove stop words done'
    # remove punctuation
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '&', '!', '@', '#', '$', '%']
    review_filtered = [[word for word in review if not word in english_punctuations] for review in review_filterd_stopwords]
    #print 'remove punctuations done'
    # stemming
    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()
    review_stemmed = [[st.stem(word) for word in review] for review in review_filtered]
    #print 'stemming done'
    return review_stemmed
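A hedged usage sketch; the function decodes its inputs, so it expects byte strings:

reviews = ["This movie was surprisingly good!".encode('utf-8'),
           "Terribly boring and far too long.".encode('utf-8')]
print(preprocess(reviews))
# one list of stemmed, filtered tokens per review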
Example 10: predict_category_subcategory
# Required import: from nltk.stem.lancaster import LancasterStemmer [as alias]
# Or: from nltk.stem.lancaster.LancasterStemmer import stem [as alias]
def predict_category_subcategory(book_name):
    data_set1 = pandas.Series(book_name.encode('ascii'))
    # Data preprocessing
    data_set1 = data_set1.dropna(axis=0, how='any')
    data_set1 = data_set1.str.lower()
    # Manual removal list
    remove_list = ['edition', 'ed', 'edn', 'vol', 'vol.', '-', 'i']
    data_set1[0] = ' '.join([i for i in data_set1[0].split() if i not in remove_list])
    data_set1 = data_set1.apply(lambda x: re.sub(r'\w*\d\w*', '', x).strip())
    data_set1 = data_set1.apply(lambda x: re.sub(r'\([^)]*\)', ' ', x))
    data_set1 = data_set1.apply(lambda x: re.sub('[^A-Za-z0-9]+', ' ', x))
    #data_set['Category ID'] = data_set['Category ID']+"|"+data_set['Subcategory ID']
    # Stemming the book titles
    stemmer = LancasterStemmer()
    data_set1[0] = " ".join([stemmer.stem(i) for i in data_set1[0].split()])
    clf = joblib.load(os.path.join(BASE_DIR + "/learners/", 'category_predict.pkl'))
    ans = clf.predict(data_set1)
    sub_clf = joblib.load(os.path.join(BASE_DIR + "/learners/", 'subcategory_predict.pkl'))
    sub_ans = sub_clf.predict(data_set1)
    return [ans[0], sub_ans[0]]
Example 11: word_standardize
# Required import: from nltk.stem.lancaster import LancasterStemmer [as alias]
# Or: from nltk.stem.lancaster.LancasterStemmer import stem [as alias]
def word_standardize(sentences):
    tokens = []
    sentences_st = []
    for sent in sentences:
        tokens.extend(word_tokenize(sent))
        sentences_st.append(word_tokenize(sent))
    words = tokens
    st = LancasterStemmer()
    words = [w.lower() for w in words]
    words = [w for w in words if not w in stopwords.words('english')]
    words = [w for w in words if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
    st_words = [st.stem(w) for w in words]
    sent_result = []
    for sent in sentences_st:
        sent = [w.lower() for w in sent]
        sent = [w for w in sent if not w in stopwords.words('english')]
        sent = [w for w in sent if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
        sent_result.append(sent)
    return st_words, sent_result
Example 12: LemmaTokenizer
# Required import: from nltk.stem.lancaster import LancasterStemmer [as alias]
# Or: from nltk.stem.lancaster.LancasterStemmer import stem [as alias]
class LemmaTokenizer(object):
    def __init__(self):
        #self.wnl = WordNetLemmatizer()
        self.stemmer = LancasterStemmer()

    def __call__(self, doc):
        #return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if re.match(r'[a-z]+', t, re.M|re.I)]
        return [self.stemmer.stem(t) for t in word_tokenize(doc) if re.match(r'[a-z]+', t, re.M|re.I)]
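A tokenizer class like this is typically passed to a scikit-learn vectorizer; a minimal sketch, assuming scikit-learn is available:

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=LemmaTokenizer())
X = vectorizer.fit_transform(["The cats are running.", "A cat ran."])
# the vocabulary now consists of Lancaster stems rather than surface forms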
Example 13: stem_text
# Required import: from nltk.stem.lancaster import LancasterStemmer [as alias]
# Or: from nltk.stem.lancaster.LancasterStemmer import stem [as alias]
def stem_text(text):
    stm = LancasterStemmer()
    tokens = text.split()
    words = [stm.stem(w) for w in tokens]
    snt = " ".join(words)
    return snt
Example 14: filt
# Required import: from nltk.stem.lancaster import LancasterStemmer [as alias]
# Or: from nltk.stem.lancaster.LancasterStemmer import stem [as alias]
def filt(string):
    ret = string
    # Filter all punctuation from the string
    for p in punctuation:
        ret = ret.replace(p, '')
    # Replace hyphens with spaces
    ret = ret.replace('-', ' ')
    oldret = ret
    ret = ""
    # Filter stop words and one-letter words from the string
    for word in oldret.split():
        if (word in allStopWords) or len(word) <= 1:
            pass
        else:
            ret += word.lower() + " "
    st = LancasterStemmer()
    stemmed = ""
    for word in ret.split():
        try:
            stemmed += str(st.stem(word)) + " "
        except UnicodeDecodeError:
            pass
    return stemmed
Example 15: preprocess
# Required import: from nltk.stem.lancaster import LancasterStemmer [as alias]
# Or: from nltk.stem.lancaster.LancasterStemmer import stem [as alias]
def preprocess(content):
    stopset = set(stopwords.words('english'))
    # replace punctuation and <p> tags with spaces
    tokens = word_tokenize(re.sub(r'<p>|</p>|[^A-Za-z ]', ' ', content.lower()))
    pos_list = pos_tag(tokens)
    s_tokens = list()
    # keep nouns only (the commented-out test also kept verbs)
    for pos in pos_list:
        #print pos[1]
        #if pos[1] in ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        if pos[1] in ['NN', 'NNS']:
            s_tokens.append(pos[0])
    wordfreq = FreqDist(s_tokens)
    stemfreq = dict()
    st = LancasterStemmer()
    # copy the items so entries can be deleted while iterating
    for word, freq in list(wordfreq.items()):
        # stopwords
        if word in stopset:
            del wordfreq[word]
            continue
        # tiny words
        if len(word) <= 2:
            del wordfreq[word]
            continue
        # accumulate frequencies under the Lancaster stem
        stem = st.stem(word)
        try:
            stemfreq[stem] += freq
        except KeyError:
            stemfreq[stem] = freq
    return stemfreq
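A hedged usage sketch of this final pipeline:

html_snippet = "<p>The cats chased the mice. Cats love chasing mice.</p>"
print(preprocess(html_snippet))
# returns a dict mapping each kept noun's Lancaster stem to its summed frequency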