This article collects and summarizes typical usage examples of the Python method nltk.stem.snowball.EnglishStemmer.stem. If you have been wondering what exactly EnglishStemmer.stem does, how to call it, or what real-world code that uses it looks like, the hand-picked examples below should help. You can also read further about the class this method belongs to, nltk.stem.snowball.EnglishStemmer.
A total of 15 code examples of the EnglishStemmer.stem method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
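Before diving into the collected examples, here is a minimal usage sketch of the method itself (not taken from any of the projects below): EnglishStemmer is NLTK's Snowball stemmer for English, and stem() reduces an inflected word to its stem.

from nltk.stem.snowball import EnglishStemmer

stemmer = EnglishStemmer()
print(stemmer.stem("running"))     # -> run
print(stemmer.stem("generously"))  # -> generous
print(stemmer.stem("cats"))        # -> cat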
Example 1: fix_lemma_problem
# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Alternatively: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def fix_lemma_problem(pred_scores, targets, space):
    # numpy is assumed to be imported as np at module level
    from nltk.stem.snowball import EnglishStemmer
    es = EnglishStemmer()
    r = pred_scores.copy()
    # stem every vocabulary entry so predictions sharing a stem with the target can be masked out
    lemmas = np.array([es.stem(v) for v in space.vocab])
    for i, t in enumerate(targets):
        g = es.stem(space.vocab[t])
        mask = (lemmas == g)
        #print space.vocab[t], np.sum(mask)
        r[i][mask] = -1e9
        #print r[i][mask]
    return r
Example 2: similarity_score
# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Alternatively: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def similarity_score(word1, word2):
    """ see sections 2.3 and 2.4 of http://dx.doi.org.ezp-prod1.hul.harvard.edu/10.1109/TKDE.2003.1209005
    :type word1: string
    :type word2: string
    :return: float: between 0 and 1; similarity between the two given words
    """
    stemmer = EnglishStemmer()
    # words that share a stem are treated as maximally similar
    if stemmer.stem(word1) == stemmer.stem(word2):
        return 1
    alpha = 0.2
    beta = 0.6
    l, h = get_path_length_and_subsumer_height(word1, word2)
    return exp((-1)*alpha*l)*((exp(beta*h)-exp((-1)*beta*h))/(exp(beta*h)+exp((-1)*beta*h)))
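For reference, the value returned on the last line is the word-similarity measure from the cited TKDE 2003 paper, sim(word1, word2) = exp(-alpha*l) * tanh(beta*h), where l is the shortest path length between the two words in a lexical taxonomy such as WordNet and h is the depth of their lowest common subsumer; get_path_length_and_subsumer_height is assumed to be defined elsewhere in the same project.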
Example 3: getAllStemEntities
# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Alternatively: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def getAllStemEntities(entities):
    # numpy is assumed to be imported as np at module level (Python 2 code)
    st = EnglishStemmer()
    q = [',', '.', '!', '?', ':', ';']
    tmp = []
    sourceEntities = [x for x in entities if len(x) > 0]
    np.random.shuffle(entities)
    for i in xrange(len(entities)):
        if len(entities[i]) == 0:
            continue
        if i % 1000 == 0:
            print i
        entities[i] = entities[i].lower()
        # normalize dashes, parentheses and apostrophes so tokens split cleanly
        entities[i] = entities[i].replace(' - ', ' \u2013 ', entities[i].count(' - '))
        entities[i] = entities[i].replace(' -', ' \u2013', entities[i].count(' -'))
        entities[i] = entities[i].replace('- ', '\u2013 ', entities[i].count('- '))
        entities[i] = entities[i].replace('-', ' - ', entities[i].count('-'))
        entities[i] = entities[i].replace(')', ' )', entities[i].count(')'))
        entities[i] = entities[i].replace('(', '( ', entities[i].count('('))
        entities[i] = entities[i].replace('\u0027', ' \u0027', entities[i].count('\u0027'))
        # detach the listed punctuation marks from preceding words
        for w in q:
            entities[i] = entities[i].replace(w, ' ' + w, entities[i].count(w))
        word = entities[i].split(' ')
        s = ''
        for w in word:
            s += st.stem(unicode(w)) + ' '
        tmp.append(s[:-1])
        if len(tmp) > 50:
            break
    return tmp, entities[:len(tmp)]
Example 4: query
# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Alternatively: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def query(word):
    # MySQLdb is assumed to be imported at module level (Python 2 code)
    db = MySQLdb.connect("127.0.0.1", "dizing", "ynr3", "dizing")
    cursor = db.cursor()
    snowball_stemmer = EnglishStemmer()
    stem2 = snowball_stemmer.stem(word)
    cursor.execute("SELECT * FROM words WHERE original=%s OR stem1=%s OR stem2=%s", (word, word, stem2))
    rows = cursor.fetchall()
    words1 = dict()
    words2 = dict()
    for row in rows:
        if row[1] == word or row[3] == word:
            words1[word] = row[0]
        else:
            words2[word] = row[0]
    scenes1 = []
    scenes2 = []
    for (i, words_dict) in [(1, words1), (2, words2)]:
        wids = words_dict.values()
        for wid in wids:
            sql = "SELECT s.sentence, s.start, s.stop, s.ready, m.title FROM scene AS s, words_scenes AS ws, movie as m " + \
                  "WHERE ws.wid=%d AND ws.sid=s.sid AND s.mid = m.mid" % int(wid)
            # print sql
            cursor.execute(sql)
            rows = cursor.fetchall()
            if (i == 1): scenes1 += rows
            else: scenes2 += rows
    print scenes1
    print scenes2
    # close the connection before returning the combined result
    db.close()
    return scenes1 + scenes2
Example 5: Granularity
# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Alternatively: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def Granularity(sentenceArray):
    # re and os are assumed to be imported at module level; 'word' (the output file name)
    # is assumed to be defined in the enclosing module
    for sentence in sentenceArray:
        # print(sentence)
        try:
            stemmer = EnglishStemmer()
            # strip hashtags, @-mentions and pic.twitter links
            sentence = re.sub(r'\#.*?$', '', sentence)
            sentence = re.sub(r'\#.*? ', '', sentence)
            sentence = re.sub(r'\@.*?$', '', sentence)
            sentence = re.sub(r'\@.*? ', '', sentence)
            sentence = re.sub(r'pic.twitter.*?$', '', sentence)
            sentence = re.sub(r'pic.twitter.*? ', '', sentence)
            # expand common contractions
            sentence = re.sub(r'\'m', ' am', sentence)
            sentence = re.sub(r'\'d', ' would', sentence)
            sentence = re.sub(r'\'ll', ' will', sentence)
            sentence = re.sub(r'\&', 'and', sentence)
            sentence = re.sub(r'don\'t', 'do not', sentence)
            data = stemmer.stem(sentence)
            print(data)
            from nltk.corpus import stopwords
            sentence = str(data)
            stop = stopwords.words('english')
            final = [i for i in sentence.split() if i not in stop]
            finalstring = ' '.join(final)
            os.system("printf \"" + str(finalstring) + "\n\">> stemstop/" + word)
        except Exception as e:
            print(e)
Example 6: pre_proc
# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Alternatively: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def pre_proc(in_str, removestop=True, alwayskeep=False, word_punc=False, unquote=False):
    # HOBJ (a spell-checker object), WORD_PUNC_LIST, strip_accents, lev_dist,
    # stopwords and wordpunct_tokenize are assumed to be defined/imported at module level
    # remove accents, wordify punctuation
    in_str = strip_accents(in_str, wordify=word_punc, unquote=unquote)
    en_stem = EnglishStemmer()
    # tokenize string
    if removestop:  # remove stop words
        tok_list = filter(lambda x: x not in stopwords.words('english'), wordpunct_tokenize(in_str))
    else:
        tok_list = wordpunct_tokenize(in_str)
    new_tok_list = []
    for tok in tok_list:
        if tok not in WORD_PUNC_LIST:
            correct_spell = HOBJ.spell(tok)
            if not correct_spell:
                suggestions = [strip_accents(tmp_sug).lower() for tmp_sug in HOBJ.suggest(tok)]
            else:
                suggestions = []
            if correct_spell or (tok.lower() in suggestions):
                # correctly spelled token: keep it, plus its stem if different
                new_tok_list.append(tok)
                tok_stem = en_stem.stem(tok)
                if tok_stem != tok:
                    new_tok_list.append(tok_stem)
            elif len(tok) >= 3:
                # misspelled token: look for a close spelling suggestion
                tok_sug = None
                lev_perc = .34
                for sug in suggestions:
                    if not tok_sug and tok == sug[1:]:
                        tok_sug = sug
                if not tok_sug:
                    for sug in suggestions:
                        tmp_lev_perc = float(lev_dist(tok, sug)) / float(max(len(tok), len(sug)))
                        if not tok_sug and tmp_lev_perc < lev_perc:
                            tok_sug = sug
                            lev_perc = tmp_lev_perc
                if tok_sug:
                    new_tok_list.append(tok_sug)
                    tok_stem = en_stem.stem(tok_sug)
                    if tok_stem != tok_sug:
                        new_tok_list.append(tok_stem)
                elif alwayskeep:
                    new_tok_list.append(tok)
            elif alwayskeep:
                new_tok_list.append(tok)
        else:
            new_tok_list.append(tok)
    out_str = string.join(new_tok_list, ' ')  # Python 2 idiom; equivalent to ' '.join(new_tok_list)
    return out_str.lower()
Example 7: as_eng_postagged_doc
# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Alternatively: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def as_eng_postagged_doc(doc):
    '''Uses nltk default tagger.'''
    tags = [t for _, t in nltk.pos_tag(list(doc.word))]
    stemmer = EnglishStemmer()
    # note: these are Snowball stems rather than true lemmata
    lemmata = [stemmer.stem(w) for w in list(doc.word)]
    doc['pos'] = Series(tags)
    doc['lemma'] = Series(lemmata)
    return doc
Example 8: _execute
# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Alternatively: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def _execute(self):
    # mongoExtractText and the sentiment module are assumed to be defined elsewhere in the project
    corpus = mongoExtractText(self.name)
    stemmer = EnglishStemmer()
    for item in corpus:
        line = item.replace(',', ' ')
        stemmed_line = stemmer.stem(line)
        self.sentiment.append((sentiment.sentiment(stemmed_line), stemmed_line))
Example 9: stem_word
# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Alternatively: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def stem_word(word):
    """
    Stem a word
    :param word: (str) text word
    :returns: stemmed word
    """
    stemmer = EnglishStemmer()
    return stemmer.stem(word)
Example 10: text_processing
# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Alternatively: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def text_processing(text, min_size=4, sep_char=' '):
    # contractions_without_punc is assumed to be defined at module level
    from nltk.stem.snowball import EnglishStemmer
    from nltk.corpus import stopwords as stwds
    stemmer = EnglishStemmer()
    stopwords = set(stwds.words('english') +
                    contractions_without_punc)
    # keep only non-stopword tokens of at least min_size characters, stemmed
    text = [stemmer.stem(w) for w in text.split(sep_char)
            if w not in stopwords
            and len(w) >= min_size]
    return text
    # the remaining lines follow the return above and are never executed
    words = list()
    for word in text:
        words.append(stemmer.stem(word))
    return words
Example 11: computeSentiment
# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Alternatively: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def computeSentiment(tweet_text):
    # tokenize, positive_terms and negative_terms are assumed to be defined at module level
    pos_count = 0
    neg_count = 0
    pos_terms = []
    neg_terms = []
    st = EnglishStemmer()
    tokenized_tweet = tokenize(tweet_text)
    for t in tokenized_tweet:
        #print st.stem(t.lower())
        if st.stem(t.lower()) in negative_terms:
            neg_terms.append(t.lower())
            neg_count += 1
        elif st.stem(t.lower()) in positive_terms:
            pos_terms.append(t.lower())
            pos_count += 1
    return pos_count, neg_count, set(pos_terms), set(neg_terms)
Example 12: use_snowball_stemmer
# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Alternatively: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def use_snowball_stemmer(self, word):
    """
    Return the word stemmed with the Snowball algorithm
    :param word:
    :return:
    """
    englishStemmer = EnglishStemmer()
    stemmed_word = englishStemmer.stem(word)
    return stemmed_word
Example 13: getLemmatizerInfo
# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Alternatively: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def getLemmatizerInfo(pathArticle):
    # isOk is assumed to be defined elsewhere in the project (Python 2 code)
    data = open(pathArticle, "r")
    text1 = data.read().decode("utf-8")
    sourceText = text1
    links1 = []
    l = 0
    # record the offset of every token in the source text
    for q in text1.split():
        if q == "\ufeff":
            continue
        links1.append([text1.find(q, l), q])
        l = len(q) + 1 + text1.find(q, l)
    # normalize dashes, parentheses and apostrophes
    text1 = text1.replace(" - ", " \u2013 ", text1.count(" - "))
    text1 = text1.replace(" -", " \u2013", text1.count(" -"))
    text1 = text1.replace("- ", "\u2013 ", text1.count("- "))
    text1 = text1.replace("-", " - ", text1.count("-"))
    text1 = text1.replace("(", "( ", text1.count("("))
    text1 = text1.replace(")", " )", text1.count(")"))
    text1 = text1.replace(" \u0027", " \u301E", text1.count(" \u0027"))
    text1 = text1.replace("\u0027", " \u0027", text1.count("\u0027"))
    text1 = text1.split()
    if text1[0] == u"\ufeff":
        text1 = text1[1:]
    text = []
    for word in text1:
        text2 = []
        if len(word) == 0:
            continue
        # detach trailing punctuation from the word, preserving its order
        while word[len(word) - 1] in [",", ".", "!", "?", ":", ";"]:
            text2.append(word[len(word) - 1])
            word = word[:-1]
            if len(word) == 0:
                break
        text.append(word)
        for i in range(len(text2) - 1, -1, -1):
            text.append(text2[i])
    out = ""
    st = EnglishStemmer()
    l = 0
    links = []
    # stem each acceptable word and track character offsets in the output string
    for word in text:
        if isOk(word):
            q = st.stem(word) + " "
        else:
            q = word + " "
        out += q.lower()
        links.append([l, q])
        l += len(q)
    return out, links, links1, sourceText
Example 14: get_query
# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Alternatively: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def get_query(vec_dict):
    # numpy is assumed to be imported as np at module level (Python 2 code)
    dim = 300  # dimension of the GloVe vectors chosen
    # initialize stemmer for search in the GloVe vector space
    st = EnglishStemmer()
    query = raw_input("Please enter search query:")
    query_vector = np.zeros(dim)
    numWords = 0
    for word in query.split():
        if st.stem(word) in vec_dict:
            query_vector += vec_dict[st.stem(word)].astype(np.float)
            numWords += 1
        elif st.stem(word) + "e" in vec_dict:
            # if the stem itself is not in the vocabulary, try it with a trailing "e" restored
            query_vector += vec_dict[st.stem(word) + "e"].astype(np.float)
            numWords += 1
    # average the word vectors to obtain the query vector
    query_vector /= numWords
    return query, query_vector
Example 15: stemming
# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Alternatively: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def stemming(tweet):
    tweets = tweet.split()
    wrdStemmer = EnglishStemmer()
    stemTweet = []
    try:
        # stem every token of the tweet
        for tweet in tweets:
            tweet = wrdStemmer.stem(tweet)
            stemTweet.append(tweet)
    except:
        print("Error: Stemming")
    return " ".join(stemTweet)