This article collects typical usage examples of the Python class nltk.stem.WordNetLemmatizer. If you have been wondering what the WordNetLemmatizer class is for, how to use it, or where to find usage examples, the curated class examples below may help.
The following shows 15 code examples of the WordNetLemmatizer class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
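Before the examples, here is a minimal sketch of the basic API for reference (it assumes the WordNet corpus has already been downloaded via nltk.download; the sample words are illustrative only):

import nltk
from nltk.stem import WordNetLemmatizer

# Download the WordNet data once if it is not already installed.
nltk.download('wordnet')

wnl = WordNetLemmatizer()
print(wnl.lemmatize('cats'))              # -> 'cat'  (the default part of speech is noun)
print(wnl.lemmatize('running', pos='v'))  # -> 'run'  (pass pos explicitly for verbs, adjectives, adverbs)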
Example 1: bow_score
def bow_score(hypothesis_list, text_list):
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_word_list = ['a', 'an', 'the', ',', '.', ';', ':']
    i = 0
    while i < len(hypothesis_list):
        if hypothesis_list[i] in stop_word_list:
            del hypothesis_list[i]
            i = i - 1
        i = i + 1
    if len(hypothesis_list) == 0:
        return 0
    i = 0
    while i < len(text_list):
        if text_list[i] in stop_word_list:
            del text_list[i]
            i = i - 1
        i = i + 1
    if len(text_list) == 0:
        return 0
    ## Stop words removed up until here
    score = 0
    for word_text in text_list:
        lemma_text = wordnet_lemmatizer.lemmatize(word_text)
        for word_hypothesis in hypothesis_list:
            lemma_hypothesis = wordnet_lemmatizer.lemmatize(word_hypothesis)
            print(lemma_hypothesis)
            print(lemma_text)
            score += lexical_compare(lemma_text, lemma_hypothesis)
    print(score)
    return score
Example 2: labelBasedEntry
def labelBasedEntry(term, uri):
    wnl = WordNetLemmatizer()
    hm = {}
    sparql = Sparql.Connection()
    if " " in term:
        term = term.split(" ")[1]
    stem = wnl.lemmatize(term)
    wiktionary_informations = sparql.getWiktionaryInformationsNEW(stem)
    for x in wiktionary_informations:
        if " + " in x[0] and "," not in x[0] and "*" not in x[0]:
            tmp = x[0].split(" + ")[0]
            if "Adjective" in x[1]:
                hm[LexiconGenerator.AdjectivePPFrame(tmp, uri, {})] = ""
            if "Verb" in x[1]:
                hm[LexiconGenerator.TransitiveFrame(tmp, uri, {})] = ""
            if "Noun" in x[1]:
                hm[LexiconGenerator.NounPPFrame(tmp, uri, {})] = ""
        elif "," not in x[0] and "*" not in x[0]:
            if "Adjective" in x[1]:
                hm[LexiconGenerator.AdjectivePPFrame(term, uri, {})] = ""
            if "Verb" in x[1]:
                hm[LexiconGenerator.TransitiveFrame(term, uri, {})] = ""
            if "Noun" in x[1]:
                hm[LexiconGenerator.NounPPFrame(term, uri, {})] = ""
    if len(wiktionary_informations) == 0:
        hm[LexiconGenerator.TransitiveFrame(stem, uri, {})] = ""
        hm[LexiconGenerator.NounPPFrame(stem, uri, {})] = ""
    entry = []
    for key in hm:
        entry.append(key)
    return entry
Example 3: stemming
def stemming(words_l, type="PorterStemmer", lang="english", encoding="utf8"):
    supported_stemmers = [
        "PorterStemmer", "SnowballStemmer",
        "LancasterStemmer", "WordNetLemmatizer"]
    if type is False or type not in supported_stemmers:
        return words_l
    else:
        l = []
        if type == "PorterStemmer":
            stemmer = PorterStemmer()
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        if type == "SnowballStemmer":
            stemmer = SnowballStemmer(lang)
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        if type == "LancasterStemmer":
            stemmer = LancasterStemmer()
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        if type == "WordNetLemmatizer":  # TODO: context
            wnl = WordNetLemmatizer()
            for word in words_l:
                l.append(wnl.lemmatize(word).encode(encoding))
        return l
Example 4: getWordCounts
def getWordCounts(WordCloudTweetNo):
    print('Fetching the most commonly used {0} words in the "{1}" feed...'.format(WordCloudTweetNo, ScreenName))
    cur = "DELETE FROM WordsCount;"
    conn.execute(cur)
    conn.commit()
    cur = 'SELECT tweet_text FROM UserTimeline'
    data = conn.execute(cur)
    StopList = stopwords.words('english')
    Lem = WordNetLemmatizer()
    AllWords = ''
    for w in tqdm(data.fetchall(), leave=1):
        try:
            # remove certain characters and strings
            CleanWordList = re.sub(r'http://[\w.]+/+[\w.]+', "", w[0], flags=re.IGNORECASE)
            CleanWordList = re.sub(r'https://[\w.]+/+[\w.]+', "", CleanWordList, flags=re.IGNORECASE)
            CleanWordList = re.sub(r'[@#\[\]\'"$.;{}~`<>:%&^*()-?_!,+=]', "", CleanWordList)
            # tokenize and convert to lower case
            CleanWordList = [words.lower() for words in word_tokenize(CleanWordList) if words not in StopList]
            # lemmatize words
            CleanWordList = [Lem.lemmatize(word) for word in CleanWordList]
            # join words
            CleanWordList = ' '.join(CleanWordList)
            AllWords += CleanWordList
        except Exception as e:
            print(e)
            sys.exit(e)
    if AllWords is not None:
        words = [word for word in AllWords.split()]
        c = Counter(words)
        for word, count in c.most_common(WordCloudTweetNo):
            conn.execute("INSERT INTO WordsCount (word, frequency) VALUES (?,?)", (word, count))
        conn.commit()
Example 5: preprocess
def preprocess(line, is_lmz=False):
    line = wordpunct_tokenize(line.strip())
    if is_lmz:
        lemmatizer = WordNetLemmatizer()
        line = [lemmatizer.lemmatize(word) for word in line]
    return line
Example 6: create_lexicon
def create_lexicon(pos_file, neg_file):
    lex = []

    # read a file and collect its tokens
    def process_file(_f):
        with open(_f, 'r') as f:
            lex = []
            lines = f.readlines()
            # print(lines)
            for line in lines:
                words = word_tokenize(line.lower())
                lex += words
            return lex

    lex += process_file(pos_file)
    lex += process_file(neg_file)
    # print(len(lex))

    lemmatizer = WordNetLemmatizer()
    lex = [lemmatizer.lemmatize(word) for word in lex]  # lemmatize (cats -> cat)

    word_count = Counter(lex)
    # print(word_count)
    # {'.': 13944, ',': 10536, 'the': 10120, 'a': 9444, 'and': 7108, 'of': 6624, 'it': 4748, 'to': 3940......}
    # Drop very common words (the, a, and, ...) and very rare words; they contribute
    # nothing to deciding whether a review is positive or negative.
    lex = []
    for word in word_count:
        if word_count[word] < 2000 and word_count[word] > 20:  # thresholds are hard-coded; a percentage would probably work too
            lex.append(word)  # Zipf's law - verifying the Zipf distribution of a text with Python: http://blog.topspeedsnail.com/archives/9546
    return lex
Example 7: split_into_words
def split_into_words(text, lemmatize=False, reattach=True, replace_numbers=True, split_off_quotes=True,
                     fix_semicolon_mistakes=True):
    if fix_semicolon_mistakes:
        text = fix_semicolons(text)

    word_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    # get rid of certain character so that we can use those for special purposes
    tokens = word_tokenizer.tokenize(text)

    if reattach:
        tokens = reattach_clitics(tokens)

    if split_off_quotes:
        tokens = split_off_quote_marks(tokens)

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    if replace_numbers:
        tokens = [re.sub('[0-9]', '#', t) for t in tokens]

    tokens = split_off_final_punctuation(tokens)
    tokens = split_off_commas(tokens)

    return tokens
Example 8: lemmatize
def lemmatize(self):
    wnl = WordNetLemmatizer()
    self.lemma_list = []
    for i in self.tokens_no_punct:
        lemmy_word = wnl.lemmatize(i)
        self.lemma_list.append(lemmy_word)
Example 9: get_words_list
def get_words_list(dataset):
    '''
    Load the dataset and read its contents, tokenize the text and lemmatize the words.
    '''
    # join the path and file name together
    spam_path = 'data/enron/pre/' + dataset + '/spam/'
    ham_path = 'data/enron/pre/' + dataset + '/ham/'

    spam_npl = [i[-1] for i in os.walk(spam_path)][0]
    ham_npl = [i[-1] for i in os.walk(ham_path)][0]

    spam_fl = (open(os.path.join(spam_path, j)).read().lower() for j in spam_npl)
    ham_fl = (open(os.path.join(ham_path, j)).read().lower() for j in ham_npl)

    splitter = re.compile("\\W*")
    english_stops = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # tokenize the files into words
    spam_wl = [None] * len(spam_npl)
    for i, f in enumerate(spam_fl):
        spam_wl[i] = [word for word in (lemmatizer.lemmatize(w) for w in splitter.split(f)
                      if w not in english_stops and w.isalpha()) if len(word) > 2 and len(word) < 20]

    ham_wl = [None] * len(ham_npl)
    for i, f in enumerate(ham_fl):
        ham_wl[i] = [word for word in (lemmatizer.lemmatize(w) for w in splitter.split(f)
                     if w not in english_stops and w.isalpha()) if len(word) > 2 and len(word) < 20]

    return spam_wl, ham_wl
Example 10: preProcessHistogram
def preProcessHistogram(documents):
    """
    preProcessHistogram(listofString) -> listOfString

    Consumes a list of sentences, tokenizes it, and returns a list of lemmatized words.
    """
    paragraph = ""
    for sentence in documents:
        paragraph = paragraph + " " + sentence.lower()

    # make all words lowercase and remove all punctuation
    lowerCaseParagraph = paragraph.translate(str.maketrans("", "", punctuation))

    words = lowerCaseParagraph.split()

    lemmatizer = WordNetLemmatizer()
    # lemmatize every word (if it needs to be lemmatized) and remove words that are
    # too long, because chances are they aren't words.
    words = list(map(lambda x: lemmatizer.lemmatize(x, 'v'), words))
    words = list(filter(lambda x: len(x) < 10 or x.isdigit(), words))

    return words
Example 11: text_tokenize
def text_tokenize(sentence):
    # stemmer = SnowballStemmer('english')
    lmtr = WordNetLemmatizer()
    tokens = [x.lower() for x in word_tokenize(sentence) if x.isalpha()]
    tokens_tagged = nltk.pos_tag(tokens)
    tokens_tagged = [(x, get_wordnet_pos(y)) for (x, y) in tokens_tagged if x not in stopwords.words('english')]
    return [lmtr.lemmatize(x, y) if y != '' else x for (x, y) in tokens_tagged]
Example 12: getdata
def getdata():
    """
    Retrieves the data from the repository table, removes special characters
    and stop words, and lemmatizes the remaining words (using the nltk package).
    """
    conn = db.getDBConnection()
    cursor = conn.cursor()
    global stopWordSet
    sql = "select id, description from repository"
    rows = db.executeSQL(conn, sql)
    counter = 1
    wnl = WordNetLemmatizer()
    for row in rows:
        id = row[0]
        desc = row[1]
        # print(desc)
        if desc is not None:
            desc = desc.replace('-', ' ').replace(',', ' ').replace('/', ' ').replace('.', ' ').replace('_', ' ')
            desc = desc.lower()
            desc = re.sub('[^a-z0-9 ]', '', desc)
            keywords = desc.split(" ")
            for word in keywords:
                # word = porter.stem(word.strip())
                word = wnl.lemmatize(word.strip())
                if word not in stopWordSet:
                    sql1 = "insert into keywords1 values(" + str(counter) + ",'" + word + "'," + str(id) + ',' + str(0) + ")"
                    print(sql1)
                    cursor.execute(sql1)
                    conn.commit()
                    counter = counter + 1
Example 13: lemmatize
def lemmatize(tweets):
    '''
    Lemmatize words in the corpus.

    Input:
    ------------------
    tweets: List of lists, [[word1OfTweet1, word2OfTweet1,...,word_m1OfTweet1],
                            [word1OfTweet2, word2OfTweet2,...,word_m2OfTweet2],
                            .
                            .
                            .
                            [word1OfTweetN, word2OfTweetN,...,word_mNOfTweetN]]

    Output:
    -----------------
    newTweets: All the words in the tweet lemmatized.
    '''
    wordnet_lemmatizer = WordNetLemmatizer()
    pos_tag_tweets = [nltk.pos_tag(t) for t in tweets]
    tweets = []
    i = 0
    for t in pos_tag_tweets:
        tt = []
        for w in t:
            if get_wordnet_pos(w[1]) == '':
                tt.append(w[0])
            else:
                try:
                    tt.append(wordnet_lemmatizer.lemmatize(w[0], pos=get_wordnet_pos(w[1])))
                except UnicodeDecodeError:
                    pass
        tweets.append(tt)
        i += 1
    return tweets
Example 14: _lemma_
def _lemma_(token):
    if isinstance(token, str):
        return _stem_(token)

    from nltk.corpus import wordnet

    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''

    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    p = get_wordnet_pos(token.pos()[0][1])
    if p != wordnet.VERB:
        return _stem_(token[0])
    rs = wordnet_lemmatizer.lemmatize(token[0], pos=p)
    return rs
Example 15: GetCleanWords
def GetCleanWords(content_string):
    # Tokenize the sentences using the Punkt word tokenizer
    tokenized_words = PunktWordTokenizer().tokenize(content_string)

    # Now let's remove the stop words
    tokenized_words = [word for word in tokenized_words if word.lower() not in stopwords_list]

    # Now let's remove the tokens that are solely punctuation
    punctuation_list = ['.', ',', ';', ':', '!', '?']
    tokenized_words = [word for word in tokenized_words if word not in punctuation_list]

    # Finally let's get rid of the punctuation at the end of each word
    cleaned_words = []
    for word in tokenized_words:
        if word[-1] in punctuation_list:
            cleaned_words.append(word[:-1])
        else:
            cleaned_words.append(word)

    # Now let's lemmatize each of the words to lower our word count
    wnl = WordNetLemmatizer()
    clean_and_stemmed_words = [wnl.lemmatize(cleaned_word) for cleaned_word in cleaned_words]
    return clean_and_stemmed_words