This article collects typical usage examples of the Python class nltk.stem.wordnet.WordNetLemmatizer. If you have been wondering what WordNetLemmatizer is for, how to use it, or want to see it in real code, the curated class examples below should help.
Below are 15 code examples of the WordNetLemmatizer class, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
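All of the examples revolve around a single method, lemmatize(word, pos), where pos defaults to 'n' (noun) and also accepts 'v', 'a' and 'r'. Here is a minimal sketch of that core API, assuming the WordNet data has already been fetched with nltk.download('wordnet'):

from nltk.stem.wordnet import WordNetLemmatizer

# Minimal sketch; assumes nltk.download('wordnet') has been run already.
lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cars"))          # 'car'     (default POS is 'n')
print(lemmatizer.lemmatize("running"))       # 'running' (treated as a noun)
print(lemmatizer.lemmatize("running", "v"))  # 'run'     (lemmatized as a verb)
print(lemmatizer.lemmatize("better", "a"))   # 'good'    (lemmatized as an adjective)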
Example 1: parseLine
def parseLine(line, stopWords_, wordInd, currWrd):
    """ Removes stop words and punctuation (using re), lemmatizes the
    remaining words with nltk, and returns a list of the valid words in
    the line. currWrd is the index of the next word occurring for the
    first time.
    """
    lineWords = []
    # Hyphens in hyphenated words are removed, e.g. wi-fi ==> wifi.
    line = re.sub('(\w)-(\w)', r'\1\2', line)
    # Replace underscore with space.
    line = re.sub('(\w)_(\w)', r'\1 \2', line)
    # Remove punctuation marks.
    line = re.sub("[',~`@#$%^&*|<>{}[\]\\\/.:;?!\(\)_+\"-]", r'', line)
    wnLmtzr = WordNetLemmatizer()
    for word in line.split():
        # Get index of word from wordInd. If it is seen for the first
        # time, assign an index to the word.
        word = word.lower()  # case of words is ignored
        # Lemmatize word using WordNet.
        word = wnLmtzr.lemmatize(word, 'n')   # with noun
        word1 = wnLmtzr.lemmatize(word, 'v')  # with verb
        if len(word1) < len(word):  # select the smaller of the two
            word = word1
        # Ignore stop words and numbers.
        if word in stopWords_ or \
                re.match('^\d+x?\d*$', word) is not None:
            continue
        # Update wordInd with number of occurrences of word.
        if word not in wordInd:
            wordInd[word] = currWrd[0]
            currWrd[0] += 1
        # Update lineWords with word.
        lineWords.append(word)
    return lineWords
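A hypothetical call might look like the following; it assumes re and WordNetLemmatizer are imported as in the example, and uses NLTK's English stop-word list. wordInd is filled with a word-to-index mapping and currWrd acts as a mutable counter:

from nltk.corpus import stopwords

stopWords_ = set(stopwords.words('english'))
wordInd = {}   # filled in by parseLine: word -> index of first occurrence
currWrd = [0]  # one-element list used as a mutable counter

tokens = parseLine("Wi-Fi routers were failing twice a day.", stopWords_, wordInd, currWrd)
print(tokens)   # lemmatized, lower-cased content words, e.g. ['wifi', 'router', ...]
print(wordInd)  # each kept word mapped to the index at which it was first seen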
Example 2: cleanUp
def cleanUp(rawWords):
    stops = [t.lower() for t in stopwords.words('english')]
    sumarr = []
    for i in range(0, len(rawWords)):
        arr = [t.lower() for t in rawWords[i].split()]
        for word in arr:
            if word not in stops:
                sumarr.append(word.lower())
    punct1 = '.,?/><";![]:@#$%&*()'
    punct2 = "'"
    # for i in range(0, len(sumarr)):
    for i in range(0, 1):
        if r'\\xe2' in sumarr[i]: sumarr[i] = '*'
        if len(sumarr[i]) > 1:
            if sumarr[i][-1] in punct1 or sumarr[i][-1] in punct2: sumarr[i] = sumarr[i][:-1]  # delete punctuation at the end of the word
            if sumarr[i][-1] in punct1 or sumarr[i][-1] in punct2: sumarr[i] = sumarr[i][:-1]  # once more, to delete double punctuation
            if sumarr[i][0] in punct1 or sumarr[i][0] in punct2: sumarr[i] = sumarr[i][1:]  # delete punctuation at the start of the word
        if len(sumarr[i]) > 2:
            # print sumarr
            if sumarr[i][-2] == "'" and sumarr[i][-1] == 's': sumarr[i] = sumarr[i][:-2]  # so that Jim's --> Jim
            if sumarr[i][-2] == "'" and sumarr[i][-1] == 'm': sumarr[i] = sumarr[i][:-2]  # so that I'm --> I
        if len(sumarr[i]) > 3:
            if sumarr[i][-3] == 'n' and sumarr[i][-2] == "'" and sumarr[i][-1] == 't': sumarr[i] = sumarr[i][:-3]  # so that isn't --> is; "not" is a stop word
    lmtzr = WordNetLemmatizer()
    return [lmtzr.lemmatize(t) for t in sumarr if ("'" not in t and t not in stops)]
Example 3: parseLyrics2
def parseLyrics2(outlist):
    bandLyricInfo = {}
    master = [['death', 0], ['violence', 0], ['sacrifice', 0], ['nature', 0], ['peace', 0],
              ['storm', 0], ['spirit', 0], ['dark', 0], ['scream', 0], ['pain', 0],
              ['blood', 0], ['flesh', 0], ['love', 0], ['greed', 0], ['poison', 0],
              ['anger', 0], ['revenge', 0], ['misery', 0], ['hell', 0], ['heaven', 0],
              ['hate', 0], ['soul', 0], ['battle', 0], ['ghost', 0], ['joy', 0],
              ['light', 0], ['omen', 0], ['miracle', 0], ['magic', 0], ['universe', 0],
              ['disease', 0], ['god', 0], ['satan', 0], ['struggle', 0], ['heart', 0]]
    for key in outlist:
        templist = copy.deepcopy(master)
        #key = 'Queensryche'
        raw = outlist[key]
        raw = raw.lower()
        words = re.findall(r'\w+', raw, flags=re.UNICODE | re.LOCALE)  # strip punctuation
        imp_words = filter(lambda x: x not in stopwords.words('english'), words)  # filter noise
        lmt = WordNetLemmatizer()
        words_new = [lmt.lemmatize(x) for x in words]
        dw = list(set(words_new))
        for word in dw:
            for m in templist:
                p1 = wordnet.synsets(word)
                p2 = wordnet.synsets(m[0])
                if len(p1) > 0 and len(p2) > 0:
                    c = p1[0].wup_similarity(p2[0])
                    if c > m[1]:
                        m[1] = c
        # sort words according to similarity
        tnew = sorted(templist, key=lambda val: val[1], reverse=True)[0:10]
        # remove the other column
        for l in tnew:
            del l[1]
        print 'Done ', key
        #break
        bandLyricInfo[key] = tnew
        #del templist
    return bandLyricInfo
Example 4: MakeLemmaList
def MakeLemmaList(tagged):
    # n - noun
    # v - verb
    # a - adjective
    # r - adverb
    # m, w, ... - something else
    noun_op, adj_op, adv_op, verb_op, other_op = [], [], [], [], []
    lm = WordNetLemmatizer()
    for i in tagged:
        # print i, i[0], i[1][0:2]
        if cmp(i[1][0:1], "N") == 0:
            noun_op.append(lm.lemmatize(i[0], "n"))
        elif cmp(i[1][0:1], "V") == 0:
            asd = lm.lemmatize(i[0], "v")
            if asd != "be" and asd != "have" and asd != "do" and asd != "done" and asd != "should":
                verb_op.append(asd)
        elif cmp(i[1][0:1], "J") == 0:
            adj_op.append(lm.lemmatize(i[0], "a"))
        elif cmp(i[1][0:1], "R") == 0:
            adv_op.append(lm.lemmatize(i[0], "r"))
        else:
            # print lm.lemmatize(i[0]) + " "
            pass
    final_op = noun_op + verb_op + other_op + adj_op + adv_op
    return final_op
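MakeLemmaList expects tagged to be a list of (token, Penn Treebank tag) pairs, such as the output of nltk.pos_tag. A hypothetical call, assuming nltk and its tokenizer/tagger data are installed:

import nltk

tagged = nltk.pos_tag(nltk.word_tokenize("The striped bats were hanging on their feet"))
# tagged is a list of (word, tag) pairs, e.g. [('The', 'DT'), ('bats', 'NNS'), ...]

print(MakeLemmaList(tagged))  # lemmas grouped as nouns, then verbs, adjectives, adverbs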
Example 5: convertToVec
def convertToVec(self, line):
    lmtzr = WordNetLemmatizer()
    if isinstance(line, unicode):
        line = str(unicodedata.normalize('NFKD', line).encode('ascii', 'ignore'))
    # Strip special characters.
    line = re.sub(r'[^a-z^A-Z^0-9^,^.]|\^', ' ', line)
    line = line.lower()
    wordcount = {}
    count = self.Dic.count
    for word in line.split(' '):
        word = lmtzr.lemmatize(word)
        if isinstance(word, unicode):
            word = str(unicodedata.normalize('NFKD', word).encode('ascii', 'ignore'))
        if word in self.Dic.words.keys():
            num = self.Dic.words[word]
        else:
            num = count
            count += 1
        if num not in wordcount.keys():
            wordcount[num] = 1
        else:
            wordcount[num] = wordcount[num] + 1
    vec = []
    for key in wordcount.keys():
        tp = (key, wordcount[key] + 0.0)
        vec.append(tp)
    return vec
Example 6: get_cooc
def get_cooc(chunk_trees, stoplist=True):
    triples, simple_trees = [], []
    lmtzr = WordNetLemmatizer()
    for t in chunk_trees:
        entities = []
        for chunk in t[:]:
            if isinstance(chunk, Tree) and chunk.node == 'NP':
                # getting a tree for later processing of triples from the simple noun
                # phrases (if present)
                simple_trees.append(parser_smp.parse(chunk.leaves()))
                words = []
                for word, tag in chunk[:]:
                    # stem/discard elements and construct an argument
                    if (stoplist and word in STOPLIST) or \
                            (len([x for x in word if x.isalnum()]) == 0):
                        # do not process stopwords for simple trees, do not process purely
                        # non-alphanumeric characters
                        continue
                    if tag.startswith('N'):
                        words.append(lmtzr.lemmatize(word, 'n'))
                    elif tag.startswith('J'):
                        words.append(lmtzr.lemmatize(word, 'a'))
                    else:
                        words.append(word)
                if len(words) > 0:
                    entities.append(SEP.join(words))
        for e1, e2 in combinations(entities, 2):
            triples.append((e1, util.COOC_RELNAME, e2))
            triples.append((e2, util.COOC_RELNAME, e1))
    return triples, simple_trees
Example 7: wordLemmatization
def wordLemmatization(self):
    # should be working now
    lemmatizer = WordNetLemmatizer()
    lemmatization_result = []
    for word in self.file:
        lemmatization_result.append(lemmatizer.lemmatize(word))
    self.file = lemmatization_result
Example 8: __tokenize
def __tokenize(self, text):
    """ function: tokenize
        ------------------
        generate list of tokens given a block of @text
        :param text: string representing article text field
        :returns: list of tokens with various modifications
    """
    ascii = text.encode('ascii', 'ignore')
    # remove digits & punctuation
    no_digits = ascii.translate(None, string.digits)
    no_punctuation = no_digits.translate(None, string.punctuation)
    # separate text blocks into tokens
    tokens = nltk.word_tokenize(no_punctuation)
    # remove class labels, stopwords, and non-english words
    no_class_labels = [w for w in tokens if w not in Document.banned_words]
    no_stop_words = [w for w in no_class_labels if w not in stopwords.words('english')]
    eng = [y for y in no_stop_words if wordnet.synsets(y)]
    # lemmatization
    lemmas = []
    lmtzr = WordNetLemmatizer()
    for token in eng:
        lemmas.append(lmtzr.lemmatize(token))
    # stemming
    stems = []
    stemmer = PorterStemmer()
    for token in lemmas:
        stem = stemmer.stem(token).encode('ascii', 'ignore')
        if len(stem) >= 4:
            stems.append(stem)
    return stems
Example 9: main
def main():
    rake = RAKE.Rake('SmartStoplist.txt')
    fp = open(input_file, 'r')
    text = fp.read()
    text = text_clean(text)
    wnl = WordNetLemmatizer()
    text = ' '.join([wnl.lemmatize(i.strip()) for i in nltk.word_tokenize(text)])
    keywords = rake.run(text)
    #print keywords
    #key_list = list()
    with open(key_score_file, 'wb') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['KEYWORD', 'SCORE'])
        for row in keywords:
            #csv_out.writerow(row)
            if row[1] > 0:
                csv_out.writerow(row)
    unibitrigram_list = []
    unibitrigram_list = generate_unibitrigrams(key_score_file)
    ngram_freq = Counter(unibitrigram_list)
    sorted_ngram_freq = sorted(ngram_freq.items(), key=lambda x: x[1], reverse=True)
    print ngram_freq
    with open('bcom_ngramfr.csv', 'wb') as nf_csv:
        csv_wr = csv.writer(nf_csv)
        for item in sorted_ngram_freq:
            if item[0] != '' or item[1] > 0:
                csv_wr.writerow(item)
Example 10: clean_single_word
def clean_single_word(word, lemmatizing="wordnet"):
    """
    Performs stemming or lemmatizing on a single word.

    If we are to search for a word in a clean bag-of-words, we need to search it after the same kind of preprocessing.

    Inputs:  - word: A string containing the source word.
             - lemmatizing: A string containing one of the following: "porter", "snowball" or "wordnet".

    Output:  - lemma: The resulting clean lemma or stem.
    """
    if lemmatizing == "porter":
        porter = PorterStemmer()
        lemma = porter.stem(word)
    elif lemmatizing == "snowball":
        snowball = SnowballStemmer('english')
        lemma = snowball.stem(word)
    elif lemmatizing == "wordnet":
        wordnet = WordNetLemmatizer()
        lemma = wordnet.lemmatize(word)
    else:
        print("Invalid lemmatizer argument.")
        raise RuntimeError
    return lemma
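A quick usage sketch, assuming PorterStemmer, SnowballStemmer and WordNetLemmatizer have been imported from nltk.stem as the example requires:

print(clean_single_word("studies"))                          # 'study' - WordNet lemma (default)
print(clean_single_word("studies", lemmatizing="porter"))    # 'studi' - Porter stem
print(clean_single_word("studies", lemmatizing="snowball"))  # 'studi' - Snowball stem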
Example 11: data_preprocessing
def data_preprocessing(file_path):
    f = open(file_path, 'r')
    speech_list = f.read().split("###")  # read speeches, split on ###, and save them into a list
    del speech_list[-1]
    f.close()
    #print len(speech_list)
    f = open(file_path, 'r')
    speeches = f.read().lower()  # set all letters lower case
    speeches = re.sub('http://[a-zA-Z0-9|/|.]*', ' ', speeches)
    speeches = re.sub('%[0-9|.]*', ' ', speeches)
    speeches = re.sub('$[0-9|.]*', ' ', speeches)
    #speeches = re.sub('\\\\xe2\\\\x80\\\\x[a-zA-Z0-9]*', ' ', speeches)
    #print speeches
    for ch in " \"$!'@#%&()*+,-./:;<=>?[\\]^_`{|}~ ":
        speeches = speeches.replace(ch, ' ')
    tokens = speeches.split()
    # word lemmatization
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(token) for token in tokens]
    tokens = [lmtzr.lemmatize(token, 'v') for token in tokens]
    #tokens = bigrams(tokens)  # uncomment this line to use bigrams instead
    total_tokens_count = len(tokens)
    unique_tokens_dict = collections.Counter(tokens)  # key is word, value is the count,
                                                      # with a default value of 0 for a non-existent key
    result = [speech_list, unique_tokens_dict, total_tokens_count]
    return result
Example 12: stemWordMatch
def stemWordMatch(question, sentence):
    lmtzr = WordNetLemmatizer()
    question_tokens = set(nltk.word_tokenize(question))
    sentence_tokens = set(nltk.word_tokenize(sentence))
    count = 0

    '''for i in sentence_tokens:
        # Finding the exact word match
        if lmtzr.lemmatize(i, 'v').lower() in [lmtzr.lemmatize(x, 'v').lower() for x in question_tokens]:
            #print 'matching word is:', i
            count = count + 6
        elif i.lower() in [x.lower() for x in question_tokens]:
            print 'i is :', i
            count = count + 3
    #print 'Exact word match count is :', count'''

    for i in sentence_tokens:
        # Finding the exact word match
        if i.lower() in [x.lower() for x in question_tokens]:
            #print 'i is :', i
            count = count + 3
        elif lmtzr.lemmatize(i, 'v').lower() in [lmtzr.lemmatize(x, 'v').lower() for x in question_tokens]:
            #print 'matching word is:', i
            count = count + 6
    #print 'Exact word match count is :', count
    return count
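Each sentence token that matches a question token exactly (case-insensitively) adds 3 to the score, while a token that only matches after verb lemmatization adds 6. A hypothetical call:

question = "Who wrote the letters?"
sentence = "The letters were written by a monk."
print(stemWordMatch(question, sentence))
# 'the' and 'letters' match exactly (+3 each); 'written' matches 'wrote'
# only via the verb lemma 'write' (+6), so the score here should be 12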
Example 13: weed_out_lexelts
def weed_out_lexelts(tweets_file):
    lexelts = []
    WNL = WordNetLemmatizer()
    with open(tweets_file, 'r') as twh:
        for line in twh:
            line = line.strip().split(' :: ')[1]
            lexelts_temp = []
            try:
                lexelts_temp = pos_tag(word_tokenize(line))
            except TypeError:
                print line
            # Get sanitized parts of speech, not the Treebank style.
            # Tuples are immutable, need to make a new single-tuple list.
            for w, p in lexelts_temp:
                new_p = get_sanitized_pos(p)
                new_w = w
                try:
                    new_w = WNL.lemmatize(w, new_p)
                except KeyError:
                    pass
                lexelts.extend([(new_w, new_p)])
    lexelts = list(set(lexelts))
    print lexelts
    return lexelts
Example 14: run
def run(self):
    """
    How do I run this Task?
    Luigi will call this method if the Task needs to be run.
    """
    # remove stop words and punctuation
    stop = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    wordnet = WordNetLemmatizer()
    docs = []
    #ipdb.set_trace()
    for f in self.input():  # The input() method is a wrapper around requires() that returns Target objects
        lines = 0
        words = []
        for line in f.open('r'):
            if lines == 0:
                label = line
                lines += 1
            else:
                words.extend(tokenizer.tokenize(line))
                lines += 1
        words_filtered = filtered_words = [wordnet.lemmatize(w) for w in words if w not in stopwords.words('english')]
        docs.append((label, '\t'.join(words)))
    out = self.output().open('w')
    for label, tokens in docs:
        out.write("%s,%s\n" % (label.strip(), tokens.strip()))
    out.close()
Example 15: lemma_tokenize
def lemma_tokenize(paragraph):
    lmtzr = WordNetLemmatizer()
    try:
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
    except LookupError:
        nltk.download('wordnet')
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
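lemma_tokenize relies on a tokenize() helper that is not shown in this example; presumably it yields one list of word tokens per sentence. A minimal, hypothetical stand-in built on NLTK's own tokenizers:

import nltk

def tokenize(paragraph):
    # Hypothetical helper: one list of word tokens per sentence.
    return [nltk.word_tokenize(sentence)
            for sentence in nltk.sent_tokenize(paragraph)]

print(lemma_tokenize("The cats were chasing mice. Dogs barked."))
# lower-cased noun lemmas, e.g. ['the', 'cat', ...]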