This article collects typical usage examples of the Python method nltk.stem.wordnet.WordNetLemmatizer.lemmatize. If you have been wondering what WordNetLemmatizer.lemmatize does, how to call it, or what it looks like in practice, the curated code examples below may help. You can also explore further usage examples of its containing class, nltk.stem.wordnet.WordNetLemmatizer.
Fifteen code examples of WordNetLemmatizer.lemmatize are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
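Before the examples, here is a minimal usage sketch (the sample words are illustrative): lemmatize(word, pos) returns the WordNet base form of word for the given part of speech ('n', 'v', 'a', 'r' or 's'); when pos is omitted it defaults to 'n'. The WordNet corpus must be available locally (nltk.download('wordnet')).

from nltk.stem.wordnet import WordNetLemmatizer

lmtzr = WordNetLemmatizer()
print(lmtzr.lemmatize('dogs'))          # 'dog'   (default POS is noun)
print(lmtzr.lemmatize('running', 'v'))  # 'run'
print(lmtzr.lemmatize('better', 'a'))   # 'good'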
Example 1: get_cooc
# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# Or: from nltk.stem.wordnet.WordNetLemmatizer import lemmatize [as alias]
def get_cooc(chunk_trees, stoplist=True):
    triples, simple_trees = [], []
    lmtzr = WordNetLemmatizer()
    for t in chunk_trees:
        entities = []
        for chunk in t[:]:
            # chunk.node is the pre-NLTK-3 API; in NLTK 3 this is chunk.label()
            if isinstance(chunk, Tree) and chunk.node == 'NP':
                # getting a tree for later processing of triples from the simple noun
                # phrases (if present)
                simple_trees.append(parser_smp.parse(chunk.leaves()))
                words = []
                for word, tag in chunk[:]:
                    # stem/discard elements and construct an argument
                    if (stoplist and word in STOPLIST) or \
                       (len([x for x in word if x.isalnum()]) == 0):
                        # do not process stopwords for simple trees, do not process purely
                        # non alphanumeric characters
                        continue
                    if tag.startswith('N'):
                        words.append(lmtzr.lemmatize(word, 'n'))
                    elif tag.startswith('J'):
                        words.append(lmtzr.lemmatize(word, 'a'))
                    else:
                        words.append(word)
                if len(words) > 0:
                    entities.append(SEP.join(words))
        for e1, e2 in combinations(entities, 2):
            triples.append((e1, util.COOC_RELNAME, e2))
            triples.append((e2, util.COOC_RELNAME, e1))
    return triples, simple_trees
Example 2: MakeLemmaList
# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# Or: from nltk.stem.wordnet.WordNetLemmatizer import lemmatize [as alias]
def MakeLemmaList(tagged):
    # n noun
    # v verb
    # a adjective
    # r adverb
    # m, w, ... something else
    noun_op, adj_op, adv_op, verb_op, other_op = [], [], [], [], []
    lm = WordNetLemmatizer()
    for i in tagged:
        # print i, i[0], i[1][0:2]
        if cmp(i[1][0:1], "N") == 0:
            noun_op.append(lm.lemmatize(i[0], "n"))
        elif cmp(i[1][0:1], "V") == 0:
            asd = lm.lemmatize(i[0], "v")
            if asd != "be" and asd != "have" and asd != "do" and asd != "done" and asd != "should":
                verb_op.append(asd)
        elif cmp(i[1][0:1], "J") == 0:
            adj_op.append(lm.lemmatize(i[0], "a"))
        elif cmp(i[1][0:1], "R") == 0:
            adv_op.append(lm.lemmatize(i[0], "r"))
        else:
            # print lm.lemmatize(i[0]) + " "
            pass
    final_op = noun_op + verb_op + other_op + adj_op + adv_op
    return final_op
Example 3: decompose
# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# Or: from nltk.stem.wordnet.WordNetLemmatizer import lemmatize [as alias]
def decompose(text, keepOriginal):
    if text:
        # Case-folding
        text = text.lower();
        # Expand all contractions like "isn't" to "is not"
        text = expandContractions(text);
        # Remove punctuation
        regex = re.compile('[%s]' % re.escape(string.punctuation))
        text = regex.sub('', text)
        # Remove stop words (just add words to the list you think also have to be removed)
        stopWords = ['the','this','that','those','these','to','as','there','has','and','or',
                     'is','not','a','an','of','but','in','by','on','are','it','if'];
        words = text.split();
        text = ' '.join([i for i in words if i not in stopWords]);
        # Lemmatization
        lemmatizer = WordNetLemmatizer();
        words = text.split();
        if keepOriginal:
            text = ' '.join([i + " " + lemmatizer.lemmatize(i) for i in words]);
        else:
            text = ' '.join([lemmatizer.lemmatize(i) for i in words]);
        # Remove duplicate words
        text = ' '.join(OrderedDict((word, word) for word in text.split()).keys());
    return text
Example 4: stemming
# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# Or: from nltk.stem.wordnet.WordNetLemmatizer import lemmatize [as alias]
def stemming():
    lmtzr = WordNetLemmatizer()
    with open('date_gone.out', 'rb') as fin:
        with open('stemmed.out', 'w') as fout:
            i = 0
            for line in fin:
                #i += 1
                new_data = []
                row = line.split('\t')
                #print(i)
                l = len(row)
                if l > 5:
                    data = row[5]
                    words = data.split(' ')
                    for word in words:
                        new_word = lmtzr.lemmatize(word)
                        new_data.append(new_word)
                    row[5] = ' '.join(new_data)
                if l > 6:
                    data = row[6]
                    words = data.split(' ')
                    for word in words:
                        new_word = lmtzr.lemmatize(word)
                        new_data.append(new_word)
                    # note: new_data still holds the lemmas of column 5 at this point,
                    # so they are written into column 6 as well
                    row[6] = ' '.join(new_data)
                fout.write('\t'.join(row))
Example 5: firstDef
# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# Or: from nltk.stem.wordnet.WordNetLemmatizer import lemmatize [as alias]
def firstDef(mwe, definition):
    # this is the approach of using only the first definition
    if definition == '':
        return([1, 1])
    definition = definition.split('\n')[0]
    definition = definition.replace(mwe, '')
    definition = definition.replace('(', '')
    definition = definition.replace(')', '')
    tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
    defineArr = tokenizer.tokenize(definition)
    lmtzr = WordNetLemmatizer()
    for i in range(0, len(defineArr)):
        defineArr[i] = lmtzr.lemmatize(defineArr[i])
    words = mwe.split()
    for i in range(0, len(words)):
        words[i] = lmtzr.lemmatize(words[i])
    if words[0] in defineArr and words[1] in defineArr:
        return([1, 1])
    elif words[0] in defineArr:
        return([1, 0])
    elif words[1] in defineArr:
        return([0, 1])
    else:
        return([0, 0])
Example 6: convert_speeches_into_matrix
# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# Or: from nltk.stem.wordnet.WordNetLemmatizer import lemmatize [as alias]
def convert_speeches_into_matrix(features, speech_list, label):
    sample_matrix = []
    label_vector = []
    #print len(features)
    for speech in speech_list:
        sample = []
        speech = re.sub('http://[a-zA-Z0-9|/|.]*', ' ', speech)
        speech = re.sub('%[0-9|.]*', ' ', speech)
        speech = re.sub('$[0-9|.]*', ' ', speech)
        for ch in " \"$!'@#%&()*+,-./:;<=>?[\\]^_`{|}~ ":
            speech = speech.replace(ch, ' ')
        tokens = speech.split()
        # word lemmatization
        lmtzr = WordNetLemmatizer()
        tokens = [lmtzr.lemmatize(token) for token in tokens]
        tokens = [lmtzr.lemmatize(token, 'v') for token in tokens]
        #tokens = bigrams(tokens) # uncomment this line to use bigrams as features
        unique_tokens_dict = collections.Counter(tokens)
        for fea in features:
            if fea in unique_tokens_dict:
                sample.append(unique_tokens_dict[fea])
            else:
                sample.append(0)
        #print(sample)
        sample_matrix.append(sample)
        label_vector.append(label)
    return sample_matrix, label_vector
Example 7: data_preprocessing
# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# Or: from nltk.stem.wordnet.WordNetLemmatizer import lemmatize [as alias]
def data_preprocessing(file_path):
    f = open(file_path, 'r')
    speech_list = f.read().split("###")  # read speeches, split on ###, and save them into a list
    del speech_list[-1]
    f.close()
    #print len(speech_list)
    f = open(file_path, 'r')
    speeches = f.read().lower()  # set all letters lower case
    speeches = re.sub('http://[a-zA-Z0-9|/|.]*', ' ', speeches)
    speeches = re.sub('%[0-9|.]*', ' ', speeches)
    speeches = re.sub('$[0-9|.]*', ' ', speeches)
    #speeches = re.sub('\\\\xe2\\\\x80\\\\x[a-zA-Z0-9]*',' ',speeches)
    #print speeches
    for ch in " \"$!'@#%&()*+,-./:;<=>?[\\]^_`{|}~ ":
        speeches = speeches.replace(ch, ' ')
    tokens = speeches.split()
    # word lemmatization
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(token) for token in tokens]
    tokens = [lmtzr.lemmatize(token, 'v') for token in tokens]
    #tokens = bigrams(tokens) # uncomment this line to use bigrams as features
    total_tokens_count = len(tokens)
    unique_tokens_dict = collections.Counter(tokens)  # key is word, value is the count;
                                                      # also returns default value 0 for non-existent keys
    result = [speech_list, unique_tokens_dict, total_tokens_count]
    return result
Example 8: getpurpose
# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# Or: from nltk.stem.wordnet.WordNetLemmatizer import lemmatize [as alias]
def getpurpose(matched, classname):
    lmtzr = WordNetLemmatizer()
    if classname == 'class4' or classname == 'class6' or classname == 'class3':
        exp = '\w*?ing NN\w*?'
        match = re.search(exp, matched)
        purpose_text = match.group().split()
        purpose = lmtzr.lemmatize(purpose_text[0], 'v')
        return purpose
    if classname == 'class2':
        exp = '\w*? VB\w*?'
        match = re.search(exp, matched)
        purpose_text = match.group().split()
        purpose = lmtzr.lemmatize(purpose_text[0], 'v')
        return purpose
    if classname == 'class5' or classname == 'class7':
        exp = 'for IN \w*? NN\w*?'
        match = re.search(exp, matched)
        purpose_text = match.group().split()
        purpose = lmtzr.lemmatize(purpose_text[2], 'v')
        return purpose
    if classname == 'class1' or classname == 'class9':
        exp = '\w*? IN \w*? VBG'
        match = re.search(exp, matched)
        if match:
            purpose_text = match.group().split()
            purpose = lmtzr.lemmatize(purpose_text[2], 'v')
            return purpose
    if classname == 'class1':
        exp = '\w*? TO \w*? VB\w*? \w*? NN\w*?'
        match = re.search(exp, matched)
        if match:
            purpose_text = match.group().split()
            purpose = lmtzr.lemmatize(purpose_text[2], 'v')
            return purpose
    return None
Example 9: parseLine
# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# Or: from nltk.stem.wordnet.WordNetLemmatizer import lemmatize [as alias]
def parseLine(line, stopWords_, wordInd, currWrd):
    """ Removes stop words and lemmas using nltk and punctuations
    using re. Returns a list with valid words in the line. currWrd is
    the index of next word occurring for the first time
    """
    lineWords = []
    # Hyphens in hyphenated words are removed e.g. wi-fi ==> wifi.
    line = re.sub('(\w)-(\w)', r'\1\2', line)
    # replace underscore with space
    line = re.sub('(\w)_(\w)', r'\1 \2', line)
    # Remove punctuation marks.
    line = re.sub("[',~`@#$%^&*|<>{}[\]\\\/.:;?!\(\)_+\"-]", r'', line)
    wnLmtzr = WordNetLemmatizer()
    for word in line.split():
        # Get index of word from wordInd. If it is seen for the first
        # time assign an index to the word.
        word = word.lower()  # case of words is ignored
        # Lemmatize word using WordNet
        word = wnLmtzr.lemmatize(word, 'n')   # as noun
        word1 = wnLmtzr.lemmatize(word, 'v')  # as verb
        if len(word1) < len(word):  # select the shorter of the two
            word = word1
        # Ignore stop words and numbers.
        if word in stopWords_ or \
           re.match('^\d+x?\d*$', word) is not None:
            continue
        # Update wordInd with number of occurrences of word.
        if word not in wordInd:
            wordInd[word] = currWrd[0]
            currWrd[0] += 1
        # Update lineWords with word.
        lineWords.append(word)
    return lineWords
Example 10: stemWordMatch
# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# Or: from nltk.stem.wordnet.WordNetLemmatizer import lemmatize [as alias]
def stemWordMatch(question, sentence):
    lmtzr = WordNetLemmatizer()
    question_tokens = set(nltk.word_tokenize(question))
    sentence_tokens = set(nltk.word_tokenize(sentence))
    count = 0
    '''for i in sentence_tokens:
        # Finding the exact word match
        if lmtzr.lemmatize(i, 'v').lower() in [lmtzr.lemmatize(x, 'v').lower() for x in question_tokens]:
            #print 'matching word is:', i
            count = count + 6
        elif i.lower() in [x.lower() for x in question_tokens]:
            print 'i is :', i
            count = count + 3
    #print 'Exact word match count is :', count'''
    for i in sentence_tokens:
        # Finding the exact word match
        if i.lower() in [x.lower() for x in question_tokens]:
            #print 'i is :', i
            count = count + 3
        elif lmtzr.lemmatize(i, 'v').lower() in [lmtzr.lemmatize(x, 'v').lower() for x in question_tokens]:
            #print 'matching word is:', i
            count = count + 6
    #print 'Exact word match count is :', count
    return count
Example 11: getlemmas
# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# Or: from nltk.stem.wordnet.WordNetLemmatizer import lemmatize [as alias]
def getlemmas(tokens):
    lemmas = []
    for token in tokens:
        if len(token) < 2 or not isWord(token) or token == "the":
            lemmas.append({})
            continue
        tokenLemmas = {}
        # Synonyms
        for syn in wn.synsets(token):
            # Derived forms and their synonyms
            for lemma in syn.lemmas():
                for df in lemma.derivationally_related_forms():
                    for ln in df.synset().lemma_names():
                        tokenLemmas[ln] = 4
                    tokenLemmas[df.name()] = 3
            for lname in syn.lemma_names():
                tokenLemmas[lname] = 2
        # WordNet lemmas
        l = WordNetLemmatizer()
        for x in ('v', 'a', 's', 'r', 'n'):
            tmp = l.lemmatize(token, x)
            tokenLemmas[tmp] = 1
            tmp = l.lemmatize(tmp, x)
            tokenLemmas[tmp] = 1
        # Exact
        tokenLemmas[token] = 1
        lemmas.append(tokenLemmas)
    return lemmas
Example 12: get_dante_answers
# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# Or: from nltk.stem.wordnet.WordNetLemmatizer import lemmatize [as alias]
def get_dante_answers(senseval_data):
    # TODO: implement probability based inference of accuracy, i.e. POS adds prob, colloc adds prob, phrase adds prob
    #   - must find values for probs first. for colloc - adjacency affects it. for phrase - order affects it
    #   Or, just test adjacency, presence of colloc and phrase words in the sentence (test both lemmatized and not)
    #   Methods: Set arbitrary values and adjust manually
    #            Use a learning algorithm to find the best mix of values
    DanteAPI.initialize()
    dante = DanteAPI.get_all_word_meanings()
    print "\nDANTE parsing completed"
    dante_answers = {}
    lemmatizer = WordNetLemmatizer()
    for sentence_data in senseval_data:
        for phrase in sentence_data["test_phrases"]:
            word_id, raw_word = phrase["headword"]
            word = lemmatizer.lemmatize(raw_word)
            phrase_meaning = _answer_phrase(word, sentence_data, dante)
            if phrase_meaning is not None:
                dante_answers[word_id] = phrase_meaning
            else:
                dante_answers[word_id] = _answer_word(word, sentence_data, dante)
        for word_id, raw_word in sentence_data["test_words"].iteritems():
            word = lemmatizer.lemmatize(raw_word)
            dante_answers[word_id] = _answer_word(word, sentence_data, dante)
    return dante_answers
Example 13: extract_cooking_methods
# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# Or: from nltk.stem.wordnet.WordNetLemmatizer import lemmatize [as alias]
def extract_cooking_methods(input_steps, title):
    steps = copy.deepcopy(input_steps)
    steps.append(title)
    tk_steps = [pos_tag(word_tokenize(w.lower())) for w in steps]
    methods = []
    for step in tk_steps:
        # methods += [wordnet_lemmatizer.lemmatize(w, pos='v').encode('ascii', 'ignore') for (w, pos) in step if 'VB' in pos]
        methods += [w.encode('ascii', 'ignore') for (w, pos) in step if 'VB' in pos]
    for step in steps:
        if 'preheat' in step:
            methods += ['preheat', 'preheating']
        if 'microwav' in step:
            methods += ['microwave', 'microwaving']
        if 'place' in step:
            methods.append('place')
        if 'form' in step:
            methods.append('form')
        if 'sprinkle' in step:
            methods.append('sprinkle')
    wordnet_lemmatizer = WordNetLemmatizer()
    discard = ['be', 'use', 'need', 'should', 'allow', 'pink', 'turn', 'reserve']
    methods = [m for m in methods if wordnet_lemmatizer.lemmatize(m, pos='v') not in discard and len(m) > 2]
    stems = [wordnet_lemmatizer.lemmatize(w, pos='v') for w in methods]
    gerunds = [w[:-1] + 'ing' for w in stems if w[-1] == 'e']
    gerunds += [w + 'ing' for w in stems if w[-1] != 'e']
    methods = list(set(methods + stems + gerunds))
    return methods
Example 14: LexicalBigramUnigramAnalyzer
# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# Or: from nltk.stem.wordnet.WordNetLemmatizer import lemmatize [as alias]
class LexicalBigramUnigramAnalyzer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.tb = Blobber(pos_tagger=PerceptronTagger())
        self.sentencer = SentenceTokenizer()

    def __call__(self, doc):
        tokens = []
        for sent in self.sentencer.tokenize(doc.decode('ascii', 'ignore')):
            tagged = self.tb(sent.lower()).tags
            tagged = [(t[0], penn_to_wn(t[1])) for t in tagged]
            tagged = [(t[0], t[1]) for t in tagged if t[0] not in stopwords.words('english')]
            ng = zip(tagged, tagged[1:])
            rule1 = [(t[0], t[1]) for t in ng if t[0][1] == wn.ADJ and t[1][1] == wn.NOUN]
            rule2 = [(t[0], t[1]) for t in ng if (t[0][1] == wn.ADV and t[1][1] == wn.VERB) or (t[0][1] == wn.VERB and t[1][1] == wn.ADV)]
            rule3 = [(t[0], t[1]) for t in ng if t[0][1] == wn.VERB and t[1][1] == wn.VERB]
            rule4 = [(t[0], t[1]) for t in ng if t[0][1] == wn.NOUN and t[1][1] == wn.NOUN]
            filtered_list = rule1 + rule2 + rule3 + rule4
            # Lemmatize
            filtered_bigrams = [self.lemmatizer.lemmatize(t[0][0], t[0][1]) + ' ' + self.lemmatizer.lemmatize(t[1][0], t[1][1]) for t in filtered_list]
            filtered_unigrams = [self.lemmatizer.lemmatize(w[0], w[1]) for w in tagged]
            for bigram in filtered_bigrams:
                tokens.append(bigram)
            for unigram in filtered_unigrams:
                tokens.append(unigram)
        return tokens
Example 15: single_master_list
# Required import: from nltk.stem.wordnet import WordNetLemmatizer [as alias]
# Or: from nltk.stem.wordnet.WordNetLemmatizer import lemmatize [as alias]
def single_master_list(data):
    my_vocab = deepcopy(init_to_zero_vocab)
    data = data.lower()
    # NOTE: the pattern below was mangled by the source page's e-mail obfuscation;
    # the original was presumably an e-mail-matching regex
    data = re.sub("\[email protected]\S", " EMAILREPLACED ", data)
    data = re.sub("\d+", " NUMBERREPLACED ", data)
    data = re.sub("\s?http:s?\/\/\w{0,3}\.\w+\.\w{0,3}\S?|w{0,3}\.\w+\.\w{0,3}\S?", " URLREPLACED ", data)
    for punct in string.punctuation:
        data = data.replace(punct, " ")
    format_data = data.split()
    no_stop_words = []
    l = WordNetLemmatizer()
    for word in format_data:
        if (stop):
            if word not in stopwords.words('english'):
                if (lem):
                    no_stop_words.append(l.lemmatize(word))
                else:
                    no_stop_words.append(word)
        else:
            if (lem):
                no_stop_words.append(l.lemmatize(word))
            else:
                no_stop_words.append(word)
    for element in no_stop_words:
        if (element in my_vocab):
            my_vocab[element] += 1
    return my_vocab