This article collects typical usage examples of the Python method nltk.WordNetLemmatizer.lemmatize. If you have been wondering how to use WordNetLemmatizer.lemmatize in Python, or what real code that calls it looks like, the curated examples below should help. You can also read further about the class it belongs to, nltk.WordNetLemmatizer.
Below are 15 code examples of WordNetLemmatizer.lemmatize, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python examples.
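As a quick reference before the examples, here is a minimal sketch of the method itself (it assumes the WordNet data has been fetched, e.g. via nltk.download('wordnet')). lemmatize(word, pos='n') returns the WordNet lemma for the given part of speech and defaults to treating the word as a noun:

from nltk import WordNetLemmatizer

wnl = WordNetLemmatizer()
print(wnl.lemmatize('dogs'))              # 'dog'  (default pos is 'n', i.e. noun)
print(wnl.lemmatize('running', pos='v'))  # 'run'
print(wnl.lemmatize('better', pos='a'))   # 'good'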
Example 1: __init__
# Required import: from nltk import WordNetLemmatizer [as alias]
# Or: from nltk.WordNetLemmatizer import lemmatize [as alias]
def __init__(self, text, product_name):
    self.candidate_features = []
    self.feature_sentences = []
    self.product_name = product_name.lower().split('-')[0].split('_')
    t = Tokenizer()
    sents = t.sent_tokenize(text.lower())
    p = POSTagger()
    wnl = WordNetLemmatizer()
    for sent in sents:
        tagged_sent = p.nltk_tag(t.word_tokenize(sent))
        feature_sent = {}
        feature_sent['sentence'] = sent
        feature_sent['tags'] = tagged_sent
        feature_sent['nouns'] = []
        feature_sent['noun_phrases'] = []
        for i in range(0, len(tagged_sent)):
            (word, tag) = tagged_sent[i]
            # Don't include proper nouns
            if tag.startswith('N') and tag != 'NNP':
                """
                Consecutive nouns might form a feature phrase. E.g. "picture quality" is a phrase.
                Meaningless phrases like 'quality digital' are removed later as their frequency of occurrence is low.
                """
                if i > 0 and len(feature_sent['nouns']) > 0 and tagged_sent[i - 1][0] == feature_sent['nouns'][-1] and feature_sent['sentence'].find(feature_sent['nouns'][-1] + ' ' + word) > -1:
                    feature_sent['noun_phrases'].append(wnl.lemmatize(feature_sent['nouns'].pop() + ' ' + word))
                else:
                    feature_sent['nouns'].append(wnl.lemmatize(word))
        self.feature_sentences.append(feature_sent)
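To see what the consecutive-noun merging above produces, here is a small self-contained sketch of just that inner loop, using a hand-tagged sentence in place of the custom Tokenizer and POSTagger classes (the hand-tagged input is an assumption, not part of the original snippet):

from nltk import WordNetLemmatizer

wnl = WordNetLemmatizer()
sentence = 'picture quality is great'
tagged_sent = [('picture', 'NN'), ('quality', 'NN'), ('is', 'VBZ'), ('great', 'JJ')]
nouns, noun_phrases = [], []
for i, (word, tag) in enumerate(tagged_sent):
    if tag.startswith('N') and tag != 'NNP':
        # merge consecutive nouns into a phrase, as in the example above
        if i > 0 and nouns and tagged_sent[i - 1][0] == nouns[-1] and (nouns[-1] + ' ' + word) in sentence:
            noun_phrases.append(wnl.lemmatize(nouns.pop() + ' ' + word))
        else:
            nouns.append(wnl.lemmatize(word))
print(nouns, noun_phrases)   # -> [] and ['picture quality']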
Example 2: init_feature_sentences
# Required import: from nltk import WordNetLemmatizer [as alias]
# Or: from nltk.WordNetLemmatizer import lemmatize [as alias]
def init_feature_sentences(self, total_content):
    t = Tokenizer()
    p = POSTagger()
    wnl = WordNetLemmatizer()
    sentences = t.sent_tokenize(total_content.lower())
    for sentence in sentences:
        tagged_sentence = p.ntlk_tag(t.word_tokenize(sentence))
        # Initializing the feature sentence dictionary
        feature_sentence = {}
        feature_sentence['sentence'] = sentence
        feature_sentence['tags'] = tagged_sentence
        feature_sentence['nouns'] = []
        feature_sentence['noun_phrases'] = []
        # Finding the nouns/noun phrases in the tagged sentence
        for i in range(0, len(tagged_sentence)):
            (word, tag) = tagged_sentence[i]
            # Chunking
            if tag.startswith('N') and tag != 'NNP':
                if i > 0 and len(feature_sentence['nouns']) > 0 and tagged_sentence[i - 1][0] == feature_sentence['nouns'][-1] and feature_sentence['sentence'].find(feature_sentence['nouns'][-1] + ' ' + word) > -1:
                    feature_sentence['noun_phrases'].append(wnl.lemmatize(feature_sentence['nouns'].pop() + ' ' + word))
                else:
                    feature_sentence['nouns'].append(wnl.lemmatize(word))
        self.feature_sentences.append(feature_sentence)
Example 3: write_clean_turian_unigrams
# Required import: from nltk import WordNetLemmatizer [as alias]
# Or: from nltk.WordNetLemmatizer import lemmatize [as alias]
def write_clean_turian_unigrams():
    """
    Extracts unigram embeddings from Socher's binary distribution. These can be used by other composers.
    There are only 50k embeddings (presumably for the most frequent tokens in the corpus). The words have not
    been processed: there are punctuation-only tokens, uppercased words and non-lemmatized words. There isn't
    any PoS tag filtering either, so words like "to", "while" and "there" are included.
    I remove punctuation, then lowercase and lemmatize each entry. Multiple entries may map to the
    same canonical form. I select the shortest original entry (ties are broken by giving preference to
    words that are already lowercased). This could have been done better.
    Only vectors for the selected entries are kept. There are about 33k canonical
    forms left, many of which are not nouns/adjectives/verbs.
    We don't have a PoS tag for the canonical forms. I get around the problem by creating 3 copies of each
    canonical form, expanding "cat" to cat/N, cat/J and cat/V, which all share the same vector.
    """
    logging.info('Writing Turian unigrams to %s', turian_unigram_vectors_file)
    mat = loadmat(socher_unigram_embedding_matlab)
    words = [w[0] for w in mat['words'].ravel()]
    df = pd.DataFrame(mat['We'].T, index=words)
    lmtzr = WordNetLemmatizer()
    clean_to_dirty = defaultdict(list)  # canonical -> [non-canonical]
    dirty_to_clean = dict()  # non-canonical -> canonical
    to_keep = set()  # which non-canonical forms we will keep
    # todo this can be done based on frequency or something
    for w in words:
        if set(w).intersection(set(string.punctuation).union(set('0123456789'))):
            # not a real word: contains digits or punctuation
            continue
        lemma = lmtzr.lemmatize(w.lower())
        clean_to_dirty[lemma].append(w)
        dirty_to_clean[w] = lemma
    # decide which of possibly many non-canonical forms with the same lemma to keep
    # prefer shorter and lowercased non-canonical forms
    for lemma, dirty_list in clean_to_dirty.items():
        if len(dirty_list) > 1:
            best_lemma = min(dirty_list, key=lambda w: (len(w), not w.islower()))
        else:
            best_lemma = dirty_list[0]
        to_keep.add(best_lemma)
    # remove non-canonical forms we don't want
    idx_to_drop = [i for i, w in enumerate(df.index) if w not in to_keep]
    ddf = df.drop(df.index[idx_to_drop])
    # canonicalize whatever is left
    ddf.index = [lmtzr.lemmatize(w.lower()) for w in ddf.index]
    # we don't know what the PoS tags of the canonical forms are, so make them all of the same tag
    # e.g. expand "cat" to cat/N, cat/J and cat/V, which all share the same vector
    new_index = ['%s/%s' % (w, pos) for pos in 'NJV' for w in ddf.index]
    new_data = np.vstack([ddf.values] * 3)
    ddf = pd.DataFrame(new_data, index=new_index)
    dv = DenseVectors(ddf, allow_lexical_overlap=True)
    dv.to_tsv(turian_unigram_vectors_file)
    logging.info('Done')
Example 4: lemmatize
# Required import: from nltk import WordNetLemmatizer [as alias]
# Or: from nltk.WordNetLemmatizer import lemmatize [as alias]
def lemmatize(tokens):
    # lemmatize words: try both noun and verb lemmatizations
    lmtzr = WordNetLemmatizer()
    for i in range(0, len(tokens)):
        res = lmtzr.lemmatize(tokens[i])
        if res == tokens[i]:
            tokens[i] = lmtzr.lemmatize(tokens[i], 'v')
        else:
            tokens[i] = res
    return tokens
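A brief usage sketch for this helper (assuming the WordNet data is available): the verb lemmatization is only applied when the noun lemmatization leaves the token unchanged.

from nltk import WordNetLemmatizer

print(lemmatize(['cats', 'running', 'geese']))   # -> ['cat', 'run', 'goose']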
Example 5: Lemmatizer
# Required import: from nltk import WordNetLemmatizer [as alias]
# Or: from nltk.WordNetLemmatizer import lemmatize [as alias]
class Lemmatizer():
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = SnowballStemmer("english", ignore_stopwords=True)

    '''
    Tokenizes a sentence, lemmatizes every word, and returns the re-joined string.
    sentence: str
    '''
    def lemmatize(self, sentence):
        tokens = word_tokenize(sentence)
        lemmas = self.lemmatizeTokens(tokens)
        return " ".join(lemmas)

    '''
    Turns phrase tokens into lemmatized tokens, i.e. into the standard form
    determined by the nltk lemmatizer: "Dogs" to "dog", "went" to "go", etc.
    tokens: list of str
    '''
    def lemmatizeTokens(self, tokens):
        tokens_tagged = pos_tag(tokens)
        # Get simple POS tags.
        tokens_simpleTags = [(word, map_tag('en-ptb', 'universal', tag))
                             for word, tag in tokens_tagged]
        # Actually lemmatize.
        lemmas = []
        for token, tag in tokens_simpleTags:
            lemmatized = ""
            if tag == "VERB":
                lemmatized = self.lemmatizer.lemmatize(token, pos='v')
            elif tag == "ADJ":
                lemmatized = self.lemmatizer.lemmatize(token, pos='a')
            elif tag == "ADV":
                lemmatized = self.lemmatizer.lemmatize(token, pos='r')
            else:
                lemmatized = self.lemmatizer.lemmatize(token)  # pos = 'n'
            lemmas.append(lemmatized.encode("utf-8"))
        return lemmas

    '''
    Reduce a word down to its most basic form by removing suffixes or common endings
    and finding the "root" or "stem" of the word.
    Example: "response", "responsive" and "responsivity" all stem to "respons" or
    something similar.
    '''
    def stem(self, tokens):
        stemmed = []
        for token in tokens:
            stem = self.stemmer.stem(token)
            stemmed.append(stem.encode("utf-8"))
        return stemmed
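A usage sketch for this class. The imports below are assumptions about what the original module provides (the snippet itself only defines the class), and the .encode("utf-8") calls suggest it targets Python 2; it also needs the punkt, averaged_perceptron_tagger, universal_tagset and wordnet NLTK data packages.

# Python 2 usage sketch; these imports are assumed, not part of the original snippet
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tag import map_tag

lem = Lemmatizer()
print lem.lemmatize("The dogs went home")   # roughly: "The dog go home"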
Example 6: lemmstem
# Required import: from nltk import WordNetLemmatizer [as alias]
# Or: from nltk.WordNetLemmatizer import lemmatize [as alias]
def lemmstem(sentences):
    ''' This function is responsible for performing
    the lemmatization and stemming of the words.
    Input: a list of trees containing the sentences,
    with all words classified by their NE type.
    Output: lemmatized/stemmed sentences
    '''
    lmtzr = WordNetLemmatizer()
    st = LancasterStemmer()
    dic = {'VB': wordnet.VERB,
           'NN': wordnet.NOUN,
           'JJ': wordnet.ADJ,
           'RB': wordnet.ADV}
    for sent in sentences:
        lvsidx = sent.treepositions('leaves')
        for pos in lvsidx:
            word = sent[pos][0]
            tag = sent[pos][1]
            rtag = tag[0:2]
            if rtag in dic:
                lemm = lmtzr.lemmatize(word, dic[rtag])
                stem = st.stem(lemm)
                # print word, lemm, stem  # cursed line
                sent[pos] = (word, tag, stem)
            else:
                sent[pos] = (word, tag, word)
    return sentences
Example 7: text2sents
# Required import: from nltk import WordNetLemmatizer [as alias]
# Or: from nltk.WordNetLemmatizer import lemmatize [as alias]
def text2sents(text, lemmatize=False, stemmer=None):
    """
    Converts a text into a list of sentences consisting of normalized words.
    :param text: string to process
    :param lemmatize: if True, words will be lemmatized, otherwise stemmed
    :param stemmer: stemmer to be used; if None, PorterStemmer is used. Only applied if lemmatize == False
    :return: list of lists of words
    """
    sents = sent_tokenize(text)
    tokenizer = RegexpTokenizer(r'\w+')
    if lemmatize:
        normalizer = WordNetLemmatizer()
        tagger = PerceptronTagger()
    elif stemmer is None:
        normalizer = PorterStemmer()
    else:
        normalizer = stemmer
    sents_normalized = []
    for sent in sents:
        sent_tokenized = tokenizer.tokenize(sent)
        if lemmatize:
            sent_tagged = tagger.tag(sent_tokenized)
            sent_normalized = [normalizer.lemmatize(w[0], get_wordnet_pos(w[1])) for w in sent_tagged]
        else:
            sent_normalized = [normalizer.stem(w) for w in sent_tokenized]
        sents_normalized.append(sent_normalized)
    return sents_normalized
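This snippet relies on a get_wordnet_pos helper that is not shown. A typical definition (an assumption, not necessarily the original author's) maps the Penn Treebank tags produced by PerceptronTagger to the WordNet POS constants that lemmatize() expects:

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # hypothetical helper: map Penn Treebank tags to WordNet POS tags,
    # falling back to noun for anything unrecognised
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN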
Example 8: preprocess
# Required import: from nltk import WordNetLemmatizer [as alias]
# Or: from nltk.WordNetLemmatizer import lemmatize [as alias]
def preprocess(original_str):
    # lemmatizer
    wnl = WordNetLemmatizer()
    # POS tagging
    original_str = unicode(original_str, errors='ignore')
    print type(original_str)
    article_tok = pos_tag(word_tokenize(original_str))
    print type(article_tok)
    print "token: "
    print article_tok
    # keep nouns and adjectives
    str_noun = ''
    for word, tag in article_tok:
        if ("NN" in tag) or ("JJ" in tag):
            # print(word, ":", tag)
            # print(wnl.lemmatize(word))
            try:
                stemming_word = wnl.lemmatize(word)
                print stemming_word
                if len(word) > 1:
                    str_noun = str_noun + stemming_word + " "
            except UnicodeDecodeError as e:
                print "error: " + word
        # end if
    # result
    # final_doc.append(str_noun)
    # print "return_preprocess : " + str_noun
    return str_noun
Example 9: feature_extractor_top_words_weights
# Required import: from nltk import WordNetLemmatizer [as alias]
# Or: from nltk.WordNetLemmatizer import lemmatize [as alias]
def feature_extractor_top_words_weights(data):
    data = data.decode('utf-8')
    top_words = ['travel', 'vacation', 'city', 'itsmorefuninthephilippines', 'travel',
                 'boracay', 'philippine', 'view', 'day', 'beach', 'morning', 'resort',
                 'good', 'cebu', 'island']
    features = {}
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]
    for word in words:
        if word not in stop_words:
            if word in features:
                if word in top_words:
                    features[word] += 1.5
                else:
                    features[word] += 1
            else:
                if word in top_words:
                    features[word] = 1.5
                else:
                    features[word] = 1
    return features
Example 10: returnKeywordFromList
# Required import: from nltk import WordNetLemmatizer [as alias]
# Or: from nltk.WordNetLemmatizer import lemmatize [as alias]
def returnKeywordFromList(convertpath):
    token_dict = {}
    i = 0
    # nltk.download()
    wnl = WordNetLemmatizer()
    fileName = {}
    # print file
    # print str(i) + file
    # file_path = subdir + os.path.sep + file
    shakes = open(convertpath, 'r')
    text = shakes.read()
    lowers = "".join(map(lambda l: l.decode('unicode_escape').encode('ascii', 'ignore'), text))
    no_punctuation = re.sub(r'[?|$|.|!0-9()=+-\/\'\"\|]', r'', lowers)
    d = {v: True for v in no_punctuation.split()}
    for token in d.keys():
        no_punctuation = no_punctuation.replace(token, wnl.lemmatize(token))
    fileName[i] = file
    token_dict[i] = no_punctuation.replace("\n", " ").replace("\r", "")
    # break
    # this can take some time
    # print token_dict.values()
    tfidf_vect = TfidfVectorizer(stop_words=stops, ngram_range=(1, 2))
    # count_vect.stop_words = stops
    X_train_counts = tfidf_vect.fit_transform(token_dict.values())
    # print tfidf_vect.get_feature_names()
    # print(sortSparseMatrix(X_train_counts.getrow(0), rev=False, only_indices=False))
    sortedMatrix = sortSparseMatrix(X_train_counts.getrow(0), rev=True, only_indices=False)[0]
    x = map(lambda (x, y): x, sortedMatrix)
    result = getKeywordAlgorithms(1, sortedMatrix)
    return map(lambda key: tfidf_vect.get_feature_names()[key], result)
Example 11: feature_extractor_tripadvisor_top_words_weights
# Required import: from nltk import WordNetLemmatizer [as alias]
# Or: from nltk.WordNetLemmatizer import lemmatize [as alias]
def feature_extractor_tripadvisor_top_words_weights(data):
    data = data.decode('utf-8')
    top_file = open('scraper/top_words.txt', 'r')
    top_words = [word.replace('\n', '') for word in top_file]
    places_file = open('scraper/places.txt', 'r')
    for place in places_file:
        place = place.replace('\n', '')
        for word in place.split(' '):
            if word != '-':
                top_words.append(word)
    features = {}
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]
    for word in words:
        if word not in stop_words:
            if word in features:
                if word in top_words:
                    features[word] += 1.5
                else:
                    features[word] += 1
            else:
                if word in top_words:
                    features[word] = 1.5
                else:
                    features[word] = 1
    return features
Example 12: lemmatizing
# Required import: from nltk import WordNetLemmatizer [as alias]
# Or: from nltk.WordNetLemmatizer import lemmatize [as alias]
def lemmatizing(line_list):
    """
    Input: line_list (list of strings (sentences/documents)) - e.g. dataset.data
    Iterates over all terms in the lines and lemmatizes them using WordNetLemmatizer()
    Return: lemmatized_list (list of strings of lemmatized terms)
    """
    lemmatized_list = []
    lemmatizer = WordNetLemmatizer()
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()
        # remove punctuation
        # the method below would simply strip punctuation, but makes mistakes
        # such as amazon.com => amazoncom:
        # nopunct_line = ''.join([c for c in line
        #                         if re.match("[a-z\-\' \n\t]", c)])
        # this solves the problem above:
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)
        # tokenize
        line_token = wt(nopunct_line)
        # lemmatize
        lemmatized_line = []
        for term in line_token:
            term = lemmatizer.lemmatize(term)
            lemmatized_line.append(term)
        # back to a sentence string
        lemmatized_sentence = ' '.join(lemmatized_line)
        lemmatized_list.append(lemmatized_sentence)
    return lemmatized_list
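A short usage sketch, assuming wt above is nltk.word_tokenize imported under that alias and the punkt and wordnet data are installed; note that the amazon.com case from the comment is kept as two tokens:

from nltk import word_tokenize as wt   # assumed alias used by the snippet

print(lemmatizing(["Amazon.com: books, cars and houses"]))
# roughly: ['amazon com book car and house']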
Example 13: feature_extractor_top_words_weights
# Required import: from nltk import WordNetLemmatizer [as alias]
# Or: from nltk.WordNetLemmatizer import lemmatize [as alias]
def feature_extractor_top_words_weights(data):
    """
    Extract features using the top-words-with-weights method
    parameter: data (tweet)
    returns: features of the given data
    """
    data = data.decode('utf-8')
    # top 15 frequently-occurring words from the tourism-related twitter corpus
    top_words = ['travel', 'vacation', 'city', 'itsmorefuninthephilippines', 'travel',
                 'boracay', 'philippine', 'view', 'day', 'beach', 'morning', 'resort',
                 'good', 'cebu', 'island']
    features = {}
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    # preprocessing: tokenize, convert to lowercase and lemmatize words
    words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]
    # remove stop words and add words and their frequencies as features
    for word in words:
        if word not in stop_words:
            if word in features:
                # if the word is in the top words list, increase by 1.5 (or a preferred weight)
                if word in top_words:
                    features[word] += 1.5
                else:
                    features[word] += 1
            else:
                if word in top_words:
                    features[word] = 1.5
                else:
                    features[word] = 1
    return features
Example 14: Check
# Required import: from nltk import WordNetLemmatizer [as alias]
# Or: from nltk.WordNetLemmatizer import lemmatize [as alias]
def Check(mArray):
    # what am I checking?
    item = mArray[1]
    lmtzr = WordNetLemmatizer()
    item = lmtzr.lemmatize(item)
    # converts to a string
    return ''.join(item)
Example 15: word_extractor2
# Required import: from nltk import WordNetLemmatizer [as alias]
# Or: from nltk.WordNetLemmatizer import lemmatize [as alias]
def word_extractor2(text):
    wordlemmatizer = WordNetLemmatizer()
    # collapse any run of a repeated letter to just two (e.g. "coooool" -> "cool")
    text = re.sub(r'([a-z])\1+', r'\1\1', text)
    words = ""
    wordtokens = [wordlemmatizer.lemmatize(word.lower())
                  for word in word_tokenize(text.decode('utf-8', 'ignore'))]
    for word in wordtokens:
        words += " " + word
    return words