本文整理匯總了Python中nltk.WordNetLemmatizer類的典型用法代碼示例。如果您正苦於以下問題:Python WordNetLemmatizer類的具體用法?Python WordNetLemmatizer怎麼用?Python WordNetLemmatizer使用的例子?那麼,這裡精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了WordNetLemmatizer類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: text2sents
def text2sents(text, lemmatize=False, stemmer=None):
    """
    Split a text into sentences and normalize the words of each sentence.

    :param text: text to process; it is handed to ``sent_tokenize``
    :param lemmatize: if true, words are POS-tagged and lemmatized, otherwise they are stemmed
    :param stemmer: stemmer used when ``lemmatize`` is false; a ``PorterStemmer`` is created if None
    :return: list of sentences, each one a list of normalized words
    """
    sentences = sent_tokenize(text)
    word_splitter = RegexpTokenizer(r'\w+')

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        pos_tagger = PerceptronTagger()

        def normalize(tokens):
            # The tagger yields (word, tag) pairs; each tag is passed through
            # get_wordnet_pos before being given to the lemmatizer.
            return [lemmatizer.lemmatize(word, get_wordnet_pos(tag))
                    for word, tag in pos_tagger.tag(tokens)]
    else:
        active_stemmer = PorterStemmer() if stemmer is None else stemmer

        def normalize(tokens):
            return [active_stemmer.stem(token) for token in tokens]

    return [normalize(word_splitter.tokenize(sentence)) for sentence in sentences]
示例2: lemmatizing
def lemmatizing(line_list):
    """
    Lowercase, de-punctuate, tokenize and lemmatize every line.

    Input: line_list (list of strings (sentences/documents)) - e.g. dataset.data
    Return: list of strings, one per input line, made of the lemmatized terms
            joined by single spaces
    """
    wordnet_lemmatizer = WordNetLemmatizer()

    def lemmatize_line(raw_line):
        # Each run of non-alphanumeric characters becomes one space instead of
        # being deleted, so "amazon.com" turns into "amazon com", not "amazoncom".
        cleaned = re.sub('[^A-Za-z0-9]+', ' ', raw_line.lower())
        return ' '.join(wordnet_lemmatizer.lemmatize(token) for token in wt(cleaned))

    return [lemmatize_line(raw_line) for raw_line in line_list]
示例3: feature_extractor_tripadvisor_top_words_weights
def feature_extractor_tripadvisor_top_words_weights(data):
    """
    Build weighted word-count features for a piece of text.

    The text is tokenized, lowercased and lemmatized. English stop words are
    skipped. Every remaining occurrence adds 1.5 to the word's feature when the
    word is a "top word", and 1 otherwise. Top words are the lines of
    ``scraper/top_words.txt`` plus every space-separated part (except ``-``)
    of the lines of ``scraper/places.txt``.

    parameter: data -- encoded text; it must provide ``.decode('utf-8')``
    returns: dict mapping each kept word to its weighted count
    """
    data = data.decode('utf-8')

    # Context managers close both word lists, even when reading one of them fails
    # (the handles were previously never closed).
    with open('scraper/top_words.txt', 'r') as top_file:
        top_words = {line.replace('\n', '') for line in top_file}
    with open('scraper/places.txt', 'r') as places_file:
        for place in places_file:
            top_words.update(
                part for part in place.replace('\n', '').split(' ') if part != '-'
            )

    lemmatizer = WordNetLemmatizer()
    # Sets give constant-time membership tests inside the per-word loop.
    stop_words = set(stopwords.words('english'))

    features = {}
    for token in word_tokenize(data):
        word = lemmatizer.lemmatize(token.lower())
        if word in stop_words:
            continue
        weight = 1.5 if word in top_words else 1
        features[word] = features.get(word, 0) + weight
    return features
示例4: returnKeywordFromList
def returnKeywordFromList(convertpath):
    """
    Extract keywords from the text file at ``convertpath``.

    The file content is stripped of the characters matched by the punctuation
    pattern below, every distinct whitespace-separated token is replaced by its
    WordNet lemma, and the result is vectorized as a single document with
    TF-IDF over unigrams and bigrams. The sorted first row of that matrix is
    handed to ``getKeywordAlgorithms``, whose result is used as indices into
    the vectorizer's feature names.

    :param convertpath: path of the text file to read
    :return: list of feature names selected by ``getKeywordAlgorithms``
    """
    wnl = WordNetLemmatizer()

    # Read through a context manager so the handle is always closed.
    with open(convertpath, 'r') as source_file:
        text = source_file.read()

    # NOTE(review): the per-character decode/encode relies on Python 2 byte
    # strings (``str.decode``) -- confirm before running this on Python 3.
    lowers = "".join(map(lambda l: l.decode('unicode_escape').encode('ascii', 'ignore'), text))
    no_punctuation = re.sub(r'[?|$|.|!0-9()=+-\/\'\"\|]', r'', lowers)

    # Lemmatize each distinct token once. str.replace works on substrings, so a
    # token that also occurs inside a longer word is rewritten there as well.
    distinct_tokens = {token: True for token in no_punctuation.split()}
    for token in distinct_tokens:
        no_punctuation = no_punctuation.replace(token, wnl.lemmatize(token))
    document = no_punctuation.replace("\n", " ").replace("\r", "")

    tfidf_vect = TfidfVectorizer(stop_words=stops, ngram_range=(1, 2))
    tfidf_matrix = tfidf_vect.fit_transform([document])

    sorted_row = sortSparseMatrix(tfidf_matrix.getrow(0), rev=True, only_indices=False)[0]
    result = getKeywordAlgorithms(1, sorted_row)

    # Fetch the vocabulary once instead of once per selected key.
    feature_names = tfidf_vect.get_feature_names()
    return [feature_names[key] for key in result]
示例5: feature_extractor_top_words_weights
def feature_extractor_top_words_weights(data):
    """
    Extract features using the top-words-with-weights method.

    parameter: data (tweet); it is decoded as UTF-8 before processing
    returns: dict mapping each non-stop-word lemma to its weighted count
    """
    # frequently occurring words from the tourism-related twitter corpus
    top_words = ['travel', 'vacation', 'city', 'itsmorefuninthephilippines', 'travel',
                 'boracay', 'philippine', 'view', 'day', 'beach', 'morning', 'resort',
                 'good', 'cebu', 'island']
    text = data.decode('utf-8')
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    features = {}
    # preprocessing: tokenize, convert to lowercase and lemmatize; stop words are skipped
    for token in word_tokenize(text):
        word = lemmatizer.lemmatize(token.lower())
        if word in stop_words:
            continue
        # a top word contributes 1.5 per occurrence, any other word contributes 1
        increment = 1.5 if word in top_words else 1
        features[word] = features.get(word, 0) + increment
    return features
示例6: init_feature_sentences
def init_feature_sentences(self, total_content):
    """
    Split ``total_content`` into sentences and record the nouns of each one.

    For every sentence of the lowercased content a dict with the keys
    'sentence', 'tags', 'nouns' and 'noun_phrases' is appended to
    ``self.feature_sentences``.
    """
    tokenizer = Tokenizer()
    tagger = POSTagger()
    lemmatizer = WordNetLemmatizer()

    for sentence in tokenizer.sent_tokenize(total_content.lower()):
        # NOTE(review): the method name is spelled ``ntlk_tag`` here -- confirm
        # it matches the POSTagger API.
        tagged_sentence = tagger.ntlk_tag(tokenizer.word_tokenize(sentence))
        nouns = []
        noun_phrases = []

        # Chunking: look at every tag that starts with 'N', except 'NNP'
        for position, (word, tag) in enumerate(tagged_sentence):
            if not tag.startswith('N') or tag == 'NNP':
                continue
            # The word extends the previously collected noun when that noun is
            # the preceding token and "<noun> <word>" occurs in the sentence.
            extends_previous = (
                position > 0
                and len(nouns) > 0
                and tagged_sentence[position - 1][0] == nouns[-1]
                and sentence.find(nouns[-1] + ' ' + word) > -1
            )
            if extends_previous:
                noun_phrases.append(lemmatizer.lemmatize(nouns.pop() + ' ' + word))
            else:
                nouns.append(lemmatizer.lemmatize(word))

        self.feature_sentences.append({
            'sentence': sentence,
            'tags': tagged_sentence,
            'nouns': nouns,
            'noun_phrases': noun_phrases,
        })
示例7: write_clean_turian_unigrams
def write_clean_turian_unigrams():
    """
    Clean the unigram embeddings loaded from ``socher_unigram_embedding_matlab``
    and write them to ``turian_unigram_vectors_file``.

    Entries containing punctuation or digits are skipped. Every remaining entry
    is lowercased and lemmatized; when several entries share a lemma, only the
    shortest one is kept (ties go to entries that are already lowercase). The
    surviving rows are re-indexed by their lemma.

    No PoS tag is known for the resulting forms, so each one is written three
    times -- as form/N, form/J and form/V -- all sharing the same vector.
    """
    logging.info('Writing Turian unigrams to %s', turian_unigram_vectors_file)
    mat = loadmat(socher_unigram_embedding_matlab)
    words = [entry[0] for entry in mat['words'].ravel()]
    embeddings = pd.DataFrame(mat['We'].T, index=words)

    lemmatizer = WordNetLemmatizer()
    # characters that mark an entry as "not a real word"
    non_word_chars = set(string.punctuation).union(set('0123456789'))

    lemma_to_originals = defaultdict(list)  # canonical -> [non-canonical]
    original_to_lemma = dict()  # non-canonical -> canonical (filled, not read here)
    for word in words:
        if set(word).intersection(non_word_chars):
            continue
        lemma = lemmatizer.lemmatize(word.lower())
        lemma_to_originals[lemma].append(word)
        original_to_lemma[word] = lemma

    # For each lemma keep one original entry: prefer shorter, then lowercased.
    # min() over a single-element list simply yields that element.
    to_keep = set()
    for originals in lemma_to_originals.values():
        to_keep.add(min(originals, key=lambda w: (len(w), not w.islower())))

    # remove the rows of the original entries we do not keep
    dropped_positions = [pos for pos, word in enumerate(embeddings.index)
                         if word not in to_keep]
    kept = embeddings.drop(embeddings.index[dropped_positions])

    # canonicalize whatever is left
    kept.index = [lemmatizer.lemmatize(word.lower()) for word in kept.index]

    # e.g. expand "cat" to cat/N, cat/J and cat/V; the index is ordered tag by
    # tag, matching the three stacked copies of the value matrix
    tagged_index = ['%s/%s' % (word, pos) for pos in 'NJV' for word in kept.index]
    stacked_values = np.vstack([kept.values] * 3)
    expanded = pd.DataFrame(stacked_values, index=tagged_index)

    dv = DenseVectors(expanded, allow_lexical_overlap=True)
    dv.to_tsv(turian_unigram_vectors_file)
    logging.info('Done')
示例8: feature_extractor_top_words_weights
def feature_extractor_top_words_weights(data):
    """
    Count the lemmatized, non-stop-word tokens of ``data`` with extra weight for top words.

    ``data`` is decoded as UTF-8, tokenized, lowercased and lemmatized. Each
    occurrence of a word from the top-words list adds 1.5 to its feature value,
    each occurrence of any other word adds 1.

    returns: dict mapping word to weighted count
    """
    data = data.decode('utf-8')
    top_words = ['travel', 'vacation', 'city', 'itsmorefuninthephilippines', 'travel',
                 'boracay', 'philippine', 'view', 'day', 'beach', 'morning', 'resort',
                 'good', 'cebu', 'island']
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    lemmas = (lemmatizer.lemmatize(token.lower()) for token in word_tokenize(data))

    features = {}
    for lemma in lemmas:
        if lemma not in stop_words:
            weight = 1.5 if lemma in top_words else 1
            if lemma in features:
                features[lemma] += weight
            else:
                features[lemma] = weight
    return features
示例9: preprocess
def preprocess(original_str):
    """
    Keep the lemmatized nouns and adjectives of a text.

    The text is tokenized and POS-tagged. Words whose tag contains "NN" or "JJ"
    are lemmatized, and the lemma is kept when the original word is longer than
    one character. Debug information (types, tagged tokens, each lemma) is
    printed to standard output along the way.

    :param original_str: text to process; a byte string is decoded as ASCII
        with undecodable bytes dropped, text input is used unchanged
    :return: the kept lemmas, each followed by one space (so a non-empty result
        ends with a space)
    """
    wnl = WordNetLemmatizer()

    # ``unicode(..., errors='ignore')`` exists only on Python 2 and rejects input
    # that is already text. Decoding byte strings explicitly works on Python 2
    # and 3 alike; ASCII is used because it is the default Python 2 codec.
    if isinstance(original_str, bytes):
        original_str = original_str.decode('ascii', 'ignore')
    print(type(original_str))

    article_tok = pos_tag(word_tokenize(original_str))
    print(type(article_tok))
    print("token: ")
    print(article_tok)

    # choose nouns and adjectives
    kept_lemmas = []
    for word, tag in article_tok:
        if ("NN" in tag) or ("JJ" in tag):
            try:
                lemma = wnl.lemmatize(word)
            except UnicodeDecodeError:
                print("error: " + word)
            else:
                print(lemma)
                if len(word) > 1:
                    kept_lemmas.append(lemma)

    # A single join avoids rebuilding the result string for every kept word.
    return ''.join(lemma + " " for lemma in kept_lemmas)
示例10: lemmstem
def lemmstem(sentences):
    ''' Lemmatize and then stem the words of every sentence tree.

    Input: A list of trees containing the sentences.
           Each leaf is read as (word, tag, ...)
    Output: The same list; every leaf is replaced by a
            (word, tag, normalized word) triple
    '''
    lemmatizer = WordNetLemmatizer()
    stemmer = LancasterStemmer()
    # first two letters of the tag -> WordNet part of speech
    wordnet_pos_by_prefix = {
        'VB': wordnet.VERB,
        'NN': wordnet.NOUN,
        'JJ': wordnet.ADJ,
        'RB': wordnet.ADV,
    }
    for tree in sentences:
        leaf_positions = tree.treepositions('leaves')
        for leaf_position in leaf_positions:
            leaf = tree[leaf_position]
            word, tag = leaf[0], leaf[1]
            wordnet_pos = wordnet_pos_by_prefix.get(tag[:2])
            if wordnet_pos is None:
                # tags outside the table keep the word unchanged
                normalized = word
            else:
                normalized = stemmer.stem(lemmatizer.lemmatize(word, wordnet_pos))
            tree[leaf_position] = (word, tag, normalized)
    return sentences
示例11: __init__
def __init__(self, text, product_name):
    """
    Tokenize and tag ``text`` and collect the nouns / noun phrases of each sentence.

    ``self.product_name`` holds the '_'-separated parts of the lowercased
    product name up to its first '-'. One dict per sentence (keys 'sentence',
    'tags', 'nouns', 'noun_phrases') is appended to ``self.feature_sentences``.
    """
    self.candidate_features = []
    self.feature_sentences = []
    self.product_name = product_name.lower().split('-')[0].split('_')

    tokenizer = Tokenizer()
    sentences = tokenizer.sent_tokenize(text.lower())
    tagger = POSTagger()
    lemmatizer = WordNetLemmatizer()

    for sentence in sentences:
        tagged = tagger.nltk_tag(tokenizer.word_tokenize(sentence))
        record = {}
        record['sentence'] = sentence
        record['tags'] = tagged
        nouns = record['nouns'] = []
        phrases = record['noun_phrases'] = []

        for index, (word, tag) in enumerate(tagged):
            # Don't include proper nouns
            if tag.startswith('N') and tag != 'NNP':
                # Consecutive nouns might form a feature phrase. Eg. Picture quality is a phrase.
                # Meaningless phrases like 'quality digital' are removed later as their
                # frequency of occurrence is low.
                follows_last_noun = (
                    index > 0
                    and len(nouns) > 0
                    and tagged[index - 1][0] == nouns[-1]
                    and sentence.find(nouns[-1] + ' ' + word) > -1
                )
                if follows_last_noun:
                    phrases.append(lemmatizer.lemmatize(nouns.pop() + ' ' + word))
                else:
                    nouns.append(lemmatizer.lemmatize(word))

        self.feature_sentences.append(record)
示例12: Check
def Check(mArray):
    """Return the WordNet lemma of the second element of ``mArray`` as a string."""
    # the item being checked is the second element
    target = mArray[1]
    lemma = WordNetLemmatizer().lemmatize(target)
    # joining on the empty string converts the result to a string
    return ''.join(lemma)
示例13: word_extractor2
def word_extractor2(text):
    """
    Return the lowercased, lemmatized tokens of ``text`` as one string.

    Every token is preceded by a single space, so a non-empty result starts
    with a space; the result is empty when there are no tokens.
    """
    wordlemmatizer = WordNetLemmatizer()
    # collapse a lowercase letter repeated two or more times to exactly two
    squeezed = re.sub(r'([a-z])\1+', r'\1\1', text)
    decoded = squeezed.decode('utf-8', 'ignore')
    lemmas = [wordlemmatizer.lemmatize(token.lower())
              for token in word_tokenize(decoded)]
    return "".join(" " + lemma for lemma in lemmas)
示例14: Check
def Check(mArray):
    """Lemmatize the second element of ``mArray`` and return the result as a string."""
    # The 2nd item is taken since popopen puts the file path as the first item.
    target = mArray[1]
    lemmatizer = WordNetLemmatizer()
    # NOTE(review): get_wordnet_pos receives the word itself here, not a POS
    # tag -- confirm that this is what the helper expects.
    wordnet_pos = get_wordnet_pos(target)
    # joining on the empty string converts the result to a string
    return ''.join(lemmatizer.lemmatize(target, wordnet_pos))
示例15: lemmatize
def lemmatize(tokens):
    """
    Lemmatize ``tokens`` in place and return the same list.

    Each token is first lemmatized with the lemmatizer's default part of
    speech; when that leaves the token unchanged, the verb lemmatization
    ('v') is used instead.
    """
    lemmatizer = WordNetLemmatizer()
    for position, token in enumerate(tokens):
        default_lemma = lemmatizer.lemmatize(token)
        tokens[position] = (
            default_lemma if default_lemma != token
            else lemmatizer.lemmatize(token, 'v')
        )
    return tokens