This article collects typical usage examples of the Python method nltk.stem.WordNetLemmatizer.lemmatize. If you are wondering what WordNetLemmatizer.lemmatize does, how to call it, or what real-world uses look like, the curated code samples below should help. You can also read more about the class it belongs to, nltk.stem.WordNetLemmatizer.
The following shows 15 code examples of WordNetLemmatizer.lemmatize, sorted by popularity by default.
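Before the examples, here is a minimal sketch of the method's basic contract (this sketch is mine, not one of the 15 examples below): lemmatize(word, pos='n') returns the WordNet lemma of word for the given part of speech ('n', 'v', 'a', or 'r'; noun is the default), and returns the word unchanged when WordNet has no entry for it.

# Minimal sketch of WordNetLemmatizer.lemmatize; assumes the WordNet data is installed,
# e.g. via nltk.download('wordnet').
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("churches"))          # church   (default pos='n')
print(lemmatizer.lemmatize("running", pos="v"))  # run
print(lemmatizer.lemmatize("better", pos="a"))   # good
print(lemmatizer.lemmatize("quickly", pos="r"))  # quickly  (already a lemma, returned as-is)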
Example 1: getBoW
# Required import: from nltk.stem import WordNetLemmatizer (possibly under an alias)
# Method used: WordNetLemmatizer.lemmatize
def getBoW(self, instance):
    bowFeatures = {}
    # tokens in the third position
    tokens = instance[3]
    # pos tag
    wordnet_lemmatizer = WordNetLemmatizer()
    tagged = nltk.pos_tag(tokens)
    i = 0
    for tag in tagged:
        if instance[2] == i:
            i += 1
            continue
            #sys.stderr.write('remove target word (%s)\n' % tag[0])
        elif tag[0] in stopwords.words("english"):
            i += 1
            continue
            #sys.stderr.write('stopword (%s)\n' % tag[0])
        elif re.match("N.*", tag[1]):
            bowFeatures['bow(%s)' % wordnet_lemmatizer.lemmatize(tag[0], pos="n")] = True
        elif re.match("V.*", tag[1]):
            bowFeatures['bow(%s)' % wordnet_lemmatizer.lemmatize(tag[0], pos="v")] = True
        elif re.match("R.*", tag[1]):
            bowFeatures['bow(%s)' % wordnet_lemmatizer.lemmatize(tag[0], pos="r")] = True
        elif re.match("J.*", tag[1]):
            bowFeatures['bow(%s)' % wordnet_lemmatizer.lemmatize(tag[0], pos="a")] = True
        i += 1
    return bowFeatures
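Example 1 folds the Penn Treebank tag into lemmatize's pos argument by matching the tag prefix (N*, V*, R*, J*). The same mapping can be pulled out into a small standalone helper; the sketch below is my own illustration (the helper name get_wordnet_pos and the sample sentence are not from the example), and it needs the usual NLTK data (punkt, averaged_perceptron_tagger, wordnet).

# Standalone sketch of the Treebank-tag -> WordNet-POS mapping used in Example 1.
import nltk
from nltk.stem import WordNetLemmatizer

def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag to a WordNet POS label ('n', 'v', 'r', or 'a')."""
    if treebank_tag.startswith('N'):
        return 'n'
    if treebank_tag.startswith('V'):
        return 'v'
    if treebank_tag.startswith('R'):
        return 'r'
    if treebank_tag.startswith('J'):
        return 'a'
    return 'n'  # fall back to noun, the lemmatizer's default

lemmatizer = WordNetLemmatizer()
tokens = nltk.word_tokenize("The striped bats were hanging on their feet")
lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(tag))
          for word, tag in nltk.pos_tag(tokens)]
print(lemmas)  # expected output, e.g.: ['The', 'striped', 'bat', 'be', 'hang', 'on', 'their', 'foot']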
Example 2: get_words_list
# Required import: from nltk.stem import WordNetLemmatizer (possibly under an alias)
# Method used: WordNetLemmatizer.lemmatize
def get_words_list(dataset):
    '''
    Load the dataset, read the file contents, tokenize them, and lemmatize the words.
    '''
    # join the path and file name together
    spam_path = 'data/enron/pre/' + dataset + '/spam/'
    ham_path = 'data/enron/pre/' + dataset + '/ham/'
    spam_npl = [i[-1] for i in os.walk(spam_path)][0]
    ham_npl = [i[-1] for i in os.walk(ham_path)][0]
    spam_fl = (open(os.path.join(spam_path, j)).read().lower() for j in spam_npl)
    ham_fl = (open(os.path.join(ham_path, j)).read().lower() for j in ham_npl)
    splitter = re.compile("\\W*")
    english_stops = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # tokenize the files into words
    spam_wl = [None] * len(spam_npl)
    for i, f in enumerate(spam_fl):
        spam_wl[i] = [word for word in (lemmatizer.lemmatize(w) for w in splitter.split(f)
                                        if w not in english_stops and w.isalpha())
                      if len(word) > 2 and len(word) < 20]
    ham_wl = [None] * len(ham_npl)
    for i, f in enumerate(ham_fl):
        ham_wl[i] = [word for word in (lemmatizer.lemmatize(w) for w in splitter.split(f)
                                       if w not in english_stops and w.isalpha())
                     if len(word) > 2 and len(word) < 20]
    return spam_wl, ham_wl
Example 3: bow_score
# Required import: from nltk.stem import WordNetLemmatizer (possibly under an alias)
# Method used: WordNetLemmatizer.lemmatize
def bow_score(hypothesis_list, text_list):
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_word_list = ['a', 'an', 'the', ',', '.', ';', ':']
    i = 0
    while i < len(hypothesis_list):
        if hypothesis_list[i] in stop_word_list:
            del hypothesis_list[i]
            i = i - 1
        i = i + 1
    if len(hypothesis_list) == 0:
        return 0
    i = 0
    while i < len(text_list):
        if text_list[i] in stop_word_list:
            del text_list[i]
            i = i - 1
        i = i + 1
    if len(text_list) == 0:
        return 0
    ## Stop words removed up until here
    score = 0
    for word_text in text_list:
        lemma_text = wordnet_lemmatizer.lemmatize(word_text)
        for word_hypothesis in hypothesis_list:
            lemma_hypothesis = wordnet_lemmatizer.lemmatize(word_hypothesis)
            print(lemma_hypothesis)
            print(lemma_text)
            score += lexical_compare(lemma_text, lemma_hypothesis)
    print(str(score))
    return score
Example 4: possibility
# Required import: from nltk.stem import WordNetLemmatizer (possibly under an alias)
# Method used: WordNetLemmatizer.lemmatize
def possibility():
    wnl = WordNetLemmatizer()
    verb = wnl.lemmatize(verbs[random.randrange(0, len(verbs))])
    noun = wnl.lemmatize(nouns[random.randrange(0, len(nouns))])
    article = "a"
    if noun[0] in ["a", "e", "i", "o", "u"]:
        article = "an"
    if random.randrange(0, 100) < chance_quantity:
        quantity_word = quantity_adverbs[random.randrange(0, len(quantity_adverbs))]
        if not noun.endswith("s") and not noun.endswith("y") and not quantity_word == "numerous":
            noun += "s"
        possibility = verb + " " + quantity_word + " of the " + noun
    elif random.randrange(0, 100) < chance_location:
        location_word = location_adverbs[random.randrange(0, len(location_adverbs))]
        possibility = (
            verb
            + " "
            + article
            + " "
            + noun
            + " "
            + location_word
            + " the "
            + wnl.lemmatize(nouns[random.randrange(0, len(nouns))])
        )
    else:
        possibility = verb + " " + article + " " + noun
    return possibility
Example 5: get_clean_text
# Required import: from nltk.stem import WordNetLemmatizer (possibly under an alias)
# Method used: WordNetLemmatizer.lemmatize
def get_clean_text(list_filenames, path_to_file):
    '''
    Parameters:
    -----------
    list_filenames: LST, a list of filenames (each as STR)
    path_to_file: STR, the path to the directory containing the movie scripts
        --> such that path_to_file/filename.txt is the file to open
    Returns:
    --------
    list of lists of words (lemmatized, lowercased) in each text (order preserved)
    '''
    wnl = WordNetLemmatizer()
    list_texts_as_words = []
    for filename in list_filenames:
        path_file = path_to_file + "/" + filename + ".txt"
        with open(path_file) as f:
            text = f.readlines()
        lines = [line.strip() for line in text if line.strip()]
        string_words = []
        for line in lines:
            words = [wnl.lemmatize(word.lower()) for word in line.split(' ') if wnl.lemmatize(word.lower())]
            string_words += words
        list_texts_as_words.append(string_words)
    return list_texts_as_words
Example 6: TweetsLemmatizedVectorizer
# Required import: from nltk.stem import WordNetLemmatizer (possibly under an alias)
# Method used: WordNetLemmatizer.lemmatize
class TweetsLemmatizedVectorizer(TweetsTruncatedVectorizer):
    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words='english', min_df=5)  # , sublinear_tf=True)
        self.wordnet = WordNetLemmatizer()

    def fit_transform(self, users):
        join_tweets = []
        for user in users:
            timeline = [''.join(remove_tweet_noise(tweet.text)) for tweet in user.twitter]
            #timeline_insta = [''.join(remove_tweet_noise(insta.text)) for insta in user.instagram]
            #print timeline_insta
            #timeline = timeline + timeline_insta
            lemmatized = []
            for tweet in timeline:
                lemma = [self.wordnet.lemmatize(word) for word in tweet.split()]
                lemmatized.append(' '.join(lemma))
            join_tweets.append(''.join(lemmatized))
        return self.vectorizer.fit_transform([usertweets for usertweets in join_tweets])

    def transform(self, users):
        join_tweets = []
        for user in users:
            timeline = [''.join(remove_tweet_noise(tweet.text)) for tweet in user.twitter]
            lemmatized = []
            for tweet in timeline:
                lemma = [self.wordnet.lemmatize(word) for word in tweet.split()]
                lemmatized.append(' '.join(lemma))
            join_tweets.append(''.join(lemmatized))
        return self.vectorizer.transform([usertweets for usertweets in join_tweets])
Example 7: createCorpus
# Required import: from nltk.stem import WordNetLemmatizer (possibly under an alias)
# Method used: WordNetLemmatizer.lemmatize
def createCorpus(data, i, binaryX="False", stopWords=None, lemmatize="False", tfidf="False", useidf="True"):
    # vectorizes the bag of words (counts or tf-idf) and returns the required train/test arrays
    X_train = []
    X_test = []
    Y_train = []
    Y_test = []
    for key in data:
        if key in i:
            for filename in data[key]:
                text = data[key][filename][0]
                if lemmatize == "True":
                    port = WordNetLemmatizer()
                    text = " ".join([port.lemmatize(k, "v") for k in text.split()])
                X_test.append(text)
                Y_test.append(data[key][filename][1])
        else:
            for filename in data[key]:
                text = data[key][filename][0]
                if lemmatize == "True":
                    port = WordNetLemmatizer()
                    text = " ".join([port.lemmatize(k, "v") for k in text.split()])
                X_train.append(text)
                Y_train.append(data[key][filename][1])
    if tfidf == "False":
        vectorizer = CountVectorizer(min_df=1, binary=binaryX, stop_words=stopWords)
        X_train_ans = vectorizer.fit_transform(X_train)
        X_test_ans = vectorizer.transform(X_test)
        return X_train_ans, Y_train, X_test_ans, Y_test
    elif tfidf == "True":
        vectorizer = TfidfVectorizer(min_df=1, use_idf=useidf)
        X_train_ans = vectorizer.fit_transform(X_train)
        X_test_ans = vectorizer.transform(X_test)
        return X_train_ans, Y_train, X_test_ans, Y_test
Example 8: negator
# Required import: from nltk.stem import WordNetLemmatizer (possibly under an alias)
# Method used: WordNetLemmatizer.lemmatize
def negator(self, wordVec):
    negation = False
    negated_doc = []
    lemmatizer = WordNetLemmatizer()
    for w, p in wordVec:
        w_out = ""
        if p[:2] == "NN":
            w_out = lemmatizer.lemmatize(w.lower(), pos=wordnet.NOUN)
        elif p[:2] == "JJ":
            w_out = lemmatizer.lemmatize(w.lower(), pos=wordnet.ADJ)
        elif p[:2] == "VB":
            w_out = lemmatizer.lemmatize(w.lower(), pos=wordnet.VERB)
        elif p[:2] == "RB":
            w_out = lemmatizer.lemmatize(w.lower(), pos=wordnet.ADV)
        if w_out == "not" or w_out == "n't":
            #print "blah"
            negation = not negation
            #print negation
        elif w_out in string.punctuation and w_out != '':
            negation = False
        elif negation:
            #print negation
            w_out = "NOT_" + w_out
        negated_doc.append((w_out, p))
    #print negated_doc
    return negated_doc
Example 9: tokenize3
# Required import: from nltk.stem import WordNetLemmatizer (possibly under an alias)
# Method used: WordNetLemmatizer.lemmatize
def tokenize3(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    tokens = [wordnet_lemmatizer.lemmatize(token, NOUN) for token in tokens]
    tokens = [wordnet_lemmatizer.lemmatize(token, VERB) for token in tokens]
    tokens = [wordnet_lemmatizer.lemmatize(token, ADJ) for token in tokens]
    return tokens
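The NOUN, VERB, and ADJ constants in Example 9 (and in Example 14 below) are WordNet's part-of-speech labels, i.e. the strings 'n', 'v', and 'a'. One plausible set of imports for this helper, an assumption on my part since the example's own imports are not shown, would be:

# Hypothetical imports for tokenize3; the original module may import these differently.
from nltk import word_tokenize
from nltk.corpus.reader.wordnet import ADJ, NOUN, VERB  # ADJ = 'a', NOUN = 'n', VERB = 'v'
from nltk.stem import WordNetLemmatizer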
Example 10: pos_analysis
# Required import: from nltk.stem import WordNetLemmatizer (possibly under an alias)
# Method used: WordNetLemmatizer.lemmatize
def pos_analysis(tags, stoplist):
    wordnet_lemmatizer = WordNetLemmatizer()
    nouns = [wordnet_lemmatizer.lemmatize(word) for word, tag in tags if tag == 'NN']
    display_freq(nouns, 'Nouns', top=50)
    adjectives = [wordnet_lemmatizer.lemmatize(word) for word, tag in tags if tag == 'JJ']
    display_freq(adjectives, 'Adjectives', top=50)
    verbs = [wordnet_lemmatizer.lemmatize(word, pos='v') for word, tag in tags
             if tag[:2] in ('VB') and word not in stoplist]
    display_freq(verbs, 'Verbs', top=50)
Example 11: build_analyzer
# Required import: from nltk.stem import WordNetLemmatizer (possibly under an alias)
# Method used: WordNetLemmatizer.lemmatize
def build_analyzer(self):
    try:
        english_lemmatizer = WordNetLemmatizer()
        analyzer = super(ProcessCountVectorizer, self).build_analyzer()
        return lambda doc: (english_lemmatizer.lemmatize(english_lemmatizer.lemmatize(w, "v"), "n")
                            for w in analyzer(doc) if not w.endswith("ly") and len(w) > 4)
    except Warning:
        pass
Example 12: LemmaTokenizer
# Required import: from nltk.stem import WordNetLemmatizer (possibly under an alias)
# Method used: WordNetLemmatizer.lemmatize
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        doc = doc.lower()
        doc = re.sub("[^a-z]", " ", doc)  # replace punctuation with spaces
        # doc = re.sub("thanks", "thank", doc)
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if len(self.wnl.lemmatize(t)) > 2]
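A tokenizer class like Example 12's LemmaTokenizer is typically handed to scikit-learn's CountVectorizer or TfidfVectorizer through the tokenizer parameter. The minimal usage sketch below is my own; the sample corpus is invented, and it assumes the class above has been defined with re, word_tokenize, and WordNetLemmatizer imported.

# Usage sketch: plugging the LemmaTokenizer above into scikit-learn (sample corpus is my own).
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    "The cats were chasing mice across the gardens.",
    "A cat chases a mouse in the garden.",
]
# token_pattern=None avoids the "token_pattern is ignored" warning on newer scikit-learn versions.
vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None)
X = vectorizer.fit_transform(corpus)
print(sorted(vectorizer.vocabulary_))  # lemmatized vocabulary, e.g. 'cat', 'garden', 'mouse', ...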
Example 13: __init__
# Required import: from nltk.stem import WordNetLemmatizer (possibly under an alias)
# Method used: WordNetLemmatizer.lemmatize
def __init__(self, data, label=None, *args, **kwargs):
    lem = WordNetLemmatizer()
    if data and not label:
        # Data is assumed to be NLTK-style (word, tag) pairs.
        # If you'd like to collapse the tag set, this is the place.
        label = [re.sub(r'[{}]+'.format(punctuation), 'PUN', tag) for word, tag in data]  # e.g., tag[0]
        data = [re.sub(r'[{}]+'.format(punctuation), 'PUN', lem.lemmatize(word.lower())) for word, tag in data]
        data = [re.sub(r'[0-9]+', 'NUM', lem.lemmatize(word.lower())) for word in data]
    super(TaggedSentence, self).__init__(data, label, *args, **kwargs)
Example 14: tokenize4
# Required import: from nltk.stem import WordNetLemmatizer (possibly under an alias)
# Method used: WordNetLemmatizer.lemmatize
def tokenize4(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    wordset = set(words.words())
    tokens = [wordnet_lemmatizer.lemmatize(token, NOUN) for token in tokens]
    tokens = [wordnet_lemmatizer.lemmatize(token, VERB) for token in tokens]
    tokens = [wordnet_lemmatizer.lemmatize(token, ADJ) for token in tokens]
    tokens = [token for token in tokens if token in wordset]
    return tokens
Example 15: stopWordRemoval
# Required import: from nltk.stem import WordNetLemmatizer (possibly under an alias)
# Method used: WordNetLemmatizer.lemmatize
def stopWordRemoval():
    f = open('repos', 'r')
    strn = f.read()
    lst = strn.split('\n')
    i = 0
    while i < (len(lst) - 1):
        name = lst[i].split("/")
        dummyFile = 'filteredData/' + name[1] + '/dummy.txt'
        dr = os.path.dirname(dummyFile)
        if not os.path.exists(dr):
            os.makedirs(dr)
        ft = open('data/' + name[1] + '/title.txt')
        st = ft.read().lower()
        fd = open('data/' + name[1] + '/description.txt')
        sd = fd.read().lower()
        fc = open('data/' + name[1] + '/content.txt')
        sc = fc.read().lower()
        tokenizer = RegexpTokenizer(r'\w+')
        wordArrTitle = tokenizer.tokenize(st)
        wordArrDesc = tokenizer.tokenize(sd)
        wordArrData = tokenizer.tokenize(sc)
        filteredWordsTitle = [w for w in wordArrTitle if not w in stopwords.words('english')]
        filteredWordsDesc = [w for w in wordArrDesc if not w in stopwords.words('english')]
        filteredWordsData = [w for w in wordArrData if not w in stopwords.words('english')]
        wordnet_lem = WordNetLemmatizer()
        ftf = open('filteredData/' + name[1] + '/title.lst', 'w')
        for w in filteredWordsTitle:
            #print w
            ftf.write(wordnet_lem.lemmatize(w) + '\n')
        fdf = open('filteredData/' + name[1] + '/description.lst', 'w')
        for w in filteredWordsDesc:
            #print w
            fdf.write(wordnet_lem.lemmatize(w) + '\n')
        fcf = open('filteredData/' + name[1] + '/content.lst', 'w')
        for w in filteredWordsData:
            print(w + '\n')
            fcf.write(wordnet_lem.lemmatize(w) + '\n')
        i = i + 2