This article collects typical usage examples of the nltk.stem.WordNetLemmatizer class in Python. If you are wondering what stem.WordNetLemmatizer is for, how to call it, or how it looks in real code, the curated examples below should help. You can also read further about its containing module, nltk.stem.
The 15 code examples of stem.WordNetLemmatizer shown below are sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
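Before the examples, a minimal sketch of what the lemmatizer does (assuming the WordNet data has been installed, e.g. via nltk.download('wordnet')): lemmatize() maps an inflected form to its dictionary form and, unless a WordNet POS tag is supplied, treats every word as a noun.

from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
print(wnl.lemmatize('cats'))          # 'cat'     (default POS is 'n', noun)
print(wnl.lemmatize('running'))       # 'running' (treated as a noun, unchanged)
print(wnl.lemmatize('running', 'v'))  # 'run'     (verb POS tag supplied)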
Example 1: clean_text
# Required module: from nltk import stem [as alias]
# Or: from nltk.stem import WordNetLemmatizer [as alias]
def clean_text(text):
    # stop_words = stopwords.words('english')
    stop_words = []
    stop_words.extend(['!', ',', '.', '?', '-s', '-ly', '</s> ', 's'])
    stemmer = WordNetLemmatizer()
    text = remove_short(text)  # remove_short and clean_str are project helpers defined elsewhere
    text = clean_str(text)
    text = word_tokenize(text)
    text = [word for word in text if word not in stop_words]
    text = [stemmer.lemmatize(word) for word in text]
    return ' '.join(text)
Example 2: real_word
# Required module: from nltk import stem [as alias]
# Or: from nltk.stem import WordNetLemmatizer [as alias]
def real_word(self, word, LEMMATIZATION_flag=True):
    '''
    Recover the canonical form of a word.
    '''
    # Keep letters and apostrophe-like characters. (The original pattern,
    # '[a-z,A-Z,\',‘]', used commas as separators inside the character class,
    # which made literal commas match as well.)
    p_forword = re.compile(r"[a-zA-Z'‘]")
    word_s = p_forword.findall(word)
    real_word = ''.join(word_s)  # .lower()
    if LEMMATIZATION_flag and self.config['LEMMATIZATION_MODE'] in ['list', 'both']:
        try:
            real_word = self.fix_dic[real_word]  # dictionary-based fix-up list
        except KeyError as e:
            logger.debug(e)
    if LEMMATIZATION_flag and self.config['LEMMATIZATION_MODE'] in ['NLTK', 'both']:
        wordnet_lemmatizer = WordNetLemmatizer()
        real_word = wordnet_lemmatizer.lemmatize(real_word)
    logger.debug(word + '-->' + real_word)
    return real_word
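A side note on this example (Examples 8, 13, and 14 do the same): constructing a fresh WordNetLemmatizer on every call works, because the object is essentially stateless, but the usual pattern is to build one instance and reuse it. A sketch of that alternative, not taken from the original project:

from nltk.stem import WordNetLemmatizer

_WNL = WordNetLemmatizer()  # built once at module level, reused everywhere

def lemmatize_word(word):
    # the WordNet data itself is loaded lazily on the first lemmatize() call
    return _WNL.lemmatize(word)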
Example 3: __init__
# Required module: from nltk import stem [as alias]
# Or: from nltk.stem import WordNetLemmatizer [as alias]
def __init__(self):
    super().__init__()
    self.entity_dict = {}
    self.abbr_dict = {}
    self.wn = WordNetLemmatizer()
    self.tokenized_data_path = './data/kvret/'
    self._construct(cfg.train, cfg.dev, cfg.test, cfg.entity)
    # self.test = self.train
Example 4: __init__
# Required module: from nltk import stem [as alias]
# Or: from nltk.stem import WordNetLemmatizer [as alias]
def __init__(self):
    super().__init__()
    self.entity_dict = {}
    self.abbr_dict = {}
    self.wn = WordNetLemmatizer()
    self.db = {}
    self.tokenized_data_path = './data/kvret/'
    self._construct(cfg.train, cfg.dev, cfg.test, cfg.entity)
Example 5: __lemmatize
# Required module: from nltk import stem [as alias]
# Or: from nltk.stem import WordNetLemmatizer [as alias]
def __lemmatize(self, lemma):
    """
    Internal method that applies nltk.stem.WordNetLemmatizer() to the
    (word, pos) pair `lemma`.
    """
    string, tag = lemma
    if tag in ('a', 'n', 'r', 'v'):
        wnl = WordNetLemmatizer()
        string = wnl.lemmatize(string, tag)
    return (string, tag)
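The single-letter tags checked in Example 5 are WordNet's own POS codes. nltk.corpus.wordnet exposes them as constants, so the membership test can avoid magic strings; a small stylistic sketch, not part of the original source (accessing the constants loads the WordNet corpus reader, so the wordnet data must be installed):

from nltk.corpus import wordnet as wn

# wn.ADJ == 'a', wn.NOUN == 'n', wn.ADV == 'r', wn.VERB == 'v'
WORDNET_POS = (wn.ADJ, wn.NOUN, wn.ADV, wn.VERB)
print('n' in WORDNET_POS)  # True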
Example 6: __init__
# Required module: from nltk import stem [as alias]
# Or: from nltk.stem import WordNetLemmatizer [as alias]
def __init__(self):
    self.wnl = WordNetLemmatizer()
Example 7: __init__
# Required module: from nltk import stem [as alias]
# Or: from nltk.stem import WordNetLemmatizer [as alias]
def __init__(self):
    self.wnl = WordNetLemmatizer()
Example 8: __wn_lemmatize
# Required module: from nltk import stem [as alias]
# Or: from nltk.stem import WordNetLemmatizer [as alias]
def __wn_lemmatize(self, lemma):
    """
    Lemmatize `lemma` using nltk.stem.WordNetLemmatizer(). Always returns a
    (string, pos) pair. Lemmatizes even when the tag isn't a WordNet POS tag,
    by ignoring the tag in that case.
    """
    string, tag = lemma
    wnl = WordNetLemmatizer()
    if tag in ('a', 'n', 'r', 'v'):
        string = wnl.lemmatize(string, tag)
    else:
        string = wnl.lemmatize(string)
    return (string, tag)
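Why passing the tag matters: without one, lemmatize() assumes a noun, which leaves many verbs and adjectives untouched. A quick illustration (assuming the WordNet data is installed):

from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
print(wnl.lemmatize('better'))       # 'better' -- treated as a noun
print(wnl.lemmatize('better', 'a'))  # 'good'   -- adjective lemma
print(wnl.lemmatize('ate', 'v'))     # 'eat'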
Example 9: __init__
# Required module: from nltk import stem [as alias]
# Or: from nltk.stem import WordNetLemmatizer [as alias]
def __init__(self):
    """
    Initialize members:
    question_dist - generalized-question distribution of the assigned
                    extraction location.
    """
    self.question_dist = defaultdict(lambda: defaultdict(lambda: 0))
    self.lmtzr = WordNetLemmatizer()
Example 10: preprocessing
# Required module: from nltk import stem [as alias]
# Or: from nltk.stem import WordNetLemmatizer [as alias]
def preprocessing(text):
    # replace punctuation with spaces and collapse repeated whitespace
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
    tokens = [word for sent in nltk.sent_tokenize(text2)
              for word in nltk.word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    tokens = [word for word in tokens if len(word) >= 3]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    tagged_corpus = pos_tag(tokens)
    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            # everything else falls back to the noun POS
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])
    return pre_proc_text
Author: PacktPublishing, Project: Natural-Language-Processing-with-Python-Cookbook, Lines: 36, Source file: 9.5 Skipgram_Keras.py
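prat_lemmatize maps Penn Treebank tags to a WordNet POS by list membership. A more general variant (a sketch, not from the cookbook) keys off the first letter of the tag, which also covers adjectives and adverbs:

from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

# Penn Treebank tag prefix -> WordNet POS; anything else falls back to noun
_TAG_MAP = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB}
_wnl = WordNetLemmatizer()

def lemmatize_tagged(token, treebank_tag):
    return _wnl.lemmatize(token, _TAG_MAP.get(treebank_tag[:1], wn.NOUN))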
Example 11: preprocessing
# Required module: from nltk import stem [as alias]
# Or: from nltk.stem import WordNetLemmatizer [as alias]
(Identical to Example 10 except that the stemming step is wrapped in a try/except.)
def preprocessing(text):
    # replace punctuation with spaces and collapse repeated whitespace
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
    tokens = [word for sent in nltk.sent_tokenize(text2)
              for word in nltk.word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    tokens = [word for word in tokens if len(word) >= 3]
    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]
    except Exception:
        # keep the unstemmed tokens if stemming fails (the original used a bare except)
        pass
    tagged_corpus = pos_tag(tokens)
    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            # everything else falls back to the noun POS
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])
    return pre_proc_text
Author: PacktPublishing, Project: Natural-Language-Processing-with-Python-Cookbook, Lines: 40, Source file: 9.2 Email_Classification.py
Example 12: stem_corpus
# Required module: from nltk import stem [as alias]
# Or: from nltk.stem import WordNetLemmatizer [as alias]
def stem_corpus():
    stemmer = WordNetLemmatizer()
    with open('data/mr/text_train.txt') as f:
        raw_text = f.read()
    with open('data/mr/label_train.txt') as f:
        raw_labels = f.read()
    labels = []
    for raw_label in raw_labels.split('\n'):
        if raw_label == '1':
            labels.append('pos')
        elif raw_label == '0':
            labels.append('neg')
        else:
            if len(raw_label) == 0:
                continue  # skip blank lines
            raise ValueError(raw_label)
    corpus = raw_text.split('\n')
    corpus = [clean_str(doc) for doc in corpus]    # clean_str and remove_short are project helpers
    corpus = [remove_short(doc) for doc in corpus]
    tokenized_corpus = [word_tokenize(doc) for doc in corpus]
    results = []
    for line in tokenized_corpus:
        results.append(' '.join([stemmer.lemmatize(word) for word in line]))
    results = list(zip(labels, results))
    results = ['\t'.join(line) for line in results]
    random.shuffle(results)
    with open('data/mr/mr-train-stemmed.txt', 'w') as f:
        f.write('\n'.join(results))
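The function writes one shuffled 'label<TAB>text' record per line. A quick read-back of the produced file, as a usage sketch against the path used above:

with open('data/mr/mr-train-stemmed.txt') as f:
    for line in f:
        label, text = line.rstrip('\n').split('\t', 1)
        # label is 'pos' or 'neg'; text is the lemmatized document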
Example 13: _phrase_stem
# Required module: from nltk import stem [as alias]
# Or: from nltk.stem import WordNetLemmatizer [as alias]
def _phrase_stem(cls, phrase):
    wnl = WordNetLemmatizer()
    l_term = phrase.split()
    l_term = [wnl.lemmatize(term, 'n') for term in l_term]
    return ' '.join(l_term)
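Since every term is lemmatized with the noun POS, this mostly normalizes plurals inside a phrase. A standalone illustration of the same idea:

from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
phrase = 'information retrieval systems'
print(' '.join(wnl.lemmatize(t, 'n') for t in phrase.split()))
# -> 'information retrieval system'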
Example 14: extract_experience
# Required module: from nltk import stem [as alias]
# Or: from nltk.stem import WordNetLemmatizer [as alias]
def extract_experience(resume_text):
    '''
    Helper function to extract experience from resume text.
    :param resume_text: plain resume text
    :return: list of experience strings
    '''
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    # word tokenization
    word_tokens = nltk.word_tokenize(resume_text)
    # remove stop words, and also drop words whose lemma is a stop word
    filtered_sentence = [w for w in word_tokens
                         if w not in stop_words
                         and wordnet_lemmatizer.lemmatize(w) not in stop_words]
    sent = nltk.pos_tag(filtered_sentence)
    # chunk maximal runs of proper nouns
    cp = nltk.RegexpParser('P: {<NNP>+}')
    cs = cp.parse(sent)
    # for i in cs.subtrees(filter=lambda x: x.label() == 'P'):
    #     print(i)
    test = []
    for vp in list(cs.subtrees(filter=lambda x: x.label() == 'P')):
        test.append(" ".join([i[0] for i in vp.leaves() if len(vp.leaves()) >= 2]))
    # search each chunk for the word 'experience' and keep the text after it
    # (10 == len('experience'))
    x = [x[x.lower().index('experience') + 10:]
         for i, x in enumerate(test) if x and 'experience' in x.lower()]
    return x
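The grammar 'P: {<NNP>+}' chunks consecutive proper nouns into a single phrase. A tiny standalone demonstration of the chunker (hypothetical tagged input):

import nltk

sent = [('Work', 'NNP'), ('Experience', 'NNP'), ('at', 'IN'), ('Google', 'NNP')]
cp = nltk.RegexpParser('P: {<NNP>+}')
print(cp.parse(sent))
# (S (P Work/NNP Experience/NNP) at/IN (P Google/NNP))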
Example 15: __init__
# Required module: from nltk import stem [as alias]
# Or: from nltk.stem import WordNetLemmatizer [as alias]
def __init__(self, stemmer='porter', tokenize=True, case_sensitive=False,
             *args, **kwargs):
    if isinstance(stemmer, str):
        if stemmer not in self._stemmers:
            valid = list(self._stemmers.keys())
            raise ValueError("Invalid stemmer '%s'; please use one of %s."
                             % (stemmer, valid))
        stemmer = getattr(stem, self._stemmers[stemmer])(*args, **kwargs)
    elif not isinstance(stemmer, (stem.StemmerI, stem.WordNetLemmatizer)):
        raise ValueError("stemmer must be either a valid string, or an "
                         "instance of class StemmerI.")
    self.stemmer = stemmer
    self.tokenize = tokenize
    self.case_sensitive = case_sensitive
    super().__init__()
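The isinstance check names WordNetLemmatizer alongside StemmerI because, in NLTK, WordNetLemmatizer is a standalone class that does not implement the StemmerI interface; testing only StemmerI would reject it. A quick confirmation (behavior could differ across NLTK versions):

from nltk import stem

wnl = stem.WordNetLemmatizer()
print(isinstance(wnl, stem.StemmerI))  # False in current NLTK releases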