This page collects typical usage examples of the Python method nltk.WordNetLemmatizer. If you have been wondering what nltk.WordNetLemmatizer does, how to use it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples from the nltk module that this method belongs to.
Below are 13 code examples of nltk.WordNetLemmatizer, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
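For orientation, here is a minimal sketch of the lemmatizer's basic behavior, assuming the WordNet corpus has already been downloaded (for example via nltk.download('wordnet')):

import nltk

lemmatizer = nltk.WordNetLemmatizer()
print(lemmatizer.lemmatize('corpora'))       # 'corpus' (default POS is noun)
print(lemmatizer.lemmatize('running', 'v'))  # 'run'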
Example 1: preprocess_sentence
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def preprocess_sentence(sentence):
    lemmatizer = nltk.WordNetLemmatizer()
    # clearly list out our preprocessing pipeline
    processed_tokens = nltk.word_tokenize(sentence)
    processed_tokens = [w.lower() for w in processed_tokens]
    # find the least common tokens; most_common() is ordered by frequency,
    # so slicing from the end yields the 9 rarest words
    word_counts = collections.Counter(processed_tokens)
    uncommon_words = [word for word, _ in word_counts.most_common()[:-10:-1]]
    # remove stopwords and rare tokens
    processed_tokens = [w for w in processed_tokens if w not in stop_words]
    processed_tokens = [w for w in processed_tokens if w not in uncommon_words]
    # lemmatize
    processed_tokens = [lemmatizer.lemmatize(w) for w in processed_tokens]
    return processed_tokens
Developer: PacktPublishing, Project: Hands-on-NLP-with-NLTK-and-scikit-learn-, Lines: 16, Source: nlp-2-spam-classification.py
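A hypothetical call might look as follows. Note that stop_words is a module-level name the snippet assumes; here it is built from NLTK's English stopword list, which may differ from the original project:

import collections
import nltk

stop_words = set(nltk.corpus.stopwords.words('english'))  # assumed definition

tokens = preprocess_sentence("NLTK makes natural language preprocessing straightforward.")
print(tokens)  # lowercased, filtered, lemmatized tokens
# caveat: for short inputs the rare-word filter removes most tokens,
# since nearly every word occurs exactly once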
Example 2: transform
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def transform(self, input_: list) -> list:
    """
    Lemmatize a sequence of tokens.

    :param input_: list of tokens to be lemmatized.
    :return: list of lemmatized tokens.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos='v') for token in input_]
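Because the call hard-codes pos='v', only verb forms are normalized; other words pass through unchanged. A quick sketch of that behavior:

import nltk

lemmatizer = nltk.WordNetLemmatizer()
tokens = ['chasing', 'ran', 'mice']
print([lemmatizer.lemmatize(t, pos='v') for t in tokens])
# ['chase', 'run', 'mice'] -- 'mice' is a noun, so pos='v' leaves it alone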
Example 3: __init__
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def __init__(self, stopwords=None):
    self.stopwords = set(stopwords or nltk.corpus.stopwords.words('english'))
    self.lemmatizer = nltk.WordNetLemmatizer()
Example 4: lemmatize
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def lemmatize(tokens):
    """Lemmatize tokens, downloading the WordNet data on first use if needed."""
    wnl = nltk.WordNetLemmatizer()
    try:
        # WordNetLemmatizer loads its corpus lazily, so a missing corpus
        # raises LookupError on the first lemmatize() call, not in the
        # constructor.
        return [wnl.lemmatize(t) for t in tokens]
    except LookupError:
        nltk.download('wordnet')
        return [wnl.lemmatize(t) for t in tokens]
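Called with default settings (WordNet's noun POS), the helper behaves roughly like this:

print(lemmatize(['corpora', 'geese', 'rocks']))
# ['corpus', 'goose', 'rock']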
Example 5: cond2
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def cond2(w):
    """Return True if the word's lemmatised form is not in the wordlist."""
    wnl = WordNetLemmatizer()
    return wnl.lemmatize(w.lower()) not in wordlist
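A sketch under the assumption that wordlist is a module-level set of known lemmas (its construction is not shown in the snippet):

from nltk import WordNetLemmatizer

wordlist = {'cat', 'run'}  # assumed; the original builds this elsewhere

print(cond2('Cats'))  # False -- 'cats' lemmatizes to 'cat', which is listed
print(cond2('Dogs'))  # True  -- 'dog' is not in the wordlist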
Example 6: __init__
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def __init__(self):
    self.install_nltk_corpora('stopwords', 'wordnet', 'punkt')
    self.lemmatizer = nltk.WordNetLemmatizer()
    self.lemmatizer.lemmatize('')  # Force nltk's lazy corpus loader to do something.
    self.tokenizer = self.make_tokenizer()
    self.stopwords = nltk.corpus.stopwords.words('english')
    self.sent_tokenizer = None
Example 7: __init__
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def __init__(self):
    NltkNormalizer.install_nltk_corpora('averaged_perceptron_tagger')
    self.normalizer = NltkNormalizer()
    self.lem = nltk.WordNetLemmatizer()
    self.tagger = nltk.PerceptronTagger()
    # Map Penn Treebank tag prefixes to WordNet POS constants
    self.translation_dict = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB}
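The tagger and translation_dict set up here typically combine in a lemmatization step along these lines (a sketch; the method that uses them is not part of the snippet above):

import nltk
from nltk.corpus import wordnet as wn

lem = nltk.WordNetLemmatizer()
tagger = nltk.PerceptronTagger()
translation_dict = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB}

tagged = tagger.tag(['the', 'geese', 'were', 'flying'])
lemmas = [lem.lemmatize(token, translation_dict.get(tag[0], wn.NOUN))
          for token, tag in tagged]
print(lemmas)  # e.g. ['the', 'goose', 'be', 'fly']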
Example 8: lemmatize_term
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def lemmatize_term(term, pos=None):
    if pos is None:
        # Infer the part of speech from the term's first WordNet synset
        synsets = wordnet.synsets(term)
        if not synsets:
            return term
        pos = synsets[0].pos()
        if pos == wordnet.ADJ_SAT:
            pos = wordnet.ADJ
    return nltk.WordNetLemmatizer().lemmatize(term, pos=pos)
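For example (the inferred POS depends on WordNet's synset ordering):

print(lemmatize_term('better', pos='a'))  # 'good'
print(lemmatize_term('corpora'))          # 'corpus', POS inferred as noun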
Example 9: add_from_lemma_definitions
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def add_from_lemma_definitions(self, vocab, try_lower=False):
    """Add lemma definitions for non-lemmas.

    This code covers the following scenario: suppose a dictionary was crawled,
    but only for word lemmas.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    added = 0
    for word in vocab.words:
        word_list = [word, word.lower()] if try_lower else [word]
        for word_to_lemma in word_list:
            try:
                for part_of_speech in ['a', 's', 'r', 'n', 'v']:
                    lemma = lemmatizer.lemmatize(word_to_lemma, part_of_speech)
                    lemma_defs = self._data.get(lemma)
                    if lemma != word and lemma_defs:
                        # This can be quite slow. But this code will not be used
                        # very often.
                        for def_ in lemma_defs:
                            if def_ not in self._data[word]:
                                added += 1
                                self._data[word].append(def_)
            except Exception:
                logger.error("lemmatizer crashed on {}".format(word))
                logger.error(traceback.format_exc())
    logger.info("Added {} new defs in add_from_lemma_definitions".format(added))
    self.save()
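The letters looped over above are WordNet's own POS tags. A quick sketch of what each one does to a single word:

import nltk

lemmatizer = nltk.WordNetLemmatizer()
# WordNet POS tags: a=adjective, s=satellite adjective, r=adverb, n=noun, v=verb
for pos in ['a', 's', 'r', 'n', 'v']:
    print(pos, lemmatizer.lemmatize('lying', pos))
# 'lying' comes back unchanged for most tags but becomes 'lie' for pos='v'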
Example 10: crawl_lemmas
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def crawl_lemmas(self, vocab):
    """Add WordNet lemmas as definitions."""
    lemmatizer = nltk.WordNetLemmatizer()
    for word in vocab.words:
        definitions = []
        try:
            for part_of_speech in ['a', 's', 'r', 'n', 'v']:
                lemma = lemmatizer.lemmatize(word, part_of_speech)
                if lemma != word and [lemma] not in definitions:
                    definitions.append([lemma])
        except Exception:
            logger.error("lemmatizer crashed on {}".format(word))
        if definitions:
            self._data[word] = definitions
    self.save()
Example 11: Process
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
# Also requires: from nltk import word_tokenize
def Process(data):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]
Developer: PacktPublishing, Project: Mastering-Machine-Learning-for-Penetration-Testing, Lines: 5, Source: SpamDetection_NLTK.py
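A sketch of calling it; lemmatization uses the default noun POS, so verbs pass through unchanged:

from nltk import WordNetLemmatizer, word_tokenize

print(Process("The cats were chasing mice"))
# e.g. ['the', 'cat', 'were', 'chasing', 'mouse'] -- only noun forms change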
Example 12: __init__
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def __init__(self, require_unique_match, lemmatizer="word_net",
             empty_question_features=False, stop_words=None):
    self.lemmatizer = lemmatizer
    self.stop_words = stop_words
    self.empty_question_features = empty_question_features
    if lemmatizer == "word_net":
        self._lemmatizer = WordNetLemmatizer()
    else:
        raise ValueError(lemmatizer)
    self._cache = {}
    self.require_unique_match = require_unique_match
Example 13: __init__
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def __init__(self, lower: bool = True, stemmer="port"):
    self.lower = lower
    self.stemmer = stemmer
    if stemmer == "port":
        self._stemmer = PorterStemmer()
        self._stem = self._stemmer.stem
    elif stemmer == "wordnet":
        self._stemmer = WordNetLemmatizer()
        self._stem = self._stemmer.lemmatize
    else:
        raise ValueError(stemmer)
    # stemming is slow, so we cache words as we go
    self.normalize_cache = {}
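The two back-ends behave quite differently; a short comparison sketch:

from nltk import PorterStemmer, WordNetLemmatizer

print(PorterStemmer().stem('studies'))           # 'studi' -- rule-based suffix stripping
print(WordNetLemmatizer().lemmatize('studies'))  # 'study' -- dictionary-backed lemma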