This article collects typical usage examples of the nltk.WordNetLemmatizer method in Python. If you have been wondering what nltk.WordNetLemmatizer is for, or how to use it, the curated code examples below may help. You can also explore further usage examples of nltk, the module this method belongs to.
The following presents 13 code examples of nltk.WordNetLemmatizer, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
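Before the examples, a minimal sketch of what the method does. It needs the wordnet corpus, fetched once via nltk.download('wordnet'); the outputs shown are the standard WordNet lemmas:

import nltk

lemmatizer = nltk.WordNetLemmatizer()
print(lemmatizer.lemmatize('cats'))     # 'cat'
print(lemmatizer.lemmatize('corpora'))  # 'corpus'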
Example 1: preprocess_sentence
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
import collections  # needed for collections.Counter below

# `stop_words` is a module-level name in the original file; NLTK's English
# stopword list is assumed here.
stop_words = nltk.corpus.stopwords.words('english')

def preprocess_sentence(sentence):
    lemmatizer = nltk.WordNetLemmatizer()
    # clearly list out our preprocessing pipeline
    processed_tokens = nltk.word_tokenize(sentence)
    processed_tokens = [w.lower() for w in processed_tokens]
    # find the least common tokens; most_common() yields (word, count)
    # pairs, so keep only the words
    word_counts = collections.Counter(processed_tokens)
    uncommon_words = [w for w, _ in word_counts.most_common()[:-10:-1]]
    # remove stopwords and uncommon tokens
    processed_tokens = [w for w in processed_tokens if w not in stop_words]
    processed_tokens = [w for w in processed_tokens if w not in uncommon_words]
    # lemmatize
    processed_tokens = [lemmatizer.lemmatize(w) for w in processed_tokens]
    return processed_tokens
Developer: PacktPublishing, Project: Hands-on-NLP-with-NLTK-and-scikit-learn-, Lines of code: 16, Source file: nlp-2-spam-classification.py
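A minimal usage sketch, assuming the punkt, stopwords, and wordnet NLTK data packages are downloaded. Note that the function drops the nine rarest tokens, so it is meant for document-length input rather than single short sentences:

text = "..."  # placeholder for a long document string
print(preprocess_sentence(text))  # lowercased, stopword-filtered, lemmatized tokens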
Example 2: transform
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def transform(self, input_: list) -> list:
    """
    Lemmatize a sequence of tokens.

    :param input_: list of tokens to be lemmatized.
    :return: list of lemmatized tokens.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos='v') for token in input_]
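Because pos='v' is passed, every token is lemmatized as a verb, which maps inflected verb forms to their base form. A standalone sketch of the difference:

import nltk

lemmatizer = nltk.WordNetLemmatizer()
print(lemmatizer.lemmatize('running'))           # 'running' (default POS is noun)
print(lemmatizer.lemmatize('running', pos='v'))  # 'run'
print(lemmatizer.lemmatize('was', pos='v'))      # 'be'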
Example 3: __init__
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def __init__(self, stopwords=None):
    self.stopwords = set(stopwords or nltk.corpus.stopwords.words('english'))
    self.lemmatizer = nltk.WordNetLemmatizer()
Example 4: lemmatize
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def lemmatize(tokens):
    """Lemmatize a list of tokens, downloading the wordnet data if missing."""
    wnl = nltk.WordNetLemmatizer()
    try:
        # The wordnet corpus is loaded lazily, so a missing download
        # surfaces as a LookupError on the first lemmatize() call,
        # not when the lemmatizer is constructed.
        return [wnl.lemmatize(t) for t in tokens]
    except LookupError:
        nltk.download('wordnet')
        return [wnl.lemmatize(t) for t in tokens]
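Usage is then a one-liner; the first call may trigger the wordnet download:

print(lemmatize(['cats', 'geese', 'corpora']))  # ['cat', 'goose', 'corpus']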
Example 5: cond2
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def cond2(w):
    """Return True if the lemmatized, lowercased form of `w` is not in
    the module-level `wordlist`."""
    wnl = WordNetLemmatizer()
    return wnl.lemmatize(w.lower()) not in wordlist
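A quick sketch of how cond2 behaves; `wordlist` is a free variable in the original, so a stand-in set is assumed here:

from nltk import WordNetLemmatizer

wordlist = {'cat', 'dog', 'mouse'}  # assumed stand-in for the original wordlist

print(cond2('Cats'))    # False: 'cats' lemmatizes to 'cat', which is in wordlist
print(cond2('Zebras'))  # True: 'zebra' is not in wordlist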
Example 6: __init__
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def __init__(self):
    self.install_nltk_corpora('stopwords', 'wordnet', 'punkt')
    self.lemmatizer = nltk.WordNetLemmatizer()
    self.lemmatizer.lemmatize('')  # Force nltk's lazy corpus loader to do something.
    self.tokenizer = self.make_tokenizer()
    self.stopwords = nltk.corpus.stopwords.words('english')
    self.sent_tokenizer = None
Example 7: __init__
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def __init__(self):
    NltkNormalizer.install_nltk_corpora('averaged_perceptron_tagger')
    self.normalizer = NltkNormalizer()
    self.lem = nltk.WordNetLemmatizer()
    self.tagger = nltk.PerceptronTagger()
    self.translation_dict = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB}
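Here translation_dict maps the first letter of a Penn Treebank tag (as produced by the perceptron tagger) to the matching WordNet POS constant, so each token can be lemmatized with the right part of speech. A standalone sketch of that idea; the pos_lemmatize helper is hypothetical, not part of the class above:

import nltk
from nltk.corpus import wordnet as wn

def pos_lemmatize(tokens):
    # Hypothetical helper: tag each token, translate the tag's first
    # letter to a WordNet POS, and lemmatize accordingly.
    lem = nltk.WordNetLemmatizer()
    translation = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB}
    return [lem.lemmatize(tok, translation.get(tag[0], wn.NOUN))
            for tok, tag in nltk.pos_tag(tokens)]

print(pos_lemmatize(['The', 'cats', 'were', 'running']))
# ['The', 'cat', 'be', 'run']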
Example 8: lemmatize_term
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def lemmatize_term(term, pos=None):
    if pos is None:
        # Infer the part of speech from the first WordNet synset.
        synsets = wordnet.synsets(term)
        if not synsets:
            return term
        pos = synsets[0].pos()
        if pos == wordnet.ADJ_SAT:
            # The lemmatizer does not accept WordNet's "satellite
            # adjective" tag, so fall back to the plain adjective tag.
            pos = wordnet.ADJ
    return nltk.WordNetLemmatizer().lemmatize(term, pos=pos)
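A usage sketch; the snippet assumes `from nltk.corpus import wordnet`, given its references to wordnet.ADJ and friends:

print(lemmatize_term('dogs'))          # 'dog' (first synset is a noun)
print(lemmatize_term('ran', pos='v'))  # 'run'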
Example 9: add_from_lemma_definitions
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def add_from_lemma_definitions(self, vocab, try_lower=False):
    """Add lemma definitions for non-lemmas.

    This code covers the following scenario: suppose a dictionary was
    crawled, but only for word lemmas.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    added = 0
    for word in vocab.words:
        word_list = [word, word.lower()] if try_lower else [word]
        for word_to_lemma in word_list:
            try:
                # 'a'=adjective, 's'=satellite adjective, 'r'=adverb,
                # 'n'=noun, 'v'=verb
                for part_of_speech in ['a', 's', 'r', 'n', 'v']:
                    lemma = lemmatizer.lemmatize(word_to_lemma, part_of_speech)
                    lemma_defs = self._data.get(lemma)
                    if lemma != word and lemma_defs:
                        # This can be quite slow. But this code will not
                        # be used very often.
                        for def_ in lemma_defs:
                            if def_ not in self._data[word]:
                                added += 1
                                self._data[word].append(def_)
            except Exception:
                logger.error("lemmatizer crashed on {}".format(word))
                logger.error(traceback.format_exc())
    logger.info("Added {} new defs in add_from_lemma_definitions".format(added))
    self.save()
Example 10: crawl_lemmas
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def crawl_lemmas(self, vocab):
    """Add WordNet lemmas as definitions."""
    lemmatizer = nltk.WordNetLemmatizer()
    for word in vocab.words:
        definitions = []
        try:
            for part_of_speech in ['a', 's', 'r', 'n', 'v']:
                lemma = lemmatizer.lemmatize(word, part_of_speech)
                if lemma != word and [lemma] not in definitions:
                    definitions.append([lemma])
        except Exception:
            logger.error("lemmatizer crashed on {}".format(word))
        if definitions:
            self._data[word] = definitions
    self.save()
Example 11: Process
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def Process(data):
    lemmatizer = WordNetLemmatizer()
    # The original (Python 2) code tokenized unicode(sentence, errors='ignore'),
    # but `sentence` was an undefined name; the parameter `data` is used here,
    # decoded first in case raw bytes are passed in.
    text = data.decode('utf-8', errors='ignore') if isinstance(data, bytes) else data
    return [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(text)]
Developer: PacktPublishing, Project: Mastering-Machine-Learning-for-Penetration-Testing, Lines of code: 5, Source file: SpamDetection_NLTK.py
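A quick call, assuming `from nltk import WordNetLemmatizer` and `from nltk.tokenize import word_tokenize` as in the project source:

print(Process(b"Congratulations, you have won three prizes!"))
# tokens come back lowercased and lemmatized, e.g. 'prizes' -> 'prize'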
Example 12: __init__
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def __init__(self, require_unique_match, lemmatizer="word_net",
             empty_question_features=False, stop_words=None):
    self.lemmatizer = lemmatizer
    self.stop_words = stop_words
    self.empty_question_features = empty_question_features
    if lemmatizer == "word_net":
        self._lemmatizer = WordNetLemmatizer()
    else:
        raise ValueError()
    self._cache = {}
    self.require_unique_match = require_unique_match
Example 13: __init__
# Required import: import nltk [as alias]
# Or: from nltk import WordNetLemmatizer [as alias]
def __init__(self, lower: bool = True, stemmer="port"):
    self.lower = lower
    self.stemmer = stemmer
    if stemmer == "port":
        self._stemmer = PorterStemmer()
        self._stem = self._stemmer.stem
    elif stemmer == "wordnet":
        self._stemmer = WordNetLemmatizer()
        self._stem = self._stemmer.lemmatize
    else:
        raise ValueError(stemmer)
    # stemming is slow, so we cache words as we go
    self.normalize_cache = {}
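The class body that uses this cache is not shown in the excerpt; a hypothetical normalize method sketches the intended lookup pattern:

def normalize(self, token):
    # Hypothetical method: lowercase if configured, then stem/lemmatize
    # through the cache so each distinct word is processed only once.
    if self.lower:
        token = token.lower()
    if token not in self.normalize_cache:
        self.normalize_cache[token] = self._stem(token)
    return self.normalize_cache[token]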