This article collects typical usage examples of the nltk.stem method in Python. If you are wondering how exactly nltk.stem is used, or looking for concrete examples of it in practice, the curated code samples below may help. You can also explore further usage examples from the containing module, nltk.
Below are 14 code examples of the nltk.stem method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system surface better Python code samples.
Example 1: _nltkStemmer
# Required import: import nltk [as alias]
# Or: from nltk import stem [as alias]
def _nltkStemmer(self, name):
    """ NLTK Stemmer """
    if name == 'porter':
        stemmer = PorterStemmer()
    elif name == 'snowball':
        stemmer = SnowballStemmer("english")
    elif name == "lancaster":
        stemmer = LancasterStemmer()
    else:
        return

    length = len(self._words)
    for i in range(length):
        word = self._words[i]['word']
        l = len(word)
        # Don't stem short words or words already categorized
        if l < 4 or self._words[i]['tag'] != Vocabulary.UNTAG:
            continue
        self._words[i]['word'] = stemmer.stem(self._words[i]['word'])
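As a hedged illustration (not part of the original snippet): this method lives on a class whose self._words is a list of {'word', 'tag'} dicts and which references a Vocabulary.UNTAG constant. A minimal stand-in that exercises it might look like this, where every name apart from the NLTK stemmers is an assumption:

from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

class Vocabulary:                      # stand-in for the constant used above
    UNTAG = ''

class Words:                           # hypothetical host class
    def __init__(self, tokens):
        self._words = [{'word': t, 'tag': Vocabulary.UNTAG} for t in tokens]

Words._nltkStemmer = _nltkStemmer      # attach the function above as a method

w = Words(['running', 'jumps', 'cat'])
w._nltkStemmer('porter')
print([d['word'] for d in w._words])   # ['run', 'jump', 'cat']; 'cat' is under 4 characters, so left alone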
Example 2: wrap_words
# Required import: import nltk [as alias]
# Or: from nltk import stem [as alias]
def wrap_words (pair):
    """wrap each (word, tag) pair as an object with fully indexed metadata"""
    global STEMMER
    index = pair[0]
    result = []

    for word, tag in pair[1]:
        word = word.lower()
        stem = STEMMER.stem(word)

        if stem == "":
            stem = word

        keep = tag in ('JJ', 'NN', 'NNS', 'NNP',)
        result.append({ "id": 0, "index": index, "stem": stem, "word": word, "tag": tag, "keep": keep })
        index += 1

    return result
######################################################################
## build a graph from raw text
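A rough usage sketch for wrap_words, assuming STEMMER is initialized elsewhere (here with NLTK's PorterStemmer, which is an assumption) and that each input pair is (sentence_index, [(word, tag), ...]):

from nltk.stem import PorterStemmer

STEMMER = PorterStemmer()   # assumed module-level setup

pair = (0, [('Red', 'JJ'), ('cats', 'NNS'), ('run', 'VBP')])
for token in wrap_words(pair):
    print(token)
# e.g. {'id': 0, 'index': 0, 'stem': 'red', 'word': 'red', 'tag': 'JJ', 'keep': True}, then 'cats'/'cat' with keep=True,
# and 'run' with keep=False because 'VBP' is not in the kept tag set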
Example 3: tagFilterAndStemming
# Required import: import nltk [as alias]
# Or: from nltk import stem [as alias]
def tagFilterAndStemming(originalTag):
    # Remove non-alphanumeric characters and split on spaces
    processedTag = re.sub("[^a-zA-Z0-9]", " ", originalTag)
    processedTag = re.sub(" +", " ", processedTag)
    processedTag = processedTag.split(" ")

    stopwords_set = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    result = []
    for tag in processedTag:
        tag_stemmed = stemmer.stem(tag)
        if tag_stemmed not in stopwords_set:
            result.append(tag_stemmed)
    return result
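tagFilterAndStemming relies on re, NLTK's stopword list and PorterStemmer being imported at module level; a hedged setup and call could be:

import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

print(tagFilterAndStemming("Running Shoes & Bags!"))
# roughly ['run', 'shoe', 'bag', '']; the trailing empty string comes from the final split on the space left by '!'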
Example 4: build_analyzer
# Required import: import nltk [as alias]
# Or: from nltk import stem [as alias]
def build_analyzer(self):
    analyzer = super(TfidfVectorizer, self).build_analyzer()
    return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
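build_analyzer is normally defined inside a TfidfVectorizer subclass, with english_stemmer created at module level. A commonly seen arrangement, sketched here under those assumptions rather than taken from this snippet:

from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

english_stemmer = SnowballStemmer('english')       # assumed module-level object

class StemmedTfidfVectorizer(TfidfVectorizer):      # hypothetical subclass name
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

vect = StemmedTfidfVectorizer(min_df=1)
vect.fit_transform(["runners like running", "the run was long"])
# the fitted vocabulary now holds stems such as 'runner' and 'run' instead of the surface forms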
Example 5: preprocessing
# Required import: import nltk [as alias]
# Or: from nltk import stem [as alias]
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])
    return pre_proc_text
Developer: PacktPublishing, Project: Natural-Language-Processing-with-Python-Cookbook, Lines of code: 36, Source file: 9.5 Skipgram_Keras.py
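The imports this preprocessing function depends on are not shown; a plausible setup matching the names used above (an assumption) plus a call would be:

import string
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

print(preprocessing("The cats were running quickly through the gardens!"))
# roughly 'cat run quickli garden': stop words and short words dropped, tokens stemmed, then lemmatized by POS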
Example 6: preprocessing
# Required import: import nltk [as alias]
# Or: from nltk import stem [as alias]
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]
    except:
        tokens = tokens

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])
    return pre_proc_text
Developer: PacktPublishing, Project: Natural-Language-Processing-with-Python-Cookbook, Lines of code: 40, Source file: 9.2 Email_Classification.py
Example 7: stemmer
# Required import: import nltk [as alias]
# Or: from nltk import stem [as alias]
def stemmer(method, data):
    """
    Takes an array of words in JSON format.
    """
    data = parse_input(data)
    if data == False:
        return ret_failure(703)
    else:
        res = []
        if method == "lancaster":
            for word in data:
                try:
                    res.append([word, LancasterSt.stem(word)])
                except:
                    return ret_failure(702)
        elif method == "porter":
            for word in data:
                try:
                    res.append([word, PorterSt.stem(word)])
                except:
                    return ret_failure(702)
        elif method == 'snowball':
            for word in data:
                try:
                    res.append([word, SnowballSt.stem(word)])
                except:
                    return ret_failure(702)
        else:
            abort(404)
        return ret_success(res)
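LancasterSt, PorterSt and SnowballSt are module-level stemmer instances, and parse_input / ret_failure / ret_success / abort are helpers from the surrounding web service; none of them appear in the snippet. Purely as an assumed sketch of the stemmer setup:

from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

LancasterSt = LancasterStemmer()        # assumed module-level instances
PorterSt = PorterStemmer()
SnowballSt = SnowballStemmer("english")

# With stub helpers (parse_input returning the decoded list and ret_success returning it unchanged),
# stemmer("porter", '["running", "flies"]') would yield pairs like [["running", "run"], ["flies", "fli"]].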
Example 8: data_preparation
# Required import: import nltk [as alias]
# Or: from nltk import stem [as alias]
def data_preparation(tweet):
    # nltk.tag._POS_TAGGER; Treebank tag set: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    url_regex = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)'
    clean = re.sub(url_regex, '', tweet, flags=re.MULTILINE)  # strip out URLs
    clean = clean.replace('\n', ' ').replace("'", " ").replace('"', ' ')
    try:
        clean = clean.decode("utf-8-sig").replace(u"\ufffd", "?")  # strip out byte order marks
        print("Detected BOM")
    except:
        pass
    clean = re.sub(r'[^a-zA-Z ]', '', clean, flags=re.MULTILINE)  # keep only letters and spaces
    tokens = splitter.split(clean)  # tokenization
    lemma_pos_token = lemmatization_using_pos_tagger.pos_tag(tokens)  # part-of-speech tagging
    out = ' '.join([out[1] for out in lemma_pos_token[0]])
    return out

''' # https://pypi.org/project/hunspell/  Double tokenizing: hunspell for units, nltk for context.
import hunspell
hobj = hunspell.HunSpell('/usr/share/myspell/en_US.dic', '/usr/share/myspell/en_US.aff')
hobj.spell('spookie')
hobj.suggest('spookie')
hobj.spell('spooky')
hobj.analyze('linked')
hobj.stem('linked')
'''
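splitter and lemmatization_using_pos_tagger are objects built elsewhere in that project and are not defined here. The stand-ins below are assumptions, written only to make the pipeline runnable with plain NLTK pieces:

import re
import nltk
from nltk.stem import WordNetLemmatizer

class Splitter:                              # assumed stand-in: text -> list of token lists, one per sentence
    def split(self, text):
        return [nltk.word_tokenize(s) for s in nltk.sent_tokenize(text)]

class LemmatizationWithPOSTagger:            # assumed stand-in: (word, lemma, tag) triples per sentence
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    def pos_tag(self, sentences):
        return [[(w, self.lemmatizer.lemmatize(w.lower()), t)
                 for w, t in nltk.pos_tag(sent)] for sent in sentences]

splitter = Splitter()
lemmatization_using_pos_tagger = LemmatizationWithPOSTagger()

print(data_preparation("Check https://example.com for spooky linked cats"))
# roughly 'check for spooky linked cat' (the lemmas of the first sentence, URL stripped)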
Example 9: tokenize_and_stem
# Required import: import nltk [as alias]
# Or: from nltk import stem [as alias]
def tokenize_and_stem(text):
    stemmer = SnowballStemmer("english")
    text = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", text)
    # first tokenize by sentence, then by word, to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
################################################################################
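A short assumed usage example; the function needs re, nltk and SnowballStemmer importable:

import re
import nltk
from nltk.stem.snowball import SnowballStemmer

print(tokenize_and_stem("3 runners were running happily."))
# roughly ['runner', 'were', 'run', 'happili']; the leading digit and the final '.' are filtered out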
Example 10: stem_tokens
# Required import: import nltk [as alias]
# Or: from nltk import stem [as alias]
def stem_tokens(tokens, stemmer):
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed
#process the data
Example 11: process_data
# Required import: import nltk [as alias]
# Or: from nltk import stem [as alias]
def process_data(data, exclude_stopword=True, stem=True):
    tokens = [w.lower() for w in data]
    tokens_stemmed = tokens
    tokens_stemmed = stem_tokens(tokens, eng_stemmer)
    tokens_stemmed = [w for w in tokens_stemmed if w not in stopwords]
    return tokens_stemmed
#creating ngrams
#unigram
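Both helpers above expect module-level eng_stemmer and stopwords objects; a plausible (assumed) setup and call:

from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem.porter import PorterStemmer

eng_stemmer = PorterStemmer()                       # assumed names, matching the snippet
stopwords = set(nltk_stopwords.words('english'))    # here 'stopwords' is a plain set of words

print(process_data(["The", "runners", "are", "running"]))
# roughly ['runner', 'run']: lowercased, stemmed, stop words dropped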
Example 12: tokenizer_porter
# Required import: import nltk [as alias]
# Or: from nltk import stem [as alias]
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
#doc = ['runners like running and thus they run','this is a test for tokens']
#tokenizer([word for line in test_news.iloc[:,1] for word in line.lower().split()])
#show the distribution of labels in the train and test data
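porter here is a module-level PorterStemmer instance (not shown). Assumed setup, reusing the sample sentence from the commented-out line above:

from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()   # assumed module-level instance

print(tokenizer_porter('runners like running and thus they run'))
# roughly ['runner', 'like', 'run', 'and', 'thu', 'they', 'run']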
Example 13: tokenize
# Required import: import nltk [as alias]
# Or: from nltk import stem [as alias]
def tokenize(self, paragraph):
    words = [self.ps.stem(word) for word in word_tokenize(paragraph)]
    filtered_words = [word for word in words if word not in stopwords.words('english')]
    return filtered_words
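tokenize is a method of a class holding a stemmer as self.ps; a minimal assumed host class:

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

class Tokenizer:                  # hypothetical host class
    def __init__(self):
        self.ps = PorterStemmer()

Tokenizer.tokenize = tokenize     # attach the function above as a method

print(Tokenizer().tokenize("Cats are running in the garden"))
# roughly ['cat', 'run', 'garden']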
Example 14: text_cleaner
# Required import: import nltk [as alias]
# Or: from nltk import stem [as alias]
def text_cleaner(text,
                 deep_clean=False,
                 stem=True,
                 stop_words=True,
                 translite_rate=True):
    rules = [
        {r'>\s+': u'>'},                              # remove spaces after a tag opens or closes
        {r'\s+': u' '},                               # collapse consecutive whitespace
        {r'\s*<br\s*/?>\s*': u'\n'},                  # newline after a <br>
        {r'</(div)\s*>\s*': u'\n'},                   # newline after </div>
        {r'</(p|h\d)\s*>\s*': u'\n\n'},               # blank line after </p> and </h1>, </h2>, ...
        {r'<head>.*<\s*(/head|body)[^>]*>': u''},     # remove everything from <head> to </head>
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # keep link targets instead of link text
        {r'[ \t]*<[^<]*?/?>': u''},                   # remove remaining tags
        {r'^\s+': u''}                                # remove leading whitespace
    ]

    if deep_clean:
        text = text.replace(".", "")
        text = text.replace("[", " ")
        text = text.replace(",", " ")
        text = text.replace("]", " ")
        text = text.replace("(", " ")
        text = text.replace(")", " ")
        text = text.replace("\"", "")
        text = text.replace("-", " ")
        text = text.replace("=", " ")
        text = text.replace("?", " ")
        text = text.replace("!", " ")

        for rule in rules:
            for (k, v) in rule.items():
                regex = re.compile(k)
                text = regex.sub(v, text)
        text = text.rstrip()
        text = text.strip()
        text = text.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ')
        text = re.sub("(^|\W)\d+($|\W)", " ", text)
        if translite_rate:
            text = transliterate(text)
        if stem:
            text = PorterStemmer().stem(text)
        text = WordNetLemmatizer().lemmatize(text)
        if stop_words:
            stop_words = set(stopwords.words('english'))
            word_tokens = word_tokenize(text)
            text = [w for w in word_tokens if not w in stop_words]
            text = ' '.join(str(e) for e in text)
    else:
        for rule in rules:
            for (k, v) in rule.items():
                regex = re.compile(k)
                text = regex.sub(v, text)
        text = text.rstrip()
        text = text.strip()

    return text.lower()
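text_cleaner pulls in re, NLTK's stopwords, PorterStemmer, WordNetLemmatizer and word_tokenize, plus a transliterate helper defined elsewhere in the originating project. The sketch below is an assumption: it uses a pass-through stand-in for transliterate just to make both branches runnable:

import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

def transliterate(text):          # pass-through stand-in for the project's own helper (assumption)
    return text

html = '<p>The cats   were running!</p> <a href="http://x.io">link</a>'
print(text_cleaner(html))                   # shallow clean: tags stripped, whitespace collapsed, lowercased
print(text_cleaner(html, deep_clean=True))  # deep clean: punctuation and digits removed, stemmed, stop words dropped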