This article collects typical usage examples of Python's nltk.stem.lancaster.LancasterStemmer. If you have been wondering what LancasterStemmer does, or how to use it in practice, the curated code examples below may help. You can also consult the documentation of the module it belongs to, nltk.stem.lancaster.
The sections below present 7 code examples of lancaster.LancasterStemmer, ordered by popularity by default.
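Before the examples, here is a minimal, self-contained sketch of how the stemmer is typically instantiated and used (the sample words are illustrative; the Lancaster, i.e. Paice/Husk, algorithm is known for producing aggressively short stems):

from nltk.stem.lancaster import LancasterStemmer

stemmer = LancasterStemmer()
# Lancaster stems are aggressive, e.g. "maximum" typically becomes "maxim"
for word in ["maximum", "running", "presumably"]:
    print(word, "->", stemmer.stem(word))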
Example 1: __init__
# Required module: from nltk.stem import lancaster [as alias]
# Or: from nltk.stem.lancaster import LancasterStemmer [as alias]
def __init__(self):
    ###############################################################
    #
    # Sets up all default requirements and placeholders
    # needed for the NLU engine to run.
    #
    # - Helpers: Useful global functions
    # - Logging: Logging class
    # - LancasterStemmer: Word stemmer
    #
    ###############################################################
    self.ignore = [',', '.', '!', '?']
    self.Helpers = Helpers()
    self._confs = self.Helpers.loadConfigs()
    self.LogFile = self.Helpers.setLogFile(self._confs["aiCore"]["Logs"] + "JumpWay/")
    self.LancasterStemmer = LancasterStemmer()
Example 2: __init__
# Required module: from nltk.stem import lancaster [as alias]
# Or: from nltk.stem.lancaster import LancasterStemmer [as alias]
def __init__(self, Logging, LogFile):
    self.LancasterStemmer = LancasterStemmer()
    self.Logging = Logging
    self.LogFile = LogFile
    self.ignore = [
        '?',
        '!'
    ]
    self.Logging.logMessage(
        self.LogFile,
        "Data",
        "INFO",
        "Data Helper Ready")
Example 3: getTokens
# Required module: from nltk.stem import lancaster [as alias]
# Or: from nltk.stem.lancaster import LancasterStemmer [as alias]
def getTokens(self, removeStopwords=True):
    """ Tokenizes the text, breaking it up into words, removing punctuation. """
    tokenizer = nltk.RegexpTokenizer(r"[a-zA-Z]\w+'?\w*")  # A custom regex tokenizer.
    spans = list(tokenizer.span_tokenize(self.text))
    # Record the end offset of the last span, i.e. the effective character length of the text.
    self.length = spans[-1][-1]
    tokens = tokenizer.tokenize(self.text)
    tokens = [token.lower() for token in tokens]  # make them lowercase
    stemmer = LancasterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    if not removeStopwords:
        self.spans = spans
        return tokens
    tokenSpans = list(zip(tokens, spans))  # zip tokens with their spans
    stopwords = nltk.corpus.stopwords.words('english')  # get stopwords
    tokenSpans = [token for token in tokenSpans if token[0] not in stopwords]  # drop stopwords
    self.spans = [x[1] for x in tokenSpans]  # unzip; get spans
    return [x[0] for x in tokenSpans]  # unzip; get tokens
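The tokenize → lowercase → stem → drop-stopwords pipeline above can be tried outside the class. A minimal standalone sketch (the sample text is illustrative, and the stopwords corpus must be available, e.g. via nltk.download('stopwords')):

import nltk
from nltk.stem.lancaster import LancasterStemmer

text = "The quick brown foxes aren't jumping over the lazy dogs."
tokenizer = nltk.RegexpTokenizer(r"[a-zA-Z]\w+'?\w*")
tokens = [t.lower() for t in tokenizer.tokenize(text)]
stemmer = LancasterStemmer()
stems = [stemmer.stem(t) for t in tokens]
# Note: as in the method above, stopwords are removed after stemming
stopwords = nltk.corpus.stopwords.words('english')
print([s for s in stems if s not in stopwords])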
Example 4: extract
# Required module: from nltk.stem import lancaster [as alias]
# Or: from nltk.stem.lancaster import LancasterStemmer [as alias]
def extract(self, data=None, splitIt=False):
    ###############################################################
    #
    # Extracts words from sentences, stripping out characters in
    # the ignore list above
    #
    # https://www.nltk.org/_modules/nltk/stem/lancaster.html
    # http://insightsbot.com/blog/R8fu5/bag-of-words-algorithm-in-python-introduction
    #
    ###############################################################
    words = data.split() if splitIt else data
    return [self.LancasterStemmer.stem(word) for word in words if word not in self.ignore]
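A standalone sketch of what this extraction amounts to, with a hypothetical input sentence (assumes tokens are whitespace-separated so that punctuation appears as its own token):

from nltk.stem.lancaster import LancasterStemmer

stemmer = LancasterStemmer()
ignore = [',', '.', '!', '?']
data = "What is the reading of the temperature sensor ?"
# splitIt=True semantics: split on whitespace, skip ignored tokens, stem the rest
print([stemmer.stem(word) for word in data.split() if word not in ignore])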
Example 5: extract
# Required module: from nltk.stem import lancaster [as alias]
# Or: from nltk.stem.lancaster import LancasterStemmer [as alias]
def extract(self, data=None, lowerIt=True, splitIt=False, ignoreWords=False):
    # Variant of extract with flags to lowercase each word, split raw
    # strings on whitespace, and skip tokens in the ignore list.
    words = data.split() if splitIt else data
    if ignoreWords:
        return [self.LancasterStemmer.stem(word.lower() if lowerIt else word)
                for word in words if word not in self.ignore]
    return [self.LancasterStemmer.stem(word.lower() if lowerIt else word)
            for word in words]
Example 6: __init__
# Required module: from nltk.stem import lancaster [as alias]
# Or: from nltk.stem.lancaster import LancasterStemmer [as alias]
def __init__(self):
    ###############################################################
    #
    # Sets up all default requirements
    #
    # - Helpers: Useful global functions
    # - LancasterStemmer: Word stemmer
    #
    ###############################################################
    self.Helpers = Helpers()
    self._confs = self.Helpers.loadConfigs()
    self.stemmer = LancasterStemmer()
Example 7: get_vocabularies
# Required module: from nltk.stem import lancaster [as alias]
# Or: from nltk.stem.lancaster import LancasterStemmer [as alias]
def get_vocabularies(dataset, vocab_file, nearby_file):
    """Create map from example ID to (basic_words, nearby_words)."""
    with open(vocab_file) as f:
        basic_vocab = [line.strip() for line in f]
    with open(nearby_file) as f:
        nearby_words = json.load(f)
    stemmer = LancasterStemmer()
    vocabs = {}
    for a in dataset['data']:
        for p in a['paragraphs']:
            for q in p['qas']:
                q_words = [w.lower() for w in word_tokenize(q['question'])]
                if OPTS.mode == 'basic':
                    vocabs[q['id']] = (basic_vocab, [])
                elif OPTS.mode == 'add-question-words':
                    vocabs[q['id']] = (basic_vocab, q_words)
                elif OPTS.mode.endswith('-nearby'):
                    q_stems = [stemmer.stem(qw) for qw in q_words]
                    cur_vocab = [w for w in basic_vocab if w not in q_stems]
                    cur_nearby = []
                    for q_word, q_stem in zip(q_words, q_stems):
                        if q_word in nearby_words:
                            qw_nearby = []
                            for nearby_word in nearby_words[q_word]:
                                if len(qw_nearby) == OPTS.num_nearby:
                                    break
                                if nearby_word['word'] in PUNCTUATION:
                                    continue
                                nearby_stem = stemmer.stem(nearby_word['word'])
                                if nearby_stem != q_stem:
                                    qw_nearby.append(nearby_word['word'])
                            cur_nearby.extend(qw_nearby)
                    vocabs[q['id']] = (cur_vocab, cur_nearby)
    return vocabs
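The stemming in the '-nearby' branch exists to avoid proposing a nearby word that is merely an inflection of the question word itself. A reduced sketch of that filter, with nearby_words mocked in the same shape as the JSON the function loads (the words and neighbours are illustrative):

from nltk.stem.lancaster import LancasterStemmer

stemmer = LancasterStemmer()
# Mocked nearest-neighbour data; real entries come from nearby_file
nearby_words = {'running': [{'word': 'runs'}, {'word': 'sprinting'}, {'word': 'jogging'}]}

q_word = 'running'
q_stem = stemmer.stem(q_word)
# Keep only neighbours whose Lancaster stem differs from the question word's,
# so inflections of the same word (e.g. 'runs') are filtered out
print([n['word'] for n in nearby_words[q_word]
       if stemmer.stem(n['word']) != q_stem])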