This article collects typical usage examples of the Stemmer.Stemmer class in Python (provided by the PyStemmer package). Wondering what the Stemmer class is good for, or how to use it in practice? The hand-picked class code examples below may help.
15 code examples of the Stemmer class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code samples.
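Before the examples, here is a minimal sketch of the core PyStemmer API that nearly all of them build on (most examples import the class directly via from Stemmer import Stemmer; the stems shown in the comments are what the English Snowball algorithm produces):

import Stemmer

# List the supported algorithms, e.g. ['danish', 'english', 'french', 'porter', 'russian', ...]
print(Stemmer.algorithms())

stemmer = Stemmer.Stemmer('english')
print(stemmer.stemWord('running'))                     # 'run'
print(stemmer.stemWords(['cats', 'jumps', 'easily']))  # ['cat', 'jump', 'easili']

# Repeated lookups are cached internally; the cache size is tunable.
stemmer.maxCacheSize = 10000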
Example 1: getStems
def getStems(cleanedText, stopWords):
    # Maps each stem to a token id and records the positions at which it occurs,
    # updating the module-level termDict/terms indexes as a side effect.
    stems = {}
    matches = re.finditer(r'\w+(\.?\w+)*', cleanedText.strip(), flags=re.IGNORECASE)
    stemmer = Stemmer('english')
    #maxlength = sum(1 for _ in matches1)
    #stemmer.maxCacheSize = maxlength
    offset = len(termDict)
    tokenid = offset + 1
    position = 0
    for match in matches:
        #position = match.start()
        position += 1
        token = match.group()
        filteredToken = filterToken(token, stopWords)
        if filteredToken:  # filterToken returns a falsy value for stop words
            wordStem = stemmer.stemWord(filteredToken.lower())
            #present = wordStem in stems
            if wordStem not in stems:
                #tokenid += 1
                stems[wordStem] = tokenid
                positions = set()
                positions.add(position)
                if wordStem not in termDict:
                    termDict[wordStem] = tokenid
                    terms[tokenid] = positions
                    tokenid = tokenid + 1
                else:
                    stemid = termDict[wordStem]
                    terms[stemid] = positions
            else:
                stemid = termDict[wordStem]
                postns = terms[stemid]
                postns.add(position)
                terms[stemid] = postns
Example 2: classif
def classif(text, mass, num_all_docs, num_words_unic):
    # Naive Bayes classification in log space: score each category and
    # print the name of the highest-scoring one.
    stm = Stemmer('russian')
    text = stm.stemWords(regexp_tokenize((text.decode('UTF-8')).lower(), r"(?x) \w+ | \w+(-\w+)*"))
    num_povt_words = 0
    summa = 0
    while_iter = 0
    while while_iter < len(mass):
        # log prior of the category
        summand_1 = log((mass[while_iter].num_docs + 0.0) / (num_all_docs + 0.0), 1.1)
        for i in text:
            # count occurrences of the word in the category, then apply
            # Laplace (add-one) smoothing
            for i1 in mass[while_iter].lst_allword:
                if i == i1:
                    num_povt_words = num_povt_words + 1
            summand_2 = log(((num_povt_words + 1) + 0.0) / ((num_words_unic + mass[while_iter].num_words) + 0.0), 1.1)
            num_povt_words = 0
            summa = summa + summand_2
        mass[while_iter].c = summand_1 + summa
        summa = 0
        while_iter = while_iter + 1
    max_c = -100000
    while_iter = 0
    number_max = 0
    while while_iter < len(mass):
        print mass[while_iter].c
        if mass[while_iter].c > max_c:
            max_c = mass[while_iter].c
            number_max = while_iter
        while_iter = while_iter + 1
    print mass[number_max].name_categories
Example 3: StemProvider
class StemProvider(Provider):
    """Stem the input values (either a single word or a list of words).

    Uses the Porter stemmer algorithm.
    """

    def __init__(self, language='english', **kwargs):
        """
        See here for a full list of languages:

            http://nltk.org/_modules/nltk/stem/snowball.html

        .. note::

            This does not depend on nltk, it depends on the ``pystemmer`` package.

        :param language: language to use during stemming, defaults to english.
        """
        Provider.__init__(self, **kwargs)
        self._stemmer = Stemmer(language)

    def do_process(self, input_value):
        if isinstance(input_value, str):
            return self._stemmer.stemWord(input_value)
        else:
            return self._stemmer.stemWords(input_value)
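StemProvider's type dispatch mirrors PyStemmer's stemWord/stemWords pair. A hypothetical usage sketch (the Provider base class and any keyword arguments it accepts come from the host project and are assumed here):

# Hypothetical demo; Provider is a thin base class from the host project.
provider = StemProvider(language='english')
print(provider.do_process('fishing'))              # 'fish'
print(provider.do_process(['fishing', 'fished']))  # ['fish', 'fish']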
Example 4: run
def run():
    # Tally stemmed word frequencies over all crawled pages in db.en and
    # bulk-save the counts into en_words_freq in batches of 1000.
    stemmer = Stemmer("english")
    pages = db.en.find()
    print colored.yellow("statistic words")
    wordstatistic = {}
    for page in progress.bar(pages, size=db.en.count()):
        data = page.get("data")
        if not data:
            continue
        content = data.get("content")
        if not content:
            db.en.remove({"_id": page["_id"]})
            continue
        words = EN_WORD_CUT.split(content)
        for word in words:
            w = stemmer.stemWord(word.strip()).lower()
            if w and len(w) < 20 and w not in EN_IGNORE:
                if wordstatistic.get(w):
                    wordstatistic[w] += 1
                else:
                    wordstatistic[w] = 1
    print colored.yellow("save to en_words_freq")
    savequene = []
    for k, v in progress.bar(wordstatistic.iteritems(), size=len(wordstatistic)):
        savequene.append({"_id": k, "freq": v})
        if len(savequene) >= 1000:
            db.en_words_freq.insert(savequene)
            savequene = []
    if savequene:
        db.en_words_freq.insert(savequene)
    print colored.cyan(
        "count of en_words_freq: %d" % db.en_words_freq.count())
Example 5: _prepare_text
def _prepare_text(self, text):
    """Extracts and stems the words from some given text."""
    words = re.findall('[a-z0-9\']+', text.lower())
    words = [word for word in words if word not in STOP_WORDS]
    stemmer = Stemmer('english')
    stemmed_words = stemmer.stemWords(words)
    return stemmed_words
Example 6: train
def train(name_file_dbase, way_to_dbase):
    stm = Stemmer('russian')
    file_base = open(name_file_dbase, 'r')
    Lines = file_base.readlines()
    num_all_docs = len(Lines) + 1
    mass = []
    iter1 = 0
    iter2 = 0
    # first pass: build the category list and count documents per category
    for line in Lines:
        number1, address1 = unpack_line(line)
        number = number1.strip("\n")
        address = address1.strip("\n")
        if (number == "1"):
            mass.append(Categories())
            mass[iter1].name_categories = address
            mass[iter1 - 1].num_docs = iter2
            iter1 = iter1 + 1
            iter2 = 0
        iter2 = iter2 + 1
    mass[len(mass) - 1].num_docs = iter2
    file_base.close()
    # second pass: concatenate the raw text of every document in each category
    while_iter = 0
    number = 1
    while while_iter < len(mass):
        while number <= mass[while_iter].num_docs:
            file_forclass = open(way_to_dbase + mass[while_iter].name_categories
                                 + '/' + str(number) + 'forclass.txt', 'r')
            str_read = re.sub("^\s+|\n|\r|\s+$", ' ', file_forclass.read())
            mass[while_iter].line_allword = mass[while_iter].line_allword + str_read
            file_forclass.close()
            number = number + 1
        number = 1  # reset the document counter before the next category
        while_iter = while_iter + 1
    # tokenize and stem each category's text, counting total and unique words
    while_iter = 0
    while while_iter < len(mass):
        forstemmer = mass[while_iter].line_allword.decode('UTF-8')
        str_read = stm.stemWords(regexp_tokenize(forstemmer.lower(), r"(?x) \w+ | \w+(-\w+)*"))
        mass[while_iter].num_words = len(str_read)
        mass[while_iter].lst_allword = str_read
        lst_unic_words = list(set(mass[while_iter].lst_allword))
        mass[while_iter].num_wordsunic = len(lst_unic_words)
        while_iter = while_iter + 1
    all_words = 0
    num_words_unic = 0
    while_iter = 0
    while while_iter < len(mass):
        all_words = all_words + mass[while_iter].num_words
        num_words_unic = num_words_unic + mass[while_iter].num_wordsunic
        while_iter = while_iter + 1
    return mass, num_all_docs, num_words_unic
Example 7: get_search_phrases
def get_search_phrases(self, indexing_func=None):
    """Returns search phrases from properties in a given Model instance.

    Args (optional):
        only_index: List of strings. Restricts indexing to these property names.
        indexing_func: A function that returns a set of keywords or phrases.

    Note that the indexing_func can be passed in to allow more customized
    search phrase generation.

    Two model variables influence the output of this method:
        INDEX_ONLY: If None, all indexable properties are indexed.
            If a list of property names, only those properties are indexed.
        INDEX_MULTI_WORD: Class variable that allows multi-word search
            phrases like "statue of liberty."
        INDEX_STEMMING: Returns stemmed phrases.
    """
    if not indexing_func:
        klass = self.__class__
        if klass.INDEX_MULTI_WORD:
            indexing_func = klass.get_search_phraseset
        else:
            indexing_func = klass.get_simple_search_phraseset
    if self.INDEX_STEMMING:
        stemmer = Stemmer('english')
    phrases = set()
    # allow indexing of 'subentities' such as tasks of a list as well
    queries = [(self, self.INDEX_ONLY)] + self.INDEX_SUBENTITY_QUERIES
    for query, props in queries:
        entities = []
        try:
            # page through the subentity query in batches of 1000 to get all of them
            subentities = query(self).fetch(1000)
            while len(subentities) > 0:
                entities.extend(subentities)
                last_key = subentities[-1].key()
                subentities = query(self).order('__key__').filter('__key__ >', last_key).fetch(1000)
        except TypeError, e:  # query is not callable because it's an actual entity
            entities = [query]
        for entity in entities:
            for prop_name, prop_value in entity.properties().iteritems():
                if not props or prop_name in props:
                    values = prop_value.get_value_for_datastore(entity)
                    if not isinstance(values, list):
                        values = [values]
                    if (isinstance(values[0], basestring) and
                            not isinstance(values[0], datastore_types.Blob)):
                        for value in values:
                            words = indexing_func(value, add_stop_words=self.INDEX_ADD_STOP_WORDS)
                            if self.INDEX_STEMMING:
                                stemmed_words = set(stemmer.stemWords(words))
                                phrases.update(stemmed_words)
                            else:
                                phrases.update(words)
    return phrases
Example 8: make_index
def make_index(expression):
    """
    Standardize the expression, returning a tuple that maximizes
    matching possibilities.

    ``expression`` must be a list or tuple.
    """
    stemmer = Stemmer("french")
    expression = [stemmer.stemWord(normalize_token(w)) for w in expression]
    expression.sort()
    return tuple(expression)
Example 9: processQueries
def processQueries(queries):
    queryList = []
    stemmer = Stemmer('english')  # reuse a single stemmer rather than re-creating it per query
    for query in queries:
        filteredQuery = tokenize.filterToken(query, tokenize.getStopWords())
        if filteredQuery:  # filterToken returns a falsy value for stop words
            queryStem = stemmer.stemWord(filteredQuery.lower())
            queryList.append(queryStem)
    return queryList
Example 10: parse_html
def parse_html(html):
    words = dehtml(html)
    s = Stemmer("danish")
    result = []
    for w in words.split():
        word = w.lower()
        if word in stop_words or len(word) < 2 or word.count('\\'):
            continue
        result.append(s.stemWord(word))
    return result
Example 11: getTerm
def getTerm(term):
    term_ids = {}
    stemmer = Stemmer('english')
    #stemmer.maxCacheSize = 1
    termStem = stemmer.stemWord(term.lower())  # the stem is loop-invariant, so compute it once
    term_ids_file = open(TERMIDSFILE, 'rU')
    for line in term_ids_file.readlines():
        pieces = line.strip().split('\t')
        if termStem == pieces[1]:
            term_ids[pieces[1]] = int(pieces[0])
            break  # stop at the first match; the file still gets closed below
    term_ids_file.close()
    return term_ids
Example 12: stem_words
def stem_words(self, words: List[str]) -> List[str]:
    """Stem list of words with PyStemmer."""
    language_code = self.language_code()
    words = decode_object_from_bytes_if_needed(words)

    if language_code is None:
        raise McLanguageException("Language code is None.")

    if words is None:
        raise McLanguageException("Words to stem is None.")

    # Normalize apostrophes so that "it’s" and "it's" get treated identically (this is done in
    # _tokenize_with_spaces() too, but let's not assume that all tokens to be stemmed go through
    # sentence tokenization first)
    words = [word.replace("’", "'") for word in words]

    # (Re-)initialize the stemmer if needed
    if self.__pystemmer is None:
        try:
            self.__pystemmer = PyStemmer(language_code)
        except Exception as ex:
            raise McLanguageException(
                "Unable to initialize PyStemmer for language '%s': %s" % (language_code, str(ex),)
            )

    stems = self.__pystemmer.stemWords(words)

    if len(words) != len(stems):
        log.warning("Stem count is not the same as word count; words: %s; stems: %s" % (str(words), str(stems),))

    # Perl's Snowball implementation used to return lowercase stems
    stems = [stem.lower() for stem in stems]

    return stems
Example 13: BagOfWordsFeatureSupport
class BagOfWordsFeatureSupport(FeatureSupport):
    def __init__(self, featuresData, featureId):
        FeatureSupport.__init__(self, featuresData, featureId)
        self.stemmer = Stemmer('english')
        self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
        stopListFn = './resources/general/stopword.csv'
        self.stopList = frozenset(l for l in filter(None, map(lambda l: self.preprocess(l), open(stopListFn, 'rt').readlines())))

    def preprocess(self, s):
        # transliterate to ASCII, lowercase, keep only [a-z0-9], then stem
        chars = []
        for c in unidecode(s.strip().lower()):
            if c in self.goodChars:
                chars.append(c)
        word = ''.join(chars)
        return self.stemmer.stemWord(word)

    def extract(self, i):
        bag = frozenset(map(lambda w: self.preprocess(w), filter(None, self[i].split())))
        ret = bag - self.stopList
        if len(ret) == 0:
            # fall back to a random 20-letter token so the bag is never empty
            ret = frozenset([''.join(random.choice('abcdefghjiklmnopqrstuvwxyz') for _ in range(20))])
        return ret

    def similarity(self, a, b):
        # Jaccard similarity of two word bags
        num = len(a & b)
        den = len(a | b)
        return num / den if den != 0 else 1.0
Example 14: BagOfWordsFeatureBooleanizer
class BagOfWordsFeatureBooleanizer(FeatureBooleanizer):
    def __init__(self, featureName, featuresData, featureId):
        FeatureBooleanizer.__init__(self, featureName, featuresData, featureId)
        self.stemmer = Stemmer('english')
        self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
        stopListFn = './resources/general/stopword.csv'
        self.stopList = frozenset(l for l in filter(None, map(lambda l: self.preprocess(l), open(stopListFn, 'rt').readlines())))
        allWords = set()
        if self.featureName == 'Basic: Tagline':
            # taglines are comma-separated; other features split on whitespace
            for row in featuresData:
                allWords |= set(map(lambda w: self.preprocess(w), filter(None, row[featureId].split(','))))
        else:
            for row in featuresData:
                allWords |= set(map(lambda w: self.preprocess(w), filter(None, row[featureId].split())))
        self.words = sorted(list(filter(None, allWords - self.stopList)))

    def preprocess(self, s):
        chars = []
        for c in unidecode(s.strip().lower()):
            if c in self.goodChars:
                chars.append(c)
        word = ''.join(chars)
        return self.stemmer.stemWord(word)

    def getFeatureNames(self):
        return [self.featureName + ': ' + word for word in self.words]

    def process(self, v):
        # one boolean column per known word
        vWords = set(map(lambda w: self.preprocess(w), filter(None, v.split(','))))
        return [(word in vWords) for word in self.words]
Example 15: Stemmer
class Stemmer(object):
    def __init__(self):
        # type: () -> None
        self.stemmer = PyStemmer('porter')

    def stem(self, word):
        # type: (unicode) -> unicode
        return self.stemmer.stemWord(word)
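This last wrapper shadows PyStemmer's own class name, so it presumably relies on an aliased import. A minimal usage sketch, assuming the alias shown below:

from Stemmer import Stemmer as PyStemmer  # assumed import; the wrapper shadows the original name

stemmer = Stemmer()
print(stemmer.stem('searching'))  # 'search' under the Porter algorithm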