This article collects typical usage examples of the stemWord method of Python's Stemmer.Stemmer class (provided by the PyStemmer package). If you have been asking yourself: what exactly does Python's Stemmer.stemWord do, how is it called, and what do real uses look like? then the curated code examples below should help. You can also explore further usage examples of the containing class, Stemmer.Stemmer.
Below are 15 code examples of the Stemmer.stemWord method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
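Before the examples, a minimal self-contained sketch of the method itself (PyStemmer provides stemWord for a single token and stemWords for a sequence):

from Stemmer import Stemmer

stemmer = Stemmer('english')
print(stemmer.stemWord('stemming'))           # -> 'stem'
print(stemmer.stemWords(['cats', 'ponies']))  # -> ['cat', 'poni']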
Example 1: BagOfWordsFeatureSupport

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
import random

from unidecode import unidecode

class BagOfWordsFeatureSupport(FeatureSupport):
    def __init__(self, featuresData, featureId):
        FeatureSupport.__init__(self, featuresData, featureId)
        self.stemmer = Stemmer('english')
        self.goodChars = frozenset('abcdefghijklmnopqrstuvwxyz0123456789')
        stopListFn = './resources/general/stopword.csv'
        self.stopList = frozenset(filter(None, map(self.preprocess, open(stopListFn, 'rt').readlines())))

    def preprocess(self, s):
        # Transliterate to ASCII, lowercase, drop everything but alphanumerics, then stem.
        chars = [c for c in unidecode(s.strip().lower()) if c in self.goodChars]
        return self.stemmer.stemWord(''.join(chars))

    def extract(self, i):
        bag = frozenset(map(self.preprocess, filter(None, self[i].split())))
        ret = bag - self.stopList
        if len(ret) == 0:
            # Never return an empty bag: substitute a random 20-character token.
            ret = frozenset([''.join(random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(20))])
        return ret

    def similarity(self, a, b):
        # Jaccard similarity between two bags of stems.
        num = len(a & b)
        den = len(a | b)
        return float(num) / den if den != 0 else 1.0
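To see what the Jaccard-based similarity above computes, a minimal standalone sketch (FeatureSupport and featuresData are project-specific and not shown here):

from Stemmer import Stemmer

stemmer = Stemmer('english')
bag_a = frozenset(stemmer.stemWord(w) for w in 'running dogs bark loudly'.split())
bag_b = frozenset(stemmer.stemWord(w) for w in 'the dog runs and barks'.split())
# Shared stems ('run', 'dog', 'bark') divided by the size of the union.
print(float(len(bag_a & bag_b)) / len(bag_a | bag_b))  # 3 / 6 = 0.5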
Example 2: run

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
# Python 2 code; db (a MongoDB handle), colored and progress (likely from
# clint), EN_WORD_CUT and EN_IGNORE are defined elsewhere in the module.
def run():
    stemmer = Stemmer("english")
    pages = db.en.find()
    print colored.yellow("statistic words")
    wordstatistic = {}
    for page in progress.bar(pages, size=db.en.count()):
        data = page.get("data")
        if not data:
            continue
        content = data.get("content")
        if not content:
            db.en.remove({"_id": page["_id"]})
            continue
        words = EN_WORD_CUT.split(content)
        for word in words:
            w = stemmer.stemWord(word.strip()).lower()
            if w and len(w) < 20 and w not in EN_IGNORE:
                if wordstatistic.get(w):
                    wordstatistic[w] += 1
                else:
                    wordstatistic[w] = 1
    print colored.yellow("save to en_words_freq")
    savequene = []
    for k, v in progress.bar(wordstatistic.iteritems(), size=len(wordstatistic)):
        savequene.append({"_id": k, "freq": v})
        if len(savequene) >= 1000:
            # Flush inserts to MongoDB in batches of 1000.
            db.en_words_freq.insert(savequene)
            savequene = []
    if savequene:
        db.en_words_freq.insert(savequene)
    print colored.cyan(
        "count of en_words_freq: %d" % db.en_words_freq.count())
Example 3: getStems

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
import re

# filterToken, termDict and terms are module-level helpers/globals
# in the original project.
def getStems(cleanedText, stopWords):
    stems = {}
    matches = re.finditer(r'\w+(\.?\w+)*', cleanedText.strip(), flags=re.IGNORECASE)
    stemmer = Stemmer('english')
    offset = len(termDict)
    tokenid = offset + 1
    position = 0
    for match in matches:
        position += 1
        token = match.group()
        filteredToken = filterToken(token, stopWords)
        if filteredToken:
            wordStem = stemmer.stemWord(filteredToken.lower())
            if wordStem not in stems:
                stems[wordStem] = tokenid
                positions = set()
                positions.add(position)
                if wordStem not in termDict:
                    # First time this stem is seen anywhere: assign a new id.
                    termDict[wordStem] = tokenid
                    terms[tokenid] = positions
                    tokenid = tokenid + 1
                else:
                    # Stem known globally but new to this text.
                    stemid = termDict[wordStem]
                    terms[stemid] = positions
            else:
                # Stem already seen in this text: record another position.
                stemid = termDict[wordStem]
                postns = terms[stemid]
                postns.add(position)
                terms[stemid] = postns
Example 4: BagOfWordsFeatureBooleanizer

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
from unidecode import unidecode

class BagOfWordsFeatureBooleanizer(FeatureBooleanizer):
    def __init__(self, featureName, featuresData, featureId):
        FeatureBooleanizer.__init__(self, featureName, featuresData, featureId)
        self.stemmer = Stemmer('english')
        self.goodChars = frozenset('abcdefghijklmnopqrstuvwxyz0123456789')
        stopListFn = './resources/general/stopword.csv'
        self.stopList = frozenset(filter(None, map(self.preprocess, open(stopListFn, 'rt').readlines())))
        allWords = set()
        if self.featureName == 'Basic: Tagline':
            # Tagline values are comma-separated; all other features split on whitespace.
            for row in featuresData:
                allWords |= set(map(self.preprocess, filter(None, row[featureId].split(','))))
        else:
            for row in featuresData:
                allWords |= set(map(self.preprocess, filter(None, row[featureId].split())))
        self.words = sorted(filter(None, allWords - self.stopList))

    def preprocess(self, s):
        chars = [c for c in unidecode(s.strip().lower()) if c in self.goodChars]
        return self.stemmer.stemWord(''.join(chars))

    def getFeatureNames(self):
        return [self.featureName + ': ' + word for word in self.words]

    def process(self, v):
        vWords = set(map(self.preprocess, filter(None, v.split(','))))
        return [(word in vWords) for word in self.words]
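A hedged usage sketch: assuming the project's FeatureBooleanizer base class, toy feature rows, and the stop-word file at the path above, the booleanizer turns each value into one boolean per known stem:

# Hypothetical toy rows; index 0 is the text feature being booleanized.
rows = [('space,opera,epic',), ('quiet,family,drama',)]
b = BagOfWordsFeatureBooleanizer('Basic: Tagline', rows, 0)
print(b.getFeatureNames())       # e.g. ['Basic: Tagline: drama', 'Basic: Tagline: epic', ...]
print(b.process('space,drama'))  # one True/False per stem in b.words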
Example 5: StemProvider

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
class StemProvider(Provider):
    """Stem the input values (either a single word or a list of words).

    Uses the Porter stemmer algorithm.
    """

    def __init__(self, language='english', **kwargs):
        """
        See here for a full list of languages:

            http://nltk.org/_modules/nltk/stem/snowball.html

        .. note::
            This does not depend on nltk, it depends on the ``pystemmer`` package.

        :param language: language to use during stemming, defaults to english.
        """
        Provider.__init__(self, **kwargs)
        self._stemmer = Stemmer(language)

    def do_process(self, input_value):
        if isinstance(input_value, str):
            return self._stemmer.stemWord(input_value)
        else:
            return self._stemmer.stemWords(input_value)
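A quick usage sketch of the string/sequence dispatch, assuming the surrounding Provider base class:

p = StemProvider()
p.do_process('connections')              # -> 'connect'
p.do_process(['connections', 'flying'])  # -> ['connect', 'fli']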
Example 6: Stemmer

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
class Stemmer(object):
    def __init__(self):
        # type: () -> None
        self.stemmer = PyStemmer('porter')

    def stem(self, word):
        # type: (unicode) -> unicode
        return self.stemmer.stemWord(word)
Example 7: make_index

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
def make_index(expression):
    """
    Standardize the expression and return a tuple that maximises
    matching possibilities.

    expression must be a list or tuple.
    """
    stemmer = Stemmer("french")
    expression = [stemmer.stemWord(normalize_token(w)) for w in expression]
    expression.sort()
    return tuple(expression)
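For illustration, a hedged sketch (normalize_token is a project helper not shown here; assume it simply lowercases):

# Hypothetical stand-in for the project's normalize_token helper.
def normalize_token(w):
    return w.lower()

print(make_index(['Chansons', 'Nouvelles']))  # a sorted tuple of French stems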
Example 8: processQueries

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
def processQueries(queries):
    queryList = []
    stemmer = Stemmer('english')  # create the stemmer once, not per query
    for query in queries:
        filteredQuery = tokenize.filterToken(query, tokenize.getStopWords())
        if filteredQuery:
            queryStem = stemmer.stemWord(filteredQuery.lower())
            queryList.append(queryStem)
    return queryList
Example 9: parse_html

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
def parse_html(html):
    words = dehtml(html)
    s = Stemmer("danish")
    result = []
    for w in words.split():
        word = w.lower()
        if word in stop_words or len(word) < 2 or word.count('\\'):
            continue
        result.append(s.stemWord(word))
    return result
Example 10: getTerm

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
def getTerm(term):
    term_ids = {}
    stemmer = Stemmer('english')
    termStem = stemmer.stemWord(term.lower())  # stem once, outside the loop
    term_ids_file = open(TERMIDSFILE, 'rU')
    for line in term_ids_file.readlines():
        pieces = line.strip().split('\t')
        if termStem == pieces[1]:
            term_ids[pieces[1]] = int(pieces[0])
            break
    term_ids_file.close()
    return term_ids
Example 11: TextEater

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
# Python 2 code; gen_stops() and the coroutine decorator are defined
# elsewhere in the module (see the sketch after this example).
class TextEater(object):
    def __init__(self):
        self.stoplist = gen_stops()
        self.stemmer = Stemmer('english')

    @coroutine
    def sent_filter(self, target):
        print "ready to eat lines"
        while True:
            sentence = (yield)
            target.send(sentence.lower().split())

    @coroutine
    def word_filter(self, target):
        print "ready to eat words"
        while True:
            raw = (yield)
            # Keep words longer than three characters that are not stop words.
            target.send([self.stemmer.stemWord(w) for w in raw
                         if len(w) > 3 and w not in self.stoplist])

    @coroutine
    def ngrams(self, container, n=2):
        "Compute n-grams"
        while True:
            grams = (yield)
            for i in range(0, len(grams) - (n - 1)):
                container[tuple(grams[i:i + n])] += 1

    @coroutine
    def printer(self):
        while True:
            line = (yield)
            print(line)

    @coroutine
    def typer(self, target):
        print "ready to check type"
        word = None
        while True:
            line = (yield word)
            word = type(line)
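The coroutine decorator is not part of the snippet; in this generator-pipeline style it is usually the standard priming decorator, sketched here under that assumption:

def coroutine(func):
    # Prime a generator-based coroutine so it is ready for .send() calls.
    def start(*args, **kwargs):
        gen = func(*args, **kwargs)
        next(gen)  # advance to the first yield
        return gen
    return start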
Example 12: index

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
def index(text, accepted_languages=None, langs=None):
    registry = get_current_registry()
    if accepted_languages is None:
        accepted_languages = [x.strip() for x in
                              registry.settings["accepted_languages"].split(",")]
    if langs is None:
        lang = guessLanguage(text)
        if lang not in accepted_languages:
            langs = accepted_languages
        else:
            langs = [lang]
    langs = list(set(langs).intersection(set(accepted_languages)))
    if not langs:
        langs = accepted_languages
    indexed_words = set()
    for lang in langs:
        stemmer = Stemmer(lang)
        indexed_words.update([stemmer.stemWord(x.value) for x in tokenize(text)])
    return indexed_words
Example 13: Overview

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
from unidecode import unidecode

class Overview(Feature):
    description = """
    Basic: Overview
    """.strip()

    def __init__(self, *args, **kwargs):
        Feature.__init__(self)
        self.stemmer = Stemmer('english')
        self.goodChars = frozenset('abcdefghijklmnopqrstuvwxyz0123456789')
        # Pre-stemmed English stop-word list.
        self.stopList = frozenset(['a', 'abaft', 'aboard', 'about', 'abov', 'absent', 'accord', 'account', 'across', 'addit', 'afor', 'after', 'against', 'ago', 'ahead', 'all', 'along', 'alongsid', 'alreadi', 'also', 'am', 'amid', 'amidst', 'among', 'amongst', 'an', 'and', 'anenst', 'ani', 'anoth', 'anybodi', 'anyhow', 'anyon', 'anyth', 'anywher', 'apart', 'apr', 'april', 'apropo', 'apud', 'are', 'around', 'as', 'asid', 'astrid', 'at', 'athwart', 'atop', 'aug', 'august', 'back', 'bad', 'bar', 'be', 'becaus', 'been', 'befor', 'begin', 'behalf', 'behest', 'behind', 'below', 'beneath', 'besid', 'best', 'better', 'between', 'beyond', 'big', 'bigger', 'biggest', 'billion', 'blah', 'bln', 'both', 'but', 'by', 'c', 'ca', 'call', 'can', 'cannot', 'cant', 'case', 'circa', 'close', 'concern', 'could', 'couldt', 'current', 'daili', 'day', 'dec', 'decemb', 'despit', 'did', 'do', 'doe', 'doesnt', 'done', 'dont', 'down', 'due', 'dure', 'each', 'eight', 'eighteen', 'eighth', 'eighti', 'eleven', 'end', 'enough', 'ever', 'except', 'exclud', 'fail', 'far', 'feb', 'februari', 'few', 'fifth', 'first', 'five', 'fiveteen', 'fivti', 'follow', 'for', 'forenenst', 'four', 'fourteen', 'fourth', 'fourti', 'fri', 'friday', 'from', 'front', 'full', 'further', 'get', 'given', 'go', 'gone', 'goot', 'had', 'hadnt', 'has', 'hasnt', 'have', 'havent', 'he', 'her', 'here', 'herself', 'high', 'higher', 'hightst', 'himself', 'his', 'how', 'hunderd', 'i', 'if', 'in', 'includ', 'insid', 'instead', 'into', 'is', 'it', 'itself', 'jan', 'januari', 'jul', 'juli', 'jun', 'june', 'just', 'last', 'late', 'later', 'latest', 'left', 'lest', 'lieu', 'like', 'littl', 'long', 'low', 'lower', 'lowest', 'made', 'make', 'mani', 'mar', 'march', 'may', 'me', 'mean', 'mid', 'midst', 'might', 'milliard', 'million', 'mine', 'minus', 'mld', 'mln', 'modulo', 'mon', 'monday', 'month', 'more', 'most', 'mth', 'much', 'must', 'my', 'myself', 'near', 'need', 'neednt', 'neither', 'never', 'next', 'nine', 'nineteen', 'nineth', 'nineti', 'no', 'none', 'nor', 'not', 'notwithstand', 'nov', 'novemb', 'number', 'o', 'oct', 'octob', 'of', 'off', 'on', 'one', 'onli', 'onto', 'oppos', 'opposit', 'or', 'order', 'other', 'ought', 'our', 'ourselv', 'out', 'outsid', 'over', 'owe', 'pace', 'past', 'per', 'place', 'plus', 'point', 'previous', 'prior', 'pro', 'pursuant', 'put', 'qua', 'rather', 'recent', 'regard', 'regardless', 'respect', 'right', 'round', 'said', 'sake', 'same', 'san', 'sat', 'saturday', 'save', 'saw', 'say', 'second', 'see', 'seen', 'sep', 'septemb', 'seven', 'seventeen', 'seventh', 'seventi', 'sever', 'shall', 'she', 'should', 'shouldnt', 'show', 'shown', 'sinc', 'six', 'sixteen', 'sixth', 'sixti', 'small', 'smaller', 'smallest', 'so', 'some', 'somebodi', 'somehow', 'someon', 'someth', 'somewher', 'soon', 'sooner', 'spite', 'start', 'still', 'subsequ', 'such', 'sun', 'sunday', 'take', 'taken', 'tell', 'ten', 'tenth', 'than', 'thank', 'that', 'the', 'their', 'them', 'themselv', 'there', 'these', 'they', 'third', 'thirteen', 'thirti', 'this', 'those', 'thousand', 'three', 'through', 'throughout', 'thru', 'thruout', 'thu', 'thursday', 'till', 'time', 'to', 'today', 'told', 'too', 'took', 'top', 'toward', 'tue', 'tuesday', 'twelv', 'twenti', 'two', 'under', 'underneath', 'unit', 'unlik', 'until', 'unto', 'up', 'upon', 'us', 'use', 'versus', 'via', 'vice', 'view', 'virtu', 'vis', 'visavi', 'vs', 'was', 'we', 'wed', 'wednesday', 'week', 'well', 'went', 'were', 'what', 'when', 'where', 'whether', 'whi', 'which', 'while', 'who', 'whose', 'will', 'with', 'within', 'without', 'wont', 'wors', 'worst', 'worth', 'would', 'wrt', 'xor', 'year', 'yes', 'yesterday', 'yet', 'you', 'your', 'yourself', 'yourselv', 'yr'])

    def preprocess(self, s):
        chars = [c for c in unidecode(s.strip().lower()) if c in self.goodChars]
        return self.stemmer.stemWord(''.join(chars))

    def extract(self, m):
        t = m.overview
        return ','.join(sorted(set(filter(lambda w: len(w) > 0 and w not in self.stopList,
                                          map(self.preprocess, t.split())))))
Example 14: stemmer

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
def stemmer(listofTokens):  # Stemming
    stemmer = Stemmer("english")
    stemmedWords = [stemmer.stemWord(key) for key in listofTokens]
    return stemmedWords
Example 15: Searcher

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
import sys
from math import log, floor
from operator import itemgetter

# Python 2 code; sqlite_conn, mapper, doc_counter and avg_doc_length are
# helper functions defined elsewhere in the project.

class Searcher(object):
    """Run a search on documents, or on objects within documents,
    in the SQLite table.

    Three scoring options are available: frequency, TF-IDF and BM25.
    Two methods of incrementing the scores of results are available:
    simple addition or best score."""

    def __init__(self, query, db, doc_level_search=True, stemmer=False, path='/var/lib/philologic/databases/'):
        self.path = path + db + '/'
        self.words = query.split()
        self.doc_level_search = doc_level_search
        self.results = {}
        if doc_level_search:
            self.doc_path = self.path + 'doc_arrays/'
        else:
            self.doc_path = self.path + 'obj_arrays/'
        self.stemmer = stemmer
        if stemmer:
            try:
                from Stemmer import Stemmer
                self.stemmer = Stemmer(stemmer)  # where stemmer is the language selected
                self.words = [self.stemmer.stemWord(word) for word in self.words]
            except KeyError:
                print >> sys.stderr, "Language not supported by stemmer. No stemming will be done."
            except ImportError:
                print >> sys.stderr, "PyStemmer is not installed on your system. No stemming will be done."

    def get_hits(self, word, doc=True):
        """Query the SQLite table and return a list of tuples containing the results"""
        cursor = sqlite_conn(self.path + 'hits_per_word.sqlite')
        if self.doc_level_search:
            cursor.execute('select doc_id, word_freq, total_words from doc_hits where word=?', (word,))
        else:
            cursor.execute('select obj_id, word_freq, total_words from obj_hits where word=?', (word,))
        return cursor.fetchall()

    def id_to_word(self, id):
        """Return the word given its ID"""
        m = mapper(self.path)
        return m[id]

    def get_idf(self, hits):
        """Return the IDF score"""
        total_docs = doc_counter(self.doc_path)  #### WRONG COUNT
        try:
            return log(float(total_docs) / float(len(hits))) + 1
        except ZeroDivisionError:
            return 0

    def search(self, measure='tf_idf', scoring='simple_scoring', intersect=False, display=10):
        """Searcher function"""
        self.intersect = False
        if self.words != []:
            for word in self.words:
                hits = self.get_hits(word)
                getattr(self, measure)(hits, scoring)
                if intersect:
                    if self.intersect:
                        self.docs = self.docs.intersection(self.new_docs)
                        self.new_docs = set([])
                    else:
                        self.intersect = True
                        self.docs = set([obj_id for obj_id in self.results])
                        self.new_docs = set([])
            if intersect:
                self.results = dict([(obj_id, self.results[obj_id]) for obj_id in self.results if obj_id in self.docs])
            return sorted(self.results.iteritems(), key=itemgetter(1), reverse=True)[:display]
        else:
            return []

    def debug_score(self, hits, scoring):
        for obj_id, word_freq, word_sum in hits:
            getattr(self, scoring)(obj_id, word_freq)

    def tf_idf(self, hits, scoring):
        idf = self.get_idf(hits)
        for obj_id, word_freq, word_sum in hits:
            tf = float(word_freq) / float(word_sum)
            score = tf * idf
            getattr(self, scoring)(obj_id, score)

    def frequency(self, hits, scoring):
        for obj_id, word_freq, word_sum in hits:
            score = float(word_freq) / float(word_sum)
            getattr(self, scoring)(obj_id, score)

    def bm25(self, hits, scoring, k1=1.2, b=0.75):
        ## A floor is applied to the normalized document length
        ## to diminish the importance of small documents;
        ## see http://xapian.org/docs/bm25.html
        idf = self.get_idf(hits)
        avg_dl = avg_doc_length(self.path)
        for obj_id, word_freq, obj_length in hits:
            tf = float(word_freq)
            dl = float(obj_length)
            temp_score = tf * (k1 + 1.0)
            temp_score2 = tf + k1 * ((1.0 - b) + b * floor(dl / avg_dl))
            score = idf * temp_score / temp_score2
            getattr(self, scoring)(obj_id, score)
#......... the rest of this code is omitted .........
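To make the BM25 arithmetic above concrete, a toy computation with invented numbers (k1 and b as in the defaults above):

from math import log, floor

k1, b = 1.2, 0.75
total_docs, docs_with_term = 1000, 50
idf = log(float(total_docs) / docs_with_term) + 1       # ~4.0
tf, dl, avg_dl = 3.0, 120.0, 100.0                      # term freq, doc length, average length
denom = tf + k1 * ((1.0 - b) + b * floor(dl / avg_dl))  # 3 + 1.2 * (0.25 + 0.75) = 4.2
score = idf * tf * (k1 + 1.0) / denom                   # ~6.28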