

Python Stemmer.stemWord Method Code Examples

This article collects typical usage examples of the Python method Stemmer.Stemmer.stemWord. If you are unsure what Stemmer.stemWord does or how to use it, the curated code examples below should help. You can also explore the other usage examples of its containing class, Stemmer.Stemmer.


The following shows 15 code examples of Stemmer.stemWord, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
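
Before diving in, here is a minimal sketch of the PyStemmer API that every example below builds on, assuming the PyStemmer package is installed (pip install PyStemmer); the stems shown in the comments are what the English Snowball stemmer typically produces:

from Stemmer import Stemmer

stemmer = Stemmer('english')                    # Snowball stemmer for English
print(stemmer.stemWord('running'))              # -> 'run'
print(stemmer.stemWords(['cycling', 'dogs']))   # -> ['cycl', 'dog']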

Example 1: BagOfWordsFeatureSupport

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
class BagOfWordsFeatureSupport(FeatureSupport):
  def __init__(self, featuresData, featureId):
    FeatureSupport.__init__(self, featuresData, featureId)
    self.stemmer = Stemmer('english')
    self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
    stopListFn = './resources/general/stopword.csv'
    # Build the stop list from stemmed, normalized entries, dropping empty results
    self.stopList = frozenset(filter(None, (self.preprocess(line) for line in open(stopListFn, 'rt'))))
  
  def preprocess(self, s):
    chars = []
    for c in unidecode(s.strip().lower()):
      if c in self.goodChars:
        chars.append(c)
    word = ''.join(chars)
    return self.stemmer.stemWord(word)
  
  def extract(self, i):
    bag = frozenset(map(lambda w: self.preprocess(w), filter(None, self[i].split())))
    ret = bag - self.stopList
    if len(ret) == 0:
      # Fall back to one random 20-letter token so the bag is never empty
      ret = frozenset([''.join(random.choice('abcdefghjiklmnopqrstuvwxyz') for _ in range(20))])
    return ret
  
  def similarity(self, a, b):
    # Jaccard similarity; two empty bags count as identical
    num = len(a & b)
    den = len(a | b)
    return num / den if den != 0 else 1.0
Developer ID: unoduetre, Project: MovieRecommendator, Lines: 28, Source: bag-of-words.py

Example 2: run

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
def run():
    stemmer = Stemmer("english")
    pages = db.en.find()
    print colored.yellow("statistic words") 
    wordstatistic = {}
    for page in progress.bar(pages,size=db.en.count()):
        data = page.get("data")
        if not data:continue
        content = data.get("content")
        if not content:
            db.en.remove({"_id":page["_id"]})
            continue
        words = EN_WORD_CUT.split(content)
        for word in words:
            w=stemmer.stemWord(word.strip()).lower()
            if w and len(w)<20 and not w in EN_IGNORE:
                if wordstatistic.get(w):
                    wordstatistic[w]+=1
                else:
                    wordstatistic[w]=1

    
    print colored.yellow("save to en_words_freq")
    savequene = []
    for k,v in progress.bar(wordstatistic.iteritems(),size=len(wordstatistic)):
        savequene.append({"_id":k,"freq":v})
        if len(savequene) >=1000:
            db.en_words_freq.insert(savequene)
            savequene=[]
        
    if savequene:db.en_words_freq.insert(savequene)
    print colored.cyan(
            "count of en_words_freq: %d" % db.en_words_freq.count())
Developer ID: shanzi, Project: gather, Lines: 35, Source: statistic_words_freqency.py

Example 3: getStems

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
def getStems(cleanedText, stopWords):
    # termDict and terms are module-level indexes shared across documents
    stems = {}
    matches = re.finditer(r'\w+(\.?\w+)*', cleanedText.strip(), flags=re.IGNORECASE)
    stemmer = Stemmer('english')
    offset = len(termDict)
    tokenid = offset + 1
    position = 0
    for match in matches:
        position += 1
        token = match.group()
        filteredToken = filterToken(token, stopWords)
        if filteredToken:
            wordStem = stemmer.stemWord(filteredToken.lower())
            if wordStem not in stems:
                stems[wordStem] = tokenid
                positions = set()
                positions.add(position)
                if wordStem not in termDict:
                    termDict[wordStem] = tokenid
                    terms[tokenid] = positions
                    tokenid = tokenid + 1
                else:
                    stemid = termDict[wordStem]
                    terms[stemid] = positions
            else:
                stemid = termDict[wordStem]
                postns = terms[stemid]
                postns.add(position)
                terms[stemid] = postns
Developer ID: mayankn, Project: InfoRetrieval, Lines: 36, Source: tokenize.py

Example 4: BagOfWordsFeatureBooleanizer

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
class BagOfWordsFeatureBooleanizer(FeatureBooleanizer):
  def __init__(self, featureName, featuresData, featureId):
    FeatureBooleanizer.__init__(self, featureName, featuresData, featureId)
    self.stemmer = Stemmer('english')
    self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
    stopListFn = './resources/general/stopword.csv'
    # Build the stop list from stemmed, normalized entries, dropping empty results
    self.stopList = frozenset(filter(None, (self.preprocess(line) for line in open(stopListFn, 'rt'))))
    allWords = set()
    if self.featureName == 'Basic: Tagline':
      for row in featuresData: allWords |= set(map(lambda w: self.preprocess(w), filter(None, row[featureId].split(','))))
    else:
      for row in featuresData: allWords |= set(map(lambda w: self.preprocess(w), filter(None, row[featureId].split())))
    self.words = sorted(list(filter(None, allWords - self.stopList)))
  
  def preprocess(self, s):
    chars = []
    for c in unidecode(s.strip().lower()):
      if c in self.goodChars:
        chars.append(c)
    word = ''.join(chars)
    return self.stemmer.stemWord(word)
  
  def getFeatureNames(self):
    return [self.featureName + ': ' + word for word in self.words]
  
  def process(self, v):
    vWords = set(map(lambda w: self.preprocess(w), filter(None, v.split(','))))
    return [(word in vWords) for word in self.words]
Developer ID: unoduetre, Project: MovieRecommendator, Lines: 30, Source: bag-of-words.py

Example 5: StemProvider

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
class StemProvider(Provider):
    """Stem the input values (either a single word or a list of words)

    Uses the Snowball (Porter-style) stemming algorithms.
    """
    def __init__(self, language='english', **kwargs):
        """
        See here for a full list of languages:

            http://nltk.org/_modules/nltk/stem/snowball.html

        .. note::

            This does not depend on nltk, it depends on the ``pystemmer`` package.

        :param language: language to use during stemming, defaults to english.
        """
        Provider.__init__(self, **kwargs)
        self._stemmer = Stemmer(language)

    def do_process(self, input_value):
        if isinstance(input_value, str):
            return self._stemmer.stemWord(input_value)
        else:
            return self._stemmer.stemWords(input_value)
Developer ID: mardix, Project: libmunin, Lines: 27, Source: stem.py

Example 6: Stemmer

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
            class Stemmer(object):
                def __init__(self):
                    # type: () -> None
                    self.stemmer = PyStemmer('porter')

                def stem(self, word):
                    # type: (unicode) -> unicode
                    return self.stemmer.stemWord(word)
Developer ID: JelteF, Project: sphinx, Lines: 10, Source: en.py
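
A detail worth noting in this example: PyStemmer's 'porter' selects the original Porter algorithm, while 'english' selects the newer Snowball/Porter2 variant, and the two can disagree on some words. A minimal comparison sketch (the exact outputs depend on the installed Snowball version, so the behavior noted in the comment is illustrative):

from Stemmer import Stemmer as PyStemmer

porter = PyStemmer('porter')     # classic Porter algorithm
english = PyStemmer('english')   # Porter2 / Snowball English
word = 'generously'
print(porter.stemWord(word), english.stemWord(word))  # may print two different stems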

Example 7: make_index

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
def make_index(expression):
    """
    Standardize the expression and return a tuple that maximizes
    matching possibilities.
    expression must be a list or tuple.
    """
    stemmer = Stemmer("french")
    expression = [stemmer.stemWord(normalize_token(w)) for w in expression]
    expression.sort()
    return tuple(expression)
Developer ID: jmvanel, Project: sulci, Lines: 12, Source: textutils.py
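
For context, a hedged usage sketch of what make_index produces; normalize_token is defined elsewhere in sulci's textutils, so the stand-in below is an assumption rather than the project's actual helper:

from Stemmer import Stemmer

def normalize_token(w):
    # Stand-in (assumption): lowercase and keep only alphanumeric characters
    return ''.join(c for c in w.lower() if c.isalnum())

stemmer = Stemmer("french")
expression = ["Chansons", "chanteurs"]
key = tuple(sorted(stemmer.stemWord(normalize_token(w)) for w in expression))
print(key)  # a stable, order-independent matching key, e.g. ('chanson', 'chanteur')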

Example 8: processQueries

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
def processQueries(queries):
    queryList = []
    stemmer = Stemmer('english')  # create the stemmer once rather than per query
    for query in queries:
        filteredQuery = tokenize.filterToken(query, tokenize.getStopWords())
        if filteredQuery:
            queryStem = stemmer.stemWord(filteredQuery.lower())
            queryList.append(queryStem)

    return queryList
Developer ID: mayankn, Project: InfoRetrieval, Lines: 12, Source: read_index.py

Example 9: parse_html

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
    def parse_html(html):
        words = dehtml(html)

        s = Stemmer("danish")

        result = []
        for w in words.split():
            word = w.lower()
            if word in stop_words or len(word) < 2 or word.count('\\'):
                continue

            result.append(s.stemWord(word))
        return result
Developer ID: Roknahr, Project: pyCrawler, Lines: 15, Source: Indexer.py

Example 10: getTerm

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
def getTerm(term):
    term_ids = {}
    stemmer = Stemmer('english')
    termStem = stemmer.stemWord(term.lower())  # stem the query term once, outside the loop
    with open(TERMIDSFILE, 'rU') as term_ids_file:
        for line in term_ids_file:
            pieces = line.strip().split('\t')
            if termStem == pieces[1]:
                term_ids[pieces[1]] = int(pieces[0])
                return term_ids
    return term_ids
Developer ID: mayankn, Project: InfoRetrieval, Lines: 17, Source: read_index.py

Example 11: TextEater

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
class TextEater(object):
    
    def __init__(self):
        self.stoplist = gen_stops()
        self.stemmer = Stemmer('english')
    
    @coroutine
    def sent_filter(self, target):
        print "ready to eat lines"
        while True:
            sentence = (yield)
            target.send(sentence.lower().split())

    @coroutine
    def word_filter(self, target):
        print "ready to eat words"
        while True:
            raw = (yield)
            # Keep content words: drop short tokens and stop words before stemming
            target.send([self.stemmer.stemWord(w) for w in raw
                         if len(w) > 3 and w not in self.stoplist])


    @coroutine
    def ngrams(self, container, n=2):
        "Compute n-grams"
        while True:
            grams = (yield)
            for i in range(0, len(grams) - (n - 1)):
                container[tuple(grams[i:i+n])] += 1
               
    @coroutine
    def printer(self):
        while True:
            line = (yield)
            print (line)

    @coroutine
    def typer(self,target):
        print "ready to check type"
        word = None
        while True:
            line = (yield word)
            word=  type(line)
Developer ID: enjoylife, Project: mlsocial, Lines: 46, Source: flowtest.py
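
The @coroutine decorator used throughout this class is not shown in the excerpt. A common implementation (an assumption about this project, but the standard priming recipe) advances the generator to its first yield so it can receive send() calls immediately:

def coroutine(func):
    def wrapper(*args, **kwargs):
        gen = func(*args, **kwargs)
        gen.next()  # prime the generator (next(gen) on Python 3)
        return gen
    return wrapper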

Example 12: index

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
def index(text, accepted_languages=None, langs=None):
    registry = get_current_registry()
    if accepted_languages is None:
        accepted_languages = [x.strip() for x in
                              registry.settings["accepted_languages"].split(",")]
    if langs is None:
        lang = guessLanguage(text)
        if lang not in accepted_languages:
            langs = accepted_languages
        else:
            langs = [lang]
    langs = list(set(langs).intersection(set(accepted_languages)))
    if not langs:
        langs = accepted_languages
    indexed_words = set()
    for lang in langs:
        stemmer = Stemmer(lang)
        indexed_words.update([stemmer.stemWord(x.value) for x in
                             tokenize(text)])
    return indexed_words
Developer ID: do3cc, Project: Scanned-Docs, Lines: 23, Source: index.py

Example 13: Overview

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
class Overview(Feature):
  description = """
Basic: Overview
""".strip()

  def __init__(self, *args, **kwargs):
    Feature.__init__(self)
    self.stemmer = Stemmer('english')
    self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
    self.stopList = frozenset(['a', 'abaft', 'aboard', 'about', 'abov', 'absent', 'accord', 'account', 'across', 'addit', 'afor', 'after', 'against', 'ago', 'ahead', 'all', 'along', 'alongsid', 'alreadi', 'also', 'am', 'amid', 'amidst', 'among', 'amongst', 'an', 'and', 'anenst', 'ani', 'anoth', 'anybodi', 'anyhow', 'anyon', 'anyth', 'anywher', 'apart', 'apr', 'april', 'apropo', 'apud', 'are', 'around', 'as', 'asid', 'astrid', 'at', 'athwart', 'atop', 'aug', 'august', 'back', 'bad', 'bar', 'be', 'becaus', 'been', 'befor', 'begin', 'behalf', 'behest', 'behind', 'below', 'beneath', 'besid', 'best', 'better', 'between', 'beyond', 'big', 'bigger', 'biggest', 'billion', 'blah', 'bln', 'both', 'but', 'by', 'c', 'ca', 'call', 'can', 'cannot', 'cant', 'case', 'circa', 'close', 'concern', 'could', 'couldt', 'current', 'daili', 'day', 'dec', 'decemb', 'despit', 'did', 'do', 'doe', 'doesnt', 'done', 'dont', 'down', 'due', 'dure', 'each', 'eight', 'eighteen', 'eighth', 'eighti', 'eleven', 'end', 'enough', 'ever', 'except', 'exclud', 'fail', 'far', 'feb', 'februari', 'few', 'fifth', 'first', 'five', 'fiveteen', 'fivti', 'follow', 'for', 'forenenst', 'four', 'fourteen', 'fourth', 'fourti', 'fri', 'friday', 'from', 'front', 'full', 'further', 'get', 'given', 'go', 'gone', 'goot', 'had', 'hadnt', 'has', 'hasnt', 'have', 'havent', 'he', 'her', 'here', 'herself', 'high', 'higher', 'hightst', 'himself', 'his', 'how', 'hunderd', 'i', 'if', 'in', 'includ', 'insid', 'instead', 'into', 'is', 'it', 'itself', 'jan', 'januari', 'jul', 'juli', 'jun', 'june', 'just', 'last', 'late', 'later', 'latest', 'left', 'lest', 'lieu', 'like', 'littl', 'long', 'low', 'lower', 'lowest', 'made', 'make', 'mani', 'mar', 'march', 'may', 'me', 'mean', 'mid', 'midst', 'might', 'milliard', 'million', 'mine', 'minus', 'mld', 'mln', 'modulo', 'mon', 'monday', 'month', 'more', 'most', 'mth', 'much', 'must', 'my', 'myself', 'near', 'need', 'neednt', 'neither', 'never', 'next', 'nine', 'nineteen', 'nineth', 'nineti', 'no', 'none', 'nor', 'not', 'notwithstand', 'nov', 'novemb', 'number', 'o', 'oct', 'octob', 'of', 'off', 'on', 'one', 'onli', 'onto', 'oppos', 'opposit', 'or', 'order', 'other', 'ought', 'our', 'ourselv', 'out', 'outsid', 'over', 'owe', 'pace', 'past', 'per', 'place', 'plus', 'point', 'previous', 'prior', 'pro', 'pursuant', 'put', 'qua', 'rather', 'recent', 'regard', 'regardless', 'respect', 'right', 'round', 'said', 'sake', 'same', 'san', 'sat', 'saturday', 'save', 'saw', 'say', 'second', 'see', 'seen', 'sep', 'septemb', 'seven', 'seventeen', 'seventh', 'seventi', 'sever', 'shall', 'she', 'should', 'shouldnt', 'show', 'shown', 'sinc', 'six', 'sixteen', 'sixth', 'sixti', 'small', 'smaller', 'smallest', 'so', 'some', 'somebodi', 'somehow', 'someon', 'someth', 'somewher', 'soon', 'sooner', 'spite', 'start', 'still', 'subsequ', 'such', 'sun', 'sunday', 'take', 'taken', 'tell', 'ten', 'tenth', 'than', 'thank', 'that', 'the', 'their', 'them', 'themselv', 'there', 'these', 'they', 'third', 'thirteen', 'thirti', 'this', 'those', 'thousand', 'three', 'through', 'throughout', 'thru', 'thruout', 'thu', 'thursday', 'till', 'time', 'to', 'today', 'told', 'too', 'took', 'top', 'toward', 'tue', 'tuesday', 'twelv', 'twenti', 'two', 'under', 'underneath', 'unit', 'unlik', 'until', 'unto', 'up', 'upon', 'us', 'use', 'versus', 'via', 'vice', 'view', 'virtu', 'vis', 'visavi', 'vs', 'was', 'we', 'wed', 'wednesday', 'week', 'well', 'went', 'were', 'what', 'when', 'where', 'whether', 'whi', 'which', 'while', 'who', 'whose', 'will', 'with', 'within', 
'without', 'wont', 'wors', 'worst', 'worth', 'would', 'wrt', 'xor', 'year', 'yes', 'yesterday', 'yet', 'you', 'your', 'yourself', 'yourselv', 'yr'])
  
  def preprocess(self, s):
    chars = []
    for c in unidecode(s.strip().lower()):
      if c in self.goodChars:
        chars.append(c)
    word = ''.join(chars)
    return self.stemmer.stemWord(word)
  
  def extract(self, m):
    t = m.overview
    words = set(filter(lambda w: len(w) > 0 and w not in self.stopList, map(self.preprocess, t.split())))
    return ','.join(sorted(words))
Developer ID: unoduetre, Project: MovieRecommendator, Lines: 24, Source: Overview.py

Example 14: stemmer

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
def stemmer(listofTokens):                                          # Stemming
  stemmer = Stemmer("english")
  stemmedWords = [stemmer.stemWord(key) for key in listofTokens]
  return stemmedWords
Developer ID: ashg1910, Project: ire_mini_project, Lines: 6, Source: textProcessing.py

Example 15: Searcher

# Required import: from Stemmer import Stemmer [as alias]
# Or: from Stemmer.Stemmer import stemWord [as alias]
class Searcher(object):
    """Run a search on documents or objects within documents
    in the SQLite table
    Three scoring options are available: Frequency, TF-IDF and BM25
    Two methods of incrementing the scores of results are available:
    simple addition or best score"""
    
    
    def __init__(self, query, db, doc_level_search=True, stemmer=False, path='/var/lib/philologic/databases/'):
        self.path = path + db + '/'
        self.words = query.split()
        self.doc_level_search = doc_level_search
        self.results = {}
        if doc_level_search:
            self.doc_path = self.path + 'doc_arrays/'
        else:
            self.doc_path = self.path + 'obj_arrays/'
        self.stemmer = stemmer
        if stemmer:
            try:
                from Stemmer import Stemmer
                self.stemmer = Stemmer(stemmer) # where stemmer is the language selected
                self.words = [self.stemmer.stemWord(word) for word in self.words]
            except KeyError:
                print >> sys.stderr, "Language not supported by stemmer. No stemming will be done."
            except ImportError:
                print >> sys.stderr, "PyStemmer is not installed on your system. No stemming will be done."            
        
    def get_hits(self, word, doc=True):
        """Query the SQLite table and return a list of tuples containing the results"""
        cursor = sqlite_conn(self.path + 'hits_per_word.sqlite')
        if self.doc_level_search:
            cursor.execute('select doc_id, word_freq, total_words from doc_hits where word=?', (word,))
        else:
            cursor.execute('select obj_id, word_freq, total_words from obj_hits where word=?', (word,))
        return cursor.fetchall()
        
    def id_to_word(self, id):
        """Return the word given its ID"""
        m = mapper(self.path)
        return m[id]
        
    def get_idf(self, hits):
        """Return IDF score"""
        total_docs = doc_counter(self.doc_path) #### WRONG COUNT
        try:
            return log(float(total_docs) / float(len(hits))) + 1
        except ZeroDivisionError:
            return 0
               
    def search(self, measure='tf_idf', scoring='simple_scoring', intersect=False, display=10):
        """Searcher function"""
        self.intersect = False
        if self.words:
            for word in self.words:
                hits = self.get_hits(word)
                getattr(self, measure)(hits, scoring)
                if intersect:
                    if self.intersect:
                        self.docs = self.docs.intersection(self.new_docs)
                        self.new_docs = set([])
                    else:
                        self.intersect = True
                        self.docs = set([obj_id for obj_id in self.results])
                        self.new_docs = set([])
            if intersect:
                self.results = dict([(obj_id, self.results[obj_id]) for obj_id in self.results if obj_id in self.docs])
            return sorted(self.results.iteritems(), key=itemgetter(1), reverse=True)[:display]
        else:
            return []
    
    def debug_score(self, hits, scoring):
        for obj_id, word_freq, word_sum in hits:
            getattr(self, scoring)(obj_id, word_freq)
    
    def tf_idf(self, hits, scoring):
        idf = self.get_idf(hits)
        for obj_id, word_freq, word_sum in hits:
            tf = float(word_freq) / float(word_sum)
            score = tf * idf
            getattr(self, scoring)(obj_id, score)
                    
    def frequency(self, hits, scoring):
        for obj_id, word_freq, word_sum in hits:
            score = float(word_freq) / float(word_sum)
            getattr(self, scoring)(obj_id, score)
                    
    def bm25(self, hits, scoring, k1=1.2, b=0.75):
        ## a floor is applied to normalized length of doc
        ## in order to diminish the importance of small docs
        ## see http://xapian.org/docs/bm25.html
        idf = self.get_idf(hits)
        avg_dl = avg_doc_length(self.path)
        for obj_id, word_freq, obj_length in hits:
            tf = float(word_freq)
            dl = float(obj_length)
            temp_score = tf * (k1 + 1.0)
            temp_score2 = tf + k1 * ((1.0 - b) + b * floor(dl / avg_dl))
            score = idf * temp_score / temp_score2
            getattr(self, scoring)(obj_id, score)
#......... remaining code omitted .........
Developer ID: clovis, Project: philo_extensions, Lines: 103, Source: ranked_relevance.py
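
For reference, the scores computed by tf_idf and bm25 above reduce to the following formulas, where tf is the word's frequency in the object, dl the object length, and avgdl the average length across the collection:

tf_idf: score = (word_freq / total_words) * (log(total_docs / doc_freq) + 1)
bm25:   score = idf * tf * (k1 + 1) / (tf + k1 * ((1 - b) + b * floor(dl / avgdl)))

The floor on dl / avgdl is the small-document penalty described in the xapian BM25 notes cited in the code.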


Note: The Stemmer.Stemmer.stemWord examples above were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers, and copyright remains with the original authors; consult each project's License before redistributing or reusing the code. Please do not reproduce this article without permission.