

Python Stemmer.Stemmer Class Code Examples

This article collects typical usage examples of the Stemmer.Stemmer class in Python. If you have been wondering what the Python Stemmer class does, how to use it, or what working examples look like, the curated class code examples below may help.


The following presents 15 code examples of the Stemmer class, sorted by popularity by default.
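
For orientation, here is a minimal usage sketch of the PyStemmer API that all of the examples below build on (the choice of 'english' and the sample words are purely illustrative):

import Stemmer

print(Stemmer.algorithms())                      # available languages, e.g. 'english', 'russian', 'french'
stemmer = Stemmer.Stemmer('english')
print(stemmer.stemWord('running'))               # stem a single word
print(stemmer.stemWords(['running', 'jumps']))   # stem a list of words
stemmer.maxCacheSize = 10000                     # optional: resize the internal stem cache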

Example 1: getStems

import re

from Stemmer import Stemmer

# termDict (stem -> term id), terms (term id -> positions) and filterToken are
# module-level names defined elsewhere in tokenize.py (see the hypothetical
# harness after this example).
def getStems(cleanedText, stopWords):
    stems = {}
    matches = re.finditer(r'\w+(\.?\w+)*', cleanedText.strip(), flags=re.IGNORECASE)
    stemmer = Stemmer('english')
    offset = len(termDict)
    tokenid = offset + 1
    position = 0
    for match in matches:
        position += 1
        token = match.group()
        filteredToken = filterToken(token, stopWords)
        if filteredToken:
            wordStem = stemmer.stemWord(filteredToken.lower())
            if wordStem not in stems:
                positions = {position}
                if wordStem not in termDict:
                    # first time this stem is seen anywhere: assign a fresh id
                    stems[wordStem] = tokenid
                    termDict[wordStem] = tokenid
                    terms[tokenid] = positions
                    tokenid += 1
                else:
                    # stem already has an id from an earlier document: reuse it
                    stemid = termDict[wordStem]
                    stems[wordStem] = stemid
                    terms[stemid] = positions
            else:
                stemid = termDict[wordStem]
                terms[stemid].add(position)
Author: mayankn | Project: InfoRetrieval | Lines: 34 | Source: tokenize.py
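
Example 1 depends on module-level state in tokenize.py that this page does not show. A hypothetical harness sketching the assumed shapes of those names (guesswork for illustration only, not the project's actual code):

termDict = {}   # stem -> term id, shared across documents
terms = {}      # term id -> set of token positions

def filterToken(token, stopWords):
    # assumed behavior: drop stop words, pass other tokens through unchanged
    return None if token.lower() in stopWords else token

getStems("Cats are running and cats ran", {"are", "and"})
print(termDict)   # e.g. {'cat': 1, 'run': 2, 'ran': 3}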

Example 2: classif

from math import log

from nltk.tokenize import regexp_tokenize
from Stemmer import Stemmer


def classif(text, mass, num_all_docs, num_words_unic):
    # text is assumed to be a str (the original Python 2 code decoded bytes here)
    stm = Stemmer('russian')
    # note: the bare \w+ branch of the alternation matches first, so hyphenated
    # words are split into parts; kept as in the original
    text = stm.stemWords(regexp_tokenize(text.lower(), r"(?x) \w+ | \w+(-\w+)*"))
    num_povt_words = 0
    summa = 0
    while_iter = 0
    while while_iter < len(mass):
        # true division in Python 3 makes the original "+ 0.0" coercions unnecessary
        summand_1 = log(mass[while_iter].num_docs / num_all_docs, 1.1)
        for i in text:
            for i1 in mass[while_iter].lst_allword:
                if i == i1:
                    num_povt_words = num_povt_words + 1
            summand_2 = log((num_povt_words + 1) / (num_words_unic + mass[while_iter].num_words), 1.1)
            num_povt_words = 0
            summa = summa + summand_2
        mass[while_iter].c = summand_1 + summa
        summa = 0
        while_iter = while_iter + 1

    max_c = float('-inf')  # safer than the original -100000 sentinel
    while_iter = 0
    number_max = 0

    while while_iter < len(mass):
        print(mass[while_iter].c)
        if mass[while_iter].c > max_c:
            max_c = mass[while_iter].c
            number_max = while_iter
        while_iter = while_iter + 1
    print(mass[number_max].name_categories)
Author: nsmalimov | Project: Naive_Bayes_classifier_Python | Lines: 30 | Source: BayesClassif.py

Example 3: StemProvider

class StemProvider(Provider):
    """Stem the input values (either a single word or a list of words)

    Uses the Snowball stemming algorithms via the PyStemmer package.
    """
    def __init__(self, language='english', **kwargs):
        """
        See here for a full list of languages:

            http://nltk.org/_modules/nltk/stem/snowball.html

        .. note::

            This does not depend on nltk, it depends on the ``pystemmer`` package.

        :param language: language to use during stemming, defaults to english.
        """
        Provider.__init__(self, **kwargs)
        self._stemmer = Stemmer(language)

    def do_process(self, input_value):
        if isinstance(input_value, str):
            return self._stemmer.stemWord(input_value)
        else:
            return self._stemmer.stemWords(input_value)
Author: mardix | Project: libmunin | Lines: 25 | Source: stem.py
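
A hypothetical usage sketch (Provider is libmunin's base class; that its constructor needs no extra arguments here is an assumption):

provider = StemProvider('english')
print(provider.do_process('running'))            # a single word -> a single stem
print(provider.do_process(['running', 'cats']))  # a list of words -> a list of stems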

Example 4: run

def run():
    # db, colored, progress, EN_WORD_CUT and EN_IGNORE come from the surrounding
    # module: a MongoDB handle, clint's coloring/progress helpers, a
    # word-splitting regex and a stop-word set, respectively
    stemmer = Stemmer("english")
    pages = db.en.find()
    print(colored.yellow("statistic words"))
    wordstatistic = {}
    for page in progress.bar(pages, size=db.en.count()):
        data = page.get("data")
        if not data:
            continue
        content = data.get("content")
        if not content:
            db.en.remove({"_id": page["_id"]})
            continue
        words = EN_WORD_CUT.split(content)
        for word in words:
            w = stemmer.stemWord(word.strip()).lower()
            if w and len(w) < 20 and w not in EN_IGNORE:
                wordstatistic[w] = wordstatistic.get(w, 0) + 1

    print(colored.yellow("save to en_words_freq"))
    savequeue = []
    for k, v in progress.bar(wordstatistic.items(), size=len(wordstatistic)):
        savequeue.append({"_id": k, "freq": v})
        if len(savequeue) >= 1000:
            db.en_words_freq.insert(savequeue)
            savequeue = []

    if savequeue:
        db.en_words_freq.insert(savequeue)
    print(colored.cyan(
        "count of en_words_freq: %d" % db.en_words_freq.count()))
Author: shanzi | Project: gather | Lines: 33 | Source: statistic_words_freqency.py

Example 5: _prepare_text

    def _prepare_text(self, text):
        """Extracts and stems the words from some given text."""
        # STOP_WORDS is a module-level collection defined elsewhere in mongosearch.py
        words = re.findall(r"[a-z0-9']+", text.lower())
        words = [word for word in words if word not in STOP_WORDS]
        stemmer = Stemmer('english')
        stemmed_words = stemmer.stemWords(words)
        return stemmed_words
Author: ktf | Project: DAS | Lines: 8 | Source: mongosearch.py

Example 6: train

import re

from nltk.tokenize import regexp_tokenize
from Stemmer import Stemmer


# unpack_line() and the Categories container are defined elsewhere in
# BayesClassif.py (see the sketch after this example).
def train(name_file_dbase, way_to_dbase):
    stm = Stemmer('russian')
    with open(name_file_dbase, 'r', encoding='utf-8') as file_base:
        Lines = file_base.readlines()
    num_all_docs = len(Lines) + 1

    mass = []
    iter1 = 0
    iter2 = 0

    for line in Lines:
        number1, address1 = unpack_line(line)
        number = number1.strip("\n")
        address = address1.strip("\n")
        if number == "1":
            mass.append(Categories())
            mass[iter1].name_categories = address1
            mass[iter1 - 1].num_docs = iter2
            iter1 = iter1 + 1
            iter2 = 0
        iter2 = iter2 + 1
    mass[len(mass) - 1].num_docs = iter2
    while_iter = 0

    number = 1

    while while_iter < len(mass):
        while number <= mass[while_iter].num_docs:
            path = (way_to_dbase + mass[while_iter].name_categories
                    + '/' + str(number) + 'forclass.txt')
            with open(path, 'r', encoding='utf-8') as file_forclass:
                str_read = re.sub(r"^\s+|\n|\r|\s+$", ' ', file_forclass.read())
            mass[while_iter].line_allword = mass[while_iter].line_allword + str_read
            number = number + 1
        while_iter = while_iter + 1
        number = 1

    while_iter = 0

    while while_iter < len(mass):
        forstemmer = mass[while_iter].line_allword
        str_read = stm.stemWords(regexp_tokenize(forstemmer.lower(), r"(?x) \w+ | \w+(-\w+)*"))
        mass[while_iter].num_words = len(str_read)
        mass[while_iter].lst_allword = str_read
        lst_unic_words = list(set(mass[while_iter].lst_allword))
        mass[while_iter].num_wordsunic = len(lst_unic_words)
        while_iter = while_iter + 1

    all_words = 0
    num_words_unic = 0
    while_iter = 0

    while while_iter < len(mass):
        all_words = all_words + mass[while_iter].num_words
        num_words_unic = num_words_unic + mass[while_iter].num_wordsunic
        while_iter = while_iter + 1
    return mass, num_all_docs, num_words_unic
Author: nsmalimov | Project: Naive_Bayes_classifier_Python | Lines: 58 | Source: BayesClassif.py
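
train() and classif() rely on a Categories container defined elsewhere in BayesClassif.py. Judging from the fields the two functions touch, it is roughly this shape (a reconstruction, not the project's actual code):

class Categories(object):
    def __init__(self):
        self.name_categories = ''  # category label read from the index file
        self.num_docs = 0          # number of training documents in the category
        self.line_allword = ''     # concatenated raw text of those documents
        self.lst_allword = []      # stemmed tokens of line_allword
        self.num_words = 0         # len(lst_allword)
        self.num_wordsunic = 0     # number of unique stems
        self.c = 0.0               # log-score filled in by classif()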

Example 7: get_search_phrases

    def get_search_phrases(self, indexing_func=None):
        """Returns search phrases from properties in a given Model instance.

        Args (optional):
            indexing_func: A function that returns a set of keywords or phrases.

        Note that the indexing_func can be passed in to allow more customized
        search phrase generation.

        Three model variables influence the output of this method:
            INDEX_ONLY: If None, all indexable properties are indexed.
                If a list of property names, only those properties are indexed.
            INDEX_MULTI_WORD: Class variable that allows multi-word search
                phrases like "statue of liberty."
            INDEX_STEMMING: Returns stemmed phrases.
        """
        # Python 2 / Google App Engine datastore code (basestring, iteritems)
        if not indexing_func:
            klass = self.__class__
            if klass.INDEX_MULTI_WORD:
                indexing_func = klass.get_search_phraseset
            else:
                indexing_func = klass.get_simple_search_phraseset
        if self.INDEX_STEMMING:
            stemmer = Stemmer('english')
        phrases = set()

        # allow indexing of 'subentities' such as tasks of a list as well
        queries = [(self, self.INDEX_ONLY)] + self.INDEX_SUBENTITY_QUERIES
        for query, props in queries:
            entities = []
            try:
                subentities = query(self).fetch(1000)
                # get all of them, 1000 at a time
                while len(subentities) > 0:
                    entities.extend(subentities)
                    last_key = subentities[-1].key()
                    subentities = query(self).order('__key__').filter('__key__ >', last_key).fetch(1000)
            except TypeError:  # query is not callable because it's an actual entity
                entities = [query]
            for entity in entities:
                for prop_name, prop_value in entity.properties().iteritems():
                    if not props or prop_name in props:
                        values = prop_value.get_value_for_datastore(entity)
                        if not isinstance(values, list):
                            values = [values]
                        if (isinstance(values[0], basestring) and
                                not isinstance(values[0], datastore_types.Blob)):
                            for value in values:
                                words = indexing_func(value, add_stop_words=self.INDEX_ADD_STOP_WORDS)
                                if self.INDEX_STEMMING:
                                    stemmed_words = set(stemmer.stemWords(words))
                                    phrases.update(stemmed_words)
                                else:
                                    phrases.update(words)
        return phrases
Author: cheesun | Project: chees-test | Lines: 56 | Source: search.py

Example 8: make_index

def make_index(expression):
    """
    Standardize the expression and return a tuple that maximizes
    matching possibilities.
    expression must be a list or tuple.
    """
    # normalize_token() is defined elsewhere in textutils.py
    # (see the sketch after this example)
    stemmer = Stemmer("french")
    expression = [stemmer.stemWord(normalize_token(w)) for w in expression]
    expression.sort()
    return tuple(expression)
Author: jmvanel | Project: sulci | Lines: 10 | Source: textutils.py
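
normalize_token() is defined elsewhere in sulci's textutils.py and is not shown on this page. A minimal stand-in illustrating what such a helper might do for French text (the lowercasing and accent-stripping behavior is an assumption, not the project's actual implementation):

import unicodedata

def normalize_token(token):
    # lowercase, then strip combining accents so 'Élysée' and 'elysee' compare equal
    token = unicodedata.normalize('NFKD', token.strip().lower())
    return ''.join(c for c in token if not unicodedata.combining(c))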

Example 9: processQueries

def processQueries(queries):
    queryList = []
    stemmer = Stemmer('english')  # create the stemmer once, not per query
    for query in queries:
        filteredQuery = tokenize.filterToken(query, tokenize.getStopWords())
        if filteredQuery:
            queryStem = stemmer.stemWord(filteredQuery.lower())
            queryList.append(queryStem)

    return queryList
Author: mayankn | Project: InfoRetrieval | Lines: 10 | Source: read_index.py

Example 10: parse_html

    def parse_html(html):
        # dehtml() and stop_words come from the surrounding module
        # (see the sketch after this example)
        words = dehtml(html)

        s = Stemmer("danish")

        result = []
        for w in words.split():
            word = w.lower()
            if word in stop_words or len(word) < 2 or '\\' in word:
                continue

            result.append(s.stemWord(word))
        return result
Author: Roknahr | Project: pyCrawler | Lines: 13 | Source: Indexer.py
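
dehtml() comes from elsewhere in pyCrawler's Indexer.py. A minimal stand-in built on the standard library's html.parser (the function name is from the original; the join-with-spaces behavior is an assumption):

from html.parser import HTMLParser

class _TextExtractor(HTMLParser):
    # collects the text content of an HTML document, ignoring the tags
    def __init__(self):
        super().__init__()
        self.chunks = []

    def handle_data(self, data):
        self.chunks.append(data)

def dehtml(html):
    extractor = _TextExtractor()
    extractor.feed(html)
    return ' '.join(extractor.chunks)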

Example 11: getTerm

def getTerm(term):
    # TERMIDSFILE is a module-level constant defined elsewhere in read_index.py
    term_ids = {}
    stemmer = Stemmer('english')
    termStem = stemmer.stemWord(term.lower())  # stem once, outside the loop
    with open(TERMIDSFILE, 'r') as term_ids_file:
        for line in term_ids_file:
            pieces = line.strip().split('\t')
            if termStem == pieces[1]:
                term_ids[pieces[1]] = int(pieces[0])
                return term_ids
    return term_ids
Author: mayankn | Project: InfoRetrieval | Lines: 15 | Source: read_index.py

Example 12: stem_words

    def stem_words(self, words: List[str]) -> List[str]:
        """Stem list of words with PyStemmer."""
        language_code = self.language_code()
        words = decode_object_from_bytes_if_needed(words)

        # Validate the inputs *before* iterating over them (the original code
        # performed the None checks only after the list comprehension below,
        # so they could never fire)
        if language_code is None:
            raise McLanguageException("Language code is None.")

        if words is None:
            raise McLanguageException("Words to stem is None.")

        # Normalize apostrophe so that "it’s" and "it's" get treated identically (it's being done in
        # _tokenize_with_spaces() too but let's not assume that all tokens that are to be stemmed go through sentence
        # tokenization first)
        words = [word.replace("’", "'") for word in words]

        # (Re-)initialize stemmer if needed
        if self.__pystemmer is None:

            try:
                self.__pystemmer = PyStemmer(language_code)
            except Exception as ex:
                raise McLanguageException(
                    "Unable to initialize PyStemmer for language '%s': %s" % (language_code, str(ex),)
                )

        stems = self.__pystemmer.stemWords(words)

        if len(words) != len(stems):
            log.warning("Stem count is not the same as word count; words: %s; stems: %s" % (str(words), str(stems),))

        # Perl's Snowball implementation used to return lowercase stems
        stems = [stem.lower() for stem in stems]

        return stems
Author: berkmancenter | Project: mediacloud | Lines: 35 | Source: __init__.py

Example 13: BagOfWordsFeatureSupport

class BagOfWordsFeatureSupport(FeatureSupport):
  def __init__(self, featuresData, featureId):
    FeatureSupport.__init__(self, featuresData, featureId)
    self.stemmer = Stemmer('english')
    self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
    stopListFn = './resources/general/stopword.csv'
    with open(stopListFn, 'rt') as stopListFile:
      self.stopList = frozenset(filter(None, (self.preprocess(l) for l in stopListFile)))
  
  def preprocess(self, s):
    chars = []
    for c in unidecode(s.strip().lower()):
      if c in self.goodChars:
        chars.append(c)
    word = ''.join(chars)
    return self.stemmer.stemWord(word)
  
  def extract(self, i):
    bag = frozenset(map(lambda w: self.preprocess(w), filter(None, self[i].split())))
    ret = bag - self.stopList
    if len(ret) == 0: ret = frozenset([''.join(random.choice('abcdefghjiklmnopqrstuvwxyz') for _ in range(20))])
    return ret
  
  def similarity(self, a, b):
    num = len(a & b)
    den = len(a | b)
    return num / den if den != 0 else 1.0
Author: unoduetre | Project: MovieRecommendator | Lines: 26 | Source: bag-of-words.py

Example 14: BagOfWordsFeatureBooleanizer

class BagOfWordsFeatureBooleanizer(FeatureBooleanizer):
  def __init__(self, featureName, featuresData, featureId):
    FeatureBooleanizer.__init__(self, featureName, featuresData, featureId)
    self.stemmer = Stemmer('english')
    self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
    stopListFn = './resources/general/stopword.csv'
    with open(stopListFn, 'rt') as stopListFile:
      self.stopList = frozenset(filter(None, (self.preprocess(l) for l in stopListFile)))
    allWords = set()
    if self.featureName == 'Basic: Tagline':
      for row in featuresData: allWords |= set(map(lambda w: self.preprocess(w), filter(None, row[featureId].split(','))))
    else:
      for row in featuresData: allWords |= set(map(lambda w: self.preprocess(w), filter(None, row[featureId].split())))
    self.words = sorted(filter(None, allWords - self.stopList))
  
  def preprocess(self, s):
    chars = []
    for c in unidecode(s.strip().lower()):
      if c in self.goodChars:
        chars.append(c)
    word = ''.join(chars)
    return self.stemmer.stemWord(word)
  
  def getFeatureNames(self):
    return [self.featureName + ': ' + word for word in self.words]
  
  def process(self, v):
    vWords = set(map(lambda w: self.preprocess(w), filter(None, v.split(','))))
    return [(word in vWords) for word in self.words]
Author: unoduetre | Project: MovieRecommendator | Lines: 28 | Source: bag-of-words.py

Example 15: Stemmer

            class Stemmer(object):
                def __init__(self):
                    # type: () -> None
                    self.stemmer = PyStemmer('porter')

                def stem(self, word):
                    # type: (unicode) -> unicode
                    return self.stemmer.stemWord(word)
Author: JelteF | Project: sphinx | Lines: 8 | Source: en.py
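
As an aside, PyStemmer exposes both the classic 'porter' algorithm used above and the newer 'english' (Porter2) algorithm, and the two can produce different stems for the same word. A quick comparison sketch:

import Stemmer

word = 'generously'
print(Stemmer.Stemmer('porter').stemWord(word))   # classic Porter stem
print(Stemmer.Stemmer('english').stemWord(word))  # Porter2 stem; may differ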


Note: The Stemmer.Stemmer class examples in this article were compiled by 纯净天空 from GitHub, MSDocs and other open-source code and documentation platforms. The code snippets are drawn from open-source projects contributed by various developers; copyright belongs to the original authors, and distribution and use are subject to each project's License. Do not reproduce without permission.