Python EnglishStemmer.stem Method Code Examples

This article collects typical usage examples of the Python method nltk.stem.snowball.EnglishStemmer.stem. If you are unsure what EnglishStemmer.stem does, how to call it, or where to find working examples, the curated code samples below should help. You can also explore further usage examples of the enclosing class, nltk.stem.snowball.EnglishStemmer.


The 15 code examples of EnglishStemmer.stem below are ordered by popularity by default.
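
Before the project-level examples, here is a minimal self-contained sketch (my own, not taken from any of the projects below; it assumes NLTK is installed) showing the method's basic behavior, with expected Snowball outputs in comments:

# Minimal usage sketch of nltk.stem.snowball.EnglishStemmer.stem
from nltk.stem.snowball import EnglishStemmer

stemmer = EnglishStemmer()
print(stemmer.stem("running"))     # -> "run"
print(stemmer.stem("generously"))  # -> "generous"
print(stemmer.stem("Cats"))        # -> "cat" (stem() lowercases its input)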

Example 1: fix_lemma_problem

# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Or: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
import numpy as np

def fix_lemma_problem(pred_scores, targets, space):
    from nltk.stem.snowball import EnglishStemmer
    es = EnglishStemmer()
    r = pred_scores.copy()
    lemmas = np.array([es.stem(v) for v in space.vocab])
    for i, t in enumerate(targets):
        g = es.stem(space.vocab[t])
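        # mask out every vocab item whose stem matches the target's stem,
        # so trivially related word forms cannot be ranked as substitutes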
        mask = (lemmas == g)
        #print space.vocab[t], np.sum(mask)
        r[i][mask] = -1e9
        #print r[i][mask]
    return r
Developer: stephenroller, Project: naacl2016, Lines: 14, Source: lexsub.py

Example 2: similarity_score

# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Or: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
from math import exp

def similarity_score(word1, word2):
    """ see sections 2.3 and 2.4 of http://dx.doi.org.ezp-prod1.hul.harvard.edu/10.1109/TKDE.2003.1209005
    :type word1: string
    :type word2: string
    :return: float: between 0 and 1; similarity between two given words
    """
    stemmer = EnglishStemmer()
    if stemmer.stem(word1) == stemmer.stem(word2):
        return 1
    alpha = 0.2
    beta = 0.6
    l, h = get_path_length_and_subsumer_height(word1, word2)
    return exp((-1)*alpha*l)*((exp(beta*h)-exp((-1)*beta*h))/(exp(beta*h)+exp((-1)*beta*h)))
Developer: ReganBell, Project: QReview, Lines: 15, Source: Analyze.py
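
For reference, the return expression above is the word-similarity measure of sections 2.3 and 2.4 of the paper cited in the docstring; writing $l$ for the path length and $h$ for the subsumer height, it computes

$$\operatorname{sim}(w_1, w_2) = e^{-\alpha l} \cdot \frac{e^{\beta h} - e^{-\beta h}}{e^{\beta h} + e^{-\beta h}} = e^{-\alpha l} \tanh(\beta h),$$

with $\alpha = 0.2$ and $\beta = 0.6$, so the score decays exponentially with path length and saturates, via tanh, with subsumer depth.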

Example 3: getAllStemEntities

# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Or: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
import numpy as np

def getAllStemEntities(entities):
    st = EnglishStemmer()
    q = [',', '.', '!', '?', ':', ';']
    tmp = []
    sourceEntities = [x for x in entities if len(x)>0]
    np.random.shuffle(entities)

    for i in xrange(len(entities)):
        if len(entities[i]) == 0:
            continue
        if i % 1000 == 0:
            print i
        entities[i] = entities[i].lower()
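        # normalize dashes, parentheses, and apostrophes with surrounding
        # spaces so they are split off as separate tokens below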
        entities[i] = entities[i].replace(' - ', ' \u2013 ', entities[i].count(' - '))
        entities[i] = entities[i].replace(' -', ' \u2013', entities[i].count(' -'))
        entities[i] = entities[i].replace('- ', '\u2013 ', entities[i].count('- '))
        entities[i] = entities[i].replace('-', ' - ', entities[i].count('-'))
        entities[i] = entities[i].replace(')', ' )', entities[i].count(')'))
        entities[i] = entities[i].replace('(', '( ', entities[i].count('('))
        entities[i] = entities[i].replace('\u0027', ' \u0027', entities[i].count('\u0027'))
        for w in q:
            entities[i]=entities[i].replace(w, ' '+w, entities[i].count(w))
        word = entities[i].split(' ')
        s = ''
        for w in word:
            s  += st.stem(unicode(w)) + ' '
        tmp.append(s[:-1])
        if len(tmp) > 50:
            break

    return tmp, entities[:len(tmp)]
Developer: mikhaylova-daria, Project: NER, Lines: 33, Source: allFunctions.py

Example 4: query

# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Or: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
import MySQLdb

def query(word):
    db = MySQLdb.connect("127.0.0.1", "dizing", "ynr3", "dizing")
    cursor=db.cursor()
    snowball_stemmer = EnglishStemmer()
    stem2 = snowball_stemmer.stem(word)
    cursor.execute("SELECT * FROM words WHERE original=%s OR stem1=%s OR stem2=%s", (word,word,stem2))
    rows = cursor.fetchall()
    words1 = dict()
    words2 = dict()
    for row in rows:
        if row[1] == word or row[3]==word:
            words1[word] = row[0]
        else:
            words2[word] = row[0]
    scenes1 = []
    scenes2 = []
    for (i,words_dict) in [(1,words1), (2,words2)]:
        wids = words_dict.values()
        for wid in wids:
            sql = "SELECT s.sentence, s.start, s.stop, s.ready, m.title FROM scene AS s, words_scenes AS ws, movie as m " + \
                           "WHERE ws.wid=%d AND ws.sid=s.sid AND s.mid = m.mid" % int(wid)
            # print sql
            cursor.execute(sql)
            rows = cursor.fetchall()
            if (i==1): scenes1 += rows
            else: scenes2 += rows
    print scenes1
    print scenes2
    db.close()
    return scenes1 + scenes2
Developer: yasinzor, Project: videosozluk, Lines: 32, Source: query_word.py

Example 5: Granularity

# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Or: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def Granularity(sentenceArray):
    for sentence in sentenceArray:
        # print(sentence)
        try:

            stemmer = EnglishStemmer()
            sentence = re.sub(r'\#.*?$', '', sentence)
            sentence = re.sub(r'\#.*? ', '', sentence)
            sentence = re.sub(r'\@.*?$', '', sentence)
            sentence = re.sub(r'\@.*? ', '', sentence)
            sentence = re.sub(r'pic.twitter.*?$', '', sentence)
            sentence = re.sub(r'pic.twitter.*? ', '', sentence)
            sentence = re.sub(r'\'m', ' am', sentence)
            sentence = re.sub(r'\'d', ' would', sentence)
            sentence = re.sub(r'\'ll', ' will', sentence)
            sentence = re.sub(r'\&', 'and', sentence)
            sentence = re.sub(r'don\'t', 'do not', sentence)

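            # note: stem() expects a single token; applied to a whole
            # sentence it lowercases the text and can only alter its tail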
            data = stemmer.stem(sentence)
            print(data)
            from nltk.corpus import stopwords

            sentence = str(data)
            stop = stopwords.words('english')
            final = [i for i in sentence.split() if i not in stop]
            finalstring = ' '.join(final)
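            # `word` appears to come from the enclosing scope in the original project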
            os.system("printf \"" + str(finalstring) + "\n\">> stemstop/" + word)
        except Exception as e:
            print(e)
Developer: PgnDvd, Project: SNLP, Lines: 31, Source: Stemmer.py

Example 6: pre_proc

# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Or: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def pre_proc(in_str, removestop=True, alwayskeep=False, word_punc=False, unquote=False):
    # remove accents, wordify punctuation
    in_str = strip_accents(in_str, wordify=word_punc, unquote=unquote)
    en_stem = EnglishStemmer()
    # tokenize string
    if removestop:  # remove stop words
        tok_list = filter(lambda x: x not in stopwords.words('english'), wordpunct_tokenize(in_str))
    else:
        tok_list = wordpunct_tokenize(in_str)
    new_tok_list = []
    for tok in tok_list:
        if tok not in WORD_PUNC_LIST:
            correct_spell = HOBJ.spell(tok)
            if not correct_spell:
                suggestions = [strip_accents(tmp_sug).lower() for tmp_sug in HOBJ.suggest(tok)]
            else:
                suggestions = []
            if correct_spell or (tok.lower() in suggestions):
                new_tok_list.append(tok)
                tok_stem = en_stem.stem(tok)
                if tok_stem != tok:
                    new_tok_list.append(tok_stem)
            elif len(tok) >= 3:
                tok_sug = None
                lev_perc = .34
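                # accept a suggestion only when its normalized Levenshtein
                # distance (distance / longer word length) is below ~0.34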
                for sug in suggestions:
                    if not tok_sug and tok == sug[1:]:
                        tok_sug = sug
                if not tok_sug:
                    for sug in suggestions:
                        tmp_lev_perc = float(lev_dist(tok, sug)) / float(max(len(tok),len(sug)))
                        if not tok_sug and tmp_lev_perc < lev_perc:
                            tok_sug = sug
                            lev_perc = tmp_lev_perc
                if tok_sug:
                    new_tok_list.append(tok_sug)
                    tok_stem = en_stem.stem(tok_sug)
                    if tok_stem != tok_sug:
                        new_tok_list.append(tok_stem)
                elif alwayskeep:
                    new_tok_list.append(tok)
            elif alwayskeep:
                new_tok_list.append(tok)
        else:
            new_tok_list.append(tok)
    out_str = ' '.join(new_tok_list)
    return out_str.lower()
Developer: seanlindsey, Project: text_class_utils, Lines: 49, Source: svm_utils.py

Example 7: as_eng_postagged_doc

# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Or: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def as_eng_postagged_doc(doc):
    '''Uses nltk default tagger.'''
    tags    = [t for _, t in nltk.pos_tag(list(doc.word))]
    stemmer = EnglishStemmer()
    lemmata = [stemmer.stem(w) for w in list(doc.word)]
    doc['pos']   = Series(tags)
    doc['lemma'] = Series(lemmata)
    return doc
Developer: estnltk, Project: pfe, Lines: 10, Source: corpus.py

Example 8: _execute

# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Or: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
 def _execute(self):
     
     corpus = mongoExtractText(self.name)
     stemmer = EnglishStemmer()
     for item in corpus:
         line = item.replace(',', ' ')
         stemmed_line = stemmer.stem(line)
         self.sentiment.append((sentiment.sentiment(stemmed_line), stemmed_line))
Developer: cevaris, Project: nebula, Lines: 10, Source: mining.py

Example 9: stem_word

# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Or: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def stem_word(word):
    """
    Stem words
    :param word: (str) text word
    :returns: stemmed word
    """
    stemmer = EnglishStemmer()
    return stemmer.stem(word)
Developer: vipul-sharma20, Project: tweet-analysis, Lines: 10, Source: util.py

Example 10: text_processing

# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Or: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def text_processing(text, min_size=4, sep_char=' '):
	from nltk.stem.snowball import EnglishStemmer
	from nltk.corpus import stopwords as stwds

	stemmer = EnglishStemmer()
	stopwords = set(stwds.words('english') + 
			contractions_without_punc)
	
	text = [stemmer.stem(w) for w in text.split(sep_char)
			if w not in stopwords
			and len(w) >= min_size]

	return text
Developer: danielcestari, Project: machine_learning, Lines: 20, Source: naive.py

Example 11: computeSentiment

# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Or: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def computeSentiment(tweet_text):
    pos_count = 0
    neg_count = 0
    pos_terms = []
    neg_terms = []
    st = EnglishStemmer()

    tokenized_tweet = tokenize(tweet_text)
    for t in tokenized_tweet:
        stemmed = st.stem(t.lower())
        if stemmed in negative_terms:
            neg_terms.append(t.lower())
            neg_count += 1
        elif stemmed in positive_terms:
            pos_terms.append(t.lower())
            pos_count += 1

    return pos_count, neg_count, set(pos_terms), set(neg_terms)
Developer: mchlbrnd, Project: visual-analytics-2013-boxoffice, Lines: 20, Source: hoursum.py

Example 12: use_snowball_stemmer

# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Or: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
 def use_snowball_stemmer(self,word):
     """
     return stemmed words used snowball algorithm
     :param word:
     :return:
     """
     englishStemmer = EnglishStemmer()
     stemmed_word = englishStemmer.stem(word)
     return stemmed_word
Developer: soumik-dutta, Project: Keyword-Extraction, Lines: 11, Source: Stemming.py

Example 13: getLemmatizerInfo

# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Or: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def getLemmatizerInfo(pathArticle):

    data = open(pathArticle, "r")
    text1 = data.read().decode("utf-8")
    data.close()

    sourceText = text1

    links1 = []
    l = 0
    for q in text1.split():
        if q == "\ufeff":
            continue
        links1.append([text1.find(q, l), q])
        l = len(q) + 1 + text1.find(q, l)

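    # pad dashes, parentheses, and apostrophes with spaces so that the
    # split() below separates them from adjacent words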
    text1 = text1.replace(" - ", " \u2013 ", text1.count(" - "))
    text1 = text1.replace(" -", " \u2013", text1.count(" -"))
    text1 = text1.replace("- ", "\u2013 ", text1.count("- "))
    text1 = text1.replace("-", " - ", text1.count("-"))
    text1 = text1.replace("(", "( ", text1.count("("))
    text1 = text1.replace(")", " )", text1.count(")"))
    text1 = text1.replace(" \u0027", " \u301E", text1.count(" \u0027"))
    text1 = text1.replace("\u0027", " \u0027", text1.count("\u0027"))
    text1 = text1.split()
    if text1[0] == u"\ufeff":
        text1 = text1[1:]
    text = []
    for word in text1:
        text2 = []
        if len(word) == 0:
            continue
        while word[len(word) - 1] in [",", ".", "!", "?", ":", ";"]:
            text2.append(word[len(word) - 1])
            word = word[:-1]
            if len(word) == 0:
                break
        text.append(word)
        for i in range(len(text2) - 1, -1, -1):
            text.append(text2[i])

    out = ""

    st = EnglishStemmer()

    l = 0
    links = []
    for word in text:
        if isOk(word):
            q = st.stem(word) + " "
        else:
            q = word + " "
        out += q.lower()
        links.append([l, q])
        l += len(q)
    return out, links, links1, sourceText
Developer: mikhaylova-daria, Project: NER, Lines: 57, Source: allFunctions.py

Example 14: get_query

# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Or: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def get_query(vec_dict):
    dim = 300  # Dimension of the GloVe vectors chosen

    # initialize stemmer for search in GLoVe vector space
    st = EnglishStemmer()

    query = raw_input("Please enter search query:")
    query_vector = np.zeros(dim)
    numWords = 0
    for word in query.split():
        if st.stem(word) in vec_dict:
            query_vector += vec_dict[st.stem(word)].astype(np.float)
            numWords += 1
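        # Snowball often strips a trailing "e" ("notice" -> "notic"), so
        # re-appending "e" can recover a word that exists in the GloVe vocab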
        elif st.stem(word) + "e" in vec_dict:
            query_vector += vec_dict[st.stem(word) + "e"].astype(np.float)
            numWords += 1

    if numWords > 0:
        query_vector /= numWords

    return query, query_vector
Developer: Pei-jie, Project: 224d-project, Lines: 22, Source: searchWithText.py

Example 15: stemming

# Required import: from nltk.stem.snowball import EnglishStemmer [as alias]
# Or: from nltk.stem.snowball.EnglishStemmer import stem [as alias]
def stemming(tweet):
    tweets = tweet.split()
    wrdStemmer = EnglishStemmer()
    stemTweet = []
    try:
        for tweet in tweets:
            tweet = wrdStemmer.stem(tweet)
            stemTweet.append(tweet)
    except Exception:
        print("Error: Stemming")
    return " ".join(stemTweet)
Developer: RohithEngu, Project: Opinion-Summarizer, Lines: 13, Source: PreProcessing.py


Note: The nltk.stem.snowball.EnglishStemmer.stem examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from projects contributed by open-source developers, and copyright in the code remains with the original authors; consult each project's license before redistributing or reusing the code. Do not republish without permission.