當前位置: 首頁>>代碼示例>>Python>>正文


Python stem.SnowballStemmer方法代碼示例

本文整理匯總了Python中nltk.stem.SnowballStemmer方法的典型用法代碼示例。如果您正苦於以下問題:Python stem.SnowballStemmer方法的具體用法?Python stem.SnowballStemmer怎麽用?Python stem.SnowballStemmer使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在nltk.stem的用法示例。


在下文中一共展示了stem.SnowballStemmer方法的6個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。

示例1: data_qualityCheck

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import SnowballStemmer [as 別名]
def data_qualityCheck():
    """Run a quick sanity check over the three module-level datasets.

    Relies on `train_news`, `test_news` and `valid_news` DataFrames being
    defined at module scope — presumably loaded earlier in this file
    (TODO confirm). `info()` prints a schema/null summary; the
    `isnull().sum()` results are computed but not displayed.
    """
    print("Checking data qualitites...")
    train_news.isnull().sum()
    train_news.info()

    print("check finished.")

    # Repeat the same checks for the held-out datasets.
    for dataset in (test_news, valid_news):
        dataset.isnull().sum()
        dataset.info()

#run the below function call to see the quality check results
#data_qualityCheck()



#eng_stemmer = SnowballStemmer('english')
#stopwords = set(nltk.corpus.stopwords.words('english'))

#Stemming 
開發者ID:nishitpatel01,項目名稱:Fake_News_Detection,代碼行數:26,代碼來源:DataPrep.py

示例2: collect_pairs_by_rel

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import SnowballStemmer [as 別名]
def collect_pairs_by_rel(filename, rel):
    """Collect pairs from PPDB maintaining the specified relation.

    Args:
        filename: path to a PPDB file with '|||'-separated fields.
        rel: entailment relation label a pair must carry to be kept.

    Returns:
        dict mapping each phrase to the set of its kept paraphrases;
        every pair is recorded symmetrically (both directions).
    """
    stemmer = SnowballStemmer("english")

    phrase2paraphrase = dict()

    # Stream the file line by line instead of materializing it with
    # readlines(); split each row once rather than three times.
    with open(filename, "r") as f:
        for line in f:
            fields = line.strip().split('|||')
            # Fields 1 and 2 are the two phrases; the last field is the
            # entailment relation label.
            phrase = fields[1].strip()
            paraphrase = fields[2].strip()

            # Skip trivial pairs that reduce to the same stem.
            if stemmer.stem(phrase) == stemmer.stem(paraphrase):
                continue

            entailment = fields[-1].strip()

            if entailment == rel:
                add_to_dict_of_set(phrase, paraphrase, phrase2paraphrase)
                add_to_dict_of_set(paraphrase, phrase, phrase2paraphrase)

    print("Size: %d" % len(phrase2paraphrase))
    return phrase2paraphrase
開發者ID:LittleYUYU,項目名稱:Interactive-Semantic-Parsing,代碼行數:27,代碼來源:ppdb.py

示例3: __init__

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import SnowballStemmer [as 別名]
def __init__(self, language='english', stopwords=None, stemming=True):
    """Configure the tokenizer's stemmer and stopword list.

    Args:
        language: language name passed to SnowballStemmer and used to
            locate the built-in stopword file.
        stopwords: None (load the built-in list for `language`), a list of
            stopword strings, or a path to a .txt stopword file.
        stemming: when True, tokens will be stemmed with a SnowballStemmer;
            otherwise no stemmer is attached.

    Raises:
        IOError: if `stopwords` is a path that cannot be loaded.
    """
    if stemming:
        self._stemmer = SnowballStemmer(language)
    else:
        self._stemmer = None

    # BUG FIX: `unicode` only exists on Python 2; referencing it inside the
    # isinstance tuple raised NameError on Python 3 whenever `stopwords`
    # was not a list. Fall back to `str` alone on Python 3.
    try:
        string_types = (str, unicode)
    except NameError:
        string_types = (str,)

    if isinstance(stopwords, list):
        self._stopwords = stopwords
    elif isinstance(stopwords, string_types):
        # stopwords argument is a path
        try:
            self._stopwords = self._load_stopwords(stopwords)
        except IOError:
            raise IOError('stopwords argument must be a path to a .txt file, a list of word strings '
                          'or None (which loads the default list)')
    else:
        # Load built-in stopwords shipped alongside the package.
        stopwords_dir = 'stopwords/{0}.txt'.format(language.lower())
        application_root = os.path.dirname(__file__)
        stopwords_file = os.path.join(application_root, '..', stopwords_dir)
        self._stopwords = self._load_stopwords(stopwords_file)
開發者ID:jaijuneja,項目名稱:PyTLDR,代碼行數:23,代碼來源:tokenizer.py

示例4: __init__

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import SnowballStemmer [as 別名]
def __init__(self, language):
    """Bind NLTK stemmer and stopword resources for one language.

    Args:
        language: language name understood by both NLTK's stopword
            corpus and the Snowball stemmer (e.g. 'english').
    """
    # Remember which language this instance was built for.
    self.language = language
    # NLTK resources configured for that language.
    self.stemmer = stem.SnowballStemmer(language)
    self.stopwords = corpus.stopwords.words(language)
開發者ID:despawnerer,項目名稱:summarize,代碼行數:6,代碼來源:language.py

示例5: clean_paraphrase

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import SnowballStemmer [as 別名]
def clean_paraphrase(paraphrase_dict):
    """Drop paraphrases that share a stem with their source phrase.

    Args:
        paraphrase_dict: dict mapping a phrase to an iterable of its
            candidate paraphrases.

    Returns:
        A new dict containing, for each phrase, the set of paraphrases
        whose stem differs from the phrase's own stem. Phrases left with
        no surviving paraphrases are omitted entirely.
    """
    stemmer = SnowballStemmer("english")
    paraphrase_dict_clean = dict()
    print("Size: %d" % len(paraphrase_dict))

    for phrase, paraphrases in paraphrase_dict.items():
        # The source phrase's stem is loop-invariant: compute it once
        # instead of once per candidate paraphrase.
        phrase_stem = stemmer.stem(phrase)
        new_paraphrases = {p for p in paraphrases if stemmer.stem(p) != phrase_stem}
        if len(new_paraphrases):
            paraphrase_dict_clean[phrase] = new_paraphrases
    print("Size: %d" % len(paraphrase_dict_clean))
    return paraphrase_dict_clean
開發者ID:LittleYUYU,項目名稱:Interactive-Semantic-Parsing,代碼行數:16,代碼來源:ppdb.py

示例6: test_word_stemming_filter

# 需要導入模塊: from nltk import stem [as 別名]
# 或者: from nltk.stem import SnowballStemmer [as 別名]
def test_word_stemming_filter():
    """Exercise WordStemmingFilter across stemmer choices and stim types."""
    txt_stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                               columns='to', default_duration=1)

    # Default configuration uses the Porter stemmer.
    word_filter = WordStemmingFilter()
    assert isinstance(word_filter.stemmer, nls.PorterStemmer)
    expected = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert [elem.text for elem in word_filter.transform(txt_stim)] == expected

    # Selecting the snowball stemmer by name yields the same output.
    word_filter = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(word_filter.stemmer, nls.SnowballStemmer)
    assert [elem.text for elem in word_filter.transform(txt_stim)] == expected

    # A pre-built StemmerI instance is accepted directly.
    snowball = nls.SnowballStemmer(language='english')
    word_filter = WordStemmingFilter(stemmer=snowball)
    assert [elem.text for elem in word_filter.transform(txt_stim)] == expected

    # Lemmatization requires tagger/wordnet data; download if missing.
    try:
        nltk.find('taggers/universal_tagset')
    except LookupError:
        nltk.download('universal_tagset')
    try:
        nltk.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    lemma_stim = ComplexTextStim(text='These are tests for Stemming filters')
    word_filter = WordStemmingFilter(stemmer='wordnet')
    expected = ['these', 'be', 'test', 'for', 'stem', 'filter']
    assert [elem.text for elem in word_filter.transform(lemma_stim)] == expected

    # Case-sensitive mode: original casing appears in the output.
    word_filter = WordStemmingFilter(stemmer='wordnet', case_sensitive=True)
    expected = ['These', 'be', 'test', 'for', 'Stemming', 'filter']
    assert [elem.text for elem in word_filter.transform(lemma_stim)] == expected

    # Unknown stemmer names must be rejected.
    with pytest.raises(ValueError):
        WordStemmingFilter(stemmer='nonexistent_stemmer')

    # A plain TextStim is stemmed as a whole string.
    simple_stim = TextStim(text='theres something happening here')
    word_filter = WordStemmingFilter()
    assert word_filter.transform(simple_stim).text == 'there someth happen here'
開發者ID:tyarkoni,項目名稱:pliers,代碼行數:59,代碼來源:test_text_filters.py


注:本文中的nltk.stem.SnowballStemmer方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。