This article compiles typical usage examples of the Python method nltk.stem.SnowballStemmer. If you are wondering what stem.SnowballStemmer does or how it is used in practice, the curated method examples below may help. You can also explore the containing module, nltk.stem, for further usage.
The following presents 6 code examples of stem.SnowballStemmer, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
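Before diving into the examples, here is a minimal, self-contained sketch of SnowballStemmer itself; the sample words are chosen purely for illustration:

from nltk.stem import SnowballStemmer

# SnowballStemmer supports many languages; 'english' is the most common.
stemmer = SnowballStemmer("english")

for word in ["running", "studies", "happily"]:
    print(word, "->", stemmer.stem(word))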
Example 1: data_qualityCheck
# Required module import: from nltk import stem [as alias]
# or: from nltk.stem import SnowballStemmer [as alias]
def data_qualityCheck():
    print("Checking data qualities...")
    # train_news, test_news and valid_news are pandas DataFrames loaded
    # elsewhere in the module; print the null counts so they are visible.
    print(train_news.isnull().sum())
    train_news.info()

    # below datasets were used to
    print(test_news.isnull().sum())
    test_news.info()

    print(valid_news.isnull().sum())
    valid_news.info()
    print("check finished.")

# run the below function call to see the quality check results
# data_qualityCheck()
# eng_stemmer = SnowballStemmer('english')
# stopwords = set(nltk.corpus.stopwords.words('english'))
# Stemming
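The commented-out lines above only set up a stemmer and a stopword set. A minimal sketch of how they would typically be applied (the stem_tokens helper is hypothetical, not part of the original repository):

import nltk
from nltk.stem import SnowballStemmer

eng_stemmer = SnowballStemmer('english')
stopwords = set(nltk.corpus.stopwords.words('english'))

def stem_tokens(tokens):
    # drop stopwords, then reduce each remaining token to its stem
    return [eng_stemmer.stem(t) for t in tokens if t not in stopwords]

print(stem_tokens(['checking', 'the', 'news', 'datasets']))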
Example 2: collect_pairs_by_rel
# Required module import: from nltk import stem [as alias]
# or: from nltk.stem import SnowballStemmer [as alias]
def collect_pairs_by_rel(filename, rel):
    """ Collect pairs from PPDB maintaining the specified relation. """
    stemmer = SnowballStemmer("english")
    phrase2paraphrase = dict()
    with open(filename, "r") as f:
        for line in f:
            fields = line.strip().split('|||')
            phrase = fields[1].strip()
            paraphrase = fields[2].strip()
            # skip pairs whose two sides reduce to the same stem
            if stemmer.stem(phrase) == stemmer.stem(paraphrase):
                continue
            entailment = fields[-1].strip()
            if entailment == rel:
                add_to_dict_of_set(phrase, paraphrase, phrase2paraphrase)
                add_to_dict_of_set(paraphrase, phrase, phrase2paraphrase)
    print("Size: %d" % len(phrase2paraphrase))
    return phrase2paraphrase
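The add_to_dict_of_set helper is not shown on this page; a minimal implementation consistent with how it is called above might look like this (an assumption, not the original code):

def add_to_dict_of_set(key, value, dict_of_set):
    # map each key to the set of all values observed with it
    dict_of_set.setdefault(key, set()).add(value)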
Example 3: __init__
# Required module import: from nltk import stem [as alias]
# or: from nltk.stem import SnowballStemmer [as alias]
def __init__(self, language='english', stopwords=None, stemming=True):
    # also requires: import os
    if stemming:
        self._stemmer = SnowballStemmer(language)
    else:
        self._stemmer = None
    if isinstance(stopwords, list):
        self._stopwords = stopwords
    elif isinstance(stopwords, str):
        # stopwords argument is a path
        try:
            self._stopwords = self._load_stopwords(stopwords)
        except IOError:
            raise IOError('stopwords argument must be a path to a .txt file, '
                          'a list of word strings or None (which loads the '
                          'default list)')
    else:
        # Load the built-in stopword list shipped with the package
        stopwords_dir = 'stopwords/{0}.txt'.format(language.lower())
        application_root = os.path.dirname(__file__)
        stopwords_file = os.path.join(application_root, '..', stopwords_dir)
        self._stopwords = self._load_stopwords(stopwords_file)
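The _load_stopwords helper is referenced but not shown. Assuming the stopword files contain one word per line, a minimal sketch could be:

def _load_stopwords(self, path):
    # read one stopword per line, skipping blanks
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]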
Example 4: __init__
# Required module import: from nltk import stem [as alias]
# or: from nltk.stem import SnowballStemmer [as alias]
def __init__(self, language):
    # requires: from nltk import corpus, stem
    self.language = language
    self.stopwords = corpus.stopwords.words(language)
    self.stemmer = stem.SnowballStemmer(language)
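A short usage sketch for this constructor, assuming it belongs to a text-preprocessing class; the class name Preprocessor and the normalize method are hypothetical, not from the original project:

from nltk import corpus, stem

class Preprocessor:
    def __init__(self, language):
        self.language = language
        self.stopwords = corpus.stopwords.words(language)
        self.stemmer = stem.SnowballStemmer(language)

    def normalize(self, tokens):
        # hypothetical helper: drop stopwords, then stem what remains
        return [self.stemmer.stem(t) for t in tokens
                if t.lower() not in self.stopwords]

pre = Preprocessor('english')
print(pre.normalize(['The', 'stemmers', 'were', 'running']))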
Example 5: clean_paraphrase
# Required module import: from nltk import stem [as alias]
# or: from nltk.stem import SnowballStemmer [as alias]
def clean_paraphrase(paraphrase_dict):
    """ Drop paraphrases that share a stem with their source phrase. """
    stemmer = SnowballStemmer("english")
    paraphrase_dict_clean = dict()
    print("Size: %d" % len(paraphrase_dict))
    for phrase, paraphrases in paraphrase_dict.items():
        new_paraphrases = set()
        for paraphrase in paraphrases:
            if stemmer.stem(phrase) != stemmer.stem(paraphrase):
                new_paraphrases.add(paraphrase)
        if new_paraphrases:
            paraphrase_dict_clean[phrase] = new_paraphrases
    print("Size: %d" % len(paraphrase_dict_clean))
    return paraphrase_dict_clean
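Examples 2 and 5 come from the same PPDB-processing code; a hedged end-to-end usage sketch (the file name and relation label below are illustrative, not taken from the original project):

pairs = collect_pairs_by_rel('ppdb-2.0-s-all.txt', 'Equivalence')
pairs = clean_paraphrase(pairs)
for phrase, paraphrases in list(pairs.items())[:5]:
    print(phrase, '->', sorted(paraphrases))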
Example 6: test_word_stemming_filter
# Required module import: from nltk import stem [as alias]
# or: from nltk.stem import SnowballStemmer [as alias]
def test_word_stemming_filter():
    # Typical imports for this test: from os.path import join; import nltk;
    # import pytest; from nltk import stem as nls; and ComplexTextStim,
    # TextStim and WordStemmingFilter from the pliers library.
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to', default_duration=1)

    # With all defaults (Porter stemmer)
    filt = WordStemmingFilter()
    assert isinstance(filt.stemmer, nls.PorterStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    target = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert stems == target

    # Try a different stemmer
    filt = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(filt.stemmer, nls.SnowballStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Handles a StemmerI instance directly
    stemmer = nls.SnowballStemmer(language='english')
    filt = WordStemmingFilter(stemmer=stemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Try the lemmatization filter; fetch the NLTK resources it needs first
    try:
        nltk.data.find('taggers/universal_tagset')
    except LookupError:
        nltk.download('universal_tagset')
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    stim = ComplexTextStim(text='These are tests for Stemming filters')
    filt = WordStemmingFilter(stemmer='wordnet')
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['these', 'be', 'test', 'for', 'stem', 'filter']
    assert lemmas == target

    # Try case-sensitive lemmatization
    filt = WordStemmingFilter(stemmer='wordnet', case_sensitive=True)
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['These', 'be', 'test', 'for', 'Stemming', 'filter']
    assert lemmas == target

    # Fails on invalid values
    with pytest.raises(ValueError):
        filt = WordStemmingFilter(stemmer='nonexistent_stemmer')

    # Try a longer TextStim
    stim2 = TextStim(text='theres something happening here')
    filt = WordStemmingFilter()
    assert filt.transform(stim2).text == 'there someth happen here'
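The test treats the Porter and Snowball stemmers as interchangeable on this particular sample, but the two algorithms do diverge on some words; a quick standalone comparison using only NLTK:

from nltk.stem import PorterStemmer, SnowballStemmer

porter = PorterStemmer()
snowball = SnowballStemmer('english')

# 'generously' is a word on which the two stemmers are known to differ.
for word in ['testing', 'annotation', 'generously']:
    print(word, porter.stem(word), snowball.stem(word))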