本文整理匯總了Python中sklearn.feature_extraction.text.TfidfVectorizer.max_df方法的典型用法代碼示例。如果您正苦於以下問題:Python TfidfVectorizer.max_df方法的具體用法?Python TfidfVectorizer.max_df怎麽用?Python TfidfVectorizer.max_df使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類sklearn.feature_extraction.text.TfidfVectorizer
的用法示例。
在下文中一共展示了TfidfVectorizer.max_df方法的2個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: test_vectorizer
# 需要導入模塊: from sklearn.feature_extraction.text import TfidfVectorizer [as 別名]
# 或者: from sklearn.feature_extraction.text.TfidfVectorizer import max_df [as 別名]
def test_vectorizer():
# raw documents as an iterator
train_data = iter(ALL_FOOD_DOCS[:-1])
test_data = [ALL_FOOD_DOCS[-1]]
n_train = len(ALL_FOOD_DOCS) - 1
# test without vocabulary
v1 = CountVectorizer(max_df=0.5)
counts_train = v1.fit_transform(train_data)
if hasattr(counts_train, 'tocsr'):
counts_train = counts_train.tocsr()
assert_equal(counts_train[0, v1.vocabulary_["pizza"]], 2)
# build a vectorizer v1 with the same vocabulary as the one fitted by v1
v2 = CountVectorizer(vocabulary=v1.vocabulary_)
# compare that the two vectorizer give the same output on the test sample
for v in (v1, v2):
counts_test = v.transform(test_data)
if hasattr(counts_test, 'tocsr'):
counts_test = counts_test.tocsr()
vocabulary = v.vocabulary_
assert_equal(counts_test[0, vocabulary["salad"]], 1)
assert_equal(counts_test[0, vocabulary["tomato"]], 1)
assert_equal(counts_test[0, vocabulary["water"]], 1)
# stop word from the fixed list
assert_false("the" in vocabulary)
# stop word found automatically by the vectorizer DF thresholding
# words that are high frequent across the complete corpus are likely
# to be not informative (either real stop words of extraction
# artifacts)
assert_false("copyright" in vocabulary)
# not present in the sample
assert_equal(counts_test[0, vocabulary["coke"]], 0)
assert_equal(counts_test[0, vocabulary["burger"]], 0)
assert_equal(counts_test[0, vocabulary["beer"]], 0)
assert_equal(counts_test[0, vocabulary["pizza"]], 0)
# test tf-idf
t1 = TfidfTransformer(norm='l1')
tfidf = t1.fit(counts_train).transform(counts_train).toarray()
assert_equal(len(t1.idf_), len(v1.vocabulary_))
assert_equal(tfidf.shape, (n_train, len(v1.vocabulary_)))
# test tf-idf with new data
tfidf_test = t1.transform(counts_test).toarray()
assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary_)))
# test tf alone
t2 = TfidfTransformer(norm='l1', use_idf=False)
tf = t2.fit(counts_train).transform(counts_train).toarray()
assert_equal(t2.idf_, None)
# test idf transform with unlearned idf vector
t3 = TfidfTransformer(use_idf=True)
assert_raises(ValueError, t3.transform, counts_train)
# test idf transform with incompatible n_features
X = [[1, 1, 5],
[1, 1, 0]]
t3.fit(X)
X_incompt = [[1, 3],
[1, 3]]
assert_raises(ValueError, t3.transform, X_incompt)
# L1-normalized term frequencies sum to one
assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)
# test the direct tfidf vectorizer
# (equivalent to term count vectorizer + tfidf transformer)
train_data = iter(ALL_FOOD_DOCS[:-1])
tv = TfidfVectorizer(norm='l1')
assert_false(tv.fixed_vocabulary)
tv.max_df = v1.max_df
tfidf2 = tv.fit_transform(train_data).toarray()
assert_array_almost_equal(tfidf, tfidf2)
# test the direct tfidf vectorizer with new data
tfidf_test2 = tv.transform(test_data).toarray()
assert_array_almost_equal(tfidf_test, tfidf_test2)
# test transform on unfitted vectorizer with empty vocabulary
v3 = CountVectorizer(vocabulary=None)
assert_raises(ValueError, v3.transform, train_data)
# ascii preprocessor?
v3.set_params(strip_accents='ascii', lowercase=False)
assert_equal(v3.build_preprocessor(), strip_accents_ascii)
# error on bad strip_accents param
v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
assert_raises(ValueError, v3.build_preprocessor)
# error with bad analyzer type
v3.set_params = '_invalid_analyzer_type_'
#.........這裏部分代碼省略.........
示例2: test_vectorizer
# 需要導入模塊: from sklearn.feature_extraction.text import TfidfVectorizer [as 別名]
# 或者: from sklearn.feature_extraction.text.TfidfVectorizer import max_df [as 別名]
def test_vectorizer():
# raw documents as an iterator
train_data = iter(ALL_FOOD_DOCS[:-1])
test_data = [ALL_FOOD_DOCS[-1]]
n_train = len(ALL_FOOD_DOCS) - 1
# test without vocabulary
v1 = CountVectorizer(max_df=0.5)
counts_train = v1.fit_transform(train_data)
if hasattr(counts_train, 'tocsr'):
counts_train = counts_train.tocsr()
assert_equal(counts_train[0, v1.vocabulary_[u"pizza"]], 2)
# build a vectorizer v1 with the same vocabulary as the one fitted by v1
v2 = CountVectorizer(vocabulary=v1.vocabulary_)
# compare that the two vectorizer give the same output on the test sample
for v in (v1, v2):
counts_test = v.transform(test_data)
if hasattr(counts_test, 'tocsr'):
counts_test = counts_test.tocsr()
vocabulary = v.vocabulary_
assert_equal(counts_test[0, vocabulary[u"salad"]], 1)
assert_equal(counts_test[0, vocabulary[u"tomato"]], 1)
assert_equal(counts_test[0, vocabulary[u"water"]], 1)
# stop word from the fixed list
assert_false(u"the" in vocabulary)
# stop word found automatically by the vectorizer DF thresholding
# words that are high frequent across the complete corpus are likely
# to be not informative (either real stop words of extraction
# artifacts)
assert_false(u"copyright" in vocabulary)
# not present in the sample
assert_equal(counts_test[0, vocabulary[u"coke"]], 0)
assert_equal(counts_test[0, vocabulary[u"burger"]], 0)
assert_equal(counts_test[0, vocabulary[u"beer"]], 0)
assert_equal(counts_test[0, vocabulary[u"pizza"]], 0)
# test tf-idf
t1 = TfidfTransformer(norm='l1')
tfidf = t1.fit(counts_train).transform(counts_train).toarray()
assert_equal(len(t1.idf_), len(v1.vocabulary_))
assert_equal(tfidf.shape, (n_train, len(v1.vocabulary_)))
# test tf-idf with new data
tfidf_test = t1.transform(counts_test).toarray()
assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary_)))
# test tf alone
t2 = TfidfTransformer(norm='l1', use_idf=False)
tf = t2.fit(counts_train).transform(counts_train).toarray()
assert_equal(t2.idf_, None)
# L1-normalized term frequencies sum to one
assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)
# test the direct tfidf vectorizer
# (equivalent to term count vectorizer + tfidf transformer)
train_data = iter(ALL_FOOD_DOCS[:-1])
tv = TfidfVectorizer(norm='l1')
assert_false(tv.fixed_vocabulary)
tv.max_df = v1.max_df
tfidf2 = tv.fit_transform(train_data).toarray()
assert_array_almost_equal(tfidf, tfidf2)
# test the direct tfidf vectorizer with new data
tfidf_test2 = tv.transform(test_data).toarray()
assert_array_almost_equal(tfidf_test, tfidf_test2)
# test transform on unfitted vectorizer with empty vocabulary
v3 = CountVectorizer(vocabulary=None)
assert_raises(ValueError, v3.transform, train_data)