This article collects typical usage examples of the fetch_20newsgroups_vectorized function from sklearn.datasets in Python. If you are wondering what exactly fetch_20newsgroups_vectorized does, how to call it, or what real-world uses look like, the curated code examples below may help.
The following shows 15 code examples of fetch_20newsgroups_vectorized, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
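Before the examples, here is a minimal orientation sketch of the basic call (not taken from any of the projects below). It assumes scikit-learn is installed and that the 20 newsgroups data can be downloaded and cached on first use; the exact number of features varies between scikit-learn versions, as the differing shapes asserted in the test examples below show.

from sklearn.datasets import fetch_20newsgroups_vectorized

# subset can be 'train', 'test' or 'all'; the first call downloads and caches the data
bunch = fetch_20newsgroups_vectorized(subset='train')
X, y = bunch.data, bunch.target   # X: scipy.sparse CSR matrix of text features, y: 20 class labels
print(X.shape, y.shape)           # e.g. (11314, n_features) for the train split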
Example 1: test_20news_vectorized
def test_20news_vectorized():
    try:
        datasets.fetch_20newsgroups(subset='all',
                                    download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 130107))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 130107))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    # test return_X_y option
    fetch_func = partial(datasets.fetch_20newsgroups_vectorized, subset='test')
    check_return_X_y(bunch, fetch_func)

    # test subset = all
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 130107))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)
Example 2: test_LogisticRegressionCV
def test_LogisticRegressionCV():
    bunch = fetch_20newsgroups_vectorized(subset="train")
    X = bunch.data
    y = bunch.target
    y[y < y.mean()] = -1
    y[y >= y.mean()] = 1
    Xt, Xh, yt, yh = cross_validation.train_test_split(
        X, y, test_size=.5, random_state=0)

    # compute the scores
    all_scores = []
    all_alphas = np.linspace(-12, 0, 5)
    for a in all_alphas:
        lr = linear_model.LogisticRegression(
            solver='lbfgs', C=np.exp(-a), fit_intercept=False, tol=1e-6,
            max_iter=100)
        lr.fit(Xt, yt)
        score_scv = linear_model.logistic._logistic_loss(
            lr.coef_.ravel(), Xh, yh, 0)
        all_scores.append(score_scv)
    all_scores = np.array(all_scores)
    best_alpha = all_alphas[np.argmin(all_scores)]

    clf = LogisticRegressionCV(max_iter=50)
    clf.fit(Xt, yt, Xh, yh)
    np.testing.assert_array_less(np.abs(clf.alpha_ - best_alpha), 0.5)
Example 3: generate_data
def generate_data(case, sparse=False):
    # Generate regression / classification data.
    bunch = None
    if case == 'regression':
        bunch = datasets.load_boston()
    elif case == 'classification':
        bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    X, y = shuffle(bunch.data, bunch.target)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }
    return data
Example 4: test_20news_vectorized
def test_20news_vectorized():
    # This test is slow.
    raise SkipTest

    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_equal(bunch.data.shape, (11314, 107130))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_equal(bunch.data.shape, (7532, 107130))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="all")
    assert_equal(bunch.data.shape, (11314 + 7532, 107130))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)
Example 5: load_data
def load_data(name, partition_id, n_partitions):
    """load partition of data into global var `name`"""
    from sklearn.datasets import fetch_20newsgroups_vectorized
    from sklearn.utils import gen_even_slices
    dataset = fetch_20newsgroups_vectorized('test')
    size = dataset.data.shape[0]
    slices = list(gen_even_slices(size, n_partitions))
    part = dataset.data[slices[partition_id]]
    # put it in globals
    globals().update({name: part})
    return part.shape
Example 6: test_20news_vectorized
def test_20news_vectorized():
    # This test is slow.
    raise SkipTest("Test too slow.")

    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 107428))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 107428))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="all")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 107428))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)
Example 7: create_plot_curve
def create_plot_curve():
    clients = parallel.Client()
    lview = clients.load_balanced_view()
    dview = clients[:]
    dview['data'] = fetch_20newsgroups_vectorized(remove=('headers', 'footers', 'quotes'))
    lview.block = True
    alphas = [1E-4, 1E-3, 1E-2, 1E-1]
    with dview.sync_imports():
        import numpy
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.cross_validation import cross_val_score
    res = lview.map(grid_search, alphas)
    return res
Example 8: get_data
def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == 'lfw_people':
        X = fetch_lfw_people().data
    elif dataset_name == '20newsgroups':
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == 'olivetti_faces':
        X = fetch_olivetti_faces().data
    elif dataset_name == 'rcv1':
        X = fetch_rcv1().data
    elif dataset_name == 'CIFAR':
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
              for i in range(5)]
        X = np.vstack(X1)
        del X1
    elif dataset_name == 'SVHN':
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)['X']
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == 'low rank matrix':
        X = make_low_rank_matrix(n_samples=500, n_features=np.int(1e4),
                                 effective_rank=100, tail_strength=.5,
                                 random_state=random_state)
    elif dataset_name == 'uncorrelated matrix':
        X, _ = make_sparse_uncorrelated(n_samples=500, n_features=10000,
                                        random_state=random_state)
    elif dataset_name == 'big sparse matrix':
        sparsity = np.int(1e6)
        size = np.int(1e6)
        small_size = np.int(1e4)
        data = np.random.normal(0, 1, np.int(sparsity/10))
        data = np.repeat(data, 10)
        row = np.random.uniform(0, small_size, sparsity)
        col = np.random.uniform(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_mldata(dataset_name).data
    return X
Example 9: generate_data
def generate_data(case, sparse=False):
    """Generate regression/classification data."""
    bunch = None
    if case == "regression":
        bunch = datasets.load_boston()
    elif case == "classification":
        bunch = datasets.fetch_20newsgroups_vectorized(subset="all")
    X, y = shuffle(bunch.data, bunch.target)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}
    return data
Example 10: get_results
def get_results():
    # get data
    data = fetch_20newsgroups_vectorized(remove=('headers',
                                                 'footers',
                                                 'quotes'))
    alphas = [1E-4, 1E-3, 1E-2, 1E-1]

    # set up dview for imports
    clients = parallel.Client()
    dview = clients[:]
    with dview.sync_imports():
        # doesn't seem to like "import numpy as np", using numpy instead
        import numpy
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.cross_validation import cross_val_score
    dview.block = True

    # send data to clients
    dview['data'] = data

    # set up load balanced view for parallel processing
    lview = clients.load_balanced_view()
    # set blocking to True to get all results once processing is done
    lview.block = True
    results = lview.map(get_single_result, alphas)
    return results
Example 11: evaluate
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.decomposition import PCA
import featurelearning

def evaluate(dataset_name, fl, ratio):
    print dataset_name, fl.__name__, ratio
    d = dataset.load_dataset(dataset_name)
    fea = d.data
    label = d.target
    fea = fl(fea)
    ss = StratifiedShuffleSplit(label, 3, test_size=(1-ratio), random_state=0)
    svc = LinearSVC()
    for train, test in ss:
        svc.fit(fea[train, :], label[train, :])
        predict = svc.predict(fea[test, :])
        acc = accuracy_score(label[test, :], predict)
        print acc

if __name__ == '__main__':
    pca = PCA()
    train = fetch_20newsgroups_vectorized('train')
    test = fetch_20newsgroups_vectorized('test')
    svc = LinearSVC()
    train_data = pca.fit_transform(train.data.toarray())
    svc.fit(train_data, train.target)
    test_data = pca.transform(test.data.toarray())
    predict = svc.predict(test_data)
    acc = accuracy_score(test.target, predict)
    print acc
    # evaluate('20newsgroups', featurelearning.TF_IDF, 0.1)
    # evaluate('20newsgroups', featurelearning.LDA, 0.1)
Example 12: zip
for n_samples, color in zip(n_samples_range, colors):
    min_n_components = johnson_lindenstrauss_min_dim(n_samples, eps=eps_range)
    plt.semilogy(eps_range, min_n_components, color=color)

plt.legend(["n_samples = %d" % n for n in n_samples_range], loc="upper right")
plt.xlabel("Distortion eps")
plt.ylabel("Minimum number of dimensions")
plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")

# Part 2: perform sparse random projection of some digits images which are
# quite low dimensional and dense or documents of the 20 newsgroups dataset
# which is both high dimensional and sparse
if '--twenty-newsgroups' in sys.argv:
    # Need an internet connection hence not enabled by default
    data = fetch_20newsgroups_vectorized().data[:500]
else:
    data = load_digits().data[:500]

n_samples, n_features = data.shape
print("Embedding %d samples with dim %d using various random projections"
      % (n_samples, n_features))
n_components_range = np.array([300, 1000, 10000])
dists = euclidean_distances(data, squared=True).ravel()

# select only non-identical samples pairs
nonzero = dists != 0
dists = dists[nonzero]

for n_components in n_components_range:
Example 13: exp
def exp(solvers, penalties, single_target, n_samples=30000, max_iter=20,
        dataset='rcv1', n_jobs=1, skip_slow=False):
    mem = Memory(cachedir=expanduser('~/cache'), verbose=0)

    if dataset == 'rcv1':
        rcv1 = fetch_rcv1()
        lbin = LabelBinarizer()
        lbin.fit(rcv1.target_names)
        X = rcv1.data
        y = rcv1.target
        y = lbin.inverse_transform(y)
        le = LabelEncoder()
        y = le.fit_transform(y)
        if single_target:
            y_n = y.copy()
            y_n[y > 16] = 1
            y_n[y <= 16] = 0
            y = y_n
    elif dataset == 'digits':
        digits = load_digits()
        X, y = digits.data, digits.target
        if single_target:
            y_n = y.copy()
            y_n[y < 5] = 1
            y_n[y >= 5] = 0
            y = y_n
    elif dataset == 'iris':
        iris = load_iris()
        X, y = iris.data, iris.target
    elif dataset == '20newspaper':
        ng = fetch_20newsgroups_vectorized()
        X = ng.data
        y = ng.target
        if single_target:
            y_n = y.copy()
            y_n[y > 4] = 1
            y_n[y <= 16] = 0
            y = y_n

    X = X[:n_samples]
    y = y[:n_samples]

    cached_fit = mem.cache(fit_single)
    out = Parallel(n_jobs=n_jobs, mmap_mode=None)(
        delayed(cached_fit)(solver, X, y,
                            penalty=penalty, single_target=single_target,
                            C=1, max_iter=max_iter, skip_slow=skip_slow)
        for solver in solvers
        for penalty in penalties)

    res = []
    idx = 0
    for solver in solvers:
        for penalty in penalties:
            if not (skip_slow and solver == 'lightning' and penalty == 'l1'):
                lr, times, train_scores, test_scores, accuracies = out[idx]
                this_res = dict(solver=solver, penalty=penalty,
                                single_target=single_target,
                                times=times, train_scores=train_scores,
                                test_scores=test_scores,
                                accuracies=accuracies)
                res.append(this_res)
                idx += 1

    with open('bench_saga.json', 'w+') as f:
        json.dump(res, f)
Example 14: fetch_20newsgroups_vectorized
import sparse_interaction
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score

dat = fetch_20newsgroups_vectorized()
X = dat.data
Y = dat.target
cv = StratifiedKFold(Y)
X = X[:, :20000]
si = sparse_interaction.SparseInteractionFeatures()
X_i = si.transform(X)
scores, scores_i = [], []
clf = SGDClassifier(penalty='l1', n_iter=10)
for train, test in cv:
    clf.fit(X[train], Y[train])
    scores.append(f1_score(Y[test], clf.predict(X[test]), average='macro', pos_label=None))
    clf.fit(X_i[train], Y[train])
    scores_i.append(f1_score(Y[test], clf.predict(X_i[test]), average='macro', pos_label=None))
print sum(scores), sum(scores_i)
Example 15: calculate_result
analyze = tv.build_analyzer()
tv.get_feature_names()  # statistical features/terms

# f1 = 2 * precision * recall / (precision + recall)
def calculate_result(actual, pred):
    m_precision = metrics.precision_score(actual, pred)
    m_recall = metrics.recall_score(actual, pred)
    print 'predict info:'
    print 'precision:{0:.3f}'.format(m_precision)
    print 'recall:{0:0.3f}'.format(m_recall)
    print 'f1-score:{0:.3f}'.format(metrics.f1_score(actual, pred))

# Alternatively, use the ready-made feature extraction that sklearn ships: fetch_20newsgroups_vectorized
print '*************************\nfetch_20newsgroups_vectorized\n*************************'
from sklearn.datasets import fetch_20newsgroups_vectorized
tfidf_train_3 = fetch_20newsgroups_vectorized(subset='train')
tfidf_test_3 = fetch_20newsgroups_vectorized(subset='test')
print "the shape of train is " + repr(tfidf_train_3.data.shape)
print "the shape of test is " + repr(tfidf_test_3.data.shape)

# Classification
######################################################
# Multinomial Naive Bayes Classifier
print '*************************\nNaive Bayes\n*************************'
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
newsgroups_test = fetch_20newsgroups(subset='test',
                                     categories=categories)
fea_test = vectorizer.fit_transform(newsgroups_test.data)
# create the Multinomial Naive Bayesian Classifier
clf = MultinomialNB(alpha=0.01)