

Python datasets.fetch_20newsgroups_vectorized Function Code Examples

This article collects typical usage examples of the Python function sklearn.datasets.fetch_20newsgroups_vectorized. If you have been wondering what exactly fetch_20newsgroups_vectorized does, how to call it, or how it is used in practice, the curated code examples below should help.


The following presents 15 code examples of fetch_20newsgroups_vectorized, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
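Before diving into the collected examples, here is a minimal sketch of the basic call (the exact shapes depend on the installed scikit-learn version; subset accepts 'train', 'test', or 'all'):

from sklearn.datasets import fetch_20newsgroups_vectorized

# Download (or load from the local cache) the tf-idf vectorized
# 20 newsgroups data as a Bunch object.
bunch = fetch_20newsgroups_vectorized(subset='train')
X, y = bunch.data, bunch.target  # X: sparse CSR matrix, y: integer labels 0-19
print(X.shape, y.shape)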

Example 1: test_20news_vectorized
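From the scikit-learn test suite: fetches the train, test, and all subsets and checks that each returns a sparse CSR matrix with the expected shape, target length, and float64 dtype, plus the return_X_y option.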

def test_20news_vectorized():
    try:
        datasets.fetch_20newsgroups(subset='all',
                                    download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 130107))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 130107))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    # test return_X_y option
    fetch_func = partial(datasets.fetch_20newsgroups_vectorized, subset='test')
    check_return_X_y(bunch, fetch_func)

    # test subset = all
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 130107))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)
Developer: AlexisMignon, Project: scikit-learn, Lines: 31, Source: test_20news.py

Example 2: test_LogisticRegressionCV
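From the hoag project: binarizes the 20 newsgroups targets, grid-searches the regularization strength against held-out logistic loss, and checks that LogisticRegressionCV lands near the best alpha.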

def test_LogisticRegressionCV():
    bunch = fetch_20newsgroups_vectorized(subset="train")
    X = bunch.data
    y = bunch.target

    # Binarize targets around the mean, computed once (evaluating y.mean()
    # again after the first assignment would use a shifted mean).
    mean_y = y.mean()
    y[y < mean_y] = -1
    y[y >= mean_y] = 1
    Xt, Xh, yt, yh = cross_validation.train_test_split(
        X, y, test_size=.5, random_state=0)

    # compute the scores
    all_scores = []
    all_alphas = np.linspace(-12, 0, 5)
    for a in all_alphas:
        lr = linear_model.LogisticRegression(
            solver='lbfgs', C=np.exp(-a), fit_intercept=False, tol=1e-6,
            max_iter=100)
        lr.fit(Xt, yt)
        score_scv = linear_model.logistic._logistic_loss(
            lr.coef_.ravel(), Xh, yh, 0)
        all_scores.append(score_scv)
    all_scores = np.array(all_scores)

    best_alpha = all_alphas[np.argmin(all_scores)]

    clf = LogisticRegressionCV(max_iter=50)
    clf.fit(Xt, yt, Xh, yh)
    np.testing.assert_array_less(np.abs(clf.alpha_ - best_alpha), 0.5)
Developer: dongzhixiang, Project: hoag, Lines: 28, Source: tests.py

Example 3: generate_data
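A helper that loads either the Boston regression data or the full vectorized 20 newsgroups set, shuffles it, and returns an 80/20 train/test split, optionally as CSR sparse matrices.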

def generate_data(case, sparse=False):
    """Generate regression / classification data."""
    bunch = None
    if case == 'regression':
        bunch = datasets.load_boston()
    elif case == 'classification':
        bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    X, y = shuffle(bunch.data, bunch.target)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }

    return data
Developer: 0x0all, Project: machineLearning, Lines: 27, Source: plot_model_complexity_influence.py

Example 4: test_20news_vectorized
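An older revision of the scikit-learn shape test, skipped by default because it is slow; note the smaller feature count from an earlier vectorizer version.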

def test_20news_vectorized():
    # This test is slow.
    raise SkipTest

    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_equal(bunch.data.shape, (11314, 107130))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_equal(bunch.data.shape, (7532, 107130))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="all")
    assert_equal(bunch.data.shape, (11314 + 7532, 107130))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)
Developer: forkloop, Project: scikit-learn, Lines: 18, Source: test_20news.py

Example 5: load_data
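Loads one even slice of the test subset into a global variable on an IPython parallel engine and returns the partition's shape.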

def load_data(name, partition_id, n_partitions):
    """load partition of data into global var `name`"""
    from sklearn.datasets import fetch_20newsgroups_vectorized
    from sklearn.utils import gen_even_slices
    dataset = fetch_20newsgroups_vectorized('test')
    size = dataset.data.shape[0]
    slices = list(gen_even_slices(size, n_partitions))
    
    part = dataset.data[slices[partition_id]]
    # put it in globals
    globals().update({name : part})
    return part.shape
Developer: minrk, Project: pycon-pydata-sprint, Lines: 12, Source: parallel_kmeans.py

Example 6: test_20news_vectorized
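Another historical revision of the slow shape test, with yet another feature count and an added CSR-format assertion.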

def test_20news_vectorized():
    # This test is slow.
    raise SkipTest("Test too slow.")

    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 107428))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 107428))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    bunch = datasets.fetch_20newsgroups_vectorized(subset="all")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 107428))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)
Developer: 0664j35t3r, Project: scikit-learn, Lines: 21, Source: test_20news.py

Example 7: create_plot_curve
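Distributes the vectorized dataset (headers, footers, and quotes removed) to IPython parallel engines and maps a grid search over the smoothing values in alphas.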

def create_plot_curve():
    clients = parallel.Client()
    lview = clients.load_balanced_view()
    dview = clients[:]
    dview['data'] = fetch_20newsgroups_vectorized(
        remove=('headers', 'footers', 'quotes'))
    lview.block = True
    alphas = [1E-4, 1E-3, 1E-2, 1E-1]

    with dview.sync_imports():
        import numpy
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.cross_validation import cross_val_score

    res = lview.map(grid_search, alphas)
    return res
Developer: miramzz, Project: machine_learning, Lines: 15, Source: machine_l.py

Example 8: get_data
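From the randomized-SVD benchmark: returns the requested dataset as a feature matrix; for '20newsgroups' it keeps only the first 100000 columns of the vectorized data.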

def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == 'lfw_people':
        X = fetch_lfw_people().data
    elif dataset_name == '20newsgroups':
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == 'olivetti_faces':
        X = fetch_olivetti_faces().data
    elif dataset_name == 'rcv1':
        X = fetch_rcv1().data
    elif dataset_name == 'CIFAR':
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
              for i in range(5)]
        X = np.vstack(X1)
        del X1
    elif dataset_name == 'SVHN':
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)['X']
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == 'low rank matrix':
        X = make_low_rank_matrix(n_samples=500, n_features=np.int(1e4),
                                 effective_rank=100, tail_strength=.5,
                                 random_state=random_state)
    elif dataset_name == 'uncorrelated matrix':
        X, _ = make_sparse_uncorrelated(n_samples=500, n_features=10000,
                                        random_state=random_state)
    elif dataset_name == 'big sparse matrix':
        sparsity = np.int(1e6)
        size = np.int(1e6)
        small_size = np.int(1e4)
        data = np.random.normal(0, 1, np.int(sparsity/10))
        data = np.repeat(data, 10)
        row = np.random.uniform(0, small_size, sparsity)
        col = np.random.uniform(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_mldata(dataset_name).data
    return X
Developer: 0664j35t3r, Project: scikit-learn, Lines: 48, Source: bench_plot_randomized_svd.py

Example 9: generate_data
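The same train/test-split helper as Example 3, in its published form from the model-complexity example.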

def generate_data(case, sparse=False):
    """Generate regression/classification data."""
    bunch = None
    if case == "regression":
        bunch = datasets.load_boston()
    elif case == "classification":
        bunch = datasets.fetch_20newsgroups_vectorized(subset="all")
    X, y = shuffle(bunch.data, bunch.target)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}
    return data
Developer: xuq, Project: Dash-Docset, Lines: 21, Source: plot_model_complexity_influence.py

Example 10: get_results
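A fuller variant of Example 7: fetches the cleaned dataset, pushes it to all engines, and maps the evaluation over alphas through a load-balanced view.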

def get_results():
    # get data
    data = fetch_20newsgroups_vectorized(remove=('headers',
                                                 'footers',
                                                 'quotes'))
    alphas = [1E-4, 1E-3, 1E-2, 1E-1]
    # set up dview for imports
    clients = parallel.Client()
    dview = clients[:]
    with dview.sync_imports():
        # doesn't seem to like import numpy as np, using numpy instead
        import numpy
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.cross_validation import cross_val_score
    dview.block = True
    # send data to clients
    dview['data'] = data
    # set up load balanced view for parallel processing
    lview = clients.load_balanced_view()
    # set blocking to True to get all results once processing is done
    lview.block = True
    results = lview.map(get_single_result, alphas)
    return results
Developer: MikeDelaney, Project: sentiment, Lines: 23, Source: parallel.py

Example 11: evaluate
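Trains a LinearSVC on PCA-reduced dense tf-idf features from the train subset and reports accuracy on the test subset; the commented-out evaluate() calls exercise project-local feature-learning variants.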

from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.decomposition import PCA
import featurelearning

def evaluate(dataset_name, fl, ratio):
    print(dataset_name, fl.__name__, ratio)
    d = dataset.load_dataset(dataset_name)
    fea = d.data
    label = d.target
    fea = fl(fea)
    ss = StratifiedShuffleSplit(label, 3, test_size=(1-ratio), random_state=0)
    svc = LinearSVC()
    for train, test in ss:
        # label is a 1-D target vector, so index it with a single axis.
        svc.fit(fea[train, :], label[train])
        predict = svc.predict(fea[test, :])
        acc = accuracy_score(label[test], predict)
        print(acc)

if __name__ == '__main__':
    pca = PCA()
    train = fetch_20newsgroups_vectorized('train')
    test = fetch_20newsgroups_vectorized('test')
    svc = LinearSVC()
    train_data = pca.fit_transform(train.data.toarray())
    svc.fit(train_data, train.target)
    test_data = pca.transform(test.data.toarray())
    predict = svc.predict(test_data)
    acc = accuracy_score(test.target, predict)
    print(acc)
    # evaluate('20newsgroups', featurelearning.TF_IDF, 0.1)
    # evaluate('20newsgroups', featurelearning.LDA, 0.1)
Developer: shirc, Project: text-feature-learning, Lines: 31, Source: evaluation.py

Example 12: zip
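An excerpt from the Johnson-Lindenstrauss bound example: it plots the minimum safe embedding dimensionality, then random-projects 500 samples from either the digits dataset or (with the --twenty-newsgroups flag) the vectorized 20 newsgroups data; the snippet is truncated at the start of the projection loop.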

for n_samples, color in zip(n_samples_range, colors):
    min_n_components = johnson_lindenstrauss_min_dim(n_samples, eps=eps_range)
    plt.semilogy(eps_range, min_n_components, color=color)

plt.legend(["n_samples = %d" % n for n in n_samples_range], loc="upper right")
plt.xlabel("Distortion eps")
plt.ylabel("Minimum number of dimensions")
plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")

# Part 2: perform sparse random projection of some digits images which are
# quite low dimensional and dense or documents of the 20 newsgroups dataset
# which is both high dimensional and sparse

if '--twenty-newsgroups' in sys.argv:
    # Need an internet connection hence not enabled by default
    data = fetch_20newsgroups_vectorized().data[:500]
else:
    data = load_digits().data[:500]

n_samples, n_features = data.shape
print("Embedding %d samples with dim %d using various random projections"
      % (n_samples, n_features))

n_components_range = np.array([300, 1000, 10000])
dists = euclidean_distances(data, squared=True).ravel()

# select only non-identical samples pairs
nonzero = dists != 0
dists = dists[nonzero]

for n_components in n_components_range:
Developer: AlexanderFabisch, Project: scikit-learn, Lines: 31, Source: plot_johnson_lindenstrauss_bound.py

Example 13: exp
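From the SAGA benchmark: loads one of several datasets (the vectorized 20 newsgroups among them), optionally binarizes the target, fits every solver/penalty combination in parallel with cached results, and dumps timings and scores to JSON.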

def exp(solvers, penalties, single_target, n_samples=30000, max_iter=20,
        dataset='rcv1', n_jobs=1, skip_slow=False):
    mem = Memory(cachedir=expanduser('~/cache'), verbose=0)

    if dataset == 'rcv1':
        rcv1 = fetch_rcv1()

        lbin = LabelBinarizer()
        lbin.fit(rcv1.target_names)

        X = rcv1.data
        y = rcv1.target
        y = lbin.inverse_transform(y)
        le = LabelEncoder()
        y = le.fit_transform(y)
        if single_target:
            y_n = y.copy()
            y_n[y > 16] = 1
            y_n[y <= 16] = 0
            y = y_n

    elif dataset == 'digits':
        digits = load_digits()
        X, y = digits.data, digits.target
        if single_target:
            y_n = y.copy()
            y_n[y < 5] = 1
            y_n[y >= 5] = 0
            y = y_n
    elif dataset == 'iris':
        iris = load_iris()
        X, y = iris.data, iris.target
    elif dataset == '20newspaper':
        ng = fetch_20newsgroups_vectorized()
        X = ng.data
        y = ng.target
        if single_target:
            y_n = y.copy()
            y_n[y > 4] = 1
            y_n[y <= 16] = 0
            y = y_n

    X = X[:n_samples]
    y = y[:n_samples]

    cached_fit = mem.cache(fit_single)
    out = Parallel(n_jobs=n_jobs, mmap_mode=None)(
        delayed(cached_fit)(solver, X, y,
                            penalty=penalty, single_target=single_target,
                            C=1, max_iter=max_iter, skip_slow=skip_slow)
        for solver in solvers
        for penalty in penalties)

    res = []
    idx = 0
    for solver in solvers:
        for penalty in penalties:
            if not (skip_slow and solver == 'lightning' and penalty == 'l1'):
                lr, times, train_scores, test_scores, accuracies = out[idx]
                this_res = dict(solver=solver, penalty=penalty,
                                single_target=single_target,
                                times=times, train_scores=train_scores,
                                test_scores=test_scores,
                                accuracies=accuracies)
                res.append(this_res)
            idx += 1

    with open('bench_saga.json', 'w+') as f:
        json.dump(res, f)
Developer: AlexandreAbraham, Project: scikit-learn, Lines: 69, Source: bench_saga.py

Example 14: fetch_20newsgroups_vectorized
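Compares the macro-F1 of an SGD classifier on the raw sparse features (truncated to the first 20000 columns) against the same features augmented with sparse interaction terms, under stratified cross-validation.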

import sparse_interaction
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score

dat = fetch_20newsgroups_vectorized()
X = dat.data
Y = dat.target
cv = StratifiedKFold(Y)

X = X[:, :20000]

si = sparse_interaction.SparseInteractionFeatures()
X_i = si.transform(X)

scores, scores_i = [], []

clf = SGDClassifier(penalty='l1', n_iter=10)

for train, test in cv:
    clf.fit(X[train], Y[train])
    scores.append(f1_score(Y[test], clf.predict(X[test]), average='macro', pos_label=None))
    clf.fit(X_i[train], Y[train])
    scores_i.append(f1_score(Y[test], clf.predict(X_i[test]), average='macro', pos_label=None))
print sum(scores), sum(scores_i)
Developer: AWNystrom, Project: SparseInteraction, Lines: 26, Source: compare.py

Example 15: calculate_result
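The tail of a tf-idf walkthrough (tv, vectorizer, and categories are defined earlier in the source file): it defines a precision/recall/F1 report helper, swaps in the pre-vectorized fetch helper, prints the train/test shapes, and sets up a Multinomial Naive Bayes classifier.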

analyze = tv.build_analyzer()
tv.get_feature_names()  # statistical features/terms

# F1 score: 2 * (precision * recall) / (precision + recall)
def calculate_result(actual, pred):
    m_precision = metrics.precision_score(actual, pred)
    m_recall = metrics.recall_score(actual, pred)
    print('predict info:')
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.3f}'.format(metrics.f1_score(actual, pred)))

# Alternatively, use the feature-extraction helper bundled with sklearn:
# fetch_20newsgroups_vectorized
print('*************************\nfetch_20newsgroups_vectorized\n*************************')
from sklearn.datasets import fetch_20newsgroups_vectorized
tfidf_train_3 = fetch_20newsgroups_vectorized(subset='train')
tfidf_test_3 = fetch_20newsgroups_vectorized(subset='test')
print("the shape of train is " + repr(tfidf_train_3.data.shape))
print("the shape of test is " + repr(tfidf_test_3.data.shape))

# Classification
######################################################
# Multinomial Naive Bayes Classifier
print('*************************\nNaive Bayes\n*************************')
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
newsgroups_test = fetch_20newsgroups(subset='test',
                                     categories=categories)
fea_test = vectorizer.fit_transform(newsgroups_test.data)
# create the Multinomial Naive Bayes classifier
clf = MultinomialNB(alpha=0.01)
Developer: GZJackCai, Project: machineLearnTest, Lines: 31, Source: testScipy.py


Note: the sklearn.datasets.fetch_20newsgroups_vectorized examples in this article were collected by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from projects contributed by various open-source developers; copyright over the source code remains with the original authors. Refer to each project's license before distributing or using the code, and do not reproduce this article without permission.