This article collects typical usage examples of the make_blobs function from Python's sklearn.datasets.samples_generator module. If you have been wondering what make_blobs does, how to call it, or what real-world usage looks like, the curated examples below should help.
The following 15 code examples of make_blobs are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
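Before the examples, here is a minimal sketch of the basic call pattern; the parameter values are illustrative assumptions, not taken from any example below. Note that the sklearn.datasets.samples_generator module has been deprecated in newer scikit-learn releases, so importing make_blobs directly from sklearn.datasets is the forward-compatible form.

# Minimal sketch: generate and inspect an isotropic Gaussian blob dataset.
# Parameter values here are illustrative assumptions only.
import numpy as np
from sklearn.datasets import make_blobs  # samples_generator import also works on older versions

X, y = make_blobs(
    n_samples=300,      # total number of points
    n_features=2,       # dimensionality of each point
    centers=3,          # number of blobs (or pass explicit center coordinates)
    cluster_std=0.8,    # standard deviation of each blob
    random_state=0,     # reproducible draws
)
print(X.shape, y.shape)   # (300, 2) (300,)
print(np.unique(y))       # [0 1 2] -- integer cluster labels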
Example 1: createCluster
def createCluster():
    X1, y1 = make_blobs(n_samples=50, centers=1, n_features=2, random_state=0, center_box=(-5.0, 5.0))
    X2, y2 = make_blobs(n_samples=200, centers=1, n_features=2, random_state=0, center_box=(-4.0, 6.0))
    X = np.concatenate((X1, X2), axis=0)
    y = np.concatenate((y1, [1] * len(y2)), axis=0)
    return X.tolist(), y.tolist()
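A hedged usage sketch for the function above (the plotting code is an added illustration, not part of the original snippet): createCluster returns plain Python lists, so they need to be converted back to arrays before slicing by column.

# Illustrative only: visualize the two overlapping blobs produced by createCluster().
import numpy as np
import matplotlib.pyplot as plt

X, y = createCluster()
X, y = np.asarray(X), np.asarray(y)
plt.scatter(X[:, 0], X[:, 1], c=y, s=10)
plt.title("createCluster(): 50 + 200 samples from two center boxes")
plt.show()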
Example 2: main
def main():
    import matplotlib.pyplot as plt
    from sklearn.datasets.samples_generator import make_blobs
    n_centers = 3
    X, y = make_blobs(n_samples=1000, centers=n_centers, n_features=2,
                      cluster_std=0.7, random_state=0)
    # Run this K-Means
    import kmeans
    t0 = time.time()
    y_pred, centers, obj_val_seq = kmeans.kmeans(X, n_centers)
    t1 = time.time()
    print("Final obj val: {}".format(obj_val_seq[-1]))
    print("Time taken (this implementation): {}".format(t1 - t0))
    # Run scikit-learn's K-Means
    from sklearn.cluster import k_means
    t0 = time.time()
    centers, y_pred, obj_val = k_means(X, n_centers, random_state=0)
    t1 = time.time()
    print("Final obj val: {}".format(obj_val))
    print("Time taken (Scikit, 1 job): {}".format(t1 - t0))
    # Plot change in objective value over iterations
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(obj_val_seq, 'b-', marker='*')
    fig.suptitle("Change in K-means objective value across iterations")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Objective value")
    fig.show()
    # Plot data
    from itertools import cycle
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    fig = plt.figure(figsize=plt.figaspect(0.5))  # Make twice as wide to accommodate both plots
    ax = fig.add_subplot(121)
    ax.set_title("Data with true labels and final centers")
    for k, color in zip(range(n_centers), colors):
        ax.plot(X[y == k, 0], X[y == k, 1], color + '.')
    initial_centers = kmeans.init_centers(X, n_centers, 2)  # This is valid because we always use the same random seed.
    # Plot initial centers
    for x in initial_centers:
        ax.plot(x[0], x[1], "mo", markeredgecolor="k", markersize=8)
    # Plot final centers
    for x in centers:
        ax.plot(x[0], x[1], "co", markeredgecolor="k", markersize=8)
    # Plot assignments
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    ax = fig.add_subplot(122)
    ax.set_title("Data with final assignments")
    for k, color in zip(range(n_centers), colors):
        ax.plot(X[y_pred == k, 0], X[y_pred == k, 1], color + '.')
    fig.tight_layout()
    fig.gca()
    fig.show()
Example 3: test_fitted_model
def test_fitted_model(self):
    # non centered, sparse centers to check the
    centers = np.array([
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [1.0, 1.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ])
    n_samples = 100
    n_clusters, n_features = centers.shape
    X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                cluster_std=1., random_state=42)
    cbook = CoodeBook(n_words=3)
    cbook = cbook.fit(X)  # TODO: Is it needed to reassign? Or can it just be cbook.fit(X)?
    # check that the number of cluster centers and distinct labels match
    # the expectation
    centers = cbook.get_dictionary()
    assert_equal(centers.shape, (n_clusters, n_features))
    labels = cbook.predict(X)
    assert_equal(np.unique(labels).shape[0], n_clusters)
    # check that the label assignment is perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(cbook.cluster_core.inertia_, 0.0)
    # check that the descriptor looks like the homogeneous PDF used
    # to create the original samples
    cbook_hist = cbook.get_BoF_descriptor(X)
    expected_value = float(1) / cbook.n_words
    for bin_value in cbook_hist[0]:
        assert_less(round(bin_value - expected_value, 3), 0.01)
Example 4: generate_anisotropically_clusters
def generate_anisotropically_clusters(number_of_samples, number_of_clusters, n_features=2, variances=None, filename=""):
    """
    :param number_of_samples: The total number of points, divided equally among clusters.
    :param number_of_clusters: The number of clusters to generate.
    :param n_features: The number of features for each sample.
    :param variances: The standard deviation of the clusters.
    :param filename: The file to store the results.
    :return:
    """
    if variances is None: variances = [0.5 for _ in range(number_of_clusters)]
    if filename == "":
        filename = "./Data/anisotropically_" + str(number_of_samples) + "_features_" + str(n_features) \
                   + "_cluster_" + str(number_of_clusters) + ".csv"
    random_state = 170
    X, y = make_blobs(n_samples=number_of_samples, centers=number_of_clusters, n_features=n_features,
                      random_state=random_state, cluster_std=variances)
    transformation = np.array([[random() if i == j else uniform(-1, 1) for j in range(n_features)] for i in range(n_features)])
    X = np.dot(X, transformation)
    features = ["features_" + str(i + 1) for i in range(n_features)]
    df = pd.DataFrame()
    for i, feature in enumerate(features): df[feature] = X[:, i]
    df["class"] = y
    df.to_csv(filename, index=False)
    return X, y
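The key design choice in Example 4 is the random linear transformation applied after make_blobs, which stretches the isotropic blobs into elongated (anisotropic) clusters. Below is a minimal, self-contained sketch of just that step; the fixed 2x2 matrix is an illustrative assumption rather than the random matrix drawn above.

# Illustrative sketch: turn isotropic blobs into anisotropic clusters
# with a fixed linear map (Example 4 draws this matrix at random instead).
import numpy as np
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=300, centers=3, n_features=2, random_state=170)
transformation = np.array([[0.6, -0.6],
                           [-0.4, 0.8]])
X_aniso = np.dot(X, transformation)  # each blob is now stretched/sheared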
Example 5: test_soft
def test_soft():
    X, Y = make_blobs(n_samples=10, centers=2, n_features=2, random_state=1)
    for i in range(0, len(Y)):
        if Y[i] == 0:
            Y[i] = -1.0
    X1, y1, X2, y2 = gen_lin_separable_data()
    #print Y
    #print X1
    #X1, y1, X2, y2 = gen_lin_separable_overlap_data()
    #print y2
    X_train, y_train = split_train(X1, y1, X2, y2)
    #print X_train
    #X_test, y_test = split_test(X1, y1, X2, y2)
    clf = SVM(C=0.1)
    #clf.fit(X_train, y_train)
    clf.fit(X, Y)
    #y_predict = clf.predict(X_test)
    #correct = np.sum(y_predict == y_test)
    #print "%d out of %d predictions correct" % (correct, len(y_predict))
    plot_contour(X_train[y_train == 1], X_train[y_train == -1], clf)
Example 6: test_k_means_fit_predict
def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
    # check that fit.predict gives the same result as fit_predict
    # There's a very small chance of failure with elkan on unstructured dataset
    # because the predict method uses fast euclidean distances computation which
    # may cause small numerical instabilities.
    # NB: This test is largely redundant with respect to test_predict and
    # test_predict_equal_labels. This test has the added effect of
    # testing idempotence of the fitting procedure which appears to
    # be where it fails on some MacOS setups.
    if sys.platform == "darwin":
        pytest.xfail(
            "Known failures on MacOS, See "
            "https://github.com/scikit-learn/scikit-learn/issues/12644")
    if not (algo == 'elkan' and constructor is sp.csr_matrix):
        rng = np.random.RandomState(seed)
        X = make_blobs(n_samples=1000, n_features=10, centers=10,
                       random_state=rng)[0].astype(dtype, copy=False)
        X = constructor(X)
        kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
                        tol=tol, max_iter=max_iter, n_jobs=1)
        labels_1 = kmeans.fit(X).predict(X)
        labels_2 = kmeans.fit_predict(X)
        assert_array_equal(labels_1, labels_2)
Example 7: tree_evaluation
def tree_evaluation():
    print('DecisionTreeClassifier')
    np.random.seed(123)
    dataset, true_labels = make_blobs(n_samples=10000, n_features=2)
    color = ['r-', 'b-']
    methods = [True, False]
    for b in methods:
        print('bootstrapping = %s' % methods[b])
        misclassification_rates = []
        min_rate = np.inf
        min_k = 0
        for i in range(2, 16):
            tree_classifier = tree.DecisionTreeClassifier(max_depth=i)
            scores = validation(tree_classifier, dataset, true_labels, methods[b])
            misclassifications = 1 - scores
            misclassification_rates.append(np.average(misclassifications))
            if min_rate > misclassification_rates[i-2]:
                min_rate = misclassification_rates[i-2]
                min_k = i
        print('minimum rate = %s' % min_rate)
        print('best depth = %s' % min_k)
        label = 'bootstrap' if methods[b] else 'cross-validation'
        pyplot.plot(range(2, 16), misclassification_rates, color[b], label=label)
    pyplot.title('Misclassification rates of DecisionTreeClassifier')
    pyplot.xlabel('Values of k')
    pyplot.ylabel('Misclassification rates')
    pyplot.legend(loc='upper left')
    pyplot.show()
Example 8: plot_sgd_classifier
def plot_sgd_classifier(num_samples, clt_std):
    # generate the data
    X, y = make_blobs(n_samples=num_samples, centers=2, cluster_std=clt_std)
    # fit the data using logistic regression
    clf = SGDClassifier(loss='log', alpha=0.01)
    clf.fit(X, y)
    # plot the data
    x_ = np.linspace(min(X[:, 0]), max(X[:, 0]), 10)
    y_ = np.linspace(min(X[:, 1]), max(X[:, 1]), 10)
    X_, Y_ = np.meshgrid(x_, y_)
    Z = np.empty(X_.shape)
    for (i, j), val in np.ndenumerate(X_):
        x1 = val
        x2 = Y_[i, j]
        conf_score = clf.decision_function([[x1, x2]])
        Z[i, j] = conf_score[0]
    levels = [-1.0, 0, 1.0]
    colors = 'k'
    linestyles = ['dashed', 'solid', 'dashed']
    ax = plt.axes()
    plt.xlabel('X1')
    plt.ylabel('X2')
    ax.contour(X_, Y_, Z, colors=colors,
               levels=levels, linestyles=linestyles, labels='Boundary')
    ax.scatter(X[:, 0], X[:, 1], c=y)
Example 9: plot_sgd_separator
def plot_sgd_separator():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2,
                      random_state=0, cluster_std=0.60)
    # fit the model
    clf = SGDClassifier(loss="hinge", alpha=0.01,
                        n_iter=200, fit_intercept=True)
    clf.fit(X, Y)
    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)
    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        p = clf.decision_function([[x1, x2]])
        Z[i, j] = p[0]
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'
    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
    ax.axis('tight')
Example 10: exercise_1
def exercise_1():
    X, y = make_blobs(n_samples=1000, centers=50, n_features=2, random_state=0)
    n_samples = len(X)
    kf = cross_validation.KFold(n_samples, n_folds=10, shuffle=False, random_state=None)
    # kf = cross_validation.ShuffleSplit(1000, n_iter=25, test_size=0.1, train_size=0.9, random_state=None)
    error_total = np.zeros([49, 1], dtype=float)
    for k in range(1, 50):
        error = []
        clf = KNeighborsClassifier(n_neighbors=k)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            error.append(zero_one_loss(y_test, clf.predict(X_test)))
            # error.append(clf.predict(X_test))
            # error.append( 1. - clf.score(X_test, y_test) )  #, accuracy_score(y_test, clf.predict(X_test))
            # error.append(mean_squared_error(y_test, clf.predict(X_test)))
            # error.append()
        # print error
        error_total[k-1, 0] = np.array(error).mean()
    # print error_total
    x = np.arange(1, 50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, error_total[:, 0], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K values')
    plt.ylabel('Misclassification Error')
    plt.show()
Example 11: exercise_2a
def exercise_2a():
    X, y = make_blobs(n_samples=1000, centers=50, n_features=2, random_state=0)
    # plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
    # plt.show()
    kf = KFold(1000, n_folds=10, shuffle=False, random_state=None)
    accuracy_lst = np.zeros([49, 2], dtype=float)
    accuracy_current = np.zeros(10, dtype=float)
    for k in range(1, 50):
        iterator = 0
        clf = KNeighborsClassifier(n_neighbors=k)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            accuracy_current[iterator] = (1. - clf.score(X_test, y_test))
            iterator += 1
        accuracy_lst[k-1, 0] = accuracy_current.mean()
        # accuracy_lst[k-1, 1] = accuracy_current.std()  # confidence interval 95%
    x = np.arange(1, 50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, accuracy_lst[:, 0], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K values')
    plt.ylabel('Misclassification Error')
    plt.show()
Example 12: test_spectral_amg_mode
def test_spectral_amg_mode():
    # Test the amg mode of SpectralClustering
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    try:
        from pyamg import smoothed_aggregation_solver
        amg_loaded = True
    except ImportError:
        amg_loaded = False
    if amg_loaded:
        labels = spectral_clustering(S, n_clusters=len(centers),
                                     random_state=0, mode="amg")
        # We don't care too much that it's good, just that it *worked*.
        # There does have to be some lower limit on the performance though.
        assert_greater(np.mean(labels == true_labels), .3)
    else:
        assert_raises(ValueError, spectral_embedding, S,
                      n_components=len(centers), random_state=0, mode="amg")
Example 13: test_grid_search_iid
def test_grid_search_iid():
    # test the iid parameter
    # noise-free simple 2d-data
    X, y = make_blobs(centers=[[0, 0], [1, 0], [0, 1], [1, 1]], random_state=0,
                      cluster_std=0.1, shuffle=False, n_samples=80)
    # split dataset into two folds that are not iid
    # first one contains data of all 4 blobs, second only from two.
    mask = np.ones(X.shape[0], dtype=np.bool)
    mask[np.where(y == 1)[0][::2]] = 0
    mask[np.where(y == 2)[0][::2]] = 0
    # this leads to perfect classification on one fold and a score of 1/3 on
    # the other
    svm = SVC(kernel='linear')
    # create "cv" for splits
    cv = [[mask, ~mask], [~mask, mask]]
    # once with iid=True (default)
    grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv)
    grid_search.fit(X, y)
    _, average_score, scores = grid_search.cv_scores_[0]
    assert_array_almost_equal(scores, [1, 1. / 3.])
    # for first split, 1/4 of dataset is in test, for second 3/4.
    # take weighted average
    assert_almost_equal(average_score, 1 * 1. / 4. + 1. / 3. * 3. / 4.)
    # once with iid=False
    grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv,
                               iid=False)
    grid_search.fit(X, y)
    _, average_score, scores = grid_search.cv_scores_[0]
    # scores are the same as above
    assert_array_almost_equal(scores, [1, 1. / 3.])
    # averaged score is just the mean of scores
    assert_almost_equal(average_score, np.mean(scores))
Example 14: test_dbscan_optics_parity
def test_dbscan_optics_parity(eps, min_samples):
    # Test that OPTICS clustering labels are <= 5% difference of DBSCAN
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)
    # calculate optics with dbscan extract at 0.3 epsilon
    op = OPTICS(min_samples=min_samples).fit(X)
    core_optics, labels_optics = op.extract_dbscan(eps)
    # calculate dbscan labels
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
    contingency = contingency_matrix(db.labels_, labels_optics)
    agree = min(np.sum(np.max(contingency, axis=0)),
                np.sum(np.max(contingency, axis=1)))
    disagree = X.shape[0] - agree
    # verify core_labels match
    assert_array_equal(core_optics, db.core_sample_indices_)
    non_core_count = len(labels_optics) - len(core_optics)
    percent_mismatch = np.round((disagree - 1) / non_core_count, 2)
    # verify label mismatch is <= 5% labels
    assert percent_mismatch <= 0.05
Example 15: test_bin_seeds
def test_bin_seeds():
    # Test the bin seeding technique which can be used in the mean shift
    # algorithm
    # Data is just 6 points in the plane
    X = np.array([[1., 1.], [1.4, 1.4], [1.8, 1.2],
                  [2., 1.], [2.1, 1.1], [0., 0.]])
    # With a bin coarseness of 1.0 and min_bin_freq of 1, 3 bins should be
    # found
    ground_truth = {(1., 1.), (2., 1.), (0., 0.)}
    test_bins = get_bin_seeds(X, 1, 1)
    test_result = set(tuple(p) for p in test_bins)
    assert len(ground_truth.symmetric_difference(test_result)) == 0
    # With a bin coarseness of 1.0 and min_bin_freq of 2, 2 bins should be
    # found
    ground_truth = {(1., 1.), (2., 1.)}
    test_bins = get_bin_seeds(X, 1, 2)
    test_result = set(tuple(p) for p in test_bins)
    assert len(ground_truth.symmetric_difference(test_result)) == 0
    # With a bin size of 0.01 and min_bin_freq of 1, 6 bins should be found
    # we bail and use the whole data here.
    with warnings.catch_warnings(record=True):
        test_bins = get_bin_seeds(X, 0.01, 1)
    assert_array_almost_equal(test_bins, X)
    # tight clusters around [0, 0] and [1, 1], only get two bins
    X, _ = make_blobs(n_samples=100, n_features=2, centers=[[0, 0], [1, 1]],
                      cluster_std=0.1, random_state=0)
    test_bins = get_bin_seeds(X, 1)
    assert_array_equal(test_bins, [[0, 0], [1, 1]])