This article collects typical usage examples of the make_blobs function from Python's sklearn.datasets.samples_generator module. If you have been wondering what make_blobs does, how to call it, or what real-world usage looks like, the curated examples below should help.
The following 15 code examples of make_blobs are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
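Before the examples, here is a minimal sketch of the basic call pattern; the parameter values are illustrative assumptions, not taken from any example below. Note that the sklearn.datasets.samples_generator module has been deprecated in newer scikit-learn releases, so importing make_blobs directly from sklearn.datasets is the forward-compatible form.

# Minimal sketch: generate and inspect an isotropic Gaussian blob dataset.
# Parameter values here are illustrative assumptions only.
import numpy as np
from sklearn.datasets import make_blobs  # samples_generator import also works on older versions

X, y = make_blobs(
    n_samples=300,      # total number of points
    n_features=2,       # dimensionality of each point
    centers=3,          # number of blobs (or pass explicit center coordinates)
    cluster_std=0.8,    # standard deviation of each blob
    random_state=0,     # reproducible draws
)
print(X.shape, y.shape)   # (300, 2) (300,)
print(np.unique(y))       # [0 1 2] -- integer cluster labels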
Example 1: createCluster
def createCluster():
    X1, y1 = make_blobs(n_samples=50, centers=1, n_features=2, random_state=0, center_box=(-5.0, 5.0))
    X2, y2 = make_blobs(n_samples=200, centers=1, n_features=2, random_state=0, center_box=(-4.0, 6.0))
    X = np.concatenate((X1, X2), axis=0)
    y = np.concatenate((y1, [1] * len(y2)), axis=0)
    return X.tolist(), y.tolist()
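A hedged usage sketch for the function above (the plotting code is an added illustration, not part of the original snippet): createCluster returns plain Python lists, so they need to be converted back to arrays before slicing by column.

# Illustrative only: visualize the two overlapping blobs produced by createCluster().
import numpy as np
import matplotlib.pyplot as plt

X, y = createCluster()
X, y = np.asarray(X), np.asarray(y)
plt.scatter(X[:, 0], X[:, 1], c=y, s=10)
plt.title("createCluster(): 50 + 200 samples from two center boxes")
plt.show()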
Example 2: main
def main():
    import matplotlib.pyplot as plt
    from sklearn.datasets.samples_generator import make_blobs
    n_centers = 3
    X, y = make_blobs(n_samples=1000, centers=n_centers, n_features=2,
                      cluster_std=0.7, random_state=0)
    # Run this K-Means
    import kmeans
    t0 = time.time()
    y_pred, centers, obj_val_seq = kmeans.kmeans(X, n_centers)
    t1 = time.time()
    print("Final obj val: {}".format(obj_val_seq[-1]))
    print("Time taken (this implementation): {}".format(t1 - t0))
    # Run scikit-learn's K-Means
    from sklearn.cluster import k_means
    t0 = time.time()
    centers, y_pred, obj_val = k_means(X, n_centers, random_state=0)
    t1 = time.time()
    print("Final obj val: {}".format(obj_val))
    print("Time taken (Scikit, 1 job): {}".format(t1 - t0))
    # Plot change in objective value over iterations
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(obj_val_seq, 'b-', marker='*')
    fig.suptitle("Change in K-means objective value across iterations")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Objective value")
    fig.show()
    # Plot data
    from itertools import cycle
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    fig = plt.figure(figsize=plt.figaspect(0.5))  # Make twice as wide to accommodate both plots
    ax = fig.add_subplot(121)
    ax.set_title("Data with true labels and final centers")
    for k, color in zip(range(n_centers), colors):
        ax.plot(X[y == k, 0], X[y == k, 1], color + '.')
    initial_centers = kmeans.init_centers(X, n_centers, 2)  # This is valid because we always use the same random seed.
    # Plot initial centers
    for x in initial_centers:
        ax.plot(x[0], x[1], "mo", markeredgecolor="k", markersize=8)
    # Plot final centers
    for x in centers:
        ax.plot(x[0], x[1], "co", markeredgecolor="k", markersize=8)
    # Plot assignments
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    ax = fig.add_subplot(122)
    ax.set_title("Data with final assignments")
    for k, color in zip(range(n_centers), colors):
        ax.plot(X[y_pred == k, 0], X[y_pred == k, 1], color + '.')
    fig.tight_layout()
    fig.gca()
    fig.show()
Example 3: test_fitted_model
def test_fitted_model(self):
    # non centered, sparse centers to check the
    centers = np.array([
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [1.0, 1.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ])
    n_samples = 100
    n_clusters, n_features = centers.shape
    X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                cluster_std=1., random_state=42)
    cbook = CoodeBook(n_words=3)
    cbook = cbook.fit(X)  # TODO: Is it needed to reassign? Or can it just be cbook.fit(X)?
    # check that the number of cluster centers and distinct labels match
    # the expectation
    centers = cbook.get_dictionary()
    assert_equal(centers.shape, (n_clusters, n_features))
    labels = cbook.predict(X)
    assert_equal(np.unique(labels).shape[0], n_clusters)
    # check that the label assignment is perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(cbook.cluster_core.inertia_, 0.0)
    # check that the descriptor looks like the homogeneous PDF used
    # to create the original samples
    cbook_hist = cbook.get_BoF_descriptor(X)
    expected_value = float(1) / cbook.n_words
    for bin_value in cbook_hist[0]:
        assert_less(round(bin_value - expected_value, 3), 0.01)
Example 4: generate_anisotropically_clusters
def generate_anisotropically_clusters(number_of_samples, number_of_clusters, n_features=2, variances=None, filename=""):
    """
    :param number_of_samples: The total number of points, divided equally among clusters.
    :param number_of_clusters: The number of clusters to generate.
    :param n_features: The number of features for each sample.
    :param variances: The standard deviation of the clusters.
    :param filename: The file to store the results.
    :return:
    """
    if variances is None: variances = [0.5 for _ in range(number_of_clusters)]
    if filename == "":
        filename = "./Data/anisotropically_" + str(number_of_samples) + "_features_" + str(n_features) \
                   + "_cluster_" + str(number_of_clusters) + ".csv"
    random_state = 170
    X, y = make_blobs(n_samples=number_of_samples, centers=number_of_clusters, n_features=n_features,
                      random_state=random_state, cluster_std=variances)
    transformation = np.array([[random() if i == j else uniform(-1, 1) for j in range(n_features)] for i in range(n_features)])
    X = np.dot(X, transformation)
    features = ["features_" + str(i + 1) for i in range(n_features)]
    df = pd.DataFrame()
    for i, feature in enumerate(features): df[feature] = X[:, i]
    df["class"] = y
    df.to_csv(filename, index=False)
    return X, y
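The key design choice in Example 4 is the random linear transformation applied after make_blobs, which stretches the isotropic blobs into elongated (anisotropic) clusters. Below is a minimal, self-contained sketch of just that step; the fixed 2x2 matrix is an illustrative assumption rather than the random matrix drawn above.

# Illustrative sketch: turn isotropic blobs into anisotropic clusters
# with a fixed linear map (Example 4 draws this matrix at random instead).
import numpy as np
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=300, centers=3, n_features=2, random_state=170)
transformation = np.array([[0.6, -0.6],
                           [-0.4, 0.8]])
X_aniso = np.dot(X, transformation)  # each blob is now stretched/sheared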
Example 5: test_soft
def test_soft():
    X, Y = make_blobs(n_samples=10, centers=2, n_features=2, random_state=1)
    for i in range(0, len(Y)):
        if Y[i] == 0:
            Y[i] = -1.0
    X1, y1, X2, y2 = gen_lin_separable_data()
    #print Y
    #print X1
    #X1, y1, X2, y2 = gen_lin_separable_overlap_data()
    #print y2
    X_train, y_train = split_train(X1, y1, X2, y2)
    #print X_train
    #X_test, y_test = split_test(X1, y1, X2, y2)
    clf = SVM(C=0.1)
    #clf.fit(X_train, y_train)
    clf.fit(X, Y)
    #y_predict = clf.predict(X_test)
    #correct = np.sum(y_predict == y_test)
    #print "%d out of %d predictions correct" % (correct, len(y_predict))
    plot_contour(X_train[y_train == 1], X_train[y_train == -1], clf)
Example 6: test_k_means_fit_predict
def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
    # check that fit.predict gives the same result as fit_predict
    # There's a very small chance of failure with elkan on unstructured dataset
    # because the predict method uses fast euclidean distances computation which
    # may cause small numerical instabilities.
    # NB: This test is largely redundant with respect to test_predict and
    # test_predict_equal_labels. This test has the added effect of
    # testing idempotence of the fitting procedure which appears to
    # be where it fails on some MacOS setups.
    if sys.platform == "darwin":
        pytest.xfail(
            "Known failures on MacOS, See "
            "https://github.com/scikit-learn/scikit-learn/issues/12644")
    if not (algo == 'elkan' and constructor is sp.csr_matrix):
        rng = np.random.RandomState(seed)
        X = make_blobs(n_samples=1000, n_features=10, centers=10,
                       random_state=rng)[0].astype(dtype, copy=False)
        X = constructor(X)
        kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
                        tol=tol, max_iter=max_iter, n_jobs=1)
        labels_1 = kmeans.fit(X).predict(X)
        labels_2 = kmeans.fit_predict(X)
        assert_array_equal(labels_1, labels_2)
Example 7: tree_evaluation
def tree_evaluation():
    print('DecisionTreeClassifier')
    np.random.seed(123)
    dataset, true_labels = make_blobs(n_samples=10000, n_features=2)
    color = ['r-', 'b-']
    methods = [True, False]
    for b in methods:
        print('bootstrapping = %s' % methods[b])
        misclassification_rates = []
        min_rate = np.inf
        min_k = 0
        for i in range(2, 16):
            tree_classifier = tree.DecisionTreeClassifier(max_depth=i)
            scores = validation(tree_classifier, dataset, true_labels, methods[b])
            misclassifications = 1 - scores
            misclassification_rates.append(np.average(misclassifications))
            if min_rate > misclassification_rates[i-2]:
                min_rate = misclassification_rates[i-2]
                min_k = i
        print('minimum rate = %s' % min_rate)
        print('best depth = %s' % min_k)
        label = 'bootstrap' if methods[b] else 'cross-validation'
        pyplot.plot(range(2, 16), misclassification_rates, color[b], label=label)
    pyplot.title('Misclassification rates of DecisionTreeClassifier')
    pyplot.xlabel('Values of k')
    pyplot.ylabel('Misclassification rates')
    pyplot.legend(loc='upper left')
    pyplot.show()
Example 8: plot_sgd_classifier
def plot_sgd_classifier(num_samples, clt_std):
    # generate the data
    X, y = make_blobs(n_samples=num_samples, centers=2, cluster_std=clt_std)
    # fit the data using logistic regression
    clf = SGDClassifier(loss='log', alpha=0.01)
    clf.fit(X, y)
    # plot the data
    x_ = np.linspace(min(X[:, 0]), max(X[:, 0]), 10)
    y_ = np.linspace(min(X[:, 1]), max(X[:, 1]), 10)
    X_, Y_ = np.meshgrid(x_, y_)
    Z = np.empty(X_.shape)
    for (i, j), val in np.ndenumerate(X_):
        x1 = val
        x2 = Y_[i, j]
        conf_score = clf.decision_function([[x1, x2]])
        Z[i, j] = conf_score[0]
    levels = [-1.0, 0, 1.0]
    colors = 'k'
    linestyles = ['dashed', 'solid', 'dashed']
    ax = plt.axes()
    plt.xlabel('X1')
    plt.ylabel('X2')
    ax.contour(X_, Y_, Z, colors=colors,
               levels=levels, linestyles=linestyles, labels='Boundary')
    ax.scatter(X[:, 0], X[:, 1], c=y)
Example 9: plot_sgd_separator
def plot_sgd_separator():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2,
                      random_state=0, cluster_std=0.60)
    # fit the model
    clf = SGDClassifier(loss="hinge", alpha=0.01,
                        n_iter=200, fit_intercept=True)
    clf.fit(X, Y)
    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)
    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        p = clf.decision_function([[x1, x2]])
        Z[i, j] = p[0]
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'
    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
    ax.axis('tight')
Example 10: exercise_1
def exercise_1():
    X, y = make_blobs(n_samples=1000, centers=50, n_features=2, random_state=0)
    n_samples = len(X)
    kf = cross_validation.KFold(n_samples, n_folds=10, shuffle=False, random_state=None)
    # kf = cross_validation.ShuffleSplit(1000, n_iter=25, test_size=0.1, train_size=0.9, random_state=None)
    error_total = np.zeros([49, 1], dtype=float)
    for k in range(1, 50):
        error = []
        clf = KNeighborsClassifier(n_neighbors=k)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            error.append(zero_one_loss(y_test, clf.predict(X_test)))
            # error.append(clf.predict(X_test))
            # error.append( 1. - clf.score(X_test, y_test) )  #, accuracy_score(y_test, clf.predict(X_test))
            # error.append(mean_squared_error(y_test, clf.predict(X_test)))
            # error.append()
        # print error
        error_total[k-1, 0] = np.array(error).mean()
    # print error_total
    x = np.arange(1, 50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, error_total[:, 0], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K values')
    plt.ylabel('Misclassification Error')
    plt.show()
Example 11: exercise_2a
def exercise_2a():
    X, y = make_blobs(n_samples=1000, centers=50, n_features=2, random_state=0)
    # plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
    # plt.show()
    kf = KFold(1000, n_folds=10, shuffle=False, random_state=None)
    accuracy_lst = np.zeros([49, 2], dtype=float)
    accuracy_current = np.zeros(10, dtype=float)
    for k in range(1, 50):
        iterator = 0
        clf = KNeighborsClassifier(n_neighbors=k)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            accuracy_current[iterator] = (1. - clf.score(X_test, y_test))
            iterator += 1
        accuracy_lst[k-1, 0] = accuracy_current.mean()
        # accuracy_lst[k-1, 1] = accuracy_current.std()  # confidence interval 95%
    x = np.arange(1, 50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, accuracy_lst[:, 0], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K values')
    plt.ylabel('Misclassification Error')
    plt.show()
Example 12: test_spectral_amg_mode
def test_spectral_amg_mode():
    # Test the amg mode of SpectralClustering
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    try:
        from pyamg import smoothed_aggregation_solver
        amg_loaded = True
    except ImportError:
        amg_loaded = False
    if amg_loaded:
        labels = spectral_clustering(S, n_clusters=len(centers),
                                     random_state=0, mode="amg")
        # We don't care too much that it's good, just that it *worked*.
        # There does have to be some lower limit on the performance though.
        assert_greater(np.mean(labels == true_labels), .3)
    else:
        assert_raises(ValueError, spectral_embedding, S,
                      n_components=len(centers), random_state=0, mode="amg")
Example 13: test_grid_search_iid
def test_grid_search_iid():
    # test the iid parameter
    # noise-free simple 2d-data
    X, y = make_blobs(centers=[[0, 0], [1, 0], [0, 1], [1, 1]], random_state=0,
                      cluster_std=0.1, shuffle=False, n_samples=80)
    # split dataset into two folds that are not iid
    # first one contains data of all 4 blobs, second only from two.
    mask = np.ones(X.shape[0], dtype=np.bool)
    mask[np.where(y == 1)[0][::2]] = 0
    mask[np.where(y == 2)[0][::2]] = 0
    # this leads to perfect classification on one fold and a score of 1/3 on
    # the other
    svm = SVC(kernel='linear')
    # create "cv" for splits
    cv = [[mask, ~mask], [~mask, mask]]
    # once with iid=True (default)
    grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv)
    grid_search.fit(X, y)
    _, average_score, scores = grid_search.cv_scores_[0]
    assert_array_almost_equal(scores, [1, 1. / 3.])
    # for first split, 1/4 of dataset is in test, for second 3/4.
    # take weighted average
    assert_almost_equal(average_score, 1 * 1. / 4. + 1. / 3. * 3. / 4.)
    # once with iid=False
    grid_search = GridSearchCV(svm, param_grid={'C': [1, 10]}, cv=cv,
                               iid=False)
    grid_search.fit(X, y)
    _, average_score, scores = grid_search.cv_scores_[0]
    # scores are the same as above
    assert_array_almost_equal(scores, [1, 1. / 3.])
    # averaged score is just the mean of scores
    assert_almost_equal(average_score, np.mean(scores))
Example 14: test_dbscan_optics_parity
def test_dbscan_optics_parity(eps, min_samples):
    # Test that OPTICS clustering labels are <= 5% difference of DBSCAN
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)
    # calculate optics with dbscan extract at 0.3 epsilon
    op = OPTICS(min_samples=min_samples).fit(X)
    core_optics, labels_optics = op.extract_dbscan(eps)
    # calculate dbscan labels
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
    contingency = contingency_matrix(db.labels_, labels_optics)
    agree = min(np.sum(np.max(contingency, axis=0)),
                np.sum(np.max(contingency, axis=1)))
    disagree = X.shape[0] - agree
    # verify core_labels match
    assert_array_equal(core_optics, db.core_sample_indices_)
    non_core_count = len(labels_optics) - len(core_optics)
    percent_mismatch = np.round((disagree - 1) / non_core_count, 2)
    # verify label mismatch is <= 5% labels
    assert percent_mismatch <= 0.05
Example 15: test_bin_seeds
def test_bin_seeds():
    # Test the bin seeding technique which can be used in the mean shift
    # algorithm
    # Data is just 6 points in the plane
    X = np.array([[1., 1.], [1.4, 1.4], [1.8, 1.2],
                  [2., 1.], [2.1, 1.1], [0., 0.]])
    # With a bin coarseness of 1.0 and min_bin_freq of 1, 3 bins should be
    # found
    ground_truth = {(1., 1.), (2., 1.), (0., 0.)}
    test_bins = get_bin_seeds(X, 1, 1)
    test_result = set(tuple(p) for p in test_bins)
    assert len(ground_truth.symmetric_difference(test_result)) == 0
    # With a bin coarseness of 1.0 and min_bin_freq of 2, 2 bins should be
    # found
    ground_truth = {(1., 1.), (2., 1.)}
    test_bins = get_bin_seeds(X, 1, 2)
    test_result = set(tuple(p) for p in test_bins)
    assert len(ground_truth.symmetric_difference(test_result)) == 0
    # With a bin size of 0.01 and min_bin_freq of 1, 6 bins should be found
    # we bail and use the whole data here.
    with warnings.catch_warnings(record=True):
        test_bins = get_bin_seeds(X, 0.01, 1)
    assert_array_almost_equal(test_bins, X)
    # tight clusters around [0, 0] and [1, 1], only get two bins
    X, _ = make_blobs(n_samples=100, n_features=2, centers=[[0, 0], [1, 1]],
                      cluster_std=0.1, random_state=0)
    test_bins = get_bin_seeds(X, 1)
    assert_array_equal(test_bins, [[0, 0], [1, 1]])