This article collects typical usage examples of the Python function sklearn.datasets.make_classification. If you are unsure what make_classification does, how to call it, or what real-world usage looks like, the curated examples below should help.
The sections below present 15 code examples of make_classification, sorted by popularity by default. You can upvote the examples you find useful; your ratings help the system recommend better Python code samples.
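Before turning to the collected test-suite excerpts, here is a minimal, self-contained sketch of the basic call. The parameter values are arbitrary and chosen purely for illustration; they do not come from any of the examples that follow.

import numpy as np
from sklearn.datasets import make_classification

# Draw a small synthetic 3-class dataset: 100 samples, 20 features,
# 5 of which are informative. random_state makes the draw reproducible.
X, y = make_classification(n_samples=100, n_features=20, n_informative=5,
                           n_classes=3, random_state=0)

print(X.shape)       # (100, 20)
print(np.unique(y))  # [0 1 2]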
Example 1: test_logistic_regressioncv_class_weights
def test_logistic_regressioncv_class_weights():
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               n_classes=3, random_state=0)

    msg = ("In LogisticRegressionCV the liblinear solver cannot handle "
           "multiclass with class_weight of type dict. Use the lbfgs, "
           "newton-cg or sag solvers or set class_weight='balanced'")
    clf_lib = LogisticRegressionCV(class_weight={0: 0.1, 1: 0.2},
                                   solver='liblinear')
    assert_raise_message(ValueError, msg, clf_lib.fit, X, y)
    y_ = y.copy()
    y_[y == 2] = 1
    clf_lib.fit(X, y_)
    assert_array_equal(clf_lib.classes_, [0, 1])

    # Test for class_weight=balanced
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               random_state=0)
    clf_lbf = LogisticRegressionCV(solver='lbfgs', fit_intercept=False,
                                   class_weight='balanced')
    clf_lbf.fit(X, y)
    clf_lib = LogisticRegressionCV(solver='liblinear', fit_intercept=False,
                                   class_weight='balanced')
    clf_lib.fit(X, y)
    clf_sag = LogisticRegressionCV(solver='sag', fit_intercept=False,
                                   class_weight='balanced', max_iter=2000)
    clf_sag.fit(X, y)
    assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=4)
    assert_array_almost_equal(clf_sag.coef_, clf_lbf.coef_, decimal=4)
    assert_array_almost_equal(clf_lib.coef_, clf_sag.coef_, decimal=4)
Example 2: test_make_classification
def test_make_classification():
    weights = [0.1, 0.25]
    X, y = make_classification(n_samples=100, n_features=20, n_informative=5,
                               n_redundant=1, n_repeated=1, n_classes=3,
                               n_clusters_per_class=1, hypercube=False,
                               shift=None, scale=None, weights=weights,
                               random_state=0)

    assert_equal(weights, [0.1, 0.25])
    assert_equal(X.shape, (100, 20), "X shape mismatch")
    assert_equal(y.shape, (100,), "y shape mismatch")
    assert_equal(np.unique(y).shape, (3,), "Unexpected number of classes")
    assert_equal(sum(y == 0), 10, "Unexpected number of samples in class #0")
    assert_equal(sum(y == 1), 25, "Unexpected number of samples in class #1")
    assert_equal(sum(y == 2), 65, "Unexpected number of samples in class #2")

    # Test for n_features > 30
    X, y = make_classification(n_samples=2000, n_features=31, n_informative=31,
                               n_redundant=0, n_repeated=0, hypercube=True,
                               scale=0.5, random_state=0)
    assert_equal(X.shape, (2000, 31), "X shape mismatch")
    assert_equal(y.shape, (2000,), "y shape mismatch")
    assert_equal(np.unique(X.view([('', X.dtype)] * X.shape[1])).view(X.dtype)
                 .reshape(-1, X.shape[1]).shape[0], 2000,
                 "Unexpected number of unique rows")
Author: dominicSchiller, Project: DataScience_EA12_Clustering_Exercise, Lines: 26, Source: test_samples_generator.py
Example 3: test_logistic_regressioncv_class_weights
def test_logistic_regressioncv_class_weights():
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               n_classes=3, random_state=0)

    # Test that liblinear fails when a class_weight of type dict is
    # provided for a multiclass problem. It can, however, handle
    # binary problems.
    clf_lib = LogisticRegressionCV(class_weight={0: 0.1, 1: 0.2},
                                   solver='liblinear')
    assert_raises(ValueError, clf_lib.fit, X, y)
    y_ = y.copy()
    y_[y == 2] = 1
    clf_lib.fit(X, y_)
    assert_array_equal(clf_lib.classes_, [0, 1])

    # Test for class_weight=auto
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               random_state=0)
    clf_lbf = LogisticRegressionCV(solver='lbfgs', fit_intercept=False,
                                   class_weight='auto')
    clf_lbf.fit(X, y)
    clf_lib = LogisticRegressionCV(solver='liblinear', fit_intercept=False,
                                   class_weight='auto')
    clf_lib.fit(X, y)
    assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=4)
Example 4: setUp
def setUp(self):
    np.random.seed(488881)
    # binomial
    x, y = make_classification(n_samples=300, random_state=6601)
    x_sparse = csr_matrix(x)
    x_wide, y_wide = make_classification(n_samples=100, n_features=150,
                                         random_state=8911)
    x_wide_sparse = csr_matrix(x_wide)
    self.binomial = [(x, y), (x_sparse, y), (x_wide, y_wide),
                     (x_wide_sparse, y_wide)]

    # multinomial
    x, y = make_classification(n_samples=400, n_classes=3, n_informative=15,
                               n_features=25, random_state=10585)
    x_sparse = csr_matrix(x)
    x_wide, y_wide = make_classification(n_samples=400, n_classes=3,
                                         n_informative=15, n_features=500,
                                         random_state=15841)
    x_wide_sparse = csr_matrix(x_wide)
    self.multinomial = [(x, y), (x_sparse, y), (x_wide, y_wide),
                        (x_wide_sparse, y_wide)]

    self.alphas = [0., 0.25, 0.50, 0.75, 1.]
    self.n_splits = [-1, 0, 5]
    self.scoring = [
        "accuracy",
        "roc_auc",
        "average_precision",
        "log_loss",
        "precision_macro",
        "precision_micro",
        "precision_weighted",
        "f1_micro",
        "f1_macro",
        "f1_weighted",
    ]
    self.multinomial_scoring = [
        "accuracy",
        "log_loss",
        "precision_macro",
        "precision_micro",
        "precision_weighted",
        "f1_micro",
        "f1_macro",
        "f1_weighted",
    ]
示例5: test_liblinear_random_state
def test_liblinear_random_state():
    X, y = make_classification(n_samples=20)
    lr1 = LogisticRegression(random_state=0)
    lr1.fit(X, y)
    lr2 = LogisticRegression(random_state=0)
    lr2.fit(X, y)
    assert_array_almost_equal(lr1.coef_, lr2.coef_)
Example 6: test_importances
def test_importances():
    """Check variable importances."""
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    for name, Tree in CLF_TREES.items():
        clf = Tree(random_state=0)
        clf.fit(X, y)
        importances = clf.feature_importances_
        n_important = np.sum(importances > 0.1)

        assert_equal(importances.shape[0], 10, "Failed with {0}".format(name))
        assert_equal(n_important, 3, "Failed with {0}".format(name))

        X_new = clf.transform(X, threshold="mean")
        assert_less(0, X_new.shape[1], "Failed with {0}".format(name))
        assert_less(X_new.shape[1], X.shape[1], "Failed with {0}".format(name))

    # Check on iris that importances are the same for all builders
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(iris.data, iris.target)
    clf2 = DecisionTreeClassifier(random_state=0,
                                  max_leaf_nodes=len(iris.data))
    clf2.fit(iris.data, iris.target)

    assert_array_equal(clf.feature_importances_,
                       clf2.feature_importances_)
Example 7: test_grid_search_precomputed_kernel
def test_grid_search_precomputed_kernel():
    """Test that grid search works when the input features are given in the
    form of a precomputed kernel matrix."""
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    # compute the training kernel matrix corresponding to the linear kernel
    K_train = np.dot(X_[:180], X_[:180].T)
    y_train = y_[:180]

    clf = SVC(kernel='precomputed')
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(K_train, y_train)

    assert_true(cv.best_score_ >= 0)

    # compute the test kernel matrix
    K_test = np.dot(X_[180:], X_[:180].T)
    y_test = y_[180:]

    y_pred = cv.predict(K_test)

    assert_true(np.mean(y_pred == y_test) >= 0)

    # test error is raised when the precomputed kernel is not array-like
    # or sparse
    assert_raises(ValueError, cv.fit, K_train.tolist(), y_train)
Example 8: test_grid_search_sparse_scoring
def test_grid_search_sparse_scoring():
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
    # Smoke test the score
    # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]),
    #                            cv.score(X_[:180], y[:180]))

    # test loss where greater is worse
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)

    F1Loss = make_scorer(f1_loss, greater_is_better=False)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss)
    cv.fit(X_[:180], y_[:180])
    y_pred3 = cv.predict(X_[180:])
    C3 = cv.best_estimator_.C

    assert_equal(C, C3)
    assert_array_equal(y_pred, y_pred3)
Example 9: test_engine_info
def test_engine_info(self):
    n_samples = 20
    n_features = 100
    n_proc = 2
    X, y = datasets.make_classification(n_samples=n_samples,
                                        n_features=n_features,
                                        n_informative=2,
                                        random_state=1)
    Xy = dict(X=X, y=y)
    cv_svm_local = CV(Methods(*[SVC(kernel="linear"),
                                SVC(kernel="rbf")]),
                      n_folds=3)
    swf_engine = SomaWorkflowEngine(cv_svm_local,
                                    num_processes=n_proc,
                                    resource_id="[email protected]",
                                    login="jl237561",
                                    remove_finished_wf=False,
                                    remove_local_tree=False,
                                    queue="Global_long")
    swf_engine.run(**Xy)

    print("engine_info ================")
    for job_info in swf_engine.engine_info:
        print("  job_info=================")
        print("  mem_cost= ", job_info.mem_cost)
        print("  vmem_cost= ", job_info.vmem_cost)
        print("  time_cost= ", job_info.time_cost)
        self.assertTrue(job_info.time_cost > 0)
Example 10: test_intercept_logistic_helper
def test_intercept_logistic_helper():
    n_samples, n_features = 10, 5
    X, y = make_classification(n_samples=n_samples, n_features=n_features,
                               random_state=0)

    # Fit intercept case.
    alpha = 1.
    w = np.ones(n_features + 1)
    grad_interp, hess_interp = _logistic_grad_hess(w, X, y, alpha)
    loss_interp = _logistic_loss(w, X, y, alpha)

    # Do not fit intercept. This can be considered equivalent to adding
    # a feature vector of ones, i.e. a column of ones.
    X_ = np.hstack((X, np.ones(10)[:, np.newaxis]))
    grad, hess = _logistic_grad_hess(w, X_, y, alpha)
    loss = _logistic_loss(w, X_, y, alpha)

    # In the fit_intercept=False case, the feature vector of ones is
    # penalized. This should be taken care of.
    assert_almost_equal(loss_interp + 0.5 * (w[-1] ** 2), loss)

    # Check gradient.
    assert_array_almost_equal(grad_interp[:n_features], grad[:n_features])
    assert_almost_equal(grad_interp[-1] + alpha * w[-1], grad[-1])

    rng = np.random.RandomState(0)
    grad = rng.rand(n_features + 1)
    hess_interp = hess_interp(grad)
    hess = hess(grad)
    assert_array_almost_equal(hess_interp[:n_features], hess[:n_features])
    assert_almost_equal(hess_interp[-1] + alpha * grad[-1], hess[-1])
Example 11: test_class_weight_auto_classifiers
def test_class_weight_auto_classifiers():
    """Test that class_weight="auto" improves f1-score"""
    # This test is broken; its success depends on:
    # * a rare fortuitous RNG seed for make_classification; and
    # * the use of binary F1 over a seemingly arbitrary positive class for two
    #   datasets, and weighted average F1 for the third.
    # Its expectations need to be clarified and reimplemented.
    raise SkipTest("This test requires redefinition")

    classifiers = all_estimators(type_filter="classifier")

    clean_warning_registry()
    with warnings.catch_warnings(record=True):
        classifiers = [c for c in classifiers
                       if "class_weight" in c[1]().get_params().keys()]

    for n_classes, weights in zip([2, 3], [[0.8, 0.2], [0.8, 0.1, 0.1]]):
        # create unbalanced dataset
        X, y = make_classification(n_classes=n_classes, n_samples=200,
                                   n_features=10, weights=weights,
                                   random_state=0, n_informative=n_classes)
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=0.5,
                                                            random_state=0)
        for name, Classifier in classifiers:
            if (name != "NuSVC"
                    # the sparse version has a parameter that doesn't do anything
                    and not name.startswith("RidgeClassifier")
                    # RidgeClassifier behaves unexpected
                    # FIXME!
                    and not name.endswith("NB")):
                # NaiveBayes classifiers have a somewhat different interface.
                # FIXME SOON!
                yield (check_class_weight_auto_classifiers, name, Classifier,
                       X_train, y_train, X_test, y_test, weights)
Example 12: test_cv
def test_cv(self):
    X, y = datasets.make_classification(n_samples=20, n_features=5,
                                        n_informative=2)
    n_folds = 2

    # = With EPAC
    wf = CV(SVC(kernel="linear"), n_folds=n_folds,
            reducer=ClassificationReport(keep=True))
    r_epac = wf.top_down(X=X, y=y)

    # = With SKLEARN
    clf = SVC(kernel="linear")
    r_sklearn = list()
    for idx_train, idx_test in StratifiedKFold(y=y, n_folds=n_folds):
        # idx_train, idx_test = cv.__iter__().next()
        X_train = X[idx_train, :]
        X_test = X[idx_test, :]
        y_train = y[idx_train]
        clf.fit(X_train, y_train)
        r_sklearn.append(clf.predict(X_test))

    # = Comparison
    key2cmp = "y" + conf.SEP + conf.TEST + conf.SEP + conf.PREDICTION
    for icv in range(n_folds):
        comp = np.all(np.asarray(r_epac[0][key2cmp]) ==
                      np.asarray(r_sklearn[0]))
        self.assertTrue(comp, u"Diff CV: EPAC vs sklearn")

    # test reduce
    r_epac_reduce = wf.reduce().values()[0][key2cmp]
    comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn))
    self.assertTrue(comp, u"Diff CV: EPAC reduce")
Example 13: test_perm
def test_perm(self):
    X, y = datasets.make_classification(n_samples=20, n_features=5,
                                        n_informative=2)
    n_perms = 2
    rnd = 0

    # = With EPAC
    wf = Perms(SVC(kernel="linear"), n_perms=n_perms, permute="y",
               random_state=rnd, reducer=None)
    r_epac = wf.top_down(X=X, y=y)

    # = With SKLEARN
    clf = SVC(kernel="linear")
    r_sklearn = list()
    for perm in Permutations(n=y.shape[0], n_perms=n_perms, random_state=rnd):
        y_p = y[perm]
        clf.fit(X, y_p)
        r_sklearn.append(clf.predict(X))

    key2cmp = "y" + conf.SEP + conf.PREDICTION

    # = Comparison
    for iperm in range(n_perms):
        comp = np.all(np.asarray(r_epac[iperm][key2cmp]) ==
                      np.asarray(r_sklearn[iperm]))
        self.assertTrue(comp, u"Diff Perm: EPAC vs sklearn")

    # test reduce
    for iperm in range(n_perms):
        r_epac_reduce = wf.reduce().values()[iperm][key2cmp]
        comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn[iperm]))
        self.assertTrue(comp, u"Diff Perm: EPAC reduce")
Example 14: test_cvbestsearchrefit_select_k_best
def test_cvbestsearchrefit_select_k_best(self):
    list_C_value = range(2, 10, 1)
    # print repr(list_C_value)
    for C_value in list_C_value:
        # C_value = 2
        # print C_value
        X, y = datasets.make_classification(n_samples=100, n_features=500,
                                            n_informative=5)
        n_folds_nested = 2
        # random_state = 0
        k_values = [2, 3, 4, 5, 6]
        key_y_pred = "y" + conf.SEP + conf.PREDICTION

        # With EPAC
        methods = Methods(*[Pipe(SelectKBest(k=k),
                                 SVC(C=C_value, kernel="linear"))
                            for k in k_values])
        wf = CVBestSearchRefitParallel(methods, n_folds=n_folds_nested)
        wf.run(X=X, y=y)
        r_epac = wf.reduce().values()[0]

        # - Without EPAC
        from sklearn.pipeline import Pipeline
        r_sklearn = dict()
        clf = Pipeline([("anova", SelectKBest(k=3)),
                        ("svm", SVC(C=C_value, kernel="linear"))])
        parameters = {"anova__k": k_values}
        cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
        gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
        gscv.fit(X, y)
        r_sklearn[key_y_pred] = gscv.predict(X)
        r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
        r_sklearn[conf.BEST_PARAMS]["k"] = r_sklearn[conf.BEST_PARAMS]["anova__k"]

        # - Comparisons
        comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
        self.assertTrue(comp, u"Diff CVBestSearchRefitParallel: prediction")
        for key_param in r_epac[conf.BEST_PARAMS][0]:
            if key_param in r_sklearn[conf.BEST_PARAMS]:
                comp = (r_sklearn[conf.BEST_PARAMS][key_param] ==
                        r_epac[conf.BEST_PARAMS][0][key_param])
                self.assertTrue(comp,
                                u"Diff CVBestSearchRefitParallel: best parameters")
Example 15: test_cvbestsearchrefit
def test_cvbestsearchrefit(self):
    X, y = datasets.make_classification(n_samples=12, n_features=10,
                                        n_informative=2)
    n_folds_nested = 2
    # random_state = 0
    C_values = [0.1, 0.5, 1, 2, 5]
    kernels = ["linear", "rbf"]
    key_y_pred = "y" + conf.SEP + conf.PREDICTION

    # With EPAC
    methods = Methods(*[SVC(C=C, kernel=kernel)
                        for C in C_values for kernel in kernels])
    wf = CVBestSearchRefitParallel(methods, n_folds=n_folds_nested)
    wf.run(X=X, y=y)
    r_epac = wf.reduce().values()[0]

    # - Without EPAC
    r_sklearn = dict()
    clf = SVC(kernel="linear")
    parameters = {"C": C_values, "kernel": kernels}
    cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
    gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
    gscv.fit(X, y)
    r_sklearn[key_y_pred] = gscv.predict(X)
    r_sklearn[conf.BEST_PARAMS] = gscv.best_params_

    # - Comparisons
    comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
    self.assertTrue(comp, u"Diff CVBestSearchRefitParallel: prediction")
    for key_param in r_epac[conf.BEST_PARAMS][0]:
        if key_param in r_sklearn[conf.BEST_PARAMS]:
            comp = (r_sklearn[conf.BEST_PARAMS][key_param] ==
                    r_epac[conf.BEST_PARAMS][0][key_param])
            self.assertTrue(comp,
                            u"Diff CVBestSearchRefitParallel: best parameters")