This article collects typical usage examples of the make_classification function from Python's sklearn.datasets.samples_generator module. If you have been struggling with questions such as: What exactly does make_classification do? How is make_classification called? What do real uses of make_classification look like? Then the curated code examples below may be just what you need.
The following shows 15 code examples of make_classification, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
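Before the examples, a quick note and a minimal sketch. The sklearn.datasets.samples_generator module path used throughout this article was deprecated in newer scikit-learn releases (around 0.22) and later removed; in current releases, make_classification is imported directly from sklearn.datasets. The sketch below uses illustrative parameter values, not values taken from any example:

from sklearn.datasets.samples_generator import make_classification
# In scikit-learn >= 0.22, use instead:
# from sklearn.datasets import make_classification

# X has shape (n_samples, n_features); y has shape (n_samples,) and holds
# integer class labels in {0, ..., n_classes - 1}.
X, y = make_classification(n_samples=100,    # number of rows
                           n_features=10,    # total number of columns
                           n_informative=3,  # columns carrying class signal
                           n_redundant=2,    # linear combos of informative ones
                           n_classes=3,
                           shuffle=False,    # keep signal columns first
                           random_state=42)  # reproducible draw
print(X.shape, y.shape)  # (100, 10) (100,)

With shuffle=False the informative columns come first, followed by the redundant and repeated ones and then pure noise; several of the feature-selection tests below rely on this layout when they assert that only the first few columns are selected.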
Example 1: make_sparse_data
def make_sparse_data(use_feature_hashing=False):
    """
    Function to create sparse data with two features always zero
    in the training set and a different one always zero in the
    test set
    """
    # Create training data
    X, y = make_classification(n_samples=500, n_features=3,
                               n_informative=3, n_redundant=0,
                               n_classes=2, random_state=1234567890)

    # we need features to be non-negative since we will be
    # using Naive Bayes later
    X = np.abs(X)

    # make sure that none of the features are zero
    X[np.where(X == 0)] += 1

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 501)]

    # create a list of dictionaries as the features,
    # with f1 and f5 always 0
    feature_names = ['f{}'.format(n) for n in range(1, 6)]
    features = []
    for row in X:
        row = [0] + row.tolist() + [0]
        features.append(dict(zip(feature_names, row)))

    # use a FeatureHasher if we are asked to do feature hashing
    vectorizer = FeatureHasher(n_features=4) if use_feature_hashing else None
    train_fs = FeatureSet('train_sparse', ids,
                          features=features, labels=y,
                          vectorizer=vectorizer)

    # now create the test set with f4 always 0 but nothing else
    X, y = make_classification(n_samples=100, n_features=4,
                               n_informative=4, n_redundant=0,
                               n_classes=2, random_state=1234567890)
    X = np.abs(X)
    X[np.where(X == 0)] += 1
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 101)]

    # create a list of dictionaries as the features,
    # with f4 always 0
    feature_names = ['f{}'.format(n) for n in range(1, 6)]
    features = []
    for row in X:
        row = row.tolist()
        row = row[:3] + [0] + row[3:]
        features.append(dict(zip(feature_names, row)))

    test_fs = FeatureSet('test_sparse', ids,
                         features=features, labels=y,
                         vectorizer=vectorizer)

    return train_fs, test_fs
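A minimal usage sketch of the helper above; it assumes the SKLL imports shown in the function (numpy as np, FeatureHasher, FeatureSet) and that FeatureSet exposes its ids attribute, as SKLL's class does:

train_fs, test_fs = make_sparse_data()
# f1 and f5 are always 0 in the training set, while f4 is always 0 in the
# test set, so a learner must handle features present in only one split.
print(len(train_fs.ids), len(test_fs.ids))  # 500 100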
Example 2: test_linearsvc_parameters
def test_linearsvc_parameters():
    """
    Test possible parameter combinations in LinearSVC
    """
    # Generate list of possible parameter combinations
    losses = ['hinge', 'squared_hinge', 'logistic_regression', 'foo']
    penalties, duals = ['l1', 'l2', 'bar'], [True, False]

    X, y = make_classification(n_samples=5, n_features=5)

    for loss, penalty, dual in itertools.product(losses, penalties, duals):
        clf = svm.LinearSVC(penalty=penalty, loss=loss, dual=dual)
        if ((loss, penalty) == ('hinge', 'l1') or
                (loss, penalty, dual) == ('hinge', 'l2', False) or
                (penalty, dual) == ('l1', True) or
                loss == 'foo' or penalty == 'bar'):
            assert_raises_regexp(ValueError,
                                 "Unsupported set of arguments.*penalty='%s.*"
                                 "loss='%s.*dual=%s"
                                 % (penalty, loss, dual),
                                 clf.fit, X, y)
        else:
            clf.fit(X, y)

    # Incorrect loss value - test if explicit error message is raised
    assert_raises_regexp(ValueError, ".*loss='L3' is not supported.*",
                         svm.LinearSVC(loss="L3").fit, X, y)
Example 3: test_f_classif_multi_class
def test_f_classif_multi_class():
    """
    Test whether the F test yields meaningful results
    on a simple simulated classification problem
    """
    X, Y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    F, pv = f_classif(X, Y)
    assert (F > 0).all()
    assert (pv > 0).all()
    assert (pv < 1).all()
    assert (pv[:5] < 0.05).all()
    assert (pv[5:] > 1.0e-5).all()
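Why pv[:5] and pv[5:] in particular: the data is generated with shuffle=False, so the 3 informative plus 2 redundant columns sit at positions 0-4 and everything after is noise. A small sketch that makes this visible by ranking the F statistics directly (same generator settings as above; the expected output is an assumption based on those settings):

import numpy as np
from sklearn.datasets.samples_generator import make_classification
from sklearn.feature_selection import f_classif

X, Y = make_classification(n_samples=200, n_features=20, n_informative=3,
                           n_redundant=2, n_repeated=0, n_classes=8,
                           n_clusters_per_class=1, flip_y=0.0, class_sep=10,
                           shuffle=False, random_state=0)
F, pv = f_classif(X, Y)
# The five signal-carrying columns should have the largest F statistics.
print(sorted(np.argsort(F)[-5:]))  # expected: [0, 1, 2, 3, 4]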
Example 4: test_select_heuristics_classif
def test_select_heuristics_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the fdr, fwe and fpr heuristics
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectFwe(f_classif, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    for mode in ["fdr", "fpr", "fwe"]:
        X_r2 = GenericUnivariateSelect(
            f_classif, mode=mode, param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        assert_array_almost_equal(support, gtruth)
Example 5: test_select_percentile_classif_sparse
def test_select_percentile_classif_sparse():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X = sparse.csr_matrix(X)

    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_classif, mode="percentile", param=25).fit(X, y).transform(X)
    assert_array_equal(X_r.toarray(), X_r2.toarray())
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

    X_r2inv = univariate_filter.inverse_transform(X_r2)
    assert_true(sparse.issparse(X_r2inv))
    support_mask = safe_mask(X_r2inv, support)
    assert_equal(X_r2inv.shape, X.shape)
    assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray())
    # Check other columns are empty
    assert_equal(X_r2inv.getnnz(), X_r.getnnz())
Example 6: test_select_kbest_all
def test_select_kbest_all():
    # Test whether k="all" correctly returns all features.
    X, y = make_classification(n_samples=20, n_features=10,
                               shuffle=False, random_state=0)

    univariate_filter = SelectKBest(f_classif, k="all")
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_array_equal(X, X_r)
Example 7: test_f_classif
def test_f_classif():
    # Test whether the F test yields meaningful results
    # on a simple simulated classification problem
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    F, pv = f_classif(X, y)
    F_sparse, pv_sparse = f_classif(sparse.csr_matrix(X), y)
    assert_true((F > 0).all())
    assert_true((pv > 0).all())
    assert_true((pv < 1).all())
    assert_true((pv[:5] < 0.05).all())
    assert_true((pv[5:] > 1.0e-4).all())
    assert_array_almost_equal(F_sparse, F)
    assert_array_almost_equal(pv_sparse, pv)
Example 8: test_mismatch_labels_features
def test_mismatch_labels_features():
    """
    Test to catch a mismatch between the shape of the labels vector
    and the feature matrix
    """
    # get 100 instances with 4 features but ignore the labels we
    # get from here
    X, y = make_classification(n_samples=100, n_features=4,
                               n_informative=4, n_redundant=0,
                               n_classes=3, random_state=1234567890)

    # double-stack y to ensure we don't match the number of feature rows
    y2 = np.hstack([y, y])

    # convert the features into a list of dictionaries
    feature_names = ['f{}'.format(n) for n in range(1, 5)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    # get 100 ids
    ids = ['EXAMPLE_{}'.format(i) for i in range(100)]

    # This should raise a ValueError
    FeatureSet('test', ids, features=features, labels=y2)
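The snippet ends with a bare constructor call that is expected to raise; presumably the original test harness asserts this with a decorator or context manager not captured here. A minimal explicit form of the same check (using pytest.raises, which is an assumption about the harness):

import pytest

# 100 labels duplicated to 200 cannot match 100 feature rows:
with pytest.raises(ValueError):
    FeatureSet('test', ids, features=features, labels=y2)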
Example 9: test_select_kbest_classif
def test_select_kbest_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the k best heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectKBest(f_classif, k=5)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_classif, mode="k_best", param=5).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example 10: test_grid_search_sparse_scoring
def test_grid_search_sparse_scoring():
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
    # Smoke test the score
    # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]),
    #                            cv.score(X_[:180], y[:180]))

    # test loss where greater is worse
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)

    F1Loss = Scorer(f1_loss, greater_is_better=False)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss)
    cv.fit(X_[:180], y_[:180])
    y_pred3 = cv.predict(X_[180:])
    C3 = cv.best_estimator_.C
    assert_equal(C, C3)
    assert_array_equal(y_pred, y_pred3)
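The Scorer class used above belongs to an old scikit-learn API; in modern releases the equivalent wrapper is make_scorer. A hedged sketch of the same construction with the current API (make_scorer with greater_is_better=False flips the sign of the wrapped function so that GridSearchCV can still maximize):

from sklearn.metrics import f1_score, make_scorer

def f1_loss(y_true, y_pred):
    return -f1_score(y_true, y_pred)

# Modern replacement for Scorer(f1_loss, greater_is_better=False):
f1_loss_scorer = make_scorer(f1_loss, greater_is_better=False)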
Example 11: test_deprecated_score_func
def test_deprecated_score_func():
    # test that the old deprecated way of passing a score / loss function
    # is still supported
    X, y = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC(random_state=0)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X[:180], y[:180])
    y_pred = cv.predict(X[180:])
    C = cv.best_estimator_.C

    clf = LinearSVC(random_state=0)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    with warnings.catch_warnings(record=True):
        # catch deprecation warning
        cv.fit(X[:180], y[:180])
    y_pred_func = cv.predict(X[180:])
    C_func = cv.best_estimator_.C
    assert_array_equal(y_pred, y_pred_func)
    assert_equal(C, C_func)

    # test loss where greater is worse
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)

    clf = LinearSVC(random_state=0)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, loss_func=f1_loss)
    with warnings.catch_warnings(record=True):
        # catch deprecation warning
        cv.fit(X[:180], y[:180])
    y_pred_loss = cv.predict(X[180:])
    C_loss = cv.best_estimator_.C
    assert_array_equal(y_pred, y_pred_loss)
    assert_equal(C, C_loss)
Example 12: test_select_fwe_classif
def test_select_fwe_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fwe heuristic
    """
    X, Y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectFwe(f_classif, alpha=0.01)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_classif, mode="fwe", param=0.01).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert np.sum(np.abs(support - gtruth)) < 2
Example 13: test_mutual_info_classif
def test_mutual_info_classif():
    X, y = make_classification(
        n_samples=100,
        n_features=5,
        n_informative=1,
        n_redundant=1,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_classif, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        mutual_info_classif, mode="k_best", param=2).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_classif, percentile=40)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        mutual_info_classif, mode="percentile", param=40).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)
Example 14: test_grid_search_precomputed_kernel
def test_grid_search_precomputed_kernel():
    """Test that grid search works when the input features are given in the
    form of a precomputed kernel matrix"""
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    # compute the training kernel matrix corresponding to the linear kernel
    K_train = np.dot(X_[:180], X_[:180].T)
    y_train = y_[:180]

    clf = SVC(kernel='precomputed')
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(K_train, y_train)
    assert_true(cv.best_score_ >= 0)

    # compute the test kernel matrix
    K_test = np.dot(X_[180:], X_[:180].T)
    y_test = y_[180:]
    y_pred = cv.predict(K_test)
    assert_true(np.mean(y_pred == y_test) >= 0)

    # test that an error is raised when the precomputed kernel is not
    # array-like or sparse
    assert_raises(ValueError, cv.fit, K_train.tolist(), y_train)
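The shape discipline here is the essential point of kernel='precomputed': fit expects an (n_train, n_train) Gram matrix, while predict expects an (n_test, n_train) matrix of kernel values between test and training points. In terms of the variables above:

# K_train = X_train @ X_train.T  -> shape (180, 180), used by fit
# K_test  = X_test  @ X_train.T  -> shape (20, 180),  used by predict
assert K_train.shape == (180, 180)
assert K_test.shape == (20, 180)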
Example 15: test_grid_search_precomputed_kernel_error_kernel_function
def test_grid_search_precomputed_kernel_error_kernel_function():
    """Test that grid search returns an error when using a kernel_function"""
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    kernel_function = lambda x1, x2: np.dot(x1, x2.T)
    clf = SVC(kernel=kernel_function)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    assert_raises(ValueError, cv.fit, X_, y_)