This article collects and organizes typical usage examples of the sklearn.datasets.make_classification method in Python. If you have been wondering what datasets.make_classification does, how to call it, or where to find working examples, the curated code samples below should help. You can also explore the other methods available in the sklearn.datasets module.
Below are 15 code examples of the datasets.make_classification method, sorted by popularity by default.
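Before diving into the examples, here is a minimal, self-contained sketch of the basic API (written for this overview rather than taken from any of the projects below), showing a typical call and the shapes it returns:

from sklearn.datasets import make_classification

# Generate a toy binary problem: 100 samples, 20 features, 5 of them
# informative; fixing random_state makes the draw reproducible.
X, y = make_classification(n_samples=100, n_features=20, n_informative=5,
                           n_classes=2, random_state=42)

print(X.shape)  # (100, 20) -- feature matrix
print(y.shape)  # (100,)    -- integer class labels in {0, 1}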
Example 1: test_plot_estimator_and_lightgbm
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_plot_estimator_and_lightgbm(tmpdir):
    pytest.importorskip('graphviz')
    lightgbm = pytest.importorskip('lightgbm')
    from pygbm.plotting import plot_tree

    n_classes = 3
    X, y = make_classification(n_samples=150, n_classes=n_classes,
                               n_features=5, n_informative=3, n_redundant=0,
                               random_state=0)

    n_trees = 3
    est_pygbm = GradientBoostingClassifier(max_iter=n_trees,
                                           n_iter_no_change=None)
    est_pygbm.fit(X, y)
    est_lightgbm = lightgbm.LGBMClassifier(n_estimators=n_trees)
    est_lightgbm.fit(X, y)

    n_total_trees = n_trees * n_classes
    for i in range(n_total_trees):
        filename = tmpdir.join('plot_mixed_predictors.pdf')
        plot_tree(est_pygbm, est_lightgbm=est_lightgbm, tree_index=i,
                  view=False, filename=filename)
        assert filename.exists()
Example 2: make_classification_df
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def make_classification_df(n_samples: int = 1024,
                           n_num_features: int = 20,
                           n_cat_features: int = 0,
                           class_sep: float = 1.0,
                           n_classes: int = 2,
                           feature_name: str = 'col_{}',
                           target_name: str = 'target',
                           random_state: int = 0,
                           id_column: str = None) -> Tuple[pd.DataFrame, pd.Series]:
    np.random.seed(random_state)
    X, y = make_classification(n_samples=n_samples, n_features=n_num_features, class_sep=class_sep,
                               random_state=random_state, n_classes=n_classes, n_informative=max(n_classes, 2))

    X = pd.DataFrame(X, columns=[feature_name.format(i) for i in range(n_num_features)])
    y = pd.Series(y, name=target_name)

    if id_column is not None:
        X[id_column] = range(n_samples)

    for i in range(n_cat_features):
        X['cat_{}'.format(i)] = \
            pd.Series(np.random.choice(['A', 'B', None], size=n_samples)).astype('category')

    return X, y
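A hypothetical call to the helper defined above (this usage snippet is an editorial addition, not part of the original source) might look like:

X, y = make_classification_df(n_samples=256, n_num_features=5,
                              n_cat_features=2, id_column='id')
# Numeric columns come first, then the id column, then the categoricals:
print(list(X.columns))  # ['col_0', 'col_1', 'col_2', 'col_3', 'col_4',
                        #  'id', 'cat_0', 'cat_1']
print(y.name)           # 'target'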
Example 3: test_cv_lgbm
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_cv_lgbm():
    X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    models = [LGBMClassifier(n_estimators=300) for _ in range(5)]

    pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
                                                             eval_func=roc_auc_score,
                                                             fit_params={'early_stopping_rounds': 200})

    print(scores)
    assert len(scores) == 5 + 1
    assert scores[-1] >= 0.85  # overall roc_auc
    assert roc_auc_score(y_train, pred_oof) == scores[-1]
    assert roc_auc_score(y_test, pred_test) >= 0.85  # test roc_auc
    assert roc_auc_score(y, models[0].predict_proba(X)[:, 1]) >= 0.85  # make sure models are trained
    assert len(importance) == 5
    assert list(importance[0].columns) == ['feature', 'importance']
    assert len(importance[0]) == 20
Example 4: test_cv_partial_evaluate
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_cv_partial_evaluate():
    X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    model = RidgeClassifier(alpha=1.0)

    n = 0

    def _fold_count(*args):
        nonlocal n
        n += 1

    cv = Take(2, KFold(5))

    pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=cv, eval_func=roc_auc_score,
                                                    on_each_fold=_fold_count)

    assert len(scores) == 2 + 1
    assert scores[-1] >= 0.8  # overall auc
    assert n == 2
Example 5: test_fit_params_callback
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_fit_params_callback():
    X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    models = [LGBMClassifier(n_estimators=300) for _ in range(5)]

    sample_weights = np.random.randint(1, 10, size=len(X_train))
    sample_weights = sample_weights / sample_weights.sum()

    def fit_params(n: int, train_index: List[int], valid_index: List[int]):
        return {
            'early_stopping_rounds': 100,
            'sample_weight': list(sample_weights[train_index]),
            'eval_sample_weight': [list(sample_weights[valid_index])]
        }

    result_w_weight = cross_validate(models, X_train, y_train, X_test, cv=5,
                                     eval_func=roc_auc_score, fit_params=fit_params)

    result_wo_weight = cross_validate(models, X_train, y_train, X_test, cv=5,
                                      eval_func=roc_auc_score, fit_params={'early_stopping_rounds': 50})

    assert result_w_weight.scores[-1] != result_wo_weight.scores[-1]
Example 6: test_label_spreading_closed_form
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_label_spreading_closed_form():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200,
                               random_state=0)
    y[::3] = -1
    clf = label_propagation.LabelSpreading().fit(X, y)
    # adopting notation from Zhou et al (2004):
    S = clf._build_graph()
    Y = np.zeros((len(y), n_classes + 1))
    Y[np.arange(len(y)), y] = 1
    Y = Y[:, :-1]
    for alpha in [0.1, 0.3, 0.5, 0.7, 0.9]:
        expected = np.dot(np.linalg.inv(np.eye(len(S)) - alpha * S), Y)
        expected /= expected.sum(axis=1)[:, np.newaxis]
        clf = label_propagation.LabelSpreading(max_iter=10000, alpha=alpha)
        clf.fit(X, y)
        assert_array_almost_equal(expected, clf.label_distributions_, 4)
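For context (an editorial note, not part of the original test): in Zhou et al. (2004), the closed-form limit of the label-spreading iteration is F* = (1 - alpha) * (I - alpha * S)^{-1} Y. The scalar factor (1 - alpha) cancels under row normalization, which is why the test computes (I - alpha * S)^{-1} Y, normalizes each row to sum to one, and compares the result against the iterative solver's label_distributions_.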
Example 7: test_importances
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_importances():
    # Check variable importances.
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=1)

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)

        clf.fit(X, y)
        importances = clf.feature_importances_

        assert_equal(importances.shape[0], 10)
        assert_equal((importances[:3, np.newaxis] >= importances[3:]).all(),
                     True)
Example 8: test_importances_gini_equal_mse
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_importances_gini_equal_mse():
    # Check that gini is equivalent to mse for a binary output variable
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    # The gini index and the mean square error (variance) might differ due
    # to numerical instability. Since those instabilities mainly occur at
    # high tree depth, we restrict the maximal depth.
    clf = DecisionTreeClassifier(criterion="gini", max_depth=5,
                                 random_state=0).fit(X, y)
    reg = DecisionTreeRegressor(criterion="mse", max_depth=5,
                                random_state=0).fit(X, y)

    assert_almost_equal(clf.feature_importances_, reg.feature_importances_)
    assert_array_equal(clf.tree_.feature, reg.tree_.feature)
    assert_array_equal(clf.tree_.children_left, reg.tree_.children_left)
    assert_array_equal(clf.tree_.children_right, reg.tree_.children_right)
    assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples)
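A brief note on why this equivalence holds (an editorial addition): for a binary 0/1 target, a node whose labels have mean p has variance p(1 - p), which is what the "mse" criterion measures, while its Gini impurity is 1 - p^2 - (1 - p)^2 = 2p(1 - p). The two criteria differ only by a constant factor of 2, so every split comparison, and therefore the fitted tree structure and the normalized feature importances, comes out identical.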
Example 9: test_mean_variance_illegal_axis
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_mean_variance_illegal_axis():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_csr = sp.csr_matrix(X)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-3)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=2)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-1)

    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=-3,
                  last_mean=None, last_var=None, last_n=None)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=2,
                  last_mean=None, last_var=None, last_n=None)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=-1,
                  last_mean=None, last_var=None, last_n=None)
Example 10: test_max_features_tiebreak
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_max_features_tiebreak():
    # Test if max_features can break ties among feature importances
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)
    max_features = X.shape[1]

    feature_importances = np.array([4, 4, 4, 4, 3, 3, 3, 2, 2, 1])
    for n_features in range(1, max_features + 1):
        transformer = SelectFromModel(
            FixedImportanceEstimator(feature_importances),
            max_features=n_features,
            threshold=-np.inf)
        X_new = transformer.fit_transform(X, y)
        selected_feature_indices = np.where(transformer._get_support_mask())[0]
        assert_array_equal(selected_feature_indices, np.arange(n_features))
        assert X_new.shape[1] == n_features
Example 11: test_threshold_and_max_features
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_threshold_and_max_features():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)
    est = RandomForestClassifier(n_estimators=50, random_state=0)

    transformer1 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=-np.inf)
    X_new1 = transformer1.fit_transform(X, y)

    transformer2 = SelectFromModel(estimator=est, threshold=0.04)
    X_new2 = transformer2.fit_transform(X, y)

    transformer3 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=0.04)
    X_new3 = transformer3.fit_transform(X, y)
    assert X_new3.shape[1] == min(X_new1.shape[1], X_new2.shape[1])

    selected_indices = transformer3.transform(
        np.arange(X.shape[1])[np.newaxis, :])
    assert_allclose(X_new3, X[:, selected_indices[0]])
Example 12: test_feature_importances
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_feature_importances():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)

    est = RandomForestClassifier(n_estimators=50, random_state=0)
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        transformer = SelectFromModel(estimator=est, threshold=threshold)
        transformer.fit(X, y)
        assert hasattr(transformer.estimator_, 'feature_importances_')

        X_new = transformer.transform(X)
        assert_less(X_new.shape[1], X.shape[1])
        importances = transformer.estimator_.feature_importances_

        feature_mask = np.abs(importances) > func(importances)
        assert_array_almost_equal(X_new, X[:, feature_mask])
Example 13: test_2d_coef
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_2d_coef():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0, n_classes=4)

    est = LogisticRegression()
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        for order in [1, 2, np.inf]:
            # Fit SelectFromModel on a multi-class problem
            transformer = SelectFromModel(estimator=LogisticRegression(),
                                          threshold=threshold,
                                          norm_order=order)
            transformer.fit(X, y)
            assert hasattr(transformer.estimator_, 'coef_')
            X_new = transformer.transform(X)
            assert_less(X_new.shape[1], X.shape[1])

            # Manually check that the norm is correctly performed
            est.fit(X, y)
            importances = np.linalg.norm(est.coef_, axis=0, ord=order)
            feature_mask = importances > func(importances)
            assert_array_almost_equal(X_new, X[:, feature_mask])
Example 14: test_weight
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_weight():
    # Test class weights
    clf = svm.SVC(gamma='scale', class_weight={1: 0.1})
    # we give a small weight to class 1
    clf.fit(X, Y)
    # so all predicted values belong to class 2
    assert_array_almost_equal(clf.predict(X), [2] * 6)

    X_, y_ = make_classification(n_samples=200, n_features=10,
                                 weights=[0.833, 0.167], random_state=2)

    for clf in (linear_model.LogisticRegression(),
                svm.LinearSVC(random_state=0), svm.SVC(gamma="scale")):
        clf.set_params(class_weight={0: .1, 1: 10})
        clf.fit(X_[:100], y_[:100])
        y_pred = clf.predict(X_[100:])
        assert f1_score(y_[100:], y_pred) > .3
Example 15: test_cross_val_score_predict_groups
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_cross_val_score_predict_groups():
    # Check if ValueError (when groups is None) propagates to cross_val_score
    # and cross_val_predict
    # And also check if groups is correctly passed to the cv object
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)

    clf = SVC(kernel="linear")

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_score, estimator=clf, X=X, y=y, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_predict, estimator=clf, X=X, y=y, cv=cv)