This article collects and organizes typical usage examples of the sklearn.datasets.make_classification method in Python. If you have been wondering what datasets.make_classification does, how to call it, or where to find working examples, the curated code samples below should help. You can also explore the other methods available in the sklearn.datasets module.
Below are 15 code examples of the datasets.make_classification method, sorted by popularity by default.
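Before diving into the examples, here is a minimal, self-contained sketch of the basic API (written for this overview rather than taken from any of the projects below), showing a typical call and the shapes it returns:

from sklearn.datasets import make_classification

# Generate a toy binary problem: 100 samples, 20 features, 5 of them
# informative; fixing random_state makes the draw reproducible.
X, y = make_classification(n_samples=100, n_features=20, n_informative=5,
                           n_classes=2, random_state=42)

print(X.shape)  # (100, 20) -- feature matrix
print(y.shape)  # (100,)    -- integer class labels in {0, 1}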
Example 1: test_plot_estimator_and_lightgbm
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_plot_estimator_and_lightgbm(tmpdir):
    pytest.importorskip('graphviz')
    lightgbm = pytest.importorskip('lightgbm')
    from pygbm.plotting import plot_tree

    n_classes = 3
    X, y = make_classification(n_samples=150, n_classes=n_classes,
                               n_features=5, n_informative=3, n_redundant=0,
                               random_state=0)

    n_trees = 3
    est_pygbm = GradientBoostingClassifier(max_iter=n_trees,
                                           n_iter_no_change=None)
    est_pygbm.fit(X, y)
    est_lightgbm = lightgbm.LGBMClassifier(n_estimators=n_trees)
    est_lightgbm.fit(X, y)

    n_total_trees = n_trees * n_classes
    for i in range(n_total_trees):
        filename = tmpdir.join('plot_mixed_predictors.pdf')
        plot_tree(est_pygbm, est_lightgbm=est_lightgbm, tree_index=i,
                  view=False, filename=filename)
        assert filename.exists()
Example 2: make_classification_df
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def make_classification_df(n_samples: int = 1024,
                           n_num_features: int = 20,
                           n_cat_features: int = 0,
                           class_sep: float = 1.0,
                           n_classes: int = 2,
                           feature_name: str = 'col_{}',
                           target_name: str = 'target',
                           random_state: int = 0,
                           id_column: str = None) -> Tuple[pd.DataFrame, pd.Series]:
    np.random.seed(random_state)
    X, y = make_classification(n_samples=n_samples, n_features=n_num_features, class_sep=class_sep,
                               random_state=random_state, n_classes=n_classes, n_informative=max(n_classes, 2))

    X = pd.DataFrame(X, columns=[feature_name.format(i) for i in range(n_num_features)])
    y = pd.Series(y, name=target_name)

    if id_column is not None:
        X[id_column] = range(n_samples)

    for i in range(n_cat_features):
        X['cat_{}'.format(i)] = \
            pd.Series(np.random.choice(['A', 'B', None], size=n_samples)).astype('category')

    return X, y
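A hypothetical call to the helper defined above (this usage snippet is an editorial addition, not part of the original source) might look like:

X, y = make_classification_df(n_samples=256, n_num_features=5,
                              n_cat_features=2, id_column='id')
# Numeric columns come first, then the id column, then the categoricals:
print(list(X.columns))  # ['col_0', 'col_1', 'col_2', 'col_3', 'col_4',
                        #  'id', 'cat_0', 'cat_1']
print(y.name)           # 'target'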
Example 3: test_cv_lgbm
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_cv_lgbm():
    X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    models = [LGBMClassifier(n_estimators=300) for _ in range(5)]

    pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
                                                             eval_func=roc_auc_score,
                                                             fit_params={'early_stopping_rounds': 200})

    print(scores)
    assert len(scores) == 5 + 1
    assert scores[-1] >= 0.85  # overall roc_auc
    assert roc_auc_score(y_train, pred_oof) == scores[-1]
    assert roc_auc_score(y_test, pred_test) >= 0.85  # test roc_auc
    assert roc_auc_score(y, models[0].predict_proba(X)[:, 1]) >= 0.85  # make sure models are trained
    assert len(importance) == 5
    assert list(importance[0].columns) == ['feature', 'importance']
    assert len(importance[0]) == 20
Example 4: test_cv_partial_evaluate
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_cv_partial_evaluate():
    X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    model = RidgeClassifier(alpha=1.0)

    n = 0

    def _fold_count(*args):
        nonlocal n
        n += 1

    cv = Take(2, KFold(5))

    pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=cv, eval_func=roc_auc_score,
                                                    on_each_fold=_fold_count)

    assert len(scores) == 2 + 1
    assert scores[-1] >= 0.8  # overall auc
    assert n == 2
Example 5: test_fit_params_callback
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_fit_params_callback():
    X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    models = [LGBMClassifier(n_estimators=300) for _ in range(5)]

    sample_weights = np.random.randint(1, 10, size=len(X_train))
    sample_weights = sample_weights / sample_weights.sum()

    def fit_params(n: int, train_index: List[int], valid_index: List[int]):
        return {
            'early_stopping_rounds': 100,
            'sample_weight': list(sample_weights[train_index]),
            'eval_sample_weight': [list(sample_weights[valid_index])]
        }

    result_w_weight = cross_validate(models, X_train, y_train, X_test, cv=5,
                                     eval_func=roc_auc_score, fit_params=fit_params)

    result_wo_weight = cross_validate(models, X_train, y_train, X_test, cv=5,
                                      eval_func=roc_auc_score, fit_params={'early_stopping_rounds': 50})

    assert result_w_weight.scores[-1] != result_wo_weight.scores[-1]
Example 6: test_label_spreading_closed_form
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_label_spreading_closed_form():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200,
                               random_state=0)
    y[::3] = -1
    clf = label_propagation.LabelSpreading().fit(X, y)
    # adopting notation from Zhou et al (2004):
    S = clf._build_graph()
    Y = np.zeros((len(y), n_classes + 1))
    Y[np.arange(len(y)), y] = 1
    Y = Y[:, :-1]
    for alpha in [0.1, 0.3, 0.5, 0.7, 0.9]:
        expected = np.dot(np.linalg.inv(np.eye(len(S)) - alpha * S), Y)
        expected /= expected.sum(axis=1)[:, np.newaxis]
        clf = label_propagation.LabelSpreading(max_iter=10000, alpha=alpha)
        clf.fit(X, y)
        assert_array_almost_equal(expected, clf.label_distributions_, 4)
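For context (an editorial note, not part of the original test): in Zhou et al. (2004), the closed-form limit of the label-spreading iteration is F* = (1 - alpha) * (I - alpha * S)^{-1} Y. The scalar factor (1 - alpha) cancels under row normalization, which is why the test computes (I - alpha * S)^{-1} Y, normalizes each row to sum to one, and compares the result against the iterative solver's label_distributions_.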
Example 7: test_importances
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_importances():
    # Check variable importances.
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=1)

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)

        clf.fit(X, y)
        importances = clf.feature_importances_

        assert_equal(importances.shape[0], 10)
        assert_equal((importances[:3, np.newaxis] >= importances[3:]).all(),
                     True)
Example 8: test_importances_gini_equal_mse
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_importances_gini_equal_mse():
    # Check that gini is equivalent to mse for a binary output variable
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    # The gini index and the mean square error (variance) might differ due
    # to numerical instability. Since those instabilities mainly occur at
    # high tree depth, we restrict the maximal depth.
    clf = DecisionTreeClassifier(criterion="gini", max_depth=5,
                                 random_state=0).fit(X, y)
    reg = DecisionTreeRegressor(criterion="mse", max_depth=5,
                                random_state=0).fit(X, y)

    assert_almost_equal(clf.feature_importances_, reg.feature_importances_)
    assert_array_equal(clf.tree_.feature, reg.tree_.feature)
    assert_array_equal(clf.tree_.children_left, reg.tree_.children_left)
    assert_array_equal(clf.tree_.children_right, reg.tree_.children_right)
    assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples)
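A brief note on why this equivalence holds (an editorial addition): for a binary 0/1 target, a node whose labels have mean p has variance p(1 - p), which is what the "mse" criterion measures, while its Gini impurity is 1 - p^2 - (1 - p)^2 = 2p(1 - p). The two criteria differ only by a constant factor of 2, so every split comparison, and therefore the fitted tree structure and the normalized feature importances, comes out identical.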
Example 9: test_mean_variance_illegal_axis
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_mean_variance_illegal_axis():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_csr = sp.csr_matrix(X)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-3)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=2)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-1)

    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=-3,
                  last_mean=None, last_var=None, last_n=None)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=2,
                  last_mean=None, last_var=None, last_n=None)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=-1,
                  last_mean=None, last_var=None, last_n=None)
Example 10: test_max_features_tiebreak
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_max_features_tiebreak():
    # Test if max_features can break ties among feature importances
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)
    max_features = X.shape[1]

    feature_importances = np.array([4, 4, 4, 4, 3, 3, 3, 2, 2, 1])
    for n_features in range(1, max_features + 1):
        transformer = SelectFromModel(
            FixedImportanceEstimator(feature_importances),
            max_features=n_features,
            threshold=-np.inf)
        X_new = transformer.fit_transform(X, y)
        selected_feature_indices = np.where(transformer._get_support_mask())[0]
        assert_array_equal(selected_feature_indices, np.arange(n_features))
        assert X_new.shape[1] == n_features
Example 11: test_threshold_and_max_features
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_threshold_and_max_features():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)
    est = RandomForestClassifier(n_estimators=50, random_state=0)

    transformer1 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=-np.inf)
    X_new1 = transformer1.fit_transform(X, y)

    transformer2 = SelectFromModel(estimator=est, threshold=0.04)
    X_new2 = transformer2.fit_transform(X, y)

    transformer3 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=0.04)
    X_new3 = transformer3.fit_transform(X, y)
    assert X_new3.shape[1] == min(X_new1.shape[1], X_new2.shape[1])

    selected_indices = transformer3.transform(
        np.arange(X.shape[1])[np.newaxis, :])
    assert_allclose(X_new3, X[:, selected_indices[0]])
Example 12: test_feature_importances
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_feature_importances():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)

    est = RandomForestClassifier(n_estimators=50, random_state=0)
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        transformer = SelectFromModel(estimator=est, threshold=threshold)
        transformer.fit(X, y)
        assert hasattr(transformer.estimator_, 'feature_importances_')

        X_new = transformer.transform(X)
        assert_less(X_new.shape[1], X.shape[1])
        importances = transformer.estimator_.feature_importances_

        feature_mask = np.abs(importances) > func(importances)
        assert_array_almost_equal(X_new, X[:, feature_mask])
Example 13: test_2d_coef
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_2d_coef():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0, n_classes=4)

    est = LogisticRegression()
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        for order in [1, 2, np.inf]:
            # Fit SelectFromModel on a multi-class problem
            transformer = SelectFromModel(estimator=LogisticRegression(),
                                          threshold=threshold,
                                          norm_order=order)
            transformer.fit(X, y)
            assert hasattr(transformer.estimator_, 'coef_')
            X_new = transformer.transform(X)
            assert_less(X_new.shape[1], X.shape[1])

            # Manually check that the norm is correctly performed
            est.fit(X, y)
            importances = np.linalg.norm(est.coef_, axis=0, ord=order)
            feature_mask = importances > func(importances)
            assert_array_almost_equal(X_new, X[:, feature_mask])
Example 14: test_weight
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_weight():
    # Test class weights
    clf = svm.SVC(gamma='scale', class_weight={1: 0.1})
    # we give a small weight to class 1
    clf.fit(X, Y)
    # so all predicted values belong to class 2
    assert_array_almost_equal(clf.predict(X), [2] * 6)

    X_, y_ = make_classification(n_samples=200, n_features=10,
                                 weights=[0.833, 0.167], random_state=2)

    for clf in (linear_model.LogisticRegression(),
                svm.LinearSVC(random_state=0), svm.SVC(gamma="scale")):
        clf.set_params(class_weight={0: .1, 1: 10})
        clf.fit(X_[:100], y_[:100])
        y_pred = clf.predict(X_[100:])
        assert f1_score(y_[100:], y_pred) > .3
Example 15: test_cross_val_score_predict_groups
# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import make_classification [as alias]
def test_cross_val_score_predict_groups():
    # Check if ValueError (when groups is None) propagates to cross_val_score
    # and cross_val_predict
    # And also check if groups is correctly passed to the cv object
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)

    clf = SVC(kernel="linear")

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_score, estimator=clf, X=X, y=y, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_predict, estimator=clf, X=X, y=y, cv=cv)