当前位置: 首页>>代码示例>>Python>>正文


Python model_selection.StratifiedKFold类代码示例

本文整理汇总了Python中sklearn.model_selection.StratifiedKFold的典型用法代码示例。如果您正苦于以下问题:Python StratifiedKFold类的具体用法?Python StratifiedKFold怎么用?Python StratifiedKFold使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了StratifiedKFold类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: stacking_proba

def stacking_proba(clf,X_train,y,X_test,nfolds=5,random_seed=2017,return_score=False,
                   shuffle=True,metric='acc',clf_name='UnKnown'):
    """Build out-of-fold and test-set probability features for stacking.

    The classifier is fitted on each stratified training fold and its
    ``predict_proba`` output on the held-out fold is stored, so every training
    row receives a prediction from a model that did not see it. The classifier
    is then refitted on the full training set to predict ``X_test``.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict_proba``.
    X_train, y : training data and labels; indexed with integer index arrays
        (``X_train[idx, :]``, ``y[idx]``).
    X_test : data passed to ``clf.predict_proba`` after the final fit.
    nfolds : number of stratified folds.
    random_seed : seed for fold shuffling; only used when ``shuffle`` is true.
    return_score : when true, also return the mean held-out accuracy.
    shuffle : whether the folds are shuffled before splitting.
    metric, clf_name : not used by the computation; kept so existing callers
        keep working.

    Returns
    -------
    ``(train_proba, test_proba)`` or ``(train_proba, test_proba, score)``.
    For two classes only the second probability column is returned.
    """
    # random_state is meaningless without shuffling, and newer scikit-learn
    # releases raise ValueError for that combination.
    folds = StratifiedKFold(n_splits=nfolds, shuffle=shuffle,
                            random_state=random_seed if shuffle else None)
    n_classes = np.unique(y).shape[0]

    # out-of-fold probabilities for the training set
    train_stacking_proba = np.zeros((X_train.shape[0], n_classes))
    score = 0.0
    for train_index, validate_index in folds.split(X_train, y):
        clf.fit(X_train[train_index, :], y[train_index])
        fold_preds = clf.predict_proba(X_train[validate_index, :])
        train_stacking_proba[validate_index, :] = fold_preds

        # Translate the winning column into a label before comparing, so the
        # accuracy is also right when labels are not exactly 0..n_classes-1.
        fold_labels = np.argmax(fold_preds, axis=1)
        classes = getattr(clf, 'classes_', None)
        if classes is not None:
            fold_labels = np.asarray(classes)[fold_labels]
        score += float(np.mean(fold_labels == np.asarray(y[validate_index])))
    score /= nfolds

    # probabilities for the test set, from a model fitted on all training data
    clf.fit(X_train, y)
    test_stacking_proba = clf.predict_proba(X_test)

    if n_classes == 2:  # binary classification: keep only the positive class
        train_stacking_proba = train_stacking_proba[:, 1]
        test_stacking_proba = test_stacking_proba[:, 1]
    if return_score:
        return train_stacking_proba, test_stacking_proba, score
    return train_stacking_proba, test_stacking_proba
开发者ID:sunlinyu1993,项目名称:Machine-Learning-Toolbox,代码行数:33,代码来源:Ensemble.py

示例2: test_kfold_valueerrors

def test_kfold_valueerrors():
    """Check argument and input validation of KFold and StratifiedKFold."""
    X1 = np.array([[1, 2], [3, 4], [5, 6]])
    X2 = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
    # Check that errors are raised if there is not enough samples
    assert_raises(ValueError, next, KFold(4).split(X1))

    # Check that a warning is raised if the least populated class has too few
    # members.
    y = np.array([3, 3, -1, -1, 2])

    skf_3 = StratifiedKFold(3)
    assert_warns_message(Warning, "The least populated class",
                         next, skf_3.split(X2, y))

    # Check that despite the warning the folds are still computed even
    # though all the classes are not necessarily represented on each
    # side of the split at each split
    with warnings.catch_warnings():
        # catch_warnings() only saves and restores the filter state; the
        # expected warning is silenced only once an "ignore" filter is set.
        warnings.simplefilter("ignore")
        check_cv_coverage(skf_3, X2, y, labels=None, expected_n_iter=3)

    # Error when number of folds is <= 1
    assert_raises(ValueError, KFold, 0)
    assert_raises(ValueError, KFold, 1)
    assert_raises(ValueError, StratifiedKFold, 0)
    assert_raises(ValueError, StratifiedKFold, 1)

    # When n_folds is not integer:
    assert_raises(ValueError, KFold, 1.5)
    assert_raises(ValueError, KFold, 2.0)
    assert_raises(ValueError, StratifiedKFold, 1.5)
    assert_raises(ValueError, StratifiedKFold, 2.0)

    # When shuffle is not  a bool:
    assert_raises(TypeError, KFold, n_folds=4, shuffle=None)
开发者ID:absolutelyNoWarranty,项目名称:scikit-learn,代码行数:34,代码来源:test_split.py

示例3: test_grid_search_correct_score_results

def test_grid_search_correct_score_results():
    """Check that GridSearchCV reports the per-split scores of the requested scorer.

    For each scoring name the per-split scores stored in ``cv_results_`` are
    compared with scores recomputed by hand on the same stratified folds.
    """
    n_splits = 3
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for scoring in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=scoring, cv=n_splits)
        results = grid_search.fit(X, y).cv_results_

        # the result dictionary must expose the aggregate and per-split keys
        split_keys = tuple("split%d_test_score" % cv_i
                           for cv_i in range(n_splits))
        expected_keys = ("mean_test_score", "rank_test_score") + split_keys
        result_keys = list(results.keys())
        assert_true(all(in1d(expected_keys, result_keys)))

        cv = StratifiedKFold(n_splits=n_splits)
        n_splits = grid_search.n_splits_
        for candidate_i, C in enumerate(Cs):
            clf.set_params(C=C)
            reported = [
                grid_search.cv_results_['split%d_test_score' % s][candidate_i]
                for s in range(n_splits)
            ]
            cv_scores = np.array(reported)
            for fold_i, (train_idx, test_idx) in enumerate(cv.split(X, y)):
                clf.fit(X[train_idx], y[train_idx])
                if scoring == "f1":
                    predicted = clf.predict(X[test_idx])
                    correct_score = f1_score(y[test_idx], predicted)
                elif scoring == "roc_auc":
                    decision = clf.decision_function(X[test_idx])
                    correct_score = roc_auc_score(y[test_idx], decision)
                assert_almost_equal(correct_score, cv_scores[fold_i])
开发者ID:YinongLong,项目名称:scikit-learn,代码行数:33,代码来源:test_search.py

示例4: classify

def classify(X,y, clf,**para):
    """Fit ``clf(**para)`` on stratified folds and print each fold's accuracy.

    Parameters
    ----------
    X, y : samples and labels; indexed with integer index arrays.
    clf : classifier class (or factory) called as ``clf(**para)``.
    **para : keyword arguments forwarded to ``clf``.

    Returns
    -------
    ``(classifier, y_test, y_pred)`` where ``classifier`` is the instance as
    fitted on the last fold, and ``y_test`` / ``y_pred`` are the true and
    predicted labels of that last fold.
    """
    skf = StratifiedKFold(n_splits=6)
    # Report the fold count that is really used (the message used to claim 10).
    n_splits = skf.get_n_splits()

    classifier = clf(**para)
    name = str(classifier).split("(")[0]
    print("{0} has been established with {1}".format(name, para))

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        print("{0}-fold Score is: {1}".format(n_splits, score))

    return classifier,y_test, y_pred
开发者ID:wislish,项目名称:Python-Data-Analysis,代码行数:26,代码来源:userClassify.py

示例5: cv_score

def cv_score(X, y, n_epochs = 10, n_folds=10, random_state=1999):
    """Train one KerasWrapper per fold, one epoch at a time, and record scores.

    Each fold keeps its own model; on every outer iteration each model is
    trained for one more epoch on its training part and evaluated with
    ``log_loss`` on its held-out part.

    Parameters
    ----------
    X, y : samples and labels; indexed with integer index arrays.
    n_epochs : number of single-epoch training rounds per fold.
    n_folds : number of stratified folds.
    random_state : seed for the shuffled fold assignment.

    Returns
    -------
    ``(scores, val_scores, best_epochs)``: two ``(n_folds, n_epochs)`` arrays
    holding the score returned by ``fit`` and the held-out log loss, plus an
    ``n_folds`` array with the last ``num_epoch`` value returned by ``fit``.
    """
    kf = StratifiedKFold(n_folds, shuffle=True, random_state=random_state)
    scores = np.zeros((n_folds, n_epochs))
    val_scores = np.zeros((n_folds, n_epochs))
    best_epochs = np.zeros(n_folds)
    clfs = [KerasWrapper(num_features=X.shape[1], label='keras_{}'.format(i)) for i in range(n_folds)]
    # Stratify on the labels passed in (this used to read an undefined
    # ``y_train``), and materialise the folds so every epoch reuses them.
    kfsplit = list(kf.split(X, y))
    for i in range(n_epochs):
        print('=============Epoch {}================'.format(i))
        for i_fold, (itrain, itest) in enumerate(kfsplit):
            print('Fold ', i_fold)
            train = X[itrain,:]
            test = X[itest,:]
            ytrain, ytest = y[itrain], y[itest]
            clf, score, num_epoch = clfs[i_fold].fit(train, ytrain, nb_epoch=1,
                                               validation_split=None, batch_size=64,
                                               patience=1)

            print('score: {}'.format(score))
            scores[i_fold, i] = score
            best_epochs[i_fold] = num_epoch

            # predict on the held-out part of this fold
            pred = clf.predict_proba(test)
            val_score = log_loss(ytest, pred)
            print('Validation score: ', val_score)
            val_scores[i_fold, i] = val_score
    return scores, val_scores, best_epochs
开发者ID:canzheng,项目名称:kaggle-talkingdata,代码行数:32,代码来源:nnet.py

示例6: split_data

    def split_data(self, X, y, stratified = True, bad_chess = False):
        """Write one ``data_<i>.csv`` / ``class_<i>.csv`` pair per node.

        With ``bad_chess`` the rows are cut into ``self.nodes`` consecutive
        slices of equal length (rows left over after the integer division are
        not written). Otherwise the held-out part of each fold of a
        ``self.nodes``-way split is written: stratified when ``stratified``
        is true, else a shuffled KFold with a fixed seed.
        Files are created under the module-level ``datas_path``.
        """
        if not bad_chess:
            if stratified:
                splitter = StratifiedKFold(n_splits = self.nodes)
            else:
                splitter = KFold(n_splits = self.nodes, shuffle = True, random_state = 17)
            for node, (_, held_out) in enumerate(splitter.split(X, y)):
                part_X = pd.DataFrame(X[held_out])
                part_y = pd.DataFrame(y[held_out])

                part_X.to_csv(datas_path.joinpath("data_" + str(node) + ".csv"),
                              index = False)
                part_y.to_csv(datas_path.joinpath("class_" + str(node) + ".csv"),
                              index = False)
            return

        rows_per_node = int(X.shape[0] / self.nodes)
        for node in range(self.nodes):
            lo = node * rows_per_node
            hi = lo + rows_per_node
            part_X = pd.DataFrame(X[lo:hi])
            part_y = pd.DataFrame(y[lo:hi])

            part_X.to_csv(datas_path.joinpath('data_' + str(node) + '.csv'),
                          index = False)
            part_y.to_csv(datas_path.joinpath('class_' + str(node) + '.csv'),
                          index = False)
开发者ID:caiodadauto,项目名称:Distributed-SVM,代码行数:31,代码来源:Network.py

示例7: cv

def cv(X_train, y_train):
    """Run a shuffled 5-fold stratified CV of TargetEnsembler and print the scores.

    For every fold a fresh ``TargetEnsembler(features)`` is fitted and its F1,
    precision and recall on the held-out rows are printed; the means over all
    folds are printed at the end. Nothing is returned.

    ``X_train`` and ``y_train`` are accessed through ``.values`` (and
    ``X_train.columns``); the target frame is rebuilt with the single column
    name ``"PCL_Strict3"``.
    """
    splitter = StratifiedKFold(n_splits=5, shuffle=True)
    f1_per_fold, precision_per_fold, recall_per_fold = [], [], []

    for fit_idx, eval_idx in splitter.split(X_train, y_train):
        model = TargetEnsembler(features)

        # rebuild frames so each fold gets a fresh 0..n-1 index
        X_fit = pd.DataFrame(X_train.values[fit_idx], columns=X_train.columns)
        y_fit = pd.DataFrame(y_train.values[fit_idx], columns=["PCL_Strict3"])
        X_eval = pd.DataFrame(X_train.values[eval_idx], columns=X_train.columns)
        y_eval = pd.DataFrame(y_train.values[eval_idx], columns=["PCL_Strict3"])

        model.fit(X_fit, y_fit)
        predicted = model.predict(X_eval)

        fold_f1 = f1_score(y_eval, predicted)
        fold_precision = precision_score(y_eval, predicted)
        fold_recall = recall_score(y_eval, predicted)

        print("\tscores f1", fold_f1)
        print("\tscores p", fold_precision)
        print("\tscores r", fold_recall)

        f1_per_fold.append(fold_f1)
        precision_per_fold.append(fold_precision)
        recall_per_fold.append(fold_recall)

    print("mean scores f1", np.mean(f1_per_fold))
    print("mean scores p", np.mean(precision_per_fold))
    print("mean scores r", np.mean(recall_per_fold))
开发者ID:nogur9,项目名称:PTSD,代码行数:33,代码来源:ensembler_single_features.py

示例8: get_cv_results

def get_cv_results(design, data, cv_splits=10):
  """Score every row of ``design`` by repeated stratified cross-validation.

  Parameters
  ----------
  design : 2-D array; each row is unpacked as ``(lambda_int, lambda_x)``.
  data : tuple ``(test_df, unit_onehot, unit_x)``. ``test_df`` provides the
      ``"x"``, ``"y"`` and ``"unit"`` columns; folds are stratified on
      ``"unit"``.
  cv_splits : number of folds per repetition (default 10).

  Returns
  -------
  list with one value per design row: the mean over 3 reshuffled repetitions
  of the mean over folds of the minimum ``val_loss`` seen during training.

  Uses the module-level ``create_model`` and ``callbacks``.
  """
  test_df, unit_onehot, unit_x = data
  cv_results = []
  for i in range(design.shape[0]):
    lambda_int, lambda_x = design[i, :]
    val_losses = []
    for _ in range(3):  # almost like a bootstrap: reshuffle the folds each time
      cv_val_losses = []
      # Honour the cv_splits argument (it used to be ignored in favour of 10).
      skf = StratifiedKFold(n_splits=cv_splits, shuffle=True)
      for train_index, test_index in skf.split(unit_x, test_df['unit']):
        re_model = create_model(unit_onehot.shape[1], lambda_int, lambda_x,
                                .01, .0001, .92)

        # NOTE(review): test_df[...][idx] selects by label, while the fold
        # indices are positions - this assumes a default 0..n-1 index; confirm.
        X_train = [test_df["x"][train_index], unit_onehot[train_index],
                   unit_x[train_index]]
        X_test = [test_df["x"][test_index], unit_onehot[test_index],
                  unit_x[test_index]]

        y_train, y_test = test_df["y"][train_index], test_df["y"][test_index]
        h = re_model.fit(X_train, y_train,
                         epochs = 15000, batch_size = 450,
                         validation_data = (X_test, y_test),
                         callbacks = callbacks, verbose = 0)
        cv_val_losses.append(np.min(h.history['val_loss']))

      val_losses.append(np.mean(cv_val_losses))
    cv_results.append(np.mean(val_losses))
  return cv_results
开发者ID:baogorek,项目名称:Miscellaneous,代码行数:29,代码来源:utils.py

示例9: __init__

 def __init__(self, fm_decoder, n_iter=5, n_folds=3,
              random_state=None):
     """Store the decoder and initialise the StratifiedKFold base.

     ``n_folds`` and ``random_state`` are forwarded to
     ``StratifiedKFold.__init__``.
     """
     # NOTE(review): n_iter is accepted but not stored or forwarded here.
     self.fm_decoder = fm_decoder
     base_kwargs = dict(n_folds=n_folds, random_state=random_state)
     StratifiedKFold.__init__(self, **base_kwargs)
开发者ID:arthurmensch,项目名称:scikit-learn-sandbox,代码行数:7,代码来源:split.py

示例10: stratified_cross_validate

    def stratified_cross_validate(self, k):
        """Yield ``(X_train, y_train, X_test, y_test)`` cross-validation splits.

        The stored training and testing sets are concatenated and shuffled in
        place with ``np.random.shuffle``. The generator then yields the folds
        of a 2-way ``StratifiedKFold`` followed by ``k`` consecutive,
        equally-sized (non-stratified) folds of the same shuffled data.

        Parameters
        ----------
        k : number of consecutive folds yielded after the stratified ones.
        """
        attributes = np.append(self.training_attributes, self.testing_attributes, axis=0)
        labels = np.append(self.training_labels, self.testing_labels, axis=0)

        # one row per sample: attributes followed by the label in the last column
        all_data = np.array([np.append(attributes[i], labels[i]) for i in range(len(attributes))])

        np.random.shuffle(all_data)

        X = all_data[:, :-1]
        y = all_data[:, -1]
        print(X.shape, y.shape)
        # NOTE(review): the number of stratified folds is fixed at 2 and does
        # not follow k - confirm whether that is intended.
        skf = StratifiedKFold(n_splits=2)
        print(skf.get_n_splits(X, y))
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            yield (X_train, y_train, X_test, y_test)

        # NOTE(review): the plain k-fold splits below are yielded in addition
        # to the stratified ones above; kept so existing consumers see the
        # same sequence.
        for i in range(k):
            # Floor division keeps the fold size an integer on Python 3, where
            # "/" returns a float that cannot be used for slicing.
            split = len(all_data) // k
            start, stop = i * split, (i + 1) * split

            test_data = all_data[start:stop, :]
            train_data = np.delete(all_data, np.arange(start, stop), axis=0)

            train_input, train_output = train_data[:, :-1], train_data[:, -1]
            test_input, test_output = test_data[:, :-1], test_data[:, -1]

            yield (train_input, train_output, test_input, test_output)
开发者ID:Piggelinus,项目名称:Project,代码行数:35,代码来源:reader.py

示例11: test_datasets

def test_datasets(dataset_names):
    """Report 2-fold stratified SVC accuracy for each named dataset.

    Parameters
    ----------
    dataset_names : passed to ``Data(dataset_names=...)``.

    Returns
    -------
    dict mapping each dataset name to an array with one accuracy (in percent)
    per fold. Each fold's accuracy is also printed.
    """
    from sklearn.svm import SVC
    data = Data(dataset_names=dataset_names)

    n_folds = 2
    accuracies = {}
    for name, dataset in data.datasets.items():
        dataset.print_summary()
        # model_selection API: the splitter is configured with n_splits and
        # receives the data in split(); the old constructor that took the
        # labels (and the test_folds attribute) cannot be combined with split().
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True)
        accuracies[name] = np.zeros(n_folds)
        folds = skf.split(X=dataset.data, y=dataset.target)
        for test_fold, (train_idx, test_idx) in enumerate(folds):
            x_train, y_train = dataset.data[train_idx], dataset.target[train_idx]
            x_test, y_test = dataset.data[test_idx], dataset.target[test_idx]

            svc = SVC(C=1.0, kernel='rbf', degree=1, tol=0.01)
            svc.fit(x_train, y_train)
            prediction = svc.predict(x_test)
            accuracies[name][test_fold] = 100*np.mean((prediction == y_test))
            print("Acc = {0:.2f}%".format(accuracies[name][test_fold]))
    return accuracies
开发者ID:perellonieto,项目名称:PyDatasets,代码行数:31,代码来源:datasets.py

示例12: split

def split(dependent, independent, n_folds):
  """Yield ``(train_x, train_y, test_x, test_y)`` for each stratified fold.

  Parameters
  ----------
  dependent : samples; passed as ``X`` to the splitter.
  independent : labels; passed as ``y`` and used for stratification.
  n_folds : number of folds.

  Both inputs are indexed with integer index arrays.
  """
  # The folds are not shuffled, so a random_state has no effect on them;
  # newer scikit-learn releases raise ValueError when one is passed without
  # shuffle=True. Omitting it keeps the folds exactly as before.
  skf = StratifiedKFold(n_splits=n_folds)
  for train_indices, test_indices in skf.split(dependent, independent):
    train_x = dependent[train_indices]
    train_y = independent[train_indices]
    test_x = dependent[test_indices]
    test_y = independent[test_indices]
    yield train_x, train_y, test_x, test_y
开发者ID:ai-se,项目名称:citemap,代码行数:8,代码来源:predict.py

示例13: test_ovr_multinomial_iris

def test_ovr_multinomial_iris():
    """Compare OvR and multinomial LogisticRegressionCV on the iris data."""
    train, target = iris.data, iris.target
    n_samples, n_features = train.shape

    # Both OvR models share the same folds, computed from the original
    # three-class target, i.e. before classes 0 and 1 are merged.
    n_cv = 2
    precomputed_folds = list(StratifiedKFold(n_cv).split(train, target))

    # attribute shapes expected from every fitted model below
    paths_shape = (3, n_cv, 10, n_features + 1)
    scores_shape = (3, n_cv, 10)

    # clf sees the original target with classes 0 and 1 kept apart
    clf = LogisticRegressionCV(cv=precomputed_folds)
    clf.fit(train, target)

    # clf1 sees a copy of the target in which class 0 is folded into class 1
    clf1 = LogisticRegressionCV(cv=precomputed_folds)
    merged_target = target.copy()
    merged_target[merged_target == 0] = 1
    clf1.fit(train, merged_target)

    # What OvR learns for class 2 must not depend on whether classes 0 and 1
    # are separated
    assert_array_almost_equal(clf.scores_[2], clf1.scores_[2])
    assert_array_almost_equal(clf.intercept_[2:], clf1.intercept_)
    assert_array_almost_equal(clf.coef_[2][np.newaxis, :], clf1.coef_)

    # shapes of the fitted attributes
    assert_equal(clf.coef_.shape, (3, n_features))
    assert_array_equal(clf.classes_, [0, 1, 2])
    coefs_paths = np.asarray(list(clf.coefs_paths_.values()))
    assert_array_almost_equal(coefs_paths.shape, paths_shape)
    assert_equal(clf.Cs_.shape, (10,))
    scores = np.asarray(list(clf.scores_.values()))
    assert_equal(scores.shape, scores_shape)

    # On iris the multinomial model must reach a better accuracy than OvR
    for solver in ['lbfgs', 'newton-cg', 'sag', 'saga']:
        is_sag_family = solver in ['sag', 'saga']
        max_iter = 2000 if is_sag_family else 15
        tol = 1e-5 if is_sag_family else 1e-2
        clf_multi = LogisticRegressionCV(
            solver=solver, multi_class='multinomial', max_iter=max_iter,
            random_state=42, tol=tol,
            cv=2)
        clf_multi.fit(train, target)
        multi_score = clf_multi.score(train, target)
        ovr_score = clf.score(train, target)
        assert_greater(multi_score, ovr_score)

        # attributes of the multinomial LogisticRegressionCV
        assert_equal(clf.coef_.shape, clf_multi.coef_.shape)
        assert_array_equal(clf_multi.classes_, [0, 1, 2])
        coefs_paths = np.asarray(list(clf_multi.coefs_paths_.values()))
        assert_array_almost_equal(coefs_paths.shape, paths_shape)
        assert_equal(clf_multi.Cs_.shape, (10,))
        scores = np.asarray(list(clf_multi.scores_.values()))
        assert_equal(scores.shape, scores_shape)
开发者ID:huafengw,项目名称:scikit-learn,代码行数:58,代码来源:test_logistic.py

示例14: gen_folds

def gen_folds(X, y, n_folds=5, random_state=0):
    """Return the shuffled stratified folds of ``(X, y)`` as a list.

    Each element is a ``(train_indices, test_indices)`` pair as produced by
    ``StratifiedKFold.split``; materialising them lets callers iterate over
    the same folds more than once.
    """
    from sklearn.model_selection import StratifiedKFold

    splitter = StratifiedKFold(n_folds, shuffle=True, random_state=random_state)
    return list(splitter.split(X, y))
开发者ID:canzheng,项目名称:kaggle-talkingdata,代码行数:9,代码来源:model_info.py

示例15: categorical_average

def categorical_average(variable, y, pred_0, feature_name):
    """Add a smoothed target-mean encoding of ``variable`` as ``feature_name``.

    The new column is written in place into the module-level ``X_train`` and
    ``X_test`` frames. Training rows are encoded out-of-fold (5 stratified
    folds on ``X_train['interest_level']``) so a row never contributes to its
    own encoding; test rows are encoded from the whole training set.

    Parameters
    ----------
    variable : name of the categorical column to encode.
    y : name of the target column.
    pred_0 : name of the column holding the prior the category mean is
        blended with.
    feature_name : name of the column to create.

    Also reads the module-level ``lambda_val``, ``g``, ``k``, ``f``, ``r_k``
    and ``exp``. Returns ``None``.
    """
    def compute_beta(cnt):
        # Weight of the prior for a category seen ``cnt`` times. Counts of 200
        # or more are treated as infinite, which makes the weight zero.
        cnt = cnt if cnt < 200 else float('inf')
        return 1.0 / (g + exp((cnt - k) / f))

    def calculate_average(sub1, sub2):
        # Per-category sum / mean / count of the target on sub1, computed in
        # a single groupby pass.
        s = (sub1.groupby(variable)['y']
                 .agg(['sum', 'mean', 'count'])
                 .rename(columns = {'sum': 'sumy', 'mean': 'avgY', 'count': 'cnt'})
                 .reset_index())

        tmp = sub2.merge(s, how='left', on=variable)
        # Categories absent from sub1: take the mask once so that both columns
        # are filled (the second fill used to see an already-cleaned 'cnt'
        # and therefore never matched).
        unseen = pd.isnull(tmp['cnt'])
        tmp.loc[unseen, 'cnt'] = 0.0
        tmp.loc[unseen, 'sumy'] = 0.0

        if lambda_val is not None:
            tmp['beta'] = lambda_val
        else:
            tmp['beta'] = tmp['cnt'].map(compute_beta)

        # blend the category mean with the prior
        tmp['adj_avg'] = (1.0 - tmp['beta']) * tmp['avgY'] + tmp['beta'] * tmp['pred_0']

        # unseen categories have no mean: fall back to the prior
        tmp['avgY'] = tmp['avgY'].fillna(tmp['pred_0'])
        tmp['adj_avg'] = tmp['adj_avg'].fillna(tmp['pred_0'])

        # multiplicative noise in [1 - r_k / 2, 1 + r_k / 2)
        tmp['random'] = np.random.uniform(size = len(tmp))
        tmp['adj_avg'] = tmp['adj_avg'] * (1 + (tmp['random'] - 0.5) * r_k)

        return tmp['adj_avg'].values

    train_sub = pd.DataFrame(data = {variable: X_train[variable],
                                     'y': X_train[y],
                                     'pred_0': X_train[pred_0]})

    # cv for training set
    k_fold = StratifiedKFold(5)
    # float placeholder, so the encoded values do not force a dtype change
    X_train[feature_name] = -999.0
    for (train_index, cv_index) in k_fold.split(np.zeros(len(X_train)),
                                                X_train['interest_level'].values):
        sub1 = train_sub.iloc[train_index]
        sub2 = train_sub.iloc[cv_index]

        # The fold indices are positions; translate them to labels so that
        # .loc is also correct when X_train does not have a 0..n-1 index.
        X_train.loc[X_train.index[cv_index], feature_name] = calculate_average(sub1, sub2)

    # for test set
    # NOTE(review): this reads X_test[y], i.e. it expects the target column
    # to exist in X_test as well - confirm.
    test_sub = pd.DataFrame(data = {variable: X_test[variable],
                                    'y': X_test[y],
                                    'pred_0': X_test[pred_0]})
    X_test.loc[:, feature_name] = calculate_average(train_sub, test_sub)
开发者ID:Paliking,项目名称:ML_examples,代码行数:56,代码来源:LtIsLit_XGB.py


注:本文中的sklearn.model_selection.StratifiedKFold类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。