

Python StratifiedKFold.get_n_splits Method Code Examples

This article collects and organizes typical usage examples of the Python method sklearn.model_selection.StratifiedKFold.get_n_splits. If you are struggling with questions such as: What exactly does StratifiedKFold.get_n_splits do? How is it used? What do real examples of it look like? Then the curated code examples below may help. You can also explore further usage examples of the class this method belongs to, sklearn.model_selection.StratifiedKFold.


A total of 11 code examples of StratifiedKFold.get_n_splits are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
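Before turning to the project-specific examples, here is a minimal, self-contained sketch (using made-up toy data) of what the method does: get_n_splits simply returns the number of folds configured on the cross-validator; the X and y arguments are accepted only for API compatibility, and the actual fold indices are produced by split().

import numpy as np
from sklearn.model_selection import StratifiedKFold

# Toy data: 6 samples, 2 balanced classes (illustrative values only)
X = np.arange(12).reshape(6, 2)
y = np.array([0, 0, 0, 1, 1, 1])

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
print(skf.get_n_splits(X, y))  # prints 3: the configured number of folds

# The fold indices themselves come from split(), not get_n_splits()
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    print(fold, train_idx, test_idx)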

Example 1: stacking_proba

# Required import: from sklearn.model_selection import StratifiedKFold [as alias]
# Or: from sklearn.model_selection.StratifiedKFold import get_n_splits [as alias]
def stacking_proba(clf,X_train,y,X_test,nfolds=5,random_seed=2017,return_score=False,
                   shuffle=True,metric='acc',clf_name='UnKnown'):
    folds = StratifiedKFold(n_splits=nfolds, shuffle=shuffle, random_state=random_seed)
    folds.get_n_splits(X_train,y)
    #return stacking_proba for train set
    train_stacking_proba=np.zeros((X_train.shape[0],np.unique(y).shape[0]))
    score=0
    for i,(train_index, validate_index) in enumerate(folds.split(X_train, y)):
        # print(str(clf_name)+" folds:"+str(i+1)+"/"+str(nfolds))
        X_train_fold=X_train[train_index,:]
        y_train_fold=y[train_index]
        X_validate_fold=X_train[validate_index,:]
        y_validate_fold=y[validate_index]
        clf.fit(X_train_fold,y_train_fold)
        fold_preds=clf.predict_proba(X_validate_fold)
        train_stacking_proba[validate_index,:]=fold_preds
        #validation
        fold_preds_a = np.argmax(fold_preds, axis=1)
        fold_score=len(np.nonzero(y_validate_fold - fold_preds_a == 0)[0]) / len(y_validate_fold)
        # print('validate '+metric+":"+str(fold_score))
        score+=fold_score
    score/=nfolds
    #return stacking_proba for test set
    clf.fit(X_train,y)
    test_stacking_proba=clf.predict_proba(X_test)

    if np.unique(y).shape[0] == 2: # when binary classification only return positive class proba
        train_stacking_proba=train_stacking_proba[:,1]
        test_stacking_proba=test_stacking_proba[:,1]
    if return_score:
        return train_stacking_proba,test_stacking_proba,score
    else:
        return train_stacking_proba,test_stacking_proba
Developer: sunlinyu1993, Project: Machine-Learning-Toolbox, Lines of code: 35, Source file: Ensemble.py

Example 2: _get_fold_generator

# Required import: from sklearn.model_selection import StratifiedKFold [as alias]
# Or: from sklearn.model_selection.StratifiedKFold import get_n_splits [as alias]
def _get_fold_generator(target_values):
    if params.stratified_cv:
        cv = StratifiedKFold(n_splits=params.n_cv_splits, shuffle=True, random_state=cfg.RANDOM_SEED)
        cv.get_n_splits(target_values)
        fold_generator = cv.split(target_values, target_values)
    else:
        cv = KFold(n_splits=params.n_cv_splits, shuffle=True, random_state=cfg.RANDOM_SEED)
        fold_generator = cv.split(target_values)
    return fold_generator
Developer: yjsgcjdfz123, Project: open-solution-home-credit, Lines of code: 11, Source file: pipeline_manager.py

Example 3: stratified_cross_validate

# Required import: from sklearn.model_selection import StratifiedKFold [as alias]
# Or: from sklearn.model_selection.StratifiedKFold import get_n_splits [as alias]
    def stratified_cross_validate(self, k):
        attributes = np.append(self.training_attributes, self.testing_attributes, axis=0)
        labels = np.append(self.training_labels, self.testing_labels, axis=0)

        all_data = np.array([np.append(attributes[i], labels[i]) for i in range(len(attributes))])

        #print("all data : %s" % all_data)
        #print("")

        np.random.shuffle(all_data)

        X = all_data[:, :-1]
        y = all_data[:, -1]
        print(X.shape, y.shape)
        skf = StratifiedKFold(n_splits=2)
        print(skf.get_n_splits(X, y))
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            yield (X_train, y_train, X_test, y_test)

        #print("shuffled data : %s" % all_data)
        #print("")

        for i in range(k):
            split = len(all_data) // k  # integer division so the result can be used as a slice index
            #print("split : %s" % split)

            test_data = all_data[i * split:(i + 1) * split, :]
            train_data = np.delete(all_data, np.arange(i * split, (i + 1) * split), axis=0)

            train_input, train_output = train_data[:, :-1], train_data[:, -1]
            test_input, test_output = test_data[:, :-1], test_data[:, -1]

            yield (train_input, train_output, test_input, test_output)
Developer: Piggelinus, Project: Project, Lines of code: 37, Source file: reader.py

Example 4: StratifiedKFold

# Required import: from sklearn.model_selection import StratifiedKFold [as alias]
# Or: from sklearn.model_selection.StratifiedKFold import get_n_splits [as alias]
        ycat.name = ycat.name + '_cat';
    
    ##########################################################################################        
    # <PLACEHOLDER FOR NON-GENERIC CODE: INSERT CODE HERE>
    X = X.dropna(); 
    y = np.log(200+y);
    ycat = pd.qcut(y, quantiles);
    ycat.name = ycat.name + '_cat';
    # <PLACEHOLDER FOR NON-GENERIC CODE: INSERT CODE HERE>    
    ########################################################################################## 
    
    # Get first iteration of the k-fold indices, use it for the train-validation split
    # Other iterations may be used later  
    #print 'Splitting training data into training and validation sets...';
    skf = StratifiedKFold(n_splits=int(1./validation_size), shuffle=True);
    skf.get_n_splits(X, y);
    train_indices, valid_indices = next(iter(skf.split(X, ycat)));
    # Scale the numeric columns if required.
    X = X.join(pd.Series('TRAIN', index=train_indices, name = 'rowtype').append(pd.Series('VALID', index=valid_indices, name = 'rowtype')));
    X_test=test_dataset.join(pd.Series('TEST', index=test_dataset.index, name = 'rowtype'));   

    # Combine train, valid and test covariates to create a consolidated covariate set
    covariates = pd.concat([X, X_test], axis=0, ignore_index=True);     
    # If id column does not exist, create one.
    if (idcol is None) or ( len(idcol) == 0 ):
        idcol = 'id';
        covariates=covariates.join(pd.Series( range(1, len(covariates) + 1,1), index=covariates.index, name = idcol ));
    
    # Find and add columns with zero std deviation to irrelevant columns- These add no information.                    
    irrelevant_cols = irrelevant_cols + (covariates.std(axis=0, numeric_only=True) < 0.5)[(covariates.std(axis=0) == 0.0)].index.tolist();                                                     
    
Developer: vinaybenny, Project: pyanalytics-reusablesnippets, Lines of code: 32, Source file: dataprocessor.py

Example 5: len

# Required import: from sklearn.model_selection import StratifiedKFold [as alias]
# Or: from sklearn.model_selection.StratifiedKFold import get_n_splits [as alias]
			else:
				Y_raw.append(0)
			X_raw.append(float(row[6]))

print(len(X_raw))
print(len(Y_raw))
X = np.array(X_raw)
X = np.reshape(X, (-1, 1))  # reshape into a single feature column
Y = np.array(Y_raw)

print(len(X))
print(len(Y))
# print(X)

skf = StratifiedKFold(n_splits=10)  # random_state removed: it has no effect when shuffle=False
skf.get_n_splits(X,Y)

# X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 42)

index = 0

precision_score_list_LR = list()
recall_score_list_LR = list()
precision_score_list_SVC_poly = list()
recall_score_list_SVC_poly = list()
precision_score_list_RF = list()
recall_score_list_RF = list()
for train_index, test_index in skf.split(X,Y):
	print "########################"
	X_train, X_test = X[train_index], X[test_index]
	# X_train, X_test = X.iloc[train_index], X.iloc[test_index]
Developer: bdqnghi, Project: code_clone_2vec, Lines of code: 33, Source file: evaluate_prediction_result2.py

Example 6: identification

# Required import: from sklearn.model_selection import StratifiedKFold [as alias]
# Or: from sklearn.model_selection.StratifiedKFold import get_n_splits [as alias]
def identification(data,data_flip,labels,thread_cnt,data_filename):
    print("Identification")

    # Get k-fold split of dataset (k=5)
    cv = StratifiedKFold(n_splits=5, shuffle=False)  # random_state removed: it has no effect when shuffle=False
    cv.get_n_splits(data,labels)

    ### Perform k-fold cross validation
    y_prob_list = []
    y_pred = np.array([])
    y_true = np.array([])
    for k,(train_index,test_index) in enumerate(cv.split(data,labels)):
        print("     Fold - " + str(k))

        # Get training and testing sets
        train = np.vstack([data[train_index,:],data_flip[train_index,:]])
        train_labels = np.append(labels[train_index],labels[train_index])
        test = data[test_index,:]
        test_labels = labels[test_index]

        # Normalize to z-scores
        mu = np.mean(train,axis=0)
        std = np.std(train,axis=0)
        train = (train - mu) / std
        test = (test - mu) / std

        # Get training classes
        classes = np.unique(train_labels)

        ### TRAINING
        svm = SVC(kernel='linear', probability=True)
        svm.fit(train,train_labels)

        ### TESTING
        prediction = svm.predict(test)
        prob = svm.predict_proba(test)

        for i,label in enumerate(test_labels):
            j = int(label-1)
            y_prob_list.append(prob[i,j]) 

        y_true = np.append(y_true,test_labels)
        y_pred = np.append(y_pred,prediction)
    
    print()

    ### OVERALL RESULTS    
    confusion_matrix = metrics.confusion_matrix(y_true,y_pred)
    TP = 0
    FP = 0
    FN = 0
    TN = 0
    for i in range(confusion_matrix.shape[0]):
        TP_i = confusion_matrix[i,i]
        FP_i = np.sum(confusion_matrix[i,:]) - TP_i
        FN_i = np.sum(confusion_matrix[:,i]) - TP_i
        TN_i = np.sum(np.sum(confusion_matrix)) - TP_i - FP_i - FN_i

        TP = TP + TP_i
        FP = FP + FP_i
        FN = FN + FN_i
        TN = TN + TN_i

    ACC = (TP + TN) / (TP + TN + FP + FN)
    FAR = FP / (FP + TN)
    FRR = FN / (FN + TP)

    # Print results
    print(data_filename)
    print("--------------------------------------------------------------------------------------")
    print("Identification Results:")
    print("TP: " + str(TP) + "\n" +
    "FP: " + str(FP) + "\n" +
    "FN: " + str(FN) + "\n" +
    "TN: " + str(TN) + "\n" +
    "ACC: " + str(ACC) + "\n" +
    "FAR: " + str(FAR) + "\n" +
    "FRR: " + str(FRR))
    print(str(min(y_prob_list)))
    print()
Developer: phillity, Project: Biocapsule, Lines of code: 82, Source file: Identification.py

Example 7: isinstance

# Required import: from sklearn.model_selection import StratifiedKFold [as alias]
# Or: from sklearn.model_selection.StratifiedKFold import get_n_splits [as alias]
i = 0
for (train, test), color in zip(cv.split(X, y), colors):
    probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
    assert isinstance(probas_, np.ndarray)
    print(probas_.shape)
    # Compute ROC curve and area the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=lw, color=color,
             label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    i += 1
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k',
         label='Luck')

mean_tpr /= cv.get_n_splits(X, y)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.savefig(os.path.join(local_path, 'plot_roc_crossval.png'))
Developer: hongbin0908, Project: pytrade, Lines of code: 32, Source file: plot_roc_crossval.py

Example 8: gbdt_cv_modeling

# Required import: from sklearn.model_selection import StratifiedKFold [as alias]
# Or: from sklearn.model_selection.StratifiedKFold import get_n_splits [as alias]
def gbdt_cv_modeling():
    """

    :return:
    """

    '''Data input'''
    data_b_train = pd.read_csv('../data/B_train_final.csv', index_col='no')
    data_test = pd.read_csv('../data/B_test_final.csv', index_col='no')

    data_train = data_b_train

    data_train_without_label = data_train.drop('flag', axis=1)
    frames = [data_train_without_label, data_test]

    '''Shuffle the training data with a fixed random seed'''
    s = 0
    np.random.seed(s)
    sampler = np.random.permutation(len(data_train.values))
    data_train_randomized = data_train.take(sampler)

    feature_name = list(data_train.columns.values)
    '''Fill missing values'''
    data_train_filled = data_train_randomized.fillna(value=10)

    '''Build the feature matrix and target vector'''
    x_temp = data_train_filled.iloc[:, :-1].values  # features (independent variables)
    y = data_train_filled.iloc[:, -1].values  # target (dependent variable)

    '''Feature selection'''
    X, dropped_feature_name, len_feature_choose = lgb_feature_selection(feature_name, x_temp, y, '0.1*mean')

    '''Prepare the validation set B_test'''
    data_test_filled = data_test.fillna(value=10)
    data_test_filled_after_feature_selection = data_test_feature_drop(data_test_filled, dropped_feature_name)

    '''Split train/test data sets'''
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)  # stratified sampling; "cv" stands for cross-validation

    '''Choose a classification model'''
    parameter_n_estimators = 400
    classifier = GradientBoostingClassifier(n_estimators=parameter_n_estimators)

    '''Model fit, predict and ROC'''
    colors = cycle(['cyan', 'indigo', 'seagreen', 'orange', 'blue'])
    lw = 2
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 500)
    i_of_roc = 0
    a = 0

    probability_set_of_b_test = []

    for (train_indice, test_indice), color in zip(cv.split(X, y), colors):
        a_model = classifier.fit(X[train_indice], y[train_indice])

        probas_ = a_model.predict_proba(X[test_indice])

        prob_of_b_test = a_model.predict_proba(data_test_filled_after_feature_selection)  # predict on B_test

        probability_set_of_b_test.append(prob_of_b_test[:, 1])

        fpr, tpr, thresholds = roc_curve(y[test_indice], probas_[:, 1])

        a += 1  # increment the fold counter

        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0

        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=lw, color=color, label='ROC fold %d (area = %0.4f)' % (i_of_roc, roc_auc))
        i_of_roc += 1

    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')

    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    print('mean_auc=' + str(mean_auc))
    plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--', label='Mean ROC (area = %0.4f)' % mean_auc, lw=lw)

    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    plt.title('ROC_rd_' + str(s) + '_gbdt_' + str(len_feature_choose) + '_features')
    plt.legend(loc="lower right")
    plt.show()

    avg_prob = (probability_set_of_b_test[0] + probability_set_of_b_test[1] + probability_set_of_b_test[2] +
                probability_set_of_b_test[3] + probability_set_of_b_test[4]) * 1.0 / 5

    result_file_name = '../result/B_test_gbdt_predict_cv_fillna_10_rd_' + str(s) + '_N_' + str(parameter_n_estimators) + '_features_' + \
                       str(len_feature_choose) + '.csv'
Developer: tomzhang, Project: QH_FInSight, Lines of code: 97, Source file: B_LGB_GBDT.py

Example 9: StratifiedKFold

# Required import: from sklearn.model_selection import StratifiedKFold [as alias]
# Or: from sklearn.model_selection.StratifiedKFold import get_n_splits [as alias]
import numpy as np
from sklearn.model_selection import StratifiedKFold

X = ["a", "b", "c", "d"]
y = [1, 1, 2, 2]
skf = StratifiedKFold(n_splits=2)
#for train, test in kf.split(X):
#    print("%s %s" % (train, test))

splits = skf.get_n_splits(X,y)

print(splits)
Developer: tomaye, Project: Thesis, Lines of code: 14, Source file: playWithDictVectors.py

Example 10: authentication

# Required import: from sklearn.model_selection import StratifiedKFold [as alias]
# Or: from sklearn.model_selection.StratifiedKFold import get_n_splits [as alias]
def authentication(data,data_flip,labels,thread_cnt,data_filename):
    print("Authentication")

    # Get k-fold split of dataset (k=2)
    cv = StratifiedKFold(n_splits=2, shuffle=False)  # random_state removed: it has no effect when shuffle=False
    cv.get_n_splits(data,labels)

    ### Perform k-fold cross validation
    y_prob = np.array([])
    y_pred = np.array([])
    y_true = np.array([])
    for k,(train_index,test_index) in enumerate(cv.split(data,labels)):
        print("     Fold - " + str(k))

        # Get training and testing sets
        train = np.vstack([data[train_index,:],data_flip[train_index,:]])
        train_labels = np.append(labels[train_index],labels[train_index])
        test = data[test_index,:]
        test_labels = labels[test_index]

        # Normalize to z-scores
        mu = np.mean(train,axis=0)
        std = np.std(train,axis=0)
        train = (train - mu) / std
        test = (test - mu) / std

        # Get training classes
        classes = np.unique(train_labels)
        classes_split = list(split_list(classes.tolist(),thread_cnt))

        ### TRAINING
        # Binary SVM for each class
        class_svms = []
        c_idxes = []
        threads = []
        que = Queue()

        # Thread to train each class binary SVM
        for li in classes_split:
            for i,c in enumerate(li):
                threads.append(Thread(target=authentication_train,args=(c,train,train_labels,que)))
                threads[-1].start()
            
            # Collect training thread results
            _ = [ t.join() for t in threads ]
            while not que.empty():
                (c_idx,svm) = que.get()
                c_idxes.append(c_idx)
                class_svms.append(svm)

        ### TESTING
        threads = []
        que = Queue()
        for li in classes_split:
            for i,c in enumerate(li):
                c_idx = c_idxes.index(c)
                threads.append(Thread(target=authentication_test,args=(c,class_svms[c_idx],test,test_labels,que)))
                threads[-1].start()

            # Collect testing thread results
            _ = [ t.join() for t in threads ]
            while not que.empty():
                result = que.get()

                c = int(result[2])
                c_prob = result[0]
                c_true = result[1]
                c_pred = np.zeros(c_prob.shape[0])
                c_pred[c_prob<0.5] = 1

                y_prob = np.append(y_prob,c_prob)
                y_true = np.append(y_true,c_true)
                y_pred = np.append(y_pred,c_pred)
    
    print()

    ### OVERALL RESULTS    
    TP, FN, FP, TN = metrics.confusion_matrix(y_true,y_pred,labels=[0,1]).ravel()
    ACC = (TP + TN) / (TP + TN + FP + FN)
    FAR = FP / (FP + TN)
    FRR = FN / (FN + TP)

    fpr, tpr, thresholds = metrics.roc_curve(y_true,y_prob,pos_label=0)
    EER = brentq(lambda x : 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
    EER_thresh = interp1d(fpr, thresholds)(EER)
    y_prob = np.ones(y_prob.shape) - y_prob
    AUC = metrics.roc_auc_score(y_true,y_prob)
    
    # Print results
    print(data_filename)
    print("--------------------------------------------------------------------------------------")
    print("Authentication Results:")
    print("TP: " + str(TP) + "\n" +
    "FP: " + str(FP) + "\n" +
    "FN: " + str(FN) + "\n" +
    "TN: " + str(TN) + "\n" +
    "ACC: " + str(ACC) + "\n" +
    "FAR: " + str(FAR) + "\n" +
    "FRR: " + str(FRR) + "\n" +
    "AUC: " + str(AUC) + "\n" +
#......... remaining code omitted .........
Developer: phillity, Project: Biocapsule, Lines of code: 103, Source file: Authentication.py

Example 11: encode_dataset

# Required import: from sklearn.model_selection import StratifiedKFold [as alias]
# Or: from sklearn.model_selection.StratifiedKFold import get_n_splits [as alias]
                          }}




all_data , y_train = encode_dataset(train=train,test=test,meta=meta,target_model='lightgbm')
print("*****************************")
print(all_data.head())
train_obs = len(y_train)
train = all_data[:train_obs]
test = all_data[train_obs:]
train_ids = train.index
test_ids  = test.index

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
skf.get_n_splits(train_ids, y_train)

lgb_test_result  = np.zeros(test_ids.shape[0])
#lgb_train_result = np.zeros(train_ids.shape[0])
#xgb_test_result  = np.zeros(test_ids.shape[0])
#xgb_train_result = np.zeros(train_ids.shape[0])
counter = 0
#Transform data using small groups to reduce memory usage
m = 100000
print('\nLightGBM\n')

for train_index, test_index in skf.split(train_ids, y_train):
    print('Fold {}\n'.format(counter + 1))
    print("**************************") 
    print("train_index:",train_index)
    print("**************************")
Developer: gtesei, Project: fast-furious, Lines of code: 33, Source file: base_lightGBM.py


Note: The sklearn.model_selection.StratifiedKFold.get_n_splits examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. For distribution and use, please refer to the license of the corresponding project. Do not reproduce without permission.