本文整理汇总了 Python 中 sklearn.model_selection.StratifiedKFold.get_n_splits 方法的典型用法代码示例。如果您正苦于以下问题:Python StratifiedKFold.get_n_splits 方法的具体用法?Python StratifiedKFold.get_n_splits 怎么用?Python StratifiedKFold.get_n_splits 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 sklearn.model_selection.StratifiedKFold 的用法示例。
在下文中一共展示了StratifiedKFold.get_n_splits方法的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: stacking_proba
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 注意: get_n_splits 是 StratifiedKFold 的实例方法, 不能单独导入; 请通过实例调用, 例如 StratifiedKFold(n_splits=5).get_n_splits(X, y)
def stacking_proba(clf,X_train,y,X_test,nfolds=5,random_seed=2017,return_score=False,
                   shuffle=True,metric='acc',clf_name='UnKnown'):
    """Build out-of-fold and test-set class probabilities for stacking.

    ``clf`` is fitted on every stratified training fold and predicts the
    matching held-out fold, which fills the out-of-fold probability matrix.
    Afterwards it is refitted on the complete training set to predict
    ``X_test``.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict_proba``.
    X_train : training samples, indexable as ``X_train[rows, :]``.
    y : training labels, indexable with an integer index array.
    X_test : samples scored after the final fit on all training data.
    nfolds : number of stratified folds.
    random_seed : seed for the fold shuffling; only used when ``shuffle`` is
        true.
    return_score : when true, the mean validation accuracy is returned too.
    shuffle : whether the samples are shuffled before being split.
    metric, clf_name : accepted for backward compatibility; not used.

    Returns
    -------
    ``(train_stacking_proba, test_stacking_proba)`` or, when ``return_score``
    is true, ``(train_stacking_proba, test_stacking_proba, score)``.  For
    binary problems only the probability column at index 1 is returned.
    """
    # Recent scikit-learn releases raise ValueError when random_state is set
    # while shuffle is False, so the seed is forwarded only when it is used.
    folds = StratifiedKFold(n_splits=nfolds, shuffle=shuffle,
                            random_state=random_seed if shuffle else None)

    n_classes = np.unique(y).shape[0]
    train_stacking_proba = np.zeros((X_train.shape[0], n_classes))
    score = 0.0
    for train_index, validate_index in folds.split(X_train, y):
        X_train_fold = X_train[train_index, :]
        y_train_fold = y[train_index]
        X_validate_fold = X_train[validate_index, :]
        y_validate_fold = y[validate_index]

        clf.fit(X_train_fold, y_train_fold)
        fold_preds = clf.predict_proba(X_validate_fold)
        train_stacking_proba[validate_index, :] = fold_preds

        # Validation accuracy: translate the arg-max column back to a label
        # through clf.classes_ so labels need not be 0..K-1.  Estimators
        # without classes_ keep the original "column index == label" rule.
        best_column = np.argmax(fold_preds, axis=1)
        known_classes = getattr(clf, 'classes_', None)
        if known_classes is None:
            fold_labels = best_column
        else:
            fold_labels = np.asarray(known_classes)[best_column]
        # np.mean yields a float even where "/" would be integer division.
        score += np.mean(fold_labels == np.asarray(y_validate_fold))
    score /= nfolds

    # Test-set probabilities come from a model trained on all training data.
    clf.fit(X_train, y)
    test_stacking_proba = clf.predict_proba(X_test)

    if n_classes == 2:  # binary classification: keep the positive class only
        train_stacking_proba = train_stacking_proba[:, 1]
        test_stacking_proba = test_stacking_proba[:, 1]

    if return_score:
        return train_stacking_proba, test_stacking_proba, score
    return train_stacking_proba, test_stacking_proba
示例2: _get_fold_generator
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 注意: get_n_splits 是 StratifiedKFold 的实例方法, 不能单独导入; 请通过实例调用, 例如 StratifiedKFold(n_splits=5).get_n_splits(X, y)
def _get_fold_generator(target_values):
    """Return a generator of (train, test) index pairs over ``target_values``.

    When ``params.stratified_cv`` is truthy the folds are stratified on
    ``target_values`` itself; otherwise a plain shuffled K-fold is used.
    Both splitters take their fold count from ``params.n_cv_splits`` and
    their seed from ``cfg.RANDOM_SEED``.
    """
    if not params.stratified_cv:
        plain_splitter = KFold(
            n_splits=params.n_cv_splits,
            shuffle=True,
            random_state=cfg.RANDOM_SEED,
        )
        return plain_splitter.split(target_values)

    stratified_splitter = StratifiedKFold(
        n_splits=params.n_cv_splits,
        shuffle=True,
        random_state=cfg.RANDOM_SEED,
    )
    # Result intentionally discarded, exactly as in the original code.
    stratified_splitter.get_n_splits(target_values)
    # The targets serve both as the "samples" and as the stratification labels.
    return stratified_splitter.split(target_values, target_values)
示例3: stratified_cross_validate
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 注意: get_n_splits 是 StratifiedKFold 的实例方法, 不能单独导入; 请通过实例调用, 例如 StratifiedKFold(n_splits=5).get_n_splits(X, y)
def stratified_cross_validate(self, k):
    """Yield ``(train_X, train_y, test_X, test_y)`` partitions of all data.

    The training and testing attributes/labels held on ``self`` are pooled,
    joined row-wise (label in the last column) and shuffled in place with
    NumPy's global random state.  Two families of partitions are yielded:

    1. the two folds of a ``StratifiedKFold(n_splits=2)`` split, then
    2. ``k`` contiguous folds of ``len(all_data) // k`` rows each; rows left
       over after the last full fold never serve as test data.

    :param k: number of contiguous folds produced in the second phase.
    """
    attributes = np.append(self.training_attributes, self.testing_attributes, axis=0)
    labels = np.append(self.training_labels, self.testing_labels, axis=0)
    # One row per sample: the attribute values followed by the label.
    all_data = np.array([np.append(row, labels[i]) for i, row in enumerate(attributes)])
    np.random.shuffle(all_data)

    X = all_data[:, :-1]
    y = all_data[:, -1]
    print(X.shape, y.shape)

    skf = StratifiedKFold(n_splits=2)
    print(skf.get_n_splits(X, y))
    for train_index, test_index in skf.split(X, y):
        yield (X[train_index], y[train_index], X[test_index], y[test_index])

    # Floor division: a float fold size cannot be used as a slice bound on
    # Python 3.  The size does not depend on the fold, so compute it once.
    split = len(all_data) // k
    for i in range(k):
        start, stop = i * split, (i + 1) * split
        test_data = all_data[start:stop, :]
        train_data = np.delete(all_data, np.arange(start, stop), axis=0)
        yield (train_data[:, :-1], train_data[:, -1],
               test_data[:, :-1], test_data[:, -1])
示例4: StratifiedKFold
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 注意: get_n_splits 是 StratifiedKFold 的实例方法, 不能单独导入; 请通过实例调用, 例如 StratifiedKFold(n_splits=5).get_n_splits(X, y)
# Excerpt of a longer script: X, y, ycat, quantiles, validation_size, test_dataset,
# idcol and irrelevant_cols are all defined before this point (not visible here).
# Give the categorical target a name distinct from the continuous one by adding '_cat'.
ycat.name = ycat.name + '_cat';
##########################################################################################
# <PLACEHOLDER FOR NON-GENERIC CODE: INSERT CODE HERE>
# Drop covariate rows that contain missing values.
# NOTE(review): y is not filtered alongside X here, so the two may stop being row-aligned — confirm.
X = X.dropna();
# Log-transform the target after shifting it by 200.
y = np.log(200+y);
# Bin the transformed target with pd.qcut; the bins are used for stratification below.
ycat = pd.qcut(y, quantiles);
ycat.name = ycat.name + '_cat';
# <PLACEHOLDER FOR NON-GENERIC CODE: INSERT CODE HERE>
##########################################################################################
# Get first iteration of the k-fold indices, use it for the train-validation split
# Other iterations may be used later
#print 'Splitting training data into training and validation sets...';
# The number of folds is derived from the validation fraction (e.g. 0.2 -> 5 folds).
skf = StratifiedKFold(n_splits=int(1./validation_size), shuffle=True);
# The return value (the number of folds) is discarded.
skf.get_n_splits(X, y);
# Stratify on the binned target and keep only the first (train, validation) pair.
train_indices, valid_indices = next(iter(skf.split(X, ycat)));
# Tag every row with the split it belongs to through a 'rowtype' column.
# NOTE(review): split() yields positional indices, but they are used as index labels here;
# after dropna() the labels of X may no longer equal the positions — confirm.
# NOTE(review): Series.append is unavailable in recent pandas releases — confirm the pandas version targeted.
X = X.join(pd.Series('TRAIN', index=train_indices, name = 'rowtype').append(pd.Series('VALID', index=valid_indices, name = 'rowtype')));
X_test=test_dataset.join(pd.Series('TEST', index=test_dataset.index, name = 'rowtype'));
# Combine train, valid and test covariates to create a consolidated covariate set
covariates = pd.concat([X, X_test], axis=0, ignore_index=True);
# If id column does not exist, create one.
if (idcol is None) or ( len(idcol) == 0 ):
idcol = 'id';
# The generated ids are the consecutive integers 1..len(covariates).
covariates=covariates.join(pd.Series( range(1, len(covariates) + 1,1), index=covariates.index, name = idcol ));
# Find and add columns with zero std deviation to irrelevant columns- These add no information.
# Only the mask `std == 0.0` decides which columns are selected; the `< 0.5` comparison merely
# supplies the Series whose index labels are collected.
irrelevant_cols = irrelevant_cols + (covariates.std(axis=0, numeric_only=True) < 0.5)[(covariates.std(axis=0) == 0.0)].index.tolist();
示例5: len
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 注意: get_n_splits 是 StratifiedKFold 的实例方法, 不能单独导入; 请通过实例调用, 例如 StratifiedKFold(n_splits=5).get_n_splits(X, y)
# Excerpt (Python 2 syntax): the loop that fills X_raw / Y_raw starts before this point,
# and the cross-validation loop at the bottom is cut off by the listing.
else:
# Rows reaching this branch get the label 0.
Y_raw.append(0)
# The single feature is taken from column 6 of the row, converted to float.
X_raw.append(float(row[6]))
print len(X_raw)
print len(Y_raw)
X = np.array(X_raw)
# Turn the feature vector into a single-column 2-D array.
# NOTE(review): -2 is presumably meant to be -1 (the "infer this dimension" marker) — confirm.
X = np.reshape(X,(-2,1))
Y = np.array(Y_raw)
print len(X)
print len(Y)
# print X
# NOTE(review): random_state is given without shuffle=True; recent scikit-learn releases
# reject that combination — confirm the library version in use.
skf = StratifiedKFold(n_splits=10,random_state=40)
# The return value (the number of folds) is discarded.
skf.get_n_splits(X,Y)
# X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 42)
index = 0
# Per-fold precision/recall collectors; the suffixes suggest one pair per model
# (LR, SVC_poly, RF) — the code that fills them is not visible in this excerpt.
precision_score_list_LR = list()
recall_score_list_LR = list()
precision_score_list_SVC_poly = list()
recall_score_list_SVC_poly = list()
precision_score_list_RF = list()
recall_score_list_RF = list()
# Iterate over the stratified (train, test) index pairs.
for train_index, test_index in skf.split(X,Y):
print "########################"
X_train, X_test = X[train_index], X[test_index]
# X_train, X_test = X.iloc[train_index], X.iloc[test_index]
示例6: identification
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 注意: get_n_splits 是 StratifiedKFold 的实例方法, 不能单独导入; 请通过实例调用, 例如 StratifiedKFold(n_splits=5).get_n_splits(X, y)
def identification(data,data_flip,labels,thread_cnt,data_filename):
    """Run 5-fold stratified identification with a linear SVM and print metrics.

    For every fold the training rows of ``data`` are augmented with the same
    rows of ``data_flip``, both sets are z-scored with the training
    statistics, a probabilistic linear SVM is trained and the held-out rows
    are classified.  Micro-aggregated TP/FP/FN/TN, accuracy, FAR and FRR over
    all folds are printed, followed by the smallest probability assigned to a
    sample's true class.

    :param data: 2-D sample array, indexed as ``data[rows, :]``.
    :param data_flip: array with the same row layout as ``data``; only its
        training rows are used (as augmentation).
    :param labels: 1-D label array aligned with the rows of ``data``.
    :param thread_cnt: unused; kept for signature compatibility.
    :param data_filename: printed as the header of the result report.
    :return: None
    """
    print("Identification")

    # shuffle=False makes the split deterministic; random_state is left out
    # because recent scikit-learn releases reject it when shuffle is False.
    cv = StratifiedKFold(n_splits=5, shuffle=False)

    ### Perform k-fold cross validation
    y_prob_list = []
    y_pred = np.array([])
    y_true = np.array([])
    for k, (train_index, test_index) in enumerate(cv.split(data, labels)):
        print(" Fold - " + str(k))

        # Training set = original rows plus their flipped counterparts.
        train = np.vstack([data[train_index, :], data_flip[train_index, :]])
        train_labels = np.append(labels[train_index], labels[train_index])
        test = data[test_index, :]
        test_labels = labels[test_index]

        # Normalize to z-scores using training statistics only.
        mu = np.mean(train, axis=0)
        std = np.std(train, axis=0)
        # A constant feature has zero spread; dividing by 1 maps it to 0
        # instead of producing NaN/inf values.
        std = np.where(std == 0, 1.0, std)
        train = (train - mu) / std
        test = (test - mu) / std

        ### TRAINING
        svm = SVC(kernel='linear', probability=True)
        svm.fit(train, train_labels)

        ### TESTING
        prediction = svm.predict(test)
        prob = svm.predict_proba(test)
        # predict_proba columns follow svm.classes_; look the column up rather
        # than assuming the labels are exactly 1..K.
        class_column = {cls: col for col, cls in enumerate(svm.classes_)}
        for i, label in enumerate(test_labels):
            y_prob_list.append(prob[i, class_column[label]])
        y_true = np.append(y_true, test_labels)
        y_pred = np.append(y_pred, prediction)
    print()

    ### OVERALL RESULTS
    conf_mat = metrics.confusion_matrix(y_true, y_pred)
    # Rows are true classes and columns are predicted classes, so a row's
    # off-diagonal mass is that class's false negatives and a column's is its
    # false positives.  The micro-aggregated totals match the original code.
    tp_per_class = np.diag(conf_mat)
    fn_per_class = conf_mat.sum(axis=1) - tp_per_class
    fp_per_class = conf_mat.sum(axis=0) - tp_per_class
    tn_per_class = conf_mat.sum() - tp_per_class - fp_per_class - fn_per_class
    TP = tp_per_class.sum()
    FP = fp_per_class.sum()
    FN = fn_per_class.sum()
    TN = tn_per_class.sum()
    ACC = (TP + TN) / (TP + TN + FP + FN)
    FAR = FP / (FP + TN)
    FRR = FN / (FN + TP)

    # Print results
    print(data_filename)
    print("--------------------------------------------------------------------------------------")
    print("Identification Results:")
    print("TP: " + str(TP) + "\n" +
          "FP: " + str(FP) + "\n" +
          "FN: " + str(FN) + "\n" +
          "TN: " + str(TN) + "\n" +
          "ACC: " + str(ACC) + "\n" +
          "FAR: " + str(FAR) + "\n" +
          "FRR: " + str(FRR))
    print(str(min(y_prob_list)))
    print()
示例7: isinstance
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 注意: get_n_splits 是 StratifiedKFold 的实例方法, 不能单独导入; 请通过实例调用, 例如 StratifiedKFold(n_splits=5).get_n_splits(X, y)
# Excerpt of a longer script: cv, X, y, colors, classifier, mean_tpr, mean_fpr, lw and
# local_path are defined before this point (not visible here).
# Fold counter; it only feeds the legend label of each fold's ROC curve.
i = 0
# Pair every cross-validation split with a plot colour.
for (train, test), color in zip(cv.split(X, y), colors):
# Fit on the training rows and predict class probabilities for the held-out rows.
probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
assert isinstance(probas_, np.ndarray)
print(probas_.shape)
# Compute the ROC curve and the area under the curve; column 1 of probas_ is the score.
fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
# Accumulate this fold's TPR evaluated on the shared mean_fpr grid.
# NOTE(review): `interp` is imported outside this excerpt — presumably numpy/scipy linear
# interpolation; confirm.
mean_tpr += interp(mean_fpr, fpr, tpr)
# Force the accumulated curve to start at TPR 0.
mean_tpr[0] = 0.0
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, lw=lw, color=color,
label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
i += 1
# Dashed diagonal from (0, 0) to (1, 1), labelled 'Luck'.
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k',
label='Luck')
# Turn the accumulated TPR into a mean by dividing by the number of folds.
mean_tpr /= cv.get_n_splits(X, y)
# Force the mean curve to end at TPR 1.
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)
# Pad both axes slightly beyond [0, 1].
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
# Save the figure under local_path instead of showing it.
plt.savefig(os.path.join(local_path, 'plot_roc_crossval.png'))
示例8: gbdt_cv_modeling
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 注意: get_n_splits 是 StratifiedKFold 的实例方法, 不能单独导入; 请通过实例调用, 例如 StratifiedKFold(n_splits=5).get_n_splits(X, y)
def gbdt_cv_modeling():
    """Cross-validate a GBDT classifier, plot fold ROC curves, average test scores.

    The training table is read from ``../data/B_train_final.csv``, shuffled
    with a fixed seed, NaN-filled with 10 and reduced by
    ``lgb_feature_selection``.  A ``GradientBoostingClassifier`` is fitted on
    each of 5 stratified folds; every fold contributes a ROC curve to the plot
    and a probability vector for ``../data/B_test_final.csv``.  The per-fold
    test probabilities are averaged at the end.

    :return: None.  NOTE(review): the available listing stops right after the
        result file name is built, so ``avg_prob`` / ``result_file_name`` are
        presumably written out by code that is not visible here.
    """
    # Data input
    data_train = pd.read_csv('../data/B_train_final.csv', index_col='no')
    data_test = pd.read_csv('../data/B_test_final.csv', index_col='no')

    # Shuffle the training rows with a fixed random seed.
    s = 0
    np.random.seed(s)
    sampler = np.random.permutation(len(data_train.values))
    data_train_randomized = data_train.take(sampler)
    feature_name = list(data_train.columns.values)

    # Fill missing values with the constant 10.
    data_train_filled = data_train_randomized.fillna(value=10)

    # Split into features (all but the last column) and target (last column).
    # `.values` replaces `.as_matrix()`, which newer pandas no longer provides.
    x_temp = data_train_filled.iloc[:, :-1].values
    y = data_train_filled.iloc[:, -1].values

    # Feature selection
    X, dropped_feature_name, len_feature_choose = lgb_feature_selection(
        feature_name, x_temp, y, '0.1*mean')

    # Apply the same filling and feature drop to the B_test set.
    data_test_filled = data_test.fillna(value=10)
    data_test_filled_after_feature_selection = data_test_feature_drop(
        data_test_filled, dropped_feature_name)

    # Stratified cross-validation splitter
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    # Choose a classification model
    parameter_n_estimators = 400
    classifier = GradientBoostingClassifier(n_estimators=parameter_n_estimators)

    # Model fit, predict and ROC
    colors = cycle(['cyan', 'indigo', 'seagreen', 'orange', 'blue'])
    lw = 2
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 500)
    probability_set_of_b_test = []
    for fold_no, ((train_indice, test_indice), color) in enumerate(zip(cv.split(X, y), colors)):
        a_model = classifier.fit(X[train_indice], y[train_indice])
        probas_ = a_model.predict_proba(X[test_indice])
        # Score B_test with this fold's model; keep the column at index 1.
        prob_of_b_test = a_model.predict_proba(data_test_filled_after_feature_selection)
        probability_set_of_b_test.append(prob_of_b_test[:, 1])

        fpr, tpr, thresholds = roc_curve(y[test_indice], probas_[:, 1])
        # np.interp replaces the bare `interp` name (a SciPy alias of the same
        # NumPy function that newer SciPy no longer ships).
        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=lw, color=color,
                 label='ROC fold %d (area = %0.4f)' % (fold_no, roc_auc))

    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')
    # Average the accumulated TPR over the number of folds.
    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    # Parenthesised form works as a statement on Python 2 and a call on Python 3.
    print('mean_auc=' + str(mean_auc))
    plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
             label='Mean ROC (area = %0.4f)' % mean_auc, lw=lw)
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC_rd_' + str(s) + '_gbdt_' + str(len_feature_choose) + '_features')
    plt.legend(loc="lower right")
    plt.show()

    # Mean test probability across folds; unlike a hand-written five-term sum
    # this stays correct if the number of folds is changed.
    avg_prob = np.mean(probability_set_of_b_test, axis=0)
    result_file_name = '../result/B_test_gbdt_predict_cv_fillna_10_rd_' + str(s) + '_N_' + str(parameter_n_estimators) + '_features_' + \
                       str(len_feature_choose) + '.csv'
示例9: StratifiedKFold
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 注意: get_n_splits 是 StratifiedKFold 的实例方法, 不能单独导入; 请通过实例调用, 例如 StratifiedKFold(n_splits=5).get_n_splits(X, y)
import numpy as np
from sklearn.model_selection import StratifiedKFold
# Four toy samples in two classes of two samples each.
X = ["a", "b", "c", "d"]
y = [1, 1, 2, 2]
skf = StratifiedKFold(n_splits=2)
# To iterate over the folds themselves (the labels are required for stratification):
#for train, test in skf.split(X, y):
#    print("%s %s" % (train, test))
# get_n_splits reports the number of splitting iterations, i.e. the n_splits given above.
splits = skf.get_n_splits(X,y)
print(splits)
示例10: authentication
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 注意: get_n_splits 是 StratifiedKFold 的实例方法, 不能单独导入; 请通过实例调用, 例如 StratifiedKFold(n_splits=5).get_n_splits(X, y)
# Cross-validated authentication: one binary SVM per class is trained and tested in its own
# thread, and pooled TP/FP/FN/TN, ACC, FAR, FRR, EER and AUC are printed.
# Parameters: data / data_flip are 2-D sample arrays (data_flip rows augment the training set),
# labels is aligned with the rows of data, thread_cnt controls how the class list is chunked,
# data_filename is printed as the report header.
# NOTE: the listing is truncated by the source page; the final print call is left unfinished.
def authentication(data,data_flip,labels,thread_cnt,data_filename):
print("Authentication")
# Get k-fold split of dataset (k=2, as configured on the next line)
# NOTE(review): random_state is passed together with shuffle=False; recent scikit-learn
# releases reject that combination — confirm the library version in use.
cv = StratifiedKFold(n_splits=2,shuffle=False,random_state=0)
# The return value (the number of folds) is discarded.
cv.get_n_splits(data,labels)
### Perform k-fold cross validation
y_prob = np.array([])
y_pred = np.array([])
y_true = np.array([])
for k,(train_index,test_index) in enumerate(cv.split(data,labels)):
print(" Fold - " + str(k))
# Get training and testing sets
# (training rows are augmented with the matching rows of data_flip)
train = np.vstack([data[train_index,:],data_flip[train_index,:]])
train_labels = np.append(labels[train_index],labels[train_index])
test = data[test_index,:]
test_labels = labels[test_index]
# Normalize to z-scores
# (statistics come from the training set only and are reused for the test set)
mu = np.mean(train,axis=0)
std = np.std(train,axis=0)
train = (train - mu) / std
test = (test - mu) / std
# Get training classes
classes = np.unique(train_labels)
# split_list is defined outside this excerpt; presumably it chunks the classes by thread_cnt.
classes_split = list(split_list(classes.tolist(),thread_cnt))
### TRAINING
# Binary SVM for each class
class_svms = []
c_idxes = []
threads = []
que = Queue()
# Thread to train each class binary SVM
# (every thread is started immediately; results are delivered through the queue)
for li in classes_split:
for i,c in enumerate(li):
threads.append(Thread(target=authentication_train,args=(c,train,train_labels,que)))
threads[-1].start()
# Collect training thread results
_ = [ t.join() for t in threads ]
# Each queue item is unpacked as a (class id, trained SVM) pair.
while not que.empty():
(c_idx,svm) = que.get()
c_idxes.append(c_idx)
class_svms.append(svm)
### TESTING
threads = []
que = Queue()
for li in classes_split:
for i,c in enumerate(li):
# Find the SVM that was trained for class c.
c_idx = c_idxes.index(c)
threads.append(Thread(target=authentication_test,args=(c,class_svms[c_idx],test,test_labels,que)))
threads[-1].start()
# Collect testing thread results
_ = [ t.join() for t in threads ]
# Each result is read as (probabilities, true values, class id).
while not que.empty():
result = que.get()
c = int(result[2])
c_prob = result[0]
c_true = result[1]
# Prediction is 1 where the probability is below 0.5 and 0 otherwise.
c_pred = np.zeros(c_prob.shape[0])
c_pred[c_prob<0.5] = 1
y_prob = np.append(y_prob,c_prob)
y_true = np.append(y_true,c_true)
y_pred = np.append(y_pred,c_pred)
print()
### OVERALL RESULTS
# With labels=[0,1], ravel() yields the counts in the order
# (true 0/pred 0, true 0/pred 1, true 1/pred 0, true 1/pred 1); label 0 is treated as positive.
TP, FN, FP, TN = metrics.confusion_matrix(y_true,y_pred,labels=[0,1]).ravel()
ACC = (TP + TN) / (TP + TN + FP + FN)
FAR = FP / (FP + TN)
FRR = FN / (FN + TP)
fpr, tpr, thresholds = metrics.roc_curve(y_true,y_prob,pos_label=0)
# Equal error rate: the point x where 1 - x equals the interpolated TPR at x.
EER = brentq(lambda x : 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
EER_thresh = interp1d(fpr, thresholds)(EER)
# Replace every probability p by 1 - p before computing the AUC.
y_prob = np.ones(y_prob.shape) - y_prob
AUC = metrics.roc_auc_score(y_true,y_prob)
# Print results
print(data_filename)
print("--------------------------------------------------------------------------------------")
print("Authentication Results:")
print("TP: " + str(TP) + "\n" +
"FP: " + str(FP) + "\n" +
"FN: " + str(FN) + "\n" +
"TN: " + str(TN) + "\n" +
"ACC: " + str(ACC) + "\n" +
"FAR: " + str(FAR) + "\n" +
"FRR: " + str(FRR) + "\n" +
"AUC: " + str(AUC) + "\n" +
#......... part of the code is omitted here .........
示例11: encode_dataset
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 注意: get_n_splits 是 StratifiedKFold 的实例方法, 不能单独导入; 请通过实例调用, 例如 StratifiedKFold(n_splits=5).get_n_splits(X, y)
}}
# Excerpt of a longer script: train, test, meta and encode_dataset are defined before this
# point, and the body of the fold loop continues past the end of the listing.
# Encode train and test together for the 'lightgbm' target model; the call returns the
# combined frame plus the training labels.
all_data , y_train = encode_dataset(train=train,test=test,meta=meta,target_model='lightgbm')
print("*****************************")
print(all_data.head())
# The first len(y_train) rows of the combined frame are the training rows; the rest are test rows.
train_obs = len(y_train)
train = all_data[:train_obs]
test = all_data[train_obs:]
train_ids = train.index
test_ids = test.index
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# The return value (the number of folds) is discarded.
skf.get_n_splits(train_ids, y_train)
# One zero-initialised slot per test row; presumably the per-fold test predictions are
# accumulated here by code beyond this excerpt — confirm.
lgb_test_result = np.zeros(test_ids.shape[0])
#lgb_train_result = np.zeros(train_ids.shape[0])
#xgb_test_result = np.zeros(test_ids.shape[0])
#xgb_train_result = np.zeros(train_ids.shape[0])
counter = 0
#Transform data using small groups to reduce memory usage
# NOTE(review): m is presumably the group size used for that chunking — its use is not visible here.
m = 100000
print('\nLightGBM\n')
# Split on the training index values, stratified by y_train; the yielded arrays are positions.
for train_index, test_index in skf.split(train_ids, y_train):
print('Fold {}\n'.format(counter + 1))
print("**************************")
print("train_index:",train_index)
print("**************************")