本文整理汇总了Python中sklearn.model_selection.StratifiedKFold类的典型用法代码示例。如果您正苦于以下问题：Python StratifiedKFold类的具体用法？Python StratifiedKFold怎么用？Python StratifiedKFold使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了StratifiedKFold类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: stacking_proba
def stacking_proba(clf,X_train,y,X_test,nfolds=5,random_seed=2017,return_score=False,
                   shuffle=True,metric='acc',clf_name='UnKnown'):
    """Build out-of-fold and test-set class probabilities for stacking.

    ``clf`` is fitted on every stratified training fold and its
    ``predict_proba`` output for the held-out fold is written into the
    matching rows of the train matrix.  Afterwards ``clf`` is refitted on the
    whole of ``X_train`` and used to predict probabilities for ``X_test``.

    Args:
        clf: Estimator exposing ``fit`` and ``predict_proba``.
        X_train: Training features, indexed as ``X_train[rows, :]``.
        y: Training labels, indexed as ``y[rows]``.
        X_test: Features handed to ``predict_proba`` after the final fit.
        nfolds: Number of stratified folds.
        random_seed: Seed for fold shuffling; only used when ``shuffle`` is
            true.
        return_score: When true, also return the mean fold accuracy.
        shuffle: Whether samples are shuffled before splitting.
        metric: Unused; kept for backward compatibility.
        clf_name: Unused; kept for backward compatibility.

    Returns:
        ``(train_stacking_proba, test_stacking_proba)``, with the mean fold
        accuracy appended when ``return_score`` is true.  When ``y`` holds
        exactly two distinct values only probability column 1 is returned.
    """
    # scikit-learn rejects a random_state when shuffle is False, so only
    # forward the seed when it can actually have an effect.
    folds = StratifiedKFold(n_splits=nfolds, shuffle=shuffle,
                            random_state=random_seed if shuffle else None)
    n_classes = np.unique(y).shape[0]
    train_stacking_proba = np.zeros((X_train.shape[0], n_classes))
    score = 0
    for train_index, validate_index in folds.split(X_train, y):
        X_train_fold = X_train[train_index, :]
        y_train_fold = y[train_index]
        X_validate_fold = X_train[validate_index, :]
        y_validate_fold = y[validate_index]
        clf.fit(X_train_fold, y_train_fold)
        fold_preds = clf.predict_proba(X_validate_fold)
        train_stacking_proba[validate_index, :] = fold_preds
        # Fold accuracy: translate the arg-max column back to a class label
        # so the comparison is also right when labels are not 0..K-1.
        best_column = np.argmax(fold_preds, axis=1)
        classes = getattr(clf, 'classes_', None)
        if classes is None:
            # No label mapping available: fall back to the column index.
            fold_labels = best_column
        else:
            fold_labels = np.asarray(classes)[best_column]
        score += float(np.mean(fold_labels == np.asarray(y_validate_fold)))
    score /= nfolds
    # Refit on the full training set to produce the test-set probabilities.
    clf.fit(X_train, y)
    test_stacking_proba = clf.predict_proba(X_test)
    if n_classes == 2:  # binary classification: keep only positive class proba
        train_stacking_proba = train_stacking_proba[:, 1]
        test_stacking_proba = test_stacking_proba[:, 1]
    if return_score:
        return train_stacking_proba, test_stacking_proba, score
    return train_stacking_proba, test_stacking_proba
示例2: test_kfold_valueerrors
def test_kfold_valueerrors():
    """Check the argument validation of ``KFold`` and ``StratifiedKFold``."""
    X1 = np.array([[1, 2], [3, 4], [5, 6]])
    X2 = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
    # Check that errors are raised if there is not enough samples
    assert_raises(ValueError, next, KFold(4).split(X1))
    # Check that a warning is raised if the least populated class has too few
    # members.
    y = np.array([3, 3, -1, -1, 2])
    skf_3 = StratifiedKFold(3)
    assert_warns_message(Warning, "The least populated class",
                         next, skf_3.split(X2, y))
    # Check that despite the warning the folds are still computed even
    # though all the classes are not necessarily represented at on each
    # side of the split at each split
    with warnings.catch_warnings():
        # catch_warnings() alone only saves/restores the filters; the
        # expected warning has to be silenced explicitly.
        warnings.simplefilter("ignore")
        check_cv_coverage(skf_3, X2, y, labels=None, expected_n_iter=3)
    # Error when number of folds is <= 1
    assert_raises(ValueError, KFold, 0)
    assert_raises(ValueError, KFold, 1)
    assert_raises(ValueError, StratifiedKFold, 0)
    assert_raises(ValueError, StratifiedKFold, 1)
    # When n_folds is not integer:
    assert_raises(ValueError, KFold, 1.5)
    assert_raises(ValueError, KFold, 2.0)
    assert_raises(ValueError, StratifiedKFold, 1.5)
    assert_raises(ValueError, StratifiedKFold, 2.0)
    # When shuffle is not a bool.  The keyword must be ``n_splits``: with the
    # unknown ``n_folds`` keyword the TypeError would come from the bad
    # argument name and the shuffle validation would never be exercised.
    assert_raises(TypeError, KFold, n_splits=4, shuffle=None)
示例3: test_grid_search_correct_score_results
def test_grid_search_correct_score_results():
    """Compare GridSearchCV's per-split scores with scores recomputed by hand."""
    n_splits = 3
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score, cv=n_splits)
        results = grid_search.fit(X, y).cv_results_

        # The aggregate columns and one column per split must be reported.
        per_split_keys = tuple("split%d_test_score" % cv_i
                               for cv_i in range(n_splits))
        expected_keys = ("mean_test_score", "rank_test_score") + per_split_keys
        result_keys = list(results.keys())
        assert_true(all(in1d(expected_keys, result_keys)))

        cv = StratifiedKFold(n_splits=n_splits)
        n_splits = grid_search.n_splits_
        for candidate_i, C in enumerate(Cs):
            clf.set_params(C=C)
            # Scores the grid search recorded for this candidate, one per split.
            cv_scores = np.array([
                grid_search.cv_results_['split%d_test_score' % s][candidate_i]
                for s in range(n_splits)
            ])
            for i, (train, test) in enumerate(cv.split(X, y)):
                clf.fit(X[train], y[train])
                if score == "f1":
                    predicted = clf.predict(X[test])
                    correct_score = f1_score(y[test], predicted)
                elif score == "roc_auc":
                    dec = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], dec)
                assert_almost_equal(correct_score, cv_scores[i])
示例4: classify
def classify(X,y, clf,**para):
    """Instantiate ``clf(**para)`` and evaluate it with stratified 6-fold CV.

    The classifier is refitted on every training fold and the accuracy on the
    matching held-out fold is printed.

    Args:
        X: Feature array, indexed with integer index arrays.
        y: Label array, indexed with integer index arrays.
        clf: Classifier class (or factory) called as ``clf(**para)``.
        **para: Keyword arguments forwarded to ``clf``.

    Returns:
        ``(classifier, y_test, y_pred)`` where ``classifier`` is the instance
        as fitted on the last fold, and ``y_test`` / ``y_pred`` are the true
        and predicted labels of that last fold.
    """
    n_splits = 6
    skf = StratifiedKFold(n_splits=n_splits)
    classifier = clf(**para)
    # Use the text before the first "(" of the repr as a display name.
    name = str(classifier).split("(")[0]
    print("{0} has been established with {1}".format(name, para))
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        # Report the real number of folds (the old message claimed 10).
        print("{0}-fold Score is: {1}".format(n_splits, score))
    return classifier, y_test, y_pred
示例5: cv_score
def cv_score(X, y, n_epochs = 10, n_folds=10, random_state=1999):
    """Train one ``KerasWrapper`` per fold, one epoch at a time.

    The stratified folds are computed once and reused for every epoch, so
    each per-fold model always sees the same train/validation partition.

    Args:
        X: Feature matrix, indexed as ``X[rows, :]``.
        y: Labels, indexed as ``y[rows]``; also used to stratify the folds.
        n_epochs: Number of outer passes; each pass calls ``fit`` with
            ``nb_epoch=1`` on every fold model.
        n_folds: Number of stratified folds (and of models).
        random_state: Seed for the fold shuffling.

    Returns:
        ``(scores, val_scores, best_epochs)``: ``scores`` and ``val_scores``
        are ``(n_folds, n_epochs)`` arrays holding the score returned by
        ``fit`` and the held-out ``log_loss``; ``best_epochs`` holds the last
        ``num_epoch`` value returned by ``fit`` for each fold.
    """
    kf = StratifiedKFold(n_folds, shuffle=True, random_state=random_state)
    scores = np.zeros((n_folds, n_epochs))
    val_scores = np.zeros((n_folds, n_epochs))
    best_epochs = np.zeros(n_folds)
    clfs = [KerasWrapper(num_features=X.shape[1], label='keras_{}'.format(i))
            for i in range(n_folds)]
    # Stratify on the ``y`` argument (previously an undefined ``y_train``)
    # and materialise the folds so they can be replayed every epoch.
    kfsplit = list(kf.split(X, y))
    # iteratively train epochs
    for i in range(n_epochs):
        print('=============Epoch {}================'.format(i))
        for i_fold, (itrain, itest) in enumerate(kfsplit):
            print('Fold ', i_fold)
            train = X[itrain, :]
            test = X[itest, :]
            ytrain, ytest = y[itrain], y[itest]
            clf, score, num_epoch = clfs[i_fold].fit(train, ytrain, nb_epoch=1,
                                                     validation_split=None, batch_size=64,
                                                     patience=1)
            print('score: {}'.format(score))
            scores[i_fold, i] = score
            best_epochs[i_fold] = num_epoch
            # predict on oof
            pred = clf.predict_proba(test)
            val_score = log_loss(ytest, pred)
            print('Validation score: ', val_score)
            val_scores[i_fold, i] = val_score
    return scores, val_scores, best_epochs
示例6: split_data
def split_data(self, X, y, stratified = True, bad_chess = False):
    """Write one ``data_<node>.csv`` / ``class_<node>.csv`` pair per node.

    With ``bad_chess`` the samples are cut into ``self.nodes`` consecutive
    slices of equal length.  Otherwise the test part of each cross-validation
    split (stratified or shuffled plain K-fold) is written for each node.
    Files are created under ``datas_path``.
    """
    def _write_node(features, targets, node_id):
        # Dump one node's features and targets as two CSV files.
        features_frame = pd.DataFrame(features)
        targets_frame = pd.DataFrame(targets)
        features_file = datas_path.joinpath('data_' + str(node_id) + '.csv')
        targets_file = datas_path.joinpath('class_' + str(node_id) + '.csv')
        features_frame.to_csv(features_file, index = False)
        targets_frame.to_csv(targets_file, index = False)

    if bad_chess:
        chunk = int(X.shape[0] / self.nodes)
        for node in range(self.nodes):
            lo = node * chunk
            hi = lo + chunk
            _write_node(X[lo:hi], y[lo:hi], node)
        return

    if stratified:
        splitter = StratifiedKFold(n_splits = self.nodes)
    else:
        splitter = KFold(n_splits = self.nodes, shuffle = True, random_state = 17)
    # Only the held-out indices of each split are assigned to a node.
    for node, (_, held_out) in enumerate(splitter.split(X, y)):
        _write_node(X[held_out], y[held_out], node)
示例7: cv
def cv(X_train, y_train):
    """Print per-fold and mean F1, precision and recall over 5 shuffled folds.

    A fresh ``TargetEnsembler(features)`` is fitted on each training fold and
    scored on the matching held-out fold.  Nothing is returned.
    """
    splitter = StratifiedKFold(n_splits=5, shuffle=True)
    f1_history, precision_history, recall_history = [], [], []

    def _features_at(rows):
        # Rebuild a frame from the selected rows, keeping the column names.
        return pd.DataFrame(X_train.values[rows], columns=X_train.columns)

    def _target_at(rows):
        # Rebuild the single-column target frame for the selected rows.
        return pd.DataFrame(y_train.values[rows], columns=["PCL_Strict3"])

    for fit_rows, eval_rows in splitter.split(X_train, y_train):
        model = TargetEnsembler(features)
        fit_X = _features_at(fit_rows)
        fit_y = _target_at(fit_rows)
        eval_X = _features_at(eval_rows)
        eval_y = _target_at(eval_rows)
        model.fit(fit_X, fit_y)
        predicted = model.predict(eval_X)

        fold_f1 = f1_score(eval_y, predicted)
        fold_precision = precision_score(eval_y, predicted)
        fold_recall = recall_score(eval_y, predicted)
        print("\tscores f1", fold_f1)
        print("\tscores p", fold_precision)
        print("\tscores r", fold_recall)
        f1_history.append(fold_f1)
        precision_history.append(fold_precision)
        recall_history.append(fold_recall)

    print("mean scores f1", np.mean(f1_history))
    print("mean scores p", np.mean(precision_history))
    print("mean scores r", np.mean(recall_history))
示例8: get_cv_results
def get_cv_results(design, data, cv_splits=10):
    """Score every ``(lambda_int, lambda_x)`` row of ``design`` by repeated CV.

    For each design row the stratified K-fold evaluation is repeated three
    times with fresh shuffles.  Each fold contributes the minimum validation
    loss reached while fitting; folds are averaged per repetition and the
    repetitions are averaged per design row.

    Args:
        design: 2-D array whose rows unpack into ``(lambda_int, lambda_x)``.
        data: Triple ``(test_df, unit_onehot, unit_x)``.  ``test_df`` is read
            through its ``"x"``, ``"y"`` and ``'unit'`` entries; folds are
            stratified on ``test_df['unit']``.
        cv_splits: Number of stratified folds per repetition.

    Returns:
        List with one mean validation loss per row of ``design``.
    """
    test_df, unit_onehot, unit_x = data
    cv_results = []
    for i in range(design.shape[0]):
        lambda_int, lambda_x = design[i, :]
        val_losses = []
        for rep in range(3):  # Almost like bootstrap. Reshuffling
            cv_val_losses = []
            # Honour the cv_splits argument (it used to be ignored in favour
            # of a hard-coded 10, which is still the default).
            skf = StratifiedKFold(n_splits=cv_splits, shuffle=True)
            for train_index, test_index in skf.split(unit_x, test_df['unit']):
                re_model = create_model(unit_onehot.shape[1], lambda_int, lambda_x,
                                        .01, .0001, .92)
                # NOTE(review): test_df["x"][idx] is label-based if test_df is
                # a pandas object; presumably its index is 0..n-1 - confirm.
                X_train = [test_df["x"][train_index], unit_onehot[train_index],
                           unit_x[train_index]]
                X_test = [test_df["x"][test_index], unit_onehot[test_index],
                          unit_x[test_index]]
                y_train, y_test = test_df["y"][train_index], test_df["y"][test_index]
                h = re_model.fit(X_train, y_train,
                                 epochs = 15000, batch_size = 450,
                                 validation_data = (X_test, y_test),
                                 callbacks = callbacks, verbose = 0)
                cv_val_losses.append(np.min(h.history['val_loss']))
            val_losses.append(np.mean(cv_val_losses))
        cv_results.append(np.mean(val_losses))
    return cv_results
示例9: __init__
def __init__(self, fm_decoder, n_iter=5, n_folds=3,
             random_state=None):
    """Store the decoder and initialise the ``StratifiedKFold`` part.

    Args:
        fm_decoder: Object kept as ``self.fm_decoder``.
        n_iter: Accepted but not used by this initialiser.
        n_folds: Number of folds, forwarded as ``n_splits``.
        random_state: Forwarded unchanged to ``StratifiedKFold``.
    """
    self.fm_decoder = fm_decoder
    # sklearn.model_selection.StratifiedKFold names this argument
    # ``n_splits``; the former ``n_folds`` keyword raises TypeError there.
    # NOTE(review): assumes the enclosing class derives from
    # sklearn.model_selection.StratifiedKFold - confirm.  Recent scikit-learn
    # also rejects a non-None random_state while shuffle is left False.
    StratifiedKFold.__init__(
        self,
        n_splits=n_folds,
        random_state=random_state)
示例10: stratified_cross_validate
def stratified_cross_validate(self, k):
    """Yield ``(X_train, y_train, X_test, y_test)`` cross-validation splits.

    The stored training and testing sets are merged and shuffled in place
    with ``np.random.shuffle``.  The generator first yields the two splits of
    a ``StratifiedKFold(n_splits=2)`` and then ``k`` further splits made of
    consecutive, equally sized slices of the shuffled data.

    Args:
        k: Number of consecutive-slice splits yielded after the stratified
            ones.

    Yields:
        Tuples ``(train_input, train_output, test_input, test_output)``.
    """
    attributes = np.append(self.training_attributes, self.testing_attributes, axis=0)
    labels = np.append(self.training_labels, self.testing_labels, axis=0)
    # Glue each label onto its attribute row so both are shuffled together.
    all_data = np.array([np.append(row, labels[i])
                         for i, row in enumerate(attributes)])
    np.random.shuffle(all_data)
    X = all_data[:, :-1]
    y = all_data[:, -1]
    print(X.shape, y.shape)
    # NOTE(review): the stratified part always uses 2 folds regardless of k.
    skf = StratifiedKFold(n_splits=2)
    print(skf.get_n_splits(X, y))
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        yield (X_train, y_train, X_test, y_test)
    # Floor division: a float slice bound / arange stop fails on Python 3.
    split = len(all_data) // k
    for i in range(k):
        start, stop = i * split, (i + 1) * split
        test_data = all_data[start:stop, :]
        train_data = np.delete(all_data, np.arange(start, stop), axis=0)
        train_input, train_output = train_data[:, :-1], train_data[:, -1]
        test_input, test_output = test_data[:, :-1], test_data[:, -1]
        yield (train_input, train_output, test_input, test_output)
示例11: test_datasets
def test_datasets(dataset_names):
    """Run a 2-fold stratified CV of an RBF ``SVC`` on each named dataset.

    Args:
        dataset_names: Passed to ``Data(dataset_names=...)``.

    Returns:
        Dict mapping each dataset name to an array with the accuracy (in
        percent) obtained on each of the two folds.
    """
    from sklearn.svm import SVC
    data = Data(dataset_names=dataset_names)
    n_folds = 2
    accuracies = {}
    for name, dataset in data.datasets.items():
        dataset.print_summary()
        # model_selection API: the labels go to split(), not the constructor,
        # and the fold count is called n_splits.
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True)
        accuracies[name] = np.zeros(n_folds)
        folds = skf.split(X=dataset.data, y=dataset.target)
        for test_fold, (train_idx, test_idx) in enumerate(folds):
            x_train, y_train = dataset.data[train_idx], dataset.target[train_idx]
            x_test, y_test = dataset.data[test_idx], dataset.target[test_idx]
            svc = SVC(C=1.0, kernel='rbf', degree=1, tol=0.01)
            svc.fit(x_train, y_train)
            prediction = svc.predict(x_test)
            accuracies[name][test_fold] = 100*np.mean((prediction == y_test))
            print("Acc = {0:.2f}%".format(accuracies[name][test_fold]))
    return accuracies
示例12: split
def split(dependent, independent, n_folds):
    """Yield stratified, seeded train/test partitions of two parallel arrays.

    Args:
        dependent: Array passed to ``StratifiedKFold.split`` as the samples.
        independent: Array passed as the labels the folds are stratified on.
        n_folds: Number of folds.

    Yields:
        ``(train_x, train_y, test_x, test_y)`` where the ``x`` parts come
        from ``dependent`` and the ``y`` parts from ``independent``.
    """
    # random_state only takes effect together with shuffle=True; recent
    # scikit-learn raises ValueError when it is set without shuffling.
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True,
                          random_state=RANDOM_STATE)
    for train_indices, test_indices in skf.split(dependent, independent):
        train_x = dependent[train_indices]
        train_y = independent[train_indices]
        test_x = dependent[test_indices]
        test_y = independent[test_indices]
        yield train_x, train_y, test_x, test_y
示例13: test_ovr_multinomial_iris
def test_ovr_multinomial_iris():
    """Check OvR and multinomial LogisticRegressionCV on the iris dataset."""
    train, target = iris.data, iris.target
    n_samples, n_features = train.shape

    # Folds are stratified on the original three iris classes (before classes
    # 0 and 1 get merged) and shared by both OvR models below.
    n_cv = 2
    precomputed_folds = list(StratifiedKFold(n_cv).split(train, target))

    # Model fitted on the untouched labels.
    clf = LogisticRegressionCV(cv=precomputed_folds)
    clf.fit(train, target)

    # Model fitted on labels where class 0 has been folded into class 1.
    merged_target = target.copy()
    merged_target[merged_target == 0] = 1
    clf1 = LogisticRegressionCV(cv=precomputed_folds)
    clf1.fit(train, merged_target)

    # What OvR learns for class 2 must not depend on whether classes 0 and 1
    # are kept apart.
    assert_array_almost_equal(clf.scores_[2], clf1.scores_[2])
    assert_array_almost_equal(clf.intercept_[2:], clf1.intercept_)
    assert_array_almost_equal(clf.coef_[2][np.newaxis, :], clf1.coef_)

    # Shapes of the fitted attributes.
    assert_equal(clf.coef_.shape, (3, n_features))
    assert_array_equal(clf.classes_, [0, 1, 2])
    coefs_paths = np.asarray(list(clf.coefs_paths_.values()))
    assert_array_almost_equal(coefs_paths.shape, (3, n_cv, 10, n_features + 1))
    assert_equal(clf.Cs_.shape, (10,))
    scores = np.asarray(list(clf.scores_.values()))
    assert_equal(scores.shape, (3, n_cv, 10))

    # On iris the multinomial model must beat OvR for every solver.
    for solver in ['lbfgs', 'newton-cg', 'sag', 'saga']:
        if solver in ['sag', 'saga']:
            max_iter, tol = 2000, 1e-5
        else:
            max_iter, tol = 15, 1e-2
        clf_multi = LogisticRegressionCV(
            solver=solver, multi_class='multinomial', max_iter=max_iter,
            random_state=42, tol=tol,
            cv=2)
        clf_multi.fit(train, target)
        multi_score = clf_multi.score(train, target)
        ovr_score = clf.score(train, target)
        assert_greater(multi_score, ovr_score)

        # Attribute shapes of the multinomial LogisticRegressionCV.
        assert_equal(clf.coef_.shape, clf_multi.coef_.shape)
        assert_array_equal(clf_multi.classes_, [0, 1, 2])
        coefs_paths = np.asarray(list(clf_multi.coefs_paths_.values()))
        expected_paths_shape = (3, n_cv, 10, n_features + 1)
        assert_array_almost_equal(coefs_paths.shape, expected_paths_shape)
        assert_equal(clf_multi.Cs_.shape, (10,))
        scores = np.asarray(list(clf_multi.scores_.values()))
        assert_equal(scores.shape, (3, n_cv, 10))
示例14: gen_folds
def gen_folds(X, y, n_folds=5, random_state=0):
    """Return shuffled stratified folds of ``(X, y)`` as a list.

    Each list element is a ``(train_indices, test_indices)`` tuple produced
    by ``StratifiedKFold(n_folds, shuffle=True, random_state=random_state)``.
    """
    from sklearn.model_selection import StratifiedKFold
    splitter = StratifiedKFold(n_folds, shuffle=True, random_state=random_state)
    # Materialise the split generator into a list of index pairs.
    return [(fit_rows, held_rows) for fit_rows, held_rows in splitter.split(X, y)]
示例15: categorical_average
def categorical_average(variable, y, pred_0, feature_name):
    """Add a smoothed, noised per-category target average as a new column.

    The column ``feature_name`` is written into ``X_train`` out-of-fold (each
    of 5 stratified folds is encoded from the other folds' rows) and into
    ``X_test`` from the whole of ``X_train``.  ``X_train``, ``X_test``,
    ``lambda_val``, ``g``, ``k``, ``f``, ``r_k`` and ``exp`` are not
    parameters; they are resolved from the enclosing scope.

    Args:
        variable: Name of the categorical column to encode.
        y: Name of the target column that is averaged per category.
        pred_0: Name of the column holding the prior used for blending and as
            the fallback for categories missing from the fitting rows.
        feature_name: Name of the column that receives the encoding.
    """
    def calculate_average(sub1, sub2):
        # Per-category statistics are computed on sub1 and merged onto sub2.
        s = pd.DataFrame(data = {
            variable: sub1.groupby(variable, as_index = False).count()[variable],
            'sumy': sub1.groupby(variable, as_index = False).sum()['y'],
            'avgY': sub1.groupby(variable, as_index = False).mean()['y'],
            'cnt': sub1.groupby(variable, as_index = False).count()['y']
        })
        tmp = sub2.merge(s.reset_index(), how='left', left_on=variable, right_on=variable)
        del tmp['index']
        # Capture the mask once: after 'cnt' is filled it no longer contains
        # nulls, so recomputing the mask would leave 'sumy' untouched.
        unmatched = pd.isnull(tmp['cnt'])
        tmp.loc[unmatched, 'cnt'] = 0.0
        tmp.loc[unmatched, 'sumy'] = 0.0

        def compute_beta(row):
            # Counts of 200 or more are treated as infinite, giving beta 0.
            cnt = row['cnt'] if row['cnt'] < 200 else float('inf')
            return 1.0 / (g + exp((cnt - k) / f))

        if lambda_val is not None:
            tmp['beta'] = lambda_val
        else:
            tmp['beta'] = tmp.apply(compute_beta, axis = 1)
        # Blend the category mean with the prior, weighted by beta.
        tmp['adj_avg'] = tmp.apply(lambda row: (1.0 - row['beta']) * row['avgY'] + row['beta'] * row['pred_0'],
                                   axis = 1)
        tmp.loc[pd.isnull(tmp['avgY']), 'avgY'] = tmp.loc[pd.isnull(tmp['avgY']), 'pred_0']
        tmp.loc[pd.isnull(tmp['adj_avg']), 'adj_avg'] = tmp.loc[pd.isnull(tmp['adj_avg']), 'pred_0']
        # Multiplicative uniform noise scaled by r_k.
        tmp['random'] = np.random.uniform(size = len(tmp))
        tmp['adj_avg'] = tmp.apply(lambda row: row['adj_avg'] *(1 + (row['random'] - 0.5) * r_k),
                                   axis = 1)
        return tmp['adj_avg'].ravel()

    #cv for training set
    k_fold = StratifiedKFold(5)
    X_train[feature_name] = -999
    for (train_index, cv_index) in k_fold.split(np.zeros(len(X_train)),
                                                X_train['interest_level'].ravel()):
        sub = pd.DataFrame(data = {variable: X_train[variable],
                                   'y': X_train[y],
                                   'pred_0': X_train[pred_0]})
        sub1 = sub.iloc[train_index]
        sub2 = sub.iloc[cv_index]
        # NOTE(review): .loc with positional fold indices presumably relies
        # on X_train having a default 0..n-1 index - confirm.
        X_train.loc[cv_index, feature_name] = calculate_average(sub1, sub2)

    #for test set
    sub1 = pd.DataFrame(data = {variable: X_train[variable],
                                'y': X_train[y],
                                'pred_0': X_train[pred_0]})
    sub2 = pd.DataFrame(data = {variable: X_test[variable],
                                'y': X_test[y],
                                'pred_0': X_test[pred_0]})
    X_test.loc[:, feature_name] = calculate_average(sub1, sub2)