本文整理汇总了Python中sklearn.model_selection.StratifiedKFold.split方法的典型用法代码示例。如果您正苦于以下问题:Python StratifiedKFold.split方法的具体用法?Python StratifiedKFold.split怎么用?Python StratifiedKFold.split使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.model_selection.StratifiedKFold
的用法示例。
在下文中一共展示了StratifiedKFold.split方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _get_fold_generator
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 或者: from sklearn.model_selection.StratifiedKFold import split [as 别名]
def _get_fold_generator(target_values):
    """Return a generator of (train_idx, test_idx) cross-validation folds.

    Uses a stratified splitter when ``params.stratified_cv`` is set and a
    plain KFold otherwise; both shuffle with the configured random seed.
    """
    if params.stratified_cv:
        splitter = StratifiedKFold(n_splits=params.n_cv_splits, shuffle=True,
                                   random_state=cfg.RANDOM_SEED)
        splitter.get_n_splits(target_values)
        # Stratify on the targets themselves (features == labels here).
        return splitter.split(target_values, target_values)
    splitter = KFold(n_splits=params.n_cv_splits, shuffle=True,
                     random_state=cfg.RANDOM_SEED)
    return splitter.split(target_values)
示例2: test_shuffle_stratifiedkfold
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 或者: from sklearn.model_selection.StratifiedKFold import split [as 别名]
def test_shuffle_stratifiedkfold():
    """Shuffled StratifiedKFold runs seeded differently must yield
    different test folds while still covering every sample."""
    X_40 = np.ones(40)
    y = [0] * 20 + [1] * 20
    # Two splitters identical except for their RNG seed.
    cv_a = StratifiedKFold(5, shuffle=True, random_state=0)
    cv_b = StratifiedKFold(5, shuffle=True, random_state=1)
    paired_folds = zip(cv_a.split(X_40, y), cv_b.split(X_40, y))
    for (_, test_a), (_, test_b) in paired_folds:
        assert_not_equal(set(test_a), set(test_b))
    check_cv_coverage(cv_a, X_40, y, labels=None, expected_n_iter=5)
示例3: Kfold
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 或者: from sklearn.model_selection.StratifiedKFold import split [as 别名]
def Kfold(dataset, k, shuffle=False, stratify=False):
    """Envelop function for folding operation.

    Parameters
    ----------
    dataset : tuple
        ``(data, labels)`` pair; labels are only used when stratifying.
    k : int
        Number of folds.
    shuffle : bool
        Whether to shuffle samples before splitting.
    stratify : bool
        Preserve class proportions in each fold.

    Returns
    -------
    generator of (train_indices, test_indices)
    """
    data, labels = dataset[0], dataset[1]
    if stratify:
        # BUG FIX: `shuffle` must be passed by keyword -- modern
        # scikit-learn splitters accept only n_splits positionally and
        # raise TypeError on extra positional arguments.
        kf = StratifiedKFold(n_splits=k, shuffle=shuffle)
        return kf.split(data, labels)
    kf = KFold(n_splits=k, shuffle=shuffle)
    return kf.split(data)
示例4: test_kfold_valueerrors
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 或者: from sklearn.model_selection.StratifiedKFold import split [as 别名]
def test_kfold_valueerrors():
    """Validate error and warning behaviour of KFold / StratifiedKFold."""
    X1 = np.array([[1, 2], [3, 4], [5, 6]])
    X2 = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
    # Check that errors are raised if there is not enough samples
    assert_raises(ValueError, next, KFold(4).split(X1))
    # Check that a warning is raised if the least populated class has too few
    # members.
    y = np.array([3, 3, -1, -1, 3])
    skf_3 = StratifiedKFold(3)
    assert_warns_message(Warning, "The least populated class",
                         next, skf_3.split(X2, y))
    # Check that despite the warning the folds are still computed even
    # though all the classes are not necessarily represented on each
    # side of the split at each split
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        check_cv_coverage(skf_3, X2, y, labels=None, expected_n_splits=3)
    # Check that errors are raised if all n_labels for individual
    # classes are less than n_splits.  (The original repeated this exact
    # assertion twice; the copy-paste duplicate was removed.)
    y = np.array([3, 3, -1, -1, 2])
    assert_raises(ValueError, next, skf_3.split(X2, y))
    # Error when number of folds is <= 1
    assert_raises(ValueError, KFold, 0)
    assert_raises(ValueError, KFold, 1)
    error_string = ("k-fold cross-validation requires at least one"
                    " train/test split")
    assert_raise_message(ValueError, error_string,
                         StratifiedKFold, 0)
    assert_raise_message(ValueError, error_string,
                         StratifiedKFold, 1)
    # When n_splits is not integer:
    assert_raises(ValueError, KFold, 1.5)
    assert_raises(ValueError, KFold, 2.0)
    assert_raises(ValueError, StratifiedKFold, 1.5)
    assert_raises(ValueError, StratifiedKFold, 2.0)
    # When shuffle is not a bool:
    assert_raises(TypeError, KFold, n_splits=4, shuffle=None)
示例5: test_datasets
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 或者: from sklearn.model_selection.StratifiedKFold import split [as 别名]
def test_datasets(dataset_names):
    """Run 2-fold stratified CV with an RBF SVC on each named dataset.

    Returns a dict mapping dataset name -> array of per-fold accuracies
    in percent.
    """
    from sklearn.svm import SVC
    data = Data(dataset_names=dataset_names)
    n_folds = 2
    accuracies = {}
    for name, dataset in data.datasets.items():
        dataset.print_summary()
        # BUG FIX: the original mixed the removed `cross_validation` API
        # into `model_selection` -- StratifiedKFold takes n_splits (not the
        # target array or an `n_folds` kwarg) and has no `test_folds`
        # attribute; the folds come from split() below.  The unused
        # `separate_sets` helper was removed for the same reason.
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True)
        accuracies[name] = np.zeros(n_folds)
        test_fold = 0
        for train_idx, test_idx in skf.split(X=dataset.data, y=dataset.target):
            x_train, y_train = dataset.data[train_idx], dataset.target[train_idx]
            x_test, y_test = dataset.data[test_idx], dataset.target[test_idx]
            svc = SVC(C=1.0, kernel='rbf', degree=1, tol=0.01)
            svc.fit(x_train, y_train)
            prediction = svc.predict(x_test)
            accuracies[name][test_fold] = 100 * np.mean((prediction == y_test))
            print("Acc = {0:.2f}%".format(accuracies[name][test_fold]))
            test_fold += 1
    return accuracies
示例6: stacking_proba
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 或者: from sklearn.model_selection.StratifiedKFold import split [as 别名]
def stacking_proba(clf, X_train, y, X_test, nfolds=5, random_seed=2017,
                   return_score=False, shuffle=True, metric='acc',
                   clf_name='UnKnown'):
    """Produce out-of-fold probability features for model stacking.

    For each of ``nfolds`` stratified folds, ``clf`` is trained on the
    in-fold rows and its predict_proba output is stored for the held-out
    rows; a final model fit on all of ``X_train`` scores ``X_test``.
    For binary problems only the positive-class column is returned.

    Returns ``(train_proba, test_proba)`` or, when ``return_score`` is
    True, ``(train_proba, test_proba, mean_fold_accuracy)``.
    """
    folds = StratifiedKFold(n_splits=nfolds, shuffle=shuffle,
                            random_state=random_seed)
    folds.get_n_splits(X_train, y)
    n_classes = np.unique(y).shape[0]
    # Out-of-fold probabilities for every training row.
    train_stacking_proba = np.zeros((X_train.shape[0], n_classes))
    score = 0
    for train_index, validate_index in folds.split(X_train, y):
        clf.fit(X_train[train_index, :], y[train_index])
        fold_preds = clf.predict_proba(X_train[validate_index, :])
        train_stacking_proba[validate_index, :] = fold_preds
        # Fold accuracy: fraction of held-out rows whose argmax class
        # matches the true label.
        y_validate_fold = y[validate_index]
        fold_preds_a = np.argmax(fold_preds, axis=1)
        fold_score = len(np.nonzero(y_validate_fold - fold_preds_a == 0)[0]) / len(y_validate_fold)
        score += fold_score
    score /= nfolds
    # Refit on the full training set to produce the test-set features.
    clf.fit(X_train, y)
    test_stacking_proba = clf.predict_proba(X_test)
    if n_classes == 2:  # when binary classification only return positive class proba
        train_stacking_proba = train_stacking_proba[:, 1]
        test_stacking_proba = test_stacking_proba[:, 1]
    if return_score:
        return train_stacking_proba, test_stacking_proba, score
    return train_stacking_proba, test_stacking_proba
示例7: stratified_cross_validate
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 或者: from sklearn.model_selection.StratifiedKFold import split [as 别名]
def stratified_cross_validate(self, k):
    """Yield (X_train, y_train, X_test, y_test) splits of the merged data.

    The training and testing sets are concatenated, shuffled, and split
    twice: first via a 2-split StratifiedKFold, then via k contiguous
    equal-sized manual slices.
    NOTE(review): the StratifiedKFold stage hard-codes n_splits=2 and
    ignores ``k`` -- confirm whether that is intentional.
    """
    attributes = np.append(self.training_attributes, self.testing_attributes, axis=0)
    labels = np.append(self.training_labels, self.testing_labels, axis=0)
    # One row per sample: features followed by the label in the last column.
    all_data = np.array([np.append(attributes[i], labels[i])
                         for i in range(len(attributes))])
    np.random.shuffle(all_data)
    X = all_data[:, :-1]
    y = all_data[:, -1]
    print(X.shape, y.shape)
    skf = StratifiedKFold(n_splits=2)
    print(skf.get_n_splits(X, y))
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        yield (X_train, y_train, X_test, y_test)
    for i in range(k):
        # BUG FIX: `len(all_data) / k` is a float in Python 3 and cannot
        # be used as a slice index (TypeError); use integer division.
        split = len(all_data) // k
        test_data = all_data[i * split:(i + 1) * split, :]
        train_data = np.delete(all_data, np.arange(i * split, (i + 1) * split), axis=0)
        train_input, train_output = train_data[:, :-1], train_data[:, -1]
        test_input, test_output = test_data[:, :-1], test_data[:, -1]
        yield (train_input, train_output, test_input, test_output)
示例8: cv
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 或者: from sklearn.model_selection.StratifiedKFold import split [as 别名]
def cv(X_train, y_train):
    """5-fold stratified cross-validation of TargetEnsembler.

    Prints per-fold and mean f1 / precision / recall scores.
    """
    splitter = StratifiedKFold(n_splits=5, shuffle=True)
    f1_list, precision_list, recall_list = [], [], []
    for train_idx, test_idx in splitter.split(X_train, y_train):
        model = TargetEnsembler(features)
        # Rebuild DataFrames so column names survive the .values indexing.
        fold_train_x = pd.DataFrame(X_train.values[train_idx], columns=X_train.columns)
        fold_train_y = pd.DataFrame(y_train.values[train_idx], columns=["PCL_Strict3"])
        fold_test_x = pd.DataFrame(X_train.values[test_idx], columns=X_train.columns)
        fold_test_y = pd.DataFrame(y_train.values[test_idx], columns=["PCL_Strict3"])
        model.fit(fold_train_x, fold_train_y)
        y_pred = model.predict(fold_test_x)
        s_f = f1_score(fold_test_y, y_pred)
        s_p = precision_score(fold_test_y, y_pred)
        s_r = recall_score(fold_test_y, y_pred)
        print("\tscores f1", (s_f))
        print("\tscores p", (s_p))
        print("\tscores r", (s_r))
        f1_list.append(s_f)
        precision_list.append(s_p)
        recall_list.append(s_r)
    print("mean scores f1", np.mean(f1_list))
    print("mean scores p", np.mean(precision_list))
    print("mean scores r", np.mean(recall_list))
示例9: classify
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 或者: from sklearn.model_selection.StratifiedKFold import split [as 别名]
def classify(X, y, clf, **para):
    """Train the classifier class `clf` with parameters `para` under
    6-fold stratified cross-validation.

    Prints each fold's accuracy and returns the fitted classifier plus
    the LAST fold's true and predicted labels.
    """
    skf = StratifiedKFold(n_splits=6)
    classifier = clf(**para)
    # Derive a printable name from the estimator's repr, e.g. "SVC(...)".
    name = str(classifier).split("(")[0]
    print("{0} has been established with {1}".format(name, para))
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        # BUG FIX: the message claimed "10-fold" but the splitter uses 6
        # folds; the unused `kf = KFold(n_splits=10)` local was removed.
        print("6-fold Score is: {0}".format(score))
    return classifier, y_test, y_pred
示例10: cv_score
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 或者: from sklearn.model_selection.StratifiedKFold import split [as 别名]
def cv_score(X, y, n_epochs=10, n_folds=10, random_state=1999):
    """Epoch-by-epoch stratified CV of per-fold KerasWrapper models.

    Each fold keeps its own model; every outer iteration trains each
    model for one more epoch and records the train score, the out-of-fold
    log-loss, and the reported best epoch.

    Returns (scores, val_scores, best_epochs) as numpy arrays.
    """
    kf = StratifiedKFold(n_folds, shuffle=True, random_state=random_state)
    scores = np.zeros((n_folds, n_epochs))
    val_scores = np.zeros((n_folds, n_epochs))
    best_epochs = np.zeros(n_folds)
    clfs = [KerasWrapper(num_features=X.shape[1], label='keras_{}'.format(i))
            for i in range(n_folds)]
    # BUG FIX: the original called kf.split(X, y_train) but `y_train` is
    # undefined in this scope -- the labels parameter is `y` (NameError).
    folds = kf.split(X, y)
    # Materialize the folds so every epoch iterates the same splits.
    kfsplit = [(itrain, itest) for itrain, itest in folds]
    for i in range(n_epochs):
        print('=============Epoch {}================'.format(i))
        i_fold = 0
        for itrain, itest in kfsplit:
            print('Fold ', i_fold)
            train = X[itrain, :]
            test = X[itest, :]
            ytrain, ytest = y[itrain], y[itest]
            clf, score, num_epoch = clfs[i_fold].fit(train, ytrain, nb_epoch=1,
                                                     validation_split=None,
                                                     batch_size=64,
                                                     patience=1)
            print('score: {}'.format(score))
            scores[i_fold, i] = score
            best_epochs[i_fold] = num_epoch
            # predict on out-of-fold rows
            pred = clf.predict_proba(test)
            val_score = log_loss(ytest, pred)
            print('Validation score: ', val_score)
            val_scores[i_fold, i] = val_score
            i_fold += 1
    return scores, val_scores, best_epochs
示例11: split_data
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 或者: from sklearn.model_selection.StratifiedKFold import split [as 别名]
def split_data(self, X, y, stratified=True, bad_chess=False):
    """Partition (X, y) across ``self.nodes`` workers, one CSV pair each.

    With ``bad_chess`` the data is cut into contiguous equal chunks;
    otherwise each node receives the test side of one (Stratified)KFold
    split.  Files are written to ``datas_path`` as data_<node>.csv and
    class_<node>.csv.
    """
    if bad_chess:
        chunk = int(X.shape[0] / self.nodes)
        for node in range(self.nodes):
            lo = node * chunk
            hi = lo + chunk
            pd.DataFrame(X[lo:hi]).to_csv(
                datas_path.joinpath('data_' + str(node) + '.csv'), index=False)
            pd.DataFrame(y[lo:hi]).to_csv(
                datas_path.joinpath('class_' + str(node) + '.csv'), index=False)
    else:
        if stratified:
            splitter = StratifiedKFold(n_splits=self.nodes)
        else:
            splitter = KFold(n_splits=self.nodes, shuffle=True, random_state=17)
        # The held-out indices of each split become one node's share.
        for node, (_, held_out) in enumerate(splitter.split(X, y)):
            pd.DataFrame(X[held_out]).to_csv(
                datas_path.joinpath("data_" + str(node) + ".csv"), index=False)
            pd.DataFrame(y[held_out]).to_csv(
                datas_path.joinpath("class_" + str(node) + ".csv"), index=False)
示例12: test_kfold_valueerrors
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 或者: from sklearn.model_selection.StratifiedKFold import split [as 别名]
def test_kfold_valueerrors():
    """Validate error and warning behaviour of KFold / StratifiedKFold."""
    X1 = np.array([[1, 2], [3, 4], [5, 6]])
    X2 = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
    # Check that errors are raised if there is not enough samples
    assert_raises(ValueError, next, KFold(4).split(X1))
    # Check that a warning is raised if the least populated class has too few
    # members.
    y = np.array([3, 3, -1, -1, 2])
    skf_3 = StratifiedKFold(3)
    assert_warns_message(Warning, "The least populated class",
                         next, skf_3.split(X2, y))
    # Check that despite the warning the folds are still computed even
    # though all the classes are not necessarily represented on each
    # side of the split at each split
    with warnings.catch_warnings():
        # BUG FIX: catch_warnings() alone does not silence warnings; an
        # explicit filter is needed (matching the sibling test above).
        warnings.simplefilter("ignore")
        check_cv_coverage(skf_3, X2, y, labels=None, expected_n_iter=3)
    # Error when number of folds is <= 1
    assert_raises(ValueError, KFold, 0)
    assert_raises(ValueError, KFold, 1)
    assert_raises(ValueError, StratifiedKFold, 0)
    assert_raises(ValueError, StratifiedKFold, 1)
    # When n_folds is not integer:
    assert_raises(ValueError, KFold, 1.5)
    assert_raises(ValueError, KFold, 2.0)
    assert_raises(ValueError, StratifiedKFold, 1.5)
    assert_raises(ValueError, StratifiedKFold, 2.0)
    # When shuffle is not a bool:
    assert_raises(TypeError, KFold, n_folds=4, shuffle=None)
示例13: get_cv_results
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 或者: from sklearn.model_selection.StratifiedKFold import split [as 别名]
def get_cv_results(design, data, cv_splits=10):
    """Score each (lambda_int, lambda_x) row of `design` via repeated
    stratified CV of the keras mixed-effects model.

    Parameters
    ----------
    design : array of shape (n_candidates, 2) of regularization strengths
    data : tuple of (test_df, unit_onehot, unit_x)
    cv_splits : int, number of CV folds per repetition

    Returns
    -------
    list of mean best validation losses, one per design row.
    """
    test_df, unit_onehot, unit_x = data
    cv_results = []
    for i in range(design.shape[0]):
        lambda_int, lambda_x = design[i, :]
        val_losses = []
        for rep in range(3):  # Almost like bootstrap. Reshuffling
            cv_val_losses = []
            # BUG FIX: n_splits was hard-coded to 10, silently ignoring
            # the cv_splits parameter.
            skf = StratifiedKFold(n_splits=cv_splits, shuffle=True)
            # Stratify on the grouping unit so every fold sees all units.
            for train_index, test_index in skf.split(unit_x, test_df['unit']):
                re_model = create_model(unit_onehot.shape[1], lambda_int, lambda_x,
                                        .01, .0001, .92)
                X_train = [test_df["x"][train_index], unit_onehot[train_index],
                           unit_x[train_index]]
                X_test = [test_df["x"][test_index], unit_onehot[test_index],
                          unit_x[test_index]]
                y_train, y_test = test_df["y"][train_index], test_df["y"][test_index]
                h = re_model.fit(X_train, y_train,
                                 epochs=15000, batch_size=450,
                                 validation_data=(X_test, y_test),
                                 callbacks=callbacks, verbose=0)
                cv_val_losses.append(np.min(h.history['val_loss']))
            val_losses.append(np.mean(cv_val_losses))
        cv_results.append(np.mean(val_losses))
    return cv_results
示例14: test_grid_search_correct_score_results
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 或者: from sklearn.model_selection.StratifiedKFold import split [as 别名]
def test_grid_search_correct_score_results():
    """GridSearchCV's stored per-split scores must match fold scores
    recomputed by hand for every candidate C and both scorers."""
    n_splits = 3
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        search = GridSearchCV(clf, {'C': Cs}, scoring=score, cv=n_splits)
        results = search.fit(X, y).cv_results_
        # The results dict must expose mean, rank and per-split score keys.
        result_keys = list(results.keys())
        expected_keys = (("mean_test_score", "rank_test_score") +
                         tuple("split%d_test_score" % cv_i
                               for cv_i in range(n_splits)))
        assert_true(all(in1d(expected_keys, result_keys)))
        refit_cv = StratifiedKFold(n_splits=n_splits)
        n_splits = search.n_splits_
        for candidate_i, C in enumerate(Cs):
            clf.set_params(C=C)
            stored = np.array(
                [search.cv_results_['split%d_test_score' % s][candidate_i]
                 for s in range(n_splits)])
            for fold_i, (train, test) in enumerate(refit_cv.split(X, y)):
                clf.fit(X[train], y[train])
                if score == "f1":
                    expected = f1_score(y[test], clf.predict(X[test]))
                else:  # 'roc_auc' -- only two scorers in the outer loop
                    dec = clf.decision_function(X[test])
                    expected = roc_auc_score(y[test], dec)
                assert_almost_equal(expected, stored[fold_i])
示例15: split
# 需要导入模块: from sklearn.model_selection import StratifiedKFold [as 别名]
# 或者: from sklearn.model_selection.StratifiedKFold import split [as 别名]
def split(dependent, independent, n_folds):
    """Yield (train_x, train_y, test_x, test_y) for each stratified fold.

    Note: `dependent` is used as the feature array and `independent` as
    the class labels, mirroring the original call convention.
    """
    # BUG FIX: random_state has no effect without shuffling, and recent
    # scikit-learn raises ValueError when random_state is set while
    # shuffle=False; enable shuffle so the seed is meaningful.
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True,
                          random_state=RANDOM_STATE)
    for train_indices, test_indices in skf.split(dependent, independent):
        train_x = dependent[train_indices]
        train_y = independent[train_indices]
        test_x = dependent[test_indices]
        test_y = independent[test_indices]
        yield train_x, train_y, test_x, test_y