本文整理汇总了Python中sklearn.model_selection.StratifiedKFold类的典型用法代码示例。如果您正苦于以下问题:Python model_selection.StratifiedKFold类的具体用法?Python model_selection.StratifiedKFold怎么用?Python model_selection.StratifiedKFold使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块sklearn.model_selection的用法示例。
在下文中一共展示了model_selection.StratifiedKFold类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_stratified_kfold_ratios
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import StratifiedKFold [as 别名]
def test_stratified_kfold_ratios():
    """Verify that StratifiedKFold keeps class proportions in every split.

    The check is run once without shuffling and once with shuffling.
    """
    n_samples = 1000
    X = np.ones(n_samples)
    # Three classes (4, 0 and 1) sized at 10%, 89% and 1% of the samples.
    y = np.array([4] * int(0.10 * n_samples) +
                 [0] * int(0.89 * n_samples) +
                 [1] * int(0.01 * n_samples))
    expected_ratios = ((4, 0.10), (0, 0.89), (1, 0.01))

    for shuffle in (False, True):
        splitter = StratifiedKFold(5, shuffle=shuffle)
        for train, test in splitter.split(X, y):
            # The training part is checked first, then the test part.
            for subset in (train, test):
                for label, ratio in expected_ratios:
                    observed = np.sum(y[subset] == label) / len(subset)
                    assert_almost_equal(observed, ratio, 2)
示例2: k_fold
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import StratifiedKFold [as 别名]
def k_fold(dataset, folds):
    """Build per-fold train, test and validation index tensors.

    The test indices come from a shuffled, seeded ``StratifiedKFold`` over
    ``dataset.data.y``.  Fold ``i`` validates on the test indices of fold
    ``i - 1`` (so fold 0 uses the last fold), and trains on every sample
    that is in neither its test nor its validation indices.

    Returns
    -------
    tuple of lists
        ``(train_indices, test_indices, val_indices)``, one entry per fold.
    """
    splitter = StratifiedKFold(folds, shuffle=True, random_state=12345)
    # Only the labels matter for the split; the features are a placeholder.
    placeholder = torch.zeros(len(dataset))

    test_indices = [
        torch.from_numpy(held_out).to(torch.long)
        for _, held_out in splitter.split(placeholder, dataset.data.y)
    ]
    val_indices = [test_indices[fold - 1] for fold in range(folds)]

    train_indices = []
    for test_idx, val_idx in zip(test_indices, val_indices):
        keep = torch.ones(len(dataset), dtype=torch.bool)
        keep[test_idx] = 0
        keep[val_idx] = 0
        train_indices.append(keep.nonzero().view(-1))

    return train_indices, test_indices, val_indices
示例3: _split_fold10
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import StratifiedKFold [as 别名]
def _split_fold10(self, labels, fold_idx=0, seed=0, shuffle=True):
''' 10 flod '''
assert 0 <= fold_idx and fold_idx < 10, print(
"fold_idx must be from 0 to 9.")
skf = StratifiedKFold(n_splits=10, shuffle=shuffle, random_state=seed)
idx_list = []
for idx in skf.split(np.zeros(len(labels)), labels): # split(x, y)
idx_list.append(idx)
train_idx, valid_idx = idx_list[fold_idx]
print(
"train_set : test_set = %d : %d",
len(train_idx), len(valid_idx))
return train_idx, valid_idx
示例4: kfold
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import StratifiedKFold [as 别名]
def kfold(self, k=5, stratify=False, shuffle=True, seed=33):
    """K-Folds cross validation iterator over ``self.X_train`` / ``self.y_train``.

    Parameters
    ----------
    k : int, default 5
        Number of folds (``n_splits``).
    stratify : bool, default False
        Use ``StratifiedKFold`` instead of ``KFold``.
    shuffle : bool, default True
        Whether the splitter shuffles before splitting.
    seed : int, default 33
        Random state for the shuffling; ignored when ``shuffle`` is false.

    Yields
    -------
    X_train, y_train, X_test, y_test, train_index, test_index
    """
    splitter_cls = StratifiedKFold if stratify else KFold
    # Recent scikit-learn releases raise if random_state is set while
    # shuffle is False, so the seed is forwarded only when it has an effect.
    random_state = seed if shuffle else None
    kf = splitter_cls(n_splits=k, random_state=random_state, shuffle=shuffle)
    for train_index, test_index in kf.split(self.X_train, self.y_train):
        X_train, y_train = idx(self.X_train, train_index), self.y_train[train_index]
        X_test, y_test = idx(self.X_train, test_index), self.y_train[test_index]
        yield X_train, y_train, X_test, y_test, train_index, test_index
示例5: cvsplit
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import StratifiedKFold [as 别名]
def cvsplit(fold, totalfold, mydict):
    """Return the train/test indices of one stratified cross-validation fold.

    Parameters
    ----------
    fold : int
        Index of the fold to return, from 0 to ``totalfold - 1``.
    totalfold : int
        Total number of cross-validation folds.
    mydict : dict
        The dict returned by ``readlabel``.  Its values are used as the
        labels to stratify on; its keys stand in for the samples.

    Returns
    -------
    (train, test) or None
        The index pair yielded by ``StratifiedKFold.split`` for the
        requested fold, or ``None`` when ``fold`` matches no fold.
    """
    skf = StratifiedKFold(n_splits=totalfold)  # default shuffle is false, okay!
    # Dict views are not indexable sequences; materialise them so that
    # scikit-learn can validate and measure them.
    x = list(mydict.keys())
    y = list(mydict.values())
    for count, (train, test) in enumerate(skf.split(x, y)):
        print(len(train), len(test))
        if count == fold:
            return train, test
    return None
示例6: test_2d_y
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import StratifiedKFold [as 别名]
def test_2d_y():
    """Smoke-test every splitter with 1d, 2d-column and multilabel targets."""
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))

    splitters = [
        LeaveOneOut(),
        LeavePOut(p=2),
        KFold(),
        StratifiedKFold(),
        RepeatedKFold(),
        RepeatedStratifiedKFold(),
        ShuffleSplit(),
        StratifiedShuffleSplit(test_size=.5),
        GroupShuffleSplit(),
        LeaveOneGroupOut(),
        LeavePGroupsOut(n_groups=2),
        GroupKFold(),
        TimeSeriesSplit(),
        PredefinedSplit(test_fold=groups),
    ]

    # The only error tolerated for multilabel targets is the
    # "unsupported target type" ValueError.
    expected_msg = "Supported target types are: {}. Got 'multilabel".format(
        ('binary', 'multiclass'))

    for splitter in splitters:
        for target in (y, y_2d):
            list(splitter.split(X, target, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as err:
            assert expected_msg in str(err)
示例7: test_shuffle_stratifiedkfold
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import StratifiedKFold [as 别名]
def test_shuffle_stratifiedkfold():
    """Check that shuffled StratifiedKFold splits depend on random_state.

    Also checks sample coverage of the shuffled splitter.
    """
    X_40 = np.ones(40)
    y = [0] * 20 + [1] * 20
    kf0 = StratifiedKFold(5, shuffle=True, random_state=0)
    kf1 = StratifiedKFold(5, shuffle=True, random_state=1)
    splits_seed0 = kf0.split(X_40, y)
    splits_seed1 = kf1.split(X_40, y)
    for (_, test0), (_, test1) in zip(splits_seed0, splits_seed1):
        assert_not_equal(set(test0), set(test1))
    check_cv_coverage(kf0, X_40, y, groups=None, expected_n_splits=5)

    # Ensure that we shuffle each class's samples with different
    # random_state in StratifiedKFold
    # See https://github.com/scikit-learn/scikit-learn/pull/13124
    X = np.arange(10)
    y = [0] * 5 + [1] * 5
    kf1 = StratifiedKFold(5, shuffle=True, random_state=0)
    kf2 = StratifiedKFold(5, shuffle=True, random_state=1)

    def _sorted_test_sets(cv):
        # Test folds as sorted tuples, so the comparison ignores fold order.
        return sorted([tuple(test) for _, test in cv.split(X, y)])

    test_set1 = _sorted_test_sets(kf1)
    test_set2 = _sorted_test_sets(kf2)
    assert test_set1 != test_set2
示例8: test_cross_val_predict_unbalanced
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import StratifiedKFold [as 别名]
def test_cross_val_predict_unbalanced():
    """Check cross_val_predict when a class is missing from a training fold.

    A single sample is relabelled to a third class, so the fold that tests
    on it is trained without that class.
    """
    X, y = make_classification(n_samples=100, n_features=2, n_redundant=0,
                               n_informative=2, n_clusters_per_class=1,
                               random_state=1)
    # Change the first sample to a new class
    y[0] = 2
    clf = LogisticRegression(random_state=1)
    # No random_state here: it has no effect without shuffling, and recent
    # scikit-learn releases raise a ValueError for that combination.
    cv = StratifiedKFold(n_splits=2)
    # With two folds, the training indices of the second fold are exactly
    # the test indices of the first, so the two test folds are enough.
    (_, test0), (_, test1) = cv.split(X, y)
    yhat_proba = cross_val_predict(clf, X, y, cv=cv, method="predict_proba")
    assert y[test0][0] == 2  # sanity check for further assertions
    assert np.all(yhat_proba[test0][:, 2] == 0)
    assert np.all(yhat_proba[test0][:, 0:1] > 0)
    assert np.all(yhat_proba[test1] > 0)
    assert_array_almost_equal(yhat_proba.sum(axis=1), np.ones(y.shape),
                              decimal=12)
示例9: test_grid_search_groups
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import StratifiedKFold [as 别名]
def test_grid_search_groups():
    """Check how GridSearchCV handles ``groups`` for different cv objects.

    Group-based splitters must raise a ValueError when ``groups`` is missing
    and must accept ``groups`` when it is given; the other splitters must fit
    without ``groups``.
    """
    rng = np.random.RandomState(0)
    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    for group_cv in (LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                     GroupShuffleSplit()):
        search = GridSearchCV(clf, grid, cv=group_cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             search.fit, X, y)
        search.fit(X, y, groups=groups)

    for plain_cv in (StratifiedKFold(), StratifiedShuffleSplit()):
        search = GridSearchCV(clf, grid, cv=plain_cv)
        # Should not raise an error
        search.fit(X, y)
示例10: stratified_kfold_indices
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import StratifiedKFold [as 别名]
def stratified_kfold_indices(samples, **xval_kw):
    """
    Compute stratified cross-validation index splits for ``samples``.

    The splitter is chosen by the optional ``type`` keyword, which is removed
    from ``xval_kw`` before the remaining keywords are forwarded to the
    splitter's constructor:

    * ``'StratifiedGroupKFold'`` (default): uses
      ``sklearn_utils.StratifiedGroupKFold`` and ``samples.group_ids``.
    * ``'StratifiedKFold'``: uses ``sklearn.model_selection.StratifiedKFold``.

    Returns the list of items yielded by the splitter's ``split`` method.

    Raises ``ValueError`` for any other ``type`` and ``AssertionError`` when
    the grouped splitter is requested but ``samples.group_ids`` is None.

    TODO: check xval label frequency
    """
    type_ = xval_kw.pop('type', 'StratifiedGroupKFold')
    # Reject unknown splitter names up front; otherwise neither branch below
    # would run and there would be no result to return.
    if type_ not in ('StratifiedGroupKFold', 'StratifiedKFold'):
        raise ValueError(
            'Unknown cross-validation type {!r}; expected '
            "'StratifiedGroupKFold' or 'StratifiedKFold'".format(type_))

    # Only the labels and groups drive the split; X is an empty placeholder
    # with one row per sample.
    X = np.empty((len(samples), 0))
    y = samples.encoded_1d().values
    groups = samples.group_ids

    if type_ == 'StratifiedGroupKFold':
        assert groups is not None
        # FIXME: The StratifiedGroupKFold could be implemented better.
        splitter = sklearn_utils.StratifiedGroupKFold(**xval_kw)
        return list(splitter.split(X=X, y=y, groups=groups))

    # Imported lazily: only this branch needs scikit-learn directly.
    from sklearn import model_selection
    splitter = model_selection.StratifiedKFold(**xval_kw)
    return list(splitter.split(X=X, y=y))
示例11: setup
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import StratifiedKFold [as 别名]
def setup(pblm):
    """Populate ``pblm`` with samples built from scikit-learn's iris dataset.

    Sets the default task, data and classifier keys, attaches a
    ``MultiTaskSamples`` object with one indicator per iris target name, and
    selects ``'StratifiedKFold'`` as the cross-validation type.
    """
    import sklearn.datasets

    iris = sklearn.datasets.load_iris()

    pblm.primary_task_key = 'iris'
    pblm.default_data_key = 'learn(all)'
    pblm.default_clf_key = 'RF'

    X_df = pd.DataFrame(iris.data, columns=iris.feature_names)
    samples = MultiTaskSamples(X_df.index)

    # One boolean mask per target name, marking the rows of that class.
    class_indicators = {}
    for class_id, class_name in enumerate(iris.target_names):
        class_indicators[class_name] = iris.target == class_id
    samples.apply_indicators({'iris': class_indicators})
    samples.X_dict = {'learn(all)': X_df}

    pblm.samples = samples
    pblm.xval_kw['type'] = 'StratifiedKFold'
示例12: kfold_cv
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import StratifiedKFold [as 别名]
def kfold_cv(clf_type, data_sets: [DataSet], fold_count=5, repetitions=5, n_jobs=-1, parallel_verbose=1, persist=True):
    """
    Run a repeated, shuffled stratified k-fold cross validation over data sets.

    :param clf_type: forwarded to ``_data_sets_generator``; also part of the result file name
    :param data_sets: list of data sets
    :param fold_count: number of folds per repetition
    :param repetitions: how often the k-fold split is repeated per data set
    :param n_jobs: ``n_jobs`` passed to ``Parallel``
    :param parallel_verbose: ``verbose`` passed to ``Parallel``
    :param persist: forwarded to ``_serialize_cv_results``
    :return: whatever ``_serialize_cv_results`` returns for the collected results
    """
    log.info('Starting {!s}-fold cv. Set count: {!s}'.format(fold_count, len(data_sets)))

    runner = Parallel(n_jobs=n_jobs, verbose=parallel_verbose)
    splitter = StratifiedKFold(n_splits=fold_count, shuffle=True)

    # One delayed job per (data set, repetition, fold) combination.
    jobs = (
        delayed(_fit_and_score)(clf, domains, labels, train_index, test_index, rep, data_set_id, fold_count)
        for domains, labels, data_set_id, clf in _data_sets_generator(data_sets, clf_type)
        for rep in range(repetitions)
        for train_index, test_index in splitter.split(domains, labels)
    )
    stats_list = runner(jobs)

    file_name = '{!s}fold_cv_{!s}_{!s}rep_{!s}sets_{!s}.pkl'.format(
        fold_count, clf_type, repetitions, len(data_sets), settings.NOW_STR)
    where = settings.EVAL_FOLDER + '/' + file_name
    return _serialize_cv_results(stats_list, persist, where)
示例13: example_of_cross_validation_using_model_selection
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import StratifiedKFold [as 别名]
def example_of_cross_validation_using_model_selection(raw_data, labels, num_subjects, num_epochs_per_subj):
    """Cross-validate a precomputed-kernel SVM and report the mean score.

    The scores are printed and their mean is logged.

    NOTE: this method does not work for sklearn.svm.SVC with precomputed
    kernel when the kernel matrix is computed in portions; also, this method
    only works for self-correlation, i.e. correlation between the same data
    matrix.
    """
    # no shrinking, set C=1
    base_svm = svm.SVC(kernel='precomputed', shrinking=False, C=1, gamma='auto')
    clf = Classifier(base_svm, epochs_per_subj=num_epochs_per_subj)

    # One fold per subject and no shuffling, intended as
    # leave-one-subject-out cross validation.
    folds = model_selection.StratifiedKFold(n_splits=num_subjects,
                                            shuffle=False)
    # Each sample is paired with itself (self-correlation).
    paired_data = list(zip(raw_data, raw_data))
    scores = model_selection.cross_val_score(clf, paired_data,
                                             y=labels,
                                             cv=folds)
    print(scores)

    mean_accuracy = np.mean(scores)
    logger.info('the overall cross validation accuracy is %.2f' % mean_accuracy)
示例14: _sfn
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import StratifiedKFold [as 别名]
def _sfn(data, mask, myrad, bcast_var):
    """Score a classifier on searchlight data using cross-validation.

    ``bcast_var`` holds, in order: the labels (``bcast_var[0]``), the number
    of cross-validation folds (``bcast_var[1]``) and the classifier
    (``bcast_var[2]``).  ``myrad`` is not used.

    Returns the mean of the cross-validation scores.
    """
    classifier = bcast_var[2]
    # Keep only the masked entries of the first data item and transpose the
    # result before handing it to cross_val_score.
    features = data[0][mask, :].T
    folds = model_selection.StratifiedKFold(n_splits=bcast_var[1],
                                            shuffle=False)
    fold_scores = model_selection.cross_val_score(classifier, features,
                                                  y=bcast_var[0],
                                                  cv=folds,
                                                  n_jobs=1)
    accuracy = np.mean(fold_scores)
    return accuracy
示例15: _get_stratified_crossval_split
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import StratifiedKFold [as 别名]
def _get_stratified_crossval_split(stimuli, fixations, split_count, included_splits, random=True, stratified_attributes=None):
from sklearn.model_selection import StratifiedKFold
labels = []
for attribute_name in stratified_attributes:
attribute_data = np.array(stimuli.attributes[attribute_name])
if attribute_data.ndim == 1:
attribute_data = attribute_data[:, np.newaxis]
labels.append(attribute_data)
labels = np.vstack(labels)
X = np.ones((len(stimuli), 1))
rst = np.random.RandomState(42)
inds = []
k_fold = StratifiedKFold(n_splits=split_count, shuffle=random, random_state=rst)
for i, (train_index, test_index) in enumerate(k_fold.split(X, labels)):
if i in included_splits:
inds.extend(test_index)
stimuli, fixations = create_subset(stimuli, fixations, inds)
return stimuli, fixations