本文整理汇总了Python中sklearn.cross_validation.StratifiedShuffleSplit类的典型用法代码示例。如果您正苦于以下问题:Python cross_validation.StratifiedShuffleSplit类的具体用法?Python cross_validation.StratifiedShuffleSplit怎么用?Python cross_validation.StratifiedShuffleSplit使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块sklearn.cross_validation的用法示例。
在下文中一共展示了cross_validation.StratifiedShuffleSplit类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_stratified_shuffle_split_init
# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def test_stratified_shuffle_split_init():
    """Check that StratifiedShuffleSplit rejects invalid size arguments.

    Every argument combination below is expected to make the constructor
    raise ``ValueError``.
    """
    labels = np.asarray([0, 1, 1, 1, 2, 2, 2])
    unbalanced_cases = (
        # Error is raised if there is a class with only one sample
        (3, 0.2),
        # Error is raised if the test set size is smaller than n_classes
        (3, 2),
        # Error is raised if the train set size is smaller than n_classes
        (3, 3, 2),
    )
    for extra_args in unbalanced_cases:
        assert_raises(ValueError, cval.StratifiedShuffleSplit, labels,
                      *extra_args)

    labels = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2])
    # Errors are raised if there are not enough samples
    for extra_args in ((3, 0.5, 0.6), (3, 8, 0.6), (3, 0.6, 8)):
        assert_raises(ValueError, cval.StratifiedShuffleSplit, labels,
                      *extra_args)

    # Train size or test size too small
    for size_kwargs in ({'train_size': 2}, {'test_size': 2}):
        assert_raises(ValueError, cval.StratifiedShuffleSplit, labels,
                      **size_kwargs)
示例2: base_learners
# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def base_learners(data_path='data.csv', seed=123456789):
    """
    Test some classifiers on the raw data.

    Reads the CSV file at ``data_path``, uses the last column as the label and
    every other column as a feature, converts both with
    ``convert_data_to_int`` and then, for a random forest and a linear SVM,
    prints the median of the values returned by ``train_score_clf`` over
    stratified shuffle splits evaluated in parallel.

    data_path -- path of the CSV file to load.
    seed      -- random state handed to every StratifiedShuffleSplit.
    """
    # Params
    nsplits = 8
    pct_train = 0.8

    # Get data. ``iloc``/``values`` replace ``ix``/``as_matrix()``, which no
    # longer exist in current pandas; the selection is purely positional.
    data = pd.read_csv(data_path)
    x = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values
    x, y = convert_data_to_int(x, y)

    # (report template, classifier factory) for each learner to evaluate
    learners = (
        ('Random Forest: {0:.3f} %', RandomForestClassifier),
        ('Linear SVM: {0:.3f} %', LinearSVC),
    )
    for report, make_clf in learners:
        # A fresh splitter with the same seed gives each learner the same
        # sequence of train/test splits.
        sss = StratifiedShuffleSplit(y, n_iter=nsplits, train_size=pct_train,
                                     random_state=seed)
        # The split number doubles as the classifier's random state.
        results = Parallel(n_jobs=-1)(
            delayed(train_score_clf)(
                make_clf(random_state=i), x[tr], x[te], y[tr], y[te])
            for i, (tr, te) in enumerate(sss))
        # Single-argument call form works as both a Python 2 print statement
        # and a Python 3 print function.
        print(report.format(np.median(results)))
示例3: _create_generator
# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def _create_generator(self):
    """
    Generate paired index arrays drawn from the training and testing labels.

    Two stratified shuffle splitters are built with the same number of splits
    and the same seed: one over ``self.tr_y`` (train portion sized by
    ``self.train_size``) and one over ``self.te_y`` (train portion sized by
    ``self.test_size``). For every split the "train" part of each splitter is
    yielded; the testing indexes are shifted by ``len(self.tr_y)``.
    """
    # Create the CV iterators
    train_splitter = StratifiedShuffleSplit(
        self.tr_y, self.nsplits,
        train_size=self.train_size, random_state=self.seed)
    test_splitter = StratifiedShuffleSplit(
        self.te_y, self.nsplits,
        train_size=self.test_size, random_state=self.seed)

    # Walk both splitters in lockstep and keep only the first half of each
    for train_pair, test_pair in izip(train_splitter, test_splitter):
        train_idx = train_pair[0]
        # Offset testing indexes
        test_idx = test_pair[0] + len(self.tr_y)
        yield train_idx, test_idx
示例4: stratified_split
# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def stratified_split(x, y, test_size=0.2):
    """Split ``x`` and ``y`` once into stratified train and validation parts.

    A single StratifiedShuffleSplit iteration with a fixed random state (23)
    picks the row indexes. ``x`` is indexed as ``x[idx, :]`` and ``y`` as
    ``y[idx]``.

    Returns ``(x_train, y_train, x_valid, y_valid)``.
    """
    splitter = StratifiedShuffleSplit(
        y, n_iter=1, test_size=test_size, random_state=23)
    # Only one iteration is configured, so the first split is the only one.
    train_index, valid_index = list(splitter)[0]

    x_train = x[train_index, :]
    y_train = y[train_index]
    x_valid = x[valid_index, :]
    y_valid = y[valid_index]
    return x_train, y_train, x_valid, y_valid
示例5: _train_val_split_indices
# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def _train_val_split_indices(labels):
    """Compute one stratified train/validation split and record it.

    The split uses ``VAL_SIZE`` as the validation size and a fixed random
    state (42). The resulting indexes and the splitter's ``classes`` are
    passed to ``_save_organized_data_info`` twice, first with
    ``multi_crop=False`` and then with ``multi_crop=True``.

    Returns ``(indices_tr, indices_val, classes)``.
    """
    splitter = StratifiedShuffleSplit(
        labels, n_iter=1, test_size=VAL_SIZE, random_state=42)
    indices_tr, indices_val = next(iter(splitter))

    for multi_crop in (False, True):
        _save_organized_data_info(
            splitter.classes, indices_tr, indices_val, multi_crop=multi_crop)

    return indices_tr, indices_val, splitter.classes
示例6: split_indices
# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def split_indices(files, labels, test_size=0.1, random_state=RANDOM_STATE):
    """Return train and test index arrays from one stratified shuffle split.

    Labels are derived from ``files`` via ``get_names`` and
    ``get_labels(..., per_patient=True)``; stratification uses the first
    label column. Each index ``i`` chosen by the splitter is expanded to the
    pair ``2 * i`` and ``2 * i + 1``, with all even members first and all odd
    members after them.

    NOTE: the ``labels`` argument is overwritten before it is read, so the
    value passed by the caller has no effect.
    NOTE(review): the doubling presumably means two consecutive entries per
    patient -- confirm against the caller.
    """
    names = get_names(files)
    labels = get_labels(names, per_patient=True)

    splitter = cross_validation.StratifiedShuffleSplit(
        labels[:, 0],
        test_size=test_size,
        random_state=random_state,
        n_iter=1)
    tr, te = next(iter(splitter))

    expanded = [np.hstack([idx * 2, idx * 2 + 1]) for idx in (tr, te)]
    return expanded[0], expanded[1]
示例7: Get_yPred
# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def Get_yPred(X, y, clf_class, n_folds=10, pred_proba=False):
    '''
    Return "full" out-of-fold predictions from a given classifier, i.e. one
    prediction per sample, not just the predictions of a single split.
    (Adapted from "def run_cv" in
    http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html)

    Could also be done with a stratified shuffle split (+ append output):
    http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

    X          -- feature array, indexed by row with index arrays.
    y          -- label array, indexed with index arrays.
    clf_class  -- an estimator *instance* (despite the name); it is re-fitted
                  in place on every fold.
    n_folds    -- number of stratified, shuffled folds.
    pred_proba -- if true, collect ``predict_proba`` output instead of
                  ``predict`` output.

    Returns an array shaped like ``y`` holding predicted labels, or, when
    ``pred_proba`` is set, a float array with one row per sample and one
    column per ``predict_proba`` output column.
    '''
    # Construct a stratified k-fold object over the labels
    kf = StratifiedKFold(y, n_folds, shuffle=True)

    # Label predictions fit in a copy of y. Probabilities do not: they are
    # 2-D and floating point, so that buffer is created once the number of
    # columns is known from the first fold.
    y_pred = None if pred_proba else y.copy()

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class
        # TODO: sample_weight weighting was not working here in the original
        clf.fit(X_train, y_train)
        if pred_proba:
            import numpy as np  # local: this block did not use numpy before
            fold_proba = np.asarray(clf.predict_proba(X_test))
            if y_pred is None:
                y_pred = np.zeros((len(y), fold_proba.shape[1]), dtype=float)
            y_pred[test_index] = fold_proba
        else:
            y_pred[test_index] = clf.predict(X_test)
    return y_pred
示例8: PlotPerfPercentFeatures
# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def PlotPerfPercentFeatures(X,y,est=LinearSVC()):
    '''
    Plot the performance of a classifier (default: SVM-Anova) while varying
    the percentile of features selected by an F-test.
    http://scikit-learn.org/stable/auto_examples/svm/plot_svm_anova.html#example-svm-plot-svm-anova-py
    See also (similar, but with model selection from among classifiers):
    http://nbviewer.ipython.org/github/bugra/pydata-nyc-2014/blob/master/6.%20Scikit%20Learn%20-%20Model%20Selection.ipynb

    For each percentile the pipeline is scored with cross_val_score over a
    7-iteration stratified shuffle split (30% test); the mean scores are
    plotted with their standard deviations as error bars.
    '''
    percentiles = (1,2,3,5,7,10,13,15,20,25,33,50,65,75,90, 99)
    clf = Pipeline([('anova', SelectPercentile(f_classif)), ('est', est)])

    # Collect the cross-validation score as a function of feature percentile
    score_means = []
    score_stds = []
    for pct in percentiles:
        clf.set_params(anova__percentile=pct)
        splitter = StratifiedShuffleSplit(y, n_iter=7, test_size=0.3)
        fold_scores = cross_val_score(clf, X, y, cv=splitter, n_jobs=-1)
        score_means.append(fold_scores.mean())
        score_stds.append(fold_scores.std())

    print("Outputting Graph:")
    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title('Predictor Performance, varying percent of features used')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction Performance')
    plt.axis('tight')
    plt.show()
示例9: PlotPerfPercentFeatures
# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def PlotPerfPercentFeatures(X,y,est=LinearSVC()):
    '''
    Plot the performance of a classifier (default: SVM-Anova) while varying
    the percentile of features selected by an F-test.
    http://scikit-learn.org/stable/auto_examples/svm/plot_svm_anova.html#example-svm-plot-svm-anova-py

    For each percentile the pipeline is scored with cross_val_score over a
    7-iteration stratified shuffle split (30% test); the mean scores are
    plotted with their standard deviations as error bars.
    '''
    percentiles = (1,2,3,5,7,10,13,15,20,25,33,50,65,75,90, 100)
    clf = Pipeline([('anova', SelectPercentile(f_classif)), ('est', est)])

    # Collect the cross-validation score as a function of feature percentile
    score_means = []
    score_stds = []
    for pct in percentiles:
        clf.set_params(anova__percentile=pct)
        splitter = StratifiedShuffleSplit(y, n_iter=7, test_size=0.3)
        fold_scores = cross_val_score(clf, X, y, cv=splitter, n_jobs=-1)
        score_means.append(fold_scores.mean())
        score_stds.append(fold_scores.std())

    print("Outputting Graph:")
    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title('Predictor Performance, varying percent of features used')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction Performance')
    plt.axis('tight')
    plt.show()
示例10: CV_multi_stats
# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def CV_multi_stats(X, y, model,n=6) :
    '''
    Cross-validate ``model`` and report its accuracy and f1 scores.
    http://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics
    This version is meant for multiclass (or multilabel) compatible metrics.
    May be expanded to use the cross_val_score helper function:
    http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.cross_val_score.html
    http://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics

    Each metric is computed over its own ``n``-iteration stratified shuffle
    split with a 16% test share. The printed spread is two standard
    deviations; the returned values are plain standard deviations.

    Returns ``(accuracy mean, accuracy std, f1 mean, f1 std)``.
    '''
    # Default scoring (accuracy)
    accuracy_cv = StratifiedShuffleSplit(y, n_iter=n, test_size=0.16)
    accuracy_scores = cross_val_score(
        estimator=model, X=X, y=y, cv=accuracy_cv, n_jobs=-1)

    # f1 scoring on a separately drawn set of splits
    f1_cv = StratifiedShuffleSplit(y, n_iter=n, test_size=0.16)
    f1_scores = cross_val_score(
        estimator=model, X=X, y=y, cv=f1_cv, n_jobs=-1, scoring='f1')

    accuracy_mean, accuracy_std = accuracy_scores.mean(), accuracy_scores.std()
    f1_mean, f1_std = f1_scores.mean(), f1_scores.std()

    print("Model Accuracy: %0.3f (+- %0.2f)" % (accuracy_mean, accuracy_std * 2))
    print("Model f1: %0.3f (+- %0.2f)" % (f1_mean, f1_std * 2))
    return (accuracy_mean, accuracy_std, f1_mean, f1_std)
示例11: test_stratified_shuffle_split_iter
# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def test_stratified_shuffle_split_iter():
    """Check the splits produced by iterating StratifiedShuffleSplit.

    For several label arrays, every split must contain the same classes on
    both sides, keep class proportions (to one decimal), have the expected
    sizes and share no index between train and test.
    """
    def class_proportions(labels):
        # Relative frequency of each distinct label, in sorted label order
        codes = np.unique(labels, return_inverse=True)[1]
        return np.bincount(codes) / float(len(labels))

    ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
          np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),
          np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
          np.array([-1] * 800 + [1] * 50)
          ]

    for y in ys:
        sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33,
                                          random_state=0)
        test_size = np.ceil(0.33 * len(y))
        train_size = len(y) - test_size
        for train, test in sss:
            assert_array_equal(np.unique(y[train]), np.unique(y[test]))
            # Checks if folds keep classes proportions
            assert_array_almost_equal(class_proportions(y[train]),
                                      class_proportions(y[test]), 1)
            assert_equal(len(train) + len(test), y.size)
            assert_equal(len(train), train_size)
            assert_equal(len(test), test_size)
            assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
示例12: test_stratified_shuffle_split_overlap_train_test_bug
# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def test_stratified_shuffle_split_overlap_train_test_bug():
    """Check that train and test indexes of a split never overlap."""
    # See https://github.com/scikit-learn/scikit-learn/issues/6121 for
    # the original bug report
    labels = [0, 1, 2, 3] * 3 + [4, 5] * 5

    splitter = cval.StratifiedShuffleSplit(labels, n_iter=1,
                                           test_size=0.5, random_state=0)
    first_split = next(iter(splitter))
    train, test = first_split

    shared = np.intersect1d(train, test)
    assert_array_equal(shared, [])
示例13: grouped_stratified_train_test_split
# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def grouped_stratified_train_test_split(y, x, group_by=None, test_size=0.33, group_labeler=None, return_indices=False, **kwargs):
    """
    Split arrays or matrices into random training and test subsets. Subsets
    will contain roughly equal proportions of each (group) label in `y`.
    Based on StratifiedShuffleSplit from sklearn.cross_validation.

    y -- labels, one per sample.
    x -- samples; like `y` it must support indexing with a list of integer
        positions (e.g. a numpy array) unless `return_indices` is set.
    group_by -- iterable of length `len(y)`; indices with the same
        `group_by[i]` are kept together in either the training or the test
        set. Group keys must be hashable. If None (the default), every sample
        forms its own group.
    test_size -- forwarded to StratifiedShuffleSplit; it applies to the
        number of groups, not the number of samples.
    group_labeler -- callable that assigns one label to the list of labels of
        a group. The default is
        `lambda labels: int(np.round(np.average(labels)))`, which requires
        numeric labels.
    return_indices -- if true, return `(train_indices, test_indices)` as
        lists of sample positions instead of the split arrays.
    **kwargs -- forwarded to StratifiedShuffleSplit (e.g. `random_state`).

    Returns `(x_train, x_test, y_train, y_test)`, or the two index lists when
    `return_indices` is set.
    --------
    Example:
    X = np.array([[1, 2], [3, 4], [1, 4], [3, 1], [1, 4], [3, 1], [1, 4], [3, 1], [1, 4], [3, 1], [1, 4], [3, 1]])
    y = np.array([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1])
    ids = np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])
    x_train, x_test, y_train, y_test = grouped_stratified_train_test_split(y, X, ids)
    """
    if not group_labeler:
        def group_labeler(labels):
            return int(np.round(np.average(labels)))

    if group_by is None:
        # No grouping requested: treat each sample as a group of its own
        # instead of failing inside zip().
        group_by = range(len(y))

    # Collect, per group, the sample positions and the sample labels
    group_indices = dict()
    group_labels = dict()
    for i, (label, group) in enumerate(zip(y, group_by)):
        group_indices.setdefault(group, []).append(i)
        group_labels.setdefault(group, []).append(label)

    # One entry per group; groups[k] is labelled labels[k]
    groups = list(group_labels)
    labels = [group_labeler(group_labels[group]) for group in groups]

    # Stratify over groups, then expand the chosen groups back to samples
    sss = StratifiedShuffleSplit(labels, 1, test_size=test_size, **kwargs)
    group_train_indices, group_test_indices = next(iter(sss))

    test_indices = [idx
                    for k in group_test_indices
                    for idx in group_indices[groups[k]]]
    train_indices = [idx
                     for k in group_train_indices
                     for idx in group_indices[groups[k]]]

    if return_indices:
        return train_indices, test_indices
    return x[train_indices], x[test_indices], y[train_indices], y[test_indices]
示例14: load_titanic
# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def load_titanic(test_size=.25, feature_skip_tuple=(), random_state=1999):
    """Load the titanic CSV and return a stratified train/test split.

    Reads ``datasets/titanic/titanic3.csv``, parses every data line with
    ``process_titanic_line``, splits the rows once with
    StratifiedShuffleSplit on the "survived" column, one-hot encodes the
    string features with a DictVectorizer fitted on the training rows and
    concatenates them after the numeric features.

    test_size          -- forwarded to StratifiedShuffleSplit.
    feature_skip_tuple -- feature names to leave out of the result.
    random_state       -- NOTE: currently unused; the split is always drawn
                          with the fixed random state 12.

    Returns ``(keys, train_data, test_data, train_labels, test_labels)``
    where ``keys`` is the numeric feature names followed by the string
    feature names.
    """
    with open(os.path.join('datasets', 'titanic', 'titanic3.csv')) as f:
        # The header line is skipped; the returned keys come from the
        # fixed lists below.
        f.readline()
        lines = f.readlines()

    string_keys = ['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat',
                   'homedest']
    string_keys = [s for s in string_keys if s not in feature_skip_tuple]
    numeric_keys = ['pclass', 'age', 'sibsp', 'parch', 'fare']
    numeric_keys = [n for n in numeric_keys if n not in feature_skip_tuple]

    n_samples = len(lines)
    numeric_data = np.zeros((n_samples, len(numeric_keys)))
    numeric_labels = np.zeros((n_samples,), dtype=int)
    string_rows = []
    # Parse every line once, keeping all three views in file order
    for n, l in enumerate(lines):
        line_dict = process_titanic_line(l)
        numeric_labels[n] = line_dict["survived"]
        numeric_data[n] = np.asarray([line_dict[k]
                                      for k in numeric_keys])
        string_rows.append({k: line_dict[k] for k in string_keys})

    sss = StratifiedShuffleSplit(numeric_labels, n_iter=1, test_size=test_size,
                                 random_state=12)
    train_idx, test_idx = next(iter(sss))

    # The splitter yields shuffled index arrays. Select the string rows with
    # the same arrays as the numeric rows and labels so that row k of every
    # output refers to the same passenger.
    train_vectorizer_list = [string_rows[i] for i in train_idx]
    test_vectorizer_list = [string_rows[i] for i in test_idx]
    train_numeric = numeric_data[train_idx]
    test_numeric = numeric_data[test_idx]
    train_labels = numeric_labels[train_idx]
    test_labels = numeric_labels[test_idx]

    vec = DictVectorizer()
    # .toarray() due to returning a scipy sparse array
    train_categorical = vec.fit_transform(train_vectorizer_list).toarray()
    test_categorical = vec.transform(test_vectorizer_list).toarray()
    train_data = np.concatenate([train_numeric, train_categorical], axis=1)
    test_data = np.concatenate([test_numeric, test_categorical], axis=1)
    keys = numeric_keys + string_keys
    return keys, train_data, test_data, train_labels, test_labels
示例15: CV_Binary_stats
# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def CV_Binary_stats(X, y, model,n=10) :
    '''
    Cross-validate ``model`` by hand and report mean accuracy, precision and
    recall.
    http://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics
    Note that some of the metrics here ONLY work for BINARY tasks.
    This will be VERY slow compared to the built-in, multicore CV
    implementation (unless used with a classifier that is parallelized
    anyway, such as RF).
    Balances weights when fitting: ``model.fit`` is called with
    ``sample_weight=balance_weights(y_train)``.
    http://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics

    X, y  -- samples and labels, indexed with index arrays.
    model -- estimator whose ``fit`` accepts ``sample_weight``.
    n     -- number of stratified shuffle split iterations (20% test,
             random state 0).

    Returns ``(mean_accuracy, mean_precision, mean_recall)``.
    '''
    from sklearn.metrics import precision_score, accuracy_score, recall_score

    mean_precision = 0.0
    mean_recall = 0.0
    mean_accuracy = 0.0

    sss = StratifiedShuffleSplit(y, n_iter=n, test_size=0.2, random_state=0)
    for train_index, test_index in sss:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # train model and make predictions
        model.fit(X_train, y_train, sample_weight=balance_weights(y_train))
        preds = model.predict(X_test)

        # Score against the held-out labels of this split. (ROC AUC was
        # disabled in the original: it is restricted to the binary case.)
        mean_accuracy += accuracy_score(y_test, preds)
        mean_precision += precision_score(y_test, preds)
        mean_recall += recall_score(y_test, preds)

    mean_accuracy = mean_accuracy / n
    mean_precision = mean_precision / n
    mean_recall = mean_recall / n

    print('mean_accuracy: %s ' %(round(mean_accuracy, 3)))
    print('mean_precision: %s ' %(round(mean_precision, 3)))
    print('mean_recall: %s ' %(round(mean_recall, 3)))
    return (mean_accuracy,mean_precision,mean_recall)