This article collects typical usage examples of the Python class sklearn.model_selection.ShuffleSplit. If you are unsure what the ShuffleSplit class does or how to use it, the curated class examples below should help.
The 15 ShuffleSplit code examples shown below are sorted by popularity by default.
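Before the examples, here is a minimal, self-contained sketch of the basic ShuffleSplit API; the array shape and split sizes are illustrative and not taken from any of the examples below:

import numpy as np
from sklearn.model_selection import ShuffleSplit

X = np.arange(10).reshape(5, 2)  # 5 samples, 2 features
ss = ShuffleSplit(n_splits=3, test_size=0.4, random_state=0)
for train_index, test_index in ss.split(X):
    # Each iteration yields one random train/test partition of the row indices
    print("train:", train_index, "test:", test_index)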
Example 1: FitModel
def FitModel(cnnc, A, Y, T, FN):
    print('Fitting model...')
    ss = ShuffleSplit(n_splits=1)
    trn, tst = next(ss.split(A))
    # Fit the network
    cnnc.fit(A[trn], Y[trn])
    # The predictions as sequences of character indices
    YH = []
    for i in np.array_split(np.arange(A.shape[0]), 32):
        YH.append(cnnc.predict(A[i]))
    YH = np.vstack(YH)
    # Convert from sequences of char indices to strings
    PS = np.array([''.join(YHi) for YHi in YH])
    # Compute the accuracy
    S1 = SAcc(PS[trn], T[trn])
    S2 = SAcc(PS[tst], T[tst])
    print('Train: ' + str(S1))
    print('Test: ' + str(S2))
    for PSi, Ti, FNi in zip(PS, T, FN):
        if np.random.rand() > 0.99:  # Randomly select rows to print
            print(FNi + ': ' + Ti + ' -> ' + PSi)
    print('Fitting with CV data...')
    # Fit remainder
    cnnc.SetMaxIter(4)
    cnnc.fit(A, Y)
    return cnnc
Example 2: main
def main():
    from io import open as uopen
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('fname')
    parser.add_argument('idx', default=2, type=int)
    parser.add_argument('--key', default=u'V;1;SG;IND;PST;PFV')
    parser.add_argument('--shuffle', action='store_true')
    parser.add_argument('--folds', default=10, type=int)
    parser.add_argument('--lang', default='sp')
    parser.add_argument('--key-idx', default=3, type=int)
    args = parser.parse_args()
    fh = uopen(args.fname, encoding='utf-8')
    lines = [x.strip().split(u'\t') for x in fh]
    to_extract = [(x[0], x[args.idx]) for x in lines if x[args.key_idx] == args.key]
    if args.shuffle:
        from random import shuffle
        shuffle(to_extract)
    from distutils.dir_util import mkpath
    from sklearn.model_selection import ShuffleSplit
    rs = ShuffleSplit(n_splits=args.folds, test_size=0.2, random_state=42)
    for i, (train_indices, test_indices) in enumerate(rs.split(to_extract)):
        mkpath('res/ryan_splits/{}-10fold/{}'.format(args.lang, i))
        train_fh, dev_fh, test_fh = (
            uopen('res/ryan_splits/{}-10fold/{}/train.uniq'.format(args.lang, i), mode='w', encoding='utf-8'),
            uopen('res/ryan_splits/{}-10fold/{}/dev.uniq'.format(args.lang, i), mode='w', encoding='utf-8'),
            uopen('res/ryan_splits/{}-10fold/{}/test.uniq'.format(args.lang, i), mode='w', encoding='utf-8'),
        )
        for idx in train_indices:
            train_fh.write(u'{}\t{}\n'.format(to_extract[idx][0], to_extract[idx][1]))
        for j, idx in enumerate(test_indices):
            if j % 2 == 0:
                dev_fh.write(u'{}\t{}\n'.format(to_extract[idx][0], to_extract[idx][1]))
            else:
                test_fh.write(u'{}\t{}\n'.format(to_extract[idx][0], to_extract[idx][1]))
Example 3: train_model
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    # cv = ShuffleSplit(
    #     n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
    cv = ShuffleSplit(
        n_splits=10, test_size=0.3, random_state=0)
    train_errors = []
    test_errors = []
    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []
    clfs = []  # just to later get the median
    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        clf.fit(X_train, y_train)
        clfs.append(clf)
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)
        proba = clf.predict_proba(X_test)
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])
        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)
    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[int(len(scores_to_sort) / 2)]
        # NOTE: 'phase' is not defined in this function; it must be supplied by
        # the enclosing scope (Example 5 passes the literal "01" instead).
        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)
        log_false_positives(clfs[median], X_test, y_test, name)
    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)
    return np.mean(train_errors), np.mean(test_errors)
Example 4: run
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)
    # Initiate model
    model = init_model(X_train.shape[1])
    vanilla_weights = model.get_weights()
    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(
            cross_validation_iterator.split(X_train), start=1):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))
        optimal_weights_path = "/tmp/Optimal_Weights_{}.h5".format(cross_validation_index)
        submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index))
        if os.path.isfile(submission_file_path):
            continue
        if not os.path.isfile(optimal_weights_path):
            # Load the vanilla weights
            model.set_weights(vanilla_weights)
            # Perform the training procedure
            earlystopping_callback = EarlyStopping(monitor="val_actual_mae", patience=EARLYSTOPPING_PATIENCE)
            modelcheckpoint_callback = ModelCheckpoint(optimal_weights_path, monitor="val_loss", save_best_only=True)
            model.fit(
                X_train[train_index],
                Y_train[train_index],
                batch_size=TRAIN_BATCH_SIZE,
                nb_epoch=MAXIMUM_EPOCH_NUM,
                validation_data=(X_train[valid_index], Y_train[valid_index]),
                callbacks=[earlystopping_callback, modelcheckpoint_callback],
                verbose=2,
            )
        # Load the optimal weights
        model.load_weights(optimal_weights_path)
        # Perform the testing procedure
        Y_test = model.predict(X_test, batch_size=TEST_BATCH_SIZE, verbose=2)
        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)
    # Perform ensembling
    ensemble_predictions()
    print("All done!")
Example 5: train_model
def train_model(clf_factory, X, Y, name="NB ngram", plot=False):
    # cv = ShuffleSplit(
    #     n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
    # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html
    # old: http://scikit-learn.org/0.15/modules/generated/sklearn
    # .cross_validation.ShuffleSplit.html#sklearn.cross_validation.ShuffleSplit
    cv = ShuffleSplit(
        n_splits=10, test_size=0.3, random_state=0)
    train_errors = []
    test_errors = []
    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []
    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        clf = clf_factory()
        clf.fit(X_train, y_train)
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)
        proba = clf.predict_proba(X_test)
        # print('proba:', proba)
        # fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])
        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)
    scores_to_sort = pr_scores
    # print('np.argsort(scores_to_sort):', np.argsort(scores_to_sort), len(scores_to_sort) / 2)
    median = np.argsort(scores_to_sort)[int(len(scores_to_sort) / 2)]
    if plot:
        plot_pr(pr_scores[median], name, "01", precisions[median],
                recalls[median], label=name)
    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)
    return np.mean(train_errors), np.mean(test_errors)
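The commented-out call and the documentation links at the top of this example point at the rename of ShuffleSplit from sklearn.cross_validation to sklearn.model_selection in scikit-learn 0.18. For reference only (not part of the original example), the two constructors differ roughly as follows:

# sklearn < 0.18: the sample count and n_iter were passed at construction
#   cv = cross_validation.ShuffleSplit(len(X), n_iter=10, test_size=0.3, random_state=0)
# sklearn >= 0.18: n_splits at construction, the data is passed to split()
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
# for train, test in cv.split(X): ...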
Example 6: fit_models
def fit_models(imps, X, Y, all_props, props=None,
               labels=None, n_splits=5,
               clf_args={'n_estimators': 25,
                         'max_features': 'auto',
                         'random_state': 0}):
    if props is None:
        props = all_props
    n_obs = X['missing'].shape[0]  # Number of observations.
    n_features = X['missing'].shape[1]  # Number of features.
    n_props = len(props)  # Number of properties to predict.
    test_size = 0.2
    if labels is None:
        shuffle_split = ShuffleSplit(n_splits=n_splits,
                                     test_size=test_size, random_state=0)
    else:
        shuffle_split = GroupShuffleSplit(n_splits=n_splits,
                                          test_size=test_size, random_state=0)
    n_test_samples = np.max([len(list(shuffle_split.split(range(n_obs), groups=labels))[i][1])
                             for i in range(n_splits)])
    rs = {imp: np.ma.zeros((n_props, n_splits)) for imp in imps}
    ps = {imp: np.ma.masked_all((n_props, n_splits, n_test_samples)) for imp in imps}
    ys = {imp: np.ma.masked_all((n_props, n_splits, n_test_samples)) for imp in imps}
    feature_importances = {imp: np.ma.zeros((n_props, n_features, n_splits)) for imp in imps}
    for n_prop, prop in enumerate(props):
        j = all_props.index(prop)
        print("Fitting model for %s..." % prop)
        for imp in imps:
            for k, (train, test) in enumerate(shuffle_split.split(range(n_obs),
                                                                  groups=labels)):
                X_train, X_test = X[imp][train], X[imp][test]
                Y_train, Y_test = Y[imp][train, j], Y['missing'][test, j]
                clf_args_ = {key: (value if type(value) is not dict
                                   else value[prop])
                             for key, value in clf_args.items()}
                if clf_args_['max_features'] not in [None, 'auto']:
                    clf_args_['max_features'] = min(X_train.shape[1],
                                                    clf_args_['max_features'])
                rfc = RandomForestClassifier(**clf_args_)
                #if Y_train.shape[1] == 1:
                #    Y_train = Y_train.ravel()
                rfc.fit(X_train, Y_train)
                Y_predict = rfc.predict(X_test)  #.reshape(-1,n_props)
                probs = rfc.predict_proba(X_test)
                if probs.shape[1] < 2 and probs.mean() == 1.0:
                    n_test_samples = len(probs)
                    ps[imp][n_prop, k, :n_test_samples] = 0.0
                else:
                    n_test_samples = len(probs[:, 1])
                    ps[imp][n_prop, k, :n_test_samples] = probs[:, 1]
                ys[imp][n_prop, k, :n_test_samples] = Y_test
                rs[imp][n_prop, k] = np.ma.corrcoef(Y_predict, Y_test)[0, 1]
                feature_importances[imp][n_prop, :, k] = rfc.feature_importances_
    return rs, feature_importances, ys, ps
Example 7: fit_models_mc
def fit_models_mc(imps, X, Y, all_props, props=None,
                  labels=None, n_splits=5,
                  clf_args={'n_estimators': 25,
                            'max_features': 'auto',
                            'random_state': 0}):
    if props is None:
        props = all_props
    n_obs = X['missing'].shape[0]  # Number of observations.
    n_features = X['missing'].shape[1]  # Number of features.
    n_props = len(props)  # Number of properties to predict.
    test_size = 0.2
    if labels is None:
        shuffle_split = ShuffleSplit(n_splits=n_splits,
                                     test_size=test_size, random_state=0)
    else:
        # GroupShuffleSplit is the model_selection name for the old LabelShuffleSplit.
        shuffle_split = GroupShuffleSplit(n_splits=n_splits,
                                          test_size=test_size, random_state=0)
    n_test_samples = np.max([len(list(shuffle_split.split(range(n_obs), groups=labels))[i][1])
                             for i in range(n_splits)])
    rs = {imp: np.ma.zeros((n_props, n_splits)) for imp in imps}
    ps = {imp: np.ma.masked_all((n_props, n_splits, n_test_samples)) for imp in imps}
    ys = {imp: np.ma.masked_all((n_props, n_splits, n_test_samples)) for imp in imps}
    feature_importances = None  # {imp: np.ma.zeros((n_props, n_features, n_splits)) for imp in imps}
    cols = np.array([i for i in range(len(all_props)) if all_props[i] in props])
    for imp in imps:
        for k, (train, test) in enumerate(shuffle_split.split(range(n_obs), groups=labels)):
            #X_train,X_test = X[imp][train][:,cols],X[imp][test][:,cols]
            #Y_train,Y_test = Y[imp][train][:,cols],Y['missing'][test][:,cols]
            X_train, X_test = X[imp][train, :], X[imp][test, :]
            Y_train, Y_test = Y[imp][train, :], Y['missing'][test, :]
            # NOTE: 'prop' is undefined in this function; the dict branch is only
            # reached when a clf_args value is itself a dict keyed by property.
            clf_args_ = {key: (value if type(value) is not dict
                               else value[prop])
                         for key, value in clf_args.items()}
            if clf_args_['max_features'] not in [None, 'auto']:
                clf_args_['max_features'] = min(X_train.shape[1],
                                                clf_args_['max_features'])
            rfc = RandomForestClassifier(**clf_args_)
            onevsrest = OneVsRestClassifier(rfc)
            onevsrest.fit(X_train, Y_train)
            Y_predict = onevsrest.predict(X_test)  #.reshape(-1,n_props)
            probs = onevsrest.predict_proba(X_test)
            if probs.shape[1] < 2 and probs.mean() == 1.0:
                n_test_samples = len(probs)
                ps[imp][:, k, :n_test_samples] = 0.0
            else:
                n_test_samples = len(probs[:, 1])
                ps[imp][:, k, :n_test_samples] = probs.T
            ys[imp][:, k, :n_test_samples] = Y_test.T
            for i in range(n_props):
                rs[imp][i, k] = np.ma.corrcoef(Y_predict[:, i], Y_test[:, i])[0, 1]
            #feature_importances[imp][n_prop,:,k] = onevsrest.feature_importances_
    return rs, feature_importances, ys, ps
Example 8: run
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)
    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(cross_validation_iterator.split(X_train), start=1):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))
        submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index))
        if os.path.isfile(submission_file_path):
            continue
        model = XGBRegressor(
            learning_rate=0.01,
            max_depth=12,
            n_estimators=N_ESTIMATORS,
            silent=False,
            objective="reg:linear",
            gamma=1,
            min_child_weight=1,
            subsample=0.8,
            colsample_bytree=0.5,
            reg_alpha=1,
            seed=cross_validation_index,
            nthread=-1)
        model.fit(X_train[train_index], Y_train[train_index],
                  eval_set=[(X_train[valid_index], Y_train[valid_index])],
                  eval_metric=lambda y_predicted, y_true: ("actual_mae", mean_absolute_error(np.exp(y_true.get_label()), np.exp(y_predicted))),
                  early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=True)
        # Perform the testing procedure
        Y_test = model.predict(X_test)
        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)
    # Perform ensembling
    ensemble_predictions()
    print("All done!")
Example 9: test_safe_split_with_precomputed_kernel
def test_safe_split_with_precomputed_kernel():
    clf = SVC()
    clfp = SVC(kernel="precomputed")
    X, y = iris.data, iris.target
    K = np.dot(X, X.T)
    cv = ShuffleSplit(test_size=0.25, random_state=0)
    tr, te = list(cv.split(X))[0]
    X_tr, y_tr = _safe_split(clf, X, y, tr)
    K_tr, y_tr2 = _safe_split(clfp, K, y, tr)
    assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T))
    X_te, y_te = _safe_split(clf, X, y, te, tr)
    K_te, y_te2 = _safe_split(clfp, K, y, te, tr)
    assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T))
Example 10: TestPerformance
def TestPerformance(self, df=None):
    # If no dataframe is provided, use the currently learned one
    if df is None:
        D = self.D
    else:
        D = self.S.transform(df.copy())
    # Get features from the data frame
    A = self._ExtractFeat(D)
    # Get the target values and their corresponding column names
    y, _ = self._ExtractTarg(D)
    # Begin cross validation
    ss = ShuffleSplit(n_splits=1)
    for trn, tst in ss.split(A):
        s1 = self.R.score(A, y)
        s2 = self.R.score(A[tst], y[tst])
        s3 = self.R.score(A[trn], y[trn])
        print('C-V:\t' + str(s1) + '\nTst:\t' + str(s2) + '\nTrn:\t' + str(s3))
Example 11: run
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)
    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(cross_validation_iterator.split(X_train), start=1):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))
        submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index))
        if os.path.isfile(submission_file_path):
            continue
        model = GBMRegressor(
            learning_rate=0.01,
            num_iterations=NUM_ITERATIONS,
            num_leaves=200,
            min_data_in_leaf=10,
            feature_fraction=0.3,
            feature_fraction_seed=cross_validation_index,
            bagging_fraction=0.8,
            bagging_freq=10,
            bagging_seed=cross_validation_index,
            metric="l1",
            metric_freq=10,
            early_stopping_round=EARLY_STOPPING_ROUND,
            num_threads=-1)
        model.fit(X_train[train_index], Y_train[train_index],
                  test_data=[(X_train[valid_index], Y_train[valid_index])])
        # Perform the testing procedure
        Y_test = model.predict(X_test)
        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)
    # Perform ensembling
    ensemble_predictions()
    print("All done!")
Example 12: plot_shuffle_split
def plot_shuffle_split():
    from sklearn.model_selection import ShuffleSplit
    plt.figure(figsize=(10, 2))
    plt.title("ShuffleSplit with 10 points"
              ", train_size=5, test_size=2, n_splits=4")
    axes = plt.gca()
    axes.set_frame_on(False)
    n_folds = 10
    n_samples = 10
    n_iter = 4
    n_samples_per_fold = 1
    ss = ShuffleSplit(n_splits=4, train_size=5, test_size=2, random_state=43)
    mask = np.zeros((n_iter, n_samples))
    for i, (train, test) in enumerate(ss.split(range(10))):
        mask[i, train] = 1
        mask[i, test] = 2
    for i in range(n_folds):
        # test is grey
        colors = ["grey" if x == 2 else "white" for x in mask[:, i]]
        # not selected has no hatch
        # NOTE: matplotlib >= 3.0 names the first barh keyword 'y' rather than 'bottom'.
        boxes = axes.barh(bottom=range(n_iter), width=[1 - 0.1] * n_iter,
                          left=i * n_samples_per_fold, height=.6, color=colors,
                          hatch="//", edgecolor='k', align='edge')
        for j in np.where(mask[:, i] == 0)[0]:
            boxes[j].set_hatch("")
    axes.invert_yaxis()
    axes.set_xlim(0, n_samples + 1)
    axes.set_ylabel("CV iterations")
    axes.set_xlabel("Data points")
    axes.set_xticks(np.arange(n_samples) + .5)
    axes.set_xticklabels(np.arange(1, n_samples + 1))
    axes.set_yticks(np.arange(n_iter) + .3)
    axes.set_yticklabels(["Split %d" % x for x in range(1, n_iter + 1)])
    # legend hacked for this random state
    plt.legend([boxes[1], boxes[0], boxes[2]],
               ["Training set", "Test set", "Not selected"], loc=(1, .3))
    plt.tight_layout()
Example 13: test_safe_split_with_precomputed_kernel
def test_safe_split_with_precomputed_kernel():
    clf = SVC()
    clfp = SVC(kernel="precomputed")
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    K = np.dot(X, X.T)
    cv = ShuffleSplit(test_size=0.25, random_state=0)
    train, test = list(cv.split(X))[0]
    X_train, y_train = _safe_split(clf, X, y, train)
    K_train, y_train2 = _safe_split(clfp, K, y, train)
    assert_array_almost_equal(K_train, np.dot(X_train, X_train.T))
    assert_array_almost_equal(y_train, y_train2)
    X_test, y_test = _safe_split(clf, X, y, test, train)
    K_test, y_test2 = _safe_split(clfp, K, y, test, train)
    assert_array_almost_equal(K_test, np.dot(X_test, X_train.T))
    assert_array_almost_equal(y_test, y_test2)
Example 14: KFold
optimal.predict(data)

# k-fold validation
# k-fold is a type of cross validation where the data are divided into k bins. For each experiment,
# pick one of the k bins as the test set and use the remaining k-1 bins for training. Run k separate
# experiments and average all k test results.
# This technique tests different parts of the data to prevent overfitting,
# i.e. it prevents grid search from returning a parameter set that is optimized for one specific
# training split rather than for the data overall.
from sklearn.model_selection import KFold
cv_sets = KFold(n_splits=10)
for train_index, test_index in cv_sets.split(X):
    print("%s %s" % (train_index, test_index))

# ShuffleSplit
# ShuffleSplit() is an alternative form of cross-validation (see the 'cv_sets' variable).
# ShuffleSplit() will create 10 ('n_splits') shuffled sets, and for each shuffle, 20% ('test_size')
# of the data will be used as the validation set.
from sklearn.model_selection import ShuffleSplit
cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)
for train_index, test_index in cv_sets.split(X):
    print("%s %s" % (train_index, test_index))

from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score

# Pipelining
# Sequentially apply a list of transforms and a final estimator. Intermediate steps
# of the pipeline must be 'transforms', that is, they must implement fit and
# transform methods. The final estimator only needs to implement fit.
# The purpose of the pipeline is to assemble several steps that can be
# cross-validated together while setting different parameters. A sketch follows below.
from sklearn import svm
from sklearn.datasets import samples_generator
from sklearn.feature_selection import SelectKBest
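The pipelining notes above end before any pipeline is actually built. A minimal sketch of the idea, chaining SelectKBest into an SVC and scoring it with ShuffleSplit via cross_val_score; the synthetic data set and the parameter values are illustrative assumptions, not taken from the original example:

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.datasets import make_classification

# Synthetic data standing in for the real features and labels
X, y = make_classification(n_samples=200, n_features=20, n_informative=5, random_state=0)
# Feature selection and the final estimator are cross-validated together as one unit
pipe = Pipeline([('select', SelectKBest(f_classif, k=5)),
                 ('clf', SVC(kernel='linear'))])
cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)
scores = cross_val_score(pipe, X, y, cv=cv_sets)
print("mean accuracy: %.3f" % scores.mean())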
Example 15: test_shufflesplit_reproducible
def test_shufflesplit_reproducible():
    # Check that iterating twice on the ShuffleSplit gives the same
    # sequence of train-test splits when the random_state is given
    ss = ShuffleSplit(random_state=21)
    assert_array_equal(list(a for a, b in ss.split(X)),
                       list(a for a, b in ss.split(X)))