This article collects typical usage examples of the train_test_split function from Python's sklearn.model_selection module. If you have been wondering what exactly train_test_split does, how to call it, or where to find real usage examples, the curated code samples here should help.
Fifteen code examples of the train_test_split function are shown below, sorted by popularity by default.
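Before the examples, here is a minimal sketch of the basic call, using synthetic data and illustrative parameter values (not taken from any example below):

from sklearn.model_selection import train_test_split
import numpy as np

X = np.arange(20).reshape(10, 2)   # 10 samples, 2 features
y = np.arange(10)                  # 10 labels
# Hold out 25% of the rows; random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)
print(X_train.shape, X_test.shape)  # (7, 2) (3, 2)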
Example 1: learning

def learning(self):
    X = self.X
    y = self.y
    print("Shape of X and y are", X.shape, y.shape)
    # Chained 0.2/0.2 splits leave 64% train, 16% validation, 20% test.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = model_selection.train_test_split(
        X_train, y_train, test_size=0.2, random_state=42)
    val_monitor = skflow.monitors.ValidationMonitor(
        X_val, y_val, early_stopping_rounds=200)
    model = skflow.TensorFlowDNNRegressor(hidden_units=[100, 50, 10], steps=5000)
    model.fit(X_train, y_train, val_monitor)
    yP = model.predict(X_test)
    score_r2 = metrics.r2_score(y_test, yP)
    score_MedAE = metrics.median_absolute_error(y_test, yP)
    print('Accuracy')
    print('--------')
    print('R2: {0:f}, MedAE: {1:f}'.format(score_r2, score_MedAE))
    if self.graph:
        kutil.regress_show4(y_test, yP)
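Note that skflow (the pre-1.0 TensorFlow Learn API) has since been removed from TensorFlow, but the nested-split pattern itself is library-agnostic. A sketch of the same two-stage split with a scikit-learn regressor swapped in for TensorFlowDNNRegressor (synthetic data; parameters are illustrative):

from sklearn import metrics
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

X, y = make_regression(n_samples=500, n_features=10, random_state=42)
# 64% train, 16% validation, 20% test, as in Example 1.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
# X_val/y_val would feed early stopping in a framework that supports it.
model = MLPRegressor(hidden_layer_sizes=(100, 50, 10), max_iter=2000).fit(X_train, y_train)
print('R2:', metrics.r2_score(y_test, model.predict(X_test)))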
Example 2: lda_tuner

def lda_tuner(ingroup_otu, best_models):
    best_score = -1 * np.inf
    dtp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    twp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    topic_series = [3]
    X = ingroup_otu.values
    eval_counter = 0
    for topics in topic_series:
        for dtp in dtp_series:
            for twp in twp_series:
                eval_counter += 1
                X_train, X_test = train_test_split(X, test_size=0.5)
                lda = LatentDirichletAllocation(n_components=topics,  # n_topics in scikit-learn < 0.19
                                                doc_topic_prior=dtp,
                                                topic_word_prior=twp,
                                                learning_method='batch',
                                                random_state=42,
                                                max_iter=20)
                lda.fit(X_train)
                this_score = lda.score(X_test)
                this_perplexity = lda.perplexity(X_test)
                if this_score > best_score:
                    best_score = this_score
                    print("New Max Likelihood: {}".format(best_score))
                    print("#{}: n:{}, dtp:{}, twp:{}, score:{}, perp:{}".format(
                        eval_counter, topics, dtp, twp, this_score, this_perplexity))
                    best_models.append({'n': topics, 'dtp': dtp, 'twp': twp,
                                        'score': this_score, 'perp': this_perplexity})
                # After the grid, also try the symmetric 1/n_topics priors.
                if (dtp == dtp_series[-1]) and (twp == twp_series[-1]):
                    eval_counter += 1
                    X_train, X_test = train_test_split(X, test_size=0.5)
                    lda = LatentDirichletAllocation(n_components=topics,
                                                    doc_topic_prior=1. / topics,
                                                    topic_word_prior=1. / topics,
                                                    learning_method='batch',
                                                    random_state=42,
                                                    max_iter=20)
                    lda.fit(X_train)
                    this_score = lda.score(X_test)
                    this_perplexity = lda.perplexity(X_test)
                    if this_score > best_score:
                        best_score = this_score
                        print("New Max Likelihood: {}".format(best_score))
                        print("#{}: n:{}, dtp:{}, twp:{}, score:{} perp: {}".format(
                            eval_counter, topics, 1. / topics, 1. / topics,
                            this_score, this_perplexity))
                        best_models.append({'n': topics, 'dtp': (1. / topics),
                                            'twp': (1. / topics), 'score': this_score,
                                            'perp': this_perplexity})
    return best_models
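Both calls in Example 2 pass only X: train_test_split accepts a single array (or any number of indexables) and returns one train/test pair per input, which suits unsupervised models such as LDA. A minimal sketch:

import numpy as np
from sklearn.model_selection import train_test_split

X = np.random.RandomState(0).rand(100, 5)
# With one input, the return value is a single train/test pair.
X_train, X_test = train_test_split(X, test_size=0.5, random_state=42)
print(X_train.shape, X_test.shape)  # (50, 5) (50, 5)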
Example 3: test_base_estimator

def test_base_estimator():
    # Check base_estimator and its default values.
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                        random_state=rng)
    ensemble = BaggingClassifier(None, n_jobs=3, random_state=0).fit(X_train, y_train)
    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeClassifier))
    ensemble = BaggingClassifier(DecisionTreeClassifier(), n_jobs=3, random_state=0).fit(X_train, y_train)
    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeClassifier))
    ensemble = BaggingClassifier(Perceptron(), n_jobs=3, random_state=0).fit(X_train, y_train)
    assert_true(isinstance(ensemble.base_estimator_, Perceptron))

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target,
                                                        random_state=rng)
    ensemble = BaggingRegressor(None, n_jobs=3, random_state=0).fit(X_train, y_train)
    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeRegressor))
    ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0).fit(X_train, y_train)
    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeRegressor))
    ensemble = BaggingRegressor(SVR(), n_jobs=3, random_state=0).fit(X_train, y_train)
    assert_true(isinstance(ensemble.base_estimator_, SVR))
Example 4: stacking

def stacking():
    # x, y, classifiers and sss (a cross-validation splitter) are module-level globals.
    X_train, X_test, Y_train, Y_test = train_test_split(x, y,
                                                        random_state=35,
                                                        test_size=0.2)
    x1_test = np.zeros((X_test.shape[0], len(classifiers)))   # first-level predictions on the test set
    x1_train = np.zeros((X_train.shape[0], len(classifiers)))
    print('x1.shape', np.shape(x1_train))
    print('y....', np.shape(Y_train))
    accuracy = np.zeros(len(classifiers))  # accuracy of each model
    for train_index, test_index in sss.split(X_train, Y_train):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf_num = 0
        for clf in classifiers:
            clf_name = clf.__class__.__name__
            clf.fit(x_train, y_train)
            # The second-level training input is each first-level model's
            # prediction on its held-out fold.
            x1_train[test_index, clf_num] = clf.predict(x_test)
            # Predict the test set on every fold (ten in total) and average.
            x1_test[:, clf_num] += clf.predict(X_test)
            # Accuracy of this model, averaged over the ten folds.
            accuracy[clf_num] += (y_test == x1_train[test_index, clf_num]).mean()
            clf_num += 1
    print(np.shape(x1_train))
    print(np.shape(y_train))
    x2_train, x2_test, y2_train, y2_test = train_test_split(x1_train, Y_train, test_size=0.1)
    lr = LogisticRegression()
    lr.fit(x2_train, y2_train)
    print(lr.predict(x1_test))
    print(Y_test)
Example 5: test_thresholded_scorers

def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = get_scorer('log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # Same for an estimator without decision_function.
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # Test with a regressor (no decision_function).
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes.
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    assert_raises(ValueError, get_scorer('roc_auc'), clf, X_test, y_test)
Example 6: train_test_split_mock_pandas

def train_test_split_mock_pandas():
    # X mock dataframe
    X_df = MockDataFrame(X)
    X_train, X_test = train_test_split(X_df)
    assert_true(isinstance(X_train, MockDataFrame))
    assert_true(isinstance(X_test, MockDataFrame))
    X_train_arr, X_test_arr = train_test_split(X_df)
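Example 6 is a scikit-learn unit test built around MockDataFrame, but the behavior it checks also holds for real pandas objects: train_test_split returns splits of the same type it was given. A small sketch:

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.DataFrame({'a': range(10), 'b': range(10)})
train_df, test_df = train_test_split(df, test_size=0.3, random_state=0)
assert isinstance(train_df, pd.DataFrame)  # splits keep the DataFrame type and columns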
Example 7: read

def read(d):
    data = pd.read_table(path + uni + "_" + d + ".txt", delimiter='\t')
    data['label'] = 0
    for i in range(len(data.index)):
        if data.iloc[i, 3] < 1000:
            data.iloc[i, len(data.columns) - 1] = 1
        else:
            data.iloc[i, len(data.columns) - 1] = 0
    X_0 = data.iloc[:, 7:len(data.columns) - 1]
    y_0 = data.iloc[:, len(data.columns) - 1]
    X_0, X_, y_0, y_ = train_test_split(X_0, y_0, test_size=0.0, random_state=3421)
    X_1, X_test, y_1, y_test = train_test_split(X_0, y_0, test_size=0.2, random_state=1257)
    X_2, X_3, y_2, y_3 = train_test_split(X_1, y_1, test_size=1 - label_rate, random_state=11)
    ############ Overall prediction and cross-validation ############
    # scores_all = cross_val_score(RandomForestClassifier(n_estimators=500), X_1, y_1, cv=5, scoring='accuracy')
    # score_all_mean = scores_all.mean()
    # print(d + ' 5-fold cross-validation: ' + str(score_all_mean))
    # rf_all = RandomForestClassifier(n_estimators=500).fit(X_1, y_1)
    # answer_rf_all = rf_all.predict(X_test)
    # accuracy_all = metrics.accuracy_score(y_test, answer_rf_all)
    # print(d + ' overall prediction: ' + str(accuracy_all))
    #################################################################
    return data, X_2, y_2, X_3, y_3, X_test, y_test
Example 8: reduce_dataset

def reduce_dataset(uid):
    ds = load_validation_dataframe(uid)
    X_train, X_valid, X_test, y_train, y_valid, y_test = ds
    X = pd.concat((X_train, X_valid, X_test))
    y = np.concatenate((y_train, y_valid, y_test))

    if len(y) > 5000:
        # Keep every positive example and subsample negatives to 5000 rows total.
        neg_inds = [i for i, v in enumerate(y) if v == 0]
        pos_inds = [i for i, v in enumerate(y) if v == 1]
        n_neg = 5000 - len(pos_inds)
        neg_inds = sample(neg_inds, n_neg)
        inds = sorted(neg_inds + pos_inds)
        X = X.iloc[inds, :]
        y = y[inds]

    # 70/30 split, then 1/3 vs 2/3 of the 30%: roughly 70/10/20 overall.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.66666, random_state=42)

    Xtrain_fname = join(DATAFRAMES_FOLDER, "dfXtrain_%d_small.pickle" % uid)
    Xvalid_fname = join(DATAFRAMES_FOLDER, "dfXvalid_%d_small.pickle" % uid)
    Xtest_fname = join(DATAFRAMES_FOLDER, "dfXtestv_%d_small.pickle" % uid)
    ys_fname = join(DATAFRAMES_FOLDER, "ysv_%d_small.pickle" % uid)

    X_train.to_pickle(Xtrain_fname)
    X_valid.to_pickle(Xvalid_fname)
    X_test.to_pickle(Xtest_fname)
    pickle.dump((y_train, y_valid, y_test), open(ys_fname, 'wb'))
    return X_train, X_valid, X_test, y_train, y_valid, y_test
Example 9: test_classification_with_validation

def test_classification_with_validation(self):
    tol_places = 4
    data_x, data_y = make_classification(n_samples=100, n_features=7,
                                         n_redundant=0, n_informative=7,
                                         n_clusters_per_class=2,
                                         random_state=3227)
    label_y = np.where(data_y == 0, 'A', 'B')
    train_x, test_x, train_y, test_y = train_test_split(data_x, label_y,
                                                        test_size=0.25,
                                                        random_state=3227)
    train_x, validate_x, train_y, validate_y = train_test_split(
        train_x, train_y, test_size=0.5, random_state=3227)
    params = {
        'ref_functions': ('linear_cov',),
        'criterion_type': 'bias_retrain',
        'criterion_minimum_width': 5,
        'max_layer_count': 5,
        'verbose': 0,
        'n_jobs': 'max'
    }
    model = Classifier(**params)
    model.fit(train_x, train_y, validation_data=(validate_x, validate_y))
    pred_y = model.predict_proba(test_x)
    roc_auc = roc_auc_score(model.le.transform(test_y), pred_y)
    self.assertAlmostEqual(roc_auc, 0.76, places=tol_places)
    no1 = model.predict_neuron_output(test_x, 0, 0)
    no2 = model.predict_neuron_output(test_x, 1, 0)
Example 10: __init__

def __init__(self, root, train=True, val=False, color_space='lab', transform=None,
             test_size=0.9, val_size=0.125, location='cpu'):
    """
    color_space: 'rgb' or 'lab'
    """
    self.root_dir = root
    all_files = []
    for r, _, files in walk(self.root_dir):
        for f in files:
            if f.endswith('.jpg'):
                all_files.append(join(r, f))
    train_val_files, test_files = train_test_split(
        all_files, test_size=test_size, random_state=69)
    train_files, val_files = train_test_split(train_val_files,
                                              test_size=val_size, random_state=69)
    if train and val:
        self.filenames = val_files
    elif train:
        self.filenames = train_files
    else:
        self.filenames = test_files
    self.color_space = color_space
    if self.color_space not in ['rgb', 'lab']:
        raise NotImplementedError
    self.transform = transform
    self.location = location
    self.nnenc = NNEncode(location=self.location)
    self.train = train
Example 11: main

def main(_):
    if FLAGS.dataset == 'cifar10':
        (X_train, y_train), (_, _) = cifar10.load_data()
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
    else:
        with open('data/train.p', mode='rb') as f:
            train = pickle.load(f)
        X_train, X_val, y_train, y_val = train_test_split(train['features'], train['labels'], test_size=0.33, random_state=0)

    train_output_file = "{}_{}_{}.p".format(FLAGS.network, FLAGS.dataset, 'bottleneck_features_train')
    validation_output_file = "{}_{}_{}.p".format(FLAGS.network, FLAGS.dataset, 'bottleneck_features_validation')

    # w, h, ch, batch_size, create_model and gen are defined elsewhere in the module.
    print("Resizing to", (w, h, ch))
    print("Saving to ...")
    print(train_output_file)
    print(validation_output_file)

    with tf.Session() as sess:
        K.set_session(sess)
        K.set_learning_phase(1)
        model = create_model()

        print('Bottleneck training')
        train_gen = gen(sess, X_train, y_train, batch_size)
        bottleneck_features_train = model.predict_generator(train_gen(), X_train.shape[0])
        data = {'features': bottleneck_features_train, 'labels': y_train}
        pickle.dump(data, open(train_output_file, 'wb'))

        print('Bottleneck validation')
        val_gen = gen(sess, X_val, y_val, batch_size)
        bottleneck_features_validation = model.predict_generator(val_gen(), X_val.shape[0])
        data = {'features': bottleneck_features_validation, 'labels': y_val}
        pickle.dump(data, open(validation_output_file, 'wb'))
Example 12: split_data

def split_data(data):
    X_train, X_test, Y_train, Y_test = train_test_split(
        data.loc[:, data.columns != label], data[label],
        train_size=train_size + validation_size, test_size=test_size,
        shuffle=False, random_state=0)
    X_train, X_val, Y_train, Y_val = train_test_split(
        X_train, Y_train,
        train_size=train_size / (train_size + validation_size),
        test_size=validation_size / (train_size + validation_size),
        shuffle=False, random_state=0)
    return X_train, X_val, X_test, Y_train, Y_val, Y_test
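Example 12 depends on module-level label, train_size, validation_size, and test_size variables. Below is a self-contained sketch of the same chained, unshuffled three-way split with concrete 0.7/0.1/0.2 fractions (shuffle=False keeps row order, which matters for time-ordered data):

import numpy as np
from sklearn.model_selection import train_test_split

train_size, validation_size, test_size = 0.7, 0.1, 0.2
X, y = np.arange(100).reshape(50, 2), np.arange(50)
# First carve off the test set, then split the remainder into train/validation.
X_tmp, X_test, y_tmp, y_test = train_test_split(
    X, y, train_size=train_size + validation_size, test_size=test_size, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(
    X_tmp, y_tmp, train_size=train_size / (train_size + validation_size), shuffle=False)
print(len(X_train), len(X_val), len(X_test))  # 35 5 10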
Example 13: test_split

def test_split(self):
    ds = self.create_dataset()
    indexes = list(range(len(ds)))
    train, test = train_test_split(indexes)
    train, valid = train_test_split(train)
    splitter = SpecifiedIndexSplitter(train, valid, test)
    train_ds, valid_ds, test_ds = splitter.train_valid_test_split(ds)
    self.assertTrue(np.all(train_ds.X == ds.X[train]))
    self.assertTrue(np.all(valid_ds.X == ds.X[valid]))
    self.assertTrue(np.all(test_ds.X == ds.X[test]))
Example 14: get_train_valid_test_split

def get_train_valid_test_split(n, train=0.7, valid=0.1, test=0.2, shuffle=False):
    other_split = valid + test
    if train + other_split != 1:  # exact float comparison; fragile for fractions like 1/3
        raise ValueError("Train, Valid, Test splits should sum to 1")
    train_set, other_set = train_test_split(range(1, n + 1),
                                            train_size=train, test_size=other_split,
                                            shuffle=shuffle)
    valid_set, test_set = train_test_split(other_set,
                                           train_size=valid / other_split,
                                           test_size=test / other_split,
                                           shuffle=False)
    print("train:{} valid:{} test:{}".format(len(train_set), len(valid_set), len(test_set)))
    return train_set, valid_set, test_set
Example 15: preprocess

def preprocess(data, test_size, sample=None, scale=True):
    data_frame_all = pandas.read_table(data)
    df = data_frame_all
    # For simplicity--and since only 11093 rows, under 3% of the data, have nulls--drop those rows.
    no_null_df = df.dropna(axis=0, how='any')
    # Confirm there are no null values left.
    no_null_df.isnull().values.any()
    # Rename the new data frame df again; 238907 rows remain.
    df = no_null_df
    df_unprocessed = df
    if sample:
        df = df.sample(frac=sample)
        print("sampled")
    df = df[['order_estimated_driving_time_min', 'order_estimated_shopping_time_min']]
    df['total_time_min'] = df.sum(axis=1)
    df['time_in_hours'] = df.total_time_min.divide(60)
    target = df.time_in_hours * 15
    df = df.drop(['time_in_hours', 'total_time_min'], axis=1)
    s1 = target.std()
    s2 = 7.5  # our chosen standard deviation
    m1 = target.mean()
    m2 = 15  # our chosen mean
    target = m2 + (target - m1) * s2 / s1  # rescale the target to mean 15 and std 7.5
    X = df
    y = target
    if scale:
        df_pp = preprocessing.scale(df)
        print("scaled")
        X_train, X_test, y_train, y_test = train_test_split(df_pp, target, test_size=test_size, random_state=42)
    else:
        df_pp = None
        X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=test_size, random_state=42)
    return df_unprocessed, df, df_pp, target, X, X_train, X_test, y, y_train, y_test