本文整理汇总了Python中sklearn.model_selection.train_test_split方法的典型用法代码示例。如果您正苦于以下问题:Python model_selection.train_test_split方法的具体用法?Python model_selection.train_test_split怎么用?Python model_selection.train_test_split使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块sklearn.model_selection
的用法示例。
在下文中一共展示了model_selection.train_test_split方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: optimize_hyperparam
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import train_test_split [as 别名]
def optimize_hyperparam(self, X, y, test_size=.2, n_eval=100):
    """Tune XGBoost hyperparameters with hyperopt's TPE on a held-out split.

    Args:
        X: Feature matrix.
        y: Target values.
        test_size: Fraction of the data held out for validation.
        n_eval: Number of hyperopt evaluations to run.

    Returns:
        Tuple of (best hyperparameter dict, hyperopt ``Trials`` object).
    """
    X_trn, X_val, y_trn, y_val = train_test_split(
        X, y, test_size=test_size, shuffle=self.shuffle)

    def objective(hyperparams):
        # Fit with early stopping on the validation fold; the reported loss
        # is the metric at the best iteration, multiplied by self.loss_sign
        # so that hyperopt can always minimise it.
        model = XGBModel(n_estimators=self.n_est, **self.params, **hyperparams)
        model.fit(X=X_trn, y=y_trn,
                  eval_set=[(X_val, y_val)],
                  eval_metric=self.metric,
                  early_stopping_rounds=self.n_stop,
                  verbose=False)
        score = (model.evals_result()['validation_0'][self.metric][model.best_iteration]
                 * self.loss_sign)
        return {'loss': score, 'status': STATUS_OK, 'model': model}

    trials = Trials()
    best = hyperopt.fmin(fn=objective, space=self.space, trials=trials,
                         algo=tpe.suggest, max_evals=n_eval, verbose=1,
                         rstate=self.random_state)
    return space_eval(self.space, best), trials
示例2: test_different_results
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import train_test_split [as 别名]
def test_different_results(self):
    """Two DP fits should differ from each other and from plain sklearn."""
    from sklearn import datasets
    from sklearn import linear_model
    from sklearn.model_selection import train_test_split

    dataset = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.2)

    # Fit the differentially-private model twice; the injected noise should
    # make the two prediction vectors disagree somewhere.
    dp_predictions = []
    for _ in range(2):
        model = LogisticRegression(data_norm=12)
        model.fit(X_train, y_train)
        dp_predictions.append(model.predict(X_test))

    baseline = linear_model.LogisticRegression(solver="lbfgs", multi_class="ovr")
    baseline.fit(X_train, y_train)
    predict3 = baseline.predict(X_test)

    self.assertFalse(np.all(dp_predictions[0] == dp_predictions[1]))
    self.assertFalse(np.all(predict3 == dp_predictions[0]) and
                     np.all(predict3 == dp_predictions[1]))
示例3: test_same_results
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import train_test_split [as 别名]
def test_same_results(self):
    """With epsilon=inf the DP classifier should match plain sklearn."""
    from sklearn import datasets
    from sklearn.model_selection import train_test_split
    from sklearn import linear_model

    dataset = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.2)

    # epsilon=inf disables the privacy noise entirely.
    dp_clf = LogisticRegression(data_norm=12, epsilon=float("inf"))
    dp_clf.fit(X_train, y_train)
    dp_predictions = dp_clf.predict(X_test)

    sk_clf = linear_model.LogisticRegression(solver="lbfgs", multi_class="ovr")
    sk_clf.fit(X_train, y_train)
    sk_predictions = sk_clf.predict(X_test)

    self.assertTrue(np.all(dp_predictions == sk_predictions))
示例4: test_different_results
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import train_test_split [as 别名]
def test_different_results(self):
    """Two DP regressions should differ from each other and from sklearn."""
    from sklearn import datasets
    from sklearn import linear_model
    from sklearn.model_selection import train_test_split

    dataset = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.2)

    bounds_X = ([4.3, 2.0, 1.1, 0.1], [7.9, 4.4, 6.9, 2.5])
    # Fit the DP regression twice; the noise should make predictions differ.
    dp_predictions = []
    for _ in range(2):
        model = LinearRegression(data_norm=12, bounds_X=bounds_X, bounds_y=(0, 2))
        model.fit(X_train, y_train)
        dp_predictions.append(model.predict(X_test))

    baseline = linear_model.LinearRegression()
    baseline.fit(X_train, y_train)
    predict3 = baseline.predict(X_test)

    self.assertFalse(np.all(dp_predictions[0] == dp_predictions[1]))
    self.assertFalse(np.all(predict3 == dp_predictions[0]) and
                     np.all(predict3 == dp_predictions[1]))
示例5: test_same_results
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import train_test_split [as 别名]
def test_same_results(self):
    """With epsilon=inf the DP linear regression should match sklearn's.

    Compares predictions of the differentially-private LinearRegression
    (noise disabled via epsilon=inf) against the plain scikit-learn model.
    """
    from sklearn import datasets
    from sklearn.model_selection import train_test_split
    from sklearn import linear_model

    dataset = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.2)

    clf = LinearRegression(data_norm=12, epsilon=float("inf"),
                           bounds_X=([4.3, 2.0, 1.0, 0.1], [7.9, 4.4, 6.9, 2.5]),
                           bounds_y=(0, 2))
    clf.fit(X_train, y_train)
    predict1 = clf.predict(X_test)

    # `normalize=False` was the default and the keyword was deprecated in
    # scikit-learn 1.0 and removed in 1.2, so it is omitted here; the fitted
    # model is unchanged.
    clf = linear_model.LinearRegression()
    clf.fit(X_train, y_train)
    predict2 = clf.predict(X_test)

    self.assertTrue(np.allclose(predict1, predict2))
示例6: test_with_iris
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import train_test_split [as 别名]
def test_with_iris(self):
    """DP GaussianNB: accuracy on iris and class counts after partial_fit."""
    global_seed(12345)
    from sklearn import datasets

    dataset = datasets.load_iris()
    x_train, x_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=.2)

    feature_bounds = ([4.3, 2.0, 1.0, 0.1], [7.9, 4.4, 6.9, 2.5])
    clf = GaussianNB(epsilon=5.0, bounds=feature_bounds)
    clf.fit(x_train, y_train)

    accuracy = clf.score(x_test, y_test)
    counts = clf.class_count_.copy()
    self.assertGreater(accuracy, 0.5)

    # A second pass over the same data should double every class count.
    clf.partial_fit(x_train, y_train)
    new_counts = clf.class_count_
    self.assertEqual(np.sum(new_counts), np.sum(counts) * 2)
示例7: create_cancer_data
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import train_test_split [as 别名]
def create_cancer_data():
    """Load the breast-cancer dataset and return an 80/20 train/test split.

    Returns:
        Tuple of (x_train, x_test, y_train, y_validation,
        feature_names, target_names).
    """
    # "?" marks missing values; interpolate fills them before the int cast.
    frame = (
        retrieve_dataset("breast-cancer.train.csv", na_values="?")
        .interpolate()
        .astype("int64")
    )
    labels = frame.iloc[:, 0]     # first column holds the label
    features = frame.iloc[:, 1:]  # remaining columns are the features
    feature_names = features.columns.values
    target_names = ["no_cancer", "cancer"]

    # Fixed seed keeps the split reproducible across runs.
    x_train, x_test, y_train, y_validation = train_test_split(
        features, labels, test_size=0.2, random_state=0
    )
    return x_train, x_test, y_train, y_validation, feature_names, target_names
示例8: create_simple_titanic_data
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import train_test_split [as 别名]
def create_simple_titanic_data():
    """Download the titanic3 dataset and return an 80/20 train/test split.

    Returns:
        Tuple of (X_train, X_test, y_train, y_test,
        numeric_features, categorical_features).
    """
    titanic_url = (
        "https://raw.githubusercontent.com/amueller/"
        "scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv"
    )
    data = read_csv(titanic_url)
    # Forward-fill then backward-fill so no NaN survives at either end.
    data = data.fillna(method="ffill")
    data = data.fillna(method="bfill")

    numeric_features = ["age", "fare"]
    categorical_features = ["embarked", "sex", "pclass"]
    y = data["survived"].values
    X = data[categorical_features + numeric_features]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    return X_train, X_test, y_train, y_test, numeric_features, categorical_features
示例9: mmb_evaluate_model
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import train_test_split [as 别名]
def mmb_evaluate_model(self):
    """
    Returns scores from cross validation evaluation on the malicious / benign classifier
    """
    predictive_features = self.features['predictive_features']
    self.clf_X = self.modeldata[predictive_features].values
    self.clf_y = np.array(self.modeldata['label'])

    X_train, X_test, y_train, y_test = train_test_split(
        self.clf_X, self.clf_y, test_size=0.2, random_state=0)
    # LabelBinarizer yields an (n, 1) column vector; flatten to a 1-D array.
    binarizer = LabelBinarizer()
    y_train = np.array([row[0] for row in binarizer.fit_transform(y_train)])

    eval_cls = RandomForestClassifier(n_estimators=100, max_features=.2)
    eval_cls.fit(X_train, y_train)

    # 5-fold cross-validated scores, computed on the training portion only.
    scores = {
        key: cross_val_score(eval_cls, X_train, y_train, cv=5, scoring=metric)
        for key, metric in (('recall', 'recall'),
                            ('precision', 'precision'),
                            ('accuracy', 'accuracy'),
                            ('f1', 'f1_macro'))
    }
    return {'accuracy': scores['accuracy'], 'f1': scores['f1'],
            'precision': scores['precision'], 'recall': scores['recall']}
示例10: bootstrapped_split
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import train_test_split [as 别名]
def bootstrapped_split(car_ids, seed=args.seed):
    """Split car ids into train/validation, bootstrap-resampling the train part.

    # Arguments
        car_ids: Iterable of Carvana car ids.
        seed: Random seed used for both the split and the bootstrap sample.

    # Returns
        A tuple (train_filenames, valid_filenames).
    """
    # Split the Series (the original built `all_ids` but then split the raw
    # input, which broke the positional indexing / `.values` access below).
    all_ids = pd.Series(car_ids)
    train_ids, valid_ids = train_test_split(all_ids, test_size=args.test_size_float,
                                            random_state=seed)

    np.random.seed(seed)
    # Bootstrap: draw len(train_ids) indices with replacement.
    # np.random.random_integers has been removed from NumPy and, as previously
    # used, returned a single scalar from an INCLUSIVE range — one id instead
    # of a resample, with a possible out-of-bounds index.
    n_train = len(train_ids)
    bootstrapped_idx = np.random.randint(0, n_train, size=n_train)
    bootstrapped_train_ids = train_ids.iloc[bootstrapped_idx]

    return generate_filenames(bootstrapped_train_ids.values), generate_filenames(valid_ids)
示例11: main
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import train_test_split [as 别名]
def main():
    """Fit MLELM and ELM on the OpenML diabetes dataset and print accuracies."""
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import train_test_split

    db_name = 'diabetes'
    data_set = fetch_mldata(db_name)
    data_set.data = preprocessing.normalize(data_set.data)

    # Map the string targets onto {-1, +1}.
    signed_labels = [1 if value == "tested_positive" else -1
                     for value in data_set.target]
    data_set.target = signed_labels

    X_train, X_test, y_train, y_test = train_test_split(
        data_set.data, data_set.target, test_size=0.4)

    mlelm = MLELM(hidden_units=(10, 30, 200)).fit(X_train, y_train)
    elm = ELM(200).fit(X_train, y_train)

    print("MLELM Accuracy %0.3f " % mlelm.score(X_test, y_test))
    print("ELM Accuracy %0.3f " % elm.score(X_test, y_test))
示例12: train_test_val_split
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import train_test_split [as 别名]
def train_test_val_split(X, Y, split=(0.2, 0.1), shuffle=True):
    """Split dataset into train/val/test subsets by 70:20:10(default).

    Args:
        X: List of data.
        Y: List of labels corresponding to data.
        split: Tuple of (test, val) fractions of the WHOLE dataset.
        shuffle: Bool of shuffle or not (applies to the first split only).

    Returns:
        Three dataset in `train:test:val` order.
    """
    from sklearn.model_selection import train_test_split
    assert len(X) == len(Y), 'The length of X and Y must be consistent.'

    holdout = split[0] + split[1]
    X_train, X_test_val, Y_train, Y_test_val = train_test_split(
        X, Y, test_size=holdout, shuffle=shuffle)
    # Rescale the val share relative to the holdout pool so it ends up as
    # split[1] of the ORIGINAL dataset. (The previous code passed split[1]
    # directly, which produced only split[1]*holdout of the total — e.g. 3%
    # instead of the documented 10%.)
    X_test, X_val, Y_test, Y_val = train_test_split(
        X_test_val, Y_test_val, test_size=split[1] / holdout, shuffle=False)

    return (X_train, Y_train), (X_test, Y_test), (X_val, Y_val)
示例13: pipeline
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import train_test_split [as 别名]
def pipeline(args):
    '''
    Runs the model loop.

    Loads the CSV named by ``args.filename``, fills missing text with "None",
    optionally de-duplicates and restricts sources, splits into train/test,
    and runs a ModelLoop over the configured models.
    '''
    df = pd.read_csv(args.filename)
    df.loc[:, args.x_label] = df[args.x_label].fillna("None")
    if args.dedupe:
        df = df.drop_duplicates(subset='content')
    if args.reduce:
        df = restrict_sources(df)

    X = df[args.x_label]
    y = df[args.y_label]
    # NOTE(review): the original assigned `parser = spacy.load('en')` here but
    # never used it; the load is expensive dead code (and the 'en' shortcut no
    # longer exists in spaCy 3), so it has been removed.
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    loop = ModelLoop(X_train, X_test, y_train, y_test, args.models,
                     args.iterations, args.output_dir,
                     thresholds=args.thresholds, ks=args.ks,
                     setting=args.features[0])
    loop.run()
示例14: plot_learning_curve
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import train_test_split [as 别名]
def plot_learning_curve(x_train, y_train, subsets=20, mmr=None, cv=5, tool='matplotlib'):
    """Plot (1 - CV score) and (1 - ROC AUC) curves over growing subset sizes.

    Args:
        x_train: Feature data; an 80/20 test split is carved out internally.
        y_train: Labels corresponding to x_train.
        subsets: Number of log-spaced training-subset sizes to evaluate.
        mmr: Passed through to the plotting backend.
        cv: Number of cross-validation folds handed to `evaluate`.
        tool: 'matplotlib' selects the matplotlib backend; anything else
            selects the plotly backend.
    """
    x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2)
    # Log-spaced subset sizes from e^3 up to the full training-set size.
    subset_sizes = np.exp(np.linspace(3, np.log(len(y_train)), subsets)).astype(int)

    cv_errors, auc_errors = [], []
    for subset_size in subset_sizes:
        logger.info('Performing cross validation on subset_size %d', subset_size)
        _, _, cv_score, roc_auc, _ = evaluate([x_train[:subset_size], y_train[:subset_size]],
                                              [x_test, y_test], cv=cv)
        cv_errors.append(1 - cv_score)
        auc_errors.append(1 - roc_auc)

    results_list = [cv_errors, auc_errors]
    if tool == 'matplotlib':
        _plot_matplotlib(subset_sizes, results_list, mmr)
    else:
        _plot_plotly(subset_sizes, results_list, mmr)
示例15: return_train_dataset
# 需要导入模块: from sklearn import model_selection [as 别名]
# 或者: from sklearn.model_selection import train_test_split [as 别名]
def return_train_dataset(self):
    """Returns train data set

    Returns:
        X (numpy.ndarray): Features
        y (numpy.ndarray): Labels
    """
    X, y = self.return_main_dataset()

    cfg = self.test_dataset
    if cfg['method'] == 'split_from_main':
        # Carve the held-out test share off the main data (stratified on y)
        # and keep only the training portion.
        split = train_test_split(X, y,
                                 test_size=cfg['split_ratio'],
                                 random_state=cfg['split_seed'],
                                 stratify=y)
        X, y = split[0], split[2]

    return X, y