本文整理匯總了Python中sklearn.model_selection.TimeSeriesSplit方法的典型用法代碼示例。如果您正苦於以下問題：Python model_selection.TimeSeriesSplit方法的具體用法？Python model_selection.TimeSeriesSplit怎麼用？Python model_selection.TimeSeriesSplit使用的例子？那麼，這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類sklearn.model_selection的用法示例。
在下文中一共展示了model_selection.TimeSeriesSplit方法的15個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: backtest
# 需要導入模塊: from sklearn import model_selection [as 別名]
# 或者: from sklearn.model_selection import TimeSeriesSplit [as 別名]
def backtest(data_set_path, n_test_split):
    """Backtest an L1-penalised logistic regression with time-ordered folds.

    The data is loaded through ``prepare_data`` and evaluated with a
    ``TimeSeriesSplit`` of ``n_test_split`` folds.  The grid contains the
    single candidate ``C=1``, so the search only serves to collect the
    per-fold 'lift' and 'AUC' scores, which are written next to the input
    file with the suffix ``_backtest.csv``.

    Args:
        data_set_path: Path of the data set; '.csv' in it is replaced to
            derive the output path.
        n_test_split: Number of time-series splits to evaluate.
    """
    features, labels = prepare_data(data_set_path, as_retention=False)

    scoring = {
        'lift': make_scorer(calc_lift, needs_proba=True),
        'AUC': 'roc_auc',
    }
    classifier = LogisticRegression(
        penalty='l1', solver='liblinear', fit_intercept=True)
    search = GridSearchCV(
        estimator=classifier,
        scoring=scoring,
        cv=TimeSeriesSplit(n_splits=n_test_split),
        verbose=1,
        return_train_score=False,
        param_grid={'C': [1]},
        refit='AUC',
    )
    search.fit(features, labels)

    save_path = data_set_path.replace('.csv', '_backtest.csv')
    pd.DataFrame(search.cv_results_).to_csv(save_path, index=False)
    print('Saved test scores to ' + save_path)
示例2: test_diff_detector_cross_validate
# 需要導入模塊: from sklearn import model_selection [as 別名]
# 或者: from sklearn.model_selection import TimeSeriesSplit [as 別名]
def test_diff_detector_cross_validate(return_estimator: bool):
    """
    Compare DiffBasedAnomalyDetector.cross_validate with sklearn's
    cross_validate on the same random data and the same splitter.

    Per the original author's note, the detector forces `return_estimator`
    to True internally because it needs the per-fold models to compute its
    thresholds; hence the sklearn reference call always passes True and
    both result dicts are expected to expose the same keys.
    """
    features = np.random.random((100, 10))
    targets = np.random.random((100, 1))
    detector = DiffBasedAnomalyDetector(base_estimator=LinearRegression())
    splitter = TimeSeriesSplit(n_splits=3)

    detector_results = detector.cross_validate(
        X=features, y=targets, cv=splitter, return_estimator=return_estimator
    )
    sklearn_results = cross_validate(
        detector, X=features, y=targets, cv=splitter, return_estimator=True
    )

    assert detector_results.keys() == sklearn_results.keys()
示例3: test_2d_y
# 需要導入模塊: from sklearn import model_selection [as 別名]
# 或者: from sklearn.model_selection import TimeSeriesSplit [as 別名]
def test_2d_y():
    """Smoke-test every splitter with 1-D, column-vector and multilabel y.

    Splitting with a multilabel target may raise ``ValueError``; when it
    does, the message must name the supported target types.
    """
    n_samples = 30
    rng = np.random.RandomState(1)
    # The order of the draws below determines the generated data.
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))

    splitters = [
        LeaveOneOut(),
        LeavePOut(p=2),
        KFold(),
        StratifiedKFold(),
        RepeatedKFold(),
        RepeatedStratifiedKFold(),
        ShuffleSplit(),
        StratifiedShuffleSplit(test_size=.5),
        GroupShuffleSplit(),
        LeaveOneGroupOut(),
        LeavePGroupsOut(n_groups=2),
        GroupKFold(),
        TimeSeriesSplit(),
        PredefinedSplit(test_fold=groups),
    ]

    expected_msg = "Supported target types are: {}. Got 'multilabel".format(
        ('binary', 'multiclass'))

    for cv in splitters:
        # 1-D and column-vector targets must always be accepted.
        for target in (y, y_2d):
            list(cv.split(X, target, groups))
        # Multilabel targets are either accepted or rejected with a
        # well-defined error message.
        try:
            list(cv.split(X, y_multilabel, groups))
        except ValueError as err:
            assert expected_msg in str(err)
示例4: prepare_xy
# 需要導入模塊: from sklearn import model_selection [as 別名]
# 或者: from sklearn.model_selection import TimeSeriesSplit [as 別名]
def prepare_xy(self, groups=True):
    """Build the date-ordered feature frame and churn labels.

    Args:
        groups: When true, ``apply_behavior_grouping`` is run and the
            features come from ``churn_data_reduced`` / ``grouped_columns``;
            otherwise ``normalize_skewscale`` is run and the features come
            from ``data_scores`` / ``metric_columns``.

    Returns:
        Tuple ``(X, y)``: the selected feature columns and the 'is_churn'
        column, both ordered by observation date.
    """
    if groups:
        self.apply_behavior_grouping()
        source, feature_cols = self.churn_data_reduced, self.grouped_columns
    else:
        self.normalize_skewscale()
        source, feature_cols = self.data_scores, self.metric_columns

    frame = pd.DataFrame(source)
    # TimeSeriesSplit relies on row order, so rows must be sorted by date.
    frame['temp_obs_date'] = self.observe_dates.values
    frame = frame.sort_values('temp_obs_date')
    return frame[feature_cols], frame['is_churn']
示例5: crossvalidate_churn_model
# 需要導入模塊: from sklearn import model_selection [as 別名]
# 或者: from sklearn.model_selection import TimeSeriesSplit [as 別名]
def crossvalidate_churn_model(self, model_code, groups=True):
    """Grid-search a churn model with 3-fold time-series cross-validation.

    Args:
        model_code: Key passed to ``cv_params`` and ``model_instance`` to
            select the parameter grid and the estimator; also used as the
            stem of the output file name.
        groups: Forwarded to ``prepare_xy`` and used to pick the output
            sub-directory.

    Returns:
        DataFrame built from ``cv_results_`` (also saved as CSV).
    """
    features, labels = self.prepare_xy(groups)
    param_grid = self.cv_params(model_code)
    estimator = self.model_instance(model_code)

    scoring = {
        'lift_scorer': make_scorer(top_decile_lift, needs_proba=True),
        'AUC': 'roc_auc',
    }
    search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring=scoring,
        cv=TimeSeriesSplit(n_splits=3),
        n_jobs=8,
        verbose=5,
        return_train_score=True,
        refit='AUC',
    )
    search.fit(features, labels)

    result_df = pd.DataFrame(search.cv_results_)
    # NOTE(review): len() counts the grid's parameter names, not the number
    # of candidates - a single-parameter grid is left unsorted; confirm intent.
    if len(param_grid) > 1:
        result_df.sort_values('mean_test_AUC', ascending=False, inplace=True)

    save_path = self.save_path(
        model_code + '_CV',
        subdir=self.grouping_correlation_subdir(groups),
    )
    result_df.to_csv(save_path)
    print('Saved result to ' + save_path)
    return result_df
示例6: test_keras_autoencoder_crossval
# 需要導入模塊: from sklearn import model_selection [as 別名]
# 或者: from sklearn.model_selection import TimeSeriesSplit [as 別名]
def test_keras_autoencoder_crossval(model, kind):
    """
    Check that a gordo model wrapped in a Pipeline can be cross-validated.

    `model` is the class name looked up under `gordo.machine.model.models`
    and `kind` is forwarded to its constructor.
    """
    model_cls = pydoc.locate(f"gordo.machine.model.models.{model}")
    pipeline = Pipeline([("model", model_cls(kind=kind))])

    data = np.random.random(size=(15, 2))
    # Autoencoder setting: the target is a copy of the input.
    target = data.copy()

    splitter = TimeSeriesSplit(n_splits=2, max_train_size=2)
    scores = cross_val_score(pipeline, data, target, cv=splitter)

    assert isinstance(scores, np.ndarray)
    logger.info(f"Mean score: {scores.mean():.4f} - Std score: {scores.std():.4f}")
示例7: test_time_series_cv
# 需要導入模塊: from sklearn import model_selection [as 別名]
# 或者: from sklearn.model_selection import TimeSeriesSplit [as 別名]
def test_time_series_cv():
    """Check TimeSeriesSplit fold contents, ordering and fold count."""
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]]

    # Should fail if there are more folds than samples
    assert_raises_regexp(
        ValueError,
        "Cannot have number of folds.*greater",
        next,
        TimeSeriesSplit(n_splits=7).split(X),
    )

    tscv = TimeSeriesSplit(2)

    # Manually check that Time Series CV preserves the data ordering on toy
    # datasets: first with six samples, then with all seven.
    cases = [
        (tscv.split(X[:-1]),
         [([0, 1], [2, 3]), ([0, 1, 2, 3], [4, 5])]),
        (TimeSeriesSplit(2).split(X),
         [([0, 1, 2], [3, 4]), ([0, 1, 2, 3, 4], [5, 6])]),
    ]
    for fold_iter, expected_folds in cases:
        for want_train, want_test in expected_folds:
            train, test = next(fold_iter)
            assert_array_equal(train, want_train)
            assert_array_equal(test, want_test)

    # Check get_n_splits returns the correct number of splits
    n_splits_actual = len(list(TimeSeriesSplit(2).split(X)))
    assert_equal(n_splits_actual, tscv.get_n_splits())
    assert_equal(n_splits_actual, 2)
示例8: test_time_series_max_train_size
# 需要導入模塊: from sklearn import model_selection [as 別名]
# 或者: from sklearn.model_selection import TimeSeriesSplit [as 別名]
def test_time_series_max_train_size():
    """Check that ``max_train_size`` truncates TimeSeriesSplit train folds.

    The unbounded splits are the reference; each bounded splitter is
    compared with them through ``_check_time_series_max_train_size``.
    """
    X = np.zeros((6, 1))
    # split() returns a one-shot generator.  The reference folds are reused
    # for three comparisons, so materialise them; otherwise only the first
    # comparison sees any folds (assuming the helper, defined elsewhere,
    # simply iterates over both arguments - TODO confirm).
    splits = list(TimeSeriesSplit(n_splits=3).split(X))

    check_splits = TimeSeriesSplit(n_splits=3, max_train_size=3).split(X)
    _check_time_series_max_train_size(splits, check_splits, max_train_size=3)

    # Test for the case where the size of a fold is greater than max_train_size
    check_splits = TimeSeriesSplit(n_splits=3, max_train_size=2).split(X)
    _check_time_series_max_train_size(splits, check_splits, max_train_size=2)

    # Test for the case where the size of each fold is less than max_train_size
    # (the bound passed to the helper must match the splitter's, i.e. 5).
    check_splits = TimeSeriesSplit(n_splits=3, max_train_size=5).split(X)
    _check_time_series_max_train_size(splits, check_splits, max_train_size=5)
示例9: test_nsplit_default_warn
# 需要導入模塊: from sklearn import model_selection [as 別名]
# 或者: from sklearn.model_selection import TimeSeriesSplit [as 別名]
def test_nsplit_default_warn():
    """Check the FutureWarning for a default ``n_splits``.

    Constructing a splitter without ``n_splits`` must warn; passing it
    explicitly must not.  Will be removed in 0.22.
    """
    splitter_classes = (KFold, GroupKFold, StratifiedKFold, TimeSeriesSplit)

    for splitter_cls in splitter_classes:
        assert_warns_message(FutureWarning, NSPLIT_WARNING, splitter_cls)

    for splitter_cls in splitter_classes:
        assert_no_warnings(splitter_cls, n_splits=5)
示例10: split
# 需要導入模塊: from sklearn import model_selection [as 別名]
# 或者: from sklearn.model_selection import TimeSeriesSplit [as 別名]
def split(self, df, y=None, groups=None):
    """Yield time-series train/test positions computed per group.

    The rows of ``df`` are grouped by ``self.groupby``.  Each group gets its
    own ``TimeSeriesSplit``; the k-th fold yielded here is the concatenation
    of the k-th fold of every group that still has folds left.

    Args:
        df: DataFrame to split; validated with ``self._validate_df``.
        y: Optional targets, indexed with each group's row positions and
            forwarded to the per-group splitter.
        groups: Accepted for splitter-API compatibility; not used (the
            grouping always comes from ``self.groupby``).

    Yields:
        Tuple ``(train_idx, test_idx)`` of integer position arrays into
        ``df``.
    """
    self._validate_df(df)
    # Maps each group key to the integer row positions of that group in df.
    group_positions = df.groupby(self.groupby).indices
    fold_iters = {}

    while True:
        train_parts, test_parts = [], []
        for key, sub_idx in group_positions.items():
            if key not in fold_iters:
                # Built once per group; the generator keeps what it needs.
                sub_df = df.iloc[sub_idx]
                sub_y = y[sub_idx] if y is not None else None
                # max_train_size is keyword-only in current scikit-learn,
                # so it must not be passed positionally.
                splitter = TimeSeriesSplit(
                    n_splits=self.n_splits,
                    max_train_size=self.max_train_size,
                )
                fold_iters[key] = splitter.split(sub_df, sub_y)
            try:
                train_local, test_local = next(fold_iters[key])
            except StopIteration:
                # This group has no folds left; others may still have some.
                continue
            # The splitter returns positions within the group; sub_idx maps
            # them straight back to positions in df.  This avoids a label
            # round-trip that breaks when df has duplicate index labels.
            train_parts.append(sub_idx[train_local])
            test_parts.append(sub_idx[test_local])

        if not train_parts:
            break
        yield np.concatenate(train_parts), np.concatenate(test_parts)
示例11: crossvalidate_xgb
# 需要導入模塊: from sklearn import model_selection [as 別名]
# 或者: from sklearn.model_selection import TimeSeriesSplit [as 別名]
def crossvalidate_xgb(data_set_path, n_test_split):
    """Grid-search an XGBoost classifier with time-series cross-validation.

    Besides the cross-validation scores (sorted by mean test AUC), the
    refit best estimator is pickled and its predicted probabilities for the
    same data it was fitted on are saved and passed to
    ``forecast_histogram``.  All output paths are derived from
    ``data_set_path`` by replacing '.csv'.

    Args:
        data_set_path: Path of the data set handed to ``prepare_data``.
        n_test_split: Number of time-series splits.
    """
    features, labels = prepare_data(data_set_path, ext='', as_retention=False)

    scoring = {
        'lift': make_scorer(calc_lift, needs_proba=True),
        'AUC': 'roc_auc',
    }
    param_grid = {
        'max_depth': [1, 2, 4, 6],
        'learning_rate': [0.1, 0.2, 0.3, 0.4],
        'n_estimators': [20, 40, 80, 120],
        'min_child_weight': [3, 6, 9, 12],
    }
    search = GridSearchCV(
        estimator=xgb.XGBClassifier(objective='binary:logistic'),
        n_jobs=-1,
        scoring=scoring,
        cv=TimeSeriesSplit(n_splits=n_test_split),
        verbose=1,
        return_train_score=False,
        param_grid=param_grid,
        refit='AUC',
    )
    search.fit(features.values, labels)

    # Cross-validation scores, best candidate first.
    result_df = pd.DataFrame(search.cv_results_)
    result_df.sort_values('mean_test_AUC', ascending=False, inplace=True)
    save_path = data_set_path.replace('.csv', '_crossval_xgb.csv')
    result_df.to_csv(save_path, index=False)
    print('Saved test scores to ' + save_path)

    # Persist the estimator refit on the whole data set.
    best_model = search.best_estimator_
    pickle_path = data_set_path.replace('.csv', '_xgb_model.pkl')
    with open(pickle_path, 'wb') as fid:
        pickle.dump(best_model, fid)
    print('Saved model pickle to ' + pickle_path)

    # Class probabilities for the same rows used for fitting.
    predict_df = pd.DataFrame(
        best_model.predict_proba(features.values),
        index=features.index,
        columns=['retain_prob', 'churn_prob'],
    )
    forecast_save_path = data_set_path.replace('.csv', '_xgb_predictions.csv')
    print('Saving results to %s' % forecast_save_path)
    predict_df.to_csv(forecast_save_path, header=True)

    forecast_histogram(data_set_path, predict_df, ext='xgb')
示例12: crossvalidate
# 需要導入模塊: from sklearn import model_selection [as 別名]
# 或者: from sklearn.model_selection import TimeSeriesSplit [as 別名]
def crossvalidate(data_set_path, n_test_split):
    """Cross-validate the L1 penalty strength of a logistic regression.

    A grid of ``C`` values is scored ('lift' and 'AUC') over a
    ``TimeSeriesSplit``; no final model is refit.  The scores, extended with
    an 'n_weights' column from ``test_n_weights``, are saved with the suffix
    ``_crossval.csv`` and handed to ``plot_regression_test``.

    Args:
        data_set_path: Path of the data set handed to ``prepare_data``.
        n_test_split: Number of time-series splits.
    """
    features, labels = prepare_data(data_set_path, as_retention=False)

    param_grid = {
        'C': [0.64, 0.32, 0.16, 0.08, 0.04, 0.02, 0.01, 0.005, 0.0025],
    }
    scoring = {
        'lift': make_scorer(calc_lift, needs_proba=True),
        'AUC': 'roc_auc',
    }
    search = GridSearchCV(
        estimator=LogisticRegression(
            penalty='l1', solver='liblinear', fit_intercept=True),
        scoring=scoring,
        cv=TimeSeriesSplit(n_splits=n_test_split),
        verbose=1,
        return_train_score=False,
        param_grid=param_grid,
        refit=False,
    )
    search.fit(features, labels)

    result_df = pd.DataFrame(search.cv_results_)
    result_df['n_weights'] = test_n_weights(features, labels, param_grid)
    output_path = data_set_path.replace('.csv', '_crossval.csv')
    result_df.to_csv(output_path, index=False)
    plot_regression_test(data_set_path, result_df)
示例13: test_objectmapper
# 需要導入模塊: from sklearn import model_selection [as 別名]
# 或者: from sklearn.model_selection import TimeSeriesSplit [as 別名]
def test_objectmapper(self):
    """Check that ``ModelFrame.model_selection`` exposes sklearn's objects.

    Each listed attribute of the accessor must be the very same object as
    the attribute of that name on ``ms``.
    """
    df = pdml.ModelFrame([])

    exposed_names = (
        # Splitter Classes
        # (StratifiedShuffleSplit was disabled in the original check list;
        # the reason is not visible here.)
        'KFold',
        'GroupKFold',
        'StratifiedKFold',
        'LeaveOneGroupOut',
        'LeavePGroupsOut',
        'LeaveOneOut',
        'LeavePOut',
        'ShuffleSplit',
        'GroupShuffleSplit',
        'PredefinedSplit',
        'TimeSeriesSplit',
        # Hyper-parameter optimizers
        'GridSearchCV',
        'RandomizedSearchCV',
        'ParameterGrid',
        'ParameterSampler',
    )
    for name in exposed_names:
        self.assertIs(getattr(df.model_selection, name), getattr(ms, name))
示例14: test_objectmapper_abbr
# 需要導入模塊: from sklearn import model_selection [as 別名]
# 或者: from sklearn.model_selection import TimeSeriesSplit [as 別名]
def test_objectmapper_abbr(self):
    """Check that the abbreviated ``ModelFrame.ms`` accessor exposes sklearn's objects.

    Each listed attribute of ``df.ms`` must be the very same object as the
    attribute of that name on the ``ms`` module.
    """
    df = pdml.ModelFrame([])

    exposed_names = (
        # Splitter Classes
        # (StratifiedShuffleSplit was disabled in the original check list;
        # the reason is not visible here.)
        'KFold',
        'GroupKFold',
        'StratifiedKFold',
        'LeaveOneGroupOut',
        'LeavePGroupsOut',
        'LeaveOneOut',
        'LeavePOut',
        'ShuffleSplit',
        'GroupShuffleSplit',
        'PredefinedSplit',
        'TimeSeriesSplit',
        # Hyper-parameter optimizers
        'GridSearchCV',
        'RandomizedSearchCV',
        'ParameterGrid',
        'ParameterSampler',
    )
    for name in exposed_names:
        self.assertIs(getattr(df.ms, name), getattr(ms, name))
示例15: function
# 需要導入模塊: from sklearn import model_selection [as 別名]
# 或者: from sklearn.model_selection import TimeSeriesSplit [as 別名]
def function(self):
    """Store a default-configured ``TimeSeriesSplit`` on the first output."""
    splitter = TimeSeriesSplit()
    self.out_1.val = splitter