

Python ShuffleSplit.split Method Code Examples

This article collects typical usage examples of the Python method sklearn.model_selection.ShuffleSplit.split. If you are wondering what exactly ShuffleSplit.split does, how to call it, or how it is used in practice, the curated examples below may help. You can also explore further usage examples of the containing class, sklearn.model_selection.ShuffleSplit.


The following presents 15 code examples of the ShuffleSplit.split method, sorted by popularity by default.
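Before the individual examples, here is a minimal, self-contained sketch of the basic ShuffleSplit.split API; the data and parameter values are illustrative and not taken from any example below:

import numpy as np
from sklearn.model_selection import ShuffleSplit

X = np.arange(20).reshape(10, 2)  # 10 samples, 2 features
ss = ShuffleSplit(n_splits=3, test_size=0.2, random_state=0)
for train_idx, test_idx in ss.split(X):
    # Each iteration yields one random train/test partition of the row indices.
    print("train:", train_idx, "test:", test_idx)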

Example 1: fit_models

# Required import: from sklearn.model_selection import ShuffleSplit [as alias]
# Or alternatively: from sklearn.model_selection.ShuffleSplit import split [as alias]
def fit_models(imps, X, Y, all_props, props=None,
               labels=None, n_splits=5, 
               clf_args={'n_estimators':25, 
                         'max_features':'auto', 
                         'random_state':0}):
    if props is None:
        props = all_props
    n_obs = X['missing'].shape[0] # Number of observations.
    n_features = X['missing'].shape[1] # Number of features.
    n_props = len(props) # Number of properties to predict.  
    test_size = 0.2
    if labels is None:
        shuffle_split = ShuffleSplit(n_splits=n_splits,
                                     test_size=test_size, random_state=0)
    else:
        shuffle_split = GroupShuffleSplit(n_splits=n_splits,
                                          test_size=test_size, random_state=0)
    n_test_samples = np.max([len(list(shuffle_split.split(range(n_obs),groups=labels))[i][1]) \
                            for i in range(n_splits)])
    rs = {imp:np.ma.zeros((n_props,n_splits)) for imp in imps}
    ps = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples)) for imp in imps}
    ys = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples)) for imp in imps}
    feature_importances = {imp:np.ma.zeros((n_props,n_features,n_splits)) for imp in imps}
    for n_prop,prop in enumerate(props):
        j = all_props.index(prop)
        print("Fitting model for %s..." % prop)
        for imp in imps:
            for k,(train,test) in enumerate(shuffle_split.split(range(n_obs),
                                                                groups=labels)):
                X_train,X_test = X[imp][train],X[imp][test]
                Y_train,Y_test = Y[imp][train,j],Y['missing'][test,j]
                clf_args_ = {key:(value if type(value) is not dict \
                             else value[prop])\
                             for key,value in clf_args.items()}
                if clf_args_['max_features'] not in [None, 'auto']:
                   clf_args_['max_features'] = min(X_train.shape[1],
                                                   clf_args_['max_features'])
                rfc = RandomForestClassifier(**clf_args_)
                #if Y_train.shape[1] == 1:
                #    Y_train = Y_train.ravel()
                rfc.fit(X_train,Y_train)
                Y_predict = rfc.predict(X_test)#.reshape(-1,n_props)
                probs = rfc.predict_proba(X_test)
                if probs.shape[1]<2 and probs.mean()==1.0:
                    n_test_samples = len(probs)
                    ps[imp][n_prop,k,:n_test_samples] = 0.0
                else:
                    n_test_samples = len(probs[:,1])
                    ps[imp][n_prop,k,:n_test_samples] = probs[:,1]
                ys[imp][n_prop,k,:n_test_samples] = Y_test
                rs[imp][n_prop,k] = np.ma.corrcoef(Y_predict,Y_test)[0,1]
                feature_importances[imp][n_prop,:,k] = rfc.feature_importances_
    return rs,feature_importances,ys,ps
Developer ID: rgerkin, Project: upsit, Lines of code: 55, Source file: scratch.py
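Example 1 falls back to GroupShuffleSplit when group labels are supplied. As a minimal sketch of that path (with made-up data, not the arrays used above), passing groups= keeps all samples of a group on the same side of each split:

import numpy as np
from sklearn.model_selection import GroupShuffleSplit

X = np.arange(16).reshape(8, 2)
groups = np.array([0, 0, 1, 1, 2, 2, 3, 3])  # e.g., one group per subject
gss = GroupShuffleSplit(n_splits=2, test_size=0.25, random_state=0)
for train_idx, test_idx in gss.split(X, groups=groups):
    # Each group lands entirely in train or entirely in test, never both.
    print("train groups:", set(groups[train_idx]),
          "test groups:", set(groups[test_idx]))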

Example 2: FitModel

# Required import: from sklearn.model_selection import ShuffleSplit [as alias]
# Or alternatively: from sklearn.model_selection.ShuffleSplit import split [as alias]
def FitModel(cnnc, A, Y, T, FN):
    print('Fitting model...')
    ss = ShuffleSplit(n_splits = 1)
    trn, tst = next(ss.split(A))
    #Fit the network
    cnnc.fit(A[trn], Y[trn])
    #The predictions as sequences of character indices
    YH = []
    for i in np.array_split(np.arange(A.shape[0]), 32): 
        YH.append(cnnc.predict(A[i]))
    YH = np.vstack(YH)
    #Convert from sequence of char indices to strings
    PS = np.array([''.join(YHi) for YHi in YH])
    #Compute the accuracy
    S1 = SAcc(PS[trn], T[trn])
    S2 = SAcc(PS[tst], T[tst])
    print('Train: ' + str(S1))
    print('Test: ' + str(S2))
    for PSi, Ti, FNi in zip(PS, T, FN):
        if np.random.rand() > 0.99: #Randomly select rows to print
            print(FNi + ': ' + Ti + ' -> ' + PSi)
    print('Fitting with CV data...')
    #Fit remainder
    cnnc.SetMaxIter(4)
    cnnc.fit(A, Y)
    return cnnc
Developer ID: nicholastoddsmith, Project: pythonml, Lines of code: 28, Source file: DeepOCR.py

Example 3: main

# Required import: from sklearn.model_selection import ShuffleSplit [as alias]
# Or alternatively: from sklearn.model_selection.ShuffleSplit import split [as alias]
def main():
    from io import open as uopen
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('fname')
    parser.add_argument('idx', default=2, type=int)
    parser.add_argument('--key', default=u'V;1;SG;IND;PST;PFV')
    parser.add_argument('--shuffle', action='store_true')
    parser.add_argument('--folds', default=10, type=int)
    parser.add_argument('--lang', default='sp')
    parser.add_argument('--key-idx', default=3, type=int)
    args = parser.parse_args()
    fh = uopen(args.fname, encoding='utf-8')
    lines = [x.strip().split(u'\t') for x in fh]
    to_extract = [(x[0], x[args.idx]) for x in lines if x[args.key_idx] == args.key]
    if args.shuffle:
        from random import shuffle
        shuffle(to_extract)
    from distutils.dir_util import mkpath
    from sklearn.model_selection import ShuffleSplit
    rs = ShuffleSplit(n_splits=args.folds, test_size=0.2, random_state=42)
    for i, (train_indices, test_indices) in enumerate(rs.split(to_extract)):
        mkpath('res/ryan_splits/{}-10fold/{}'.format(args.lang, i))
        train_fh, dev_fh, test_fh = (uopen('res/ryan_splits/{}-10fold/{}/train.uniq'.format(args.lang, i), mode='w', encoding='utf-8'),
                                     uopen('res/ryan_splits/{}-10fold/{}/dev.uniq'.format(args.lang, i), mode='w', encoding='utf-8'),
                                     uopen('res/ryan_splits/{}-10fold/{}/test.uniq'.format(args.lang, i), mode='w', encoding='utf-8'),
                                     )
        for idx in train_indices:
            train_fh.write(u'{}\t{}\n'.format(to_extract[idx][0], to_extract[idx][1]))

        for j, idx in enumerate(test_indices):
            if j % 2 == 0:
                dev_fh.write(u'{}\t{}\n'.format(to_extract[idx][0], to_extract[idx][1]))
            else:
                test_fh.write(u'{}\t{}\n'.format(to_extract[idx][0], to_extract[idx][1]))
Developer ID: as1986, Project: neural_wfst, Lines of code: 37, Source file: extract.py

Example 4: train_model

# Required import: from sklearn.model_selection import ShuffleSplit [as alias]
# Or alternatively: from sklearn.model_selection.ShuffleSplit import split [as alias]
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    # cv = ShuffleSplit(
    #     n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
    cv = ShuffleSplit(
        n_splits=10, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    clfs = []  # just to later get the median

    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[int(len(scores_to_sort) / 2)]

        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

        log_false_positives(clfs[median], X_test, y_test, name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
Developer ID: greatabel, Project: MachineLearning, Lines of code: 57, Source file: i14combine+classify.py

Example 5: run

# Required import: from sklearn.model_selection import ShuffleSplit [as alias]
# Or alternatively: from sklearn.model_selection.ShuffleSplit import split [as alias]
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # Initiate model
    model = init_model(X_train.shape[1])
    vanilla_weights = model.get_weights()

    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(
        cross_validation_iterator.split(X_train), start=1
    ):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))

        optimal_weights_path = "/tmp/Optimal_Weights_{}.h5".format(cross_validation_index)
        submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index))

        if os.path.isfile(submission_file_path):
            continue

        if not os.path.isfile(optimal_weights_path):
            # Load the vanilla weights
            model.set_weights(vanilla_weights)

            # Perform the training procedure
            earlystopping_callback = EarlyStopping(monitor="val_actual_mae", patience=EARLYSTOPPING_PATIENCE)
            modelcheckpoint_callback = ModelCheckpoint(optimal_weights_path, monitor="val_loss", save_best_only=True)
            model.fit(
                X_train[train_index],
                Y_train[train_index],
                batch_size=TRAIN_BATCH_SIZE,
                nb_epoch=MAXIMUM_EPOCH_NUM,
                validation_data=(X_train[valid_index], Y_train[valid_index]),
                callbacks=[earlystopping_callback, modelcheckpoint_callback],
                verbose=2,
            )

        # Load the optimal weights
        model.load_weights(optimal_weights_path)

        # Perform the testing procedure
        Y_test = model.predict(X_test, batch_size=TEST_BATCH_SIZE, verbose=2)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()

    print("All done!")
Developer ID: nixingyang, Project: Kaggle-Competitions, Lines of code: 57, Source file: solution_Keras.py

Example 6: train_model

# Required import: from sklearn.model_selection import ShuffleSplit [as alias]
# Or alternatively: from sklearn.model_selection.ShuffleSplit import split [as alias]
def train_model(clf_factory, X, Y, name="NB ngram", plot=False):
    # cv = ShuffleSplit(
    #     n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
    # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html
    # old:http://scikit-learn.org/0.15/modules/generated/sklearn
    # .cross_validation.ShuffleSplit.html#sklearn.cross_validation.ShuffleSplit
    cv = ShuffleSplit(
        n_splits=10, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []
    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)
        # print('proba:', proba)
        # fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    scores_to_sort = pr_scores
    # print('np.argsort(scores_to_sort):', np.argsort(scores_to_sort),len(scores_to_sort) / 2)
    median = np.argsort(scores_to_sort)[int(len(scores_to_sort) / 2)]

    if plot:
        plot_pr(pr_scores[median], name, "01", precisions[median],
                recalls[median], label=name)

        summary = (np.mean(scores), np.std(scores),
                   np.mean(pr_scores), np.std(pr_scores))
        print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
Developer ID: greatabel, Project: MachineLearning, Lines of code: 55, Source file: i12tunning+parameter+of+classify.py

Example 7: fit_models_mc

# Required import: from sklearn.model_selection import ShuffleSplit [as alias]
# Or alternatively: from sklearn.model_selection.ShuffleSplit import split [as alias]
def fit_models_mc(imps, X, Y, all_props, props=None,
               labels=None, n_splits=5, 
               clf_args={'n_estimators':25, 
                         'max_features':'auto', 
                         'random_state':0}):
    if props is None:
        props = all_props
    n_obs = X['missing'].shape[0] # Number of observations.
    n_features = X['missing'].shape[1] # Number of features.
    n_props = len(props) # Number of properties to predict.  
    test_size = 0.2
    if labels is None:
        shuffle_split = ShuffleSplit(n_splits=n_splits,
                                     test_size=test_size, random_state=0)
    else:
        # GroupShuffleSplit is the model_selection replacement for the old
        # cross_validation.LabelShuffleSplit used here originally.
        shuffle_split = GroupShuffleSplit(n_splits=n_splits,
                                          test_size=test_size, random_state=0)
    n_test_samples = np.max([len(list(shuffle_split.split(range(n_obs),
                                                          groups=labels))[i][1])
                            for i in range(n_splits)])
    rs = {imp:np.ma.zeros((n_props,n_splits)) for imp in imps}
    ps = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples)) for imp in imps}
    ys = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples)) for imp in imps}
    feature_importances = None#{imp:np.ma.zeros((n_props,n_features,n_splits)) for imp in imps}
    cols = np.array([i for i in range(len(all_props)) if all_props[i] in props])
    for imp in imps:
        for k,(train,test) in enumerate(shuffle_split.split(range(n_obs),groups=labels)):
            #X_train,X_test = X[imp][train][:,cols],X[imp][test][:,cols]
            #Y_train,Y_test = Y[imp][train][:,cols],Y['missing'][test][:,cols]
            X_train,X_test = X[imp][train,:],X[imp][test,:]
            Y_train,Y_test = Y[imp][train,:],Y['missing'][test,:]
            # Note: unlike fit_models above, all properties are fit jointly here,
            # so per-property dict values in clf_args cannot be resolved (the
            # original code referenced an undefined `prop` in that case).
            clf_args_ = dict(clf_args)
            if clf_args_['max_features'] not in [None, 'auto']:
               clf_args_['max_features'] = min(X_train.shape[1],
                                               clf_args_['max_features'])
            rfc = RandomForestClassifier(**clf_args_)
            onevsrest = OneVsRestClassifier(rfc)
            onevsrest.fit(X_train,Y_train)
            Y_predict = onevsrest.predict(X_test)#.reshape(-1,n_props)
            probs = onevsrest.predict_proba(X_test)
            if probs.shape[1]<2 and probs.mean()==1.0:
                n_test_samples = len(probs)
                ps[imp][:,k,:n_test_samples] = 0.0
            else:
                n_test_samples = len(probs[:,1])
                ps[imp][:,k,:n_test_samples] = probs.T
            ys[imp][:,k,:n_test_samples] = Y_test.T
            for i in range(n_props):
                rs[imp][i,k] = np.ma.corrcoef(Y_predict[:,i],Y_test[:,i])[0,1]
            #feature_importances[imp][n_prop,:,k] = onevsrest.feature_importances_
    return rs,feature_importances,ys,ps
Developer ID: rgerkin, Project: upsit, Lines of code: 54, Source file: scratch.py

Example 8: run

# Required import: from sklearn.model_selection import ShuffleSplit [as alias]
# Or alternatively: from sklearn.model_selection.ShuffleSplit import split [as alias]
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(cross_validation_iterator.split(X_train), start=1):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))

        submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index))

        if os.path.isfile(submission_file_path):
            continue

        model = XGBRegressor(
            learning_rate=0.01,
            max_depth=12,
            n_estimators=N_ESTIMATORS,
            silent=False,
            objective="reg:linear",
            gamma=1,
            min_child_weight=1,
            subsample=0.8,
            colsample_bytree=0.5,
            reg_alpha=1,
            seed=cross_validation_index,
            nthread=-1)

        model.fit(X_train[train_index], Y_train[train_index], eval_set=[(X_train[valid_index], Y_train[valid_index])],
            eval_metric=lambda y_predicted, y_true:("actual_mae", mean_absolute_error(np.exp(y_true.get_label()), np.exp(y_predicted))),
            early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=True)

        # Perform the testing procedure
        Y_test = model.predict(X_test)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()

    print("All done!")
Developer ID: nixingyang, Project: Kaggle-Competitions, Lines of code: 48, Source file: solution_XGBoost.py

Example 9: test_safe_split_with_precomputed_kernel

# Required import: from sklearn.model_selection import ShuffleSplit [as alias]
# Or alternatively: from sklearn.model_selection.ShuffleSplit import split [as alias]
def test_safe_split_with_precomputed_kernel():
    clf = SVC()
    clfp = SVC(kernel="precomputed")

    X, y = iris.data, iris.target
    K = np.dot(X, X.T)

    cv = ShuffleSplit(test_size=0.25, random_state=0)
    tr, te = list(cv.split(X))[0]

    X_tr, y_tr = _safe_split(clf, X, y, tr)
    K_tr, y_tr2 = _safe_split(clfp, K, y, tr)
    assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T))

    X_te, y_te = _safe_split(clf, X, y, te, tr)
    K_te, y_te2 = _safe_split(clfp, K, y, te, tr)
    assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T))
Developer ID: absolutelyNoWarranty, Project: scikit-learn, Lines of code: 19, Source file: test_split.py

Example 10: TestPerformance

# Required import: from sklearn.model_selection import ShuffleSplit [as alias]
# Or alternatively: from sklearn.model_selection.ShuffleSplit import split [as alias]
 def TestPerformance(self, df = None):
     #If no dataframe is provided, use the currently learned one
     if(df is None):
         D = self.D
     else:
         D = self.S.transform(df.copy())
     #Get features from the data frame
     A = self._ExtractFeat(D)
     #Get the target values and their corresponding column names
     y, _ = self._ExtractTarg(D)
     #Begin cross validation
     ss = ShuffleSplit(n_splits = 1)
     for trn, tst in ss.split(A):
         s1 = self.R.score(A, y)
         s2 = self.R.score(A[tst], y[tst])
         s3 = self.R.score(A[trn], y[trn])
         print('C-V:\t' + str(s1) + '\nTst:\t' + str(s2) + '\nTrn:\t' + str(s3))
Developer ID: nicholastoddsmith, Project: pythonml, Lines of code: 19, Source file: StockPredictor.py

Example 11: run

# Required import: from sklearn.model_selection import ShuffleSplit [as alias]
# Or alternatively: from sklearn.model_selection.ShuffleSplit import split [as alias]
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(cross_validation_iterator.split(X_train), start=1):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))

        submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index))

        if os.path.isfile(submission_file_path):
            continue

        model = GBMRegressor(
            learning_rate=0.01,
            num_iterations=NUM_ITERATIONS,
            num_leaves=200,
            min_data_in_leaf=10,
            feature_fraction=0.3,
            feature_fraction_seed=cross_validation_index,
            bagging_fraction=0.8,
            bagging_freq=10,
            bagging_seed=cross_validation_index,
            metric="l1",
            metric_freq=10,
            early_stopping_round=EARLY_STOPPING_ROUND,
            num_threads=-1)

        model.fit(X_train[train_index], Y_train[train_index], test_data=[(X_train[valid_index], Y_train[valid_index])])

        # Perform the testing procedure
        Y_test = model.predict(X_test)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()

    print("All done!")
Developer ID: nixingyang, Project: Kaggle-Competitions, Lines of code: 47, Source file: solution_LightGBM.py

Example 12: plot_shuffle_split

# Required import: from sklearn.model_selection import ShuffleSplit [as alias]
# Or alternatively: from sklearn.model_selection.ShuffleSplit import split [as alias]
def plot_shuffle_split():
    from sklearn.model_selection import ShuffleSplit
    plt.figure(figsize=(10, 2))
    plt.title("ShuffleSplit with 10 points"
              ", train_size=5, test_size=2, n_splits=4")

    axes = plt.gca()
    axes.set_frame_on(False)

    n_folds = 10
    n_samples = 10
    n_iter = 4
    n_samples_per_fold = 1

    ss = ShuffleSplit(n_splits=4, train_size=5, test_size=2, random_state=43)
    mask = np.zeros((n_iter, n_samples))
    for i, (train, test) in enumerate(ss.split(range(10))):
        mask[i, train] = 1
        mask[i, test] = 2

    for i in range(n_folds):
        # test is grey
        colors = ["grey" if x == 2 else "white" for x in mask[:, i]]
        # not selected has no hatch

        boxes = axes.barh(y=range(n_iter), width=[1 - 0.1] * n_iter,
                          left=i * n_samples_per_fold, height=.6, color=colors,
                          hatch="//", edgecolor='k', align='edge')
        for j in np.where(mask[:, i] == 0)[0]:
            boxes[j].set_hatch("")

    axes.invert_yaxis()
    axes.set_xlim(0, n_samples + 1)
    axes.set_ylabel("CV iterations")
    axes.set_xlabel("Data points")
    axes.set_xticks(np.arange(n_samples) + .5)
    axes.set_xticklabels(np.arange(1, n_samples + 1))
    axes.set_yticks(np.arange(n_iter) + .3)
    axes.set_yticklabels(["Split %d" % x for x in range(1, n_iter + 1)])
    # legend hacked for this random state
    plt.legend([boxes[1], boxes[0], boxes[2]], [
               "Training set", "Test set", "Not selected"], loc=(1, .3))
    plt.tight_layout()
Developer ID: iop956, Project: introduction_to_ml_with_python, Lines of code: 45, Source file: plot_cross_validation.py

Example 13: test_safe_split_with_precomputed_kernel

# Required import: from sklearn.model_selection import ShuffleSplit [as alias]
# Or alternatively: from sklearn.model_selection.ShuffleSplit import split [as alias]
def test_safe_split_with_precomputed_kernel():
    clf = SVC()
    clfp = SVC(kernel="precomputed")

    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    K = np.dot(X, X.T)

    cv = ShuffleSplit(test_size=0.25, random_state=0)
    train, test = list(cv.split(X))[0]

    X_train, y_train = _safe_split(clf, X, y, train)
    K_train, y_train2 = _safe_split(clfp, K, y, train)
    assert_array_almost_equal(K_train, np.dot(X_train, X_train.T))
    assert_array_almost_equal(y_train, y_train2)

    X_test, y_test = _safe_split(clf, X, y, test, train)
    K_test, y_test2 = _safe_split(clfp, K, y, test, train)
    assert_array_almost_equal(K_test, np.dot(X_test, X_train.T))
    assert_array_almost_equal(y_test, y_test2)
Developer ID: hmshan, Project: scikit-learn, Lines of code: 22, Source file: test_multiclass.py
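Examples 9 and 13 exercise sklearn's private _safe_split helper. The behavior they verify can also be reproduced with plain NumPy indexing; the following minimal sketch uses made-up data, and its variable names and sizes are illustrative:

import numpy as np
from sklearn.model_selection import ShuffleSplit

X = np.random.RandomState(0).rand(20, 4)
K = X @ X.T                        # precomputed kernel over all 20 samples
cv = ShuffleSplit(test_size=0.25, random_state=0)
train, test = next(cv.split(X))
K_train = K[np.ix_(train, train)]  # train rows x train columns
K_test = K[np.ix_(test, train)]    # test rows x train columns
assert np.allclose(K_train, X[train] @ X[train].T)
assert np.allclose(K_test, X[test] @ X[train].T)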

Example 14: KFold

# Required import: from sklearn.model_selection import ShuffleSplit [as alias]
# Or alternatively: from sklearn.model_selection.ShuffleSplit import split [as alias]
# k-fold validation
# k-fold is a type of cross-validation where the data are divided into k bins. For each
# experiment, pick one of the k bins as the test set and use the remaining k-1 bins for
# training. Run k separate experiments and average all k test results. This technique
# tests different parts of the data to prevent overfitting, i.e. it keeps grid search
# from returning a parameter set optimized for one specific training set rather than overall.
from sklearn.model_selection import KFold
cv_sets = KFold(n_splits=10)
for train_index, test_index in cv_sets.split(X):
    print("%s %s" % (train_index, test_index))

# ShuffleSplit
# ShuffleSplit() is an alternative form of cross-validation (see the 'cv_sets' variable).
# ShuffleSplit() will create 10 ('n_splits') shuffled sets, and for each shuffle,
# 20% ('test_size') of the data will be used as the validation set.
from sklearn.model_selection import ShuffleSplit
cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)
for train_index, test_index in cv_sets.split(X):
    print("%s %s" % (train_index, test_index))

from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score

# pipelining
#Sequentially apply a list of transforms and a final estimator. Intermediate steps 
#of the pipeline must be ‘transforms’, that is, they must implement fit and 
#transform methods. The final estimator only needs to implement fit.
#The purpose of the pipeline is to assemble several steps that can be 
#cross-validated together while setting different parameters.
from sklearn import svm
from sklearn.datasets import samples_generator
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
Developer ID: greatObelix, Project: datatoolbox, Lines of code: 33, Source file: ML_alg.py
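The pipelining notes in Example 14 stop at the imports. A minimal sketch of the idea they set up might look like the following; the dataset, the k=5 value, and the linear kernel are illustrative assumptions, and make_classification stands in for the deprecated sklearn.datasets.samples_generator module:

from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline

X, y = make_classification(n_samples=100, n_features=20, random_state=0)
pipe = Pipeline([
    ('select', SelectKBest(f_regression, k=5)),  # transform step: keep 5 best features
    ('svc', svm.SVC(kernel='linear')),           # final estimator
])
pipe.fit(X, y)                                   # fits both steps in sequence
print(pipe.score(X, y))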

Example 15: test_shufflesplit_reproducible

# Required import: from sklearn.model_selection import ShuffleSplit [as alias]
# Or alternatively: from sklearn.model_selection.ShuffleSplit import split [as alias]
def test_shufflesplit_reproducible():
    # Check that iterating twice on the ShuffleSplit gives the same
    # sequence of train-test when the random_state is given
    ss = ShuffleSplit(random_state=21)
    assert_array_equal(list(a for a, b in ss.split(X)),
                       list(a for a, b in ss.split(X)))
Developer ID: absolutelyNoWarranty, Project: scikit-learn, Lines of code: 8, Source file: test_split.py


Note: The sklearn.model_selection.ShuffleSplit.split examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets are drawn from open-source projects contributed by many developers; copyright remains with the original authors, and distribution or use should follow each project's license. Do not reproduce without permission.