当前位置: 首页>>代码示例>>Python>>正文


Python cross_validation.StratifiedShuffleSplit方法代码示例

本文整理汇总了Python中sklearn.cross_validation.StratifiedShuffleSplit方法的典型用法代码示例。如果您正苦于以下问题:Python cross_validation.StratifiedShuffleSplit方法的具体用法?Python cross_validation.StratifiedShuffleSplit怎么用?Python cross_validation.StratifiedShuffleSplit使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在sklearn.cross_validation的用法示例。


在下文中一共展示了cross_validation.StratifiedShuffleSplit方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_stratified_shuffle_split_init

# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def test_stratified_shuffle_split_init():
    """Check that StratifiedShuffleSplit rejects invalid size settings."""
    make_split = cval.StratifiedShuffleSplit

    uneven = np.asarray([0, 1, 1, 1, 2, 2, 2])
    even = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2])

    rejected_positional = [
        # A class with only one sample must raise an error
        (uneven, 3, 0.2),
        # Test set size smaller than n_classes
        (uneven, 3, 2),
        # Train set size smaller than n_classes
        (uneven, 3, 3, 2),
        # Not enough samples for the requested train/test sizes
        (even, 3, 0.5, 0.6),
        (even, 3, 8, 0.6),
        (even, 3, 0.6, 8),
    ]
    for positional in rejected_positional:
        assert_raises(ValueError, make_split, *positional)

    # Train size or test size too small
    for keyword in ({'train_size': 2}, {'test_size': 2}):
        assert_raises(ValueError, make_split, even, **keyword)
开发者ID:alvarobartt,项目名称:twitter-stock-recommendation,代码行数:22,代码来源:test_cross_validation.py

示例2: base_learners

# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def base_learners(data_path='data.csv', seed=123456789):
	"""
	Test some classifiers on the raw data.

	The CSV at ``data_path`` is loaded with pandas; every column except the
	last is used as the feature matrix and the last column as the labels.
	Both are passed through ``convert_data_to_int`` before training. For each
	classifier, one model per stratified shuffle split is trained and scored
	in parallel, and the median of the scores is printed.

	@param data_path: Path to the CSV file holding the dataset.

	@param seed: Seed used for the stratified shuffle splits.
	"""

	# Params
	nsplits = 8
	pct_train = 0.8

	# Get data. Positional ``iloc`` and ``values`` replace the removed
	# ``ix`` / ``as_matrix`` accessors and behave the same on old pandas.
	data = pd.read_csv(data_path)
	x = data.iloc[:, :-1].values
	y = data.iloc[:, -1].values
	x, y = convert_data_to_int(x, y)

	# Each classifier gets its own identically seeded splitter, so both are
	# evaluated on the same train/test partitions.
	classifiers = (
		('Random Forest', RandomForestClassifier),
		('Linear SVM', LinearSVC),
	)
	for label, make_clf in classifiers:
		sss = StratifiedShuffleSplit(y, n_iter=nsplits, train_size=pct_train,
			random_state=seed)
		# The split index doubles as the classifier's random_state
		results = Parallel(n_jobs=-1)(delayed(train_score_clf)(
			make_clf(random_state=i), x[tr], x[te], y[tr], y[te])
			for i, (tr, te) in enumerate(sss))
		# Single-argument print call works on both Python 2 and Python 3
		print('{0}: {1:.3f} %'.format(label, np.median(results)))
开发者ID:tehtechguy,项目名称:mHTM,代码行数:32,代码来源:car_eval.py

示例3: _create_generator

# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def _create_generator(self):
		"""
		Build a generator over paired splits of the training and testing
		labels. Each item is a tuple of (training indexes, testing indexes),
		where the testing indexes are shifted by the number of training labels.
		"""

		seed, nsplits = self.seed, self.nsplits

		# One CV iterator per label set, both driven by the same seed
		train_splits = StratifiedShuffleSplit(self.tr_y, nsplits,
			train_size=self.train_size, random_state=seed)
		test_splits = StratifiedShuffleSplit(self.te_y, nsplits,
			train_size=self.test_size, random_state=seed)

		# Only the "train" half of each split is kept.
		# NOTE(review): the offset presumably maps testing indexes into an
		# array where testing samples follow the training samples - confirm.
		for (train_keep, _), (test_keep, _) in izip(train_splits, test_splits):
			yield train_keep, test_keep + len(self.tr_y)
开发者ID:tehtechguy,项目名称:mHTM,代码行数:17,代码来源:loader.py

示例4: stratified_split

# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def stratified_split(x, y, test_size=0.2):
    """Split ``x`` and ``y`` into one stratified train/validation partition.

    A single StratifiedShuffleSplit iteration with a fixed random state (23)
    is drawn over ``y``.

    Returns ``(x_train, y_train, x_valid, y_valid)``.
    """
    splitter = StratifiedShuffleSplit(
        y, n_iter=1, test_size=test_size, random_state=23)
    # n_iter=1, so the first split is the only one
    train_rows, valid_rows = next(iter(splitter))

    return x[train_rows, :], y[train_rows], x[valid_rows, :], y[valid_rows]
开发者ID:ahara,项目名称:kaggle_otto,代码行数:9,代码来源:utils.py

示例5: _train_val_split_indices

# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def _train_val_split_indices(labels):
    """Draw one stratified train/validation split over ``labels``.

    The split information is saved through ``_save_organized_data_info``
    twice, first with ``multi_crop=False`` and then with ``multi_crop=True``.

    Returns ``(train indices, validation indices, classes)`` where ``classes``
    is the ``classes`` attribute of the splitter.
    """
    splitter = StratifiedShuffleSplit(
        labels, n_iter=1, test_size=VAL_SIZE, random_state=42)
    indices_tr, indices_val = next(iter(splitter))
    classes = splitter.classes

    for multi_crop in (False, True):
        _save_organized_data_info(
            classes, indices_tr, indices_val, multi_crop=multi_crop)
    return indices_tr, indices_val, classes
开发者ID:inejc,项目名称:painters,代码行数:12,代码来源:data_dirs_organizer.py

示例6: split_indices

# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def split_indices(files, labels, test_size=0.1, random_state=RANDOM_STATE):
    """Return train/test index arrays from one stratified shuffle split.

    The ``labels`` argument is not used: labels are recomputed from the
    names of ``files`` via ``get_labels(..., per_patient=True)``, and the
    split is stratified on their first column. Every split index ``i`` is
    expanded into the pair ``2 * i`` and ``2 * i + 1``.
    """
    per_patient = get_labels(get_names(files), per_patient=True)
    splitter = cross_validation.StratifiedShuffleSplit(
        per_patient[:, 0],
        n_iter=1,
        test_size=test_size,
        random_state=random_state,
    )
    train_part, test_part = next(iter(splitter))

    def _expand(idx):
        # All even positions first, then all odd positions
        return np.hstack([idx * 2, idx * 2 + 1])

    return _expand(train_part), _expand(test_part)
开发者ID:sveitser,项目名称:kaggle_diabetic,代码行数:13,代码来源:data.py

示例7: Get_yPred

# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def Get_yPred(X, y, clf_class, n_folds=10, pred_proba=False):
    '''
    Return "full" y predictions from a given classifier (not just from one
    split): every sample is predicted by a model fitted on the folds that do
    not contain it. (Adapted from "def run_cv".)
    http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html

    Could also be done with stratified shuffle split (+ append output)?
    http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

    Parameters:
        X, y: samples and labels; both are indexed with the fold index arrays.
        clf_class: despite the name, an estimator *instance*; it is re-fitted
            in place on every fold.
        n_folds: number of stratified, shuffled folds.
        pred_proba: when true, collect ``predict_proba`` output instead of
            ``predict`` output.

    Returns:
        With ``pred_proba`` false, a copy of ``y`` whose entries are replaced
        by the predicted labels. With ``pred_proba`` true, a float array with
        one row per sample and one column per column of ``predict_proba``.
    '''
    # Local import keeps the block self-contained
    import numpy as np

    # Construct a stratified k-folds object
    kf = StratifiedKFold(y, n_folds, shuffle=True)

    # The estimator instance is reused across folds
    clf = clf_class

    # Probabilities do not fit into a 1-D copy of y, so that buffer is
    # allocated once the number of probability columns is known.
    y_pred = None if pred_proba else y.copy()

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]

        # TODO: sample_weight weighting is not wired in here
        clf.fit(X_train, y_train)
        if pred_proba:
            proba = np.asarray(clf.predict_proba(X_test))
            if y_pred is None:
                y_pred = np.zeros((len(y), proba.shape[1]), dtype=float)
            y_pred[test_index] = proba
        else:
            y_pred[test_index] = clf.predict(X_test)
    return y_pred
开发者ID:ddofer,项目名称:ProFET,代码行数:30,代码来源:PipeTasks.py

示例8: PlotPerfPercentFeatures

# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def PlotPerfPercentFeatures(X,y,est=LinearSVC()):
    '''
    Plot how a classifier performs (default: SVM-Anova) as the percentile
    of features kept by an F-test selector is varied.

    http://scikit-learn.org/stable/auto_examples/svm/plot_svm_anova.html#example-svm-plot-svm-anova-py

    See also (similar, but with model selection from among classifiers):
    http://nbviewer.ipython.org/github/bugra/pydata-nyc-2014/blob/master/6.%20Scikit%20Learn%20-%20Model%20Selection.ipynb

    '''
    percentiles = (1,2,3,5,7,10,13,15,20,25,33,50,65,75,90, 99)

    # Feature selection followed by the estimator
    pipeline = Pipeline([('anova', SelectPercentile(f_classif)), ('est', est)])

    # Cross-validation score statistics, one entry per percentile
    means, stds = [], []
    for pct in percentiles:
        pipeline.set_params(anova__percentile=pct)
        splitter = StratifiedShuffleSplit(y, n_iter=7, test_size=0.3)
        fold_scores = cross_val_score(pipeline, X, y, cv=splitter, n_jobs=-1)
        means.append(fold_scores.mean())
        stds.append(fold_scores.std())
    print("Outputting Graph:")

    # Mean score with the standard deviation as error bars
    plt.errorbar(percentiles, means, np.array(stds))
    plt.title('Predictor Performance, varying percent of features used')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction Performance')
    plt.axis('tight')
    plt.show()
开发者ID:ddofer,项目名称:ProFET,代码行数:39,代码来源:PipeTasks.py

示例9: PlotPerfPercentFeatures

# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def PlotPerfPercentFeatures(X,y,est=LinearSVC()):
    '''
    Plot how a classifier performs (default: SVM-Anova) as the percentile
    of features kept by an F-test selector is varied.

    http://scikit-learn.org/stable/auto_examples/svm/plot_svm_anova.html#example-svm-plot-svm-anova-py
    '''
    percentiles = (1,2,3,5,7,10,13,15,20,25,33,50,65,75,90, 100)

    # Feature selection followed by the estimator
    pipeline = Pipeline([('anova', SelectPercentile(f_classif)), ('est', est)])

    # Cross-validation score statistics, one entry per percentile
    means, stds = [], []
    for pct in percentiles:
        pipeline.set_params(anova__percentile=pct)
        splitter = StratifiedShuffleSplit(y, n_iter=7, test_size=0.3)
        fold_scores = cross_val_score(pipeline, X, y, cv=splitter, n_jobs=-1)
        means.append(fold_scores.mean())
        stds.append(fold_scores.std())
    print("Outputting Graph:")

    # Mean score with the standard deviation as error bars
    plt.errorbar(percentiles, means, np.array(stds))
    plt.title('Predictor Performance, varying percent of features used')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction Performance')
    plt.axis('tight')
    plt.show()
开发者ID:ddofer,项目名称:ProFET,代码行数:35,代码来源:OutPutRes.py

示例10: CV_multi_stats

# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def CV_multi_stats(X, y, model,n=6) :
    '''
    Cross-validate ``model`` with multiclass (or multilabel) compatible
    metrics: the default score and 'f1'. Both statistics are printed.
    http://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics

    Built on the cross_val_score helper function:
    http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.cross_val_score.html
    http://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics

    Returns (accuracy mean, accuracy std, f1 mean, f1 std); the returned
    standard deviations are not doubled, unlike the printed ones.
    '''

    def _cross_validate(**scoring_args):
        # A fresh, unseeded splitter is created for every scoring run
        splitter = StratifiedShuffleSplit(y, n_iter=n, test_size=0.16)
        return cross_val_score(estimator=model, X=X, y=y, cv=splitter,
                               n_jobs=-1, **scoring_args)

    accuracy = _cross_validate()
    f1 = _cross_validate(scoring='f1')

    acc_mean, acc_std = accuracy.mean(), accuracy.std()
    f1_mean, f1_std = f1.mean(), f1.std()
    print("Model Accuracy: %0.3f (+- %0.2f)" % (acc_mean, acc_std * 2))
    print("Model f1: %0.3f (+- %0.2f)" % (f1_mean, f1_std * 2))
    return (acc_mean, acc_std, f1_mean, f1_std)
开发者ID:ddofer,项目名称:ProFET,代码行数:17,代码来源:OutPutRes.py

示例11: test_stratified_shuffle_split_iter

# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def test_stratified_shuffle_split_iter():
    """Check sizes, disjointness and class balance of every generated split."""
    label_sets = [
        np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
        np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),
        np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
        np.array([-1] * 800 + [1] * 50),
    ]

    def _class_proportions(values):
        # Fraction of samples per class, ordered by sorted class value
        inverse = np.unique(values, return_inverse=True)[1]
        return np.bincount(inverse) / float(len(values))

    for y in label_sets:
        splitter = cval.StratifiedShuffleSplit(y, 6, test_size=0.33,
                                               random_state=0)
        expected_test = np.ceil(0.33 * len(y))
        expected_train = len(y) - expected_test
        for train, test in splitter:
            # Both sides must contain the same set of classes
            assert_array_equal(np.unique(y[train]), np.unique(y[test]))
            # Checks if folds keep classes proportions
            assert_array_almost_equal(_class_proportions(y[train]),
                                      _class_proportions(y[test]), 1)
            assert_equal(len(train) + len(test), y.size)
            assert_equal(len(train), expected_train)
            assert_equal(len(test), expected_test)
            # Train and test must not share any index
            assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
开发者ID:alvarobartt,项目名称:twitter-stock-recommendation,代码行数:29,代码来源:test_cross_validation.py

示例12: test_stratified_shuffle_split_overlap_train_test_bug

# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def test_stratified_shuffle_split_overlap_train_test_bug():
    """Regression test: train and test indices must never overlap.

    Original bug report:
    https://github.com/scikit-learn/scikit-learn/issues/6121
    """
    labels = [0, 1, 2, 3] * 3 + [4, 5] * 5

    splitter = cval.StratifiedShuffleSplit(
        labels, n_iter=1, test_size=0.5, random_state=0)
    train_idx, test_idx = next(iter(splitter))

    shared = np.intersect1d(train_idx, test_idx)
    assert_array_equal(shared, [])
开发者ID:alvarobartt,项目名称:twitter-stock-recommendation,代码行数:12,代码来源:test_cross_validation.py

示例13: grouped_stratified_train_test_split

# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def grouped_stratified_train_test_split(y, x, group_by=None, test_size=0.33, group_labeler=None, return_indices=False, **kwargs):
    """
    Split arrays or matrices into random training and test subsets. Subsets will contain equal proportions of each label in `y`.
    Based on StratifiedShuffleSplit from sklearn.cross_validation.

    if `group_by` is an iterable of length `len(y)`, indices with the same `group_by[i]` will be kept together in either the training or the test set.
    if `group_by` is None, every sample forms its own group.

    if `group_labeler` is a callable, it will be used to assign a label to a group of labels. The default is `lambda labels: int(np.round(np.average(labels)))`

    if `return_indices` is true, `(train_indices, test_indices)` is returned instead of the split arrays.

    Remaining keyword arguments are forwarded to StratifiedShuffleSplit.

    --------
    Example:

     X = np.array([[1, 2], [3, 4], [1, 4], [3, 1], [1, 4], [3, 1], [1, 4], [3, 1], [1, 4], [3, 1], [1, 4], [3, 1]])
     y = np.array([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1])
     ids = np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])

     x_train, x_test, y_train, y_test = grouped_stratified_train_test_split(y, X, ids)

    """

    if not group_labeler:
        def group_labeler(labels):
            # Rounded average of the group's labels
            return int(np.round(np.average(labels)))

    if group_by is None:
        # The default used to crash in zip(); without grouping information
        # each sample is treated as a group of one.
        group_by = range(len(y))

    # Collect the sample indices and labels belonging to each group
    group_indices = dict()
    group_labels = dict()
    for i, (label, group) in enumerate(zip(y, group_by)):
        group_indices.setdefault(group, []).append(i)
        group_labels.setdefault(group, []).append(label)

    # One label per group; the split below is stratified on these
    labeled_groups = {group: group_labeler(members)
                      for group, members in group_labels.items()}
    groups, labels = zip(*labeled_groups.items())

    sss = StratifiedShuffleSplit(labels, 1, test_size=test_size, **kwargs)

    # A single iteration was requested, so take the first (only) split
    group_train_indices, group_test_indices = next(iter(sss))
    test_groups = [groups[i] for i in group_test_indices]
    train_groups = [groups[j] for j in group_train_indices]

    # Expand the selected groups back into sample indices
    test_indices = [idx for group in test_groups for idx in group_indices[group]]
    train_indices = [idx for group in train_groups for idx in group_indices[group]]
    if return_indices:
        return train_indices, test_indices
    return x[train_indices], x[test_indices], y[train_indices], y[test_indices]
开发者ID:SMAPPNYU,项目名称:smappPy,代码行数:48,代码来源:cross_validation.py

示例14: load_titanic

# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def load_titanic(test_size=.25, feature_skip_tuple=(), random_state=1999):
    """Load the Titanic CSV and return one stratified train/test split.

    The file ``datasets/titanic/titanic3.csv`` (relative to the working
    directory) is parsed line by line with ``process_titanic_line``. Numeric
    features are used as-is and string features are one-hot encoded with a
    ``DictVectorizer`` fitted on the training rows only.

    Parameters
    ----------
    test_size : passed to StratifiedShuffleSplit as ``test_size``.
    feature_skip_tuple : names of features to leave out.
    random_state : currently unused; the split is always drawn with a fixed
        ``random_state=12``.
        NOTE(review): kept that way so existing callers get the same split.

    Returns
    -------
    keys, train_data, test_data, train_labels, test_labels
        ``keys`` is the list of numeric keys followed by the string keys.
        In both data arrays the numeric columns come first, followed by the
        vectorized categorical columns.
    """
    with open(os.path.join('datasets', 'titanic', 'titanic3.csv')) as f:
        # The header line is skipped; the returned keys are built below
        f.readline()
        lines = f.readlines()

    string_keys = ['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat',
                   'homedest']
    string_keys = [s for s in string_keys if s not in feature_skip_tuple]
    numeric_keys = ['pclass', 'age', 'sibsp', 'parch', 'fare']
    numeric_keys = [n for n in numeric_keys if n not in feature_skip_tuple]

    n_samples = len(lines)
    numeric_data = np.zeros((n_samples, len(numeric_keys)))
    numeric_labels = np.zeros((n_samples,), dtype=int)
    string_rows = []

    # Parse every line once, keeping all per-row pieces in file order
    for n, l in enumerate(lines):
        line_dict = process_titanic_line(l)
        string_rows.append({k: line_dict[k] for k in string_keys})
        numeric_data[n] = np.asarray([line_dict[k] for k in numeric_keys])
        numeric_labels[n] = line_dict["survived"]

    sss = StratifiedShuffleSplit(numeric_labels, n_iter=1, test_size=test_size,
                                 random_state=12)
    # n_iter=1, so the first split is the only one
    train_idx, test_idx = next(iter(sss))

    # Select the categorical rows with the same (shuffled) index arrays as
    # the numeric rows and labels, so that row i of every array describes
    # the same passenger.
    train_vectorizer_list = [string_rows[i] for i in train_idx]
    test_vectorizer_list = [string_rows[i] for i in test_idx]

    train_numeric = numeric_data[train_idx]
    test_numeric = numeric_data[test_idx]
    train_labels = numeric_labels[train_idx]
    test_labels = numeric_labels[test_idx]

    vec = DictVectorizer()
    # .toarray() due to returning a scipy sparse array
    train_categorical = vec.fit_transform(train_vectorizer_list).toarray()
    test_categorical = vec.transform(test_vectorizer_list).toarray()
    train_data = np.concatenate([train_numeric, train_categorical], axis=1)
    test_data = np.concatenate([test_numeric, test_categorical], axis=1)
    keys = numeric_keys + string_keys
    return keys, train_data, test_data, train_labels, test_labels
开发者ID:amueller,项目名称:scipy_2015_sklearn_tutorial,代码行数:57,代码来源:helpers.py

示例15: CV_Binary_stats

# 需要导入模块: from sklearn import cross_validation [as 别名]
# 或者: from sklearn.cross_validation import StratifiedShuffleSplit [as 别名]
def CV_Binary_stats(X, y, model,n=10) :
    '''
    Cross-validate ``model`` over ``n`` stratified shuffle splits (20% test,
    fixed random state) and report mean accuracy, precision and recall.

    http://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics
    Note that some of the metrics here ONLY work for BINARY tasks.
    This will be VERY slow compared to the built-in, multicore CV implementation. (Unless
     used with a classifier that is parallelized anyway, such as RF).
    By default, balances weights when fitting

    http://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics

    Returns (mean_accuracy, mean_precision, mean_recall); the same values are
    printed rounded to 3 decimals. ROC AUC is not computed.
    '''
    from sklearn.metrics import precision_score, accuracy_score, recall_score

    mean_precision = 0.0
    mean_recall = 0.0
    mean_accuracy = 0.0

    sss = StratifiedShuffleSplit(y,  n_iter=n, test_size=0.2, random_state=0)
    for train_index, test_index in sss:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # train model and make predictions
        model.fit(X_train, y_train,sample_weight=balance_weights(y_train))
        preds = model.predict(X_test)

        # Score against the held-out labels of this split; the previous
        # code referenced an undefined name here.
        mean_accuracy += accuracy_score(y_test, preds)
        mean_precision += precision_score(y_test, preds)
        mean_recall += recall_score(y_test, preds)

    mean_accuracy = (mean_accuracy / n)
    mean_precision = mean_precision / n
    mean_recall = mean_recall / n
    print('mean_accuracy:  %s ' %(round(mean_accuracy, 3)))
    print('mean_precision:  %s ' %(round(mean_precision, 3)))
    print('mean_recall:  %s ' %(round(mean_recall, 3)))
    return (mean_accuracy,mean_precision,mean_recall)
开发者ID:ddofer,项目名称:ProFET,代码行数:58,代码来源:OutPutRes.py


注:本文中的sklearn.cross_validation.StratifiedShuffleSplit方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。