

Python cross_validation.train_test_split Function Code Examples

This article collects typical usage examples of the Python function sklearn.cross_validation.train_test_split. If you are wondering what train_test_split does, how to call it, or what real-world usage looks like, the curated examples below should help.


Below are 15 code examples of the train_test_split function, sorted by popularity by default.
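Before the examples, a minimal orientation sketch of the basic call. Note that sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20; in current versions the same train_test_split function lives in sklearn.model_selection, which this sketch assumes:

import numpy as np
# older scikit-learn: from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split

X = np.arange(20).reshape(10, 2)  # 10 samples, 2 features
y = np.arange(10)                 # one label per sample

# hold out 30% of the samples; fix random_state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

print(X_train.shape, X_test.shape)  # (7, 2) (3, 2)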

Example 1: splitDataset

def splitDataset(data, random_seed):
    '''
    Given a dataframe and a seed value, split the dataframe into training,
    validation, and test sets, using the seed for reproducibility. A 60/20/20
    split is used here, though the ratios could easily be parameterized.
    Returns a dictionary of dataframes with keys 'train', 'valid' and 'test'.
    '''
    #Get column headers
    col_headers = list(data.columns.values)
    feature_cols = copy.deepcopy(col_headers)
    feature_cols.remove('Sample')
    feature_cols.remove('Diagnosis')
    class_col = ['Diagnosis']
    
    #Train/test/validate split
    train, test = train_test_split(data, test_size=0.2, random_state=random_seed)
    train = pd.DataFrame(train)
    test = pd.DataFrame(test)
    train.columns = col_headers
    test.columns = col_headers
    train, validate = train_test_split(train, test_size=0.25, random_state=random_seed)
    train = pd.DataFrame(train)
    validate = pd.DataFrame(validate)
    train.columns = col_headers
    validate.columns = col_headers
    
    #Separate features and classes
    all_data = {'train': train, 'valid': validate, 'test': test}
    return extractFeatures(all_data)
Developer: bwelsh | Project: projectW4761 | Lines: 26 | Source: cgen_include.py
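The extractFeatures helper called above is not part of this excerpt. As a rough idea of what it plausibly does, given the 'Sample' and 'Diagnosis' column handling in splitDataset, here is a minimal sketch; the function body and its return format are assumptions, not the original implementation:

def extractFeatures(all_data):
    # Hypothetical reconstruction: for each split, separate the feature
    # columns from the 'Diagnosis' class column, dropping the 'Sample' ID.
    result = {}
    for name, df in all_data.items():
        feature_cols = [c for c in df.columns if c not in ('Sample', 'Diagnosis')]
        result[name] = {'features': df[feature_cols], 'classes': df['Diagnosis']}
    return result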

Example 2: tribunalTrain

def tribunalTrain(data, predict, tribunal, split=.2, stat=False, statLis=None):
    # hold out data for testing the tribunal's performance; not used in
    # training the individual judges
    dat_train, dat_test, lab_train, lab_test = train_test_split(data, predict, test_size=split)
    verdict = []

    print('Tribunal in session')

    for judge in tribunal:
        # each judge trains on its own random subsample of the training fold
        jdat_train, jdat_test, jlab_train, jlab_test = train_test_split(dat_train, lab_train, test_size=split)
        judge.fit(jdat_train, jlab_train)
        print('judge trained')

    for d in dat_test:
        votes = []
        for judge in tribunal:
            v = judge.predict([d])  # wrap in a list: predict expects a 2-D array
            votes.append(v)
        decision = stats.mode(votes, axis=None)  # majority vote
        verdict.append(decision[0])
    npVerdict = np.array(verdict)

    if not stat:
        svmDesc(npVerdict, lab_test, title='Tribunal Confusion Matrix')
    else:
        jac = jaccard_similarity_score(npVerdict, lab_test)
        statLis.append(jac)
Developer: am4002 | Project: Hybrid-SOM-for-MEG | Lines: 26 | Source: som_cluster_lib.py
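For comparison, the hand-rolled majority vote above is what scikit-learn calls hard voting. A minimal sketch with VotingClassifier follows; the three estimators are placeholder choices. Note one real difference: tribunalTrain also gives each judge its own random subsample of the training fold, which VotingClassifier does not do by itself.

from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# three "judges"; with voting='hard' the prediction is the mode of the
# individual judges' predictions, as in tribunalTrain above
tribunal = VotingClassifier(
    estimators=[('svm', SVC()), ('tree', DecisionTreeClassifier()),
                ('nb', GaussianNB())],
    voting='hard')
tribunal.fit(dat_train, lab_train)   # same training fold as above
verdict = tribunal.predict(dat_test)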

Example 3: processMethod3

def processMethod3(userid, featureCondition=1, classificationCondition=1, offsetFeatureOn=False):
    """ User-i Device-j hack in User-i Device-k Model: iphone6plus hack iphone5

    Returns
    -------
    float : error rate
    """
    # rawDataiPhone6Plus = loadUserData(userid, 1, datatype=1) # moment data
    # rawDataiPhone5     = loadUserData(userid, 2, datatype=1) # moment data

    # trainingData  = splitMomentDataByFeature(rawDataiPhone5, featureCondition=featureCondition)
    # trainingLabel = rawDataiPhone5[:, 4]

    # testData  = splitMomentDataByFeature(rawDataiPhone6Plus, featureCondition=featureCondition)
    # testLabel = rawDataiPhone6Plus[:, 4]

    iPhone6Plus = 1
    iPhone5     = 2
    trainingData, trainingLabel = splitMomentDataByFeatureAndLabel(userid, iPhone5, featureCondition, classificationCondition, offsetFeatureOn=offsetFeatureOn)
    testData, testLabel         = splitMomentDataByFeatureAndLabel(userid, iPhone6Plus, featureCondition, classificationCondition, offsetFeatureOn=offsetFeatureOn)

    # use same test size with method1
    trainingDataIP5, testDataIP5, trainingLabelIP5, testLabelIP5 = train_test_split(trainingData, trainingLabel, test_size=my_test_size, random_state=my_random_state)
    trainingDataIP6, testDataIP6, trainingLabelIP6, testLabelIP6 = train_test_split(    testData,     testLabel, test_size=my_test_size, random_state=my_random_state)

    return classify(trainingDataIP5, trainingLabelIP5, testDataIP6, testLabelIP6, kernel=my_kernel, max_iter=my_max_iteration)
Developer: changkun | Project: AugmentedTouch | Lines: 26 | Source: moment.py

Example 4: test_train_test_split

def test_train_test_split():
    X = np.arange(100).reshape((10, 10))
    X_s = coo_matrix(X)
    y = np.arange(10)

    # simple test
    split = cval.train_test_split(X, y, test_size=None, train_size=.5)
    X_train, X_test, y_train, y_test = split
    assert_equal(len(y_test), len(y_train))
    # test correspondence of X and y
    assert_array_equal(X_train[:, 0], y_train * 10)
    assert_array_equal(X_test[:, 0], y_test * 10)

    # conversion of lists to arrays (deprecated?)
    split = cval.train_test_split(X, X_s, y.tolist(), allow_lists=False)
    X_train, X_test, X_s_train, X_s_test, y_train, y_test = split
    assert_array_equal(X_train, X_s_train.toarray())
    assert_array_equal(X_test, X_s_test.toarray())

    # don't convert lists to anything else by default
    split = cval.train_test_split(X, X_s, y.tolist())
    X_train, X_test, X_s_train, X_s_test, y_train, y_test = split
    assert_true(isinstance(y_train, list))
    assert_true(isinstance(y_test, list))

    # allow nd-arrays
    X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2)
    y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11)
    split = cval.train_test_split(X_4d, y_3d)
    assert_equal(split[0].shape, (7, 5, 3, 2))
    assert_equal(split[1].shape, (3, 5, 3, 2))
    assert_equal(split[2].shape, (7, 7, 11))
    assert_equal(split[3].shape, (3, 7, 11))
Developer: jjhelmus | Project: scikit-learn | Lines: 33 | Source: test_cross_validation.py

Example 5: tuning_l2_penalty

def tuning_l2_penalty(out_file, featurizers = None):
    # featurizers for blog/blog, twitter+wiki/blog and twitter+wiki/twitter+wiki respectively
    if not featurizers:
        featurizers = [feat4, feat5, feat4]
    # candidate values for C, which weights the L2 penalty
    c_vals = [ v / 100.0 for v in range(50, 110, 10)]
    # data splits used
    b_train, b_test = train_test_split(blog_80, test_size = 0.1, random_state = 1)
    tw_train, tw_test = train_test_split(tw, test_size = 0.1, random_state = 1)
    # count sizes only once
    n_btest = float(len(b_test))
    n_b80 = float(len(blog_80))
    n_twtest = float(len(tw_test))

    for c_val in c_vals:
        print("Running L2 tuning for C=%.2f" % c_val)
        # Using split validation, as otherwise too slow
        make_model = lambda: Models.LogisticRegression(C = c_val)
        blog_errors = error_analyze(make_model, b_train, b_test, featurizers[0])
        twb_errors = error_analyze(make_model, tw, blog_80, featurizers[1])
        tw_errors = error_analyze(make_model, tw_train, tw_test, featurizers[2])

        blog_acc = 1 - len(blog_errors["error_indices"]) / n_btest
        twb_acc = 1 - len(twb_errors['error_indices']) / n_b80
        tw_acc = 1 - len(tw_errors['error_indices']) / n_twtest
        # write to file provided
        out_file.write("C=%f\n" % c_val)
        out_file.write("b=%f, twb=%f, tw=%f\n\n" % (blog_acc, twb_acc, tw_acc))
Developer: josepablocam | Project: snlp_project | Lines: 28 | Source: maxent_experiments.py

Example 6: get_best_k_model

def get_best_k_model(model, max_k, x, y):
    # Fit a model using a range of best-k values, 
    # returning the model that produces the best test score
    
    # Input
    # model: scikit-learn model
    # max_k: maximum k-value to iterate to (inclusive)
    # x: independent variables
    # y: dependent variable
    
    # Output
    # best_k: number of features (k) that produced the best test score
    # train_score: training score
    # test_score: test score
    # train_mse: training mse
    # test_mse: test mse       
    
    test_scores = []
    k_vals = []    
    
    k_limit = min(max_k, len(x.columns))
    for k_val in range(1, k_limit + 1):
        best_x = fs.SelectKBest(fs.chi2, k = k_val).fit_transform(x, y)
        x_train, x_test, y_train, y_test = cv.train_test_split(best_x, y, test_size = 0.2, random_state = 0)
        test_scores.append(model.fit(x_train, y_train).score(x_test, y_test))
        k_vals.append(k_val)

    best_k = k_vals[np.argmax(test_scores)]
    best_x = fs.SelectKBest(fs.chi2, k = best_k).fit_transform(x, y)
    x_train, x_test, y_train, y_test = cv.train_test_split(best_x, y, test_size = 0.2, random_state = 0)
       
    train_score, test_score, train_mse, test_mse = get_model_values(model, x_train, y_train, x_test, y_test)
    
    return best_k, train_score, test_score, train_mse, test_mse
Developer: kcavagnolo | Project: ml_fun | Lines: 34 | Source: linkedin_salary.py
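One caveat worth flagging in this example: SelectKBest is fit on all of x and y before the train/test split, so information about the eventual test rows leaks into feature selection. A minimal leak-free sketch using a Pipeline, with LogisticRegression standing in for the model argument:

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0)

# the selector is now fit on the training fold only
pipe = Pipeline([('select', SelectKBest(chi2, k=5)),
                 ('model', LogisticRegression())])
pipe.fit(x_train, y_train)
test_score = pipe.score(x_test, y_test)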

Example 7: load_dataset

def load_dataset(path_id="", folder="", use_float_32=False, test_ratio=0.3, valid_ratio=0.1):	
#def load_dataset(path_id="", use_float_32=False, test_ratio=0.2, valid_ratio=0.1):
	# reading full dataset
	features_path = "data/%s/features%s.npy"%(folder, path_id)
	labels_path = "data/%s/labels%s.npy"%(folder, path_id)
	

	features = np.load(features_path)
	if use_float_32:
		features = features.astype(np.float32)
	labels = np.load(labels_path)
	
	# splitting data
	train_set_x, test_set_x, train_set_y, test_set_y = train_test_split(features, labels, test_size=test_ratio, random_state=89677)
	#train_set_x = features[:2500]
	#train_set_y = labels[:2500]
	
	#test_set_x = features[2500:]
	#test_set_y = labels[2500:]
	test_set_x = theano.shared(value=test_set_x, name='test_set_x', borrow=True)
	test_set_y = theano.shared(value=np.array(test_set_y), name='test_set_y', borrow=True)
	
	# split train set into validation set
	train_set_x, valid_set_x, train_set_y, valid_set_y = train_test_split(train_set_x, train_set_y, test_size=valid_ratio, random_state=89677)
	
	print(train_set_x.shape, valid_set_x.shape, test_set_x.get_value(borrow=True).shape)
	
	train_set_x = theano.shared(value=train_set_x, name='train_set_x', borrow=True)
	train_set_y = theano.shared(value=np.array(train_set_y), name='train_set_y', borrow=True)
	
	valid_set_x = theano.shared(value=valid_set_x, name='valid_set_x', borrow=True)
	valid_set_y = theano.shared(value=np.array(valid_set_y), name='valid_set_y', borrow=True)
	
	return ((train_set_x, train_set_y), (valid_set_x, valid_set_y), (test_set_x, test_set_y))	
Developer: Ahmed--Mohsen | Project: authorship | Lines: 34 | Source: data_handler.py

Example 8: getImages

def getImages():
   digitsImagesNormalized = getImagesFromDir(digitsPath)
   lettersImagesNormalized = getImagesFromDir(lettersPath)

   digitsImagesNormalized = [skpre.scale(digitsImagesNormalized[0]), digitsImagesNormalized[1]]
   lettersImagesNormalized = [skpre.scale(lettersImagesNormalized[0]), lettersImagesNormalized[1]]

   allImages = []
   for i in digitsImagesNormalized[0]:
      allImages.append(i)

   for i in lettersImagesNormalized[0]:
      allImages.append(i)

   # Split into training and test sets.
   # Compute PCA - dimensionality reduction of the data. :)
   pca = computePCA(allImages)
   digitstransformedData = pca.transform(digitsImagesNormalized[0])
   letterstransformedData = pca.transform(lettersImagesNormalized[0])

   dtrainDataTF, dtestDataTF, dclassesTrainTF, dclassesTestTF = train_test_split(digitstransformedData, digitsImagesNormalized[1], train_size=0.65)

   ltrainDataTF, ltestDataTF, lclassesTrainTF, lclassesTestTF = train_test_split(letterstransformedData, lettersImagesNormalized[1], train_size=0.65)
   
   return [[dtrainDataTF, dclassesTrainTF], [dtestDataTF, dclassesTestTF]], [[ltrainDataTF, lclassesTrainTF], [ltestDataTF, lclassesTestTF]]
Developer: pedrokalmeida | Project: mc861 | Lines: 25 | Source: charDetect.py

Example 9: train_lsvr

def train_lsvr():
    train_sys = np.load('fc2_train_sys.npy')
    test_sys =  np.load('fc2_test_sys.npy')
    # from sklearn.preprocessing import StandardScaler
    # sle = StandardScaler()
    # train_sys = sle.fit_transform(train_sys)
    # test_sys  = sle.fit_transform(test_sys)
    
    y = np.load('data/y_train.npy')
    from sklearn import svm
    #from sklearn.metrics import mean_squared_error
    from sklearn.ensemble import RandomForestRegressor
    lsvr = svm.SVR(C=0.1) # 0.045
    #lsvr = RandomForestRegressor(n_estimators = 100)
    train_sys, val_sys, train_y_sys, val_y_sys = train_test_split(train_sys, y[:,0])
    lsvr.fit(train_sys, train_y_sys)
    #print mean_squared_error(val_y_sys, l
    pred_systole = lsvr.predict(val_sys)
    cdf_val = real_to_cdf(val_y_sys)
    cdf_pred_systole = real_to_cdf(pred_systole)
    crps_val = crps(cdf_val, cdf_pred_systole)
    print('CRPS(val sys) = {0}'.format(crps_val))

    train_dia = np.load('fc2_train_dia.npy')
    test_dia  = np.load('fc2_test_dia.npy')

    train_dia, val_dia, train_y_dia, val_y_dia = train_test_split(train_dia, y[:,1])
    lsvr.fit(train_dia, train_y_dia)

    pred_dia = lsvr.predict(val_dia)
    cdf_val_dia = real_to_cdf(val_y_dia)
    cdf_pred_dia = real_to_cdf(pred_dia)
    crps_val = crps(cdf_val_dia, cdf_pred_dia)
    print('CRPS(val dia) = {0}'.format(crps_val))
Developer: ouceduxzk | Project: kaggle-ndsb2 | Lines: 34 | Source: extract.py

Example 10: split_dataset

def split_dataset(index, random_state, test_ratio=0.2, valid_ratio=0.2):
    index = list(index)
    ix_train, ix_test = train_test_split(index, test_size=test_ratio,
        random_state=random_state)
    ix_train, ix_valid = train_test_split(ix_train,
        test_size=valid_ratio / (1 - test_ratio), random_state=random_state)
    return {'train': ix_train, 'valid': ix_valid, 'test': ix_test}
Developer: bzamecnik | Project: ml-playground | Lines: 7 | Source: prepare_training_data.py
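The test_size=valid_ratio / (1 - test_ratio) expression rescales the validation fraction to the post-test remainder, so valid_ratio stays a fraction of the full index. A quick check of the arithmetic with the defaults above:

sets = split_dataset(range(1000), random_state=0)
print(len(sets['train']), len(sets['valid']), len(sets['test']))
# -> 600 200 200: 20% of 1000 is held out for test, then
#    0.2 / (1 - 0.2) = 25% of the remaining 800 becomes validation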

Example 11: create_sets

def create_sets(img_dir, train_set_proportion=.6, test_set_proportion=.2, val_set_proportion=.2):
    '''Split a list of image files up into training, testing and validation sets.'''

    if os.path.isfile(img_dir+ 'imgs.list'):
        baseimgfilenames = pickle.load(open(img_dir+'imgs.list','rb'))
    else:
        imgfilenames = glob.glob(img_dir + '*.jpg')
        baseimgfilenames = [os.path.basename(f) for f in imgfilenames]

    train,val = train_test_split(np.arange(len(baseimgfilenames)),
                                       train_size=train_set_proportion+test_set_proportion,
                                       test_size=val_set_proportion,
                                       random_state=1)

    train_test_prop = train_set_proportion + test_set_proportion
    train,test = train_test_split(train,
                                  train_size=train_set_proportion/train_test_prop,
                                  test_size=test_set_proportion/train_test_prop,
                                  random_state=1)

    trainfiles = [baseimgfilenames[i] for i in train]
    valfiles = [baseimgfilenames[i] for i in val]
    testfiles = [baseimgfilenames[i] for i in test]

    return trainfiles, valfiles, testfiles
Developer: emwebaze | Project: microscopy-object-detection | Lines: 25 | Source: createdb.py

Example 12: create_sets

def create_sets(img_dir, train_set_proportion=.6, test_set_proportion=.2, val_set_proportion=.2):
    '''Split a list of image files up into training, testing and validation sets.'''

    imgfilenames = glob.glob(img_dir + '*.jpg')
    baseimgfilenames = [os.path.basename(f) for f in imgfilenames]

    if train_set_proportion + test_set_proportion < 1:
        train,val = train_test_split(np.arange(len(baseimgfilenames)),
                                           train_size=train_set_proportion+test_set_proportion,
                                           test_size=val_set_proportion,
                                           random_state=1) 
    else:
        train = np.arange(len(baseimgfilenames))
        val = []

    train_test_prop = train_set_proportion + test_set_proportion
    train,test = train_test_split(train,
                                  train_size=train_set_proportion/train_test_prop,
                                  test_size=test_set_proportion/train_test_prop,
                                  random_state=1)

    trainfiles = [baseimgfilenames[i] for i in train]
    testfiles = [baseimgfilenames[i] for i in test]
    valfiles = [baseimgfilenames[i] for i in val]

    return trainfiles, valfiles, testfiles
Developer: jqug | Project: microscopy-object-detection | Lines: 26 | Source: readdata.py

Example 13: main

def main(unused_argv):
  iris = datasets.load_iris()
  x_train, x_test, y_train, y_test = train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  x_train, x_val, y_train, y_val = train_test_split(
      x_train, y_train, test_size=0.2, random_state=42)
  val_monitor = learn.monitors.ValidationMonitor(
      x_val, y_val, early_stopping_rounds=200)

  # classifier with early stopping on training data
  classifier1 = learn.DNNClassifier(
      hidden_units=[10, 20, 10], n_classes=3, model_dir='/tmp/iris_model/')
  classifier1.fit(x=x_train, y=y_train, steps=2000)
  score1 = metrics.accuracy_score(y_test, classifier1.predict(x_test))

  # classifier with early stopping on validation data, save frequently for
  # monitor to pick up new checkpoints.
  classifier2 = learn.DNNClassifier(
      hidden_units=[10, 20, 10], n_classes=3, model_dir='/tmp/iris_model_val/',
      config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1))
  classifier2.fit(x=x_train, y=y_train, steps=2000, monitors=[val_monitor])
  score2 = metrics.accuracy_score(y_test, classifier2.predict(x_test))

  # In many applications, the score is improved by using early stopping
  print('score1: ', score1)
  print('score2: ', score2)
  print('score2 > score1: ', score2 > score1)
Developer: AadityaJ | Project: tensorflow | Lines: 28 | Source: iris_val_based_early_stopping.py

Example 14: split_data

def split_data(x_train, y_train):
    """
    Given training data cropped from the original dataset by create_training_set.py, split this data up into training, cross-validation, and test data.

    INPUTS:
    x_train = Features cropped from original dataset
    y_train = Labels manually entered for x_train

    OUTPUTS:
    new_x_train = New training data randomly selected from x_train
    new_x_crossval = Cross-validation samples from x_train
    new_x_test = Test samples from x_train
    new_y_train = Training labels
    new_y_crossval = Cross-validation labels
    new_y_test = Testing labels
    """
    new_x_train, new_x_test, new_y_train, new_y_test \
     = cross_val.train_test_split(x_train,
                                  y_train,
                                  test_size=0.3,
                                  random_state=53)
    new_x_crossval, new_x_test, new_y_crossval, new_y_test \
     = cross_val.train_test_split(new_x_test,
                                  new_y_test,
                                  test_size=0.5,
                                  random_state=41)
    return new_x_train, new_x_crossval, new_x_test, new_y_train, \
            new_y_crossval, new_y_test
Developer: EthanRosenthal | Project: stm-routines | Lines: 28 | Source: train_model.py

Example 15: cook

def cook():
    x, y, weights = load_data()
    n_components = 200
    svd = TruncatedSVD(n_components, random_state=42)
    x_unweighted = svd.fit_transform(x)
    x_weighted = svd.fit_transform(weighted(x, weights))

    for i in range(9):
        frac = 1 - (i * 0.01 + 0.01)
        print(frac)

        x_train, x_test, y_train, y_test = train_test_split(x_unweighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print("Unweighted:", classifier.score(x_test, y_test))

        x_train, x_test, y_train, y_test = train_test_split(x_weighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print("Weighted:", classifier.score(x_test, y_test))

        print('--------------------------')
Developer: wangchr | Project: eMeriL | Lines: 25 | Source: cook.py


Note: The sklearn.cross_validation.train_test_split examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets come from open-source projects contributed by many developers; copyright remains with the original authors, and any use or redistribution should follow the corresponding project's license. Please do not reproduce without permission.