当前位置: 首页>>代码示例>>Python>>正文


Python DecisionTreeClassifier.predict_proba方法代码示例

本文整理汇总了Python中sklearn.tree.DecisionTreeClassifier.predict_proba方法的典型用法代码示例。如果您正苦于以下问题:Python DecisionTreeClassifier.predict_proba方法的具体用法?Python DecisionTreeClassifier.predict_proba怎么用?Python DecisionTreeClassifier.predict_proba使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在sklearn.tree.DecisionTreeClassifier的用法示例。


在下文中一共展示了DecisionTreeClassifier.predict_proba方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: evaluateDecisionTree

# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def evaluateDecisionTree(train_x,train_y,test_x,test_y):
    """Fit an entropy-based decision tree and return its ROC AUC on the test set.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.
        test_x: Test feature matrix.
        test_y: Test labels.

    Returns:
        The value returned by ``roc_auc_score`` for the test labels against
        the scores taken from column 1 of ``predict_proba``.

    Side effects:
        Calls ``plotAUC`` with the same labels and scores, tagged ``'DT'``.
    """
    clf = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=5, max_depth=20)
    clf.fit(train_x, train_y)

    # Score the test set once and reuse the result; the original ran
    # predict_proba a second time just to feed the plot.
    positive_scores = clf.predict_proba(test_x)[:, 1]

    auc = roc_auc_score(test_y, positive_scores)
    plotAUC(test_y, positive_scores, 'DT')
    return auc
开发者ID:ds-ga-1001-final,项目名称:project,代码行数:9,代码来源:decision_tree.py

示例2: programmer_2

# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def programmer_2():
    """Train a CART decision tree on ``data/model.xls`` and plot its ROC curve.

    The first three columns of the sheet are used as features and the fourth
    as the label. 80% of the (shuffled) rows are used for training and the
    rest for the ROC curve.

    Side effects:
        Dumps the fitted tree to ``tmp/tree.pkl``, shows the confusion-matrix
        plot for the training rows, shows the ROC plot, and prints the ROC
        thresholds.
    """
    datafile = 'data/model.xls'
    data = pd.read_excel(datafile)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent that works on both old and new pandas.
    data = data.values
    # NOTE(review): if `shuffle` is random.shuffle (not numpy.random.shuffle),
    # shuffling a 2-D ndarray in place can duplicate rows -- confirm the import.
    shuffle(data)  # randomly shuffle the rows

    # Train/test split ratio of 8:2
    p = 0.8
    split_at = int(len(data) * p)
    train = data[:split_at, :]
    test = data[split_at:, :]

    # Build the CART decision tree model
    treefile = 'tmp/tree.pkl'
    tree = DecisionTreeClassifier()
    tree.fit(train[:, :3], train[:, 3])

    joblib.dump(tree, treefile)

    # Show the confusion matrix; scikit-learn's predict() returns the
    # predicted labels directly.
    cm_plot(train[:, 3], tree.predict(train[:, :3])).show()

    fpr, tpr, thresholds = roc_curve(
        test[:, 3], tree.predict_proba(test[:, :3])[:, 1], pos_label=1)
    plt.plot(fpr, tpr, linewidth=2, label='ROC of CART', color='green')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # Set the axis limits
    plt.ylim(0, 1.05)
    plt.xlim(0, 1.05)
    plt.legend(loc=4)
    plt.show()
    print(thresholds)
开发者ID:Ctipsy,项目名称:python_data_analysis_and_mining_action,代码行数:34,代码来源:code.py

示例3: MultEstimator

# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
class MultEstimator(BaseEstimator):
    """Per-category decision trees with a global fallback tree.

    The last column of ``X`` is read as the category of each row; all other
    columns are the features. One tree is fitted on all rows (``tot_model``)
    and one tree per entry of ``categories`` on the rows of that category
    (``models``). Predictions come from the per-category tree when one exists
    for the row's category and from ``tot_model`` otherwise.
    """

    def __init__(self, categories):
        self.categories = categories

    def fit(self, X, y, **params):
        """Fit the global tree and one tree per category.

        Args:
            X: 2-D array; last column is the category, the rest are features.
            y: Labels, indexable with a boolean row mask.
            **params: Accepted for interface compatibility; unused.

        Returns:
            ``self``, so calls can be chained.
        """
        categ = X[:, -1]
        data = X[:, :-1]

        self.tot_model = DecisionTreeClassifier(max_depth=8, min_samples_leaf=100)
        self.tot_model.fit(data, y)

        self.models = {}
        for c in self.categories:
            mask = categ == c
            if not mask.any():
                # No training rows for this category: keep None so prediction
                # falls back to the global tree instead of fitting on empty data.
                self.models[c] = None
                continue
            m = DecisionTreeClassifier(max_depth=8, min_samples_leaf=100)
            m.fit(data[mask], y[mask])
            self.models[c] = m
        return self

    def predict(self, X):
        """Predict labels, preferring the per-category tree for each row."""
        categ = X[:, -1]
        data = X[:, :-1]
        p = self.tot_model.predict(data)
        for c, m in self.models.items():
            if m is None:
                continue
            mask = categ == c
            if mask.any():
                p[mask] = m.predict(data[mask])
        return p

    def predict_proba(self, X):
        """Predict class probabilities, preferring the per-category tree.

        The returned columns follow ``tot_model.classes_``. A per-category
        tree that saw only some of the classes contributes zero probability
        for the classes it never saw.
        """
        categ = X[:, -1]
        data = X[:, :-1]
        p = self.tot_model.predict_proba(data)
        all_classes = list(self.tot_model.classes_)
        for c, m in self.models.items():
            if m is None:
                continue
            mask = categ == c
            if not mask.any():
                continue
            rows = mask.nonzero()[0]
            # Map the category tree's class columns onto the global layout;
            # the original assigned directly and failed on a column mismatch.
            cols = [all_classes.index(k) for k in m.classes_]
            p[rows] = 0.0
            p[rows[:, None], cols] = m.predict_proba(data[mask])
        return p
开发者ID:alfiya400,项目名称:kaggle-avitoDuplicatesDetection,代码行数:37,代码来源:model.py

示例4: main

# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def main():
    """Cross-validate a decision tree and an XGBoost model, then write predictions.

    Steps: load the training and test data, cross-validate both classifiers
    on the training data, fit a fresh XGBoost classifier on all training
    data, predict class probabilities for the test set, reduce them with
    ``calculate_top5`` and pass the result to ``write_results``.
    """
    print('Start', datetime.datetime.now())

    # train data
    X, y, features_encoding, enc_X, le = prepare_train_data()
    # test data -- this call was commented out in the original although
    # X_test and users are used below, which raised NameError.
    # NOTE(review): signature and return order are taken from that
    # commented-out line; confirm against prepare_test_data.
    X_test, users = prepare_test_data(features_encoding, enc_X)

    xgb_params = dict(max_depth=6, learning_rate=0.3, n_estimators=25,
                      objective='multi:softprob', subsample=0.5,
                      colsample_bytree=0.5, seed=0)

    clf = DecisionTreeClassifier(max_depth=5)
    train_cross_validate(clf, X, y)
    clf = XGBClassifier(**xgb_params)
    train_cross_validate(clf, X, y)

    # write results with a freshly constructed classifier
    clf = XGBClassifier(**xgb_params)

    print('fit', datetime.datetime.now())
    clf.fit(X, y)

    print('predict', datetime.datetime.now())
    y_test = clf.predict_proba(X_test)
    y_top5 = calculate_top5(y_test, le)

    write_results(users, y_top5)
开发者ID:mircean,项目名称:ML,代码行数:35,代码来源:module2_python_v1.py

示例5: main

# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def main(args):
    """Train a decision tree on pre-2010 rows and report its hit rate afterwards.

    Args:
        args: Object with a ``signame`` attribute; it selects the module
            ``main.pandas_talib.sig_<signame>`` and is passed to
            ``build.work2``.

    Side effects:
        Writes ``ta.csv`` and ``tree.pkl``. Prints the head of the predicted
        probabilities, then the fraction of rows with ``label5 > 1`` among
        rows predicted above 0.55, then the same fraction over all test rows.
    """
    import importlib

    # import_module replaces the original `exec "import ... as conf"`: it does
    # not execute text built from user input and is valid on Python 2 and 3.
    conf = importlib.import_module("main.pandas_talib.sig_%s" % args.signame)
    build.work2(20, 'sp500Top50', args.signame)
    df = base.get_merged(conf.__name__, yeod.get_sp500Top50())
    df.to_csv("ta.csv")

    tree = DecisionTreeClassifier()

    feat_names = base.get_feat_names(df)

    dfTrain = df[(df.date >= '1970-01-01') & (df.date <= '2009-12-31')]
    npTrainFeat = dfTrain.loc[:, feat_names].values.copy()
    npTrainLabel = dfTrain.loc[:, "label5"].values.copy()
    # NOTE(review): labels exactly equal to 1.0 are left untouched and so end
    # up in class 1, while the evaluation below counts only label5 > 1.
    npTrainLabel[npTrainLabel > 1.0] = 1
    npTrainLabel[npTrainLabel < 1.0] = 0

    tree.fit(npTrainFeat, npTrainLabel)
    joblib.dump(tree, "tree.pkl", compress=3)

    # .copy() so the "pred" column is written to an independent frame rather
    # than to a slice of df.
    dfTest = df[(df.date >= '2010-01-01') & (df.date <= '2099-12-31')].copy()
    npTestFeat = dfTest.loc[:, feat_names].values.copy()

    npPred = tree.predict_proba(npTestFeat)

    dfTest.loc[:, "pred"] = npPred[:, 1]

    print(dfTest['pred'].head())

    def positive_fraction(frame):
        # An empty frame raised ZeroDivisionError in the original.
        if len(frame) == 0:
            return float('nan')
        return 1.0 * len(frame[frame['label5'] > 1]) / len(frame)

    dfPos = dfTest[dfTest['pred'] > 0.55]
    print(positive_fraction(dfPos))
    print(positive_fraction(dfTest))
开发者ID:hongbin0908,项目名称:pytrade,代码行数:33,代码来源:check_sig2.py

示例6: plot_tree

# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def plot_tree(max_depth=1):
    """Draw a depth-limited tree's decision surface next to a picture of the tree.

    Reads the module-level ``X`` (two feature columns are used) and ``y``.
    With ``max_depth == 0`` only the raw data set is drawn and the right-hand
    axes are hidden.

    Args:
        max_depth: Depth limit for the fitted tree; 0 means "no tree".
    """
    figure, axes = plt.subplots(1, 2, figsize=(15, 7))
    surface_ax = axes[0]
    tree_ax = axes[1]
    step = 0.02

    # Plot window: data range padded by 0.5 on every side.
    x_min = X[:, 0].min() - .5
    x_max = X[:, 0].max() + .5
    y_min = X[:, 1].min() - .5
    y_max = X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))

    if max_depth == 0:
        surface_ax.set_title("data set")
        tree_ax.set_visible(False)
    else:
        fitted = DecisionTreeClassifier(max_depth=max_depth, random_state=1).fit(X, y)
        grid_points = np.c_[xx.ravel(), yy.ravel()]

        proba_grid = fitted.predict_proba(grid_points)[:, 1].reshape(xx.shape)

        # Leaf id per grid point; a non-zero Laplacian marks where the leaf
        # id changes between neighbouring grid cells.
        leaf_grid = fitted.tree_.apply(grid_points.astype(np.float32)).reshape(xx.shape)
        boundary = ndimage.laplace(leaf_grid) != 0

        surface_ax.contourf(xx, yy, proba_grid, alpha=.4)
        surface_ax.scatter(xx[boundary], yy[boundary], marker='.', s=1)
        surface_ax.set_title("max_depth = %d" % max_depth)
        tree_ax.imshow(tree_image(fitted))
        tree_ax.axis("off")

    point_colors = np.array(['b', 'r'])[y]
    surface_ax.scatter(X[:, 0], X[:, 1], c=point_colors, s=60)
    surface_ax.set_xlim(x_min, x_max)
    surface_ax.set_ylim(y_min, y_max)
    surface_ax.set_xticks(())
    surface_ax.set_yticks(())
开发者ID:TheloniusJ,项目名称:scipy_2015_sklearn_tutorial,代码行数:30,代码来源:plot_interactive_tree.py

示例7: main

# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def main():
    """Estimate the vote split on the transformed test set with a decision tree.

    Loads a pickled label encoder and the transformed train/test CSV files,
    overrides four test columns with the constant 1, fits a depth-10 tree on
    the training data and sums the predicted class probabilities per party.

    Side effects:
        Prints the per-party probability sums and the combined total for
        "Greens", "Pinks" and "Whites".
    """
    # Binary mode plus a context manager: the original opened the pickle in
    # text mode (fails on Python 3) and never closed the file.
    # NOTE: pickle.load must only be used on trusted files.
    with open("encoder.pickle", "rb") as encoder_file:
        l_encoder = pickle.load(encoder_file)
    train = pd.read_csv("dataset/transformed_train.csv")
    test = pd.read_csv("dataset/transformed_test.csv")
    test["Yearly_ExpensesK"] = 1
    test["Yearly_IncomeK"] = 1
    test["Overall_happiness_score"] = 1
    test["Financial_agenda_matters"] = 1

    test["Vote"] = l_encoder.inverse_transform(test["Vote"])
    counts = pd.DataFrame()

    classifier = DecisionTreeClassifier(max_depth=10)
    classifier.fit(train.drop("Vote", axis=1), train.Vote.values)

    print("Division of voters by probabilistic prediction:")
    proba = pd.DataFrame(classifier.predict_proba(test.drop("Vote", axis=1)))
    proba.columns = l_encoder.classes_
    # Summing a probability column gives the expected number of voters.
    counts["predicted"] = proba.sum()
    print(counts)
    predicted = counts["predicted"]
    print("Total alternative coalition votes: " + str(
        predicted["Greens"] + predicted["Pinks"] + predicted["Whites"]
    ))
开发者ID:radotzki,项目名称:the-elections-challenge,代码行数:27,代码来源:alternative_coalition.py

示例8: decision_tree_prediction

# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def decision_tree_prediction(features_train, labels_train, features_test, ids):
    """Fit a constrained decision tree and write test-set probabilities to CSV.

    Args:
        features_train: Training feature matrix.
        labels_train: Training labels.
        features_test: Feature matrix to score.
        ids: Identifiers, one per row of ``features_test``.

    Side effects:
        Writes ``data/canivel_decision_tree.csv`` with header ``ID,TARGET``
        and one row per id, where TARGET is column 1 of ``predict_proba``.
    """
    import sys

    # The original also built a stratified 70/30 split here, but its result
    # was only used by commented-out code, so the split has been dropped.
    clf = DecisionTreeClassifier(criterion='gini',
                                 min_samples_split=10,
                                 max_depth=10,
                                 max_leaf_nodes=16,
                                 max_features=2)

    clf = clf.fit(features_train, labels_train)

    pred = clf.predict_proba(features_test)[:, 1]

    output_path = "data/canivel_decision_tree.csv"
    # The csv module needs a binary file on Python 2 and a text file opened
    # with newline="" on Python 3.
    if sys.version_info[0] >= 3:
        predictions_file = open(output_path, "w", newline="")
    else:
        predictions_file = open(output_path, "wb")

    # The context manager closes the file even if writing fails.
    with predictions_file:
        predictions_file_object = csv.writer(predictions_file)
        predictions_file_object.writerow(["ID", "TARGET"])
        predictions_file_object.writerows(zip(ids, pred))
开发者ID:canivel,项目名称:Kaggle-Santander,代码行数:32,代码来源:regular_classifiers.py

示例9: bloodTrain

# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def bloodTrain(data,model):
    """Train a decision tree on the per-scan blood amount and return its log loss.

    Args:
        data: Passed to ``load_scan``; the result must provide a ``blood``
            column and a ``cancer`` attribute.
        model: Destination for the pickled classifier.
            NOTE(review): treated as a file path here -- confirm with callers.

    Returns:
        ``log_loss`` of the held-out 20% against the probabilities taken
        from column 1 of ``predict_proba``.
    """
    # Load the amount of blood in each slice
    bf_df = load_scan(data)

    # Split the data (stratified 80/20)
    X_train, X_test, y_train, y_test = train_test_split(
        bf_df[['blood']],
        bf_df.cancer,
        random_state=12345,
        train_size=0.8,
        stratify=bf_df.cancer)

    print('Training patients:{}, testing patients:{}'.format(X_train.shape[0], X_test.shape[0]))

    # Models
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    # Saving the model. The original called pickle.dumps(model, clf), which
    # passes the classifier as the protocol argument and writes nothing.
    with open(model, "wb") as model_file:
        pickle.dump(clf, model_file)

    # Model evaluation on the held-out rows
    positive_proba = clf.predict_proba(X_test)[:, 1]
    return log_loss(y_test, positive_proba)
开发者ID:etheleon,项目名称:finalSubmissionDSB2017,代码行数:28,代码来源:BloodModel.py

示例10: tipdm_chapter5_id3_test

# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def tipdm_chapter5_id3_test():
    """Fit an entropy-based decision tree on the sales sheet and predict three samples.

    The first three columns of the sheet are the features and the fourth is
    the label. Categorical cells are recoded to 1 / -1 before fitting.

    Side effects:
        Writes ``tree.dot`` (Graphviz is needed to render it as pdf/png) and
        prints the new samples, their predicted labels and the predicted
        class probabilities.
    """
    # Parameter initialisation
    filename = '../../../MyFile/chapter5/data/sales_data.xls'
    data = pd.read_excel(filename, index_col=u'序号')  # load the data

    # The cells are category labels and must be turned into numbers:
    # the three "positive" values below become 1, everything else -1.
    data[data == u'高'] = 1
    data[data == u'是'] = 1
    data[data == u'好'] = 1
    data[data != 1] = -1
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent.
    x = data.iloc[:, :3].values.astype(int)
    y = data.iloc[:, 3].values.astype(int)

    dtc = DTC(criterion='entropy')  # decision tree based on information entropy
    dtc.fit(x, y)

    # Export the tree as a dot file. Only the three feature columns are
    # named: the original passed every column, label included, which does
    # not match the number of fitted features.
    with open("tree.dot", "w") as dot_file:
        export_graphviz(dtc, out_file=dot_file, feature_names=data.columns[:3])

    # Predict new data
    print('New data to be predicted!')
    data_to_be_predicted = [[-1, -1, -1], [1, 1, 1], [-1, 1, 1]]
    print(data_to_be_predicted)
    result = dtc.predict(data_to_be_predicted)
    result_proba = dtc.predict_proba(data_to_be_predicted)  # with probability
    print(result)
    print(result_proba)
开发者ID:JoshuaMichaelKing,项目名称:MyLearning,代码行数:32,代码来源:decision_tree_demo.py

示例11: test_dt

# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def test_dt():
    """Check ClassificationTree against scikit-learn's tree on breast-cancer data.

    Both models are fitted on the full data set. Their probabilities and
    predictions must agree, and the local (with and without labels) and
    global explanations must produce a visualization.
    """
    dataset = load_breast_cancer()
    X = dataset.data
    y = dataset.target

    reference = SKDT(random_state=1, max_depth=3)
    candidate = ClassificationTree(feature_names=dataset.feature_names, random_state=1)

    reference.fit(X, y)
    candidate.fit(X, y)

    assert np.allclose(reference.predict_proba(X), candidate.predict_proba(X))
    assert np.allclose(reference.predict(X), candidate.predict(X))

    # Local explanations: first with labels, then without.
    for explain_args in ((X, y), (X,)):
        local_explanation = candidate.explain_local(*explain_args)
        assert local_explanation.visualize(0) is not None

    global_explanation = candidate.explain_global()
    assert global_explanation.visualize() is not None
开发者ID:caskeep,项目名称:interpret,代码行数:34,代码来源:test_decisiontree.py

示例12: get_clfs

# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def get_clfs(rank, Nfeatures=20, Nscores=10):
    """Train a decision tree on one chunk of data and return it with its in-sample error.

    Args:
        rank: Chunk number; selects ``data/train_<rank>.csv`` and seeds
            NumPy's global random generator.
        Nfeatures: Number of score indexes drawn without replacement.
        Nscores: Exclusive upper bound of the range ``2..Nscores-1`` the
            indexes are drawn from.

    Returns:
        Tuple ``(clf, rank, etmp)`` where ``etmp`` is the log loss of the
        fitted tree on its own training data.

    Raises:
        ValueError: From ``np.random.choice`` when ``Nfeatures`` exceeds
            ``Nscores - 2``. NOTE(review): the default arguments (20 and 10)
            are such a combination -- confirm the intended defaults.
    """
    df = pd.read_csv('data/train_%d.csv' % rank, names=headers)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("%s %s" % (rank, df.shape))
    np.random.seed(rank)
    fselect = np.random.choice(range(2, Nscores), Nfeatures, replace=False)
    print("%s %s" % (rank, fselect))

    indexes = np.array(scores_indexes)[fselect]

    Nr = df.shape[0]
    # One column per selected index plus one extra column.
    X = np.zeros([Nr, len(indexes) + 1])
    y = np.zeros([Nr])

    # get_X_y is called for its effect on the preallocated X and y.
    get_X_y(X, y, df, features_touples, indexes)
    print("%s Xy read" % rank)
    del df

    if rank == 0:
        print("Size of numpy array in GB: %s" % (X.nbytes / 1.e9))
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X, y)
    y_pred = clf.predict_proba(X)
    etmp = log_loss(y, y_pred)

    del X, y
    print("IN error on rank: %s is %s" % (rank, etmp))
    return (clf, rank, etmp)
开发者ID:pzawadzk,项目名称:Data-Analysis-Miscellaneous,代码行数:31,代码来源:setup.py

示例13: DTree

# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def DTree(X, Y, XTest, YTest):
    """Fit a depth-3 decision tree, report train/test metrics and plot its ROC curve.

    Args:
        X: Training feature matrix.
        Y: Training labels (the metrics use ``average='binary'``).
        XTest: Test feature matrix.
        YTest: Test labels.

    Side effects:
        Writes ``dtreevis.pdf``, prints timing/accuracy/precision/recall for
        both splits and adds the ROC curve to the current matplotlib figure.
    """
    try:
        from StringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO  # Python 3

    print('-----------------------------------------------------')

    tree_grid = DecisionTreeClassifier(max_depth=3)
    tree_grid.fit(X, Y)

    # The original used dot_data while its creation was commented out,
    # which raised NameError.
    dot_data = StringIO()
    export_graphviz(tree_grid, out_file=dot_data)
    # NOTE(review): some pydot versions return a list from
    # graph_from_dot_data -- confirm against the installed version.
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("dtreevis.pdf")

    _report_dtree_split(tree_grid, X, Y,
                        "Computing training statistics", "training")
    _report_dtree_split(tree_grid, XTest, YTest,
                        "Computing testing statistics", "test")

    print("Creating ROC curve")
    y_score = tree_grid.predict_proba(XTest)
    # Column 0 is scored against pos_label=0.
    # NOTE(review): this assumes the first fitted class is 0 -- confirm.
    fpr, tpr, _ = metrics.roc_curve(y_true=YTest,
                                    y_score=y_score[:, 0],
                                    pos_label=0)
    plt.plot(fpr, tpr, 'r-', label='DT')


def _report_dtree_split(model, features, labels, heading, tag):
    """Print prediction time, accuracy, precision and recall of *model* on one split."""
    print(heading)
    started = time.time()
    predicted = model.predict(features)
    elapsed = time.time() - started

    accuracy = metrics.accuracy_score(labels, predicted)
    precision = metrics.precision_score(labels, predicted, average='binary')
    recall = metrics.recall_score(labels, predicted, average='binary')

    print("DT " + tag + " prediction time: " + str(elapsed))
    print("DT " + tag + " accuracy Score: " + str(accuracy))
    print("DT " + tag + " precision Score: " + str(precision))
    print("DT " + tag + " recall Score: " + str(recall))
开发者ID:jhurwitzupenn,项目名称:CIS419Project,代码行数:60,代码来源:trainClassifiers.py

示例14: main

# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def main():
    """Compare a decision tree, a random forest and a linear SVM by ROC curve.

    The tree and forest are fitted on a 75% training split. The SVM is fitted
    on a small random subset of that training split because the full data is
    too slow for it. All three are scored on the same 25% test split.

    Side effects:
        Prints progress messages and saves the plot to ``rocCurve.png``.
    """
    # Load the data
    print('Reading data...')
    main_data = pd.read_csv('../data/main_data.csv')
    targets = pd.read_csv('../data/target.csv')
    print(len(main_data.index))

    # Split the data
    print('Splitting...')
    X_train, X_test, y_train, y_test = train_test_split(
        main_data, targets, test_size=0.25, random_state=42)

    # SVM subset: 1% of the TRAINING rows. The original sampled from the full
    # data set, so rows of X_test could leak into the SVM training data while
    # the SVM was scored on X_test.
    big_array = pd.concat([X_train, y_train], axis=1).sample(frac=0.010)
    print(len(big_array.index))

    # Train tree
    print('Training Tree...')
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    print('Predicting Tree...')
    tree_pred = tree.predict_proba(X_test)[:, 1]
    tree_fpr, tree_tpr, _ = roc_curve(y_test, tree_pred)

    # Train random forest
    print('Training Random Forest...')
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    print('Predicting Random Forest...')
    rf_pred = rf.predict_proba(X_test)[:, 1]
    rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_pred)

    # Train SVM on a quarter of the sampled subset; the remainder is unused.
    svm_train, _, svm_y_train, _ = train_test_split(
        big_array.drop('TARGET', axis=1),
        big_array['TARGET'],
        test_size=0.75)
    print('Training SVM...')
    svm_c = SVC(kernel='linear', probability=True)
    svm_c.fit(svm_train, svm_y_train)
    print('Predicting SVM...')
    svm_pred = svm_c.predict_proba(X_test)[:, 1]
    svm_fpr, svm_tpr, _ = roc_curve(y_test, svm_pred)

    # Plot model comparison
    print('Creating Plot...')
    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(tree_fpr, tree_tpr, label='Tree')
    plt.plot(rf_fpr, rf_tpr, label='RF')
    plt.plot(svm_fpr, svm_tpr, label='SVM')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    print('Saving Plot...')
    plt.savefig('rocCurve.png')
开发者ID:jaydik,项目名称:santander,代码行数:59,代码来源:model_comp.py

示例15: main

# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def main(train_file='train.csv', test_file='test.csv', output_file='predict_dectree.csv'):
    """Cross-validate a decision tree by AUC, then write test-set predictions.

    Args:
        train_file: CSV with an ``ACTION`` label column and the feature columns.
        test_file: CSV with the same feature columns.
        output_file: Passed to ``create_test_submission``.

    Returns:
        0.

    Side effects:
        Prints the AUC of each of 10 random 80/20 splits and their mean, then
        hands the predicted probabilities (column 1 of ``predict_proba``) to
        ``create_test_submission``.
    """
    print("Loading data...")

    train_data = pd.read_csv(train_file)
    test_data = pd.read_csv(test_file)

    # NOTE(review): ROLE_DEPTNAME is listed twice, as in the original
    # selection -- confirm whether the duplicate is intended.
    feature_columns = ["RESOURCE", "MGR_ID", "ROLE_ROLLUP_1", "ROLE_ROLLUP_2",
                       "ROLE_DEPTNAME", "ROLE_FAMILY_DESC", "ROLE_FAMILY",
                       "ROLE_DEPTNAME", "ROLE_CODE"]
    # 1-D label vector; the original built an (n, 1) column.
    y = np.array(train_data["ACTION"])
    X = np.array(train_data[feature_columns])
    X_test = np.array(test_data[feature_columns])

    SEED = 4

    # min_density is no longer passed: the parameter was removed from
    # scikit-learn and makes the constructor fail on current versions.
    clf = DecisionTreeClassifier(criterion="entropy", min_samples_split=61,
                                 min_samples_leaf=1)

    mean_auc = 0.0
    n = 10
    for i in range(n):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.20, random_state=i * SEED)

        # if you want to perform feature selection / hyperparameter
        # optimization, this is where you want to do it

        # train model and make predictions
        clf.fit(X_train, y_train)
        preds = clf.predict_proba(X_cv)[:, 1]

        # compute AUC metric for this CV fold
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
        roc_auc = metrics.auc(fpr, tpr)
        print("AUC (fold %d/%d): %f" % (i + 1, n, roc_auc))
        mean_auc += roc_auc

    print("Mean AUC: %f" % (mean_auc / n))

    # Refit on all training rows before scoring the test set; the original
    # predicted with the model left over from the last CV fold.
    clf.fit(X, y)
    predictions = clf.predict_proba(X_test)[:, 1]

    create_test_submission(output_file, predictions)

    return 0
开发者ID:sarwarbhuiyan,项目名称:datascience-ga-amazon-kaggle,代码行数:46,代码来源:amazon_dectree.py


注:本文中的sklearn.tree.DecisionTreeClassifier.predict_proba方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。