本文整理汇总了 Python 中 sklearn.tree.DecisionTreeClassifier.predict_proba 方法的典型用法代码示例。如果您正苦于以下问题：Python DecisionTreeClassifier.predict_proba 方法的具体用法？Python DecisionTreeClassifier.predict_proba 怎么用？Python DecisionTreeClassifier.predict_proba 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 sklearn.tree.DecisionTreeClassifier 的用法示例。
在下文中一共展示了 DecisionTreeClassifier.predict_proba 方法的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: evaluateDecisionTree
# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def evaluateDecisionTree(train_x, train_y, test_x, test_y):
    """Fit an entropy-criterion decision tree and return its ROC AUC on the test data.

    The tree is built with ``min_samples_leaf=5`` and ``max_depth=20``. The
    scores passed to ``roc_auc_score`` are also handed to ``plotAUC`` under
    the label ``'DT'``.

    Args:
        train_x: Training features passed to ``clf.fit``.
        train_y: Training labels passed to ``clf.fit``.
        test_x: Features to score.
        test_y: True labels matching ``test_x``.

    Returns:
        The value returned by ``roc_auc_score(test_y, scores)``.
    """
    clf = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=5, max_depth=20)
    clf.fit(train_x, train_y)
    # Second predict_proba column -- presumably the positive class of a
    # binary problem; verify against clf.classes_.
    p = clf.predict_proba(test_x)[:, 1]
    auc = roc_auc_score(test_y, p)
    # Reuse the scores computed above instead of running predict_proba twice.
    plotAUC(test_y, p, 'DT')
    return auc
示例2: programmer_2
# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def programmer_2():
    """Train a CART tree on ``data/model.xls`` and display its diagnostics.

    The first three columns are used as features and the fourth as the
    label. 80% of the shuffled rows train the tree; the remaining 20% are
    used for the ROC curve. The fitted tree is dumped to ``tmp/tree.pkl``,
    the training-set confusion matrix is shown via ``cm_plot``, the ROC
    curve is plotted, and the ROC thresholds are printed.
    """
    datafile = 'data/model.xls'
    data = pd.read_excel(datafile)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the
    # same ndarray and also exists in older pandas releases.
    data = data.values
    shuffle(data)  # randomly shuffle the data

    # Use an 8:2 train/test ratio.
    p = 0.8
    split = int(len(data) * p)
    train = data[:split, :]
    test = data[split:, :]

    # Build the CART decision-tree model.
    treefile = 'tmp/tree.pkl'
    tree = DecisionTreeClassifier()
    tree.fit(train[:, :3], train[:, 3])
    joblib.dump(tree, treefile)

    # Show the confusion-matrix visualisation (computed on the training rows).
    cm_plot(train[:, 3], tree.predict(train[:, :3])).show()

    # Note that scikit-learn's predict method returns the predicted labels
    # directly; predict_proba is used here to obtain scores for the ROC curve.
    fpr, tpr, thresholds = roc_curve(
        test[:, 3], tree.predict_proba(test[:, :3])[:, 1], pos_label=1)
    plt.plot(fpr, tpr, linewidth=2, label='ROC of CART', color='green')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # Set the axis limits.
    plt.ylim(0, 1.05)
    plt.xlim(0, 1.05)
    plt.legend(loc=4)
    plt.show()
    print(thresholds)
示例3: MultEstimator
# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
class MultEstimator(BaseEstimator):
    """One decision tree per category, with a global tree as fallback.

    The last column of ``X`` is read as the category of each row; the
    remaining columns are the features. ``fit`` trains one global tree on
    all rows plus one tree per entry of ``categories``. At prediction time
    every row first gets the global tree's output, which is then overwritten
    by the matching per-category tree where one exists.
    """

    def __init__(self, categories):
        # Iterable of category values for which dedicated trees are trained.
        self.categories = categories

    def fit(self, X, y, **params):
        """Train the global tree and the per-category trees.

        Args:
            X: 2-D array; last column is the category, the rest are features.
            y: Labels, indexable by a boolean row mask.
            **params: Accepted for interface compatibility; not used.

        Returns:
            ``self``, so calls can be chained.
        """
        self.models = {c: None for c in self.categories}
        self.tot_model = DecisionTreeClassifier(max_depth=8, min_samples_leaf=100)
        categ = X[:, -1]
        data = X[:, :-1]
        self.tot_model.fit(data, y)
        for c in self.models:
            mask = categ == c
            if not mask.any():
                # No training rows for this category: leave the entry as None
                # so predictions fall back to the global tree instead of
                # failing while fitting on an empty selection.
                continue
            m = DecisionTreeClassifier(max_depth=8, min_samples_leaf=100)
            m.fit(data[mask], y[mask])
            self.models[c] = m
        return self

    def predict(self, X):
        """Predict labels, preferring the per-category tree of each row."""
        categ = X[:, -1]
        data = X[:, :-1]
        p = self.tot_model.predict(data)
        for c, model in self.models.items():
            if model is None:
                continue
            mask = categ == c
            if mask.any():
                p[mask] = model.predict(data[mask])
        return p

    def predict_proba(self, X):
        """Predict class probabilities, preferring the per-category trees.

        NOTE(review): the row assignment below assumes every per-category
        tree produces as many probability columns as the global tree --
        confirm that each category's training rows contain all classes.
        """
        categ = X[:, -1]
        data = X[:, :-1]
        p = self.tot_model.predict_proba(data)
        for c, model in self.models.items():
            if model is None:
                continue
            mask = categ == c
            if mask.any():
                p[mask] = model.predict_proba(data[mask])
        return p
示例4: main
# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def main():
    """Cross-validate two classifiers, then fit XGBoost and write top-5 results.

    A depth-5 decision tree and an XGBoost classifier are each passed to
    ``train_cross_validate``. A fresh XGBoost classifier is then fitted on
    the full training data, its class probabilities for the test data are
    converted with ``calculate_top5`` and handed to ``write_results``.
    """
    print('Start', datetime.datetime.now())

    # Train data.
    X, y, features_encoding, enc_X, le = prepare_train_data()
    # Test data. This call must be active: X_test and users are needed for
    # the final prediction and for write_results below.
    X_test, users = prepare_test_data(features_encoding, enc_X)

    xgb_params = dict(max_depth=6, learning_rate=0.3, n_estimators=25,
                      objective='multi:softprob', subsample=0.5,
                      colsample_bytree=0.5, seed=0)

    # Other evaluation helpers that can be swapped in here:
    # train_and_test(clf, X, y) and train_compare_classifiers(X, y).
    clf = DecisionTreeClassifier(max_depth=5)
    train_cross_validate(clf, X, y)
    clf = XGBClassifier(**xgb_params)
    train_cross_validate(clf, X, y)

    # Write results using a freshly constructed model fitted on all rows.
    clf = XGBClassifier(**xgb_params)
    print('fit', datetime.datetime.now())
    clf.fit(X, y)
    print('predict', datetime.datetime.now())
    y_test = clf.predict_proba(X_test)
    y_top5 = calculate_top5(y_test, le)
    write_results(users, y_top5)
示例5: main
# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def main(args):
    """Train a decision tree on pre-2010 rows and report its hit rate afterwards.

    The signal configuration module ``main.pandas_talib.sig_<signame>`` is
    imported dynamically. Rows dated up to 2009-12-31 train the tree; rows
    from 2010-01-01 on are scored. Two ratios are printed: the share of
    rows with ``label5 > 1`` among rows scored above 0.55, and the same
    share over all test rows.

    Args:
        args: Object with a ``signame`` attribute naming the signal module.
    """
    import importlib

    # importlib replaces the original `exec "import ... as conf"` statement:
    # same module object, no string-built code, valid on Python 2.7 and 3.
    conf = importlib.import_module("main.pandas_talib.sig_%s" % args.signame)
    build.work2(20, 'sp500Top50', args.signame)
    df = base.get_merged(conf.__name__, yeod.get_sp500Top50())
    df.to_csv("ta.csv")

    tree = DecisionTreeClassifier()
    feat_names = base.get_feat_names(df)

    dfTrain = df[(df.date >= '1970-01-01') & (df.date <= '2009-12-31')]
    npTrainFeat = dfTrain.loc[:, feat_names].values.copy()
    npTrainLabel = dfTrain.loc[:, "label5"].values.copy()
    # Binarise the label around 1.0; values exactly equal to 1.0 are left
    # unchanged by these two assignments.
    npTrainLabel[npTrainLabel > 1.0] = 1
    npTrainLabel[npTrainLabel < 1.0] = 0
    tree.fit(npTrainFeat, npTrainLabel)
    joblib.dump(tree, "tree.pkl", compress=3)

    # Copy the selection so that adding the "pred" column writes to an
    # independent frame rather than to a slice of df.
    dfTest = df[(df.date >= '2010-01-01') & (df.date <= '2099-12-31')].copy()
    npTestFeat = dfTest.loc[:, feat_names].values.copy()
    npPred = tree.predict_proba(npTestFeat)
    dfTest.loc[:, "pred"] = npPred[:, 1]
    print(dfTest['pred'].head())

    dfPos = dfTest[dfTest['pred'] > 0.55]
    # NOTE(review): both ratios raise ZeroDivisionError when their
    # denominator frame is empty -- confirm whether that can occur.
    print(1.0 * len(dfPos[dfPos['label5'] > 1]) / len(dfPos))
    print(1.0 * len(dfTest[dfTest['label5'] > 1]) / len(dfTest))
示例6: plot_tree
# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def plot_tree(max_depth=1):
    """Plot the decision regions of a depth-limited tree beside the tree image.

    Uses the module-level ``X`` and ``y``. With ``max_depth == 0`` only the
    raw data set is drawn and the second axes is hidden; otherwise a tree
    of the given depth is fitted, its class-1 probability surface and leaf
    borders are drawn on the left, and ``tree_image(tree)`` on the right.

    Args:
        max_depth: Depth limit passed to ``DecisionTreeClassifier``; ``0``
            means "show the data only".
    """
    fig, ax = plt.subplots(1, 2, figsize=(15, 7))
    region_ax, tree_ax = ax[0], ax[1]

    # Plot window: data range padded by 0.5 on each side, sampled every h.
    h = 0.02
    x_min = X[:, 0].min() - .5
    x_max = X[:, 0].max() + .5
    y_min = X[:, 1].min() - .5
    y_max = X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    if max_depth == 0:
        region_ax.set_title("data set")
        tree_ax.set_visible(False)
    else:
        grid = np.c_[xx.ravel(), yy.ravel()]
        tree = DecisionTreeClassifier(max_depth=max_depth, random_state=1).fit(X, y)

        Z = tree.predict_proba(grid)[:, 1]
        Z = Z.reshape(xx.shape)

        # Leaf index of every grid point; a non-zero Laplacian marks the
        # places where neighbouring grid points land in different leaves.
        faces = tree.tree_.apply(grid.astype(np.float32))
        faces = faces.reshape(xx.shape)
        border = ndimage.laplace(faces) != 0

        region_ax.contourf(xx, yy, Z, alpha=.4)
        region_ax.scatter(xx[border], yy[border], marker='.', s=1)
        region_ax.set_title("max_depth = %d" % max_depth)
        tree_ax.imshow(tree_image(tree))
        tree_ax.axis("off")

    # The training points are drawn in both modes.
    region_ax.scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60)
    region_ax.set_xlim(x_min, x_max)
    region_ax.set_ylim(y_min, y_max)
    region_ax.set_xticks(())
    region_ax.set_yticks(())
示例7: main
# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def main():
    """Estimate the vote split on the test set from tree class probabilities.

    A depth-10 decision tree is trained on ``dataset/transformed_train.csv``
    and applied to ``dataset/transformed_test.csv`` after four columns of
    the test frame have been overwritten with the constant 1. The per-class
    probabilities are summed over all test rows and printed, followed by
    the combined total for "Greens", "Pinks" and "Whites".
    """
    # NOTE: pickle.load can execute arbitrary code; only load a trusted file.
    # Binary mode is required for pickles, and the with-block closes the file.
    with open("encoder.pickle", "rb") as encoder_file:
        l_encoder = pickle.load(encoder_file)

    train = pd.read_csv("dataset/transformed_train.csv")
    test = pd.read_csv("dataset/transformed_test.csv")

    # Overwrite these test columns with the constant 1 (`+1` is just 1).
    test["Yearly_ExpensesK"] = +1
    test["Yearly_IncomeK"] = +1
    test["Overall_happiness_score"] = +1
    test["Financial_agenda_matters"] = 1

    test["Vote"] = l_encoder.inverse_transform(test["Vote"])

    counts = pd.DataFrame()
    # counts['real'] = test.Vote.value_counts()

    classifier = DecisionTreeClassifier(max_depth=10)
    classifier.fit(train.drop("Vote", axis=1), train.Vote.values)

    print("Division of voters by probabilistic prediction:")
    proba = pd.DataFrame(classifier.predict_proba(test.drop("Vote", axis=1)))
    # NOTE(review): assumes the encoder's class order matches the column
    # order of predict_proba -- confirm against classifier.classes_.
    proba.columns = l_encoder.classes_
    counts["predicted"] = proba.sum()
    # counts['difference'] = counts.real - counts.predicted
    print(counts)

    predicted = counts["predicted"]
    print("Total alternative coalition votes: " + str(
        predicted["Greens"] + predicted["Pinks"] + predicted["Whites"]
    ))
示例8: decision_tree_prediction
# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def decision_tree_prediction(features_train, labels_train, features_test, ids):
    """Fit a small gini tree on all training rows and write test scores to CSV.

    The output file ``data/canivel_decision_tree.csv`` gets a header row
    ``ID,TARGET`` followed by one ``(id, score)`` row per test sample, where
    the score is the second column of ``predict_proba``.

    Args:
        features_train: Training features.
        labels_train: Training labels.
        features_test: Features to score.
        ids: Identifiers paired positionally with the test scores.
    """
    # The 70/30 hold-out split that used to be computed here was never used
    # (the final model is fitted on every training row), so it is omitted.
    clf = DecisionTreeClassifier(criterion='gini',
                                 min_samples_split=10,
                                 max_depth=10,
                                 max_leaf_nodes=16,
                                 max_features=2)
    clf = clf.fit(features_train, labels_train)
    pred = clf.predict_proba(features_test)[:, 1]

    # The with-block guarantees the file is closed even if writing fails.
    # NOTE(review): "wb" is the csv convention on Python 2; on Python 3 the
    # csv module needs text mode ("w", newline="") -- confirm the target version.
    with open("data/canivel_decision_tree.csv", "wb") as predictions_file:
        predictions_file_object = csv.writer(predictions_file)
        predictions_file_object.writerow(["ID", "TARGET"])
        predictions_file_object.writerows(zip(ids, pred))
示例9: bloodTrain
# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def bloodTrain(data, model):
    """Train a decision tree on the ``blood`` feature and return its test log loss.

    Args:
        data: Passed to ``load_scan``, which must return a frame with
            ``blood`` and ``cancer`` columns.
        model: Where to save the fitted classifier: either a writable binary
            file object or a file path (presumably a path -- verify against
            the caller).

    Returns:
        ``log_loss`` of the second ``predict_proba`` column on the held-out
        20% of the rows.
    """
    # Load the amount of blood in each slice.
    bf_df = load_scan(data)

    # Split the data, stratified on the label.
    X_train, X_test, y_train, y_test = train_test_split(bf_df[['blood']],
                                                        bf_df.cancer,
                                                        random_state=12345,
                                                        train_size=0.8,
                                                        stratify=bf_df.cancer)
    print('Training patients:{}, testing patients:{}'.format(X_train.shape[0], X_test.shape[0]))

    # Models
    # clf = SVC(probability=True)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    # Saving the model. The original call, pickle.dumps(model, clf), passed
    # the classifier as the pickle *protocol* and never stored anything.
    if hasattr(model, 'write'):
        pickle.dump(clf, model)
    else:
        with open(model, 'wb') as model_file:
            pickle.dump(clf, model_file)

    # Model evaluation on the second predict_proba column.
    X_pred = clf.predict_proba(X_test)[:, 1]
    test = log_loss(y_test, X_pred)
    return test
示例10: tipdm_chapter5_id3_test
# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def tipdm_chapter5_id3_test():
    """Fit an entropy-based tree on the sales data and predict three new rows.

    The spreadsheet's categorical cells are mapped to 1 / -1, the first
    three columns are used as features and the fourth as the label. The
    fitted tree is exported to ``tree.dot``; predictions and class
    probabilities for three hand-written samples are printed.
    """
    # Parameter initialisation.
    filename = '../../../MyFile/chapter5/data/sales_data.xls'
    data = pd.read_excel(filename, index_col=u'序号')  # load the data

    # The data are category labels and must be converted to numbers:
    # "高", "好" and "是" become 1; everything else ("低", "坏", "否") becomes -1.
    data[data == u'高'] = 1
    data[data == u'是'] = 1
    data[data == u'好'] = 1
    data[data != 1] = -1

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent.
    x = data.iloc[:, :3].values.astype(int)
    y = data.iloc[:, 3].values.astype(int)

    dtc = DTC(criterion='entropy')  # decision tree based on information entropy
    dtc.fit(x, y)

    # Visualise the tree. The result is a dot file; GraphViz is needed to
    # convert it to pdf or png.
    with open("tree.dot", "w") as f:
        # Only the three feature columns are named; passing every column
        # would also include the label column. The file handle is no longer
        # rebound to export_graphviz's return value.
        export_graphviz(dtc, out_file=f, feature_names=data.columns[:3])

    # Predict new data.
    print('New data to be predicted!')
    data_to_be_predicted = [[-1, -1, -1], [1, 1, 1], [-1, 1, 1]]
    print(data_to_be_predicted)
    result = dtc.predict(data_to_be_predicted)
    result_proba = dtc.predict_proba(data_to_be_predicted)  # with probability
    print(result)
    print(result_proba)
示例11: test_dt
# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def test_dt():
    """Check ``ClassificationTree`` against scikit-learn and smoke-test explanations.

    Both trees are fitted on the breast-cancer data; their ``predict_proba``
    and ``predict`` outputs must agree, and local (with and without labels)
    and global explanations must each produce a visualisation.
    """
    dataset = load_breast_cancer()
    X, y = dataset.data, dataset.target
    feature_names = dataset.feature_names

    sk_dt = SKDT(random_state=1, max_depth=3)
    our_dt = ClassificationTree(feature_names=feature_names, random_state=1)
    sk_dt.fit(X, y)
    our_dt.fit(X, y)

    # Probabilities first, then hard labels -- same order as before.
    for method in ("predict_proba", "predict"):
        sk_pred = getattr(sk_dt, method)(X)
        our_pred = getattr(our_dt, method)(X)
        assert np.allclose(sk_pred, our_pred)

    # Local explanations: with labels, then without labels.
    for explain_args in ((X, y), (X,)):
        local_expl = our_dt.explain_local(*explain_args)
        local_viz = local_expl.visualize(0)
        assert local_viz is not None

    global_expl = our_dt.explain_global()
    global_viz = global_expl.visualize()
    assert global_viz is not None
示例12: get_clfs
# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def get_clfs(rank, Nfeatures=20, Nscores=10):
""" Traning decision tree on a chank of data and returns predictions"""
df = pd.read_csv('data/train_%d.csv'%rank, names=headers)
print rank, df.shape
np.random.seed(rank)
fselect = np.random.choice(range(2, Nscores), Nfeatures, replace = False)
print rank, fselect
indexes = np.array(scores_indexes)[fselect]
Nr, Nc = df.shape
Nf = len(indexes)
X = np.zeros([Nr,Nf+1])
y = np.zeros([Nr])
get_X_y(X, y, df, features_touples, indexes)
print rank, 'Xy read'
del df
if rank == 0: print 'Size of numpy array in GB:', X.nbytes/1.e9
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X, y)
y_pred = clf.predict_proba(X)
etmp = log_loss(y, y_pred)
del X, y
print 'IN error on rank:', rank, 'is', etmp
return (clf, rank, etmp)
示例13: DTree
# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def DTree(X, Y, XTest, YTest):
    """Fit a depth-3 tree, export it to PDF, print metrics and plot its ROC curve.

    Prints prediction time, accuracy, precision and recall for the training
    and the test data, writes the tree to ``dtreevis.pdf`` and adds the ROC
    curve (class 0 treated as the positive label) to the current
    matplotlib figure.

    Args:
        X: Training features.
        Y: Training labels.
        XTest: Test features.
        YTest: Test labels.
    """
    # The buffer that export_graphviz writes into must exist; its creation
    # was commented out, which made the export fail with a NameError.
    try:
        from StringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO  # Python 3

    print('-----------------------------------------------------')
    # A grid search over max_depth (GridSearchCV with
    # {'max_depth': np.arange(1, 15)}) can be substituted for the fixed
    # depth used here.
    tree_grid = DecisionTreeClassifier(max_depth=3)
    tree_grid.fit(X, Y)

    dot_data = StringIO()
    export_graphviz(tree_grid, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("dtreevis.pdf")

    # (heading word, label word, features, labels) for each evaluated split.
    splits = (("training", "training", X, Y),
              ("testing", "test", XTest, YTest))
    for heading, label, features, truth in splits:
        print("Computing " + heading + " statistics")
        predict_time = time.time()
        predicted = tree_grid.predict(features)
        predict_time = time.time() - predict_time
        accuracy = metrics.accuracy_score(truth, predicted)
        precision = metrics.precision_score(truth, predicted, average='binary')
        recall = metrics.recall_score(truth, predicted, average='binary')
        print("DT " + label + " prediction time: " + str(predict_time))
        print("DT " + label + " accuracy Score: " + str(accuracy))
        print("DT " + label + " precision Score: " + str(precision))
        print("DT " + label + " recall Score: " + str(recall))

    print("Creating ROC curve")
    y_true = YTest
    y_score = tree_grid.predict_proba(XTest)
    # Column 0 is scored against pos_label=0 -- assumes class 0 is the first
    # entry of tree_grid.classes_; TODO confirm.
    fpr, tpr, _ = metrics.roc_curve(y_true=y_true,
                                    y_score=y_score[:, 0],
                                    pos_label=0)
    plt.plot(fpr, tpr, 'r-', label='DT')
示例14: main
# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def main():
    """Compare ROC curves of a decision tree, a random forest and a linear SVM.

    All three models are evaluated on the same 25% hold-out split. The tree
    and forest are trained on the full training split; the SVM is trained
    on a small random subset because the full data takes too long. The
    resulting plot is saved to ``rocCurve.png``.
    """
    # Load the data.
    print('Reading data...')
    main_data = pd.read_csv('../data/main_data.csv')
    targets = pd.read_csv('../data/target.csv')
    big_array = pd.concat([main_data, targets], axis=1)
    big_array = big_array.sample(frac=0.010)
    print(len(main_data.index))
    print(len(big_array.index))

    # Split the data.
    print('Splitting...')
    X_train, X_test, y_train, y_test = train_test_split(main_data, targets, test_size=0.25, random_state=42)

    # The SVM subset was sampled from *all* rows, so it can contain rows of
    # the hold-out set it is later evaluated on. Remove those rows so the
    # SVM's ROC curve is not inflated by having seen test data.
    big_array = big_array.drop(X_test.index, errors='ignore')

    # Train tree.
    print('Training Tree...')
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    print('Predicting Tree...')
    tree_pred = tree.predict_proba(X_test)[:, 1]
    tree_fpr, tree_tpr, _ = roc_curve(y_test, tree_pred)

    # Train random forest.
    print('Training Random Forest...')
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    print('Predicting Random Forest...')
    rf_pred = rf.predict_proba(X_test)[:, 1]
    rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_pred)

    # Train SVM on a subset only: there is far too much data and training
    # on all of it takes too long. Only the "train" quarter of this
    # secondary split is used; the rest is discarded.
    svm_train, _, svm_y_train, _ = train_test_split(big_array.drop('TARGET', axis=1),
                                                    big_array['TARGET'],
                                                    test_size=0.75)
    print('Training SVM...')
    svm_c = SVC(kernel='linear', probability=True)
    svm_c.fit(svm_train, svm_y_train)
    print('Predicting SVM...')
    svm_pred = svm_c.predict_proba(X_test)[:, 1]
    svm_fpr, svm_tpr, _ = roc_curve(y_test, svm_pred)

    # Plot the model comparison.
    print('Creating Plot...')
    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(tree_fpr, tree_tpr, label='Tree')
    plt.plot(rf_fpr, rf_tpr, label='RF')
    plt.plot(svm_fpr, svm_tpr, label='SVM')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    print('Saving Plot...')
    plt.savefig('rocCurve.png')
示例15: main
# 需要导入模块: from sklearn.tree import DecisionTreeClassifier [as 别名]
# 或者: from sklearn.tree.DecisionTreeClassifier import predict_proba [as 别名]
def main(train_file='train.csv', test_file='test.csv', output_file='predict_dectree.csv'):
    """Estimate a decision tree's AUC by repeated hold-out, then write test predictions.

    Ten random 80/20 splits of the training data are used to print a
    per-split and a mean AUC. The tree is then refitted on the complete
    training data and its scores for the test file are handed to
    ``create_test_submission``.

    Args:
        train_file: CSV with an ``ACTION`` label column and the feature columns.
        test_file: CSV with the same feature columns.
        output_file: Destination passed to ``create_test_submission``.

    Returns:
        ``0`` on completion.
    """
    print("Loading data...")
    train_data = pd.read_csv(train_file)
    test_data = pd.read_csv(test_file)

    # One shared column list keeps train and test features aligned.
    # NOTE(review): "ROLE_DEPTNAME" is listed twice, exactly as in the
    # original selections -- confirm whether the duplicate is intended.
    feature_cols = ["RESOURCE", "MGR_ID", "ROLE_ROLLUP_1", "ROLE_ROLLUP_2",
                    "ROLE_DEPTNAME", "ROLE_FAMILY_DESC", "ROLE_FAMILY",
                    "ROLE_DEPTNAME", "ROLE_CODE"]
    y = np.array(train_data[["ACTION"]])
    X = np.array(train_data[feature_cols])
    X_test = np.array(test_data[feature_cols])

    SEED = 4
    # The classifier is only constructed here; it is fitted per split below
    # and once more on all rows before the final prediction.
    clf = DecisionTreeClassifier(criterion="entropy", min_samples_split=61,
                                 min_samples_leaf=1, min_density=0.1)

    mean_auc = 0.0
    n = 10
    for i in range(n):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.20, random_state=i*SEED)
        # If you want to perform feature selection / hyperparameter
        # optimization, this is where you want to do it.

        # Train the model and make predictions.
        clf.fit(X_train, y_train)
        preds = clf.predict_proba(X_cv)[:, 1]

        # Compute the AUC metric for this CV fold.
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
        roc_auc = metrics.auc(fpr, tpr)
        print("AUC (fold %d/%d): %f" % (i + 1, n, roc_auc))
        mean_auc += roc_auc

    print("Mean AUC: %f" % (mean_auc/n))

    # Refit on every training row: otherwise the submission would come from
    # the model left over from the last split, trained on only 80% of the data.
    clf.fit(X, y)
    predictions = clf.predict_proba(X_test)[:, 1]
    create_test_submission(output_file, predictions)
    return 0