This page collects typical usage examples of the Python method sklearn.ensemble.AdaBoostClassifier.decision_function. If you are wondering what AdaBoostClassifier.decision_function does, how to call it, or want to see it used in context, the curated code examples below should help. You can also explore further usage examples of the containing class, sklearn.ensemble.AdaBoostClassifier.
Shown below are 15 code examples of AdaBoostClassifier.decision_function, sorted by popularity by default.
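Before the examples, here is a minimal, self-contained sketch of what decision_function returns. It is not taken from any example below; the synthetic dataset and hyperparameters are illustrative only.

# Minimal illustrative sketch (not from the examples below): synthetic data,
# arbitrary hyperparameters.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
clf = AdaBoostClassifier(n_estimators=50, random_state=0).fit(X, y)

# For a binary problem, decision_function returns one score per sample;
# positive scores favor classes_[1], negative scores favor classes_[0].
# For K > 2 classes it instead returns an (n_samples, K) array.
scores = clf.decision_function(X)
print(scores.shape)  # (200,)
print(np.array_equal(clf.predict(X), (scores > 0).astype(int)))  # True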

Example 1: main
# Required import: from sklearn.ensemble import AdaBoostClassifier [as alias]
# Or: from sklearn.ensemble.AdaBoostClassifier import decision_function [as alias]
def main():
    print 'Loading training data ...'
    data_train = pd.read_csv('csv/CamKt12LCTopoSplitFilteredMu100SmallR30YCut414tev_350_500_vxp_0_99-merged.csv')
    r = np.random.rand(data_train.shape[0])
    # Algorithm = 'AKT10LCTRIM530'
    plt.figure(1)
    Y_train = data_train['label'][r < 0.9]
    # W_train = data_train['weight'][r < 0.9]
    Y_valid = data_train['label'][r >= 0.9]
    # W_valid = data_train['weight'][r >= 0.9]
    # data_train.drop('AKT10LCTRIM530_MassDropSplit', axis=1, inplace=True)
    for varset in itertools.combinations(data_train.columns.values[1:-1], 2):
        print list(varset)
        X_train = data_train[list(varset)][r < 0.9]
        X_valid = data_train[list(varset)][r >= 0.9]
        # gbc = Pipeline([("scale", StandardScaler()), ("gbc", GBC(n_estimators=1, verbose=1, max_depth=10, min_samples_leaf=50))])
        # gbc = GBC(n_estimators=20, verbose=1, max_depth=10, min_samples_leaf=50)
        # gbc = GaussianNB()
        dt = DC(max_depth=3, min_samples_leaf=0.05 * len(X_train))
        abc = ABC(dt, algorithm='SAMME',
                  n_estimators=800,
                  learning_rate=0.5)
        print 'Training classifier with all the data..'
        abc.fit(X_train.values, Y_train.values)  # sample_weight=W_train.values
        print 'Done.. Applying to validation sample and drawing ROC'
        prob_predict_valid = abc.predict(X_valid)  # [:,1]
        print prob_predict_valid
        Y_score = abc.decision_function(X_valid.values)
        print Y_score
        fpr, tpr, _ = roc_curve(Y_valid.values, Y_score)  # W_valid.values
        labelstring = 'And'.join(var.replace('_', '') for var in varset)
        print labelstring
        plt.plot(tpr, (1 - fpr), label=labelstring)
        plt.figure(2)
        plt.hist(abc.decision_function(X_valid[Y_valid == 1.]).ravel(),
                 color='r', alpha=0.5, range=(-1.0, 1.0), bins=50)
        plt.hist(abc.decision_function(X_valid[Y_valid == 0.]).ravel(),
                 color='b', alpha=0.5, range=(-1.0, 1.0), bins=50)
        plt.xlabel("scikit-learn BDT output")
        plt.savefig(labelstring + 'bdtout.pdf')
        # labelstring = ' and '.join(var.replace(Algorithm, '') for var in varset)
        plt.figure(1)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.ylabel('1 - Background Efficiency')
    plt.xlabel('Signal Efficiency')
    plt.title('ROC Curve')
    plt.legend(loc="lower left", prop={'size': 6})
    # plt.show()
    plt.savefig('rocmva.pdf')

Example 2: test_iris
# Required import: from sklearn.ensemble import AdaBoostClassifier [as alias]
# Or: from sklearn.ensemble.AdaBoostClassifier import decision_function [as alias]
def test_iris():
    # Check consistency on dataset iris.
    classes = np.unique(iris.target)
    clf_samme = prob_samme = None
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)
        assert_array_equal(classes, clf.classes_)
        proba = clf.predict_proba(iris.data)
        if alg == "SAMME":
            clf_samme = clf
            prob_samme = proba
        assert_equal(proba.shape[1], len(classes))
        assert_equal(clf.decision_function(iris.data).shape[1], len(classes))
        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % (alg, score)
    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    clf_samme.algorithm = "SAMME.R"
    assert_array_less(0,
                      np.abs(clf_samme.predict_proba(iris.data) - prob_samme))

Example 3: test_classification_toy
# Required import: from sklearn.ensemble import AdaBoostClassifier [as alias]
# Or: from sklearn.ensemble.AdaBoostClassifier import decision_function [as alias]
def test_classification_toy():
    # Check classification on a toy dataset.
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg, random_state=0)
        clf.fit(X, y_class)
        assert_array_equal(clf.predict(T), y_t_class)
        assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_)
        assert_equal(clf.predict_proba(T).shape, (len(T), 2))
        assert_equal(clf.decision_function(T).shape, (len(T),))

Example 4: n1check
# Required import: from sklearn.ensemble import AdaBoostClassifier [as alias]
# Or: from sklearn.ensemble.AdaBoostClassifier import decision_function [as alias]
def n1check(d_train, d_test, opts):
    # Load the data with no weights and put it into pandas format
    # for easier manipulation
    pd_train = pd.DataFrame(d_train.getDataNoWeight())
    pd_test = pd.DataFrame(d_test.getDataNoWeight())
    # Holder for results
    results = {}
    # Set up the classifier
    clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=opts.maxdepth),
                             n_estimators=opts.ntrees,
                             learning_rate=opts.lrate)
    # Train the classifier on the total data set for comparison
    clf.fit(pd_train, d_train.targets)
    results['total'] = roc_auc_score(d_test.targets, clf.decision_function(pd_test))
    # Loop over the variables and store the results in a dict
    keys = d_train.t_varnames
    for i in range(len(keys)):
        sub_train = pd_train.drop(i, axis=1)
        sub_test = pd_test.drop(i, axis=1)
        clf.fit(sub_train, d_train.targets)
        results[keys[i]] = roc_auc_score(d_test.targets, clf.decision_function(sub_test))
    # Now that we have the results, print all information
    print "--------------------------------------------"
    for key in results:
        print "Leaving out ", key, "gives score: ", results[key]
    print ""

Example 5: test_iris
# Required import: from sklearn.ensemble import AdaBoostClassifier [as alias]
# Or: from sklearn.ensemble.AdaBoostClassifier import decision_function [as alias]
def test_iris():
    """Check consistency on dataset iris."""
    classes = np.unique(iris.target)
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)
        assert_array_equal(classes, clf.classes_)
        assert_equal(clf.predict_proba(iris.data).shape[1], len(classes))
        assert_equal(clf.decision_function(iris.data).shape[1], len(classes))
        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % (alg, score)

Example 6: main
# Required import: from sklearn.ensemble import AdaBoostClassifier [as alias]
# Or: from sklearn.ensemble.AdaBoostClassifier import decision_function [as alias]
def main():
    Algorithm = 'CamKt12LCTopoSplitFilteredMu67SmallR0YCut9'
    print 'Loading training data ...'
    data_train = pd.read_csv(Algorithm + 'merged.csv')
    r = np.random.rand(data_train.shape[0])
    # Set label and weight vectors - and drop any unwanted training ones
    Y_train = data_train['label'].values[r < 0.5]
    # W_train = data_train['weight'].values[r < 0.9]
    Y_valid = data_train['label'].values[r >= 0.5]
    # W_valid = data_train['weight'].values[r >= 0.9]
    # data_train.drop('AKT10LCTRIM530_MassDropSplit', axis=1, inplace=True)
    varcombinations = itertools.combinations(data_train.columns.values[1:-1], 2)
    fac = lambda n: 1 if n < 2 else n * fac(n - 1)
    combos = lambda n, k: fac(n) / fac(k) / fac(n - k)
    colors = plt.get_cmap('jet')(np.linspace(0, 1.0, combos(len(data_train.columns.values[1:-1]), 2)))
    for varset, color in zip(varcombinations, colors):
        print list(varset)
        X_train = data_train[list(varset)].values[r < 0.5]
        X_valid = data_train[list(varset)].values[r >= 0.5]
        dt = DC(max_depth=3, min_samples_leaf=0.05 * len(X_train))
        abc = ABC(dt, algorithm='SAMME',
                  n_estimators=8,
                  learning_rate=0.5)
        print 'Training classifier with all the data..'
        abc.fit(X_train, Y_train)
        print 'Done.. Applying to validation sample and drawing ROC'
        prob_predict_valid = abc.predict_proba(X_valid)[:, 1]
        Y_score = abc.decision_function(X_valid)
        fpr, tpr, _ = roc_curve(Y_valid, prob_predict_valid)
        labelstring = ' And '.join(var.replace('_', '') for var in varset)
        print labelstring
        plt.plot(tpr, (1 - fpr), label=labelstring, color=color)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.ylabel('1 - Background Efficiency')
    plt.xlabel('Signal Efficiency')
    plt.title(Algorithm + ' ROC Curve')
    plt.legend(loc="lower left", prop={'size': 6})
    plt.savefig(Algorithm + 'rocmva.pdf')

Example 7: ada_boost
# Required import: from sklearn.ensemble import AdaBoostClassifier [as alias]
# Or: from sklearn.ensemble.AdaBoostClassifier import decision_function [as alias]
def ada_boost(X_train, X_test, y_train, y_test, C=1):
    X1 = []
    X2 = []
    y1 = []
    y2 = []
    for x, y in zip(X_train, y_train):
        if y == 1:
            y1.append(y)
            X1.append(x)
        else:
            y2.append(y)
            X2.append(x)
    print(y1.count(1))
    print(y2.count(0))
    X1 = np.asarray(X1)
    X2 = np.asarray(X2)
    y1 = np.asarray(y1)
    y2 = np.asarray(y2)
    # y = np.asarray(y)
    X = np.concatenate((X1, X2))
    y = np.concatenate((y1, y2))
    # Create and fit an AdaBoosted decision tree
    bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             algorithm="SAMME",
                             n_estimators=200)
    bdt.fit(X, y)
    # Plot the two-class decision scores
    twoclass_output = bdt.decision_function(X)
    print(type(twoclass_output))
    # import IPython
    # IPython.embed()
    y_pre = bdt.predict(X_test)
    return y_pre, classification_report(y_test, y_pre)

Example 8: zip
# Required import: from sklearn.ensemble import AdaBoostClassifier [as alias]
# Or: from sklearn.ensemble.AdaBoostClassifier import decision_function [as alias]
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
plt.axis("tight")
# Plot the training points
for i, n, c in zip(range(2), class_names, plot_colors):
    idx = np.where(Ydf_train['default_Yes'] == i)
    plt.scatter(Xdf_train[Xdf_train['student'] == 1.0].ix[idx].ix[:, 0],
                Xdf_train[Xdf_train['student'] == 1.0].ix[idx].ix[:, 2],
                c=c, cmap=plt.cm.Paired, label="Class %s" % n)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.legend(loc='upper right')
plt.xlabel("Decision Boundary")
plt.show()
# Plot the two-class decision scores
twoclass_output = clf.decision_function(Xdf_train)
plot_range = (twoclass_output.min(), twoclass_output.max())
plt.subplot(132)
for i, n, c in zip(range(2), class_names, plot_colors):
    idx = np.where(Ydf_train['default_Yes'] == i)
    plt.hist(twoclass_output[idx],
             bins=10,
             range=plot_range,
             facecolor=c,
             label='Class %s' % n,
             alpha=.5)
x1, x2, y1, y2 = plt.axis()
plt.axis((x1, x2, y1, y2 * 1.2))
plt.legend(loc='upper left')
plt.ylabel('Samples')
plt.xlabel('Decision Scores')

Example 9: __init__
# Required import: from sklearn.ensemble import AdaBoostClassifier [as alias]
# Or: from sklearn.ensemble.AdaBoostClassifier import decision_function [as alias]
class adaBoost:
    __all__ = ['run', 'plotFeatureRanking', 'plotScores']

    def __init__(self, foundVariables, trainingData, trainingClasses, trainingWeights, testingData, testingClasses, adaName, bkg_name):
        """Build a forest and compute the feature importances.

        Keyword args:
        foundVariables -- The list of names of found variables; can be obtained using Sample_x.returnFoundVariables()
        trainingData -- The training data
        trainingClasses -- The training data classes
        testingData -- the testing data
        testingClasses -- the testing data classes
        adaName -- the name of the object (e.g. sig+bkg_name)
        """
        self.ada = AdaBoostClassifier(
            DecisionTreeClassifier(compute_importances=True, max_depth=4,
                                   min_samples_split=2, min_samples_leaf=100),
            n_estimators=400, learning_rate=0.5, algorithm="SAMME",
            compute_importances=True)
        # class sklearn.tree.DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_density=0.1, max_features=None, compute_importances=False, random_state=None)
        self.foundVariables = foundVariables
        self.trainingData = trainingData
        self.trainingClasses = trainingClasses
        self.testingData = testingData
        self.testingClasses = testingClasses
        self.trainingWeights = trainingWeights
        self.name = adaName
        self.bkg_name = bkg_name
        self.elapsed = 0.0

    def returnName(self):
        return self.name

    def run(self):
        """Run the fitting and testing."""
        # start the fitting and time it
        start = clock()
        print 'starting training on AdaBoostClassifier'
        self.ada.fit(self.trainingData, self.trainingClasses, self.trainingWeights)
        self.elapsed = clock() - start
        print 'time taken for training: ' + str(self.elapsed)
        # set up the arrays for testing/eval
        # xtA_C = copy.deepcopy(self.testingData)
        # pred = self.ada.predict(xtA_C)
        # import createHists
        # createHists.drawSigBkgDistrib(xtA_C, pred, self.foundVariables)  # draw the signal and background distributions together
        # list the importances of each variable in the bdt, get the score on the test data
        self.importancesada = self.ada.feature_importances_
        print 'importances'
        print self.importancesada
        self.score = self.ada.score(self.testingData, self.testingClasses)
        self.params = self.ada.get_params()
        self.std_mat = np.std([tree.feature_importances_ for tree in self.ada.estimators_],
                              axis=0)
        self.indicesada = np.argsort(self.importancesada)[::-1]
        self.variableNamesSorted = []
        for i in self.indicesada:
            self.variableNamesSorted.append(self.foundVariables[i])
        # Print the feature ranking
        print "Feature ranking:"
        for f in xrange(12):
            print "%d. feature %d (%f)" % (f + 1, self.indicesada[f], self.importancesada[self.indicesada[f]]) + " " + self.variableNamesSorted[f]
        self.twoclass_output = self.ada.decision_function(self.testingData)
        self.twoclass_output_train = self.ada.decision_function(self.trainingData)
        self.class_proba = self.ada.predict_proba(self.testingData)[:, -1]

    def plotFeatureRanking(self):
        # We need this to run in batch because it complains about not being able to open a display
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        import matplotlib.pyplot as plt
        import pylab as pl
        # plot the feature ranking
        pl.figure()
        pl.title("Feature importances Ada")
        pl.bar(xrange(len(self.variableNamesSorted)), self.importancesada[self.indicesada],
               color="r", yerr=self.std_mat[self.indicesada], align="center")
        pl.xticks(xrange(12), self.variableNamesSorted)  # indicesada
        pl.xlim([-1, 12])
        pl.show()

    def plotScores(self, returnROC=False, rocInput=[]):
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        import matplotlib.pyplot as plt
        import pylab as pl
        from sklearn.metrics import roc_curve, auc
        plot_colors = "rb"
        plot_step = 1000.0
        class_names = "AB"
        # Plot the training points
        pl.subplot(131)
        for i, n, c in zip(xrange(2), class_names, plot_colors):
            idx = np.where(self.trainingClasses == i)
            pl.scatter(self.trainingData[idx, 0], self.trainingData[idx, 1],
                       c=c, cmap=pl.cm.Paired,
                       label="Class %s" % n)
        pl.axis("tight")
        # ... (remainder of this code omitted) ...

Example 10: AdaBoostClassifier
# Required import: from sklearn.ensemble import AdaBoostClassifier [as alias]
# Or: from sklearn.ensemble.AdaBoostClassifier import decision_function [as alias]
#################
#     2 JET     #
#################
# Create BDT object.
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3, min_samples_split=0.05),
                         learning_rate=0.15,
                         algorithm="SAMME",
                         n_estimators=200)
# Train BDT for 2 jet.
bdt.fit(train_2jet, train_2jet_class, sample_weight=train_2jet_weights)
# Get decision scores for test set.
twoclass_output = np.array(bdt.decision_function(test_2jet))
# Plot decision histogram.
plot_range = (twoclass_output.min(), twoclass_output.max())
plt.subplot(122)
plot_colors = 2 * "r" + 12 * "g" + "y" + 3 * "b" + 3 * "m"
plot_step = 0.02
class_names = ['qqZvvH125', 'qqWlvH125', 'Wbb', 'Wbc', 'Wcc', 'Wbl', 'Wcl', 'Wl',
               'Zbb', 'Zbc', 'Zcc', 'Zbl', 'Zcl', 'Zl', 'ttbar', 'stopt', 'stops',
               'stopWt', 'WW', 'ZZ', 'WZ']
for n, c in zip(class_names, plot_colors):
    this_data = twoclass_output[test_2jet_processes == n]
    this_weights = test_2jet_weights[test_2jet_processes == n] * SF_map_2jet[n]
    plt.hist(this_data,

Example 11: int
# Required import: from sklearn.ensemble import AdaBoostClassifier [as alias]
# Or: from sklearn.ensemble.AdaBoostClassifier import decision_function [as alias]
X_train_sig = df_cheat.query(
    hlt2_cut_string)[features][int(0.2 * n_events):n_events]
X_train = X_train_bkg.append(X_train_sig, ignore_index=True).values
# DEFINE WHICH PARTS OF TEST AND TRAINING SAMPLES CONTAIN SIGNAL OR BACKGROUND
y_test = int(0.2 * n_events) * [0] + int(0.2 * n_events) * [1]
y_train = int(0.8 * n_events) * [0] + int(0.8 * n_events) * [1]
# DEFINE BDT ALGORITHM
dt = DecisionTreeClassifier(max_depth=3,
                            min_samples_leaf=0.05 * len(X_train))
bdt = AdaBoostClassifier(dt,
                         algorithm='SAMME',
                         n_estimators=800,
                         learning_rate=0.5)
# RUN BDT TRAINING AND SHOW RESULTS
bdt.fit(X_train, y_train)
sk_y_predicted = bdt.predict(X_test)
print classification_report(y_test, sk_y_predicted,
                            target_names=["background", "signal"])
print "Area under ROC curve: %.4f" % (roc_auc_score(y_test, sk_y_predicted))
plt.hist(bdt.decision_function(X_test_bkg).ravel(), color='r', alpha=0.5,
         range=(-0.4, 0.4), bins=30)
plt.hist(bdt.decision_function(X_test_sig).ravel(), color='b', alpha=0.5,
         range=(-0.4, 0.4), bins=30)
plt.xlabel("scikit-learn BDT output")
plt.savefig('BDT.pdf')

Example 12: time
# Required import: from sklearn.ensemble import AdaBoostClassifier [as alias]
# Or: from sklearn.ensemble.AdaBoostClassifier import decision_function [as alias]
    signalScore)
print "- When we predict that we have a signal event, it is actually signal %.1f%% of the time (%i out of %i)" % (
    100.0 * fcorrect, int(fcorrect * len(predictionsForSignal)),
    len(predictionsForSignal))
### PLOT
# plot feature distributions
if first:
    first = False
    for idx, indicator in enumerate(whichIndicators):
        featureDistributions(Xtrain, Ytrain, indicator, idx)
# shamelessly stolen from https://dbaumgartel.wordpress.com/2014/03/14/machine-learning-examples-scikit-learn-versus-tmva-cern-root/
Classifier_training_S = alg.decision_function(
    Xtrain[Ytrain > 0.5]).ravel()
Classifier_training_B = alg.decision_function(
    Xtrain[Ytrain < 0.5]).ravel()
Classifier_testing_S = alg.decision_function(
    Xtest[Ytest > 0.5]).ravel()
Classifier_testing_B = alg.decision_function(
    Xtest[Ytest < 0.5]).ravel()
# These will be the min/max of our plots
c_max = 1.5
c_min = -1.5
# Get histograms of the classifiers
Histo_training_S = np.histogram(
    Classifier_training_S, bins=40, range=(c_min, c_max))
Histo_training_B = np.histogram(

Example 13: precision_recall_curve
# Required import: from sklearn.ensemble import AdaBoostClassifier [as alias]
# Or: from sklearn.ensemble.AdaBoostClassifier import decision_function [as alias]
# log
precision_l, recall_l, thresholds_l = precision_recall_curve(test["los"], log.decision_function(test_variables))
pl.plot(recall_l, precision_l)
pl.xlabel("recall")
pl.ylabel("precision")
pl.title("LogisticRegression")
pl.show()
# cart
precision_c, recall_c, thresholds_c = precision_recall_curve(test["los"], test_cart_prob[::, 1])
pl.plot(recall_c, precision_c)
pl.xlabel("recall")
pl.ylabel("precision")
pl.title("CART")
pl.show()
# ad
precision_ad, recall_ad, thresholds_ad = precision_recall_curve(test["los"], ad.decision_function(test_variables))
pl.plot(recall_ad, precision_ad)
pl.xlabel("recall")
pl.ylabel("precision")
pl.title("AdaBoosting")
pl.show()
# Naive
precision_n, recall_n, thresholds_n = precision_recall_curve(test["los"], test_naive_prob[::, 1])
pl.plot(recall_n, precision_n)
pl.xlabel("recall")
pl.ylabel("precision")
pl.title("NaiveBayes")
pl.show()
# integral
plt.plot(recall_l, precision_l)
plt.plot(recall_c, precision_c)

Example 14: zip
# Required import: from sklearn.ensemble import AdaBoostClassifier [as alias]
# Or: from sklearn.ensemble.AdaBoostClassifier import decision_function [as alias]
# Plot the class probabilities
class_proba = ada.predict_proba(x)[:, -1]
pl.subplot(132)
for i, n, c in zip(xrange(2), class_names, plot_colors):
    pl.hist(class_proba[y == i],
            bins=20,
            range=(0, 1),
            facecolor=c,
            label='Class %s' % n)
pl.legend(loc='upper center')
pl.ylabel('Samples')
pl.xlabel('Class Probability')
# Plot the two-class decision scores
twoclass_output = ada.decision_function(x)
pl.subplot(133)
for i, n, c in zip(xrange(2), class_names, plot_colors):
    pl.hist(twoclass_output[y == i],
            bins=20,
            range=(-1, 1),
            facecolor=c,
            label='Class %s' % n)
pl.legend(loc='upper right')
pl.ylabel('Samples')
pl.xlabel('Two-class Decision Scores')
pl.subplots_adjust(wspace=0.25)
pl.show()

Example 15: bdtModel
# Required import: from sklearn.ensemble import AdaBoostClassifier [as alias]
# Or: from sklearn.ensemble.AdaBoostClassifier import decision_function [as alias]
def bdtModel(df_sig_train, df_bkg_train, df_sig_test, df_bkg_test):
    # '---------- Prepare Training ----------'
    X_sig = np.array(df_sig_train)
    y_sig = np.array(X_sig.shape[0] * [1])
    X_bkg = np.array(df_bkg_train)
    y_bkg = np.array(X_bkg.shape[0] * [0])
    X = np.concatenate((X_sig, X_bkg))
    y = np.concatenate((y_sig, y_bkg))
    print 'X_sig.shape: ', X_sig.shape
    print 'y_sig.shape: ', y_sig.shape
    print 'X_bkg.shape: ', X_bkg.shape
    print 'y_bkg.shape: ', y_bkg.shape
    print 'X.shape: ', X.shape
    print 'y.shape: ', y.shape
    # '---------- Prepare Testing ----------'
    X_sig_test = np.array(df_sig_test)
    y_sig_test = np.array(X_sig_test.shape[0] * [1])
    X_bkg_test = np.array(df_bkg_test)
    y_bkg_test = np.array(X_bkg_test.shape[0] * [0])
    X_test = np.concatenate((X_sig_test, X_bkg_test))
    y_test = np.concatenate((y_sig_test, y_bkg_test))
    print 'X_sig_test.shape: ', X_sig_test.shape
    print 'y_sig_test.shape: ', y_sig_test.shape
    print 'X_bkg_test.shape: ', X_bkg_test.shape
    print 'y_bkg_test.shape: ', y_bkg_test.shape
    print 'X_test.shape: ', X_test.shape
    print 'y_test.shape: ', y_test.shape
    # '---------- Model ----------'
    # scaler = preprocessing.StandardScaler().fit(X)
    # X = scaler.transform(X)
    # model = svm.SVC(C=50, kernel='rbf', tol=0.001, gamma=0.005, probability=True)
    # model.fit(X, y)
    dt = DecisionTreeClassifier(max_depth=3,
                                min_samples_leaf=0.05 * len(X))
    model = AdaBoostClassifier(dt,
                               algorithm='SAMME',
                               n_estimators=400,
                               learning_rate=0.5)
    model.fit(X, y)
    print '---------- Training/Testing info ----------'
    print 'Accuracy (training): ', model.score(X, y)
    print 'Null Error Rate (training): ', y.mean()
    # X_test = scaler.transform(X_test)
    predicted_test = model.predict(X_test)
    predicted_test_clever = (predicted_test + y_test).tolist()
    error_test = float(predicted_test_clever.count(1)) / float(len(predicted_test_clever))
    print "Error: ", error_test
    print "Accuracy (testing): ", metrics.accuracy_score(y_test, predicted_test)
    print "Recall (testing): ", metrics.recall_score(y_test, predicted_test)
    print "F1 score (testing): ", metrics.f1_score(y_test, predicted_test)
    print "ROC area under curve (testing): ", metrics.roc_auc_score(y_test, predicted_test)
    # 'PTS','AST','REB','STL','BLK','FG_PCT','FG3_PCT','FT_PCT','MIN','EFF','WL'
    # user_input = scaler.transform(np.array([10, 1, 2, 0, 2, 0.3, 0.3, 0.3, 10, 5, 1], dtype=float))
    # user_input = scaler.transform(np.array([10,1,2,2,2,2,2,2,2,2,1], dtype=float))
    # user_input = scaler.transform(np.array([10,1,2], dtype=float))
    user_input = np.array([10.15, 1.95, 6.77, 1.12, 0.28, 0.51, 0.37, 0.47, 32.5, 14.8, 0.53], dtype=float)
    score = model.decision_function(user_input)
    print 'Score (user input): ', score
    result = model.predict_proba(user_input)
    print 'Probability of 1 (user input): ', result
    # '--------- Visualization -----------'
    Classifier_training_S = model.decision_function(X[y > 0.5]).ravel()
    Classifier_training_B = model.decision_function(X[y < 0.5]).ravel()
    Classifier_testing_S = model.decision_function(X_test[y_test > 0.5]).ravel()
    Classifier_testing_B = model.decision_function(X_test[y_test < 0.5]).ravel()
    (h_test_s, h_test_b) = visualSigBkg("BDT", Classifier_training_S, Classifier_training_B, Classifier_testing_S, Classifier_testing_B)
    # '-------- Variable Importance ---------'
    feature_importance = model.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    # ... (remainder of this code omitted) ...