This article collects typical usage examples of the Python class sklearn.ensemble.GradientBoostingClassifier. If you are wondering what GradientBoostingClassifier is for, how to use it, or what real code that uses it looks like, the curated class examples below may help.
The following presents 15 code examples of the GradientBoostingClassifier class, sorted by popularity by default.
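Before the collected examples, here is a minimal, self-contained sketch of the most common pattern (fit on training data, then predict class probabilities on held-out data). The synthetic dataset from make_classification and all parameter values are illustrative assumptions, not taken from the examples below.

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Synthetic data purely for illustration
X, y = make_classification(n_samples=500, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                 max_depth=3, random_state=0)
clf.fit(X_train, y_train)             # fit the boosted ensemble
proba = clf.predict_proba(X_test)     # class probabilities, shape (n_samples, 2)
print(clf.score(X_test, y_test))      # mean accuracy on the held-out split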
Example 1: main
def main(args):
    global verbose
    verbose = args.verbose

    # Load files
    if verbose: logger.info('Loading {}'.format(args.train_file))
    train_X, train_y = load_file(args.train_file)
    if verbose: logger.info('Loading {}'.format(args.test_file))
    test_X, test_y = load_file(args.test_file)

    # # Codes for Grid Search
    # params = [
    #     {'n_estimators': [50000], 'learning_rate': [2**i for i in np.arange(-10, -9, .25)], 'max_features': ['log2',], 'max_depth': [7,]},
    # ]
    # method = GradientBoostingClassifier(random_state=1, verbose=1)
    # gscv = GridSearchCV(method, params, scoring='roc_auc', verbose=verbose, n_jobs=5)
    # gscv.fit(train_X.toarray(), train_y)
    # if verbose:
    #     for params, mean_score, all_scores in gscv.grid_scores_:
    #         logger.info('{:.6f} (+/- {:.6f}) for {}'.format(mean_score, all_scores.std() / 2, params))
    #     logger.info('params:{params}'.format(params=gscv.best_params_))
    #     logger.info('score:{params}'.format(params=gscv.best_score_))
    # pred = gscv.best_estimator_.predict_proba(test_X.toarray())

    # Best parameters for the competition data
    method = GradientBoostingClassifier(n_estimators=50000, learning_rate=2**(-9.5),
                                        max_features='log2', max_depth=7,
                                        random_state=1, verbose=1)
    method.fit(train_X.toarray(), train_y)
    pred = method.predict_proba(test_X.toarray())

    np.savetxt(args.output, pred[:, 1], fmt='%.6f')
    if verbose: logger.info('Wrote preds to {file}'.format(file=args.output))

    return 0
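The commented-out grid search above reads grid_scores_, an attribute that was removed from GridSearchCV in later scikit-learn releases in favor of cv_results_. A minimal sketch of the equivalent reporting on current versions, assuming the same train_X / train_y from this example and a hypothetical, much smaller parameter grid:

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Hypothetical grid, far smaller than the original, purely for illustration
params = {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1], 'max_depth': [3, 7]}
gscv = GridSearchCV(GradientBoostingClassifier(random_state=1), params,
                    scoring='roc_auc', n_jobs=-1)
gscv.fit(train_X.toarray(), train_y)

# cv_results_ replaces the old grid_scores_ attribute
for mean, std, p in zip(gscv.cv_results_['mean_test_score'],
                        gscv.cv_results_['std_test_score'],
                        gscv.cv_results_['params']):
    print('{:.6f} (+/- {:.6f}) for {}'.format(mean, std, p))
print(gscv.best_params_, gscv.best_score_)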
Example 2: test_partial_dependecy_input
def test_partial_dependecy_input():
    # Test input validation of partial dependence.
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)

    assert_raises(ValueError, partial_dependence,
                  clf, [0], grid=None, X=None)
    assert_raises(ValueError, partial_dependence,
                  clf, [0], grid=[0, 1], X=X)

    # first argument must be an instance of BaseGradientBoosting
    assert_raises(ValueError, partial_dependence,
                  {}, [0], X=X)

    # Gradient boosting estimator must be fit
    assert_raises(ValueError, partial_dependence,
                  GradientBoostingClassifier(), [0], X=X)

    assert_raises(ValueError, partial_dependence, clf, [-1], X=X)
    assert_raises(ValueError, partial_dependence, clf, [100], X=X)

    # wrong ndim for grid
    grid = np.random.rand(10, 2, 1)
    assert_raises(ValueError, partial_dependence, clf, [0], grid=grid)
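This test exercises the legacy sklearn.ensemble.partial_dependence function, which only supported gradient boosting estimators and was later removed. In recent scikit-learn releases the same functionality lives in sklearn.inspection and accepts any fitted estimator; a minimal sketch, assuming the fitted clf and the same X as above:

from sklearn.inspection import partial_dependence

# Partial dependence of the prediction on feature 0; in recent versions the result
# is a Bunch-like mapping holding the averaged predictions and the grid of values.
pd_result = partial_dependence(clf, X, features=[0], grid_resolution=50)
print(pd_result)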
Example 3: test_gradient_boosting_early_stopping
def test_gradient_boosting_early_stopping():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=1000,
                                     n_iter_no_change=10,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)
    gbr = GradientBoostingRegressor(n_estimators=1000, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42)

    # Check if early stopping works as expected
    for est, tol, early_stop_n_estimators in ((gbc, 1e-1, 24), (gbr, 1e-1, 13),
                                              (gbc, 1e-3, 36),
                                              (gbr, 1e-3, 28)):
        est.set_params(tol=tol)
        est.fit(X_train, y_train)
        assert_equal(est.n_estimators_, early_stop_n_estimators)
        assert est.score(X_test, y_test) > 0.7

    # Without early stopping
    gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                     max_depth=3, random_state=42)
    gbc.fit(X, y)
    gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1,
                                    max_depth=3, random_state=42)
    gbr.fit(X, y)

    assert gbc.n_estimators_ == 100
    assert gbr.n_estimators_ == 200
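When n_iter_no_change is set, the estimator internally holds out validation_fraction of the training data (0.1 by default) to decide when to stop adding trees. A minimal sketch of adjusting that fraction and checking how many boosting stages were actually fit, on synthetic data with assumed parameter values:

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=1000, random_state=0)
gbc = GradientBoostingClassifier(n_estimators=1000, n_iter_no_change=5,
                                 validation_fraction=0.2,  # 20% internal validation split
                                 learning_rate=0.1, max_depth=3, random_state=0)
gbc.fit(X, y)
print(gbc.n_estimators_)  # boosting stages actually fit before early stopping triggered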
Example 4: PlotFeaturesImportance
def PlotFeaturesImportance(X, y, featureNames, dataName):
    '''
    Plot the relative contribution/importance of the features.
    Best to reduce to top X features first - for interpretability.
    Code example from:
    http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/
    '''
    gbc = GradientBoostingClassifier(n_estimators=40)
    gbc.fit(X, y)

    # Get feature importance from the classifier
    feature_importance = gbc.feature_importances_

    # Normalize the features
    feature_importance = 100 * (feature_importance / feature_importance.max())
    sorted_idx = numpy.argsort(feature_importance)
    pos = numpy.arange(sorted_idx.shape[0]) + 4.5
    # pos = numpy.arange(sorted_idx.shape[0])

    # plt.figure(figsize=(16, 12))
    plt.figure(figsize=(14, 9), dpi=250)
    plt.barh(pos, feature_importance[sorted_idx], align='center', color='#7A68A6')
    # plt.yticks(pos, numpy.asanyarray(df.columns.tolist())[sorted_idx])  # ORIG
    plt.yticks(pos, numpy.asanyarray(featureNames)[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('%s: Top Features' % (dataName))
    plt.grid('off')
    plt.ion()
    plt.show()
    plt.savefig(str(dataName) + 'TopFeatures.png', dpi=200)
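The impurity-based feature_importances_ plotted above are computed on the training data and can favor high-cardinality features. A minimal sketch of complementing them with permutation importance from sklearn.inspection, assuming the fitted gbc and featureNames from above plus a hypothetical held-out split X_test / y_test (not present in the original example):

from sklearn.inspection import permutation_importance

# Score drop when each feature is shuffled on held-out data (X_test / y_test assumed)
result = permutation_importance(gbc, X_test, y_test, n_repeats=10, random_state=0)
for idx in result.importances_mean.argsort()[::-1]:
    print(featureNames[idx], result.importances_mean[idx], result.importances_std[idx])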
Example 5: train_GBDT
def train_GBDT(self):
    samples = self.trainset.values
    target = self.trainlabel.values
    classifier_GB = GradientBoostingClassifier(n_estimators=1000)
    classifier_GB.fit(samples, target)
    return classifier_GB
Example 6: train
def train():
    posi_result = {}
    train_feature, test_feature, train_id_list, test_id_list, train_tar_list = merge_feature(feature_str)

    tmp1 = [m < 32 for m in train_tar_list]
    tmp1 = np.array(tmp1)
    # train_feature = train_feature[tmp1]
    target_list = np.array(train_tar_list)
    target_list = target_list[tmp1]
    # train_id_list = np.array(train_id_list)
    # train_id_list = train_id_list[tmp1]

    c_feature = train_feature.columns[:]

    clf1 = RandomForestClassifier(n_estimators=200, min_samples_split=17)
    clf1.fit(train_feature[c_feature], target_list)
    # rf_preds = clf1.predict(test_feature)
    rf_prob = clf1.predict_proba(test_feature)

    gbdt1 = GradientBoostingClassifier(n_estimators=150, min_samples_split=17)
    gbdt1.fit(train_feature[c_feature], target_list)
    # gbdt_preds = gbdt1.predict(test_feature)
    gbdt_prob = gbdt1.predict_proba(test_feature)

    all_prob = rf_prob + gbdt_prob
    all_preds = []
    print all_prob.shape
    for k in range(all_prob.shape[0]):
        prob1 = list(all_prob[k, :])
        ind1 = prob1.index(max(prob1))
        all_preds.append(ind1)

    for j in range(len(all_preds)):
        all_pre_name = dl.get_num_position(all_preds[j])
        posi_result[test_id_list[j]] = all_pre_name
    return posi_result
Example 7: main
def main():
    makeSub = True
    featureImportance = False
    cvfold = True

    df = pd.read_csv('../data/cprobTrain15NA.csv')

    X, y = np.array(pd.read_csv('../data/train.csv', usecols=range(1, 9))), np.array(pd.read_csv('../data/train.csv').ACTION)
    X = np.hstack((X, np.array(df)))

    params = {'max_depth': 4, 'subsample': 0.5, 'verbose': 0, 'random_state': 1337,
              'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 10,
              'n_estimators': 350, 'learning_rate': 0.05}
    clf = GradientBoostingClassifier(**params)
    prefix = 'lib/gbm350d4m10c15'

    if cvfold:
        c = classifier.Classifier(X, y)
        c.validate(clf, nFolds=10, out=prefix + 'Train.csv')

    if makeSub:
        Xt = np.array(pd.read_csv('../data/test.csv', usecols=range(1, 9)))
        Xt = np.hstack((Xt, np.array(pd.read_csv('../data/cprobTest15NA.csv'))))
        clf.fit(X, y)
        y_ = clf.predict_proba(Xt)[:, 1]

        out = pd.read_csv('subs/nbBaseTest.csv')
        out.ACTION = y_
        out.to_csv(prefix + 'Test.csv', index=False)

    if featureImportance:
        print "Feature ranking:"
        importances = clf.feature_importances_
        indices = np.argsort(importances)[::-1]
        np.savetxt('indices.txt', indices, delimiter=',')
        for f in xrange(df.shape[1]):
            print "%d. feature (%s,%f)" % (f + 1, df.columns[indices[f]], importances[indices[f]])
Example 8: partial_dependence
def partial_dependence(df, y):
    '''
    INPUT:  X = features
            y = target variable, binary, imbalanced classes
    OUTPUT: X = features oversampled to have balanced target classes
            y = target variable oversampled to have balanced classes
    Discovers the minority class and then oversamples until each class makes up
    50% of your data.
    '''
    X_train, X_test, y_train, y_test = oversample_train_test(df, y)
    # X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=42)

    feature_engineering = Pipeline([
        ('lists', ListSplitter()),
        ('race', RaceDummies()),
        ('crime_sentence', CrimeAndSentence()),
        ('feat_eng', FeatureEngineer()),
        ('columns', ColumnFilter(prejudice=False))
    ])

    X = feature_engineering.fit_transform(X_train.copy(), y_train)
    X_test = feature_engineering.fit_transform(X_test.copy(), y_test)

    gbc = GradientBoostingClassifier(n_estimators=850, learning_rate=.75)
    gbc.fit(X.copy(), y_train)

    most_imp = np.argsort(gbc.feature_importances_)[-6:]
    names = list(X_test.columns)
    feats = list(most_imp)

    fig, axs = plot_partial_dependence(gbc, X_test, feats, feature_names=names,
                                       n_jobs=3, grid_resolution=50)
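plot_partial_dependence here comes from the legacy sklearn.ensemble.partial_dependence module; both it and its later sklearn.inspection counterpart were eventually removed in favor of a display class. A minimal sketch of producing the same plot on current scikit-learn, assuming the fitted gbc, X_test, feats and names from above:

from sklearn.inspection import PartialDependenceDisplay

# Current API: a single display object holding the grid of partial-dependence axes
display = PartialDependenceDisplay.from_estimator(gbc, X_test, feats,
                                                  feature_names=names,
                                                  n_jobs=3, grid_resolution=50)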
Example 9: trainModelComb4
def trainModelComb4(self):
    ntrain = self.data_train.shape[0]
    self.xtra = 5
    est_prob = np.zeros([ntrain, self.xtra + 1])  # for original data, essay and others, which would be fed to a second GB
    self.mlmodel2 = [LogisticRegression() for i in range(self.xtra)]
    for i in range(self.xtra - 1):
        self.mlmodel2[i].fit(self.data_train, self.labels_train[:, i + 1])
        set_result = self.mlmodel2[i].predict_proba(self.data_train)
        est_prob[:, i] = set_result[:, 1]
    self.mlmodel2[self.xtra - 1].fit(self.data_train_ess, self.labels_train[:, 0])
    set_result2 = self.mlmodel2[self.xtra - 1].predict_proba(self.data_train_ess)
    est_prob[:, self.xtra - 1] = set_result2[:, 1]

    # self.data_train = np.hstack((self.data_train, est_prob))
    # self.mlmodel = AdaBoostClassifier()
    self.mlmodel = GradientBoostingClassifier(learning_rate=0.2, subsample=0.4)
    # self.mlmodel = RandomForestClassifier(n_estimators=200, n_jobs=3, verbose=1)
    self.mlmodel.fit(self.data_train, self.labels_train[:, 0])
    set_result3 = self.mlmodel.predict_proba(self.data_train)
    est_prob[:, self.xtra] = set_result3[:, 1]

    # 2nd layer GB
    self.mlmodel3 = GradientBoostingClassifier(learning_rate=0.1)
    self.mlmodel3.fit(est_prob, self.labels_train[:, 0])
Example 10: run_gradient_boosting_classifier
def run_gradient_boosting_classifier(data, _max_depth):
    (feature_train, feature_test, label_train, label_test) = train_test_split(data[:, 0:-1], data[:, -1].astype(int),
                                                                              test_size=0.25)
    # TODO: Vary Number of Estimators and Learning Rate
    gbc = GradientBoostingClassifier(learning_rate=0.1, n_estimators=50, max_depth=_max_depth, verbose=True)
    gbc.fit(feature_train, label_train)

    training_error = gbc.score(feature_train, label_train)
    # cross_validation_score = cross_val_score(gbc, feature_train, label_train, cv=10)
    testing_error = gbc.score(feature_test, label_test)

    print "Gradient Boosting Results for Max Depth:", _max_depth
    print "Training Accuracy:", training_error
    # print "10-fold Cross Validation Accuracy: %0.2f (+/- %0.2f)" % (cross_validation_score.mean(), cross_validation_score.std() * 2)
    print "Testing Accuracy:", testing_error

    feature_importance = gbc.feature_importances_
    stddev = np.std([tree[0].feature_importances_ for tree in gbc.estimators_], axis=0)
    indices = np.argsort(feature_importance)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for f in range(len(feature_importance)):
        print("%d. feature %d (%f)" % (f + 1, indices[f], feature_importance[indices[f]]))

    plot_feature_importance(feature_importance, indices, stddev, "gradient-boosted-classifier-feature-importance-depth-" + str(_max_depth))
Example 11: main
def main():
    print '[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S'))
    testing_file = file('test.p', 'r')
    training_file = file('train.p', 'r')

    train = pickle.load(training_file)
    test = pickle.load(testing_file)

    testing_file.close()
    training_file.close()

    trainX = train[:, :-1]
    trainy = train[:, -1]
    testX = test[:, :-1]
    testy = test[:, -1]

    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'GradientBoostingClassifier(n_estimators=1000)')
    clf = GradientBoostingClassifier(n_estimators=1000)
    clf.fit(trainX, trainy)

    print '[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S'))
    prediction = clf.predict(testX)
    print '[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'), accuracy_score(testy, prediction))

    model_save_file = file('gradient_1000.p', 'w')
    pickle.dump(clf, model_save_file)
    model_save_file.close()

    print 'All done'
Example 12: predict
def predict(fea, df, t, t9):
    Un = df.columns == 'Blank'
    for f in fea:
        '''
        try:
            df[(f+'_y')] = df[(f+'_x')] - df[(f+'_y')]
            print(1)
        except:
            pass
        '''
        Un = Un | (df.columns == f)
        Un = Un | (df.columns == (f + '_x'))
        Un = Un | (df.columns == (f + '_y'))
    Un = Un & (df.columns != 'New_y')

    clf = GradientBoostingClassifier()
    y = df[t].label
    X = df[t].ix[:, Un]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=1)
    clf.fit(X_train, y_train)

    re = 'Testing AUC: \t' + str(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
    print re
    re = 'September AUC: \t' + str(roc_auc_score(df[t9].label, clf.predict_proba(df[t9].ix[:, Un])[:, 1]))
    print re

    print(X.columns)
    print(clf.feature_importances_)
    return Un, clf
Example 13: ctr_gbdt
def ctr_gbdt(model='sklearn-clicklog', from_cache=False, train_dataset_length=100000, test_dataset_length=100000):
    TRAIN_FILE, TEST_FILE = create_dataset(model, from_cache, train_dataset_length, test_dataset_length)

    prediction_model = GradientBoostingClassifier(
        loss='deviance',
        learning_rate=0.1,
        n_estimators=30,
        subsample=1.0,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_depth=5,
    )

    x_train, y_train = clean_data(TRAIN_FILE)
    x_test, y_test = clean_data(TEST_FILE)

    with Timer('fit model'):
        prediction_model.fit(x_train, y_train)

    with Timer('evaluate model'):
        y_prediction_train = prediction_model.predict_proba(x_train)
        y_prediction_test = prediction_model.predict_proba(x_test)

        loss_train = log_loss(y_train, y_prediction_train)
        loss_test = log_loss(y_test, y_prediction_test)

    print 'loss_train: %s' % loss_train
    print 'loss_test: %s' % loss_test
Example 14: train_classifiers
def train_classifiers(X_data, y_data):
    ############ Linear SVM: 0.908 #############
    clf_LSVM = svm.SVC(kernel='linear')
    clf_LSVM.fit(X_data, y_data)

    ############ MultinomialNB: 0.875 #############
    clf_MNB = MultinomialNB()
    clf_MNB.fit(X_data, y_data)

    ############ Random Forest: 0.910 #############
    clf_RF = RandomForestClassifier(n_estimators=200, criterion='entropy')
    clf_RF.fit(X_data, y_data)

    ############ Extra Trees: 0.915 ##################
    clf_ETC = ExtraTreesClassifier(n_estimators=500, max_depth=None, min_samples_split=1, random_state=0)
    clf_ETC.fit(X_data, y_data)

    ############ AdaBoost: 0.88 ##################
    clf_Ada = AdaBoostClassifier()
    clf_Ada.fit(X_data, y_data)

    ############ RBF SVM: 0.895 #############
    clf_rbf = svm.SVC(C=200, gamma=0.06, kernel='rbf')
    clf_rbf.fit(X_data, y_data)

    ############ GradientBoosting: 0.88 #############
    clf_GBC = GradientBoostingClassifier()
    clf_GBC.fit(X_data, y_data)

    return clf_LSVM, clf_MNB, clf_RF, clf_ETC, clf_Ada, clf_rbf, clf_GBC
Example 15: ada_boost
def ada_boost():
    savefile = open('traindata.pkl', 'rb')
    (x_train, y_train, t1) = cPickle.load(savefile)
    savefile.close()

    savefile = open('testdata.pkl', 'rb')
    (x_test, t1, name1) = cPickle.load(savefile)
    savefile.close()

    # X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(
    #     X, y, test_size=0.1, random_state=42)

    x_train = np.asarray(x_train, dtype=np.float32)
    y_train = np.asarray(y_train, dtype='int32') - 1

    nest = 190
    lr = .1
    md = 6

    # clf1 = DecisionTreeClassifier(max_depth=2)
    # clf = AdaBoostClassifier(clf1, n_estimators=200, learning_rate=.25)
    clf = GradientBoostingClassifier(n_estimators=nest, learning_rate=lr, max_depth=md, random_state=0)
    # clf = RandomForestClassifier(n_estimators=200)  # .81
    # clf = ExtraTreesClassifier(n_estimators=1000, max_depth=None, min_samples_split=10, random_state=0, n_jobs=8)  # .81
    # clf = KNeighborsClassifier(15)

    if 1:
        clf.fit(x_train, y_train)
        ypred = clf.predict_proba(x_test)
        y_str = ['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6', 'Class_7', 'Class_8', 'Class_9']
        kcsv.print_csv(ypred, name1, y_str, indexname='id')
        print (nest, lr, md)

    if 0:
        multiclass_log_loss = make_scorer(score_func=logloss_mc, greater_is_better=True, needs_proba=True)
        scores = cross_val_score(clf, x_train, y_train, n_jobs=8, cv=5, scoring=multiclass_log_loss)
        print scores
        print (nest, lr, md, scores.mean())