本文整理汇总了 Python 中 sklearn.pipeline.Pipeline.decision_function 方法的典型用法代码示例。如果您正苦于以下问题:Python Pipeline.decision_function 方法的具体用法?Python Pipeline.decision_function 怎么用?Python Pipeline.decision_function 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 sklearn.pipeline.Pipeline 的用法示例。
在下文中一共展示了 Pipeline.decision_function 方法的 11 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: test_pipeline_methods_preprocessing_svm
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
def test_pipeline_methods_preprocessing_svm():
    """Check the output shapes of a (preprocessing + SVC) pipeline on iris.

    The pipeline is built once with a scaler and once with a PCA step. For
    each variant ``predict``, ``predict_proba``, ``predict_log_proba`` and
    ``decision_function`` are called on the training data and the shape of
    each result is asserted; finally ``score`` is called.
    """
    dataset = load_iris()
    X = dataset.data
    y = dataset.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    per_class_shape = (n_samples, n_classes)

    transformers = [
        StandardScaler(),
        RandomizedPCA(n_components=2, whiten=True),
    ]
    svc = SVC(probability=True, random_state=0)

    # (method name, expected output shape), in the order they are exercised.
    expected_shapes = [
        ('predict', (n_samples,)),
        ('predict_proba', per_class_shape),
        ('predict_log_proba', per_class_shape),
        ('decision_function', per_class_shape),
    ]

    for step in transformers:
        pipe = Pipeline([('preprocess', step), ('svc', svc)])
        pipe.fit(X, y)

        for method_name, shape in expected_shapes:
            output = getattr(pipe, method_name)(X)
            assert_equal(output.shape, shape)

        pipe.score(X, y)
示例2: Pipeline
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
stop_words='english',
ngram_range=(1, 2),
max_df=1.0,
max_features=100000
)
print "Create pipeline for vectorizer => classifier"
vect_clf = Pipeline([('vect', marisa_uni_vect),
('clf', LinearSVC())])
print "Train Model"
vect_clf = vect_clf.fit(train_resume_text, train_labels)
print "Predict test samples"
predicted_score = vect_clf.predict(test_resume_text)
predicted_decision = vect_clf.decision_function(test_resume_text)
# accuracy = np.mean(predicted_score == test_labels)
# p = precision_score(test_labels, predicted_score, average='macro')
# r = recall_score(test_labels, predicted_score, average='macro')
#
# print accuracy
# print p
# print r
# print classification_report([t for t in test_labels], [p for p in predicted_score])
predicted = []
actual_vs_predicted = []
for i in range(len(test_labels)):
示例3: Predictor
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
#.........这里部分代码省略.........
self.x_train.append(abstract)
self.y_train.append(category)
# To count for RuntimeWarning: divide by zero encountered in log
if (not self.x_train or 0 not in self.y_train or
1 not in self.y_train):
self.l.error("Not enough data yet to feed the classifier")
return
self.classifier = Pipeline([
('vectorizer', CountVectorizer(stop_words=self.stop_words)),
('tfidf', TfidfTransformer()),
('clf', LinearSVC())])
try:
self.classifier.fit(self.x_train, self.y_train)
except ValueError:
self.l.error("Not enough data yet to train the classifier")
return
elapsed_time = datetime.datetime.now() - start_time
self.l.debug("Initializing classifier in {0}".format(elapsed_time))
return True
# @profile
# def calculatePercentageMatch(self):
def run(self):
"""Calculate the match percentage for each article,
based on the abstract text and the liked articles.

Reads ``id`` and ``topic_simple`` from every row of ``papers``, scores the
texts with ``self.classifier.decision_function``, rescales the scores
linearly onto 0-100 and stores them in ``papers.percentage_match`` inside
a single transaction. Returns None; if scoring raises, the error is logged
and the method returns before the table is updated.
"""
self.l.debug("Starting calculations of match percentages")
start_time = datetime.datetime.now()
# Collect every paper's id and text in two parallel lists.
query = QtSql.QSqlQuery(self.bdd)
query.exec_("SELECT id, topic_simple FROM papers")
list_id = []
x_test = []
while query.next():
record = query.record()
abstract = record.value('topic_simple')
x_test.append(abstract)
list_id.append(record.value('id'))
try:
# Normalize the percentages: the highest is set to 100%
# http://stackoverflow.com/questions/929103/convert-a-number-range-to-another-range-maintaining-ratio
# x_test is rebound here: from the list of texts to the classifier's scores.
x_test = self.classifier.decision_function(x_test)
elapsed_time = datetime.datetime.now() - start_time
self.l.debug("Classifier predicted proba in {}".format(elapsed_time))
diff_time = datetime.datetime.now()
maximum = max(x_test)
minimum = min(x_test)
# NOTE(review): as written, the smallest decision value maps to 100 and the
# largest to 0 - presumably the negatively-scored class is the "liked" one; confirm.
# NOTE(review): when maximum == minimum (e.g. a single row) the denominator
# is zero; the result is then not a usable percentage - confirm handling.
list_percentages = 100 - (x_test - minimum) * 100 / (maximum - minimum)
self.l.debug("Classifier normalized proba in {}".
format(datetime.datetime.now() - diff_time))
except AttributeError:
# Presumably hit when self.classifier has not been created yet - confirm.
self.l.error("Not enough data yet to predict probability")
return
except Exception as e:
# Any other failure while scoring: log it with the traceback and give up.
self.l.error("predictor: {}".format(e))
self.l.error(traceback.format_exc())
return
# Write all percentages back in one transaction, re-using one prepared statement.
self.bdd.transaction()
query = QtSql.QSqlQuery(self.bdd)
query.prepare("UPDATE papers SET percentage_match = ? WHERE id = ?")
for id_bdd, percentage in zip(list_id, list_percentages):
# Convert the percentage to a float, because the number is
# probably a type used by numpy. MANDATORY
params = (float(percentage), id_bdd)
for value in params:
query.addBindValue(value)
query.exec_()
# # Set the percentage_match to 0 if the abstact is 'Empty' or empty
# query.prepare("UPDATE papers SET percentage_match = 0 WHERE abstract = 'Empty' OR abstract = ''")
# query.exec_()
if not self.bdd.commit():
self.l.critical("Percentages match not correctly written in db")
else:
elapsed_time = datetime.datetime.now() - start_time
self.l.info("Done calculating match percentages in {0} s".format(elapsed_time))
# NOTE(review): indentation was lost in this listing, so it is unclear whether
# the flag below is set only after a successful commit or unconditionally - confirm.
self.calculated_something = True
示例4: zip
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
'''doc_id=0
for s, p, r in zip(docs_test, y_predicted, y_test):
print(u'----------')
print(u'[Text] %s' % s)
print(u'[Label] %s' % p)
print(u'[Actual] %s' % r)'''
# Check if the total classification is empty
# If empty, fill with the first classification
total_prediction.append(y_predicted)
# Average Positive score: ~0.7
# Min Score: ~0.002
# Max Score: ~2.86
dec = clf.decision_function(docs_test)
# Numpy array, .T = Transpose
# Transpose the classification to be exported to csv file
multiLabel = np.array(total_prediction).T
# Save the classification to file: binaryClass.csv
with open('workbook/binaryClass.csv', 'w', newline='') as z:
writer = csv.writer(z)
writer.writerows(multiLabel)
# Save values from confusion matrix to variables to use later
TP, TN, FP, FN = calcValues(testY, multiLabel)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
accuracy = (TP + TN) / (TP + TN + FP + FN)
示例5: blend_clfs_CV
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
#.........这里部分代码省略.........
#print "Number of sub-training samples: ", len(X_train)
#print "Number of validation samples: :", len(X_Val)
# feature selection
#select = SelectKBest(chi2, k=7)
# dimensionality reduction ( PCA)
pca = PCA(n_components=2, whiten=True)
# randomized grid search???
clfs = [
LogisticRegression(),
SVC(kernel='rbf', gamma=1.0, C=0.1, probability=True, verbose=True, random_state=1),
xgb.XGBClassifier(objective='binary:logistic', max_depth=3, n_estimators=300, learning_rate=0.05),
KNeighborsClassifier(n_neighbors=100),
RandomForestClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
#RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion='entropy', random_state=1)
RandomForestClassifier(n_estimators=500, max_depth=3, n_jobs=-1, criterion='entropy', random_state=1),
AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", learning_rate=0.01, n_estimators=50, random_state=1),
ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
ExtraTreesClassifier(n_estimators=100, max_depth=3, min_samples_split=5, min_samples_leaf=5, n_jobs=-1, criterion='gini'),
ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='entropy'),
GradientBoostingClassifier(learning_rate=0.01, subsample=0.8, loss='exponential', max_depth=6, n_estimators=50)]
#C_range = 10.0 ** np.arange(-2, 3)
#gamma_range = 10.0 ** np.arange(-2, 3)
#param_grid = {"gamma": gamma_range.tolist(), "C": C_range.tolist(), "kernel": ['rbf', 'linear', 'sigmoid', 'poly']}
#grid = GridSearchCV(SVC(), param_grid, n_jobs=-1, verbose=2)
#grid = RandomizedSearchCV(SVC(), param_grid, n_iter=20, n_jobs=-1, verbose=2)
#grid.fit(X, X_target)
#print("The best classifier is: ", grid.best_estimator_)
#print(grid.grid_scores_)
for j, clf in enumerate(clfs):
print j, clf
# pipeline with feature selection, pca and classifier
if pcompa==True:
#pipeline = Pipeline([('select', select), ('pca', pca), ('clf', clf)])
pipeline = Pipeline([('pca', pca), ('clf', clf)])
else:
pipeline = Pipeline([('clf', clf)])
# cross validation
skf = StratifiedKFold(train_target, n_folds=5, random_state=1)
scores = []
for k, (train, test) in enumerate(skf):
pipeline.fit(X_train[train], train_target[train])
if hasattr(pipeline, 'predict_proba'):
score = log_loss(train_target[test], pipeline.predict_proba(X_train[test])[:, 1])
else:
score = log_loss(train_target[test], pipeline.decision_function(X_train[test]))
scores.append(score)
print 'Fold: %s, Class dist: %s, Log loss: %.3f ' %(k+1, np.bincount(train_target[train]), score)
print 'CV accuracy: %.3f +/- %.3f ' %(
np.mean(scores), np.std(scores))
## Learning curves
#train_sizes, train_scores, test_scores = \
# learning_curve(estimator=pipeline,
# X=X_train,
# y=train_target,
# train_sizes=np.linspace(.1, 1.0, 5),
# cv=5,
# scoring='log_loss',
# n_jobs=1)
#train_mean = np.mean(train_scores, axis=1)
#train_std = np.std(train_scores, axis=1)
#test_mean = np.mean(test_scores, axis=1)
#test_std = np.std(test_scores, axis=1)
#total_training_probabilities
training_probs = pipeline.predict_proba(X)[:,1]
training_probs_df = pd.DataFrame(data=training_probs, columns=["probability"])
training_submission = 'CV_training_layer_' + str(layer) + '_' + str(clf.__class__.__name__) + str(j) + '_feature_' + str(f_number) + '_pca_' + str(pcompa)
training_probs_df.to_csv(training_submission + '.csv', index=False)
## test on the hold out set
print 'Log Loss: %.5f ' %(log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1]))
## test on real test set, save submission
test_predictions = pipeline.predict_proba(Y_test)[:,1]
test_predictions_df = pd.DataFrame(data=test_predictions, columns=["probability"])
Y_test_id.columns = ["t_id"]
pred_submission = pd.concat((Y_test_id, test_predictions_df), axis = 1)
submission = 'CV_layer_' + str(layer) + '_' + str(clf.__class__.__name__) + str(j) + '_feature_' + str(f_number)
pred_submission.to_csv(submission + '.csv', index = False)
submission_stats = open(submission + '.txt', 'a')
submission_stats.write(str(clf) + '\n')
submission_stats.write('pca = ' + str(pcompa) + '\n')
submission_stats.write('Log Loss on Validation set: %.5f ' %(log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1])) + '\n')
submission_stats.write(' ' + '\n')
submission_stats.close()
示例6: open
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
metrics.classification_report(
Y_test,
logistic_classifier.predict(X_test))))
print 'classes : ',classifier.classes_
print 'RBM and Logistic regression : ', classifier.predict(X_test)
print 'Raw Logistic regression', logistic_classifier.predict(X_test)
logistic_proba = logistic_classifier.predict_proba(X_test)
print 'logistic_classifier decision function : \n',logistic_classifier.decision_function(X_test)
print 'logistic_classifier predict_proba : \n', logistic_proba
classifier_proba = classifier.predict_proba(X_test)
print 'classifier decision function : \n',classifier.decision_function(X_test)
print 'classifier decision predict_proba : \n',classifier_proba
if classifier_proba[0][1] < 0.6:
print 'classifier ___________ led is acting strange'
print 'current value : ',led_status[end-start-1]
print 'desired value : ',X[0][end-start-1]
f = open('transmit_confirm.txt','w')
f.write(str(1))
f.close()
print 'set led to : ', X[0][end-start-1]
f = open('set_led.txt','w')
f.write(str(X[0][end-start-1]))
示例7: CV_holdout
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
def CV_holdout(pcompa=False):
    """Run 5-fold stratified CV per classifier and print the hold-out log loss.

    The first frame returned by ``load_data()`` is split into its ``target``
    column and the remaining feature columns, and 33% of the rows are held
    out for validation. Each classifier in ``clfs`` is wrapped in a pipeline
    and fitted fold by fold. The mean and standard deviation of the fold log
    losses are printed, followed by the log loss on the hold-out rows.

    Parameters
    ----------
    pcompa : bool, optional
        When true the classifier is preceded by a 2-component whitened PCA,
        otherwise by ``SelectKBest(chi2, k=20)``.

    Returns
    -------
    None
        Results are only printed.
    """
    # The second object returned by load_data() is not needed in this function.
    X, _ = load_data()
    training_target = X[['target']].as_matrix().flatten()
    X_np = X.drop('target', axis=1).as_matrix()

    # Split the training data into a training and a validation (hold-out) part.
    X_train, X_Val, train_target, val_target = train_test_split(
        X_np, training_target, test_size=0.33)

    select = SelectKBest(chi2, k=20)
    pca = PCA(n_components=2, whiten=True)

    # Candidate models; add further estimators here to compare them.
    clfs = [LogisticRegression()]

    for j, clf in enumerate(clfs):
        print('%s %s' % (j, clf.__class__.__name__))

        # Either PCA or univariate feature selection feeds the classifier.
        first_step = ('pca', pca) if pcompa else ('select', select)
        pipeline = Pipeline([first_step, ('clf', clf)])

        skf = StratifiedKFold(train_target, n_folds=5, random_state=1)
        scores = []
        for train, test in skf:
            pipeline.fit(X_train[train], train_target[train])
            if hasattr(pipeline, 'predict_proba'):
                score = log_loss(train_target[test],
                                 pipeline.predict_proba(X_train[test])[:, 1])
                print('%s %s' % (pipeline.predict(X_train[test])[:10],
                                 train_target[test][:10]))
            else:
                # NOTE(review): decision_function values are not probabilities;
                # log_loss on them is presumably not meaningful - confirm.
                score = log_loss(train_target[test],
                                 pipeline.decision_function(X_train[test]))
            scores.append(score)

        # The per-fold values are log losses, so label them as such
        # (the message used to read "CV accuracy").
        print('CV log loss: %.3f +/- %.3f ' % (np.mean(scores), np.std(scores)))

        # Test on the hold-out set.
        # NOTE(review): the pipeline is still the one fitted on the last CV
        # fold, not on the whole of X_train - confirm this is intended.
        print('Log Loss: %.5f ' % (
            log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1])))
示例8: SelectKBest
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
('feature_selection', SelectKBest(f_regression, k=1000)),
#('reduce_dims',PCA()),
('mnb', MultinomialNB())
])
clf.fit(X_train, y_train)
train_time = time() - t0
print("train time: %0.3fs" % train_time)
t0 = time()
pred = clf.predict(X_test)
# Obtain per-class probabilities, degrading gracefully for estimators that
# lack predict_proba: first a softmax over decision_function scores, then
# one-hot encoded hard predictions as a last resort.
try:
    pred_prob = clf.predict_proba(X_test)
except AttributeError:
    try:
        dec_f = np.asarray(clf.decision_function(X_test), dtype=float)
    except AttributeError:
        pred_prob = LabelBinarizer().fit_transform(pred.tolist())
    else:
        if dec_f.ndim == 1:
            # Binary case: a single signed score per sample. Pairing it with
            # a zero score makes the softmax below equal to the logistic
            # sigmoid, giving [P(negative), P(positive)] columns.
            dec_f = np.column_stack((np.zeros_like(dec_f), dec_f))
        # Softmax per sample (row), not over the whole array, so that every
        # row sums to 1; subtracting the row maximum avoids overflow in exp.
        exp_scores = np.exp(dec_f - dec_f.max(axis=1, keepdims=True))
        pred_prob = exp_scores / exp_scores.sum(axis=1, keepdims=True)
test_time = time() - t0
print("test time: %0.3fs" % test_time)
score = metrics.accuracy_score(y_test, pred)
print("accuracy: %0.3f" % score)
y_test_prob = LabelBinarizer().fit_transform(y_test)
log_loss = metrics.log_loss(y_test_prob, pred_prob)
print("log_loss: %0.3f" % log_loss)
if hasattr(clf, 'coef_'):
示例9: singular_lgls
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
def singular_lgls(pcompa = False):
#X, training_target, Y_test, Y_test_id = load_data()
X, Y = load_data(original=True)
test_id = Y[['t_id']].as_matrix()
test_id = test_id.flatten()
training_target = X[['target']].as_matrix()
training_target = training_target.flatten()
features = []
lgls = []
for i in X.columns:
if str(i) == 'target':
pass
else:
#print "Feature %s " %(str(i))
features.append(str(i))
feature_X = X[str(i)]
feature_Y = Y[str(i)]
X_np = feature_X.as_matrix()
Y_np = feature_Y.as_matrix()
# split traininf data in to training and validation set
X_train, X_Val, train_target, val_target = train_test_split(X_np, training_target, test_size=0.33, random_state=4)
X_train = np.reshape(X_train, (len(X_train), 1))
X_Val = np.reshape(X_Val, (len(X_Val), 1))
np.reshape(train_target, (len(train_target), 1))
np.reshape(val_target, (len(val_target), 1))
# feature selection
select = SelectKBest(chi2, k=20)
# dimensionality reduction ( PCA)
pca = PCA(n_components=2, whiten=True)
# randomized grid search???
clfs = [
LogisticRegression()]
#xgb.XGBClassifier(objective='binary:logistic', max_depth=3, n_estimators=300, learning_rate=0.05),
#KNeighborsClassifier(n_neighbors=100),
#RandomForestClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
#RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion='entropy', random_state=1)
#RandomForestClassifier(n_estimators=500, max_depth=3, n_jobs=-1, criterion='entropy', random_state=1),
#AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", learning_rate=0.01, n_estimators=50, random_state=1),
#ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
#ExtraTreesClassifier(n_estimators=100, max_depth=3, min_samples_split=5, min_samples_leaf=5, n_jobs=-1, criterion='gini'),
#ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='entropy'),
#GradientBoostingClassifier(learning_rate=0.01, subsample=0.8, loss='exponential', max_depth=6, n_estimators=50)]
for j, clf in enumerate(clfs):
#print j, clf.__class__.__name__
# pipeline with feature selection, pca and classifier
if pcompa==True:
#pipeline = Pipeline([('select', select), ('pca', pca), ('clf', clf)])
pipeline = Pipeline([('pca', pca), ('clf', clf)])
else:
pipeline = Pipeline([('clf', clf)])
#pipeline = Pipeline([('select', select), ('clf', clf)])
# cross validation
skf = StratifiedKFold(train_target, n_folds=5, random_state=1)
scores = []
for k, (train, test) in enumerate(skf):
pipeline.fit(X_train[train], train_target[train])
if hasattr(pipeline, 'predict_proba'):
score = log_loss(train_target[test], pipeline.predict_proba(X_train[test])[:, 1])
else:
score = log_loss(train_target[test], pipeline.decision_function(X_train[test]))
scores.append(score)
#print 'Fold: %s, Class dist: %s, Log loss: %.3f ' %(k+1, np.bincount(train_target[train]), score)
#print 'CV accuracy: %.3f +/- %.3f ' %(
# np.mean(scores), np.std(scores))
## test on the hold out set
#print 'Log Loss: %.5f ' %(log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1]))
lgls.append(log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1]))
## Learning curves
#train_sizes, train_scores, test_scores = \
# learning_curve(estimator=pipeline,
# X=X_train,
# y=train_target,
# train_sizes=np.linspace(.1, 1.0, 5),
# cv=5,
# scoring='log_loss',
# n_jobs=1)
#train_mean = np.mean(train_scores, axis=1)
#train_std = np.std(train_scores, axis=1)
#test_mean = np.mean(test_scores, axis=1)
#test_std = np.std(test_scores, axis=1)
#.........这里部分代码省略.........
示例10: combinations_lgls
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
def combinations_lgls(pcompa = False, differences = True, addition = False, multiplication = False, division = False):
    """Rank pairwise feature combinations by hold-out log loss.

    For every ordered pair of distinct columns ``feature1`` .. ``feature21``
    one combined feature is built, a logistic-regression pipeline is fitted
    on it, and the log loss on a 33% hold-out split is recorded.

    Parameters
    ----------
    pcompa : bool, optional
        When true a 2-component whitened PCA precedes the classifier.
    differences, addition, multiplication, division : bool, optional
        Select how the two columns are combined. Only the first true flag,
        in this order, is used.

    Returns
    -------
    list of (str, float)
        ``(combination name, log loss)`` pairs sorted by ascending log loss,
        restricted to those whose log loss is below the value returned by
        ``singular_lgls()``.

    Raises
    ------
    ValueError
        If none of the four operation flags is true. Previously this ended
        in a ``NameError`` on an unassigned variable.
    """
    # Choose the operation once; precedence matches the documented flag order.
    if differences:
        symbol, combine = "-", lambda a, b: a - b
    elif addition:
        symbol, combine = "+", lambda a, b: a + b
    elif multiplication:
        symbol, combine = "x", lambda a, b: a * b
    elif division:
        symbol, combine = "/", lambda a, b: a.div(b)
    else:
        raise ValueError("one of differences, addition, multiplication "
                         "or division must be true")

    # The second object returned by load_data() is not needed in this function.
    X, _ = load_data(original=True)
    training_target = X[['target']].as_matrix().flatten()

    names = ['feature%d' % x for x in range(1, 22)]
    features = []
    lgls = []
    for f in names:
        for g in names:
            if f == g:
                continue
            features.append(f + symbol + g)
            X_np = combine(X[f], X[g]).as_matrix()

            # Split the training data into a training and a validation set;
            # the estimators expect 2-D input, hence the single-column reshape.
            X_train, X_Val, train_target, val_target = train_test_split(
                X_np, training_target, test_size=0.33, random_state=4)
            X_train = np.reshape(X_train, (len(X_train), 1))
            X_Val = np.reshape(X_Val, (len(X_Val), 1))

            # NOTE(review): with one input column, a 2-component PCA
            # presumably cannot be fitted - confirm before using pcompa=True.
            pca = PCA(n_components=2, whiten=True)
            clfs = [LogisticRegression()]
            for clf in clfs:
                if pcompa:
                    pipeline = Pipeline([('pca', pca), ('clf', clf)])
                else:
                    pipeline = Pipeline([('clf', clf)])

                # The per-fold scores were never used, so only the fits are
                # kept: the pipeline evaluated below is the one left by the
                # last fold, exactly as before.
                # NOTE(review): confirm that fitting on the last fold only
                # (rather than on all of X_train) is intended.
                skf = StratifiedKFold(train_target, n_folds=5, random_state=1)
                for train, _test in skf:
                    pipeline.fit(X_train[train], train_target[train])

                lgls.append(log_loss(val_target,
                                     pipeline.predict_proba(X_Val)[:, 1]))

    combination_scores = sorted(zip(features, lgls), key=lambda x: x[1])
    single_f_average = singular_lgls()
    return [x for x in combination_scores if x[1] < single_f_average]
示例11: LogisticRegression
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
('features', features),
('Logistic', LogisticRegression(C=0.00077426, class_weight='balanced'))
])
model2.fit(fannie_train, status_train)
status_pred2 = model2.predict(fannie_test)
# print('Best C is: ', model2.named_steps['Logistic'].C_)
print('Coefficients: ', model2.named_steps['Logistic'].coef_)
print(classification_report(status_test, status_pred2))
print(pd.DataFrame(confusion_matrix(status_test, status_pred2), index=['Actual Healthy',
'Actual Default'],
columns=['Pred. Healthy', 'Pred. Default']))
# Ranking metrics need continuous scores: with hard 0/1 predictions the ROC
# and precision-recall curves collapse to a single threshold. Use the
# decision-function values for all three so they describe the same ranking.
status_score2 = model2.decision_function(fannie_test)
print('Area under the curve is', roc_auc_score(status_test, status_score2))
prec, rec, thres1 = precision_recall_curve(status_test, status_score2)
fpr, tpr, thres2 = roc_curve(status_test, status_score2)
with open('log_prec_rec.dill', 'wb') as f:
dill.dump((prec, rec, thres1), f)
with open('log_fpr_tpr.dill', 'wb') as f:
dill.dump((fpr, tpr, thres2), f)
with open('log_model.dill', 'wb') as f:
dill.dump(model2, f)
print('finishing dumping Logistic regression results to file!')
# # Support Vector Machine
# features = FeatureUnion([
# ('Loan_Amount', ExtractNormalized('STATE', 'ORIG_AMT')),
# #('Interest_Rate', ExtractNormalized('STATE','ORIG_RT')),