Python Pipeline.decision_function方法代码示例

本文整理汇总了Python中sklearn.pipeline.Pipeline.decision_function方法的典型用法代码示例。如果您正苦于以下问题：Python Pipeline.decision_function方法的具体用法？Python Pipeline.decision_function怎么用？Python Pipeline.decision_function使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.pipeline.Pipeline的用法示例。

在下文中一共展示了Pipeline.decision_function方法的11个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_pipeline_methods_preprocessing_svm

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
def test_pipeline_methods_preprocessing_svm():
    """Check the output shape of each prediction method of a two-step pipeline.

    A pipeline made of one preprocessing step followed by an SVC is fitted
    on the iris data, once per preprocessing step (standard scaling, then
    randomized PCA).  Every prediction-style method is then called on the
    training data and the shape of its result is asserted.
    """
    dataset = load_iris()
    X, y = dataset.data, dataset.target

    # Expected shapes: one value per sample, or one column per class.
    per_sample_shape = (X.shape[0],)
    per_class_shape = (X.shape[0], len(np.unique(y)))

    preprocessors = (
        StandardScaler(),
        RandomizedPCA(n_components=2, whiten=True),
    )
    # The same SVC instance is reused (and refitted) for every pipeline.
    svc = SVC(probability=True, random_state=0)

    for step in preprocessors:
        model = Pipeline([('preprocess', step), ('svc', svc)])
        model.fit(X, y)

        assert_equal(model.predict(X).shape, per_sample_shape)

        # These three methods are all expected to return one column per class.
        for method_name in ('predict_proba', 'predict_log_proba',
                            'decision_function'):
            output = getattr(model, method_name)(X)
            assert_equal(output.shape, per_class_shape)

        # score() is only exercised; its value is not checked.
        model.score(X, y)

开发者ID:Givonaldo，项目名称:scikit-learn，代码行数:31，代码来源:test_pipeline.py

示例2: Pipeline

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
        stop_words='english',
        ngram_range=(1, 2),
        max_df=1.0,
        max_features=100000
    )

    print "Create pipeline for vectorizer => classifier"
    vect_clf = Pipeline([('vect', marisa_uni_vect),
                         ('clf', LinearSVC())])

    print "Train Model"
    vect_clf = vect_clf.fit(train_resume_text, train_labels)

    print "Predict test samples"
    predicted_score = vect_clf.predict(test_resume_text)
    predicted_decision = vect_clf.decision_function(test_resume_text)

    # accuracy = np.mean(predicted_score == test_labels)
    # p = precision_score(test_labels, predicted_score, average='macro')
    # r = recall_score(test_labels, predicted_score, average='macro')
    #
    # print accuracy
    # print p
    # print r

    # print classification_report([t for t in test_labels], [p for p in predicted_score])
    predicted = []

    actual_vs_predicted = []

    for i in range(len(test_labels)):

开发者ID:MysteriousMagics，项目名称:NLPCareerTrajectory，代码行数:33，代码来源:career_trajectory_svm_final_0503.py

示例3: Predictor

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]

#.........这里部分代码省略.........
            self.x_train.append(abstract)
            self.y_train.append(category)

        # To count for RuntimeWarning: divide by zero encountered in log
        if (not self.x_train or 0 not in self.y_train or
                1 not in self.y_train):
            self.l.error("Not enough data yet to feed the classifier")
            return

        self.classifier = Pipeline([
            ('vectorizer', CountVectorizer(stop_words=self.stop_words)),
            ('tfidf', TfidfTransformer()),
            ('clf', LinearSVC())])

        try:
            self.classifier.fit(self.x_train, self.y_train)
        except ValueError:
            self.l.error("Not enough data yet to train the classifier")
            return

        elapsed_time = datetime.datetime.now() - start_time
        self.l.debug("Initializing classifier in {0}".format(elapsed_time))

        return True


    # @profile
    # def calculatePercentageMatch(self):
    def run(self):

        """Compute a match percentage for every paper and store it in the db.

        Steps performed:

        1. Read ``id`` and ``topic_simple`` of every row of the ``papers``
           table through ``self.bdd``.
        2. Feed the ``topic_simple`` values to
           ``self.classifier.decision_function``.
        3. Rescale the scores linearly to the 0-100 range (min-max scaling).
        4. Write each value to ``papers.percentage_match`` inside a single
           transaction.

        The method returns None in every case. It returns early, after
        logging an error, when scoring or rescaling raises. On the normal
        path ``self.calculated_something`` is set to True, whether or not
        the commit succeeded.
        """

        self.l.debug("Starting calculations of match percentages")
        start_time = datetime.datetime.now()

        query = QtSql.QSqlQuery(self.bdd)

        query.exec_("SELECT id, topic_simple FROM papers")

        # Parallel lists: list_id[i] is the db id of the text in x_test[i]
        list_id = []
        x_test = []

        while query.next():
            record = query.record()
            abstract = record.value('topic_simple')
            x_test.append(abstract)
            list_id.append(record.value('id'))

        try:
            # Normalize the percentages with a min-max rescaling to 0-100
            # http://stackoverflow.com/questions/929103/convert-a-number-range-to-another-range-maintaining-ratio
            # From here on x_test holds the classifier scores, no longer the
            # input texts
            x_test = self.classifier.decision_function(x_test)

            elapsed_time = datetime.datetime.now() - start_time
            self.l.debug("Classifier predicted proba in {}".format(elapsed_time))
            diff_time = datetime.datetime.now()

            maximum = max(x_test)
            minimum = min(x_test)
            # NOTE(review): with this formula the smallest score maps to 100
            # and the largest to 0. Presumably the category of interest lies
            # on the negative side of the decision function -- confirm
            # against the labels used at training time.
            # NOTE(review): if every score is identical, the denominator is
            # zero -- TODO confirm how this case should be handled.
            list_percentages = 100 - (x_test - minimum) * 100 / (maximum - minimum)

            self.l.debug("Classifier normalized proba in {}".
                         format(datetime.datetime.now() - diff_time))

        except AttributeError:
            # Presumably raised when self.classifier has not been created
            # yet -- verify against the initialization code
            self.l.error("Not enough data yet to predict probability")
            return
        except Exception as e:
            # Any other failure: log it with the full traceback and give up
            self.l.error("predictor: {}".format(e))
            self.l.error(traceback.format_exc())
            return

        # All the updates are grouped in one transaction, committed below
        self.bdd.transaction()
        query = QtSql.QSqlQuery(self.bdd)

        query.prepare("UPDATE papers SET percentage_match = ? WHERE id = ?")

        for id_bdd, percentage in zip(list_id, list_percentages):

            # Convert the percentage to a float, because the number is
            # probably a type used by numpy. MANDATORY
            params = (float(percentage), id_bdd)

            # Bind the values in the order of the '?' placeholders
            for value in params:
                query.addBindValue(value)

            query.exec_()

        # # Set the percentage_match to 0 if the abstract is 'Empty' or empty
        # query.prepare("UPDATE papers SET percentage_match = 0 WHERE abstract = 'Empty' OR abstract = ''")
        # query.exec_()

        if not self.bdd.commit():
            self.l.critical("Percentages match not correctly written in db")
        else:
            elapsed_time = datetime.datetime.now() - start_time
            self.l.info("Done calculating match percentages in {0} s".format(elapsed_time))

        # Set even when the commit above failed
        self.calculated_something = True

开发者ID:RKBK，项目名称:ChemBrows，代码行数:104，代码来源:predictor.py

示例4: zip

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
        '''doc_id=0
        for s, p, r in zip(docs_test, y_predicted, y_test):
            print(u'----------')
            print(u'[Text] %s' % s)
            print(u'[Label] %s' % p)
            print(u'[Actual] %s' % r)'''

        # Check if the total classification is empty
        # If empty, fill with the first classification
        total_prediction.append(y_predicted)


        # Average Positive score: ~0.7
        # Min Score: ~0.002
        # Max Score: ~2.86
        dec = clf.decision_function(docs_test)

    # Numpy array, .T = Transpose
    # Transpose the classification to be exported to csv file
    multiLabel = np.array(total_prediction).T

    # Save the classification to file: binaryClass.csv
    with open('workbook/binaryClass.csv', 'w', newline='') as z:
        writer = csv.writer(z)
        writer.writerows(multiLabel)

    # Save values from confusion matrix to variables to use later
    TP, TN, FP, FN = calcValues(testY, multiLabel)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    accuracy = (TP + TN) / (TP + TN + FP + FN)

开发者ID:RuneUK，项目名称:MSc-Dissertation，代码行数:33，代码来源:binaryClassification.py

示例5: blend_clfs_CV

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]

#.........这里部分代码省略.........
    #print "Number of sub-training samples: ", len(X_train)
    #print "Number of validation samples: :", len(X_Val)

    # feature selection
    #select = SelectKBest(chi2, k=7)

    # dimensionality reduction ( PCA)
    pca = PCA(n_components=2, whiten=True)

    # randomized grid search???

    clfs = [
            LogisticRegression(),
            SVC(kernel='rbf', gamma=1.0, C=0.1, probability=True, verbose=True, random_state=1),
            xgb.XGBClassifier(objective='binary:logistic', max_depth=3, n_estimators=300, learning_rate=0.05),
            KNeighborsClassifier(n_neighbors=100),
            RandomForestClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
            #RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion='entropy', random_state=1)
            RandomForestClassifier(n_estimators=500, max_depth=3, n_jobs=-1, criterion='entropy', random_state=1),
            AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", learning_rate=0.01, n_estimators=50, random_state=1),
            ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
            ExtraTreesClassifier(n_estimators=100, max_depth=3, min_samples_split=5, min_samples_leaf=5, n_jobs=-1, criterion='gini'),
            ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='entropy'),
            GradientBoostingClassifier(learning_rate=0.01, subsample=0.8, loss='exponential', max_depth=6, n_estimators=50)]

    #C_range = 10.0 ** np.arange(-2, 3)
    #gamma_range = 10.0 ** np.arange(-2, 3)
    #param_grid = {"gamma": gamma_range.tolist(), "C": C_range.tolist(), "kernel": ['rbf', 'linear', 'sigmoid', 'poly']}
    #grid = GridSearchCV(SVC(), param_grid, n_jobs=-1, verbose=2)
    #grid = RandomizedSearchCV(SVC(), param_grid, n_iter=20, n_jobs=-1, verbose=2)
    #grid.fit(X, X_target)
    #print("The best classifier is: ", grid.best_estimator_)
    #print(grid.grid_scores_)

    for j, clf in enumerate(clfs):
        print j, clf
        # pipeline with feature selection, pca and classifier
        if pcompa==True:
            #pipeline = Pipeline([('select', select), ('pca', pca), ('clf', clf)])
            pipeline = Pipeline([('pca', pca), ('clf', clf)])
        else:
            pipeline = Pipeline([('clf', clf)])

        # cross validation
        skf = StratifiedKFold(train_target, n_folds=5, random_state=1)
        
        scores = []

        for k, (train, test) in enumerate(skf):
            pipeline.fit(X_train[train], train_target[train])
            if hasattr(pipeline, 'predict_proba'):
                score = log_loss(train_target[test], pipeline.predict_proba(X_train[test])[:, 1])
            else:
                score = log_loss(train_target[test], pipeline.decision_function(X_train[test]))
            
            scores.append(score)
            
            print 'Fold: %s, Class dist: %s, Log loss: %.3f ' %(k+1, np.bincount(train_target[train]), score)

        print 'CV accuracy: %.3f +/- %.3f ' %(
                            np.mean(scores), np.std(scores))

        ## Learning curves
        #train_sizes, train_scores, test_scores = \
        #        learning_curve(estimator=pipeline,
        #                       X=X_train,
        #                       y=train_target,
        #                       train_sizes=np.linspace(.1, 1.0, 5),
        #                       cv=5,
        #                       scoring='log_loss',
        #                       n_jobs=1)

        #train_mean = np.mean(train_scores, axis=1)
        #train_std = np.std(train_scores, axis=1)

        #test_mean = np.mean(test_scores, axis=1)
        #test_std = np.std(test_scores, axis=1)
        
        #total_training_probabilities
        training_probs = pipeline.predict_proba(X)[:,1]
        training_probs_df = pd.DataFrame(data=training_probs, columns=["probability"])
        training_submission = 'CV_training_layer_' + str(layer) + '_' + str(clf.__class__.__name__) + str(j) + '_feature_' + str(f_number) + '_pca_' + str(pcompa) 
        training_probs_df.to_csv(training_submission + '.csv', index=False)

        ## test on the hold out set
        print 'Log Loss: %.5f ' %(log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1]))
        
        ## test on real test set, save submission
        test_predictions = pipeline.predict_proba(Y_test)[:,1]
        test_predictions_df = pd.DataFrame(data=test_predictions, columns=["probability"])
        Y_test_id.columns = ["t_id"]
        pred_submission = pd.concat((Y_test_id, test_predictions_df), axis = 1)
        submission = 'CV_layer_' + str(layer) + '_' + str(clf.__class__.__name__) + str(j) + '_feature_' + str(f_number)
        pred_submission.to_csv(submission + '.csv', index = False)
        submission_stats = open(submission + '.txt', 'a')
        submission_stats.write(str(clf) + '\n')
        submission_stats.write('pca = ' + str(pcompa) + '\n')
        submission_stats.write('Log Loss on Validation set: %.5f ' %(log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1])) + '\n')
        submission_stats.write(' ' + '\n')
        submission_stats.close()

开发者ID:jhayes14，项目名称:Num，代码行数:104，代码来源:model_CV.py

示例6: open

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
     metrics.classification_report(
         Y_test,
         logistic_classifier.predict(X_test))))
 
 print 'classes : ',classifier.classes_
 print 'RBM and Logistic regression : ', classifier.predict(X_test) 
 print 'Raw Logistic regression', logistic_classifier.predict(X_test)
 
 logistic_proba = logistic_classifier.predict_proba(X_test)
 
 print 'logistic_classifier decision function : \n',logistic_classifier.decision_function(X_test)
 print 'logistic_classifier predict_proba : \n', logistic_proba
 
 classifier_proba = classifier.predict_proba(X_test)
 
 print 'classifier decision function : \n',classifier.decision_function(X_test)
 print 'classifier decision predict_proba : \n',classifier_proba
 
 
 if classifier_proba[0][1] < 0.6:
     print 'classifier ___________ led is acting strange'
     print 'current value : ',led_status[end-start-1]
     print 'desired value : ',X[0][end-start-1]
     
     f = open('transmit_confirm.txt','w')
     f.write(str(1))
     f.close()
     
     print 'set led to : ', X[0][end-start-1]
     f = open('set_led.txt','w')
     f.write(str(X[0][end-start-1]))

开发者ID:BenSlabbert，项目名称:Skripsie，代码行数:33，代码来源:CMU_RBM_AND_LOGISTIC_REGRESSION_LED_STATE.py

示例7: CV_holdout

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
def CV_holdout(pcompa = False):
    """Cross-validate a pipeline on a training split and score a hold-out split.

    The data comes from ``load_data()`` (defined elsewhere). 33% of the
    training rows are set aside as a hold-out set. For each classifier in
    ``clfs`` a pipeline is built, evaluated with a 5-fold stratified
    cross-validation (log loss per fold), and finally scored on the
    hold-out set. The results are printed; nothing is returned.

    pcompa -- if True the pipeline is PCA + classifier, otherwise it is
              SelectKBest(chi2, k=20) + classifier.

    NOTE(review): the print statements are Python 2 syntax, and
    ``StratifiedKFold(y, n_folds=...)`` looks like the legacy
    ``sklearn.cross_validation`` API -- confirm the target environment.
    """
    #X, training_target, Y_test, Y_test_id = load_data()
    X, Y = load_data()

    # Y is indexed by a 't_id' column and X by a 'target' column; both are
    # split off from the feature columns here.
    # test_id and Y_np are computed but not used below in this function.
    test_id = Y[['t_id']].as_matrix()
    test_id = test_id.flatten()
    Y = Y.drop( 't_id', axis = 1 )
    training_target = X[['target']].as_matrix()
    training_target = training_target.flatten()
    X = X.drop( 'target', axis = 1)
    X_np = X.as_matrix()
    Y_np = Y.as_matrix()

    # split the training data into a training set and a validation set
    # (no random_state is passed here, so the split differs between runs)
    X_train, X_Val, train_target, val_target = train_test_split(X_np, training_target, test_size=0.33)
    #X_train, X_Val, train_target, val_target = train_test_split(X_np, training_target, test_size=0.33, random_state=4)

    # feature selection
    select = SelectKBest(chi2, k=20)

    # dimensionality reduction ( PCA)
    pca = PCA(n_components=2, whiten=True)

    # randomized grid search???

    # Only LogisticRegression is active; the other candidates are disabled.
    clfs = [
            LogisticRegression()]
            #xgb.XGBClassifier(objective='binary:logistic', max_depth=3, n_estimators=300, learning_rate=0.05),
            #KNeighborsClassifier(n_neighbors=100),
            #RandomForestClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
            #RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion='entropy', random_state=1)
            #RandomForestClassifier(n_estimators=500, max_depth=3, n_jobs=-1, criterion='entropy', random_state=1),
            #AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", learning_rate=0.01, n_estimators=50, random_state=1),
            #ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
            #ExtraTreesClassifier(n_estimators=100, max_depth=3, min_samples_split=5, min_samples_leaf=5, n_jobs=-1, criterion='gini'),
            #ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='entropy'),
            #GradientBoostingClassifier(learning_rate=0.01, subsample=0.8, loss='exponential', max_depth=6, n_estimators=50)]

    for j, clf in enumerate(clfs):
        print j, clf.__class__.__name__
        # pipeline with feature selection, pca and classifier
        if pcompa==True:
            #pipeline = Pipeline([('select', select), ('pca', pca), ('clf', clf)])
            pipeline = Pipeline([('pca', pca), ('clf', clf)])
        else:
            #pipeline = Pipeline([('clf', clf)])
            pipeline = Pipeline([('select', select), ('clf', clf)])

        # cross validation
        skf = StratifiedKFold(train_target, n_folds=5, random_state=1)

        # one log-loss value per fold
        scores = []

        for k, (train, test) in enumerate(skf):
            # The same pipeline object is refitted on every fold
            pipeline.fit(X_train[train], train_target[train])
            if hasattr(pipeline, 'predict_proba'):
                # Column 1 of predict_proba is passed to log_loss
                score = log_loss(train_target[test], pipeline.predict_proba(X_train[test])[:, 1])
                # Show the first ten predictions next to the true targets
                print pipeline.predict(X_train[test])[:10], train_target[test][:10]
            else:
                # NOTE(review): raw decision_function values are handed to
                # log_loss here -- confirm this is intended
                score = log_loss(train_target[test], pipeline.decision_function(X_train[test]))

            scores.append(score)

            #print 'Fold: %s, Class dist: %s, Log loss: %.3f ' %(k+1, np.bincount(train_target[train]), score)

        # The label says "accuracy", but the values averaged are log losses
        print 'CV accuracy: %.3f +/- %.3f ' %(
                            np.mean(scores), np.std(scores))

        ## test on the hold out set
        # The pipeline is not refitted on the full training split: the
        # model scored here is the one fitted on the last fold above.
        
        print 'Log Loss: %.5f ' %(log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1]))

开发者ID:jhayes14，项目名称:Num，代码行数:73，代码来源:f_iterate_bad_CV.py

示例8: SelectKBest

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
      ('feature_selection', SelectKBest(f_regression, k=1000)),
      #('reduce_dims',PCA()),
      ('mnb', MultinomialNB())
        ])
clf.fit(X_train, y_train)

train_time = time() - t0
print("train time: %0.3fs" % train_time)

t0 = time()
pred = clf.predict(X_test)
try:
    pred_prob = clf.predict_proba(X_test)
except AttributeError:
    try:
        dec_f = clf.decision_function(X_test)
        pred_prob = np.exp(dec_f) / np.sum(np.exp(dec_f))
    except AttributeError:
        pred_prob = LabelBinarizer().fit_transform(pred.tolist())

test_time = time() - t0
print("test time:  %0.3fs" % test_time)

score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

y_test_prob = LabelBinarizer().fit_transform(y_test)
log_loss = metrics.log_loss(y_test_prob, pred_prob)
print("log_loss:   %0.3f" % log_loss)

if hasattr(clf, 'coef_'):

开发者ID:XuQiao，项目名称:codestudy，代码行数:33，代码来源:featuresel.py

示例9: singular_lgls

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
def singular_lgls(pcompa = False):
    #X, training_target, Y_test, Y_test_id = load_data()
    X, Y = load_data(original=True)
    
    test_id = Y[['t_id']].as_matrix()
    test_id = test_id.flatten()
   
    training_target = X[['target']].as_matrix()
    training_target = training_target.flatten()
    
    features = []
    lgls = []

    for i in X.columns:
        if str(i) == 'target':
            pass
        else:
            #print "Feature %s " %(str(i))
            features.append(str(i))
            feature_X = X[str(i)]
            feature_Y = Y[str(i)]
            X_np = feature_X.as_matrix() 
            Y_np = feature_Y.as_matrix() 

        # split traininf data in to training and validation set
        X_train, X_Val, train_target, val_target = train_test_split(X_np, training_target, test_size=0.33, random_state=4)
        X_train = np.reshape(X_train, (len(X_train), 1))
        X_Val = np.reshape(X_Val, (len(X_Val), 1))
        np.reshape(train_target, (len(train_target), 1))
        np.reshape(val_target, (len(val_target), 1))

        # feature selection
        select = SelectKBest(chi2, k=20)

        # dimensionality reduction ( PCA)
        pca = PCA(n_components=2, whiten=True)

        # randomized grid search???

        clfs = [
                LogisticRegression()]
                #xgb.XGBClassifier(objective='binary:logistic', max_depth=3, n_estimators=300, learning_rate=0.05),
                #KNeighborsClassifier(n_neighbors=100),
                #RandomForestClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
                #RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion='entropy', random_state=1)
                #RandomForestClassifier(n_estimators=500, max_depth=3, n_jobs=-1, criterion='entropy', random_state=1),
                #AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", learning_rate=0.01, n_estimators=50, random_state=1),
                #ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
                #ExtraTreesClassifier(n_estimators=100, max_depth=3, min_samples_split=5, min_samples_leaf=5, n_jobs=-1, criterion='gini'),
                #ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='entropy'),
                #GradientBoostingClassifier(learning_rate=0.01, subsample=0.8, loss='exponential', max_depth=6, n_estimators=50)]

        for j, clf in enumerate(clfs):
            #print j, clf.__class__.__name__
            # pipeline with feature selection, pca and classifier
            if pcompa==True:
                #pipeline = Pipeline([('select', select), ('pca', pca), ('clf', clf)])
                pipeline = Pipeline([('pca', pca), ('clf', clf)])
            else:
                pipeline = Pipeline([('clf', clf)])
                #pipeline = Pipeline([('select', select), ('clf', clf)])

            # cross validation
            skf = StratifiedKFold(train_target, n_folds=5, random_state=1)

            scores = []

            for k, (train, test) in enumerate(skf):
                pipeline.fit(X_train[train], train_target[train])
                if hasattr(pipeline, 'predict_proba'):
                    score = log_loss(train_target[test], pipeline.predict_proba(X_train[test])[:, 1])
                else:
                    score = log_loss(train_target[test], pipeline.decision_function(X_train[test]))

                scores.append(score)

                #print 'Fold: %s, Class dist: %s, Log loss: %.3f ' %(k+1, np.bincount(train_target[train]), score)

            #print 'CV accuracy: %.3f +/- %.3f ' %(
            #                    np.mean(scores), np.std(scores))

            ## test on the hold out set
            #print 'Log Loss: %.5f ' %(log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1]))
            lgls.append(log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1]))

            ## Learning curves
            #train_sizes, train_scores, test_scores = \
            #        learning_curve(estimator=pipeline,
            #                       X=X_train,
            #                       y=train_target,
            #                       train_sizes=np.linspace(.1, 1.0, 5),
            #                       cv=5,
            #                       scoring='log_loss',
            #                       n_jobs=1)

            #train_mean = np.mean(train_scores, axis=1)
            #train_std = np.std(train_scores, axis=1)

            #test_mean = np.mean(test_scores, axis=1)
            #test_std = np.std(test_scores, axis=1)
#.........这里部分代码省略.........

开发者ID:jhayes14，项目名称:Num，代码行数:103，代码来源:f_singular.py

示例10: combinations_lgls

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
def combinations_lgls(pcompa = False, differences = True, addition = False, multiplication = False, division = False):
    """Score every ordered pair of features combined by one arithmetic operation.

    For each ordered pair (f, g) of distinct columns among ``feature1`` ..
    ``feature21``, a single derived feature is built, a classifier pipeline
    is trained on it, and its log loss on a fixed hold-out split is
    recorded.

    pcompa         -- if True, a PCA step is put in front of the classifier.
    differences    -- use ``f - g`` as the derived feature (default).
    addition       -- use ``f + g``.
    multiplication -- use ``f * g``.
    division       -- use ``f / g``.

    Only one operation is applied per call: the first flag that is true, in
    the order listed above, wins.  A ValueError is raised when all four
    flags are false (previously this failed with a NameError on the first
    pair).

    Returns the list of ``(name, log_loss)`` tuples, sorted by increasing
    log loss, restricted to the combinations whose log loss is lower than
    the value returned by ``singular_lgls()``.
    """
    if differences:
        symbol, combine = "-", lambda a, b: a - b
    elif addition:
        symbol, combine = "+", lambda a, b: a + b
    elif multiplication:
        symbol, combine = "x", lambda a, b: a * b
    elif division:
        symbol, combine = "/", lambda a, b: a.div(b)
    else:
        raise ValueError("combinations_lgls: at least one of differences, "
                         "addition, multiplication or division must be true")

    # The test frame returned by load_data is not needed to score features.
    X, _ = load_data(original=True)

    training_target = X[['target']].as_matrix().flatten()

    ### INCLUDE ALL NOT JUST THESE 5 ###
    feature_names = ['feature%d' % x for x in range(1, 22)]

    features = []
    lgls = []

    for f in feature_names:
        for g in feature_names:
            if f == g:
                continue

            features.append(f + symbol + g)
            X_np = combine(X[f], X[g]).as_matrix()

            # split the training data into a training set and a validation
            # set; the fixed random_state gives the same rows for every pair
            X_train, X_Val, train_target, val_target = train_test_split(X_np, training_target, test_size=0.33, random_state=4)
            # the estimators expect 2-D input: one column, one row per sample
            X_train = np.reshape(X_train, (len(X_train), 1))
            X_Val = np.reshape(X_Val, (len(X_Val), 1))

            # dimensionality reduction ( PCA)
            pca = PCA(n_components=2, whiten=True)

            # NOTE: `features` gets one entry per pair, so it only lines up
            # with `lgls` while this list holds a single classifier.
            clfs = [LogisticRegression()]

            for clf in clfs:
                if pcompa:
                    pipeline = Pipeline([('pca', pca), ('clf', clf)])
                else:
                    pipeline = Pipeline([('clf', clf)])

                # cross validation: the pipeline is refitted on every fold.
                # The per-fold scores were never used, so only the fit is
                # kept here.
                skf = StratifiedKFold(train_target, n_folds=5, random_state=1)
                for train, _test in skf:
                    pipeline.fit(X_train[train], train_target[train])

                # Exactly one hold-out score per feature pair, recorded after
                # the fold loop (as singular_lgls does).  Appending inside
                # the fold loop produced five scores per pair and made
                # zip(features, lgls) pair names with the wrong scores.
                lgls.append(log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1]))

    combination_scores = sorted(zip(features, lgls), key=lambda x: x[1])
    single_f_average = singular_lgls()

    return [x for x in combination_scores if x[1] < single_f_average]

开发者ID:jhayes14，项目名称:Num，代码行数:94，代码来源:f_singular.py

示例11: LogisticRegression

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import decision_function [as 别名]
    ('features', features),
    ('Logistic', LogisticRegression(C=0.00077426, class_weight='balanced'))
])

model2.fit(fannie_train, status_train)
status_pred2 = model2.predict(fannie_test)
# print('Best C is: ', model2.named_steps['Logistic'].C_)
print('Coefficients: ', model2.named_steps['Logistic'].coef_)

print(classification_report(status_test, status_pred2))
print(pd.DataFrame(confusion_matrix(status_test, status_pred2), index=['Actual Healthy',
                                                                       'Actual Default'],
                   columns=['Pred. Healthy', 'Pred. Default']))
print('Area under the curve is', roc_auc_score(status_test, status_pred2))
prec, rec, thres1 = precision_recall_curve(status_test, status_pred2)
fpr, tpr, thres2 = roc_curve(status_test, model2.decision_function(fannie_test))
with open('log_prec_rec.dill', 'wb') as f:
    dill.dump((prec, rec, thres1), f)

with open('log_fpr_tpr.dill', 'wb') as f:
    dill.dump((fpr, tpr, thres2), f)

with open('log_model.dill', 'wb') as f:
    dill.dump(model2, f)

print('finishing dumping Logistic regression results to file!')

# # Support Vector Machine
# features = FeatureUnion([
#     ('Loan_Amount', ExtractNormalized('STATE', 'ORIG_AMT')),
#     #('Interest_Rate', ExtractNormalized('STATE','ORIG_RT')),

开发者ID:DigitalPig，项目名称:SmartUnderwriter，代码行数:33，代码来源:learning.py

注：本文中的sklearn.pipeline.Pipeline.decision_function方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。