

Python SGDClassifier.predict_proba Method Code Examples

This article compiles typical usage examples of the sklearn.linear_model.SGDClassifier.predict_proba method in Python. If you have been wondering exactly how SGDClassifier.predict_proba is used, how to call it, or what real-world examples look like, the curated code samples below may help. You can also explore further usage examples of the containing class, sklearn.linear_model.SGDClassifier.


A total of 15 code examples of the SGDClassifier.predict_proba method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
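
Before diving into the examples, one note that applies to nearly all of them: SGDClassifier only exposes predict_proba when it is trained with a probabilistic loss, i.e. loss="log" (logistic regression; spelled "log_loss" in scikit-learn 1.1 and later) or loss="modified_huber". With the default hinge loss only decision_function is available, and calling predict_proba raises an error. The snippet below is a minimal sketch on made-up synthetic data, not taken from any of the projects cited on this page, showing the common pattern of keeping column 1 of the returned array as the probability of the positive class:

# Minimal sketch on synthetic data (not from the projects cited below).
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier

X, y = make_classification(n_samples=200, n_features=20, random_state=0)

# "modified_huber" supports predict_proba on all scikit-learn versions;
# most examples below use loss="log" instead ("log_loss" in scikit-learn >= 1.1).
clf = SGDClassifier(loss="modified_huber", penalty="l2", alpha=1e-4, random_state=0)
clf.fit(X, y)

proba = clf.predict_proba(X[:5])  # array of shape (5, n_classes)
positive = proba[:, 1]            # probability of class 1, as used throughout the examples
print(positive)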

Example 1: train_kaggle

# Required import: from sklearn.linear_model import SGDClassifier [as alias]
# Or: from sklearn.linear_model.SGDClassifier import predict_proba [as alias]
def train_kaggle(dataset, alg="rig", data="bow"):
    train_x, train_y, test_x = dataset
    print "shape for training data is", train_x.shape

    if alg == "svm":
        clf = SGDClassifier(verbose=1, n_jobs=2, n_iter=20)
    elif alg == "svm_sq":
        clf = SGDClassifier(verbose=1, n_jobs=2, n_iter=20, loss="squared_hinge")
    elif alg == "log":
        clf = LogisticRegression(verbose=1, n_jobs=2)
    elif alg == "per":
        clf = Perceptron(verbose=1, n_jobs=2, n_iter=25)
    elif alg == "rig":
        clf = RidgeClassifier()
    elif alg == "pa":
        clf = PassiveAggressiveClassifier(n_jobs=2, n_iter=25)
    else:
        raise NotImplementedError

    print "training with %s..." % alg

    clf.fit(train_x, train_y)
    # clf.fit(validate_x, validate_y)
    predicted = clf.predict(test_x)
    save_csv(predicted, fname=alg + "_" + data)

    if alg != "nb":
        return clf.decision_function(train_x), clf.decision_function(test_x)
    else:
        return clf.predict_proba(train_x), clf.predict_proba(test_x)
Author: csong27, Project: NgramNeuralNetworks, Lines of code: 32, Source file: train_base.py

Example 2: SGDC_SVM_Classifier

# Required import: from sklearn.linear_model import SGDClassifier [as alias]
# Or: from sklearn.linear_model.SGDClassifier import predict_proba [as alias]
def SGDC_SVM_Classifier(X_train, X_cv, X_test, Y_train,Y_cv,Y_test, Actual_DS):
    print("***************Starting SVM***************")
    t0 = time()
    clf = SGDClassifier(loss='log', penalty='l2',alpha=1e-5, n_iter=100)
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_cv)
    score = clf.score(X_cv,Y_cv)

    print("{0:.2f}%".format(100 * score))
    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv), label_enc.inverse_transform(preds),
                      rownames=['actual'], colnames=['preds'])
    Summary['pct'] = (Summary.divide(Summary.sum(axis=1), axis=1)).max(axis=1)*100
    print(Summary)

    #Check with log loss function
    epsilon = 1e-15
    #ll_output = log_loss_func(Y_cv, preds, epsilon)
    preds2 = clf.predict_proba(X_cv)
    ll_output2= log_loss(Y_cv, preds2, eps=1e-15, normalize=True)
    print(ll_output2)

    print("done in %0.3fs" % (time() - t0))

    preds3 = clf.predict_proba(X_test)
    #preds4 = clf.predict_proba((Actual_DS.ix[:,'feat_1':]))
    preds4 = clf.predict_proba(Actual_DS)
    print("***************Ending SVM***************")
    return pd.DataFrame(preds2),pd.DataFrame(preds3),pd.DataFrame(preds4)
Author: roshankr, Project: DS_Competition, Lines of code: 30, Source file: Otto_Classification.py

Example 3: crossvalidate

# Required import: from sklearn.linear_model import SGDClassifier [as alias]
# Or: from sklearn.linear_model.SGDClassifier import predict_proba [as alias]
def crossvalidate(feas, labels, param):
    labels = np.array(list(labels), dtype=int)
    accs = []
    for train_ids, valid_ids in StratifiedKFold(labels, 10):
        idf=train_idf([feas[i] for i in train_ids])
        X,vocab=extract_feas(feas, idf)
        #lda=train_lda(X, vocab, num_topics)
        #X=transform_lda(X, lda)
        labels_train = labels[train_ids].copy()
        weights = balance_weights(labels_train, param['bg_weight'])
        labels_train[labels_train == 0] = 1
        model=SGDClassifier(loss='log',
                            alpha=param['regu']/len(labels_train),
                            fit_intercept=True,
                            shuffle=True, n_iter=50)
        model.fit(X[train_ids], labels_train, sample_weight=weights)
        pp = model.predict_proba(X[valid_ids])
        pred_labels = np.argmax(pp, 1)
        pred_labels = model.classes_[pred_labels]
        #a=accuracy(labels[valid_ids], pred_labels, 1)
        # return all scores for "good" class
        assert model.classes_[1] == 2
        pred_scores = pp[:,1]
        a=avg_precision(labels[valid_ids], pred_scores)
        print '%.2f' % a,
        accs.append(a)
    return np.mean(accs)
Author: jseppanen, Project: textpile, Lines of code: 29, Source file: model.py

Example 4: main

# Required import: from sklearn.linear_model import SGDClassifier [as alias]
# Or: from sklearn.linear_model.SGDClassifier import predict_proba [as alias]
def main():
    """ Generates features and fits classifier. """
    
    featureIndexes = processData(os.path.join(dataFolder, "avito_train.tsv"), itemsLimit=300000)
    trainFeatures,trainTargets, trainItemIds=processData(os.path.join(dataFolder,"avito_train.tsv"), featureIndexes, itemsLimit=300000)
    testFeatures, testItemIds=processData(os.path.join(dataFolder,"avito_test.tsv"), featureIndexes)
    joblib.dump((trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds), os.path.join(dataFolder,"train_data.pkl"))
    trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(os.path.join(dataFolder,"train_data.pkl"))
    logging.info("Feature preparation done, fitting model...")
    clf = SGDClassifier(    loss="log", 
                            penalty="l2", 
                            alpha=1e-4, 
                            class_weight="auto")
    clf.fit(trainFeatures,trainTargets)

    logging.info("Predicting...")
    
    predicted_scores = clf.predict_proba(testFeatures).T[1]

    
    logging.info("Write results...")
    output_file = "avito_starter_solution.csv"
    logging.info("Writing submission to %s" % output_file)
    f = open(os.path.join(dataFolder,output_file), "w")
    f.write("id\n")
    
    for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse = True):
        f.write("%d\n" % (item_id))
    f.close()
    logging.info("Done.")
Author: albertoandreottiATgmail, Project: datasci_course_materials, Lines of code: 32, Source file: avito_ProhibitedContent_SampleCode.py

Example 5: SGD

# Required import: from sklearn.linear_model import SGDClassifier [as alias]
# Or: from sklearn.linear_model.SGDClassifier import predict_proba [as alias]
class SGD(object):
	def __init__(self):
		self.sgd = SGDClassifier(loss='modified_huber', alpha = .00001, penalty='elasticnet',shuffle=True, n_jobs=-1,random_state = 2014)
	def predict(self, X):
		return self.sgd.predict_proba(X)[:,1][:,np.newaxis]
	def fit(self, X, y):
		self.sgd.fit(X,y)
Author: MLevinson-OR, Project: GBx-testbed, Lines of code: 9, Source file: base_estimators.py

Example 6: classify

# Required import: from sklearn.linear_model import SGDClassifier [as alias]
# Or: from sklearn.linear_model.SGDClassifier import predict_proba [as alias]
def classify(dummy_train,dummy_test,feature_pkl,output_file):
    # Train classifier, iterating over subsets
    # Load Features
    print 'Loading features...'
    featureIndex, trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(feature_pkl)
    trainTargets = np.array(trainTargets)
    testItemIds = np.array(testItemIds)
    predicted_ids = []
    predicted_scores = []
    # SGD Logistic Regression per sample 
    clf = SGDClassifier(alpha=3.16227766017e-08, class_weight='auto', epsilon=0.1,
          eta0=0.0, fit_intercept=True, l1_ratio=0.15,
          learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
          penalty='elasticnet', power_t=0.5, random_state=None, shuffle=False,
          verbose=0, warm_start=False)
    for col in range(np.shape(dummy_train)[1]):
        # Get nonzero dummy indices as array
        idx_train = dummy_train[:,col].astype('bool').T.toarray()[0]
        print 'Training subset {} of {}...'.format(col,np.shape(dummy_train)[1])
        sub_train = normalize(trainFeatures.tocsr()[idx_train,:], norm='l2', axis=0)
        clf.fit(sub_train,trainTargets[idx_train])
       # Use probabilities instead of binary class prediction in order to generate a ranking    
        idx_test = dummy_test[:,col].astype('bool').T.toarray()[0]
        sub_test = normalize(testFeatures.tocsr()[idx_test,:], norm='l2', axis=0)
        predicted_scores += clf.predict_proba(sub_test).T[1].tolist()
        predicted_ids += testItemIds[idx_test].tolist()
    
    with open(os.path.splitext(feature_pkl)[0]+'_'+output_file,'w') as out_fid:
        out_fid.write("id\n")
        for pred_score, item_id in sorted(zip(predicted_scores, predicted_ids), reverse = True):
           # only writes item_id per output spec, but may want to look at predicted_scores
            out_fid.write("%d\n" % (item_id))
Author: phecy, Project: cdips-kaggle, Lines of code: 34, Source file: categories.py

Example 7: __init__

# Required import: from sklearn.linear_model import SGDClassifier [as alias]
# Or: from sklearn.linear_model.SGDClassifier import predict_proba [as alias]
class LightModel:
    def __init__(self,learningRate, numEpochs, ppenalty="l1", mustShuffle=True):
        #Init scikit models
        self.Classifier = SGDClassifier(penalty=ppenalty, loss='log', alpha=learningRate, n_iter = numEpochs, shuffle=mustShuffle)
    def train(self, gen,  v=False):
        i = 0
        for x, y in gen: #For each batch
            self.Classifier.partial_fit(x, y, [0,1])
            i += len(x)
            if v : print(str(datetime.now())[:-7] , "example:", i)
            
    def test(self, gen,  v=False):

        #init target and prediction arrays
        ytot = np.array([])
        ptot = np.array([])
        #Get prediction for each batch
        i = 0
        for x,y in gen:
            p = self.Classifier.predict_proba(x)
            p = p.T[1].T #Keep column corresponding to probability of class 1
            #Stack target and prediction for later analysis
            ytot = np.hstack((ytot, y)) 
            ptot = np.hstack((ptot, p))
            i += y.shape[0]
            if v : print(str(datetime.now())[:-7] , "example:", i)
        if v: print("Score:", self.score(ytot, ptot))
        
        return (ytot, ptot)
    def score(self, target, prediction):
        return llfun(target, prediction)
Author: EtienneDesticourt, Project: Kaggle-Avazu, Lines of code: 33, Source file: LightModel.py

Example 8: predict_sgd

# Required import: from sklearn.linear_model import SGDClassifier [as alias]
# Or: from sklearn.linear_model.SGDClassifier import predict_proba [as alias]
def predict_sgd(X_train, y_train, X_test, sample_weight):
    clf = SGDClassifier(loss='log', alpha=0.01, l1_ratio=0, n_jobs=2,
                        n_iter=50)
    clf.fit(X_train, y_train, sample_weight=sample_weight)

    predictions = clf.predict_proba(X_test)
    return predictions
Author: seylom, Project: kaggle-hashtags, Lines of code: 9, Source file: utils.py

Example 9: twoclass

# Required import: from sklearn.linear_model import SGDClassifier [as alias]
# Or: from sklearn.linear_model.SGDClassifier import predict_proba [as alias]
class twoclass(SGDClassifier):
    # THE HACK IS NOW GETTING EVEN MORE EVIL
    def __init__(self):
        self.clazz= SGDClassifier(loss='log')

    def fit(self,X,y, crossval=False):

        if crossval:
            print "layers crossvalscore:",sklearn.model_selection.cross_val_score(SGDClassifier(loss='log'),X, y).mean()

        self.clazz.fit(X,y)
        self.intercept_= self.clazz.intercept_
        self.classes_= self.clazz.classes_
        return self

    # EDeN can't annotate two classes if the estimator is not an SGDRegressor,
    # so this hack is made.
    '''
    Details: decision_function returns a one-dimensional array.
    EDeN only accepts these if the estimator is an instance of SGDRegressor,
    so I make a two-dimensional array from my one-dimensional array.
    If I hack something like this again in the future, maybe the intercept array
    needs to be provided as well (see the annotator code).
    '''

    # default guy:
    #def decision_function(self, vector):
    #    answer =  super(self.__class__,self).decision_function(vector)
    #    return np.vstack((answer, (answer-1))).T

    def decision_function(self,vector):
        return self.clazz.predict_proba(vector)

    '''
Author: fabriziocosta, Project: GraphLearn, Lines of code: 36, Source file: annotate.py

Example 10: classify

# Required import: from sklearn.linear_model import SGDClassifier [as alias]
# Or: from sklearn.linear_model.SGDClassifier import predict_proba [as alias]
 def classify(self):
     """Perform classification"""
     clf = SGDClassifier(loss='log', penalty='l1')
     pca = PCA(n_components = 10)
     self._ClassifyDriver__traindata = pca.fit_transform(self._ClassifyDriver__traindata)
     self._ClassifyDriver__testdata = pca.transform(self._ClassifyDriver__testdata)
     clf.fit(self._ClassifyDriver__traindata, self._ClassifyDriver__trainlabels)
     self._ClassifyDriver__y = clf.predict_proba(self._ClassifyDriver__testdata)[:,0]
Author: thekannman, Project: kaggle, Lines of code: 10, Source file: ClassifyDriver.py

Example 11: SGDModel

# Required import: from sklearn.linear_model import SGDClassifier [as alias]
# Or: from sklearn.linear_model.SGDClassifier import predict_proba [as alias]
class SGDModel(BaseModel):
    
    def __init__(self, cached_features=True):
        BaseModel.__init__(self, cached_features)
        self.model = SGDClassifier(loss="modified_huber", average=True, random_state=1)

    def _predict_internal(self, X_test):
        return self.model.predict_proba(X_test)[:, 1]
Author: sjuvekar, Project: Kaggle-Dato, Lines of code: 10, Source file: sgd_model.py

Example 12: test_threshold_SGD

# Required import: from sklearn.linear_model import SGDClassifier [as alias]
# Or: from sklearn.linear_model.SGDClassifier import predict_proba [as alias]
def test_threshold_SGD():
    train = pandas.read_csv('data/train_v2.csv')
#    test = pandas.read_csv('data/test_v2.csv')
    train_loss = train.loss
       
#    train = train[['f527', 'f528', 'f274', 'f271', 'f2', 'f727', 'f337', 'f431', 'f757']]
#    test = test[['f527', 'f528', 'f274', 'f271', 'f2', 'f727', 'f337', 'f431', 'f757']]
    
#    train = train[['f527', 'f528', 'f274', 'f271']]
#    test = test[['f527', 'f528', 'f274', 'f271']]
    
    imp = Imputer()
    imp.fit(train)
    
    train = imp.transform(train)
#    test = imp.transform(test)
    
    train=pre.StandardScaler().fit_transform(train)
#    test=pre.StandardScaler().fit_transform(test)
    
    
    train_loss_array = train_loss.apply(lambda x: 1 if x>0 else 0).values
    
    clf = SGDClassifier(loss='log', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.1, n_jobs=6, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, rho=None, seed=None)
       
    
    clf.fit(train,train_loss_array)      
    train = clf.transform(train, threshold = "1.25*mean")
    print train.shape    
    
    kf = StratifiedKFold(train_loss.values, n_folds=10, indices=False)    

    threshold  = 0.999999999164       
    mean_mae = 0.
    for train_i, test_i in kf:
#        print len(train_i)
        X_train_split, X_test_split, y_train_split, y_test_split = train[train_i], train[test_i], train_loss_array[train_i], train_loss_array[test_i]
        y_test_split_initial = train_loss[test_i].values
        
        clf = SGDClassifier(loss='log', penalty='l2', alpha=1e-4, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.1, n_jobs=6, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, rho=None, seed=None)
    
        clf.fit(X_train_split,y_train_split)      
        probas_ = clf.predict_proba(X_test_split)
        prediction_proba = probas_[:,1]
        
        predictionIndexes0 = np.where(prediction_proba <= threshold)[0]
        predictionIndexes1 = np.where(prediction_proba > threshold)[0]
        
        prediction = np.asarray([0.] * y_test_split_initial.shape[0])
        prediction[predictionIndexes1] = 10.
        prediction[predictionIndexes0] = 0.
        mae = mean_absolute_error(y_test_split_initial, prediction)
    
        mean_mae += mae
        
        print "Split MAE: " + str(mae)
    mean_mae = mean_mae / 10.
    print "Average MAE: " + str(mean_mae)
Author: EmanuelaBoros, Project: kaggle_loan_default_prediction, Lines of code: 60, Source file: find_best_threshold.py

Example 13: __init__

# Required import: from sklearn.linear_model import SGDClassifier [as alias]
# Or: from sklearn.linear_model.SGDClassifier import predict_proba [as alias]
class Model:
    def __init__(self,numFeatures, learningRate, numEpochs, ppenalty="l1", mustShuffle=True):
        #Init scikit models
        self.FH = FeatureHasher(n_features=numFeatures, input_type='string')
        self.Classifier = SGDClassifier(penalty=ppenalty, loss='log', alpha=learningRate, n_iter = numEpochs, shuffle=mustShuffle)
    def train(self, gen,  v=False):

        i = 0
        for x, y in gen: #For each batch
            xHash = self.FH.transform(x) #hash trick
            y = np.array(y)            
##            for epoch in range(numEpochs):
            self.Classifier.partial_fit(xHash, y, [0,1])
            i += len(x)
            if v : print(str(datetime.now())[:-7] , "example:", i)
            
    def test(self, gen,  v=False):

        #init target and prediction arrays
        ytot = np.array([])
        ptot = np.array([])
        #Get prediction for each batch
        i = 0
        for x,y in gen:
            xHash = self.FH.transform(x) #hash trick
            p = self.Classifier.predict_proba(xHash)
            p = p.T[1].T #Keep column corresponding to probability of class 1
            #Stack target and prediction for later analysis
            ytot = np.hstack((ytot, y)) 
            ptot = np.hstack((ptot, p))
            i += y.shape[0]
            if v : print(str(datetime.now())[:-7] , "example:", i)
        if v: print("Score:", self.score(ytot, ptot))
        
        return (ytot, ptot)
    def predictBatch(self, batch):
        hashedBatch = self.FH.transform(batch)
        prediction = self.Classifier.predict_proba(hashedBatch)
        return prediction
    def generatePrediction(self, generator):
        for xBatch, idBatch in generator:
            prediction = self.predictBatch(xBatch)
            yield prediction, idBatch
    def score(self, target, prediction):
        return llfun(target, prediction)
Author: EtienneDesticourt, Project: Kaggle-Avazu, Lines of code: 47, Source file: Model.py

Example 14: get_predications

# Required import: from sklearn.linear_model import SGDClassifier [as alias]
# Or: from sklearn.linear_model.SGDClassifier import predict_proba [as alias]
def get_predications(df, idf, train_set, test_set,target_values):
	"""
	get predication using liner regression model
	"""
	m_train = idf.transform(train_set)
	m_test=idf.transform(test_set)
	lm = SGDClassifier(penalty="l2",loss="log",fit_intercept=True, shuffle=True,n_iter=20, n_jobs=-1,alpha=0.000005)
	lm.fit(m_train, target_values)
	return lm.predict_proba(m_test)[:,1]
Author: XingyuGit, Project: ExcitementPrediction, Lines of code: 11, Source file: text_mining.py

Example 15: main

# Required import: from sklearn.linear_model import SGDClassifier [as alias]
# Or: from sklearn.linear_model.SGDClassifier import predict_proba [as alias]
def main():
    """ Generates features and fits classifier. """
    
    # The following 5 lines can be commented out if the features have already been created.
    # There is no need to process the data every single time.
    # Fine-tuning the learning algorithm is much faster without that extra step.
    
    # by reading the train dataset the feature index is created.
    # First calling of the processdata function
    # originally the items are limited to 300000
    featureIndexes = processData(os.path.join(dataFolder,"avito_train_small.tsv"), itemsLimit=5000) # Original itemsLimit=300000

#    # Trainfeature is created using the indexfeatures...
    # Second calling of the processdata function
    trainFeatures,trainTargets, trainItemIds=processData(os.path.join(dataFolder,"avito_train_small.tsv"), featureIndexes, itemsLimit=5000) # Original itemsLimit=300000
#
#    # Building the test dataset... just like the training...
    testFeatures, testItemIds=processData(os.path.join(dataFolder,"avito_test.tsv"), featureIndexes)
#
#    # Dumping data into file...
#    joblib.dump((trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds), os.path.join(dataFolder,"train_data.pkl"))
    joblib.dump((trainFeatures, trainTargets, trainItemIds), os.path.join(dataFolder,"train_data_small.pkl"))

#
#    # loading data pack...
    trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(os.path.join(dataFolder,"train_data.pkl"))
#
    logging.info("Feature preparation done, fitting model...")
#
#    # Stochastic gradient model
    clf = SGDClassifier(    loss="log", 
                            penalty="l2", 
                            alpha=1e-4, 
                            class_weight="auto")
    #
    clf.fit(trainFeatures,trainTargets)
#
    logging.info("Predicting...")
#
#    #     
    predicted_scores = clf.predict_proba(testFeatures).T[1]
#
#    
    logging.info("Write results...")
#    #    
    output_file = "avito_starter_solution.csv"
    logging.info("Writing submission to %s" % output_file)
    f = open(os.path.join(dataFolder,output_file), "w")
    f.write("id\n")
    
    for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse = True):
        f.write("%d\n" % (item_id))
    f.close()
    logging.info("Done.")
Author: eyedvabny, Project: CDIPS-WS-2014, Lines of code: 56, Source file: Sample_code.py


Note: The sklearn.linear_model.SGDClassifier.predict_proba examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors, and distribution and use should follow each project's license. Do not reproduce without permission.