本文整理汇总了Python中sklearn.linear_model.SGDClassifier.predict_proba方法的典型用法代码示例。如果您正苦于以下问题:Python SGDClassifier.predict_proba方法的具体用法?Python SGDClassifier.predict_proba怎么用?Python SGDClassifier.predict_proba使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.linear_model.SGDClassifier
的用法示例。
在下文中一共展示了SGDClassifier.predict_proba方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: train_kaggle
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_proba [as 别名]
def train_kaggle(dataset, alg="rig", data="bow"):
train_x, train_y, test_x = dataset
print "shape for training data is", train_x.shape
if alg == "svm":
clf = SGDClassifier(verbose=1, n_jobs=2, n_iter=20)
elif alg == "svm_sq":
clf = SGDClassifier(verbose=1, n_jobs=2, n_iter=20, loss="squared_hinge")
elif alg == "log":
clf = LogisticRegression(verbose=1, n_jobs=2)
elif alg == "per":
clf = Perceptron(verbose=1, n_jobs=2, n_iter=25)
elif alg == "rig":
clf = RidgeClassifier()
elif alg == "pa":
clf = PassiveAggressiveClassifier(n_jobs=2, n_iter=25)
else:
raise NotImplementedError
print "training with %s..." % alg
clf.fit(train_x, train_y)
# clf.fit(validate_x, validate_y)
predicted = clf.predict(test_x)
save_csv(predicted, fname=alg + "_" + data)
if alg != "nb":
return clf.decision_function(train_x), clf.decision_function(test_x)
else:
return clf.predict_proba(train_x), clf.predict_proba(test_x)
示例2: SGDC_SVM_Classifier
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_proba [as 别名]
def SGDC_SVM_Classifier(X_train, X_cv, X_test, Y_train, Y_cv, Y_test, Actual_DS):
    """Fit an SGD logistic model (named "SVM" by the project), report CV
    accuracy, an actual-vs-predicted crosstab and the CV log-loss, and return
    probability DataFrames for the CV, test and actual datasets.

    Relies on a module-level ``label_enc`` encoder for readable labels.
    """
    print("***************Starting SVM***************")
    started = time()
    clf = SGDClassifier(loss='log', penalty='l2', alpha=1e-5, n_iter=100)
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_cv)
    score = clf.score(X_cv, Y_cv)
    print("{0:.2f}%".format(100 * score))
    # Confusion summary on decoded (human-readable) labels.
    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv),
                          label_enc.inverse_transform(preds),
                          rownames=['actual'], colnames=['preds'])
    Summary['pct'] = (Summary.divide(Summary.sum(axis=1), axis=1)).max(axis=1) * 100
    print(Summary)
    # epsilon was used by an older hand-rolled log_loss_func; kept for parity.
    epsilon = 1e-15
    preds2 = clf.predict_proba(X_cv)
    ll_output2 = log_loss(Y_cv, preds2, eps=1e-15, normalize=True)
    print(ll_output2)
    print("done in %0.3fs" % (time() - started))
    preds3 = clf.predict_proba(X_test)
    preds4 = clf.predict_proba(Actual_DS)
    print("***************Ending SVM***************")
    return pd.DataFrame(preds2), pd.DataFrame(preds3), pd.DataFrame(preds4)
示例3: crossvalidate
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_proba [as 别名]
def crossvalidate(feas, labels, param):
labels = np.array(list(labels), dtype=int)
accs = []
for train_ids, valid_ids in StratifiedKFold(labels, 10):
idf=train_idf([feas[i] for i in train_ids])
X,vocab=extract_feas(feas, idf)
#lda=train_lda(X, vocab, num_topics)
#X=transform_lda(X, lda)
labels_train = labels[train_ids].copy()
weights = balance_weights(labels_train, param['bg_weight'])
labels_train[labels_train == 0] = 1
model=SGDClassifier(loss='log',
alpha=param['regu']/len(labels_train),
fit_intercept=True,
shuffle=True, n_iter=50)
model.fit(X[train_ids], labels_train, sample_weight=weights)
pp = model.predict_proba(X[valid_ids])
pred_labels = np.argmax(pp, 1)
pred_labels = model.classes_[pred_labels]
#a=accuracy(labels[valid_ids], pred_labels, 1)
# return all scores for "good" class
assert model.classes_[1] == 2
pred_scores = pp[:,1]
a=avg_precision(labels[valid_ids], pred_scores)
print '%.2f' % a,
accs.append(a)
return np.mean(accs)
示例4: main
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_proba [as 别名]
def main():
    """Builds Avito features, fits an SGD logistic model and writes a submission."""
    featureIndexes = processData(os.path.join(dataFolder, "avito_train.tsv"), itemsLimit=300000)
    trainFeatures, trainTargets, trainItemIds = processData(
        os.path.join(dataFolder, "avito_train.tsv"), featureIndexes, itemsLimit=300000)
    testFeatures, testItemIds = processData(
        os.path.join(dataFolder, "avito_test.tsv"), featureIndexes)
    # Cache the prepared matrices, then reload them from the cache.
    joblib.dump((trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds),
                os.path.join(dataFolder, "train_data.pkl"))
    trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(
        os.path.join(dataFolder, "train_data.pkl"))
    logging.info("Feature preparation done, fitting model...")
    clf = SGDClassifier(loss="log", penalty="l2", alpha=1e-4, class_weight="auto")
    clf.fit(trainFeatures, trainTargets)
    logging.info("Predicting...")
    # Column 1 of predict_proba is the positive-class probability.
    predicted_scores = clf.predict_proba(testFeatures).T[1]
    logging.info("Write results...")
    output_file = "avito_starter_solution.csv"
    logging.info("Writing submission to %s" % output_file)
    with open(os.path.join(dataFolder, output_file), "w") as f:
        f.write("id\n")
        # Ids are written in descending score order; only the id is emitted.
        for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse=True):
            f.write("%d\n" % (item_id))
    logging.info("Done.")
开发者ID:albertoandreottiATgmail,项目名称:datasci_course_materials,代码行数:32,代码来源:avito_ProhibitedContent_SampleCode.py
示例5: SGD
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_proba [as 别名]
class SGD(object):
    """Thin wrapper exposing SGDClassifier probabilities via fit/predict."""

    def __init__(self):
        # modified_huber is one of the few SGD losses supporting predict_proba.
        self.sgd = SGDClassifier(loss='modified_huber', alpha=.00001,
                                 penalty='elasticnet', shuffle=True,
                                 n_jobs=-1, random_state=2014)

    def fit(self, X, y):
        self.sgd.fit(X, y)

    def predict(self, X):
        # Probability of class 1, reshaped to a column vector.
        positive = self.sgd.predict_proba(X)[:, 1]
        return positive[:, np.newaxis]
示例6: classify
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_proba [as 别名]
def classify(dummy_train,dummy_test,feature_pkl,output_file):
# Train classifier, iterating over subsets
# Load Features
print 'Loading features...'
featureIndex, trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(feature_pkl)
trainTargets = np.array(trainTargets)
testItemIds = np.array(testItemIds)
predicted_ids = []
predicted_scores = []
# SGD Logistic Regression per sample
clf = SGDClassifier(alpha=3.16227766017e-08, class_weight='auto', epsilon=0.1,
eta0=0.0, fit_intercept=True, l1_ratio=0.15,
learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
penalty='elasticnet', power_t=0.5, random_state=None, shuffle=False,
verbose=0, warm_start=False)
for col in range(np.shape(dummy_train)[1]):
# Get nonzero dummy indices as array
idx_train = dummy_train[:,col].astype('bool').T.toarray()[0]
print 'Training subset {} of {}...'.format(col,np.shape(dummy_train)[1])
sub_train = normalize(trainFeatures.tocsr()[idx_train,:], norm='l2', axis=0)
clf.fit(sub_train,trainTargets[idx_train])
# Use probabilities instead of binary class prediction in order to generate a ranking
idx_test = dummy_test[:,col].astype('bool').T.toarray()[0]
sub_test = normalize(testFeatures.tocsr()[idx_test,:], norm='l2', axis=0)
predicted_scores += clf.predict_proba(sub_test).T[1].tolist()
predicted_ids += testItemIds[idx_test].tolist()
with open(os.path.splitext(feature_pkl)[0]+'_'+output_file,'w') as out_fid:
out_fid.write("id\n")
for pred_score, item_id in sorted(zip(predicted_scores, predicted_ids), reverse = True):
# only writes item_id per output spec, but may want to look at predicted_scores
out_fid.write("%d\n" % (item_id))
示例7: __init__
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_proba [as 别名]
class LightModel:
    """Online logistic regression trained batch-wise via partial_fit."""

    def __init__(self, learningRate, numEpochs, ppenalty="l1", mustShuffle=True):
        # alpha doubles as the learning-rate knob for this project.
        self.Classifier = SGDClassifier(penalty=ppenalty, loss='log',
                                        alpha=learningRate, n_iter=numEpochs,
                                        shuffle=mustShuffle)

    def train(self, gen, v=False):
        """Consume (x, y) batches from *gen* and update the model incrementally."""
        seen = 0
        for x, y in gen:
            self.Classifier.partial_fit(x, y, [0, 1])
            seen += len(x)
            if v:
                print(str(datetime.now())[:-7], "example:", seen)

    def test(self, gen, v=False):
        """Score (x, y) batches; returns (targets, class-1 probabilities)."""
        ytot = np.array([])
        ptot = np.array([])
        seen = 0
        for x, y in gen:
            p = self.Classifier.predict_proba(x)
            p = p.T[1].T  # keep the probability-of-class-1 column
            ytot = np.hstack((ytot, y))
            ptot = np.hstack((ptot, p))
            seen += y.shape[0]
            if v:
                print(str(datetime.now())[:-7], "example:", seen)
        if v:
            print("Score:", self.score(ytot, ptot))
        return (ytot, ptot)

    def score(self, target, prediction):
        return llfun(target, prediction)
示例8: predict_sgd
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_proba [as 别名]
def predict_sgd(X_train, y_train, X_test, sample_weight):
    """Fit a sample-weighted SGD logistic model and return class probabilities
    for X_test (shape: n_samples x n_classes)."""
    model = SGDClassifier(loss='log', alpha=0.01, l1_ratio=0, n_jobs=2,
                          n_iter=50)
    model.fit(X_train, y_train, sample_weight=sample_weight)
    return model.predict_proba(X_test)
示例9: twoclass
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_proba [as 别名]
class twoclass(SGDClassifier):
# THE HACK IS NOW GETTING EVEN MORE EVIL
def __init__(self):
self.clazz= SGDClassifier(loss='log')
def fit(self,X,y, crossval=False):
if crossval:
print "layers crossvalscore:",sklearn.model_selection.cross_val_score(SGDClassifier(loss='log'),X, y).mean()
self.clazz.fit(X,y)
self.intercept_= self.clazz.intercept_
self.classes_= self.clazz.classes_
return self
# eden cant annotate two classes if the esti is not a sgdregressor
# -> this hack is made!
'''
details: decission function returns a one d array.
eden only accepts these if the estimater is instance of sgdregressor.
so i make a two d array from my 1 d array.
if i hack something like this in the future maybe the intercept array needs to be provided..
(see the annotator code)
'''
# default guy:
#def decision_function(self, vector):
# answer = super(self.__class__,self).decision_function(vector)
# return np.vstack((answer, (answer-1))).T
def decision_function(self,vector):
return self.clazz.predict_proba(vector)
'''
示例10: classify
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_proba [as 别名]
def classify(self):
    """Project features onto 10 PCA components, fit an L1 SGD logistic model,
    and store class probabilities for the test data on the driver.

    Reads/writes the name-mangled ClassifyDriver attributes directly, so this
    is expected to be bound to (or called with) a ClassifyDriver instance.
    """
    model = SGDClassifier(loss='log', penalty='l1')
    reducer = PCA(n_components=10)
    self._ClassifyDriver__traindata = reducer.fit_transform(self._ClassifyDriver__traindata)
    self._ClassifyDriver__testdata = reducer.transform(self._ClassifyDriver__testdata)
    model.fit(self._ClassifyDriver__traindata, self._ClassifyDriver__trainlabels)
    # NOTE(review): column 0 is the probability of the FIRST entry of
    # model.classes_ — confirm that is the intended class.
    self._ClassifyDriver__y = model.predict_proba(self._ClassifyDriver__testdata)[:, 0]
示例11: SGDModel
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_proba [as 别名]
class SGDModel(BaseModel):
    """BaseModel backed by a modified-huber SGD classifier.

    modified_huber is used because it supports predict_proba.
    """

    def __init__(self, cached_features=True):
        BaseModel.__init__(self, cached_features)
        self.model = SGDClassifier(loss="modified_huber", average=True,
                                   random_state=1)

    def _predict_internal(self, X_test):
        # Positive-class probability column.
        return self.model.predict_proba(X_test)[:, 1]
示例12: test_threshold_SGD
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_proba [as 别名]
# Threshold experiment on the Kaggle loan-default data: fit an L2 SGD logistic
# model, select features by model weights, then 10-fold CV a hard-thresholded
# 0/10 prediction and report mean absolute error.
# NOTE: despite the name, this is a standalone experiment, not a unit test.
def test_threshold_SGD():
train = pandas.read_csv('data/train_v2.csv')
# test = pandas.read_csv('data/test_v2.csv')
# `loss` column: 0 means no default, >0 is the loss severity.
train_loss = train.loss
# Earlier runs used these hand-picked feature subsets:
# train = train[['f527', 'f528', 'f274', 'f271', 'f2', 'f727', 'f337', 'f431', 'f757']]
# test = test[['f527', 'f528', 'f274', 'f271', 'f2', 'f727', 'f337', 'f431', 'f757']]
# train = train[['f527', 'f528', 'f274', 'f271']]
# test = test[['f527', 'f528', 'f274', 'f271']]
# Fill missing values (Imputer defaults to column means).
imp = Imputer()
imp.fit(train)
train = imp.transform(train)
# test = imp.transform(test)
# Standardize features to zero mean / unit variance.
train=pre.StandardScaler().fit_transform(train)
# test=pre.StandardScaler().fit_transform(test)
# Binarized target: 1 when any default loss occurred.
train_loss_array = train_loss.apply(lambda x: 1 if x>0 else 0).values
# NOTE(review): rho/seed are long-removed SGDClassifier arguments — this code
# targets an old scikit-learn release and will not run on modern versions.
clf = SGDClassifier(loss='log', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.1, n_jobs=6, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, rho=None, seed=None)
clf.fit(train,train_loss_array)
# Old-style model-based feature selection: keep features whose |weight|
# exceeds 1.25x the mean weight.
train = clf.transform(train, threshold = "1.25*mean")
print train.shape
kf = StratifiedKFold(train_loss.values, n_folds=10, indices=False)
# Probability cutoff separating predicted defaults from non-defaults.
threshold = 0.999999999164
mean_mae = 0.
for train_i, test_i in kf:
# print len(train_i)
X_train_split, X_test_split, y_train_split, y_test_split = train[train_i], train[test_i], train_loss_array[train_i], train_loss_array[test_i]
# Original (non-binarized) losses, used for MAE scoring.
y_test_split_initial = train_loss[test_i].values
clf = SGDClassifier(loss='log', penalty='l2', alpha=1e-4, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.1, n_jobs=6, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, rho=None, seed=None)
clf.fit(X_train_split,y_train_split)
probas_ = clf.predict_proba(X_test_split)
prediction_proba = probas_[:,1]
predictionIndexes0 = np.where(prediction_proba <= threshold)[0]
predictionIndexes1 = np.where(prediction_proba > threshold)[0]
# All-or-nothing prediction: 10 for predicted defaults, 0 otherwise.
prediction = np.asarray([0.] * y_test_split_initial.shape[0])
prediction[predictionIndexes1] = 10.
prediction[predictionIndexes0] = 0.
mae = mean_absolute_error(y_test_split_initial, prediction)
mean_mae += mae
print "Split MAE: " + str(mae)
# Average over the 10 folds.
mean_mae = mean_mae / 10.
print "Average MAE: " + str(mean_mae)
示例13: __init__
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_proba [as 别名]
class Model:
    """Hashing-trick features plus online logistic regression.

    Raw string features are hashed into a fixed-size space with FeatureHasher
    and fed batch-wise to an SGD logistic model via partial_fit.
    """

    def __init__(self, numFeatures, learningRate, numEpochs, ppenalty="l1", mustShuffle=True):
        self.FH = FeatureHasher(n_features=numFeatures, input_type='string')
        # alpha doubles as the learning-rate knob for this project.
        self.Classifier = SGDClassifier(penalty=ppenalty, loss='log',
                                        alpha=learningRate, n_iter=numEpochs,
                                        shuffle=mustShuffle)

    def train(self, gen, v=False):
        """Consume (x, y) batches from *gen* and update the model incrementally."""
        seen = 0
        for x, y in gen:
            hashed = self.FH.transform(x)  # hashing trick
            y = np.array(y)
            self.Classifier.partial_fit(hashed, y, [0, 1])
            seen += len(x)
            if v:
                print(str(datetime.now())[:-7], "example:", seen)

    def test(self, gen, v=False):
        """Score (x, y) batches; returns (targets, class-1 probabilities)."""
        ytot = np.array([])
        ptot = np.array([])
        seen = 0
        for x, y in gen:
            hashed = self.FH.transform(x)  # hashing trick
            p = self.Classifier.predict_proba(hashed)
            p = p.T[1].T  # keep the probability-of-class-1 column
            ytot = np.hstack((ytot, y))
            ptot = np.hstack((ptot, p))
            seen += y.shape[0]
            if v:
                print(str(datetime.now())[:-7], "example:", seen)
        if v:
            print("Score:", self.score(ytot, ptot))
        return (ytot, ptot)

    def predictBatch(self, batch):
        """Hash one raw batch and return its class-probability matrix."""
        return self.Classifier.predict_proba(self.FH.transform(batch))

    def generatePrediction(self, generator):
        """Yield (probabilities, ids) pairs for each (batch, ids) from *generator*."""
        for xBatch, idBatch in generator:
            yield self.predictBatch(xBatch), idBatch

    def score(self, target, prediction):
        return llfun(target, prediction)
示例14: get_predications
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_proba [as 别名]
def get_predications(df, idf, train_set, test_set, target_values):
    """Fit an SGD logistic-regression model and predict test probabilities.

    Transforms both sets with the already-fitted *idf* vectorizer, fits on the
    training matrix against *target_values*, and returns the positive-class
    probability for each test row. (*df* is unused here.)
    """
    X_train = idf.transform(train_set)
    X_test = idf.transform(test_set)
    model = SGDClassifier(penalty="l2", loss="log", fit_intercept=True,
                          shuffle=True, n_iter=20, n_jobs=-1, alpha=0.000005)
    model.fit(X_train, target_values)
    return model.predict_proba(X_test)[:, 1]
示例15: main
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_proba [as 别名]
def main():
""" Generates features and fits classifier on a small Avito sample, then writes a ranked submission. """
# The following processData calls can be commented out once the features have
# been created and pickled — fine-tuning the learner is much faster without
# re-processing the raw data each run.
# Reading the train dataset builds the feature index (first processData call).
# Originally the items were limited to 300000.
featureIndexes = processData(os.path.join(dataFolder,"avito_train_small.tsv"), itemsLimit=5000) # Original itemsLimit=300000
# Train features are created using the feature index (second processData call).
trainFeatures,trainTargets, trainItemIds=processData(os.path.join(dataFolder,"avito_train_small.tsv"), featureIndexes, itemsLimit=5000) # Original itemsLimit=300000
#
# Build the test dataset the same way as the training set.
testFeatures, testItemIds=processData(os.path.join(dataFolder,"avito_test.tsv"), featureIndexes)
#
# Cache the prepared data.
# joblib.dump((trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds), os.path.join(dataFolder,"train_data.pkl"))
joblib.dump((trainFeatures, trainTargets, trainItemIds), os.path.join(dataFolder,"train_data_small.pkl"))
#
# NOTE(review): the dump above writes a 3-tuple to "train_data_small.pkl" but
# the load below reads a 5-tuple from "train_data.pkl" — these do not
# round-trip; the load relies on a previously created full pickle.
trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(os.path.join(dataFolder,"train_data.pkl"))
#
logging.info("Feature preparation done, fitting model...")
#
# Stochastic gradient model (log loss => predict_proba is available).
clf = SGDClassifier( loss="log",
penalty="l2",
alpha=1e-4,
class_weight="auto")
#
clf.fit(trainFeatures,trainTargets)
#
logging.info("Predicting...")
#
# Column 1 of predict_proba is the positive-class probability.
predicted_scores = clf.predict_proba(testFeatures).T[1]
#
#
logging.info("Write results...")
#
output_file = "avito_starter_solution.csv"
logging.info("Writing submission to %s" % output_file)
f = open(os.path.join(dataFolder,output_file), "w")
f.write("id\n")
# Ids are written in descending score order; only the id itself is emitted.
for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse = True):
f.write("%d\n" % (item_id))
f.close()
logging.info("Done.")