本文整理汇总了Python中sklearn.linear_model.SGDClassifier.fit方法的典型用法代码示例。如果您正苦于以下问题:Python SGDClassifier.fit方法的具体用法?Python SGDClassifier.fit怎么用?Python SGDClassifier.fit使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.linear_model.SGDClassifier
的用法示例。
在下文中一共展示了SGDClassifier.fit方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: SGDC_SVM_Classifier
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import fit [as 别名]
def SGDC_SVM_Classifier(X_train, X_cv, X_test, Y_train, Y_cv, Y_test, Actual_DS):
    """Train an SGD linear model and report CV accuracy, confusion summary and log loss.

    NOTE(review): despite the "SVM" name, loss='log' makes this logistic
    regression, which is why predict_proba is available below.

    Returns a tuple of DataFrames with class probabilities for the CV set,
    the test set, and the actual (submission) dataset.
    """
    print("***************Starting SVM***************")
    t0 = time()
    clf = SGDClassifier(loss='log', penalty='l2', alpha=1e-5, n_iter=100)
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_cv)
    score = clf.score(X_cv, Y_cv)
    print("{0:.2f}%".format(100 * score))
    # Cross-tabulate actual vs. predicted labels (decoded back to original
    # label names) and add the per-row dominant-class percentage.
    # NOTE(review): label_enc is a module-level encoder — assumed fitted by the caller.
    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv), label_enc.inverse_transform(preds),
                          rownames=['actual'], colnames=['preds'])
    Summary['pct'] = (Summary.divide(Summary.sum(axis=1), axis=1)).max(axis=1) * 100
    print(Summary)
    # Evaluate with multi-class log loss on the predicted probabilities.
    # (Removed an unused `epsilon` local and a dead commented-out call to a
    # hand-rolled log-loss helper — sklearn's log_loss clips internally.)
    preds2 = clf.predict_proba(X_cv)
    ll_output2 = log_loss(Y_cv, preds2, eps=1e-15, normalize=True)
    print(ll_output2)
    print("done in %0.3fs" % (time() - t0))
    preds3 = clf.predict_proba(X_test)
    preds4 = clf.predict_proba(Actual_DS)
    print("***************Ending SVM***************")
    return pd.DataFrame(preds2), pd.DataFrame(preds3), pd.DataFrame(preds4)
示例2: main
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import fit [as 别名]
def main():
    """Generate features, fit an SGD logistic model, and write the submission file.

    Fix: the output file was opened and closed manually, so an exception during
    writing would leak the handle; it is now managed by a with-statement.
    """
    featureIndexes = processData(os.path.join(dataFolder, "avito_train.tsv"), itemsLimit=300000)
    trainFeatures, trainTargets, trainItemIds = processData(os.path.join(dataFolder, "avito_train.tsv"), featureIndexes, itemsLimit=300000)
    testFeatures, testItemIds = processData(os.path.join(dataFolder, "avito_test.tsv"), featureIndexes)
    # Cache the prepared features; the immediate reload mirrors the original
    # flow and verifies the pickle round-trips.
    joblib.dump((trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds), os.path.join(dataFolder, "train_data.pkl"))
    trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(os.path.join(dataFolder, "train_data.pkl"))
    logging.info("Feature preparation done, fitting model...")
    clf = SGDClassifier(loss="log",
                        penalty="l2",
                        alpha=1e-4,
                        class_weight="auto")
    clf.fit(trainFeatures, trainTargets)
    logging.info("Predicting...")
    # Probability of the positive class for every test item.
    predicted_scores = clf.predict_proba(testFeatures).T[1]
    logging.info("Write results...")
    output_file = "avito_starter_solution.csv"
    logging.info("Writing submission to %s" % output_file)
    # Items are ranked by descending score; only the id is written per the
    # submission spec.
    with open(os.path.join(dataFolder, output_file), "w") as f:
        f.write("id\n")
        for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse=True):
            f.write("%d\n" % (item_id))
    logging.info("Done.")
开发者ID:albertoandreottiATgmail,项目名称:datasci_course_materials,代码行数:32,代码来源:avito_ProhibitedContent_SampleCode.py
示例3: plot_sgd_separator
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import fit [as 别名]
def plot_sgd_separator():
    """Fit a hinge-loss SGD classifier on two separable blobs and plot the
    decision boundary with its +/-1 margin lines.

    Fix: the original evaluated clf.decision_function([x1, x2]) one grid point
    at a time — a 1-D sample that modern scikit-learn rejects (it requires a
    2-D (n_samples, n_features) array). The grid is now scored in a single
    vectorized call.
    """
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2,
                      random_state=0, cluster_std=0.60)
    # fit the model
    clf = SGDClassifier(loss="hinge", alpha=0.01,
                        n_iter=200, fit_intercept=True)
    clf.fit(X, Y)
    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)
    X1, X2 = np.meshgrid(xx, yy)
    # Score all grid points at once, then reshape back to the grid.
    grid = np.c_[X1.ravel(), X2.ravel()]
    Z = clf.decision_function(grid).reshape(X1.shape)
    # Dashed lines at the margins, solid line on the decision boundary.
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'
    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
    ax.axis('tight')
示例4: classify_reviews
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import fit [as 别名]
def classify_reviews():
    """Compare MultinomialNB and hinge-loss SGD on TF-IDF review features, with
    and without stemming, printing error counts / accuracies.

    Fixes: the original mixed Python 2 print statements with Python 3 print()
    calls (so it ran under neither interpreter) — normalized to Python 3;
    print("num_wrong: %d", num_wrong) printed a tuple instead of formatting —
    now uses %-formatting; removed the unused `tot` counter and stray
    semicolons.
    """
    import featurizer
    import gen_training_data
    import numpy as np
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier
    data = gen_training_data.gen_data()
    stemmed_data = featurizer.stem(data)
    tfidf = featurizer.tfidf(data)
    # Baseline: multinomial naive Bayes on raw TF-IDF features.
    clf = MultinomialNB().fit(tfidf['train_tfidf'], data['training_labels'])
    predicted = clf.predict(tfidf['test_tfidf'])
    num_wrong = 0
    for expected, guessed in zip(data['testing_labels'], predicted):
        if expected - guessed != 0:
            num_wrong += 1
    print("num_wrong: %d" % num_wrong)
    # Linear SVM trained with SGD on the same TF-IDF features.
    sgd_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
    sgd_clf.fit(tfidf['train_tfidf'], data['training_labels'])
    sgd_pred = sgd_clf.predict(tfidf['test_tfidf'])
    print(np.mean(sgd_pred == data['testing_labels']))
    # Repeat the SGD experiment on stemmed text.
    stem_tfidf = featurizer.tfidf(stemmed_data)
    sgd_clf.fit(stem_tfidf['train_tfidf'], data['training_labels'])
    sgd_stem_prd = sgd_clf.predict(stem_tfidf['test_tfidf'])
    print(np.mean(sgd_stem_prd == data['testing_labels']))
示例5: classify
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import fit [as 别名]
def classify(dummy_train, dummy_test, feature_pkl, output_file):
    """Train one SGD logistic-regression model per dummy column and write a ranked id list.

    dummy_train/dummy_test: sparse 0/1 indicator matrices whose columns define
    row subsets; a fresh fit of `clf` is done per column subset.
    feature_pkl: joblib pickle holding (featureIndex, trainFeatures,
    trainTargets, trainItemIds, testFeatures, testItemIds).
    output_file: suffix for the submission file written next to feature_pkl.
    """
    # Train classifier, iterating over subsets
    # Load Features
    print 'Loading features...'
    featureIndex, trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(feature_pkl)
    trainTargets = np.array(trainTargets)
    testItemIds = np.array(testItemIds)
    predicted_ids = []
    predicted_scores = []
    # SGD Logistic Regression per sample
    clf = SGDClassifier(alpha=3.16227766017e-08, class_weight='auto', epsilon=0.1,
                        eta0=0.0, fit_intercept=True, l1_ratio=0.15,
                        learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
                        penalty='elasticnet', power_t=0.5, random_state=None, shuffle=False,
                        verbose=0, warm_start=False)
    for col in range(np.shape(dummy_train)[1]):
        # Get nonzero dummy indices as array
        idx_train = dummy_train[:, col].astype('bool').T.toarray()[0]
        print 'Training subset {} of {}...'.format(col, np.shape(dummy_train)[1])
        # Column-wise L2 normalization over just this subset's rows before fitting.
        sub_train = normalize(trainFeatures.tocsr()[idx_train, :], norm='l2', axis=0)
        clf.fit(sub_train, trainTargets[idx_train])
        # Use probabilities instead of binary class prediction in order to generate a ranking
        idx_test = dummy_test[:, col].astype('bool').T.toarray()[0]
        sub_test = normalize(testFeatures.tocsr()[idx_test, :], norm='l2', axis=0)
        # Accumulate positive-class probabilities and matching item ids across subsets.
        predicted_scores += clf.predict_proba(sub_test).T[1].tolist()
        predicted_ids += testItemIds[idx_test].tolist()
    with open(os.path.splitext(feature_pkl)[0] + '_' + output_file, 'w') as out_fid:
        out_fid.write("id\n")
        for pred_score, item_id in sorted(zip(predicted_scores, predicted_ids), reverse=True):
            # only writes item_id per output spec, but may want to look at predicted_scores
            out_fid.write("%d\n" % (item_id))
示例6: twoclass
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import fit [as 别名]
class twoclass(SGDClassifier):
    # Wrapper that SUBCLASSES SGDClassifier purely so isinstance checks pass,
    # while delegating all real work to an internal SGDClassifier instance.
    # THE HACK IS NOW GETTING EVEN MORE EVIL
    def __init__(self):
        # NOTE: deliberately does not call the parent __init__; only the
        # wrapped estimator is ever configured or fitted.
        self.clazz = SGDClassifier(loss='log')
    def fit(self, X, y, crossval=False):
        # Optionally report a cross-validated score before fitting on all data.
        if crossval:
            print "layers crossvalscore:", sklearn.model_selection.cross_val_score(SGDClassifier(loss='log'), X, y).mean()
        self.clazz.fit(X, y)
        # Mirror fitted attributes so this object looks like a fitted
        # SGDClassifier to downstream code.
        self.intercept_ = self.clazz.intercept_
        self.classes_ = self.clazz.classes_
        return self
    # eden cant annotate two classes if the esti is not a sgdregressor
    # -> this hack is made!
    '''
    details: decission function returns a one d array.
    eden only accepts these if the estimater is instance of sgdregressor.
    so i make a two d array from my 1 d array.
    if i hack something like this in the future maybe the intercept array needs to be provided..
    (see the annotator code)
    '''
    # default guy:
    #def decision_function(self, vector):
    #    answer = super(self.__class__,self).decision_function(vector)
    #    return np.vstack((answer, (answer-1))).T
    def decision_function(self, vector):
        # Returns class probabilities (2-D) in place of the usual 1-D margin —
        # see the hack note above.
        return self.clazz.predict_proba(vector)
'''
示例7: buildModel
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import fit [as 别名]
def buildModel(size):
    """Train a Word2Vec + SGD (L1 logistic) sentiment model on `size` positive
    and `size` negative tweets, save the word vectors, and record test accuracy.

    Side effects: writes the gensim model to "imdb_w2v" and the accuracy plus
    total sample count to "Accuracy.txt".
    NOTE(review): uses the old gensim API (Word2Vec(size=...), train without
    epochs) and Python 2 'rb' csv reading — assumed to run on those versions.
    """
    with open('Sentiment Analysis Dataset.csv', 'rb') as csvfile:
        pos_tweets = []
        neg_tweets = []
        spamreader = csv.reader(csvfile, delimiter=',')
        for row in spamreader:
            # row[1] is the label ('1' = positive), row[3] the raw tweet text;
            # stop collecting a class once it exceeds `size` items.
            if row[1] == '1':
                if not (len(pos_tweets) > size):
                    pos_tweets.append(_cleanTweet(row[3]))
            else:
                if not (len(neg_tweets) > size):
                    neg_tweets.append(_cleanTweet(row[3]))
    # Labels: 1 for positive, 0 for negative; 80/20 train/test split.
    y = np.concatenate((np.ones(len(pos_tweets[0:size])), np.zeros(len(neg_tweets[0:size]))))
    x_train, x_test, y_train, y_test = train_test_split(np.concatenate((pos_tweets[0:size], neg_tweets[0:size])), y, test_size=0.2)
    x_train = _cleanText(x_train)
    x_test = _cleanText(x_test)
    n_dim = 100
    #Initialize model and build vocab
    imdb_w2v = Word2Vec(size=n_dim, min_count=10)
    imdb_w2v.build_vocab(x_train)
    imdb_w2v.train(x_train)
    # Average word vectors per tweet, then standardize the features.
    train_vecs = np.concatenate([buildWordVector(z, n_dim, imdb_w2v) for z in x_train])
    train_vecs = scale(train_vecs)
    #Train word2vec on test tweets
    imdb_w2v.train(x_test)
    #Build test tweet vectors then scale
    test_vecs = np.concatenate([buildWordVector(z, n_dim, imdb_w2v) for z in x_test])
    test_vecs = scale(test_vecs)
    # L1-penalized logistic regression trained with SGD.
    lr = SGDClassifier(loss='log', penalty='l1')
    lr.fit(train_vecs, y_train)
    imdb_w2v.save("imdb_w2v")
    # Record held-out accuracy alongside the total number of training tweets.
    f = open("Accuracy.txt", "w")
    f.write(str(lr.score(test_vecs, y_test)) + " " + str(size*2))
    f.close()
示例8: predict_sgd
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import fit [as 别名]
def predict_sgd(X_train, y_train, X_test, sample_weight):
    """Fit a sample-weighted SGD logistic-regression model on the training data
    and return class probabilities for X_test."""
    model = SGDClassifier(loss='log', alpha=0.01, l1_ratio=0, n_jobs=2,
                          n_iter=50)
    model.fit(X_train, y_train, sample_weight=sample_weight)
    return model.predict_proba(X_test)
示例9: sgd_classifier
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import fit [as 别名]
def sgd_classifier(V_train, y_train, V_val, y_val, V_test, y_test):
t0 = time.time()
print 'Building Random Forest model'
clf = SGDClassifier(n_iter = 50)
#clf = grid_search.GridSearchCV(svm_clf, parameters)
clf.fit(V_train, y_train)
#print clf.best_params_
t1 = time.time()
print 'Building Random Forest model ... Done', str(int((t1 - t0)*100)/100.)
print ''
p_val =clf.predict(V_val)
print 'Training accuracy on validation set', accuracy_score(y_val, p_val)
p_test = clf.predict(V_test)
print 'Accuracy on testing set'
print classification_report(y_test, p_test)
示例10: train_and_predict_m3
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import fit [as 别名]
def train_and_predict_m3 (train, test, labels) :
    """Model M3: porter-stemmed text -> word TF-IDF (1-6 grams, stop words
    removed) -> Huber-loss SGD classifier; returns predictions for `test`.

    Relies on module globals: stemmerEnableM3, ML_STOP_WORDS, randomState,
    gridSearch, and perform_grid_search.
    """
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM3, stemmer_type = 'porter')
    """
    # Beautiful soup cleanup and stemming
    stemmer = PorterStemmer()
    trainData = modified_cleanup(train, stemmer, is_train = True)
    testData = modified_cleanup(test, stemmer, is_train = False)
    """
    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 3, max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 6), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)
    ## Create the classifier
    clf = SGDClassifier(random_state = randomState, n_jobs = 1, penalty = 'l2', loss = 'huber', n_iter = 50, class_weight = 'auto', learning_rate = 'optimal', epsilon = 1)
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'n_iter' : [30, 50, 80, 100, 200], 'loss': ['huber'], 'epsilon' : [0.3, 1], 'alpha' : [0.0001, 0.0003, 0.001] }
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
示例11: train_kaggle
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import fit [as 别名]
def train_kaggle(dataset, alg="rig", data="bow"):
    """Train the linear classifier selected by `alg` on `dataset`, save test
    predictions via save_csv, and return (train, test) decision scores.

    dataset: (train_x, train_y, test_x) tuple.
    alg: one of "svm", "svm_sq", "log", "per", "rig", "pa"; anything else raises
    NotImplementedError.
    data: tag used only in the saved csv filename.
    """
    train_x, train_y, test_x = dataset
    print "shape for training data is", train_x.shape
    # Dispatch on the requested algorithm.
    if alg == "svm":
        clf = SGDClassifier(verbose=1, n_jobs=2, n_iter=20)
    elif alg == "svm_sq":
        clf = SGDClassifier(verbose=1, n_jobs=2, n_iter=20, loss="squared_hinge")
    elif alg == "log":
        clf = LogisticRegression(verbose=1, n_jobs=2)
    elif alg == "per":
        clf = Perceptron(verbose=1, n_jobs=2, n_iter=25)
    elif alg == "rig":
        clf = RidgeClassifier()
    elif alg == "pa":
        clf = PassiveAggressiveClassifier(n_jobs=2, n_iter=25)
    else:
        raise NotImplementedError
    print "training with %s..." % alg
    clf.fit(train_x, train_y)
    # clf.fit(validate_x, validate_y)
    predicted = clf.predict(test_x)
    save_csv(predicted, fname=alg + "_" + data)
    # NOTE(review): "nb" is not among the accepted alg values above, so the
    # predict_proba branch is unreachable as written.
    if alg != "nb":
        return clf.decision_function(train_x), clf.decision_function(test_x)
    else:
        return clf.predict_proba(train_x), clf.predict_proba(test_x)
示例12: scikit_GDS
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import fit [as 别名]
def scikit_GDS(x,y, X_test,y_test=None, prevmodel="yes", output=False):
from sklearn.linear_model import SGDClassifier
from sklearn.externals import joblib
clf = SGDClassifier(loss="hinge", penalty="l2")
##
if prevmodel !="yes":
clf.fit(X, y)
joblib.dump(clf, 'trained_GDS_model.pkl')
else:
clf =joblib.load('trained_GDS_model.pkl')
if output == False:
predictions = clf.predict(X_test)
correctcount = 0
totalcount = 0
for index, each in enumerate(predictions):
if y_test[index] == each:
correctcount +=1
totalcount+=1
print str(correctcount) +" / " + str(totalcount) +" = " + str(float(correctcount)/totalcount)
else:
predictions = clf.predict(X_test)
return predictions
示例13: SGD
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import fit [as 别名]
class SGD(object):
    """Thin wrapper around a modified-Huber SGDClassifier that exposes
    positive-class probabilities as a column vector via predict()."""
    def __init__(self):
        self.sgd = SGDClassifier(loss='modified_huber', alpha=.00001, penalty='elasticnet',
                                 shuffle=True, n_jobs=-1, random_state=2014)
    def fit(self, X, y):
        self.sgd.fit(X, y)
    def predict(self, X):
        # Probability of class 1, reshaped to (n_samples, 1).
        proba = self.sgd.predict_proba(X)
        return proba[:, 1][:, np.newaxis]
示例14: main
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import fit [as 别名]
def main(feature_pkl):
    """Fit a grid-search-tuned SGD logistic model on pickled features, write a
    ranked test-id csv, and print the top +/- coefficients as word clouds.

    feature_pkl: joblib pickle holding (featureIndex, trainFeatures,
    trainTargets, trainItemIds, testFeatures, testItemIds).
    """
    print 'Loading data...'
    featureIndex, trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(feature_pkl)
    print 'Normalizing data...'
    # Column-wise L2 normalization; tocsc() because column operations are
    # efficient on CSC sparse layout.
    trainFeatures = sklearn.preprocessing.normalize(trainFeatures.tocsc(), norm='l2', axis=0)
    testFeatures = sklearn.preprocessing.normalize(testFeatures.tocsc(), norm='l2', axis=0)
    #trainSplit, testSplit = splitTuple
    # Best estimator from grid search:
    clf = SGDClassifier(alpha=3.16227766017e-08, class_weight='auto', epsilon=0.1,
                        eta0=0.0, fit_intercept=True, l1_ratio=0.15,
                        learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
                        penalty='elasticnet', power_t=0.5, random_state=None, shuffle=False,
                        verbose=0, warm_start=False)
    print 'Fitting model...'
    clf.fit(trainFeatures, trainTargets)
    # Use probabilities or decision function to generate a ranking
    predicted_scores = clf.decision_function(testFeatures)
    with open(os.path.splitext(feature_pkl)[0] + '_testRanking.csv', 'w') as f:
        f.write('id\n')
        # Highest-scoring items first; only the id is written.
        for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse=True):
            f.write('%d\n' % (item_id))
    # Turn estimator params into word clouds
    # Sort feature names by their column index so coefficients line up.
    features, indices = zip(*sorted(featureIndex.iteritems(), key=operator.itemgetter(1)))
    coef_tuple = zip(clf.coef_[0], indices)
    coef_sort = sorted(coef_tuple, reverse=True)
    print 'Top 20 for illicit:'
    wordle_print(coef_sort[:20], features)
    print 'Top 20 for licit:'
    wordle_print(coef_sort[-20:], features)
示例15: sgc_test
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import fit [as 别名]
def sgc_test(X, y, weight):
    """Hold out 20% of (X, y, weight), scale features, fit a sample-weighted
    hinge-loss SGD classifier, and print weighted train/test accuracy.

    Fix: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
    train_test_split now comes from sklearn.model_selection.
    """
    from sklearn.linear_model import SGDClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix
    from sklearn.preprocessing import StandardScaler
    # Single iteration; kept as a loop so more repetitions are easy to enable.
    for i in range(0, 1):
        X_train, X_test, y_train, y_test, weight_train, weight_test = train_test_split(
            X, y, weight, test_size=0.2, random_state=0)
        clf = SGDClassifier(loss="hinge", n_iter=100, n_jobs=-1, penalty="l2")
        #clf = LogisticRegression( max_iter=100)
        # with_mean=False so sparse input stays sparse.
        scaler = StandardScaler(with_mean=False)
        scaler.fit(X_train)  # Don't cheat - fit only on training data
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)  # apply same transformation to test data
        clf.fit(X_train, y_train, sample_weight=weight_train)
        y_pred = clf.predict(X_train)
        #print(confusion_matrix(y_train, y_pred))
        print(clf.score(X_train, y_train, weight_train))
        y_pred = clf.predict(X_test)
        #print(confusion_matrix(y_test, y_pred))
        print(clf.score(X_test, y_test, weight_test))