本文整理汇总了Python中sklearn.linear_model.SGDClassifier.predict_log_proba方法的典型用法代码示例。如果您正苦于以下问题:Python SGDClassifier.predict_log_proba方法的具体用法?Python SGDClassifier.predict_log_proba怎么用?Python SGDClassifier.predict_log_proba使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.linear_model.SGDClassifier的用法示例。
在下文中一共展示了SGDClassifier.predict_log_proba方法的4个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_sgd_proba
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_log_proba [as 别名]
def test_sgd_proba(self):
    """Check SGD.predict_proba and SGD.predict_log_proba.

    Covers three situations:

    * hinge loss, where neither probability method is exposed;
    * the binary case for every loss that supports probabilities;
    * the multiclass case, separately for log loss and modified Huber.

    Single samples are passed as one-row 2-D inputs so that the calls do
    not depend on implicit 1-D -> 2-D promotion inside the estimator.
    """
    # Hinge loss does not allow for conditional prob estimate.
    # We cannot use the factory here, because it defines predict_proba
    # anyway.
    clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=10).fit(X, Y)
    assert_false(hasattr(clf, "predict_proba"))
    assert_false(hasattr(clf, "predict_log_proba"))

    # log and modified_huber losses can output probability estimates
    # binary case
    for loss in ["log", "modified_huber"]:
        # Use the loop variable: a hard-coded loss here would silently
        # skip the log-loss binary case.
        clf = self.factory(loss=loss, alpha=0.01, n_iter=10)
        clf.fit(X, Y)
        p = clf.predict_proba([[3, 2]])
        assert_true(p[0, 1] > 0.5)
        p = clf.predict_proba([[-1, -1]])
        assert_true(p[0, 1] < 0.5)

        p = clf.predict_log_proba([[3, 2]])
        assert_true(p[0, 1] > p[0, 0])
        p = clf.predict_log_proba([[-1, -1]])
        assert_true(p[0, 1] < p[0, 0])

    # log loss multiclass probability estimates
    clf = self.factory(loss="log", alpha=0.01, n_iter=10).fit(X2, Y2)

    d = clf.decision_function([[0.1, -0.1], [0.3, 0.2]])
    p = clf.predict_proba([[0.1, -0.1], [0.3, 0.2]])
    assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1))
    assert_almost_equal(p[0].sum(), 1)
    assert_true(np.all(p[0] >= 0))

    # Probabilities must rank the classes like the decision function does.
    p = clf.predict_proba([[-1, -1]])
    d = clf.decision_function([[-1, -1]])
    assert_array_equal(np.argsort(p[0]), np.argsort(d[0]))

    # predict_log_proba must agree with log(predict_proba).
    log_p = clf.predict_log_proba([[3, 2]])
    p = clf.predict_proba([[3, 2]])
    assert_array_almost_equal(np.log(p), log_p)

    log_p = clf.predict_log_proba([[-1, -1]])
    p = clf.predict_proba([[-1, -1]])
    assert_array_almost_equal(np.log(p), log_p)

    # Modified Huber multiclass probability estimates; requires a separate
    # test because the hard zero/one probabilities may destroy the
    # ordering present in decision_function output.
    clf = self.factory(loss="modified_huber", alpha=0.01, n_iter=10)
    clf.fit(X2, Y2)
    d = clf.decision_function([[3, 2]])
    p = clf.predict_proba([[3, 2]])
    if not isinstance(self, SparseSGDClassifierTestCase):
        assert_equal(np.argmax(d, axis=1), np.argmax(p, axis=1))
    else:  # XXX the sparse test gets a different X2 (?)
        assert_equal(np.argmin(d, axis=1), np.argmin(p, axis=1))

    # the following sample produces decision_function values < -1,
    # which would cause naive normalization to fail (see comment
    # in SGDClassifier.predict_proba)
    x = X.mean(axis=0)
    d = clf.decision_function([x])
    if np.all(d < -1):  # XXX not true in sparse test case (why?)
        p = clf.predict_proba([x])
        assert_array_almost_equal(p[0], [1 / 3.0] * 3)
示例2: __init__
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_log_proba [as 别名]
class SA_SGDClassifier:
    """Sentiment classifier: token counts -> tf-idf -> ``SGDClassifier``.

    Typical use is ``SA_SGDClassifier(...)`` (which loads the data),
    then ``train()``, then one of the ``predict_*`` methods.

    The classifier is always fitted on tf-idf weighted counts, so every
    prediction path must push its input through the same two transforms
    (``count_vectorizer`` then ``tf_idf``) before calling the classifier.
    """

    def __init__(self, train_data_percentage=70, count_vector_type='normal',
                 data_base_path=None, total_entries=1000, starting_pos=0):
        """Store the configuration and immediately load the training data.

        Args:
            train_data_percentage: share (0-100) of ``total_entries`` used
                for training.
            count_vector_type: ``'normal'`` selects unigram counts; any
                other value selects unigrams plus bigrams.
            data_base_path: stored on the instance; not read by any method
                of this class.
            total_entries: total number of entries considered.
            starting_pos: forwarded to ``preprocessing.process_raw_data``.
        """
        self.test_values = None
        self.training_data = None
        self.test_data_files = None
        self.train_data_percentage = train_data_percentage
        self.data_base_path = data_base_path
        self.starting_pos = starting_pos
        self.total_entries = total_entries
        self.counts = None
        self.classifier = None
        if count_vector_type == 'normal':
            self.count_vectorizer = CountVectorizer()
        else:
            # just put anything different than 'normal' count_vector_type while initializing
            self.count_vectorizer = CountVectorizer(ngram_range=(1, 2))
        self.tf_idf = TfidfTransformer()
        self.initialized = False
        self.trained = False
        self.initialize()

    def initialize(self):
        """Load the raw data and fit the count vectorizer on the training text.

        Leaves ``self.initialized`` False (after printing a message) when
        ``starting_pos`` does not leave room for the test split.
        """
        max_training_datapoints = int(self.total_entries * (self.train_data_percentage / 100))
        if self.total_entries - max_training_datapoints < self.starting_pos:
            print('incorrect parameters provided check starting_pos and training_percentage')
            return
        # presumably returns (training frame, {polarity: [file names]});
        # verify against preprocessing.process_raw_data
        temp = preprocessing.process_raw_data(max_training_datapoints=max_training_datapoints,
                                              starting_pos=self.starting_pos)
        self.training_data = temp[0]
        self.test_data_files = temp[1]
        self.counts = self.count_vectorizer.fit_transform(self.training_data['text'].values)
        self.initialized = True

    def train(self, loss='hinge'):
        """Fit the tf-idf transformer and the SGD classifier.

        Args:
            loss: loss passed to ``SGDClassifier``. The default ``'hinge'``
                keeps the previous behaviour. Hinge loss provides no
                probability estimates; pass ``'log'`` or
                ``'modified_huber'`` if ``predict_from_user_string`` should
                report probabilities.
        """
        if not self.initialized:
            print('classifier not initialized')
            return
        from sklearn.linear_model import SGDClassifier
        self.classifier = SGDClassifier(loss=loss, penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
        targets = self.training_data['class'].values
        counts_tf_idf = self.tf_idf.fit_transform(self.counts)
        self.classifier.fit(counts_tf_idf, targets)
        print('classifier was trained from', len(self.training_data['text']), 'entries')
        self.trained = True

    def predict_from_test_data(self):
        """Score the classifier on the held-out review files and print metrics."""
        if not self.initialized or not self.trained:
            print('classifier not initialized or not trained')
            return
        base_path = './txt_sentoken/'
        test_reviews = []
        true_outputs = []
        for polarity in self.test_data_files:
            for file_name in self.test_data_files[polarity]:
                # Context manager closes the file even if reading fails.
                with open(base_path + polarity + '/' + file_name, 'r') as review_file:
                    file_content = review_file.read().splitlines()
                test_reviews.append(" ".join(file_content))
                true_outputs.append(polarity)
        # Apply the same counts -> tf-idf pipeline the classifier was fitted on.
        test_reviews_count = self.count_vectorizer.transform(test_reviews)
        test_reviews_tf_idf = self.tf_idf.transform(test_reviews_count)
        # Report what was actually read instead of assuming balanced classes.
        print('total reviews analyzed(test data):', len(test_reviews))
        predictions = self.classifier.predict(test_reviews_tf_idf)
        print('f1 score for negative polarity:', f1_score(true_outputs, predictions, pos_label='neg'))
        print('f1 score for positive polarity:', f1_score(true_outputs, predictions, pos_label='pos'))
        print('mean accuracy on test data:', self.classifier.score(test_reviews_tf_idf, true_outputs))

    def predict_from_user_string(self, string=None, true_output=None):
        """Classify one string and print the prediction.

        Args:
            string: text to classify; when None it is read from stdin.
            true_output: optional expected label; when given, a line saying
                whether the prediction matched is printed.
        """
        if not self.initialized or not self.trained:
            print('classifier not initialized or not trained')
            return
        if string is None:
            # input() accepts its prompt positionally only.
            string = input('Enter the string:\n')
        string_list = [string]
        string_count = self.count_vectorizer.transform(string_list)
        # Same feature space as in train(): counts re-weighted by tf-idf.
        string_tf_idf = self.tf_idf.transform(string_count)
        predicted_polarity = self.classifier.predict(string_tf_idf)
        if true_output is not None:
            # predict() returns one label per input row; compare the single label.
            if true_output == predicted_polarity[0]:
                print('correct prediction :)')
            else:
                print('incorrect prediction :(')
        print('predicted polarity:', predicted_polarity)
        # Probability methods exist only for losses that support them
        # (not for the default hinge loss), so do not assume them.
        if hasattr(self.classifier, 'predict_proba'):
            print('probability estimate', self.classifier.predict_proba(string_tf_idf))
            print('log probability estimate', self.classifier.predict_log_proba(string_tf_idf))
        else:
            print("probability estimates are not available for this loss; "
                  "train with loss='log' or loss='modified_huber' to enable them")
示例3: get_minibatch
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_log_proba [as 别名]
clf = clf.partial_fit(X_val, y_val)

# Evaluate on a fresh mini-batch, then fold that batch into the model too.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

clf = clf.partial_fit(X_test, y_test)

# Serializing
import os
import pickle

# 1. Create a directory to store the serialized objects
dest = os.path.join('movieclassifier', 'pkl_objects')
os.makedirs(dest, exist_ok=True)

# Serialize the classifier as well as the stop words; the context
# managers make sure both files are flushed and closed.
stop = stopwords.words('english')
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as stop_file:
    pickle.dump(stop, stop_file, protocol=4)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as clf_file:
    pickle.dump(clf, clf_file, protocol=4)

# Predicting
x = "This is a stupid movies,I will not recomend it"
label = {0: 'negative', 1: "positive"}
# Vectorizers expect an iterable of documents, so wrap the single string.
X = vect.transform([x])
# Use predict_proba for the percentage: log-probabilities are <= 0 and
# would print a meaningless negative "probability".
print('Prediction %s\nProbability : %.2f%%' % (label[clf.predict(X)[0]], np.max(clf.predict_proba(X)) * 100))
示例4: get_positive_c
# 需要导入模块: from sklearn.linear_model import SGDClassifier [as 别名]
# 或者: from sklearn.linear_model.SGDClassifier import predict_log_proba [as 别名]
# Evaluate on the test split. The metrics are printed because a bare
# expression statement discards its value when this runs as a script.
pred = clf.predict(test4)
print('accuracy: %.3f' % np.mean(pred == d['test']['labs']))
print(metrics.confusion_matrix(d['test']['labs'], pred))
# -- Testing on particular people ---
def get_positive_c(rdd_c):
    """Collect one message per record of ``rdd_c``.

    Each record's ``'msg'`` field is wrapped in a list, flattened with
    ``_.flatten`` and its first element kept, so the result is the same
    whether the field holds a single value or a list.

    Returns the list produced by ``collect()`` on the mapped ``rdd_c``.
    """
    def first_message(record):
        flattened = _.flatten([record['msg']])
        return flattened[0]

    return rdd_c.map(first_message).collect()
# Restrict to one user's messages on one board and keep the result cached.
rdd_c = rdd.filter(lambda x: x['user'] == 'TheBrosnanDigest' and x['board_id'] == '18971').cache()
msgs = get_positive_c(rdd_c)

# Run the messages through the same feature steps, in the same order.
test2 = vect.transform(msgs)
test3 = ch2.transform(test2)
test4 = tfidf.transform(test3)

# Turn log-probabilities back into probabilities in one vectorised call.
pred = np.exp(clf.predict_log_proba(test4))
# Rank messages by the probability in the first class column, highest first.
pred_ = sorted(zip(msgs, pred[:, 0]), key=lambda x: x[1], reverse=True)
pprint(pred_[0:10])

# --- Saving Model ----
model = {
    'vect': vect,
    'ch2': ch2,
    'tfidf': tfidf,
    'clf': clf,
}
# Context manager guarantees the pickle file is flushed and closed.
with open('models/triclass_model_20150826_1200.pickle', 'wb') as model_file:
    pickle.dump(model, model_file)