This page collects typical usage examples of the Python method sklearn.naive_bayes.MultinomialNB.partial_fit. If you are unsure what MultinomialNB.partial_fit does or how to use it, the curated method examples below may help. You can also read further about the containing class, sklearn.naive_bayes.MultinomialNB.
Below are 15 code examples of MultinomialNB.partial_fit, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
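Before the examples, a minimal sketch of the method itself (toy data, not taken from any project below): partial_fit updates a MultinomialNB incrementally, and the first call must declare every class that will ever appear.
import numpy as np
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
X1 = np.array([[2, 1, 0], [0, 1, 3]])
clf.partial_fit(X1, [0, 1], classes=[0, 1, 2])  # declare all classes up front
X2 = np.array([[1, 0, 4]])
clf.partial_fit(X2, [2])  # later calls may omit `classes`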
Example 1: Classifier
# Required import: from sklearn.naive_bayes import MultinomialNB [as alias]
# Or: from sklearn.naive_bayes.MultinomialNB import partial_fit [as alias]
class Classifier():
def __init__(self):
"""
Initialize a classifier.
"""
self.clf = MultinomialNB()
def classify(self, docs, num_topics=5):
"""
Classify a list of documents.
Args:
| docs (list) -- the documents to classify (a list of strings)
| num_topics (int) -- number of top predicted topics
to return for each doc.
Returns:
| list -- the list of lists of document topics.
"""
# Returns a 2d array, where each array is
# a list of probabilities for labels.
docs_ = vectorize(docs)
probs = self.clf.predict_proba(docs_)
# This will sort the *indices* of the inner arrays, instead of the actual values.
# These indices correspond with labels.
# It goes from low to high.
probs_sorted = probs.argsort()
# Slice all the inner arrays to get `num_topics` top probabilities (their indices).
probs_top = probs_sorted[:, -num_topics:]
# Convert the indices to the actual labels, and return.
        return [self.clf.classes_[prob_indices] for prob_indices in probs_top]
def train(self, docs, labels):
"""
Train the classifier with documents and labels.
The training can be online. That is, an existing
classifier can be updated with new training data.
Args:
| docs (list) -- the documents to train on (a list of strings)
| labels (list) -- the labels to train on (a list of lists of strings)
"""
docs_ = vectorize(docs)
self.clf.partial_fit(docs_, labels)
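Note that self.clf.partial_fit(docs_, labels) as written assumes the classifier has been fitted before; on a brand-new MultinomialNB the first partial_fit call raises an error unless classes is given. A hedged sketch of one way around this (my addition, not in the original project):
from sklearn.naive_bayes import MultinomialNB

ALL_LABELS = ["sports", "politics", "tech"]  # hypothetical full label set
clf = MultinomialNB()
# Passing the same `classes` on every call is safe and avoids the
# "classes must be passed on the first call" error on a fresh model.
clf.partial_fit([[1, 0, 2]], ["sports"], classes=ALL_LABELS)
clf.partial_fit([[0, 3, 1]], ["tech"], classes=ALL_LABELS)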
Example 2: WeightedPartialFitPassiveTransferClassifier
# Required import: from sklearn.naive_bayes import MultinomialNB [as alias]
# Or: from sklearn.naive_bayes.MultinomialNB import partial_fit [as alias]
class WeightedPartialFitPassiveTransferClassifier(object):
def __init__(self, target_weight):
self.classifier = MultinomialNB()
self.target_weight = target_weight
self.vectorizer = FullContextBagOfWordsLeftRightCutoff(9)
    # Train on unambiguous annotations that have a group number
def train_source(self, annotations):
X = self.vectorizer.fit_transform(annotations)
y = numpy.array([annotation.get_group_number() for annotation in annotations])
self.classifier.fit(X, y)
    # Train on ambiguous annotations with their corresponding group labels
def train_target_online(self, annotations, labels):
X = self.vectorizer.transform(annotations)
y = numpy.array([Annotation.GROUP_MAPPING[label] for label in labels])
weight_vector = [self.target_weight] * len(annotations)
self.classifier.partial_fit(X, y, Annotation.GROUP_MAPPING.values(), weight_vector)
def get_group_number_prob_pair(self, annotation, prob_vector):
group_option_indices = annotation.get_group_number()
group_option_prob = [prob_vector[group_option_index] for group_option_index in group_option_indices]
return max(zip(group_option_indices, group_option_prob), key = lambda (index, prob): prob)
def get_group_number(self, annotation, prob_vector):
group_index, _ = self.get_group_number_prob_pair(annotation, prob_vector)
return group_index
# tested, results for the classifier trained on source are not random
def predict(self, annotations):
X = self.vectorizer.transform(annotations)
probs = self.classifier.predict_proba(X) # [n_samples, n_classes]
return numpy.array([self.get_group_number(annotation, row)
for row, annotation in itertools.izip(probs, annotations)])
# tested, results for the classifier trained on source are not random
def get_max_probability(self, annotation, prob_vector):
_, prob = self.get_group_number_prob_pair(annotation, prob_vector)
return prob
def get_prob_estimates(self, annotations):
X = self.vectorizer.transform(annotations)
probs = self.classifier.predict_proba(X)
return numpy.array([self.get_max_probability(annotation, row)
for row, annotation in itertools.izip(probs, annotations)])
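Example 2 passes the class list and a per-sample weight vector positionally as the third and fourth arguments of partial_fit. The same idea with keyword arguments, on toy data (a sketch, not the project's vectorizer):
import numpy as np
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
X = np.array([[3, 0], [0, 2], [1, 1]])
y = np.array([0, 1, 1])
# Up-weight target-domain samples so each counts five times as much
clf.partial_fit(X, y, classes=[0, 1], sample_weight=[1.0, 5.0, 5.0])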
Example 3: multinomial_bayes_sklearn
# Required import: from sklearn.naive_bayes import MultinomialNB [as alias]
# Or: from sklearn.naive_bayes.MultinomialNB import partial_fit [as alias]
def multinomial_bayes_sklearn(corpus, documents_training, documents_test, words_features, smoothing):
"""
    Multinomial Naive Bayes using only the sklearn MultinomialNB class
Training in parts to avoid memory problems
:param corpus:
:param documents_training:
:param documents_test:
:param words_features:
:param smoothing:
:return:
"""
print "-----Multinomial Bayes sklearn pure algorithm------"
categories = util_classify.get_categories(corpus)
classifier = MultinomialNB(alpha=smoothing)
    '''
    print "Training the algorithm all at once..."
    X_train_features = []
    y_train_categories = []
    ##### Training it in one shot
    for (id, original_category, annotations) in documents_training:
        X_train_features.append(util_classify.transform_document_in_vector(annotations, words_features, corpus))
        y_train_categories.append(original_category)
    classifier.fit(np.array(X_train_features), np.array(y_train_categories))
    '''
# Training in parts
print "Training algorithm in parts..."
first = True
for (id, original_category, annotations) in documents_training:
if first is True:
classifier.partial_fit(np.array(util_classify.transform_document_in_vector(annotations, words_features, corpus)), np.array([original_category]), classes=categories)
first = False
else:
classifier.partial_fit(np.array(util_classify.transform_document_in_vector(annotations, words_features, corpus)), np.array([original_category]))
print "Calculating metrics..."
estimated_categories = []
original_categories = []
for (id, cat_original, annotations) in documents_test:
cat_estimated = classifier.predict(np.array((util_classify.transform_document_in_vector(annotations, words_features, corpus))))
estimated_categories.append(categories.index(cat_estimated))
original_categories.append(categories.index(cat_original))
return original_categories, estimated_categories
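Example 3 feeds one document per partial_fit call; note that current scikit-learn requires 2-D input, so a single sample vector must be reshaped (a minimal sketch, my addition):
import numpy as np
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha=0.5)
vec = np.array([1, 0, 2, 0])  # one document as a 1-D count vector
clf.partial_fit(vec.reshape(1, -1), ["sports"], classes=["politics", "sports"])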
Example 4: GraphemeBasedModel
# Required import: from sklearn.naive_bayes import MultinomialNB [as alias]
# Or: from sklearn.naive_bayes.MultinomialNB import partial_fit [as alias]
class GraphemeBasedModel(DiacriticsRestorationModel):
def __init__(self, window=5, input_classes=None):
self.window = window
self.input_classes = input_classes
def train(self, corpus, classes=None, chunk_size=100000):
self.vectorizer = FeatureHasher(non_negative=True,
n_features=len(classes)*2*self.window,
input_type='pair')
self.clf = MultinomialNB()
i = 0
j = 0
X = []
Y = []
for x, y in corpus:
if x[self.window][1] in self.input_classes:
X.append(x)
Y.append(y)
i += 1
if i < chunk_size:
continue
j += 1
click.echo("Running iteration {}".format(j))
X = self.vectorizer.transform(X)
self.clf.partial_fit(X, Y, classes)
X = []
Y = []
i = 0
def restore(self, string):
corpus = []
out = ''
for x, y in string_to_grapheme_corpus(string, self.window):
if x[self.window][1] in self.input_classes:
x = self.vectorizer.transform([x])
out += self.clf.predict(x)[0]
else:
out += y
return out
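Two caveats on Example 4: samples left over after the last full chunk are never fitted, and the non_negative argument of FeatureHasher was removed in later scikit-learn releases. A rough modern equivalent of the hasher-plus-partial_fit pairing (my assumption, not the original code) is:
from sklearn.feature_extraction import FeatureHasher
from sklearn.naive_bayes import MultinomialNB

# alternate_sign=False keeps hashed counts non-negative, as MultinomialNB requires
hasher = FeatureHasher(n_features=2**10, input_type="string", alternate_sign=False)
clf = MultinomialNB()
batch = [["f1", "f2"], ["f2", "f3"]]  # each sample: an iterable of string features
clf.partial_fit(hasher.transform(batch), [0, 1], classes=[0, 1])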
Example 5: GOTSpoilerChecker
# Required import: from sklearn.naive_bayes import MultinomialNB [as alias]
# Or: from sklearn.naive_bayes.MultinomialNB import partial_fit [as alias]
class GOTSpoilerChecker:
def __init__(self, X, Y, classes, ngram_range=(1, 3)):
self.X = X
self.Y = Y
self.classes = classes
self.ngram_range = ngram_range
self.learn()
def learn(self):
self.tf_idf = TfidfVectorizer(ngram_range=self.ngram_range, smooth_idf=True, max_df=0.7)
self.tf_idf.fit(self.X)
        self.nb = MultinomialNB()
        # batch_size is assumed to be a module-level constant in the original project
        for i in range(0, len(self.X), batch_size):
batch_end = min(i + batch_size, len(self.X))
self.nb.partial_fit(self.tf_idf.transform(self.X[i:batch_end]), self.Y[i:batch_end], classes=self.classes)
def classify(self, data):
return self.nb.predict(self.tf_idf.transform([data]))[0]
def classify_list(self, data):
return self.nb.predict(self.tf_idf.transform(data))
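Example 5 fits the TfidfVectorizer once on the whole corpus and only batches the classifier; TfidfVectorizer itself has no partial_fit. A condensed sketch of that split on toy data (my illustration):
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

docs = ["a spoiler here", "safe text", "another spoiler", "more safe text"]
labels = [1, 0, 1, 0]
tf_idf = TfidfVectorizer().fit(docs)  # vocabulary and idf need the whole corpus
clf = MultinomialNB()
for i in range(0, len(docs), 2):      # the classifier is the incremental part
    clf.partial_fit(tf_idf.transform(docs[i:i+2]), labels[i:i+2], classes=[0, 1])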
Example 6: test_alpha_vector
# Required import: from sklearn.naive_bayes import MultinomialNB [as alias]
# Or: from sklearn.naive_bayes.MultinomialNB import partial_fit [as alias]
def test_alpha_vector():
X = np.array([[1, 0], [1, 1]])
y = np.array([0, 1])
# Setting alpha=np.array with same length
# as number of features should be fine
alpha = np.array([1, 2])
nb = MultinomialNB(alpha=alpha)
nb.partial_fit(X, y, classes=[0, 1])
# Test feature probabilities uses pseudo-counts (alpha)
feature_prob = np.array([[1 / 2, 1 / 2], [2 / 5, 3 / 5]])
assert_array_almost_equal(nb.feature_log_prob_, np.log(feature_prob))
# Test predictions
prob = np.array([[5 / 9, 4 / 9], [25 / 49, 24 / 49]])
assert_array_almost_equal(nb.predict_proba(X), prob)
# Test alpha non-negative
alpha = np.array([1., -0.1])
expected_msg = ('Smoothing parameter alpha = -1.0e-01. '
'alpha should be > 0.')
m_nb = MultinomialNB(alpha=alpha)
assert_raise_message(ValueError, expected_msg, m_nb.fit, X, y)
# Test that too small pseudo-counts are replaced
ALPHA_MIN = 1e-10
alpha = np.array([ALPHA_MIN / 2, 0.5])
m_nb = MultinomialNB(alpha=alpha)
m_nb.partial_fit(X, y, classes=[0, 1])
assert_array_almost_equal(m_nb._check_alpha(),
[ALPHA_MIN, 0.5],
decimal=12)
# Test correct dimensions
alpha = np.array([1., 2., 3.])
m_nb = MultinomialNB(alpha=alpha)
expected_msg = ('alpha should be a scalar or a numpy array '
'with shape [n_features]')
assert_raise_message(ValueError, expected_msg, m_nb.fit, X, y)
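The feature_prob values asserted above follow directly from adding the alpha pseudo-counts to the per-class feature counts. A short sketch reproducing the arithmetic with plain numpy:
import numpy as np

X = np.array([[1, 0], [1, 1]])
alpha = np.array([1, 2])
for rows in ([0], [1]):                    # class 0 owns row 0, class 1 owns row 1
    counts = X[rows].sum(axis=0) + alpha   # pseudo-counts added per feature
    print(counts / counts.sum())           # -> [0.5 0.5], then [0.4 0.6]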
Example 7: CombinedProbTransferClassifier
# Required import: from sklearn.naive_bayes import MultinomialNB [as alias]
# Or: from sklearn.naive_bayes.MultinomialNB import partial_fit [as alias]
class CombinedProbTransferClassifier(WeightedPartialFitPassiveTransferClassifier):
def __init__(self, beta=0.5):
self.source_classifier = MultinomialNB()
self.target_classifier = MultinomialNB(fit_prior=False)
self.beta = beta
self.vectorizer = FullContextBagOfWordsLeftRightCutoff(9)
    # Train on unambiguous annotations that have a group number
def train_source(self, annotations):
X = self.vectorizer.fit_transform(annotations)
y = numpy.array([annotation.get_group_number() for annotation in annotations])
self.source_classifier.fit(X, y)
self.target_classifier.fit(X, y, sample_weight=0)
    # Train on ambiguous annotations with their corresponding group labels
def train_target_online(self, annotations, labels):
X = self.vectorizer.transform(annotations)
y = numpy.array([Annotation.GROUP_MAPPING[label] for label in labels])
self.target_classifier.partial_fit(X, y, Annotation.GROUP_MAPPING.values())
def predict(self, annotations):
X = self.vectorizer.transform(annotations)
source_prob = self.source_classifier.predict_proba(X) # [n_samples, n_classes]
target_prob = self.target_classifier.predict_proba(X)
combined_prob = self.beta*source_prob + (1-self.beta)*target_prob
return numpy.array([self.get_group_number(annotation, row)
for row, annotation in itertools.izip(combined_prob, annotations)])
def get_prob_estimates(self, annotations):
X = self.vectorizer.transform(annotations)
source_prob = self.source_classifier.predict_proba(X) # [n_samples, n_classes]
target_prob = self.target_classifier.predict_proba(X)
combined_prob = self.beta*source_prob + (1-self.beta)*target_prob
return numpy.array([self.get_max_probability(annotation, row)
for row, annotation in itertools.izip(combined_prob, annotations)])
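The interpolation in Example 7 is plain convex mixing of two probability tables. A toy sketch of the same idea (my illustration, not the project's annotations):
import numpy as np
from sklearn.naive_bayes import MultinomialNB

X = np.array([[2, 0], [0, 2], [1, 1]])
y = np.array([0, 1, 0])
source = MultinomialNB().fit(X, y)
target = MultinomialNB(fit_prior=False).fit(X, y)
beta = 0.5
mixed = beta * source.predict_proba(X) + (1 - beta) * target.predict_proba(X)
print(mixed.argmax(axis=1))  # combined prediction per sample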
Example 8: test_mnnb
# Required import: from sklearn.naive_bayes import MultinomialNB [as alias]
# Or: from sklearn.naive_bayes.MultinomialNB import partial_fit [as alias]
def test_mnnb(kind):
# Test Multinomial Naive Bayes classification.
# This checks that MultinomialNB implements fit and predict and returns
# correct values for a simple toy dataset.
if kind == 'dense':
X = X2
elif kind == 'sparse':
X = scipy.sparse.csr_matrix(X2)
# Check the ability to predict the learning set.
clf = MultinomialNB()
assert_raises(ValueError, clf.fit, -X, y2)
y_pred = clf.fit(X, y2).predict(X)
assert_array_equal(y_pred, y2)
# Verify that np.log(clf.predict_proba(X)) gives the same results as
# clf.predict_log_proba(X)
y_pred_proba = clf.predict_proba(X)
y_pred_log_proba = clf.predict_log_proba(X)
assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8)
# Check that incremental fitting yields the same results
clf2 = MultinomialNB()
clf2.partial_fit(X[:2], y2[:2], classes=np.unique(y2))
clf2.partial_fit(X[2:5], y2[2:5])
clf2.partial_fit(X[5:], y2[5:])
y_pred2 = clf2.predict(X)
assert_array_equal(y_pred2, y2)
y_pred_proba2 = clf2.predict_proba(X)
y_pred_log_proba2 = clf2.predict_log_proba(X)
assert_array_almost_equal(np.log(y_pred_proba2), y_pred_log_proba2, 8)
assert_array_almost_equal(y_pred_proba2, y_pred_proba)
assert_array_almost_equal(y_pred_log_proba2, y_pred_log_proba)
# Partial fit on the whole data at once should be the same as fit too
clf3 = MultinomialNB()
clf3.partial_fit(X, y2, classes=np.unique(y2))
y_pred3 = clf3.predict(X)
assert_array_equal(y_pred3, y2)
y_pred_proba3 = clf3.predict_proba(X)
y_pred_log_proba3 = clf3.predict_log_proba(X)
assert_array_almost_equal(np.log(y_pred_proba3), y_pred_log_proba3, 8)
assert_array_almost_equal(y_pred_proba3, y_pred_proba)
assert_array_almost_equal(y_pred_log_proba3, y_pred_log_proba)
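The equivalence this test asserts — chunked partial_fit matching a single fit — can be checked compactly on any count data (a toy sketch):
import numpy as np
from sklearn.naive_bayes import MultinomialNB

X = np.random.RandomState(0).randint(5, size=(6, 3))
y = np.array([0, 0, 1, 1, 2, 2])
a = MultinomialNB().fit(X, y)
b = MultinomialNB().partial_fit(X[:3], y[:3], classes=np.unique(y))
b.partial_fit(X[3:], y[3:])
print(np.allclose(a.predict_proba(X), b.predict_proba(X)))  # True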
Example 9: trainRandomForest
# Required import: from sklearn.naive_bayes import MultinomialNB [as alias]
# Or: from sklearn.naive_bayes.MultinomialNB import partial_fit [as alias]
def trainRandomForest():
    columns = components + feature  # `components` and `feature` are lists assumed defined earlier in the original script
    print columns
# date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,channel,srch_ci,srch_co,srch_adults_cnt,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
train = pd.read_csv("data/train.csv", header=0,
                        names=['date_time', 'site_name', 'posa_continent', 'user_location_country',
                               'user_location_region', 'user_location_city', 'orig_destination_distance', 'user_id',
'is_mobile',
'is_package', 'channel', 'srch_ci', 'srch_co', 'srch_adults_cnt', 'srch_children_cnt',
'srch_rm_cnt',
'srch_destination_id', 'srch_destination_type_id', 'is_booking', 'cnt',
'hotel_continent',
'hotel_country', 'hotel_market', 'hotel_cluster']
, parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=100000, skiprows=100000)
test = pd.read_csv("data/train.csv", header=0,
                       names=['date_time', 'site_name', 'posa_continent', 'user_location_country',
                              'user_location_region', 'user_location_city', 'orig_destination_distance', 'user_id',
'is_mobile',
'is_package', 'channel', 'srch_ci', 'srch_co', 'srch_adults_cnt', 'srch_children_cnt',
'srch_rm_cnt',
'srch_destination_id', 'srch_destination_type_id', 'is_booking', 'cnt',
'hotel_continent',
'hotel_country', 'hotel_market', 'hotel_cluster']
, parse_dates=['date_time', 'srch_ci', 'srch_co'], nrows=100000)
clf = linear_model.SGDClassifier(loss='log', penalty="elasticnet", n_iter=70,n_jobs=4)
clf2 = MultinomialNB()
# n_components = 2
# ipca = IncrementalPCA(n_components=n_components, batch_size=10)
    n = 0
print('-' * 38)
cls = np.arange(100)
# http://stackoverflow.com/questions/28489667/combining-random-forest-models-in-scikit-learn
for chunk in train:
agg = chunk.groupby(columns)['is_booking'].agg(['count'])
agg.reset_index(inplace=True)
X_train = agg[components]
y_train = agg['hotel_cluster']
clf.partial_fit(X_train, y_train, classes= cls)
clf2.partial_fit(X_train, y_train, classes= cls)
print n
n = n + 1
print('')
X_test = test[components]
y_test = test['hotel_cluster']
score = clf.score(X_test, y_test)
print 'score SGDClassifier', score
score = clf2.score(X_test, y_test)
print 'score MultinomialNB', score
return clf
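Example 9 drives two incremental learners from the same chunk stream. Stripped to a runnable skeleton on random toy data (my simplification, not the original hotel-booking features):
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

cls = np.arange(3)
clf = SGDClassifier(loss="log_loss")  # "log_loss" in scikit-learn >= 1.1; older versions used loss="log"
clf2 = MultinomialNB()
for _ in range(5):                    # one iteration per chunk
    X = np.random.randint(4, size=(20, 2))
    y = np.random.randint(3, size=20)
    clf.partial_fit(X, y, classes=cls)
    clf2.partial_fit(X, y, classes=cls)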
Example 10: MultinomialNB
# Required import: from sklearn.naive_bayes import MultinomialNB [as alias]
# Or: from sklearn.naive_bayes.MultinomialNB import partial_fit [as alias]
clf = MultinomialNB(alpha=ALPHA)  # ALPHA and `news` (a fetched 20newsgroups bunch) are assumed defined earlier
cv = CountVectorizer(stop_words=stop_words, min_df=2)
cv = CountVectorizer()  # overrides the line above; only default settings are used
# Fit the vocabulary once on all the text so every batch maps to the same
# feature space; refitting per batch would yield incompatible feature vectors.
cv.fit(news.data[0:30])
x1 = cv.transform(news.data[0:10]).toarray()
y1 = news.target[0:10]
x2 = cv.transform(news.data[11:20]).toarray()
y2 = news.target[11:20]
x3 = cv.transform(news.data[21:30]).toarray()
y3 = news.target[21:30]
#print X
#print y1, y2
#print cv.get_feature_names()
#print news.target[0:10]
#print news.target[0:5]
#print news.target[5:10]
print np.unique(news.target[0:30])
clf.partial_fit(x1, y1, classes=np.unique(news.target[0:30]))
clf.partial_fit(x2, y2)
clf.partial_fit(x3, y3)
print 'Done'
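The batch pattern in Example 10 only works because the vocabulary is fixed up front. When the corpus is too large to pre-fit, a stateless HashingVectorizer (my suggested alternative, not in the original) sidesteps the problem:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import MultinomialNB

hv = HashingVectorizer(n_features=2**18, alternate_sign=False)  # stateless: no fit needed
clf = MultinomialNB()
for batch, labels in [(["doc one", "doc two"], [0, 1]), (["doc three"], [1])]:
    clf.partial_fit(hv.transform(batch), labels, classes=[0, 1])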
Example 11: dayhour
# Required import: from sklearn.naive_bayes import MultinomialNB [as alias]
# Or: from sklearn.naive_bayes.MultinomialNB import partial_fit [as alias]
from datetime import datetime

def dayhour(x):  # the def line was truncated in the snippet; reconstructed from the call sites below
    d = datetime.strptime(str(x), "%y%m%d%H")
    return [float(d.weekday()), float(d.hour)]
fh = FeatureHasher(n_features = 2**20, input_type="string", non_negative=True)
# Train classifier
clf = MultinomialNB()
train = pd.read_csv("testtrain.csv", chunksize = 50000, iterator = True)
all_classes = np.array([0, 1])
for chunk in train:
y_train = chunk["click"]
chunk = chunk[cols]
chunk = chunk.join(pd.DataFrame([dayhour(x) for x in chunk.hour], columns=["wd", "hr"]))
chunk.drop(["hour"], axis=1, inplace = True)
Xcat = fh.transform(np.asarray(chunk.astype(str)))
clf.partial_fit(Xcat, y_train, classes=all_classes)
# Create a submission file
usecols = cols + ["id"]
X_test = pd.read_csv("testtest.csv", usecols=usecols)
X_test = X_test.join(pd.DataFrame([dayhour(x) for x in X_test.hour], columns=["wd", "hr"]))
X_test.drop(["hour"], axis=1, inplace = True)
X_enc_test = fh.transform(np.asarray(X_test.astype(str)))
y_act = pd.read_csv("testtest.csv", usecols=['click'])
y_pred = clf.predict_proba(X_enc_test)[:, 1]
with open('logloss.txt','a') as f:
f.write('\n'+str(log_loss(y_act, y_pred))+'\tMultinomialNB')
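Example 11 evaluates with log loss on the positive-class probability column of predict_proba. A toy sketch of that metric call (my illustration):
import numpy as np
from sklearn.metrics import log_loss

y_true = np.array([0, 1, 1, 0])
p_click = np.array([0.1, 0.8, 0.6, 0.3])  # i.e. clf.predict_proba(X)[:, 1]
print(log_loss(y_true, p_click))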
Example 12: CountVectorizer
# Required import: from sklearn.naive_bayes import MultinomialNB [as alias]
# Or: from sklearn.naive_bayes.MultinomialNB import partial_fit [as alias]
"""
messages = pandas.read_csv(FILE_TRAIN, sep='\t', quoting=csv.QUOTE_NONE,names=["label", "message"])
bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(messages['message'])
messages_bow = bow_transformer.transform(messages['message'])
tfidf_transformer = TfidfTransformer().fit(messages_bow)
messages_tfidf = tfidf_transformer.transform(messages_bow)
classe = ['ham', 'spam']
if 'nb_model.pkl' not in os.listdir("./"):
    print 'First training'
    nb = MultinomialNB()
    nb.partial_fit(messages_tfidf, messages['label'], classes=classe)  # `classe` is already defined above
all_predictions = nb.predict(messages_tfidf)
msg_train, msg_test, label_train, label_test = train_test_split(messages['message'], messages['label'], test_size=0.3)
print classification_report(messages['label'], all_predictions)
# store the spam detector to disk after training
with open('nb_model.pkl', 'wb') as fout:
cPickle.dump(nb, fout)
else:
print "Training with partial_fit"
with open('nb_model.pkl','rb') as f:
nb = cPickle.load(f)
nb.partial_fit(messages_tfidf, messages['label'],classes=classe)
all_predictions = nb.predict(messages_tfidf)
print classification_report(messages['label'], all_predictions)
# store the spam detector to disk after training
        with open('nb_model.pkl', 'wb') as fout:
            cPickle.dump(nb, fout)
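Example 12 shows the pickle-then-resume pattern: because MultinomialNB stores raw counts, an unpickled model can keep learning via partial_fit. A minimal round-trip sketch (modern pickle instead of cPickle, my substitution):
import pickle
import numpy as np
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.partial_fit(np.array([[1, 2]]), ["ham"], classes=["ham", "spam"])
blob = pickle.dumps(clf)                         # persist
clf2 = pickle.loads(blob)                        # restore
clf2.partial_fit(np.array([[0, 3]]), ["spam"])   # resume training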
Example 13: enumerate
# Required import: from sklearn.naive_bayes import MultinomialNB [as alias]
# Or: from sklearn.naive_bayes.MultinomialNB import partial_fit [as alias]
tf = t0  # t0: a start timestamp assumed set earlier in the original script
#for irun, chunk in enumerate(pd.read_sql_query("SELECT * FROM trainSearchStream WHERE IsClick IN (0,1) ORDER BY RANDOM();", engine, chunksize=2000000)):
#for irun, chunk in enumerate(pd.read_sql_query("SELECT * FROM trainSearchStream WHERE IsClick IN (0,1);", engine, chunksize=2000000)):
for irun, chunk in enumerate(pd.read_sql_query("SELECT * FROM trainSearchRandom;", engine2, chunksize=2000000)):
# for chunk in pd.read_sql_query("SELECT * FROM trainSearchStream", engine, chunksize=10000):
ti = time.time()
print "Query time: ", ti - tf
if irun == 0:
X_val, Y_val = make_chunk_features(chunk)
tj = time.time()
print "Make feature time: ", tj - ti
else:
X_train_temp, Y_train_temp = make_chunk_features(chunk)
tj = time.time()
print "Make feature time: ", tj - ti
clf.partial_fit(X_train_temp, Y_train_temp, classes=all_classes)
n_train += len(X_train_temp)
n_train_pos += sum(Y_train_temp)
y_pred = clf.predict_proba(X_val.values.astype(float))
logloss = log_loss(Y_val.values.astype(float), y_pred)
losses.append(logloss)
print "Logloss: ", logloss, "n_train: ", n_train, "n_train_pos: ", n_train_pos
#s = clf.score(X_val.values.astype(float), Y_val.values.astype(float))
#scores.append(s)
#print "Score: ", s, "n_train: ", n_train, "n_train_pos: ", n_train_pos
tf = time.time()
print "Training time: ", tj - ti
y_pred = clf.predict_proba(X_val.values.astype(float))
logloss = log_loss(Y_val.values.astype(float), y_pred)
#print scores
Example 14: MultinomialNB
# Required import: from sklearn.naive_bayes import MultinomialNB [as alias]
# Or: from sklearn.naive_bayes.MultinomialNB import partial_fit [as alias]
testlabelnames = testloader.extractLabelNames()
#dictionary { sentiment:[] event:[] time:[] } of label indices for each training example
testlabelindices = testloader.extractLabelIndices()
classifiers = {}
print "training naive bayes classifiers on test data"
# Train a multinomial naive bayes on each label type
for labeltype in ['sentiment', 'event', 'time']:
nbclassifier = MultinomialNB()
# the trainY is a single index for the maximum confidence label in a label class
y = trainlabelindices[labeltype]
# list of all possible labels for nbclassifier
indices = [ i for i in range(len(loader.labelnames[labeltype]))]
    # partial_fit allows training incrementally, without the full training set in memory at once
nbclassifier.partial_fit(trainX, y, indices)
classifiers[labeltype] = nbclassifier
print 'running csp on each example'
backsearch = BacktrackingSearch()
#controls the minimum probability for a label to be considered in the csp
probabilitythreshold = .2
# controls the minimum confidence for a label to be present in the gold bit vector
confidencethreshold = .5
# gold output for evaluation for each training example
testgoldvectors = testloader.extractLabelBitVectors(confidencethreshold)
#Create a new csp for each example and assign unary potentials according to the classifier
#Solve the csp using backtracking search
#Compare the resulting assignment to the goldlabel vectors to get accuracy
Author: simonzheng | Project: simonlucas-tweet-weather-classifier | Lines: 32 | Source: evaluatestructuredprediction.py
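Example 14 trains one independent classifier per label type. A toy sketch of that dictionary-of-classifiers pattern (my illustration, hypothetical label types):
import numpy as np
from sklearn.naive_bayes import MultinomialNB

X = np.array([[1, 0], [0, 2]])
labels = {"sentiment": [0, 1], "event": [1, 1]}
classifiers = {}
for labeltype, y in labels.items():
    clf = MultinomialNB()
    clf.partial_fit(X, y, classes=[0, 1])  # one incremental model per label type
    classifiers[labeltype] = clf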
Example 15: Classifier
# Required import: from sklearn.naive_bayes import MultinomialNB [as alias]
# Or: from sklearn.naive_bayes.MultinomialNB import partial_fit [as alias]
class Classifier:
"""
Multinomial Naive Bayes classifier.
Provides binary classification; that is,
labels are either 0 or 1,
0 being negative,
1 being positive.
"""
def __init__(self, filepath=path.join(__location__, "classifier.pickle")):
"""
Initialize the classifier.
Tries to load the existing one;
if none exists, a new one is created.
"""
self.filepath = filepath
hasher = HashingVectorizer(stop_words="english", non_negative=True, norm=None, binary=False)
self.vectorizer = Pipeline((("hasher", hasher), ("tf_idf", TfidfTransformer())))
# Try to load the existing classifier.
self.clf = self.load()
# If there wasn't one, create a new one.
if not self.clf:
self.clf = MultinomialNB(alpha=0.1)
def train(self, docs, labels, save=True):
"""
Updates the classifier with new training data.
By default, saves the updated classifier as well.
"""
if docs:
training = self.vectorizer.fit_transform(docs)
self.clf.partial_fit(training, labels, [0, 1])
if save:
self.save()
def classify(self, docs):
"""
Classifies a list of documents.
Returns a list of class probabilities
for each document.
"""
        try:
            # Use transform, not fit_transform: idf weights should come from the
            # training data, not from the documents being classified.
            docs_ = self.vectorizer.transform(docs)
            return self.clf.predict_proba(docs_)
# Likely because the classifier hasn't been trained yet.
except AttributeError:
return []
def save(self):
"""
Persist the classifier to the disk.
"""
file = open(self.filepath, "wb")
pickle.dump(self.clf, file)
def load(self):
"""
Load the classifier from disk.
Returns None if one wasn't found.
"""
        try:
            with open(self.filepath, "rb") as file:
                return pickle.load(file)
except IOError:
return None
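A hypothetical usage sketch for this Classifier (assuming the module's imports and the __location__ constant are defined as in the original project):
clf = Classifier()
clf.train(["great product", "terrible service"], [1, 0], save=False)
print(clf.classify(["pretty great"]))  # [[p_negative, p_positive]] per document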