当前位置: 首页>>代码示例>>Python>>正文


Python MultinomialNB.predict_log_proba方法代码示例

本文整理汇总了Python中sklearn.naive_bayes.MultinomialNB.predict_log_proba方法的典型用法代码示例。如果您正苦于以下问题:Python MultinomialNB.predict_log_proba方法的具体用法?Python MultinomialNB.predict_log_proba怎么用?Python MultinomialNB.predict_log_proba使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在sklearn.naive_bayes.MultinomialNB的用法示例。


在下文中一共展示了MultinomialNB.predict_log_proba方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_mnnb

# 需要导入模块: from sklearn.naive_bayes import MultinomialNB [as 别名]
# 或者: from sklearn.naive_bayes.MultinomialNB import predict_log_proba [as 别名]
def test_mnnb(kind):
    """Smoke-test MultinomialNB on the toy dataset, dense or sparse.

    Checks fit/predict correctness, agreement between predict_proba and
    predict_log_proba, and that partial_fit (chunked or in one call)
    reproduces the batch fit.
    """
    if kind == 'dense':
        data = X2
    elif kind == 'sparse':
        data = scipy.sparse.csr_matrix(X2)

    # The learning set itself must be predicted perfectly; negative
    # feature counts are rejected with a ValueError.
    model = MultinomialNB()
    assert_raises(ValueError, model.fit, -data, y2)
    predictions = model.fit(data, y2).predict(data)

    assert_array_equal(predictions, y2)

    # log(predict_proba) must agree with predict_log_proba.
    probas = model.predict_proba(data)
    log_probas = model.predict_log_proba(data)
    assert_array_almost_equal(np.log(probas), log_probas, 8)

    # Incremental fitting over three slices must match the batch fit.
    incremental = MultinomialNB()
    incremental.partial_fit(data[:2], y2[:2], classes=np.unique(y2))
    incremental.partial_fit(data[2:5], y2[2:5])
    incremental.partial_fit(data[5:], y2[5:])

    predictions2 = incremental.predict(data)
    assert_array_equal(predictions2, y2)

    probas2 = incremental.predict_proba(data)
    log_probas2 = incremental.predict_log_proba(data)
    assert_array_almost_equal(np.log(probas2), log_probas2, 8)
    assert_array_almost_equal(probas2, probas)
    assert_array_almost_equal(log_probas2, log_probas)

    # A single partial_fit over the whole data is equivalent to fit too.
    whole = MultinomialNB()
    whole.partial_fit(data, y2, classes=np.unique(y2))

    predictions3 = whole.predict(data)
    assert_array_equal(predictions3, y2)
    probas3 = whole.predict_proba(data)
    log_probas3 = whole.predict_log_proba(data)
    assert_array_almost_equal(np.log(probas3), log_probas3, 8)
    assert_array_almost_equal(probas3, probas)
    assert_array_almost_equal(log_probas3, log_probas)
开发者ID:AlexisMignon,项目名称:scikit-learn,代码行数:51,代码来源:test_naive_bayes.py

示例2: recommend

# 需要导入模块: from sklearn.naive_bayes import MultinomialNB [as 别名]
# 或者: from sklearn.naive_bayes.MultinomialNB import predict_log_proba [as 别名]
def recommend(twitterword):
    """Estimate the retweet chance for *twitterword* as a percentage.

    Trains a MultinomialNB model on the tweet corpus returned by
    get_words_df() and returns 100 * P(retweet | twitterword).
    """
    newpd = get_words_df()
    #newpd = pd.read_csv('twitter_bigdf_appended_cleanedtweets_averageperuser.csv')
    newpd['Tweet'] = newpd['Tweet'].map(str)

    newpd['was_retweeted'] = newpd['average_retweet_threshold']

    # Hyper-parameters fixed from an earlier tuning run.
    best_alpha = 50.0
    best_min_df = 0.01

    vectorizer = CountVectorizer(min_df=best_min_df)
    x, y = make_xy(newpd, vectorizer)
    xtrain, xtest, ytrain, ytest = train_test_split(x, y)

    clf = MultinomialNB(alpha=best_alpha).fit(xtrain, ytrain)

    # Probability of the positive ("was retweeted") class for the query word.
    # (The previous version also ran predict/predict_proba/predict_log_proba
    # over the full corpus and discarded the results.)
    retweet_chance = clf.predict_proba(vectorizer.transform([twitterword]))

    answer = retweet_chance[0][1] * 100
    return answer
开发者ID:2dpodcast,项目名称:cs109_twitterapp,代码行数:27,代码来源:twitterword.py

示例3: return_framing_data

# 需要导入模块: from sklearn.naive_bayes import MultinomialNB [as 别名]
# 或者: from sklearn.naive_bayes.MultinomialNB import predict_log_proba [as 别名]
def return_framing_data(training_set, output_filepath):
    """Train a Multinomial NB classifier on *training_set* and return the
    log-probability rows for the frames.

    NOTE(review): `frames` is not defined in this function -- presumably a
    module-level list; confirm.  *output_filepath* is accepted but unused.
    """
    vectorizer = CountVectorizer()
    train_counts = vectorizer.fit_transform(training_set.data)

    weighter = TfidfTransformer()
    train_tfidf = weighter.fit_transform(train_counts)

    classifier = MultinomialNB(alpha=1.0, fit_prior=False)
    classifier.fit(train_tfidf, training_set.target)

    # Re-use the fitted vocabulary and idf weights on the frames.
    frame_counts = vectorizer.transform(frames)
    frame_tfidf = weighter.transform(frame_counts)
    return classifier.predict_log_proba(frame_tfidf)
开发者ID:dmil,项目名称:politicalframing-old,代码行数:16,代码来源:findFrames.py

示例4: return_framing_datum

# 需要导入模块: from sklearn.naive_bayes import MultinomialNB [as 别名]
# 或者: from sklearn.naive_bayes.MultinomialNB import predict_log_proba [as 别名]
def return_framing_datum(training_set, frame):
    '''This is hacky- fix it later'''

    frames = [frame.word_string];

    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(training_set.data)
    
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    
    clf = MultinomialNB(alpha=1.0,fit_prior=False).fit(X_train_tfidf,training_set.target)
    
    X_new_counts = count_vect.transform(frames)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    predicted_logs = clf.predict_log_proba(X_new_tfidf)
    print predicted_logs
    return predicted_logs[0]
开发者ID:imclab,项目名称:politicalframing,代码行数:20,代码来源:Analyze.py

示例5: write_framing_data

# 需要导入模块: from sklearn.naive_bayes import MultinomialNB [as 别名]
# 或者: from sklearn.naive_bayes.MultinomialNB import predict_log_proba [as 别名]
def write_framing_data(training_set, output_filepath):
    """Train a Multinomial NB classifier on *training_set*, score the frames,
    and write the log-likelihoods to *output_filepath*.

    NOTE(review): `frames` and `frame_order` are not defined here --
    presumably module-level; confirm.
    """
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(training_set.data)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    clf = MultinomialNB(alpha=1.0,fit_prior=False).fit(X_train_tfidf,training_set.target)

    X_new_counts = count_vect.transform(frames)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    predicted_logs = clf.predict_log_proba(X_new_tfidf)

    # `with` guarantees the file is closed even if a write fails
    # (the previous open()/close() pair leaked the handle on error).
    with open(output_filepath, 'w') as f:
        f.write('Frame Names and Order:\n'+str(frame_order)+'\n\n')
        #f.write('Frames:\n'+str(frames)+'\n\n')
        f.write('Training Set:\n'+str(training_set.target_names)+'\n\n')
        f.write('Log-Likelihoods:\n'+str(predicted_logs)+'\n\n')

示例6: proc

# 需要导入模块: from sklearn.naive_bayes import MultinomialNB [as 别名]
# 或者: from sklearn.naive_bayes.MultinomialNB import predict_log_proba [as 别名]
def proc(X, y, ids, out_file):
  """Fit a MultinomialNB sentiment model on (X, y), re-score every training
  row, and write one JSON record per review to *out_file*.

  Also prints the training error count and the error ratio in percent.
  ids -- per-row metadata dicts (review/user/business ids, vote counts),
  aligned positionally with the rows of X.
  """
  #http://stackoverflow.com/questions/31421413/how-to-compute-precision-recall-accuracy-and-f1-score-for-the-multiclass-case
  logger.info("Sentiment Analysis")
  ret = dict()
  
  train = X.shape[0]
  pp( X.shape)
  Xtrain =  X
  ytrain =  y
  clf = MultinomialNB()
  clf.fit(Xtrain, ytrain)
  

  fo = open(out_file,'w')
  error = 0
  # NOTE(review): each `text` is assumed to be a single-row feature vector
  # produced by iterating Xtrain -- confirm X's type supports this.
  for i,text in enumerate(Xtrain):
    if i % PRINT_EVERY == 0:
      logger.info("Working on %d"%i)

    y_pred = clf.predict(text)
    y_pred_proba = clf.predict_proba(text)
    y_pred_log_proba = clf.predict_log_proba(text)
    # Metadata for this review, aligned by position with X's rows.
    review_id = ids[i]['rid']
    funny = ids[i]['funny']
    useful = ids[i]['useful']
    cool = ids[i]['cool']
    user_id = ids[i]['user_id']
    business_id = ids[i]['business_id']

    # Column index of the predicted class: labels appear to be 1-based,
    # clamped at 0 -- TODO confirm the label encoding.
    yip = y_pred[0] - 1
    if yip < 0:
      yip = 0

    # Count training-set misclassifications.
    if y[i] != y_pred[0]:
      error += 1
    #print review_id,y_pred[0],y_pred_proba[0][yip]
    line = json.dumps({'review_id':review_id, 'user_id':user_id, 'business_id':business_id, 'y':y[i],'y_pred':y_pred[0], 'y_pred_proba':y_pred_proba[0][yip], 'y_pred_log_proba':y_pred_log_proba[0][yip], 'funny':funny, 'useful':useful,'cool':cool})
    fo.write("%s\n"%line)

  fo.close()
  print "error:",error
  print "ratio:",100.0*float(error)/float(train)
  logger.info("generated file %s"%out_file)

示例7: test_mnnb

# 需要导入模块: from sklearn.naive_bayes import MultinomialNB [as 别名]
# 或者: from sklearn.naive_bayes.MultinomialNB import predict_log_proba [as 别名]
def test_mnnb():
    """Smoke-test MultinomialNB fit/predict on a toy dataset.

    Runs once on the dense toy matrix and once on its CSR form, and
    verifies that log(predict_proba) matches predict_log_proba.
    """
    for data in (X2, scipy.sparse.csr_matrix(X2)):
        # The learning set itself must be predicted perfectly.
        model = MultinomialNB()
        predictions = model.fit(data, y2).predict(data)

        assert_array_equal(predictions, y2)

        # Probabilities and log-probabilities must be consistent.
        probas = model.predict_proba(data)
        log_probas = model.predict_log_proba(data)
        assert_array_almost_equal(np.log(probas), log_probas, 8)
开发者ID:danohuiginn,项目名称:scikit-learn,代码行数:22,代码来源:test_naive_bayes.py

示例8: classifyFrames

# 需要导入模块: from sklearn.naive_bayes import MultinomialNB [as 别名]
# 或者: from sklearn.naive_bayes.MultinomialNB import predict_log_proba [as 别名]
def classifyFrames(frame_titles, frames):
    """Train a classifier on the 20 Newsgroups training data, feed the frames
    to the classifier, and write the results to a text file.

    Arguments:
        frame_titles -- a list containing a string title for each frame in the
            "frames" argument (unused here)
        frames -- a list containing each frame (space-delimited string of words)

    Side-Effects:
        writes "20newsgroupsclassifier.txt" to the current working directory

    NOTE(review): `frame_order` is not defined in this function -- presumably
    module-level; confirm.
    """
    #training_set = load_files('newsgroups',shuffle=True)
    from sklearn.datasets import fetch_20newsgroups
    training_set = fetch_20newsgroups(subset='train')

    #Learn the vocabulary of the dictionary and return a count vector
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(training_set.data)

    #use tf/idf to give low weight to very common words in training data
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    #train multinomial naive bayes classifier on 20newsgroups data
    clf = MultinomialNB(alpha=1.0,fit_prior=False).fit(X_train_tfidf,training_set.target)

    #vectorize and weight words in the frames
    X_new_counts = count_vect.transform(frames)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    predicted_logs = clf.predict_log_proba(X_new_tfidf)

    # `with` ensures the output file is closed even if a write fails
    # (the previous open()/close() pair leaked the handle on error).
    with open('20newsgroupsclassifier.txt','w') as f:
        f.write('Frame Names and Order:\n'+str(frame_order)+'\n\n')
        f.write('Frames:\n'+str(frames)+'\n\n')
        f.write('Training Set:\n'+str(training_set.target_names)+'\n\n')
        f.write('Log-Likelihoods:\n'+str(predicted_logs)+'\n\n')

示例9: train_test_split

# 需要导入模块: from sklearn.naive_bayes import MultinomialNB [as 别名]
# 或者: from sklearn.naive_bayes.MultinomialNB import predict_log_proba [as 别名]
xtrain, xtest, ytrain, ytest = train_test_split(X, Y)

clf = MultinomialNB(alpha=best_alpha).fit(xtrain, ytrain)

calibration_plot(clf, xtest, ytest)

training_accuracy = clf.score(xtrain, ytrain)
test_accuracy = clf.score(xtest, ytest)

print "Accuracy on trainnig data:  %0.2f" % training_accuracy
print "Accuraby on test data: %0.2f" % test_accuracy

words = np.array(vectorizer.get_feature_names())

x = np.eye(xtest.shape[1])
probs = clf.predict_log_proba(x)[:, 0]
ind = np.argsort(probs)

good_words = words[ind[:10]]
bad_words = words[ind[-10:]]

good_prob = probs[ind[:10]]
bad_prob = probs[ind[-10:]]

print "Good words\t      P(fresh | word)"
for w, p in zip(good_words, good_prob):
	print "%20s" % w, "%0.2f" % (1 - np.exp(p))

print "Bad words\t       P(fresh | word)"
for w, p in zip(bad_words, bad_prob):
	print "%20s" % w, "%0.2f" % (1 - np.exp(p))
开发者ID:HiroIshikawa,项目名称:data-analysis,代码行数:33,代码来源:bayesian.py

示例10: MultinomialNB

# 需要导入模块: from sklearn.naive_bayes import MultinomialNB [as 别名]
# 或者: from sklearn.naive_bayes.MultinomialNB import predict_log_proba [as 别名]
# Score each vocabulary word in isolation by classifying the identity matrix.
X_MNB_test = X_MNB_test.tocsc()

words = np.array(vectorizer.get_feature_names())

clf_Multinomial = MultinomialNB(alpha=0.1).fit(X_MNB, y_train)

print(X_MNB_test.shape[1])
print(X_MNB_test.shape[0])

from scipy.sparse import identity
# Build the identity matrix directly in CSR form.  The previous
# sparse.csr_matrix(identity(n).toarray()) round-trip materialized a dense
# n x n array (quadratic memory) only to re-sparsify it.
X_identity = identity(X_MNB_test.shape[1], format='csr')

# Column 0 is log P(class 0 | word), sorted ascending below.
probs = clf_Multinomial.predict_log_proba(X_identity)[:, 0]
ind = np.argsort(probs)

good_words = words[ind[:10]]
bad_words = words[ind[-10:]]

good_prob = probs[ind[:10]]
bad_prob = probs[ind[-10:]]

print("Good words\t     P(fresh | word)")
for w, p in zip(good_words, good_prob):
    print("{:>20}".format(w), "{:.2f}".format(1 - np.exp(p)))

print("Bad words\t     P(fresh | word)")
for w, p in zip(bad_words, bad_prob):
    print("{:>20}".format(w), "{:.2f}".format(1 - np.exp(p)))

示例11: CountVectorizer

# 需要导入模块: from sklearn.naive_bayes import MultinomialNB [as 别名]
# 或者: from sklearn.naive_bayes.MultinomialNB import predict_log_proba [as 别名]
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(training_set.data)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB(alpha=1.0,fit_prior=False).fit(X_train_tfidf,training_set.target)

testing_set = load_files('/home/dhrumil/Desktop/PoliticalFraming/data/testing',categories=categories,shuffle=True)
docs_test = testing_set.data
#X_new_counts = count_vect.transform(docs_test)
#X_new_tfidf = tfidf_transformer.transform(X_new_counts)

##
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
X_new_tfidf =  vectorizer.fit_transform(testing_set.data)
##

predicted = clf.predict(X_new_tfidf)
predicted_logs = clf.predict_log_proba(X_new_tfidf)

print np.mean(predicted == testing_set.target)

#f = open('classifyFiles.txt','w')
#f.write('The topics are in the following order:\n'+str(categories)+'\n\n')
#f.write('The predicted values for the training set:\n'+str(predicted)+'\n\n')
#f.write('The actual values for the training set:\n0 0 0 0 0 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4 5 5 5 5 5 6 6 6 6 6\n\n')
#f.write('Log-Likelihoods:\n'+str(predicted_logs)+'\n\n')
#f.close()
开发者ID:dmil,项目名称:politicalframing-old,代码行数:32,代码来源:classifySpeeches.py

示例12: __init__

# 需要导入模块: from sklearn.naive_bayes import MultinomialNB [as 别名]
# 或者: from sklearn.naive_bayes.MultinomialNB import predict_log_proba [as 别名]

#.........这里部分代码省略.........
        elif self.metric == 'acc':
            best_alpha = float(mean_valid_acc.idxmax())
        elif self.metric == 'mse':
            # yet another change; just take lowest MSE
            best_alpha = float(mean_valid_mse.idxmin())
        else:
            sys.exit('Metric ' + self.metric + ' not supported')
        self.set_alpha(best_alpha)
        self.trained = False
        return valid_f1_summary, best_alpha

    def get_coefs(self, row=0):
        """Return (feature_name, weight) pairs for the fitted model, or None
        for model types that expose no coefficients."""
        if self.model_type in ('default', 'SVM'):
            return None
        if self.model_type == 'myMNB':
            # Hand-rolled MNB keeps its weight vector on self.w.
            return zip(self.column_names, self.w)
        return zip(self.column_names, self.model.coef_[row])

    def predict(self, X):
        """Predict integer labels for the rows of X."""
        n, p = X.shape
        if self.model_type == 'default':
            # Constant baseline: always predict the majority/default label.
            return self.default * np.ones(n, dtype=int)
        if self.model_type == 'myMNB':
            # Linear decision rule on the hand-rolled weights and bias.
            return np.array((np.dot(X.toarray(), np.array(self.w)) + self.b) > 0, dtype=int)
        return self.model.predict(X)

    def predict_probs(self, X):
        """Return exp of the model's log-probabilities per row; for model
        types without probability estimates, returns exp of a ones column
        (preserved from the original behavior)."""
        n, p = X.shape

        if self.model_type in ('LR', 'MNB'):
            log_probs = self.model.predict_log_proba(X)
        else:
            log_probs = np.ones(shape=[n, 1])
        return np.exp(log_probs)

    def predict_max_probs(self, X):
        """Return, per row, exp of the highest class log-probability; for
        model types without probabilities, exp of a ones column
        (preserved from the original behavior)."""
        n, p = X.shape

        if self.model_type in ('LR', 'MNB'):
            # Row-wise maximum over the class log-probabilities.
            log_probs = np.max(self.model.predict_log_proba(X), axis=1)
        else:
            log_probs = np.ones(shape=[n, 1])
        return np.exp(log_probs)

    def get_nonconformity_scores(self, X, y):
        """Signed linear scores for LR models; zeros for every other
        model type."""
        n, p = X.shape

        if self.model_type != 'LR':
            return np.zeros(shape=[n, 1])

        scores = np.dot(X.toarray(), self.model.coef_[0]) + self.model.intercept_[0]
        # Flip the sign for the positive class (y in {0,1} maps to {-1,+1}).
        scores *= -(y*2-1)
        return scores

    def get_model_size(self):
        if self.model_type == 'default':
            size = 0
开发者ID:dallascard,项目名称:guac,代码行数:70,代码来源:sparse_model.py

示例13: __init__

# 需要导入模块: from sklearn.naive_bayes import MultinomialNB [as 别名]
# 或者: from sklearn.naive_bayes.MultinomialNB import predict_log_proba [as 别名]

#.........这里部分代码省略.........

            valid_f1_summary.loc[dev_fold] = valid_f1s

            if verbose > 1:
                print dev_fold, valid_f1s

        mean_valid_f1s = valid_f1_summary.mean(axis=0)
        best_alpha = float(mean_valid_f1s.idxmax())
        self.set_alpha(best_alpha)
        self.trained = False
        return valid_f1_summary, best_alpha

    def get_coefs(self):
        """Return (feature_name, weight) pairs for the fitted model, or None
        for model types that expose no coefficients."""
        if self.model_type in ('default', 'SVM'):
            return None
        if self.model_type == 'myMNB':
            # Hand-rolled MNB keeps its weight vector on self.w.
            return zip(self.column_names, self.w)
        return zip(self.column_names, self.model.coef_[0])

    def predict(self, X):
        """Predict labels for the rows of X (column vector for the default
        model, per the original behavior)."""
        n, p = X.shape
        if self.model_type == 'default':
            # Constant baseline: an [n, 1] column of the default label.
            return self.default * np.ones(shape=[n, 1], dtype=int)
        if self.model_type == 'myMNB':
            # Linear decision rule on the hand-rolled weights and bias.
            return np.array((np.dot(X.toarray(), np.array(self.w)) + self.b) > 0, dtype=int)
        return self.model.predict(X)

    def predict_p_y_eq_1(self, X):
        """Return log P(y = 1) per row for LR/MNB models; zeros otherwise."""
        n, p = X.shape
        if self.model_type in ('LR', 'MNB'):
            # Column 1 of predict_log_proba is the positive class.
            return self.model.predict_log_proba(X)[:, 1]
        return np.zeros(shape=[n, 1])

    def get_nonconformity_scores(self, X, y):
        """Signed linear scores for LR models; zeros for every other
        model type."""
        n, p = X.shape

        if self.model_type != 'LR':
            return np.zeros(shape=[n, 1])

        scores = np.dot(X.toarray(), self.model.coef_[0]) + self.model.intercept_[0]
        # Flip the sign for the positive class (y in {0,1} maps to {-1,+1}).
        scores *= -(y*2-1)
        return scores



    """
    def get_scores(self, X):
        n, p = X.shape
        if self.model_type == 'LR' or self.model_type == 'MNB':
            log_probs = self.model.predict_log_proba(X)
            scores = log_probs[:, 1] - log_probs[:, 0]
        else:
            scores = np.zeros(shape=[n, 1])
        return scores
    """

    def eval_f1_acc(self, X, y):
        predicted = self.predict(X)
        if np.isnan(predicted).any() or np.isnan(y).any():
开发者ID:brian-lau,项目名称:guac,代码行数:70,代码来源:sparse_model.py

示例14: enumerate

# 需要导入模块: from sklearn.naive_bayes import MultinomialNB [as 别名]
# 或者: from sklearn.naive_bayes.MultinomialNB import predict_log_proba [as 别名]
# Fit the classifier: one fit() for small data, otherwise incremental
# partial_fit over `chunk`-sized row slices.
m = X.shape[0]
if m < chunk:
    clf.fit(X, y)
else:
    # np.split at chunk boundaries yields index arrays of <= chunk rows each.
    for i, idx in enumerate(np.split(np.arange(m), xrange(chunk, m, chunk))):
        print('\t%s\tTraining %d chunk' % (datetime.now(), (i + 1)))
        clf.partial_fit(X[idx], y[idx], classes=list(categoryid_set))

# cv
print('*' * 80)
print('cross validating: ')
print('\t%s' % datetime.now())
# Build the cross-validation feature matrix from the concatenated text columns.
X_cv = vectorizer.transform(cv['prodname'] + " " + cv['navigation'] +
                            " " + cv['merchant'] + " " + cv['brand'])
y_true = cv['categoryid'].values
jll = clf.predict_log_proba(X_cv)  # joint likelihood
# Pick, per row, the class with the highest log-probability (ignoring NaNs).
y_pred = clf.classes_[np.nanargmax(jll, axis=1)]
max_proba = np.nanmax(jll, axis=1)


# trade off between acurry and recall
# search best decision boundry for each category
def search():
    print('*' * 80)
    print('Searching: ')
    with open("remove_id","r") as infile:
        stop_ids = [line.strip() for line in infile]
    boundary_of_category = dict()
    max_p_category = np.nanmax(jll, axis=0)  # max probability in each category
    min_p_category = np.nanmin(jll, axis=0)  # min probability in each category
    for categoryid in categoryid_set:
开发者ID:Jetpie,项目名称:product-labeling,代码行数:33,代码来源:model.py

示例15: handle

# 需要导入模块: from sklearn.naive_bayes import MultinomialNB [as 别名]
# 或者: from sklearn.naive_bayes.MultinomialNB import predict_log_proba [as 别名]

#.........这里部分代码省略.........
                    data_source=positive_data_source,
                    ids_to_filter=event_ids,
                    return_ids=False))

            unlabeled_and_spy_data_ids = chain(
                unlabeled_data_ids, VenyooDocumentUtility.webpage_generator(
                    data_source=positive_data_source,
                    ids_to_filter=event_ids,
                    return_ids=True))

            X_train = vectorizer.fit_transform(
                chain(
                    VenyooDocumentUtility.webpage_generator(
                        data_source=positive_data_source,
                        ids_to_exclude=event_ids),
                    unlabeled_and_spy_data))
            print 'X_train: ', repr(X_train), '\n'

            print 'Create y_train vector of target values (=classes)...'
            y_train = np.append(
                np.array(number_positive_without_spy_docs * [1]),
                np.array(number_unlabeled_and_spy_docs * [-1]))
            print 'y_train:', y_train.shape, '\n'

            print 'Create X_test matrix of token counts for testing...'
            unlabeled_data = CrawledWebpageUtility.webpage_generator(
                data_source=unlabeled_data_source,
                exclude_positives=positive_set_extension,
                filter_domains=rn_domains,
                exclude_hand_labeled_pages=is_hand_labeled_data)[0]

            unlabeled_and_spy_data = chain(
                unlabeled_data, VenyooDocumentUtility.webpage_generator(
                    data_source=positive_data_source,
                    ids_to_filter=event_ids,
                    return_ids=False))

            X_test = vectorizer.transform(unlabeled_and_spy_data)
            print 'X_test:', repr(X_test), '\n'

            print 'Create X_spy matrix to determine threshold t...'
            X_spy = X_test.asformat('csr')[-len(event_ids):]

            print 'Train Multinomial NB classifier...'
            classifier = MultinomialNB(alpha=0.1)
            classifier.fit(X=X_train, y=y_train)

            print 'Create log_probabilities for X_test...'
            X_test_log_proba = classifier.predict_log_proba(X_test)

            print 'Create log_probabilities for X_spy...'
            X_spy_log_proba = classifier.predict_log_proba(X_spy)

            print 'Determine probability threshold t...'

            if 0 < noise_level < 1:

                # Determine number of spy documents to ignore
                num_docs_to_ignore = int(noise_level * X_spy.shape[0])
                print num_docs_to_ignore, \
                    'spy documents were labeled as noise and will be ignored...'

                # Determine number of spy documents to consider for further calculation
                num_docs_to_consider = X_spy.shape[0] - num_docs_to_ignore

                # Create the fraction of documents and determine the threshold from it
                threshold = np.sort(X_spy_log_proba.T[1])[::-1][:num_docs_to_consider].min()

            else:
                threshold = X_spy_log_proba.T[1].min()
            print 'Threshold t =', threshold, '\n'

            print 'Determine reliable negatives...'
            reliable_negative_ids = set()

            for doc in X_test_log_proba:
                current_id = unlabeled_and_spy_data_ids.next()
                if doc[1] < threshold:
                    reliable_negative_ids.add(current_id)

            if not final_reliable_negative_ids:
                final_reliable_negative_ids = final_reliable_negative_ids | reliable_negative_ids
            else:
                final_reliable_negative_ids = final_reliable_negative_ids & reliable_negative_ids

            current_iteration += 1
            iterations -= 1

        # Reset all negative webpages back to 'Unlabeled'
        CrawledWebpage.objects.filter(
            is_spy_reliable_negative='N').update(
            is_spy_reliable_negative='-')

        print 'Label reliable negatives in database...'
        affected_pages = CrawledWebpage.objects.filter(
            id__in=final_reliable_negative_ids).update(
            is_spy_reliable_negative='N')

        print 'Done! Annotation of unlabeled data successful.'
        print affected_pages, 'documents have been annotated as reliable negatives.'
开发者ID:ShamanOfMath,项目名称:Event_Finder,代码行数:104,代码来源:labelreliablenegatives_spy.py


注:本文中的sklearn.naive_bayes.MultinomialNB.predict_log_proba方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。