This article collects typical usage examples of the Python class nltk.NaiveBayesClassifier. If you are wondering what NaiveBayesClassifier is for, or how to use it, the curated class code examples below may help.
The following presents 15 code examples of the NaiveBayesClassifier class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
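Before the collected examples, here is a minimal, self-contained sketch of the core API they all share: NaiveBayesClassifier.train takes a list of (feature-dict, label) pairs, classify predicts a label for a new feature dict, and classify.accuracy scores a labeled set. The toy data below is invented purely for illustration.

from nltk import NaiveBayesClassifier, classify

# Toy (feature-dict, label) pairs; each dict maps feature names to values
train_set = [
    ({'last_letter': 'a'}, 'female'),
    ({'last_letter': 'k'}, 'male'),
    ({'last_letter': 'e'}, 'female'),
    ({'last_letter': 'o'}, 'male'),
]
classifier = NaiveBayesClassifier.train(train_set)
print(classifier.classify({'last_letter': 'a'}))  # most likely label
print(classify.accuracy(classifier, train_set))   # accuracy on a labeled set
classifier.show_most_informative_features(2)      # top discriminative features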
Example 1: category_by_movie

def category_by_movie():
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    import random

    documents = [(list(mr.words(f)), c)
                 for c in mr.categories()
                 for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    # FreqDist.keys() is not sliceable in Python 3; take the 2000 most frequent words
    word_features = [w for w, _ in all_words.most_common(2000)]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    # Note: this reports accuracy on the training set, not the held-out test set
    print(classify.accuracy(classifier, train_set))
Example 2: train_nltk

def train_nltk(data, labels):
    '''
    Returns a trained nltk.NaiveBayesClassifier.

    Inputs
    ------
    data -- np.array of tuples
    '''
    # For now, shuffle and assume the post text alone is enough to measure
    # offensiveness; in the future this may not hold, since two users who are
    # both negative about the same thing may not offend each other.
    # Note: cv.KFold(n=..., n_folds=...) is the old sklearn.cross_validation
    # API; modern scikit-learn uses model_selection.KFold(n_splits=...).split(data).
    kf = cv.KFold(n=len(data), n_folds=N_FOLDS, shuffle=True)
    best_model = None
    max_acc = float('-inf')
    for k, (train_index, test_index) in enumerate(kf):
        X_train, Y_train = data[train_index], labels[train_index]
        X_test, Y_test = data[test_index], labels[test_index]
        features_train = bulk_extract_features(X_train)
        features_test = bulk_extract_features(X_test)
        # list() is needed in Python 3, where zip() returns an iterator
        train_set = list(zip(features_train, Y_train))
        test_set = list(zip(features_test, Y_test))
        model = nbc.train(train_set)
        acc = nltk.classify.accuracy(model, test_set)
        print(acc)
        if acc > max_acc:
            max_acc = acc
            best_model = model
    best_model.show_most_informative_features(30)
    return best_model
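The snippet above assumes that nbc is an alias for nltk.NaiveBayesClassifier and that bulk_extract_features is a project helper not shown here. A minimal sketch of such a helper, assuming plain bag-of-words features:

import nltk

def bulk_extract_features(posts):
    # Hypothetical helper: one bag-of-words feature dict per post
    return [{word.lower(): True for word in nltk.word_tokenize(post)}
            for post in posts]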
Example 3: nltk_model

def nltk_model():
    """Fits the (non-parametric) naive Bayes classifier from nltk on the names
    dataset."""
    # each element of all_names will be a (name, gender) tuple
    all_names = list()
    with open(MALE_FILE, "r") as f:
        for line in f:
            all_names.append((line.rstrip(), "male"))  # rstrip removes trailing whitespace
    with open(FEMALE_FILE, "r") as g:
        for line in g:
            all_names.append((line.rstrip(), "female"))

    # assert statements can be useful for debugging etc.
    assert len(all_names) == 7944

    # shuffle all_names in place
    random.shuffle(all_names)

    # features are ({'feature_type': feature_value}, gender) tuples
    features = [(nltk_featurize(name), gender) for name, gender in all_names]
    split_pt = int(TRAIN_PCT * len(features))
    train_set, test_set = features[:split_pt], features[split_pt:]

    nb = NaiveBayesClassifier.train(train_set)
    print("accuracy = {0} %".format(int(100 * nltk.classify.accuracy(nb, test_set))))
    nb.show_most_informative_features(10)
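nltk_featurize, MALE_FILE, FEMALE_FILE, and TRAIN_PCT are defined elsewhere in that module. A minimal sketch of the featurizer, assuming the classic last-letter feature from the NLTK book's gender example:

def nltk_featurize(name):
    # Hypothetical featurizer: the final letter is a strong gender signal
    return {'last_letter': name[-1].lower()}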
Example 4: test_raw_mail

def test_raw_mail(org_email):
    features_test = {}
    wordtokens_test = [word_limit.lemmatize(key.lower())
                       for key in word_tokenize(org_email)]
    for key in wordtokens_test:
        if key not in stpwords:
            features_test[key] = True
    return features_test

# Extract the features (tokenized, lemmatized, non-stopword tokens) from all the emails
feature_sets = [(raw_mail(n), g) for (n, g) in mail_shuffle]

# Split the whole email feature set into test (10%) and training (90%) sets
size_feature = int(len(feature_sets) * 0.10)
train_set, test_set = feature_sets[size_feature:], feature_sets[:size_feature]
classifier = NaiveBayesClassifier.train(train_set)

# Print the accuracy of the classifier
print('accuracy of the machine: ', classify.accuracy(classifier, test_set) * 100)
# Print the top 50 features
classifier.show_most_informative_features(50)
# Print the spam and ham labels
print('labels:', classifier.labels())

# Classify email text entered by the user
while True:
    featset = raw_mail(input("Enter text to classify: "))
    print(classifier.classify(featset))

Developer: Pooshan | Project: Project__spam-and-ham-detection-using-natural-language-processing | Lines: 32 | Source file: NLP-spam-ham.py
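raw_mail, word_limit, stpwords, and mail_shuffle come from parts of the project not shown here. Judging from test_raw_mail above, raw_mail is presumably the same featurizer applied to the training mail; a sketch under that assumption:

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

word_limit = WordNetLemmatizer()             # assumed lemmatizer instance
stpwords = set(stopwords.words('english'))   # assumed stopword list

def raw_mail(org_email):
    # Mirrors test_raw_mail: lemmatized, lowercased, non-stopword tokens
    return {token: True
            for token in (word_limit.lemmatize(w.lower())
                          for w in word_tokenize(org_email))
            if token not in stpwords}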
Example 5: train
def train(self, training_corpus):
    assert isinstance(training_corpus, (list, tuple))
    assert isinstance(training_corpus[0], dict)
    featureset = [(twit_features(i["text"]), i["polarity"])
                  for i in training_corpus
                  if i["denied"] == 0]
    self.classifier = NaiveBayesClassifier.train(featureset)
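twit_features is a module-level helper not shown in this excerpt. A minimal sketch, assuming simple word-presence features over the tweet text:

def twit_features(text):
    # Hypothetical tweet featurizer: word-presence features
    return {word.lower(): True for word in text.split()}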
Example 6: train

def train(self):
    """Train a classifier that maps noun-term presence to Subject tags."""
    catalog = getToolByName(self, "portal_catalog")
    presentNouns = dict()
    trainingData = []
    allNouns = catalog.uniqueValuesFor("noun_terms")
    for item in allNouns:
        presentNouns.setdefault(item, 0)

    subjectIndex = catalog._catalog.getIndex("Subject")
    nounTermsIndex = catalog._catalog.getIndex("noun_terms")
    # The internal catalog ids of the objects
    # that have noun terms in the catalog
    nounTermIndexIds = IISet(nounTermsIndex._unindex.keys())
    # The internal catalog ids of the objects
    # that have subjects in the catalog
    subjectIndexIds = IISet(subjectIndex._unindex.keys())
    commonIds = intersection(subjectIndexIds, nounTermIndexIds)

    for cid in commonIds:
        nounPresence = presentNouns.copy()
        nouns = nounTermsIndex._unindex[cid]
        tags = subjectIndex._unindex[cid]
        for noun in nouns:
            nounPresence[noun] = 1
        for tag in tags:
            trainingData.append((nounPresence, tag))

    if trainingData:
        self.classifier = NaiveBayesClassifier.train(trainingData)
Example 7: category_by_pos

def category_by_pos():
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        # FreqDist.inc() was removed in NLTK 3; increment counts directly
        suffix_fdist[word[-1:]] += 1
        suffix_fdist[word[-2:]] += 1
        suffix_fdist[word[-3:]] += 1
    # keys() is unordered and not sliceable in Python 3; use most_common()
    common_suffixes = [s for s, _ in suffix_fdist.most_common(100)]

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    # classifier = DecisionTreeClassifier.train(train_set)
    # print('Decision Tree %f' % classify.accuracy(classifier, test_set))
    classifier = NaiveBayesClassifier.train(train_set)
    print('NaiveBay %f' % classify.accuracy(classifier, test_set))
Example 8: get_sentiment_data

def get_sentiment_data(query, training_set):
    train = []
    with open('training/' + training_set + '/training.txt') as f:
        for line in f:
            temp = line.split('\t')
            train.append((get_features(temp[1]), temp[0]))
    clf = NaiveBayesClassifier.train(train)

    # Group the classified tweets by creation time
    tweets = grab_tweets(query)
    classified = {}
    for tweet in tweets:
        label = clf.classify(get_features(tweet.text))
        if tweet.created_at in classified:
            classified[tweet.created_at].append(label)
        else:
            classified[tweet.created_at] = [label]
    print(classified)

    # Per timestamp, report the fraction of tweets classified positive ('1');
    # a (positive, negative) count tuple or a rounded percentage would also work
    returndata = {}
    for key in classified:
        returndata[key] = float(sum([1 if v == '1' else 0 for v in classified[key]])) / len(classified[key])
    print(returndata)
    return returndata
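get_features and grab_tweets are project helpers not shown (grab_tweets presumably queries the Twitter API). A minimal sketch of get_features, assuming bag-of-words features:

from nltk import word_tokenize

def get_features(text):
    # Hypothetical featurizer: word-presence features for a tweet
    return {word.lower(): True for word in word_tokenize(text)}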
Example 9: get_matrix

def get_matrix(spam_set, ham_set, num_folds):
    '''
    Compute precision, recall, F1, and per-class accuracy, averaged over K folds.
    In this data set spam is labeled 0 and ham is labeled 1.
    '''
    total_precision = total_recall = F1 = spam_accuracy = ham_accuracy = 0
    for train_set, test_spam_set, test_ham_set in utils.get_kfold_data(spam_set, ham_set, num_folds):
        classifier = NaiveBayesClassifier.train(train_set)
        true_positive = false_positive = true_negative = false_negative = 0
        for test in test_spam_set:
            features = test[0]
            predicted_label = classifier.classify(features)
            if predicted_label == 0:
                true_positive += 1
            else:
                false_negative += 1
        for test in test_ham_set:
            features = test[0]
            predicted_label = classifier.classify(features)
            if predicted_label == 1:
                true_negative += 1
            else:
                false_positive += 1
        precision = true_positive / float(true_positive + false_positive)
        recall = true_positive / float(true_positive + false_negative)
        F1 += (2 * precision * recall) / (precision + recall)
        spam_accuracy += true_positive / float(true_positive + false_negative)
        ham_accuracy += true_negative / float(true_negative + false_positive)
        total_precision += precision
        total_recall += recall
    return (total_precision / num_folds, total_recall / num_folds, F1 / num_folds,
            spam_accuracy * 100 / num_folds, ham_accuracy * 100 / num_folds)
Example 10: check_classifier

def check_classifier(feature_extractor, **kwargs):
    '''
    Train the classifier on the training spam and ham, then check its accuracy
    on the test data, and show the classifier's most informative features.
    '''
    # Make training and testing sets of (features, label) data
    train_set, test_spam, test_ham = \
        make_train_test_sets(feature_extractor, **kwargs)

    # Train the classifier on the training set, then measure its accuracy
    # separately on the test spam emails and the test ham emails
    classifier = NaiveBayesClassifier.train(train_set)
    spam_accuracy = nltk.classify.accuracy(classifier, test_spam)
    ham_accuracy = nltk.classify.accuracy(classifier, test_ham)

    # How accurate is the classifier on the test sets?
    print('Test Spam accuracy: {0:.2f}%'.format(100 * spam_accuracy))
    print('Test Ham accuracy: {0:.2f}%'.format(100 * ham_accuracy))

    # Show the top 20 informative features (the method prints directly and
    # returns None, so wrapping it in print() would emit a stray "None")
    classifier.show_most_informative_features(20)
Example 11: __init_naive_bayes

def __init_naive_bayes(self):
    """
    Create and train the NaiveBayes Classifier.
    """
    try:
        # The corpus number (1-3) could be prompted for; hard-coded here
        corpus = 'corpus2'
        path = os.path.join('corpora/', corpus)
        spam_path = os.path.join(path, 'spam')
        ham_path = os.path.join(path, 'ham')
        spam_dir = os.listdir(spam_path)
        ham_dir = os.listdir(ham_path)
        train_spam_filelist = [os.path.join(spam_path, f) for f in spam_dir]
        train_ham_filelist = [os.path.join(ham_path, f) for f in ham_dir]
        spam_size = len(train_spam_filelist)
        ham_size = len(train_ham_filelist)
        train_spam_set = self.__make_featured_set(train_spam_filelist, 'spam')
        train_ham_set = self.__make_featured_set(train_ham_filelist, 'ham')
        train_set = train_spam_set + train_ham_set
        self.classifier = NaiveBayesClassifier.train(train_set)
    except Exception:
        # Re-raise with the offending file name and line number for debugging
        # (the original used exc_info()[1].message, which no longer exists in Python 3)
        exc_type, exc_value, exc_tb = sys.exc_info()
        raise Exception("Unexpected error in SpamFilter: __init_naive_bayes:",
                        exc_type.__name__,
                        os.path.basename(exc_tb.tb_frame.f_code.co_filename),
                        exc_tb.tb_lineno,
                        str(exc_value))
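__make_featured_set is another method of the same class, not shown on this page. A speculative sketch, assuming it reads each email file and pairs bag-of-words features with the given label:

import nltk

def __make_featured_set(self, filelist, label):
    # Hypothetical: one (feature_dict, label) pair per email file
    featured = []
    for fname in filelist:
        with open(fname, errors='ignore') as fp:
            words = nltk.word_tokenize(fp.read())
        featured.append(({w.lower(): True for w in words}, label))
    return featured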
Example 12: train_classifiers
def train_classifiers(self):
    for word in self.senses:
        train_set = []
        for senseId in self.senses[word]:
            for lsa_vector in self.senses[word][senseId]:
                train_set.append([dict(lsa_vector), senseId])
        self.classifiers[word] = NaiveBayesClassifier.train(train_set)
Example 13: buildclassifiers

def buildclassifiers(featureslist, SAMPLE_PROPORTION, n):
    classnames = ['Naive Bayes', 'Logistic Regression', 'Linear SVC']
    allclassifiers = []
    for name in classnames:
        for i in range(n):
            random.shuffle(featureslist)
            train_set, test_set = buildsets(featureslist, SAMPLE_PROPORTION)
            if name == 'Naive Bayes':
                spamclassifier = NaiveBayesClassifier.train(train_set)
            if name == 'Logistic Regression':
                spamclassifier = SklearnClassifier(LogisticRegression())
                spamclassifier.train(train_set)
            if name == 'Linear SVC':
                spamclassifier = SklearnClassifier(LinearSVC(C=0.01))
                spamclassifier.train(train_set)
            perfmeasures_i = evaluate(train_set, test_set, spamclassifier, name)
            if i == 0:
                perfmeasures_n = perfmeasures_i
            else:
                # operator.add; list() is needed in Python 3, where map() is lazy
                perfmeasures_n = list(map(add, perfmeasures_n, perfmeasures_i))
        # Store last classifier built per model
        allclassifiers.append(spamclassifier)
        # Print performance measures per classifier
        printperformance(name, perfmeasures_n, n)
    return allclassifiers
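buildsets, evaluate, and printperformance are project helpers not shown. A plausible sketch of buildsets, assuming SAMPLE_PROPORTION is the training fraction of the (already shuffled) feature list:

def buildsets(featureslist, sample_proportion):
    # Hypothetical: split into training and test portions
    split = int(len(featureslist) * sample_proportion)
    return featureslist[:split], featureslist[split:]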
Example 14: __init__

def __init__(self, **kwargs):
    super(TimeLogicAdapter, self).__init__(**kwargs)
    from nltk import NaiveBayesClassifier

    self.positive = [
        'what time is it',
        'do you know the time',
        'do you know what time it is',
        'what is the time'
    ]

    self.negative = [
        'it is time to go to sleep',
        'what is your favorite color',
        'i had a great time',
        'what is'
    ]

    # Label negative examples 0 and positive examples 1
    labeled_data = (
        [(text, 0) for text in self.negative] +
        [(text, 1) for text in self.positive]
    )

    # train_set = apply_features(self.time_question_features, labeled_data)
    train_set = [(self.time_question_features(text), label)
                 for (text, label) in labeled_data]
    self.classifier = NaiveBayesClassifier.train(train_set)
Example 15: __init__

def __init__(self, chatbot, **kwargs):
    super().__init__(chatbot, **kwargs)
    from nltk import NaiveBayesClassifier

    self.positive = kwargs.get('positive', [
        'what time is it',
        'hey what time is it',
        'do you have the time',
        'do you know the time',
        'do you know what time it is',
        'what is the time'
    ])

    self.negative = kwargs.get('negative', [
        'it is time to go to sleep',
        'what is your favorite color',
        'i had a great time',
        'thyme is my favorite herb',
        'do you have time to look at my essay',
        # The original was missing this comma, silently concatenating the next
        # two strings into a single training example
        'how do you have the time to do all this',
        'what is it'
    ])

    labeled_data = (
        [(name, 0) for name in self.negative] +
        [(name, 1) for name in self.positive]
    )

    train_set = [
        (self.time_question_features(text), n) for (text, n) in labeled_data
    ]
    self.classifier = NaiveBayesClassifier.train(train_set)
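Both adapter examples assume a time_question_features method defined elsewhere on the class. A minimal sketch, assuming simple word-presence features over a small indicator vocabulary (hypothetical, not necessarily the library's exact implementation):

def time_question_features(self, text):
    # Hypothetical featurizer: flag which indicator words appear in the text
    words = text.lower().split()
    return {'contains({0})'.format(w): (w in words)
            for w in ('what', 'time', 'it', 'do', 'know', 'is')}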