This article collects typical usage examples of the Python class nltk.NaiveBayesClassifier. If you are wondering what NaiveBayesClassifier is for, or how to use it, the curated class code examples below may help.
The following presents 15 code examples of the NaiveBayesClassifier class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
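Before the collected examples, here is a minimal, self-contained sketch of the core API they all share: NaiveBayesClassifier.train takes a list of (feature-dict, label) pairs, classify predicts a label for a new feature dict, and classify.accuracy scores a labeled set. The toy data below is invented purely for illustration.

from nltk import NaiveBayesClassifier, classify

# Toy (feature-dict, label) pairs; each dict maps feature names to values
train_set = [
    ({'last_letter': 'a'}, 'female'),
    ({'last_letter': 'k'}, 'male'),
    ({'last_letter': 'e'}, 'female'),
    ({'last_letter': 'o'}, 'male'),
]
classifier = NaiveBayesClassifier.train(train_set)
print(classifier.classify({'last_letter': 'a'}))  # most likely label
print(classify.accuracy(classifier, train_set))   # accuracy on a labeled set
classifier.show_most_informative_features(2)      # top discriminative features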
Example 1: category_by_movie

def category_by_movie():
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    import random

    documents = [(list(mr.words(f)), c)
                 for c in mr.categories()
                 for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    # FreqDist.keys() is not sliceable in Python 3; take the 2000 most frequent words
    word_features = [w for w, _ in all_words.most_common(2000)]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    # Note: this reports accuracy on the training set, not the held-out test set
    print(classify.accuracy(classifier, train_set))
Example 2: train_nltk

def train_nltk(data, labels):
    '''
    Returns a trained nltk.NaiveBayesClassifier.

    Inputs
    ------
    data -- np.array of tuples
    '''
    # For now, shuffle and assume the post text alone is enough to measure
    # offensiveness; in the future this may not hold, since two users who are
    # both negative about the same thing may not offend each other.
    # Note: cv.KFold(n=..., n_folds=...) is the old sklearn.cross_validation
    # API; modern scikit-learn uses model_selection.KFold(n_splits=...).split(data).
    kf = cv.KFold(n=len(data), n_folds=N_FOLDS, shuffle=True)
    best_model = None
    max_acc = float('-inf')
    for k, (train_index, test_index) in enumerate(kf):
        X_train, Y_train = data[train_index], labels[train_index]
        X_test, Y_test = data[test_index], labels[test_index]
        features_train = bulk_extract_features(X_train)
        features_test = bulk_extract_features(X_test)
        # list() is needed in Python 3, where zip() returns an iterator
        train_set = list(zip(features_train, Y_train))
        test_set = list(zip(features_test, Y_test))
        model = nbc.train(train_set)
        acc = nltk.classify.accuracy(model, test_set)
        print(acc)
        if acc > max_acc:
            max_acc = acc
            best_model = model
    best_model.show_most_informative_features(30)
    return best_model
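The snippet above assumes that nbc is an alias for nltk.NaiveBayesClassifier and that bulk_extract_features is a project helper not shown here. A minimal sketch of such a helper, assuming plain bag-of-words features:

import nltk

def bulk_extract_features(posts):
    # Hypothetical helper: one bag-of-words feature dict per post
    return [{word.lower(): True for word in nltk.word_tokenize(post)}
            for post in posts]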
Example 3: nltk_model

def nltk_model():
    """Fits the (non-parametric) naive Bayes classifier from nltk on the names
    dataset."""
    # each element of all_names will be a (name, gender) tuple
    all_names = list()
    with open(MALE_FILE, "r") as f:
        for line in f:
            all_names.append((line.rstrip(), "male"))  # rstrip removes trailing whitespace
    with open(FEMALE_FILE, "r") as g:
        for line in g:
            all_names.append((line.rstrip(), "female"))

    # assert statements can be useful for debugging etc.
    assert len(all_names) == 7944

    # shuffle all_names in place
    random.shuffle(all_names)

    # features are ({'feature_type': feature_value}, gender) tuples
    features = [(nltk_featurize(name), gender) for name, gender in all_names]
    split_pt = int(TRAIN_PCT * len(features))
    train_set, test_set = features[:split_pt], features[split_pt:]

    nb = NaiveBayesClassifier.train(train_set)
    print("accuracy = {0} %".format(int(100 * nltk.classify.accuracy(nb, test_set))))
    nb.show_most_informative_features(10)
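nltk_featurize, MALE_FILE, FEMALE_FILE, and TRAIN_PCT are defined elsewhere in that module. A minimal sketch of the featurizer, assuming the classic last-letter feature from the NLTK book's gender example:

def nltk_featurize(name):
    # Hypothetical featurizer: the final letter is a strong gender signal
    return {'last_letter': name[-1].lower()}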
Example 4: test_raw_mail

def test_raw_mail(org_email):
    features_test = {}
    wordtokens_test = [word_limit.lemmatize(key.lower())
                       for key in word_tokenize(org_email)]
    for key in wordtokens_test:
        if key not in stpwords:
            features_test[key] = True
    return features_test

# Extract the features (tokenized, lemmatized, non-stopword tokens) from all the emails
feature_sets = [(raw_mail(n), g) for (n, g) in mail_shuffle]

# Split the whole email feature set into test (10%) and training (90%) sets
size_feature = int(len(feature_sets) * 0.10)
train_set, test_set = feature_sets[size_feature:], feature_sets[:size_feature]
classifier = NaiveBayesClassifier.train(train_set)

# Print the accuracy of the classifier
print('accuracy of the machine: ', classify.accuracy(classifier, test_set) * 100)
# Print the top 50 features
classifier.show_most_informative_features(50)
# Print the spam and ham labels
print('labels:', classifier.labels())

# Classify email text entered by the user
while True:
    featset = raw_mail(input("Enter text to classify: "))
    print(classifier.classify(featset))

Developer: Pooshan | Project: Project__spam-and-ham-detection-using-natural-language-processing | Lines: 32 | Source file: NLP-spam-ham.py
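raw_mail, word_limit, stpwords, and mail_shuffle come from parts of the project not shown here. Judging from test_raw_mail above, raw_mail is presumably the same featurizer applied to the training mail; a sketch under that assumption:

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

word_limit = WordNetLemmatizer()             # assumed lemmatizer instance
stpwords = set(stopwords.words('english'))   # assumed stopword list

def raw_mail(org_email):
    # Mirrors test_raw_mail: lemmatized, lowercased, non-stopword tokens
    return {token: True
            for token in (word_limit.lemmatize(w.lower())
                          for w in word_tokenize(org_email))
            if token not in stpwords}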
Example 5: train
def train(self, training_corpus):
    assert isinstance(training_corpus, (list, tuple))
    assert isinstance(training_corpus[0], dict)
    featureset = [(twit_features(i["text"]), i["polarity"])
                  for i in training_corpus
                  if i["denied"] == 0]
    self.classifier = NaiveBayesClassifier.train(featureset)
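twit_features is a module-level helper not shown in this excerpt. A minimal sketch, assuming simple word-presence features over the tweet text:

def twit_features(text):
    # Hypothetical tweet featurizer: word-presence features
    return {word.lower(): True for word in text.split()}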
Example 6: train

def train(self):
    """Train a classifier that maps noun-term presence to Subject tags."""
    catalog = getToolByName(self, "portal_catalog")
    presentNouns = dict()
    trainingData = []
    allNouns = catalog.uniqueValuesFor("noun_terms")
    for item in allNouns:
        presentNouns.setdefault(item, 0)

    subjectIndex = catalog._catalog.getIndex("Subject")
    nounTermsIndex = catalog._catalog.getIndex("noun_terms")
    # The internal catalog ids of the objects
    # that have noun terms in the catalog
    nounTermIndexIds = IISet(nounTermsIndex._unindex.keys())
    # The internal catalog ids of the objects
    # that have subjects in the catalog
    subjectIndexIds = IISet(subjectIndex._unindex.keys())
    commonIds = intersection(subjectIndexIds, nounTermIndexIds)

    for cid in commonIds:
        nounPresence = presentNouns.copy()
        nouns = nounTermsIndex._unindex[cid]
        tags = subjectIndex._unindex[cid]
        for noun in nouns:
            nounPresence[noun] = 1
        for tag in tags:
            trainingData.append((nounPresence, tag))

    if trainingData:
        self.classifier = NaiveBayesClassifier.train(trainingData)
Example 7: category_by_pos

def category_by_pos():
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        # FreqDist.inc() was removed in NLTK 3; increment counts directly
        suffix_fdist[word[-1:]] += 1
        suffix_fdist[word[-2:]] += 1
        suffix_fdist[word[-3:]] += 1
    # keys() is unordered and not sliceable in Python 3; use most_common()
    common_suffixes = [s for s, _ in suffix_fdist.most_common(100)]

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    # classifier = DecisionTreeClassifier.train(train_set)
    # print('Decision Tree %f' % classify.accuracy(classifier, test_set))
    classifier = NaiveBayesClassifier.train(train_set)
    print('NaiveBay %f' % classify.accuracy(classifier, test_set))
Example 8: get_sentiment_data

def get_sentiment_data(query, training_set):
    train = []
    with open('training/' + training_set + '/training.txt') as f:
        for line in f:
            temp = line.split('\t')
            train.append((get_features(temp[1]), temp[0]))
    clf = NaiveBayesClassifier.train(train)

    # Group the classified tweets by creation time
    tweets = grab_tweets(query)
    classified = {}
    for tweet in tweets:
        label = clf.classify(get_features(tweet.text))
        if tweet.created_at in classified:
            classified[tweet.created_at].append(label)
        else:
            classified[tweet.created_at] = [label]
    print(classified)

    # Per timestamp, report the fraction of tweets classified positive ('1');
    # a (positive, negative) count tuple or a rounded percentage would also work
    returndata = {}
    for key in classified:
        returndata[key] = float(sum([1 if v == '1' else 0 for v in classified[key]])) / len(classified[key])
    print(returndata)
    return returndata
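get_features and grab_tweets are project helpers not shown (grab_tweets presumably queries the Twitter API). A minimal sketch of get_features, assuming bag-of-words features:

from nltk import word_tokenize

def get_features(text):
    # Hypothetical featurizer: word-presence features for a tweet
    return {word.lower(): True for word in word_tokenize(text)}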
Example 9: get_matrix

def get_matrix(spam_set, ham_set, num_folds):
    '''
    Compute precision, recall, F1, and per-class accuracy, averaged over K folds.
    In this data set spam is labeled 0 and ham is labeled 1.
    '''
    total_precision = total_recall = F1 = spam_accuracy = ham_accuracy = 0
    for train_set, test_spam_set, test_ham_set in utils.get_kfold_data(spam_set, ham_set, num_folds):
        classifier = NaiveBayesClassifier.train(train_set)
        true_positive = false_positive = true_negative = false_negative = 0
        for test in test_spam_set:
            features = test[0]
            predicted_label = classifier.classify(features)
            if predicted_label == 0:
                true_positive += 1
            else:
                false_negative += 1
        for test in test_ham_set:
            features = test[0]
            predicted_label = classifier.classify(features)
            if predicted_label == 1:
                true_negative += 1
            else:
                false_positive += 1
        precision = true_positive / float(true_positive + false_positive)
        recall = true_positive / float(true_positive + false_negative)
        F1 += (2 * precision * recall) / (precision + recall)
        spam_accuracy += true_positive / float(true_positive + false_negative)
        ham_accuracy += true_negative / float(true_negative + false_positive)
        total_precision += precision
        total_recall += recall
    return (total_precision / num_folds, total_recall / num_folds, F1 / num_folds,
            spam_accuracy * 100 / num_folds, ham_accuracy * 100 / num_folds)
Example 10: check_classifier

def check_classifier(feature_extractor, **kwargs):
    '''
    Train the classifier on the training spam and ham, then check its accuracy
    on the test data, and show the classifier's most informative features.
    '''
    # Make training and testing sets of (features, label) data
    train_set, test_spam, test_ham = \
        make_train_test_sets(feature_extractor, **kwargs)

    # Train the classifier on the training set, then measure its accuracy
    # separately on the test spam emails and the test ham emails
    classifier = NaiveBayesClassifier.train(train_set)
    spam_accuracy = nltk.classify.accuracy(classifier, test_spam)
    ham_accuracy = nltk.classify.accuracy(classifier, test_ham)

    # How accurate is the classifier on the test sets?
    print('Test Spam accuracy: {0:.2f}%'.format(100 * spam_accuracy))
    print('Test Ham accuracy: {0:.2f}%'.format(100 * ham_accuracy))

    # Show the top 20 informative features (the method prints directly and
    # returns None, so wrapping it in print() would emit a stray "None")
    classifier.show_most_informative_features(20)
Example 11: __init_naive_bayes

def __init_naive_bayes(self):
    """
    Create and train the NaiveBayes Classifier.
    """
    try:
        # The corpus number (1-3) could be prompted for; hard-coded here
        corpus = 'corpus2'
        path = os.path.join('corpora/', corpus)
        spam_path = os.path.join(path, 'spam')
        ham_path = os.path.join(path, 'ham')
        spam_dir = os.listdir(spam_path)
        ham_dir = os.listdir(ham_path)
        train_spam_filelist = [os.path.join(spam_path, f) for f in spam_dir]
        train_ham_filelist = [os.path.join(ham_path, f) for f in ham_dir]
        spam_size = len(train_spam_filelist)
        ham_size = len(train_ham_filelist)
        train_spam_set = self.__make_featured_set(train_spam_filelist, 'spam')
        train_ham_set = self.__make_featured_set(train_ham_filelist, 'ham')
        train_set = train_spam_set + train_ham_set
        self.classifier = NaiveBayesClassifier.train(train_set)
    except Exception:
        # Re-raise with the offending file name and line number for debugging
        # (the original used exc_info()[1].message, which no longer exists in Python 3)
        exc_type, exc_value, exc_tb = sys.exc_info()
        raise Exception("Unexpected error in SpamFilter: __init_naive_bayes:",
                        exc_type.__name__,
                        os.path.basename(exc_tb.tb_frame.f_code.co_filename),
                        exc_tb.tb_lineno,
                        str(exc_value))
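__make_featured_set is another method of the same class, not shown on this page. A speculative sketch, assuming it reads each email file and pairs bag-of-words features with the given label:

import nltk

def __make_featured_set(self, filelist, label):
    # Hypothetical: one (feature_dict, label) pair per email file
    featured = []
    for fname in filelist:
        with open(fname, errors='ignore') as fp:
            words = nltk.word_tokenize(fp.read())
        featured.append(({w.lower(): True for w in words}, label))
    return featured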
Example 12: train_classifiers
def train_classifiers(self):
    for word in self.senses:
        train_set = []
        for senseId in self.senses[word]:
            for lsa_vector in self.senses[word][senseId]:
                train_set.append([dict(lsa_vector), senseId])
        self.classifiers[word] = NaiveBayesClassifier.train(train_set)
Example 13: buildclassifiers

def buildclassifiers(featureslist, SAMPLE_PROPORTION, n):
    classnames = ['Naive Bayes', 'Logistic Regression', 'Linear SVC']
    allclassifiers = []
    for name in classnames:
        for i in range(n):
            random.shuffle(featureslist)
            train_set, test_set = buildsets(featureslist, SAMPLE_PROPORTION)
            if name == 'Naive Bayes':
                spamclassifier = NaiveBayesClassifier.train(train_set)
            if name == 'Logistic Regression':
                spamclassifier = SklearnClassifier(LogisticRegression())
                spamclassifier.train(train_set)
            if name == 'Linear SVC':
                spamclassifier = SklearnClassifier(LinearSVC(C=0.01))
                spamclassifier.train(train_set)
            perfmeasures_i = evaluate(train_set, test_set, spamclassifier, name)
            if i == 0:
                perfmeasures_n = perfmeasures_i
            else:
                # operator.add; list() is needed in Python 3, where map() is lazy
                perfmeasures_n = list(map(add, perfmeasures_n, perfmeasures_i))
        # Store last classifier built per model
        allclassifiers.append(spamclassifier)
        # Print performance measures per classifier
        printperformance(name, perfmeasures_n, n)
    return allclassifiers
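buildsets, evaluate, and printperformance are project helpers not shown. A plausible sketch of buildsets, assuming SAMPLE_PROPORTION is the training fraction of the (already shuffled) feature list:

def buildsets(featureslist, sample_proportion):
    # Hypothetical: split into training and test portions
    split = int(len(featureslist) * sample_proportion)
    return featureslist[:split], featureslist[split:]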
Example 14: __init__

def __init__(self, **kwargs):
    super(TimeLogicAdapter, self).__init__(**kwargs)
    from nltk import NaiveBayesClassifier

    self.positive = [
        'what time is it',
        'do you know the time',
        'do you know what time it is',
        'what is the time'
    ]

    self.negative = [
        'it is time to go to sleep',
        'what is your favorite color',
        'i had a great time',
        'what is'
    ]

    # Label negative examples 0 and positive examples 1
    labeled_data = (
        [(text, 0) for text in self.negative] +
        [(text, 1) for text in self.positive]
    )

    # train_set = apply_features(self.time_question_features, labeled_data)
    train_set = [(self.time_question_features(text), label)
                 for (text, label) in labeled_data]
    self.classifier = NaiveBayesClassifier.train(train_set)
Example 15: __init__

def __init__(self, chatbot, **kwargs):
    super().__init__(chatbot, **kwargs)
    from nltk import NaiveBayesClassifier

    self.positive = kwargs.get('positive', [
        'what time is it',
        'hey what time is it',
        'do you have the time',
        'do you know the time',
        'do you know what time it is',
        'what is the time'
    ])

    self.negative = kwargs.get('negative', [
        'it is time to go to sleep',
        'what is your favorite color',
        'i had a great time',
        'thyme is my favorite herb',
        'do you have time to look at my essay',
        # The original was missing this comma, silently concatenating the next
        # two strings into a single training example
        'how do you have the time to do all this',
        'what is it'
    ])

    labeled_data = (
        [(name, 0) for name in self.negative] +
        [(name, 1) for name in self.positive]
    )

    train_set = [
        (self.time_question_features(text), n) for (text, n) in labeled_data
    ]
    self.classifier = NaiveBayesClassifier.train(train_set)
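Both adapter examples assume a time_question_features method defined elsewhere on the class. A minimal sketch, assuming simple word-presence features over a small indicator vocabulary (hypothetical, not necessarily the library's exact implementation):

def time_question_features(self, text):
    # Hypothetical featurizer: flag which indicator words appear in the text
    words = text.lower().split()
    return {'contains({0})'.format(w): (w in words)
            for w in ('what', 'time', 'it', 'do', 'know', 'is')}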