

Python SklearnClassifier.batch_classify Method Code Examples

This article collects typical usage examples of the Python method nltk.classify.scikitlearn.SklearnClassifier.batch_classify. If you have been wondering what SklearnClassifier.batch_classify does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of its containing class, nltk.classify.scikitlearn.SklearnClassifier.


The following presents 10 code examples of the SklearnClassifier.batch_classify method, sorted by popularity by default.
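Before diving into the examples, here is a minimal, self-contained sketch of the pattern they all share: wrap a scikit-learn estimator in SklearnClassifier, train it on (feature_dict, label) pairs, and classify a whole list of feature dicts in one call. Note that batch_classify only exists in older NLTK releases; NLTK 3.0 renamed it classify_many, so swap that in on newer versions. The toy feature dicts and labels below are invented for illustration.

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC

# Toy training data: each item is (feature_dict, label).
train_set = [
    ({"good": True, "movie": True}, "pos"),
    ({"bad": True, "movie": True}, "neg"),
    ({"great": True, "acting": True}, "pos"),
    ({"boring": True, "plot": True}, "neg"),
]

# Wrap any scikit-learn estimator in NLTK's adapter and train it.
classifier = SklearnClassifier(LinearSVC()).train(train_set)

# Classify several feature dicts at once (use classify_many on NLTK >= 3.0).
test_feats = [{"good": True, "acting": True}, {"boring": True, "movie": True}]
print classifier.batch_classify(test_feats)  # e.g. ['pos', 'neg']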

Example 1: SVMTweetClassifier

# Required module import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as alias]
class SVMTweetClassifier(TweetClassifier):
    """
    An SVM tweet classifier. Tweets are tokenized and POS-tagged, then converted to bag-of-words feature dictionaries.
    The preprocessed tweets are then handled by scikit-learn's SVC through NLTK's SklearnClassifier wrapper.
    """
    def __init__(self, trainfile=None, datafile=None, outfile=None):
        super(SVMTweetClassifier, self).__init__(trainfile, datafile, outfile)
        self.dictionary = SimpleDict()
        self.scores = {}
        self.stemmer = PorterStemmer()

    def getFeatures(self, tweet):
        """
        Override this method to select features other than the bag-of-words representation of the whole tweet.
        This is probably the piece of code we should work on most, since the features largely decide whether the classifier is good or bad.
        """
        return self.getFeatures2(tweet)
    # An earlier bag-of-words variant, kept commented out for reference:
    #tokens = string.lower(tweet.tweet.translate(string.maketrans("",""), string.punctuation)).split(" ")
    #tokens = [self.stemmer.stem(token) for token in tokens]
    #tokens = [token for token in tokens if not token[0:4] == "http"] #remove links
    #for stop in STOPWORDS:
    #    if stop in tokens:
    #        tokens.remove(stop)
    #return self.dictionary.doc2bow(tokens, True)

    def getFeatures2(self, tweet):
        """
        POS-tag the tweet and keep only nouns, adjectives, verbs and adverbs.
        """
        text = nltk.word_tokenize(tweet.tweet)
        return self.dictionary.doc2bow([pos for pos in nltk.pos_tag(text) if pos[1] in ["NN","JJ","JJR","JJS","VBD","VBG","VBN" ,"VBP","VBZ" ,"RB"] ])

    def train(self,  trainfile=None):
        self.readTrainingData((trainfile or self.trainfile))
        print "getting features.."
        # the classifier expects a list of (feature_set, label) elements, where each feature_set is a dictionary of {feature_name: value, ...} mappings
        train_set = [(self.getFeatures(tweet), tweet.sentiment) for tweet in self.trainingTweets]
        print train_set
        print "training SVM classifier"
        self.classifier = SklearnClassifier(SVC(), sparse=True).train(train_set)

    def classifyTweets(self, datafile=None, outfile=None):
        print "reading dataset"
        self.readDataset(datafile)

        print "classifying Tweets with SVM classifier"

        # batch_classify returns one label per feature set. Using prob_classify instead would let us keep the scores, judge the certainty of each prediction, and adjust low-confidence sentiments later.
        res = self.classifier.batch_classify([self.getFeatures(tweet) for tweet in self.evalTweets])
        print "assigning sentiments"
        for idx, tweet in enumerate(self.evalTweets):
            tweet.sentiment = res[idx]

        #self.scores[(tweet.id1,tweet.id2)] = res
        #tweet.sentiment = res.max()

        self.writeResults(outfile)
Developer: phdowling | Project: CompLingApplications | Lines: 59 | Source: TweetClassifier.py
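A hypothetical driver for Example 1, assuming the TweetClassifier base class supplies the readTrainingData, readDataset and writeResults methods used above; the file names are placeholders, not from the original project:

# Hypothetical usage; the paths are placeholders.
clf = SVMTweetClassifier(trainfile="train.tsv", datafile="tweets.tsv", outfile="out.tsv")
clf.train()           # read training tweets, extract features, fit the SVC
clf.classifyTweets()  # classify the evaluation tweets and write the results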

Example 2: getAccuracy

# Required module import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as alias]
	def getAccuracy(self, classifier):
		classifier = SklearnClassifier(classifier)
		accuracy = 0
		for fold in range(0, self.n_fold):
			log(str(fold+1) + " iteration...")
			log("    Partitioning...")
			datacv = self.getCrossValidationData(self.tweets, fold)
			traincv = datacv[0]
			testcv = datacv[1]
			testlabel = datacv[2]
			log("    Training...")
			classifier.train(traincv)
			log("    Classifying...")
			label_pred = classifier.batch_classify(testcv)
			tempScore = accuracy_score(testlabel, label_pred)
			log("    Accuracy for this iteration: " + str(tempScore))
			accuracy += tempScore
		return accuracy/self.n_fold
Developer: zlmoment | Project: Tweet-Sentiment-Classification | Lines: 20 | Source: classify.py
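A sketch of how getAccuracy might be invoked to compare estimators, assuming the enclosing class (not shown in this excerpt) loads self.tweets and sets self.n_fold:

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Hypothetical: cv is an instance of the class that defines getAccuracy above.
print "MultinomialNB: " + str(cv.getAccuracy(MultinomialNB()))
print "LinearSVC:     " + str(cv.getAccuracy(LinearSVC()))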

Example 3: score

# Required module import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as alias]
def score(classifier):
    classifier = SklearnClassifier(classifier)
    classifier.train(trainset)

    pred = classifier.batch_classify(test)
    return accuracy_score(tag_test, pred)
Developer: 0rchard | Project: Review-Helpfulness-Prediction | Lines: 8 | Source: store+sentiment+classifier.py
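Because score() takes the estimator as a parameter, it is easy to benchmark several classifiers on the same split; a sketch assuming trainset, test and tag_test are built earlier in the script (they are not shown in this excerpt):

from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC

print "MultinomialNB accuracy: " + str(score(MultinomialNB()))
print "BernoulliNB accuracy:   " + str(score(BernoulliNB()))
print "LinearSVC accuracy:     " + str(score(LinearSVC()))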

Example 4: range

# Required module import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as alias]
pred_NB=cf.batch_classify(test_feat)
#results=[cf.classify(test[a][0]) for a in range(size)]
#gold=[test[a][1] for a in range(size)]
cm_NB=nltk.ConfusionMatrix(test_tag,pred_NB)
print cm_NB.pp(sort_by_count=True, show_percents=False, truncate=10)

#create structures for classification
test_doc=[a[0] for a in test]

#build, train, and test classifiers
from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier
sv=SklearnClassifier(LinearSVC())
sv.train(train)
#note that train performance matches tmp.sum()
pred_train_sv=sv.batch_classify(train_feat)
nltk.ConfusionMatrix(train_tag,pred_train_sv)
#also test performance matches tmp_test.sum()
pred_sv=sv.batch_classify(test_feat)
#confusion matrices
cmsv=nltk.ConfusionMatrix(test_tag,pred_sv)
print cmsv.pp(sort_by_count=True, show_percents=False, truncate=5)
#some SklearnClassifier internals
featsets, labs = zip(*train)
X = sv._convert(featsets)
import numpy
y=numpy.array([sv._label_index[l] for l in labs])
#then to train one would use sv._clf.fit(X,y)

#-------------------------------------
#To vectorize/classify all in sklearn
Developer: akhil137 | Project: nlp-tagging | Lines: 33 | Source: ipyTxtClassSetEnv.py
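Following the closing comment above, here is one way to run the same vectorize/train/predict cycle entirely in scikit-learn, without the NLTK wrapper. It assumes train and test are the (feature_dict, label) lists used throughout this example and that the feature names are strings, as DictVectorizer requires:

from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC

featsets, labs = zip(*train)
vec = DictVectorizer(sparse=True)
X = vec.fit_transform(featsets)   # feature dicts -> sparse matrix

clf = LinearSVC()
clf.fit(X, labs)                  # sklearn accepts string labels directly

eval_feats, eval_tags = zip(*test)
pred = clf.predict(vec.transform(eval_feats))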

Example 5: evaluate

# Required module import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as alias]

#......... part of the code omitted here .........

        # if tt == 0:
        #     print 'processing train'

        # else:
        #     print 'processing test'

        # for each question in the split
        for qid in split:

            q = split[qid]
            ave = zeros((d, 1))
            words = zeros((d, 1))
            bow = []
            count = 0.0
            curr_ave = None
            curr_words = None

            # for each sentence in the question, generate features
            for i in range(0, len(q)):

                try:
                    tree = q[i]
                except:
                    continue

                curr_feats = {}
                if rnn_feats:
                    forward_prop(None, params, tree, d, labels=False)

                    # features: average of hidden representations and average of word embeddings
                    for ex, node in enumerate(tree.get_nodes()):

                        if node.word not in stop:
                            ave += node.p_norm
                            words += node.vec
                            count += 1.0

                    if count > 0:
                        curr_ave = ave / count
                        curr_words = words / count

                    featvec = concatenate([curr_ave.flatten(), curr_words.flatten()])

                    # add QANTA's features to the current feature set
                    for dim, val in ndenumerate(featvec):
                        curr_feats["__" + str(dim)] = val

                # add unigram indicator features to the current feature set
                if bow_feats:
                    bow += [l.word for l in tree.get_nodes()]
                    for word in bow:
                        curr_feats[word] = 1.0

                # add dependency relation indicator features to the current feature set
                if rel_feats:
                    for l in tree.get_nodes():
                        if len(l.parent) > 0:
                            par, rel = l.parent[0]
                            this_rel = l.word + "__" + rel + "__" + tree.get(par).word
                            curr_feats[this_rel] = 1.0

                if tt == 0:
                    train_feats.append((curr_feats, tree.ans.lower()))

                else:
                    test_feats.append((curr_feats, tree.ans.lower()))
                    test_ord.append(tree)

    # print 'total training instances:', len(train_feats)
    # print 'total testing instances:', len(test_feats)

    # can modify this classifier / do grid search on regularization parameter using sklearn
    classifier = SklearnClassifier(LogisticRegression(C=10))
    classifier.train(train_feats)

    print "accuracy train:", nltk.classify.util.accuracy(classifier, train_feats)
    print "accuracy test:", nltk.classify.util.accuracy(classifier, test_feats)
    print ""

    # finer-grained evaluation, see how well QANTA does at each sentence position
    pred = classifier.batch_classify([fs for (fs, l) in test_feats])

    count_dists = Counter()
    corr_dists = Counter()

    for ind, tree in enumerate(test_ord):
        curr_dist = tree.dist
        count_dists[curr_dist] += 1.0
        label = tree.ans
        if label == pred[ind]:
            corr_dists[curr_dist] += 1.0

    prob_dists = {}

    print "sentence position: correctly answered at that position, total sentences at that position,", "accuracy"

    for key in corr_dists:
        prob_dists[key] = corr_dists[key] / count_dists[key]
        print key, ": ", corr_dists[key], count_dists[key], prob_dists[key]
Developer: luoq | Project: qanta | Lines: 104 | Source: learn_classifiers.py
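Example 5's comment suggests grid-searching the regularization parameter; a minimal sketch using GridSearchCV on the vectorized features. The use of DictVectorizer is an assumption (SklearnClassifier applies one internally but does not expose grid search), and train_feats is the list built above:

from sklearn.feature_extraction import DictVectorizer
from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer releases
from sklearn.linear_model import LogisticRegression

feats, labels = zip(*train_feats)
X = DictVectorizer().fit_transform(feats)

# Search over the inverse regularization strength C.
grid = GridSearchCV(LogisticRegression(), {"C": [0.1, 1, 10, 100]}, cv=5)
grid.fit(X, labels)
print "best C:", grid.best_params_["C"]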

Example 6: __init__

# Required module import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as alias]

#......... part of the code omitted here .........
        #NaiveBayes classification
        self.NaiveBayesClassification(self.train_features, self.test_features)

        #Support vector machine - linear support vector classification
        self.SVMClassification(self.train_features, self.test_features)

        self.testing("75-25")

    def NaiveBayesClassification(self, train_features, test_features):
        # Training and finding accuracy of the NaiveBayes classifier
    
        #Training
        self.nb_classifier = NaiveBayesClassifier.train(train_features)

        #Testing
        #print '\n ACCURACY - NAIVE BAYES CLASSIFIER: %s \n' % (nltk.classify.util.accuracy(self.nb_classifier, test_features))
        #self.nb_classifier.show_most_informative_features()

    def SVMClassification(self, train_features, test_features):
        # Training and finding accuracy of the SVM Linear SVC classifier
        
        test_feat_list = []
        test_feat_labels_list = []        

        #Training
        self.svm_classifier = SklearnClassifier(LinearSVC()) 
        self.svm_classifier.train(train_features)
        
        #Testing
        for test_feat in test_features:
            test_feat_list.append(test_feat[0])
            test_feat_labels_list.append(test_feat[1])            

        svm_test = self.svm_classifier.batch_classify(test_feat_list)
        
        #print classification_report(test_feat_labels_list, svm_test, labels=['pos','neg'], target_names=['pos', 'neg'])

    def testing(self, iteration):
        #Finding precision, recall and F-measures for both classifiers

        #Naive Bayes classification
        print "NAIVE BAYES - ITERATION %s" % (iteration) 
        actual_pol_dict, predicted_pol_dict = self.get_actual_and_predicted_polarity_dict(self.nb_classifier)
        pos_precision, neg_precision = self.find_precision(actual_pol_dict, predicted_pol_dict)
        pos_recall, neg_recall = self.find_recall(actual_pol_dict, predicted_pol_dict)
        self.find_fmeasure(pos_precision, neg_precision, pos_recall, neg_recall)

    
        print "SVM - Linear SVC - ITERATION %s" % (iteration)
        #Support Vector Machine - Linear SVC classification
        actual_pol_dict, predicted_pol_dict = self.get_actual_and_predicted_polarity_dict(self.svm_classifier)
        pos_precision, neg_precision = self.find_precision(actual_pol_dict, predicted_pol_dict)
        pos_recall, neg_recall = self.find_recall(actual_pol_dict, predicted_pol_dict)
        self.find_fmeasure(pos_precision, neg_precision, pos_recall, neg_recall)
       

    def cross_validation(self):
        #10 fold cross validation for both classifiers

        pos_feats_count = len(self.selected_pos_feats)
        neg_feats_count = len(self.selected_neg_feats)
        
        pos_feats_fold_size = int(pos_feats_count / 10)
        neg_feats_fold_size = int(neg_feats_count / 10)

        for a in range(10):
Developer: albin-sayonetech | Project: SentimentAnalaysisOfMovierReviews | Lines: 70 | Source: movie_sentimental_analysis.py
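The find_precision, find_recall and find_fmeasure helpers are omitted from this excerpt. A sketch of the usual NLTK pattern they likely follow, where each polarity dict maps a label to the set of example indices carrying it (the dict-of-sets layout and the helper signature are assumptions):

import collections
import nltk.metrics

def build_polarity_dicts(classifier, test_features):
    # Map each label to the set of indices of examples that carry it.
    actual, predicted = collections.defaultdict(set), collections.defaultdict(set)
    for i, (feats, label) in enumerate(test_features):
        actual[label].add(i)
        predicted[classifier.classify(feats)].add(i)
    return actual, predicted

# Hypothetical helper matching the calls above.
def find_precision(actual_pol_dict, predicted_pol_dict):
    return (nltk.metrics.precision(actual_pol_dict['pos'], predicted_pol_dict['pos']),
            nltk.metrics.precision(actual_pol_dict['neg'], predicted_pol_dict['neg']))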

Example 7: MNBayes

# Required module import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as alias]

#......... part of the code omitted here .........
        feature_sets = self.getFeatures()
        p = nltk.classify.accuracy(self.classifier,feature_sets)
        return p
        
    """ Make sure that the algorithm works on training data using a k fold 
        cross validation scheme """
    def kfoldCrossValidation(self,k):
        feature_sets = self.getFeatures()
        total = 0
        for i in range(k):
            self.classifier = SklearnClassifier(MultinomialNB())
            n = len(feature_sets)/k
            # hold out the i-th fold for testing and train on the rest
            test_set = feature_sets[n*i:n*(i+1)]
            train_set = feature_sets[:n*i] + feature_sets[n*(i+1):]
            self.classifier.train(train_set)
            total += nltk.classify.accuracy(self.classifier,test_set)
        return total/k
    """ Make sure that the algorithm works on training data using a leave one out 
        cross validation scheme """
    def leave1OutCrossValidation(self):
        error = 0
        feature_sets = self.getFeatures()
        N = len(feature_sets)
        for i in range(N):
            self.classifier = SklearnClassifier(MultinomialNB())
            train_set1,test_set,train_set2 = feature_sets[:i],feature_sets[i],feature_sets[i+1:]
            train_set = train_set1+train_set2
            test_set = [test_set]
            self.classifier.train(train_set)  # train on everything except the held-out item
            p = nltk.classify.accuracy(self.classifier,test_set)
            error+=p
        return error/N
            
    """ Construct a learning curve to see if there is overfitting"""
    def learningCurve(self,numTrials=4):
        accuracies = []
        feature_sets = self.getFeatures()
        for k in xrange(1,len(feature_sets)-1):
            total = 0
            for i in xrange(numTrials):
                self.classifier = SklearnClassifier(MultinomialNB())
                random.shuffle(feature_sets)
                train_set,test_set = feature_sets[:k],feature_sets[k:]
                self.classifier.train(train_set)  # fit only on the k training items
                p = nltk.classify.accuracy(self.classifier,test_set)
                total+=p
            accuracies.append(total/numTrials)
        return accuracies

        

    """ Train on only k features and return training labels and predicted labels """
    def testClassify(self,k):
        feature_sets = self.getFeatures()
        random.shuffle(feature_sets)
        self.classifier = SklearnClassifier(MultinomialNB())
        
        self.classifier.train(feature_sets[k:])
        features,ref_labels = zip(*feature_sets[:k])
        pred_labels = self.classifier.classify_many(features)   
        return ref_labels,pred_labels
        
    """ nltk confusion matrix """
    def confusionMatrix(self,ref,test):
        ref.sort(key=lambda x: x[0])
        test.sort(key=lambda x: x[0])
        _,ref_labels = zip(*ref)
        _,test_labels = zip(*test)
        cm = ConfusionMatrix(ref_labels, test_labels)
        return cm
    
    """ Classifies proteins based on its text """
    def classify(self,db,fastain):
        proIDs,features,labels = [],[],[]
        prevFeatureset = ''
        prevText = ''
        for seq_record in SeqIO.parse(fastain, "fasta"):
            title = seq_record.id
            toks = title.split("|")
            proteinID = toks[5]
            query_rows = genbank.proteinQuery(proteinID,db)
            ids,text = zip(*query_rows)
            text = ''.join(map(str,text))
            if text=='': 
                label = ['na']
            else:
                text = word_reg.findall(text)
                featureset = self.gene_features(text)
                assert text!=prevText
                assert featureset!=prevFeatureset
                prevFeatureset = featureset
                prevText = text
                label = self.classifier.batch_classify([featureset])    
            
            proIDs.append(proteinID)  
            labels+=label
        return zip(proIDs,labels)
Developer: mortonjt | Project: Boa | Lines: 104 | Source: mnbayes.py
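The hand-rolled k-fold loop above can be cross-checked against scikit-learn's own cross-validation by bypassing the NLTK wrapper and vectorizing the same feature dicts directly; the use of DictVectorizer and the variable names are assumptions:

from sklearn.cross_validation import cross_val_score  # sklearn.model_selection in newer releases
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB

feats, labels = zip(*feature_sets)
X = DictVectorizer().fit_transform(feats)

# 10-fold cross-validated accuracy, computed by sklearn itself.
scores = cross_val_score(MultinomialNB(), X, list(labels), cv=10)
print "mean accuracy:", scores.mean()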

Example 8: word_feats

# Required module import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as alias]
# imports for the snippet below (the original file's header is not shown)
import itertools
import os

import numpy as np
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

DATA_PATH = '../../datasets/sentiment_analysis/en/rt-polaritydata'


def word_feats(words):
    # bag-of-words features; word_tokenize (not sent_tokenize) splits a line into tokens
    return dict([(word, True) for word in word_tokenize(words)])


add_label = lambda lst, lab: [(x, lab) for x in lst]

pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k=1000)),
                     ('nb', MultinomialNB())])
classifier = SklearnClassifier(pipeline)

pos = map(word_feats,
          open(os.path.join(DATA_PATH, 'rt-polarity.pos')).readlines())
neg = map(word_feats,
          open(os.path.join(DATA_PATH, 'rt-polarity.neg')).readlines())

features = zip(pos[:len(pos) / 2], itertools.repeat("pos")) + \
           zip(neg[:len(neg) / 2], itertools.repeat("neg"))
classifier.train(features)

l_pos = np.array(classifier.batch_classify(pos[len(pos) / 2:]))
l_neg = np.array(classifier.batch_classify(neg[len(neg) / 2:]))
print "Confusion matrix:\n%d\t%d\n%d\t%d" % (
    (l_pos == 'pos').sum(), (l_pos == 'neg').sum(),
    (l_neg == 'pos').sum(), (l_neg == 'neg').sum())
Developer: cscenter | Project: BuzzScore | Lines: 32 | Source: nltk_sample.py

Example 9: form

# Required module import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as alias]
# label set
cls_set = ['Emotion', 'ynQuestion', 'yAnswer', 'Continuer', 'whQuestion', 'System', 'Accept', 'Clarify', 'Emphasis',  'nAnswer', 'Greet', 'Statement', 'Reject', 'Bye', 'Other']
featuresets = [] # list of tuples of the form (post, features)
for post in posts: # applying the feature extractor to each post
	# post.get('class') is the label of the current post
	featuresets.append((dialogue_act_features(post.text),cls_set.index(post.get('class'))))

from random import shuffle
shuffle(featuresets)
size = int(len(featuresets) * .1) # 10% is used for the test set
train = featuresets[size:]
test = featuresets[:size]
print(train)

from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier
# SVM with a Linear Kernel and default parameters 
classif = SklearnClassifier(LinearSVC())
classif.train(train)

test_skl = []
t_test_skl = []
for d in test:
	test_skl.append(d[0])
	t_test_skl.append(d[1])

# run the classifier on the test set
p = classif.batch_classify(test_skl)
from sklearn.metrics import classification_report
# getting a full report
print(classification_report(t_test_skl, p, labels=list(set(t_test_skl)),target_names=cls_set))
Developer: Martbov | Project: scriptie | Lines: 33 | Source: multinomialtest.py

Example 10: RForests

# Required module import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as alias]
class RForests(text_classifier.TextClassifier):
    def __init__(self,trainDir,labelFile,numTrees=10,numJobs=1):
        self.classifier = None
        self.labelFile = labelFile
        self.trainingDir = trainDir
        self.labels = None
        self.all_words = None
        self.numTrees = numTrees
        self.numJobs = numJobs
        self.classifier = SklearnClassifier(RandomForestClassifier(
                                            n_estimators=self.numTrees,
                                            n_jobs=numJobs),sparse=False)
        #self.labels = training.setup(labelFile)
        #self.train()
    
    def train(self):
        feature_sets = self.getFeatures()
        self.classifier.train(feature_sets)
        
    """ Determines training error"""
    def trainingError(self):
        feature_sets = self.getFeatures()
        p = nltk.classify.accuracy(self.classifier,feature_sets)
        return p
        
    """ Make sure that the algorithm works on training data using a k fold 
        cross validation scheme """
    def kfoldCrossValidation(self,k):
        feature_sets = self.getFeatures()
        total = 0
        for i in range(k):
            self.classifier = SklearnClassifier(RandomForestClassifier(
                                                n_estimators=self.numTrees),sparse=False)
            n = len(feature_sets)/k
            # hold out the i-th fold for testing and train on the rest
            test_set = feature_sets[n*i:n*(i+1)]
            train_set = feature_sets[:n*i] + feature_sets[n*(i+1):]
            self.classifier.train(train_set)
            total += nltk.classify.accuracy(self.classifier,test_set)
        return total/k
    """ Make sure that the algorithm works on training data using a leave one out 
        cross validation scheme """
    def leave1OutCrossValidation(self):
        error = 0
        feature_sets = self.getFeatures()
        N = len(feature_sets)
        for i in range(N):
            self.classifier = SklearnClassifier(RandomForestClassifier(
                                                n_estimators=self.numTrees),sparse=False)
            train_set1,test_set,train_set2 = feature_sets[:i],feature_sets[i],feature_sets[i+1:]
            train_set = train_set1+train_set2
            test_set = [test_set]
            self.classifier.train(train_set)  # train on everything except the held-out item
            p = nltk.classify.accuracy(self.classifier,test_set)
            error+=p
        return error/N
            
    """ Construct a learning curve to see if there is overfitting"""
    def learningCurve(self,numTrials=4):
        accuracies = []
        feature_sets = self.getFeatures()
        for k in xrange(1,len(feature_sets)-1):
            total = 0
            for i in xrange(numTrials):
                self.classifier = SklearnClassifier(RandomForestClassifier(
                                                    n_estimators=self.numTrees),
                                                    sparse=False)
                random.shuffle(feature_sets)
                train_set,test_set = feature_sets[:k],feature_sets[k:]
                self.classifier.train(train_set)
                p = nltk.classify.accuracy(self.classifier,test_set)
                print len(train_set),len(test_set),p
                total+=p
            accuracies.append(total/numTrials)
        return accuracies
    
    """ Train on only k features and return training labels and predicted labels """
    def testClassify(self,k):
        feature_sets = self.getFeatures()
        random.shuffle(feature_sets)
        self.classifier = SklearnClassifier(RandomForestClassifier(
                                            n_estimators=self.numTrees),sparse=False)
        
        self.classifier.train(feature_sets[k:])
        features,ref_labels = zip(*feature_sets[:k])
        pred_labels = self.classifier.batch_classify(features)   
        return ref_labels,pred_labels
    
    """ nltk confusion matrix """
    def confusionMatrix(self,ref,test):
        ref.sort(key=lambda x: x[0])
        test.sort(key=lambda x: x[0])
        _,ref_labels = zip(*ref)
        _,test_labels = zip(*test)
        cm = ConfusionMatrix(ref_labels, test_labels)
        return cm

    def prob_classify(self,db,fastain):
#......... part of the code omitted here .........
Developer: mortonjt | Project: Boa | Lines: 103 | Source: rforests.py


Note: The nltk.classify.scikitlearn.SklearnClassifier.batch_classify examples in this article were compiled by 纯净天空 from GitHub, MSDocs and other open-source code and documentation platforms. The snippets are selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not reproduce without permission.