

Python SklearnClassifier.classify Method Code Examples

This article collects typical code examples of the Python method nltk.classify.scikitlearn.SklearnClassifier.classify. If you are unsure what SklearnClassifier.classify does, how to call it, or want to see it used in context, the curated examples below should help. You can also explore further usage examples of the enclosing class, nltk.classify.scikitlearn.SklearnClassifier.


Below are 15 code examples of SklearnClassifier.classify, ordered by popularity.
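
Before the examples, here is a minimal, self-contained sketch of the pattern they all share: wrap any scikit-learn estimator in SklearnClassifier, train it on (feature-dict, label) pairs, then call classify on a single feature dict. The toy featuresets below are invented purely for illustration.

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB

train_feats = [
    ({"great": True, "fun": True}, "pos"),
    ({"awful": True, "boring": True}, "neg"),
]
classifier = SklearnClassifier(BernoulliNB())
classifier.train(train_feats)

print(classifier.classify({"fun": True}))            # one featureset -> one label
print(classifier.classify_many([{"awful": True}]))   # list of featuresets -> list of labels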

Example 1: __init__

# Required module: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
class SKClassifier:

    classifier = None

    def __init__(self, cls='SVC'):
        # .get() falls back to SVC for unknown names; a plain dict lookup
        # would raise KeyError before the original fallback could ever run
        self.classifier = SklearnClassifier({
            'SVC': SVC(),
            'LogisticRegression': LogisticRegression(),
            'BernoulliNB': BernoulliNB()
        }.get(cls, SVC()))

    def train(self, trainset):
        self.classifier.train(trainset)

    def test(self, tagged, featuresets):
        predict = self.classifier.classify_many(featuresets)
        print(predict)
        return accuracy_score(tagged, predict)

    def classify(self, featureset):
        return self.classifier.classify(featureset)

    def classify_many(self, featuresets):
        return self.classifier.classify_many(featuresets)
Author: Palazor, Project: sentiment, Lines: 28, Source file: SkClassifier.py
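
A hypothetical usage of the SKClassifier wrapper above; the two-item training set is invented for illustration.

clf = SKClassifier('LogisticRegression')
clf.train([({'good': True}, 'pos'), ({'bad': True}, 'neg')])
print(clf.classify({'good': True}))  # expected: 'pos'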

Example 2: getSubjObj

# Required module: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
    def getSubjObj(self, text):
        words = Text(text.split(" "))
        bigrams = self.getBigrams(words)
        subjclassifier = self.loadSOClsssifier()
        posnegclassifier = self.loadPNClsssifier()

        subj_or_obj = SklearnClassifier.classify(subjclassifier, bigrams)
        if subj_or_obj == "objective":
            return "neutral"

        pos_or_neg = SklearnClassifier.classify(posnegclassifier, bigrams)

        if pos_or_neg == "negative":
            return "negative"
        else:
            return "positive"
Author: Balu-Varanasi, Project: sentiment-analyzer, Lines: 18, Source file: predictor.py
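
Note that SklearnClassifier.classify(subjclassifier, bigrams) invokes classify as an unbound method, which is legal but unusual; the equivalent, more conventional bound call is simply:

subj_or_obj = subjclassifier.classify(bigrams)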

Example 3: evaluate

# Required module: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
def evaluate(classifier_alo):
    
    classifier = SklearnClassifier(classifier_alo)  # use scikit-learn through the NLTK wrapper
    classifier.train(trainFeatures)  # train the classifier
    
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)	
    i = 0
    for item in testFeatures:
        referenceSets[item[1]].add(i)
        predicted = classifier.classify(item[0])
        testSets[predicted].add(i)	
        i += 1
    
    pos_pre = nltk.metrics.precision(referenceSets['pos'], testSets['pos'])
    pos_recall = nltk.metrics.recall(referenceSets['pos'], testSets['pos'])
    neg_pre =  nltk.metrics.precision(referenceSets['neg'], testSets['neg'])
    neg_recall = nltk.metrics.recall(referenceSets['neg'], testSets['neg'])
    
    pos_f1 = 2 * pos_pre * pos_recall / (pos_pre + pos_recall)
    neg_f1 = 2 * neg_pre * neg_recall / (neg_pre + neg_recall)
    print('{0:.3f}  {1:.3f}  {2:.3f}  {3:.3f}  {4:.3f}  {5:.3f}'.format(
        pos_pre, pos_recall, neg_pre, neg_recall, pos_f1, neg_f1))
Author: delili, Project: NLP_Comments_Sentiment_Analysis, Lines: 27, Source file: process.py
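
The two F1 values printed above can also be computed with NLTK's built-in helper instead of the manual formula; a sketch using the sets the example already defines:

pos_f1 = nltk.metrics.f_measure(referenceSets['pos'], testSets['pos'])
neg_f1 = nltk.metrics.f_measure(referenceSets['neg'], testSets['neg'])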

Example 4: handle

# Required module: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
    def handle(self, *args, **options):
        trains = get_train_tweets()
        if not trains:
            raise CommandError('No train data, please add some from the admin page!')

        train_count = trains.count()
        train_set = generate_trainset(trains)
        nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
        sci_classifier = SklearnClassifier(LinearSVC())
        sci_classifier.train(train_set)

        while True:
            unclassified_tweets = Tweet.objects.filter(train=False, klass=None)
            total_count = unclassified_tweets.count()
            if total_count > 0:
                print('Classifying %d tweets...' % total_count)
                counts_nb = defaultdict(int)
                counts_svm = defaultdict(int)
                start_time = time.time()
                for tweet in unclassified_tweets:
                    feature_vect = get_feature_vector(process_tweet(tweet.body))
                    features = extract_features(feature_vect)
                    sentiment_nb = nb_classifier.classify(features)
                    sentiment_svm = sci_classifier.classify(features)
                    counts_nb[sentiment_nb] += 1
                    counts_svm[sentiment_svm] += 1
                    tweet.klass = sentiment_nb
                    tweet.klass_svm = sentiment_svm
                    msg_nb = ['%d %s' % (counts_nb[k], v) for k, v in Tweet.CLASSES]
                    msg_svm = ['%d %s' % (counts_svm[k], v) for k, v in Tweet.CLASSES]
                    print('\rNB: ' + ', '.join(msg_nb) + ';\tSVM: ' + ', '.join(msg_svm), end='')
                    # print('\r' + ', '.join(msg_nb), end='')
                    tweet.save()
                    if settings.DEBUG:
                        db.reset_queries()
                elapsed = int(time.time() - start_time)
                print('\nClassifying finished in %d seconds.' % elapsed)

            new_trains = get_train_tweets()
            if new_trains.count() != train_count:
                print('Train set has been changed, retraining...')
                trains = new_trains
                train_count = new_trains.count()
                train_set = generate_trainset(trains)
                nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
                sci_classifier = SklearnClassifier(LinearSVC())
                sci_classifier.train(train_set)
            else:
                print('Waiting...')
                time.sleep(3)
Author: ackaraosman, Project: hatemap, Lines: 52, Source file: classify.py

Example 5: multinomial_bayes_nltk_wrapper

# Required module: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
def multinomial_bayes_nltk_wrapper(corpus, documents_training, documents_test, words_features, smoothing, kbest):
    """
    Multinomial Naive Bayes Algorithm using wrapper NLTK SklearnClassifier
    Memory problems can occur if very large dataset
    :param corpus:
    :param documents_training:
    :param documents_test:
    :param words_features:
    :param smoothing:
    :param kbest:
    :return:
    """

    print()
    print("----- Multinomial Bayes with NLTK wrapper algorithm ------")
    print("Creating Training Feature Vectors...")
    array_features_training = []
    for (id, original_category, annotations) in documents_training:
        array_features_training.append((util_classify.transform_document_in_dict(annotations, words_features, corpus), original_category))
    # array_features_training = apply_features(extract_document_features,documents_training)
    print "Training algorithm..."
    # ('chi2', SelectKBest(chi2, k=3000)),
    if kbest == 0:
        kbest = "all"
    pipeline = Pipeline([('chi2', SelectKBest(chi2, k=kbest)), ('tfidf', TfidfTransformer()),
                         ('nb', MultinomialNB(alpha=smoothing))])

    # pipeline = Pipeline([('nb', MultinomialNB(alpha=smoothing))])

    classifier = SklearnClassifier(pipeline)
    classifier.train(array_features_training)

    print "Calculating metrics ..."
    categories = util_classify.get_categories(corpus)
    estimated_categories = []
    original_categories = []               

    for (id, cat_original, annotations) in documents_test:
        cat_estimated = classifier.classify(util_classify.transform_document_in_dict(annotations, words_features, corpus))
        estimated_categories.append(categories.index(cat_estimated))
        original_categories.append(categories.index(cat_original))
    return original_categories, estimated_categories
Author: itecsde, Project: classification, Lines: 44, Source file: classify_methods.py
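
A standalone sketch of the Pipeline-inside-SklearnClassifier idea from this example, with tiny invented featuresets; the example's helpers (util_classify, words_features) are not needed here, and k="all" sidesteps k-larger-than-feature-count errors on toy data.

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

train = [({"price": 1, "cheap": 1}, "spam"),
         ({"meeting": 1, "agenda": 1}, "ham")]
pipeline = Pipeline([("chi2", SelectKBest(chi2, k="all")),
                     ("tfidf", TfidfTransformer()),
                     ("nb", MultinomialNB(alpha=0.5))])
classifier = SklearnClassifier(pipeline)
classifier.train(train)
print(classifier.classify({"cheap": 1}))  # expected: 'spam'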

Example 6: linear_support_vector_machines_tf_idf

# Required module: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
def linear_support_vector_machines_tf_idf(corpus, documents_training, documents_test, words_features, kbest):
    """
    Linear Support Vector Machines Algorithm. The Support Vector Machines algorithm with a linear kernel and using TF/IDF
    :param corpus:
    :param documents_training:
    :param documents_test:
    :param words_features:
    :param kbest:
    :return:
    """

    print()
    print("----- Linear Support Vector Machines with tfidf algorithm ------")
    print("Creating Training Feature Vectors...")
    categories = util_classify.get_categories(corpus)
    array_features_training = []

    for (id, original_category, annotations) in documents_training:
        array_features_training.append((util_classify.transform_document_in_dict(annotations, words_features, corpus), original_category))

    print "Training algorithm..."

    if kbest == 0:
        kbest = "all"

    pipeline = Pipeline([('chi2', SelectKBest(chi2, k=kbest)), ('tfidf', TfidfTransformer()),
                         ('svc', LinearSVC())])

    classifier = SklearnClassifier(pipeline)
    classifier.train(array_features_training)

    print "Calculating metrics..."
    estimated_categories = []
    original_categories = []

    for (id, cat_original, annotations) in documents_test:
        cat_estimated = classifier.classify(util_classify.transform_document_in_dict(annotations, words_features, corpus))
        estimated_categories.append(categories.index(cat_estimated))
        original_categories.append(categories.index(cat_original))
    return original_categories, estimated_categories
Author: itecsde, Project: classification, Lines: 42, Source file: classify_methods.py

Example 7: train_Classifier

# Required module: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
def train_Classifier(posfeats,negfeats,index):
    # divide dataset into train and validation sets
    posCutoff = int(math.floor(len(posfeats)*7/10))
    negCutoff = int(math.floor(len(negfeats)*7/10))
    trainFeatures = posfeats[:posCutoff] + negfeats[:negCutoff]
    testFeatures = posfeats[posCutoff:] + negfeats[negCutoff:]

    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    classifier_name = ''

    if index == 0:
        classifier = nltk.classify.maxent.MaxentClassifier.train(
            trainFeatures, 'GIS', trace=3, encoding=None, labels=None,
            gaussian_prior_sigma=0, max_iter=5)
        classifier_name = 'Maximum Entropy'
    elif index == 1:
        classifier = SklearnClassifier(BernoulliNB())
        classifier.train(trainFeatures)
        classifier_name = 'Bernoulli Naive Bayes'
    else:
        classifier = SklearnClassifier(LogisticRegression())
        classifier.train(trainFeatures)
        classifier_name = 'Logistic Regression'

    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', nltk.metrics.precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', nltk.metrics.recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', nltk.metrics.precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', nltk.metrics.recall(referenceSets['neg'], testSets['neg']))
    #classifier.show_most_informative_features(10)
    return classifier
Author: gyasmeen, Project: twiitter-sentiment-analysis-machine-learning, Lines: 39, Source file: Twitter_Sentiment_Analysis.py

Example 8: evaluate_features

# Required module: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
def evaluate_features(feature_select):

    posFeatures = []
    negFeatures = []
    
    training = []
    #process positive dataset "processed_pro_GMO.txt"
    for i in short_pos.split('\n'):
        posWords = word_tokenize(i)
        posWords_tag = [feature_select(posWords), "pos"]
        # label each tokenized line from the positive dataset as "pos"
        posFeatures.append(posWords_tag)
       
    #process negative dataset "processed_anti_GMO.txt"
    for i in short_neg.split('\n'):
        negWords = word_tokenize(i)
        negWords_tag = [feature_select(negWords),"neg"]
        negFeatures.append(negWords_tag)

    # 6-fold cross-validation for accuracy, precision, and recall
    num_folds = 6
    training = posFeatures + negFeatures
    cv = cross_validation.KFold(len(training), n_folds=num_folds, shuffle=True, random_state=None)

    Naive_Accu = 0
    neg_Precision = 0
    neg_recall = 0
    pos_Precision = 0
    pos_recall = 0

    SVC_Accu = 0
    Regression_Accu = 0
    testFeatures = []

    precision = dict()
    recall = dict()
    average_Precision = dict()

    for traincv, testcv in cv:
        #BasedNaiveClassifier
        BasedNaiveClassifier = NaiveBayesClassifier.train(training[traincv[0]:traincv[len(traincv)-1]])
        accuracy = (nltk.classify.util.accuracy(BasedNaiveClassifier, training[testcv[0]:testcv[len(testcv)-1]]))*100
        Naive_Accu += accuracy
        BasedNaiveClassifier.show_most_informative_features(10)

        save_classifier = open("GMO_Hanzhe/BasedNaiveClassifier10k.pickle","wb")
        pickle.dump(BasedNaiveClassifier, save_classifier)
        save_classifier.close()
        

        #LogisticRegression
        LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
        LogisticRegression_classifier.train(training[traincv[0]:traincv[len(traincv)-1]])
        Regression_Accuracy = (nltk.classify.util.accuracy(LogisticRegression_classifier, training[testcv[0]:testcv[len(testcv)-1]]))*100
        Regression_Accu += Regression_Accuracy

        save_classifier = open("GMO_Hanzhe/LogisticRegression_classifier10k.pickle","wb")
        pickle.dump(LogisticRegression_classifier, save_classifier)
        save_classifier.close()

        #LinearSVC
        LinearSVC_classifier = SklearnClassifier(LinearSVC())
        LinearSVC_classifier.train(training[traincv[0]:traincv[len(traincv)-1]])
        SVC_Accuracy = (nltk.classify.util.accuracy(LinearSVC_classifier, training[testcv[0]:testcv[len(testcv)-1]]))*100
        SVC_Accu += SVC_Accuracy

        save_classifier = open("GMO_Hanzhe/LinearSVC_classifier10k.pickle","wb")
        pickle.dump(LinearSVC_classifier, save_classifier)
        save_classifier.close()

        #initiates referenceSets and testSets
        referenceSets = collections.defaultdict(set)
        testSets = collections.defaultdict(set)

        for idx in testcv:
            testFeatures.append(training[idx])
        #puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets
        for i, (features, label) in enumerate(testFeatures):
            referenceSets[label].add(i)
            predicted = LogisticRegression_classifier.classify(features)
            testSets[predicted].add(i)  
#7/5/2015        
##        pos_Precision += (nltk.metrics.precision(referenceSets["pos"], testSets["pos"]))*100     
##        pos_recall += (nltk.metrics.recall(referenceSets["pos"], testSets["pos"]))*100 
##        neg_Precision += (nltk.metrics.precision(referenceSets["neg"], testSets["neg"]))*100
##        neg_recall += (nltk.metrics.recall(referenceSets["neg"], testSets["neg"]))*100
##
##        precision["pos"] = nltk.metrics.precision(referenceSets["pos"], testSets["pos"])     
##        recall["pos"] = nltk.metrics.recall(referenceSets["pos"], testSets["pos"]) 
##        precision["neg"] = nltk.metrics.precision(referenceSets["neg"], testSets["neg"])
##        recall["neg"] = nltk.metrics.recall(referenceSets["neg"], testSets["neg"])
##
##        save_classifier = open("GMOHedging/BasedNaiveClassifier.pickle","wb")
##        pickle.dump(BasedNaiveClassifier, save_classifier)
##        save_classifier.close()
###    average_precision["pos"] = precision["pos"]

    #get Average score for Accuracy, Precision and Recall
    accu = Naive_Accu/num_folds
#7/5/2015
#......... remainder of this function omitted .........
Author: lrance, Project: TwitterSentimentAnalysis, Lines: 103, Source file: Sentiment_try_2_classification.py
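
Two caveats worth knowing about this example: slicing training[traincv[0]:traincv[len(traincv)-1]] assumes each fold's indices are contiguous, but with shuffle=True they are not, so the slices mix the folds; and testFeatures is never reset between folds, so later iterations re-score earlier test items. A safer sketch for building each fold:

train_fold = [training[i] for i in traincv]
test_fold = [training[i] for i in testcv]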

Example 9: SklearnClassifier

# Required module: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
classif = SklearnClassifier(pipeline)
classif.train(list(zip(trainData, trainLabels)))
# open the pickle file in binary mode so the dump also works on Python 3
if USE_CHI_SQUARE:
    cf = open("nb_classifier_" + str(gram) + "gram_" + str(size) + "_large", "wb")
else:
    cf = open("nb_classifier_" + str(gram) + "gram_" + str(size) + "_large_nochi", "wb")
pickle.dump(classif, cf)
cf.close()

matches = 0
mismatches = 0
scores = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
for i in range(len(testLabels)):
    label = classif.classify(testData[i])
    log("test data id: " + str(i), f)
    if label == testLabels[i]:
        matches += 1
        log("matched: label: " + str(label), f)
    else:
        mismatches += 1
        log("mismatched: label: " + str(label) + " was supposed to be: " + str(testLabels[i]), f)
    scores[int(label)] += 1
log("summary of results for: gram: " + str(gram) + " size: " + str(size), f)
log("matches = " + str(matches), f)
log("mismatches = " + str(mismatches), f)
log("guesses = " + repr(scores), f)
log("=" * 20, f)
log("=" * 20, f)
log("=" * 20, f)
Author: EdwardKL, Project: info290finalproj, Lines: 33, Source file: nbClassifier.py

Example 10: get_train_features_from_tweets

# Required module: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
neg_train, neg_test = neg_tweets[:negcutoff], neg_tweets[negcutoff:]

neg_feats_train = get_train_features_from_tweets(neg_train, 'neg')
pos_feats_train = get_train_features_from_tweets(pos_train, 'pos')

train_feats = neg_feats_train + pos_feats_train

svm_classifier = SklearnClassifier(LinearSVC())
svm_classifier.train(train_feats)

# Evaluation
correct, wrong = 0, 0

for tweet in neg_test:
    features = get_features_from_tweet(tweet)
    result = svm_classifier.classify(features)
    if result == "neg":
        correct += 1
    else:
        wrong += 1


for tweet in pos_test:
    features = get_features_from_tweet(tweet)
    result = svm_classifier.classify(features)
    if result == "pos":
        correct += 1
    else:
        wrong += 1

print "Accuracy: {}".format(correct / float(correct + wrong))
Author: AkiraKane, Project: GA_Data_Science, Lines: 33, Source file: twitter.py
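
For reference, the manual correct/wrong tally above matches NLTK's built-in accuracy helper when the test tweets are paired with their gold labels; a sketch reusing the example's own helpers:

import nltk.classify.util

test_feats = ([(get_features_from_tweet(t), 'neg') for t in neg_test]
              + [(get_features_from_tweet(t), 'pos') for t in pos_test])
print("Accuracy: {}".format(nltk.classify.util.accuracy(svm_classifier, test_feats)))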

Example 11: print()

# Required module: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
testing_set = nltk.classify.apply_features(extract_features, test_tweets)

for (tweet, sentiment) in test_tweets:
    print(classifier.classify(extract_features(tweet)))

print(nltk.classify.accuracy(classifier, testing_set))

classifier.show_most_informative_features(5)



"""
pipeline = Pipeline([('tfidf', TfidfTransformer()),
                      ('chi2', SelectKBest(chi2, k='all')),
                      ('nb', MultinomialNB())])
"""
pipeline = Pipeline([('tfidf', TfidfTransformer()),
                      ('chi2', SelectKBest(chi2, k='all')),
                      ('nb', MultinomialNB())])

classif = SklearnClassifier(pipeline)

classif.train(training_set)

print(classif.labels())
for (tweet, sentiment) in test_tweets:
    print(classif.classify(extract_features(tweet)))

print(nltk.classify.accuracy(classif, testing_set))
Author: weifengli001, Project: DataMining, Lines: 31, Source file: sklearntest.py

Example 12: YoutubeVideoClassifier

# Required module: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
class YoutubeVideoClassifier(Utility):
    """ Use the collected data as training set and classify test data"""

    def __init__(self):
        Utility.__init__(self)
        self.nb_output_file_name = self.config.get("GLOBAL", "nb_output_file")
        self.svm_output_file_name = self.config.get("GLOBAL", "svm_output_file")
        self.nb_output = os.path.join(self.output_dir, self.nb_output_file_name)
        self.svm_output = os.path.join(self.output_dir, self.svm_output_file_name)

        self.train_features = []
        self.stopwords_set = set(stopwords.words("english"))

    def run_main(self):
        self.pre_processing()
        self.feature_extraction()
        self.classification()
        self.testing()

    def pre_processing(self):
        self.load_data()

    def load_data(self):
        self.load_movies()
        self.load_actors()
        self.load_tvshows()
        self.load_test_data()

    def load_movies(self):
        self.movies_list = []
        movies_fd = codecs.open(self.movies_file)

        for movie in movies_fd.readlines():
            if not movie:
                continue
            self.movies_list.append(movie)
        movies_fd.close()

    def load_actors(self):
        self.actors_list = []
        actors_fd = codecs.open(self.actors_file)

        for actor in actors_fd.readlines():
            if not actor:
                continue
            self.actors_list.append(actor)
        actors_fd.close()

    def load_tvshows(self):
        self.tvshows_list = []
        tvshows_fd = codecs.open(self.tvshows_file)

        for tvshow in tvshows_fd.readlines():
            if not tvshow:
                continue
            self.tvshows_list.append(tvshow)
        tvshows_fd.close()

    def load_test_data(self):
        json_data = open(self.test_file)
        self.test_data = json.load(json_data)

    def feature_selection(self, features_list):
        selected_features = []

        for feat in features_list:
            if feat and feat.strip() and feat.lower() not in self.stopwords_set:
                selected_features.append((feat.strip().lower(), True))
        return dict(selected_features)

    def feature_extraction(self):
        for item in self.tvshows_list:
            if not item:
                continue
            selected_features = self.feature_selection(item.replace("_", " ").split(" "))
            self.train_features.append((selected_features, "tvshow"))

        for item in self.movies_list:
            if not item:
                continue
            selected_features = self.feature_selection(item.replace("_", " ").split(" "))
            self.train_features.append((selected_features, "movie"))

        for item in self.actors_list:
            if not item:
                continue
            selected_features = self.feature_selection(item.replace("_", " ").split(" "))
            self.train_features.append((selected_features, "celebrity"))

    def classification(self):

        # Training NB Classifier
        self.nb_classifier = NaiveBayesClassifier.train(self.train_features)

        # Training SVM classifier
        self.svm_classifier = SklearnClassifier(LinearSVC())
        self.svm_classifier.train(self.train_features)

    def testing(self):
        nb_fd = codecs.open(self.nb_output, "w", "utf-8")
#......... remainder of this method omitted .........
Author: karthik-chandrasekar, Project: YoutubeVideoClassifier, Lines: 103, Source file: YoutubeVideoClassifier.py

Example 13: RForests

# Required module: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
class RForests(text_classifier.TextClassifier):
    def __init__(self,trainDir,labelFile,numTrees=10,numJobs=1):
        self.classifier = None
        self.labelFile = labelFile
        self.trainingDir = trainDir
        self.labels = None
        self.all_words = None
        self.numTrees = numTrees
        self.numJobs = numJobs
        self.classifier = SklearnClassifier(RandomForestClassifier(
                                            n_estimators=self.numTrees,
                                            n_jobs=numJobs),sparse=False)
        #self.labels = training.setup(labelFile)
        #self.train()
    
    def train(self):
        feature_sets = self.getFeatures()
        self.classifier.train(feature_sets)
        
    """ Determines training error"""
    def trainingError(self):
        feature_sets = self.getFeatures()
        p = nltk.classify.accuracy(self.classifier,feature_sets)
        return p
        
    """ Make sure that the algorithm works on training data using a k fold 
        cross validation scheme """
    def kfoldCrossValidation(self, k):
        feature_sets = self.getFeatures()
        accuracy_sum = 0
        n = len(feature_sets) // k
        for i in range(k):
            self.classifier = SklearnClassifier(RandomForestClassifier(
                                                n_estimators=self.numTrees), sparse=False)
            # hold out the i-th fold for testing and train on the rest
            test_set = feature_sets[n*i:n*(i+1)]
            train_set = feature_sets[:n*i] + feature_sets[n*(i+1):]
            self.classifier.train(train_set)
            accuracy_sum += nltk.classify.accuracy(self.classifier, test_set)
        return accuracy_sum / k
    """ Make sure that the algorithm works on training data using a leave one out 
        cross validation scheme """
    def leave1OutCrossValidation(self):
        accuracy_sum = 0
        feature_sets = self.getFeatures()
        N = len(feature_sets)
        for i in range(N):
            self.classifier = SklearnClassifier(RandomForestClassifier(
                                                n_estimators=self.numTrees), sparse=False)
            train_set1, test_set, train_set2 = feature_sets[:i], feature_sets[i], feature_sets[i+1:]
            train_set = train_set1 + train_set2
            test_set = [test_set]
            # train without the held-out sample to avoid leaking it into the model
            self.classifier.train(train_set)
            accuracy_sum += nltk.classify.accuracy(self.classifier, test_set)
        return accuracy_sum / N
            
    """ Construct a learning curve to see if there is overfitting"""
    def learningCurve(self,numTrials=4):
        accuracies = []
        feature_sets = self.getFeatures()
        for k in range(1, len(feature_sets) - 1):
            total = 0
            for i in range(numTrials):
                self.classifier = SklearnClassifier(RandomForestClassifier(
                                                    n_estimators=self.numTrees),
                                                    sparse=False)
                random.shuffle(feature_sets)
                train_set,test_set = feature_sets[:k],feature_sets[k:]
                self.classifier.train(train_set)
                p = nltk.classify.accuracy(self.classifier,test_set)
                print(len(train_set), len(test_set), p)
                total+=p
            accuracies.append(total/numTrials)
        return accuracies
    
    """ Train on only k features and return training labels and predicted labels """
    def testClassify(self,k):
        feature_sets = self.getFeatures()
        random.shuffle(feature_sets)
        self.classifier = SklearnClassifier(RandomForestClassifier(
                                            n_estimators=self.numTrees),sparse=False)
        
        self.classifier.train(feature_sets[k:])
        features,ref_labels = zip(*feature_sets[:k])
        pred_labels = self.classifier.batch_classify(features)   
        return ref_labels,pred_labels
    
    """ nltk confusion matrix """
    def confusionMatrix(self,ref,test):
        ref.sort(key=lambda x: x[0])
        test.sort(key=lambda x: x[0])
        _,ref_labels = zip(*ref)
        _,test_labels = zip(*test)
        cm = ConfusionMatrix(ref_labels, test_labels)
        return cm

    def prob_classify(self,db,fastain):
#......... remainder of this class omitted .........
Author: mortonjt, Project: Boa, Lines: 103, Source file: rforests.py
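
A portability note on testClassify above: batch_classify was renamed classify_many in NLTK 3 (the old name survived only as a deprecated alias before being removed), so on current NLTK the prediction line would read:

pred_labels = self.classifier.classify_many(features)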

Example 14: main

# Required module: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
def main():
    parser = get_argparser()
    args = parser.parse_args()

    util.DPRINT = args.dprint
    featureset_name = os.path.basename(args.featurefn).split('.')[0]
    features.load_featurefile(args.featurefn)

    ## default is 1e-4.
    THETOL = 1e-3
    classifier_pairs = []
    classifier_pairs.append(("MFS", learn.MFSClassifier()))

    classifier = SklearnClassifier(LogisticRegression(C=1,
                                   penalty='l2',
                                   tol=THETOL))
    classifier_pairs.append(("maxent-l2-c1", classifier))
    stamp = util.timestamp()

    for fn in glob(args.testset + "/*data"):
        problems = semeval_testset.extract_wsd_problems(fn)

        w = problems[0][0]
        assert w.endswith(".n")
        w = w[:-2]
        load_training_for_word(w, args.bitextfn, args.alignfn, args.annotatedfn)

        bestoutfn = args.outputdir + "/{0}.{1}.best".format(w, "es")
        oofoutfn = args.outputdir + "/{0}.{1}.oof".format(w, "es")
        if os.path.exists(bestoutfn):
            os.remove(bestoutfn)
        if os.path.exists(oofoutfn):
            os.remove(oofoutfn)

        training = None

        for problem in problems:
            w = problem[0]
            assert w.endswith(".n")
            w = w[:-2]
            print(problem)

            if training is None:
                training = trainingdata.trainingdata_for(w, nonnull=True)
                print("got {0} instances for {1}".format(len(training), w))
                labels = set(label for (feat,label) in training)
                if len(training) == 0:
                    print("no samples for", w)
                    break
                if len(labels) < 2:
                    print("there's only one sense for", w, " and it is ",
                          labels)
                    break
                classifier.train(training)

            rawtext = problem[2]
            surface, index = semeval_testset.head_surface_and_index(rawtext)
            replaced = re.sub(r"<head>(.*)</head>", " \\1 ", rawtext)
            annotated = preprocessing.preprocess(replaced, "en")
            sentence = [token.lemma for token in annotated]

            focus_index = find_head_token_index(annotated, surface, index)
            feats = features.extract_untagged(sentence, annotated, focus_index)

            bestoutfn = args.outputdir + "/{0}.{1}.best".format(w, "es")
            oofoutfn = args.outputdir + "/{0}.{1}.oof".format(w, "es")
            with open(bestoutfn, "a") as bestoutfile, \
                 open(oofoutfn, "a") as oofoutfile:

                answer = classifier.classify(feats)
                print(answer)
                dist = classifier.prob_classify(feats)
                oof_answers = topfive(dist)
                print(output_one_best(problem, "es", answer), file=bestoutfile)
                print(output_five_best(problem, "es", oof_answers),
                      file=oofoutfile)
Author: alexrudnick, Project: chipa, Lines: 78, Source file: semeval_experiment.py
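
classifier.prob_classify(feats) returns an NLTK probability distribution; the project's topfive helper is not shown here, but the five most probable senses can be recovered from the distribution like this (a sketch, not necessarily the project's actual implementation):

dist = classifier.prob_classify(feats)
oof_answers = sorted(dist.samples(), key=dist.prob, reverse=True)[:5]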

Example 15: label_feats_from_corpus

# Required module: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
""" This is a demo of the Scikit-learn Classifier from the NLTK
    package using the movie reviews corpus  """
from nltk.corpus import movie_reviews
from featx import *
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression
from nltk.classify.util import accuracy
from nltk import word_tokenize
lfeats = label_feats_from_corpus(movie_reviews)  # extract featuresets and their labels (neg/pos) from the corpus
train_feats, test_feats = split_label_feats(lfeats, split=0.75)  # split labeled featuresets into training and test sets; see featx.py
sk_classifier = SklearnClassifier(LogisticRegression())
sk_classifier.train(train_feats)  # train the classifier
print("The associated accuracy for this classifier on the data is:")
print(accuracy(sk_classifier, test_feats))
while True:
    text = input("Enter your fake tweet (words only): \n")
    test = bag_of_words(word_tokenize(text))  # convert the text into a bag of words; see featx.py
    print("Sentiment:")
    print(sk_classifier.classify(test))
    control = input("press any key to continue, 'q' to quit: ")
    if control == "q":
        break
Author: DeamonSpawn, Project: UntitledSAProj, Lines: 24, Source file: scikit_logistic_regression_demo.py


Note: The nltk.classify.scikitlearn.SklearnClassifier.classify examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from community open-source projects; copyright in the source code remains with the original authors, and any use or redistribution is subject to each project's license. Do not reproduce without permission.