This page collects typical usage examples of the Python method nltk.classify.scikitlearn.SklearnClassifier.classify. If you have been wondering exactly how to use SklearnClassifier.classify, or looking for working examples of it, the curated code samples below should help. You can also read more about the class the method belongs to, nltk.classify.scikitlearn.SklearnClassifier.
The following 15 code examples of SklearnClassifier.classify are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
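Before the collected examples, a minimal self-contained sketch of the basic train/classify cycle may help orient readers. The toy featuresets and labels below are invented for illustration; any NLTK-style (featureset dict, label) pairs work the same way:

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC

# NLTK-style training data: (featureset dict, label) pairs
train_data = [
    ({"great": True, "plot": True}, "pos"),
    ({"boring": True, "plot": True}, "neg"),
]
classifier = SklearnClassifier(LinearSVC())
classifier.train(train_data)
# classify() takes a single featureset and returns a single label;
# classify_many() takes a list of featuresets and returns a list of labels
print(classifier.classify({"great": True}))           # -> 'pos'
print(classifier.classify_many([{"boring": True}]))   # -> ['neg']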
Example 1: __init__
# Required import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score


class SKClassifier:
    classifier = None

    def __init__(self, cls='SVC'):
        # unknown classifier names fall back to a plain SVC
        # (a dict lookup with [] would raise KeyError before the
        # original "if not self.classifier" fallback could run)
        self.classifier = SklearnClassifier({
            'SVC': SVC(),
            'LogisticRegression': LogisticRegression(),
            'BernoulliNB': BernoulliNB()
        }.get(cls, SVC()))

    def train(self, trainset):
        self.classifier.train(trainset)

    def test(self, tagged, featuresets):
        predict = self.classifier.classify_many(featuresets)
        print(predict)
        return accuracy_score(tagged, predict)

    def classify(self, featureset):
        return self.classifier.classify(featureset)

    def classify_many(self, featuresets):
        return self.classifier.classify_many(featuresets)
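A hypothetical usage of the wrapper above; the two-instance trainset and the feature names are made up for illustration:

clf = SKClassifier('BernoulliNB')
clf.train([({"good": True}, "pos"), ({"bad": True}, "neg")])
print(clf.classify({"good": True}))                               # one featureset -> one label
print(clf.test(["pos", "neg"], [{"good": True}, {"bad": True}]))  # accuracy against gold labels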
Example 2: getSubjObj
# Required import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
def getSubjObj(self, text):
    words = Text(text.split(" "))
    bigrams = self.getBigrams(words)
    subjclassifier = self.loadSOClsssifier()
    posnegclassifier = self.loadPNClsssifier()
    # first decide subjective vs. objective, then polarity;
    # subjclassifier.classify(bigrams) is the idiomatic form of
    # SklearnClassifier.classify(subjclassifier, bigrams)
    subj_or_obj = subjclassifier.classify(bigrams)
    if subj_or_obj == "objective":
        return "neutral"
    pos_or_neg = posnegclassifier.classify(bigrams)
    if pos_or_neg == "negative":
        return "negative"
    else:
        return "positive"
Example 3: evaluate
# Required import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
import collections
import nltk
from nltk.classify.scikitlearn import SklearnClassifier

def evaluate(classifier_alo):
    classifier = SklearnClassifier(classifier_alo)  # use a scikit-learn estimator through the NLTK interface
    classifier.train(trainFeatures)  # train the classifier
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, item in enumerate(testFeatures):
        referenceSets[item[1]].add(i)
        predicted = classifier.classify(item[0])
        testSets[predicted].add(i)
    pos_pre = nltk.metrics.precision(referenceSets['pos'], testSets['pos'])
    pos_recall = nltk.metrics.recall(referenceSets['pos'], testSets['pos'])
    neg_pre = nltk.metrics.precision(referenceSets['neg'], testSets['neg'])
    neg_recall = nltk.metrics.recall(referenceSets['neg'], testSets['neg'])
    pos_f1 = 2 * pos_pre * pos_recall / (pos_pre + pos_recall)
    neg_f1 = 2 * neg_pre * neg_recall / (neg_pre + neg_recall)
    print('{0:.3f} {1:.3f} {2:.3f} {3:.3f} {4:.3f} {5:.3f}'.format(
        pos_pre, pos_recall, neg_pre, neg_recall, pos_f1, neg_f1))
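Since evaluate() wraps whatever estimator it is given in SklearnClassifier, it can be called with different scikit-learn models. A sketch, assuming trainFeatures and testFeatures are defined at module level as the snippet above implies:

from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

for algo in (BernoulliNB(), LinearSVC(), LogisticRegression()):
    evaluate(algo)  # prints precision/recall/F1 for 'pos' and 'neg'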
Example 4: handle
# Required import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
def handle(self, *args, **options):
    trains = get_train_tweets()
    if not trains:
        raise CommandError('No train data, please add some from the admin page!')
    train_count = trains.count()
    train_set = generate_trainset(trains)
    nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
    sci_classifier = SklearnClassifier(LinearSVC())
    sci_classifier.train(train_set)
    while True:
        unclassified_tweets = Tweet.objects.filter(train=False, klass=None)
        total_count = unclassified_tweets.count()
        if total_count > 0:
            print('Classifying %d tweets...' % total_count)
            counts_nb = defaultdict(int)
            counts_svm = defaultdict(int)
            start_time = time.time()
            for tweet in unclassified_tweets:
                feature_vect = get_feature_vector(process_tweet(tweet.body))
                features = extract_features(feature_vect)
                sentiment_nb = nb_classifier.classify(features)
                sentiment_svm = sci_classifier.classify(features)
                counts_nb[sentiment_nb] += 1
                counts_svm[sentiment_svm] += 1
                tweet.klass = sentiment_nb
                tweet.klass_svm = sentiment_svm
                msg_nb = ['%d %s' % (counts_nb[k], v) for k, v in Tweet.CLASSES]
                msg_svm = ['%d %s' % (counts_svm[k], v) for k, v in Tweet.CLASSES]
                print('\rNB: ' + ', '.join(msg_nb) + ';\tSVM: ' + ', '.join(msg_svm), end='')
                # print('\r' + ', '.join(msg_nb), end='')
                tweet.save()
                if settings.DEBUG:
                    db.reset_queries()
            elapsed = int(time.time() - start_time)
            print('\nClassifying finished in %d seconds.' % elapsed)
            new_trains = get_train_tweets()
            if new_trains.count() != train_count:
                print('Train set has been changed, retraining...')
                trains = new_trains
                train_count = new_trains.count()
                train_set = generate_trainset(trains)
                nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
                sci_classifier = SklearnClassifier(LinearSVC())
                sci_classifier.train(train_set)
        else:
            print('Waiting...')
            time.sleep(3)
Example 5: multinomial_bayes_nltk_wrapper
# Required import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
def multinomial_bayes_nltk_wrapper(corpus, documents_training, documents_test, words_features, smoothing, kbest):
    """
    Multinomial Naive Bayes algorithm using the NLTK SklearnClassifier wrapper.
    Memory problems can occur with very large datasets.
    :param corpus:
    :param documents_training:
    :param documents_test:
    :param words_features:
    :param smoothing:
    :param kbest:
    :return:
    """
    print("\n----- Multinomial Bayes with wrapper nltk Algorithm ------")
    print("Creating Training Feature Vectors...")
    array_features_training = []
    for (id, original_category, annotations) in documents_training:
        array_features_training.append((util_classify.transform_document_in_dict(annotations, words_features, corpus), original_category))
    # array_features_training = apply_features(extract_document_features, documents_training)
    print("Training algorithm...")
    # ('chi2', SelectKBest(chi2, k=3000)),
    if kbest == 0:
        kbest = "all"
    pipeline = Pipeline([('chi2', SelectKBest(chi2, k=kbest)),
                         ('tfidf', TfidfTransformer()),
                         ('nb', MultinomialNB(alpha=smoothing))])
    # pipeline = Pipeline([('nb', MultinomialNB(alpha=smoothing))])
    classifier = SklearnClassifier(pipeline)
    classifier.train(array_features_training)
    print("Calculating metrics...")
    categories = util_classify.get_categories(corpus)
    estimated_categories = []
    original_categories = []
    for (id, cat_original, annotations) in documents_test:
        cat_estimated = classifier.classify(util_classify.transform_document_in_dict(annotations, words_features, corpus))
        estimated_categories.append(categories.index(cat_estimated))
        original_categories.append(categories.index(cat_original))
    return original_categories, estimated_categories
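The notable pattern here is that SklearnClassifier accepts an entire scikit-learn Pipeline, so feature selection and TF-IDF weighting are applied transparently inside both train() and classify(). A minimal sketch of that pattern with invented toy data:

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([('chi2', SelectKBest(chi2, k='all')),
                     ('tfidf', TfidfTransformer()),
                     ('nb', MultinomialNB(alpha=1.0))])
clf = SklearnClassifier(pipeline)
clf.train([({"cheap": True, "offer": True}, "spam"),
           ({"meeting": True, "agenda": True}, "ham")])
print(clf.classify({"offer": True}))  # the featureset runs through the whole pipeline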
Example 6: linear_support_vector_machines_tf_idf
# Required import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
def linear_support_vector_machines_tf_idf(corpus, documents_training, documents_test, words_features, kbest):
    """
    Linear Support Vector Machines algorithm: Support Vector Machines with a linear kernel, using TF-IDF weighting.
    :param corpus:
    :param documents_training:
    :param documents_test:
    :param words_features:
    :param kbest:
    :return:
    """
    print("\n----- Linear Support Vector Machines with tfidf algorithm ------")
    print("Creating Features Training Vectors...")
    categories = util_classify.get_categories(corpus)
    array_features_training = []
    for (id, original_category, annotations) in documents_training:
        array_features_training.append((util_classify.transform_document_in_dict(annotations, words_features, corpus), original_category))
    print("Training algorithm...")
    if kbest == 0:
        kbest = "all"
    pipeline = Pipeline([('chi2', SelectKBest(chi2, k=kbest)),
                         ('tfidf', TfidfTransformer()),
                         ('svc', LinearSVC())])
    classifier = SklearnClassifier(pipeline)
    classifier.train(array_features_training)
    print("Calculating metrics...")
    estimated_categories = []
    original_categories = []
    for (id, cat_original, annotations) in documents_test:
        cat_estimated = classifier.classify(util_classify.transform_document_in_dict(annotations, words_features, corpus))
        estimated_categories.append(categories.index(cat_estimated))
        original_categories.append(categories.index(cat_original))
    return original_categories, estimated_categories
Example 7: train_Classifier
# Required import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
def train_Classifier(posfeats, negfeats, index):
    # divide dataset into train and validation sets
    posCutoff = int(math.floor(len(posfeats) * 7 / 10))
    negCutoff = int(math.floor(len(negfeats) * 7 / 10))
    trainFeatures = posfeats[:posCutoff] + negfeats[:negCutoff]
    testFeatures = posfeats[posCutoff:] + negfeats[negCutoff:]
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    classifier_name = ''
    if index == 0:
        classifier = nltk.classify.maxent.MaxentClassifier.train(trainFeatures, 'GIS', trace=3, encoding=None, labels=None, gaussian_prior_sigma=0, max_iter=5)
        classifier_name = 'Maximum Entropy'
    elif index == 1:
        classifier = SklearnClassifier(BernoulliNB())
        classifier.train(trainFeatures)
        classifier_name = 'Bernoulli Naive Bayes'
    else:
        classifier = SklearnClassifier(LogisticRegression())
        classifier.train(trainFeatures)
        classifier_name = 'LogisticRegression'
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)
    print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', nltk.metrics.precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', nltk.metrics.recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', nltk.metrics.precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', nltk.metrics.recall(referenceSets['neg'], testSets['neg']))
    # classifier.show_most_informative_features(10)
    return classifier
Developer: gyasmeen, Project: twiitter-sentiment-analysis-machine-learning, Lines of code: 39, Source file: Twitter_Sentiment_Analysis.py
Example 8: evaluate_features
# Required import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
def evaluate_features(feature_select):
    posFeatures = []
    negFeatures = []
    training = []
    # process positive dataset "processed_pro_GMO.txt"
    for i in short_pos.split('\n'):
        posWords = word_tokenize(i)
        posWords_tag = [feature_select(posWords), "pos"]
        # tag each sentence in the positive dataset as "pos"
        posFeatures.append(posWords_tag)
    # process negative dataset "processed_anti_GMO.txt"
    for i in short_neg.split('\n'):
        negWords = word_tokenize(i)
        negWords_tag = [feature_select(negWords), "neg"]
        negFeatures.append(negWords_tag)
    # 6-fold cross validation for accuracy, recall and precision
    num_folds = 6
    training = posFeatures + negFeatures
    cv = cross_validation.KFold(len(training), n_folds=6, shuffle=True, random_state=None)
    Naive_Accu = 0
    neg_Precision = 0
    neg_recall = 0
    pos_Precision = 0
    pos_recall = 0
    SVC_Accu = 0
    Regression_Accu = 0
    precision = dict()
    recall = dict()
    average_Precision = dict()
    for traincv, testcv in cv:
        # with shuffle=True the fold indices are not contiguous, so select
        # instances by index instead of slicing between the first and last index
        train_fold = [training[idx] for idx in traincv]
        testFeatures = [training[idx] for idx in testcv]
        # BasedNaiveClassifier
        BasedNaiveClassifier = NaiveBayesClassifier.train(train_fold)
        accuracy = nltk.classify.util.accuracy(BasedNaiveClassifier, testFeatures) * 100
        Naive_Accu += accuracy
        BasedNaiveClassifier.show_most_informative_features(10)
        save_classifier = open("GMO_Hanzhe/BasedNaiveClassifier10k.pickle", "wb")
        pickle.dump(BasedNaiveClassifier, save_classifier)
        save_classifier.close()
        # LogisticRegression
        LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
        LogisticRegression_classifier.train(train_fold)
        Regression_Accuracy = nltk.classify.util.accuracy(LogisticRegression_classifier, testFeatures) * 100
        Regression_Accu += Regression_Accuracy
        save_classifier = open("GMO_Hanzhe/LogisticRegression_classifier10k.pickle", "wb")
        pickle.dump(LogisticRegression_classifier, save_classifier)
        save_classifier.close()
        # LinearSVC
        LinearSVC_classifier = SklearnClassifier(LinearSVC())
        LinearSVC_classifier.train(train_fold)
        SVC_Accuracy = nltk.classify.util.accuracy(LinearSVC_classifier, testFeatures) * 100
        SVC_Accu += SVC_Accuracy
        save_classifier = open("GMO_Hanzhe/LinearSVC_classifier10k.pickle", "wb")
        pickle.dump(LinearSVC_classifier, save_classifier)
        save_classifier.close()
        # initialise referenceSets and testSets
        referenceSets = collections.defaultdict(set)
        testSets = collections.defaultdict(set)
        # put correctly labeled sentences in referenceSets and the predicted labels in testSets
        for i, (features, label) in enumerate(testFeatures):
            referenceSets[label].add(i)
            predicted = LogisticRegression_classifier.classify(features)
            testSets[predicted].add(i)
        # 7/5/2015
        ## pos_Precision += (nltk.metrics.precision(referenceSets["pos"], testSets["pos"]))*100
        ## pos_recall += (nltk.metrics.recall(referenceSets["pos"], testSets["pos"]))*100
        ## neg_Precision += (nltk.metrics.precision(referenceSets["neg"], testSets["neg"]))*100
        ## neg_recall += (nltk.metrics.recall(referenceSets["neg"], testSets["neg"]))*100
        ##
        ## precision["pos"] = nltk.metrics.precision(referenceSets["pos"], testSets["pos"])
        ## recall["pos"] = nltk.metrics.recall(referenceSets["pos"], testSets["pos"])
        ## precision["neg"] = nltk.metrics.precision(referenceSets["neg"], testSets["neg"])
        ## recall["neg"] = nltk.metrics.recall(referenceSets["neg"], testSets["neg"])
        ##
        ## save_classifier = open("GMOHedging/BasedNaiveClassifier.pickle","wb")
        ## pickle.dump(BasedNaiveClassifier, save_classifier)
        ## save_classifier.close()
        ### average_precision["pos"] = precision["pos"]
    # get average score for accuracy, precision and recall
    accu = Naive_Accu / num_folds
    # 7/5/2015
# ......... part of the code omitted .........
Example 9: SklearnClassifier
# Required import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
classif = SklearnClassifier(pipeline)
classif.train(zip(trainData, trainLabels))
cf = None
if USE_CHI_SQUARE:
    cf = open("nb_classifier_" + str(gram) + "gram_" + str(size) + "_large", "wb")  # binary mode for pickle
else:
    cf = open("nb_classifier_" + str(gram) + "gram_" + str(size) + "_large_nochi", "wb")
pickle.dump(classif, cf)
matches = 0
mismatches = 0
scores = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
for i in range(len(testLabels)):
    label = classif.classify(testData[i])
    log("test data id: " + str(i), f)
    if label == testLabels[i]:
        matches += 1
        log("matched: label: " + str(label), f)
    else:
        mismatches += 1
        log("mismatched: label: " + str(label) + " was supposed to be: " + str(testLabels[i]), f)
    scores[int(label)] += 1
log("summary of results for: gram: " + str(gram) + " size: " + str(size), f)
log("matches = " + str(matches), f)
log("mismatches = " + str(mismatches), f)
log("guesses = " + repr(scores), f)
log("=" * 20, f)
log("=" * 20, f)
log("=" * 20, f)
Example 10: get_train_features_from_tweets
# Required import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
neg_train, neg_test = neg_tweets[:negcutoff], neg_tweets[negcutoff:]
neg_feats_train = get_train_features_from_tweets(neg_train, 'neg')
pos_feats_train = get_train_features_from_tweets(pos_train, 'pos')
train_feats = neg_feats_train + pos_feats_train
svm_classifier = SklearnClassifier(LinearSVC())
svm_classifier.train(train_feats)
# Evaluation
correct, wrong = 0, 0
for tweet in neg_test:
    features = get_features_from_tweet(tweet)
    result = svm_classifier.classify(features)
    if result == "neg":
        correct += 1
    else:
        wrong += 1
for tweet in pos_test:
    features = get_features_from_tweet(tweet)
    result = svm_classifier.classify(features)
    if result == "pos":
        correct += 1
    else:
        wrong += 1
print("Accuracy: {}".format(correct / float(correct + wrong)))
示例11: print
# 需要导入模块: from nltk.classify.scikitlearn import SklearnClassifier [as 别名]
# 或者: from nltk.classify.scikitlearn.SklearnClassifier import classify [as 别名]
testing_set = nltk.classify.apply_features(extract_features, test_tweets)
for (tweet, sentiment) in test_tweets:
print(classifier.classify(extract_features(tweet)))
print(nltk.classify.accuracy(classifier, testing_set))
classifier.show_most_informative_features(5)
"""
pipeline = Pipeline([('tfidf', TfidfTransformer()),
('chi2', SelectKBest(chi2, k='all')),
('nb', MultinomialNB())])
"""
pipeline = Pipeline([('tfidf', TfidfTransformer()),
('chi2', SelectKBest(chi2, k='all')),
('nb', MultinomialNB())])
classif = SklearnClassifier(pipeline)
classif.train(training_set)
print(classif.labels())
for (tweet, sentiment) in test_tweets:
print(classif.classify(extract_features(tweet)))
print(nltk.classify.accuracy(classif, testing_set))
Example 12: YoutubeVideoClassifier
# Required import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
class YoutubeVideoClassifier(Utility):
    """ Use the collected data as training set and classify test data """

    def __init__(self):
        Utility.__init__(self)
        self.nb_output_file_name = self.config.get("GLOBAL", "nb_output_file")
        self.svm_output_file_name = self.config.get("GLOBAL", "svm_output_file")
        self.nb_output = os.path.join(self.output_dir, self.nb_output_file_name)
        self.svm_output = os.path.join(self.output_dir, self.svm_output_file_name)
        self.train_features = []
        self.stopwords_set = set(stopwords.words("english"))

    def run_main(self):
        self.pre_processing()
        self.feature_extraction()
        self.classification()
        self.testing()

    def pre_processing(self):
        self.load_data()

    def load_data(self):
        self.load_movies()
        self.load_actors()
        self.load_tvshows()
        self.load_test_data()

    def load_movies(self):
        self.movies_list = []
        movies_fd = codecs.open(self.movies_file)
        for movie in movies_fd.readlines():
            if not movie:
                continue
            self.movies_list.append(movie)
        movies_fd.close()

    def load_actors(self):
        self.actors_list = []
        actors_fd = codecs.open(self.actors_file)
        for actor in actors_fd.readlines():
            if not actor:
                continue
            self.actors_list.append(actor)
        actors_fd.close()

    def load_tvshows(self):
        self.tvshows_list = []
        tvshows_fd = codecs.open(self.tvshows_file)
        for tvshow in tvshows_fd.readlines():
            if not tvshow:
                continue
            self.tvshows_list.append(tvshow)
        tvshows_fd.close()

    def load_test_data(self):
        json_data = open(self.test_file)
        self.test_data = json.load(json_data)

    def feature_selection(self, features_list):
        selected_features = []
        for feat in features_list:
            if feat and feat.strip() and feat.lower() not in self.stopwords_set:
                selected_features.append((feat.strip().lower(), True))
        return dict(selected_features)

    def feature_extraction(self):
        for item in self.tvshows_list:
            if not item:
                continue
            selected_features = self.feature_selection(item.replace("_", " ").split(" "))
            self.train_features.append((selected_features, "tvshow"))
        for item in self.movies_list:
            if not item:
                continue
            selected_features = self.feature_selection(item.replace("_", " ").split(" "))
            self.train_features.append((selected_features, "movie"))
        for item in self.actors_list:
            if not item:
                continue
            selected_features = self.feature_selection(item.replace("_", " ").split(" "))
            self.train_features.append((selected_features, "celebrity"))

    def classification(self):
        # Training NB classifier
        self.nb_classifier = NaiveBayesClassifier.train(self.train_features)
        # Training SVM classifier
        self.svm_classifier = SklearnClassifier(LinearSVC())
        self.svm_classifier.train(self.train_features)

    def testing(self):
        nb_fd = codecs.open(self.nb_output, "w", "utf-8")
# ......... part of the code omitted .........
Example 13: RForests
# Required import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
class RForests(text_classifier.TextClassifier):
    def __init__(self, trainDir, labelFile, numTrees=10, numJobs=1):
        self.classifier = None
        self.labelFile = labelFile
        self.trainingDir = trainDir
        self.labels = None
        self.all_words = None
        self.numTrees = numTrees
        self.numJobs = numJobs
        self.classifier = SklearnClassifier(RandomForestClassifier(
            n_estimators=self.numTrees,
            n_jobs=numJobs), sparse=False)
        # self.labels = training.setup(labelFile)
        # self.train()

    def train(self):
        feature_sets = self.getFeatures()
        self.classifier.train(feature_sets)

    """ Determines training error """
    def trainingError(self):
        feature_sets = self.getFeatures()
        p = nltk.classify.accuracy(self.classifier, feature_sets)
        return p

    """ Make sure that the algorithm works on training data using a k-fold
    cross validation scheme """
    def kfoldCrossValidation(self, k):
        feature_sets = self.getFeatures()
        error = 0
        n = len(feature_sets) // k
        for i in range(k):
            self.classifier = SklearnClassifier(RandomForestClassifier(
                n_estimators=self.numTrees), sparse=False)
            # hold out the i-th fold for testing, train on the rest
            test_set = feature_sets[n * i:n * (i + 1)]
            train_set = feature_sets[:n * i] + feature_sets[n * (i + 1):]
            self.classifier.train(train_set)
            error += nltk.classify.accuracy(self.classifier, test_set)
        return error / k

    """ Make sure that the algorithm works on training data using a leave-one-out
    cross validation scheme """
    def leave1OutCrossValidation(self):
        error = 0
        feature_sets = self.getFeatures()
        N = len(feature_sets)
        for i in range(N):
            self.classifier = SklearnClassifier(RandomForestClassifier(
                n_estimators=self.numTrees), sparse=False)
            train_set1, test_set, train_set2 = feature_sets[:i], feature_sets[i], feature_sets[i + 1:]
            train_set = train_set1 + train_set2
            test_set = [test_set]
            self.classifier.train(train_set)  # train on everything except the held-out instance
            p = nltk.classify.accuracy(self.classifier, test_set)
            error += p
        return error / N

    """ Construct a learning curve to see if there is overfitting """
    def learningCurve(self, numTrials=4):
        accuracies = []
        feature_sets = self.getFeatures()
        for k in range(1, len(feature_sets) - 1):
            total = 0
            for i in range(numTrials):
                self.classifier = SklearnClassifier(RandomForestClassifier(
                    n_estimators=self.numTrees),
                    sparse=False)
                random.shuffle(feature_sets)
                train_set, test_set = feature_sets[:k], feature_sets[k:]
                self.classifier.train(train_set)
                p = nltk.classify.accuracy(self.classifier, test_set)
                print(len(train_set), len(test_set), p)
                total += p
            accuracies.append(total / numTrials)
        return accuracies

    """ Train on only k features and return training labels and predicted labels """
    def testClassify(self, k):
        feature_sets = self.getFeatures()
        random.shuffle(feature_sets)
        self.classifier = SklearnClassifier(RandomForestClassifier(
            n_estimators=self.numTrees), sparse=False)
        self.classifier.train(feature_sets[k:])
        features, ref_labels = zip(*feature_sets[:k])
        pred_labels = self.classifier.classify_many(features)  # classify_many replaces the deprecated batch_classify
        return ref_labels, pred_labels

    """ nltk confusion matrix """
    def confusionMatrix(self, ref, test):
        ref.sort(key=lambda x: x[0])
        test.sort(key=lambda x: x[0])
        _, ref_labels = zip(*ref)
        _, test_labels = zip(*test)
        cm = ConfusionMatrix(ref_labels, test_labels)
        return cm

    def prob_classify(self, db, fastain):
# ......... part of the code omitted .........
Example 14: main
# Required import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
def main():
    parser = get_argparser()
    args = parser.parse_args()
    util.DPRINT = args.dprint
    featureset_name = os.path.basename(args.featurefn).split('.')[0]
    features.load_featurefile(args.featurefn)
    ## default is 1e-4.
    THETOL = 1e-3
    classifier_pairs = []
    classifier_pairs.append(("MFS", learn.MFSClassifier()))
    classifier = SklearnClassifier(LogisticRegression(C=1,
                                                      penalty='l2',
                                                      tol=THETOL))
    classifier_pairs.append(("maxent-l2-c1", classifier))
    stamp = util.timestamp()
    for fn in glob(args.testset + "/*data"):
        problems = semeval_testset.extract_wsd_problems(fn)
        w = problems[0][0]
        assert w.endswith(".n")
        w = w[:-2]
        load_training_for_word(w, args.bitextfn, args.alignfn, args.annotatedfn)
        bestoutfn = args.outputdir + "/{0}.{1}.best".format(w, "es")
        oofoutfn = args.outputdir + "/{0}.{1}.oof".format(w, "es")
        if os.path.exists(bestoutfn):
            os.remove(bestoutfn)
        if os.path.exists(oofoutfn):
            os.remove(oofoutfn)
        training = None
        for problem in problems:
            w = problem[0]
            assert w.endswith(".n")
            w = w[:-2]
            print(problem)
            if training is None:
                training = trainingdata.trainingdata_for(w, nonnull=True)
                print("got {0} instances for {1}".format(len(training), w))
                labels = set(label for (feat, label) in training)
                if len(training) == 0:
                    print("no samples for", w)
                    break
                if len(labels) < 2:
                    print("there's only one sense for", w, " and it is ", labels)
                    break
                classifier.train(training)
            rawtext = problem[2]
            surface, index = semeval_testset.head_surface_and_index(rawtext)
            replaced = re.sub(r"<head>(.*)</head>", " \\1 ", rawtext)
            annotated = preprocessing.preprocess(replaced, "en")
            sentence = [token.lemma for token in annotated]
            focus_index = find_head_token_index(annotated, surface, index)
            feats = features.extract_untagged(sentence, annotated, focus_index)
            bestoutfn = args.outputdir + "/{0}.{1}.best".format(w, "es")
            oofoutfn = args.outputdir + "/{0}.{1}.oof".format(w, "es")
            with open(bestoutfn, "a") as bestoutfile, \
                 open(oofoutfn, "a") as oofoutfile:
                answer = classifier.classify(feats)
                print(answer)
                dist = classifier.prob_classify(feats)
                oof_answers = topfive(dist)
                print(output_one_best(problem, "es", answer), file=bestoutfile)
                print(output_five_best(problem, "es", oof_answers), file=oofoutfile)
Example 15: label_feats_from_corpus
# Required import: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or: from nltk.classify.scikitlearn.SklearnClassifier import classify [as alias]
""" This is a demo of the scikit-learn classifier from the NLTK
package, using the movie reviews corpus """
from nltk.corpus import movie_reviews
from featx import *
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression
from nltk.classify.util import accuracy
from nltk import word_tokenize

lfeats = label_feats_from_corpus(movie_reviews)  # extract the features and the label (neg/pos) associated with each review
train_feats, test_feats = split_label_feats(lfeats, split=0.75)  # split labeled feature sets into training and test feats; see featx.py
sk_classifier = SklearnClassifier(LogisticRegression())
sk_classifier.train(train_feats)  # train the classifier
print("The associated accuracy for this classifier on the data is:")
print(accuracy(sk_classifier, test_feats))
while True:
    text = input("Enter your fake tweet using only words: \n")
    test = bag_of_words(word_tokenize(text))  # convert the text into a bag of words; see featx.py
    print("Sentiment:")
    print(sk_classifier.classify(test))
    control = input("press any key to continue, 'q' to quit: ")
    if control == "q":
        break