本文整理汇总了Python中nltk.classify.scikitlearn.SklearnClassifier.batch_classify方法的典型用法代码示例。如果您正苦于以下问题:Python SklearnClassifier.batch_classify方法的具体用法?Python SklearnClassifier.batch_classify怎么用?Python SklearnClassifier.batch_classify使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类nltk.classify.scikitlearn.SklearnClassifier
的用法示例。
在下文中一共展示了SklearnClassifier.batch_classify方法的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: SVMTweetClassifier
# 需要导入模块: from nltk.classify.scikitlearn import SklearnClassifier [as 别名]
# 或者: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as 别名]
class SVMTweetClassifier(TweetClassifier):
"""
A simple Naive Bayes classifier. Documents are tokenized and stemmed, and then converted to bag-of-words format.
The preprocessed documents are then handled by NLTKs Naive Bayes classifier.
"""
def __init__(self, trainfile=None, datafile=None, outfile=None):
super(SVMTweetClassifier, self).__init__(trainfile, datafile, outfile)
self.dictionary = SimpleDict()
self.scores = {}
self.stemmer = PorterStemmer()
def getFeatures(self, tweet):
"""
Replace this method to select different features than just bag-of-words representation of the whole tweet.
This is probably the one piece of code we should work on most, since features basically decide whether we have a good or bad classifier.
"""
return self.getFeatures2(tweet)
#tokens = string.lower(tweet.tweet.translate(string.maketrans("",""), string.punctuation)).split(" ")
#tokens = [self.stemmer.stem(token) for token in tokens]
#tokens = [token for token in tokens if not token[0:4] == "http"] #remove links
#for stop in STOPWORDS:
# if stop in tokens:
# tokens.remove(stop)
#return self.dictionary.doc2bow(tokens, True)
def getFeatures2(self, tweet):
"""
POS tag and take only nouns, verbs and adjectives
"""
text = nltk.word_tokenize(tweet.tweet)
return self.dictionary.doc2bow([pos for pos in nltk.pos_tag(text) if pos[1] in ["NN","JJ","JJR","JJS","VBD","VBG","VBN" ,"VBP","VBZ" ,"RB"] ])
def train(self, trainfile=None):
self.readTrainingData((trainfile or self.trainfile))
print "getting features.."
# the classifier expects a list of (feature_set, label) elements, where each feature_set is a dictionary of {feature_name: value, ...} mappings
train_set = [(self.getFeatures(tweet), tweet.sentiment) for tweet in self.trainingTweets]
print train_set
print "training SVM classifier"
self.classifier = SklearnClassifier(SVC(), sparse=True).train(train_set)
def classifyTweets(self, datafile=None, outfile=None):
print "reading dataset"
self.readDataset(datafile)
print "classifying Tweets with SVM classifier"
# we use prob_classify so we can remember the scores. This means we could later on judge the certainty of a measurement, and if it's low, change the sentiment.
res = self.classifier.batch_classify([self.getFeatures(tweet) for tweet in self.evalTweets])
print "assigning sentiments"
for idx, tweet in enumerate(self.evalTweets):
tweet.sentiment = res[idx]
#self.scores[(tweet.id1,tweet.id2)] = res
#tweet.sentiment = res.max()
self.writeResults(outfile)
示例2: getAccuracy
# 需要导入模块: from nltk.classify.scikitlearn import SklearnClassifier [as 别名]
# 或者: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as 别名]
def getAccuracy(self, classifier):
    """Estimate accuracy of *classifier* by n-fold cross validation.

    The given scikit-learn estimator is wrapped in an NLTK
    SklearnClassifier, trained and evaluated once per fold, and the mean
    accuracy over self.n_fold folds is returned.
    """
    wrapped = SklearnClassifier(classifier)
    fold_scores = []
    for fold in range(self.n_fold):
        log(str(fold + 1) + " iteration...")
        log(" Partitioning...")
        datacv = self.getCrossValidationData(self.tweets, fold)
        train_part = datacv[0]
        test_part = datacv[1]
        gold_labels = datacv[2]
        log(" Training...")
        wrapped.train(train_part)
        log(" Classifying...")
        predictions = wrapped.batch_classify(test_part)
        score = accuracy_score(gold_labels, predictions)
        log(" Accuracy for this iteration: " + str(score))
        fold_scores.append(score)
    return sum(fold_scores) / self.n_fold
示例3: score
# 需要导入模块: from nltk.classify.scikitlearn import SklearnClassifier [as 别名]
# 或者: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as 别名]
def score(classifier):
    """Train *classifier* (a scikit-learn estimator) on the enclosing
    scope's `trainset` and return its accuracy on the `test` split."""
    wrapped = SklearnClassifier(classifier)
    wrapped.train(trainset)
    predictions = wrapped.batch_classify(test)
    return accuracy_score(tag_test, predictions)
示例4: range
# 需要导入模块: from nltk.classify.scikitlearn import SklearnClassifier [as 别名]
# 或者: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as 别名]
# Classify the held-out features with the previously trained Naive Bayes
# classifier `cf` and show its confusion matrix.
pred_NB=cf.batch_classify(test_feat)
#results=[cf.classify(test[a][0]) for a in range(size)]
#gold=[test[a][1] for a in range(size)]
cm_NB=nltk.ConfusionMatrix(test_tag,pred_NB)
print cm_NB.pp(sort_by_count=True, show_percents=False, truncate=10)
#create structures for classification
test_doc=[a[0] for a in test]
#build, train, and test classifiers
from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier
# Linear-kernel SVM wrapped so it accepts NLTK-style feature dicts.
sv=SklearnClassifier(LinearSVC())
sv.train(train)
#note that train performance matches tmp.sum()
pred_train_sv=sv.batch_classify(train_feat)
# result below is built but discarded (neither stored nor printed)
nltk.ConfusionMatrix(train_tag,pred_train_sv)
#also test performance matches tmp_test.sum()
pred_sv=sv.batch_classify(test_feat)
#confusion matrices
cmsv=nltk.ConfusionMatrix(test_tag,pred_sv)
print cmsv.pp(sort_by_count=True, show_percents=False, truncate=5)
#some SklearnClassifier internals
# _convert vectorizes NLTK feature dicts into sklearn's input matrix and
# _label_index maps label -> integer class id.
# NOTE(review): these underscore-prefixed members are private NLTK API and
# may change between versions.
featsets, labs = zip(*train)
X = sv._convert(featsets)
import numpy
y=numpy.array([sv._label_index[l] for l in labs])
#then to train one would use sv._clf.fit(X,y)
#-------------------------------------
#To vectorize/classify all in sklearn
示例5: evaluate
# 需要导入模块: from nltk.classify.scikitlearn import SklearnClassifier [as 别名]
# 或者: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as 别名]
#.........这里部分代码省略.........
# if tt == 0:
# print 'processing train'
# else:
# print 'processing test'
# for each question in the split
for qid in split:
q = split[qid]
ave = zeros((d, 1))
words = zeros((d, 1))
bow = []
count = 0.0
curr_ave = None
curr_words = None
# for each sentence in the question, generate features
for i in range(0, len(q)):
try:
tree = q[i]
except:
continue
curr_feats = {}
if rnn_feats:
forward_prop(None, params, tree, d, labels=False)
# features: average of hidden representations and average of word embeddings
for ex, node in enumerate(tree.get_nodes()):
if node.word not in stop:
ave += node.p_norm
words += node.vec
count += 1.0
if count > 0:
curr_ave = ave / count
curr_words = words / count
featvec = concatenate([curr_ave.flatten(), curr_words.flatten()])
# add QANTA's features to the current feature set
for dim, val in ndenumerate(featvec):
curr_feats["__" + str(dim)] = val
# add unigram indicator features to the current feature set
if bow_feats:
bow += [l.word for l in tree.get_nodes()]
for word in bow:
curr_feats[word] = 1.0
# add dependency relation indicator features to the current feature set
if rel_feats:
for l in tree.get_nodes():
if len(l.parent) > 0:
par, rel = l.parent[0]
this_rel = l.word + "__" + rel + "__" + tree.get(par).word
curr_feats[this_rel] = 1.0
if tt == 0:
train_feats.append((curr_feats, tree.ans.lower()))
else:
test_feats.append((curr_feats, tree.ans.lower()))
test_ord.append(tree)
# print 'total training instances:', len(train_feats)
# print 'total testing instances:', len(test_feats)
# can modify this classifier / do grid search on regularization parameter using sklearn
classifier = SklearnClassifier(LogisticRegression(C=10))
classifier.train(train_feats)
print "accuracy train:", nltk.classify.util.accuracy(classifier, train_feats)
print "accuracy test:", nltk.classify.util.accuracy(classifier, test_feats)
print ""
# finer-grained evaluation, see how well QANTA does at each sentence position
pred = classifier.batch_classify([fs for (fs, l) in test_feats])
count_dists = Counter()
corr_dists = Counter()
for ind, tree in enumerate(test_ord):
curr_dist = tree.dist
count_dists[curr_dist] += 1.0
label = tree.ans
if label == pred[ind]:
corr_dists[curr_dist] += 1.0
prob_dists = {}
print "sentence position: correctly answered at that position, total sentences at that position,", "accuracy"
for key in corr_dists:
prob_dists[key] = corr_dists[key] / count_dists[key]
print key, ": ", corr_dists[key], count_dists[key], prob_dists[key]
示例6: __init__
# 需要导入模块: from nltk.classify.scikitlearn import SklearnClassifier [as 别名]
# 或者: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as 别名]
#.........这里部分代码省略.........
#NaiveBayes Classfication
self.NaiveBayesClassification(self.train_features, self.test_features)
#Support vecotr machine - Linear support vector classification
self.SVMClassification(self.train_features, self.test_features)
self.testing("75-25")
def NaiveBayesClassification(self, train_features, test_features):
    """Train NLTK's Naive Bayes classifier on *train_features* and keep
    it on the instance as self.nb_classifier.

    *test_features* is accepted for interface symmetry with
    SVMClassification but is unused here: accuracy reporting and
    most-informative-features inspection are disabled, and evaluation
    happens later in testing().
    """
    self.nb_classifier = NaiveBayesClassifier.train(train_features)
def SVMClassification(self, train_features, test_features):
    """Train a linear-kernel SVM (LinearSVC) on *train_features*, store
    it as self.svm_classifier, and run it over *test_features*.

    The classification_report over the predictions is kept disabled;
    detailed evaluation happens later in testing().
    """
    self.svm_classifier = SklearnClassifier(LinearSVC())
    self.svm_classifier.train(train_features)
    # Split the (featureset, label) pairs into parallel lists so the
    # classifier can consume the feature dicts in one batch call.
    feature_dicts = [pair[0] for pair in test_features]
    gold_labels = [pair[1] for pair in test_features]
    predictions = self.svm_classifier.batch_classify(feature_dicts)
    #print classification_report(gold_labels, predictions, labels=['pos','neg'], target_names=['pos', 'neg'])
def testing(self, iteration):
#Findng precision, recall and f measures for both classifier
#Naive Bayes classification
print "NAIVE BAYES - ITERATION %s" % (iteration)
actual_pol_dict, predicted_pol_dict = self.get_actual_and_predicted_polarity_dict(self.nb_classifier)
pos_precision, neg_precision = self.find_precision(actual_pol_dict, predicted_pol_dict)
pos_recall, neg_recall = self.find_recall(actual_pol_dict, predicted_pol_dict)
self.find_fmeasure(pos_precision, neg_precision, pos_recall, neg_recall)
print "SVM - Linear SVC - ITERATION %s" % (iteration)
#Support Vector Machine - Linear SVC classification
actual_pol_dict, predicted_pol_dict = self.get_actual_and_predicted_polarity_dict(self.svm_classifier)
pos_precision, neg_precision = self.find_precision(actual_pol_dict, predicted_pol_dict)
pos_recall, neg_recall = self.find_recall(actual_pol_dict, predicted_pol_dict)
self.find_fmeasure(pos_precision, neg_precision, pos_recall, neg_recall)
def cross_validation(self):
#10 fold cross validation for both classifiers
pos_feats_count = len(self.selected_pos_feats)
neg_feats_count = len(self.selected_neg_feats)
pos_feats_fold_size = int(pos_feats_count / 10)
neg_feats_fold_size = int(neg_feats_count / 10)
for a in range(10):
开发者ID:albin-sayonetech,项目名称:SentimentAnalaysisOfMovierReviews,代码行数:70,代码来源:movie_sentimental_analysis.py
示例7: MNBayes
# 需要导入模块: from nltk.classify.scikitlearn import SklearnClassifier [as 别名]
# 或者: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as 别名]
#.........这里部分代码省略.........
feature_sets = self.getFeatures()
p = nltk.classify.accuracy(self.classifier,feature_sets)
return p
""" Make sure that the algorithm works on training data using a k fold
cross validation scheme """
def kfoldCrossValidation(self, k):
    """Estimate generalization accuracy with k-fold cross validation.

    For each of the *k* folds a fresh multinomial Naive Bayes model is
    trained on the remaining k-1 folds and evaluated on the held-out
    fold; the mean held-out accuracy is returned.

    Fixes over the original: the model is now trained on the training
    folds only (it previously trained on the full data set, including
    the test fold, which leaked the test data); the test fold is sliced
    with fold-sized offsets (the original mixed item indices with fold
    indices in `feature_sets[i+1:]`); and the mean accuracy over all
    folds is returned instead of only the last fold's score.
    """
    feature_sets = self.getFeatures()
    n = len(feature_sets) // k  # fold size; floor division works on Py2 and Py3
    total = 0.0
    for i in range(k):
        # fresh, untrained model for every fold
        self.classifier = SklearnClassifier(MultinomialNB())
        test_set = feature_sets[n * i:n * (i + 1)]
        train_set = feature_sets[:n * i] + feature_sets[n * (i + 1):]
        self.classifier.train(train_set)
        total += nltk.classify.accuracy(self.classifier, test_set)
    return total / k
""" Make sure that the algorithm works on training data using a leave one out
cross validation scheme """
def leave1OutCrossValidation(self):
    """Leave-one-out cross validation.

    Each example is held out once; a fresh multinomial Naive Bayes model
    is trained on the remaining N-1 examples and scored on the held-out
    one. Returns the mean accuracy (fraction classified correctly).

    Fix over the original: the model is trained on the N-1 training
    examples only -- it previously trained on the full set, including
    the held-out example, which made the estimate meaningless.
    """
    feature_sets = self.getFeatures()
    N = len(feature_sets)
    correct = 0.0
    for i in range(N):
        # fresh, untrained model for every held-out example
        self.classifier = SklearnClassifier(MultinomialNB())
        train_set = feature_sets[:i] + feature_sets[i + 1:]
        test_set = [feature_sets[i]]
        self.classifier.train(train_set)
        correct += nltk.classify.accuracy(self.classifier, test_set)
    return correct / N
""" Construct a learning curve to see if there is overfitting"""
def learningCurve(self, numTrials=4):
    """Build a learning curve to diagnose over/under-fitting.

    For every training-set size k (1 .. N-2) the data is shuffled, the
    first k examples are used for training and the rest for testing,
    and the mean test accuracy over *numTrials* shuffles is recorded.
    Returns the list of mean accuracies, indexed by k-1.

    Fix over the original: the classifier is trained on the k-example
    training split only. It previously trained on the full shuffled
    data set, so every point on the curve leaked its own test data
    (the RForests.learningCurve twin in this file trains on train_set,
    confirming the intended behavior).
    """
    accuracies = []
    feature_sets = self.getFeatures()
    for k in xrange(1, len(feature_sets) - 1):
        total = 0.0
        for _ in xrange(numTrials):
            # fresh, untrained model for every trial
            self.classifier = SklearnClassifier(MultinomialNB())
            random.shuffle(feature_sets)
            train_set, test_set = feature_sets[:k], feature_sets[k:]
            self.classifier.train(train_set)
            total += nltk.classify.accuracy(self.classifier, test_set)
        accuracies.append(total / numTrials)
    return accuracies
""" Train on only k features and return training labels and predicted labels """
def testClassify(self, k):
    """Hold out the first *k* shuffled examples, train a fresh
    multinomial Naive Bayes model on the rest, and return
    (reference_labels, predicted_labels) for the held-out examples."""
    examples = self.getFeatures()
    random.shuffle(examples)
    holdout, remainder = examples[:k], examples[k:]
    self.classifier = SklearnClassifier(MultinomialNB())
    self.classifier.train(remainder)
    held_features, gold = zip(*holdout)
    return gold, self.classifier.classify_many(held_features)
""" nltk confusion matrix """
def confusionMatrix(self, ref, test):
    """Build an nltk ConfusionMatrix from two (id, label) lists.

    Both input lists are sorted in place by their first element so the
    label sequences line up before comparison; *ref* supplies the gold
    labels, *test* the predicted ones.
    """
    by_id = lambda pair: pair[0]
    ref.sort(key=by_id)
    test.sort(key=by_id)
    gold = [pair[1] for pair in ref]
    predicted = [pair[1] for pair in test]
    return ConfusionMatrix(gold, predicted)
""" Classifies proteins based on its text """
def classify(self,db,fastain):
    """Classify proteins based on the text associated with them.

    Parses *fastain* (FASTA format), looks each protein up in *db* via
    genbank.proteinQuery, turns the retrieved text into features with
    gene_features, and predicts a label with the trained classifier.
    Proteins with no associated text get the placeholder label 'na'.
    Returns (proteinID, label) pairs.

    NOTE(review): assumes the FASTA title is '|'-separated with the
    protein ID in field 5, and that proteinQuery returns at least one
    row (zip(*[]) would raise ValueError) -- confirm with callers.
    """
    proIDs,features,labels = [],[],[]  # `features` is never filled -- vestigial
    prevFeatureset = ''
    prevText = ''
    for seq_record in SeqIO.parse(fastain, "fasta"):
        title = seq_record.id
        toks = title.split("|")
        proteinID = toks[5]
        query_rows = genbank.proteinQuery(proteinID,db)
        ids,text = zip(*query_rows)
        text = ''.join(map(str,text))  # concatenate all text rows into one string
        if text=='':
            label = ['na']  # no text available for this protein
        else:
            text = word_reg.findall(text)  # tokenize via the module-level regex
            featureset = self.gene_features(text)
            # sanity checks: consecutive proteins must not yield identical
            # text or feature sets
            assert text!=prevText
            assert featureset!=prevFeatureset
            prevFeatureset = featureset
            prevText = text
            # batch_classify on a one-element list yields a one-element list
            label = self.classifier.batch_classify([featureset])
        proIDs.append(proteinID)
        labels+=label
    return zip(proIDs,labels)
示例8: word_feats
# 需要导入模块: from nltk.classify.scikitlearn import SklearnClassifier [as 别名]
# 或者: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as 别名]
from sklearn.pipeline import Pipeline

# Movie-review polarity data (Pang & Lee rt-polaritydata layout).
DATA_PATH = '../../datasets/sentiment_analysis/en/rt-polaritydata'

def word_feats(words):
    # NOTE(review): despite the name, this tokenizes with sent_tokenize (a
    # sentence splitter), so each *sentence* becomes one boolean feature;
    # word_tokenize may have been intended -- confirm before changing.
    return dict([(word, True) for word in sent_tokenize(words)])

add_label = lambda lst, lab: [(x, lab) for x in lst]

# TF-IDF weighting -> chi-squared feature selection (top 1000) -> multinomial
# Naive Bayes, wrapped so it can consume NLTK-style feature dicts.
pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k=1000)),
                     ('nb', MultinomialNB())])
classifier = SklearnClassifier(pipeline)

pos = map(word_feats,
          open(os.path.join(DATA_PATH, 'rt-polarity.pos')).readlines())
neg = map(word_feats,
          open(os.path.join(DATA_PATH, 'rt-polarity.neg')).readlines())

# First half of each polarity file is the training set, second half the test
# set. (Python 2: map() returns a list and `len(pos) / 2` floor-divides.)
features = zip(pos[:len(pos) / 2], itertools.repeat("pos")) + \
           zip(neg[:len(neg) / 2], itertools.repeat("neg"))
classifier.train(features)

l_pos = np.array(classifier.batch_classify(pos[len(pos) / 2:]))
l_neg = np.array(classifier.batch_classify(neg[len(neg) / 2:]))
# 2x2 confusion counts: rows = true class (pos, neg), cols = predicted class.
print "Confusion matrix:\n%d\t%d\n%d\t%d" % (
    (l_pos == 'pos').sum(), (l_pos == 'neg').sum(),
    (l_neg == 'pos').sum(), (l_neg == 'neg').sum())
示例9: form
# 需要导入模块: from nltk.classify.scikitlearn import SklearnClassifier [as 别名]
# 或者: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as 别名]
# label set: the 15 dialogue-act classes used by the NPS Chat posts
cls_set = ['Emotion', 'ynQuestion', 'yAnswer', 'Continuer', 'whQuestion', 'System', 'Accept', 'Clarify', 'Emphasis', 'nAnswer', 'Greet', 'Statement', 'Reject', 'Bye', 'Other']
featuresets = [] # list of tuples of the form (features, label_index)
for post in posts: # applying the feature extractor to each post
    # post.get('class') is the label of the current post; it is stored as
    # its integer index into cls_set
    featuresets.append((dialogue_act_features(post.text),cls_set.index(post.get('class'))))
from random import shuffle
shuffle(featuresets)
size = int(len(featuresets) * .1) # 10% is used for the test set
train = featuresets[size:]
test = featuresets[:size]
print(train)
from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier
# SVM with a Linear Kernel and default parameters
classif = SklearnClassifier(LinearSVC())
classif.train(train)
# split the held-out pairs into parallel feature / gold-label lists
test_skl = []
t_test_skl = []
for d in test:
    test_skl.append(d[0])
    t_test_skl.append(d[1])
# run the classifier on the test set
p = classif.batch_classify(test_skl)
from sklearn.metrics import classification_report
# getting a full report
# NOTE(review): labels= is restricted to the classes observed in the test
# split while target_names lists all 15 -- sklearn may mismatch names to
# labels if some classes are absent; verify.
print(classification_report(t_test_skl, p, labels=list(set(t_test_skl)),target_names=cls_set))
示例10: RForests
# 需要导入模块: from nltk.classify.scikitlearn import SklearnClassifier [as 别名]
# 或者: from nltk.classify.scikitlearn.SklearnClassifier import batch_classify [as 别名]
class RForests(text_classifier.TextClassifier):
def __init__(self, trainDir, labelFile, numTrees=10, numJobs=1):
    """Set up a random-forest text classifier.

    trainDir  -- directory holding the training documents
    labelFile -- file mapping documents to labels
    numTrees  -- number of trees in the forest (default 10)
    numJobs   -- parallel jobs used when fitting (default 1)
    """
    self.labelFile = labelFile
    self.trainingDir = trainDir
    self.labels = None      # populated later by the training setup
    self.all_words = None   # vocabulary, populated later
    self.numTrees = numTrees
    self.numJobs = numJobs
    # Dense-input NLTK wrapper around sklearn's random forest.
    forest = RandomForestClassifier(n_estimators=self.numTrees, n_jobs=numJobs)
    self.classifier = SklearnClassifier(forest, sparse=False)
    # Label loading and training are deliberately left to the caller:
    #   self.labels = training.setup(labelFile); self.train()
def train(self):
    """Fit the wrapped random forest on the full extracted feature set."""
    self.classifier.train(self.getFeatures())
""" Determines training error"""
def trainingError(self):
    """Return the classifier's accuracy on its own training data
    (a resubstitution estimate, not generalization accuracy)."""
    return nltk.classify.accuracy(self.classifier, self.getFeatures())
""" Make sure that the algorithm works on training data using a k fold
cross validation scheme """
def kfoldCrossValidation(self, k):
    """Estimate generalization accuracy with k-fold cross validation.

    For each of the *k* folds a fresh random forest is trained on the
    remaining k-1 folds and evaluated on the held-out fold; the mean
    held-out accuracy is returned.

    Fixes over the original: the model is now trained on the training
    folds only (it previously trained on the full data set, including
    the test fold, which leaked the test data); the test fold is sliced
    with fold-sized offsets (the original mixed item indices with fold
    indices in `feature_sets[i+1:]`); and the mean accuracy over all
    folds is returned instead of only the last fold's score.
    """
    feature_sets = self.getFeatures()
    n = len(feature_sets) // k  # fold size; floor division works on Py2 and Py3
    total = 0.0
    for i in range(k):
        # fresh, untrained forest for every fold
        self.classifier = SklearnClassifier(RandomForestClassifier(
            n_estimators=self.numTrees), sparse=False)
        test_set = feature_sets[n * i:n * (i + 1)]
        train_set = feature_sets[:n * i] + feature_sets[n * (i + 1):]
        self.classifier.train(train_set)
        total += nltk.classify.accuracy(self.classifier, test_set)
    return total / k
""" Make sure that the algorithm works on training data using a leave one out
cross validation scheme """
def leave1OutCrossValidation(self):
    """Leave-one-out cross validation.

    Each example is held out once; a fresh random forest is trained on
    the remaining N-1 examples and scored on the held-out one. Returns
    the mean accuracy (fraction classified correctly).

    Fix over the original: the model is trained on the N-1 training
    examples only -- it previously trained on the full set, including
    the held-out example, which made the estimate meaningless.
    """
    feature_sets = self.getFeatures()
    N = len(feature_sets)
    correct = 0.0
    for i in range(N):
        # fresh, untrained forest for every held-out example
        self.classifier = SklearnClassifier(RandomForestClassifier(
            n_estimators=self.numTrees), sparse=False)
        train_set = feature_sets[:i] + feature_sets[i + 1:]
        test_set = [feature_sets[i]]
        self.classifier.train(train_set)
        correct += nltk.classify.accuracy(self.classifier, test_set)
    return correct / N
""" Construct a learning curve to see if there is overfitting"""
def learningCurve(self,numTrials=4):
accuracies = []
feature_sets = self.getFeatures()
for k in xrange(1,len(feature_sets)-1):
total = 0
for i in xrange(numTrials):
self.classifier = SklearnClassifier(RandomForestClassifier(
n_estimators=self.numTrees),
sparse=False)
random.shuffle(feature_sets)
train_set,test_set = feature_sets[:k],feature_sets[k:]
self.classifier.train(train_set)
p = nltk.classify.accuracy(self.classifier,test_set)
print len(train_set),len(test_set),p
total+=p
accuracies.append(total/numTrials)
return accuracies
""" Train on only k features and return training labels and predicted labels """
def testClassify(self, k):
    """Shuffle the data, train a fresh random forest on all but the
    first *k* examples, and return (reference_labels, predicted_labels)
    for those k held-out examples."""
    data = self.getFeatures()
    random.shuffle(data)
    holdout, remainder = data[:k], data[k:]
    self.classifier = SklearnClassifier(
        RandomForestClassifier(n_estimators=self.numTrees), sparse=False)
    self.classifier.train(remainder)
    held_features, gold = zip(*holdout)
    return gold, self.classifier.batch_classify(held_features)
""" nltk confusion matrix """
def confusionMatrix(self, ref, test):
    """Return an nltk ConfusionMatrix comparing the labels in *ref*
    (gold) and *test* (predicted).

    Both (id, label) lists are sorted in place by id first so the two
    label sequences align.
    """
    ref.sort(key=lambda item: item[0])
    test.sort(key=lambda item: item[0])
    gold_seq = tuple(r[1] for r in ref)
    pred_seq = tuple(t[1] for t in test)
    return ConfusionMatrix(gold_seq, pred_seq)
def prob_classify(self,db,fastain):
#.........这里部分代码省略.........