This article collects typical usage examples of the Python method nltk.classify.scikitlearn.SklearnClassifier.prob_classify. If you have been wondering what SklearnClassifier.prob_classify does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also explore the containing class, nltk.classify.scikitlearn.SklearnClassifier, for further usage examples.
Three code examples of SklearnClassifier.prob_classify are shown below, ordered by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
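Before the full examples, a minimal self-contained sketch of the method's contract may help (the tiny feature dicts below are invented purely for illustration). SklearnClassifier.train expects a list of (feature-dict, label) pairs, and prob_classify returns an NLTK probability distribution; note that it only works when the wrapped scikit-learn estimator supports predict_proba, as LogisticRegression does.
# Minimal sketch of the prob_classify contract. The toy feature sets
# below are invented purely for illustration.
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression

train_set = [
    ({'contains(good)': True}, 'pos'),
    ({'contains(great)': True}, 'pos'),
    ({'contains(bad)': True}, 'neg'),
    ({'contains(awful)': True}, 'neg'),
]
classifier = SklearnClassifier(LogisticRegression()).train(train_set)

# prob_classify returns an NLTK probability distribution: .prob(label)
# gives the per-label probability and .max() the most likely label.
dist = classifier.prob_classify({'contains(good)': True})
for label in sorted(dist.samples(), key=dist.prob, reverse=True):
    print(label, dist.prob(label))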
Example 1: RForests
# Required imports: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or alternatively: from nltk.classify.scikitlearn.SklearnClassifier import prob_classify [as alias]
class RForests(text_classifier.TextClassifier):
    def __init__(self, trainDir, labelFile, numTrees=10, numJobs=1):
        self.labelFile = labelFile
        self.trainingDir = trainDir
        self.labels = None
        self.all_words = None
        self.numTrees = numTrees
        self.numJobs = numJobs
        self.classifier = SklearnClassifier(RandomForestClassifier(
            n_estimators=self.numTrees,
            n_jobs=numJobs), sparse=False)
        #self.labels = training.setup(labelFile)
        #self.train()
def train(self):
feature_sets = self.getFeatures()
self.classifier.train(feature_sets)
""" Determines training error"""
def trainingError(self):
feature_sets = self.getFeatures()
p = nltk.classify.accuracy(self.classifier,feature_sets)
return p
""" Make sure that the algorithm works on training data using a k fold
cross validation scheme """
def kfoldCrossValidation(self,k):
feature_sets = self.getFeatures()
error = 0
for i in range(k):
self.classifier = SklearnClassifier(RandomForestClassifier(
n_estimators=self.numTrees),sparse=False)
n = len(feature_sets)/k
train_set,test_set = feature_sets[:n*i],feature_sets[n*i:]
test_set1 = feature_sets[:n*i]
train_set = feature_sets[n*i:n*(i+1)]
test_set2 = feature_sets[i+1:]
test_set = test_set1+test_set2
self.classifier.train(feature_sets)
p = nltk.classify.accuracy(self.classifier,test_set)
return p
""" Make sure that the algorithm works on training data using a leave one out
cross validation scheme """
def leave1OutCrossValidation(self):
error = 0
feature_sets = self.getFeatures()
N = len(feature_sets)
for i in range(N):
self.classifier = SklearnClassifier(RandomForestClassifier(
n_estimators=self.numTrees),sparse=False)
train_set1,test_set,train_set2 = feature_sets[:i],feature_sets[i],feature_sets[i+1:]
train_set = train_set1+train_set2
test_set = [test_set]
self.classifier.train(feature_sets)
p = nltk.classify.accuracy(self.classifier,test_set)
error+=p
return error/N
""" Construct a learning curve to see if there is overfitting"""
def learningCurve(self,numTrials=4):
accuracies = []
feature_sets = self.getFeatures()
for k in xrange(1,len(feature_sets)-1):
total = 0
for i in xrange(numTrials):
self.classifier = SklearnClassifier(RandomForestClassifier(
n_estimators=self.numTrees),
sparse=False)
random.shuffle(feature_sets)
train_set,test_set = feature_sets[:k],feature_sets[k:]
self.classifier.train(train_set)
p = nltk.classify.accuracy(self.classifier,test_set)
print len(train_set),len(test_set),p
total+=p
accuracies.append(total/numTrials)
return accuracies
""" Train on only k features and return training labels and predicted labels """
def testClassify(self,k):
feature_sets = self.getFeatures()
random.shuffle(feature_sets)
self.classifier = SklearnClassifier(RandomForestClassifier(
n_estimators=self.numTrees),sparse=False)
self.classifier.train(feature_sets[k:])
features,ref_labels = zip(*feature_sets[:k])
pred_labels = self.classifier.batch_classify(features)
return ref_labels,pred_labels
""" nltk confusion matrix """
def confusionMatrix(self,ref,test):
ref.sort(key=lambda x: x[0])
test.sort(key=lambda x: x[0])
_,ref_labels = zip(*ref)
_,test_labels = zip(*test)
cm = ConfusionMatrix(ref_labels, test_labels)
return cm
    def prob_classify(self, db, fastain):
        #......... part of the code omitted here .........
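The hand-rolled fold slicing in kfoldCrossValidation is easy to get wrong; as a cross-check, here is a sketch of the same evaluation built on scikit-learn's KFold to generate the indices. It assumes a getFeatures()-style list of (feature-dict, label) pairs.
# Sketch: k-fold evaluation using sklearn.model_selection.KFold to
# generate fold indices. `feature_sets` is assumed to be a list of
# (feature-dict, label) pairs, as returned by getFeatures() above.
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from nltk.classify.scikitlearn import SklearnClassifier
import nltk.classify

def kfold_accuracy(feature_sets, k=5, num_trees=10):
    total = 0.0
    for train_idx, test_idx in KFold(n_splits=k, shuffle=True).split(feature_sets):
        train_set = [feature_sets[i] for i in train_idx]
        test_set = [feature_sets[i] for i in test_idx]
        clf = SklearnClassifier(RandomForestClassifier(n_estimators=num_trees),
                                sparse=False)
        clf.train(train_set)
        total += nltk.classify.accuracy(clf, test_set)
    return total / k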
Example 2: label_feats_from_tweets
# Required imports: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or alternatively: from nltk.classify.scikitlearn.SklearnClassifier import prob_classify [as alias]
print "creating feature sets..."
tweetlist = tweetTest.loadTwitterCSV('trainingandtestdata/testdata.csv')
labeld_features = label_feats_from_tweets(tweetlist)
#labeld_features = label_feats_from_corpus(movie_reviews)
training_set, test_set = split_label_feats(labeld_features)
# tweetlist = tweetTest.loadTwitterCSV('trainingandtestdata/training.1600000.processed.noemoticon.csv')
# training_set = label_feats_from_tweets(tweetlist)
# training_set, garbage = split_label_feats(training_set, 1.0)
# test_set, garbage = split_label_feats(labeld_features, 1.0)
print "training set length: %i test set length: %i" % (len(training_set), len(test_set))
print prettifyFeatureSet(test_set)
print "training classifier..."
#classifier = NaiveBayesClassifier.train(training_set)
#classifier = MaxentClassifier.train(training_set, algorithm='iis', max_iter=99, min_lldelta=0.01)
#classifier = MaxentClassifier.train(training_set)
classifier = SklearnClassifier(LogisticRegression()).train(training_set)
print "calculating accuracy..."
print 'accuracy:', nltk.classify.util.accuracy(classifier, test_set)
#classifier.show_most_informative_features(30)
negfeat = bag_of_words(['the', 'plot', 'was', 'ludicrous'])
print classifier.classify(negfeat)
probdist = classifier.prob_classify(negfeat)
print "pos: ", probdist.prob('pos'), " neg: ", probdist.prob('neg')
print classifier.labels()
classify_tweet(classifier, "I love this movie!", True)
classify_tweet(classifier, "!!!", True)
Example 3: main
# Required imports: from nltk.classify.scikitlearn import SklearnClassifier [as alias]
# Or alternatively: from nltk.classify.scikitlearn.SklearnClassifier import prob_classify [as alias]
def main():
parser = get_argparser()
args = parser.parse_args()
util.DPRINT = args.dprint
featureset_name = os.path.basename(args.featurefn).split('.')[0]
features.load_featurefile(args.featurefn)
    ## LogisticRegression's default tol is 1e-4; loosen it slightly here.
    THETOL = 1e-3
classifier_pairs = []
classifier_pairs.append(("MFS", learn.MFSClassifier()))
classifier = SklearnClassifier(LogisticRegression(C=1,
penalty='l2',
tol=THETOL))
classifier_pairs.append(("maxent-l2-c1", classifier))
stamp = util.timestamp()
for fn in glob(args.testset + "/*data"):
problems = semeval_testset.extract_wsd_problems(fn)
w = problems[0][0]
assert w.endswith(".n")
w = w[:-2]
load_training_for_word(w, args.bitextfn, args.alignfn, args.annotatedfn)
bestoutfn = args.outputdir + "/{0}.{1}.best".format(w, "es")
oofoutfn = args.outputdir + "/{0}.{1}.oof".format(w, "es")
if os.path.exists(bestoutfn):
os.remove(bestoutfn)
if os.path.exists(oofoutfn):
os.remove(oofoutfn)
training = None
for problem in problems:
w = problem[0]
assert w.endswith(".n")
w = w[:-2]
print(problem)
if training is None:
training = trainingdata.trainingdata_for(w, nonnull=True)
print("got {0} instances for {1}".format(len(training), w))
labels = set(label for (feat,label) in training)
if len(training) == 0:
print("no samples for", w)
break
            if len(labels) < 2:
                print("there's only one sense for", w, "and it is", labels)
                break
classifier.train(training)
rawtext = problem[2]
surface, index = semeval_testset.head_surface_and_index(rawtext)
replaced = re.sub(r"<head>(.*)</head>", " \\1 ", rawtext)
annotated = preprocessing.preprocess(replaced, "en")
sentence = [token.lemma for token in annotated]
focus_index = find_head_token_index(annotated, surface, index)
feats = features.extract_untagged(sentence, annotated, focus_index)
bestoutfn = args.outputdir + "/{0}.{1}.best".format(w, "es")
oofoutfn = args.outputdir + "/{0}.{1}.oof".format(w, "es")
with open(bestoutfn, "a") as bestoutfile, \
open(oofoutfn, "a") as oofoutfile:
answer = classifier.classify(feats)
print(answer)
dist = classifier.prob_classify(feats)
oof_answers = topfive(dist)
print(output_one_best(problem, "es", answer), file=bestoutfile)
print(output_five_best(problem, "es", oof_answers),
file=oofoutfile)
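The topfive helper used above is not shown in this excerpt. A plausible reconstruction, assuming it simply returns the five most probable senses from the NLTK probability distribution (this is a guess at its behavior, not the original code):
# Assumed reconstruction of the missing topfive helper: return the five
# most probable labels from an NLTK probability distribution.
def topfive(dist):
    return sorted(dist.samples(), key=dist.prob, reverse=True)[:5]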