当前位置: 首页>>代码示例>>Python>>正文


Python FeatureExtractor.featurizeFiles方法代码示例

本文整理汇总了Python中feature_extractor.FeatureExtractor.featurizeFiles方法的典型用法代码示例。如果您正苦于以下问题:Python FeatureExtractor.featurizeFiles方法的具体用法?Python FeatureExtractor.featurizeFiles怎么用?Python FeatureExtractor.featurizeFiles使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在feature_extractor.FeatureExtractor的用法示例。


在下文中一共展示了FeatureExtractor.featurizeFiles方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: runOnSplit

# 需要导入模块: from feature_extractor import FeatureExtractor [as 别名]
# 或者: from feature_extractor.FeatureExtractor import featurizeFiles [as 别名]
def runOnSplit(penalties, constants, split):
	"""Fit and evaluate one logistic-regression model per (penalty, C) pair.

	The data under '../data' is featurized once with ``FeatureExtractor(split)``.
	For every combination of ``penalties`` and ``constants`` a model is built
	via ``LogisticRegression().scikit(penalty, C)``, fitted on the training
	portion and evaluated on the dev portion.

	Args:
		penalties: iterable of penalty settings handed to ``scikit``.
		constants: iterable of regularization constants ``C`` handed to ``scikit``.
		split: value passed to ``FeatureExtractor``; also used in the progress
			message and in the ranked-examples file name.

	Side effects:
		* prints progress and the per-class results;
		* writes 'results/scores/LRsplit_<penalty>_<C>.txt';
		* writes 'results/rankedExamples/LRsplit_<split*100>_<penalty>_<C>.txt'.
		Both directories must already exist.

	Requires ``Counter``, ``FeatureExtractor`` and ``LogisticRegression`` (the
	wrapper exposing ``scikit``) to be available in the enclosing module.
	"""
	# The original had this message as a bare string expression, so it was
	# silently discarded; print it as intended.
	print('Running on a ' + str(split*100) + '/' + str((1-split)*100) + ' split')

	fe = FeatureExtractor(split)
	featurized = fe.featurizeFiles('../data')
	classNames = featurized[0]
	trainMatrix, trainLabels = featurized[1:3]
	devMatrix, devLabels = featurized[3:5]
	# trainFiles is not used below; the two-name unpacking is kept so an
	# unexpected return shape still fails loudly here.
	trainFiles, devFiles = featurized[5:]

	# Number of dev examples per class label.
	classCounts = Counter(devLabels)

	for penalty in penalties:
		for C in constants:
			# Single-string form prints the same text as the old
			# multi-argument print statement (note the double space).
			print('\nPenalty, regularization:  ' + str(penalty) + ' ' + str(C))

			abstractModel = LogisticRegression()
			model = abstractModel.scikit(penalty, C)
			model_params = (penalty, C)
			model.fit(trainMatrix, trainLabels)

			score = model.score(devMatrix, devLabels)
			predicted_labels = model.predict(devMatrix)
			probs = model.predict_proba(devMatrix)

			# Misclassified dev examples, counted by their true label.
			errors = Counter(
				actual
				for pred, actual in zip(predicted_labels, devLabels)
				if pred != actual
			)

			# (probability row, file, was the prediction correct?) per dev example.
			rankedExamples = [
				(p, devFile, pred == actual)
				for p, devFile, pred, actual
				in zip(probs, devFiles, predicted_labels, devLabels)
			]

			results = ''
			for i, c in enumerate(classNames):
				if classCounts[i]:
					missRate = str(float(errors[i]) / classCounts[i])
				else:
					# No dev examples of this class: a miss rate is undefined
					# (the original raised ZeroDivisionError here).
					missRate = 'n/a'
				results += '\t' + c + ' error: ' + missRate + '\n'
			results += '\tScore: ' + str(score)

			# Shared '_<penalty>_<C>' suffix for both output files.
			paramSuffix = ''.join('_' + str(param) for param in model_params)

			with open('results/scores/LRsplit' + paramSuffix + '.txt', 'w') as f:
				f.write(results)
			print(results)

			print('..ranking examples')
			if rankedExamples:
				# Order by the first probability column, which is reported
				# below as the probability of classNames[0].
				examples = sorted(rankedExamples, key=lambda e: e[0][0])
				fileName = 'results/rankedExamples/LRsplit_' + str(split*100)
				fileName += paramSuffix + '.txt'
				with open(fileName, 'w') as f:
					for e in examples:
						record = e[1]
						record += '\n\t Probability of class '
						record += classNames[0] + ': '
						record += str(e[0][0])
						record += '\n\t Correct: ' + str(e[2])
						# Terminate each record so consecutive examples do
						# not run together on one line.
						f.write(record + '\n')
开发者ID:bmccann,项目名称:party_predictor,代码行数:68,代码来源:logistic_regression.py

示例2: FeatureExtractor

# 需要导入模块: from feature_extractor import FeatureExtractor [as 别名]
# 或者: from feature_extractor.FeatureExtractor import featurizeFiles [as 别名]
# import vectorizeFiles as VF
from sklearn.neighbors import KNeighborsClassifier#, DistanceMetric
# import numpy as np
# import getFileNames as gf
# import sys
# import scipy
from sklearn import grid_search
from feature_extractor import FeatureExtractor


# Featurize everything under ../data and keep the first three returned items:
# class names, feature matrix and labels.
# NOTE(review): FeatureExtractor(1) presumably means "no held-out dev split";
# confirm against feature_extractor.
fe = FeatureExtractor(1)
featurized = fe.featurizeFiles('../data')
classNames, repubAndDemMatrix, labels = featurized[:3]
# An earlier version obtained the matrix and labels with
# VF.extractWordCounts(True, True, False).

# Candidate values for the grid search: 1 through 10 neighbors.
# Other settings that were considered but are not searched here:
#   n_neighbors up to 30
#   'weights': ('uniform', 'distance'), 'p': [1, 2, 3, 4, 5]
#   'metric': ('euclidean', 'manhattan', 'chebyshev', 'minkowski', 'jaccard',
#              'matching', 'dice', 'kulsinski', 'rogerstanimoto', 'russellrao',
#              'sokalmichener', 'sokalsneath')
parameters = {'n_neighbors': list(range(1, 11))}

kn = KNeighborsClassifier()
clf = grid_search.GridSearchCV(estimator=kn, param_grid=parameters)
clf.fit(repubAndDemMatrix, labels)

# Full description of the winning estimator (verbose).
print(clf.best_estimator_)
# Just the winning parameter setting (more useful).
print(clf.best_params_)
# Cross-validated score of the winning setting.
print(clf.best_score_)
# Score on the same data the search was fitted on (training score).
print(clf.score(repubAndDemMatrix, labels))


# Optimal parameter: 4 neighbors. The reported values are scores (accuracy), not error rates:
# best cross-validation score 0.668573607933, training score 0.828488372093.

# If we shuffle the training data so that it is not all Democrats followed by all Republicans,
# we get an optimal parameter of 1 neighbor, a cross-validation score of .689 and a training score of 1.0.
开发者ID:bmccann,项目名称:party_predictor,代码行数:33,代码来源:KNeighbors.py


注:本文中的feature_extractor.FeatureExtractor.featurizeFiles方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。