本文整理汇总了Python中feature_extractor.FeatureExtractor.featurizeFiles方法的典型用法代码示例。如果您正苦于以下问题：Python FeatureExtractor.featurizeFiles方法的具体用法？Python FeatureExtractor.featurizeFiles怎么用？Python FeatureExtractor.featurizeFiles使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类feature_extractor.FeatureExtractor的用法示例。
在下文中一共展示了FeatureExtractor.featurizeFiles方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: runOnSplit
# 需要导入模块: from feature_extractor import FeatureExtractor [as 别名]
# 或者: from feature_extractor.FeatureExtractor import featurizeFiles [as 别名]
def runOnSplit(penalties, constants, split):
    """Fit and evaluate one logistic-regression model per (penalty, C) pair.

    The corpus under '../data' is featurized once with
    ``FeatureExtractor(split)``; every combination of ``penalties`` and
    ``constants`` is then fitted on the training portion and scored on the
    dev portion.

    Args:
        penalties: iterable of penalty values; each is passed as the first
            argument to ``LogisticRegression().scikit``.
        constants: iterable of regularization constants; each is passed as
            the second argument to ``LogisticRegression().scikit``.
        split: value handed to ``FeatureExtractor``. It is reported as
            ``split*100``/``(1-split)*100``, so it is presumably the training
            fraction in [0, 1] -- TODO confirm against FeatureExtractor.

    Returns:
        None. For every parameter pair the function prints a report and
        writes two files:
        'results/scores/LRsplit_<penalty>_<C>.txt' and, when there is at
        least one dev example,
        'results/rankedExamples/LRsplit_<split*100>_<penalty>_<C>.txt'.
        The target directories are not created here; ``open`` fails if they
        are missing.

    NOTE(review): ``LogisticRegression`` must be the project wrapper that
    offers ``.scikit(penalty, C)``; it is not imported in the visible source.
    """
    # The original built this message as a bare expression and discarded it.
    print("Running on a " + str(split*100) + '/' + str((1-split)*100) + ' split')

    fe = FeatureExtractor(split)
    featurized = fe.featurizeFiles('../data')
    classNames = featurized[0]
    trainMatrix, trainLabels = featurized[1:3]
    devMatrix, devLabels = featurized[3:5]
    # Unpacking (rather than indexing) keeps the check that exactly two
    # file lists follow; the training file list itself is not used.
    _trainFiles, devFiles = featurized[5:]

    # Number of dev examples per true label.
    classCounts = Counter(devLabels)

    for penalty in penalties:
        for C in constants:
            # Joined into one string so the output is the same under the
            # Python 2 print statement and the Python 3 print function.
            print(' '.join(["\nPenalty, regularization: ", str(penalty), str(C)]))

            model = LogisticRegression().scikit(penalty, C)
            model_params = (penalty, C)
            model.fit(trainMatrix, trainLabels)

            score = model.score(devMatrix, devLabels)
            predicted_labels = model.predict(devMatrix)
            probs = model.predict_proba(devMatrix)

            # Misclassified dev examples, keyed by their true label.
            errors = Counter()
            for actual, predicted in zip(devLabels, predicted_labels):
                if predicted != actual:
                    errors[actual] += 1

            # (class probabilities, file, was-the-prediction-correct)
            rankedExamples = [
                (p, devFiles[i], predicted_labels[i] == devLabels[i])
                for i, p in enumerate(probs)
            ]

            # Counts are looked up by position in classNames, i.e. labels
            # are assumed to be indices into classNames.
            reportLines = []
            for i, c in enumerate(classNames):
                if classCounts[i]:
                    missRate = str(float(errors[i]) / classCounts[i])
                else:
                    # No dev example of this class: a rate is undefined
                    # (the original raised ZeroDivisionError here).
                    missRate = 'n/a'
                reportLines.append('\t' + c + ' error: ' + missRate + '\n')
            results = ''.join(reportLines) + '\tScore: ' + str(score)

            paramSuffix = ''.join('_' + str(param) for param in model_params)

            fileName = 'results/scores/LRsplit' + paramSuffix + '.txt'
            with open(fileName, 'w') as f:
                f.write(results)
            print(results)

            print('..ranking examples')
            if rankedExamples:
                # Ascending by the predicted probability of the first class.
                examples = sorted(rankedExamples, key=lambda e: e[0][0])
                fileName = ('results/rankedExamples/LRsplit_' + str(split*100)
                            + paramSuffix + '.txt')
                with open(fileName, 'w') as f:
                    for classProbs, devFile, correct in examples:
                        # The trailing newline keeps consecutive records
                        # from running together on one line.
                        f.write(
                            devFile
                            + '\n\t Probability of class '
                            + classNames[0] + ': '
                            + str(classProbs[0])
                            + '\n\t Correct: ' + str(correct)
                            + '\n'
                        )
示例2: FeatureExtractor
# 需要导入模块: from feature_extractor import FeatureExtractor [as 别名]
# 或者: from feature_extractor.FeatureExtractor import featurizeFiles [as 别名]
# Grid-search the number of neighbours for a k-NN classifier over the
# featurized corpus in '../data' and print the best configuration found.

# import vectorizeFiles as VF
from sklearn.neighbors import KNeighborsClassifier  # , DistanceMetric
# import numpy as np
# import getFileNames as gf
# import sys
# import scipy
try:
    # scikit-learn >= 0.18 provides GridSearchCV in model_selection.
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Older releases only have the grid_search module, which was deprecated
    # in 0.18 and removed in 0.20.
    from sklearn.grid_search import GridSearchCV
from feature_extractor import FeatureExtractor

# NOTE(review): a split of 1 presumably puts every file in the training
# portion -- confirm against FeatureExtractor.
fe = FeatureExtractor(1)
featurized = fe.featurizeFiles('../data')
classNames, repubAndDemMatrix, labels = featurized[:3]
# [repubAndDemMatrix,vectorizerRepubDem,labels]=VF.extractWordCounts(True,True,False)

parameters = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
# Wider grids that were tried or considered:
#   ,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]}
#   'weights':('uniform','distance'), 'p':[1, 2, 3, 4, 5]
#   'metric':('euclidean', 'manhattan','chebyshev','minkowski','jaccard','matching','dice','kulsinski','rogerstanimoto','russellrao','sokalmichener','sokalsneath'),

kn = KNeighborsClassifier()
clf = GridSearchCV(kn, parameters)
clf.fit(repubAndDemMatrix, labels)

# Single-argument print(...) emits the same text under Python 2 and 3.
print(clf.best_estimator_)  # <- lots of detail
print(clf.best_params_)  # <- more useful
print(clf.best_score_)  # <- mean cross-validated score of the best estimator (a score, not an error)
print(clf.score(repubAndDemMatrix, labels))  # <- score on the training data itself (a score, not an error)

# Recorded run: optimal n_neighbors was 4, with a best cross-validation score
# of 0.668573607933 and a training score of 0.828488372093. (The original
# notes called these "errors"; GridSearchCV reports scores.)
# If the training data is shuffled so that it is not all democrats followed
# by all republicans, the optimal parameter is 1 neighbor, with a
# cross-validation score of .689 and a training score of 1.0.