This page collects typical usage examples of the Python class sandbox.util.Sampling.Sampling. If you are wondering what the Sampling class does, how to use it, or where to find working examples, the curated code below may help.
The following presents 15 code examples of the Sampling class, sorted by popularity by default.
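Before the examples, a minimal sketch of the typical call pattern, assuming the sandbox library is importable (the return shape of crossValidation is taken from the tests below):

from sandbox.util.Sampling import Sampling

# crossValidation(folds, numExamples) returns one (trainInds, testInds)
# pair of index arrays per fold; together they partition range(numExamples).
for trainInds, testInds in Sampling.crossValidation(3, 10):
    print("train=%s test=%s" % (trainInds, testInds))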
Example 1: testSampleUsers
def testSampleUsers(self):
    m = 10
    n = 15
    r = 5
    u = 0.3
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)

    k = 50
    X2, userInds = Sampling.sampleUsers(X, k)
    nptst.assert_array_equal(X.toarray(), X2.toarray())

    numRuns = 50
    for i in range(numRuns):
        m = numpy.random.randint(10, 100)
        n = numpy.random.randint(10, 100)
        k = numpy.random.randint(10, 100)
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)
        X2, userInds = Sampling.sampleUsers(X, k)

        self.assertEquals(X2.shape[0], min(k, m))
        self.assertTrue((X.dot(X.T) != numpy.zeros((m, m)).all()))
        self.assertTrue((X2.toarray() == X.toarray()[userInds, :]).all())
        self.assertEquals(X.toarray()[userInds, :].nonzero()[0].shape[0], X2.nnz)
Example 2: testCrossValidation
def testCrossValidation(self):
    numExamples = 10
    folds = 2

    indices = Sampling.crossValidation(folds, numExamples)
    self.assertEquals((list(indices[0][0]), list(indices[0][1])), ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4]))
    self.assertEquals((list(indices[1][0]), list(indices[1][1])), ([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]))

    indices = Sampling.crossValidation(3, numExamples)
    self.assertEquals((list(indices[0][0]), list(indices[0][1])), ([3, 4, 5, 6, 7, 8, 9], [0, 1, 2]))
    self.assertEquals((list(indices[1][0]), list(indices[1][1])), ([0, 1, 2, 6, 7, 8, 9], [3, 4, 5]))
    self.assertEquals((list(indices[2][0]), list(indices[2][1])), ([0, 1, 2, 3, 4, 5], [6, 7, 8, 9]))

    indices = Sampling.crossValidation(4, numExamples)
    self.assertEquals((list(indices[0][0]), list(indices[0][1])), ([2, 3, 4, 5, 6, 7, 8, 9], [0, 1]))
    self.assertEquals((list(indices[1][0]), list(indices[1][1])), ([0, 1, 5, 6, 7, 8, 9], [2, 3, 4]))
    self.assertEquals((list(indices[2][0]), list(indices[2][1])), ([0, 1, 2, 3, 4, 7, 8, 9], [5, 6]))
    self.assertEquals((list(indices[3][0]), list(indices[3][1])), ([0, 1, 2, 3, 4, 5, 6], [7, 8, 9]))

    indices = Sampling.crossValidation(numExamples, numExamples)
    self.assertEquals((list(indices[0][0]), list(indices[0][1])), ([1, 2, 3, 4, 5, 6, 7, 8, 9], [0]))
    self.assertEquals((list(indices[1][0]), list(indices[1][1])), ([0, 2, 3, 4, 5, 6, 7, 8, 9], [1]))
    self.assertEquals((list(indices[2][0]), list(indices[2][1])), ([0, 1, 3, 4, 5, 6, 7, 8, 9], [2]))
    self.assertEquals((list(indices[3][0]), list(indices[3][1])), ([0, 1, 2, 4, 5, 6, 7, 8, 9], [3]))
    self.assertEquals((list(indices[4][0]), list(indices[4][1])), ([0, 1, 2, 3, 5, 6, 7, 8, 9], [4]))

    self.assertRaises(ValueError, Sampling.crossValidation, numExamples+1, numExamples)
    self.assertRaises(ValueError, Sampling.crossValidation, 0, numExamples)
    self.assertRaises(ValueError, Sampling.crossValidation, -1, numExamples)
    self.assertRaises(ValueError, Sampling.crossValidation, folds, 1)
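The fold layout asserted above is consistent with contiguous test blocks whose boundaries are spread as evenly as possible, i.e. test fold i covers indices floor(i*n/folds) up to floor((i+1)*n/folds). A rough numpy equivalent, inferred from those assertions rather than taken from the library source:

import numpy

def foldLayout(folds, numExamples):
    # Contiguous test blocks with boundaries floor(i*n/folds); the train set
    # is the complement of each test block.
    allInds = numpy.arange(numExamples)
    bounds = numpy.arange(folds + 1) * numExamples // folds
    indexList = []
    for i in range(folds):
        testInds = allInds[bounds[i]:bounds[i + 1]]
        trainInds = numpy.setdiff1d(allInds, testInds)
        indexList.append((trainInds, testInds))
    return indexList

For example, foldLayout(4, 10) reproduces the test sets [0, 1], [2, 3, 4], [5, 6] and [7, 8, 9] checked above.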
Example 3: testRepCrossValidation
def testRepCrossValidation(self):
    numExamples = 10
    folds = 3
    repetitions = 1

    indices = Sampling.repCrossValidation(folds, numExamples, repetitions)
    for i in range(folds):
        self.assertTrue((numpy.union1d(indices[i][0], indices[i][1]) == numpy.arange(numExamples)).all())

    repetitions = 2
    indices = Sampling.repCrossValidation(folds, numExamples, repetitions)
    for i in range(folds):
        self.assertTrue((numpy.union1d(indices[i][0], indices[i][1]) == numpy.arange(numExamples)).all())
Example 4: profileModelSelect
def profileModelSelect(self):
    lmbdas = numpy.linspace(1.0, 0.01, 5)
    softImpute = IterativeSoftImpute(k=500)
    folds = 5
    cvInds = Sampling.randCrossValidation(folds, self.X.nnz)
    ProfileUtils.profile('softImpute.modelSelect(self.X, lmbdas, cvInds)', globals(), locals())
Example 5: testAverageRocCurve
def testAverageRocCurve(self):
    m = 50
    n = 20
    k = 8
    u = 20.0 / m
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True, verbose=True, indsPerRow=200)

    fpr, tpr = MCEvaluator.averageRocCurve(X, U, V)

    import matplotlib
    matplotlib.use("GTK3Agg")
    import matplotlib.pyplot as plt
    # plt.plot(fpr, tpr)
    # plt.show()

    # Now try case where we have a training set
    folds = 1
    testSize = 5
    trainTestXs = Sampling.shuffleSplitRows(X, folds, testSize)
    trainX, testX = trainTestXs[0]

    fpr, tpr = MCEvaluator.averageRocCurve(testX, U, V, trainX=trainX)
Example 6: testParallelPen
def testParallelPen(self):
    # Check if penalisation == inf when treeSize < gamma
    numExamples = 100
    X, y = data.make_regression(numExamples)
    learner = DecisionTreeLearner(pruneType="CART", maxDepth=10, minSplit=2)

    paramDict = {}
    paramDict["setGamma"] = numpy.array(numpy.round(2**numpy.arange(1, 10, 0.5) - 1), dtype=numpy.int)

    folds = 3
    alpha = 1.0
    Cvs = numpy.array([(folds - 1) * alpha])
    idx = Sampling.crossValidation(folds, X.shape[0])

    resultsList = learner.parallelPen(X, y, idx, paramDict, Cvs)
    learner, trainErrors, currentPenalties = resultsList[0]

    learner.setGamma(2**10)
    treeSize = 0

    # Let's work out the size of the unpruned tree
    for trainInds, testInds in idx:
        trainX = X[trainInds, :]
        trainY = y[trainInds]
        learner.learnModel(trainX, trainY)
        treeSize += learner.tree.size

    treeSize /= float(folds)

    self.assertTrue(numpy.isinf(currentPenalties[paramDict["setGamma"] > treeSize]).all())
    self.assertTrue(not numpy.isinf(currentPenalties[paramDict["setGamma"] < treeSize]).all())
Example 7: testShuffleSplit
def testShuffleSplit(self):
    numExamples = 10
    folds = 5

    indices = Sampling.shuffleSplit(folds, numExamples)
    for i in range(folds):
        self.assertTrue((numpy.union1d(indices[i][0], indices[i][1]) == numpy.arange(numExamples)).all())

    indices = Sampling.shuffleSplit(folds, numExamples, 0.5)
    trainSize = numExamples * 0.5
    for i in range(folds):
        self.assertTrue((numpy.union1d(indices[i][0], indices[i][1]) == numpy.arange(numExamples)).all())
        self.assertTrue(indices[i][0].shape[0] == trainSize)

    indices = Sampling.shuffleSplit(folds, numExamples, 0.55)
Example 8: testSampleUsers2
def testSampleUsers2(self):
    m = 10
    n = 15
    r = 5
    u = 0.3
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)

    k = X.nnz + 100
    X2, userInds = Sampling.sampleUsers2(X, k)
    nptst.assert_array_equal(X.toarray(), X2.toarray())

    # Test pruning of cols
    k = 500
    m = 100
    n = 500
    u = 0.1
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)

    numpy.random.seed(21)
    X2, userInds = Sampling.sampleUsers2(X, k, prune=True)
    nnz1 = X2.nnz
    self.assertTrue((X2.sum(0) != 0).all())

    numpy.random.seed(21)
    X2, userInds = Sampling.sampleUsers2(X, k, prune=False)
    nnz2 = X2.nnz
    self.assertEquals(nnz1, nnz2)

    numRuns = 50
    for i in range(numRuns):
        m = numpy.random.randint(10, 100)
        n = numpy.random.randint(10, 100)
        k = 500
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)
        X2, userInds = Sampling.sampleUsers2(X, k)

        self.assertTrue((X.dot(X.T) != numpy.zeros((m, m)).all()))
        self.assertTrue((X2.toarray() == X.toarray()[userInds, :]).all())
        self.assertEquals(X.toarray()[userInds, :].nonzero()[0].shape[0], X2.nnz)
Example 9: cvPrune
def cvPrune(self, validX, validY):
    """
    We do something like reduced error pruning but we use cross validation
    to decide which nodes to prune.
    """
    # First set the value of the vertices using the training set.
    # Reset all alphas to zero
    inds = Sampling.crossValidation(self.folds, validX.shape[0])

    for i in self.tree.getAllVertexIds():
        self.tree.getVertex(i).setAlpha(0.0)
        self.tree.getVertex(i).setTestError(0.0)

    for trainInds, testInds in inds:
        rootId = (0,)
        root = self.tree.getVertex(rootId)
        root.setTrainInds(trainInds)
        root.setTestInds(testInds)
        root.tempValue = numpy.mean(validY[trainInds])

        nodeStack = [(rootId, root.tempValue)]

        while len(nodeStack) != 0:
            (nodeId, value) = nodeStack.pop()
            node = self.tree.getVertex(nodeId)
            tempTrainInds = node.getTrainInds()
            tempTestInds = node.getTestInds()
            node.setTestError(numpy.sum((validY[tempTestInds] - node.tempValue)**2) + node.getTestError())
            childIds = [self.getLeftChildId(nodeId), self.getRightChildId(nodeId)]

            for childId in childIds:
                if self.tree.vertexExists(childId):
                    child = self.tree.getVertex(childId)

                    if childId[-1] == 0:
                        childInds = validX[tempTrainInds, node.getFeatureInd()] < node.getThreshold()
                    else:
                        childInds = validX[tempTrainInds, node.getFeatureInd()] >= node.getThreshold()

                    if childInds.sum() != 0:
                        value = numpy.mean(validY[tempTrainInds[childInds]])

                    child.tempValue = value
                    child.setTrainInds(tempTrainInds[childInds])
                    nodeStack.append((childId, value))

                    if childId[-1] == 0:
                        childInds = validX[tempTestInds, node.getFeatureInd()] < node.getThreshold()
                    else:
                        childInds = validX[tempTestInds, node.getFeatureInd()] >= node.getThreshold()

                    child.setTestInds(tempTestInds[childInds])

    self.computeAlphas()
    self.prune()
Example 10: cvModelSelection
def cvModelSelection(self, graph, paramList, paramFunc, folds, errorFunc):
    """
    ParamList is a list of lists of parameters and paramFunc
    is a list of the corresponding functions to call with the parameters
    as arguments. Note that a parameter can also be a tuple, which is expanded
    out before the function is called, e.g.

    paramList = [[1, 2], [2, 1], [12, 1]]
    paramFunc = [predictor.setC, predictor.setD]
    """
    inds = Sampling.crossValidation(folds, graph.getNumEdges())
    errors = numpy.zeros((len(paramList), folds))
    allEdges = graph.getAllEdges()

    for i in range(len(paramList)):
        paramSet = paramList[i]
        logging.debug("Using paramSet=" + str(paramSet))

        for j in range(len(paramSet)):
            if type(paramSet[j]) == tuple:
                paramFunc[j](*paramSet[j])
            else:
                paramFunc[j](paramSet[j])

        predY = numpy.zeros(0)
        y = numpy.zeros(0)
        j = 0

        for (trainInds, testInds) in inds:
            trainEdges = allEdges[trainInds, :]
            testEdges = allEdges[testInds, :]

            trainGraph = SparseGraph(graph.getVertexList(), graph.isUndirected())
            trainGraph.addEdges(trainEdges, graph.getEdgeValues(trainEdges))

            testGraph = SparseGraph(graph.getVertexList(), graph.isUndirected())
            testGraph.addEdges(testEdges, graph.getEdgeValues(testEdges))

            self.learnModel(trainGraph)

            # Note that the order of the edges in testGraph differs from that
            # in graph when calling getAllEdges()
            predY = self.predictEdges(testGraph, testGraph.getAllEdges())
            y = testGraph.getEdgeValues(testGraph.getAllEdges())

            errors[i, j] = errorFunc(y, predY)
            j = j + 1

        logging.info("Error of current fold: " + str(numpy.mean(errors[i, :])))

    meanErrors = numpy.mean(errors, 1)
    strErrors = numpy.std(errors, 1)

    return meanErrors, strErrors
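For reference, a hypothetical invocation following the docstring's convention; predictor, its setC/setD setters and the error function here are illustrative assumptions, not part of the library listing above:

# Each inner list supplies one argument per setter in paramFunc.
paramList = [[0.1, 1], [1.0, 1], [10.0, 2]]           # hypothetical parameter grid
paramFunc = [predictor.setC, predictor.setD]          # hypothetical setter methods
errorFunc = lambda y, predY: numpy.mean(y != predY)   # simple 0/1 error
meanErrors, stdErrors = predictor.cvModelSelection(graph, paramList, paramFunc, 5, errorFunc)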
Example 11: modelSelect
def modelSelect(self, X, colProbs=None):
    """
    Perform model selection on X and return the best parameters.
    """
    m, n = X.shape
    #cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
    trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, colProbs=colProbs)
    testMetrics = numpy.zeros((self.ks.shape[0], self.lmbdas.shape[0], len(trainTestXs)))

    if self.metric == "mrr":
        evaluationMethod = computeTestMRR
    elif self.metric == "f1":
        evaluationMethod = computeTestF1
    else:
        raise ValueError("Invalid metric: " + self.metric)

    logging.debug("Performing model selection")
    paramList = []

    for i, k in enumerate(self.ks):
        for j, lmbda in enumerate(self.lmbdas):
            for icv, (trainX, testX) in enumerate(trainTestXs):
                learner = self.copy()
                learner.k = k
                learner.lmbda = lmbda

                paramList.append((trainX.toScipyCsr(), testX.toScipyCsr(), learner))

    if self.numProcesses != 1:
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        resultsIterator = pool.imap(evaluationMethod, paramList, self.chunkSize)
    else:
        import itertools
        resultsIterator = itertools.imap(evaluationMethod, paramList)

    for i, k in enumerate(self.ks):
        for j, lmbda in enumerate(self.lmbdas):
            for icv in range(len(trainTestXs)):
                testMetrics[i, j, icv] = resultsIterator.next()

    if self.numProcesses != 1:
        pool.terminate()

    meanTestMetrics = numpy.mean(testMetrics, 2)
    stdTestMetrics = numpy.std(testMetrics, 2)

    logging.debug("ks=" + str(self.ks))
    logging.debug("lmbdas=" + str(self.lmbdas))
    logging.debug("Mean metrics=" + str(meanTestMetrics))

    self.k = self.ks[numpy.unravel_index(numpy.argmax(meanTestMetrics), meanTestMetrics.shape)[0]]
    self.lmbda = self.lmbdas[numpy.unravel_index(numpy.argmax(meanTestMetrics), meanTestMetrics.shape)[1]]

    logging.debug("Model parameters: k=" + str(self.k) + " lmbda=" + str(self.lmbda))

    return meanTestMetrics, stdTestMetrics
Example 12: testBootstrap2
def testBootstrap2(self):
    numExamples = 10
    folds = 2

    indices = Sampling.bootstrap2(folds, numExamples)

    for i in range(folds):
        self.assertEquals(indices[i][0].shape[0], numExamples)
        self.assertTrue(indices[i][1].shape[0] < numExamples)

    self.assertTrue((numpy.union1d(indices[0][0], indices[0][1]) == numpy.arange(numExamples)).all())
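The assertions above are consistent with a standard bootstrap: each repetition draws a training set of size numExamples with replacement and uses the out-of-bag examples as the test set. A plausible sketch of one repetition, inferred from the test rather than from the library source:

import numpy

def bootstrapFold(numExamples):
    # Sample the training set with replacement; the out-of-bag test set then
    # has at most numExamples - 1 elements, matching the assertion above.
    trainInds = numpy.random.randint(0, numExamples, numExamples)
    testInds = numpy.setdiff1d(numpy.arange(numExamples), trainInds)
    return trainInds, testInds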
Example 13: testParallelPen
def testParallelPen(self):
    folds = 3
    Cv = numpy.array([4.0])
    idx = Sampling.crossValidation(folds, self.X.shape[0])

    svm = self.svm
    svm.setKernel("gaussian")

    paramDict = {}
    paramDict["setC"] = svm.getCs()
    paramDict["setGamma"] = svm.getGammas()

    resultsList = svm.parallelPen(self.X, self.y, idx, paramDict, Cv)

    tol = 10**-6
    bestError = 1
    trainErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
    penalties2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
    meanErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))

    for i in range(svm.Cs.shape[0]):
        C = svm.Cs[i]

        for j in range(svm.gammas.shape[0]):
            gamma = svm.gammas[j]
            penalty = 0

            for trainInds, testInds in idx:
                trainX = self.X[trainInds, :]
                trainY = self.y[trainInds]
                svm.setGamma(gamma)
                svm.setC(C)
                svm.learnModel(trainX, trainY)
                predY = svm.predict(self.X)
                predTrainY = svm.predict(trainX)
                penalty += Evaluator.binaryError(predY, self.y) - Evaluator.binaryError(predTrainY, trainY)

            penalty = penalty * Cv[0] / len(idx)

            svm.learnModel(self.X, self.y)
            predY = svm.predict(self.X)
            trainErrors2[i, j] = Evaluator.binaryError(predY, self.y)
            penalties2[i, j] = penalty
            meanErrors2[i, j] = Evaluator.binaryError(predY, self.y) + penalty

            if meanErrors2[i, j] < bestError:
                bestC = C
                bestGamma = gamma
                bestError = meanErrors2[i, j]

    bestSVM, trainErrors, currentPenalties = resultsList[0]
    meanErrors = trainErrors + currentPenalties

    self.assertEquals(bestC, bestSVM.getC())
    self.assertEquals(bestGamma, bestSVM.getGamma())
    self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
    self.assertTrue(numpy.linalg.norm(trainErrors2.T - trainErrors) < tol)
    self.assertTrue(numpy.linalg.norm(penalties2.T - currentPenalties) < tol)
Example 14: generateLearner
def generateLearner(self, X, y):
    """
    Train using the given examples and labels, and use model selection to
    find the best parameters.
    """
    if numpy.unique(y).shape[0] != 2:
        print(y)
        raise ValueError("Can only operate on binary data")

    # Do model selection first
    if self.sampleSize is None:
        idx = Sampling.crossValidation(self.folds, X.shape[0])
        learner, meanErrors = self.parallelModelSelect(X, y, idx, self.paramDict)
    else:
        idx = Sampling.crossValidation(self.folds, self.sampleSize)
        inds = numpy.random.permutation(X.shape[0])[0:self.sampleSize]
        learner, meanErrors = self.parallelModelSelect(X[inds, :], y[inds], idx, self.paramDict)

    learner = self.getBestLearner(meanErrors, self.paramDict, X, y)

    return learner
Example 15: evaluateCv
def evaluateCv(self, X, y, folds, metricMethod=Evaluator.binaryError):
    """
    Compute the cross validation according to a given metric.
    """
    Parameter.checkInt(folds, 2, float('inf'))
    idx = Sampling.crossValidation(folds, y.shape[0])
    metrics = AbstractPredictor.evaluateLearn(X, y, idx, self.learnModel, self.predict, metricMethod)

    mean = numpy.mean(metrics, 0)
    var = numpy.var(metrics, 0)

    return (mean, var)