本文整理汇总了Python中sandbox.util.Sampling.Sampling.shuffleSplitRows方法的典型用法代码示例。如果您正苦于以下问题:Python Sampling.shuffleSplitRows方法的具体用法?Python Sampling.shuffleSplitRows怎么用?Python Sampling.shuffleSplitRows使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sandbox.util.Sampling.Sampling的用法示例。
在下文中一共展示了Sampling.shuffleSplitRows方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: testAverageRocCurve
# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
def testAverageRocCurve(self):
    """Smoke test for MCEvaluator.averageRocCurve, both on the full matrix
    and with a held-out training set supplied via trainX."""
    numRows = 50
    numCols = 20
    rank = 8
    quantile = 20.0 / numRows
    density = 1 - quantile
    # Random sparse binary matrix plus its generating factors.
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix(
        (numRows, numCols), rank, density, csarray=True, verbose=True, indsPerRow=200
    )

    fpr, tpr = MCEvaluator.averageRocCurve(X, U, V)

    import matplotlib
    matplotlib.use("GTK3Agg")
    import matplotlib.pyplot as plt
    # plt.plot(fpr, tpr)
    # plt.show()

    # Now try the case where we have a training set.
    folds = 1
    testSize = 5
    trainTestXs = Sampling.shuffleSplitRows(X, folds, testSize)
    trainX, testX = trainTestXs[0]
    fpr, tpr = MCEvaluator.averageRocCurve(testX, U, V, trainX=trainX)
示例2: modelSelect
# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
def modelSelect(self, X, colProbs=None):
    """
    Perform model selection on X and return the best parameters.

    Grid-searches over self.ks and self.lmbdas using shuffled train/validation
    row splits, scores each combination with the metric named by self.metric
    ("mrr" or "f1"), then stores the best (k, lmbda) on self.

    :param X: sparse data matrix to split into train/validation sets
    :param colProbs: optional column sampling probabilities for the splits
    :return: (meanTestMetrics, stdTestMetrics) over the folds
    """
    m, n = X.shape
    trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, colProbs=colProbs)
    testMetrics = numpy.zeros((self.ks.shape[0], self.lmbdas.shape[0], len(trainTestXs)))

    if self.metric == "mrr":
        evaluationMethod = computeTestMRR
    elif self.metric == "f1":
        evaluationMethod = computeTestF1
    else:
        raise ValueError("Invalid metric: " + self.metric)

    logging.debug("Performing model selection")
    paramList = []
    for i, k in enumerate(self.ks):
        for j, lmbda in enumerate(self.lmbdas):
            for icv, (trainX, testX) in enumerate(trainTestXs):
                learner = self.copy()
                learner.k = k
                learner.lmbda = lmbda
                paramList.append((trainX.toScipyCsr(), testX.toScipyCsr(), learner))

    if self.numProcesses != 1:
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        resultsIterator = pool.imap(evaluationMethod, paramList, self.chunkSize)
    else:
        # Builtin map is lazy on Python 3 and replaces the removed itertools.imap.
        resultsIterator = map(evaluationMethod, paramList)

    # Consume results in the same order the parameters were enqueued above.
    for i, k in enumerate(self.ks):
        for j, lmbda in enumerate(self.lmbdas):
            for icv in range(len(trainTestXs)):
                # next() replaces the Python 2-only iterator .next() method.
                testMetrics[i, j, icv] = next(resultsIterator)

    if self.numProcesses != 1:
        pool.terminate()

    meanTestMetrics = numpy.mean(testMetrics, 2)
    stdTestMetrics = numpy.std(testMetrics, 2)
    logging.debug("ks=" + str(self.ks))
    logging.debug("lmbdas=" + str(self.lmbdas))
    logging.debug("Mean metrics=" + str(meanTestMetrics))

    # Pick the (k, lmbda) pair maximising the mean validation metric.
    bestInds = numpy.unravel_index(numpy.argmax(meanTestMetrics), meanTestMetrics.shape)
    self.k = self.ks[bestInds[0]]
    self.lmbda = self.lmbdas[bestInds[1]]
    logging.debug("Model parameters: k=" + str(self.k) + " lmbda=" + str(self.lmbda))

    return meanTestMetrics, stdTestMetrics
示例3: parallelGridSearch
# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
def parallelGridSearch(self, X, paramDict, evaluationMethod, testX=None, minVal=True):
    """
    Perform parallel model selection using any learner.

    :param X: sparse training matrix (split into folds unless testX is given)
    :param paramDict: maps attribute name -> array of candidate values
    :param evaluationMethod: callable applied to (trainX, testX, learner) tuples
    :param testX: optional explicit test matrix; if given, a single split is used
    :param minVal: whether the best parameter minimises (True) or maximises the metric
    :return: grid of mean metrics, one cell per parameter combination
    """
    logging.debug("Parallel grid search with params: " + str(paramDict))
    m, n = X.shape

    # "is None" instead of "== None": identity test is the correct idiom and
    # avoids elementwise comparison if testX is an array-like.
    if testX is None:
        trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize)
    else:
        trainTestXs = [[X, testX]]

    gridSize = []
    gridInds = []
    for key in paramDict.keys():
        gridSize.append(paramDict[key].shape[0])
        gridInds.append(numpy.arange(paramDict[key].shape[0]))
    meanMetrics = numpy.zeros(tuple(gridSize))

    paramList = []
    for icv, (trainX, testX) in enumerate(trainTestXs):
        indexIter = itertools.product(*gridInds)
        for inds in indexIter:
            learner = self.copy()
            for i, (key, val) in enumerate(paramDict.items()):
                setattr(learner, key, val[inds[i]])
            paramList.append((trainX, testX, learner))

    if self.numProcesses != 1:
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        resultsIterator = pool.imap(evaluationMethod, paramList, self.chunkSize)
    else:
        # Builtin map is lazy on Python 3 and replaces the removed itertools.imap.
        resultsIterator = map(evaluationMethod, paramList)

    for icv, (trainX, testX) in enumerate(trainTestXs):
        indexIter = itertools.product(*gridInds)
        for inds in indexIter:
            metric = next(resultsIterator)
            # Bug fix: average over the actual number of splits. The old code
            # always divided by self.folds, which under-counted when an
            # explicit testX produced a single split.
            meanMetrics[inds] += metric / float(len(trainTestXs))

    if self.numProcesses != 1:
        pool.terminate()

    resultDict, bestMetric = self.setBestLearner(meanMetrics, paramDict, minVal)

    return meanMetrics
示例4: main
# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
def main():
    """Load the MovieLens 100k ratings, binarise them (rating > 3 -> 1),
    split off a small validation set per row and fit a CLiMF model."""
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    matrixFileName = PathDefaults.getDataDir() + "movielens/ml-100k/u.data"
    data = numpy.loadtxt(matrixFileName)
    # loadtxt yields floats; the matrix shape must be integral.
    X = sppy.csarray((int(numpy.max(data[:, 0])), int(numpy.max(data[:, 1]))), storagetype="row")
    # Ratings greater than 3 count as positive. The builtin int dtype replaces
    # numpy.int, which was removed in NumPy 1.24.
    X[data[:, 0] - 1, data[:, 1] - 1] = numpy.array(data[:, 2] > 3, int)
    logging.debug("Read file: " + matrixFileName)
    logging.debug("Shape of data: " + str(X.shape))
    logging.debug("Number of non zeros " + str(X.nnz))

    # Hold out validationSize entries per row as a validation set.
    validationSize = 5
    trainTestXs = Sampling.shuffleSplitRows(X, 1, validationSize)
    trainX, testX = trainTestXs[0]
    trainX = trainX.toScipyCsr()

    learner = CLiMF(k=20, lmbda=0.001, gamma=0.0001)
    learner.learnModel(trainX)
示例5: MaxLocalAUC
# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
# NOTE(review): this snippet is extracted mid-script — the "if" matching the
# "else" below (presumably a test on len(sys.argv)) lies outside the visible
# text, so the fragment is not runnable as-is; confirm against the original file.
dataset = sys.argv[1]
else:
dataset = "synthetic"
# Output location for the experiment results
saveResults = True
prefix = "LossROC"
outputFile = PathDefaults.getOutputDir() + "ranking/" + prefix + dataset.title() + "Results.npz"
X = DatasetUtils.getDataset(dataset, nnz=20000)
m, n = X.shape
u = 0.1
w = 1-u
# Hold out testSize entries per row for each of the folds train/test splits
testSize = 5
folds = 5
trainTestXs = Sampling.shuffleSplitRows(X, folds, testSize)
numRecordAucSamples = 200
# Hyper-parameters for the MaxLocalAUC learner
k2 = 8
u2 = 0.5
w2 = 1-u2
eps = 10**-4
lmbda = 0.0
maxLocalAuc = MaxLocalAUC(k2, w2, eps=eps, lmbdaU=lmbda, lmbdaV=lmbda, stochastic=True)
maxLocalAuc.alpha = 0.05
maxLocalAuc.alphas = 2.0**-numpy.arange(0, 5, 1)
maxLocalAuc.folds = 1
maxLocalAuc.initialAlg = "rand"
maxLocalAuc.itemExpP = 0.0
maxLocalAuc.itemExpQ = 0.0
示例6: testShuffleSplitRows
# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
def testShuffleSplitRows(self):
    """
    Tests for Sampling.shuffleSplitRows: each returned split should partition
    the non-zeros of X into a train and a test part, with testSize entries
    removed from each sampled row.
    """
    m = 10
    n = 16
    k = 5
    u = 0.5
    w = 1-u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m,n), k, w, csarray=True, verbose=True, indsPerRow=200)
    #print(X.toarray())

    k2 = 5
    testSize = 2
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, rowMajor=True)

    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]
        # Row-major splits should come back with row storage
        self.assertEquals(trainX.storagetype, "row")
        self.assertEquals(testX.storagetype, "row")
        # Train and test parts recompose X exactly (so they are disjoint)
        nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
        # Each row loses exactly testSize entries to the test part
        nptst.assert_array_equal(testX.sum(1), testSize*numpy.ones(m))
        self.assertEquals(X.nnz, trainX.nnz + testX.nnz)

    # Same checks with column-major storage
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, rowMajor=False)

    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]
        self.assertEquals(trainX.storagetype, "col")
        self.assertEquals(testX.storagetype, "col")
        nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
        nptst.assert_array_equal(testX.sum(1), testSize*numpy.ones(m))
        self.assertEquals(X.nnz, trainX.nnz + testX.nnz)

    # csarray=False returns scipy matrices, hence the ravel on the row sums
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, csarray=False)

    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]
        nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
        nptst.assert_array_equal(numpy.ravel(testX.sum(1)), testSize*numpy.ones(m))
        self.assertEquals(X.nnz, trainX.nnz + testX.nnz)

    # A zero test size must give an empty test matrix
    testSize = 0
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize)

    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]
        nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
        nptst.assert_array_equal(testX.sum(1), testSize*numpy.ones(m))
        self.assertEquals(X.nnz, trainX.nnz + testX.nnz)
        self.assertEquals(testX.nnz, 0)

    #Test sampling a subset of the rows
    testSize = 2
    numRows = 5
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, numRows=numRows, rowMajor=False)

    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]
        nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
        # Only numRows rows contribute test entries
        self.assertEquals(numpy.nonzero(testX.sum(1))[0].shape[0], numRows)
        self.assertEquals(X.nnz, trainX.nnz + testX.nnz)
        self.assertEquals(testX.nnz, testSize*numRows)

    #Make sure column probabilities are correct
    w = 0.0
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m,n), k, w, csarray=True, verbose=True, indsPerRow=200)
    testSize = 5
    k2 = 500
    # NOTE(review): numpy.float was removed in NumPy >= 1.24; this line only
    # runs on older NumPy versions.
    colProbs = numpy.arange(0, n, dtype=numpy.float)+1
    colProbs /= colProbs.sum()
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, colProbs=colProbs)

    # Empirical column frequencies over many splits should match colProbs
    colProbs2 = numpy.zeros(n)
    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]
        colProbs2 += testX.sum(0)
    colProbs2 /= colProbs2.sum()
    nptst.assert_array_almost_equal(colProbs, colProbs2, 2)

    #Now test when probabilities are uniform
    colProbs = numpy.ones(n)/float(n)
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, colProbs=colProbs)
    colProbs = None
    trainTestXs2 = Sampling.shuffleSplitRows(X, k2, testSize, colProbs=colProbs)
    #......... remainder of the example omitted .........
示例7: parallelLearnModel
# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
def parallelLearnModel(self, X, verbose=False, U=None, V=None):
    """
    Max local AUC with Frobenius norm penalty on V. Solve with parallel (stochastic) gradient descent.
    The input is a sparse array.
    """
    # Convert to a csarray for faster access
    if scipy.sparse.issparse(X):
        logging.debug("Converting to csarray")
        X2 = sppy.csarray(X, storagetype="row")
        X = X2
    m, n = X.shape

    # We keep a validation set in order to determine when to stop
    if self.validationUsers != 0:
        numValidationUsers = int(m * self.validationUsers)
        trainX, testX, rowSamples = Sampling.shuffleSplitRows(
            X, 1, self.validationSize, numRows=numValidationUsers
        )[0]
        testIndPtr, testColInds = SparseUtils.getOmegaListPtr(testX)
    else:
        trainX = X
        testX = None
        rowSamples = None
        testIndPtr, testColInds = None, None

    # Not that to compute the test AUC we pick i \in X and j \notin X \cup testX
    indPtr, colInds = SparseUtils.getOmegaListPtr(trainX)
    allIndPtr, allColInds = SparseUtils.getOmegaListPtr(X)

    # NOTE(review): "== None" relies on default None equality; "is None" would
    # be the safer idiom here — left unchanged in this documentation pass.
    if U == None or V == None:
        U, V = self.initUV(trainX)

    # Index into the recorded test measures used for model tracking
    if self.metric == "f1":
        metricInd = 2
    elif self.metric == "mrr":
        metricInd = 3
    else:
        raise ValueError("Unknown metric: " + self.metric)

    bestMetric = 0
    bestU = 0
    bestV = 0
    trainMeasures = []
    testMeasures = []
    loopInd = 0
    # Initialised so the first convergence test (|lastObj - currentObj| > eps) passes
    lastObj = 0
    currentObj = lastObj - 2 * self.eps

    numBlocks = self.numProcesses + 1
    gi, gp, gq = self.computeGipq(X)
    normGp, normGq = self.computeNormGpq(indPtr, colInds, gp, gq, m)

    # Some shared variables
    rowIsFree = sharedmem.ones(numBlocks, dtype=numpy.bool)
    colIsFree = sharedmem.ones(numBlocks, dtype=numpy.bool)

    # Create shared factors
    U2 = sharedmem.zeros((m, self.k))
    V2 = sharedmem.zeros((n, self.k))
    muU2 = sharedmem.zeros((m, self.k))
    muV2 = sharedmem.zeros((n, self.k))

    # Copy the initial factors into shared memory and drop the local copies
    U2[:] = U[:]
    V2[:] = V[:]
    muU2[:] = U[:]
    muV2[:] = V[:]
    del U, V

    rowBlockSize = int(numpy.ceil(float(m) / numBlocks))
    colBlockSize = int(numpy.ceil(float(n) / numBlocks))

    lock = multiprocessing.Lock()
    startTime = time.time()
    loopInd = 0
    iterationsPerBlock = sharedmem.zeros((numBlocks, numBlocks))

    self.learnerCython = self.getCythonLearner()
    nextRecord = 0

    # Main optimisation loop: run until maxIterations or until the objective
    # change falls below eps
    while loopInd < self.maxIterations and abs(lastObj - currentObj) > self.eps:
        if loopInd >= nextRecord:
            if loopInd != 0:
                print("")
            # Periodically record train/validation measures on the averaged factors
            printStr = self.recordResults(
                muU2,
                muV2,
                trainMeasures,
                testMeasures,
                loopInd,
                rowSamples,
                indPtr,
                colInds,
                testIndPtr,
                testColInds,
                allIndPtr,
                allColInds,
                gi,
                gp,
                #......... remainder of the example omitted .........
示例8: singleLearnModel
# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
def singleLearnModel(self, X, verbose=False, U=None, V=None):
    """
    Max local AUC with Frobenius norm penalty on V. Solve with (stochastic) gradient descent.
    The input is a sparse array.
    """
    # Convert to a csarray for faster access
    if scipy.sparse.issparse(X):
        logging.debug("Converting to csarray")
        X2 = sppy.csarray(X, storagetype="row")
        X = X2
    m, n = X.shape

    # We keep a validation set in order to determine when to stop
    if self.validationUsers != 0:
        numValidationUsers = int(m * self.validationUsers)
        trainX, testX, rowSamples = Sampling.shuffleSplitRows(
            X, 1, self.validationSize, numRows=numValidationUsers
        )[0]
        testIndPtr, testColInds = SparseUtils.getOmegaListPtr(testX)
        logging.debug("Train X shape and nnz: " + str(trainX.shape) + " " + str(trainX.nnz))
        logging.debug("Validation X shape and nnz: " + str(testX.shape) + " " + str(testX.nnz))
    else:
        trainX = X
        testX = None
        rowSamples = None
        testIndPtr, testColInds = None, None

    # Note that to compute the test AUC we pick i \in X and j \notin X \cup testX
    indPtr, colInds = SparseUtils.getOmegaListPtr(trainX)
    allIndPtr, allColInds = SparseUtils.getOmegaListPtr(X)

    # Initialise factors only when neither U nor V was supplied as an ndarray
    if type(U) != numpy.ndarray and type(V) != numpy.ndarray:
        U, V = self.initUV(trainX)

    # Index into the recorded test measures used for model tracking
    if self.metric == "f1":
        metricInd = 2
    elif self.metric == "mrr":
        metricInd = 3
    else:
        raise ValueError("Unknown metric: " + self.metric)

    # Averaged factors used for recording results and early stopping
    muU = U.copy()
    muV = V.copy()
    bestMetric = 0
    bestU = 0
    bestV = 0
    trainMeasures = []
    testMeasures = []
    loopInd = 0
    # Initialised so the first convergence test (|lastObj - currentObj| > eps) passes
    lastObj = 0
    currentObj = lastObj - 2 * self.eps

    # Try alternative number of iterations
    # numIterations = trainX.nnz/self.numAucSamples
    numIterations = max(m, n)

    self.learnerCython = self.getCythonLearner()

    # Set up order of indices for stochastic methods
    permutedRowInds = numpy.array(numpy.random.permutation(m), numpy.uint32)
    permutedColInds = numpy.array(numpy.random.permutation(n), numpy.uint32)

    startTime = time.time()
    gi, gp, gq = self.computeGipq(X)
    normGp, normGq = self.computeNormGpq(indPtr, colInds, gp, gq, m)

    # Main optimisation loop: run until maxIterations or until the objective
    # change falls below eps
    while loopInd < self.maxIterations and abs(lastObj - currentObj) > self.eps:
        sigmaU = self.getSigma(loopInd, self.alpha, m)
        # NOTE(review): sigmaV is also scaled by m (not n) — confirm intended
        sigmaV = self.getSigma(loopInd, self.alpha, m)

        if loopInd % self.recordStep == 0:
            if loopInd != 0 and self.stochastic:
                print("")
            # Periodically record train/validation measures on the averaged factors
            printStr = self.recordResults(
                muU,
                muV,
                trainMeasures,
                testMeasures,
                loopInd,
                rowSamples,
                indPtr,
                colInds,
                testIndPtr,
                testColInds,
                allIndPtr,
                allColInds,
                gi,
                gp,
                gq,
                trainX,
                startTime,
            )
            logging.debug(printStr)

            # Keep the factors achieving the best validation metric so far
            if testIndPtr is not None and testMeasures[-1][metricInd] >= bestMetric:
                #......... remainder of the example omitted .........
示例9: modelSelect2
# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
def modelSelect2(self, X, rhos, ks, cvInds, colProbs=None):
    """
    Pick values of rho and k based on a single matrix X. We do cross
    validation within, removing a few non-zeros from each row to form the
    validation set, and set the best parameters on self. The rhos must be
    in decreasing order since we use warm restarts.

    :param X: the sparse data matrix
    :param rhos: candidate regularisation values, in descending order
    :param ks: candidate ranks
    :param cvInds: retained for interface compatibility; the splits used here
        come from Sampling.shuffleSplitRows
    :param colProbs: optional column sampling probabilities for the splits
    :return: (meanMetrics, stdMetrics) of shape (len(rhos), len(ks))
    """
    # Bug fix: the old check used .all(), which only raised when *every*
    # element was out of place, so most non-descending inputs slipped through.
    # .any() raises whenever rhos differs from its descending sort.
    if (numpy.flipud(numpy.sort(rhos)) != rhos).any():
        raise ValueError("rhos must be in descending order")

    trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, csarray=False, rowMajor=False, colProbs=colProbs)
    # Size the fold axis by the actual number of splits (previously len(cvInds),
    # which could disagree with the number of splits actually filled in).
    metrics = numpy.zeros((rhos.shape[0], ks.shape[0], len(trainTestXs)))

    if self.metric == "mse":
        metricFunction = learnPredictMSE
    elif self.metric == "f1" or self.metric == "mrr":
        metricFunction = learnPredictRanking
    else:
        raise ValueError("Unknown metric: " + self.metric)

    paramList = []
    for i, (trainX, testX) in enumerate(trainTestXs):
        Util.printIteration(i, 1, len(trainTestXs), "Fold: ")
        for m, k in enumerate(ks):
            learner = self.copy()
            learner.updateAlg = "initial"
            learner.setK(k)
            paramList.append((learner, trainX, testX, rhos))

    if self.numProcesses != 1:
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=10)
        resultsIter = pool.imap(metricFunction, paramList)
    else:
        # Builtin map is lazy on Python 3 and replaces the removed itertools.imap.
        resultsIter = map(metricFunction, paramList)

    # Each result is a vector of metrics over all rhos for one (fold, k) pair.
    for i, (trainX, testX) in enumerate(trainTestXs):
        for m, k in enumerate(ks):
            metrics[:, m, i] = next(resultsIter)

    if self.numProcesses != 1:
        pool.terminate()

    meanMetrics = metrics.mean(2)
    stdMetrics = metrics.std(2)
    logging.debug("ks=" + str(ks))
    logging.debug("rhos=" + str(rhos))
    logging.debug(meanMetrics)

    # Set the parameters: minimise for MSE, maximise for the ranking metrics.
    if self.metric == "mse":
        bestInds = numpy.unravel_index(numpy.argmin(meanMetrics), meanMetrics.shape)
    else:
        bestInds = numpy.unravel_index(numpy.argmax(meanMetrics), meanMetrics.shape)
    self.setRho(rhos[bestInds[0]])
    self.setK(ks[bestInds[1]])
    logging.debug("Model parameters: k=" + str(self.k) + " rho=" + str(self.rho))

    return meanMetrics, stdMetrics
示例10: modelSelect
# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
def modelSelect(self, X, colProbs=None):
    """
    Perform model selection on X and return the best parameters.

    Searches the grid self.ks x self.lmbdaUsers x self.lmbdaItems x
    self.gammas using shuffled train/validation row splits, scores each
    combination with computeTestF1, and stores the best parameters on self.

    :param X: the sparse data matrix
    :param colProbs: optional column sampling probabilities for the splits
    :return: (meanTestMetrics, stdTestMetrics) over the folds
    """
    m, n = X.shape
    trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, csarray=True, colProbs=colProbs)
    testMetrics = numpy.zeros((self.ks.shape[0], self.lmbdaUsers.shape[0], self.lmbdaItems.shape[0], self.gammas.shape[0], len(trainTestXs)))
    logging.debug("Performing model selection with test leave out per row of " + str(self.validationSize))

    paramList = []
    for i, k in enumerate(self.ks):
        for j, lmbdaUser in enumerate(self.lmbdaUsers):
            for s, lmbdaItem in enumerate(self.lmbdaItems):
                for t, gamma in enumerate(self.gammas):
                    for icv, (trainX, testX) in enumerate(trainTestXs):
                        learner = self.copy()
                        learner.k = k
                        learner.lmbdaUser = lmbdaUser
                        # The same item regulariser is applied to positive and negative items
                        learner.lmbdaPos = lmbdaItem
                        learner.lmbdaNeg = lmbdaItem
                        learner.gamma = gamma
                        paramList.append((trainX, testX, learner))

    if self.numProcesses != 1:
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        resultsIterator = pool.imap(computeTestF1, paramList, self.chunkSize)
    else:
        # Builtin map is lazy on Python 3 and replaces the removed itertools.imap.
        resultsIterator = map(computeTestF1, paramList)

    # Consume results in the same order the parameters were generated above.
    for i in range(self.ks.shape[0]):
        for j in range(self.lmbdaUsers.shape[0]):
            for s in range(self.lmbdaItems.shape[0]):
                for t in range(self.gammas.shape[0]):
                    for icv in range(len(trainTestXs)):
                        testMetrics[i, j, s, t, icv] = next(resultsIterator)

    if self.numProcesses != 1:
        pool.terminate()

    meanTestMetrics = numpy.mean(testMetrics, 4)
    stdTestMetrics = numpy.std(testMetrics, 4)
    logging.debug("ks=" + str(self.ks))
    logging.debug("lmbdaUsers=" + str(self.lmbdaUsers))
    logging.debug("lmbdaItems=" + str(self.lmbdaItems))
    logging.debug("gammas=" + str(self.gammas))
    logging.debug("Mean metrics=" + str(meanTestMetrics))

    # Pick the parameter combination maximising the mean validation metric.
    indK, indLmbdaUser, indLmbdaItem, indGamma = numpy.unravel_index(meanTestMetrics.argmax(), meanTestMetrics.shape)
    self.k = self.ks[indK]
    self.lmbdaUser = self.lmbdaUsers[indLmbdaUser]
    self.lmbdaPos = self.lmbdaItems[indLmbdaItem]
    self.lmbdaNeg = self.lmbdaItems[indLmbdaItem]
    self.gamma = self.gammas[indGamma]
    logging.debug("Model parameters: " + str(self))

    return meanTestMetrics, stdTestMetrics
示例11: runExperiment
# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
def runExperiment(self, X):
    """
    Run the selected ranking experiments and save results
    """
    logging.debug("Splitting into train and test sets")
    #Make sure different runs get the same train/test split
    numpy.random.seed(21)
    m, n = X.shape

    #colProbs = (X.sum(0)+1)/float(m+1)
    #colProbs = colProbs**-self.algoArgs.itemExp
    #colProbs = numpy.ones(n)/float(n)
    trainTestXs = Sampling.shuffleSplitRows(X, 1, self.algoArgs.testSize)
    trainX, testX = trainTestXs[0]
    logging.debug("Train X shape and nnz: " + str(trainX.shape) + " " + str(trainX.nnz))
    logging.debug("Test X shape and nnz: " + str(testX.shape) + " " + str(testX.nnz))

    #Have scipy versions of each array
    trainXScipy = trainX.toScipyCsc()
    testXScipy = testX.toScipyCsc()

    if self.algoArgs.runSoftImpute:
        logging.debug("Running soft impute")
        resultsFileName = self.resultsDir + "ResultsSoftImpute.npz"

        # Skip this experiment when another process holds the lock or results
        # already exist, unless overwrite was requested
        fileLock = FileLock(resultsFileName)
        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
            modelSelectX, userInds = Sampling.sampleUsers2(trainXScipy, self.algoArgs.modelSelectSamples, prune=True)

            try:
                learner = IterativeSoftImpute(self.algoArgs.rhoSi, eps=self.algoArgs.epsSi, k=self.algoArgs.k, svdAlg=self.algoArgs.svdAlg, postProcess=self.algoArgs.postProcess, p=self.algoArgs.pSi, q=self.algoArgs.qSi)
                learner.folds = self.algoArgs.folds
                learner.metric = self.algoArgs.metric
                learner.numProcesses = self.algoArgs.processes
                learner.recommendSize = self.algoArgs.recommendSize
                learner.validationSize = self.algoArgs.validationSize

                if self.algoArgs.modelSelect:
                    cvInds = Sampling.randCrossValidation(self.algoArgs.folds, modelSelectX.nnz)
                    meanErrors, stdErrors = learner.modelSelect2(modelSelectX, self.algoArgs.rhosSi, self.algoArgs.ks, cvInds)

                    # Save the model-selection grid alongside the results file
                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanErrors, stdErrors)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)
                self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
            finally:
                # Always release the lock, even if learning fails
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runMaxLocalAuc:
        logging.debug("Running max local AUC")

        # The tanh loss encodes its rho parameter in the results file name
        if self.algoArgs.loss != "tanh":
            resultsFileName = self.resultsDir + "ResultsMaxLocalAUC_loss=" + self.algoArgs.loss + ".npz"
        else:
            resultsFileName = self.resultsDir + "ResultsMaxLocalAUC_loss=" + self.algoArgs.loss + "_rho=" + str(self.algoArgs.rhoMlauc) + ".npz"

        fileLock = FileLock(resultsFileName)
        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                learner = MaxLocalAUC(self.algoArgs.k, 1-self.algoArgs.u, lmbdaU=self.algoArgs.lmbdaUMlauc, lmbdaV=self.algoArgs.lmbdaVMlauc, eps=self.algoArgs.epsMlauc, stochastic=not self.algoArgs.fullGradient)

                # Copy the command-line configuration onto the learner
                learner.alpha = self.algoArgs.alpha
                learner.alphas = self.algoArgs.alphas
                learner.eta = self.algoArgs.eta
                learner.folds = self.algoArgs.folds
                learner.initialAlg = self.algoArgs.initialAlg
                learner.itemExpP = self.algoArgs.itemExpP
                learner.itemExpQ = self.algoArgs.itemExpQ
                learner.ks = self.algoArgs.ks
                learner.lmbdas = self.algoArgs.lmbdasMlauc
                learner.loss = self.algoArgs.loss
                learner.maxIterations = self.algoArgs.maxIterations
                learner.maxNorms = self.algoArgs.maxNorms
                learner.maxNormU = self.algoArgs.maxNorm
                learner.maxNormV = self.algoArgs.maxNorm
                learner.metric = self.algoArgs.metric
                learner.normalise = self.algoArgs.normalise
                learner.numAucSamples = self.algoArgs.numAucSamples
                learner.numProcesses = self.algoArgs.processes
                learner.numRowSamples = self.algoArgs.numRowSamples
                learner.rate = self.algoArgs.rate
                learner.recommendSize = self.algoArgs.recommendSize
                learner.recordStep = self.algoArgs.recordStep
                learner.rho = self.algoArgs.rhoMlauc
                learner.rhos = self.algoArgs.rhosMlauc
                learner.startAverage = self.algoArgs.startAverage
                learner.t0 = self.algoArgs.t0
                learner.t0s = self.algoArgs.t0s
                learner.validationSize = self.algoArgs.validationSize
                learner.validationUsers = self.algoArgs.validationUsers
                #......... remainder of the example omitted .........
示例12: modelSelect
# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
def modelSelect(self, X, colProbs=None):
    """
    Perform model selection on X and return the best parameters.

    Searches the grid self.ks x self.lmbdas x self.gammas using shuffled
    train/validation row splits, scores each combination with computeTestF1,
    and stores the best (k, lmbda, gamma) on self.

    :param X: the sparse data matrix
    :param colProbs: optional column sampling probabilities for the splits
    :return: (meanTestMetrics, stdTestMetrics) over the folds
    """
    m, n = X.shape
    trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, csarray=False, colProbs=colProbs)

    datas = []
    for (trainX, testX) in trainTestXs:
        testOmegaList = SparseUtils.getOmegaList(testX)
        #testX = trainX+testX
        datas.append((trainX, testX, testOmegaList))

    testAucs = numpy.zeros((len(self.ks), len(self.lmbdas), len(self.gammas), len(trainTestXs)))
    logging.debug("Performing model selection")

    paramList = []
    for i, k in enumerate(self.ks):
        # Share one random initialisation of (U, V) per k across all folds and
        # parameter combinations so runs are comparable.
        U, V = self.initUV(X, k)
        for lmbda in self.lmbdas:
            for gamma in self.gammas:
                for (trainX, testX, testOmegaList) in datas:
                    learner = self.copy()
                    learner.k = k
                    learner.U = U.copy()
                    learner.V = V.copy()
                    learner.lmbda = lmbda
                    learner.gamma = gamma
                    paramList.append((scipy.sparse.csr_matrix(trainX, dtype=numpy.float64), scipy.sparse.csr_matrix(testX, dtype=numpy.float64), learner))

    if self.numProcesses != 1:
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        resultsIterator = pool.imap(computeTestF1, paramList, self.chunkSize)
    else:
        # Builtin map is lazy on Python 3 and replaces the removed itertools.imap.
        resultsIterator = map(computeTestF1, paramList)

    # Consume results in the same order the parameters were generated above.
    for indK in range(len(self.ks)):
        for indLmbda in range(len(self.lmbdas)):
            for indGamma in range(len(self.gammas)):
                for indCv in range(len(trainTestXs)):
                    testAucs[indK, indLmbda, indGamma, indCv] = next(resultsIterator)

    if self.numProcesses != 1:
        pool.terminate()

    meanTestMetrics = numpy.mean(testAucs, 3)
    stdTestMetrics = numpy.std(testAucs, 3)
    logging.debug("ks=" + str(self.ks))
    logging.debug("lmbdas=" + str(self.lmbdas))
    logging.debug("gammas=" + str(self.gammas))
    logging.debug("Mean metrics=" + str(meanTestMetrics))

    # Pick the parameter combination maximising the mean validation metric.
    indK, indLmbda, indGamma = numpy.unravel_index(meanTestMetrics.argmax(), meanTestMetrics.shape)
    self.k = self.ks[indK]
    self.lmbda = self.lmbdas[indLmbda]
    self.gamma = self.gammas[indGamma]
    logging.debug("Model parameters: k=" + str(self.k) + " lmbda=" + str(self.lmbda) + " gamma=" + str(self.gamma))

    return meanTestMetrics, stdTestMetrics