当前位置: 首页>>代码示例>>Python>>正文


Python Sampling.shuffleSplitRows方法代码示例

本文整理汇总了Python中sandbox.util.Sampling.Sampling.shuffleSplitRows方法的典型用法代码示例。如果您正苦于以下问题:Python Sampling.shuffleSplitRows方法的具体用法?Python Sampling.shuffleSplitRows怎么用?Python Sampling.shuffleSplitRows使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在sandbox.util.Sampling.Sampling的用法示例。


在下文中一共展示了Sampling.shuffleSplitRows方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: testAverageRocCurve

# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
    def testAverageRocCurve(self):
        """
        Smoke-test MCEvaluator.averageRocCurve on a generated sparse binary matrix.

        The curve is computed once on the full matrix and once on a test split
        with the matching training split passed as trainX. The test only checks
        that both calls complete; the returned curves are not compared against
        expected values.
        """
        m = 50
        n = 20
        k = 8
        u = 20.0 / m
        w = 1 - u
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix(
            (m, n), k, w, csarray=True, verbose=True, indsPerRow=200
        )

        # No plotting backend is selected here: the curves are never drawn, so
        # the test must not depend on a GUI toolkit being installed.
        fpr, tpr = MCEvaluator.averageRocCurve(X, U, V)

        # Now try case where we have a training set
        folds = 1
        testSize = 5
        trainTestXs = Sampling.shuffleSplitRows(X, folds, testSize)
        trainX, testX = trainTestXs[0]

        fpr, tpr = MCEvaluator.averageRocCurve(testX, U, V, trainX=trainX)
开发者ID:kentwang,项目名称:sandbox,代码行数:29,代码来源:MCEvaluatorTest.py

示例2: modelSelect

# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
    def modelSelect(self, X, colProbs=None):
        """
        Perform model selection on X over the grid self.ks x self.lmbdas.

        Every (k, lmbda) pair is evaluated on each train/test split produced by
        Sampling.shuffleSplitRows, and self.k and self.lmbda are set to the pair
        with the largest mean test metric.

        :param X: the matrix to split; passed to Sampling.shuffleSplitRows.
        :param colProbs: forwarded unchanged to Sampling.shuffleSplitRows.
        :return: (meanTestMetrics, stdTestMetrics), the mean and standard
            deviation over the splits, each with one row per entry of self.ks
            and one column per entry of self.lmbdas.
        :raises ValueError: if self.metric is neither "mrr" nor "f1".
        """
        # Validate the metric before doing any work so that a bad configuration
        # fails immediately rather than after the data has been split.
        if self.metric == "mrr":
            evaluationMethod = computeTestMRR
        elif self.metric == "f1":
            evaluationMethod = computeTestF1
        else:
            raise ValueError("Invalid metric: " + self.metric)

        trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, colProbs=colProbs)
        numSplits = len(trainTestXs)
        testMetrics = numpy.zeros((self.ks.shape[0], self.lmbdas.shape[0], numSplits))

        logging.debug("Performing model selection")
        paramList = []

        for k in self.ks:
            for lmbda in self.lmbdas:
                for trainX, testX in trainTestXs:
                    learner = self.copy()
                    learner.k = k
                    learner.lmbda = lmbda

                    paramList.append((trainX.toScipyCsr(), testX.toScipyCsr(), learner))

        pool = None
        try:
            if self.numProcesses != 1:
                pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
                resultsIterator = pool.imap(evaluationMethod, paramList, self.chunkSize)
            else:
                import itertools
                # itertools.imap only exists on Python 2; on Python 3 the
                # builtin map is already lazy.
                lazyMap = getattr(itertools, "imap", map)
                resultsIterator = lazyMap(evaluationMethod, paramList)

            # Results come back in the order in which paramList was built
            for i in range(self.ks.shape[0]):
                for j in range(self.lmbdas.shape[0]):
                    for icv in range(numSplits):
                        testMetrics[i, j, icv] = next(resultsIterator)
        finally:
            # Release the worker processes even if an evaluation raised
            if pool is not None:
                pool.terminate()

        meanTestMetrics = numpy.mean(testMetrics, 2)
        stdTestMetrics = numpy.std(testMetrics, 2)

        logging.debug("ks=" + str(self.ks))
        logging.debug("lmbdas=" + str(self.lmbdas))
        logging.debug("Mean metrics=" + str(meanTestMetrics))

        bestKInd, bestLmbdaInd = numpy.unravel_index(numpy.argmax(meanTestMetrics), meanTestMetrics.shape)
        self.k = self.ks[bestKInd]
        self.lmbda = self.lmbdas[bestLmbdaInd]

        logging.debug("Model parameters: k=" + str(self.k) + " lmbda=" + str(self.lmbda))

        return meanTestMetrics, stdTestMetrics
开发者ID:charanpald,项目名称:sandbox,代码行数:58,代码来源:WeightedMf.py

示例3: parallelGridSearch

# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
    def parallelGridSearch(self, X, paramDict, evaluationMethod, testX=None, minVal=True):
        """
        Perform parallel model selection using any learner.

        Every combination of the candidate values in paramDict is evaluated on
        each train/test split, the metrics are averaged over the splits, and
        self.setBestLearner is called with the averaged grid.

        :param X: the training matrix, or the matrix to split when testX is None.
        :param paramDict: maps a learner attribute name to an array of candidate
            values (each must support .shape[0] and integer indexing).
        :param evaluationMethod: callable taking a (trainX, testX, learner)
            tuple and returning a numeric metric.
        :param testX: optional fixed test matrix. When None the splits come
            from Sampling.shuffleSplitRows(X, self.folds, self.validationSize).
        :param minVal: forwarded unchanged to self.setBestLearner.
        :return: array of mean metrics with one axis per key of paramDict.
        """
        logging.debug("Parallel grid search with params: " + str(paramDict))

        # Identity test: "== None" may be evaluated element-wise by array-like objects
        if testX is None:
            trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize)
        else:
            trainTestXs = [[X, testX]]
        numSplits = len(trainTestXs)

        # Fix the key order once so that the grid axes and the attribute
        # assignment below are guaranteed to agree.
        keys = list(paramDict.keys())
        gridSize = [paramDict[key].shape[0] for key in keys]
        gridInds = [numpy.arange(size) for size in gridSize]

        meanMetrics = numpy.zeros(tuple(gridSize))
        paramList = []

        for trainX, foldTestX in trainTestXs:
            for inds in itertools.product(*gridInds):
                learner = self.copy()

                for axis, key in enumerate(keys):
                    setattr(learner, key, paramDict[key][inds[axis]])

                paramList.append((trainX, foldTestX, learner))

        pool = None
        try:
            if self.numProcesses != 1:
                pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
                resultsIterator = pool.imap(evaluationMethod, paramList, self.chunkSize)
            else:
                # itertools.imap only exists on Python 2; on Python 3 the
                # builtin map is already lazy.
                lazyMap = getattr(itertools, "imap", map)
                resultsIterator = lazyMap(evaluationMethod, paramList)

            # Results come back in the order in which paramList was built.
            # Average over the number of splits actually evaluated: with a
            # fixed testX there is exactly one split regardless of self.folds.
            for _ in range(numSplits):
                for inds in itertools.product(*gridInds):
                    meanMetrics[inds] += next(resultsIterator) / float(numSplits)
        finally:
            # Release the worker processes even if an evaluation raised
            if pool is not None:
                pool.terminate()

        self.setBestLearner(meanMetrics, paramDict, minVal)

        return meanMetrics
开发者ID:kentwang,项目名称:sandbox,代码行数:52,代码来源:MaxLocalAUC.py

示例4: main

# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
def main():
    """
    Load the MovieLens 100k ratings file, binarise it and train a CLiMF model.

    The first two columns of the file are used as 1-based row and column
    indices and the third column is thresholded: entries greater than 3 are
    stored as 1 and the rest as 0. One train/test split is drawn with
    Sampling.shuffleSplitRows and the learner is trained on the training part.
    """
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    matrixFileName = PathDefaults.getDataDir() + "movielens/ml-100k/u.data"
    data = numpy.loadtxt(matrixFileName)
    X = sppy.csarray((numpy.max(data[:, 0]), numpy.max(data[:, 1])), storagetype="row")
    # numpy.int was only an alias of the builtin int and was removed in NumPy 1.24
    X[data[:, 0]-1, data[:, 1]-1] = numpy.array(data[:, 2]>3, int)
    logging.debug("Read file: " + matrixFileName)
    logging.debug("Shape of data: " + str(X.shape))
    logging.debug("Number of non zeros " + str(X.nnz))

    validationSize = 5
    trainTestXs = Sampling.shuffleSplitRows(X, 1, validationSize)
    trainX, testX = trainTestXs[0]
    # The learner is trained on the scipy CSR form of the training split
    trainX = trainX.toScipyCsr()

    learner = CLiMF(k=20, lmbda=0.001, gamma=0.0001)
    learner.learnModel(trainX)
开发者ID:charanpald,项目名称:sandbox,代码行数:24,代码来源:CLiMF.py

示例5: MaxLocalAUC

# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
    dataset = sys.argv[1]
else: 
    dataset = "synthetic"

# Experiment configuration.
# NOTE(review): the code that consumes saveResults, outputFile and the learner
# configured below is not visible in this excerpt.
saveResults = True
prefix = "LossROC"
# Results path: <output dir>/ranking/<prefix><Title-cased dataset name>Results.npz
outputFile = PathDefaults.getOutputDir() + "ranking/" + prefix + dataset.title() + "Results.npz" 
X = DatasetUtils.getDataset(dataset, nnz=20000)

m, n = X.shape
u = 0.1 
w = 1-u

# Draw `folds` train/test splits of X; testSize is passed as the third
# argument of shuffleSplitRows.
testSize = 5
folds = 5
trainTestXs = Sampling.shuffleSplitRows(X, folds, testSize)

numRecordAucSamples = 200

# Settings for the MaxLocalAUC learner: k2 and w2 are its first two positional
# arguments, and the same lmbda value is used for both lmbdaU and lmbdaV.
k2 = 8
u2 = 0.5
w2 = 1-u2
eps = 10**-4
lmbda = 0.0
maxLocalAuc = MaxLocalAUC(k2, w2, eps=eps, lmbdaU=lmbda, lmbdaV=lmbda, stochastic=True)
maxLocalAuc.alpha = 0.05
# Candidate alphas: 2**0, 2**-1, ..., 2**-4
maxLocalAuc.alphas = 2.0**-numpy.arange(0, 5, 1)
maxLocalAuc.folds = 1
maxLocalAuc.initialAlg = "rand"
maxLocalAuc.itemExpP = 0.0
maxLocalAuc.itemExpQ = 0.0
开发者ID:charanpald,项目名称:wallhack,代码行数:33,代码来源:LossROCExp.py

示例6: testShuffleSplitRows

# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
    def testShuffleSplitRows(self): 
        m = 10
        n = 16
        k = 5 
        u = 0.5
        w = 1-u
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m,n), k, w, csarray=True, verbose=True, indsPerRow=200)
        
        #print(X.toarray())
        
        k2 = 5 
        testSize = 2
        trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, rowMajor=True)
        
        for i in range(k2): 
            trainX = trainTestXs[i][0]
            testX = trainTestXs[i][1]
                        
            self.assertEquals(trainX.storagetype, "row")
            self.assertEquals(testX.storagetype, "row")
            nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
            nptst.assert_array_equal(testX.sum(1), testSize*numpy.ones(m))
            self.assertEquals(X.nnz, trainX.nnz + testX.nnz)
        
        trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, rowMajor=False)
        
        for i in range(k2): 
            trainX = trainTestXs[i][0]
            testX = trainTestXs[i][1]
                       
            self.assertEquals(trainX.storagetype, "col")
            self.assertEquals(testX.storagetype, "col")                       
            nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
            nptst.assert_array_equal(testX.sum(1), testSize*numpy.ones(m))
            self.assertEquals(X.nnz, trainX.nnz + testX.nnz)        
        
        trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, csarray=False)
        for i in range(k2): 
            trainX = trainTestXs[i][0]
            testX = trainTestXs[i][1]
                        
            nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
            
            nptst.assert_array_equal(numpy.ravel(testX.sum(1)), testSize*numpy.ones(m))
            self.assertEquals(X.nnz, trainX.nnz + testX.nnz)

        testSize = 0
        trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize)
        
        for i in range(k2): 
            trainX = trainTestXs[i][0]
            testX = trainTestXs[i][1]
                        
            nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
            nptst.assert_array_equal(testX.sum(1), testSize*numpy.ones(m))
            self.assertEquals(X.nnz, trainX.nnz + testX.nnz)
            self.assertEquals(testX.nnz, 0)
            
        #Test sampling a subset of the rows 
        testSize = 2
        numRows = 5
        trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, numRows=numRows, rowMajor=False)

        for i in range(k2): 
            trainX = trainTestXs[i][0]
            testX = trainTestXs[i][1]
            
            nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
            self.assertEquals(numpy.nonzero(testX.sum(1))[0].shape[0], numRows)
            self.assertEquals(X.nnz, trainX.nnz + testX.nnz)
            self.assertEquals(testX.nnz, testSize*numRows)
            
        #Make sure column probabilities are correct 
        w = 0.0            
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m,n), k, w, csarray=True, verbose=True, indsPerRow=200)            
            
        testSize = 5
        k2 = 500
        colProbs = numpy.arange(0, n, dtype=numpy.float)+1
        colProbs /= colProbs.sum() 
        trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, colProbs=colProbs)
        
        colProbs2 = numpy.zeros(n)        
        
        for i in range(k2): 
            trainX = trainTestXs[i][0]
            testX = trainTestXs[i][1]
            
            colProbs2 += testX.sum(0)
        
        colProbs2 /= colProbs2.sum() 
        nptst.assert_array_almost_equal(colProbs, colProbs2, 2)
        
        #Now test when probabilities are uniform 
        colProbs = numpy.ones(n)/float(n)        
        trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, colProbs=colProbs)
        
        colProbs = None
        trainTestXs2 = Sampling.shuffleSplitRows(X, k2, testSize, colProbs=colProbs)
        
#.........这里部分代码省略.........
开发者ID:charanpald,项目名称:sandbox,代码行数:103,代码来源:SamplingTest.py

示例7: parallelLearnModel

# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
    def parallelLearnModel(self, X, verbose=False, U=None, V=None):
        """
        Max local AUC with Frobenius norm penalty on V. Solve with parallel (stochastic) gradient descent. 
        The input is a sparse array. 
        """
        # Convert to a csarray for faster access
        if scipy.sparse.issparse(X):
            logging.debug("Converting to csarray")
            X2 = sppy.csarray(X, storagetype="row")
            X = X2

        m, n = X.shape

        # We keep a validation set in order to determine when to stop
        if self.validationUsers != 0:
            numValidationUsers = int(m * self.validationUsers)
            trainX, testX, rowSamples = Sampling.shuffleSplitRows(
                X, 1, self.validationSize, numRows=numValidationUsers
            )[0]
            testIndPtr, testColInds = SparseUtils.getOmegaListPtr(testX)
        else:
            trainX = X
            testX = None
            rowSamples = None
            testIndPtr, testColInds = None, None

        # Note that to compute the test AUC we pick i \in X and j \notin X \cup testX
        indPtr, colInds = SparseUtils.getOmegaListPtr(trainX)
        allIndPtr, allColInds = SparseUtils.getOmegaListPtr(X)

        if U == None or V == None:
            U, V = self.initUV(trainX)

        if self.metric == "f1":
            metricInd = 2
        elif self.metric == "mrr":
            metricInd = 3
        else:
            raise ValueError("Unknown metric: " + self.metric)

        bestMetric = 0
        bestU = 0
        bestV = 0
        trainMeasures = []
        testMeasures = []
        loopInd = 0
        lastObj = 0
        currentObj = lastObj - 2 * self.eps

        numBlocks = self.numProcesses + 1
        gi, gp, gq = self.computeGipq(X)
        normGp, normGq = self.computeNormGpq(indPtr, colInds, gp, gq, m)

        # Some shared variables
        rowIsFree = sharedmem.ones(numBlocks, dtype=numpy.bool)
        colIsFree = sharedmem.ones(numBlocks, dtype=numpy.bool)

        # Create shared factors
        U2 = sharedmem.zeros((m, self.k))
        V2 = sharedmem.zeros((n, self.k))
        muU2 = sharedmem.zeros((m, self.k))
        muV2 = sharedmem.zeros((n, self.k))

        U2[:] = U[:]
        V2[:] = V[:]
        muU2[:] = U[:]
        muV2[:] = V[:]
        del U, V

        rowBlockSize = int(numpy.ceil(float(m) / numBlocks))
        colBlockSize = int(numpy.ceil(float(n) / numBlocks))

        lock = multiprocessing.Lock()
        startTime = time.time()
        loopInd = 0
        iterationsPerBlock = sharedmem.zeros((numBlocks, numBlocks))

        self.learnerCython = self.getCythonLearner()
        nextRecord = 0

        while loopInd < self.maxIterations and abs(lastObj - currentObj) > self.eps:
            if loopInd >= nextRecord:
                if loopInd != 0:
                    print("")

                printStr = self.recordResults(
                    muU2,
                    muV2,
                    trainMeasures,
                    testMeasures,
                    loopInd,
                    rowSamples,
                    indPtr,
                    colInds,
                    testIndPtr,
                    testColInds,
                    allIndPtr,
                    allColInds,
                    gi,
                    gp,
#.........这里部分代码省略.........
开发者ID:kentwang,项目名称:sandbox,代码行数:103,代码来源:MaxLocalAUC.py

示例8: singleLearnModel

# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
    def singleLearnModel(self, X, verbose=False, U=None, V=None):
        """
        Max local AUC with Frobenius norm penalty on V. Solve with (stochastic) gradient descent. 
        The input is a sparse array. 
        """
        # Convert to a csarray for faster access
        if scipy.sparse.issparse(X):
            logging.debug("Converting to csarray")
            X2 = sppy.csarray(X, storagetype="row")
            X = X2

        m, n = X.shape

        # We keep a validation set in order to determine when to stop
        if self.validationUsers != 0:
            numValidationUsers = int(m * self.validationUsers)
            trainX, testX, rowSamples = Sampling.shuffleSplitRows(
                X, 1, self.validationSize, numRows=numValidationUsers
            )[0]

            testIndPtr, testColInds = SparseUtils.getOmegaListPtr(testX)

            logging.debug("Train X shape and nnz: " + str(trainX.shape) + " " + str(trainX.nnz))
            logging.debug("Validation X shape and nnz: " + str(testX.shape) + " " + str(testX.nnz))
        else:
            trainX = X
            testX = None
            rowSamples = None
            testIndPtr, testColInds = None, None

        # Note that to compute the test AUC we pick i \in X and j \notin X \cup testX
        indPtr, colInds = SparseUtils.getOmegaListPtr(trainX)
        allIndPtr, allColInds = SparseUtils.getOmegaListPtr(X)

        if type(U) != numpy.ndarray and type(V) != numpy.ndarray:
            U, V = self.initUV(trainX)

        if self.metric == "f1":
            metricInd = 2
        elif self.metric == "mrr":
            metricInd = 3
        else:
            raise ValueError("Unknown metric: " + self.metric)

        muU = U.copy()
        muV = V.copy()
        bestMetric = 0
        bestU = 0
        bestV = 0
        trainMeasures = []
        testMeasures = []
        loopInd = 0
        lastObj = 0
        currentObj = lastObj - 2 * self.eps

        # Try alternative number of iterations
        # numIterations = trainX.nnz/self.numAucSamples
        numIterations = max(m, n)

        self.learnerCython = self.getCythonLearner()

        # Set up order of indices for stochastic methods
        permutedRowInds = numpy.array(numpy.random.permutation(m), numpy.uint32)
        permutedColInds = numpy.array(numpy.random.permutation(n), numpy.uint32)

        startTime = time.time()

        gi, gp, gq = self.computeGipq(X)
        normGp, normGq = self.computeNormGpq(indPtr, colInds, gp, gq, m)

        while loopInd < self.maxIterations and abs(lastObj - currentObj) > self.eps:
            sigmaU = self.getSigma(loopInd, self.alpha, m)
            sigmaV = self.getSigma(loopInd, self.alpha, m)

            if loopInd % self.recordStep == 0:
                if loopInd != 0 and self.stochastic:
                    print("")

                printStr = self.recordResults(
                    muU,
                    muV,
                    trainMeasures,
                    testMeasures,
                    loopInd,
                    rowSamples,
                    indPtr,
                    colInds,
                    testIndPtr,
                    testColInds,
                    allIndPtr,
                    allColInds,
                    gi,
                    gp,
                    gq,
                    trainX,
                    startTime,
                )
                logging.debug(printStr)

                if testIndPtr is not None and testMeasures[-1][metricInd] >= bestMetric:
#.........这里部分代码省略.........
开发者ID:kentwang,项目名称:sandbox,代码行数:103,代码来源:MaxLocalAUC.py

示例9: modelSelect2

# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
    def modelSelect2(self, X, rhos, ks, cvInds, colProbs=None):
        """
        Pick values of rho and k based on a single matrix X.

        Cross validation is done within X by removing a few non zeros from each
        row to form the test sets (Sampling.shuffleSplitRows). One task is run
        per (fold, k) pair; each task receives the whole rhos array, which must
        be in descending order so that warm restarts can be used. The selected
        values are stored with self.setRho and self.setK: the smallest mean
        metric is chosen for "mse", the largest for "f1" and "mrr".

        :param X: the matrix to split; passed to Sampling.shuffleSplitRows.
        :param rhos: array of candidate rho values in descending order.
        :param ks: array of candidate k values.
        :param cvInds: no longer read; kept so that existing callers keep
            working. The number of folds is taken from the splits actually
            generated, which is what the metrics array is indexed by.
        :param colProbs: forwarded unchanged to Sampling.shuffleSplitRows.
        :return: (meanMetrics, stdMetrics), the mean and standard deviation
            over the folds, with one row per rho and one column per k.
        :raises ValueError: if rhos is not in descending order or self.metric
            is not one of "mse", "f1", "mrr".
        """
        # Any single out-of-place element means the order is wrong
        if (numpy.flipud(numpy.sort(rhos)) != rhos).any():
            raise ValueError("rhos must be in descending order")

        # Validate the metric before the data is split so that a bad
        # configuration fails immediately.
        if self.metric == "mse":
            metricFunction = learnPredictMSE
        elif self.metric == "f1" or self.metric == "mrr":
            metricFunction = learnPredictRanking
        else:
            raise ValueError("Unknown metric: " + self.metric)

        trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, csarray=False, rowMajor=False, colProbs=colProbs)
        numFolds = len(trainTestXs)
        metrics = numpy.zeros((rhos.shape[0], ks.shape[0], numFolds))

        paramList = []

        for i, (trainX, testX) in enumerate(trainTestXs):
            Util.printIteration(i, 1, numFolds, "Fold: ")

            for k in ks:
                learner = self.copy()
                learner.updateAlg = "initial"
                learner.setK(k)
                paramList.append((learner, trainX, testX, rhos))

        pool = None
        try:
            if self.numProcesses != 1:
                pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=10)
                resultsIter = pool.imap(metricFunction, paramList)
            else:
                # itertools.imap only exists on Python 2; on Python 3 the
                # builtin map is already lazy.
                lazyMap = getattr(itertools, "imap", map)
                resultsIter = lazyMap(metricFunction, paramList)

            # Results come back in the order in which paramList was built;
            # each one fills the whole rho axis for its (k, fold) pair.
            for i in range(numFolds):
                for j in range(ks.shape[0]):
                    metrics[:, j, i] = next(resultsIter)
        finally:
            # Release the worker processes even if an evaluation raised
            if pool is not None:
                pool.terminate()

        meanMetrics = metrics.mean(2)
        stdMetrics = metrics.std(2)

        logging.debug("ks=" + str(ks))
        logging.debug("rhos=" + str(rhos))
        logging.debug(meanMetrics)

        # Set the parameters: errors are minimised, ranking metrics maximised
        if self.metric == "mse":
            bestFlatInd = numpy.argmin(meanMetrics)
        else:
            bestFlatInd = numpy.argmax(meanMetrics)

        bestRhoInd, bestKInd = numpy.unravel_index(bestFlatInd, meanMetrics.shape)
        self.setRho(rhos[bestRhoInd])
        self.setK(ks[bestKInd])

        logging.debug("Model parameters: k=" + str(self.k) + " rho=" + str(self.rho))

        return meanMetrics, stdMetrics
开发者ID:charanpald,项目名称:sandbox,代码行数:67,代码来源:IterativeSoftImpute.py

示例10: modelSelect

# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
    def modelSelect(self, X, colProbs=None):
        """
        Perform model selection on X and return the mean and std test metrics.

        The grid is self.ks x self.lmbdaUsers x self.lmbdaItems x self.gammas.
        Every grid point is evaluated with computeTestF1 on each train/test
        split from Sampling.shuffleSplitRows; each lmbdaItems value is used for
        both lmbdaPos and lmbdaNeg. The attributes k, lmbdaUser, lmbdaPos,
        lmbdaNeg and gamma of self are set to the grid point with the largest
        mean metric.

        :param X: the matrix to split; passed to Sampling.shuffleSplitRows.
        :param colProbs: forwarded unchanged to Sampling.shuffleSplitRows.
        :return: (meanTestMetrics, stdTestMetrics), 4-dimensional arrays with
            one axis per grid parameter, taken over the splits.
        """
        trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, csarray=True, colProbs=colProbs)
        numSplits = len(trainTestXs)
        testMetrics = numpy.zeros((self.ks.shape[0], self.lmbdaUsers.shape[0], self.lmbdaItems.shape[0], self.gammas.shape[0], numSplits))

        logging.debug("Performing model selection with test leave out per row of " + str(self.validationSize))
        paramList = []

        for k in self.ks:
            for lmbdaUser in self.lmbdaUsers:
                for lmbdaItem in self.lmbdaItems:
                    for gamma in self.gammas:
                        for trainX, testX in trainTestXs:
                            learner = self.copy()
                            learner.k = k
                            learner.lmbdaUser = lmbdaUser
                            learner.lmbdaPos = lmbdaItem
                            learner.lmbdaNeg = lmbdaItem
                            learner.gamma = gamma

                            paramList.append((trainX, testX, learner))

        pool = None
        try:
            if self.numProcesses != 1:
                pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
                resultsIterator = pool.imap(computeTestF1, paramList, self.chunkSize)
            else:
                import itertools
                # itertools.imap only exists on Python 2; on Python 3 the
                # builtin map is already lazy.
                lazyMap = getattr(itertools, "imap", map)
                resultsIterator = lazyMap(computeTestF1, paramList)

            # Results come back in the order in which paramList was built
            for i in range(self.ks.shape[0]):
                for j in range(self.lmbdaUsers.shape[0]):
                    for s in range(self.lmbdaItems.shape[0]):
                        for t in range(self.gammas.shape[0]):
                            for icv in range(numSplits):
                                testMetrics[i, j, s, t, icv] = next(resultsIterator)
        finally:
            # Release the worker processes even if an evaluation raised
            if pool is not None:
                pool.terminate()

        meanTestMetrics = numpy.mean(testMetrics, 4)
        stdTestMetrics = numpy.std(testMetrics, 4)

        logging.debug("ks=" + str(self.ks))
        logging.debug("lmbdaUsers=" + str(self.lmbdaUsers))
        logging.debug("lmbdaItems=" + str(self.lmbdaItems))
        logging.debug("gammas=" + str(self.gammas))
        logging.debug("Mean metrics=" + str(meanTestMetrics))

        indK, indLmbdaUser, indLmbdaItem, indGamma = numpy.unravel_index(meanTestMetrics.argmax(), meanTestMetrics.shape)
        self.k = self.ks[indK]
        self.lmbdaUser = self.lmbdaUsers[indLmbdaUser]
        self.lmbdaPos = self.lmbdaItems[indLmbdaItem]
        self.lmbdaNeg = self.lmbdaItems[indLmbdaItem]
        self.gamma = self.gammas[indGamma]

        logging.debug("Model parameters: " + str(self))

        return meanTestMetrics, stdTestMetrics
开发者ID:charanpald,项目名称:sandbox,代码行数:64,代码来源:BprRecommender.py

示例11: runExperiment

# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
    def runExperiment(self, X):
        """
        Run the selected ranking experiments and save results
        """
        logging.debug("Splitting into train and test sets")
        #Make sure different runs get the same train/test split
        numpy.random.seed(21)
        m, n = X.shape
        #colProbs = (X.sum(0)+1)/float(m+1)
        #colProbs = colProbs**-self.algoArgs.itemExp
        #colProbs = numpy.ones(n)/float(n)
        trainTestXs = Sampling.shuffleSplitRows(X, 1, self.algoArgs.testSize)
        trainX, testX = trainTestXs[0]
        logging.debug("Train X shape and nnz: " + str(trainX.shape) + " " + str(trainX.nnz))
        logging.debug("Test X shape and nnz: " + str(testX.shape) + " " + str(testX.nnz))

        #Have scipy versions of each array
        trainXScipy = trainX.toScipyCsc()
        testXScipy = testX.toScipyCsc()

        if self.algoArgs.runSoftImpute:
            logging.debug("Running soft impute")
            resultsFileName = self.resultsDir + "ResultsSoftImpute.npz"

            fileLock = FileLock(resultsFileName)

            if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
                fileLock.lock()
                logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                modelSelectX, userInds = Sampling.sampleUsers2(trainXScipy, self.algoArgs.modelSelectSamples, prune=True)

                try:
                    learner = IterativeSoftImpute(self.algoArgs.rhoSi, eps=self.algoArgs.epsSi, k=self.algoArgs.k, svdAlg=self.algoArgs.svdAlg, postProcess=self.algoArgs.postProcess, p=self.algoArgs.pSi, q=self.algoArgs.qSi)
                    learner.folds = self.algoArgs.folds
                    learner.metric = self.algoArgs.metric
                    learner.numProcesses = self.algoArgs.processes
                    learner.recommendSize = self.algoArgs.recommendSize
                    learner.validationSize = self.algoArgs.validationSize

                    if self.algoArgs.modelSelect:
                        cvInds = Sampling.randCrossValidation(self.algoArgs.folds, modelSelectX.nnz)
                        meanErrors, stdErrors = learner.modelSelect2(modelSelectX, self.algoArgs.rhosSi, self.algoArgs.ks, cvInds)

                        modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                        numpy.savez(modelSelectFileName, meanErrors, stdErrors)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)

                    logging.debug(learner)

                    self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

        if self.algoArgs.runMaxLocalAuc:
            logging.debug("Running max local AUC")

            if self.algoArgs.loss != "tanh":
                resultsFileName = self.resultsDir + "ResultsMaxLocalAUC_loss=" + self.algoArgs.loss + ".npz"
            else:
                resultsFileName = self.resultsDir + "ResultsMaxLocalAUC_loss=" + self.algoArgs.loss + "_rho=" + str(self.algoArgs.rhoMlauc) + ".npz"

            fileLock = FileLock(resultsFileName)

            if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
                fileLock.lock()

                try:
                    learner = MaxLocalAUC(self.algoArgs.k, 1-self.algoArgs.u, lmbdaU=self.algoArgs.lmbdaUMlauc, lmbdaV=self.algoArgs.lmbdaVMlauc, eps=self.algoArgs.epsMlauc, stochastic=not self.algoArgs.fullGradient)

                    learner.alpha = self.algoArgs.alpha
                    learner.alphas = self.algoArgs.alphas
                    learner.eta = self.algoArgs.eta
                    learner.folds = self.algoArgs.folds
                    learner.initialAlg = self.algoArgs.initialAlg
                    learner.itemExpP = self.algoArgs.itemExpP
                    learner.itemExpQ = self.algoArgs.itemExpQ
                    learner.ks = self.algoArgs.ks
                    learner.lmbdas = self.algoArgs.lmbdasMlauc
                    learner.loss = self.algoArgs.loss
                    learner.maxIterations = self.algoArgs.maxIterations
                    learner.maxNorms = self.algoArgs.maxNorms
                    learner.maxNormU = self.algoArgs.maxNorm
                    learner.maxNormV = self.algoArgs.maxNorm
                    learner.metric = self.algoArgs.metric
                    learner.normalise = self.algoArgs.normalise
                    learner.numAucSamples = self.algoArgs.numAucSamples
                    learner.numProcesses = self.algoArgs.processes
                    learner.numRowSamples = self.algoArgs.numRowSamples
                    learner.rate = self.algoArgs.rate
                    learner.recommendSize = self.algoArgs.recommendSize
                    learner.recordStep = self.algoArgs.recordStep
                    learner.rho = self.algoArgs.rhoMlauc
                    learner.rhos = self.algoArgs.rhosMlauc
                    learner.startAverage = self.algoArgs.startAverage
                    learner.t0 = self.algoArgs.t0
                    learner.t0s = self.algoArgs.t0s
                    learner.validationSize = self.algoArgs.validationSize
                    learner.validationUsers = self.algoArgs.validationUsers
#.........这里部分代码省略.........
开发者ID:charanpald,项目名称:wallhack,代码行数:103,代码来源:RankingExpHelper.py

示例12: modelSelect

# 需要导入模块: from sandbox.util.Sampling import Sampling [as 别名]
# 或者: from sandbox.util.Sampling.Sampling import shuffleSplitRows [as 别名]
    def modelSelect(self, X, colProbs=None):
        """
        Perform model selection on X and return the mean and std test metrics.

        The grid is self.ks x self.lmbdas x self.gammas. For each k one pair of
        initial factors is drawn with self.initUV(X, k), and every learner for
        that k starts from its own copy of them. Each grid point is evaluated
        with computeTestF1 on every train/test split from
        Sampling.shuffleSplitRows, and self.k, self.lmbda and self.gamma are
        set to the grid point with the largest mean metric.

        :param X: the matrix to split; passed to Sampling.shuffleSplitRows.
        :param colProbs: forwarded unchanged to Sampling.shuffleSplitRows.
        :return: (meanTestMetrics, stdTestMetrics), 3-dimensional arrays with
            one axis per grid parameter, taken over the splits.
        """
        trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, csarray=False, colProbs=colProbs)
        numSplits = len(trainTestXs)
        testAucs = numpy.zeros((len(self.ks), len(self.lmbdas), len(self.gammas), numSplits))

        logging.debug("Performing model selection")
        paramList = []

        for k in self.ks:
            U, V = self.initUV(X, k)
            for lmbda in self.lmbdas:
                for gamma in self.gammas:
                    for trainX, testX in trainTestXs:
                        learner = self.copy()
                        learner.k = k
                        learner.U = U.copy()
                        learner.V = V.copy()
                        learner.lmbda = lmbda
                        learner.gamma = gamma

                        # Each task gets its own float64 CSR version of the split
                        paramList.append((scipy.sparse.csr_matrix(trainX, dtype=numpy.float64), scipy.sparse.csr_matrix(testX, dtype=numpy.float64), learner))

        pool = None
        try:
            if self.numProcesses != 1:
                pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
                resultsIterator = pool.imap(computeTestF1, paramList, self.chunkSize)
            else:
                # itertools.imap only exists on Python 2; on Python 3 the
                # builtin map is already lazy.
                lazyMap = getattr(itertools, "imap", map)
                resultsIterator = lazyMap(computeTestF1, paramList)

            # Results come back in the order in which paramList was built
            for i_k in range(len(self.ks)):
                for i_lmbda in range(len(self.lmbdas)):
                    for i_gamma in range(len(self.gammas)):
                        for i_cv in range(numSplits):
                            testAucs[i_k, i_lmbda, i_gamma, i_cv] = next(resultsIterator)
        finally:
            # Release the worker processes even if an evaluation raised
            if pool is not None:
                pool.terminate()

        meanTestMetrics = numpy.mean(testAucs, 3)
        stdTestMetrics = numpy.std(testAucs, 3)

        logging.debug("ks=" + str(self.ks))
        logging.debug("lmbdas=" + str(self.lmbdas))
        logging.debug("gammas=" + str(self.gammas))
        logging.debug("Mean metrics=" + str(meanTestMetrics))

        i_k, i_lmbda, i_gamma = numpy.unravel_index(meanTestMetrics.argmax(), meanTestMetrics.shape)
        self.k = self.ks[i_k]
        self.lmbda = self.lmbdas[i_lmbda]
        self.gamma = self.gammas[i_gamma]

        logging.debug("Model parameters: k=" + str(self.k) + " lmbda=" + str(self.lmbda) + " gamma=" + str(self.gamma))

        return meanTestMetrics, stdTestMetrics
开发者ID:charanpald,项目名称:sandbox,代码行数:64,代码来源:CLiMF.py


注:本文中的sandbox.util.Sampling.Sampling.shuffleSplitRows方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。