本文整理汇总了Python中sandbox.util.SparseUtils.SparseUtils.pruneMatrix方法的典型用法代码示例。如果您正苦于以下问题：Python SparseUtils.pruneMatrix方法的具体用法？Python SparseUtils.pruneMatrix怎么用？Python SparseUtils.pruneMatrix使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sandbox.util.SparseUtils.SparseUtils的用法示例。
在下文中一共展示了SparseUtils.pruneMatrix方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: processRatings
# 需要导入模块: from sandbox.util.SparseUtils import SparseUtils [as 别名]
# 或者: from sandbox.util.SparseUtils.SparseUtils import pruneMatrix [as 别名]
def processRatings(self):
    """
    Convert the dataset into a matrix and save the results for faster
    access.

    Reads "Ratings.timed.txt" from the "flixster/" folder under the
    project data directory. The first line is read and discarded
    (presumably a header row -- confirm against the data file). Every
    following line is split on whitespace into: customer id, item id,
    rating and a date in "%Y-%m-%d" format.

    Customer and item ids are mapped to consecutive zero-based indices in
    order of first appearance. Ratings whose timestamp is negative are
    dropped. The remaining ratings and timestamps are placed in two sparse
    matrices (customers x items), pruned with
    SparseUtils.pruneMatrix(minNnzRows=10, minNnzCols=10), and written out:

    * self.ratingFileName -- numpy .npz archive holding, positionally,
      itemInds, custInds, ratings, dates
    * self.custDictFileName -- pickled dict: customer id -> index
    * self.itemDictFileName -- pickled dict: item id -> index

    Nothing is done when both the ratings file and the customer dict file
    already exist.

    Raises AssertionError if the number of ratings kept does not equal
    self.numRatings.
    """
    # NOTE(review): only two of the three output files are checked here; a
    # missing item dict alone does not trigger reprocessing -- confirm that
    # this is intended.
    if os.path.exists(self.ratingFileName) and os.path.exists(self.custDictFileName):
        logging.debug("Ratings file " + str(self.ratingFileName) + " already processed")
        return

    dataDir = PathDefaults.getDataDir() + "flixster/"
    logging.debug("Processing ratings given in " + dataDir)

    custIdDict = {}
    itemIdDict = {}

    # Typed arrays keep memory usage low while the file is being read
    itemInds = array.array("I")
    custInds = array.array("I")
    ratings = array.array("f")
    dates = array.array("L")

    # The context manager guarantees the input file is closed even if a
    # line fails to parse
    with open(dataDir + "Ratings.timed.txt") as ratingsFile:
        ratingsFile.readline()

        for itr, line in enumerate(ratingsFile):
            Util.printIteration(itr, 100000, self.numRatings)
            vals = line.split()

            # An unseen id receives the next free index, which is always
            # the current size of the dict
            custInd = custIdDict.setdefault(int(vals[0]), len(custIdDict))
            itemInd = itemIdDict.setdefault(int(vals[1]), len(itemIdDict))

            rating = float(vals[2])
            t = datetime.strptime(vals[3].strip(), "%Y-%m-%d")
            t = int(time.mktime(t.timetuple()))

            #Some dates are before 1970; these cannot be stored in the
            #unsigned dates array so the rating is skipped
            if t >= 0:
                itemInds.append(itemInd)
                custInds.append(custInd)
                ratings.append(rating)
                dates.append(t)

    itemInds = numpy.array(itemInds, numpy.uint32)
    custInds = numpy.array(custInds, numpy.uint32)
    # numpy.float was removed in NumPy 1.24; float64 is the dtype it aliased
    ratings = numpy.array(ratings, numpy.float64)
    dates = numpy.array(dates, numpy.uint64)

    assert ratings.shape[0] == self.numRatings
    logging.debug("Number of ratings " + str(ratings.shape[0]))

    #Prune data
    X = scipy.sparse.csc_matrix((ratings, (custInds, itemInds)))
    X2 = scipy.sparse.csc_matrix((dates, (custInds, itemInds)))
    print(X.shape)

    X, rowInds, colInds = SparseUtils.pruneMatrix(X, minNnzRows=10, minNnzCols=10, verbose=True)
    # Apply the same row/column selection to the dates matrix
    X2 = X2[:, colInds][rowInds, :]
    print(X.shape)

    # NOTE(review): this assumes X.nonzero(), X.data and X2.data all list
    # the stored entries in the same order -- confirm against pruneMatrix.
    (custInds, itemInds) = X.nonzero()
    ratings = X.data
    dates = X2.data
    logging.debug("New number of ratings " + str(ratings.shape[0]))

    numpy.savez(self.ratingFileName, itemInds, custInds, ratings, dates)
    logging.debug("Saved ratings file as " + self.ratingFileName)

    # NOTE(review): the dicts hold the indices assigned before pruning,
    # while the saved index arrays come from the pruned matrix -- verify
    # that consumers account for this.
    with open(self.custDictFileName, 'wb') as custDictFile:
        pickle.dump(custIdDict, custDictFile)
    logging.debug("Saved custIdDict as " + self.custDictFileName)

    with open(self.itemDictFileName, 'wb') as itemDictFile:
        pickle.dump(itemIdDict, itemDictFile)
    logging.debug("Saved itemIdDict as " + self.itemDictFileName)