Python Index.getIDF方法代码示例

本文整理汇总了Python中index.Index.getIDF方法的典型用法代码示例。


示例1: Cluster

# 需要导入模块: from index import Index [as 别名]
# 或者: from index.Index import getIDF [as 别名]

        if self.debug:
            print 'ORIGINAL RANDOM MEANS:', kRandomNumbers
        ''' get those k random items into array (in increasing order) '''
        kRandomCenters = []
        for index in sorted(kRandomNumbers)[::-1]:
            kRandomCenters += [self.dVectors[index]]
        return kRandomCenters, kRandomNumbers
    def distanceBetween(self, postings1, postings2):
        """Return the squared Euclidean distance between two sparse vectors.

        postings1, postings2 -- mappings of dimension key -> numeric weight.
        A key missing from one mapping counts as a weight of 0 there.

        Returns a float; 0.0 when both mappings are empty.
        """
        # The accumulator must exist before the loop; without it the first
        # `+=` raises UnboundLocalError.
        distance = 0.
        # Union of the dimensions used by either vector.
        for key in set(postings1) | set(postings2):
            difference = postings1.get(key, 0.) - postings2.get(key, 0.)
            distance += difference ** 2
        return distance
    def cosineScore(self, document1, document2):
        """Return the cosine distance (1 - cosine similarity) of two documents.

        Both arguments are passed unchanged to getDotProduct and
        calculateMagnitudeOfVector.  When the product of the two magnitudes
        is zero, it is replaced by the float machine epsilon so the division
        cannot raise ZeroDivisionError.
        """
        numerator = self.getDotProduct(document1, document2)
        magnitude1 = self.calculateMagnitudeOfVector(document1)
        magnitude2 = self.calculateMagnitudeOfVector(document2)
        denominator = magnitude1 * magnitude2
        # Machine epsilon stands in for a zero denominator.
        if denominator != 0:
            similarity = numerator / denominator
        else:
            similarity = numerator / sys.float_info.epsilon
        return 1 - similarity
    def calculateMagnitudeOfVector(self, vector):
        """Return the Euclidean length of a sparse vector.

        vector -- mapping of term -> numeric weight.

        Returns the square root of the sum of the squared weights
        (0.0 for an empty mapping).
        """
        sumOfSquares = 0.
        for weight in vector.values():
            sumOfSquares += weight ** 2
        return math.sqrt(sumOfSquares)
    def getDotProduct(self, document1, document2):
        """Return the dot product of two sparse term -> weight mappings.

        Only terms present in both mappings contribute, so there is no need
        to build the union of the key sets: the smaller mapping is walked
        and each of its terms is looked up in the other.

        Returns a float; 0.0 when the mappings share no terms.
        """
        dotProduct = 0.0
        # Walk the smaller mapping to minimise the number of lookups.
        if len(document2) < len(document1):
            document1, document2 = document2, document1
        for term, weight in document1.items():
            if term in document2:
                dotProduct += weight * document2[term]
        return dotProduct
    def getTFIDF(self, posting):
        tf = posting.getTF()
        idf = self.index.getIDF(posting.getTerm())
        if self.debug:
            print 'tfidf for', posting.getTerm()
            print 'TF:   ', tf
            print 'DF:   ', self.index.getTerm(posting.getTerm()).getDocumentFrequency()
            print 'IDF:  ', idf
            print 'TFIDF:', tf*idf 
        #return tf*idf #* 10   # multiply by some number because values < 1 square differently... just wanted to exaggerate the distances
        return tf
        #return random.randint(0,50)
    def sortDictionary(self, dictionary):
        """Return the items of `dictionary` as an OrderedDict sorted by value.

        Items are ordered by ascending value; items with equal values keep
        the relative order in which `dictionary` yields them, because
        sorted() is stable.  The input mapping is not modified.
        """
        return collections.OrderedDict(sorted(dictionary.items(), key=itemgetter(1)))
    def reduceDimensionality(self, dictionary, numItemsToKeep=20):
        """Return a new dict holding the highest-valued entries of `dictionary`.

        dictionary     -- mapping of term -> numeric weight (not modified).
        numItemsToKeep -- maximum number of entries to keep.

        If `dictionary` has fewer than `numItemsToKeep` entries, all of them
        are kept rather than raising KeyError from popping an exhausted
        mapping.  A non-positive `numItemsToKeep` yields an empty dict.
        """
        # Ascending, stable sort by weight: the same ordering sortDictionary
        # produces, so the entries selected (including ties) are unchanged.
        ranked = sorted(dictionary.items(), key=itemgetter(1))
        keepCount = max(0, min(numItemsToKeep, len(ranked)))
        newdict = {}
        # Take entries from the high end, largest weight first.
        for term, weight in reversed(ranked[len(ranked) - keepCount:]):
            newdict[term] = weight
        return newdict
