本文整理汇总了Python中index.Index.getIDF方法的典型用法代码示例。如果您正苦于以下问题：Python Index.getIDF方法的具体用法？Python Index.getIDF怎么用？Python Index.getIDF使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类index.Index的用法示例。
在下文中一共展示了Index.getIDF方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: Cluster
# 需要导入模块: from index import Index [as 别名]
# 或者: from index.Index import getIDF [as 别名]
#.........这里部分代码省略.........
if self.debug:
print 'ORIGINAL RANDOM MEANS:', kRandomNumbers
''' get those k random items into array (in increasing order) '''
kRandomCenters = []
for index in sorted(kRandomNumbers)[::-1]:
kRandomCenters += [self.dVectors[index]]
return kRandomCenters, kRandomNumbers
def distanceBetween(self, postings1, postings2):
    """Return the squared Euclidean distance between two sparse vectors.

    Args:
        postings1: mapping of term -> numeric weight.
        postings2: mapping of term -> numeric weight.

    Returns:
        The sum, over every term present in either mapping, of the squared
        difference of the two weights. A term missing from one mapping
        counts as weight 0 there. The result is a float (0.0 when both
        mappings are empty).
    """
    distance = 0.
    # Build the union from sets: concatenating `keys()` results only works
    # with Python 2 lists and raises TypeError on Python 3 dict views.
    for term in set(postings1) | set(postings2):
        difference = postings1.get(term, 0.) - postings2.get(term, 0.)
        # Squaring already discards the sign, so no abs() is needed.
        distance += difference ** 2
    return distance
def cosineScore(self, document1, document2):
    """Return the cosine distance (1 - cosine similarity) of two documents.

    Both documents are passed unchanged to ``self.getDotProduct`` and
    ``self.calculateMagnitudeOfVector``. When the product of the two
    magnitudes is zero, ``sys.float_info.epsilon`` is used as the divisor
    instead so the division cannot fail.
    """
    numerator = self.getDotProduct(document1, document2)
    norm_product = (self.calculateMagnitudeOfVector(document1)
                    * self.calculateMagnitudeOfVector(document2))
    # Swap a zero denominator for machine epsilon to avoid ZeroDivisionError.
    divisor = sys.float_info.epsilon if norm_product == 0 else norm_product
    similarity = numerator / divisor
    return 1 - similarity
def calculateMagnitudeOfVector(self, vector):
    """Return the Euclidean length of a sparse vector.

    ``vector`` is iterated for its keys and indexed by each key; the
    result is the square root of the sum of the squared values (0.0 for an
    empty vector).
    """
    sum_of_squares = 0.
    for key in vector:
        sum_of_squares += vector[key] ** 2
    return math.sqrt(sum_of_squares)
def getDotProduct(self, document1, document2):
    """Return the dot product of two sparse vectors.

    Args:
        document1: mapping of term -> numeric weight.
        document2: mapping of term -> numeric weight.

    Returns:
        The sum of ``document1[term] * document2[term]`` over the terms
        present in both mappings, as a float (0.0 when nothing is shared).
    """
    # Only shared terms contribute, so walk the smaller mapping and probe
    # the larger one. This also avoids `keys() + keys()`, which raises
    # TypeError on Python 3 dict views.
    if len(document2) < len(document1):
        smaller, larger = document2, document1
    else:
        smaller, larger = document1, document2
    dotProduct = 0.0
    for term in smaller:
        if term in larger:
            dotProduct += smaller[term] * larger[term]
    return dotProduct
def getTFIDF(self, posting):
    """Return the weight used for *posting*.

    Despite the name, only the raw term frequency from ``posting.getTF()``
    is returned. The IDF is still looked up through ``self.index`` but is
    not folded into the result.
    """
    term_frequency = posting.getTF()
    # The IDF lookup is kept so the index is queried exactly as before,
    # even though its value does not influence the returned weight.
    self.index.getIDF(posting.getTerm())
    return term_frequency
def sortDictionary(self, dictionary):
    """Return the items of *dictionary* ordered by ascending value.

    Args:
        dictionary: mapping whose values are mutually comparable.

    Returns:
        A ``collections.OrderedDict`` holding the same items, smallest
        value first. Items with equal values keep the relative order in
        which ``dictionary.items()`` produced them, because ``sorted`` is
        stable.
    """
    # The unused nested comparison helper was dropped; ordering comes
    # solely from the value-based sort key.
    by_value = sorted(dictionary.items(), key=itemgetter(1))
    return collections.OrderedDict(by_value)
def reduceDimensionality(self, dictionary, numItemsToKeep=20):
    """Keep only the highest-valued entries of *dictionary*.

    Args:
        dictionary: mapping of term -> weight.
        numItemsToKeep: maximum number of entries to retain.

    Returns:
        A new dict with at most ``numItemsToKeep`` entries, taken from the
        end of the ordering produced by ``self.sortDictionary`` (the
        largest values). If the input has fewer entries than requested,
        all of them are returned.
    """
    ranked = self.sortDictionary(dictionary)
    # Clamp the count: popping more items than exist raises KeyError.
    keep = min(numItemsToKeep, len(ranked))
    reduced = {}
    for _ in range(keep):
        # popitem() removes from the end of the ascending ordering, i.e.
        # the largest remaining value.
        term, weight = ranked.popitem()
        reduced[term] = weight
    return reduced