This article collects typical usage examples of the Python method Pycluster.kcluster. If you are wondering what Pycluster.kcluster does, how to call it, or what real-world uses look like, the curated code examples below should help. You can also explore further usage examples of the Pycluster module that the method belongs to.
The text below shows 15 code examples of Pycluster.kcluster, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
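Before working through the collected examples, here is a minimal self-contained sketch of a basic kcluster call (not taken from any of the examples below; the data and parameter values are illustrative only):

import numpy as np
import Pycluster

# ten 2-D points; kcluster treats each row as one observation by default
data = np.random.rand(10, 2)

# partition into 3 clusters: returns one cluster id per row, the within-cluster
# error of the best solution found, and how many times that solution was found
clusterid, error, nfound = Pycluster.kcluster(data, nclusters=3, npass=10)
print(clusterid, error, nfound)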
Example 1: multikmeans
# Required import: import Pycluster [as alias]
# Or: from Pycluster import kcluster [as alias]
def multikmeans(self, krange=None):
    # The magic recipe: run k-means once for every k in the range
    if krange is None:
        kr = np.arange(2, len(self.mat) - 1)
    else:
        kr = krange
    lmat = len(self.mat)
    accords = np.zeros((lmat, lmat), dtype=int)  # counts how often each pair of documents is clustered together
    t = deque()  # use a deque instead of a list to save time and memory
    t0 = time()
    k2s = lambda x: x * 0.85
    tunits = k2s(np.array(kr)).sum()
    # The loop itself
    for k in kr:
        t1 = time()
        # K-means
        c, err, nfound = pc.kcluster(self.mat, k)
        # Update the co-membership counts
        for i in np.unique(c):
            accords[c == i] += c == i
        # Estimate the remaining time
        t2 = time()
        tunits -= k2s(k)
        t.append((t2 - t1) / k2s(k))
        prediction = tunits * np.mean(tuple(t)[-20:])
        print "k={0}: \t{1} ({2} since start) \t{3} remaining".format(k, human_time(t2 - t1), human_time(t2 - t0), human_time(prediction))
    return accords / float(len(kr))  # fraction of runs in which each pair shared a cluster
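As a rough illustration of how the returned agreement matrix might be used (this caller is hypothetical and not part of the original source), one can threshold the co-clustering fractions to find document pairs that end up together in most runs:

import numpy as np

# 'agreement' is the matrix returned by multikmeans: agreement[i, j] is the
# fraction of k-means runs in which documents i and j shared a cluster
agreement = obj.multikmeans(krange=np.arange(2, 10))  # 'obj' is a hypothetical instance

# report pairs that co-clustered in at least 80% of the runs
stable_i, stable_j = np.where(np.triu(agreement >= 0.8, k=1))
for i, j in zip(stable_i, stable_j):
    print("documents %d and %d co-cluster in %.0f%% of runs" % (i, j, 100 * agreement[i, j]))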
Example 2: findcenters
# Required import: import Pycluster [as alias]
# Or: from Pycluster import kcluster [as alias]
def findcenters(x, n=1000, k=6):
    # get dimensions
    m = x.shape[1]
    # create centers as empty
    centers = DataFrame(np.zeros(shape=(k, m)))
    for i in range(n):
        labels, _, _ = Pycluster.kcluster(x, nclusters=k, transpose=0, method="a", dist="e", npass=1)
        center, _ = Pycluster.clustercentroids(x, clusterid=labels)
        # sort centers by their distance to the origin (farthest first)
        center = sorted(center, key=lambda t: np.linalg.norm(np.array(t) - np.zeros(m)), reverse=True)
        # (debug) the distance and coordinates of each sorted center can be printed here
        # accumulate, then take the average over the n runs
        for j in range(k):
            centers.iloc[j, :] = centers.iloc[j, :] + center[j]
    centers = centers / n
    return centers
Example 3: cluster
# Required import: import Pycluster [as alias]
# Or: from Pycluster import kcluster [as alias]
def cluster(parser, k):
    """
    General method for clustering data.
    """
    # get an index number for every page
    code_book = parser.get_data_encoding(page_min_occurance=5)
    # use only the sequence of pages visited
    simple_session = [session for session in parser.get_simple_sessions() if config.session_filter_fn(session)]
    # use vector representation (v1, v2, v3) where v1 means page v1 was visited
    #models = session_modeling.convert_sessions_to_vector(simple_session, code_book, binary=True)
    # construct Markov chains, estimate transition probabilities
    models = session_modeling.convert_sessions_to_markov(simple_session, code_book, bayes=False)
    idx, sse, _ = Pycluster.kcluster(models, k, method='a', dist='e')
    #idx, sse, _ = cluster_kmedoids(models, k, string_similarity.jaccard_distance)
    clusters = {}
    for name, clusterid in zip(simple_session, idx):
        clusters.setdefault(clusterid, []).append(name)
    return clusters, sse
Example 4: suggest
# Required import: import Pycluster [as alias]
# Or: from Pycluster import kcluster [as alias]
def suggest(self, word):
    v = self.analyze(word)
    # pick the first x candidates
    res = []
    for nword, nv in self.ndx.items():
        wsim = self.compute_similarity([v, nv])
        res.append((wsim, nword, self.as_vector(nv)))
    res.sort()
    res = res[::-1]
    # from the first y, pick the most distant ones
    res2 = [v for (sim, word, v) in res]
    resw = [word for (sim, word, v) in res]
    lab, err, nfound = Pycluster.kcluster(res2, 40)
    resg = defaultdict(lambda: [])
    for i, l in enumerate(lab):
        resg[l] += [res[i]]
    res_sug = []
    used_groups = set()
    for l, w in zip(lab, resw):
        if l not in used_groups:
            res_sug += [w]
            used_groups.add(l)
    return res_sug
Example 5: testPricesDiffsVecsKmeansClustering
# Required import: import Pycluster [as alias]
# Or: from Pycluster import kcluster [as alias]
def testPricesDiffsVecsKmeansClustering(self):
    """Test whether k-means clustering of price-difference vectors works."""
    prices_diffs_vecs = utils.make_prices_diffs_vecs(self.data1)
    labels, wcss, n = Pycluster.kcluster(prices_diffs_vecs, 3, npass=100)
    clusters = utils.make_groups_from_labels(labels, self.data1)
    # The result should look like this, modulo group numbers. With npass=100
    # the probability of getting a different grouping is very low, but it can
    # still happen.
    suggested_clusters = {0: ['E'], 1: ['A', 'D'], 2: ['B', 'C']}
    # Let's check this.
    num_matches = 0
    for cluster in clusters.values():
        cluster.sort()
        for suggested_cluster in suggested_clusters.values():
            suggested_cluster.sort()
            if cluster == suggested_cluster:
                num_matches = num_matches + 1
    # Each suggested cluster exists in the output of kcluster, and because the
    # clusters dict has length 3 the two groupings must be equal.
    self.assertEqual(num_matches, 3)
    self.assertEqual(len(clusters), 3)
Example 6: pyclustertest
# Required import: import Pycluster [as alias]
# Or: from Pycluster import kcluster [as alias]
def pyclustertest():
    data = sp.rand(100, 4)
    cid, e, n = pcl.kcluster(data)
    centroids, cmask = pcl.clustercentroids(data, clusterid=cid)
    print data
    print centroids
Example 7: clusters
# Required import: import Pycluster [as alias]
# Or: from Pycluster import kcluster [as alias]
def clusters(labels, data, k):
    kclus = Pycluster.kcluster(data, k, npass=1)[0]
    nx = numpy.zeros((len(labels), len(labels)), dtype=numpy.float32)
    for ind1 in range(len(labels)):
        for ind2 in range(len(labels)):
            if kclus[ind1] == kclus[ind2]:
                nx[ind1][ind2] = 1
    print k, " of ", len(labels)
    return nx
Example 8: getlabels
# Required import: import Pycluster [as alias]
# Or: from Pycluster import kcluster [as alias]
def getlabels(x, y, n=1000, k=8):
    if y == "none":
        y = x
    # fit k-means clusters
    labels, _, _ = Pycluster.kcluster(y, nclusters=k, transpose=0,
                                      method='a', dist='e', npass=n)
    # write labels back
    x.loc[:, "group"] = labels
    return x
Example 9: findk
# Required import: import Pycluster [as alias]
# Or: from Pycluster import kcluster [as alias]
def findk(x, n=1000, minK=2, maxK=20):
    errors = []
    # fit k-means clusters for each candidate k
    for i in range(minK, maxK + 1, 1):
        _, error, nfound = Pycluster.kcluster(x, nclusters=i, transpose=0, method="a", dist="e", npass=n)
        # collect the within-cluster error for this k
        errors.append(error)
        print i
    print errors
Example 10: cluster_spw_rpw
# Required import: import Pycluster [as alias]
# Or: from Pycluster import kcluster [as alias]
def cluster_spw_rpw(list_of_recs):
    number_of_clusters = 8
    only_serve_return = []
    if list_of_recs == []:
        print "ERROR"
    for rec in list_of_recs:
        only_serve_return.append([float(rec[0]), float(rec[1])])
    k = get_k_value(only_serve_return)
    labels, error, nfound = Pycluster.kcluster(scipy.array(only_serve_return), k)
    return labels
Example 11: _G
# Required import: import Pycluster [as alias]
# Or: from Pycluster import kcluster [as alias]
def _G(self, data, K):
    labels, _, _ = Pycluster.kcluster(data.T, K)
    centers, _ = Pycluster.clustercentroids(data.T, clusterid=labels)
    centers = centers.T
    G = zeros((K, data.shape[1]))
    for k in range(K):
        D = data - expand_dims(centers[:, k], axis=1)
        G[k, :] = -sqrt(sum(multiply(D, D), axis=0))
    return G
Example 12: cluster
# Required import: import Pycluster [as alias]
# Or: from Pycluster import kcluster [as alias]
def cluster():
    x = [[76.0, 32.0], [63.0, 40.0], [70.0, 30.0], [64.0, 45.0]]
    k = 2
    labels, error, nfound = Pycluster.kcluster(scipy.array(x), k)
    print "Input data:"
    print " spw " + " rpw"
    j = 1
    for i in x:
        print str(j) + ") " + str(i[0]) + "    " + str(i[1])
        j += 1
    print " "
    print "clusters: " + str(labels)
Example 13: getlabels
# Required import: import Pycluster [as alias]
# Or: from Pycluster import kcluster [as alias]
def getlabels(x, y, n=1000, k=8):
    if y == "none":
        y = x
    # fit k-means clusters
    labels, _, _ = Pycluster.kcluster(y, nclusters=k, transpose=0, method="a", dist="e", npass=n)
    # write labels back
    x.loc[:, "group"] = labels
    # count how many items are in each group
    labels = list(labels)
    for i in range(k):
        print labels.count(i)
    return x
Example 14: kmeans
# Required import: import Pycluster [as alias]
# Or: from Pycluster import kcluster [as alias]
def kmeans(data, **kwargs):
    """
    Perform k-means clustering on unstructured N-dimensional data.

    @type data: array
    @param data: The data to be clustered
    @type kwargs: dict
    @param kwargs: The following args are accepted:
        - numClusters: The number of clusters to form (the returned number of clusters may be less than k).
        - npasses: The number of times the k-means clustering algorithm is performed,
          each time with a different (random) initial condition.
        - method: describes how the center of a cluster is found:
            - method=='a': arithmetic mean.
            - method=='m': median.
        - initialCenters: a set of points that should be used as the initial
          cluster centers.

    @rtype: tuple
    @return: A list where each element indicates the cluster membership of the
        corresponding index in the original data, and a message string.
    """
    k = 1
    npasses = 1
    method = 'a'
    initialCenters = None
    smartCenters = False
    msg = ''

    if 'numClusters' in kwargs:
        k = int(kwargs['numClusters'])
    if 'npasses' in kwargs:
        npasses = int(kwargs['npasses'])
    if 'method' in kwargs:
        method = kwargs['method']
    if 'initialCenters' in kwargs:
        initialCenters = kwargs['initialCenters']
    if 'smartCenters' in kwargs:
        smartCenters = kwargs['smartCenters']

    logData = tm.getMethod('log')(data)
    if initialCenters is None:
        # no user-supplied centers: let Pycluster pick random initial conditions
        (clusterIDs, err, nOpt) = pc.kcluster(logData, k, npass=npasses, method=method)
        msg = "Number of rounds optimal solution was found: %i" % nOpt
    else:
        # seed scipy's kmeans2 with the (log-transformed) user-supplied centers
        logCenters = tm.getMethod('log')(np.array(initialCenters[:k]))
        (centroids, clusterIDs) = kmeans2(logData, logCenters, minit='matrix')
        if len(np.unique(clusterIDs)) < k:
            wx.MessageBox('Warning: One or more of the returned clusters are empty. Please choose different initial cluster centers and re-run k-means for better results.', 'Insufficiently varied cluster centers', wx.OK | wx.ICON_WARNING)
    return clusterIDs, msg
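A hypothetical invocation of this wrapper (not from the original source; it assumes the surrounding application provides the tm transform registry, pc as Pycluster, and the wx GUI context) could look like this:

import numpy as np

# 500 strictly positive 3-D samples, since the wrapper log-transforms its input
sample = np.random.rand(500, 3) + 0.01
ids, message = kmeans(sample, numClusters=4, npasses=10, method='a')
print(message)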
Example 15: clustering
# Required import: import Pycluster [as alias]
# Or: from Pycluster import kcluster [as alias]
def clustering(file_path, k, dist_measure, PLOT):
    """
    Do the K-means clustering for the input data.

    @param file_path: Input data file.
    @param k: Number of centers in the K-means algorithm.
    @param dist_measure: Distance measure (in this case, we use Manhattan distance).
    @param PLOT: Bool variable; the result is plotted only if True (set it to True only in testing).
    @return: Cluster ids for all data points in the input data file.
    """
    data = numpy.genfromtxt(file_path, delimiter=',')
    if len(data.shape) == 1:
        return [-1]
    print "-- Processing file: " + file_path + " -- Data points: " + str(len(data))
    print "-- Start clustering"
    k = set_k(len(data), k)
    ite_num = method_name(len(data))
    # Do the K-means clustering
    cluster_id, _, _ = Pycluster.kcluster(data, nclusters=k, mask=None, weight=None, transpose=0, npass=ite_num,
                                          method='a', dist=dist_measure, initialid=None)
    if PLOT is False:
        return cluster_id
    # Draw the clustering result plot.
    centroids, _ = Pycluster.clustercentroids(data, clusterid=cluster_id)
    if PLOT:
        data_pca = mlab.PCA(data)
        cutoff = data_pca.fracs[1]
        data_2d = data_pca.project(data, minfrac=cutoff)
        centroids_2d = data_pca.project(centroids, minfrac=cutoff)
    else:
        data_2d = data
        centroids_2d = centroids
    color = ['#2200CC', '#D9007E', '#FF6600', '#FFCC00', '#ACE600', '#0099CC',
             '#8900CC', '#FF0000', '#FF9900', '#FFFF00', '#00CC01', '#0055CC']
    for i in range(k):
        scatter(data_2d[cluster_id == i, 0], data_2d[cluster_id == i, 1], color=color[i % 12])
    plot(centroids_2d[:, 0], centroids_2d[:, 1], 'sg', markersize=8)
    show()
    return cluster_id
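A hypothetical call to this function (the file name is illustrative; dist_measure='b' selects Pycluster's city-block/Manhattan distance, matching the docstring) might be:

cluster_id = clustering("features.csv", k=8, dist_measure='b', PLOT=False)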