This article collects typical usage examples of the fcluster function from Python's scipy.cluster.hierarchy module. If you have been wondering what fcluster does, how to call it, and what real-world usage looks like, the curated examples here should help.
Fifteen code examples of fcluster are shown below, sorted by popularity by default.
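Before the project-specific examples, here is a minimal, self-contained sketch of the usual workflow: compute condensed pairwise distances, build a linkage matrix, then cut it into flat clusters with fcluster. The synthetic data and the 1.0 distance threshold are illustrative assumptions, not taken from any example below.

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist

# Two well-separated point clouds (illustrative data).
rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 0.1, (5, 2)), rng.normal(5, 0.1, (5, 2))])

Z = linkage(pdist(X, metric='euclidean'), method='average')  # linkage matrix
labels = fcluster(Z, t=1.0, criterion='distance')            # flat cluster labels
print(labels)  # e.g. [1 1 1 1 1 2 2 2 2 2]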
Example 1: hcluster_cols
def hcluster_cols(self, thresh):
    # Cosine distance can fail (e.g. on zero columns), so fall back to Euclidean.
    try:
        link = linkage(self.X.T, method='complete', metric='cosine')
        assignments = fcluster(link, thresh, 'distance')
    except ValueError:
        link = linkage(self.X.T, method='complete', metric='euclidean')
        assignments = fcluster(link, thresh, 'distance')

    col_ind = np.arange(len(self.crimes))
    # Group the column indices by their flat-cluster label.
    d = pd.DataFrame(list(zip(col_ind, assignments))).groupby(1)[0].aggregate(lambda x: tuple(x))
    df_new = pd.DataFrame(index=np.arange(len(self.names)))
    for i in d:
        cols = list(i)
        if len(cols) > 1:
            # Collapse clustered columns into their mean.
            df_new[str(self.crimes[cols])] = np.mean(self.X[:, cols], axis=1)
        else:
            df_new[str(self.crimes[cols[0]])] = self.X[:, cols[0]]
    # plt.figure(figsize=(10, 20))
    # dendro = dendrogram(link, color_threshold=thresh, leaf_font_size=13,
    #                     labels=self.crimes, orientation='left')
    # plt.subplots_adjust(top=.99, bottom=0.5, left=0.05, right=0.99)
    # plt.show()
    self.df = df_new
    self.crimes = df_new.columns.values
Example 2: cluster_fps
def cluster_fps(self):
    clkg = hcluster.linkage(self.dm, method='average')
    coarse_r = hcluster.fcluster(clkg, 0.3, criterion='distance')
    self.coarse_r = coarse_r
    bcount = np.bincount(coarse_r)
    knum = len(np.nonzero(bcount > 1)[0])
    s = self.density_matrix.shape
    # The SVD refinement branch is disabled by the leading "False",
    # so the coarse clustering is always returned here.
    if False and len(s) > 1 and s[0] > 10 and s[1] > 10 and knum < min(s) / 2:
        (u, s, vt) = la.svds(self.sps_matrixs, k=knum)
        self.u = u
        print('============')
    else:
        self.result = self.coarse_r
        return (clkg, clkg)

    # rankA = npla.matrix_rank(self.sps_matrixs)
    # if rankA < 3:
    a = np.matrix(np.diag(s)) * np.matrix(vt)
    pd = dist.pdist(np.array(a.T), 'cosine')
    pd[np.abs(pd) < 1e-11] = 0
    lkg = hcluster.linkage(pd, method='average')
    self.lkg = lkg
    self.result = hcluster.fcluster(lkg, self.svd_cluster_thr, criterion='distance')
    # self.result = hcluster.fcluster(lkg, 1)
    # self.result = hcluster.fclusterdata(u, 0.7, metric='cosine', criterion='distance', method='average')
    return (lkg, clkg)
Example 3: elbow
def elbow(self, no_plot=False):
    """Plot within-groups variance vs. number of clusters.

    The elbow criterion can be used to choose the number of clusters.
    """
    from scipy.cluster.hierarchy import fcluster
    import matplotlib.pyplot as plt
    idx = fcluster(self.Z, len(self.data), criterion='maxclust')
    nclust = list(np.arange(1, np.sqrt(idx.max() / 2) + 1, dtype=int))
    within_grp_var = []
    mean_var = []
    for n in nclust:
        idx = fcluster(self.Z, n, criterion='maxclust')
        grp = [np.flatnonzero(idx == c) for c in np.unique(idx)]
        # between_grp_var = Group([self.data[ix].R.uv for ix in grp]).var
        var = [100 * self.data[ix].var for ix in grp]
        within_grp_var.append(var)
        mean_var.append(np.mean(var))
    if not no_plot:
        plt.boxplot(within_grp_var, positions=nclust)
        plt.plot(nclust, mean_var, 'k')
        plt.xlabel('Number of clusters')
        plt.ylabel('Variance')
        plt.title('Within-groups variance vs. number of clusters')
        plt.show()
    else:
        return nclust, within_grp_var
Example 4: refineEnsemble
def refineEnsemble(ens, lower=.5, upper=10.):
    """Refine a PDB ensemble based on RMSD criteria."""
    from scipy.cluster.hierarchy import linkage, fcluster
    from scipy.spatial.distance import squareform
    from collections import Counter

    ### calculate pairwise RMSDs ###
    RMSD = ens.getRMSDs(pairwise=True)
    # convert the RMSD table to the compressed form
    v = squareform(RMSD)

    ### apply upper threshold ###
    Z_upper = linkage(v, method='complete')
    labels = fcluster(Z_upper, upper, criterion='distance')
    most_common_label = Counter(labels).most_common(1)[0][0]
    I = np.where(labels == most_common_label)[0]

    ### apply lower threshold ###
    Z_lower = linkage(v, method='single')
    labels = fcluster(Z_lower, lower, criterion='distance')
    uniq_labels = np.unique(labels)
    clusters = []
    for label in uniq_labels:
        indices = np.where(labels == label)[0]
        clusters.append(indices)

    J = np.ones(len(clusters), dtype=int) * -1
    rmsd = None
    for i, cluster in enumerate(clusters):
        if len(cluster) > 0:
            # find the conformations with the largest coverage
            # (the weight of the ref should be 1)
            weights = np.array([ens[j].getWeights().sum() for j in cluster])
            js = np.where(weights == np.max(weights))[0]
            # in the case where there are multiple structures with the same weight,
            # the one with the smallest rmsd w.r.t. ens._coords is selected.
            if len(js) > 1:
                # rmsd is not calculated unless necessary, for efficiency
                rmsd = ens.getRMSDs() if rmsd is None else rmsd
                j = js[np.argmin(rmsd[js])]
            else:
                j = js[0]
            J[i] = cluster[j]
        else:
            J[i] = cluster[0]

    ### refine ensemble ###
    K = np.intersect1d(I, J)
    reens = ens[K]
    return reens
Example 5: cutTree
def cutTree(z, threshold, crit):
    try:
        z = np.clip(z, 0, 9999999)
        tree = hac.fcluster(z, threshold, criterion=crit)
        return tree
    except ValueError as e:
        print("cutTree: %s" % str(e))
        print("negative values in matrix")
        # Fall back to a fixed cutoff. Note that "euclidean" is not a valid
        # fcluster criterion, so "distance" is used for the retry.
        tree = hac.fcluster(z, 50, criterion="distance")
        return tree
Example 6: process_stay
def process_stay(imei, traj):
    # print(imei, '------------------------>', traj.shape)
    r = 20
    interval = 60 * 8
    # wfs = wfs[:1000]
    # traj = traj[:1000]
    if len(traj.shape) < 1 or traj.shape[0] < 2:
        return
    x = traj['x']
    y = traj['y']
    in_sample = False
    if sample_range is not None:
        for (cx, cy, cr) in sample_range:
            crange = math.sqrt(math.pow(cx - x[0], 2) + math.pow(cy - y[0], 2))
            if crange < cr:
                in_sample = True
                break
    # ids = grid_util.get_grid_ids(np.median(x), np.median(y), 300, 3)
    if not in_sample:
        return
    ids = G.get_gridids_with_align(np.median(x), np.median(y))

    dm = get_pdist(traj, 100, convert_sig=True)
    dm[np.abs(dm) < 1e-3] = 0
    # lkg = hcluster.linkage(traj[..., :2], metric='euclidean', method='average')
    lkg = hcluster.linkage(dm, method='average')
    rst = hcluster.fcluster(lkg, 0.7, criterion='distance')        # coarse distance threshold
    rst_merge = hcluster.fcluster(lkg, 0.2, criterion='distance')  # finer threshold for merging

    # collect the boundaries of consecutive runs of identical cluster labels
    seg = []
    for i in range(len(rst) + 1):
        if i == 0 or i == len(rst) or rst[i] != rst[i - 1]:
            seg.append(i)

    for (s, e) in zip(seg[:-1], seg[1:]):
        seg_traj = traj[s:e]
        seg_id = rst_merge[s:e]
        itl = seg_traj[-1]['t'] - seg_traj[0]['t']
        if itl > interval:
            print_merge_fp(ids, imei, seg_traj, seg_id, itl)
Example 7: clusterTrajectories
def clusterTrajectories(
    trajectories, fname, path, metric_func=trajectoryDissimilarityL2,
    user_distance_matrix=None, criterion="distance"
):
    """
    trajectories: the trajectories need to be in XY coordinates
    """
    plot_path = utils.queryPath(path + "/plots")
    if user_distance_matrix is None:
        distance_matrix = getTrajectoryDistanceMatrix(trajectories, metric_func)
        writeToCSV.saveData(distance_matrix, path + "/" + fname)  # save the distance_matrix
    else:
        distance_matrix = user_distance_matrix
    assert len(distance_matrix) == len(
        trajectories
    ), "distance_matrix (n, n) and trajectories (n) should have the same number of samples"
    print("distance_matrix:\n", distance_matrix)

    v = DIST.squareform(distance_matrix)
    cluster_result = HAC.linkage(v, method="average")
    dg = HAC.dendrogram(cluster_result)
    plt.xlabel("cluster_dendrogram_{fname}".format(fname=fname))
    plt.savefig("{path}/cluster_dendrogram_{fname}.png".format(fname=fname, path=plot_path))
    plt.clf()

    if criterion == "distance":
        if metric_func == trajectoryDissimilarityL2:
            # distance threshold for the L2 measure
            this_cluster_label = HAC.fcluster(Z=cluster_result, t=1 * 1000, criterion="distance")
        elif metric_func == trajectoryDissimilarityCenterMass:
            # distance threshold for the center-of-mass measure
            this_cluster_label = HAC.fcluster(Z=cluster_result, t=1.5, criterion="distance")
    elif criterion == "inconsistent":
        this_cluster_label = HAC.fcluster(Z=cluster_result, t=0.8, criterion="inconsistent")
    print("this_cluster_label:", this_cluster_label, "number of clusters:", len(set(this_cluster_label)))

    """Plot the representative trajectories"""
    plotRepresentativeTrajectory(
        this_cluster_label,
        trajectories,
        fname="cluster_centroids_{n}_classes".format(n=len(set(this_cluster_label))),
        path=plot_path,
        show=False,
    )
    return this_cluster_label, [this_cluster_label], []
Example 8: clusterize_hierarchical
def clusterize_hierarchical(peakels, matrix_dist, cut, clip=False):
    """
    :param peakels:
    :param matrix_dist:
    :param cut:
    :param clip:
    """
    # Negative values in the distance matrix lead to a ValueError,
    # so clip to [0, 1] in order to prevent them.
    if clip:
        np.clip(matrix_dist, 0, 1, matrix_dist)
    k = linkage(matrix_dist, method='complete')
    # dist = maxdists(k)
    # fit = norm.fit(dist)
    # cut = np.percentile(dist, 10.0)  # norm.ppf(5.0, loc=fit[0], scale=fit[1])
    k2 = fcluster(k, cut, criterion='distance')
    clust_by_id = ddict(list)
    for i, v in enumerate(k2):
        clust_by_id[v].append(peakels[i])
    return list(clust_by_id.values())
Example 9: main
def main():  # clustering and write output
    if len(pep_array) > 1:
        matrix = []
        for i in range(len(pep_array)):
            matrix.append(pep_array[i][4].replace('\"', "").split(','))
        dataMatrix = numpy.array(matrix, dtype=float)
        d = sch.distance.pdist(dataMatrix, metric)  # vector of pairwise distances
        if metric == "correlation":
            # with correlation, all values in the distance matrix should be in the range [0, 2]
            D = numpy.clip(d, 0, 2)
        else:
            D = d
        try:
            cutoff = float(t)
        except ValueError:
            print("please provide a numeric value for --t")
            sys.exit()
        L = sch.linkage(D, method, metric)
        ind = sch.fcluster(L, cutoff, 'distance')  # distance is dissimilarity (1 - correlation)
        p = numpy.array(pep_array)
        p = numpy.column_stack([p, ind])
        formatoutput(p)
    else:
        p = numpy.array(pep_array)
        p = numpy.column_stack([p, [0]])
        formatoutput(p)
Example 10: user_fp_group
def user_fp_group(data, key, user, filter='mid', merge=False, thr=0.2):
    # data = np.fromiter(data, dtype=dt)
    if len(data.shape) == 0 or data.shape[0] == 1:
        print('\t'.join([key, user, '%s' % data['wf_list'], str(data['x']), str(data['y']), '1']))
        return
    dists = get_pdist(data, 100)
    clusters = hcluster.linkage(dists, method='average')
    r = hcluster.fcluster(clusters, thr, 'distance')
    ids = np.unique(r)
    sz = []
    for id in ids:
        sz.append(data[r == id].shape[0])
    mid_size = max(1.1, max(sz) / 2.0)
    for id in ids:
        d = data[r == id]
        if filter == 'mid' and d.shape[0] < mid_size:
            continue
        if merge:
            print('\t'.join([key, user, wf_to_str(get_mean_wf(d)), str(np.median(d['x'])),
                             str(np.median(d['y'])), str(get_largest_dur(d)), str(d.shape[0])]))
            continue
        for od in d:
            print('\t'.join([key, user, od['wf_list'], str(od['x']), str(od['y']), str(od['t']), str(id)]))
Example 11: process
def process(tag, infos, wf_lists, count):
    if wf_lists is None or infos is None:
        return
    x = infos['x']
    y = infos['y']
    imeis = infos['imei']
    # wf_lists = np.fromiter(wf_lists, dtype=np.array)
    std_x = np.std(x)
    std_y = np.std(y)
    users_num = len(np.unique(imeis))
    if users_num < 3:
        return
    if len(wf_lists.shape) < 2 or wf_lists.shape[1] < 2:
        return
    dists = sci_dist.pdist(wf_lists, 'cosine')
    dists[(dists < 1e-10)] = 0
    clusters = hierarchy.linkage(dists, method='average')
    r = hierarchy.fcluster(clusters, 0.3, 'distance')
    for c in np.unique(r):
        idx = (r == c)
        c_x = np.median(x[idx])
        c_y = np.median(y[idx])
        c_std_x = np.std(x[idx])
        c_std_y = np.std(y[idx])
        c_user = len(np.unique(imeis[idx]))
        wfs = wf_lists[idx]
        wf = np.sum(wfs, axis=0) / len(wfs)
        wf = ['%d' % sig for sig in wf]
        print('%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d' %
              (tag, '\t'.join(wf), c_x, c_y, c_user, std_x, std_y, c_std_x, c_std_y, count))
Example 12: run_entity_model
def run_entity_model(cdev, cprc):
    print('____________________________________________________')
    print('running entity model')
    hdev, hprc, hmapping, entcorp, er = process_entities(cdev, cprc)
    print('removed', len(cdev) - len(hdev), 'documents,', len(hdev), 'left')
    voc = build_voc(entcorp, 2)
    ent_vectorizer = CountVectorizer(vocabulary=voc)
    E = ent_vectorizer.fit_transform(hdev)
    Eclean, emapping = filter_rare(E, 0)
    E_dense = np.matrix(Eclean).astype('float')
    E_scaled = preprocessing.scale(E_dense)
    E_normalized = preprocessing.normalize(E_scaled, norm='l2')
    EMatrix = pairwise_distances(E_normalized, metric='cosine')
    EL = fastcluster.linkage(EMatrix, method='average')
    flat_eclust = hierarchy.fcluster(EL, 0.5, 'distance')
    ec = organize_clusters(flat_eclust, th=3)
    ecf = []
    for cl in ec:
        ecf.append([hmapping[emapping[t]] for t in cl])
    print('detected', len(ecf), 'entity clusters')
    return ecf, voc
Example 13: main
def main():
    # distMatrix = loadDistanceMatrix()
    # linkage = saveLinkage(distMatrix)
    # loadFCluster()
    # R = dendrogram(linkage, truncate_mode='level', p=4, show_contracted=True)
    # afile = open(r'/home/rojosewe/Dropbox/MAI90/tesis/structs/R5000.pkl', 'wb')
    # pickle.dump(R, afile)
    # afile.close()
    linkage = loadLinkage()
    print(len(linkage))
    k = 1.5
    # 18 -> 54
    # 19 -> 46
    R = dendrogram(linkage, color_threshold=6.8, show_contracted=True)
    pylab.savefig("/home/rojosewe/Dropbox/MAI90/tesis/images/wordClustering/dgram446.8.png")
    T = sch.fcluster(linkage, k, 'distance')
    n = len(T)
    # calculate labels
    labels = np.zeros((n, 1))
    print(str(k) + ": " + str(max(T)))
    for i in range(n):
        labels[i, 0] = int(T[i])
    with open(datafolder + 'labels.csv', 'w', newline='') as csvfile:
        csvw = csv.writer(csvfile)
        for i in range(n):
            csvw.writerow(labels[i, :])
    print('done writing')
Example 14: cluster_words
def cluster_words(k):
    ts = os.listdir('types')
    ts.sort(key=alphanum_key)
    ts = np.array(ts)
    T = fcluster(Z, k, criterion='maxclust')

    def words(i):
        cluster = ts[T == i]
        print(len(cluster))
        allwords = []
        for t in cluster:
            fname = 'types/{}'.format(t)
            with open(fname) as file:
                data = json.loads(file.read())
            desc = data['description']
            words = re.findall(r'\w+', desc.lower())
            allwords.extend(words)
        allwords = [word for word in allwords if word not in stop_words]
        counts = Counter(allwords)
        return counts

    return [words(i + 1) for i in range(k)]
Example 15: get_ROIs
def get_ROIs(df_sequence, x, limit_meters):
    # find the origin transaction points
    X, locations, pi_locations = get_latlong_points(df_sequence)
    if len(locations) == 1:
        return [[{"lat": X[0, 0], "long": X[0, 1]}], 1.0]
    elif len(locations) < 1:
        return None
    # build the dendrogram, using geodesic (Vincenty) distance in meters
    Z = linkage(X, 'weighted', lambda p, q: vincenty(p, q).meters)
    clusters = fcluster(Z, limit_meters, criterion='distance')
    centroids = []
    nums_by_clusters = []
    pi_sums = []
    the_clusters = []
    # join pi_sums of locations that fall in the same cluster
    for i in range(len(clusters)):
        indice = buscar_locacion(the_clusters, clusters[i])
        if indice < 0:
            the_clusters.append(clusters[i])
            indice = len(the_clusters) - 1
            pi_sums.append(0)
            nums_by_clusters.append(0)
            centroids.append({"lat": 0, "long": 0})
        pi_sums[indice] += pi_locations[i]
        centroids[indice]["lat"] += X[i, 0]
        centroids[indice]["long"] += X[i, 1]
        nums_by_clusters[indice] += 1
    the_indexs, the_sum = get_upToX_pi_locations(np.asarray(pi_sums), x)
    the_centroids = []
    for i in the_indexs:
        the_centroids.append({"lat": centroids[i]["lat"] / nums_by_clusters[i],
                              "long": centroids[i]["long"] / nums_by_clusters[i]})
    return [the_centroids, the_sum]