This article collects typical usage examples of the Python method scipy.cluster.hierarchy.fcluster. If you have been wondering what exactly hierarchy.fcluster does and how to use it, the curated code examples below may help. You can also explore further usage of the module it belongs to, scipy.cluster.hierarchy.
The following shows 15 code examples of the hierarchy.fcluster method, sorted by popularity by default.
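Before the examples, here is a minimal, self-contained sketch of the pattern almost all of them follow: linkage builds the merge tree from the observations, and fcluster cuts it into flat, 1-based cluster labels. The data is made up purely for illustration:

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

# Six 2-D points forming two well-separated groups (illustrative data).
X = np.array([[0.0, 0.0], [0.1, 0.2], [0.2, 0.1],
              [5.0, 5.0], [5.1, 5.2], [5.2, 5.1]])

# Build the merge tree, then cut it at cophenetic distance 1.0.
Z = linkage(X, method='complete', metric='euclidean')
labels = fcluster(Z, t=1.0, criterion='distance')  # 1-based cluster ids
print(labels)  # e.g. [1 1 1 2 2 2]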
Example 1: create_cluster_map
# Required import: from scipy.cluster import hierarchy [as alias]
# Or: from scipy.cluster.hierarchy import fcluster [as alias]
def create_cluster_map(self, bow, srcids):
    cluster_map = {}
    z = linkage(bow, metric='cityblock', method='complete')
    # Sort the unique merge distances; a bare set() has no defined order.
    dists = sorted(set(z[:, 2]))
    thresh = (dists[1] + dists[2]) / 2
    self.logger.info('Threshold: {0}'.format(thresh))
    b = hier.fcluster(z, thresh, criterion='distance')
    assert bow.shape[0] == len(b)
    assert len(b) == len(srcids)
    for cid, srcid in zip(b, srcids):
        cluster_map[cid] = cluster_map.get(cid, []) + [srcid]
    # len(b) is the number of samples; the number of clusters is len(cluster_map).
    self.logger.info('# of clusters: {0}'.format(len(cluster_map)))
    self.logger.info('sizes of clusters: {0}'.format(sorted(map(len, cluster_map.values()))))
    return cluster_map
Example 2: hier_clustering
# Required import: from scipy.cluster import hierarchy [as alias]
# Or: from scipy.cluster.hierarchy import fcluster [as alias]
def hier_clustering(d, threshold=3):
    assert isinstance(d, dict)
    assert isinstance(list(d.values())[0], list)
    assert isinstance(list(d.values())[0][0], str)
    srcids = list(d.keys())
    tokenizer = lambda x: x.split()
    vectorizer = TfidfVectorizer(tokenizer=tokenizer)
    doc = [' '.join(d[srcid]) for srcid in srcids]
    vect = vectorizer.fit_transform(doc)
    # TODO: Make vect aligned to the required format
    z = linkage(vect.toarray(), metric='cityblock', method='complete')
    dists = sorted(set(z[:, 2]))
    # threshold = 3
    # threshold = (dists[2] + dists[3]) / 2
    b = hier.fcluster(z, threshold, criterion='distance')
    cluster_dict = defaultdict(list)
    for srcid, cluster_id in zip(srcids, b):
        cluster_dict[str(cluster_id)].append(srcid)
    value_lengther = lambda x: len(x[1])  # sort clusters by descending size
    return OrderedDict(
        sorted(cluster_dict.items(), key=value_lengther, reverse=True))
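A hypothetical call for the snippet above, assuming the imports it relies on (TfidfVectorizer from sklearn, defaultdict and OrderedDict from collections, linkage and hier from scipy) are in scope:

d = {'a': ['temp', 'sensor'], 'b': ['temp', 'sensor'], 'c': ['fan', 'speed']}
clusters = hier_clustering(d, threshold=1.0)
# e.g. OrderedDict([('1', ['a', 'b']), ('2', ['c'])]) -- largest cluster first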
Example 3: dendrogram
# Required import: from scipy.cluster import hierarchy [as alias]
# Or: from scipy.cluster.hierarchy import fcluster [as alias]
def dendrogram(data, threshold, layer_directory):
    colnames = data.columns
    data = np.array(data)
    Z = hierarchy.linkage(data.T, 'single', 'cosine')
    plt.figure(figsize=(15, 9))
    dn = hierarchy.dendrogram(Z, labels=colnames, color_threshold=threshold)
    plt.title("Clustering of Samples Based on Mutational Signatures")
    plt.ylabel("Cosine Distance")
    plt.xlabel("Sample IDs")
    # plt.ylim((0, 1))
    # Note: figsize is not a savefig() argument; the figure size was set above.
    plt.savefig(layer_directory + '/dendrogram.pdf', dpi=300)
    # Which datapoint goes to which cluster?
    # The indices of the datapoints are used as the ids.
    Y = hierarchy.fcluster(Z, threshold, criterion='distance', R=None, monocrit=None)
    dataframe = pd.DataFrame({"Cluster": Y, "Sample Names": list(colnames)})
    dataframe = dataframe.set_index("Sample Names")
    # print(dataframe)
    dictionary = {"clusters": Y, "informations": dn}
    return dataframe
######################################## Plot the reconstruction error vs. stabilities and select the optimum number of signatures ####################################################
Example 4: getClusters
# Required import: from scipy.cluster import hierarchy [as alias]
# Or: from scipy.cluster.hierarchy import fcluster [as alias]
def getClusters(self, embed):
    n, m = len(embed), self.p.embed_dims
    X = np.empty((n, m), np.float32)
    for i in range(len(embed)):
        X[i, :] = embed[i]
    dist = pdist(X, metric=self.p.metric)
    clust_res = linkage(dist, method=self.p.linkage)
    labels = fcluster(clust_res, t=self.p.thresh_val, criterion='distance') - 1
    clusters = [[] for i in range(max(labels) + 1)]
    for i in range(len(labels)):
        clusters[labels[i]].append(i)
    return clusters
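A sketch of how this method might be driven, using a stand-in for the unseen self.p parameter object; embed_dims, metric, linkage and thresh_val are the only attributes the method reads, and the values here are assumptions:

from types import SimpleNamespace
import numpy as np

params = SimpleNamespace(embed_dims=2, metric='euclidean',
                         linkage='average', thresh_val=1.0)
host = SimpleNamespace(p=params)  # hypothetical owner object
embed = [np.array([0.0, 0.0]), np.array([0.1, 0.1]), np.array([5.0, 5.0])]
print(getClusters(host, embed))  # e.g. [[0, 1], [2]]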
Example 5: cluster
# Required import: from scipy.cluster import hierarchy [as alias]
# Or: from scipy.cluster.hierarchy import fcluster [as alias]
def cluster(currents, context, original_labels, chrom, pos1, plot, plotdir, cluster=False):
    colours = {'m6A': '#B4656F', 'A': '#55B196'}  # TODO: update for other labels
    if len(currents) > 1 and cluster:
        pdistance = ssd.pdist(currents, metric='correlation')
        dm = ssd.squareform(pdistance)
        # Note: linkage() treats a square matrix as raw observations; the
        # condensed form `pdistance` is the canonical input for precomputed distances.
        link = linkage(dm, method='complete', metric='correlation')
        klabels = fcluster(link, 2, 'maxclust')  # alternative: fcluster(link, 1, 'inconsistent')
        # klabels = [1 if x == 1 else 0 for x in klabels]
        # labels = ['m6A'] * len(klabels)
        strategy = 'correlation'
    else:
        klabels = [1 if x == 1 else 0 for x in original_labels]
        strategy = 'classifierProb'
    if plot:
        plot_w_labels(klabels, original_labels, currents, strategy, context,
                      'chrom.' + chrom + '.pos.' + pos1, plotdir, colours)
    # for cluster in clusters:
Example 6: fit_predict
# Required import: from scipy.cluster import hierarchy [as alias]
# Or: from scipy.cluster.hierarchy import fcluster [as alias]
def fit_predict(self, X, y=None):
    self.model = linkage(X, method=self.method, metric=self.metric)
    return fcluster(self.model, t=self.n_clusters, criterion='maxclust') - 1
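A hypothetical usage; the surrounding class is not shown, so this stand-in assumes it only carries the method, metric and n_clusters attributes that fit_predict reads:

from types import SimpleNamespace
import numpy as np

model = SimpleNamespace(method='ward', metric='euclidean', n_clusters=2)
X = np.vstack([np.random.RandomState(0).rand(5, 3),
               np.random.RandomState(1).rand(5, 3) + 5])
print(fit_predict(model, X))  # e.g. [0 0 0 0 0 1 1 1 1 1] -- 0-based labels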
Example 7: mean_z
# Required import: from scipy.cluster import hierarchy [as alias]
# Or: from scipy.cluster.hierarchy import fcluster [as alias]
def mean_z(z_all, dim_limit):
    # Use correlation clustering to average group assignments.
    lz = hi.linkage(z_all.T, 'single', 'hamming')
    # fcluster ids are 1-based by convention, hence the shift to 0-based.
    z = hi.fcluster(lz, 0) - 1
    all_cat = np.unique(z)
    for a in all_cat:
        a_size = np.sum(a == z)
        if a_size > dim_limit:
            z[a == z] = sample_multinomial([1.] * a_size, a_size, dim_limit)
    return z
Example 8: calculate_shell_bvalues_and_indices
# Required import: from scipy.cluster import hierarchy [as alias]
# Or: from scipy.cluster.hierarchy import fcluster [as alias]
def calculate_shell_bvalues_and_indices(bvalues, max_distance=20e6):
    """
    Calculates which measurements belong to different acquisition shells.
    It uses scipy's linkage clustering algorithm, which uses the max_distance
    input as a limit for including measurements in the same cluster.
    For example, if bvalues were [1, 2, 3, 4, 5] and max_distance was 1, then
    all bvalues would belong to the same cluster.
    However, if bvalues were [1, 2, 4, 5] and max_distance was 1, then this
    would result in 2 clusters.

    Parameters
    ----------
    bvalues: 1D numpy array of shape (Ndata)
        bvalues of the acquisition in s/m^2.
    max_distance: float
        maximum b-value distance for a measurement to be included in the same
        shell.

    Returns
    -------
    shell_indices: 1D numpy array of shape (Ndata)
        array of integers, starting from 0, representing to which shell a
        measurement belongs. The number itself has no meaning other than just
        being different for different shells.
    shell_bvalues: 1D numpy array of shape (Nshells)
        array of the mean bvalues for every acquisition shell.
    """
    linkage_matrix = linkage(np.c_[bvalues])
    clusters = fcluster(linkage_matrix, max_distance, criterion='distance')
    shell_indices = np.empty_like(bvalues, dtype=int)
    cluster_bvalues = np.zeros((np.max(clusters), 2))
    for ind in np.unique(clusters):
        cluster_bvalues[ind - 1] = np.mean(bvalues[clusters == ind]), ind
    shell_bvalues, ordered_cluster_indices = (
        cluster_bvalues[cluster_bvalues[:, 0].argsort()].T)
    for i, ind in enumerate(ordered_cluster_indices):
        shell_indices[clusters == ind] = i
    return shell_indices, shell_bvalues
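A usage sketch with made-up b-values around the 0, 1000 and 3000 s/mm^2 shells (scaled to s/m^2 as the docstring expects), staying within the default 20e6 tolerance:

import numpy as np

bvalues = np.array([0., 5e6, 995e6, 1000e6, 1005e6, 2995e6, 3005e6])
shell_indices, shell_bvalues = calculate_shell_bvalues_and_indices(bvalues)
print(shell_indices)   # e.g. [0 0 1 1 1 2 2]
print(shell_bvalues)   # mean b-value of each shell, in ascending order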
Example 9: remove_correlated_feats
# Required import: from scipy.cluster import hierarchy [as alias]
# Or: from scipy.cluster.hierarchy import fcluster [as alias]
def remove_correlated_feats(df):
    tmp = df.T
    # Remove columns with no variation
    nunique = tmp.apply(pd.Series.nunique)
    cols_to_drop = nunique[nunique == 1].index
    tmp.drop(cols_to_drop, axis=1, inplace=True)
    perc_spearman = scipy.stats.spearmanr(tmp)
    abs_corr = np.subtract(np.ones(shape=perc_spearman.correlation.shape),
                           np.absolute(perc_spearman.correlation))
    np.fill_diagonal(abs_corr, 0)
    # Some floating point mismatches, just make symmetric
    abs_corr_clean = np.maximum(abs_corr, abs_corr.transpose())
    clustering = linkage(squareform(abs_corr_clean), method='average')
    clusters = fcluster(clustering, .1, criterion='distance')
    names = tmp.columns.tolist()
    names_to_cluster = list(zip(names, clusters))
    indices_to_keep = []
    ### Extract models closest to cluster centroids
    for x in range(1, len(set(clusters)) + 1):
        # Create a mask from the list of assignments for extracting the submatrix of the cluster
        mask = np.array([1 if i == x else 0 for i in clusters], dtype=bool)
        # Take the index of the column with the smallest sum of distances from the submatrix
        idx = np.argmin(sum(abs_corr_clean[:, mask][mask, :]))
        # Extract names of cluster elements from names_to_cluster
        sublist = [name for (name, cluster) in names_to_cluster if cluster == x]
        # Element closest to the centroid
        centroid = sublist[idx]
        indices_to_keep.append(centroid)
    return df.loc[df.index.isin(indices_to_keep)]
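A hypothetical input, assuming df holds features as rows and samples as columns (the function transposes internally and keeps one representative row per correlated cluster):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
base = rng.rand(20)
df = pd.DataFrame([base, base + 0.01, rng.rand(20)],
                  index=['f1', 'f2', 'f3'])  # f1 and f2 are rank-identical
print(remove_correlated_feats(df).index.tolist())  # e.g. ['f1', 'f3']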
Example 10: cluster_from_dist_matrix
# Required import: from scipy.cluster import hierarchy [as alias]
# Or: from scipy.cluster.hierarchy import fcluster [as alias]
def cluster_from_dist_matrix(dist_matrix, threshold):
    """Use scipy to cluster a distance matrix.

    Args:
        dist_matrix: distance matrix, represented in scipy's 1d condensed form
        threshold: maximum inter-cluster distance to merge clusters (higher
            results in fewer clusters)

    Returns:
        list c such that c[i] is a collection of all the observations
        (whose pairwise distances are indexed in dist) in the i'th
        cluster, in sorted order by descending cluster size
    """
    linkage = hierarchy.linkage(dist_matrix, method='average')
    clusters = hierarchy.fcluster(linkage, threshold, criterion='distance')
    # Clusters are numbered starting at 1, but base the count on
    # first_clust_num just in case this changes
    first_clust_num = min(clusters)
    num_clusters = max(clusters) + 1 - first_clust_num
    elements_in_cluster = defaultdict(list)
    for i, clust_num in enumerate(clusters):
        elements_in_cluster[clust_num].append(i)
    cluster_sizes = {c: len(elements_in_cluster[c])
                     for c in range(first_clust_num,
                                    num_clusters + first_clust_num)}
    elements_in_cluster_sorted = []
    for clust_num, _ in sorted(cluster_sizes.items(),
                               key=operator.itemgetter(1), reverse=True):
        elements_in_cluster_sorted += [elements_in_cluster[clust_num]]
    return elements_in_cluster_sorted
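A usage sketch; pdist produces the 1d condensed form the docstring asks for (the points are made up):

import numpy as np
from scipy.spatial.distance import pdist

points = np.array([[0.0], [0.1], [5.0], [5.1], [5.2]])
print(cluster_from_dist_matrix(pdist(points), threshold=1.0))
# e.g. [[2, 3, 4], [0, 1]] -- largest cluster first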
Example 11: cluster_correlations
# Required import: from scipy.cluster import hierarchy [as alias]
# Or: from scipy.cluster.hierarchy import fcluster [as alias]
def cluster_correlations(corr_mat, indices=None):
    """
    Apply agglomerative clustering in order to sort
    a correlation matrix.
    Based on https://github.com/TheLoneNut/CorrelationMatrixClustering/blob/master/CorrelationMatrixClustering.ipynb

    Parameters:
    -----------
    - corr_mat : a square correlation matrix (pandas DataFrame)
    - indices : cluster labels [None]; if not provided we'll do
        an agglomerative clustering to get cluster labels.

    Returns:
    --------
    - corr : a sorted correlation matrix
    - indices : cluster indexes based on the original dataset

    Example:
    --------
    >>> assoc = associations(
        customers,
        plot=False
    )
    >>> correlations = assoc['corr']
    >>> correlations, _ = cluster_correlations(correlations)
    """
    if indices is None:
        X = corr_mat.values
        d = sch.distance.pdist(X)
        L = sch.linkage(d, method='complete')
        indices = sch.fcluster(L, 0.5 * d.max(), 'distance')
    columns = [corr_mat.columns.tolist()[i]
               for i in list(np.argsort(indices))]
    corr_mat = corr_mat.reindex(columns=columns).reindex(index=columns)
    return corr_mat, indices
Example 12: get_word_clusters
# Required import: from scipy.cluster import hierarchy [as alias]
# Or: from scipy.cluster.hierarchy import fcluster [as alias]
def get_word_clusters(sentence_dict):
    srcids = list(sentence_dict.keys())
    sentences = []
    for srcid in srcids:
        sentence = []
        for metadata_type, sent in sentence_dict[srcid].items():
            sentence.append(''.join(sent))
        sentence = '\n'.join(sentence)
        sentence = ' '.join(re.findall('[a-z]+', sentence))
        sentences.append(sentence)
    vect = TfidfVectorizer()
    # vect = CountVectorizer()
    bow = vect.fit_transform(sentences).toarray()
    try:
        z = linkage(bow, metric='cityblock', method='complete')
    except:
        pdb.set_trace()
    # Sort the unique merge distances; a bare set() has no defined order.
    dists = sorted(set(z[:, 2]))
    thresh = (dists[2] + dists[3]) / 2
    # thresh = (dists[1] + dists[2]) / 2
    print("Threshold: ", thresh)
    b = hier.fcluster(z, thresh, criterion='distance')
    cluster_dict = defaultdict(list)
    for srcid, cluster_id in zip(srcids, b):
        cluster_dict[cluster_id].append(srcid)
    return dict(cluster_dict)
Example 13: _offline_clustering
# Required import: from scipy.cluster import hierarchy [as alias]
# Or: from scipy.cluster.hierarchy import fcluster [as alias]
def _offline_clustering(self, X):
    print('Starting offline clustering...')
    p_dist = pdist(X, metric=self._distance_metric)
    Z = linkage(p_dist, 'complete')
    cluster_index = fcluster(Z, self.max_dist, criterion='distance')
    self._extract_representatives(X, cluster_index)
    print('Processed {} instances.'.format(X.shape[0]))
    print('Found {} clusters offline.\n'.format(len(self.representatives)))
    # print('The representative vectors are:')
    # pprint.pprint(self.representatives.tolist())
Example 14: signal_clustering
# Required import: from scipy.cluster import hierarchy [as alias]
# Or: from scipy.cluster.hierarchy import fcluster [as alias]
def signal_clustering(corr_matrix: DataFrame,
                      threshold: float,
                      cluster_pickle: str = "",
                      linkage_pickle: str = "",
                      force: bool = False):
    if force:
        if path.isfile(cluster_pickle):
            remove(cluster_pickle)
        if path.isfile(linkage_pickle):
            remove(linkage_pickle)
    if path.isfile(cluster_pickle) and path.isfile(linkage_pickle):
        print("\nSignal clustering already completed and forcing is turned off. Using pickled data...")
        return [load(open(cluster_pickle, "rb")), load(open(linkage_pickle, "rb"))]

    # Remove negative values from the correlation matrix and invert the values
    corr_matrix.where(corr_matrix > 0, 0, inplace=True)
    corr_matrix = 1 - corr_matrix
    X = corr_matrix.values  # type: ndarray
    Y = clip(ssd.squareform(X), 0, None)

    # Z is the linkage matrix. This can serve as input to the scipy.cluster.hierarchy.dendrogram method
    Z = linkage(Y, method='single', optimal_ordering=True)
    fclus = fcluster(Z, t=threshold, criterion='distance')
    cluster_dict = {}
    for i, cluster_label in enumerate(fclus):
        if cluster_label in cluster_dict:
            cluster_dict[cluster_label].append(corr_matrix.index[i])
        else:
            cluster_dict[cluster_label] = [corr_matrix.index[i]]
    return cluster_dict, Z
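A sketch with a tiny made-up correlation matrix containing two correlated pairs; with the default empty pickle paths, the caching branch is skipped entirely:

import pandas as pd

corr = pd.DataFrame([[1.0, 0.9, 0.1, 0.0],
                     [0.9, 1.0, 0.0, 0.1],
                     [0.1, 0.0, 1.0, 0.8],
                     [0.0, 0.1, 0.8, 1.0]],
                    index=list('abcd'), columns=list('abcd'))
clusters, Z = signal_clustering(corr, threshold=0.5)
print(clusters)  # e.g. {1: ['a', 'b'], 2: ['c', 'd']}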
Example 15: clusters_from_partitions
# Required import: from scipy.cluster import hierarchy [as alias]
# Or: from scipy.cluster.hierarchy import fcluster [as alias]
def clusters_from_partitions(partitions, options):
    """Finds clusters in partitions using span-position distance and hierarchical clustering.
    Assumes that all signatures in the given partition are of the same type and on the same contig."""
    clusters_final = []
    large_partitions = 0
    # Find clusters in each partition individually.
    for partition in partitions:
        if len(partition) == 1:
            clusters_final.append([partition[0]])
            continue
        elif len(partition) > 100:
            partition_sample = sample(partition, 100)
            large_partitions += 1
        else:
            partition_sample = partition
        element_type = partition_sample[0].type
        if element_type == "DEL" or element_type == "INV" or element_type == "DUP_TAN":
            data = np.array([[signature.get_source()[1], signature.get_source()[2], options.distance_normalizer]
                             for signature in partition_sample])
            Z = linkage(data, method="average", metric=span_position_distance)
        elif element_type == "INS":
            data = np.array([[signature.get_source()[1], signature.get_source()[2], options.distance_normalizer]
                             for signature in partition_sample])
            Z = linkage(data, method="average", metric=span_position_distance_insertions)
        elif element_type == "DUP_INT":
            data = np.array([[signature.get_source()[1], signature.get_source()[2], signature.get_destination()[1], options.distance_normalizer]
                             for signature in partition_sample])
            Z = linkage(data, method="average", metric=span_position_distance_intdups)
        cluster_indices = list(fcluster(Z, options.cluster_max_distance, criterion='distance'))
        new_clusters = [[] for i in range(max(cluster_indices))]
        for signature_index, cluster_index in enumerate(cluster_indices):
            new_clusters[cluster_index - 1].append(partition_sample[signature_index])
        clusters_final.extend(new_clusters)
    if len(partitions) > 0 and len(partitions[0]) > 0:
        logging.debug("%d out of %d partitions for %s exceeded 100 elements." % (large_partitions, len(partitions), partitions[0][0].type))
    return clusters_final