This article collects typical usage examples of the silhouette_samples function from Python's sklearn.metrics module. If you have been wondering what silhouette_samples does, how to call it, or what it looks like in real code, the curated examples below should help.
Fifteen code examples of silhouette_samples are shown, sorted by popularity by default.
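Before the collected examples, here is a minimal, self-contained sketch (our own illustrative code on synthetic data, not one of the 15 examples below) of what silhouette_samples returns: one score per sample in [-1, 1], whose mean is exactly silhouette_score.

# Minimal sketch of silhouette_samples on synthetic data; variable names
# here are illustrative, not taken from any example below.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score

X, _ = make_blobs(n_samples=300, centers=4, random_state=0)
labels = KMeans(n_clusters=4, n_init=10, random_state=0).fit_predict(X)

sil = silhouette_samples(X, labels)   # one score in [-1, 1] per sample
print(sil.shape)                      # (300,)
print(silhouette_score(X, labels))    # equals sil.mean()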
Example 1: crank_feats
def crank_feats(fargs):
    rss, ccs, lv, installed_in, dfile, nfeatures = fargs
    noaa_init(installed_in)
    wat = pd.read_csv(dfile).set_index('station')
    es = ['e' + str(x) for x in range(0, nfeatures)]
    # ccs = [3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40]
    # rss = [0, 1]
    prefix = 'eigen' + str(nfeatures)
    # let's do some clustering with the six eigenvectors and see how they hold together
    flatnew, nmeans, nstds = flatten(wat[es])  # strictly speaking not necessary since
    flatold, omeans, ostds = flatten(wat[lv])
    # note: this method flattens wat internally
    produce_kmeans_climates(wat, es, ccs, rss, prefix)
    for rs in rss:
        kf = pd.read_csv(noaafile('climates/' + prefix + '_rs_' + str(rs) + '.csv'))
        for cc in ccs:
            # this silhouettes thing gobbles memory, I'm guessing because each worker
            # creates an entire new metric matrix.
            kf['sil_eigen_' + str(cc)] = silhouette_samples(flatnew, kf['vtx' + str(cc)].values)
            # pull out silhouette scores on the old metric too, just for fun...
            kf['sil_old_' + str(cc)] = silhouette_samples(flatold, kf['vtx' + str(cc)].values)
        kf.to_csv(noaafile('climates/' + prefix + '_sil_rs_' + str(rs) + '.csv'), index=False)
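The memory complaint in the comment above is plausible: each silhouette_samples call on raw features internally builds a full pairwise-distance matrix. One hedged workaround, assuming a single n_samples x n_samples matrix fits in memory and reusing the example's own variable names, is to precompute the distances once per feature set and pass metric='precomputed' inside the loop:

# Sketch: build each distance matrix once, then reuse it for every cc,
# instead of letting silhouette_samples recompute it on every call.
from sklearn.metrics import pairwise_distances, silhouette_samples

D_new = pairwise_distances(flatnew)  # computed once (default: euclidean)
D_old = pairwise_distances(flatold)
for cc in ccs:
    kf['sil_eigen_' + str(cc)] = silhouette_samples(D_new, kf['vtx' + str(cc)].values, metric='precomputed')
    kf['sil_old_' + str(cc)] = silhouette_samples(D_old, kf['vtx' + str(cc)].values, metric='precomputed')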
Example 2: bestRep
def bestRep(dat, labels, outName):
    bestExample = []
    silSamp = metrics.silhouette_samples(dat, labels)
    for num in np.unique(labels):
        clusterMask = labels == num
        bestExample.append(outName[clusterMask][np.argmax(silSamp[clusterMask])])
    return bestExample
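A hypothetical call to bestRep, on synthetic data with made-up sample names, just to show the expected argument types:

# Hypothetical usage of bestRep: for each cluster, return the name of the
# member with the highest silhouette value.
import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

dat, _ = make_blobs(n_samples=60, centers=3, random_state=1)
labels = KMeans(n_clusters=3, n_init=10, random_state=1).fit_predict(dat)
outName = np.array(['sample_%d' % i for i in range(len(dat))])
print(bestRep(dat, labels, outName))  # one representative name per cluster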
Example 3: test_silhouette_samples
def test_silhouette_samples(self):
    result = self.df.metrics.silhouette_samples()
    expected = metrics.silhouette_samples(self.data, self.pred)
    self.assertTrue(isinstance(result, pdml.ModelSeries))
    self.assert_index_equal(result.index, self.df.index)
    self.assert_numpy_array_almost_equal(result.values, expected)
Example 4: get_silhouette
def get_silhouette(df):
    df = df[(df.AB != ".")].copy()
    df.loc[:, 'AB'] = pd.to_numeric(df.loc[:, 'AB'])
    df.loc[:, 'CN'] = pd.to_numeric(df.loc[:, 'CN'])
    tp = df.iloc[0, :].loc['svtype']
    [mn_CN, mn_AB] = df.loc[:, ['CN', 'AB']].mean(skipna=True)
    [sd_CN, sd_AB] = df.loc[:, ['CN', 'AB']].std(skipna=True)
    if df.loc[:, 'GT'].unique().size == 1:
        df.loc[:, 'sil_gt_avg'] = 1
        df.loc[:, 'sil_gt'] = 1
        df = df[['var_id', 'sample', 'svtype', 'AF', 'GT', 'CN', 'AB', 'sil_gt_avg', 'sil_gt']]
        return df
    # standardize the 2 dims
    if sd_AB > 0.01:
        df.loc[:, 'AB1'] = (df.loc[:, 'AB'] - mn_AB) / sd_AB
    else:
        df.loc[:, 'AB1'] = df.loc[:, 'AB']
    if tp in ['DEL', 'DUP', 'MEI'] or sd_CN > 0.01:
        df.loc[:, 'CN1'] = (df.loc[:, 'CN'] - mn_CN) / sd_CN
    else:
        df.loc[:, 'CN1'] = df.loc[:, 'CN']
    gt_code = {'0/0': 1, '0/1': 2, '1/1': 3}
    df.loc[:, 'gtn'] = df.loc[:, 'GT'].map(gt_code)
    dist_2d_sq = spatial.distance.squareform(spatial.distance.pdist(df[['AB1', 'CN1']], metric='cityblock'))
    df.loc[:, 'sil_gt_avg'] = metrics.silhouette_score(dist_2d_sq, df.loc[:, 'gtn'].values, metric='precomputed')
    df.loc[:, 'sil_gt'] = metrics.silhouette_samples(dist_2d_sq, df.loc[:, 'gtn'].values, metric='precomputed')
    df = df[['var_id', 'sample', 'svtype', 'AF', 'GT', 'CN', 'AB', 'sil_gt_avg', 'sil_gt']]
    return df
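As an aside, the same two scores could be computed without materialising the squareform matrix, since both functions accept a metric name directly. A sketch that should be equivalent up to floating point:

# Equivalent sketch: let sklearn compute the cityblock distances itself
# instead of passing a precomputed squareform matrix.
df.loc[:, 'sil_gt_avg'] = metrics.silhouette_score(df[['AB1', 'CN1']], df['gtn'].values, metric='cityblock')
df.loc[:, 'sil_gt'] = metrics.silhouette_samples(df[['AB1', 'CN1']], df['gtn'].values, metric='cityblock')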
Example 5: silhouette_original_clusterings
def silhouette_original_clusterings(dataset='CB1', neuropil='Antennal_lobe', clusterer_or_k=60):
    """Returns a pandas dataframe with the silhouette index of each cluster member.
    The dataframe has columns (cluster_id, member_id, silhouette).
    """
    # Read the expression matrix
    print('Reading expression matrix')
    Xdf = ExpressionDataset.dataset(dset=dataset, neuropil=neuropil).Xdf(index_type='string')
    # Generate a flat map cluster_id -> members
    print('Finding cluster assignments')
    clusters_df, _ = get_original_clustering(dataset=dataset, neuropil=neuropil,
                                             clusterer_or_k=clusterer_or_k)
    dfs = []
    for cluster_id, members in zip(clusters_df.cluster_id,
                                   clusters_df.original_voxels_in_cluster):
        dfs.append(pd.DataFrame({'cluster_id': cluster_id, 'member_id': members}))
    members_df = pd.concat(dfs).set_index('member_id').loc[Xdf.index]
    # Compute the distance matrix - this must be parameterised
    print('Computing distance')
    import mkl
    mkl.set_num_threads(6)
    D = dicedist_metric(Xdf)
    # Compute silhouette
    # Here we could go for the faster implementation in third_party, if needed
    print('Computing silhouette index')
    members_df['silhouette'] = silhouette_samples(D.values,
                                                  members_df.cluster_id.values,
                                                  metric='precomputed')
    return (members_df.
            reset_index().
            rename(columns=lambda col: {'index': 'member_id'}.get(col, col))
            [['cluster_id', 'member_id', 'silhouette']])
Example 6: cluster_driver
def cluster_driver(a_driver):
    # print(a_driver['DStats'])
    # print("#############################DStats Above##############################")
    X = StandardScaler().fit_transform(a_driver['DStats'])
    # print(X)
    # print("DStats are.....::", a_driver['DStats'])
    # print("X is...........::", X)
    # print("############################Scaled X Above#############################")
    db = DBSCAN(eps=0.6, min_samples=5).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print("###############################################################################")
    # print('Estimated number of clusters: %d' % n_clusters_)
    # print('Count of Predicts::', len(X))
    # print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))
    return (metrics.silhouette_samples(X, labels) + 1) / 2
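One caveat with this pattern: DBSCAN labels noise points -1, and silhouette_samples simply treats -1 as one more cluster, which can distort the scores. If that is unwanted, a hedged variant scores only the clustered points:

# Variant sketch: exclude DBSCAN noise (label -1) before scoring, then
# rescale from [-1, 1] to [0, 1] as the example above does.
mask = labels != -1
scores = (metrics.silhouette_samples(X[mask], labels[mask]) + 1) / 2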
Example 7: silhouette_analysis
def silhouette_analysis(clustering, labels=None):
    distance_df = clustering['distance_df']
    if labels is None:
        labels = clustering['labels']
    sample_scores = silhouette_samples(distance_df, labels, metric='precomputed')
    score = np.mean(sample_scores)
    return sample_scores, score
Example 8: cluster
def cluster(algorithm, data, topics, make_silhouette=False):
    print(str(algorithm))
    clusters = algorithm.fit_predict(data)
    labels = algorithm.labels_
    print('Homogeneity: %0.3f' % metrics.homogeneity_score(topics, labels))
    print('Completeness: %0.3f' % metrics.completeness_score(topics, labels))
    print('V-measure: %0.3f' % metrics.v_measure_score(topics, labels))
    print('Adjusted Rand index: %0.3f' % metrics.adjusted_rand_score(topics, labels))
    print('Silhouette test: %0.3f' % metrics.silhouette_score(data, labels))
    print(' ***************** ')
    silhouettes = metrics.silhouette_samples(data, labels)
    num_clusters = len(set(clusters))
    print('num clusters: %d' % num_clusters)
    print('num fitted: %d' % len(clusters))
    # Make a silhouette plot if the flag is set
    if make_silhouette:
        order = numpy.lexsort((-silhouettes, clusters))
        indices = [numpy.flatnonzero(clusters[order] == k) for k in range(num_clusters)]
        ytick = [(numpy.max(ind) + numpy.min(ind)) / 2 for ind in indices]
        ytickLabels = ["%d" % x for x in range(num_clusters)]
        cmap = cm.jet(numpy.linspace(0, 1, num_clusters)).tolist()
        clr = [cmap[i] for i in clusters[order]]
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.barh(range(data.shape[0]), silhouettes[order], height=1.0,
                edgecolor='none', color=clr)
        ax.set_ylim(ax.get_ylim()[::-1])
        plt.yticks(ytick, ytickLabels)
        plt.xlabel('Silhouette Value')
        plt.ylabel('Cluster')
        plt.savefig('cluster.png')
Example 9: visualize_silhouette_score
def visualize_silhouette_score(X, y_km):
    cluster_labels = np.unique(y_km)
    n_clusters = cluster_labels.shape[0]
    silhouette_vals = metrics.silhouette_samples(X, y_km, metric='euclidean')
    y_ax_lower, y_ax_upper = 0, 0
    yticks = []
    for i, c in enumerate(cluster_labels):
        c_silhouette_vals = silhouette_vals[y_km == c]
        c_silhouette_vals.sort()
        y_ax_upper += len(c_silhouette_vals)
        color = cm.jet(i / n_clusters)
        plt.barh(range(y_ax_lower, y_ax_upper),
                 c_silhouette_vals,
                 height=1.0,
                 edgecolor='none',
                 color=color)
        yticks.append((y_ax_lower + y_ax_upper) / 2)
        y_ax_lower += len(c_silhouette_vals)
    silhouette_avg = np.mean(silhouette_vals)
    plt.axvline(silhouette_avg,
                color="red",
                linestyle="--")
    plt.yticks(yticks, cluster_labels + 1)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette coefficient')
    plt.show()
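A hypothetical driver for visualize_silhouette_score, with the imports the function body assumes (np, metrics, cm, plt) and synthetic data:

# Hypothetical driver: cluster synthetic blobs and draw the per-cluster
# silhouette profile with the function above.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
y_km = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
visualize_silhouette_score(X, y_km)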
Example 10: fit
def fit(self, X, y=None, **kwargs):
    """
    Fits the model and generates the silhouette visualization.
    """
    # TODO: decide to use this method or the score method to draw.
    # NOTE: Probably this would be better in score, but the standard score
    # is a little different and I'm not sure how it's used.
    # Fit the wrapped estimator
    self.estimator.fit(X, y, **kwargs)
    # Get the properties of the dataset
    self.n_samples_ = X.shape[0]
    self.n_clusters_ = self.estimator.n_clusters
    # Compute the scores of the cluster
    labels = self.estimator.predict(X)
    self.silhouette_score_ = silhouette_score(X, labels)
    self.silhouette_samples_ = silhouette_samples(X, labels)
    # Draw the silhouette figure
    self.draw(labels)
    # Return the estimator
    return self
Example 11: cluster_driver
def cluster_driver(a_driver):
    # print(a_driver['DStats'])
    # print("#############################DStats Above##############################")
    X = StandardScaler().fit_transform(a_driver['DStats'])
    # print(X)
    # print("DStats are.....::", a_driver['DStats'])
    # print("X is...........::", X)
    # print("############################Scaled X Above#############################")
    # db = KMeans(n_clusters=20, n_jobs=-1).fit(X)
    db = DBSCAN(eps=0.45).fit(X)
    # core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    # core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    # n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print("###############################################################################")
    # print('Estimated number of clusters: %d' % n_clusters_)
    # print('Count of Predicts::', len(X))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric="mahalanobis"))
    return (metrics.silhouette_samples(X, labels, metric="mahalanobis") + 1) / 2
Example 12: run_clutering
def run_clutering(n_sites, order_dict, sim_mat):
    n_clusters = 6
    name_file = 'clustering_sil' + str(n_clusters)
    output_file = open(name_file, 'w')
    name_file1 = 'clustering_labels' + str(n_clusters)
    output_file1 = open(name_file1, 'w')
    spectral = cluster.SpectralClustering(n_clusters=n_clusters,
                                          eigen_solver='arpack', affinity='precomputed')
    labels = spectral.fit_predict(sim_mat)
    silhouette_avg = metrics.silhouette_score(sim_mat, labels)
    output_file.write(" ".join(["aver silhouette_score:", str(silhouette_avg)]))
    # Compute the silhouette scores for each sample
    sample_silhouette_values = metrics.silhouette_samples(sim_mat, labels)
    for siteid in order_dict:
        stringa = ' '.join([siteid, str(sample_silhouette_values[order_dict[siteid]])])
        output_file.write(stringa + '\n')
    for siteid in order_dict:
        stringa = ' '.join([str(siteid), str(labels[order_dict[siteid]])])
        output_file1.write(stringa + '\n')
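Worth flagging: silhouette_score(sim_mat, labels) with the default metric treats each row of the similarity matrix as a plain feature vector. If the intent was to score against the precomputed (dis)similarities themselves, the usual form is metric='precomputed' on a distance matrix. A sketch, assuming similarities lie in [0, 1]:

# Sketch: convert similarities to distances and score them as precomputed.
dist_mat = 1.0 - sim_mat  # assumes similarities lie in [0, 1]
silhouette_avg = metrics.silhouette_score(dist_mat, labels, metric='precomputed')
sample_silhouette_values = metrics.silhouette_samples(dist_mat, labels, metric='precomputed')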
Example 13: calculateNumberOfIdealClusters
def calculateNumberOfIdealClusters(maxAmount, corpus):
    print("Initializing silhouette analysis")
    range_n_clusters = range(2, maxAmount)  # max amount of clusters equal to amount of jobs
    silhouette_high = 0
    silhouette_high_n_clusters = 2
    for n_clusters in range_n_clusters:
        # Initialize the clusterer with n_clusters value
        cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward", affinity="euclidean")
        cluster_labels = cluster.fit_predict(corpus)
        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed clusters
        silhouette_avg = silhouette_score(corpus, cluster_labels)
        print("For n_clusters = %d, the average silhouette_score is: %.5f" % (n_clusters, silhouette_avg))
        if silhouette_avg > silhouette_high:
            silhouette_high = silhouette_avg
            silhouette_high_n_clusters = n_clusters
        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(corpus, cluster_labels)
    print("Highest score = %f for n_clusters = %d" % (silhouette_high, silhouette_high_n_clusters))
    return silhouette_high_n_clusters
Example 14: find_clusters
def find_clusters(df, k_vals=[4, 9, 16, 25], how='hierarchical'):
    '''Find clusters, and if method is k-means run silhouette analysis
    to determine the value of k.
    Args:
        df (data frame): A data frame with normalised expression data.
        k_vals (list or range): The range over which to test k.
        how ('hierarchical' or 'kmeans'): Clustering method.
    Returns:
        A list of cluster numbers.
    '''
    ## Don't run the silhouette analysis for hierarchical clustering,
    ## just calculate the clusters using estimate of k.
    if how == 'hierarchical':
        k = int(np.sqrt((len(df) / 2.0)))
        hc = hac.linkage(df, method='average')
        optimal_clusters = hac.fcluster(hc, t=k, criterion='maxclust')
    ## If method is k-means, run silhouette analysis.
    elif how == 'kmeans':
        best_combined_score = 0
        optimal_k = 2
        ## Try values of k from range and keep track of optimal k according
        ## to silhouette score.
        for k in k_vals:
            km = KMeans(n_clusters=k, random_state=10)
            clusters = km.fit_predict(df)
            silhouette_avg = silhouette_score(df, clusters)
            sample_silhouette_values = silhouette_samples(df, clusters)
            above_mean = 0
            silhouette_sizes = []
            for i in range(k):
                ith_cluster_silhouette_values = sample_silhouette_values[clusters == i]
                size_cluster_i = ith_cluster_silhouette_values.shape[0]
                silhouette_sizes.append(size_cluster_i)
                if max(ith_cluster_silhouette_values) > silhouette_avg:
                    above_mean += 1
            ## This combined score should pick the best value of k
            above_mean_score = float(above_mean) / k
            std_score = 1.0 / np.std(silhouette_sizes) if np.std(silhouette_sizes) > 1.0 else 1.0
            combined_score = (silhouette_avg + above_mean_score + std_score) / 3
            ## Keep the clustering with the best combined score so far.
            if combined_score > best_combined_score:
                best_combined_score = combined_score
                optimal_k = k
                optimal_clusters = clusters
    optimal_clusters = [cluster + 1 for cluster in optimal_clusters]
    return optimal_clusters
Example 15: test_gmm
def test_gmm():
    sil = pyclust.validate.Silhouette()
    sil_score = sil.score(X, ypred, sample_size=None)
    print(sil_score[0])
    print(sil.sample_scores[:10])
    print(silhouette_score(X, ypred, sample_size=None))
    print(silhouette_samples(X, ypred)[:10])
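This test compares pyclust's Silhouette against sklearn's implementation and assumes module-level X and ypred; a hypothetical scaffold that would make it runnable (names chosen to match the snippet):

# Hypothetical module-level fixtures for test_gmm.
import pyclust
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score, silhouette_samples

X, _ = make_blobs(n_samples=200, centers=3, random_state=0)
ypred = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)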