

Python metrics.silhouette_samples Function Code Examples

This article collects typical usage examples of the sklearn.metrics.silhouette_samples function in Python. If you have been wondering what exactly silhouette_samples does, how to call it, or what real uses of it look like, the curated code examples below should help.


The following presents 15 code examples of the silhouette_samples function, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
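Before the project-specific examples, here is a minimal self-contained sketch of the basic call (the toy data and cluster count are illustrative assumptions, not taken from any example on this page). silhouette_samples(X, labels) returns one silhouette coefficient in [-1, 1] per sample; silhouette_score(X, labels) is simply their mean.

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score

# Illustrative toy data: 300 points in 4 well-separated blobs
X, _ = make_blobs(n_samples=300, centers=4, random_state=0)
labels = KMeans(n_clusters=4, n_init=10, random_state=0).fit_predict(X)

# One coefficient in [-1, 1] per sample
sample_values = silhouette_samples(X, labels)
print(sample_values.shape)          # (300,)

# The overall score is the mean of the per-sample values
print(silhouette_score(X, labels))  # == sample_values.mean()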

Example 1: crank_feats

def crank_feats(fargs):
    rss, ccs, lv, installed_in, dfile, nfeatures = fargs
    noaa_init(installed_in)
    wat = pd.read_csv(dfile).set_index('station')

    es = ['e' + str(x) for x in range(0, nfeatures)]
    #ccs = [3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40]
    #rss=[0, 1]
    prefix='eigen' + str(nfeatures)
    #let's do some clustering with the six eigenvectors and see how they hold together
    flatnew, nmeans, nstds = flatten(wat[es])  #strictly speaking not necessary since 
    flatold, omeans, ostds = flatten(wat[lv])

    #note: this method flattens wat internally
    produce_kmeans_climates(wat, es, ccs, rss, prefix)

    for rs in rss:

        kf = pd.read_csv(noaafile('climates/' + prefix + '_rs_' + str(rs) + '.csv'))
        for cc in ccs:
            #this silhouettes thing gobbles memory, I'm guessing because each worker
            #creates an entire new metric matrix.
            kf['sil_eigen_' + str(cc)]  = silhouette_samples(flatnew, kf['vtx'+str(cc)].values)
            #pull out silhouette scores on the old metric too, just for fun...
            kf['sil_old_' + str(cc)]  = silhouette_samples(flatold, kf['vtx'+str(cc)].values)

        kf.to_csv(noaafile('climates/' + prefix + '_sil_rs_' + str(rs) + '.csv'), index=False)    
Developer ID: lpriccio, Project: noaa, Lines of code: 27, Source file: nfeats_20160407.py

Example 2: bestRep

def bestRep(dat,labels,outName):
    bestExample = []
    silSamp = metrics.silhouette_samples(dat, labels)
    for num in np.unique(labels):
        clusterMask = labels==num
        bestExample.append(outName[clusterMask][np.argmax(silSamp[clusterMask])])
    return bestExample
Developer ID: dstuck, Project: CompChemClustering, Lines of code: 7, Source file: chemClustering.py

Example 3: test_silhouette_samples

    def test_silhouette_samples(self):
        result = self.df.metrics.silhouette_samples()
        expected = metrics.silhouette_samples(self.data, self.pred)

        self.assertTrue(isinstance(result, pdml.ModelSeries))
        self.assert_index_equal(result.index, self.df.index)
        self.assert_numpy_array_almost_equal(result.values, expected)
Developer ID: Sandy4321, Project: pandas-ml, Lines of code: 7, Source file: test_metrics.py

Example 4: get_silhouette

def get_silhouette(df):
    df=df[(df.AB!=".")].copy()
    df.loc[:,'AB']=pd.to_numeric(df.loc[:,'AB'])
    df.loc[:,'CN']=pd.to_numeric(df.loc[:,'CN'])

    tp=df.iloc[0,:].loc['svtype']

    [mn_CN, mn_AB]=df.loc[:, ['CN', 'AB']].mean(skipna=True)
    [sd_CN, sd_AB]=df.loc[:, ['CN', 'AB']].std(skipna=True)

    if df.loc[:,'GT'].unique().size==1:
        df.loc[:,'sil_gt_avg']=1
        df.loc[:, 'sil_gt']=1
        df=df[ ['var_id', 'sample', 'svtype', 'AF', 'GT', 'CN', 'AB', 'sil_gt_avg', 'sil_gt']]
        return df

    #standardize the 2 dims
    if sd_AB>0.01:
        df.loc[:, 'AB1']=(df.loc[:,'AB']-mn_AB)/sd_AB
    else: 
        df.loc[:, 'AB1']=df.loc[:, 'AB']
    if tp in ['DEL', 'DUP', 'MEI'] or sd_CN>0.01:
        df.loc[:, 'CN1']=(df.loc[:,'CN']-mn_CN)/sd_CN
    else:
        df.loc[:, 'CN1']=df.loc[:, 'CN']

    
    gt_code={'0/0':1, '0/1':2, '1/1':3}
    df.loc[:,'gtn']=df.loc[:, 'GT'].map(gt_code)

    dist_2d_sq=spatial.distance.squareform(spatial.distance.pdist(df[['AB1', 'CN1']], metric='cityblock'))
    df.loc[:, 'sil_gt_avg']=metrics.silhouette_score(dist_2d_sq, df.loc[:, 'gtn'].values, metric='precomputed')
    df.loc[:, 'sil_gt']=metrics.silhouette_samples(dist_2d_sq, df.loc[:, 'gtn'].values, metric='precomputed')
    df=df[ ['var_id', 'sample', 'svtype', 'AF', 'GT', 'CN', 'AB', 'sil_gt_avg', 'sil_gt']]
    return df
Developer ID: abelhj, Project: svtools, Lines of code: 35, Source file: gt_silhouette.py
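Example 4 above, and examples 5 and 7 below, all pass a precomputed distance matrix rather than raw features. A minimal sketch of that pattern (the points and labels here are illustrative assumptions): build a square pairwise-distance matrix, then call silhouette_samples with metric='precomputed'.

import numpy as np
from scipy.spatial import distance
from sklearn.metrics import silhouette_samples

# Illustrative data: six 2-D points in two hand-assigned clusters
X = np.array([[0.0, 0.0], [0.1, 0.2], [0.2, 0.1],
              [5.0, 5.0], [5.1, 4.9], [4.9, 5.2]])
labels = np.array([0, 0, 0, 1, 1, 1])

# Condensed pairwise distances -> square matrix, as in example 4
D = distance.squareform(distance.pdist(X, metric='cityblock'))

# With metric='precomputed', the first argument is the distance matrix itself
print(silhouette_samples(D, labels, metric='precomputed'))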

Example 5: silhouette_original_clusterings

def silhouette_original_clusterings(dataset='CB1', neuropil='Antennal_lobe', clusterer_or_k=60):
    """Returns a pandas dataframe with the silhouette index of each cluster member.
    The dataframe has columns (cluster_id, member_id, silhouette).
    """

    # Read the expression matrix
    print('Reading expression matrix')
    Xdf = ExpressionDataset.dataset(dset=dataset, neuropil=neuropil).Xdf(index_type='string')

    # Generate a flat map cluster_id -> members
    print('Finding cluster assignments')
    clusters_df, _ = get_original_clustering(dataset=dataset, neuropil=neuropil,
                                             clusterer_or_k=clusterer_or_k)
    dfs = []
    for cluster_id, members in zip(clusters_df.cluster_id,
                                   clusters_df.original_voxels_in_cluster):
        dfs.append(pd.DataFrame({'cluster_id': cluster_id, 'member_id': members}))
    members_df = pd.concat(dfs).set_index('member_id').loc[Xdf.index]

    # Compute the distance matrix - this must be parameterised
    print('Computing distance')
    import mkl
    mkl.set_num_threads(6)
    D = dicedist_metric(Xdf)

    # Compute silhouette
    # Here we could go for the faster implementation in third_party, if needed
    print('Computing silhouette index')
    members_df['silhouette'] = silhouette_samples(D.values,
                                                  members_df.cluster_id.values,
                                                  metric='precomputed')
    return (members_df.
            reset_index().
            rename(columns=lambda col: {'index': 'member_id'}.get(col, col))
            [['cluster_id', 'member_id', 'silhouette']])
Developer ID: strawlab, Project: braincode, Lines of code: 35, Source file: clusters_quality.py

Example 6: cluster_driver

def cluster_driver(a_driver):
    # Standardise the per-driver statistics before clustering
    X = StandardScaler().fit_transform(a_driver['DStats'])

    db = DBSCAN(eps=0.6, min_samples=5).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters found, ignoring noise points labelled -1
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print "###############################################################################"
    # Rescale the per-sample silhouette values from [-1, 1] to [0, 1]
    return (metrics.silhouette_samples(X, labels) + 1) / 2
Developer ID: RobbieShan, Project: MindOnData, Lines of code: 27, Source file: eda+3.0.py

Example 7: silhouette_analysis

def silhouette_analysis(clustering, labels=None):
    distance_df = clustering['distance_df']
    if labels is None:
        labels = clustering['labels']
    sample_scores = silhouette_samples(distance_df, metric='precomputed', labels=labels)
    score = np.mean(sample_scores)
    return sample_scores, score
Developer ID: IanEisenberg, Project: Self_Regulation_Ontology, Lines of code: 7, Source file: utils.py

Example 8: cluster

def cluster(algorithm, data, topics, make_silhouette=False):
  print str(algorithm)
  clusters = algorithm.fit_predict(data)
  labels = algorithm.labels_
  print 'Homogeneity: %0.3f' % metrics.homogeneity_score(topics, labels)
  print 'Completeness: %0.3f' % metrics.completeness_score(topics, labels)
  print 'V-measure: %0.3f' % metrics.v_measure_score(topics, labels)
  print 'Adjusted Rand index: %0.3f' % metrics.adjusted_rand_score(topics, labels)
  print 'Silhouette test: %0.3f' % metrics.silhouette_score(data, labels)
  print ' ***************** '
  
  silhouettes = metrics.silhouette_samples(data, labels)
  num_clusters = len(set(clusters))
  print 'num clusters: %d' % num_clusters
  print 'num fitted: %d' % len(clusters)

  # Make a silhouette plot if the flag is set
  if make_silhouette:
    order = numpy.lexsort((-silhouettes, clusters)) 
    indices = [numpy.flatnonzero(clusters[order] == k) for k in range(num_clusters)]
    ytick = [(numpy.max(ind)+numpy.min(ind))/2 for ind in indices]
    ytickLabels = ["%d" % x for x in range(num_clusters)]
    cmap = cm.jet( numpy.linspace(0,1,num_clusters) ).tolist()
    clr = [cmap[i] for i in clusters[order]]

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.barh(range(data.shape[0]), silhouettes[order], height=1.0,   
            edgecolor='none', color=clr)
    ax.set_ylim(ax.get_ylim()[::-1])
    plt.yticks(ytick, ytickLabels)
    plt.xlabel('Silhouette Value')
    plt.ylabel('Cluster')
    plt.savefig('cluster.png')
Developer ID: RuthRainbow, Project: DataMining, Lines of code: 34, Source file: scilearn.py

Example 9: visualize_silhouette_score

def visualize_silhouette_score(X,y_km):

    cluster_labels = np.unique(y_km)
    n_clusters = cluster_labels.shape[0]
    silhouette_vals = metrics.silhouette_samples(X, y_km, metric='euclidean')
    y_ax_lower, y_ax_upper = 0, 0
    yticks = []
    for i, c in enumerate(cluster_labels):
        c_silhouette_vals = silhouette_vals[y_km == c]
        c_silhouette_vals.sort()
        y_ax_upper += len(c_silhouette_vals)
        color = cm.jet(i / n_clusters)
        plt.barh(range(y_ax_lower, y_ax_upper),
                c_silhouette_vals,
                height=1.0,
                edgecolor='none',
                color=color)
        yticks.append((y_ax_lower + y_ax_upper) / 2)
        y_ax_lower += len(c_silhouette_vals)

    silhouette_avg = np.mean(silhouette_vals)
    plt.axvline(silhouette_avg,
                color="red",
                linestyle="--")
    plt.yticks(yticks, cluster_labels + 1)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette coefficient')
    plt.show()
Developer ID: wislish, Project: Python-Data-Analysis, Lines of code: 30, Source file: userClassify.py

Example 10: fit

    def fit(self, X, y=None, **kwargs):
        """
        Fits the model and generates the silhouette visualization.
        """
        # TODO: decide to use this method or the score method to draw.
        # NOTE: Probably this would be better in score, but the standard score
        # is a little different and I'm not sure how it's used.

        # Fit the wrapped estimator
        self.estimator.fit(X, y, **kwargs)

        # Get the properties of the dataset
        self.n_samples_ = X.shape[0]
        self.n_clusters_ = self.estimator.n_clusters

        # Compute the scores of the cluster
        labels = self.estimator.predict(X)
        self.silhouette_score_ = silhouette_score(X, labels)
        self.silhouette_samples_ = silhouette_samples(X, labels)

        # Draw the silhouette figure
        self.draw(labels)

        # Return the estimator
        return self
Developer ID: DistrictDataLabs, Project: yellowbrick, Lines of code: 25, Source file: silhouette.py

Example 11: cluster_driver

def cluster_driver(a_driver):
    # Standardise the per-driver statistics before clustering
    X = StandardScaler().fit_transform(a_driver['DStats'])

    db = DBSCAN(eps=0.45).fit(X)
    labels = db.labels_

    print "###############################################################################"
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric="mahalanobis"))
    # Rescale the per-sample silhouette values from [-1, 1] to [0, 1]
    return (metrics.silhouette_samples(X, labels, metric="mahalanobis") + 1) / 2
Developer ID: RobbieShan, Project: MindOnData, Lines of code: 27, Source file: eda+11.0.py

Example 12: run_clutering

def run_clutering(n_sites, order_dict, sim_mat):

    n_clusters = 6
    name_file = 'clustering_sil' + str(n_clusters)
    output_file = open(name_file, 'w')
    name_file1 = 'clustering_labels' + str(n_clusters)
    output_file1 = open(name_file1, 'w')

    spectral = cluster.SpectralClustering(n_clusters=n_clusters,
                                          eigen_solver='arpack', affinity='precomputed')
    labels = spectral.fit_predict(sim_mat)

    silhouette_avg = metrics.silhouette_score(sim_mat, labels)
    output_file.write(" ".join(["aver silhouette_score:", str(silhouette_avg)]))

    # Compute the silhouette scores for each sample
    sample_silhouette_values = metrics.silhouette_samples(sim_mat, labels)

    for siteid in order_dict:
        stringa = ' '.join([siteid, str(sample_silhouette_values[order_dict[siteid]])])
        output_file.write(stringa + '\n')

    for siteid in order_dict:
        stringa = ' '.join([str(siteid), str(labels[order_dict[siteid]])])
        output_file1.write(stringa + '\n')
Developer ID: SherazT, Project: Radiumone_code, Lines of code: 29, Source file: compute_spectral_clustering.py

Example 13: calculateNumberOfIdealClusters

def calculateNumberOfIdealClusters(maxAmount, corpus):
	print "Initializing silhouette analysis"
	range_n_clusters = range(2, maxAmount) # max amount of clusters equal to amount of jobs

	silhouette_high = 0
	silhouette_high_n_clusters = 2

	for n_clusters in range_n_clusters:
		# Initialize the clusterer with n_clusters value
		cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward", affinity="euclidean")
		cluster_labels = cluster.fit_predict(corpus)

		# The silhouette_score gives the average value for all the samples.
		# This gives a perspective into the density and separation of the formed clusters
		silhouette_avg = silhouette_score(corpus, cluster_labels)

		print "For n_clusters = %d, the average silhouette_score is: %.5f" % (n_clusters, silhouette_avg)

		if (silhouette_avg > silhouette_high):
		    silhouette_high = silhouette_avg
		    silhouette_high_n_clusters = n_clusters

		# Compute the silhouette scores for each sample
		sample_silhouette_values = silhouette_samples(corpus, cluster_labels)

	print ("Highest score = %f for n_clusters = %d" % (silhouette_high, silhouette_high_n_clusters))
	return silhouette_high_n_clusters
Developer ID: edwardmp, Project: clustering-job-offers-and-assessing-job-similarity, Lines of code: 27, Source file: clustering.py

Example 14: find_clusters

def find_clusters(df, k_vals=[4, 9, 16, 25], how='hierarchical'):
    '''Find clusters, and if method is k-means run silhouette analysis
    to determine the value of k.

    Args:
        df (data frame): A data frame with normalised expression data.
        k_vals (list or range): The range over which to test k.
        how ('hierarchical' or 'kmeans'): Clustering method.

    Returns:
        A list of cluster numbers.

    '''

    ## Don't run the silhouette analysis for hierarchical clustering,
    ## just calculate the clusters using estimate of k.
    if how == 'hierarchical':
        k = int(np.sqrt((len(df) / 2.0)))
        hc = hac.linkage(df, method='average')
        optimal_clusters = hac.fcluster(hc, t=k, criterion='maxclust')

    ## If method is k-means, run silhouette analysis.
    elif how == 'kmeans':
        best_combined_score = 0
        optimal_k = 2

        ## Try values of k from range and keep track of optimal k according
        ## to silhouette score.
        for k in k_vals:
            km = KMeans(n_clusters=k, random_state=10)
            clusters = km.fit_predict(df)
            silhouette_avg = silhouette_score(df, clusters)
            sample_silhouette_values = silhouette_samples(df, clusters)
            above_mean = 0
            silhouette_sizes = []

            for i in range(k):
                ith_cluster_silhouette_values = sample_silhouette_values[clusters == i]
                size_cluster_i = ith_cluster_silhouette_values.shape[0]
                silhouette_sizes.append(size_cluster_i)
                if max(ith_cluster_silhouette_values) > silhouette_avg:
                    above_mean += 1

            ## This combined score should pick the best value of k
            above_mean_score = float(above_mean) / k
            std_score = 1.0/np.std(silhouette_sizes) if np.std(silhouette_sizes) > 1.0 else 1.0
            combined_score = (silhouette_avg + above_mean_score + std_score) / 3

            ## Put the clusters in the new column in the data frame.
            if combined_score > best_combined_score:
                best_combined_score = combined_score
                optimal_k = k
                optimal_clusters = clusters

        optimal_clusters = [cluster + 1 for cluster in optimal_clusters]

    return optimal_clusters
Developer ID: peteashton, Project: dots_for_microarrays, Lines of code: 57, Source file: dots_analysis.py

Example 15: test_gmm

def test_gmm():
    sil = pyclust.validate.Silhouette()
    sil_score = sil.score(X, ypred, sample_size=None)

    print(sil_score[0])

    print(sil.sample_scores[:10])

    print(silhouette_score(X, ypred, sample_size=None))
    
    print(silhouette_samples(X, ypred)[:10])
Developer ID: dominguezus, Project: pyclust, Lines of code: 11, Source file: test_silhouette.py


Note: The sklearn.metrics.silhouette_samples examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers, and copyright in the source code remains with the original authors. For distribution and use, please refer to the license of the corresponding project; do not reproduce without permission.