

Python KMeans.transform Method Code Examples

This article collects typical usage examples of the Python method sklearn.cluster.KMeans.transform. If you have been wondering what exactly KMeans.transform does, or how to use it in practice, the curated examples below should help. You can also explore the other usage examples for sklearn.cluster.KMeans, the class this method belongs to.


The following presents 15 code examples of KMeans.transform, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
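Before the examples, a quick sketch of what the method does: after fit, KMeans.transform maps each sample to its Euclidean distances from the k cluster centers, returning an array of shape (n_samples, n_clusters). The data below is synthetic, purely for illustration:

import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(0).rand(100, 4)          # synthetic data
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)

D = km.transform(X)        # distance from every sample to every center
print(D.shape)             # (100, 3)
# the nearest center per row reproduces the fitted labels
assert (D.argmin(axis=1) == km.labels_).all()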

Example 1: gapstat

# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
	def gapstat(self, ref_size=10, max_iter=300, n_init=3):
		'''Estimate the gap statistic (Tibshirani et al.) over self.range using
		ref_size random reference datasets. Note that this variant uses raw
		within-cluster dispersions rather than their logarithms.'''
		Wkestrand = np.zeros(len(self.range))
		Wk = np.zeros(len(self.range))
		sk = np.zeros(len(self.range))

		sample = self.randomData(ref_size)

		for indk, k in enumerate(self.range):
			km = KMeans(n_clusters=k, init='k-means++', max_iter=max_iter, n_init=n_init)
			Wkrand = []
			for i in range(ref_size):
				km.fit(sample[i])
				SS = km.transform(sample[i])  # sample-to-center distances (not used below)
				Wkrand.append(self.intraDist(km.labels_.tolist(), k, km.cluster_centers_))

			# mean reference dispersion for this k
			Wkestrand[indk] = (1.0 / ref_size) * sum(Wkrand)

			km.fit(self.X)
			XX = km.transform(self.X)  # distances on the real data (not used below)
			clusters = km.labels_.tolist()
			Wk[indk] = self.intraDist(clusters, k, km.cluster_centers_)
			sk[indk] = np.sqrt((1.0 / ref_size) * sum([(Wkrand[i] - Wkestrand[indk]) ** 2 for i in range(ref_size)]))

		sk *= np.sqrt(1 + 1.0 / ref_size)

		# Wkestrand is already averaged over the reference sets, so no extra
		# 1/ref_size factor is needed here (the original re-divided by mistake)
		Gapk = [Wkestrand[i] - Wk[i] for i in range(len(self.range))]

		# for each k except the last, report Gap(k) and the decision term
		# Gap(k) - Gap(k+1) + s_{k+1}
		return [(k, Gapk[j], Gapk[j] - Gapk[j + 1] + sk[j + 1]) for j, k in enumerate(self.range[:-1])]
Author: BenHeubl, Project: speaksLike, Lines: 36, Source: clusteringk.py
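Following Example 1, Tibshirani's rule chooses the smallest k whose decision term Gap(k) - Gap(k+1) + s_{k+1} is non-negative. A minimal sketch, assuming the (k, Gap(k), decision term) tuples that gapstat() returns:

def choose_k(gapstat_results):
    # gapstat_results: list of (k, gap, gap_k - gap_k1 + s_k1) tuples
    for k, gap, decision in gapstat_results:
        if decision >= 0:
            return k                   # smallest k meeting the criterion
    return gapstat_results[-1][0]      # fallback: largest k examined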

Example 2: cluster_driver

# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def cluster_driver(a_driver):
    # standardize the driver's trip statistics before PCA
    X = StandardScaler().fit_transform(a_driver['DStats'])

    # project onto the first 5 principal components
    pca = PCA(n_components=5)
    Xpca = pca.fit(X).transform(X)

    if plotflag == True:
        fig = scatterplot_matrix(np.transpose(Xpca),
                                 ['PC1', 'PC2', 'PC3', 'PC4'],
                                 linestyle='none', marker='o', color='black', mfc='none')
        fig.suptitle('Simple Scatterplot Matrix')
        plt.show()

    # a single cluster, so transform() yields each trip's distance to the
    # shared centroid -- effectively an outlier score
    db = KMeans(n_clusters=1, n_jobs=-1).fit(Xpca)
    labels = db.labels_

    print("###############################################################################")
    print("%% Variance Explained: %0.3f" % sum(pca.explained_variance_ratio_))

    # normalize distances to [0, 1] and flip them into similarity scores
    # (.max() avoids ambiguous row comparisons from the builtin max())
    return 1 - (db.transform(Xpca) / db.transform(Xpca).max())
Author: RobbieShan, Project: MindOnData, Lines: 58, Source: eda+13.0.py

Example 3: best_lda_cluster_wine

# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
 def best_lda_cluster_wine(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_wine_data_lda_best()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     ##
     ## K-Means
     ##
     km = KMeans(n_clusters=4, algorithm='full')
     X_train_transformed = km.fit_transform(X_train_scl)  # distances to the 4 centers
     X_test_transformed = km.transform(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/wine_kmeans_lda_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_kmeans_lda_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_kmeans_lda_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_kmeans_lda_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
     
     ##
     ## GMM
     ##
     # GaussianMixture has no transform(); use the posterior component
     # probabilities instead (the original mistakenly reused the fitted
     # KMeans model in this block)
     gmm = GaussianMixture(n_components=4, covariance_type='full')
     X_train_transformed = gmm.fit(X_train_scl).predict_proba(X_train_scl)
     X_test_transformed = gmm.predict_proba(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/wine_gmm_lda_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_gmm_lda_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_gmm_lda_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_gmm_lda_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Author: rbaxter1, Project: CS7641, Lines: 50, Source: part3.py

Example 4: inertia_clustering_analysis

# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def inertia_clustering_analysis(ds, max_clusters=13):

    inertia_val = np.array([])

    # try cluster counts from 2 up to max_clusters + 1
    for i in np.arange(max_clusters) + 2:
        kmeans = KMeans(init='k-means++', n_clusters=i, n_init=10)
        # fit_transform fits the model and returns the cluster distances;
        # calling transform() on an unfitted estimator would raise an error
        kmeans.fit_transform(ds.samples)
        inertia_val = np.append(inertia_val, kmeans.inertia_)

    f = plt.figure()
    a = f.add_subplot(111)
    a.plot(inertia_val)
    plt.show()

    return inertia_val
Author: robbisg, Project: mvpa_itab_wu, Lines: 18, Source: main_wu.py
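The inertia curve from Example 4 is usually read with the "elbow" heuristic. A rough sketch of automating that read (the 10% threshold and the helper name elbow_k are assumptions, not part of the example):

import numpy as np

def elbow_k(inertia_val, first_k=2, threshold=0.10):
    # relative inertia drop from each cluster count to the next
    drops = -np.diff(inertia_val) / inertia_val[:-1]
    for i, drop in enumerate(drops):
        if drop < threshold:           # improvement has leveled off
            return first_k + i
    return first_k + len(drops)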

Example 5: KnnClassify

# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
    def KnnClassify(self, candi):
        words = self.extracAllword(candi)
        word_dict = {w: idx for idx, w in enumerate(words)}
        # bag-of-words matrix: one row per candidate string
        x = [[0 for _ in range(len(words))] for _ in range(len(candi))]
        if len(x) < 3:
            return candi
        for id, s in enumerate(candi):
            tmp = self.text_to_vector(s)
            for k, v in tmp.items():
                x[id][word_dict[k]] = float(v)

        km = KMeans(n_clusters=3)
        km.fit(x)
        samples = {}
        X_new = km.transform(x)  # distance of each candidate to each centroid
        # group candidates by cluster, recording each one's distance
        # to its own cluster's centroid
        for idx, l in enumerate(km.labels_):
            samples.setdefault(l, {})[idx] = X_new[idx][l]
        ret = []
        # within each cluster, emit the candidates farthest from the centroid first
        for k, v in samples.items():
            sortedv = sorted(v.items(), key=operator.itemgetter(1), reverse=True)
            for it in sortedv:
                ret.append(candi[it[0]])
        return ret
Author: siyuqtt, Project: independent, Lines: 33, Source: util.py

Example 6: kmean_data

# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def kmean_data(tune_path=None, test_path=None, cluster=3, isPCA=True):
    '''
    :param tune_path: src of a tuning data set
    :param test_path: src of a testing data set
    :return: tuning data after clustering, in the form of
    [independent values, dependent values]
    '''

    def find_min(a):
        return a.min()

    if not tune_path:
        tune_path = "./data/ant/ant-1.4.csv"
    if not test_path:
        test_path = "./data/ant/ant-1.5.csv"
    df_tune = get_data(tune_path, "tune")
    df_test = get_data(test_path, "test")
    if isPCA:
        tune_x, tune_y = pca_analysis(df_tune)
        test_x, test_y = pca_analysis(df_test)
    else:
        tune_x, tune_y = get_xy(df_tune, normalize=True)
        test_x, test_y = get_xy(df_test, normalize=True)
    # cluster on the testing data
    kmean = KMeans(n_clusters=cluster).fit(test_x)
    avg_distance = kmean.inertia_ / float(len(test_x))
    # distance from each tuning row to every cluster center
    tune_distance = kmean.transform(tune_x)
    min_distance = np.apply_along_axis(find_min, 1, tune_distance)
    # keep tuning rows whose nearest cluster center lies within
    # twice the average distance
    pick_index = min_distance < avg_distance * 2
    normal_tune_x, normal_tune_y = get_xy(df_tune, normalize=False)
    _tune_x, _tune_y = normal_tune_x[pick_index], normal_tune_y[pick_index]
    return [_tune_x, _tune_y]
Author: ai-se, Project: TuneData, Lines: 37, Source: clustering.py

Example 7: kmeans

# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def kmeans(data, model_id, x_col, n_clusters):

    # |Create model and fit data
    model = KMeans(n_clusters)

    # |Add distance to each cluster for each row to summary data
    headers = []
    for i in range(n_clusters):
        headers.append('dist_%s' % str(i))
    # |fit_transform fits the model and returns the cluster distances;
    # |the original called transform() on a model that was never fitted
    dist = pd.DataFrame(model.fit_transform(data.x), columns=headers)
    data.current_df = data.current_df.join(dist)

    data.df['kmeans']['data'] = data.df['kmeans']['data'].append(data.current_df, ignore_index=True)

    # |Create DataFrame with each cluster and the mean value for each input column
    df = pd.DataFrame()
    for i in range(n_clusters):
        clus = {'cluster': i}
        for j in range(len(x_col)):
            clus['%s_mean' % x_col[j]] = model.cluster_centers_[i][j]
        df = df.append(clus, ignore_index=True)
    df['model_id'] = model_id
    data.df['kmeans']['clusters'] = data.df['kmeans']['clusters'].append(df, ignore_index=True)

    return data, model
Author: rosspalmer, Project: DataTools, Lines: 27, Source: predict.py
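Example 7 treats the per-cluster distances as engineered features. Because KMeans implements transform, it can also act as a feature-extraction step inside a scikit-learn Pipeline; a minimal sketch on synthetic data (the pipeline itself is an illustration, not part of the example above):

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X, y = rng.rand(200, 6), rng.randint(0, 2, 200)    # synthetic data

# downstream steps see the (n_samples, n_clusters) distance matrix
clf = make_pipeline(KMeans(n_clusters=8, n_init=10, random_state=0),
                    LogisticRegression())
clf.fit(X, y)
print(clf.score(X, y))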

Example 8: kmeans_betacv

# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def kmeans_betacv(data, num_cluster, batch_kmeans=False, n_runs = 10,
                  confidence = 0.90):
    '''
    Computes the BetaCV for running KMeans on the dataset. This method
    returns the BetaCV value and half the size of the confidence interval
    for that value (the BetaCV is averaged over the given number of runs).
    
    Arguments
    ---------
    data: matrix
        A matrix of observations. If this is sparse, `batch_kmeans` must 
        be True
    num_cluster: int 
        number of clusters to run k-means for
    batch_kmeans: bool (defaults to False)
        if `sklearn.cluster.MiniBatchKMeans` should be used. This is faster
        and suitable for sparse datasets, but less accurate.
    n_runs: int (default = 10)
        Number of runs to compute the BetaCV
    confidence: double [0, 1) (default = 0.9)
        The confidence used to compute half the confidence interval size
    
    Returns
    -------
    The betacv and half of the confidence interval size
    '''
    algorithm = None
    if not batch_kmeans:
        algorithm = KMeans(num_cluster)
    else:
        algorithm = MiniBatchKMeans(num_cluster)
    
    inter_array = np.zeros(n_runs)
    intra_array = np.zeros(n_runs)
    for i in range(n_runs):
        #Run K-Means
        algorithm.fit(data)
        
        centers = algorithm.cluster_centers_
        labels = algorithm.labels_
        
        #KMeans in sklearn uses euclidean
        dist_centers = pairwise.euclidean_distances(centers)
        
        #Inter distance
        mean_dist_between_centers = np.mean(dist_centers)
        inter_array[i] = mean_dist_between_centers

        #Intra distance
        dist_all_centers = algorithm.transform(data)
        intra_dists = []
        for doc_id, cluster in enumerate(labels):
            dist = dist_all_centers[doc_id, cluster]
            intra_dists.append(dist)
        intra_array[i] = np.mean(intra_dists)
    
    betacv = intra_array / inter_array
    cinterval = half_confidence_interval_size(betacv, confidence)
    return np.mean(betacv), cinterval
Author: flaviovdf, Project: vodlibs, Lines: 61, Source: cluster.py
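A minimal usage sketch for Example 8 on synthetic data (lower BetaCV means clusters that are tight relative to the spacing between their centers):

import numpy as np

X = np.random.RandomState(42).rand(500, 8)         # synthetic data
betacv, half_ci = kmeans_betacv(X, num_cluster=5, n_runs=10, confidence=0.90)
print('BetaCV: %.3f +/- %.3f' % (betacv, half_ci))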

Example 9: compute_clusters

# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def compute_clusters(topics, match):
    recipe_topics = topics['W'][match, :]
    cluster = KMeans(n_clusters=4)
    # cluster = AffinityPropagation()
    cluster.fit(recipe_topics)
    distances = cluster.transform(recipe_topics)

    return cluster, distances
Author: jdstemmler, Project: stone-soup, Lines: 10, Source: search.py

Example 10: cluster_encode

# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def cluster_encode(X_train, X_test, codebook='kmeans', k=25):
    if codebook == 'kmeans':
        cb = KMeans(k, n_init=1, init='random')
    elif codebook == 'gmm':
        # note: sklearn's GMM has no transform(); this path would need
        # predict_proba() in place of the transform() calls below
        cb = GMM(n_components=k)
    X = np.vstack((X_train, X_test))
    X = StandardScaler().fit_transform(X)
    print('_' * 80)
    print('fitting codebook')
    print()
    print(cb)
    print()
    # caution: the codebook is fitted on standardized data, so X_train and
    # X_test should arguably be scaled the same way before transform
    cb.fit(X)
    print('fin.')
    X_train = cb.transform(X_train)
    X_test = cb.transform(X_test)
    return X_train, X_test
Author: mhdella, Project: kaggle-solar-energy, Lines: 19, Source: grid_search.py

Example 11: _cluster

# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
    def _cluster(self, index):
        data = self.data[index]
        kmeans = KMeans(n_clusters=2, random_state=0).fit(data)
        labels = kmeans.labels_
        l_i = np.where(labels == 0)[0]
        r_i = np.where(labels == 1)[0]
        left_index = index[l_i]
        right_index = index[r_i]
        if len(right_index) - len(left_index) > 1:
            distances = kmeans.transform(data[r_i])
            left_index, right_index = self._rebalance(
                left_index, right_index, distances[:, 1])
        elif len(left_index) - len(right_index) > 1:
            distances = kmeans.transform(data[l_i])
            left_index, right_index = self._rebalance(
                right_index, left_index, distances[:, 0])

        return left_index, right_index
Author: q64545, Project: x-deeplearning, Lines: 20, Source: cluster.py

Example 12: test_transform

# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def test_transform():
    k_means = KMeans(k=n_clusters)
    k_means.fit(X)
    X_new = k_means.transform(k_means.cluster_centers_)

    for c in range(n_clusters):
        assert_equal(X_new[c, c], 0)
        for c2 in range(n_clusters):
            if c != c2:
                assert_true(X_new[c, c2] > 0)
Author: bennihepp, Project: scikit-learn, Lines: 12, Source: test_k_means.py

Example 13: test_transform

# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def test_transform():
    km = KMeans(n_clusters=n_clusters)
    km.fit(X)
    X_new = km.transform(km.cluster_centers_)

    for c in range(n_clusters):
        assert_equal(X_new[c, c], 0)
        for c2 in range(n_clusters):
            if c != c2:
                assert_greater(X_new[c, c2], 0)
Author: Lavanya-Basavaraju, Project: scikit-learn, Lines: 12, Source: test_k_means.py

Example 14: run_k_means

# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def run_k_means(df, numberclusters, geoidlabel='geoid10', plot_silouette=True):
	'''Uses sklearn to run kmeans.

	ARGUMENTS:
	1) df: A dataframe with a geoid column
	2) geoidlabel: the label of the geoid column
	3) plot_silouette: whether or not to plot the silhouette of each cluster

	OUTPUT: Returns a three-part tuple:
	1) the kmeans sklearn model
	2) a dictionary with geoids as keys and the cluster as the value
	3) a dictionary with clusters as keys and a list of related geoids as the value'''

	#Use K means to cluster the dataset.
	x = df[['wkday_0','wkday_1','hrbin_morning',
	        'hrbin_afternoon','hrbin_evening',
	         'hrbin_latenight','hrbin_dawn']].values
	kmeans = KMeans(n_clusters=numberclusters)
	kmeans.fit(X=x)
	features = df.columns.tolist()[1:]
	geoids = df[geoidlabel]

	#store values in a dictionary
	geoid_dict = defaultdict(int)
	cluster_dict = defaultdict(list)

	#transform() maps x into cluster-distance space: each column holds a
	#neighborhood block's (geoid's) distance to one cluster center, so each
	#block's assigned cluster is the column with the smallest distance
	assigned_cluster = kmeans.transform(x).argmin(axis=1)

	for i in range(kmeans.n_clusters):
	    cluster = np.arange(0, x.shape[0])[assigned_cluster == i]
	    geoids = [df.iloc[geoindx]['hrbin_'] for geoindx in cluster]
	    print(len(geoids), 'cluster #', i)
	    #make a dictionary with cluster as the key, and geoids as the list
	    cluster_dict[i] = geoids
	    #second dictionary to quickly look up what cluster each geoid belongs to
	    for geo in geoids:
	        geoid_dict[geo] = i
	if plot_silouette == True:
	    plot_cluster_silouette_values(x, assigned_cluster, numberclusters)

	#save the dictionaries as CSVs
	save_dictionary_as_csv(cluster_dict, 'data/intermediate_data/kmeans/kmeans_clusterdict.csv')
	save_dictionary_as_csv(geoid_dict, 'data/intermediate_data/kmeans/kmeans_geoiddict.csv')

	return kmeans, geoid_dict, cluster_dict
Author: heggy231, Project: media_mapper, Lines: 52, Source: k_means.py

Example 15: cluster_documents

# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def cluster_documents(n_clusters, doc_term_matrix):
    kmeans = KMeans(n_clusters=n_clusters)

    kmeans = kmeans.fit(doc_term_matrix)

    distances = kmeans.transform(doc_term_matrix)

    results = distances.argmin(axis=1)

    clusters = defaultdict(list)

    for document_index, cluster in enumerate(results):
        clusters[cluster].append((document_index, distances[document_index, cluster]))
        
    return clusters
Author: pmatigakis, Project: wikileaks_clustering, Lines: 17, Source: clusters.py
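A minimal usage sketch for Example 15 (feeding it a TF-IDF matrix is an assumption; any document-term matrix works). Note that the distances.argmin(axis=1) step inside the function assigns each document to the same cluster kmeans.predict would, while also retaining each document's distance to its assigned center:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat", "the dog ran", "stocks fell today", "markets rose"]
doc_term_matrix = TfidfVectorizer().fit_transform(docs)
clusters = cluster_documents(2, doc_term_matrix)
for label, members in clusters.items():
    print(label, members)              # (document_index, distance) pairs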


Note: The sklearn.cluster.KMeans.transform method examples in this article were collected by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by the community, and the copyright of the source code belongs to the original authors; for distribution and use, please refer to the License of the corresponding project. Do not reproduce without permission.