This article collects typical usage examples of the Python method sklearn.cluster.KMeans.transform. If you have been wondering what exactly KMeans.transform does, how to call it, or what real uses of it look like, the hand-picked examples below should help. You may also want to look at usage examples for the containing class, sklearn.cluster.KMeans.
Fifteen code examples of KMeans.transform are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python examples.
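Before the collected examples, a minimal self-contained sketch of what the method returns (the toy data points are made up for illustration): for a fitted estimator with k clusters, transform maps each sample to its Euclidean distances to the k cluster centers, giving an array of shape (n_samples, k).
import numpy as np
from sklearn.cluster import KMeans

X = np.array([[0.0, 0.0], [0.2, 0.1], [5.0, 5.0], [5.1, 4.9]])
km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)

# One row per sample, one column per cluster center;
# each entry is the Euclidean distance to that center.
dists = km.transform(X)
print(dists.shape)           # (4, 2)
print(dists.argmin(axis=1))  # same labels as km.predict(X)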
Example 1: gapstat
# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def gapstat(self, ref_size=10, max_iter=300, n_init=3):
    Wkestrand = np.zeros(len(self.range))
    Wk = np.zeros(len(self.range))
    sk = np.zeros(len(self.range))
    sample = self.randomData(ref_size)
    for indk, k in enumerate(self.range):
        km = KMeans(n_clusters=k, init='k-means++', max_iter=max_iter, n_init=n_init)
        Wkrand = []
        for i in range(ref_size):
            km.fit(sample[i])
            SS = km.transform(sample[i])  # distances of reference samples to the centers
            Wkrand.append(self.intraDist(km.labels_.tolist(), k, km.cluster_centers_))
        Wkestrand[indk] = (1.0 / ref_size) * sum(Wkrand)
        km.fit(self.X)
        XX = km.transform(self.X)  # distances of the real data to the centers
        clusters = km.labels_.tolist()
        Wk[indk] = self.intraDist(clusters, k, km.cluster_centers_)
        sk[indk] = np.sqrt((1.0 / ref_size) * sum([(Wkrand[i] - Wkestrand[indk]) ** 2 for i in range(ref_size)]))
    sk *= np.sqrt(1 + 1.0 / ref_size)
    # Wkestrand already holds the reference-set mean, so no further 1/ref_size factor
    Gapk = [Wkestrand[i] - Wk[i] for i in range(len(self.range))]
    # return min([k for k, j in enumerate([Gapk[g] - Gapk[g + 1] + sk[g + 1] for g in self.range[:-1]]) if j > 0])
    return [(k, Gapk[j], Gapk[j] - Gapk[j + 1] + sk[j + 1]) for j, k in enumerate(self.range[:-1])]
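Design note: each returned tuple carries the diagnostic Gap(k) − Gap(k+1) + s(k+1). Under the gap-statistic rule of Tibshirani et al., the smallest k for which this quantity is non-negative is the estimated number of clusters; the standard formulation applies the statistic to log(Wk), whereas this snippet works with Wk directly.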
Example 2: cluster_driver
# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def cluster_driver(a_driver):
    # scale the per-driver statistics, project onto principal components, then cluster
    X = StandardScaler().fit_transform(a_driver['DStats'])
    pca = PCA(n_components=5)
    Xpca = pca.fit(X).transform(X)
    if plotflag == True:
        fig = scatterplot_matrix(np.transpose(Xpca),
                                 ['PC1', 'PC2', 'PC3', 'PC4'],  # 'PC5' omitted from the plot
                                 linestyle='none', marker='o', color='black', mfc='none')
        fig.suptitle('Simple Scatterplot Matrix')
        plt.show()
    db = KMeans(n_clusters=1, n_jobs=-1).fit(Xpca)
    # db = DBSCAN(eps=0.5).fit(Xpca)
    # core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    # core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    # n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print("###############################################################################")
    # print('Estimated number of clusters: %d' % n_clusters_)
    # print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(Xpca, labels))
    print("Variance explained: %0.3f" % sum(pca.explained_variance_ratio_))
    # score each trip by its distance to the single cluster centre, scaled to [0, 1]
    return 1 - (db.transform(Xpca) / db.transform(Xpca).max())
Example 3: best_lda_cluster_wine
# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def best_lda_cluster_wine(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_wine_data_lda_best()
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)
    ##
    ## K-Means
    ##
    km = KMeans(n_clusters=4, algorithm='full')
    X_train_transformed = km.fit_transform(X_train_scl)
    X_test_transformed = km.transform(X_test_scl)
    # save
    filename = './' + self.save_dir + '/wine_kmeans_lda_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_kmeans_lda_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_kmeans_lda_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_kmeans_lda_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
    ##
    ## GMM
    ##
    gmm = GaussianMixture(n_components=4, covariance_type='full')
    # GaussianMixture has no transform(); posterior probabilities are the
    # soft-assignment analogue (the original snippet refit `km` here,
    # apparently a copy-paste slip)
    X_train_transformed = gmm.fit(X_train_scl).predict_proba(X_train_scl)
    X_test_transformed = gmm.predict_proba(X_test_scl)
    # save
    filename = './' + self.save_dir + '/wine_gmm_lda_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_gmm_lda_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_gmm_lda_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_gmm_lda_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example 4: inertia_clustering_analysis
# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def inertia_clustering_analysis(ds, max_clusters=13):
    inertia_val = np.array([])
    # try k = 2 .. max_clusters + 1
    for i in np.arange(max_clusters) + 2:
        kmeans = KMeans(init='k-means++', n_clusters=i, n_init=10)
        kmeans.fit_transform(ds.samples)  # transform() alone would raise NotFittedError
        inertia_val = np.append(inertia_val, kmeans.inertia_)
    f = plt.figure()
    a = f.add_subplot(111)
    a.plot(inertia_val)
    plt.show()
    return inertia_val
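Usage note: the loop runs k from 2 to max_clusters + 1, so inertia_val[0] corresponds to k = 2; when reading an elbow off the plot, add two to the x-axis position.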
Example 5: KnnClassify
# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def KnnClassify(self, candi):
    words = self.extracAllword(candi)
    word_dict = {w: idx for idx, w in enumerate(words)}
    # bag-of-words matrix: one row per candidate sentence
    x = [[0 for _ in range(len(words))] for _ in range(len(candi))]
    if len(x) < 3:
        return candi
    for id, s in enumerate(candi):
        tmp = self.text_to_vector(s)
        for k, v in tmp.items():
            x[id][word_dict[k]] = float(v)
    km = KMeans(n_clusters=3)
    km.fit(x)
    samples = {}
    X_new = km.transform(x)  # distance of each sentence to the three centers
    for idx, l in enumerate(km.labels_):
        try:
            samples[l][idx] = X_new[idx][l]
        except KeyError:
            samples[l] = {}
            samples[l][idx] = X_new[idx][l]
    # within each cluster, return candidates ordered by distance to their center
    ret = []
    for k, v in samples.items():
        sortedv = sorted(v.items(), key=operator.itemgetter(1), reverse=True)
        for it in sortedv:
            ret.append(candi[it[0]])
    return ret
Example 6: kmean_data
# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def kmean_data(tune_path=None, test_path=None, cluster=3, isPCA=True):
    '''
    :param tune_path: src of a tuning data set
    :param test_path: src of a testing data set
    :return: tuning data after clustering, in the form of
             [independent values, dependent values]
    '''
    def find_min(a):
        return a.min()
    if not tune_path:
        tune_path = "./data/ant/ant-1.4.csv"
    if not test_path:
        test_path = "./data/ant/ant-1.5.csv"
    df_tune = get_data(tune_path, "tune")
    df_test = get_data(test_path, "test")
    if isPCA:
        tune_x, tune_y = pca_analysis(df_tune)
        test_x, test_y = pca_analysis(df_test)
    else:
        tune_x, tune_y = get_xy(df_tune, normalize=True)
        test_x, test_y = get_xy(df_test, normalize=True)
    kmean = KMeans(n_clusters=cluster).fit(test_x)  # use testing data to do clustering
    avg_distance = kmean.inertia_ / float(len(test_x))
    tune_distance = kmean.transform(tune_x)  # distance of each tuning row to every center
    min_distance = np.apply_along_axis(find_min, 1, tune_distance)
    # keep tuning rows whose nearest cluster center is within twice the
    # average distance observed on the testing data
    pick_index = min_distance < avg_distance * 2
    normal_tune_x, normal_tune_y = get_xy(df_tune, normalize=False)
    _tune_x, _tune_y = normal_tune_x[pick_index], normal_tune_y[pick_index]
    return [_tune_x, _tune_y]
Example 7: kmeans
# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def kmeans(data, model_id, x_col, n_clusters):
    # |Create model, fit data, and return prediction of cluster for each row
    model = KMeans(n_clusters)
    model.fit(data.x)  # must fit before transform
    # |Add distance to each cluster for each row to summary data
    headers = []
    for i in range(n_clusters):
        headers.append('dist_%s' % str(i))
    dist = pd.DataFrame(model.transform(data.x), columns=headers)
    data.current_df = data.current_df.join(dist)
    data.df['kmeans']['data'] = data.df['kmeans']['data'].append(data.current_df, ignore_index=True)
    # |Create DataFrame with each cluster and the mean value for each input column
    df = pd.DataFrame()
    for i in range(n_clusters):
        clus = {'cluster': i}
        for j in range(len(x_col)):
            clus['%s_mean' % x_col[j]] = model.cluster_centers_[i][j]
        df = df.append(clus, ignore_index=True)
    df['model_id'] = model_id
    data.df['kmeans']['clusters'] = data.df['kmeans']['clusters'].append(df, ignore_index=True)
    return data, model
Example 8: kmeans_betacv
# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def kmeans_betacv(data, num_cluster, batch_kmeans=False, n_runs=10,
                  confidence=0.90):
    '''
    Computes the BetaCV for running KMeans on the dataset. This method
    returns the BetaCV value and half of the size of the confidence interval
    for that value (the BetaCV is averaged over the given number of runs).
    Arguments
    ---------
    data: matrix
        A matrix of observations. If this is sparse, `batch_kmeans` must
        be True
    num_cluster: int
        number of clusters to run k-means for
    batch_kmeans: bool (defaults to False)
        if `sklearn.cluster.MiniBatchKMeans` should be used. This is faster
        and suitable for sparse datasets, but less accurate.
    n_runs: int (default = 10)
        Number of runs to compute the BetaCV
    confidence: double [0, 1) (default = 0.9)
        The confidence used to compute half the confidence interval size
    Returns
    -------
    The betacv and half of the confidence interval size
    '''
    algorithm = None
    if not batch_kmeans:
        algorithm = KMeans(num_cluster)
    else:
        algorithm = MiniBatchKMeans(num_cluster)
    inter_array = np.zeros(n_runs)
    intra_array = np.zeros(n_runs)
    for i in range(n_runs):
        # Run K-Means
        algorithm.fit(data)
        centers = algorithm.cluster_centers_
        labels = algorithm.labels_
        # KMeans in sklearn uses euclidean distances
        dist_centers = pairwise.euclidean_distances(centers)
        # Inter-cluster distance: mean distance between centers
        mean_dist_between_centers = np.mean(dist_centers)
        inter_array[i] = mean_dist_between_centers
        # Intra-cluster distance: each point's distance to its own center
        dist_all_centers = algorithm.transform(data)
        intra_dists = []
        for doc_id, cluster in enumerate(labels):
            dist = dist_all_centers[doc_id, cluster]
            intra_dists.append(dist)
        intra_array[i] = np.mean(intra_dists)
    betacv = intra_array / inter_array
    cinterval = half_confidence_interval_size(betacv, confidence)
    return np.mean(betacv), cinterval
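A minimal usage sketch, assuming kmeans_betacv and the module's half_confidence_interval_size helper are importable (the two-blob toy data and the seed are made up for illustration):
import numpy as np
rng = np.random.RandomState(0)
data = np.vstack([rng.normal(0, 1, (50, 2)),   # blob around the origin
                  rng.normal(8, 1, (50, 2))])  # well-separated second blob
betacv, half_ci = kmeans_betacv(data, num_cluster=2, n_runs=5)
print('BetaCV: %.3f (+/- %.3f)' % (betacv, half_ci))  # low BetaCV = compact, well-separated clusters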
Example 9: compute_clusters
# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def compute_clusters(topics, match):
    recipe_topics = topics['W'][match, :]
    cluster = KMeans(n_clusters=4)
    # cluster = AffinityPropagation()
    cluster.fit(recipe_topics)
    distances = cluster.transform(recipe_topics)
    return cluster, distances
Example 10: cluster_encode
# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def cluster_encode(X_train, X_test, codebook='kmeans', k=25):
    if codebook == 'kmeans':
        cb = KMeans(k, n_init=1, init='random')
    elif codebook == 'gmm':
        # note: sklearn's GMM does not provide transform(), so this
        # branch would need predict_proba() instead
        cb = GMM(n_components=k)
    X = np.vstack((X_train, X_test))
    X = StandardScaler().fit_transform(X)
    print('_' * 80)
    print('fitting codebook')
    print()
    print(cb)
    print()
    cb.fit(X)
    print('fin.')
    X_train = cb.transform(X_train)
    X_test = cb.transform(X_test)
    return X_train, X_test
Example 11: _cluster
# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def _cluster(self, index):
    data = self.data[index]
    kmeans = KMeans(n_clusters=2, random_state=0).fit(data)
    labels = kmeans.labels_
    l_i = np.where(labels == 0)[0]
    r_i = np.where(labels == 1)[0]
    left_index = index[l_i]
    right_index = index[r_i]
    # if the split is uneven, rebalance the larger side using each surplus
    # point's distance to its own center (one column of the transform output)
    if len(right_index) - len(left_index) > 1:
        distances = kmeans.transform(data[r_i])
        left_index, right_index = self._rebalance(
            left_index, right_index, distances[:, 1])
    elif len(left_index) - len(right_index) > 1:
        distances = kmeans.transform(data[l_i])
        left_index, right_index = self._rebalance(
            right_index, left_index, distances[:, 0])
    return left_index, right_index
Example 12: test_transform
# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def test_transform():
    k_means = KMeans(k=n_clusters)
    k_means.fit(X)
    X_new = k_means.transform(k_means.cluster_centers_)
    for c in range(n_clusters):
        assert_equal(X_new[c, c], 0)
        for c2 in range(n_clusters):
            if c != c2:
                assert_true(X_new[c, c2] > 0)
Example 13: test_transform
# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def test_transform():
    km = KMeans(n_clusters=n_clusters)
    km.fit(X)
    X_new = km.transform(km.cluster_centers_)
    for c in range(n_clusters):
        assert_equal(X_new[c, c], 0)
        for c2 in range(n_clusters):
            if c != c2:
                assert_greater(X_new[c, c2], 0)
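Both versions of this test check the same invariant: transforming the fitted centers themselves yields an n_clusters-by-n_clusters distance matrix with an exactly zero diagonal (every center is at distance zero from itself) and strictly positive off-diagonal entries for distinct centers. The first variant uses the k= constructor argument from early scikit-learn releases; the second uses the current n_clusters= spelling.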
Example 14: run_k_means
# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def run_k_means(df, numberclusters, geoidlabel='geoid10', plot_silouette=True):
    '''Uses sklearn to run kmeans.
    ARGUMENTS:
    1) df: A dataframe with a geoid column
    2) numberclusters: the number of clusters to fit
    3) geoidlabel: the label of the geoid column
    4) plot_silouette: whether or not to plot the silhouettes of each cluster
    OUTPUT: Returns a three-part tuple:
    1) the kmeans sklearn model
    2) a dictionary with geoids as the key, and the cluster as the value
    3) a dictionary with clusters as the key, and a list of related geoids as the value'''
    # Use K-means to cluster the dataset.
    x = df[['wkday_0', 'wkday_1', 'hrbin_morning',
            'hrbin_afternoon', 'hrbin_evening',
            'hrbin_latenight', 'hrbin_dawn']].values
    kmeans = KMeans(n_clusters=numberclusters)
    kmeans.fit(X=x)
    features = df.columns.tolist()[1:]
    geoids = df[geoidlabel]
    # store values in dictionaries
    geoid_dict = defaultdict(int)
    cluster_dict = defaultdict(list)
    # transform(x) maps each row into cluster-distance space: one column per
    # cluster, holding that neighborhood block's (geoid's) distance to the
    # cluster center. The assigned cluster is the one with the smallest distance.
    assigned_cluster = kmeans.transform(x).argmin(axis=1)
    for i in range(kmeans.n_clusters):
        cluster = np.arange(0, x.shape[0])[assigned_cluster == i]
        # the original read a truncated 'hrbin_' column via the deprecated
        # df.ix; the geoid column is what the dictionaries store
        geoids = [df.iloc[geoindx][geoidlabel] for geoindx in cluster]
        print(len(geoids), 'cluster #', i)
        # make a dictionary with cluster as the key, and geoids as the list
        cluster_dict[i] = geoids
        # second dictionary to quickly look up what cluster each geoid belongs to
        for geo in geoids:
            geoid_dict[geo] = i
    if plot_silouette == True:
        plot_cluster_silouette_values(x, assigned_cluster, numberclusters)
    # save the dictionaries as CSVs
    save_dictionary_as_csv(cluster_dict, 'data/intermediate_data/kmeans/kmeans_clusterdict.csv')
    save_dictionary_as_csv(geoid_dict, 'data/intermediate_data/kmeans/kmeans_geoiddict.csv')
    return kmeans, geoid_dict, cluster_dict
Example 15: cluster_documents
# Required import: from sklearn.cluster import KMeans [as alias]
# Or: from sklearn.cluster.KMeans import transform [as alias]
def cluster_documents(n_clusters, doc_term_matrix):
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans = kmeans.fit(doc_term_matrix)
    distances = kmeans.transform(doc_term_matrix)
    results = distances.argmin(axis=1)
    clusters = defaultdict(list)
    for document_index, cluster in enumerate(results):
        clusters[cluster].append((document_index, distances[document_index, cluster]))
    return clusters
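In the last two examples, distances.argmin(axis=1) assigns each row to its nearest center, which for a fitted KMeans is exactly what kmeans.predict would return; calling transform instead keeps the full distance matrix around, here used to record how far each document sits from its cluster center.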