本文整理汇总了Python中sklearn.cluster.KMeans.fit_predict方法的典型用法代码示例。如果您正苦于以下问题:Python KMeans.fit_predict方法的具体用法?Python KMeans.fit_predict怎么用?Python KMeans.fit_predict使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.cluster.KMeans
的用法示例。
在下文中一共展示了KMeans.fit_predict方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: kmeans
# 需要导入模块: from sklearn.cluster import KMeans [as 别名]
# 或者: from sklearn.cluster.KMeans import fit_predict [as 别名]
def kmeans(self, concepts, labels, m, method="kmeans"):
    """Cluster the vectors of *concepts* and return one cluster id per concept.

    :param concepts: iterable of concept keys, each looked up in the mapping *m*
    :param labels: gold labels; only used to set k = number of distinct labels
    :param m: mapping from concept -> feature vector
    :param method: one of 'kmeans', 'agg-ward', 'agg-complete', 'agg-average',
        'agg_ward', 'agg_complete', 'agg_average', 'test'
    :returns: list of assigned cluster ids, aligned with *concepts*
    :raises ValueError: for an unknown *method* (the original if/elif chain
        left ``km`` unbound and crashed with an UnboundLocalError instead)
    """
    k = len(set(labels))
    X = [m[concept] for concept in concepts]
    # Dispatch table replacing the original if/elif ladder; the hyphenated
    # variants use cosine affinity, the underscored ones the sklearn defaults.
    builders = {
        'kmeans': lambda: KMeans(n_clusters=k, random_state=0),
        'agg-ward': lambda: AgglomerativeClustering(n_clusters=k, affinity='cosine'),
        'agg-complete': lambda: AgglomerativeClustering(n_clusters=k, affinity='cosine', linkage='complete'),
        'agg-average': lambda: AgglomerativeClustering(n_clusters=k, affinity='cosine', linkage='average'),
        'agg_ward': lambda: AgglomerativeClustering(n_clusters=k),
        'agg_complete': lambda: AgglomerativeClustering(n_clusters=k, linkage='complete'),
        'agg_average': lambda: AgglomerativeClustering(n_clusters=k, linkage='average'),
        'test': lambda: AgglomerativeClustering(n_clusters=k, linkage='average', affinity='l2'),
    }
    try:
        km = builders[method]()
    except KeyError:
        raise ValueError("unknown clustering method: %r" % (method,))
    km.fit_predict(X)
    return km.labels_
示例2: clustering_by_kmeans
# 需要导入模块: from sklearn.cluster import KMeans [as 别名]
# 或者: from sklearn.cluster.KMeans import fit_predict [as 别名]
def clustering_by_kmeans(vectorizer, X, true_k):
    """Cluster the document matrix X into true_k groups with K-means and
    print several cluster-quality metrics.

    :param vectorizer: fitted text vectorizer (used only for feature names)
    :param X: document-term matrix
    :param true_k: number of clusters
    NOTE(review): Python 2 code (bare ``print`` statements below).
    """
    print "Clustering in " + str(true_k) + " groups by K-means..."
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=1)
    km.fit_predict(X)
    print "Measuring..."
    # NOTE(review): the module-level global `documents` is passed as the
    # ground-truth labelling here -- presumably it holds one category label
    # per row of X; verify against the caller.
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(documents, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(documents, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(documents, km.labels_))  # entropy-based measure combining homogeneity and completeness
    print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(documents, km.labels_))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000))
    # One cluster id per document: 0 if the doc is in cluster 0, 1 if in
    # cluster 1, and so on.
    clusters = km.labels_.tolist()
    #print "List of terms belonging to the clusters " + str(clusters)
    print "Total de " + str(len(km.labels_)) + " documents"
    # Example: to get all documents in cluster 0:
    #cluster_0 = np.where(clusters==0)  # don't forget import numpy as np
    #print cluster_0
    # cluster_0 would then contain the indices of the documents in that
    # cluster; the actual documents would be documents[cluster_0].
    terms = vectorizer.get_feature_names()  # term names; currently unused
    #print terms
    measuring_kmeans(true_k,clusters)
示例3: kmeans_predict_center_vectors
# 需要导入模块: from sklearn.cluster import KMeans [as 别名]
# 或者: from sklearn.cluster.KMeans import fit_predict [as 别名]
def kmeans_predict_center_vectors(self, contexts, k):
    """Cluster the context vectors into k groups and return the centers.

    :param contexts: iterable of context keys; keys missing from ``self.m``
        are silently skipped
    :param k: desired number of clusters
    :returns: ``[centers, counts]`` sorted by descending cluster size, where
        ``centers[i]`` is a centroid vector and ``counts[i]`` its member
        count; ``None`` if there is no usable input; ``False`` when fewer
        than k+1 vectors remain (original behaviour preserved)
    """
    #print(contexts)
    if not contexts:
        return None
    # Keep only the contexts that actually have a vector in the embedding
    # dict (some keys may be missing, so filter them out).
    X = []
    for context in contexts:
        if context in self.m:
            ## TODO: to see if we need to normalize it.
            #X.append(self.normalize_vector(self.m[context]))
            X.append(self.m[context])
    # BUG FIX: the original re-tested `contexts` here, so an input whose
    # every key was filtered out slipped through and crashed KMeans on an
    # empty X. Test the filtered list instead.
    if not X:
        return None
    #X = map(lambda x: self.m[x], contexts)
    if k >= len(X):
        #kmeans = KMeans(n_clusters = len(X), random_state = 1)
        return False
    kmeans = KMeans(n_clusters=k, random_state=1)
    kmeans.fit_predict(X)
    #print af
    cluster_centers = kmeans.cluster_centers_
    # Largest clusters first, courtesy of Counter.most_common().
    center_vectors = [[], []]
    for (indice, count) in Counter(kmeans.labels_).most_common():
        center_vectors[0].append(cluster_centers[indice])
        center_vectors[1].append(count)
    #print center_vectors
    return center_vectors
示例4: kmeans_logistic
# 需要导入模块: from sklearn.cluster import KMeans [as 别名]
# 或者: from sklearn.cluster.KMeans import fit_predict [as 别名]
def kmeans_logistic():
    """Unsupervised/semi-supervised pipeline: K-means-cluster article TF-IDF
    vectors, then train a logistic-regression classifier on the cluster
    labels and pickle the (classifier, tfidf) pair.

    Relies on module-level helpers: getScrapedContent, getHistoricalVolatility,
    combineHistVolColumn, generate_tfidf, displayScore, and on cPickle /
    train_test_split / silhouette_score / LogisticRegression being imported.
    NOTE(review): Python 2 code (``print`` statement, cPickle).
    """
    # Case 2: for unsupervised/semisupervised
    article_df2 = getScrapedContent(False)
    df2_content = article_df2[['content','date']]
    df2_date = article_df2['date']  # NOTE(review): unused below
    sp_df = getHistoricalVolatility()
    # y_vol is discarded -- the K-means cluster ids replace it as the target.
    X, y_vol = combineHistVolColumn(df2_content, sp_df)
    # generate vectorized clfv
    tfidf, clfv = generate_tfidf(X['content'])
    clf = KMeans(n_clusters=10, init='k-means++', max_iter=100)  #, n_init=1)
    clf.fit_predict(clfv)
    labels = clf.labels_
    # with open ('kmeans_km_model.pkl', 'wb') as fid:
    #     cPickle.dump(clf, fid)
    print '\nSilouette score :', str(silhouette_score(clfv, labels, metric='euclidean')) + '\n'
    # Train/test split on the *cluster* labels -- a pseudo-labelled task.
    X_train, X_test, y_train, y_test = train_test_split(clfv, labels, test_size=0.4, random_state=42)
    clf_lr = LogisticRegression()
    clf_lr.fit(X_train, y_train)
    y_pred = clf_lr.predict(X_test)
    with open ('km_lr_tfi_model.pkl', 'wb') as fid:
        cPickle.dump((clf_lr, tfidf), fid)
    displayScore(clf_lr, X_train, y_train, X_test, y_test, y_pred)
示例5: categorise_dataset
# 需要导入模块: from sklearn.cluster import KMeans [as 别名]
# 或者: from sklearn.cluster.KMeans import fit_predict [as 别名]
def categorise_dataset(contents):
    """Split the iris rows by species and run a 3-cluster K-means on each
    species subset.

    :param contents: iterable of 5-tuples (4 numeric features + species name)
    :returns: tuple of cluster-center arrays for
        (setosa, versicolor, virginica), in that order
    """
    # Bucket the feature vectors (first 4 columns) by species label; rows
    # with an unknown label are ignored, as in the original if/elif chain.
    by_species = {'Iris-setosa': [], 'Iris-versicolor': [], 'Iris-virginica': []}
    for each_tuple in contents:
        species = each_tuple[4]
        if species in by_species:
            by_species[species].append(each_tuple[:4])
    kwargs = {
        'n_init': 5,
        # depends on number of cores in your machine.
        'n_jobs': 3,
        'n_clusters': 3,
    }
    kmeans = KMeans()
    kmeans.set_params(**kwargs)

    def _centroids(rows):
        # fit_predict refits the shared estimator in place; the predicted
        # indices were unused in the original, only the centers are kept.
        kmeans.fit_predict(np.array(rows))
        return kmeans.cluster_centers_

    # apply kmeans to each species (replaces three copy-pasted fit blocks
    # whose *_indices results were never used).
    return (_centroids(by_species['Iris-setosa']),
            _centroids(by_species['Iris-versicolor']),
            _centroids(by_species['Iris-virginica']))
示例6: cluster
# 需要导入模块: from sklearn.cluster import KMeans [as 别名]
# 或者: from sklearn.cluster.KMeans import fit_predict [as 别名]
def cluster(D, k=3, verbose=False):
    """Cluster LDS's via Multi-Dimensional Scaling and KMeans.

    Strategy:
        1. Build NxN matrix of pairwise similarities
        2. Run MDS to embed data in R^2
        3. Run KMeans with k cluster centers
        4. Find samples closest to the k centers

    Parameters:
    ----------
    D: numpy.ndarray, shape = (N, N)
        Precomputed distance matrix.
    k: int (default: 3)
        Number of desired cluster centers.
    verbose: boolean
        Enable verbose output.

    Returns:
    --------
    eData: numpy.ndarray, shape (N, k)
        N d-dimensional samples embedded in R^d.
    ids: numpy.ndarray, shape = (k,)
        List of indices identifying the k representatives.
    """
    assert D.shape[0] == D.shape[1], "OOps (distance matrix not square)!"
    # build MDS for precomputed similarity matrix
    mds = MDS(metric=True, n_components=2, verbose=True,
              dissimilarity="precomputed")

    def __symmetrize(A):
        # Mirror A onto its transpose without double-counting the diagonal.
        return A + A.T - np.diag(A.diagonal())

    # run MDS on symmetrized similarity matrix
    eData = mds.fit(__symmetrize(D)).embedding_
    kmObj = KMeans(k)
    kmObj.fit_predict(eData)
    # BUG FIX: np.int was deprecated and removed in NumPy >= 1.24; the
    # builtin int is the supported dtype spelling.
    ids = np.zeros((k,), dtype=int)
    for i in range(k):
        # sanity check: every cluster must have at least one member
        cDat = eData[np.where(kmObj.labels_ == i)[0], :]
        assert len(cDat) > 0, "Oops, empty cluster ..."
        kCen = kmObj.cluster_centers_[i, :]
        # BUG FIX: modern scikit-learn rejects 1-D inputs to
        # euclidean_distances; pass the centroid as a (1, d) row.
        x = euclidean_distances(eData, kCen.reshape(1, -1))
        # representative = embedded sample nearest to this center
        ids[i] = int(np.argsort(x.ravel())[0])
    # return distance matrix and ID's of representative LDS's
    return (eData, ids)
示例7: showKMeans
# 需要导入模块: from sklearn.cluster import KMeans [as 别名]
# 或者: from sklearn.cluster.KMeans import fit_predict [as 别名]
def showKMeans(X, N):
    """Plot the K-means score of X for every cluster count in [N/6, N/2).

    :param X: feature matrix to cluster
    :param N: bound used to derive the range of candidate cluster counts
    Uses module-level constants MAX_ITER, N_INIT, N_JOBS, XLABEL, YLABEL
    and the module-level ``plt``.
    NOTE(review): Python 2 code -- ``xrange`` and integer division ``N / 6``.
    """
    scores = []
    for number in xrange(N / 6, N / 2):
        clustering = KMeans(n_clusters=number, max_iter=MAX_ITER, n_init=N_INIT, n_jobs=N_JOBS )
        clustering.fit_predict(X)
        # score() is sklearn's K-means objective (higher is better); one
        # point per candidate cluster count.
        scores.append(clustering.score(X))
    plt.plot(scores)
    plt.xlabel(XLABEL)
    plt.ylabel(YLABEL)
    plt.show()
示例8: Kmeans
# 需要导入模块: from sklearn.cluster import KMeans [as 别名]
# 或者: from sklearn.cluster.KMeans import fit_predict [as 别名]
def Kmeans(self, fase):
    """Group the phase rows of *fase* into 5 K-means clusters.

    :param fase: 2-D array of per-phase records (column legend below)
    :returns: numpy array zipping the five per-cluster row collections
    NOTE(review): Python 2 code -- ``np.array(zip(...))`` relies on zip
    returning a list.
    """
    # Column legend (from the original author):
    # lista_duracao, lista_nome_fase, lista_cpi_fase, lista_id_projeto_fase, lista_real_acum_fase,
    # lista_est_acum_fase, lista_real_acum_projeto, lista_est_acum_projeto, lista_perfil_equipe_fase,
    # lista_num_atividades, lista_cpi_projeto
    # NOTE(review): the last zip argument is fase[:] (each whole row), not a
    # single column like the others -- looks like a column index was dropped;
    # confirm the intended feature.
    cluster = np.array(zip(fase[:,4], fase[:,5], fase[:, 8], fase[:, 9], fase[:]))
    ap = KMeans(n_clusters= 5, init = 'k-means++')
    ap.fit_predict(cluster)
    clusters_centers_indices = ap.cluster_centers_
    labels = ap.labels_
    n_clusters_ = len(clusters_centers_indices)  # NOTE(review): unused below
    # print(clusters_centers_indices)
    # print(labels)
    # (debugging loop kept from the original author, for reference)
    # for j in range(0, n_clusters_):
    #     i = 0
    #     for label in labels:
    #         if(j == label):
    #             print str(cluster[i]) + " id " + str(fase[i][3]) + " label " + str(labels[i]) + " cpi " + str(fase[i][2])
    #         i += 1
    # Per-cluster accumulators: rows, project ids (column 3), CPIs (column 2).
    cluster_0, cluster_1, cluster_2, cluster_3, cluster_4 = [], [], [], [], []
    lista_1, lista_2, lista_3, lista_4, lista_5 = [], [], [], [], []
    cpi_1, cpi_2, cpi_3, cpi_4, cpi_5 = [], [], [], [], []
    i=0
    for i in range(len(cluster)):
        # NOTE(review): every branch appends the WHOLE `cluster` matrix, not
        # the single row cluster[i] -- almost certainly a bug, but "fixing"
        # it would change the returned shape; confirm with the author first.
        if (labels[i] == 0):
            cluster_0.append(cluster)
            lista_1.append(fase[i][3])
            cpi_1.append(fase[i][2])
        elif (labels[i]== 1):
            cluster_1.append(cluster)
            lista_2.append(fase[i][3])
            cpi_2.append(fase[i][2])
        elif (labels[i]== 2):
            cluster_2.append(cluster)
            lista_3.append(fase[i][3])
            cpi_3.append(fase[i][2])
        elif (labels[i]== 3):
            cluster_3.append(cluster)
            lista_4.append(fase[i][3])
            cpi_4.append(fase[i][2])
        elif (labels[i]== 4):
            cluster_4.append(cluster)
            lista_5.append(fase[i][3])
            cpi_5.append(fase[i][2])
    # print "cluster 1"+str(lista_1)+ "cluster 2" +str(lista_2)+ "cluster 3" +str(lista_3)+ "cluster 4" +str(lista_4)+ "cluster 5" +str(lista_5)
    # print "cluster 1" + str(cpi_1) + "cluster 2" + str(cpi_2) + "cluster 3" + str(cpi_3) + "cluster 4" + str(cpi_4) + "cluster 5" + str(cpi_5)
    cluster_fase = np.array(list(zip(cluster_0, cluster_1, cluster_2, cluster_3, cluster_4)))
    return cluster_fase
示例9: cluster
# 需要导入模块: from sklearn.cluster import KMeans [as 别名]
# 或者: from sklearn.cluster.KMeans import fit_predict [as 别名]
def cluster():
    """Assign a K-means cluster id to every row of the module-level `foods`
    frame and print the per-cluster means of the learning columns."""
    # Scale each learning column into [0, 1] by its own column maximum.
    features = foods[learningColumns]
    scaled = features / features.max()
    model = KMeans()
    model.fit_predict(scaled)
    #print(model.labels_)
    # Persist the assignment on the original frame, then summarize by group.
    foods["Cluster"] = model.labels_
    #print model.cluster_centers_
    grouped = foods.groupby("Cluster")
    print(grouped[learningColumns].mean())
示例10: set_obs
# 需要导入模块: from sklearn.cluster import KMeans [as 别名]
# 或者: from sklearn.cluster.KMeans import fit_predict [as 别名]
def set_obs(self, pts, draw=False, n_resets=20):
    """Register observed points *pts* to the simplicial complex by trying
    n_resets random centroid<->simplex-center correspondences and keeping
    the linear map with the lowest matching cost.

    :param pts: (N, d) array of observed points
    :param draw: if True, render debug figures during and after the search
    :param n_resets: number of random restarts of the correspondence search
    NOTE(review): Python 2 code -- bare ``print`` statement and
    np.random.shuffle applied to the list returned by range().
    """
    d = pts.shape[1]
    cmplx_pts = self.discretize_cmplx(pts_per_simplex=10)  # NOTE(review): unused below
    # Homogeneous coordinates: append a column of ones.
    pts_h = np.c_[pts, np.ones(pts.shape[0])]
    kmeans = KMeans(init="k-means++", n_clusters=self.N)
    kmeans.fit_predict(pts)
    centroids = np.c_[kmeans.cluster_centers_, np.ones(self.N)]
    # Midpoint of each simplex, also homogenized.
    s_centers = []
    for s in self.cmplx.simplices.values():
        s_centers.append(s.global_coords([0.5]))
    s_centers = np.c_[np.array(s_centers), np.ones(self.N)]
    best_cost = np.inf
    best_R = None
    import time
    # Temporarily flatten the observation noise while searching; restored
    # at the end.
    obs_sigma = self.obs_sigma
    self.obs_sigma = 1
    for n in range(n_resets):
        start = time.time()
        s_indices = range(self.N)
        k_indices = range(self.N)
        np.random.shuffle(s_indices)
        np.random.shuffle(k_indices)
        # Pick d+1 random centroid/simplex-center pairs and solve the
        # least-squares linear map X -> Y between them.
        X = centroids[k_indices[:d+1]]
        Y = s_centers[s_indices[:d+1]]
        R = np.linalg.lstsq(X, Y)[0]
        # Apply the candidate map and drop the homogeneous coordinate.
        pts_w = np.dot(pts_h, R)[:, :-1]
        self._set_obs(pts, pts_w)
        if draw:
            ## for debugging
            self.draw(block=True, show=False, outf='../figs/debug/registered_{}.png'.format(n))
        warped_cmplx = self.warp_cmplx()
        # Cost: total squared distance from each warped-complex point to its
        # nearest observation.
        distmat = ssd.cdist(warped_cmplx, pts, 'sqeuclidean')
        cost = np.sum(np.min(distmat, axis=1))
        # cost = self.gp_ll()
        if cost < best_cost:
            best_cost = cost
            best_R = R
        print n, best_cost, cost, time.time() - start, -self.gp_ll()
    # Re-apply the winning map with the original noise level restored.
    pts_w = np.dot(pts_h, best_R)[:, :-1]
    self.obs_sigma = obs_sigma
    self._set_obs(pts, pts_w)
    if draw:
        self.draw(block=True)
示例11: build_clusters
# 需要导入模块: from sklearn.cluster import KMeans [as 别名]
# 或者: from sklearn.cluster.KMeans import fit_predict [as 别名]
def build_clusters(df, n_clusters=4):
    """Cluster the rows of *df* on its normalized count/max columns.

    :param df: DataFrame with 'count_entries_norm' and 'max_norm' columns
    :param n_clusters: number of K-means clusters
    :returns: dict with keys 'labels', 'cluster_centers' and 'clusters'
        (the latter maps cluster id -> list of df index values)
    """
    model = KMeans(n_clusters=n_clusters)
    model.fit_predict(df[['count_entries_norm', 'max_norm']])
    # Group the frame's index values by assigned cluster id.
    members = defaultdict(list)
    for position, cluster_id in enumerate(model.labels_):
        members[cluster_id].append(df.index[position])
    return {
        'labels': model.labels_,
        'cluster_centers': model.cluster_centers_,
        'clusters': members,
    }
示例12: cluster_kmeans_signal
# 需要导入模块: from sklearn.cluster import KMeans [as 别名]
# 或者: from sklearn.cluster.KMeans import fit_predict [as 别名]
def cluster_kmeans_signal(self, signal, nClusters, featureName):
    """
    Not used
    :param signal: matrix representing a signal, needs to be applied reshape(-1, 1)
    :param nClusters: number of clusters
    :return: predicted labels
    """
    #(TODO): save model for clustering for each feature
    estimator = KMeans(n_clusters=nClusters)
    estimator.fit_predict(signal)
    # Cache the fitted model under this feature's name for later reuse.
    self.models[featureName] = estimator
    return estimator.labels_
示例13: __init__
# 需要导入模块: from sklearn.cluster import KMeans [as 别名]
# 或者: from sklearn.cluster.KMeans import fit_predict [as 别名]
def __init__(self, data, **kwargs):
    """
    Wrapper for kmeans clustering.

    :param data: DataFrame of observations to cluster
    :param kwargs: forwarded verbatim to sklearn.cluster.KMeans
    Sets ``self.centroid`` (cluster centers), ``self.labels`` (per-row
    cluster ids) and ``self.output`` (per-cluster column medians plus a
    'cluster_size' column).
    """
    clusterer = KMeans(**kwargs)
    clusterer.fit_predict(data)
    self.centroid = clusterer.cluster_centers_
    self.labels = clusterer.labels_
    self.output = pd.DataFrame(columns=data.columns.tolist() + ['cluster_size'])
    for k in np.unique(self.labels):
        members = self.labels == k
        self.output.loc[k] = data[members].median(axis=0)
        # BUG FIX: the original used chained indexing
        # (self.output.loc[k]['cluster_size'] = ...), which assigns into a
        # temporary row copy and can silently leave cluster_size unset;
        # a single .loc[row, col] assignment writes through.
        self.output.loc[k, 'cluster_size'] = np.sum(members)
示例14: summarize
# 需要导入模块: from sklearn.cluster import KMeans [as 别名]
# 或者: from sklearn.cluster.KMeans import fit_predict [as 别名]
def summarize(self, document_path):
    """Extractive summary: cluster sentence TF-IDF vectors with K-means and
    pick, from each cluster, the sentence nearest its centroid, stopping
    once roughly 100 words have been collected.

    :param document_path: path of the plain-text document to summarize
    :returns: the selected Sentence objects in document order
    Relies on class-level helpers/constants: Util, Sentence, Summarizer.*,
    Cluster.NUM_CLUSTERS, and on self.tf / self.df term statistics.
    """
    allwords = {}   # stem -> column index in the sentence/term matrix
    sentences = []
    with open(document_path, 'r') as f:
        index = 0
        for line in f:
            s = Util.tokenize(line, Summarizer.non_space)
            sentence = []
            for w in s:
                sentence.append(w)
                # A sentence ends at the first terminator-bearing token.
                if Summarizer.sentence_terminator.search(w):
                    sent = Sentence(sentence, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                    sentences.append(sent)
                    for t in sent.stemmed:
                        if t not in allwords:
                            allwords[t] = index
                            index += 1
                    sentence = []
    matrix = np.zeros((len(sentences), len(allwords)))
    for i, sent in enumerate(sentences):
        for t in sent.stemmed:
            matrix[i, allwords[t]] = Util.tfidf(t, self.tf, self.df, Summarizer.NUM_DOCS)
    # Normalize each row to unit L2 norm.
    normalizer = np.reshape(np.sum(matrix**2, axis=1)**0.5, (len(matrix), 1))
    matrix /= normalizer
    model = KMeans(n_clusters=Cluster.NUM_CLUSTERS, tol=1e-9)
    model.fit_predict(np.nan_to_num(matrix))
    labels = model.labels_
    totalWords = 0
    selected = []
    # From each cluster, pick the sentence that is nearest to the cluster
    # centroid.
    for i in range(Cluster.NUM_CLUSTERS):
        member_indices = np.where(labels == i)
        # Rows are unit-normalized, so the dot product with the centroid is
        # a similarity score: LARGER means closer to the centroid.
        similarities = np.dot(matrix[member_indices], model.cluster_centers_[i])
        # BUG FIX: the original took np.argmin here, which selects the
        # sentence FARTHEST from the centroid; the nearest one is argmax.
        closest_index = np.argmax(similarities, 0)
        # 'closest_index' is the index into the member_indices array
        member_index = member_indices[0][closest_index]
        selected.append((member_index, sentences[member_index]))  # stash the index of the sentence as well
        totalWords += sentences[member_index].getLength()
        if totalWords > 100:
            break
    # return the selected sentences in their order of appearance in the document
    return [s[1] for s in sorted(selected, key=lambda x: x[0])]
示例15: segment_labeling
# 需要导入模块: from sklearn.cluster import KMeans [as 别名]
# 或者: from sklearn.cluster.KMeans import fit_predict [as 别名]
def segment_labeling(x, boundaries, c_method='kmeans', k=5):
    """Label the segments of the feature sequence *x* given its boundaries.

    :param x: feature matrix (frames x features)
    :param boundaries: frame indices delimiting the segments
    :param c_method: 'agglomerative' for hierarchical clustering; any other
        value (including the default 'kmeans') runs K-means
    :param k: number of clusters / scale factor for the fcluster threshold
    :returns: one cluster label per segment
    """
    # Aggregate the frame-level features within each segment.
    segment_feats = librosa.feature.sync(x.T, boundaries).T
    if c_method == 'agglomerative':
        linkage_matrix = hierarchy.linkage(segment_feats, method='ward')
        threshold = k * np.max(linkage_matrix[:, 2])
        return hierarchy.fcluster(linkage_matrix, t=threshold, criterion='distance')
    # The original 'kmeans' branch and its else-fallback ran identical code,
    # so both collapse into this single default path.
    return KMeans(n_clusters=k, n_init=100).fit_predict(segment_feats)