本文整理汇总了Python中sklearn.cluster.AffinityPropagation.fit_predict方法的典型用法代码示例。如果您正苦于以下问题：Python AffinityPropagation.fit_predict方法的具体用法？Python AffinityPropagation.fit_predict怎么用？Python AffinityPropagation.fit_predict使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.cluster.AffinityPropagation的用法示例。
在下文中一共展示了AffinityPropagation.fit_predict方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: execute
# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def execute(args):
    """Cluster a parsed data set with affinity propagation and print the counts.

    Args:
        args: Argument list. ``args[0]`` is handed to ``parse``; every
            remaining entry is converted to an integer column index that
            restricts which features are clustered.

    Returns:
        ``0`` when ``fit_predict`` produced no usable labels, otherwise
        ``None`` after the cluster counts have been printed.
    """
    if len(args) < 1:
        usage()
        sys.exit()

    names, labels_true, X = parse(args[0])
    indices = [int(i) for i in args[1:]]
    # The first name is skipped -- presumably it is the label column;
    # TODO confirm against parse().
    relevant_names = names[1:]
    if indices:
        X = np.asarray([[sample[i] for i in indices] for sample in X])
        relevant_names = [relevant_names[i] for i in indices]
    # A single-argument print() emits the same text on Python 2 and 3.
    print("Clustering on " + str(relevant_names) + "...")

    # Compute Affinity Propagation
    af = AffinityPropagation(preference=-50)
    y_pred = af.fit_predict(X)

    # Compare the length by value: ``len(...) is 0`` tested object identity,
    # which only worked through CPython's small-integer cache.
    if y_pred is None or len(y_pred) == 0 or isinstance(y_pred[0], np.ndarray):
        return 0

    counts = get_cluster_counts(labels_true, y_pred)
    print(counts)
示例2: cluster_trajectories
# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def cluster_trajectories( curves ):
    """Cluster curves by their start and end points with affinity propagation.

    Every curve is summarised by the four values ``curve[0, 0]``,
    ``curve[1, 0]``, ``curve[0, -1]`` and ``curve[1, -1]``. Each of the four
    columns is scaled by its standard deviation, and two curves are compared
    with a distance that ignores which end counts as the start.

    Args:
        curves: Sequence of 2-D arrays indexable as ``curve[row, column]``.

    Returns:
        list: ``align_cluster`` applied to every cluster, where a cluster is
        the list of curves sharing one label. Empty when ``curves`` is empty.
    """
    from sklearn.cluster import AffinityPropagation

    n_curves = len(curves)
    if n_curves == 0:
        # Nothing to cluster; fit_predict would fail on a 0x0 matrix.
        return []

    endpoints = np.array(
        [[c[0, 0], c[1, 0], c[0, -1], c[1, -1]] for c in curves],
        dtype=float,
    )
    spread = endpoints.std(axis=0)
    # A constant column has zero spread; dividing by it would turn the whole
    # column into NaN, so such columns are left unscaled.
    spread[spread == 0] = 1.0
    endpoints /= spread

    def distance_metric(a, b):
        # A distance metric on R^4 modulo the involution
        # (x1,x2,x3,x4) -> (x3,x4,x1,x2), i.e. swapping start and end point.
        swapped = a[[2, 3, 0, 1]]
        return min(np.linalg.norm(a - b), np.linalg.norm(swapped - b))

    clusterer = AffinityPropagation(affinity='precomputed', convergence_iter=100)
    aff = np.zeros((n_curves, n_curves))
    for i in range(n_curves):
        for j in range(i + 1, n_curves):
            similarity = np.exp(-distance_metric(endpoints[i], endpoints[j]) ** 2)
            aff[i, j] = aff[j, i] = similarity
    cluster_labels = clusterer.fit_predict(aff)

    # Lists instead of map()/filter(): those are lazy iterators on Python 3.
    # np.unique yields the labels in a deterministic (sorted) order.
    out = [
        [curves[k] for k in range(n_curves) if cluster_labels[k] == label]
        for label in np.unique(cluster_labels)
    ]
    return [align_cluster(cluster) for cluster in out]
示例3: cluster_articles
# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def cluster_articles():
    """Cluster pending articles by TF-IDF text similarity and store the result.

    Pending articles are read from ``MongoStore``. Articles with non-blank
    ``text_content`` are vectorised and clustered with affinity propagation.
    A cluster is kept only when at least three of its members have a cosine
    similarity above 0.55 to the cluster's exemplar. Kept clusters are
    inserted into the store, and finally every pending article (including
    those with blank text) is flagged as clustered.

    NOTE(review): the source had lost its indentation; the nesting of the
    final ``set_clustered_flag`` call inside the "there are pending articles"
    branch is reconstructed -- confirm against the original file.
    """
    ms = MongoStore()
    articles = list(ms.get_pending_articles())
    if not articles:
        return

    good_articles = [article for article in articles
                     if article["text_content"].strip() != ""]
    clusters = []
    # Without this guard the vectoriser is fitted on an empty corpus and
    # raises, which left blank-text articles pending forever.
    if good_articles:
        texts = [article["text_content"] for article in good_articles]
        tfidf = TfidfVectorizer(tokenizer=preprocess)
        X_tfidf = tfidf.fit_transform(texts)
        # Single-argument print() behaves the same on Python 2 and 3.
        print(X_tfidf)
        ap = AffinityPropagation(damping=0.95, max_iter=4000,
                                 convergence_iter=400, copy=True, preference=-4,
                                 affinity='euclidean', verbose=True)
        C = ap.fit_predict(X_tfidf)
        print("{} {}".format(X_tfidf.shape, C.shape))
        print(C)

        for c, center in enumerate(ap.cluster_centers_indices_):
            members = np.where(C == c)[0]
            # cosine_similarity returns one column; flatten it so each entry
            # is a scalar rather than a 1-element array.
            K = cosine_similarity(X_tfidf[members], X_tfidf[center]).ravel()
            member_sims = [(m, float(k)) for m, k in zip(members, K)]
            member_sims.sort(key=lambda x: x[1], reverse=True)

            n_close = sum(1 for _, sim in member_sims if sim > .55)
            if n_close < 3:
                continue

            cluster = {"articles": [], "date": datetime.now(), "summarized": False}
            print(texts[center][:75].replace("\n", " "))
            for member, sim in member_sims:
                title = good_articles[member]["title"][:60].replace("\n", " ")
                # Concatenation (not str.format) keeps non-ASCII titles
                # printable on Python 2.
                print("\t{:3.3f}  ".format(sim) + title)
                cluster["articles"].append((good_articles[member]["_id"], sim))
            clusters.append(cluster)

    if clusters:
        ms.insert_clusters(clusters)
    ms.set_clustered_flag(articles)
示例4: affinity_propagation
# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def affinity_propagation():
    """
    Demonstrate affinity propagation on three synthetic Gaussian blobs.

    AffinityPropagation creates clusters by sending messages between pairs of
    samples until convergence. The messages sent between pairs represent the
    suitability of one sample to be the exemplar of the other, which is
    updated in response to the values from other pairs. This update occurs
    iteratively until convergence, at which point the final exemplars are
    chosen and hence the final clustering is given.

    Algorithm:
    The messages sent between pairs belong to one of two categories. The
    first is the responsibility, r(i,k), which is the accumulated evidence
    that sample k should be the exemplar for sample i. The second is the
    availability, a(i,k), which is the accumulated evidence that sample i
    should choose sample k to be its exemplar, and considers the values for
    all other samples that k should be an exemplar. In this case exemplars
    are chosen by samples if they are:
    - similar enough to many samples, and
    - chosen by many samples to be representative of themselves.

    The function prints the number of clusters, an accuracy figure, the
    homogeneity and completeness scores, and shows a scatter plot.
    """
    # Generate a generic data sample.
    n_samples = 300
    std = 0.3
    seed = 0
    centers = [[-1., 0.], [0., 1.5], [1., 0.]]
    data, target = make_blobs(n_samples=n_samples, centers=centers,
                              cluster_std=std, random_state=seed)

    # Set the preference for each point: samples with large preference values
    # are more likely to be chosen as exemplars. The number of exemplars, i.e.
    # clusters, is influenced by the input preference values. If preferences
    # are not passed as arguments, they will be set to the median of the input
    # similarities.
    pref = -50

    # Compute affinity propagation.
    clf = AffinityPropagation(preference=pref)
    aff_y = clf.fit_predict(data)

    # Cluster ids are arbitrary, so comparing them directly with the true
    # labels is meaningless. Map every cluster to its most frequent true
    # label first and count the samples that agree with that label.
    votes = {}
    for true_label, cluster_label in zip(target, aff_y):
        tally = votes.setdefault(cluster_label, {})
        tally[true_label] = tally.get(true_label, 0) + 1
    matched = sum(max(tally.values()) for tally in votes.values())

    # Print results.
    print('Approximated number of clusters ', len(clf.cluster_centers_indices_))
    print('Accuracy ', float(matched) / float(n_samples))
    print('Homogeneity ', metrics.homogeneity_score(target, clf.labels_))
    print('Completeness ', metrics.completeness_score(target, clf.labels_))

    # Plot resulting clusters.
    plt.figure(figsize=(8, 8))
    plt.scatter(data[:, 0], data[:, 1], c=aff_y, s=50)
    plt.title('Affinity clustering')
    plt.show()
示例5: evaluate_clustering
# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def evaluate_clustering():
    """Compare affinity propagation and DBSCAN on a sense-similarity matrix.

    The similarity matrix for the first 10000 senses is fetched, turned into
    a distance matrix as ``1 - similarity``, clustered with both algorithms,
    and a set of summary statistics (cluster sizes, silhouette score and the
    project's ``fpena_evaluate`` score) is printed.

    The matrices are passed to scikit-learn as *precomputed* affinities and
    distances. Previously they were handed over with the default euclidean
    settings, which made every row be treated as a feature vector.

    NOTE(review): ``metric='precomputed'`` requires a zero diagonal in the
    distance matrix, i.e. a self-similarity of exactly 1 -- confirm that
    ``get_sense_similarity_submatrix`` guarantees this.
    """
    similarity_matrix = get_sense_similarity_submatrix(range(10000))
    matrix_size = len(similarity_matrix)
    print('got matrix')

    # Vectorised replacement for the element-wise double loop; the slice
    # keeps the same leading square block the loop used to visit.
    similarities = np.asarray(similarity_matrix, dtype=float)
    similarities = similarities[:matrix_size, :matrix_size]
    distance_matrix = 1.0 - similarities
    print(distance_matrix[1, 2])
    print(distance_matrix[1, 1])
    print('created distance matrix')

    # Affinity propagation consumes similarities (larger = more alike).
    affinity_propagation = AffinityPropagation(affinity='precomputed')
    labels1 = affinity_propagation.fit_predict(similarities)
    print('affinity propagation')

    # DBSCAN consumes distances (smaller = more alike).
    dbscan = DBSCAN(min_samples=1, metric='precomputed')
    labels2 = dbscan.fit_predict(distance_matrix)
    print('print dbscan')

    cluster_map1 = cluster_evaluation.fpena_get_clusters(labels1)
    cluster_map2 = cluster_evaluation.fpena_get_clusters(labels2)
    print(cluster_map1)
    print(cluster_map2)

    sc1 = sklearn.metrics.silhouette_score(distance_matrix, labels1, metric='precomputed')
    sc2 = sklearn.metrics.silhouette_score(distance_matrix, labels2, metric='precomputed')
    sc5 = cluster_evaluation.fpena_evaluate(cluster_map1, distance_matrix)
    sc6 = cluster_evaluation.fpena_evaluate(cluster_map2, distance_matrix)

    num_elements1 = [len(values) for values in cluster_map1.values()]
    num_elements2 = [len(values) for values in cluster_map2.values()]
    print(num_elements1)
    print(num_elements2)

    # Cluster counts are integers, so they are printed with %d.
    print('Number of clusters Affinity Propagation: %d' % len(cluster_map1))
    print('Number of clusters DBSCAN: %d' % len(cluster_map2))
    print('Average elements per cluster Affinity Propagation: %f' % np.mean(num_elements1))
    print('Average elements per cluster DBSCAN: %f' % np.mean(num_elements2))
    print('Standard deviation per cluster Affinity Propagation: %f' % np.std(num_elements1))
    print('Standard deviation per cluster DBSCAN: %f' % np.std(num_elements2))
    print('Silouhette score Affinity Propagation (distance matrix): %f' % sc1)
    print('Silouhette score DBSCAN (distance matrix): %f' % sc2)
    print('Dunn index Affinity Propagation (distance matrix): %f' % sc5)
    print('Dunn index DBSCAN (distance matrix): %f' % sc6)
示例6: geo_worker_
# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def geo_worker_(job_queue, result_queue, **kwargs):
    """Drain ``job_queue``, writing one gzipped TSV of cluster centres per job.

    Each job is a ``(string_tsv_path, geo_tsv_path)`` pair. The ``locations``
    column of the input TSV is split on commas, every piece is looked up via
    ``GeoQuery.lookup_location``, and the coordinates found are clustered with
    affinity propagation on negated pairwise distances. The coordinates of
    the exemplars are written to ``geo_tsv_path`` as ``lat``/``lng`` columns,
    and ``None`` is put on ``result_queue`` once the job is done.

    Args:
        job_queue: Queue of ``(string_tsv_path, geo_tsv_path)`` tuples.
        result_queue: Queue that receives ``None`` after every finished job.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        ``True`` once the queue has been found empty.
    """
    # SIGINT is ignored in this function's process.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    geocache = get_resource_manager(u"GeoCacheResource")
    geoquery = GeoQuery(geocache.get_tsv_path())

    while not job_queue.empty():
        try:
            string_tsv_path, geo_tsv_path = job_queue.get(block=False)

            with gzip.open(string_tsv_path, u"r") as source:
                string_df = pd.io.parsers.read_csv(source, sep="\t", quoting=3, header=0)

            # Float entries are skipped -- presumably NaN placeholders for
            # missing values; verify against the input files.
            loc_strings = [
                entry for entry in string_df[u"locations"].tolist()
                if not isinstance(entry, float)
            ]
            lookups = (
                geoquery.lookup_location(place)
                for entry in loc_strings
                for place in entry.split(",")
            )
            coords = [found for found in lookups if found is not None]

            centers = set()
            if len(coords) > 0:
                coords = np.array(coords)
                # Negated distances serve as the precomputed affinity.
                D = -geoquery.compute_distances(coords[:, None], coords)
                ap = AffinityPropagation(affinity=u"precomputed")
                ap.fit_predict(D)
                exemplars = ap.cluster_centers_indices_
                if exemplars is not None:
                    centers.update(
                        (coords[k][0], coords[k][1]) for k in exemplars
                    )

            records = [{u"lat": lat, u"lng": lng} for lat, lng in centers]
            centers_df = pd.DataFrame(records, columns=[u"lat", u"lng"])

            with gzip.open(geo_tsv_path, u"w") as sink:
                centers_df.to_csv(sink, sep="\t", index=False, index_label=False, na_rep="nan")

            result_queue.put(None)
        except Queue.Empty:
            # Another worker took the last job between empty() and get().
            pass

    return True
示例7: mhd_cluster_trajectories
# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def mhd_cluster_trajectories( curves ):
    """Return clusters of curves based upon the modified Hausdorff distance.

    Args:
        curves: Sequence of arrays; each one is transposed before being
            passed to ``modified_Hausdorff_distance``.

    Returns:
        list: ``align_cluster`` applied to every cluster, where a cluster is
        the list of curves sharing one label. Empty when ``curves`` is empty.
    """
    from sklearn.cluster import AffinityPropagation
    # Imported once here instead of on every inner-loop iteration.
    from modified_Hausdorff_distance import modified_Hausdorff_distance as mhd

    n_curves = len(curves)
    if n_curves == 0:
        # Nothing to cluster; fit_predict would fail on a 0x0 matrix.
        return []

    clusterer = AffinityPropagation(affinity='precomputed', convergence_iter=100)
    aff = np.zeros((n_curves, n_curves))
    for i in range(n_curves):
        for j in range(i + 1, n_curves):
            # A precomputed affinity must be a similarity (larger = closer),
            # so the distance is negated. Passing the raw distance made the
            # most distant curves look the most alike.
            similarity = -mhd(curves[i].transpose(), curves[j].transpose())
            aff[i, j] = aff[j, i] = similarity
    cluster_labels = clusterer.fit_predict(aff)

    # Lists instead of map()/filter(): those are lazy iterators on Python 3.
    out = [
        [curves[k] for k in range(n_curves) if cluster_labels[k] == label]
        for label in np.unique(cluster_labels)
    ]
    return [align_cluster(cluster) for cluster in out]
示例8: plot_similarity_clusters
# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def plot_similarity_clusters(desc1, desc2, files, plot = None):
    """
    Find similar sounds using Affinity Propagation clusters.

    The two descriptors are stacked into a two-column matrix, projected with
    a whitened two-component PCA and clustered.

    :param desc1: first descriptor values
    :param desc2: second descriptor values
    :param files: one entry per sound; printed next to its cluster label
        when ``plot`` is truthy
    :param plot: when truthy, print the label table and show a scatter plot
        of the clusters
    :returns:
      - euclidean_labels: labels of clusters
    """
    if plot:
        print((Fore.MAGENTA + "Clustering"))

    # NOTE(review): with both with_mean and with_std disabled this call does
    # not rescale anything -- confirm whether real scaling was intended.
    min_max = preprocessing.scale(np.vstack((desc1,desc2)).T, with_mean=False, with_std=False)
    pca = PCA(n_components=2, whiten=True)
    y = pca.fit(min_max).transform(min_max)
    euclidean = AffinityPropagation(convergence_iter=1800, affinity='euclidean')
    euclidean_labels = euclidean.fit_predict(y)

    if plot:
        time.sleep(5)
        print((Fore.WHITE + "Cada número representa el grupo al que pertence el sonido como ejemplar de otro/s. El grupo '0' esta coloreado en azul, el grupo '1' esta coloreado en rojo, el grupo '2' esta coloreado en amarillo. Observa el ploteo para ver qué sonidos son ejemplares de otros"))
        print(np.vstack((euclidean_labels,files)).T)
        time.sleep(6)
        # Draw every cluster, not just labels 0..3. The first four colours
        # match the previous hard-coded ones; further labels cycle through
        # the rest of the palette.
        palette = ('b', 'r', 'y', 'g', 'c', 'm', 'k')
        for label in np.unique(euclidean_labels):
            mask = euclidean_labels == label
            plt.scatter(y[mask, 0], y[mask, 1], c=palette[label % len(palette)])
        plt.show()

    return euclidean_labels
示例9: cluster
# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def cluster(self, normalize=False):
    """
    Cluster the nodes based on the PMI similarity measure. The clustering algorithm used is affinity propagation,
    which automatically chooses the number of clusters.

    The similarity matrix is always a fresh array, so ``self.pmi`` is left untouched.

    :param normalize: If true, then normalize the similarity measure (i.e., the PMI) to be between -1 and 1.
    :return: The cluster labels.
    """
    if normalize:
        # use normalized PMI for similarity metric
        similarity = self.pmi / -np.log(self.joint_probs)
        diagonal_value = 1.0
    else:
        # Work on a copy: writing the diagonal into self.pmi itself corrupted
        # the stored PMI and inflated the diagonal on every further call.
        similarity = self.pmi.copy()
        diagonal_value = 1.1 * similarity.max()
    similarity[np.diag_indices_from(similarity)] = diagonal_value

    clustering = AffinityPropagation(affinity='precomputed', verbose=self.verbose,
                                     preference=similarity.min())
    clusters = clustering.fit_predict(similarity)
    if self.verbose:
        # Single-argument print() emits the same text on Python 2 and 3.
        print('Found {} clusters.'.format(len(np.unique(clusters))))
    return clusters
示例10: TSNE
# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE
# Load the expression table, drop its first column and transpose it before
# embedding. NOTE(review): presumably the first column holds row identifiers
# and the transpose makes each original column one sample -- confirm against
# the CSV layout.
dataset = pd.read_csv('~/data/gene_expr_170104.csv')
data = np.array(dataset)[:, 1:].astype(float).T

# Embed with t-SNE and cluster the embedding.
Y = TSNE().fit_transform(data)
clus = AffinityPropagation()
lab = clus.fit_predict(Y)

# Map each label to a colour once and reuse it for both plots. max(..., 1)
# avoids dividing by zero when only one cluster is found (largest label 0).
colors = plt.cm.Spectral(lab.astype(float) / max(lab.max(), 1))

x, y = Y.T
plt.scatter(x, y, alpha=0.9, c=colors, edgecolors='none')
# for i, j, t in zip(x, y, range(x.shape[0])):
#     plt.text(i, j, t, color = 'purple')
plt.show()

# Same colouring on the SOS embedding of the same data.
x, y = SOS(iterations=10, alpha=1, beta=0, delta=0, theta=3.5).fit_transform(data).T
plt.scatter(x, y, alpha=0.4, c=colors, edgecolors='none')
# for i, j, t in zip(x, y, range(x.shape[0])):
示例11: cluster_affinity_propagation
# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def cluster_affinity_propagation(similarity_matrix, desired_keys=None):
    """Cluster the items of a similarity matrix with affinity propagation.

    Args:
        similarity_matrix: Similarity structure accepted by
            ``similarity_matrix_to_numpy``.
        desired_keys: Forwarded unchanged to ``similarity_matrix_to_numpy``.

    Returns:
        The cluster labels returned by ``AffinityPropagation.fit_predict``.
    """
    numpy_matrix = similarity_matrix_to_numpy(similarity_matrix, desired_keys)
    # The input already holds pairwise similarities, so it is passed as a
    # precomputed affinity. With the default 'euclidean' affinity each row
    # would be treated as a feature vector instead.
    # NOTE(review): assumes similarity_matrix_to_numpy returns a square
    # matrix -- confirm.
    clusterer = AffinityPropagation(affinity='precomputed')
    return clusterer.fit_predict(numpy_matrix)
示例12: vectorLinspace
# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
# cluster3 = vectorLinspace([4,1],[7,9], num=50)
# cluster3 = cluster1 + np.random.normal(5,.1,cluster3.shape)
# cluster4 = vectorLinspace([-1,4],[-4,2], num=50)
# cluster4 = cluster1 + np.random.normal(-5,.1,cluster4.shape)
from sklearn.cluster import AffinityPropagation
import matplotlib.pyplot as plt

# Only cluster1 is used for now; the other generated clusters could be
# appended to it along axis 0.
X = cluster1
print(X)
print(pearsonr(X[:,0],X[:,1]),spearmanr(X[:,0],X[:,1]))

# Pairwise cosine distances between all points of X.
dists = np.zeros((len(X),len(X)))
for i1,x1 in enumerate(X):
    print(i1,"/",len(X))
    for i2,x2 in enumerate(X):
        dists[i1,i2] = cosine(x1,x2)
print(dists)

# A precomputed affinity must be a similarity (larger = more alike). The
# matrix holds distances, so it is converted with 1 - distance, which is the
# cosine similarity if `cosine` is a cosine distance.
# NOTE(review): confirm that `cosine` is scipy.spatial.distance.cosine.
ap = AffinityPropagation(affinity="precomputed")
y_pred = ap.fit_predict(1.0 - dists)
print(len(set(y_pred)))

plt.scatter(X[:,0],X[:,1])
plt.scatter(cluster2[:,0],cluster2[:,1])
plt.show()
示例13: build_class_labels
# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
# Build the class-by-class similarity matrix, cluster it and write the
# exemplars and the per-class assignments to text files.
build_class_labels()
num_classes = len(urls)
sim_matrix = np.zeros((num_classes, num_classes))
record_in_matrix(sim_matrix)
sim_matrix = np.sqrt(sim_matrix)
np.savetxt("sim_mat.txt", sim_matrix)

clst = AffinityPropagation(affinity='precomputed')
#clst = SpectralClustering(n_clusters=7,affinity='precomputed')
classes = clst.fit_predict(sim_matrix)

# One "<url> <cluster id>" line per exemplar. The loop variable no longer
# reuses the name `clst`, so the estimator stays available afterwards.
with open("ap/centers.txt", "w") as f:
    for cluster_id, indx in enumerate(clst.cluster_centers_indices_):
        f.write(all_urls[indx] + " " + str(cluster_id) + "\n")

# One "<url> <cluster id>" line per class. The newline was missing, which
# ran all records together on a single line.
# NOTE(review): the matrix is sized from `urls` but the names are read from
# `all_urls` -- confirm both are index-aligned.
with open("ap/clusters.txt", "w") as f:
    for idx, cls in enumerate(classes):
        f.write(all_urls[idx] + " " + str(cls) + "\n")
示例14: range
# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
for j in range(size_berlin):
if i != j:
matrix_berlin[i][j] = (list_of_berlin_person[i].distance_of_two_persons(list_of_berlin_person[j]))
# Fill the off-diagonal entries of matrix_newcomer with the value returned by
# distance_of_two_persons for every ordered pair of newcomers; the diagonal
# is left at whatever value the matrix was created with.
for i in range(size_newcomers):
    for j in range(size_newcomers):
        if i != j:
            matrix_newcomer[i][j] = (list_of_newcomer_person[i].distance_of_two_persons(list_of_newcomer_person[j]))
print(matrix_berlin)
print(matrix_newcomer)
print('_____________________________________')
# NOTE(review): `clusterer` is created outside this excerpt. The results of
# both calls below are discarded, and fit_predict fits again -- presumably
# exploratory code; confirm whether both calls are needed.
clusterer.fit(matrix_newcomer, y=None)
print('_____________________________________')
clusterer.fit_predict(matrix_newcomer, y=None)
print('_____________________________________')
#
# Fit a separate AffinityPropagation with default settings on the same
# matrix and report its labels and number of exemplars.
# NOTE(review): with default settings the rows are treated as feature
# vectors, not as precomputed similarities -- confirm this is intended for a
# matrix of pairwise distances.
af = AffinityPropagation().fit(matrix_newcomer)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
print(labels)
n_clusters_ = len(cluster_centers_indices)
print('Estimated number of clusters: %d' % n_clusters_)
# The metrics below are disabled; most need ground-truth labels
# (labels_true), which this excerpt does not define.
# print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
# print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
# print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
# print("Adjusted Rand Index: %0.3f"% metrics.adjusted_rand_score(labels_true, labels))
# print("Adjusted Mutual Information: %0.3f"% metrics.adjusted_mutual_info_score(labels_true, labels))
# print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
示例15: create_tag_categories
# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def create_tag_categories():
    """Cluster MSE tags into categories using sklearn AffinityPropagation.

    Tags (minus a fixed list of excluded meta tags) are read from the
    database, a tag-by-tag co-occurrence count matrix is built from the
    questions they share, and that matrix is clustered as a precomputed
    affinity. Each category is named after the tag that is its exemplar.

    Any existing category system in the database will be overwritten. The
    old rows are only deleted after clustering has succeeded.

    Returns:
        None. Returns early, without touching the tables, when the tag query
        yields no rows.

    Raises:
        RuntimeError: when affinity propagation yields no exemplars; the
            existing categories are left untouched in that case.
    """
    con = connect_db()
    cur = con.cursor()
    query = """
        SELECT T.id, T.name, COUNT(Q.question_id) AS count FROM
        (
            SELECT tags.id, tags.name, COUNT(qt.question_id) AS count FROM tags
            JOIN question_tags AS qt ON qt.tag_id=tags.id
            WHERE tags.name NOT IN ('advice', 'applications', 'big-list',
            'education', 'intuition', 'learning', 'math-history', 'math-software',
            'reference-request', 'self-learning', 'soft-question', 'teaching',
            'alternative-proof-strategy', 'proof-writing', 'visualization',
            'alternative-proof', 'proof-strategy', 'proof-verification',
            'solution-verification', 'definition', 'examples-counterexamples',
            'mathematica', 'wolfram-alpha', 'maple', 'matlab', 'sage', 'octave',
            'floor-function', 'ceiling-function', 'article-writing', 'publishing',
            'combinatorial-species', 'gromov-hyperbolic-spaces', 'chemistry',
            'book-recommendation')
            GROUP BY tags.name
        ) AS T
        JOIN question_tags AS Q ON T.id=Q.tag_id
        GROUP BY T.id"""
    cur.execute(query)

    tag_ids = []
    tag_names = []
    tag_indices = {}  # tag id -> row/column position in the matrix
    for row in cur:
        tag_indices[row['id']] = len(tag_ids)
        tag_ids.append(row['id'])
        tag_names.append(row['name'])

    if not tag_ids:
        # An empty "IN ()" list is a SQL syntax error, and there is nothing
        # to cluster anyway.
        return

    query = """
        SELECT t1.id AS tag1, t2.id AS tag2, COUNT(qt1.question_id) as count
        FROM question_tags AS qt1
        JOIN question_tags AS qt2 ON qt1.question_id=qt2.question_id
        JOIN tags AS t1 ON t1.id=qt1.tag_id
        JOIN tags AS t2 ON t2.id=qt2.tag_id
        WHERE t1.id IN ({taglist}) AND t2.id IN ({taglist})
        GROUP BY t1.name, t2.name""".format(taglist=','.join(str(i) for i in tag_ids))
    cur.execute(query)

    # np.float64 instead of the np.float_ alias, which NumPy 2.0 removed.
    sim = np.zeros((len(tag_ids), len(tag_ids)), dtype=np.float64)
    for row in cur:
        i1 = tag_indices[row['tag1']]
        i2 = tag_indices[row['tag2']]
        c = row['count']
        # The diagonal (a tag paired with itself) stores half the count.
        sim[i1, i2] = c // 2 if i1 == i2 else c

    cluster = AffinityPropagation(affinity='precomputed', damping=0.5)
    labels = cluster.fit_predict(sim)
    exemplars = cluster.cluster_centers_indices_
    if exemplars is None or len(exemplars) == 0:
        # Validate before the DELETEs below so that a failed run cannot wipe
        # the existing categories.
        raise RuntimeError(
            "affinity propagation produced no exemplars; "
            "existing categories were left unchanged")

    classes = sorted(set(labels))
    catnames = {c: tag_names[exemplars[c]] for c in classes}

    cur.execute("DELETE FROM categories WHERE 1;")
    cur.execute("DELETE FROM tag_categories WHERE 1;")

    # NOTE(review): these statements are built as strings because the
    # driver's parameter style is not visible from here; switch to
    # parameterized executemany() once it is confirmed. Until then single
    # quotes in names are doubled, the standard SQL escape.
    query = "INSERT INTO categories (id,name) VALUES "
    query += ','.join(
        "({},'{}')".format(c, catnames[c].replace("'", "''")) for c in classes)
    cur.execute(query)

    query = "INSERT INTO tag_categories (tag_id, category_id) VALUES "
    query += ','.join(
        "({},{})".format(tag_id, label) for tag_id, label in zip(tag_ids, labels))
    cur.execute(query)
    con.commit()