当前位置: 首页>>代码示例>>Python>>正文


Python AffinityPropagation.fit_predict方法代码示例

本文整理汇总了Python中sklearn.cluster.AffinityPropagation.fit_predict方法的典型用法代码示例。如果您正苦于以下问题:Python AffinityPropagation.fit_predict方法的具体用法?Python AffinityPropagation.fit_predict怎么用?Python AffinityPropagation.fit_predict使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在sklearn.cluster.AffinityPropagation的用法示例。


在下文中一共展示了AffinityPropagation.fit_predict方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: execute

# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def execute(args):
  """Run Affinity Propagation on a parsed data file and print cluster counts.

  Args:
    args: Command-line style arguments. ``args[0]`` is handed to ``parse``;
      every remaining item is converted to ``int`` and used as a feature
      (column) index to cluster on. With no indices all columns are used.

  Returns:
    ``0`` when the prediction is empty or its first element is itself an
    array; otherwise ``None`` after printing the result of
    ``get_cluster_counts``.
  """
  if len(args) < 1:
    usage()
    sys.exit()

  names, labels_true, X = parse(args[0])
  indices = [int(i) for i in args[1:]]
  # The first name is skipped -- presumably the label column; TODO confirm
  # against ``parse``.
  relevant_names = names[1:]
  if indices:
    X = np.asarray([[sample[i] for i in indices] for sample in X])
    relevant_names = [relevant_names[i] for i in indices]
  # Single-argument print() produces the same text on Python 2 and 3.
  print("Clustering on " + str(relevant_names) + "...")

  # Compute Affinity Propagation
  af = AffinityPropagation(preference=-50)
  y_pred = af.fit_predict(X)
  # Compare the length by value: ``is 0`` tests object identity, which only
  # works by accident of CPython's small-int cache.
  if y_pred is None or len(y_pred) == 0 or isinstance(y_pred[0], np.ndarray):
    return 0
  counts = get_cluster_counts(labels_true, y_pred)
  print(counts)
开发者ID:nmusgrave,项目名称:Conducere,代码行数:30,代码来源:oldAffinityPropagationCluster.py

示例2: cluster_trajectories

# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def cluster_trajectories( curves ):
    """Given a list of curves, cluster_trajectories will cluster them.

    Each curve is reduced to the 4-vector
    ``(curve[0, 0], curve[1, 0], curve[0, -1], curve[1, -1])`` -- presumably
    the start and end points of a 2 x T trajectory; TODO confirm the layout.
    Every feature column is divided by its standard deviation, pairwise
    affinities ``exp(-d**2)`` are computed and fed to Affinity Propagation.

    Args:
        curves: Sequence of 2-D arrays indexable as ``curve[row, col]``.

    Returns:
        A list with one entry per cluster: the result of ``align_cluster``
        applied to the list of curves assigned to that cluster.
    """
    from sklearn.cluster import AffinityPropagation

    n_curves = len(curves)
    X_2B_clstrd = np.zeros( (n_curves, 4) )
    for k, curve in enumerate(curves):
        X_2B_clstrd[k] = (curve[0, 0], curve[1, 0], curve[0, -1], curve[1, -1])

    # Scale every column to unit standard deviation.
    X_2B_clstrd /= X_2B_clstrd.std(axis=0)

    def distance_metric(a, b):
        # A distance metric on R^4 modulo the involution
        # (x1,x2,x3,x4) -> (x3,x4,x1,x2), i.e. a curve and its reversal
        # (start and end swapped) are treated as the same curve.
        swapped = np.array([a[2], a[3], a[0], a[1]])
        direct = np.sqrt(np.sum((a - b)**2))
        reverse = np.sqrt(np.sum((swapped - b)**2))
        return min(direct, reverse)

    clusterer = AffinityPropagation(affinity='precomputed', convergence_iter=100)
    aff = np.zeros((n_curves, n_curves))
    for i in range(n_curves):
        for j in range(i+1, n_curves):
            aff[i,j] = aff[j,i] = np.exp(
                -distance_metric(X_2B_clstrd[i], X_2B_clstrd[j])**2)

    cluster_labels = clusterer.fit_predict(aff)

    # Build real lists: on Python 3 map()/filter() are lazy, single-use
    # iterators, which would hand align_cluster (and the caller) objects that
    # cannot be indexed or iterated twice.
    out = [
        [curves[k] for k in range(n_curves) if cluster_labels[k] == label]
        for label in set(cluster_labels)
    ]
    return [align_cluster(cluster) for cluster in out]
开发者ID:hoj201,项目名称:pedestrian_forecasting,代码行数:35,代码来源:cluster.py

示例3: cluster_articles

# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def cluster_articles():
    """Cluster pending articles by TF-IDF and store the coherent clusters.

    Pending articles are read from ``MongoStore``; those with non-blank
    ``text_content`` are vectorised with TF-IDF and clustered with Affinity
    Propagation. A cluster is kept only when at least three of its members
    have cosine similarity above 0.55 to the cluster's exemplar. Kept
    clusters are inserted into the store and all pending articles are flagged
    as clustered. Does nothing when there are no pending articles.
    """
    ms = MongoStore()
    articles = list(ms.get_pending_articles())
    if not articles:
        return

    good_articles = [article for article in articles
                     if article["text_content"].strip() != ""]
    if not good_articles:
        # Nothing to vectorise: fitting TF-IDF on an empty corpus raises, which
        # would leave these articles pending (and crashing) on every run.
        ms.set_clustered_flag(articles)
        return

    texts = [article["text_content"] for article in good_articles]

    tfidf = TfidfVectorizer(tokenizer=preprocess)
    X_tfidf = tfidf.fit_transform(texts)

    # Single-argument print() calls give the same output on Python 2 and 3.
    print(X_tfidf)

    ap = AffinityPropagation(damping=0.95, max_iter=4000,
            convergence_iter=400, copy=True, preference=-4,
            affinity='euclidean', verbose=True)

    C = ap.fit_predict(X_tfidf)
    print("{} {}".format(X_tfidf.shape, C.shape))
    print(C)

    clusters = []
    for c, center in enumerate(ap.cluster_centers_indices_):
        members = np.where(C == c)[0]
        # Similarity of every member to the exemplar of its cluster.
        K = cosine_similarity(X_tfidf[members], X_tfidf[center])
        member_sims = sorted(
            ((m, float(k)) for m, k in zip(members, K)),
            key=lambda pair: pair[1], reverse=True)

        # Discard clusters without at least three close members.
        if sum(1 for _, sim in member_sims if sim > .55) < 3:
            continue

        print(texts[center][:75].replace("\n", " "))

        cluster = {"articles": [], "date": datetime.now(), "summarized": False}
        for member, sim in member_sims:
            title = good_articles[member]["title"][:60].replace("\n", " ")
            # The extra " " reproduces the separator the Python 2
            # trailing-comma print inserted between score and title.
            print("\t{:3.3f} ".format(sim) + " " + title)
            cluster["articles"].append((good_articles[member]["_id"], sim))

        clusters.append(cluster)

    if clusters:
        ms.insert_clusters(clusters)

    ms.set_clustered_flag(articles)
开发者ID:kedz,项目名称:newsblaster,代码行数:56,代码来源:cluster.py

示例4: affinity_propagation

# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def affinity_propagation():
    """
    AffinityPropagation creates clusters by sending messages between pairs of
    samples until convergence. The messages sent between pairs represent the
    suitability for one sample to be the exemplar of the other, which is updated
    in response to the values from other pairs. This update occurs iteratively
    until convergence, at which point the final exemplars are chosen and hence
    the final clustering is given.

    Algorithm:

    The message sent between pairs belongs to one of two categories. The first
    is the responsibility, r(i,k), which is the accumulated evidence that sample
    k should be the exemplar for sample i. The second is the availability,
    a(i,k), which is the accumulated evidence that sample i should choose sample
    k to be its exemplar, and considers the values for all other samples that k
    should be an exemplar. In this case exemplars are chosen by samples if they
    are:

        - similar enough to many samples, and
        - chosen by many samples to be representative of themselves.

    This demo generates three Gaussian blobs, clusters them, prints the number
    of clusters together with accuracy, homogeneity and completeness, and
    shows a scatter plot coloured by cluster.
    """
    from collections import Counter, defaultdict

    # Generate a generic data sample.
    n_samples = 300
    std = 0.3
    seed = 0
    centers = [ [-1., 0.], [0., 1.5], [1., 0.] ]
    data, target = make_blobs(n_samples = n_samples, centers = centers,
        cluster_std = std, random_state = seed)

    # Set the preference for each point: samples with large preference values
    # are more likely to be chosen as exemplars. The number of exemplars, i.e.,
    # clusters, is influenced by the input preference values. If preferences are
    # not passed as arguments, they will be set to the median of the input
    # similarities.
    pref = -50
    # Compute affinity propagation.
    clf = AffinityPropagation(preference = pref)
    aff_y = clf.fit_predict(data)

    # Cluster ids are arbitrary, so comparing them directly with the true
    # class ids under-reports accuracy whenever the numbering is permuted.
    # Map every cluster to the class most of its members belong to first.
    votes = defaultdict(Counter)
    for true_label, cluster_id in zip(target, aff_y):
        votes[cluster_id][true_label] += 1
    cluster_to_class = {cluster_id: tally.most_common(1)[0][0]
                        for cluster_id, tally in votes.items()}

    # Find mismatches between (mapped) predicted and true values.
    cnt = sum(1 for true_label, cluster_id in zip(target, aff_y)
              if cluster_to_class[cluster_id] != true_label)

    # Print results.
    print('Approximated number of clusters ', len(clf.cluster_centers_indices_))
    print('Accuracy ', float(n_samples - cnt) / float(n_samples))
    print('Homogeneity ', metrics.homogeneity_score(target, clf.labels_))
    print('Completeness ', metrics.completeness_score(target, clf.labels_))

    # Plot resulting clusters.
    plt.figure(figsize = (8,8))
    plt.scatter(data[:,0], data[:,1], c = aff_y, s = 50)
    plt.title('Affinity clustering')
    plt.show()
开发者ID:hope0hermes,项目名称:ScriptsPython,代码行数:56,代码来源:py3_4_unsupervised_clustering_affinity.py

示例5: evaluate_clustering

# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def evaluate_clustering():
    """Compare Affinity Propagation and DBSCAN on a sense-similarity matrix.

    Fetches the similarity sub-matrix for the first 10000 indices, clusters it
    with both algorithms, derives a distance matrix as ``1 - similarity`` and
    prints cluster sizes, silhouette scores and the scores returned by
    ``cluster_evaluation.fpena_evaluate`` for each algorithm.
    """
    similarity_matrix = get_sense_similarity_submatrix(range(10000))
    print('got matrix')

    # NOTE(review): both estimators use their default (feature-vector)
    # metrics, so each row of the similarity matrix is treated as a sample
    # vector. Confirm this is intended rather than affinity='precomputed'.
    affinity_propagation = AffinityPropagation()
    labels1 = affinity_propagation.fit_predict(similarity_matrix)
    print('affinity propagation')

    dbscan = DBSCAN(min_samples=1)
    labels2 = dbscan.fit_predict(similarity_matrix)
    print('print dbscan')

    # Vectorised: one array expression instead of an element-wise Python
    # double loop over the whole matrix.
    distance_matrix = 1.0 - np.asarray(similarity_matrix, dtype=float)

    print(distance_matrix[1, 2])
    print(distance_matrix[1, 1])

    print('created distance matrix')

    cluster_map1 = cluster_evaluation.fpena_get_clusters(labels1)
    cluster_map2 = cluster_evaluation.fpena_get_clusters(labels2)

    print(cluster_map1)
    print(cluster_map2)

    # NOTE(review): a distance matrix is passed with metric='euclidean', so
    # rows are again treated as feature vectors; metric='precomputed' is
    # probably what the "(distance matrix)" labels below intend. Left
    # unchanged because it alters the reported numbers.
    sc1 = sklearn.metrics.silhouette_score(distance_matrix, labels1, metric='euclidean')
    sc2 = sklearn.metrics.silhouette_score(distance_matrix, labels2, metric='euclidean')
    sc5 = cluster_evaluation.fpena_evaluate(cluster_map1, distance_matrix)
    sc6 = cluster_evaluation.fpena_evaluate(cluster_map2, distance_matrix)

    num_elements1 = [len(values) for values in cluster_map1.values()]
    num_elements2 = [len(values) for values in cluster_map2.values()]
    print(num_elements1)
    print(num_elements2)

    # Cluster counts are integers, so format them with %d, not %f.
    print('Number of clusters Affinity Propagation: %d' % len(cluster_map1))
    print('Number of clusters DBSCAN: %d' % len(cluster_map2))
    print('Average elements per cluster Affinity Propagation: %f' % np.mean(num_elements1))
    print('Average elements per cluster DBSCAN: %f' % np.mean(num_elements2))
    print('Standard deviation per cluster Affinity Propagation: %f' % np.std(num_elements1))
    print('Standard deviation per cluster DBSCAN: %f' % np.std(num_elements2))
    print('Silouhette score Affinity Propagation (distance matrix): %f' % sc1)
    print('Silouhette score DBSCAN (distance matrix): %f' % sc2)
    print('Dunn index Affinity Propagation (distance matrix): %f' % sc5)
    print('Dunn index DBSCAN (distance matrix): %f' % sc6)
开发者ID:antoine-tran,项目名称:yelp,代码行数:52,代码来源:context_utils.py

示例6: geo_worker_

# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def geo_worker_(job_queue, result_queue, **kwargs):
    """Worker loop: cluster the geo-coordinates of each queued job.

    For every ``(string_tsv_path, geo_tsv_path)`` job the gzipped TSV at
    ``string_tsv_path`` is read, the comma-separated entries of its
    ``locations`` column are looked up through ``GeoQuery``, the resulting
    coordinates are clustered with Affinity Propagation on negated pairwise
    distances, and the distinct exemplar coordinates are written as a gzipped
    ``lat``/``lng`` TSV to ``geo_tsv_path``. ``None`` is put on
    ``result_queue`` after each job.

    Args:
        job_queue: Queue of ``(string_tsv_path, geo_tsv_path)`` tuples.
        result_queue: Queue that receives ``None`` once per processed job.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        ``True`` once the job queue is empty.
    """
    # SIGINT is ignored in the worker -- presumably so the parent process
    # handles Ctrl-C; TODO confirm.
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    geocache = get_resource_manager(u"GeoCacheResource")
    geoquery = GeoQuery(geocache.get_tsv_path())

    while not job_queue.empty():
        # Only the non-blocking get can raise Queue.Empty (another worker may
        # drain the queue between the empty() check and the get), so keep the
        # try body to that single call.
        try:
            string_tsv_path, geo_tsv_path = job_queue.get(block=False)
        except Queue.Empty:
            continue

        with gzip.open(string_tsv_path, u"r") as f:
            string_df = pd.io.parsers.read_csv(f, sep="\t", quoting=3, header=0)

        # Float entries are dropped -- presumably NaN for missing values.
        loc_strings = [
            loc_string for loc_string in string_df[u"locations"].tolist()
            if not isinstance(loc_string, float)
        ]

        coords = []
        for loc_string in loc_strings:
            for location in loc_string.split(","):
                coord = geoquery.lookup_location(location)
                if coord is not None:
                    coords.append(coord)

        if len(coords) > 0:
            coords = np.array(coords)
            # Affinity Propagation expects similarities, hence the negation.
            D = -geoquery.compute_distances(coords[:, None], coords)
            ap = AffinityPropagation(affinity=u"precomputed")
            ap.fit_predict(D)

            if ap.cluster_centers_indices_ is not None:
                # A set removes duplicate exemplar coordinates.
                unique_centers = set(
                    (coords[center][0], coords[center][1])
                    for center in ap.cluster_centers_indices_
                )
                center_rows = [{u"lat": lat, u"lng": lng}
                               for lat, lng in unique_centers]
                centers_df = pd.DataFrame(center_rows, columns=[u"lat", u"lng"])

                with gzip.open(geo_tsv_path, u"w") as f:
                    centers_df.to_csv(f, sep="\t", index=False, index_label=False, na_rep="nan")

        result_queue.put(None)

    return True
开发者ID:kedz,项目名称:cuttsum,代码行数:50,代码来源:geo.py

示例7: mhd_cluster_trajectories

# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def mhd_cluster_trajectories( curves ):
    """Returns clusters based upon the modified Hausdorff distance.

    Pairwise modified Hausdorff distances between the (transposed) curves are
    turned into similarities and clustered with Affinity Propagation.

    Args:
        curves: Sequence of 2-D arrays; each is transposed before being
            passed to ``modified_Hausdorff_distance``.

    Returns:
        A list with one entry per cluster: the result of ``align_cluster``
        applied to the list of curves assigned to that cluster.
    """
    from sklearn.cluster import AffinityPropagation
    # Imported once here instead of on every inner-loop iteration.
    from modified_Hausdorff_distance import modified_Hausdorff_distance as mhd

    n_curves = len(curves)
    clusterer = AffinityPropagation(affinity='precomputed', convergence_iter=100)
    aff = np.zeros((n_curves, n_curves))
    for i in range(n_curves):
        for j in range(i+1, n_curves):
            # A precomputed affinity must be a similarity (larger = more
            # alike). Feeding raw distances would make the most distant
            # curves look like the best exemplars, so negate the distance.
            aff[i,j] = aff[j,i] = -mhd(curves[i].transpose(), curves[j].transpose())

    cluster_labels = clusterer.fit_predict(aff)

    # Build real lists: on Python 3 map()/filter() are lazy, single-use
    # iterators.
    out = [
        [curves[k] for k in range(n_curves) if cluster_labels[k] == label]
        for label in set(cluster_labels)
    ]
    return [align_cluster(cluster) for cluster in out]
开发者ID:hoj201,项目名称:pedestrian_forecasting,代码行数:21,代码来源:cluster.py

示例8: plot_similarity_clusters

# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def plot_similarity_clusters(desc1, desc2, files, plot = None):
	"""
	Group sounds into Affinity Propagation clusters built from two descriptors.

	The two descriptors are stacked into a two-column matrix, projected with a
	whitened 2-component PCA and clustered using euclidean affinity.

	:param desc1: first descriptor values
	:param desc2: second descriptor values
	:param files: sound identifiers, printed next to their cluster label
	:param plot: when equal to True, print progress and the label table and
	  show a scatter plot of clusters 0-3 (blue, red, yellow, green)
	:returns:
	  - euclidean_labels: labels of clusters
	"""
	show_output = plot == True
	if show_output:
		print((Fore.MAGENTA + "Clustering"))

	stacked = np.vstack((desc1,desc2)).T
	min_max = preprocessing.scale(stacked, with_mean=False, with_std=False)
	projector = PCA(n_components=2, whiten=True)
	y = projector.fit(min_max).transform(min_max)

	euclidean = AffinityPropagation(convergence_iter=1800, affinity='euclidean')
	euclidean_labels = euclidean.fit_predict(y)

	if not show_output:
		return euclidean_labels

	time.sleep(5)

	print((Fore.WHITE + "Cada número representa el grupo al que pertence el sonido como ejemplar de otro/s. El grupo '0' esta coloreado en azul, el grupo '1' esta coloreado en rojo, el grupo '2' esta coloreado en amarillo. Observa el ploteo para ver qué sonidos son ejemplares de otros"))
	print(np.vstack((euclidean_labels,files)).T)

	time.sleep(6)

	# Only the first four cluster ids are drawn, one fixed colour each.
	for cluster_id, colour in enumerate(('b', 'r', 'y', 'g')):
		members = euclidean_labels == cluster_id
		plt.scatter(y[members,0], y[members,1], c=colour)
	plt.show()

	return euclidean_labels
开发者ID:MarsCrop,项目名称:apicultor,代码行数:42,代码来源:SoundSimilarity.py

示例9: cluster

# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
    def cluster(self, normalize=False):
        """
        Cluster the nodes based on the PMI similarity measure. The clustering algorithm used is affinity propagation,
        which automatically chooses the number of clusters.

        :param normalize: If true, then normalize the similarity measure (i.e., the PMI) to be between -1 and 1.
        :return: The cluster labels.
        """
        if normalize:
            # use normalized PMI for similarity metric
            similarity = self.pmi / -np.log(self.joint_probs)
            similarity[np.diag_indices_from(similarity)] = 1.0
        else:
            # Work on a copy: writing the diagonal into self.pmi itself would
            # corrupt the stored PMI, and since the new diagonal is 1.1x the
            # current maximum it would keep growing on every further call.
            similarity = self.pmi.copy()
            similarity[np.diag_indices_from(similarity)] = 1.1 * similarity.max()
        clustering = AffinityPropagation(affinity='precomputed', verbose=self.verbose,
                                         preference=similarity.min())
        clusters = clustering.fit_predict(similarity)
        if self.verbose:
            # Single-argument print() gives the same text on Python 2 and 3.
            print('Found {} clusters.'.format(len(np.unique(clusters))))

        return clusters
开发者ID:arunpn,项目名称:Insight,代码行数:24,代码来源:pmi_graph.py

示例10: TSNE

# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
from sklearn.cluster import AffinityPropagation

from sklearn.manifold import TSNE


dataset = pd.read_csv('~/data/gene_expr_170104.csv')
# Drop the first column and transpose so that each remaining CSV column
# becomes one sample row.
data = np.array(dataset)[:, 1:].astype(float).T

# Embed with t-SNE, then cluster the embedding with Affinity Propagation.
Y = TSNE().fit_transform(data)
clus = AffinityPropagation()

lab = clus.fit_predict(Y)

# Map labels onto the colormap once and reuse the result for both plots.
# The divisor is clamped to 1: a largest label of 0 (single cluster) would
# divide by zero, and a largest label of -1 (which scikit-learn documents for
# non-convergence) would flip the sign.
colors = plt.cm.Spectral(lab.astype(float) / max(lab.max(), 1))

x, y  = Y.T

plt.scatter(x, y, alpha=0.9, c = colors, edgecolors='none')
# for i, j, t in zip(x, y, range(x.shape[0])):
#     plt.text(i, j, t, color = 'purple')

plt.show()

# Second embedding of the same data, coloured by the clusters found above.
x, y, = SOS(iterations=10, alpha=1, beta=0, delta=0, theta=3.5).fit_transform(data).T

plt.scatter(x, y, alpha=0.4, c = colors, edgecolors='none')
# for i, j, t in zip(x, y, range(x.shape[0])):
开发者ID:damithsenanayake,项目名称:RBM-GSOM,代码行数:32,代码来源:swarmtest3.py

示例11: cluster_affinity_propagation

# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def cluster_affinity_propagation(similarity_matrix, desired_keys=None):
    """Cluster a similarity matrix with default-configured Affinity Propagation.

    The input is first converted with ``similarity_matrix_to_numpy`` (passing
    ``desired_keys`` through) and the converted matrix is handed to
    ``AffinityPropagation().fit_predict``.

    Returns:
        The cluster label array produced by ``fit_predict``.
    """
    dense = similarity_matrix_to_numpy(similarity_matrix, desired_keys)
    return AffinityPropagation().fit_predict(dense)
开发者ID:antoine-tran,项目名称:yelp,代码行数:8,代码来源:sense_clusterer.py

示例12: vectorLinspace

# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
# cluster3 = vectorLinspace([4,1],[7,9], num=50)
# cluster3 = cluster1 + np.random.normal(5,.1,cluster3.shape)
# cluster4 = vectorLinspace([-1,4],[-4,2], num=50)
# cluster4 = cluster1 + np.random.normal(-5,.1,cluster4.shape)

# Only cluster1 is analysed here; cluster2 is merely plotted for comparison.
X = cluster1
print(X)
print(pearsonr(X[:,0],X[:,1]),spearmanr(X[:,0],X[:,1]))

# Pairwise cosine distances between all points.
dists = np.zeros((len(X),len(X)))
for i1,x1 in enumerate(X):
    print(i1,"/",len(X))
    for i2,x2 in enumerate(X):
        dists[i1,i2] = cosine(x1,x2)
print(dists)

from sklearn.cluster import AffinityPropagation
ap = AffinityPropagation(affinity="precomputed")
# A precomputed affinity must be a similarity (larger = more alike), so the
# distances are negated; passing them unchanged would favour the most
# dissimilar points as exemplars.
y_pred = ap.fit_predict(-dists)
print(len(set(y_pred)))

# One random RGB colour per cluster label (used by the optional annotation).
cmap = dict((y,np.random.beta(1,1,3)) for y in y_pred)
import matplotlib.pyplot as plt
for x,y in zip(X,y_pred):
    #plt.annotate(y,x,color=cmap[y])
    pass
plt.scatter(X[:,0],X[:,1])
plt.scatter(cluster2[:,0],cluster2[:,1])
plt.show()
开发者ID:marlonbetz,项目名称:BA,代码行数:33,代码来源:correlation_sandbox.py

示例13: build_class_labels

# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
build_class_labels()
num_classes = len(urls)

# Fill the class-by-class matrix, then take the element-wise square root.
sim_matrix = np.zeros((num_classes, num_classes))
record_in_matrix(sim_matrix)
sim_matrix = np.sqrt(sim_matrix)

np.savetxt("sim_mat.txt", sim_matrix)


clst = AffinityPropagation(affinity='precomputed')
#clst = SpectralClustering(n_clusters=7,affinity='precomputed')
classes = clst.fit_predict(sim_matrix)


# One "<url> <cluster id>" line per exemplar. The loop variable must not be
# called ``clst``: that would rebind the estimator to an int.
with open("ap/centers.txt", "w") as f:
    for center_id, indx in enumerate(clst.cluster_centers_indices_):
        f.write(all_urls[indx] + " " + str(center_id) + "\n")


# One "<url> <cluster id>" line per sample. The newline keeps the records
# separable and consistent with centers.txt.
with open("ap/clusters.txt", "w") as f:
    for idx, cls in enumerate(classes):
        f.write(all_urls[idx] + " " + str(cls) + "\n")
开发者ID:BDTurc,项目名称:Cosi227B,代码行数:32,代码来源:cluster.py

示例14: range

# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
        for j in range(size_berlin):
            if i != j:
                matrix_berlin[i][j] = (list_of_berlin_person[i].distance_of_two_persons(list_of_berlin_person[j]))

    for i in range(size_newcomers):
        for j in range(size_newcomers):
            if i != j:
                matrix_newcomer[i][j] = (list_of_newcomer_person[i].distance_of_two_persons(list_of_newcomer_person[j]))

    print(matrix_berlin)
    print(matrix_newcomer)

    print('_____________________________________')
    clusterer.fit(matrix_newcomer, y=None)
    print('_____________________________________')
    clusterer.fit_predict(matrix_newcomer, y=None)
    print('_____________________________________')
    #
    af = AffinityPropagation().fit(matrix_newcomer)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    print(labels)
    n_clusters_ = len(cluster_centers_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    # print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    # print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    # print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    # print("Adjusted Rand Index: %0.3f"% metrics.adjusted_rand_score(labels_true, labels))
    # print("Adjusted Mutual Information: %0.3f"% metrics.adjusted_mutual_info_score(labels_true, labels))
    # print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
开发者ID:parking52,项目名称:weConnectAnalysis,代码行数:33,代码来源:clustering_persons.py

示例15: create_tag_categories

# 需要导入模块: from sklearn.cluster import AffinityPropagation [as 别名]
# 或者: from sklearn.cluster.AffinityPropagation import fit_predict [as 别名]
def create_tag_categories():
    """Cluster MSE tags into categories using sklearn AffinityPropagation.

    Any existing category system in the database will be overwritten.

    Tags (minus a fixed exclusion list) and their pairwise co-occurrence
    counts on questions are read from the database. The co-occurrence matrix
    is used as a precomputed affinity; every resulting cluster becomes a
    category named after its exemplar tag. The ``categories`` and
    ``tag_categories`` tables are emptied and refilled, then committed.
    """
    con = connect_db()
    cur = con.cursor()

    query = """
    SELECT T.id, T.name, COUNT(Q.question_id) AS count FROM
    (
        SELECT tags.id, tags.name, COUNT(qt.question_id) AS count FROM tags
        JOIN question_tags AS qt ON qt.tag_id=tags.id
        WHERE tags.name NOT IN ('advice', 'applications', 'big-list', 
        'education', 'intuition', 'learning', 'math-history', 'math-software',
        'reference-request', 'self-learning', 'soft-question', 'teaching',
        'alternative-proof-strategy', 'proof-writing', 'visualization',
        'alternative-proof', 'proof-strategy', 'proof-verification',
        'solution-verification', 'definition', 'examples-counterexamples',
        'mathematica', 'wolfram-alpha', 'maple', 'matlab', 'sage', 'octave',
        'floor-function', 'ceiling-function', 'article-writing', 'publishing',
        'combinatorial-species', 'gromov-hyperbolic-spaces', 'chemistry',
        'book-recommendation')
        GROUP BY tags.name
    ) AS T
    JOIN question_tags AS Q ON T.id=Q.tag_id
    GROUP BY T.id"""
    cur.execute(query)

    # Row position of every tag in the affinity matrix.
    tag_ids = []
    tag_names = []
    tag_indices = dict()
    for q in cur:
        tag_indices[q['id']] = len(tag_ids)
        tag_ids.append(q['id'])
        tag_names.append(q['name'])

    tag_ids = np.array(tag_ids)
    tag_names = np.array(tag_names)

    query = """
    SELECT t1.id AS tag1, t2.id AS tag2, COUNT(qt1.question_id) as count
    FROM question_tags AS qt1
    JOIN question_tags AS qt2 ON qt1.question_id=qt2.question_id
    JOIN tags AS t1 ON t1.id=qt1.tag_id
    JOIN tags AS t2 ON t2.id=qt2.tag_id
    WHERE t1.id IN ({taglist}) AND t2.id IN ({taglist})
    GROUP BY t1.name, t2.name""".format(taglist=','.join(str(i) for i in tag_ids))
    cur.execute(query)

    # np.float64 instead of np.float_: the alias was removed in NumPy 2.0.
    sim = np.zeros((len(tag_ids), len(tag_ids)), dtype=np.float64)
    for q in cur:
        i1 = tag_indices[q['tag1']]
        i2 = tag_indices[q['tag2']]
        c = q['count']
        if i1 == i2:
            # The self co-occurrence count is halved on the diagonal.
            sim[i1, i1] = int(c/2)
        else:
            sim[i1, i2] = c

    cluster = AffinityPropagation(affinity='precomputed', damping=0.5)

    labels = cluster.fit_predict(sim)

    classes = sorted(set(labels))
    exemplars = cluster.cluster_centers_indices_

    # Build both statements before deleting anything, so a failure here (for
    # example no exemplars after non-convergence) leaves the tables untouched.
    # Each category is named after its exemplar, looked up directly by label
    # so it does not rely on the labels being contiguous.
    # NOTE(review): tag names are interpolated into the SQL text; a name with
    # a quote character would break the statement. Switch to the driver's
    # parameter binding once the paramstyle of connect_db() is confirmed.
    category_query = "INSERT INTO categories (id,name) VALUES "
    category_query += ','.join(
        "({},'{}')".format(c, tag_names[exemplars[c]]) for c in classes)

    tag_category_query = "INSERT INTO tag_categories (tag_id, category_id) VALUES "
    tag_category_query += ','.join(
        "({},{})".format(tag_id, label) for tag_id, label in zip(tag_ids, labels))

    cur.execute("DELETE FROM categories WHERE 1;")
    cur.execute("DELETE FROM tag_categories WHERE 1;")
    cur.execute(category_query)
    cur.execute(tag_category_query)
    con.commit()
开发者ID:nrpeterson,项目名称:mathstackexpert,代码行数:90,代码来源:categories.py


注:本文中的sklearn.cluster.AffinityPropagation.fit_predict方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。