

Python hdbscan.HDBSCAN Code Examples

This article collects typical usage examples of the hdbscan.HDBSCAN class in Python. If you are wondering what hdbscan.HDBSCAN is, how to use it, or just want to see it in real code, the curated snippets below should help; you can also explore the hdbscan package further for related usage.


The 14 code examples of hdbscan.HDBSCAN shown below are drawn from open-source projects and are sorted by popularity by default.
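All of the snippets below share one core pattern: construct hdbscan.HDBSCAN with a few parameters, call fit or fit_predict, and read labels_, where -1 marks noise points. A minimal self-contained sketch of that pattern (the synthetic data and parameter values here are illustrative, not taken from any project below):

import numpy as np
import hdbscan
from sklearn.datasets import make_blobs

# Synthetic 2-D data with three dense groups (illustrative only).
data, _ = make_blobs(n_samples=300, centers=3, random_state=42)

# min_cluster_size is the main knob: smaller values yield more,
# finer-grained clusters; points that fit nowhere are labeled -1.
clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
labels = clusterer.fit_predict(data)

print('clusters found:', labels.max() + 1)
print('noise points:', int(np.sum(labels == -1)))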

Example 1: bsoid_hdbscan

# Required import: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def bsoid_hdbscan(umap_embeddings, hdbscan_params=HDBSCAN_PARAMS):
    """
    Trains HDBSCAN (unsupervised) given learned UMAP space
    :param umap_embeddings: 2D array, embedded UMAP space
    :param hdbscan_params: dict, HDBSCAN params in GLOBAL_CONFIG
    :return assignments: HDBSCAN assignments
    """
    highest_numulab = -np.infty
    numulab = []
    min_cluster_range = range(6, 21)
    logging.info('Running HDBSCAN on {} instances in {} D space...'.format(*umap_embeddings.shape))
    for min_c in min_cluster_range:
        trained_classifier = hdbscan.HDBSCAN(prediction_data=True,
                                             min_cluster_size=int(round(0.001 * min_c * umap_embeddings.shape[0])),
                                             **hdbscan_params).fit(umap_embeddings)
        numulab.append(len(np.unique(trained_classifier.labels_)))
        if numulab[-1] > highest_numulab:
            logging.info('Adjusting minimum cluster size to maximize cluster number...')
            highest_numulab = numulab[-1]
            best_clf = trained_classifier
    assignments = best_clf.labels_
    soft_clusters = hdbscan.all_points_membership_vectors(best_clf)
    soft_assignments = np.argmax(soft_clusters, axis=1)
    logging.info('Done predicting labels for {} instances in {} D space...'.format(*umap_embeddings.shape))
    return assignments, soft_clusters, soft_assignments 
Developer: YttriLab, Project: B-SOID, Lines: 27, Source: train.py
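Because the snippet fits with prediction_data=True, the trained model can also score points it has not seen. A minimal usage sketch, assuming the function above (with its numpy/logging imports) is in scope; the HDBSCAN_PARAMS value here is a placeholder, not B-SOID's actual configuration:

import numpy as np
import hdbscan

HDBSCAN_PARAMS = {'min_samples': 10}  # placeholder for GLOBAL_CONFIG

umap_embeddings = np.random.rand(5000, 3)  # stand-in for a learned UMAP space
assignments, soft_clusters, soft_assignments = bsoid_hdbscan(umap_embeddings, HDBSCAN_PARAMS)

# prediction_data=True also enables label estimates for new points
# via a fitted clusterer (refit here purely for illustration):
clf = hdbscan.HDBSCAN(prediction_data=True, min_cluster_size=50,
                      **HDBSCAN_PARAMS).fit(umap_embeddings)
new_labels, strengths = hdbscan.approximate_predict(clf, np.random.rand(10, 3))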

Example 2: hdbscancluster

# Required import: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def hdbscancluster(self, dist, iteration=-1):
        # HDBSCAN cluster
        clusterer = hdbscan.HDBSCAN(min_samples=self.args.dbscan_minsample, metric='precomputed')  # min_cluster_size=2,
        labels = clusterer.fit_predict(dist.astype(np.double))

        # select & cluster images as training set of this iteration
        print('Clustering and labeling...')
        # -1 is HDBSCAN's noise label; exclude it from the id count if present
        num_ids = len(set(labels)) - (1 if -1 in labels else 0)

        print('Iteration {} has {} training ids'.format(iteration, num_ids))
        # generate new dataset
        new_dataset = []
        new_indices = []

        for (fname, _, _), label, indice in zip(self.traindataset, labels, self.old_indices):
            if label == -1:
                continue
            # no need to change trainer.py's _parsing_input or the sampler functions after adding 0
            new_dataset.append((fname, label, indice))
            new_indices.append(indice)

        print('Iteration {} has {} training images'.format(iteration, len(new_dataset)))

        return new_dataset, new_indices 
Developer: zhangxinyu-xyz, Project: PAST-ReID, Lines: 26, Source: cluster.py
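With metric='precomputed', the clusterer consumes a square pairwise-distance matrix instead of raw features, which is why the snippet passes dist rather than embeddings. A minimal sketch of that pattern, with scipy's cdist standing in for PAST-ReID's own re-ID distance computation:

import numpy as np
import hdbscan
from scipy.spatial.distance import cdist

features = np.random.rand(200, 128)   # e.g. re-ID embeddings
dist = cdist(features, features)      # (200, 200) euclidean distance matrix

clusterer = hdbscan.HDBSCAN(min_samples=4, metric='precomputed')
labels = clusterer.fit_predict(dist.astype(np.double))

# as in the snippet above, -1 is noise, so it is excluded from the id count
num_ids = len(set(labels)) - (1 if -1 in labels else 0)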

Example 3: _find_k_hdbscan

# Required import: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def _find_k_hdbscan(self, max_dim_clustering=Defaults.MAX_DIM):
        """Use hdbscan to downsample the examples.

        We use optimal k to determine how much to downsample the examples.

        :param max_dim_clustering: Dimensionality threshold for performing reduction.
        :type max_dim_clustering: int
        """
        import hdbscan
        num_rows = self._dataset.shape[0]
        reduced_examples = self._reduce_examples(max_dim_clustering)
        hdbs = hdbscan.HDBSCAN(min_cluster_size=2).fit(reduced_examples)
        clusters = hdbs.labels_
        opt_k = len(set(clusters))
        clustering_threshold = 5
        samples = opt_k * clustering_threshold
        module_logger.info(('found optimal k for hdbscan: {},'
                            ' will use clustering_threshold * k for sampling: {}').format(str(opt_k), str(samples)))
        return min(samples, num_rows) 
Developer: interpretml, Project: interpret-community, Lines: 21, Source: dataset_wrapper.py
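The sampling heuristic above sets the sample size to clustering_threshold * k, where k is the number of labels HDBSCAN finds. The same idea as a standalone sketch (the function and variable names here are illustrative, not part of interpret-community's API):

import numpy as np
import hdbscan

def hdbscan_sample_size(examples, clustering_threshold=5):
    # k counts HDBSCAN labels, including the -1 noise label,
    # just as the snippet above does
    opt_k = len(set(hdbscan.HDBSCAN(min_cluster_size=2).fit(examples).labels_))
    return min(opt_k * clustering_threshold, examples.shape[0])

examples = np.random.rand(1000, 8)
n = hdbscan_sample_size(examples)
subsample = examples[np.random.choice(examples.shape[0], n, replace=False)]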

Example 4: one_sub_cluster

# Required import: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def one_sub_cluster(self, local_data, allow_single_cluster):
        # clustering on one channel or one adjacency

        # reduce to a few components before clustering; keep at most
        # one fewer component than the input dimensionality
        n_components = min(local_data.shape[1] - 1, self.n_components_local_pca)
        pca = sklearn.decomposition.TruncatedSVD(n_components=n_components)
        local_features = pca.fit_transform(local_data)

        clusterer = hdbscan.HDBSCAN(min_cluster_size=self.min_cluster_size,
                                    allow_single_cluster=allow_single_cluster,
                                    metric='l2')
        local_labels = clusterer.fit_predict(local_features)

        # the author also tried isosplit here, but found it not stable
        # enough on Windows

        return local_labels
Developer: tridesclous, Project: tridesclous, Lines: 36, Source: pruningshears.py
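The snippet is the common reduce-then-cluster pattern: project to a few components, then let HDBSCAN work in the low-dimensional space, where density estimates are far more reliable. A standalone sketch outside the class, with illustrative parameter values:

import numpy as np
import hdbscan
import sklearn.decomposition

waveforms = np.random.rand(500, 40)   # stand-in for per-channel spike features

# keep at most one fewer component than the input dimensionality
n_components = min(waveforms.shape[1] - 1, 5)
features = sklearn.decomposition.TruncatedSVD(n_components=n_components).fit_transform(waveforms)

labels = hdbscan.HDBSCAN(min_cluster_size=20, metric='l2',
                         allow_single_cluster=False).fit_predict(features)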

Example 5: test_cluster_hdbscan

# Required import: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def test_cluster_hdbscan():
    try:
        from hdbscan import HDBSCAN
        _has_hdbscan = True
    except ImportError:
        _has_hdbscan = False

    if _has_hdbscan:
        hdbscan_labels = cluster(data, cluster='HDBSCAN')
        assert len(set(hdbscan_labels)) == 2
    else:
        with pytest.raises(ImportError):
            hdbscan_labels = cluster(data, cluster='HDBSCAN') 
Developer: ContextLab, Project: hypertools, Lines: 15, Source: test_cluster.py

Example 6: generate_combined_topics_hellinger

# Required import: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def generate_combined_topics_hellinger(all_topics, min_samples=5, min_cluster_size=5):
    """Given a large list of topics select out a small list of stable topics
    by clustering the topics with HDBSCAN using Hellinger as a distance
    measure between topics.


    Parameters
    ----------
    all_topics: array of shape (N, n_words)
        The set of topics to be clustered.

    min_samples: int (optional, default=5)
        The min_samples parameter to use for HDBSCAN clustering.

    min_cluster_size: int (optional, default=5)
        The min_cluster_size parameter to use for HDBSCAN clustering.

    Returns
    -------
    stable_topics: array of shape (M, n_words)
        A set of M topics, one for each cluster found by HDBSCAN.
    """
    distance_matrix = all_pairs_hellinger_distance(all_topics)
    labels = hdbscan.HDBSCAN(
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        metric="precomputed",
        cluster_selection_method="leaf",
    ).fit_predict(distance_matrix)
    result = np.empty((labels.max() + 1, all_topics.shape[1]), dtype=np.float32)
    for i in range(labels.max() + 1):
        # average each cluster's topics in square-root (Hellinger) space,
        # square back, then renormalize to a probability distribution
        result[i] = np.mean(np.sqrt(all_topics[labels == i]), axis=0) ** 2
        result[i] /= result[i].sum()

    return result 
Developer: lmcinnes, Project: enstop, Lines: 37, Source: enstop_.py
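all_pairs_hellinger_distance is enstop's own helper; its output is a symmetric (N, N) matrix that the precomputed-metric HDBSCAN above consumes. As an assumption about what it computes, here is a plain-numpy sketch based on the standard definition H(p, q) = sqrt(1 - sum_i sqrt(p_i * q_i)) for discrete distributions:

import numpy as np

def pairwise_hellinger(topics):
    # rows of `topics` are discrete probability distributions
    roots = np.sqrt(topics)   # (N, n_words)
    bc = roots @ roots.T      # Bhattacharyya coefficients, 1.0 on the diagonal
    return np.sqrt(np.clip(1.0 - bc, 0.0, None))

topics = np.random.dirichlet(np.ones(50), size=20)  # 20 toy topics
D = pairwise_hellinger(topics)                      # symmetric, zero diagonal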

Example 7: hdbscan

# Required import: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def hdbscan(feat, min_samples=10):
    import hdbscan
    # note: the argument is used as HDBSCAN's min_cluster_size, not min_samples
    db = hdbscan.HDBSCAN(min_cluster_size=min_samples)
    labels_ = db.fit_predict(feat)
    return labels_
Developer: XiaohangZhan, Project: cdp, Lines: 7, Source: baseline_clustering.py

Example 8: generate_clusters

# Required import: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def generate_clusters(words, vectors_in_2D, print_status=True):
	# HDBSCAN, i.e. hierarchical density-based spatial clustering of applications with noise (https://github.com/lmcinnes/hdbscan)
	vectors = vectors_in_2D
	sns.set_context('poster')
	sns.set_color_codes()
	plot_kwds = {'alpha' : 0.5, 's' : 500, 'linewidths': 0}
	clusters = HDBSCAN(min_cluster_size=2).fit_predict(vectors)
	palette = sns.color_palette("husl", np.unique(clusters).max() + 1)
	colors = [palette[cluster_index] if cluster_index >= 0 else (0.0, 0.0, 0.0) for cluster_index in clusters]
	fig = plt.figure(figsize=(30, 30))
	plt.scatter(vectors.T[0], vectors.T[1], c=colors, **plot_kwds)
	plt.axis('off')
	x_vals = [i[0] for i in vectors]
	y_vals = [i[1] for i in vectors]
	plt.ylim(min(y_vals)-0.3, max(y_vals)+0.3)    
	plt.xlim(min(x_vals)-0.3, max(x_vals)+0.3) 
	font_path = getcwd() + '/fonts/Comfortaa-Regular.ttf'
	font_property = matplotlib.font_manager.FontProperties(fname=font_path, size=24)
	for i, word in enumerate(words):
		if word is not None:
			if not isinstance(word, str):
				word = unidecode(word).replace("_", " ")
			else:
				word = word.replace("_", " ")
			text_object = plt.annotate(word, xy=(x_vals[i], y_vals[i]+0.05), font_properties=font_property, color=colors[i], ha="center")
	plt.subplots_adjust(left=(500/3000), right=(2900/3000), top=1.0, bottom=(300/2700))
	plt.savefig(get_visualization_file_path(print_status), bbox_inches="tight")
	return clusters 
Developer: overlap-ai, Project: words2map, Lines: 30, Source: words2map.py

Example 9: test_DBCV

# Required import: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def test_DBCV(data):
    kmeans = KMeans(n_clusters=2)
    kmeans_labels = kmeans.fit_predict(data)
    hdbscanner = hdbscan.HDBSCAN()
    hdbscan_labels = hdbscanner.fit_predict(data)
    kmeans_score = DBCV.DBCV(data, kmeans_labels, dist_function=euclidean)
    hdbscan_score = DBCV.DBCV(data, hdbscan_labels, dist_function=euclidean)
    assert hdbscan_score > kmeans_score 
Developer: christopherjenness, Project: DBCV, Lines: 10, Source: test_dbcb.py

Example 10: test__mutual_reach_dist_graph

# Required import: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def test__mutual_reach_dist_graph(data):
    target = 0.09872567819414102
    hdbscanner = hdbscan.HDBSCAN()
    hdbscan_labels = hdbscanner.fit_predict(data)
    graph = DBCV._mutual_reach_dist_graph(data, hdbscan_labels,
                                          euclidean)
    assert graph.shape == (data.shape[0], data.shape[0])
    assert abs(graph[0][0] - target) < 0.001
Developer: christopherjenness, Project: DBCV, Lines: 10, Source: test_dbcb.py

Example 11: write_summary

# Required import: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def write_summary(self, cluster_ids, cluster_class_list, cluster_class_counts, hdbscan_params):
    (n_components, min_cluster_size, min_samples) = hdbscan_params
    final_output_fol = self.test_output_file + str(n_components).zfill(3) + "_" + str(min_samples).zfill(3) + "_" + str(min_cluster_size).zfill(3) + "/"
    if not os.path.exists(final_output_fol):
      os.makedirs(final_output_fol)

    num_outliers = len(cluster_ids[cluster_ids == -1])
    num_classes = len(numpy.unique(cluster_ids)) - 1
    duration = ""

    all_class_names_flat = sum((list(c) for c in cluster_class_list), [])
    all_class_counts_flat = sum((list(c) for c in cluster_class_counts), [])
    all_class_list = numpy.unique(all_class_names_flat)
    # total count per class name across all clusters (the original zipped
    # the counts list against itself, so no name ever matched)
    all_class_counts = [sum(c for c, l in zip(all_class_counts_flat, all_class_names_flat) if l == a) for a in all_class_list]

    ### Create summary file:
    text_file_name = final_output_fol + "summary.txt"
    F = open(text_file_name, 'w')
    F.write("Cluster method: " + "HDBSCAN" + "\n")
    F.write("n_components: " + str(n_components) + "\n")
    F.write("min_samples: " + str(min_samples) + "\n")
    F.write("min_cluster_size: " + str(min_cluster_size) + "\n")
    F.write("clustering time: " + str(duration) + "\n")
    F.write(" " + "\n")
    F.write("Num Classes: " + str(num_classes) + "\n")
    F.write("Num Outliers: " + str(num_outliers) + "\n")
    F.write(" " + "\n")

    cluster_string = "all_tracks: " + str(len(cluster_ids))
    for name, count in zip(all_class_list, all_class_counts):
      cluster_string += ", " + name + ": " + str(count)
    F.write(cluster_string + "\n")
    F.write(" " + "\n")

    cluster_id_list, cluster_sizes = numpy.unique(cluster_ids, return_counts=True)
    for cluster_id, cluster_size, class_list, class_counts in zip(cluster_id_list, cluster_sizes, cluster_class_list, cluster_class_counts):
      cluster_string = "#: " + str(cluster_id).zfill(3) + ", Size: " + str(cluster_size)
      for name, count in zip(class_list, class_counts):
        cluster_string += ", " + name + ": " + str(count)
      F.write(cluster_string + "\n")

    F.close()
    return
Developer: JonathonLuiten, Project: PReMVOS, Lines: 42, Source: ClusteringForwarder_old.py

Example 12: hdbscan

# Required import: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def hdbscan(feat, min_samples, **kwargs):
    import hdbscan
    # note: min_samples is used as HDBSCAN's min_cluster_size here;
    # extra kwargs are accepted for interface compatibility but ignored
    db = hdbscan.HDBSCAN(min_cluster_size=min_samples)
    labels_ = db.fit_predict(feat)
    return labels_
Developer: yl-1993, Project: learn-to-cluster, Lines: 7, Source: sklearn_cluster.py

Example 13: generate_combined_topics_kl

# Required import: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def generate_combined_topics_kl(all_topics, min_samples=5, min_cluster_size=5):
    """Given a large list of topics select out a small list of stable topics
    by clustering the topics with HDBSCAN using KL-divergence as a distance
    measure between topics.


    Parameters
    ----------
    all_topics: array of shape (N, n_words)
        The set of topics to be clustered.

    min_samples: int (optional, default=5)
        The min_samples parameter to use for HDBSCAN clustering.

    min_cluster_size: int (optional, default=5)
        The min_cluster_size parameter to use for HDBSCAN clustering.

    Returns
    -------
    stable_topics: array of shape (M, n_words)
        A set of M topics, one for each cluster found by HDBSCAN.
    """
    divergence_matrix = all_pairs_kl_divergence(all_topics)
    # core distance of each topic: its min_samples-th smallest divergence
    core_divergences = np.sort(divergence_matrix, axis=1)[:, min_samples]
    tiled_core_divergences = np.tile(core_divergences, (core_divergences.shape[0], 1))
    # mutual reachability: max of the (asymmetric) divergence in both
    # directions and both points' core distances
    mutual_reachability = np.dstack(
        [
            divergence_matrix,
            divergence_matrix.T,
            tiled_core_divergences,
            tiled_core_divergences.T,
        ]
    ).max(axis=-1)
    mst_data = mst_linkage_core(mutual_reachability)
    mst_order = np.argsort(mst_data.T[2])
    mst_data = mst_data[mst_order]
    single_linkage_tree = label(mst_data)
    labels, probs, stabs, ctree, stree = _tree_to_labels(
        all_topics,
        single_linkage_tree,
        min_cluster_size=min_cluster_size,
        cluster_selection_method="leaf",
    )
    result = np.empty((labels.max() + 1, all_topics.shape[1]), dtype=np.float32)
    for i in range(labels.max() + 1):
        result[i] = np.mean(np.sqrt(all_topics[labels == i]), axis=0) ** 2
        result[i] /= result[i].sum()

    return result 
Developer: lmcinnes, Project: enstop, Lines: 51, Source: enstop_.py
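Here the HDBSCAN pipeline is assembled by hand from the library's internals (mst_linkage_core, label, _tree_to_labels) because KL divergence is asymmetric, so the code builds its own mutual-reachability matrix first. all_pairs_kl_divergence is again an enstop helper; a numpy sketch of the standard definition D_KL(p || q) = sum_i p_i log(p_i / q_i) (an assumption about the helper, with a small epsilon for numerical safety):

import numpy as np

def pairwise_kl_divergence(topics, eps=1e-12):
    # D[i, j] = KL(topics[i] || topics[j]); note D is not symmetric
    p = topics + eps
    log_p = np.log(p)
    row_entropy = np.sum(p * log_p, axis=1)   # sum_k p_ik log p_ik, shape (N,)
    cross = p @ log_p.T                       # (N, N): sum_k p_ik log p_jk
    return row_entropy[:, None] - cross

topics = np.random.dirichlet(np.ones(30), size=10)
D = pairwise_kl_divergence(topics)   # near-zero diagonal, D[i, j] != D[j, i]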

Example 14: generate_combined_topics_hellinger_umap

# Required import: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def generate_combined_topics_hellinger_umap(
    all_topics, min_samples=5, min_cluster_size=5, n_neighbors=15, reduced_dim=5
):
    """Given a large list of topics select out a small list of stable topics
    by mapping the topics to a low dimensional space with UMAP (using
    Hellinger distance) and then clustering the topics with HDBSCAN using
    Euclidean distance in the embedding space to measure distance between topics.


    Parameters
    ----------
    all_topics: array of shape (N, n_words)
        The set of topics to be clustered.

    min_samples: int (optional, default=5)
        The min_samples parameter to use for HDBSCAN clustering.

    min_cluster_size: int (optional, default=5)
        The min_cluster_size parameter to use for HDBSCAN clustering.

    n_neighbors: int (optional, default=15)
        The n_neighbors value to use with UMAP.

    reduced_dim: int (optional, default=5)
        The dimension of the embedding space to use.

    Returns
    -------
    stable_topics: array of shape (M, n_words)
        A set of M topics, one for each cluster found by HDBSCAN.
    """
    embedding = umap.UMAP(
        n_neighbors=n_neighbors, n_components=reduced_dim, metric=hellinger
    ).fit_transform(all_topics)
    clusterer = hdbscan.HDBSCAN(
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_method="leaf",
        allow_single_cluster=True,
    ).fit(embedding)
    labels = clusterer.labels_
    membership_strengths = clusterer.probabilities_
    result = np.empty((labels.max() + 1, all_topics.shape[1]), dtype=np.float32)
    for i in range(labels.max() + 1):
        mask = labels == i
        # membership-weighted average in square-root (Hellinger) space,
        # squared back and renormalized to a probability distribution
        result[i] = (
            np.average(
                np.sqrt(all_topics[mask]), axis=0, weights=membership_strengths[mask]
            )
            ** 2
        )
        result[i] /= result[i].sum()

    return result 
Developer: lmcinnes, Project: enstop, Lines: 56, Source: enstop_.py
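A minimal usage sketch, assuming the function above and its imports (umap and enstop's hellinger metric) are in scope; the toy topics here are random Dirichlet draws, purely for illustration:

import numpy as np

# toy "topics": 200 rows, each a distribution over 100 words
all_topics = np.random.dirichlet(np.ones(100), size=200)

stable = generate_combined_topics_hellinger_umap(
    all_topics,
    min_samples=5,
    min_cluster_size=5,
    n_neighbors=15,
    reduced_dim=5,
)
print(stable.shape)        # (M, 100): one merged topic per HDBSCAN cluster
print(stable.sum(axis=1))  # each merged topic renormalized to sum to 1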


Note: the hdbscan.HDBSCAN examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective authors; copyright in the source code remains with the original authors, and any use or distribution should follow the corresponding project's license. Do not repost without permission.