This article collects typical usage examples of the hdbscan.HDBSCAN class in Python. If you are wondering what hdbscan.HDBSCAN is for, how to use it, or want concrete working samples, the curated examples below may help. You can also explore further usage of the hdbscan module that the class belongs to.
The following presents 14 code examples of hdbscan.HDBSCAN, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
Example 1: bsoid_hdbscan
# Required module: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def bsoid_hdbscan(umap_embeddings, hdbscan_params=HDBSCAN_PARAMS):
    """
    Trains HDBSCAN (unsupervised) on the learned UMAP space.
    :param umap_embeddings: 2D array, embedded UMAP space
    :param hdbscan_params: dict, HDBSCAN params in GLOBAL_CONFIG
    :return assignments: hard HDBSCAN cluster labels
    :return soft_clusters: all-points membership vectors
    :return soft_assignments: argmax over the soft membership vectors
    """
    highest_numulab = -np.inf
    numulab = []
    min_cluster_range = range(6, 21)
    logging.info('Running HDBSCAN on {} instances in {} D space...'.format(*umap_embeddings.shape))
    for min_c in min_cluster_range:
        trained_classifier = hdbscan.HDBSCAN(prediction_data=True,
                                             min_cluster_size=int(round(0.001 * min_c * umap_embeddings.shape[0])),
                                             **hdbscan_params).fit(umap_embeddings)
        numulab.append(len(np.unique(trained_classifier.labels_)))
        if numulab[-1] > highest_numulab:
            logging.info('Adjusting minimum cluster size to maximize cluster number...')
            highest_numulab = numulab[-1]
            best_clf = trained_classifier
    assignments = best_clf.labels_
    soft_clusters = hdbscan.all_points_membership_vectors(best_clf)
    soft_assignments = np.argmax(soft_clusters, axis=1)
    logging.info('Done predicting labels for {} instances in {} D space...'.format(*umap_embeddings.shape))
    return assignments, soft_clusters, soft_assignments
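A minimal usage sketch for the function above, assuming a stand-in HDBSCAN_PARAMS dict (in the original project it is loaded from GLOBAL_CONFIG) and two synthetic blobs in place of real UMAP embeddings:

import logging
import numpy as np
import hdbscan

logging.basicConfig(level=logging.INFO)
HDBSCAN_PARAMS = {'min_samples': 10}  # assumed stand-in for the GLOBAL_CONFIG values

# two well-separated blobs stand in for a learned UMAP space
umap_embeddings = np.vstack([np.random.randn(500, 2), np.random.randn(500, 2) + 10])
assignments, soft_clusters, soft_assignments = bsoid_hdbscan(umap_embeddings, HDBSCAN_PARAMS)
print('hard labels found:', np.unique(assignments))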
Example 2: hdbscancluster
# Required module: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def hdbscancluster(self, dist, iteration=-1):
    # HDBSCAN clustering on a precomputed distance matrix
    clusterer = hdbscan.HDBSCAN(min_samples=self.args.dbscan_minsample, metric='precomputed')  # min_cluster_size=2,
    labels = clusterer.fit_predict(dist.astype(np.double))
    # select & cluster images as the training set for this iteration
    print('Clustering and labeling...')
    num_ids = len(set(labels)) - 1
    print('Iteration {} has {} training ids'.format(iteration, num_ids))
    # generate new dataset
    new_dataset = []
    new_indices = []
    for (fname, _, _), label, indice in zip(self.traindataset, labels, self.old_indices):
        if label == -1:
            continue
        # no need to change code in trainer.py's _parsing_input function or the sampler function after adding 0
        new_dataset.append((fname, label, indice))
        new_indices.append(indice)
    print('Iteration {} has {} training images'.format(iteration, len(new_dataset)))
    return new_dataset, new_indices
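The method above receives dist from its surrounding re-identification pipeline; for context, a self-contained sketch of the same precomputed-distance pattern, with pairwise Euclidean distances from scipy standing in:

import numpy as np
import hdbscan
from scipy.spatial.distance import pdist, squareform

feat = np.random.randn(200, 128)             # stand-in feature matrix
dist = squareform(pdist(feat, 'euclidean'))  # dense pairwise distance matrix
clusterer = hdbscan.HDBSCAN(min_samples=10, metric='precomputed')
labels = clusterer.fit_predict(dist.astype(np.double))
print('clusters (noise excluded):', len(set(labels)) - (1 if -1 in labels else 0))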
Example 3: _find_k_hdbscan
# Required module: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def _find_k_hdbscan(self, max_dim_clustering=Defaults.MAX_DIM):
    """Use hdbscan to downsample the examples.

    We use optimal k to determine how much to downsample the examples.
    :param max_dim_clustering: Dimensionality threshold for performing reduction.
    :type max_dim_clustering: int
    """
    import hdbscan
    num_rows = self._dataset.shape[0]
    reduced_examples = self._reduce_examples(max_dim_clustering)
    hdbs = hdbscan.HDBSCAN(min_cluster_size=2).fit(reduced_examples)
    clusters = hdbs.labels_
    opt_k = len(set(clusters))
    clustering_threshold = 5
    samples = opt_k * clustering_threshold
    module_logger.info(('found optimal k for hdbscan: {},'
                        ' will use clustering_threshold * k for sampling: {}').format(str(opt_k), str(samples)))
    return min(samples, num_rows)
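The heuristic reads more clearly outside the class: cluster once with a tiny min_cluster_size, treat the number of distinct labels as the optimal k, and cap the sample count at the dataset size. A standalone sketch under those assumptions:

import numpy as np
import hdbscan

dataset = np.random.randn(500, 8)
hdbs = hdbscan.HDBSCAN(min_cluster_size=2).fit(dataset)
opt_k = len(set(hdbs.labels_))  # distinct labels, noise (-1) included
clustering_threshold = 5
n_samples = min(opt_k * clustering_threshold, dataset.shape[0])
print('downsampling to', n_samples, 'rows')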
Example 4: one_sub_cluster
# Required module: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def one_sub_cluster(self, local_data, allow_single_cluster):
    # clustering on one channel or one adjacency
    #~ n_components = min(local_data.shape[1], self.n_components_local_pca)
    #~ pca = sklearn.decomposition.IncrementalPCA(n_components=n_components, whiten=True)
    n_components = min(local_data.shape[1] - 1, self.n_components_local_pca)
    pca = sklearn.decomposition.TruncatedSVD(n_components=n_components)
    local_features = pca.fit_transform(local_data)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=self.min_cluster_size, allow_single_cluster=allow_single_cluster, metric='l2')
    #~ clusterer = hdbscan.HDBSCAN(min_cluster_size=self.min_cluster_size, allow_single_cluster=True)
    #~ clusterer = hdbscan.HDBSCAN(min_cluster_size=100, min_samples=20, allow_single_cluster=True)
    #~ t0 = time.perf_counter()
    #~ local_labels = clusterer.fit_predict(local_data)
    #~ t1 = time.perf_counter()
    #~ print('fit_predict wf', t1-t0)
    #~ t0 = time.perf_counter()
    local_labels = clusterer.fit_predict(local_features)
    #~ t1 = time.perf_counter()
    #~ print('fit_predict pca', t1-t0)
    # isosplit was tried here but is not stable enough on Windows
    #~ local_labels = isosplit5.isosplit5(local_data.T)
    return local_labels
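Outside the class, the same reduce-then-cluster step can be sketched as follows; min_cluster_size=20 and the 5-component SVD are assumed stand-ins for the instance attributes:

import numpy as np
import sklearn.decomposition
import hdbscan

local_data = np.random.randn(400, 30)  # stand-in for waveforms on one channel
n_components = min(local_data.shape[1] - 1, 5)
pca = sklearn.decomposition.TruncatedSVD(n_components=n_components)
local_features = pca.fit_transform(local_data)
clusterer = hdbscan.HDBSCAN(min_cluster_size=20, allow_single_cluster=True, metric='l2')
local_labels = clusterer.fit_predict(local_features)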
Example 5: test_cluster_hdbscan
# Required module: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def test_cluster_hdbscan():
    try:
        from hdbscan import HDBSCAN
        _has_hdbscan = True
    except ImportError:
        _has_hdbscan = False
    if _has_hdbscan:
        hdbscan_labels = cluster(data, cluster='HDBSCAN')
        assert len(set(hdbscan_labels)) == 2
    else:
        with pytest.raises(ImportError):
            hdbscan_labels = cluster(data, cluster='HDBSCAN')
Example 6: generate_combined_topics_hellinger
# Required module: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def generate_combined_topics_hellinger(all_topics, min_samples=5, min_cluster_size=5):
    """Given a large list of topics select out a small list of stable topics
    by clustering the topics with HDBSCAN using Hellinger as a distance
    measure between topics.

    Parameters
    ----------
    all_topics: array of shape (N, n_words)
        The set of topics to be clustered.
    min_samples: int (optional, default=5)
        The min_samples parameter to use for HDBSCAN clustering.
    min_cluster_size: int (optional, default=5)
        The min_cluster_size parameter to use for HDBSCAN clustering.

    Returns
    -------
    stable_topics: array of shape (M, n_words)
        A set of M topics, one for each cluster found by HDBSCAN.
    """
    distance_matrix = all_pairs_hellinger_distance(all_topics)
    labels = hdbscan.HDBSCAN(
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        metric="precomputed",
        cluster_selection_method="leaf",
    ).fit_predict(distance_matrix)
    result = np.empty((labels.max() + 1, all_topics.shape[1]), dtype=np.float32)
    for i in range(labels.max() + 1):
        result[i] = np.mean(np.sqrt(all_topics[labels == i]), axis=0) ** 2
        result[i] /= result[i].sum()
    return result
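all_pairs_hellinger_distance is a helper from the surrounding library and is not shown on this page. A plain-numpy version of what it plausibly computes, included only so the example reads end to end (an assumption, not the library's implementation):

import numpy as np

def all_pairs_hellinger_distance_sketch(topics):
    # Hellinger distance between rows p and q: sqrt(1 - sum(sqrt(p * q)))
    roots = np.sqrt(topics)                    # (N, n_words)
    bc = roots @ roots.T                       # Bhattacharyya coefficients, (N, N)
    return np.sqrt(np.maximum(0.0, 1.0 - bc))  # clamp rounding noise below zero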
Example 7: hdbscan
# Required module: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def hdbscan(feat, min_samples=10):
    import hdbscan
    # note: the min_samples argument is passed as HDBSCAN's min_cluster_size
    db = hdbscan.HDBSCAN(min_cluster_size=min_samples)
    labels_ = db.fit_predict(feat)
    return labels_
Example 8: generate_clusters
# Required module: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def generate_clusters(words, vectors_in_2D, print_status=True):
    # HDBSCAN, i.e. hierarchical density-based spatial clustering of applications with noise (https://github.com/lmcinnes/hdbscan)
    vectors = vectors_in_2D
    sns.set_context('poster')
    sns.set_color_codes()
    plot_kwds = {'alpha': 0.5, 's': 500, 'linewidths': 0}
    clusters = HDBSCAN(min_cluster_size=2).fit_predict(vectors)
    palette = sns.color_palette("husl", np.unique(clusters).max() + 1)
    colors = [palette[cluster_index] if cluster_index >= 0 else (0.0, 0.0, 0.0) for cluster_index in clusters]
    fig = plt.figure(figsize=(30, 30))
    plt.scatter(vectors.T[0], vectors.T[1], c=colors, **plot_kwds)
    plt.axis('off')
    x_vals = [i[0] for i in vectors]
    y_vals = [i[1] for i in vectors]
    plt.ylim(min(y_vals) - 0.3, max(y_vals) + 0.3)
    plt.xlim(min(x_vals) - 0.3, max(x_vals) + 0.3)
    font_path = getcwd() + '/fonts/Comfortaa-Regular.ttf'
    font_property = matplotlib.font_manager.FontProperties(fname=font_path, size=24)
    for i, word in enumerate(words):
        if word is not None:
            if not isinstance(word, str):
                word = unidecode(word).replace("_", " ")
            else:
                word = word.replace("_", " ")
            text_object = plt.annotate(word, xy=(x_vals[i], y_vals[i] + 0.05), font_properties=font_property, color=colors[i], ha="center")
    plt.subplots_adjust(left=(500 / 3000), right=(2900 / 3000), top=1.0, bottom=(300 / 2700))
    plt.savefig(get_visualization_file_path(print_status), bbox_inches="tight")
    return clusters
Example 9: test_DBCV
# Required module: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def test_DBCV(data):
    kmeans = KMeans(n_clusters=2)
    kmeans_labels = kmeans.fit_predict(data)
    hdbscanner = hdbscan.HDBSCAN()
    hdbscan_labels = hdbscanner.fit_predict(data)
    kmeans_score = DBCV.DBCV(data, kmeans_labels, dist_function=euclidean)
    hdbscan_score = DBCV.DBCV(data, hdbscan_labels, dist_function=euclidean)
    assert hdbscan_score > kmeans_score
Example 10: test__mutual_reach_dist_graph
# Required module: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def test__mutual_reach_dist_graph(data):
    target = 0.09872567819414102
    hdbscanner = hdbscan.HDBSCAN()
    hdbscan_labels = hdbscanner.fit_predict(data)
    graph = DBCV._mutual_reach_dist_graph(data, hdbscan_labels,
                                          euclidean)
    assert graph.shape == (data.shape[0], data.shape[0])
    assert abs(graph[0][0] - target) < 0.001
Example 11: write_summary
# Required module: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def write_summary(self, cluster_ids, cluster_class_list, cluster_class_counts, cluster_params):
    (n_components, min_cluster_size, min_samples) = cluster_params
    final_output_fol = self.test_output_file + str(n_components).zfill(3) + "_" + str(min_samples).zfill(3) + "_" + str(min_cluster_size).zfill(3) + "/"
    if not os.path.exists(final_output_fol):
        os.makedirs(final_output_fol)
    num_outliers = len(cluster_ids[cluster_ids == -1])
    num_classes = len(numpy.unique(cluster_ids)) - 1
    duration = ""
    # flatten the per-cluster class lists/counts so each class name pairs with its count
    flat_classes = sum((list(c) for c in cluster_class_list), [])
    flat_counts = sum((list(c) for c in cluster_class_counts), [])
    all_class_list = numpy.unique(flat_classes)
    all_class_counts = [sum(c for c, l in zip(flat_counts, flat_classes) if l == a) for a in all_class_list]
    ### Create summary file:
    text_file_name = final_output_fol + "summary.txt"
    with open(text_file_name, 'w') as F:
        F.write("Cluster method: " + "HDBSCAN" + "\n")
        F.write("n_components: " + str(n_components) + "\n")
        F.write("min_samples: " + str(min_samples) + "\n")
        F.write("min_cluster_size: " + str(min_cluster_size) + "\n")
        F.write("clustering time: " + str(duration) + "\n")
        F.write(" " + "\n")
        F.write("Num Classes: " + str(num_classes) + "\n")
        F.write("Num Outliers: " + str(num_outliers) + "\n")
        F.write(" " + "\n")
        cluster_string = "all_tracks: " + str(len(cluster_ids))
        for name, count in zip(all_class_list, all_class_counts):
            cluster_string += ", " + name + ": " + str(count)
        F.write(cluster_string + "\n")
        F.write(" " + "\n")
        cluster_id_list, cluster_sizes = numpy.unique(cluster_ids, return_counts=True)
        for cluster_id, cluster_size, class_list, class_counts in zip(cluster_id_list, cluster_sizes, cluster_class_list, cluster_class_counts):
            cluster_string = "#: " + str(cluster_id).zfill(3) + ", Size: " + str(cluster_size)
            for name, count in zip(class_list, class_counts):
                cluster_string += ", " + name + ": " + str(count)
            F.write(cluster_string + "\n")
    return
Example 12: hdbscan
# Required module: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def hdbscan(feat, min_samples, **kwargs):
    import hdbscan
    # note: min_samples is used as HDBSCAN's min_cluster_size here
    db = hdbscan.HDBSCAN(min_cluster_size=min_samples)
    labels_ = db.fit_predict(feat)
    return labels_
Example 13: generate_combined_topics_kl
# Required module: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def generate_combined_topics_kl(all_topics, min_samples=5, min_cluster_size=5):
    """Given a large list of topics select out a small list of stable topics
    by clustering the topics with HDBSCAN using KL-divergence as a distance
    measure between topics.

    Parameters
    ----------
    all_topics: array of shape (N, n_words)
        The set of topics to be clustered.
    min_samples: int (optional, default=5)
        The min_samples parameter to use for HDBSCAN clustering.
    min_cluster_size: int (optional, default=5)
        The min_cluster_size parameter to use for HDBSCAN clustering.

    Returns
    -------
    stable_topics: array of shape (M, n_words)
        A set of M topics, one for each cluster found by HDBSCAN.
    """
    divergence_matrix = all_pairs_kl_divergence(all_topics)
    core_divergences = np.sort(divergence_matrix, axis=1)[:, min_samples]
    tiled_core_divergences = np.tile(core_divergences, (core_divergences.shape[0], 1))
    mutual_reachability = np.dstack(
        [
            divergence_matrix,
            divergence_matrix.T,
            tiled_core_divergences,
            tiled_core_divergences.T,
        ]
    ).max(axis=-1)
    mst_data = mst_linkage_core(mutual_reachability)
    mst_order = np.argsort(mst_data.T[2])
    mst_data = mst_data[mst_order]
    single_linkage_tree = label(mst_data)
    labels, probs, stabs, ctree, stree = _tree_to_labels(
        all_topics,
        single_linkage_tree,
        min_cluster_size=min_cluster_size,
        cluster_selection_method="leaf",
    )
    result = np.empty((labels.max() + 1, all_topics.shape[1]), dtype=np.float32)
    for i in range(labels.max() + 1):
        result[i] = np.mean(np.sqrt(all_topics[labels == i]), axis=0) ** 2
        result[i] /= result[i].sum()
    return result
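Likewise, all_pairs_kl_divergence is a library helper not shown here; a plausible plain-numpy rendering, labeled as an assumption rather than the library's code:

import numpy as np

def all_pairs_kl_divergence_sketch(topics, eps=1e-12):
    # KL(p || q) = sum(p * log(p / q)) for every ordered pair of rows
    p = topics[:, None, :] + eps               # (N, 1, n_words)
    q = topics[None, :, :] + eps               # (1, N, n_words)
    return np.sum(p * np.log(p / q), axis=-1)  # (N, N), asymmetric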
Example 14: generate_combined_topics_hellinger_umap
# Required module: import hdbscan [as alias]
# Or: from hdbscan import HDBSCAN [as alias]
def generate_combined_topics_hellinger_umap(
    all_topics, min_samples=5, min_cluster_size=5, n_neighbors=15, reduced_dim=5
):
    """Given a large list of topics select out a small list of stable topics
    by mapping the topics to a low dimensional space with UMAP (using
    Hellinger distance) and then clustering the topics with HDBSCAN using
    Euclidean distance in the embedding space to measure distance between topics.

    Parameters
    ----------
    all_topics: array of shape (N, n_words)
        The set of topics to be clustered.
    min_samples: int (optional, default=5)
        The min_samples parameter to use for HDBSCAN clustering.
    min_cluster_size: int (optional, default=5)
        The min_cluster_size parameter to use for HDBSCAN clustering.
    n_neighbors: int (optional, default=15)
        The n_neighbors value to use with UMAP.
    reduced_dim: int (optional, default=5)
        The dimension of the embedding space to use.

    Returns
    -------
    stable_topics: array of shape (M, n_words)
        A set of M topics, one for each cluster found by HDBSCAN.
    """
    embedding = umap.UMAP(
        n_neighbors=n_neighbors, n_components=reduced_dim, metric=hellinger
    ).fit_transform(all_topics)
    clusterer = hdbscan.HDBSCAN(
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_method="leaf",
        allow_single_cluster=True,
    ).fit(embedding)
    labels = clusterer.labels_
    membership_strengths = clusterer.probabilities_
    result = np.empty((labels.max() + 1, all_topics.shape[1]), dtype=np.float32)
    for i in range(labels.max() + 1):
        mask = labels == i
        result[i] = (
            np.average(
                np.sqrt(all_topics[mask]), axis=0, weights=membership_strengths[mask]
            )
            ** 2
        )
        result[i] /= result[i].sum()
    return result
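A minimal end-to-end sketch for the UMAP variant, assuming umap-learn is installed and using random rows normalized into probability distributions in place of real topics:

import numpy as np

rng = np.random.default_rng(0)
all_topics = rng.random((200, 1000)).astype(np.float32)
all_topics /= all_topics.sum(axis=1, keepdims=True)  # each row sums to 1
stable_topics = generate_combined_topics_hellinger_umap(all_topics)
print(stable_topics.shape)  # (M, 1000), one row per cluster HDBSCAN found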