This article collects typical usage examples of the Python function sklearn.metrics.pairwise_distances. If you have been wondering what pairwise_distances does and how to use it in practice, the curated examples below should help.
Fifteen code examples of the pairwise_distances function are shown, sorted by popularity by default. Each snippet is excerpted from a larger project, so it assumes the relevant imports (for example, import numpy as np and from sklearn.metrics import pairwise_distances).
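For orientation before the excerpts, here is a minimal sketch of the function itself (the toy data is illustrative):

import numpy as np
from sklearn.metrics import pairwise_distances

X = np.array([[0.0, 0.0], [3.0, 4.0]])
D = pairwise_distances(X, metric='euclidean')  # (2, 2) symmetric distance matrix
print(D[0, 1])  # 5.0, the Euclidean distance between the two rows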
Example 1: test_silhouette
def test_silhouette():
# Tests the Silhouette Coefficient.
dataset = datasets.load_iris()
X = dataset.data
y = dataset.target
D = pairwise_distances(X, metric='euclidean')
# Given that the actual labels are used, we can assume that S would be
# positive.
silhouette = silhouette_score(D, y, metric='precomputed')
assert(silhouette > 0)
# Test without calculating D
silhouette_metric = silhouette_score(X, y, metric='euclidean')
assert_almost_equal(silhouette, silhouette_metric)
# Test with sampling
silhouette = silhouette_score(D, y, metric='precomputed',
sample_size=int(X.shape[0] / 2),
random_state=0)
silhouette_metric = silhouette_score(X, y, metric='euclidean',
sample_size=int(X.shape[0] / 2),
random_state=0)
assert(silhouette > 0)
assert(silhouette_metric > 0)
assert_almost_equal(silhouette_metric, silhouette)
# Test with sparse X
X_sparse = csr_matrix(X)
D = pairwise_distances(X_sparse, metric='euclidean')
silhouette = silhouette_score(D, y, metric='precomputed')
assert(silhouette > 0)
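The property the test exercises, namely that a precomputed distance matrix and an on-the-fly metric produce the same score, can be reproduced in a few lines (a minimal sketch using the same sklearn APIs):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import pairwise_distances, silhouette_score

X, y = load_iris(return_X_y=True)
D = pairwise_distances(X, metric='euclidean')
# The two call styles must agree to machine precision.
assert np.isclose(silhouette_score(D, y, metric='precomputed'),
                  silhouette_score(X, y, metric='euclidean'))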
Example 2: _hdbscan_generic
def _hdbscan_generic(X, min_samples=5, alpha=1.0,
metric='minkowski', p=2, leaf_size=None, gen_min_span_tree=False):
if metric == 'minkowski':
if p is None:
raise TypeError('Minkowski metric given but no p value supplied!')
if p < 0:
raise ValueError('Minkowski metric with negative p value is not defined!')
distance_matrix = pairwise_distances(X, metric=metric, p=p)
else:
distance_matrix = pairwise_distances(X, metric=metric)
mutual_reachability_ = mutual_reachability(distance_matrix,
min_samples, alpha)
min_spanning_tree = mst_linkage_core(mutual_reachability_)
if gen_min_span_tree:
result_min_span_tree = min_spanning_tree.copy()
        for index, row in enumerate(result_min_span_tree[1:], 1):
            # Edge rows are float-valued; cast the endpoint to int before indexing.
            right = int(row[1])
            candidates = np.where(np.isclose(mutual_reachability_[right], row[2]))[0]
            candidates = np.intersect1d(candidates,
                                        min_spanning_tree[:index, :2].astype(int))
            candidates = candidates[candidates != right]
            assert len(candidates) > 0
            row[0] = candidates[0]
else:
result_min_span_tree = None
min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
single_linkage_tree = label(min_spanning_tree)
return single_linkage_tree, result_min_span_tree
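The mutual_reachability helper is imported from elsewhere in the library and is not shown here. As a rough standalone sketch of the quantity it computes, assumed from the HDBSCAN papers (max(core_k(a), core_k(b), d(a, b))) rather than taken from the library's implementation:

import numpy as np
from sklearn.metrics import pairwise_distances

def mutual_reachability_sketch(X, min_samples=5):
    # Plain-numpy sketch, not hdbscan's implementation.
    D = pairwise_distances(X, metric='euclidean')
    # Core distance: distance to the min_samples-th neighbor
    # (column 0 of the sorted row is the point itself, at distance 0).
    core = np.sort(D, axis=1)[:, min_samples]
    # Pairwise max(core[i], core[j], D[i, j]) via broadcasting.
    return np.maximum(np.maximum(core[:, None], core[None, :]), D)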
Example 3: _hdbscan_small_kdtree
def _hdbscan_small_kdtree(X, min_cluster_size=5, min_samples=None,
metric='minkowski', p=2):
if metric == 'minkowski':
if p is None:
raise TypeError('Minkowski metric given but no p value supplied!')
if p < 0:
raise ValueError('Minkowski metric with negative p value is not defined!')
distance_matrix = pairwise_distances(X, metric=metric, p=p)
else:
distance_matrix = pairwise_distances(X, metric=metric)
mutual_reachability_ = kdtree_mutual_reachability(X,
distance_matrix,
metric,
p=p,
min_points=min_samples)
min_spanning_tree = mst_linkage_core(mutual_reachability_)
min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
single_linkage_tree = label(min_spanning_tree)
condensed_tree = condense_tree(single_linkage_tree,
min_cluster_size)
stability_dict = compute_stability(condensed_tree)
cluster_list = get_clusters(condensed_tree, stability_dict)
labels = -1 * np.ones(X.shape[0], dtype=int)
for index, cluster in enumerate(cluster_list):
labels[cluster] = index
return labels, condensed_tree, single_linkage_tree, min_spanning_tree
Example 4: smart_initialize
def smart_initialize(data, k, seed=None):
"""
Use k-means++ to initialize a good set of centroids
:param data: whole dataset
:param k: number of centroids
:param seed: random seed
:return: initial centroids
"""
if seed is not None: # useful for obtaining consistent results
np.random.seed(seed)
centroids = np.zeros((k, data.shape[1]))
# Randomly choose the first centroid.
# Since we have no prior knowledge, choose uniformly at random
idx = np.random.randint(data.shape[0])
centroids[0] = data[idx, :].toarray()
# Compute distances from the first centroid chosen to all the other data points
distances = pairwise_distances(data, centroids[0:1], metric='euclidean').flatten()
for i in range(1, k):
        # Choose the next centroid randomly, with the probability of picking each
        # data point proportional to its squared distance from its nearest centroid.
        # Roughly speaking, a new centroid should be as far from the existing
        # centroids as possible.
        weights = distances ** 2
        idx = np.random.choice(data.shape[0], 1, p=weights / weights.sum())
centroids[i] = data[idx, :].toarray()
# Now compute distances from the centroids to all data points
distances = np.min(pairwise_distances(data, centroids[0:i + 1], metric='euclidean'), axis=1)
return centroids
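A hypothetical call, assuming a scipy sparse matrix as input (the function calls .toarray() on the selected rows, so dense input would fail as written):

import numpy as np
from scipy.sparse import csr_matrix

data = csr_matrix(np.random.RandomState(0).rand(100, 20))
centroids = smart_initialize(data, k=3, seed=0)  # (3, 20) array of initial centroids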
Example 5: _rsl_small_kdtree
def _rsl_small_kdtree(X, cut, k=5, alpha=1.4142135623730951, gamma=5, metric='minkowski', p=2):
if metric == 'minkowski':
if p is None:
raise TypeError('Minkowski metric given but no p value supplied!')
if p < 0:
raise ValueError('Minkowski metric with negative p value is not defined!')
distance_matrix = pairwise_distances(X, metric=metric, p=p)
else:
distance_matrix = pairwise_distances(X, metric=metric)
mutual_reachability_ = kdtree_mutual_reachability(X,
distance_matrix,
metric,
p=p,
min_points=k,
alpha=alpha)
min_spanning_tree = mst_linkage_core(mutual_reachability_)
min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
single_linkage_tree = label(min_spanning_tree)
single_linkage_tree = SingleLinkageTree(single_linkage_tree)
labels = single_linkage_tree.get_clusters(cut, gamma)
return labels, single_linkage_tree
Example 6: visualize_class_separation
def visualize_class_separation(X, labels):
_, (ax1,ax2) = pyplot.subplots(ncols=2)
label_order = np.argsort(labels)
ax1.imshow(pairwise_distances(X[label_order]), interpolation='nearest')
ax2.imshow(pairwise_distances(labels[label_order,None]),
interpolation='nearest')
pyplot.show()
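A hypothetical call on iris (requires matplotlib; the left panel shows pairwise feature distances with rows sorted by label, the right panel the ideal label block structure for comparison):

from matplotlib import pyplot
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
visualize_class_separation(X, y)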
Example 7: class_separation
def class_separation(X, labels):
unique_labels, label_inds = np.unique(labels, return_inverse=True)
ratio = 0
    for li in range(len(unique_labels)):
Xc = X[label_inds==li]
Xnc = X[label_inds!=li]
ratio += pairwise_distances(Xc).mean() / pairwise_distances(Xc,Xnc).mean()
return ratio / len(unique_labels)
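Lower ratios indicate tighter, better-separated classes. A minimal usage sketch on iris:

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
print(class_separation(X, y))  # smaller is better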
Example 8: eval
def eval(self, X):
"""Evaluate the kernel density estimation
Parameters
----------
X : array_like
array of points at which to evaluate the KDE. Shape is
(n_points, n_dim), where n_dim matches the dimension of
the training points.
Returns
-------
dens : ndarray
array of shape (n_points,) giving the density at each point.
The density will be normalized for metric='gaussian' or
metric='tophat', and will be unnormalized otherwise.
"""
X = np.atleast_2d(X)
if X.ndim != 2:
raise ValueError('X must be two-dimensional')
if X.shape[1] != self.X_.shape[1]:
raise ValueError('dimensions of X do not match training dimension')
if self.metric == 'gaussian':
# wrangle gaussian into scikit-learn's 'rbf' kernel
gamma = 0.5 / self.h / self.h
D = pairwise_kernels(X, self.X_, metric='rbf', gamma=gamma)
D /= np.sqrt(2 * np.pi * self.h ** (2 * X.shape[1]))
dens = D.sum(1)
elif self.metric == 'tophat':
# use Ball Tree to efficiently count neighbors
bt = BallTree(self.X_)
counts = bt.query_radius(X, self.h,
count_only=True)
dens = counts / n_volume(self.h, X.shape[1])
elif self.metric == 'exponential':
D = pairwise_distances(X, self.X_)
dens = np.exp(-abs(D) / self.h)
dens = dens.sum(1)
dens /= n_volume(self.h, X.shape[1]) * special.gamma(X.shape[1])
elif self.metric == 'quadratic':
D = pairwise_distances(X, self.X_)
dens = (1 - (D / self.h) ** 2)
dens[D > self.h] = 0
dens = dens.sum(1)
dens /= 2. * n_volume(self.h, X.shape[1]) / (X.shape[1] + 2)
else:
D = pairwise_kernels(X, self.X_, metric=self.metric, **self.kwargs)
dens = D.sum(1)
return dens
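The exponential branch can be reproduced without the class scaffolding; X_train, X_query, and h below are stand-ins, and the n_volume normalization is omitted:

import numpy as np
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
X_train, X_query = rng.rand(50, 2), rng.rand(5, 2)
h = 0.1
D = pairwise_distances(X_query, X_train)
dens = np.exp(-D / h).sum(axis=1)  # unnormalized exponential-kernel density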
Example 9: __call__
def __call__(self, X_train, X_test, y_train, y_test):
X = np.vstack([X_train, X_test])
y = np.hstack([y_train, y_test])
unique_labels, label_inds = np.unique(y, return_inverse=True)
ratio = 0
for li in range(len(unique_labels)):
Xc = X[label_inds == li]
Xnc = X[label_inds != li]
ratio += pairwise_distances(Xc).mean() \
/ pairwise_distances(Xc, Xnc).mean()
return -ratio / len(unique_labels)
Example 10: outlier_clusters_ward
def outlier_clusters_ward(x, y, skill=None, memory=None):
# TODO: incorporate skill
data = np.vstack((x, y)).T
if len(data) == 0:
        # No cluster members at all: return a sentinel "cluster".
        print('clustering: NO cluster members!')
cluster_centers = np.array([[-1, -1]])
cluster_labels = []
labels = []
n_clusters = 0
dist_within = np.array([])
elif len(data) == 1:
        print('clustering: only 1 data point!')
cluster_centers = data
cluster_labels = [0]
labels = np.array([0])
n_clusters = 1
dist_within = np.array([0])
else:
dist_within = 1000
dist_max = 75
n_clusters = 0
n_clusters_max = 10
clusterer = AgglomerativeClustering(n_clusters=n_clusters,
memory=memory)
# while dist_within > dist_max, keep adding clusters
        while (dist_within > dist_max) and (n_clusters < n_clusters_max):
# iterate n_clusters
n_clusters += 1
clusterer.set_params(n_clusters=n_clusters)
# cluster
labels = clusterer.fit_predict(data)
# get cluster_centers
            cluster_labels = list(range(n_clusters))
cluster_centers = np.array([np.mean(data[labels == i], axis=0)
for i in cluster_labels])
# find dist_within: the maximum pairwise distance inside a cluster
dist_within = np.max([np.max(pairwise_distances(
data[labels == i]))
for i in cluster_labels])
dist_within_final = np.array([np.max(pairwise_distances(
data[labels == i])) for i in cluster_labels])
return cluster_centers, cluster_labels, labels, n_clusters, dist_within_final
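A hypothetical invocation on random 2-D points (x and y are coordinate arrays, matching the np.vstack((x, y)).T at the top of the function; numpy and sklearn's AgglomerativeClustering and pairwise_distances are assumed imported):

import numpy as np

rng = np.random.RandomState(0)
x, y = rng.rand(30) * 100, rng.rand(30) * 100
centers, cluster_labels, labels, n_clusters, dist_within = outlier_clusters_ward(x, y)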
Example 11: test_precomputed
def test_precomputed(random_state=42):
"""Tests unsupervised NearestNeighbors with a distance matrix."""
# Note: smaller samples may result in spurious test success
rng = np.random.RandomState(random_state)
X = rng.random_sample((10, 4))
Y = rng.random_sample((3, 4))
DXX = metrics.pairwise_distances(X, metric='euclidean')
DYX = metrics.pairwise_distances(Y, X, metric='euclidean')
for method in ['kneighbors']:
# TODO: also test radius_neighbors, but requires different assertion
# As a feature matrix (n_samples by n_features)
nbrs_X = neighbors.NearestNeighbors(n_neighbors=3)
nbrs_X.fit(X)
dist_X, ind_X = getattr(nbrs_X, method)(Y)
# As a dense distance matrix (n_samples by n_samples)
nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='brute',
metric='precomputed')
nbrs_D.fit(DXX)
dist_D, ind_D = getattr(nbrs_D, method)(DYX)
assert_array_almost_equal(dist_X, dist_D)
assert_array_almost_equal(ind_X, ind_D)
# Check auto works too
nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto',
metric='precomputed')
nbrs_D.fit(DXX)
dist_D, ind_D = getattr(nbrs_D, method)(DYX)
assert_array_almost_equal(dist_X, dist_D)
assert_array_almost_equal(ind_X, ind_D)
# Check X=None in prediction
dist_X, ind_X = getattr(nbrs_X, method)(None)
dist_D, ind_D = getattr(nbrs_D, method)(None)
assert_array_almost_equal(dist_X, dist_D)
assert_array_almost_equal(ind_X, ind_D)
# Must raise a ValueError if the matrix is not of correct shape
assert_raises(ValueError, getattr(nbrs_D, method), X)
target = np.arange(X.shape[0])
for Est in (neighbors.KNeighborsClassifier,
neighbors.RadiusNeighborsClassifier,
neighbors.KNeighborsRegressor,
neighbors.RadiusNeighborsRegressor):
print(Est)
est = Est(metric='euclidean')
est.radius = est.n_neighbors = 1
pred_X = est.fit(X, target).predict(Y)
est.metric = 'precomputed'
pred_D = est.fit(DXX, target).predict(DYX)
assert_array_almost_equal(pred_X, pred_D)
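The property under test in compact form (a minimal sketch with the same sklearn APIs):

import numpy as np
from sklearn import neighbors
from sklearn.metrics import pairwise_distances

X = np.random.RandomState(42).rand(10, 4)
D = pairwise_distances(X, metric='euclidean')
nn = neighbors.NearestNeighbors(n_neighbors=3).fit(X)
nn_pre = neighbors.NearestNeighbors(n_neighbors=3, metric='precomputed').fit(D)
# Distances from raw features and from the precomputed matrix must agree.
assert np.allclose(nn.kneighbors(X)[0], nn_pre.kneighbors(D)[0])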
Example 12: find_distance_matrix
def find_distance_matrix(self, metric='cosine'):
        '''
        Compute the distance matrix between topics using cosine or euclidean
        distance (default: cosine).
        '''
if metric == 'cosine':
self.distance_matrix = pairwise_distances(self.topics,
metric='cosine')
# diagonals should be exactly zero, so remove rounding errors
numpy.fill_diagonal(self.distance_matrix, 0)
        elif metric == 'euclidean':
self.distance_matrix = pairwise_distances(self.topics,
metric='euclidean')
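Cosine self-distances can come back as tiny round-off values rather than exact zeros, which is what the fill_diagonal call guards against. A quick illustration with random stand-in data:

import numpy
from sklearn.metrics import pairwise_distances

topics = numpy.random.RandomState(0).rand(4, 10)
D = pairwise_distances(topics, metric='cosine')
print(numpy.diag(D))  # may show values like 2.2e-16 instead of 0.0
numpy.fill_diagonal(D, 0)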
Example 13: update_clfs_M
def update_clfs_M(self, clfs, M):
self.clfs = clfs
self.M = M
        nn = NearestNeighbors(n_neighbors=self.k, algorithm='brute',
                              metric='mahalanobis',
                              metric_params={'VI': self.M}).fit(self.X_train)
        self.knn_test_dist, self.knn_test = nn.kneighbors(self.X_test)
self.preds_train = np.array([e.predict(self.X_train) for e in clfs]).T
self.preds_proba_train = np.array([e.predict_proba(self.X_train) for e in clfs]).swapaxes(0,1)
self.preds_proba_train_smoothed = self.preds_proba_train + 0.01
self.preds_test = np.array([e.predict(self.X_test) for e in clfs]).T
self.preds_proba_test = np.array([e.predict_proba(self.X_test) for e in clfs]).swapaxes(0,1)
        self.pp_train = np.array([pt == yt for pt, yt in zip(self.preds_train, self.y_train)])
        self.pp_test = np.array([pt == yt for pt, yt in zip(self.preds_test, self.y_test)])
self.pd_pp_test = pairwise_distances(self.pp_test, self.pp_train, metric='hamming')
self.pd_preds_test = pairwise_distances(self.preds_test, self.preds_train, metric='hamming')
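With metric='hamming', pairwise_distances returns, for each test/train pair, the fraction of base classifiers on which the two right/wrong patterns disagree. A standalone sketch with toy boolean matrices:

import numpy as np
from sklearn.metrics import pairwise_distances

A = np.array([[True, False, True], [False, False, True]])
B = np.array([[True, True, True]])
print(pairwise_distances(A, B, metric='hamming'))  # [[0.3333...] [0.6667...]]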
Example 14: update_input
def update_input(self, clf):
        preds_train = np.array([e.predict(self.X_train) for e in clf.estimators_]).T
        self.pp_train = np.array([pt == yt for pt, yt in zip(preds_train, self.y_train)])
        preds_test = np.array([e.predict(self.X_test) for e in clf.estimators_]).T
        self.pp_test = np.array([pt == yt for pt, yt in zip(preds_test, self.y_test)])
self.G = np.zeros(self.M.shape)
self.active_set = None
self.ij = []
self.ijl = []
self.loss = np.inf
self.pd_pp = pairwise_distances(self.pp_train, metric='hamming')
np.fill_diagonal(self.pd_pp, np.inf)
self.pd_pp_test = pairwise_distances(self.pp_test, self.pp_train, metric='hamming')
self.step_size = self.alpha
self.step_size_break = False
Example 15: visualize_document_clusters
def visualize_document_clusters(wiki, tf_idf, centroids, cluster_assignment, k,
map_index_to_word, display_content=True):
    '''wiki: original dataframe
    tf_idf: data matrix, in sparse matrix format
    map_index_to_word: SFrame specifying the mapping between words and column indices
    display_content: if True, display the 8 nearest neighbors of each centroid'''
print('==========================================================')
# Visualize each cluster c
    for c in range(k):
        # Cluster heading
        print('Cluster {0:d} '.format(c), end='')
        # Print the top 5 words with the largest TF-IDF weights in the cluster
        idx = centroids[c].argsort()[::-1]
        for i in range(5):  # print each word along with its TF-IDF weight
            print('{0:s}:{1:.3f}'.format(map_index_to_word['category'][idx[i]],
                                          centroids[c, idx[i]]), end=' ')
        print('')
if display_content:
# Compute distances from the centroid to all data points in the cluster,
# and compute nearest neighbors of the centroids within the cluster.
distances = pairwise_distances(tf_idf, [centroids[c]], metric='euclidean').flatten()
distances[cluster_assignment!=c] = float('inf') # remove non-members from consideration
nearest_neighbors = distances.argsort()
# For 8 nearest neighbors, print the title as well as first 180 characters of text.
# Wrap the text at 80-character mark.
            for i in range(8):
text = ' '.join(wiki[nearest_neighbors[i]]['text'].split(None, 25)[0:25])
print('\n* {0:50s} {1:.5f}\n {2:s}\n {3:s}'.format(wiki[nearest_neighbors[i]]['name'],
distances[nearest_neighbors[i]], text[:90], text[90:180] if len(text) > 90 else ''))
print('==========================================================')