This article collects typical usage examples of the pairwise_distances function from Python's sklearn.metrics.pairwise module. If you are wondering what pairwise_distances does, how to call it, or what real-world uses of it look like, the curated code examples below may help.
The following shows 15 code examples of the pairwise_distances function, ordered by popularity.
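Before the examples, a quick orientation (a minimal sketch using only scikit-learn's public API): pairwise_distances(X) returns the square matrix of distances between all rows of X, while pairwise_distances(X, Y) returns the rectangular matrix of distances between the rows of X and the rows of Y; the metric parameter selects the distance function.

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

X = np.array([[0.0, 0.0], [3.0, 4.0]])
Y = np.array([[6.0, 8.0]])

print(pairwise_distances(X))     # [[ 0.  5.]
                                 #  [ 5.  0.]]
print(pairwise_distances(X, Y))  # [[10.]
                                 #  [ 5.]]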
Example 1: test_kneighbors_regressor_sparse
def test_kneighbors_regressor_sparse(n_samples=40,
                                     n_features=5,
                                     n_test_pts=10,
                                     n_neighbors=5,
                                     random_state=0):
    # Test k-neighbors regression on sparse matrices
    # Like the above, but with various types of sparse matrices
    rng = np.random.RandomState(random_state)
    X = 2 * rng.rand(n_samples, n_features) - 1
    y = ((X ** 2).sum(axis=1) < .25).astype(np.int)

    for sparsemat in SPARSE_TYPES:
        knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                            algorithm='auto')
        knn.fit(sparsemat(X), y)

        knn_pre = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                                metric='precomputed')
        knn_pre.fit(pairwise_distances(X, metric='euclidean'), y)

        for sparsev in SPARSE_OR_DENSE:
            X2 = sparsev(X)
            assert_true(np.mean(knn.predict(X2).round() == y) > 0.95)

            X2_pre = sparsev(pairwise_distances(X, metric='euclidean'))
            if issparse(sparsev(X2_pre)):
                assert_raises(ValueError, knn_pre.predict, X2_pre)
            else:
                assert_true(
                    np.mean(knn_pre.predict(X2_pre).round() == y) > 0.95)
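The pattern to note is metric='precomputed': the regressor is fit on a square train-vs-train distance matrix and then queried with a query-vs-train matrix. A minimal sketch, assuming only scikit-learn (data and shapes are illustrative):

import numpy as np
from sklearn import neighbors
from sklearn.metrics.pairwise import pairwise_distances

rng = np.random.RandomState(0)
X = rng.rand(40, 5)
y = rng.rand(40)

knn = neighbors.KNeighborsRegressor(n_neighbors=5, metric='precomputed')
knn.fit(pairwise_distances(X, metric='euclidean'), y)   # (40, 40) train-vs-train
pred = knn.predict(pairwise_distances(X[:3], X))        # (3, 40) query-vs-train
print(pred.shape)                                       # (3,)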
Example 2: make_rbf
def make_rbf(x, sigma, metric='euclidean', x2=None):
    if x.ndim == 1:
        x = np.expand_dims(x, 1)
    if x2 is None:
        x2 = x
    if metric == 'cosine':
        # This code may be faster for some matrices
        # Code from http://stackoverflow.com/questions/17627219/whats-the-fastest-way-in-python-to-calculate-cosine-similarity-given-sparse-mat
        '''
        tic()
        #x = x.toarray()
        #similarity = np.dot(x, x.T)
        similarity = (x.dot(x.T)).toarray()
        square_mag = np.diag(similarity)
        inv_square_mag = 1 / square_mag
        inv_square_mag[np.isinf(inv_square_mag)] = 0
        inv_mag = np.sqrt(inv_square_mag)
        W = similarity * inv_mag
        W = W.T * inv_mag
        W = 1 - W
        toc()
        tic()
        W2 = pairwise.pairwise_distances(x, x, metric)
        toc()
        '''
        W = pairwise.pairwise_distances(x, x2, metric)
    else:
        #tic()
        W = pairwise.pairwise_distances(x, x2, metric)
        #toc()
    W = np.square(W)
    W = -sigma * W
    W = np.exp(W)
    return W
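For the default Euclidean metric, make_rbf computes exp(-sigma * d^2), which coincides with scikit-learn's rbf_kernel when gamma equals sigma. A quick hedged check, assuming make_rbf from above is in scope:

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

x = np.random.RandomState(0).rand(10, 3)
sigma = 0.5
W = make_rbf(x, sigma)  # exp(-sigma * squared Euclidean distance)
assert np.allclose(W, rbf_kernel(x, gamma=sigma))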
Example 3: bipartite_clustering
def bipartite_clustering(D2W, word_cluster_num, doc_cluster_num, metric, criteria):
    W2D = D2W.transpose()
    W2WC = kmean(W2D, word_cluster_num, criteria)
    #word_cluster_num = np.amax(W2WC) + 1
    #print "wc:", word_cluster_num
    for loop in range(4):
        #D2WC = D2W.dot(transform_from_index_array(W2WC, W2WC.size, word_cluster_num))
        #print D2WC
        #print loop
        new_centroids = get_new_centroids(W2D, W2WC)
        new_distance_matrix = pairwise_distances(W2D, new_centroids, metric=metric)  # how to calculate distance? maybe 1-matrix?
        #print new_distance_matrix
        D2WC = D2W.dot(new_distance_matrix)
        if loop == 0:
            D2DC = kmean(D2WC, doc_cluster_num, criteria)
        else:
            new_centroids = get_new_centroids(D2WC, D2DC)
            D2DC = kmean(D2WC, doc_cluster_num, criteria, new_centroids)
        #doc_cluster_num = np.amax(D2DC) + 1
        #print "dc:", doc_cluster_num
        new_centroids = get_new_centroids(D2W, D2DC)
        new_distance_matrix = pairwise_distances(D2W, new_centroids, metric=metric)
        W2DC = W2D.dot(new_distance_matrix)
        new_centroids = get_new_centroids(W2DC, W2WC)
        W2WC = kmean(W2DC, word_cluster_num, criteria, new_centroids)
        #word_cluster_num = np.amax(W2WC) + 1
        #print "wc:", word_cluster_num
    return D2DC, W2WC
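The recurring call here is pairwise_distances(data, centroids, metric=...), which returns an n_samples x n_clusters matrix. A minimal sketch with hypothetical sizes:

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

rng = np.random.RandomState(0)
W2D = rng.rand(100, 20)      # e.g. 100 words described over 20 documents
centroids = rng.rand(8, 20)  # 8 word-cluster centroids

D = pairwise_distances(W2D, centroids, metric='cosine')
print(D.shape)               # (100, 8): one distance per (word, cluster) pair
print(D.argmin(axis=1)[:5])  # nearest centroid for the first five words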
Example 4: generate_dist_stats_feat
def generate_dist_stats_feat(metric, X_train, ids_train, X_test, ids_test, indices_dict):
    ## stats parameters
    quantiles_range = np.arange(0, 1.5, 0.5)
    stats_func = [np.mean, np.std]
    stats_feat_num = len(quantiles_range) + len(stats_func)
    n_class_relevance = 13

    if metric == "cosine":
        stats_feat = 0 * np.ones((len(ids_test), stats_feat_num * n_class_relevance), dtype=float)
        sim = 1. - pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)
    elif metric == "euclidean":
        stats_feat = -1 * np.ones((len(ids_test), stats_feat_num * n_class_relevance), dtype=float)
        sim = pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)
    print("pairwise_distances generated!")

    for i in range(len(ids_test)):
        id = ids_test[i]
        for j in range(n_class_relevance):
            key = j
            if key in indices_dict:
                inds = indices_dict[key]
                # exclude this sample itself from the list of indices
                inds = [ind for ind in inds if id != ids_train[ind]]
                sim_tmp = sim[i][inds]
                if len(sim_tmp) != 0:
                    feat = [func(sim_tmp) for func in stats_func]
                    ## quantile
                    sim_tmp = pd.Series(sim_tmp)
                    quantiles = sim_tmp.quantile(quantiles_range)
                    feat = np.hstack((feat, quantiles))
                    stats_feat[i, j * stats_feat_num:(j + 1) * stats_feat_num] = feat
    return stats_feat
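The line sim = 1. - pairwise_distances(..., metric="cosine") turns cosine distances into cosine similarities; scikit-learn's cosine_similarity produces the same matrix directly. A quick check:

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

rng = np.random.RandomState(0)
A, B = rng.rand(4, 6), rng.rand(3, 6)
sim = 1. - pairwise_distances(A, B, metric="cosine")
assert np.allclose(sim, cosine_similarity(A, B))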
Example 5: dunn
def dunn(max_nc, all_labels, dataset):
    dunn = []
    print "DUNN (MAX)..."
    for nc in xrange(2, max_nc + 1):
        dn = 0.0
        max_intra = 0.0
        for cluster_i in xrange(nc):
            instances_i = dataset[np.where(all_labels[nc - 2] == cluster_i)[0]]
            pairwise_matrix_intra = pairwise_distances(instances_i, n_jobs=1)
            new_max_intra = np.amax(pairwise_matrix_intra)
            if new_max_intra > max_intra:
                max_intra = new_max_intra
        for cluster_i in xrange(nc):
            instances_i = dataset[np.where(all_labels[nc - 2] == cluster_i)[0]]
            for cluster_j in xrange(nc):
                if cluster_j > cluster_i:
                    instances_j = dataset[np.where(all_labels[nc - 2] == cluster_j)[0]]
                    pairwise_matrix_inter = pairwise_distances(instances_i, instances_j, n_jobs=1)
                    min_inter = np.amin(pairwise_matrix_inter)
                    if dn == 0.0:
                        dn = min_inter / max_intra
                    elif min_inter / max_intra < dn:
                        dn = min_inter / max_intra
        print 'DUNN for k = ' + str(nc) + ' is ' + str(dn) + ' ...'
        dunn += [dn]
    return dunn
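The loops above recompute distances per pair of clusters. For a single labelling, the same Dunn index (minimum inter-cluster distance over maximum intra-cluster diameter) can be read off one full distance matrix; a minimal sketch on hypothetical data:

import numpy as np
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
X = rng.rand(30, 2)
labels = rng.randint(0, 3, size=30)

D = pairwise_distances(X)
same = labels[:, None] == labels[None, :]   # True where both points share a cluster
max_intra = D[same].max()                   # largest intra-cluster distance
min_inter = D[~same].min()                  # smallest inter-cluster distance
print(min_inter / max_intra)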
Example 6: pairwise_distances
def pairwise_distances(X, Y=None, index=None, metric="euclidean"):
    '''
    Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and
    returns a distance matrix. If the input is a vector array, the
    distances are computed. If the input is a distance matrix, it is
    returned instead. This method provides a safe way to take a distance
    matrix as input, while preserving compatibility with many other
    algorithms that take a vector array.

    :param X: array [n_samples_a, n_samples_a]
        Array of pairwise distances between samples, or a feature array.
    :param Y: array [n_samples_b, n_features]
        A second feature array, only allowed if X has shape
        [n_samples_a, n_features].
    :param index: int, the index of the reference element in the X array
    :param metric: the metric to use when calculating distances between
        instances in a feature array. If metric == 'rmsd', the distances
        are computed by MDTraj.
    :return: the distances
    '''
    if metric == "rmsd":
        if Y is None:
            distances_ = md.rmsd(X, X, index, parallel=True, precentered=True)
        else:
            #distances_ = np.empty((len(X), len(Y)), dtype=np.float32)
            #for i in xrange(len(Y)):
            distances_ = md.rmsd(X, Y, index, parallel=True, precentered=True)
        return distances_
    else:
        if Y is None:
            print "if Y is None"
            return sp.pairwise_distances(X, X[index], metric=metric)
        if index is None:
            print "if index is None, pairwise XX"
            return sp.pairwise_distances(X, X, metric=metric)
        # Assumed completion: the original excerpt fell through (returning
        # None) when both Y and index were given; X-vs-Y distances are the
        # natural result here.
        return sp.pairwise_distances(X, Y, metric=metric)
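In the non-RMSD branches the wrapper reduces to plain scikit-learn calls. Note that recent scikit-learn releases require 2-D inputs, so the X[index] reference row may need reshaping; a minimal sketch of the Y-is-None path on hypothetical data:

import numpy as np
from sklearn.metrics import pairwise as sp

X = np.random.RandomState(0).rand(6, 3)
d = sp.pairwise_distances(X, X[2].reshape(1, -1), metric="euclidean")
print(d.shape)  # (6, 1): distance from every row of X to row 2
print(d[2, 0])  # 0.0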
Example 7: generate_dist_stats_feat
def generate_dist_stats_feat(metric, X_train, ids_train, X_test, ids_test, indices_dict, qids_test=None):
    # stats_feat_num, n_classes, stats_func and quantiles_range are
    # module-level globals in the source project (compare Example 4).
    if metric == "cosine":
        stats_feat = 0 * np.ones((len(ids_test), stats_feat_num * n_classes), dtype=float)
        sim = 1. - pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)
    elif metric == "euclidean":
        stats_feat = -1 * np.ones((len(ids_test), stats_feat_num * n_classes), dtype=float)
        sim = pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)

    for i in range(len(ids_test)):
        id = ids_test[i]
        if qids_test is not None:
            qid = qids_test[i]
        for j in range(n_classes):
            key = (qid, j + 1) if qids_test is not None else j + 1
            if key in indices_dict:
                inds = indices_dict[key]
                # exclude this sample itself from the list of indices
                inds = [ind for ind in inds if id != ids_train[ind]]
                sim_tmp = sim[i][inds]
                if len(sim_tmp) != 0:
                    feat = [func(sim_tmp) for func in stats_func]
                    ## quantile
                    sim_tmp = pd.Series(sim_tmp)
                    quantiles = sim_tmp.quantile(quantiles_range)
                    feat = np.hstack((feat, quantiles))
                    stats_feat[i, j * stats_feat_num:(j + 1) * stats_feat_num] = feat
    return stats_feat
Example 8: test_no_data_conversion_warning
def test_no_data_conversion_warning():
    # No warnings issued if metric is not a boolean distance function
    rng = np.random.RandomState(0)
    X = rng.randn(5, 4)
    with pytest.warns(None) as records:
        pairwise_distances(X, metric="minkowski")
    assert len(records) == 0
Example 9: trustworthiness
def trustworthiness(X, X_embedded, n_neighbors=5, precomputed=False):
    """Expresses to what extent the local structure is retained.

    The trustworthiness is within [0, 1]. It is defined as

    .. math::

        T(k) = 1 - \frac{2}{nk (2n - 3k - 1)} \sum^{n}_{i=1}
            \sum_{j \in U^{(k)}_i} (r(i, j) - k)

    where :math:`r(i, j)` is the rank of the embedded datapoint j
    according to the pairwise distances between the embedded datapoints,
    :math:`U^{(k)}_i` is the set of points that are in the k nearest
    neighbors in the embedded space but not in the original space.

    * "Neighborhood Preservation in Nonlinear Projection Methods: An
      Experimental Study"
      J. Venna, S. Kaski

    * "Learning a Parametric Embedding by Preserving Local Structure"
      L.J.P. van der Maaten

    Parameters
    ----------
    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
        If the metric is 'precomputed' X must be a square distance
        matrix. Otherwise it contains a sample per row.

    X_embedded : array, shape (n_samples, n_components)
        Embedding of the training data in low-dimensional space.

    n_neighbors : int, optional (default: 5)
        Number of neighbors k that will be considered.

    precomputed : bool, optional (default: False)
        Set this flag if X is a precomputed square distance matrix.

    Returns
    -------
    trustworthiness : float
        Trustworthiness of the low-dimensional embedding.
    """
    if precomputed:
        dist_X = X
    else:
        dist_X = pairwise_distances(X, squared=True)
    dist_X_embedded = pairwise_distances(X_embedded, squared=True)
    ind_X = np.argsort(dist_X, axis=1)
    ind_X_embedded = np.argsort(dist_X_embedded, axis=1)[:, 1:n_neighbors + 1]

    n_samples = X.shape[0]
    t = 0.0
    ranks = np.zeros(n_neighbors)
    for i in range(n_samples):
        for j in range(n_neighbors):
            ranks[j] = np.where(ind_X[i] == ind_X_embedded[i, j])[0][0]
        ranks -= n_neighbors
        t += np.sum(ranks[ranks > 0])
    t = 1.0 - t * (2.0 / (n_samples * n_neighbors *
                          (2.0 * n_samples - 3.0 * n_neighbors - 1.0)))
    return t
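A hedged usage sketch, assuming the trustworthiness helper above is in scope (scikit-learn also ships an equivalent sklearn.manifold.trustworthiness): score how well a 2-D PCA embedding preserves 5-nearest-neighbor structure.

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.rand(50, 10)
X_embedded = PCA(n_components=2).fit_transform(X)
print(trustworthiness(X, X_embedded, n_neighbors=5))  # closer to 1.0 is better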
Example 10: test_radius_neighbors
def test_radius_neighbors():
    # Checks whether the returned distances are less than `radius`.
    # At least one point should be returned when the `radius` is set
    # to the mean distance from the query point to the other points in
    # the database.
    # Moreover, this test compares the radius neighbors of LSHForest
    # with `sklearn.neighbors.NearestNeighbors`.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)
        assert_equal(neighbors.shape, (1,))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)

        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)

    # dists and inds should not be 1D arrays or arrays of variable lengths
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries,))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries,))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
Example 11: getSimMat
def getSimMat(self, type='euclidean', ftr_type='data', orderFlag=True, pca_dim=20):
    if ftr_type == 'ftr':
        # use input features
        self.slctData = [ts for ts in self.slctData
                         if ((ts.ftr is not None) and (len(ts.ftr) > 0))]
        dataMat = [ts.ftr for ts in self.slctData]
    elif ftr_type == 'data':
        # use input data
        dataMat = [ts.val for ts in self.slctData]
    else:
        print 'unknown ftr_type for ftr_type:', ftr_type

    if pca_dim > len(dataMat):
        pca_dim = int(math.ceil(len(dataMat) / 2.0))

    if type == 'euclidean':  # Euclidean distance based on time-series data
        self.simMat = skmpw.euclidean_distances(dataMat)
    elif type == 'pca_euc':  # extract features with PCA, then use Euclidean distance
        pca = skd.PCA(n_components=pca_dim)
        dataMat = pca.fit_transform(dataMat)
        self.simMat = skmpw.euclidean_distances(dataMat)
    elif type == 'nmf_euc':  # extract features with NMF, then use Euclidean distance
        nmf = skd.NMF(n_components=pca_dim)
        dataMat = nmf.fit_transform(dataMat)
        self.simMat = skmpw.euclidean_distances(dataMat)
    elif type == 'ica_euc':  # extract features with ICA, then use Euclidean distance
        ica = skd.FastICA(n_components=pca_dim)
        dataMat = ica.fit_transform(dataMat)
        self.simMat = skmpw.euclidean_distances(dataMat)
    elif type == 'cosine':
        self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
    elif type == 'pca_cos':  # extract features with PCA, then use cosine distance
        pca = skd.PCA(n_components=pca_dim)
        dataMat = pca.fit_transform(dataMat)
        self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
    elif type == 'nmf_cos':  # extract features with NMF, then use cosine distance
        nmf = skd.NMF(n_components=pca_dim)
        dataMat = nmf.fit_transform(dataMat)
        self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
    elif type == 'ica_cos':  # extract features with ICA, then use cosine distance
        ica = skd.FastICA(n_components=pca_dim)
        dataMat = ica.fit_transform(dataMat)
        self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
    else:
        print 'unknown type for similarity matrix: ', type

    # rearrange the order of data in simMat
    self.slctDataMat = dataMat
    if orderFlag:
        link = spc.hierarchy.linkage(self.simMat)
        dend = spc.hierarchy.dendrogram(link, no_plot=True)
        order = dend['leaves']
        self.slctData = [self.slctData[i] for i in order]  # rearrange order
        self.simMat = [self.simMat[i] for i in order]
        for i in xrange(len(self.simMat)):
            self.simMat[i] = [self.simMat[i][j] for j in order]
        self.slctDataMat = [self.slctDataMat[i] for i in order]
    # self.patchOrdering = [ts.ptchNm for ts in self.slctData]  # record new ordering
    self.patchOrdering = JSONifyData(self.slctData)  # Deok wants all the data for each patch in the response
    self.clstData = self.slctData
    self.clstSimMat = self.simMat
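A minimal standalone sketch of the 'pca_cos' branch on hypothetical data, using the same skd/skmpw aliases as the example (sklearn.decomposition and sklearn.metrics.pairwise):

import numpy as np
import sklearn.decomposition as skd
import sklearn.metrics.pairwise as skmpw

dataMat = np.random.RandomState(0).rand(50, 200)   # 50 series, 200 time steps
reduced = skd.PCA(n_components=20).fit_transform(dataMat)
simMat = skmpw.pairwise_distances(reduced, metric='cosine')
print(simMat.shape)                                # (50, 50)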
Example 12: predict
def predict(dialogue_session, line):
    lowest = ('x', 1)
    data = dataDict[dialogue_session][1][line, :]
    for vector in vDict:
        predictor = vDict[vector]
        if pair.pairwise_distances(predictor, data, 'cosine') < lowest[1]:
            lowest = (vector, pair.pairwise_distances(predictor, data, 'cosine'))
    return lowest
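If vDict holds many prototypes, the two pairwise_distances calls per candidate can be replaced with a single vectorized call; a hedged sketch, assuming (as the loop above suggests) that vDict maps names to 1-D vectors and data is the 1-D row from the example:

import numpy as np
from sklearn.metrics import pairwise as pair

keys = list(vDict)
P = np.vstack([vDict[k] for k in keys])   # (n_prototypes, n_features)
d = pair.pairwise_distances(P, data.reshape(1, -1), metric='cosine').ravel()
best = keys[int(d.argmin())]              # nearest prototype in one pass
print(best, d.min())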
Example 13: cramer_statistic
def cramer_statistic(self, n_jobs=1):
    '''
    Applies the Cramer statistic to the datasets.

    Parameters
    ----------
    n_jobs : int, optional
        Sets the number of cores to use to calculate
        pairwise distances. Default is 1.
    '''
    # Adjust what we call n, m based on the larger dimension.
    # Then the looping below is valid.
    if self.data_matrix1.shape[0] >= self.data_matrix2.shape[0]:
        m = self.data_matrix1.shape[0]
        n = self.data_matrix2.shape[0]
        larger = self.data_matrix1
        smaller = self.data_matrix2
    else:
        n = self.data_matrix1.shape[0]
        m = self.data_matrix2.shape[0]
        larger = self.data_matrix2
        smaller = self.data_matrix1

    pairdist11 = pairwise_distances(larger, metric="euclidean",
                                    n_jobs=n_jobs)
    pairdist22 = pairwise_distances(smaller, metric="euclidean",
                                    n_jobs=n_jobs)
    pairdist12 = pairwise_distances(larger, smaller,
                                    metric="euclidean", n_jobs=n_jobs)

    # Take the square root of each distance.
    # We default to using the Cramer kernel in Baringhaus & Franz (2004):
    #     \phi(dist) = sqrt(dist) / 2.
    # The normalization values below reflect this.
    pairdist11 = np.sqrt(pairdist11)
    pairdist12 = np.sqrt(pairdist12)
    pairdist22 = np.sqrt(pairdist22)

    term1 = 0.0
    term2 = 0.0
    term3 = 0.0
    for i in range(m):
        for j in range(n):
            term1 += pairdist12[i, j]
        for ii in range(m):
            term2 += pairdist11[i, ii]
        if i < n:
            for jj in range(n):
                term3 += pairdist22[i, jj]

    m, n = float(m), float(n)
    term1 *= (1 / (m * n))
    term2 *= (1 / (2 * m ** 2.))
    term3 *= (1 / (2 * n ** 2.))

    self._distance = (m * n / (m + n)) * (term1 - term2 - term3)
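Because term1, term2, and term3 are plain sums over full distance blocks, the triple loop collapses to three means; a hedged equivalent sketch on hypothetical data:

import numpy as np
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
larger, smaller = rng.rand(8, 3), rng.rand(5, 3)
d11 = np.sqrt(pairwise_distances(larger))
d22 = np.sqrt(pairwise_distances(smaller))
d12 = np.sqrt(pairwise_distances(larger, smaller))

m, n = float(len(larger)), float(len(smaller))
# term1 = d12.mean(); term2 = d11.mean() / 2; term3 = d22.mean() / 2
statistic = (m * n / (m + n)) * (d12.mean() - d11.mean() / 2. - d22.mean() / 2.)
print(statistic)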
Example 14: run_step
def run_step(self, run_number, step_size, howlong):
    dfslot = self.get_input_slot("df")
    df = dfslot.data()
    dfslot.update(run_number)
    if dfslot.has_updated() or dfslot.has_deleted():
        dfslot.reset()
        logger.info("Resetting history because of changes in the input df")
        dfslot.update(run_number, df)
        # TODO: be smarter with changed values

    m = step_size
    indices = dfslot.next_created(m)
    m = indices_len(indices)

    i = None
    j = None
    Si = self._buf.matrix()

    arrayslot = self.get_input_slot("array")
    if arrayslot is not None and arrayslot.data() is not None:
        array = arrayslot.data()
        logger.debug("Using array instead of DataFrame columns")
        if Si is not None:
            i = array[self._last_index]
        j = array[indices]
    if j is None:
        if self.columns is None:
            self.columns = df.columns.delete(np.where(df.columns == Module.UPDATE_COLUMN))
        elif not isinstance(self.columns, pd.Index):
            self.columns = pd.Index(self.columns)
        rows = df[self.columns]
        if Si is not None:
            i = rows.loc[self._last_index]
            assert len(i) == len(self._last_index)
        j = rows.loc[fix_loc(indices)]
        assert len(j) == indices_len(indices)

    Sj = pairwise_distances(j, metric=self._metric, n_jobs=self._n_jobs)
    if Si is None:
        mat = self._buf.resize(Sj.shape[0])
        mat[:, :] = Sj
        self._last_index = dfslot.last_index[indices]
    else:
        Sij = pairwise_distances(i, j, metric=self._metric, n_jobs=self._n_jobs)
        n0 = i.shape[0]
        n1 = n0 + j.shape[0]
        mat = self._buf.resize(n1)
        mat[0:n0, n0:n1] = Sij
        mat[n0:n1, 0:n0] = Sij.T
        mat[n0:n1, n0:n1] = Sj
        self._last_index = self._last_index.append(df.index[indices])
        # truth = pairwise_distances(array[0:n1], metric=self._metric)
        # import pdb
        # pdb.set_trace()
        # assert np.allclose(mat, truth)
    return self._return_run_step(dfslot.next_state(), steps_run=m)
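The block assignments above implement an incremental update of a symmetric distance matrix: only the new-vs-old and new-vs-new blocks are computed as rows arrive. A minimal standalone sketch of the same idea, with hypothetical arrays in place of the module's slots:

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

rng = np.random.RandomState(0)
old = rng.rand(5, 3)   # rows already in the matrix
new = rng.rand(2, 3)   # newly arrived rows

S_old = pairwise_distances(old)          # (5, 5)
S_new = pairwise_distances(new)          # (2, 2)
S_cross = pairwise_distances(old, new)   # (5, 2)

n0, n1 = len(old), len(old) + len(new)
mat = np.empty((n1, n1))
mat[:n0, :n0] = S_old
mat[:n0, n0:] = S_cross
mat[n0:, :n0] = S_cross.T
mat[n0:, n0:] = S_new

# The assembled matrix matches a full recomputation.
assert np.allclose(mat, pairwise_distances(np.vstack([old, new])))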
Example 15: test_radius_neighbors
def test_radius_neighbors():
    """Checks whether the returned distances are less than `radius`.

    At least one point should be returned when the `radius` is set
    to the mean distance from the query point to the other points in
    the database.
    Moreover, this test compares the radius neighbors of LSHForest
    with `sklearn.neighbors.NearestNeighbors`.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        query = X[rng.randint(0, n_samples)]
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)
        # At least one neighbor should be returned.
        assert_greater(neighbors.shape[0], 0)

        # All distances should be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # dists and inds should not be 2D arrays
    assert_equal(distances.ndim, 1)
    assert_equal(neighbors.ndim, 1)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)]
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
    nbrs.fit(X)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    # Distances of exact neighbors are less than or equal to approximate
    assert_true(np.all(np.less_equal(np.sort(distances_exact[0]),
                                     np.sort(distances_approx[0]))))