This article collects typical usage examples of the Python function sklearn.metrics.pairwise.cosine_distances. If you are unsure what cosine_distances does or how to use it, the curated code examples below should help.
The following presents 15 code examples of cosine_distances, ordered by popularity by default.
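Before the examples, a minimal sketch of what cosine_distances computes; the tiny two-row input is invented for illustration:

import numpy as np
from sklearn.metrics.pairwise import cosine_distances

# cosine distance = 1 - cosine similarity, so values range from 0 (same
# direction) to 2 (opposite direction), independent of vector length.
X = np.array([[1.0, 0.0],
              [-1.0, 0.0]])
print(cosine_distances(X))
# [[0. 2.]
#  [2. 0.]]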
Example 1: test_cosine_distances
import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn.metrics.pairwise import cosine_distances

def test_cosine_distances():
    # Check the pairwise Cosine distances computation
    rng = np.random.RandomState(1337)
    x = np.abs(rng.rand(910))
    XA = np.vstack([x, x])
    D = cosine_distances(XA)
    assert_array_almost_equal(D, [[0., 0.], [0., 0.]])
    # check that all elements are in [0, 2]
    assert np.all(D >= 0.)
    assert np.all(D <= 2.)
    # check that diagonal elements are equal to 0
    assert_array_almost_equal(D[np.diag_indices_from(D)], [0., 0.])

    XB = np.vstack([x, -x])
    D2 = cosine_distances(XB)
    # check that all elements are in [0, 2]
    assert np.all(D2 >= 0.)
    assert np.all(D2 <= 2.)
    # check that diagonal elements are equal to 0 and non-diagonal to 2
    assert_array_almost_equal(D2, [[0., 2.], [2., 0.]])

    # check large random matrix
    X = np.abs(rng.rand(1000, 5000))
    D = cosine_distances(X)
    # check that diagonal elements are equal to 0
    assert_array_almost_equal(D[np.diag_indices_from(D)], [0.] * D.shape[0])
    assert np.all(D >= 0.)
    assert np.all(D <= 2.)
Example 2: get_features
import numpy as np
from time import time
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances

# headlines, bodies and n_topics are module-level globals in the source project.
def get_features(vocab):
    vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
    X_train_head = vectorizer_head.fit_transform(headlines)

    vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
    X_train_body = vectorizer_body.fit_transform(bodies)

    # Calculates the n most important topics of the bodies. Each topic contains
    # all words, ordered by importance. The more important topic words of a
    # certain topic a body contains, the higher its value for this topic.
    # (n_topics was renamed to n_components in newer scikit-learn releases.)
    lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online',
                                         random_state=0, n_jobs=3)

    print("latent_dirichlet_allocation_cos: fit and transform body")
    t0 = time()
    lda_body_matrix = lda_body.fit_transform(X_train_body)
    print("done in %0.3fs." % (time() - t0))

    print("latent_dirichlet_allocation_cos: transform head")
    # Use the LDA trained on body topics on the headlines => if the headlines
    # and bodies share topics, their vectors should be similar.
    lda_head_matrix = lda_body.transform(X_train_head)

    #print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)

    print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body')
    # calculate cosine distance between the body and head
    X = []
    for i in range(len(lda_head_matrix)):
        X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1))  # 1d arrays are deprecated as input
        X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1))
        cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
        X.append(cos_dist.tolist())
    return X
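A hypothetical call site for the function above; the headlines, bodies and n_topics values are invented for illustration, and the n_topics argument ties this snippet to scikit-learn releases older than 0.21:

# Hypothetical data; the source project loads these from its dataset.
headlines = ["stock markets rally", "team wins championship"]
bodies = ["markets rallied strongly across europe today",
          "the team won the final match in overtime"]
n_topics = 2

vocab = sorted(set(" ".join(headlines + bodies).split()))
features = get_features(vocab)  # one [cosine_distance] list per headline/body pair
print(features)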
Example 3: sumACluster
from sklearn.metrics import pairwise

def sumACluster(dist, vecsIn, topK_t, sameTweetThred):
    if dist == "cosine":
        distMatrix = pairwise.cosine_distances(vecsIn)
    elif dist == "eu":
        distMatrix = pairwise.euclidean_distances(vecsIn, vecsIn)

    # Group near-duplicate tweets: a vector joins the first cluster whose
    # representative (first member) is within sameTweetThred of it.
    sameTweetClusters = [[0]]
    for seqid, text in enumerate(vecsIn[1:], start=1):
        added = None
        for stcid, stc in enumerate(sameTweetClusters):
            if distMatrix[seqid][stc[0]] <= sameTweetThred:
                stc.append(seqid)
                added = (stcid, stc)
                break
        if added is None:
            sameTweetClusters.append([seqid])
        else:
            sameTweetClusters[added[0]] = added[1]

    # Rank clusters by size and keep the topK_t largest, reported as
    # (representative tweet id, cluster size) pairs.
    sameTweetClusterNum = [(stcid, len(stc)) for stcid, stc in enumerate(sameTweetClusters)]
    numIn = len(sameTweetClusterNum)
    top = sorted(sameTweetClusterNum, key=lambda a: a[1], reverse=True)[:min(topK_t, numIn)]
    top = [(sameTweetClusters[item[0]][0], item[1]) for item in top]
    return top
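A small usage sketch, assuming the sumACluster definition above; the vectors and the 0.05 threshold are illustrative values:

import numpy as np

vecs = np.array([[1.0, 0.0],
                 [0.99, 0.01],   # near-duplicate of the first vector
                 [0.0, 1.0]])
# Returns up to topK_t (representative index, cluster size) pairs.
print(sumACluster("cosine", vecs, topK_t=2, sameTweetThred=0.05))
# e.g. [(0, 2), (2, 1)]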
Example 4: test_linkage_misc
import warnings
import numpy as np
# Import paths as in the older scikit-learn these tests were written against.
from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration, Ward
from sklearn.cluster.hierarchical import linkage_tree
from sklearn.metrics.pairwise import cosine_distances, manhattan_distances
from sklearn.utils.testing import assert_equal, assert_raises, assert_array_equal

def test_linkage_misc():
    # Misc tests on linkage
    X = np.ones((5, 5))
    assert_raises(ValueError,
                  AgglomerativeClustering(linkage='foobar').fit,
                  X)
    assert_raises(ValueError, linkage_tree, X, linkage='foobar')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        # Use the copy argument, to raise a warning
        Ward(copy=True).fit(X)
    # We should be getting 2 warnings: one for using Ward that is
    # deprecated, one for using the copy argument
    assert_equal(len(warning_list), 2)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
Example 5: test_linkage_misc
# Imports as in Example 4 (older scikit-learn).
def test_linkage_misc():
    # Misc tests on linkage
    rnd = np.random.RandomState(42)
    X = rnd.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # Deprecation of Ward class
    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        Ward().fit(X)
    assert_equal(len(warning_list), 1)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
Example 6: getModelInfo
from sklearn.metrics.pairwise import cosine_distances

def getModelInfo(model, features):
    print("Shape of the transformed features = {}".format(features.shape))
    # Uncomment to inspect the vocabulary and term frequencies (izip is Python 2's
    # itertools.izip; use zip on Python 3):
    # vocab = model.get_feature_names()
    # dist = np.sum(features, axis=0)
    # for tag, count in izip(vocab, dist):
    #     print("word = {}, frequency = {}".format(tag, count))
    return cosine_distances(features)
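A possible call site for getModelInfo, assuming a TF-IDF setup like the one below; the documents are invented for illustration:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat",
        "the dog sat on the log",
        "an entirely different sentence"]
model = TfidfVectorizer()
features = model.fit_transform(docs)
dist = getModelInfo(model, features)  # prints the shape, returns a (3, 3) distance matrix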
Example 7: _build_metastore
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

# Method of a class that stores its data as self.X with self.dim columns
# and row labels in self.index.
def _build_metastore(self):
    medians = np.median(self.X, axis=0).reshape(1, self.dim)
    # how far each data point is from the global median
    dists = cosine_distances(self.X, Y=medians).reshape(-1)
    sorted_index = [self.index[i] for i in dists.argsort()]
    return {'sorted_index': sorted_index}
Example 8: calcurate_centroid_Matrix
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

def calcurate_centroid_Matrix(veclist, word2vecdic, DimentionN):
    centroid_Matrix = np.zeros((DimentionN, 200))
    distance_arrays = np.zeros(DimentionN)
    # Sum the 200-d word vectors belonging to each cluster label.
    for word in veclist:
        label = word2vecdic[word]
        centroid_Matrix[label] += veclist[word]
    # Accumulate each word's cosine distance to its cluster centroid.
    # cosine_distances expects 2-D inputs, hence the reshapes.
    for word in veclist:
        label = word2vecdic[word]
        distance_arrays[label] += cosine_distances(veclist[word].reshape(1, -1),
                                                   centroid_Matrix[label].reshape(1, -1))[0, 0]
    return centroid_Matrix, distance_arrays
Example 9: memory_cf
import numpy as np
from itertools import islice, ifilter  # Python 2; use filter() on Python 3
from scipy.sparse import issparse
from sklearn.metrics.pairwise import cosine_distances

# rating_matrix_orig, entry_set, mean and weighted_mean are defined elsewhere
# in the source module.
def memory_cf(users, movies, k, similarity_measure, weight_schema,
              repr_matrix=rating_matrix_orig, rating_matrix=rating_matrix_orig):
    """
    Memory-based collaborative filtering.

    :param users: a user list.
    :param movies: a movie list.
    :param k: number of nearest users
    :param similarity_measure: 'cosine' or 'dot_product'
    :param weight_schema: 'mean' or 'weighted_mean'
    :param repr_matrix: data point representation
    :param rating_matrix: ratings based on user-movie or cluster centroids
    :return: recommended ratings for the queries
    """
    # construct mapping between input users and unique users
    ratings, user_unique = [], list(set(users))
    user_index_map = dict((u, i) for i, u in enumerate(user_unique))
    users = [(u, user_index_map[u]) for u in users]

    # find the k nearest neighbors for each user
    if similarity_measure == 'cosine':
        dist = cosine_distances(repr_matrix[user_unique, :], repr_matrix)
        sims = 1 - dist
    elif similarity_measure == 'dot_product':
        sims = repr_matrix[user_unique, :].dot(repr_matrix.T)
        if issparse(sims):
            sims = sims.toarray()
        dist = -sims
    sorted_neighbors = np.argsort(dist, axis=1)

    # make rating matrix dense for fast access
    rating_matrix = rating_matrix.toarray()

    weight_method = mean if weight_schema == 'mean' else weighted_mean
    for (user_index, neighbor_index), movie in zip(users, movies):
        # keep only neighbors who actually rated this movie
        neighbors = list(islice(ifilter(lambda u: (u, movie) in entry_set,
                                        sorted_neighbors[neighbor_index]),
                                k + 1))
        # no neighbors, regarded as 3
        if not neighbors:
            ratings.append(3)
            continue
        # exclude itself
        if user_index in neighbors:
            neighbors.remove(user_index)
        rating = weight_method(rating_matrix[neighbors, movie],
                               sims[neighbor_index, neighbors])
        ratings.append(rating)
    return ratings
Example 10: get_sparse_dist_matrix
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_distances

def get_sparse_dist_matrix(tweets_tfidf_matrix, eps):
    """Get the sparse distance matrix from the pairwise cosine distance
    computations from the given tfidf vectors. Only distances less than or
    equal to eps are put into the matrix."""
    rows = []
    cols = []
    data = []
    for ndx, tweet in enumerate(tweets_tfidf_matrix):
        rows.append(len(cols))  # CSR index pointer for this row
        distances = cosine_distances(tweet, tweets_tfidf_matrix)[0]
        for other_ndx, dist in enumerate(distances):
            if ndx != other_ndx and dist <= eps:
                cols.append(other_ndx)
                data.append(dist)
    rows.append(len(cols))  # the CSR indptr needs a final, closing entry
    # dtype must be a float type; an integer dtype would truncate the distances.
    return csr_matrix((data, cols, rows), dtype=float)
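A hedged usage sketch: an eps-thresholded sparse distance matrix like this is typically fed to DBSCAN with metric='precomputed'. The tweets and eps value below are invented for illustration:

from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

tweets = ["rain in london today", "london rain today", "sunny in madrid"]
tfidf = TfidfVectorizer().fit_transform(tweets)

eps = 0.5
dist = get_sparse_dist_matrix(tfidf, eps)
# Entries absent from the sparse matrix are treated as distances greater than eps.
labels = DBSCAN(eps=eps, min_samples=2, metric="precomputed").fit_predict(dist)
print(labels)  # e.g. [0 0 -1]: two near-duplicates cluster, the third is noise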
Example 11: test_linkage_misc
# Imports as in Example 4 (older scikit-learn); this variant predates the
# Ward deprecation checks.
def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
Example 12: cluster_cf_memory
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

# k, k_user, users, movies, similarity_measure, weight_schema, user_belonging,
# rating_matrix_orig, m2uc, mean and weighted_mean come from the enclosing scope.
def cluster_cf_memory():
    """
    Cluster-based memory CF.
    """
    rating_matrix_cluster = np.empty([k_user, rating_matrix_orig.shape[1]],
                                     dtype=np.float64)
    # build rating matrix for each user cluster, on each movie
    for i in range(k_user):
        cluster_indicator = np.where(user_belonging == i)[0]
        rating_cluster = rating_matrix_orig[cluster_indicator, :]
        rating_sum = rating_cluster.sum(axis=0)
        # take the average by dividing by the rating count
        rating_cluster.data = np.ones(len(rating_cluster.data))
        mu = rating_sum / rating_cluster.sum(axis=0)
        # fill 0 for nan
        mu[np.isnan(mu)] = 0
        rating_matrix_cluster[i, :] = mu

    # construct mapping between input users and unique users
    ratings, user_unique = [], list(set(users))
    user_index_map = dict((u, i) for i, u in enumerate(user_unique))
    users_neighbors = [user_index_map[u] for u in users]

    if similarity_measure == 'cosine':
        dist = cosine_distances(rating_matrix_orig[user_unique, :], m2uc.T)
        sims = 1 - dist
    else:
        sims = rating_matrix_orig[user_unique, :].dot(m2uc).toarray()
        dist = -sims
    nearest_neighbors = np.argpartition(dist, k, axis=1)[:, :k]

    weight_method = mean if weight_schema == 'mean' else weighted_mean
    for neighbor_index, movie in zip(users_neighbors, movies):
        neighbors = nearest_neighbors[neighbor_index]
        rating = weight_method(rating_matrix_cluster[neighbors, movie],
                               sims[neighbor_index, neighbors])
        ratings.append(rating)
    return ratings
Example 13: plot_mds
import numpy as np
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import train_test_split

# plot_embedding is a plotting helper defined elsewhere in the source module.
def plot_mds(points, genres, n_points=500):
    '''
    Plots a set of documents in MDS space.

    Args:
        points: dense array with coordinates of each document
        genres: list of genres for each entry in points
    Returns:
        None
    '''
    genres = np.array(genres)
    genre_sel = np.not_equal(genres, None)
    X, y = points[genre_sel], genres[genre_sel]
    # Stratified subsample: keep the genre distribution at n_points documents.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, train_size=n_points)
    distances = cosine_distances(X_train, X_train)
    mds = MDS(n_components=2, dissimilarity='precomputed')
    mds.fit(distances)
    plot_embedding(mds.embedding_, y_train)
Example 14: test_fp16_cosine_metric
import numpy
from libKMCUDA import kmeans_cuda
from sklearn.metrics.pairwise import cosine_distances

# Method of a libKMCUDA unittest class; self.stdout and self._get_iters_number
# are defined on the test case.
def test_fp16_cosine_metric(self):
    # 10000 unit vectors on the circle, stored as float16.
    arr = numpy.empty((10000, 2), dtype=numpy.float16)
    angs = numpy.random.rand(10000) * 2 * numpy.pi
    for i in range(10000):
        arr[i] = numpy.sin(angs[i]), numpy.cos(angs[i])
    with self.stdout:
        centroids, assignments = kmeans_cuda(
            arr, 4, init="kmeans++", metric="cos", device=1, verbosity=2,
            seed=3)
    self.assertEqual(self._get_iters_number(self.stdout), 5)
    self.assertEqual(len(centroids), 4)
    for c in centroids:
        norm = numpy.linalg.norm(c)
        self.assertTrue(0.9995 < norm < 1.0005)
    # The four centroids should be pairwise orthogonal or opposite:
    # rounded cosine distances of 1 and 2 respectively.
    dists = numpy.round(cosine_distances(centroids)).astype(int)
    self.assertTrue((dists == [
        [0, 2, 1, 1],
        [2, 0, 1, 1],
        [1, 1, 0, 2],
        [1, 1, 2, 0],
    ]).all())
    self.assertEqual(numpy.min(assignments), 0)
    self.assertEqual(numpy.max(assignments), 3)
Example 15: cosine_similarity
from sklearn.metrics.pairwise import cosine_distances

def cosine_similarity(vector_a, vector_b):
    # cosine similarity is simply 1 minus the cosine distance
    return 1 - cosine_distances(vector_a, vector_b)
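For reference, a quick check of this wrapper; note that scikit-learn also ships its own sklearn.metrics.pairwise.cosine_similarity, which this helper mirrors:

import numpy as np

a = np.array([[1.0, 0.0]])
b = np.array([[0.0, 1.0]])
print(cosine_similarity(a, b))  # [[0.]] -- orthogonal vectors
print(cosine_similarity(a, a))  # [[1.]] -- identical vectors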