当前位置: 首页>>代码示例>>Python>>正文


Python pairwise.cosine_distances函数代码示例

本文整理汇总了Python中sklearn.metrics.pairwise.cosine_distances函数的典型用法代码示例。如果您正苦于以下问题:Python cosine_distances函数的具体用法?Python cosine_distances怎么用?Python cosine_distances使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了cosine_distances函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_cosine_distances

def test_cosine_distances():
    """Check ``cosine_distances`` on identical, opposite and random rows."""
    rng = np.random.RandomState(1337)
    base = np.abs(rng.rand(910))

    def assert_within_bounds(dist):
        # Every cosine distance is expected to lie in [0, 2].
        assert np.all(dist >= 0.)
        assert np.all(dist <= 2.)

    # Two copies of the same vector: all four distances must vanish.
    same = cosine_distances(np.vstack([base, base]))
    assert_array_almost_equal(same, [[0., 0.], [0., 0.]])
    assert_within_bounds(same)
    # The diagonal in particular has to be zero.
    assert_array_almost_equal(np.diag(same), [0., 0.])

    # A vector and its negation: zero on the diagonal, 2 off the diagonal.
    opposite = cosine_distances(np.vstack([base, -base]))
    assert_within_bounds(opposite)
    assert_array_almost_equal(opposite, [[0., 2.], [2., 0.]])

    # Large random non-negative matrix: zero diagonal, values in bounds.
    big = cosine_distances(np.abs(rng.rand(1000, 5000)))
    assert_array_almost_equal(np.diag(big), np.zeros(big.shape[0]))
    assert_within_bounds(big)
开发者ID:scikit-learn,项目名称:scikit-learn,代码行数:28,代码来源:test_pairwise.py

示例2: get_features

    def get_features(vocab):
        """Return one cosine distance per headline/body pair in LDA topic space.

        An LDA model is fitted on the body term vectors only and then used to
        project the headlines as well, so that a headline and body sharing
        topics end up with similar topic vectors.

        ``headlines``, ``bodies`` and ``n_topics`` are read from the enclosing
        scope.

        :param vocab: vocabulary handed to both ``TfidfVectorizer`` instances
        :return: list with one single-element list (the cosine distance) per
                 row of the headline topic matrix
        """
        def _term_vectors(documents):
            # L2-normalised term frequencies (IDF disabled) over ``vocab``.
            vectorizer = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
            return vectorizer.fit_transform(documents)

        X_train_head = _term_vectors(headlines)
        X_train_body = _term_vectors(bodies)

        # Topic model of the bodies: the more important words of a topic a
        # body contains, the higher its value for that topic.
        lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', random_state=0, n_jobs=3)

        print("latent_dirichlet_allocation_cos: fit and transform body")
        started = time()
        lda_body_matrix = lda_body.fit_transform(X_train_body)
        print("done in %0.3fs." % (time() - started))

        print("latent_dirichlet_allocation_cos: transform head")
        # Reuse the body-trained model for the headlines.
        lda_head_matrix = lda_body.transform(X_train_head)

        print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body')
        pair_distances = []
        for row, head_topics in enumerate(lda_head_matrix):
            # cosine_distances wants 2-D input, hence the (1, n) reshape.
            head_vec = np.array(head_topics).reshape((1, -1))
            body_vec = np.array(lda_body_matrix[row]).reshape((1, -1))
            distance = cosine_distances(head_vec, body_vec).flatten()
            pair_distances.append(distance.tolist())
        return pair_distances
开发者ID:paris5020,项目名称:athene_system,代码行数:32,代码来源:topic_models.py

示例3: sumACluster

def sumACluster(dist, vecsIn, topK_t, sameTweetThred):
    """Greedily group near-duplicate vectors and return the largest groups.

    Vector 0 seeds the first group. Every following vector joins the first
    existing group whose seed (its first member) is within ``sameTweetThred``
    of it, or starts a new group otherwise.

    :param dist: distance metric, ``"cosine"`` or ``"eu"`` (Euclidean)
    :param vecsIn: the vectors to group, one per row
    :param topK_t: maximum number of groups to return
    :param sameTweetThred: largest distance to a group's seed at which a
        vector is still considered part of that group
    :return: list of ``(seed_index, group_size)`` tuples, largest group
        first; ties keep the order in which the groups were created
    :raises ValueError: if ``dist`` is not a supported metric
    """
    if dist == "cosine":
        distMatrix = pairwise.cosine_distances(vecsIn)
    elif dist == "eu":
        distMatrix = pairwise.euclidean_distances(vecsIn, vecsIn)
    else:
        # Previously an unknown metric left distMatrix unbound and only
        # failed later with an unrelated-looking UnboundLocalError.
        raise ValueError(
            'unknown dist %r: expected "cosine" or "eu"' % (dist,))

    sameTweetClusters = [[0]]
    for seqid, _ in enumerate(vecsIn[1:], start=1):
        for members in sameTweetClusters:
            # Only the seed of each group is compared, not every member.
            if distMatrix[seqid][members[0]] <= sameTweetThred:
                members.append(seqid)
                break
        else:
            # No group was close enough: this vector seeds a new one.
            sameTweetClusters.append([seqid])

    # sorted() is stable, so equally sized groups stay in creation order.
    ranked = sorted(sameTweetClusters, key=len, reverse=True)
    return [(members[0], len(members)) for members in ranked[:topK_t]]
开发者ID:qolina,项目名称:DBED,代码行数:27,代码来源:tweetClustering.py

示例4: test_linkage_misc

def test_linkage_misc():
    """Miscellaneous linkage checks, including the Ward ``copy`` warnings."""
    X = np.ones((5, 5))

    # An unknown linkage name and a wrongly sized connectivity matrix must
    # both be rejected with ValueError.
    bad_estimator = AgglomerativeClustering(linkage='foobar')
    assert_raises(ValueError, bad_estimator.fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foobar')
    bad_connectivity = np.ones((4, 4))
    assert_raises(ValueError, linkage_tree, X, connectivity=bad_connectivity)

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", DeprecationWarning)
        # Passing the copy argument is what triggers the second warning.
        Ward(copy=True).fit(X)
    # Two warnings are expected: one because Ward itself is deprecated and
    # one for the copy argument.
    assert_equal(len(caught), 2)

    # Hierarchical clustering on a precomputed cosine distance matrix must
    # match clustering the raw data with the cosine affinity.
    precomputed = cosine_distances(X)
    from_precomputed = linkage_tree(precomputed, affinity="precomputed")
    from_cosine = linkage_tree(X, affinity="cosine")
    assert_array_equal(from_precomputed[0], from_cosine[0])

    # Passing the distance function as a callable must match the named
    # "manhattan" affinity.
    from_callable = linkage_tree(X, affinity=manhattan_distances)
    from_manhattan = linkage_tree(X, affinity="manhattan")
    assert_array_equal(from_callable[0], from_manhattan[0])
开发者ID:Arezou1,项目名称:scikit-learn,代码行数:28,代码来源:test_hierarchical.py

示例5: test_linkage_misc

def test_linkage_misc():
    """Miscellaneous linkage checks, including the Ward deprecation warning."""
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))

    # An unknown linkage name and a wrongly sized connectivity matrix must
    # both be rejected with ValueError.
    bad_estimator = AgglomerativeClustering(linkage='foo')
    assert_raises(ValueError, bad_estimator.fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    bad_connectivity = np.ones((4, 4))
    assert_raises(ValueError, linkage_tree, X, connectivity=bad_connectivity)

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # Using the deprecated Ward class must emit exactly one warning.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", DeprecationWarning)
        Ward().fit(X)
    assert_equal(len(caught), 1)

    # Hierarchical clustering on a precomputed cosine distance matrix must
    # match clustering the raw data with the cosine affinity.
    precomputed = cosine_distances(X)
    from_precomputed = linkage_tree(precomputed, affinity="precomputed")
    from_cosine = linkage_tree(X, affinity="cosine")
    assert_array_equal(from_precomputed[0], from_cosine[0])

    # Passing the distance function as a callable must match the named
    # "manhattan" affinity.
    from_callable = linkage_tree(X, affinity=manhattan_distances)
    from_manhattan = linkage_tree(X, affinity="manhattan")
    assert_array_equal(from_callable[0], from_manhattan[0])
开发者ID:MatteoFu,项目名称:scikit-learn,代码行数:25,代码来源:test_hierarchical.py

示例6: getModelInfo

def getModelInfo(model, features):
    """Print the shape of ``features`` and return their pairwise cosine distances.

    ``model`` is only needed by the optional vocabulary dump that is kept
    commented out below; the active code does not touch it.
    """
    shape_report = "Shape of the transformed features = {}".format(features.shape)
    print(shape_report)
    # Optional debugging aid (disabled) -- per-term frequency listing:
    # vocab = model.get_feature_names()
    # dist = np.sum(features, axis=0)
    # for tag, count in izip(vocab, dist):
    #     print("word = {}, frequency = {}".format(tag, count))
    pairwise_distances = cosine_distances(features)
    return pairwise_distances
开发者ID:geekman2,项目名称:GutenTag,代码行数:8,代码来源:sklearn_model.py

示例7: _build_metastore

    def _build_metastore(self):
        """Rank the index labels by cosine distance from the column-wise median.

        :return: dict with a single key ``'sorted_index'`` holding the
                 entries of ``self.index`` ordered from the row closest to
                 the median row to the row farthest from it
        """
        # One reference row made of the per-column medians of the data.
        median_row = np.median(self.X, axis=0).reshape(1, self.dim)

        # Distance of every row of self.X to that reference row, flattened
        # to one value per row.
        distance_to_median = cosine_distances(self.X, Y=median_row).reshape(-1)

        # argsort gives row positions in ascending distance order.
        ranking = distance_to_median.argsort()
        ordered_labels = [self.index[position] for position in ranking]

        return {'sorted_index': ordered_labels}
开发者ID:ashishyadavppe,项目名称:Skater,代码行数:10,代码来源:datamanager.py

示例8: calcurate_centroid_Matrix

def calcurate_centroid_Matrix(veclist, word2vecdic, DimentionN, vector_dim=200):
    """Accumulate per-label vector sums and the distances of words to them.

    :param veclist: mapping from word to its embedding vector
    :param word2vecdic: mapping from word to its integer label (row index)
    :param DimentionN: number of labels, i.e. number of rows produced
    :param vector_dim: length of each embedding vector; defaults to 200,
        the width that used to be hard-coded
    :return: tuple ``(centroid_Matrix, distance_arrays)`` where row ``l`` of
        ``centroid_Matrix`` is the sum of all vectors labelled ``l`` (a sum,
        not a mean) and ``distance_arrays[l]`` is the summed cosine distance
        of those vectors to that row
    """
    centroid_Matrix = np.zeros((DimentionN, vector_dim))
    distance_arrays = np.zeros(DimentionN)

    # First pass: add every word vector into the row of its label.
    for word in veclist:
        centroid_Matrix[word2vecdic[word]] += veclist[word]

    # Second pass: the rows must be complete before distances are measured.
    for word in veclist:
        label = word2vecdic[word]
        # cosine_distances only accepts 2-D input, so both vectors are
        # presented as single-row matrices; the 1x1 result is unpacked to a
        # scalar before being accumulated.
        pair_distance = cosine_distances(
            np.asarray(veclist[word]).reshape(1, -1),
            centroid_Matrix[label].reshape(1, -1))
        distance_arrays[label] += pair_distance[0, 0]
    return centroid_Matrix, distance_arrays
开发者ID:ItoTomoki,项目名称:ruiternews,代码行数:10,代码来源:yahoofinancil_board_preprocess.py

示例9: memory_cf

def memory_cf(users, movies, k, similarity_measure, weight_schema,
              repr_matrix=rating_matrix_orig, rating_matrix=rating_matrix_orig):
    """
    Memory-based collaborative filtering.

    For every (user, movie) query the nearest rows of ``repr_matrix`` that
    appear together with that movie in ``entry_set`` are looked up and their
    ratings are combined into a prediction.

    :param users: a user list.
    :param movies: a movie list.
    :param k: number of nearest users
    :param similarity_measure: 'cosine' or 'dot_product'
    :param weight_schema: 'mean' or 'weighted_mean'
    :param repr_matrix: data point representation
    :param rating_matrix: ratings based on user-movie or cluster centroids
    :return: recommended ratings for the queries
    """
    # Similarities are computed once per distinct user; each query is then
    # mapped onto the row of its user.
    predictions = []
    distinct_users = list(set(users))
    row_of_user = {user: row for row, user in enumerate(distinct_users)}
    query_rows = [(user, row_of_user[user]) for user in users]

    # Smaller ``dist`` always means "closer", whichever measure is used.
    if similarity_measure == 'cosine':
        dist = cosine_distances(repr_matrix[distinct_users, :], repr_matrix)
        sims = 1 - dist
    elif similarity_measure == 'dot_product':
        sims = repr_matrix[distinct_users, :].dot(repr_matrix.T)
        if issparse(sims):
            sims = sims.toarray()
        dist = -sims

    ranked_candidates = np.argsort(dist, axis=1)

    # Dense copy of the ratings for fast element access.
    rating_matrix = rating_matrix.toarray()
    if weight_schema == 'mean':
        combine = mean
    else:
        combine = weighted_mean

    for (user, row), movie in zip(query_rows, movies):
        def in_entry_set(candidate):
            return (candidate, movie) in entry_set

        # k + 1 candidates are taken because the user may be among them.
        eligible = ifilter(in_entry_set, ranked_candidates[row])
        neighbors = list(islice(eligible, k + 1))

        # No neighbor at all: fall back to a rating of 3.
        if not neighbors:
            predictions.append(3)
            continue

        # The querying user must not vote for itself.
        if user in neighbors:
            neighbors.remove(user)

        predictions.append(combine(rating_matrix[neighbors, movie],
                                   sims[row, neighbors]))

    return predictions
开发者ID:EDFward,项目名称:TrivialCF,代码行数:54,代码来源:cf.py

示例10: get_sparse_dist_matrix

def get_sparse_dist_matrix(tweets_tfidf_matrix, eps):
    """Build a sparse matrix of pairwise cosine distances not exceeding eps.

    Distances are computed one row at a time against the whole input. Only
    off-diagonal pairs whose distance is less than or equal to ``eps`` are
    stored; everything else stays implicit.

    :param tweets_tfidf_matrix: tf-idf vectors, one row per tweet
    :param eps: largest distance that is still stored
    :return: square CSR matrix of floats with one row and column per tweet
    """
    # CSR layout: indptr[i]:indptr[i + 1] delimits row i inside indices and
    # data, so indptr needs one more entry than there are rows.
    indptr = [0]
    indices = []
    data = []
    for ndx, tweet in enumerate(tweets_tfidf_matrix):
        distances = cosine_distances(tweet, tweets_tfidf_matrix)[0]
        for other_ndx, dist in enumerate(distances):
            if ndx != other_ndx and dist <= eps:
                indices.append(other_ndx)
                data.append(dist)
        # Close the row. Without this final pointer the last row was lost.
        indptr.append(len(indices))

    n_tweets = len(indptr) - 1
    # Float dtype keeps the fractional distances (an integer dtype would
    # truncate them); the explicit shape keeps the matrix square even when
    # the trailing columns hold no entries.
    return csr_matrix((data, indices, indptr),
                      shape=(n_tweets, n_tweets), dtype=float)
开发者ID:jiwu14,项目名称:TweetAnalyzer,代码行数:15,代码来源:TweetAnalyzer.py

示例11: test_linkage_misc

def test_linkage_misc():
    """Miscellaneous linkage checks: bad arguments and affinity equivalences."""
    random_state = np.random.RandomState(42)
    X = random_state.normal(size=(5, 5))

    # An unknown linkage name and a wrongly sized connectivity matrix must
    # both be rejected with ValueError.
    bad_estimator = AgglomerativeClustering(linkage='foo')
    assert_raises(ValueError, bad_estimator.fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    bad_connectivity = np.ones((4, 4))
    assert_raises(ValueError, linkage_tree, X, connectivity=bad_connectivity)

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # Hierarchical clustering on a precomputed cosine distance matrix must
    # match clustering the raw data with the cosine affinity.
    precomputed = cosine_distances(X)
    from_precomputed = linkage_tree(precomputed, affinity="precomputed")
    from_cosine = linkage_tree(X, affinity="cosine")
    assert_array_equal(from_precomputed[0], from_cosine[0])

    # Passing the distance function as a callable must match the named
    # "manhattan" affinity.
    from_callable = linkage_tree(X, affinity=manhattan_distances)
    from_manhattan = linkage_tree(X, affinity="manhattan")
    assert_array_equal(from_callable[0], from_manhattan[0])
开发者ID:foresthz,项目名称:scikit-learn,代码行数:20,代码来源:test_hierarchical.py

示例12: cluster_cf_memory

    def cluster_cf_memory():
        """
        Cluster-based memory CF.

        Builds one averaged rating row per user cluster, finds for every
        queried user the ``k`` closest columns of ``m2uc`` and combines the
        corresponding cluster ratings of the queried movie.

        Everything except the locals is read from the enclosing scope:
        ``k_user``, ``rating_matrix_orig``, ``user_belonging``, ``users``,
        ``movies``, ``m2uc``, ``similarity_measure``, ``k``,
        ``weight_schema``, ``mean`` and ``weighted_mean``.

        :return: list with one predicted rating per (user, movie) query
        """
        # One row per user cluster, one column per column of the original
        # rating matrix; every row is filled in the loop below.
        rating_matrix_cluster = np.empty([k_user, rating_matrix_orig.shape[1]],
                                         dtype=np.float64)

        # build rating matrix for each user cluster, on each movie
        for i in range(k_user):
            # rows of the users assigned to cluster i
            cluster_indicator = np.where(user_belonging == i)[0]
            rating_cluster = rating_matrix_orig[cluster_indicator, :]
            rating_sum = rating_cluster.sum(axis=0)
            # take average by dividing count: overwriting the stored values
            # with ones turns the column sum into a count of stored entries
            # NOTE(review): relies on rating_matrix_orig being a scipy sparse
            # matrix (uses .data) -- confirm against the caller
            rating_cluster.data = np.ones(len(rating_cluster.data))
            mu = rating_sum / rating_cluster.sum(axis=0)
            # fill 0 for nan (columns whose count is zero divide 0 by 0)
            mu[np.isnan(mu)] = 0
            rating_matrix_cluster[i, :] = mu

        # construct mapping between input users and unique users
        ratings, user_unique = [], list(set(users))
        user_index_map = dict((u, i) for i, u in enumerate(user_unique))
        # for every query, the row of its user in the matrices computed below
        users_neighbors = [user_index_map[u] for u in users]

        # In both branches a smaller ``dist`` means a closer match.
        # NOTE(review): m2uc is presumably a movie-by-user-cluster matrix
        # defined by the enclosing function -- confirm.
        if similarity_measure == 'cosine':
            dist = cosine_distances(rating_matrix_orig[user_unique, :], m2uc.T)
            sims = 1 - dist
        else:
            sims = rating_matrix_orig[user_unique, :].dot(m2uc).toarray()
            dist = -sims

        # indices of the k smallest distances per row; argpartition does not
        # order them among themselves
        nearest_neighbors = np.argpartition(dist, k, axis=1)[:, :k]
        weight_method = mean if weight_schema == 'mean' else weighted_mean

        for neighbor_index, movie in zip(users_neighbors, movies):
            neighbors = nearest_neighbors[neighbor_index]
            # combine the neighbor clusters' ratings of this movie, passing
            # the matching similarities along as weights
            rating = weight_method(rating_matrix_cluster[neighbors, movie],
                                   sims[neighbor_index, neighbors])
            ratings.append(rating)

        return ratings
开发者ID:EDFward,项目名称:TrivialCF,代码行数:41,代码来源:cf.py

示例13: plot_mds

def plot_mds(points, genres, n_points=500):
    '''
    Plots a stratified sample of documents in a 2-D MDS embedding

    Documents whose genre is None are dropped. The remaining ones are
    sampled per genre, their pairwise cosine distances are embedded with
    MDS and the embedding is handed to plot_embedding.

    Args:
        points: dense array with coordinates of each document
        genres: list of genres for each entry in points
        n_points: passed to train_test_split as train_size, i.e. the size
            of the plotted sample
    Returns:
        None
    '''

    genre_labels = np.array(genres)
    has_genre = np.not_equal(genre_labels, None)
    labelled_points = points[has_genre]
    labelled_genres = genre_labels[has_genre]

    # Only the "train" half of the split is plotted; the remainder is unused.
    sample_points, _, sample_genres, _ = train_test_split(
        labelled_points, labelled_genres,
        stratify=labelled_genres, train_size=n_points)

    sample_distances = cosine_distances(sample_points, sample_points)
    embedder = MDS(n_components=2, dissimilarity='precomputed')
    embedder.fit(sample_distances)

    plot_embedding(embedder.embedding_, sample_genres)
开发者ID:lwoloszy,项目名称:albumpitch,代码行数:23,代码来源:genres.py

示例14: test_fp16_cosine_metric

 def test_fp16_cosine_metric(self):
     """Cluster float16 points on the unit circle with the cosine metric."""
     n_samples = 10000
     arr = numpy.empty((n_samples, 2), dtype=numpy.float16)
     angs = numpy.random.rand(n_samples) * 2 * numpy.pi
     # Each sample is the (sin, cos) pair of a random angle.
     for row, ang in enumerate(angs):
         arr[row] = numpy.sin(ang), numpy.cos(ang)
     with self.stdout:
         centroids, assignments = kmeans_cuda(
             arr, 4, init="kmeans++", metric="cos", device=1, verbosity=2,
             seed=3)
     self.assertEqual(self._get_iters_number(self.stdout), 5)
     self.assertEqual(len(centroids), 4)
     # Every centroid must have (almost) unit length.
     for centroid in centroids:
         length = numpy.linalg.norm(centroid)
         self.assertTrue(0.9995 < length < 1.0005)
     # Rounded pairwise distances: 0 to itself, 2 to one other centroid
     # and 1 to the remaining two.
     expected = numpy.array([
         [0, 2, 1, 1],
         [2, 0, 1, 1],
         [1, 1, 0, 2],
         [1, 1, 2, 0],
     ])
     rounded = numpy.round(cosine_distances(centroids)).astype(int)
     self.assertTrue((rounded == expected).all())
     self.assertEqual(numpy.min(assignments), 0)
     self.assertEqual(numpy.max(assignments), 3)
开发者ID:src-d,项目名称:kmcuda,代码行数:23,代码来源:test.py

示例15: cosine_similarity

def cosine_similarity(vector_a, vector_b):
    """Return the cosine similarity, computed as one minus the cosine distance."""
    distance = cosine_distances(vector_a, vector_b)
    return 1 - distance
开发者ID:hanveiga,项目名称:master-thesis,代码行数:2,代码来源:information_measure.py


注:本文中的sklearn.metrics.pairwise.cosine_distances函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。