Python pairwise.cosine_distances方法代码示例

本文整理汇总了Python中sklearn.metrics.pairwise.cosine_distances方法的典型用法代码示例。如果您正苦于以下问题:Python pairwise.cosine_distances方法的具体用法?Python pairwise.cosine_distances怎么用?Python pairwise.cosine_distances使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在sklearn.metrics.pairwise的用法示例。


示例1: test_tsne_with_different_distance_metrics

# 需要导入模块: from sklearn.metrics import pairwise [as 别名]
# 或者: from sklearn.metrics.pairwise import cosine_distances [as 别名]
def test_tsne_with_different_distance_metrics():
    """Make sure that TSNE works for different distance metrics.

    For every metric, embedding the raw data with ``metric=<name>`` is
    compared against embedding the matching distance matrix with
    ``metric='precomputed'``; both embeddings must be equal.
    """
    random_state = check_random_state(0)
    n_components_original = 3
    n_components_embedding = 2
    X = random_state.randn(50, n_components_original).astype(np.float32)
    metrics = ['manhattan', 'cosine']
    dist_funcs = [manhattan_distances, cosine_distances]
    for metric, dist_func in zip(metrics, dist_funcs):
        # NOTE(review): the tail of both TSNE(...) calls was missing from this
        # excerpt, leaving the parentheses unclosed. They are closed here with
        # a shared seed (needed for the equality check) and the fit_transform
        # call each variable name implies -- confirm against the upstream
        # scikit-learn test, which may pass further arguments.
        X_transformed_tsne = TSNE(
            metric=metric, n_components=n_components_embedding,
            random_state=0).fit_transform(X)
        X_transformed_tsne_precomputed = TSNE(
            metric='precomputed', n_components=n_components_embedding,
            random_state=0).fit_transform(dist_func(X))
        assert_array_equal(X_transformed_tsne, X_transformed_tsne_precomputed)

示例2: construct_H_with_KNN

# 需要导入模块: from sklearn.metrics import pairwise [as 别名]
# 或者: from sklearn.metrics.pairwise import cosine_distances [as 别名]
def construct_H_with_KNN(X, K_neigs=[10], is_probH=False, m_prob=1):
    """
    Init multi-scale hypergraph Vertex-Edge matrix from original node feature matrix.

    One incidence matrix is built per entry of ``K_neigs`` from the same
    distance matrix, and the results are merged with ``hyperedge_concat``.

    :param X: N_object x feature_number
    :param K_neigs: the number of neighbor expansion (an int or a list of ints)
    :param is_probH: prob Vertex-Edge matrix or binary
    :param m_prob: prob
    :return: N_object x N_hyperedge
    """
    # Collapse any leading dimensions so X is 2-D (rows x features).
    if len(X.shape) != 2:
        X = X.reshape(-1, X.shape[-1])

    # Accept a bare int as shorthand for a single neighbourhood size.
    # The list default is only read (or rebound), never mutated in place.
    if isinstance(K_neigs, int):
        K_neigs = [K_neigs]

    dis_mat = cos_dis(X)
    H = None
    for k_neig in K_neigs:
        H_tmp = construct_H_with_KNN_from_distance(dis_mat, k_neig, is_probH, m_prob)
        H = hyperedge_concat(H, H_tmp)
    return H

示例3: testCosineDistancesExecution

# 需要导入模块: from sklearn.metrics import pairwise [as 别名]
# 或者: from sklearn.metrics.pairwise import cosine_distances [as 别名]
def testCosineDistancesExecution(self):
    """Compare the tensor ``cosine_distances`` with the scikit-learn reference.

    Dense and sparse inputs are each run with two chunk sizes, once with an
    explicit second operand and once with ``x`` alone.
    """
    raw_dense_x = np.random.rand(25, 10)
    raw_dense_y = np.random.rand(17, 10)

    raw_sparse_x = sps.random(25, 10, density=0.5, format='csr', random_state=0)
    raw_sparse_y = sps.random(17, 10, density=0.4, format='csr', random_state=1)

    # The closing "]:" of this list literal was missing in the excerpt.
    for raw_x, raw_y in [
        (raw_dense_x, raw_dense_y),
        (raw_sparse_x, raw_sparse_y),
    ]:
        for chunk_size in (25, 6):
            x = mt.tensor(raw_x, chunk_size=chunk_size)
            y = mt.tensor(raw_y, chunk_size=chunk_size)

            # Two-operand form: distances between rows of x and rows of y.
            d = cosine_distances(x, y)

            result = self.executor.execute_tensor(d, concat=True)[0]
            expected = sk_cosine_distances(raw_x, raw_y)

            np.testing.assert_almost_equal(np.asarray(result), expected)

            # Single-operand form: distances of x against itself.
            d = cosine_distances(x)

            result = self.executor.execute_tensor(d, concat=True)[0]
            expected = sk_cosine_distances(raw_x)

            np.testing.assert_almost_equal(np.asarray(result), expected)

示例4: vec_cos_dist

# 需要导入模块: from sklearn.metrics import pairwise [as 别名]
# 或者: from sklearn.metrics.pairwise import cosine_distances [as 别名]
def vec_cos_dist(token_input, operation_input):
    """Test the cosine distance between a token vector and a reference vector.

    ``operation_input`` has the form ``<reference vector><operator><value>``
    where the operator is one of ``==``, ``>=``, ``<=``, ``!=``, ``<>``,
    ``<``, ``>`` or ``=``.

    :param token_input: string form of the token vector
    :param operation_input: string holding reference vector, operator and value
    :return: the result of comparing the cosine distance with the value;
        ``False`` when the value or a vector cannot be parsed or the vector
        lengths differ; ``None`` (after printing a message) when
        ``operation_input`` cannot be split into its three parts
    """
    operation_string = None
    ref_vector_string = None
    cond_value_string = None
    # Two-character operators come first so that e.g. '>=' is not mistaken
    # for '>' or '='. Stop at the first hit: without the break, later
    # single-character operators overwrite the match.
    for opr_sign in ['==', '>=', '<=', '!=', '<>', '<', '>', '=']:
        if opr_sign in operation_input:
            parts = operation_input.split(opr_sign)
            ref_vector_string = parts[0]
            operation_string = opr_sign
            cond_value_string = parts[1]
            break

    if ref_vector_string and cond_value_string and operation_string:
        try:
            cond_value = float(cond_value_string)
            ref_vector = change_string_to_vector(ref_vector_string)
            token_vector = change_string_to_vector(token_input)
            if len(ref_vector) != len(token_vector):
                print ('len of vectors does not match')
                return False
            distance = cosine_distances(token_vector, ref_vector)
            if operation_string == "=" or operation_string == "==":
                return distance == cond_value
            elif operation_string == "<":
                return distance < cond_value
            elif operation_string == ">":
                return distance > cond_value
            elif operation_string == ">=":
                return distance >= cond_value
            elif operation_string == "<=":
                return distance <= cond_value
            elif operation_string == "!=" or operation_string == "<>":
                return distance != cond_value
            else:
                return False
        except ValueError:
            # TODO raise tokenregex error
            return False
    else:
        # TODO raise tokenregex error
        print ('Problem with the operation input')

示例5: test_cosine_distances

# 需要导入模块: from sklearn.metrics import pairwise [as 别名]
# 或者: from sklearn.metrics.pairwise import cosine_distances [as 别名]
def test_cosine_distances():
    """Check pairwise cosine distances on identical, opposite and random rows."""
    rng = np.random.RandomState(1337)
    x = np.abs(rng.rand(910))

    # Two identical rows: every distance, diagonal included, is zero.
    identical = cosine_distances(np.vstack([x, x]))
    assert_array_almost_equal(identical, [[0., 0.], [0., 0.]])
    assert (identical >= 0.).all()
    assert (identical <= 2.).all()
    assert_array_almost_equal(np.diag(identical), [0., 0.])

    # A row and its negation: off-diagonal distances reach the maximum of 2.
    opposite = cosine_distances(np.vstack([x, -x]))
    assert (opposite >= 0.).all()
    assert (opposite <= 2.).all()
    assert_array_almost_equal(opposite, [[0., 2.], [2., 0.]])

    # Large random non-negative matrix: zero diagonal, values within [0, 2].
    big = cosine_distances(np.abs(rng.rand(1000, 5000)))
    assert_array_almost_equal(np.diag(big), np.zeros(big.shape[0]))
    assert (big >= 0.).all()
    assert (big <= 2.).all()

示例6: get_cosine_dist

# 需要导入模块: from sklearn.metrics import pairwise [as 别名]
# 或者: from sklearn.metrics.pairwise import cosine_distances [as 别名]
def get_cosine_dist(all_functions):
    """Return the pairwise cosine distances between the given functions.

    The functions are converted to a sparse matrix with ``funcs_to_sparse``,
    each row is rescaled by a default-configured ``Normalizer``, and the
    matrix is compared against itself.

    :param all_functions: input forwarded unchanged to ``funcs_to_sparse``
    :return: result of ``cosine_distances`` of the normalized matrix with itself
    """
    # Only the sparse matrix is needed; the first element returned by
    # funcs_to_sparse is not used here.
    _, func_sparse = funcs_to_sparse(all_functions)

    transformer = Normalizer().fit(func_sparse)
    func_sparse = transformer.transform(func_sparse)

    return cosine_distances(func_sparse, func_sparse)

示例7: cosine_distance

# 需要导入模块: from sklearn.metrics import pairwise [as 别名]
# 或者: from sklearn.metrics.pairwise import cosine_distances [as 别名]
def cosine_distance(v1, v2):
    """Return the cosine distance between ``v1`` and ``v2`` scaled to [0.0, 1.0].

    Cosine similarity lies in [-1.0, 1.0], so the raw cosine distance lies in
    [0.0, 2.0]; halving it maps the result onto [0.0, 1.0].
    """
    raw_distance = pairwise.cosine_distances(v1, v2)
    return raw_distance / 2.0

#For ranks index starting from 0 

示例8: _construct_edge_list_from_distance

# 需要导入模块: from sklearn.metrics import pairwise [as 别名]
# 或者: from sklearn.metrics.pairwise import cosine_distances [as 别名]
def _construct_edge_list_from_distance(X, k_neigh):
    """
    construct edge_list (numpy array) from kNN distance for single modality

    :param X -> numpy array: feature
    :param k_neigh -> int: # of neighbors
    :return: N * k_neigh numpy array
    """
    dis = cos_dis(X)
    dis = torch.Tensor(dis)
    # largest=False selects the k_neigh smallest distances along the last axis;
    # only their indices are kept.
    _, k_idx = dis.topk(k_neigh, dim=-1, largest=False)
    return k_idx.numpy()

示例9: _distance

# 需要导入模块: from sklearn.metrics import pairwise [as 别名]
# 或者: from sklearn.metrics.pairwise import cosine_distances [as 别名]
def _distance(x1, x2):
    """Return ``cosine_distances`` evaluated on ``x1`` and ``x2``."""
    pairwise_dist = cosine_distances(x1, x2)
    return pairwise_dist

示例10: kNN_classify

# 需要导入模块: from sklearn.metrics import pairwise [as 别名]
# 或者: from sklearn.metrics.pairwise import cosine_distances [as 别名]
def kNN_classify(*, x, y):
    """
    return the index of y that is closest to each x

    :param x: n*d matrix
    :param y: m*d matrix
    :return: n-dim vector
    """
    # NOTE(review): the description above promises indices into y (an n-dim
    # vector), but the code indexes y with the argmin and so returns the
    # nearest *entries* of y. Behaviour is left unchanged because callers may
    # rely on it -- confirm which of the two is intended.
    ds = cosine_distances(x, y)
    idx = y[np.argmin(ds, axis=1)]
    return idx

示例11: main

# 需要导入模块: from sklearn.metrics import pairwise [as 别名]
# 或者: from sklearn.metrics.pairwise import cosine_distances [as 别名]
def main():
    """Print the nearest explanations for every question, by TF-IDF cosine distance.

    Command line: ``[-n NEAREST] tables questions`` where ``tables`` is a
    directory walked recursively for explanation files and ``questions`` is a
    tab-separated file with ``Question`` and ``questionID`` columns.

    For each question, one line ``questionID<TAB>uid`` is printed per selected
    explanation, nearest first.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nearest', type=int, default=10)
    # ``args.tables`` is read below but the argument was never declared, so the
    # script failed with AttributeError before doing any work.
    # NOTE(review): its position relative to ``questions`` is assumed -- confirm.
    parser.add_argument('tables')
    parser.add_argument('questions', type=argparse.FileType('r', encoding='UTF-8'))
    args = parser.parse_args()

    explanations = []

    for path, _, files in os.walk(args.tables):
        for file in files:
            explanations += read_explanations(os.path.join(path, file))

    if not explanations:
        warnings.warn('Empty explanations')

    df_q = pd.read_csv(args.questions, sep='\t', dtype=str)
    df_e = pd.DataFrame(explanations, columns=('uid', 'text'))

    # NOTE(review): scikit-learn estimators discard earlier state on every
    # fit(), so the first fit on the questions has no effect and the
    # vocabulary comes from the explanations only. Kept as-is to preserve the
    # output; confirm whether a joint vocabulary was intended.
    vectorizer = TfidfVectorizer().fit(df_q['Question']).fit(df_e['text'])
    X_q = vectorizer.transform(df_q['Question'])
    X_e = vectorizer.transform(df_e['text'])
    X_dist = cosine_distances(X_q, X_e)

    for i_question, distances in enumerate(X_dist):
        # argsort is ascending, so the first ``nearest`` indices are the closest.
        for i_explanation in np.argsort(distances)[:args.nearest]:
            print('{}\t{}'.format(df_q.loc[i_question]['questionID'], df_e.loc[i_explanation]['uid']))

示例12: cosine_distances_xy

# 需要导入模块: from sklearn.metrics import pairwise [as 别名]
# 或者: from sklearn.metrics.pairwise import cosine_distances [as 别名]
def cosine_distances_xy(x, y, to_similar=False):
    """
    Compute the cosine distance between ``x`` and ``y`` via ``_distance_xy``.

    :param x: iterable sequence
    :param y: iterable sequence
    :param to_similar: whether to convert the output into a similarity value
    :return: float value
    """
    distance = _distance_xy(cosine_distances, x, y)
    if to_similar:
        # Cosine distance converts to cosine similarity by subtracting it from 1.
        distance = 1.0 - distance
    return distance

示例13: test_cosine_distances

# 需要导入模块: from sklearn.metrics import pairwise [as 别名]
# 或者: from sklearn.metrics.pairwise import cosine_distances [as 别名]
def test_cosine_distances():
    """Verify cosine distances for duplicated, negated and random rows."""
    rng = np.random.RandomState(1337)
    x = np.abs(rng.rand(910))

    # Duplicated row -> an all-zero distance matrix.
    dup_dist = cosine_distances(np.vstack([x, x]))
    assert_array_almost_equal(dup_dist, [[0., 0.], [0., 0.]])
    # every entry must lie in [0, 2]
    assert_true(np.all(dup_dist >= 0.))
    assert_true(np.all(dup_dist <= 2.))
    # the diagonal must be zero
    dup_diagonal = dup_dist[np.diag_indices_from(dup_dist)]
    assert_array_almost_equal(dup_diagonal, [0., 0.])

    # Negated row -> zero on the diagonal, 2 everywhere else.
    neg_dist = cosine_distances(np.vstack([x, -x]))
    assert_true(np.all(neg_dist >= 0.))
    assert_true(np.all(neg_dist <= 2.))
    assert_array_almost_equal(neg_dist, [[0., 2.], [2., 0.]])

    # Large random matrix -> zero diagonal and entries within [0, 2].
    random_rows = np.abs(rng.rand(1000, 5000))
    rand_dist = cosine_distances(random_rows)
    rand_diagonal = rand_dist[np.diag_indices_from(rand_dist)]
    assert_array_almost_equal(rand_diagonal, [0.] * rand_dist.shape[0])
    assert_true(np.all(rand_dist >= 0.))
    assert_true(np.all(rand_dist <= 2.))

# Paired distances 

示例14: produce_category_focused_pairplot

# 需要导入模块: from sklearn.metrics import pairwise [as 别名]
# 或者: from sklearn.metrics.pairwise import cosine_distances [as 别名]
# NOTE(review): this excerpt is truncated -- the rest of the parameter list,
# the docstring delimiters, the end of the ``projection_to_plot`` expression
# and the arguments of ``produce_pairplot`` are missing. The code below is
# left exactly as found; restore it from the upstream source before use.
def produce_category_focused_pairplot(corpus,
    Produces a pair-plot which is focused on a single category.

    :param corpus: TermDocMatrix
    :param category: str, name of a category in the corpus
    :param category_projector: CategoryProjector, a factor analysis of the category/feature vector
    :param category_projection: CategoryProjection, None by default. If present, overrides category projector
    :param kwargs: remaining kwargs for produce_pairplot
    :return: str, HTML

    # Position of the requested category in the corpus's category list.
    category_num = corpus.get_categories().index(category)

    # Use the supplied projection when given; otherwise project the corpus.
    uncorrelated_components_projection = (category_projector.project(corpus)
                                          if category_projection is None
                                          else category_projection)

    # Pairwise cosine distances over the transposed category embeddings.
    distances = cosine_distances(uncorrelated_components_projection.get_category_embeddings().T)

    # Negated, rescaled ranks of the distances to the focus category.
    # Presumably rankdata ranks ascending, so nearer categories get the
    # higher (less negative) scores -- TODO confirm.
    similarity_to_category_scores = -2 * (rankdata(distances[category_num]) - 0.5)

    uncorrelated_components = uncorrelated_components_projection.get_projection()

    # Each candidate is a one-element list holding (|r|, i), so min() orders by
    # the absolute first element of pearsonr's result and [0][1] extracts the
    # index of the component least correlated with the similarity scores.
    least_correlated_dimension = min([(np.abs(pearsonr(similarity_to_category_scores,
                                                       uncorrelated_components.T[i])[0]), i)]
                                     for i in range(uncorrelated_components.shape[1]))[0][1]

    projection_to_plot = np.array([uncorrelated_components.T[least_correlated_dimension],

    return produce_pairplot(

示例15: latent_dirichlet_allocation

# 需要导入模块: from sklearn.metrics import pairwise [as 别名]
# 或者: from sklearn.metrics.pairwise import cosine_distances [as 别名]
def latent_dirichlet_allocation(headlines, bodies):
    """Return the cosine distance between each headline and body in LDA topic space.

    An LDA model is fitted on term-frequency vectors of the bodies and then
    applied to the headlines; if a headline and its body share topics, their
    topic vectors should be similar.

    :param headlines: sequence of headline strings
    :param bodies: sequence of body strings, parallel to ``headlines``
    :return: list with one single-element list (the cosine distance) per pair
    """
    # https://pypi.python.org/pypi/lda on bottom see suggestions like MALLET, hca
    # https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730
    # https://www.quora.com/What-are-the-best-features-to-put-into-Latent-Dirichlet-Allocation-LDA-for-topic-modeling-of-short-text

    def print_top_words(model, feature_names, n_top_words):
        # Debug helper: print the n_top_words highest-weighted words per topic.
        for topic_idx, topic in enumerate(model.components_):
            print("Topic #%d:" % topic_idx)
            print(", ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]]))

    def combine_head_and_body(headlines, bodies):
        # Join every headline with its body, separated by a single space.
        return [headline + " " + body for headline, body in zip(headlines, bodies)]

    def get_features(vocab):
        vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_head = vectorizer_head.fit_transform(headlines)

        vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_body = vectorizer_body.fit_transform(bodies)

        # calculates n most important topics of the bodies. Each topic contains all words but ordered by importance. The
        # more important topic words a body contains of a certain topic, the higher its value for this topic
        # NOTE(review): newer scikit-learn releases call this argument
        # ``n_components``; ``n_topics`` is kept to match the environment the
        # excerpt was written for -- confirm against the installed version.
        lda_body = LatentDirichletAllocation(n_topics=25, learning_method='online', random_state=0, n_jobs=3)

        print("latent_dirichlet_allocation: fit and transform body")
        t0 = time()
        lda_body_matrix = lda_body.fit_transform(X_train_body)
        print("done in %0.3fs." % (time() - t0))

        print("latent_dirichlet_allocation: transform head")
        # use the lda trained for body topcis on the headlines => if the headlines and bodies share topics
        # their vectors should be similar
        lda_head_matrix = lda_body.transform(X_train_head)

        #print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)

        print('latent_dirichlet_allocation: calculating cosine distance between head and body')
        # calculate cosine distance between the body and head
        X = []
        for head_row, body_row in zip(lda_head_matrix, lda_body_matrix):
            X_head_vector = np.array(head_row).reshape((1, -1)) #1d array is deprecated
            X_body_vector = np.array(body_row).reshape((1, -1))
            cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
            # The excerpt computed cos_dist but never stored it, so X was
            # always returned empty.
            # NOTE(review): the stored form (a one-element list per pair) is
            # assumed -- confirm against the upstream source.
            X.append(cos_dist.tolist())
        return X

    # NOTE(review): this call was cut off after ``term_freq=True,`` in the
    # excerpt; it is closed here without guessing at any trailing arguments.
    vocab = create_word_ngram_vocabulary(ngram_range=(1, 1), max_features=5000, lemmatize=False, term_freq=True)
    X = get_features(vocab)
    return X
