This article collects typical usage examples of the Python method gensim.matutils.unitvec. If you are wondering what exactly matutils.unitvec does and how to use it, the curated method examples below should help; you can also explore the containing module, gensim.matutils, further.
The following presents 15 code examples of matutils.unitvec, sorted by popularity by default.
Example 1: similarity_label
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def similarity_label(self, words, normalization=True):
    """
    Compute similarities against the label vectors; more than one word
    can be calculated at the same time.
    """
    if self.model is None:
        raise Exception('no model.')
    if isinstance(words, string_types):
        words = [words]
    vectors = np.transpose(self.model.wv.__getitem__(words))
    if normalization:
        unit_vector = unitvec(vectors, ax=0)  # vectorized; roughly twice as fast as the column-by-column loop below
        # unit_vector = np.zeros((len(vectors), len(words)))
        # for i in range(len(words)):
        #     unit_vector[:, i] = matutils.unitvec(vectors[:, i])
        dists = np.dot(self.Label_vec_u, unit_vector)
    else:
        dists = np.dot(self.Label_vec, vectors)
    return dists
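A quick check (with made-up random vectors, not part of the original class) that the vectorized normalization above matches normalizing each column with gensim's own unitvec; the custom unitvec helper is defined in Example 10 below:

import numpy as np
from gensim import matutils

vectors = np.random.rand(100, 3)  # stand-in for self.model.wv[words].T: 100-dim vectors for 3 words
unit_all = vectors / np.sqrt((vectors * vectors).sum(axis=0))  # normalize every column at once
unit_cols = np.column_stack([matutils.unitvec(vectors[:, i]) for i in range(3)])  # column by column
assert np.allclose(unit_all, unit_cols)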
Example 2: __getitem__
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def __getitem__(self, bow, eps=1e-12):
    """
    Return tf-idf representation of the input vector and/or corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)
    # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
    # as strict application of the IDF formula would dictate)
    vector = [(termid, self.wlocal(tf) * self.idfs.get(termid))
              for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0]
    # and finally, normalize the vector either to unit length, or use a
    # user-defined normalization function
    if self.normalize is True:
        vector = matutils.unitvec(vector)
    elif self.normalize:
        vector = self.normalize(vector)
    # make sure there are no explicit zeroes in the vector (must be sparse)
    vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
    return vector
#endclass TfidfModel
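A hedged usage sketch of gensim's TfidfModel, whose __getitem__ is shown above (the toy corpus is invented for illustration):

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

texts = [["cat", "sat", "mat"], ["cat", "cat", "dog"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus)  # normalize=True by default, applied via matutils.unitvec
print(tfidf[corpus[0]])     # sparse (term_id, weight) pairs scaled to unit length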
Example 3: calculate_text_similar
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
    """
    Find the most similar sentences: dot product of the sentence vector with the matrix.
    :param vec_ques: question sentence vector
    :param matrix_org_norm: row-normalized matrix of reference sentence vectors
    :param matrix_org_index: index mapping rows of the matrix to sentences
    :param top_vec: number of top hits to return
    :return: list of [index, score] pairs
    """
    # normalize the question vector; unitvec scales a vector to unit length
    # (the only exception is the zero vector, which is returned unchanged)
    vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
    # dot the matrix with the vector, i.e. the question against every question in the reference set
    matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
    # sort by similarity
    most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
    # collect the index and score of the most similar reference questions
    index_score = []
    for t in most_similar_sentence_vec_sort[:top_vec]:
        index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
    return index_score
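A minimal sketch of how the function above might be called, assuming it is defined in the same script; the sentence-vector matrix and the module-level numpy_type are invented for illustration:

import numpy as np
from gensim import matutils

numpy_type = np.float32                    # assumed dtype used by the snippet above
sentence_vecs = np.random.rand(1000, 300)  # made-up reference sentence vectors
matrix_org_norm = np.vstack([matutils.unitvec(v) for v in sentence_vecs]).astype(numpy_type)
matrix_org_index = list(range(len(sentence_vecs)))  # identity index for the demo

vec_ques = np.random.rand(300)  # made-up question vector
print(calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec=5))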
Example 4: shift_clip_pmi
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def shift_clip_pmi(pmimtr, k_shift=1.0):
    """
    Turns a PMI matrix into an SPPMI matrix by shifting all values down
    by log(k) and then clipping negative values to 0.
    :param pmimtr: The matrix of PMI values.
    :param k_shift: The shift factor k.
    :return: A shifted positive PMI (SPPMI) matrix.
    """
    logger.info("shifting PMI scores by log(k) with k=%s" % (k_shift, ))
    pmimtr -= np.log(k_shift)  # shifted PMI = log(#(w, c) * D / (#w * #c)) - log(k)
    logger.info("clipping PMI scores to be non-negative PPMI")
    pmimtr.clip(0.0, out=pmimtr)  # SPPMI = max(0, log(#(w, c) * D / (#w * #c)) - log(k))
    logger.info("normalizing PPMI word vectors to unit length")
    for i, vec in enumerate(pmimtr):
        pmimtr[i] = matutils.unitvec(vec)
    return matutils.corpus2csc(matutils.Dense2Corpus(pmimtr, documents_columns=False)).T
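A tiny numeric check of the shift-and-clip step (plain numpy, independent of the function above):

import numpy as np

pmi = np.array([[1.2, -0.3], [0.0, 2.5]])
k = 2.0
sppmi = np.clip(pmi - np.log(k), 0.0, None)  # SPPMI = max(0, PMI - log k)
# log(2) is about 0.693, so 1.2 -> ~0.507, while -0.3 and 0.0 -> 0, and 2.5 -> ~1.807
print(sppmi)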
Example 5: similarity_3_contexts
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def similarity_3_contexts(self, t, p):
    (bef, bet, aft) = (0, 0, 0)
    if t.bef_vector is not None and p.bef_vector is not None:
        bef = dot(matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector))
    if t.bet_vector is not None and p.bet_vector is not None:
        bet = dot(matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector))
    if t.aft_vector is not None and p.aft_vector is not None:
        aft = dot(matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector))
    return self.config.alpha * bef + \
        self.config.beta * bet + \
        self.config.gamma * aft
Example 6: similarity_cluster
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def similarity_cluster(self, p1, p2):
    count = 0
    score = 0
    if self.config.alpha == 0 and self.config.gamma == 0:
        p1.merge_all_tuples_bet()
        p2.merge_all_tuples_bet()
        for v_bet1 in p1.bet_uniques_vectors:
            for v_bet2 in p2.bet_uniques_vectors:
                if v_bet1 is not None and v_bet2 is not None:
                    score += dot(
                        matutils.unitvec(asarray(v_bet1)),
                        matutils.unitvec(asarray(v_bet2))
                    )
                    count += 1
    else:
        for t1 in p1.tuples:
            for t2 in p2.tuples:
                score += self.similarity_3_contexts(t1, t2)
                count += 1
    if count == 0:  # guard against empty clusters to avoid division by zero
        return 0.0
    return float(score) / float(count)
Example 7: similarity
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def similarity(self, d1: int, d2: int) -> float:
    """Compute cosine similarity between two sentences from the training set.

    Parameters
    ----------
    d1 : int
        Index of the first sentence.
    d2 : int
        Index of the second sentence.

    Returns
    -------
    float
        The cosine similarity between the vectors of the two sentences.
    """
    return dot(matutils.unitvec(self[d1]), matutils.unitvec(self[d2]))
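Since both vectors are scaled to unit length first, the dot product above is exactly cosine similarity; a quick check with made-up vectors:

import numpy as np
from gensim import matutils

a = np.array([1.0, 2.0, 3.0])
b = np.array([3.0, 2.0, 1.0])
cos = np.dot(matutils.unitvec(a), matutils.unitvec(b))
assert np.isclose(cos, np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))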
Example 8: safe_renew_label_vec
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def safe_renew_label_vec(self):
    """
    Initialize word vectors for the words in Label_dict.
    Original version (safe).
    """
    self.Label_vec = np.empty((len(self.Label_dict), self.len_vector))
    self.Label_vec_u = np.empty((len(self.Label_dict), self.len_vector))
    for i in range(len(self.Label_index)):
        try:
            self.Label_vec[i, :] = self.model.wv.__getitem__(self.Label_index[i])
            self.Label_vec_u[i, :] = matutils.unitvec(self.model.wv.__getitem__(self.Label_index[i]))
        except KeyError:  # word not in the vocabulary
            self.Label_vec[i, :] = np.zeros((1, self.len_vector))  # fall back to zeros for now; temporary workaround during debugging
            self.Label_vec_u[i, :] = np.zeros((1, self.len_vector))
Example 9: renew_label_vec
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def renew_label_vec(self):
    """
    Initialize word vectors for the words in Label_dict.
    Fast version (unstable).
    Attention: use it only when you are sure that every word in Label_index
    has a word vector in the model.
    """
    self.Label_vec = self.model.wv.__getitem__(self.Label_index)
    self.Label_vec_u = unitvec(self.Label_vec)  # row-wise normalization via the custom unitvec (Example 10)
Example 10: unitvec
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def unitvec(vector, ax=1):
    """
    Normalize a 1-D vector, or every row (ax=1) / column (ax=0) of a 2-D array,
    to unit length.
    """
    v = vector * vector
    if len(vector.shape) == 1:
        sqrtv = np.sqrt(np.sum(v))
    elif len(vector.shape) == 2:
        sqrtv = np.sqrt([np.sum(v, axis=ax)])
    else:
        raise Exception('Only 1-D and 2-D arrays are supported.')
    if ax == 1:
        result = np.divide(vector, sqrtv.T)
    elif ax == 0:
        result = np.divide(vector, sqrtv)
    else:
        raise ValueError('ax must be 0 or 1.')
    return result
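A quick sanity check (made-up matrix) that this vectorized helper agrees with gensim's matutils.unitvec column by column; note that, unlike gensim's version, a zero vector here divides by zero and yields nan:

import numpy as np
from gensim import matutils

m = np.random.rand(4, 3)
cols = unitvec(m, ax=0)  # normalize each column at once (function defined above)
for i in range(m.shape[1]):
    assert np.allclose(cols[:, i], matutils.unitvec(m[:, i]))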
Example 11: most_similar
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def most_similar(self, positive=[], negative=[], topn=10):
    if isinstance(positive, string_types) and not negative:
        # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
        positive = [positive]
    # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
    positive = [(word, 1.0) if isinstance(word, string_types + (ndarray,))
                else word for word in positive]
    negative = [(word, -1.0) if isinstance(word, string_types + (ndarray,))
                else word for word in negative]
    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, ndarray):
            mean.append(weight * word)
        elif word in self.vocab:
            mean.append(weight * self.syn0norm[self.vocab[word].index])
            all_words.add(self.vocab[word].index)
        else:
            raise KeyError("word '%s' not in vocabulary" % word)
    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)
    dists = dot(self.syn0norm, mean)
    if not topn:
        return dists
    best = argsort(dists)[::-1][:topn + len(all_words)]
    # ignore (don't return) words from the input
    result = [(self.index2word[sim], float(dists[sim]), self.syn0[sim]) for sim in best if sim not in all_words]
    return result[:topn]
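This variant also returns each hit's raw vector alongside the word and score. For comparison, a hedged sketch of the stock gensim equivalent on pre-trained KeyedVectors (the file path is hypothetical):

from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format("vectors.bin", binary=True)  # hypothetical path
print(wv.most_similar(positive=["king", "woman"], negative=["man"], topn=3))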
Example 12: __getitem__
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def __getitem__(self, query):
    """Get similarities of document `query` to all documents in the corpus.
    **or**
    If `query` is a corpus (iterable of documents), return a matrix of similarities
    of all query documents vs. all corpus documents. Using this type of batch
    query is more efficient than computing the similarities one document after
    another.
    """
    is_corpus, query = utils.is_corpus(query)
    if self.normalize:
        # self.normalize only works if the input is a plain gensim vector/corpus (as
        # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
        # as well, but in that case assume tricks are happening and don't normalize
        # anything (self.normalize has no effect).
        if matutils.ismatrix(query):
            pass  # non-gensim matrix input must already come normalized
        elif is_corpus:
            query = [matutils.unitvec(v) for v in query]
        else:
            query = matutils.unitvec(query)
    result = self.get_similarities(query)
    if self.num_best is None:
        return result
    # if the input query was a corpus (=more documents), compute the top-n
    # most similar for each document in turn
    if matutils.ismatrix(result):
        return [matutils.full2sparse_clipped(v, self.num_best) for v in result]
    else:
        # otherwise, return top-n of the single input document
        return matutils.full2sparse_clipped(result, self.num_best)
Example 13: __getitem__
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def __getitem__(self, bow):
    """
    Return log entropy representation of the input vector and/or corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)
    # unknown (new) terms will be given zero weight (NOT infinity/huge)
    vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id))
              for term_id, tf in bow if term_id in self.entr]
    if self.normalize:
        vector = matutils.unitvec(vector)
    return vector
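A hedged usage sketch of gensim's LogEntropyModel, whose __getitem__ is shown above (the toy corpus is invented for illustration):

from gensim.corpora import Dictionary
from gensim.models import LogEntropyModel

texts = [["human", "machine", "interface"], ["graph", "trees", "graph"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

log_ent = LogEntropyModel(corpus)  # normalize=True by default
print(log_ent[corpus[1]])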
Example 14: add_documents
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def add_documents(self, corpus):
    """
    Extend the index with new documents.
    Internally, documents are buffered and then spilled to disk when there's
    `self.shardsize` of them (or when a query is issued).
    """
    min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
    if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
        # the last shard was incomplete (< min_ratio * shardsize); load it back
        # and add the documents there, don't start a new shard
        self.reopen_shard()
    for doc in corpus:
        if isinstance(doc, numpy.ndarray):
            doclen = len(doc)
            doc = matutils.unitvec(doc)
        elif scipy.sparse.issparse(doc):
            doclen = doc.nnz
            doc = matutils.unitvec(doc.tocsr())
        else:
            doclen = len(doc)
            if doclen < 0.3 * self.num_features:
                doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T)
            else:
                doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features))
        self.fresh_docs.append(doc)
        self.fresh_nnz += doclen
        if len(self.fresh_docs) >= self.shardsize:
            self.close_shard()
        if len(self.fresh_docs) % 10000 == 0:
            logger.info("PROGRESS: fresh_shard size=%i" % len(self.fresh_docs))
Example 15: __init__
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256):
    """
    `num_features` is the number of features in the corpus (will be determined
    automatically by scanning the corpus if not specified). See `Similarity`
    class for description of the other parameters.
    """
    if num_features is None:
        logger.warning("scanning corpus to determine the number of features (consider setting `num_features` explicitly)")
        num_features = 1 + utils.get_max_id(corpus)
    self.num_features = num_features
    self.num_best = num_best
    self.normalize = True
    self.chunksize = chunksize
    if corpus is not None:
        if self.num_features <= 0:
            raise ValueError("cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)")
        logger.info("creating matrix for %s documents and %i features" %
                    (len(corpus), num_features))
        self.index = numpy.empty(shape=(len(corpus), num_features), dtype=dtype)
        # iterate over corpus, populating the numpy index matrix with (normalized)
        # document vectors
        for docno, vector in enumerate(corpus):
            if docno % 1000 == 0:
                logger.debug("PROGRESS: at document #%i/%i" % (docno, len(corpus)))
            # individual documents may in fact be in numpy/scipy.sparse format as well.
            # it's not documented because otherwise it's not fully supported throughout.
            # the user had better know what they're doing (no normalization, must
            # explicitly supply num_features etc).
            if isinstance(vector, numpy.ndarray):
                pass
            elif scipy.sparse.issparse(vector):
                vector = vector.toarray().flatten()
            else:
                vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
            self.index[docno] = vector
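A hedged usage sketch of constructing and querying a MatrixSimilarity index like the one initialized above (the toy corpus is invented for illustration):

from gensim.corpora import Dictionary
from gensim.similarities import MatrixSimilarity

texts = [["cat", "sat", "mat"], ["cat", "cat", "dog"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]

index = MatrixSimilarity(corpus, num_features=len(dictionary))
print(index[dictionary.doc2bow(["cat", "dog"])])  # cosine similarities to every indexed document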