当前位置: 首页>>代码示例>>Python>>正文


Python matutils.sparse2full方法代码示例

本文整理汇总了Python中gensim.matutils.sparse2full方法的典型用法代码示例。如果您正苦于以下问题:Python matutils.sparse2full方法的具体用法?Python matutils.sparse2full怎么用?Python matutils.sparse2full使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在gensim.matutils的用法示例。


在下文中一共展示了matutils.sparse2full方法的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: testTransform

# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import sparse2full [as 别名]
def testTransform(self):
        """Test lsi[vector] transformation."""
        # create the transformation model
        model = lsimodel.LsiModel(self.corpus, num_topics=2)

        # make sure the decomposition is enough accurate
        u, s, vt = scipy.linalg.svd(matutils.corpus2dense(self.corpus, self.corpus.num_terms), full_matrices=False)
        self.assertTrue(numpy.allclose(s[:2], model.projection.s)) # singular values must match

        # transform one document
        doc = list(self.corpus)[0]
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests
        expected = numpy.array([-0.6594664, 0.142115444]) # scaled LSI version
        # expected = numpy.array([-0.1973928, 0.05591352]) # non-scaled LSI version
        self.assertTrue(numpy.allclose(abs(vec), abs(expected))) # transformed entries must be equal up to sign 
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:18,代码来源:test_models.py

示例2: search_similar_bugs

# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import sparse2full [as 别名]
def search_similar_bugs(self, query):
        query = self.text_preprocess(self.get_text(query))

        dense1 = sparse2full(
            self.model[self.dictionary.doc2bow(query)], self.model.num_topics
        )
        distances = []

        for idx in range(len(self.corpus)):
            dense2 = sparse2full(
                self.model[self.dictionary.doc2bow(self.corpus[idx])],
                self.model.num_topics,
            )
            hellinger_distance = np.sqrt(
                0.5 * ((np.sqrt(dense1) - np.sqrt(dense2)) ** 2).sum()
            )

            distances.append((self.bug_ids[idx], hellinger_distance))

        distances.sort(key=lambda v: v[1])

        return [distance[0] for distance in distances[:10]] 
开发者ID:mozilla,项目名称:bugbug,代码行数:24,代码来源:similarity.py

示例3: __getitem__

# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import sparse2full [as 别名]
def __getitem__(self, bow):
        """
        Return RP representation of the input vector and/or corpus.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        vec = matutils.sparse2full(bow, self.num_terms).reshape(self.num_terms, 1) / numpy.sqrt(self.num_topics)
        vec = numpy.asfortranarray(vec, dtype=numpy.float32)
        topic_dist = numpy.dot(self.projection, vec) # (k, d) * (d, 1) = (k, 1)
        return [(topicid, float(topicvalue)) for topicid, topicvalue in enumerate(topic_dist.flat)
                if numpy.isfinite(topicvalue) and not numpy.allclose(topicvalue, 0.0)] 
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:16,代码来源:rpmodel.py

示例4: add_documents

# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import sparse2full [as 别名]
def add_documents(self, corpus):
        """
        Extend the index with new documents.

        Internally, documents are buffered and then spilled to disk when there's
        `self.shardsize` of them (or when a query is issued).
        """
        min_ratio = 1.0 # 0.5 to only reopen shards that are <50% complete
        if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
            # The last shard was incomplete (<; load it back and add the documents there, don't start a new shard
            self.reopen_shard()
        for doc in corpus:
            if isinstance(doc, numpy.ndarray):
                doclen = len(doc)
            elif scipy.sparse.issparse(doc):
                doclen = doc.nnz
            else:
                doclen = len(doc)
                if doclen < 0.3 * self.num_features:
                    doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T)
                else:
                    doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features))
            self.fresh_docs.append(doc)
            self.fresh_nnz += doclen
            if len(self.fresh_docs) >= self.shardsize:
                self.close_shard()
            if len(self.fresh_docs) % 10000 == 0:
                logger.info("PROGRESS: fresh_shard size=%i" % len(self.fresh_docs)) 
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:30,代码来源:docsim.py

示例5: __init__

# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import sparse2full [as 别名]
def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256):
        """
        `num_features` is the number of features in the corpus (will be determined
        automatically by scanning the corpus if not specified). See `Similarity`
        class for description of the other parameters.

        """
        if num_features is None:
            logger.warning("scanning corpus to determine the number of features (consider setting `num_features` explicitly)")
            num_features = 1 + utils.get_max_id(corpus)

        self.num_features = num_features
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize

        if corpus is not None:
            if self.num_features <= 0:
                raise ValueError("cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)")
            logger.info("creating matrix for %s documents and %i features" %
                         (len(corpus), num_features))
            self.index = numpy.empty(shape=(len(corpus), num_features), dtype=dtype)
            # iterate over corpus, populating the numpy index matrix with (normalized)
            # document vectors
            for docno, vector in enumerate(corpus):
                if docno % 1000 == 0:
                    logger.debug("PROGRESS: at document #%i/%i" % (docno, len(corpus)))
                # individual documents in fact may be in numpy.scipy.sparse format as well.
                # it's not documented because other it's not fully supported throughout.
                # the user better know what he's doing (no normalization, must
                # explicitly supply num_features etc).
                if isinstance(vector, numpy.ndarray):
                    pass
                elif scipy.sparse.issparse(vector):
                    vector = vector.toarray().flatten()
                else:
                    vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
                self.index[docno] = vector 
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:40,代码来源:docsim.py

示例6: get_similarities

# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import sparse2full [as 别名]
def get_similarities(self, query):
        """
        Return similarity of sparse vector `query` to all documents in the corpus,
        as a numpy array.

        If `query` is a collection of documents, return a 2D array of similarities
        of each document in `query` to all documents in the corpus (=batch query,
        faster than processing each document in turn).

        **Do not use this function directly; use the self[query] syntax instead.**

        """
        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = numpy.asarray([matutils.sparse2full(vec, self.num_features) for vec in query],
                                  dtype=self.index.dtype)
        else:
            if scipy.sparse.issparse(query):
                query = query.toarray() # convert sparse to dense
            elif isinstance(query, numpy.ndarray):
                pass
            else:
                # default case: query is a single vector in sparse gensim format
                query = matutils.sparse2full(query, self.num_features)
            query = numpy.asarray(query, dtype=self.index.dtype)

        # do a little transposition dance to stop numpy from making a copy of
        # self.index internally in numpy.dot (very slow).
        result = numpy.dot(self.index, query.T).T # return #queries x #index
        return result # XXX: removed casting the result from array to list; does anyone care?
#endclass MatrixSimilarity 
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:33,代码来源:docsim.py

示例7: testFull

# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import sparse2full [as 别名]
def testFull(self, num_best=None, shardsize=100):
        if self.cls == similarities.Similarity:
            index = self.cls(None, corpus, num_features=len(dictionary), shardsize=shardsize)
        else:
            index = self.cls(corpus, num_features=len(dictionary))
        if isinstance(index, similarities.MatrixSimilarity):
            expected = numpy.array([
                [ 0.57735026, 0.57735026, 0.57735026, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ],
                [ 0.40824831, 0.0, 0.0, 0.40824831, 0.40824831, 0.40824831, 0.40824831, 0.40824831, 0.0, 0.0, 0.0, 0.0 ],
                [ 0.0, 0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0 ],
                [ 0.0, 0.40824831, 0.0, 0.0, 0.0, 0.81649661, 0.0, 0.0, 0.40824831, 0.0, 0.0, 0.0 ],
                [ 0.0, 0.0, 0.0, 0.57735026, 0.0, 0.0, 0.57735026, 0.57735026, 0.0, 0.0, 0.0, 0.0 ],
                [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0 ],
                [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.70710677, 0.70710677, 0.0 ],
                [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.57735026, 0.57735026 ],
                [ 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.57735026 ]
                ], dtype=numpy.float32)
            self.assertTrue(numpy.allclose(expected, index.index))
        index.num_best = num_best
        query = corpus[0]
        sims = index[query]
        expected = [(0, 0.99999994), (2, 0.28867513), (3, 0.23570226), (1, 0.23570226)][ : num_best]

        # convert sims to full numpy arrays, so we can use allclose() and ignore
        # ordering of items with the same similarity value
        expected = matutils.sparse2full(expected, len(index))
        if num_best is not None: # when num_best is None, sims is already a numpy array
            sims = matutils.sparse2full(sims, len(index))
        self.assertTrue(numpy.allclose(expected, sims))
        if self.cls == similarities.Similarity:
            index.destroy() 
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:33,代码来源:test_similarities.py

示例8: testCorpusTransform

# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import sparse2full [as 别名]
def testCorpusTransform(self):
        """Test lsi[corpus] transformation."""
        model = lsimodel.LsiModel(self.corpus, num_topics=2)
        got = numpy.vstack(matutils.sparse2full(doc, 2) for doc in model[corpus])
        expected = numpy.array([
            [ 0.65946639,  0.14211544],
            [ 2.02454305, -0.42088759],
            [ 1.54655361,  0.32358921],
            [ 1.81114125,  0.5890525 ],
            [ 0.9336738 , -0.27138939],
            [ 0.01274618, -0.49016181],
            [ 0.04888203, -1.11294699],
            [ 0.08063836, -1.56345594],
            [ 0.27381003, -1.34694159]])
        self.assertTrue(numpy.allclose(abs(got), abs(expected))) # must equal up to sign 
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:17,代码来源:test_models.py

示例9: testOnlineTransform

# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import sparse2full [as 别名]
def testOnlineTransform(self):
        corpus = list(self.corpus)
        doc = corpus[0] # use the corpus' first document for testing

        # create the transformation model
        model2 = lsimodel.LsiModel(corpus=corpus, num_topics=5) # compute everything at once
        model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5) # start with no documents, we will add them later

        # train model on a single document
        model.add_documents([corpus[0]])

        # transform the testing document with this partial transformation
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests
        expected = numpy.array([-1.73205078, 0.0, 0.0, 0.0, 0.0]) # scaled LSI version
        self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign

        # train on another 4 documents
        model.add_documents(corpus[1:5], chunksize=2) # train on 4 extra docs, in chunks of 2 documents, for the lols

        # transform a document with this partial transformation
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests
        expected = numpy.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269]) # scaled LSI version
        self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign

        # train on the rest of documents
        model.add_documents(corpus[5:])

        # make sure the final transformation is the same as if we had decomposed the whole corpus at once
        vec1 = matutils.sparse2full(model[doc], model.num_topics)
        vec2 = matutils.sparse2full(model2[doc], model2.num_topics)
        self.assertTrue(numpy.allclose(abs(vec1), abs(vec2), atol=1e-5)) # the two LSI representations must equal up to sign 
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:35,代码来源:test_models.py


注:本文中的gensim.matutils.sparse2full方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。