本文整理汇总了 Python 中 gensim.matutils.sparse2full 方法的典型用法代码示例。如果您正苦于以下问题：Python matutils.sparse2full 方法的具体用法？Python matutils.sparse2full 怎么用？Python matutils.sparse2full 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 gensim.matutils 的用法示例。
在下文中一共展示了matutils.sparse2full方法的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: testTransform
# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import sparse2full [as 别名]
def testTransform(self):
    """Check that ``lsi[vector]`` projects a single document as expected.

    A two-topic LSI model is fitted on ``self.corpus``; its singular values
    are cross-checked against a direct SVD of the dense corpus matrix, and
    the projection of the first document is compared (up to sign) with
    hard-coded reference values.
    """
    lsi = lsimodel.LsiModel(self.corpus, num_topics=2)

    # The decomposition must be accurate enough: the model's singular values
    # have to match the two leading ones from an exact SVD.
    dense_corpus = matutils.corpus2dense(self.corpus, self.corpus.num_terms)
    _, singular_values, _ = scipy.linalg.svd(dense_corpus, full_matrices=False)
    self.assertTrue(numpy.allclose(singular_values[:2], lsi.projection.s))

    # Project the first document and densify it so allclose() can be used.
    first_doc = list(self.corpus)[0]
    projected = matutils.sparse2full(lsi[first_doc], 2)

    # Reference values for the scaled LSI version
    # (the non-scaled version would be [-0.1973928, 0.05591352]).
    scaled_expected = numpy.array([-0.6594664, 0.142115444])

    # Singular vectors are only defined up to sign, so compare magnitudes.
    self.assertTrue(numpy.allclose(abs(projected), abs(scaled_expected)))
示例2: search_similar_bugs
# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import sparse2full [as 别名]
def search_similar_bugs(self, query):
    """Return the ids of the (at most) ten corpus entries closest to ``query``.

    The query is passed through ``self.get_text`` and
    ``self.text_preprocess``, turned into a bag of words with
    ``self.dictionary.doc2bow`` and projected with ``self.model``. Every
    entry of ``self.corpus`` is projected the same way and ranked by its
    Hellinger distance to the query.

    :param query: object accepted by ``self.get_text``.
    :return: list of elements of ``self.bug_ids``, nearest first, at most 10.
    """
    num_topics = self.model.num_topics

    tokens = self.text_preprocess(self.get_text(query))
    query_vec = sparse2full(self.model[self.dictionary.doc2bow(tokens)], num_topics)
    # The query side of the Hellinger formula is loop-invariant, so its
    # element-wise square root is computed once instead of per corpus entry.
    sqrt_query = np.sqrt(query_vec)

    distances = []
    # Index-based access is kept on purpose: ``self.corpus`` is only known to
    # support len() and integer indexing, and ``self.bug_ids`` is aligned by index.
    for idx in range(len(self.corpus)):
        doc_bow = self.dictionary.doc2bow(self.corpus[idx])
        doc_vec = sparse2full(self.model[doc_bow], num_topics)
        hellinger_distance = np.sqrt(
            0.5 * ((sqrt_query - np.sqrt(doc_vec)) ** 2).sum()
        )
        distances.append((self.bug_ids[idx], hellinger_distance))

    # Stable sort: entries with equal distance keep their corpus order.
    distances.sort(key=lambda pair: pair[1])
    return [bug_id for bug_id, _ in distances[:10]]
示例3: __getitem__
# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import sparse2full [as 别名]
def __getitem__(self, bow):
    """Return the RP representation of the input vector and/or corpus.

    If ``bow`` is detected to be a corpus by ``utils.is_corpus``, the result
    of ``self._apply`` is returned. Otherwise a list of
    ``(topic_id, value)`` pairs is returned, omitting entries that are
    non-finite or numerically close to zero.
    """
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        # Whole corpus given: hand it to _apply instead of projecting here.
        return self._apply(bow)

    # Densify into a column vector scaled by 1/sqrt(num_topics).
    dense = matutils.sparse2full(bow, self.num_terms)
    column = dense.reshape(self.num_terms, 1) / numpy.sqrt(self.num_topics)
    column = numpy.asfortranarray(column, dtype=numpy.float32)
    projected = numpy.dot(self.projection, column)  # (k, d) * (d, 1) = (k, 1)

    result = []
    for topicid, topicvalue in enumerate(projected.flat):
        if not numpy.isfinite(topicvalue):
            continue
        if numpy.allclose(topicvalue, 0.0):
            continue
        result.append((topicid, float(topicvalue)))
    return result
示例4: add_documents
# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import sparse2full [as 别名]
def add_documents(self, corpus):
    """Extend the index with new documents.

    Internally, documents are buffered in ``self.fresh_docs`` and spilled to
    disk via ``self.close_shard()`` once there are ``self.shardsize`` of them
    (or when a query is issued).

    :param corpus: iterable of documents; each may be a numpy array, a
        scipy sparse matrix, or a document in gensim sparse format.
    """
    min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
    last_shard_incomplete = self.shards and len(self.shards[-1]) < min_ratio * self.shardsize
    if last_shard_incomplete:
        # The last shard was incomplete; load it back and add the documents
        # there instead of starting a new shard.
        self.reopen_shard()

    for doc in corpus:
        if isinstance(doc, numpy.ndarray):
            doclen = len(doc)
        elif scipy.sparse.issparse(doc):
            doclen = doc.nnz
        else:
            # NOTE(review): the density check below is assumed to apply only
            # to gensim-format documents (this branch) - confirm against upstream.
            doclen = len(doc)
            if doclen < 0.3 * self.num_features:
                # Few entries relative to num_features: keep it sparse.
                vec = matutils.corpus2csc([doc], self.num_features).T
            else:
                vec = matutils.sparse2full(doc, self.num_features)
            doc = matutils.unitvec(vec)

        self.fresh_docs.append(doc)
        self.fresh_nnz += doclen

        if len(self.fresh_docs) >= self.shardsize:
            self.close_shard()
        if len(self.fresh_docs) % 10000 == 0:
            logger.info("PROGRESS: fresh_shard size=%i" % len(self.fresh_docs))
示例5: __init__
# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import sparse2full [as 别名]
def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256):
    """Build a dense in-memory index matrix from ``corpus``.

    `num_features` is the number of features in the corpus (determined by
    scanning the corpus with ``utils.get_max_id`` if not specified). See the
    `Similarity` class for a description of the other parameters.

    :raises ValueError: if a corpus is given but the feature count is <= 0.
    """
    if num_features is None:
        logger.warning("scanning corpus to determine the number of features (consider setting `num_features` explicitly)")
        num_features = 1 + utils.get_max_id(corpus)

    self.num_features = num_features
    self.num_best = num_best
    self.normalize = True
    self.chunksize = chunksize

    if corpus is None:
        # Nothing to index; self.index is not created in this case.
        return

    if self.num_features <= 0:
        raise ValueError("cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)")

    num_docs = len(corpus)
    logger.info("creating matrix for %s documents and %i features" %
                (num_docs, num_features))
    self.index = numpy.empty(shape=(num_docs, num_features), dtype=dtype)

    # Fill the index row by row with (normalized) document vectors.
    for docno, vector in enumerate(corpus):
        if docno % 1000 == 0:
            logger.debug("PROGRESS: at document #%i/%i" % (docno, num_docs))
        # Individual documents may also be numpy/scipy.sparse vectors. This is
        # undocumented because it is not fully supported throughout: such
        # input is stored without normalization, so the caller must know
        # what they are doing (and explicitly supply num_features etc).
        if isinstance(vector, numpy.ndarray):
            row = vector
        elif scipy.sparse.issparse(vector):
            row = vector.toarray().flatten()
        else:
            row = matutils.unitvec(matutils.sparse2full(vector, num_features))
        self.index[docno] = row
示例6: get_similarities
# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import sparse2full [as 别名]
def get_similarities(self, query):
    """Return similarity of sparse vector `query` to all indexed documents.

    The result is a numpy array. If `query` is a collection of documents,
    a 2D array is returned holding the similarities of each query document
    to all documents in the index (batch query, faster than processing each
    document in turn).

    **Do not use this function directly; use the self[query] syntax instead.**
    """
    is_corpus, query = utils.is_corpus(query)

    if is_corpus:
        # Batch query: densify every document.
        dense = [matutils.sparse2full(vec, self.num_features) for vec in query]
    elif scipy.sparse.issparse(query):
        dense = query.toarray()  # convert sparse to dense
    elif isinstance(query, numpy.ndarray):
        dense = query
    else:
        # Default case: a single vector in sparse gensim format.
        dense = matutils.sparse2full(query, self.num_features)

    query = numpy.asarray(dense, dtype=self.index.dtype)

    # Transposing on both sides keeps numpy.dot from making an internal copy
    # of self.index (very slow); the result is #queries x #index.
    return numpy.dot(self.index, query.T).T
#endclass MatrixSimilarity
示例7: testFull
# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import sparse2full [as 别名]
def testFull(self, num_best=None, shardsize=100):
    """Build an index over the module-level corpus and check a query against it.

    For ``MatrixSimilarity`` indexes the stored matrix is also compared with
    reference values. The first corpus document is then used as the query
    and its similarities are compared with hard-coded expectations,
    truncated to ``num_best`` entries.
    """
    uses_shards = self.cls == similarities.Similarity
    if uses_shards:
        index = self.cls(None, corpus, num_features=len(dictionary), shardsize=shardsize)
    else:
        index = self.cls(corpus, num_features=len(dictionary))

    if isinstance(index, similarities.MatrixSimilarity):
        reference_matrix = numpy.array([
            [0.57735026, 0.57735026, 0.57735026, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            [0.40824831, 0.0, 0.0, 0.40824831, 0.40824831, 0.40824831, 0.40824831, 0.40824831, 0.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0],
            [0.0, 0.40824831, 0.0, 0.0, 0.0, 0.81649661, 0.0, 0.0, 0.40824831, 0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.57735026, 0.0, 0.0, 0.57735026, 0.57735026, 0.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.70710677, 0.70710677, 0.0],
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.57735026, 0.57735026],
            [0.0, 0.0, 0.0, 0.0, 0.57735026, 0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.57735026],
        ], dtype=numpy.float32)
        self.assertTrue(numpy.allclose(reference_matrix, index.index))

    index.num_best = num_best
    sims = index[corpus[0]]

    top_hits = [(0, 0.99999994), (2, 0.28867513), (3, 0.23570226), (1, 0.23570226)]
    # Convert to full numpy arrays so allclose() can be used and the ordering
    # of items with the same similarity value is ignored.
    expected = matutils.sparse2full(top_hits[:num_best], len(index))
    if num_best is not None:
        # When num_best is None, sims is already a numpy array.
        sims = matutils.sparse2full(sims, len(index))
    self.assertTrue(numpy.allclose(expected, sims))

    if uses_shards:
        index.destroy()
示例8: testCorpusTransform
# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import sparse2full [as 别名]
def testCorpusTransform(self):
    """Test lsi[corpus] transformation.

    A two-topic LSI model is trained on ``self.corpus``; every document of
    the transformed corpus is densified and the stacked result is compared
    (up to sign) with reference values.
    """
    model = lsimodel.LsiModel(self.corpus, num_topics=2)
    # numpy.vstack requires a real sequence: passing a generator has been
    # deprecated since NumPy 1.16 and is rejected by current releases, so the
    # rows are materialized in a list first.
    # NOTE(review): the transformed corpus is the module-level ``corpus``,
    # not ``self.corpus`` - presumably the same data; confirm.
    got = numpy.vstack([matutils.sparse2full(doc, 2) for doc in model[corpus]])
    expected = numpy.array([
        [0.65946639, 0.14211544],
        [2.02454305, -0.42088759],
        [1.54655361, 0.32358921],
        [1.81114125, 0.5890525],
        [0.9336738, -0.27138939],
        [0.01274618, -0.49016181],
        [0.04888203, -1.11294699],
        [0.08063836, -1.56345594],
        [0.27381003, -1.34694159],
    ])
    # Singular vectors are only defined up to sign, so compare magnitudes.
    self.assertTrue(numpy.allclose(abs(got), abs(expected)))
示例9: testOnlineTransform
# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import sparse2full [as 别名]
def testOnlineTransform(self):
    """Check that incrementally trained LSI matches one-shot training.

    A model is trained in three steps (1 document, 4 more documents in
    chunks of 2, then the rest); after the first two steps the projection of
    the first corpus document is compared with reference values, and after
    the last step with the projection from a model trained on everything at
    once. All comparisons are up to sign.
    """
    docs = list(self.corpus)
    probe = docs[0]  # use the corpus' first document for testing

    # Reference model sees everything at once; the online model starts empty.
    batch_model = lsimodel.LsiModel(corpus=docs, num_topics=5)
    online_model = lsimodel.LsiModel(corpus=None, id2word=batch_model.id2word, num_topics=5)

    def assert_probe_projection(expected):
        # Densify the current projection of the probe for easier comparison.
        vec = matutils.sparse2full(online_model[probe], online_model.num_topics)
        self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6))

    # Step 1: train on a single document.
    online_model.add_documents([docs[0]])
    assert_probe_projection(numpy.array([-1.73205078, 0.0, 0.0, 0.0, 0.0]))  # scaled LSI version

    # Step 2: train on 4 extra documents, in chunks of 2.
    online_model.add_documents(docs[1:5], chunksize=2)
    assert_probe_projection(
        numpy.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269])  # scaled LSI version
    )

    # Step 3: train on the remaining documents.
    online_model.add_documents(docs[5:])

    # The final transformation must equal the one-shot decomposition.
    vec1 = matutils.sparse2full(online_model[probe], online_model.num_topics)
    vec2 = matutils.sparse2full(batch_model[probe], batch_model.num_topics)
    self.assertTrue(numpy.allclose(abs(vec1), abs(vec2), atol=1e-5))