This article collects typical usage examples of the Python method gensim.matutils.unitvec. If you are wondering what exactly matutils.unitvec does and how to use it, the curated method examples below should help; you can also explore the containing module, gensim.matutils, further.
The following presents 15 code examples of matutils.unitvec, sorted by popularity by default.
Example 1: similarity_label
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def similarity_label(self, words, normalization=True):
    """
    Compute similarities against the label vectors; more than one word
    can be calculated at the same time.
    """
    if self.model is None:
        raise Exception('no model.')
    if isinstance(words, string_types):
        words = [words]
    vectors = np.transpose(self.model.wv.__getitem__(words))
    if normalization:
        unit_vector = unitvec(vectors, ax=0)  # vectorized; roughly twice as fast as the column-by-column loop below
        # unit_vector = np.zeros((len(vectors), len(words)))
        # for i in range(len(words)):
        #     unit_vector[:, i] = matutils.unitvec(vectors[:, i])
        dists = np.dot(self.Label_vec_u, unit_vector)
    else:
        dists = np.dot(self.Label_vec, vectors)
    return dists
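A quick check (with made-up random vectors, not part of the original class) that the vectorized normalization above matches normalizing each column with gensim's own unitvec; the custom unitvec helper is defined in Example 10 below:

import numpy as np
from gensim import matutils

vectors = np.random.rand(100, 3)  # stand-in for self.model.wv[words].T: 100-dim vectors for 3 words
unit_all = vectors / np.sqrt((vectors * vectors).sum(axis=0))  # normalize every column at once
unit_cols = np.column_stack([matutils.unitvec(vectors[:, i]) for i in range(3)])  # column by column
assert np.allclose(unit_all, unit_cols)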
Example 2: __getitem__
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def __getitem__(self, bow, eps=1e-12):
    """
    Return tf-idf representation of the input vector and/or corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)
    # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
    # as strict application of the IDF formula would dictate)
    vector = [(termid, self.wlocal(tf) * self.idfs.get(termid))
              for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0]
    # and finally, normalize the vector either to unit length, or use a
    # user-defined normalization function
    if self.normalize is True:
        vector = matutils.unitvec(vector)
    elif self.normalize:
        vector = self.normalize(vector)
    # make sure there are no explicit zeroes in the vector (must be sparse)
    vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
    return vector
#endclass TfidfModel
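A hedged usage sketch of gensim's TfidfModel, whose __getitem__ is shown above (the toy corpus is invented for illustration):

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

texts = [["cat", "sat", "mat"], ["cat", "cat", "dog"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus)  # normalize=True by default, applied via matutils.unitvec
print(tfidf[corpus[0]])     # sparse (term_id, weight) pairs scaled to unit length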
Example 3: calculate_text_similar
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
    """
    Find the most similar sentences: dot product of the sentence vector with the matrix.
    :param vec_ques: question sentence vector
    :param matrix_org_norm: row-normalized matrix of reference sentence vectors
    :param matrix_org_index: index mapping rows of the matrix to sentences
    :param top_vec: number of top hits to return
    :return: list of [index, score] pairs
    """
    # normalize the question vector; unitvec scales a vector to unit length
    # (the only exception is the zero vector, which is returned unchanged)
    vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
    # dot the matrix with the vector, i.e. the question against every question in the reference set
    matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
    # sort by similarity
    most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
    # collect the index and score of the most similar reference questions
    index_score = []
    for t in most_similar_sentence_vec_sort[:top_vec]:
        index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
    return index_score
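A minimal sketch of how the function above might be called, assuming it is defined in the same script; the sentence-vector matrix and the module-level numpy_type are invented for illustration:

import numpy as np
from gensim import matutils

numpy_type = np.float32                    # assumed dtype used by the snippet above
sentence_vecs = np.random.rand(1000, 300)  # made-up reference sentence vectors
matrix_org_norm = np.vstack([matutils.unitvec(v) for v in sentence_vecs]).astype(numpy_type)
matrix_org_index = list(range(len(sentence_vecs)))  # identity index for the demo

vec_ques = np.random.rand(300)  # made-up question vector
print(calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec=5))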
Example 4: shift_clip_pmi
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def shift_clip_pmi(pmimtr, k_shift=1.0):
    """
    Turns a PMI matrix into an SPPMI matrix by shifting all values down
    by log(k) and then clipping negative values to 0.
    :param pmimtr: The matrix of PMI values.
    :param k_shift: The shift factor k.
    :return: A shifted positive PMI (SPPMI) matrix.
    """
    logger.info("shifting PMI scores by log(k) with k=%s" % (k_shift, ))
    pmimtr -= np.log(k_shift)  # shifted PMI = log(#(w, c) * D / (#w * #c)) - log(k)
    logger.info("clipping PMI scores to be non-negative PPMI")
    pmimtr.clip(0.0, out=pmimtr)  # SPPMI = max(0, log(#(w, c) * D / (#w * #c)) - log(k))
    logger.info("normalizing PPMI word vectors to unit length")
    for i, vec in enumerate(pmimtr):
        pmimtr[i] = matutils.unitvec(vec)
    return matutils.corpus2csc(matutils.Dense2Corpus(pmimtr, documents_columns=False)).T
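A tiny numeric check of the shift-and-clip step (plain numpy, independent of the function above):

import numpy as np

pmi = np.array([[1.2, -0.3], [0.0, 2.5]])
k = 2.0
sppmi = np.clip(pmi - np.log(k), 0.0, None)  # SPPMI = max(0, PMI - log k)
# log(2) is about 0.693, so 1.2 -> ~0.507, while -0.3 and 0.0 -> 0, and 2.5 -> ~1.807
print(sppmi)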
Example 5: similarity_3_contexts
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def similarity_3_contexts(self, t, p):
    (bef, bet, aft) = (0, 0, 0)
    if t.bef_vector is not None and p.bef_vector is not None:
        bef = dot(matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector))
    if t.bet_vector is not None and p.bet_vector is not None:
        bet = dot(matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector))
    if t.aft_vector is not None and p.aft_vector is not None:
        aft = dot(matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector))
    return self.config.alpha * bef + \
        self.config.beta * bet + \
        self.config.gamma * aft
Example 6: similarity_cluster
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def similarity_cluster(self, p1, p2):
    count = 0
    score = 0
    if self.config.alpha == 0 and self.config.gamma == 0:
        p1.merge_all_tuples_bet()
        p2.merge_all_tuples_bet()
        for v_bet1 in p1.bet_uniques_vectors:
            for v_bet2 in p2.bet_uniques_vectors:
                if v_bet1 is not None and v_bet2 is not None:
                    score += dot(
                        matutils.unitvec(asarray(v_bet1)),
                        matutils.unitvec(asarray(v_bet2))
                    )
                    count += 1
    else:
        for t1 in p1.tuples:
            for t2 in p2.tuples:
                score += self.similarity_3_contexts(t1, t2)
                count += 1
    if count == 0:  # guard against empty clusters to avoid division by zero
        return 0.0
    return float(score) / float(count)
Example 7: similarity
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def similarity(self, d1: int, d2: int) -> float:
    """Compute cosine similarity between two sentences from the training set.

    Parameters
    ----------
    d1 : int
        Index of the first sentence.
    d2 : int
        Index of the second sentence.

    Returns
    -------
    float
        The cosine similarity between the vectors of the two sentences.
    """
    return dot(matutils.unitvec(self[d1]), matutils.unitvec(self[d2]))
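Since both vectors are scaled to unit length first, the dot product above is exactly cosine similarity; a quick check with made-up vectors:

import numpy as np
from gensim import matutils

a = np.array([1.0, 2.0, 3.0])
b = np.array([3.0, 2.0, 1.0])
cos = np.dot(matutils.unitvec(a), matutils.unitvec(b))
assert np.isclose(cos, np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))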
Example 8: safe_renew_label_vec
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def safe_renew_label_vec(self):
    """
    Initialize word vectors for the words in Label_dict.
    Original version (safe).
    """
    self.Label_vec = np.empty((len(self.Label_dict), self.len_vector))
    self.Label_vec_u = np.empty((len(self.Label_dict), self.len_vector))
    for i in range(len(self.Label_index)):
        try:
            self.Label_vec[i, :] = self.model.wv.__getitem__(self.Label_index[i])
            self.Label_vec_u[i, :] = matutils.unitvec(self.model.wv.__getitem__(self.Label_index[i]))
        except KeyError:  # word not in the vocabulary
            self.Label_vec[i, :] = np.zeros((1, self.len_vector))  # fall back to zeros for now; temporary workaround during debugging
            self.Label_vec_u[i, :] = np.zeros((1, self.len_vector))
Example 9: renew_label_vec
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def renew_label_vec(self):
    """
    Initialize word vectors for the words in Label_dict.
    Fast version (unstable).
    Attention: use it only when you are sure that every word in Label_index
    has a word vector in the model.
    """
    self.Label_vec = self.model.wv.__getitem__(self.Label_index)
    self.Label_vec_u = unitvec(self.Label_vec)  # row-wise normalization via the custom unitvec (Example 10)
Example 10: unitvec
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def unitvec(vector, ax=1):
    """
    Normalize a 1-D vector, or every row (ax=1) / column (ax=0) of a 2-D array,
    to unit length.
    """
    v = vector * vector
    if len(vector.shape) == 1:
        sqrtv = np.sqrt(np.sum(v))
    elif len(vector.shape) == 2:
        sqrtv = np.sqrt([np.sum(v, axis=ax)])
    else:
        raise Exception('Only 1-D and 2-D arrays are supported.')
    if ax == 1:
        result = np.divide(vector, sqrtv.T)
    elif ax == 0:
        result = np.divide(vector, sqrtv)
    else:
        raise ValueError('ax must be 0 or 1.')
    return result
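A quick sanity check (made-up matrix) that this vectorized helper agrees with gensim's matutils.unitvec column by column; note that, unlike gensim's version, a zero vector here divides by zero and yields nan:

import numpy as np
from gensim import matutils

m = np.random.rand(4, 3)
cols = unitvec(m, ax=0)  # normalize each column at once (function defined above)
for i in range(m.shape[1]):
    assert np.allclose(cols[:, i], matutils.unitvec(m[:, i]))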
Example 11: most_similar
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def most_similar(self, positive=[], negative=[], topn=10):
    if isinstance(positive, string_types) and not negative:
        # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
        positive = [positive]
    # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
    positive = [(word, 1.0) if isinstance(word, string_types + (ndarray,))
                else word for word in positive]
    negative = [(word, -1.0) if isinstance(word, string_types + (ndarray,))
                else word for word in negative]
    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, ndarray):
            mean.append(weight * word)
        elif word in self.vocab:
            mean.append(weight * self.syn0norm[self.vocab[word].index])
            all_words.add(self.vocab[word].index)
        else:
            raise KeyError("word '%s' not in vocabulary" % word)
    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)
    dists = dot(self.syn0norm, mean)
    if not topn:
        return dists
    best = argsort(dists)[::-1][:topn + len(all_words)]
    # ignore (don't return) words from the input
    result = [(self.index2word[sim], float(dists[sim]), self.syn0[sim]) for sim in best if sim not in all_words]
    return result[:topn]
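This variant also returns each hit's raw vector alongside the word and score. For comparison, a hedged sketch of the stock gensim equivalent on pre-trained KeyedVectors (the file path is hypothetical):

from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format("vectors.bin", binary=True)  # hypothetical path
print(wv.most_similar(positive=["king", "woman"], negative=["man"], topn=3))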
Example 12: __getitem__
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def __getitem__(self, query):
    """Get similarities of document `query` to all documents in the corpus.
    **or**
    If `query` is a corpus (iterable of documents), return a matrix of similarities
    of all query documents vs. all corpus documents. Using this type of batch
    query is more efficient than computing the similarities one document after
    another.
    """
    is_corpus, query = utils.is_corpus(query)
    if self.normalize:
        # self.normalize only works if the input is a plain gensim vector/corpus (as
        # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
        # as well, but in that case assume tricks are happening and don't normalize
        # anything (self.normalize has no effect).
        if matutils.ismatrix(query):
            pass  # non-gensim matrix input must already come normalized
        elif is_corpus:
            query = [matutils.unitvec(v) for v in query]
        else:
            query = matutils.unitvec(query)
    result = self.get_similarities(query)
    if self.num_best is None:
        return result
    # if the input query was a corpus (=more documents), compute the top-n
    # most similar for each document in turn
    if matutils.ismatrix(result):
        return [matutils.full2sparse_clipped(v, self.num_best) for v in result]
    else:
        # otherwise, return top-n of the single input document
        return matutils.full2sparse_clipped(result, self.num_best)
Example 13: __getitem__
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def __getitem__(self, bow):
    """
    Return log entropy representation of the input vector and/or corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)
    # unknown (new) terms will be given zero weight (NOT infinity/huge)
    vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id))
              for term_id, tf in bow if term_id in self.entr]
    if self.normalize:
        vector = matutils.unitvec(vector)
    return vector
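A hedged usage sketch of gensim's LogEntropyModel, whose __getitem__ is shown above (the toy corpus is invented for illustration):

from gensim.corpora import Dictionary
from gensim.models import LogEntropyModel

texts = [["human", "machine", "interface"], ["graph", "trees", "graph"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

log_ent = LogEntropyModel(corpus)  # normalize=True by default
print(log_ent[corpus[1]])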
Example 14: add_documents
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def add_documents(self, corpus):
    """
    Extend the index with new documents.
    Internally, documents are buffered and then spilled to disk when there's
    `self.shardsize` of them (or when a query is issued).
    """
    min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
    if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
        # the last shard was incomplete (< min_ratio * shardsize); load it back
        # and add the documents there, don't start a new shard
        self.reopen_shard()
    for doc in corpus:
        if isinstance(doc, numpy.ndarray):
            doclen = len(doc)
            doc = matutils.unitvec(doc)
        elif scipy.sparse.issparse(doc):
            doclen = doc.nnz
            doc = matutils.unitvec(doc.tocsr())
        else:
            doclen = len(doc)
            if doclen < 0.3 * self.num_features:
                doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T)
            else:
                doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features))
        self.fresh_docs.append(doc)
        self.fresh_nnz += doclen
        if len(self.fresh_docs) >= self.shardsize:
            self.close_shard()
        if len(self.fresh_docs) % 10000 == 0:
            logger.info("PROGRESS: fresh_shard size=%i" % len(self.fresh_docs))
Example 15: __init__
# Required import: from gensim import matutils [as alias]
# Or: from gensim.matutils import unitvec [as alias]
def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256):
    """
    `num_features` is the number of features in the corpus (will be determined
    automatically by scanning the corpus if not specified). See `Similarity`
    class for description of the other parameters.
    """
    if num_features is None:
        logger.warning("scanning corpus to determine the number of features (consider setting `num_features` explicitly)")
        num_features = 1 + utils.get_max_id(corpus)
    self.num_features = num_features
    self.num_best = num_best
    self.normalize = True
    self.chunksize = chunksize
    if corpus is not None:
        if self.num_features <= 0:
            raise ValueError("cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)")
        logger.info("creating matrix for %s documents and %i features" %
                    (len(corpus), num_features))
        self.index = numpy.empty(shape=(len(corpus), num_features), dtype=dtype)
        # iterate over corpus, populating the numpy index matrix with (normalized)
        # document vectors
        for docno, vector in enumerate(corpus):
            if docno % 1000 == 0:
                logger.debug("PROGRESS: at document #%i/%i" % (docno, len(corpus)))
            # individual documents may in fact be in numpy/scipy.sparse format as well.
            # it's not documented because otherwise it's not fully supported throughout.
            # the user had better know what they're doing (no normalization, must
            # explicitly supply num_features etc).
            if isinstance(vector, numpy.ndarray):
                pass
            elif scipy.sparse.issparse(vector):
                vector = vector.toarray().flatten()
            else:
                vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
            self.index[docno] = vector
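A hedged usage sketch of constructing and querying a MatrixSimilarity index like the one initialized above (the toy corpus is invented for illustration):

from gensim.corpora import Dictionary
from gensim.similarities import MatrixSimilarity

texts = [["cat", "sat", "mat"], ["cat", "cat", "dog"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]

index = MatrixSimilarity(corpus, num_features=len(dictionary))
print(index[dictionary.doc2bow(["cat", "dog"])])  # cosine similarities to every indexed document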