本文整理汇总了Python中gensim.matutils.cossim方法的典型用法代码示例。如果您正苦于以下问题:Python matutils.cossim方法的具体用法?Python matutils.cossim怎么用?Python matutils.cossim使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块gensim.matutils的用法示例。
在下文中一共展示了matutils.cossim方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: similarity
# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import cossim [as 别名]
def similarity(self, t, extraction_pattern):
    """Score ``t`` against ``extraction_pattern`` as a weighted sum of cosines.

    Three context pairs are compared with ``cossim``:
    ``t.bef_vector``/``centroid_bef``, ``t.bet_vector``/``centroid_bet`` and
    ``t.aft_vector``/``centroid_aft``.  A pair contributes 0 when either of
    its members is ``None``.

    :param t: object exposing ``bef_vector``, ``bet_vector``, ``aft_vector``.
    :param extraction_pattern: object exposing ``centroid_bef``,
        ``centroid_bet``, ``centroid_aft``.
    :return: ``alpha*bef + beta*bet + gamma*aft``, with the three weights
        read from ``self.config``.
    """
    bef = bet = aft = 0

    # Each guard skips the cosine when either side of the pair is missing.
    if not (t.bef_vector is None or extraction_pattern.centroid_bef is None):
        bef = cossim(t.bef_vector, extraction_pattern.centroid_bef)

    if not (t.bet_vector is None or extraction_pattern.centroid_bet is None):
        bet = cossim(t.bet_vector, extraction_pattern.centroid_bet)

    if not (t.aft_vector is None or extraction_pattern.centroid_aft is None):
        aft = cossim(t.aft_vector, extraction_pattern.centroid_aft)

    return (
        self.config.alpha * bef
        + self.config.beta * bet
        + self.config.gamma * aft
    )
示例2: getComparable
# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import cossim [as 别名]
def getComparable(source_lsi_doc, target_lsi_corpus):
    """Find the entry of ``target_lsi_corpus`` most similar to ``source_lsi_doc``.

    Every target document is compared with the source document using
    ``matutils.cossim``.

    :param source_lsi_doc: the query document vector, passed to
        ``matutils.cossim`` unchanged.
    :param target_lsi_corpus: iterable of candidate document vectors.
    :return: ``(index, similarity)`` tuple of the best match; when several
        candidates share the top similarity the lowest index wins.
    :raises IndexError: if ``target_lsi_corpus`` is empty.
    """
    sims = [
        matutils.cossim(source_lsi_doc, target_doc)
        for target_doc in target_lsi_corpus
    ]
    if not sims:
        # Same exception type the previous ``sorted(...)[0]`` lookup produced,
        # but with a message that names the actual problem.
        raise IndexError("target_lsi_corpus is empty; nothing to compare against")

    # max() returns the first maximal item, matching the old stable
    # descending sort followed by [0], without sorting the whole list.
    return max(enumerate(sims), key=lambda item: item[1])
##################################################################################
##################################################################################
# takes wiki text and a list of language codes, and returns the interlanguage links
# language code list:
# ar arabic
# en english
# fr french
# es Español
# de Deutsch
# it Italiano
# pt portuguese
# fa farsi
# ur urdu
# he hebrew
# ps pashto (Afghānī)
# sd Sindhi (sindi)
# ug Uyghur أويغورية
# pnb punjabi (Pakistan - India)
# ckb kurdi
# arz egyptian
示例3: getMaxSimilarity
# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import cossim [as 别名]
def getMaxSimilarity(dictTopic, vector):
    """Return the key of the cluster whose members are most similar to ``vector``.

    For every ``key -> cluster`` entry of ``dictTopic`` the mean of
    ``matutils.cossim(vector, v)`` over the cluster's members is computed
    (``mean`` is resolved from the enclosing module).

    :param dictTopic: mapping from a cluster key to an iterable of vectors.
    :param vector: the query vector compared against every cluster member.
    :return: ``(best_key, best_mean_similarity)``.  If no cluster has a mean
        similarity strictly greater than 0 (including when ``dictTopic`` is
        empty) the result is ``(-1, 0)``.
    """
    best_value = 0
    best_key = -1
    # .items() works on both Python 2 and 3; .iteritems() exists only on
    # Python 2 and raises AttributeError on Python 3.
    for key, cluster in dictTopic.items():
        cluster_similarity = mean([matutils.cossim(vector, v) for v in cluster])
        if cluster_similarity > best_value:
            best_value = cluster_similarity
            best_key = key
    return best_key, best_value
示例4: order_by_tf_id_rank
# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import cossim [as 别名]
def order_by_tf_id_rank(self, headline, sentences, number_of_sentences):
    """Return the sentences of ``sentences`` ranked most similar to ``headline``.

    The headline and each sentence produced by ``self.tokenizer.tokenize``
    are tokenised with ``sent2stokens_wostop``, converted to bag-of-words
    with ``self.vocab.doc2bow`` and weighted with ``self.tfidf_model``.
    Sentences are ordered by ``cossim`` against the headline, highest first;
    equal scores keep their original relative order.

    :param headline: headline text.
    :param sentences: body text to split into sentences and rank.
    :param number_of_sentences: number of top-ranked sentences to keep.  A
        value that never equals a 1-based rank (e.g. 0 or a negative number)
        keeps every sentence, as the previous implementation did.
    :return: the selected sentences concatenated in rank order, each followed
        by one space (so the result ends with a space); ``""`` when the
        tokenizer yields no sentences.
    """
    headline_bow = self.vocab.doc2bow(sent2stokens_wostop(headline))
    headline_tfidf = self.tfidf_model[headline_bow]

    # NOTE: newlines are not normalised before tokenising.  An earlier
    # revision carried a disabled ``sentences.replace('\n', ' ')`` step with
    # the remark that the punkt tokenizer does not recognise ".[newline]".
    scored_sentences = []
    for sentence in self.tokenizer.tokenize(sentences):
        sentence_bow = self.vocab.doc2bow(sent2stokens_wostop(sentence))
        # Weight the sentence with the same TF-IDF model as the headline so
        # both vectors are in the same space; previously the raw bag-of-words
        # counts were compared against the TF-IDF headline.
        sentence_tfidf = self.tfidf_model[sentence_bow]
        scored_sentences.append((sentence, cossim(headline_tfidf, sentence_tfidf)))

    ranked = sorted(scored_sentences, key=lambda pair: pair[1], reverse=True)

    selected = []
    for rank, (sentence, _score) in enumerate(ranked, start=1):
        selected.append(sentence + ' ')
        if rank == number_of_sentences:
            break
    return "".join(selected)
示例5: tfidf_sim
# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import cossim [as 别名]
def tfidf_sim(self, train_data, body_dict, threshold):
    """Compare each headline with its body by TF-IDF cosine and report results.

    A ``corpora.Dictionary`` and a ``models.TfidfModel`` are built from the
    tokenised body texts.  For every training sample the headline and the
    referenced body are TF-IDF weighted and compared with ``cossim``; the
    similarity, stance and ``threshold`` are handed to ``create_lists`` to
    update the four running lists.  Once all samples are processed the lists
    are passed to ``print_results`` together with ``self.model_type``.

    :param train_data: iterable of ``(headline, bodyID, stance)`` samples.
    :param body_dict: dictionary of ``{bodyID: 'bodyText'}``.
    :param threshold: used to distinguish between similar and not similar;
        forwarded to ``create_lists``.
    :return: ``None``; the outcome is reported through ``print_results``.
    """
    # Remember each body's row while tokenising the texts in dict order.
    row_of_body = {}
    body_tokens = []
    for row, (body_id, body_text) in enumerate(body_dict.items()):
        row_of_body[body_id] = row
        body_tokens.append(sent2stokens_wostop(body_text))

    vocab = corpora.Dictionary(body_tokens)
    body_bows = [vocab.doc2bow(tokens) for tokens in body_tokens]
    tfidf_model = models.TfidfModel(body_bows)

    unrelated = []
    related = []
    y_true = []
    y_pred = []
    for headline, bodyID, stance in train_data:
        headline_vec = tfidf_model[vocab.doc2bow(sent2stokens_wostop(headline))]
        body_vec = tfidf_model[body_bows[row_of_body[bodyID]]]
        sim = cossim(headline_vec, body_vec)
        unrelated, related, y_true, y_pred = create_lists(
            sim, stance, threshold, [unrelated, related, y_true, y_pred]
        )

    print_results([unrelated, related, y_true, y_pred], self.model_type)
示例6: tdidf_all_vec
# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import cossim [as 别名]
def tdidf_all_vec(self):
    """Return ``matutils.cossim`` of ``self.tfidf_vec1`` and ``self.tfidf_vec2``."""
    first_vec, second_vec = self.tfidf_vec1, self.tfidf_vec2
    return matutils.cossim(first_vec, second_vec)
示例7: tdidf_all_vec_pinyin
# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import cossim [as 别名]
def tdidf_all_vec_pinyin(self):
    """Return ``matutils.cossim`` of the two pinyin TF-IDF vectors on ``self``.

    The operands are ``self.tfidf_vec1_pinyin`` and ``self.tfidf_vec2_pinyin``.
    """
    first_vec, second_vec = self.tfidf_vec1_pinyin, self.tfidf_vec2_pinyin
    return matutils.cossim(first_vec, second_vec)
示例8: test_lee
# 需要导入模块: from gensim import matutils [as 别名]
# 或者: from gensim.matutils import cossim [as 别名]
def test_lee(self):
    """correlation with human data > 0.6
    (this is the value which was achieved in the original paper)

    Builds a log-entropy + LSI pipeline from the module-level ``bg_corpus``,
    folds the module-level ``corpus`` into it, computes pairwise cosine
    similarities and correlates them with ``human_sim_vector``.
    """
    # NOTE(review): this rebinds the module-level corpora to their
    # bag-of-words form, so a later reader of these globals sees the
    # converted data -- presumably they are reloaded before each test;
    # confirm against the fixture setup.
    global bg_corpus, corpus

    # create a dictionary and corpus (bag of words)
    dictionary = corpora.Dictionary(bg_corpus)
    bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
    corpus = [dictionary.doc2bow(text) for text in corpus]

    # transform the bag of words with log_entropy normalization
    log_ent = models.LogEntropyModel(bg_corpus)
    bg_corpus_ent = log_ent[bg_corpus]

    # initialize an LSI transformation from background corpus
    lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)

    # transform small corpus to lsi bow->log_ent->fold-in-lsi
    # Materialise once: the transformed corpus is evaluated on iteration, so
    # re-iterating it inside the pairwise loop would redo the transform for
    # every row.
    corpus_lsi = list(lsi[log_ent[corpus]])

    # compute pairwise similarity matrix and extract upper triangular
    n_docs = len(corpus)
    res = np.zeros((n_docs, n_docs))
    for i, par1 in enumerate(corpus_lsi):
        # Only entries with j > i are read below, so skip the rest.
        for j in range(i + 1, n_docs):
            res[i, j] = matutils.cossim(par1, corpus_lsi[j])
    flat = res[matutils.triu_indices(n_docs, 1)]

    cor = np.corrcoef(flat, human_sim_vector)[0, 1]
    logging.info("LSI correlation coefficient is %s", cor)
    self.assertTrue(cor > 0.6)
# def test_lee_mallet(self):
# global bg_corpus, corpus, bg_corpus2, corpus2
# # create a dictionary and corpus (bag of words)
# dictionary = corpora.Dictionary(bg_corpus2)
# bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus2]
# corpus = [dictionary.doc2bow(text) for text in corpus2]
# # initialize an LDA transformation from background corpus
# lda = models.LdaMallet('/Users/kofola/Downloads/mallet-2.0.7/bin/mallet',
# corpus=bg_corpus, id2word=dictionary, num_topics=200, optimize_interval=10)
# corpus_lda = lda[corpus]
# # compute pairwise similarity matrix and extract upper triangular
# res = np.zeros((len(corpus), len(corpus)))
# for i, par1 in enumerate(corpus_lda):
# for j, par2 in enumerate(corpus_lda):
# res[i, j] = matutils.cossim(par1, par2)
# flat = res[matutils.triu_indices(len(corpus), 1)]
# cor = np.corrcoef(flat, human_sim_vector)[0, 1]
# logging.info("LDA correlation coefficient is %s" % cor)
# self.assertTrue(cor > 0.35)