This article collects typical usage examples of the Python method gensim.models.LsiModel. If you have been struggling with questions like: What exactly does models.LsiModel do? How do I use models.LsiModel? What does models.LsiModel look like in practice? — then the curated code examples below may help. You can also explore further usage examples from the module that provides this method, gensim.models.
The following presents 15 code examples of models.LsiModel, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
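Before the individual examples, here is a minimal end-to-end sketch of the dictionary -> bag-of-words -> TF-IDF -> LSI pipeline that most of the examples below follow (the toy corpus is illustrative, not taken from any example):

from gensim import corpora, models

texts = [['human', 'computer', 'interaction'],
         ['graph', 'minors', 'survey']]               # toy tokenized corpus
dictionary = corpora.Dictionary(texts)                # word <-> integer id mapping
bow_corpus = [dictionary.doc2bow(t) for t in texts]   # sparse bag-of-words vectors
tfidf = models.TfidfModel(bow_corpus)                 # TF-IDF weighting
lsi = models.LsiModel(tfidf[bow_corpus], id2word=dictionary, num_topics=2)
print(lsi[tfidf[bow_corpus[0]]])                      # first document in latent space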
Example 1: build_lsi_model
# Required module: from gensim import models [as alias]
# Or: from gensim.models import LsiModel [as alias]
def build_lsi_model(corpus_name, corpus_path, topics=300):
    logging.info('building lsi model for %s corpus', corpus_name)
    dictFile = corpus_path + corpus_name + '.dict'
    corpus_tfidf_file = corpus_path + corpus_name + '.tfidf.mm'
    logging.info('loading dictionary ...')
    dictionary = corpora.Dictionary.load(dictFile)
    logging.info('loading tfidf corpus ...')
    corpus_tfidf = corpora.MmCorpus(corpus_tfidf_file)
    logging.info('building lsi model')
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topics)
    logging.info('saving lsi')
    lsiFile = corpus_path + corpus_name + '.lsi'
    lsi.save(lsiFile)
    logging.info('lsi model is ready')
##################################################################################
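A usage sketch for the function above; the corpus name and path are hypothetical, and the .dict and .tfidf.mm files are assumed to have been produced by earlier preprocessing steps:

import logging
from gensim import corpora, models

logging.basicConfig(level=logging.INFO)
build_lsi_model('enwiki', './data/', topics=300)
# The trained model lands in './data/enwiki.lsi' and can later be reloaded
# with models.LsiModel.load('./data/enwiki.lsi')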
Example 2: create_lsi_model
# Required module: from gensim import models [as alias]
# Or: from gensim.models import LsiModel [as alias]
def create_lsi_model(self, **kwargs):
    """ Create an LSI model from the entire words database table """
    corpus_tfidf = self.load_tfidf_corpus()
    if self._dictionary is None:
        self.load_dictionary()
    # Initialize an LSI transformation
    lsi = models.LsiModel(
        corpus_tfidf,
        id2word=self._dictionary,
        num_topics=self._dimensions,
        **kwargs
    )
    # if self._verbose:
    #     lsi.print_topics(num_topics=self._dimensions)
    # Save the generated model
    lsi.save(self._LSI_MODEL_FILE.format(self._dimensions))
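This method relies on attributes of a surrounding class that is not shown. A minimal skeleton consistent with the calls above and with load_lsi_model in Example 11 (an assumption, not the original source):

class LsiBuilder:
    # Hypothetical names; only the attributes referenced by create_lsi_model
    # and load_lsi_model are sketched here.
    _LSI_MODEL_FILE = './models/lsi-{0}-dims.model'

    def __init__(self, dimensions=200):
        self._dimensions = dimensions
        self._dictionary = None

    def load_dictionary(self):
        self._dictionary = corpora.Dictionary.load('./models/words.dict')

    def load_tfidf_corpus(self):
        return corpora.MmCorpus('./models/words.tfidf.mm')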
Example 3: align_sentences_lsi
# Required module: from gensim import models [as alias]
# Or: from gensim.models import LsiModel [as alias]
def align_sentences_lsi(source_sentences, target_sentences, model_path, model_name):
    logging.info('Sentence level alignment using LSI')
    dictionaryFile = model_path + model_name + '.dict'
    lsiFile = model_path + model_name + '.lsi'
    dictionary = corpora.Dictionary.load(dictionaryFile)
    logging.info('dictionary loaded')
    lsi = models.LsiModel.load(lsiFile)
    logging.info('lsi model loaded')
    source_lsi_sentences = generateLSIvectors(source_sentences, dictionary, lsi)
    logging.info('projected source sentences into LSI space')
    target_lsi_sentences = generateLSIvectors(target_sentences, dictionary, lsi)
    logging.info('projected target sentences into LSI space')
    source_index = 0
    new_source_doc = []
    new_target_doc = []
    for d in source_lsi_sentences:
        target_index, sim = getComparable(d, target_lsi_sentences)
        source_sent = source_sentences[source_index]
        target_sent = target_sentences[target_index]
        # remove the already aligned sentence from the target document
        del target_lsi_sentences[target_index]
        del target_sentences[target_index]
        new_source_doc.append(source_sent)
        new_target_doc.append(target_sent)
        if not target_lsi_sentences:
            break  # all target sentences are aligned
        source_index += 1
    return new_source_doc, new_target_doc
##################################################################################
# projecting a corpus into LSI space
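The helpers generateLSIvectors and getComparable are called in Examples 3 and 14 but not shown in this excerpt. A minimal sketch consistent with how they are used (an assumption, not the original implementation):

def generateLSIvectors(raw_texts, dictionary, lsi):
    # Project each raw text into LSI space: tokenize naively on whitespace,
    # map to bag-of-words, then fold into the trained LSI model.
    return [lsi[dictionary.doc2bow(text.split())] for text in raw_texts]

def getComparable(source_vec, target_vecs):
    # Return the index and cosine similarity of the most similar target vector.
    from gensim import matutils
    sims = [matutils.cossim(source_vec, vec) for vec in target_vecs]
    best_index = max(range(len(sims)), key=sims.__getitem__)
    return best_index, sims[best_index]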
Example 4: calc_similarity
# Required module: from gensim import models [as alias]
# Or: from gensim.models import LsiModel [as alias]
def calc_similarity(self, prefix: str, text: str):
    """Compute similarities and return the entries whose cosine value
    exceeds the threshold `self.keep_val`.

    Arguments:
        prefix {str} -- model file prefix
        text {str} -- text to compare against the corpus
    """
    dictionary = corpora.Dictionary.load('./models/{}_dict.dic'.format(prefix))  # load the dictionary
    corpus = corpora.MmCorpus('./models/{}_corpuse.mm'.format(prefix))  # load the corpus
    tfidf_model = models.TfidfModel.load("./models/{}_tfidf_model.model".format(prefix))  # load the TF-IDF model
    corpus_tfidf = tfidf_model[corpus]
    lsi = models.LsiModel(corpus_tfidf)  # num_topics defaults to 200
    corpus_lsi = lsi[corpus_tfidf]
    similarity_lsi = similarities.Similarity('./models/similarity-lsi-index',
                                             corpus_lsi,
                                             num_features=400,
                                             num_best=3)
    cut_raw = self.segment(text)          # 1. tokenize
    corpus = dictionary.doc2bow(cut_raw)  # 2. convert to a BoW vector
    corpus_tfidf = tfidf_model[corpus]    # 3. apply the TF-IDF weighting
    corpus_lsi = lsi[corpus_tfidf]        # 4. project into LSI space
    sims = similarity_lsi[corpus_lsi]
    with open('./data/idx_dic.dic', 'r') as f:
        dt = f.read()
    idx_dic = eval(dt)  # note: eval trusts the file contents; the original uses it as-is
    result = []
    if sims is not None:
        result = [idx_dic[idx] for idx, val in sims if val > self.keep_val]
    return result
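Example 4 assumes the dictionary, corpus, and TF-IDF model were serialized beforehand. A preparation sketch for those artifacts, where `texts` (a list of token lists) and `prefix` are hypothetical placeholders:

from gensim import corpora, models

dictionary = corpora.Dictionary(texts)
dictionary.save('./models/{}_dict.dic'.format(prefix))
bow = [dictionary.doc2bow(t) for t in texts]
corpora.MmCorpus.serialize('./models/{}_corpuse.mm'.format(prefix), bow)
models.TfidfModel(bow).save('./models/{}_tfidf_model.model'.format(prefix))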
Example 5: CalSim
# Required module: from gensim import models [as alias]
# Or: from gensim.models import LsiModel [as alias]
def CalSim(self, test_document, Type, best_num):
    '''Calculate similarities between a test document and all news (articles/documents).

    # Arguments:
        test_document: a raw document (string), segmented with jieba below.
        Type: model used for calculating similarities.
        best_num: see the 'num_best' parameter in the gensim module.
    '''
    if Type == 'Similarity-tfidf-index':
        tfidf = models.TfidfModel(self._BowVecOfEachDoc)
        tfidfVec = tfidf[self._BowVecOfEachDoc]
        self._num_features = len(self._dictionary.token2id.keys())
        self._similarity = similarities.Similarity(Type, tfidfVec,
                                                   num_features=self._num_features, num_best=best_num)
        test_cut_raw = list(jieba.cut(test_document))
        test_BowVecOfEachDoc = self._dictionary.doc2bow(test_cut_raw)
        self._test_BowVecOfEachDoc = tfidf[test_BowVecOfEachDoc]
    elif Type == 'Similarity-LSI-index':
        lsi_model = models.LsiModel(self._BowVecOfEachDoc)
        corpus_lsi = lsi_model[self._BowVecOfEachDoc]
        self._num_features = len(self._dictionary.token2id.keys())
        self._similarity = similarities.Similarity(Type, corpus_lsi,
                                                   num_features=self._num_features, num_best=best_num)
        test_cut_raw = list(jieba.cut(test_document))
        test_BowVecOfEachDoc = self._dictionary.doc2bow(test_cut_raw)
        self._test_BowVecOfEachDoc = lsi_model[test_BowVecOfEachDoc]
    self.Print_CalSim()
    IdLst = []
    SimRltLst = []
    SimTxLst = []
    for Id, Sim in self._similarity[self._test_BowVecOfEachDoc]:
        IdLst.append(Id)
        SimRltLst.append(Sim)
        SimTxLst.append(self._raw_documents[Id])
    return IdLst, SimTxLst, SimRltLst
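A usage sketch; `news_sim` stands in for an instance of the (unshown) surrounding class:

ids, texts, scores = news_sim.CalSim('some query text',
                                     Type='Similarity-LSI-index', best_num=3)
for doc_id, score in zip(ids, scores):
    print(doc_id, score)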
Example 6: __init__
# Required module: from gensim import models [as alias]
# Or: from gensim.models import LsiModel [as alias]
def __init__(
    self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8
):
    super().__init__(
        cleanup_urls=cleanup_urls,
        nltk_tokenizer=nltk_tokenizer,
        confidence_threshold=confidence_threshold,
    )
    self.corpus = []
    for bug in bugzilla.get_bugs():
        textual_features = self.text_preprocess(self.get_text(bug))
        self.corpus.append([bug["id"], textual_features])
    # Assign unique integer ids to all words
    self.dictionary = Dictionary(text for bug_id, text in self.corpus)
    # Convert to bag-of-words
    corpus_final = [self.dictionary.doc2bow(text) for bug_id, text in self.corpus]
    # Initialize and apply the TF-IDF transformation model on the same corpus;
    # the resulting corpus has the same dimensions
    tfidf = models.TfidfModel(corpus_final)
    corpus_tfidf = tfidf[corpus_final]
    # Transform the TF-IDF corpus to a latent 300-D space via Latent Semantic Indexing
    self.lsi = models.LsiModel(
        corpus_tfidf, id2word=self.dictionary, num_topics=300
    )
    corpus_lsi = self.lsi[corpus_tfidf]
    # Index the corpus
    self.index = similarities.Similarity(
        output_prefix="simdata.shdat", corpus=corpus_lsi, num_features=300
    )
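A sketch of how a new bug report could be queried against the index built in __init__; the method name get_similar_bugs is hypothetical:

def get_similar_bugs(self, text):
    # Preprocess, map to bag-of-words, fold into LSI space, then query the index.
    bow = self.dictionary.doc2bow(self.text_preprocess(text))
    lsi_vector = self.lsi[bow]
    return self.index[lsi_vector]  # cosine similarities against all indexed bugs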
Example 7: LsiModel
# Required module: from gensim import models [as alias]
# Or: from gensim.models import LsiModel [as alias]
def LsiModel(self):
    self.simple_model()
    # Transformation model
    self.model = models.LsiModel(self.corpus_simple)
    self.corpus = self.model[self.corpus_simple]
    # Build the similarity matrix
    self.index = similarities.MatrixSimilarity(self.corpus)

# LDA model
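The simple_model() helper invoked at the top of Example 7 is not shown. A minimal sketch of what it presumably prepares, given how its attributes are used (names other than corpus_simple are assumptions):

def simple_model(self):
    # Build the dictionary and bag-of-words corpus that LsiModel() consumes;
    # self.texts is assumed to hold tokenized documents.
    self.dictionary = corpora.Dictionary(self.texts)
    self.corpus_simple = [self.dictionary.doc2bow(t) for t in self.texts]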
Example 8: train_lsi_model_gensim
# Required module: from gensim import models [as alias]
# Or: from gensim.models import LsiModel [as alias]
def train_lsi_model_gensim(corpus, total_topics=2):
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text)
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lsi = models.LsiModel(corpus_tfidf,
                          id2word=dictionary,
                          num_topics=total_topics)
    return lsi
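A usage sketch; toy_docs is illustrative, and normalize_corpus is assumed to be importable from the author's utility module:

toy_docs = ['the sky is blue', 'the sun is bright',
            'the sun in the sky is bright']
lsi = train_lsi_model_gensim(toy_docs, total_topics=2)
for topic_id, topic in lsi.print_topics(num_topics=2):
    print(topic_id, topic)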
Example 9: create_lsi_model
# Required module: from gensim import models [as alias]
# Or: from gensim.models import LsiModel [as alias]
def create_lsi_model(num_topics, dictionary, corpus):
    print("create lsi model ...")
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    # lsi_model = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)
    corpus_lsi = lsi_model[corpus_tfidf]
    # corpus_lsi = lsi_model[corpus]
    corpus_simi_matrix = similarities.MatrixSimilarity(corpus_lsi)
    # corpus_simi_matrix = similarities.MatrixSimilarity(corpus_tfidf)
    return (tfidf_model, lsi_model, corpus_simi_matrix)
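A usage sketch for querying the returned similarity matrix with a new document; `texts` is a hypothetical list of token lists:

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
tfidf_model, lsi_model, simi_matrix = create_lsi_model(10, dictionary, corpus)
query_bow = dictionary.doc2bow(['some', 'new', 'tokens'])
query_lsi = lsi_model[tfidf_model[query_bow]]
print(list(simi_matrix[query_lsi]))  # cosine similarity to every indexed document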
Example 10: create_lsi_model
# Required module: from gensim import models [as alias]
# Or: from gensim.models import LsiModel [as alias]
def create_lsi_model(num_topics, dictionary, corpus):
    print("create lsi model ...")
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    corpus_lsi = lsi_model[corpus_tfidf]
    corpus_simi_matrix = similarities.MatrixSimilarity(corpus_lsi)
    return (tfidf_model, lsi_model, corpus_simi_matrix)
Example 11: load_lsi_model
# Required module: from gensim import models [as alias]
# Or: from gensim.models import LsiModel [as alias]
def load_lsi_model(self):
    """ Load a previously generated LSI model """
    self._model = models.LsiModel.load(
        self._LSI_MODEL_FILE.format(self._dimensions), mmap="r"
    )
    self._model_name = "lsi"
Example 12: get_lsi
# Required module: from gensim import models [as alias]
# Or: from gensim.models import LsiModel [as alias]
def get_lsi(self, num_topics=300):
    docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
    model_lsi = models.LsiModel(docs_corpus, num_topics, id2word=self.docs_dict)
    docs_lsi = model_lsi[docs_corpus]
    # sparse2full pads each LSI vector to the vocabulary size, so only the first
    # num_topics dimensions carry values (requires numpy as np and
    # gensim.matutils.sparse2full)
    docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lsi])
    return docs_vecs

# Get a Random Projections (RP) vector for a document list
Example 13: topic_analysis
# Required module: from gensim import models [as alias]
# Or: from gensim.models import LsiModel [as alias]
def topic_analysis(corpus, dictionary, models_path, technique):
    # Assumed imports: HdpModel, LdaMulticore, LsiModel and LdaModel from
    # gensim.models, time from the time module, and sys.
    import uuid
    run_id = str(uuid.uuid4())  # unique id per run; avoids shadowing the uuid module
    print("[BLOCK] Starting models for context")
    sys.stdout.flush()
    if technique == "all" or technique == "hdp":
        t1 = time()
        # HDP model
        model = HdpModel(corpus, id2word=dictionary)
        model.save("%s/hdp_%s" % (models_path, run_id))
        del model
        t2 = time()
        print("[BLOCK] Training time for HDP model: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()
    if technique == "all" or technique == "ldap":
        t1 = time()
        # Parallel LDA model
        model = LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=23, passes=20)
        model.save("%s/lda_parallel_%s" % (models_path, run_id))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA multicore: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()
    if technique == "all" or technique == "lsa":
        t1 = time()
        # LSA model
        model = LsiModel(corpus, id2word=dictionary, num_topics=400)
        model.save("%s/lsa_%s" % (models_path, run_id))
        del model
        t2 = time()
        print("[BLOCK] Training time for LSA: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()
    if technique == "all" or technique == "ldao":
        t1 = time()
        # Online LDA model
        model = LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=1, chunksize=10000, passes=5)
        model.save("%s/lda_online_%s" % (models_path, run_id))
        t2 = time()
        print("[BLOCK] Training time for LDA online: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()
    if technique == "all" or technique == "lda":
        t1 = time()
        # Offline (batch) LDA model
        model = LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=0, passes=20)
        model.save("%s/lda_offline_%s" % (models_path, run_id))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA offline: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()
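A usage sketch (paths and inputs hypothetical); corpus is a bag-of-words corpus and dictionary its corpora.Dictionary:

topic_analysis(corpus, dictionary, models_path='./models', technique='lsa')
# With technique='all', the HDP, parallel LDA, LSA, online LDA and offline LDA
# models are trained and saved in sequence.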
Example 14: align_documents_lsi
# Required module: from gensim import models [as alias]
# Or: from gensim.models import LsiModel [as alias]
def align_documents_lsi(source_test_corpus, target_test_corpus, model_path, model_name,
                        output_path, top_n=20, doc_separator=x_seperator):
    # x_seperator is a module-level constant defined elsewhere in the source
    logging.info('aligning source and target documents using LSI model')
    dictionaryFile = model_path + model_name + '.dict'
    lsiFile = model_path + model_name + '.lsi'
    dictionary = corpora.Dictionary.load(dictionaryFile)
    logging.info('dictionary loaded')
    lsi = models.LsiModel.load(lsiFile)
    logging.info('lsi model loaded')
    logging.info('# of source docs %d \t# of target docs %d', len(source_test_corpus), len(target_test_corpus))
    source_lsi_corpus = generateLSIvectors(source_test_corpus, dictionary, lsi)
    logging.info('projected source corpus into LSI space')
    target_lsi_corpus = generateLSIvectors(target_test_corpus, dictionary, lsi)
    logging.info('projected target corpus into LSI space')
    allSims = []
    doc_tuple = []
    source_index = 0
    for d in source_lsi_corpus:
        target_index, sim = getComparable(d, target_lsi_corpus)
        allSims.append(sim)
        source_doc = source_test_corpus[source_index]
        target_doc = target_test_corpus[target_index]
        # remove the already aligned document from the target corpus
        del target_lsi_corpus[target_index]
        del target_test_corpus[target_index]
        doc_tuple.append((source_index, target_index, source_doc, target_doc))
        if not target_lsi_corpus:
            break  # all target docs are aligned
        source_index += 1
    sortedAllSims = sorted(enumerate(allSims), key=lambda item: -item[1])
    topNList = sortedAllSims[:top_n]
    out = open(output_path + 'results.txt', 'w')
    count = 0
    print('\n#, src, target, sim')
    for e in topNList:
        i, sim = e
        srcIndx, targetIndx, sdoc, tdoc = doc_tuple[i]
        print(count, srcIndx, targetIndx, '%0.2f' % sim)
        print(count, srcIndx, targetIndx, '%0.2f' % sim, file=out)
        source_out = open(output_path + str(count) + '.source.txt', 'w', encoding='utf-8')
        target_out = open(output_path + str(count) + '.target.txt', 'w', encoding='utf-8')
        print(sdoc, file=source_out)
        print(tdoc, file=target_out)
        source_out.close()
        target_out.close()
        count += 1
    out.close()
    logging.info('aligning source and target documents using LSI model is done!')
Example 15: test_lee
# Required module: from gensim import models [as alias]
# Or: from gensim.models import LsiModel [as alias]
def test_lee(self):
    """correlation with human data > 0.6
    (this is the value which was achieved in the original paper)
    """
    global bg_corpus, corpus
    # create a dictionary and corpus (bag of words)
    dictionary = corpora.Dictionary(bg_corpus)
    bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
    corpus = [dictionary.doc2bow(text) for text in corpus]
    # transform the bag of words with log_entropy normalization
    log_ent = models.LogEntropyModel(bg_corpus)
    bg_corpus_ent = log_ent[bg_corpus]
    # initialize an LSI transformation from the background corpus
    lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
    # transform the small corpus: bow -> log_ent -> fold-in-lsi
    corpus_lsi = lsi[log_ent[corpus]]
    # compute the pairwise similarity matrix and extract its upper triangle
    res = np.zeros((len(corpus), len(corpus)))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            res[i, j] = matutils.cossim(par1, par2)
    flat = res[np.triu_indices(len(corpus), 1)]  # upper triangle, excluding the diagonal
    cor = np.corrcoef(flat, human_sim_vector)[0, 1]
    logging.info("LSI correlation coefficient is %s", cor)
    self.assertTrue(cor > 0.6)
# def test_lee_mallet(self):
#     global bg_corpus, corpus, bg_corpus2, corpus2
#     # create a dictionary and corpus (bag of words)
#     dictionary = corpora.Dictionary(bg_corpus2)
#     bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus2]
#     corpus = [dictionary.doc2bow(text) for text in corpus2]
#     # initialize an LDA transformation from the background corpus
#     lda = models.LdaMallet('/Users/kofola/Downloads/mallet-2.0.7/bin/mallet',
#                            corpus=bg_corpus, id2word=dictionary, num_topics=200, optimize_interval=10)
#     corpus_lda = lda[corpus]
#     # compute the pairwise similarity matrix and extract its upper triangle
#     res = np.zeros((len(corpus), len(corpus)))
#     for i, par1 in enumerate(corpus_lda):
#         for j, par2 in enumerate(corpus_lda):
#             res[i, j] = matutils.cossim(par1, par2)
#     flat = res[matutils.triu_indices(len(corpus), 1)]
#     cor = np.corrcoef(flat, human_sim_vector)[0, 1]
#     logging.info("LDA correlation coefficient is %s" % cor)
#     self.assertTrue(cor > 0.35)