当前位置: 首页>>代码示例>>Python>>正文


Python models.LsiModel方法代码示例

本文整理汇总了Python中gensim.models.LsiModel方法的典型用法代码示例。如果您正苦于以下问题:Python models.LsiModel方法的具体用法?Python models.LsiModel怎么用?Python models.LsiModel使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在gensim.models的用法示例。


在下文中一共展示了models.LsiModel方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: build_lsi_model

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LsiModel [as 别名]
def build_lsi_model(corpus_name, corpus_path, topics=300):
	"""Train and persist an LSI model for a named corpus.

	Expects ``<corpus_path><corpus_name>.dict`` (gensim Dictionary) and
	``<corpus_path><corpus_name>.tfidf.mm`` (MmCorpus) to exist; writes the
	trained model to ``<corpus_path><corpus_name>.lsi``.
	"""
	logging.info('building lsi model for %s corpus', corpus_name)
	base = corpus_path + corpus_name

	logging.info('loading dictionary ...')
	vocab = corpora.Dictionary.load(base + '.dict')
	logging.info('loading tfidf corpus ...')
	tfidf_corpus = corpora.MmCorpus(base + '.tfidf.mm')

	logging.info('building lsi model')
	model = models.LsiModel(tfidf_corpus, id2word=vocab, num_topics=topics)

	logging.info('saving lsi')
	model.save(base + '.lsi')
	logging.info('lsi model is ready')
################################################################################## 
开发者ID:motazsaad,项目名称:comparable-text-miner,代码行数:18,代码来源:textpro.py

示例2: create_lsi_model

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LsiModel [as 别名]
def create_lsi_model(self, **kwargs):
        """Build an LSI model over the entire words database table and save it.

        Extra keyword arguments are forwarded to ``models.LsiModel``.
        """
        tfidf_corpus = self.load_tfidf_corpus()
        if self._dictionary is None:
            self.load_dictionary()
        # Fit the LSI transformation on the tf-idf weighted corpus.
        model = models.LsiModel(
            tfidf_corpus,
            id2word=self._dictionary,
            num_topics=self._dimensions,
            **kwargs
        )
        # Persist the trained model, keyed by its dimensionality.
        model.save(self._LSI_MODEL_FILE.format(self._dimensions))
开发者ID:mideind,项目名称:Greynir,代码行数:18,代码来源:builder.py

示例3: align_sentences_lsi

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LsiModel [as 别名]
def align_sentences_lsi(source_sentences, target_sentences, model_path, model_name):
	"""Greedily align source sentences to target sentences in LSI space.

	Loads a dictionary and LSI model named ``model_name`` from ``model_path``,
	projects both sentence lists into LSI space, then pairs each source
	sentence (in order) with the most similar remaining target sentence.

	NOTE(review): ``target_sentences`` is mutated in place (aligned sentences
	are deleted from it); pass a copy if the caller needs the original list.

	Returns:
		(new_source_doc, new_target_doc): parallel lists of aligned sentences.
	"""
	logging.info( 'Sentence level alignment using LSI' )
	
	dictionaryFile = model_path +  model_name + '.dict'
	lsiFile = model_path +  model_name + '.lsi'
	
	dictionary = corpora.Dictionary.load(dictionaryFile) ; logging.info( 'dictionary loaded' )
	lsi = models.LsiModel.load(lsiFile) ; logging.info( 'lsi model loaded' )
	
	source_lsi_sentences = generateLSIvectors(source_sentences, dictionary, lsi); 
	logging.info( 'projects source sentences into LSI space')
	target_lsi_sentences = generateLSIvectors(target_sentences, dictionary, lsi); 
	logging.info( 'projects target sentences into LSI space' )
	
	source_index = 0 	
	new_source_doc = [] ; new_target_doc = []
	
	# Greedy one-to-one matching: each source sentence takes the best
	# remaining target sentence; matched targets are removed from the pool.
	for d in source_lsi_sentences:
		target_index, sim = getComparable(d, target_lsi_sentences)
		source_sent = source_sentences[source_index] ; target_sent = target_sentences[target_index]
		del target_lsi_sentences[target_index] ; 
		del target_sentences[target_index] # remove the already aligned sentences from the target document
		new_source_doc.append(source_sent) 
		new_target_doc.append(target_sent)
		if not target_lsi_sentences: break # all target sentences are aligned
		source_index+=1
		
	return new_source_doc, new_target_doc
##################################################################################
# projecting a corpus into LSI space 
开发者ID:motazsaad,项目名称:comparable-text-miner,代码行数:32,代码来源:textpro.py

示例4: calc_similarity

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LsiModel [as 别名]
def calc_similarity(self, prefix: str, text: str):
        """Compute LSI cosine similarities between *text* and the stored corpus.

        Loads the dictionary, corpus and tf-idf model saved under *prefix*,
        builds an LSI similarity index over the corpus, and returns the values
        from ``./data/idx_dic.dic`` whose similarity to *text* exceeds
        ``self.keep_val``.

        Arguments:
            prefix {str} -- model file name prefix under ./models/
            text {str} -- raw text to compare against the corpus

        Returns:
            list -- matching entries from the index dictionary (may be empty)
        """
        import ast

        dictionary = corpora.Dictionary.load('./models/{}_dict.dic'.format(prefix))  # load dictionary
        corpus = corpora.MmCorpus('./models/{}_corpuse.mm'.format(prefix))  # load corpus
        tfidf_model = models.TfidfModel.load("./models/{}_tfidf_model.model".format(prefix))  # load tf-idf model
        corpus_tfidf = tfidf_model[corpus]

        # NOTE(review): no num_topics is passed, so gensim's default topic
        # count is used while the index assumes 400 features — confirm intended.
        lsi = models.LsiModel(corpus_tfidf)
        corpus_lsi = lsi[corpus_tfidf]
        similarity_lsi = similarities.Similarity('./models/similarity-lsi-index',
                                                 corpus_lsi,
                                                 num_features=400,
                                                 num_best=3)
        cut_raw = self.segment(text)             # 1. tokenize
        query_bow = dictionary.doc2bow(cut_raw)  # 2. bag-of-words vector
        query_tfidf = tfidf_model[query_bow]     # 3. tf-idf weighting
        query_lsi = lsi[query_tfidf]             # 4. LSI projection
        sims = similarity_lsi[query_lsi]

        # The index file stores the repr of a dict; parse it with
        # ast.literal_eval instead of eval(), which would execute arbitrary
        # code embedded in the file.
        with open('./data/idx_dic.dic', 'r') as f:
            idx_dic = ast.literal_eval(f.read())

        if sims is not None:
            return [idx_dic[idx] for idx, val in sims if val > self.keep_val]
        return []
开发者ID:jarvisqi,项目名称:nlp_learning,代码行数:37,代码来源:docsim.py

示例5: CalSim

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LsiModel [as 别名]
def CalSim(self,test_document,Type,best_num):
        '''Calculate similarities between test document with all news(articles/documents).

        # Arguments:
            test_document: Raw text of the query document (tokenized with
                jieba below).
            Type: Models of calculating similarities; one of
                'Similarity-tfidf-index' or 'Similarity-LSI-index'. Also used
                as the on-disk prefix for the gensim Similarity index.
            best_num: refer to 'num_best' parameter in Gensim module.

        # Returns:
            (IdLst, SimTxLst, SimRltLst): ids, raw texts and similarity
            scores of the best-matching documents.
        '''
        if Type == 'Similarity-tfidf-index':
            # tf-idf pipeline: weight the stored BoW vectors and index them.
            tfidf = models.TfidfModel(self._BowVecOfEachDoc)  
            tfidfVec = tfidf[self._BowVecOfEachDoc]
            self._num_features = len(self._dictionary.token2id.keys())
            self._similarity = similarities.Similarity(Type, tfidfVec, \
                num_features=self._num_features,num_best=best_num)  
            test_cut_raw = list(jieba.cut(test_document))  
            test_BowVecOfEachDoc = self._dictionary.doc2bow(test_cut_raw) 
            self._test_BowVecOfEachDoc = tfidf[test_BowVecOfEachDoc]
        elif Type == 'Similarity-LSI-index':
            # LSI pipeline: note the model is fit on raw BoW vectors (not
            # tf-idf) and with gensim's default number of topics.
            lsi_model = models.LsiModel(self._BowVecOfEachDoc)  
            corpus_lsi = lsi_model[self._BowVecOfEachDoc]
            self._num_features = len(self._dictionary.token2id.keys())
            self._similarity = similarities.Similarity(Type, corpus_lsi, \
                num_features=self._num_features,num_best=best_num)  
            test_cut_raw = list(jieba.cut(test_document))  
            test_BowVecOfEachDoc = self._dictionary.doc2bow(test_cut_raw) 
            self._test_BowVecOfEachDoc = lsi_model[test_BowVecOfEachDoc]
        self.Print_CalSim()
        # Collect (id, score) pairs from the index plus the matching raw text.
        IdLst = []
        SimRltLst = []
        SimTxLst = []
        for Id, Sim in self._similarity[self._test_BowVecOfEachDoc]:
            IdLst.append(Id)
            SimRltLst.append(Sim)
            SimTxLst.append(self._raw_documents[Id])
        return IdLst,SimTxLst,SimRltLst
开发者ID:DemonDamon,项目名称:Listed-company-news-crawl-and-text-analysis,代码行数:37,代码来源:text_processing.py

示例6: __init__

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LsiModel [as 别名]
def __init__(
        self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8
    ):
        """Build an LSI similarity index over the text of all bugs.

        Fetches every bug via ``bugzilla.get_bugs()``, preprocesses its text,
        trains tf-idf + LSI (300 topics) models, and creates a gensim
        Similarity index stored under the ``simdata.shdat`` prefix.
        The keyword arguments are forwarded unchanged to the parent class.
        """
        super().__init__(
            cleanup_urls=cleanup_urls,
            nltk_tokenizer=nltk_tokenizer,
            confidence_threshold=confidence_threshold,
        )
        # Each entry is a [bug_id, list-of-preprocessed-tokens] pair.
        self.corpus = []

        for bug in bugzilla.get_bugs():

            textual_features = self.text_preprocess(self.get_text(bug))
            self.corpus.append([bug["id"], textual_features])

        # Assigning unique integer ids to all words
        self.dictionary = Dictionary(text for bug_id, text in self.corpus)

        # Conversion to BoW
        corpus_final = [self.dictionary.doc2bow(text) for bug_id, text in self.corpus]

        # Initializing and applying the tfidf transformation model on the same
        # corpus; the resulting corpus has the same dimensions.
        tfidf = models.TfidfModel(corpus_final)
        corpus_tfidf = tfidf[corpus_final]

        # Transform TF-IDF corpus to latent 300-D space via Latent Semantic Indexing
        self.lsi = models.LsiModel(
            corpus_tfidf, id2word=self.dictionary, num_topics=300
        )
        corpus_lsi = self.lsi[corpus_tfidf]

        # Indexing the corpus
        self.index = similarities.Similarity(
            output_prefix="simdata.shdat", corpus=corpus_lsi, num_features=300
        )
开发者ID:mozilla,项目名称:bugbug,代码行数:37,代码来源:similarity.py

示例7: LsiModel

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LsiModel [as 别名]
def LsiModel(self):
        """Build an LSI transformation over the simple corpus and index it."""
        self.simple_model()

        # Transformation model: bag-of-words -> latent semantic space.
        lsi = models.LsiModel(self.corpus_simple)
        projected = lsi[self.corpus_simple]
        self.model = lsi
        self.corpus = projected

        # Dense similarity matrix over the LSI-projected corpus.
        self.index = similarities.MatrixSimilarity(projected)

    # lda模型 
开发者ID:WenRichard,项目名称:Customer-Chatbot,代码行数:13,代码来源:sentenceSimilarity.py

示例8: train_lsi_model_gensim

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LsiModel [as 别名]
def train_lsi_model_gensim(corpus, total_topics=2):
    """Normalize *corpus*, weight it with tf-idf, and fit an LSI model.

    Returns the trained gensim LsiModel with *total_topics* topics.
    """
    tokenized_docs = normalize_corpus(corpus, tokenize=True)
    vocab = corpora.Dictionary(tokenized_docs)
    bow_corpus = [vocab.doc2bow(doc) for doc in tokenized_docs]

    tfidf = models.TfidfModel(bow_corpus)
    weighted_corpus = tfidf[bow_corpus]

    return models.LsiModel(weighted_corpus,
                           id2word=vocab,
                           num_topics=total_topics)
开发者ID:dipanjanS,项目名称:text-analytics-with-python,代码行数:14,代码来源:topic_modeling.py

示例9: create_lsi_model

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LsiModel [as 别名]
def create_lsi_model(num_topics,dictionary,corpus):
    """Fit tf-idf + LSI models on *corpus* and build a similarity matrix.

    Returns a (tfidf_model, lsi_model, corpus_simi_matrix) tuple.
    """
    print ("create lsi model ...")
    tfidf = models.TfidfModel(corpus)
    weighted = tfidf[corpus]
    lsi = models.LsiModel(weighted, id2word=dictionary, num_topics=num_topics)
    # Index the LSI-projected corpus for cosine-similarity queries.
    matrix = similarities.MatrixSimilarity(lsi[weighted])
    return (tfidf, lsi, matrix)
开发者ID:geekinglcq,项目名称:aca,代码行数:14,代码来源:lsi_model.py

示例10: create_lsi_model

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LsiModel [as 别名]
def create_lsi_model(num_topics,dictionary,corpus):
    """Train tf-idf and LSI models over *corpus* and index the result.

    Returns a (tfidf_model, lsi_model, corpus_simi_matrix) tuple.
    """
    print ("create lsi model ...")

    # tf-idf weighting, then latent-semantic projection of the corpus.
    tfidf_model = models.TfidfModel(corpus)
    lsi_model = models.LsiModel(tfidf_model[corpus],
                                id2word=dictionary,
                                num_topics=num_topics)
    corpus_simi_matrix = similarities.MatrixSimilarity(
        lsi_model[tfidf_model[corpus]])
    return (tfidf_model, lsi_model, corpus_simi_matrix)
开发者ID:geekinglcq,项目名称:aca,代码行数:11,代码来源:lsi_author.py

示例11: load_lsi_model

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LsiModel [as 别名]
def load_lsi_model(self):
        """ Load a previously generated LSI model """
        # mmap="r" memory-maps the model arrays read-only, so large models
        # load lazily and can be shared between processes without copying.
        self._model = models.LsiModel.load(
            self._LSI_MODEL_FILE.format(self._dimensions), mmap="r"
        )
        self._model_name = "lsi"
开发者ID:mideind,项目名称:Greynir,代码行数:8,代码来源:builder.py

示例12: get_lsi

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LsiModel [as 别名]
def get_lsi(self, num_topics=300):
        """Return a dense matrix of LSI document vectors for ``self.docs``.

        Each LSI vector is padded with sparse2full to the dictionary size,
        and the rows are stacked into one numpy array.
        """
        bow_docs = [self.docs_dict.doc2bow(doc) for doc in self.docs]
        lsi = models.LsiModel(bow_docs, num_topics, id2word=self.docs_dict)
        width = len(self.docs_dict)
        rows = [sparse2full(vec, width) for vec in lsi[bow_docs]]
        return np.vstack(rows)

    # Get Random Projections(RP) vector for document list 
开发者ID:crownpku,项目名称:text2vec,代码行数:10,代码来源:text2vec.py

示例13: topic_analysis

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LsiModel [as 别名]
def topic_analysis(corpus, dictionary, models_path, technique):
    """Train one or several topic models on *corpus* and save them.

    Depending on *technique* ("hdp", "ldap", "lsa", "ldao", "lda" or "all"),
    trains an HDP model, a parallel LDA, an LSA (LSI) model, an online LDA
    and/or an offline LDA, saving each under *models_path* with a shared
    random UUID suffix and printing its training time.
    """

    import uuid
    # NOTE(review): this rebinding shadows the just-imported `uuid` module,
    # making the module unusable for the rest of the function.
    uuid = str(uuid.uuid4())
    print("[BLOCK] Starting models for context")
    sys.stdout.flush()

    if technique == "all" or technique == "hdp":
        t1 = time()
        # HDP model
        model = HdpModel(corpus, id2word=dictionary)
        model.save("%s/hdp_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for HDP model: %s" % (round(t2-t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "ldap":
        t1 = time()
        # Parallel LDA model
        model = LdaMulticore(corpus, id2word=dictionary, num_topics=100,  workers=23, passes=20)
        model.save("%s/lda_parallel_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA multicore: %s" % (round(t2-t1, 2)))
    # NOTE(review): this flush sits outside the branch above (unlike the
    # other branches) — looks like an indentation slip, behavior unchanged.
    sys.stdout.flush()

    if technique == "all" or technique == "lsa":
        t1 = time()
        # LSA model
        model = LsiModel(corpus, id2word=dictionary, num_topics=400)
        model.save("%s/lsa_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LSA: %s" % (round(t2-t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "ldao":
        t1 = time()
        # Online LDA model (note: unlike the other branches, `model` is not
        # deleted here, so it stays alive until the next branch rebinds it)
        model = LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=1, chunksize=10000, passes=5)
        model.save("%s/lda_online_%s" % (models_path, uuid))
        t2 = time()
        print("[BLOCK] Training time for LDA online: %s" % (round(t2-t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "lda":
        t1 = time()
        # Offline LDA model
        model = LdaModel(corpus, id2word=dictionary, num_topics=100,  update_every=0, passes=20)
        model.save("%s/lda_offline_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA offline: %s" % (round(t2-t1, 2)))
        sys.stdout.flush()
开发者ID:kafkasl,项目名称:contextualLSTM,代码行数:57,代码来源:topics_analysis.py

示例14: align_documents_lsi

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LsiModel [as 别名]
def align_documents_lsi(source_test_corpus, target_test_corpus, model_path, model_name, output_path, top_n=20, doc_separator=x_seperator):
	"""Greedily align source documents to target documents in LSI space.

	Loads the dictionary/LSI model named ``model_name`` from ``model_path``,
	projects both corpora into LSI space, pairs each source document with its
	most similar remaining target document, then writes the ``top_n`` most
	similar pairs to ``output_path`` (``results.txt`` plus per-pair
	``<i>.source.txt`` / ``<i>.target.txt`` files).

	NOTE(review): Python 2 code (print statements, ``print>>``).
	``target_test_corpus`` is mutated in place as documents are aligned.
	"""
	logging.info( 'aligning source and target documents using LSI model' )
	
	dictionaryFile = model_path +  model_name + '.dict'
	lsiFile = model_path +  model_name + '.lsi'
	
	dictionary = corpora.Dictionary.load(dictionaryFile) ; logging.info(  'dictionary loaded' )
	lsi = models.LsiModel.load(lsiFile) ; logging.info(  'lsi model loaded' )
	
	logging.info( '# of source docs %d \t# of target docs %d', len(source_test_corpus),  len(target_test_corpus) )
	
	source_lsi_corpus = generateLSIvectors(source_test_corpus, dictionary, lsi)
	logging.info( 'projects source corpus into LSI space' )
	target_lsi_corpus = generateLSIvectors(target_test_corpus, dictionary, lsi)
	logging.info( 'projects target corpus into LSI space' )
	
	allSims = [] ; doc_tuple = [] ; source_index = 0 
	
	# Greedy one-to-one matching: each source doc takes the best remaining
	# target doc; matched targets are removed from the candidate pool.
	for d in source_lsi_corpus:
		target_index, sim = getComparable(d, target_lsi_corpus)
		allSims.append(sim)
		source_doc = source_test_corpus[source_index] ; target_doc = target_test_corpus[target_index]
		del target_lsi_corpus[target_index] ; 
		del target_test_corpus[target_index] # remove the already aligned document from the target corpus
		doc_tuple.append((source_index,target_index, source_doc, target_doc))
		if not target_lsi_corpus: break # all target docs are aligned
		source_index+=1
		
	# Rank all aligned pairs by similarity (descending) and keep the top n.
	sortedAllSims = sorted(enumerate(allSims), key=lambda item: -item[1])
	topNList = sortedAllSims[:top_n]
	out = open (output_path + 'results.txt', 'w')
	count = 0
	print '\n#, src, target, sim'
	for e in topNList:
		i, sim = e
		srcIndx = doc_tuple[i][0] ; targetIndx = doc_tuple[i][1] ; sdoc = doc_tuple[i][2] ; tdoc = doc_tuple[i][3]
		print count, srcIndx, targetIndx, '%0.2f' % sim
		print>>out, count, srcIndx, targetIndx, '%0.2f' % sim
		source_out = open(output_path + str(count) + '.source.txt', 'w')
		target_out = open(output_path + str(count) + '.target.txt' , 'w')
		print>>source_out, sdoc.encode('utf-8')
		print>>target_out, tdoc.encode('utf-8')
		source_out.close(); target_out.close(); count+=1	
	out.close();
	logging.info( 'aligning source and target documents using LSI model is done!' )
################################################################################## 
开发者ID:motazsaad,项目名称:comparable-text-miner,代码行数:48,代码来源:textpro.py

示例15: test_lee

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LsiModel [as 别名]
def test_lee(self):
        """Correlation with human similarity judgements must exceed 0.6.

        (0.6 is the value which was achieved in the original paper.)
        """

        # Module-level fixtures: bg_corpus is the larger background corpus
        # used for training, corpus is the small evaluation corpus.
        global bg_corpus, corpus

        # create a dictionary and corpus (bag of words)
        dictionary = corpora.Dictionary(bg_corpus)
        bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
        corpus = [dictionary.doc2bow(text) for text in corpus]

        # transform the bag of words with log_entropy normalization
        log_ent = models.LogEntropyModel(bg_corpus)
        bg_corpus_ent = log_ent[bg_corpus]

        # initialize an LSI transformation from background corpus
        lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
        # transform small corpus to lsi bow->log_ent->fold-in-lsi
        corpus_lsi = lsi[log_ent[corpus]]

        # compute pairwise similarity matrix and extract upper triangular
        res = np.zeros((len(corpus), len(corpus)))
        for i, par1 in enumerate(corpus_lsi):
            for j, par2 in enumerate(corpus_lsi):
                res[i, j] = matutils.cossim(par1, par2)
        flat = res[matutils.triu_indices(len(corpus), 1)]

        # Pearson correlation against the human similarity vector.
        cor = np.corrcoef(flat, human_sim_vector)[0, 1]
        logging.info("LSI correlation coefficient is %s" % cor)
        self.assertTrue(cor > 0.6)


    # def test_lee_mallet(self):
    #     global bg_corpus, corpus, bg_corpus2, corpus2

    #     # create a dictionary and corpus (bag of words)
    #     dictionary = corpora.Dictionary(bg_corpus2)
    #     bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus2]
    #     corpus = [dictionary.doc2bow(text) for text in corpus2]

    #     # initialize an LDA transformation from background corpus
    #     lda = models.LdaMallet('/Users/kofola/Downloads/mallet-2.0.7/bin/mallet',
    #         corpus=bg_corpus, id2word=dictionary, num_topics=200, optimize_interval=10)
    #     corpus_lda = lda[corpus]

    #     # compute pairwise similarity matrix and extract upper triangular
    #     res = np.zeros((len(corpus), len(corpus)))
    #     for i, par1 in enumerate(corpus_lda):
    #         for j, par2 in enumerate(corpus_lda):
    #             res[i, j] = matutils.cossim(par1, par2)
    #     flat = res[matutils.triu_indices(len(corpus), 1)]

    #     cor = np.corrcoef(flat, human_sim_vector)[0, 1]
    #     logging.info("LDA correlation coefficient is %s" % cor)
    #     self.assertTrue(cor > 0.35) 
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:58,代码来源:test_lee.py


注:本文中的gensim.models.LsiModel方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。