

Python corpora.MmCorpus Code Examples

This article collects typical usage examples of gensim.corpora.MmCorpus in Python. If you have been wondering what corpora.MmCorpus is for, how to use it, or what working code with it looks like, the curated examples below may help. You can also explore other usage examples from its containing module, gensim.corpora.


The 14 code examples of corpora.MmCorpus below are sorted by popularity by default. You can upvote the ones you like or find useful; your ratings help the system recommend better Python code examples.
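
Before the examples, here is a minimal, self-contained sketch of the core round trip that most of them build on: serializing a bag-of-words corpus to a Matrix Market (.mm) file with MmCorpus.serialize, then streaming it back from disk. The toy documents and the file name corpus.mm are invented for illustration.

from gensim import corpora

# Toy tokenized documents (hypothetical).
texts = [['human', 'machine', 'interface'],
         ['graph', 'minors', 'survey']]

# Map tokens to integer ids and convert each document to a bag-of-words vector.
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]

# Write the corpus in Matrix Market format; this also creates corpus.mm.index,
# which enables random access to individual documents later.
corpora.MmCorpus.serialize('corpus.mm', bow_corpus)

# Loading is lazy: documents are streamed from disk rather than held in RAM.
mm = corpora.MmCorpus('corpus.mm')
for doc in mm:
    print(doc)  # a list of (token_id, weight) tuples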

Example 1: build_lsi_model

# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def build_lsi_model(corpus_name, corpus_path, topics=300):
	logging.info('building lsi model for %s corpus', corpus_name)
	dictFile = corpus_path + corpus_name + '.dict'
	corpus_tfidf_file = corpus_path + corpus_name + '.tfidf.mm'

	logging.info('loading dictionary ...')
	dictionary = corpora.Dictionary.load(dictFile)
	logging.info('loading tfidf corpus ...')
	corpus_tfidf = corpora.MmCorpus(corpus_tfidf_file)
	logging.info('building lsi model')
	lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topics)
	logging.info('saving lsi')
	lsiFile = corpus_path + corpus_name + '.lsi'
	lsi.save(lsiFile)
	logging.info('lsi model is ready')
################################################################################## 
Author: motazsaad | Project: comparable-text-miner | Lines: 18 | Source: textpro.py
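
A hypothetical invocation of this helper, assuming the .dict and .tfidf.mm files were already written into ./data/ (for instance by prepare_gensim_corpus in Example 5 below); the corpus name and directory are invented:

from gensim import models

# './data/' is assumed to contain news.dict and news.tfidf.mm.
build_lsi_model('news', './data/', topics=300)

# The saved model can later be reloaded without retraining:
lsi = models.LsiModel.load('./data/news.lsi')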

Example 2: load

# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def load(cls, save_dir='./'):
        """
        Load the corpus from a save directory.
        """
        tables = pickle.load(open(save_dir + 'tag-tables.pickle', 'rb'))
        tagsToDocs = tables[0]
        docsToTags = tables[1]        
        titles = pickle.load(open(save_dir + 'titles.pickle', 'rb'))
        tfidf_model = TfidfModel.load(fname=save_dir + 'documents.tfidf_model')
        corpus_tfidf = corpora.MmCorpus(save_dir + 'documents_tfidf.mm')
        dictionary = corpora.Dictionary.load(fname=save_dir + 'documents.dict')
        files = pickle.load(open(save_dir + 'files.pickle', 'rb'))
        doc_line_nums = pickle.load(open(save_dir + 'doc_line_nums.pickle', 'rb'))
        
        ksearch = KeySearch(dictionary, tfidf_model, 
                            corpus_tfidf, titles, tagsToDocs,
                            docsToTags, files, doc_line_nums) 
        
        return ksearch 
Author: chrisjmccormick | Project: wiki-sim-search | Lines: 21 | Source: keysearch.py

Example 3: train

# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def train(self, prefix: str, corporas: list):
        """ Train the model.
        Saves the dictionary, corpus, and TF-IDF model to disk.

        Arguments:
            prefix {str} -- model name prefix
            corporas {list} -- tokenized documents
        """
        # Build the dictionary and the bag-of-words corpus.
        dictionary = corpora.Dictionary(corporas)
        dictionary.save('./models/{}_dict.dic'.format(prefix))  # save the dictionary

        corpus = [dictionary.doc2bow(text) for text in corporas]
        corpora.MmCorpus.serialize('./models/{}_corpuse.mm'.format(prefix), corpus)  # save the corpus
        tfidf_model = models.TfidfModel(corpus)
        tfidf_model.save("./models/{}_tfidf_model.model".format(prefix))  # save the TF-IDF model
Author: jarvisqi | Project: nlp_learning | Lines: 18 | Source: docsim.py
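
Outside the class, the three persistence steps of train can be reproduced standalone; a minimal sketch with toy documents and the hypothetical prefix demo:

import os
from gensim import corpora, models

os.makedirs('./models', exist_ok=True)

docs = [['machine', 'learning', 'model'],
        ['deep', 'learning', 'network']]  # toy tokenized documents

dictionary = corpora.Dictionary(docs)
dictionary.save('./models/demo_dict.dic')

corpus = [dictionary.doc2bow(text) for text in docs]
corpora.MmCorpus.serialize('./models/demo_corpuse.mm', corpus)

models.TfidfModel(corpus).save('./models/demo_tfidf_model.model')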

Example 4: load_corpus_and_dict

# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def load_corpus_and_dict(corpus_path, id2word_path):
    print("[BLOCK] Loading corpus and dictionary files from %s and %s" % (corpus_path, id2word_path))
    sys.stdout.flush()
    dictionary = Dictionary.load_from_text(id2word_path)

    print("[BLOCK] Loading corpus iterator")
    sys.stdout.flush()
    # mm = gensim.corpora.MmCorpus(corpus_path)  # use this for an uncompressed .mm file
    corpus = MmCorpus(bz2.BZ2File(corpus_path))  # use this if you compressed the TFIDF output (recommended)

    return corpus, dictionary
Author: kafkasl | Project: contextualLSTM | Lines: 13 | Source: topics_analysis.py
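
For context, a sketch of how the compressed input expected by this loader could be produced; the file names and toy corpus are invented. MmCorpus.serialize writes a plain .mm file, which is then compressed with the standard bz2 module:

import bz2
import shutil
from gensim import corpora

# Toy TF-IDF corpus (hypothetical): each document is a list of (token_id, weight) tuples.
corpus_tfidf = [[(0, 0.5), (1, 0.5)], [(1, 0.3), (2, 0.7)]]

# serialize() writes a plain-text Matrix Market file ...
corpora.MmCorpus.serialize('corpus_tfidf.mm', corpus_tfidf)

# ... which can then be bz2-compressed; the loader above reads the compressed
# file directly through bz2.BZ2File without decompressing it to disk first.
with open('corpus_tfidf.mm', 'rb') as src, bz2.BZ2File('corpus_tfidf.mm.bz2', 'wb') as dst:
    shutil.copyfileobj(src, dst)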

Example 5: prepare_gensim_corpus

# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def prepare_gensim_corpus(corpus_name, corpus, output_path, min_freq=5):
	if not output_path.endswith('/'): output_path = output_path + '/'
	check_dir(output_path)  # if the directory does not exist, create it

	logging.info('building gensim corpus and dictionary for %s corpus', corpus_name)
	logging.info('loading corpus')
	texts = [[word for word in process_text(document, removePunct=True, removeSW=True, removeNum=True)] for document in corpus]
	logging.info('tokenizing')
	all_tokens = [item for sublist in texts for item in sublist]
	logging.info('mark tokens which have frequency less than %d', min_freq)
	tokens_once = set(k for k, v in collections.Counter(all_tokens).items() if v < min_freq)  # .iteritems() in the original is Python 2 only
	logging.info('|D|=%d', len(texts))
	logging.info('filter low frequency tokens')
	texts = [[word for word in text if word not in tokens_once] for text in texts]
	logging.info('|D|=%d', len(texts))
	logging.info('building dictionary')
	dictionary = corpora.Dictionary(texts)
	logging.info('saving dictionary')
	dictFile = output_path + corpus_name + '.dict'
	dictionary.save(dictFile)
	logging.info('building corpus in mm format')
	corpus = [dictionary.doc2bow(text) for text in texts]
	logging.info('saving corpus')
	gensim_corpus_file = output_path + corpus_name + '.mm'
	corpora.MmCorpus.serialize(gensim_corpus_file, corpus)
	logging.info('computing tfidf')
	tfidf = models.TfidfModel(corpus)  # tfidf model
	corpus_tfidf = tfidf[corpus]  # tfidf corpus
	logging.info('saving tfidf corpus')
	corpus_tfidf_file = output_path + corpus_name + '.tfidf.mm'
	corpora.MmCorpus.serialize(corpus_tfidf_file, corpus_tfidf)
	logging.info('gensim corpus is ready')
################################################################################## 
Author: motazsaad | Project: comparable-text-miner | Lines: 35 | Source: textpro.py
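
A design note on the manual low-frequency filtering above: gensim's Dictionary has a built-in filter that achieves a similar effect in one call, though it thresholds on document frequency rather than on raw term frequency, so the result can differ slightly. A minimal sketch with toy data:

from gensim import corpora

texts = [['alpha', 'beta'], ['alpha', 'gamma'], ['alpha', 'beta', 'delta']]  # toy corpus

dictionary = corpora.Dictionary(texts)
# Keep only tokens that appear in at least 2 documents; no_above=1.0 disables
# the upper document-frequency cutoff.
dictionary.filter_extremes(no_below=2, no_above=1.0)
corpus = [dictionary.doc2bow(text) for text in texts]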

Example 6: save

# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def save(self, save_dir='./'):
        """
        Write out the built corpus to a save directory.
        """
        # Store the tag tables.
        pickle.dump((self.tagsToDocs, self.docsToTags), open(save_dir + 'tag-tables.pickle', 'wb'))
        
        # Store the document titles.
        pickle.dump(self.titles, open(save_dir + 'titles.pickle', 'wb'))
        
        # Write out the tfidf model.
        self.tfidf_model.save(save_dir + 'documents.tfidf_model')
        
        # Write out the tfidf corpus.
        corpora.MmCorpus.serialize(save_dir + 'documents_tfidf.mm', self.corpus_tfidf)  

        # Write out the dictionary.
        self.dictionary.save(save_dir + 'documents.dict')
        
        # Save the filenames.
        pickle.dump(self.files, open(save_dir + 'files.pickle', 'wb'))
        
        # Save the file ID and line numbers for each document.
        pickle.dump(self.doc_line_nums, open(save_dir + 'doc_line_nums.pickle', 'wb'))
        
        # Objects that are not saved:
        #  - stop_list - You don't need to filter stop words for new input
        #                text, they simply aren't found in the dictionary.
        #  - frequency - This preliminary word count object is only used for
        #                removing infrequent words. Final word counts are in
        #                the `dictionary` object. 
Author: chrisjmccormick | Project: wiki-sim-search | Lines: 33 | Source: keysearch.py

Example 7: test_textcorpus

# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def test_textcorpus(self):
        """Make sure TextCorpus can be serialized to disk. """
        # construct corpus from file
        miislita = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))

        # make sure serializing works
        ftmp = get_tmpfile('test_textcorpus.mm')
        corpora.MmCorpus.save_corpus(ftmp, miislita)
        self.assertTrue(os.path.exists(ftmp))

        # make sure deserializing gives the same result
        miislita2 = corpora.MmCorpus(ftmp)
        self.assertEqual(list(miislita), list(miislita2)) 
Author: largelymfs | Project: topical_word_embeddings | Lines: 15 | Source: test_miislita.py
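
The distinction this test touches on is worth spelling out: save_corpus is the low-level writer used here, while serialize additionally stores an index file, and only an indexed corpus supports random access by document number. A minimal sketch of that difference, with invented data and a temp-file path:

import os
import tempfile
from gensim import corpora

bow = [[(0, 1.0)], [(1, 2.0)]]  # toy bag-of-words corpus

path = os.path.join(tempfile.gettempdir(), 'indexed.mm')
corpora.MmCorpus.serialize(path, bow)  # save_corpus() plus an .index file

mm = corpora.MmCorpus(path)
print(mm[1])  # random access works only because indexed.mm.index was written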

Example 8: calc_similarity

# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def calc_similarity(self, prefix: str, text: str):
        """Compute similarity.
        Returns the indexes and cosine values of the documents whose
        similarity to `text` exceeds the threshold self.keep_val.

        Arguments:
            prefix {str} -- model name prefix
            text {str} -- input text
        """
        dictionary = corpora.Dictionary.load('./models/{}_dict.dic'.format(prefix))  # load the dictionary
        corpus = corpora.MmCorpus('./models/{}_corpuse.mm'.format(prefix))  # load the corpus
        tfidf_model = models.TfidfModel.load("./models/{}_tfidf_model.model".format(prefix))  # load the TF-IDF model
        corpus_tfidf = tfidf_model[corpus]

        lsi = models.LsiModel(corpus_tfidf)
        corpus_lsi = lsi[corpus_tfidf]
        similarity_lsi = similarities.Similarity('./models/similarity-lsi-index',
                                                 corpus_lsi,
                                                 num_features=400,
                                                 num_best=3)
        cut_raw = self.segment(text)  # 1. tokenize
        corpus = dictionary.doc2bow(cut_raw)  # 2. convert to a bow vector
        corpus_tfidf = tfidf_model[corpus]  # 3. compute tfidf weights
        corpus_lsi = lsi[corpus_tfidf]  # 4. project into LSI space
        sims = similarity_lsi[corpus_lsi]

        with open('./data/idx_dic.dic', 'r') as f:
            dt = f.read()
            idx_dic = eval(dt)

        result = []
        if sims is not None:
            result = [idx_dic[idx] for idx, val in sims if val > self.keep_val]

        return result
Author: jarvisqi | Project: nlp_learning | Lines: 37 | Source: docsim.py
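
A design note on this example: LsiModel is retrained from the TF-IDF corpus on every call, which is the most expensive step. If the corpus changes rarely, the model can be trained once and reloaded per query instead. A minimal sketch with a stand-in corpus and invented file names:

import os
from gensim import models

os.makedirs('./models', exist_ok=True)

corpus_tfidf = [[(0, 0.7), (1, 0.3)], [(1, 0.9)]]  # stand-in for the real TF-IDF corpus

lsi = models.LsiModel(corpus_tfidf, num_topics=400)  # train once, offline
lsi.save('./models/demo.lsi')

lsi = models.LsiModel.load('./models/demo.lsi')  # reload cheaply at query time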

Example 9: update_model

# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def update_model(self, prefix: str, sysno: int, doc: str):
        """
        Update the dictionary, corpus and TF-IDF model with a new document.
        :param prefix: model name prefix
        :param sysno: system id
        :param doc:   document text
        :return:
        """

        corporas = self.segment(doc)
        # Update the dictionary.
        dictionary = corpora.Dictionary.load('./models/{}_dict.dic'.format(prefix))  # load
        dictionary.add_documents([corporas])
        dictionary.save('./models/{}_dict.dic'.format(prefix))  # save the updated dictionary

        corporas_docs = np.load("./data/{}_words.npy".format(prefix), allow_pickle=True)  # object array; allow_pickle is required on NumPy >= 1.17
        corporas_docs = list(corporas_docs)
        corporas_docs.append(corporas)
        np.save("./data/{}_words.npy".format(prefix), corporas_docs)
        # Rebuild and save the corpus.
        corpus = [dictionary.doc2bow(text) for text in corporas_docs]
        corpora.MmCorpus.serialize('./models/{}_corpuse.mm'.format(prefix), corpus)

        # Update the TfidfModel.
        tfidf_model = models.TfidfModel(corpus)
        tfidf_model.save("./models/{}_tfidf_model.model".format(prefix))

        # Update the index dictionary.
        with open('./data/idx_dic.dic', 'r') as f:
            dt = f.read()
            idx_dic = eval(dt)

        if sysno not in idx_dic.values():
            idx_dic[len(idx_dic)] = sysno

        with open('./data/idx_dic.dic', 'w') as f:
            f.write(str(idx_dic))
Author: jarvisqi | Project: nlp_learning | Lines: 39 | Source: docsim.py
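
One caveat in Examples 8 and 9: the index mapping is persisted with str() and read back with eval(), which will execute whatever the file contains. A safer sketch using the standard json module (the .json path is hypothetical; JSON object keys are always strings, so integer keys must be restored on load):

import json
import os

os.makedirs('./data', exist_ok=True)

idx_dic = {0: 600740, 1: 600741}  # toy index mapping

with open('./data/idx_dic.json', 'w') as f:
    json.dump(idx_dic, f)

with open('./data/idx_dic.json', 'r') as f:
    idx_dic = {int(k): v for k, v in json.load(f).items()}  # restore int keys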

Example 10: create_plain_corpus

# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def create_plain_corpus(self):
        """ Create a plain vector corpus, where each vector represents a
            document. Each element of the vector contains the count of
            the corresponding word (as indexed by the dictionary) in
            the document. """
        if self._dictionary is None:
            self.load_dictionary()
        dci = CorpusIterator(dictionary=self._dictionary)
        corpora.MmCorpus.serialize(self._PLAIN_CORPUS_FILE, dci) 
Author: mideind | Project: Greynir | Lines: 11 | Source: builder.py

Example 11: load_plain_corpus

# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def load_plain_corpus(self):
        """ Load the plain corpus from file """
        return corpora.MmCorpus(self._PLAIN_CORPUS_FILE) 
Author: mideind | Project: Greynir | Lines: 5 | Source: builder.py

Example 12: create_tfidf_corpus

# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def create_tfidf_corpus(self):
        """ Create a TFIDF corpus from a plain vector corpus """
        if self._tfidf is None:
            self.load_tfidf_model()
        corpus = self.load_plain_corpus()
        corpus_tfidf = self._tfidf[corpus]
        corpora.MmCorpus.serialize(self._TFIDF_CORPUS_FILE, corpus_tfidf) 
Author: mideind | Project: Greynir | Lines: 9 | Source: builder.py

Example 13: load_tfidf_corpus

# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def load_tfidf_corpus(self):
        """ Load a TFIDF corpus from file """
        return corpora.MmCorpus(self._TFIDF_CORPUS_FILE) 
Author: mideind | Project: Greynir | Lines: 5 | Source: builder.py
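
These loaders hand back the MmCorpus object directly, which is cheap: the matrix is not read into memory. Documents are streamed from disk on iteration, and basic shape metadata is available up front. A minimal sketch, assuming a hypothetical plain.mm previously written by serialize():

from gensim import corpora

mm = corpora.MmCorpus('plain.mm')  # hypothetical file produced by MmCorpus.serialize
print(mm.num_docs, mm.num_terms, mm.num_nnz)  # documents, vocabulary size, non-zeros

for doc in mm:  # streamed one document at a time
    pass        # each doc is a list of (token_id, weight) tuples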

Example 14: classifyRealtimeStockNews

# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def classifyRealtimeStockNews(self,doc_list):
		'''Classify real-time news (articles/documents) about a specific stock.

		# Arguments:
			doc_list: List of real-time news (articles/documents) crawled from specific websites.
		'''
		print(' * extract relevant stock codes from latest crawled news ... ')
		relevant_stock_list = self.extractStockCodeFromRealtimeNews(doc_list)
		if len(relevant_stock_list) != 0:
			tfDim = 200
			for i, code_list in enumerate(relevant_stock_list):
				for code in code_list:

					print(' * load SVM parameters (gamma & C) ... ')
					Params_svm = {'kernel': ['rbf'], 'gamma': [10, 20, 50, 100, 150, 200], \
						'C': [10, 15, 20, 30, 50, 100]}

					print(' * use historical news to build SVM model of ' + code + ' ... ')
					self.classifyHistoryStockNews("Stock_News",code,modelType='lda',tfDim=tfDim,renewDict=False,\
							renewModel=False,Classifier='SVM',Params=Params_svm) #code="600740"

					print(' * load historical dictionary of ' + code + ' ...')
					dictionary = corpora.Dictionary.load(os.getcwd() + '\\' + 'stock_dict_file\\' + code + '\\' + code + '_dict.dict')
					
					print(' * tokenize latest crawled news ... ')
					token = self.tp.jieba_tokenize(doc_list)

					print(' * create bow-vector of latest news of ' + code + ' ... ')
					bowvec_doc = [dictionary.doc2bow(text) for text in token]
					
					print(' * load bow-vector of historical news of ' + code + ' ... ')
					bowvec_all = list(corpora.MmCorpus(os.getcwd() + '\\' + 'stock_dict_file\\' + code + '\\' + code + '_bowvec.mm'))
					
					print(' * extend latest bow-vector to historical bow-vector of ' + code + ' ... ')
					bowvec_all.extend(bowvec_doc)
					
					print(' * create new lda model of ' + code + ' ... ')
					_, NewmodelVec = self.tp.CallTransformationModel(dictionary,bowvec_all,modelType='lda',\
									tfDim=200,renewModel=False,modelPath=os.getcwd() + '\\' + 'stock_dict_file\\' + code + '\\')
					
					print(' * convert latest lda vector to CSR matrix of ' + code + ' ... ')
					NewCSRMatrix = self.ConvertToCSRMatrix(NewmodelVec)
					
					print(' * load SVM model of ' + code + ' ... ')
					clf = joblib.load(os.getcwd() + '\\' + 'stock_dict_file\\' + code + '\\' + code + '_svm.pkl') 
					
					print(' * predicting ... ')
					if clf.predict(NewCSRMatrix[i-2,:])[0] == 1:
						print('   "' + doc_list[i].split(' ')[0] + '" is positive news for ' + code + ' ...')
					elif clf.predict(NewCSRMatrix[i-2,:])[0] == -1:
						print('   "' + doc_list[i].split(' ')[0] + '" is negative news for ' + code + ' ...')
					else:
						print('   "' + doc_list[i].split(' ')[0] + '" is neutral news for ' + code + ' ...')
		else:
			print(' * no relevant stock found ... ')
Author: DemonDamon | Project: Listed-company-news-crawl-and-text-analysis | Lines: 57 | Source: text_mining.py


Note: The gensim.corpora.MmCorpus examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by their respective developers; copyright remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not repost without permission.