This article collects typical usage examples of the Python method gensim.corpora.MmCorpus. If you have been wondering how to use corpora.MmCorpus in Python, what it is for, or where to find examples of it, the curated code examples below should help. You can also explore further usage of its containing module, gensim.corpora.
Below are 14 code examples of the corpora.MmCorpus method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
Example 1: build_lsi_model
# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def build_lsi_model(corpus_name, corpus_path, topics=300):
    logging.info('building lsi model for %s corpus', corpus_name)
    dictFile = corpus_path + corpus_name + '.dict'
    corpus_tfidf_file = corpus_path + corpus_name + '.tfidf.mm'
    logging.info('loading dictionary ...')
    dictionary = corpora.Dictionary.load(dictFile)
    logging.info('loading tfidf corpus ...')
    corpus_tfidf = corpora.MmCorpus(corpus_tfidf_file)
    logging.info('building lsi model')
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topics)
    logging.info('saving lsi')
    lsiFile = corpus_path + corpus_name + '.lsi'
    lsi.save(lsiFile)
    logging.info('lsi model is ready')
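A minimal invocation sketch; the 'news' corpus name and './data/' path are hypothetical placeholders, and the .dict and .tfidf.mm inputs are the files produced by prepare_gensim_corpus in Example 5:
import logging
logging.basicConfig(level=logging.INFO)
# Hypothetical corpus name and path; writes ./data/news.lsi on success.
build_lsi_model(corpus_name='news', corpus_path='./data/', topics=300)
# The saved model can be reloaded later with models.LsiModel.load('./data/news.lsi').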
##################################################################################
Example 2: load
# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def load(cls, save_dir='./'):
    """
    Load the corpus from a save directory.
    """
    tables = pickle.load(open(save_dir + 'tag-tables.pickle', 'rb'))
    tagsToDocs = tables[0]
    docsToTags = tables[1]
    titles = pickle.load(open(save_dir + 'titles.pickle', 'rb'))
    tfidf_model = TfidfModel.load(fname=save_dir + 'documents.tfidf_model')
    corpus_tfidf = corpora.MmCorpus(save_dir + 'documents_tfidf.mm')
    dictionary = corpora.Dictionary.load(fname=save_dir + 'documents.dict')
    files = pickle.load(open(save_dir + 'files.pickle', 'rb'))
    doc_line_nums = pickle.load(open(save_dir + 'doc_line_nums.pickle', 'rb'))
    ksearch = KeySearch(dictionary, tfidf_model,
                        corpus_tfidf, titles, tagsToDocs,
                        docsToTags, files, doc_line_nums)
    return ksearch
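A hedged round-trip sketch; it assumes the matching save() from Example 6 was run first and that load() is a classmethod of the class owning KeySearch:
# Hypothetical directory; must contain the pickles, .tfidf_model, .mm and .dict files listed above.
ksearch = KeySearch.load(save_dir='./model/')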
Example 3: train
# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def train(self, prefix: str, corporas: list):
    """Train the model.
    Saves the dictionary, corpus, and model to disk.
    Arguments:
        prefix {str} -- model name prefix
        corporas {list} -- tokenized documents
    """
    # Build the dictionary and the bag-of-words corpus
    dictionary = corpora.Dictionary(corporas)
    dictionary.save('./models/{}_dict.dic'.format(prefix))  # save the generated dictionary
    corpus = [dictionary.doc2bow(text) for text in corporas]
    corpora.MmCorpus.serialize('./models/{}_corpuse.mm'.format(prefix), corpus)  # save the generated corpus
    tfidf_model = models.TfidfModel(corpus)
    tfidf_model.save("./models/{}_tfidf_model.model".format(prefix))  # save the TF-IDF model
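A hedged usage sketch; trainer stands for an instance of whatever class defines train(), and the token lists are placeholders:
# Each inner list is one pre-tokenized document.
docs = [['machine', 'learning', 'model'], ['deep', 'learning', 'network']]
trainer.train(prefix='demo', corporas=docs)
# Writes ./models/demo_dict.dic, ./models/demo_corpuse.mm and ./models/demo_tfidf_model.model.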
Example 4: load_corpus_and_dict
# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def load_corpus_and_dict(corpus_path, id2word_path):
    print("[BLOCK] Loading corpus and dictionary files from %s and %s" % (corpus_path, id2word_path))
    sys.stdout.flush()
    dictionary = Dictionary.load_from_text(id2word_path)
    print("[BLOCK] Loading corpus iterator")
    sys.stdout.flush()
    # mm = gensim.corpora.MmCorpus(corpus_path)
    corpus = MmCorpus(bz2.BZ2File(corpus_path))  # use this if you compressed the TFIDF output (recommended)
    return corpus, dictionary
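A hedged invocation sketch; the file names are placeholders for a bz2-compressed MatrixMarket corpus and a plain-text dictionary saved with Dictionary.save_as_text:
# MmCorpus reads transparently from the file object returned by bz2.BZ2File.
corpus, dictionary = load_corpus_and_dict('corpus_tfidf.mm.bz2', 'wordids.txt')
print(corpus.num_docs, len(dictionary))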
Example 5: prepare_gensim_corpus
# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def prepare_gensim_corpus(corpus_name, corpus, output_path, min_freq=5):
    if not output_path.endswith('/'):
        output_path = output_path + '/'
    check_dir(output_path)  # if the directory does not exist, create it
    logging.info('building gensim corpus and dictionary for %s corpus', corpus_name)
    logging.info('loading corpus')
    texts = [[word for word in process_text(document, removePunct=True, removeSW=True, removeNum=True)] for document in corpus]
    logging.info('tokenizing')
    all_tokens = [item for sublist in texts for item in sublist]
    logging.info('mark tokens which have frequency less than %d', min_freq)
    tokens_once = set(k for k, v in collections.Counter(all_tokens).items() if v < min_freq)
    logging.info('|D|=%d', len(texts))
    logging.info('filter low frequency tokens')
    texts = [[word for word in text if word not in tokens_once] for text in texts]
    logging.info('|D|=%d', len(texts))
    logging.info('building dictionary')
    dictionary = corpora.Dictionary(texts)
    logging.info('saving dictionary')
    dictFile = output_path + corpus_name + '.dict'
    dictionary.save(dictFile)
    logging.info('building corpus in mm format')
    corpus = [dictionary.doc2bow(text) for text in texts]
    logging.info('saving corpus')
    gensim_corpus_file = output_path + corpus_name + '.mm'
    corpora.MmCorpus.serialize(gensim_corpus_file, corpus)
    logging.info('computing tfidf')
    tfidf = models.TfidfModel(corpus)  # tfidf model
    corpus_tfidf = tfidf[corpus]  # tfidf corpus
    logging.info('saving tfidf corpus')
    corpus_tfidf_file = output_path + corpus_name + '.tfidf.mm'
    corpora.MmCorpus.serialize(corpus_tfidf_file, corpus_tfidf)
    logging.info('gensim corpus is ready')
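A hedged end-to-end sketch; process_text and check_dir are this repository's own helpers, so the call only works in that context, and the documents are placeholders:
raw_docs = ['the first sample document', 'the second sample document']
prepare_gensim_corpus('demo', raw_docs, './data/', min_freq=1)
# ./data/demo.dict and ./data/demo.tfidf.mm now exist and feed Example 1:
build_lsi_model('demo', './data/', topics=50)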
##################################################################################
Example 6: save
# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def save(self, save_dir='./'):
    """
    Write out the built corpus to a save directory.
    """
    # Store the tag tables.
    pickle.dump((self.tagsToDocs, self.docsToTags), open(save_dir + 'tag-tables.pickle', 'wb'))
    # Store the document titles.
    pickle.dump(self.titles, open(save_dir + 'titles.pickle', 'wb'))
    # Write out the tfidf model.
    self.tfidf_model.save(save_dir + 'documents.tfidf_model')
    # Write out the tfidf corpus.
    corpora.MmCorpus.serialize(save_dir + 'documents_tfidf.mm', self.corpus_tfidf)
    # Write out the dictionary.
    self.dictionary.save(save_dir + 'documents.dict')
    # Save the filenames.
    pickle.dump(self.files, open(save_dir + 'files.pickle', 'wb'))
    # Save the file ID and line numbers for each document.
    pickle.dump(self.doc_line_nums, open(save_dir + 'doc_line_nums.pickle', 'wb'))
    # Objects that are not saved:
    #   - stop_list - You don't need to filter stop words for new input
    #                 text; they simply aren't found in the dictionary.
    #   - frequency - This preliminary word count object is only used for
    #                 removing infrequent words. Final word counts are in
    #                 the `dictionary` object.
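A minimal usage sketch, assuming save_dir already exists (the method concatenates paths and does not create the directory):
import os
os.makedirs('./model/', exist_ok=True)  # hypothetical directory
ksearch.save(save_dir='./model/')       # ksearch is a built corpus object
# Restore later with the load() classmethod shown in Example 2.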
Example 7: test_textcorpus
# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def test_textcorpus(self):
    """Make sure TextCorpus can be serialized to disk."""
    # construct corpus from file
    miislita = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))
    # make sure serializing works
    ftmp = get_tmpfile('test_textcorpus.mm')
    corpora.MmCorpus.save_corpus(ftmp, miislita)
    self.assertTrue(os.path.exists(ftmp))
    # make sure deserializing gives the same result
    miislita2 = corpora.MmCorpus(ftmp)
    self.assertEqual(list(miislita), list(miislita2))
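A hedged aside on the two persistence paths the test touches: save_corpus writes only the document vectors, while serialize additionally writes an index file that makes the stored corpus randomly addressable:
# Hypothetical toy corpus of two bag-of-words documents.
bow = [[(0, 1.0), (1, 2.0)], [(1, 1.0)]]
corpora.MmCorpus.serialize('/tmp/demo.mm', bow)  # writes demo.mm plus demo.mm.index
mm = corpora.MmCorpus('/tmp/demo.mm')
print(list(mm[0]))  # random access works because the index exists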
Example 8: calc_similarity
# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def calc_similarity(self, prefix: str, text: str):
    """Compute similarity; return the indices and cosine scores
    of stored documents whose score exceeds the threshold self.keep_val.
    Arguments:
        prefix {str} -- model name prefix
        text {str} -- input text
    """
    dictionary = corpora.Dictionary.load('./models/{}_dict.dic'.format(prefix))  # load the dictionary
    corpus = corpora.MmCorpus('./models/{}_corpuse.mm'.format(prefix))  # load the corpus
    tfidf_model = models.TfidfModel.load("./models/{}_tfidf_model.model".format(prefix))  # load the TF-IDF model
    corpus_tfidf = tfidf_model[corpus]
    lsi = models.LsiModel(corpus_tfidf)
    corpus_lsi = lsi[corpus_tfidf]
    similarity_lsi = similarities.Similarity('./models/similarity-lsi-index',
                                             corpus_lsi,
                                             num_features=400,
                                             num_best=3)
    cut_raw = self.segment(text)          # 1. tokenize
    corpus = dictionary.doc2bow(cut_raw)  # 2. convert to a bow vector
    corpus_tfidf = tfidf_model[corpus]    # 3. compute the tfidf value
    corpus_lsi = lsi[corpus_tfidf]        # 4. compute the lsi value
    sims = similarity_lsi[corpus_lsi]
    with open('./data/idx_dic.dic', 'r') as f:
        dt = f.read()
    idx_dic = eval(dt)
    result = []
    if sims is not None:
        result = [idx_dic[idx] for idx, val in sims if val > self.keep_val]
    return result
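A hedged query sketch; the artifacts written by Example 3's train() and an idx_dic.dic mapping must already exist, and keep_val is a threshold attribute of the owning class:
# Returns the system IDs of the (up to) 3 most similar stored documents.
matches = searcher.calc_similarity(prefix='demo', text='some query text')
print(matches)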
Example 9: update_model
# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def update_model(self, prefix: str, sysno: int, doc: str):
    """
    Update the dictionary, corpus, and TF-IDF model with a new document.
    :param prefix: model name prefix
    :param sysno: system ID
    :param doc: document text
    :return:
    """
    corporas = self.segment(doc)
    # Update the dictionary
    dictionary = corpora.Dictionary.load('./models/{}_dict.dic'.format(prefix))  # load
    dictionary.add_documents([corporas])
    dictionary.save('./models/{}_dict.dic'.format(prefix))  # save the updated dictionary
    # Newer NumPy versions may require np.load(..., allow_pickle=True) here.
    corporas_docs = np.load("./data/{}_words.npy".format(prefix))
    corporas_docs = list(corporas_docs)
    corporas_docs.append(corporas)
    np.save("./data/{}_words.npy".format(prefix), corporas_docs)
    # Update the corpus
    corpus = [dictionary.doc2bow(text) for text in corporas_docs]
    corpora.MmCorpus.serialize('./models/{}_corpuse.mm'.format(prefix), corpus)
    # Update the TfidfModel
    tfidf_model = models.TfidfModel(corpus)
    tfidf_model.save("./models/{}_tfidf_model.model".format(prefix))
    # Update the index dictionary
    with open('./data/idx_dic.dic', 'r') as f:
        dt = f.read()
    idx_dic = eval(dt)
    if sysno not in idx_dic.values():
        idx_dic[len(idx_dic)] = sysno
    with open('./data/idx_dic.dic', 'w') as f:
        f.write(str(idx_dic))
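A hedged incremental-update sketch; after the call, queries through Example 8's calc_similarity can match the new document:
# Hypothetical system number and document text.
searcher.update_model(prefix='demo', sysno=1001, doc='a newly arrived document')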
Example 10: create_plain_corpus
# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def create_plain_corpus(self):
    """Create a plain vector corpus, where each vector represents a
    document. Each element of the vector contains the count of
    the corresponding word (as indexed by the dictionary) in
    the document."""
    if self._dictionary is None:
        self.load_dictionary()
    dci = CorpusIterator(dictionary=self._dictionary)
    corpora.MmCorpus.serialize(self._PLAIN_CORPUS_FILE, dci)
Example 11: load_plain_corpus
# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def load_plain_corpus(self):
    """Load the plain corpus from file."""
    return corpora.MmCorpus(self._PLAIN_CORPUS_FILE)
Example 12: create_tfidf_corpus
# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def create_tfidf_corpus(self):
    """Create a TFIDF corpus from a plain vector corpus."""
    if self._tfidf is None:
        self.load_tfidf_model()
    corpus = self.load_plain_corpus()
    corpus_tfidf = self._tfidf[corpus]
    corpora.MmCorpus.serialize(self._TFIDF_CORPUS_FILE, corpus_tfidf)
Example 13: load_tfidf_corpus
# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def load_tfidf_corpus(self):
    """Load a TFIDF corpus from file."""
    return corpora.MmCorpus(self._TFIDF_CORPUS_FILE)
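Examples 10 through 13 form one pipeline; a hedged sketch of the intended call order, where builder stands for an instance of the owning class:
builder.create_plain_corpus()               # Example 10: serialize raw word counts
plain = builder.load_plain_corpus()         # Example 11: stream them back
builder.create_tfidf_corpus()               # Example 12: reweight counts as TF-IDF and serialize
tfidf_corpus = builder.load_tfidf_corpus()  # Example 13: stream the weighted corpus lazily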
Example 14: classifyRealtimeStockNews
# Required import: from gensim import corpora [as alias]
# Or: from gensim.corpora import MmCorpus [as alias]
def classifyRealtimeStockNews(self, doc_list):
    '''Classify real-time news (articles/documents) of specific stocks.
    Arguments:
        doc_list: List of real-time news (articles/documents) crawled from specific websites.
    '''
    print(' * extract relevant stock codes from latest crawled news ... ')
    relevant_stock_list = self.extractStockCodeFromRealtimeNews(doc_list)
    if len(relevant_stock_list) != 0:
        tfDim = 200
        for i, code_list in enumerate(relevant_stock_list):
            for code in code_list:
                print(' * load SVM parameters (gamma & C) ... ')
                Params_svm = {'kernel': ['rbf'], 'gamma': [10, 20, 50, 100, 150, 200],
                              'C': [10, 15, 20, 30, 50, 100]}
                print(' * use historical news to build SVM model of ' + code + ' ... ')
                self.classifyHistoryStockNews("Stock_News", code, modelType='lda', tfDim=tfDim, renewDict=False,
                                              renewModel=False, Classifier='SVM', Params=Params_svm)  # e.g. code="600740"
                print(' * load historical dictionary of ' + code + ' ...')
                dictionary = corpora.Dictionary.load(os.getcwd() + '\\' + 'stock_dict_file\\' + code + '\\' + code + '_dict.dict')
                print(' * tokenize latest crawled news ... ')
                token = self.tp.jieba_tokenize(doc_list)
                print(' * create bow-vector of latest news of ' + code + ' ... ')
                bowvec_doc = [dictionary.doc2bow(text) for text in token]
                print(' * load bow-vector of historical news of ' + code + ' ... ')
                bowvec_all = list(corpora.MmCorpus(os.getcwd() + '\\' + 'stock_dict_file\\' + code + '\\' + code + '_bowvec.mm'))
                print(' * extend latest bow-vector to historical bow-vector of ' + code + ' ... ')
                bowvec_all.extend(bowvec_doc)
                print(' * create new lda model of ' + code + ' ... ')
                _, NewmodelVec = self.tp.CallTransformationModel(dictionary, bowvec_all, modelType='lda',
                                                                 tfDim=200, renewModel=False,
                                                                 modelPath=os.getcwd() + '\\' + 'stock_dict_file\\' + code + '\\')
                print(' * convert latest lda vector to CSR matrix of ' + code + ' ... ')
                NewCSRMatrix = self.ConvertToCSRMatrix(NewmodelVec)
                print(' * load SVM model of ' + code + ' ... ')
                clf = joblib.load(os.getcwd() + '\\' + 'stock_dict_file\\' + code + '\\' + code + '_svm.pkl')
                print(' * predicting ... ')
                if clf.predict(NewCSRMatrix[i-2, :])[0] == 1:
                    print(' "' + doc_list[i].split(' ')[0] + '" is positive news for ' + code + ' ...')
                elif clf.predict(NewCSRMatrix[i-2, :])[0] == -1:
                    print(' "' + doc_list[i].split(' ')[0] + '" is negative news for ' + code + ' ...')
                else:
                    print(' "' + doc_list[i].split(' ')[0] + '" is neutral news for ' + code + ' ...')
    else:
        print(' * not any relevant stock ... ')
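A hedged note on the list(corpora.MmCorpus(...)) call above: wrapping the corpus in list() materializes every historical bow vector in memory at once, whereas plain iteration streams them one document at a time:
# Hypothetical path, reusing the example code "600740" from the snippet.
mm = corpora.MmCorpus('stock_dict_file/600740/600740_bowvec.mm')
for bow in mm:      # lazy: constant memory per document
    pass
bow_all = list(mm)  # eager: loads everything, as Example 14 does before extending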