This article collects typical usage examples of the LdaModel.save method from Python's gensim.models. If you are wondering what LdaModel.save does, how to call it, or simply want working examples, the curated code samples below may help. You can also read more about its containing class, gensim.models.LdaModel.
The following 15 code examples of LdaModel.save are sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
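Before diving into the examples, here is a minimal, self-contained sketch of the save/load round trip that all of them rely on. The toy documents, file name, and topic count are illustrative assumptions, not taken from any example below.
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# toy corpus: two tokenized documents (hypothetical)
docs = [["human", "interface", "computer"], ["graph", "trees", "system"]]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)
lda.save('lda.model')             # persists the model (large arrays go to separate .npy files)
lda = LdaModel.load('lda.model')  # restores it later, e.g. in another process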
Example 1: create_lda_model
# Required module import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
def create_lda_model(project, corpus, id2word, name, use_level=True, force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level
    model_fname += '.lda.gz'

    if not os.path.exists(model_fname) or force:
        if corpus:
            update_every = None  # run in batch if we have a pre-supplied corpus
        else:
            update_every = 1

        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         alpha=project.alpha,
                         eta=project.eta,
                         passes=project.passes,
                         num_topics=project.num_topics,
                         iterations=project.iterations,
                         eval_every=None,  # disable perplexity tests for speed
                         update_every=update_every,
                         )

        if corpus:
            model.save(model_fname)
    else:
        model = LdaModel.load(model_fname)

    return model, model_fname
Example 2: create_model
# Required module import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
def create_model(config, Kind):
    model_fname = config.model_fname % Kind.__name__
    corpus_fname = config.corpus_fname % Kind.__name__

    if not os.path.exists(model_fname):
        try:
            id2word = Dictionary.load(corpus_fname + '.dict')
            corpus = MalletCorpus(corpus_fname, id2word=id2word)
            logger.info('Opened previously created corpus: %s' % corpus_fname)
        except:
            error('Corpora for building file models not found!')

        file_model = LdaModel(corpus,
                              id2word=corpus.id2word,
                              alpha=config.alpha,
                              passes=config.passes,
                              num_topics=config.num_topics)

        file_model.save(model_fname)
Example 3: lda
# Required module import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
def lda():
    # remove stop words
    stopwords = codecs.open('../conf/stop_words_ch.txt', mode='r', encoding='utf8').readlines()
    stopwords = [w.strip() for w in stopwords]

    fp = codecs.open('D:\\nlp\\corpora\\segs.txt', mode='r', encoding='utf8')
    train = []
    for line in fp:
        line = line.split()
        train.append([w for w in line if w not in stopwords])

    dictionary = corpora.Dictionary(train)
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
    lda.print_topics(30)
    # print topic id=20
    lda.print_topic(20)
    # save/load model
    lda.save('D:\\nlp\\corpora\\news.model')
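The comment above mentions save/load, but only save is exercised. As a hedged complement, reusing the path assumed above, reloading the persisted model would look like this:
lda = LdaModel.load('D:\\nlp\\corpora\\news.model')  # reload the persisted model
lda.print_topic(20)                                  # the reloaded model answers the same queries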
Example 4: upload_file
# Required module import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
#......... part of the code is omitted here .........
    # corpus = glob.glob("swcorpus/*")
    if not os.path.exists("out"):
        os.makedirs("out")
    # if not os.path.exists(os.path.join(os.path.join(os.getcwd(),
    # 'out'), foldername)): os.makedirs(os.path.join
    # (os.path.join(os.getcwd(), 'out'), foldername))

    MmCorpus.serialize(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus.mm'])), corpus)
    mm = MmCorpus('out/corpus.mm')
    print(mm)
    # doc_labels = glob.glob("corpus/*")

    print("fitting the model ...\n")
    model = LdaModel(
        corpus=mm, id2word=dictionary, num_topics=no_of_topics,
        passes=no_of_passes, eval_every=eval, chunksize=chunk,
        alpha=alpha, eta=eta)
    # model = LdaMulticore(corpus=corpus, id2word=dictionary,
    #                      num_topics=no_of_topics, passes=no_of_passes,
    #                      eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)
    print(model, "\n")

    topics = model.show_topics(num_topics=no_of_topics)
    for item, i in zip(topics, enumerate(topics)):
        print("topic #"+str(i[0])+": "+str(item)+"\n")

    print("saving ...\n")
    if not os.path.exists("out"):
        os.makedirs("out")
    # if not os.path.exists(os.path.join(os.path.join(os.getcwd(),
    # 'out'), foldername)):
    #     os.makedirs(os.path.join(os.path.join(os.getcwd(), 'out'),
    #                 foldername))

    with open(
            os.path.join(os.path.join(os.getcwd(), "out"), ''.join(
                ["corpus_doclabels.txt"])), "w", encoding="utf-8") as f:
        for item in doc_labels:
            f.write(item + "\n")

    with open(
            os.path.join(os.path.join(os.getcwd(), "out"), ''.join(
                ["corpus_topics.txt"])), "w", encoding="utf-8") as f:
        for item, i in zip(topics, enumerate(topics)):
            f.write(
                "".join(["topic #", str(i[0]), ": ", str(item), "\n"]))

    dictionary.save(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus', 'dict'])))
    # MmCorpus.serialize(
    #     os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
    #         [foldername, 'mm'])), corpus)
    model.save(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus', 'lda'])))
    print("\n ta-daaaa ...\n")

    # VISUALIZATION
    no_of_topics = model.num_topics
    no_of_docs = len(doc_labels)
    doc_topic = np.zeros((no_of_docs, no_of_topics))
    for doc, i in zip(corpus, range(no_of_docs)):
        # topic_dist is a list of tuples (topic_id, topic_prob)
        topic_dist = model.__getitem__(doc)
        for topic in topic_dist:
            doc_topic[i][topic[0]] = topic[1]

    # get plot labels
    topic_labels = []
    for i in range(no_of_topics):
        # show_topic() returns tuples (word_prob, word)
        topic_terms = [x[0] for x in model.show_topic(i, topn=3)]
        topic_labels.append(" ".join(topic_terms))

    # cf. https://de.dariah.eu/tatom/topic_model_visualization.html
    if no_of_docs > 20 or no_of_topics > 20:
        plt.figure(figsize=(20, 20))  # if many items, enlarge figure
    plt.pcolor(doc_topic, norm=None, cmap='Reds')
    plt.yticks(np.arange(doc_topic.shape[0])+1.0, doc_labels)
    plt.xticks(
        np.arange(doc_topic.shape[1])+0.5, topic_labels, rotation='90')
    plt.gca().invert_yaxis()
    plt.colorbar(cmap='Reds')
    plt.tight_layout()
    plt.savefig("./static/corpus_heatmap.svg")
    return render_template('success.html')
Example 5: dictionary(
# Required module import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
print 'Saving dictionary (%s)...' % DICT
dictionary.save(DICT)
print 'Building bag-of-words corpus ...'
bow_corpus = [ dictionary.doc2bow(t) for t in texts ]
print 'Serializing corpus (%s) ...' % BOW
MmCorpus.serialize(BOW, bow_corpus)
size = len(bow_corpus) * 4 / 5
training = bow_corpus[:size]
testing = bow_corpus[size:]
print 'Training LDA w/ %d topics on first %d texts ...' % (Num_Topics, len(training))
lda = LdaModel(training, id2word=dictionary, num_topics=Num_Topics, passes=5, iterations = 1000)
print 'Saving LDA model (%s) ...' % NSFLDA
lda.save(NSFLDA)
print 'Random subset of topics:'
print '\n'.join(lda.print_topics())
print 'Computing perplexity on %d held-out documents ...' % len(testing)
perplexity = 2 ** -(lda.log_perplexity(testing))
print 'Perplexity: %.2f' % perplexity
Example 6: run
# Required module import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
def run(lda_model_path, corpus_path, num_topics, id2word):
    corpus = corpora.BleiCorpus(corpus_path)
    lda = LdaModel(corpus, num_topics=num_topics, id2word=id2word)
    lda.save(lda_model_path)
    return lda
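A hedged usage sketch for this helper; the corpus path, output path, dictionary file, and topic count are illustrative assumptions, not taken from the original project:
id2word = corpora.Dictionary.load('reviews.dict')  # hypothetical, pre-built dictionary
lda = run('reviews.lda', 'reviews_corpus.lda-c', num_topics=50, id2word=id2word)
print(lda.show_topics(num_topics=5))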
Example 7: print(
# Required module import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
# try with BoW vectors too?
# we will use Latent Dirichlet Allocation to try to categorize the abstracts
# the first run is slow because it has to train the model
print("lda")
lda_filename = 'model.lda'
if not os.path.isfile(lda_filename):
    lda = LdaModel(corpus, num_topics=5,
                   id2word=dictionary,
                   update_every=5,
                   chunksize=10000,
                   passes=100)
    lda.save('/tmp/model.lda')
else:
    lda = LdaModel.load('/tmp/model.lda')
lda.show_topics()

topics_matrix = lda.show_topics(formatted=False, num_words=7)
print(topics_matrix)
print(len(topics_matrix))

for topic in topics_matrix:
    i = topic[1]
    print([str(word) for word in i])

#
# topics_matrix = np.array(topics_matrix)
#
# topic_words = topics_matrix[:, :, 1]
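The commented-out numpy indexing above assumes a three-dimensional array, but in recent gensim versions show_topics(formatted=False) returns (topic_id, [(word, probability), ...]) pairs, so a plain list comprehension is enough. A hedged sketch consistent with the loop above:
top_words = [[word for word, prob in words] for topic_id, words in topics_matrix]
print(top_words)  # one list of the 7 strongest words per topic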
Example 8: W2V_cpp2
# Required module import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
class W2V_cpp2(W2V_base):
    def __init__(self, n_topic, path, folder):
        self.n_topic = n_topic
        W2V_base.__init__(self, path, folder)

        # process dict
        for prod_id in list(self.idx2prod.keys()):  # list() so keys can be deleted while iterating
            prod = self.idx2prod[prod_id]
            n_prod_id = prod_id - len(self.word_count) - 1
            del self.idx2prod[prod_id]
            self.idx2prod[n_prod_id] = prod
            self.prod2idx[prod] = n_prod_id

        for user_id in list(self.idx2user.keys()):
            user = self.idx2user[user_id]
            n_user_id = user_id - len(self.word_count) - len(self.prod2idx) - 1
            del self.idx2user[user_id]
            self.idx2user[n_user_id] = user
            self.user2idx[user] = n_user_id

    def train(self):
        data = []
        entity2id = {}
        id2entity = []
        for obj in self.data:
            doc = []
            obj_sents = obj["text_data"]
            entity = obj["prod"]
            if entity not in entity2id:
                entity2id[entity] = len(entity2id)
                id2entity.append(entity)
            doc_id = entity2id[entity]
            for obj_sent in obj_sents:
                for pair in obj_sent:
                    if pair[0] >= 0:
                        doc.append((pair[0], doc_id))
            data.append(doc)

        self.ldamodel = LdaModel(corpus=data, id2word=self.idx2word, num_topics=self.n_topic)

        f_entity = open("lda/prod.txt", "w")
        f_model = open("lda/model.txt", "w")
        f_model.write(str(len(entity2id)))
        f_model.write(" ")
        f_model.write(str(self.n_topic))
        f_model.write("\n")
        for entity in id2entity:
            f_entity.write(entity)
            f_entity.write("\n")
            f_model.write(entity)
            f_model.write(" ")
            distr = self.ldamodel.get_document_topics(data[1], minimum_phi_value=0, minimum_probability=0)
            distr = [pair[1] for pair in distr]
            for prod in distr:
                f_model.write(str(prod))
                f_model.write(" ")
            f_model.write("\n")

        self.ldamodel.save("lda/model_200")
Example 9: zip
# Required module import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
log.info('generated topics...')

# print topics
topics = model.show_topics(num_topics=no_of_topics)
for item, i in zip(topics, enumerate(topics)):
    log.info('topic #%s: %s', i[0], item)

log.info('saving results...')

# create output folder
if not os.path.exists("out"): os.makedirs("out")

# save doc_labels for further use
with open(os.path.join(os.path.join(os.getcwd(), "out"), ''.join([foldername, "_doclabels.txt"])), "w", encoding="utf-8") as f:
    for item in doc_labels: f.write(item + "\n")

# save topics for further use
with open(os.path.join(os.path.join(os.getcwd(), "out"), ''.join([foldername, "_topics.txt"])), "w", encoding="utf-8") as f:
    for item, i in zip(topics, enumerate(topics)):
        f.write("".join(["topic #", str(i[0]), ": ", str(item), "\n"]))

# save dictionary for further use
dictionary.save(os.path.join(os.path.join(os.getcwd(), "out"), '.'.join([foldername, 'dict'])))

# save model for further use
model.save(os.path.join(os.path.join(os.getcwd(), "out"), '.'.join([foldername, 'lda'])))

log.info('topic modeling finished')
Example 10: main
# Required module import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
#......... part of the code is omitted here .........
            dat_cs[ind1][ind2] = csimvalue
            dat_ts[ind1][ind2] = tsimvalue
            dat_js[ind1][ind2] = jsimvalue
            if ind1 != ind2:
                dat_cs[ind2][ind1] = csimvalue
                dat_ts[ind2][ind1] = tsimvalue
                dat_js[ind2][ind1] = jsimvalue

    cs_df = pd.DataFrame(dat_cs, index=cuisine_types, columns=cuisine_types)
    ts_df = pd.DataFrame(dat_ts, index=cuisine_types, columns=cuisine_types)
    js_df = pd.DataFrame(dat_js, index=cuisine_types, columns=cuisine_types)
    cs_df.to_csv("cosine_similarity_df.csv", header=True, index=True)
    ts_df.to_csv("tanimoto_similarity_df.csv", header=True, index=True)
    js_df.to_csv("jaccard_similarity_df.csv", header=True, index=True)
    finish = clock()
    print("Running time: %.2f seconds" % (finish - start,))
    print()

    # clear potential large objects from memory prior to running any further analyses
    del doc_sparse_list
    del dat_cs
    del dat_ts
    del dat_js
    del cs_df
    del ts_df
    del js_df

    # run lda analysis
    print("Running LDA with %d topics..." % (LDA_TOPICS,))
    start = clock()
    cuisine_corpus.agglomerate = False
    lda = LdaModel(corpus=cuisine_tfidf[cuisine_corpus], id2word=cuisine_corpus.dictionary,
                   num_topics=LDA_TOPICS, eval_every=None, chunksize=LDA_CHUNKSIZE, iterations=200,
                   passes=2)
    lda.save("lda_cuisines.pyobject")

    # create dense numpy matrix
    cuisine_corpus.agglomerate = True
    rows, cols = len(cuisine_files), LDA_TOPICS
    lda_array = np.zeros(rows * cols).reshape(rows, cols)
    for row, doc in enumerate(cuisine_corpus):
        entries = lda[doc]
        for col, value in entries:
            lda_array[row][col] = value
    with open("lda_array.npy", "wb") as f:
        np.save(f, lda_array)
    finish = clock()
    print("LDA complete!")
    print("Running time: %.2f seconds" % (finish - start,))
    print()

    # calculate similarity for all lda documents and write to file
    print("Calculating LDA similarity matrices...")
    start = clock()
    dat_cs = np.zeros((len(cuisine_types), len(cuisine_types)))
    dat_ts = np.zeros((len(cuisine_types), len(cuisine_types)))
    for ind1 in range(lda_array.shape[0]):
        vec1 = lda_array[ind1, :]
        for ind2 in range(ind1, lda_array.shape[0]):
            vec2 = lda_array[ind2, :]
            csimvalue = vec1.dot(vec2) / np.sqrt(vec1.dot(vec1) * vec2.dot(vec2))
            tsimvalue = vec1.dot(vec2) / (vec1.dot(vec1) + vec2.dot(vec2) - vec1.dot(vec2))
            dat_cs[ind1][ind2] = csimvalue
            dat_ts[ind1][ind2] = tsimvalue
            if ind1 != ind2:
                dat_cs[ind2][ind1] = csimvalue
                dat_ts[ind2][ind1] = tsimvalue
Example 11: print(
# Required module import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
print("fitting the model ...\n")
model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=no_of_topics, passes=no_of_passes,
eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)
#model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=no_of_topics, passes=no_of_passes,
# eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)
print(model, "\n")
topics = model.show_topics(num_topics=no_of_topics)
for item, i in zip(topics, enumerate(topics)):
print("topic #"+str(i[0])+": "+str(item)+"\n")
print("saving ...\n")
if not os.path.exists("out"): os.makedirs("out")
with open("out/"+foldername+"_doclabels.txt", "w") as f:
for item in doc_labels: f.write(item+"\n")
with open("out/"+foldername+"_topics.txt", "w") as f:
for item, i in zip(topics, enumerate(topics)):
f.write("topic #"+str(i[0])+": "+str(item)+"\n")
dictionary.save("out/"+foldername+".dict")
MmCorpus.serialize("out/"+foldername+".mm", corpus)
model.save("out/"+foldername+".lda")
Example 12: create_lda_model
# Required module import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
def create_lda_model():
    logging.info('about to create all docs from chunks')
    start_time = datetime.datetime.now()
    create_all_docs()
    end_time = datetime.datetime.now()
    logging.info('total time is: %s', end_time - start_time)

    logging.info('about to load all docs')
    with open('./resources/LDA_processing/all_docs.pkl', mode='rb') as f:
        all_docs = pickle.load(f)

    logging.info('about to load english words')
    with open('./resources/LDA_input/english_full_list.txt') as f:
        english_words = f.read().splitlines()
    good_english_words = set(english_words[75:21000])
    del english_words

    logging.info('about to remove all stop-words and unknown words')
    texts = []
    for i, doc in enumerate(all_docs):
        filtered_doc = [word for word in doc if word in good_english_words]
        texts.append(filtered_doc)
        if i % 5000 == 0:
            logging.info('Finished doc: %s', i)

    logging.info('about to release memory of all_docs and english_words')
    del all_docs
    del good_english_words

    logging.info('about to save texts')
    with open('./resources/LDA_processing/texts.pkl', mode='wb') as f:
        pickle.dump(texts, f)

    logging.info('about to load texts')
    with open('./resources/LDA_processing/texts.pkl', mode='rb') as f:
        texts = pickle.load(f)

    logging.info('about to create dictionary')
    dictionary = corpora.Dictionary(texts)
    keys = dictionary.keys()
    logging.info('dict size before filter: %s', len(keys))
    dictionary.filter_extremes(keep_n=150000)
    dictionary.filter_extremes(no_below=150, no_above=0.05)
    keys = dictionary.keys()
    logging.info('dict size after filter: %s', len(keys))
    dictionary.save('./resources/LDA_processing/lda.dict')
    dictionary.save_as_text('./resources/LDA_processing/lda_dict.txt')

    logging.info('about to create corpus')
    corpus = [dictionary.doc2bow(text) for text in texts]
    logging.info('about to save corpus as mm file')
    corpora.MmCorpus.serialize('./resources/LDA_processing/corpus.mm', corpus)

    logging.info('about to load dictionary file')
    dictionary = corpora.Dictionary.load('./resources/LDA_processing/lda.dict')
    logging.info('about to load corpus as mm file')
    corpus = corpora.MmCorpus('./resources/LDA_processing/corpus.mm')

    logging.info('about to start LDA model')
    lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)
    logging.info('finished LDA model')

    logging.info('about to save ldaModel')
    lda.save('./resources/LDA_processing/LdaModel')
    logging.info('about to load ldaModel')
    lda = LdaModel.load('./resources/LDA_processing/LdaModel')

    logging.info('about to find topics')
    topics = lda.show_topics(num_topics=num_topics, num_words=10000, log=True, formatted=False)
    logging.info('about to save topics')
    with open('./resources/LDA_processing/topics.pkl', mode='wb') as f:
        pickle.dump(topics, f)

    dict_word_sets = find_words_from_lda_model()
    with open('./resources/LDA_processing/dict_word_sets.pkl', mode='wb') as f:
        pickle.dump(dict_word_sets, f)

    topics_words = extract_words_from_word_sets()
    with open('./resources/LDA_result/topic_words', mode='wt', encoding='utf-8') as f:
        f.write('\n'.join(topics_words))
Example 13: unpickle
# Required module import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
#
# logging.info('combine report and wiki dictionary...')
# wiki_to_report = report_dict.merge_with(wiki_dict)
# merged_dict = report_dict
#
# logging.info('combine report and wiki corpus...')
# merged_corpus = wiki_to_report[wiki_corpus].corpus + report_corpus
logging.info('generate wiki corpus...')
wiki_txt = unpickle('data/txt/processed_wiki.pkl')
wiki_corpus = [report_dict.doc2bow(wiki) for wiki in wiki_txt]
logging.info('combine report and wiki corpus...')
merged_corpus = wiki_corpus + report_corpus
# compute TFIDF
# logging.info('compute TFIDF...')
# tfidf = TfidfModel(dictionary=report_dict, id2word=report_dict)
# perform LDA
logging.info('perform LDA...')
if use_wiki is True:
    lda = LdaModel(corpus=merged_corpus, id2word=report_dict, num_topics=num_topics, passes=passes,
                   iterations=iterations, alpha='auto', chunksize=chunksize)
    lda.save('result/model_wiki.lda')
    lda.print_topics(topics=num_topics, topn=10)
else:
    lda = LdaModel(corpus=report_corpus, id2word=report_dict, num_topics=num_topics, passes=passes,
                   iterations=iterations, alpha='auto', chunksize=chunksize)
    lda.save('result/model.lda')
    lda.print_topics(topics=num_topics, topn=10)
Example 14: DMP
# Required module import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
class DMP(object):
    def __init__(self):
        self.dic = None
        self.lda = None
        self.topic_num = config.getint('dmp', 'topic_num')
        self.corpus_file = config.get('dmp', 'corpus_file')

    @staticmethod
    def __text2doc(iterator, sep=u' '):
        '''Convert text lines to documents.

        Each line is split into a list of words with split().

        Arguments
            sep: separator

        Returns
            The list of tokenized documents.
        '''
        docs = []
        for line in iterator:
            text = line.strip().split(sep)
            docs.append(text)
        return docs

    def __load_corpus(self):
        '''Read the corpus, calling __text2doc to turn each text into a list of words.

        Returns
            The list of processed documents.
        '''
        docs = None
        with codecs.open(self.corpus_file, 'r', 'utf-8') as iterator:
            docs = self.__text2doc(iterator)
        return docs

    def train(self):
        '''Train the model, producing the dictionary (dic) and the model (lda).

        dic: stores the vocabulary; every word has an id and can be looked up as dic[id]
        lda: the model, holding the list of topics. Each topic has an id;
             lda.print_topic(id) returns the words belonging to that topic.
        '''
        docs = self.__load_corpus()
        self.dic = Dictionary(docs)
        bow = [self.dic.doc2bow(doc) for doc in docs]
        self.lda = LdaModel(bow, id2word=self.dic,
                            num_topics=self.topic_num)

    def infer(self, doc):
        '''Infer which topics a new document belongs to.

        Arguments
            doc: the new document, given as a list of words

        Returns
            An iterator over the topic list. Topics are given by id; call
            lda.print_topic to make them human-readable.
        '''
        bow = self.dic.doc2bow(doc)
        topics = self.lda[bow]
        return topics

    def dump(self):
        '''Save the lda model and the dic dictionary.'''
        lda_file = config.get('dmp', 'lda_file')
        dic_file = config.get('dmp', 'dic_file')
        self.lda.save(lda_file)
        self.dic.save(dic_file)

    def load(self):
        '''Load the lda model and the dic dictionary.'''
        lda_file = config.get('dmp', 'lda_file')
        dic_file = config.get('dmp', 'dic_file')
        self.lda = LdaModel.load(lda_file)
        self.dic = Dictionary.load(dic_file)
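A hedged usage sketch for this class; it assumes the config entries dmp.topic_num, dmp.corpus_file, dmp.lda_file, and dmp.dic_file referenced above are already set, and the query document is made up:
dmp = DMP()
dmp.train()   # builds self.dic and self.lda from corpus_file
dmp.dump()    # persists both via LdaModel.save / Dictionary.save
dmp.load()    # later: restore them from lda_file / dic_file
for topic_id, prob in dmp.infer([u'some', u'words']):
    print(dmp.lda.print_topic(topic_id), prob)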
Example 15: UnlabeledCorpus
# Required module import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
vocab = Dictionary.load_from_text('./vocab.txt')
corpus = UnlabeledCorpus('./rumor_train.csv', vocab)
valid_corpus = UnlabeledCorpus('./rumor_valid.csv', vocab)
valid_sentences = [doc for doc in valid_corpus][5000:]

# varying number of topics
# result = {}
# for num_topics in [2, 4, 8, 16, 32, 64]:
#     best_value = -100
#     for i in range(5):
#         model = LdaModel(corpus=corpus, id2word=vocab, num_topics=num_topics)
#         likelihood = model.log_perplexity(valid_sentences)
#         best_value = max(best_value, likelihood)
#     result[num_topics] = best_value
#
# for num_topics, likelihood in result.iteritems():
#     print 'num_topics: %d, best word_likelihood: %f' % (num_topics, likelihood)

model = LdaModel(corpus=corpus, id2word=vocab, num_topics=8, passes=2)
model.save('./lda_model.txt')

# print topics to a file
topics = model.show_topics(num_topics=100, num_words=50)
with codecs.open('./topics.txt', 'w', 'utf-8') as out_f:
    for topic in topics:
        topic_id, topic_str = topic[0], topic[1]
        out_f.write('%d:\n%s\n' % (topic_id, topic_str))
        out_f.write('\n')