

Python LdaModel.save Method Code Examples

This article collects typical usage examples of the Python method gensim.models.LdaModel.save. If you are unsure what LdaModel.save does, how to call it, or what real-world uses look like, the curated examples below may help. You can also explore other usage examples of the class it belongs to, gensim.models.LdaModel.


The following presents 15 code examples of the LdaModel.save method, ordered by popularity by default.
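For orientation, here is a minimal sketch of the train/save/load round trip in gensim; the toy documents and the file name example.lda are illustrative assumptions rather than code taken from the examples below.

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [['human', 'machine', 'interface'], ['graph', 'minors', 'survey']]

dictionary = Dictionary(texts)                         # map tokens to integer ids
corpus = [dictionary.doc2bow(text) for text in texts]  # bag-of-words vectors

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)
lda.save('example.lda')                                # persist the model to disk

restored = LdaModel.load('example.lda')                # restore it later
print(restored.print_topics())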

Example 1: create_lda_model

# Required import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
# Also requires: import os
def create_lda_model(project, corpus, id2word, name, use_level=True, force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level

    model_fname += '.lda.gz'

    if not os.path.exists(model_fname) or force:
        if corpus:
            update_every=None # run in batch if we have a pre-supplied corpus
        else:
            update_every=1

        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         alpha=project.alpha,
                         eta=project.eta,
                         passes=project.passes,
                         num_topics=project.num_topics,
                         iterations=project.iterations,
                         eval_every=None, # disable perplexity tests for speed
                         update_every=update_every,
                         )

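        # persist the model only when it was actually trained on a corpus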
        if corpus:
            model.save(model_fname)
    else:
        model = LdaModel.load(model_fname)

    return model, model_fname
Author: cscorley, Project: changeset-feature-location, Lines: 33, Source: main.py

Example 2: create_model

# Required import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
# Also requires: import os; from gensim.corpora import Dictionary, MalletCorpus
def create_model(config, Kind):
    model_fname = config.model_fname % Kind.__name__
    corpus_fname = config.corpus_fname % Kind.__name__

    if not os.path.exists(model_fname):
        try:
            id2word = Dictionary.load(corpus_fname + '.dict')
            corpus = MalletCorpus(corpus_fname, id2word=id2word)
            logger.info('Opened previously created corpus: %s' % corpus_fname)
        except Exception:
            error('Corpora for building file models not found!')

        file_model = LdaModel(corpus,
                              id2word=corpus.id2word,
                              alpha=config.alpha,
                              passes=config.passes,
                              num_topics=config.num_topics)

        file_model.save(model_fname)
Author: cscorley, Project: mud2014-modeling-changeset-topics, Lines: 21, Source: main.py

Example 3: lda

# Required import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
# Also requires: import codecs; from gensim import corpora
def lda():
    # remove stop words
    stopwords = codecs.open('../conf/stop_words_ch.txt', mode='r', encoding='utf8').readlines()
    stopwords = [ w.strip() for w in stopwords ]
    
    fp = codecs.open('D:\\nlp\\corpora\\segs.txt', mode='r', encoding='utf8')
    train = []
    for line in fp:
        line = line.split()
        train.append([ w for w in line if w not in stopwords ])
    
    dictionary = corpora.Dictionary(train)
    corpus = [ dictionary.doc2bow(text) for text in train ]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
    
    lda.print_topics(30)
    # print topic id=20
    lda.print_topic(20)
    
    # save/load model
    lda.save('D:\\nlp\\corpora\\news.model')
Author: xialei, Project: poc, Lines: 23, Source: news.py

Example 4: upload_file

# Required import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
# Also requires: import os; import numpy as np; import matplotlib.pyplot as plt;
# from gensim.corpora import MmCorpus; from flask import render_template

#......... part of the code omitted here .........
    # corpus = glob.glob("swcorpus/*")

    if not os.path.exists("out"):
        os.makedirs("out")
    # if not os.path.exists(os.path.join(os.path.join(os.getcwd(),
    # 'out'), foldername)): os.makedirs(os.path.join
    # (os.path.join(os.getcwd(), 'out'), foldername))

    MmCorpus.serialize(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus.mm'])), corpus)
    mm = MmCorpus('out/corpus.mm')

    print(mm)

    # doc_labels = glob.glob("corpus/*")

    print("fitting the model ...\n")

    model = LdaModel(
        corpus=mm, id2word=dictionary, num_topics=no_of_topics,
        passes=no_of_passes, eval_every=eval, chunksize=chunk,
        alpha=alpha, eta=eta)

    # model = LdaMulticore(corpus=corpus, id2word=dictionary,
    # num_topics=no_of_topics, passes=no_of_passes,
    # eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)

    print(model, "\n")

    topics = model.show_topics(num_topics=no_of_topics)

    for i, item in enumerate(topics):
        print("topic #" + str(i) + ": " + str(item) + "\n")

    print("saving ...\n")

    if not os.path.exists("out"):
        os.makedirs("out")
    # if not os.path.exists(os.path.join(os.path.join(os.getcwd(),
    # 'out'), foldername)):
    # os.makedirs(os.path.join(os.path.join(os.getcwd(), 'out'),
    # foldername))

    with open(
        os.path.join(os.path.join(os.getcwd(), "out"), ''.join(
            ["corpus_doclabels.txt"])), "w", encoding="utf-8") as f:
            for item in doc_labels:
                f.write(item + "\n")

    with open(
        os.path.join(os.path.join(os.getcwd(), "out"), ''.join(
            ["corpus_topics.txt"])), "w", encoding="utf-8") as f:
        for i, item in enumerate(topics):
            f.write(
                "".join(["topic #", str(i), ": ", str(item), "\n"]))

    dictionary.save(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus', 'dict'])))
    # MmCorpus.serialize(
    # os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
    # [foldername, 'mm'])), corpus)
    model.save(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus', 'lda'])))

    print("\n ta-daaaa ...\n")
    
    # VISUALIZATION
    no_of_topics = model.num_topics
    no_of_docs = len(doc_labels)
    doc_topic = np.zeros((no_of_docs, no_of_topics))
    
    for i, doc in enumerate(corpus):
        # topic_dist is a list of (topic_id, topic_prob) tuples
        topic_dist = model[doc]
        for topic in topic_dist:
            doc_topic[i][topic[0]] = topic[1]
    
    # get plot labels
    topic_labels = []
    for i in range(no_of_topics):
        # show_topic() returns (word, probability) tuples
        topic_terms = [x[0] for x in model.show_topic(i, topn=3)]
        topic_labels.append(" ".join(topic_terms))
        
    # cf. https://de.dariah.eu/tatom/topic_model_visualization.html

    if no_of_docs > 20 or no_of_topics > 20:
        plt.figure(figsize=(20, 20)) # if many items, enlarge figure
    plt.pcolor(doc_topic, norm=None, cmap='Reds')
    plt.yticks(np.arange(doc_topic.shape[0])+1.0, doc_labels)
    plt.xticks(
        np.arange(doc_topic.shape[1])+0.5, topic_labels, rotation='90')
    plt.gca().invert_yaxis()
    plt.colorbar(cmap='Reds')
    plt.tight_layout()
    plt.savefig("./static/corpus_heatmap.svg")
    return render_template('success.html')
Author: pielstroem, Project: Topics, Lines: 104, Source: demo.py

Example 5: dictionary

# Required import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
# Also requires: from gensim.corpora import MmCorpus
print 'Saving dictionary (%s)...' % DICT
dictionary.save(DICT)

print 'Building bag-of-words corpus ...'
bow_corpus = [ dictionary.doc2bow(t) for t in texts ]

print 'Serializing corpus (%s) ...' % BOW
MmCorpus.serialize(BOW, bow_corpus)

size = len(bow_corpus) * 4 // 5  # hold out the final fifth for evaluation
training = bow_corpus[:size]
testing = bow_corpus[size:]

print 'Training LDA w/ %d topics on first %d texts ...' % (Num_Topics, len(training))
lda = LdaModel(training, id2word=dictionary, num_topics=Num_Topics, passes=5, iterations = 1000)

print 'Saving LDA model (%s) ...' % NSFLDA
lda.save(NSFLDA)

print 'Random subset of topics:'
print '\n'.join(lda.print_topics())

print 'Computing perplexity on %d held-out documents ...' % len(testing)
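# log_perplexity() returns a per-word likelihood bound; gensim defines perplexity as 2**(-bound)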
perplexity = 2 ** -(lda.log_perplexity(testing))
print 'Perplexity: %.2f' % perplexity




Author: voronoi, Project: TopicModelling, Lines: 27, Source: LDA_v4.py

Example 6: run

# Required import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
# Also requires: from gensim import corpora
    def run(lda_model_path, corpus_path, num_topics, id2word):
        corpus = corpora.BleiCorpus(corpus_path)
        lda = LdaModel(corpus, num_topics=num_topics, id2word=id2word)
        lda.save(lda_model_path)

        return lda
Author: jodzi, Project: flavor_fave, Lines: 8, Source: wine_analysis.py

Example 7: print

# Required import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
# Also requires: import os
# try with BoW vectors too?



# we will use Latent Dirichlet Allocation to try to categorize the abstracts
# this is slow the first time you run it, while the model is trained
print("lda")
lda_filename = 'model.lda'
if not os.path.isfile(lda_filename):
    lda = LdaModel(corpus, num_topics=5,
                   id2word=dictionary,
                   update_every=5,
                   chunksize=10000,
                   passes=100)
    lda.save(lda_filename)
else:
    lda = LdaModel.load(lda_filename)
lda.show_topics()
topics_matrix = lda.show_topics(formatted=False, num_words=7)

print(topics_matrix)
print(len(topics_matrix))

for topic_id, word_probs in topics_matrix:
    # each entry is (topic_id, [(word, probability), ...])
    print([str(word) for word, prob in word_probs])
#
# topics_matrix = np.array(topics_matrix)
#
# topic_words = topics_matrix[:, :, 1]
Author: rafunchik, Project: shrimps, Lines: 32, Source: docs.py

Example 8: W2V_cpp2

# Required import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
class W2V_cpp2(W2V_base):
    def __init__(self,n_topic, path, folder):
        self.n_topic = n_topic
        W2V_base.__init__(self, path, folder)

        #process dict
        for prod_id in list(self.idx2prod.keys()):  # copy the keys, since the dict is mutated below
            prod = self.idx2prod[prod_id]
            n_prod_id = prod_id - len(self.word_count) - 1
            del self.idx2prod[prod_id]
            self.idx2prod[n_prod_id] = prod
            self.prod2idx[prod] = n_prod_id

        for user_id in list(self.idx2user.keys()):  # copy the keys, since the dict is mutated below
            user = self.idx2user[user_id]
            n_user_id = user_id - len(self.word_count) - len(self.prod2idx) - 1
            del self.idx2user[user_id]
            self.idx2user[n_user_id] = user
            self.user2idx[user] = n_user_id

    def train(self):
        data = []
        entity2id = {}
        id2entity = []

        for obj in self.data:
            doc = []
            obj_sents = obj["text_data"]
            entity = obj["prod"]
            if entity not in entity2id:
                entity2id[entity] = len(entity2id)
                id2entity.append(entity)
            doc_id = entity2id[entity]

            for obj_sent in obj_sents:
                for pair in obj_sent:
                    if pair[0] >= 0:
                        doc.append((pair[0], doc_id))
            data.append(doc)

        self.ldamodel = LdaModel(corpus=data, id2word=self.idx2word, num_topics=self.n_topic)

        f_entity = open("lda/prod.txt", "w")
        f_model = open("lda/model.txt", "w")
        f_model.write(str(len(entity2id)))
        f_model.write(" ")
        f_model.write(str(self.n_topic))
        f_model.write("\n")

        for entity in id2entity:
            f_entity.write(entity)
            f_entity.write("\n")

            f_model.write(entity)
            f_model.write(" ")

            distr = self.ldamodel.get_document_topics(data[entity2id[entity]], minimum_phi_value=0, minimum_probability=0)  # topic distribution for this entity's document
            distr = [pair[1] for pair in distr]

            for prod in distr:
                f_model.write(str(prod))
                f_model.write(" ")

            f_model.write("\n")

        f_entity.close()
        f_model.close()

        self.ldamodel.save("lda/model_200")
Author: Sanqiang, Project: entity2vector, Lines: 70, Source: lda.py

Example 9: zip

# Required import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
# Also requires: import os
log.info('generated topics...')

# print topics
topics = model.show_topics(num_topics=no_of_topics)

for i, item in enumerate(topics):
    log.info('topic #%s: %s', i, item)


log.info('saving results...')

# create output folder
if not os.path.exists("out"): os.makedirs("out")

# save doc_labels for further use
with open(os.path.join(os.path.join(os.getcwd(), "out"), ''.join([foldername, "_doclabels.txt"])), "w", encoding="utf-8") as f:
    for item in doc_labels: f.write(item+"\n")

# save topics for further use
with open(os.path.join(os.path.join(os.getcwd(), "out"), ''.join([foldername, "_topics.txt"])), "w", encoding="utf-8") as f:
    for i, item in enumerate(topics):
        f.write("".join(["topic #", str(i), ": ", str(item), "\n"]))

# save dictionary for further use
dictionary.save(os.path.join(os.path.join(os.getcwd(), "out"), '.'.join([foldername, 'dict'])))

# save model for further use
model.save(os.path.join(os.path.join(os.getcwd(), "out"), '.'.join([foldername, 'lda'])))

log.info('topic modeling finished')
Author: pielstroem, Project: Topics, Lines: 32, Source: lda2.py

Example 10: main

# Required import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
# Also requires: import numpy as np; import pandas as pd; from time import clock

#......... part of the code omitted here .........
            dat_cs[ind1][ind2] = csimvalue
            dat_ts[ind1][ind2] = tsimvalue
            dat_js[ind1][ind2] = jsimvalue
            if ind1 != ind2:
                dat_cs[ind2][ind1] = csimvalue
                dat_ts[ind2][ind1] = tsimvalue
                dat_js[ind2][ind1] = jsimvalue
    cs_df = pd.DataFrame(dat_cs, index = cuisine_types, columns = cuisine_types)
    ts_df = pd.DataFrame(dat_ts, index = cuisine_types, columns = cuisine_types)
    js_df = pd.DataFrame(dat_js, index = cuisine_types, columns = cuisine_types)
    cs_df.to_csv("cosine_similarity_df.csv", header = True, index = True)
    ts_df.to_csv("tanimoto_similarity_df.csv", header = True, index = True)
    js_df.to_csv("jaccard_similarity_df.csv", header = True, index = True)
    finish = clock()
    print("Running time: %.2f seconds" % (finish - start,))
    print()
    
    # clear potential large objects from memory prior to running any further analyses
    del doc_sparse_list
    del dat_cs
    del dat_ts
    del dat_js
    del cs_df
    del ts_df
    del js_df
   
    # run lda analysis
    print("Running LDA with %d topics..." % (LDA_TOPICS,))
    start = clock()
    cuisine_corpus.agglomerate = False
    lda = LdaModel(corpus = cuisine_tfidf[cuisine_corpus], id2word = cuisine_corpus.dictionary, \
                   num_topics = LDA_TOPICS, eval_every = None, chunksize = LDA_CHUNKSIZE, iterations = 200, \
                   passes = 2)
    lda.save("lda_cuisines.pyobject")
    
    # create dense numpy matrix
    cuisine_corpus.agglomerate = True
    rows, cols = len(cuisine_files), LDA_TOPICS
    lda_array = np.zeros(rows * cols).reshape(rows, cols)
    for row, doc in enumerate(cuisine_corpus):
        entries = lda[doc]
        for col, value in entries:
            lda_array[row][col] = value
    with open("lda_array.npy", "wb") as f:
        np.save(f, lda_array)
    finish = clock()
    print("LDA complete!")
    print("Running time: %.2f seconds" % (finish - start,))
    print()
   
    # calculate similarity for all lda documents and write to file
    print("Calculating LDA similarity matrices...")
    start = clock()
    dat_cs = np.zeros((len(cuisine_types), len(cuisine_types)))
    dat_ts = np.zeros((len(cuisine_types), len(cuisine_types)))
    for ind1 in range(lda_array.shape[0]):
        vec1 = lda_array[ind1,:]
        for ind2 in range(ind1, lda_array.shape[0]):
            vec2 = lda_array[ind2,:]
            csimvalue = vec1.dot(vec2) / np.sqrt(vec1.dot(vec1) * vec2.dot(vec2))
            tsimvalue = vec1.dot(vec2) / (vec1.dot(vec1) + vec2.dot(vec2) - vec1.dot(vec2))
            dat_cs[ind1][ind2] = csimvalue
            dat_ts[ind1][ind2] = tsimvalue
            if ind1 != ind2:
                dat_cs[ind2][ind1] = csimvalue
                dat_ts[ind2][ind1] = tsimvalue
Author: btcross26, Project: Data-Mining-Capstone-Project, Lines: 70, Source: Cuisine_Similarity_Analysis.py

Example 11: print

# Required import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
# Also requires: import os; from gensim.corpora import MmCorpus
print("fitting the model ...\n")

model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=no_of_topics, passes=no_of_passes,
                 eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)

#model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=no_of_topics, passes=no_of_passes,
#                 eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)

print(model, "\n")

topics = model.show_topics(num_topics=no_of_topics)

for i, item in enumerate(topics):
    print("topic #" + str(i) + ": " + str(item) + "\n")


print("saving ...\n")

if not os.path.exists("out"): os.makedirs("out")

with open("out/"+foldername+"_doclabels.txt", "w") as f:
    for item in doc_labels: f.write(item+"\n")

with open("out/"+foldername+"_topics.txt", "w") as f:
    for i, item in enumerate(topics):
        f.write("topic #" + str(i) + ": " + str(item) + "\n")

dictionary.save("out/"+foldername+".dict")
MmCorpus.serialize("out/"+foldername+".mm", corpus)
model.save("out/"+foldername+".lda")
Author: severinsimmler, Project: dariah-nlp-tutorial, Lines: 32, Source: lda.py

Example 12: create_lda_model

# Required import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
# Also requires: import datetime, logging, pickle; from gensim import corpora
def create_lda_model():
    logging.info('about to create all docs from chunks')
    start_time = datetime.datetime.now()
    create_all_docs()
    end_time = datetime.datetime.now()
    logging.info('total time is: %s', end_time - start_time)

    logging.info('about to load all docs')
    with open('./resources/LDA_processing/all_docs.pkl', mode='rb') as f:
        all_docs = pickle.load(f)

    logging.info('about to load english words')
    with open('./resources/LDA_input/english_full_list.txt') as f:
        english_words = f.read().splitlines()

    good_english_words = set(english_words[75:21000])
    del english_words
    logging.info('about to remove all stop-words and unknown words')
    texts = []
    for i, doc in enumerate(all_docs):
        filtered_doc = [word for word in doc if word in good_english_words]
        texts.append(filtered_doc)
        if i % 5000 == 0:
            logging.info('Finished doc: %s', i)

    logging.info('about to release memory of all_docs and english_words')
    del all_docs
    del good_english_words

    logging.info('about to save texts')
    with open('./resources/LDA_processing/texts.pkl', mode='wb') as f:
        pickle.dump(texts, f)

    logging.info('about to load texts')
    with open('./resources/LDA_processing/texts.pkl', mode='rb') as f:
        texts = pickle.load(f)

    logging.info('about to create dictionary')
    dictionary = corpora.Dictionary(texts)
    keys = dictionary.keys()
    logging.info('dict size before filter: %s', len(keys))
    dictionary.filter_extremes(keep_n=150000)
    dictionary.filter_extremes(no_below=150, no_above=0.05)
    keys = dictionary.keys()
    logging.info('dict size after filter: %s', len(keys))
    dictionary.save('./resources/LDA_processing/lda.dict')
    dictionary.save_as_text('./resources/LDA_processing/lda_dict.txt')

    logging.info('about to create corpus')
    corpus = [dictionary.doc2bow(text) for text in texts]

    logging.info('about to save corpus as mm file')
    corpora.MmCorpus.serialize('./resources/LDA_processing/corpus.mm', corpus)

    logging.info('about to load dictionary file')
    dictionary = corpora.Dictionary.load('./resources/LDA_processing/lda.dict')

    logging.info('about to load corpus as mm file')
    corpus = corpora.MmCorpus('./resources/LDA_processing/corpus.mm')

    logging.info('about to start LDA model')
    lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)
    logging.info('finished LDA model')

    logging.info('about to save ldaModel')
    lda.save('./resources/LDA_processing/LdaModel')

    logging.info('about to load ldaModel')
    lda = LdaModel.load('./resources/LDA_processing/LdaModel')

    logging.info('about to find topics')
    topics = lda.show_topics(num_topics=num_topics, num_words=10000, log=True, formatted=False)

    logging.info('about to save topics')
    with open('./resources/LDA_processing/topics.pkl', mode='wb') as f:
        pickle.dump(topics, f)

    dict_word_sets = find_words_from_lda_model()
    with open('./resources/LDA_processing/dict_word_sets.pkl', mode='wb') as f:
        pickle.dump(dict_word_sets, f)

    topics_words = extract_words_from_word_sets()
    with open('./resources/LDA_result/topic_words', mode='wt', encoding='utf-8') as f:
        f.write('\n'.join(topics_words))
Author: uriklarman, Project: TreasureHunter, Lines: 86, Source: process_documents.py

Example 13: unpickle

# Required import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
# Also requires: import logging
        #
        # logging.info('combine report and wiki dictionary...')
        # wiki_to_report = report_dict.merge_with(wiki_dict)
        # merged_dict = report_dict
        #
        # logging.info('combine report and wiki corpus...')
        # merged_corpus = wiki_to_report[wiki_corpus].corpus + report_corpus
        logging.info('generate wiki corpus...')
        wiki_txt = unpickle('data/txt/processed_wiki.pkl')
        wiki_corpus = [report_dict.doc2bow(wiki) for wiki in wiki_txt]

        logging.info('combine report and wiki corpus...')
        merged_corpus = wiki_corpus + report_corpus

    # compute TFIDF
    # logging.info('compute TFIDF...')
    # tfidf = TfidfModel(dictionary=report_dict, id2word=report_dict)

    # perform LDA
    logging.info('perform LDA...')
    if use_wiki is True:
        lda = LdaModel(corpus=merged_corpus, id2word=report_dict, num_topics=num_topics, passes=passes,
                       iterations=iterations, alpha='auto', chunksize=chunksize)
        lda.save('result/model_wiki.lda')
        lda.print_topics(num_topics=num_topics, num_words=10)
    else:
        lda = LdaModel(corpus=report_corpus, id2word=report_dict, num_topics=num_topics, passes=passes,
                       iterations=iterations, alpha='auto', chunksize=chunksize)
        lda.save('result/model.lda')
        lda.print_topics(num_topics=num_topics, num_words=10)
Author: andresportocarrero, Project: FinancialReportMining, Lines: 32, Source: LDA.py

Example 14: DMP

# Required import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
# Also requires: import codecs; from gensim.corpora import Dictionary
class DMP(object):

    def __init__(self):
        self.dic = None
        self.lda = None
        self.topic_num = config.getint('dmp', 'topic_num')
        self.corpus_file = config.get('dmp', 'corpus_file')

    @staticmethod
    def __text2doc(iterator, sep=u' '):
        '''Convert text into documents.
        Each line is split into a list of words via split().

        Args:
            sep: the separator character

        Returns:
            the list of tokenized documents
        '''
        docs = []
        for line in iterator:
            text = line.strip().split(sep)
            docs.append(text)
        return docs

    def __load_corpus(self):
        '''Load the corpus, converting each line of text into a word list
        via __text2doc.

        Returns:
            the list of processed documents
        '''
        docs = None
        with codecs.open(self.corpus_file, 'r', 'utf-8') as iterator:
            docs = self.__text2doc(iterator)
        return docs

    def train(self):
        '''Train the model, producing two objects: a dictionary (dic) and a model (lda).

        dic: stores the vocabulary; each word has an integer id, and dic[id]
             retrieves the word
        lda: the model, holding the list of topics; each topic has an id, and
             lda.print_topic(id) returns the word list of that topic
        '''
        docs = self.__load_corpus()
        self.dic = Dictionary(docs)
        bow = [self.dic.doc2bow(doc) for doc in docs]
        self.lda = LdaModel(bow, id2word=self.dic,
                            num_topics=self.topic_num)

    def infer(self, doc):
        '''Infer which topics a new document belongs to.

        Args:
            doc: the new document, given as a list of words

        Returns:
            an iterator over the topic list; topics are given by id, so call
            lda.print_topic to render them in human-readable form
        '''
        bow = self.dic.doc2bow(doc)
        topics = self.lda[bow]
        return topics

    def dump(self):
        '''Save the lda model and the dic dictionary.
        '''
        lda_file = config.get('dmp', 'lda_file')
        dic_file = config.get('dmp', 'dic_file')
        self.lda.save(lda_file)
        self.dic.save(dic_file)

    def load(self):
        '''Load the lda model and the dic dictionary.
        '''
        lda_file = config.get('dmp', 'lda_file')
        dic_file = config.get('dmp', 'dic_file')
        self.lda = LdaModel.load(lda_file)
        self.dic = Dictionary.load(dic_file)
Author: npiaq, Project: dmp, Lines: 80, Source: dmp.py
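A minimal usage sketch for this class (it assumes the dmp section of the config file defines topic_num, corpus_file, lda_file, and dic_file, which the code above reads; the sample tokens are illustrative):

dmp = DMP()
dmp.train()   # build the dictionary and fit the LDA model
dmp.dump()    # persist both via LdaModel.save and Dictionary.save

dmp2 = DMP()
dmp2.load()   # restore the saved model and dictionary
for topic_id, prob in dmp2.infer([u'topic', u'model', u'words']):
    print(dmp2.lda.print_topic(topic_id))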

Example 15: UnlabeledCorpus

# Required import: from gensim.models import LdaModel [as alias]
# Or: from gensim.models.LdaModel import save [as alias]
# Also requires: import codecs; from gensim.corpora import Dictionary
    vocab = Dictionary.load_from_text('./vocab.txt')
    corpus = UnlabeledCorpus('./rumor_train.csv', vocab)
    valid_corpus = UnlabeledCorpus('./rumor_valid.csv', vocab)
    valid_sentences = [doc for doc in valid_corpus][5000:]

    # varing number of topics
    # result = {}
    # for num_topics in [2, 4, 8, 16, 32, 64]:
    #     best_value = -100
    #     for i in range(5):
    #         model = LdaModel(corpus=corpus, id2word=vocab, num_topics=num_topics)
    #         likelihood = model.log_perplexity(valid_sentences)
    #         best_value = max(best_value, likelihood)
    #     result[num_topics]= best_value
    #
    # for num_topics, likelihood in result.iteritems():
    #     print 'num_topics: %d, best word_likelihood: %f' % (num_topics, likelihood)

    model = LdaModel(corpus=corpus, id2word=vocab, num_topics=8, passes=2)
    model.save('./lda_model.txt')
    # print topics to a file
    topics = model.show_topics(num_topics=100, num_words=50)
    with codecs.open('./topics.txt', 'w', 'utf-8') as out_f:
        for topic in topics:
            topic_id, topic_str = topic[0], topic[1]
            out_f.write('%d:\n%s\n' % (topic_id, topic_str))
        out_f.write('\n')



Author: zhuwenya, Project: Rumor_Detection, Lines: 29, Source: run.py


Note: the gensim.models.LdaModel.save examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects; copyright of the source code remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not repost without permission.