本文整理汇总了Python中gensim.models.LdaModel方法的典型用法代码示例。如果您正苦于以下问题:Python models.LdaModel方法的具体用法?Python models.LdaModel怎么用?Python models.LdaModel使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块gensim.models的用法示例。
在下文中一共展示了models.LdaModel方法的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: Execute
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LdaModel [as 别名]
def Execute( self, tokenRegex, numTopics, numPasses ):
    """Train an LDA model on the corpus at ``self.corpusPath`` and persist the results.

    Args:
        tokenRegex: Passed through to ``GensimTermiteCorpusReader``.
        numTopics: Forwarded to ``LdaModel`` as ``num_topics``.
        numPasses: Forwarded to ``LdaModel`` as ``passes``.

    Side effects:
        Creates ``self.modelPath`` if it does not exist, then saves the
        dictionary, the corpus and the model to ``self.dictionaryInGensim``,
        ``self.corpusInGensim`` and ``self.modelInGensim`` respectively.
    """
    output_dir = self.modelPath
    if not os.path.exists( output_dir ):
        os.makedirs( output_dir )

    # Build the gensim corpus reader and prune its vocabulary in place.
    reader = GensimTermiteCorpusReader( self.corpusPath, tokenRegex )
    vocabulary = reader.dictionary
    # no_above = 0.2 is the only explicit cutoff; the other filter_extremes
    # thresholds are left at the library defaults.
    vocabulary.filter_extremes( no_above = 0.2 )

    lda = models.LdaModel( reader, id2word = vocabulary, num_topics = numTopics, passes = numPasses )

    # Persist each artifact, logging its destination first.
    self.logger.info( 'Saving dictionary to disk: %s', self.dictionaryInGensim )
    vocabulary.save( self.dictionaryInGensim )

    self.logger.info( 'Saving corpus to disk: %s', self.corpusInGensim )
    reader.save( self.corpusInGensim )

    self.logger.info( 'Saving model to disk: %s', self.modelInGensim )
    lda.save( self.modelInGensim )
示例2: build_lda_mode
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LdaModel [as 别名]
def build_lda_mode():
    """Train a 10-topic LDA model on the Zhihu item corpus and save it to disk.

    Reads the bag-of-words corpus from ``./zhihu_dat/item.dat`` via
    ``corpora.BleiCorpus`` and writes the trained model to
    ``./zhihu_dat/zhihu_10.lda``.

    NOTE(review): ``dictionary`` is not defined inside this function;
    presumably it is a module-level id-to-word mapping -- confirm it is in
    scope before calling.
    """
    # The corpus is the bag-of-words feature of the question data.
    corpus = corpora.BleiCorpus('./zhihu_dat/item.dat')
    # LDA turns a bag-of-words vector into a topic vector, i.e. it reduces the
    # dimensionality of a document's features.
    lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=10)
    # Save the model so topic features can be computed later without retraining.
    lda_model.save('./zhihu_dat/zhihu_10.lda')
    # Call form of print: valid on Python 3 and prints the same text on Python 2.
    print('Building complete')
示例3: test_model
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LdaModel [as 别名]
def test_model():
    '''
    Load the saved LDA model and print the topic vector of every question.

    Loads the model from ``./zhihu_dat/zhihu_10.lda`` and applies it to
    ``question_corpus``.

    NOTE(review): ``question_corpus`` is not defined inside this function;
    presumably a module-level bag-of-words corpus -- confirm it is in scope.
    '''
    lda_model = models.LdaModel.load('./zhihu_dat/zhihu_10.lda')
    # Transform the question corpus into LDA space and print each document's
    # topic feature.
    question_lda = lda_model[question_corpus]
    for doc in question_lda:
        # Call form of print: valid on Python 3, same output on Python 2.
        print(doc)
示例4: trainModel
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LdaModel [as 别名]
def trainModel(self):
    '''
    Train an LDA model on the whole corpora.

    Steps:
    1. Parse the corpora into unigram token collections and document
       mappings (for later use).
    2. Filter tokens that are too rare (``self.no_below_this_number``) or
       too common (``self.no_above_fraction_of_doc``).
    3. Index the token collections and apply a TF-IDF transformation.
    4. Train ``gensim.models.LdaModel`` on the TF-IDF corpus and print the
       topics.

    Returns:
        tuple: ``(lda, doc_mapping, link_mapping, corpus)`` -- the trained
        model, the two mappings returned by the tokenizer, and the
        bag-of-words corpus.

    NOTE(review): ``path_corpora`` is not defined inside this method;
    presumably a module-level path -- confirm it is in scope.
    '''
    print('Start preparing unigram tokens....')
    # Document count, token lists ([[words_in_1st_doc], [words_in_2nd_doc], ...])
    # and the document/link index mappings.
    doc_count, train_set, doc_mapping, link_mapping = self.__tokenizeWholeCorpora(path_corpora)

    # Build the vocabulary and remember its size before filtering.
    dic = corpora.Dictionary(train_set)
    denominator = len(dic)
    # Drop infrequent and overly common tokens to shrink the term dimension.
    dic.filter_extremes(no_below=self.no_below_this_number, no_above=self.no_above_fraction_of_doc)
    nominator = len(dic)
    corpus = [dic.doc2bow(text) for text in train_set]  # every document as bag-of-words

    # float() forces true division on Python 2 (int/int would truncate to 0 or 1);
    # the guard avoids ZeroDivisionError on an empty vocabulary.
    if denominator:
        reduced_pct = 100.0 * (1.0 - float(nominator) / denominator)
    else:
        reduced_pct = 0.0

    print('There are %i documents in the pool' % (doc_count))
    print('In the corpus there are  %s  raw tokens' % (denominator,))
    print('After filtering, in the corpus there are %s unique tokens, reduced %.2f %%' % (nominator, reduced_pct))
    print('Finished preparing unigram tokens....')

    print('Start training LDA model....')
    # Weight each document with TF-IDF and train LDA on top of that.
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=self.num_topics,
                          iterations=self.num_of_iterations, passes=self.passes)

    # Once training is done, print every topic with its top words.
    print('Finished training LDA model.......Here is the list of all topics & their most frequent words')
    for i in range(self.num_topics):
        print('Topic %s : ' % (str(i)) + lda.print_topic(i))

    # bound() must be evaluated on documents in the term space the model was
    # trained on (the TF-IDF corpus), not on the topic-space output of the model.
    # NOTE(review): this value is the variational bound rather than a true
    # perplexity; the label is kept for output compatibility.
    print('===============================')
    print('Model perplexity :  %s  when topic k = %s' % (lda.bound(corpus_tfidf), str(self.num_topics)))
    print('===============================')
    return lda, doc_mapping, link_mapping, corpus
示例5: LdaModel
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LdaModel [as 别名]
def LdaModel(self):
    """Fit an LDA model on ``self.corpus_simple`` and build a similarity index.

    Calls ``self.simple_model()`` first, then sets ``self.model`` (the LDA
    model), ``self.corpus`` (the corpus transformed by that model) and
    ``self.index`` (a ``MatrixSimilarity`` over the transformed corpus).

    NOTE(review): ``self.corpus_simple`` is presumably populated by
    ``simple_model()`` -- confirm.
    """
    self.simple_model()

    # Fit the transformation model.
    lda = models.LdaModel(self.corpus_simple)
    self.model = lda

    # Project the simple corpus through the fitted model.
    self.corpus = lda[self.corpus_simple]

    # Build the similarity matrix.
    self.index = similarities.MatrixSimilarity(self.corpus)
# Preprocess the newly entered sentence (the sentence to be compared)
示例6: train_lda_model_gensim
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LdaModel [as 别名]
def train_lda_model_gensim(corpus, total_topics=2):
    """Train a gensim LDA model on a TF-IDF weighted version of ``corpus``.

    Args:
        corpus: Raw documents, handed to ``normalize_corpus(..., tokenize=True)``.
        total_topics: Number of topics for the LDA model (default 2).

    Returns:
        The trained ``models.LdaModel``.
    """
    # Normalized, tokenized documents.
    token_lists = normalize_corpus(corpus, tokenize=True)

    # Vocabulary and bag-of-words form of every document.
    vocab = corpora.Dictionary(token_lists)
    bow_docs = list(map(vocab.doc2bow, token_lists))

    # Re-weight the bag-of-words vectors with TF-IDF before training.
    weighting = models.TfidfModel(bow_docs)
    weighted_docs = weighting[bow_docs]

    return models.LdaModel(
        weighted_docs,
        id2word=vocab,
        iterations=1000,
        num_topics=total_topics,
    )
示例7: get_lda
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LdaModel [as 别名]
def get_lda(self, num_topics=100):
    """Return dense LDA topic vectors for every document in ``self.docs``.

    Args:
        num_topics: Number of LDA topics to train (default 100).

    Returns:
        A 2-D numpy array built with ``np.vstack``: one row per document,
        one column per topic (``num_topics`` columns).
    """
    docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
    model_lda = models.LdaModel(docs_corpus, num_topics, id2word=self.docs_dict)
    docs_lda = model_lda[docs_corpus]
    # The model output is indexed by topic id, so the dense length is the
    # number of topics. Using the vocabulary size pads every row with
    # meaningless zeros and fails when the vocabulary is smaller than
    # num_topics.
    docs_vecs = np.vstack([sparse2full(c, num_topics) for c in docs_lda])
    return docs_vecs
# Get Hierarchical Dirichlet Process(HDP) vector for document list
示例8: topic_analysis
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LdaModel [as 别名]
def topic_analysis(corpus, dictionary, models_path, technique):
    """Train and save one or all topic models for ``corpus``.

    Args:
        corpus: Corpus passed as the first argument to each gensim model.
        dictionary: Passed to each model as ``id2word``.
        models_path: Directory prefix for the saved model files.
        technique: ``"hdp"``, ``"ldap"``, ``"lsa"``, ``"ldao"``, ``"lda"``
            or ``"all"`` to train every model in that order. Any other
            value trains nothing.

    Side effects:
        Saves each trained model to ``<models_path>/<prefix>_<uuid4>`` and
        prints the training time of each model to stdout.
    """
    import uuid

    # Separate name so the local does not shadow the ``uuid`` module.
    run_id = str(uuid.uuid4())
    print("[BLOCK] Starting models for context")
    sys.stdout.flush()

    # (technique key, file prefix, label used in the timing message, trainer).
    # Trainers are lambdas so a model class is only looked up when selected.
    recipes = (
        ("hdp", "hdp", "HDP model",
         lambda: HdpModel(corpus, id2word=dictionary)),
        ("ldap", "lda_parallel", "LDA multicore",
         lambda: LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=23, passes=20)),
        ("lsa", "lsa", "LSA",
         lambda: LsiModel(corpus, id2word=dictionary, num_topics=400)),
        ("ldao", "lda_online", "LDA online",
         lambda: LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=1,
                          chunksize=10000, passes=5)),
        ("lda", "lda_offline", "LDA offline",
         lambda: LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=0, passes=20)),
    )

    for key, prefix, label, train in recipes:
        if technique != "all" and technique != key:
            continue
        t1 = time()
        model = train()
        model.save("%s/%s_%s" % (models_path, prefix, run_id))
        # Drop the reference before the next model trains, so two models are
        # never held at once under technique == "all".
        del model
        t2 = time()
        print("[BLOCK] Training time for %s: %s" % (label, round(t2 - t1, 2)))
        sys.stdout.flush()
示例9: CallTransformationModel
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LdaModel [as 别名]
def CallTransformationModel(self, Dict, Bowvec, **kwarg):
    '''Invoke a specific transformation model of the Gensim module.

    # Arguments:
        Dict: Dictionary made from all tokenized news (articles/documents).
        Bowvec: Bow-vector created from all tokenized news (articles/documents).
        modelType: Transformation model type: 'lsi', 'lda' or 'None'
            ('None' means the TF-IDF model only).
        tfDim: Number of topics to extract; only read when an LSI/LDA model
            is trained.
        renewModel: Re-train the transformation models or not (bool).
        modelPath: Path prefix under which trained models are saved. File
            names are appended by plain string concatenation, so it should
            end with a path separator.

    # Returns:
        (tfidfVec, modelVec): the TF-IDF transformed corpus and the corpus
        transformed by the selected model. For modelType 'None' both are the
        TF-IDF corpus.

    # Raises:
        ValueError: if modelType is not 'lsi', 'lda' or 'None'.
        KeyError: if a required keyword argument is missing.

    Models are always saved to modelPath + "tfidf_model.tfidf",
    "lsi_model.lsi" or "lda_model.lda". The cached (renewModel=False) path
    loads from the same files, so a renewed model is what a later cached
    call picks up.
    '''
    model_type = kwarg['modelType']
    # Fail early with a clear message instead of hitting an unbound local at
    # the return statement after the expensive TF-IDF work.
    if model_type not in ('lsi', 'lda', 'None'):
        raise ValueError("Unsupported modelType: %r (expected 'lsi', 'lda' or 'None')" % (model_type,))

    renew = kwarg['renewModel']
    model_dir = kwarg['modelPath']

    # TF-IDF stage: retrain when asked to or when no saved model exists.
    tfidf_path = model_dir + "tfidf_model.tfidf"
    if renew or not os.path.exists(tfidf_path):
        tfidf = models.TfidfModel(Bowvec)  # initialize tfidf model
        tfidf.save(tfidf_path)
    else:
        tfidf = models.TfidfModel.load(tfidf_path)
    tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus

    if model_type == 'None':
        return tfidfVec, tfidfVec

    if model_type == 'lsi':
        model_cls = models.LsiModel
        model_path = model_dir + "lsi_model.lsi"
    else:
        model_cls = models.LdaModel
        model_path = model_dir + "lda_model.lda"

    # Topic-model stage: same retrain-or-load rule as the TF-IDF stage.
    if renew or not os.path.exists(model_path):
        model = model_cls(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])
        model.save(model_path)
    else:
        model = model_cls.load(model_path)

    # Wrapper over the original corpus: bow -> tfidf -> lsi/lda. For LDA each
    # document gets a sparse vector of topic weights.
    modelVec = model[tfidfVec]
    return tfidfVec, modelVec
示例10: gensim_lda_topic_modelling
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import LdaModel [as 别名]
def gensim_lda_topic_modelling(path, documents, num_of_topics=6, passes=50, verbose=True, plotTopicsResults=True):
    """Train an LDA model on ``documents`` and report/plot its topics.

    Args:
        path: Passed through to the plotting helpers.
        documents: List of tokenized documents (each a list of tokens).
        num_of_topics: Number of LDA topics (default 6).
        passes: Number of training passes (default 50).
        verbose: When true, print the inputs, the topic words and the
            per-document topic information.
        plotTopicsResults: When true, call ``plot_top_10_words_per_topic``
            and ``plot_share_of_topics``.

    Returns:
        None.

    NOTE(review): the original indentation was lost. This body assumes the
    per-document verbose prints sit inside the document loop and that the
    final ``color_words`` loop runs regardless of ``plotTopicsResults`` --
    confirm against the original source.
    """
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    if verbose:
        print("Cleaned documents:\n", documents)
        print("\nDictionary:\n", dictionary)
        print("\nCorpus in BoW form: \n", corpus)

    start = time.time()
    ldamodel = LdaModel(corpus=corpus, num_topics=num_of_topics, passes=passes, id2word=dictionary)
    end = time.time()
    print("Completion time for building LDA model: %.3f s = %.3f min" % ((end - start), (end - start) / 60.0))

    # Ask for every trained topic; show_topics returns only 10 by default.
    ldatopics = ldamodel.show_topics(num_topics=num_of_topics, formatted=False)
    ldatopics_words = [[[word, prob] for word, prob in topic] for _, topic in ldatopics]
    if verbose:
        print("\nList of words associated with each topic:\n")
        for topic_index, word_probs in enumerate(ldatopics_words):
            print("\nTopic %d:\n" % topic_index)
            for w, p in word_probs:
                print(p, " - ", w)

    if plotTopicsResults:
        # Follow the number of topics actually returned instead of a fixed 6.
        plot_top_10_words_per_topic(path, ldatopics_words, num_topics=len(ldatopics_words), num_top_words=10)

    all_doc_topics = []
    for doc_topics, word_topics, phi_values in ldamodel.get_document_topics(corpus, per_word_topics=True):
        # Keep only the weights of the (topic_id, weight) pairs.
        all_doc_topics.append([weight for _, weight in doc_topics])
        if verbose:
            print('Document topics:', doc_topics)
            print('Word topics:', word_topics)
            print('Phi values:', phi_values)
            print('-------------- \n')

    if plotTopicsResults:
        plot_share_of_topics(path, all_doc_topics, no_random_tweets=10)

    # Plot words coloured differently depending on the topic
    for doc in documents[0:100]:
        if len(doc) > 4:
            color_words(ldamodel, doc)