This article collects typical usage examples of the gensim.models.LdaModel class in Python. If you are wondering what LdaModel is for or how to use it, the curated class examples here may help.
The 15 LdaModel code examples below are sorted by popularity by default. You can upvote the ones you like or find useful; your ratings help the system recommend better Python code examples.
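For orientation, here is a minimal, self-contained run of LdaModel on a toy corpus before the collected examples; the texts and parameters below are purely illustrative:

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [["human", "machine", "interface"],
         ["graph", "trees", "minors"],
         ["human", "graph", "survey"]]          # toy documents, already tokenized
dictionary = Dictionary(texts)                  # word <-> id mapping
corpus = [dictionary.doc2bow(text) for text in texts]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=5)
print(lda.print_topics())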
Example 1: lda
def lda(docs, k):
    """Latent Dirichlet allocation topic model.

    Uses Gensim's LdaModel after tokenizing using scikit-learn's
    TfidfVectorizer.

    Parameters
    ----------
    k : integer
        Number of topics.
    """
    from gensim.matutils import Sparse2Corpus
    from gensim.models import LdaModel

    # Use a scikit-learn vectorizer rather than Gensim's equivalent
    # for speed and consistency with LSA and k-means.
    vect = _vectorizer()
    corpus = vect.fit_transform(fetch(d) for d in docs)
    corpus = Sparse2Corpus(corpus)
    model = LdaModel(corpus=corpus, num_topics=k)

    topics = model.show_topics(formatted=False)
    vocab = vect.get_feature_names()
    #return [(vocab[int(idx)], w) for topic in topics for w, idx in topic]
    return [[(vocab[int(idx)], w) for w, idx in topic] for topic in topics]
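If you adapt this example, note two details: scikit-learn vectorizers return a documents-by-terms matrix while Sparse2Corpus assumes documents are columns by default, so documents_columns=False is usually needed, and passing an id2word mapping to LdaModel avoids translating indices back through the vectorizer afterwards. A minimal sketch with toy data and illustrative names (newer scikit-learn exposes get_feature_names_out; older versions use get_feature_names):

from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel
from sklearn.feature_extraction.text import TfidfVectorizer

texts = ["the cat sat on the mat", "dogs and cats", "the dog barked"]  # toy data
vect = TfidfVectorizer()
X = vect.fit_transform(texts)                       # shape (n_docs, n_terms)
corpus = Sparse2Corpus(X, documents_columns=False)  # rows are documents
id2word = dict(enumerate(vect.get_feature_names_out()))
model = LdaModel(corpus=corpus, id2word=id2word, num_topics=2)
print(model.show_topics(formatted=False))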
Example 2: create_lda_model
def create_lda_model(project, corpus, id2word, name, use_level=True, force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level

    model_fname += '.lda.gz'

    if not os.path.exists(model_fname) or force:
        if corpus:
            update_every = None  # run in batch if we have a pre-supplied corpus
        else:
            update_every = 1

        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         alpha=project.alpha,
                         eta=project.eta,
                         passes=project.passes,
                         num_topics=project.num_topics,
                         iterations=project.iterations,
                         eval_every=None,  # disable perplexity tests for speed
                         update_every=update_every,
                         )

        if corpus:
            model.save(model_fname)
    else:
        model = LdaModel.load(model_fname)

    return model, model_fname
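Here update_every=None selects batch training while update_every=1 folds documents in online; a model saved this way can also be trained further after loading. A minimal sketch, assuming a previously saved model file and an additional bag-of-words corpus named extra_corpus (both hypothetical):

from gensim.models import LdaModel

model = LdaModel.load('project42.lda.gz')  # hypothetical path
model.update(extra_corpus)                 # fold additional documents into the model
model.save('project42.lda.gz')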
Example 3: TestLdaCallback
class TestLdaCallback(unittest.TestCase):

    def setUp(self):
        self.corpus = MmCorpus(datapath('testcorpus.mm'))
        self.ch_umass = CoherenceMetric(corpus=self.corpus, coherence="u_mass", logger="visdom", title="Coherence")
        self.callback = [self.ch_umass]
        self.model = LdaModel(id2word=common_dictionary, num_topics=2, passes=10, callbacks=self.callback)
        self.host = "http://localhost"
        self.port = 8097

    def testCallbackUpdateGraph(self):
        # Popen has no context manager in Python 2.7, hence the try/finally.
        try:
            # spawn visdom.server
            proc = subprocess.Popen(['python', '-m', 'visdom.server', '-port', str(self.port)])
            # wait for visdom server startup (any better way?)
            time.sleep(3)
            viz = Visdom(server=self.host, port=self.port)
            assert viz.check_connection()
            # clear screen
            viz.close()
            self.model.update(self.corpus)
        finally:
            proc.kill()
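The CoherenceMetric callback above streams scores to a running visdom server; if you only need the u_mass coherence value, gensim's CoherenceModel can be called directly. A minimal sketch using gensim's bundled test fixtures:

from gensim.models import LdaModel, CoherenceModel
from gensim.test.utils import common_corpus, common_dictionary

lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=2, passes=10)
cm = CoherenceModel(model=lda, corpus=common_corpus, coherence='u_mass')
print(cm.get_coherence())  # higher (less negative) generally means more coherent topics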
Example 4: perform_lda
def perform_lda(dictionary, corpus, num_topics, wiki_path=None, passes=1, iterations=50, chunksize=200):
    """
    :param dictionary:
    :param corpus:
    :param wiki_path:
    :param num_topics:
    :param passes:
    :param iterations:
    :param chunksize:
    :return:
    """
    if wiki_path is not None:
        logging.info('Generating wiki corpus...')
        wikis = unpickle(wiki_path)
        wiki_corpus = [dictionary.doc2bow(wiki) for wiki in wikis]

        logging.info('Combining original corpus and wiki corpus...')
        corpus = corpus + wiki_corpus  # wiki_corpus is merged after the original corpus

    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=passes,
                         iterations=iterations, alpha='auto', chunksize=chunksize)

    corpus_ids = get_corpus_ids(dictionary.corpus_id2orig_id)
    # doc_vector_ids = dictionary.corpus_id2orig_id[corpus_ids]
    doc_vector_ids = [dictionary.corpus_id2orig_id[corpus_id] for corpus_id in corpus_ids]
    doc_vectors = lda_model.inference(corpus)[0]
    doc_vectors = doc_vectors[corpus_ids, :]
    doc_vectors = doc_vectors / doc_vectors.sum(axis=1).reshape(doc_vectors.shape[0], 1)
    return lda_model, doc_vectors, doc_vector_ids
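Note that inference() returns the unnormalized variational gamma matrix, which is why the rows are normalized by hand above; get_document_topics returns already normalized (topic_id, probability) pairs per document. A small sketch, assuming the lda_model and bag-of-words corpus from this example:

for bow in corpus:
    doc_topics = lda_model.get_document_topics(bow, minimum_probability=0.0)
    print(doc_topics)  # [(topic_id, probability), ...] summing to ~1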
Example 5: TestLdaDiff
class TestLdaDiff(unittest.TestCase):

    def setUp(self):
        self.dictionary = common_dictionary
        self.corpus = common_corpus
        self.num_topics = 5
        self.n_ann_terms = 10
        self.model = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=self.num_topics, passes=10)

    def testBasic(self):
        # test for matrix case
        mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms)

        self.assertEqual(mdiff.shape, (self.num_topics, self.num_topics))
        self.assertEqual(len(annotation), self.num_topics)
        self.assertEqual(len(annotation[0]), self.num_topics)

        # test for diagonal case
        mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, diagonal=True)

        self.assertEqual(mdiff.shape, (self.num_topics,))
        self.assertEqual(len(annotation), self.num_topics)

    def testIdentity(self):
        for dist_name in ["hellinger", "kullback_leibler", "jaccard"]:
            # test for matrix case
            mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name)

            for row in annotation:
                for (int_tokens, diff_tokens) in row:
                    self.assertEqual(diff_tokens, [])
                    self.assertEqual(len(int_tokens), self.n_ann_terms)

            self.assertTrue(np.allclose(np.diag(mdiff), np.zeros(mdiff.shape[0], dtype=mdiff.dtype)))

            if dist_name == "jaccard":
                self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

            # test for diagonal case
            mdiff, annotation = \
                self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name, diagonal=True)

            for (int_tokens, diff_tokens) in annotation:
                self.assertEqual(diff_tokens, [])
                self.assertEqual(len(int_tokens), self.n_ann_terms)

            self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

            if dist_name == "jaccard":
                self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

    def testInput(self):
        self.assertRaises(ValueError, self.model.diff, self.model, n_ann_terms=self.n_ann_terms, distance='something')
        self.assertRaises(ValueError, self.model.diff, [], n_ann_terms=self.n_ann_terms, distance='something')
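Outside of tests, diff is usually called with two separately trained models to see how their topics line up; a minimal sketch on gensim's bundled fixtures:

from gensim.models import LdaModel
from gensim.test.utils import common_corpus, common_dictionary

m1 = LdaModel(common_corpus, id2word=common_dictionary, num_topics=5, passes=10, random_state=0)
m2 = LdaModel(common_corpus, id2word=common_dictionary, num_topics=5, passes=10, random_state=1)
mdiff, annotation = m1.diff(m2, distance="jaccard", num_words=20)
print(mdiff.shape)  # (5, 5): distance between every topic pair across the two models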
Example 6: extract_topics
def extract_topics(words):
    word_id_map = Dictionary([words])
    word_id_map.filter_tokens([id for id, occurance in word_id_map.dfs.iteritems() if occurance == 2])
    word_id_map.compactify()
    deals_corpus = [word_id_map.doc2bow(words)]
    lda = LdaModel(corpus=deals_corpus, id2word=word_id_map, num_topics=15, update_every=1, chunksize=1000, passes=1)
    topics = []
    for i in range(15):
        tokens = lda.print_topic(i).split('+')
        topic_scores = []
        for token in tokens:
            score, token_val = token.split('*')
            topic_scores.append((token_val, score))
        topics.append(topic_scores)
    return topics
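Splitting the formatted string from print_topic is fragile (newer gensim versions quote the words, so the parsed tokens keep quotation marks); show_topic returns the same information as structured pairs. A small sketch, assuming the trained lda model above:

topics = [lda.show_topic(i, topn=10) for i in range(lda.num_topics)]
# each entry is a list like [('word', 0.042), ...], no string parsing required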
Example 7: __init__
def __init__(self, destination, fileName, modelName='', ldaPasses='', topicNum=''):
    '''
    Constructor
    '''
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    self.__destination = destination
    self.__fileName = fileName
    self.__modelName = modelName
    self.__ldaPasses = ldaPasses
    self.__topicNum = topicNum

    #=======================================================================
    # STOP WORDS AND CHARACTERS
    #=======================================================================
    self.__stopwords = stopwords.words('english')  # + string.punctuation
    self.__chars_to_remove = [u'[', u']', u'(', u')', u'*', u'%', u'{', u'}', u'\n', u'\n\n', u'\t', u';', u'/', u'^', u'--', u'\\', u'+', u'-', u'.', u'?', u'&', u'#', u'', u'']
    self.__stopwords.extend(self.__chars_to_remove)
    self.__stopwords.extend([item for item in string.punctuation])

    #=======================================================================
    # DATABASE
    #=======================================================================
    self.__db = connectMySQL(db='xpath', port=3366)
    self.__queryResults = None
    self.__cleanedCorpus = []

    if modelName != '' and os.path.exists(self.__destination + modelName + '.lda'):
        self.__ldaModel = LdaModel.load(self.__destination + modelName + '.lda', mmap='r')

    if fileName != '' and os.path.exists(self.__destination + fileName + '.dict'):
        self.__modelDict = corpora.Dictionary.load(self.__destination + fileName + '.dict')
Example 8: write_topics
def write_topics(model_path, csv_name, k):
    model = LdaModel.load(model_path)
    topics = []
    for topic_id in range(model.num_topics):
        topics.append(model.return_topic(topicid=topic_id))

    dictionary = Dictionary.load('data/dictionary/tweets.dict')
    word_indices = dictionary.id2token

    writer = csv.writer(file(csv_name, 'w'))
    output = [[0 for i in range(model.num_topics)] for j in range(k)]

    for topic_id, topic in enumerate(topics):
        for rank, index in enumerate(topic.argsort()[::-1]):
            output[rank][topic_id] = {}
            output[rank][topic_id]['word'] = word_indices[index]
            output[rank][topic_id]['p'] = topic[index]
            rank += 1
            if rank >= k:
                break

    for topic_id in range(model.num_topics):
        row = ['z = ' + str(topic_id)]
        for rank in range(k):
            row.append(output[rank][topic_id]['word'] + ':' + str(output[rank][topic_id]['p']))
        writer.writerow(row)
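return_topic appears to be a project-specific helper rather than a standard LdaModel method; the library equivalents are get_topics() for the full topic-term matrix and get_topic_terms() for the top terms of a single topic. A small sketch, assuming the loaded model and dictionary above:

top_terms = model.get_topic_terms(0, topn=k)                      # [(word_id, probability), ...]
top_words = [(dictionary[word_id], p) for word_id, p in top_terms]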
Example 9: load
def load(self):
    '''Load the lda model and the dic dictionary.
    '''
    lda_file = config.get('dmp', 'lda_file')
    dic_file = config.get('dmp', 'dic_file')

    self.lda = LdaModel.load(lda_file)
    self.dic = Dictionary.load(dic_file)
Example 10: create_evaluation_distinctiveness
def create_evaluation_distinctiveness(config, Kind):
    model_fname = config.model_fname % Kind.__name__

    try:
        model = LdaModel.load(model_fname)
        logger.info('Opened previously created model at file %s' % model_fname)
    except:
        error('Cannot evaluate LDA models that have not been built yet!')

    scores = utils.score(model, utils.kullback_leibler_divergence)
    total = sum([x[1] for x in scores])

    logger.info("%s model KL: %f" % (model_fname, total))
    with open(config.path + 'evaluate-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, total])

    etas = list()
    for topic in model.state.get_lambda():
        topic_eta = list()
        for p_w in topic:
            topic_eta.append(p_w * numpy.log2(p_w))
        etas.append(-sum(topic_eta))

    entropy = sum(etas) / len(etas)

    logger.info("%s model entropy mean: %f" % (model_fname, entropy))
    with open(config.path + 'evaluate-entropy-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, entropy])
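One caveat: state.get_lambda() returns the unnormalized variational parameters of the topic-word distributions, so the values fed into the entropy sum above are not probabilities. get_topics() returns the normalized matrix; a sketch of the normalized computation, assuming the model loaded above:

import numpy as np

topics = model.get_topics()                        # rows sum to 1
entropies = -(topics * np.log2(topics)).sum(axis=1)
mean_entropy = entropies.mean()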
Example 11: __init__
class CorpusLdaModelWrapper:
    def __init__(self, corpus, dictionary, doc_labels, preprocessing_pipeline, numtopics):
        self.corpus = corpus
        self.dictionary = dictionary
        self.doc_labels = doc_labels
        self.pipeline = preprocessing_pipeline
        self.numtopics = numtopics
        self.trained = False

    def train(self):
        # training
        self.model = LdaModel(self.corpus, id2word=self.dictionary, num_topics=self.numtopics)
        self.index = MatrixSimilarity(self.model[self.corpus])

        # flag
        self.trained = True

    def convertTextToReducedVector(self, text):
        if not self.trained:
            raise exceptions.ModelNotTrainedException()
        tokens = word_tokenize(prep.preprocess_text(text, self.pipeline))
        tokens = filter(lambda token: self.dictionary.token2id.has_key(token), tokens)
        bow = self.dictionary.doc2bow(tokens)
        return self.model[bow]

    def queryDoc(self, text):
        reducedVec = self.convertTextToReducedVector(text)
        sims = self.index[reducedVec]
        simtuples = zip(range(len(sims)), sims) if self.doc_labels is None else zip(self.doc_labels, sims)
        simtuples = sorted(simtuples, key=lambda item: item[1], reverse=True)
        return simtuples

    def show_topic(self, id):
        return self.model.show_topic(id)
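The same pattern works without the wrapper: index the LDA representation of a corpus with MatrixSimilarity and query it with the LDA vector of new text. A minimal sketch on gensim's bundled fixtures, with an arbitrary query string:

from gensim.models import LdaModel
from gensim.similarities import MatrixSimilarity
from gensim.test.utils import common_corpus, common_dictionary

lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=3)
index = MatrixSimilarity(lda[common_corpus], num_features=lda.num_topics)
query = lda[common_dictionary.doc2bow("human computer interaction".split())]
print(sorted(enumerate(index[query]), key=lambda x: -x[1])[:3])  # three most similar documents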
Example 12: train
def train(self):
    # training
    self.model = LdaModel(self.corpus, id2word=self.dictionary, num_topics=self.numtopics)
    self.index = MatrixSimilarity(self.model[self.corpus])

    # flag
    self.trained = True
Example 13: evaluate_log
def evaluate_log(context, config):
    logger.info('Evaluating models for: %s' % config.project.name)

    model_fname = config.model_fname % ChangesetCorpus.__name__
    changeset_fname = config.corpus_fname % ChangesetCorpus.__name__
    commit_fname = config.corpus_fname % CommitLogCorpus.__name__

    try:
        commit_id2word = Dictionary.load(commit_fname + '.dict')
        commit_corpus = MalletCorpus(commit_fname,
                                     id2word=commit_id2word)
        changeset_id2word = Dictionary.load(changeset_fname + '.dict')
        changeset_corpus = MalletCorpus(changeset_fname,
                                        id2word=changeset_id2word)
    except:
        error('Corpora not built yet -- cannot evaluate')

    try:
        model = LdaModel.load(model_fname)
        logger.info('Opened previously created model at file %s' % model_fname)
    except:
        error('Cannot evaluate LDA models that have not been built yet!')

    changeset_doc_topic = get_doc_topic(changeset_corpus, model)
    commit_doc_topic = get_doc_topic(commit_corpus, model)

    first_shared = dict()
    for id_ in commit_doc_topic:
        i = 0
        commit_topics = [topic[0] for topic in commit_doc_topic[id_]]
        try:
            changeset_topics = [topic[0] for topic in changeset_doc_topic[id_]]
        except:
            continue

        maximum = 101
        minimum = maximum

        for i, topic in enumerate(commit_topics):
            if topic in changeset_topics:
                j = changeset_topics.index(topic)
                minimum = min(minimum, max(i, j))

        for i, topic in enumerate(changeset_topics):
            if topic in commit_topics:
                j = commit_topics.index(topic)
                minimum = min(minimum, max(i, j))

        first_shared[id_] = minimum

        if minimum == maximum:
            logger.info('No common topics found for %s' % str(id_))
            del first_shared[id_]

    mean = sum(first_shared.values()) / len(first_shared)

    with open('data/evaluate-log-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, mean] + list(first_shared.values()))
Example 14: calculateLDADistance
def calculateLDADistance(self, modelName='', topNSimilar='', topicList=''):
    if modelName == '':
        modelName = self.__fileName

    if topNSimilar == '':
        topNSimilar = 5

    write2file = self.__destination + modelName + "_results_LDA_similarTopics.csv"
    resultsCSV = open(write2file, "wb")

    print 'Reading model data'
    gensimDict = corpora.Dictionary.load(self.__destination + self.__fileName + '.dict')
    ldaModel = LdaModel.load(self.__destination + modelName + '.lda', mmap=None)
    topics = ldaModel.show_topics(num_topics=ldaModel.num_topics, num_words=len(gensimDict), formatted=False)
    #=======================================================================
    # num_topics=ldaModel.num_topics
    # num_words=len(gensimDict)
    #=======================================================================

    #=======================================================================
    # GET SIMILARITY VECTORS
    #=======================================================================
    print 'Extracting vectors'
    topicsSorted = [sorted(x, key=lambda x: x[1]) for x in topics]
    vectors = []
    for topic in topicsSorted:
        vector = [item[0] for item in topic]
        vectors.append(vector)

    #=======================================================================
    # CALCULATE SIMILARITIES BETWEEN TOPICS
    #=======================================================================
    print 'Calculating distances between LDA topics\n'
    results = []
    for topicListItem in topicList:
        distances = []
        for j in range(0, len(vectors)):
            dist = euclidean(vectors[topicListItem], vectors[j])
            #===============================================================
            # print topicListItem, j, dist
            #===============================================================
            distances.append(dist)
        results.append(distances)

    #=======================================================================
    # EXPORT TOP N SIMILAR TOPICS AND PRINT OUT QUERY TERMS
    #=======================================================================
    print 'Writing found similar topics to file\n'
    for resultItem in range(0, len(results)):
        similarLDATopics = np.argsort(results[resultItem])[::-1]

        for similarItem in similarLDATopics[:topNSimilar]:
            #===============================================================
            # print topicList[resultItem], similarItem
            #===============================================================
            resultsCSV.write(str(topicList[resultItem]) + '; ' + str(similarItem) + '; ' + ', '.join(x[1].lstrip().rstrip() for x in topics[similarItem][:100]) + '\n\n')

        resultsCSV.write('*******************************************\n\n')
Example 15: __init__
def __init__(self):
    cwd = os.path.dirname(__file__)
    dictionary_path = os.path.abspath(os.path.join(cwd, 'models/dictionary.dict'))
    lda_model_path = os.path.abspath(os.path.join(cwd, 'models/lda_model_10_topics.lda'))
    self.dictionary = corpora.Dictionary.load(dictionary_path)
    self.lda = LdaModel.load(lda_model_path)
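With the dictionary and model loaded, scoring new text elsewhere in the same class is just a doc2bow conversion followed by indexing the model; a minimal sketch, assuming whitespace tokenization and an illustrative input string:

bow = self.dictionary.doc2bow("a new tweet about machine learning".split())
doc_topics = self.lda[bow]                        # [(topic_id, probability), ...]
best_topic = max(doc_topics, key=lambda t: t[1])  # most probable topic for this text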