本文整理汇总了Python中gensim.models.ldamodel.LdaModel方法的典型用法代码示例。如果您正苦于以下问题:Python ldamodel.LdaModel方法的具体用法?Python ldamodel.LdaModel怎么用?Python ldamodel.LdaModel使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类gensim.models.ldamodel的用法示例。
在下文中一共展示了ldamodel.LdaModel方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: fit_model
# 需要导入模块: from gensim.models import ldamodel [as 别名]
# 或者: from gensim.models.ldamodel import LdaModel [as 别名]
def fit_model(self, data, params, return_data=False):
    """
    Fit a gensim LDA model to `data` using the parameter set `params`.

    :param data: either an object exposing `dtype`, `shape` and `transpose`
        (treated as a document-term matrix and converted with
        `dtm_to_gensim_corpus`), a 2-tuple `(dictionary, corpus)`, or a
        corpus that is handed to gensim as-is
    :param params: keyword arguments for `LdaModel`; an optional
        `'dictionary'` entry is used as `id2word` and is not forwarded. The
        mapping passed by the caller is left unmodified.
    :param return_data: if True, also return the corpus and the
        document-term matrix belonging to `data`
    :return: the model, or `(model, (corpus, dtm))` when `return_data` is
        True
    """
    from gensim.models.ldamodel import LdaModel

    # Work on a copy: popping from the caller's dict would silently drop the
    # dictionary if the same parameter set is reused for another fit.
    params = dict(params)
    dictionary = params.pop('dictionary', None)

    looks_like_matrix = all(
        hasattr(data, attr) for attr in ('dtype', 'shape', 'transpose')
    )
    if looks_like_matrix:
        corpus = dtm_to_gensim_corpus(data)
        dtm = data
    else:
        if isinstance(data, tuple) and len(data) == 2:
            # a dictionary shipped with the data overrides the one in params
            dictionary, corpus = data
        else:
            corpus = data
        # The matrix form is only needed when it is handed back to the
        # caller, so skip the conversion otherwise.
        dtm = gensim_corpus_to_dtm(corpus) if return_data else None

    model = LdaModel(corpus, id2word=dictionary, **params)

    if return_data:
        return model, (corpus, dtm)
    return model
示例2: testTransform
# 需要导入模块: from gensim.models import ldamodel [as 别名]
# 或者: from gensim.models.ldamodel import LdaModel [as 别名]
def testTransform(self):
    """Train a two-topic model and check the topic mixture of the first document."""
    expected = sorted([0.13, 0.87])
    converged = False
    # LDA training sometimes gets stuck at a local minimum; in that case
    # retrain from scratch, hoping for a better random initialization.
    for attempt in range(5):  # restart at most 5 times
        lda = ldamodel.LdaModel(id2word=dictionary, num_topics=2, passes=100)
        lda.update(corpus)

        # transform the first document and densify the result so that the
        # comparison below is a plain element-wise check
        first_doc = list(corpus)[0]
        observed = sorted(matutils.sparse2full(lda[first_doc], 2))

        # must contain the same values, up to re-ordering
        converged = numpy.allclose(observed, expected, atol=1e-2)
        if converged:
            break
        logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" %
                        (attempt, observed, expected))
    self.assertTrue(converged)
示例3: testLargeMmap
# 需要导入模块: from gensim.models import ldamodel [as 别名]
# 或者: from gensim.models.ldamodel import LdaModel [as 别名]
def testLargeMmap(self):
    """Save a model with separately stored arrays and reload it, plain and with mmap."""
    trained = ldamodel.LdaModel(self.corpus, num_topics=2)

    # sep_limit=0 simulates storing large arrays separately
    trained.save(testfile(), sep_limit=0)

    # First a regular load, then loading the large model arrays with mmap;
    # both reloaded models must agree with the one that was saved.
    for load_options in ({}, {'mmap': 'r'}):
        reloaded = ldamodel.LdaModel.load(testfile(), **load_options)
        self.assertEqual(trained.num_topics, reloaded.num_topics)
        self.assertTrue(numpy.allclose(trained.expElogbeta, reloaded.expElogbeta))
        empty_doc = []  # try projecting an empty vector
        self.assertTrue(numpy.allclose(trained[empty_doc], reloaded[empty_doc]))
#endclass TestLdaModel
示例4: update_reviews_with_topics
# 需要导入模块: from gensim.models import ldamodel [as 别名]
# 或者: from gensim.models.ldamodel import LdaModel [as 别名]
def update_reviews_with_topics(topic_model, corpus_list, reviews):
    """
    Store each review's document topics under ``Constants.TOPICS_FIELD`` and
    pad the topics the model did not report with a ``[topic_index, 0.0]``
    entry.

    :type topic_model: LdaModel
    :param topic_model: model queried through ``get_document_topics``
    :type corpus_list: list
    :param corpus_list: one entry per review, paired positionally with
        ``reviews``
    :param reviews: records that are updated in place
    """
    for review, corpus in zip(reviews, corpus_list):
        document_topics = topic_model.get_document_topics(corpus)
        review[Constants.TOPICS_FIELD] = document_topics

        # Snapshot of the topic ids the model reported; the entries inserted
        # below are deliberately not added to it.
        reported = {entry[0] for entry in document_topics}
        for topic_index in range(Constants.TOPIC_MODEL_NUM_TOPICS):
            if topic_index in reported:
                continue
            review[Constants.TOPICS_FIELD].insert(
                topic_index, [topic_index, 0.0])
示例5: __init__
# 需要导入模块: from gensim.models import ldamodel [as 别名]
# 或者: from gensim.models.ldamodel import LdaModel [as 别名]
def __init__(
    self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8
):
    """
    Collect the preprocessed text of every bug, shuffle the collection and
    train an LDA model on its bag-of-words representation.

    The three keyword arguments are forwarded unchanged to the parent
    initializer.
    """
    super().__init__(
        cleanup_urls=cleanup_urls,
        nltk_tokenizer=nltk_tokenizer,
        confidence_threshold=confidence_threshold,
    )

    texts = []
    ids = []
    for bug in bugzilla.get_bugs():
        texts.append(self.text_preprocess(self.get_text(bug)))
        ids.append(bug["id"])

    # A single shared permutation keeps every text aligned with its bug id.
    order = list(range(len(texts)))
    random.shuffle(order)
    self.corpus = [texts[position] for position in order]
    self.bug_ids = [ids[position] for position in order]

    self.dictionary = Dictionary(self.corpus)
    bags_of_words = [self.dictionary.doc2bow(text) for text in self.corpus]
    self.model = LdaModel(bags_of_words)
示例6: initialize
# 需要导入模块: from gensim.models import ldamodel [as 别名]
# 或者: from gensim.models.ldamodel import LdaModel [as 别名]
def initialize(self, myid, dispatcher, **model_params):
    """
    Set up the worker's bookkeeping state and create its LDA model.

    :param myid: id of this worker in the dispatcher; kept as a convenience
        for easy access/logging
    :param dispatcher: stored on the worker as `self.dispatcher`
    :param model_params: keyword arguments forwarded to `ldamodel.LdaModel`
    """
    self.lock_update = threading.Lock()
    self.jobsdone = 0  # how many jobs has this worker completed?
    self.myid = myid
    self.dispatcher = dispatcher
    self.finished = False
    # Hand the id to the logger as an argument instead of pre-formatting with
    # `%`: a tuple-valued id would otherwise be unpacked into several format
    # arguments and raise TypeError, and the formatting work is skipped
    # entirely when INFO logging is disabled.
    logger.info("initializing worker #%s", myid)
    self.model = ldamodel.LdaModel(**model_params)
示例7: testTopicSeeding
# 需要导入模块: from gensim.models import ldamodel [as 别名]
# 或者: from gensim.models.ldamodel import LdaModel [as 别名]
def testTopicSeeding(self):
    """Seed the word 'system' into each of the two topics in turn and check the weights."""
    # NOTE(review): the indentation of this snippet was lost; the final
    # assertion is placed inside the outer loop so that both seedings are
    # checked -- confirm against the upstream test.
    expected = [[0.385, 0.022],
                [0.025, 0.157]]
    passed = False
    for seeded in range(2):
        unseeded = 1 - seeded
        # Try seeding it both ways round: the same topics should come out,
        # with their order depending on which topic was seeded.
        for attempt in range(5):  # restart at most 5 times
            eta = numpy.ones((2, len(dictionary))) * 0.5
            system = dictionary.token2id[u'system']
            trees = dictionary.token2id[u'trees']  # looked up, not used further

            # aggressively seed the word 'system' in one of the two topics,
            # 10 times higher than the other words
            eta[seeded, system] *= 10

            lda = ldamodel.LdaModel(id2word=dictionary, num_topics=2, passes=200, eta=eta)
            lda.update(corpus)

            weights = [
                {word: p for p, word in lda.show_topic(j)}
                for j in range(2)
            ]

            # The seeded topic should give 'system' a high weight and 'trees'
            # (the main word of the other topic) a low one -- and vice versa
            # for the topic that was not seeded with 'system'.
            result = [
                [weights[seeded].get(u'system', 0), weights[seeded].get(u'trees', 0)],
                [weights[unseeded].get(u'system', 0), weights[unseeded].get(u'trees', 0)],
            ]
            passed = numpy.allclose(result, expected, atol=1e-2)
            if passed:
                break
            logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" %
                            (attempt, result, expected))
        self.assertTrue(passed)
示例8: testPersistence
# 需要导入模块: from gensim.models import ldamodel [as 别名]
# 或者: from gensim.models.ldamodel import LdaModel [as 别名]
def testPersistence(self):
    """Round-trip a trained model through save/load and compare it with the original."""
    trained = ldamodel.LdaModel(self.corpus, num_topics=2)
    trained.save(testfile())
    reloaded = ldamodel.LdaModel.load(testfile())

    self.assertEqual(trained.num_topics, reloaded.num_topics)
    self.assertTrue(numpy.allclose(trained.expElogbeta, reloaded.expElogbeta))

    # try projecting an empty vector
    empty_doc = []
    same_projection = numpy.allclose(trained[empty_doc], reloaded[empty_doc])
    self.assertTrue(same_projection)
示例9: get_topic
# 需要导入模块: from gensim.models import ldamodel [as 别名]
# 或者: from gensim.models.ldamodel import LdaModel [as 别名]
def get_topic(self, doc_set):
    """
    Fit a single-topic LDA model on `doc_set` and return its top three words.

    The documents are passed through the class's `__tokenize`,
    `__remove_stop_words` and `__dt_matrix` helpers; the resulting model is
    kept on `self.topics`.

    :param doc_set: sample documents handed to the tokenizing helper
    :return: list with the topic's words, without quotes or surrounding
        whitespace
    """
    tokens = self.__tokenize(doc_set)
    filtered = self.__remove_stop_words(tokens)
    # lemmatization (self.__lemmatizer) is currently disabled
    dt_matrix = self.__dt_matrix(filtered)

    # dt_matrix[0] is used as the corpus and dt_matrix[1] as the id2word mapping
    self.topics = LdaModel(dt_matrix[0], num_topics=1, id2word=dt_matrix[1], passes=50)
    output = self.topics.show_topics(num_topics=1, num_words=3, log=False, formatted=True)

    # gensim formats a topic as '0.050*"foo" + 0.040*"bar"'. Splitting on "+"
    # leaves the padding around each term, so strip it from the extracted
    # word; otherwise every word but the last keeps a trailing space.
    formatted_topic = output[0][1]
    return [
        term.split("*")[1].replace('"', '').strip()
        for term in formatted_topic.split("+")
    ]
示例10: find_lda_context
# 需要导入模块: from gensim.models import ldamodel [as 别名]
# 或者: from gensim.models.ldamodel import LdaModel [as 别名]
def find_lda_context(train_records, test_records):
    """
    Uses the training records to create a topic model and then updates both
    the training and testing records with a vector of probabilities for each
    topic from the recently created topic model.

    Every record receives ``record['lda_context']`` (a list with one
    probability per topic) and ``record[Constants.CONTEXT_TOPICS_FIELD]``
    (a dict mapping ``'topic<i>'`` to the same probabilities).

    :param train_records: records the model is trained on; updated in place
    :param test_records: records that are only annotated; updated in place
    """
    # NOTE(review): `num_topics` is not defined inside this function; it is
    # presumably a module-level name -- confirm.
    dictionary = preprocess_records(train_records, test_records)
    corpus = [record[Constants.CORPUS_FIELD] for record in train_records]
    print(corpus)
    topic_model = ldamodel.LdaModel(
        corpus, id2word=dictionary,
        num_topics=num_topics,
        passes=Constants.TOPIC_MODEL_PASSES,
        iterations=Constants.TOPIC_MODEL_ITERATIONS)
    print(corpus)
    for i in range(num_topics):
        print(topic_model.show_topic(i, topn=2))

    for record in train_records + test_records:
        document_topics = \
            topic_model.get_document_topics(record[Constants.CORPUS_FIELD])

        # get_document_topics leaves out topics whose probability is below
        # the model's minimum, so position i in its result is not guaranteed
        # to be topic i. Look the probabilities up by topic id and fill the
        # omitted topics with 0.0 instead of indexing by position, which
        # raised IndexError whenever a topic had been filtered out.
        probability_by_topic = {
            document_topic[0]: document_topic[1]
            for document_topic in document_topics
        }
        lda_context = [
            probability_by_topic.get(topic_index, 0.0)
            for topic_index in range(num_topics)
        ]
        record['lda_context'] = lda_context
        record[Constants.CONTEXT_TOPICS_FIELD] = {
            'topic' + str(topic_index): probability
            for topic_index, probability in enumerate(lda_context)
        }
示例11: train_topic_model
# 需要导入模块: from gensim.models import ldamodel [as 别名]
# 或者: from gensim.models.ldamodel import LdaModel [as 别名]
def train_topic_model(records):
    """
    Train and cache the topic model selected by ``Constants.TOPIC_MODEL_TYPE``.

    For ``'lda'`` an ``LdaModel`` is trained on the records' corpus field and
    pickled to the generated cache path; for ``'ensemble'`` the records are
    exported to text and the ensemble pipeline is run. In both cases nothing
    is done when the expected output file already exists.

    :param records: records providing ``Constants.CORPUS_FIELD``
    :raises ValueError: if the configured topic model type is neither
        ``'lda'`` nor ``'ensemble'``
    """
    print('%s: train topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if Constants.TOPIC_MODEL_TYPE == 'lda':
        model_path = Constants.generate_file_name(
            'topic_model', 'pkl', Constants.CACHE_FOLDER, None, None, True)
        if os.path.exists(model_path):
            print('WARNING: Topic model already exists')
            return

        corpus = [record[Constants.CORPUS_FIELD] for record in records]
        dictionary = corpora.Dictionary.load(Constants.DICTIONARY_FILE)
        lda = ldamodel.LdaModel(
            corpus,
            id2word=dictionary,
            num_topics=Constants.TOPIC_MODEL_NUM_TOPICS,
            passes=Constants.TOPIC_MODEL_PASSES,
            iterations=Constants.TOPIC_MODEL_ITERATIONS,
        )
        with open(model_path, 'wb') as model_file:
            pickle.dump(lda, model_file, pickle.HIGHEST_PROTOCOL)
        return

    if Constants.TOPIC_MODEL_TYPE == 'ensemble':
        factors_path = Constants.ENSEMBLED_RESULTS_FOLDER + \
            "factors_final_k%02d.pkl" % Constants.TOPIC_MODEL_NUM_TOPICS
        if os.path.exists(factors_path):
            print('Ensemble topic model already exists')
            return

        export_to_text(records)
        # the three ensemble steps run in this fixed order
        topic_ensemble_caller.run_local_parse_directory()
        topic_ensemble_caller.run_generate_kfold()
        topic_ensemble_caller.run_combine_nmf()
        return

    raise ValueError('Unrecognized topic modeling algorithm: \'%s\'' %
                     Constants.TOPIC_MODEL_TYPE)
示例12: build_topic_model_from_corpus
# 需要导入模块: from gensim.models import ldamodel [as 别名]
# 或者: from gensim.models.ldamodel import LdaModel [as 别名]
def build_topic_model_from_corpus(corpus, dictionary):
    """
    Builds a Latent Dirichlet Allocation topic model from the given corpus
    and dictionary.

    ``LdaMulticore`` (with ``Constants.NUM_CORES - 1`` workers) is used when
    ``Constants.LDA_MULTICORE`` is truthy, the single-process
    ``ldamodel.LdaModel`` otherwise; the number of topics, passes and
    iterations come from ``Constants`` in both cases.

    :type corpus: list
    :param corpus: a list of bags of words, each bag of words represents a
        document
    :type dictionary: gensim.corpora.Dictionary
    :param dictionary: the Dictionary object passed to the model as
        ``id2word``
    :rtype: gensim.models.ldamodel.LdaModel
    :return: the topic model built from ``corpus``
    """
    timestamp_format = "%Y/%m/%d-%H:%M:%S"

    if Constants.LDA_MULTICORE:
        print('%s: lda multicore' % time.strftime(timestamp_format))
        model_class = LdaMulticore
        extra_options = {'workers': Constants.NUM_CORES - 1}
    else:
        print('%s: lda monocore' % time.strftime(timestamp_format))
        model_class = ldamodel.LdaModel
        extra_options = {}

    # Both variants share every setting except the worker count.
    return model_class(
        corpus,
        id2word=dictionary,
        num_topics=Constants.TOPIC_MODEL_NUM_TOPICS,
        passes=Constants.TOPIC_MODEL_PASSES,
        iterations=Constants.TOPIC_MODEL_ITERATIONS,
        **extra_options
    )
示例13: get_topic_distribution
# 需要导入模块: from gensim.models import ldamodel [as 别名]
# 或者: from gensim.models.ldamodel import LdaModel [as 别名]
def get_topic_distribution(record, lda_model, dictionary, minimum_probability,
                           sampling_method=None, max_words=None):
    """
    Return the record's topic probabilities as a dense vector with one slot
    per topic of ``lda_model``; topics the model does not report stay at 0.

    :type record: dict
    :param record: record whose ``Constants.CORPUS_FIELD`` entry is passed to
        the model
    :type lda_model: LdaModel
    :type minimum_probability: float
    :param minimum_probability: forwarded to ``get_document_topics``
    :param dictionary: not used by the current implementation
    :param sampling_method: not used by the current implementation
    :param max_words: not used by the current implementation
    :return: numpy array of length ``lda_model.num_topics``
    """
    document_topics = lda_model.get_document_topics(
        record[Constants.CORPUS_FIELD],
        minimum_probability=minimum_probability)

    distribution = numpy.zeros(lda_model.num_topics)
    for entry in document_topics:
        # each entry carries the topic id first and its probability second
        topic_id, probability = entry[0], entry[1]
        distribution[topic_id] = probability
    return distribution