This article collects typical usage examples of the Python class gensim.models.ldamodel.LdaModel. If you are unsure what LdaModel is for or how to use it in practice, the curated examples below may help.
The following shows 15 code examples of the LdaModel class, sorted by popularity by default.
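Before the examples, here is a minimal, self-contained sketch of the typical LdaModel workflow; the toy documents are invented purely for illustration:

from gensim import corpora
from gensim.models.ldamodel import LdaModel

# Toy tokenized documents, invented purely for illustration.
documents = [["human", "machine", "interface"],
             ["graph", "minors", "survey"],
             ["graph", "trees", "interface"]]
dictionary = corpora.Dictionary(documents)               # token -> integer id
corpus = [dictionary.doc2bow(doc) for doc in documents]  # bag-of-words vectors
lda = LdaModel(corpus, id2word=dictionary, num_topics=2, passes=10)
print(lda.print_topics(num_topics=2, num_words=3))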
Example 1: get_topics

def get_topics(candidate, day):
    # Convert the day string into a millisecond timestamp range covering that day.
    start_time = datetime.strptime(day, "%Y-%m-%d").date()
    start_time = int(start_time.strftime('%s')) * 1000
    end_time = start_time + 86399999
    try:
        client = MongoClient()
        tweets = client.fletcher.tweets
        tweets = tweets.aggregate([
            {"$match": {"$text": {"$search": candidate_search[candidate_slugs[candidate]]}}},
            {"$match": {"timestamp_ms": {"$gte": start_time, "$lt": end_time}}}])
        # Keep only ASCII letters and spaces in each tweet.
        documents = []
        pattern = re.compile("[^a-zA-Z ]")
        for tweet in tweets:
            documents.append(pattern.sub('', tweet['text']))
        stoplist = set(candidate_stop_words[candidate_slugs[candidate]] + stopwords)
        texts = [[word for word in document.lower().split() if word not in stoplist]
                 for document in documents]
        # Drop tokens that appear only once across the whole collection.
        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1
        texts = [[token for token in text if frequency[token] > 1]
                 for text in texts]
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=3,
                       update_every=1, chunksize=10000, passes=10)
        return lda.print_topics(3)
    except Exception:
        return None
Example 2: main

def main():
    collection_name = "nips"
    years = range(2010, 2015)  # 2010 through 2014
    n_topics = 10
    corpus_paths = ["data/{}-{}.dat".format(collection_name, y) for y in years]
    all_corpus = []
    year2corpus = {}
    for year, path in zip(years, corpus_paths):
        corpus = list(load_line_corpus(path))
        all_corpus.append(proc_corpus(corpus))
        year2corpus[year] = corpus
    all_corpus = list(itertools.chain.from_iterable(all_corpus))
    dictionary = Dictionary(all_corpus)
    all_corpus = [dictionary.doc2bow(doc) for doc in all_corpus]
    model = LdaModel(all_corpus, num_topics=n_topics,
                     id2word=dictionary,
                     eval_every=10, passes=100)
    print(model.show_topics())
Example 3: LDA

class LDA(object):

    def __init__(self, model, vocab, corpus=None, topics=200, passes=1):
        self._model_file = model
        self._dict_file = vocab
        self._corpus_file = corpus
        self._topics = topics
        self._passes = passes

    def train(self):
        self._corpus = SentenceDocCorpus(self._corpus_file)
        self._lda = LdaModel(self._corpus, num_topics=self._topics,
                             id2word=self._corpus.dictionary, passes=self._passes)
        self._dictionary = self._corpus.dictionary
        self._lda.save(self._model_file)
        self._dictionary.save(self._dict_file)

    def load(self):
        self._lda = LdaModel.load(self._model_file)
        self._dictionary = Dictionary.load(self._dict_file)

    def topics(self, words):
        return self._lda[self._dictionary.doc2bow(common.filter(words))]

    def topic_vector(self, words):
        # eps=0 keeps every topic in the result, so the vector has a fixed length.
        return np.array([v for k, v in self._lda.__getitem__(
            self._dictionary.doc2bow(common.filter(words)), eps=0)])
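A hypothetical usage sketch for this wrapper (SentenceDocCorpus and common.filter are project-specific helpers; the file paths are placeholders):

lda = LDA("models/lda.model", "models/lda.dict", corpus="data/sentences.txt",
          topics=100, passes=5)
lda.train()                                     # or lda.load() if the files already exist
dist = lda.topics(["graph", "interface"])       # sparse (topic_id, probability) pairs
vec = lda.topic_vector(["graph", "interface"])  # dense vector of length `topics`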
Example 4: malletmodel2ldamodel

def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
    """Convert :class:`~gensim.models.wrappers.ldamallet.LdaMallet` to
    :class:`~gensim.models.ldamodel.LdaModel`.

    This works by copying the trained model weights (alpha, beta...) from the
    Mallet model into the gensim model.

    Parameters
    ----------
    mallet_model : :class:`~gensim.models.wrappers.ldamallet.LdaMallet`
        Trained Mallet model.
    gamma_threshold : float, optional
        To be used for inference in the new LdaModel.
    iterations : int, optional
        Number of iterations to be used for inference in the new LdaModel.

    Returns
    -------
    :class:`~gensim.models.ldamodel.LdaModel`
        Gensim native LDA.

    """
    model_gensim = LdaModel(
        id2word=mallet_model.id2word, num_topics=mallet_model.num_topics,
        alpha=mallet_model.alpha, iterations=iterations,
        gamma_threshold=gamma_threshold,
        dtype=numpy.float64  # don't lose precision when converting from MALLET
    )
    model_gensim.expElogbeta[:] = mallet_model.wordtopics
    return model_gensim
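A hedged usage sketch for this converter, assuming a MALLET binary plus a corpus and dictionary prepared as in the earlier examples (the path and variables are placeholders):

from gensim.models.wrappers import LdaMallet

# mallet_path points at the MALLET binary; corpus and dictionary are assumed
# to have been built already.
mallet_model = LdaMallet("/path/to/mallet", corpus=corpus,
                         num_topics=20, id2word=dictionary)
gensim_model = malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50)
print(gensim_model.show_topics(5))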
Example 5: vwmodel2ldamodel

def vwmodel2ldamodel(vw_model, iterations=50):
    """Convert :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit` to
    :class:`~gensim.models.ldamodel.LdaModel`.

    This works by simply copying the training model weights (alpha, beta...) from a trained vwmodel
    into the gensim model.

    Parameters
    ----------
    vw_model : :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit`
        Trained Vowpal Wabbit model.
    iterations : int
        Number of iterations to be used for inference of the new :class:`~gensim.models.ldamodel.LdaModel`.

    Returns
    -------
    :class:`~gensim.models.ldamodel.LdaModel`
        Gensim native LDA.

    """
    model_gensim = LdaModel(
        num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize,
        passes=vw_model.passes, alpha=vw_model.alpha, eta=vw_model.eta, decay=vw_model.decay,
        offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold,
        dtype=numpy.float32
    )
    model_gensim.expElogbeta[:] = vw_model._get_topics()
    return model_gensim
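The analogous sketch for the Vowpal Wabbit wrapper (again, the vw binary path, corpus, and dictionary are assumptions):

from gensim.models.wrappers import LdaVowpalWabbit

vw_model = LdaVowpalWabbit("/usr/bin/vw", corpus=corpus,
                           num_topics=20, id2word=dictionary)
gensim_model = vwmodel2ldamodel(vw_model, iterations=50)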
Example 6: run

def run(self):
    if self.clean_level in ('raw', 'clean', 'stopwords'):
        kind = self.clean_level
    else:
        kind = 'stopwords'
    for idioma in self.output()['langs']:
        dicc_path = self.input()['dict']['langs'][idioma].path
        corp_path = self.input()['corp']['langs'][idioma].path
        print('==============================')
        print('Running LDA for %s with cleaning level %s' % (idioma, kind))
        print('==============================')
        # Load dictionary and corpus
        dicc = corpora.Dictionary.load(dicc_path)
        corpus = corpora.MmCorpus(corp_path)
        # Run LDA for this language, once per number of topics
        for n_topics in self.output()['langs'][idioma]:
            print('Number of topics: ' + str(n_topics))
            if self.by_chunks:
                lda = LdaModel(corpus, id2word=dicc, num_topics=n_topics,
                               update_every=self.update_e,
                               chunksize=self.chunk_size, passes=self.n_passes)
            else:
                lda = LdaModel(corpus, id2word=dicc, num_topics=n_topics, passes=1)
            lda.save(self.output()['langs'][idioma][n_topics].path)
Example 7: train_model

def train_model(texts, **kwargs):
    # parse args
    filter_stopwords = kwargs.get('filter_stopwords', True)
    normalizer = kwargs.get('normalizer', 'porter')
    tfidf = kwargs.get('tfidf', True)
    num_topics = kwargs.get('num_topics', 20)
    min_freq = kwargs.get('min_freq', 2)
    use_pickle = kwargs.get('use_pickle', True)
    update_pickle = kwargs.get('update_pickle', True)
    report = kwargs.get('report', True)
    distributed = kwargs.get('distributed', False)

    # build corpus or read it in from pickle
    if use_pickle:
        print("INFO: loading pickled corpus and word hash")
        corpus = pickle.load(open("pickles/corpus.p", "rb"))
        id2word = pickle.load(open("pickles/id2word.p", "rb"))
    else:
        print("INFO: processing text and building corpus...")
        corpus, id2word = process_texts(
            texts=texts,
            filter_stopwords=filter_stopwords,
            normalizer=normalizer,
            min_freq=min_freq
        )
        if update_pickle:
            # pickle files
            print("INFO: updating pickled corpus and word hash")
            pickle.dump(corpus, open("pickles/corpus.p", "wb"))
            pickle.dump(id2word, open("pickles/id2word.p", "wb"))

    # optional tfidf transformation
    if tfidf:
        print("INFO: applying tfidf transformation...")
        tfidf = TfidfModel(corpus)
        corpus = tfidf[corpus]

    # fit model
    print("INFO: fitting model...")
    lda = LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        distributed=distributed
    )

    # report
    if report:
        # note: LdaModel.bound() is the variational log-likelihood bound,
        # not a perplexity; see the snippet after this example
        bound = lda.bound(corpus)
        print("RESULTS:")
        print("\nlog-likelihood bound: ", bound, "\n")
        topics = lda.show_topics(num_topics)
        for i, t in enumerate(topics):
            print("topic %d:" % i)
            print(t)

    return lda, corpus, id2word
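As noted in the comment above, bound() returns the variational lower bound on log likelihood rather than a perplexity. A small sketch of deriving a per-word perplexity with gensim's log_perplexity, following gensim's own logging convention (reusing the lda and corpus objects from the example):

import numpy as np

per_word_bound = lda.log_perplexity(corpus)  # average per-word likelihood bound
perplexity = np.exp2(-per_word_bound)        # gensim reports perplexity as 2^(-bound)
print("per-word perplexity estimate:", perplexity)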
Example 8: plottopicpop

def plottopicpop():
    internet = [0 for i in range(10)]
    developing = [0 for i in range(10)]
    habr = [0 for i in range(10)]
    n = 0
    for year in range(2006, 2016):
        articles, numberofarticles = getarticlesbyyear(year)
        print("Got articles for", str(year))
        # Normalize texts
        i = 0
        for article in articles:
            article = replacesymbols(article)
            articles[i] = normalaisestr(article.lower())
            i += 1
        print('Normalized')
        # Remove unnecessary words
        texts = [[word for word in article if word not in stoplist]
                 for article in articles]
        print('Removed stopwords')
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        print('Starting training')
        # Train in chunks of 100 documents to go easy on RAM
        # (ceiling division so the final partial chunk is not dropped)
        for i in range((numberofarticles + 99) // 100):
            begin = 100 * i
            end = 100 * (i + 1)
            if end > numberofarticles:
                end = numberofarticles
            lda = LdaModel(corpus[begin:end], id2word=dictionary, num_topics=end - begin)
            for j in range(lda.num_topics):
                # get_topic_terms returns (word_id, probability) pairs
                topics = lda.get_topic_terms(j, 15)
                for word_id, _ in topics:
                    top = dictionary.get(word_id)
                    if "интернет" == top:
                        internet[n] += 1
                    if "разработка" == top:
                        developing[n] += 1
                    if "хабра" == top:
                        habr[n] += 1
            del lda
        n += 1
        print(internet, '\n', developing, '\n', habr)
    plt.title('Population of 3 topics.')
    plt.xlabel('Year 2006 - 2015')
    plt.ylabel('Number of articles')
    plt.plot(internet, label="Интернет")
    plt.plot(developing, label="Разработка")
    plt.plot(habr, label="Хабра")
    plt.legend()
    plt.show()
Example 9: getLdaModel

def getLdaModel(bow_corpus, dictionary, useSavedTill):
    if useSavedTill >= USESAVED.lda_model:
        common_logger.info("loading LDA model from file")
        return LdaModel.load(file_lda_model)
    else:
        common_logger.info("Training LDA model")
        # heuristic: let the number of topics grow with the log of the corpus size
        num_topics = int(math.log(len(bow_corpus)) + 1)
        lda_model = LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary, passes=numPasses)
        common_logger.info("Saving LDA model")
        lda_model.save(file_lda_model)
        common_logger.info("Done creating LDA model")
        return lda_model
Example 10: fetch_model

def fetch_model(dictionary):
    print("Fetching LDA Model... ", end="")
    try:
        lda = LdaModel.load('Topic/lda.tm')
        print("LDA Model loaded!")
    except IOError:
        print("Model not found, building LDA...")
        corpus = MyCorpus()
        lda = LdaModel(corpus, num_topics=50, id2word=dictionary,
                       update_every=1, chunksize=1000, passes=50)
        print("LDA Built!")
        lda.save('Topic/lda.tm')
    return lda
Example 11: train

def train(self):
    self._corpus = SentenceDocCorpus(self._corpus_file)
    self._lda = LdaModel(self._corpus, num_topics=self._topics,
                         id2word=self._corpus.dictionary, passes=self._passes)
    self._dictionary = self._corpus.dictionary
    self._lda.save(self._model_file)
    self._dictionary.save(self._dict_file)
Example 12: make_clouds

def make_clouds(files, n_words=20):
    # set locations
    base_model_name = os.path.splitext(os.path.basename(files.model))[0]
    output_d = '../browser/clouds/' + base_model_name + '/'
    if not os.path.exists(output_d):
        os.makedirs(output_d)
    # create wordcloud generator
    wc = WordCloud(width=1000, height=500, background_color='white')

    print('Loading model')
    model = LdaModel.load(files.model)
    beta = model.expElogbeta

    print('Normalizing by topics, and by words')
    pTW = normalize(beta, axis=0)
    pWT = normalize(beta, axis=1)

    # load bug<->id map, then invert to id<->bug
    bug_to_id = json.loads(open(files.replacements).read())
    id_to_bug = {v: k for k, v in bug_to_id.items() if "." not in k}

    for i in range(len(beta)):
        # compute RAR
        t_rar = np.sqrt(pTW[i] * pWT[i])
        top_word_ids = t_rar.argsort()[:-1 - n_words:-1]
        top_words = [model.id2word.id2token[wordid] for wordid in top_word_ids]
        top_words = [id_to_bug[word] if word in id_to_bug else word for word in top_words]
        wc.fit_words(dict(zip(top_words, t_rar[top_word_ids])))
        wc.to_file(output_d + str(i) + '.png')
Example 13: __init__

def __init__(self, fnames, model=None, corpus=None, dictionary=None):
    """`fnames` is an array of files for [lda_model, distribution]"""
    self.reviews = open('data/electronics_topics_in.txt').readlines()

    print("Loading topic model...")
    if model is not None:
        print("Using argument model")
        self.lda = model
    else:
        self.lda = LdaModel.load(fnames[0])

    if corpus is not None:
        print("Using argument corpus and dictionary")
        self.corpus = corpus
        self.dictionary = dictionary
    else:
        print("Loading corpus and dictionary from file")
        self.corpus = load("data/models/electronics_tfidf_corpus.pkl")
        self.dictionary = load("data/models/electronics_dict.pkl")

    print("Loading review-topic distribution...")
    self.review_dist = [l for l in self.lda[self.corpus]]
    # sort each review's topic distribution by probability, highest first
    self.review_dist = [sorted(dist, key=lambda arr: arr[1], reverse=True)
                        for dist in self.review_dist]

    print("processing topics")
    # strip the "0.123*" weight prefixes and join words with commas
    tmp = [re.sub(r"(\d*\.\d*\*)", "", t) for t in self.lda.show_topics(-1)]
    self.topics = [re.sub(r"\s\+", ",", ts) for ts in tmp]
Example 14: __init__

def __init__(self, topics=10,
             worker=3,
             pretrained_model=None,
             dictionary=None):
    """
    Initialize LDA model training.

    Args:
        topics -- number of topics
        worker -- degree of parallelism, usually the number of cores minus one
        pretrained_model -- a previously trained model; online updates are
            supported, so the model from the last run can be loaded
        dictionary -- dictionary mapping words to the IDs used during training,
            which must match the model

    Example:
        >>> lda = LDA(topics = 20, worker = 2,
                      pretrained_model = model_file,
                      dictionary = dictionary_file)
        >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
        >>> lda.update(corpus)
        >>> lda.save(model_file, dictionary_file)
        >>> topics = lda.inference(['word5', 'word6'])
    """
    self._topics = topics
    self._workers = worker
    self._model = None
    self._common_dictionary = None
    if pretrained_model and dictionary:
        self._model = LdaModel.load(pretrained_model)
        self._common_dictionary = Dictionary.load(dictionary)
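The docstring above references update, save, and inference methods that the listing does not include. A minimal, hypothetical sketch of what they might look like, assuming gensim's LdaMulticore to honor the worker parameter:

# Hypothetical companion methods, not part of the original listing.
def update(self, corpus):
    if self._common_dictionary is None:
        self._common_dictionary = Dictionary(corpus)
    docs = [self._common_dictionary.doc2bow(doc) for doc in corpus]
    if self._model is None:
        self._model = LdaMulticore(docs, num_topics=self._topics,
                                   id2word=self._common_dictionary,
                                   workers=self._workers)
    else:
        self._model.update(docs)  # online update on the existing model

def save(self, model_file, dictionary_file):
    self._model.save(model_file)
    self._common_dictionary.save(dictionary_file)

def inference(self, doc):
    bow = self._common_dictionary.doc2bow(doc)
    return self._model[bow]  # sparse (topic_id, probability) pairs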
Example 15: train_lda

def train_lda(self, corpus, dictionary):
    """
    PRIVATE: train_lda
    ------------------
    given a corpus and a dictionary, this fits parameters for self.lda_model,
    then fills self.lda_model_topics with the per-topic word distributions
    """
    self.lda_model = LdaModel(corpus, id2word=dictionary, num_topics=self.num_topics_lda)
    self.lda_model_topics = self.find_per_topic_word_distributions()