本文整理汇总了Python中lda.LDA属性的典型用法代码示例。如果您正苦于以下问题:Python lda.LDA属性的具体用法?Python lda.LDA怎么用?Python lda.LDA使用的例子?那么恭喜您,这里精选的属性代码示例或许可以为您提供帮助。您也可以进一步了解该属性所在类lda的用法示例。
在下文中一共展示了lda.LDA属性的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_save_load_ldamodel_pickle
# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_save_load_ldamodel_pickle():
    """Round-trip a fitted LDA model through the pickle helpers.

    Fits a tiny model, saves it together with its vocabulary and document
    labels, loads it back and checks that all three pieces are unchanged.
    The test is skipped when the ``lda`` package cannot be imported.
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    pfile = 'tests/data/test_pickle_unpickle_ldamodel.pickle'

    # 4 documents x 2 vocabulary terms
    dtm = np.array([[0, 1], [2, 3], [4, 5], [6, 0]])
    n_docs, n_terms = dtm.shape
    doc_labels = ['doc_' + str(doc_idx) for doc_idx in range(n_docs)]
    vocab = ['word_' + str(term_idx) for term_idx in range(n_terms)]

    fitted = lda.LDA(2, n_iter=1)
    fitted.fit(dtm)

    model_io.save_ldamodel_to_pickle(pfile, fitted, vocab, doc_labels)
    restored = model_io.load_ldamodel_from_pickle(pfile)
    restored_model = restored['model']

    assert np.array_equal(fitted.doc_topic_, restored_model.doc_topic_)
    assert np.array_equal(fitted.topic_word_, restored_model.topic_word_)
    assert vocab == restored['vocab']
    assert doc_labels == restored['doc_labels']
示例2: test_get_marginal_topic_distrib
# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_get_marginal_topic_distrib(dtm, n_topics):
    """Check shape, normalisation and value range of the marginal topic distribution.

    :param dtm: document-term matrix used to fit the model
    :param n_topics: number of topics passed to ``lda.LDA``
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    # the DTM must contain at least one word
    if dtm.sum() == 0:
        dtm[0, 0] = 1

    # second positional argument is n_iter in the lda package's signature
    topic_model = lda.LDA(n_topics, 1)
    topic_model.fit(dtm)

    lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)
    distrib = model_stats.marginal_topic_distrib(topic_model.doc_topic_, lengths)

    assert distrib.shape == (n_topics,)
    assert np.isclose(distrib.sum(), 1.0)
    for prob in distrib:
        assert 0 <= prob <= 1
示例3: test_get_marginal_word_distrib
# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_get_marginal_word_distrib(dtm, n_topics):
    """Check shape, normalisation and value range of the marginal word distribution.

    :param dtm: document-term matrix used to fit the model
    :param n_topics: number of topics passed to ``lda.LDA``
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    # the DTM must contain at least one word
    if dtm.sum() == 0:
        dtm[0, 0] = 1

    # second positional argument is n_iter in the lda package's signature
    topic_model = lda.LDA(n_topics, 1)
    topic_model.fit(dtm)

    lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)
    topic_marginal = model_stats.marginal_topic_distrib(topic_model.doc_topic_, lengths)
    word_marginal = model_stats.marginal_word_distrib(topic_model.topic_word_, topic_marginal)

    # one probability per DTM column
    n_terms = dtm.shape[1]
    assert word_marginal.shape == (n_terms,)
    assert np.isclose(word_marginal.sum(), 1.0)
    for prob in word_marginal:
        assert 0 <= prob <= 1
示例4: test_get_word_distinctiveness
# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_get_word_distinctiveness(dtm, n_topics):
    """Check that word distinctiveness has one value per DTM column, each above -1e10.

    :param dtm: document-term matrix used to fit the model
    :param n_topics: number of topics passed to ``lda.LDA``
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    # the DTM must contain at least one word
    if dtm.sum() == 0:
        dtm[0, 0] = 1

    # second positional argument is n_iter in the lda package's signature
    topic_model = lda.LDA(n_topics, 1)
    topic_model.fit(dtm)

    lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)
    topic_marginal = model_stats.marginal_topic_distrib(topic_model.doc_topic_, lengths)
    distinctiveness = model_stats.word_distinctiveness(topic_model.topic_word_, topic_marginal)

    n_terms = dtm.shape[1]
    assert distinctiveness.shape == (n_terms,)
    for value in distinctiveness:
        assert value > -1e10
示例5: test_get_word_saliency
# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_get_word_saliency(dtm, n_topics):
    """Check that word saliency has one value per DTM column, each at least -1e-9.

    :param dtm: document-term matrix used to fit the model
    :param n_topics: number of topics passed to ``lda.LDA``
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    # the DTM must contain at least one word
    if dtm.sum() == 0:
        dtm[0, 0] = 1

    # second positional argument is n_iter in the lda package's signature
    topic_model = lda.LDA(n_topics, 10)
    topic_model.fit(dtm)

    lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)
    saliency = model_stats.word_saliency(topic_model.topic_word_, topic_model.doc_topic_, lengths)

    n_terms = dtm.shape[1]
    assert saliency.shape == (n_terms,)
    # small negative tolerance instead of a strict ">= 0" comparison
    for value in saliency:
        assert value >= -1e-9
示例6: test_get_topic_word_relevance
# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_get_topic_word_relevance(dtm, n_topics, lambda_):
    """Check that the topic-word relevance matrix is (n_topics x n_terms) and NaN-free.

    :param dtm: document-term matrix used to fit the model
    :param n_topics: number of topics passed to ``lda.LDA``
    :param lambda_: value forwarded to ``topic_word_relevance``
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    # the DTM must contain at least one word
    if dtm.sum() == 0:
        dtm[0, 0] = 1

    # second positional argument is n_iter in the lda package's signature
    topic_model = lda.LDA(n_topics, 1)
    topic_model.fit(dtm)

    lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)
    relevance = model_stats.topic_word_relevance(
        topic_model.topic_word_, topic_model.doc_topic_, lengths, lambda_
    )

    assert relevance.shape == (n_topics, dtm.shape[1])
    for cell in relevance.flatten():
        assert isinstance(cell, float) and not np.isnan(cell)
示例7: _run_lda
# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def _run_lda(self, data, n_topics):
    """
    Fit an LDA model and group documents by their highest-scoring topic.

    :param data: document features handed to ``LDA.fit`` (described as a sparse
        vector of document features by the original author)
    :param n_topics: number of topics we want
    :return: tuple ``(clusters, lda_model)``; ``clusters`` maps a topic label to a
        list of (doc-index, score) tuples for the documents whose best topic is
        that label, and ``lda_model`` is the fitted ``lda.LDA`` instance.
    """
    import lda

    # fixed random_state and iteration count, as in the original call
    lda_model = lda.LDA(n_topics=n_topics, n_iter=200, random_state=1)
    lda_model.fit(data)

    clusters = collections.defaultdict(list)
    # each row of doc_topic_ holds one document's scores, one per topic
    for doc_index, topic_scores in enumerate(lda_model.doc_topic_):
        best_topic = topic_scores.argmax()
        clusters[best_topic].append((doc_index, topic_scores[best_topic]))

    return clusters, lda_model
示例8: discoverTopics
# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def discoverTopics(n = 20):
    """Run the local sampler and the ``lda`` package on the same corpus and print likelihoods.

    :param n: number of topics given to both ``LatentDirichletAllocation`` and ``lda.LDA``
    """
    # NOTE(review): the corpus path is hard-coded to a developer machine.
    # An alternative toy corpus ('../data/toy2') and an alternative sampler
    # (LdaSampler) were left commented out by the original author.
    matrix, _vocab = preprocess('/Users/fpena/tmp/20_newsgroups')
    sampler = LatentDirichletAllocation(n)

    info('Starting!')
    for iteration, _phi in enumerate(sampler.run(matrix, 100)):
        print(colored("Iteration %s" % iteration, 'yellow'))
        print("Likelihood", sampler.loglikelihood())
        # A commented-out block here used to print the ten most probable
        # vocabulary words of every topic, taken from the yielded phi matrix.

    # Reference run with the lda package; NOTE(review): this calls the private
    # _fit method rather than the public fit -- confirm that is intended.
    lda_sampler = lda.LDA(n, 100)
    lda_sampler._fit(matrix)
    print("LDA likelihood", lda_sampler.loglikelihood())
示例9: main
# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def _write_matrix(path, matrix):
    """Write a 2-D array to ``path``, one row per line, values as ``%f`` separated by spaces."""
    with open(path, 'w') as fid:
        for row in matrix:
            fid.write(' '.join('%f' % value for value in row))
            fid.write('\n')


def main():
    """Fit LDA on a pickled feature matrix and dump the topic matrices as text.

    Command-line arguments:
        -i / --input   path of a pickle file; the unpickled object must provide
                       ``toarray()`` (its result is cast to int64 and fitted)
        -o / --output  directory receiving ``init.beta`` (``topic_word_``) and
                       ``init.theta`` (``doc_topic_``)
    """
    parser = argparse.ArgumentParser(description="Generate the beta abd theta files after latent Dirichlet allocation (LDA) process.")
    parser.add_argument('-i', '--input', required=True, help="The input file where each line starts with the number of word as well as the sparse representation of word distribution")
    parser.add_argument('-o', '--output', required=True, help="The output path")
    args = parser.parse_args()

    # Pickle data is binary: open with 'rb' (text mode fails on Python 3) and
    # close the handle deterministically.
    # NOTE(security): pickle.load can execute arbitrary code; only use trusted input files.
    with open(args.input, 'rb') as infile:
        tfidf = pickle.load(infile)
    feat = tfidf.toarray().astype(np.int64)

    model = lda.LDA(n_topics=50, n_iter=1500, random_state=2017)
    model.fit(feat)

    _write_matrix(os.path.join(args.output, 'init.beta'), model.topic_word_)
    _write_matrix(os.path.join(args.output, 'init.theta'), model.doc_topic_)
示例10: test_get_most_or_least_distinct_words
# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_get_most_or_least_distinct_words(dtm, n_topics, n_distinct_words):
    """Check that most/least distinct word rankings mirror each other and honour ``n``.

    :param dtm: document-term matrix used to fit the model
    :param n_topics: number of topics passed to ``lda.LDA``
    :param n_distinct_words: requested number of words; capped at the vocabulary size
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    # the DTM must contain at least one word
    if dtm.sum() == 0:
        dtm[0, 0] = 1

    n_terms = dtm.shape[1]
    n_distinct_words = min(n_distinct_words, n_terms)

    # second positional argument is n_iter in the lda package's signature
    topic_model = lda.LDA(n_topics, 1)
    topic_model.fit(dtm)

    lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)
    # single-character words starting at 'A'; this only works for few words
    vocab = np.array([chr(65 + term_idx) for term_idx in range(n_terms)])

    topic_word = topic_model.topic_word_
    doc_topic = topic_model.doc_topic_

    # full rankings: the least-distinct order is the reversed most-distinct order
    most_all = model_stats.most_distinct_words(vocab, topic_word, doc_topic, lengths)
    least_all = model_stats.least_distinct_words(vocab, topic_word, doc_topic, lengths)
    assert most_all.shape == least_all.shape == (len(vocab),) == (n_terms,)
    for from_most, from_least in zip(most_all, least_all[::-1]):
        assert from_most == from_least

    # truncated rankings: must equal the head of the corresponding full ranking
    most_top = model_stats.most_distinct_words(vocab, topic_word, doc_topic, lengths,
                                               n=n_distinct_words)
    least_top = model_stats.least_distinct_words(vocab, topic_word, doc_topic, lengths,
                                                 n=n_distinct_words)
    assert most_top.shape == least_top.shape == (n_distinct_words,)
    for got, expected in zip(most_top, most_all[:n_distinct_words]):
        assert got == expected
    for got, expected in zip(least_top, least_all[:n_distinct_words]):
        assert got == expected
示例11: test_get_most_or_least_relevant_words_for_topic
# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_get_most_or_least_relevant_words_for_topic(dtm, n_topics, lambda_, n_relevant_words):
    """Check that most/least relevant word rankings of a random topic mirror each other.

    :param dtm: document-term matrix used to fit the model
    :param n_topics: number of topics passed to ``lda.LDA``
    :param lambda_: value forwarded to ``topic_word_relevance``
    :param n_relevant_words: requested number of words; capped at the vocabulary size
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    # the DTM must contain at least one word
    if dtm.sum() == 0:
        dtm[0, 0] = 1

    n_terms = dtm.shape[1]
    n_relevant_words = min(n_relevant_words, n_terms)
    # pick the topic under test before fitting, as the original does
    topic = random.randint(0, n_topics - 1)

    # second positional argument is n_iter in the lda package's signature
    topic_model = lda.LDA(n_topics, 1)
    topic_model.fit(dtm)

    # single-character words starting at 'A'; this only works for few words
    vocab = np.array([chr(65 + term_idx) for term_idx in range(n_terms)])
    lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)
    relevance = model_stats.topic_word_relevance(
        topic_model.topic_word_, topic_model.doc_topic_, lengths, lambda_
    )

    # full rankings: the least-relevant order is the reversed most-relevant order
    most_all = model_stats.most_relevant_words_for_topic(vocab, relevance, topic)
    least_all = model_stats.least_relevant_words_for_topic(vocab, relevance, topic)
    assert most_all.shape == least_all.shape == (len(vocab),) == (n_terms,)
    for from_most, from_least in zip(most_all, least_all[::-1]):
        assert from_most == from_least

    # truncated rankings: must equal the head of the corresponding full ranking
    most_top = model_stats.most_relevant_words_for_topic(vocab, relevance, topic, n=n_relevant_words)
    least_top = model_stats.least_relevant_words_for_topic(vocab, relevance, topic, n=n_relevant_words)
    assert most_top.shape == least_top.shape == (n_relevant_words,)
    for got, expected in zip(most_top, most_all[:n_relevant_words]):
        assert got == expected
    for got, expected in zip(least_top, least_all[:n_relevant_words]):
        assert got == expected
示例12: test_generate_topic_labels_from_top_words
# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_generate_topic_labels_from_top_words(dtm, n_topics, lambda_):
    """Check the format of generated topic labels: ``<1-based topic number>_<word>[_<word>...]``.

    :param dtm: document-term matrix used to fit the model
    :param n_topics: number of topics passed to ``lda.LDA``
    :param lambda_: value forwarded as ``lambda_`` to the label generator
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    # the DTM must contain at least one word
    if dtm.sum() == 0:
        dtm[0, 0] = 1

    # second positional argument is n_iter in the lda package's signature
    topic_model = lda.LDA(n_topics, 1)
    topic_model.fit(dtm)

    # single-character words starting at 'A'; this only works for few words
    vocab = np.array([chr(65 + term_idx) for term_idx in range(dtm.shape[1])])
    lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)

    # default number of words: each label needs the topic number plus at least one word
    default_labels = model_stats.generate_topic_labels_from_top_words(
        topic_model.topic_word_, topic_model.doc_topic_, lengths, vocab, lambda_=lambda_
    )
    assert isinstance(default_labels, np.ndarray)
    assert len(default_labels) == n_topics
    for topic_number, label in enumerate(default_labels, start=1):
        assert isinstance(label, str)
        parts = label.split('_')
        assert len(parts) >= 2
        assert int(parts[0]) == topic_number
        for word in parts[1:]:
            assert word in vocab

    # n_words=2: each label must be exactly the topic number plus two words
    two_word_labels = model_stats.generate_topic_labels_from_top_words(
        topic_model.topic_word_, topic_model.doc_topic_, lengths, vocab, lambda_=lambda_,
        n_words=2
    )
    assert isinstance(two_word_labels, np.ndarray)
    assert len(two_word_labels) == n_topics
    for topic_number, label in enumerate(two_word_labels, start=1):
        assert isinstance(label, str)
        parts = label.split('_')
        assert len(parts) == 3
        assert int(parts[0]) == topic_number
        for word in parts[1:]:
            assert word in vocab
示例13: test_compute_models_parallel_lda_multi_vs_singleproc
# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_compute_models_parallel_lda_multi_vs_singleproc():
    """Check that default and single-process model computation yield matching LDA models."""
    passed_params = {'n_topics', 'n_iter', 'random_state'}
    varying_params = [{'n_topics': k} for k in range(2, 5)]
    const_params = {'n_iter': 3, 'random_state': 1}

    # run with the default process settings
    models = tm_lda.compute_models_parallel(EVALUATION_TEST_DTM, varying_params, const_params)
    assert len(models) == len(varying_params)

    for params, fitted in models:
        assert set(params.keys()) == passed_params
        assert isinstance(fitted, lda.LDA)
        assert isinstance(fitted.doc_topic_, np.ndarray)
        assert isinstance(fitted.topic_word_, np.ndarray)

    # same computation restricted to a single process
    models_singleproc = tm_lda.compute_models_parallel(EVALUATION_TEST_DTM, varying_params, const_params,
                                                       n_max_processes=1)
    assert len(models_singleproc) == len(models)

    for params_single, model_single in models_singleproc:
        # pair each single-process result with the first earlier result
        # that has the same parameter set; one must exist
        candidates = [fitted for params, fitted in models if params == params_single]
        assert candidates
        model_multi = candidates[0]

        assert np.allclose(model_multi.doc_topic_, model_single.doc_topic_)
        assert np.allclose(model_multi.topic_word_, model_single.topic_word_)
示例14: create_lda
# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def create_lda(self, filename):
    """Initialise this object for ``filename``, loading a saved model or building one with LDA.

    Resets the bookkeeping attributes, then tries ``self.load()``. If that
    fails, the data is read with ``self.loadfile()`` / ``self.preprocess()``,
    ``self.csr_mat`` is replaced by the CSR form of ``lda.LDA.fit_transform``
    applied to it, and the result is stored with ``self.save()``. If that fails
    as well, ``self.flag`` is set to ``False``.

    :param filename: name of the data file; ``self.name`` becomes the part
        before the first ``"."``
    :return: whatever ``self.load()`` returns when loading succeeds, otherwise ``self``
    """
    self.filename = filename
    self.name = self.filename.split(".")[0]
    self.flag = True
    self.hasLabel = True
    self.record = {"x": [], "pos": []}
    self.body = {}
    self.est_num = []
    self.lastprob = 0
    self.offset = 0.5
    self.interval = 3
    self.last_pos = 0
    self.last_neg = 0

    try:
        # if a model already exists, load it
        return self.load()
    except Exception:
        # Deliberate best-effort: any ordinary failure means "no saved model",
        # so fall through and rebuild from the data file. Unlike a bare
        # ``except:``, KeyboardInterrupt / SystemExit still propagate.
        pass

    try:
        self.loadfile()
        self.preprocess()
        import lda
        from scipy.sparse import csr_matrix
        lda1 = lda.LDA(n_topics=100, alpha=0.1, eta=0.01, n_iter=200)
        self.csr_mat = csr_matrix(lda1.fit_transform(self.csr_mat))
        self.save()
    except Exception:
        # the data could not be read or processed (e.g. file missing from the
        # workspace); signal the failure through the flag instead of raising
        self.flag = False
    return self
示例15: fit_topics
# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def fit_topics(data, embeddings, vocab, K, n_top_words=20):
    """Fit a topic model to bag-of-words data.

    Fits ``lda.LDA`` with ``K`` topics (1500 iterations, ``random_state=1``),
    prints the highest-weighted words of every topic and returns the fitted
    matrices.

    :param data: bag-of-words data passed to ``LDA.fit``
    :param embeddings: matrix right-multiplied onto the topic-word matrix via
        ``np.matmul`` (presumably one embedding row per vocabulary word -- confirm with caller)
    :param vocab: sequence of vocabulary words, indexed like the columns of ``topic_word_``
    :param K: number of topics
    :param n_top_words: how many words to print per topic (default 20, the
        previously hard-coded value)
    :return: tuple ``(topics, lda_centers, topic_proportions)`` holding
        ``model.topic_word_``, ``topics @ embeddings`` and ``model.doc_topic_``
    """
    model = lda.LDA(n_topics=K, n_iter=1500, random_state=1)
    model.fit(data)
    topics = model.topic_word_
    lda_centers = np.matmul(topics, embeddings)

    print('LDA Gibbs topics')
    # convert the vocabulary once instead of once per topic
    vocab_array = np.array(vocab)
    for i, topic_dist in enumerate(topics):
        # argsort is ascending; the reversed tail slice yields the n_top_words
        # highest-weighted word indices, largest first
        top_indices = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
        topic_words = vocab_array[top_indices]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    print('\n')

    topic_proportions = model.doc_topic_
    return topics, lda_centers, topic_proportions