Python lda.LDA属性代码示例

本文整理汇总了Python中lda.LDA类的典型用法代码示例。如果您正苦于以下问题：Python lda.LDA类的具体用法？Python lda.LDA怎么用？Python lda.LDA使用的例子？那么恭喜您，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块lda的用法示例。

在下文中一共展示了lda.LDA类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_save_load_ldamodel_pickle

# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_save_load_ldamodel_pickle():
    """Round-trip a fitted LDA model through the ``model_io`` pickle helpers.

    The test is skipped when the optional ``lda`` package cannot be imported.
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    # tiny 4x2 document-term matrix with one label per row and one word per column
    counts = np.array([[0, 1], [2, 3], [4, 5], [6, 0]])
    n_docs, n_words = counts.shape
    labels = ['doc_' + str(d) for d in range(n_docs)]
    words = ['word_' + str(w) for w in range(n_words)]

    fitted = lda.LDA(2, n_iter=1)
    fitted.fit(counts)

    pickle_path = 'tests/data/test_pickle_unpickle_ldamodel.pickle'
    model_io.save_ldamodel_to_pickle(pickle_path, fitted, words, labels)
    restored = model_io.load_ldamodel_from_pickle(pickle_path)
    restored_model = restored['model']

    # model matrices and the metadata must survive the round trip unchanged
    assert np.array_equal(fitted.doc_topic_, restored_model.doc_topic_)
    assert np.array_equal(fitted.topic_word_, restored_model.topic_word_)
    assert words == restored['vocab']
    assert labels == restored['doc_labels']

开发者ID:WZBSocialScienceCenter，项目名称:tmtoolkit，代码行数:25，代码来源:test_topicmod_model_io.py

示例2: test_get_marginal_topic_distrib

# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_get_marginal_topic_distrib(dtm, n_topics):
    """Check that the marginal topic distribution has ``n_topics`` entries in [0, 1] summing to 1.

    The test is skipped when the optional ``lda`` package cannot be imported.
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    # an all-zero DTM gets a single word occurrence before fitting
    if dtm.sum() == 0:
        dtm[0, 0] = 1

    fitted = lda.LDA(n_topics, n_iter=1)
    fitted.fit(dtm)

    lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)
    p_t = model_stats.marginal_topic_distrib(fitted.doc_topic_, lengths)

    assert p_t.shape == (n_topics,)
    assert np.isclose(p_t.sum(), 1.0)
    assert all(0 <= prob <= 1 for prob in p_t)

开发者ID:WZBSocialScienceCenter，项目名称:tmtoolkit，代码行数:20，代码来源:test_topicmod_model_stats.py

示例3: test_get_marginal_word_distrib

# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_get_marginal_word_distrib(dtm, n_topics):
    """Check that the marginal word distribution has one entry per DTM column, each in [0, 1], summing to 1.

    The test is skipped when the optional ``lda`` package cannot be imported.
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    # an all-zero DTM gets a single word occurrence before fitting
    if dtm.sum() == 0:
        dtm[0, 0] = 1

    fitted = lda.LDA(n_topics, n_iter=1)
    fitted.fit(dtm)

    lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)
    topic_marginals = model_stats.marginal_topic_distrib(fitted.doc_topic_, lengths)
    word_marginals = model_stats.marginal_word_distrib(fitted.topic_word_, topic_marginals)

    n_words = dtm.shape[1]
    assert word_marginals.shape == (n_words,)
    assert np.isclose(word_marginals.sum(), 1.0)
    assert all(0 <= prob <= 1 for prob in word_marginals)

开发者ID:WZBSocialScienceCenter，项目名称:tmtoolkit，代码行数:21，代码来源:test_topicmod_model_stats.py

示例4: test_get_word_distinctiveness

# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_get_word_distinctiveness(dtm, n_topics):
    """Check that word distinctiveness yields one value above -1e10 per DTM column.

    The test is skipped when the optional ``lda`` package cannot be imported.
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    # an all-zero DTM gets a single word occurrence before fitting
    if dtm.sum() == 0:
        dtm[0, 0] = 1

    fitted = lda.LDA(n_topics, n_iter=1)
    fitted.fit(dtm)

    lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)
    topic_marginals = model_stats.marginal_topic_distrib(fitted.doc_topic_, lengths)
    distinctiveness = model_stats.word_distinctiveness(fitted.topic_word_, topic_marginals)

    n_words = dtm.shape[1]
    assert distinctiveness.shape == (n_words,)
    assert all(value > -1e10 for value in distinctiveness)

开发者ID:WZBSocialScienceCenter，项目名称:tmtoolkit，代码行数:21，代码来源:test_topicmod_model_stats.py

示例5: test_get_word_saliency

# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_get_word_saliency(dtm, n_topics):
    """Check that word saliency yields one value of at least -1e-9 per DTM column.

    The test is skipped when the optional ``lda`` package cannot be imported.
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    # an all-zero DTM gets a single word occurrence before fitting
    if dtm.sum() == 0:
        dtm[0, 0] = 1

    fitted = lda.LDA(n_topics, n_iter=10)
    fitted.fit(dtm)

    lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)
    saliency = model_stats.word_saliency(fitted.topic_word_, fitted.doc_topic_, lengths)

    n_words = dtm.shape[1]
    assert saliency.shape == (n_words,)
    # a tiny negative tolerance is accepted
    assert all(value >= -1e-9 for value in saliency)

开发者ID:WZBSocialScienceCenter，项目名称:tmtoolkit，代码行数:19，代码来源:test_topicmod_model_stats.py

示例6: test_get_topic_word_relevance

# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_get_topic_word_relevance(dtm, n_topics, lambda_):
    """Check that the relevance matrix has shape (topics, words) and holds only non-NaN floats.

    The test is skipped when the optional ``lda`` package cannot be imported.
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    # an all-zero DTM gets a single word occurrence before fitting
    if dtm.sum() == 0:
        dtm[0, 0] = 1

    fitted = lda.LDA(n_topics, n_iter=1)
    fitted.fit(dtm)

    lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)
    relevance = model_stats.topic_word_relevance(fitted.topic_word_, fitted.doc_topic_, lengths, lambda_)

    n_words = dtm.shape[1]
    assert relevance.shape == (n_topics, n_words)
    assert all(isinstance(cell, float) and not np.isnan(cell) for cell in relevance.flatten())

开发者ID:WZBSocialScienceCenter，项目名称:tmtoolkit，代码行数:20，代码来源:test_topicmod_model_stats.py

示例7: _run_lda

# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def _run_lda(self, data, n_topics):
        """
        Fit an LDA model and group the documents by their best-scoring topic.

        :param data: document features, passed unchanged to ``lda.LDA.fit``
        :param n_topics: number of topics to fit
        :return: tuple ``(clusters, lda_model)``. ``clusters`` maps a topic label to a list of
                 (doc-index, score) tuples for the documents whose highest score belongs to
                 that topic; ``lda_model`` is the fitted model.
        """
        import lda

        lda_model = lda.LDA(n_topics=n_topics, n_iter=200, random_state=1)
        lda_model.fit(data)

        clusters = collections.defaultdict(list)
        # each row of doc_topic_ holds the per-topic scores of one document
        for doc_index, doc_scores in enumerate(lda_model.doc_topic_):
            best_topic = doc_scores.argmax()
            clusters[best_topic].append((doc_index, doc_scores[best_topic]))

        return clusters, lda_model

开发者ID:Code4SA，项目名称:mma-dexter，代码行数:24，代码来源:topics.py

示例8: discoverTopics

# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def discoverTopics(n = 20, data_dir='/Users/fpena/tmp/20_newsgroups', n_iter=100):
    """Run the project's LDA sampler and the ``lda`` package on the same data and print likelihoods.

    The corpus is loaded with ``preprocess(data_dir)``. The project sampler
    (``LatentDirichletAllocation``) is run first and its log-likelihood is
    printed for each item yielded by ``run``. Afterwards an ``lda.LDA`` model
    is fitted on the same matrix and its log-likelihood is printed once.

    :param n: number of topics used by both samplers
    :param data_dir: corpus location handed to ``preprocess``; the default is
                     the path that used to be hard-coded here
    :param n_iter: iteration count handed to both samplers (previously fixed at 100)
    :return: None; results are only printed
    """
    # preprocess also returns the vocabulary, which is not needed here
    matrix, _vocab = preprocess(data_dir)
    sampler = LatentDirichletAllocation(n)

    info('Starting!')
    # the yielded value itself is not used, only the iteration counter
    for it, _phi in enumerate(sampler.run(matrix, n_iter)):
        print(colored("Iteration %s" % it, 'yellow'))
        print("Likelihood", sampler.loglikelihood())

    lda_sampler = lda.LDA(n, n_iter)
    # use the public fit() rather than the private _fit() it wraps
    lda_sampler.fit(matrix)
    print("LDA likelihood", lda_sampler.loglikelihood())

开发者ID:melqkiades，项目名称:yelp，代码行数:26，代码来源:newsgroup.py

示例9: main

# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def _write_matrix(path, matrix):
    """Write a 2-D array to ``path``: one row per line, values formatted with ``%f``, space separated."""
    with open(path, 'w') as fid:
        for row in matrix:
            fid.write(' '.join('%f' % value for value in row))
            fid.write('\n')


def main():
    """Fit LDA on a pickled feature matrix and dump the topic-word and doc-topic matrices.

    Command line arguments:
      -i/--input   pickle file holding an object with a ``toarray()`` method
      -o/--output  existing directory that receives ``init.beta`` (``topic_word_``)
                   and ``init.theta`` (``doc_topic_``)

    Both output files contain one matrix row per line with ``%f``-formatted,
    space-separated values.
    """
    parser = argparse.ArgumentParser(description="Generate the beta abd theta files after latent Dirichlet allocation (LDA) process.")
    parser.add_argument('-i', '--input', required=True, help="The input file where each line starts with the number of word as well as the sparse representation of word distribution")
    parser.add_argument('-o', '--output', required=True, help="The output path")
    args = parser.parse_args()

    # NOTE(review): pickle.load can execute arbitrary code; only pass trusted input files.
    # Pickle data is binary, so the file must be opened with 'rb' (text mode fails on Python 3).
    with open(args.input, 'rb') as fin:
        tfidf = pickle.load(fin)

    # NOTE(review): the cast truncates fractional values; presumably the pickled matrix
    # holds raw counts despite its name -- confirm against the producer of the file.
    feat = tfidf.toarray().astype(np.int64)

    model = lda.LDA(n_topics=50, n_iter=1500, random_state=2017)
    model.fit(feat)

    _write_matrix(os.path.join(args.output, 'init.beta'), model.topic_word_)
    _write_matrix(os.path.join(args.output, 'init.theta'), model.doc_topic_)

开发者ID:domainxz，项目名称:top-k-rec，代码行数:27，代码来源:genLdaFiles.py

示例10: test_get_most_or_least_distinct_words

# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_get_most_or_least_distinct_words(dtm, n_topics, n_distinct_words):
    """Check that the most/least distinct word rankings mirror each other and honour ``n``.

    The test is skipped when the optional ``lda`` package cannot be imported.
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    # an all-zero DTM gets a single word occurrence before fitting
    if dtm.sum() == 0:
        dtm[0, 0] = 1

    n_words = dtm.shape[1]
    # never request more words than there are columns
    n_distinct_words = min(n_distinct_words, n_words)

    fitted = lda.LDA(n_topics, n_iter=1)
    fitted.fit(dtm)

    lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)
    # single-character vocabulary starting at 'A'; this only works for few words
    vocab = np.array([chr(code) for code in range(65, 65 + dtm.shape[1])])
    stats_args = (vocab, fitted.topic_word_, fitted.doc_topic_, lengths)

    # full rankings: one must be the reverse of the other
    top_all = model_stats.most_distinct_words(*stats_args)
    bottom_all = model_stats.least_distinct_words(*stats_args)
    assert top_all.shape == bottom_all.shape == (len(vocab),) == (dtm.shape[1],)
    assert all(hi == lo for hi, lo in zip(top_all, bottom_all[::-1]))

    # limited rankings: must equal the head of the corresponding full ranking
    top_n = model_stats.most_distinct_words(*stats_args, n=n_distinct_words)
    bottom_n = model_stats.least_distinct_words(*stats_args, n=n_distinct_words)
    assert top_n.shape == bottom_n.shape == (n_distinct_words,)
    assert all(got == want for got, want in zip(top_n, top_all[:n_distinct_words]))
    assert all(got == want for got, want in zip(bottom_n, bottom_all[:n_distinct_words]))

开发者ID:WZBSocialScienceCenter，项目名称:tmtoolkit，代码行数:31，代码来源:test_topicmod_model_stats.py

示例11: test_get_most_or_least_relevant_words_for_topic

# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_get_most_or_least_relevant_words_for_topic(dtm, n_topics, lambda_, n_relevant_words):
    """Check that the most/least relevant word rankings of a random topic mirror each other and honour ``n``.

    The test is skipped when the optional ``lda`` package cannot be imported.
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    # an all-zero DTM gets a single word occurrence before fitting
    if dtm.sum() == 0:
        dtm[0, 0] = 1

    # never request more words than there are columns
    n_relevant_words = min(n_relevant_words, dtm.shape[1])
    topic = random.randint(0, n_topics - 1)

    fitted = lda.LDA(n_topics, n_iter=1)
    fitted.fit(dtm)

    # single-character vocabulary starting at 'A'; this only works for few words
    vocab = np.array([chr(code) for code in range(65, 65 + dtm.shape[1])])
    lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)

    relevance = model_stats.topic_word_relevance(fitted.topic_word_, fitted.doc_topic_, lengths, lambda_)
    rank_args = (vocab, relevance, topic)

    # full rankings: one must be the reverse of the other
    top_all = model_stats.most_relevant_words_for_topic(*rank_args)
    bottom_all = model_stats.least_relevant_words_for_topic(*rank_args)
    assert top_all.shape == bottom_all.shape == (len(vocab),) == (dtm.shape[1],)
    assert all(hi == lo for hi, lo in zip(top_all, bottom_all[::-1]))

    # limited rankings: must equal the head of the corresponding full ranking
    top_n = model_stats.most_relevant_words_for_topic(*rank_args, n=n_relevant_words)
    bottom_n = model_stats.least_relevant_words_for_topic(*rank_args, n=n_relevant_words)
    assert top_n.shape == bottom_n.shape == (n_relevant_words,)
    assert all(got == want for got, want in zip(top_n, top_all[:n_relevant_words]))
    assert all(got == want for got, want in zip(bottom_n, bottom_all[:n_relevant_words]))

开发者ID:WZBSocialScienceCenter，项目名称:tmtoolkit，代码行数:32，代码来源:test_topicmod_model_stats.py

示例12: test_generate_topic_labels_from_top_words

# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_generate_topic_labels_from_top_words(dtm, n_topics, lambda_):
    """Check the format of generated topic labels: ``<1-based topic number>_<word>[_<word>...]``.

    Labels are generated once with the default number of words and once with
    ``n_words=2``. The test is skipped when the optional ``lda`` package cannot
    be imported.
    """
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    # an all-zero DTM gets a single word occurrence before fitting
    if dtm.sum() == 0:
        dtm[0, 0] = 1

    fitted = lda.LDA(n_topics, n_iter=1)
    fitted.fit(dtm)

    # single-character vocabulary starting at 'A'; this only works for few words
    vocab = np.array([chr(code) for code in range(65, 65 + dtm.shape[1])])
    lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)

    def check_labels(labels, n_parts_ok):
        # shared checks; n_parts_ok decides whether the number of "_"-separated parts is acceptable
        assert isinstance(labels, np.ndarray)
        assert len(labels) == n_topics

        for pos, label in enumerate(labels):
            assert isinstance(label, str)
            pieces = label.split('_')
            assert n_parts_ok(len(pieces))
            assert int(pieces[0]) == pos + 1
            assert all(word in vocab for word in pieces[1:])

    # default call: topic number plus at least one word
    labels_default = model_stats.generate_topic_labels_from_top_words(fitted.topic_word_, fitted.doc_topic_,
                                                                      lengths, vocab, lambda_=lambda_)
    check_labels(labels_default, lambda count: count >= 2)

    # n_words=2: topic number plus exactly two words
    labels_two_words = model_stats.generate_topic_labels_from_top_words(fitted.topic_word_, fitted.doc_topic_,
                                                                        lengths, vocab, lambda_=lambda_,
                                                                        n_words=2)
    check_labels(labels_two_words, lambda count: count == 3)

开发者ID:WZBSocialScienceCenter，项目名称:tmtoolkit，代码行数:41，代码来源:test_topicmod_model_stats.py

示例13: test_compute_models_parallel_lda_multi_vs_singleproc

# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def test_compute_models_parallel_lda_multi_vs_singleproc():
    """Check that ``compute_models_parallel`` gives matching models with default settings and with ``n_max_processes=1``."""
    expected_keys = {'n_topics', 'n_iter', 'random_state'}
    grid = [{'n_topics': k} for k in (2, 3, 4)]
    fixed = {'n_iter': 3, 'random_state': 1}

    multi = tm_lda.compute_models_parallel(EVALUATION_TEST_DTM, grid, fixed)
    assert len(multi) == len(grid)

    for params, fitted in multi:
        assert set(params.keys()) == expected_keys
        assert isinstance(fitted, lda.LDA)
        assert isinstance(fitted.doc_topic_, np.ndarray)
        assert isinstance(fitted.topic_word_, np.ndarray)

    single = tm_lda.compute_models_parallel(EVALUATION_TEST_DTM, grid, fixed,
                                            n_max_processes=1)
    assert len(single) == len(multi)

    for params_single, fitted_single in single:
        # find the first model from the other run that has the same parameter set
        candidates = [m for p, m in multi if p == params_single]
        assert candidates
        fitted_multi = candidates[0]

        assert np.allclose(fitted_multi.doc_topic_, fitted_single.doc_topic_)
        assert np.allclose(fitted_multi.topic_word_, fitted_single.topic_word_)

开发者ID:WZBSocialScienceCenter，项目名称:tmtoolkit，代码行数:30，代码来源:test_topicmod_evaluate.py

示例14: create_lda

# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def create_lda(self, filename):
        """
        Initialise the bookkeeping state for ``filename`` and load or build its LDA features.

        When ``self.load()`` succeeds its result is returned. Otherwise the data is
        read with ``self.loadfile()`` and ``self.preprocess()``, ``self.csr_mat`` is
        replaced by the output of ``lda.LDA.fit_transform`` wrapped in a
        ``csr_matrix``, and ``self.save()`` is called. If that fallback fails as
        well, ``self.flag`` is set to False.

        Only ordinary errors (``Exception``) trigger the fallbacks;
        ``KeyboardInterrupt`` and ``SystemExit`` propagate to the caller.

        :param filename: data file name; the text before its first "." becomes ``self.name``
        :return: the result of ``self.load()`` when loading succeeded, otherwise ``self``
        """
        self.filename = filename
        self.name = self.filename.split(".")[0]
        self.flag = True
        self.hasLabel = True
        self.record = {"x": [], "pos": []}
        self.body = {}
        self.est_num = []
        self.lastprob = 0
        self.offset = 0.5
        self.interval = 3
        self.last_pos = 0
        self.last_neg = 0

        try:
            ## if model already exists, load it ##
            return self.load()
        except Exception:
            ## otherwise read from file ##
            try:
                self.loadfile()
                self.preprocess()
                import lda
                from scipy.sparse import csr_matrix
                lda1 = lda.LDA(n_topics=100, alpha=0.1, eta=0.01, n_iter=200)
                self.csr_mat = csr_matrix(lda1.fit_transform(self.csr_mat))
                self.save()
            except Exception:
                ## deliberate best effort: any failure here (e.g. file missing from ##
                ## the workspace) is reported through the flag, not raised ##
                self.flag = False
        return self

开发者ID:fastread，项目名称:src，代码行数:34，代码来源:mar.py

示例15: fit_topics

# 需要导入模块: import lda [as 别名]
# 或者: from lda import LDA [as 别名]
def fit_topics(data, embeddings, vocab, K):
    """Fit a topic model to bag-of-words data.

    Fits ``lda.LDA`` with ``K`` topics, prints the 20 highest-weighted words
    of every topic and returns the fitted matrices.

    :param data: bag-of-words input passed unchanged to ``lda.LDA.fit``
    :param embeddings: matrix multiplied from the right onto the topic-word
                       matrix (presumably one embedding row per vocabulary
                       word -- confirm against the caller)
    :param vocab: sequence of words, indexed by the columns of the topic-word matrix
    :param K: number of topics
    :return: tuple ``(topics, lda_centers, topic_proportions)`` holding
             ``model.topic_word_``, ``np.matmul(topics, embeddings)`` and
             ``model.doc_topic_``
    """
    model = lda.LDA(n_topics=K, n_iter=1500, random_state=1)
    model.fit(data)
    topics = model.topic_word_
    lda_centers = np.matmul(topics, embeddings)

    print('LDA Gibbs topics')
    n_top_words = 20
    # convert the vocabulary once instead of once per topic
    vocab_arr = np.asarray(vocab)
    for i, topic_dist in enumerate(topics):
        # indices of the n_top_words largest weights, largest first
        top_idx = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(i, ' '.join(vocab_arr[top_idx])))
    print('\n')
    topic_proportions = model.doc_topic_

    return topics, lda_centers, topic_proportions

开发者ID:IBM，项目名称:HOTT，代码行数:17，代码来源:data.py

注：本文中的lda.LDA属性示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。