本文整理汇总了Python中sklearn.decomposition.LatentDirichletAllocation.fit_transform方法的典型用法代码示例。如果您正苦于以下问题:Python LatentDirichletAllocation.fit_transform方法的具体用法?Python LatentDirichletAllocation.fit_transform怎么用?Python LatentDirichletAllocation.fit_transform使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.decomposition.LatentDirichletAllocation
的用法示例。
在下文中一共展示了LatentDirichletAllocation.fit_transform方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_lda_default_prior_params
# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import fit_transform [as 别名]
def test_lda_default_prior_params():
# default prior parameter should be `1 / topics`
# and verbose params should not affect result
n_topics, X = _build_sparse_mtx()
prior = 1. / n_topics
lda_1 = LatentDirichletAllocation(n_topics=n_topics, doc_topic_prior=prior,
topic_word_prior=prior, random_state=0)
lda_2 = LatentDirichletAllocation(n_topics=n_topics, random_state=0)
topic_distr_1 = lda_1.fit_transform(X)
topic_distr_2 = lda_2.fit_transform(X)
assert_almost_equal(topic_distr_1, topic_distr_2)
示例2: test_lda_score
# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import fit_transform [as 别名]
def test_lda_score():
# Test LDA score for batch training
# score should be higher after each iteration
n_topics, X = _build_sparse_mtx()
for method in ('online', 'batch'):
lda_1 = LatentDirichletAllocation(n_topics=n_topics, max_iter=1, learning_method=method,
total_samples=100, random_state=0)
lda_2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=10, learning_method=method,
total_samples=100, random_state=0)
lda_1.fit_transform(X)
score_1 = lda_1.score(X)
lda_2.fit_transform(X)
score_2 = lda_2.score(X)
assert_greater_equal(score_2, score_1)
示例3: basic_lda
# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import fit_transform [as 别名]
def basic_lda(df, n_topics=200, max_df=0.5, min_df=5):
'''
Basic LDA model for album recommendations
Args:
df: dataframe with Pitchfork reviews
n_topics: number of lda topics
max_df: max_df in TfidfVectorizer
min_df: min_df in TfidfVectorizer
Returns:
tfidf: sklearn fitted TfidfVectorizer
tfidf_trans: sparse matrix with tfidf transformed data
lda: sklearn fitted LatentDirichletAllocation
lda_trans: dense array with lda transformed data
'''
X = df['review']
cv = CountVectorizer(stop_words='english',
min_df=5,
max_df=0.5)
cv_trans = cv.fit_transform(X)
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=7)
lda_trans = lda.fit_transform(cv_trans)
return cv, cv_trans, lda, lda_trans
示例4: _get_model_LDA
# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import fit_transform [as 别名]
def _get_model_LDA(self, corpus):
#lda = models.LdaModel(corpus, id2word=self.corpus.dictionary, num_topics=5, alpha='auto', eval_every=50)
lda = LatentDirichletAllocation(n_topics=self.num_of_clusters, max_iter=20,
learning_method='online',
learning_offset=50.,
random_state=1)
return lda.fit_transform(corpus)
示例5: produceLDATopics
# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import fit_transform [as 别名]
def produceLDATopics():
'''
Takes description of each game and uses sklearn's latent dirichlet allocation and count vectorizer
to extract topics.
:return: pandas data frame with topic weights for each game (rows) and topic (columns)
'''
data_samples, gameNames = create_game_profile_df(game_path)
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
tf = tf_vectorizer.fit_transform(data_samples)
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
learning_method='online', learning_offset=50.,
random_state=0)
topics = lda.fit_transform(tf)
# for i in range(50):
# gameTopics = []
# for j in range(len(topics[0])):
# if topics[i,j] > 1.0/float(n_topics):
# gameTopics.append(j)
# print gameNames[i], gameTopics
topicsByGame = pandas.DataFrame(topics)
topicsByGame.index = gameNames
print topicsByGame
tf_feature_names = tf_vectorizer.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
print("Topic #%d:" % topic_idx)
print(" ".join([tf_feature_names[i]
for i in topic.argsort()[:-n_top_words - 1:-1]]))
return topicsByGame
示例6: score_lda
# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import fit_transform [as 别名]
def score_lda(src, dst):
##read sentence pairs to two lists
b1 = []
b2 = []
lines = 0
with open(src) as p:
for i, line in enumerate(p):
s = line.split('\t')
b1.append(s[0])
b2.append(s[1][:-1]) #remove \n
lines = i + 1
vectorizer = CountVectorizer()
vectors=vectorizer.fit_transform(b1 + b2)
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
learning_method='online', learning_offset=50.,
random_state=0)
X = lda.fit_transform(vectors)
print X.shape
b1_v = vectorizer.transform(b1)
b2_v = vectorizer.transform(b2)
b1_vecs = lda.transform(b1_v)
b2_vecs = lda.transform(b2_v)
res = [round(5*(1 - spatial.distance.cosine(b1_vecs[i], b2_vecs[i])),2) for i in range(lines)]
with open(dst, 'w') as thefile:
thefile.write("\n".join(str(i) for i in res))
示例7: get_features
# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import fit_transform [as 别名]
def get_features(vocab):
vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
X_train_head = vectorizer_head.fit_transform(headlines)
vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
X_train_body = vectorizer_body.fit_transform(bodies)
# calculates n most important topics of the bodies. Each topic contains all words but ordered by importance. The
# more important topic words a body contains of a certain topic, the higher its value for this topic
lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', random_state=0, n_jobs=3)
print("latent_dirichlet_allocation_cos: fit and transform body")
t0 = time()
lda_body_matrix = lda_body.fit_transform(X_train_body)
print("done in %0.3fs." % (time() - t0))
print("latent_dirichlet_allocation_cos: transform head")
# use the lda trained for body topcis on the headlines => if the headlines and bodies share topics
# their vectors should be similar
lda_head_matrix = lda_body.transform(X_train_head)
#print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)
print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body')
# calculate cosine distance between the body and head
X = []
for i in range(len(lda_head_matrix)):
X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1)) #1d array is deprecated
X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1))
cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
X.append(cos_dist.tolist())
return X
示例8: latdirall
# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import fit_transform [as 别名]
def latdirall(content):
lda = LatentDirichletAllocation(n_topics=10)
tf_vectorizer = TfidfVectorizer(max_df=0.99, min_df=1,
stop_words='english')
tf = tf_vectorizer.fit_transform(content)
lolz = lda.fit_transform(tf)
tfidf_feature_names = tf_vectorizer.get_feature_names()
return top_topics(lda, tfidf_feature_names, 10)
示例9: test_lda_transform
# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import fit_transform [as 别名]
def test_lda_transform():
# Test LDA transform.
# Transform result cannot be negative
rng = np.random.RandomState(0)
X = rng.randint(5, size=(20, 10))
n_topics = 3
lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng)
X_trans = lda.fit_transform(X)
assert_true((X_trans > 0.0).any())
示例10: test_lda_perplexity
# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import fit_transform [as 别名]
def test_lda_perplexity():
# Test LDA perplexity for batch training
# perplexity should be lower after each iteration
n_topics, X = _build_sparse_mtx()
for method in ('online', 'batch'):
lda_1 = LatentDirichletAllocation(n_topics=n_topics, max_iter=1, learning_method=method,
total_samples=100, random_state=0)
lda_2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=10, learning_method=method,
total_samples=100, random_state=0)
distr_1 = lda_1.fit_transform(X)
perp_1 = lda_1.perplexity(X, distr_1, sub_sampling=False)
distr_2 = lda_2.fit_transform(X)
perp_2 = lda_2.perplexity(X, distr_2, sub_sampling=False)
assert_greater_equal(perp_1, perp_2)
perp_1_subsampling = lda_1.perplexity(X, distr_1, sub_sampling=True)
perp_2_subsampling = lda_2.perplexity(X, distr_2, sub_sampling=True)
assert_greater_equal(perp_1_subsampling, perp_2_subsampling)
示例11: test_lda_transform
# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import fit_transform [as 别名]
def test_lda_transform():
# Test LDA transform.
# Transform result cannot be negative and should be normalized
rng = np.random.RandomState(0)
X = rng.randint(5, size=(20, 10))
n_topics = 3
lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng)
X_trans = lda.fit_transform(X)
assert_true((X_trans > 0.0).any())
assert_array_almost_equal(np.sum(X_trans, axis=1), np.ones(X_trans.shape[0]))
示例12: test_lda_fit_transform
# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import fit_transform [as 别名]
def test_lda_fit_transform(method):
# Test LDA fit_transform & transform
# fit_transform and transform result should be the same
rng = np.random.RandomState(0)
X = rng.randint(10, size=(50, 20))
lda = LatentDirichletAllocation(n_components=5, learning_method=method,
random_state=rng)
X_fit = lda.fit_transform(X)
X_trans = lda.transform(X)
assert_array_almost_equal(X_fit, X_trans, 4)
示例13: test_doc_topic_distr_deprecation
# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import fit_transform [as 别名]
def test_doc_topic_distr_deprecation():
# Test that the appropriate warning message is displayed when a user
# attempts to pass the doc_topic_distr argument to the perplexity method
n_components, X = _build_sparse_mtx()
lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
learning_method='batch',
total_samples=100, random_state=0)
distr1 = lda.fit_transform(X)
distr2 = None
assert_warns(DeprecationWarning, lda.perplexity, X, distr1)
assert_warns(DeprecationWarning, lda.perplexity, X, distr2)
示例14: test_lda_score_perplexity
# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import fit_transform [as 别名]
def test_lda_score_perplexity():
# Test the relationship between LDA score and perplexity
n_topics, X = _build_sparse_mtx()
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
random_state=0)
distr = lda.fit_transform(X)
perplexity_1 = lda.perplexity(X, distr, sub_sampling=False)
score = lda.score(X)
perplexity_2 = np.exp(-1. * (score / np.sum(X.data)))
assert_almost_equal(perplexity_1, perplexity_2)
示例15: test_perplexity_input_format
# 需要导入模块: from sklearn.decomposition import LatentDirichletAllocation [as 别名]
# 或者: from sklearn.decomposition.LatentDirichletAllocation import fit_transform [as 别名]
def test_perplexity_input_format():
# Test LDA perplexity for sparse and dense input
# score should be the same for both dense and sparse input
n_topics, X = _build_sparse_mtx()
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=1, learning_method='batch',
total_samples=100, random_state=0)
distr = lda.fit_transform(X)
perp_1 = lda.perplexity(X)
perp_2 = lda.perplexity(X, distr)
perp_3 = lda.perplexity(X.toarray(), distr)
assert_almost_equal(perp_1, perp_2)
assert_almost_equal(perp_1, perp_3)