This page collects typical usage examples of the LatentDirichletAllocation.fit method from Python's sklearn.decomposition module. If you have been wondering what exactly LatentDirichletAllocation.fit does and how to use it, the curated examples below may help. You can also read further about the containing class, sklearn.decomposition.LatentDirichletAllocation.
The following shows 15 code examples of LatentDirichletAllocation.fit, sorted by popularity by default.
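All 15 examples share one basic pattern: build a document-term matrix, call fit on it, then inspect the learned topics. A minimal, self-contained sketch of that pattern (the toy corpus and topic count are illustrative only, not taken from any example below):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = ["the cat sat on the mat",
        "dogs and cats are pets",
        "stock prices rose sharply today",
        "markets fell on interest rate news"]

# Build a document-term count matrix, then fit LDA on it.
tf = CountVectorizer(stop_words='english').fit_transform(docs)
lda = LatentDirichletAllocation(n_components=2, max_iter=10, random_state=0)
lda.fit(tf)                      # learn topic-word distributions
doc_topics = lda.transform(tf)   # per-document topic proportions
print(doc_topics.shape)          # (4, 2)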
Example 1: plot_perplexity_iter
# Required imports: from sklearn.decomposition import LatentDirichletAllocation [as alias]
# Or: from sklearn.decomposition.LatentDirichletAllocation import fit [as alias]
def plot_perplexity_iter(A_tfidf, num_topics):
    print("computing perplexity vs iter...")
    max_iter = 5
    perplexity = []
    em_iter = []
    for sweep in range(1, max_iter + 1):
        lda = LatentDirichletAllocation(n_components=num_topics, max_iter=sweep,
                                        learning_method='online', batch_size=512,
                                        random_state=0, n_jobs=-1)
        tic = time()
        lda.fit(A_tfidf)  # online VB
        toc = time()
        print("sweep %d, elapsed time: %.4f sec" % (sweep, toc - tic))
        perplexity.append(lda.perplexity(A_tfidf))
        em_iter.append(lda.n_batch_iter_)
    np.save('./data/perplexity_iter.npy', perplexity)
    f = plt.figure()
    plt.plot(em_iter, perplexity, color='b', marker='o', lw=2.0, label='perplexity')
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('EM iter')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_iter.png')
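This example and the two that follow take a precomputed matrix A_tfidf whose construction is not shown. One plausible way to build it and call the function (corpus file and vectorizer settings are hypothetical):

from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical corpus file, one document per line.
docs = open('./data/corpus.txt').read().splitlines()
A_tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english').fit_transform(docs)
plot_perplexity_iter(A_tfidf, num_topics=10)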
Example 2: plot_perplexity_batch
# Required imports: from sklearn.decomposition import LatentDirichletAllocation [as alias]
# Or: from sklearn.decomposition.LatentDirichletAllocation import fit [as alias]
def plot_perplexity_batch(A_tfidf, num_docs):
    # num_docs is currently unused in this snippet
    print("computing perplexity vs batch size...")
    max_iter = 5
    num_topics = 10
    batch_size = np.logspace(6, 10, 5, base=2).astype(int)
    perplexity = np.zeros((len(batch_size), max_iter))
    em_iter = np.zeros((len(batch_size), max_iter))
    for ii, mini_batch in enumerate(batch_size):
        for jj, sweep in enumerate(range(1, max_iter + 1)):
            lda = LatentDirichletAllocation(n_components=num_topics, max_iter=sweep,
                                            learning_method='online', batch_size=mini_batch,
                                            random_state=0, n_jobs=-1)
            tic = time()
            lda.fit(A_tfidf)  # online VB
            toc = time()
            print("sweep %d, elapsed time: %.4f sec" % (sweep, toc - tic))
            perplexity[ii, jj] = lda.perplexity(A_tfidf)
            em_iter[ii, jj] = lda.n_batch_iter_
    np.save('./data/perplexity.npy', perplexity)
    np.save('./data/em_iter.npy', em_iter)
    f = plt.figure()
    for mb in range(len(batch_size)):
        plt.plot(em_iter[mb, :], perplexity[mb, :], color=np.random.rand(3,), marker='o',
                 lw=2.0, label='mini_batch: ' + str(batch_size[mb]))
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('EM iter')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_batch.png')
Example 3: plot_perplexity_topics
# Required imports: from sklearn.decomposition import LatentDirichletAllocation [as alias]
# Or: from sklearn.decomposition.LatentDirichletAllocation import fit [as alias]
def plot_perplexity_topics(A_tfidf):
    print("computing perplexity vs K...")
    max_iter = 5  # based on plot_perplexity_iter()
    #num_topics = np.linspace(2, 20, 5).astype(int)
    num_topics = np.logspace(1, 2, 5).astype(int)
    perplexity = []
    em_iter = []
    for k in num_topics:
        lda = LatentDirichletAllocation(n_components=k, max_iter=max_iter,
                                        learning_method='online', batch_size=512,
                                        random_state=0, n_jobs=-1)
        tic = time()
        lda.fit(A_tfidf)  # online VB
        toc = time()
        print("K= %d, elapsed time: %.4f sec" % (k, toc - tic))
        perplexity.append(lda.perplexity(A_tfidf))
        em_iter.append(lda.n_batch_iter_)
    np.save('./data/perplexity_topics.npy', perplexity)
    np.save('./data/perplexity_topics2.npy', num_topics)
    f = plt.figure()
    plt.plot(num_topics, perplexity, color='b', marker='o', lw=2.0, label='perplexity')
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('Number of Topics, K')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_topics.png')
Example 4: applyLDA2
# Required imports: from sklearn.decomposition import LatentDirichletAllocation [as alias]
# Or: from sklearn.decomposition.LatentDirichletAllocation import fit [as alias]
def applyLDA2(self, number_of_clusters, country_specific_tweets):
    train, feature_names = self.extractFeatures(country_specific_tweets, False)
    name = "lda"
    if self.results:
        print("Fitting LDA model with tfidf", end=" - ")
    t0 = time()
    lda = LatentDirichletAllocation(n_components=number_of_clusters, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    lda.fit(train)
    if self.results:
        print("done in %0.3fs." % (time() - t0))
    parameters = lda.get_params()
    topics = lda.components_
    doc_topic = lda.transform(train)
    top10, labels = self.printTopicCluster(topics, doc_topic, feature_names)
    labels = numpy.asarray(labels)
    if self.results:
        print("Silhouette Coefficient {0}: {1}".format(name, metrics.silhouette_score(train, labels)))
    return name, parameters, top10, labels
Example 5: extractTopicLDA
# Required imports: from sklearn.decomposition import LatentDirichletAllocation [as alias]
# Or: from sklearn.decomposition.LatentDirichletAllocation import fit [as alias]
def extractTopicLDA(func_message_dic, store_column):
    if len(func_message_dic) == 0:
        print("func_message_dic is null")
        return False
    try:
        conn = MySQLdb.connect(host='192.168.162.122', user='wangyu', passwd='123456', port=3306)
        cur = conn.cursor()
        cur.execute('set names utf8mb4')
        conn.select_db('codeAnalysis')
        for function in func_message_dic:
            message = func_message_dic[function]
            np_extractor = nlp.semantics_extraction.NPExtractor(message)
            text = np_extractor.extract()
            if len(text) == 0:
                continue
            tf_vectorizer = CountVectorizer(max_df=1.0, min_df=1, max_features=n_features,
                                            stop_words='english')
            tf = tf_vectorizer.fit_transform(text)
            print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
                  % (n_samples, n_features))
            lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                            learning_method='online', learning_offset=50.,
                                            random_state=0)
            lda.fit(tf)
            tf_feature_names = tf_vectorizer.get_feature_names_out()
            separator = " "
            for topic_idx, topic in enumerate(lda.components_):
                keywords = separator.join([tf_feature_names[i]
                                           for i in topic.argsort()[:-n_top_words - 1:-1]])
                # parameterize the values to avoid SQL injection; the column name
                # cannot be parameterized, so it is interpolated directly
                sql = "update func_semantic set " + store_column + " = %s where func_name = %s"
                print(sql)
                cur.execute(sql, (keywords, function))
        conn.commit()
        cur.close()
        conn.close()
        return True
    except MySQLdb.Error as e:
        print(e)
        raise
Example 6: LDA
# Required imports: from sklearn.decomposition import LatentDirichletAllocation [as alias]
# Or: from sklearn.decomposition.LatentDirichletAllocation import fit [as alias]
def LDA(tf, word):
    lda = LatentDirichletAllocation(n_components=30, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(tf)
    print_top_words(lda, word, 20)
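Several of these examples call a print_top_words helper that is never defined on this page. A plausible sketch, modeled on the scikit-learn topic-extraction example it is usually copied from (note that Example 8 additionally expects it to return a dict, which this sketch does not):

def print_top_words(model, feature_names, n_top_words):
    # Print the n_top_words highest-weighted words for each topic.
    for topic_idx, topic in enumerate(model.components_):
        top = topic.argsort()[:-n_top_words - 1:-1]
        print("Topic #%d: %s" % (topic_idx,
                                 " ".join(feature_names[i] for i in top)))
    print()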
Example 7: lda_tuner
# Required imports: from sklearn.decomposition import LatentDirichletAllocation [as alias]
# Or: from sklearn.decomposition.LatentDirichletAllocation import fit [as alias]
def lda_tuner(ingroup_otu, best_models):
    best_score = -1 * np.inf
    dtp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    twp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    topic_series = [3]
    X = ingroup_otu.values
    eval_counter = 0
    for topics in topic_series:
        for dtp in dtp_series:
            for twp in twp_series:
                eval_counter += 1
                X_train, X_test = train_test_split(X, test_size=0.5)
                lda = LatentDirichletAllocation(n_components=topics,
                                                doc_topic_prior=dtp,
                                                topic_word_prior=twp,
                                                learning_method='batch',
                                                random_state=42,
                                                max_iter=20)
                lda.fit(X_train)
                this_score = lda.score(X_test)
                this_perplexity = lda.perplexity(X_test)
                if this_score > best_score:
                    best_score = this_score
                    print("New Max Likelihood: {}".format(best_score))
                    print("#{}: n:{}, dtp:{}, twp:{}, score:{}, perp:{}".format(
                        eval_counter, topics, dtp, twp,
                        this_score, this_perplexity))
                best_models.append({'n': topics, 'dtp': dtp, 'twp': twp,
                                    'score': this_score, 'perp': this_perplexity})
                # at the last grid point, also evaluate the symmetric 1/K priors
                if (dtp == dtp_series[-1]) and (twp == twp_series[-1]):
                    eval_counter += 1
                    X_train, X_test = train_test_split(X, test_size=0.5)
                    lda = LatentDirichletAllocation(n_components=topics,
                                                    doc_topic_prior=1. / topics,
                                                    topic_word_prior=1. / topics,
                                                    learning_method='batch',
                                                    random_state=42,
                                                    max_iter=20)
                    lda.fit(X_train)
                    this_score = lda.score(X_test)
                    this_perplexity = lda.perplexity(X_test)
                    if this_score > best_score:
                        best_score = this_score
                        print("New Max Likelihood: {}".format(best_score))
                        print("#{}: n:{}, dtp:{}, twp:{}, score:{} perp: {}".format(
                            eval_counter, topics, (1. / topics), (1. / topics),
                            this_score, this_perplexity))
                    best_models.append({'n': topics, 'dtp': (1. / topics),
                                        'twp': (1. / topics), 'score': this_score,
                                        'perp': this_perplexity})
    return best_models
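A hypothetical call, assuming ingroup_otu is a pandas DataFrame of per-sample OTU counts (the file name is invented for illustration):

import pandas as pd

# Hypothetical OTU count table: rows are samples, columns are taxa.
ingroup_otu = pd.read_csv('otu_table.csv', index_col=0)
best_models = lda_tuner(ingroup_otu, best_models=[])
best = max(best_models, key=lambda m: m['score'])
print(best)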
Example 8: fit_lda
# Required imports: from sklearn.decomposition import LatentDirichletAllocation [as alias]
# Or: from sklearn.decomposition.LatentDirichletAllocation import fit [as alias]
def fit_lda(tf):
    '''Takes in a tf sparse matrix and finds the top topics.'''
    # n_topics, tf_vectorizer, and n_top_words are module-level globals here
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    lda.fit(tf)
    tf_feature_names = tf_vectorizer.get_feature_names_out()
    lda_topic_dict = print_top_words(lda, tf_feature_names, n_top_words)
    return lda, lda_topic_dict
Example 9: topicmodel
# Required imports: from sklearn.decomposition import LatentDirichletAllocation [as alias]
# Or: from sklearn.decomposition.LatentDirichletAllocation import fit [as alias]
def topicmodel(comments):
    _texts = []
    texts = []
    for c in comments:
        c = c['text']
        _texts.append(c)
        texts.append(c)
    tf_vectorizer = CountVectorizer(
        max_df=.20,
        min_df=10,
        stop_words=stopwords)
    texts = tf_vectorizer.fit_transform(texts)
    ## test between 2 and 10 topics
    topics = {}
    for k in range(2, 10):
        print("Testing", k)
        model = LatentDirichletAllocation(
            n_components=k,
            max_iter=5,
            learning_method='batch',
            learning_offset=50.,
            random_state=0
        )
        model.fit(texts)
        ll = model.score(texts)
        topics[ll] = model
    ## keep the model with the highest log-likelihood
    topic = max(topics.keys())
    ret = collections.defaultdict(list)
    ## ugly, rewrite some day
    model = topics[topic]
    ## for debugging, print the chosen model's topics
    feature_names = tf_vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-5 - 1:-1]]))
        print()
    for i, topic in enumerate(model.transform(texts)):
        topic = numpy.argmax(topic)
        text = _texts[i]
        ret[topic].append(text)
    return ret
Example 10: __init__
# Required imports: from sklearn.decomposition import LatentDirichletAllocation [as alias]
# Or: from sklearn.decomposition.LatentDirichletAllocation import fit [as alias]
class LDATopics:
    # Constructor
    def __init__(self, filename):
        # Member variables
        self.email_data = []
        self.lda = None
        self.feature_names = None
        self.num_topics = NUM_TOPICS
        self.num_words_per_topic = NUM_WORDS_PER_TOPIC
        self.num_features = NUM_FEATURES
        # Load emails from full path to file
        emails = EmailLoader(filename).get_email_dict_array()
        # Process emails into a list of email body contents
        for email_rec in emails:
            if email_rec['body']:
                # Clean the text and add to list
                cleaner = TextCleaner(email_rec['body'])
                self.email_data.append(" ".join(cleaner.tokenize_str()))

    ## Public methods ##
    def process(self, topics=None, features=None):
        # Check if default numbers should be used
        if topics is None:
            topics = self.num_topics
        if features is None:
            features = self.num_features
        # Calculate term frequency for LDA
        tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=features,
                                        stop_words='english')
        tf = tf_vectorizer.fit_transform(self.email_data)
        # Fit the LDA model to data samples
        self.lda = LatentDirichletAllocation(n_components=topics, max_iter=5,
                                             learning_method='online', learning_offset=50.,
                                             random_state=0)
        self.lda.fit(tf)
        # Set the feature names (words)
        self.feature_names = tf_vectorizer.get_feature_names_out()

    def print_topics(self, words_per_topic=None):
        # Check if default number of words per topic should be used
        if words_per_topic is None:
            words_per_topic = self.num_words_per_topic
        self._print_topics(self.lda, self.feature_names, words_per_topic)

    ## Private methods ##
    def _print_topics(self, model, feature_names, words_per_topic):
        for topic_idx, topic in enumerate(model.components_):
            print("Topic #%d:" % topic_idx)
            print(" ".join([feature_names[i]
                            for i in topic.argsort()[:-words_per_topic - 1:-1]]))
            print()
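A hypothetical usage of the class; EmailLoader, TextCleaner, and the NUM_* constants are defined elsewhere in the source repository, and the file path is invented:

topics = LDATopics('/path/to/emails.db')  # hypothetical path
topics.process(topics=10, features=1000)
topics.print_topics(words_per_topic=8)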
Example 11: perform_analysis
# Required imports: from sklearn.decomposition import LatentDirichletAllocation [as alias]
# Or: from sklearn.decomposition.LatentDirichletAllocation import fit [as alias]
def perform_analysis(self, stocks, szTimeAxis, n_ahead):
    # load Snowball comment data
    from agares.datasource.snowball_cmt_loader import SnowballCmtLoader
    SBLoader = SnowballCmtLoader()
    date = self.dt_start.date()
    df_cmt_list = []
    while date <= self.dt_end.date():
        df_cmt_list.append(SBLoader.load(str(date)))
        date += timedelta(days=1)
    df_cmt = pd.concat(df_cmt_list, ignore_index=True)
    # Chinese text segmentation
    self.set_jieba()
    df_cmt['RawComment'] = df_cmt['RawComment'].map(jieba.cut)
    # drop stopwords
    self.stopwords = [line.strip() for line in open('stopwords').readlines()]
    self.stopwords.append(' ')
    df_cmt['RawComment'] = df_cmt['RawComment'].map(self.drop_useless_word)
    cmt = df_cmt['RawComment'].tolist()
    # construct tf-idf matrix
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.95, min_df=0.05)
    tfidf = tfidf_vectorizer.fit_transform(cmt)
    # Fit the NMF model
    n_topics = 5
    n_top_words = 20
    print("Fitting the NMF model with tf-idf features..")
    t0 = time()
    nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
    print("done in %0.3fs." % (time() - t0))
    print("\nTopics in NMF model:")
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
    self.print_top_words(nmf, tfidf_feature_names, n_top_words)
    # Fit the LDA model
    print("Fitting the LDA model with tf-idf features..")
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=10,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    t0 = time()
    lda.fit(tfidf)
    print("done in %0.3fs." % (time() - t0))
    print("\nTopics in LDA model:")
    self.print_top_words(lda, tfidf_feature_names, n_top_words)
    # load sz daily candlestick data
    sz = next(iter(stocks))
    cst_Day = stocks[sz].cst['1Day']
    # print close price within the timescope
    date = self.dt_start
    print()
    print("The ShangHai stock Index (close index) within the timescope")
    while date <= self.dt_end:
        ts = pd.to_datetime(date)
        try:
            print("Date: {0:s}, Index: {1:.2f}".format(str(date.date()), cst_Day.at[ts, 'close']))
        except KeyError:  # sz candlestick data does not exist at this datetime
            print("Date: {0:s}, Index: (market closed)".format(str(date.date())))
        date += timedelta(days=1)
Example 12: LDA
# Required imports: from sklearn.decomposition import LatentDirichletAllocation [as alias]
# Or: from sklearn.decomposition.LatentDirichletAllocation import fit [as alias]
def LDA(matrix, preserve, n_topics=100):
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=10,
                                    learning_method='online', learning_offset=50.,
                                    random_state=randint(1, 100))
    lda.fit(matrix[preserve])          # fit only on the preserved rows
    topic_model = lda.transform(matrix)  # infer topics for all rows
    return topic_model
Example 13: calculate_lda
# Required imports: from sklearn.decomposition import LatentDirichletAllocation [as alias]
# Or: from sklearn.decomposition.LatentDirichletAllocation import fit [as alias]
def calculate_lda(self, tfidf):
    print("Fitting LDA model with tf-idf features...")
    lda = LatentDirichletAllocation(n_components=self.num_topics, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    t0 = time()
    lda.fit(tfidf)
    print("Topics in LDA model:")
    print_top_words(lda, self.tfidf_feature_names, self.num_words)
    print("done in %0.3fs." % (time() - t0))
Example 14: test_lda_score_perplexity
# Required imports: from sklearn.decomposition import LatentDirichletAllocation [as alias]
# Or: from sklearn.decomposition.LatentDirichletAllocation import fit [as alias]
def test_lda_score_perplexity():
    # Test the relationship between LDA score and perplexity
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                    random_state=0)
    lda.fit(X)
    perplexity_1 = lda.perplexity(X, sub_sampling=False)
    score = lda.score(X)
    perplexity_2 = np.exp(-1. * (score / np.sum(X.data)))
    assert_almost_equal(perplexity_1, perplexity_2)
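The identity under test is perplexity(X) = exp(-score(X) / N), where N is the total token count of X. A quick self-contained check of the same relationship on random data (this fixture is invented, not the _build_sparse_mtx helper used above):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import LatentDirichletAllocation

rng = np.random.RandomState(0)
X = csr_matrix(rng.randint(0, 5, size=(20, 30)))  # random word counts
lda = LatentDirichletAllocation(n_components=3, random_state=0).fit(X)
n_tokens = X.data.sum()  # total word count in the corpus
assert np.isclose(lda.perplexity(X), np.exp(-lda.score(X) / n_tokens))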
Example 15: get_lda
# Required imports: from sklearn.decomposition import LatentDirichletAllocation [as alias]
# Or: from sklearn.decomposition.LatentDirichletAllocation import fit [as alias]
def get_lda():
    # K, X, and VECTORIZER are module-level globals
    lda = LatentDirichletAllocation(
        n_components=K,
        max_iter=5,
        learning_method='online',
        learning_offset=50.,
        random_state=0)
    lda.fit(X)
    tf_feature_names = VECTORIZER.get_feature_names_out()
    print_top_words(lda, tf_feature_names, 10)
    return lda
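get_lda relies on the module-level names K, X, and VECTORIZER, which are not defined in the snippet. A plausible setup (corpus and topic count invented for illustration, reusing the print_top_words helper sketched under Example 6):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["the quick brown fox", "lazy dogs sleep all day",
        "foxes and dogs are animals"]  # hypothetical corpus
K = 2                                  # hypothetical topic count
VECTORIZER = CountVectorizer(stop_words='english')
X = VECTORIZER.fit_transform(docs)
lda = get_lda()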