本文整理汇总了Python中gensim.models.ldamodel.LdaModel.print_topics方法的典型用法代码示例。如果您正苦于以下问题:Python LdaModel.print_topics方法的具体用法?Python LdaModel.print_topics怎么用?Python LdaModel.print_topics使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类gensim.models.ldamodel.LdaModel
的用法示例。
在下文中一共展示了LdaModel.print_topics方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_topics
# 需要导入模块: from gensim.models.ldamodel import LdaModel [as 别名]
# 或者: from gensim.models.ldamodel.LdaModel import print_topics [as 别名]
def get_topics(candidate, day):
    """Train a 3-topic LDA model over one candidate's tweets for one day.

    Args:
        candidate: key into the module-level candidate_slugs mapping.
        day: date string "YYYY-MM-DD" selecting a 24-hour window.

    Returns:
        The output of LdaModel.print_topics(3), or None when the Mongo
        query / modelling fails (best-effort behaviour kept from original).
    """
    # Millisecond epoch bounds for the requested day.
    start_time = datetime.strptime(day, "%Y-%m-%d").date()
    start_time = int(start_time.strftime('%s')) * 1000
    end_time = start_time + 86399999
    try:
        client = MongoClient()
        tweets = client.fletcher.tweets
        tweets = tweets.aggregate([
            {"$match": {"$text": {"$search": candidate_search[candidate_slugs[candidate]]}}},
            {"$match": {"timestamp_ms": {"$gte": start_time, "$lt": end_time}}}])
        # Keep only ASCII letters and spaces from each tweet body.
        pattern = re.compile("[^a-zA-Z ]")
        documents = [pattern.sub('', tweet['text']) for tweet in tweets]
        stoplist = set(candidate_stop_words[candidate_slugs[candidate]] + stopwords)
        texts = [[word for word in document.lower().split() if word not in stoplist]
                 for document in documents]
        # Drop tokens that appear only once across the whole corpus.
        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1
        texts = [[token for token in text if frequency[token] > 1]
                 for text in texts]
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=3,
                       update_every=1, chunksize=10000, passes=10)
        return lda.print_topics(3)
    except Exception:
        # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt are
        # no longer swallowed.  Callers treat None as "no topics available".
        return None
示例2: lda_topic_model
# 需要导入模块: from gensim.models.ldamodel import LdaModel [as 别名]
# 或者: from gensim.models.ldamodel.LdaModel import print_topics [as 别名]
def lda_topic_model(data, is_clean=False, num_of_topics=10, num_of_pass=5):
    """Fit an LDA topic model over *data* and return its topics.

    Args:
        data: raw documents (when is_clean is False) or an already
            tokenised list of word lists.
        is_clean: when True, skip the stop-word/pre-processing step.
        num_of_topics: how many topics the LDA model should generate.
        num_of_pass: training passes; more passes give a more accurate
            model, but many passes can be slow on a very large corpus.

    Returns:
        ldamodel.print_topics(...) output: top 10 words for each topic.
    """
    if not is_clean:
        stops = set(nltk.corpus.stopwords.words("english"))
        texts = prepare_for_lda(data, stops)
    else:
        texts = data
    dictionary = corpora.Dictionary(texts)
    # Was `print dictionary` (Python 2 statement); a single-argument
    # print() call behaves identically on both Python 2 and 3.
    print(dictionary)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = LdaModel(corpus, id2word=dictionary,
                        num_topics=num_of_topics, passes=num_of_pass)
    return ldamodel.print_topics(num_topics=num_of_topics, num_words=10)
示例3: analyze_speeches
# 需要导入模块: from gensim.models.ldamodel import LdaModel [as 别名]
# 或者: from gensim.models.ldamodel.LdaModel import print_topics [as 别名]
def analyze_speeches(filename="1.txt"):
    """Train an LDA model over all GC.N_SPEECHES speech files, then print
    the discovered topics and the overall most common words.

    Args:
        filename - speech file
    """
    # NOTE(review): the `filename` argument is overwritten inside the loop
    # below, so the parameter is effectively ignored -- confirm intent.
    # Python 2 script: xrange, str.decode, two-argument str.translate.
    dictionary = corpora.dictionary.Dictionary()
    train_documents = list()
    all_words = list()
    for i in xrange(1, GC.N_SPEECHES):
        filename = path_join(GC.SPEECH_FOLDER, str(i) + ".txt")
        with open(filename, "r") as speech_file:
            speech_words = list()
            for line in speech_file:
                words = line.strip().decode("utf8").split()
                words = [word for word in words if valid_word(word)]
                # Transliterate to ASCII, then strip punctuation (`punct`)
                # via the Python-2 two-argument translate form.
                words = " ".join(map(unidecode, words))
                output = words.translate(string.maketrans("", ""), punct)
                speech_words += [word.lower() for word in output.split()
                                 if valid_word(word, True)]
            all_words += speech_words
            dictionary.add_documents([speech_words])
            train_documents.append(speech_words)
    # Bag-of-words corpus over all collected speeches.
    corpus = [dictionary.doc2bow(text) for text in train_documents]
    lda = LdaModel(corpus=corpus,
                   id2word=dictionary,
                   num_topics=GC.N_TOPICS,
                   passes=10,
                   alpha='auto')
    print '{} topics with corresponding top {} words'.format(GC.N_TOPICS, 10)
    pprint(lda.print_topics())
    # Raw (pre-LDA) frequency ranking across every speech.
    word_counter = Counter(all_words)
    print 'Top {} words in {} speeches of NaMo'.format(GC.N_TOP_WORDS,
                                                       GC.N_SPEECHES)
    pprint(word_counter.most_common(GC.N_TOP_WORDS))
示例4: print
# 需要导入模块: from gensim.models.ldamodel import LdaModel [as 别名]
# 或者: from gensim.models.ldamodel.LdaModel import print_topics [as 别名]
from util.TextSimilarity import TextSimilarity
from util.TaskReader import TaskReader
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
# Fit a one-topic LDA model per document and print its top words.
print("LDA Output: ")
first_num = 244
task = TaskReader.read("text.txt")
similarity = TextSimilarity('french')
doc_set = similarity.get_modified_text(task.text)
edu_set = similarity.get_modified_text(task.education)
# Vocabulary is built from the education text only.
dictionary = Dictionary([i.split() for i in edu_set])
for i, doc in enumerate(doc_set):
    num = i + first_num
    # Single-document corpus for this iteration.
    corpus = [dictionary.doc2bow(doc.split())]
    ldamodel = LdaModel(corpus, num_topics=1, id2word=dictionary, passes=50)
    # Plain loop instead of a list comprehension used only for its
    # print() side effects (the built list was discarded).
    for topic in ldamodel.print_topics(num_topics=1, num_words=6):
        print("Topic № " + str(num) + " : " + topic[1])
示例5: clean
# 需要导入模块: from gensim.models.ldamodel import LdaModel [as 别名]
# 或者: from gensim.models.ldamodel.LdaModel import print_topics [as 别名]
# symbols stripped from every word before counting
symbols = [',', '.', '-', '@']


def clean(word):
    """Return *word* lowercased with all `symbols` characters removed.

    Uses str.translate so every symbol is removed in a single pass
    instead of one .replace() call per symbol.
    """
    return word.lower().translate({ord(symbol): None for symbol in symbols})
# load dictionary that was generated by calling corpora.Dictionary() on TripAdvisor text
id2word = Dictionary.load('scraper/tripadvisor.dict')
# load the sparse matrix previously serialized with corpora.MmCorpus.serialize()
mm = MmCorpus('scraper/tripadvisor.mm')
# call LDA, set topics to 100
lda = LdaModel(corpus=mm, id2word=id2word, num_topics=100, update_every=1, chunksize=100, passes=1)
# NOTE(review): the parsing below assumes the old gensim API where
# print_topics() returned plain strings like "0.1*word + 0.2*other";
# newer gensim returns (topic_id, string) tuples -- confirm the version.
topics = lda.print_topics(num_topics=100)
top_words = []
# construct top words list based on topics and words generated by LDA
for topic in topics:
    topic = topic.replace('+', '')
    topic_split = topic.split()
    for item in topic_split:
        # each token looks like "0.1*word"; keep only the word part
        item = item.strip().split('*')[1]
        top_words.append(clean(item))
# write top words to a file; `with` closes the handle even on error
# (the original open()/write loop leaked the file object)
with open('scraper/top_words_lda.txt', 'w') as f:
    for word in top_words:
        f.write(word.encode('utf-8') + '\n')
示例6: MyCorpus
# 需要导入模块: from gensim.models.ldamodel import LdaModel [as 别名]
# 或者: from gensim.models.ldamodel.LdaModel import print_topics [as 别名]
from gensim import corpora, models
from gensim.models.ldamodel import LdaModel
import pymongo
# put top k word here
top_k_words = [['human', 'graph', 'computer']]
dictionary = corpora.Dictionary(top_k_words)
# Was a Python 2 print statement; single-argument print() is identical
# on both Python 2 and 3.
print(dictionary.token2id)


class MyCorpus(object):
    """Streams one bag-of-words vector per line of mycorpus.txt."""

    def __iter__(self):
        # change to get document from mongodb
        for line in open('mycorpus.txt'):
            yield dictionary.doc2bow(line.lower().split())


corpus = MyCorpus()
lda = LdaModel(corpus, num_topics=2, id2word=dictionary)
print(lda.print_topics(2))
示例7: print
# 需要导入模块: from gensim.models.ldamodel import LdaModel [as 别名]
# 或者: from gensim.models.ldamodel.LdaModel import print_topics [as 别名]
# print(tfidf[vec])
# index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)
# sims = index[tfidf[vec]]
# print(list(enumerate(sims)))
# Round-trip the corpus through Matrix Market format on disk.
corpora.MmCorpus.save_corpus('file.mm', corpus)
#id2word= corpora.Dictionary.load('deerwester.dict')
mmCorpus = corpora.MmCorpus("file.mm")
print(mmCorpus)
# LSI with 10 latent topics over the serialized corpus.
lsi = LsiModel(mmCorpus, id2word=dictionary, num_topics=10)
print("lsi:")
#print(lsi[new_vec])
lsi.print_debug(4, 4)
lsi.print_topics(4, 2)
# Bug fix: topics are indexed 0..num_topics-1, so id 10 was out of range
# for this 10-topic model; 9 is the last valid topic id.
lsi.show_topic(9, 10)
lda = LdaModel(mmCorpus, id2word=dictionary, num_topics=10)
lda.print_topics(4, 4)
doc_lda = lda[new_vec]
print("lda:")
#print doc_lda
# corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
# [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
# [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
# [(0, 1.0), (4, 2.0), (7, 1.0)],
# [(3, 1.0), (5, 1.0), (6, 1.0)],
# [(9, 1.0)],
# [(9, 1.0), (10, 1.0)],
# [(9, 1.0), (10, 1.0), (11, 1.0)],
# [(8, 1.0), (10, 1.0), (11, 1.0)]]
示例8: documents
# 需要导入模块: from gensim.models.ldamodel import LdaModel [as 别名]
# 或者: from gensim.models.ldamodel.LdaModel import print_topics [as 别名]
# Remove the 50 most frequent tokens from the dictionary -- they act as
# corpus-specific stop words.  (Python 2 script: note the print statements.)
words,ids = dictionary.filter_n_most_frequent(50)
print words,"\n\n",ids
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
#Creating the object for LDA model using gensim library & Training LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=50, id2word = dictionary, passes=50, iterations=500)
# Persist the trained model to disk with cPickle (Python 2 module).
ldafile = open('lda_model_sym_wiki.pkl','wb')
cPickle.dump(ldamodel,ldafile)
ldafile.close()
#Print all the 50 topics
# NOTE(review): assumes print_topics yields (topic_id, topic_string)
# pairs, hence topic[0]+1 for 1-based numbering -- confirm gensim version.
for topic in ldamodel.print_topics(num_topics=50, num_words=10):
    print topic[0]+1, " ", topic[1],"\n"