This page collects typical usage examples of the save method of Python's gensim.models.ldamodel.LdaModel. If you have been wondering what LdaModel.save does, how to call it, or what it looks like in real code, the curated examples below should help. You can also read further about the class it belongs to, gensim.models.ldamodel.LdaModel.
The 15 code examples of LdaModel.save below are sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python samples.
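Before the examples, a minimal sketch of the full train / save / load round trip may help orient readers. The toy corpus and the file name example.ldamodel are invented for illustration:

from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

# A toy corpus of two tokenized documents
texts = [['human', 'computer', 'interaction'],
         ['graph', 'trees', 'minors', 'survey']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Train, persist, and restore the model
lda = LdaModel(corpus, id2word=dictionary, num_topics=2)
lda.save('example.ldamodel')  # large internal arrays may be written to companion files next to this path
lda = LdaModel.load('example.ldamodel')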
Example 1: LDA
# Required import: from gensim.models.ldamodel import LdaModel [as alias]
# Or: from gensim.models.ldamodel.LdaModel import save [as alias]
class LDA(object):
    def __init__(self, model, vocab, corpus=None, topics=200, passes=1):
        self._model_file = model
        self._dict_file = vocab
        self._corpus_file = corpus
        self._topics = topics
        self._passes = passes

    def train(self):
        self._corpus = SentenceDocCorpus(self._corpus_file)
        self._lda = LdaModel(self._corpus, num_topics=self._topics, id2word=self._corpus.dictionary, passes=self._passes)
        self._dictionary = self._corpus.dictionary
        self._lda.save(self._model_file)
        self._dictionary.save(self._dict_file)

    def load(self):
        self._lda = LdaModel.load(self._model_file)
        self._dictionary = Dictionary.load(self._dict_file)

    def topics(self, words):
        return self._lda[self._dictionary.doc2bow(common.filter(words))]

    def topic_vector(self, words):
        # Call __getitem__ directly so eps=0 returns the full (non-sparse) topic distribution
        return np.array([v for k, v in self._lda.__getitem__(self._dictionary.doc2bow(common.filter(words)), eps=0)])
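A hypothetical driver for the wrapper above might look like the following; the file names are invented, and SentenceDocCorpus and common.filter come from the surrounding project, so treat this strictly as a sketch:

lda = LDA('topics.ldamodel', 'topics.dict', corpus='sentences.txt', topics=100, passes=5)
lda.train()   # fits the model, then persists it and the dictionary via LdaModel.save / Dictionary.save
# ... later, in another process ...
lda = LDA('topics.ldamodel', 'topics.dict')
lda.load()    # restores both from disk
print lda.topics(['graph', 'minors', 'survey'])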
Example 2: run
# Required import: from gensim.models.ldamodel import LdaModel [as alias]
# Or: from gensim.models.ldamodel.LdaModel import save [as alias]
def run(self):
    if self.clean_level in ('raw', 'clean', 'stopwords'):
        kind = self.clean_level
    else:
        kind = 'stopwords'
    for idioma in self.output()['langs'].iterkeys():
        dicc_path = self.input()['dict']['langs'][idioma].path
        corp_path = self.input()['corp']['langs'][idioma].path
        print '=============================='
        print 'Running LDA for %s with cleaning level %s' % (idioma, kind)
        print '=============================='
        # Load the dictionary and corpus
        dicc = corpora.Dictionary.load(dicc_path)
        corpus = corpora.MmCorpus(corp_path)
        # Run LDA for this language, once per topic count
        for n_topics in self.output()['langs'][idioma].iterkeys():
            print 'Number of topics: ' + str(n_topics)
            if self.by_chunks:
                lda = LdaModel(corpus, id2word=dicc, num_topics=n_topics, update_every=self.update_e, chunksize=self.chunk_size, passes=self.n_passes)
            else:
                lda = LdaModel(corpus, id2word=dicc, num_topics=n_topics, passes=1)
            lda.save(self.output()['langs'][idioma][n_topics].path)
Example 3: getLdaModel
# Required import: from gensim.models.ldamodel import LdaModel [as alias]
# Or: from gensim.models.ldamodel.LdaModel import save [as alias]
def getLdaModel(bow_corpus, dictionary, useSavedTill):
    if useSavedTill >= USESAVED.lda_model:
        common_logger.info("loading LDA model from file")
        return LdaModel.load(file_lda_model)
    else:
        common_logger.info("Training LDA model")
        num_topics = int(math.log(len(bow_corpus)) + 1)  # heuristic: scale the topic count with the log of the corpus size
        lda_model = LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary, passes=numPasses)
        common_logger.info("Saving LDA model")
        lda_model.save(file_lda_model)
        common_logger.info("Done creating LDA model")
        return lda_model
Example 4: fetch_model
# Required import: from gensim.models.ldamodel import LdaModel [as alias]
# Or: from gensim.models.ldamodel.LdaModel import save [as alias]
def fetch_model(dictionary):
    print "Fetching LDA Model... ",
    try:
        lda = LdaModel.load('Topic/lda.tm')
        print "LDA Model loaded!"
    except IOError:
        print "Model not found, building LDA..."
        corpus = MyCorpus()
        #lda = LdaModel(corpus, num_topics=50, update_every=1, chunksize=1000, passes=15)
        lda = LdaModel(corpus, num_topics=50, id2word=dictionary, update_every=1, chunksize=1000, passes=50)
        print "LDA Built!"
        lda.save('Topic/lda.tm')
    return lda
Example 5: gensim_lda
# Required import: from gensim.models.ldamodel import LdaModel [as alias]
# Or: from gensim.models.ldamodel.LdaModel import save [as alias]
def gensim_lda(d):
    from gensim import corpora, models
    from gensim.models.ldamodel import LdaModel
    list_doc = []
    for i in range(0, len(d)):
        list_doc = list_doc + d[i]
    dictionary = corpora.Dictionary(list_doc)
    # Build an empty model, then feed it one batch of documents at a time
    model = LdaModel(num_topics=20, id2word=dictionary)
    for i in range(0, len(d)):
        print 'Generating corpus and updating model ', i
        corpus = [dictionary.doc2bow(doc) for doc in d[i]]
        model.update(corpus)
    model.save('model_20')
    print model.show_topics(num_topics=20, num_words=10)
Example 6: main
# Required import: from gensim.models.ldamodel import LdaModel [as alias]
# Or: from gensim.models.ldamodel.LdaModel import save [as alias]
def main(argv):
    if len(argv) < 4:
        print 'python train_lda.py group_id num_topics passes'
        sys.exit(1)
    group_id = argv[1]
    num_topics = int(argv[2])
    passes = int(argv[3])
    log.info('Prepare corpus for group: %s' % group_id)
    base_path = 'tables/' + group_id + '/'
    model_base_path = 'ldamodels/' + group_id + '/'
    # build dict and corpus
    #now = datetime.now()
    indicator = 'title-comment'
    source_path = base_path + 'corpus-topic-comment'
    corpus_path = model_base_path + 'corpus-' + indicator + '-' + group_id + '.mm'
    dict_path = model_base_path + 'dict-' + indicator + '-' + group_id + '.dict'
    log.info('Building the dict...')
    build_dict_corpus(source_path, corpus_path, dict_path)
    log.info('Loading dict from pre-saved file...')
    dictionary = corpora.Dictionary.load(dict_path)
    log.info('Done')
    #dictionary.save_as_text(base_path + 'text-dict.txt')
    log.info('Building an LDA model...')
    log.info('Loading corpus from pre-saved .mm file...')
    mmcorpus = corpora.MmCorpus(corpus_path)
    log.info('Done')
    log.info('Training lda model...')
    model = LdaModel(mmcorpus, num_topics=num_topics, id2word=dictionary, passes=passes)
    model_path = model_base_path + indicator + '-' + group_id + '.ldamodel'
    model.save(model_path)
    log.info('Done.')
    model = LdaModel.load(model_path)
    model.show_topics(topics=num_topics, topn=10, log=True)
Example 7: generate_model
# Required import: from gensim.models.ldamodel import LdaModel [as alias]
# Or: from gensim.models.ldamodel.LdaModel import save [as alias]
def generate_model():
    np.set_printoptions(precision=2)
    corpus = []
    corpus += load_expo_cdc()
    corpus += load_lago()
    corpus += load_news()
    corpus += load_news_ic()
    corpus += load_palestras()
    corpus = preprocessing(corpus)
    dictionary = corpora.Dictionary(corpus)
    bow_corpus = [dictionary.doc2bow(text) for text in corpus]
    dictionary.save(DICT)
    corpora.MmCorpus.serialize(BOW_CORPUS, bow_corpus)
    # Stack the bag-of-words corpus onto itself three times (8 copies in all) to enlarge the small training set
    bow2 = np.concatenate((bow_corpus, bow_corpus), axis=0)
    bow2 = np.concatenate((bow2, bow2), axis=0)
    bow2 = np.concatenate((bow2, bow2), axis=0)
    TOPICS = 20
    model = LdaModel(bow2, id2word=dictionary, num_topics=TOPICS, iterations=100, passes=15)
    model.save(MODEL)
    lda_corpus = [model[vector] for vector in bow2]
    lda_dense = gensim.matutils.corpus2dense(lda_corpus, num_terms=TOPICS).transpose()
    """
    tfidf = models.TfidfModel(bow_corpus)
    tfidf_corpus = [tfidf[vector] for vector in bow_corpus]
    tfidf_dense = gensim.matutils.corpus2dense(tfidf_corpus, num_terms=len(dictionary)).transpose()
    """
    classifier = LogisticRegression()
    labels = load_labels()
    # Replicate the labels the same way (8 copies) so they line up with the enlarged corpus
    labels2 = labels
    labels2 += labels2
    labels2 += labels2
    labels2 += labels2
    classifier.fit(lda_dense, labels2)
    joblib.dump(classifier, CLASSIFIER, compress=9)
    #print "LDA results"
    probs = classifier.predict_proba(lda_dense)
Example 8: SNAP_generateLDAForTopic
# Required import: from gensim.models.ldamodel import LdaModel [as alias]
# Or: from gensim.models.ldamodel.LdaModel import save [as alias]
def SNAP_generateLDAForTopic(self, topic, numTopics=5):
    if topic == 'all':
        topics = ['syria', 'ufo', 'movie', 'celebrity', 'russia']  # bieber, cyrus
        for t in topics:
            for nt in [5, 10]:
                self.SNAP_generateLDAForTopic(t, nt)
        return
    id2word = self.SNAP_id2word()
    mmPath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'snap_data',
        "gensim_snap_mmcorpus_%s.mm" % topic
    )
    outPath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'snap_data',
        "gensim_snap_lda_%s_%d" % (topic, numTopics)
    )
    mm = MmCorpus(mmPath)
    lda = LdaModel(corpus=mm, id2word=id2word, num_topics=numTopics, update_every=1, chunksize=10000, passes=1)
    lda.save(outPath)
    return
Example 9: build_lda_model
# Required import: from gensim.models.ldamodel import LdaModel [as alias]
# Or: from gensim.models.ldamodel.LdaModel import save [as alias]
def build_lda_model(corpus, dictionary, num_topics=10):
    file_name = None
    if corpus is None:
        corpus = get_corpus()
    if dictionary is None:
        dictionary = get_dictionary()
    if num_topics == 10:
        file_name = LDA_FILE_10
    elif num_topics == 30:
        file_name = LDA_FILE_30
    elif num_topics == 60:
        file_name = LDA_FILE_60
    elif num_topics == 120:
        file_name = LDA_FILE_120
    else:
        raise ValueError("bad number of topics")
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, update_every=1, chunksize=100, passes=1)
    lda.save(file_name)
    for topic in range(10):
        print "Topic {0}: {1}".format(topic, lda.print_topic(topic))
    return lda
Example 10: train
# Required import: from gensim.models.ldamodel import LdaModel [as alias]
# Or: from gensim.models.ldamodel.LdaModel import save [as alias]
def train(refresh=True):
    if refresh:
        ptb = BracketParseCorpusReader(Corpus.DATA_DIR, Corpus.FILE_PATTERN)
        train_folders = [str(i) + str(j) for i in range(2) for j in range(10)]
        train_folders += [str(i) + str(j) for i in range(2, 3) for j in range(5)]
        dictionary = corpora.dictionary.Dictionary()
        train_documents = list()
        logger.debug('Starting to parse training documents')
        for folder in train_folders:
            for ptb_file in os.listdir(os.path.join(Corpus.DATA_DIR, folder)):
                document_sentences = ptb.sents(fileids=[os.path.join(folder, ptb_file)])
                if len(document_sentences) > DOC_LEN_THRESHOLD:
                    doc2sentence = list(chain.from_iterable(document_sentences))
                    doc2sentence = clean_text(doc2sentence)
                    dictionary.add_documents([doc2sentence])
                    train_documents.append(doc2sentence)
        logger.debug('Parsed all training documents')
        dictionary.filter_extremes(no_below=1, no_above=0.5)
        dictionary.save(DICTIONARY_FILE)
        logger.debug('Creating corpus for training data')
        corpus = [dictionary.doc2bow(text) for text in train_documents]
        logger.debug('Finished creating corpus')
        logger.debug('Training LDA model on corpus')
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=N_TOPICS, passes=20)
        logger.debug('Completed LDA training')
        lda.save(LDA_MODEL_FILE)
    else:
        dictionary = corpora.dictionary.Dictionary.load(DICTIONARY_FILE)
        lda = LdaModel.load(LDA_MODEL_FILE)
    return lda, dictionary
Example 11: print
# Required import: from gensim.models.ldamodel import LdaModel [as alias]
# Or: from gensim.models.ldamodel.LdaModel import save [as alias]
from gensim import corpora, models, similarities
import numpy as np
import time

corpusType = "sraa2_"
subDirectory = 'run_sraa'
t1 = time.time()
corpus = corpora.MmCorpus(subDirectory + '/' + corpusType + 'corpus.mm')
dictionary = corpora.dictionary.Dictionary.load(subDirectory + '/' + corpusType + 'dictionary.dict')
classes = np.loadtxt(subDirectory + '/' + corpusType + 'classes.dat', dtype=int)
t2 = time.time()
print 'data loaded ... seconds: ',
print (t2 - t1)
ldaModel = LdaModel(corpus, num_topics=30, id2word=dictionary, passes=20)
ldaModel.save(subDirectory + '/' + corpusType + 'sraa.lda_model')
t3 = time.time()
print 'ldaModel is finished... seconds:',
print (t3 - t2)
tfidfModel = models.TfidfModel(corpus)
tfidfModel.save(subDirectory + '/' + corpusType + 'sraa.tfidf_model')
t4 = time.time()
print 'tfidfModel is finished... seconds:',
print (t4 - t3)
Example 12: create_models
# Required import: from gensim.models.ldamodel import LdaModel [as alias]
# Or: from gensim.models.ldamodel.LdaModel import save [as alias]
def create_models(db, lsi_num_topics=10, lda_num_topics=5, num_bars=None):
    """
    Create and save an LSI object using data in the database.
    Save this object, along with the dictionary and the corpus, to disk.
    """
    bars = db['bars']
    # Combine both conditions on foursquare.tips in one sub-document
    # (as two separate keys, the second would silently overwrite the first)
    if num_bars is None:
        locations = bars.find({'nymag.review': {'$ne': None},
                               'foursquare.tips': {'$exists': True, '$ne': None}})
    else:
        locations = bars.find({'nymag.review': {'$ne': None},
                               'foursquare.tips': {'$exists': True, '$ne': None}
                               }).limit(num_bars)
    ignorechars = '''!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'''
    stopwords = get_stopwords()
    texts = []
    bar_idx_map = {}
    idx_bar_map = {}
    save_directory = "assets/"
    print "Fetching texts from database and tokenizing"
    for idx, location in enumerate(locations):
        bar_name = location['nymag']['name']
        bar_idx_map[bar_name] = idx
        idx_bar_map[int(idx)] = bar_name
        text = create_string_from_database(location)
        tokens = tokenize_document(text, stopwords, ignorechars)
        texts.append(tokens)
    # Do some cleaning
    print "Cleaning texts"
    texts = remove_words_appearing_once(texts)
    # Create the counter
    word_counts = Counter()
    for text in texts:
        word_counts.update(text)
    # Create and save the dictionary
    print "Creating dictionary"
    dictionary = corpora.Dictionary(texts)
    dictionary.save(save_directory + 'keywords.dict')
    # Create and save the corpus
    print "Creating Corpus matrix"
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize(save_directory + 'corpus.mm', corpus)
    # Term Frequency, Inverse Document Frequency
    print "Applying TFIDF"
    tfidf = models.TfidfModel(corpus)
    tfidf.save(save_directory + "tfidf.model")
    # Map TFIDF on the corpus
    print "Mapping TFIDF on corpus"
    corpus_tfidf = tfidf[corpus]
    corpora.MmCorpus.serialize(save_directory + 'corpus_tfidf.mm', corpus_tfidf)
    # Create the LSI
    print "Creating LSI with %s topics" % lsi_num_topics
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=lsi_num_topics)
    lsi.save(save_directory + 'lsi.model')
    # Map LSI on the corpus
    corpus_lsi_tfidf = lsi[corpus_tfidf]
    corpora.MmCorpus.serialize(save_directory + 'corpus_lsi_tfidf.mm', corpus_lsi_tfidf)
    # Create the index
    #index = similarities.MatrixSimilarity(lsi[corpus_tfidf])
    index = similarities.MatrixSimilarity(corpus_lsi_tfidf)
    index.save(save_directory + 'lsi_tfidf.index')
    # Create the LDA (on the raw corpus); update_every=0 runs batch (non-online) training
    print "Creating LDA with %s topics" % lda_num_topics
    lda = LdaModel(corpus, num_topics=lda_num_topics, id2word=dictionary,
                   update_every=0, passes=30)
    #lda.show_topics(10, 20, formatted=False)
    lda.save(save_directory + 'lda.model')
    # Create the lda corpus
    corpus_lda = lda[corpus]
    corpora.MmCorpus.serialize(save_directory + 'corpus_lda.mm', corpus_lda)
    # Save some additional info
    with open(save_directory + 'bar_idx_map.json', 'wb') as fp:
        json.dump(bar_idx_map, fp)
    with open(save_directory + 'idx_bar_map.json', 'wb') as fp:
        json.dump(idx_bar_map, fp)
#......... (remainder of this code omitted) .........
Example 13: enumerate
# Required import: from gensim.models.ldamodel import LdaModel [as alias]
# Or: from gensim.models.ldamodel.LdaModel import save [as alias]
for text in newTexts:
    for token in text:
        newfrequency[token] += 1

# In[87]:
logger.info("Generating topics from LDA")
num_topics = 100
model = LdaModel(num_topics=num_topics, corpus=corpus, id2word=dictionary, iterations=1500)
#model = LdaMulticore(num_topics=100, workers=3, corpus=corpus, id2word=dictionary, iterations=3000)
#model = HdpModel(corpus=corpus, id2word=dictionary)

# In[94]:
model.save('cache/model.pkl')

# In[96]:
cursor = db.movies.find({}, {"movieId": 1})
movieId = []
for doc in cursor:
    movieId.append(doc['movieId'])

# In[97]:
movieDict = {}
for i, val in enumerate(movieId):
    movieDict[val] = newTexts[i]
Example 14: get_params
# Required import: from gensim.models.ldamodel import LdaModel [as alias]
# Or: from gensim.models.ldamodel.LdaModel import save [as alias]
def get_params(files):
    print "Converting data to features..."
    tweets = imap(lambda f: open(f).read(), files)
    features = [to_features(tweet) for tweet in tweets]
    # features = json.load(open("models/lda_features.json"))
    print "Converting features to bag of words..."
    dictionary = corpora.Dictionary(features)
    corpus = [dictionary.doc2bow(text) for text in features]
    # corpus = json.load(open("models/lda_corpus.json"))
    return corpus, features, dictionary

if __name__ == "__main__":
    print "Loading file names..."
    files = glob.glob("../tweets/*")
    corpus, features, dictionary = get_params(files)
    print "Creating LDA Model..."
    lda = LdaModel(corpus, id2word=dictionary, num_topics=30, iterations=1000, alpha='auto', chunksize=50)
    lda_topic_distribution = [l for l in lda[corpus]]
    print "Saving model..."
    lda.save("lda_model_unigrams.dat")
    print "Saving distribution..."
    f = open("lda_topic_distribution.json", 'w')
    json.dump(lda_topic_distribution, f)
    f.close()
Example 15: len
# Required import: from gensim.models.ldamodel import LdaModel [as alias]
# Or: from gensim.models.ldamodel.LdaModel import save [as alias]
if len(sys.argv) != 2:
    print 'Usage: {0} rcv1_data_dir'.format(sys.argv[0])
    raise SystemExit(1)
data_dir = sys.argv[1]
mapping_file = data_dir + '/token_id_idf'
dictionary_file = data_dir + '/id_token_df'
token_file = data_dir + '/tokens'
lda_file = data_dir + '/lda_model'
print 'creating dictionary...'
N = 23307  # supplied idfs from rcv1/lyrl2004 were based on 23307 training docs
create_dictionary_file(mapping_file, dictionary_file, N)
dictionary = Dictionary.load_from_text(dictionary_file)
print 'creating corpus...'
corpus = SimpleLowCorpus(token_file, dictionary)
print 'training model...'
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
lda = LdaModel(corpus, id2word=dictionary, num_topics=200)
print 'done!'
print '\n' * 3
print '======final topics======'
topics = lda.show_topics(topics=-1, topn=4)
for i, topic in enumerate(topics):
    print i, topic
print 'saving model...'
lda.save(lda_file)