当前位置: 首页>>代码示例>>Python>>正文

Python ldamodel.LdaModel类代码示例

本文整理汇总了 Python 中 gensim.models.ldamodel.LdaModel 的典型用法代码示例。如果您正苦于以下问题:Python LdaModel 类的具体用法?Python LdaModel 怎么用?Python LdaModel 使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。


示例1: get_topics

def get_topics(candidate, day):
    # Fit a 3-topic LDA model on the tweets fetched for `candidate` on `day`
    # (a "%Y-%m-%d" string) and return the result of lda.print_topics(3).
    #
    # NOTE(review): this listing looks truncated by the page extraction: the
    # indentation jumps after `end_time`, the `aggregate([` call is never
    # closed, and a second `return` follows the first. Presumably a
    # try/except pair and the aggregation pipeline were lost -- confirm
    # against the upstream project before relying on this code.
    #
    # Convert the day to epoch milliseconds. NOTE(review): '%s' is not a
    # strftime directive documented by Python -- confirm it is portable to
    # the target platform.
    start_time = datetime.strptime(day, "%Y-%m-%d").date()
    start_time = int(start_time.strftime('%s'))*1000
    # 86399999 ms after the start, i.e. the last millisecond of the same day.
    # It is not used in the visible code (presumably by the lost pipeline).
    end_time = start_time + 86399999
        client = MongoClient()
        tweets = client.fletcher.tweets
        # NOTE(review): pipeline stages are missing here.
        tweets = tweets.aggregate([
        documents = []
        # Drop every character that is not an ASCII letter or a space.
        pattern = re.compile("[^a-zA-Z ]")
        for tweet in tweets:
            documents.append(pattern.sub('', tweet['text']))
        # Stop words: the candidate-specific list plus the shared `stopwords`.
        stoplist = set(candidate_stop_words[candidate_slugs[candidate]] + stopwords)
        texts = [[word for word in document.lower().split() if word not in stoplist]
                for document in documents]
        # Count token occurrences over all documents ...
        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1
        # ... and keep only tokens that occur more than once overall.
        texts = [[token for token in text if frequency[token] > 1]
                for text in texts]
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=3, update_every=1, chunksize=10000, passes=10)
        return lda.print_topics(3)
        # Unreachable as written; presumably the body of a lost `except:`.
        return None

示例2: main

def main():
    """Train one LDA model on the pooled yearly corpora and print its topics.

    Each yearly corpus is read from ``data/<collection>-<year>.dat`` with
    ``load_line_corpus``. The documents are kept per year in ``year2corpus``
    and also pooled into a single bag-of-words corpus on which the model is
    fitted.
    """
    collection_name = "nips"
    years = range(2010, 2015)  # 2010 .. 2014
    n_topics = 10
    corpus_paths = ["data/{}-{}.dat".format(collection_name, y)
                    for y in years]

    all_corpus = []
    year2corpus = {}
    for year, path in zip(years, corpus_paths):
        corpus = list(load_line_corpus(path))
        # Without this append the pooled corpus stays empty and the model
        # would be trained on nothing.
        all_corpus.append(corpus)
        year2corpus[year] = corpus

    # Flatten the list of yearly corpora into one list of documents.
    all_corpus = list(itertools.chain.from_iterable(all_corpus))

    dictionary = Dictionary(all_corpus)
    all_corpus = [dictionary.doc2bow(doc)
                  for doc in all_corpus]

    model = LdaModel(all_corpus, num_topics=n_topics,
                     eval_every=10, passes=100)
    print(model.show_topics())

示例3: LDA

class LDA(object):
    """Convenience wrapper pairing an ``LdaModel`` with its dictionary."""

    def __init__(self, model, vocab, corpus=None, topics=200, passes=1):
        """Remember file locations and training settings; load nothing yet."""
        self._passes = passes
        self._topics = topics
        self._corpus_file = corpus
        self._dict_file = vocab
        self._model_file = model

    def train(self):
        """Build the corpus from the corpus file and fit a fresh model on it."""
        self._corpus = SentenceDocCorpus(self._corpus_file)
        self._lda = LdaModel(
            self._corpus,
            num_topics=self._topics,
            id2word=self._corpus.dictionary,
            passes=self._passes,
        )
        self._dictionary = self._corpus.dictionary

    def load(self):
        """Restore the model and the dictionary from their files."""
        self._lda = LdaModel.load(self._model_file)
        self._dictionary = Dictionary.load(self._dict_file)

    def topics(self, words):
        """Return the model's output for the filtered bag-of-words of `words`."""
        filtered = common.filter(words)
        bow = self._dictionary.doc2bow(filtered)
        return self._lda[bow]

    def topic_vector(self, words):
        """Return the second element of every model output entry as an array.

        ``__getitem__`` is called explicitly because ``eps=0`` cannot be
        passed through the ``[]`` syntax.
        """
        filtered = common.filter(words)
        bow = self._dictionary.doc2bow(filtered)
        entries = self._lda.__getitem__(bow, eps=0)
        return np.array([weight for _, weight in entries])

示例4: malletmodel2ldamodel

def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
    """Convert :class:`~gensim.models.wrappers.ldamallet.LdaMallet` to :class:`~gensim.models.ldamodel.LdaModel`.

    This works by copying the training model weights (alpha, beta...) from a trained mallet model into the gensim model.

    Parameters
    ----------
    mallet_model : :class:`~gensim.models.wrappers.ldamallet.LdaMallet`
        Trained Mallet model
    gamma_threshold : float, optional
        To be used for inference in the new LdaModel.
    iterations : int, optional
        Number of iterations to be used for inference in the new LdaModel.

    Returns
    -------
    :class:`~gensim.models.ldamodel.LdaModel`
        Gensim native LDA.

    """
    model_gensim = LdaModel(
        id2word=mallet_model.id2word, num_topics=mallet_model.num_topics,
        alpha=mallet_model.alpha, iterations=iterations,
        # Forward the caller's threshold; otherwise the documented parameter
        # is silently ignored and the LdaModel default is used instead.
        gamma_threshold=gamma_threshold,
        dtype=numpy.float64  # don't lose precision when converting from MALLET
    )
    model_gensim.expElogbeta[:] = mallet_model.wordtopics
    return model_gensim

示例5: vwmodel2ldamodel

def vwmodel2ldamodel(vw_model, iterations=50):
    """Convert :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit` to
    :class:`~gensim.models.ldamodel.LdaModel`.

    This works by simply copying the training model weights (alpha, beta...) from a trained vwmodel
    into the gensim model.

    Parameters
    ----------
    vw_model : :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit`
        Trained Vowpal Wabbit model.
    iterations : int
        Number of iterations to be used for inference of the new :class:`~gensim.models.ldamodel.LdaModel`.

    Returns
    -------
    :class:`~gensim.models.ldamodel.LdaModel`
        Gensim native LDA.

    """
    model_gensim = LdaModel(
        num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize,
        passes=vw_model.passes, alpha=vw_model.alpha, eta=vw_model.eta, decay=vw_model.decay,
        offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold,
    )
    model_gensim.expElogbeta[:] = vw_model._get_topics()
    return model_gensim

示例6: run

	def run(self):
		"""Fit one LDA model per language and per requested number of topics.

		The languages and topic counts are the keys of
		``self.output()['langs']``; the dictionary and corpus of each language
		are loaded from the paths given by ``self.input()``.
		"""
		# Any unrecognised cleaning level falls back to 'stopwords'.
		if self.clean_level in ('raw', 'clean', 'stopwords'):
			kind = self.clean_level
		else:
			kind = 'stopwords'

		for idioma in self.output()['langs']:
			dicc_path = self.input()['dict']['langs'][idioma].path
			corp_path = self.input()['corp']['langs'][idioma].path
			print('==============================')
			print('Corriendo LDA de %s con nivel de limpieza %s' % (idioma, kind))
			print('==============================')

			# Load dictionary and corpus
			dicc = corpora.Dictionary.load(dicc_path)
			corpus = corpora.MmCorpus(corp_path)

			# Run the language's LDA for every number of topics
			for n_topics in self.output()['langs'][idioma]:
				print('Número de tópicos: ' + str(n_topics))
				if self.by_chunks:
					lda = LdaModel(corpus, id2word=dicc, num_topics=n_topics, update_every=self.update_e, chunksize=self.chunk_size, passes=self.n_passes)
				else:
					lda = LdaModel(corpus, id2word=dicc, num_topics=n_topics, passes=1)
				# NOTE(review): the visible listing ends here, so the trained
				# model is never stored -- presumably a save step followed.

示例7: train_model

def train_model(texts, **kwargs):
  """Build (or unpickle) a corpus, optionally apply tf-idf, and fit an LDA model.

  Keyword options (with defaults): filter_stopwords=True, normalizer='porter',
  tfidf=True, num_topics=20, min_freq=2, use_pickle=True, update_pickle=True,
  report=True, distributed=False.

  Returns the tuple ``(lda, corpus, id2word)``.
  """

  # parse args
  filter_stopwords = kwargs.get('filter_stopwords', True)
  normalizer = kwargs.get('normalizer', 'porter')
  tfidf = kwargs.get('tfidf', True)
  num_topics = kwargs.get('num_topics', 20)
  min_freq = kwargs.get('min_freq', 2)
  use_pickle = kwargs.get('use_pickle', True)
  update_pickle = kwargs.get('update_pickle', True)
  report = kwargs.get('report', True)
  distributed = kwargs.get('distributed', False)

  # build corpus or read it in from pickle
  if use_pickle:
    print("INFO: loading pickled corpus and word hash")
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    with open("pickles/corpus.p", "rb") as corpus_file:
      corpus = pickle.load(corpus_file)
    with open("pickles/id2word.p", "rb") as id2word_file:
      id2word = pickle.load(id2word_file)
  else:
    print("INFO: processing text and building corpus...")
    corpus, id2word = process_texts(
      texts = texts,
      filter_stopwords = filter_stopwords,
      normalizer = normalizer,
      min_freq = min_freq
    )

    if update_pickle:
      # pickle files
      print("INFO: updating pickled coprus and word hash")
      with open("pickles/corpus.p", "wb") as corpus_file:
        pickle.dump(corpus, corpus_file)
      with open("pickles/id2word.p", "wb") as id2word_file:
        pickle.dump(id2word, id2word_file)

  # optional tfidf transformation
  if tfidf:
    print("INFO: applying tfidf transformation...")
    # Separate name so the boolean option is not shadowed by the model.
    tfidf_model = TfidfModel(corpus)
    corpus = tfidf_model[corpus]

  # fit model
  print("INFO: fitting model...")
  lda = LdaModel(
    corpus = corpus,
    id2word = id2word,
    num_topics = num_topics,
    distributed = distributed
  )

  # report
  if report:
    perplexity = lda.bound(corpus)
    print("RESULTS:")
    print("\nperplexity:  %s \n" % (perplexity,))
    topics = lda.show_topics(num_topics)
    for i, t in enumerate(topics):
      print("topic %d:" % i)
      print(t)

  return lda, corpus, id2word

示例8: plottopicpop

def plottopicpop():
    """Plot, per year 2006-2015, how often three words appear among topic terms.

    For every year the articles are normalised, stripped of stop words and
    split into consecutive chunks of 100 documents. One LDA model is trained
    per chunk (to limit memory use), and the top 15 terms of each of its
    topics are scanned for the three tracked words.
    """
    internet = [0] * 10
    developing = [0] * 10
    habr = [0] * 10
    for n, year in enumerate(range(2006, 2016)):
        articles, numberofarticles = getarticlesbyyear(year)
        print("Got articles for", str(year))
        # Normalise texts in place
        for i, article in enumerate(articles):
            article = replacesymbols(article)
            articles[i] = normalaisestr(article.lower())
        # Remove unnecessary words
        texts = [[word for word in article if word not in stoplist]
                 for article in articles]
        print('Deleted stopwords')
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        print('Starting training')
        # Memory-friendly mode: train on full chunks of 100 documents only
        # (a trailing partial chunk is skipped, as in the original loop).
        for chunk in range(numberofarticles // 100):
            begin = 100 * chunk
            end = begin + 100
            lda = LdaModel(corpus[begin:end], id2word=dictionary, num_topics=end - begin)

            for j in range(lda.num_topics):
                # get_topic_terms yields (word_id, probability) pairs; look at
                # every returned term, not only at the first pair.
                for word_id, _ in lda.get_topic_terms(j, 15):
                    top = dictionary.get(word_id)
                    if top == "интернет":
                        internet[n] += 1
                    elif top == "разработка":
                        developing[n] += 1
                    elif top == "хабра":
                        habr[n] += 1
            del lda

        print(internet,'\n', developing, '\n', habr)

    plt.title('Population of 3 topics.')
    plt.xlabel('Year 2006 - 2015')
    plt.ylabel('Number of articles')
    plt.plot(internet, label="Интернет")
    plt.plot(developing, label="Разработка")
    plt.plot(habr, label="Хабра")
    # NOTE(review): no legend/show/save call is visible; the listing may be
    # truncated here.

示例9: getLdaModel

def getLdaModel(bow_corpus, dictionary, useSavedTill):
    """Return an LDA model, loading it from disk or training a new one.

    When ``useSavedTill`` is at least ``USESAVED.lda_model`` the model stored
    at ``file_lda_model`` is loaded. Otherwise a model is trained on
    ``bow_corpus`` with ``dictionary`` as its id-to-word mapping, written to
    ``file_lda_model`` and returned.
    """
    if useSavedTill >= USESAVED.lda_model:
        common_logger.info("loading LDA model from file")
        return LdaModel.load(file_lda_model)

    common_logger.info("Training LDA model")
    # Heuristic: the topic count grows with the logarithm of the corpus size.
    num_topics = int(math.log(len(bow_corpus)) + 1)
    lda_model = LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary, passes=numPasses)
    common_logger.info("Saving LDA model")
    # Persist to the same path the load branch reads from.
    lda_model.save(file_lda_model)
    common_logger.info("Done creating LDA model")
    return lda_model

示例10: fetch_model

	def fetch_model(dictionary):
		"""Return the LDA model stored at 'Topic/lda.tm', building it if absent.

		When loading raises IOError, a new 50-topic model is trained with
		`dictionary` as its id-to-word mapping.

		NOTE(review): `corpus` is not a parameter; it must come from an
		enclosing scope that is not visible here. The freshly built model is
		not saved in the visible code.
		"""
		print("Fetching LDA Model... ")
		try:
			lda = LdaModel.load('Topic/lda.tm')
		except IOError:
			print("Model not found, building LDA...")
			lda = LdaModel(corpus,num_topics=50,id2word=dictionary,update_every=1,chunksize=1000,passes=50)
			print("LDA Built!")
		else:
			print("LDA Model loaded!")
		return lda

示例11: train

 def train(self):
     """Build the corpus from the corpus file and fit a fresh model on it."""
     self._corpus = SentenceDocCorpus(self._corpus_file)
     self._lda = LdaModel(
         self._corpus,
         num_topics=self._topics,
         id2word=self._corpus.dictionary,
         passes=self._passes,
     )
     self._dictionary = self._corpus.dictionary

示例12: make_clouds

def make_clouds(files, n_words=20):
    """Write one word-cloud PNG per topic of the model at ``files.model``.

    Images go to ``../browser/clouds/<model base name>/<topic index>.png``.
    Words are ranked per topic by the square root of the product of the two
    normalisations of ``expElogbeta`` (along axis 0 and along axis 1), and
    the top ``n_words`` are drawn.

    files : object with ``model`` and ``replacements`` path attributes
    n_words : int, number of words per cloud
    """
    # set locations
    base_model_name = os.path.splitext(os.path.basename(files.model))[0]
    output_d = '../browser/clouds/' + base_model_name + '/'
    if not os.path.exists(output_d):
        os.makedirs(output_d)
    # create wordcloud generator
    wc = WordCloud(width=1000, height=500, background_color='white')

    print('Loading model')
    model = LdaModel.load(files.model)
    beta = model.expElogbeta

    print('Normalizing by topics, and by words')
    pTW = normalize(beta, axis=0)
    pWT = normalize(beta, axis=1)

    # load bug<->id map, then invert to id<-> bug
    with open(files.replacements) as replacements_file:
        bug_to_id = json.load(replacements_file)
    id_to_bug = {v: k for k, v in bug_to_id.items() if "." not in k}

    for i in range(len(beta)):
        # compute RAR
        t_rar = np.sqrt(pTW[i] * pWT[i])
        # indices of the n_words largest scores, largest first
        top_word_ids = t_rar.argsort()[:-1 - n_words:-1]
        top_words = [model.id2word.id2token[wordid] for wordid in top_word_ids]
        top_words = [id_to_bug.get(word, word) for word in top_words]
        wc.fit_words(zip(top_words, t_rar[top_word_ids]))
        wc.to_file(output_d + str(i) + '.png')

示例13: __init__

    def __init__(self, fnames, model=None, corpus=None, dictionary=None):
        """Load the reviews, the topic model and the per-review topic output.

        `fnames` is an array of files for [lda_model, distribution]; only
        ``fnames[0]`` is read here, and only when `model` is not supplied.
        `corpus` and `dictionary` are taken from the arguments when `corpus`
        is given; otherwise both are loaded from the files under
        ``data/models``.
        """
        with open('data/electronics_topics_in.txt') as reviews_file:
            self.reviews = reviews_file.readlines()

        print("Loding topic model...")
        if model is not None:
            print("Using argument model")
            self.lda = model
        else:
            self.lda = LdaModel.load(fnames[0])

        if corpus is not None:
            print("Using argument corpus and dictionary")
            self.corpus = corpus
            self.dictionary = dictionary
        else:
            print("Loading corpus and dictionary from file")
            self.corpus = load("data/models/electronics_tfidf_corpus.pkl")
            self.dictionary = load("data/models/electronics_dict.pkl")

        print("Loading review-topic distribution...")
        # For every review, order the model's entries by their second
        # element, largest first.
        self.review_dist = [
            sorted(dist, key=lambda arr: arr[1], reverse=True)
            for dist in self.lda[self.corpus]
        ]

        print("processing topics")
        # Remove the "<number>*" prefixes, then replace every whitespace
        # character followed by '+' with a comma.
        stripped = [re.sub(r"(\d*\.\d*\*)", "", t) for t in self.lda.show_topics(-1)]
        self.topics = [re.sub(r"\s\+", ",", ts) for ts in stripped]

示例14: __init__

    def __init__(self, topics = 10, 
                 worker = 3, 
                 pretrained_model = None, 
                 dictionary = None):
        """Set up the wrapper, optionally restoring a saved model.

        Arguments:
            topics -- number of topics
            worker -- parallelism setting, usually the number of cores minus one
            pretrained_model -- a previously trained model; online updating is
                supported, so the model from the last training run can be
                loaded
            dictionary -- the id-mapping dictionary that accompanies the
                model (words are converted to ids for training)

        The model and dictionary are loaded only when both `pretrained_model`
        and `dictionary` are given.

        Example:
            >>> lda = LDA(topics = 20, worker = 2,
            ...           pretrained_model = model_file,
            ...           dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """
        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        # `dictionary` is the parameter; the original tested an undefined
        # name (`common_dictionary`), which raised NameError whenever a
        # pretrained model was supplied.
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)

示例15: train_lda

	def train_lda (self, corpus, dictionary):
		"""
			PRIVATE: train_lda
			given a corpus and a dictionary, this fits parameters for self.lda_model,
			then fills self.lda_model_topics with the result of
			self.find_per_topic_word_distributions()
		"""
		self.lda_model = LdaModel(corpus, id2word=dictionary, num_topics=self.num_topics_lda)
		self.lda_model_topics = self.find_per_topic_word_distributions ()
