

Python corpora.MmCorpus Class Code Examples

This article collects typical usage examples of the Python class gensim.corpora.MmCorpus. If you have been wondering what exactly the MmCorpus class does, how to use it, or what real-world MmCorpus code looks like, the curated class examples below should help.


The following presents 15 code examples of the MmCorpus class, drawn from open-source projects and ordered by popularity.
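
All of the examples revolve around two operations: MmCorpus.serialize(path, corpus) writes any iterable of bag-of-words documents to disk in Matrix Market format, and MmCorpus(path) streams it back. As a warm-up, here is a minimal self-contained sketch (the file path is arbitrary):

from gensim.corpora import MmCorpus

# A corpus is any iterable of documents; each document is a list of
# (token_id, count) pairs.
toy_corpus = [[(0, 1), (1, 2)], [(1, 1), (2, 3)]]

# Write the corpus to disk in Matrix Market format.
MmCorpus.serialize('/tmp/toy_corpus.mm', toy_corpus)

# Load it back; documents are streamed from disk, not held in memory.
mm = MmCorpus('/tmp/toy_corpus.mm')
print(mm)  # e.g. MmCorpus(2 documents, 3 features, 4 non-zero entries)
for doc in mm:
    print(doc)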

Example 1: test_apply

    def test_apply(self):

        transformed_vtcorp = self.transformer._apply(self.vtcorp)

        self.assertTrue(hasattr(transformed_vtcorp.corpus, 'dictionary'))

        transformed_names = self.loader.layout.required_text_corpus_names(self.transformation_label)
        text_data_name = os.path.join(self.data_root,
                                      self.loader.layout.corpus_dir,
                                      transformed_names[0])
        text_obj_name = os.path.join(self.data_root,
                                      self.loader.layout.corpus_dir,
                                      transformed_names[2])

        MmCorpus.serialize(text_data_name, transformed_vtcorp)
        transformed_vtcorp.save(text_obj_name)

        self.assertTrue(self.loader.has_text_corpora(self.transformation_label))

        self.temporary_files.extend([ os.path.join(self.data_root,
                                                   self.loader.layout.corpus_dir,
                                                   transformed_name)
                                      for transformed_name in transformed_names])

        transformed_vtcorp = TransformedCorpus.load(text_obj_name)

        self.assertIsInstance(transformed_vtcorp, TransformedCorpus)
        self.assertIsInstance(transformed_vtcorp.corpus, VTextCorpus)
        self.assertTrue(hasattr(transformed_vtcorp.corpus, 'dictionary'))

        print('Transformed corpus dictionary size: %i' % len(transformed_vtcorp.corpus.dictionary))
        self.assertEqual(self.k, len(transformed_vtcorp.obj.orig2transformed))
Author: hajicj, Project: safire, Lines: 32, Source: test_frequency_based_transform.py

Example 2: main

def main(argv=None):
    if argv is None:
        argv = sys.argv

    print('Creating simple wiki serialized corpus')
    # Download the raw file if we do not have it already
    if not os.path.isfile(WIKIFILE):
        # Get the file
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)
    i = 0
    article_dict = {}
    for text in wiki.get_texts(meta=True):
        url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
        article_dict[i] = (url_string.format(text[0]), text[1])
        i += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1,
                                    keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')
    # Now run LSI
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Author: fajifr, Project: recontent, Lines: 35, Source: make_simple_wiki_corpus.py
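
Example 2 only builds and saves the models; the query side is not shown in the excerpt. Sketching it under the assumption that the same module-level file-name constants are in scope (the query tokens here are invented), a similarity lookup would follow the standard gensim pattern:

from gensim import similarities
from gensim.corpora import Dictionary
from gensim.models import LsiModel, TfidfModel

dictionary = Dictionary.load_from_text(DICTFILE)
tfidf = TfidfModel.load(TDIFMODEL)
lsi = LsiModel.load(LSIMODEL)
index = similarities.MatrixSimilarity.load(SIMMATRIX)

# Fold a new query into the TF-IDF -> LSI space, then rank all documents.
query_bow = dictionary.doc2bow('solar system planets'.split())
query_lsi = lsi[tfidf[query_bow]]
best = sorted(enumerate(index[query_lsi]), key=lambda item: -item[1])[:10]
print(best)  # [(doc_id, cosine_similarity), ...]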

Example 3: extend_corpus

    def extend_corpus(self, corpus):
        """
        Add new documents in `corpus` to `self.corpus`. If serialization is used,
        then the entire corpus (`self.corpus`) is re-serialized and the new documents
        are added in the process. If serialization is not used, the corpus, as a list
        of documents, is simply extended.

        """
        if self.serialized:
            # Re-serialize the entire corpus while appending the new documents.
            if isinstance(corpus, MmCorpus):
                # Check that we are not attempting to overwrite the serialized corpus.
                assert self.corpus.input != corpus.input, \
                    'Input corpus cannot have the same file path as the model corpus (serialization_path).'
            corpus_chain = chain(self.corpus, corpus)  # A generator with the old and new documents.
            # Make a temporary copy of the file where the corpus is serialized.
            copyfile(self.serialization_path, self.serialization_path + '.tmp')
            self.corpus.input = self.serialization_path + '.tmp'  # Point the old corpus at this temporary file.
            # Re-serialize the old corpus, and extend it with the new corpus.
            MmCorpus.serialize(self.serialization_path, corpus_chain)
            self.corpus = MmCorpus(self.serialization_path)  # Store the new serialized corpus object in self.corpus.
            remove(self.serialization_path + '.tmp')  # Remove the temporary file again.
        else:
            # self.corpus and corpus are just lists, just extend the list.
            # First check that corpus is actually a list.
            assert isinstance(corpus, list), "If serialized == False, all input corpora must be lists."
            self.corpus.extend(corpus)
Author: abs51295, Project: gensim, Lines: 27, Source: atmodel.py
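
The trick in extend_corpus is worth isolating: an MmCorpus streams from its backing file, so to append documents you copy that file aside, point the old corpus object at the copy, and re-serialize old plus new into the original path. A minimal standalone sketch of the same pattern (file names are arbitrary):

from itertools import chain
from os import remove
from shutil import copyfile

from gensim.corpora import MmCorpus

path = '/tmp/model_corpus.mm'
MmCorpus.serialize(path, [[(0, 1)], [(1, 2)]])  # the existing corpus
corpus = MmCorpus(path)
new_docs = [[(0, 3), (2, 1)]]  # documents to append

# Copy the backing file so the old corpus can still stream from it while
# serialize() overwrites the original path with old + new documents.
copyfile(path, path + '.tmp')
corpus.input = path + '.tmp'
MmCorpus.serialize(path, chain(corpus, new_docs))
corpus = MmCorpus(path)  # now holds all three documents
remove(path + '.tmp')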

Example 4: _create_bow_representation

    def _create_bow_representation(self):
        """Create bag-of-words representation of collection, and save it
        in Matrix Market format to disk."""

        print('Create bag-of-words matrix representation.')
        self.bow_corpus = [self.dictionary.doc2bow(article)
                           for article in self.articles]
        MmCorpus.serialize(self.bowmm_filepath, self.bow_corpus)
Author: 2mh, Project: text-berg-plug-play, Lines: 8, Source: tbta.py

Example 5: createcorpus

def createcorpus(bg_corpus, output_dictionary, output_serialize):
    # Generate a training/background corpus from your own source of documents,
    # saving the dictionary and the corpus in Matrix Market format.
    print("Creating corpus and dictionary")
    background_corpus = TextCorpus(input=bg_corpus)
    background_corpus.dictionary.save(output_dictionary)
    MmCorpus.serialize(output_serialize, background_corpus)
    return background_corpus, background_corpus.dictionary
Author: AshwiniTokekar, Project: Multi-document-summarization-of-news-articles-using-deep-learning, Lines: 8, Source: semantic_similarity.py

Example 6: load_experts

def load_experts():
    """Load expert data and serialize it to file."""
    expert_corpus = ExpertCorpus()
    MmCorpus.serialize(corpus=expert_corpus, fname='expert_corpus_new_test.mm')

    # Save the expert-to-document map to pickle.
    pickle.dump(expert2doc, open('expert2doc_new_test.p', 'wb'))
Author: liamcreagh, Project: Anthus-News, Lines: 11, Source: tag_experts.py
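
ExpertCorpus here is project-specific, but the reason it can be handed straight to MmCorpus.serialize() is that serialize() accepts any object that iterates over bag-of-words documents. A hypothetical minimal corpus class of that shape, streaming documents instead of materializing them:

from gensim.corpora import Dictionary, MmCorpus

class StreamingCorpus:
    """Yield documents one at a time so the collection never sits in memory."""

    def __init__(self, token_lists, dictionary):
        self.token_lists = token_lists
        self.dictionary = dictionary

    def __iter__(self):
        for tokens in self.token_lists:
            yield self.dictionary.doc2bow(tokens)

docs = [['expert', 'news'], ['news', 'topic', 'model']]
dictionary = Dictionary(docs)
MmCorpus.serialize('/tmp/experts.mm', StreamingCorpus(docs, dictionary))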

Example 7: _create_tfidf_matrix

    def _create_tfidf_matrix(self):
        """Create TF-IDF matrix and save it in Matrix Market format
        to disk."""

        print('Create TF-IDF matrix of collection.')
        tfidf = TfidfModel(self.bow_corpus,
                           id2word=self.dictionary,
                           normalize=True)
        MmCorpus.serialize(self.tfidf_filepath,
                           tfidf[self.bow_corpus])
        print('Number of documents:', tfidf.num_docs)
Author: 2mh, Project: text-berg-plug-play, Lines: 11, Source: tbta.py

Example 8: init_empty_corpus

    def init_empty_corpus(self):
        """
        Initialize an empty corpus. If the corpora are to be treated as lists, simply
        initialize an empty list. If serialization is used, initialize an empty corpus
        of the class `gensim.corpora.MmCorpus`.

        """
        if self.serialized:
            # Initialize the corpus as a serialized empty list.
            # This corpus will be extended in self.update.
            MmCorpus.serialize(self.serialization_path, [])  # Serialize empty corpus.
            self.corpus = MmCorpus(self.serialization_path)  # Store serialized corpus object in self.corpus.
        else:
            # All input corpora are assumed to just be lists.
            self.corpus = []
Author: jMonteroMunoz, Project: gensim, Lines: 15, Source: atmodel.py

Example 9: pretrain

def pretrain():
    """Pre-train the text corpus and build the dictionary."""
    gutenberg_corpus = TextCorpus(text_corpus_file)
    gutenberg_corpus.dictionary.save(dict_file)
    gutenberg_corpus.dictionary.save_as_text(dic_txt_file)
    # serialize() writes the corpus to disk and returns None; load the
    # result back with MmCorpus(mm_corpus_file) if it is needed in memory.
    MmCorpus.serialize(mm_corpus_file, gutenberg_corpus)
Author: HaoyuHuang, Project: Spark-Gensim, Lines: 7, Source: lsamodel.py

Example 10: main

def main():
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # Read in the corpus from within the archive file
    fin = path.join(datadir, "reuters21578.tar.gz")
    rc = ReutersCorpus(fin)

    # filter out some of the more common words,
    # and some of the less-common ones as well
    rc.dictionary.filter_extremes(no_below=20, no_above=0.1)
    rc.dictionary.compactify()

    # Serialize the Reuters 21578 corpus
    fout = path.join(datadir, "reuters21578.mm")
    MmCorpus.serialize(fout, rc)

    # Save the dictionary to file as text
    fout = path.join(datadir, "reuters21578.dict.txt")
    rc.dictionary.save_as_text(fout)
Author: buruzaemon, Project: LDA-reuters, Lines: 19, Source: create_corpus.py

Example 11: get_topics_lda

def get_topics_lda(tokens, n_topics=10):
    """
    Topic modeling with LDA using the `gensim` package.
    LDA is a little better than LSA as it provides a reasonable mixture of topics (Wikipedia).
    `gensim` is a package for topic modeling only, so for a pure topic-modeling task
    it is a lighter option to install and run. It can also be run distributed and
    updated over an existing model.

    :param tokens: Preprocessed tokens for faster dictionary building
    :param n_topics: Number of topics to decompose data to
    :return: list() of topics
    """
    dict_file = 'resources/deals.dict'
    if not os.path.isfile(dict_file):
        print("Dictionary file does not exist. Creating one")
        dictionary = Dictionary(tokens)
        freq1 = [token_id for token_id, freq in dictionary.dfs.items() if freq == 1]
        dictionary.filter_tokens(freq1)  # drop tokens that occur in only one document
        dictionary.compactify()
        dictionary.save(dict_file)
    dictionary = Dictionary.load(dict_file)

    corpus_file = 'resources/deals.mm'
    if not os.path.isfile(corpus_file):
        print("Corpus file does not exist. Creating one")
        corpus = [dictionary.doc2bow(token) for token in tokens]
        MmCorpus.serialize(corpus_file, corpus)
    mm = MmCorpus(corpus_file)
    # tfidf = TfidfModel(mm)
    # corpus_tfidf = tfidf[mm]

    lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics, update_every=1,
                   chunksize=1000, passes=1)
    topics = []
    for i in range(n_topics):
        # print_topic() returns a string like "0.042*word + ...";
        # split it back into (word, score) pairs.
        words = lda.print_topic(i).split('+')
        topic = []
        for word in words:
            score, w = word.split('*')
            topic.append((w.strip(), score.strip()))
        topics.append(topic)
    return topics
Author: ypandit, Project: exercises, Lines: 43, Source: task2.py
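
The string parsing at the end of get_topics_lda is fragile, since the print_topic() output format has changed across gensim versions. In recent gensim releases (1.0 and later), show_topics(formatted=False) returns the word/probability pairs directly; note that older releases returned each pair in the opposite order, so treat this as a version-dependent sketch:

# Equivalent topic extraction without string parsing (gensim >= 1.0):
topics = [
    [(word, score) for word, score in word_scores]
    for _, word_scores in lda.show_topics(num_topics=n_topics, formatted=False)
]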

Example 12: main

def main():
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # load back the id->word mapping directly from file
    fin = path.join(datadir, "reuters21578.dict.txt")
    vocabulary = Dictionary.load_from_text(fin)

    # load the corpus
    fin = path.join(datadir, "reuters21578.mm")
    mm = MmCorpus(fin)

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=vocabulary, normalize=True)

    # save the TfidfModel instance to file
    fout = path.join(datadir, "reuters21578.tfidf.model")
    tfidf.save(fout)

    # save TF-IDF vectors in matrix market format
    fout = path.join(datadir, "reuters21578.tfidf.mm")
    MmCorpus.serialize(fout, tfidf[mm], progress_cnt=10000)
Author: buruzaemon, Project: LDA-reuters, Lines: 21, Source: create_tfidf.py

Example 13: main

def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute('SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    pickle.dump(docindex, open(outp + '_docindex.p', 'wb'))

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Author: dvictor, Project: lsi-document-similarity, Lines: 39, Source: make_wikicorpus.py

Example 14: main

def main(argv=None):
    if argv is None:
        argv = sys.argv
    print('Creating speech serialized corpus')
    # Create the speech corpus, it is inside the rawfile as a json format:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as f:
        speech_dict = json.load(f)
    with open(RAWIDS, 'r') as f:
        id_dict = json.load(f)
    # We also need to make sure that the article ids are saved in the correct
    # format so that the gensimple engine can understand it, like this:
    # "int": ["url", "title"],
    texts = []
    article_dict = {}
    counter = 0
    for key, value in speech_dict.items():
        texts.append([token for token in value['text']])
        article_dict[str(counter)] = [value['url'], id_dict[key]['title']]
        counter += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    corpus = [dictionary.doc2bow(text) for text in texts]
    MmCorpus.serialize(MMFILE, corpus)
    print('Speech serialized corpus created')
    # Now run LSI on TF-IDF
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Author: fajifr, Project: recontent, Lines: 39, Source: make_speech_corpus.py

Example 15: main

def main(args):

    logging.info('Initializing loaders with root %s, name %s' % (
        args.root, args.name))

    dloader = MultimodalDatasetLoader(args.root, args.name)

    icorp = dloader.load_image_corpus(args.img_label)

    transformer = NormalizationTransform()

    normalized_icorp = transformer._apply(icorp)

    corpus_names = dloader.layout.required_img_corpus_names(args.transformation_label)
    corpus_full_path = os.path.join(args.root, corpus_names[0])

    logging.info('Serializing to file %s' % corpus_full_path)

    MmCorpus.serialize(corpus_full_path, normalized_icorp)

    logging.info('Re-saving original corpus object with infix %s' % args.transformation_label)

    dloader.save_image_corpus(normalized_icorp.corpus, args.transformation_label)
Author: hajicj, Project: safire, Lines: 23, Source: normalize_img_dataset.py


Note: the gensim.corpora.MmCorpus class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from community-contributed open-source projects; copyright remains with the original authors, and any use or redistribution should follow the corresponding project's license. Do not reproduce without permission.