

Python WikiCorpus.save Method Code Examples

This article collects typical usage examples of the Python method gensim.corpora.WikiCorpus.save. If you are unsure what WikiCorpus.save does, how to call it, or what real code that uses it looks like, the hand-picked examples below should help. You can also browse further usage examples of the containing class, gensim.corpora.WikiCorpus.


Five code examples of the WikiCorpus.save method are shown below, sorted by popularity by default.
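Before diving into the examples, here is a minimal, hedged sketch of the call itself: WikiCorpus parses a Wikipedia XML dump, save (inherited from gensim's SaveLoad) pickles the corpus object, and load restores it later without re-parsing the dump. The file paths are placeholders, not taken from the examples below.

from gensim.corpora import WikiCorpus

# Placeholder paths -- substitute your own dump file and output location.
wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2')  # scans the dump and builds a dictionary
wiki.save('wiki_corpus.pkl.bz2')                           # pickle the corpus object, dictionary included

# Later sessions can restore the corpus object instead of re-parsing the dump:
wiki = WikiCorpus.load('wiki_corpus.pkl.bz2')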

Example 1: int

# Required import: from gensim.corpora import WikiCorpus [as alias]
# Alternatively: from gensim.corpora.WikiCorpus import save [as alias]
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program

    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True # start collecting document frequencies
        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2')
        wiki.save(outp + '_corpus.pkl.bz2')
        dictionary.allow_update = False
    else:
        wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        mywiki = myWikiCorpus(inp, lemmatize=lemmatize)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
        MmCorpus.serialize(outp + '_bowm.mm', mywiki, progress_cnt=10000) # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    del wiki
Developer: yinlosky, Project: mygensim, Lines: 32, Source: make_wikicorpus.py
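As a hedged follow-up to Example 1: the corpus object that wiki.save writes in the online branch can later be restored with the load classmethod that WikiCorpus inherits from gensim's SaveLoad, which avoids re-parsing the dump. The sketch assumes the same outp prefix used above.

# Restore the pickled corpus object written by wiki.save (same `outp` prefix as above);
# the restored instance carries its dictionary and can be iterated or re-serialized as before.
wiki = WikiCorpus.load(outp + '_corpus.pkl.bz2')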

Example 2: HashDictionary

# Required import: from gensim.corpora import WikiCorpus [as alias]
# Alternatively: from gensim.corpora.WikiCorpus import save [as alias]
    logger = logging.getLogger('gensim.scripts.read_stream_items')

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %r" % args.__dict__)

    if args.online:
        dictionary = HashDictionary(id_range=args.keep_words, debug=args.debug)
        dictionary.allow_update = True # start collecting document frequencies
        ## cannot use --max-articles or --expect-streamitems
        wiki = WikiCorpus(args.input, lemmatize=args.lemmatize, dictionary=dictionary)
        MmCorpus.serialize(args.output + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(args.output + '_wordids.txt.bz2')
        wiki.save(args.output + '_corpus.pkl.bz2')
        dictionary.allow_update = False

    else:  ## not online
        # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        wiki = WikiCorpus(
            args.input, lemmatize=args.lemmatize,
            max_articles=args.max_articles,
            expect_streamitems=args.expect_streamitems,
            file_name_pattern=args.file_name_pattern,
        )
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(args.output + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
        wiki.dictionary.save_as_text(args.output + '_wordids.txt.bz2')
Developer: diffeo, Project: gensim, Lines: 33, Source: make_wiki_from_stream_items.py
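Example 2 reads all of its settings from an args namespace that the snippet does not show. A minimal argparse sketch that would satisfy the attributes used above might look like the following; --max-articles and --expect-streamitems are named in the script's own comment, while the remaining flag names and the default dictionary size are assumptions.

import argparse

parser = argparse.ArgumentParser(description='Build a wiki corpus from stream items.')
parser.add_argument('input')                                   # path to the input dump / stream items
parser.add_argument('output')                                  # output filename prefix
parser.add_argument('--online', action='store_true')           # use HashDictionary and save the corpus object
parser.add_argument('--lemmatize', action='store_true')
parser.add_argument('--debug', action='store_true')
parser.add_argument('--keep-words', type=int, default=100000)  # becomes args.keep_words
parser.add_argument('--max-articles', type=int, default=None)
parser.add_argument('--expect-streamitems', type=int, default=None)
parser.add_argument('--file-name-pattern', default=None)
args = parser.parse_args()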

Example 3: int

# Required import: from gensim.corpora import WikiCorpus [as alias]
# Alternatively: from gensim.corpora.WikiCorpus import save [as alias]
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program

    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True # start collecting document frequencies
        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt')
        wiki.save(outp + '_corpus.pkl')
        dictionary.allow_update = False
    else:
        wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt')
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(outp + '_wordids.txt')
    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')
Developer: Anikacyp, Project: gensim, Lines: 33, Source: make_wikicorpus.py
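In gensim's stock make_wikicorpus.py, the reloaded mm corpus and dictionary then feed a TF-IDF transformation. A hedged sketch of that continuation is shown below; the '_tfidf.mm' and '.tfidf_model' suffixes follow that script's naming convention and are assumptions here.

from gensim.models import TfidfModel

# Build a TF-IDF model over the bag-of-words corpus and persist both the model
# and the transformed corpus for later topic modeling.
tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
tfidf.save(outp + '.tfidf_model')
MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)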

Example 4: int

# Required import: from gensim.corpora import WikiCorpus [as alias]
# Alternatively: from gensim.corpora.WikiCorpus import save [as alias]
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program

    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True # start collecting document frequencies
        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2', use_bzip2=True)
        wiki.save(outp + '_corpus.pkl.bz2', use_bzip2=True)
        dictionary.allow_update = False
    else:
        wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2', use_bzip2=True)
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2', use_bzip2=True)
    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')
Developer: luispedro, Project: gensim, Lines: 33, Source: make_wikicorpus.py
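A hedged note on Example 4: stock gensim's Dictionary.save_as_text, SaveLoad.save and Dictionary.load_from_text do not take a use_bzip2 argument; compression is implied by the .bz2 file extension alone, as Examples 1 and 3 illustrate. The keyword presumably exists only in this contributor's fork, so drop it when running the snippet against an upstream gensim install.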

Example 5: WikiCorpus

# Required import: from gensim.corpora import WikiCorpus [as alias]
# Alternatively: from gensim.corpora.WikiCorpus import save [as alias]
					util.download_file(wiki_url, f_corpus, progress=True)
				else:
					sys.exit()
			corpus = WikiCorpus(f_corpus)
#			corpus.save(f_bow)
	else: # models will be trained on your own corpus
		if os.path.exists(f_bow):
			corpus = TextCorpus.load(f_bow)
		else:
			corpus = TextCorpus(f_corpus)
#			corpus.save(f_bow)

	# filter dictionary
	corpus.dictionary.filter_extremes(no_below=0, no_above=1, keep_n=voc_size)
	corpus.dictionary.save(f_dict)
	corpus.save(f_bow)

	# tf-idf model
	if os.path.exists(f_tfidf):
		tfidf = TfidfModel.load(f_tfidf)
	else:
		tfidf = TfidfModel(corpus, id2word=corpus.dictionary)
		tfidf.save(f_tfidf)

	# TRAINING

	# lsa model
	if not os.path.exists(f_lsa):
		lsa = LsiModel(tfidf[corpus], id2word=corpus.dictionary, num_topics=lsa_dim)
		lsa.save(f_lsa)
Developer: jorispelemans, Project: scale, Lines: 32, Source: baglm_example.py
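As a hedged usage sketch following Example 5, the saved dictionary, TF-IDF model and LSA model can be combined to project a new document into the LSA topic space. The whitespace tokenisation below is a placeholder; real input should be preprocessed the same way as the training corpus.

from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LsiModel

dictionary = Dictionary.load(f_dict)       # saved above with corpus.dictionary.save(f_dict)
tfidf = TfidfModel.load(f_tfidf)
lsa = LsiModel.load(f_lsa)

tokens = "some new document text".lower().split()   # placeholder tokenisation
bow = dictionary.doc2bow(tokens)
topic_vector = lsa[tfidf[bow]]                       # list of (topic_id, weight) pairs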


Note: The gensim.corpora.WikiCorpus.save examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective developers; copyright remains with the original authors, and any distribution or use must follow each project's license. Do not reproduce without permission.