This article collects typical usage examples of the Python method gensim.corpora.WikiCorpus.save. If you are wondering what WikiCorpus.save does, how to call it, or want to see it used in real code, the hand-picked examples below should help. You can also read more about its containing class, gensim.corpora.WikiCorpus.
The following shows 5 code examples of WikiCorpus.save, sorted by popularity by default. You can vote for the examples you like or find useful; your feedback helps the system recommend better Python code examples.
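Before the collected examples, here is a minimal sketch of the method itself. WikiCorpus inherits save/load from gensim's SaveLoad, so save pickles the corpus object (dictionary included) and WikiCorpus.load restores it without rebuilding the dictionary from the dump. The dump path and output file name below are placeholders, not taken from any of the examples.

from gensim.corpora import WikiCorpus

# placeholder paths -- substitute a real Wikipedia dump and output name
dump_path = 'enwiki-latest-pages-articles.xml.bz2'

wiki = WikiCorpus(dump_path)    # parses the dump and builds the dictionary (slow for a full dump)
wiki.save('wiki_corpus.pkl')    # pickle the corpus object, dictionary included

# later: restore the corpus object without rebuilding the dictionary
wiki = WikiCorpus.load('wiki_corpus.pkl')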
Example 1: int
# Module to import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import save [as alias]
import sys
from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus

# inp, outp, program and DEFAULT_DICT_SIZE are set earlier in the script
# (command-line arguments of a make_wikicorpus-style driver)
if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE
online = 'online' in program
lemmatize = 'lemma' in program
debug = 'nodebug' not in program

if online:
    dictionary = HashDictionary(id_range=keep_words, debug=debug)
    dictionary.allow_update = True  # start collecting document frequencies
    wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
    # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
    dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    dictionary.save_as_text(outp + '_wordids.txt.bz2')
    wiki.save(outp + '_corpus.pkl.bz2')
    dictionary.allow_update = False
else:
    # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
    wiki = WikiCorpus(inp, lemmatize=lemmatize)
    # myWikiCorpus is a project-specific WikiCorpus subclass defined elsewhere in this example's source
    mywiki = myWikiCorpus(inp, lemmatize=lemmatize)
    # only keep the most frequent words (out of total ~8.2m unique tokens)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
    MmCorpus.serialize(outp + '_bowm.mm', mywiki, progress_cnt=10000)  # another ~9h
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    del wiki
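For orientation, the files written by the online branch above pair with gensim's standard load calls. A minimal sketch, assuming the same outp prefix (the value used here is a placeholder):

from gensim.corpora import MmCorpus, WikiCorpus

outp = 'wiki_en'  # placeholder: reuse the output prefix from the run above

# restore the corpus object pickled by wiki.save(); its .dictionary attribute
# is the filtered HashDictionary from the run above
wiki = WikiCorpus.load(outp + '_corpus.pkl.bz2')
dictionary = wiki.dictionary

# the serialized bag-of-words vectors stream back independently of the pickle
mm = MmCorpus(outp + '_bow.mm')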
Example 2: HashDictionary
# Module to import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import save [as alias]
import logging
from gensim.corpora import HashDictionary, MmCorpus, WikiCorpus

# `args` comes from an argument parser defined earlier in the script;
# DEFAULT_DICT_SIZE is a module-level constant
logger = logging.getLogger('gensim.scripts.read_stream_items')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %r" % args.__dict__)

if args.online:
    dictionary = HashDictionary(id_range=args.keep_words, debug=args.debug)
    dictionary.allow_update = True  # start collecting document frequencies
    ## cannot use --max-articles or --expect-streamitems here
    wiki = WikiCorpus(args.input, lemmatize=args.lemmatize, dictionary=dictionary)
    # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
    MmCorpus.serialize(args.output + '_bow.mm', wiki, progress_cnt=10000)
    # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
    dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    dictionary.save_as_text(args.output + '_wordids.txt.bz2')
    wiki.save(args.output + '_corpus.pkl.bz2')
    dictionary.allow_update = False
else:  ## not online
    # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
    # max_articles, expect_streamitems and file_name_pattern are extensions of
    # this project's customized WikiCorpus, not arguments of stock gensim
    wiki = WikiCorpus(
        args.input, lemmatize=args.lemmatize,
        max_articles=args.max_articles,
        expect_streamitems=args.expect_streamitems,
        file_name_pattern=args.file_name_pattern,
    )
    # only keep the most frequent words (out of total ~8.2m unique tokens)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(args.output + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
    wiki.dictionary.save_as_text(args.output + '_wordids.txt.bz2')
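The args object used throughout this example comes from an argument parser that the excerpt does not show. The following is a hypothetical sketch of a parser that would supply the attributes referenced above; the flag names and defaults are guesses, not part of the original script.

import argparse

# hypothetical parser -- only the attribute names are known from the excerpt
parser = argparse.ArgumentParser(description='build a gensim corpus from a wiki dump')
parser.add_argument('input')
parser.add_argument('output')
parser.add_argument('--online', action='store_true')
parser.add_argument('--lemmatize', action='store_true')
parser.add_argument('--debug', action='store_true')
parser.add_argument('--keep-words', type=int, default=100000)
parser.add_argument('--max-articles', type=int, default=None)
parser.add_argument('--expect-streamitems', type=int, default=None)
parser.add_argument('--file-name-pattern', default=None)
args = parser.parse_args()  # dashes become underscores: args.keep_words, args.max_articles, ...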
Example 3: int
# Module to import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import save [as alias]
import sys
from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus

# inp, outp, program and DEFAULT_DICT_SIZE are set earlier in the script
if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE
online = 'online' in program
lemmatize = 'lemma' in program
debug = 'nodebug' not in program

if online:
    dictionary = HashDictionary(id_range=keep_words, debug=debug)
    dictionary.allow_update = True  # start collecting document frequencies
    wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
    # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
    dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    dictionary.save_as_text(outp + '_wordids.txt')
    wiki.save(outp + '_corpus.pkl')
    dictionary.allow_update = False
else:
    # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
    wiki = WikiCorpus(inp, lemmatize=lemmatize)
    # only keep the most frequent words (out of total ~8.2m unique tokens)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
    wiki.dictionary.save_as_text(outp + '_wordids.txt')
    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt')
    del wiki

# initialize corpus reader and word->id mapping
mm = MmCorpus(outp + '_bow.mm')
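The reloaded mm and dictionary are typically the input to a weighting model next. As a hedged continuation of this excerpt (the file names below are placeholders, not from the original script), a TF-IDF model could be built and persisted like this:

from gensim.corpora import MmCorpus
from gensim.models import TfidfModel

# build TF-IDF weights from the bag-of-words corpus and the reloaded dictionary
tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
tfidf.save(outp + '_tfidf.model')

# optionally stream the TF-IDF-weighted corpus back to disk
MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)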
Example 4: int
# Module to import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import save [as alias]
import sys
from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus

# inp, outp, program and DEFAULT_DICT_SIZE are set earlier in the script
if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE
online = 'online' in program
lemmatize = 'lemma' in program
debug = 'nodebug' not in program

if online:
    dictionary = HashDictionary(id_range=keep_words, debug=debug)
    dictionary.allow_update = True  # start collecting document frequencies
    wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
    # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
    dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    # the .bz2 extension is enough to get compressed output; gensim's file
    # helpers pick the codec from the extension, there is no use_bzip2 flag
    dictionary.save_as_text(outp + '_wordids.txt.bz2')
    wiki.save(outp + '_corpus.pkl.bz2')
    dictionary.allow_update = False
else:
    # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
    wiki = WikiCorpus(inp, lemmatize=lemmatize)
    # only keep the most frequent words (out of total ~8.2m unique tokens)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    del wiki

# initialize corpus reader and word->id mapping
mm = MmCorpus(outp + '_bow.mm')
Example 5: WikiCorpus
# Module to import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import save [as alias]
# This excerpt also uses os, sys, TextCorpus, TfidfModel, LsiModel and a
# project-specific util.download_file helper; it starts inside a larger
# conditional (not shown) that decides whether the Wikipedia dump is used
# as the training corpus, with wiki_url, f_corpus, f_bow, f_dict, f_tfidf,
# f_lsa, voc_size and lsa_dim defined earlier in the script.
        util.download_file(wiki_url, f_corpus, progress=True)
    else:
        sys.exit()
    corpus = WikiCorpus(f_corpus)
    # corpus.save(f_bow)
else:  # models will be trained on your own corpus
    if os.path.exists(f_bow):
        corpus = TextCorpus.load(f_bow)
    else:
        corpus = TextCorpus(f_corpus)
        # corpus.save(f_bow)

# filter dictionary
corpus.dictionary.filter_extremes(no_below=0, no_above=1, keep_n=voc_size)
corpus.dictionary.save(f_dict)
corpus.save(f_bow)

# tf-idf model
if os.path.exists(f_tfidf):
    tfidf = TfidfModel.load(f_tfidf)
else:
    tfidf = TfidfModel(corpus, id2word=corpus.dictionary)
    tfidf.save(f_tfidf)

# TRAINING
# lsa model
if not os.path.exists(f_lsa):
    lsa = LsiModel(tfidf[corpus], id2word=corpus.dictionary, num_topics=lsa_dim)
    lsa.save(f_lsa)
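Once the dictionary, TF-IDF and LSA models from this example are on disk, they can be reloaded to project new text into the LSA space. A minimal sketch reusing the f_dict, f_tfidf and f_lsa paths from above (the query string is made up):

from gensim.corpora import Dictionary
from gensim.models import LsiModel, TfidfModel

dictionary = Dictionary.load(f_dict)   # saved above with corpus.dictionary.save(f_dict)
tfidf = TfidfModel.load(f_tfidf)
lsa = LsiModel.load(f_lsa)

# project a new document into the LSA topic space
bow = dictionary.doc2bow("machine learning on wikipedia text".lower().split())
topic_vector = lsa[tfidf[bow]]
print(topic_vector)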