This article collects typical usage examples of the Python method gensim.corpora.dictionary.Dictionary.load_from_text. If you have been wondering what Dictionary.load_from_text does and how to use it, the hand-picked code examples below may help. You can also explore other usages of the containing class, gensim.corpora.dictionary.Dictionary.
The following shows 7 code examples of Dictionary.load_from_text, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
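Before the examples, a minimal sketch of the basic call may be useful; the file name my_dict.txt is a hypothetical placeholder for a file previously written with Dictionary.save_as_text:

from gensim.corpora.dictionary import Dictionary

# Load an id->token mapping saved earlier with save_as_text()
# ('my_dict.txt' is a placeholder path, not taken from the examples below).
id2word = Dictionary.load_from_text('my_dict.txt')
print(len(id2word), 'tokens loaded')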
Example 1: user_lda
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load_from_text [as alias]
def user_lda(lda, dictionary_path, text_yielder):
    # Score each user's text against a trained LDA model.
    id2word = Dictionary.load_from_text(dictionary_path)
    ret = {}
    for user, text in text_yielder():
        bow = id2word.doc2bow(UserCorpus.text2tokens(text))
        ret[user] = lda[bow]  # topic distribution for this user's text
    return ret
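A hedged usage sketch; lda_model, the dictionary path, and yield_user_texts are hypothetical stand-ins for objects the surrounding project would supply:

topic_by_user = user_lda(lda_model, 'wordids.txt', yield_user_texts)
for user, topics in topic_by_user.items():
    print(user, topics)  # e.g. [(3, 0.62), (7, 0.21), ...]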
Example 2: SNAP_id2word
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load_from_text [as alias]
def SNAP_id2word(self):
    # Resolve the dictionary file relative to this module's location.
    path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'snap_data',
        'gensim_snap_dict.txt'
    )
    return Dictionary.load_from_text(path)
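The file loaded here is in the plain-text format produced by Dictionary.save_as_text; a minimal round-trip sketch under that assumption (paths hypothetical):

from gensim.corpora.dictionary import Dictionary

docs = [['human', 'computer', 'interface'], ['graph', 'trees']]
d = Dictionary(docs)
d.save_as_text('gensim_snap_dict.txt')  # tab-separated id, token, document frequency
restored = Dictionary.load_from_text('gensim_snap_dict.txt')
assert restored.token2id == d.token2id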
Example 3: len
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load_from_text [as alias]
import pickle
import sys

fname = sys.argv[1]
prefix = fname.split('/')[0]
if len(sys.argv) > 2 and sys.argv[2][0:2] != '--':
    suffix = sys.argv[2]
lemmatizer, filter_words = parse_args(sys.argv)
if lemmatizer is None:
    LEMMATIZE = False
    suffix = '_tokenized_tfidf'
else:
    suffix = '_lemmatized_tfidf'
lda = None
with open(prefix + suffix + '.ldamodel', 'rb') as f:
    lda = pickle.load(f)
id2token = Dictionary.load_from_text(prefix + suffix + '_wordids.txt')
if DEBUG:
    print("prefix:", prefix)
    print("suffix:", suffix)
    print("using dict:", prefix + suffix + '_wordids.txt')
    print(id2token)
docs = []
with open(fname) as f:
    print("splitting %s" % fname)
    tmp = []
    for line in f:  # buffer lines into the docs list, one entry per '@'-headed document
        if line[0] == '@':
            docs.append(tmp)
            tmp = [line]
        else:
            tmp.append(line)
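The snippet is cut off inside the loop; a hedged sketch of the natural continuation, assuming a crude whitespace tokenizer and that each buffered document is scored against the loaded model:

docs.append(tmp)  # flush the final buffered document (continuation sketch)
for doc in docs[1:]:  # skip the empty list buffered before the first '@'
    tokens = [w for line in doc for w in line.split()]  # naive tokenizer (assumption)
    bow = id2token.doc2bow(tokens)
    print(lda[bow])  # topic distribution for this document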
Example 4: len
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load_from_text [as alias]
input, output = sys.argv[1:3]
if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE
# build dictionary. only keep 100k most frequent words (out of total ~8.2m unique tokens)
# takes about 9h on a macbook pro, for 3.5m articles (june 2011 wiki dump)
wiki = WikiCorpus(input, keep_words=keep_words)
# save dictionary and bag-of-words (term-document frequency matrix)
# another ~9h
wiki.dictionary.save_as_text(output + "_wordids.txt")
MmCorpus.serialize(output + "_bow.mm", wiki, progress_cnt=10000)
del wiki
# initialize corpus reader and word->id mapping
id2token = Dictionary.load_from_text(output + "_wordids.txt")
mm = MmCorpus(output + "_bow.mm")
# build the tfidf model (~30min)
from gensim.models import TfidfModel
tfidf = TfidfModel(mm, id2word=id2token, normalize=True)
# save tfidf vectors in matrix market format
# ~2h; result file is 15GB! bzip2'ed down to 4.5GB
MmCorpus.serialize(output + "_tfidf.mm", tfidf[mm], progress_cnt=10000)
logger.info("finished running %s" % program)
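Once the three artifacts are on disk, a later session can reload them without recomputation; a minimal sketch reusing the same output prefix:

from gensim.corpora import Dictionary, MmCorpus

id2token = Dictionary.load_from_text(output + '_wordids.txt')
tfidf_corpus = MmCorpus(output + '_tfidf.mm')  # streamed from disk document by document
print(tfidf_corpus)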
Example 5: len
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load_from_text [as alias]
from news.document import *

if len(sys.argv) != 2:
    print('Usage: {0} rcv1_data_dir'.format(sys.argv[0]))
    raise SystemExit(1)
data_dir = sys.argv[1]
mapping_file = data_dir + '/token_id_idf'
dictionary_file = data_dir + '/id_token_df'
token_file = data_dir + '/tokens'
lda_file = data_dir + '/lda_model'
print('creating dictionary...')
N = 23307  # supplied idfs from rcv1/lyrl2004 were based on 23307 training docs
create_dictionary_file(mapping_file, dictionary_file, N)
dictionary = Dictionary.load_from_text(dictionary_file)
print('creating corpus...')
corpus = SimpleLowCorpus(token_file, dictionary)
print('training model...')
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
lda = LdaModel(corpus, id2word=dictionary, num_topics=200)
print('done!')
print('\n' * 3)
print('======final topics======')
topics = lda.show_topics(num_topics=-1, num_words=4)  # all topics, top 4 words each
for i, topic in enumerate(topics):
    print(i, topic)
print('saving model...')
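The snippet ends right before the save itself; presumably it continues with gensim's native persistence, along these lines:

lda.save(lda_file)  # persist with gensim's own serializer
# later sessions: lda = LdaModel.load(lda_file)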
Example 6: lemmatize
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load_from_text [as alias]
else:
    LEMMATIZE = False
if not LEMMATIZE:
    ONLY_NOUN_VERBS = False
    ONLY_NOUNS = False
if LEMMATIZE:
    print("we will lemmatize ('you were'->'be/VB')")
    mname = prefix + '_lemmatized_tfidf'
else:
    print("you don't have pattern: we will tokenize ('you were'->'you','were')")
    mname = prefix + '_tokenized_tfidf'
try:
    # Fast path: reuse the serialized dictionary and bag-of-words corpus.
    id2token = Dictionary.load_from_text(mname + '_wordids.txt')
    mm = MmCorpus(mname + '_bow.mm')
    print(">>> Loaded corpus from serialized files")
except Exception:
    # Slow path: extract the articles and serialize everything for next time.
    print(">>> Extracting articles...")
    corpus = CDS_Corpus(FOLDER)
    corpus.dictionary.save_as_text(mname + '_wordids.txt')
    print(">>> Saved dictionary as " + mname + "_wordids.txt")
    MmCorpus.serialize(mname + '_bow.mm', corpus, progress_cnt=1000)
    print(">>> Saved MM corpus as " + mname + "_bow.mm")
    id2token = Dictionary.load_from_text(mname + '_wordids.txt')
    mm = MmCorpus(mname + '_bow.mm')
    del corpus
print(">>> Using TF-IDF")
tfidf = models.TfidfModel(mm, id2word=id2token, normalize=True)
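The try/except acts as a simple disk cache for the bag-of-words corpus; a hedged sketch of how the TF-IDF wrapper would typically be consumed next (the _tfidf.mm file name is hypothetical):

MmCorpus.serialize(mname + '_tfidf.mm', tfidf[mm], progress_cnt=1000)  # stream weighted vectors to disk
mm_tfidf = MmCorpus(mname + '_tfidf.mm')  # and read them back as a corpus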
Example 7: run_lda
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load_from_text [as alias]
def run_lda(corpus_file, dictionary_path, topics=10):
    # Train an LDA model from a serialized corpus and its dictionary.
    id2word = Dictionary.load_from_text(dictionary_path)
    mm = MmCorpus(corpus_file)
    print(mm)  # report corpus dimensions before training
    lda = LdaModel(corpus=mm, id2word=id2word, num_topics=topics)
    return lda
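A usage sketch with the imports the function relies on; the two input files are hypothetical and assumed to come from an earlier serialization step like the ones in Examples 4 and 6:

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LdaModel

lda = run_lda('corpus_bow.mm', 'wordids.txt', topics=20)
for topic in lda.show_topics(num_topics=5, num_words=6):
    print(topic)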