

Python Dictionary.load_from_text Method Code Examples

This article collects typical usage examples of the Python method gensim.corpora.dictionary.Dictionary.load_from_text, gathered from open-source projects. If you are unsure what Dictionary.load_from_text does or how to call it, the curated examples below should help. You can also explore other usage examples of the containing class, gensim.corpora.dictionary.Dictionary.


Seven code examples of the Dictionary.load_from_text method are shown below, ordered by popularity by default.
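Before diving into the project examples, here is a minimal round-trip sketch of the API itself: Dictionary.save_as_text writes a plain-text id-to-word mapping, and Dictionary.load_from_text reads it back. The toy corpus and file name are illustrative assumptions, not taken from the projects below.

from gensim.corpora.dictionary import Dictionary

# Build a dictionary from a tiny tokenized corpus (illustrative data).
texts = [["human", "computer", "interaction"],
         ["graph", "minors", "survey"]]
dictionary = Dictionary(texts)

# Round trip: save the id->word mapping as plain text, then reload it.
dictionary.save_as_text("toy_wordids.txt")
id2word = Dictionary.load_from_text("toy_wordids.txt")

print(id2word.doc2bow(["graph", "survey"]))  # e.g. [(3, 1), (5, 1)]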

Example 1: user_lda

# Required import: from gensim.corpora.dictionary import Dictionary
# The method is then called as Dictionary.load_from_text(path)
def user_lda(lda, dictionary_path, text_yielder):
    # Load the id->word mapping saved earlier with Dictionary.save_as_text.
    id2word = Dictionary.load_from_text(dictionary_path)
    ret = {}
    # text_yielder is a callable producing (user, raw_text) pairs;
    # UserCorpus.text2tokens is the project's own tokenizer.
    for user, text in text_yielder():
        bow = id2word.doc2bow(UserCorpus.text2tokens(text))
        ret[user] = lda[bow]
    return ret
Developer: cosbynator · Project: karma-prediction-cs224w · Lines: 9 · Source: graph_tools.py
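A quick usage sketch for user_lda; the generator and the trained model below are illustrative assumptions, not artifacts from the original project:

# Hypothetical (user, text) generator -- an assumption for illustration.
def sample_yielder():
    yield "alice", "great post about graph algorithms"
    yield "bob", "short survey of topic models"

# Assuming `trained_lda` is an LdaModel trained elsewhere:
# per_user_topics = user_lda(trained_lda, "wordids.txt", sample_yielder)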

Example 2: SNAP_id2word

# Required import: from gensim.corpora.dictionary import Dictionary
# The method is then called as Dictionary.load_from_text(path)
def SNAP_id2word(self):
    # Resolve the dictionary file relative to this module's location.
    path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'snap_data',
        'gensim_snap_dict.txt'
    )
    return Dictionary.load_from_text(path)
Developer: dshahaf · Project: snap-sentiment · Lines: 11 · Source: corpus.py

Example 3: len

# Required import: from gensim.corpora.dictionary import Dictionary
# The method is then called as Dictionary.load_from_text(path)
    fname = sys.argv[1]
    prefix = fname.split('/')[0]
    if len(sys.argv) > 2 and sys.argv[2][0:2] != '--':
        suffix = sys.argv[2]

    lemmatizer, filter_words = parse_args(sys.argv)
    if lemmatizer is None:
        LEMMATIZE = False
        suffix = '_tokenized_tfidf'
    else:
        suffix = '_lemmatized_tfidf'

    # Load the pickled LDA model (pickle replaces Python 2's cPickle).
    with open(prefix + suffix + '.ldamodel', 'rb') as f:
        lda = pickle.load(f)
    id2token = Dictionary.load_from_text(prefix + suffix + '_wordids.txt')

    if DEBUG:
        print("prefix:", prefix)
        print("suffix:", suffix)
        print("using dict:", prefix + suffix + '_wordids.txt')
        print(id2token)

    docs = []
    with open(fname) as f:
        print("splitting %s" % fname)
        tmp = []
        for line in f:  # buffer lines into the docs list
            if line[0] == '@':  # an '@' line starts a new document
                docs.append(tmp)
                tmp = [line]
Developer: syhw · Project: contextual_word_segmentation · Lines: 33 · Source: split_corpus.py

Example 4: len

# Required import: from gensim.corpora.dictionary import Dictionary
# The method is then called as Dictionary.load_from_text(path)
    input, output = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE

    # build dictionary. only keep 100k most frequent words (out of total ~8.2m unique tokens)
    # takes about 9h on a macbook pro, for 3.5m articles (june 2011 wiki dump)
    wiki = WikiCorpus(input, keep_words=keep_words)
    # save dictionary and bag-of-words (term-document frequency matrix)
    # another ~9h
    wiki.dictionary.save_as_text(output + "_wordids.txt")
    MmCorpus.serialize(output + "_bow.mm", wiki, progress_cnt=10000)
    del wiki

    # initialize corpus reader and word->id mapping
    id2token = Dictionary.load_from_text(output + "_wordids.txt")
    mm = MmCorpus(output + "_bow.mm")

    # build tfidf,
    # ~30min
    from gensim.models import TfidfModel

    tfidf = TfidfModel(mm, id2word=id2token, normalize=True)

    # save tfidf vectors in matrix market format
    # ~2h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(output + "_tfidf.mm", tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Developer: hjanime · Project: gensim · Lines: 32 · Source: wikicorpus.py
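A short follow-up on the artifacts this script leaves behind: the serialized tf-idf matrix can be streamed back with MmCorpus and fed to a downstream model. The "wiki_en" prefix below is an assumption for illustration, mirroring the output prefix above.

from gensim.corpora import MmCorpus
from gensim.corpora.dictionary import Dictionary
from gensim.models import LsiModel

# Reload the serialized artifacts produced by the script above.
id2word = Dictionary.load_from_text("wiki_en_wordids.txt")
tfidf_corpus = MmCorpus("wiki_en_tfidf.mm")

# Stream the tf-idf corpus into a 400-dimensional LSI model.
lsi = LsiModel(tfidf_corpus, id2word=id2word, num_topics=400)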

Example 5: len

# Required import: from gensim.corpora.dictionary import Dictionary
# The method is then called as Dictionary.load_from_text(path)
import sys
import logging

from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from news.document import *

if len(sys.argv) != 2:
    print('Usage: {0} rcv1_data_dir'.format(sys.argv[0]))
    raise SystemExit(1)

data_dir = sys.argv[1]
mapping_file = data_dir + '/token_id_idf'
dictionary_file = data_dir + '/id_token_df'
token_file = data_dir + '/tokens'
lda_file = data_dir + '/lda_model'

print('creating dictionary...')
# Supplied idfs from rcv1/lyrl2004 were based on 23307 training docs.
N = 23307
create_dictionary_file(mapping_file, dictionary_file, N)
dictionary = Dictionary.load_from_text(dictionary_file)

print('creating corpus...')
corpus = SimpleLowCorpus(token_file, dictionary)

print('training model...')
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
lda = LdaModel(corpus, id2word=dictionary, num_topics=200)
print('done!')
print('\n' * 3)
print('====== final topics ======')
# Note: older gensim releases used the topics=/topn= keywords shown here.
topics = lda.show_topics(topics=-1, topn=4)
for i, topic in enumerate(topics):
    print(i, topic)

print('saving model...')
Developer: biddyweb · Project: news-1 · Lines: 33 · Source: train.py

Example 6: lemmatize

# Required import: from gensim.corpora.dictionary import Dictionary
# The method is then called as Dictionary.load_from_text(path)
        else:
            LEMMATIZE = False

    if not LEMMATIZE:
        ONLY_NOUN_VERBS = False
        ONLY_NOUNS = False

    if LEMMATIZE:
        print("we will lemmatize ('you were' -> 'be/VB')")
        mname = prefix + '_lemmatized_tfidf'
    else:
        print("you don't have pattern: we will tokenize ('you were' -> 'you', 'were')")
        mname = prefix + '_tokenized_tfidf'

    try:
        # Reuse previously serialized artifacts if they exist.
        id2token = Dictionary.load_from_text(mname + '_wordids.txt')
        mm = MmCorpus(mname + '_bow.mm')
        print(">>> Loaded corpus from serialized files")
    except IOError:
        # Otherwise extract the articles and serialize them for next time.
        print(">>> Extracting articles...")
        corpus = CDS_Corpus(FOLDER)
        corpus.dictionary.save_as_text(mname + '_wordids.txt')
        print(">>> Saved dictionary as " + mname + "_wordids.txt")
        MmCorpus.serialize(mname + '_bow.mm', corpus, progress_cnt=1000)
        print(">>> Saved MM corpus as " + mname + "_bow.mm")
        id2token = Dictionary.load_from_text(mname + '_wordids.txt')
        mm = MmCorpus(mname + '_bow.mm')
        del corpus

    print(">>> Using TF-IDF")
    tfidf = models.TfidfModel(mm, id2word=id2token, normalize=True)
Developer: syhw · Project: contextual_word_segmentation · Lines: 33 · Source: prepare_corpus_tfidf.py

Example 7: run_lda

# Required import: from gensim.corpora.dictionary import Dictionary
# The method is then called as Dictionary.load_from_text(path)
def run_lda(corpus_file, dictionary_path, topics=10):
    # Load the saved id->word mapping and the serialized bag-of-words corpus.
    id2word = Dictionary.load_from_text(dictionary_path)
    mm = MmCorpus(corpus_file)
    print(mm)
    lda = LdaModel(corpus=mm, id2word=id2word, num_topics=topics)
    return lda
Developer: cosbynator · Project: karma-prediction-cs224w · Lines: 8 · Source: graph_tools.py
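For completeness, a hedged sketch of invoking run_lda on artifacts like those serialized in examples 4 and 6; the file names here are assumptions:

# lda = run_lda("wiki_bow.mm", "wiki_wordids.txt", topics=50)
# for topic in lda.print_topics(num_topics=5):
#     print(topic)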


Note: The gensim.corpora.dictionary.Dictionary.load_from_text examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from community-contributed open-source projects; copyright remains with the original authors, and any use or redistribution should follow the corresponding project's license. Please do not reproduce without permission.