当前位置: 首页>>代码示例>>Python>>正文


Python SqliteDict.iterkeys方法代码示例

本文整理汇总了Python中sqlitedict.SqliteDict.iterkeys方法的典型用法代码示例。如果您正苦于以下问题:Python SqliteDict.iterkeys方法的具体用法?Python SqliteDict.iterkeys怎么用?Python SqliteDict.iterkeys使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在sqlitedict.SqliteDict的用法示例。


在下文中一共展示了SqliteDict.iterkeys方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: merge

# 需要导入模块: from sqlitedict import SqliteDict [as 别名]
# 注意: iterkeys 是 SqliteDict 实例的方法, 不能单独导入, 需通过实例调用 (例如 d.iterkeys())
def merge(texts, index_dic=True, tfidf=True, lda=True, sim=False):
    """Merge a batch of new articles into the persisted index, corpus and models.

    Args:
        texts: Open file-like object. Each line is split on ``','`` into
            ``id``, ``ac_id`` and, optionally, a third field holding the
            article tokens separated by ``':'``.
        index_dic: When true, extend the id/position indexes, merge the
            dictionary and append the new documents to the stored corpus.
        tfidf: When true, back up the stored TF-IDF model and retrain it on
            the stored corpus.
        lda: When true, run the LDA merge stage.
        sim: Not read by the part of the function shown here.
    """
    total_start = timeit.default_timer()
    # Per-stage wall-clock timings; a stage that is skipped keeps 0.
    make_index_time = 0
    make_dict_time = 0
    make_lda_time = 0
    make_tfidf_time = 0
    sim_time = 0

    if index_dic:
        f = [line.split(',') for line in texts.readlines()]
        logging.info('Create id & ac_id list')
        ids = [row[0] for row in f]
        ac_ids = [row[1] for row in f]
        logging.info('Create contents list')
        # Only rows with exactly three fields carry tokens; every other row
        # becomes an empty document so corpus positions stay aligned with ids.
        contents = [
            row[2].strip().split(':') if len(row) == 3 else []
            for row in f
        ]

        # make index
        logging.info('***********Now merge index by sqlitedict***********')
        timer_start = timeit.default_timer()
        # New documents are appended after the ones already in the corpus.
        old_corpus_len = len(corpora.MmCorpus(gl.res + '/resource/corpus'))
        new_positions = range(old_corpus_len, old_corpus_len + len(f))
        pos2paid = list(zip(new_positions, ac_ids))
        # Group positions by the *integer* ac_id. Grouping on the raw string
        # would treat '12' and '12\n' (a two-field row keeps its newline) as
        # different groups that then overwrite each other under key 12.
        paid2pos_new = {}
        for pos, paid in pos2paid:
            paid2pos_new.setdefault(int(paid), []).append(pos)
        id2pos_new = dict(zip(ids, new_positions))
        pos2id_new = dict(zip(new_positions, ids))

        id2pos = SqliteDict(filename=gl.res + '/resource/id2pos', autocommit=True)
        id2pos.update(id2pos_new)
        id2pos.close()
        pos2id = SqliteDict(filename=gl.res + '/resource/pos2id', autocommit=True)
        pos2id.update(pos2id_new)
        pos2id.close()
        paid2pos = SqliteDict(filename=gl.res + '/resource/paid2pos', autocommit=True)
        try:
            # Stored keys are passed through int() so they compare equal to
            # the int keys of paid2pos_new.
            known_paids = set(int(k) for k in paid2pos.iterkeys())
            merged_paid2pos = {}
            for paid, positions in paid2pos_new.items():
                if paid in known_paids:
                    # Key already stored: extend its position list.
                    positions = list(chain(paid2pos[paid], positions))
                # Keys that are not stored yet are written as well, so new
                # paids are indexed just like new ids are in id2pos/pos2id.
                merged_paid2pos[paid] = positions
            paid2pos.update(merged_paid2pos)
        finally:
            paid2pos.close()
        timer_end = timeit.default_timer()
        make_index_time = timer_end - timer_start

        # Merge dictionary
        logging.info('***********Now merge Dictionary***********')
        timer_start = timeit.default_timer()
        newDict = corpora.Dictionary(contents)
        newDict.filter_extremes(no_below=20, no_above=0.1, keep_n=None)
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        dic.merge_with(newDict)
        dic.save(gl.res + '/resource/dict')
        timer_end = timeit.default_timer()
        make_dict_time = timer_end - timer_start

        # merge corpus
        logging.info('***********Now merge Corpus***********')
        temps = []
        for i, t in enumerate(contents):
            temps.append(dic.doc2bow(t))
            if i % 10000 == 0:
                logging.info('make corpus ' + str(i) + ' articles')
        corpora.MmCorpus.serialize(gl.res + '/resource/new_c', temps)
        gc.collect()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        new_corpus = corpora.MmCorpus(gl.res + '/resource/new_c')
        merged_corpus = chain(corpus, new_corpus)
        corpora.MmCorpus.serialize(gl.res + '/resource/merged_c', merged_corpus)  # Overwrite corpus

        # Rotate files with explicit paths and in a fixed order: drop the old
        # corpus and the temporary one first, then move the merged corpus into
        # place. Walking a glob listing instead makes the result depend on
        # directory order (the merged file can be renamed to 'corpus' and then
        # unlinked by the stale 'corpus' entry) and suffix matching would also
        # hit unrelated files such as 'old_corpus'.
        resource_prefix = gl.res + '/resource/'
        for stale_name in ('corpus', 'corpus.index', 'new_c', 'new_c.index'):  # rm useless corpus
            stale_path = resource_prefix + stale_name
            if os.path.exists(stale_path):
                os.unlink(stale_path)
        for merged_name, final_name in (('merged_c', 'corpus'),
                                        ('merged_c.index', 'corpus.index')):  # rename to corpus
            merged_path = resource_prefix + merged_name
            if os.path.exists(merged_path):
                os.rename(merged_path, resource_prefix + final_name)

    if tfidf:
        # do tfidf merge
        gc.collect()
        logging.info('***********Now merge TF-IDF model***********')
        timer_start = timeit.default_timer()
        for filename in glob.glob(gl.res + '/resource/*'):  # backup old model
            if filename.endswith('tfidf'):
                os.rename(filename, filename + '_' + gl.c_time)
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')  # reload corpus
        # From here on the name `tfidf` holds the model, not the boolean flag.
        tfidf = models.TfidfModel(corpus)
        tfidf.save(gl.res + '/resource/tfidf')
        timer_end = timeit.default_timer()
        make_tfidf_time = timer_end - timer_start

    if lda:
        # do lda merge
        gc.collect()
        # Load the model from disk so this stage also works when the TF-IDF
        # stage above was skipped.
        tfidf = models.TfidfModel.load(gl.res + '/resource/tfidf')
#.........这里部分代码省略.........
开发者ID:YangZunYu,项目名称:FermiNLP,代码行数:103,代码来源:FuncV3.py


注:本文中的sqlitedict.SqliteDict.iterkeys方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。