当前位置: 首页>>代码示例>>Python>>正文


Python Dictionary.load方法代码示例

本文整理汇总了Python中gensim.corpora.dictionary.Dictionary.load方法的典型用法代码示例。如果您正苦于以下问题:Python Dictionary.load方法的具体用法?Python Dictionary.load怎么用?Python Dictionary.load使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在gensim.corpora.dictionary.Dictionary的用法示例。


在下文中一共展示了Dictionary.load方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: merge_dictionaries

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import load [as 别名]
def merge_dictionaries(dictionaries_path, merged_dictionary_path=None):
    dict_paths = list(iglob(dictionaries_path))

    final_dictionary = Dictionary.load(dict_paths[0])

    for dict_path in dict_paths[1:]:
        dictionary = Dictionary.load(dict_path)

        final_dictionary.merge_with(dictionary)

    if merged_dictionary_path:
        final_dictionary.save(merged_dictionary_path)

    return final_dictionary
开发者ID:NLeSC,项目名称:AVResearcherXL,代码行数:16,代码来源:tasks.py

示例2: analyze_top_dfs

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import load [as 别名]
def analyze_top_dfs(tokendict, tagdict, cutoff_factor=1):
    """Print the most frequent tokens that are more common than the top tag.

    The threshold is the largest document frequency found in ``tagdict``
    multiplied by ``cutoff_factor``. Among the 100 tokens with the highest
    document frequency in ``tokendict``, those above the threshold are
    printed in descending order; printing stops at the first token that is
    not above it.

    Args:
        tokendict: A gensim dictionary of tokens, or a ``str`` path from
            which one is loaded with ``Dictionary.load``.
        tagdict: A gensim dictionary of tags, or a ``str`` path from which
            one is loaded with ``Dictionary.load``.
        cutoff_factor: Multiplier applied to the top tag's document
            frequency to obtain the threshold.

    Raises:
        ValueError: If ``tagdict.dfs`` is empty (raised by ``max``).
    """
    if isinstance(tokendict, str):
        tokendict = Dictionary.load(tokendict)
    if isinstance(tagdict, str):
        tagdict = Dictionary.load(tagdict)

    by_df = operator.itemgetter(1)
    # ``items()`` and single-argument ``print(...)`` behave the same on
    # Python 2 and Python 3, unlike ``iteritems()`` / the print statement.
    max_tag_df = max(tagdict.dfs.items(), key=by_df)
    sorted_dfs = sorted(tokendict.dfs.items(), key=by_df, reverse=True)
    print("count threshold: %-15s\t%d" % (tagdict[max_tag_df[0]], max_tag_df[1]))
    print("----------------------------------------------")
    for token_id, df in sorted_dfs[:100]:
        if df > max_tag_df[1] * cutoff_factor:
            # Token text is truncated to 15 characters to keep columns aligned.
            print("%-15s\t%d" % (tokendict[token_id][:15], df))
        else:
            # Frequencies are sorted descending, so nothing later can qualify.
            break
开发者ID:mr1azl,项目名称:tag_recommender,代码行数:17,代码来源:pruning.py

示例3: __init__

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import load [as 别名]
    def __init__(self, topics=10,
                 worker=3,
                 pretrained_model=None,
                 dictionary=None):
        """
        Initialise LDA model training.
        Args:
            topics -- number of topics
            worker -- parallelism parameter, usually the number of cores minus one
            pretrained_model -- a previously trained model; online updating is
                                supported, so the last trained model can be loaded
            dictionary -- words are converted to IDs for training, so each model
                          comes with a matching word-to-ID dictionary
        Both ``pretrained_model`` and ``dictionary`` must be given for either
        to be loaded; otherwise the model and dictionary stay ``None``.
        Example:
            >>> lda = LDA(topics = 20, worker = 2, 
                          pretrained_model = model_file, 
                          dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """

        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        # The original tested the undefined name ``common_dictionary`` here,
        # raising NameError whenever a pretrained model was supplied.
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)
开发者ID:freygit,项目名称:36,代码行数:30,代码来源:lda.py

示例4: plot_dict_hist

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import load [as 别名]
def plot_dict_hist(gdict):
    """Plot document frequency against token rank on log-log axes.

    Document frequencies from ``gdict.dfs`` are sorted in descending order
    and plotted against their rank. Vertical lines mark the first rank at
    which the normalised ``cdf`` array exceeds 50%, 80%, 90% and 95%, and
    those ranks are printed.

    Args:
        gdict: A gensim dictionary, or a ``str`` path from which one is
            loaded with ``Dictionary.load``.

    Raises:
        IndexError: If no rank exceeds one of the percentile levels.
    """
    if isinstance(gdict, str):
        gdict = Dictionary.load(gdict)

    # Only the counts are needed, so sort the values directly.
    y = sorted(gdict.dfs.values(), reverse=True)
    x = arange(0, len(y))

    plt.figure(figsize=(8, 5))
    plt.loglog(x, y)
    plt.grid()
    plt.xlabel("Token rank")
    plt.ylabel("Document count")

    cdf = np.empty(len(y))
    # NOTE(review): `delta` is defined outside this block; presumably it
    # fills `cdf` in place from `y` -- confirm against its definition.
    delta(y, cdf)
    cdf /= np.max(cdf)  # normalize

    # (label, level, line colour) for each marked percentile.
    marks = [("50%", 0.50, 'c'), ("80%", 0.80, 'g'),
             ("90%", 0.90, 'r'), ("95%", 0.95, 'k')]
    # Resolve every rank before drawing, as the original did.
    ranks = [x[cdf > level][0] for _, level, _ in marks]

    for (_, _, color), rank in zip(marks, ranks):
        plt.axvline(rank, color=color)

    for (label, _, _), rank in zip(marks, ranks):
        # Label and rank are separated by a tab only; the Python 2 print
        # statement inserts no space after an item ending in a tab.
        print("%s\t%s" % (label, rank))
开发者ID:mr1azl,项目名称:tag_recommender,代码行数:34,代码来源:pruning.py

示例5: main

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import load [as 别名]
def main(args):
    """Build a corpus from parsed arguments, train Word2Vec on it and save it.

    Args:
        args: Namespace-like object providing ``corpus_type``,
            ``corpus_path``, ``processed_corpus_save_path``,
            ``dictionary_path``, ``dictionary_out_path``, ``vocab_path``,
            ``window_size``, ``drop_capitals``, ``minimum_token_count``,
            ``vector_dimensions``, ``out_path`` and ``vocab_out_path``.

    Raises:
        ValueError: If a processed-corpus save path is given for a corpus
            type other than ``"wiki"``.
    """
    if args.corpus_type != "wiki" and args.processed_corpus_save_path is not None:
        raise ValueError("Processed corpus saving only supported for 'wiki' corpus type")

    # Optional keyword arguments for the corpus constructor.
    corpus_options = {}
    if args.dictionary_path is not None:
        corpus_options["dictionary"] = Dictionary.load(args.dictionary_path)
    if args.dictionary_out_path is not None:
        corpus_options["dictionary_save_path"] = args.dictionary_out_path
    if args.corpus_type == "wiki" and args.processed_corpus_save_path is not None:
        corpus_options["sentences_save_path"] = args.processed_corpus_save_path

    logging.debug("Building corpus")
    corpus_factory = CORPUS_TYPES[args.corpus_type]
    documents = corpus_factory(args.corpus_path, **corpus_options).get_texts()

    logging.debug("Now beginning VSM construction with Word2Vec")

    training_options = dict(
        sentences=documents,
        vocab_path=args.vocab_path,
        window=args.window_size,
        drop_capitals=args.drop_capitals,
        min_count=args.minimum_token_count,
        size=args.vector_dimensions,
        workers=multiprocessing.cpu_count(),
    )
    model = Word2Vec(**training_options)

    model.save(args.out_path)

    vocab_out_path = args.vocab_out_path
    if vocab_out_path is not None:
        model.save_vocab(vocab_out_path)
开发者ID:hans,项目名称:deepBLE,代码行数:36,代码来源:build_gensim_vsm.py

示例6: prune_dictionary

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import load [as 别名]
def prune_dictionary(src_dictionary_path, dest_dictionary_path=None,
                     no_below=None, no_above=None, keep_n=None):
    """Load a saved dictionary, filter its extremes and optionally save it.

    Args:
        src_dictionary_path: Path the dictionary is loaded from.
        dest_dictionary_path: Optional path. When truthy, the filtered
            dictionary is saved there.
        no_below: Forwarded unchanged to ``filter_extremes``.
        no_above: Forwarded unchanged to ``filter_extremes``.
        keep_n: Forwarded unchanged to ``filter_extremes``.

    Returns:
        The filtered dictionary.
    """
    # NOTE(review): the None defaults are passed through as-is; confirm
    # that filter_extremes accepts None for no_below / no_above.
    limits = {'no_below': no_below, 'no_above': no_above, 'keep_n': keep_n}

    pruned = Dictionary.load(src_dictionary_path)
    pruned.filter_extremes(**limits)

    if not dest_dictionary_path:
        return pruned

    pruned.save(dest_dictionary_path)
    return pruned
开发者ID:NLeSC,项目名称:AVResearcherXL,代码行数:12,代码来源:tasks.py

示例7: getDictionary

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import load [as 别名]
def getDictionary(word_corpus, useSavedTill):
    """Return the dictionary, loading it from file or building it afresh.

    When ``useSavedTill`` is at least ``USESAVED.dictionary`` the dictionary
    is loaded from ``file_lda_gensim_dictionary``. Otherwise it is built
    from ``word_corpus.values()`` and saved to that same file.

    Args:
        word_corpus: Mapping whose values are the documents used to build
            the dictionary.
        useSavedTill: Value compared against ``USESAVED.dictionary`` to
            decide whether the saved dictionary may be reused.

    Returns:
        The loaded or newly created dictionary.
    """
    if useSavedTill >= USESAVED.dictionary:
        common_logger.info("loading dictionary from file")
        return Dictionary.load(file_lda_gensim_dictionary)

    common_logger.info("Creating dictionary from corpus")
    fresh = Dictionary(word_corpus.values())
    common_logger.info("saving dictionary")
    fresh.save(file_lda_gensim_dictionary)
    return fresh
开发者ID:KshitizSethia,项目名称:AcroDisam,代码行数:13,代码来源:LDAModel.py

示例8: filter_extremes_wrapper

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import load [as 别名]
def filter_extremes_wrapper(gdict, no_below=1, no_above=1.0, keep_n=None, save_pickle=None):
    """Filter a gensim dictionary in place, printing it before and after.

    Args:
        gdict: An unfiltered gensim dictionary, or a ``str`` path from which
            one is loaded with ``Dictionary.load``.
        no_below: Forwarded unchanged to ``gdict.filter_extremes``.
        no_above: Forwarded unchanged to ``gdict.filter_extremes``.
        keep_n: Forwarded unchanged to ``gdict.filter_extremes``.
        save_pickle: Optional path. When truthy, the filtered dictionary is
            saved there.

    Returns:
        The filtered dictionary (the same object that was passed in or
        loaded).
    """
    if isinstance(gdict, str):
        gdict = Dictionary.load(gdict)

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Before filtering: %s" % (gdict,))
    # The original passed an undefined ``**kwargs`` here (NameError); the
    # filter limits are this function's own parameters.
    gdict.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    print("After filtering: %s" % (gdict,))

    if save_pickle:
        print("\nsaving...")
        gdict.save(save_pickle)

    return gdict
开发者ID:mr1azl,项目名称:tag_recommender,代码行数:15,代码来源:pruning.py

示例9: main

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import load [as 别名]
def main():
    """Search Elasticsearch with each LDA topic and write the hits to CSV.

    Loads the LDA model from ``modelfile`` and the dictionary from
    ``vocabulary``, runs one ``match`` query on index ``wiki4`` per entry
    yielded by ``get_doc_topics``, passes the collected results through
    ``add_keywords`` and writes them to
    ``nowiki_4_with_kera_250_topics.csv``.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(name)-12s: %(message)s')
    kera = NOB_kera()
    es = Elasticsearch(port=9201)
    mod = LdaModel.load(modelfile)
    vocab = Dictionary.load(vocabulary)
    tfidf = TfidfModel(dictionary=vocab)

    topic_stream = get_doc_topics(mod, mod.num_topics, num_words_from_topic, vocab, tfidf)
    # One search per topic; key order matches the original row layout.
    hits = [
        {
            'topics': topics,
            'result': es.search(index='wiki4',
                                body={"query": {"match": {"_all": topics}}},
                                size=num_results_from_es),
            'topicid': topicid,
        }
        for (topics, topicid) in topic_stream
    ]

    annotated = add_keywords(hits, kera)
    pd.DataFrame(annotated).to_csv('nowiki_4_with_kera_250_topics.csv', encoding='utf-8')
开发者ID:comperiosearch,项目名称:comperio-text-analytics,代码行数:17,代码来源:build_LDA_kera_from_wiki.py

示例10: buildDictionary

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import load [as 别名]
def buildDictionary(force=False):
    """Build a dictionary in which each post corresponds to a document.

    When ``force`` is true or no file exists at ``dictName``, every post id
    from ``getPostids()`` is passed to ``addPostToDict``, with a progress
    line printed every 100 posts. Otherwise the module-level ``globalDict``
    is replaced by the dictionary loaded from ``dictName``. In both cases
    ``globalDict`` is then filtered with ``no_below=2, no_above=0.5``.

    Args:
        force: Rebuild from the posts even if a saved dictionary exists.
    """
    global globalDict

    if force or not isfile(dictName):
        postids = getPostids()
        numPosts = len(postids)

        # NOTE(review): `addPostToDict` presumably adds to `globalDict`;
        # it is defined outside this block -- confirm.
        for count, postid in enumerate(postids):
            if count % 100 == 0:
                # Single-argument print(...) is identical on Python 2 and 3.
                print("Added %d out of %d to dictionary: %s" % (count, numPosts, time.strftime("%H:%M:%S")))
            addPostToDict(postid)
    else:
        globalDict = Dictionary.load(dictName)

    # Filter out extremely common words
    globalDict.filter_extremes(no_below=2, no_above=0.5)
开发者ID:cboix,项目名称:rdaneel,代码行数:22,代码来源:parser.py

示例11: __init__

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import load [as 别名]
    def __init__(self, analyzed_items_path=None, dictionary_path=None,
                 corpus_path=None, tfidf_model_path=None):
        """Load whichever resources were given; leave the rest as ``None``.

        Args:
            analyzed_items_path: Stored on the instance when truthy,
                otherwise ``None`` is stored.
            dictionary_path: When truthy, loaded with ``Dictionary.load``.
            corpus_path: When truthy, opened as an ``MmCorpus``.
            tfidf_model_path: When truthy, loaded with ``TfidfModel.load``.
        """
        self.dictionary = Dictionary.load(dictionary_path) if dictionary_path else None
        # Any falsy value (e.g. an empty string) is normalised to None.
        self.analyzed_items_path = analyzed_items_path or None
        self.corpus = MmCorpus(corpus_path) if corpus_path else None
        self.tfidf_model = TfidfModel.load(tfidf_model_path) if tfidf_model_path else None
开发者ID:NLeSC,项目名称:AVResearcherXL,代码行数:23,代码来源:tasks.py

示例12: build_lda_model

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import load [as 别名]
    def build_lda_model(self, topics: int = 20):
        """Train an LDA model on the board's phrase files and save it.

        The dictionary is loaded from ``<input_dir>/<board>.dictionary``.
        Each document is split on whitespace, stripped of the ignored words
        and converted to bag-of-words with that dictionary. The trained
        model is saved to ``<input_dir>/<board>.lda``.

        Args:
            topics: Number of topics passed to ``LdaMulticore``.

        Returns:
            The trained ``LdaMulticore`` model.
        """
        # A frozenset gives constant-time membership tests; the lookup runs
        # once per token of every document.
        ignore_words = frozenset([
            'like', 'know', 'fuck', 'fucking', 'want', 'shit', 'sure',
            'isn', 'CHANBOARD', 'think', 'people', 'good', 'time', 'going',
            'WEBLINK', 'got', 'way', ''
        ])
        dictionary_path = op.join(self.input_dir, f'{self.board}.dictionary')
        dictionary: Dictionary = Dictionary.load(dictionary_path)
        documents = ReadThreads(
            self.board, input_dir=self.input_dir, file_type='phrases',
            return_func=lambda x, y: dictionary.doc2bow(
                [w for w in y.split() if w not in ignore_words]
            )
        )

        lda = LdaMulticore(
            documents, id2word=dictionary, num_topics=topics, iterations=2)

        model_path = op.join(self.input_dir, f'{self.board}.lda')
        lda.save(model_path)

        return lda
开发者ID:ffaristocrat,项目名称:ml-sandbox,代码行数:24,代码来源:build_model.py

示例13: _create_dictionary

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import load [as 别名]
    def _create_dictionary(self, mongo_client):
        """
        Loads the gensim Dictionary (gensim.corpora.dictionary.Dictionary) from its resource file when that file
        exists, otherwise starts from an empty one. The tokenized documents of every scrapper collection are then
        added, the result is saved back to the resource file and kept as the object's dictionary property.
        :param mongo_client: server.db.MongoClientContext
        """
        from gensim.corpora.dictionary import Dictionary

        if not self._resource_exists(self.dictionary_file):
            self.logger().debug("Dictionary file not found, creating a new Dictionary file")
            self._dictionary = Dictionary()
        else:
            self.logger().debug(
                    "Dictionary file found, loading it [%s]" % self._create_resource_path(self.dictionary_file))
            self._dictionary = Dictionary.load(self._create_resource_path(self.dictionary_file))

        # Fetch every record from every collection first, then tokenize.
        records = [record for collection in mongo_client.scrappers_collections() for record in collection.find()]
        documents = [self.tokenize_sentence(record[self.considerable_doc_property]) for record in records]

        self.logger().debug("Adding %d documents to dictionary (will skip existing ones)" % len(documents))
        self._dictionary.add_documents(documents)
        self._dictionary.save(self._create_resource_path(self.dictionary_file))
开发者ID:nathanIL,项目名称:openews,代码行数:25,代码来源:__init__.py

示例14: update

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import load [as 别名]
 def update(self, name, n=500, method='FastICA'):
     """Cluster stored segments and return the outcome as a JSON string.

     Settings are loaded under the encoded ``name``; they select the
     dictionary, the LDA model and the segment collection to use. The
     n-gram size is the length of the dictionary entry with id 0.

     Args:
         name: Name of the settings to load.
         n: Maximum number of segments to load (converted with ``int``).
         method: Passed to ``clusterer.fit_transform`` as ``method``.

     Returns:
         JSON string with ``result`` set to ``'OK'`` and ``data`` holding
         the output of ``self._make_data``.
     """
     settings = self._setstorage.load(encode_name(name))
     clusterer = Clusterer(settings)

     # Models referenced by the settings.
     vocab = Dictionary.load(os.path.join(DICTIONARY_PATH, settings[DICTIONARY]))
     transformer = NgramTransformer(len(vocab[0]))
     lda = LdaModel.load(os.path.join(LDA_PATH, settings[LDA_MODEL]))

     # Input documents.
     segments = self._segstorage.load(name=settings[SEGMENT_NAME], limit=int(n))
     documents = [segment.value for segment in segments]

     Xt = clusterer.fit_transform(documents,
                                  dictionary=vocab,
                                  ngramtransformer=transformer,
                                  ldamodel=lda,
                                  method=method)
     labels = clusterer.assign_labels(documents)
     payload = {'result': 'OK',
                'data': self._make_data(Xt, labels, documents)}
     return json.dumps(payload)
开发者ID:AlwaysTraining,项目名称:patnlp-hsm,代码行数:26,代码来源:clustererserver.py

示例15: scorer

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import load [as 别名]
def scorer(model, dic):
    """Return a function that applies a saved TF-IDF model to a word list.

    Args:
        model: Path passed to ``TfidfModel.load``.
        dic: Path passed to ``Dictionary.load``.

    Returns:
        A callable ``score(words)`` that converts ``words`` to
        bag-of-words with the loaded dictionary and returns the result of
        indexing the loaded TF-IDF model with it.
    """
    weighting = TfidfModel.load(model)
    vocab = Dictionary.load(dic)

    def score(words):
        bow = vocab.doc2bow(words)
        return weighting[bow]

    return score
开发者ID:vchahun,项目名称:cdec-features,代码行数:8,代码来源:tfidf.py


注:本文中的gensim.corpora.dictionary.Dictionary.load方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。