

Python Dictionary.save Method Code Examples

This article collects typical usage examples of the Python method gensim.corpora.dictionary.Dictionary.save. If you have been wondering what Dictionary.save does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore the wider usage of gensim.corpora.dictionary.Dictionary, the class this method belongs to.


The following presents 14 code examples of the Dictionary.save method, collected from open-source projects and ordered by popularity by default.
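Before the examples, here is a minimal, self-contained sketch of the save/load round trip (the file and variable names are illustrative):

from gensim.corpora.dictionary import Dictionary

# Build a dictionary from a small tokenized corpus
texts = [['human', 'computer', 'interaction'],
         ['graph', 'minors', 'survey']]
dictionary = Dictionary(texts)

# save() pickles the dictionary to disk; Dictionary.load() restores it
dictionary.save('tokens.dict')
restored = Dictionary.load('tokens.dict')
assert restored.token2id == dictionary.token2id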

Example 1: build_dictionaries_from_splits

# Required import: from gensim.corpora.dictionary import Dictionary
def build_dictionaries_from_splits(splits_template, n, save_pickle_tup=None):
    ''' Builds all 3 dictionaries from splits. If provided, `save_pickle_tup` must
        be a 3-tuple of the picklefile names in the following order:
        
        (title, body, tags)
        
        If `save_pickle_tup[i]` is None, the corresponding dictionary will not be saved.
    '''
    utitledict, ubodydict, utagdict = Dictionary(), Dictionary(), Dictionary()
    for eid in range(n):
        for row in row_stream(splits_template % eid):
            ID, title, body, tags = row
            utitledict.doc2bow(title.split(), allow_update=True)
            ubodydict.doc2bow(body.split(), allow_update=True)
            utagdict.doc2bow(tags.split(), allow_update=True)
    
    assert ubodydict.num_docs == utitledict.num_docs == utagdict.num_docs
    print "Before filtering..."
    print "utitledict:", utitledict
    print "ubodydict:", ubodydict
    print "utagdict:", utagdict
    
    if save_pickle_tup:
        assert len(save_pickle_tup) == 3
        if save_pickle_tup[0]:
            print "saving utitledict..."
            utitledict.save(save_pickle_tup[0])
        if save_pickle_tup[1]:
            print "saving ubodydict..."
            ubodydict.save(save_pickle_tup[1])
        if save_pickle_tup[2]:
            print "saving utagdict..."
            utagdict.save(save_pickle_tup[2])
            
    return (utitledict, ubodydict, utagdict)
Developer: mr1azl, Project: tag_recommender, Lines: 37, Source: pruning.py
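A hypothetical call might look like the following (the split template, split count, and pickle names are illustrative, not taken from the original project; passing None as the third tuple element skips saving the tag dictionary):

titledict, bodydict, tagdict = build_dictionaries_from_splits(
    'proc_Train_%d.csv', 4,
    save_pickle_tup=('title.dict', 'body.dict', None))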

Example 2: create_dictionary

# Required import: from gensim.corpora.dictionary import Dictionary
def create_dictionary(analyzed_items_path, dictionary_path=None):
    dictionary = Dictionary(iter_docs(analyzed_items_path))

    if dictionary_path:
        dictionary.save(dictionary_path)

    return dictionary
Developer: NLeSC, Project: AVResearcherXL, Lines: 9, Source: tasks.py

Example 3: create_corpus

# Required import: from gensim.corpora.dictionary import Dictionary
def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
    """\

    """
    wordid_filename = os.path.join(out_dir, 'cables_wordids.pickle')
    bow_filename = os.path.join(out_dir, 'cables_bow.mm')
    tfidf_filename = os.path.join(out_dir, 'cables_tfidf.mm')
    predicate = None # Could be set to something like pred.origin_filter(pred.origin_germany)
    # 1. Create word dict
    dct = Dictionary()
    dct_handler = DictionaryHandler(dct)
    handler = create_filter(dct_handler)
    handle_source(src, handler, predicate)
    dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    dct.save(wordid_filename)
    # 2. Reiterate through the cables and create the vector space
    corpus_handler = CorpusHandler(out_dir, dct=dct, allow_dict_updates=False)
    handler = create_filter(corpus_handler)
    handle_source(src, handler, predicate)
    # 3. Load corpus
    mm = MmCorpus(bow_filename)
    # 4. Create TF-IDF model
    tfidf = TfidfModel(mm, id2word=dct, normalize=True)
    # 5. Save the TF-IDF model
    MmCorpus.serialize(tfidf_filename, tfidf[mm], progress_cnt=10000)
Developer: Tooa, Project: cablemap, Lines: 27, Source: defaultcorpus.py

Example 4: build_dictionary

# Required import: from gensim.corpora.dictionary import Dictionary
def build_dictionary():
    dictionary = Dictionary()
    for line in open(wiki_index.ARTICLES_FILE):
        dictionary.add_documents([line.lower().split()])
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    dictionary.save(DICTIONARY_FILE)
    return dictionary
Developer: msushkov, Project: cs224w-wiki, Lines: 9, Source: lda.py

Example 5: build_dictionary

# Required import: from gensim.corpora.dictionary import Dictionary
def build_dictionary(self):
    documents = ReadThreads(
        self.board, input_dir=self.input_dir, file_type='phrases',
        return_func=lambda x, y: y.split())
    dictionary = Dictionary(documents)
    dictionary.save(f'{self.board}.dictionary')

    return dictionary
Developer: ffaristocrat, Project: ml-sandbox, Lines: 10, Source: build_model.py

Example 6: getDictionary

# Required import: from gensim.corpora.dictionary import Dictionary
def getDictionary(word_corpus, useSavedTill):
    if useSavedTill >= USESAVED.dictionary:
        common_logger.info("loading dictionary from file")
        dictionary = Dictionary.load(file_lda_gensim_dictionary)
        return dictionary
    else:
        common_logger.info("Creating dictionary from corpus")
        dictionary = Dictionary(word_corpus.values())
        common_logger.info("saving dictionary")
        dictionary.save(file_lda_gensim_dictionary)
        return dictionary
Developer: KshitizSethia, Project: AcroDisam, Lines: 13, Source: LDAModel.py

Example 7: build_dictionary_from_splits

# Required import: from gensim.corpora.dictionary import Dictionary
def build_dictionary_from_splits(splits_template, column, n, save_pickle=None):
    ''' Build dictionary from splits. If `save_pickle` is provided, then save. '''
    unfiltered_dict = Dictionary()
    for eid in range(n):
        # Use the provided template rather than a hard-coded path
        unfiltered_dict.add_documents(csv_isolator(splits_template % eid, column))
    print("Before filtering,", unfiltered_dict)
    if save_pickle:
        print("\nsaving...")
        unfiltered_dict.save(save_pickle)
    
    return unfiltered_dict
Developer: mr1azl, Project: tag_recommender, Lines: 13, Source: pruning.py

Example 8: TextCorpus

# Required import: from gensim.corpora.dictionary import Dictionary
class TextCorpus(gensim.corpora.TextCorpus):
    """A corpus class which makes some minor extensions to the Gensim
    `TextCorpus` implementation:

    - Support loading of pre-built dictionary
    """

    def __init__(self, input=None, dictionary=None, dictionary_save_path=None,
                 pre_tokenized=False, lowercase=False):
        super(gensim.corpora.TextCorpus, self).__init__()

        self.input = input
        self.metadata = False

        self.pre_tokenized = pre_tokenized
        self.lowercase = lowercase

        if dictionary is None:
            self.dictionary = Dictionary()

            if input is not None:
                self.dictionary.add_documents(self.get_texts())
            else:
                logging.warning("No input document stream provided; "
                                "assuming dictionary will be "
                                "initialized in some other way.")
        else:
            self.dictionary = dictionary

        if dictionary_save_path is not None:
            self.dictionary.save(dictionary_save_path)

    def get_texts(self):
        length = 0

        # Input should have one document (sentence, for the word2vec case) per line
        for line in getstream(self.input):
            length += 1

            if self.pre_tokenized:
                # Normalize to unicode text (the original Python 2 code used `unicode`)
                if not isinstance(line, str):
                    line = line.decode('utf8', errors='strict')
                yield line
            else:
                yield gensim.utils.tokenize(line, lowercase=self.lowercase)

        self.length = length
Developer: hans, Project: deepBLE, Lines: 49, Source: text.py

Example 9: main

# Required import: from gensim.corpora.dictionary import Dictionary
def main():
    parser = ArgumentParser()
    parser.add_argument('-d', '--wiki-dump')
    parser.add_argument('-l', '--limit', default=None, type=int)
    parser.add_argument('-p', '--num-procs', default=1, type=int)
    parser.add_argument('-o', '--out', default='vocab')
    opts = parser.parse_args()

    dump_loc = opts.wiki_dump
    limit = opts.limit
    n_procs = opts.num_procs
    out_fn = opts.out

    dump_gen = get_dump_gen(dump_loc, limit=limit, n_procs=n_procs)

    nlp = spacy.en.English()
    vocab = Dictionary(([token.text.lower().strip() for token in doc if token.text.strip() != ""]
                        for doc in nlp.pipe((art['article.text'] for art in dump_gen), n_threads=n_procs,
                                            parse=False, tag=False, entity=False)))

    vocab.save('%s.vocab' % out_fn)
    vocab.save_as_text('%s.txt' % out_fn)
Developer: andrely, Project: sublexical-features, Lines: 24, Source: wiki_word_count.py
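Note that save() and save_as_text() write different formats, each with its own loader; a quick sketch, assuming the default out_fn of 'vocab':

from gensim.corpora.dictionary import Dictionary

vocab = Dictionary.load('vocab.vocab')              # binary pickle written by save()
vocab_txt = Dictionary.load_from_text('vocab.txt')  # plain-text table written by save_as_text()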

Example 10: RedisCorpus

# Required import: from gensim.corpora.dictionary import Dictionary
    
def addCorpusMap(index, postid):
    # Function header reconstructed from the call site in RedisCorpus below
    db.hset('idlookup', index, postid)

class RedisCorpus(object):
    def __init__(self, postids):
        self.postids = postids
        self.numPosts = len(self.postids)
        
    def __iter__(self):
        count = 0
        for postid in self.postids:
            if count % 100 == 0:
                print "Wrote %d out of %d to corpus: %s" % (count, self.numPosts, time.strftime("%H:%M:%S"))
            addCorpusMap(count, postid)
            count += 1
            yield corpusOfPost(postid, force=True)

def buildCorpus():
    """ Returns a corpus object that contains sparse vectors from every post. """

    postids = getPostids()
    corpus = RedisCorpus(postids)
    return corpus

if __name__ == "__main__":
    buildDictionary(force=True)
    globalDict.save(dictName)

    corpus = buildCorpus()
    BleiCorpus.serialize('redditcorpus.lda-c', corpus)
Developer: cboix, Project: rdaneel, Lines: 32, Source: parser.py
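The serialized LDA-C corpus can later be re-opened with gensim's BleiCorpus reader, for example:

from gensim.corpora import BleiCorpus

corpus = BleiCorpus('redditcorpus.lda-c')  # streams the sparse vectors back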

Example 11: Similarities

# Required import: from gensim.corpora.dictionary import Dictionary

#......... part of the code omitted here .........
        """
        return os.path.isfile(self._create_resource_path(resource_file))

    def _run_transformers(self):
        """
        Runs all the transformer methods listed providing the MongoDB client context instance.
        """
        with MongoClientContext(self._mongo_connection_record) as client:
            self._create_dictionary(client)
            self._create_lsi_similarity_index(client)

    def _create_dictionary(self, mongo_client):
        """
        Creates the gensim Dictionary (gensim.corpora.dictionary.Dictionary) or loads it if it already exists and sets
        the object's dictionary property.
        :param mongo_client: server.db.MongoClientContext
        """
        from gensim.corpora.dictionary import Dictionary

        if self._resource_exists(self.dictionary_file):
            self.logger().debug(
                    "Dictionary file found, loading it [%s]" % self._create_resource_path(self.dictionary_file))
            self._dictionary = Dictionary.load(self._create_resource_path(self.dictionary_file))
        else:
            self.logger().debug("Dictionary file not found, creating a new Dictionary file")
            self._dictionary = Dictionary()

        documents = []
        for doc in [di for d in mongo_client.scrappers_collections() for di in d.find()]:
            documents.append(self.tokenize_sentence(doc[self.considerable_doc_property]))

        self.logger().debug("Adding %d documents to dictionary (will skip existing ones)" % len(documents))
        self._dictionary.add_documents(documents)
        self._dictionary.save(self._create_resource_path(self.dictionary_file))

    def _create_lsi_similarity_index(self, mongo_client):
        """
        Creates a Similarity index based on LSI model from the available dictionary. Sets the object's lsi_model and
        similarity_index object properties.
        """
        from gensim.models import LsiModel
        from gensim.similarities import MatrixSimilarity

        self._lsi_mapping.clear()
        bow_corpus = []
        for idx, tp in enumerate([(c, di) for c in mongo_client.scrappers_collections() for di in c.find()]):
            self._lsi_mapping[idx] = tp
            bow_corpus.append(self.sentence_to_bow(tp[1][self.considerable_doc_property]))

        self._lsimodel = LsiModel(bow_corpus, id2word=self.dictionary)
        self._sim_index = MatrixSimilarity(self._lsimodel[bow_corpus])

    def calculate_similarities(self):
        """
        Find / calculate similarities between documents in the index.
        Returns a defaultdict with the key as the LSI index and the value is a list of tuples with the following values
        (LSI model Index, similarity threshold - numpy.float32)
        tuple
        :return: defaultdict(list)
        """
        similarities = defaultdict(list)
        if not self.lsi_index_mapping:
            return

        for idx, tp in sorted(self.lsi_index_mapping.items(), key=itemgetter(0)):
            sentence = tp[1][self.considerable_doc_property]
Developer: nathanIL, Project: openews, Lines: 70, Source: __init__.py

Example 12: saveWords

# Required import: from gensim.corpora.dictionary import Dictionary
def saveWords(words, wordfile):
    from gensim.corpora.dictionary import Dictionary
    # Avoid shadowing the built-in `dict`; the original unused MmCorpus import is dropped
    dictionary = Dictionary(words)
    dictionary.save(wordfile)
Developer: blanu, Project: AdversaryLab, Lines: 7, Source: processUtil.py

Example 13: saveGensim

# Required import: from gensim.corpora.dictionary import Dictionary
  def saveGensim(self, topic):
    if topic is None:
      # generate all
      self.saveGensim('movie')
      self.saveGensim('celebrity')
      self.saveGensim('syria')
      self.saveGensim('ufo')
      return

    posDocs = []
    negDocs = []

    if topic == 'movie':
      topic = 'movie_reviews'
    elif topic == 'celebrity':
      topic = 'bieber'

    if topic == 'movie_reviews':
      count = 100
      posDocs = self.movieReviews('positive', count)
      negDocs = self.movieReviews('negative', count)
    else:
      posDocs = self.getArticlesHelper('positive', topic)
      negDocs = self.getArticlesHelper('negative', topic)

    listOfTokens = [] # dictionary
    docs = [] # corpus

    for posDoc in posDocs:
      processed = self.processDocForGensim(posDoc)
      tokens = self.tokensFromText(processed)
      listOfTokens.append(tokens)
      docs.append(processed)
    for negDoc in negDocs:
      processed = self.processDocForGensim(negDoc)
      tokens = self.tokensFromText(processed)
      listOfTokens.append(tokens)
      docs.append(processed)

    dictionaryFilename = 'gensim_dictionary.txt'
    corpusFilename = 'gensim_corpus.mm'

    # make destination files if they don't exist
    dictionaryPath = os.path.join(
      os.path.dirname(os.path.abspath(__file__)),
      'james_data',
      topic,
      dictionaryFilename
    )

    corpusPath = os.path.join(
      os.path.dirname(os.path.abspath(__file__)),
      'james_data',
      topic,
      corpusFilename
    )

    corpusTempPath = corpusPath + '.tmp'

    if os.path.exists(dictionaryPath):
      os.remove(dictionaryPath)

    if os.path.exists(corpusPath):
      os.remove(corpusPath)

    if os.path.exists(corpusTempPath):
      os.remove(corpusTempPath)

    with open(dictionaryPath, 'w') as f:
      f.write(' ')

    with open(corpusPath, 'w') as f:
      f.write(' ')

    # save dictionary and corpus
    d = Dictionary(listOfTokens)
    d.save(dictionaryPath)

    with open(corpusTempPath, 'w') as f:
      f.write('\n'.join(docs))

    corpus = TextCorpus(corpusTempPath)
    MmCorpus.save_corpus(corpusPath, corpus)

    return
Developer: dshahaf, Project: snap-sentiment, Lines: 87, Source: corpus.py

Example 14: CableCorpus

# Required import: from gensim.corpora.dictionary import Dictionary
class CableCorpus(BaseCorpus):
    """\
    The cable corpus consists of several files which are written into a directory.

    * a dictionary with a ``<word id> <word> <frequency>`` mapping saved under "wordids.pickle"
    * a JSON file with a ``<cable reference id> <document number>`` mapping under "id2docid.json"
    * a `Matrix Market format <http://math.nist.gov/MatrixMarket/formats.html>`_ vector space model file "bow.mm"

    CAUTION: The corpus overrides any existing files with the same file name in the specified directory.

    By default, the corpus creates both the word dictionary and the vector space model
    as it goes, which may yield a noisy, not very useful vector space model. To filter
    out certain words, the corpus may be initialized with a pre-generated word dictionary.
    To make the dictionary immutable, the property ``allow_dict_updates`` should be set to
    ``False`` (updates are allowed by default). The resulting vector space model then
    contains only words which appear in the word dictionary.

    Example to reduce the clutter::

        corpus = CableCorpus('/my/directory/')
        # Add some texts here
        corpus.add_text('ref-1', u'bla bla bla')
        corpus.add_text('ref-2', u'bla bla blub')
        ...
        corpus.dct.filter_extremes()
        corpus.close()

        from gensim.corpora.dictionary import Dictionary

        # Load the previously created dict (close() saves it as a pickle)
        dct = Dictionary.load('/my/directory/cables_wordids.pickle')
        # Create another corpus with the previously built word dict
        corpus = CableCorpus('/my/directory/', dct, allow_dict_updates=False)
        # Add some texts
        ....
        corpus.close()
    """
    def __init__(self, path, dct=None, tokenizer=None, allow_dict_updates=True, prefix=None):
        """\
        Initializes the cable corpus.
        
        `path`
            Directory where the generated files are stored.
        `dct`
            An existing `gensim.corpora.dictionary.Dictionary`
            If it's ``None`` (default), a new dictionary will be created.
        `tokenizer`
            A function to tokenize/normalize/clean-up/remove stop words from strings.
            If it's ``None`` (default), a default function will be used to tokenize texts.
        `allow_dict_updates`
            Indicates whether unknown words should be added to the dictionary (default ``True``).
        `prefix`
            A prefix for the generated file names.
        """
        super(CableCorpus, self).__init__(tokenizer)
        if not os.path.isdir(path):
            raise IOError('Expected a directory path')
        self.dct = Dictionary() if dct is None else dct
        self._path = path
        self._prefix = prefix or 'cables_'
        self._mw = IncrementalMmWriter(os.path.join(path, self._prefix + 'bow.mm'))
        self.allow_dict_updates = allow_dict_updates
        self._cables = []

    def add_words(self, reference_id, words):
        self._cables.append(reference_id)
        self._mw.add_vector(self.dct.doc2bow(words, self.allow_dict_updates))

    def close(self):
        self._mw.close()
        self.dct.save(os.path.join(self._path, self._prefix + 'wordids.pickle'))
        json_filename = os.path.join(self._path, self._prefix + 'id2docid.json')
        with open(json_filename, 'w') as f:  # text mode for json.dump under Python 3
            json.dump(dict(zip(self._cables, count())), f)
Developer: Tooa, Project: cablemap, Lines: 74, Source: corpus.py
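To read back the artifacts written by close(), something like the following should work (paths assume the default 'cables_' prefix and an output directory of '/my/directory/'):

import json
from gensim.corpora import MmCorpus
from gensim.corpora.dictionary import Dictionary

dct = Dictionary.load('/my/directory/cables_wordids.pickle')  # word id <-> word mapping
bow = MmCorpus('/my/directory/cables_bow.mm')                 # bag-of-words vectors
with open('/my/directory/cables_id2docid.json') as f:
    id2docid = json.load(f)                                   # cable reference id -> document number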


Note: The gensim.corpora.dictionary.Dictionary.save examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are excerpted from open-source projects and remain the copyright of their original authors; for redistribution and use, please refer to each project's license. Do not republish without permission.