当前位置: 首页>>代码示例>>Python>>正文


Python utils.smart_open方法代码示例

本文整理汇总了Python中gensim.utils.smart_open方法的典型用法代码示例。如果您正苦于以下问题:Python utils.smart_open方法的具体用法?Python utils.smart_open怎么用?Python utils.smart_open使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在gensim.utils的用法示例。


在下文中一共展示了utils.smart_open方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: save_corpus

# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import smart_open [as 别名]
def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
        """
        Save a corpus in the SVMlight format.

        The SVMlight `<target>` class tag of each document is taken from the
        `labels` sequence (indexed by document number), or set to 0 for all
        documents if `labels` is not supplied or is empty.

        `id2word` and `metadata` are accepted but not used by this function.

        Returns the list of file positions (as reported by `fout.tell()`) at
        which each document was written.

        This function is automatically called by `SvmLightCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        logger.info("converting corpus to SVMlight format: %s", fname)

        # Decide once whether labels were supplied. A plain truth test raises
        # ValueError for a numpy array with more than one element, so fall
        # back to checking its length in that case.
        try:
            has_labels = bool(labels)
        except ValueError:
            has_labels = len(labels) > 0

        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            for docno, doc in enumerate(corpus):
                label = labels[docno] if has_labels else 0  # target class is 0 by default
                # remember where this document starts before writing it
                offsets.append(fout.tell())
                fout.write(utils.to_utf8(SvmLightCorpus.doc2line(doc, label)))
        return offsets
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:21,代码来源:svmlightcorpus.py

示例2: save_as_text

# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import smart_open [as 别名]
def save_as_text(self, fname):
        """
        Write this HashDictionary to the text file `fname`, for easier debugging.

        Every id that maps to at least one word produces one UTF8 line:
        `id[TAB]document frequency of this id[TAB]tab-separated words[NEWLINE]`.
        Each word is rendered as `word(n)`, where `n` is the value stored for
        that word in `self.dfs_debug` (0 if absent), and words are listed by
        decreasing `n`.

        Note: use `save`/`load` to store in binary format instead (pickle).
        """
        logger.info("saving HashDictionary mapping to %s" % fname)
        with utils.smart_open(fname, 'wb') as fout:
            for tokenid in self.keys():
                vocab = sorted(self[tokenid])
                if not vocab:
                    # ids without any word are not written at all
                    continue
                counted = [(word, self.dfs_debug.get(word, 0)) for word in vocab]
                # stable sort: ties keep the alphabetical order established above
                counted.sort(key=lambda pair: -pair[1])
                rendered = '\t'.join("%s(%i)" % pair for pair in counted)
                record = "%i\t%i\t%s\n" % (tokenid, self.dfs.get(tokenid, 0), rendered)
                fout.write(utils.to_utf8(record))
#endclass HashDictionary 
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:21,代码来源:hashdictionary.py

示例3: __init__

# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import smart_open [as 别名]
def __init__(self, input):
        """
        Set up the reader on `input`.

        `input` names a file on the local filesystem, which is expected to be
        in the UCI Bag-of-Words format. The first three lines are parsed as
        integers into `num_docs`, `num_terms` and `num_nnz`, in that order;
        a counter whose line is missing keeps its initial value of 0.
        """

        logger.info('Initializing corpus reader from %s' % input)

        self.input = input

        with utils.smart_open(self.input) as fin:
            self.num_docs = self.num_terms = self.num_nnz = 0
            try:
                # one header line per counter, in file order
                for attr in ('num_docs', 'num_terms', 'num_nnz'):
                    setattr(self, attr, int(next(fin).strip()))
            except StopIteration:
                # file ended early: leave the remaining counters at 0
                pass

        summary = (self.num_docs, self.num_terms, self.num_nnz)
        logger.info('accepted corpus with %i documents, %i features, %i non-zero entries' %
            summary)
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:25,代码来源:ucicorpus.py

示例4: save_as_text

# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import smart_open [as 别名]
def save_as_text(self, fname, sort_by_word=True):
        """
        Write this Dictionary to the text file `fname`, one entry per line in
        the format `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`.

        Entries are ordered by word when `sort_by_word` is true, otherwise by
        decreasing document frequency.

        Note: the text format should be used for corpus inspection. Use
        `save`/`load` to store in binary format (pickle) for improved performance.
        """
        logger.info("saving dictionary mapping to %s" % fname)

        def rows():
            # Lazily produce (id, word, document frequency) triples in output order.
            if sort_by_word:
                for token, tokenid in sorted(iteritems(self.token2id)):
                    yield tokenid, token, self.dfs.get(tokenid, 0)
            else:
                for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
                    yield tokenid, self[tokenid], freq

        with utils.smart_open(fname, 'wb') as fout:
            for row in rows():
                fout.write(utils.to_utf8("%i\t%s\t%i\n" % row))
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:21,代码来源:dictionary.py

示例5: load_word_topics

# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import smart_open [as 别名]
def load_word_topics(self):
        """
        Read the state file returned by `self.fstate()` and rebuild the
        per-topic word counts.

        The first line is skipped as a header, the second supplies `self.alpha`
        (every whitespace-separated field after the first two), the third is
        skipped, and each remaining line must split into exactly six fields of
        which the last two are the token and its topic. The result is stored
        in `self.wordtopics` as a `(num_topics, num_terms)` float32 matrix of
        counts, after which `self.print_topics(15)` is called.
        """
        logger.info("loading assigned topics from %s" % self.fstate())
        counts = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32)
        with utils.smart_open(self.fstate()) as fin:
            next(fin)  # header
            alpha_fields = next(fin).split()[2:]
            self.alpha = numpy.array([float(val) for val in alpha_fields])
            assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
            next(fin)  # beta
            for raw in fin:
                # only the last two of the six columns are needed
                _, _, _, _, token, topic = utils.to_unicode(raw).split()
                if hasattr(self.id2word, 'token2id'):
                    tokenid = self.id2word.token2id[token]
                else:
                    # no token2id mapping available: the token itself is the id
                    tokenid = int(token)
                counts[int(topic), tokenid] += 1
        logger.info("loaded assigned topics for %i tokens" % counts.sum())
        self.wordtopics = counts
        self.print_topics(15)
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:18,代码来源:ldamallet.py

示例6: save_corpus

# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import smart_open [as 别名]
def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save an existing `corpus` to disk.

        Some formats also support saving the dictionary (`feature_id->word` mapping),
        which can in this case be provided by the optional `id2word` parameter.

        >>> MmCorpus.save_corpus('file.mm', corpus)

        Some corpora also support an index of where each document begins, so
        that the documents on disk can be accessed in O(1) time (see the
        `corpora.IndexedCorpus` base class). In this case, `save_corpus` is automatically
        called internally by `serialize`, which does `save_corpus` plus saves the index
        at the same time, so you want to store the corpus with::

        >>> MmCorpus.serialize('file.mm', corpus) # stores index as well, allowing random access to individual documents

        Calling `serialize()` is preferred to calling `save_corpus()`.

        This base implementation always raises `NotImplementedError`.

        """
        # Sketch of what a concrete implementation could look like (kept as a
        # comment: placed after the `raise` it would be unreachable code):
        #
        #     logger.info("converting corpus to ??? format: %s" % fname)
        #     with utils.smart_open(fname, 'wb') as fout:
        #         for doc in corpus: # iterate over the document stream
        #             fmt = str(doc) # format the document appropriately...
        #             fout.write(utils.to_utf8("%s\n" % fmt)) # serialize the formatted document to disk
        raise NotImplementedError('cannot instantiate abstract base class')
#endclass CorpusABC 
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:31,代码来源:interfaces.py

示例7: __iter__

# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import smart_open [as 别名]
def __iter__(self):
        """
        Stream the corpus file `self.fname`, yielding one sparse vector per
        parsed line.

        Lines for which `line2doc` returns None are skipped. `self.labels` is
        reset at the start and, when `self.store_labels` is set, collects the
        second element of every parsed line. Once the file is exhausted,
        `self.length` is set to the number of lines read.
        """
        last_lineno = -1  # stays -1 for an empty file, giving a length of 0
        self.labels = []
        with utils.smart_open(self.fname) as fin:
            for last_lineno, raw in enumerate(fin):
                parsed = self.line2doc(raw)
                if parsed is None:
                    continue
                if self.store_labels:
                    self.labels.append(parsed[1])
                yield parsed[0]
        self.length = last_lineno + 1
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:16,代码来源:svmlightcorpus.py

示例8: docbyoffset

# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import smart_open [as 别名]
def docbyoffset(self, offset):
        """
        Fetch the document stored at file position `offset` of `self.fname`.

        The line starting at `offset` is parsed with `line2doc` and the first
        element of the parsed result is returned.
        """
        with utils.smart_open(self.fname) as fin:
            fin.seek(offset)
            raw_line = fin.readline()
            parsed = self.line2doc(raw_line)
            return parsed[0]
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:9,代码来源:svmlightcorpus.py

示例9: __iter__

# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import smart_open [as 别名]
def __iter__(self):
        """
        Stream the corpus stored in `self.fname`.

        Every line of the file is passed through `line2doc` and the result is
        yielded; per the original documentation that is a bag-of-words, i.e. a
        list of (word id, word count) tuples based on the given id2word dictionary.
        """
        with utils.smart_open(self.fname) as fin:
            # lazy: each line is parsed only when the consumer asks for it
            parsed_docs = (self.line2doc(raw) for raw in fin)
            for parsed in parsed_docs:
                yield parsed
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:11,代码来源:malletcorpus.py

示例10: docbyoffset

# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import smart_open [as 别名]
def docbyoffset(self, offset):
        """
        Fetch the document stored at file position `offset` of `self.fname`.

        The line starting at `offset` is read and handed to `line2doc`, whose
        result is returned unchanged.
        """
        with utils.smart_open(self.fname) as fin:
            fin.seek(offset)
            raw_line = fin.readline()
            return self.line2doc(raw_line)

# endclass MalletCorpus 
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:11,代码来源:malletcorpus.py

示例11: _calculate_num_docs

# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import smart_open [as 别名]
def _calculate_num_docs(self):
        """
        Return the document count declared on the first line of `self.fname`.

        An empty file yields 0; a first line that is not an integer makes
        `int()` raise.
        """
        with utils.smart_open(self.fname) as fin:
            header = next(fin, None)  # None signals an empty file
        return 0 if header is None else int(header)
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:11,代码来源:lowcorpus.py

示例12: __iter__

# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import smart_open [as 别名]
def __iter__(self):
        """
        Stream the corpus in `self.fname`, yielding `line2doc(line)` for every
        line except the first one.
        """
        with utils.smart_open(self.fname) as fin:
            for lineno, raw in enumerate(fin):
                if lineno == 0:
                    # the first line = number of documents, not a document
                    continue
                yield self.line2doc(raw)
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:10,代码来源:lowcorpus.py

示例13: save_corpus

# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import smart_open [as 别名]
def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Store `corpus` in the List-of-words format in the file `fname`.

        The first line holds `len(corpus)`; each following line holds one
        document, with every word repeated `int(value)` times and separated by
        spaces. If `id2word` is None, the mapping is built from the corpus via
        `utils.dict_from_corpus`. Values that are not (almost) integers are
        truncated and reported in a single warning at the end.

        Returns the list of file positions at which each document was written.

        This function is automatically called by `LowCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in List-Of-Words format into %s" % fname)
        num_truncated = 0
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            header = '%i\n' % len(corpus)
            fout.write(utils.to_utf8(header))
            for doc in corpus:
                tokens = []
                for wordid, value in doc:
                    repeat = int(value)
                    if abs(repeat - value) > 1e-6:
                        # fractional weight: only the integer part can be stored
                        num_truncated += 1
                    tokens.extend([utils.to_unicode(id2word[wordid])] * repeat)
                offsets.append(fout.tell())
                fout.write(utils.to_utf8('%s\n' % ' '.join(tokens)))

        if num_truncated:
            logger.warning("List-of-words format can only save vectors with "
                            "integer elements; %i float entries were truncated to integer value" %
                            num_truncated)
        return offsets
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:32,代码来源:lowcorpus.py

示例14: docbyoffset

# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import smart_open [as 别名]
def docbyoffset(self, offset):
        """
        Fetch the document stored at file position `offset` of `self.fname`.

        The line starting at `offset` is read and handed to `line2doc`, whose
        result is returned unchanged.
        """
        with utils.smart_open(self.fname) as fin:
            fin.seek(offset)
            raw_line = fin.readline()
            return self.line2doc(raw_line)

# endclass LowCorpus 
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:11,代码来源:lowcorpus.py

示例15: __init__

# 需要导入模块: from gensim import utils [as 别名]
# 或者: from gensim.utils import smart_open [as 别名]
def __init__(self, fname, fname_vocab=None):
        """
        Initialize the corpus from a file.

        `fname` is the corpus file. `fname_vocab` is the file with vocabulary,
        one word per line; if not specified, the first existing path among
        `fname.vocab`, `fname/vocab.txt`, `<fname without extension>.vocab` and
        `vocab.txt` in the directory of `fname` is used.

        The vocabulary is loaded into `self.id2word` (line number -> word) and
        `self.length` is initialised to 0.

        Raises `IOError` if `fname_vocab` is not given and none of the
        candidate paths exists.
        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s", fname)

        if fname_vocab is None:
            fname_base, _ = path.splitext(fname)
            fname_dir = path.dirname(fname)
            # Build the candidates with path.join rather than string
            # concatenation: when `fname` has no directory part `fname_dir` is
            # '' and `'' + '/vocab.txt'` would point at the filesystem root
            # instead of the current directory.
            candidates = [
                fname + '.vocab',
                path.join(fname, 'vocab.txt'),
                fname_base + '.vocab',
                path.join(fname_dir, 'vocab.txt'),
            ]
            for fname_vocab in candidates:
                if path.exists(fname_vocab):
                    break
            else:
                # loop finished without `break`: no candidate exists
                raise IOError('BleiCorpus: could not find vocabulary file')

        self.fname = fname
        with utils.smart_open(fname_vocab) as fin:
            words = [utils.to_unicode(word).rstrip() for word in fin]
        self.id2word = dict(enumerate(words))
        self.length = 0
开发者ID:largelymfs,项目名称:topical_word_embeddings,代码行数:31,代码来源:bleicorpus.py


注:本文中的gensim.utils.smart_open方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。