This article collects typical usage examples of the gensim.utils.smart_open method in Python. If you have been wondering what exactly utils.smart_open does, how to call it, or what real-world uses look like, the curated code examples below may help. You can also explore the containing module, gensim.utils, for further usage examples.
Listed below are 15 code examples of utils.smart_open, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
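Before the examples, a quick orientation: `utils.smart_open` behaves much like the built-in `open`, but (in the gensim versions these snippets come from) it also transparently handles compressed files such as `.gz` and `.bz2` based on the filename extension. The following minimal sketch is not taken from gensim itself; the file path is made up purely for illustration, and it shows the write-then-read pattern used throughout the examples below.

from gensim import utils

# Write UTF-8 encoded bytes; the '.gz' extension makes smart_open apply gzip compression transparently.
with utils.smart_open('/tmp/smart_open_demo.txt.gz', 'wb') as fout:
    fout.write(utils.to_utf8("hello world\n"))

# Read it back; decompression is handled automatically.
with utils.smart_open('/tmp/smart_open_demo.txt.gz') as fin:
    for line in fin:
        print(utils.to_unicode(line).strip())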
Example 1: save_corpus
# Required import: from gensim import utils [as alias]
# Or: from gensim.utils import smart_open [as alias]
def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
    """
    Save a corpus in the SVMlight format.
    The SVMlight `<target>` class tag is taken from the `labels` array, or set
    to 0 for all documents if `labels` is not supplied.
    This function is automatically called by `SvmLightCorpus.serialize`; don't
    call it directly, call `serialize` instead.
    """
    logger.info("converting corpus to SVMlight format: %s" % fname)

    offsets = []
    with utils.smart_open(fname, 'wb') as fout:
        for docno, doc in enumerate(corpus):
            label = labels[docno] if labels else 0  # target class is 0 by default
            offsets.append(fout.tell())
            fout.write(utils.to_utf8(SvmLightCorpus.doc2line(doc, label)))
    return offsets
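As the docstring says, this method is normally reached through `SvmLightCorpus.serialize` rather than called directly. A minimal usage sketch (the toy corpus and the output path are made up for illustration):

from gensim.corpora import SvmLightCorpus

corpus = [[(0, 1.0), (2, 3.0)], [(1, 2.0)]]  # two documents as (word_id, weight) pairs
SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)  # writes the data plus an offset index

loaded = SvmLightCorpus('/tmp/corpus.svmlight')
print(list(loaded))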
Example 2: save_as_text
# Required import: from gensim import utils [as alias]
# Or: from gensim.utils import smart_open [as alias]
def save_as_text(self, fname):
    """
    Save this HashDictionary to a text file, for easier debugging.
    The format is:
    `id[TAB]document frequency of this id[TAB]tab-separated set of words in UTF8 that map to this id[NEWLINE]`.
    Note: use `save`/`load` to store in binary format instead (pickle).
    """
    logger.info("saving HashDictionary mapping to %s" % fname)
    with utils.smart_open(fname, 'wb') as fout:
        for tokenid in self.keys():
            words = sorted(self[tokenid])
            if words:
                words_df = [(word, self.dfs_debug.get(word, 0)) for word in words]
                words_df = ["%s(%i)" % item for item in sorted(words_df, key=lambda item: -item[1])]
                fout.write(utils.to_utf8("%i\t%i\t%s\n" %
                    (tokenid, self.dfs.get(tokenid, 0), '\t'.join(words_df))))
#endclass HashDictionary
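A short sketch of how such a debugging dump might be produced; the toy texts and output path are made up, and `debug=True` (the default) is what keeps the id -> words reverse mapping that this method writes out:

from gensim.corpora import HashDictionary

texts = [["human", "computer", "interface"], ["graph", "trees"]]
dct = HashDictionary(texts, debug=True)
dct.save_as_text('/tmp/hash_dict.txt')  # one line per id: id, document frequency, mapped words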
Example 3: __init__
# Required import: from gensim import utils [as alias]
# Or: from gensim.utils import smart_open [as alias]
def __init__(self, input):
    """
    Initialize the reader.
    The `input` parameter refers to a file on the local filesystem,
    which is expected to be in the UCI Bag-of-Words format.
    """
    logger.info('Initializing corpus reader from %s' % input)
    self.input = input
    with utils.smart_open(self.input) as fin:
        self.num_docs = self.num_terms = self.num_nnz = 0
        try:
            self.num_docs = int(next(fin).strip())
            self.num_terms = int(next(fin).strip())
            self.num_nnz = int(next(fin).strip())
        except StopIteration:
            pass
    logger.info('accepted corpus with %i documents, %i features, %i non-zero entries' %
        (self.num_docs, self.num_terms, self.num_nnz))
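For reference, a UCI Bag-of-Words file starts with three header lines (number of documents, vocabulary size, number of non-zero counts), followed by one `docID wordID count` triple per line. A made-up miniature file that this reader would accept could be written like this:

from gensim import utils

# 2 documents, 3 terms, 3 non-zero entries, then 1-based "docID wordID count" triples.
with utils.smart_open('/tmp/docword.txt', 'wb') as fout:
    fout.write(utils.to_utf8("2\n3\n3\n"))
    fout.write(utils.to_utf8("1 1 2\n1 3 1\n2 2 5\n"))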
Example 4: save_as_text
# Required import: from gensim import utils [as alias]
# Or: from gensim.utils import smart_open [as alias]
def save_as_text(self, fname, sort_by_word=True):
    """
    Save this Dictionary to a text file, in format:
    `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`. Sorted by word,
    or by decreasing word frequency.
    Note: the text format should be used for corpus inspection. Use `save`/`load`
    to store in binary format (pickle) for improved performance.
    """
    logger.info("saving dictionary mapping to %s" % fname)
    with utils.smart_open(fname, 'wb') as fout:
        if sort_by_word:
            for token, tokenid in sorted(iteritems(self.token2id)):
                line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                fout.write(utils.to_utf8(line))
        else:
            for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
                line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                fout.write(utils.to_utf8(line))
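A minimal usage sketch for this method (the toy texts and output paths are made up):

from gensim.corpora import Dictionary

texts = [["human", "computer", "interface"], ["graph", "trees", "graph"]]
dct = Dictionary(texts)
dct.save_as_text('/tmp/dict.txt')                              # sorted by word
dct.save_as_text('/tmp/dict_by_freq.txt', sort_by_word=False)  # sorted by decreasing document frequency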
Example 5: load_word_topics
# Required import: from gensim import utils [as alias]
# Or: from gensim.utils import smart_open [as alias]
def load_word_topics(self):
    logger.info("loading assigned topics from %s" % self.fstate())
    wordtopics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32)
    with utils.smart_open(self.fstate()) as fin:
        _ = next(fin)  # header
        self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]])
        assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
        _ = next(fin)  # beta
        for lineno, line in enumerate(fin):
            line = utils.to_unicode(line)
            doc, source, pos, typeindex, token, topic = line.split()
            tokenid = self.id2word.token2id[token] if hasattr(self.id2word, 'token2id') else int(token)
            wordtopics[int(topic), tokenid] += 1
    logger.info("loaded assigned topics for %i tokens" % wordtopics.sum())
    self.wordtopics = wordtopics
    self.print_topics(15)
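This method belongs to gensim's MALLET wrapper and runs internally once MALLET has finished training. A hedged sketch of the surrounding workflow, assuming an older gensim release (the wrapper was removed in gensim 4.x) and a locally installed MALLET whose path here is only a placeholder:

from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet  # available in gensim 3.x and earlier

texts = [["human", "computer", "interface"], ["graph", "trees", "graph"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# '/path/to/mallet/bin/mallet' is a placeholder for your local MALLET binary.
model = LdaMallet('/path/to/mallet/bin/mallet', corpus=corpus, num_topics=2, id2word=dictionary)
print(model.print_topics(2))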
Example 6: save_corpus
# Required import: from gensim import utils [as alias]
# Or: from gensim.utils import smart_open [as alias]
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """
    Save an existing `corpus` to disk.

    Some formats also support saving the dictionary (`feature_id->word` mapping),
    which can in this case be provided by the optional `id2word` parameter.

    >>> MmCorpus.save_corpus('file.mm', corpus)

    Some corpora also support an index of where each document begins, so
    that the documents on disk can be accessed in O(1) time (see the
    `corpora.IndexedCorpus` base class). In this case, `save_corpus` is automatically
    called internally by `serialize`, which does `save_corpus` plus saves the index
    at the same time, so you want to store the corpus with::

    >>> MmCorpus.serialize('file.mm', corpus)  # stores index as well, allowing random access to individual documents

    Calling `serialize()` is preferred to calling `save_corpus()`.
    """
    raise NotImplementedError('cannot instantiate abstract base class')

    # example code:
    logger.info("converting corpus to ??? format: %s" % fname)
    with utils.smart_open(fname, 'wb') as fout:
        for doc in corpus:  # iterate over the document stream
            fmt = str(doc)  # format the document appropriately...
            fout.write(utils.to_utf8("%s\n" % fmt))  # serialize the formatted document to disk
#endclass CorpusABC
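To make the `serialize` versus `save_corpus` distinction concrete, here is a small sketch using `MmCorpus` (the toy corpus and file name are made up):

from gensim.corpora import MmCorpus

corpus = [[(0, 1.0), (1, 2.0)], [(2, 1.0)]]
MmCorpus.serialize('/tmp/file.mm', corpus)  # writes the data and a document offset index

mm = MmCorpus('/tmp/file.mm')
print(mm[1])  # O(1) random access to the second document, courtesy of the index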
Example 7: __iter__
# Required import: from gensim import utils [as alias]
# Or: from gensim.utils import smart_open [as alias]
def __iter__(self):
    """
    Iterate over the corpus, returning one sparse vector at a time.
    """
    lineno = -1
    self.labels = []
    with utils.smart_open(self.fname) as fin:
        for lineno, line in enumerate(fin):
            doc = self.line2doc(line)
            if doc is not None:
                if self.store_labels:
                    self.labels.append(doc[1])
                yield doc[0]
    self.length = lineno + 1
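A sketch of the streaming behaviour this iterator provides, reusing the file written in the Example 1 sketch (paths are illustrative only):

from gensim.corpora import SvmLightCorpus

corpus = SvmLightCorpus('/tmp/corpus.svmlight', store_labels=True)
for bow in corpus:    # one sparse document at a time; the whole file is never loaded into memory
    print(bow)
print(corpus.labels)  # per-document <target> values, collected during the pass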
Example 8: docbyoffset
# Required import: from gensim import utils [as alias]
# Or: from gensim.utils import smart_open [as alias]
def docbyoffset(self, offset):
    """
    Return the document stored at file position `offset`.
    """
    with utils.smart_open(self.fname) as f:
        f.seek(offset)
        return self.line2doc(f.readline())[0]
Example 9: __iter__
# Required import: from gensim import utils [as alias]
# Or: from gensim.utils import smart_open [as alias]
def __iter__(self):
    """
    Iterate over the corpus at the given filename.
    Yields a bag-of-words, a.k.a. a list of (word id, word count) tuples, based on the given id2word dictionary.
    """
    with utils.smart_open(self.fname) as f:
        for line in f:
            yield self.line2doc(line)
Example 10: docbyoffset
# Required import: from gensim import utils [as alias]
# Or: from gensim.utils import smart_open [as alias]
def docbyoffset(self, offset):
    """
    Return the document stored at file position `offset`.
    """
    with utils.smart_open(self.fname) as f:
        f.seek(offset)
        return self.line2doc(f.readline())
# endclass MalletCorpus
Example 11: _calculate_num_docs
# Required import: from gensim import utils [as alias]
# Or: from gensim.utils import smart_open [as alias]
def _calculate_num_docs(self):
    # the first line in the input data is the number of documents (integer); raises an exception on bad input
    with utils.smart_open(self.fname) as fin:
        try:
            result = int(next(fin))
        except StopIteration:
            result = 0
    return result
Example 12: __iter__
# Required import: from gensim import utils [as alias]
# Or: from gensim.utils import smart_open [as alias]
def __iter__(self):
    """
    Iterate over the corpus, returning one bag-of-words vector at a time.
    """
    with utils.smart_open(self.fname) as fin:
        for lineno, line in enumerate(fin):
            if lineno > 0:  # ignore the first line = number of documents
                yield self.line2doc(line)
Example 13: save_corpus
# Required import: from gensim import utils [as alias]
# Or: from gensim.utils import smart_open [as alias]
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """
    Save a corpus in the List-of-words format.
    This function is automatically called by `LowCorpus.serialize`; don't
    call it directly, call `serialize` instead.
    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)

    logger.info("storing corpus in List-Of-Words format into %s" % fname)
    truncated = 0
    offsets = []
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8('%i\n' % len(corpus)))
        for doc in corpus:
            words = []
            for wordid, value in doc:
                if abs(int(value) - value) > 1e-6:
                    truncated += 1
                words.extend([utils.to_unicode(id2word[wordid])] * int(value))
            offsets.append(fout.tell())
            fout.write(utils.to_utf8('%s\n' % ' '.join(words)))

    if truncated:
        logger.warning("List-of-words format can only save vectors with "
            "integer elements; %i float entries were truncated to integer value" % truncated)

    return offsets
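A minimal sketch of the intended entry point, `LowCorpus.serialize` (toy data and paths made up):

from gensim.corpora import LowCorpus

corpus = [[(0, 2.0), (1, 1.0)], [(2, 3.0)]]
id2word = {0: 'human', 1: 'interface', 2: 'graph'}
LowCorpus.serialize('/tmp/corpus.low', corpus, id2word=id2word)

print(list(LowCorpus('/tmp/corpus.low')))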
Example 14: docbyoffset
# Required import: from gensim import utils [as alias]
# Or: from gensim.utils import smart_open [as alias]
def docbyoffset(self, offset):
    """
    Return the document stored at file position `offset`.
    """
    with utils.smart_open(self.fname) as f:
        f.seek(offset)
        return self.line2doc(f.readline())
# endclass LowCorpus
Example 15: __init__
# Required import: from gensim import utils [as alias]
# Or: from gensim.utils import smart_open [as alias]
def __init__(self, fname, fname_vocab=None):
    """
    Initialize the corpus from a file.
    `fname_vocab` is the file with vocabulary; if not specified, it defaults to
    `fname.vocab`.
    """
    IndexedCorpus.__init__(self, fname)
    logger.info("loading corpus from %s" % fname)

    if fname_vocab is None:
        fname_base, _ = path.splitext(fname)
        fname_dir = path.dirname(fname)
        for fname_vocab in [
            fname + '.vocab',
            fname + '/vocab.txt',
            fname_base + '.vocab',
            fname_dir + '/vocab.txt',
        ]:
            if path.exists(fname_vocab):
                break
        else:
            raise IOError('BleiCorpus: could not find vocabulary file')

    self.fname = fname
    with utils.smart_open(fname_vocab) as fin:
        words = [utils.to_unicode(word).rstrip() for word in fin]
    self.id2word = dict(enumerate(words))
    self.length = 0
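A minimal round-trip sketch for this corpus class (toy data and paths made up; in the gensim versions these excerpts come from, `BleiCorpus.serialize` also writes the companion vocabulary file that this constructor looks for):

from gensim.corpora import BleiCorpus

corpus = [[(0, 1.0), (1, 2.0)], [(2, 1.0)]]
id2word = {0: 'human', 1: 'interface', 2: 'graph'}
BleiCorpus.serialize('/tmp/corpus.lda-c', corpus, id2word=id2word)

blei = BleiCorpus('/tmp/corpus.lda-c')  # vocabulary file is discovered automatically
print(blei.id2word)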