This page collects typical usage examples of the Python method gensim.corpora.dictionary.Dictionary.filterExtremes. If you are wondering what Dictionary.filterExtremes does, how to call it, or what working code that uses it looks like, the curated example below should help. You can also look further into usage examples of the containing class, gensim.corpora.dictionary.Dictionary.
One code example of the Dictionary.filterExtremes method is shown below.
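Before the full example, here is a minimal self-contained sketch of the method itself. It assumes the legacy camelCase gensim API documented on this page (recent gensim releases rename the method to filter_extremes, with no_below / no_above / keep_n parameters); the toy documents are made up for illustration.

from gensim.corpora.dictionary import Dictionary

# Toy tokenized documents (hypothetical data).
texts = [['human', 'machine', 'interface'],
         ['survey', 'user', 'computer', 'system'],
         ['eps', 'user', 'interface', 'system']]

dictionary = Dictionary(texts)  # build the word -> word_id mapping
# Drop tokens that appear in fewer than 2 documents or in more than
# 50% of all documents, then keep at most 100000 of the rest.
dictionary.filterExtremes(noBelow=2, noAbove=0.5, keepN=100000)
print(dictionary.token2id)  # the surviving tokens and their ids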
Example 1: WikiCorpus
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import filterExtremes [as alias]
import bz2
import logging

from gensim import interfaces, matutils
from gensim.corpora.dictionary import Dictionary

logger = logging.getLogger('gensim.corpora.wikicorpus')


class WikiCorpus(interfaces.CorpusABC):
"""
    Treat a Wikipedia articles dump (*articles.xml.bz2) as a (read-only) corpus.
The documents are extracted on-the-fly, so that the whole (massive) dump
can stay compressed on disk.
>>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id, takes almost 7h
>>> wiki.saveAsText('wiki_en_vocab200k') # another 7.5h, creates a file in MatrixMarket format plus file with id->word
"""
def __init__(self, fname, noBelow = 20, keep_words = 200000, dictionary = None):
"""
Initialize the corpus. This scans the corpus once, to determine its
vocabulary (only the first `keep_words` most frequent words that
appear in at least `noBelow` documents are kept).
"""
self.fname = fname
if dictionary is None:
self.dictionary = Dictionary(self.getArticles())
self.dictionary.filterExtremes(noBelow = noBelow, noAbove = 0.1, keepN = keep_words)
else:
self.dictionary = dictionary
    def __len__(self):
        # numDocs is set as a side effect of a full pass over getArticles()
        # in the complete source (elided below).
        return self.numDocs
def __iter__(self):
"""
The function that defines a corpus -- iterating over the corpus yields
vectors, one for each document.
"""
for docNo, text in enumerate(self.getArticles()):
yield self.dictionary.doc2bow(text, allowUpdate = False)
def saveDictionary(self, fname):
"""
Store id->word mapping to a file, in format `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`.
"""
        logger.info("saving dictionary mapping to %s" % fname)
        with open(fname, 'w') as fout:
            # token2id.iteritems() in the original Python 2 code
            for token, tokenId in sorted(self.dictionary.token2id.items()):
                fout.write("%i\t%s\t%i\n" % (tokenId, token, self.dictionary.docFreq[tokenId]))
@staticmethod
def loadDictionary(fname):
"""
Load previously stored mapping between words and their ids.
The result can be used as the `id2word` parameter for input to transformations.
"""
        result = {}
        with open(fname) as fin:
            for lineNo, line in enumerate(fin):
                cols = line.rstrip('\n').split('\t')
                if len(cols) == 2:
                    wordId, word = cols
                elif len(cols) == 3:
                    wordId, word, docFreq = cols
                else:
                    continue
                result[int(wordId)] = word  # docFreq is not used
        return result
def saveAsText(self, fname):
"""
Store the corpus to disk, in a human-readable text format.
This actually saves two files:
        1. Document-term co-occurrence frequency counts (bag-of-words), as
a Matrix Market file `fname_bow.mm`.
2. Token to integer mapping, as a text file `fname_wordids.txt`.
"""
self.saveDictionary(fname + '_wordids.txt')
matutils.MmWriter.writeCorpus(fname + '_bow.mm', self, progressCnt = 10000)
def getArticles(self):
"""
        Iterate over the dump, yielding the text version of each article.
        Only articles of sufficient length are returned (short articles,
        redirects etc. are ignored).
"""
articles, intext = 0, False
for lineno, line in enumerate(bz2.BZ2File(self.fname)):
if line.startswith(' <text'):
intext = True
line = line[line.find('>') + 1 : ]
lines = [line]
elif intext:
lines.append(line)
pos = line.find('</text>') # can be on the same line as <text>
            #......... the rest of this method's code is omitted here .........
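For completeness, a minimal usage sketch assembled from the docstrings above; the dump file name is a placeholder that must exist locally, and a full English dump takes many hours to process.

wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2', noBelow=20, keep_words=200000)
wiki.saveAsText('wiki_en')  # writes wiki_en_bow.mm and wiki_en_wordids.txt

# Reload the id -> word mapping later, for use as the `id2word`
# parameter of downstream transformations (TF-IDF, LSI, LDA, ...):
id2word = WikiCorpus.loadDictionary('wiki_en_wordids.txt')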