This article collects and summarizes typical usage examples of the Python method gensim.corpora.dictionary.Dictionary.filter_extremes. If you are wondering what exactly Dictionary.filter_extremes does, how to use it, or what real calls look like, the curated code examples below should help. You can also explore further usage examples of the class it belongs to, gensim.corpora.dictionary.Dictionary.
A total of 11 code examples of Dictionary.filter_extremes are shown below, sorted by popularity by default.
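Before the examples, here is a minimal, self-contained sketch of what Dictionary.filter_extremes does. The toy documents and thresholds are made up for illustration, and unlike the Python 2 snippets collected below it uses Python 3 print syntax:

from gensim.corpora.dictionary import Dictionary

# Toy documents; in practice these would be real tokenized texts.
texts = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system", "response", "time"],
    ["eps", "user", "interface", "system"],
    ["system", "human", "system", "eps"],
]

dictionary = Dictionary(texts)
print("vocabulary before filtering:", len(dictionary))

# Drop tokens appearing in fewer than 2 documents or in more than 50% of all
# documents, then keep at most 100000 of the remaining most frequent tokens.
dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=100000)
print("vocabulary after filtering:", len(dictionary))

# The filtered dictionary is what the examples below use to build bag-of-words vectors.
print(dictionary.doc2bow(["human", "computer", "survey"]))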
Example 1: create_corpus
# Required imports: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import filter_extremes [as alias]
def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
"""\
"""
wordid_filename = os.path.join(out_dir, 'cables_wordids.pickle')
bow_filename = os.path.join(out_dir, 'cables_bow.mm')
tfidf_filename = os.path.join(out_dir, 'cables_tfidf.mm')
predicate = None # Could be set to something like pred.origin_filter(pred.origin_germany)
# 1. Create word dict
dct = Dictionary()
dct_handler = DictionaryHandler(dct)
handler = create_filter(dct_handler)
handle_source(src, handler, predicate)
dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
dct.save(wordid_filename)
# 2. Reiterate through the cables and create the vector space
corpus_handler = CorpusHandler(out_dir, dct=dct, allow_dict_updates=False)
handler = create_filter(corpus_handler)
handle_source(src, handler, predicate)
# 3. Load corpus
mm = MmCorpus(bow_filename)
# 4. Create TF-IDF model
tfidf = TfidfModel(mm, id2word=dct, normalize=True)
# 5. Save the TF-IDF model
MmCorpus.serialize(tfidf_filename, tfidf[mm], progress_cnt=10000)
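As a follow-up to Example 1 (not part of the original snippet), the artifacts it writes can later be reloaded with the standard gensim loaders; out_dir below is a placeholder for the directory passed to create_corpus:

import os
from gensim.corpora.dictionary import Dictionary
from gensim.corpora import MmCorpus

out_dir = "/path/to/out_dir"  # placeholder: the out_dir passed to create_corpus()

# Vocabulary saved above with dct.save(...)
dct = Dictionary.load(os.path.join(out_dir, "cables_wordids.pickle"))

# TF-IDF corpus serialized above in MatrixMarket format
tfidf_corpus = MmCorpus(os.path.join(out_dir, "cables_tfidf.mm"))

for doc in tfidf_corpus:
    # each doc is a list of (token_id, tfidf_weight) pairs
    pass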
Example 2: build_dictionary
# Required imports: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import filter_extremes [as alias]
def build_dictionary():
dictionary = Dictionary()
for line in open(wiki_index.ARTICLES_FILE):
dictionary.add_documents([line.lower().split()])
dictionary.filter_extremes(no_below=2, no_above=0.5)
dictionary.save(DICTIONARY_FILE)
return dictionary
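Once build_dictionary has run, the saved, filtered dictionary can be reloaded to vectorize new text; the path and query string below are hypothetical and only illustrate the pattern:

from gensim.corpora.dictionary import Dictionary

dictionary = Dictionary.load("wiki.dict")  # hypothetical path; Example 2 saves to DICTIONARY_FILE

# Tokens pruned by filter_extremes are silently ignored by doc2bow.
tokens = "the quick brown fox jumps over the lazy dog".lower().split()
print(dictionary.doc2bow(tokens))  # list of (token_id, count) pairs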
Example 3: EnronCorpus
# Required imports: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import filter_extremes [as alias]
class EnronCorpus(TextCorpus):
def __init__(self, root_name, no_below=20, keep_words=DEFAULT_DICT_SIZE, dictionary=None):
"""
Initialize the corpus. This scans through all the emails once, to determine the corpus
vocabulary. (only the first `keep_words` most frequent words that appear in at least
`no_below` documents are kept).
"""
self.root_name = root_name
if dictionary is None:
self.dictionary = Dictionary(self.get_texts())
self.dictionary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
else:
self.dictionary = dictionary
def get_texts(self, return_raw=False):
"""
Walk the file system, strip punctuation, normalize all numbers to be '2'.
"""
filenames = walk_os(self.root_name)
opened_files = gen_open(filenames)
stripped_files = strip_punct(opened_files)
length = 0
for email in stripped_files:
if len(email) > ARTICLE_MIN_CHARS:
length += 1
print "Iteration: %i" % length
yield tokenize(email)
self.length = length # cache corpus length
Example 4: preprocess_corpora
# Required imports: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import filter_extremes [as alias]
def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
"""
:rtype : gensim.corpora.dictionary.Dictionary
:param corpora:
:param stopwords:
:param allowed_pos:
:param max_doc:
:return:
"""
logging.info('Lemmatizing the corpora...')
count = 0
corpus_num = len(corpora)
processed_corpora = []
corpus_id2orig_id = []
for index, corpus in corpora.items():
count += 1
if count > max_doc:
break
if corpus is None: # skip if corpus is None
continue
print '\r', count, '/', corpus_num,
cleaned_corpus = clean_text(corpus) # delete irrelevant characters
corpus = []
tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
for token in tokens:
word, pos = token.split('/')
corpus.append(word)
# convert compound word into one token
corpus = convert_compound(corpus)
# filter stop words, long words, and non-english words
corpus = [w for w in corpus if not w in stopwords and 2 <= len(w) <= 15 and w.islower()]
processed_corpora.append(corpus)
corpus_id2orig_id.append(index)
print '\n'
logging.info('Creating dictionary and corpus...')
dictionary = Dictionary(processed_corpora)
dictionary.corpus_id2orig_id = corpus_id2orig_id
logging.info('Filtering unimportant terms...')
dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
dictionary.compactify()
logging.info('Generating corpus...')
dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
dictionary.id2token = revdict(dictionary.token2id)
return dictionary
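The dictionary returned by preprocess_corpora carries the bag-of-words corpus on its corpus attribute, so it can be handed straight to a topic model. A rough sketch (the topic count and passes are illustrative, and `dictionary` is assumed to be the return value of the function above):

from gensim.models import LdaModel

# Assumption: dictionary = preprocess_corpora(corpora, stopwords, allowed_pos)
lda = LdaModel(corpus=dictionary.corpus,
               id2word=dictionary,
               num_topics=50,   # illustrative value
               passes=5)

for topic_id, words in lda.show_topics(num_topics=5, formatted=False):
    print(topic_id, [w for w, _ in words])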
Example 5: FolderCorpus
# Required imports: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import filter_extremes [as alias]
class FolderCorpus(corpora.TextCorpus):
def __init__(self, filepaths, preprocess=[], dictionary=None):
self.filepaths = filepaths
self.preprocess = preprocess
self.metadata = None
self.dictionary = Dictionary()
self.dictionary.add_documents(self.get_texts())
self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)
self.dictionary.compactify()
def get_texts(self):
for path in self.filepaths:
with codecs.open(path, encoding='utf8') as f:
raw_text = f.read()
raw_text = raw_text.lower()
for filt in self.preprocess:
raw_text = filt(raw_text)
text = list(utils.tokenize(raw_text, deacc=True, lowercase=True))
yield text
Example 6: HNCorpus
# Required imports: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import filter_extremes [as alias]
class HNCorpus(TextCorpus):
def __init__(self, hn_folder, dictionary=None):
"""
Takes the HN folder of articles
as input and builds the dictionary and corpus
"""
self.hn_folder = hn_folder
if dictionary is None:
self.dictionary = Dictionary(self.get_texts())
self.dictionary.filter_extremes(no_below=NO_BELOW,
no_above=NO_ABOVE, keep_n=VOCAB_SIZE)
else:
self.dictionary = dictionary
def get_texts(self):
"""
Iterate over the HN articles returning text
"""
positions, hn_articles = 0, 0
# ************ HN articles ************
fnamelist = []
for g in glob.iglob(self.hn_folder + '/*.txt'):
fnamelist.append(g)
for fileno, fname in enumerate(fnamelist):
hn_text = open(fname).read()
hn_articles += 1
if LEMMATIZE:
result = utils.lemmatize(hn_text)
positions += len(result)
yield result
else:
result = tokenize(hn_text) # text into tokens here
positions += len(result)
yield result
print (">>> finished iterating over HN corpus of %i documents with %i positions" % (hn_articles, positions))
self.length = hn_articles # cache corpus length
Example 7: ArchiveCorpus
# Required imports: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import filter_extremes [as alias]
class ArchiveCorpus(corpora.TextCorpus):
def __init__(self, datafile, preprocess=[], dictionary=None):
self.datafile = datafile
self.preprocess = preprocess
self.metadata = None
if dictionary:
self.dictionary = dictionary
else:
self.dictionary = Dictionary()
if datafile is not None:
self.dictionary.add_documents(self.get_texts())
self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)
def get_texts(self):
with utils.smart_open(self.datafile) as inputfile:
for line in inputfile:
for f in self.preprocess:
line = f(line)
text = list(utils.tokenize(line, deacc=True, lowercase=True))
yield text
Example 8: WikiCorpus
# Required imports: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import filter_extremes [as alias]
class WikiCorpus(TextCorpus):
"""
Treat a wikipedia articles dump (*articles.xml.bz2) as a (read-only) corpus.
The documents are extracted on-the-fly, so that the whole (massive) dump
can stay compressed on disk.
>>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h
>>> wiki.saveAsText('wiki_en_vocab200k') # another 8h, creates a file in MatrixMarket format plus file with id->word
"""
def __init__(self, fname, no_below=20, keep_words=DEFAULT_DICT_SIZE, dictionary=None):
"""
Initialize the corpus. This scans the corpus once, to determine its
vocabulary (only the first `keep_words` most frequent words that
appear in at least `no_below` documents are kept).
"""
self.fname = fname
if keep_words is None:
keep_words = DEFAULT_DICT_SIZE
if dictionary is None:
self.dictionary = Dictionary(self.get_texts())
self.dictionary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
else:
self.dictionary = dictionary
def get_texts(self, return_raw=False):
"""
Iterate over the dump, returning text version of each article.
Only articles of sufficient length are returned (short articles & redirects
etc are ignored).
Note that this iterates over the **texts**; if you want vectors, just use
the standard corpus interface instead of this function::
>>> for vec in wiki_corpus:
>>> print vec
"""
articles, articles_all = 0, 0
intext, positions = False, 0
if LEMMATIZE:
lemmatizer = utils.lemmatizer
yielded = 0
for _, text in _extract_pages(bz2.BZ2File(self.fname)):
text = filter_wiki(text)
articles_all += 1
if len(text) > ARTICLE_MIN_CHARS: # article redirects are pruned here
articles += 1
if return_raw:
result = text
yield result
else:
if LEMMATIZE:
_ = lemmatizer.feed(text)
while lemmatizer.has_results():
_, result = lemmatizer.read() # not necessarily the same text as entered above!
positions += len(result)
yielded += 1
yield result
else:
result = tokenize(text) # text into tokens here
positions += len(result)
yield result
if LEMMATIZE:
logger.info("all %i articles read; waiting for lemmatizer to finish the %i remaining jobs" %
(articles, articles - yielded))
while yielded < articles:
_, result = lemmatizer.read()
positions += len(result)
yielded += 1
yield result
logger.info("finished iterating over Wikipedia corpus of %i documents with %i positions"
" (total %i articles before pruning)" %
(articles, positions, articles_all))
self.length = articles # cache corpus length
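The class docstring above still uses the old camel-case saveAsText API; with current gensim naming, the usual pattern for persisting such a streamed corpus looks roughly like this (file names are illustrative, and WikiCorpus here refers to the class defined in Example 8, not gensim's built-in one):

from gensim.corpora import MmCorpus

wiki = WikiCorpus("enwiki-latest-pages-articles.xml.bz2")

# Iterating the corpus yields bag-of-words vectors built with wiki.dictionary,
# so it can be streamed straight into a MatrixMarket file.
MmCorpus.serialize("wiki_bow.mm", wiki, progress_cnt=10000)
wiki.dictionary.save_as_text("wiki_wordids.txt")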
Example 9: WikiCorpus
# Required imports: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import filter_extremes [as alias]
class WikiCorpus(TextCorpus):
"""
Treat a wikipedia articles dump (*articles.xml.bz2) as a (read-only) corpus.
The documents are extracted on-the-fly, so that the whole (massive) dump
can stay compressed on disk.
>>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h
>>> wiki.saveAsText('wiki_en_vocab200k') # another 8h, creates a file in MatrixMarket format plus file with id->word
"""
def __init__(self, fname, no_below=20, keep_words=DEFAULT_DICT_SIZE, dictionary=None):
"""
Initialize the corpus. This scans the corpus once, to determine its
vocabulary (only the first `keep_words` most frequent words that
appear in at least `no_below` documents are kept).
"""
self.fname = fname
if dictionary is None:
self.dictionary = Dictionary(self.get_texts())
self.dictionary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
else:
self.dictionary = dictionary
def get_texts(self, return_raw=False):
"""
Iterate over the dump, returning text version of each article.
Only articles of sufficient length are returned (short articles & redirects
etc are ignored).
Note that this iterates over the **texts**; if you want vectors, just use
the standard corpus interface instead of this function::
>>> for vec in wiki_corpus:
>>> print vec
"""
articles, articles_all = 0, 0
intext, positions = False, 0
for lineno, line in enumerate(bz2.BZ2File(self.fname)):
if line.startswith(" <text"):
intext = True
line = line[line.find(">") + 1 :]
lines = [line]
elif intext:
lines.append(line)
pos = line.find("</text>") # can be on the same line as <text>
if pos >= 0:
articles_all += 1
intext = False
if not lines:
continue
lines[-1] = line[:pos]
text = filter_wiki("".join(lines))
if len(text) > ARTICLE_MIN_CHARS: # article redirects are pruned here
articles += 1
if return_raw:
result = text
else:
result = tokenize(text) # text into tokens here
positions += len(result)
yield result
logger.info(
"finished iterating over Wikipedia corpus of %i documents with %i positions"
" (total %i articles before pruning)" % (articles, positions, articles_all)
)
self.length = articles # cache corpus length
Example 10: CDS_Corpus
# Required imports: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import filter_extremes [as alias]
class CDS_Corpus(TextCorpus):
def __init__(self, folder, dictionary=None):
"""
Takes the list of txt files in a folder from Isabelle
as input and builds the dictionary and corpus
"""
self.folder = folder
if dictionary is None:
self.dictionary = Dictionary(self.get_texts())
self.dictionary.filter_extremes(no_below=NO_BELOW,
no_above=NO_ABOVE, keep_n=VOCAB_SIZE)
else:
self.dictionary = dictionary
def get_texts(self):
"""
Iterate over the "documents" (sessions/places) returning text
"""
filter_words = set()
if FILTER_WORDS:
filter_words = []
with open(FILTER_WORDS) as f:
for line in f:
filter_words.append(line.rstrip('\n'))
filter_words = set(filter_words)
#print "the following words will be filtered", filter_words
filter_words_add = set()
if FILTER_WORDS_ADD:
filter_words_add = []
with open(FILTER_WORDS_ADD) as f:
for line in f:
filter_words_add.append(line.rstrip('\n'))
filter_words_add = set(filter_words_add)
positions, hn_articles = 0, 0
fnamelist = []
docs = 0
for g in glob.iglob(self.folder + '/*.txt'):
fnamelist.append(g)
for fileno, fname in enumerate(fnamelist):
with open(fname) as f:
text = ""
for line in f:
if line[0] != '@':
#sentence = re.sub('\d+', '', line.rstrip('\n').strip('\t').replace('\n', '')).split(' ')
sentence = tokenize(re.sub('\d+', '', line.rstrip('\n').strip('\t').replace('\n', '')))
for ind, word in enumerate(sentence):
w = word.lower().rstrip(' ').strip(' ').strip('\t')
sentence[ind] = w
if FILTER_WORDS:
for ind, word in enumerate(sentence):
if word.upper() in filter_words:
sentence[ind] = ''
if FILTER_WORDS_ADD:
for ind, word in enumerate(sentence):
if word in filter_words_add:
sentence[ind] = ''
text += ' '.join(sentence) + '\n'
else:
docs += 1
if LEMMATIZE:
result = lemmatizer(text)
positions += len(result)
yield result
else:
result = tokenize(text) # text into tokens here
positions += len(result)
yield result
text = ""
docs += 1
if LEMMATIZE:
result = lemmatizer(text)
positions += len(result)
yield result
else:
result = tokenize(text) # text into tokens here
positions += len(result)
yield result
print (">>> finished iterating over the corpus of %i documents with %i positions" % (docs, positions))
self.length = docs # cache corpus length
Example 11: WikiHNCorpus
# Required imports: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import filter_extremes [as alias]
class WikiHNCorpus(TextCorpus):
def __init__(self, wiki_file, hn_folder, dictionary=None, processes=None,
lemmatize=utils.HAS_PATTERN):
"""
Takes the wikipedia *articles.xml.bz2 and the HN folder of articles
as input and builds the dictionary and corpus
"""
global outputname
self.lemmatize = lemmatize
if self.lemmatize:
print "We will lemmatize ('you were'->'be/VB')"
self.outputname = outputname + "_lemmatized"
else:
print "We will only tokenize ('you were'->'you','were')"
self.wiki_file = wiki_file
self.hn_folder = hn_folder
if processes is None:
processes = max(1, multiprocessing.cpu_count() - 1)
self.processes = processes
if dictionary is None:
self.dictionary = Dictionary(self.get_texts())
self.dictionary.filter_extremes(no_below=NO_BELOW,
no_above=NO_ABOVE, keep_n=VOCAB_SIZE)
else:
self.dictionary = dictionary
def get_texts(self):
"""
Iterate over the Wikipedia dump and the HN articles returning text
"""
wiki_articles, hn_articles, articles_all = 0, 0, 0
positions, positions_all = 0, 0
# ************ Wikipedia ************
texts = ((text, self.lemmatize) for _, text in wikicorpus._extract_pages(bz2.BZ2File(self.wiki_file)))
pool = multiprocessing.Pool(self.processes)
for group in utils.chunkize(texts, chunksize=10 * pool._processes, maxsize=1): # otherwise imap puts all the corpus into memory
for tokens in pool.imap(wikicorpus.process_article, group):
articles_all += 1
positions_all += len(tokens)
if len(tokens) > WIKI_ARTICLE_MIN_WORDS:
wiki_articles += 1
positions += len(tokens)
yield tokens
pool.terminate()
print (">>> finished iterating over Wikipedia corpus of %i documents with %i positions (total %i articles, %i positions before pruning articles shorter than %i words)" % (wiki_articles, positions, articles_all, positions_all, WIKI_ARTICLE_MIN_WORDS))
# ************ HN articles ************
positions_after_wiki = positions
fnamelist = []
for g in glob.iglob(self.hn_folder + '/*.txt'):
fnamelist.append(g)
for fileno, fname in enumerate(fnamelist): # TODO parallelize as Wiki
hn_text = open(fname).read()
if self.lemmatize:
result = utils.lemmatize(hn_text) # text into lemmas here
else:
result = tokenize(hn_text) # text into tokens here
articles_all += 1
positions_all += len(result)
if len(result) > HN_ARTICLE_MIN_WORDS:
hn_articles += 1
positions += len(result)
yield result
print (">>> finished iterating over HN corpus of %i documents with %i positions" % (hn_articles, positions - positions_after_wiki))
# ************ /HN articles ************
self.length = wiki_articles + hn_articles # cache corpus length
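Finally, once any of the corpus classes above has built its filtered dictionary, topic models can be trained directly on the streamed bag-of-words vectors. A rough sketch using Example 11's WikiHNCorpus (constructor arguments, topic count, and output paths are placeholders; note the corpus is iterated twice, once for TF-IDF statistics and once for LSI):

from gensim.models import TfidfModel, LsiModel

corpus = WikiHNCorpus("enwiki-latest-pages-articles.xml.bz2", "hn_articles/")

# Weight the streamed bag-of-words vectors with TF-IDF, then fit LSI on top.
tfidf = TfidfModel(corpus, id2word=corpus.dictionary)
lsi = LsiModel(tfidf[corpus], id2word=corpus.dictionary, num_topics=200)

lsi.save("wiki_hn_lsi.model")
corpus.dictionary.save("wiki_hn.dict")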