本文整理汇总了Python中gensim.corpora.dictionary.Dictionary.compactify方法的典型用法代码示例。如果您正苦于以下问题:Python Dictionary.compactify方法的具体用法?Python Dictionary.compactify怎么用?Python Dictionary.compactify使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类gensim.corpora.dictionary.Dictionary的用法示例。
在下文中一共展示了Dictionary.compactify方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: preprocess_corpora
# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import compactify [as 别名]
def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
    """
    Lemmatize and filter raw documents, then build a gensim Dictionary from them.

    Each document is cleaned with ``clean_text``, lemmatized with ``lemmatize``,
    passed through ``convert_compound`` and finally filtered so that only
    lower-case words of 2 to 15 characters that are not stop words remain.

    :rtype : gensim.corpora.dictionary.Dictionary
    :param corpora: mapping of original document id -> raw text; entries whose
        text is ``None`` are skipped (they still count towards ``max_doc``).
    :param stopwords: container of words to drop (only ``in`` tests are used).
    :param allowed_pos: forwarded to ``lemmatize`` as ``allowed_tags``.
    :param max_doc: maximum number of entries of ``corpora`` to examine.
    :param no_above: forwarded to ``Dictionary.filter_extremes``.
    :param no_below: forwarded to ``Dictionary.filter_extremes``.
    :param keep_n: forwarded to ``Dictionary.filter_extremes``.
    :return: the dictionary, carrying three extra attributes:
        ``corpus_id2orig_id`` (position in ``corpus`` -> original id),
        ``corpus`` (bag-of-words vector per kept document) and
        ``id2token`` (``revdict`` of ``token2id``).
    """
    # Local import keeps the block self-contained; sys.stdout.write is used
    # for the progress line because it behaves the same on Python 2 and 3.
    import sys

    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []
    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue
        # '\r' rewinds to the start of the line so the counter updates in place
        sys.stdout.write('\r%s / %s' % (count, corpus_num))
        sys.stdout.flush()
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        # Tokens are unpacked as "word/POS" by the original code. Split on the
        # LAST '/' only, so a '/' inside the word itself cannot raise ValueError.
        words = [token.rsplit('/', 1)[0] for token in tokens]
        # convert compound word into one token
        words = convert_compound(words)
        # filter stop words, long words, and non-english words
        words = [w for w in words if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(words)
        corpus_id2orig_id.append(index)
    # terminate the in-place progress line
    sys.stdout.write('\n\n')

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    # doc2bow runs after filtering, so the vectors use the final token ids
    dictionary.corpus = [dictionary.doc2bow(doc) for doc in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)
    return dictionary
示例2: FolderCorpus
# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import compactify [as 别名]
class FolderCorpus(corpora.TextCorpus):
    """Text corpus that streams tokenized documents from a list of file paths.

    Unless a dictionary is supplied, one is built from the files on
    construction and pruned with ``filter_extremes`` and ``compactify``.
    """

    def __init__(self, filepaths, preprocess=None, dictionary=None):
        """
        :param filepaths: iterable of paths to UTF-8 text files; it is iterated
            again on every ``get_texts`` call, so it must be re-iterable.
        :param preprocess: optional list of callables, each taking and returning
            the (already lower-cased) raw text of one file. Defaults to no filters.
        :param dictionary: optional pre-built dictionary. When given it is used
            as-is; when ``None`` a new one is built from ``filepaths``.
        """
        self.filepaths = filepaths
        # Avoid a shared mutable default: every instance gets its own list.
        self.preprocess = [] if preprocess is None else preprocess
        self.metadata = None
        if dictionary is None:
            self.dictionary = Dictionary()
            self.dictionary.add_documents(self.get_texts())
            self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)
            self.dictionary.compactify()
        else:
            # Honor the caller's dictionary instead of silently discarding it.
            self.dictionary = dictionary

    def get_texts(self):
        """Yield one list of tokens per file, in ``filepaths`` order."""
        for path in self.filepaths:
            with codecs.open(path, encoding='utf8') as f:
                raw_text = f.read()
            # Lower-case before the filters so they see normalized text.
            raw_text = raw_text.lower()
            for filt in self.preprocess:
                raw_text = filt(raw_text)
            yield list(utils.tokenize(raw_text, deacc=True, lowercase=True))