This article collects typical usage examples of the Python method gensim.corpora.dictionary.Dictionary.load. If you are wondering exactly what Dictionary.load does and how to use it, the curated examples below should help. You can also explore the containing class, gensim.corpora.dictionary.Dictionary, for further details.
Below are 15 code examples of Dictionary.load, sorted by popularity by default.
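Before diving in, here is a minimal, self-contained sketch of the save/load round trip; the file name tokens.dict is illustrative and not taken from any example below:

from gensim.corpora.dictionary import Dictionary

# Build a small dictionary from tokenized documents and persist it to disk.
texts = [['human', 'machine', 'interface'], ['graph', 'trees', 'minors']]
dictionary = Dictionary(texts)
dictionary.save('tokens.dict')  # illustrative path

# Dictionary.load restores the pickled object (it is a classmethod
# inherited from gensim's SaveLoad base class).
loaded = Dictionary.load('tokens.dict')
assert loaded.token2id == dictionary.token2id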
Example 1: merge_dictionaries
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load [as alias]
# Also needs: from glob import iglob
def merge_dictionaries(dictionaries_path, merged_dictionary_path=None):
    dict_paths = list(iglob(dictionaries_path))
    final_dictionary = Dictionary.load(dict_paths[0])
    for dict_path in dict_paths[1:]:
        dictionary = Dictionary.load(dict_path)
        final_dictionary.merge_with(dictionary)
    if merged_dictionary_path:
        final_dictionary.save(merged_dictionary_path)
    return final_dictionary
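A hypothetical invocation of merge_dictionaries; the glob pattern and output path are assumptions for illustration:

# Merge every partial dictionary produced by parallel preprocessing runs.
merged = merge_dictionaries('dicts/part-*.dict', 'dicts/merged.dict')
print(len(merged))  # vocabulary size after merging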
Example 2: analyze_top_dfs
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load [as alias]
# Also needs: import operator
def analyze_top_dfs(tokendict, tagdict, cutoff_factor=1):
    '''Given gensim dictionaries `tokendict` and `tagdict`, show the top document frequencies.'''
    if isinstance(tokendict, str):
        tokendict = Dictionary.load(tokendict)
    if isinstance(tagdict, str):
        tagdict = Dictionary.load(tagdict)
    max_tag_df = max(tagdict.dfs.items(), key=operator.itemgetter(1))
    sorted_dfs = sorted(tokendict.dfs.items(), key=operator.itemgetter(1), reverse=True)
    print("count threshold: %-15s\t%d" % (tagdict[max_tag_df[0]], max_tag_df[1]))
    print("----------------------------------------------")
    for tup in sorted_dfs[:100]:
        if tup[1] > max_tag_df[1] * cutoff_factor:
            print("%-15s\t%d" % (tokendict[tup[0]][:15], tup[1]))
        else:
            break
Example 3: __init__
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load [as alias]
# Also needs: from gensim.models import LdaModel
def __init__(self, topics=10,
             worker=3,
             pretrained_model=None,
             dictionary=None):
    """
    Initialize LDA model training.
    Args:
        topics -- number of topics
        worker -- degree of parallelism, usually the number of cores minus one
        pretrained_model -- a previously trained model; online updates are
            supported, so the model from the last run can be loaded
        dictionary -- words must be converted to IDs for training, so each
            model is paired with a dictionary that holds the ID mapping
    Example:
        >>> lda = LDA(topics=20, worker=2,
        ...           pretrained_model=model_file,
        ...           dictionary=dictionary_file)
        >>> corpus = read_file(corpus_file)  # [['word1', 'word2'], ['word3', 'word4']]
        >>> lda.update(corpus)
        >>> lda.save(model_file, dictionary_file)
        >>> topics = lda.inference(['word5', 'word6'])
    """
    self._topics = topics
    self._workers = worker
    self._model = None
    self._common_dictionary = None
    if pretrained_model and dictionary:
        self._model = LdaModel.load(pretrained_model)
        self._common_dictionary = Dictionary.load(dictionary)
Example 4: plot_dict_hist
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load [as alias]
# Also needs: import operator; import numpy as np; import matplotlib.pyplot as plt
def plot_dict_hist(gdict):
    '''Given gensim dictionary `gdict`, plot document-frequency statistics.'''
    if isinstance(gdict, str):
        gdict = Dictionary.load(gdict)
    sorted_dfs = sorted(gdict.dfs.items(), key=operator.itemgetter(1), reverse=True)
    y = [tup[1] for tup in sorted_dfs]
    x = np.arange(0, len(y))
    plt.figure(figsize=(8, 5))
    plt.loglog(x, y)
    plt.grid()
    plt.xlabel("Token rank")
    plt.ylabel("Document count")
    cdf = np.empty(len(y))
    delta(y, cdf)  # project helper (not shown) that appears to fill `cdf` with the running total of `y`
    cdf /= np.max(cdf)  # normalize
    x50 = x[cdf > 0.50][0]
    x80 = x[cdf > 0.80][0]
    x90 = x[cdf > 0.90][0]
    x95 = x[cdf > 0.95][0]
    plt.axvline(x50, color='c')
    plt.axvline(x80, color='g')
    plt.axvline(x90, color='r')
    plt.axvline(x95, color='k')
    print("50%\t", x50)
    print("80%\t", x80)
    print("90%\t", x90)
    print("95%\t", x95)
Example 5: main
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load [as alias]
# Also needs: import logging, multiprocessing; CORPUS_TYPES and this Word2Vec variant are project-specific
def main(args):
    if args.corpus_type != "wiki":
        if args.processed_corpus_save_path is not None:
            raise ValueError("Processed corpus saving only supported for 'wiki' corpus type")
    kwargs = {}
    if args.dictionary_path is not None:
        kwargs["dictionary"] = Dictionary.load(args.dictionary_path)
    if args.dictionary_out_path is not None:
        kwargs["dictionary_save_path"] = args.dictionary_out_path
    if args.corpus_type == "wiki" and args.processed_corpus_save_path is not None:
        kwargs["sentences_save_path"] = args.processed_corpus_save_path
    logging.debug("Building corpus")
    corpus = CORPUS_TYPES[args.corpus_type](args.corpus_path, **kwargs)
    documents = corpus.get_texts()
    logging.debug("Now beginning VSM construction with Word2Vec")
    model = Word2Vec(
        sentences=documents,
        vocab_path=args.vocab_path,
        window=args.window_size,
        drop_capitals=args.drop_capitals,
        min_count=args.minimum_token_count,
        size=args.vector_dimensions,
        workers=multiprocessing.cpu_count(),
    )
    model.save(args.out_path)
    if args.vocab_out_path is not None:
        model.save_vocab(args.vocab_out_path)
Example 6: prune_dictionary
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load [as alias]
def prune_dictionary(src_dictionary_path, dest_dictionary_path=None,
                     no_below=None, no_above=None, keep_n=None):
    dictionary = Dictionary.load(src_dictionary_path)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above,
                               keep_n=keep_n)
    if dest_dictionary_path:
        dictionary.save(dest_dictionary_path)
    return dictionary
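A hypothetical call with illustrative paths and thresholds; note that no_below and no_above should be numeric, since filter_extremes compares them directly:

# Keep tokens appearing in at least 5 documents but in no more than half
# of them, capped at a 100,000-term vocabulary.
pruned = prune_dictionary('raw.dict', 'pruned.dict',
                          no_below=5, no_above=0.5, keep_n=100000)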
Example 7: getDictionary
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load [as alias]
def getDictionary(word_corpus, useSavedTill):
    if useSavedTill >= USESAVED.dictionary:
        common_logger.info("loading dictionary from file")
        dictionary = Dictionary.load(file_lda_gensim_dictionary)
        return dictionary
    else:
        common_logger.info("Creating dictionary from corpus")
        dictionary = Dictionary(word_corpus.values())
        common_logger.info("saving dictionary")
        dictionary.save(file_lda_gensim_dictionary)
        return dictionary
Example 8: filter_extremes_wrapper
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load [as alias]
def filter_extremes_wrapper(gdict, no_below=1, no_above=1.0, keep_n=None, save_pickle=None):
    '''Given unfiltered gensim dictionary `gdict`, wrap filter_extremes.'''
    if isinstance(gdict, str):
        gdict = Dictionary.load(gdict)
    print("Before filtering:", gdict)
    gdict.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    print("After filtering:", gdict)
    if save_pickle:
        print("\nsaving...")
        gdict.save(save_pickle)
    return gdict
Example 9: main
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load [as alias]
# Also needs: import logging; import pandas as pd; from elasticsearch import Elasticsearch; from gensim.models import LdaModel, TfidfModel
def main():
    logformat = '%(asctime)s %(name)-12s: %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=logformat)
    kera = NOB_kera()
    es = Elasticsearch(port=9201)
    mod = LdaModel.load(modelfile)
    vocab = Dictionary.load(vocabulary)
    tfidf = TfidfModel(dictionary=vocab)
    results = []
    for (topics, topicid) in get_doc_topics(mod, mod.num_topics, num_words_from_topic, vocab, tfidf):
        res = es.search(index='wiki4', body={"query": {"match": {"_all": topics}}}, size=num_results_from_es)
        results.append({'topics': topics, 'result': res, 'topicid': topicid})
    results = add_keywords(results, kera)
    df = pd.DataFrame(results)
    df.to_csv('nowiki_4_with_kera_250_topics.csv', encoding='utf-8')
Example 10: buildDictionary
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load [as alias]
# Also needs: import time; from os.path import isfile
def buildDictionary(force=False):
    """Build a dictionary in which each post corresponds to a document."""
    global globalDict
    if force or not isfile(dictName):
        postids = getPostids()
        numPosts = len(postids)
        count = 0
        for postid in postids:
            if count % 100 == 0:
                print("Added %d out of %d to dictionary: %s" % (count, numPosts, time.strftime("%H:%M:%S")))
            addPostToDict(postid)
            count += 1
    else:
        globalDict = Dictionary.load(dictName)
    # Filter out extremely common words
    globalDict.filter_extremes(no_below=2, no_above=0.5)
Example 11: __init__
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load [as alias]
# Also needs: from gensim.corpora import MmCorpus; from gensim.models import TfidfModel
def __init__(self, analyzed_items_path=None, dictionary_path=None,
             corpus_path=None, tfidf_model_path=None):
    if dictionary_path:
        self.dictionary = Dictionary.load(dictionary_path)
    else:
        self.dictionary = None
    if analyzed_items_path:
        self.analyzed_items_path = analyzed_items_path
    else:
        self.analyzed_items_path = None
    if corpus_path:
        self.corpus = MmCorpus(corpus_path)
    else:
        self.corpus = None
    if tfidf_model_path:
        self.tfidf_model = TfidfModel.load(tfidf_model_path)
    else:
        self.tfidf_model = None
Example 12: build_lda_model
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load [as alias]
# Also needs: import os.path as op; from gensim.models import LdaMulticore
def build_lda_model(self, topics: int = 20):
    ignore_words = [
        'like', 'know', 'fuck', 'fucking', 'want', 'shit', 'sure',
        'isn', 'CHANBOARD', 'think', 'people', 'good', 'time', 'going',
        'WEBLINK', 'got', 'way', ''
    ]
    filename = op.join(self.input_dir, f'{self.board}.dictionary')
    dictionary: Dictionary = Dictionary.load(filename)
    documents = ReadThreads(
        self.board, input_dir=self.input_dir, file_type='phrases',
        return_func=lambda x, y: dictionary.doc2bow(
            [w for w in y.split() if w not in ignore_words]
        )
    )
    lda = LdaMulticore(
        documents, id2word=dictionary, num_topics=topics, iterations=2)
    filename = op.join(self.input_dir, f'{self.board}.lda')
    lda.save(filename)
    return lda
Example 13: _create_dictionary
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load [as alias]
def _create_dictionary(self, mongo_client):
    """
    Creates the gensim Dictionary (gensim.corpora.dictionary.Dictionary), or loads it if it
    already exists, and sets the object's dictionary property.
    :param mongo_client: server.db.MongoClientContext
    """
    from gensim.corpora.dictionary import Dictionary
    if self._resource_exists(self.dictionary_file):
        self.logger().debug(
            "Dictionary file found, loading it [%s]" % self._create_resource_path(self.dictionary_file))
        self._dictionary = Dictionary.load(self._create_resource_path(self.dictionary_file))
    else:
        self.logger().debug("Dictionary file not found, creating a new Dictionary file")
        self._dictionary = Dictionary()
        documents = []
        for doc in [di for d in mongo_client.scrappers_collections() for di in d.find()]:
            documents.append(self.tokenize_sentence(doc[self.considerable_doc_property]))
        self.logger().debug("Adding %d documents to dictionary (will skip existing ones)" % len(documents))
        self._dictionary.add_documents(documents)
        self._dictionary.save(self._create_resource_path(self.dictionary_file))
Example 14: update
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load [as alias]
# Also needs: import os, json; from gensim.models import LdaModel
def update(self, name, n=500, method='FastICA'):
    settings = self._setstorage.load(encode_name(name))
    clusterer = Clusterer(settings)
    # load the models
    dictionary = Dictionary.load(os.path.join(DICTIONARY_PATH, settings[DICTIONARY]))
    ngram_size = len(dictionary[0])
    transformer = NgramTransformer(ngram_size)
    ldamodel = LdaModel.load(os.path.join(LDA_PATH, settings[LDA_MODEL]))
    # get the input
    segments = self._segstorage.load(name=settings[SEGMENT_NAME], limit=int(n))
    documents = [s.value for s in segments]
    # prepare args
    kwargs = {'dictionary': dictionary,
              'ngramtransformer': transformer,
              'ldamodel': ldamodel,
              'method': method}
    Xt = clusterer.fit_transform(documents, **kwargs)
    labels = clusterer.assign_labels(documents)
    data = self._make_data(Xt, labels, documents)
    return json.dumps({'result': 'OK',
                       'data': data})
Example 15: scorer
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import load [as alias]
# Also needs: from gensim.models import TfidfModel
def scorer(model, dic):
    tfidf = TfidfModel.load(model)
    dictionary = Dictionary.load(dic)
    def score(words):
        return tfidf[dictionary.doc2bow(words)]
    return score
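A usage sketch of the returned closure; the file names are hypothetical:

score = scorer('wiki.tfidf', 'wiki.dict')
# Returns a sparse bag-of-words vector re-weighted by tf-idf,
# as (token_id, weight) pairs.
print(score(['graph', 'minors', 'trees']))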