本文整理汇总了Python中gensim.corpora.dictionary.Dictionary.keys方法的典型用法代码示例。如果您正苦于以下问题:Python Dictionary.keys方法的具体用法?Python Dictionary.keys怎么用?Python Dictionary.keys使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类gensim.corpora.dictionary.Dictionary的用法示例。
在下文中一共展示了Dictionary.keys方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: InfoGain
# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import keys [as 别名]
class InfoGain(object):
    """
    Compute the information gain of every word (tag/feature) in a
    category-labelled corpus.

    Information gain of a word = global entropy of the category
    distribution minus the conditional entropy of the categories given
    the presence/absence of that word.

    Example:
        >>> ig = InfoGain(corpus_file)
        >>> ig.compute()
        >>> ig.save(ig_file)
        >>> print(ig['word'])  # look up one word's information gain
    """
    def __init__(self, corpus_file):
        """
        Args:
            corpus_file -- corpus file path; the first whitespace-separated
                column of each line is the category label, the remaining
                columns are the words (tags/features) of that sample.
        """
        corpus = []
        categories = []
        self._category_distribution = {}  # category -> number of samples
        self._words_cate = {}             # word -> {category -> samples containing word}
        self._words_sample_count = {}     # word -> number of samples containing word
        self._info_gain = {}              # word -> information gain (filled by compute())
        with open(corpus_file, 'r') as documents:
            for line in documents:
                words = line.strip().split()
                if len(words) <= 1:
                    # Skip empty lines and lines with a label but no words.
                    continue
                categories.append(words[0])
                corpus.append(words[1:])
                if words[0] not in self._category_distribution:
                    self._category_distribution[words[0]] = 0
                self._category_distribution[words[0]] += 1
                # Count word/category co-occurrences for the conditional
                # entropy; set() counts each word once per sample.
                for word in set(words[1:]):
                    if word not in self._words_cate:
                        self._words_cate[word] = {}
                        self._words_sample_count[word] = 0
                    if words[0] not in self._words_cate[word]:
                        self._words_cate[word][words[0]] = 0
                    self._words_cate[word][words[0]] += 1
                    self._words_sample_count[word] += 1
        self._common_dictionary = Dictionary(corpus)
        self._corpus = corpus
        self._categories = categories
    def compute(self):
        """
        Compute the information gain of all words, storing results in
        ``self._info_gain``. First computes the global entropy, then each
        word's conditional entropy; the difference is the gain.
        """
        total = len(self._corpus)
        system_entropy = compute_entropy(total, self._category_distribution)
        # NOTE(review): gensim's Dictionary.keys() yields integer token ids,
        # not token strings, so the membership test below may skip every
        # word — TODO confirm against the gensim version in use.
        for word in self._common_dictionary.keys():
            if word not in self._words_cate:
                continue
            # Entropy of the category distribution among samples that
            # CONTAIN the word.
            entropy1 = compute_entropy(self._words_sample_count[word],
                                       self._words_cate[word])
            # Category distribution among samples that do NOT contain it.
            category_distribution = {}
            for cate in self._category_distribution:
                category_distribution[cate] = self._category_distribution[cate]
                if cate in self._words_cate[word]:
                    # BUG FIX: the original subtracted the whole dict
                    # self._words_cate[word]; subtract the per-category count.
                    category_distribution[cate] -= self._words_cate[word][cate]
            # Entropy among samples WITHOUT the word.
            entropy2 = compute_entropy(total - self._words_sample_count[word],
                                       category_distribution)
            # Conditional entropy: presence/absence weighted average.
            condition_entropy = (
                self._words_sample_count[word] * entropy1 / total
                + (total - self._words_sample_count[word]) * entropy2 / total)
            self._info_gain[word] = system_entropy - condition_entropy
    def save(self, ig_file_name, sort=False):
        """
        Save results to a file, one "word gain" pair per line.

        Args:
            ig_file_name -- output file path
            sort -- if True, write entries sorted by information gain,
                highest first; default is dict iteration order.
        """
        with open(ig_file_name, 'w') as ig_file:
            items = self._info_gain.items()
            if sort:
                items = sorted(items, key=lambda x: x[1], reverse=True)
            for word, gain in items:
                ig_file.write("%s %.2f\n" % (word, gain))
    def __getitem__(self, word):
        """Return the information gain of *word*, or 0.0 if unknown."""
        # BUG FIX: the original named this __get_item__, which is not a
        # real dunder, so the documented ig['word'] lookup raised TypeError.
        return self._info_gain.get(word, 0.0)
    # Backward-compatible alias for callers using the old misspelled name.
    __get_item__ = __getitem__