This article collects typical usage examples of the Bunch.tdm attribute from Python's sklearn.datasets.base module. If you are wondering what Bunch.tdm is for in practice or how to use it, the curated code samples below should help; you can also explore further usage examples of its containing class, sklearn.datasets.base.Bunch.
Eight code examples of Bunch.tdm are shown below, sorted by popularity by default.
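Before the individual examples, here is a minimal self-contained sketch (not taken from the list below) of the pattern they all share: a Bunch is created with an empty tdm field, a TfidfVectorizer turns the documents into a sparse term-document matrix, and that matrix plus the fitted vocabulary are stored on the bunch. The toy corpus and parameters here are placeholders, not drawn from any of the examples.

from sklearn.datasets.base import Bunch  # in recent scikit-learn: from sklearn.utils import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical toy corpus; the examples below load pre-segmented documents instead.
corpus = ["machine learning text classification",
          "text mining with tf idf",
          "naive bayes text classifier"]

# Bunch is a dict-like container; tdm will hold the sparse tf-idf matrix, vocabulary the term index.
bunch = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={})
vectorizer = TfidfVectorizer(sublinear_tf=True)
bunch.tdm = vectorizer.fit_transform(corpus)   # scipy.sparse matrix, shape (n_docs, n_terms)
bunch.vocabulary = vectorizer.vocabulary_      # {term: column index}
print(bunch.tdm.shape)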
Example 1: gen_tf_idf_space
# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import tdm [as alias]
def gen_tf_idf_space():
    bunch = read_object(train_data)  # bunch of segmented training documents
    tf_idf_space = Bunch(target_name=bunch.target_name, label=bunch.label,
                         filenames=bunch.filenames, vocabulary={})
    vectorizer = TfidfVectorizer(stop_words=load_stop_words(), sublinear_tf=True, max_df=0.5)
    transformer = TfidfTransformer()  # created but unused; TfidfVectorizer already applies tf-idf weighting
    tf_idf_space.tdm = vectorizer.fit_transform(bunch.contents)  # sparse term-document matrix
    tf_idf_space.vocabulary = vectorizer.vocabulary_
    save_object(tf_idf_space_data, tf_idf_space)
Example 2: calc_tfidf
# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import tdm [as alias]
def calc_tfidf(trainsetfile, stopwordfile, dstdir):
    data_set = joblib.load(trainsetfile)
    wordbag = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={})
    wordbag.target_name = data_set.target_name
    wordbag.label = data_set.label
    corpus = data_set.contents
    stopwordlist = read_stopword(stopwordfile)
    vectorize = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=3,
                                max_features=50000, stop_words=stopwordlist)
    feature_train = vectorize.fit_transform(corpus)  # sparse tf-idf term-document matrix
    wordbag.tdm = feature_train
    wordbag.vocabulary = vectorize.vocabulary_
    joblib.dump(wordbag, dstdir + "/" + "word_bag.data", compress=3)
Example 3: testset_tfidf
# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import tdm [as alias]
def testset_tfidf(testsetfile, stopwordfile, myvocabulary):
    data_set = joblib.load(testsetfile)
    wordbag = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={})
    wordbag.target_name = data_set.target_name
    wordbag.label = data_set.label
    corpus = data_set.contents
    stopwordlist = read_stopword(stopwordfile)
    # Reuse the training vocabulary so the test matrix has the same columns as the training matrix
    vectorize = TfidfVectorizer(sublinear_tf=True, stop_words=stopwordlist, vocabulary=myvocabulary)
    feature_train = vectorize.fit_transform(corpus)
    wordbag.tdm = feature_train
    joblib.dump(wordbag, "test_wordbag/test_word_bag.data", compress=3)
    return wordbag
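Examples 2 and 3 are meant to work as a pair: the vocabulary produced by calc_tfidf on the training set is passed into testset_tfidf so the test term-document matrix shares the training matrix's columns. A hedged sketch of that wiring; the file paths below are placeholders, not taken from the examples above.

from sklearn.externals import joblib  # in recent scikit-learn: import joblib

# Hypothetical paths; point them at the files the snippets above actually read and write.
calc_tfidf("train_set.data", "stop_words.txt", "train_wordbag")
train_bag = joblib.load("train_wordbag/word_bag.data")

# Reusing the training vocabulary keeps the train and test tdm matrices column-aligned.
test_bag = testset_tfidf("test_set.data", "stop_words.txt", train_bag.vocabulary)
print(train_bag.tdm.shape[1] == test_bag.tdm.shape[1])  # True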
Example 4: execute_NM_predict
# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import tdm [as alias]
def execute_NM_predict():
    test_bunch = read_object(test_data)
    test_space = Bunch(target_name=test_bunch.target_name, label=test_bunch.label,
                       filenames=test_bunch.filenames, tdm=[], vocabulary={})
    tf_idf_bunch = read_object(tf_idf_space_data)
    # Reuse the training vocabulary so the test tdm aligns with the training tdm
    vectorizer = TfidfVectorizer(stop_words=load_stop_words(), sublinear_tf=True, max_df=0.5,
                                 vocabulary=tf_idf_bunch.vocabulary)
    transformer = TfidfTransformer()  # created but unused; TfidfVectorizer already applies tf-idf weighting
    test_space.tdm = vectorizer.fit_transform(test_bunch.contents)
    test_space.vocabulary = tf_idf_bunch.vocabulary
    # Train a multinomial Naive Bayes classifier on the training term-document matrix
    clf = MultinomialNB(alpha=0.001).fit(tf_idf_bunch.tdm, tf_idf_bunch.label)
    # Prediction results
    predicted = clf.predict(test_space.tdm)
    # Print the results in a more readable form
    for label, file_name, predicted_cate in zip(test_bunch.label, test_bunch.filenames, predicted):
        print(file_name, ' actual category:', label, ' predicted category:', predicted_cate)
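The original example only prints each prediction; since test_bunch.label already holds the true categories, an aggregate score can be added at the end of execute_NM_predict with sklearn.metrics. A small optional addition, not part of the original code:

from sklearn import metrics

# Overall accuracy and a per-class precision/recall report
print(metrics.accuracy_score(test_bunch.label, predicted))
print(metrics.classification_report(test_bunch.label, predicted))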
Example 5: writebunchobj
# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import tdm [as alias]
# Read a bunch object from disk
def readbunchobj(path):
    file_obj = open(path, "rb")
    bunch = pickle.load(file_obj)
    file_obj.close()
    return bunch

# Write a bunch object to disk
def writebunchobj(path, bunchobj):
    file_obj = open(path, "wb")
    pickle.dump(bunchobj, file_obj)
    file_obj.close()

# 1. Read the stop-word list
stopword_path = "train_word_bag/hlt_stop_words.txt"
stpwrdlst = readfile(stopword_path).splitlines()
# 2. Load the bunch object holding the segmented training documents
path = "train_word_bag/train_set.dat"  # path of the saved word-vector space
bunch = readbunchobj(path)
# 3. Build the tf-idf word-vector space object
tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                   filenames=bunch.filenames, tdm=[], vocabulary={})
# 4. Initialise the vector-space model with TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
transformer = TfidfTransformer()  # would compute per-word tf-idf weights; unused, TfidfVectorizer already does this
# Convert the documents to a weighted term matrix and keep the dictionary (vocabulary) separately
tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
tfidfspace.vocabulary = vectorizer.vocabulary_
# Persist the word bag
space_path = "train_word_bag/tfdifspace.dat"  # path of the saved word-vector space
writebunchobj(space_path, tfidfspace)
print("tf-idf word-vector space created successfully!!!")
Example 6:
# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import tdm [as alias]
# stpwrd_dic, corpus and wordbag are created earlier in the original script (not shown in this excerpt)
stpwrd_content = stpwrd_dic.read()
# Turn the stop words into a list
stpwrdlst = stpwrd_content.splitlines()
stpwrd_dic.close()
# Time the word-bag construction: start time
start = datetime.datetime.now()
# Initialise the vector-space model with TfidfVectorizer -- build the word bag
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words=stpwrdlst)
# Would compute per-word tf-idf weights; unused, TfidfVectorizer already does this
transformer = TfidfTransformer()
# Convert the documents to a weighted term matrix
fea_train = vectorizer.fit_transform(corpus)
# End time of the word-bag construction
end = datetime.datetime.now()
print('create word bag period', (end - start).seconds)
# Shape (rows, columns) of the word bag
print('size of fea_train', fea_train.shape)
# Assign the term-document matrix and vocabulary
wordbag.tdm = fea_train
wordbag.vocabulary = vectorizer.vocabulary_
# Persist the word bag
file_obj = open('text_corpus1_wordbag/word_bag.data', 'wb')
pickle.dump(wordbag, file_obj)
file_obj.close()
Example 7: open
# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import tdm [as alias]
# Load the stop words (corpus and wordbag are created earlier in the original script, not shown in this excerpt)
stopwordpath = "extra_dict/stop_words.txt"
stopword_dic = open(stopwordpath, 'r')
stopword_content = stopword_dic.read()
# Turn the stop words into a list
stopwordlist = stopword_content.splitlines()
stopword_dic.close()
# Start time of the word-bag construction
start = datetime.datetime.now()
print(start)
# Build the word bag with TfidfVectorizer
vectorize = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=3,
                            max_features=50000, stop_words=stopwordlist)
feature_train = vectorize.fit_transform(corpus)
# End time of the word-bag construction
end = datetime.datetime.now()
print('create word bag period:', (end - start).seconds, "seconds")
# Shape (rows, columns) of the word bag
print('Size of fea_train:\n', feature_train.shape)
wordbag.tdm = feature_train
wordbag.vocabulary = vectorize.vocabulary_
# print(wordbag.vocabulary)
# print(wordbag.tdm[0:5])
# Persist the word bag, then load it back as a sanity check
joblib.dump(wordbag, "wordbag/word_bag1124.data", compress=9)
word_bag1124 = joblib.load("wordbag/word_bag1124.data")
print(word_bag1124.target_name)
Example 8: fetch_20newsgroups
# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import tdm [as alias]
# Imports needed by this self-contained example
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets.base import Bunch  # in recent scikit-learn: from sklearn.utils import Bunch
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import pickle
##################################################################
## Load the data
categories = ["alt.atheism", "soc.religion.christian", "comp.graphics", "sci.med"]  # news categories to download
data_set = fetch_20newsgroups(subset="train", categories=categories, shuffle=True, random_state=42)  # downloads the full corpus first, then extracts the selected categories
print(data_set.target_names)  # ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
##################################################################
## Define the word-bag data structure
# tdm: the term-document matrix of the word bag
stpwrdlst = []  # empty stop-word list
wordbag = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={}, stpwrdlst=[])
wordbag.target_name = data_set.target_names
wordbag.label = data_set.target
wordbag.filenames = data_set.filenames
wordbag.stpwrdlst = stpwrdlst
vectorizer = CountVectorizer(stop_words=stpwrdlst)  # CountVectorizer builds a raw term-count matrix
transformer = TfidfTransformer()  # would turn the counts into tf-idf weights; not applied below, so tdm holds raw counts
fea_train = vectorizer.fit_transform(data_set.data)  # documents -> term-count matrix
print(fea_train.shape)  # (2257, 35788); 2257 documents, 35788 distinct terms
wordbag.tdm = fea_train  # assign the term-document matrix
wordbag.vocabulary = vectorizer.vocabulary_
##################################################################
## Persist the word bag
file_obj = open("tmp.data", "wb")
pickle.dump(wordbag, file_obj)
file_obj.close()
##################################################################
## Read it back
with open('tmp.data', 'rb') as f: clf2 = pickle.load(f)
print(clf2.tdm.shape)  # (2257, 35788)