当前位置: 首页>>代码示例>>Python>>正文


Python Bunch.vocabulary方法代码示例

本文整理汇总了Python中sklearn.datasets.base.Bunch对象的vocabulary字段的典型用法代码示例。需要注意:vocabulary并不是Bunch的内置方法,而是示例代码自行存入Bunch的一个普通字段(属性),在下文的示例中用来保存TfidfVectorizer的vocabulary_词典。如果您正苦于以下问题:Python Bunch.vocabulary的具体用法?Python Bunch.vocabulary怎么用?Python Bunch.vocabulary使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该字段所在的sklearn.datasets.base.Bunch的用法示例。


在下文中一共展示了Bunch.vocabulary方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: gen_tf_idf_space

# 需要导入模块: from sklearn.datasets.base import Bunch [as 别名]
# 或者: from sklearn.datasets.base.Bunch import vocabulary [as 别名]
def gen_tf_idf_space():
    """Build the TF-IDF space for the training corpus and persist it.

    Reads the training bunch via ``read_object(train_data)``, fits a
    ``TfidfVectorizer`` on ``bunch.contents`` and stores the result in a new
    ``Bunch`` that carries over ``target_name``, ``label`` and ``filenames``
    and adds ``tdm`` (the matrix returned by ``fit_transform``) and
    ``vocabulary`` (the fitted ``vocabulary_`` mapping). The new bunch is
    written with ``save_object(tf_idf_space_data, ...)``.
    """
    bunch = read_object(train_data)
    tf_idf_space = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, vocabulary={})

    # TfidfVectorizer produces TF-IDF weights directly, so no separate
    # TfidfTransformer instance is required here.
    vectorizer = TfidfVectorizer(stop_words=load_stop_words(), sublinear_tf=True, max_df=0.5)

    tf_idf_space.tdm = vectorizer.fit_transform(bunch.contents)
    tf_idf_space.vocabulary = vectorizer.vocabulary_
    save_object(tf_idf_space_data, tf_idf_space)
开发者ID:Eric-aihua,项目名称:MachineLearning,代码行数:12,代码来源:n_bayes_main.py

示例2: calc_tfidf

# 需要导入模块: from sklearn.datasets.base import Bunch [as 别名]
# 或者: from sklearn.datasets.base.Bunch import vocabulary [as 别名]
def calc_tfidf(trainsetfile, stopwordfile, dstdir):
    """Fit TF-IDF on a saved training set and dump the resulting word bag.

    Args:
        trainsetfile: Path of the joblib file holding the training data set.
        stopwordfile: Path handed to ``read_stopword`` to obtain stop words.
        dstdir: Directory that receives ``word_bag.data``.

    The dumped ``Bunch`` has the keys ``target_name``, ``label``,
    ``filenames`` (left as an empty list), ``tdm`` and ``vocabulary``.
    """
    data_set = joblib.load(trainsetfile)

    # NOTE(review): ``tatget_name`` looks like a misspelling of
    # ``target_name``; confirm against the code that writes the training set
    # before renaming it.
    target_names = data_set.tatget_name
    labels = data_set.label

    documents = data_set.contents
    stop_words = read_stopword(stopwordfile)
    vectorize = TfidfVectorizer(
        sublinear_tf=True,
        max_df=0.8,
        min_df=3,
        max_features=50000,
        stop_words=stop_words,
    )
    tfidf_matrix = vectorize.fit_transform(documents)

    wordbag = Bunch(
        target_name=target_names,
        label=labels,
        filenames=[],
        tdm=tfidf_matrix,
        vocabulary=vectorize.vocabulary_,
    )
    joblib.dump(wordbag, dstdir + "/word_bag.data", compress=3)
开发者ID:wadeallstar,项目名称:python-fraud-detect,代码行数:15,代码来源:process_tool.py

示例3: execute_NM_predict

# 需要导入模块: from sklearn.datasets.base import Bunch [as 别名]
# 或者: from sklearn.datasets.base.Bunch import vocabulary [as 别名]
def execute_NM_predict():
    test_bunch = read_object(test_data)

    test_space = Bunch(target_name=test_bunch.target_name, label=test_bunch.label, filenames=test_bunch.filenames,
                       tdm=[], vocabulary={})

    tf_idf_bunch = read_object(tf_idf_space_data)
    vectorizer = TfidfVectorizer(stop_words=load_stop_words(), sublinear_tf=True, max_df=0.5,
                                 vocabulary=tf_idf_bunch.vocabulary)
    transformer = TfidfTransformer()

    test_space.tdm = vectorizer.fit_transform(test_bunch.contents)
    test_space.vocabulary = tf_idf_bunch.vocabulary

    clf = MultinomialNB(alpha=0.001).fit(tf_idf_bunch.tdm, tf_idf_bunch.label)
    #预测结果
    predicted = clf.predict(test_space.tdm)
    #对结果进行更加友好的打印
    for label, file_name, excect_cate in zip(test_bunch.label, test_bunch.filenames, predicted):
        print file_name, ' 实际类别:', label, ' 预测类别:', excect_cate
开发者ID:Eric-aihua,项目名称:MachineLearning,代码行数:22,代码来源:n_bayes_main.py

示例4: writebunchobj

# 需要导入模块: from sklearn.datasets.base import Bunch [as 别名]
# 或者: from sklearn.datasets.base.Bunch import vocabulary [as 别名]
	file_obj.close()
	return bunch
#写入bunch对象	
def writebunchobj(path, bunchobj):
    """Pickle ``bunchobj`` into the file at ``path``.

    The file is opened in binary write mode (an existing file is
    overwritten) and is closed even when ``pickle.dump`` raises.

    Args:
        path: Destination file path.
        bunchobj: Any picklable object (the callers pass a ``Bunch``).
    """
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)

# 1. 读取停用词表	
stopword_path = "train_word_bag/hlt_stop_words.txt"
stpwrdlst = readfile(stopword_path).splitlines()

# 2. 导入分词后的词向量bunch对象
path = "train_word_bag/train_set.dat"        # 词向量空间保存路径
bunch	= readbunchobj(path)

# 3. 构建tf-idf词向量空间对象
tfidfspace = Bunch(target_name=bunch.target_name,label=bunch.label,filenames=bunch.filenames,tdm=[],vocabulary={})

# 4. 使用TfidfVectorizer初始化向量空间模型 
vectorizer = TfidfVectorizer(stop_words=stpwrdlst,sublinear_tf = True,max_df = 0.5)
transformer=TfidfTransformer() # 该类会统计每个词语的tf-idf权值
# 文本转为词频矩阵,单独保存字典文件 
tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
tfidfspace.vocabulary = vectorizer.vocabulary_

# 创建词袋的持久化
space_path = "train_word_bag/tfdifspace.dat"        # 词向量空间保存路径
writebunchobj(space_path,tfidfspace)

print "if-idf词向量空间创建成功!!!"
开发者ID:2297988468,项目名称:Chinese-Text-Classification,代码行数:33,代码来源:vector_space.py

示例5:

# 需要导入模块: from sklearn.datasets.base import Bunch [as 别名]
# 或者: from sklearn.datasets.base.Bunch import vocabulary [as 别名]
stpwrd_content=stpwrd_dic.read()

#将停用词转换为list
stpwrdlst=stpwrd_content.splitlines()
stpwrd_dic.close()

#计算词袋创建时间:获取开始时间
start=datetime.datetime.now()
#使用tfidfvectorizer初始化向量空间模型---创建词袋
vectorizer=TfidfVectorizer(sublinear_tf=True,max_df=0.5,stop_words=stpwrdlst)

#该类会统计每个词语的tf-idf权值
transformer=TfidfTransformer()

#文本转为词频矩阵
fea_train=vectorizer.fit_transform(corpus)

#计算词袋时间,结束时间
end=datetime.datetime.now()
print 'create word bag peroid',(end-start).seconds

#计算词袋的行列数
print 'size of fea_train',fea_train.shape
#为tdm赋值
wordbag.tdm=fea_train
wordbag.vocabulary=vectorizer.vocabulary_
#创建词袋的持久化
file_obj=open('text_corpus1_wordbag/word_bag.data','wb')
pickle.dump(wordbag,file_obj)
file_obj.close()
开发者ID:Pengfei-Zhu,项目名称:DataMining,代码行数:32,代码来源:tf-idffinal.py

示例6: writebunchobj

# 需要导入模块: from sklearn.datasets.base import Bunch [as 别名]
# 或者: from sklearn.datasets.base.Bunch import vocabulary [as 别名]
	return bunch
#写入bunch对象	
def writebunchobj(path, bunchobj):
    """Pickle ``bunchobj`` into the file at ``path``.

    The file is opened in binary write mode (an existing file is
    overwritten) and is closed even when ``pickle.dump`` raises.

    Args:
        path: Destination file path.
        bunchobj: Any picklable object (the callers pass a ``Bunch``).
    """
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)

# 1. 读取停用词表	
stopword_path = "train_word_bag/hlt_stop_words.txt"
stpwrdlst = readfile(stopword_path).splitlines()

# 2. 导入分词后的词向量bunch对象
path = "test_word_bag/test_set.dat"        # 词向量空间保存路径
bunch	= readbunchobj(path)

# 3. 构建测试集tfidf向量空间
testspace = Bunch(target_name=bunch.target_name,label=bunch.label,filenames=bunch.filenames,tdm=[],vocabulary={})
# 4. 导入训练集的词袋
trainbunch = readbunchobj("train_word_bag/tfdifspace.dat")
# 5. 使用TfidfVectorizer初始化向量空间模型 
vectorizer = TfidfVectorizer(stop_words=stpwrdlst,sublinear_tf = True,max_df = 0.5,vocabulary=trainbunch.vocabulary)
transformer=TfidfTransformer() # 该类会统计每个词语的tf-idf权值
# 文本转为tf-idf矩阵,单独保存字典文件 
testspace.tdm = vectorizer.fit_transform(bunch.contents)
testspace.vocabulary = trainbunch.vocabulary

# 创建词袋的持久化
space_path = "test_word_bag/testspace.dat"        # 词向量空间保存路径
writebunchobj(space_path,testspace)

print "test词向量空间创建成功!!!"
开发者ID:2297988468,项目名称:Chinese-Text-Classification,代码行数:33,代码来源:test_space.py


注:本文中的sklearn.datasets.base.Bunch.vocabulary方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。