This article collects typical usage examples of sklearn.datasets.base.Bunch.target_name in Python. If you are unsure what Bunch.target_name does in practice or how to use it, the curated code examples below may help. You can also read further about the class it belongs to, sklearn.datasets.base.Bunch.
The following shows 6 code examples of Bunch.target_name, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
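Bunch is a dict-like container whose keys are also exposed as attributes; target_name is simply one of those keys and, by convention in the examples below, holds the list of category names. A minimal sketch, not taken from any of the examples (the category names are made up):

from sklearn.datasets.base import Bunch

bag = Bunch(target_name=[], label=[], contents=[])
bag.target_name = ["sports", "finance"]              # list of all category names
bag.label.append(bag.target_name.index("finance"))   # a numeric label is the index into target_name
print(bag["target_name"], bag.label)                 # key access and attribute access are equivalent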
Example 1: calc_tfidf
# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import target_name [as alias]
from sklearn.datasets.base import Bunch
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

def calc_tfidf(trainsetfile, stopwordfile, dstdir):
    data_set = joblib.load(trainsetfile)
    wordbag = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={})
    wordbag.target_name = data_set.target_name
    wordbag.label = data_set.label
    corpus = data_set.contents
    # read_stopword is a helper defined elsewhere in the original script
    stopwordlist = read_stopword(stopwordfile)
    vectorize = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=3,
                                max_features=50000, stop_words=stopwordlist)
    # Build the TF-IDF term-document matrix and keep the learned vocabulary
    feature_train = vectorize.fit_transform(corpus)
    wordbag.tdm = feature_train
    wordbag.vocabulary = vectorize.vocabulary_
    joblib.dump(wordbag, dstdir + "/" + "word_bag.data", compress=3)
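A hypothetical invocation of calc_tfidf (the paths are illustrative; read_stopword is assumed to return a list of stop words):

calc_tfidf("wordbag/train_set.data", "extra_dict/hlt_stop_words.txt", "wordbag")
# writes the TF-IDF word bag to wordbag/word_bag.data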
Example 2: testset_tfidf
# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import target_name [as alias]
from sklearn.datasets.base import Bunch
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

def testset_tfidf(testsetfile, stopwordfile, myvocabulary):
    data_set = joblib.load(testsetfile)
    wordbag = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={})
    wordbag.target_name = data_set.target_name
    wordbag.label = data_set.label
    corpus = data_set.contents
    stopwordlist = read_stopword(stopwordfile)
    # Vectorize the test corpus with the vocabulary learned on the training set
    vectorize = TfidfVectorizer(sublinear_tf=True, stop_words=stopwordlist,
                                vocabulary=myvocabulary)
    feature_train = vectorize.fit_transform(corpus)
    wordbag.tdm = feature_train
    joblib.dump(wordbag, "test_wordbag/test_word_bag.data", compress=3)
    return wordbag
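A hypothetical usage sketch: the test corpus is vectorized against the vocabulary learned on the training set so that the columns of both term-document matrices line up (the paths are illustrative). Note that even with a fixed vocabulary, fit_transform recomputes IDF weights on the test corpus; keeping the fitted training vectorizer and calling its transform method would avoid that mismatch.

train_bag = joblib.load("wordbag/word_bag.data")
test_bag = testset_tfidf("wordbag/test_set.data", "extra_dict/hlt_stop_words.txt",
                         train_bag.vocabulary)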
Example 3: train_bags
# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import target_name [as alias]
import os
from sklearn.datasets.base import Bunch
from sklearn.externals import joblib

def train_bags(token_path, filename, wordbag_path):
    data_set = Bunch(target_name=[], label=[], filenames=[], contents=[])
    # Every entry under token_path is one category
    dir_list = os.listdir(token_path)
    data_set.target_name = dir_list
    for file in dir_list:
        file_name = token_path + "/" + file
        file_read = open(file_name, "r")
        for line in file_read:
            # Each line is one tokenized document; its label is the category's index
            data_set.label.append(data_set.target_name.index(file))
            data_set.contents.append(line.strip())
        file_read.close()
    # Persist the data set
    joblib.dump(data_set, wordbag_path + "/" + filename, compress=3)
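A hypothetical call, assuming every entry under "token" is a file named after a category and each of its lines is one tokenized document:

train_bags("token", "train_set.data", "wordbag")
# writes wordbag/train_set.data, which Example 1 can then load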
Example 4: reload
# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import target_name [as alias]
import os
import sys
from sklearn.datasets.base import Bunch
from sklearn.externals import joblib
import jieba
from sklearn.feature_extraction.text import HashingVectorizer

reload(sys)  # Python 2 idiom, kept from the original
# sys.setdefaultencoding('utf-8')
token_path = "token" + "/"
# Word-bag corpus path
wordbag_path = "wordbag" + "/"
# Store the corpus in a Bunch
data_set = Bunch(target_name=[], label=[], filenames=[], contents=[])
dir_list = os.listdir(token_path)
data_set.target_name = dir_list
for file in dir_list:
    file_name = token_path + file
    file_read = open(file_name, "r")
    for line in file_read:
        data_set.label.append(data_set.target_name.index(file))
        data_set.contents.append(line.strip())
    file_read.close()
# Persist
joblib.dump(data_set, wordbag_path + "train_set1124.data", compress=3)
# Verify
data_set = joblib.load(wordbag_path + "train_set1124.data")
print(data_set.target_name)
Example 5: reload
# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import target_name [as alias]
import sys
import pickle
from sklearn.datasets.base import Bunch

reload(sys)  # Python 2 idiom, kept from the original
# Load the training corpus
data_set = {}
# Path of the persisted training corpus
train_path = 'text_corpus1_wordbag/train_set.data'
file_obj = open(train_path, 'rb')
# Read the persisted object
data_set = pickle.load(file_obj)
file_obj.close()
# Define the word-bag data structure
wordbag = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={})
wordbag.target_name = data_set.target_name
wordbag.label = data_set.label
wordbag.filenames = data_set.filenames
# Build the corpus
corpus = data_set.contents
# Load the stop-word list from file
stpwrdpath = 'extra_dict/hlt_stop_words.txt'
stpwrd_dic = open(stpwrdpath, 'rb')
stpwrd_content = stpwrd_dic.read()
# Convert the stop words to a list
stpwrdlst = stpwrd_content.splitlines()
stpwrd_dic.close()
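The original snippet stops after building the stop-word list. A sketch of the vectorization step that typically follows, mirroring Example 1 (the parameters are illustrative assumptions, not part of the original):

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words=stpwrdlst)
wordbag.tdm = vectorizer.fit_transform(corpus)   # TF-IDF term-document matrix
wordbag.vocabulary = vectorizer.vocabulary_      # learned vocabulary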
Example 6: Bunch
# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import target_name [as alias]
import os
from sklearn.datasets.base import Bunch

# Word-bag corpus path
wordbag_path = "text_corpus_wordbag/"
if not os.path.exists(wordbag_path):
    os.makedirs(wordbag_path)
# A Bunch provides a key/value object interface:
#   target_name: list of all category names
#   label: category label of each file
#   filenames: file paths
#   contents: file contents
data_set = Bunch(target_name=[], label=[], filenames=[], contents=[])
# All sub-categories under seg_path (the segmented-corpus directory, assumed to be defined earlier in the original script)
class_list = os.listdir(seg_path)
data_set.target_name = class_list
# Walk every file in each sub-directory
for mydir in class_list:
    class_path = seg_path + mydir + "/"
    file_list = os.listdir(class_path)  # all files under class_path
    for file_name in file_list:
        file_path = class_path + file_name
        data_set.filenames.append(file_path)  # record the file path
        data_set.label.append(data_set.target_name.index(mydir))  # record the category label
        with open(file_path, 'r', encoding='gb18030') as file:
            seg_corpus = file.read()  # read the segmented text
        data_set.contents.append(seg_corpus)  # collect the document content
# Persist the training-set object
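The example is truncated after the comment above; a plausible completion, mirroring the persistence step of the earlier examples (the file name is an assumption):

from sklearn.externals import joblib
joblib.dump(data_set, wordbag_path + "train_set.data", compress=3)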