This article compiles typical usage examples of the Python method sklearn.datasets.base.Bunch.vocabulary. If you have been wondering what Bunch.vocabulary does and how to use it, the curated code examples below may help. You can also explore further usage of the class it belongs to, sklearn.datasets.base.Bunch.
The following presents 6 code examples of Bunch.vocabulary, sorted by popularity by default.
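All six examples share one pattern: a Bunch is a dict-like container whose keys are also attributes, and the fitted vocabulary_ of a TfidfVectorizer is stored on it so a later vectorizer can be locked to the same feature space. Here is a minimal, self-contained sketch of that pattern (the two documents are illustrative only; on scikit-learn 0.22+ the class lives at sklearn.utils.Bunch):

from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["first tokenized document", "second tokenized document"]  # illustrative corpus
space = Bunch(tdm=None, vocabulary={})
vectorizer = TfidfVectorizer(sublinear_tf=True)
space.tdm = vectorizer.fit_transform(docs)   # sparse term-document matrix
space.vocabulary = vectorizer.vocabulary_    # dict mapping term -> column index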
Example 1: gen_tf_idf_space
# Required module: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import vocabulary [as alias]
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

def gen_tf_idf_space():
    bunch = read_object(train_data)
    tf_idf_space = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, vocabulary={})
    vectorizer = TfidfVectorizer(stop_words=load_stop_words(), sublinear_tf=True, max_df=0.5)
    transformer = TfidfTransformer()  # created but not used below
    # Build the term-document matrix and keep the fitted vocabulary
    tf_idf_space.tdm = vectorizer.fit_transform(bunch.contents)
    tf_idf_space.vocabulary = vectorizer.vocabulary_
    save_object(tf_idf_space_data, tf_idf_space)
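gen_tf_idf_space assumes project-level helpers (read_object, save_object, load_stop_words) and path constants (train_data, tf_idf_space_data) that the excerpt does not show. A minimal pickle-based sketch of what they might look like; every definition here is an assumption for illustration, not part of the original project:

import pickle

train_data = "train_word_bag/train_set.dat"           # assumed path constant
tf_idf_space_data = "train_word_bag/tfidfspace.dat"   # assumed path constant

def read_object(path):
    # Load a pickled object (e.g. a Bunch) from disk
    with open(path, "rb") as f:
        return pickle.load(f)

def save_object(path, obj):
    # Pickle an object to disk
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def load_stop_words():
    # One stop word per line; path assumed
    with open("train_word_bag/hlt_stop_words.txt", encoding="utf-8") as f:
        return f.read().splitlines()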
Example 2: calc_tfidf
# Required module: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import vocabulary [as alias]
import joblib
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer

def calc_tfidf(trainsetfile, stopwordfile, dstdir):
    data_set = joblib.load(trainsetfile)
    wordbag = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={})
    wordbag.target_name = data_set.target_name
    wordbag.label = data_set.label
    corpus = data_set.contents
    stopwordlist = read_stopword(stopwordfile)
    vectorize = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=3, max_features=50000, stop_words=stopwordlist)
    feature_train = vectorize.fit_transform(corpus)
    wordbag.tdm = feature_train
    wordbag.vocabulary = vectorize.vocabulary_
    joblib.dump(wordbag, dstdir + "/" + "word_bag.data", compress=3)
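calc_tfidf likewise depends on a read_stopword helper that is not shown. A plausible minimal sketch (the name and signature come from the call above; the body is an assumption), followed by an illustrative call:

def read_stopword(path):
    # One stop word per line, returned as a list
    with open(path, encoding="utf-8") as f:
        return f.read().splitlines()

calc_tfidf("data/train_set.dat", "data/stop_words.txt", "data/out")  # illustrative paths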
Example 3: execute_NM_predict
# Required module: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import vocabulary [as alias]
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

def execute_NM_predict():
    test_bunch = read_object(test_data)
    test_space = Bunch(target_name=test_bunch.target_name, label=test_bunch.label, filenames=test_bunch.filenames,
                       tdm=[], vocabulary={})
    tf_idf_bunch = read_object(tf_idf_space_data)
    # Reuse the training vocabulary so the test matrix gets the same columns as the training matrix
    vectorizer = TfidfVectorizer(stop_words=load_stop_words(), sublinear_tf=True, max_df=0.5,
                                 vocabulary=tf_idf_bunch.vocabulary)
    transformer = TfidfTransformer()  # created but not used below
    test_space.tdm = vectorizer.fit_transform(test_bunch.contents)
    test_space.vocabulary = tf_idf_bunch.vocabulary
    clf = MultinomialNB(alpha=0.001).fit(tf_idf_bunch.tdm, tf_idf_bunch.label)
    # Predicted labels for the test set
    predicted = clf.predict(test_space.tdm)
    # Print the results in a friendlier form
    for label, file_name, predicted_cate in zip(test_bunch.label, test_bunch.filenames, predicted):
        print(file_name, ' actual class:', label, ' predicted class:', predicted_cate)
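A natural follow-up to execute_NM_predict is to score the predictions instead of only printing them. A short sketch using sklearn.metrics, assuming test_bunch.label holds the ground-truth labels as in the loop above:

from sklearn import metrics

print(metrics.accuracy_score(test_bunch.label, predicted))
print(metrics.classification_report(test_bunch.label, predicted))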
Example 4: writebunchobj
# Required module: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import vocabulary [as alias]
import pickle
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

# Read a bunch object (only the tail of this function appeared in the excerpt;
# the first lines are reconstructed as the pickle-load counterpart of writebunchobj below)
def readbunchobj(path):
    file_obj = open(path, "rb")
    bunch = pickle.load(file_obj)
    file_obj.close()
    return bunch

# Write a bunch object
def writebunchobj(path, bunchobj):
    file_obj = open(path, "wb")
    pickle.dump(bunchobj, file_obj)
    file_obj.close()

# 1. Read the stop-word list
stopword_path = "train_word_bag/hlt_stop_words.txt"
stpwrdlst = readfile(stopword_path).splitlines()
# 2. Load the tokenized word-vector bunch object
path = "train_word_bag/train_set.dat"  # path where the word-vector space is saved
bunch = readbunchobj(path)
# 3. Build the tf-idf word-vector-space object
tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[], vocabulary={})
# 4. Initialize the vector-space model with TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
transformer = TfidfTransformer()  # computes per-term tf-idf weights (not used below)
# Convert the texts to a term-frequency matrix; the vocabulary is saved separately
tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
tfidfspace.vocabulary = vectorizer.vocabulary_
# Persist the word bag
space_path = "train_word_bag/tfdifspace.dat"  # path where the word-vector space is saved
writebunchobj(space_path, tfidfspace)
print("tf-idf word-vector space created successfully!")
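Example 4 calls a readfile helper that it never defines. A minimal sketch consistent with how it is used here (the stop-word file is read as one string and then split into lines); the implementation is an assumption:

def readfile(path):
    # Return the file contents as a single string
    with open(path, encoding="utf-8") as f:
        return f.read()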
Example 5:
# Required module: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import vocabulary [as alias]
# (This excerpt assumes stpwrd_dic is an already-open file handle on the stop-word list,
# and that corpus, wordbag, datetime and pickle were set up earlier in the script.)
stpwrd_content = stpwrd_dic.read()
# Turn the stop words into a list
stpwrdlst = stpwrd_content.splitlines()
stpwrd_dic.close()
# Time the word-bag construction: record the start time
start = datetime.datetime.now()
# Initialize the vector-space model with TfidfVectorizer -- build the word bag
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words=stpwrdlst)
transformer = TfidfTransformer()  # computes per-term tf-idf weights (not used below)
# Convert the texts to a term-frequency matrix
fea_train = vectorizer.fit_transform(corpus)
# Record the end time
end = datetime.datetime.now()
print('create word bag period', (end - start).seconds)
# Shape (rows, columns) of the word-bag matrix
print('size of fea_train', fea_train.shape)
# Assign the term-document matrix
wordbag.tdm = fea_train
wordbag.vocabulary = vectorizer.vocabulary_
# Persist the word bag
file_obj = open('text_corpus1_wordbag/word_bag.data', 'wb')
pickle.dump(wordbag, file_obj)
file_obj.close()
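The open/dump/close triple at the end of example 5 is more idiomatic as a with block, which closes the file even if pickling fails:

with open('text_corpus1_wordbag/word_bag.data', 'wb') as file_obj:
    pickle.dump(wordbag, file_obj)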
Example 6: writebunchobj
# Required module: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import vocabulary [as alias]
import pickle
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

# Read a bunch object (only `return bunch` appeared in the excerpt; the rest of the
# body is reconstructed as the pickle-load counterpart of writebunchobj below)
def readbunchobj(path):
    file_obj = open(path, "rb")
    bunch = pickle.load(file_obj)
    file_obj.close()
    return bunch

# Write a bunch object
def writebunchobj(path, bunchobj):
    file_obj = open(path, "wb")
    pickle.dump(bunchobj, file_obj)
    file_obj.close()

# 1. Read the stop-word list
stopword_path = "train_word_bag/hlt_stop_words.txt"
stpwrdlst = readfile(stopword_path).splitlines()
# 2. Load the tokenized word-vector bunch object
path = "test_word_bag/test_set.dat"  # path where the word-vector space is saved
bunch = readbunchobj(path)
# 3. Build the test-set tf-idf vector space
testspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[], vocabulary={})
# 4. Load the training-set word bag
trainbunch = readbunchobj("train_word_bag/tfdifspace.dat")
# 5. Initialize the vector-space model with TfidfVectorizer, reusing the training vocabulary
vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, vocabulary=trainbunch.vocabulary)
transformer = TfidfTransformer()  # computes per-term tf-idf weights (not used below)
# Convert the texts to a tf-idf matrix; the vocabulary is saved separately
testspace.tdm = vectorizer.fit_transform(bunch.contents)
testspace.vocabulary = trainbunch.vocabulary
# Persist the word bag
space_path = "test_word_bag/testspace.dat"  # path where the word-vector space is saved
writebunchobj(space_path, testspace)
print("test word-vector space created successfully!")
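Across examples 3 and 6, the point of persisting Bunch.vocabulary is that passing the training vocabulary into the test-set TfidfVectorizer pins both matrices to the same columns, which any classifier fitted on the training matrix requires. A one-line sanity check (assuming the objects from example 6 are still in scope):

assert testspace.tdm.shape[1] == trainbunch.tdm.shape[1]  # identical feature space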