

Python Bunch.tdm Method Code Examples

This article collects typical usage examples of the Python method sklearn.datasets.base.Bunch.tdm, gathered from open-source projects. If you are asking what Bunch.tdm does, how to use it, or where to find working examples, the curated code samples below should help. You can also explore the broader usage of sklearn.datasets.base.Bunch, the class this attribute belongs to.


The following presents 8 code examples of Bunch.tdm, ordered by popularity.
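All of the examples share one pattern: fit a text vectorizer over a corpus, then store the resulting sparse term-document matrix on a Bunch under the attribute name tdm (Bunch accepts arbitrary attributes). A minimal, self-contained sketch of that pattern, using a toy corpus of our own rather than data from any of the projects below:

from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["the cat sat on the mat", "the dog chased the cat"]  # toy corpus for illustration
bunch = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={})
vectorizer = TfidfVectorizer(sublinear_tf=True)
bunch.tdm = vectorizer.fit_transform(corpus)  # sparse tf-idf term-document matrix
bunch.vocabulary = vectorizer.vocabulary_     # term -> column index mapping
print(bunch.tdm.shape)                        # (2, number of distinct terms)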

Example 1: gen_tf_idf_space

# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import tdm [as alias]
def gen_tf_idf_space():
    bunch = read_object(train_data)
    tf_idf_space = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, vocabulary={})

    vectorizer = TfidfVectorizer(stop_words=load_stop_words(), sublinear_tf=True, max_df=0.5)
    transformer = TfidfTransformer()  # instantiated but never used; TfidfVectorizer already applies tf-idf weighting

    tf_idf_space.tdm = vectorizer.fit_transform(bunch.contents)
    tf_idf_space.vocabulary = vectorizer.vocabulary_
    save_object(tf_idf_space_data, tf_idf_space)
Author: Eric-aihua | Project: MachineLearning | Lines: 12 | Source: n_bayes_main.py
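Example 1 depends on the helpers read_object, save_object and load_stop_words and on the paths train_data and tf_idf_space_data, none of which are shown; they are defined elsewhere in the project. A minimal pickle-based sketch of what the two object helpers might look like (names and behavior are our assumptions, not the project's actual code):

import pickle

def read_object(path):
    # Hypothetical helper: load a pickled object (e.g. a Bunch) from disk.
    with open(path, "rb") as f:
        return pickle.load(f)

def save_object(path, obj):
    # Hypothetical helper: pickle an object to disk.
    with open(path, "wb") as f:
        pickle.dump(obj, f)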

Example 2: calc_tfidf

# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import tdm [as alias]
def calc_tfidf(trainsetfile,stopwordfile,dstdir):
    data_set = joblib.load(trainsetfile)
    wordbag = Bunch(target_name=[],label=[],filenames=[],tdm=[],vocabulary={})
    wordbag.target_name = data_set.target_name
    wordbag.label = data_set.label
    
    corpus = data_set.contents
    stopwordlist = read_stopword(stopwordfile)
    vectorize = TfidfVectorizer(sublinear_tf=True,max_df = 0.8,min_df=3,max_features=50000,stop_words=stopwordlist)
    feature_train = vectorize.fit_transform(corpus)
    wordbag.tdm = feature_train
    wordbag.vocabulary = vectorize.vocabulary_
    joblib.dump(wordbag,dstdir+"/"+"word_bag.data",compress=3)
Author: wadeallstar | Project: python-fraud-detect | Lines: 15 | Source: process_tool.py

Example 3: testset_tfidf

# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import tdm [as alias]
def testset_tfidf(testsetfile,stopwordfile,myvocabulary):
    data_set = joblib.load(testsetfile)
    wordbag = Bunch(target_name=[],label=[],filenames=[],tdm=[],vocabulary={})
    wordbag.target_name = data_set.target_name
    wordbag.label = data_set.label
    
    corpus = data_set.contents
    stopwordlist = read_stopword(stopwordfile)
    vectorize = TfidfVectorizer(sublinear_tf=True,stop_words=stopwordlist,vocabulary=myvocabulary)
    feature_train = vectorize.fit_transform(corpus)
    wordbag.tdm = feature_train
    joblib.dump(wordbag,"test_wordbag/test_word_bag.data",compress=3)
    return wordbag
    
Author: wadeallstar | Project: python-fraud-detect | Lines: 15 | Source: process_tool.py
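Examples 2 and 3 are meant to be used together: the test set must be vectorized with the training vocabulary so that both term-document matrices share the same columns, otherwise a classifier trained on one cannot score the other. A hypothetical call sequence (the file paths are placeholders, not the project's actual layout):

import joblib  # on the old scikit-learn these examples target: from sklearn.externals import joblib

calc_tfidf("data/train_set.data", "data/stop_words.txt", "train_wordbag")
train_bag = joblib.load("train_wordbag/word_bag.data")
test_bag = testset_tfidf("data/test_set.data", "data/stop_words.txt", train_bag.vocabulary)
print(train_bag.tdm.shape[1] == test_bag.tdm.shape[1])  # True: shared vocabulary, same columns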

Example 4: execute_NM_predict

# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import tdm [as alias]
def execute_NM_predict():
    test_bunch = read_object(test_data)

    test_space = Bunch(target_name=test_bunch.target_name, label=test_bunch.label, filenames=test_bunch.filenames,
                       tdm=[], vocabulary={})

    tf_idf_bunch = read_object(tf_idf_space_data)
    vectorizer = TfidfVectorizer(stop_words=load_stop_words(), sublinear_tf=True, max_df=0.5,
                                 vocabulary=tf_idf_bunch.vocabulary)
    transformer = TfidfTransformer()  # instantiated but never used; TfidfVectorizer already applies tf-idf weighting

    test_space.tdm = vectorizer.fit_transform(test_bunch.contents)
    test_space.vocabulary = tf_idf_bunch.vocabulary

    clf = MultinomialNB(alpha=0.001).fit(tf_idf_bunch.tdm, tf_idf_bunch.label)
    # Predict the labels
    predicted = clf.predict(test_space.tdm)
    # Print each result in a more readable form
    for label, file_name, predicted_cate in zip(test_bunch.label, test_bunch.filenames, predicted):
        print(file_name, ' actual category:', label, ' predicted category:', predicted_cate)
Author: Eric-aihua | Project: MachineLearning | Lines: 22 | Source: n_bayes_main.py
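When the test labels are trustworthy, the printed per-file comparison can be condensed into standard metrics. A short sketch using sklearn.metrics, reusing the test_bunch and predicted variables from example 4 (this evaluation step is our addition, not part of the original script):

from sklearn import metrics

# Per-class precision/recall/F1 plus overall accuracy for the Naive Bayes predictions.
print(metrics.classification_report(test_bunch.label, predicted))
print("accuracy:", metrics.accuracy_score(test_bunch.label, predicted))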

Example 5: writebunchobj

# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import tdm [as alias]
import pickle
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

# Read a Bunch object (the opening lines of this helper are truncated in the original listing; reconstructed here)
def readbunchobj(path):
	file_obj = open(path, "rb")
	bunch = pickle.load(file_obj)
	file_obj.close()
	return bunch

# Write a Bunch object
def writebunchobj(path, bunchobj):
	file_obj = open(path, "wb")
	pickle.dump(bunchobj, file_obj)
	file_obj.close()

# 1. Read the stop-word list
stopword_path = "train_word_bag/hlt_stop_words.txt"
stpwrdlst = readfile(stopword_path).splitlines()  # readfile (a project helper, not shown) returns the file content as one string

# 2. Load the Bunch object holding the segmented word vectors
path = "train_word_bag/train_set.dat"        # save path of the word-vector space
bunch = readbunchobj(path)

# 3. Build the tf-idf word-vector space object
tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[], vocabulary={})

# 4. Initialize the vector space model with TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
transformer = TfidfTransformer()  # computes per-term tf-idf weights; instantiated but not used below
# Convert the texts to a tf-idf matrix; the vocabulary is saved separately
tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
tfidfspace.vocabulary = vectorizer.vocabulary_

# Persist the word bag
space_path = "train_word_bag/tfdifspace.dat"        # save path of the word-vector space
writebunchobj(space_path, tfidfspace)

print("tf-idf vector space created successfully!")
Author: 2297988468 | Project: Chinese-Text-Classification | Lines: 33 | Source: vector_space.py
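A quick way to check that the persisted space survives a round trip, using the readbunchobj helper defined above (this sanity check is our addition, not part of the original script):

tfidf_loaded = readbunchobj("train_word_bag/tfdifspace.dat")
print(tfidf_loaded.tdm.shape)        # documents x vocabulary size
print(len(tfidf_loaded.vocabulary))  # equals the column count of tdm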

Example 6:

# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import tdm [as alias]
import datetime
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

# This snippet assumes stpwrd_dic (an open stop-word file), corpus (the training texts) and
# wordbag (a Bunch as in the examples above) are created in the truncated part of the script.
stpwrd_content = stpwrd_dic.read()

# Convert the stop words to a list
stpwrdlst = stpwrd_content.splitlines()
stpwrd_dic.close()

# Time the word-bag creation: record the start time
start = datetime.datetime.now()
# Initialize the vector space model with TfidfVectorizer -- create the word bag
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words=stpwrdlst)

# This class computes per-term tf-idf weights (instantiated but not used below)
transformer = TfidfTransformer()

# Convert the texts to a tf-idf matrix
fea_train = vectorizer.fit_transform(corpus)

# Record the end time
end = datetime.datetime.now()
print('word bag creation took', (end - start).seconds, 'seconds')

# Dimensions of the word bag
print('size of fea_train:', fea_train.shape)
# Assign the term-document matrix to tdm
wordbag.tdm = fea_train
wordbag.vocabulary = vectorizer.vocabulary_
# Persist the word bag
file_obj = open('text_corpus1_wordbag/word_bag.data', 'wb')
pickle.dump(wordbag, file_obj)
file_obj.close()
Author: Pengfei-Zhu | Project: DataMining | Lines: 32 | Source: tf-idffinal.py
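Examples 5 and 6 persist the word bag with pickle, while examples 2, 3 and 7 use joblib. For a Bunch dominated by one large sparse matrix, joblib is the more common choice in scikit-learn code: it stores large numeric buffers efficiently and compresses on the fly. A joblib equivalent of the pickle block above (same illustrative path):

import joblib  # older scikit-learn: from sklearn.externals import joblib

joblib.dump(wordbag, 'text_corpus1_wordbag/word_bag.data', compress=3)
wordbag_check = joblib.load('text_corpus1_wordbag/word_bag.data')  # round-trip check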

Example 7: open

# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import tdm [as alias]
# This snippet assumes corpus (the training texts) and wordbag (a Bunch as in the examples above)
# are built in the truncated part of the script; datetime, joblib and TfidfVectorizer must be imported.
# Load the stop words
stopwordpath = "extra_dict/stop_words.txt"
stopword_dic = open(stopwordpath, 'r')
stopword_content = stopword_dic.read()
# Convert the stop words to a list
stopwordlist = stopword_content.splitlines()
stopword_dic.close()

# Start timing the word-bag creation
start = datetime.datetime.now()
print(start)
# Create the word bag with TfidfVectorizer
vectorize = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=3, max_features=50000, stop_words=stopwordlist)

feature_train = vectorize.fit_transform(corpus)

# End of timing
end = datetime.datetime.now()
print('create word bag period:', (end - start).seconds, "seconds")

# Dimensions of the word bag
print('Size of fea_train:\n', feature_train.shape)

wordbag.tdm = feature_train
wordbag.vocabulary = vectorize.vocabulary_
# print(wordbag.vocabulary)
# print(wordbag.tdm[0:5])
# Persist, then reload as a sanity check
joblib.dump(wordbag, "wordbag/word_bag1124.data", compress=9)
word_bag1124 = joblib.load("wordbag/word_bag1124.data")
print(word_bag1124.target_name)
Author: wadeallstar | Project: python-fraud-detect | Lines: 33 | Source: tf-idf.py

Example 8: fetch_20newsgroups

# Required import: from sklearn.datasets.base import Bunch [as alias]
# Or: from sklearn.datasets.base.Bunch import tdm [as alias]
## Load the data
import pickle
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

categories = ["alt.atheism", "soc.religion.christian", "comp.graphics", "sci.med"]  # news categories to fetch
data_set = fetch_20newsgroups(subset="train", categories=categories, shuffle=True, random_state=42)  # downloads the full archive first, then extracts the selected categories
print(data_set.target_names)  # ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
##################################################################
## Define the word-bag data structure
# tdm: the term-document matrix (raw term counts here, since CountVectorizer is used and TfidfTransformer is never applied)
stpwrdlst = []  # empty stop-word list
wordbag = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={}, stpwrdlst=[])
wordbag.target_name = data_set.target_names
wordbag.label = data_set.target
wordbag.filenames = data_set.filenames
wordbag.stpwrdlst = stpwrdlst

vectorizer = CountVectorizer(stop_words=stpwrdlst)  # build the vector space model with CountVectorizer
transformer = TfidfTransformer()  # would compute per-term tf-idf weights, but is not used below
fea_train = vectorizer.fit_transform(data_set.data)  # convert the texts to a term-frequency matrix
print(fea_train.shape)  # (2257, 35788); 2257 documents, 35788 terms

wordbag.tdm = fea_train  # assign the term-document matrix to tdm
wordbag.vocabulary = vectorizer.vocabulary_
##################################################################
## Persist the word bag
file_obj = open("tmp.data", "wb")
pickle.dump(wordbag, file_obj)
file_obj.close()
##################################################################
## Read it back
with open('tmp.data', 'rb') as f: clf2 = pickle.load(f)
print(clf2.tdm.shape)  # (2257, 35788)
Author: coder352 | Project: shellscript | Lines: 32 | Source: l20_Bunch-封裝_pickle-保存.py
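A caveat for running any of these examples on a current scikit-learn: the sklearn.datasets.base module was later made private, so the import used throughout this page may fail. The same Bunch class is available from sklearn.utils; the fallback below is our suggestion, not part of the original examples:

try:
    from sklearn.datasets.base import Bunch  # older scikit-learn releases
except ImportError:
    from sklearn.utils import Bunch  # current scikit-learn releases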


Note: the sklearn.datasets.base.Bunch.tdm examples in this article were compiled by 純淨天空 from GitHub, MSDocs and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors, and distribution or use should follow each project's license. Do not republish without permission.