本文整理汇总了Python中sklearn.datasets.base.Bunch类的典型用法代码示例。如果您正苦于以下问题：Python Bunch类的具体用法？Python Bunch怎么用？Python Bunch使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Bunch类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: shuffleData
def shuffleData(self, res):
    """Shuffle labelled samples in place and pack them into a Bunch.

    Args:
        res: Mutable sequence of indexable items where element 0 is the
            target and element 1 is the sample data. It is shuffled in
            place.

    Returns:
        A Bunch with ``data`` (element 1 of every item), ``target``
        (element 0 of every item) and ``target_names`` (``self.names``).
    """
    shuffle(res)
    train = Bunch()
    # Build concrete lists: on Python 3 ``map`` returns a one-shot iterator,
    # which cannot be indexed, measured with len() or iterated twice.
    train.data = [item[1] for item in res]
    train.target = [item[0] for item in res]
    train.target_names = self.names
    return train
示例2: gen_tf_idf_space
def gen_tf_idf_space():
    """Build the TF-IDF space for the training corpus and persist it.

    Loads the Bunch stored at ``train_data``, vectorizes ``bunch.contents``
    with a ``TfidfVectorizer`` and saves a new Bunch to
    ``tf_idf_space_data``. The saved Bunch carries ``target_name``,
    ``label`` and ``filenames`` copied from the input, plus ``tdm`` (the
    matrix returned by ``fit_transform``) and ``vocabulary`` (the fitted
    ``vocabulary_``).
    """
    bunch = read_object(train_data)
    vectorizer = TfidfVectorizer(stop_words=load_stop_words(), sublinear_tf=True, max_df=0.5)
    # The previously created TfidfTransformer was never used and is dropped.
    term_doc_matrix = vectorizer.fit_transform(bunch.contents)
    tf_idf_space = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames,
                         vocabulary=vectorizer.vocabulary_)
    tf_idf_space.tdm = term_doc_matrix
    save_object(tf_idf_space_data, tf_idf_space)
示例3: calc_tfidf
def calc_tfidf(trainsetfile, stopwordfile, dstdir):
    """Fit TF-IDF on a persisted training set and dump the resulting word bag.

    Args:
        trainsetfile: Path of a joblib dump whose object exposes
            ``target_name``, ``label`` and ``contents``.
        stopwordfile: Path passed to ``read_stopword`` to obtain the stop
            word list.
        dstdir: Directory in which ``word_bag.data`` is written.

    The dumped Bunch has ``target_name`` and ``label`` copied from the
    training set, an empty ``filenames`` list, ``tdm`` (the fitted
    term-document matrix) and ``vocabulary`` (the fitted ``vocabulary_``).
    """
    data_set = joblib.load(trainsetfile)
    wordbag = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={})
    # Read the correctly spelled attribute. The old ``tatget_name`` spelling
    # only ever held the empty placeholder list created by train_bags (and is
    # missing altogether on datasets built without the typo).
    wordbag.target_name = data_set.target_name
    wordbag.label = data_set.label
    corpus = data_set.contents
    stopwordlist = read_stopword(stopwordfile)
    vectorize = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=3, max_features=50000,
                                stop_words=stopwordlist)
    wordbag.tdm = vectorize.fit_transform(corpus)
    wordbag.vocabulary = vectorize.vocabulary_
    joblib.dump(wordbag, dstdir + "/" + "word_bag.data", compress=3)
示例4: testset_tfidf
def testset_tfidf(testsetfile, stopwordfile, myvocabulary):
    """Vectorize a persisted test set with a fixed vocabulary and dump it.

    Args:
        testsetfile: Path of a joblib dump whose object exposes
            ``target_name``, ``label`` and ``contents``.
        stopwordfile: Path passed to ``read_stopword`` to obtain the stop
            word list.
        myvocabulary: Vocabulary handed to ``TfidfVectorizer`` so the test
            matrix uses the same columns as the one it came from.

    Returns:
        The Bunch that was also dumped to
        ``test_wordbag/test_word_bag.data``; it carries ``target_name``,
        ``label`` and ``tdm``.
    """
    data_set = joblib.load(testsetfile)
    wordbag = Bunch(target_name=[], label=[], filenames=[], tdm=[], vocabulary={})
    # Read the correctly spelled attribute. The old ``tatget_name`` spelling
    # only ever held the empty placeholder list created by train_bags (and is
    # missing altogether on datasets built without the typo).
    wordbag.target_name = data_set.target_name
    wordbag.label = data_set.label
    corpus = data_set.contents
    stopwordlist = read_stopword(stopwordfile)
    vectorize = TfidfVectorizer(sublinear_tf=True, stop_words=stopwordlist, vocabulary=myvocabulary)
    # NOTE(review): fit_transform learns the IDF weights from the test corpus
    # itself; only the vocabulary is shared with training. Confirm this is
    # intended before relying on the weights being comparable.
    feature_test = vectorize.fit_transform(corpus)
    wordbag.tdm = feature_test
    joblib.dump(wordbag, "test_wordbag/test_word_bag.data", compress=3)
    return wordbag
示例5: train_bags
def train_bags(token_path, filename, wordbag_path):
    """Collect a line-per-document corpus into a Bunch and persist it.

    Every entry of ``token_path`` is opened as a text file. Each of its
    lines becomes one item of ``contents`` (stripped), labelled with the
    position of that entry in the directory listing.

    Args:
        token_path: Directory whose entries are the per-class text files.
        filename: Name of the dump file to create.
        wordbag_path: Directory the dump file is written to.
    """
    dir_list = os.listdir(token_path)
    data_set = Bunch(target_name=dir_list, label=[], filenames=[], contents=[])
    # Legacy alias: the Bunch used to be created with the misspelled key
    # ``tatget_name`` (left as an empty list). Keep the key so existing
    # readers do not break, but point it at the real class names.
    data_set.tatget_name = dir_list
    # enumerate() yields the same index the per-line ``index()`` lookup
    # produced, because directory entries are unique.
    for label, entry in enumerate(dir_list):
        # ``with`` closes the file even if reading raises.
        with open(token_path + "/" + entry, "r") as file_read:
            for line in file_read:
                data_set.label.append(label)
                data_set.contents.append(line.strip())
    # Persist the dataset
    joblib.dump(data_set, wordbag_path + "/" + filename, compress=3)
示例6: test_bunch_pickle_generated_with_0_16_and_read_with_0_17
def test_bunch_pickle_generated_with_0_16_and_read_with_0_17():
    """Check that an unpickled Bunch ignores a stale ``__dict__`` entry.

    Bunch pickles created with scikit-learn 0.16 carry a non-empty
    ``__dict__``. Reading ``bunch.key`` would consult ``__dict__`` while
    assigning ``bunch.key`` goes through ``__setattr__``, which leads to
    surprising results when such a pickle is read with 0.17. See
    https://github.com/scikit-learn/scikit-learn/issues/6196 for details.
    """
    source = Bunch(key='original')
    # Simulate the 0.16 layout by planting a conflicting __dict__ entry.
    source.__dict__['key'] = 'set from __dict__'
    restored = loads(dumps(source))

    # The planted __dict__ value must not survive the pickle round trip.
    expected = 'original'
    assert_equal(restored.key, expected)
    assert_equal(restored['key'], expected)

    # Attribute assignment must be visible through item access as well.
    expected = 'changed'
    restored.key = expected
    assert_equal(restored.key, expected)
    assert_equal(restored['key'], expected)
示例7: execute_NM_predict
def execute_NM_predict():
test_bunch = read_object(test_data)
test_space = Bunch(target_name=test_bunch.target_name, label=test_bunch.label, filenames=test_bunch.filenames,
tdm=[], vocabulary={})
tf_idf_bunch = read_object(tf_idf_space_data)
vectorizer = TfidfVectorizer(stop_words=load_stop_words(), sublinear_tf=True, max_df=0.5,
vocabulary=tf_idf_bunch.vocabulary)
transformer = TfidfTransformer()
test_space.tdm = vectorizer.fit_transform(test_bunch.contents)
test_space.vocabulary = tf_idf_bunch.vocabulary
clf = MultinomialNB(alpha=0.001).fit(tf_idf_bunch.tdm, tf_idf_bunch.label)
#预测结果
predicted = clf.predict(test_space.tdm)
#对结果进行更加友好的打印
for label, file_name, excect_cate in zip(test_bunch.label, test_bunch.filenames, predicted):
print file_name, ' 实际类别:', label, ' 预测类别:', excect_cate
示例8: scatter3d
def scatter3d(X, fig=None, ax=None, color='b', cs=None, colorsMap='jet'):
    """Draw a 3D scatter plot of the first three columns of ``X`` and show it.

    Args:
        X: Array indexed as ``X[:, 0]``, ``X[:, 1]``, ``X[:, 2]`` for the
            x, y and z coordinates.
        fig: Figure used for the colorbar when ``ax`` is supplied. It is
            ignored when ``ax`` is None, because a new figure is created.
        ax: Existing axes to draw into. When None, a new figure and
            ``Axes3D`` are created.
        color: Point colour used when ``cs`` is None.
        cs: Optional per-point scalar values. When given, points are
            coloured through ``colorsMap`` and a colorbar is added.
        colorsMap: Name of the colormap used with ``cs``.

    Returns:
        A Bunch with attributes ``fig`` and ``ax``.
    """
    scalarMap = None
    if cs is not None:
        cm = plt.get_cmap(colorsMap)
        cNorm = matplotlib.colors.Normalize(vmin=min(cs), vmax=max(cs))
        scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cm)
    if ax is None:
        fig = plt.figure()
        # NOTE(review): recent matplotlib releases may not attach an Axes3D
        # built this way to the figure automatically — confirm against the
        # installed version.
        ax = Axes3D(fig)
    elif fig is None:
        # The caller passed only the axes: recover its figure so the colorbar
        # call below and the returned Bunch do not operate on None.
        fig = ax.get_figure()
    if scalarMap is None:
        ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color)
    else:
        ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=scalarMap.to_rgba(cs))
        scalarMap.set_array(cs)
        fig.colorbar(scalarMap)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')
    plt.show()
    b = Bunch()
    b.fig = fig
    b.ax = ax
    return b
示例9: reload
import os
from sklearn.datasets.base import Bunch
from sklearn.externals import joblib
import jieba
from sklearn.feature_extraction.text import HashingVectorizer
# NOTE(review): ``sys`` is not imported in the visible part of this file, and
# with setdefaultencoding disabled below this reload looks vestigial —
# confirm before removing it.
reload(sys)
# sys.setdefaultencoding('utf-8')
# Directory holding the tokenized corpus: one file per class, one document per line
token_path = "token"+"/"
# Directory the bag-of-words dataset is written to
wordbag_path = "wordbag"+"/"
# The dataset is stored in a Bunch
data_set = Bunch(target_name=[],label=[],filenames=[],contents=[])
dir_list = os.listdir(token_path)
data_set.target_name = dir_list
# enumerate() yields the same index the per-line ``index()`` lookup produced
# (directory entries are unique) and avoids shadowing the builtin ``file``.
for class_index, class_file in enumerate(dir_list):
    file_name = token_path+class_file
    # ``with`` closes the file even if reading raises.
    with open(file_name,"r") as file_read:
        for line in file_read:
            data_set.label.append(class_index)
            data_set.contents.append(line.strip())
# Persist the dataset
joblib.dump(data_set, wordbag_path+"train_set1124.data", compress=3)
# Verification
示例10: reload
from sklearn.feature_extraction.text import TfidfVectorizer
reload(sys)
# Path of the persisted training corpus
train_path='text_corpus1_wordbag/train_set.data'
# Load the persisted dataset object; ``with`` closes the file even on error.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# data from a trusted source.
with open(train_path,'rb') as file_obj:
    data_set=pickle.load(file_obj)
# Word-bag data structure
wordbag=Bunch(target_name=[],label=[],filenames=[],tdm=[],vocabulary={})
wordbag.target_name=data_set.target_name
wordbag.label=data_set.label
wordbag.filenames=data_set.filenames
# Corpus to vectorize
corpus=data_set.contents
# Read the stop-word table from file
stpwrdpath='extra_dict/hlt_stop_words.txt'
with open(stpwrdpath,'rb') as stpwrd_dic:
    stpwrd_content=stpwrd_dic.read()
# Turn the stop words into a list, one entry per line
stpwrdlst=stpwrd_content.splitlines()
示例11: main
def main():
vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=False, ngram_range=(1, 1),
token_pattern='\\b\\w+\\b') #, tokenizer=StemTokenizer())
vct_analizer = vct.build_tokenizer()
print("Start loading ...")
# data fields: data, bow, file_names, target_names, target
########## NEWS GROUPS ###############
# easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
categories = [['alt.atheism', 'talk.religion.misc'],
['comp.graphics', 'comp.windows.x'],
['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
['rec.sport.baseball', 'sci.crypt']]
min_size = 10 # max(10, args.fixk)
args.fixk = None
data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)
print("Data %s" % args.train)
print("Data size %s" % len(data.train.data))
### SENTENCE TRANSFORMATION
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
## delete <br> to "." to recognize as end of sentence
data.train.data = experiment_utils.clean_html(data.train.data)
data.test.data = experiment_utils.clean_html(data.test.data)
print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
## Get the features of the sentence dataset
## create splits of data: pool, test, oracle, sentences
expert_data = Bunch()
train_test_data = Bunch()
expert_data.sentence, train_test_data.pool = split_data(data.train)
expert_data.oracle, train_test_data.test = split_data(data.test)
data.train.data = train_test_data.pool.train.data
data.train.target = train_test_data.pool.train.target
data.test.data = train_test_data.test.train.data
data.test.target = train_test_data.test.train.target
## convert document to matrix
data.train.bow = vct.fit_transform(data.train.data)
data.test.bow = vct.transform(data.test.data)
#### EXPERT CLASSIFIER: ORACLE
print("Training Oracle expert")
labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector)
expert_data.oracle.train.data = sent_train
expert_data.oracle.train.target = np.array(labels)
expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)
exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)
#### EXPERT CLASSIFIER: SENTENCES
print("Training sentence expert")
labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector)
expert_data.sentence.train.data = sent_train
expert_data.sentence.train.target = np.array(labels)
expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)
sent_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)
#### TESTING THE CLASSIFERS
test_target, test_data = split_data_sentences(data.test,sent_detector)
test_data_bow = vct.transform(test_data)
#pred_sent = sent_clf.predict(test_data_bow)
pred_ora = exp_clf.predict(test_data_bow)
y_probas = sent_clf.predict_proba(test_data_bow)
pred_sent = sent_clf.classes_[np.argmax(y_probas, axis=1)]
## just based on one class probability
# order = np.argsort(y_probas[:,0])
order = np.argsort(y_probas.max(axis=1))
print "ORACLE\tSENTENCE\tMAX-SENT"
# for i in order[:500]:
# print pred_ora[i],pred_sent[i], y_probas[i,0], test_data[i]
for i in order[-500:]:
print pred_ora[i],pred_sent[i], y_probas[i,0], test_data[i]
print "Accuracy of Sentences Classifier", metrics.accuracy_score(test_target, pred_sent)
print "Class distribution: %s" % pred_sent.sum()
print "Size of data: %s" % pred_sent.shape[0]
sizes = [50, 100, 500, 1000, 2000, 3000, 4000, 20000]
#.........这里部分代码省略.........
示例12: main
def main():
print args
print
accuracies = defaultdict(lambda: [])
ora_accu = defaultdict(lambda: [])
oracle_accuracies =[]
ora_cm = defaultdict(lambda: [])
lbl_dit = defaultdict(lambda: [])
aucs = defaultdict(lambda: [])
x_axis = defaultdict(lambda: [])
vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1),
token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
print("Start loading ...")
# data fields: data, bow, file_names, target_names, target
########## NEWS GROUPS ###############
# easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
categories = [['alt.atheism', 'talk.religion.misc'],
['comp.graphics', 'comp.windows.x'],
['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
['rec.sport.baseball', 'sci.crypt']]
min_size = 10
args.fixk = None
data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)
print("Data %s" % args.train)
print("Data size %s" % len(data.train.data))
parameters = experiment_utils.parse_parameters_mat(args.cost_model)
print "Cost Parameters %s" % parameters
cost_model = experiment_utils.set_cost_model(args.cost_function, parameters=parameters)
print "\nCost Model: %s" % cost_model.__class__.__name__
### SENTENCE TRANSFORMATION
if args.train == "twitter":
sent_detector = TwitterSentenceTokenizer()
else:
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
## delete <br> to "." to recognize as end of sentence
data.train.data = experiment_utils.clean_html(data.train.data)
data.test.data = experiment_utils.clean_html(data.test.data)
print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
## Get the features of the sentence dataset
## create splits of data: pool, test, oracle, sentences
expert_data = Bunch()
if not args.fulloracle:
train_test_data = Bunch()
expert_data.sentence, train_test_data.pool = split_data(data.train)
expert_data.oracle, train_test_data.test = split_data(data.test)
data.train.data = train_test_data.pool.train.data
data.train.target = train_test_data.pool.train.target
data.test.data = train_test_data.test.train.data
data.test.target = train_test_data.test.train.target
## convert document to matrix
data.train.bow = vct.fit_transform(data.train.data)
data.test.bow = vct.transform(data.test.data)
#### EXPERT CLASSIFIER: ORACLE
print("Training Oracle expert")
exp_clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty)
if not args.fulloracle:
print "Training expert documents:%s" % len(expert_data.oracle.train.data)
labels, sent_train = experiment_utils.split_data_sentences(expert_data.oracle.train, sent_detector, vct, limit=args.limit)
expert_data.oracle.train.data = sent_train
expert_data.oracle.train.target = np.array(labels)
expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)
exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)
else:
# expert_data.data = np.concatenate((data.train.data, data.test.data))
# expert_data.target = np.concatenate((data.train.target, data.test.target))
expert_data.data =data.train.data
expert_data.target = data.train.target
expert_data.target_names = data.train.target_names
labels, sent_train = experiment_utils.split_data_sentences(expert_data, sent_detector, vct, limit=args.limit)
expert_data.bow = vct.transform(sent_train)
expert_data.target = labels
expert_data.data = sent_train
exp_clf.fit(expert_data.bow, expert_data.target)
#.........这里部分代码省略.........
示例13: main
#.........这里部分代码省略.........
print ("Anytime active learning experiment - use objective function to pick data")
t0 = time.time()
tac = []
tau = []
### experiment starts
for t in range(args.trials):
trial_accu = []
trial_aucs = []
print "*" * 60
print "Trial: %s" % t
if args.student in "anyunc":
student = randomsampling.AnytimeLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct,
subpool=250, cost_model=cost_model)
elif args.student in "lambda":
student = randomsampling.AnytimeLearnerDiff(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct,
subpool=250, cost_model=cost_model, lambda_value=args.lambda_value)
elif args.student in "anyzero":
student = randomsampling.AnytimeLearnerZeroUtility(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct,
subpool=250, cost_model=cost_model)
else:
raise ValueError("Oops! We do not know that anytime strategy. Try again.")
print "\nStudent: %s " % student
train_indices = []
neutral_text = [] # save the raw text of the queries
neutral_data = [] # save the xik vectors
train_x = []
train_y = []
neu_x = [] # data to train the classifier
neu_y = np.array([])
pool = Bunch()
pool.data = data.train.bow.tocsr() # full words, for training
pool.text = data.train.data
# pool.fixk = data.train.bowk.tocsr() # k words BOW for querying
pool.target = data.train.target
pool.predicted = []
# pool.kwords = np.array(data.train.kwords) # k words
pool.remaining = set(range(pool.data.shape[0])) # indices of the pool
bootstrapped = False
current_cost = 0
iteration = 0
query_index = None
query_size = None
while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
util = []
if not bootstrapped:
## random from each bootstrap
bt = randomsampling.BootstrapFromEach(t * 10)
query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
bootstrapped = True
query = pool.data[query_index]
print "Bootstrap: %s " % bt.__class__.__name__
print
else:
# print "pick instance"
## chose returns: index, k
## util returns: utility, k, unc
query_chosen, util = student.pick_next(pool=pool, step_size=step_size)
query_index = [a for a, b in query_chosen]
示例14: get_data
def get_data(clf, train, cats, fixk, min_size, vct, raw, limit=2):
    """Load a dataset, split it, and train the oracle and sentence experts.

    Args:
        clf: Classifier prototype; shallow copies of it are fitted as the
            oracle and sentence experts.
        train: Dataset identifier forwarded to ``load_from_file``.
        cats: Categories forwarded to ``load_from_file``.
        fixk: Forwarded to ``load_from_file``.
        min_size: Ignored; the value is overridden with 10 (see note below).
        vct: Vectorizer, fitted on the pool documents and used to transform
            every other split.
        raw: Forwarded to ``load_from_file`` as ``raw``.
        limit: Forwarded to ``split_data_sentences`` as ``limit``.

    Returns:
        Tuple ``(exp_clf, data, vct, sent_clf, expert_data)``.
    """
    import copy

    # NOTE(review): the caller's min_size is discarded and the global
    # ``args.fixk`` is reset here; both are kept as-is because callers may
    # rely on it — confirm.
    min_size = 10
    args.fixk = None
    # The vectorizer returned by load_from_file is not used; ``vct`` is.
    data, _ = load_from_file(train, cats, fixk, min_size, vct, raw=raw)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    ### SENTENCE TRANSFORMATION
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## replace <br> with "." so it is recognized as an end of sentence
    data.train.data = clean_html(data.train.data)
    data.test.data = clean_html(data.test.data)

    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))

    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    train_test_data = Bunch()
    expert_data.sentence, train_test_data.pool = split_data(data.train)
    expert_data.oracle, train_test_data.test = split_data(data.test)

    data.train.data = train_test_data.pool.train.data
    data.train.target = train_test_data.pool.train.target
    data.test.data = train_test_data.test.train.data
    data.test.target = train_test_data.test.train.target

    ## convert documents to matrices
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")
    labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector, vct, limit=limit)
    # Single-argument print(...) emits the same text under the Python 2 print
    # statement and the Python 3 function, matching the other calls here.
    print(len(sent_train))
    expert_data.oracle.train.data = sent_train
    expert_data.oracle.train.target = np.array(labels)
    expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)
    print(expert_data.oracle.train.bow.shape)
    exp_clf = copy.copy(clf)
    exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)

    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector, vct, limit=limit)
    expert_data.sentence.train.data = sent_train
    expert_data.sentence.train.target = np.array(labels)
    expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)
    sent_clf = copy.copy(clf)
    sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)

    return exp_clf, data, vct, sent_clf, expert_data
示例15: Bunch
from sklearn.datasets.base import Bunch
# Path of the segmented (tokenized), per-class corpus
seg_path = "text_corpus_segment/"
# Path where the bag-of-words corpus is stored
wordbag_path = "text_corpus_wordbag/"
if not os.path.exists(wordbag_path):
    os.makedirs(wordbag_path)
# Bunch provides key/value storage with attribute access
# target_name: list of all class names
# label: class label of every file
# filenames: path of every file
# contents: content of every file
data_set = Bunch(target_name=[], label=[], filenames=[], contents=[])
# Every entry of seg_path is treated as one class sub-directory
class_list = os.listdir(seg_path)
data_set.target_name = class_list
# Walk all files of every class sub-directory
for mydir in class_list:
    class_path = seg_path + mydir + "/"
    file_list = os.listdir(class_path)  # all files under class_path
    for file_name in file_list:
        file_path = class_path + file_name
        data_set.filenames.append(file_path)  # record the file path in the dataset
        data_set.label.append(data_set.target_name.index(mydir))  # record the class index as the label
        with open(file_path, 'r', encoding='gb18030') as file:
            seg_corpus = file.read()  # read the corpus text