本文整理匯總了Python中sklearn.datasets.base.Bunch.oracle方法的典型用法代碼示例。如果您正苦於以下問題:Python Bunch.oracle方法的具體用法?Python Bunch.oracle怎麽用?Python Bunch.oracle使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類sklearn.datasets.base.Bunch的用法示例。
在下文中一共展示了Bunch.oracle方法的3個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。(注意:在以下示例中,oracle 並不是 Bunch 類自帶的方法,而是代碼中賦值給 Bunch 實例的屬性,例如 expert_data.oracle。)
示例1: main
# 需要導入模塊: from sklearn.datasets.base import Bunch [as 別名]
# 或者: from sklearn.datasets.base.Bunch import oracle [as 別名]
def main():
# Experiment driver (excerpt): loads a text corpus, optionally carves expert
# data out of the train/test splits, vectorizes the documents with TF-IDF and
# fits an "oracle" expert classifier on sentence-level data.
#
# NOTE(review): this listing uses Python 2 syntax (print statements), its
# leading indentation was lost, and the function is cut off after the expert
# classifier is fitted. The extents of the if/else blocks below are therefore
# not visible here -- confirm against the original source before editing.
# `args` is not defined in this function; it is read (and written) as a name
# from an enclosing scope.
print args
print
# Result accumulators keyed on demand; none of them is used in the visible
# part of the function.
accuracies = defaultdict(lambda: [])
ora_accu = defaultdict(lambda: [])
oracle_accuracies =[]
ora_cm = defaultdict(lambda: [])
lbl_dit = defaultdict(lambda: [])
aucs = defaultdict(lambda: [])
x_axis = defaultdict(lambda: [])
# Unigram TF-IDF vectorizer; terms appearing in fewer than 5 documents are
# dropped (min_df=5) and tokenization is delegated to StemTokenizer.
vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1),
token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
print("Start loading ...")
# data fields: data, bow, file_names, target_names, target
########## NEWS GROUPS ###############
# easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
categories = [['alt.atheism', 'talk.religion.misc'],
['comp.graphics', 'comp.windows.x'],
['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
['rec.sport.baseball', 'sci.crypt']]
min_size = 10
# fixk is forced to None before loading, whatever value it held before.
args.fixk = None
# Only the last category pair (baseball vs. crypt) is handed to the loader.
# The loader also returns a vectorizer, which replaces the one built above.
data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)
print("Data %s" % args.train)
print("Data size %s" % len(data.train.data))
parameters = experiment_utils.parse_parameters_mat(args.cost_model)
print "Cost Parameters %s" % parameters
cost_model = experiment_utils.set_cost_model(args.cost_function, parameters=parameters)
print "\nCost Model: %s" % cost_model.__class__.__name__
### SENTENCE TRANSFORMATION
# The twitter dataset gets its own sentence splitter; everything else uses
# NLTK's pre-trained English Punkt model.
if args.train == "twitter":
sent_detector = TwitterSentenceTokenizer()
else:
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
## delete <br> to "." to recognize as end of sentence
data.train.data = experiment_utils.clean_html(data.train.data)
data.test.data = experiment_utils.clean_html(data.test.data)
print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
## Get the features of the sentence dataset
## create splits of data: pool, test, oracle, sentences
expert_data = Bunch()
# Without --fulloracle, each of train and test is split in two: one part goes
# to the expert (sentence / oracle), the other becomes the pool / test set
# that overwrites data.train / data.test.
# NOTE(review): presumably the seven lines after this `if` form its body and
# the vectorization that follows runs in both cases -- TODO confirm.
if not args.fulloracle:
train_test_data = Bunch()
expert_data.sentence, train_test_data.pool = split_data(data.train)
expert_data.oracle, train_test_data.test = split_data(data.test)
data.train.data = train_test_data.pool.train.data
data.train.target = train_test_data.pool.train.target
data.test.data = train_test_data.test.train.data
data.test.target = train_test_data.test.train.target
## convert document to matrix
# The vectorizer is fitted on the training documents only and then reused
# unchanged for the test documents and for the expert sentences below.
data.train.bow = vct.fit_transform(data.train.data)
data.test.bow = vct.transform(data.test.data)
#### EXPERT CLASSIFIER: ORACLE
print("Training Oracle expert")
exp_clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty)
if not args.fulloracle:
# Expert is trained on the oracle part split off from the test data,
# after it has been passed through split_data_sentences.
print "Training expert documents:%s" % len(expert_data.oracle.train.data)
labels, sent_train = experiment_utils.split_data_sentences(expert_data.oracle.train, sent_detector, vct, limit=args.limit)
expert_data.oracle.train.data = sent_train
expert_data.oracle.train.target = np.array(labels)
expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)
exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)
else:
# Full-oracle mode: the expert is trained on the training data itself
# (the commented-out lines would have used train and test together).
# expert_data.data = np.concatenate((data.train.data, data.test.data))
# expert_data.target = np.concatenate((data.train.target, data.test.target))
expert_data.data =data.train.data
expert_data.target = data.train.target
expert_data.target_names = data.train.target_names
labels, sent_train = experiment_utils.split_data_sentences(expert_data, sent_detector, vct, limit=args.limit)
expert_data.bow = vct.transform(sent_train)
# Unlike the branch above, labels are stored as returned, without np.array().
expert_data.target = labels
expert_data.data = sent_train
exp_clf.fit(expert_data.bow, expert_data.target)
#.........這裏部分代碼省略.........
示例2: main
# 需要導入模塊: from sklearn.datasets.base import Bunch [as 別名]
# 或者: from sklearn.datasets.base.Bunch import oracle [as 別名]
def main():
# Diagnostic script (excerpt): trains two sentence-level classifiers -- an
# "oracle" expert on data split off from the test set and a "sentence" expert
# on data split off from the training set -- then compares their predictions
# on the sentences of the remaining test documents.
#
# NOTE(review): this listing uses Python 2 syntax (print statements), its
# leading indentation was lost, and the function is cut off after `sizes` is
# defined. `args` is not defined in this function; it comes from an enclosing
# scope.
# Unigram TF-IDF vectorizer; the stemming tokenizer is commented out here.
vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=False, ngram_range=(1, 1),
token_pattern='\\b\\w+\\b') #, tokenizer=StemTokenizer())
# Not used in the visible part of the function.
vct_analizer = vct.build_tokenizer()
print("Start loading ...")
# data fields: data, bow, file_names, target_names, target
########## NEWS GROUPS ###############
# easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
categories = [['alt.atheism', 'talk.religion.misc'],
['comp.graphics', 'comp.windows.x'],
['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
['rec.sport.baseball', 'sci.crypt']]
min_size = 10 # max(10, args.fixk)
# fixk is forced to None before loading, whatever value it held before.
args.fixk = None
# Only the last category pair (baseball vs. crypt) is handed to the loader.
# The loader also returns a vectorizer, which replaces the one built above.
data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)
print("Data %s" % args.train)
print("Data size %s" % len(data.train.data))
### SENTENCE TRANSFORMATION
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
## delete <br> to "." to recognize as end of sentence
data.train.data = experiment_utils.clean_html(data.train.data)
data.test.data = experiment_utils.clean_html(data.test.data)
print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
## Get the features of the sentence dataset
## create splits of data: pool, test, oracle, sentences
# Each of train and test is split in two: one part goes to an expert
# (sentence / oracle), the other becomes the pool / test set that overwrites
# data.train / data.test.
expert_data = Bunch()
train_test_data = Bunch()
expert_data.sentence, train_test_data.pool = split_data(data.train)
expert_data.oracle, train_test_data.test = split_data(data.test)
data.train.data = train_test_data.pool.train.data
data.train.target = train_test_data.pool.train.target
data.test.data = train_test_data.test.train.data
data.test.target = train_test_data.test.train.target
## convert document to matrix
# The vectorizer is fitted on the pool documents only and reused unchanged
# for every later transform() call.
data.train.bow = vct.fit_transform(data.train.data)
data.test.bow = vct.transform(data.test.data)
#### EXPERT CLASSIFIER: ORACLE
print("Training Oracle expert")
labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector)
expert_data.oracle.train.data = sent_train
expert_data.oracle.train.target = np.array(labels)
expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)
# L1-regularized logistic regression; C comes from args.expert_penalty.
exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)
#### EXPERT CLASSIFIER: SENTENCES
print("Training sentence expert")
labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector)
expert_data.sentence.train.data = sent_train
expert_data.sentence.train.target = np.array(labels)
expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)
# Same model configuration as the oracle expert, different training data.
sent_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)
#### TESTING THE CLASSIFIERS
# Both experts are evaluated on the sentences of the held-out test documents.
test_target, test_data = split_data_sentences(data.test,sent_detector)
test_data_bow = vct.transform(test_data)
#pred_sent = sent_clf.predict(test_data_bow)
pred_ora = exp_clf.predict(test_data_bow)
y_probas = sent_clf.predict_proba(test_data_bow)
# Label with the highest predicted probability for each sentence.
pred_sent = sent_clf.classes_[np.argmax(y_probas, axis=1)]
## just based on one class probability
# order = np.argsort(y_probas[:,0])
# Indices sorted by the largest class probability, ascending, so the most
# confident sentences come last.
order = np.argsort(y_probas.max(axis=1))
print "ORACLE\tSENTENCE\tMAX-SENT"
# for i in order[:500]:
# print pred_ora[i],pred_sent[i], y_probas[i,0], test_data[i]
# Dump the 500 sentences the sentence classifier is most confident about.
# NOTE(review): the header says MAX-SENT but the third column printed is
# y_probas[i,0] (first column only), not the max used for ordering -- confirm
# which one is intended.
for i in order[-500:]:
print pred_ora[i],pred_sent[i], y_probas[i,0], test_data[i]
print "Accuracy of Sentences Classifier", metrics.accuracy_score(test_target, pred_sent)
# NOTE(review): the sum of predicted labels is a class count only if the
# labels are 0/1 -- TODO confirm.
print "Class distribution: %s" % pred_sent.sum()
print "Size of data: %s" % pred_sent.shape[0]
# Not used in the visible part of the function.
sizes = [50, 100, 500, 1000, 2000, 3000, 4000, 20000]
#.........這裏部分代碼省略.........
示例3: get_data
# 需要導入模塊: from sklearn.datasets.base import Bunch [as 別名]
# 或者: from sklearn.datasets.base.Bunch import oracle [as 別名]
def _fit_sentence_expert(clf, split, sent_detector, vct, limit, verbose=False):
    """Fit a copy of ``clf`` on the output of ``split_data_sentences(split)``.

    ``split`` is modified in place: its ``data`` and ``target`` attributes are
    replaced by what ``split_data_sentences`` returns (targets wrapped in a
    NumPy array) and ``bow`` is set to ``vct.transform`` of the new data.

    Args:
        clf: Classifier prototype; it is shallow-copied, never fitted itself.
        split: Object with ``data``/``target`` attributes to be converted.
        sent_detector: Sentence tokenizer forwarded to ``split_data_sentences``.
        vct: Already fitted vectorizer; only ``transform`` is called on it.
        limit: Forwarded to ``split_data_sentences`` as ``limit=``.
        verbose: When true, print the number of items returned by
            ``split_data_sentences`` and the shape of the resulting matrix.

    Returns:
        The fitted copy of ``clf``.
    """
    import copy

    labels, sentences = split_data_sentences(split, sent_detector, vct, limit=limit)
    if verbose:
        print(len(sentences))

    split.data = sentences
    split.target = np.array(labels)
    split.bow = vct.transform(split.data)
    if verbose:
        print(split.bow.shape)

    # Shallow copy, exactly as before, so the prototype itself is not fitted.
    # NOTE(review): a shallow copy still shares mutable parameter objects with
    # ``clf``; for scikit-learn estimators ``sklearn.base.clone`` is the usual
    # way to get an independent unfitted copy -- consider switching.
    expert = copy.copy(clf)
    expert.fit(split.bow, split.target)
    return expert


def get_data(clf, train, cats, fixk, min_size, vct, raw, limit=2):
    """Load a corpus, split it into pool/test/expert parts and train two experts.

    The train and test sets returned by ``load_from_file`` are each split in
    two with ``split_data``: one part is kept for an expert classifier
    (``expert_data.sentence`` from train, ``expert_data.oracle`` from test) and
    the other replaces ``data.train`` / ``data.test``. ``vct`` is fitted on the
    remaining training documents, and a copy of ``clf`` is fitted on each of
    the two expert parts after ``split_data_sentences`` has processed them.

    Args:
        clf: Classifier prototype, copied once per expert.
        train: Dataset identifier forwarded to ``load_from_file``.
        cats: Categories forwarded to ``load_from_file``.
        fixk: Forwarded to ``load_from_file``.
        min_size: Accepted for interface compatibility but ignored; the value
            10 is always used (see the note in the body).
        vct: Vectorizer; fitted here on the pool documents.
        raw: Forwarded to ``load_from_file`` as ``raw=``.
        limit: Forwarded to ``split_data_sentences`` as ``limit=``.

    Returns:
        Tuple ``(exp_clf, data, vct, sent_clf, expert_data)``: the oracle
        expert, the loaded data with ``bow`` matrices attached, the fitted
        vectorizer, the sentence expert and the expert data splits.
    """
    # NOTE(review): the caller's ``min_size`` is overwritten, and the
    # module-level ``args.fixk`` is reset even though it is the ``fixk``
    # parameter that is passed on. Both are kept as-is because callers may
    # depend on them -- confirm whether either is intentional.
    min_size = 10
    args.fixk = None

    # The vectorizer returned by the loader was never used; ``vct`` as passed
    # in is the one fitted and returned below.
    data, _ = load_from_file(train, cats, fixk, min_size, vct, raw=raw)
    # Report the dataset that was actually loaded (previously this printed the
    # global ``args.train`` rather than the ``train`` argument).
    print("Data %s" % train)
    print("Data size %s" % len(data.train.data))

    ### SENTENCE TRANSFORMATION
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## delete <br> to "." to recognize as end of sentence
    data.train.data = clean_html(data.train.data)
    data.test.data = clean_html(data.test.data)
    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))

    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    train_test_data = Bunch()
    expert_data.sentence, train_test_data.pool = split_data(data.train)
    expert_data.oracle, train_test_data.test = split_data(data.test)

    data.train.data = train_test_data.pool.train.data
    data.train.target = train_test_data.pool.train.target
    data.test.data = train_test_data.test.train.data
    data.test.target = train_test_data.test.train.target

    ## convert document to matrix
    # Fit on the pool only; the test set and both expert sets reuse this
    # vocabulary through transform().
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")
    exp_clf = _fit_sentence_expert(clf, expert_data.oracle.train, sent_detector, vct, limit, verbose=True)

    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    sent_clf = _fit_sentence_expert(clf, expert_data.sentence.train, sent_detector, vct, limit)

    return exp_clf, data, vct, sent_clf, expert_data