本文整理汇总了Python中rep.test.test_estimators.generate_classification_data函数的典型用法代码示例。如果您正苦于以下问题：Python generate_classification_data函数的具体用法？Python generate_classification_data怎么用？Python generate_classification_data使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了generate_classification_data函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_cache_classifier
def test_cache_classifier():
    """Check when the caching wrappers reuse a stored fit and when they refit.

    Runs the same scenario for (CacheClassifier, LogisticRegression) and
    (CacheRegressor, LinearRegression) and inspects ``_used_cache`` after
    every fit:

    * first fit under a name                      -> cache not used
    * same name, equal data (``X + 0``, ``y + 0``) -> cache used
    * different name / data / labels              -> cache not used
    * ``sample_weight`` passed explicitly         -> cache not used
    * different model parameters                  -> cache not used
    * the last configuration fitted again         -> cache used

    The cache is cleared before the scenario and again afterwards; the final
    clean-up sits in a ``finally`` clause so that a failing assertion does not
    leave cache entries behind.
    """
    cache_helper.clear_cache()
    try:
        for Wrapper, Model in [(CacheClassifier, LogisticRegression), (CacheRegressor, LinearRegression)]:
            X, y, _ = generate_classification_data(n_classes=2)

            clf = Wrapper('first', Model()).fit(X, y)
            assert not clf._used_cache

            # equal values held in new objects
            clf = Wrapper('first', Model()).fit(X + 0, y + 0)
            assert clf._used_cache

            # changed name
            clf = Wrapper('second', Model()).fit(X, y)
            assert not clf._used_cache

            # changed data
            X_new = X.copy()
            X_new.iloc[0, 0] += 1
            clf = Wrapper('first', Model()).fit(X_new, y)
            assert not clf._used_cache

            # changed labels
            y_new = y.copy()
            y_new[0] += 1
            clf = Wrapper('first', Model()).fit(X, y_new)
            assert not clf._used_cache

            # added weights (sample_weight is passed explicitly, as None)
            clf = Wrapper('first', Model()).fit(X, y, sample_weight=None)
            assert not clf._used_cache

            # changed parameters
            clf = Wrapper('first', Model(n_jobs=2)).fit(X, y)
            assert not clf._used_cache

            # fitting previous once again. Checking that overwriting is correct.
            clf = Wrapper('first', Model(n_jobs=2)).fit(X, y)
            assert clf._used_cache
    finally:
        cache_helper.clear_cache()
示例2: check_folding
def check_folding(classifier, check_instance=True, has_staged_pp=True, has_importances=True):
    """Fit a folding classifier on generated data and sanity-check its output.

    :param classifier: classifier under test; it is fitted inside this function
    :param check_instance: forwarded to ``check_classification_model``
    :param has_staged_pp: forwarded to ``check_classification_model``; when
        true, ``staged_predict_proba`` is checked as well
    :param has_importances: forwarded to ``check_classification_model``
    """
    def average_votes(stacked):
        # vote function handed to predict / predict_proba / staged_predict_proba
        return numpy.mean(stacked, axis=0)

    X, y, sample_weight = generate_classification_data(distance=0.6)

    # fit is expected to hand back the classifier itself
    fitted = classifier.fit(X, y, sample_weight=sample_weight)
    assert classifier == fitted
    assert list(classifier.features) == list(X.columns)

    check_classification_model(classifier, X, y,
                               check_instance=check_instance,
                               has_staged_pp=has_staged_pp,
                               has_importances=has_importances)

    predicted_labels = classifier.predict(X, average_votes)
    proba = classifier.predict_proba(X, average_votes)
    # a second call must give exactly the same probabilities
    proba_again = classifier.predict_proba(X, average_votes)
    assert numpy.all(proba == proba_again)

    accuracy = accuracy_score(y, predicted_labels)
    print(accuracy)
    assert accuracy > 0.7
    row_totals = proba.sum(axis=1)
    assert numpy.allclose(row_totals, 1), 'probabilities do not sum to 1'
    assert numpy.all(proba >= 0.), 'negative probabilities'

    auc = roc_auc_score(y, proba[:, 1])
    print(auc)
    assert auc > 0.8

    if not has_staged_pp:
        return

    expected_shape = (len(X), 2)
    for stage_proba in classifier.staged_predict_proba(X, average_votes):
        assert stage_proba.shape == expected_shape
    # the last stage has to match the plain predict_proba result
    assert numpy.all(stage_proba == proba)
示例3: very_basic_xgboost_test
def very_basic_xgboost_test():
    """Smoke test: fit XGBoostClassifier, predict, and compare feature order.

    After fitting on two-class generated data, ``predict`` and
    ``predict_proba`` are called, and the index of the feature importances
    is compared element-wise with ``clf.features``.
    """
    X, y, _ = generate_classification_data(n_classes=2)
    model = XGBoostClassifier(n_estimators=10)
    clf = model.fit(X, y)

    clf.predict(X)
    clf.predict_proba(X)

    # importances must list the same features, in the same order
    trained_features = clf.features
    importance_index = clf.get_feature_importances().index
    assert numpy.all(trained_features == importance_index)
示例4: test_xgboost_works_with_different_dtypes
def test_xgboost_works_with_different_dtypes():
    """Fit and predict with XGBoostClassifier on data cast to several dtypes.

    First the whole generated dataset (features, labels, weights) is cast to
    one dtype at a time; afterwards a single DataFrame is built whose columns
    each carry a different dtype. The test only checks that fitting and
    ``predict_proba`` run; nothing is asserted about the predictions.
    """
    import pandas

    dtypes = ['float32', 'float64', 'int32', 'int64', 'uint32']

    # one homogeneous dtype at a time
    for dtype in dtypes:
        X, y, weights = generate_classification_data(n_classes=2, distance=5)
        clf = XGBoostClassifier(n_estimators=10)
        train_features = X.astype(dtype=dtype)
        train_labels = y.astype(dtype=dtype)
        train_weights = weights.astype(dtype)
        clf.fit(train_features, train_labels, sample_weight=train_weights)
        clf.predict_proba(X.astype(dtype))

    # testing single pandas.DataFrame with different dtypes
    _, y, weights = generate_classification_data(n_classes=2, distance=5)
    mixed = pandas.DataFrame()
    for column_dtype in dtypes:
        column = numpy.random.normal(0, 10, size=len(y))
        mixed[column_dtype] = column.astype(column_dtype)
    clf = XGBoostClassifier(n_estimators=10)
    clf.fit(mixed, y, sample_weight=weights)
    clf.predict_proba(mixed)
示例5: test_pybrain_reproducibility
def test_pybrain_reproducibility():
    """Train two identically configured PyBrainClassifiers and compare them.

    Note from the original author: this test fails, because PyBrain can't
    reproduce training.
    """
    X, y, _ = generate_classification_data()

    def train_fresh():
        # a new classifier with the same settings on every call
        return PyBrainClassifier(layers=[4], epochs=2).fit(X, y)

    first = train_fresh()
    second = train_fresh()

    difference = first.predict_proba(X) - second.predict_proba(X)
    print(difference)

    proba_first = first.predict_proba(X)
    proba_second = second.predict_proba(X)
    assert numpy.allclose(proba_first, proba_second), 'different predicitons'

    check_classification_reproducibility(first, X, y)
示例6: test_nolearn_reproducibility
def test_nolearn_reproducibility():
    """Check that NolearnClassifier predictions repeat after refit and clone.

    The same instance is fitted twice and a clone is fitted once; all three
    sets of predictions on the training data must be equal.
    """
    X, y, sample_weight = generate_classification_data()

    def fit_then_predict(estimator):
        return estimator.fit(X, y).predict(X)

    base = NolearnClassifier()
    first_run = fit_then_predict(base)
    second_run = fit_then_predict(base)
    assert (first_run == second_run).all(), 'fitting the classifier twice gives different predictions'

    cloned_run = fit_then_predict(clone(base))
    assert (first_run == cloned_run).all(), 'cloned classifier gives different prediction'
示例7: test_factory
def test_factory():
    """Exercise ClassifiersFactory: fit, predict, staged output, pickle, reports.

    This is a generator: after the in-line checks it yields three
    ``(check_report_with_mask, report, mask, X)`` tuples, with a string mask,
    a callable mask and ``None``. Presumably these are consumed by a test
    runner that supports generator tests; confirm against the project setup.
    """
    factory = ClassifiersFactory()
    # TMVA is optional: an ImportError raised anywhere in this step is ignored
    try:
        from rep.estimators.tmva import TMVAClassifier
        factory.add_classifier('tmva', TMVAClassifier())
    except ImportError:
        pass
    factory.add_classifier('rf', RandomForestClassifier(n_estimators=10))
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=20))

    X, y, sample_weight = generate_classification_data()
    feature_names = list(X.columns)

    # fit is expected to hand back the factory itself
    fitted = factory.fit(X, y, sample_weight=sample_weight, features=feature_names,
                         parallel_profile='threads-4')
    assert factory == fitted
    for estimator in factory.values():
        assert list(estimator.features) == list(X.columns)

    proba = factory.predict_proba(X, parallel_profile='threads-4')
    labels = factory.predict(X, parallel_profile='threads-4')

    for name, predicted in labels.items():
        accuracy = accuracy_score(y, predicted)
        print(name, accuracy)
        assert accuracy > 0.7, name

    for name, probabilities in proba.items():
        assert numpy.allclose(probabilities.sum(axis=1), 1), 'probabilities do not sum to 1'
        assert numpy.all(probabilities >= 0.), 'negative probabilities'

        auc = roc_auc_score(y, probabilities[:, 1])
        print(auc)
        assert auc > 0.8

    for name, stages in factory.staged_predict_proba(X).items():
        assert name != 'tmva', 'tmva does not support staged pp'
        for stage_proba in stages:
            assert stage_proba.shape == (len(X), 2)
        # checking that last iteration coincides with previous
        assert numpy.all(stage_proba == proba[name])

    # testing picklability
    restored = cPickle.loads(cPickle.dumps(factory))
    assert type(factory) == type(restored)

    original_proba = factory.predict_proba(X)
    restored_proba = restored.predict_proba(X)
    for name, probabilities in original_proba.items():
        assert numpy.all(probabilities == restored_proba[name]), 'something strange was loaded'

    # reports: one built by hand for 'rf', then the two factory shortcuts
    rf_report = ClassificationReport({'rf': factory['rf']}, LabeledDataStorage(X, y, sample_weight))
    rf_report.feature_importance_shuffling(roc_auc_score_mod).plot(new_plot=True, figsize=(18, 3))
    factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)

    column0_mean = numpy.mean(X['column0'])
    masks = [
        "column0 > %f" % (column0_mean / 2.),
        lambda x: numpy.array(x['column0']) < column0_mean * 2.,
        None,
    ]
    for mask in masks:
        yield check_report_with_mask, report, mask, X
示例8: test_xgboost_random_states
def test_xgboost_random_states():
    """Check how ``random_state`` affects repeatability of XGBoostClassifier.

    For each tested value two classifiers are built and fitted one after the
    other with that same ``random_state``. When the value is a
    ``numpy.random.RandomState`` instance the two sets of predicted
    probabilities are expected to differ; for an int or ``None`` they are
    expected to be close.
    """
    X, y, _ = generate_classification_data(n_classes=2, distance=5)

    def fit_with(seed):
        model = XGBoostClassifier(n_estimators=5, max_depth=1, subsample=0.1, random_state=seed)
        model.fit(X, y)
        return model

    candidates = [145, None, check_random_state(None), check_random_state(145)]
    for random_state in candidates:
        first = fit_with(random_state)
        second = fit_with(random_state)

        expect_difference = isinstance(random_state, numpy.random.RandomState)
        predictions_match = numpy.allclose(first.predict_proba(X), second.predict_proba(X))
        # exactly one of the two flags must hold
        assert predictions_match != expect_difference, 'seed: {}'.format(random_state)
示例9: test_xgboost_feature_importance
def test_xgboost_feature_importance():
    """Check that XGBoost feature importances cover exactly the input columns.

    The index of ``get_feature_importances()`` must equal the set of training
    columns, and ``feature_importances_`` must have one entry per column.
    """
    X, y, _ = generate_classification_data(n_classes=2, distance=5)

    clf = XGBoostClassifier(n_estimators=1, max_depth=1)
    clf.fit(X, y)
    importances = clf.get_feature_importances()

    expected_names = set(X.columns)
    reported_names = set(importances.index)
    print(expected_names, reported_names)

    assert expected_names == reported_names, 'feature_importances_ return something wrong'
    assert len(expected_names) == len(clf.feature_importances_)
示例10: test_factory
def test_factory():
    """Exercise RegressorsFactory: fit, predict, staged output, pickle, reports.

    The regressors are trained on the targets returned by
    ``generate_classification_data``, and each one must reach a mean squared
    error below 0.2 on the training data.
    """
    factory = RegressorsFactory()
    # TMVA is optional: an ImportError raised anywhere in this step is ignored
    try:
        from rep.estimators.tmva import TMVARegressor
        factory.add_regressor('tmva', TMVARegressor())
    except ImportError:
        pass
    factory.add_regressor('rf', RandomForestRegressor(n_estimators=10))
    factory.add_regressor('ada', AdaBoostRegressor(n_estimators=20))

    X, y, sample_weight = generate_classification_data()

    # fit is expected to hand back the factory itself
    fitted = factory.fit(X, y, sample_weight=sample_weight, features=list(X.columns))
    assert factory == fitted
    values = factory.predict(X)

    for regressor in factory.values():
        assert list(regressor.features) == list(X.columns)

    for _name, predicted in values.items():
        mse = mean_squared_error(y, predicted)
        print(mse)
        assert mse < 0.2

    for name, stages in factory.staged_predict(X).items():
        assert name != 'tmva', 'tmva does not support staged pp'
        for stage_values in stages:
            assert stage_values.shape == (len(X), )
        # checking that last iteration coincides with previous
        assert numpy.all(stage_values == values[name])

    # testing picklability
    restored = cPickle.loads(cPickle.dumps(factory))
    assert type(factory) == type(restored)

    original_values = factory.predict(X)
    restored_values = restored.predict(X)
    for name, predicted in original_values.items():
        assert numpy.all(predicted == restored_values[name]), 'something strange was loaded'

    # reports: one built by hand for 'rf', then the two factory shortcuts
    rf_report = RegressionReport({'rf': factory['rf']}, LabeledDataStorage(X, y, sample_weight))
    rf_report.feature_importance_shuffling(mean_squared_mod).plot(new_plot=True, figsize=(18, 3))
    factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)

    report.feature_importance()
    report.features_correlation_matrix()
    report.predictions_scatter()

    column0_mean = numpy.mean(X['column0'])
    masks = [
        "column0 > %f" % column0_mean,
        lambda x: numpy.array(x['column0']) < column0_mean,
        None,
    ]
    for mask in masks:
        report_mask(report, mask, X)
示例11: test_gridsearch_threads
def test_gridsearch_threads(n_threads=3):
    """Run GridOptimalSearchCV over an AdaBoost classifier using threads.

    The test only checks that the search runs to completion; nothing is
    asserted about the result.

    :param n_threads: number substituted into the ``'threads-{}'`` parallel
        profile string passed to ``GridOptimalSearchCV``
    """
    # the metric is picked at random on every run
    scorer = FoldingScorer(numpy.random.choice([OptimalAMS(), RocAuc()]))

    # Built from a list of pairs: OrderedDict({...}) would take its order from
    # a plain dict literal, which is not guaranteed before Python 3.7.
    grid_param = OrderedDict([
        ("n_estimators", [10, 20]),
        ("learning_rate", [0.1, 0.05]),
        ('features', [['column0', 'column1'], ['column0', 'column1', 'column2']]),
    ])
    generator = RegressionParameterOptimizer(grid_param, n_evaluations=4)

    base = SklearnClassifier(clf=AdaBoostClassifier())
    grid = GridOptimalSearchCV(base, generator, scorer, parallel_profile='threads-{}'.format(n_threads))

    X, y, sample_weight = generate_classification_data()
    grid.fit(X, y, sample_weight=sample_weight)
示例12: test_own_classification_reports
def test_own_classification_reports():
    """
    Check that clf.test_on and clf.test_on_lds give the same RocAuc value
    when they are given the same data, labels and sample weights.
    """
    X, y, sample_weight = generate_classification_data()

    clf = SklearnClassifier(RandomForestClassifier())
    clf.fit(X, y, sample_weight=sample_weight)

    # metric obtained from raw arrays
    direct_report = clf.test_on(X, y, sample_weight=sample_weight)
    roc_direct = direct_report.compute_metric(RocAuc())

    # metric obtained from the same data wrapped in a LabeledDataStorage
    lds = LabeledDataStorage(X, y, sample_weight=sample_weight)
    lds_report = clf.test_on_lds(lds=lds)
    roc_from_lds = lds_report.compute_metric(RocAuc())

    assert roc_direct == roc_from_lds, 'Something wrong with test_on'
示例13: test_folding_regressor_functions
def test_folding_regressor_functions():
    """Run FoldingRegressor on both DataFrame and numpy-array input.

    For each input form the last item of ``staged_predict`` must be close to
    ``predict``, and both feature-importance accessors are read; their values
    are not checked.
    """
    data, y, sample_weight = generate_classification_data()

    for X in (data, numpy.array(data)):
        base_regressor = SklearnRegressor(GradientBoostingRegressor(n_estimators=5))
        kfolder = FoldingRegressor(base_regressor, n_folds=2)
        kfolder.fit(X, y, sample_weight=sample_weight)
        final_prediction = kfolder.predict(X)

        # walk through all stages, keeping only the last one
        for last_stage in kfolder.staged_predict(X):
            pass
        assert numpy.allclose(last_stage, final_prediction)

        # both accessors are only read here
        _importances = kfolder.feature_importances_
        _importances_frame = kfolder.get_feature_importances()
示例14: test_feature_importances
def test_feature_importances():
    """Compare three ways of reading feature importances from XGBoostClassifier.

    ``clf.xgboost_classifier.get_fscore()`` must equal ``clf._get_fscore()``,
    and every positive entry of ``clf.feature_importances_`` at position ``i``
    must equal the first result's value under key ``'f<i>'``.
    """
    clf = XGBoostClassifier()
    X, y, sample_weight = generate_classification_data()
    clf.fit(X, y, sample_weight=sample_weight)

    # checking feature importance (three ways)
    booster_scores = clf.xgboost_classifier.get_fscore()
    wrapper_scores = clf._get_fscore()
    importance_values = clf.feature_importances_

    assert booster_scores == wrapper_scores, booster_scores

    for position, importance in enumerate(importance_values):
        if not importance > 0.0:
            # only positive entries are compared against the score mapping
            continue
        assert importance == booster_scores['f' + str(position)]
示例15: test_feature_splitter
def test_feature_splitter():
    """Check that FeatureSplitter scores above 0.9 on data split by one column.

    The first column of three-class generated data is overwritten with a
    random group id in {0, 1, 2}. Rows of group 1 are shifted by +4 and rows
    of group 2 by -4; the shift covers every column of those rows, the split
    column included. A FeatureSplitter wrapping XGBoostClassifier, trained on
    the remaining columns, is then fitted and scored on the same data.
    """
    # testing splitter
    from rep.metaml import FeatureSplitter

    X, y, sample_weight = generate_classification_data(n_classes=3)
    split_column = X.columns[0]
    splitters = numpy.random.randint(0, 3, size=len(X))
    X[split_column] = splitters

    # DataFrame.ix was removed in pandas 1.0; boolean-mask .loc is the
    # equivalent row selection here.
    X.loc[splitters == 1, :] += 4
    X.loc[splitters == 2, :] -= 4

    base_estimator = XGBoostClassifier(features=list(X.columns[1:]), n_estimators=10, max_depth=3)
    fs = FeatureSplitter(base_estimator=base_estimator, split_feature=split_column)
    fs.fit(X, y, sample_weight=sample_weight)
    assert fs.score(X, y) > 0.9