本文整理汇总了Python中utils.make_classification_data函数的典型用法代码示例。如果您正苦于以下问题：Python make_classification_data函数的具体用法？Python make_classification_data怎么用？Python make_classification_data使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了make_classification_data函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_merge_missing_labels
def test_merge_missing_labels():
    """
    Check that labels are successfully carried over when a labeled
    feature set is merged with one generated with empty labels.
    """
    # settings common to both generated feature sets
    shared_kwargs = dict(num_examples=100,
                         num_features=4,
                         num_labels=3,
                         train_test_ratio=1.0)

    # the first feature set is generated with regular labels
    labeled_fs, _ = make_classification_data(**shared_kwargs)

    # the second one uses a different feature prefix and is
    # generated with `empty_labels=True`
    unlabeled_fs, _ = make_classification_data(feature_prefix='g',
                                               empty_labels=True,
                                               **shared_kwargs)

    # merge in both orders; in either case the merged labels must
    # match those of the labeled feature set
    merged_both_ways = (labeled_fs + unlabeled_fs,
                        unlabeled_fs + labeled_fs)
    for merged_fs in merged_both_ways:
        assert_array_equal(merged_fs.labels, labeled_fs.labels)
示例2: test_subtract
def test_subtract():
    """
    Check that subtracting one feature set from another drops the
    subtracted features and leaves the labels unchanged.
    """
    # the feature set we subtract from: four features
    minuend_fs, _ = make_classification_data(num_examples=100,
                                             num_features=4,
                                             num_labels=2,
                                             train_test_ratio=1.0,
                                             random_state=1234)

    # the feature set being subtracted: only two features, generated
    # with a different random seed (presumably named f01 and f02,
    # given that f03 and f04 are expected to survive below)
    subtrahend_fs, _ = make_classification_data(num_examples=100,
                                                num_features=2,
                                                num_labels=2,
                                                train_test_ratio=1.0,
                                                random_state=5678)

    # remove the features of the second set from the first one
    difference_fs = minuend_fs - subtrahend_fs

    # labels must be untouched by the subtraction
    assert_array_equal(difference_fs.labels, minuend_fs.labels)

    # exactly two feature columns should remain ...
    eq_(difference_fs.features.shape[1], 2)

    # ... and they should be f03 and f04
    remaining_names = np.array(difference_fs.vectorizer.feature_names_)
    assert_array_equal(remaining_names, ['f03', 'f04'])
示例3: check_print_model_weights
def check_print_model_weights(task='classification'):
    """
    Train a simple model, save it, run the print_model_weights
    `main()` on the saved file and compare the printed intercept and
    feature weights against the trained model.

    `task` selects a logistic regression classifier when it equals
    'classification'; any other value selects linear regression.
    """
    is_classification = task == 'classification'

    # generate the data and train the matching learner
    if is_classification:
        train_fs, _ = make_classification_data(train_test_ratio=0.8)
        learner = Learner('LogisticRegression')
        learner.train(train_fs)
    else:
        train_fs, _, _ = make_regression_data(num_features=4,
                                              train_test_ratio=0.8)
        learner = Learner('LinearRegression')
        learner.train(train_fs, grid_objective='pearson')

    # persist the trained model
    model_file = join(_my_dir, 'output',
                      'test_print_model_weights.model')
    learner.save(model_file)

    # run print_model_weights while temporarily capturing both
    # output streams; the streams are always restored afterwards
    err = ''
    try:
        old_stderr = sys.stderr
        old_stdout = sys.stdout
        captured_err = StringIO()
        sys.stderr = captured_err
        captured_out = StringIO()
        sys.stdout = captured_out
        pmw.main([model_file])
        out = captured_out.getvalue()
        err = captured_err.getvalue()
    finally:
        sys.stderr = old_stderr
        sys.stdout = old_stdout
        print(err)

    # The two output layouts differ in three respects: whether the first
    # line is skipped, how the intercept is extracted from the first
    # remaining line, and which tab-separated field holds the feature name.
    if is_classification:
        rows = [row for row in out.split('\n')[1:] if row]
        intercept = safe_float(rows[0].split('\t')[0])
        name_index = 2
    else:
        rows = [row for row in out.split('\n') if row]
        intercept = safe_float(rows[0].split('=')[1])
        name_index = 1

    # collect (feature name, weight) pairs and order the weights
    # by feature name
    named_weights = []
    for row in rows[1:]:
        columns = row.split('\t')
        named_weights.append((columns[name_index],
                              safe_float(columns[0])))
    feature_values = [weight for _, weight in sorted(named_weights)]

    # compare what was printed against the trained model
    if is_classification:
        assert_almost_equal(intercept, learner.model.intercept_[0])
        assert_allclose(learner.model.coef_[0], feature_values)
    else:
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
示例4: test_string_feature
def test_string_feature():
    """
    Check that a string-valued feature gets encoded as a group of
    binary features.
    """
    # Request 4 features, one of which is string-valued with six
    # possible values. The checks below expect 3 numeric columns
    # followed by 6 binary columns (one per value 'a' through 'f').
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     one_string_feature=True,
                                     num_string_values=6,
                                     train_test_ratio=1.0)

    # the feature matrix must have 3 + 6 = 9 columns
    eq_(fs.features.shape, (100, 9))

    # the names are the numeric features followed by one name per
    # value of the string feature
    numeric_names = ['f01', 'f02', 'f03']
    string_names = ['f04={}'.format(value) for value in 'abcdef']
    eq_(fs.vectorizer.feature_names_, numeric_names + string_names)

    # every stored value in the last six columns must be 1
    string_columns = list(range(3, 9))
    assert_array_equal(fs.features[:, string_columns].data, 1)
示例5: test_learner_api_load_into_existing_instance
def test_learner_api_load_into_existing_instance():
    """
    Check that `Learner.load()` on an existing instance yields the
    same learner as `Learner.from_file()` on the same model file.
    """
    # start with a LinearSVC learner trained on freshly generated data
    learner1 = Learner('LinearSVC')
    train_fs, _ = make_classification_data(num_examples=200,
                                           num_features=5,
                                           use_feature_hashing=False,
                                           non_negative=True)
    learner1.train(train_fs, grid_search=False)

    # overwrite that instance from a previously saved model file; the
    # file name includes the major version of the running Python
    model_name = 'test_load_saved_model.{}.model'.format(sys.version_info[0])
    other_model_file = join(_my_dir, 'other', model_name)
    learner1.load(other_model_file)

    # load the same file into a brand new instance via the class method
    learner2 = Learner.from_file(other_model_file)

    # both instances should now agree on these attributes
    for attribute in ('model_type', 'model_params', 'model_kwargs'):
        eq_(getattr(learner1, attribute), getattr(learner2, attribute))
示例6: check_train_and_score_function
def check_train_and_score_function(model_type):
    """
    Check that `_train_and_score()` returns the same train and test
    scores as training a second learner without grid search and
    shuffling and evaluating it directly.

    `model_type` equal to 'classifier' selects LogisticRegression with
    accuracy; any other value selects Ridge with pearson.
    """
    # generate the train and test partitions
    train_fs, test_fs = make_classification_data(num_examples=500,
                                                 train_test_ratio=0.7,
                                                 num_features=5,
                                                 use_feature_hashing=False,
                                                 non_negative=True)

    # pick the estimator and the metric together
    if model_type == 'classifier':
        estimator_name, metric = 'LogisticRegression', 'accuracy'
    else:
        estimator_name, metric = 'Ridge', 'pearson'

    # scores as computed by the function under test
    learner1 = Learner(estimator_name)
    train_score1, test_score1 = _train_and_score(learner1,
                                                 train_fs,
                                                 test_fs,
                                                 metric)

    # reference scores: a second instance of the same learner, trained
    # without grid search or shuffling, evaluated on both partitions
    learner2 = Learner(estimator_name)
    learner2.train(train_fs, grid_search=False, shuffle=False)

    def reference_score(feature_set):
        # the metric value sits in the last element of the evaluation results
        results = learner2.evaluate(feature_set, output_metrics=[metric])
        return results[-1][metric]

    train_score2 = reference_score(train_fs)
    test_score2 = reference_score(test_fs)

    eq_(train_score1, train_score2)
    eq_(test_score1, test_score2)
示例7: check_filter_labels
def check_filter_labels(inverse=False):
    """
    Check that filtering a feature set by label keeps exactly the
    right instances.

    With `inverse=False` only the instances whose label is 0, 1 or 2
    should remain; with `inverse=True` only the instances with any
    other label should remain.
    """
    # create a feature set
    fs, _ = make_classification_data(num_examples=1000,
                                     num_features=4,
                                     num_labels=5,
                                     train_test_ratio=1.0)

    # keep just the instances with 0, 1 and 2 labels
    labels_to_filter = [0, 1, 2]

    # Work out which IDs should survive *before* filtering. The call
    # to `filter()` below changes `fs` itself, so computing the
    # expectation afterwards would compare the filtered data with
    # itself and could never notice instances that were wrongly removed.
    keep_mask = np.in1d(fs.labels, labels_to_filter)
    if inverse:
        keep_mask = np.logical_not(keep_mask)
    expected_ids = np.array(fs.ids[keep_mask])

    # do the actual filtering
    fs.filter(labels=labels_to_filter, inverse=inverse)

    # make sure that exactly the expected instances are left
    assert_array_equal(fs.ids, expected_ids)

    # make sure that number of ids, labels and features are the same
    eq_(fs.ids.shape[0], fs.labels.shape[0])
    eq_(fs.labels.shape[0], fs.features.shape[0])
示例8: make_single_file_featureset_data
def make_single_file_featureset_data():
    """
    Generate the data files used by tests that specify `train_file`
    and `test_file` directly: one training file, one test file, and a
    second test file restricted to a subset of the features.
    """
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=2,
                                                 num_features=3,
                                                 non_negative=False)

    # the training partition goes into the 'train' directory
    train_path = join(_my_dir, 'train', 'train_single_file.jsonlines')
    NDJWriter(train_path, train_fs).write()

    # the test partition goes into the 'test' directory
    test_path = join(_my_dir, 'test', 'test_single_file.jsonlines')
    NDJWriter(test_path, test_fs).write()

    # restrict the test partition to two of the three features and
    # write it out again under a different name
    test_fs.filter(features=['f01', 'f02'])
    subset_path = join(_my_dir, 'test', 'test_single_file_subset.jsonlines')
    NDJWriter(subset_path, test_fs).write()
示例9: test_skll_convert_libsvm_map
def test_skll_convert_libsvm_map():
    """
    Exercise the `--reuse_libsvm_map` option of skll_convert and check
    that the converted file reads back equal to the original data.
    """
    # generate simple classification data with one string-valued feature
    orig_fs, _ = make_classification_data(train_test_ratio=1.0,
                                          one_string_feature=True)

    # write it out in libsvm format; this file is the one passed to
    # `--reuse_libsvm_map` further down
    orig_libsvm_file = join(_my_dir, 'other',
                            'test_skll_convert_libsvm_map.libsvm')
    LibSVMWriter(orig_libsvm_file, orig_fs, quiet=True).write()

    # build an independent copy in which the first two columns trade
    # places, both in the vocabulary and in the feature matrix
    swapped_fs = copy.deepcopy(orig_fs)
    vocabulary = swapped_fs.vectorizer.vocabulary_
    for feature_name in ('f01', 'f02'):
        del vocabulary[feature_name]
    vocabulary['f01'] = 1
    vocabulary['f02'] = 0
    first_column = swapped_fs.features[:, 0]
    swapped_fs.features[:, 0] = swapped_fs.features[:, 1]
    swapped_fs.features[:, 1] = first_column

    # write the swapped copy out in MegaM format
    swapped_megam_file = join(_my_dir, 'other',
                              'test_skll_convert_libsvm_map.megam')
    MegaMWriter(swapped_megam_file, swapped_fs, quiet=True).write()

    # destination for the output of skll_convert
    converted_libsvm_file = join(_my_dir, 'other',
                                 'test_skll_convert_libsvm_map2.libsvm')

    # NOTE(review): the input file given to skll_convert here is the
    # original libsvm file; `swapped_megam_file` written above is never
    # passed to the converter. Presumably the MegaM file was meant to
    # be the input -- confirm before changing.
    skll_convert_cmd = ['--reuse_libsvm_map', orig_libsvm_file,
                        '--quiet', orig_libsvm_file,
                        converted_libsvm_file]

    # run the converter with stderr captured; stderr is always
    # restored and whatever was captured is printed
    err = ''
    try:
        old_stderr = sys.stderr
        captured_err = StringIO()
        sys.stderr = captured_err
        sk.main(skll_convert_cmd)
        err = captured_err.getvalue()
    finally:
        sys.stderr = old_stderr
        print(err)

    # read the converted file back in and compare with the original
    converted_fs = LibSVMReader(converted_libsvm_file, quiet=True).read()
    eq_(orig_fs, converted_fs)
示例10: check_generate_predictions_console
def check_generate_predictions_console(use_threshold=False):
    """
    Check that running `main()` of generate_predictions on a saved
    model prints the same predictions that the learner produced in
    memory.

    When `use_threshold` is true the learner is created with
    `probability=True`, and a threshold of 0.6 is applied both to the
    in-memory predictions and (via `-t`) on the command line.
    """
    # generate simple classification data
    train_fs, test_fs = make_classification_data(num_examples=1000,
                                                 num_features=5)

    # the test partition is what the console script will read
    input_file = join(_my_dir, 'test',
                      'test_generate_predictions.jsonlines')
    NDJWriter(input_file, test_fs).write()

    # train an SGD classifier with grid search and predict in memory
    learner = Learner('SGDClassifier', probability=use_threshold)
    learner.train(train_fs, grid_search=True)
    raw_predictions = learner.predict(test_fs)

    # with a threshold, each prediction is indexed at position 1 and
    # compared against the threshold to get a 0/1 value; otherwise the
    # predictions are used as they are
    threshold = 0.6 if use_threshold else None
    if use_threshold:
        predictions = [int(p[1] >= threshold) for p in raw_predictions]
    else:
        predictions = raw_predictions.tolist()

    # persist the learner so that the console script can load it
    model_file = join(_my_dir, 'output',
                      'test_generate_predictions_console.model')
    learner.save(model_file)

    # assemble the arguments for main()
    generate_cmd = ['-t {}'.format(threshold)] if use_threshold else []
    generate_cmd += [model_file, input_file]

    # main() writes the predictions to stdout, so capture it (and
    # stderr); both streams are always restored afterwards
    err = ''
    try:
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        captured_out = StringIO()
        sys.stdout = captured_out
        captured_err = StringIO()
        sys.stderr = captured_err
        gp.main(generate_cmd)
        out = captured_out.getvalue()
        err = captured_err.getvalue()
        predictions_after_saving = [int(line)
                                    for line in out.strip().split('\n')]
        eq_(predictions, predictions_after_saving)
    finally:
        sys.stdout = old_stdout
        sys.stderr = old_stderr
        print(err)
示例11: test_custom_learner_model_loading
def test_custom_learner_model_loading():
    """
    Check that a custom learner model that is saved by one
    configuration and loaded by another produces identical predictions.
    """
    num_labels = 10

    # the last class gets half of the total weight; the remaining half
    # is split evenly across all the other classes
    other_classes = num_labels - 1
    class_weights = [0.5 / other_classes] * other_classes + [0.5]

    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # write each partition to an identically named file in its own
    # directory (training first, then test)
    for subdir, feature_set in (('train', train_fs), ('test', test_fs)):
        data_path = join(_my_dir, subdir,
                         'test_model_custom_learner.jsonlines')
        NDJWriter(data_path, feature_set).write()

    # first configuration: train the custom model and save it
    save_template = join(_my_dir, 'configs',
                         'test_model_save_custom_learner.template.cfg')
    run_configuration(fill_in_config_paths(save_template), quiet=True)

    # read the predictions that run produced, then delete the file so
    # that the second run has to generate it afresh
    outprefix = 'test_model_custom_learner'
    pred_file = join(_my_dir, 'output',
                     '{}_{}_CustomLogisticRegressionWrapper'
                     '.predictions'.format(outprefix,
                                           outprefix))
    preds1 = read_predictions(pred_file)
    os.unlink(pred_file)

    # second configuration: load the saved model and predict again
    load_template = join(_my_dir, 'configs',
                         'test_model_load_custom_learner.template.cfg')
    run_configuration(fill_in_config_paths(load_template),
                      overwrite=False,
                      quiet=True)

    # the regenerated predictions must match the first ones
    preds2 = read_predictions(pred_file)
    assert_array_equal(preds1, preds2)
示例12: test_merge_different_vectorizers
def test_merge_different_vectorizers():
    """
    Test to ensure rejection of merging featuresets with different vectorizers

    Raises an AssertionError if the merge does not raise a ValueError.
    """
    # create a featureset without feature hashing
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create another featureset using hashing
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      feature_prefix='g',
                                      num_labels=3,
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True)

    # The merge is expected to raise a ValueError. Assert that
    # explicitly: left bare, the expected exception would be reported
    # as a test error and a silently successful merge would pass.
    try:
        fs1 + fs2
    except ValueError:
        pass
    else:
        raise AssertionError('merging feature sets with different '
                             'vectorizers did not raise a ValueError')
示例13: test_length
def test_length():
    """
    Test whether len() returns the number of instances
    """
    num_examples = 100

    # generate a feature set with a known number of examples
    fs, _ = make_classification_data(num_examples=num_examples,
                                     num_features=4,
                                     num_labels=3,
                                     train_test_ratio=1.0)

    # its length must be that number of examples
    eq_(len(fs), num_examples)
示例14: test_empty_labels
def test_empty_labels():
    """
    Check what the labels look like when a feature set is generated
    with empty labels.
    """
    # generate a feature set with `empty_labels=True`
    generation_kwargs = dict(num_examples=100,
                             num_features=4,
                             num_labels=3,
                             empty_labels=True,
                             train_test_ratio=1.0)
    fs, _ = make_classification_data(**generation_kwargs)

    # every single label is expected to be NaN
    assert np.all(np.isnan(fs.labels))
示例15: test_write_hashed_featureset
def test_write_hashed_featureset():
    """
    Test to check that hashed featuresets cannot be written out
    """
    # NOTE(review): nothing in this function asserts that an exception
    # is raised, and the expected exception type is not visible here.
    # Presumably an expected-exception decorator or wrapper belongs on
    # this test -- confirm.

    # generate a feature set that uses feature hashing
    hashed_fs, _ = make_classification_data(num_examples=100,
                                            num_features=4,
                                            use_feature_hashing=True,
                                            feature_bins=2,
                                            random_state=1234)

    # attempt to write it out as a .jsonlines file in the output directory
    output_path = join(join(_my_dir, 'output'), 'foo.jsonlines')
    NDJWriter(output_path, hashed_fs).write()