Python utils.make_classification_data函数代码示例

本文整理汇总了Python中utils.make_classification_data函数的典型用法代码示例。如果您正苦于以下问题：Python make_classification_data函数的具体用法？Python make_classification_data怎么用？Python make_classification_data使用的例子？那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。

在下文中一共展示了make_classification_data函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_merge_missing_labels

def test_merge_missing_labels():
    """
    Test to ensure that labels are sucessfully copied when merging
    """

    # create a feature set
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create a different feature set with no labels specified
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      feature_prefix='g',
                                      empty_labels=True,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # merge the two featuresets in different orders
    fs12 = fs1 + fs2
    fs21 = fs2 + fs1

    # make sure that the labels are the same after merging
    assert_array_equal(fs12.labels, fs1.labels)
    assert_array_equal(fs21.labels, fs1.labels)

开发者ID:BK-University，项目名称:skll，代码行数:26，代码来源:test_featureset.py

示例2: test_subtract

def test_subtract():
    """
    Test to ensure that subtraction works
    """

    # create a feature set
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=2,
                                      train_test_ratio=1.0,
                                      random_state=1234)

    # create a different feature set with the same feature names
    # but different feature values
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=2,
                                      num_labels=2,
                                      train_test_ratio=1.0,
                                      random_state=5678)

    # subtract fs1 from fs2, i.e., the features in fs2
    # should be removed from fs1 but nothing else should change
    fs = fs1 - fs2

    # ensure that the labels are the same in fs and fs1
    assert_array_equal(fs.labels, fs1.labels)

    # ensure that there are only two features left
    eq_(fs.features.shape[1], 2)

    # and that they are f3 and f4
    assert_array_equal(np.array(fs.vectorizer.feature_names_), ['f03', 'f04'])

开发者ID:BK-University，项目名称:skll，代码行数:32，代码来源:test_featureset.py

示例3: check_print_model_weights

def check_print_model_weights(task='classification'):

    # create some simple classification or regression data
    if task == 'classification':
        train_fs, _ = make_classification_data(train_test_ratio=0.8)
    else:
        train_fs, _, _ = make_regression_data(num_features=4,
                                              train_test_ratio=0.8)

    # now train the appropriate model
    if task == 'classification':
        learner = Learner('LogisticRegression')
        learner.train(train_fs)
    else:
        learner = Learner('LinearRegression')
        learner.train(train_fs, grid_objective='pearson')

    # now save the model to disk
    model_file = join(_my_dir, 'output',
                      'test_print_model_weights.model')
    learner.save(model_file)

    # now call print_model_weights main() and capture the output
    print_model_weights_cmd = [model_file]
    err = ''
    try:
        old_stderr = sys.stderr
        old_stdout = sys.stdout
        sys.stderr = mystderr = StringIO()
        sys.stdout = mystdout = StringIO()
        pmw.main(print_model_weights_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        sys.stdout = old_stdout
        print(err)

    # now parse the output of the print_model_weight command
    # and get the intercept and the feature values
    if task == 'classification':
        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = safe_float(lines_to_parse[0].split('\t')[0])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[2], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_[0])
        assert_allclose(learner.model.coef_[0], feature_values)
    else:
        lines_to_parse = [l for l in out.split('\n') if l]
        intercept = safe_float(lines_to_parse[0].split('=')[1])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)

开发者ID:MechCoder，项目名称:skll，代码行数:60，代码来源:test_utilities.py

示例4: test_string_feature

def test_string_feature():
    """
    Test to make sure that string-valued features are properly
    encoded as binary features
    """
    # create a featureset that is derived from an original
    # set of features containing 3 numeric features and
    # one string-valued feature that can take six possible
    # values between 'a' to 'f'. This means that the
    # featureset will have 3 numeric + 6 binary features.
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     one_string_feature=True,
                                     num_string_values=6,
                                     train_test_ratio=1.0)

    # confirm that the number of features are as expected
    eq_(fs.features.shape, (100, 9))

    # confirm the feature names
    eq_(fs.vectorizer.feature_names_, ['f01', 'f02', 'f03',
                                       'f04=a', 'f04=b', 'f04=c',
                                       'f04=d', 'f04=e', 'f04=f'])

    # confirm that the final six features are binary
    assert_array_equal(fs.features[:, [3, 4, 5, 6, 7, 8]].data, 1)

开发者ID:BK-University，项目名称:skll，代码行数:27，代码来源:test_featureset.py

示例5: test_learner_api_load_into_existing_instance

def test_learner_api_load_into_existing_instance():
    """
    Check that `Learner.load()` works as expected
    """

    # create a LinearSVC instance and train it on some data
    learner1 = Learner('LinearSVC')
    (train_fs,
     test_fs) = make_classification_data(num_examples=200,
                                         num_features=5,
                                         use_feature_hashing=False,
                                         non_negative=True)
    learner1.train(train_fs, grid_search=False)

    # now use `load()` to replace the existing instance with a
    # different saved learner
    other_model_file = join(_my_dir, 'other', 'test_load_saved_model.{}.model'.format(sys.version_info[0]))
    learner1.load(other_model_file)

    # now load the saved model into another instance using the class method
    # `from_file()`
    learner2 = Learner.from_file(other_model_file)

    # check that the two instances are now basically the same
    eq_(learner1.model_type, learner2.model_type)
    eq_(learner1.model_params, learner2.model_params)
    eq_(learner1.model_kwargs, learner2.model_kwargs)

开发者ID:EducationalTestingService，项目名称:skll，代码行数:27，代码来源:test_classification.py

示例6: check_train_and_score_function

def check_train_and_score_function(model_type):
    """
    Check that the _train_and_score() function works as expected
    """

    # create train and test data
    (train_fs,
     test_fs) = make_classification_data(num_examples=500,
                                         train_test_ratio=0.7,
                                         num_features=5,
                                         use_feature_hashing=False,
                                         non_negative=True)

    # call _train_and_score() on this data
    estimator_name = 'LogisticRegression' if model_type == 'classifier' else 'Ridge'
    metric = 'accuracy' if model_type == 'classifier' else 'pearson'
    learner1 = Learner(estimator_name)
    train_score1, test_score1 = _train_and_score(learner1, train_fs, test_fs, metric)

    # this should yield identical results when training another instance
    # of the same learner without grid search and shuffling and evaluating
    # that instance on the train and the test set
    learner2 = Learner(estimator_name)
    learner2.train(train_fs, grid_search=False, shuffle=False)
    train_score2 = learner2.evaluate(train_fs, output_metrics=[metric])[-1][metric]
    test_score2 = learner2.evaluate(test_fs, output_metrics=[metric])[-1][metric]

    eq_(train_score1, train_score2)
    eq_(test_score1, test_score2)

开发者ID:EducationalTestingService，项目名称:skll，代码行数:29，代码来源:test_classification.py

示例7: check_filter_labels

def check_filter_labels(inverse=False):

    # create a feature set
    fs, _ = make_classification_data(num_examples=1000,
                                     num_features=4,
                                     num_labels=5,
                                     train_test_ratio=1.0)

    # keep just the instaces with 0, 1 and 2 labels
    labels_to_filter = [0, 1, 2]

    # do the actual filtering
    fs.filter(labels=labels_to_filter, inverse=inverse)

    # make sure that we removed the right things
    if inverse:
        ids_kept = fs.ids[np.where(np.logical_not(np.in1d(fs.labels,
                                                          labels_to_filter)))]
    else:
        ids_kept = fs.ids[np.where(np.in1d(fs.labels, labels_to_filter))]

    assert_array_equal(fs.ids, np.array(ids_kept))

    # make sure that number of ids, labels and features are the same
    eq_(fs.ids.shape[0], fs.labels.shape[0])
    eq_(fs.labels.shape[0], fs.features.shape[0])

开发者ID:BK-University，项目名称:skll，代码行数:26，代码来源:test_featureset.py

示例8: make_single_file_featureset_data

def make_single_file_featureset_data():
    """
    Write a training file and a test file for tests that check whether
    specifying train_file and test_file actually works.
    """
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=2,
                                                 num_features=3,
                                                 non_negative=False)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train', 'train_single_file.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test', 'test_single_file.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    # Also write another test feature set that has fewer features than the training set
    test_fs.filter(features=['f01', 'f02'])
    test_path = join(_my_dir, 'test', 'test_single_file_subset.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

开发者ID:EducationalTestingService，项目名称:skll，代码行数:26，代码来源:test_classification.py

示例9: test_skll_convert_libsvm_map

def test_skll_convert_libsvm_map():
    """
    Test to check whether the --reuse_libsvm_map option works for skll_convert
    """

    # create some simple classification data
    orig_fs, _ = make_classification_data(train_test_ratio=1.0,
                                          one_string_feature=True)

    # now write out this feature set as a libsvm file
    orig_libsvm_file = join(_my_dir, 'other',
                            'test_skll_convert_libsvm_map.libsvm')
    writer = LibSVMWriter(orig_libsvm_file, orig_fs, quiet=True)
    writer.write()

    # now make a copy of the dataset
    swapped_fs = copy.deepcopy(orig_fs)

    # now modify this new featureset to swap the first two columns
    del swapped_fs.vectorizer.vocabulary_['f01']
    del swapped_fs.vectorizer.vocabulary_['f02']
    swapped_fs.vectorizer.vocabulary_['f01'] = 1
    swapped_fs.vectorizer.vocabulary_['f02'] = 0
    tmp = swapped_fs.features[:, 0]
    swapped_fs.features[:, 0] = swapped_fs.features[:, 1]
    swapped_fs.features[:, 1] = tmp

    # now write out this new feature set as a MegaM file
    swapped_megam_file = join(_my_dir, 'other',
                              'test_skll_convert_libsvm_map.megam')
    writer = MegaMWriter(swapped_megam_file, swapped_fs, quiet=True)
    writer.write()

    # now run skll_convert to convert this into a libsvm file
    # but using the mapping specified in the first libsvm file
    converted_libsvm_file = join(_my_dir, 'other',
                                 'test_skll_convert_libsvm_map2.libsvm')

    # now call skll convert's main function
    skll_convert_cmd = ['--reuse_libsvm_map', orig_libsvm_file,
                        '--quiet', orig_libsvm_file,
                        converted_libsvm_file]
    err = ''
    try:
        old_stderr = sys.stderr
        sys.stderr = mystderr = StringIO()
        sk.main(skll_convert_cmd)
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        print(err)

    # now read the converted libsvm file into a featureset
    reader = LibSVMReader(converted_libsvm_file, quiet=True)
    converted_fs = reader.read()

    # now ensure that this new featureset and the original
    # featureset are the same
    eq_(orig_fs, converted_fs)

开发者ID:MechCoder，项目名称:skll，代码行数:59，代码来源:test_utilities.py

示例10: check_generate_predictions_console

def check_generate_predictions_console(use_threshold=False):

    # create some simple classification data without feature hashing
    train_fs, test_fs = make_classification_data(num_examples=1000,
                                                 num_features=5)

    # save the test feature set to an NDJ file
    input_file = join(_my_dir, 'test',
                      'test_generate_predictions.jsonlines')
    writer = NDJWriter(input_file, test_fs)
    writer.write()

    # create a learner that uses an SGD classifier
    learner = Learner('SGDClassifier', probability=use_threshold)

    # train the learner with grid search
    learner.train(train_fs, grid_search=True)

    # get the predictions on the test featureset
    predictions = learner.predict(test_fs)

    # if we asked for probabilities, then use the threshold
    # to convert them into binary predictions
    if use_threshold:
        threshold = 0.6
        predictions = [int(p[1] >= threshold) for p in predictions]
    else:
        predictions = predictions.tolist()
        threshold = None

    # save the learner to a file
    model_file = join(_my_dir, 'output',
                      'test_generate_predictions_console.model')
    learner.save(model_file)

    # now call main() from generate_predictions.py
    generate_cmd = []
    if use_threshold:
        generate_cmd.append('-t {}'.format(threshold))
    generate_cmd.extend([model_file, input_file])

    # we need to capture stdout since that's what main() writes to
    err = ''
    try:
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = mystdout = StringIO()
        sys.stderr = mystderr = StringIO()
        gp.main(generate_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
        predictions_after_saving = [int(x) for x in out.strip().split('\n')]
        eq_(predictions, predictions_after_saving)
    finally:
        sys.stdout = old_stdout
        sys.stderr = old_stderr
        print(err)

开发者ID:MechCoder，项目名称:skll，代码行数:57，代码来源:test_utilities.py

示例11: test_custom_learner_model_loading

def test_custom_learner_model_loading():
    num_labels = 10

    class_weights = [(0.5 / (num_labels - 1))
                     for x in range(num_labels - 1)] + [0.5]
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_model_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_model_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    # run the configuration that trains the custom model and saves it
    cfgfile = 'test_model_save_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, quiet=True)

    # save the predictions from disk into memory
    # and delete the predictions file
    outprefix = 'test_model_custom_learner'
    pred_file = join(_my_dir, 'output',
                     '{}_{}_CustomLogisticRegressionWrapper'
                     '.predictions'.format(outprefix,
                                           outprefix))
    preds1 = read_predictions(pred_file)
    os.unlink(pred_file)

    # run the configuration that loads the saved model
    # and generates the predictions again
    cfgfile = 'test_model_load_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, overwrite=False, quiet=True)

    # load the newly generated predictions
    preds2 = read_predictions(pred_file)

    # make sure that they are the same as before
    assert_array_equal(preds1, preds2)

开发者ID:BK-University，项目名称:skll，代码行数:54，代码来源:test_custom_learner.py

示例12: test_merge_different_vectorizers

def test_merge_different_vectorizers():
    """
    Test to ensure rejection of merging featuresets with different vectorizers
    """

    # create a featureset each with a DictVectorizer
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create another featureset using hashing
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      feature_prefix='g',
                                      num_labels=3,
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True)
    # This should raise a ValueError
    fs1 + fs2

开发者ID:BK-University，项目名称:skll，代码行数:20，代码来源:test_featureset.py

示例13: test_length

def test_length():
    """
    Test to whether len() returns the number of instances
    """

    # create a featureset
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     train_test_ratio=1.0)

    eq_(len(fs), 100)

开发者ID:BK-University，项目名称:skll，代码行数:12，代码来源:test_featureset.py

示例14: test_empty_labels

def test_empty_labels():
    """
    Test to check behaviour when labels is None
    """

    # create a feature set with empty labels
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     empty_labels=True,
                                     train_test_ratio=1.0)
    assert np.isnan(fs.labels).all()

开发者ID:BK-University，项目名称:skll，代码行数:12，代码来源:test_featureset.py

示例15: test_write_hashed_featureset

def test_write_hashed_featureset():
    """
    Test to check that hashed featuresets cannot be written out
    """
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     use_feature_hashing=True,
                                     feature_bins=2,
                                     random_state=1234)
    output_dir = join(_my_dir, 'output')
    writer = NDJWriter(join(output_dir, 'foo.jsonlines'), fs)
    writer.write()

开发者ID:EducationalTestingService，项目名称:skll，代码行数:12，代码来源:test_featureset.py

注：本文中的utils.make_classification_data函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。