This page collects typical usage examples of the Python method sklearn.feature_selection.SelectPercentile. If you have been wondering what feature_selection.SelectPercentile does, how to use it, or what it looks like in practice, the curated code samples below may help. You can also read more about the containing module, sklearn.feature_selection.
Below are 15 code examples of feature_selection.SelectPercentile, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
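For orientation, here is a minimal, self-contained sketch of the basic SelectPercentile workflow; the synthetic dataset and the percentile value are illustrative assumptions, not taken from the examples below.

# Minimal SelectPercentile sketch: keep the top 20% of features by ANOVA F-score.
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectPercentile, f_classif

X, y = make_classification(n_samples=200, n_features=50, random_state=0)
selector = SelectPercentile(score_func=f_classif, percentile=20)
X_reduced = selector.fit_transform(X, y)
print(X_reduced.shape)          # (200, 10): 20% of 50 features kept
print(selector.get_support())   # boolean mask over the original columns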
Example 1: find_best_feature_selections
# Required import: from sklearn import feature_selection [as alias]
# Alternatively: from sklearn.feature_selection import SelectPercentile [as alias]
from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2, f_classif

def find_best_feature_selections(X, y):
    # Select the best features using different techniques.
    X_new = SelectKBest(chi2, k=80).fit_transform(X, y)
    X_new1 = SelectPercentile(chi2, percentile=20).fit_transform(X, y)
    X_new2 = SelectKBest(f_classif, k=80).fit_transform(X, y)  # this one has the best performance
    X_new22 = SelectPercentile(f_classif, percentile=20).fit_transform(X, y)
    X_new3 = SelectKBest(f_classif, k=70).fit_transform(X, y)
    X_new4 = SelectKBest(f_classif, k=60).fit_transform(X, y)
    print(X_new.shape)
    # selection_parameters_for_classifier(X_new, y)
    # print(y.shape)
    train_and_test(X_new, y)
    train_and_test(X_new1, y)
    train_and_test(X_new2, y)
    train_and_test(X_new22, y)
    train_and_test(X_new3, y)
    train_and_test(X_new4, y)

# X, y = _dataset_sample()
################################ PARAMETER SELECTION ################################
# TODO: a problem occurs when using the max_leaf_nodes parameter in DecisionTree and RandomForest
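The train_and_test helper called above is not part of this excerpt. A plausible stand-in is sketched below; the estimator and cross-validation settings are assumptions for illustration, not the original project's code.

# Hypothetical stand-in for the undefined train_and_test helper used above.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def train_and_test(X, y):
    clf = RandomForestClassifier(n_estimators=100, random_state=0)  # assumed estimator
    scores = cross_val_score(clf, X, y, cv=5)
    print("mean accuracy: %.3f (+/- %.3f)" % (scores.mean(), scores.std()))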
Example 2: test_objectmapper
# Required import: from sklearn import feature_selection [as alias]
# Alternatively: from sklearn.feature_selection import SelectPercentile [as alias]
def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.feature_selection.GenericUnivariateSelect,
                  fs.GenericUnivariateSelect)
    self.assertIs(df.feature_selection.SelectPercentile,
                  fs.SelectPercentile)
    self.assertIs(df.feature_selection.SelectKBest, fs.SelectKBest)
    self.assertIs(df.feature_selection.SelectFpr, fs.SelectFpr)
    self.assertIs(df.feature_selection.SelectFromModel,
                  fs.SelectFromModel)
    self.assertIs(df.feature_selection.SelectFdr, fs.SelectFdr)
    self.assertIs(df.feature_selection.SelectFwe, fs.SelectFwe)
    self.assertIs(df.feature_selection.RFE, fs.RFE)
    self.assertIs(df.feature_selection.RFECV, fs.RFECV)
    self.assertIs(df.feature_selection.VarianceThreshold,
                  fs.VarianceThreshold)
Example 3: _get_feature_selector
# Required import: from sklearn import feature_selection [as alias]
# Alternatively: from sklearn.feature_selection import SelectPercentile [as alias]
def _get_feature_selector(self):
    """Get a feature selector instance based on the feature_selector model
    parameter.

    Returns:
        (Object): a feature selector which returns a reduced feature matrix, \
            given the full feature matrix, X, and the class labels, y
    """
    if self.config.model_settings is None:
        selector_type = None
    else:
        selector_type = self.config.model_settings.get("feature_selector")
    selector = {
        # solver="liblinear" supports the L1 penalty on recent scikit-learn versions
        "l1": SelectFromModel(LogisticRegression(penalty="l1", C=1, solver="liblinear")),
        "f": SelectPercentile(),
    }.get(selector_type)
    return selector
Example 4: univariate_feature_selection
# Required import: from sklearn import feature_selection [as alias]
# Alternatively: from sklearn.feature_selection import SelectPercentile [as alias]
from sklearn.feature_selection import SelectPercentile, chi2, f_classif, f_regression

def univariate_feature_selection(mode, predictors, target):
    if mode == 'f_regression':
        fselect = SelectPercentile(f_regression, percentile=100)
    elif mode == 'f_classif':
        fselect = SelectPercentile(f_classif, percentile=100)
    elif mode == 'chi2':
        fselect = SelectPercentile(chi2, percentile=100)  # chi2 requires non-negative features
    else:
        raise ValueError("unknown mode: %s" % mode)
    # percentile=100 keeps every feature; we only fit to obtain the per-feature p-values
    fselect.fit(predictors, target)
    return fselect.pvalues_
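A quick way to exercise this function and rank features by p-value; the synthetic dataset here is an assumption for illustration.

# Illustrative call to univariate_feature_selection defined above.
import numpy as np
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=150, n_features=20, random_state=1)
pvalues = univariate_feature_selection('f_classif', X, y)
print(np.argsort(pvalues)[:5])  # indices of the five most significant features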
Example 5: test_export_pipeline
# Required import: from sklearn import feature_selection [as alias]
# Alternatively: from sklearn.feature_selection import SelectPercentile [as alias]
def test_export_pipeline():
    """Assert that export_pipeline() generates a compilable source file as expected, given a fixed pipeline."""
    pipeline_string = (
        'KNeighborsClassifier(CombineDFs('
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, '
        'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5),SelectPercentile(input_matrix, SelectPercentile__percentile=20)),'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=None)

exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)),
        SelectPercentile(score_func=f_classif, percentile=20)
    ),
    KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform")
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)
Example 6: test_export_pipeline_3
# Required import: from sklearn import feature_selection [as alias]
# Alternatively: from sklearn.feature_selection import SelectPercentile [as alias]
def test_export_pipeline_3():
    """Assert that export_pipeline() generates a compilable source file as expected, given a fixed simple pipeline with a preprocessor."""
    pipeline_string = (
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=None)

exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=20),
    DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)
Example 7: test_export_pipeline_6
# Required import: from sklearn import feature_selection [as alias]
# Alternatively: from sklearn.feature_selection import SelectPercentile [as alias]
def test_export_pipeline_6():
    """Assert that export_pipeline() generates a compilable source file with random_state and data_file_path."""
    pipeline_string = (
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('test_path', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=42)

exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=20),
    DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    exported_code = export_pipeline(pipeline, tpot_obj.operators,
                                    tpot_obj._pset, random_state=42,
                                    data_file_path='test_path')
    assert expected_code == exported_code
Example 8: test_operator_export
# Required import: from sklearn import feature_selection [as alias]
# Alternatively: from sklearn.feature_selection import SelectPercentile [as alias]
def test_operator_export():
    """Assert that a TPOT operator can export properly with a callable function as a parameter."""
    assert list(TPOTSelectPercentile.arg_types) == TPOTSelectPercentile_args
    export_string = TPOTSelectPercentile.export(5)
    assert export_string == "SelectPercentile(score_func=f_classif, percentile=5)"
Example 9: test_get_by_name
# Required import: from sklearn import feature_selection [as alias]
# Alternatively: from sklearn.feature_selection import SelectPercentile [as alias]
def test_get_by_name():
    """Assert that the Operator class returns operators by name appropriately."""
    assert get_by_name("SelectPercentile", tpot_obj.operators).__class__ == TPOTSelectPercentile.__class__
    assert get_by_name("SelectFromModel", tpot_obj.operators).__class__ == TPOTSelectFromModel.__class__
Example 10: test_pipeline_score_save
# Required import: from sklearn import feature_selection [as alias]
# Alternatively: from sklearn.feature_selection import SelectPercentile [as alias]
def test_pipeline_score_save():
    """Assert that the TPOTClassifier can generate a scored pipeline export correctly."""
    tpot_obj = TPOTClassifier()
    tpot_obj._fit_init()
    tpot_obj._pbar = tqdm(total=1, disable=True)
    pipeline_string = (
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.929813743
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=20),
    DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    assert_equal(expected_code, export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset, pipeline_score=0.929813743))
Example 11: _select_percentile
# Required import: from sklearn import feature_selection [as alias]
# Alternatively: from sklearn.feature_selection import SelectPercentile [as alias]
def _select_percentile(self, percentile):
    # The algorithm is conservative: if the percentile corresponds to a
    # fractional number of features, it rounds up rather than down. For
    # example, percentile=18 on a 20-feature matrix implies keeping 3.6
    # features; in that case, 4 features are kept.
    if self._problem == FeatureSelector.CLASSIFICATION:
        score = f_classif
    else:
        score = f_regression
    # percentile must be passed by keyword on recent scikit-learn versions
    self._selector = SelectPercentile(score, percentile=percentile)
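A quick sanity check of the conservative rounding described in the comment, assuming scikit-learn's current SelectPercentile implementation (a threshold derived by linear percentile interpolation over the scores); the random data is purely illustrative.

# percentile=18 on 20 features implies 3.6 features; 4 columns should be kept.
import numpy as np
from sklearn.feature_selection import SelectPercentile, f_classif

rng = np.random.RandomState(0)
X = rng.rand(100, 20)
y = rng.randint(0, 2, size=100)
kept = SelectPercentile(f_classif, percentile=18).fit_transform(X, y)
print(kept.shape[1])  # expected: 4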
Example 12: PlotPerfPercentFeatures
# Required import: from sklearn import feature_selection [as alias]
# Alternatively: from sklearn.feature_selection import SelectPercentile [as alias]
def PlotPerfPercentFeatures(X, y, est=LinearSVC()):
    '''
    Performance of a classifier (default: SVM-ANOVA), varying the
    percentile of features selected with an F-test.
    http://scikit-learn.org/stable/auto_examples/svm/plot_svm_anova.html#example-svm-plot-svm-anova-py
    See also (similar, but with model selection from among several classifiers):
    http://nbviewer.ipython.org/github/bugra/pydata-nyc-2014/blob/master/6.%20Scikit%20Learn%20-%20Model%20Selection.ipynb
    '''
    transform = SelectPercentile(f_classif)
    clf = Pipeline([('anova', transform), ('est', est)])
    # Plot the cross-validation score as a function of the percentile of features
    score_means = list()
    score_stds = list()
    percentiles = (1, 2, 3, 5, 7, 10, 13, 15, 20, 25, 33, 50, 65, 75, 90, 99)
    # percentiles = (1, 5, 10, 25, 50, 75, 90)
    for percentile in percentiles:
        clf.set_params(anova__percentile=percentile)
        # cross-validate with stratified shuffle splits (n_splits/test_size API)
        cv = StratifiedShuffleSplit(n_splits=7, test_size=0.3)
        this_scores = cross_val_score(clf, X, y, cv=cv, n_jobs=-1)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())
    print("Outputting graph:")
    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title('Predictor performance, varying the percent of features used')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction performance')
    plt.axis('tight')
    plt.show()
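The manual percentile loop above can also be phrased as a grid search over the pipeline's anova__percentile parameter. A sketch under the same assumptions (LinearSVC estimator, stratified shuffle splits); the grid values are illustrative.

# Alternative to the manual loop: sweep the percentile with GridSearchCV.
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

def grid_search_percentile(X, y):
    pipe = Pipeline([('anova', SelectPercentile(f_classif)), ('est', LinearSVC())])
    cv = StratifiedShuffleSplit(n_splits=7, test_size=0.3, random_state=0)
    param_grid = {'anova__percentile': [1, 5, 10, 25, 50, 75, 90]}
    grid = GridSearchCV(pipe, param_grid, cv=cv, n_jobs=-1)
    grid.fit(X, y)
    return grid.best_params_, grid.best_score_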
Example 13: plot_BestKFeatures
# Required import: from sklearn import feature_selection [as alias]
# Alternatively: from sklearn.feature_selection import SelectPercentile [as alias]
def plot_BestKFeatures(X_train, y_train):
    '''
    http://nbviewer.ipython.org/github/gmonce/scikit-learn-book/blob/master/Chapter%204%20-%20Advanced%20Features%20-%20Feature%20Engineering%20and%20Selection.ipynb
    Find the best percentile of features to use, using cross-validation
    on the training set, and get the K best features.
    '''
    from sklearn import feature_selection
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score
    # alternative estimator: tree.DecisionTreeClassifier(criterion='entropy')
    dt = RandomForestClassifier(n_jobs=2, bootstrap=True, n_estimators=250, criterion='gini')
    percentiles = range(1, 95, 5)
    results = []
    for i in percentiles:
        # feature_selection.chi2 is an alternative score function (non-negative features only)
        fs = feature_selection.SelectPercentile(feature_selection.f_classif, percentile=i)
        X_train_fs = fs.fit_transform(X_train, y_train)
        scores = cross_val_score(dt, X_train_fs, y_train, cv=4)
        # print(i, scores.mean())
        results = np.append(results, scores.mean())
    optimal_percentile_idx = int(np.argmax(results))
    print("Optimal percentile of features: {0}".format(percentiles[optimal_percentile_idx]), "\n")
    # Plot percentile of features VS. cross-validation scores
    import matplotlib.pyplot as pl
    pl.figure()
    pl.xlabel("Percentile of features selected")
    pl.ylabel("Cross-validation accuracy")
    pl.plot(percentiles, results)
    print("Mean scores:", results)
    return
Example 14: _get_feature_selector
# Required import: from sklearn import feature_selection [as alias]
# Alternatively: from sklearn.feature_selection import SelectPercentile [as alias]
def _get_feature_selector(selector_type):
    """Get a feature selector instance based on the feature_selector model
    parameter.

    Returns:
        (Object): A feature selector which returns a reduced feature matrix, \
            given the full feature matrix, X, and the class labels, y.
    """
    selector = {
        # solver="liblinear" supports the L1 penalty on recent scikit-learn versions
        "l1": SelectFromModel(LogisticRegression(penalty="l1", C=1, solver="liblinear")),
        "f": SelectPercentile(),
    }.get(selector_type)
    return selector
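A sketch of how the returned selector might be dropped into a pipeline; the classifier and step names are assumptions for illustration, not part of the original project.

# Illustrative use of _get_feature_selector defined above.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

selector = _get_feature_selector("f")  # SelectPercentile with default settings
pipe = Pipeline([("select", selector), ("clf", LogisticRegression(max_iter=1000))])
# pipe.fit(X, y) would then fit the selector and the classifier in sequence.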
Example 15: run_feature_selection
# Required import: from sklearn import feature_selection [as alias]
# Alternatively: from sklearn.feature_selection import SelectPercentile [as alias]
import numpy as np
from sklearn.feature_selection import SelectFromModel, SelectKBest, SelectPercentile, f_classif
from sklearn.linear_model import LassoCV

def run_feature_selection(features, labels, feature_selection, best_features):
    features_selected = features
    features_index_sorted = None
    if feature_selection == 'select_K_Best':
        # Fit a univariate selector just to obtain the per-feature scores.
        selector = SelectKBest(score_func=f_classif, k=4)  # score_func=chi2: only for non-negative features
        selector.fit(features, labels)
        # Rank features by score and keep the top `best_features` columns.
        scores = selector.scores_
        features_index_sorted = np.argsort(-scores)
        features_selected = features[:, features_index_sorted[0:best_features]]
    # SelectFromModel with LassoCV:
    # we use LassoCV as the base estimator since the L1 norm promotes sparsity of features.
    if feature_selection == 'LassoCV':
        clf = LassoCV()
        # Set a threshold of 0.95 on the coefficient magnitudes.
        sfm = SelectFromModel(clf, threshold=0.95)
        sfm.fit(features, labels)
        features_selected = sfm.transform(features)
        """
        # Reset the threshold until the number of features equals two.
        # Note that the attribute can be set directly instead of repeatedly
        # fitting the metatransformer.
        while n_features > 2:
            sfm.threshold += 0.1
            X_transform = sfm.transform(X)
            n_features = X_transform.shape[1]
        """
    # Univariate feature selection
    # Univariate feature selection works by selecting the best features based on
    # univariate statistical tests. It can be seen as a preprocessing step for an
    # estimator. Scikit-learn exposes feature selection routines as objects that
    # implement the transform method:
    #  - SelectKBest removes all but the k highest-scoring features
    #  - SelectPercentile removes all but a user-specified highest-scoring percentage of features
    #  - SelectFpr, SelectFdr and SelectFwe apply common univariate statistical tests per
    #    feature: false positive rate, false discovery rate, and family-wise error, respectively
    #  - GenericUnivariateSelect performs univariate feature selection with a configurable
    #    strategy, which allows selecting the best strategy with a hyper-parameter search estimator
    if feature_selection == 'slct_percentile':
        selector = SelectPercentile(f_classif, percentile=10)
        selector.fit(features, labels)
        # The percentile value does not affect the ranking below; we simply take
        # the top `best_features` columns ordered by score.
        scores = selector.scores_
        features_index_sorted = np.argsort(-scores)
        # scores = -np.log10(selector.pvalues_); scores /= scores.max()  # alternative ranking
        features_selected = features[:, features_index_sorted[0:best_features]]
    print("Selected only " + str(features_selected.shape) + " features")
    return features_selected, features_index_sorted
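The comment block in this last example also mentions GenericUnivariateSelect, which none of the examples show directly. A minimal sketch, with illustrative parameter values:

# GenericUnivariateSelect unifies the univariate selectors behind one 'mode' knob;
# mode can be 'percentile', 'k_best', 'fpr', 'fdr' or 'fwe'.
from sklearn.feature_selection import GenericUnivariateSelect, f_classif

transformer = GenericUnivariateSelect(f_classif, mode='percentile', param=10)
# X_reduced = transformer.fit_transform(features, labels)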