本文整理汇总了 Python 中 sklearn.feature_selection.SelectFwe 类的典型用法代码示例。如果您正苦于以下问题:Python SelectFwe 类的具体用法?Python SelectFwe 怎么用?Python SelectFwe 使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了 SelectFwe 类的 14 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: test_select_heuristics_classif
def test_select_heuristics_classif():
    """Check that the fdr, fpr and fwe heuristics agree on an easy problem.

    A SelectFwe filter is fitted once on a well-separated classification
    data set.  For every heuristic, GenericUnivariateSelect must return the
    same reduced matrix, and the SelectFwe support mask must mark exactly
    the first five of the twenty columns.
    """
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    fwe_selector = SelectFwe(f_classif, alpha=0.01)
    X_fwe = fwe_selector.fit(X, y).transform(X)

    # Expected mask: the first five columns are kept, the rest are dropped.
    expected_mask = np.zeros(20)
    expected_mask[:5] = 1

    for mode in ("fdr", "fpr", "fwe"):
        generic_selector = GenericUnivariateSelect(f_classif, mode=mode, param=0.01)
        X_generic = generic_selector.fit(X, y).transform(X)
        assert_array_equal(X_fwe, X_generic)
        assert_array_almost_equal(fwe_selector.get_support(), expected_mask)
示例2: test_boundary_case_ch2
def test_boundary_case_ch2():
    """Boundary case: every selector should keep only the first feature.

    The chi2 scores and p-values of a tiny two-feature data set are pinned
    first; then SelectFdr, SelectKBest, SelectPercentile, SelectFpr and
    SelectFwe are each fitted and must all report the support
    ``[True, False]``.
    """
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    expected_support = np.array([True, False])
    # Same order as the original stanzas: fdr, k-best, percentile, fpr, fwe.
    selectors = (
        SelectFdr(chi2, alpha=0.1),
        SelectKBest(chi2, k=1),
        SelectPercentile(chi2, percentile=50),
        SelectFpr(chi2, alpha=0.1),
        SelectFwe(chi2, alpha=0.1),
    )
    for selector in selectors:
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), expected_support)
示例3: test_select_fwe_classif
def test_select_fwe_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fwe heuristic.

    SelectFwe and GenericUnivariateSelect(mode="fwe") must produce the same
    reduced matrix, and the support mask may differ from "first five columns
    selected" in fewer than two positions.
    """
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    fwe_selector = SelectFwe(f_classif, alpha=0.01)
    X_fwe = fwe_selector.fit(X, y).transform(X)

    generic_selector = GenericUnivariateSelect(f_classif, mode="fwe", param=0.01)
    X_generic = generic_selector.fit(X, y).transform(X)
    assert_array_equal(X_fwe, X_generic)

    support = fwe_selector.get_support()
    expected_mask = np.zeros(20)
    expected_mask[:5] = 1

    # Count the positions where the mask deviates from the expectation.
    mismatches = np.sum(np.abs(support - expected_mask))
    assert mismatches < 2
示例4: test_select_fwe_4
def test_select_fwe_4():
    """Ensure that the TPOT select fwe outputs the same result as sklearn fwe when 0.001 < alpha < 0.05"""
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']

    # Only the rows tagged as 'training' are used to fit the reference selector.
    is_training = training_testing_data['group'] == 'training'
    training_features = training_testing_data.loc[is_training].drop(non_feature_columns, axis=1)
    training_class_vals = training_testing_data.loc[is_training, 'class'].values

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        reference = SelectFwe(f_classif, alpha=0.042)
        reference.fit(training_features, training_class_vals)
        kept_idx = reference.get_support(True)

    expected_cols = list(training_features.iloc[:, kept_idx].columns) + non_feature_columns
    expected_df = training_testing_data[expected_cols]
    actual_df = tpot_obj._select_fwe(training_testing_data, 0.042)
    assert np.array_equal(actual_df, expected_df)
示例5: test_select_fwe_regression
def test_select_fwe_regression():
    """Check SelectFwe on a simple regression problem.

    SelectFwe and GenericUnivariateSelect(mode="fwe") must produce the same
    reduced matrix.  The first five columns must all be selected, and fewer
    than two of the remaining fifteen columns may be selected.
    """
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode="fwe", param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    support = univariate_filter.get_support()
    # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
    # NumPy 1.20 and later removed, so it fails on current NumPy releases.
    assert_array_equal(support[:5], np.ones((5,), dtype=bool))
    # Plain assert instead of the legacy ``assert_less`` helper.
    assert np.sum(support[5:] == 1) < 2
示例6: test_select_fwe_regression
def test_select_fwe_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the fwe heuristic.

    All of the first five columns must be selected and fewer than two of
    the remaining columns may be selected.
    """
    X, y = make_regression(
        n_samples=200,
        n_features=20,
        n_informative=5,
        shuffle=False,
        random_state=0,
    )

    fwe_selector = SelectFwe(f_regression, alpha=0.01)
    X_fwe = fwe_selector.fit(X, y).transform(X)

    generic_selector = GenericUnivariateSelect(f_regression, mode='fwe', param=0.01)
    X_generic = generic_selector.fit(X, y).transform(X)
    assert_array_equal(X_fwe, X_generic)

    support = fwe_selector.get_support()
    leading, trailing = support[:5], support[5:]
    assert (leading == 1).all()
    assert np.sum(trailing == 1) < 2
示例7: _select_fwe
def _select_fwe(self, input_df, alpha):
    """Filter feature columns with scikit-learn's SelectFwe (family-wise error rate).

    Parameters
    ----------
    input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
        Input DataFrame to perform feature selection on
    alpha: float
        Significance level passed to SelectFwe; values outside
        [0.001, 0.05] are clamped to the nearest bound

    Returns
    -------
    subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'class', 'group']}
        Copy of input_df restricted to the selected feature columns followed
        by 'guess', 'class' and 'group'.  When input_df has no feature
        columns at all, an unfiltered copy of input_df is returned.

    """
    training_rows = input_df['group'] == 'training'
    training_features = input_df.loc[training_rows].drop(['class', 'group', 'guess'], axis=1)
    training_class_vals = input_df.loc[training_rows, 'class'].values

    # Nothing to select from: hand back an untouched copy.
    if len(training_features.columns.values) == 0:
        return input_df.copy()

    # forcing 0.001 <= alpha <= 0.05
    alpha = min(max(alpha, 0.001), 0.05)

    with warnings.catch_warnings():
        # Ignore warnings about constant features
        warnings.simplefilter('ignore', category=UserWarning)
        selector = SelectFwe(f_classif, alpha=alpha)
        selector.fit(training_features, training_class_vals)
        kept_idx = selector.get_support(True)

    kept_cols = list(training_features.iloc[:, kept_idx].columns)
    kept_cols += ['guess', 'class', 'group']
    return input_df[kept_cols].copy()
示例8: main
def main(args):
    """Score a random forest on FWE / K-best filtered features, then run RFECV.

    Parameters
    ----------
    args : object with a ``train_dir`` attribute
        Directory that contains ``trainingSetFeatures.csv``.  When
        ``train_dir`` is ``None`` it is overwritten with a hard-coded default.

    Notes
    -----
    Side effects: sets pandas display options and matplotlib tick label
    sizes, changes the working directory to ``args.train_dir``, prints shapes
    and scores, and finally calls ``altPlotFeaturesImportance``.
    """
    if args.train_dir is None:
        # Earlier default locations, kept for reference:
        # args.train_dir = '/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/chap/train/'
        # args.train_dir = '/cs/prt3/danofer/ProtFeat/feat_extract/test_seq/NP/SPCleaved_NP-70+NEG-30_Big-V3/'
        # args.train_dir = r'D:\SkyDrive\Dropbox\bioInf_lab\AA_info\CODE\feat_extract\test_seq\NP\SPCleaved_NP-70+NEG-30_Big-V3'
        # args.train_dir = r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\NP\SP_Cleaved+NP+Neg_Big'
        args.train_dir = r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\Benchmarks\Thermophiles'
        print("Using default train_dir: %s" % args.train_dir)

    # Display / plotting configuration.
    pandas.set_option('display.max_columns', 10)
    pandas.set_option('display.max_rows', 4)
    # mpl.rc('title', labelsize=6)
    mpl.rc('ytick', labelsize=7)
    mpl.rc('xtick', labelsize=4)

    os.chdir(args.train_dir)
    dataName = 'Neuropeptides'

    # Load the feature table; every column except the id/label ones is a feature.
    df = pandas.read_csv('trainingSetFeatures.csv')
    non_feature_cols = ['classname', 'Id', 'proteinname']
    feature_cols = numpy.array([col for col in df.columns if col not in non_feature_cols])
    X = df[feature_cols].values
    y = LabelEncoder().fit_transform(df.classname.values)

    # Initial feature selection trimming
    print(X.shape)
    fwe_filter = SelectFwe(alpha=0.01).fit(X, y)
    X = fwe_filter.transform(X)
    print("F-test -> ",X.shape)
    feature_cols = feature_cols[fwe_filter.get_support()]

    # Disabled alternative (was an inert string block): L1 SVM feature selection.
    # FeatSelection_SVM = True
    # if FeatSelection_SVM == True:
    #     svc_L1 = LinearSVC(C=50, penalty="l1", dual=False,class_weight='auto').fit(X, y)
    #     X = svc_L1.transform(X, y)
    #     print ("L1 SVM Transformed X:",X_L1.shape)
    #     feature_cols=feature_cols[list(set(np.where(svc_L1.coef_ != 0)[-1]))]

    kbest_filter = SelectKBest(k=255).fit(X, y)
    X = kbest_filter.transform(X)
    feature_cols = feature_cols[kbest_filter.get_support()]

    # Hyper-parameter grid; not consumed anywhere in this function.
    param_dist = {
        "max_depth": [6, 9, None],
        "max_features": ['auto', 0.4],
        "min_samples_leaf": [1, 2, 3],
        "bootstrap": [True, False],
        'min_samples_split': [2, 3],
        "criterion": ["gini"],
        "n_estimators": [100],
        "n_jobs": [-1],
    }

    rf = RandomForestClassifierWithCoef(
        max_depth=7,
        min_samples_split=1,
        min_samples_leaf=2,
        n_estimators=50,
        n_jobs=2,
        max_features="auto",
    )

    def new_split():
        # A fresh splitter per evaluation, exactly as the inline calls did.
        return cross_validation.StratifiedShuffleSplit(y, n_iter=8, test_size=0.2)

    # WARNING! F1 Score as implemented by Default in binary classification
    # (two classes) gives the score for 1 class.
    scores = cross_validation.cross_val_score(rf, X, y, n_jobs=-1, cv=new_split())
    print("X RF Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2))

    # Instead of scores_f1, we could also use precision, sensitivity,
    # MCC (if binary), etc'.
    scores_f1 = cross_validation.cross_val_score(rf, X, y, n_jobs=-1, cv=new_split(), scoring='f1')
    print("X RF f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

    # rfeSelect = RFE(estimator=rf,n_features_to_select=16, step=0.04)
    rfeSelect = RFECV(estimator=rf, step=20, cv=2, scoring='f1')  # average_precision , recall
    X_RFE = rfeSelect.fit_transform(X, y)
    print(X_RFE.shape)

    RFE_FeatureNames = feature_cols[rfeSelect.get_support()]
    print(RFE_FeatureNames)

    # f1 on the RFE-reduced matrix, as a percentage of the pre-RFE f1.
    rfe_f1_mean = cross_validation.cross_val_score(
        rf, X_RFE, y, n_jobs=-1, cv=new_split(), scoring='f1'
    ).mean()
    RFE_ScoreRatio = 100 * rfe_f1_mean / scores_f1.mean()
    print("Even with just",X_RFE.shape[1]," features, we have %f performance! (f1 score ratio)" %(RFE_ScoreRatio))

    # PlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
    print("Alt plot:")
    altPlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
示例9: GetAllPerf
def GetAllPerf (filePaths=None):
if filePaths is None:
filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))
#Sanity check:
# filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile']
# filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv']
print("FilePaths: \n",filePaths)
fileNames=fileNameFromPaths (filePaths)
print("FileNames:",fileNames)
resDict = pd.DataFrame(index=fileNames,
columns=['Accuracy','Accuracy_SD',
'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1',
'LargestClassPercent','Classes',
# 'TopRFE-Features','Best (f1) Model parameters',
'# Classes',
'Array-Acc-Scores' ,'Array-f1-Scores'
,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted'])
#redDict holds results for each file/class, for saving to output-file
i=-1
for filePath in filePaths:
i +=1
'http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/'
filePath = os.path.normpath(filePath)
print(filePath)
fileName=str(fileNames[i]) #Str added now 14.1
print("fileName: %s" %(fileName))
"resDict['Name']= fileName"
# filePath = str(argv[1])
# X, y, lb_encoder,featureNames = load_data(filePath+fileName, 'file') # X, y = features, labels
X, y, lb_encoder,featureNames = load_data(filePath, 'file') # X, y = features, labels
print(X.shape,"= (samples, features)")
y_inv = Counter(lb_encoder.inverse_transform(y))
MajorityPercent = round(100*y_inv.most_common()[0][1]/sum(y_inv.values()),1)
print("Classes:", lb_encoder.classes_)
print("MajorityClassPercent:", MajorityPercent)
resDict.LargestClassPercent[fileName] = MajorityPercent
resDict.Classes[fileName] = str(lb_encoder.classes_)
resDict["# Classes"][fileName]=len(lb_encoder.classes_)
KFilt=None
KFilt=350 #This is just temporary for the outputs - saves computation time. Barely filters compared to the model itself.
if KFilt is not None:
k = SelectKBest(k=KFilt).fit(X,y)
X=k.transform(X)
featureNames=featureNames[k.get_support()]
Fwe = SelectFwe(alpha=0.01).fit(X,y)
X=Fwe.transform(X)
featureNames=featureNames[Fwe.get_support()]
print("X reduced to K best features: ",X.shape)
FeatSelection_SVM=False #Feature Names need updating!!
FeatSelection_RandLogReg=False
if FeatSelection_RandLogReg == True:
LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5,
sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y)
X_L1 = LogRegFeats.transform(X)
featureNames=featureNames[LogRegFeats.get_support()]
print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape)
elif FeatSelection_SVM == True:
svc_L1= LinearSVC(C=30, penalty="l2", dual=False,class_weight='auto').fit(X, y)
X_L1 = svc_L1.transform(X, y)
featureNames=featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
print ("L1 SVM Transformed X:",X_L1.shape)
# X=X_L1
'''
print("Performance as a function of percent of features used:")
PlotPerfPercentFeatures(X,y,est=LinearSVC())
'''
'EG - graph best features; feature selection using RF, ensemble classifiers..'
'http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb'
RFE_FeatsToKeep = 16
FeatSelection_RFE=False
FeatSelection_RFECV=False
if (FeatSelection_RFE or FeatSelection_RFECV) == True:
'RFE + - best feats'
'http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html '
svc = LinearSVC(class_weight='auto')#,penalty='l1',dual=False)
# svc = LogisticRegression(class_weight='auto')#,C=1)
#.........这里部分代码省略.........
示例10: LabelEncoder
# Keep the column names in an array so they can be indexed by a boolean mask.
feature_cols = np.array(feature_cols)

# In[ ]:
# Feature matrix and raw class labels.
X = df[feature_cols].values
y = df.classname.values

# In[ ]:
# Encode the class labels as integers.
le = LabelEncoder()
y = le.fit_transform(y)

# In[ ]:
# Family-wise-error filter (SelectFwe's default scoring function), with the
# column names kept in sync with the reduced matrix.
print("Orig X -> ", X.shape)
Fwe = SelectFwe(alpha=0.001).fit(X, y)
X = Fwe.transform(X)
print("F-test -> ", X.shape)
feature_cols = feature_cols[Fwe.get_support()]

# In[ ]:
rf = RandomForestClassifierWithCoef(
    max_depth=9,
    min_samples_split=3,
    min_samples_leaf=3,
    n_estimators=650,
    n_jobs=-1,
    max_features="auto",
)

# In[ ]:
# Accuracy over 7 stratified shuffle splits, each holding out 30% of the rows.
scores = cross_val_score(
    rf, X, y, n_jobs=-1, cv=StratifiedShuffleSplit(y, n_iter=7, test_size=0.3)
)
print("X RF Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2))
# scores_f1 = cross_val_score(rf,X,y,n_jobs=-1,cv=StratifiedShuffleSplit(y,n_iter=10,test_size=0.22),scoring='f1')
# print("X RF f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))
示例11: SelectFwe
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from PipeTasks import Get_yPred,balance_weights
# Import some data to play with
#########################################
# NOTE: the working directory is switched to a hard-coded data location.
os.chdir(r'/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile')
## os.chdir(r'/cs/prt3/danofer/ProtFeat/feat_extract/test_seq/NP/SP_Cleaved+NP+Neg_Big')

df = pd.read_csv('trainingSetFeatures.csv')
## df.drop('proteinname',axis=1, inplace=True)

# Every column except the id/label columns is treated as a feature.
feature_cols = [
    col for col in df.columns
    if col not in ['classname', 'Id', 'proteinname']
]
X = df[feature_cols].values
y = df.classname.values

# The FWE filter is fitted on the raw labels, before they are encoded below.
Fwe = SelectFwe(alpha=0.01).fit(X, y)
X = Fwe.transform(X)

le = LabelEncoder()
y = le.fit_transform(y)

# Binarize the output
# y = label_binarize(y, classes=[0, 1, 2])
# y = label_binarize(y)
## n_classes = y.shape[1]
n_classes = len(set(y))
target_names = list(le.classes_)
print ("n_classes",n_classes,"target_names",target_names)

# shuffle and split training and test sets
## X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
##                                                     random_state=0)
示例12: parse_labeled_data
# Python 2 script fragment: baseline LinearSVC report on labelled tweets,
# followed by SelectFwe feature selection on the training matrix.
filename = "../training_data/ordered_tweets_no_duplicates.txt"
tweets_and_labels = parse_labeled_data(filename)
# print tweets_and_labels
# random.shuffle(tweets_and_labels)
# NOTE(review): get_x_y appears to return (labels, features) in that order,
# judging by how Y and X are used below -- confirm against its definition.
Y, X = get_x_y(tweets_and_labels)
# X, Y = make_moons(noise=0.3, random_state=0)
# print X, Y
# print nX[0], nY[0]
# splitting training and test set (80% train / 20% test, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)
# C = regularization parameter (keeps from overfitting): C is the degree of penalty (L1 or L2) (powers of 10)
# NOTE(review): in scikit-learn, C is documented as the inverse of the
# regularization strength (larger C = weaker penalty) -- confirm the wording above.
# penalty sparse = l2 lowers angle so that no unigram can be super weighted, l1 removes features to shift the curve
# TODO: separate into train test eval
# NOTE(review): alpha=700.0 is far outside the usual (0, 1) range for a
# significance level; presumably chosen so the per-feature threshold stays
# loose with a very large vocabulary -- confirm the intent.
fs = SelectFwe(alpha=700.0)
print "Before", x_train.shape
# Baseline: train and report on the full, unfiltered feature set first.
clf = svm.LinearSVC(C=100, penalty="l2", dual=False)
clf.fit(x_train, y_train)
print "NO FEATURE SELECTION"
print "Training Accuracy"
print clf.decision_function(x_train)
print (classification_report(y_train, clf.predict(x_train), target_names=target_names))
print "Testing Accuracy"
print (classification_report(y_test, clf.predict(x_test), target_names=target_names))
# Only now is the selector fitted; x_train is replaced by its reduced version.
# x_test is not transformed within the visible lines.
x_train = fs.fit_transform(x_train, y_train)
示例13: train_test_split
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import SelectFwe
from sklearn.feature_selection import f_classif
from sklearn.neighbors import KNeighborsClassifier
# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(
    tpot_data.index,
    stratify=tpot_data['class'].values,
    train_size=0.75,
    test_size=0.25,
)

# Family-wise-error feature selection, fitted on the training rows only.
result1 = tpot_data.copy()
training_features = result1.loc[training_indices].drop(['class', 'group', 'guess'], axis=1)
training_class_vals = result1.loc[training_indices, 'class'].values

if len(training_features.columns.values) > 0:
    selector = SelectFwe(f_classif, alpha=0.05)
    selector.fit(training_features.values, training_class_vals)
    mask = selector.get_support(True)
    # Keep the selected feature columns plus the label column.
    mask_cols = list(training_features.iloc[:, mask].columns) + ['class']
    result1 = result1[mask_cols]
else:
    # No feature columns to select from: carry the frame forward unchanged.
    result1 = result1.copy()

# Perform classification with a k-nearest neighbor classifier
knnc2 = KNeighborsClassifier(n_neighbors=min(8, len(training_indices)))
knnc2.fit(
    result1.loc[training_indices].drop('class', axis=1).values,
    result1.loc[training_indices, 'class'].values,
)

# Predictions are written for every row, training and testing alike.
result2 = result1.copy()
result2['knnc2-classification'] = knnc2.predict(result2.drop('class', axis=1).values)
示例14: str
# TODO: Allow user to select desired function - CV model, or feature reduction
# TODO: Use os.path.join - for file names/locations/dirs..

# Set by user input: the first command-line argument is the data directory.
fileName = r'/trainingSetFeatures.csv'
filePath = str(argv[1])

X, y, lb_encoder, featureNames = load_data(filePath + fileName, 'file')  # X, y = features, labels
print(X.shape,"= (samples, features)")

# Class distribution, reported with the original (decoded) label names.
y_inv = Counter(lb_encoder.inverse_transform(y))
print("Classes:", y_inv)

# 'Normalize/Scale features if needed. Our data is standardized by default'
# X = StandardScaler(copy=False).fit_transform(X)

# Family-wise-error filter, with featureNames kept aligned to the reduced X.
Fwe = SelectFwe(alpha=0.01).fit(X, y)
X = Fwe.transform(X)
featureNames = featureNames[Fwe.get_support()]
print("F-test filter ->",X.shape)

# Switches for the optional second-stage feature selection that follows.
FeatSelection_SVM = True
FeatSelection_RandLogReg = False
if FeatSelection_RandLogReg == True:
LogRegFeats = RandomizedLogisticRegression(C=5, scaling=0.5,
sample_fraction=0.8, n_resampling=60, selection_threshold=0.2,n_jobs=-1)
X = LogRegFeats.fit_transform(X,y)
featureNames=featureNames[LogRegFeats.get_support()]
print("RandomizedLogisticRegression Feature Selection ->:",X.shape)
elif FeatSelection_SVM == True: