This article collects typical usage examples of the Python method sklearn.feature_selection.SelectKBest. If you are wondering what feature_selection.SelectKBest does, how to call it, or what real-world usage looks like, the curated code examples below may help. You can also explore the containing module, sklearn.feature_selection, for further usage examples.
The following shows 15 code examples of feature_selection.SelectKBest, sorted by popularity by default.
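Before the examples, here is a minimal, self-contained sketch of the typical SelectKBest workflow (the synthetic dataset and variable names below are illustrative, not taken from any of the examples):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif

# Synthetic data: 100 samples, 10 features, of which only a few are informative.
X, y = make_classification(n_samples=100, n_features=10, n_informative=3, random_state=0)

# Keep the 3 features with the highest ANOVA F-scores.
selector = SelectKBest(score_func=f_classif, k=3)
X_reduced = selector.fit_transform(X, y)

print(X_reduced.shape)              # (100, 3)
print(selector.get_support(True))   # integer indices of the kept columns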
Example 1: GetSelectedFeatureIndex
# Required import: from sklearn import feature_selection [as alias]
# or: from sklearn.feature_selection import SelectKBest [as alias]
def GetSelectedFeatureIndex(self, data_container):
    data = data_container.GetArray()
    data /= np.linalg.norm(data, ord=2, axis=0)  # scale each feature column to unit L2 norm
    label = data_container.GetLabel()
    if data.shape[1] < self.GetSelectedFeatureNumber():
        print(
            'ANOVA: The number of features {:d} in data container is smaller than the required number {:d}'.format(
                data.shape[1], self.GetSelectedFeatureNumber()))
        self.SetSelectedFeatureNumber(data.shape[1])
    fs = SelectKBest(f_classif, k=self.GetSelectedFeatureNumber())
    fs.fit(data, label)
    feature_index = fs.get_support(True)  # integer indices of the selected features
    f_value, p_value = f_classif(data, label)
    return feature_index.tolist(), f_value, p_value
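Note that f_classif, called directly on the last two lines above, returns the raw per-feature statistics rather than a fitted selector; a minimal self-contained sketch (synthetic data, names illustrative):

from sklearn.datasets import make_classification
from sklearn.feature_selection import f_classif

X, y = make_classification(n_samples=60, n_features=8, random_state=0)
f_value, p_value = f_classif(X, y)    # two arrays, one entry per feature
print(f_value.shape, p_value.shape)   # (8,) (8,)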
Example 2: get_model
def get_model(with_pipeline=False):
    """Get a multi-layer perceptron model.

    Optionally, put it in a pipeline that scales the data.
    """
    model = NeuralNetClassifier(MLPClassifier)
    if with_pipeline:
        model = Pipeline([
            ('scale', FeatureUnion([
                ('minmax', MinMaxScaler()),
                ('normalize', Normalizer()),
            ])),
            ('select', SelectKBest(k=N_FEATURES)),  # keep input size constant
            ('net', model),
        ])
    return model
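The SelectKBest step exists because the FeatureUnion concatenates the MinMaxScaler and Normalizer outputs, doubling the feature count; selecting k=N_FEATURES trims the width back. A self-contained sketch of the same idea, with a plain sklearn classifier standing in for the skorch net (the stand-in is an assumption, not the original model):

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer

N_FEATURES = 10
X, y = make_classification(n_samples=100, n_features=N_FEATURES, random_state=0)

pipe = Pipeline([
    ('scale', FeatureUnion([
        ('minmax', MinMaxScaler()),
        ('normalize', Normalizer()),
    ])),                                    # union output has 2 * N_FEATURES columns
    ('select', SelectKBest(k=N_FEATURES)),  # trim back to N_FEATURES
    ('clf', LogisticRegression()),
])
pipe.fit(X, y)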
Example 3: featuresFromFeatureSelection
def featuresFromFeatureSelection(X, Y, columnNames):
    for f in columnNames:
        print(f)
    X_new_withfitTransform = SelectKBest(chi2, k=34).fit(X, Y)
    colors = getColorNames()
    counter = 0
    scores = X_new_withfitTransform.scores_
    scores_scaled = np.divide(scores, 1000)
    for score in scores_scaled:
        # if score > 10:
        #     print('Feature {:>34}'.format(columnNames[counter]))
        print('{:>34} '.format(score))
        # plot one bar per feature
        plt.bar(counter, score, color=colors[counter])
        counter += 1
    plt.ylabel('Scores (1k)')
    plt.title('Scores calculated by Chi-Square Test')
    plt.legend(columnNames, bbox_to_anchor=(0., 0.8, 1., .102), loc=3,
               ncol=5, mode="expand", borderaxespad=0.)
    plt.show()
    # print(feature_selection.chi2(X, Y))
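The scores_ attribute read above is populated when the selector is fitted; a minimal sketch (synthetic non-negative data, since chi2 requires non-negative feature values):

import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

rng = np.random.RandomState(0)
X = rng.randint(0, 10, size=(100, 6))   # chi2 requires non-negative features
Y = rng.randint(0, 2, size=100)

selector = SelectKBest(chi2, k=3).fit(X, Y)
print(selector.scores_)    # one chi-square statistic per input feature
print(selector.pvalues_)   # matching p-values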
Example 4: test_export_to_sklearn_pipeline3
def test_export_to_sklearn_pipeline3(self):
    from lale.lib.lale import ConcatFeatures
    from lale.lib.sklearn import PCA
    from lale.lib.sklearn import KNeighborsClassifier, LogisticRegression, SVC
    from sklearn.feature_selection import SelectKBest
    from lale.lib.sklearn import Nystroem
    from sklearn.pipeline import FeatureUnion

    lale_pipeline = (
        (PCA() >> SelectKBest(k=2))
        & (Nystroem(random_state=42) >> SelectKBest(k=3))
        & SelectKBest(k=3)
    ) >> ConcatFeatures() >> SelectKBest(k=2) >> LogisticRegression()
    trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
    sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()
    self.assertIsInstance(sklearn_pipeline.named_steps['featureunion'], FeatureUnion)
    self.assertIsInstance(sklearn_pipeline.named_steps['selectkbest'], SelectKBest)
    from sklearn.linear_model import LogisticRegression
    self.assertIsInstance(sklearn_pipeline.named_steps['logisticregression'], LogisticRegression)
    self.assert_equal_predictions(sklearn_pipeline, trained_lale_pipeline)
Example 5: get_top_k
def get_top_k(self):
    columns = list(self.data.columns.values)
    columns.remove(self.target)
    if self.objective:  # regression objective
        top_k_vars = SelectKBest(f_regression, k=self.top_k)
        top_k_vars.fit_transform(self.data[columns], self.data[self.target])
    else:
        # remove intercept from top_k
        columns.remove('intercept')
        try:
            top_k_vars = SelectKBest(chi2, k=self.top_k)
            top_k_vars.fit_transform(self.data[columns], self.data[self.target])
        except ValueError:  # chi2 rejects negative feature values
            top_k_vars = SelectKBest(f_classif, k=self.top_k)
            top_k_vars.fit_transform(self.data[columns], self.data[self.target])
    return [columns[i] for i in top_k_vars.get_support(indices=True)]
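The try/except above falls back from chi2 to f_classif because chi2 raises a ValueError whenever the input contains negative values, while the ANOVA F-test is indifferent to sign. A minimal sketch of that behavior (toy data, illustrative only):

import numpy as np
from sklearn.feature_selection import SelectKBest, chi2, f_classif

X = np.array([[1.0, -0.5], [2.0, 0.3], [0.5, -1.2], [1.5, 0.8]])
y = np.array([0, 1, 0, 1])

try:
    SelectKBest(chi2, k=1).fit(X, y)        # raises: second column has negative values
except ValueError:
    SelectKBest(f_classif, k=1).fit(X, y)   # the ANOVA F-test accepts any sign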
Example 6: find_best_feature_selections
def find_best_feature_selections(X, y):
    # select the best features using different techniques
    X_new = SelectKBest(chi2, k=80).fit_transform(X, y)
    X_new1 = SelectPercentile(chi2, percentile=20).fit_transform(X, y)
    X_new2 = SelectKBest(f_classif, k=80).fit_transform(X, y)  # this one has the best performance
    X_new22 = SelectPercentile(f_classif, percentile=20).fit_transform(X, y)
    X_new3 = SelectKBest(f_classif, k=70).fit_transform(X, y)
    X_new4 = SelectKBest(f_classif, k=60).fit_transform(X, y)
    print(X_new.shape)
    # selection_parameters_for_classfier(X_new, y)
    # print(y.shape)
    train_and_test(X_new, y)
    train_and_test(X_new1, y)
    train_and_test(X_new2, y)
    train_and_test(X_new22, y)
    train_and_test(X_new3, y)
    train_and_test(X_new4, y)

# X, y = _dataset_sample()

################################ PARAMETER Selected ################################
# TODO: some problem happens when using the parameter max_leaf_nodes in Dtree and RandomForest
Example 7: feature_select
def feature_select(corpus, labels, k=1000):
    """
    Select the top k features through a chi-square test.
    """
    bin_cv = CountVectorizer(binary=True)
    le = LabelEncoder()
    X = bin_cv.fit_transform(corpus)
    y = le.fit_transform(labels).reshape(-1, 1)

    k = min(X.shape[1], k)  # never ask for more features than exist
    skb = SelectKBest(chi2, k=k)
    skb.fit(X, y)

    feature_ids = skb.get_support(indices=True)
    feature_names = bin_cv.get_feature_names()
    vocab = {}
    for new_fid, old_fid in enumerate(feature_ids):
        feature_name = feature_names[old_fid]
        vocab[feature_name] = new_fid
    # we only care about the final extracted feature vocabulary
    return vocab
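The returned vocabulary is typically fed back into a fresh vectorizer so that later documents are encoded over the selected terms only. A short usage sketch (the toy corpus is illustrative, and it assumes the function above plus its imports are in scope):

from sklearn.feature_extraction.text import CountVectorizer

corpus = ["spam spam offer", "meeting agenda notes", "offer expires now"]
labels = ["spam", "ham", "spam"]

vocab = feature_select(corpus, labels, k=4)  # the function defined above

# Re-vectorize using only the selected vocabulary:
cv = CountVectorizer(vocabulary=vocab, binary=True)
X_reduced = cv.transform(corpus)
print(X_reduced.shape)  # (3, 4)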
Example 8: test_objectmapper
def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.feature_selection.GenericUnivariateSelect,
                  fs.GenericUnivariateSelect)
    self.assertIs(df.feature_selection.SelectPercentile,
                  fs.SelectPercentile)
    self.assertIs(df.feature_selection.SelectKBest, fs.SelectKBest)
    self.assertIs(df.feature_selection.SelectFpr, fs.SelectFpr)
    self.assertIs(df.feature_selection.SelectFromModel,
                  fs.SelectFromModel)
    self.assertIs(df.feature_selection.SelectFdr, fs.SelectFdr)
    self.assertIs(df.feature_selection.SelectFwe, fs.SelectFwe)
    self.assertIs(df.feature_selection.RFE, fs.RFE)
    self.assertIs(df.feature_selection.RFECV, fs.RFECV)
    self.assertIs(df.feature_selection.VarianceThreshold,
                  fs.VarianceThreshold)
Example 9: test_pipeline
def test_pipeline(self):
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_regression
    from sklearn.pipeline import Pipeline

    diabetes = datasets.load_diabetes()
    models = ['OLS', 'GLS', 'WLS', 'GLSAR', 'QuantReg', 'GLM', 'RLM']
    for model in models:
        klass = getattr(sm, model)

        selector = SelectKBest(f_regression, k=5)
        estimator = Pipeline([('selector', selector),
                              ('reg', base.StatsModelsRegressor(klass))])

        estimator.fit(diabetes.data, diabetes.target)
        result = estimator.predict(diabetes.data)

        data = SelectKBest(f_regression, k=5).fit_transform(diabetes.data, diabetes.target)
        expected = klass(diabetes.target, data).fit().predict(data)
        self.assert_numpy_array_almost_equal(result, expected)
Example 10: featureFitting
def featureFitting(filename, X, y, featureNames, optimalFlag, kbest=20, alpha=0.05, model=None):
    '''
    Gets the k best features (filtered by FDR, then the best ranked by the
    default ANOVA F-test; more advanced options could be implemented).
    Saves the data matrix with the kept features to a new output file, "REDUCED_Feat.csv".
    Returns the reduced dataframe, the FDR selector, and the k-best selector.
    '''
    a = alpha
    FD = SelectFdr(alpha=a)
    X = FD.fit_transform(X, y)
    selectK = SelectKBest(k=kbest)
    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    # selectK was fitted on the FDR-reduced matrix, so its mask must be
    # composed with the FDR mask before indexing the original feature space:
    combined_mask = FD.get_support().copy()
    combined_mask[combined_mask] = selectK_mask
    K_featnames = featureNames[combined_mask]
    print("K_featnames: %s" % (K_featnames))
    Reduced_df = pd.read_csv(filename, index_col=0)
    Reduced_df = Reduced_df[Reduced_df.columns[combined_mask]]
    Reduced_df.to_csv('REDUCED_Feat.csv')
    return Reduced_df, FD, selectK
Example 11: ReducedFeaturesDF
def ReducedFeaturesDF(X, y):
    '''
    Returns a dataframe with only a subset of features/columns retained
    '''
    from sklearn.feature_selection import RFE
    # 'squared_hinge' and 'balanced' replace the deprecated loss='l2' and class_weight='auto'
    est = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, class_weight='balanced')
    # selectK = SelectKBest(score_func=f_classif, k=45)
    selectRFE = RFE(estimator=est, n_features_to_select=22, step=0.15)
    selectK = selectRFE
    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    K_featnames = feature_names[selectK_mask]  # relies on a module-level feature_names
    print("reduced RFE features:")
    print(K_featnames)
    Reduced_df = pd.read_csv(filename, index_col=0)  # relies on a module-level filename
    Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
    # Reduced_df.to_csv('REDUCED_Feat.csv')
    return Reduced_df

# ReducedFeaturesDF(X, y)
# z = pd.DataFrame(data=X_SGD, index=y)
# z.to_csv('REDUCED_Feat.csv')
Example 12: fit
def fit(self, X, y):
    self.selector = SelectKBest(f_classif, k=self.max_features)
    self.selector.fit(X, y)
    X_train = self.selector.transform(X)
    y_train = y

    # Build one random subsample of the reduced training data per estimator.
    param_list = []
    idx = list(range(len(y_train)))  # list() so random.shuffle works under Python 3
    for i in range(self.n_estimators):
        random.shuffle(idx)
        param_list.append((X_train[idx[:self.max_samples]],
                           y_train[idx[:self.max_samples]]))

    # Train the base classifiers in parallel.
    pool = ThreadPool(cpu_count())
    self.clf_list = pool.map(self._prepare_classifier, param_list)
    pool.close()
    pool.join()

    """
    X2 = []
    for clf in self.clf_list:
        P = clf.predict_proba(X_train)
        if len(X2) == 0:
            X2 = P[:, 0]
        else:
            X2 = numpy.vstack((X2, P[:, 0]))
    X2 = numpy.swapaxes(X2, 0, 1)
    print("X2:", X2.shape)
    from sklearn.ensemble import RandomForestClassifier
    self.clf2 = RandomForestClassifier(n_estimators=100)
    self.clf2.fit(X2, y_train)
    """
Example 13: test_bagging_with_pipeline
def test_bagging_with_pipeline():
    estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
                                                DecisionTreeClassifier()),
                                  max_features=2)
    estimator.fit(iris.data, iris.target)
    assert isinstance(estimator[0].steps[-1][1].random_state, int)
Example 14: test_feature_selection
def test_feature_selection():
    # make two feature dicts with two useful features and a bunch of useless
    # ones, in terms of chi2
    d1 = dict([("useless%d" % i, 10) for i in range(20)],
              useful1=1, useful2=20)
    d2 = dict([("useless%d" % i, 10) for i in range(20)],
              useful1=20, useful2=1)

    for indices in (True, False):
        v = DictVectorizer().fit([d1, d2])
        X = v.transform([d1, d2])
        sel = SelectKBest(chi2, k=2).fit(X, [0, 1])

        v.restrict(sel.get_support(indices=indices), indices=indices)
        assert_equal(v.get_feature_names(), ["useful1", "useful2"])
Example 15: mkchi2
def mkchi2(k):
    """Make k-best chi2 selector"""
    return SelectKBest(chi2, k=k)
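A hedged usage sketch of such a factory; the pipeline wiring below is illustrative, not from the source project:

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

def mkchi2(k):
    """Make k-best chi2 selector"""
    return SelectKBest(chi2, k=k)

# The factory makes it easy to swap the selector width without
# rebuilding the rest of the pipeline:
pipe = Pipeline([('select', mkchi2(10)), ('clf', LogisticRegression())])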