This article collects typical usage examples of the Python method sklearn.feature_selection.SelectFpr.fit. If you are wondering what exactly SelectFpr.fit does, how to call it, or how it is used in practice, the curated code samples below may help. You can also explore further usage examples of the class it belongs to, sklearn.feature_selection.SelectFpr.
A total of 9 code examples of SelectFpr.fit are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
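Before the examples, here is a minimal, self-contained sketch of the typical SelectFpr.fit workflow. The iris dataset and the alpha value are chosen purely for illustration; they are not taken from any of the examples below.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFpr, f_classif

X, y = load_iris(return_X_y=True)
# Keep the features whose ANOVA F-test p-value is below alpha (false positive rate control)
selector = SelectFpr(f_classif, alpha=0.05)
selector.fit(X, y)                  # fit() computes per-feature scores_ and pvalues_
X_reduced = selector.transform(X)   # keep only the selected columns
print(selector.get_support())       # boolean mask of the selected features
print(X.shape, "->", X_reduced.shape)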
Example 1: evaluate_model
# Required import: from sklearn.feature_selection import SelectFpr [as alias]
# Or alternatively: from sklearn.feature_selection.SelectFpr import fit [as alias]
def evaluate_model(classifier, data_records, class_labels, labels):
    attribute_values = []
    accuracy_values = []
    # Score the attributes using the ANOVA F-test and a false positive rate threshold
    clf = SelectFpr(f_classif, alpha=0.9)
    clf.fit(data_records, class_labels)
    print(clf.scores_)
    print('\n')
    ranked_attr_indices = [0] * len(clf.scores_)
    for i, x in enumerate(sorted(range(len(clf.scores_)), key=lambda y: clf.scores_[y])):
        ranked_attr_indices[x] = i
    # Perform a 5-fold cross validation against a varying number of attributes.
    # The attributes are chosen on the basis of their scores.
    for idx in range(2, len(ranked_attr_indices)):
        filtered_records = data_records[:, ranked_attr_indices[:idx]]
        for idx2 in ranked_attr_indices[:idx]:
            print(labels[idx2])
        validation_score = cross_validation.cross_val_score(classifier, filtered_records, class_labels, cv=5)
        accuracy = max(validation_score) * 100
        attribute_values.append(idx)
        accuracy_values.append(accuracy)
        print('Cross validation score - ' + str(idx) + ' attributes :' + str(validation_score) + '\n')
    return (attribute_values, accuracy_values)
Example 2: test_boundary_case_ch2
# Required import: from sklearn.feature_selection import SelectFpr [as alias]
# Or alternatively: from sklearn.feature_selection.SelectFpr import fit [as alias]
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
Example 3: test_select_fpr_classif
# Required import: from sklearn.feature_selection import SelectFpr [as alias]
# Or alternatively: from sklearn.feature_selection.SelectFpr import fit [as alias]
def test_select_fpr_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fpr heuristic
    """
    X, Y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    univariate_filter = SelectFpr(f_classif, alpha=0.0001)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="fpr", param=0.0001).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example 4: test_select_heuristics_regression
# Required import: from sklearn.feature_selection import SelectFpr [as alias]
# Or alternatively: from sklearn.feature_selection.SelectFpr import fit [as alias]
def test_select_heuristics_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fpr, fdr or fwe heuristics
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0, noise=10)
    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    for mode in ["fdr", "fpr", "fwe"]:
        X_r2 = GenericUnivariateSelect(f_regression, mode=mode, param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        assert_array_equal(support[:5], np.ones((5,), dtype=np.bool))
        assert_less(np.sum(support[5:] == 1), 3)
Example 5: test_select_fpr_regression
# Required import: from sklearn.feature_selection import SelectFpr [as alias]
# Or alternatively: from sklearn.feature_selection.SelectFpr import fit [as alias]
def test_select_fpr_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the fpr heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0)
    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode="fpr", param=0.01).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert (support[:5] == 1).all()
    assert np.sum(support[5:] == 1) < 3
Example 6: train_decisiontree_FPR
# Required import: from sklearn.feature_selection import SelectFpr [as alias]
# Or alternatively: from sklearn.feature_selection.SelectFpr import fit [as alias]
def train_decisiontree_FPR(configurationname, train_data, score_function, undersam=False, oversam=False, export=False):
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectFpr(score_function)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    fitted_ids = [i for i in result.get_support(indices=True)]

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)
    if export:
        export_graphviz(dtc, out_file=DATAP + "/temp/trees/sltree_" + configurationname + ".dot", filled=True)
        transform(fitted_ids)
    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))
    return selector, dtc
Example 7: SelectFpr
# Required import: from sklearn.feature_selection import SelectFpr [as alias]
# Or alternatively: from sklearn.feature_selection.SelectFpr import fit [as alias]
################################################################################
pl.figure(1)
pl.clf()
x_indices = np.arange(x.shape[-1])

################################################################################
# Univariate feature selection
from sklearn.feature_selection import SelectFpr, f_classif

# As a scoring function, we use an F-test for classification.
# SelectFpr keeps the features whose p-value is below alpha,
# i.e. it controls the false positive rate of the selection.
selector = SelectFpr(f_classif, alpha=0.1)
selector.fit(x, y)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
pl.bar(x_indices - .45, scores, width=.3,
       label=r'Univariate score ($-Log(p_{value})$)',
       color='g')

################################################################################
# Compare to the weights of an SVM
clf = svm.SVC(kernel='linear')
clf.fit(x, y)
svm_weights = (clf.coef_ ** 2).sum(axis=0)
svm_weights /= svm_weights.max()
pl.bar(x_indices - .15, svm_weights, width=.3, label='SVM weight',
       color='r')
Example 8: SelectFpr
# Required import: from sklearn.feature_selection import SelectFpr [as alias]
# Or alternatively: from sklearn.feature_selection.SelectFpr import fit [as alias]
data2 = pdc.objFeatures[tr2_mask][:, featureIds]
data = np.vstack([data1, data2])
labels1 = np.zeros((data1.shape[0],))
labels2 = np.ones((data2.shape[0],))
labels = np.hstack([labels1, labels2])

X1 = data1[:1000]
X2 = data2[-1000:]
X = np.vstack([X1, X2])
Y1 = labels1[:X1.shape[0]]
Y2 = labels2[:X2.shape[0]]
Y = np.hstack([Y1, Y2])

from sklearn.feature_selection import SelectFpr, f_classif

selector = SelectFpr(f_classif, alpha=0.1)
selector.fit(X, Y)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()

from sklearn import svm

# Compare to the weights of an SVM
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)
print('SVM error:', clf.score(data, labels))
pred = clf.predict(data)
match = np.sum(pred == labels)
print(match, labels.shape[0])
print(match / float(labels.shape[0]))
svm_weights = (clf.coef_ ** 2).sum(axis=0)
svm_weights /= svm_weights.max()
Example 9: VarianceThreshold
# Required import: from sklearn.feature_selection import SelectFpr [as alias]
# Or alternatively: from sklearn.feature_selection.SelectFpr import fit [as alias]
# import data of all Count and Position features, training and test sets altogether
dfCountfeatures = pd.read_csv('data/CountingAndPositionFeatures_TrainAndTestData.csv')
dfTrainRaw = pd.read_csv('data/train.csv')

# get only the training data
TrainQueryIDs = dfTrainRaw["id"]
relevance = dfTrainRaw["relevance"]
dfCountfeatures_TrainSet = dfCountfeatures[dfCountfeatures["id"].isin(TrainQueryIDs)]

# select those features which have non-zero variance
selector = VarianceThreshold()
selector.fit_transform(dfCountfeatures_TrainSet).shape  # only one feature with zero variance - shape (74067L, 262L)

# select features based on p-values from a univariate regression against the target (relevance)
selector2 = SelectFpr(f_regression, alpha=0.01)
selector2.fit(dfCountfeatures_TrainSet.drop("id", axis=1), relevance)
selector2.get_support(indices=True).size  # 226 features out of 262 kept with p-value <= 1%

# get the titles of the features which were selected
selectedCountfeatures = dfCountfeatures.columns[selector2.get_support(indices=True)]

# check correlation amongst features
corrReduced = dfCountfeatures_TrainSet[selectedCountfeatures].corr()
corrReduced.iloc[:, :] = np.tril(corrReduced.values, k=-1)
corrReduced = corrReduced.stack()

# get pairs of features which are highly correlated
corrReduced[corrReduced.abs() > 0.8].size  # 578 pairs correlated more than 80% out of 25,425
len(set(corrReduced[corrReduced.abs() > 0.8].index.labels[0]))  # 172 features to be removed due to high correlation with other features

# get the feature titles which will be used for training the model, after removing highly correlated features
indices = set(corrReduced[corrReduced.abs() > 0.8].index.labels[0])
selectedCountfeatures2 = [i for j, i in enumerate(selectedCountfeatures.tolist()) if j not in indices]
selectedCountfeatures2.append("id")