This article collects typical usage examples of the Python class sklearn.feature_selection.VarianceThreshold. If you have been wondering what exactly VarianceThreshold does, how to use it, and what real-world code built on it looks like, the curated class examples below should help.
The 15 VarianceThreshold code examples shown below are ordered by popularity.
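Before diving in, a minimal stand-alone sketch may help frame the examples: VarianceThreshold is an unsupervised selector that drops every feature whose training-set variance does not exceed a given threshold (0.0 by default, which removes only constant features). The data below is made up for illustration.

import numpy as np
from sklearn.feature_selection import VarianceThreshold

X = np.array([[0.0, 2.0, 0.5],
              [0.0, 1.0, 0.5],
              [0.0, 3.0, 0.5]])

selector = VarianceThreshold()        # threshold=0.0: drop constant features
X_reduced = selector.fit_transform(X)
print(selector.variances_)            # roughly [0. 0.6667 0.] -- per-feature variances
print(X_reduced.shape)                # (3, 1) -- only the middle column survives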
Example 1: feature_select
def feature_select(word, instance_dic, feature_dic, thre_hold=0.01, num_feature=100):
    instances_list = instance_dic[word]
    feature_words = feature_dic[word]
    feature_xs = []
    labels = []

    for instance in instances_list:
        label = ' '.join(instance.senseid)
        feature_x_dic = feature_vector(instance, feature_words)
        # loop variable renamed from `word` to avoid shadowing the parameter
        feature_vals = [feature_x_dic[feature_word] for feature_word in feature_words]
        feature_xs.append(feature_vals)
        labels.append(label)

    # 1st round of feature selection: remove low-variance features
    sel_lowvr = VarianceThreshold(threshold=thre_hold)
    sel_lowvr.fit(feature_xs)
    lowvr_index = sel_lowvr.get_support(indices=True).tolist()
    feature_xs_selected = sel_lowvr.transform(feature_xs).tolist()

    # 2nd round of feature selection using sklearn's SelectKBest()
    if num_feature < len(feature_xs_selected[0]):
        sel_chi2 = SelectKBest(chi2, k=num_feature).fit(feature_xs_selected, labels)
        chi2_index = sel_chi2.get_support(indices=True).tolist()
        return lowvr_index, chi2_index
    else:
        print(str(word) + ": chi2 selection not executed due to low # of features")
        return lowvr_index, [i for i in range(len(lowvr_index))]
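The two rounds in Example 1 can also be chained with scikit-learn's Pipeline so both selectors are fitted in one call. Below is a minimal sketch; `feature_xs` and `labels` are hypothetical stand-ins for the structures built above:

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2

# Hypothetical stand-ins for the feature matrix and labels built above
feature_xs = [[1, 0, 1, 1], [0, 0, 1, 0], [1, 0, 0, 1], [0, 0, 1, 1]]
labels = ['sense1', 'sense2', 'sense1', 'sense2']

two_round = Pipeline([
    ('low_variance', VarianceThreshold(threshold=0.01)),  # round 1
    ('k_best', SelectKBest(chi2, k=2)),                   # round 2
])
selected = two_round.fit_transform(feature_xs, labels)
print(selected.shape)  # (4, 2): the all-zero column is dropped, then chi2 keeps the best 2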
Example 2: vectorize_EX
def vectorize_EX(self, columns, variance_thresh=0, train_only=False):
    print('Start vectorizing')
    start_time = time.time()

    hasher = CountVectorizer(binary=True, tokenizer=LemmaTokenizer(), stop_words='english')
    train_dtm = hasher.fit_transform(
        self.ga_bm_train[columns].apply(lambda x: ','.join(x), axis=1))
    print(hasher.get_feature_names_out())  # get_feature_names() in scikit-learn < 1.0
    print('dtm train shape: ', train_dtm.shape)

    selector = VarianceThreshold(variance_thresh)
    train_dtm = selector.fit_transform(train_dtm)
    print('dtm train shape after variance thresh: ', train_dtm.shape)

    if not train_only:
        test_dtm = hasher.transform(
            self.ga_bm_test[columns].apply(lambda x: ','.join(x), axis=1))
        print('dtm test shape: ', test_dtm.shape)
        test_dtm = selector.transform(test_dtm)
        print('dtm test shape after variance thresh: ', test_dtm.shape)

    print("Time: ", round(((time.time() - start_time) / 60), 2))
    print('Complete vectorizing')

    if train_only:
        return train_dtm
    else:
        return train_dtm, test_dtm
Example 3: variance_cutoff
def variance_cutoff(X, cutoff=0.8):
    """
    Remove features whose variance falls below the Bernoulli-style cutoff.
    """
    sel = VarianceThreshold(threshold=(cutoff * (1 - cutoff)))
    X = sel.fit_transform(X)
    return X
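The cutoff * (1 - cutoff) expression comes from the variance of a Bernoulli variable: a Boolean feature that equals 1 with probability p has variance p(1 - p), so cutoff=0.8 yields a threshold of 0.8 × 0.2 = 0.16 and removes Boolean features that take the same value in more than 80% of the samples. A quick check with made-up data:

import numpy as np
from sklearn.feature_selection import VarianceThreshold

# First column is 1 in 5 of 6 samples: variance = (5/6)(1/6) ~= 0.139 < 0.16, so it is dropped.
# Second column is an even 50/50 split: variance = 0.25 > 0.16, so it is kept.
X = np.array([[1, 0], [1, 1], [1, 0], [1, 1], [1, 0], [0, 1]])
sel = VarianceThreshold(threshold=0.8 * (1 - 0.8))
print(sel.fit_transform(X).shape)  # (6, 1)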
Example 4: main
def main():
    args = getOptions()
    print(args)

    print("train file read")
    train_x, train_y = readfile_noid(args.train, 'train', 4)
    train_x_new, id = extractID(train_x)
    del train_x
    train_x_clean, contentdict = cityclean(train_x_new)
    del id, train_x_new

    # remove features with no distinction (zero variance)
    print("remove feature with no distinction and less important")
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_clean)
    del train_x_clean

    # normalization
    print("normalization")
    train_x_nor, mean, std = normalize(train_x_uniq)
    del train_x_uniq

    # feature selection and modeling
    print("feature selection and modeling")
    exclusivefs(train_x_nor, train_y)
Example 5: doFeatureSelection
def doFeatureSelection(self, features, target, k):
    features_int = np.array(features, dtype=float)
    target_int = np.array(target, dtype=float)
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    features_new = sel.fit_transform(features_int)
    #features_new = SelectKBest(chi2, k=10).fit_transform(features_int, target_int)
    return features_new
Example 6: _variance_threshold
def _variance_threshold(self, input_df, threshold):
    """Use scikit-learn's VarianceThreshold feature selection to learn the subset of features that pass the threshold

    Parameters
    ----------
    input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
        Input DataFrame to perform feature selection on
    threshold: float
        The variance threshold that removes features that fall under the threshold

    Returns
    -------
    subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'group', 'class']}
        Returns a DataFrame containing the features that are above the variance threshold
    """
    training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)

    selector = VarianceThreshold(threshold=threshold)
    try:
        selector.fit(training_features)
    except ValueError:
        # No features are above the variance threshold
        return input_df[['guess', 'class', 'group']].copy()

    mask = selector.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + ['guess', 'class', 'group']
    return input_df[mask_cols].copy()
Example 7: main
def main():
    parser = argparse.ArgumentParser(description='Normalize the feature values')
    required = parser.add_argument_group('required options')
    required.add_argument('-x', '--outlist', required=True, help='File containing feature values')
    required.add_argument('-y', '--execlist', required=True, help='File containing exec list')
    args = parser.parse_args()

    np.set_printoptions(precision=2)
    X = np.genfromtxt(args.outlist, skip_header=1)  # genfromtxt's `skiprows` was renamed to `skip_header`
    X = np.nan_to_num(X)
    Y = np.loadtxt(args.execlist, ndmin=2)

    # drop low-variance features, then normalize by the exec counts
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    result1 = sel.fit_transform(X)
    newResult = result1 / Y

    # feature collection for test programs
    if os.path.isfile('eventlist'):
        features = np.genfromtxt('eventlist', dtype='str')
        featureFromVariance = sel.get_support(indices=True)
        with open("variancefeatures.txt", "w") as text_file:
            for i in featureFromVariance:
                text_file.write(features[i])
                text_file.write("\n")

    np.savetxt('normfeaturelist', newResult, fmt='%.2f', delimiter='\t')
Example 8: test_same_variances
def test_same_variances(self):
    local = VarianceThreshold()
    dist = SparkVarianceThreshold()
    shapes = [((10, 5), None),
              ((1e3, 20), None),
              ((1e3, 20), 100),
              ((1e4, 100), None),
              ((1e4, 100), 600)]
    for shape, block_size in shapes:
        X_dense, X_dense_rdd = self.make_dense_rdd()
        X_sparse, X_sparse_rdd = self.make_sparse_rdd()
        Z = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

        local.fit(X_dense)
        dist.fit(X_dense_rdd)
        assert_array_almost_equal(local.variances_, dist.variances_)

        local.fit(X_sparse)
        dist.fit(X_sparse_rdd)
        assert_array_almost_equal(local.variances_, dist.variances_)

        dist.fit(Z)
        assert_array_almost_equal(local.variances_, dist.variances_)
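The assertions above compare the variances_ attribute, which both the local and the Spark selector populate during fit with the per-feature training variances. A small stand-alone illustration of what is being compared:

import numpy as np
from sklearn.feature_selection import VarianceThreshold

X = np.array([[0., 2.],
              [0., 4.],
              [0., 6.]])
vt = VarianceThreshold().fit(X)
print(vt.variances_)  # [0. 2.66666667] -- one variance per feature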
Example 9: feature_selection
def feature_selection(features, ideal_num=None):
    from sklearn.feature_selection import VarianceThreshold
    copy = np.copy(features)
    for i in range(8):
        sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
        # NOTE: the transformed result is discarded, so `copy` is returned unchanged
        sel.fit_transform(copy[i])
    return copy
Example 10: main
def main():
    args = getOptions()
    print(args)

    print("train file read")
    train_x, train_y = readfile_noid(args.train, 'train')
    train_x_new, id = extractID(train_x)
    del id

    print("test file read")
    test_x, test_y = readfile_noid(args.test, 'test')
    test_x_new, id = extractID(test_x)

    # remove features with no distinction (zero variance); note that the
    # selector is fitted on the training set only, then applied to the test set
    print("remove feature with no distinction and less important")
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)

    # normalization
    print("normalization")
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    # feature selection
    print("feature selection")
    # Create the RFE object and compute a cross-validated score.
    svc = SVC(kernel="linear")
    # The "accuracy" scoring is proportional to the number of correct classifications.
    # StratifiedKFold(train_y, 10) is the pre-0.18 scikit-learn API; newer versions
    # take StratifiedKFold(n_splits=10) and receive y at fit time.
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(train_y, 10),
                  scoring='accuracy')
    rfecv.fit(train_x_nor, train_y)
    print("Optimal number of features : %d" % rfecv.n_features_)
Example 11: remove_feat_constants
def remove_feat_constants(data_frame):
    # Remove feature vectors containing one unique value,
    # because such features do not have predictive value.
    print("")
    print("Deleting zero variance features...")
    # Get the zero-variance features by fitting a VarianceThreshold
    # selector to the data, but don't transform the data with the selector,
    # because that would also turn our pandas data frame into a NumPy array
    # and we would like to keep the data frame. Therefore, delete the
    # zero-variance features manually.
    n_features_originally = data_frame.shape[1]
    selector = VarianceThreshold()
    selector.fit(data_frame)
    # Get the indices of the features to keep (non-zero variance)
    feat_ix_keep = selector.get_support(indices=True)
    orig_feat_ix = np.arange(data_frame.columns.size)
    feat_ix_delete = np.delete(orig_feat_ix, feat_ix_keep)
    # Delete zero-variance features from the original pandas data frame
    data_frame = data_frame.drop(labels=data_frame.columns[feat_ix_delete],
                                 axis=1)
    # Print info
    n_features_deleted = feat_ix_delete.size
    print("  - Deleted %s / %s features (~= %.1f %%)" % (
        n_features_deleted, n_features_originally,
        100.0 * (float(n_features_deleted) / n_features_originally)))
    return data_frame
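The manual bookkeeping above predates a newer alternative: since scikit-learn 1.2, transformers expose set_output, which makes fit_transform return a pandas DataFrame with the surviving column names intact. A minimal sketch with made-up data:

import pandas as pd
from sklearn.feature_selection import VarianceThreshold

df = pd.DataFrame({'a': [1, 1, 1], 'b': [0.1, 0.5, 0.9]})

# scikit-learn >= 1.2: ask the selector to emit a DataFrame directly
selector = VarianceThreshold().set_output(transform='pandas')
print(selector.fit_transform(df))  # zero-variance column 'a' is gone, names kept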
Example 12: interactive_pipeline
def interactive_pipeline(X, Y, pca_n_components, random_forest_n):
    # remove columns with missing values
    X.dropna(axis=1, inplace=True)

    # standardize X
    X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X))

    # cut off by variance
    variance_threshold = 0.03
    variance_cutoff = VarianceThreshold(threshold=variance_threshold)
    # NOTE: the transformed result is discarded here, so the variance cut
    # has no effect on X; assign the result back to make it effective
    variance_cutoff.fit_transform(X)

    # cut off highly correlated features
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]
    X.drop(X.columns[to_drop], axis=1, inplace=True)

    # random forest
    k_best_features = random_forest_n
    feature_importance = random_forest_selection.get_feature_importance(X, Y)
    processed_dataframe, X = random_forest_selection.get_k_most_important_features(X, Y, k_best_features, feature_importance)

    # PCA
    pca = PCA_Obj(X)
    X = pca.create_pca(pca_n_components)
    print("X.shape", X.shape)
    return X, Y
Example 13: variance_threshold
def variance_threshold(features_train, features_valid):
    """Return the initial dataframes after dropping the features that fall below the variance threshold

    Parameters
    ----------
    features_train: pd.DataFrame
        features of the training set
    features_valid: pd.DataFrame
        features of the validation set

    Output
    ------
    features_train: pd.DataFrame
    features_valid: pd.DataFrame
    """
    from sklearn.feature_selection import VarianceThreshold

    threshold = 0.01
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(features_train)

    # Instead of using the transform() method, look at which columns were
    # dropped, so that the same features can be dropped from both the
    # training and the validation set. This keeps the column names, which
    # makes interpretation easier.
    variances = selector.variances_
    dropped_features = features_train.columns.values[variances < threshold]  # names of features to drop
    features_train.drop(dropped_features, axis=1, inplace=True)
    features_valid.drop(dropped_features, axis=1, inplace=True)
    return features_train, features_valid
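One subtlety if you replicate this with get_support(): scikit-learn's built-in mask keeps features whose variance is strictly greater than the threshold, whereas the comparison above drops only those strictly below it, so features sitting exactly at the threshold are treated differently. A sketch of the mask-based variant, reusing `selector`, `features_train`, and `features_valid` from the example:

# Mask-based variant: keeps columns with variance strictly above the
# threshold (features exactly at the threshold are dropped here, but
# kept by the `variances < threshold` comparison above)
kept_columns = features_train.columns[selector.get_support()]
features_train = features_train[kept_columns]
features_valid = features_valid[kept_columns]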
Example 14: feature_selection_pipeline_from_file
def feature_selection_pipeline_from_file():
    # get data
    dataset = refactor_labels(get_data(path, 'Sheet1'), group_column)

    # all the visualizations
    auto_visualize_features(dataset.drop(subject_number_column, axis=1))

    # remove columns with missing values
    non_missing_values_threshold = len(dataset.index) * 0.99
    dataset.dropna(axis=1, thresh=non_missing_values_threshold, inplace=True)

    # impute missing values
    dataset.fillna(dataset.mean(), inplace=True)

    # set X
    X = dataset.drop([group_column, subject_number_column], axis=1)
    sbj = dataset[subject_number_column]
    Y = dataset[group_column]
    names = list(X)

    # standardize X
    X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X))
    X.columns = names
    print("p0", X.shape)

    # cut off by variance
    variance_threshold = 0.05
    variance_cutoff = VarianceThreshold(threshold=variance_threshold)
    # NOTE: as in Example 12, the transformed result is discarded,
    # so this step does not actually reduce X
    variance_cutoff.fit_transform(X)
    print("p1", X.shape)

    # cut off highly correlated features
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]
    X.drop(to_drop, axis=1, inplace=True)
    print("p2", X.shape)

    # random forest
    k_best_features = 42
    feature_importance = random_forest_selection.get_feature_importance(X, Y)
    random_forest_selection.save_feature_importance(feature_importance_txt_path, feature_importance, list(X))
    processed_dataframe, X = random_forest_selection.get_k_most_important_features(X, Y, k_best_features, feature_importance)
    print("p3", processed_dataframe.shape)
    processed_dataframe.to_csv(processed_dataframe_path)

    # PCA
    pca = PCA_Obj(X)
    pca.explained_variance_graph(pca_explained_variance_graph_path)
    pca.print_components()
    n_components = 12
    X = pca.create_pca(n_components)
    pca.save_pca_data(features_after_pca, Y=Y)
    print("p4", X.shape)
Example 15: varianceSelection
def varianceSelection(self, df, threshold=.8):
    if not isinstance(df, pandas.core.frame.DataFrame):
        logger.error('[%s] : [ERROR] Variance selection is only possible on a DataFrame, not %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(df))
        sys.exit(1)
    sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
    sel.fit_transform(df)
    # keep only the columns whose variance passed the threshold
    return df[[c for (s, c) in zip(sel.get_support(), df.columns.values) if s]]