

Python feature_selection.VarianceThreshold Class Code Examples

This article collects typical usage examples of the Python class sklearn.feature_selection.VarianceThreshold. If you are unsure what the VarianceThreshold class does or how to use it, the curated class code examples below should help.


The following 15 code examples of the VarianceThreshold class are shown below, sorted by popularity.
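
Before the project examples, here is a minimal, self-contained sketch of the basic fit/transform pattern (toy data for illustration only; not drawn from any of the projects below):

import numpy as np
from sklearn.feature_selection import VarianceThreshold

X = np.array([[0, 2, 0, 3],
              [0, 1, 4, 3],
              [0, 1, 1, 3]])

selector = VarianceThreshold()             # default threshold=0.0 drops constant features
X_reduced = selector.fit_transform(X)      # columns 0 and 3 are constant and get removed
print(X_reduced.shape)                     # (3, 2)
print(selector.get_support(indices=True))  # [1 2] -- indices of the kept columns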

Example 1: feature_select

def feature_select(word, instance_dic, feature_dic, thre_hold=0.01, num_feature=100):
    instances_list = instance_dic[word]
    feature_words = feature_dic[word]
    feature_xs = []
    labels = []

    for instance in instances_list:
        label = ' '.join(instance.senseid)
        feature_x_dic = feature_vector(instance, feature_words)
        # build one feature vector per instance; the loop variable must not
        # shadow the `word` parameter, which is still needed below
        feature_vals = []
        for feature_word in feature_words:
            feature_vals.append(feature_x_dic[feature_word])
        feature_xs.append(feature_vals)
        labels.append(label)

    # 1st round of feature selection: remove low-variance features
    sel_lowvr = VarianceThreshold(threshold=thre_hold)
    sel_lowvr.fit(feature_xs)
    lowvr_index = sel_lowvr.get_support(indices=True).tolist()
    feature_xs_selected = sel_lowvr.transform(feature_xs).tolist()

    # 2nd round of feature selection using sklearn's SelectKBest()
    if num_feature < len(feature_xs_selected[0]):
        sel_chi2 = SelectKBest(chi2, k=num_feature).fit(feature_xs_selected, labels)
        chi2_index = sel_chi2.get_support(indices=True).tolist()
        return lowvr_index, chi2_index
    else:
        print(str(word) + ": chi2 selection not executed due to low # of features")
        return lowvr_index, [i for i in range(len(lowvr_index))]
Author: skywalker-lili, Project: Cornell-Course-projects, Source: nlp_p2_final.py

Example 2: vectorize_EX

    def vectorize_EX(self, columns, variance_thresh=0, train_only=False):

        print('Start vectorizing')
        start_time = time.time()
        hasher = CountVectorizer(binary=True, tokenizer=LemmaTokenizer(), stop_words='english')

        train_dtm = hasher.fit_transform(
            self.ga_bm_train[columns].apply(lambda x: ','.join(x), axis=1))
        print(hasher.get_feature_names())
        print('dtm train shape: ', train_dtm.shape)

        selector = VarianceThreshold(variance_thresh)
        train_dtm = selector.fit_transform(train_dtm)
        print('dtm train shape after variance thresh: ', train_dtm.shape)

        if not train_only:
            test_dtm = hasher.transform(
                self.ga_bm_test[columns].apply(lambda x: ','.join(x), axis=1))

            print('dtm test shape: ', test_dtm.shape)
            test_dtm = selector.transform(test_dtm)
            print('dtm test shape after variance thresh: ', test_dtm.shape)

        print("Time: ", round(((time.time() - start_time)/60), 2))
        print('Complete vectorizing')
        if train_only:
            return train_dtm
        else:
            return (train_dtm, test_dtm)
Author: canzheng, Project: kaggle-talkingdata, Source: data_load.py
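
Worth noting in Example 2: VarianceThreshold accepts scipy sparse matrices, so it can be applied directly to a CountVectorizer document-term matrix. A minimal sketch with a toy corpus (illustrative only, not from the project above):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import VarianceThreshold

docs = ["the cat sat", "the dog sat", "the cat ran"]
dtm = CountVectorizer(binary=True).fit_transform(docs)   # sparse binary matrix
dtm_reduced = VarianceThreshold(0.1).fit_transform(dtm)  # drops 'the' (zero variance)
print(dtm.shape, '->', dtm_reduced.shape)                # (3, 5) -> (3, 4)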

Example 3: variance_cutoff

from sklearn.feature_selection import VarianceThreshold

def variance_cutoff(X, cutoff=0.8):
    """
    Remove features whose variance is below cutoff * (1 - cutoff) --
    for Boolean features, those where one value occurs in more than
    `cutoff` of the samples.
    """
    sel = VarianceThreshold(threshold=(cutoff * (1 - cutoff)))
    X = sel.fit_transform(X)
    return X
Author: kmravikumar, Project: spring-leaf, Source: spring_leaf.py
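
A note on the threshold used in Example 3: for Boolean (Bernoulli) features the variance is p(1 - p), so threshold=cutoff * (1 - cutoff) removes features in which one value occurs in more than `cutoff` of the samples. A minimal sketch (toy data, illustrative only):

import numpy as np
from sklearn.feature_selection import VarianceThreshold

# Column 0 is 1 in 5 of 6 samples (p ~ 0.83), so its variance p*(1-p) ~ 0.14
# falls below 0.8 * (1 - 0.8) = 0.16 and the column is removed;
# column 1 (p = 0.5, variance 0.25) survives.
X = np.array([[0, 0], [1, 1], [1, 0], [1, 1], [1, 0], [1, 1]])
print(VarianceThreshold(threshold=0.8 * (1 - 0.8)).fit_transform(X))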

Example 4: main

def main():
    args = getOptions()
    print(args)

    print("train file read")
    train_x, train_y = readfile_noid(args.train, 'train', 4)
    train_x_new, id = extractID(train_x)
    del train_x
    train_x_clean, contentdict = cityclean(train_x_new)
    del id, train_x_new

    # remove features with no distinction (zero variance)
    print("remove feature with no distinction and less important")
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_clean)
    del train_x_clean

    # normalization
    print("normalization")
    train_x_nor, mean, std = normalize(train_x_uniq)
    del train_x_uniq

    # feature selection and modeling
    print("feature selection and modeling")
    exclusivefs(train_x_nor, train_y)
Author: kaiwang0112006, Project: mykaggle_springleaf, Source: exclusivefsv2.py

Example 5: doFeatureSelection

    def doFeatureSelection(self, features, target, k):
        features_int = np.array(features, dtype=float)
        target_int = np.array(target, dtype=float)
        sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
        features_new = sel.fit_transform(features_int)
        #features_new = SelectKBest(chi2, k=10).fit_transform(features_int, target_int)
        return features_new
Author: satheeshravir, Project: mlproject, Source: CartDecisionTreeAlgorithm.py

Example 6: _variance_threshold

    def _variance_threshold(self, input_df, threshold):
        """Uses Scikit-learn's VarianceThreshold feature selection to learn the subset of features that pass the threshold

        Parameters
        ----------
        input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
            Input DataFrame to perform feature selection on
        threshold: float
            The variance threshold that removes features that fall under the threshold

        Returns
        -------
        subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'group', 'class']}
            Returns a DataFrame containing the features that are above the variance threshold

        """

        training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)

        selector = VarianceThreshold(threshold=threshold)
        try:
            selector.fit(training_features)
        except ValueError:
            # No features are above the variance threshold
            return input_df[['guess', 'class', 'group']].copy()

        mask = selector.get_support(True)
        mask_cols = list(training_features.iloc[:, mask].columns) + ['guess', 'class', 'group']
        return input_df[mask_cols].copy()
Author: vsolano, Project: tpot, Source: tpot.py

Example 7: main

def main():
    parser = argparse.ArgumentParser(description='Normalize the feature values')
    required = parser.add_argument_group('required options')

    required.add_argument('-x', '--outlist', required=True, help='File containing feature values')
    required.add_argument('-y', '--execlist', required=True, help='File containing exec list')
    
    args = parser.parse_args()

    np.set_printoptions(precision=2)
    X = np.genfromtxt(args.outlist, skip_header=1)
    X = np.nan_to_num(X)
    Y = np.loadtxt(args.execlist, ndmin=2)

    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    result1 = sel.fit_transform(X)
    newResult = result1 / Y

    #feature collection for test programs
    if os.path.isfile('eventlist'):
        features = np.genfromtxt('eventlist', dtype='str')
        featureFromVariance = sel.get_support(indices=True)
        text_file = open("variancefeatures.txt", "w")
        for i in featureFromVariance:
            text_file.write(features[i])
            text_file.write("\n")
        text_file.close()

    np.savetxt('normfeaturelist', newResult, fmt='%.2f', delimiter='\t')
Author: biplabks, Project: MLTUNE, Source: normalize.py

Example 8: test_same_variances

    def test_same_variances(self):
        local = VarianceThreshold()
        dist = SparkVarianceThreshold()

        shapes = [((10, 5), None),
                  ((1e3, 20), None),
                  ((1e3, 20), 100),
                  ((1e4, 100), None),
                  ((1e4, 100), 600)]

        for shape, block_size in shapes:
            X_dense, X_dense_rdd = self.make_dense_rdd()
            X_sparse, X_sparse_rdd = self.make_sparse_rdd()
            Z = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

            local.fit(X_dense)
            dist.fit(X_dense_rdd)
            assert_array_almost_equal(local.variances_, dist.variances_)

            local.fit(X_sparse)
            dist.fit(X_sparse_rdd)
            assert_array_almost_equal(local.variances_, dist.variances_)

            dist.fit(Z)
            assert_array_almost_equal(local.variances_, dist.variances_)
Author: KartikPadmanabhan, Project: sparkit-learn, Source: test_variance_threshold.py

Example 9: feature_selection

def feature_selection(features, ideal_num=None):
    from sklearn.feature_selection import VarianceThreshold
    # collect each block's transformed output; fit_transform returns the
    # reduced array rather than modifying its argument in place
    selected = []
    for i in range(8):
        sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
        selected.append(sel.fit_transform(features[i]))
    return selected
Author: sssruhan1, Project: spine, Source: asm_util.py

Example 10: main

def main():
    args = getOptions()
    print(args)

    print("train file read")
    train_x, train_y = readfile_noid(args.train, 'train')
    train_x_new, id = extractID(train_x)
    del id
    print("test file read")
    test_x, test_y = readfile_noid(args.test, 'test')
    test_x_new, id = extractID(test_x)

    # remove features with no distinction (zero variance)
    print("remove feature with no distinction and less important")
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)

    # normalization
    print("normalization")
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    # feature selection
    print("feature selection")
    # Create the RFE object and compute a cross-validated score.
    svc = SVC(kernel="linear")
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(train_y, 10),
                  scoring='accuracy')
    rfecv.fit(train_x_nor, train_y)

    print("Optimal number of features : %d" % rfecv.n_features_)
Author: kaiwang0112006, Project: mykaggle_springleaf, Source: ft.py

Example 11: remove_feat_constants

def remove_feat_constants(data_frame):
    # Remove feature vectors containing one unique value,
    # because such features do not have predictive value.
    print("")
    print("Deleting zero variance features...")
    # Let's get the zero variance features by fitting VarianceThreshold
    # selector to the data, but let's not transform the data with
    # the selector because it will also transform our Pandas data frame into
    # NumPy array and we would like to keep the Pandas data frame. Therefore,
    # let's delete the zero variance features manually.
    n_features_originally = data_frame.shape[1]
    selector = VarianceThreshold()
    selector.fit(data_frame)
    # Get the indices of zero variance feats
    feat_ix_keep = selector.get_support(indices=True)
    orig_feat_ix = np.arange(data_frame.columns.size)
    feat_ix_delete = np.delete(orig_feat_ix, feat_ix_keep)
    # Delete zero variance feats from the original pandas data frame
    data_frame = data_frame.drop(labels=data_frame.columns[feat_ix_delete],
                                 axis=1)
    # Print info
    n_features_deleted = feat_ix_delete.size
    print("  - Deleted %s / %s features (~= %.1f %%)" % (
        n_features_deleted, n_features_originally,
        100.0 * (np.float(n_features_deleted) / n_features_originally)))
    return data_frame
Author: canivel, Project: Kaggle-Santander, Source: xgb_features.py
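
A quick usage sketch for Example 11 (hypothetical toy frame, assuming remove_feat_constants is in scope):

import pandas as pd

df = pd.DataFrame({'constant': [7, 7, 7], 'signal': [0.1, 0.9, 0.5]})
df = remove_feat_constants(df)
print(list(df.columns))  # ['signal'] -- the zero-variance column was deleted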

Example 12: interactive_pipeline

def interactive_pipeline(X, Y, pca_n_components, random_forest_n):

    # remove columns with missing values
    X.dropna(axis=1, inplace=True)

    # standardize X
    X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X))

    # cutoff by variance; keep the reduced feature matrix
    variance_threshold = 0.03
    variance_cutoff = VarianceThreshold(threshold=variance_threshold)
    X = pd.DataFrame(variance_cutoff.fit_transform(X))

    # cutoff high correlation
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]
    X.drop(columns=to_drop, inplace=True)

    # random forest
    k_best_features = random_forest_n
    feature_importance = random_forest_selection.get_feature_importance(X, Y)
    processed_dataframe, X = random_forest_selection.get_k_most_important_features(X, Y, k_best_features, feature_importance)

    # PCA
    pca = PCA_Obj(X)
    X = pca.create_pca(pca_n_components)
    print("X.shape", X.shape)
    return X, Y

#feature_selection_pipeline_from_file()
Author: nogur9, Project: PTSD, Source: feature_selection_pipeline.py

Example 13: variance_threshold

def variance_threshold(features_train, features_valid):
    """Return the initial dataframes after dropping some features according to variance threshold

    Parameters:
    ----------
    features_train: pd.DataFrame
        features of training set

    features_valid: pd.DataFrame
        features of validation set

    Output:
    ------
    features_train: pd.DataFrame

    features_valid: pd.DataFrame
    """
    from sklearn.feature_selection import VarianceThreshold    

    threshold = 0.01
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(features_train)

    ## Instead of using the transform() method, we look at which columns have been dropped, to be able to drop in both training and validation set the same features. This way, we keep the column names to make interpretation easier
    variances = selector.variances_
    dropped_features = features_train.columns.values[variances < threshold] #name of features to drop
    features_train.drop(dropped_features, axis=1, inplace=True)
    features_valid.drop(dropped_features, axis=1, inplace=True)

    return features_train, features_valid
Author: ldocao, Project: dataiku_census, Source: selector.py
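
A small usage sketch of the train/validation pattern in Example 13 (hypothetical toy frames, assuming the variance_threshold function above is in scope):

import pandas as pd

train = pd.DataFrame({'constant': [1.0, 1.0, 1.0], 'varies': [1.0, 2.0, 3.0]})
valid = pd.DataFrame({'constant': [1.0, 1.0, 1.0], 'varies': [4.0, 5.0, 6.0]})

train, valid = variance_threshold(train, valid)
print(list(train.columns), list(valid.columns))  # both keep only 'varies'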

Example 14: feature_selection_pipeline_from_file

def feature_selection_pipeline_from_file():
    #get data
    dataset = refactor_labels(get_data(path, 'Sheet1'), group_column)

    # all the visualizations
    auto_visualize_features(dataset.drop(subject_number_column, axis = 1))

    # remove columns with too many missing values
    non_missing_values_threshold = len(dataset.index) * 0.99
    dataset.dropna(axis=1, thresh=non_missing_values_threshold, inplace=True)

    #impute missing values
    dataset.fillna(dataset.mean(), inplace=True)

    #set X
    X = dataset.drop([group_column, subject_number_column], 1)
    sbj = dataset[subject_number_column]
    Y = dataset[group_column]
    names = list(X)

    # standartize X
    X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X))
    X.columns = names
    print("p0", X.shape)

    # cutoff by variance; keep only the columns that pass the threshold
    variance_threshold = 0.05
    variance_cutoff = VarianceThreshold(threshold=variance_threshold)
    variance_cutoff.fit(X)
    X = X.loc[:, variance_cutoff.get_support()]

    print("p1", X.shape)

    #cutoff high correlation
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]

    X.drop(to_drop, axis = 1, inplace=True)
    print("p2",X.shape)


    #random forest
    k_best_features = 42
    feature_importance = random_forest_selection.get_feature_importance(X,Y)
    random_forest_selection.save_feature_importance(feature_importance_txt_path, feature_importance, list(X))
    processed_dataframe, X = random_forest_selection.get_k_most_important_features(X,Y,k_best_features,feature_importance)
    print("p3", processed_dataframe.shape)
    processed_dataframe.to_csv(processed_dataframe_path)


    #PCA
    pca = PCA_Obj(X)
    pca.explained_variance_graph(pca_explained_variance_graph_path)
    pca.print_components()
    n_components = 12
    X = pca.create_pca(n_components)
    pca.save_pca_data(features_after_pca, Y=Y)
    print("p4", X.shape)
Author: nogur9, Project: PTSD, Source: feature_selection_pipeline.py

Example 15: varianceSelection

    def varianceSelection(self, df, threshold=.8):
        if not isinstance(df, pandas.core.frame.DataFrame):
            logger.error('[%s] : [ERROR] Variance selection only possible on Dataframe not %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(df))
            sys.exit(1)
        sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
        sel.fit(df)
        return df[[c for (s, c) in zip(sel.get_support(), df.columns.values) if s]]
Author: igabriel85, Project: dmon-adp, Source: adpfeature.py


Note: The sklearn.feature_selection.VarianceThreshold class examples above were compiled by 纯净天空 from open-source code hosted on GitHub, MSDocs, and similar platforms. The snippets come from open-source projects contributed by their respective authors, who retain the copyright; for redistribution and use, please refer to each project's license. Do not reproduce without permission.