当前位置: 首页>>代码示例>>Python>>正文


Python Imputer.fit_transform方法代码示例

本文整理汇总了Python中sklearn.preprocessing.Imputer.fit_transform方法的典型用法代码示例。如果您正苦于以下问题:Python Imputer.fit_transform方法的具体用法?Python Imputer.fit_transform怎么用?Python Imputer.fit_transform使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在sklearn.preprocessing.Imputer的用法示例。


在下文中一共展示了Imputer.fit_transform方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: process

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def process(discrete, cont):
    """Impute, encode, and scale the discrete/continuous feature sets,
    then merge them into a single design matrix.

    discrete -- rows of discrete-valued features (may contain missing values)
    cont     -- rows of continuous-valued features (may contain missing values)
    Returns a dense 2-D array [one-hot discrete | standardized continuous].
    """
    # Convert the raw feature lists into numpy matrices.
    disc_mat = np.array(discrete)
    cont_mat = np.array(cont)

    # Missing discrete entries -> column mode; missing continuous -> column mean.
    disc_mat = Imputer(strategy='most_frequent').fit_transform(disc_mat)
    cont_mat = Imputer(strategy='mean').fit_transform(cont_mat)

    # One-hot basis representation for the discrete columns.
    encoder = OneHotEncoder()
    disc_mat = encoder.fit_transform(disc_mat).toarray()

    # Standardize continuous columns (zero mean, unit variance).
    cont_mat = StandardScaler().fit_transform(cont_mat)

    # Stack both representations side by side.
    return np.concatenate((disc_mat, cont_mat), axis=1)
开发者ID:apnorton,项目名称:ml-project,代码行数:28,代码来源:Preproc.py

示例2: preprocess

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def preprocess(data, feat_type):
    # replace missing value by most common if categorical and by mean if numerical
    try:
        if data.getformat()=='csr':
            return data
    except:
        print feat_type
        # separate numerical and categorical columns
        idx_num = [i for i in xrange(len(feat_type)) if feat_type[i] == 'Numerical']
        data_num = data[:,idx_num]
        idx_cat = [i for i in xrange(len(feat_type)) if feat_type[i] == 'Categorical']
        data_cat = data[:,idx_cat]
        # fill missing values
        imp_num = Imputer(axis = 0)
        data_num = imp_num.fit_transform(data_num)
        imp_cat = Imputer(axis = 0, strategy='most_frequent')
        data_cat = imp_cat.fit_transform(data_cat)
        # retrieve mean and divide by standard deviation
        data_num = scale(data_num)
        # one-hot encode using pandas
        # have to do it column by column because of pandas
        data_cat_pd = pd.DataFrame(data_cat)
        for i in xrange(data_cat.shape[1]):
            data_cat_pd = pd.concat((data_cat_pd, pd.get_dummies(data_cat[:,i])),join = 'outer', axis = 1)
        # delete the columns that have been one hot encoded; need to rename first,
        # otherwise some columns may be suppressed unwillingly
        data_cat_pd.columns = [i for i in xrange(data_cat_pd.shape[1])]
        data_cat_pd = data_cat_pd.drop(data_cat_pd.iloc[:,[i for i in xrange(data_cat.shape[1])]],axis =1)
        data_cat = np.asarray(data_cat_pd)

        # regroup categorical and numerical variables
        return np.hstack((data_num,data_cat))
开发者ID:ludovicth,项目名称:chalearn,代码行数:34,代码来源:data_converter.py

示例3: predict

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
    def predict(self, raw_array, results, aux_data_a_d=None, diff=False,
                holdout_col=0, lag=1, positive_control=False, **kwargs):
        """Predict the year of data immediately succeeding the last year of
        the input array, using the fitted ``results`` model.

        Axis 0 indexes observations (schools) and axis 1 indexes years.

        raw_array: 2-D array of yearly values per school.
        results: fitted model exposing ``predict(X)``.
        aux_data_a_d: optional dict of auxiliary arrays shaped like raw_array;
            their trailing lags are appended as extra feature columns.
        diff: if True, model year-over-year changes instead of raw levels.
        holdout_col: withhold the last ``holdout_col`` years from the
            prediction -- ideal for finding the error of the algorithm.
        lag: number of trailing years (or year-diffs) used as features.
        positive_control: append a sanity-check feature column containing the
            true next value (or random noise when nothing is held out).

        Returns a column vector of shape (n_observations, 1).
        """

        # Build the optional control column before raw_array is truncated.
        if positive_control:
            if holdout_col > 0:
                if diff:
                    # Control is the true year-over-year change for the first
                    # held-out year.
                    if holdout_col == 1:
                        control_array = np.diff(raw_array[:, -2:],
                                    1, axis=1)
                    else:
                        control_array = \
                            np.diff(raw_array[:, -holdout_col-1:-holdout_col+1],
                                    1, axis=1)
                else:
                    # Control is the raw level of the first held-out year.
                    control_array = raw_array[:, -holdout_col]
            else:
                # Nothing held out: noise column as a negative baseline.
                control_array = np.random.randn(raw_array.shape[0], 1)

        # Withhold the last holdout_col years from the inputs.
        if holdout_col > 0:
            raw_array = raw_array[:, :-holdout_col]
        prediction_raw_array = raw_array
        if diff:
            # Features: the last `lag` year-over-year changes.
            array = np.diff(raw_array, 1, axis=1)
            X = array[:, -lag:]
            if positive_control:
                X = np.concatenate((X, control_array.reshape(-1, 1)), axis=1)
            if aux_data_a_d:
                # Append the same lagged-diff features from each auxiliary
                # data set (keys are feature names).
                for feature_s in aux_data_a_d.iterkeys():
                    if holdout_col > 0:
                        raw_array = aux_data_a_d[feature_s][:, :-holdout_col]
                    else:
                        raw_array = aux_data_a_d[feature_s]
                    array = np.diff(raw_array, 1, axis=1)
                    X = np.concatenate((X, array[:, -lag:]), axis=1)
            # Mean-impute remaining missing feature values column-wise.
            estimatorX = Imputer(axis=0)
            X = estimatorX.fit_transform(X)
            predicted_change_a = results.predict(X)
            # Final prediction = last observed level + predicted change.
            estimator_orig = Imputer(axis=0)
            orig_a = estimator_orig.fit_transform(prediction_raw_array[:, -1].reshape(-1,1))
            prediction_a = orig_a + predicted_change_a.reshape(-1, 1)
        else:
            # Features: the last `lag` raw levels.
            array = raw_array
            X = array[:, -lag:]
            if positive_control:
                X = np.concatenate((X, control_array.reshape(-1, 1)), axis=1)
            if aux_data_a_d:
                for feature_s in aux_data_a_d.iterkeys():
                    if holdout_col > 0:
                        raw_array = aux_data_a_d[feature_s][:, :-holdout_col]
                    else:
                        raw_array = aux_data_a_d[feature_s]
                    array = raw_array
                    X = np.concatenate((X, array[:, -lag:]), axis=1)
            estimatorX = Imputer(axis=0)
            X = estimatorX.fit_transform(X)
            prediction_a = results.predict(X)

        return prediction_a.reshape((-1, 1))
开发者ID:EricMichaelSmith,项目名称:school_district_prediction,代码行数:61,代码来源:create_predictions.py

示例4: fill_missing_values

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def fill_missing_values(_df, dis_features, cont_features):
    """Fill NaNs in the given DataFrame columns and return it.

    Discrete columns get their most frequent value; continuous columns get
    their column mean.
    """
    for cols, how in ((dis_features, 'most_frequent'), (cont_features, 'mean')):
        filler = Imputer(missing_values='NaN', strategy=how, axis=0)
        _df[cols] = filler.fit_transform(_df[cols].values)
    return _df
开发者ID:radotzki,项目名称:the-elections-challenge,代码行数:11,代码来源:main.py

示例5: main

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def main():
    """End-to-end pipeline for the West Nile virus data: merge the
    weather/train/test/spray sets, impute, fit a random forest, and report
    feature importances, accuracy, and ROC AUC on a random 20% holdout."""

    weather, train, spray, test = load_data()
    target = train.WnvPresent.values
    idcol = test.Id.values

    weather = wnvutils.clean_weather(weather)

    train = wnvutils.clean_train_test(train)
    test = wnvutils.clean_train_test(test)

    train, test = wnvutils.clean_train_test2(train, test)

    # Join the weather features onto each observation by date, then drop the
    # raw Date column so only numeric features remain.
    train = train.merge(weather, on="Date")
    test = test.merge(weather, on="Date")

    train.drop("Date", axis=1, inplace=True)
    test.drop("Date", axis=1, inplace=True)

    desc_df(train)

    # Keep only columns with at least one non-null value.
    # NOTE(review): train and test are filtered independently, so they may
    # end up with different column sets -- confirm downstream code copes.
    train = train.ix[:, pd.notnull(train).any(axis=0)]
    test = test.ix[:, pd.notnull(test).any(axis=0)]

    def min_dist_to_spray_(x):
        # Distance from this row's location to the nearest spray site.
        return wnvutils.min_dist_to_spray(x.Latitude, x.Longitude, spray)

    train["DistToSpray"] = train.apply(min_dist_to_spray_, axis=1)
    test["DistToSpray"] = test.apply(min_dist_to_spray_, axis=1)

    desc_df(train)

    # NOTE(review): the imputer is re-fitted on the test frame (fit_transform
    # twice) instead of reusing the train fit -- verify this is intended.
    imputer = Imputer()
    traina = imputer.fit_transform(train)
    testa = imputer.fit_transform(test)

    # Random 80/20 row split used as train/validation mask below.
    training = np.random.choice([True, False], size=train.shape[0], p=[0.8, 0.2])

    rfc = ensemble.RandomForestClassifier() # oob_score=True)
    rfc.fit(traina[training], target[training])
    # print("oob score:", rfc.oob_score_)

    # Write feature importances (descending) to screen and file.
    with open("output/feature_imp.txt", "w") as fout:
        for name, imp in sorted(zip(train.columns, rfc.feature_importances_),
                                key=lambda x: x[1], reverse=True):
            print(name, ":", imp)
            print(name, ":", imp, file=fout)

    predictions = rfc.predict(traina[~training])
    print("Accuracy:", (predictions == target[~training]).mean())

    predictions = rfc.predict_proba(traina[~training])
    np.savetxt("/tmp/predictions.txt", predictions[:, 1])

    print(predictions[:,1])
    print("ROC AUC Score:", roc_auc_score(target[~training], predictions[:,1]))
开发者ID:thekensta,项目名称:kaggle-west-nile-virus,代码行数:59,代码来源:wnv.py

示例6: test_model

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def test_model(data, stat_as_index, make_vector, model, do_pca=False, target='score'):
    """Train *model* on 2014+2015 feature vectors and predict 2016.

    data: raw stats container consumed by build_fvs.
    stat_as_index / make_vector: feature-construction helpers, passed through.
    model: sklearn-style estimator (classifier or regressor).
    do_pca: if True, whiten the features with PCA before fitting.
    target: name of the score field to predict.
    Returns (predictions, true 2016 scores, fitted model).
    """
    # Build per-year feature vectors and score vectors.
    print('Compiling stats...')
    fv, sc = [], []
    for year in ['2014', '2015', '2016']:
        f,s = build_fvs(
            data, year, stat_as_index, make_vector, target)
        fv.append(f)
        sc.append(s)

    # Compile into single vectors: Predict 2016 from 2014 and 2015
    fv_train, fv_test = np.vstack(fv[0:2]), fv[2]
    sc_train, sc_test = np.concatenate(sc[0:2]), sc[2]

    # Impute NaNs
    train_nan = np.isnan(fv_train)
    test_nan = np.isnan(fv_test)

    # NOTE(review): only row 0 of each matrix is zero-filled here -- looks
    # like a guard so an all-NaN column is not dropped by the Imputer below,
    # while other rows' NaNs are left to the imputer; confirm intent.
    for i in range(fv_train.shape[1]):
        if np.isnan(fv_train[0,i]):
            fv_train[0,i] = 0
    for i in range(fv_test.shape[1]):
        if np.isnan(fv_test[0,i]):
            fv_test[0,i] = 0

    print('Imputing...')
    # NOTE(review): train and test are imputed by independently fitted
    # imputers, so their fill values differ -- verify that is intended.
    if train_nan.any():
        i1 = Imputer()
        fv_train = i1.fit_transform(fv_train)
        #print(i1.statistics_)
    if test_nan.any():
        i2 = Imputer()
        fv_test = i2.fit_transform(fv_test)
        #print(i2.statistics_)

    if do_pca:
        print('Performing PCA...')
        pca = PCA(whiten=True)
        fv_train = pca.fit_transform(fv_train)
        fv_test = pca.transform(fv_test)

    print('Building test/train sets...')
    # Exclude players with missing scores
    train_nan, test_nan = np.isnan(sc_train), np.isnan(sc_test)
    fv_train, sc_train = fv_train[~train_nan], sc_train[~train_nan]
    fv_test, sc_test = fv_test[~test_nan], sc_test[~test_nan]

    print('Building model...')
    # Build model
    mod = model
    mod.fit(fv_train, sc_train)

    print('Predicting output...')
    # kluge to allow for classifier and regressor evaluation
    # NOTE(review): the bare except also hides genuine errors raised by
    # predict_proba, not just its absence on regressors.
    try: pred = mod.predict_proba(fv_test)
    except: pred = mod.predict(fv_test)
    return pred, sc_test, mod
开发者ID:emitch,项目名称:golfstat,代码行数:59,代码来源:results.py

示例7: fillData

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def fillData(trainFeatures, testFeatures, missing_values=np.NaN, strategy='mean', axis=0, verbose=0, copy=True, all=True):
    """Impute missing entries in the train and test feature matrices.

    When ``all`` is true, the two matrices are stacked so one imputer is
    fitted on their union and the pair is split back apart afterwards;
    otherwise each matrix is fitted and transformed independently.
    Returns (imputed train features, imputed test features).
    """
    imputer = Imputer(missing_values, strategy, axis, verbose, copy)
    if not all:
        # Independent fits: the test set gets its own statistics.
        return imputer.fit_transform(trainFeatures), imputer.fit_transform(testFeatures)
    n_train = len(trainFeatures)
    stacked = imputer.fit_transform(np.vstack((trainFeatures, testFeatures)))
    # Split the jointly-imputed stack back into its two halves.
    return np.array(stacked[:n_train]), np.array(stacked[n_train:])
开发者ID:muratcancicek,项目名称:Assignment-Projects,代码行数:12,代码来源:Algorithms.py

示例8: fill_missing_imputation

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def fill_missing_imputation(electionsData, most_frequent):
    """Impute *electionsData* in place: the listed columns get their most
    frequent value, every remaining (numeric) column gets its median."""

    # Keep only the requested columns that actually exist in the frame.
    most_frequent = electionsData.columns.intersection(most_frequent)

    im = Imputer(strategy="most_frequent")
    electionsData[most_frequent] = im.fit_transform(electionsData[most_frequent])

    # Fill all of the rest (numeric) using the column median
    # (the previous comment said "mean", but the strategy used is median).
    im = Imputer(strategy="median")
    electionsData[:] = im.fit_transform(electionsData[:])
开发者ID:asafshmir,项目名称:idc-sd,代码行数:12,代码来源:putting_it_all_together.py

示例9: imputing_most_frequent

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def imputing_most_frequent(dataset):
    """Replace missing values with each column's most common value.

    :param dataset: pandas DataFrame dataset.
    :return: the same dataset object, with its missing values filled in.
    """
    # copy=False asks the imputer to work on the data in place, so the
    # caller's frame is updated and can simply be handed back.
    filler = Imputer(missing_values='NaN', strategy='most_frequent', copy=False)
    filler.fit_transform(dataset)
    return dataset
开发者ID:nogur9,项目名称:PTSD,代码行数:12,代码来源:Data_Imputation.py

示例10: test_imputation_shape

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
    def test_imputation_shape(self):
        """The imputed matrix keeps the input's shape for every strategy,
        on both dense and sparse (CSR) inputs."""
        data = np.random.randn(10, 2)
        data[::2] = np.nan  # blank out every other row

        for how in ('mean', 'median', 'most_frequent'):
            imp = Imputer(strategy=how)
            assert_equal(imp.fit_transform(data).shape, (10, 2))
            assert_equal(imp.fit_transform(sparse.csr_matrix(data)).shape, (10, 2))
开发者ID:Bryan-LL,项目名称:auto-sklearn,代码行数:13,代码来源:test_imputation.py

示例11: preprocess

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
 def preprocess(self):
     """Split samples by label, mean-impute each split, and rebuild
     self.data / self.target so all positives precede all negatives."""
     positive_ids = set(urlid for urlid, label in self.target.iteritems() if label)
     positives = [row for key, row in self.data.iteritems() if key in positive_ids]
     negatives = [row for key, row in self.data.iteritems() if key not in positive_ids]
     # Targets are rewritten to match the positives-then-negatives layout.
     self.target = [1] * len(positives) + [0] * len(negatives)
     # NOTE(review): each split (and the test set) is imputed with its own
     # fit, so fill values differ between them -- confirm that is intended.
     imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
     positives = imp.fit_transform(positives)
     negatives = imp.fit_transform(negatives)
     self.data = np.concatenate((positives, negatives), axis=0)
     self.test_data = imp.fit_transform(self.test_data.values())
开发者ID:arvs,项目名称:carlton,代码行数:13,代码来源:baseline.py

示例12: median_impute

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
 def median_impute(self):
     """Load the train/test HFiles and median-impute entries marked -1,
     storing the results on self (tr, ta, te, attributes, class_index)."""
     train_file = HFile(self.trfile)
     test_file = HFile(self.tefile)
     self.attributes = train_file.attributes
     self.class_index = train_file.class_index
     # -1 is this file format's missing-value sentinel.
     imp = Imputer(missing_values=-1, strategy='median')
     self.tr = imp.fit_transform(train_file.data)
     self.ta = train_file.classes
     self.te = imp.fit_transform(test_file.data)
开发者ID:Hossein-Noroozpour,项目名称:PyHDM,代码行数:14,代码来源:HDataManager.py

示例13: solve_missing_values

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def solve_missing_values(data):
    """Replace missing values (NaN) with the per-column mean.

    Parameters
    ----------
    data: values whose missing entries should be filled.

    Returns
    -------
    The imputed array.
    """
    from sklearn.preprocessing import Imputer

    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    # fit_transform returns a *new* imputed array (copy=True by default), so
    # it must be returned; the previous version discarded this result and
    # handed back the unmodified input.
    return imp.fit_transform(data)
开发者ID:gdl-civestav-localization,项目名称:cinvestav_location_fingerprinting,代码行数:14,代码来源:scaling.py

示例14: run_importance

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def run_importance(clf, data, labels, feature_labels=None, string=""):
    """
    Fit a classifier on the full data set and plot averaged feature importances.

    :param clf: Classifier object that has a feature_importances_ member
    :param data: feature matrix; NaNs are mean-imputed before fitting
    :param labels: target labels aligned with data's rows
    :param feature_labels: names of the features (defaults to a single empty label)
    :param string: classifier name, used in the plot title
    :return: (void) plot Gini importance vs feature
    """
    # Sentinel default instead of a mutable default argument; [""] mirrors
    # the old behavior when the caller passes nothing.
    if feature_labels is None:
        feature_labels = [""]
    num_features = data.shape[1]
    importances = [0] * num_features

    imp = Imputer(missing_values=np.NaN, strategy="mean")
    data = imp.fit_transform(data)

    # Run the classifier 100 times and average the importances to smooth out
    # the estimator's randomness.
    for r in range(100):
        clf.fit(data, labels)
        importances = [importances[i] + clf.feature_importances_[i] for i in range(num_features)]
    importances = [importance / 100 for importance in importances]

    # Filter out features with exactly zero importance (e.g. all-zero columns);
    # non_zeros holds the surviving indices.
    non_zeros = [i for i in range(num_features) if not importances[i] == 0]
    importances = [importances[i] for i in non_zeros]
    feature_labels = [feature_labels[i] for i in non_zeros]

    # Plot the surviving features as a bar chart, labels under the bars.
    bar_width = 0.7
    plt.bar(range(len(feature_labels)), importances, bar_width)
    # (fixed a stray unary '+' in the original tick-position expression)
    plt.xticks([ind + float(bar_width) / 2 for ind in range(len(feature_labels))],
               feature_labels, rotation="vertical")
    plt.gcf().subplots_adjust(bottom=0.35)
    plt.xlabel("Feature")
    plt.ylabel("Gini Importance")
    plt.title("Gini Importance v. Features for " + string + " Classifier")
    plt.show()
开发者ID:danielgeng,项目名称:cs249_big_data_analytics,代码行数:37,代码来源:ml_models.py

示例15: test_3_stage

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
    def test_3_stage(self):
        """A three-node pipeline (CSV read -> Imputer -> CSV write) must
        produce the same output as running sklearn's Imputer directly."""
        from sklearn.preprocessing import Imputer

        infile_name = path_of_data('missing_vals.csv')

        pipeline = Pipeline()

        reader = pipeline.add(CSVRead(infile_name))
        writer = pipeline.add(CSVWrite(self._tmp_files.get('out.csv')))
        imputer_node = pipeline.add(wrap_and_make_instance(Imputer))

        # Wire the nodes: file -> imputer -> output file.
        reader['output'] > imputer_node['X_train']
        imputer_node['X_new'] > writer['input']

        self.run_pipeline(pipeline)

        # Control: impute the same CSV directly with sklearn.
        reference = Imputer()
        raw_sa = np.genfromtxt(infile_name, dtype=None, delimiter=",",
                               names=True)
        num_type = raw_sa[0][0].dtype  # NOTE: unused, kept from original
        raw_nd, _ = np_sa_to_nd(raw_sa)
        control = reference.fit_transform(raw_nd)

        result = self._tmp_files.csv_read('out.csv', True)

        self.assertTrue(np.allclose(result, control))
开发者ID:macressler,项目名称:UPSG,代码行数:29,代码来源:test_pipeline.py


注:本文中的sklearn.preprocessing.Imputer.fit_transform方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。