

Python preprocessing.Imputer Method Code Examples

This article collects typical usage examples of the Python method sklearn.preprocessing.Imputer. If you are wondering what preprocessing.Imputer does in practice, how to call it, or what real-world uses look like, the hand-picked code examples below should help. You can also explore further usage examples from its containing module, sklearn.preprocessing.

The following presents 30 code examples of the preprocessing.Imputer method, sorted by popularity by default.
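
Before turning to the collected examples, here is a minimal, self-contained sketch of typical Imputer usage. Note that sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed in 0.22; on newer versions the equivalent class is sklearn.impute.SimpleImputer (the commented-out import below). Which of the two imports applies depends on your installed scikit-learn version.

import numpy as np
from sklearn.preprocessing import Imputer   # scikit-learn < 0.22 only
# from sklearn.impute import SimpleImputer  # replacement in scikit-learn >= 0.20

X = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, 6.0]])

# Replace NaNs with the per-column mean; axis=0 means "impute along columns"
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_imputed = imp.fit_transform(X)
# X_imputed[1, 0] is now 4.0, the mean of 1.0 and 7.0 in the first column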

Example 1: setUpClass

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def setUpClass(self):
        """
        Set up the unit test by loading the dataset and training a model.
        """
        from sklearn.datasets import load_boston

        scikit_data = load_boston()
        scikit_model = Imputer(strategy="most_frequent", axis=0)
        scikit_data["data"][1, 8] = np.NaN

        input_data = scikit_data["data"][:, 8].reshape(-1, 1)
        scikit_model.fit(input_data, scikit_data["target"])

        # Save the data and the model
        self.scikit_data = scikit_data
        self.scikit_model = scikit_model 
Developer: apple, Project: coremltools, Lines: 18, Source: test_categorical_imputer.py


Example 2: readFile

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def readFile(inpath):
    if os.path.isfile(inpath):
        dataset = genfromtxt(open(inpath,'r'), delimiter=',', dtype='f8')[0:] 
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)  # fill in the missing values with the mean of each column
        transformedData = imp.fit_transform(dataset)
        rmvedCols = imp.statistics_
        idxRmved = np.where(np.isnan(rmvedCols))#take the indices of the nan columns
        nanTarget = dataset.shape[1]-1 in idxRmved[0]#check if the target is a nan column
        if nanTarget:
            raise ValueError("The target variable contains only nan values or inf")
    else:
        raise ValueError("File does not exist")    
    return transformedData
    
#parameters: vector 'target' which is the target variable
#returns: the dataset which includes the previous values of the target 
Developer: h-cel, Project: ClimateVegetationDynamics_GrangerCausality, Lines: 18, Source: GC_script.py


Example 3: test_imputer_float_inputs

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def test_imputer_float_inputs(self):
        model = Imputer(missing_values="NaN", strategy="mean", axis=0)
        data = [[1, 2], [np.nan, 3], [7, 6]]
        model.fit(data)

        model_onnx = convert_sklearn(model, "scikit-learn imputer",
                                     [("input", FloatTensorType([None, 2]))])
        self.assertTrue(model_onnx.graph.node is not None)

        # should contain only one node
        self.assertEqual(len(model_onnx.graph.node), 1)

        # last node should contain the Imputer
        outputs = model_onnx.graph.output
        self.assertEqual(len(outputs), 1)
        self.assertEqual(outputs[0].type.tensor_type.shape.dim[-1].dim_value,
                         2)
        dump_data_and_model(
            np.array(data, dtype=np.float32),
            model,
            model_onnx,
            basename="SklearnImputerMeanFloat32",
        ) 
Developer: onnx, Project: sklearn-onnx, Lines: 25, Source: test_sklearn_imputer_converter.py


Example 4: test_simple_imputer_float_inputs

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def test_simple_imputer_float_inputs(self):
        model = SimpleImputer(strategy="mean", fill_value="nan")
        data = [[1, 2], [np.nan, 3], [7, 6]]
        model.fit(data)

        model_onnx = convert_sklearn(
            model,
            "scikit-learn simple imputer",
            [("input", FloatTensorType([None, 2]))],
            target_opset=TARGET_OPSET)
        self.assertTrue(model_onnx.graph.node is not None)

        # should contain only one node
        self.assertEqual(len(model_onnx.graph.node), 1)

        # last node should contain the Imputer
        outputs = model_onnx.graph.output
        self.assertEqual(len(outputs), 1)
        self.assertEqual(
            outputs[0].type.tensor_type.shape.dim[-1].dim_value, 2)
        dump_data_and_model(
            np.array(data, dtype=np.float32),
            model, model_onnx,
            basename="SklearnSimpleImputerMeanFloat32") 
Developer: onnx, Project: sklearn-onnx, Lines: 26, Source: test_sklearn_imputer_converter.py


Example 5: test_objectmapper

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def test_objectmapper(self):
        df = pdml.ModelFrame([])
        self.assertIs(df.preprocessing.Binarizer, pp.Binarizer)
        self.assertIs(df.preprocessing.FunctionTransformer,
                      pp.FunctionTransformer)
        self.assertIs(df.preprocessing.Imputer, pp.Imputer)
        self.assertIs(df.preprocessing.KernelCenterer, pp.KernelCenterer)
        self.assertIs(df.preprocessing.LabelBinarizer, pp.LabelBinarizer)
        self.assertIs(df.preprocessing.LabelEncoder, pp.LabelEncoder)
        self.assertIs(df.preprocessing.MultiLabelBinarizer, pp.MultiLabelBinarizer)
        self.assertIs(df.preprocessing.MaxAbsScaler, pp.MaxAbsScaler)
        self.assertIs(df.preprocessing.MinMaxScaler, pp.MinMaxScaler)
        self.assertIs(df.preprocessing.Normalizer, pp.Normalizer)
        self.assertIs(df.preprocessing.OneHotEncoder, pp.OneHotEncoder)
        self.assertIs(df.preprocessing.PolynomialFeatures, pp.PolynomialFeatures)
        self.assertIs(df.preprocessing.RobustScaler, pp.RobustScaler)
        self.assertIs(df.preprocessing.StandardScaler, pp.StandardScaler) 
Developer: pandas-ml, Project: pandas-ml, Lines: 19, Source: test_preprocessing.py


Example 6: test_transform_1d_frame_int

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def test_transform_1d_frame_int(self):
        arr = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3])
        idx = pd.Index('a b c d e f g h i'.split(' '))
        df = pdml.ModelFrame(arr, index=idx, columns=['X'])
        self.assertEqual(len(df.columns), 1)

        # reshape arr to 2d
        arr = arr.reshape(-1, 1)

        if pd.compat.PY3:
            models = ['Binarizer', 'Imputer', 'StandardScaler']
            # MinMaxScaler raises TypeError in ufunc
        else:
            models = ['Binarizer', 'Imputer', 'StandardScaler', 'MinMaxScaler']

        for model in models:
            mod1 = getattr(df.preprocessing, model)()
            mod2 = getattr(pp, model)()

            self._assert_transform(df, arr, mod1, mod2)

            mod1 = getattr(df.preprocessing, model)()
            mod2 = getattr(pp, model)()
            self._assert_fit_transform(df, arr, mod1, mod2) 
Developer: pandas-ml, Project: pandas-ml, Lines: 26, Source: test_preprocessing.py


Example 7: test_Imputer

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def test_Imputer(self):
        arr = np.array([1, np.nan, 3, 2])
        s = pdml.ModelSeries(arr)

        mod1 = s.pp.Imputer(axis=0)
        s.fit(mod1)
        result = s.transform(mod1)

        expected = np.array([1, 2, 3, 2])

        self.assertIsInstance(result, pdml.ModelSeries)
        self.assert_numpy_array_almost_equal(result.values, expected)

        mod1 = s.pp.Imputer(axis=0)
        result = s.fit_transform(mod1)

        self.assertIsInstance(result, pdml.ModelSeries)
        self.assert_numpy_array_almost_equal(result.values, expected) 
Developer: pandas-ml, Project: pandas-ml, Lines: 20, Source: test_preprocessing.py


Example 8: FeatureCombination

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def FeatureCombination(Df,s='',num_feature=2): 
    feature_set = []
    for c in Df.columns:
        if c.startswith(s): feature_set.append(c)
    print('combining', len(feature_set), 'features')
    data = Df[feature_set].values

    for c in Df.columns:
        if Df[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(Df[c].values))
            Df[c] = lbl.transform(list(Df[c].values))
            
    imp = preprocessing.Imputer()
    data = imp.fit_transform(data)
    data = preprocessing.scale(data)
    pca = PCA(num_feature)
    pca.fit(data)
    print('explained_variance_ratio_:', pca.explained_variance_ratio_)
    trans = pca.transform(data)
    for i in range(0,num_feature):
        Df[s+'_%d'%(i+1)] = trans[:,i]
    Df.drop(feature_set,1,inplace=True)
    return Df 
Developer: LenzDu, Project: Kaggle-Competition-Sberbank, Lines: 26, Source: Utils.py


Example 9: test_imputer

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def test_imputer(self):
        try:
            model = Imputer(missing_values='NaN', strategy='mean', axis=0)
        except TypeError:
            model = Imputer(missing_values=np.nan, strategy='mean')
            model.axis = 0
        data = [[1, 2], [np.nan, 3], [7, 6]]
        model.fit(data)
        from onnxmltools.convert.coreml.convert import convert
        import coremltools  # noqa
        try:
            model_coreml = coremltools.converters.sklearn.convert(model)
        except ValueError as e:
            if 'not supported' in str(e):
                # Python 2.7 + scikit-learn 0.22
                return
        model_onnx = convert(model_coreml.get_spec())
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(np.array(data, dtype=np.float32),
                            model, model_onnx, basename="CmlImputerMeanFloat32") 
Developer: onnx, Project: onnxmltools, Lines: 22, Source: test_cml_ImputerConverter.py
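
Example 9 above catches a TypeError to cope with the constructor change between scikit-learn versions. The sketch below shows the same compatibility idea more explicitly; the helper name make_mean_imputer is hypothetical and does not come from any of the projects on this page.

import numpy as np

def make_mean_imputer():
    try:
        # scikit-learn < 0.22: Imputer still lives in sklearn.preprocessing
        from sklearn.preprocessing import Imputer
        return Imputer(missing_values='NaN', strategy='mean', axis=0)
    except ImportError:
        # scikit-learn >= 0.22: Imputer was removed, use SimpleImputer instead
        from sklearn.impute import SimpleImputer
        return SimpleImputer(missing_values=np.nan, strategy='mean')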


Example 10: impute_data

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def impute_data(self,x):
        """Imputes data set containing Nan values"""
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        return imp.fit_transform(x) 
Developer: lacava, Project: few, Lines: 6, Source: few.py


Example 11: __init__

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def __init__(self, params, dataset):
        """Initializes a UMAPTransformer object.

        Args:
            params (Namespace): Contains parameters used to instantiate the transformer.
            dataset (Dataset): Dataset used to "train" the projection mapping.
        """

        # TODO: decide whether to make n_epochs a parameter
        #default_n_epochs = None
        default_n_epochs = 500

        if params.prediction_type == 'classification':
            target_metric = 'categorical'
        else:
            target_metric = 'l2'
        self.scaler = RobustScaler()
        # Use Imputer to replace missing values (NaNs) with means for each column
        self.imputer = Imputer()
        scaled_X = self.scaler.fit_transform(self.imputer.fit_transform(dataset.X))
        self.mapper = umap.UMAP(n_neighbors=params.umap_neighbors, 
                                n_components=params.umap_dim,
                                metric=params.umap_metric,
                                target_metric=target_metric,
                                target_weight=params.umap_targ_wt,
                                min_dist=params.umap_min_dist,
                                n_epochs=default_n_epochs)
        # TODO: How to deal with multitask data?
        self.mapper.fit(scaled_X, y=dataset.y.flatten())

    # **************************************************************************************** 
Developer: ATOMconsortium, Project: AMPL, Lines: 33, Source: transformations.py
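
A hedged usage sketch for the constructor above: the parameter values are chosen purely for illustration, and the stand-in dataset object only mimics the .X and .y attributes the code reads. Neither comes from the AMPL project.

from argparse import Namespace
from types import SimpleNamespace
import numpy as np

# Hypothetical parameter values; the attribute names follow what __init__ reads above
params = Namespace(prediction_type='regression', umap_neighbors=15,
                   umap_dim=2, umap_metric='euclidean',
                   umap_targ_wt=0.5, umap_min_dist=0.1)

# Minimal stand-in dataset exposing the .X and .y attributes the constructor uses
dataset = SimpleNamespace(X=np.random.rand(100, 20), y=np.random.rand(100, 1))

transformer = UMAPTransformer(params, dataset)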


Example 12: get_clf_pipeline

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def get_clf_pipeline():
    clf = models.DefaultClassifier(
            GradientBoostingClassifier(
                       loss='deviance', learning_rate=0.01, n_estimators=3000,
                       subsample=0.6, min_samples_split=12, min_samples_leaf=12,
                       max_depth=6, random_state=1357, verbose=0)
           )
    steps = [('features', models.FeatureSelector()),
             ('Impute', Imputer(strategy='median')),
             ('scaler', StandardScaler()),
             ('clf', clf)]
    return Pipeline(steps) 
Developer: songgc, Project: loan-default-prediction, Lines: 14, Source: train_predict.py


Example 13: get_reg_pipeline

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def get_reg_pipeline():
    clf = models.PartialRegressor(
            GradientBoostingRegressor(loss='ls', learning_rate=0.0075, n_estimators=5000,
                 subsample=0.5, min_samples_split=20, min_samples_leaf=20, max_leaf_nodes=30,
                 random_state=9753, verbose=0)
            )
    steps = [('features', models.FeatureSelector()),
             ('Impute', Imputer(strategy='median')),
             ('scaler', StandardScaler()),
             ('clf', clf)]
    return Pipeline(steps) 
Developer: songgc, Project: loan-default-prediction, Lines: 13, Source: train_predict.py


Example 14: __init__

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def __init__(self, max_iter=10, initial_strategy='mean', tol=1e-3, f_model="RandomForest"):
        self.max_iter = max_iter
        self.initial_strategy = initial_strategy
        self.initial_imputer = Imputer(strategy=initial_strategy)
        self.tol = tol
        self.f_model = f_model 
Developer: log0ymxm, Project: predictive_imputer, Lines: 8, Source: predictive_imputer.py


Example 15: data_handlemissing

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def data_handlemissing(dataframe, pipeline):
    try:
        if pipeline['options']['type'] == "dropcolumns":
            thresh = pipeline['options']['thresh']
            if thresh == -1:
                dataframe.dropna(axis=1, how="all", inplace=True)
            elif thresh == 0:
                dataframe.dropna(axis=1, how="any", inplace=True)
            elif thresh > 0:
                dataframe.dropna(axis=1, thresh=thresh, inplace=True)
        elif pipeline['options']['type'] == "droprows":
            thresh = pipeline['options']['thresh']
            if thresh == -1:
                dataframe.dropna(axis=0, how="all", inplace=True)
            elif thresh == 0:
                dataframe.dropna(axis=0, how="any", inplace=True)
            elif thresh > 0:
                dataframe.dropna(axis=0, thresh=thresh)
        elif pipeline['options']['type'] == "fillmissing":
            strategy = pipeline['options']['strategy']
            imp = Imputer(missing_values='NaN', strategy=strategy, axis=0)
            array = imp.fit_transform(dataframe.values)
            dataframe = pandas.DataFrame(array, columns = dataframe.columns)

        return dataframe
    except Exception as e:
        raise Exception("data_handlemissing: " + str(e)) 
Developer: tech-quantum, Project: sia-cog, Lines: 29, Source: pipelinecomponents.py
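
The function above dispatches on a nested options dictionary. The call below is a hypothetical illustration; the dictionary shape is inferred only from the keys the function reads ('type', 'thresh', 'strategy'), not from the sia-cog project's actual pipeline schema.

import numpy as np
import pandas

df = pandas.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [np.nan, 5.0, 6.0]})

# Fill missing values with the per-column mean
cleaned = data_handlemissing(df, {'options': {'type': 'fillmissing', 'strategy': 'mean'}})

# Drop every column that contains at least one missing value (thresh == 0)
trimmed = data_handlemissing(df, {'options': {'type': 'dropcolumns', 'thresh': 0}})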


Example 16: test_conversion_boston

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def test_conversion_boston(self):

        from sklearn.datasets import load_boston

        scikit_data = load_boston()

        sh = scikit_data.data.shape

        rn.seed(0)
        missing_value_indices = [
            (rn.randint(sh[0]), rn.randint(sh[1])) for k in range(sh[0])
        ]

        for strategy in ["mean", "median", "most_frequent"]:
            for missing_value in [0, "NaN", -999]:

                X = np.array(scikit_data.data).copy()

                for i, j in missing_value_indices:
                    X[i, j] = missing_value

                model = Imputer(missing_values=missing_value, strategy=strategy)
                model = model.fit(X)

                tr_X = model.transform(X.copy())

                spec = converter.convert(model, scikit_data.feature_names, "out")

                input_data = [dict(zip(scikit_data.feature_names, row)) for row in X]

                output_data = [{"out": row} for row in tr_X]

                result = evaluate_transformer(spec, input_data, output_data)

                assert result["num_errors"] == 0 
Developer: apple, Project: coremltools, Lines: 37, Source: test_imputer.py


Example 17: test_conversion_bad_inputs

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def test_conversion_bad_inputs(self):
        # Error on converting an untrained model
        with self.assertRaises(Exception):
            model = Imputer()
            spec = converter.convert(model, "data", "out")

        # Check the expected class during conversion.
        with self.assertRaises(Exception):
            from sklearn.linear_model import LinearRegression

            model = LinearRegression()
            spec = converter.convert(model, "data", "out") 
Developer: apple, Project: coremltools, Lines: 14, Source: test_categorical_imputer.py


Example 18: __init__

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def __init__(self):
        self.supports_output = True
        self.default_transformation_pipeline = [Imputer(strategy='mean'), StandardScaler()] 
Developer: bjherger, Project: keras-pandas, Lines: 5, Source: Numerical.py


Example 19: createAuto

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def createAuto(target):
    win = 13  # window size: how many previous values of the target to use (12 lags, since range(1, win) runs from 1 to 12)
    dataAuto = np.empty((len(target),win-1))
    for i in range(1,win):
        dataAuto[:,i-1] = shift2(target, i)
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    transformedDataAuto = imp.fit_transform(dataAuto)           
    X_auto = transformedDataAuto
    return X_auto  
    
#parameters: 'X' the predictors, 'y' the target, 'cvFolds' number of folds, 'estimator' machine learning algorithm 
#returns: the R squared for each fold 
Developer: h-cel, Project: ClimateVegetationDynamics_GrangerCausality, Lines: 14, Source: GC_script.py


Example 20: fit

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def fit(self, X, y=None):
        self.imp = Imputer(strategy=self.strategy)
        self.imp.fit(X)
        self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
        return self 
Developer: jem1031, Project: pandas-pipelines-custom-transformers, Lines: 7, Source: custom_transformers.py
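
The fit method above is an excerpt from a custom transformer. For context, here is a minimal sketch of a complete wrapper class built around the same idea (keeping pandas column labels on statistics_); the class name DFImputer and the transform method are assumptions, not code from the original project.

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer  # scikit-learn < 0.22

class DFImputer(BaseEstimator, TransformerMixin):
    """Imputer wrapper that keeps column labels (class name is hypothetical)."""

    def __init__(self, strategy='mean'):
        self.strategy = strategy

    def fit(self, X, y=None):
        self.imp = Imputer(strategy=self.strategy)
        self.imp.fit(X)
        # Label the fitted statistics with the DataFrame's column names
        self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
        return self

    def transform(self, X):
        # Return a DataFrame so downstream steps keep the column names
        return pd.DataFrame(self.imp.transform(X), index=X.index, columns=X.columns)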


Example 21: train_model

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def train_model(titanic_data_path, model_output_path):
    print('Loading the data...')
    try:
        with tf.gfile.Open(titanic_data_path, 'r') as data_file:
            train_df = pd.read_csv(data_file)
            print('Number of samples: {}'.format(train_df.shape[0]))

            target_name = 'Survived'
            feature_names = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']

            print('Preparing the features...')
            train_features = train_df[feature_names].copy()
            train_features['Age'] = Imputer().fit_transform(train_features['Age'].values.reshape(-1, 1))
            embarked = train_features['Embarked']
            train_features['Embarked'] = embarked.fillna(embarked.mode()[0])
            train_features = pd.get_dummies(train_features)
            train_target = train_df[target_name]

            print('Training the model...')
            parameters = {'max_depth': [2, 3, 4, 5, 6, 7], 'n_estimators': [50, 100, 150, 200]}
            gsc = GridSearchCV(GradientBoostingClassifier(), parameters, n_jobs=-1, cv=5)
            gsc.fit(train_features, train_target)
            print('Best Hyper Parameters: {}'.format(gsc.best_params_))
            print('Accuracy: {}'.format(gsc.best_score_))

            with tf.gfile.Open(model_output_path, 'wb') as model_file:
                joblib.dump(gsc.best_estimator_, model_file, protocol=1)
    except Exception as e:
        print('Error: {}'.format(e)) 
Developer: GoogleCloudPlatform, Project: ml-on-gcp, Lines: 31, Source: titanic.py


Example 22: test_model_imputer

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def test_model_imputer(self):
        model = Imputer(missing_values="NaN", strategy="mean", axis=0)
        data = [[1, 2], [np.nan, 3], [7, 6]]
        model.fit(data)
        # The conversion works but internally scikit-learn converts
        # everything into float before looking into missing values.
        # There is no nan integer. The runtime is not tested
        # in this case.
        model_onnx = convert_sklearn(model, "scikit-learn imputer",
                                     [("input", Int64TensorType([None, 2]))])
        self.assertTrue(model_onnx is not None) 
Developer: onnx, Project: sklearn-onnx, Lines: 13, Source: test_sklearn_imputer_converter.py


Example 23: test_imputer_int_inputs

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def test_imputer_int_inputs(self):
        model = Imputer(missing_values="NaN", strategy="mean", axis=0)
        data = [[1, 2], [np.nan, 3], [7, 6]]
        model.fit(data)
        model_onnx = convert_sklearn(model, "scikit-learn imputer",
                                     [("input", Int64TensorType([None, 2]))])
        self.assertEqual(len(model_onnx.graph.node), 2)

        # Last node should be Imputer
        outputs = model_onnx.graph.output
        self.assertEqual(len(outputs), 1)
        self.assertEqual(outputs[0].type.tensor_type.shape.dim[-1].dim_value,
                         2) 
Developer: onnx, Project: sklearn-onnx, Lines: 15, Source: test_sklearn_imputer_converter.py


Example 24: test_transform_series_int

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def test_transform_series_int(self):
        arr = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3])
        s = pdml.ModelSeries(arr, index='a b c d e f g h i'.split(' '))

        # reshape arr to 2d
        arr = arr.reshape(-1, 1)

        if pd.compat.PY3:
            models = ['Binarizer', 'Imputer', 'StandardScaler']
            # MinMaxScaler raises TypeError in ufunc
        else:
            models = ['Binarizer', 'Imputer', 'StandardScaler', 'MinMaxScaler']

        for model in models:
            mod1 = getattr(s.preprocessing, model)()
            mod2 = getattr(pp, model)()
            s.fit(mod1)
            mod2.fit(arr)

            result = s.transform(mod1)
            expected = mod2.transform(arr).flatten()

            self.assertIsInstance(result, pdml.ModelSeries)
            self.assert_numpy_array_almost_equal(result.values, expected)

            mod1 = getattr(s.preprocessing, model)()
            mod2 = getattr(pp, model)()

            result = s.fit_transform(mod1)
            expected = mod2.fit_transform(arr).flatten()

            self.assertIsInstance(result, pdml.ModelSeries)
            self.assert_numpy_array_almost_equal(result.values, expected) 
Developer: pandas-ml, Project: pandas-ml, Lines: 35, Source: test_preprocessing.py


Example 25: impute_and_scale

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """

    df = df.dropna(axis=1, how='all')

    #imputer = Imputer(strategy='mean', axis=0)
    imputer = Imputer(strategy='mean')
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)

    df = pd.DataFrame(mat, columns=df.columns)

    return df 
Developer: ECP-CANDLE, Project: Benchmarks, Lines: 34, Source: p1b3.py


Example 26: impute_and_scale

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """

    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean')
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)

    df = pd.DataFrame(mat, columns=df.columns)

    return df 
Developer: ECP-CANDLE, Project: Benchmarks, Lines: 33, Source: NCI60.py


Example 27: impute_and_scale

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def impute_and_scale(df, scaling='std', imputing='mean', dropna='all'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """

    if dropna:
        df = df.dropna(axis=1, how=dropna)
    else:
        empty_cols = df.columns[df.notnull().sum() == 0]
        df[empty_cols] = 0

    if imputing is None or imputing.lower() == 'none':
        mat = df.values
    else:
        imputer = Imputer(strategy=imputing)
        mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)

    return df 
Developer: ECP-CANDLE, Project: Benchmarks, Lines: 39, Source: uno_data.py
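
A short hypothetical call to the impute_and_scale variant above, purely for illustration:

import numpy as np
import pandas as pd

df = pd.DataFrame({'f1': [1.0, np.nan, 3.0], 'f2': [np.nan, 2.0, 4.0]})

# Mean-impute the NaNs, then rescale every column to the [0, 1] range
scaled = impute_and_scale(df, scaling='minmax', imputing='mean', dropna='all')
print(scaled)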


Example 28: impute_and_scale_array

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def impute_and_scale_array(mat, scaling=None):
    """ Impute missing values with mean and scale data included in numpy array.

        Parameters
        ----------
        mat : numpy array
            Array to scale
        scaling : string
            String describing type of scaling to apply.
            Options recognized: 'maxabs', 'minmax', 'std'.
            'maxabs' : scales data to range [-1 to 1].
            'minmax' : scales data to range [0 to 1].
            'std'    : scales data to normal variable with mean 0 and standard deviation 1.
            (Default: None, no scaling).

        Return
        ----------
        Returns the numpy array imputed with the mean value of the \
        column and scaled by the method specified. If no scaling method is specified, \
        it returns the imputed numpy array.
    """
    
#    imputer = Imputer(strategy='mean', axis=0, copy=False)
#    imputer = SimpleImputer(strategy='mean', copy=False)
    # Next line is from conditional import. axis=0 is default
    # in old version so it is not necessary.
    imputer = Imputer(strategy='mean', copy=False)
    imputer.fit_transform(mat)
    
    return scale_array(mat, scaling) 
Developer: ECP-CANDLE, Project: Benchmarks, Lines: 32, Source: data_utils.py


Example 29: test_permutation_test_score_allow_nans

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def test_permutation_test_score_allow_nans():
    # Check that permutation_test_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] / 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    permutation_test_score(p, X, y, cv=5) 
Developer: alvarobartt, Project: twitter-stock-recommendation, Lines: 12, Source: test_validation.py


Example 30: test_cross_val_score_allow_nans

# Required import: from sklearn import preprocessing [as alias]
# Or: from sklearn.preprocessing import Imputer [as alias]
def test_cross_val_score_allow_nans():
    # Check that cross_val_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] / 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cross_val_score(p, X, y, cv=5) 
Developer: alvarobartt, Project: twitter-stock-recommendation, Lines: 12, Source: test_validation.py



Note: The sklearn.preprocessing.Imputer examples in this article were collected from source-code and documentation hosting platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by many developers, and the copyright of the source code remains with the original authors. Please consult the corresponding project's license before distributing or using the code; do not reproduce this article without permission.