本文整理匯總了Python中sklearn.preprocessing.imputation.Imputer類的典型用法代碼示例。如果您正苦於以下問題:Python Imputer類的具體用法?Python Imputer怎麼用?Python Imputer使用的例子?那麼,這裡精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了Imputer類的10個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: _impute
def _impute(features, imputer=True):
    """
    Replace missing values in a feature array, preferring mean imputation
    and falling back to numpy's NaN conversion when that is not usable.

    @param features: the feature values that need to be imputed
    @type features: numpy.array
    @param imputer: whether or not the scikit imputing method should be used
    @type imputer: boolean
    @return: the modified feature values
    @rtype: numpy.array
    """
    # Imputation switched off: only run numpy's NaN/infinity conversion.
    if not imputer:
        return np.nan_to_num(features)

    mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=0,
                           verbose=2)
    try:
        imputed = mean_imputer.fit_transform(features)
    except ValueError as exc:
        # Illegal values (e.g. strings) make the imputer raise; keep the
        # input as it is in that case.
        log.warning("Exception trying to run scikit imputation: {}".format(exc))
        imputed = features

    # The shape must not change. If the imputer returned a differently
    # shaped array, discard its result and convert NaNs and infinities
    # with numpy instead.
    if imputed.shape != features.shape:
        log.warning("Imputer failed, filtering NaN based on numpy converter")
        return np.nan_to_num(features)
    return imputed
示例2: setUp
def setUp(self):
    """Load the ARFF test dataset and precompute the shared helper values.

    Remembers the current working directory in ``self.cwd``, changes into
    the directory that contains this test module, decodes
    ``datasets/dataset.arff``, builds a one-hot encoded / imputed / scaled
    dense copy of the features, and stores the results of several helper
    functions through ``self.helpers.set_value``.
    """
    self.cwd = os.getcwd()
    # Resolve to an absolute path first: dirname of a bare relative
    # __file__ is '' and os.chdir('') raises.
    tests_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(tests_dir)

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class
    self.attribute_types = [
        'nominal' if isinstance(type_, list) else 'numeric'
        for name, type_ in dataset['attributes'][:-1]]
    self.categorical = [attribute == 'nominal'
                        for attribute in self.attribute_types]

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1,))

    ohe = OneHotEncoder(self.categorical)
    X_transformed = ohe.fit_transform(X)
    imp = Imputer(copy=False)
    X_transformed = imp.fit_transform(X_transformed)
    # Only center the data when it is dense.
    center = not scipy.sparse.isspmatrix(X_transformed)
    standard_scaler = StandardScaler(with_mean=center)
    X_transformed = standard_scaler.fit_transform(X_transformed)
    # Densify only when the pipeline produced a sparse matrix; a dense
    # array has no todense() method.
    if scipy.sparse.isspmatrix(X_transformed):
        X_transformed = X_transformed.todense()

    # Transform the array which indicates the categorical metafeatures
    number_numerical = np.sum(~np.array(self.categorical))
    categorical_transformed = (
        [True] * (X_transformed.shape[1] - number_numerical)
        + [False] * number_numerical)
    self.categorical_transformed = categorical_transformed

    self.X = X
    self.X_transformed = X_transformed
    self.y = y
    self.mf = meta_features.metafeatures
    self.helpers = meta_features.helper_functions

    # Precompute some helper functions
    self.helpers.set_value(
        "PCA", self.helpers["PCA"](self.X_transformed, self.y))
    self.helpers.set_value(
        "MissingValues",
        self.helpers["MissingValues"](self.X, self.y, self.categorical))
    self.helpers.set_value(
        "NumSymbols",
        self.helpers["NumSymbols"](self.X, self.y, self.categorical))
    self.helpers.set_value(
        "ClassOccurences",
        self.helpers["ClassOccurences"](self.X, self.y))
    self.helpers.set_value(
        "Skewnesses",
        self.helpers["Skewnesses"](self.X_transformed, self.y,
                                   self.categorical_transformed))
    self.helpers.set_value(
        "Kurtosisses",
        self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                    self.categorical_transformed))
示例3: test_imputation_shape
def test_imputation_shape():
    """Check that each strategy keeps the (10, 2) shape of its input."""
    data = np.random.randn(10, 2)
    # Blank out every second row so there is something to impute.
    data[::2] = np.nan

    for strategy in ("mean", "median", "most_frequent"):
        imputer = Imputer(strategy=strategy)

        # Dense input.
        dense_result = imputer.fit_transform(data)
        assert_equal(dense_result.shape, (10, 2))

        # The same data as a CSR matrix.
        sparse_result = imputer.fit_transform(sparse.csr_matrix(data))
        assert_equal(sparse_result.shape, (10, 2))
示例4: test_imputation_pickle
def test_imputation_pickle():
    """Check that a fitted imputer transforms identically after pickling."""
    import pickle

    size = 100
    X = sparse_random_matrix(size, size, density=0.10)

    for strategy in ("mean", "median", "most_frequent"):
        fitted = Imputer(missing_values=0, strategy=strategy)
        fitted.fit(X)

        # Round-trip the fitted estimator through pickle.
        restored = pickle.loads(pickle.dumps(fitted))

        expected = fitted.transform(X.copy())
        actual = restored.transform(X.copy())
        message = ("Fail to transform the data after pickling "
                   "(strategy = %s)" % (strategy))
        assert_array_equal(expected, actual, message)
示例5: check_indicator
def check_indicator(X, expected_imputed_features, axis):
    """Check the output of an imputer with ``add_indicator_features=True``.

    Fits one imputer without and one with indicator features on ``X``
    (missing values are marked by -1) and asserts that:

    - both report ``expected_imputed_features`` in ``imputed_features_``;
    - the indicator output equals the plain output with the -1 mask of the
      expected features stacked to its right;
    - CSC and CSR versions of ``X`` give the same indicator output.
    """
    n_samples, _ = X.shape

    plain = Imputer(missing_values=-1, strategy='mean', axis=axis)
    with_indicator = clone(plain).set_params(add_indicator_features=True)

    Xt = plain.fit_transform(X)
    Xt_with_in = with_indicator.fit_transform(X)

    # Boolean mask of the entries that were missing in the expected columns.
    missing_mask = X[:, expected_imputed_features] == -1
    n_plain_columns = Xt.shape[1]
    n_indicator_columns = len(with_indicator.imputed_features_)

    assert_array_equal(plain.imputed_features_, expected_imputed_features)
    assert_array_equal(with_indicator.imputed_features_,
                       expected_imputed_features)
    assert_equal(Xt_with_in.shape,
                 (n_samples, n_plain_columns + n_indicator_columns))
    assert_array_equal(Xt_with_in, np.hstack((Xt, missing_mask)))

    # Repeat with a fresh clone on both sparse layouts.
    with_indicator = clone(plain).set_params(add_indicator_features=True)
    for to_sparse in (sparse.csc_matrix, sparse.csr_matrix):
        assert_array_equal(Xt_with_in,
                           with_indicator.fit_transform(to_sparse(X)).A)
示例6: test_imputation_copy
def test_imputation_copy():
    """Test imputation with copy=True.

    For the default parameters and for an explicit ``copy=True``, transform
    a sparse and a dense input, overwrite one entry of the result and check
    that the result no longer equals the input.
    """
    size = 5

    # Test default behaviour and with copy=True
    for params in [{}, {'copy': True}]:
        X = sparse_random_matrix(size, size, density=0.75, random_state=0)

        # Sparse input
        imputer = Imputer(missing_values=0, strategy="mean", **params)
        Xt = imputer.fit(X).transform(X)
        # Write -1 rather than NaN: NaN never compares equal, not even to
        # itself, so the assertion below would pass even if X and Xt shared
        # the same buffer.
        Xt[0, 0] = -1
        # Check that the objects are different and that they don't use
        # the same buffer
        assert_false(np.all(X.todense() == Xt))

        # Dense input
        imputer = Imputer(missing_values=0, strategy="mean", **params)
        X = X.todense()
        Xt = imputer.fit(X).transform(X)
        Xt[0, 0] = -1
        # Check that the objects are different and that they don't use
        # the same buffer
        assert_false(np.all(X == Xt))
示例7: test_imputation_copy
def test_imputation_copy():
    """Check when the imputer's output shares memory with its input.

    Each case fits and transforms the same data, overwrites one entry of
    the result with -1, and then compares input and result: equal means
    the buffer was shared (no copy), different means a copy was made.
    """
    template = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    def mean_impute(data, missing, **options):
        # Build a fresh mean imputer, then fit and transform the same data.
        model = Imputer(missing_values=missing, strategy="mean", **options)
        return model.fit(data).transform(data)

    # copy=True, dense => copy
    X = template.copy().toarray()
    Xt = mean_impute(X, 0, copy=True)
    Xt[0, 0] = -1
    assert_false(np.all(X == Xt))

    # copy=True, sparse csr => copy
    X = template.copy()
    Xt = mean_impute(X, X.data[0], copy=True)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, dense => no copy
    X = template.copy().toarray()
    Xt = mean_impute(X, 0, copy=False)
    Xt[0, 0] = -1
    assert_true(np.all(X == Xt))

    # copy=False on sparse input: (convert to csc?, axis, expected check)
    sparse_cases = (
        (False, 1, assert_true),   # csr, axis=1 => no copy
        (True, 0, assert_true),    # csc, axis=0 => no copy
        (False, 0, assert_false),  # csr, axis=0 => copy
        (True, 1, assert_false),   # csc, axis=1 => copy
    )
    for as_csc, axis, check in sparse_cases:
        X = template.copy().tocsc() if as_csc else template.copy()
        Xt = mean_impute(X, X.data[0], copy=False, axis=axis)
        Xt.data[0] = -1
        check(np.all(X.data == Xt.data))

    # copy=False, sparse csr, axis=1, missing_values=0 => copy
    X = template.copy()
    Xt = mean_impute(X, 0, copy=False, axis=1)
    assert_false(sparse.issparse(Xt))
示例8: _check_statistics
def _check_statistics(X, X_true, strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Runs the imputer on ``X`` in four configurations (dense and sparse
    input, axis 0 and axis 1; the axis=1 runs use the transposed data)
    and asserts that:

    - ``statistics_`` equals ``statistics`` (checked for axis=0);
    - the transformed data equals ``X_true`` (transposed for axis=1);
    - for axis=1, ``transform`` raises ValueError when ``statistics``
      contains a NaN.
    """
    err_msg = ("Parameters: strategy = %s, missing_values = %s, "
               "axis = {0}, sparse = {1}" % (strategy, missing_values))

    # Dense input, axis = 0
    dense_cols = Imputer(missing_values, strategy=strategy, axis=0)
    X_trans = dense_cols.fit(X).transform(X.copy())
    assert_array_equal(dense_cols.statistics_, statistics,
                       err_msg.format(0, False))
    assert_array_equal(X_trans, X_true, err_msg.format(0, False))

    # Dense input, axis = 1
    dense_rows = Imputer(missing_values, strategy=strategy, axis=1)
    dense_rows.fit(X.transpose())
    if not np.isnan(statistics).any():
        X_trans = dense_rows.transform(X.copy().transpose())
        assert_array_equal(X_trans, X_true.transpose(),
                           err_msg.format(1, False))
    else:
        assert_raises(ValueError, dense_rows.transform,
                      X.copy().transpose())

    # Sparse input, axis = 0
    sparse_cols = Imputer(missing_values, strategy=strategy, axis=0)
    sparse_cols.fit(sparse.csc_matrix(X))
    X_trans = sparse_cols.transform(sparse.csc_matrix(X.copy()))
    if sparse.issparse(X_trans):
        # Densify so the result can be compared element-wise.
        X_trans = X_trans.toarray()
    assert_array_equal(sparse_cols.statistics_, statistics,
                       err_msg.format(0, True))
    assert_array_equal(X_trans, X_true, err_msg.format(0, True))

    # Sparse input, axis = 1
    sparse_rows = Imputer(missing_values, strategy=strategy, axis=1)
    sparse_rows.fit(sparse.csc_matrix(X.transpose()))
    if not np.isnan(statistics).any():
        X_trans = sparse_rows.transform(
            sparse.csc_matrix(X.copy().transpose()))
        if sparse.issparse(X_trans):
            X_trans = X_trans.toarray()
        assert_array_equal(X_trans, X_true.transpose(),
                           err_msg.format(1, True))
    else:
        assert_raises(ValueError, sparse_rows.transform,
                      sparse.csc_matrix(X.copy().transpose()))
示例9: setUp
def setUp(self):
    """Load the ARFF test dataset as a sparse matrix and precompute helpers.

    Remembers the current working directory in ``self.cwd``, changes into
    the directory of this test module, decodes ``datasets/dataset.arff``,
    builds a CSR copy of the features plus a one-hot encoded / imputed /
    scaled version of it, and stores the results of several helper and
    metafeature functions through their ``set_value`` methods.
    """
    self.cwd = os.getcwd()
    module_path = __file__
    os.chdir(os.path.dirname(module_path))

    arff_decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as arff_file:
        dataset = arff_decoder.decode(arff_file, encode_nominal=True)

    # The last attribute is the class, so it is left out here.
    feature_attributes = dataset['attributes'][:-1]
    self.attribute_types = [
        'numeric' if type(attr_type) != list else 'nominal'
        for _, attr_type in feature_attributes]
    self.categorical = [
        True if kind == 'nominal' else False
        for kind in self.attribute_types]

    table = np.array(dataset['data'], dtype=np.float64)
    X = table[:, :-1]
    y = table[:, -1].reshape((-1,))

    # Set every non-finite entry to zero before building the sparse
    # matrix. Original rationale: when an encoded dense matrix is
    # converted to sparse, the values which are encoded to zero are lost.
    X_sparse = X.copy()
    non_finite = ~np.isfinite(X_sparse)
    X_sparse[non_finite] = 0
    X_sparse = sparse.csr_matrix(X_sparse)

    encoder = OneHotEncoder(self.categorical)
    X_transformed = encoder.fit_transform(X_sparse.copy())
    X_transformed = Imputer(copy=False).fit_transform(X_transformed)
    X_transformed = StandardScaler().fit_transform(X_transformed)

    # Transform the array which indicates the categorical metafeatures
    number_numerical = np.sum(~np.array(self.categorical))
    number_encoded = X_transformed.shape[1] - number_numerical
    self.categorical_transformed = (
        [True] * number_encoded + [False] * number_numerical)

    self.X = X_sparse
    self.X_transformed = X_transformed
    self.y = y
    self.mf = meta_features.metafeatures
    self.helpers = meta_features.helper_functions

    # Precompute some helper functions, in this order:
    # (registry, name, positional arguments).
    raw_args = (self.X, self.y, self.categorical)
    transformed_args = (self.X_transformed, self.y,
                        self.categorical_transformed)
    precomputed = (
        (self.helpers, "PCA", (self.X_transformed, self.y)),
        (self.helpers, "MissingValues", raw_args),
        (self.mf, "NumberOfMissingValues", raw_args),
        (self.helpers, "NumSymbols", raw_args),
        (self.helpers, "ClassOccurences", (self.X, self.y)),
        (self.helpers, "Skewnesses", transformed_args),
        (self.helpers, "Kurtosisses", transformed_args),
    )
    for registry, key, arguments in precomputed:
        registry.set_value(key, registry[key](*arguments))
示例10: print
count += 1
if count % 1000 == 0:
print(count)
val = noncat_matrix[x, y]
if val - math.floor(val) != 0.0:
for i in range(20):
if abs(abs(val) * i - math.ceil(abs(val) * i)) < 0.001:
X[x, 2 * y] = math.ceil(abs(val) * i)
X[x, 2 * y + 1] = i
return X
# Build the train and test matrices: the first CAT_COUNT columns get
# most-frequent imputation, the remaining columns get their missing
# values filled with 0.
# NOTE(review): `.ix` and `.as_matrix()` are no longer available in
# current pandas releases -- confirm the pandas version this script targets.
# categories
print("building train")
train_cat_matr = train_df.ix[:, 0:CAT_COUNT].as_matrix()
# Fit the most-frequent imputer on the train columns; the same fitted
# imputer is reused for the test columns below.
imp = Imputer(missing_values="NaN", strategy="most_frequent", axis=0)
train_cat_matr = imp.fit_transform(train_cat_matr)
# imp2 = Imputer(missing_values='NaN', strategy='median')
train_noncat_matr = train_df.ix[:, CAT_COUNT:].fillna(0).as_matrix()
# train_noncat_matr = train_df.ix[:, CAT_COUNT:].as_matrix()
# train_noncat_matr = imp2.fit_transform(train_noncat_matr)
# allf = np.hstack((train_cat_matr, train_noncat_matr))
print("building test")
# Overwrite the first CAT_COUNT test columns with test_set_to_encode
# (defined outside this excerpt) before extracting them.
test_df.ix[:, 0:CAT_COUNT] = test_set_to_encode
test_cat_matr = test_df.ix[:, 0:CAT_COUNT].as_matrix()
# transform only: reuses what `imp` learned from the train columns.
test_cat_matr = imp.transform(test_cat_matr)
test_noncat_matr = test_df.ix[:, CAT_COUNT:].fillna(0).as_matrix()
# test_noncat_matr = test_df.ix[:, CAT_COUNT:].as_matrix()
# test_noncat_matr = imp2.transform(test_noncat_matr)