本文整理汇总了Python中sklearn.preprocessing.Imputer.fit_transform方法的典型用法代码示例。如果您正苦于以下问题:Python Imputer.fit_transform方法的具体用法?Python Imputer.fit_transform怎么用?Python Imputer.fit_transform使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.preprocessing.Imputer的用法示例。
在下文中一共展示了Imputer.fit_transform方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: process
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def process(discrete, cont):
    """Build a single design matrix from discrete and continuous features.

    Discrete columns are mode-imputed and one-hot encoded; continuous
    columns are mean-imputed and standardized.  The two groups are then
    concatenated column-wise.

    :param discrete: array-like of discrete (categorical) feature columns
    :param cont: array-like of continuous feature columns
    :return: 2-D numpy array of encoded discrete features followed by
             scaled continuous features
    """
    discrete_matrix = np.array(discrete)
    cont_matrix = np.array(cont)
    # Fill gaps: per-column mode for discrete data, per-column mean for
    # continuous data.
    discrete_matrix = Imputer(strategy='most_frequent').fit_transform(discrete_matrix)
    cont_matrix = Imputer(strategy='mean').fit_transform(cont_matrix)
    # Expand each discrete column into indicator (one-hot) features.
    encoder = OneHotEncoder()
    discrete_matrix = encoder.fit_transform(discrete_matrix).toarray()
    # Standardize continuous columns to zero mean / unit variance.
    cont_matrix = StandardScaler().fit_transform(cont_matrix)
    # Stack both groups side by side into one matrix.
    return np.concatenate((discrete_matrix, cont_matrix), axis=1)
示例2: preprocess
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def preprocess(data, feat_type):
    """Impute, scale, and one-hot encode a dense feature matrix.

    Missing values are replaced by the column's most common value for
    categorical columns and by the column mean for numerical columns.
    Numerical columns are then standardized and categorical columns are
    one-hot encoded via pandas.

    :param data: dense 2-D array, or a scipy sparse matrix (CSR is
                 returned untouched)
    :param feat_type: sequence of 'Numerical'/'Categorical' labels, one
                      per column of ``data``
    :return: numpy array of scaled numerical columns followed by one-hot
             encoded categorical columns
    """
    # Sparse CSR input is passed through unchanged.  Dense arrays have no
    # getformat(), which raises AttributeError; catching only that (instead
    # of a bare except) avoids hiding unrelated errors.  The print is a
    # py3-compatible replacement for the original py2 print statement.
    try:
        if data.getformat() == 'csr':
            return data
    except AttributeError:
        print(feat_type)
    # Separate numerical and categorical columns.
    idx_num = [i for i in range(len(feat_type)) if feat_type[i] == 'Numerical']
    data_num = data[:, idx_num]
    idx_cat = [i for i in range(len(feat_type)) if feat_type[i] == 'Categorical']
    data_cat = data[:, idx_cat]
    # Fill missing values (mean for numerical, mode for categorical).
    imp_num = Imputer(axis=0)
    data_num = imp_num.fit_transform(data_num)
    imp_cat = Imputer(axis=0, strategy='most_frequent')
    data_cat = imp_cat.fit_transform(data_cat)
    # Remove mean and divide by standard deviation.
    data_num = scale(data_num)
    # One-hot encode using pandas; has to be done column by column.
    data_cat_pd = pd.DataFrame(data_cat)
    for i in range(data_cat.shape[1]):
        data_cat_pd = pd.concat((data_cat_pd, pd.get_dummies(data_cat[:, i])), join='outer', axis=1)
    # Delete the columns that have been one-hot encoded; rename first,
    # otherwise some columns may be suppressed unwillingly.
    data_cat_pd.columns = [i for i in range(data_cat_pd.shape[1])]
    data_cat_pd = data_cat_pd.drop(data_cat_pd.iloc[:, [i for i in range(data_cat.shape[1])]], axis=1)
    data_cat = np.asarray(data_cat_pd)
    # Regroup categorical and numerical variables.
    return np.hstack((data_num, data_cat))
示例3: predict
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def predict(self, raw_array, results, aux_data_a_d=None, diff=False,
            holdout_col=0, lag=1, positive_control=False, **kwargs):
    """Predict the year of data immediately succeeding the input array.

    Axis 0 of ``raw_array`` indexes observations (schools) and axis 1
    indexes years.  ``results`` is a fitted model exposing ``predict``.

    :param aux_data_a_d: optional dict of feature name -> array shaped like
        ``raw_array``; each contributes its last ``lag`` columns as extra
        features
    :param diff: model year-over-year differences instead of raw levels
    :param holdout_col: withhold the last ``holdout_col`` years of data
        from the prediction (ideal for finding the error of the algorithm)
    :param lag: number of trailing years used as predictors
    :param positive_control: append the held-out target year itself as a
        feature (sanity check); with no holdout, a random column is used
        instead so the control is inert
    :return: (n_observations, 1) array of predictions
    """
    if positive_control:
        if holdout_col > 0:
            if diff:
                # Control feature is the held-out year's year-over-year change.
                if holdout_col == 1:
                    control_array = np.diff(raw_array[:, -2:],
                                            1, axis=1)
                else:
                    control_array = \
                        np.diff(raw_array[:, -holdout_col-1:-holdout_col+1],
                                1, axis=1)
            else:
                control_array = raw_array[:, -holdout_col]
        else:
            # No held-out truth available: random noise as a null control.
            control_array = np.random.randn(raw_array.shape[0], 1)
    if holdout_col > 0:
        raw_array = raw_array[:, :-holdout_col]
    # Keep a handle on the (trimmed) primary data: the aux-feature loops
    # below clobber raw_array, and the diff branch needs the last observed
    # level to reconstruct a prediction from the predicted change.
    prediction_raw_array = raw_array
    if diff:
        array = np.diff(raw_array, 1, axis=1)
        X = array[:, -lag:]
        if positive_control:
            X = np.concatenate((X, control_array.reshape(-1, 1)), axis=1)
        if aux_data_a_d:
            # Plain iteration replaces py2-only iterkeys().
            for feature_s in aux_data_a_d:
                if holdout_col > 0:
                    raw_array = aux_data_a_d[feature_s][:, :-holdout_col]
                else:
                    raw_array = aux_data_a_d[feature_s]
                array = np.diff(raw_array, 1, axis=1)
                X = np.concatenate((X, array[:, -lag:]), axis=1)
        # Mean-impute remaining gaps in the feature matrix.
        estimatorX = Imputer(axis=0)
        X = estimatorX.fit_transform(X)
        predicted_change_a = results.predict(X)
        # Prediction = last observed level + predicted change.
        estimator_orig = Imputer(axis=0)
        orig_a = estimator_orig.fit_transform(prediction_raw_array[:, -1].reshape(-1, 1))
        prediction_a = orig_a + predicted_change_a.reshape(-1, 1)
    else:
        array = raw_array
        X = array[:, -lag:]
        if positive_control:
            X = np.concatenate((X, control_array.reshape(-1, 1)), axis=1)
        if aux_data_a_d:
            for feature_s in aux_data_a_d:
                if holdout_col > 0:
                    raw_array = aux_data_a_d[feature_s][:, :-holdout_col]
                else:
                    raw_array = aux_data_a_d[feature_s]
                array = raw_array
                X = np.concatenate((X, array[:, -lag:]), axis=1)
        estimatorX = Imputer(axis=0)
        X = estimatorX.fit_transform(X)
        prediction_a = results.predict(X)
    return prediction_a.reshape((-1, 1))
示例4: fill_missing_values
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def fill_missing_values(_df, dis_features, cont_features):
    """Replace NaNs in ``_df``: column mode for discrete features,
    column mean for continuous features.  Returns the mutated frame."""
    # The two column groups are disjoint, so each (columns, strategy)
    # pair can be imputed independently.
    for columns, strategy in ((dis_features, 'most_frequent'),
                              (cont_features, 'mean')):
        imputer = Imputer(missing_values='NaN', strategy=strategy, axis=0)
        _df[columns] = imputer.fit_transform(_df[columns].values)
    return _df
示例5: main
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def main():
    """End-to-end West Nile Virus pipeline: load, clean, merge weather,
    impute, fit a random forest, and report feature importances plus
    held-out accuracy and ROC AUC."""
    weather, train, spray, test = load_data()
    target = train.WnvPresent.values
    idcol = test.Id.values  # retained for submission building; unused below
    weather = wnvutils.clean_weather(weather)
    train = wnvutils.clean_train_test(train)
    test = wnvutils.clean_train_test(test)
    train, test = wnvutils.clean_train_test2(train, test)
    # Join weather onto each observation by date, then drop the key.
    train = train.merge(weather, on="Date")
    test = test.merge(weather, on="Date")
    train.drop("Date", axis=1, inplace=True)
    test.drop("Date", axis=1, inplace=True)
    desc_df(train)
    # Keep only columns containing at least one non-null value.
    # (.loc replaces the removed .ix indexer; identical for boolean
    # column masks.)
    train = train.loc[:, pd.notnull(train).any(axis=0)]
    test = test.loc[:, pd.notnull(test).any(axis=0)]
    def min_dist_to_spray_(x):
        # Distance from this observation's location to the nearest spray site.
        return wnvutils.min_dist_to_spray(x.Latitude, x.Longitude, spray)
    train["DistToSpray"] = train.apply(min_dist_to_spray_, axis=1)
    test["DistToSpray"] = test.apply(min_dist_to_spray_, axis=1)
    desc_df(train)
    imputer = Imputer()
    traina = imputer.fit_transform(train)
    # NOTE(review): test is imputed with its own column means rather than
    # the training means, and its null columns were pruned independently
    # above -- confirm the train/test column sets actually match.
    testa = imputer.fit_transform(test)
    # Random 80/20 train/validation split of the rows.
    training = np.random.choice([True, False], size=train.shape[0], p=[0.8, 0.2])
    rfc = ensemble.RandomForestClassifier()  # oob_score=True)
    rfc.fit(traina[training], target[training])
    # print("oob score:", rfc.oob_score_)
    #
    # Report importances, highest first, to stdout and to a file.
    with open("output/feature_imp.txt", "w") as fout:
        for name, imp in sorted(zip(train.columns, rfc.feature_importances_),
                                key=lambda x: x[1], reverse=True):
            print(name, ":", imp)
            print(name, ":", imp, file=fout)
    # Evaluate on the held-out 20%.
    predictions = rfc.predict(traina[~training])
    print("Accuracy:", (predictions == target[~training]).mean())
    predictions = rfc.predict_proba(traina[~training])
    np.savetxt("/tmp/predictions.txt", predictions[:, 1])
    print(predictions[:, 1])
    print("ROC AUC Score:", roc_auc_score(target[~training], predictions[:, 1]))
示例6: test_model
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def test_model(data, stat_as_index, make_vector, model, do_pca=False, target='score'):
    """Train ``model`` on 2014+2015 feature vectors and evaluate on 2016.

    Returns (pred, sc_test, mod): the model's 2016 predictions, the true
    2016 scores (with missing-score rows removed), and the fitted model.
    """
    # Build per-year feature vectors and score vectors.
    print('Compiling stats...')
    fv, sc = [], []
    for year in ['2014', '2015', '2016']:
        f,s = build_fvs(
            data, year, stat_as_index, make_vector, target)
        fv.append(f)
        sc.append(s)
    # Compile into single vectors: predict 2016 from 2014 and 2015.
    fv_train, fv_test = np.vstack(fv[0:2]), fv[2]
    sc_train, sc_test = np.concatenate(sc[0:2]), sc[2]
    # Record which entries are NaN before any patching below.
    train_nan = np.isnan(fv_train)
    test_nan = np.isnan(fv_test)
    # Zero out NaNs in the FIRST row only.  NOTE(review): presumably this
    # guards against all-NaN columns, which Imputer would silently drop
    # (changing the matrix width) -- confirm that is the intent.
    for i in range(fv_train.shape[1]):
        if np.isnan(fv_train[0,i]):
            fv_train[0,i] = 0
    for i in range(fv_test.shape[1]):
        if np.isnan(fv_test[0,i]):
            fv_test[0,i] = 0
    print('Imputing...')
    # Mean-impute whichever matrices actually contained NaNs.
    if train_nan.any():
        i1 = Imputer()
        fv_train = i1.fit_transform(fv_train)
        #print(i1.statistics_)
    if test_nan.any():
        i2 = Imputer()
        fv_test = i2.fit_transform(fv_test)
        #print(i2.statistics_)
    if do_pca:
        print('Performing PCA...')
        # Whitened PCA is fit on train only; test is projected with it.
        pca = PCA(whiten=True)
        fv_train = pca.fit_transform(fv_train)
        fv_test = pca.transform(fv_test)
    print('Building test/train sets...')
    # Exclude players with missing scores.  (The *_nan names are reused
    # here for the score masks, shadowing the feature masks above.)
    train_nan, test_nan = np.isnan(sc_train), np.isnan(sc_test)
    fv_train, sc_train = fv_train[~train_nan], sc_train[~train_nan]
    fv_test, sc_test = fv_test[~test_nan], sc_test[~test_nan]
    print('Building model...')
    # Build model
    mod = model
    mod.fit(fv_train, sc_train)
    print('Predicting output...')
    # Kluge to allow for classifier and regressor evaluation: classifiers
    # expose predict_proba; regressors raise and fall back to predict.
    try: pred = mod.predict_proba(fv_test)
    except: pred = mod.predict(fv_test)
    return pred, sc_test, mod
示例7: fillData
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def fillData(trainFeatures, testFeatures, missing_values=np.NaN, strategy='mean', axis=0, verbose=0, copy=True, all = True):
imp = Imputer(missing_values, strategy, axis, verbose, copy)
if all:
trainCount = len(trainFeatures)
full = np.vstack((trainFeatures, testFeatures))
full = imp.fit_transform(full)
trainFeatures, testFeatures = np.array(full[:trainCount]), np.array(full[trainCount:])
return trainFeatures, testFeatures
else:
return imp.fit_transform(trainFeatures), imp.fit_transform(testFeatures)
示例8: fill_missing_imputation
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def fill_missing_imputation(electionsData, most_frequent):
    """Impute ``electionsData`` in place: the listed categorical columns
    get their per-column modal value; every remaining (numeric) column
    gets its per-column median."""
    # Restrict to the categorical columns actually present in the frame.
    most_frequent = electionsData.columns.intersection(most_frequent)
    electionsData[most_frequent] = Imputer(strategy="most_frequent").fit_transform(electionsData[most_frequent])
    # Median-impute everything that remains numeric.
    electionsData[:] = Imputer(strategy="median").fit_transform(electionsData[:])
示例9: imputing_most_frequent
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def imputing_most_frequent(dataset):
    '''
    :param dataset: pandas DataFrame dataset.
    :return: The same dataset where the missing values are replaced with the column's most common value
    '''
    imp = Imputer(missing_values='NaN', strategy='most_frequent', copy=False)
    # BUG FIX: copy=False mutates only the ndarray Imputer internally
    # converts the DataFrame to, so `dataset` itself was returned with its
    # NaNs intact.  Write the imputed values back into the frame explicitly.
    # (Assumes no column is entirely NaN, otherwise Imputer drops it and
    # the shapes would no longer match.)
    dataset[:] = imp.fit_transform(dataset)
    return dataset
示例10: test_imputation_shape
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def test_imputation_shape(self):
    """Verify the shapes of the imputed matrix for different strategies."""
    rows, cols = 10, 2
    X = np.random.randn(rows, cols)
    X[::2] = np.nan  # every other row is fully missing
    for strategy in ('mean', 'median', 'most_frequent'):
        imputer = Imputer(strategy=strategy)
        # Dense input keeps its shape...
        assert_equal(imputer.fit_transform(X).shape, (rows, cols))
        # ...and so does sparse CSR input.
        assert_equal(imputer.fit_transform(sparse.csr_matrix(X)).shape,
                     (rows, cols))
示例11: preprocess
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def preprocess(self):
    """Impute missing values and rebuild self.data / self.target / self.test_data.

    Observations whose urlid is labeled true come first, followed by the
    false ones; self.target is rebuilt to match that ordering.  Uses
    items()/range() instead of the py2-only iteritems()/xrange so the
    method runs under both Python 2 and 3.
    """
    # Partition observations by their boolean label.
    true_ids = set([urlid for urlid, label in self.target.items() if label])
    true_data = [v for k, v in self.data.items() if k in true_ids]
    false_data = [v for k, v in self.data.items() if k not in true_ids]
    self.target = [1 for x in range(len(true_data))] + [0 for x in range(len(false_data))]
    # Mean-impute each group (and the test set) independently; each
    # fit_transform call refits, so each matrix uses its own column means.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    true_data = imp.fit_transform(true_data)
    false_data = imp.fit_transform(false_data)
    self.data = np.concatenate((true_data, false_data), axis=0)
    # list() materializes the py3 dict view for the imputer.
    self.test_data = imp.fit_transform(list(self.test_data.values()))
示例12: median_impute
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def median_impute(self):
    """Load the train/test HFiles and median-impute entries coded as -1,
    storing the results on self (tr/ta/te plus header metadata)."""
    train_file = HFile(self.trfile)
    test_file = HFile(self.tefile)
    # Carry header metadata over from the training file.
    self.attributes = train_file.attributes
    self.class_index = train_file.class_index
    # -1 is this format's missing-value sentinel.
    imputer = Imputer(missing_values=-1, strategy='median')
    self.tr = imputer.fit_transform(train_file.data)
    self.ta = train_file.classes
    self.te = imputer.fit_transform(test_file.data)
示例13: solve_missing_values
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def solve_missing_values(data):
    """
    Replace missing values (NaN) with the per-column mean.

    Parameters
    ----------
    data: array-like whose NaN entries mark missing values

    Returns
    -------
    numpy array with every NaN replaced by its column's mean
    """
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    # BUG FIX: the transformed result was previously discarded (Imputer
    # copies by default), so the original, still-NaN `data` was returned.
    return imp.fit_transform(data)
示例14: run_importance
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def run_importance(clf, data, labels, feature_labels=[""], string=""):
    """
    Fit a classifier repeatedly and plot averaged feature importances.

    :param clf: classifier object that has a feature_importances_ member
    :param data: feature matrix; NaNs are mean-imputed before fitting
    :param labels: target labels aligned with data's rows
    :param feature_labels: names of the features
    :param string: classifier name used in the plot title
    :return: (void) plot Gini importance vs feature
    """
    n_runs = 100
    num_features = data.shape[1]
    importances = [0] * num_features
    imp = Imputer(missing_values=np.NaN, strategy="mean")
    data = imp.fit_transform(data)
    # Average the importances over repeated fits to damp run-to-run noise
    # from the classifier's internal randomness.
    for _ in range(n_runs):
        clf.fit(data, labels)
        importances = [importances[i] + clf.feature_importances_[i]
                       for i in range(num_features)]
    importances = [importance / n_runs for importance in importances]
    # Filter out the features that have 0 importance (e.g. values are all 0);
    # non_zeros are the indices in feature_importances that are not 0.
    non_zeros = [i for i in range(num_features) if not importances[i] == 0]
    importances = [importances[i] for i in non_zeros]
    feature_labels = [feature_labels[i] for i in non_zeros]
    # Plot the features.
    bar_width = 0.7
    plt.bar(range(len(feature_labels)), importances, bar_width)
    # Center each label under its bar (fixed a stray "+ +" typo here).
    plt.xticks([ind + float(bar_width) / 2 for ind in range(len(feature_labels))],
               feature_labels, rotation="vertical")
    plt.gcf().subplots_adjust(bottom=0.35)
    plt.xlabel("Feature")
    plt.ylabel("Gini Importance")
    plt.title("Gini Importance v. Features for " + string + " Classifier")
    plt.show()
示例15: test_3_stage
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit_transform [as 别名]
def test_3_stage(self):
    """Run a 3-node pipeline (CSV read -> Imputer -> CSV write) and check
    its output against a direct sklearn Imputer on the same file."""
    from sklearn.preprocessing import Imputer
    infile_name = path_of_data('missing_vals.csv')
    # Build the pipeline: read CSV, impute, write CSV.
    p = Pipeline()
    csv_read_node = p.add(CSVRead(infile_name))
    csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))
    impute_node = p.add(wrap_and_make_instance(Imputer))
    # Wire the nodes: reader output feeds the imputer, imputer feeds writer.
    csv_read_node['output'] > impute_node['X_train']
    impute_node['X_new'] > csv_write_node['input']
    self.run_pipeline(p)
    # Control computation: impute the same file directly with sklearn.
    ctrl_imputer = Imputer()
    ctrl_X_sa = np.genfromtxt(infile_name, dtype=None, delimiter=",",
                              names=True)
    num_type = ctrl_X_sa[0][0].dtype
    # Convert the structured array to a plain ndarray before imputing.
    ctrl_X_nd, ctrl_X_sa_type = np_sa_to_nd(ctrl_X_sa)
    ctrl_X_new_nd = ctrl_imputer.fit_transform(ctrl_X_nd)
    control = ctrl_X_new_nd
    # The pipeline's CSV output must match the control within tolerance.
    result = self._tmp_files.csv_read('out.csv', True)
    self.assertTrue(np.allclose(result, control))