本文整理汇总了Python中sklearn.preprocessing.Imputer.fit方法的典型用法代码示例。如果您正苦于以下问题:Python Imputer.fit方法的具体用法?Python Imputer.fit怎么用?Python Imputer.fit使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.preprocessing.Imputer的用法示例。
在下文中一共展示了Imputer.fit方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: load_datasets
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def load_datasets(feature_paths, label_paths):
    """Load feature and label files, imputing missing feature values.

    Parameters
    ----------
    feature_paths : iterable of str
        Paths to comma-separated feature files; '?' marks a missing value
        and the files have no header row.  Each file has 41 feature columns.
    label_paths : iterable of str
        Paths to single-column label files with no missing values.

    Returns
    -------
    tuple
        (feature, label) where feature has shape (n, 41) and label is a
        flat vector of length n.
    """
    feature = np.ndarray(shape=(0, 41))
    label = np.ndarray(shape=(0, 1))
    for file in feature_paths:
        # Read one feature file: comma separated, '?' is the missing-value
        # marker, no header row.
        df = pd.read_table(file, delimiter=',', na_values='?', header=None)
        # Impute missing entries with the column mean: fit() trains the
        # preprocessor, transform() produces the imputed result.
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(df)
        df = imp.transform(df)
        # Append the imputed block to the accumulated feature matrix.
        feature = np.concatenate((feature, df))
    for file in label_paths:
        # Label files contain no missing values; append them directly.
        df = pd.read_table(file, header=None)
        label = np.concatenate((label, df))
    # Flatten labels to a 1-D vector.
    label = np.ravel(label)
    return feature, label
示例2: bnp_svm
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def bnp_svm(train, test):
    """Train an RBF SVM on the float columns of *train* and return class
    probabilities for *test*.

    Missing values in the float columns are replaced by the column mean;
    the imputer is fit on the training sample and reused on the test set.

    Parameters
    ----------
    train : DataFrame with a 'target' column and float64 feature columns.
    test : DataFrame with the same float64 feature columns.

    Returns
    -------
    ndarray of per-class probabilities from SVC.predict_proba.
    """
    print('bnpsvm')
    # If a value is missing, set it to the column average.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    # Work on a random subsample to keep SVM training time reasonable.
    train = train.sample(1000)
    # Training matrix: float columns only, imputed, as a float array.
    train1 = train.select_dtypes(include=['float64'])
    imp.fit(train1)
    train1 = imp.transform(train1)
    train1 = np.array(train1).astype(float)
    # Target vector for the sampled rows.
    target = np.array(train['target']).astype(int)
    # Test matrix, imputed with the statistics learned on the training data.
    test1 = test.select_dtypes(include=['float64'])
    test1 = imp.transform(test1)
    test1 = np.array(test1).astype(float)
    clf = svm.SVC(gamma=0.001, C=100, probability=True)
    clf.fit(train1, target)
    yhat = clf.predict_proba(test1)
    return yhat
#print(bnp_svm(train, test))
示例3: data_preprocessing_descriptive
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def data_preprocessing_descriptive(Extracted_Features, Coma_Features, Corrected_Features):
    """Mean-impute per-file morphology features and write corrected files.

    For every non-hidden file in Coma_Features with at least two data rows,
    replaces missing ('nan') feature values with the column mean and writes
    a tab-separated file to Corrected_Features with the original class
    label (column 0), neuron name (column 1) and feature-name header row
    preserved.

    Parameters
    ----------
    Extracted_Features, Coma_Features, Corrected_Features : str
        Directory paths (with trailing separator) used by the pipeline.
    """
    lvltrace.lvltrace("LVLEntree dans data_preprocessing_descriptive dans preproc_descriptive")
    tools.separate_coma(Extracted_Features, Coma_Features)
    for root, dirs, files in os.walk(Coma_Features):
        for i in files:
            if not i.startswith('.'):
                input_i = Coma_Features + i
                output_i = Corrected_Features + i
                lines = tools.file_lines(input_i)
                ncol = tools.file_col(input_i)
                if lines >= 2:
                    # Numeric view for imputation: skip the header row and
                    # the first two (label / name) columns.
                    data = np.genfromtxt(input_i, delimiter=',')
                    X = data[1:, 2:]
                    # Raw (typed) view read once; the original re-read the
                    # same file three times for y, z and w.
                    raw = np.genfromtxt(input_i, delimiter=',', dtype=None)
                    y = raw[:, 0]   # class labels (column 0)
                    z = raw[:, 1]   # neuron names (column 1)
                    w = raw[0, :]   # feature names (header row)
                    # Replace missing 'nan' values by the column mean.
                    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
                    imp.fit(X)
                    Y = imp.transform(X)
                    # 'with' guarantees the output file is closed; the
                    # original opened it as 'file' (shadowing the builtin)
                    # and closed it in a branch where it could be unbound.
                    with open(output_i, "w") as out:
                        for line in range(Y.shape[0] + 1):
                            for colonne in range(Y.shape[1] + 2):
                                if colonne == 0:
                                    out.write("%s\t" % y[line])
                                elif colonne == 1:
                                    out.write("%s\t" % z[line])
                                elif line == 0:
                                    # Header row: feature names.
                                    out.write("%s\t" % w[colonne])
                                else:
                                    # Imputed value; offset by the header
                                    # row and the two leading columns.
                                    out.write("%f\t" % Y[line - 1, colonne - 2])
                            out.write("\n")
                else:
                    print("Only one morphology !!!")
    lvltrace.lvltrace("LVLSortie dans data_preprocessing_descriptive dans preproc_descriptive")
开发者ID:xaviervasques,项目名称:Neuron_Morpho_Classification_ML,代码行数:62,代码来源:data_preprocessing_descriptive.py
示例4: clf_fit_transform
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def clf_fit_transform(self):
    """Load self.dataset, impute missing values, encode labels, split
    train/test and standardise the feature matrices.

    Side effects: sets self.df, self.X_train, self.X_test, self.y_train,
    self.y_test, self.stdsc, self.X_train_std and self.X_test_std.
    """
    # Import the dataset; '?' marks missing values.
    self.df = pd.read_csv(self.dataset, na_values=["?"])
    # Clean the dataset with mean imputation (median / most_frequent are
    # alternative strategies).
    imr = Imputer(missing_values='NaN', strategy='mean', axis=0, copy=False)
    imr.fit(self.df)
    X_imputed_df = pd.DataFrame(imr.transform(self.df.values), columns=self.df.columns)
    # Drop the identifier column; 'class' is the target.
    X_imputed_df.drop(['id'], 1, inplace=True)
    X = np.array(X_imputed_df.drop(['class'], 1))
    y = np.array(X_imputed_df['class'])
    le = LabelEncoder()
    y = le.fit_transform(y)
    # Hold out 20% of the rows for testing.
    self.X_train, self.X_test, self.y_train, self.y_test = cross_validation.train_test_split(X, y, test_size=0.2, random_state=0)
    # Standardise: fit the scaler on the training data only, then apply
    # the learned scaling to the test data.
    self.stdsc = StandardScaler()
    self.X_train_std = self.stdsc.fit_transform(self.X_train)
    self.X_test_std = self.stdsc.transform(self.X_test)
示例5: test
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def test():
    """Run the cost-sensitive cross-validation experiment on the UCI
    dataset whose basename matches the module-level constant DS.

    Vectorises the ARFF rows, mean-imputes missing values, then for ten
    shuffles runs 10-fold CV with varying cost ratio R.  Exits the
    process after the first matching dataset.
    """
    vec = DictVectorizer()
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    for filename in glob.glob(r'../dataset/UCI/*.arff'):
        basename = re.sub(r'(\..*?)$', '', os.path.basename(filename))
        print(basename)
        if basename != DS:
            continue
        data = arff.loadarff(filename)[0]
        # One dict per row (all columns but the label), vectorised to a
        # dense numeric matrix.
        X = vec.fit_transform(np.array([{str(i): value for i, value in enumerate(list(row)[:-1])} for row in data])).toarray()
        imp.fit(X)
        X = imp.transform(X)
        labels = np.array([row[-1] for row in data])
        # Map each distinct label string to an integer class id.
        y = np.array([{v: k for k, v in enumerate(list(set(labels)))}[label] for label in labels])
        random = np.random.permutation(range(len(X)))
        print('dataset ratio\t%s' % ('\t'.join([alg + " " * (12 - len(alg)) for alg in sorted(ALG.keys())])))
        for iteration in range(10):
            X, y, class_num, kf = X[random], y[random], set(labels), KFold(len(X), n_folds=10)
            for train, test in kf:
                length, train_size = len(train), 0.1
                X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
                # Keep 10% labelled; the rest is treated as unlabelled.
                X_label, X_unlabel, y_label, y_unlabel = train_test_split(X_train, y_train, test_size=1.0 - train_size, random_state=0)
                for R in range(2, 10):
                    # Uniform cost matrix vs. one penalising the second
                    # class R times more.
                    ones_matrix, cost_matrix = np.array([[1, 1], [1, 1]]), np.array([[1, 1], [R, R]])
                    cross_validation("%s R=%d" % (basename, R), X_label, X_unlabel, y_label, y_unlabel, ones_matrix, cost_matrix)
        exit()
示例6: trainSVM
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def trainSVM(x1, x2, kernel):
    """Train an SVM separating x1 (label +1) from x2 (label -1).

    kernel == 0 selects a linear SVM, anything else an RBF SVM with
    probability estimates; in both cases C is tuned by 5-fold grid
    search.  Missing values are replaced by the column mean.

    Returns
    -------
    The best estimator found by the grid search.
    """
    # Prepare data: materialise rows as lists (the original used bare
    # map(), which returns a non-concatenable iterator on Python 3).
    x1 = [list(row) for row in x1]
    x2 = [list(row) for row in x2]
    X = x1 + x2
    y1 = ones((shape(x1)[0], 1))
    y2 = -1 * ones((shape(x2)[0], 1))
    Y = ravel(list(y1) + list(y2))
    if kernel == 0:
        # Linear SVM; C increases the penalty for misclassified points.
        svm = LinearSVC()
        params = {'C': [1, 10, 50, 100, 200, 300]}
    else:
        # RBF SVM with probability estimates enabled.
        svm = SVC(probability=True)
        params = {'C': [50, 100, 200, 300]}
    grid = GridSearchCV(svm, params, cv=5)
    # Replace missing values with the column mean before training.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X)
    trainData = imp.transform(X)
    # Run fit with all parameter combinations and keep the best model.
    grid.fit(trainData, Y)
    return grid.best_estimator_
示例7: load_datasets
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def load_datasets(feature_paths, label_paths):
    """Read feature and label files and return them as arrays.

    Parameters
    ----------
    feature_paths : iterable of str
        Paths to utf-8 CSV files whose first column is a datetime index;
        each file contributes 41 feature columns.
    label_paths : iterable of str
        Paths to single-column label files with no missing values.

    Returns
    -------
    tuple
        (feature, label) where feature has shape (n, 41) and label is a
        flat vector of length n.
    """
    # Feature matrix has 41 columns; the label array has one.
    feature = np.ndarray(shape=(0, 41))
    label = np.ndarray(shape=(0, 1))
    for file in feature_paths:
        # read_csv: utf-8 encoding, parse column 0 as dates and use it as
        # the row index.
        data = pd.read_csv(file, encoding='utf-8', parse_dates=[0], index_col=0)
        # Impute missing values with the column mean: fit() trains the
        # preprocessor, transform() produces the imputed result.
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        # BUG FIX: the original called imp.fit(df)/imp.transform(df) on an
        # undefined name 'df'; the frame just read is 'data'.
        imp.fit(data)
        data = imp.transform(data)
        # Append the imputed block to the accumulated feature matrix.
        feature = np.concatenate((feature, data))
    # Read the label files.
    for file in label_paths:
        df = pd.read_table(file, header=None)
        label = np.concatenate((label, df))
    # Flatten labels to a 1-D vector.
    label = np.ravel(label)
    return feature, label
示例8: ImputeCategorical
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
class ImputeCategorical(BaseEstimator, TransformerMixin):
    """Impute a specified list of columns, or all columns if None, with
    the most frequent value; 0 is treated as the missing-value marker.
    """

    def __init__(self, columns=None):
        # columns: names to impute; resolved to all columns at fit time.
        self.columns = columns
        self.imputer = None

    def fit(self, data, target=None):
        """Fit the imputer on a data frame with named columns.

        Returns self so the estimator can be chained.
        """
        # Encode all columns if columns is None.
        if self.columns is None:
            self.columns = data.columns
        # One imputer covers every selected column; missing values are
        # encoded as 0 and replaced by the most frequent value.
        self.imputer = Imputer(missing_values=0, strategy='most_frequent')
        self.imputer.fit(data[self.columns])
        return self

    def transform(self, data):
        """Return a copy of *data* with the selected columns imputed."""
        output = data.copy()
        output[self.columns] = self.imputer.transform(output[self.columns])
        return output
示例9: eval_func
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def eval_func(chromosome):
    """Fitness function for the genetic hyper-parameter search.

    Decodes XGBoost hyper-parameters from *chromosome*, imputes missing
    data, derives new features from the most target-correlated pairs,
    trains a model and returns its best score.

    Relies on module-level globals: train, test, features, correlation_p,
    output_col_name, grid_search_pd, grid_search_columns,
    grid_search_file, train_model and pairwise.
    """
    t_par = chromosome.getInternalList()
    print("## Start with Individual : " + str(t_par))
    # Unpack the hyper-parameters encoded in the individual.
    eta = t_par[0]
    max_depth = t_par[1]
    subsample = t_par[2]
    colsample_bytree = t_par[3]
    n_estimators = t_par[4]
    test_size = t_par[5]
    imp_start = t_par[6]          # imputation strategy name
    num_of_feat_corr = t_par[7]   # how many correlated features to pair
    print("## Filling missing data")
    imp = Imputer(missing_values='NaN', strategy=imp_start, axis=0)
    imp.fit(train[features])
    train[features] = imp.transform(train[features])
    test[features] = imp.transform(test[features])
    curr_features = copy.deepcopy(features)
    print("## Creating Random features based on Correlation")
    # Features most negatively / positively correlated with the target
    # (the last two index entries are skipped by the -2 offset).
    output_cor = correlation_p[output_col_name].sort_values()
    most_neg_cor = list(output_cor.index[0:num_of_feat_corr].ravel())
    most_pos_cor = list(output_cor.index[(-2 - num_of_feat_corr):-2].ravel())
    # Sum adjacent correlated features into new combined features.
    for f1, f2 in pairwise(most_neg_cor):
        train[f1 + "_" + f2] = train[f1] + train[f2]
        test[f1 + "_" + f2] = test[f1] + test[f2]
        curr_features += [f1 + "_" + f2]
    for f1, f2 in pairwise(most_pos_cor):
        train[f1 + "_" + f2] = train[f1] + train[f2]
        test[f1 + "_" + f2] = test[f1] + test[f2]
        curr_features += [f1 + "_" + f2]
    params = {"objective": "binary:logistic",
              "eta": eta,
              "nthread": 3,
              "max_depth": max_depth,
              "subsample": subsample,
              "colsample_bytree": colsample_bytree,
              "eval_metric": "logloss",
              "n_estimators": n_estimators,
              "silent": 1,
              }
    num_boost_round = 10000
    best_score = train_model(curr_features, params, num_boost_round, test_size)
    # Record this round's parameters and score, then persist to CSV so a
    # crash does not lose the search history.
    grid_search_pd.loc[len(grid_search_pd), grid_search_columns] = [eta, max_depth, subsample, colsample_bytree, n_estimators, test_size, imp_start, num_of_feat_corr, best_score]
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    print("########################## Round Time Stamp ==== " + timestamp)
    grid_search_pd.to_csv(grid_search_file, index=False)
    return best_score
示例10: fit
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def fit(self, train_x, train_y=None, is_norm=True):
    """Optionally min-max normalise, mean-impute NaNs, then fit self.model.

    Parameters
    ----------
    train_x : ndarray of training features.
    train_y : optional ndarray; when truthy the model is fit supervised.
    is_norm : when True, scale inputs with train_x's min/ptp statistics.
    """
    # Normalisation.
    if is_norm:
        train_x_min = train_x.min(0)
        train_x_ptp = train_x.ptp(axis=0)
        # BUG FIX: the original computed `x - min / ptp`; division binds
        # tighter than subtraction, so min-max scaling must be
        # (x - min) / ptp.
        train_x = (train_x.astype(float) - train_x_min) / train_x_ptp
        if np.any(train_y):
            # NOTE(review): train_y is scaled with train_x's statistics —
            # presumably intentional (same feature space); confirm.
            train_y = (train_y.astype(float) - train_x_min) / train_x_ptp
    # Row-wise (axis=1) mean imputation for any remaining NaNs.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
    imp.fit(train_x)
    if np.isnan(train_x).any():
        log("Found {} NaN values in train_x, so try to transform them to 'mean'".format(np.isnan(train_x).sum()), WARN)
        train_x = imp.transform(train_x)
    if np.any(train_y) and np.isnan(train_y).any():
        log("Found {} NaN values in train_y, so try to transform them to 'mean'".format(np.isnan(train_y).sum()), WARN)
        train_y = imp.transform(train_y)
    # Supervised fit when labels are provided, unsupervised otherwise.
    if np.any(train_y):
        self.model.fit(train_x, train_y)
    else:
        self.model.fit(train_x)
示例11: preprocess_apply
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def preprocess_apply(data, missingvaluemethod, preprocessingmethods):
    """Impute missing values and scale each column of *data*.

    Parameters
    ----------
    data : 2-D array-like of samples x features.
    missingvaluemethod : one of the Constants.MISSING_VALUE_METHOD_* values.
    preprocessingmethods : per-column list of Constants.SCALING_METHOD_*.

    Returns
    -------
    ndarray with columns scaled (one-hot columns may expand into several).
    """
    # Impute missing values according to the requested strategy.
    if missingvaluemethod != Constants.MISSING_VALUE_METHOD_NONE:
        if missingvaluemethod == Constants.MISSING_VALUE_METHOD_MEAN:
            imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        elif missingvaluemethod == Constants.MISSING_VALUE_METHOD_MEDIAN:
            imp = Imputer(missing_values='NaN', strategy='median', axis=0)
        elif missingvaluemethod == Constants.MISSING_VALUE_METHOD_MOST_FREQUENT:
            imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
        imp.fit(data)
        data = imp.transform(data)
    else:
        data = np.asarray(data)
    # Scale each column independently with its configured method.
    res = np.array([])
    for i in range(0, len(preprocessingmethods)):
        # Column i as a list of single-element rows (sklearn wants 2-D).
        field = [[x[i]] for x in data]
        if preprocessingmethods[i] == Constants.SCALING_METHOD_NONE:
            pass
        elif preprocessingmethods[i] == Constants.SCALING_METHOD_STANDARDIZATION:
            scaler = preprocessing.StandardScaler().fit(field)
            field = scaler.transform(field)
        elif preprocessingmethods[i] == Constants.SCALING_METHOD_MINMAX:
            field = preprocessing.MinMaxScaler().fit_transform(field)
        elif preprocessingmethods[i] == Constants.SCALING_METHOD_CATEGORICAL:
            # One-hot encode; this may expand the column into several.
            enc = preprocessing.OneHotEncoder()
            enc.fit(field)
            field = enc.transform(field).toarray()
        # Accumulate processed columns side by side.
        if i == 0:
            res = field
        else:
            res = np.concatenate((res, field), axis=1)
    return res
示例12: FeaturePreProcesser
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
class FeaturePreProcesser():
    """Mean imputation followed by standard scaling.

    fit() learns both steps; transform() applies them; fit_transform()
    does both and returns the transformed matrix.
    """

    def __init__(self):
        pass

    def fit(self, X):
        """Fit the imputer and the scaler on X."""
        self.imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        self.imputer.fit(X)
        X = self.imputer.transform(X)
        self.std_scaler = StandardScaler()
        self.std_scaler.fit(X)

    def fit_transform(self, X):
        """Fit both steps on X and return the transformed X.

        The original duplicated the whole fit() body here; delegating
        keeps one copy of the fitting logic.
        """
        self.fit(X)
        return self.transform(X)

    def transform(self, X):
        """Apply the fitted imputer and scaler to X."""
        X = self.imputer.transform(X)
        X = self.std_scaler.transform(X)
        return X
示例13: __init__
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
class ImputerWrapper:
    """A simple wrapper around Imputer that also supports using zero to
    fill in missing values.

    If an entire column is NaN it gets filled with 0 first, to avoid
    Imputer silently removing the column.
    """

    def __init__(self, missing_values='NaN', strategy='zero', axis=0, verbose=0, copy=False):
        self.strategy = strategy
        self.imputer = None
        # The underlying Imputer only exists for non-'zero' strategies.
        if strategy != 'zero':
            self.imputer = Imputer(missing_values, strategy, axis, verbose, copy)

    def prepare(self, X):
        """Fill any all-NaN column of X with zeros, in place."""
        for j in range(X.shape[1]):
            all_nan = True
            for i in range(X.shape[0]):
                if not numpy.isnan(X[i][j]):
                    all_nan = False
                    break
            if all_nan:
                logging.info('column %d all nan, filling with 0' % j)
                for i in range(X.shape[0]):
                    X[i][j] = 0.0

    def fit(self, X, y=None):
        if self.strategy == 'zero':
            # Nothing to learn for zero filling.
            return self
        self.prepare(X)
        self.imputer.fit(X, y)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        if self.strategy == 'zero':
            # Replace every NaN with 0, in place.
            for i in range(X.shape[0]):
                for j in range(X.shape[1]):
                    if numpy.isnan(X[i][j]):
                        X[i][j] = 0.0
            return X
        self.prepare(X)
        return self.imputer.fit_transform(X, y, **fit_params)

    def get_params(self, deep=True):
        # NOTE(review): returning None here breaks the sklearn get_params
        # contract (callers expect a dict) — behaviour kept as-is.
        if self.strategy == 'zero':
            return None
        return self.imputer.get_params(deep)

    def set_params(self, **params):
        if self.strategy == 'zero':
            return self
        self.imputer.set_params(**params)
        return self

    def transform(self, X):
        if self.strategy == 'zero':
            for i in range(X.shape[0]):
                for j in range(X.shape[1]):
                    if numpy.isnan(X[i][j]):
                        X[i][j] = 0.0
            return X
        return self.imputer.transform(X)
示例14: imput_data
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def imput_data(data):
    """Mean-impute NaNs in each 2-D slice data[:, :, i] and standardise
    the last column of each slice, modifying *data* in place.

    Parameters
    ----------
    data : 3-D ndarray whose last axis indexes independent subsets.

    Returns
    -------
    The same (modified) array.
    """
    numSubsets = data.shape[-1]
    # One imputer object, refit per subset (hoisted out of the loop; the
    # original constructed a new Imputer every iteration).
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    for i in range(numSubsets):
        imp.fit(data[:, :, i])
        data[:, :, i] = imp.transform(data[:, :, i])
        # Scale the subset's last column to zero mean / unit variance.
        data[:, -1, i] = preprocessing.scale(data[:, -1, i])
    return data
示例15: my_imputer
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def my_imputer(name, strat, value):
    """Impute column *name* of the module-level DataFrame ``data`` in place.

    Parameters
    ----------
    name : column label to impute.
    strat : Imputer strategy ('mean', 'median', 'most_frequent').
    value : the missing-value marker passed to Imputer.  When it is 0 the
        column's NaNs are first filled with 0, so both the original and
        the filled zeros are then replaced according to *strat*.
    """
    if value == 0:
        data[name] = data[name].fillna(0)
    imp = Imputer(missing_values=value, strategy=strat, axis=0)
    # Imputer expects a 2-D array: reshape the column to one column.
    x = data[name]
    x = x.reshape(-1, 1)
    imp.fit(x)
    data[name] = imp.transform(x)