本文整理汇总了Python中sklearn.preprocessing.Imputer.transform方法的典型用法代码示例。如果您正苦于以下问题:Python Imputer.transform方法的具体用法?Python Imputer.transform怎么用?Python Imputer.transform使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.preprocessing.Imputer
的用法示例。
在下文中一共展示了Imputer.transform方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: eval_func
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def eval_func(chromosome):
    """Fitness function for the genetic hyper-parameter search.

    Decodes the chromosome into imputation/feature/XGBoost settings,
    fills missing values, derives pairwise-sum features from the columns
    most (anti-)correlated with the target, trains a model, logs the
    result to the grid-search CSV and returns the best score.
    """
    genes = chromosome.getInternalList()
    print("## Start with Individual : " + str(genes))
    # Gene layout: eta, max_depth, subsample, colsample_bytree,
    # n_estimators, test_size, impute strategy, number of correlated feats.
    (eta, max_depth, subsample, colsample_bytree,
     n_estimators, test_size, imp_start, num_of_feat_corr) = genes[:8]
    print("## Filling missing data")
    imputer = Imputer(missing_values='NaN', strategy=imp_start, axis=0)
    imputer.fit(train[features])
    train[features] = imputer.transform(train[features])
    test[features] = imputer.transform(test[features])
    curr_features = copy.deepcopy(features)
    print("## Creating Random features based on Correlation")
    corr_to_target = correlation_p[output_col_name].sort_values()
    # Columns most negatively / most positively correlated with the target.
    most_neg = list(corr_to_target.index[0:num_of_feat_corr].ravel())
    most_pos = list(corr_to_target.index[(-2 - num_of_feat_corr):-2].ravel())
    for group in (most_neg, most_pos):
        for f1, f2 in pairwise(group):
            combined = f1 + "_" + f2
            train[combined] = train[f1] + train[f2]
            test[combined] = test[f1] + test[f2]
            curr_features.append(combined)
    params = {
        "objective": "binary:logistic",
        "eta": eta,
        "nthread": 3,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "eval_metric": "logloss",
        "n_estimators": n_estimators,
        "silent": 1,
    }
    num_boost_round = 10000
    best_score = train_model(curr_features, params, num_boost_round, test_size)
    # Persist this round's parameters and score for later inspection.
    grid_search_pd.loc[len(grid_search_pd), grid_search_columns] = [
        eta, max_depth, subsample, colsample_bytree, n_estimators,
        test_size, imp_start, num_of_feat_corr, best_score]
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    print("########################## Round Time Stamp ==== " + timestamp)
    grid_search_pd.to_csv(grid_search_file, index=False)
    return best_score
示例2: bnp_svm
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def bnp_svm(train, test):
    """Train an RBF SVM on a 1000-row sample of `train` and return
    class-probability predictions for `test`.

    NOTE(review): `train.sample(1000)` is unseeded, so output varies
    between runs — confirm that is acceptable to callers.
    """
    print('bnpsvm')
    # Replace missing values with the per-column mean.
    imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    train = train.sample(1000)
    # Training matrix: float columns only, imputed then cast to float.
    X_train = train.select_dtypes(include=['float64'])
    imputer.fit(X_train)
    X_train = np.array(imputer.transform(X_train)).astype(float)
    # Integer target labels for the sampled rows.
    y = np.array(train['target']).astype(int)
    # Test matrix, imputed with statistics fitted on the training sample.
    X_test = test.select_dtypes(include=['float64'])
    X_test = np.array(imputer.transform(X_test)).astype(float)
    clf = svm.SVC(gamma=0.001, C=100, probability=True)
    clf.fit(X_train, y)
    return clf.predict_proba(X_test)
#print(bnp_svm(train, test))
示例3: FeaturePreProcesser
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
class FeaturePreProcesser():
    """Two-step feature pipeline: mean imputation then standard scaling.

    Call fit() (or fit_transform()) before transform().
    """
    def __init__(self):
        pass

    def fit(self, X):
        """Fit the imputer on X, then fit the scaler on the imputed X."""
        self.imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        self.imputer.fit(X)
        X = self.imputer.transform(X)
        self.std_scaler = StandardScaler()
        self.std_scaler.fit(X)

    def fit_transform(self, X):
        """Fit both steps and return the transformed X.

        Delegates to fit()/transform() instead of duplicating their
        bodies (the original repeated fit()'s code verbatim).
        """
        self.fit(X)
        return self.transform(X)

    def transform(self, X):
        """Apply the fitted imputer, then the fitted scaler, to X."""
        X = self.imputer.transform(X)
        X = self.std_scaler.transform(X)
        return X
示例4: fit
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def fit(self, train_x, train_y=None, is_norm=True):
    """Optionally min-max normalize, mean-impute NaNs, then fit self.model.

    Parameters
    ----------
    train_x : array-like of training features.
    train_y : optional targets; when truthy under np.any the model is
        fit supervised, otherwise unsupervised.
    is_norm : when True, min-max scale using train_x's min and ptp.
    """
    if is_norm:
        train_x_min = train_x.min(0)
        train_x_ptp = train_x.ptp(axis=0)
        # BUG FIX: the original computed `x - (min / ptp)` because `/`
        # binds tighter than `-`; min-max scaling is `(x - min) / ptp`.
        train_x = (train_x.astype(float) - train_x_min) / train_x_ptp
        if np.any(train_y):
            # NOTE(review): y is scaled with x's statistics — looks
            # intentional for this pipeline, but confirm with the caller.
            train_y = (train_y.astype(float) - train_x_min) / train_x_ptp
    # axis=1: impute along each row rather than each column.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
    imp.fit(train_x)
    if np.isnan(train_x).any():
        log("Found {} NaN values in train_x, so try to transform them to 'mean'".format(np.isnan(train_x).sum()), WARN)
        train_x = imp.transform(train_x)
    if np.any(train_y) and np.isnan(train_y).any():
        log("Found {} NaN values in train_y, so try to transform them to 'mean'".format(np.isnan(train_y).sum()), WARN)
        train_y = imp.transform(train_y)
    # np.any(train_y) doubles as a "labels provided" test; an all-zero y
    # is treated as unsupervised — pre-existing behavior, kept as-is.
    if np.any(train_y):
        self.model.fit(train_x, train_y)
    else:
        self.model.fit(train_x)
示例5: ImputeAndGetFinalTrainTestData
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def ImputeAndGetFinalTrainTestData(train, test):
    """Split `train` into features/labels, mean-impute both feature sets.

    Returns (X_train, y_train, X_test); the imputer is fitted on the
    training features and applied to both train and test.
    """
    features = train[:, :-1]
    labels = train[:, -1]
    mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    mean_imputer.fit(features)
    X_train = mean_imputer.transform(features)
    X_test = mean_imputer.transform(test.as_matrix())
    return (X_train, labels, X_test)
示例6: _check_statistics
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def _check_statistics(self, X, X_true,
                      strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Exercises the imputer along both axes and with dense and sparse
    input, checking that the fitted statistics and the imputed values
    match the expected ones (same four cases as before, now driven by
    a loop over the sparse flag).
    """
    err_msg = "Parameters: strategy = %s, missing_values = %s, " \
              "axis = {0}, sparse = {1}" % (strategy, missing_values)
    for use_sparse in (False, True):
        # Identity wrapper for dense input, csc conversion for sparse.
        wrap = sparse.csc_matrix if use_sparse else (lambda a: a)
        # axis = 0: fit and transform, verify statistics and output.
        imputer = Imputer(missing_values, strategy=strategy, axis=0)
        imputer.fit(wrap(X))
        X_trans = imputer.transform(wrap(X.copy()))
        if sparse.issparse(X_trans):
            X_trans = X_trans.toarray()
        assert_array_equal(imputer.statistics_, statistics,
                           err_msg.format(0, use_sparse))
        assert_array_equal(X_trans, X_true, err_msg.format(0, use_sparse))
        # axis = 1: NaN statistics must raise; otherwise compare output.
        imputer = Imputer(missing_values, strategy=strategy, axis=1)
        imputer.fit(wrap(X.transpose()))
        if np.isnan(statistics).any():
            assert_raises(ValueError, imputer.transform,
                          wrap(X.copy().transpose()))
        else:
            X_trans = imputer.transform(wrap(X.copy().transpose()))
            if sparse.issparse(X_trans):
                X_trans = X_trans.toarray()
            assert_array_equal(X_trans, X_true.transpose(),
                               err_msg.format(1, use_sparse))
示例7: load_datasets
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def load_datasets(feature_paths, label_paths):
    """Read feature files and label files, returning (feature, label).

    Features are 41-dimensional; each feature file is read with its first
    column parsed as dates and used as the index, mean-imputed, and
    stacked.  Labels are read, stacked, and flattened to a 1-D vector.

    BUG FIX: the original read each file into `data` but then called
    `imp.fit(df)` / `imp.transform(df)` on an undefined name `df`
    (a NameError); the frame is now used consistently.
    """
    # Empty accumulators: 41 feature columns, 1 label column.
    feature = np.ndarray(shape=(0, 41))
    label = np.ndarray(shape=(0, 1))
    for file in feature_paths:
        # First column parsed as dates and used as the row index.
        df = pd.read_csv(file, encoding='utf-8', parse_dates=[0], index_col=0)
        # Mean-impute missing values column-wise.
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(df)
        df = imp.transform(df)
        # Stack the preprocessed block onto the accumulated features.
        feature = np.concatenate((feature, df))
    for file in label_paths:
        df = pd.read_table(file, header=None)
        label = np.concatenate((label, df))
    # Flatten labels to a 1-D vector.
    label = np.ravel(label)
    return feature, label
示例8: ImputeCategorical
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
class ImputeCategorical(BaseEstimator, TransformerMixin):
    """Most-frequent imputation over a specified list of columns.

    If `columns` is None, all columns of the frame passed to fit() are
    imputed.  Missing entries are identified by the value 0.
    """
    def __init__(self, columns=None):
        self.columns = columns
        self.imputer = None

    def fit(self, data, target=None):
        """Fit a most-frequent imputer on the selected columns.

        Expects a data frame with named columns; defaults to all of them.
        """
        if self.columns is None:
            self.columns = data.columns
        self.imputer = Imputer(missing_values=0, strategy='most_frequent')
        self.imputer.fit(data[self.columns])
        return self

    def transform(self, data):
        """Return a copy of `data` with the fitted columns imputed."""
        result = data.copy()
        result[self.columns] = self.imputer.transform(result[self.columns])
        return result
示例9: imputed_data
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def imputed_data(df, colname, strategy="mean"):
    """Impute missing values of df[colname] in place using `strategy`.

    BUG FIX: the original fitted on `df[colname].reshape(-1, 1)` —
    pandas Series has no `.reshape` — while the transform call already
    used `.values.reshape(-1, 1)`; both now use the ndarray form.
    """
    from sklearn.preprocessing import Imputer
    imr = Imputer(missing_values="NaN", strategy=strategy, axis=0)
    # Imputer expects a 2-D array: one column, many rows.
    column = df[colname].values.reshape(-1, 1)
    imr = imr.fit(column)
    df[colname] = imr.transform(column)
    print("Data has been imputed to \"{}\"".format(colname))
示例10: trainSVM
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def trainSVM(x1, x2, kernel):
    """Train an SVM separating x1 (label +1) from x2 (label -1).

    kernel == 0 selects a linear SVM, anything else an RBF SVM with
    probability estimates.  C is grid-searched with 5-fold CV over the
    mean-imputed design matrix.  Returns the best fitted estimator.
    """
    # Build the design matrix and the +/-1 label vector.
    # list(...) around map keeps the concatenation valid on Python 3 too.
    x1 = list(map(list, x1))
    x2 = list(map(list, x2))
    X = x1 + x2
    y1 = ones((shape(x1)[0], 1))
    y2 = -1 * ones((shape(x2)[0], 1))
    Y = ravel(list(y1) + list(y2))
    if kernel == 0:
        # Linear SVM; C increases the penalty on misclassifications.
        svm = LinearSVC()
        params = {'C': [1, 10, 50, 100, 200, 300]}
    else:
        # RBF SVM with probability estimates enabled.
        svm = SVC(probability=True)
        params = {'C': [50, 100, 200, 300]}
    grid = GridSearchCV(svm, params, cv=5)
    # Mean-impute missing values before training.
    # (A second, discarded Imputer instance in the original was a no-op
    # and has been removed.)
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X)
    trainData = imp.transform(X)
    grid.fit(trainData, Y)
    return grid.best_estimator_
示例11: test
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def test():
vec = DictVectorizer()
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
for filename in glob.glob(r'../dataset/UCI/*.arff'):
basename = re.sub(r'(\..*?)$','',os.path.basename(filename))
print basename
if basename != DS:
continue
# cost_matrix = pickle.load(open('../dataset/UCI/'+basename+'_cost_matrix.pkl', 'rb'))
data = arff.loadarff(filename)[0]
X = vec.fit_transform(np.array([{str(i):value for i,value in enumerate(list(row)[:-1])} for row in data])).toarray()
imp.fit(X)
X = imp.transform(X)
labels = np.array([row[-1] for row in data])
y = np.array([{v:k for k,v in enumerate(list(set(labels)))}[label] for label in labels])
random = np.random.permutation(range(len(X)))
print 'dataset ratio\t%s'%('\t'.join([alg+" "*(12-len(alg)) for alg in sorted(ALG.keys())]))
for iteration in xrange(10):
X, y, class_num, kf = X[random], y[random], set(labels), KFold(len(X), n_folds=10)
for train, test in kf:
length, train_size = len(train), 0.1
X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
X_label, X_unlabel, y_label, y_unlabel = train_test_split(X_train, y_train, test_size=1.0-train_size, random_state=0)
for R in xrange(2,10):
ones_matrix, cost_matrix = np.array([[1,1],[1,1]]), np.array([[1,1],[R,R]])
# print "%s R=%d"%(basename,R),
cross_validation("%s R=%d"%(basename,R), X_label, X_unlabel, y_label, y_unlabel, ones_matrix, cost_matrix)
exit()
示例12: impute_null_vals
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def impute_null_vals(df_train, df_test, feature_cols, strategy='mean'):
    """Impute nulls in `feature_cols` of both frames using `strategy`.

    The imputer is fitted on the concatenation of train and test so both
    splits are filled with the same statistics; duplicate rows/columns
    are then removed via remove_duplicates_const.
    """
    combined = df_train[feature_cols].append(df_test[feature_cols])
    imputer = Imputer(
        missing_values='NaN', strategy=strategy, axis=0, verbose=0, copy=False
    ).fit(combined)
    df_train[feature_cols] = imputer.transform(df_train[feature_cols])
    df_test[feature_cols] = imputer.transform(df_test[feature_cols])
    return remove_duplicates_const(df_train, df_test)
示例13: dealDataSet
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def dealDataSet():
    """Load, mean-impute and L2-normalize the train/test feature matrices.

    Returns (train_x_normalized, train_y, test_x_normalized, result_id).
    Note the imputer is re-fitted on the test set before transforming it
    (pre-existing behavior, preserved).
    """
    imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    raw_train, _ = loadDataSet('../data/train_x.csv')
    imputer.fit(np.array(raw_train))
    imputed_train = imputer.transform(np.array(raw_train))
    # Second column of train_y.csv, skipping the header row.
    train_y = np.genfromtxt('../data/train_y.csv', delimiter=',')[1:, 1]
    raw_test, result_id = loadDataSet('../data/test_x.csv')
    imputer.fit(np.array(raw_test))
    imputed_test = imputer.transform(np.array(raw_test))
    train_x_normalized = preprocessing.normalize(imputed_train, norm='l2')
    test_x_normalized = preprocessing.normalize(imputed_test, norm='l2')
    return train_x_normalized, train_y, test_x_normalized, result_id
示例14: preprocess_apply
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def preprocess_apply(data, missingvaluemethod, preprocessingmethods):
    """Impute missing values in `data`, then scale/encode each column
    according to the matching entry of `preprocessingmethods`.

    Returns the processed 2-D numpy array.  Raises ValueError for an
    unrecognized non-NONE missing-value method (the original fell
    through to an UnboundLocalError on `imp`).
    """
    strategies = {
        Constants.MISSING_VALUE_METHOD_MEAN: 'mean',
        Constants.MISSING_VALUE_METHOD_MEDIAN: 'median',
        Constants.MISSING_VALUE_METHOD_MOST_FREQUENT: 'most_frequent',
    }
    if missingvaluemethod != Constants.MISSING_VALUE_METHOD_NONE:
        if missingvaluemethod not in strategies:
            raise ValueError(
                "unknown missing value method: %r" % (missingvaluemethod,))
        imp = Imputer(missing_values='NaN',
                      strategy=strategies[missingvaluemethod], axis=0)
        imp.fit(data)
        data = imp.transform(data)
    else:
        data = np.asarray(data)
    # Scale/encode column by column; a categorical column may expand into
    # several one-hot columns, so results are concatenated horizontally.
    res = np.array([])
    for i in range(0, len(preprocessingmethods)):
        field = [[x[i]] for x in data]
        method = preprocessingmethods[i]
        if method == Constants.SCALING_METHOD_NONE:
            pass
        elif method == Constants.SCALING_METHOD_STANDARDIZATION:
            field = preprocessing.StandardScaler().fit_transform(field)
        elif method == Constants.SCALING_METHOD_MINMAX:
            field = preprocessing.MinMaxScaler().fit_transform(field)
        elif method == Constants.SCALING_METHOD_CATEGORICAL:
            enc = preprocessing.OneHotEncoder()
            enc.fit(field)
            field = enc.transform(field).toarray()
        # First processed column starts the result matrix.
        if i == 0:
            res = field
        else:
            res = np.concatenate((res, field), axis=1)
    return res
示例15: run_main
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def run_main(new_file, start, stop, dat):
with open(new_file, 'a') as file:
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=1)
import itertools
with open(dat, "r") as text_file:
for line in itertools.islice(text_file, start, stop):
line = line.replace("NA", "NaN")
content = line.rstrip('\n').split('\t')
CpG = content.pop(0)
flag, CpG_location = get_location(CpG)
if flag == 'F':
continue
genotype_matrix = get_genotypes(CpG_location)
genotype_matrix = imp.transform(genotype_matrix)
genotype_matrix = genotype_matrix.transpose()
#run PCA
try:
PCA_matrix = run_pca(genotype_matrix)
except ValueError:
print "value error"
continue
#run linear regression
meth_values = pd.Series(content, name="meth_val", dtype=float)
model = sm.OLS(meth_values, PCA_matrix)
results = model.fit()
MethValResids = results.resid
final = pd.Series(CpG)
final = final.append(MethValResids)
fline = final.tolist()
fline = '\t'.join(str(x) for x in fline)
fline = fline + "\n"
file.write(fline)