本文整理汇总了Python中sklearn.preprocessing.Scaler.transform方法的典型用法代码示例。如果您正苦于以下问题:Python Scaler.transform方法的具体用法?Python Scaler.transform怎么用?Python Scaler.transform使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.preprocessing.Scaler
的用法示例。
在下文中一共展示了Scaler.transform方法的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: data_to_kernels
# 需要导入模块: from sklearn.preprocessing import Scaler [as 别名]
# 或者: from sklearn.preprocessing.Scaler import transform [as 别名]
def data_to_kernels(tr_data, te_data):
    """Standardize, power-normalize and L2-normalize the descriptors, then
    return the (train x train, test x train) linear kernel matrices.

    Note: Scaler(copy=False) mutates its inputs in place; the test data is
    scaled with the statistics fitted on the training data.
    """
    scaler = Scaler(copy=False)
    tr_data = scaler.fit_transform(tr_data)
    tr_data = L2_normalize(power_normalize(tr_data, 0.5))
    # Apply the train-set scaling to the test set (in place).
    te_data = scaler.transform(te_data)
    te_data = L2_normalize(power_normalize(te_data, 0.5))
    # Linear kernels against the training rows.
    return np.dot(tr_data, tr_data.T), np.dot(te_data, tr_data.T)
示例2: process_data
# 需要导入模块: from sklearn.preprocessing import Scaler [as 别名]
# 或者: from sklearn.preprocessing.Scaler import transform [as 别名]
def process_data(self):
    """Load train/test CSVs, build 2-fold stratified CV splits, scale the
    features, and keep both feature-reduced and non-reduced versions.

    Side effects: sets self.cv_data, self.cv_data_nonreduced, self.testMat,
    self.testMat_nonreduced and pickles all four to 'allData.pkl'.
    """
    test = pandas.read_csv("test.csv")
    testMat = test.as_matrix()
    train = pandas.read_csv("train.csv")
    trainMat = train.as_matrix()
    trainResult = trainMat[:, 0]  # first column holds the label
    trainMat = trainMat[:, 1:]
    cv = StratifiedKFold(trainResult, 2)
    # Fit the feature selector once on the full training data.
    reduceFeatures = ExtraTreesClassifier(
        compute_importances=True, random_state=1234, n_jobs=self.cpus,
        n_estimators=1000, criterion="gini"
    )
    reduceFeatures.fit(trainMat, trainResult)
    trainScaler = Scaler()
    self.cv_data = []
    self.cv_data_nonreduced = []
    for train, test in cv:
        X_train, X_test = trainMat[train, :], trainMat[test, :]
        # BUG FIX: trainResult is 1-D, so it must be indexed with a single
        # index array -- trainResult[train, :] raises IndexError.
        Y_train, Y_test = trainResult[train], trainResult[test]
        X_train = trainScaler.fit_transform(X_train)
        X_test = trainScaler.transform(X_test)
        self.cv_data_nonreduced.append((X_train, X_test, Y_train, Y_test))
        X_train = reduceFeatures.transform(X_train)
        X_test = reduceFeatures.transform(X_test)
        self.cv_data.append((X_train, X_test, Y_train, Y_test))
    # NOTE: the scaler keeps the statistics of the last fold's train split.
    testMat = trainScaler.transform(testMat)
    self.testMat_nonreduced = testMat
    self.testMat = reduceFeatures.transform(testMat)
    allData = self.testMat, self.cv_data, self.testMat_nonreduced, self.cv_data_nonreduced
    # Pickle in binary mode -- text mode corrupts the stream on some
    # platforms and is rejected by newer pickle protocols.
    with open("allData.pkl", "wb") as data_handle:
        pickle.dump(allData, data_handle)
示例3: SVM_fit
# 需要导入模块: from sklearn.preprocessing import Scaler [as 别名]
# 或者: from sklearn.preprocessing.Scaler import transform [as 别名]
def SVM_fit(X_in, y_in, X_out, gamma, C):
    """Fit an RBF-kernel SVM on the training part of (X_in, y_in) and
    return the labels predicted for X_out.

    gamma, C: SVM hyper-parameters, used as-is (no model selection here).
    A 10% slice of the input is put aside and used only to fit the feature
    scaling and to detect near-constant features.
    """
    M = len(X_in[0]) #Number of features
    seed(time())
    #To prevent data snooping, breakes the input set into train. cross validation and test sets, with sizes proportional to 8-1-1
    #First puts aside 10% of the data for the tests
    test_indices, train_indices = split_indices(len(X_in), int(round(0.1*len(X_in))))
    # NOTE(review): if this is random.shuffle, the second argument is not a
    # parallel shuffle of y_in -- presumably a project helper that shuffles
    # both lists in unison; verify.
    shuffle(X_in, y_in)
    X_test = [X_in[i] for i in test_indices]
    y_test = [y_in[i] for i in test_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]
    #scale data first
    scaler = Scaler(copy=False) #in place modification
    #Normalize the data and stores as inner parameters the mean and standard deviation
    # NOTE(review): despite the comment below, the statistics are fitted on
    # the held-out 10% slice (X_test), not on the training rows -- this
    # mirrors SVM_train's separate scale set, but the comment misleads.
    #To avoid data snooping, normalization is computed on training set only, and then reported on data
    scaler.fit(X_test, y_test)
    X_in = scaler.transform(X_in)
    X_test = scaler.transform(X_test)
    X_out = scaler.transform(X_out) #uses the same transformation (same mean_ and std_) fit before
    std_test = X_test.std(axis=0)
    # Keep only features whose std on the held-out slice is non-negligible.
    f_indices = [j for j in range(M) if std_test[j] > 1e-7]
    #Removes feature with null variance
    X_in = [[X_in[i][j] for j in f_indices] for i in range(len(X_in))]
    X_test = [[X_test[i][j] for j in f_indices] for i in range(len(X_test))]
    X_out = [[X_out[i][j] for j in f_indices] for i in range(len(X_out))]
    M = len(f_indices)
    # Fit on the remaining 90% and predict the out-of-sample points.
    svc = svm.SVC(kernel='rbf', C=C, gamma=gamma, verbose=False, cache_size=4092, tol=1e-5)
    svc.fit(X_in, y_in)
    y_out = svc.predict(X_out)
    return y_out
示例4: test_center_kernel
# 需要导入模块: from sklearn.preprocessing import Scaler [as 别名]
# 或者: from sklearn.preprocessing.Scaler import transform [as 别名]
def test_center_kernel():
    """Centering a linear Gram matrix with KernelCenterer must match
    centering the features with a mean-only Scaler and recomputing it."""
    X_fit = np.random.random((5, 4))
    centering = Scaler(with_std=False).fit(X_fit)
    X_fit_centered = centering.transform(X_fit)
    centerer = KernelCenterer()
    # Fit-time kernel: center the Gram matrix directly vs. via the features.
    K_fit_centered2 = centerer.fit_transform(np.dot(X_fit, X_fit.T))
    K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T)
    assert_array_almost_equal(K_fit_centered, K_fit_centered2)
    # Predict-time kernel between new points and the fit points.
    X_pred = np.random.random((2, 4))
    X_pred_centered = centering.transform(X_pred)
    K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T)
    K_pred_centered2 = centerer.transform(np.dot(X_pred, X_fit.T))
    assert_array_almost_equal(K_pred_centered, K_pred_centered2)
示例5: test_scaler_without_centering
# 需要导入模块: from sklearn.preprocessing import Scaler [as 别名]
# 或者: from sklearn.preprocessing.Scaler import transform [as 别名]
def test_scaler_without_centering():
    """Scaling without mean-centering must behave identically on dense
    arrays and CSR matrices, and inverse_transform must round-trip both."""
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always of zero
    X_csr = sp.csr_matrix(X)
    scaler = Scaler(with_mean=False).fit(X)
    X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))
    scaler_csr = Scaler(with_mean=False).fit(X_csr)
    X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))
    # Dense and sparse fits must agree on the learned statistics.
    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)
    assert_array_almost_equal(
        X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))
    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)
    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    # BUG FIX: the original re-checked X_scaled_back here, so the sparse
    # round-trip values were never verified; compare the sparse result.
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)
示例6: test_scale_sparse_with_mean_raise_exception
# 需要导入模块: from sklearn.preprocessing import Scaler [as 别名]
# 或者: from sklearn.preprocessing.Scaler import transform [as 别名]
def test_scale_sparse_with_mean_raise_exception():
    """With-mean scaling must refuse sparse input in every entry point."""
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X_csr = sp.csr_matrix(X)
    # Both the function and the estimator reject sparse input outright.
    assert_raises(ValueError, scale, X_csr, with_mean=True)
    assert_raises(ValueError, Scaler(with_mean=True).fit, X_csr)
    # A scaler fitted on the dense array still rejects sparse arguments,
    # for transform as well as inverse_transform.
    dense_scaler = Scaler(with_mean=True).fit(X)
    assert_raises(ValueError, dense_scaler.transform, X_csr)
    X_transformed_csr = sp.csr_matrix(dense_scaler.transform(X))
    assert_raises(ValueError, dense_scaler.inverse_transform, X_transformed_csr)
示例7: range
# 需要导入模块: from sklearn.preprocessing import Scaler [as 别名]
# 或者: from sklearn.preprocessing.Scaler import transform [as 别名]
# NOTE(review): script fragment -- df_i, actual_dates_1/2, month_1/2,
# train_sets, outcomes, scaler, dpgmm, jan1, df_test and quants are all
# defined earlier, outside this excerpt.
year_1 = [i.year for i in actual_dates_1]
year_2 = [i.year for i in actual_dates_2]
# Attach the derived calendar features to the training frame.
df_i['month_1'] = month_1
df_i['month_2'] = month_2
df_i['year_1'] = year_1
df_i['year_2'] = year_2
# Fillnas to zero
train_sets.append(df_i.fillna(0))
# Log response variables
for i in range(len(outcomes)):
    outcomes[i] = np.log(outcomes[i])
# Scale the test set's quantitative columns with the scaler fitted earlier
# and assign each test row to a mixture component.
df_test_quants = df_test[quants]
scaled_test = scaler.transform(df_test_quants.fillna(0))
clusters_test = dpgmm.predict(scaled_test)
df_test['clusters'] = clusters_test
df_test = df_test.fillna(0)
# Date_1/Date_2 hold integer day offsets from jan1; recover actual dates.
time_deltas_1_test = [timedelta(int(i)) for i in df_test['Date_1'].values]
time_deltas_2_test = [timedelta(int(i)) for i in df_test['Date_2'].values]
actual_dates_1_test = [jan1+i for i in time_deltas_1_test]
actual_dates_2_test = [jan1+i for i in time_deltas_2_test]
month_1_test = [i.month for i in actual_dates_1_test]
month_2_test = [i.month for i in actual_dates_2_test]
year_1_test = [i.year for i in actual_dates_1_test]
year_2_test = [i.year for i in actual_dates_2_test]
# Same calendar features for the test frame (continues past this excerpt).
df_test['month_1'] = month_1_test
df_test['month_2'] = month_2_test
df_test['year_1'] = year_1_test
示例8: KMPBase
# 需要导入模块: from sklearn.preprocessing import Scaler [as 别名]
# 或者: from sklearn.preprocessing.Scaler import transform [as 别名]
class KMPBase(BaseEstimator):
def __init__(self,
             n_nonzero_coefs=0.3,
             loss=None,
             # components (basis functions)
             init_components=None,
             n_components=None,
             check_duplicates=False,
             scale=False,
             scale_y=False,
             # back-fitting
             n_refit=5,
             estimator=None,
             # metric
             metric="linear", gamma=0.1, coef0=1, degree=4,
             # validation
             X_val=None, y_val=None,
             n_validate=1,
             epsilon=0,
             score_func=None,
             # misc
             random_state=None, verbose=0, n_jobs=1):
    """Store the kernel-matching-pursuit hyper-parameters verbatim.

    n_nonzero_coefs: basis functions to select; a value in (0, 1] is
        later interpreted as a fraction of the dictionary size (_pre_fit).
    loss: loss identifier ("squared" maps to SquaredLoss; see _get_loss).
    init_components / n_components: explicit dictionary or its size.
    n_refit: back-fitting frequency.
    estimator: base regressor cloned per fit (default LinearRegression).
    metric, gamma, coef0, degree: kernel name and its parameters.
    X_val, y_val, n_validate, epsilon, score_func: validation settings.
    random_state, verbose, n_jobs: usual scikit-learn housekeeping.
    """
    # NOTE(review): the guard admits n_nonzero_coefs == 0 even though the
    # message claims it should be > 0 -- confirm which is intended.
    if n_nonzero_coefs < 0:
        raise AttributeError("n_nonzero_coefs should be > 0.")
    self.n_nonzero_coefs = n_nonzero_coefs
    self.loss = loss
    self.init_components = init_components
    self.n_components = n_components
    self.check_duplicates = check_duplicates
    self.scale = scale
    self.scale_y = scale_y
    self.n_refit = n_refit
    self.estimator = estimator
    self.metric = metric
    self.gamma = gamma
    self.coef0 = coef0
    self.degree = degree
    self.X_val = X_val
    self.y_val = y_val
    self.n_validate = n_validate
    self.epsilon = epsilon
    self.score_func = score_func
    self.random_state = random_state
    self.verbose = verbose
    self.n_jobs = n_jobs
def _kernel_params(self):
    """Collect the kernel hyper-parameters as a keyword dictionary."""
    return dict(gamma=self.gamma, degree=self.degree, coef0=self.coef0)
def _get_estimator(self):
    """Return the base regressor for the fits, with its intercept disabled
    (the caller handles centering elsewhere)."""
    chosen = LinearRegression() if self.estimator is None else clone(self.estimator)
    chosen.fit_intercept = False
    return chosen
def _get_loss(self):
    """Map the loss name to a loss object; None means the default
    least-squares path handled by the estimator itself."""
    return SquaredLoss() if self.loss == "squared" else None
def _pre_fit(self, X, y):
    """Shared set-up before fitting: optionally standardize y, build or
    adopt the component dictionary, and resolve n_nonzero_coefs to an
    absolute count.  (Truncated in this excerpt -- the dictionary
    computation follows.)
    """
    random_state = check_random_state(self.random_state)
    if self.scale_y:
        # Standardize y with a copy so the caller's array is untouched.
        self.y_scaler_ = Scaler(copy=True).fit(y)
        y = self.y_scaler_.transform(y)
    if self.metric == "precomputed":
        # X is already a kernel matrix; its columns are the components.
        self.components_ = None
        n_components = X.shape[1]
    else:
        if self.init_components is None:
            if self.verbose: print "Selecting components..."
            self.components_ = select_components(X, y,
                                                 self.n_components,
                                                 random_state=random_state)
        else:
            self.components_ = self.init_components
        n_components = self.components_.shape[0]
    n_nonzero_coefs = self.n_nonzero_coefs
    # A value in (0, 1] is interpreted as a fraction of the dictionary size.
    if 0 < n_nonzero_coefs and n_nonzero_coefs <= 1:
        n_nonzero_coefs = int(n_nonzero_coefs * n_components)
    n_nonzero_coefs = int(n_nonzero_coefs)
    if n_nonzero_coefs > n_components:
        raise AttributeError("n_nonzero_coefs cannot be bigger than "
                             "n_components.")
    if self.verbose: print "Computing dictionary..."
    start = time.time()
    #......... rest of the method omitted in this excerpt .........
示例9: main
# 需要导入模块: from sklearn.preprocessing import Scaler [as 别名]
# 或者: from sklearn.preprocessing.Scaler import transform [as 别名]
def main():
X =[]
Y=[]
featuresDB = Base(os.getcwd()+"\\Databases\\features.db")
featuresDB.open()
print "features open"
for rec in featuresDB:
vec = []
vec.append(rec.f1)
vec.append(rec.f3)
vec.append(rec.f4)
vec.append(rec.f5)
vec.append(rec.f6)
vec.append(rec.f7)
vec.append(rec.f10)
vec.append(rec.f11)
vec.append(rec.f12)
vec.append(rec.f13)
vec.append(rec.f14)
vec.append(rec.f15)
vec.append(rec.f16)
vec.append(rec.f17)
vec.append(rec.f18)
vec.append(rec.f19)
vec.append(rec.f20)
vec.append(rec.f21)
vec.append(rec.f22)
vec.append(rec.f23)
X.append(vec)
Y.append(rec.score)
print "building classifier"
Y = np.array(Y)
ybar = Y.mean()
for i in range(len(Y)):
if Y[i]<ybar:
Y[i]=1
else:
Y[i]=2
scaler = Scaler().fit(X)
X = scaler.transform(X)
X= np.array(X)
Y=np.array(Y)
skf = cross_validation.StratifiedKFold(Y,k=2)
for train, test in skf:
X_train, X_test = X[train], X[test]
y_train, y_test = Y[train], Y[test]
clf = ExtraTreesClassifier(n_estimators=8,max_depth=None,min_split=1,random_state=0,compute_importances=True)
scores = cross_validation.cross_val_score(clf,X_train,y_train,cv=5)
clf.fit_transform(X_train,y_train)
print "Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
print clf.feature_importances_
y_pred =clf.predict(X_test)
print classification_report(y_test,y_pred)
model=(scaler,clf)
joblib.dump(model,'AestheticModel\\aestheticModel.pkl')
print "Done"
示例10: Scaler
# 需要导入模块: from sklearn.preprocessing import Scaler [as 别名]
# 或者: from sklearn.preprocessing.Scaler import transform [as 别名]
# NOTE(review): script fragment -- `data` and `records` (the full design
# matrix) are loaded earlier, outside this excerpt.
labels = data[:,0]
# Fixed train/validation split sizes.
n_train = 35000
#n_val = n - n_train
n_val = 7000
trainset = records[:n_train,:]
trainlabels = labels[:n_train]
#valset = records[n_train:,:]
#vallabels = labels[n_train:,:]
valset = records[n_train:n_train+n_val,:]
vallabels = labels[n_train:n_train+n_val]
n,dim = trainset.shape
# mean centering, stdev normalization and whitening
scaler = Scaler()
scaler.fit(trainset)
trainset = scaler.transform(trainset)
valset = scaler.transform(valset)
# PCA keeps all `dim` components and whitens (decorrelate, unit variance);
# the validation set reuses the transform fitted on the training set.
pca = PCA(n_components=dim,whiten=True)
pca.fit(trainset)
trainset = pca.transform(trainset)
valset = pca.transform(valset)
# Training hyper-parameters for the project-defined Net class.
config = Train_config()
config.iterations = 10
config.nonlinearity = 'tanh'
config.batchsize = 50
config.learning_rate = 0.2
config.momentum = 0.7
log = open('log.txt','w')
# dim -> 300 hidden -> 10 outputs; presumably a 10-class digit task --
# TODO confirm against the data loader.
nn = Net([dim,300,10],log_file=log)
nn.fit(trainset,trainlabels,config,val_set=valset,val_labels=vallabels)
示例11: range
# 需要导入模块: from sklearn.preprocessing import Scaler [as 别名]
# 或者: from sklearn.preprocessing.Scaler import transform [as 别名]
all_folds[split, fold, test] = 0
for d in range(0, dims.shape[0]):
Xtrain = Xm_shfl[train, :, dims[d]]
ytrain = y_shfl[train]
sw_train = sw_shfl[train]
# (deal with NaN in training)
ytrain = ytrain[~np.isnan(np.nansum(Xtrain, axis=1))]
sw_train = sw_train[~np.isnan(np.nansum(Xtrain, axis=1))]
Xtrain = Xtrain[~np.isnan(np.nansum(Xtrain, axis=1)), :]
if np.unique(ytrain).shape[0] > 1:
# feature selection (find the 50% most discriminative channels)
fs.fit(Xtrain, ytrain) # find
Xtrain = fs.transform(Xtrain) # remove unnecessary channels
# normalization
scaler.fit(Xtrain) # find
Xtrain = scaler.transform(Xtrain) # apply zscore
# SVM fit
clf.fit(Xtrain, ytrain, sample_weight=sw_train)
# retrieve hyperplan feature identification
coef[split, fold, dims[d], :, :] = 0 # initialize
#--- univariate
uni_features = fs.pvalues_ <= stats.scoreatpercentile(fs.pvalues_, fs.percentile)
#--- multivariate
coef[split, fold, dims[d], :, uni_features] = clf.coef_.T
# predict cross val (deal with NaN in testing)
Xtest = Xm_shfl[test, :, dims[d]]
test_nan = np.isnan(np.nansum(Xtest, axis=1))
Xtest = fs.transform(Xtest)
Xtest = scaler.transform(Xtest)
if (Xtest.shape[0] - np.sum(test_nan)) > 0:
if compute_predict:
示例12: SVM_train
# 需要导入模块: from sklearn.preprocessing import Scaler [as 别名]
# 或者: from sklearn.preprocessing.Scaler import transform [as 别名]
def SVM_train(X_in, y_in, X_out, gammas, cs, file_log=None):
    """Grid-search C and gamma for an RBF SVM with 10-fold stratified CV,
    retrain the best model on the training rows and predict X_out.

    gammas, cs: iterables of candidate hyper-parameter values.
    file_log: optional open file; progress and error rates are appended.
    Returns the labels predicted for X_out by the best model.
    """
    if file_log:
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(len(X_in), len(X_in[0])))
    M = len(X_in[0]) #Number of features
    seed(time())
    #To prevent data snooping, breaks the input set into train. cross validation
    #and scale sets, with sizes proportional to 8-1-1
    #First puts aside 10% of the data for the tests
    scale_set_indices, train_indices = split_indices(len(X_in), int(round(0.1*len(X_in))))
    # shuffle(X_in, y_in)
    X_scale = [X_in[i] for i in scale_set_indices]
    y_scale = [y_in[i] for i in scale_set_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]
    #Scale data first
    scaler = Scaler(copy=False) #WARNING: copy=False => in place modification
    #Normalize the data and stores as inner parameters the mean and standard deviation
    #To avoid data snooping, normalization is computed on a separate subset only, and then reported on data
    scaler.fit(X_scale, y_scale)
    X_scale = scaler.transform(X_scale)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out) #uses the same transformation (same mean_ and std_) fit before
    std_test = X_scale.std(axis=0)
    # Keep only features that are not (numerically) constant on the scale set.
    f_indices = [j for j in range(M) if std_test[j] > 1e-7]
    #Removes feature with null variance
    X_in = [[X_in[i][j] for j in f_indices] for i in range(len(X_in))]
    X_scale = [[X_scale[i][j] for j in f_indices] for i in range(len(X_scale))]
    X_out = [[X_out[i][j] for j in f_indices] for i in range(len(X_out))]
    if file_log:
        file_log.writelines('Initial features :{}, Features used: {}\n'.format(M, len(X_in[0])))
    M = len(f_indices)
    best_cv_accuracy = 0.
    best_gamma = 0.
    best_c = 0.
    #Then, on the remaining data, performs a ten-fold cross validation over the hyper-parameter grid
    for c in cs:
        for g in gammas:
            #Balanced cross validation (keeps the ratio of the two classes as
            #constant as possible across the k folds).
            kfold = cross_validation.StratifiedKFold(y_in, k=10)
            svc = svm.SVC(kernel='rbf', C=c, gamma=g, verbose=False, cache_size=4092, tol=1e-5)
            in_accuracy = 0.
            cv_accuracy = 0.
            for t_indices, cv_indices in kfold:
                X_train = array([X_in[i][:] for i in t_indices])
                y_train = [y_in[i] for i in t_indices]
                X_cv = array([X_in[i][:] for i in cv_indices])
                y_cv = [y_in[i] for i in cv_indices]
                svc.fit(X_train, y_train)
                in_accuracy += svc.score(X_train, y_train)
                cv_accuracy += svc.score(X_cv, y_cv)
            # Average the accumulated fold scores.
            in_accuracy /= kfold.k
            cv_accuracy /= kfold.k
            if file_log:
                file_log.writelines('C:{}, gamma:{}\n'.format(c, g))
                file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
                file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))
            if (cv_accuracy > best_cv_accuracy):
                best_gamma = g
                best_c = c
                best_cv_accuracy = cv_accuracy
    if file_log:
        file_log.writelines('\nBEST result: E_cv={}, C={}, gamma={}\n'.format(1. - best_cv_accuracy, best_c, best_gamma))
    # Retrain with the best hyper-parameters on all training rows and
    # report the error on the held-out scale set.
    svc = svm.SVC(kernel='rbf', C=best_c, gamma=best_gamma, verbose=False, cache_size=4092, tol=1e-5)
    svc.fit(X_in, y_in)
    if file_log:
        file_log.writelines('Ein= {}\n'.format(1. - svc.score(X_in, y_in)))
        file_log.writelines('Etest= {}\n'.format(1. - svc.score(X_scale, y_scale)))
    y_out = svc.predict(X_out)
    #DEBUG: output = ['{} {:+}\n'.format(id_out[i], int(y_scale[i])) for i in range(len(X_out))]
    #DEBUG: file_log.writelines('------------------------')
    return y_out
示例13: Logistic_train
# 需要导入模块: from sklearn.preprocessing import Scaler [as 别名]
# 或者: from sklearn.preprocessing.Scaler import transform [as 别名]
def Logistic_train(X_in, y_in, X_out, cs, file_log=None):
    """Grid-search the regularization strength C of a logistic regression
    with 10-fold stratified CV, retrain with the best C and predict X_out.

    cs: iterable of candidate C values.
    file_log: optional open file; progress and error rates are appended.
    Returns the labels predicted for X_out.
    """
    if file_log:
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(len(X_in), len(X_in[0])))
    M = len(X_in[0]) #Number of features
    seed(time())
    #To prevent data snooping, breakes the input set into train. cross validation and test sets, with sizes proportional to 8-1-1
    #First puts aside 10% of the data for the tests
    test_indices, train_indices = split_indices(len(X_in), int(round(0.1*len(X_in))))
    X_scaler = [X_in[i] for i in test_indices]
    y_scaler = [y_in[i] for i in test_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]
    #scale data first
    scaler = Scaler(copy=False) #in place modification
    #Normalize the data and stores as inner parameters the mean and standard deviation
    # NOTE(review): despite the comment below, the statistics are fitted on
    # the held-out 10% slice (X_scaler), not on the training rows.
    #To avoid data snooping, normalization is computed on training set only, and then reported on data
    scaler.fit(X_scaler, y_scaler)
    X_scaler = scaler.transform(X_scaler)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out) #uses the same transformation (same mean_ and std_) fit before
    std_test = X_scaler.std(axis=0)
    f_indices = [j for j in range(M) if std_test[j] > 1e-7]
    #Removes feature with null variance
    X_in = [[X_in[i][j] for j in f_indices] for i in range(len(X_in))]
    X_scaler = [[X_scaler[i][j] for j in f_indices] for i in range(len(X_scaler))]
    X_out = [[X_out[i][j] for j in f_indices] for i in range(len(X_out))]
    M = len(X_in[0])
    #Then, on the remaining data, performs a ten-fold cross validation over the regularization strength
    best_cv_accuracy = 0.
    best_c = 0.
    for c in cs:
        kfold = cross_validation.StratifiedKFold(y_in, k=10)
        lrc = LogisticRegression(C=c, tol=1e-5)
        in_accuracy = 0.
        cv_accuracy = 0.
        for t_indices, cv_indices in kfold:
            X_train = array([X_in[i][:] for i in t_indices])
            y_train = [y_in[i] for i in t_indices]
            X_cv = array([X_in[i][:] for i in cv_indices])
            y_cv = [y_in[i] for i in cv_indices]
            lrc.fit(X_train, y_train)
            in_accuracy += lrc.score(X_train, y_train)
            cv_accuracy += lrc.score(X_cv, y_cv)
        # Average the accumulated fold scores.
        in_accuracy /= kfold.k
        cv_accuracy /= kfold.k
        if file_log:
            file_log.writelines('C: {}\n'.format(c))
            file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
            file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))
        if (cv_accuracy > best_cv_accuracy):
            best_c = c
            best_cv_accuracy = cv_accuracy
    #Now tests the out of sample error
    if file_log:
        file_log.writelines('\nBEST result: E_cv={}, C={}\n'.format(1. - best_cv_accuracy, best_c))
    lrc = LogisticRegression(C=best_c, tol=1e-5)
    lrc.fit(X_in, y_in)
    if file_log:
        file_log.writelines('Ein= {}\n'.format(1. - lrc.score(X_in, y_in)))
        file_log.writelines('Etest= {}\n'.format(1. - lrc.score(X_scaler, y_scaler)))
    y_out = lrc.predict(X_out)
    return y_out
示例14: tree_train
# 需要导入模块: from sklearn.preprocessing import Scaler [as 别名]
# 或者: from sklearn.preprocessing.Scaler import transform [as 别名]
def tree_train(X_in, y_in, X_out, min_meaningful_features_ratio=1., file_log=None):
    """Tune ExtraTreesClassifier's max_features via 10-fold stratified CV,
    retrain with the best value and predict labels for X_out.

    min_meaningful_features_ratio: lower bound of the scanned range, as a
        fraction of the surviving feature count.
    file_log: optional open file; progress and error rates are appended.
    Returns the labels predicted for X_out.
    """
    if file_log:
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(len(X_in), len(X_in[0])))
    M = len(X_in[0])  # number of features
    seed(time())
    # Put aside 10% of the data; it is used only to fit the scaler, to
    # detect (near-)constant features, and for the final Etest report.
    test_indices, train_indices = split_indices(len(X_in), int(round(0.1*len(X_in))))
    X_scaler = [X_in[i] for i in test_indices]
    y_scaler = [y_in[i] for i in test_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]
    # Standardize in place; statistics come from the held-out slice only.
    scaler = Scaler(copy=False)
    scaler.fit(X_scaler, y_scaler)
    X_scaler = scaler.transform(X_scaler)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out)  # same mean_/std_ fitted above
    std_test = X_scaler.std(axis=0)
    f_indices = [j for j in range(M) if std_test[j] > 1e-7]
    # Drop features with (numerically) null variance.
    X_in = [[X_in[i][j] for j in f_indices] for i in range(len(X_in))]
    X_scaler = [[X_scaler[i][j] for j in f_indices] for i in range(len(X_scaler))]
    X_out = [[X_out[i][j] for j in f_indices] for i in range(len(X_out))]
    M = len(f_indices)
    # 10-fold CV over the number of features considered per split.
    best_cv_accuracy = 0.
    best_features_number = M
    for features_number in range(int(floor(M * min_meaningful_features_ratio)), M + 1):
        kfold = cross_validation.StratifiedKFold(y_in, k=10)
        svc = ExtraTreesClassifier(criterion='entropy', max_features=features_number)
        in_accuracy = 0.
        cv_accuracy = 0.
        for t_indices, cv_indices in kfold:
            X_train = array([[X_in[i][j] for j in range(M)] for i in t_indices])
            y_train = [y_in[i] for i in t_indices]
            X_cv = array([[X_in[i][j] for j in range(M)] for i in cv_indices])
            y_cv = [y_in[i] for i in cv_indices]
            svc.fit(X_train, y_train)
            in_accuracy += svc.score(X_train, y_train)
            cv_accuracy += svc.score(X_cv, y_cv)
        in_accuracy /= kfold.k
        cv_accuracy /= kfold.k
        if file_log:
            file_log.writelines('# of features: {}\n'.format(len(X_train[0])))
            file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
            file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))
        if (cv_accuracy > best_cv_accuracy):
            best_features_number = features_number
            best_cv_accuracy = cv_accuracy
    if file_log:
        file_log.writelines('\nBEST result: E_cv={}, t={}\n'.format(1. - best_cv_accuracy, best_features_number))
    # BUG FIX: the original rebuilt the final model with
    # n_estimators=features_number -- the wrong parameter (the CV above
    # tunes max_features) and the wrong variable (the loop leftover rather
    # than the selected best value).
    svc = ExtraTreesClassifier(criterion='entropy', max_features=best_features_number)
    svc.fit(X_in, y_in)
    if file_log:
        file_log.writelines('Ein= {}\n'.format(1. - svc.score(X_in, y_in)))
        file_log.writelines('Etest= {}\n'.format(1. - svc.score(X_scaler, y_scaler)))
    y_out = svc.predict(X_out)
    return y_out