This article collects typical usage examples of the Python method sklearn.preprocessing.StandardScaler.fit. If you are wondering what StandardScaler.fit does in Python, or how to use it, the curated code samples below may help. You can also read further about its containing class, sklearn.preprocessing.StandardScaler.
The following 15 code examples of StandardScaler.fit are sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code samples.
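As a quick orientation before the examples, here is a minimal, self-contained sketch of the fit/transform pattern; the toy matrix X below is made up purely for illustration.
import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])  # toy data
scaler = StandardScaler()
scaler.fit(X)                       # learn the per-column mean and standard deviation
X_scaled = scaler.transform(X)      # apply (x - mean) / std to each column
print(scaler.mean_, scaler.scale_)  # the statistics that fit() learned from X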
Example 1: rf2
# Required import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def rf2():
"""
Submission: rf2_0704_04.csv
3000 trees
E_val: 0.871431
E_in: 0.999998
E_out:
30000 trees
E_val:
E_in:
E_out:
"""
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
X, y = dataset.load_train()
raw_scaler = StandardScaler()
raw_scaler.fit(X)
X_scaled = raw_scaler.transform(X)
rf = RandomForestClassifier(n_estimators=30000, oob_score=True, n_jobs=-1,
class_weight='auto', max_features='log2')
rf.fit(X_scaled, y)
logger.debug('Eval(oob): %f', rf.oob_score_)
logger.debug('Ein: %f', Util.auc_score(rf, X_scaled, y))
IO.cache(rf, Path.of_cache('rf.RandomForestClassifier.log2.pkl'))
IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
('rf', rf)]), 'rf2_0704_04')
Example 2: process
# Required import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def process(discrete, cont):
# Create discrete and continuous data matrices
discrete_X = np.array(discrete)
cont_X = np.array(cont)
    # Impute missing discrete values with the column mode
    # (Imputer lives in sklearn.preprocessing in scikit-learn < 0.22; see the note after this example)
    imp = Imputer(strategy='most_frequent')
discrete_X = imp.fit_transform(discrete_X)
# Impute continuous values
imp_c = Imputer(strategy='mean')
cont_X = imp_c.fit_transform(cont_X)
# Discrete basis representation
enc = OneHotEncoder()
enc.fit(discrete_X)
discrete_X = enc.transform(discrete_X).toarray()
# Continuous scaling
scaler = StandardScaler()
scaler.fit(cont_X)
cont_X = scaler.transform(cont_X)
# Merge to one array
X = np.concatenate((discrete_X, cont_X), axis=1)
return X
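A note on Example 2: Imputer was removed in scikit-learn 0.22. Below is a rough sketch of the same processing against the modern API; process_modern is a hypothetical name, SimpleImputer comes from sklearn.impute, and OneHotEncoder's sparse_output flag assumes scikit-learn >= 1.2 (older versions spell it sparse).
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def process_modern(discrete, cont):
    discrete_X = np.array(discrete)
    cont_X = np.array(cont)
    # Impute with the column mode, then one-hot encode, the discrete block
    discrete_X = SimpleImputer(strategy='most_frequent').fit_transform(discrete_X)
    discrete_X = OneHotEncoder(sparse_output=False).fit_transform(discrete_X)
    # Impute with the column mean, then standardize, the continuous block
    cont_X = SimpleImputer(strategy='mean').fit_transform(cont_X)
    cont_X = StandardScaler().fit_transform(cont_X)
    # Merge into one dense feature matrix, as in the original
    return np.concatenate((discrete_X, cont_X), axis=1)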
Example 3: knn
# Required import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def knn(x_train, y_train, x_valid):
    # log(1 + x) transform; np.log1p is the numerically stabler equivalent
    x_train = np.log(x_train + 1)
    x_valid = np.log(x_valid + 1)
where_are_nan = np.isnan(x_train)
where_are_inf = np.isinf(x_train)
x_train[where_are_nan] = 0
x_train[where_are_inf] = 0
where_are_nan = np.isnan(x_valid)
where_are_inf = np.isinf(x_valid)
x_valid[where_are_nan] = 0
x_valid[where_are_inf] = 0
scale=StandardScaler()
scale.fit(x_train)
x_train=scale.transform(x_train)
x_valid=scale.transform(x_valid)
#pca = PCA(n_components=10)
#pca.fit(x_train)
#x_train = pca.transform(x_train)
#x_valid = pca.transform(x_valid)
kneighbors=KNeighborsClassifier(n_neighbors=200,n_jobs=-1)
knn_train, knn_test = stacking(kneighbors, x_train, y_train, x_valid, "knn")
return knn_train, knn_test, "knn"
Example 4: get_norm_nFoldData
# Required import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def get_norm_nFoldData(trainXY, testXY):
trainX = trainXY[:,:-1]
trainY = trainXY[:,-1]
testX = testXY[:,:-1]
testY = testXY[:,-1]
    # Standardise only the feature columns, not the labels. Fit the scaler on the
    # training features and reuse it for the test features; refitting on the test
    # set, as the original code did, would leak test-set statistics into preprocessing.
    scaler = StandardScaler()
    scaler.fit(trainX)
    trainX = scaler.transform(trainX)
    testX = scaler.transform(testX)
trainY = trainY.reshape((trainY.shape[0],1))
testY = testY.reshape((testY.shape[0],1))
train_X_Y = np.concatenate((trainX,trainY),axis=1)
test_X_Y = np.concatenate((testX,testY),axis=1)
folds_tr = []
folds_te = []
nfolds = 5
for i in range(nfolds):
xp = int(train_X_Y.shape[0]*.8)
np.random.shuffle(train_X_Y)
folds_tr.append(train_X_Y[:xp,:])
folds_te.append(train_X_Y[xp:,:])
return folds_tr, folds_te
Example 5: load_data_csv_advanced
# Required import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def load_data_csv_advanced(datafile):
"""
    Loads data from the given CSV file. The first line of the CSV file is expected to hold the column names.
    :param datafile: path of the file
    :return: a tuple of (sparsified data dictionary, fitted StandardScaler)
    """
    # Column names in the CSV file. For example, setting _COLUMN_X to 'x' means that the x coordinates of
    # geographical locations will be read from the column named 'x'.
_COLUMN_X = 'x'
_COLUMN_Y = 'y'
data = pd.read_csv(datafile)
# Normalize
scaler = StandardScaler()
scaler.fit(data[[_COLUMN_X, _COLUMN_Y]])
data[[_COLUMN_X, _COLUMN_Y]] = scaler.transform(data[[_COLUMN_X, _COLUMN_Y]])
# Get feature vector names by removing "x" and "y"
feature_vector_names = data.columns.difference([_COLUMN_X, _COLUMN_Y])
data_coords = data[[_COLUMN_X, _COLUMN_Y]].values
result = {"coordinates": data_coords}
for feature in feature_vector_names:
data_words = [[e.strip() for e in venue_data.split(",")] for venue_data in data[feature].values.flatten().tolist()]
result[feature] = data_words
return sparsify_data(result, None, None), scaler # None for both params since SVD is not used
Example 6: dbscan_outliers
# Required import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def dbscan_outliers(df):
"""
Find outliers (noise points) using DBSCAN.
Parameters
----------
df: A pandas.DataFrame
Returns
-------
A tuple of (a sklearn.DBSCAN instance, a pandas.DataFrame)
"""
    scaler = StandardScaler()
    scaler.fit(df)
    scaled = scaler.transform(df)
    # Fit DBSCAN once; calling fit_predict afterwards would redo the clustering,
    # and the labels are available on the fitted estimator anyway.
    dbs = DBSCAN()
    db = dbs.fit(scaled)
    # DBSCAN labels noise points -1, so select exactly those rows
    # (the original df.ix[np.nonzero(...)] also caught clusters other than 0)
    df_o = df[db.labels_ == -1]
    return db, df_o
Example 7: GPR
# Required import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
class GPR(object):
def __init__(self, X, y, kernel=None):
self.X = X
self.y = y
self._noise_variance = 0.00001
self._kernel = kernel
self._scaler = StandardScaler(with_std=False)
self._scaler.fit(self.y)
self.y = self._scaler.transform(self.y)
assert self._kernel is not None
@property
def noise_variance(self):
return self._noise_variance
@noise_variance.setter
def noise_variance(self, value):
self._noise_variance = value
    def predict(self, X_test):
        assert isinstance(self._kernel, Kern)
        # Standard GP posterior via Cholesky (see the equations after this example)
        K = self._kernel.K(self.X)                 # train covariance k(X, X)
        K_star = self._kernel.K(self.X, X_test)    # cross covariance k(X, X*)
        K_star_star = self._kernel.K(X_test)       # test covariance k(X*, X*)
        L = np.linalg.cholesky(K + self._noise_variance * np.eye(len(K)))
        Lk = np.linalg.solve(L, K_star)
        mu = np.dot(Lk.T, np.linalg.solve(L, self.y))
        s2 = np.diag(K_star_star) - np.sum(Lk ** 2, axis=0) + self._noise_variance
        # Undo the centering applied in __init__ (with_std=False, so mean only)
        return mu + self._scaler.mean_, s2
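For reference, Example 7's predict implements the standard Gaussian-process posterior (Rasmussen & Williams, GPML, Algorithm 2.1) with noise variance \sigma^2:

    \mu_* = K_*^\top (K + \sigma^2 I)^{-1} y
    s_*^2 = \mathrm{diag}(K_{**}) - \mathrm{diag}\big(K_*^\top (K + \sigma^2 I)^{-1} K_*\big) + \sigma^2

With L the Cholesky factor of K + \sigma^2 I and Lk = L^{-1} K_*, the quadratic term K_*^\top (K + \sigma^2 I)^{-1} K_* equals Lk^\top Lk, so its diagonal is exactly the column-wise sum of Lk ** 2 that the code computes.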
Example 8: data_processing
# Required import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def data_processing(train,test,features):
# train['StreetNo'] = train['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
# test['StreetNo'] = test['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
# train['Address'] = train['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
# test['Address'] = test['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
# train['hour'] = train['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12)
# test['hour'] = test['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12)
# train['dark'] = train['Dates'].apply(lambda x: 1 if (len(x) > 4 and int(x[11:13]) >= 18 and int(x[11:13]) < 6) else 0)
# test['dark'] = test['Dates'].apply(lambda x: 1 if (len(x) > 4 and int(x[11:13]) >= 18 and int(x[11:13]) < 6) else 0)
# features += ['hour','dark','StreetNo']
print("Filling NAs")
# print(train.mode())
train = train.fillna(train.median().iloc[0])
test = test.fillna(test.median().iloc[0])
print("Label Encoder")
le=LabelEncoder()
for col in features:
le.fit(list(train[col])+list(test[col]))
train[col]=le.transform(train[col])
test[col]=le.transform(test[col])
le.fit(list(train[target]))
train[target]=le.transform(train[target])
print("Standard Scalaer")
scaler=StandardScaler()
for col in features:
scaler.fit(list(train[col]))
train[col]=scaler.transform(train[col])
test[col]=scaler.transform(test[col])
return train,test,features
Example 9: __init__
# Required import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def __init__(self):
"""
Constructs a SimulateData object.
"""
# Read the simulated data.
simulated = pd.read_csv("simulated.csv", index_col=0)
predictors = np.asarray(simulated)[:, 0:-1]
responses = np.asarray(simulated)[:, -1]
# Divide the simulated data into training and test sets.
predictors_training, predictors_test,\
self.responses_training, self.responses_test =\
train_test_split(predictors, responses, test_size=0.33)
# Standardize the predictors, both training and test.
scaler = StandardScaler()
scaler.fit(predictors_training)
self.predictors_training = scaler.transform(predictors_training)
self.predictors_test = scaler.transform(predictors_test)
# Keep track of the number of samples in the training and test sets,
# and also the number of features.
self.training_sample_count = len(self.responses_training)
self.test_sample_count = len(self.responses_test)
self.feature_count = np.size(predictors, 1)
Example 10: lr_with_scale3
# Required import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def lr_with_scale3():
"""
Check the performance of normalizing TEST SET.
Submission: lr_with_scale3_0707_04.csv
E_val:
E_in: 0.879233
E_out: 0.8770121701777971
Submission: lr_with_scale3_0712_01.csv
E_val:
E_in:
E_out:
"""
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np
X, y = dataset.load_train()
raw_scaler = StandardScaler()
raw_scaler.fit(np.r_[X, dataset.load_test()])
X_scaled = raw_scaler.transform(X)
clf = LogisticRegression(C=0.03, class_weight='auto')
clf.fit(X_scaled, y)
logger.debug('E_in: %f', Util.auc_score(clf, X_scaled, y))
IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
('lr', clf)]), 'lr_with_scale3_0712_01')
scores = cross_val_score(clf, X_scaled, y, scoring='roc_auc', n_jobs=-1)
logger.debug('E_val: %f <- %s', np.average(scores), scores)
Example 11: load_data
# Required import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def load_data(dataset, scale=False):
''' Loads the dataset
:type dataset: string
:param dataset: The folder in ../data/ containing the training/testing numpy arrays
'''
print '... loading data'
path = "../data/" + dataset + "/"
#training set
trainingData = numpy.load(path + "training.data.npy")
trainingIndices = numpy.load(path + "training.indices.npy")
trainingIndptr = numpy.load(path + "training.indptr.npy")
training_y = numpy.load(path + "training.labels.npy")
training_X = scipy.sparse.csr_matrix((trainingData, trainingIndices, trainingIndptr))
#testing set
testingData = numpy.load(path + "testing.data.npy")
testingIndices = numpy.load(path + "testing.indices.npy")
testingIndptr = numpy.load(path + "testing.indptr.npy")
testing_y = numpy.load(path + "testing.labels.npy")
testing_X = scipy.sparse.csr_matrix((testingData, testingIndices, testingIndptr))
    #scale the data
    if scale:
        print "..training scaler"
        # with_mean=False keeps the CSR matrices sparse; centering would densify them
        scaler = StandardScaler(with_mean=False)
scaler.fit(training_X)
print "..scaling features"
training_X = scaler.transform(training_X)
testing_X = scaler.transform(testing_X)
return [(training_X, training_y),(testing_X, testing_y)]
Example 12: svc_appr
# Required import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def svc_appr():
"""
Best params: {'C': 0.022139881953014046}
Submission:
E_val:
E_in:
E_out:
"""
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import RandomizedSearchCV
from scipy.stats import expon
X, y = dataset.load_train()
raw_scaler = StandardScaler()
raw_scaler.fit(X)
X_scaled = raw_scaler.transform(X)
svc = LinearSVC(dual=False, class_weight='auto')
rs = RandomizedSearchCV(svc, n_iter=50, scoring='roc_auc', n_jobs=-1,
cv=StratifiedKFold(y, 5), verbose=2,
param_distributions={'C': expon()})
rs.fit(X_scaled, y)
logger.debug('Got best SVC.')
logger.debug('Best params: %s', rs.best_params_)
logger.debug('Grid scores:')
for i, grid_score in enumerate(rs.grid_scores_):
print('\t%s' % grid_score)
logger.debug('Best score (E_val): %s', rs.best_score_)
logger.debug('E_in: %f', Util.auc_score(rs, X_scaled, y))
Example 13: ada_boost_dt
# Required import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def ada_boost_dt():
"""
Submission: ada_boost_dt_0707_03.csv
E_val: 0.854350
E_in: 0.889561
E_out: 0.8832315976033993
"""
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
X, y = dataset.load_train()
raw_scaler = StandardScaler()
raw_scaler.fit(X)
X_scaled = raw_scaler.transform(X)
ab = AdaBoostClassifier(n_estimators=300)
scores = cross_val_score(ab, X_scaled, y, cv=5, n_jobs=-1)
logger.debug('CV: %s', scores)
logger.debug('E_val: %f', sum(scores) / len(scores))
ab.fit(X_scaled, y)
logger.debug('E_in: %f', Util.auc_score(ab, X_scaled, y))
IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
('ab', ab)]), 'ada_boost_dt_0707_03')
Example 14: bagging_lr
# Required import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def bagging_lr():
"""
Submission: bagging_lr_0707_02.csv
E_val:
E_in:
E_out:
"""
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
X, y = dataset.load_train()
raw_scaler = StandardScaler()
raw_scaler.fit(X)
X_scaled = raw_scaler.transform(X)
    bag = BaggingClassifier(LogisticRegression(class_weight='auto'),
                            n_estimators=3000, oob_score=True, n_jobs=-1,
                            verbose=2)
    bag.fit(X_scaled, y)  # fit before reading oob_score_ or dumping the model
    logger.debug('E_val (oob): %f', bag.oob_score_)
logger.debug('E_in: %f', Util.auc_score(bag, X_scaled, y))
IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
('bag', bag)]), 'bagging_lr_0707_02')
Example 15: sgc_test
# Required import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def sgc_test(X, y, weight):
from sklearn.linear_model import SGDClassifier
from sklearn import cross_validation
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
for i in range(0,1):
X_train, X_test, y_train, y_test, weight_train, weight_test = cross_validation.train_test_split(
X, y, weight, test_size=0.2, random_state=0)
clf = SGDClassifier(loss="hinge", n_iter=100, n_jobs=-1, penalty="l2")
#clf = LogisticRegression( max_iter=100)
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train) # Don't cheat - fit only on training data
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test) # apply same transformation to test data
clf.fit(X_train, y_train, sample_weight=weight_train)
y_pred = clf.predict(X_train)
#print(confusion_matrix(y_train, y_pred))
print(clf.score(X_train,y_train,weight_train))
y_pred = clf.predict(X_test)
#print(confusion_matrix(y_test, y_pred))
print(clf.score(X_test,y_test,weight_test))
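The comment in Example 15 ("Don't cheat - fit only on training data") is the pattern running through most of these snippets. Here is a minimal sketch of the same discipline expressed with a Pipeline, which confines the scaler's fit to whatever data the pipeline itself is fitted on; the toy X and y are made up for illustration, and the imports assume a scikit-learn version with the model_selection module.
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = rng.randn(100, 5)                # toy features
y = (X[:, 0] > 0).astype(int)        # toy labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
pipe = make_pipeline(StandardScaler(), SGDClassifier(loss="hinge"))
pipe.fit(X_train, y_train)           # the scaler is fitted on X_train only
print(pipe.score(X_test, y_test))    # X_test is transformed with the training statistics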