本文整理匯總了Python中sklearn.preprocessing.StandardScaler類的典型用法代碼示例。如果您正苦於以下問題：Python StandardScaler類的具體用法？Python StandardScaler怎麼用？Python StandardScaler使用的例子？那麼，這裡精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了StandardScaler類的15個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: check_transformer_pickle
def check_transformer_pickle(name, Transformer):
    """Check that a fitted transformer transforms identically after pickling.

    Parameters
    ----------
    name : str
        Estimator name. Only used to decide whether a two-column target is
        built (names contained in ``CROSS_DECOMPOSITION``).
    Transformer : callable
        Called without arguments to create the estimator under test.

    Notes
    -----
    Returns without checking anything when the created object has no
    ``transform`` attribute. Otherwise the comparison is delegated to
    ``assert_array_almost_equal``.
    """
    X, y = make_blobs(
        n_samples=30,
        centers=[[0, 0, 0], [1, 1, 1]],
        random_state=0,
        n_features=2,
        cluster_std=0.1,
    )
    n_samples, n_features = X.shape
    # Standardize, then shift so that the global minimum is exactly zero.
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    # Build the estimator inside a recording context so warnings raised by
    # the constructor (e.g. deprecations) are captured rather than shown.
    with warnings.catch_warnings(record=True):
        transformer = Transformer()
    if not hasattr(transformer, 'transform'):
        return
    set_random_state(transformer)
    set_fast_parameters(transformer)

    # Choose the target: two columns for cross-decomposition estimators,
    # the plain blob labels otherwise.
    if name in CROSS_DECOMPOSITION:
        rng = np.random.RandomState(seed=12345)
        noisy_copy = 2 * y + rng.randint(2, size=len(y))
        y_ = np.vstack([y, noisy_copy]).T
    else:
        y_ = y

    # The estimator is fitted twice on purpose of fidelity with the original
    # check; the second fit is the one whose output is compared.
    transformer.fit(X, y_)
    X_pred = transformer.fit(X, y_).transform(X)

    # Round-trip through pickle and compare the transformed output.
    restored = pickle.loads(pickle.dumps(transformer))
    assert_array_almost_equal(restored.transform(X), X_pred)
示例2: check_classifiers_classes
def check_classifiers_classes(name, Classifier):
    """Check that a classifier predicts and exposes the classes it was fitted on.

    The classifier is fitted twice on the same blobs: once with string
    labels and once with the same labels cast to object dtype.

    Parameters
    ----------
    name : str
        Estimator name; selects the special cases handled below.
    Classifier : callable
        Called without arguments to create the classifier under test.

    Notes
    -----
    A mismatch between predicted and true label sets fails through
    ``assert_array_equal``. A mismatch in ``classes_`` is only printed.
    """
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    # Shift the data so its minimum becomes 0.1: some estimators (NMF-like)
    # need non-negative input.
    X -= X.min() - .1
    string_labels = np.array(["one", "two", "three"])[y]
    keeps_numeric_labels = name in ["LabelPropagation", "LabelSpreading"]

    for label_variant in (string_labels, string_labels.astype('O')):
        # TODO some complication with -1 label
        y_ = y if keeps_numeric_labels else label_variant
        classes = np.unique(y_)

        # Swallow warnings (e.g. deprecations) raised by the constructor.
        with warnings.catch_warnings(record=True):
            classifier = Classifier()
        if name == 'BernoulliNB':
            classifier.set_params(binarize=X.mean())
        set_fast_parameters(classifier)

        classifier.fit(X, y_)
        y_pred = classifier.predict(X)

        # Every training label must show up among the training predictions.
        assert_array_equal(np.unique(y_), np.unique(y_pred))
        if np.any(classifier.classes_ != classes):
            print("Unexpected classes_ attribute for %r: "
                  "expected %s, got %s" %
                  (classifier, classes, classifier.classes_))
示例3: clustering_approach
def clustering_approach(self):
    '''
    Standardize the user features and cluster them with two algorithms.

    IN: self.df_full (features) and self.labels (reference labels)
    OUT: progress messages on stdout; each clustering is handed to
         self.analyse_preds together with the reference labels.
         Always returns None.
    '''
    print('Fitting clustering model')
    features = StandardScaler().fit_transform(self.df_full.values)
    reference = self.labels

    # KMeans with two clusters.
    kmeans = KMeans(n_clusters=2, n_jobs=6)
    kmeans.fit(features)
    # Flip the reference labels (1 -> 0, everything else -> 1) because the
    # original author observed the super-users landing in cluster 0.
    flipped = reference.apply(lambda value: 0 if value == 1 else 1)
    print('\nKMeans clustering: ')
    self.analyse_preds(flipped, kmeans.labels_)

    # Agglomerative clustering with default settings, compared against the
    # unflipped reference labels.
    print('\nAgglomerative clustering approach: ')
    agglomerative_labels = AgglomerativeClustering().fit_predict(features)
    self.analyse_preds(reference, agglomerative_labels)
    return None
示例4: buildCoordinationTreeRegressor
def buildCoordinationTreeRegressor(predictorColumns, element, coordinationDir = 'coordination/', md = None):
    """
    Train a random-forest coordination predictor for one element.

    The data is read from ``coordinationDir + element + '.csv'``. Rows with
    missing values are dropped, and when a ``fracNobleGas`` column exists
    only rows where it is <= 0 are kept.

    Parameters:
        predictorColumns: columns of the CSV used as (standardized) features.
        element: element name; also the CSV file stem.
        coordinationDir: prefix prepended to the file name.
        md: ``max_depth`` passed to the random forest.

    Returns a 3-tuple ``(model, table, score)``:
        model: the regressor refitted on all rows,
        table: DataFrame with 'True' and 'Predicted' columns for a held-out split,
        score: mean cross-validated mean_absolute_error, rounded to 2 decimals.
    ``(None, None, None)`` is returned when the file cannot be read or fewer
    than 4 rows remain after filtering.
    """
    try:
        frame = pd.read_csv(coordinationDir + element + '.csv')
    except Exception:
        print('No data for ' + element)
        return None, None, None

    frame = frame.dropna()
    if 'fracNobleGas' in frame.columns:
        frame = frame[frame['fracNobleGas'] <= 0]
    if len(frame) < 4:
        print('Not enough data for ' + element)
        return None, None, None

    scaler = StandardScaler()
    features = scaler.fit_transform(frame[predictorColumns].astype('float64'))
    target = frame['avgCoordination'].values

    forest = RandomForestRegressor(max_depth = md)
    fold_scores = cross_val_score(forest, features, target,
                                  scoring=make_scorer(mean_absolute_error))
    mae = mean(fold_scores)

    # Fit on a random split only to build the true-vs-predicted table.
    X_train, X_test, y_train, y_test = train_test_split(features, target)
    forest.fit(X_train, y_train)
    predicted = forest.predict(X_test)
    table = pd.DataFrame({'True': y_test, 'Predicted': predicted})

    # The returned model is the one trained on every available row.
    forest.fit(features, target)
    return forest, table, round(mae, 2)
示例5: train_and_test
def train_and_test(train_books, test_books, train, scale=True):
    """Train a pair classifier on one set of books and report precision/recall.

    Parameters:
        train_books: passed to ``get_pair_data(train_books, True)``.
        test_books: passed to ``get_pair_data(test_books)``.
        train: callable ``train(X_train, y_train)`` returning an object
            with a ``predict`` method.
        scale: when truthy, a ``StandardScaler`` is fitted on the training
            features and applied to both feature sets.

    Returns:
        ``(clf, scaler, X_train, y_train, X_test, y_test)``; ``scaler`` is
        ``None`` when ``scale`` is falsy.
    """
    X_train, y_train, cands_train, features = get_pair_data(train_books, True)
    X_test, y_test, cands_test, features = get_pair_data(test_books)

    scaler = StandardScaler() if scale else None
    if scaler is not None:
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # NOTE(review): the 0.1 factor looks like a typo for 1.0 (float
    # division of the positive ratio) - confirm before relying on it.
    print(sum(y_train)*0.1/len(y_train))
    print('Start training')
    print(X_train.shape)
    clf = train(X_train, y_train)
    print('Done training')

    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)

    # Disabled per-book evaluation, kept by the original author as an inert
    # string literal.
    '''
    # print performance for training books
    print "--------------Traning data-------------"
    train_perf = evaluate_books(clf, train_books, scaler, evaluate_pair)
    # print performance for testing books
    print "\n"
    print "--------------Testing data-------------"
    test_perf = evaluate_books(clf, test_books, scaler, evaluate_pair)
    '''

    # Each report is joined with single spaces into one line.
    train_report = (
        'Train Non-unique Precision:', precision(y_train_pred, y_train),
        'Non-unique Recall:', recall(y_train_pred, y_train),
    )
    print(' '.join(str(part) for part in train_report))
    test_report = (
        'Test Non-unique Precision:', precision(y_test_pred, y_test),
        'Recall:', recall(y_test_pred, y_test),
    )
    print(' '.join(str(part) for part in test_report))
    return clf, scaler, X_train, y_train, X_test, y_test
示例6: load_train_data
def load_train_data(path):
    """Load the training CSV and return standardized features with encoded labels.

    Steps: read ``path``, draw 40000 row labels at random, drop the
    ``target`` and ``ID`` columns from the features, replace missing values
    with -1, encode the labels, shuffle features and labels together, and
    standardize the features.

    Parameters:
        path: location of the CSV file; it must contain ``target`` and
            ``ID`` columns.

    Returns:
        ``(X, y, encoder, scaler)``: float32-derived standardized feature
        matrix, int32 label vector aligned row-by-row with ``X``, the fitted
        ``LabelEncoder`` and the fitted ``StandardScaler``.
    """
    print("Loading Train Data")
    df = pd.read_csv(path)

    # Subsample to keep memory bounded. The original author notes that
    # running on the full file needs more than 8GB RAM. np.random.choice
    # samples with replacement by default, so rows may repeat.
    rows = np.random.choice(df.index.values, 40000)
    # `rows` holds index labels, so label-based .loc is the equivalent of
    # the removed .ix indexer here.
    df = df.loc[rows]

    labels = df.target
    df = df.drop(['target', 'ID'], axis=1)

    # Junk cols - Some feature engineering needed here
    df = df.fillna(-1)

    X = df.values.astype(np.float32)
    encoder = LabelEncoder()
    y = encoder.fit_transform(labels).astype(np.int32)

    # Shuffle features and labels with ONE shared ordering. Shuffling X on
    # its own (as the code previously did) detaches every row from its label.
    order = np.arange(len(X))
    np.random.shuffle(order)
    X = X[order]
    y = y[order]

    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return X, y, encoder, scaler
示例7: knn
def knn(x_train, y_train, x_valid):
    """Build stacked k-nearest-neighbours features from log-scaled inputs.

    Both feature sets go through ``log(x + 1)``, have NaN and infinite
    entries replaced by 0, and are standardized with statistics fitted on
    the training set only. The classifier and data are then handed to
    ``stacking``.

    Returns:
        ``(knn_train, knn_test, "knn")`` where the first two values are
        whatever ``stacking`` returns.
    """
    x_train = np.log(x_train + 1)
    x_valid = np.log(x_valid + 1)

    # Replace NaN and +/-inf entries (e.g. from the log of non-positive
    # values) with 0. Both masks are computed before anything is overwritten.
    for matrix in (x_train, x_valid):
        nan_mask = np.isnan(matrix)
        inf_mask = np.isinf(matrix)
        matrix[nan_mask] = 0
        matrix[inf_mask] = 0

    scaler = StandardScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_valid = scaler.transform(x_valid)

    # Optional PCA reduction, disabled by the original author:
    #pca = PCA(n_components=10)
    #pca.fit(x_train)
    #x_train = pca.transform(x_train)
    #x_valid = pca.transform(x_valid)

    classifier = KNeighborsClassifier(n_neighbors=200, n_jobs=-1)
    knn_train, knn_test = stacking(classifier, x_train, y_train, x_valid, "knn")
    return knn_train, knn_test, "knn"
示例8: normalize
def normalize( training_data, test_data ):
    """Standardize two DataFrames using statistics from the training frame.

    The scaler is fitted on ``training_data`` only and then applied to
    ``test_data``. Each result is a new DataFrame that keeps the columns and
    index of its input.

    Returns:
        ``(scaled_training_data, scaled_test_data)``.
    """
    scaler = StandardScaler()
    scaled_training = pd.DataFrame(
        scaler.fit_transform( training_data ),
        columns=training_data.columns,
        index=training_data.index,
    )
    scaled_test = pd.DataFrame(
        scaler.transform( test_data ),
        columns=test_data.columns,
        index=test_data.index,
    )
    return scaled_training, scaled_test
示例9: run_model
def run_model( model, model_name, X, Y, X_val):
    """Fit ``model`` on time-indexed, standardized features and save clipped predictions.

    A "time" column holding the row position (0 .. len(X)-1) is appended to
    ``X``. Every validation row receives the next position, ``len(X)``. Both
    matrices are standardized with one scaler fitted on the training matrix.

    Parameters:
        model: object with ``fit(X, Y)`` and ``predict(X)``.
        model_name: used in the output file name and the final log line.
        X: 2-D training features.
        Y: training targets; passed through ``make_black_maps_class`` first.
        X_val: 2-D validation features with the same columns as ``X``.

    Side effects:
        Writes ``final_pred_y<model_name>.csv`` and prints the number of
        predictions equal to 600 (after clipping to [0, 600]) followed by
        the model name. Returns ``None``.
    """
    from sklearn.preprocessing import StandardScaler # I have a suspicion that the classifier might work better without the scaler

    n_train = len(X)
    time_column = [[position] for position in range(n_train)]
    X = numpy.append(X, time_column, 1)

    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)

    Y = make_black_maps_class(Y)
    model.fit(X, Y)

    # Give the validation rows the next raw time step and push the WHOLE
    # validation matrix through the training scaler. Previously only the
    # time value was on the scaled axis (extrapolated from the last two
    # scaled rows) while the other validation columns stayed unscaled,
    # unlike the features the model was trained on. Scaling is linear, so
    # the time column ends up with the same value as that extrapolation.
    next_time_column = [[n_train] for _ in range(len(X_val))]
    X_val = numpy.append(X_val, next_time_column, 1)
    X_val = scaler.transform(X_val)

    # Now predict validation output
    Y_pred = model.predict(X_val)

    # Crop impossible values
    Y_pred[Y_pred < 0] = 0
    Y_pred[Y_pred > 600] = 600
    savetxt('final_pred_y{0}.csv'.format(model_name), Y_pred, delimiter=',')

    # Count predictions sitting exactly on the upper bound.
    black_map_count = int(numpy.sum(Y_pred == 600))
    print('%s %s' % (black_map_count, model_name))
    sys.stdout.flush()
示例10: load_data_csv_advanced
def load_data_csv_advanced(datafile):
    """
    Load a CSV file, standardize its coordinate columns and tokenize all other columns.

    The first line of the CSV file is expected to hold the column names.
    Columns 'x' and 'y' are treated as coordinates. Every other column is
    split on commas into a list of whitespace-stripped tokens per row.

    :param datafile: path of the file
    :return: a pair ``(sparsified, scaler)``: the value returned by
        ``sparsify_data`` for the assembled dict, and the ``StandardScaler``
        fitted on the coordinate columns
    """
    # Names of the CSV columns that hold the geographical coordinates.
    _COLUMN_X = 'x'
    _COLUMN_Y = 'y'
    coordinate_columns = [_COLUMN_X, _COLUMN_Y]

    data = pd.read_csv(datafile)

    # Normalize the coordinates in place.
    scaler = StandardScaler()
    scaler.fit(data[coordinate_columns])
    data[coordinate_columns] = scaler.transform(data[coordinate_columns])

    # Every column other than the coordinates is a comma-separated feature.
    feature_vector_names = data.columns.difference(coordinate_columns)

    result = {"coordinates": data[coordinate_columns].values}
    for feature in feature_vector_names:
        cells = data[feature].values.flatten().tolist()
        result[feature] = [
            [token.strip() for token in cell.split(",")]
            for cell in cells
        ]

    # None for both params since SVD is not used
    return sparsify_data(result, None, None), scaler
示例11: lassoRegression
def lassoRegression(X,y):
    """Fit a degree-40 polynomial Lasso model and save a plot of the fit.

    ``X`` is expanded to polynomial features (no bias column), standardized
    and fitted with ``Lasso(alpha=1e-7)``. The fitted curve is evaluated on
    a one-column grid from 0 to 2 (step 0.01), so ``X`` is expected to have
    a single feature. The scatter of the data and the red curve are written
    to ``plot-lassoRegression.png``.

    Parameters:
        X: training inputs.
        y: training targets.

    Returns:
        None.
    """
    print("\n### ~~~~~~~~~~~~~~~~~~~~ ###")
    print("Lasso Regression")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myDegree = 40
    polynomialFeatures = PolynomialFeatures(degree=myDegree, include_bias=False)
    Xp = polynomialFeatures.fit_transform(X)

    myScaler = StandardScaler()
    scaled_Xp = myScaler.fit_transform(Xp)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    # Local name differs from the function name to avoid shadowing it.
    lasso_model = Lasso(alpha=1e-7)
    lasso_model.fit(scaled_Xp, y)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    dummyX = np.arange(0, 2, 0.01)
    dummyX = dummyX.reshape((dummyX.shape[0], 1))
    # Reuse the transformers fitted on the training data; the plotting grid
    # must only be transformed, never used to re-fit them.
    dummyXp = polynomialFeatures.transform(dummyX)
    scaled_dummyXp = myScaler.transform(dummyXp)
    dummyY = lasso_model.predict(scaled_dummyXp)

    outputFILE = 'plot-lassoRegression.png'
    fig, ax = plt.subplots()
    fig.set_size_inches(h = 6.0, w = 10.0)
    ax.axis([0,2,0,15])
    ax.scatter(X, y, color="black", s=10.0)
    ax.plot(dummyX, dummyY, color='red', linewidth=1.5)
    # The target path is passed positionally: savefig's first parameter is
    # named `fname`, so a `filename=` keyword fails on current matplotlib.
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2, dpi = 600)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return None
示例12: load_data_csv
def load_data_csv(datafile):
    """
    Load a CSV file with coordinates and one word column, standardizing the coordinates.

    The first line of the CSV file is expected to hold the column names.
    Columns 'x' and 'y' are the coordinates. Column 'color' supplies one
    word per row, wrapped in a single-element list.

    :param datafile: path of the file
    :return: a pair ``(sparsified, scaler)``: the value returned by
        ``sparsify_data`` for the assembled dict, and the ``StandardScaler``
        fitted on the coordinate columns
    """
    # Column names are kept as named constants; the original author expected
    # to add more features later.
    _COLUMN_X = 'x'
    _COLUMN_Y = 'y'
    _COLUMN_W = 'color'
    coordinate_columns = [_COLUMN_X, _COLUMN_Y]

    frame = pd.read_csv(datafile)

    # Normalize the coordinates in place.
    scaler = StandardScaler()
    scaler.fit(frame[coordinate_columns])
    frame[coordinate_columns] = scaler.transform(frame[coordinate_columns])

    coordinates = frame[coordinate_columns].values
    words = [[word] for word in frame[[_COLUMN_W]].values.flatten().tolist()]
    payload = {"coordinates": coordinates, "words": words}

    # None for both params since SVD is not used
    return sparsify_data(payload, None, None), scaler
示例13: prepare_features
def prepare_features(data, enc=None, scaler=None):
    '''
    One-hot encode the boolean/string (categorical) features and
    shift/scale the integer/float features.

    data   : mapping with 'bfeatures', 'sfeatures', 'ifeatures' and
             'ffeatures' entries.
    enc    : fitted OneHotEncoder to reuse; a new one is fitted when None.
    scaler : fitted StandardScaler to reuse; a new one is fitted when None.

    Returns (X, enc, scaler) so the fitted objects can be passed back in
    for later data.
    '''
    # The encoder input must hold non-negative integers, hence the +1
    # (assumes the raw codes are >= -1 - TODO confirm).
    bfs = data['bfeatures'] + 1
    sfs = data['sfeatures'] + 1

    # Integer and float features are stacked side by side and standardized.
    numeric = np.hstack((data['ifeatures'], data['ffeatures']))
    if scaler is not None:
        numeric = scaler.transform(numeric, copy=False)
    else:
        scaler = StandardScaler()
        numeric = scaler.fit_transform(numeric)
        print("Training features have mean: %s" % scaler.mean_)
        print("and standard deviation: %s" % scaler.std_)

    # Categorical columns come first, so their positions are 0..k-1.
    X = np.hstack((bfs, sfs, numeric))
    categorical = np.arange(bfs.shape[1] + sfs.shape[1])
    if enc is not None:
        X = enc.transform(X)
    else:
        enc = OneHotEncoder(n_values='auto', categorical_features=categorical)
        X = enc.fit_transform(X)
        print("One-hot encoded features have dimension %d" % X.shape[1])
    return X, enc, scaler
示例14: cross_valid
def cross_valid(data, classifier, x_cols, y_col, **kwargs):
    """Cross-validate ``classifier`` and collect out-of-fold predictions and timings.

    For every ``(train_index, test_index)`` pair produced by
    ``train_test_split(len(data))``: missing values are filled through
    ``fill_missing_median(df, train_index)``, the features are standardized
    with statistics from the training rows, and ``model`` produces the
    predictions for the test rows.

    Parameters:
        data: DataFrame holding feature and target columns.
        classifier: forwarded to ``model``.
        x_cols: feature column names.
        y_col: target column name.
        **kwargs: forwarded to ``model``.

    Returns:
        ``(y, y_pred, y_pred_prob, avgtime_train, avgtime_test)``. The last
        two are the per-fold averages of the times reported by ``model``
        (0.0 when there are no folds).
    """
    # Do train-test split for cross-validation
    size = len(data)
    kf = train_test_split(size)
    y_pred = np.zeros(size)
    y_pred_prob = np.zeros(size)
    # `.values` and builtin `float` replace `.as_matrix()` and `np.float`,
    # both of which were removed from current pandas / NumPy.
    y = data[y_col].values.astype(float)

    totaltime_train = 0
    totaltime_test = 0
    # Folds are counted while iterating so `kf` may be any iterable,
    # including a generator without len().
    n_folds = 0
    for train_index, test_index in kf:
        n_folds += 1

        # Fill in missing values
        df = fill_missing_median(data.copy(), train_index)

        # Transform and normalize. The scaler is fitted on the training rows
        # only, matching the train-only imputation above, so no test-row
        # statistics leak into the features.
        X = df[x_cols].values.astype(float)
        scaler = StandardScaler()
        scaler.fit(X[train_index])
        X = scaler.transform(X)

        # Build classifier and yield predictions
        (y_pred[test_index], y_pred_prob[test_index],
         train_time, test_time) = model(X, y, train_index, test_index,
                                        classifier, **kwargs)
        totaltime_train += train_time
        totaltime_test += test_time

    if n_folds == 0:
        return y, y_pred, y_pred_prob, 0.0, 0.0

    # Average the accumulated totals. The previous code divided only the
    # last fold's times by the fold count.
    avgtime_train = totaltime_train / float(n_folds)
    avgtime_test = totaltime_test / float(n_folds)
    return y, y_pred, y_pred_prob, avgtime_train, avgtime_test
示例15: linregress
def linregress(X_train, X_test, y_train, y_test):
    """Plot the single-feature linear-regression coefficient of every column.

    Each column of ``X_train`` is standardized on its own and regressed
    against ``y_train``. The coefficients are sorted in descending order,
    plotted as a line, saved to ``images/coefficients.png`` and shown. The
    tick labels are printed afterwards.

    Parameters:
        X_train: DataFrame of training features.
        X_test: unused; kept for interface compatibility.
        y_train: training target.
        y_test: unused; kept for interface compatibility.

    Returns:
        None.
    """
    coef = []
    for col in X_train.columns.tolist():
        # StandardScaler and LinearRegression expect a 2-D (n_samples, 1)
        # matrix, so the column is reshaped before scaling.
        column = X_train[col].values.astype(float).reshape(-1, 1)
        X = StandardScaler().fit_transform(column)
        lr = LinearRegression()
        lr.fit(X, y_train)
        # Store a plain float so the sort key is a scalar, not an array.
        coef.append([col, float(lr.coef_.ravel()[0])])

    # Ascending sort followed by a reversal (same ordering as before).
    coef = sorted(coef, key=lambda x: x[1])[::-1]

    # Build the label and value lists in one pass. The earlier version
    # popped from `labs` while iterating over it, which skips the element
    # right after 'Intercept'.
    display_names = {'doubles': '2B', 'triples': '3B'}
    labs = []
    nos = []
    for lab, value in coef:
        if lab == 'Intercept':
            continue
        labs.append(display_names.get(lab, lab).upper())
        nos.append(value)

    x = list(range(len(nos)))
    plt.plot(x, nos, lw=2, c='b')
    plt.xticks(x, labs)
    plt.title('Linear Regression Coefficients (Win Percentage)')
    plt.savefig('images/coefficients.png')
    plt.show()
    print(labs)