

Python StandardScaler.fit Method Code Examples

This article collects typical code examples of the sklearn.preprocessing.StandardScaler.fit method in Python. If you are wondering how StandardScaler.fit works, how to call it, or what real uses of it look like, the curated examples below may help. You can also explore further usage examples of the containing class, sklearn.preprocessing.StandardScaler.


The sections below show 15 code examples of the StandardScaler.fit method, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python examples.
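
Before the examples, a minimal sketch of what StandardScaler.fit actually does may help: fit learns the per-column mean and standard deviation of the input (exposed as mean_ and scale_), and transform then uses those statistics to standardize data. The toy array below is purely illustrative.

import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0, 10.0],
              [2.0, 20.0],
              [3.0, 30.0]])  # toy data, illustrative only

scaler = StandardScaler()
scaler.fit(X)                   # learn per-column mean and standard deviation
print(scaler.mean_)             # [ 2. 20.]
X_scaled = scaler.transform(X)  # each column now has zero mean and unit variance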

Example 1: rf2

# Required module import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def rf2():
    """
    Submission: rf2_0704_04.csv
    3000 trees
    E_val: 0.871431
    E_in: 0.999998
    E_out:
    30000 trees
    E_val:
    E_in:
    E_out:
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import RandomForestClassifier

    # dataset, logger, IO, Util and Path are project-level helpers from yakddcup2015.
    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rf = RandomForestClassifier(n_estimators=30000, oob_score=True, n_jobs=-1,
                                class_weight='auto', max_features='log2')
    rf.fit(X_scaled, y)

    logger.debug('Eval(oob): %f', rf.oob_score_)
    logger.debug('Ein: %f', Util.auc_score(rf, X_scaled, y))

    IO.cache(rf, Path.of_cache('rf.RandomForestClassifier.log2.pkl'))
    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('rf', rf)]), 'rf2_0704_04')
Developer ID: Divergent914, Project: yakddcup2015, Lines of code: 34, Source file: modeling.py
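
Example 1 reads rf.oob_score_ as its validation estimate (the E_val in the docstring). Out-of-bag scoring evaluates each training point using only the trees that did not sample it, giving a validation-like score without a held-out set. A minimal sketch, with toy data standing in for the real dataset:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

X = np.random.rand(200, 4)                 # toy features, illustrative only
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)  # toy labels

rf = RandomForestClassifier(n_estimators=100, oob_score=True, n_jobs=-1)
rf.fit(X, y)
print(rf.oob_score_)  # accuracy estimated on out-of-bag samples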

Example 2: process

# Required module import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
import numpy as np
# Imputer lives in sklearn.preprocessing in the older scikit-learn this project targets.
from sklearn.preprocessing import Imputer, OneHotEncoder, StandardScaler

def process(discrete, cont):
  # Create discrete and continuous data matrices
  discrete_X = np.array(discrete)
  cont_X = np.array(cont)

  # Impute discrete values
  imp = Imputer(strategy='most_frequent')
  discrete_X = imp.fit_transform(discrete_X)

  # Impute continuous values
  imp_c = Imputer(strategy='mean')
  cont_X = imp_c.fit_transform(cont_X)

  # Discrete basis representation
  enc = OneHotEncoder()
  enc.fit(discrete_X)
  discrete_X = enc.transform(discrete_X).toarray()

  # Continuous scaling
  scaler = StandardScaler()
  scaler.fit(cont_X)
  cont_X = scaler.transform(cont_X)

  # Merge to one array
  X = np.concatenate((discrete_X, cont_X), axis=1)
  return X
Developer ID: apnorton, Project: ml-project, Lines of code: 28, Source file: Preproc.py

Example 3: knn

# Required module import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

def knn(x_train, y_train, x_valid):
    x_train=np.log(x_train+1)
    x_valid=np.log(x_valid+1)

    where_are_nan = np.isnan(x_train)
    where_are_inf = np.isinf(x_train)
    x_train[where_are_nan] = 0
    x_train[where_are_inf] = 0
    where_are_nan = np.isnan(x_valid)
    where_are_inf = np.isinf(x_valid)
    x_valid[where_are_nan] = 0
    x_valid[where_are_inf] = 0

    scale=StandardScaler()
    scale.fit(x_train)
    x_train=scale.transform(x_train)
    x_valid=scale.transform(x_valid)

    #pca = PCA(n_components=10)
    #pca.fit(x_train)
    #x_train = pca.transform(x_train)
    #x_valid = pca.transform(x_valid)

    # stacking() is a project-specific helper defined elsewhere in this repo;
    # it produces out-of-fold predictions for model stacking.
    kneighbors=KNeighborsClassifier(n_neighbors=200,n_jobs=-1)
    knn_train, knn_test = stacking(kneighbors, x_train, y_train, x_valid, "knn")
    return knn_train, knn_test, "knn"
Developer ID: bifeng, Project: Rental-Listing-Inquiries, Lines of code: 28, Source file: stacking_util_scale_magic_add.py

Example 4: get_norm_nFoldData

# Required module import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
import numpy as np
from sklearn.preprocessing import StandardScaler

def get_norm_nFoldData(trainXY, testXY):
    trainX = trainXY[:,:-1]
    trainY = trainXY[:,-1]
    testX = testXY[:,:-1]
    testY = testXY[:,-1]

    # Standardise only the x values, not the labels.
    # Fit the scaler on the training data alone and reuse it for the test
    # data; re-fitting on the test set would leak test-set statistics.
    scaler = StandardScaler()
    scaler.fit(trainX)
    trainX = scaler.transform(trainX)
    testX = scaler.transform(testX)

    trainY = trainY.reshape((trainY.shape[0],1))
    testY = testY.reshape((testY.shape[0],1))
    train_X_Y = np.concatenate((trainX,trainY),axis=1)
    test_X_Y = np.concatenate((testX,testY),axis=1)

    folds_tr = []
    folds_te = []
    nfolds = 5
    for i in range(nfolds):
        xp = int(train_X_Y.shape[0]*.8)
        np.random.shuffle(train_X_Y)
        folds_tr.append(train_X_Y[:xp,:])
        folds_te.append(train_X_Y[xp:,:])
    return folds_tr, folds_te
Developer ID: deepak242424, Project: ml-temp, Lines of code: 30, Source file: 4_grid_search.py
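
Several examples on this page end by wrapping the fitted scaler and model in a Pipeline so that prediction applies the same scaling automatically. A minimal sketch of that pattern, with hypothetical arrays in place of real data:

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Hypothetical arrays standing in for a real dataset.
X_train = np.random.rand(100, 5)
y_train = np.random.randint(0, 2, 100)
X_test = np.random.rand(20, 5)

# The scaler is fit on training data only; predict() then applies the
# same transformation to new data automatically.
pipe = Pipeline([('scale', StandardScaler()), ('clf', LogisticRegression())])
pipe.fit(X_train, y_train)
predictions = pipe.predict(X_test)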

Example 5: load_data_csv_advanced

# Required module import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
import pandas as pd
from sklearn.preprocessing import StandardScaler
# sparsify_data is a project-specific helper defined elsewhere in this repo.

def load_data_csv_advanced(datafile):
    """
    Loads data from given CSV file. The first line in the given CSV file is expected to be the names of the columns.
    :param datafile: path of the file
    :return: a NumPy array containing a data point in each row
    """

    # Column-name constants for the CSV file. For example, setting _COLUMN_X to 'x' means that the x coordinates
    # of the geographical locations are expected in the column named 'x' in the CSV file.
    _COLUMN_X = 'x'
    _COLUMN_Y = 'y'

    data = pd.read_csv(datafile)

    # Normalize
    scaler = StandardScaler()
    scaler.fit(data[[_COLUMN_X, _COLUMN_Y]])
    data[[_COLUMN_X, _COLUMN_Y]] = scaler.transform(data[[_COLUMN_X, _COLUMN_Y]])

    #  Get feature vector names by removing "x" and "y"
    feature_vector_names = data.columns.difference([_COLUMN_X, _COLUMN_Y])
    data_coords = data[[_COLUMN_X, _COLUMN_Y]].values

    result = {"coordinates": data_coords}

    for feature in feature_vector_names:
        data_words = [[e.strip() for e in venue_data.split(",")] for venue_data in data[feature].values.flatten().tolist()]

        result[feature] = data_words

    return sparsify_data(result, None, None), scaler  # None for both params since SVD is not used
Developer ID: mmathioudakis, Project: geotopics, Lines of code: 33, Source file: io.py

Example 6: dbscan_outliers

# Required module import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

def dbscan_outliers(df):
    """
    Find outliers (noise points) using DBSCAN.

    Parameters
    ----------
    df: A pandas.DataFrame

    Returns
    -------
    A tuple of (a sklearn.DBSCAN instance, a pandas.DataFrame)
    """

    scaler = StandardScaler()
    scaler.fit(df)
    scaled = scaler.transform(df)

    # fit_predict fits DBSCAN and returns cluster labels; noise points get -1.
    dbs = DBSCAN()
    labels = dbs.fit_predict(scaled)

    # Keep only the rows that DBSCAN labelled as noise.
    df_o = df.iloc[np.flatnonzero(labels == -1)]

    return dbs, df_o
Developer ID: nwngeek212, Project: MachineLearningConcepts, Lines of code: 27, Source file: helper.py
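
Example 6 relies on DBSCAN labelling noise points with -1; the convention can be checked on a toy dataset. A minimal sketch:

import numpy as np
from sklearn.cluster import DBSCAN

points = np.array([[0.0, 0.0], [0.1, 0.0], [0.0, 0.1],
                   [10.0, 10.0]])  # three close points plus one outlier
labels = DBSCAN(eps=0.5, min_samples=2).fit_predict(points)
print(labels)  # [ 0  0  0 -1] -- the isolated point is noise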

Example 7: GPR

# Required module import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
import numpy as np
from sklearn.preprocessing import StandardScaler
# Kern is assumed to be GPy's kernel base class; this project appears to build on GPy.
from GPy.kern import Kern

class GPR(object):
    def __init__(self, X, y, kernel=None):
        self.X = X
        self.y = y

        self._noise_variance = 0.00001
        self._kernel = kernel
        self._scaler = StandardScaler(with_std=False)
        self._scaler.fit(self.y)
        self.y = self._scaler.transform(self.y)

        assert self._kernel is not None

    @property
    def noise_variance(self):
        return self._noise_variance

    @noise_variance.setter
    def noise_variance(self, value):
        self._noise_variance = value

    def predict(self, X_test):
        assert isinstance(self._kernel, Kern)

        # Gram matrices: K(X, X), K(X, X*) and K(X*, X*).
        K = self._kernel.K(self.X)
        K_star = self._kernel.K(self.X, X_test)
        K_star_star = self._kernel.K(X_test)

        # Standard GP regression via a Cholesky factorisation of K + noise * I:
        # posterior mean mu = K*^T (K + noise I)^{-1} y; posterior variance
        # s2 = diag(K**) minus the explained variance, plus the noise term.
        L = np.linalg.cholesky(K + self._noise_variance * np.eye(len(K)))
        Lk = np.linalg.solve(L, K_star)
        mu = np.dot(Lk.T, np.linalg.solve(L, self.y))
        s2 = np.diag(K_star_star) - np.sum(Lk ** 2, axis=0) + self._noise_variance

        # Add back the mean removed by the scaler (with_std=False, so only centering).
        return mu + self._scaler.mean_, s2
Developer ID: nikizh, Project: msc-project, Lines of code: 36, Source file: model.py

Example 8: data_processing

# Required module import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
from sklearn.preprocessing import LabelEncoder, StandardScaler

def data_processing(train,test,features):
    # train['StreetNo'] = train['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
    # test['StreetNo'] = test['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
    # train['Address'] = train['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
    # test['Address'] = test['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
    # train['hour'] = train['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12)
    # test['hour'] = test['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12)
    # train['dark'] = train['Dates'].apply(lambda x: 1 if (len(x) > 4 and int(x[11:13]) >= 18 and int(x[11:13]) < 6) else 0)
    # test['dark'] = test['Dates'].apply(lambda x: 1 if (len(x) > 4 and int(x[11:13]) >= 18 and int(x[11:13]) < 6) else 0)
    # features += ['hour','dark','StreetNo']

    print("Filling NAs")
    # print(train.mode())
    train = train.fillna(train.median().iloc[0])
    test = test.fillna(test.median().iloc[0])
    print("Label Encoder")
    le=LabelEncoder()
    for col in features:
        le.fit(list(train[col])+list(test[col]))
        train[col]=le.transform(train[col])
        test[col]=le.transform(test[col])

    # `target` is a module-level variable naming the label column.
    le.fit(list(train[target]))
    train[target]=le.transform(train[target])

    print("Standard Scalaer")
    scaler=StandardScaler()
    for col in features:
        scaler.fit(list(train[col]))
        train[col]=scaler.transform(train[col])
        test[col]=scaler.transform(test[col])

    return train,test,features
Developer ID: ssdf93, Project: kaggle, Lines of code: 35, Source file: xgboost_native.py

Example 9: __init__

# Required module import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
# Note: this snippet is a class method; pd, np, train_test_split and
# StandardScaler come from module-level imports in the source file.
    def __init__(self):
        """
        Constructs a SimulateData object.
        """

        # Read the simulated data.
        simulated = pd.read_csv("simulated.csv", index_col=0)
        predictors = np.asarray(simulated)[:, 0:-1]
        responses = np.asarray(simulated)[:, -1]

        # Divide the simulated data into training and test sets.
        predictors_training, predictors_test,\
        self.responses_training, self.responses_test =\
            train_test_split(predictors, responses, test_size=0.33)

        # Standardize the predictors, both training and test.
        scaler = StandardScaler()
        scaler.fit(predictors_training)
        self.predictors_training = scaler.transform(predictors_training)
        self.predictors_test = scaler.transform(predictors_test)

        # Keep track of the number of samples in the training and test sets,
        # and also the number of features.
        self.training_sample_count = len(self.responses_training)
        self.test_sample_count = len(self.responses_test)
        self.feature_count = np.size(predictors, 1)
        return None
Developer ID: garygr2002, Project: erasmus, Lines of code: 29, Source file: use_simulated_data.py

Example 10: lr_with_scale3

# Required module import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def lr_with_scale3():
    """
    Check the performance of normalizing TEST SET.

    Submission: lr_with_scale3_0707_04.csv
    E_val:
    E_in: 0.879233
    E_out: 0.8770121701777971

    Submission: lr_with_scale3_0712_01.csv
    E_val:
    E_in:
    E_out:
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import cross_val_score
    from sklearn.pipeline import Pipeline
    import numpy as np

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(np.r_[X, dataset.load_test()])
    X_scaled = raw_scaler.transform(X)

    clf = LogisticRegression(C=0.03, class_weight='auto')
    clf.fit(X_scaled, y)

    logger.debug('E_in: %f', Util.auc_score(clf, X_scaled, y))
    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('lr', clf)]), 'lr_with_scale3_0712_01')

    scores = cross_val_score(clf, X_scaled, y, scoring='roc_auc', n_jobs=-1)
    logger.debug('E_val: %f <- %s', np.average(scores), scores)
Developer ID: Divergent914, Project: yakddcup2015, Lines of code: 37, Source file: modeling.py

Example 11: load_data

# Required module import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
import numpy
import scipy.sparse
from sklearn.preprocessing import StandardScaler

def load_data(dataset, scale=False):
    ''' Loads the dataset

    :type dataset: string
    :param dataset: The folder in ../data/ containing the training/testing numpy arrays
    '''

    print('... loading data')
    path = "../data/" + dataset + "/"
    
    #training set
    trainingData = numpy.load(path + "training.data.npy") 
    trainingIndices = numpy.load(path + "training.indices.npy")
    trainingIndptr = numpy.load(path + "training.indptr.npy")
    training_y = numpy.load(path + "training.labels.npy")
    training_X = scipy.sparse.csr_matrix((trainingData, trainingIndices, trainingIndptr))

    #testing set
    testingData = numpy.load(path + "testing.data.npy") 
    testingIndices = numpy.load(path + "testing.indices.npy")
    testingIndptr = numpy.load(path + "testing.indptr.npy")
    testing_y = numpy.load(path + "testing.labels.npy")
    testing_X = scipy.sparse.csr_matrix((testingData, testingIndices, testingIndptr))

    # scale the data
    if scale:
        print("..training scaler")
        # with_mean=False keeps the CSR matrix sparse; centering would densify it.
        scaler = StandardScaler(with_mean=False)
        scaler.fit(training_X)
        print("..scaling features")
        training_X = scaler.transform(training_X)
        testing_X = scaler.transform(testing_X)
    
    return [(training_X, training_y),(testing_X, testing_y)]
Developer ID: uci-cbcl, Project: DeepCADD, Lines of code: 36, Source file: sklearn_CADD_mlp.py

Example 12: svc_appr

# Required module import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def svc_appr():
    """
    Best params: {'C': 0.022139881953014046}

    Submission:
    E_val:
    E_in:
    E_out:
    """
    from sklearn.svm import LinearSVC
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.grid_search import RandomizedSearchCV
    from scipy.stats import expon

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    svc = LinearSVC(dual=False, class_weight='auto')
    rs = RandomizedSearchCV(svc, n_iter=50, scoring='roc_auc', n_jobs=-1,
                            cv=StratifiedKFold(y, 5), verbose=2,
                            param_distributions={'C': expon()})
    rs.fit(X_scaled, y)

    logger.debug('Got best SVC.')
    logger.debug('Best params: %s', rs.best_params_)
    logger.debug('Grid scores:')
    for i, grid_score in enumerate(rs.grid_scores_):
        print('\t%s' % grid_score)
    logger.debug('Best score (E_val): %s', rs.best_score_)
    logger.debug('E_in: %f', Util.auc_score(rs, X_scaled, y))
Developer ID: Divergent914, Project: yakddcup2015, Lines of code: 37, Source file: modeling.py

Example 13: ada_boost_dt

# Required module import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def ada_boost_dt():
    """
    Submission: ada_boost_dt_0707_03.csv
    E_val: 0.854350
    E_in: 0.889561
    E_out: 0.8832315976033993
    """
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import cross_val_score
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    ab = AdaBoostClassifier(n_estimators=300)

    scores = cross_val_score(ab, X_scaled, y, cv=5, n_jobs=-1)
    logger.debug('CV: %s', scores)
    logger.debug('E_val: %f', sum(scores) / len(scores))

    ab.fit(X_scaled, y)

    logger.debug('E_in: %f', Util.auc_score(ab, X_scaled, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('ab', ab)]), 'ada_boost_dt_0707_03')
Developer ID: Divergent914, Project: yakddcup2015, Lines of code: 32, Source file: modeling.py

Example 14: bagging_lr

# Required module import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def bagging_lr():
    """
    Submission: bagging_lr_0707_02.csv
    E_val:
    E_in:
    E_out:
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import BaggingClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    bag = BaggingClassifier(LogisticRegression(class_weight='auto'),
                            n_estimators=3000, oob_score=True, n_jobs=-1,
                            verbose=2)
    # Fit the ensemble before reading oob_score_ (it only exists after fitting).
    bag.fit(X_scaled, y)

    logger.debug('E_val (oob): %f', bag.oob_score_)
    logger.debug('E_in: %f', Util.auc_score(bag, X_scaled, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('bag', bag)]), 'bagging_lr_0707_02')
Developer ID: Divergent914, Project: yakddcup2015, Lines of code: 29, Source file: modeling.py

Example 15: sgc_test

# Required module import: from sklearn.preprocessing import StandardScaler [as alias]
# Or: from sklearn.preprocessing.StandardScaler import fit [as alias]
def sgc_test(X, y, weight):
    from sklearn.linear_model import SGDClassifier
    from sklearn import cross_validation
    from sklearn.metrics import confusion_matrix
    from sklearn.preprocessing import StandardScaler

    for i in range(0,1):
        X_train, X_test, y_train, y_test, weight_train, weight_test = cross_validation.train_test_split(
            X, y, weight, test_size=0.2, random_state=0)
        clf = SGDClassifier(loss="hinge", n_iter=100, n_jobs=-1, penalty="l2")
        #clf = LogisticRegression( max_iter=100)

        scaler = StandardScaler(with_mean=False)
        scaler.fit(X_train)  # Don't cheat - fit only on training data
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)  # apply same transformation to test data

        clf.fit(X_train, y_train, sample_weight=weight_train)

        y_pred = clf.predict(X_train)
        #print(confusion_matrix(y_train, y_pred))
        print(clf.score(X_train,y_train,weight_train))

        y_pred = clf.predict(X_test)

        #print(confusion_matrix(y_test, y_pred))
        print(clf.score(X_test,y_test,weight_test))
Developer ID: organization-lab, Project: weibo-predict, Lines of code: 29, Source file: regressor.py


Note: The sklearn.preprocessing.StandardScaler.fit examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their authors; copyright in the source code remains with the original authors. Consult each project's license before redistributing or using the code. Do not reproduce this article without permission.