当前位置: 首页>>代码示例>>Python>>正文


Python Imputer.transform方法代码示例

本文整理汇总了Python中sklearn.preprocessing.Imputer.transform方法的典型用法代码示例。如果您正苦于以下问题:Python Imputer.transform方法的具体用法?Python Imputer.transform怎么用?Python Imputer.transform使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在sklearn.preprocessing.Imputer的用法示例。


在下文中一共展示了Imputer.transform方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: eval_func

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def eval_func(chromosome):
    """GA fitness function: decode hyper-parameters from `chromosome`, impute
    missing values, engineer correlation-based features, train an XGBoost
    model and return its best score.

    NOTE(review): reads and mutates module-level state (train, test, features,
    correlation_p, grid_search_pd, ...) in place -- not re-entrant.
    """
    t_par = chromosome.getInternalList()
    print("## Start with Individual : " + str(t_par))

    # Unpack the individual's genes into named hyper-parameters.
    eta                 = t_par[0]
    max_depth           = t_par[1]
    subsample           = t_par[2]
    colsample_bytree    = t_par[3]
    n_estimators        = t_par[4]
    test_size           = t_par[5]
    imp_start           = t_par[6]
    num_of_feat_corr    = t_par[7]


    print("## Filling missing data")
    # imp_start is the imputation-strategy gene (e.g. 'mean'/'median').
    imp = Imputer(missing_values='NaN', strategy=imp_start, axis=0)
    imp.fit(train[features])
    train[features] = imp.transform(train[features])
    test[features] = imp.transform(test[features])

    curr_features = copy.deepcopy(features)

    print("## Creating Random features based on Correlation")
    # Correlation of each feature with the target column, sorted ascending.
    output_cor = correlation_p[output_col_name].sort_values()

    most_neg_cor = list(output_cor.index[0:num_of_feat_corr].ravel())
    # Takes the top positive correlations, excluding the final two index
    # entries -- presumably the target/id columns; TODO confirm.
    most_pos_cor = list(output_cor.index[(-2-num_of_feat_corr):-2].ravel())

    # Sum consecutive pairs of correlated features into new combined columns.
    for f1, f2 in pairwise(most_neg_cor):
        train[f1 + "_" + f2] = train[f1] + train[f2]
        test[f1 + "_" + f2] = test[f1] + test[f2]
        curr_features += [f1 + "_" + f2]

    for f1, f2 in pairwise(most_pos_cor):
        train[f1 + "_" + f2] = train[f1] + train[f2]
        test[f1 + "_" + f2] = test[f1] + test[f2]
        curr_features += [f1 + "_" + f2]


    params = {"objective": "binary:logistic",
              "eta": eta,
              "nthread":3,
              "max_depth": max_depth,
              "subsample": subsample,
              "colsample_bytree": colsample_bytree,
              "eval_metric": "logloss",
              "n_estimators": n_estimators,
              "silent": 1
              }
    num_boost_round = 10000
    test_size = test_size
    best_score = train_model(curr_features,params,num_boost_round,test_size)
    # Append this round's hyper-parameters and score to the search log.
    grid_search_pd.loc[len(grid_search_pd),grid_search_columns] = [eta,max_depth,subsample,colsample_bytree,n_estimators,test_size,imp_start,num_of_feat_corr,best_score]

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    print("########################## Round Time Stamp ==== " + timestamp)

    # Persist after every evaluation so progress survives a crash.
    grid_search_pd.to_csv(grid_search_file, index=False)

    return best_score
开发者ID:HighEnergyDataScientests,项目名称:bnpcompetition,代码行数:62,代码来源:bnp_model_evolving.py

示例2: bnp_svm

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def bnp_svm(train, test):
    """Train an RBF SVM on 1000 sampled training rows and return class
    probabilities for the test set.

    Only float64 columns are used; NaNs are replaced with column means
    learned from the training sample.
    """
    print('bnpsvm')
    mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)

    # NOTE: .sample() has no random_state, so results vary between runs.
    sampled = train.sample(1000)

    # Training features: float columns only, mean-imputed, as a float array.
    features = sampled.select_dtypes(include=['float64'])
    mean_imputer.fit(features)
    features = np.array(mean_imputer.transform(features)).astype(float)

    # Supervised target.
    labels = np.array(sampled['target']).astype(int)

    # Test features go through the same (already fitted) imputer.
    test_features = test.select_dtypes(include=['float64'])
    test_features = np.array(mean_imputer.transform(test_features)).astype(float)

    classifier = svm.SVC(gamma=0.001, C=100, probability=True)
    classifier.fit(features, labels)
    return classifier.predict_proba(test_features)


#print(bnp_svm(train, test))
开发者ID:debanjum,项目名称:KaggleBNP,代码行数:35,代码来源:svm.py

示例3: FeaturePreProcesser

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
class FeaturePreProcesser():
    """Feature pre-processor: mean-imputes NaNs, then standardizes.

    fit() learns the column means and the scaler statistics; transform()
    applies both steps to new data.
    """

    def __init__(self):
        pass

    def fit(self, X):
        """Fit the imputer and the scaler on X.

        Returns self so calls can be chained (sklearn convention).
        """
        self.imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        self.imputer.fit(X)
        # The scaler must be fitted on the imputed data, since that is what
        # transform() will feed it.
        X = self.imputer.transform(X)

        self.std_scaler = StandardScaler()
        self.std_scaler.fit(X)
        return self

    def fit_transform(self, X):
        """Fit on X and return the transformed X (fit + transform)."""
        # Previously duplicated fit()'s body verbatim; delegate instead.
        self.fit(X)
        return self.transform(X)

    def transform(self, X):
        """Impute then standardize X using the fitted statistics."""
        X = self.imputer.transform(X)
        X = self.std_scaler.transform(X)
        return X

示例4: fit

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
    def fit(self, train_x, train_y=None, is_norm=True):
        """Optionally min-max normalize the inputs, mean-impute NaNs row-wise,
        then fit the wrapped model.

        train_y, when given, is normalized with train_x's statistics and
        passed to the model as the supervised target.
        """
        # Min-max normalization: (x - min) / ptp.
        if is_norm:
            train_x_min = train_x.min(0)
            train_x_ptp = train_x.ptp(axis=0)

            # BUG FIX: the subtraction must be parenthesized; the original
            # computed x - (min / ptp) because division binds tighter.
            # NOTE(review): columns with ptp == 0 divide by zero -- confirm
            # upstream guarantees non-constant columns.
            train_x = (train_x.astype(float) - train_x_min) / train_x_ptp

            if np.any(train_y):
                train_y = (train_y.astype(float) - train_x_min) / train_x_ptp

        # axis=1: impute each row's NaNs with that row's mean.
        imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
        imp.fit(train_x)
        if np.isnan(train_x).any():
            log("Found {} NaN values in train_x, so try to transform them to 'mean'".format(np.isnan(train_x).sum()), WARN)
            train_x = imp.transform(train_x)

        if np.any(train_y) and np.isnan(train_y).any():
            log("Found {} NaN values in train_y, so try to transform them to 'mean'".format(np.isnan(train_y).sum()), WARN)
            train_y = imp.transform(train_y)

        # Supervised fit when a target is present, unsupervised otherwise.
        if np.any(train_y):
            self.model.fit(train_x, train_y)
        else:
            self.model.fit(train_x)
开发者ID:challenging,项目名称:kaggle,代码行数:27,代码来源:cluster.py

示例5: ImputeAndGetFinalTrainTestData

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def ImputeAndGetFinalTrainTestData(train, test):
    """Split `train` into features/labels, mean-impute the features, and
    impute `test` with the statistics learned from the training features.

    Returns (X_train, y_train, X_test).
    """
    features = train[:, :-1]
    labels = train[:, -1]
    mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    mean_imputer.fit(features)
    return (mean_imputer.transform(features),
            labels,
            mean_imputer.transform(test.as_matrix()))
开发者ID:AnushaC91,项目名称:Kaggle-Rain-Fall-Prediction,代码行数:10,代码来源:Modelling.py

示例6: _check_statistics

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
    def _check_statistics(self, X, X_true,
                          strategy, statistics, missing_values):
        """Utility function for testing imputation for a given strategy.

        Test:
            - along the two axes (axis=0 columns, axis=1 rows)
            - with dense and sparse arrays

        Check that:
            - the statistics (mean, median, mode) are correct
            - the missing values are imputed correctly"""

        # Lazily-formatted error message: strategy/missing_values filled now,
        # axis/sparse filled via .format() at each assertion site.
        err_msg = "Parameters: strategy = %s, missing_values = %s, " \
                  "axis = {0}, sparse = {1}" % (strategy, missing_values)

        # Normal matrix, axis = 0
        imputer = Imputer(missing_values, strategy=strategy, axis=0)
        X_trans = imputer.fit(X).transform(X.copy())
        assert_array_equal(imputer.statistics_, statistics,
                           err_msg.format(0, False))
        assert_array_equal(X_trans, X_true, err_msg.format(0, False))

        # Normal matrix, axis = 1 (row-wise imputation -> fit on X transposed)
        imputer = Imputer(missing_values, strategy=strategy, axis=1)
        imputer.fit(X.transpose())
        if np.isnan(statistics).any():
            # A row with no observed value cannot be imputed along axis=1.
            assert_raises(ValueError, imputer.transform, X.copy().transpose())
        else:
            X_trans = imputer.transform(X.copy().transpose())
            assert_array_equal(X_trans, X_true.transpose(),
                               err_msg.format(1, False))

        # Sparse matrix, axis = 0
        imputer = Imputer(missing_values, strategy=strategy, axis=0)
        imputer.fit(sparse.csc_matrix(X))
        X_trans = imputer.transform(sparse.csc_matrix(X.copy()))

        # Densify for comparison if the imputer kept the result sparse.
        if sparse.issparse(X_trans):
            X_trans = X_trans.toarray()

        assert_array_equal(imputer.statistics_, statistics,
                           err_msg.format(0, True))
        assert_array_equal(X_trans, X_true, err_msg.format(0, True))

        # Sparse matrix, axis = 1
        imputer = Imputer(missing_values, strategy=strategy, axis=1)
        imputer.fit(sparse.csc_matrix(X.transpose()))
        if np.isnan(statistics).any():
            assert_raises(ValueError, imputer.transform,
                          sparse.csc_matrix(X.copy().transpose()))
        else:
            X_trans = imputer.transform(sparse.csc_matrix(X.copy().transpose()))

            if sparse.issparse(X_trans):
                X_trans = X_trans.toarray()

            assert_array_equal(X_trans, X_true.transpose(),
                               err_msg.format(1, True))
开发者ID:Bryan-LL,项目名称:auto-sklearn,代码行数:60,代码来源:test_imputation.py

示例7: load_datasets

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def load_datasets(feature_paths, label_paths):
    """Read feature and label files and return them as arrays.

    Each feature file is expected to have 41 feature columns with column 0
    parsed as a datetime index; missing values are imputed with the column
    mean. Returns (feature, label) where feature is an (n, 41) array and
    label is a flat 1-D array.
    """
    # Accumulators: 0 rows, fixed widths (41 features, 1 label column).
    feature = np.ndarray(shape=(0, 41))
    label = np.ndarray(shape=(0, 1))
    for file in feature_paths:
        # Column 0 is parsed as a date and used as the row index.
        df = pd.read_csv(file, encoding='utf-8', parse_dates=[0], index_col=0)

        # BUG FIX: the original read into `data` but then called
        # imp.fit(df) on an undefined name `df`, raising NameError.
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(df)
        df = imp.transform(df)
        # Append this file's (imputed) features to the accumulator.
        feature = np.concatenate((feature, df))

    # Label files are plain headerless tables.
    for file in label_paths:
        df = pd.read_table(file, header=None)
        label = np.concatenate((label, df))
    # Flatten labels to a 1-D vector.
    label = np.ravel(label)
    return feature, label
开发者ID:HanKin2015,项目名称:ACM,代码行数:34,代码来源:机器学习标准模板.py

示例8: ImputeCategorical

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
class ImputeCategorical(BaseEstimator, TransformerMixin):
    """
    Encodes a specified list of columns or all columns if None.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.imputer = None

    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to impute.
        """
        # Default to every column when none were specified.
        if self.columns is None:
            self.columns = data.columns

        # A single imputer over the selected columns; 0 is treated as the
        # missing-value marker and replaced by the most frequent value.
        self.imputer = Imputer(missing_values=0, strategy='most_frequent')
        self.imputer.fit(data[self.columns])

        return self

    def transform(self, data):
        """
        Uses the encoders to transform a data frame.
        """
        # Work on a copy so the caller's frame is left untouched.
        result = data.copy()
        result[self.columns] = self.imputer.transform(result[self.columns])
        return result

示例9: imputed_data

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def imputed_data(df, colname, strategy="mean"):
    """Impute missing values in df[colname] in place using `strategy`.

    The column is reshaped to a 2-D (n, 1) array because Imputer expects
    2-D input.
    """
    from sklearn.preprocessing import Imputer
    imr = Imputer(missing_values="NaN", strategy=strategy, axis=0)
    # BUG FIX: pandas Series has no .reshape -- go through .values as the
    # transform line below already did.
    column = df[colname].values.reshape(-1, 1)
    imr = imr.fit(column)
    df[colname] = imr.transform(column)
    print("Data has been imputed to \"{}\"".format(colname))

示例10: trainSVM

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def trainSVM(x1, x2, kernel):
    """Train an SVM on two classes and return the best grid-searched model.

    x1: samples labeled +1; x2: samples labeled -1.
    kernel == 0 selects LinearSVC, anything else an RBF SVC with
    probability estimates. C is tuned by 5-fold GridSearchCV.
    """
    # Prepare data: list-of-lists so the classes can be concatenated.
    # list(map(...)) keeps this working on both Python 2 and 3 (bare map()
    # returns an iterator on Python 3 and cannot be added with +).
    x1 = list(map(list, x1))
    x2 = list(map(list, x2))

    X = x1 + x2
    # +1 labels for x1 rows, -1 for x2 rows, flattened to a vector.
    y1 = ones((shape(x1)[0], 1))
    y2 = -1 * ones((shape(x2)[0], 1))
    Y = ravel(list(y1) + list(y2))

    if (kernel == 0):
        svm = LinearSVC()                       # linear SVM
        params = {'C': [1, 10, 50, 100, 200, 300]}  # C weights misclassification
        grid = GridSearchCV(svm, params, cv=5)
    else:
        svm = SVC(probability=True)             # RBF SVM with probabilities
        params = {'C': [50, 100, 200, 300]}
        grid = GridSearchCV(svm, params, cv=5)

    # Replace NaNs with per-column means before training.
    # (A stray no-op `Imputer(...)` constructor call was removed here.)
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X)
    trainData = imp.transform(X)

    grid.fit(trainData, Y)  # run fit with all parameter combinations
    model = grid.best_estimator_
    return model
开发者ID:LoPoal,项目名称:bow-classification,代码行数:30,代码来源:mcv_svm.py

示例11: test

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def test():
    # Evaluate cost-sensitive weighted KNN on one UCI ARFF dataset (the one
    # matching the module-level DS constant). Python 2 code: print statements
    # and xrange below.
    vec = DictVectorizer()
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    for filename in glob.glob(r'../dataset/UCI/*.arff'):
        # Dataset name = file basename without extension.
        basename = re.sub(r'(\..*?)$','',os.path.basename(filename))
        print basename
        if basename != DS:
            continue
        # cost_matrix = pickle.load(open('../dataset/UCI/'+basename+'_cost_matrix.pkl', 'rb'))
        data = arff.loadarff(filename)[0]
        # Vectorize each row's features (all columns except the trailing label),
        # then mean-impute any NaNs.
        X = vec.fit_transform(np.array([{str(i):value for i,value in enumerate(list(row)[:-1])} for row in data])).toarray()
        imp.fit(X)
        X = imp.transform(X)
        labels = np.array([row[-1] for row in data])
        # Map string labels to integer class ids.
        # NOTE(review): the id assignment depends on set() iteration order,
        # so class ids are not stable across runs -- confirm this is intended.
        y = np.array([{v:k for k,v in enumerate(list(set(labels)))}[label] for label in labels])
        random = np.random.permutation(range(len(X)))
        print 'dataset ratio\t%s'%('\t'.join([alg+" "*(12-len(alg)) for alg in sorted(ALG.keys())]))
        for iteration in xrange(10):
            # Shuffle once, then 10-fold cross-validation.
            X, y, class_num, kf = X[random], y[random], set(labels), KFold(len(X), n_folds=10)
            for train, test in kf:
                # Keep 10% of the training fold labeled; the rest is unlabeled.
                length, train_size = len(train), 0.1
                X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
                X_label, X_unlabel, y_label, y_unlabel = train_test_split(X_train, y_train, test_size=1.0-train_size, random_state=0)
                for R in xrange(2,10):
                    # Cost matrix penalizes one error type R times more heavily.
                    ones_matrix, cost_matrix = np.array([[1,1],[1,1]]), np.array([[1,1],[R,R]])
                    # print "%s R=%d"%(basename,R),
                    cross_validation("%s R=%d"%(basename,R), X_label, X_unlabel, y_label, y_unlabel, ones_matrix, cost_matrix)
                # Stop after the first fold of the first iteration.
                exit()
开发者ID:qiangsiwei,项目名称:semi-supervied_learning,代码行数:30,代码来源:test_weight_KNN.py

示例12: impute_null_vals

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def impute_null_vals(df_train, df_test, feature_cols, strategy='mean'):
    '''
    Impute null values using strategy
    '''
    # Fit on the stacked train+test features so both splits share the same
    # imputation statistics.
    stacked = df_train[feature_cols].append(df_test[feature_cols])
    fitted_imputer = Imputer(
        missing_values='NaN', strategy=strategy, axis=0, verbose=0, copy=False
    ).fit(stacked)
    df_train[feature_cols] = fitted_imputer.transform(df_train[feature_cols])
    df_test[feature_cols] = fitted_imputer.transform(df_test[feature_cols])

    # Drop duplicated columns/rows introduced by the pipeline.
    df_train, df_test = remove_duplicates_const(df_train, df_test)

    return df_train, df_test
开发者ID:jjinking,项目名称:kaggle-santander,代码行数:18,代码来源:santander.py

示例13: dealDataSet

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def dealDataSet():
    """Load the train/test CSVs, mean-impute NaNs, and L2-normalize.

    Returns (train_x_normalized, train_y, test_x_normalized, result_id).
    """
    mean_imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

    train_x, l1 = loadDataSet('../data/train_x.csv')
    mean_imp.fit(np.array(train_x))
    imputed_train = mean_imp.transform(np.array(train_x))

    # Target column: second column of train_y.csv, header row skipped.
    train_y = np.genfromtxt('../data/train_y.csv', delimiter=',')[1:, 1]

    test_x, result_id = loadDataSet('../data/test_x.csv')
    # NOTE(review): the imputer is re-fitted on the test set, so train and
    # test are filled with different means -- confirm this is intended.
    mean_imp.fit(np.array(test_x))
    imputed_test = mean_imp.transform(np.array(test_x))

    # L2-normalize each sample.
    train_x_normalized = preprocessing.normalize(imputed_train, norm='l2')
    test_x_normalized = preprocessing.normalize(imputed_test, norm='l2')

    return train_x_normalized, train_y, test_x_normalized, result_id
开发者ID:sgbsysu,项目名称:---,代码行数:18,代码来源:dealData.py

示例14: preprocess_apply

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def preprocess_apply(data, missingvaluemethod, preprocessingmethods):
    """Impute missing values, then scale each column of `data`.

    missingvaluemethod selects the imputation strategy (mean / median /
    most_frequent, or NONE to skip); preprocessingmethods lists one scaling
    method per column. Returns the processed data as a numpy array (one-hot
    encoded columns may widen it).

    Raises:
        ValueError: if missingvaluemethod is not a recognized constant.
    """
    # Impute missing values.
    if missingvaluemethod != Constants.MISSING_VALUE_METHOD_NONE:
        strategies = {
            Constants.MISSING_VALUE_METHOD_MEAN: 'mean',
            Constants.MISSING_VALUE_METHOD_MEDIAN: 'median',
            Constants.MISSING_VALUE_METHOD_MOST_FREQUENT: 'most_frequent',
        }
        try:
            strategy = strategies[missingvaluemethod]
        except KeyError:
            # BUG FIX: an unknown method previously left `imp` unbound and
            # crashed with NameError; fail with an explicit error instead.
            raise ValueError(
                "unknown missing-value method: %r" % (missingvaluemethod,))
        imp = Imputer(missing_values='NaN', strategy=strategy, axis=0)
        imp.fit(data)
        data = imp.transform(data)
    else:
        data = np.asarray(data)

    # Scale each column independently, then stitch the columns back together.
    res = np.array([])
    for i in range(0, len(preprocessingmethods)):
        # Column i as a 2-D (n, 1) structure, as the scalers expect.
        field = [[x[i]] for x in data]
        if preprocessingmethods[i] == Constants.SCALING_METHOD_NONE:
            pass
        elif preprocessingmethods[i] == Constants.SCALING_METHOD_STANDARDIZATION:
            scaler = preprocessing.StandardScaler().fit(field)
            field = scaler.transform(field)
        elif preprocessingmethods[i] == Constants.SCALING_METHOD_MINMAX:
            field = preprocessing.MinMaxScaler().fit_transform(field)
        elif preprocessingmethods[i] == Constants.SCALING_METHOD_CATEGORICAL:
            enc = preprocessing.OneHotEncoder()
            enc.fit(field)
            field = enc.transform(field).toarray()

        if i == 0:
            res = field
        else:
            res = np.concatenate((res, field), axis=1)
    return res
开发者ID:drossegger,项目名称:ml-ex1,代码行数:37,代码来源:preprocess.py

示例15: run_main

# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import transform [as 别名]
def run_main(new_file, start, stop, dat):
    # For each CpG row of `dat` in lines [start, stop): impute missing
    # genotypes, run PCA on them, regress the methylation values on the PCs,
    # and append the residuals to `new_file`. Python 2 code (print statement
    # below).
    with open(new_file, 'a') as file:
        # axis=1 imputes row-wise with the most frequent value.
        # NOTE(review): imp is never fit(); the old sklearn axis=1 Imputer
        # computes its statistics inside transform(), which appears to be
        # relied upon here -- confirm against the sklearn version in use.
        imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=1)
        import itertools
        with open(dat, "r") as text_file:
            for line in itertools.islice(text_file, start, stop):
                # Input files mark missing data as "NA"; Imputer expects NaN.
                line            = line.replace("NA", "NaN")
                content         = line.rstrip('\n').split('\t')
                # First field is the CpG id; the rest are methylation values.
                CpG             = content.pop(0)
                flag, CpG_location    = get_location(CpG)
                # 'F' flags a CpG whose location could not be resolved.
                if flag == 'F':
                    continue
                genotype_matrix = get_genotypes(CpG_location)
                genotype_matrix = imp.transform(genotype_matrix)
                genotype_matrix = genotype_matrix.transpose()

                # Run PCA on the (samples x variants) genotype matrix.
                try:
                    PCA_matrix      = run_pca(genotype_matrix)
                except ValueError:
                    print "value error"
                    continue

                # Linear regression: methylation ~ genotype principal components.
                meth_values   = pd.Series(content, name="meth_val", dtype=float)
                model         = sm.OLS(meth_values, PCA_matrix)
                results       = model.fit()
                MethValResids = results.resid
                # Output row: CpG id followed by the residuals, tab-separated.
                final         = pd.Series(CpG)
                final         = final.append(MethValResids)
                fline         = final.tolist()
                fline         = '\t'.join(str(x) for x in fline)
                fline         = fline + "\n"
                file.write(fline)


注:本文中的sklearn.preprocessing.Imputer.transform方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。