本文整理汇总了Python中sklearn.preprocessing.Imputer.fit方法的典型用法代码示例。如果您正苦于以下问题:Python Imputer.fit方法的具体用法?Python Imputer.fit怎么用?Python Imputer.fit使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.preprocessing.Imputer的用法示例。
在下文中一共展示了Imputer.fit方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: load_datasets
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def load_datasets(feature_paths, label_paths):
    """Load feature and label files, imputing missing feature values.

    Parameters
    ----------
    feature_paths : iterable of str
        Paths to comma-separated feature files; '?' marks a missing value
        and the files have no header row.  Each file has 41 feature columns.
    label_paths : iterable of str
        Paths to single-column label files with no missing values.

    Returns
    -------
    tuple
        (feature, label) where feature has shape (n, 41) and label is a
        flat vector of length n.
    """
    feature = np.ndarray(shape=(0, 41))
    label = np.ndarray(shape=(0, 1))
    for file in feature_paths:
        # Read one feature file: comma separated, '?' is the missing-value
        # marker, no header row.
        df = pd.read_table(file, delimiter=',', na_values='?', header=None)
        # Impute missing entries with the column mean: fit() trains the
        # preprocessor, transform() produces the imputed result.
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(df)
        df = imp.transform(df)
        # Append the imputed block to the accumulated feature matrix.
        feature = np.concatenate((feature, df))
    for file in label_paths:
        # Label files contain no missing values; append them directly.
        df = pd.read_table(file, header=None)
        label = np.concatenate((label, df))
    # Flatten labels to a 1-D vector.
    label = np.ravel(label)
    return feature, label
示例2: bnp_svm
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def bnp_svm(train, test):
    """Train an RBF SVM on the float columns of *train* and return class
    probabilities for *test*.

    Missing values in the float columns are replaced by the column mean;
    the imputer is fit on the training sample and reused on the test set.

    Parameters
    ----------
    train : DataFrame with a 'target' column and float64 feature columns.
    test : DataFrame with the same float64 feature columns.

    Returns
    -------
    ndarray of per-class probabilities from SVC.predict_proba.
    """
    print('bnpsvm')
    # If a value is missing, set it to the column average.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    # Work on a random subsample to keep SVM training time reasonable.
    train = train.sample(1000)
    # Training matrix: float columns only, imputed, as a float array.
    train1 = train.select_dtypes(include=['float64'])
    imp.fit(train1)
    train1 = imp.transform(train1)
    train1 = np.array(train1).astype(float)
    # Target vector for the sampled rows.
    target = np.array(train['target']).astype(int)
    # Test matrix, imputed with the statistics learned on the training data.
    test1 = test.select_dtypes(include=['float64'])
    test1 = imp.transform(test1)
    test1 = np.array(test1).astype(float)
    clf = svm.SVC(gamma=0.001, C=100, probability=True)
    clf.fit(train1, target)
    yhat = clf.predict_proba(test1)
    return yhat
#print(bnp_svm(train, test))
示例3: data_preprocessing_descriptive
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def data_preprocessing_descriptive(Extracted_Features, Coma_Features, Corrected_Features):
    """Mean-impute per-file morphology features and write corrected files.

    For every non-hidden file in Coma_Features with at least two data rows,
    replaces missing ('nan') feature values with the column mean and writes
    a tab-separated file to Corrected_Features with the original class
    label (column 0), neuron name (column 1) and feature-name header row
    preserved.

    Parameters
    ----------
    Extracted_Features, Coma_Features, Corrected_Features : str
        Directory paths (with trailing separator) used by the pipeline.
    """
    lvltrace.lvltrace("LVLEntree dans data_preprocessing_descriptive dans preproc_descriptive")
    tools.separate_coma(Extracted_Features, Coma_Features)
    for root, dirs, files in os.walk(Coma_Features):
        for i in files:
            if not i.startswith('.'):
                input_i = Coma_Features + i
                output_i = Corrected_Features + i
                lines = tools.file_lines(input_i)
                ncol = tools.file_col(input_i)
                if lines >= 2:
                    # Numeric view for imputation: skip the header row and
                    # the first two (label / name) columns.
                    data = np.genfromtxt(input_i, delimiter=',')
                    X = data[1:, 2:]
                    # Raw (typed) view read once; the original re-read the
                    # same file three times for y, z and w.
                    raw = np.genfromtxt(input_i, delimiter=',', dtype=None)
                    y = raw[:, 0]   # class labels (column 0)
                    z = raw[:, 1]   # neuron names (column 1)
                    w = raw[0, :]   # feature names (header row)
                    # Replace missing 'nan' values by the column mean.
                    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
                    imp.fit(X)
                    Y = imp.transform(X)
                    # 'with' guarantees the output file is closed; the
                    # original opened it as 'file' (shadowing the builtin)
                    # and closed it in a branch where it could be unbound.
                    with open(output_i, "w") as out:
                        for line in range(Y.shape[0] + 1):
                            for colonne in range(Y.shape[1] + 2):
                                if colonne == 0:
                                    out.write("%s\t" % y[line])
                                elif colonne == 1:
                                    out.write("%s\t" % z[line])
                                elif line == 0:
                                    # Header row: feature names.
                                    out.write("%s\t" % w[colonne])
                                else:
                                    # Imputed value; offset by the header
                                    # row and the two leading columns.
                                    out.write("%f\t" % Y[line - 1, colonne - 2])
                            out.write("\n")
                else:
                    print("Only one morphology !!!")
    lvltrace.lvltrace("LVLSortie dans data_preprocessing_descriptive dans preproc_descriptive")
开发者ID:xaviervasques,项目名称:Neuron_Morpho_Classification_ML,代码行数:62,代码来源:data_preprocessing_descriptive.py
示例4: clf_fit_transform
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def clf_fit_transform(self):
    """Load self.dataset, impute missing values, encode labels, split
    train/test and standardise the feature matrices.

    Side effects: sets self.df, self.X_train, self.X_test, self.y_train,
    self.y_test, self.stdsc, self.X_train_std and self.X_test_std.
    """
    # Import the dataset; '?' marks missing values.
    self.df = pd.read_csv(self.dataset, na_values=["?"])
    # Clean the dataset with mean imputation (median / most_frequent are
    # alternative strategies).
    imr = Imputer(missing_values='NaN', strategy='mean', axis=0, copy=False)
    imr.fit(self.df)
    X_imputed_df = pd.DataFrame(imr.transform(self.df.values), columns=self.df.columns)
    # Drop the identifier column; 'class' is the target.
    X_imputed_df.drop(['id'], 1, inplace=True)
    X = np.array(X_imputed_df.drop(['class'], 1))
    y = np.array(X_imputed_df['class'])
    le = LabelEncoder()
    y = le.fit_transform(y)
    # Hold out 20% of the rows for testing.
    self.X_train, self.X_test, self.y_train, self.y_test = cross_validation.train_test_split(X, y, test_size=0.2, random_state=0)
    # Standardise: fit the scaler on the training data only, then apply
    # the learned scaling to the test data.
    self.stdsc = StandardScaler()
    self.X_train_std = self.stdsc.fit_transform(self.X_train)
    self.X_test_std = self.stdsc.transform(self.X_test)
示例5: test
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def test():
    """Run the cost-sensitive cross-validation experiment on the UCI
    dataset whose basename matches the module-level constant DS.

    Vectorises the ARFF rows, mean-imputes missing values, then for ten
    shuffles runs 10-fold CV with varying cost ratio R.  Exits the
    process after the first matching dataset.
    """
    vec = DictVectorizer()
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    for filename in glob.glob(r'../dataset/UCI/*.arff'):
        basename = re.sub(r'(\..*?)$', '', os.path.basename(filename))
        print(basename)
        if basename != DS:
            continue
        data = arff.loadarff(filename)[0]
        # One dict per row (all columns but the label), vectorised to a
        # dense numeric matrix.
        X = vec.fit_transform(np.array([{str(i): value for i, value in enumerate(list(row)[:-1])} for row in data])).toarray()
        imp.fit(X)
        X = imp.transform(X)
        labels = np.array([row[-1] for row in data])
        # Map each distinct label string to an integer class id.
        y = np.array([{v: k for k, v in enumerate(list(set(labels)))}[label] for label in labels])
        random = np.random.permutation(range(len(X)))
        print('dataset ratio\t%s' % ('\t'.join([alg + " " * (12 - len(alg)) for alg in sorted(ALG.keys())])))
        for iteration in range(10):
            X, y, class_num, kf = X[random], y[random], set(labels), KFold(len(X), n_folds=10)
            for train, test in kf:
                length, train_size = len(train), 0.1
                X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
                # Keep 10% labelled; the rest is treated as unlabelled.
                X_label, X_unlabel, y_label, y_unlabel = train_test_split(X_train, y_train, test_size=1.0 - train_size, random_state=0)
                for R in range(2, 10):
                    # Uniform cost matrix vs. one penalising the second
                    # class R times more.
                    ones_matrix, cost_matrix = np.array([[1, 1], [1, 1]]), np.array([[1, 1], [R, R]])
                    cross_validation("%s R=%d" % (basename, R), X_label, X_unlabel, y_label, y_unlabel, ones_matrix, cost_matrix)
        exit()
示例6: trainSVM
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def trainSVM(x1, x2, kernel):
    """Train an SVM separating x1 (label +1) from x2 (label -1).

    kernel == 0 selects a linear SVM, anything else an RBF SVM with
    probability estimates; in both cases C is tuned by 5-fold grid
    search.  Missing values are replaced by the column mean.

    Returns
    -------
    The best estimator found by the grid search.
    """
    # Prepare data: materialise rows as lists (the original used bare
    # map(), which returns a non-concatenable iterator on Python 3).
    x1 = [list(row) for row in x1]
    x2 = [list(row) for row in x2]
    X = x1 + x2
    y1 = ones((shape(x1)[0], 1))
    y2 = -1 * ones((shape(x2)[0], 1))
    Y = ravel(list(y1) + list(y2))
    if kernel == 0:
        # Linear SVM; C increases the penalty for misclassified points.
        svm = LinearSVC()
        params = {'C': [1, 10, 50, 100, 200, 300]}
    else:
        # RBF SVM with probability estimates enabled.
        svm = SVC(probability=True)
        params = {'C': [50, 100, 200, 300]}
    grid = GridSearchCV(svm, params, cv=5)
    # Replace missing values with the column mean before training.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X)
    trainData = imp.transform(X)
    # Run fit with all parameter combinations and keep the best model.
    grid.fit(trainData, Y)
    return grid.best_estimator_
示例7: load_datasets
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def load_datasets(feature_paths, label_paths):
    """Read feature and label files and return them as arrays.

    Parameters
    ----------
    feature_paths : iterable of str
        Paths to utf-8 CSV files whose first column is a datetime index;
        each file contributes 41 feature columns.
    label_paths : iterable of str
        Paths to single-column label files with no missing values.

    Returns
    -------
    tuple
        (feature, label) where feature has shape (n, 41) and label is a
        flat vector of length n.
    """
    # Feature matrix has 41 columns; the label array has one.
    feature = np.ndarray(shape=(0, 41))
    label = np.ndarray(shape=(0, 1))
    for file in feature_paths:
        # read_csv: utf-8 encoding, parse column 0 as dates and use it as
        # the row index.
        data = pd.read_csv(file, encoding='utf-8', parse_dates=[0], index_col=0)
        # Impute missing values with the column mean: fit() trains the
        # preprocessor, transform() produces the imputed result.
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        # BUG FIX: the original called imp.fit(df)/imp.transform(df) on an
        # undefined name 'df'; the frame just read is 'data'.
        imp.fit(data)
        data = imp.transform(data)
        # Append the imputed block to the accumulated feature matrix.
        feature = np.concatenate((feature, data))
    # Read the label files.
    for file in label_paths:
        df = pd.read_table(file, header=None)
        label = np.concatenate((label, df))
    # Flatten labels to a 1-D vector.
    label = np.ravel(label)
    return feature, label
示例8: ImputeCategorical
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
class ImputeCategorical(BaseEstimator, TransformerMixin):
    """Impute a specified list of columns, or all columns if None, with
    the most frequent value; 0 is treated as the missing-value marker.
    """

    def __init__(self, columns=None):
        # columns: names to impute; resolved to all columns at fit time.
        self.columns = columns
        self.imputer = None

    def fit(self, data, target=None):
        """Fit the imputer on a data frame with named columns.

        Returns self so the estimator can be chained.
        """
        # Encode all columns if columns is None.
        if self.columns is None:
            self.columns = data.columns
        # One imputer covers every selected column; missing values are
        # encoded as 0 and replaced by the most frequent value.
        self.imputer = Imputer(missing_values=0, strategy='most_frequent')
        self.imputer.fit(data[self.columns])
        return self

    def transform(self, data):
        """Return a copy of *data* with the selected columns imputed."""
        output = data.copy()
        output[self.columns] = self.imputer.transform(output[self.columns])
        return output
示例9: eval_func
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def eval_func(chromosome):
    """Fitness function for the genetic hyper-parameter search.

    Decodes XGBoost hyper-parameters from *chromosome*, imputes missing
    data, derives new features from the most target-correlated pairs,
    trains a model and returns its best score.

    Relies on module-level globals: train, test, features, correlation_p,
    output_col_name, grid_search_pd, grid_search_columns,
    grid_search_file, train_model and pairwise.
    """
    t_par = chromosome.getInternalList()
    print("## Start with Individual : " + str(t_par))
    # Unpack the hyper-parameters encoded in the individual.
    eta = t_par[0]
    max_depth = t_par[1]
    subsample = t_par[2]
    colsample_bytree = t_par[3]
    n_estimators = t_par[4]
    test_size = t_par[5]
    imp_start = t_par[6]          # imputation strategy name
    num_of_feat_corr = t_par[7]   # how many correlated features to pair
    print("## Filling missing data")
    imp = Imputer(missing_values='NaN', strategy=imp_start, axis=0)
    imp.fit(train[features])
    train[features] = imp.transform(train[features])
    test[features] = imp.transform(test[features])
    curr_features = copy.deepcopy(features)
    print("## Creating Random features based on Correlation")
    # Features most negatively / positively correlated with the target
    # (the last two index entries are skipped by the -2 offset).
    output_cor = correlation_p[output_col_name].sort_values()
    most_neg_cor = list(output_cor.index[0:num_of_feat_corr].ravel())
    most_pos_cor = list(output_cor.index[(-2 - num_of_feat_corr):-2].ravel())
    # Sum adjacent correlated features into new combined features.
    for f1, f2 in pairwise(most_neg_cor):
        train[f1 + "_" + f2] = train[f1] + train[f2]
        test[f1 + "_" + f2] = test[f1] + test[f2]
        curr_features += [f1 + "_" + f2]
    for f1, f2 in pairwise(most_pos_cor):
        train[f1 + "_" + f2] = train[f1] + train[f2]
        test[f1 + "_" + f2] = test[f1] + test[f2]
        curr_features += [f1 + "_" + f2]
    params = {"objective": "binary:logistic",
              "eta": eta,
              "nthread": 3,
              "max_depth": max_depth,
              "subsample": subsample,
              "colsample_bytree": colsample_bytree,
              "eval_metric": "logloss",
              "n_estimators": n_estimators,
              "silent": 1,
              }
    num_boost_round = 10000
    best_score = train_model(curr_features, params, num_boost_round, test_size)
    # Record this round's parameters and score, then persist to CSV so a
    # crash does not lose the search history.
    grid_search_pd.loc[len(grid_search_pd), grid_search_columns] = [eta, max_depth, subsample, colsample_bytree, n_estimators, test_size, imp_start, num_of_feat_corr, best_score]
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    print("########################## Round Time Stamp ==== " + timestamp)
    grid_search_pd.to_csv(grid_search_file, index=False)
    return best_score
示例10: fit
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def fit(self, train_x, train_y=None, is_norm=True):
    """Optionally min-max normalise, mean-impute NaNs, then fit self.model.

    Parameters
    ----------
    train_x : ndarray of training features.
    train_y : optional ndarray; when truthy the model is fit supervised.
    is_norm : when True, scale inputs with train_x's min/ptp statistics.
    """
    # Normalisation.
    if is_norm:
        train_x_min = train_x.min(0)
        train_x_ptp = train_x.ptp(axis=0)
        # BUG FIX: the original computed `x - min / ptp`; division binds
        # tighter than subtraction, so min-max scaling must be
        # (x - min) / ptp.
        train_x = (train_x.astype(float) - train_x_min) / train_x_ptp
        if np.any(train_y):
            # NOTE(review): train_y is scaled with train_x's statistics —
            # presumably intentional (same feature space); confirm.
            train_y = (train_y.astype(float) - train_x_min) / train_x_ptp
    # Row-wise (axis=1) mean imputation for any remaining NaNs.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
    imp.fit(train_x)
    if np.isnan(train_x).any():
        log("Found {} NaN values in train_x, so try to transform them to 'mean'".format(np.isnan(train_x).sum()), WARN)
        train_x = imp.transform(train_x)
    if np.any(train_y) and np.isnan(train_y).any():
        log("Found {} NaN values in train_y, so try to transform them to 'mean'".format(np.isnan(train_y).sum()), WARN)
        train_y = imp.transform(train_y)
    # Supervised fit when labels are provided, unsupervised otherwise.
    if np.any(train_y):
        self.model.fit(train_x, train_y)
    else:
        self.model.fit(train_x)
示例11: preprocess_apply
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def preprocess_apply(data, missingvaluemethod, preprocessingmethods):
    """Impute missing values and scale each column of *data*.

    Parameters
    ----------
    data : 2-D array-like of samples x features.
    missingvaluemethod : one of the Constants.MISSING_VALUE_METHOD_* values.
    preprocessingmethods : per-column list of Constants.SCALING_METHOD_*.

    Returns
    -------
    ndarray with columns scaled (one-hot columns may expand into several).
    """
    # Impute missing values according to the requested strategy.
    if missingvaluemethod != Constants.MISSING_VALUE_METHOD_NONE:
        if missingvaluemethod == Constants.MISSING_VALUE_METHOD_MEAN:
            imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        elif missingvaluemethod == Constants.MISSING_VALUE_METHOD_MEDIAN:
            imp = Imputer(missing_values='NaN', strategy='median', axis=0)
        elif missingvaluemethod == Constants.MISSING_VALUE_METHOD_MOST_FREQUENT:
            imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
        imp.fit(data)
        data = imp.transform(data)
    else:
        data = np.asarray(data)
    # Scale each column independently with its configured method.
    res = np.array([])
    for i in range(0, len(preprocessingmethods)):
        # Column i as a list of single-element rows (sklearn wants 2-D).
        field = [[x[i]] for x in data]
        if preprocessingmethods[i] == Constants.SCALING_METHOD_NONE:
            pass
        elif preprocessingmethods[i] == Constants.SCALING_METHOD_STANDARDIZATION:
            scaler = preprocessing.StandardScaler().fit(field)
            field = scaler.transform(field)
        elif preprocessingmethods[i] == Constants.SCALING_METHOD_MINMAX:
            field = preprocessing.MinMaxScaler().fit_transform(field)
        elif preprocessingmethods[i] == Constants.SCALING_METHOD_CATEGORICAL:
            # One-hot encode; this may expand the column into several.
            enc = preprocessing.OneHotEncoder()
            enc.fit(field)
            field = enc.transform(field).toarray()
        # Accumulate processed columns side by side.
        if i == 0:
            res = field
        else:
            res = np.concatenate((res, field), axis=1)
    return res
示例12: FeaturePreProcesser
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
class FeaturePreProcesser():
    """Mean imputation followed by standard scaling.

    fit() learns both steps; transform() applies them; fit_transform()
    does both and returns the transformed matrix.
    """

    def __init__(self):
        pass

    def fit(self, X):
        """Fit the imputer and the scaler on X."""
        self.imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        self.imputer.fit(X)
        X = self.imputer.transform(X)
        self.std_scaler = StandardScaler()
        self.std_scaler.fit(X)

    def fit_transform(self, X):
        """Fit both steps on X and return the transformed X.

        The original duplicated the whole fit() body here; delegating
        keeps one copy of the fitting logic.
        """
        self.fit(X)
        return self.transform(X)

    def transform(self, X):
        """Apply the fitted imputer and scaler to X."""
        X = self.imputer.transform(X)
        X = self.std_scaler.transform(X)
        return X
示例13: __init__
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
class ImputerWrapper:
    """A simple wrapper around Imputer that also supports using zero to
    fill in missing values.

    If an entire column is NaN it gets filled with 0 first, to avoid
    Imputer silently removing the column.
    """

    def __init__(self, missing_values='NaN', strategy='zero', axis=0, verbose=0, copy=False):
        self.strategy = strategy
        self.imputer = None
        # The underlying Imputer only exists for non-'zero' strategies.
        if strategy != 'zero':
            self.imputer = Imputer(missing_values, strategy, axis, verbose, copy)

    def prepare(self, X):
        """Fill any all-NaN column of X with zeros, in place."""
        for j in range(X.shape[1]):
            all_nan = True
            for i in range(X.shape[0]):
                if not numpy.isnan(X[i][j]):
                    all_nan = False
                    break
            if all_nan:
                logging.info('column %d all nan, filling with 0' % j)
                for i in range(X.shape[0]):
                    X[i][j] = 0.0

    def fit(self, X, y=None):
        if self.strategy == 'zero':
            # Nothing to learn for zero filling.
            return self
        self.prepare(X)
        self.imputer.fit(X, y)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        if self.strategy == 'zero':
            # Replace every NaN with 0, in place.
            for i in range(X.shape[0]):
                for j in range(X.shape[1]):
                    if numpy.isnan(X[i][j]):
                        X[i][j] = 0.0
            return X
        self.prepare(X)
        return self.imputer.fit_transform(X, y, **fit_params)

    def get_params(self, deep=True):
        # NOTE(review): returning None here breaks the sklearn get_params
        # contract (callers expect a dict) — behaviour kept as-is.
        if self.strategy == 'zero':
            return None
        return self.imputer.get_params(deep)

    def set_params(self, **params):
        if self.strategy == 'zero':
            return self
        self.imputer.set_params(**params)
        return self

    def transform(self, X):
        if self.strategy == 'zero':
            for i in range(X.shape[0]):
                for j in range(X.shape[1]):
                    if numpy.isnan(X[i][j]):
                        X[i][j] = 0.0
            return X
        return self.imputer.transform(X)
示例14: imput_data
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def imput_data(data):
    """Mean-impute NaNs in each 2-D slice data[:, :, i] and standardise
    the last column of each slice, modifying *data* in place.

    Parameters
    ----------
    data : 3-D ndarray whose last axis indexes independent subsets.

    Returns
    -------
    The same (modified) array.
    """
    numSubsets = data.shape[-1]
    # One imputer object, refit per subset (hoisted out of the loop; the
    # original constructed a new Imputer every iteration).
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    for i in range(numSubsets):
        imp.fit(data[:, :, i])
        data[:, :, i] = imp.transform(data[:, :, i])
        # Scale the subset's last column to zero mean / unit variance.
        data[:, -1, i] = preprocessing.scale(data[:, -1, i])
    return data
示例15: my_imputer
# 需要导入模块: from sklearn.preprocessing import Imputer [as 别名]
# 或者: from sklearn.preprocessing.Imputer import fit [as 别名]
def my_imputer(name, strat, value):
    """Impute column *name* of the module-level DataFrame ``data`` in place.

    Parameters
    ----------
    name : column label to impute.
    strat : Imputer strategy ('mean', 'median', 'most_frequent').
    value : the missing-value marker passed to Imputer.  When it is 0 the
        column's NaNs are first filled with 0, so both the original and
        the filled zeros are then replaced according to *strat*.
    """
    if value == 0:
        data[name] = data[name].fillna(0)
    imp = Imputer(missing_values=value, strategy=strat, axis=0)
    # Imputer expects a 2-D array: reshape the column to one column.
    x = data[name]
    x = x.reshape(-1, 1)
    imp.fit(x)
    data[name] = imp.transform(x)