本文整理汇总了Python中sklearn.preprocessing.Imputer类的典型用法代码示例。如果您正苦于以下问题:Python Imputer类的具体用法?Python Imputer怎么用?Python Imputer使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Imputer类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: data_organizer
def data_organizer(instances, outcomes):
    """Drop instances whose outcome lacks a GPA, then median-impute
    remaining missing feature values.

    Parameters
    ----------
    instances : iterable of feature sequences
    outcomes : iterable of (u1, u2, gpa) triples

    Returns
    -------
    (instances, outcomes) : imputed feature matrix (from the imputer)
        and the filtered outcome rows.
    """
    # Remove instances without GPA data (gpa is the third outcome field).
    kept_instances = []
    kept_outcomes = []
    for instance, outcome in zip(instances, outcomes):
        _, _, gpa = outcome
        if not math.isnan(gpa):
            kept_instances.append(list(instance))
            kept_outcomes.append(list(outcome))
    # Fill in remaining NaN feature values with the column median.
    # NOTE(review): sklearn's Imputer is deprecated (removed in 0.22);
    # SimpleImputer is the modern replacement.
    bandaid = Imputer(strategy='median')
    imputed = bandaid.fit_transform(kept_instances)
    return imputed, kept_outcomes
示例2: impute_and_scale
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.
    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """
    # Columns that are entirely missing cannot be imputed; drop them first.
    df = df.dropna(axis=1, how='all')

    mat = Imputer(strategy='mean', axis=0).fit_transform(df)

    # No scaling requested: return the imputed values unchanged.
    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    scaled = scaler.fit_transform(mat)
    return pd.DataFrame(scaled, columns=df.columns)
示例3: run_whole_video
def run_whole_video(exp_folder, lims_ID):
    """Extract per-frame feature rows (wheel sample, processed frame,
    optical-flow magnitudes and angles) for a whole movie and store
    them in an HDF5 file named ``data_<lims_ID>.h5``.
    """
    # Initializes video pointer for video of interest based on lims ID.
    file_string = get_file_string(exp_folder, lims_ID)
    video_pointer = cv2.VideoCapture(file_string)

    # Import wheel data and find the first valid (non-NaN) sample.
    wheel = joblib.load('dxds2.pkl')
    first_non_nan = next(x for x in wheel if not isnan(x))
    first_index = np.where(wheel == first_non_nan)[0]
    k = first_index[0]

    # Mean-impute remaining NaNs, then rescale wheel values to [-1, 1].
    imp = Imputer(missing_values='NaN', strategy='mean')
    wheel = imp.fit_transform(wheel)
    wheel = preprocessing.MinMaxScaler((-1, 1)).fit(wheel).transform(wheel)

    # self.video_pointer.set(1, 41000)
    ret, frame = video_pointer.read()

    # Crops and converts frame into desired format (grayscale ROI).
    frame = cv2.cvtColor(frame[160:400, 100:640], cv2.COLOR_BGR2GRAY)
    prvs = frame
    nex = frame

    # Initialize vectors to keep track of data.
    count = 0
    mod = 0
    opticals = []
    angles = []
    frames = []

    # Length of movie (total frame count).
    limit = int(video_pointer.get(cv2.cv.CV_CAP_PROP_FRAME_COUNT))

    # Create hdf file with one 4321-wide feature row per frame.
    hf = h5py.File('data_' + str(lims_ID) + '.h5', 'w')
    g = hf.create_group('feature space')
    vector = np.zeros((limit, 4321))
    table = g.create_dataset('features', data=vector, shape=(limit, 4321))

    # BUG FIX: the loop previously ran `while count <= limit`, which
    # wrote table[limit] (out of bounds for shape (limit, 4321)) and
    # read one frame past the end of the video.
    while count < limit:
        prvs = nex
        frames = process_input(prvs)

        ret, frame = video_pointer.read()
        nex = cv2.cvtColor(frame[160:400, 100:640], cv2.COLOR_BGR2GRAY)

        optical = optical_flow(prvs, nex)
        opticals = optical['mag']
        angles = optical['ang']

        # NOTE(review): k never advances, so every row stores the same
        # wheel sample wheel[k] — confirm this is intended.
        vector_data = np.concatenate((np.reshape(wheel[k], (1)), frames, opticals, angles))
        table[count, :] = vector_data

        count += 1
        if count % 1000 == 0:
            print(count)

    # BUG FIX: close the HDF5 file so buffered data is flushed to disk.
    hf.close()
示例4: preprocess
def preprocess(data):
    """Build an imputed numeric sample matrix from the raw dataframe.

    Only the 'non_sparse_only' path is implemented: one-hot encodes the
    nominal columns, coerces the ordinal columns to floats, drops the
    empty 'weatherVar115' feature, and mean-imputes missing values.
    """
    non_sparse_only = True
    use_all_category_only = False
    use_all_impute_mean_mode = False

    if non_sparse_only:
        # One-hot encode the nominal variables.
        nominal_samples = data.ix[:, ['var4', 'dummy']]
        onehot_samples = onehot.transform(nominal_samples, ['var4', 'dummy'])
        onehot_samples = pd.DataFrame(onehot_samples.toarray())

        numbered_samples = data.ix[:, ['var7', 'var8', 'var10', 'var11', 'var13', 'var15', 'var17']]
        # (var7 and 8 are ordinal, converting to floats which includes NaNs
        # will allow mean imputing of missing values)
        numbered_samples[['var7', 'var8']] = numbered_samples[['var7', 'var8']].convert_objects(convert_numeric=True)

        other_samples = data.ix[:, 'crimeVar1':'weatherVar236']  # all the continuous vars
        other_samples = other_samples.drop(['weatherVar115'], axis=1)  # nothing in this feature

        # Combine w/ the cleaned up other vars, then mean-impute NaNs.
        samples = pd.concat([onehot_samples, numbered_samples, other_samples], axis=1)
        imp_nan = Imputer(missing_values=np.nan, strategy='mean', axis=0)
        samples_imp = imp_nan.fit_transform(samples)

    # BUG FIX: the bare name `todo` raised NameError at runtime; raise an
    # explicit NotImplementedError for the unimplemented branches instead.
    if use_all_category_only:
        raise NotImplementedError('use_all_category_only path not implemented')
    if use_all_impute_mean_mode:
        raise NotImplementedError('use_all_impute_mean_mode path not implemented')

    return samples_imp
示例5: learn
def learn():
    """Train the global `classifier` on rows of the file `INPUT` whose
    label is nonzero.

    The input file is space-delimited; column 1 holds the label and
    columns 2-53 the features. Missing feature values are mean-imputed.
    """
    global classifier, INPUT
    print(1)
    data = np.genfromtxt(INPUT, delimiter=' ', dtype='f8')
    np.random.shuffle(data)
    n = len(data)
    y = data[:, 1]
    # BUG FIX / idiom: `data[:][:, range(2, 54)]` copied the whole array
    # for nothing; a plain slice selects the same feature columns.
    x = data[:, 2:54]
    train_x = []
    train_y = []
    print(2)
    # Replace NaNs with the per-column mean before training.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    x = imp.fit_transform(x)
    print(3)
    # Rows labeled 0 are skipped; train on everything else.
    for i in range(0, n):
        if y[i] == 0:
            continue
        train_x.append(x[i])
        train_y.append(y[i])
    print(4)
    classifier.fit(train_x, train_y)
    print(5)
示例6: ImputeCategorical
class ImputeCategorical(BaseEstimator, TransformerMixin):
    """
    Encodes a specified list of columns or all columns if None.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.imputer = None

    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to impute.
        """
        # Default to every column when none were specified.
        if self.columns is None:
            self.columns = data.columns

        # One imputer covers all selected columns; zeros mark missing.
        self.imputer = Imputer(missing_values=0, strategy='most_frequent')
        self.imputer.fit(data[self.columns])

        return self

    def transform(self, data):
        """
        Uses the encoders to transform a data frame.
        """
        result = data.copy()
        result[self.columns] = self.imputer.transform(result[self.columns])
        return result
示例7: test_3_stage
def test_3_stage(self):
    from sklearn.preprocessing import Imputer

    infile_name = path_of_data('missing_vals.csv')

    # Build a three-node pipeline: CSV read -> impute -> CSV write.
    p = Pipeline()
    csv_read_node = p.add(CSVRead(infile_name))
    csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))
    impute_node = p.add(wrap_and_make_instance(Imputer))

    csv_read_node['output'] > impute_node['X_train']
    impute_node['X_new'] > csv_write_node['input']

    self.run_pipeline(p)

    # Compute the expected output directly with sklearn as the control.
    ctrl_imputer = Imputer()
    ctrl_X_sa = np.genfromtxt(infile_name, dtype=None, delimiter=",",
                              names=True)
    num_type = ctrl_X_sa[0][0].dtype
    ctrl_X_nd, ctrl_X_sa_type = np_sa_to_nd(ctrl_X_sa)
    control = ctrl_imputer.fit_transform(ctrl_X_nd)

    result = self._tmp_files.csv_read('out.csv', True)
    self.assertTrue(np.allclose(result, control))
示例8: test
def test():
    """Run cost-sensitive cross-validation over the matching UCI arff dataset."""
    vec = DictVectorizer()
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

    for filename in glob.glob(r'../dataset/UCI/*.arff'):
        basename = re.sub(r'(\..*?)$', '', os.path.basename(filename))
        print(basename)
        if basename != DS:
            continue
        # cost_matrix = pickle.load(open('../dataset/UCI/'+basename+'_cost_matrix.pkl', 'rb'))

        # Vectorize the feature columns (all but the last) and mean-impute.
        data = arff.loadarff(filename)[0]
        X = vec.fit_transform(np.array([{str(i): value for i, value in enumerate(list(row)[:-1])} for row in data])).toarray()
        imp.fit(X)
        X = imp.transform(X)

        # Map string labels to integer class ids.
        labels = np.array([row[-1] for row in data])
        y = np.array([{v: k for k, v in enumerate(list(set(labels)))}[label] for label in labels])

        random = np.random.permutation(range(len(X)))
        print('dataset ratio\t%s' % ('\t'.join([alg + " " * (12 - len(alg)) for alg in sorted(ALG.keys())])))

        for iteration in xrange(10):
            # Shuffle once per iteration, then 10-fold cross-validate.
            X, y, class_num, kf = X[random], y[random], set(labels), KFold(len(X), n_folds=10)
            for train, test in kf:
                length, train_size = len(train), 0.1
                X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
                X_label, X_unlabel, y_label, y_unlabel = train_test_split(X_train, y_train, test_size=1.0 - train_size, random_state=0)
                # Sweep the false-negative cost ratio R.
                for R in xrange(2, 10):
                    ones_matrix, cost_matrix = np.array([[1, 1], [1, 1]]), np.array([[1, 1], [R, R]])
                    # print "%s R=%d"%(basename,R),
                    cross_validation("%s R=%d" % (basename, R), X_label, X_unlabel, y_label, y_unlabel, ones_matrix, cost_matrix)
        exit()
示例9: plot_ROCList
def plot_ROCList(clfList, data, labels, stringList=""):
    """
    Plot an ROC curve for each classifier in clfList, training on a single 80/20 split
    :param clfList:
    :param data:
    :param labels:
    :param stringList:
    :return:
    """
    if stringList == "":
        stringList = ["" for i in range(len(labels))]

    # Mean-impute missing feature values before fitting.
    imputer = Imputer(missing_values=np.NaN, strategy="mean")
    data = imputer.fit_transform(data)

    # Cross-validate on the data once using each model to get a ROC curve
    AUCs, fprs, tprs, threshs = cvList(data, labels, clfList)

    # Plot and save one ROC figure per classifier.
    for i in range(len(clfList)):
        plt.plot(fprs[i], tprs[i])
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(stringList[i] + " ROC Curve, AUC = " + str(AUCs[i]))
        plt.savefig(stringList[i] + "_ROC.png")
        plt.close()
        print(stringList[i] + ":" + str(AUCs[i]))
示例10: computePearson
def computePearson(args):
    """Write Pearson correlations of each feature vs. the label, and of
    feature pairs sharing a prefix group, to the configured output files."""
    filter(args)

    # Feature names come from the feature file; '#' lines are comments.
    with open(args.feature_file, 'r') as fp:
        features = [line for line in fp.read().splitlines()
                    if not line.startswith('#')]

    X = loadtxt(TMP_DATA_FILE)
    y = loadtxt(TMP_LABEL_FILE)
    assert X.shape[0] == y.shape[0]
    assert X.shape[1] == len(features)

    # Median-impute missing values in place (copy=False).
    imputer = Imputer(strategy='median', copy=False)
    X = imputer.fit_transform(X)

    # Feature-vs-label correlations.
    if args.output_file:
        with open(args.output_file, 'w') as fp:
            fp.write('\t'.join(['feature', 'coeff', 'pvalue']) + '\n')
            for i in range(len(features)):
                coeff, pvalue = pearsonr(X[:, i], y)
                fp.write('%s\t%f\t%f\n' % (features[i], coeff, pvalue))

    # Pairwise correlations within each feature-prefix group.
    if args.group_output_file:
        groups = getGroups(features)
        index = {features[i]: i for i in range(len(features))}
        with open(args.group_output_file, 'w') as fp:
            fp.write('\t'.join(['prefix', 'feature1', 'feature2', 'coeff', 'pvalue']) + '\n')
            for prefix, group in groups.iteritems():
                for i in range(len(group)):
                    for j in range(i + 1, len(group)):
                        coeff, pvalue = pearsonr(X[:, index[group[i]]], X[:, index[group[j]]])
                        fp.write('%s\t%s\t%s\t%f\t%f\n' % (
                            prefix, group[i], group[j], coeff, pvalue))
示例11: gettestdata
def gettestdata(fil):
    """Load a CSV file, median-impute NaNs in columns 2+, and scale them."""
    raw = np.genfromtxt(fil, delimiter=',')
    # The first two columns are not features; impute the rest column-wise.
    imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
    feats = imputer.fit_transform(raw[:, 2:])
    feats = scale(feats).copy()
    #spr.eliminate_zeros()
    return np.array(feats)
示例12: get_some_data
def get_some_data():
    """Return (imputed feature matrix, price targets) from melbourne_data."""
    data = melbourne_data
    y = data.Price
    X = data[cols_to_use]
    # Default Imputer fills missing values with the column mean.
    imputed_X = Imputer().fit_transform(X)
    return imputed_X, y
示例13: calcEdges
def calcEdges(data):
    """Build a movies-by-users rating matrix from (movie, user, rating)
    rows, imputing unrated cells (stored as 0) with the column mean.

    Returns (E, usersDic, moviesDic) where the dicts map raw movie/user
    ids to dense matrix indices.
    """
    n = len(data)
    usersDic, moviesDic = {}, {}
    usersId = 0
    moviesId = 0

    # First pass: assign a dense index to each distinct movie and user.
    for i in range(n):
        row = data[i]
        if row[0] not in moviesDic:
            moviesDic[row[0]] = moviesId
            moviesId += 1
        if row[1] not in usersDic:
            usersDic[row[1]] = usersId
            usersId += 1

    # Second pass: fill in ratings; unrated cells remain 0.
    E = np.zeros((moviesId, usersId))
    for i in range(n):
        user = usersDic[data[i][1]]
        movie = moviesDic[data[i][0]]
        E[movie, user] = data[i][2]

    # Treat 0 as "missing" and replace it with the column mean.
    estimator = Imputer(0, strategy='mean')
    E = estimator.fit_transform(E)
    return E, usersDic, moviesDic
示例14: bnp_svm
def bnp_svm(train, test):
    """Fit an RBF SVM on the float columns of a 1000-row sample of *train*
    and return class probabilities for *test*."""
    print('bnpsvm')
    ## If a value is missing, set it to the average
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

    train = train.sample(1000)

    ## Training features: float columns only, mean-imputed.
    train_floats = train.select_dtypes(include=['float64'])
    imp.fit(train_floats)
    train_matrix = np.array(imp.transform(train_floats)).astype(float)

    ## Integer target labels.
    target = np.array(train['target']).astype(int)

    ## Test features get the same imputation (fitted on train).
    test_floats = test.select_dtypes(include=['float64'])
    test_matrix = np.array(imp.transform(test_floats)).astype(float)

    ## Probability-capable RBF SVM.
    clf = svm.SVC(gamma=0.001, C=100, probability=True)
    clf.fit(train_matrix, target)
    return clf.predict_proba(test_matrix)
#print(bnp_svm(train, test))
示例15: load_datasets
def load_datasets(feature_paths, label_paths):
    '''
    Read the feature files and label files and return (feature, label).

    Each feature file contributes rows to a 41-column feature matrix;
    missing values are mean-imputed per file. Label files are stacked
    and flattened into a 1-D vector.
    '''
    # Feature matrix has 41 columns; labels are a single column.
    feature = np.ndarray(shape=(0, 41))
    label = np.ndarray(shape=(0, 1))

    for file in feature_paths:
        # Read one feature file: utf-8 encoded, first column parsed as a
        # date and used as the row index.
        # Alternative reader kept for reference (comma-delimited, '?' = NaN):
        #df = pd.read_table(file, delimiter=',', na_values='?', header=None)
        # BUG FIX: the file was previously loaded into `data` while the
        # imputer below operated on an undefined name `df` (NameError).
        df = pd.read_csv(file, encoding='utf-8', parse_dates=[0], index_col=0)
        #df.sort_index(0, ascending=True, inplace=True)

        # Mean-impute missing values ('NaN') column-wise.
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        # fit() trains the preprocessor; transform() produces the result.
        imp.fit(df)
        df = imp.transform(df)

        # Append this file's preprocessed rows to the feature matrix.
        feature = np.concatenate((feature, df))

    # Read and stack the label files.
    for file in label_paths:
        df = pd.read_table(file, header=None)
        label = np.concatenate((label, df))

    # Flatten labels into a 1-D vector.
    label = np.ravel(label)
    return feature, label