当前位置: 首页>>代码示例>>Python>>正文

Python preprocessing.Imputer类代码示例

本文整理汇总了Python中sklearn.preprocessing.Imputer的典型用法代码示例。如果您正苦于以下问题：Python Imputer类的具体用法？Python Imputer怎么用？Python Imputer使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。


示例1: data_organizer

def data_organizer( instances, outcomes ):
    """Drop rows without GPA data and median-impute the remaining features.

    Each outcome is unpacked into exactly three values; the third one is
    the GPA.  Rows whose GPA is NaN are removed from both sequences.  NaN
    values left in the surviving instances are then filled in by an
    ``Imputer`` using the ``'median'`` strategy.

    :param instances: sequence of feature rows, parallel to ``outcomes``
    :param outcomes: sequence of 3-element outcome rows (GPA last)
    :return: ``(instances, outcomes)`` where ``instances`` is the result of
        ``Imputer.fit_transform`` and ``outcomes`` is a list of lists
    """
    kept_instances = []
    kept_outcomes = []
    for instance, outcome in zip(instances, outcomes):
        _, _, gpa = outcome
        # Remove instances without GPA data
        if math.isnan(gpa):
            continue
        # Copy the rows so the caller's data is never aliased.
        kept_instances.append(list(instance))
        kept_outcomes.append(list(outcome))

    # Fill in NaN values with median
    bandaid = Imputer(strategy='median')
    return bandaid.fit_transform(kept_instances), kept_outcomes

示例2: impute_and_scale

def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply; any value other than 'maxabs', 'minmax',
        None or 'none' falls back to standard scaling

    Returns
    -------
    pandas dataframe
        imputed (and optionally scaled) data with the surviving column
        labels of ``df``; it is rebuilt from a plain array, so the original
        row index is not carried over
    """
    # Columns that are entirely NaN have no mean to impute with; drop them
    # up front so the column labels still line up with the imputed matrix.
    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean', axis=0)
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)

    return pd.DataFrame(mat, columns=df.columns)

示例3: run_whole_video

def run_whole_video(exp_folder, lims_ID):
    """Extract per-frame features from one video and store them in an HDF5 file.

    For every consecutive pair of frames a feature row is assembled by
    concatenating one wheel value, the output of ``process_input`` for the
    earlier frame, and the optical-flow magnitude and angle between the two
    frames.  Rows are written to the dataset ``'feature space/features'``
    of ``data_<lims_ID>.h5`` in the current working directory.

    :param exp_folder: experiment folder passed to ``get_file_string``
    :param lims_ID: identifier used to locate the video and name the output
    :raises IOError: if the first frame of the video cannot be read
    """
    # initializes video pointer for video of interest based on lims ID
    file_string = get_file_string(exp_folder, lims_ID)
    video_pointer = cv2.VideoCapture(file_string)

    # import wheel data
    wheel = joblib.load('dxds2.pkl')
    first_non_nan = next(x for x in wheel if not isnan(x))
    first_index = np.where(wheel == first_non_nan)[0]
    k = first_index[0]
    imp = Imputer(missing_values='NaN', strategy='mean')
    wheel = imp.fit_transform(wheel)
    wheel = preprocessing.MinMaxScaler((-1, 1)).fit(wheel).transform(wheel)

    ret, frame = video_pointer.read()
    if not ret:
        video_pointer.release()
        raise IOError('could not read first frame of ' + str(file_string))

    # crops and converts frame into desired format
    nex = cv2.cvtColor(frame[160:400, 100:640], cv2.COLOR_BGR2GRAY)

    # length of movie
    limit = int(video_pointer.get(cv2.cv.CV_CAP_PROP_FRAME_COUNT))

    # create hdf file
    hf = h5py.File('data_' + str(lims_ID) + '.h5', 'w')
    try:
        g = hf.create_group('feature space')
        vector = np.zeros((limit, 4321))
        table = g.create_dataset('features', data=vector, shape=(limit, 4321))

        count = 0
        # The dataset has exactly `limit` rows, so valid indices stop at limit - 1.
        while count < limit:
            prvs = nex
            frames = process_input(prvs)

            ret, frame = video_pointer.read()
            if not ret:
                # One frame was consumed before the loop, so the stream runs
                # out before `limit` pairs are produced; remaining rows stay 0.
                break
            nex = cv2.cvtColor(frame[160:400, 100:640], cv2.COLOR_BGR2GRAY)

            optical = optical_flow(prvs, nex)
            opticals = optical['mag']
            angles = optical['ang']
            # NOTE(review): k is never advanced, so every row uses the same
            # wheel sample -- confirm whether it should track `count`.
            vector_data = np.concatenate((np.reshape(wheel[k], (1)), frames, opticals, angles))

            table[count, :] = vector_data

            count += 1

            if count % 1000 == 0:
                print(count)
    finally:
        # Flush the features to disk and free the capture even on failure.
        hf.close()
        video_pointer.release()

示例4: preprocess

def preprocess(data):
    """Build a mean-imputed feature matrix from the raw data frame.

    The nominal columns ``var4`` and ``dummy`` are one-hot encoded with the
    module-level ``onehot`` encoder, the numbered ``var*`` columns and the
    continuous ``crimeVar1``..``weatherVar236`` columns (minus
    ``weatherVar115``) are appended, and NaNs are replaced by column means.

    :param data: data frame containing the columns named above
    :return: the array produced by ``Imputer.fit_transform``
    """
    # Pipeline switches; only the non-sparse variant has an implementation here.
    non_sparse_only = True
    use_all_category_only = False
    use_all_impute_mean_mode = False

    if non_sparse_only:
        nominal_samples = data.ix[:,['var4','dummy']]
        onehot_samples = onehot.transform(nominal_samples,['var4','dummy'])
        onehot_samples = pd.DataFrame(onehot_samples.toarray())
        numbered_samples = data.ix[:,['var7','var8','var10','var11','var13','var15','var17']]
        numbered_samples[['var7','var8']] = numbered_samples[['var7','var8']].convert_objects(convert_numeric=True)
        #(var7 and 8 are ordinal, converting to floats which includes NaNs will allow mean imputing of missing values)
        other_samples = data.ix[:,'crimeVar1':'weatherVar236'] #all the continuous vars
        other_samples = other_samples.drop(['weatherVar115'], axis=1) #nothing in this feature
        samples = pd.concat([onehot_samples,numbered_samples,other_samples],axis=1) #combine w/ the cleaned up other vars
        imp_nan = Imputer(missing_values=np.nan, strategy='mean', axis=0)
        samples_imp = imp_nan.fit_transform(samples)
    if use_all_category_only:
        # NOTE(review): no body for this variant is present in the source.
        pass
    if use_all_impute_mean_mode:
        # NOTE(review): no body for this variant is present in the source.
        pass
    return samples_imp

示例5: learn

def learn():
    """Fit the module-level ``classifier`` on the data file named by ``INPUT``.

    The file is parsed as space-delimited floats.  Column 1 is used as the
    label and columns 2..53 as features; NaN features are replaced by the
    column mean before fitting.  Progress markers 1-5 are printed along the
    way.
    """
    global classifier, INPUT
    print(1)
    data = np.genfromtxt(INPUT, delimiter=' ', dtype='f8')
    y = data[:, 1]
    x = data[:, 2:54]
    print(2)
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    x = imp.fit_transform(x)
    print(3)
    # NOTE(review): the body of the original `if y[i] == 0:` test is missing
    # from the source and the training lists were never filled.  Skipping
    # label-0 rows and training on the rest is an assumption -- confirm.
    keep = y != 0
    train_x = x[keep]
    train_y = y[keep]
    print(4)
    classifier.fit(train_x, train_y)
    print(5)

示例6: ImputeCategorical

class ImputeCategorical(BaseEstimator, TransformerMixin):
    """
    Imputes a specified list of columns or all columns if None.

    Entries equal to 0 are treated as missing and replaced using the
    ``'most_frequent'`` strategy of ``Imputer``.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.imputer = None

    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to impute.

        :param data: data frame holding the columns to learn from
        :param target: unused; accepted for pipeline compatibility
        :return: ``self``
        """
        # Impute all columns if columns is None
        if self.columns is None:
            self.columns = data.columns

        # Learn the replacement values; without this call transform() would
        # be invoked on an unfitted imputer.
        self.imputer = Imputer(missing_values=0, strategy='most_frequent')
        self.imputer.fit(data[self.columns])

        return self

    def transform(self, data):
        """
        Uses the fitted imputer to transform a data frame.

        :param data: data frame containing ``self.columns``
        :return: a copy of ``data`` with the selected columns imputed
        """
        output = data.copy()
        output[self.columns] = self.imputer.transform(output[self.columns])

        return output

示例7: test_3_stage

    def test_3_stage(self):
        """Check a CSV-read -> Imputer -> CSV-write pipeline against plain sklearn.

        The expected values come from running ``Imputer.fit_transform``
        directly on the contents of ``missing_vals.csv``; they are compared
        with what is read back from ``out.csv``.
        """
        from sklearn.preprocessing import Imputer

        infile_name = path_of_data('missing_vals.csv')

        p = Pipeline()

        csv_read_node = p.add(CSVRead(infile_name))
        csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))
        impute_node = p.add(wrap_and_make_instance(Imputer))

        # `>` wires an upstream output to a downstream input.
        csv_read_node['output'] > impute_node['X_train']
        impute_node['X_new'] > csv_write_node['input']

        # NOTE(review): nothing visible here executes the pipeline before
        # 'out.csv' is read back -- confirm whether a run call is missing.

        ctrl_imputer = Imputer()
        # NOTE(review): the continuation of this call is missing from the
        # source; names=True (header row -> structured array) is assumed
        # because np_sa_to_nd is applied to the result -- confirm.
        ctrl_X_sa = np.genfromtxt(infile_name, dtype=None, delimiter=",",
                                  names=True)
        ctrl_X_nd, ctrl_X_sa_type = np_sa_to_nd(ctrl_X_sa)
        control = ctrl_imputer.fit_transform(ctrl_X_nd)

        result = self._tmp_files.csv_read('out.csv', True)

        self.assertTrue(np.allclose(result, control))

示例8: test

def test():
    """Run the cost-sensitive cross-validation experiment on the dataset ``DS``.

    Every ``.arff`` file under ``../dataset/UCI/`` is listed, but only the
    one whose base name equals ``DS`` is processed.  Its attributes are
    vectorised, NaNs are mean-imputed, labels are mapped to integers, and
    ``cross_validation`` is called for cost ratios R = 2..9 on every fold of
    ten shuffled 10-fold splits.
    """
    vec = DictVectorizer()
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    for filename in glob.glob(r'../dataset/UCI/*.arff'):
        basename = re.sub(r'(\..*?)$','',os.path.basename(filename))
        print(basename)
        if basename != DS:
            continue
        # cost_matrix = pickle.load(open('../dataset/UCI/'+basename+'_cost_matrix.pkl', 'rb'))
        data = arff.loadarff(filename)[0]
        # One dict per row (last attribute excluded: it is the class label).
        rows = np.array([{str(i): value for i, value in enumerate(list(row)[:-1])}
                         for row in data])
        X = vec.fit_transform(rows).toarray()
        # The imputer must learn the column means before it can transform.
        X = imp.fit_transform(X)
        labels = np.array([row[-1] for row in data])
        # Build the label -> integer map once instead of once per label.
        label_index = {v: k for k, v in enumerate(list(set(labels)))}
        y = np.array([label_index[label] for label in labels])
        order = np.random.permutation(len(X))
        print('dataset ratio\t%s' % ('\t'.join([alg.ljust(12) for alg in sorted(ALG.keys())])))
        for iteration in range(10):
            # Re-applying the same permutation reshuffles the data each round.
            X, y = X[order], y[order]
            kf = KFold(len(X), n_folds=10)
            for train_idx, test_idx in kf:
                train_size = 0.1
                X_train, y_train = X[train_idx], y[train_idx]
                X_label, X_unlabel, y_label, y_unlabel = train_test_split(
                    X_train, y_train, test_size=1.0-train_size, random_state=0)
                for R in range(2, 10):
                    ones_matrix, cost_matrix = np.array([[1,1],[1,1]]), np.array([[1,1],[R,R]])
                    cross_validation("%s R=%d"%(basename,R), X_label, X_unlabel, y_label, y_unlabel, ones_matrix, cost_matrix)

示例9: plot_ROCList

def plot_ROCList(clfList, data, labels, stringList=""):
    """
    Plot an ROC curve for each classifier in clfList.

    NaNs in ``data`` are mean-imputed, then ``cvList`` supplies one AUC and
    one (fpr, tpr) curve per classifier.  The original description says the
    classifiers are trained on a single 80/20 split; that happens inside
    ``cvList`` and is not visible here.

    :param clfList: classifiers to evaluate, passed on to ``cvList``
    :param data: feature matrix, imputed before evaluation
    :param labels: target labels, passed on to ``cvList``
    :param stringList: one title prefix per classifier; empty strings are
        used when it is left at its default
    """
    if stringList == "":
        # Titles are indexed by classifier, so size the default accordingly.
        stringList = ["" for _ in clfList]
    imp = Imputer(missing_values=np.nan, strategy="mean")
    data = imp.fit_transform(data)

    # Cross-validate on the data once using each model to get a ROC curve
    AUCs, fprs, tprs, threshs = cvList(data, labels, clfList)

    # Plot a ROC for each clf in clfList
    for i in range(len(clfList)):
        plt.plot(fprs[i], tprs[i])
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(stringList[i]+" ROC Curve, AUC = "+str(AUCs[i]))
        print(stringList[i] + ":" + str(AUCs[i]))

示例10: computePearson

def computePearson(args):
    """Write Pearson correlation tables for the features listed in a file.

    Feature names are read from ``args.feature_file`` (lines starting with
    ``#`` are skipped).  The data and labels are loaded from
    ``TMP_DATA_FILE`` and ``TMP_LABEL_FILE`` and NaNs are median-imputed.

    * If ``args.output_file`` is set, one tab-separated row per feature is
      written with its correlation against the labels.
    * If ``args.group_output_file`` is set, one row is written for every
      pair of features within each group returned by ``getGroups``.

    :param args: object with ``feature_file``, ``output_file`` and
        ``group_output_file`` attributes
    """
    from itertools import combinations

    with open(args.feature_file, 'r') as fp:
        features = [line for line in fp.read().splitlines()
                    if not line.startswith('#')]

    X = loadtxt(TMP_DATA_FILE)
    y = loadtxt(TMP_LABEL_FILE)

    assert X.shape[0] == y.shape[0]
    assert X.shape[1] == len(features)

    imputer = Imputer(strategy='median', copy=False)
    X = imputer.fit_transform(X)

    if args.output_file:
        with open(args.output_file, 'w') as fp:
            # fp.write behaves the same on Python 2 and 3, unlike `print >> fp`.
            fp.write('\t'.join(['feature', 'coeff', 'pvalue']) + '\n')
            for i, feature in enumerate(features):
                coeff, pvalue = pearsonr(X[:, i], y)
                fp.write('%s\t%f\t%f\n' % (feature, coeff, pvalue))

    if args.group_output_file:
        groups = getGroups(features)
        index = {feature: i for i, feature in enumerate(features)}
        with open(args.group_output_file, 'w') as fp:
            fp.write('\t'.join(['prefix', 'feature1', 'feature2', 'coeff', 'pvalue']) + '\n')
            for prefix, group in groups.items():
                # Every unordered pair within the group, in positional order.
                for first, second in combinations(group, 2):
                    coeff, pvalue = pearsonr(X[:, index[first]], X[:, index[second]])
                    fp.write('%s\t%s\t%s\t%f\t%f\n' % (
                        prefix, first, second, coeff, pvalue))

示例11: gettestdata

def gettestdata(fil) :
    """Load a comma-separated file and return its imputed, scaled features.

    The first two columns are discarded; NaNs in the rest are replaced using
    the ``'median'`` strategy and the result is passed through ``scale``.
    """
    raw = np.genfromtxt(fil, delimiter=',')
    feature_columns = raw[:, 2:]
    median_imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
    filled = median_imputer.fit_transform(feature_columns)
    scaled = scale(filled).copy()
    return np.array(scaled)

示例12: get_some_data

def get_some_data():
    """Return ``(imputed_X, y)`` built from the module-level ``melbourne_data``.

    ``y`` is the ``Price`` column; ``imputed_X`` is the ``cols_to_use``
    subset after a default ``Imputer`` has filled in missing values.
    """
    target = melbourne_data.Price
    predictors = melbourne_data[cols_to_use]
    return Imputer().fit_transform(predictors), target

示例13: calcEdges

def calcEdges(data):
    """Build a movie-by-user matrix from ``data`` and mean-impute its zeros.

    Each record supplies a movie key at position 0, a user key at position 1
    and a value at position 2.  Keys are numbered in order of first
    appearance.  Cells never assigned stay 0, and 0 is the value the imputer
    treats as missing.

    :return: ``(E, usersDic, moviesDic)`` -- the imputed matrix plus the
        key -> index dictionaries for users and movies
    """
    usersDic = {}
    moviesDic = {}
    # First pass: hand out consecutive ids in order of first appearance.
    for record in data:
        movie_key, user_key = record[0], record[1]
        moviesDic.setdefault(movie_key, len(moviesDic))
        usersDic.setdefault(user_key, len(usersDic))

    # Second pass: drop every value into its (movie, user) cell.
    E = np.zeros((len(moviesDic), len(usersDic)))
    for record in data:
        E[moviesDic[record[0]], usersDic[record[1]]] = record[2]

    estimator = Imputer(0, strategy='mean')
    return estimator.fit_transform(E), usersDic, moviesDic

示例14: bnp_svm

def bnp_svm(train, test):
    """Train an SVM on a sample of ``train`` and predict probabilities for ``test``.

    Only the ``float64`` columns of each frame are used as features.
    Missing values are replaced by column means learned from the training
    sample, and the same means are applied to the test data.

    :param train: data frame with float64 feature columns and a ``target`` column
    :param test: data frame with the same float64 feature columns
    :return: the output of ``predict_proba`` for ``test``
    """
    ## If a value is missing, set it to the average
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

    # Train on at most 1000 rows; smaller frames are used in full.
    train = train.sample(min(len(train), 1000))
    ## set up training data
    train1 = train.select_dtypes(include=['float64'])
    # Fit here: the imputer has to learn the means before it can transform.
    train1 = imp.fit_transform(train1)
    train1 = np.array(train1).astype(float)
    ## set up real y
    target = np.array(train['target']).astype(int)

    ## set up testing data
    test1 = test.select_dtypes(include=['float64'])
    test1 = imp.transform(test1)
    test1 = np.array(test1).astype(float)

    clf = svm.SVC(gamma=0.001, C=100, probability=True)
    clf.fit(train1, target)
    return clf.predict_proba(test1)

#print(bnp_svm(train, test))

示例15: load_datasets

def load_datasets(feature_paths, label_paths):
    """Load and stack feature and label files.

    Every feature file is read as headerless comma-separated data in which
    ``?`` marks a missing value; missing values are replaced by that file's
    column means.  The files are stacked row-wise onto a 41-column array, so
    each one must have 41 columns.  Label files are read headerless, stacked
    the same way and flattened.

    :param feature_paths: iterable of feature file paths
    :param label_paths: iterable of label file paths
    :return: ``(feature, label)`` -- a 2-D feature array and a 1-D label array
    """
    # Seed with empty arrays so the result has the right width for no input.
    feature_parts = [np.ndarray(shape=(0,41))]
    for path in feature_paths:
        df = pd.read_table(path, delimiter=',', na_values='?', header=None)
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        # Fit on this file first; an unfitted imputer cannot transform.
        feature_parts.append(imp.fit_transform(df))
    feature = np.concatenate(feature_parts)

    label_parts = [np.ndarray(shape=(0,1))]
    for path in label_paths:
        label_parts.append(pd.read_table(path, header=None))
    label = np.ravel(np.concatenate(label_parts))
    return feature, label
