本文整理汇总了Python中sklearn.preprocessing.Imputer类的典型用法代码示例。如果您正苦于以下问题:Python Imputer类的具体用法?Python Imputer怎么用?Python Imputer使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Imputer类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: data_organizer
def data_organizer(instances, outcomes):
    """Drop instances whose outcome lacks a GPA, then median-impute
    remaining missing feature values.

    Parameters
    ----------
    instances : iterable of feature sequences
    outcomes : iterable of (u1, u2, gpa) triples

    Returns
    -------
    (instances, outcomes) : imputed feature matrix (from the imputer)
        and the filtered outcome rows.
    """
    # Remove instances without GPA data (gpa is the third outcome field).
    kept_instances = []
    kept_outcomes = []
    for instance, outcome in zip(instances, outcomes):
        _, _, gpa = outcome
        if not math.isnan(gpa):
            kept_instances.append(list(instance))
            kept_outcomes.append(list(outcome))
    # Fill in remaining NaN feature values with the column median.
    # NOTE(review): sklearn's Imputer is deprecated (removed in 0.22);
    # SimpleImputer is the modern replacement.
    bandaid = Imputer(strategy='median')
    imputed = bandaid.fit_transform(kept_instances)
    return imputed, kept_outcomes
示例2: impute_and_scale
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.
    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """
    # Columns that are entirely missing cannot be imputed; drop them first.
    df = df.dropna(axis=1, how='all')

    mat = Imputer(strategy='mean', axis=0).fit_transform(df)

    # No scaling requested: return the imputed values unchanged.
    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    scaled = scaler.fit_transform(mat)
    return pd.DataFrame(scaled, columns=df.columns)
示例3: run_whole_video
def run_whole_video(exp_folder, lims_ID):
    """Extract per-frame feature rows (wheel sample, processed frame,
    optical-flow magnitudes and angles) for a whole movie and store
    them in an HDF5 file named ``data_<lims_ID>.h5``.
    """
    # Initializes video pointer for video of interest based on lims ID.
    file_string = get_file_string(exp_folder, lims_ID)
    video_pointer = cv2.VideoCapture(file_string)

    # Import wheel data and find the first valid (non-NaN) sample.
    wheel = joblib.load('dxds2.pkl')
    first_non_nan = next(x for x in wheel if not isnan(x))
    first_index = np.where(wheel == first_non_nan)[0]
    k = first_index[0]

    # Mean-impute remaining NaNs, then rescale wheel values to [-1, 1].
    imp = Imputer(missing_values='NaN', strategy='mean')
    wheel = imp.fit_transform(wheel)
    wheel = preprocessing.MinMaxScaler((-1, 1)).fit(wheel).transform(wheel)

    # self.video_pointer.set(1, 41000)
    ret, frame = video_pointer.read()

    # Crops and converts frame into desired format (grayscale ROI).
    frame = cv2.cvtColor(frame[160:400, 100:640], cv2.COLOR_BGR2GRAY)
    prvs = frame
    nex = frame

    # Initialize vectors to keep track of data.
    count = 0
    mod = 0
    opticals = []
    angles = []
    frames = []

    # Length of movie (total frame count).
    limit = int(video_pointer.get(cv2.cv.CV_CAP_PROP_FRAME_COUNT))

    # Create hdf file with one 4321-wide feature row per frame.
    hf = h5py.File('data_' + str(lims_ID) + '.h5', 'w')
    g = hf.create_group('feature space')
    vector = np.zeros((limit, 4321))
    table = g.create_dataset('features', data=vector, shape=(limit, 4321))

    # BUG FIX: the loop previously ran `while count <= limit`, which
    # wrote table[limit] (out of bounds for shape (limit, 4321)) and
    # read one frame past the end of the video.
    while count < limit:
        prvs = nex
        frames = process_input(prvs)

        ret, frame = video_pointer.read()
        nex = cv2.cvtColor(frame[160:400, 100:640], cv2.COLOR_BGR2GRAY)

        optical = optical_flow(prvs, nex)
        opticals = optical['mag']
        angles = optical['ang']

        # NOTE(review): k never advances, so every row stores the same
        # wheel sample wheel[k] — confirm this is intended.
        vector_data = np.concatenate((np.reshape(wheel[k], (1)), frames, opticals, angles))
        table[count, :] = vector_data

        count += 1
        if count % 1000 == 0:
            print(count)

    # BUG FIX: close the HDF5 file so buffered data is flushed to disk.
    hf.close()
示例4: preprocess
def preprocess(data):
    """Build an imputed numeric sample matrix from the raw dataframe.

    Only the 'non_sparse_only' path is implemented: one-hot encodes the
    nominal columns, coerces the ordinal columns to floats, drops the
    empty 'weatherVar115' feature, and mean-imputes missing values.
    """
    non_sparse_only = True
    use_all_category_only = False
    use_all_impute_mean_mode = False

    if non_sparse_only:
        # One-hot encode the nominal variables.
        nominal_samples = data.ix[:, ['var4', 'dummy']]
        onehot_samples = onehot.transform(nominal_samples, ['var4', 'dummy'])
        onehot_samples = pd.DataFrame(onehot_samples.toarray())

        numbered_samples = data.ix[:, ['var7', 'var8', 'var10', 'var11', 'var13', 'var15', 'var17']]
        # (var7 and 8 are ordinal, converting to floats which includes NaNs
        # will allow mean imputing of missing values)
        numbered_samples[['var7', 'var8']] = numbered_samples[['var7', 'var8']].convert_objects(convert_numeric=True)

        other_samples = data.ix[:, 'crimeVar1':'weatherVar236']  # all the continuous vars
        other_samples = other_samples.drop(['weatherVar115'], axis=1)  # nothing in this feature

        # Combine w/ the cleaned up other vars, then mean-impute NaNs.
        samples = pd.concat([onehot_samples, numbered_samples, other_samples], axis=1)
        imp_nan = Imputer(missing_values=np.nan, strategy='mean', axis=0)
        samples_imp = imp_nan.fit_transform(samples)

    # BUG FIX: the bare name `todo` raised NameError at runtime; raise an
    # explicit NotImplementedError for the unimplemented branches instead.
    if use_all_category_only:
        raise NotImplementedError('use_all_category_only path not implemented')
    if use_all_impute_mean_mode:
        raise NotImplementedError('use_all_impute_mean_mode path not implemented')

    return samples_imp
示例5: learn
def learn():
    """Train the global `classifier` on rows of the file `INPUT` whose
    label is nonzero.

    The input file is space-delimited; column 1 holds the label and
    columns 2-53 the features. Missing feature values are mean-imputed.
    """
    global classifier, INPUT
    print(1)
    data = np.genfromtxt(INPUT, delimiter=' ', dtype='f8')
    np.random.shuffle(data)
    n = len(data)
    y = data[:, 1]
    # BUG FIX / idiom: `data[:][:, range(2, 54)]` copied the whole array
    # for nothing; a plain slice selects the same feature columns.
    x = data[:, 2:54]
    train_x = []
    train_y = []
    print(2)
    # Replace NaNs with the per-column mean before training.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    x = imp.fit_transform(x)
    print(3)
    # Rows labeled 0 are skipped; train on everything else.
    for i in range(0, n):
        if y[i] == 0:
            continue
        train_x.append(x[i])
        train_y.append(y[i])
    print(4)
    classifier.fit(train_x, train_y)
    print(5)
示例6: ImputeCategorical
class ImputeCategorical(BaseEstimator, TransformerMixin):
    """
    Encodes a specified list of columns or all columns if None.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.imputer = None

    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to impute.
        """
        # Default to every column when none were specified.
        if self.columns is None:
            self.columns = data.columns

        # One imputer covers all selected columns; zeros mark missing.
        self.imputer = Imputer(missing_values=0, strategy='most_frequent')
        self.imputer.fit(data[self.columns])

        return self

    def transform(self, data):
        """
        Uses the encoders to transform a data frame.
        """
        result = data.copy()
        result[self.columns] = self.imputer.transform(result[self.columns])
        return result
示例7: test_3_stage
def test_3_stage(self):
    from sklearn.preprocessing import Imputer

    infile_name = path_of_data('missing_vals.csv')

    # Build a three-node pipeline: CSV read -> impute -> CSV write.
    p = Pipeline()
    csv_read_node = p.add(CSVRead(infile_name))
    csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))
    impute_node = p.add(wrap_and_make_instance(Imputer))

    csv_read_node['output'] > impute_node['X_train']
    impute_node['X_new'] > csv_write_node['input']

    self.run_pipeline(p)

    # Compute the expected output directly with sklearn as the control.
    ctrl_imputer = Imputer()
    ctrl_X_sa = np.genfromtxt(infile_name, dtype=None, delimiter=",",
                              names=True)
    num_type = ctrl_X_sa[0][0].dtype
    ctrl_X_nd, ctrl_X_sa_type = np_sa_to_nd(ctrl_X_sa)
    control = ctrl_imputer.fit_transform(ctrl_X_nd)

    result = self._tmp_files.csv_read('out.csv', True)
    self.assertTrue(np.allclose(result, control))
示例8: test
def test():
    """Run cost-sensitive cross-validation over the matching UCI arff dataset."""
    vec = DictVectorizer()
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

    for filename in glob.glob(r'../dataset/UCI/*.arff'):
        basename = re.sub(r'(\..*?)$', '', os.path.basename(filename))
        print(basename)
        if basename != DS:
            continue
        # cost_matrix = pickle.load(open('../dataset/UCI/'+basename+'_cost_matrix.pkl', 'rb'))

        # Vectorize the feature columns (all but the last) and mean-impute.
        data = arff.loadarff(filename)[0]
        X = vec.fit_transform(np.array([{str(i): value for i, value in enumerate(list(row)[:-1])} for row in data])).toarray()
        imp.fit(X)
        X = imp.transform(X)

        # Map string labels to integer class ids.
        labels = np.array([row[-1] for row in data])
        y = np.array([{v: k for k, v in enumerate(list(set(labels)))}[label] for label in labels])

        random = np.random.permutation(range(len(X)))
        print('dataset ratio\t%s' % ('\t'.join([alg + " " * (12 - len(alg)) for alg in sorted(ALG.keys())])))

        for iteration in xrange(10):
            # Shuffle once per iteration, then 10-fold cross-validate.
            X, y, class_num, kf = X[random], y[random], set(labels), KFold(len(X), n_folds=10)
            for train, test in kf:
                length, train_size = len(train), 0.1
                X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
                X_label, X_unlabel, y_label, y_unlabel = train_test_split(X_train, y_train, test_size=1.0 - train_size, random_state=0)
                # Sweep the false-negative cost ratio R.
                for R in xrange(2, 10):
                    ones_matrix, cost_matrix = np.array([[1, 1], [1, 1]]), np.array([[1, 1], [R, R]])
                    # print "%s R=%d"%(basename,R),
                    cross_validation("%s R=%d" % (basename, R), X_label, X_unlabel, y_label, y_unlabel, ones_matrix, cost_matrix)
        exit()
示例9: plot_ROCList
def plot_ROCList(clfList, data, labels, stringList=""):
    """
    Plot an ROC curve for each classifier in clfList, training on a single 80/20 split
    :param clfList:
    :param data:
    :param labels:
    :param stringList:
    :return:
    """
    if stringList == "":
        stringList = ["" for i in range(len(labels))]

    # Mean-impute missing feature values before fitting.
    imputer = Imputer(missing_values=np.NaN, strategy="mean")
    data = imputer.fit_transform(data)

    # Cross-validate on the data once using each model to get a ROC curve
    AUCs, fprs, tprs, threshs = cvList(data, labels, clfList)

    # Plot and save one ROC figure per classifier.
    for i in range(len(clfList)):
        plt.plot(fprs[i], tprs[i])
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(stringList[i] + " ROC Curve, AUC = " + str(AUCs[i]))
        plt.savefig(stringList[i] + "_ROC.png")
        plt.close()
        print(stringList[i] + ":" + str(AUCs[i]))
示例10: computePearson
def computePearson(args):
    """Write Pearson correlations of each feature vs. the label, and of
    feature pairs sharing a prefix group, to the configured output files."""
    filter(args)

    # Feature names come from the feature file; '#' lines are comments.
    with open(args.feature_file, 'r') as fp:
        features = [line for line in fp.read().splitlines()
                    if not line.startswith('#')]

    X = loadtxt(TMP_DATA_FILE)
    y = loadtxt(TMP_LABEL_FILE)
    assert X.shape[0] == y.shape[0]
    assert X.shape[1] == len(features)

    # Median-impute missing values in place (copy=False).
    imputer = Imputer(strategy='median', copy=False)
    X = imputer.fit_transform(X)

    # Feature-vs-label correlations.
    if args.output_file:
        with open(args.output_file, 'w') as fp:
            fp.write('\t'.join(['feature', 'coeff', 'pvalue']) + '\n')
            for i in range(len(features)):
                coeff, pvalue = pearsonr(X[:, i], y)
                fp.write('%s\t%f\t%f\n' % (features[i], coeff, pvalue))

    # Pairwise correlations within each feature-prefix group.
    if args.group_output_file:
        groups = getGroups(features)
        index = {features[i]: i for i in range(len(features))}
        with open(args.group_output_file, 'w') as fp:
            fp.write('\t'.join(['prefix', 'feature1', 'feature2', 'coeff', 'pvalue']) + '\n')
            for prefix, group in groups.iteritems():
                for i in range(len(group)):
                    for j in range(i + 1, len(group)):
                        coeff, pvalue = pearsonr(X[:, index[group[i]]], X[:, index[group[j]]])
                        fp.write('%s\t%s\t%s\t%f\t%f\n' % (
                            prefix, group[i], group[j], coeff, pvalue))
示例11: gettestdata
def gettestdata(fil):
    """Load a CSV file, median-impute NaNs in columns 2+, and scale them."""
    raw = np.genfromtxt(fil, delimiter=',')
    # The first two columns are not features; impute the rest column-wise.
    imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
    feats = imputer.fit_transform(raw[:, 2:])
    feats = scale(feats).copy()
    #spr.eliminate_zeros()
    return np.array(feats)
示例12: get_some_data
def get_some_data():
    """Return (imputed feature matrix, price targets) from melbourne_data."""
    data = melbourne_data
    y = data.Price
    X = data[cols_to_use]
    # Default Imputer fills missing values with the column mean.
    imputed_X = Imputer().fit_transform(X)
    return imputed_X, y
示例13: calcEdges
def calcEdges(data):
    """Build a movies-by-users rating matrix from (movie, user, rating)
    rows, imputing unrated cells (stored as 0) with the column mean.

    Returns (E, usersDic, moviesDic) where the dicts map raw movie/user
    ids to dense matrix indices.
    """
    n = len(data)
    usersDic, moviesDic = {}, {}
    usersId = 0
    moviesId = 0

    # First pass: assign a dense index to each distinct movie and user.
    for i in range(n):
        row = data[i]
        if row[0] not in moviesDic:
            moviesDic[row[0]] = moviesId
            moviesId += 1
        if row[1] not in usersDic:
            usersDic[row[1]] = usersId
            usersId += 1

    # Second pass: fill in ratings; unrated cells remain 0.
    E = np.zeros((moviesId, usersId))
    for i in range(n):
        user = usersDic[data[i][1]]
        movie = moviesDic[data[i][0]]
        E[movie, user] = data[i][2]

    # Treat 0 as "missing" and replace it with the column mean.
    estimator = Imputer(0, strategy='mean')
    E = estimator.fit_transform(E)
    return E, usersDic, moviesDic
示例14: bnp_svm
def bnp_svm(train, test):
    """Fit an RBF SVM on the float columns of a 1000-row sample of *train*
    and return class probabilities for *test*."""
    print('bnpsvm')
    ## If a value is missing, set it to the average
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

    train = train.sample(1000)

    ## Training features: float columns only, mean-imputed.
    train_floats = train.select_dtypes(include=['float64'])
    imp.fit(train_floats)
    train_matrix = np.array(imp.transform(train_floats)).astype(float)

    ## Integer target labels.
    target = np.array(train['target']).astype(int)

    ## Test features get the same imputation (fitted on train).
    test_floats = test.select_dtypes(include=['float64'])
    test_matrix = np.array(imp.transform(test_floats)).astype(float)

    ## Probability-capable RBF SVM.
    clf = svm.SVC(gamma=0.001, C=100, probability=True)
    clf.fit(train_matrix, target)
    return clf.predict_proba(test_matrix)
#print(bnp_svm(train, test))
示例15: load_datasets
def load_datasets(feature_paths, label_paths):
    '''
    Read the feature files and label files and return (feature, label).

    Each feature file contributes rows to a 41-column feature matrix;
    missing values are mean-imputed per file. Label files are stacked
    and flattened into a 1-D vector.
    '''
    # Feature matrix has 41 columns; labels are a single column.
    feature = np.ndarray(shape=(0, 41))
    label = np.ndarray(shape=(0, 1))

    for file in feature_paths:
        # Read one feature file: utf-8 encoded, first column parsed as a
        # date and used as the row index.
        # Alternative reader kept for reference (comma-delimited, '?' = NaN):
        #df = pd.read_table(file, delimiter=',', na_values='?', header=None)
        # BUG FIX: the file was previously loaded into `data` while the
        # imputer below operated on an undefined name `df` (NameError).
        df = pd.read_csv(file, encoding='utf-8', parse_dates=[0], index_col=0)
        #df.sort_index(0, ascending=True, inplace=True)

        # Mean-impute missing values ('NaN') column-wise.
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        # fit() trains the preprocessor; transform() produces the result.
        imp.fit(df)
        df = imp.transform(df)

        # Append this file's preprocessed rows to the feature matrix.
        feature = np.concatenate((feature, df))

    # Read and stack the label files.
    for file in label_paths:
        df = pd.read_table(file, header=None)
        label = np.concatenate((label, df))

    # Flatten labels into a 1-D vector.
    label = np.ravel(label)
    return feature, label