本文整理汇总了Python中Preprocess类的典型用法代码示例。如果您正苦于以下问题：Python Preprocess类的具体用法？Python Preprocess怎么用？Python Preprocess使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Preprocess类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
def main():
    """Evaluate the project kNN classifier with the cosine kernel on the digits data.

    Loads the pickled train/test sets, flattens their label arrays,
    normalizes the features and, for each value of ``r`` tried, fits a
    classifier and prints the train and test accuracy. Nothing is returned.
    """
    kernel = c.COSINE

    # Forward slashes are accepted on every platform (and match the other
    # data paths in this project); the former hard-coded backslashes only
    # resolved on Windows.
    tr_data_path = 'data/digits/tr_f_l_10.pickle'
    te_data_path = 'data/digits/te_f_l_10.pickle'

    # load the training and testing data
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)

    # keep only the first row of the transposed label array
    # (presumably the labels are stored as an (n, 1) column -- TODO confirm)
    tr_data[1] = np.transpose(tr_data[1])[0]
    te_data[1] = np.transpose(te_data[1])[0]

    # NOTE(review): train and test features are normalized by two separate
    # calls -- confirm that normalizing the test set on its own is intended.
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, tr_data[0])
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, te_data[0])

    # start training
    st = time.time()
    print('{:.2f} Start training.'.format(time.time() - st))
    for r in (0.15, 0.1):
        clf = kNN.kNN(kernel=kernel, dataset=c.DS_DIGITS)
        clf.fit(tr_data[0], tr_data[1])
        tr_pred = clf.predict(tr_data[0], r=r)
        te_pred = clf.predict(te_data[0], r=r)
        # accuracy = number of matching labels / number of samples
        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]
        print('{} Final results with kernel {} and r={}. Train acc: {}, Test acc: {}'.format(time.time() - st, kernel, r, tr_acc, te_acc))
示例2: main
def main():
    """Scratch driver: compute an information-gain value on fold 0 of spambase.

    Loads the spambase data and the pickled thresholds, takes fold 0 of a
    k-fold split, extracts the first feature of every training row and runs
    one of two ``DecisionTree`` entropy / information-gain implementations,
    selected by ``target``. The computed value is not returned or printed.

    NOTE(review): the indentation of this listing was lost; the ``else`` is
    assumed to pair with the ``if`` (not the ``for``) and the three
    statements after ``for`` are assumed to form the loop body -- confirm.
    """
    target = 'v2'  # 'v1' selects the first branch below, anything else the second
    # training parameter
    k = 10  # fold
    layer_thresh = 2  # not read anywhere in this function
    T = 50  # not read anywhere in this function
    threshes_path = 'data/spambase.threshes'
    # load and preprocess training data
    training_data = loader.load_dataset('data/spambase.data')
    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)
    # start training
    k_folds = Preprocess.prepare_k_folds(training_data, k)
    tr_data, te_data = Preprocess.get_i_fold(k_folds, 0)
    # first element of every training row
    f_cur = [x[0] for x in tr_data[0]]
    t = dt.DecisionTree()
    if target == 'v1':
        # repeats the same computation 100 times (presumably for timing -- TODO confirm)
        for i in range(100):
            h_y = t.compute_entropy(tr_data[1])
            thresh = threshes[0][30]
            ig = t.compute_ig(f_cur, tr_data[1], thresh, h_y)
    else:
        h_y = t.compute_entropy_v2(tr_data[1])
        thresh = threshes[0][0]
        ig = t.compute_ig_v2(f_cur, tr_data[1], thresh, h_y)
示例3: main
def main():
    """Fit a linear regression on the housing data and log train/test error.

    Loads the train and test sets, normalizes both in a single
    ``normalize_features_all`` call, builds an ``rm.LinearRegression``
    model, prints the training and testing MSE and writes them together
    with the model's ``theta`` to ``result_path``.
    """
    # training parameter
    result_path = 'results/housingLiR_1.mse'
    model_name = 'housing_shiftAndScale'
    # alternative normalization: Preprocess.zero_mean_unit_var
    normalization = Preprocess.shift_and_scale
    # alternative: cols_not_norm = (0, 7, 12)
    cols_not_norm = []

    # load and preprocess training data
    training_data = loader.load_dataset('data/housing_train.txt')
    testing_data = loader.load_dataset('data/housing_test.txt')
    Preprocess.normalize_features_all(normalization, training_data[0], testing_data[0], cols_not_norm)

    # start training
    model = rm.LinearRegression()
    model.build(training_data[0], training_data[1])
    training_mse = model.test(training_data[0], training_data[1], util.mse)
    testing_mse = model.test(testing_data[0], testing_data[1], util.mse)

    # print() with a single argument emits the same text on Python 2 and 3;
    # the former print statements were a syntax error on Python 3.
    print('Error for training data is:')
    print(training_mse)
    print('Error for testing data is:')
    print(testing_mse)

    result = {
        'TrainingMSE': str(training_mse),
        'TestingMSE': str(testing_mse),
        'Theta': str(model.theta),
    }

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result)
示例4: main
def main():
    """Evaluate kNN with the polynomial kernel on the digits data for several ``k``.

    Loads the pickled train/test sets, flattens their label arrays,
    normalizes the features and, for each ``k`` in (1, 3, 7), fits either
    the project ``kNN.kNN`` or ``KNeighborsClassifier`` (when ``is_sklearn``
    is true) and prints the train and test accuracy. Nothing is returned.
    """
    is_sklearn = False
    # other kernels tried here: c.COSINE, c.GAUSSIAN
    kernel = c.POLY

    # Forward slashes are accepted on every platform (and match the other
    # data paths in this project); the former hard-coded backslashes only
    # resolved on Windows. The un-suffixed tr_f_l.pickle / te_f_l.pickle
    # files were the alternative inputs.
    tr_data_path = 'data/digits/tr_f_l_10.pickle'
    te_data_path = 'data/digits/te_f_l_10.pickle'

    # load the training and testing data
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)

    # keep only the first row of the transposed label array
    # (presumably the labels are stored as an (n, 1) column -- TODO confirm)
    tr_data[1] = np.transpose(tr_data[1])[0]
    te_data[1] = np.transpose(te_data[1])[0]

    # NOTE(review): train and test features are normalized by two separate
    # calls -- confirm that normalizing the test set on its own is intended.
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, tr_data[0])
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, te_data[0])

    # start training
    st = time.time()
    print('{:.2f} Start training.'.format(time.time() - st))
    for k in (1, 3, 7):
        # The two back ends differ only in construction and in whether k is
        # given to the constructor or to predict(); everything else is shared.
        if is_sklearn:
            clf = KNeighborsClassifier(n_neighbors=k, metric=cosine_distances)
            predict_kwargs = {}
        else:
            clf = kNN.kNN(kernel=kernel)
            predict_kwargs = {'k': k}
        clf.fit(tr_data[0], tr_data[1])
        tr_pred = clf.predict(tr_data[0], **predict_kwargs)
        te_pred = clf.predict(te_data[0], **predict_kwargs)
        # accuracy = number of matching labels / number of samples
        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]
        print('{} Final results with kernel {} and k={}. Train acc: {}, Test acc: {}'.format(time.time() - st, kernel, k, tr_acc, te_acc))
示例5: detectPlatesInScene
def detectPlatesInScene(imgOriginalScene):
    """Return the possible plates found in a scene image.

    The scene is preprocessed into grayscale and threshold images, possible
    characters are located in the threshold image and grouped into lists of
    matching characters, and a plate is extracted for every group. Only
    plates whose ``imgPlate`` is not ``None`` are kept. The number of plates
    found is printed.

    Args:
        imgOriginalScene: scene image handed to ``Preprocess.preprocess``
            and ``extractPlate``.

    Returns:
        list of the plate objects produced by ``extractPlate``.
    """
    # Side effect kept from the original: close any OpenCV windows first.
    cv2.destroyAllWindows()

    # The zero-filled placeholder images that used to be allocated here were
    # overwritten (or never read), so they are no longer created.
    _imgGrayscaleScene, imgThreshScene = Preprocess.preprocess(imgOriginalScene)

    possibleCharsInScene = findPossibleCharsInScene(imgThreshScene)
    listOfListsOfMatchingCharsInScene = DetectChars.findListOfListsOfMatchingChars(possibleCharsInScene)

    possiblePlates = []
    for matchingChars in listOfListsOfMatchingCharsInScene:
        possiblePlate = extractPlate(imgOriginalScene, matchingChars)
        if possiblePlate.imgPlate is not None:
            possiblePlates.append(possiblePlate)

    # print() with one argument behaves the same on Python 2 and 3.
    print("\n" + str(len(possiblePlates)) + " possible plates found")
    return possiblePlates
示例6: get_tf_idf
def get_tf_idf(query, src="google"):
    """Rank the tokens of a query by their TF-IDF weight.

    Args:
        query: forwarded to ``get_tokens``.
        src: forwarded to ``get_tokens``; defaults to ``"google"``.

    Returns:
        list of ``(token, weight)`` tuples sorted by descending weight,
        where ``weight`` is the token's value in the query dictionary
        multiplied by its IDF.
    """
    tokens = get_tokens(query, src)
    # converts the token list into a dictionary
    # (presumably token -> count; verify against Preprocess.list_to_dict)
    query_dictionary = Preprocess.list_to_dict(tokens, {})
    # loads the dictionary of existing idf values
    dictionary = load_idf()

    # fallback IDF for tokens missing from the loaded dictionary; computed
    # once instead of on every miss
    default_idf = math.log(18, 10)

    tf_idf_dictionary = {}
    for key, tf in query_dictionary.items():
        idf = dictionary[key] if key in dictionary else default_idf
        tf_idf_dictionary[key] = float(tf) * float(idf)

    # dict.items() works on Python 2 and 3; iteritems() exists only on 2.
    return sorted(tf_idf_dictionary.items(), key=operator.itemgetter(1), reverse=True)
示例7: main
def main():
    """Run the project kNN with feature selection on one fold of the spam data.

    Loads the pickled spam data, splits it into ``k`` folds, fits a
    Euclidean-kernel ``kNN.kNN`` on fold 0 with feature selection enabled
    and prints train/test accuracy for several neighbour counts.
    Nothing is returned.
    """
    # run configuration
    use_sklearn = True
    k = 10  # number of folds
    result_path = 'results/PB2_spam.acc'
    model_name = 'spam_' + str(k) + 'fold'
    data_path = 'data/spam/data.pickle'

    # load the data set (features are left un-normalized in this run)
    dataset = loader.load_pickle_file(data_path)
    # TODO convert labels from {0, 1} to {-1, 1}

    training_errs, testing_errs = [], []
    print('Preparing k fold data.')
    folds = Preprocess.prepare_k_folds(dataset, k)

    for fold_idx in (0,):
        started = time.time()
        train, test = Preprocess.get_i_fold(folds, fold_idx)

        print('{:.2f} Start training.'.format(time.time() - started))
        kernel = c.EUCLIDEAN  # alternative: c.GAUSSIAN
        select_features = True
        n_best_features = 5
        classifier = kNN.kNN(kernel=kernel)
        classifier.fit(train[0], train[1], f_select=select_features, best_f=n_best_features)
        print("Best features: {}".format(classifier.best_f_indices))

        for n_neighbours in (1, 2, 3, 7):
            train_pred = classifier.predict(train[0], k=n_neighbours)
            test_pred = classifier.predict(test[0], k=n_neighbours)
            # accuracy = number of matching labels / number of samples
            train_acc = (train[1] == train_pred).sum() / train[0].shape[0]
            test_acc = (test[1] == test_pred).sum() / test[0].shape[0]
            print('{} Final results with kernel {}, k={}. Train acc: {}, Test acc: {}'.format(time.time() - started, kernel, n_neighbours, train_acc, test_acc))
示例8: main
def main():
    """Evaluate the project kNN on one fold of the normalized spam data.

    Loads the pickled spam data, normalizes its features, splits it into
    ``k`` folds and, on fold 1, fits a Euclidean-kernel ``kNN.kNN`` for each
    ``r`` value tried, printing the train/test accuracy. Nothing is returned.
    """
    # run configuration
    k = 8  # number of folds
    result_path = 'results/PB2_spam.acc'
    model_name = 'spam_' + str(k) + 'fold'
    data_path = 'data/spam/data.pickle'

    # load and normalize the data set
    dataset = loader.load_pickle_file(data_path)
    # TODO convert labels from {0, 1} to {-1, 1}
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, dataset[0])

    training_accs, testing_accs = [], []
    print('Preparing k fold data.')
    folds = Preprocess.prepare_k_folds(dataset, k)
    kernel = c.EUCLIDEAN
    overall_start = time.time()

    for fold_idx in (1,):
        fold_start = time.time()
        train, test = Preprocess.get_i_fold(folds, fold_idx)

        print('{:.2f} Start training.'.format(time.time() - fold_start))
        for r_value in (2.5, 2.7):
            classifier = kNN.kNN(kernel=kernel)
            # fitted on the fold's training part, not on the full data set
            classifier.fit(train[0], train[1])
            train_pred = classifier.predict(train[0], r=r_value)
            test_pred = classifier.predict(test[0], r=r_value)
            # accuracy = number of matching labels / number of samples
            train_acc = (train[1] == train_pred).sum() / train[0].shape[0]
            test_acc = (test[1] == test_pred).sum() / test[0].shape[0]
            testing_accs.append(test_acc)
            print('{} {}-fold results with kernel {}, r={}. Train acc: {}, Test acc: {}'.format(time.time() - fold_start, fold_idx, kernel, r_value, train_acc, test_acc))
示例9: main
def main():
    """Train an ``svm.SVC`` on one fold of the normalized spam data.

    Loads the pickled spam data, maps zero labels to -1, normalizes the
    features, splits the data into ``k`` folds and, on the first fold, fits
    a linear-kernel ``svm.SVC`` and prints its train/test accuracy.
    Nothing is returned.
    """
    # run configuration
    k = 10  # number of folds
    result_path = "results/PB1_A_spam.acc"
    model_name = "spam_" + str(k) + "fold"
    threshes_path = "data/spambase.threshes"
    data_path = "data/spam/data.pickle"
    kernel = "linear"  # alternatives tried: 'poly', 'rbf'
    verbose = False
    tolerance = 0.01
    penalty = 0.1  # passed to SVC as C

    # load the data set and convert labels from {0, 1} to {-1, 1}
    dataset = loader.load_pickle_file(data_path)
    util.replace_zero_label_with_neg_one(dataset)

    # normalize
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, dataset[0])

    print("Preparing k fold data.")
    folds = Preprocess.prepare_k_folds(dataset, k)

    for fold_idx in range(1):
        started = time.time()
        train, test = Preprocess.get_i_fold(folds, fold_idx)

        print("{:3f} Start training. Kernel: {}".format(time.time() - started, kernel))
        classifier = svm.SVC(C=penalty, kernel=kernel, tol=tolerance, verbose=verbose)
        classifier.fit(train[0], train[1])

        train_pred = classifier.predict(train[0])
        test_pred = classifier.predict(test[0])
        # accuracy = number of matching labels / number of samples
        train_acc = (train[1] == train_pred).sum() / train[0].shape[0]
        test_acc = (test[1] == test_pred).sum() / test[0].shape[0]
        print("{:3f} Final results. Train acc: {}, Test acc: {}".format(time.time() - started, train_acc, test_acc))
示例10: analyze_data
def analyze_data(time_interval=TimeInterval, refined_type=FullyPreprocessedPath):
    """Run the preprocess -> similarity model -> visualization steps.

    Args:
        time_interval: printed with a ``' min'`` suffix and forwarded to
            every step; defaults to the module-level ``TimeInterval``.
        refined_type: string forwarded to every step; defaults to the
            module-level ``FullyPreprocessedPath``.
    """
    # print() with one argument behaves the same on Python 2 and 3; the
    # former print statements were a syntax error on Python 3.
    print('time_interval: ' + str(time_interval) + ' min')
    print('refined_type: ' + refined_type)
    print('--------------------------------------------')

    # Refine the data and save (the returned value was never used)
    Preprocess.preprocess_data(time_interval, refined_type)

    # Build similarity model and save
    Similarity.Build.similarity_model(time_interval, refined_type)

    # Set data for visualization
    Visualization.set_data4visualization(time_interval, refined_type)
示例11: analyze_data
def analyze_data(time_interval=TimeInterval, refined_type=FullyPreprocessedPath):
    """Run the graph -> refine -> similarity model -> visualization steps.

    Args:
        time_interval: printed with a ``' min'`` suffix and forwarded to the
            refine, similarity and visualization steps; defaults to the
            module-level ``TimeInterval``.
        refined_type: string forwarded to the same steps; defaults to the
            module-level ``FullyPreprocessedPath``.
    """
    # print() with one argument behaves the same on Python 2 and 3; the
    # former print statements were a syntax error on Python 3.
    print('time_interval: ' + str(time_interval) + ' min')
    print('refined_type: ' + refined_type)
    print('--------------------------------------------')

    # Draw graphs and save the figures (the returned value was never used)
    Graph.Save.raw_data2graph()

    # Refine the data and save (the returned value was never used)
    Preprocess.refining_data(time_interval, refined_type)

    # Build similarity model and save
    Similarity.Build.similarity_model(time_interval, refined_type)

    # Set data for visualization
    Visualization.set_data4visualization(time_interval, refined_type)
示例12: detectCharsInPlates
def detectCharsInPlates(listOfPossiblePlates):
    """Fill in the recognized characters of every possible plate, in place.

    For each plate the plate image is preprocessed, the threshold image is
    enlarged and re-thresholded, possible characters are found and grouped
    into lists of matching characters, and the longest group is handed to
    ``recognizeCharsInPlate``. The plate's ``imgGrayscale``, ``imgThresh``
    and ``strChars`` attributes are set; ``strChars`` is ``""`` when no
    group of matching characters is found.

    Args:
        listOfPossiblePlates: list of plate objects exposing ``imgPlate``.

    Returns:
        The same list object that was passed in.
    """
    if not listOfPossiblePlates:
        return listOfPossiblePlates

    for possiblePlate in listOfPossiblePlates:
        possiblePlate.imgGrayscale, possiblePlate.imgThresh = Preprocess.preprocess(possiblePlate.imgPlate)

        # enlarge the threshold image by a factor of 1.6 on both axes
        possiblePlate.imgThresh = cv2.resize(possiblePlate.imgThresh, (0, 0), fx=1.6, fy=1.6)

        # re-threshold the enlarged image (binary + Otsu); the threshold
        # value that cv2.threshold also returns is not needed
        _, possiblePlate.imgThresh = cv2.threshold(possiblePlate.imgThresh, 0.0, 255.0, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

        listOfPossibleCharsInPlate = findPossibleCharsInPlate(possiblePlate.imgGrayscale, possiblePlate.imgThresh)
        listOfListsOfMatchingCharsInPlate = findListOfListsOfMatchingChars(listOfPossibleCharsInPlate)

        if not listOfListsOfMatchingCharsInPlate:
            possiblePlate.strChars = ""
            continue

        # order each group by the characters' centre x, then drop inner
        # overlapping characters
        for i, matchingChars in enumerate(listOfListsOfMatchingCharsInPlate):
            matchingChars.sort(key=lambda matchingChar: matchingChar.intCenterX)
            listOfListsOfMatchingCharsInPlate[i] = removeInnerOverlappingChars(matchingChars)

        # max() returns the first of equally long groups, matching the
        # previous hand-written search that used a strict ">" comparison
        longestListOfMatchingCharsInPlate = max(listOfListsOfMatchingCharsInPlate, key=len)

        possiblePlate.strChars = recognizeCharsInPlate(possiblePlate.imgThresh, longestListOfMatchingCharsInPlate)

    return listOfPossiblePlates
示例13: open
from perceptron_dual import PerceptronDual
import csv
import Utilities as util
import numpy as np
import Consts as c
import Preprocess
# Script: load the two-spirals data set, normalize it and fit a dual
# perceptron with a Gaussian kernel.
# NOTE(review): the indentation of this listing was lost; the statements
# below the ``for`` are assumed to form its body -- confirm.
data_file = 'data/twoSpirals.txt'
# load and preprocess data
features = []
labels = []
with open(data_file) as f:
    # each row is tab-separated; the last field is the label, the rest are features
    for line in csv.reader(f, delimiter='\t'):
        # parse via float first, then truncate to int
        cur_l = int(float(line[-1]))
        # multiplier applied to every feature value; 1 leaves them unchanged
        sign = 1
        cur_f = [sign * float(l) for l in line[:-1]]
        features.append(cur_f)
        # wrapped in a list so that labels becomes an (n, 1) array below
        labels.append([cur_l])
features = np.array(features)
Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, features)
# Preprocess.normalize_features_all(Preprocess.shift_and_scale, features)
# first row of the transposed (n, 1) array, i.e. a flat array of labels
labels = np.array(labels).transpose()[0]
# create perceptron
# kernel = c.LINEAR
kernel = c.GAUSSIAN
model = PerceptronDual(kernel_fun=kernel)
model.fit(features, labels)
示例14:
sys.path.append('..')
import numpy as np
import rbm_rm
import rbm_cm
import matplotlib.pyplot as plt
import utils
import Preprocess
# Script: load the MNIST training array, split it into train/validation
# parts in two layouts and construct an RBM learner.
# NOTE(review): ``os`` (and ``sys`` above) are used without a visible import
# in this listing -- confirm they are imported earlier in the file.
# Raises KeyError if the DATA_HOME environment variable is not set.
mnist_dir = os.path.join(os.environ['DATA_HOME'], 'mnist')
mnist_train_path = os.path.join(mnist_dir, 'MNISTTrainData.npy')
data_rm = np.load(mnist_train_path)
# the normalized array and its statistics are computed but not used below
[normed, meanv, stdv] = Preprocess.mean_zero_unit_variance(data_rm)
#Look, I didn't actually use the normalized data because it broke everything
# "rm" layout: rows from index 30000 on are training, the first 30000 rows are validation
train_rm = data_rm[30000:, :]
valid_rm = data_rm[:30000, :]
# "cm" layout: the transposed array, so the same split is taken along columns
data_cm = data_rm.transpose()
train_cm = data_cm[:,30000:]
valid_cm = data_cm[:,:30000]
nHidden = 100
ViewDimensions = (10, 10) # Should multiply to nHidden
TP = rbm_rm.RBMTrainParams()
TP.maxepoch = 15
# second argument is the number of columns of the training array
rm_learner = rbm_rm.GV_RBM(nHidden, train_rm.shape[1])
示例15: main
def main():
    """Boost decision stumps on random subsets of the spambase training pool.

    Loads spambase and the pickled thresholds, maps zero labels to -1, takes
    fold 1 of a 5-fold split as training pool / test set and, for every
    value in ``data_rates``, picks a random training subset, runs
    ``round_limit`` boosting rounds and records the train/test error of the
    last round. The per-rate errors and their means are printed and written
    to ``result_path``.
    """
    # training parameter
    round_limit = 50
    result_path = 'results/spamActive_random_final_1.acc'
    model_name = 'spam_active'
    threshes_path = 'data/spambase.threshes'

    # load the training data and convert labels from {0, 1} to {-1, 1}
    training_data = loader.load_dataset('data/spambase.data')
    util.replace_zero_label_with_neg_one(training_data)

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []
    # AUC is never computed in this script; the initial value is what gets logged
    auc = 0.0

    k_folds = Preprocess.prepare_k_folds(training_data, 5)
    tr_data_pool, te_data = Preprocess.get_i_fold(k_folds, 1)
    data_set = DataSet.DataSet(tr_data_pool)
    data_rates = (5, 10, 15, 20, 30, 50)

    # the test set does not change between rates, so its size is read once;
    # the one-element unpacking requires the test labels to be 1-D
    te_n, = np.shape(te_data[1])

    # "rate" (formerly "c") avoids reusing the name the project uses for its
    # constants module
    for rate in data_rates:
        tr_data = data_set.random_pick(rate, False)
        # the two-element unpacking requires the training features to be 2-D
        tr_n, _ = np.shape(tr_data[0])

        # initial distribution for the boosting run
        d = util.init_distribution(len(tr_data[0]))

        # thresholds cheat sheet
        thresh_cs = util.pre_compute_threshes(tr_data[0], tr_data[1], threshes)
        boost = b.Boosting(d)

        # running predictions, passed to update_predict every round
        testing_predict = [0.0] * te_n
        training_predict = [0.0] * tr_n
        round_tr_err = []
        round_te_err = []

        # "round_num" (formerly "round") no longer shadows the builtin
        for round_num in range(1, round_limit + 1):
            boost.add_model(ds.DecisionStump, tr_data[0], tr_data[1], threshes, thresh_cs)
            boost.update_predict(tr_data[0], training_predict)
            boost.update_predict(te_data[0], testing_predict)

            newest_model = boost.model[-1]
            c_model_err = newest_model.w_err
            c_f_ind = newest_model.f_ind
            c_thresh = newest_model.thresh
            c_tr_err = util.get_err_from_predict(training_predict, tr_data[1])
            c_te_err = util.get_err_from_predict(testing_predict, te_data[1])
            # TODO calculate the AUC for testing results
            round_tr_err.append(c_tr_err)
            round_te_err.append(c_te_err)
            print('Data {}% Round: {} Feature: {} Threshold: {:.3f} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f} AUC {}'.format(rate, round_num, c_f_ind, c_thresh, c_model_err, c_tr_err, c_te_err, 0))

        # keep the errors of the final boosting round for this rate
        training_errs.append(round_tr_err[-1])
        testing_errs.append(round_te_err[-1])

    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)

    print('Training errs are:')
    print(training_errs)
    print('Mean training err is:')
    print(mean_training_err)
    print('Testing errs are:')
    print(testing_errs)
    print('Mean testing err is:')
    print(mean_testing_err)

    # NOTE(review): the 'Mean...Acc' keys hold mean *errors*; the keys are
    # left unchanged because they are part of the written result file.
    result = {
        'Trainingerrs': training_errs,
        'MeanTrainingAcc': mean_training_err,
        'Testingerrs': testing_errs,
        'MeanTestingAcc': mean_testing_err,
        'AUC': auc,
    }

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)