本文整理汇总了Python中sklearn.utils.shuffle函数的典型用法代码示例。如果您正苦于以下问题：Python shuffle函数的具体用法？Python shuffle怎么用？Python shuffle使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了shuffle函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: generate_feature
def generate_feature(in_file, dump=False, single_only=False, min_count=0):
    """Build a shuffled feature matrix and tag array from a comma-separated file.

    The first line of ``in_file`` is skipped. Every following line is split on
    commas: the first token becomes the tag, and the remaining tokens that
    consist only of digits are the raw features. Lines that do not yield
    exactly 10 digit tokens are ignored.

    Args:
        in_file: Path of the input text file.
        dump: When true, write the matrix and the tags to
            ``preprocessing/dumpX.txt`` and ``preprocessing/dumpY.txt``.
        single_only: Forwarded unchanged to ``get_feature_array``.
        min_count: Forwarded unchanged to ``cut_off``.

    Returns:
        A tuple ``(total_features, training_data, tags)`` where ``tags`` is a
        NumPy array whose rows are aligned with ``training_data``.
    """
    training_data, tags = [], []
    total_features = {}
    # The context manager releases the handle even if parsing raises.
    with open(in_file, 'r') as f:
        next(f, None)  # skip the first line
        for line in f:
            tokens = line.replace('\n', '').split(',')
            fs = [s for s in tokens[1:] if s.isdigit()]
            # ignore invalid data
            if len(fs) != 10:
                continue
            tags.append(tokens[0])
            features = get_feature_array(fs, single_only)
            update_total_features(total_features, features)
            training_data.append(features)
    training_data = transform_to_matrix(total_features, training_data)
    training_data = cut_off(training_data, min_count)
    # sklearn.utils.shuffle returns shuffled copies and leaves its inputs
    # untouched, so the result has to be bound back for the shuffle to apply.
    training_data, tags = shuffle(training_data, tags)
    tags = np.array(tags)
    if dump:
        np.savetxt('preprocessing/dumpX.txt', training_data, fmt='%d', delimiter=',')
        np.savetxt('preprocessing/dumpY.txt', tags[np.newaxis].T, fmt='%s', delimiter=',')
    return total_features, training_data, tags
示例2: _subsample_data
def _subsample_data(self, X, Y, n=10000):
    """Return the first ``n`` entries of a shuffled copy of the data.

    When ``Y`` is given, ``X`` and ``Y`` are shuffled together and a pair
    ``(X[:n], Y[:n])`` is returned; otherwise only the shuffled ``X[:n]``
    is returned.
    """
    if Y is None:
        return shuffle(X)[:n]
    shuffled_X, shuffled_Y = shuffle(X, Y)
    return shuffled_X[:n], shuffled_Y[:n]
示例3: main
def main(is_binary=True):
    """Train a RecursiveNN on a random subset of the PTB data and print scores.

    Args:
        is_binary: When true, entries whose ``t[3][-1]`` is negative are
            dropped and the model is built with 2 classes; otherwise all
            entries are kept and 5 classes are used.
    """
    def prepare(trees):
        # Index every tree, flatten it to list form, then optionally filter.
        for tree in trees:
            add_idx_to_tree(tree, 0)
        samples = [tree2list(tree, -1, is_binary) for tree in trees]
        if is_binary:
            # for filtering binary labels
            samples = [s for s in samples if s[3][-1] >= 0]
        return samples

    train, test, word2idx = get_ptb_data()
    train = prepare(train)
    test = prepare(test)

    # Random subsets: 5000 training and 1000 test samples.
    train = shuffle(train)[:5000]
    test = shuffle(test)[:1000]

    V = len(word2idx)
    # Single-argument print calls emit the same text on Python 2 and 3.
    print("vocab size: %s" % (V,))
    D = 20
    K = 2 if is_binary else 5

    model = RecursiveNN(V, D, K)
    model.fit(train)
    print("train accuracy: %s" % (model.score(train),))
    print("test accuracy: %s" % (model.score(test),))
    print("train f1: %s" % (model.f1_score(train),))
    print("test f1: %s" % (model.f1_score(test),))
示例4: splitIntoTrainingAndValidation
def splitIntoTrainingAndValidation(A, B):
    """Split source sets ``A`` and ``B`` into training, validation and test data.

    Both sets are shuffled, then each item is routed by its first element
    (``item[0]``, used as an index into the per-set quota arrays): into the
    training set while that quota lasts, then into the validation set, and
    finally into the test set. Items from ``A`` get label 0, items from ``B``
    label 1.

    Returns:
        ``(X, y, Xv, yv, Xt, yt, testSet1size, testSet2size, scores)`` where
        ``scores`` are the ``SelectKBest(f_classif)`` scores fitted on the
        training and validation data together.
    """
    # Note this is a random shuffle, that's why we need many iterations
    data1 = shuffle(sourceSets[A])
    data2 = shuffle(sourceSets[B])

    freqM = np.minimum(freqs[A], freqs[B])
    # Randomly selected 80% for the training set,
    # and the remaining 20% for the validation set
    trainQuota1 = np.round(freqM * 0.8)
    validQuota1 = freqM - trainQuota1
    trainQuota2 = np.copy(trainQuota1)
    validQuota2 = np.copy(validQuota1)

    trainingSetSize = int(sum(trainQuota1))  # 1/2 size actually
    validatnSetSize = int(sum(validQuota1))
    testSet1size = len(data1) - trainingSetSize - validatnSetSize
    testSet2size = len(data2) - trainingSetSize - validatnSetSize

    X = np.zeros((trainingSetSize * 2, numFeatures))
    Xv = np.zeros((validatnSetSize * 2, numFeatures))
    Xt = np.zeros((testSet1size + testSet2size, numFeatures))
    y = np.array([0] * trainingSetSize + [1] * trainingSetSize)
    yv = np.array([0] * validatnSetSize + [1] * validatnSetSize)
    yt = np.array([0] * testSet1size + [1] * testSet2size)

    def distribute(data, trainQuota, validQuota, trnIdx, vldIdx, tstIdx):
        # Route every item to train / validation / test and return the
        # advanced write positions.
        for item in data:
            year = item[0]
            if trainQuota[year] > 0:
                X[trnIdx] = item[1:]
                trnIdx += 1
                trainQuota[year] -= 1
            elif validQuota[year] > 0:
                Xv[vldIdx] = item[1:]
                vldIdx += 1
                validQuota[year] -= 1
            else:
                Xt[tstIdx] = item[1:]
                tstIdx += 1
        return trnIdx, vldIdx, tstIdx

    trnIdx, vldIdx, tstIdx = distribute(data1, trainQuota1, validQuota1, 0, 0, 0)
    assert trnIdx==trainingSetSize and vldIdx==validatnSetSize and tstIdx==testSet1size
    trnIdx, vldIdx, tstIdx = distribute(data2, trainQuota2, validQuota2, trnIdx, vldIdx, tstIdx)
    assert trnIdx==trainingSetSize*2 and vldIdx==validatnSetSize*2 and tstIdx==testSet1size+testSet2size

    X, y = shuffle(X, y)  # Just in case... perhaps no reason to shuffle again here?
    selector = SelectKBest(f_classif, k = numFeatures)  # TODO: try other feature selection methods?
    selector.fit(np.concatenate((X, Xv)), np.concatenate((y, yv)))
    return X, y, Xv, yv, Xt, yt, testSet1size, testSet2size, selector.scores_
示例5: compute_distances_and_pairs
def compute_distances_and_pairs(self, pdb_file, nr_contacts=None, nr_noncontacts=None):
    """Compute the Cb distance map and select contact / non-contact pairs.

    Stores ``'Cbdist'``, ``'contact'`` and ``'nocontact'`` under
    ``self.features['pair']`` and sets ``self.ij_ind_upper`` to the row and
    column indices of the selected pairs (contacts first, then non-contacts).

    Args:
        pdb_file: Passed to ``pdb.distance_map`` together with ``self.L``.
        nr_contacts: If truthy, keep only this many shuffled contact pairs.
        nr_noncontacts: If truthy, keep only this many shuffled non-contact pairs.
    """
    pair_features = self.features['pair']

    # distance and contacts
    pair_features['Cbdist'] = pdb.distance_map(pdb_file, self.L)
    distances = pair_features['Cbdist']

    # mask positions that have too many gaps
    gap_freq = 1 - (self.Ni / self.neff)
    highly_gapped_pos = np.where(gap_freq > self.max_gap_percentage)[0]
    distances[:, highly_gapped_pos] = np.nan
    distances[highly_gapped_pos, :] = np.nan

    # if there are unresolved residues, there will be nan in the distance_map
    with np.errstate(invalid='ignore'):
        pair_features['contact'] = (pair_features['Cbdist'] <= self.contact_threshold) * 1
        pair_features['nocontact'] = (pair_features['Cbdist'] > self.non_contact_threshold) * 1

    def select_pairs(indicator, limit):
        # Upper-triangle pairs at or beyond the sequence separation, in a
        # reproducible shuffled order, optionally truncated to `limit`.
        rows, cols = np.where(np.triu(indicator, k=self.seq_separation))
        rows, cols = shuffle(rows, cols, random_state=0)
        if limit:
            rows, cols = rows[:limit], cols[:limit]
        return rows, cols

    contact_i, contact_j = select_pairs(pair_features['contact'], nr_contacts)
    nocontact_i, nocontact_j = select_pairs(pair_features['nocontact'], nr_noncontacts)

    # update indices of i<j for only relevant pairs
    all_i = np.array(list(contact_i) + list(nocontact_i))
    all_j = np.array(list(contact_j) + list(nocontact_j))
    self.ij_ind_upper = all_i, all_j
示例6: get_aa_cross_val
def get_aa_cross_val(L, X, Y, AA, tsize=None, rstate=-1):
    """Get train and test index arrays, holding out the samples for ``AA``.

    Every index ``i`` with ``L[i][-1] == AA`` goes to the test set. If
    ``tsize`` is given and the test set is smaller than
    ``int(tsize * len(Y))``, it is topped up with randomly chosen indices
    where ``Y == 0`` and the sample is not an ``AA`` sample.

    Args:
        L: Sequence whose items are indexable; only the last element of each
            item is compared with ``AA``.
        X: Unused; kept for interface compatibility.
        Y: NumPy array of targets (indexed with flat positions, so it is
            assumed to be 1-D -- TODO confirm).
        AA: Value identifying the held-out group.
        tsize: Optional fraction of ``len(Y)`` the test set should reach.
        rstate: When ``> 0``, both index arrays are additionally shuffled
            with this value as ``random_state``.

    Returns:
        ``(train_position, test_position)`` as integer index arrays.

    Raises:
        ValueError: From ``np.random.choice`` when more top-up indices are
            requested than there are candidates.
    """
    test_position = []
    aa_y = np.zeros(Y.shape)
    # range() instead of the Python-2-only xrange() keeps this runnable on 3.
    for i in range(len(Y)):
        if L[i][-1] == AA:
            aa_y[i] = 1
            test_position.append(i)

    if tsize:
        t_len = int(tsize * len(Y))
        # positions that are 0 without being the one for AA
        zero_pos = np.where(np.logical_and(Y == 0, aa_y == 0))[0]
        clen = t_len - len(test_position)
        if clen > 0:
            random_zero_pos = np.random.choice(zero_pos, clen, replace=False)
            test_position.extend(random_zero_pos)

    # Force an integer dtype: permuting an empty list yields a float array,
    # which NumPy rejects as an index.
    # NOTE(review): the source lost its indentation; the permutation is
    # applied unconditionally here -- confirm against the upstream project.
    test_position = np.random.permutation(np.asarray(test_position, dtype=int))
    mask = np.ones(Y.shape, dtype=bool)
    mask[test_position] = False
    train_position = np.arange(len(mask))[mask]

    if rstate > 0:
        return shuffle(train_position, random_state=rstate), shuffle(test_position, random_state=rstate)
    # in this case, suppose we want only the train and test index
    return train_position, test_position
示例7: generator3
def generator3(samples, batch_size=32):
    """Yield shuffled ``(images, steering_angles)`` batches forever.

    Each sample contributes three images (center, left, right), read from
    ``path`` plus the file name taken from ``sample[0..2]``, and three
    angles derived from ``float(sample[3])`` with a fixed +/- correction
    for the side cameras. A batch therefore holds up to
    ``3 * batch_size`` images.

    Args:
        samples: Sequence of records; fields 0-2 are backslash-separated
            image paths and field 3 is the steering angle.
        batch_size: Number of samples (not images) per yielded batch.

    Yields:
        The result of ``shuffle(X, y)`` for the NumPy arrays of the batch.
        With no samples the generator finishes immediately instead of
        spinning forever.
    """
    num_samples = len(samples)
    if num_samples == 0:
        # Nothing to yield; the endless loop below would never produce a batch.
        return
    correction = 0.3  # this is a parameter to tune
    while 1:  # Loop forever so the generator never terminates
        # shuffle() returns a shuffled copy, so the result must be bound
        # back to get a new order each epoch (the caller's list is untouched).
        samples = shuffle(samples)
        for offset in range(0, num_samples, batch_size):
            batch_samples = samples[offset:offset+batch_size]

            car_images = []
            steering_angles = []
            for batch_sample in batch_samples:
                img_center = cv2.imread(path+batch_sample[0].split('\\')[-1])
                img_left = cv2.imread(path+batch_sample[1].split('\\')[-1])
                img_right = cv2.imread(path+batch_sample[2].split('\\')[-1])

                steering_center = float(batch_sample[3])
                steering_left = steering_center + correction
                steering_right = steering_center - correction

                # add images and angles to data set
                car_images.extend([img_center, img_left, img_right])
                steering_angles.extend([steering_center, steering_left, steering_right])

            X_train = np.array(car_images)
            y_train = np.array(steering_angles)
            yield shuffle(X_train, y_train)
示例8: import_images
def import_images():
    """Cluster ORB descriptors of sampled dog and cat images and pickle the centers.

    400 paths are drawn (with ``random_state=0``) from each of the dog and
    cat path lists returned by ``get_filenames(TRAINING_FOLDER)``. The
    descriptors of every image are pooled per class, a 200-cluster KMeans is
    fitted per class, and the cluster centers are pickled to fixed paths
    under ``/home/max/CVD/``.

    Returns:
        The literal status string ``'\\n\\n\\n DONE '``.
    """
    # TODO: implement a timer cutoff for when feature extraction takes too long

    def collect_descriptors(paths):
        # Pool the descriptors of 400 sampled images; images whose
        # extraction result is not iterable are reported and skipped.
        descriptors = []
        for image_fn in shuffle(paths, n_samples = 400, random_state=0):
            odesc_pts = extract_desc_pts(image_fn)
            try:
                for pt in odesc_pts:
                    descriptors.append(pt)
            except TypeError:
                print(image_fn)
                continue
        return descriptors

    (cat_paths, dog_paths) = get_filenames(TRAINING_FOLDER)
    dog_descriptors = collect_descriptors(dog_paths)
    cat_descriptors = collect_descriptors(cat_paths)

    cat_k_means = KMeans(n_jobs=-1, n_clusters=200)
    cat_k_means.fit(cat_descriptors)
    print('dog calc')
    dog_k_means = KMeans(n_jobs=-1, n_clusters=200)
    dog_k_means.fit(dog_descriptors)

    print('saving....')
    with open('/home/max/CVD/d_o200c200s400.pickle', 'wb') as handle:
        pickle.dump(dog_k_means.cluster_centers_, handle)
    with open('/home/max/CVD/c_o200c200s400.pickle', 'wb') as handle:
        pickle.dump(cat_k_means.cluster_centers_, handle)
    return '\n\n\n DONE '
def generate_training_data(image_paths, angles, batch_size=128, validation_flag=False):
    '''
    Endless batch generator for model training.

    Each image is loaded and preprocessed; unless 'validation_flag' is set it
    is also randomly distorted. Whenever the magnitude of the resulting angle
    exceeds 0.33, a horizontally flipped copy with the inverted angle is added
    as well, giving more weight to larger turning angles. Every time
    'batch_size' items have been collected they are yielded as a pair of
    numpy arrays and the paths/angles are reshuffled.
    '''
    image_paths, angles = shuffle(image_paths, angles)
    batch_images, batch_angles = [], []
    while True:
        # Index-based access is deliberate: image_paths/angles are rebound by
        # the reshuffles below while this loop is still running.
        for idx in range(len(angles)):
            angle = angles[idx]
            img = preprocess_image(cv2.imread(image_paths[idx]))
            if not validation_flag:
                img, angle = random_distort(img, angle)

            batch_images.append(img)
            batch_angles.append(angle)
            if len(batch_images) == batch_size:
                yield (np.array(batch_images), np.array(batch_angles))
                batch_images, batch_angles = [], []
                image_paths, angles = shuffle(image_paths, angles)

            # flip horizontally and invert steer angle, if magnitude is > 0.33
            if abs(angle) > 0.33:
                img = cv2.flip(img, 1)
                angle = angle * -1
                batch_images.append(img)
                batch_angles.append(angle)
                if len(batch_images) == batch_size:
                    yield (np.array(batch_images), np.array(batch_angles))
                    batch_images, batch_angles = [], []
                    image_paths, angles = shuffle(image_paths, angles)
示例10: splitIntoTrainingValidation
def splitIntoTrainingValidation(A, B): # TODO: 3rd parameter: the desired value of (validatSet1size + validatSet2size)
    """Split source sets ``A`` and ``B`` into training and validation data.

    Both sets are shuffled. Each item is routed by its first element
    (``item[0]``, used as an index into the quota array): into the training
    set while that quota lasts, otherwise into the validation set. The quota
    is the element-wise minimum of the two frequency arrays, scaled down and
    rounded when its total exceeds ``maxTrainSetSz``. Items from ``A`` get
    label 0, items from ``B`` label 1.

    Returns:
        ``(X, y, Xv, yv, validatSet1size, validatSet2size)``.
    """
    # Note this is a random shuffle, that's why we need many iterations
    data1 = shuffle(sourceSets[A])
    data2 = shuffle(sourceSets[B])

    quota1 = np.minimum(freqs[A], freqs[B])
    if sum(quota1) > maxTrainSetSz:
        quota1 = np.round(quota1 * (maxTrainSetSz * 1.0 / sum(quota1)))
    quota2 = np.copy(quota1)

    trainingSetSize = int(sum(quota1))  # Half size actually. Approximately <= maxTrainSetSz
    validatSet1size = len(data1) - trainingSetSize
    validatSet2size = len(data2) - trainingSetSize

    X = np.zeros((trainingSetSize * 2, numFeatures))
    Xv = np.zeros((validatSet1size + validatSet2size, numFeatures))
    y = np.array([0] * trainingSetSize + [1] * trainingSetSize)
    yv = np.array([0] * validatSet1size + [1] * validatSet2size)

    def distribute(data, quota, trnIdx, valIdx):
        # Fill training rows while the per-index quota lasts, validation
        # rows afterwards; return the advanced write positions.
        for item in data:
            year = item[0]
            if quota[year] > 0:
                quota[year] -= 1
                X[trnIdx] = item[1:]
                trnIdx += 1
            else:
                Xv[valIdx] = item[1:]
                valIdx += 1
        return trnIdx, valIdx

    trnIdx, valIdx = distribute(data1, quota1, 0, 0)
    assert trnIdx==trainingSetSize and valIdx==validatSet1size
    trnIdx, valIdx = distribute(data2, quota2, trnIdx, valIdx)
    assert trnIdx==trainingSetSize*2 and valIdx==validatSet1size+validatSet2size
    return X, y, Xv, yv, validatSet1size, validatSet2size
示例11: cluster
def cluster(m, n_colors=32):
    """Quantize an image with a randomly sampled colour codebook.

    ``n_colors + 1`` pixels are drawn (``random_state=0``) from the image to
    form the codebook, and every pixel is replaced by its nearest codebook
    entry.

    Args:
        m: Image array of shape ``(w, h, d)``.
        n_colors: The codebook holds ``n_colors + 1`` sampled pixels.

    Returns:
        A float64 array of shape ``(w, h, d)`` with the quantized image.
    """
    from sklearn.utils import shuffle
    from sklearn.metrics import pairwise_distances_argmin

    def recreate_image(codebook, labels, w, h):
        """Recreate the (compressed) image from the code book & labels"""
        d = codebook.shape[1]
        # One vectorised lookup replaces the per-pixel Python loop; the cast
        # keeps the float64 result the zero-initialised buffer used to give.
        return codebook[labels].reshape(w, h, d).astype(np.float64)

    # Load Image and transform to a 2D numpy array.
    w, h, d = tuple(m.shape)
    image_array = np.reshape(m, (w * h, d))

    # NOTE(review): a KMeans model used to be fitted on a 1000-pixel sample
    # here but its result was never used, so the fit has been dropped.
    codebook_random = shuffle(image_array, random_state=0)[:n_colors + 1]
    labels_random = pairwise_distances_argmin(codebook_random, image_array, axis=0)
    return recreate_image(codebook_random, labels_random, w, h)
示例12: main
def main():
    """Train a 5-class RecursiveNN on a small random PTB subset and print scores.

    Entries whose ``t[3][-1]`` is negative are dropped before 1000 training
    and 100 test samples are drawn at random. The model is fitted for 3
    epochs with a ReLU activation.
    """
    def prepare(trees):
        # Index every tree, flatten it to list form, then filter.
        for tree in trees:
            add_idx_to_tree(tree, 0)
        samples = [tree2list(tree, -1, True) for tree in trees]
        # for filtering binary labels
        return [s for s in samples if s[3][-1] >= 0]

    train, test, word2idx = get_ptb_data()
    train = prepare(train)
    test = prepare(test)

    train = shuffle(train)[:1000]
    test = shuffle(test)[:100]

    V = len(word2idx)
    # Single-argument print calls emit the same text on Python 2 and 3.
    print("vocab size: %s" % (V,))
    D = 80
    K = 5

    model = RecursiveNN(V, D, K)
    model.fit(train, epochs=3, activation=T.nnet.relu)
    print("train accuracy: %s" % (model.score(train),))
    print("test accuracy: %s" % (model.score(test),))
    print("train f1: %s" % (model.f1_score(train),))
    print("test f1: %s" % (model.f1_score(test),))
def process_data():
    """Load, centre, reshape and shuffle the training and test data.

    Both image sets are converted to float64 and the per-feature mean of the
    training set is subtracted from each. Images are reshaped to
    ``(-1, 1, img_dim, img_dim)``, labels are decremented by one
    (presumably to make them zero-based -- TODO confirm), and each set is
    shuffled together with its labels.

    Side effects:
        Sets the module-level ``num_train`` and ``num_test`` and prints the
        resulting shapes.

    Returns:
        ``(X_train, Y_train, X_test, Y_test)``.
    """
    global num_classes, num_train, num_test

    X_train , Y_train = load_data('Train')
    X_test , Y_test = load_data('Test')

    X_train = X_train.astype(np.float64)
    X_test = X_test.astype(np.float64)
    num_train = X_train.shape[0]
    num_test = X_test.shape[0]

    # Centre both sets with statistics from the training set only.
    mean_image = np.mean(X_train,axis=0)
    X_train -= mean_image
    X_test -= mean_image

    X_train = X_train.reshape(-1, 1, img_dim, img_dim)
    Y_train -= 1
    X_train , Y_train = shuffle(X_train, Y_train)

    X_test = X_test.reshape(-1, 1, img_dim, img_dim)
    Y_test -= 1
    X_test , Y_test = shuffle(X_test, Y_test)

    # Single-argument print calls emit the same text on Python 2 and 3.
    print('Training X shape :-  %s' % (X_train.shape,))
    print('Training Y shape :-  %s' % (Y_train.shape,))
    print('Testing X shape :-  %s' % (X_test.shape,))
    print('Testing Y shape :-  %s' % (Y_test.shape,))

    return X_train, Y_train, X_test, Y_test
示例14: frames2batch
def frames2batch(k = 12,batch_size = 1024, is_calib = False):
    """Convert positive and negative frames into shuffled batches saved with savez.

    File paths are collected from the fixed ``pos`` and ``neg`` folders,
    shuffled, and each frame is resized to ``(k, k)`` and turned into a
    vector. ``(vector, label)`` pairs are accumulated and saved to the batch
    folder as ``<index>_<k>_net`` (or ``<index>_<k>_calib-net`` when
    ``is_calib`` is true) every ``batch_size`` items; the remaining partial
    batch is saved at the end.

    Args:
        k: Edge length the frames are resized to.
        batch_size: Number of samples per saved batch.
        is_calib: Selects the ``calib-`` variant of the file name.
    """
    pos = util.get_files(rootdir = 'F:\\train_data\\pos\\')
    neg = util.get_files(rootdir = 'F:\\train_data\\neg\\')
    pos = shuffle(pos)
    neg = shuffle(neg)
    total = shuffle(pos + neg)

    bpath = 'F:\\train_data\\batch\\'
    # One naming scheme for every batch (the one the in-loop save used).
    suffix = '_' + str(k) + ('_' if not is_calib else '_calib-') + 'net'

    def save_batch(samples, index):
        # Persist one batch under its running index.
        sp.savez(bpath + str(index) + suffix, sp.array(samples))

    batch = []
    c = 0
    for item_path in total:
        frame = fr.get_frame(item_path)
        frame_r = fr.resize_frame(frame,(k,k))
        # Identity test: "== None" on an array compares element-wise.
        if frame_r is None:
            continue
        vec = fr.frame_to_vect(frame_r)
        # NOTE(review): "> 0" labels a file name that *starts* with 'pos' as
        # negative (find() returns 0) -- confirm against the file naming.
        label = 1 if item_path.split('\\')[-1].find('pos') > 0 else 0
        print(item_path,label)
        batch.append((vec,label))
        if len(batch) >= batch_size:
            save_batch(batch, c)
            batch = []
            c += 1
    # Flush the trailing partial batch; the old condition required a full
    # batch here, which can never be left over, so the remainder was lost.
    if len(batch) > 0:
        save_batch(batch, c)
        batch = []
        c += 1
示例15: getMNIST
def getMNIST():
    """Load MNIST, downloading it if necessary, and return shuffled subsets.

    The archive ``mnist.pkl.gz`` is looked up in ``../large_files/`` (or the
    current directory when that folder does not exist) and downloaded when
    missing. Training and test sets are shuffled and truncated to 30000 and
    1000 samples; the validation split is loaded but not returned.

    Returns:
        ``(Xtrain, Ytrain, Ytrain_ind, Xtest, Ytest, Ytest_ind)`` where the
        images are reshaped to ``(N, 1, 28, 28)`` and the ``*_ind`` arrays
        are ``y2indicator`` of the returned (shuffled, truncated) labels.
    """
    # data shape: train (50000, 784), test (10000, 784)
    # already scaled from 0..1 and converted to float32
    datadir = '../large_files/'
    if not os.path.exists(datadir):
        datadir = ''

    input_file = "%smnist.pkl.gz" % datadir
    if not os.path.exists(input_file):
        url = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
        # Fetch first and create the file only afterwards, so a failed
        # download does not leave an empty file that later runs would trust.
        response = urllib2.urlopen(url)
        try:
            payload = response.read()
        finally:
            response.close()
        with open(input_file, "wb") as out:
            out.write(payload)

    # NOTE(review): this unpickles data fetched over plain HTTP; a tampered
    # archive could execute arbitrary code on load.
    with gzip.open(input_file) as f:
        train, valid, test = cPickle.load(f)

    Xtrain, Ytrain = train
    Xtest, Ytest = test

    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Xtest, Ytest = shuffle(Xtest, Ytest)

    # try to take a smaller sample
    Xtrain = Xtrain[0:30000]
    Ytrain = Ytrain[0:30000]
    Xtest = Xtest[0:1000]
    Ytest = Ytest[0:1000]

    # Build the indicator matrices only now, so their rows line up with the
    # shuffled and truncated labels that are returned.
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    return Xtrain.reshape(len(Xtrain), 1, 28, 28), Ytrain, Ytrain_ind, Xtest.reshape(len(Xtest), 1, 28, 28), Ytest, Ytest_ind