本文整理汇总了Python中sklearn.utils.resample函数的典型用法代码示例。如果您正苦于以下问题:Python resample函数的具体用法?Python resample怎么用?Python resample使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了resample函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: initialize
def initialize(self, X, k, random_seed, method='naive'):
    """Pick the ``k`` starting centroids for k-means from the rows of ``X``.

    Parameters
    ----------
    X : array-like
        Data points. Rows are accessed as ``X[j]`` and subtracted from
        centroids, so a NumPy array is expected.
    k : int
        Number of centroids to select.
    random_seed : int, numpy.random.RandomState or None
        Seed forwarded to ``resample``. For ``'kmeans++'`` the same generator
        also drives the weighted draws, so a fixed seed reproduces the whole
        seeding (previously those draws used the global NumPy generator).
    method : {'naive', 'kmeans++'}
        ``'naive'`` draws ``k`` distinct points uniformly at random.
        ``'kmeans++'`` follows https://en.wikipedia.org/wiki/K-means%2B%2B.

    Returns
    -------
    The selected centroids, as returned by ``resample`` / ``np.append``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method not in ('naive', 'kmeans++'):
        raise ValueError("Unknown initialization method: %r" % (method,))

    if method == 'naive':
        # Randomly pick k data points to be the centroids of the k clusters
        return resample(X, n_samples=k, random_state=random_seed, replace=False)

    # Use one generator for every random decision below. With no seed we keep
    # relying on NumPy's global generator, exactly as before.
    if random_seed is None:
        generator = None
        draw = np.random.choice
    else:
        if isinstance(random_seed, np.random.RandomState):
            generator = random_seed
        else:
            generator = np.random.RandomState(random_seed)
        draw = generator.choice

    # Step 1: choose one center uniformly at random from among the data points
    centroids = resample(X, n_samples=1, random_state=generator, replace=False)
    n_points = len(X)

    # Steps 2-3, repeated for the remaining k - 1 centroids
    for _ in range(1, k):
        # D(x)^2: squared distance from each point to its nearest chosen center
        squared = np.array([
            min(np.linalg.norm(X[j] - centroid) for centroid in centroids)
            for j in range(n_points)
        ]) ** 2
        total = squared.sum()
        if total == 0:
            # Every point coincides with a chosen center, so the weights are
            # undefined; fall back to a uniform draw instead of dividing by 0.
            probabilities = None
        else:
            # Already selected points get probability 0 and are thus excluded
            probabilities = squared / total
        new_centroid_index = draw(range(n_points), size=1, replace=False, p=probabilities)[0]
        centroids = np.append(centroids, [X[new_centroid_index]], axis=0)
    return centroids
示例2: test_resample
def test_resample():
    """Exercise ``resample`` on its border case and on invalid arguments."""
    # Border case not worth mentioning in doctests: no arrays at all
    assert resample() is None

    # Each (args, kwargs) pair below must be rejected with ValueError:
    # inconsistent lengths, too many samples without replacement, and an
    # unknown keyword argument.
    invalid_calls = (
        (([0], [0, 1]), {}),
        (([0, 1], [0, 1]), {'replace': False, 'n_samples': 3}),
        (([0, 1], [0, 1]), {'meaning_of_life': 42}),
    )
    for args, kwargs in invalid_calls:
        assert_raises(ValueError, resample, *args, **kwargs)

    # Issue:6581, n_samples can be more when replace is True (default).
    oversampled = resample([1, 2], n_samples=5)
    assert_equal(len(oversampled), 5)
示例3: run_scikit_digits
def run_scikit_digits(epochs=0, layers=0, neuron_count=0):
    """Train a network on scikit-learn's handwritten digits and run it on unseen data.

    The digit vectors and their targets are resampled together with a fixed
    seed, so both stay aligned. The first 1250 samples train the network and
    the next 260 form the test split. Whatever remains (the validation share
    of the intended 70/15/15 split) is not used by this function. The trained
    network is serialized to ``my_net.pickle`` with ``dill``.

    Parameters
    ----------
    epochs : int
        Number of iterations of the training loop for the whole dataset
    layers : int
        Number of layers (not counting the input layer, but does count output
        layer)
    neuron_count : list
        The number of neurons in each of the layers (in order), does not count
        the bias term

    Returns
    -------
    The result of ``network.run_unseen`` on the test split.
    """
    raw_digits = datasets.load_digits()
    # One call resamples both arrays with the same indices; this matches two
    # separate calls that share a random_state.
    digits, all_answers = utils.resample(
        raw_digits.data, raw_digits.target, random_state=3
    )

    num_of_training_vectors = 1250
    num_of_testing_vectors = 260
    testing_end = num_of_training_vectors + num_of_testing_vectors

    training_set = digits[:num_of_training_vectors]
    answers = all_answers[:num_of_training_vectors]
    testing_set = digits[num_of_training_vectors:testing_end]

    network = Network(layers, neuron_count, training_set[0])
    network.train(training_set, answers, epochs)

    # The context manager closes the file even if serialization fails
    with open("my_net.pickle", "wb") as f:
        dill.dump(network, f)

    return network.run_unseen(testing_set)
示例4: resample_training_dataset
def resample_training_dataset(self, labels, feature_array, sizes=(5000, 500)):
    """Resample each class to a requested number of training chunks.

    Inputs:
        - labels: array of class labels, 1-D or a single column
        - feature_array: 2-D array with one row per label
        - sizes: tuple, for each class (0, 1, etc.) the number of training
          chunks you want, in the order of the sorted unique labels.
          i.e. for 500 seizures and 5000 baseline, sizes = (5000, 500),
          as 0 is baseline and 1 is seizure.

    Classes with fewer rows than requested are over-sampled: whole copies of
    the class are stacked and the remainder is drawn without replacement.
    Larger classes are under-sampled without replacement. Classes that
    already have the requested size are passed through unchanged.

    Returns:
        (resampled_labels, resampled_features), stacked class by class.

    WARNING: Up-sampling target class prevents random forest oob from being accurate.
    """
    if len(labels.shape) == 1:
        labels = labels[:, None]

    resampled_labels = []
    resampled_features = []
    for i, label in enumerate(np.unique(labels.astype('int'))):
        class_inds = np.where(labels == label)[0]
        class_labels = labels[class_inds]
        class_features = feature_array[class_inds, :]
        n_available = class_features.shape[0]
        n_wanted = sizes[i]

        if n_available < n_wanted:  # need to oversample
            n_copies = int(n_wanted // n_available)
            n_extra_needed = n_wanted - n_copies * n_available
            # Features and labels are drawn together so rows stay paired
            extra_features, extra_labels = resample(
                class_features, class_labels,
                n_samples=n_extra_needed, random_state=7, replace=False)
            boot_array = np.vstack([class_features] * n_copies + [extra_features])
            boot_labels = np.vstack([class_labels] * n_copies + [extra_labels])
        elif n_available > n_wanted:  # need to undersample
            boot_array, boot_labels = resample(
                class_features, class_labels,
                n_samples=n_wanted, random_state=7, replace=False)
        else:
            logging.debug('label %s had exact n as sample, doing nothing!', label)
            boot_array = class_features
            boot_labels = class_labels

        resampled_features.append(boot_array)
        resampled_labels.append(boot_labels)

    # stack both up...
    resampled_labels = np.vstack(resampled_labels)
    resampled_features = np.vstack(resampled_features)
    logging.debug('Original label counts: %s', pd.Series(labels[:, 0]).value_counts())
    logging.debug('Resampled label counts: %s', pd.Series(resampled_labels[:, 0]).value_counts())
    return resampled_labels, resampled_features
示例5: run_mnist
def run_mnist(epochs, layers, neuron_count):
    """Train on the MNIST ``train.csv`` and write guesses for ``test.csv``.

    Both CSV files are read from the working directory and their first line
    is treated as a header. One guess per line is written to ``digits.txt``.

    Parameters
    ----------
    epochs : int
        Number of iterations of the training loop for the whole dataset
    layers : int
        Number of layers (not counting the input layer, but does count output
        layer)
    neuron_count : list
        The number of neurons in each of the layers (in order), does not count
        the bias term
    """
    def read_int_rows(path):
        # Parse every row after the header line into a list of ints.
        with open(path, 'r') as handle:
            rows = list(csv.reader(handle))
        return [[int(cell) for cell in row] for row in rows[1:]]

    train = read_int_rows('train.csv')
    test_set = read_int_rows('test.csv')

    # The first column is the answer, the rest are the input values.
    ans_train = [row[0] for row in train]
    train_set = [row[1:] for row in train]
    # NOTE(review): this drops the first *data* row as well (the header was
    # already skipped above) - confirm whether that is intended.
    del ans_train[0]
    del train_set[0]

    # Same seed for both calls keeps inputs and answers aligned.
    train_set = utils.resample(train_set, random_state=2)
    ans_train = utils.resample(ans_train, random_state=2)

    network = Network(layers, neuron_count, train_set[0])
    network.train(train_set, ans_train, epochs)

    guess_list = network.run_unseen(test_set)
    with open('digits.txt', 'w') as out:
        out.writelines(str(guess) + '\n' for guess in guess_list)
示例6: test_resample_stratified
def test_resample_stratified():
    """Make sure ``resample`` can stratify on an imbalanced binary target."""
    rng = np.random.RandomState(0)
    n_samples = 100
    p = .9
    features = rng.normal(size=(n_samples, 1))
    targets = rng.binomial(1, p, size=n_samples)

    shared_kwargs = dict(n_samples=10, random_state=0)

    # Without stratification this seed happens to draw only the majority class
    plain_targets = resample(features, targets, stratify=None, **shared_kwargs)[1]
    assert (plain_targets == 1).all()

    # With stratification the 9:1 class ratio is preserved in the sample
    stratified_targets = resample(features, targets, stratify=targets, **shared_kwargs)[1]
    assert not (stratified_targets == 1).all()
    assert stratified_targets.sum() == 9  # all 1s, one 0
示例7: eval_prox_random
def eval_prox_random(self, n_sample_node=5, sample_nodes=None):
    """Compare coordinate-based proximity with ground proximity on sample nodes.

    Parameters
    ----------
    n_sample_node : int
        Number of nodes to draw with ``resample`` when ``sample_nodes`` is
        empty or not given.
    sample_nodes : sequence of str or int, optional
        Explicit nodes to evaluate. Strings are used as node names; integers
        (including NumPy integers) are positions into ``self.cs.nodes()``.
        Any other element type yields an empty node selection, as before.

    Returns
    -------
    dict
        ``"nae"``: Series of ``abs(c - g) / g`` for every flattened pair of
        coordinate-based (``c``) and ground (``g``) proximity values.
        ``"nae_plot"``: Series of evenly spaced values in [0, 1] indexed by
        the sorted ``nae`` values.
    """
    cs = self.cs
    nodes = cs.nodes()

    test_nodes = []
    if sample_nodes is not None and len(sample_nodes):
        first = sample_nodes[0]
        if isinstance(first, str):
            test_nodes = sample_nodes
        elif isinstance(first, (int, np.integer)):
            test_nodes = [nodes[i] for i in sample_nodes]
    else:
        test_nodes = resample(nodes, n_samples=n_sample_node)

    # nae of coordinate-based proximity vs ground-proximity
    coor_test = self.coor_all[test_nodes]
    # ``.values`` replaces ``.as_matrix()``, which newer pandas no longer has
    ground_prox = (
        cs.proximity_to(sources=test_nodes, dests=cs.nodes()).values.transpose()
    )  # shape: test_nodes x all_nodes
    coor_prox = np.dot(coor_test.values.transpose(), self.coor_all.values)
    nae = pd.Series.combine(
        pd.Series(coor_prox.flatten()),
        pd.Series(ground_prox.flatten()),
        lambda c, g: abs(c - g) / g,
    )
    # ``sort_values`` replaces ``Series.order``, which newer pandas no longer has
    nae_plot = pd.Series(np.linspace(0.0, 1.0, num=len(nae)), index=nae.sort_values())

    return {"nae": nae, "nae_plot": nae_plot}
示例8: bootstrap_auc
def bootstrap_auc(df, col, pred_col, n_bootstrap=1000):
    """
    Calculate the bootstrapped AUC for a given col trying to predict a pred_col.

    Rows where ``col`` is NaN are ignored. The caller's DataFrame is left
    untouched (it was previously modified in place).

    Parameters
    ----------
    df : pandas.DataFrame
    col : str
        column to retrieve the values from
    pred_col : str
        the column we're trying to predict
    n_bootstrap : int
        the number of bootstrap samples

    Returns
    -------
    numpy.ndarray : AUCs for each sampling. Entries stay at 0.0 for
        bootstrap samples that ``is_single_class`` rejects, because no AUC
        is computed for those.
    """
    scores = np.zeros(n_bootstrap)
    old_len = len(df)
    # Work on a filtered copy so the caller's frame keeps all of its rows
    df = df.dropna(subset=[col])
    new_len = len(df)
    if new_len < old_len:
        logger.info("Dropping NaN values in %s to go from %d to %d rows", col, old_len, new_len)

    values = df[col]
    preds = df[pred_col].astype(int)
    for i in range(n_bootstrap):
        sampled_counts, sampled_pred = resample(values, preds)
        if is_single_class(sampled_pred, col=pred_col):
            continue
        scores[i] = roc_auc_score(sampled_pred, sampled_counts)
    return scores
示例9: fit
def fit(self, dataSet):
    """Fit every estimator in ``self.forest`` on its own resampled copy of ``dataSet``.

    Each row of ``dataSet`` holds the target in its first position and the
    input values in the remaining positions.
    """
    for estimator in self.forest:
        targets = []
        inputs = []
        # A fresh resample per estimator, split row by row into target/inputs
        for row in resample(dataSet):
            targets.append(row[0])
            inputs.append(row[1:])
        estimator.fit(inputs, targets)
示例10: boot_estimates
def boot_estimates(model, X, y, nboot):
    '''
    Evaluate coefficient estimates for nboot bootstrap samples.

    For each of the ``nboot`` resamples of ``(X, y)`` the model is fitted
    once, and its intercept and flattened coefficients are concatenated into
    one row.

    Returns a 2-D array with one row per bootstrap sample. ``np.vstack``
    raises ``ValueError`` when ``nboot`` is 0, as there is nothing to stack.
    '''
    coefs = []
    for _ in range(nboot):
        boot_X, boot_y = resample(X, y)
        # Fit once per sample so intercept and coefficients come from the
        # same fit (the model used to be refitted for each attribute).
        fitted = model.fit(boot_X, boot_y)
        coefs.append(np.hstack([fitted.intercept_, fitted.coef_.ravel()]))
    return np.vstack(coefs)
示例11: downsample
def downsample(y, sizes=(30000, 3000)):
    """Draw, for each class, a sample of positions in ``y`` that hold that class.

    Parameters
    ----------
    y : sequence
        Class label of every sample; class ``i`` is matched with ``== i``.
    sizes : sequence of int
        ``sizes[i]`` is the number of positions to draw for class ``i``.

    Returns
    -------
    tuple
        One entry per class, each the result of ``resample`` (with
        replacement) over the positions of that class in ``y``.
    """
    res = []
    for class_i, sz in enumerate(sizes):
        # Collect the positions of this class. The previous filter kept the
        # matching ``True`` flags themselves, so every "index" was ``True``.
        indices = [pos for pos, label in enumerate(y) if label == class_i]
        res.append(resample(indices, replace=True, n_samples=sz))
    return tuple(res)
示例12: run_method_usage
def run_method_usage(methods,cases):
    """Plot how often each method occurs, with bootstrapped error bars.

    ``methods`` is a sequence whose items carry the method name in position 0.
    ``cases`` is resampled 10000 times; each resample contributes the
    column-wise percentage ``100 * sum / len``. The bar chart is saved as
    ``method_occurrence.pdf`` under ``figure_path`` and shown.

    Returns the column order (descending mean percentage) and the mean
    percentages in that order.
    """
    method_names = [entry[0] for entry in methods]
    n_methods = len(method_names)

    # Bootstrap the percentage error bars:
    boot_rows = []
    for _ in range(10000):
        sample = resample(cases)
        boot_rows.append(100*np.sum(sample,axis=0)/len(sample))
    percents = np.array(boot_rows)

    mean_percents = percents.mean(axis=0)
    # 1.96 standard deviations, used as the error-bar half-width
    std_percents = percents.std(axis=0)*1.96
    # Column indices ordered from most to least frequent
    inds = np.argsort(mean_percents).tolist()[::-1]
    avg_usage = np.mean(mean_percents)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    positions = np.arange(n_methods)
    # Faint horizontal line marking the average usage across methods
    ax.plot(positions, [avg_usage]*n_methods, '-', color='0.25', lw=1, alpha=0.2)
    ax.bar(positions, mean_percents[inds], 0.6, color=paired[0], linewidth=0,
           yerr=std_percents[inds], ecolor=paired[1])
    ax.set_ylabel('Occurrence %', fontsize=30)
    ax.set_xlabel('Method', fontsize=30)
    ax.set_xticks(np.arange(n_methods))
    ax.set_xticklabels(np.array(method_names)[inds], fontsize=8)
    fig.autofmt_xdate()
    fix_axes()
    plt.tight_layout()
    fig.savefig(figure_path+'method_occurrence.pdf', bbox_inches=0)
    fig.show()
    return inds, mean_percents[inds]
示例13: balanced_resample
def balanced_resample(data, labels):
    """Do a balanced resampling of data and labels, returning them.

    Every distinct label ends up with as many samples as the most common
    label has in ``labels``. Samples are drawn with replacement.

    Parameters
    ----------
    data : 2-D array
        One row per sample.
    labels : 1-D array
        Label of each row of ``data``.

    Returns
    -------
    (data_resampled, labels_resampled), stacked label by label and cast back
    to the dtypes of the inputs.
    """
    _, mode_count = mstats.mode(labels)
    # The count may come back as a one-element array; extract the scalar
    # explicitly instead of relying on int() of an array.
    num_required = int(np.asarray(mode_count).ravel()[0])

    data_resampled = []
    labels_resampled = []
    for possible_label in np.unique(labels):
        in_this_label = labels == possible_label
        class_data = data[in_this_label]
        class_labels = labels[in_this_label]

        # Smallest whole number of copies that reaches num_required rows.
        # The pool size is kept as before so the drawn indices are unchanged.
        n_copies = -(-num_required // len(class_data))
        data_buffered = np.vstack([class_data] * n_copies)
        labels_buffered = np.hstack([class_labels] * n_copies)

        single_data_resampled, single_labels_resampled = utils.resample(
            data_buffered,
            labels_buffered,
            n_samples=num_required,
            replace=True
        )
        data_resampled.append(single_data_resampled)
        labels_resampled.append(single_labels_resampled)

    return (
        np.vstack(data_resampled).astype(data.dtype),
        np.hstack(labels_resampled).astype(labels.dtype),
    )
示例14: fit
def fit(self, X, Y):
    """Train on ``X`` / ``Y`` using ``self.process`` and ``self.reprocess``.

    Steps, as visible in this method:
    1. Up to 20 examples (fixed seed) are registered in ``self.S`` with
       ``alpha`` 0.0, and ``y`` / ``g`` set to their target value.
    2. At most 5 passes over all examples, calling ``process`` then
       ``reprocess`` for each. The passes stop early once the smallest value
       returned by ``reprocess`` in a pass drops below ``self.tau``.
       Otherwise the visiting order is shuffled for the next pass.
    3. ``reprocess`` is then repeated until it returns less than ``self.tau``.

    Parameters
    ----------
    X : sequence
        Training examples; stored as ``self.data``.
    Y : sequence of numbers
        Target of each example; converted to a float array.
    """
    num_examples = len(X)
    data_indices = np.arange(num_examples)
    self.data = X
    Y = np.array(Y, dtype=float)

    seed_indices = resample(data_indices, replace=False,
                            n_samples=min(20, num_examples), random_state=0)
    for i in seed_indices:
        y = Y[i]
        self.S.add(i)
        self.y[i] = y
        self.alpha[i] = 0.0
        self.g[i] = y

    # ``range`` works on Python 2 and 3. The pass counter gets its own name
    # so the example index no longer overwrites it.
    for _pass in range(5):
        min_delta = float('inf')
        for i in data_indices:
            self.process(i, Y[i])
            delta = self.reprocess()
            min_delta = min(min_delta, delta)
        if min_delta < self.tau:
            break
        data_indices = shuffle(data_indices)

    while True:
        delta = self.reprocess()
        if delta < self.tau:
            break
示例15: test_mnist
def test_mnist(self):
    """Train LASVM to separate digit 0 from the rest and require > 95% accuracy.

    1000 MNIST samples (seed 0) are used for training and 300 samples
    (seed 2) for evaluation. Targets are mapped to +1 for digit 0 and -1
    otherwise.
    """
    mnist = fetch_mldata('MNIST original')

    def draw(count, seed):
        # Sample without replacement and convert to float inputs / +-1 targets
        images, digits = resample(mnist.data, mnist.target, replace=False,
                                  n_samples=count, random_state=seed)
        return images.astype(float), [1 if digit == 0 else -1 for digit in digits]

    X, Y = draw(1000, 0)
    svm = LASVM(C=10, tau=0.001)
    svm.fit(X, Y)

    X_test, Y_test = draw(300, 2)
    Y_predict = svm.predict(X_test)
    percent_correct = np.sum(Y_predict == Y_test) / 300.0
    self.assertGreater(percent_correct, 0.95)