本文整理汇总了Python中theano.compat.python2x.OrderedDict.keys方法的典型用法代码示例。如果您正苦于以下问题:Python OrderedDict.keys方法的具体用法?Python OrderedDict.keys怎么用?Python OrderedDict.keys使用的例子?那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类theano.compat.python2x.OrderedDict的用法示例。
在下文中一共展示了OrderedDict.keys方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
# 需要导入模块: from theano.compat.python2x import OrderedDict [as 别名]
# 或者: from theano.compat.python2x.OrderedDict import keys [as 别名]
def main():
    """Exercise monitor channels and their out-services on a shared matrix.

    Creates an 88x100 zero-initialised shared variable ``W`` whose update is
    ``add_uniform(input=W, noise_level=.02)``, monitors its mean (train and
    valid) and its variance, compiles one Theano function per phase, and
    then runs ten train passes followed by ten valid passes, writing every
    monitored value to the out-service registered under the same name.
    """
    weights = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name='W')
    updates = [(weights, add_uniform(input=weights, noise_level=.02))]

    # Only the statistics that are actually monitored are pulled out; this
    # avoids shadowing the builtins ``min``/``max`` and rebinding the shared
    # variable's name to one of its own statistics.
    stats = get_stats(weights)
    var_stat = stats.pop('var')
    mean_stat = stats.pop('mean')

    mean_monitor = Monitor('mean', mean_stat, train=True, valid=True, out_service=FileService('outs/mean.txt'))
    var_monitor = Monitor('var', var_stat, out_service=FileService('outs/var.txt'))

    w_channel = MonitorsChannel('W', monitors=mean_monitor)
    stat_channel = MonitorsChannel('stats', monitors=[var_monitor])
    monitors = [w_channel, stat_channel]

    # collapse_channels yields (name, expression, out_service) tuples.
    train_collapsed_raw = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(name, expression) for name, expression, _ in train_collapsed_raw])
    train_services = OrderedDict([(name, service) for name, _, service in train_collapsed_raw])

    valid_collapsed_raw = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed_raw])
    valid_services = OrderedDict([(name, service) for name, _, service in valid_collapsed_raw])

    log.debug('compiling...')
    # ``values()`` is a view object on Python 3; hand Theano a real list.
    f = theano.function(inputs=[], outputs=list(train_collapsed.values()), updates=updates)
    f2 = theano.function(inputs=[], outputs=list(valid_collapsed.values()), updates=updates)
    log.debug('done')

    def run_passes(fn, collapsed, services, phase):
        """Call ``fn`` ten times and forward each named output to its service."""
        names = list(collapsed.keys())
        for epoch in range(10):
            t = time.time()
            log.debug(epoch)
            vals = fn()
            m = OrderedDict(zip(names, vals))
            for name, service in services.items():
                if name in m:
                    service.write(m[name], phase)
            log.debug('----- ' + make_time_units_string(time.time() - t))

    t1 = time.time()
    run_passes(f, train_collapsed, train_services, TRAIN)
    run_passes(f2, valid_collapsed, valid_services, VALID)
    log.debug("TOTAL TIME " + make_time_units_string(time.time() - t1))
示例2: main
# 需要导入模块: from theano.compat.python2x import OrderedDict [as 别名]
# 或者: from theano.compat.python2x.OrderedDict import keys [as 别名]
def main():
    """Stream monitored statistics of a shared matrix to a live plot.

    A zero-initialised 88x100 shared variable ``W`` is updated through
    ``add_uniform(input=W, noise_level=.02)`` on every compiled call. Its
    mean (train and valid) and max are monitored, and the monitored values
    are pushed to a ``Plot`` for 100 train passes followed by 100 valid
    passes.
    """
    weights = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name='W')
    updates = [(weights, add_uniform(input=weights, noise_level=.02))]

    # Pop every statistic in the same order as before; only the mean and
    # the max are used below.
    stats = get_stats(weights)
    l1_stat = stats.pop('l1')
    l2_stat = stats.pop('l2')
    min_stat = stats.pop('min')
    max_stat = stats.pop('max')
    var_stat = stats.pop('var')
    std_stat = stats.pop('std')
    mean_stat = stats.pop('mean')

    mean_monitor = Monitor('mean', mean_stat, train=True, valid=True)
    stat_monitor = Monitor('max', max_stat)

    monitors = [
        MonitorsChannel('W', monitors=mean_monitor),
        MonitorsChannel('stats', monitors=[stat_monitor]),
    ]

    # Each collapsed entry is a (name, expression, out_service) tuple; keep
    # only the name -> expression mapping.
    train_raw = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(label, expr) for label, expr, _ in train_raw])
    valid_raw = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(label, expr) for label, expr, _ in valid_raw])

    plot = Plot(bokeh_doc_name='test_plots', monitor_channels=monitors, open_browser=True)

    log.debug('compiling...')
    train_fn = theano.function(inputs=[], outputs=list(train_collapsed.values()), updates=updates)
    valid_fn = theano.function(inputs=[], outputs=list(valid_collapsed.values()), updates=updates)
    log.debug('done')

    started = time.time()
    # Train passes first, then valid passes, exactly 100 of each.
    for compiled, collapsed in ((train_fn, train_collapsed), (valid_fn, valid_collapsed)):
        for epoch in range(100):
            tick = time.time()
            log.debug(epoch)
            outputs = compiled()
            named = OrderedDict(zip(collapsed.keys(), outputs))
            plot.update_plots(epoch, named)
            time.sleep(0.02)
            log.debug('----- ' + make_time_units_string(time.time() - tick))
    log.debug("TOTAL TIME " + make_time_units_string(time.time() - started))
示例3: get_updates
# 需要导入模块: from theano.compat.python2x import OrderedDict [as 别名]
# 或者: from theano.compat.python2x.OrderedDict import keys [as 别名]
def get_updates(self, grads):
    """Build the parameter updates from running averages of squared values.

    For every parameter two zero-initialised shared accumulators are created
    (and appended to ``self.parameters``): a running average of squared
    gradients and a running average of squared steps, both decayed by
    ``self.decay``. The step is ``-RMS(previous steps) / RMS(gradients) *
    gradient`` with ``self.epsilon`` added inside each square root, which is
    the Adadelta update rule.

    Parameters
    ----------
    grads : mapping or iterable of (param, gradient) pairs
        Anything accepted by ``OrderedDict``; keys are shared variables.

    Returns
    -------
    OrderedDict
        Updates for both accumulators and for each parameter itself.
    """
    gradients = OrderedDict(grads)
    updates = OrderedDict()

    def zeros_like_shared(param, prefix):
        # Fresh zero-valued shared variable with the parameter's shape.
        return theano.shared(theano._asarray(param.get_value() * 0., dtype=theano.config.floatX), name=prefix + param.name, borrow=False)

    for param, gradient in gradients.items():
        # E[g^2]_{t-1}
        avg_sq_grad = zeros_like_shared(param, 'mean_square_grad_')
        self.parameters.append(avg_sq_grad)
        # E[(\Delta x)^2]_{t-1}
        avg_sq_step = zeros_like_shared(param, 'mean_square_dx_')
        self.parameters.append(avg_sq_step)

        # E[g^2]_t
        next_avg_sq_grad = self.decay * avg_sq_grad + (1 - self.decay) * T.sqr(gradient)

        # Step scaled by the ratio of the two RMS values.
        rms_prev_step = T.sqrt(avg_sq_step + self.epsilon)
        rms_curr_grad = T.sqrt(next_avg_sq_grad + self.epsilon)
        step = - rms_prev_step / rms_curr_grad * gradient

        # E[(\Delta x)^2]_t
        next_avg_sq_step = self.decay * avg_sq_step + (1 - self.decay) * T.sqr(step)

        updates[avg_sq_grad] = next_avg_sq_grad
        updates[avg_sq_step] = next_avg_sq_step
        updates[param] = param + step
    return updates
示例4: __init__
# 需要导入模块: from theano.compat.python2x import OrderedDict [as 别名]
# 或者: from theano.compat.python2x.OrderedDict import keys [as 别名]
def __init__(self, valid=None, invalid=None, valid_equivalent=None):
    """
    Check if variables can be expressed without using variables in invalid.

    Parameters
    ----------
    valid : iterable, optional
        Nodes that are allowed in the graph computing the outputs.
    invalid : iterable, optional
        Nodes that are NOT allowed in the graph computing the outputs.
    valid_equivalent : mapping, optional
        Maps some invalid variables to valid ones that can be used instead.
        It is copied, so the caller's mapping is never mutated. Its keys are
        added to the invalid set and its values to the valid set.
    """
    equivalents = OrderedDict() if valid_equivalent is None else valid_equivalent

    self.valid = set([] if valid is None else valid)
    self.invalid = set([] if invalid is None else invalid)

    # Keep a private copy of the invalid -> valid mapping.
    self.valid_equivalent = equivalents.copy()

    # Replacements are valid by definition; the things they replace are not.
    self.valid.update(equivalents.values())
    self.invalid.update(equivalents.keys())
示例5: get_funcs
# 需要导入模块: from theano.compat.python2x import OrderedDict [as 别名]
# 或者: from theano.compat.python2x.OrderedDict import keys [as 别名]
def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
    """
    Provides the updates for learning with gradient descent + momentum.

    Two Theano functions are compiled: the first evaluates the cost and
    stores the current gradients in per-parameter shared buffers, the second
    applies the momentum update from those buffers.

    Parameters
    ----------
    learning_rate : float
        Learning rate coefficient.
        NOTE(review): it is passed as the sole input of the second compiled
        function, so it is presumably a symbolic scalar -- confirm.
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    inp : list
        Inputs of the first compiled function.
    cost, errors
        Expressions returned by the first compiled function.
    lr_scalers : dict, optional
        A dictionary mapping from the model's parameters to a learning
        rate multiplier. Parameters that are absent use a multiplier of 1.

    Returns
    -------
    f_grad_shared : function
        Takes ``inp``; returns ``[cost, errors, gnorm, pnorm]`` and stores
        the gradients in the shared buffers.
    f_update : function
        Takes ``learning_rate``; applies the velocity and parameter updates.
    """
    # The default of None used to crash on ``lr_scalers.get``.
    if lr_scalers is None:
        lr_scalers = {}

    params = list(grads.keys())

    # Built from a list of pairs so that the buffer order always matches the
    # iteration order of ``grads`` (a dict comprehension does not guarantee
    # that on older interpreters).
    gshared = OrderedDict([(p, sharedX(p.get_value() * 0.,
                                       name='%s_grad' % p.name))
                           for p in params])
    gsup = [(gshared[p], grads[p]) for p in params]

    def get_norms(tensors):
        # Euclidean norm over all elements of all tensors.
        return T.sqrt(sum((t ** 2).sum() for t in tensors))

    gnorm = get_norms(grads.values())
    pnorm = get_norms(params)
    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup)

    updates = OrderedDict()
    # ``.items()``: each entry is (parameter, shared gradient buffer).
    for param, grad in gshared.items():
        vel = sharedX(param.get_value() * 0.)
        assert param.dtype == vel.dtype
        assert grad.dtype == param.dtype
        if param.name is not None:
            vel.name = 'vel_' + param.name

        scaled_lr = learning_rate * lr_scalers.get(param, 1.)
        updates[vel] = self.momentum * vel - scaled_lr * grad

        inc = updates[vel]
        if self.nesterov_momentum:
            # Look-ahead step computed from the freshly updated velocity.
            inc = self.momentum * inc - scaled_lr * grad

        assert inc.dtype == vel.dtype
        updates[param] = param + inc

    f_update = theano.function([learning_rate],
                               [],
                               updates=updates,
                               on_unused_input='ignore')
    return f_grad_shared, f_update
示例6: get_gradients
# 需要导入模块: from theano.compat.python2x import OrderedDict [as 别名]
# 或者: from theano.compat.python2x.OrderedDict import keys [as 别名]
#.........这里部分代码省略.........
L(v, q) = log P(v) + sum_h q(h) log P(h, v) - sum_h q(h) log P(v) + const
L(v, q) = sum_h q(h) log P(h, v) + const
L(v, q) = sum_h q(h) -E(h, v) - log Z + const
so the cost we want to minimize is
expected_energy + log Z + const
Note: for the RBM, this bound is exact, since the KL divergence goes to 0.
"""
variational_params = flatten(q)
# The gradients of the expected energy under q are easy, we can just do that in theano
expected_energy_q = model.expected_energy(X, q).mean()
params = list(model.get_params())
gradients = OrderedDict(safe_zip(params, T.grad(expected_energy_q, params,
consider_constant = variational_params,
disconnected_inputs = 'ignore')))
"""
d/d theta log Z = (d/d theta Z) / Z
= (d/d theta sum_h sum_v exp(-E(v,h)) ) / Z
= (sum_h sum_v - exp(-E(v,h)) d/d theta E(v,h) ) / Z
= - sum_h sum_v P(v,h) d/d theta E(v,h)
"""
layer_to_chains = model.make_layer_to_state(self.num_chains)
def recurse_check(l):
if isinstance(l, (list, tuple)):
for elem in l:
recurse_check(elem)
else:
assert l.get_value().shape[0] == self.num_chains
recurse_check(layer_to_chains.values())
model.layer_to_chains = layer_to_chains
# Note that we replace layer_to_chains with a dict mapping to the new
# state of the chains
updates, layer_to_chains = model.get_sampling_updates(layer_to_chains,
self.theano_rng, num_steps=self.num_gibbs_steps,
return_layer_to_updated = True)
if self.toronto_neg:
# Ruslan Salakhutdinov's undocumented negative phase from
# http://www.mit.edu/~rsalakhu/code_DBM/dbm_mf.m
# IG copied it here without fully understanding it, so it
# only applies to exactly the same model structure as
# in that code.
assert isinstance(model.visible_layer, dbm.BinaryVector)
assert isinstance(model.hidden_layers[0], dbm.BinaryVectorMaxPool)
assert model.hidden_layers[0].pool_size == 1
assert isinstance(model.hidden_layers[1], dbm.BinaryVectorMaxPool)
assert model.hidden_layers[1].pool_size == 1
assert isinstance(model.hidden_layers[2], dbm.Softmax)
assert len(model.hidden_layers) == 3
V_samples = layer_to_chains[model.visible_layer]
H1_samples, H2_samples, Y_samples = [layer_to_chains[layer] for layer in model.hidden_layers]
H1_mf = model.hidden_layers[0].mf_update(state_below=model.visible_layer.upward_state(V_samples),
state_above=model.hidden_layers[1].downward_state(H2_samples),
layer_above=model.hidden_layers[1])
Y_mf = model.hidden_layers[2].mf_update(state_below=model.hidden_layers[1].upward_state(H2_samples))
H2_mf = model.hidden_layers[1].mf_update(state_below=model.hidden_layers[0].upward_state(H1_mf),
state_above=model.hidden_layers[2].downward_state(Y_mf),
layer_above=model.hidden_layers[2])
expected_energy_p = model.energy(V_samples, [H1_mf, H2_mf, Y_samples]).mean()
constants = flatten([V_samples, H1_mf, H2_mf, Y_samples])
neg_phase_grads = OrderedDict(safe_zip(params, T.grad(-expected_energy_p, params, consider_constant = constants)))
else:
warnings.warn("""TODO: reduce variance of negative phase by integrating out
the even-numbered layers. The Rao-Blackwellize method can do this
for you when expected gradient = gradient of expectation, but doing
this in general is trickier.""")
#layer_to_chains = model.rao_blackwellize(layer_to_chains)
expected_energy_p = model.energy(layer_to_chains[model.visible_layer],
[layer_to_chains[layer] for layer in model.hidden_layers]).mean()
samples = flatten(layer_to_chains.values())
for i, sample in enumerate(samples):
if sample.name is None:
sample.name = 'sample_'+str(i)
neg_phase_grads = OrderedDict(safe_zip(params, T.grad(-expected_energy_p, params, consider_constant
= samples, disconnected_inputs='ignore')))
for param in list(gradients.keys()):
gradients[param] = neg_phase_grads[param] + gradients[param]
return gradients, updates
示例7: Optimizer
# 需要导入模块: from theano.compat.python2x import OrderedDict [as 别名]
# 或者: from theano.compat.python2x.OrderedDict import keys [as 别名]
#.........这里部分代码省略.........
if not self.model:
log.error("No self.model for the Optimizer!")
raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
"was called from the Model. Try initializing the Optimizer with the model param "
"and calling optimizer.train().")
#########################
# gradients and updates #
#########################
# grab the model parameters to use during training
self.params = self.model.get_params()
# Now create the training cost function for the model to use while training - update parameters
# gradient!
gradients = grad(cost=self.loss_expression, wrt=list(self.params.values()))
# now create the dictionary mapping the parameter with its gradient
gradients = OrderedDict(
[(param, g) for param, g in zip(list(self.params.values()), gradients)]
)
# clip gradients if we want.
gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)
# Calculate the optimizer updates each run
# This is where the magic happens for a lot of sub-implementations of SGD!
# It tells how to update the params each training epoch
gradient_updates = self.get_updates(gradients)
# Combine the updates from the model also if applicable
updates = self.model.get_updates()
if updates:
updates.update(gradient_updates)
else:
updates = gradient_updates
log.info("%s params: %s", self.model._classname, str(list(self.params.keys())))
############
# monitors #
############
# deal with the monitor channels if they were given (or take them from the plot)
if monitor_channels is None and plot is not None and len(plot.channels) > 0:
monitor_channels = plot.channels
self.train_monitors_dict = {}
self.valid_monitors_dict = {}
self.test_monitors_dict = {}
self.train_monitors_outservice_dict = {}
self.valid_monitors_outservice_dict = {}
self.test_monitors_outservice_dict = {}
if monitor_channels:
# collapse the appropriate monitors into their (name, expression, out_service) tuples
train_collapsed = collapse_channels(monitor_channels, train=True)
valid_collapsed = collapse_channels(monitor_channels, valid=True)
test_collapsed = collapse_channels(monitor_channels, test=True)
# get name: expression dictionary
self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
self.test_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
# get name: outservice dictionary
self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
self.test_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in test_collapsed])
# finally deal with an outservice provided to monitor training cost
self.train_outservice = train_outservice
# remove redundant files made by the fileservice for the train monitor.
# TODO: THIS FEELS LIKE A HACK. I don't like it.
if isinstance(self.train_outservice, FileService):
os.remove(self.train_outservice.valid_filename)
示例8: __init__
# 需要导入模块: from theano.compat.python2x import OrderedDict [as 别名]
# 或者: from theano.compat.python2x.OrderedDict import keys [as 别名]
#.........这里部分代码省略.........
cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
self.param_to_cache[param] = sharedX(param.get_value(borrow=False), name=cache_name)
cache_updates[self.param_to_cache[param]] = param
cached = self.param_to_cache[param]
g = self.param_to_grad_shared[param]
if lr_scalers is not None and param in lr_scalers:
scaled_alpha = alpha * lr_scalers[param]
else:
scaled_alpha = alpha
mul = scaled_alpha * g
diff = cached - mul
goto_updates[param] = diff
self._cache_values = function([], updates = cache_updates, mode=self.theano_function_mode, name='BatchGradientDescent._cache_values')
assert isinstance(param_constrainers, (list, tuple))
for param_constrainer in param_constrainers:
param_constrainer(goto_updates)
self._goto_alpha = function([alpha], updates=goto_updates,
mode=self.theano_function_mode, name='BatchGradientDescent._goto_alpha')
norm = T.sqrt(sum([T.sqr(elem).sum() for elem in self.param_to_grad_shared.values()]))
norm.name = 'BatchGradientDescent.norm'
normalize_grad_updates = OrderedDict()
for grad_shared in self.param_to_grad_shared.values():
normalize_grad_updates[grad_shared] = grad_shared / norm
# useful for monitoring
self.ave_grad_size = sharedX(0.)
self.new_weight = sharedX(1.)
normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size
self._normalize_grad = function([], norm, updates=normalize_grad_updates, mode=self.theano_function_mode,
name='BatchGradientDescent._normalize_grad')
if self.conjugate:
grad_shared = self.param_to_grad_shared.values()
grad_to_old_grad = OrderedDict()
for elem in grad_shared:
grad_to_old_grad[elem] = sharedX(elem.get_value(), 'old_'+elem.name)
self._store_old_grad = function([norm], updates = OrderedDict([(grad_to_old_grad[g], g * norm)
for g in grad_to_old_grad]), mode=self.theano_function_mode,
name='BatchGradientDescent._store_old_grad')
grad_ordered = list(grad_to_old_grad.keys())
old_grad_ordered = [ grad_to_old_grad[g] for g in grad_ordered]
def dot_product(x, y):
return sum([ (x_elem * y_elem).sum() for x_elem, y_elem in safe_zip(x, y) ])
beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
(1e-7+dot_product(old_grad_ordered, old_grad_ordered))
assert beta_pr.ndim == 0
beta = T.maximum(beta_pr, 0.)
"""
beta_pr is the Polak-Ribiere formula for beta.
According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste"
but max(0, beta_pr) is "a popular choice... which provides direction reset automatically."
(ie, it is meant to revert to steepest descent when you have traveled far enough that
the objective function is behaving non-quadratically enough that the conjugate gradient
formulas aren't working anymore)
http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method
"""
assert grad not in grad_to_old_grad
make_conjugate_updates = [(g, g + beta * grad_to_old_grad[g]) for g in grad_ordered]
mode = self.theano_function_mode
if mode is not None and hasattr(mode, 'record'):
for v, u in make_conjugate_updates:
mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \
+ var_descriptor(v) + '\n')
mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \
+ var_descriptor(u) + '\n')
self._make_conjugate = function([], updates=make_conjugate_updates,
mode=self.theano_function_mode, name='BatchGradientDescent._make_conjugate')
if mode is not None and hasattr(mode, 'record'):
for output in self._make_conjugate.maker.fgraph.outputs:
mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \
+ var_descriptor(output) + '\n')
if tol is None:
if objective.dtype == "float32":
self.tol = 1e-6
else:
self.tol = 3e-7
else:
self.tol = tol
self.ave_step_size = sharedX(0.)
self.ave_grad_mult = sharedX(0.)
示例9: Monitor
# 需要导入模块: from theano.compat.python2x import OrderedDict [as 别名]
# 或者: from theano.compat.python2x.OrderedDict import keys [as 别名]
#.........这里部分代码省略.........
batch_size=b,
num_batches=n,
topo=self.topo,
targets=self.require_label,
rng=sd)
actual_ne = 0
for X in myiterator:
if self.require_label:
X, y = X
self.run_prereqs(X,y,d)
a(X, y)
else:
self.run_prereqs(X, None, d)
a(X)
if X.ndim == 2:
actual_batch_size = X.shape[0]
else:
actual_batch_size = X.shape[d.get_topo_batch_axis()]
actual_ne += actual_batch_size
# end for X
if actual_ne != ne:
raise RuntimeError("At compile time, your iterator said it had "
+ str(ne) + " examples total, but at runtime it gave us "
+ str(actual_ne) + ".")
# end for d
log.info("Monitoring step:")
log.info("\tEpochs seen: %d" % self._epochs_seen)
log.info("\tBatches seen: %d" % self._num_batches_seen)
log.info("\tExamples seen: %d" % self._examples_seen)
t = time.time() - self.t0
for channel_name in sorted(self.channels.keys(), key=number_aware_alphabetical_key):
channel = self.channels[channel_name]
channel.time_record.append(t)
channel.batch_record.append(self._num_batches_seen)
channel.example_record.append(self._examples_seen)
channel.epoch_record.append(self._epochs_seen)
val = channel.val_shared.get_value()
channel.val_record.append(val)
# TODO: use logging infrastructure so that user can configure
# formatting
if abs(val) < 1e4:
val_str = str(val)
else:
val_str = '%.3e' % val
log.info("\t%s: %s" % (channel_name, val_str))
def run_prereqs(self, X, y, dataset):
    """Call every prerequisite registered for `dataset` with `(X, y)`.

    Does nothing when `dataset` has no entry in `self.prereqs`.
    """
    if dataset in self.prereqs:
        for hook in self.prereqs[dataset]:
            hook(X, y)
def get_batches_seen(self):
    """Return the number of batches the model has learned on.

    The count is only meaningful if the learning code has been calling
    Monitor.report_batch correctly.
    """
    batches = self._num_batches_seen
    return batches
def get_epochs_seen(self):
    """Return the stored epoch counter (`self._epochs_seen`)."""
    epochs = self._epochs_seen
    return epochs
def get_examples_seen(self):
示例10: __init__
# 需要导入模块: from theano.compat.python2x import OrderedDict [as 别名]
# 或者: from theano.compat.python2x.OrderedDict import keys [as 别名]
#.........这里部分代码省略.........
if param.name is None:
param_name = 'anon_param'
else:
param_name = param.name
cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
self.param_to_cache[param] = sharedX(param.get_value(borrow=False), name=cache_name)
cache_updates[self.param_to_cache[param]] = param
cached = self.param_to_cache[param]
g = self.param_to_grad_shared[param]
if lr_scalers is not None and param in lr_scalers:
scaled_alpha = alpha * lr_scalers[param]
else:
scaled_alpha = alpha
mul = scaled_alpha * g
diff = cached - mul
goto_updates[param] = diff
self._cache_values = function([], updates = cache_updates, mode=self.theano_function_mode, name='BatchGradientDescent._cache_values')
assert isinstance(param_constrainers, (list, tuple))
for param_constrainer in param_constrainers:
param_constrainer(goto_updates)
self._goto_alpha = function([alpha], updates=goto_updates,
mode=self.theano_function_mode, name='BatchGradientDescent._goto_alpha')
norm = T.sqrt(sum([T.sqr(elem).sum() for elem in self.param_to_grad_shared.values()]))
norm.name = 'BatchGradientDescent.norm'
normalize_grad_updates = OrderedDict()
for grad_shared in self.param_to_grad_shared.values():
normalize_grad_updates[grad_shared] = grad_shared / norm
# useful for monitoring
self.ave_grad_size = sharedX(0.)
self.new_weight = sharedX(1.)
normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size
self._normalize_grad = function([], norm, updates=normalize_grad_updates, mode=self.theano_function_mode,
name='BatchGradientDescent._normalize_grad')
if self.conjugate:
grad_shared = self.param_to_grad_shared.values()
grad_to_old_grad = OrderedDict()
for elem in grad_shared:
grad_to_old_grad[elem] = sharedX(elem.get_value(), 'old_'+elem.name)
self._store_old_grad = function([norm], updates = OrderedDict([(grad_to_old_grad[g_], g_ * norm)
for g_ in grad_to_old_grad]), mode=self.theano_function_mode,
name='BatchGradientDescent._store_old_grad')
grad_ordered = list(grad_to_old_grad.keys())
old_grad_ordered = [grad_to_old_grad[g_] for g_ in grad_ordered]
def dot_product(x, y):
return sum([ (x_elem * y_elem).sum() for x_elem, y_elem in safe_zip(x, y) ])
beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
(1e-7+dot_product(old_grad_ordered, old_grad_ordered))
assert beta_pr.ndim == 0
beta = T.maximum(beta_pr, 0.)
#beta_pr is the Polak-Ribiere formula for beta.
#According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste"
#but max(0, beta_pr) is "a popular choice... which provides direction reset automatically."
#(ie, it is meant to revert to steepest descent when you have traveled far enough that
#the objective function is behaving non-quadratically enough that the conjugate gradient
#formulas aren't working anymore)
#http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method
assert grad not in grad_to_old_grad
make_conjugate_updates = [(g_, g_ + beta * grad_to_old_grad[g_]) for g_ in grad_ordered]
mode = self.theano_function_mode
if mode is not None and hasattr(mode, 'record'):
for v, u in make_conjugate_updates:
mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \
+ var_descriptor(v) + '\n')
mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \
+ var_descriptor(u) + '\n')
self._make_conjugate = function([], updates=make_conjugate_updates,
mode=self.theano_function_mode, name='BatchGradientDescent._make_conjugate')
if mode is not None and hasattr(mode, 'record'):
for output in self._make_conjugate.maker.fgraph.outputs:
mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \
+ var_descriptor(output) + '\n')
if tol is None:
if objective.dtype == "float32":
self.tol = 1e-6
else:
self.tol = 3e-7
else:
self.tol = tol
self.ave_step_size = sharedX(0.)
self.ave_grad_mult = sharedX(0.)
示例11: Monitor
# 需要导入模块: from theano.compat.python2x import OrderedDict [as 别名]
# 或者: from theano.compat.python2x.OrderedDict import keys [as 别名]
#.........这里部分代码省略.........
data_specs=self._flat_data_specs,
return_tuple=True,
rng=sd)
# If self._flat_data_specs is empty, no channel needs data,
# so we do not need to call the iterator in order to average
# the monitored values across different batches, we only
# have to call them once.
if len(self._flat_data_specs[1]) == 0:
X = ()
self.run_prereqs(X, d)
a(*X)
else:
actual_ne = 0
for X in myiterator:
# X is a flat (not nested) tuple
self.run_prereqs(X, d)
a(*X)
actual_ne += self._flat_data_specs[0].np_batch_size(X)
# end for X
if actual_ne != ne:
raise RuntimeError("At compile time, your iterator said "
"it had %d examples total, but at "
"runtime it gave us %d." %
(ne, actual_ne))
# end for d
log.info("Monitoring step:")
log.info("\tEpochs seen: %d" % self._epochs_seen)
log.info("\tBatches seen: %d" % self._num_batches_seen)
log.info("\tExamples seen: %d" % self._examples_seen)
t = time.time() - self.t0
for channel_name in sorted(self.channels.keys(),
key=number_aware_alphabetical_key):
channel = self.channels[channel_name]
channel.time_record.append(t)
channel.batch_record.append(self._num_batches_seen)
channel.example_record.append(self._examples_seen)
channel.epoch_record.append(self._epochs_seen)
val = channel.val_shared.get_value()
channel.val_record.append(val)
# TODO: use logging infrastructure so that user can configure
# formatting
if abs(val) < 1e4:
val_str = str(val)
else:
val_str = '%.3e' % val
log.info("\t%s: %s" % (channel_name, val_str))
def run_prereqs(self, data, dataset):
"""
Runs all "prerequistie functions" on a batch of data. Always
called right before computing the monitoring channels on that
batch.
Parameters
----------
data : tuple or Variable
a member of the Space used as input to the monitoring
functions
dataset : Dataset
the Dataset the data was drawn from
"""
if dataset not in self.prereqs:
示例12: Optimizer
# 需要导入模块: from theano.compat.python2x import OrderedDict [as 别名]
# 或者: from theano.compat.python2x.OrderedDict import keys [as 别名]
#.........这里部分代码省略.........
# gradient!
gradients, _ = self.model.get_gradient(cost=train_cost)
# Calculate the optimizer updates each run
# This is where the magic happens for a lot of sub-implementations of SGD, including AdaDelta!
# It tells how to update the params each training epoch
gradient_updates = self.get_updates(gradients)
# Combine the updates from the model also if applicable
train_updates = self.model.get_updates()
if train_updates:
train_updates.update(gradient_updates)
else:
train_updates = gradient_updates
# Compile the training function!
log.info('Compiling f_learn %d/%d function for model %s...', i + 1, len(train_costs),
str(type(self.model)))
t = time.time()
f_learn = function(inputs=[data_idx, data_end_idx],
updates=train_updates,
outputs=train_cost,
givens=train_givens,
name='f_learn_%d' % i)
log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))
self.train_functions.append(f_learn)
# grab the expression(s) to use to monitor different model values during training
log.debug("Compiling monitor functions...")
monitor_t = time.time()
self.monitors = OrderedDict(self.model.get_monitors())
self.monitor_names = self.monitors.keys()
if len(self.monitors.keys()) > 0:
self.train_monitor_function = function(
inputs=[data_idx, data_end_idx],
updates=self.model.get_updates(),
outputs=self.monitors.values(),
givens=train_givens,
name="train_monitor_function"
)
if len(self.monitors.keys()) > 0:
self.valid_monitor_function = function(
inputs=[data_idx, data_end_idx],
updates=self.model.get_updates(),
outputs=self.monitors.values(),
givens=valid_givens,
name="valid_monitor_function"
)
if len(self.monitors.keys()) > 0:
self.test_monitor_function = function(
inputs=[data_idx, data_end_idx],
updates=self.model.get_updates(),
outputs=self.monitors.values(),
givens=test_givens,
name="test_monitor_function"
)
log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))
self.noise_switches = raise_to_list(self.model.get_noise_switch())
##################
# start training #
##################
# make sure to deal with a list of train_cost functions - for layer-wise pretraining!
示例13: Optimizer
# 需要导入模块: from theano.compat.python2x import OrderedDict [as 别名]
# 或者: from theano.compat.python2x.OrderedDict import keys [as 别名]
#.........这里部分代码省略.........
log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
self.STOP = True
# save params
if self.best_params is not None:
log.debug("Restoring best model parameters...")
set_shared_values(self.params, self.best_params)
log.debug("Saving model parameters...")
self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')
log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))
log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
str(type(self.model)), make_time_units_string(time.time() - start_time))
def _perform_one_epoch(self, f_learn, plot=None):
"""
Performs a single training iteration with the given learn function.
"""
self.epoch_counter += 1
t = time.time()
log.info('EPOCH %s', str(self.epoch_counter))
# set the noise switches on for training function! (this is where things like dropout happen)
switch_vals = []
if len(self.noise_switches) > 0 and (self.valid_flag or self.test_flag or self.epoch_counter == 1):
log.debug("Turning on %s noise switches", str(len(self.noise_switches)))
switch_vals = [switch.get_value() for switch in self.noise_switches]
[switch.set_value(1.) for switch in self.noise_switches]
# train
train_costs = []
train_monitors = {key: [] for key in self.train_monitors_dict.keys()}
for batch_start, batch_end in self.train_batches:
_outs = raise_to_list(f_learn(batch_start, batch_end))
train_costs.append(_outs[0])
# handle any user defined monitors
if len(train_monitors) > 0:
current_monitors = zip(self.train_monitors_dict.keys(), _outs[1:])
for name, val in current_monitors:
train_monitors[name].append(val)
# get the mean values for the batches
mean_train = numpy.mean(train_costs, 0)
current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in train_monitors.items()}
# log the mean values!
log.info('Train cost: %s', trunc(mean_train))
if len(current_mean_monitors) > 0:
log.info('Train monitors: %s', str(current_mean_monitors))
# send the values to their outservices
if self.train_outservice:
self.train_outservice.write(mean_train, TRAIN)
for name, service in self.train_monitors_outservice_dict.items():
if name in current_mean_monitors and service:
service.write(current_mean_monitors[name], TRAIN)
# if there is a plot, also send them over!
if plot:
current_mean_monitors.update({TRAIN_COST_KEY: mean_train})
plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)
# set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
if len(self.noise_switches) > 0 and (self.valid_flag or self.test_flag):
log.debug("Turning off %s noise switches", str(len(self.noise_switches)))
[switch.set_value(0.) for switch in self.noise_switches]