This page collects typical usage examples of the Python class blocks.algorithms.GradientDescent: what the class is for, how it is called, and how it fits into a full training setup. Fifteen code examples of the GradientDescent class are shown below, ordered by popularity by default.
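Before the examples, here is a minimal sketch of the pattern nearly all of them share: build a Theano cost expression, hand it to GradientDescent together with the parameters to train and a step rule, then either drive it through a MainLoop or call process_batch directly. The snippet is illustrative only (the toy quadratic cost and variable names are not taken from any example on this page) and assumes a working Blocks/Theano installation; note that some examples below use the older params keyword where newer Blocks versions use parameters.

import numpy
from theano import tensor
from blocks.algorithms import GradientDescent, Scale
from blocks.utils import shared_floatx

# A toy quadratic cost over a single shared parameter matrix.
W = shared_floatx(numpy.array([[1., 2.], [3., 4.]]))
cost = tensor.sum(W ** 2)

# GradientDescent compiles a Theano function that applies one step of the
# chosen step rule to the parameters each time a batch is processed.
algorithm = GradientDescent(cost=cost, parameters=[W],
                            step_rule=Scale(learning_rate=0.1))
algorithm.initialize()
algorithm.process_batch(dict())  # this toy cost needs no input data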
Example 1: run
def run(discriminative_regularization=True):
    streams = create_celeba_streams(training_batch_size=100,
                                    monitoring_batch_size=500,
                                    include_targets=False)
    main_loop_stream, train_monitor_stream, valid_monitor_stream = streams[:3]

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    rval = create_training_computation_graphs(discriminative_regularization)
    cg, bn_cg, variance_parameters = rval
    pop_updates = list(
        set(get_batch_normalization_updates(bn_cg, allow_duplicates=True)))
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]

    model = Model(bn_cg.outputs[0])
    selector = Selector(
        find_bricks(
            model.top_bricks,
            lambda brick: brick.name in ('encoder_convnet', 'encoder_mlp',
                                         'decoder_convnet', 'decoder_mlp')))
    parameters = list(selector.get_parameters().values()) + variance_parameters

    # Prepare algorithm
    step_rule = Adam()
    algorithm = GradientDescent(cost=bn_cg.outputs[0],
                                parameters=parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring
    monitored_quantities_list = []
    for graph in [bn_cg, cg]:
        cost, kl_term, reconstruction_term = graph.outputs
        cost.name = 'nll_upper_bound'
        avg_kl_term = kl_term.mean(axis=0)
        avg_kl_term.name = 'avg_kl_term'
        avg_reconstruction_term = -reconstruction_term.mean(axis=0)
        avg_reconstruction_term.name = 'avg_reconstruction_term'
        monitored_quantities_list.append(
            [cost, avg_kl_term, avg_reconstruction_term])
    train_monitoring = DataStreamMonitoring(
        monitored_quantities_list[0], train_monitor_stream, prefix="train",
        updates=extra_updates, after_epoch=False, before_first_epoch=False,
        every_n_epochs=5)
    valid_monitoring = DataStreamMonitoring(
        monitored_quantities_list[1], valid_monitor_stream, prefix="valid",
        after_epoch=False, before_first_epoch=False, every_n_epochs=5)

    # Prepare checkpoint
    save_path = 'celeba_vae_{}regularization.zip'.format(
        '' if discriminative_regularization else 'no_')
    checkpoint = Checkpoint(save_path, every_n_epochs=5, use_cpickle=True)

    extensions = [Timing(), FinishAfter(after_n_epochs=75), train_monitoring,
                  valid_monitoring, checkpoint, Printing(), ProgressBar()]
    main_loop = MainLoop(data_stream=main_loop_stream,
                         algorithm=algorithm, extensions=extensions)
    main_loop.run()
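A pattern worth isolating from Example 1 (it reappears in Example 8) is attaching updates that are not gradient steps, here an exponential moving average of the batch-normalization population statistics, to the same compiled training function via algorithm.add_updates. The stripped-down sketch below uses made-up shared variables in place of the pairs returned by get_batch_normalization_updates, so it only illustrates the mechanism, not the original example.

import numpy
from theano import tensor
from blocks.algorithms import GradientDescent
from blocks.utils import shared_floatx

# Hypothetical stand-ins: a population statistic kept between batches and
# the corresponding minibatch statistic (normally a graph variable).
pop_mean = shared_floatx(numpy.zeros(4))
batch_mean = shared_floatx(numpy.ones(4))

# Any cost/parameter pair will do for the illustration.
W = shared_floatx(numpy.ones((4, 4)))
algorithm = GradientDescent(cost=tensor.sum(W ** 2), parameters=[W])

decay_rate = 0.05
# Same blend as in the example: nudge the population statistic towards the
# minibatch statistic every time a batch is processed.
extra_updates = [(pop_mean,
                  batch_mean * decay_rate + pop_mean * (1 - decay_rate))]
algorithm.add_updates(extra_updates)

algorithm.initialize()
algorithm.process_batch(dict())  # applies the gradient step and the EMA

Folding the statistic updates into the training function keeps them in step with the parameters without a second compiled function call per batch.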
Example 2: train_rnnrbm
def train_rnnrbm(train, rnnrbm, epochs=1000, test=None, bokeh=True,
                 load_path=None):
    cdk = theano.shared(10)
    lr = theano.shared(float32(0.004))

    cost, v_sample = rnnrbm.cost(examples=x, mask=x_mask, k=cdk)
    error_rate = MismulitclassificationRate().apply(x, v_sample[-1], x_mask)
    error_rate.name = "error on note as a whole"
    mistake_rate = MismulitmistakeRate().apply(x, v_sample[-1], x_mask)
    mistake_rate.name = "single error within note"
    cost.name = 'rbm_cost'

    model = Model(cost)
    cg = ComputationGraph([cost])
    step_rule = CompositeRule(
        [RemoveNotFinite(), StepClipping(30.0), Adam(learning_rate=lr),
         StepClipping(6.0), RemoveNotFinite()])  # Scale(0.01)
    gradients = dict(equizip(cg.parameters,
                             T.grad(cost, cg.parameters,
                                    consider_constant=[v_sample])))
    algorithm = GradientDescent(step_rule=step_rule, gradients=gradients,
                                cost=cost, params=cg.parameters)
    algorithm.add_updates(cg.updates)

    extensions = [
        SharedVariableModifier(parameter=cdk,
                               function=lambda n, v: rnnrbm_cdk[n]
                               if rnnrbm_cdk.get(n) else v),
        SharedVariableModifier(parameter=lr,
                               function=lambda n, v: float32(0.78 * v)
                               if n % (200 * 5) == 0 else v),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            [cost, error_rate, mistake_rate],
            # hidden_states, debug_val, param_nans,
            # aggregation.mean(algorithm.total_gradient_norm)], # + params,
            prefix="train",
            after_epoch=False, every_n_batches=40),
        Timing(),
        Printing(),
        ProgressBar()]
    if test is not None:
        extensions.append(DataStreamMonitoring(
            [cost, error_rate, mistake_rate],
            data_stream=test,
            updates=cg.updates,
            prefix="test", after_epoch=False, every_n_batches=40))
    if bokeh:
        extensions.append(Plot(
            'Training RNN-RBM',
            channels=[
                ['train_error on note as a whole',
                 'train_single error within note',
                 'test_error on note as a whole',
                 'test_single error within note'],
                ['train_final_cost'],
                # ['train_total_gradient_norm'],
            ]))

    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train,
                         model=model,
                         extensions=extensions)
    return main_loop
Example 3: test_gradient_descent_finds_inputs_additional_updates
def test_gradient_descent_finds_inputs_additional_updates():
    W = shared_floatx(numpy.array([[1, 2], [3, 4]]))
    n = shared_floatx(1)
    m = tensor.scalar('m')
    algorithm = GradientDescent(gradients=OrderedDict([(W, W + 1)]))
    algorithm.add_updates([(n, n + m)])
    algorithm.initialize()
    assert m in algorithm.inputs
Example 4: test_gradient_descent
def test_gradient_descent():
    W = shared_floatx(numpy.array([[1, 2], [3, 4]]))
    W_start_value = W.get_value()
    cost = tensor.sum(W ** 2)
    algorithm = GradientDescent(cost=cost, parameters=[W])
    algorithm.step_rule.learning_rate.set_value(0.75)
    algorithm.initialize()
    algorithm.process_batch(dict())
    assert_allclose(W.get_value(), -0.5 * W_start_value)
Example 5: create_main_loop
def create_main_loop(dataset, nvis, nhid, num_epochs, debug_level=0,
                     lrate=1e-3):
    seed = 188229
    n_inference_steps = 6
    num_examples = dataset.num_examples
    batch_size = num_examples

    train_loop_stream = Flatten(
        DataStream.default_stream(
            dataset=dataset,
            iteration_scheme=SequentialScheme(dataset.num_examples,
                                              batch_size)
            # Repeat(..., n_inference_steps)
            # ShuffledScheme(dataset.num_examples, batch_size),
            #     n_inference_steps))
        ),
        which_sources=("features",),
    )

    model_brick = FivEM(
        nvis=nvis,
        nhid=nhid,
        epsilon=0.01,
        batch_size=batch_size,
        weights_init=IsotropicGaussian(0.1),
        biases_init=Constant(0),
        noise_scaling=1,
        debug=debug_level,
        lateral_x=False,
        lateral_h=False,
        n_inference_steps=n_inference_steps,
    )
    model_brick.initialize()

    x = tensor.matrix("features")
    cost = model_brick.cost(x)
    computation_graph = ComputationGraph([cost])
    model = Model(cost)

    # step_rule = Adam(learning_rate=2e-5, beta1=0.1, beta2=0.001,
    #                  epsilon=1e-8, decay_factor=(1 - 1e-8))
    step_rule = Momentum(learning_rate=lrate, momentum=0.95)
    # step_rule = AdaDelta()
    # step_rule = RMSProp(learning_rate=0.01)
    # step_rule = AdaGrad(learning_rate=1e-4)
    algorithm = GradientDescent(cost=cost,
                                params=computation_graph.parameters,
                                step_rule=step_rule)
    algorithm.add_updates(computation_graph.updates)

    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        TrainingDataMonitoring(
            [cost] + computation_graph.auxiliary_variables,
            after_batch=False, after_epoch=True),  # every_n_epochs=1),
        Printing(after_epoch=True, after_batch=False),  # every_n_epochs=1,
        # Checkpoint(path="./fivem.zip", every_n_epochs=10,
        #            after_training=True)
    ]
    main_loop = MainLoop(model=model, data_stream=train_loop_stream,
                         algorithm=algorithm, extensions=extensions)
    return main_loop
Example 6: _test
def _test(f):
    W = shared_floatx(numpy.array([[1, 2], [3, 4]]))
    W_start_value = W.get_value()
    cost = tensor.sum(W ** 2)
    gradients = OrderedDict()
    gradients[W] = tensor.grad(cost, W)

    algorithm = GradientDescent(gradients=f(gradients))
    algorithm.step_rule.learning_rate.set_value(0.75)
    algorithm.initialize()
    algorithm.process_batch(dict())
    assert_allclose(W.get_value(), -0.5 * W_start_value)
Example 7: test_theano_profile_for_sgd_function
def test_theano_profile_for_sgd_function():
    W = shared_floatx(numpy.array([[1, 2], [3, 4]]))
    W_start_value = W.get_value()
    cost = tensor.sum(W ** 2)
    algorithm = GradientDescent(
        cost=cost, parameters=[W], theano_func_kwargs={'profile': True})
    algorithm.step_rule.learning_rate.set_value(0.75)
    algorithm.initialize()
    algorithm.process_batch(dict())
    assert_allclose(W.get_value(), -0.5 * W_start_value)
    assert isinstance(algorithm._function.profile, ProfileStats)
Example 8: run
def run():
    streams = create_celeba_streams(training_batch_size=100,
                                    monitoring_batch_size=500,
                                    include_targets=True)
    main_loop_stream = streams[0]
    train_monitor_stream = streams[1]
    valid_monitor_stream = streams[2]

    cg, bn_dropout_cg = create_training_computation_graphs()

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    pop_updates = get_batch_normalization_updates(bn_dropout_cg)
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]

    # Prepare algorithm
    step_rule = Adam()
    algorithm = GradientDescent(cost=bn_dropout_cg.outputs[0],
                                parameters=bn_dropout_cg.parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring
    cost = bn_dropout_cg.outputs[0]
    cost.name = 'cost'
    train_monitoring = DataStreamMonitoring(
        [cost], train_monitor_stream, prefix="train",
        before_first_epoch=False, after_epoch=False, after_training=True,
        updates=extra_updates)

    cost, accuracy = cg.outputs
    cost.name = 'cost'
    accuracy.name = 'accuracy'
    monitored_quantities = [cost, accuracy]
    valid_monitoring = DataStreamMonitoring(
        monitored_quantities, valid_monitor_stream, prefix="valid",
        before_first_epoch=False, after_epoch=False, every_n_epochs=5)

    # Prepare checkpoint
    checkpoint = Checkpoint(
        'celeba_classifier.zip', every_n_epochs=5, use_cpickle=True)

    extensions = [Timing(), FinishAfter(after_n_epochs=50), train_monitoring,
                  valid_monitoring, checkpoint, Printing(), ProgressBar()]
    main_loop = MainLoop(data_stream=main_loop_stream, algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
Example 9: test_gradient_descent_spurious_sources
def test_gradient_descent_spurious_sources():
    W = shared_floatx(numpy.array([[1, 2], [3, 4]]))
    W_start_value = W.get_value()
    cost = tensor.sum(W ** 2)

    algorithm = GradientDescent(cost=cost, parameters=[W])
    algorithm.step_rule.learning_rate.set_value(0.75)
    algorithm.initialize()
    with assert_raises(ValueError):
        algorithm.process_batch(dict(example_id='test'))

    algorithm = GradientDescent(cost=cost, parameters=[W],
                                on_unused_sources='ignore')
    algorithm.step_rule.learning_rate.set_value(0.75)
    algorithm.initialize()
    algorithm.process_batch(dict(example_id='test'))
    assert_allclose(W.get_value(), -0.5 * W_start_value)
Example 10: train_model
def train_model(cost, unregularized_cost, updates,
                train_stream, valid_stream, args, gate_values=None):

    step_rule = learning_algorithm(args)
    cg = ComputationGraph(cost)

    # ADD REGULARIZATION
    # WEIGHT NOISE
    weight_noise = args.weight_noise
    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        cg_train = apply_noise(cg, weights, weight_noise)
        cost = cg_train.outputs[0]
        cost.name = "cost_with_weight_noise"
        cg = ComputationGraph(cost)

    logger.info(cg.parameters)

    # Define algorithm
    algorithm = GradientDescent(cost=cost, step_rule=step_rule,
                                parameters=cg.parameters)
    # Add the updates to carry the hidden state
    algorithm.add_updates(updates)

    # Extensions to be added
    extensions = []

    # Load from a dumped model
    if args.load_path is not None:
        extensions.append(Load(args.load_path))

    # Generation extension
    if args.generate:
        extensions.append(TextGenerationExtension(
            cost=cost,
            generation_length=args.generated_text_lenght,
            initial_text_length=args.initial_text_length,
            every_n_batches=1,
            ploting_path=os.path.join(args.save_path, 'prob_plot.png'),
            softmax_sampling=args.softmax_sampling,
            dataset=args.dataset,
            updates=updates,
            interactive_mode=args.interactive_mode))

    # Training and Validation score monitoring
    extensions.extend([
        TrainingDataMonitoring([cost], prefix='train',
                               every_n_batches=args.monitoring_freq),
        DataStreamMonitoring([cost, unregularized_cost],
                             valid_stream, args.mini_batch_size_valid,
                             args.dataset,
                             state_updates=updates,
                             prefix='valid',
                             before_first_epoch=(args.visualize == "nothing"),
                             every_n_batches=args.monitoring_freq)])

    # Creating directory for saving model.
    if not args.interactive_mode:
        if not os.path.exists(args.save_path):
            os.makedirs(args.save_path)
        elif 'test' in args.save_path:
            print "Rewriting in " + args.save_path
        else:
            raise Exception('Directory already exists')

    # Early stopping
    extensions.append(EarlyStopping('valid_' + unregularized_cost.name,
                                    args.patience, args.save_path,
                                    every_n_batches=args.monitoring_freq))

    # Printing
    extensions.append(ProgressBar())
    extensions.append(Printing(every_n_batches=args.monitoring_freq))

    # Reset the initial states
    if args.dataset == "sine":
        reset_frequency = 1
    else:
        reset_frequency = 100
    extensions.append(ResetStates([v for v, _ in updates],
                                  every_n_batches=reset_frequency))

    # Visualizing extensions
    if args.interactive_mode:
        extensions.append(InteractiveMode())

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions)
    main_loop.run()
Example 11: main
def main(nvis, nhid, encoding_lstm_dim, decoding_lstm_dim, T=1):
    x = tensor.matrix('features')

    # Construct and initialize model
    encoding_mlp = MLP([Tanh()], [None, None])
    decoding_mlp = MLP([Tanh()], [None, None])
    encoding_lstm = LSTM(dim=encoding_lstm_dim)
    decoding_lstm = LSTM(dim=decoding_lstm_dim)
    draw = DRAW(nvis=nvis, nhid=nhid, T=T, encoding_mlp=encoding_mlp,
                decoding_mlp=decoding_mlp, encoding_lstm=encoding_lstm,
                decoding_lstm=decoding_lstm, biases_init=Constant(0),
                weights_init=Orthogonal())
    draw.push_initialization_config()
    encoding_lstm.weights_init = IsotropicGaussian(std=0.001)
    decoding_lstm.weights_init = IsotropicGaussian(std=0.001)
    draw.initialize()

    # Compute cost
    cost = -draw.log_likelihood_lower_bound(x).mean()
    cost.name = 'nll_upper_bound'
    model = Model(cost)

    # Datasets and data streams
    mnist_train = BinarizedMNIST('train')
    train_loop_stream = ForceFloatX(DataStream(
        dataset=mnist_train,
        iteration_scheme=SequentialScheme(mnist_train.num_examples, 100)))
    train_monitor_stream = ForceFloatX(DataStream(
        dataset=mnist_train,
        iteration_scheme=SequentialScheme(mnist_train.num_examples, 500)))
    mnist_valid = BinarizedMNIST('valid')
    valid_monitor_stream = ForceFloatX(DataStream(
        dataset=mnist_valid,
        iteration_scheme=SequentialScheme(mnist_valid.num_examples, 500)))
    mnist_test = BinarizedMNIST('test')
    test_monitor_stream = ForceFloatX(DataStream(
        dataset=mnist_test,
        iteration_scheme=SequentialScheme(mnist_test.num_examples, 500)))

    # Get parameters and monitoring channels
    computation_graph = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(computation_graph.variables)
    monitoring_channels = dict([
        ('avg_' + channel.tag.name, channel.mean()) for channel in
        VariableFilter(name='.*term$')(computation_graph.auxiliary_variables)])
    for name, channel in monitoring_channels.items():
        channel.name = name
    monitored_quantities = monitoring_channels.values() + [cost]

    # Training loop
    step_rule = RMSProp(learning_rate=1e-3, decay_rate=0.95)
    algorithm = GradientDescent(cost=cost, params=params, step_rule=step_rule)
    algorithm.add_updates(computation_graph.updates)
    main_loop = MainLoop(
        model=model, data_stream=train_loop_stream, algorithm=algorithm,
        extensions=[
            Timing(),
            SerializeMainLoop('vae.pkl', save_separately=['model']),
            FinishAfter(after_n_epochs=200),
            DataStreamMonitoring(
                monitored_quantities, train_monitor_stream, prefix="train",
                updates=computation_graph.updates),
            DataStreamMonitoring(
                monitored_quantities, valid_monitor_stream, prefix="valid",
                updates=computation_graph.updates),
            DataStreamMonitoring(
                monitored_quantities, test_monitor_stream, prefix="test",
                updates=computation_graph.updates),
            ProgressBar(),
            Printing()])
    main_loop.run()
Example 12: CategoricalCrossEntropy
s.set_value(sqrt(init_var).astype(floatX))

cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
cost.name = 'cost'
error_rate = MisclassificationRate().apply(y.flatten(), probs)
error_rate.name = 'error_rate'

cg = ComputationGraph([cost])
parameters = cg.parameters
# add gradient descent to M, S
if normalization == 'bn2':
    for m, s, var in statistics_list:
        parameters.extend([m, s])

algorithm = GradientDescent(
    cost=cost, parameters=parameters, step_rule=Adam(0.01))

# update the M and S with batch statistics
alpha = 0.1
updates = []
if normalization == 'bn2':
    for m, s, var in statistics_list:
        updates.append((m, cast(alpha * m + (1 - alpha) * var.mean(axis=0),
                                floatX)))
        updates.append((s, cast(alpha * s + (1 - alpha) * var.std(axis=0),
                                floatX)))
algorithm.add_updates(updates)

# Since this line won't work with the extension to include parameters
# in the gradient descent, here's an extension that will do the job.
from blocks.extensions import SimpleExtension
from theano import function
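The example is cut off right after importing SimpleExtension, so the extension announced in the comment is not shown. As a purely hypothetical sketch of the kind of extension that comment describes, one could compile the statistic updates into a Theano function and run it after every batch; the class name UpdateStatistics, the inputs argument, and the 'features' source name below are all invented for illustration and are not taken from the original code.

from blocks.extensions import SimpleExtension
from theano import function


class UpdateStatistics(SimpleExtension):  # hypothetical helper, not from the example
    """Apply (shared_variable, new_value) updates after each batch."""

    def __init__(self, inputs, updates, **kwargs):
        kwargs.setdefault("after_batch", True)
        super(UpdateStatistics, self).__init__(**kwargs)
        # `inputs` are the graph input variables (e.g. the features batch)
        # that the update expressions depend on.
        self._update_fn = function(inputs, [], updates=updates)

    def do(self, which_callback, *args):
        # Extensions triggered after_batch receive the current batch dict.
        batch, = args
        self._update_fn(batch['features'])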
Example 13: list
for param in discriminator_cg.parameters:
    param.name += '_d'

both = list(set(dsamples_cg.parameters) & set(generator_cg.parameters))
indices = []
for (i, par) in enumerate(dsamples_cg.parameters):
    if par in generator_cg.parameters:
        indices.append(i)
good_params = [dsamples_cg.parameters[i] for i in indices]

print 'tests'
for param in dsamples_cg.parameters:
    print param.name

discriminator_descent = GradientDescent(
    cost=cost_discriminator,
    parameters=discriminator_cg.parameters,
    step_rule=RMSProp(learning_rate=0.01, decay_rate=0.97))

print filter(lambda x: x.name[-2:] == '_g', dsamples_cg.parameters)
generator_descent = GradientDescent(
    cost=cost_generator,
    parameters=filter(lambda x: x.name[-2:] == '_g', dsamples_cg.parameters),
    # parameters=good_params,
    # parameters=dsamples_cg.parameters,
    step_rule=RMSProp(learning_rate=1., decay_rate=0.97))

generator_descent.total_step_norm.name = 'generator_total_step_norm'
generator_descent.total_gradient_norm.name = 'generator_total_gradient_norm'
discriminator_descent.total_step_norm.name = 'discriminator_total_step_norm'
discriminator_descent.total_gradient_norm.name = 'discriminator_total_gradient_norm'

from fuel.datasets import MNIST
mnist = MNIST(("train",))
Example 14: main
def main():
    nclasses = 27

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--length", type=int, default=180)
    parser.add_argument("--num-epochs", type=int, default=100)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=1e-3)
    parser.add_argument("--epsilon", type=float, default=1e-5)
    parser.add_argument("--num-hidden", type=int, default=1000)
    parser.add_argument("--baseline", action="store_true")
    parser.add_argument("--initialization",
                        choices="identity glorot orthogonal uniform".split(),
                        default="identity")
    parser.add_argument("--initial-gamma", type=float, default=1e-1)
    parser.add_argument("--initial-beta", type=float, default=0)
    parser.add_argument("--cluster", action="store_true")
    parser.add_argument("--activation", choices=list(activations.keys()),
                        default="tanh")
    parser.add_argument("--optimizer",
                        choices="sgdmomentum adam rmsprop".split(),
                        default="rmsprop")
    parser.add_argument("--continue-from")
    parser.add_argument("--evaluate")
    parser.add_argument("--dump-hiddens")
    args = parser.parse_args()

    np.random.seed(args.seed)
    blocks.config.config.default_seed = args.seed

    if args.continue_from:
        from blocks.serialization import load
        main_loop = load(args.continue_from)
        main_loop.run()
        sys.exit(0)

    graphs, extensions, updates = construct_graphs(args, nclasses)

    ### optimization algorithm definition
    if args.optimizer == "adam":
        optimizer = Adam(learning_rate=args.learning_rate)
    elif args.optimizer == "rmsprop":
        optimizer = RMSProp(learning_rate=args.learning_rate, decay_rate=0.9)
    elif args.optimizer == "sgdmomentum":
        optimizer = Momentum(learning_rate=args.learning_rate, momentum=0.99)
    step_rule = CompositeRule([StepClipping(1.0), optimizer])
    algorithm = GradientDescent(
        cost=graphs["training"].outputs[0],
        parameters=graphs["training"].parameters,
        step_rule=step_rule)
    algorithm.add_updates(updates["training"])
    model = Model(graphs["training"].outputs[0])
    extensions = extensions["training"] + extensions["inference"]

    # step monitor
    step_channels = []
    step_channels.extend([
        algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
        for name, param in model.get_parameter_dict().items()])
    step_channels.append(
        algorithm.total_step_norm.copy(name="total_step_norm"))
    step_channels.append(
        algorithm.total_gradient_norm.copy(name="total_gradient_norm"))
    step_channels.extend(graphs["training"].outputs)
    logger.warning("constructing training data monitor")
    extensions.append(TrainingDataMonitoring(
        step_channels, prefix="iteration", after_batch=True))

    # parameter monitor
    extensions.append(DataStreamMonitoring(
        [param.norm(2).copy(name="parameter.norm:%s" % name)
         for name, param in model.get_parameter_dict().items()],
        data_stream=None, after_epoch=True))

    validation_interval = 500
    # performance monitor
    for situation in "training inference".split():
        if situation == "inference" and not args.evaluate:
            # save time when we don't need the inference graph
            continue
        for which_set in "train valid test".split():
            logger.warning("constructing %s %s monitor"
                           % (which_set, situation))
            channels = list(graphs[situation].outputs)
            extensions.append(DataStreamMonitoring(
                channels,
                prefix="%s_%s" % (which_set, situation),
                every_n_batches=validation_interval,
                data_stream=get_stream(which_set=which_set,
                                       batch_size=args.batch_size,
                                       num_examples=10000,
                                       length=args.length)))

    extensions.extend([
        TrackTheBest("valid_training_error_rate",
                     "best_valid_training_error_rate"),
        DumpBest("best_valid_training_error_rate", "best.zip"),
#......... part of the code omitted .........
Example 15: main
#......... part of the code omitted .........
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + l2_regularization + train_nit_regularization
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train",))
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
            which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 128
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test,
        iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # Create a step rule that doubles the learning rate of biases, like Caffe.
    # scale_bias = Restrict(Scale(2), biases)
    # step_rule = CompositeRule([scale_bias, momentum])

    # Create a step rule that reduces the learning rate of noise
    scale_mask = Restrict(noise_step_rule, mask_parameters)
    step_rule = CompositeRule([scale_mask, momentum])

    # from theano.compile.nanguardmode import NanGuardMode

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=train_cost, parameters=trainable_parameters,
        step_rule=step_rule)
    algorithm.add_updates(extra_updates)
    # ,
    # theano_func_kwargs={
    #     'mode': NanGuardMode(
    #         nan_is_error=True, inf_is_error=True, big_is_error=True)})

    exp_name = save_to.replace('.%d', '')

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  EpochSchedule(momentum.learning_rate, [
                      (0, 0.01),   # Warm up with 0.01 learning rate
                      (50, 0.1),   # Then go back to 0.1
                      (100, 0.01),
                      (150, 0.001)
                      # (83, 0.01),   # Follow the schedule in the paper
                      # (125, 0.001)
                  ]),
                  EpochSchedule(noise_step_rule.learning_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4)