This article collects typical usage examples of the Python function tensorflow.moving_average_variables. If you have been wondering what moving_average_variables actually does, how to call it, or what real code that uses it looks like, the curated examples here may help.
Below are 14 code examples of the moving_average_variables function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
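Before the collected examples, here is a minimal sketch of what the function returns (my own illustration, not taken from the examples below; it assumes the TensorFlow 1.x graph-mode API): tf.moving_average_variables() simply reads the tf.GraphKeys.MOVING_AVERAGE_VARIABLES collection, which tf.train.ExponentialMovingAverage.apply() populates with the variables whose averages are tracked, and which batch-norm layers such as the slim ops.batch_norm in Example 1 populate with their moving_mean/moving_variance.

import tensorflow as tf  # TensorFlow 1.x graph-mode API assumed

v = tf.Variable(0.0, name='v', trainable=True)
ema = tf.train.ExponentialMovingAverage(decay=0.99)
update_op = ema.apply([v])  # registers `v` in the MOVING_AVERAGE_VARIABLES collection

# The function returns the collection contents; here that is just the variable `v`.
print([var.op.name for var in tf.moving_average_variables()])  # -> ['v']

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(update_op)  # one EMA update step for `v`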
Example 1: testMovingAverageVariables
def testMovingAverageVariables(self):
height, width = 3, 3
with self.test_session():
images = tf.random_uniform((5, height, width, 3), seed=1)
ops.batch_norm(images, scale=True)
moving_mean = tf.moving_average_variables()[0]
moving_variance = tf.moving_average_variables()[1]
self.assertEquals(moving_mean.op.name, 'BatchNorm/moving_mean')
self.assertEquals(moving_variance.op.name, 'BatchNorm/moving_variance')
Example 2: testCreateVariablesWithoutCenterWithoutScale
def testCreateVariablesWithoutCenterWithoutScale(self):
height, width = 3, 3
with self.test_session():
images = tf.random_uniform((5, height, width, 3), seed=1)
ops.batch_norm(images, center=False, scale=False)
beta = variables.get_variables_by_name('beta')
self.assertEquals(beta, [])
gamma = variables.get_variables_by_name('gamma')
self.assertEquals(gamma, [])
moving_mean = tf.moving_average_variables()[0]
moving_variance = tf.moving_average_variables()[1]
self.assertEquals(moving_mean.op.name, 'BatchNorm/moving_mean')
self.assertEquals(moving_variance.op.name, 'BatchNorm/moving_variance')
Example 3: _CheckDecay
def _CheckDecay(self, ema, actual_decay, dim):
tens = _Repeat(10.0, dim)
thirties = _Repeat(30.0, dim)
var0 = tf.Variable(tens, name="v0")
var1 = tf.Variable(thirties, name="v1")
tf.initialize_all_variables().run()
# Note that tensor2 is not a Variable but just a plain Tensor resulting
# from the sum operation.
tensor2 = var0 + var1
update = ema.apply([var0, var1, tensor2])
avg0 = ema.average(var0)
avg1 = ema.average(var1)
avg2 = ema.average(tensor2)
self.assertItemsEqual([var0, var1], tf.moving_average_variables())
self.assertFalse(avg0 in tf.trainable_variables())
self.assertFalse(avg1 in tf.trainable_variables())
self.assertFalse(avg2 in tf.trainable_variables())
tf.initialize_all_variables().run()
self.assertEqual("v0/ExponentialMovingAverage:0", avg0.name)
self.assertEqual("v1/ExponentialMovingAverage:0", avg1.name)
self.assertEqual("add/ExponentialMovingAverage:0", avg2.name)
# Check initial values.
self.assertAllClose(tens, var0.eval())
self.assertAllClose(thirties, var1.eval())
self.assertAllClose(_Repeat(10.0 + 30.0, dim), tensor2.eval())
# Check that averages are initialized correctly.
self.assertAllClose(tens, avg0.eval())
self.assertAllClose(thirties, avg1.eval())
# Note that averages of Tensors initialize to zeros_like since no value
# of the Tensor is known because the Op has not been run (yet).
self.assertAllClose(_Repeat(0.0, dim), avg2.eval())
# Update the averages and check.
update.run()
dk = actual_decay
expected = _Repeat(10.0 * dk + 10.0 * (1 - dk), dim)
self.assertAllClose(expected, avg0.eval())
expected = _Repeat(30.0 * dk + 30.0 * (1 - dk), dim)
self.assertAllClose(expected, avg1.eval())
expected = _Repeat(0.0 * dk + (10.0 + 30.0) * (1 - dk), dim)
self.assertAllClose(expected, avg2.eval())
# Again, update the averages and check.
update.run()
expected = _Repeat((10.0 * dk + 10.0 * (1 - dk)) * dk + 10.0 * (1 - dk),
dim)
self.assertAllClose(expected, avg0.eval())
expected = _Repeat((30.0 * dk + 30.0 * (1 - dk)) * dk + 30.0 * (1 - dk),
dim)
self.assertAllClose(expected, avg1.eval())
expected = _Repeat(((0.0 * dk + (10.0 + 30.0) * (1 - dk)) * dk +
(10.0 + 30.0) * (1 - dk)),
dim)
self.assertAllClose(expected, avg2.eval())
Example 4: get_other_op
def get_other_op(global_step):
batchnorm_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
# Track the moving averages of all trainable variables
variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
variables_to_average = (tf.trainable_variables() + tf.moving_average_variables())
variables_averages_op = variable_averages.apply(variables_to_average)
batchnorm_updates_op = tf.group(*batchnorm_updates)
return variables_averages_op, batchnorm_updates_op
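A hedged usage note (my own sketch, not part of the example): a caller would typically group the two returned ops with the optimizer's gradient-update op so that one training step applies gradients, refreshes the batch-norm statistics, and updates the moving averages. The names opt, total_loss and global_step below are assumed to be defined elsewhere.

grads = opt.compute_gradients(total_loss)
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
variables_averages_op, batchnorm_updates_op = get_other_op(global_step)
train_op = tf.group(apply_gradient_op, variables_averages_op, batchnorm_updates_op)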
Example 5: create_init_fn_to_restore
def create_init_fn_to_restore(self, master_checkpoint,
inception_checkpoint=None):
"""Creates an init operations to restore weights from various checkpoints.
Args:
master_checkpoint: path to a checkpoint which contains all weights for
the whole model.
inception_checkpoint: path to a checkpoint which contains weights for the
inception part only.
Returns:
a function to run initialization ops.
"""
all_assign_ops = []
all_feed_dict = {}
def assign_from_checkpoint(variables, checkpoint):
logging.info('Request to re-store %d weights from %s',
len(variables), checkpoint)
if not variables:
logging.error('Can\'t find any variables to restore.')
sys.exit(1)
assign_op, feed_dict = slim.assign_from_checkpoint(checkpoint, variables)
all_assign_ops.append(assign_op)
all_feed_dict.update(feed_dict)
logging.info('variables_to_restore:\n%s' % utils.variables_to_restore().keys())
logging.info('moving_average_variables:\n%s' % [v.op.name for v in tf.moving_average_variables()])
logging.info('trainable_variables:\n%s' % [v.op.name for v in tf.trainable_variables()])
if master_checkpoint:
assign_from_checkpoint(utils.variables_to_restore(), master_checkpoint)
if inception_checkpoint:
variables = utils.variables_to_restore(
'AttentionOcr_v1/conv_tower_fn/INCE', strip_scope=True)
assign_from_checkpoint(variables, inception_checkpoint)
def init_assign_fn(sess):
logging.info('Restoring checkpoint(s)')
sess.run(all_assign_ops, all_feed_dict)
return init_assign_fn
Example 6: add_train_step
def add_train_step(self):
with tf.variable_scope('training'):
loss = slim.losses.cross_entropy_loss(self.logits[0], self.ground_truth, label_smoothing=0.1, weight=1.0)
loss_auxiliary = slim.losses.cross_entropy_loss(self.logits[1], self.ground_truth, label_smoothing=0.1, weight=0.4, scope='aux_loss')
losses = [loss, loss_auxiliary]
regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
total_loss = tf.add_n(losses + regularization_losses, name='total_loss')
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
loss_averages_op = loss_averages.apply(losses + [total_loss])
with tf.control_dependencies([loss_averages_op]):
self.total_loss = tf.identity(total_loss)
apply_gradient_op = self.optimizer.minimize(self.total_loss)
variable_averages = tf.train.ExponentialMovingAverage(inception.MOVING_AVERAGE_DECAY, num_updates=None)
variables_to_average = (tf.trainable_variables() + tf.moving_average_variables())
variables_averages_op = variable_averages.apply(variables_to_average)
batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)
batchnorm_updates_op = tf.group(*batchnorm_updates)
self.train_step = tf.group(apply_gradient_op, variables_averages_op, batchnorm_updates_op)
Example 7: train
#......... part of the code omitted here .........
grads = opt.compute_gradients(loss)
# Keep track of the gradients across all towers.
tower_grads.append(grads)
# We must calculate the mean of each gradient. Note that this is the
# synchronization point across all towers.
grads = _average_gradients(tower_grads)
# Add summaries for the input processing and global_step.
summaries.extend(input_summaries)
# Add a summary to track the learning rate.
summaries.append(tf.scalar_summary('learning_rate', lr))
# Add histograms for gradients.
for grad, var in grads:
if grad is not None:
summaries.append(
tf.histogram_summary(var.op.name + '/gradients', grad))
# Apply the gradients to adjust the shared variables.
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
# Add histograms for trainable variables.
for var in tf.trainable_variables():
summaries.append(tf.histogram_summary(var.op.name, var))
# Track the moving averages of all trainable variables.
# Note that we maintain a "double-average" of the BatchNormalization
# global statistics. This is more complicated than it needs to be, but we employ
# this for backward-compatibility with our previous models.
variable_averages = tf.train.ExponentialMovingAverage(
inception.MOVING_AVERAGE_DECAY, global_step)
# Another possibility is to use tf.slim.get_variables().
variables_to_average = (tf.trainable_variables() +
tf.moving_average_variables())
variables_averages_op = variable_averages.apply(variables_to_average)
# Group all updates into a single train op.
batchnorm_updates_op = tf.group(*batchnorm_updates)
train_op = tf.group(apply_gradient_op, variables_averages_op,
batchnorm_updates_op)
# Create a saver.
saver = tf.train.Saver(tf.all_variables())
# Build the summary operation from the last tower summaries.
summary_op = tf.merge_summary(summaries)
# Build an initialization operation to run below.
init = tf.initialize_all_variables()
# Start running operations on the Graph. allow_soft_placement must be set to
# True to build towers on GPU, as some of the ops do not have GPU
# implementations.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
if FLAGS.pretrained_model_checkpoint_path:
assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
variables_to_restore = tf.get_collection(
slim.variables.VARIABLES_TO_RESTORE)
restorer = tf.train.Saver(variables_to_restore)
restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
print('%s: Pre-trained model restored from %s' %
(datetime.now(), FLAGS.pretrained_model_checkpoint_path))
# Start the queue runners.
tf.train.start_queue_runners(sess=sess)
summary_writer = tf.train.SummaryWriter(
FLAGS.train_dir,
graph_def=sess.graph.as_graph_def(add_shapes=True))
for step in xrange(FLAGS.max_steps):
start_time = time.time()
_, loss_value = sess.run([train_op, loss])
duration = time.time() - start_time
assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
if step % 10 == 0:
examples_per_sec = FLAGS.batch_size / float(duration)
format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
'sec/batch)')
print(format_str % (datetime.now(), step, loss_value,
examples_per_sec, duration))
if step % 100 == 0:
summary_str = sess.run(summary_op)
summary_writer.add_summary(summary_str, step)
# Save the model checkpoint periodically.
if step % 5000 == 0 or (step + 1) == FLAGS.max_steps:
checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
saver.save(sess, checkpoint_path, global_step=step)
Example 8: test_restore_ema
def test_restore_ema(self):
# Create 100 phony x, y data points in NumPy, y = x * 0.1 + 0.3
x_data = np.random.rand(100).astype(np.float32)
y_data = x_data * 0.1 + 0.3
# Try to find values for W and b that compute y_data = W * x_data + b
# (We know that W should be 0.1 and b 0.3, but TensorFlow will
# figure that out for us.)
W = tf.Variable(tf.random_uniform([1], -1.0, 1.0), name='W')
b = tf.Variable(tf.zeros([1]), name='b')
y = W * x_data + b
# Minimize the mean squared errors.
loss = tf.reduce_mean(tf.square(y - y_data))
optimizer = tf.train.GradientDescentOptimizer(0.5)
opt_op = optimizer.minimize(loss)
# Track the moving averages of all trainable variables.
ema = tf.train.ExponentialMovingAverage(decay=0.9999)
averages_op = ema.apply(tf.trainable_variables())
with tf.control_dependencies([opt_op]):
train_op = tf.group(averages_op)
# Before starting, initialize the variables. We will 'run' this first.
init = tf.global_variables_initializer()
saver = tf.train.Saver(tf.trainable_variables())
# Launch the graph.
sess = tf.Session()
sess.run(init)
# Fit the line.
for _ in range(201):
sess.run(train_op)
w_reference = sess.run('W/ExponentialMovingAverage:0')
b_reference = sess.run('b/ExponentialMovingAverage:0')
saver.save(sess, os.path.join(self.tmp_dir, "model_ex1"))
tf.reset_default_graph()
tf.train.import_meta_graph(os.path.join(self.tmp_dir, "model_ex1.meta"))
sess = tf.Session()
print('------------------------------------------------------')
for var in tf.global_variables():
print('all variables: ' + var.op.name)
for var in tf.trainable_variables():
print('normal variable: ' + var.op.name)
for var in tf.moving_average_variables():
print('ema variable: ' + var.op.name)
print('------------------------------------------------------')
mode = 1
restore_vars = {}
if mode == 0:
ema = tf.train.ExponentialMovingAverage(1.0)
for var in tf.trainable_variables():
print('%s: %s' % (ema.average_name(var), var.op.name))
restore_vars[ema.average_name(var)] = var
elif mode == 1:
for var in tf.trainable_variables():
ema_name = var.op.name + '/ExponentialMovingAverage'
print('%s: %s' % (ema_name, var.op.name))
restore_vars[ema_name] = var
saver = tf.train.Saver(restore_vars, name='ema_restore')
saver.restore(sess, os.path.join(self.tmp_dir, "model_ex1"))
w_restored = sess.run('W:0')
b_restored = sess.run('b:0')
self.assertAlmostEqual(w_reference, w_restored, msg='Restored model does not use the EMA-filtered weight')
self.assertAlmostEqual(b_reference, b_restored, msg='Restored model does not use the EMA-filtered bias')
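A hedged follow-up to this example (my own sketch, assuming the same TensorFlow 1.x API and the same sess, self.tmp_dir and checkpoint as above): the hand-built name mapping in modes 0 and 1 can usually be replaced by ExponentialMovingAverage.variables_to_restore(), which produces the same shadow-name-to-variable dictionary.

ema = tf.train.ExponentialMovingAverage(decay=0.9999)
# Maps 'W/ExponentialMovingAverage' -> W, 'b/ExponentialMovingAverage' -> b, and keeps
# variables that have no moving average under their own names.
ema_saver = tf.train.Saver(ema.variables_to_restore(), name='ema_restore')
ema_saver.restore(sess, os.path.join(self.tmp_dir, "model_ex1"))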
Example 9: train
#......... part of the code omitted here .........
gt_images, = tf.py_func(utils.batch_draw_landmarks, [images, lms],
[tf.float32])
summary = tf.image_summary('images',
tf.concat(2, [gt_images, pred_images]),
max_images=5)
summaries.append(tf.histogram_summary('dx', predictions - inits))
summaries.append(summary)
batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION,
scope)
# Add a summary to track the learning rate.
summaries.append(tf.scalar_summary('learning_rate', lr))
# Add histograms for gradients.
for grad, var in grads:
if grad is not None:
summaries.append(tf.histogram_summary(var.op.name +
'/gradients', grad))
# Apply the gradients to adjust the shared variables.
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
# Add histograms for trainable variables.
for var in tf.trainable_variables():
summaries.append(tf.histogram_summary(var.op.name, var))
# Track the moving averages of all trainable variables.
# Note that we maintain a "double-average" of the BatchNormalization
# global statistics. This is more complicated than it needs to be, but we employ
# this for backward-compatibility with our previous models.
variable_averages = tf.train.ExponentialMovingAverage(
MOVING_AVERAGE_DECAY, global_step)
# Another possibility is to use tf.slim.get_variables().
variables_to_average = (
tf.trainable_variables() + tf.moving_average_variables())
variables_averages_op = variable_averages.apply(variables_to_average)
# Group all updates into a single train op.
# NOTE: Currently we are not using batchnorm in MDM.
batchnorm_updates_op = tf.group(*batchnorm_updates)
train_op = tf.group(apply_gradient_op, variables_averages_op,
batchnorm_updates_op)
# Create a saver.
saver = tf.train.Saver(tf.all_variables())
# Build the summary operation from the last tower summaries.
summary_op = tf.merge_summary(summaries)
# Start running operations on the Graph. allow_soft_placement must be
# set to True to build towers on GPU, as some of the ops do not have GPU
# implementations.
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
# Build an initialization operation to run below.
init = tf.initialize_all_variables()
print('Initializing variables...')
sess.run(init)
print('Initialized variables.')
if FLAGS.pretrained_model_checkpoint_path:
assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
variables_to_restore = tf.get_collection(
slim.variables.VARIABLES_TO_RESTORE)
restorer = tf.train.Saver(variables_to_restore)
restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
print('%s: Pre-trained model restored from %s' %
(datetime.now(), FLAGS.pretrained_model_checkpoint_path))
# Start the queue runners.
tf.train.start_queue_runners(sess=sess)
summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)
print('Starting training...')
for step in xrange(FLAGS.max_steps):
start_time = time.time()
_, loss_value = sess.run([train_op, total_loss])
duration = time.time() - start_time
assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
if step % 10 == 0:
examples_per_sec = FLAGS.batch_size / float(duration)
format_str = (
'%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
'sec/batch)')
print(format_str % (datetime.now(), step, loss_value,
examples_per_sec, duration))
if step % 10 == 0:
summary_str = sess.run(summary_op)
summary_writer.add_summary(summary_str, step)
# Save the model checkpoint periodically.
if step % 50 == 0 or (step + 1) == FLAGS.max_steps:
checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
saver.save(sess, checkpoint_path, global_step=step)
Example 10: train
#......... part of the code omitted here .........
losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
total_loss = tf.add_n(losses, name='total_loss')
if is_chief:
# Compute the moving average of all individual losses and the
# total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss;
# do the same for the averaged version of the losses.
for l in losses + [total_loss]:
loss_name = l.op.name
# Name each loss as '(raw)' and name the moving average version of the
# loss as the original loss name.
tf.scalar_summary(loss_name + ' (raw)', l)
tf.scalar_summary(loss_name, loss_averages.average(l))
# Add dependency to compute loss_averages.
with tf.control_dependencies([loss_averages_op]):
total_loss = tf.identity(total_loss)
# Track the moving averages of all trainable variables.
# Note that we maintain a 'double-average' of the BatchNormalization
# global statistics.
# This is not needed when the number of replicas is small, but it is important
# for synchronous distributed training with tens of workers/replicas.
exp_moving_averager = tf.train.ExponentialMovingAverage(
inception.MOVING_AVERAGE_DECAY, global_step)
variables_to_average = (
tf.trainable_variables() + tf.moving_average_variables())
# Add histograms for model variables.
for var in variables_to_average:
tf.histogram_summary(var.op.name, var)
# Create synchronous replica optimizer.
opt = tf.train.SyncReplicasOptimizer(
opt,
replicas_to_aggregate=num_replicas_to_aggregate,
replica_id=FLAGS.task_id,
total_num_replicas=num_workers,
variable_averages=exp_moving_averager,
variables_to_average=variables_to_average)
batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)
assert batchnorm_updates, 'Batchnorm updates are missing'
batchnorm_updates_op = tf.group(*batchnorm_updates)
# Add dependency to compute batchnorm_updates.
with tf.control_dependencies([batchnorm_updates_op]):
total_loss = tf.identity(total_loss)
# Compute gradients with respect to the loss.
grads = opt.compute_gradients(total_loss)
# Add histograms for gradients.
for grad, var in grads:
if grad is not None:
tf.histogram_summary(var.op.name + '/gradients', grad)
apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)
with tf.control_dependencies([apply_gradients_op]):
Example 11: build_graph
#......... part of the code omitted here .........
images0 = [tf.image.random_flip_up_down(i) for i in images0]
if hypes["spatial_transformer"]:
images = skin.util.spatial_tranform(
images0, hypes["batch_size"], subset, hypes["loc_net"], hypes["xform_reg"]
)
else:
images = tf.pack([tf.image.resize_images(i, 299, 299) for i in images0])
with tf.name_scope(None):
images = tf.identity(images, name="input")
logits, logits_aux = inception_model.inference(
images=(images - 128) / 128.0,
num_classes=len(self.labels),
for_training=(subset == "train"),
restore_logits=(subset != "train"),
)
with tf.name_scope(None):
logits = tf.identity(logits, name="logits")
tf.histogram_summary("logits", logits)
with tf.name_scope("loss"):
batch_size, num_classes = logits.get_shape().as_list()
labels_sparse = tf.sparse_to_dense(
sparse_indices=tf.transpose(tf.pack([tf.range(batch_size), labels])),
output_shape=[batch_size, num_classes],
sparse_values=np.ones(batch_size, dtype="float32"),
)
loss = tf.nn.softmax_cross_entropy_with_logits(logits, labels_sparse)
loss = tf.reduce_mean(loss, name="loss")
loss_aux = tf.nn.softmax_cross_entropy_with_logits(logits_aux, labels_sparse)
loss_aux = tf.reduce_mean(loss_aux, name="loss_aux")
loss = 0.7 * loss + 0.3 * loss_aux
tf.scalar_summary("loss", loss)
fetches = {"loss": loss, "filenames": filenames, "logits": logits}
def print_graph_ops():
with open("/tmp/graph_ops.txt", "w") as f:
for op in tf.get_default_graph().get_operations():
f.write(op.type.ljust(35) + "\t" + op.name + "\n")
if subset == "train":
reg_losses = tf.get_collection("regularization_losses")
for i, j in enumerate(reg_losses):
if "loc_net" in j.name:
reg_losses[i] *= hypes["loc_net_reg"]
reg_loss = tf.add_n(reg_losses)
tf.scalar_summary("reg_loss", reg_loss)
with tf.variable_scope("reg_loss"):
loss += reg_loss
print_graph_ops()
global_step = tf.Variable(0, name="global_step", trainable=False)
opt = eval("tf.train.{}Optimizer".format("Adam"))(
learning_rate=hypes["learning_rate"],
epsilon=hypes["epsilon"],
beta1=hypes["beta1"],
beta2=hypes["beta2"],
)
grads = opt.compute_gradients(loss)
apply_grads = opt.apply_gradients(grads, global_step)
variable_averages = tf.train.ExponentialMovingAverage(hypes["variable_averages_decay"], global_step)
variables_to_average = tf.trainable_variables() + tf.moving_average_variables()
variables_averages_op = variable_averages.apply(variables_to_average)
batchnorm_updates_op = tf.group(*tf.get_collection("_update_ops_"))
train_op = tf.group(apply_grads, variables_averages_op, batchnorm_updates_op)
for grad, var in grads:
tf.histogram_summary(var.op.name, var)
try:
tf.histogram_summary(var.op.name + "/gradients", grad)
except:
print var.op.name
fetches.update({"reg_loss": reg_loss, "train_op": train_op, "global_step": global_step})
else:
print_graph_ops()
return fetches
Example 12: main
def main(argv=None):
# Put the simple operations on the CPU; only the network training itself runs on the GPUs.
with tf.Graph().as_default(), tf.device('/cpu:0'):
# Define the basic training setup.
x, y_ = get_input()
regularizer = tf.contrib.layers.l2_regularizer(REGULARAZTION_RATE)
global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
learning_rate = tf.train.exponential_decay(
LEARNING_RATE_BASE, global_step, 60000 / BATCH_SIZE, LEARNING_RATE_DECAY)
opt = tf.train.GradientDescentOptimizer(learning_rate)
tower_grads = []
reuse_variables = False
# Run the optimization of the network on different GPUs.
for i in range(N_GPU):
# Pin this optimization step to a single GPU.
with tf.device('/gpu:%d' % i):
with tf.name_scope('GPU_%d' % i) as scope:
cur_loss = get_loss(x, y_, regularizer, scope, reuse_variables)
reuse_variables = True
grads = opt.compute_gradients(cur_loss)
tower_grads.append(grads)
# Compute the averaged gradients across the towers.
grads = average_gradients(tower_grads)
for grad, var in grads:
if grad is not None:
tf.histogram_summary('gradients_on_average/%s' % var.op.name, grad)
# Apply the averaged gradients to update the parameters.
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
for var in tf.trainable_variables():
tf.histogram_summary(var.op.name, var)
# Maintain moving averages of the variables.
variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
variables_to_average = (tf.trainable_variables() + tf.moving_average_variables())
variables_averages_op = variable_averages.apply(variables_to_average)
# Each training step must update both the variable values and their moving averages.
train_op = tf.group(apply_gradient_op, variables_averages_op)
saver = tf.train.Saver(tf.all_variables())
summary_op = tf.merge_all_summaries()
init = tf.initialize_all_variables()
with tf.Session(config=tf.ConfigProto(
allow_soft_placement=True, log_device_placement=True)) as sess:
# Initialize all variables and start the input queue runners.
init.run()
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
summary_writer = tf.train.SummaryWriter(MODEL_SAVE_PATH, sess.graph)
for step in range(TRAINING_STEPS):
# Run one training step and time how long it takes.
start_time = time.time()
_, loss_value = sess.run([train_op, cur_loss])
duration = time.time() - start_time
# Periodically report the current training progress and measure the training speed.
if step != 0 and step % 10 == 0:
# Number of training examples consumed in this step.
num_examples_per_step = BATCH_SIZE * N_GPU
examples_per_sec = num_examples_per_step / duration
sec_per_batch = duration / N_GPU
# Print the training information.
format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))
# Export summaries so the training process can be visualized in TensorBoard.
summary = sess.run(summary_op)
summary_writer.add_summary(summary, step)
# Periodically save the current model.
if step % 1000 == 0 or (step + 1) == TRAINING_STEPS:
checkpoint_path = os.path.join(MODEL_SAVE_PATH, MODEL_NAME)
saver.save(sess, checkpoint_path, global_step=step)
coord.request_stop()
coord.join(threads)
if __name__ == '__main__':
tf.app.run()
Example 13: train
def train():
ps_hosts = FLAGS.ps_hosts.split(',')
worker_hosts = FLAGS.worker_hosts.split(',')
print ('PS hosts are: %s' % ps_hosts)
print ('Worker hosts are: %s' % worker_hosts)
server = tf.train.Server(
{'ps': ps_hosts, 'worker': worker_hosts},
job_name = FLAGS.job_name,
task_index=FLAGS.task_id)
if FLAGS.job_name == 'ps':
# `ps` jobs wait for incoming connections from the workers.
server.join()
is_chief = (FLAGS.task_id == 0)
if is_chief:
if tf.gfile.Exists(FLAGS.train_dir):
tf.gfile.DeleteRecursively(FLAGS.train_dir)
tf.gfile.MakeDirs(FLAGS.train_dir)
"""Train CIFAR-10 for a number of steps."""
cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
device_setter = tf.train.replica_device_setter(cluster=cluster)
with tf.device(device_setter):
global_step = tf.Variable(0, trainable=False)
# Get images and labels for CIFAR-10.
images, labels = cifar10.distorted_inputs()
# Build a Graph that computes the logits predictions from the
# inference model.
logits = cifar10.inference(images)
# Calculate loss.
loss = cifar10.loss(logits, labels)
num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
# Decay the learning rate exponentially based on the number of steps.
lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
global_step,
decay_steps,
LEARNING_RATE_DECAY_FACTOR,
staircase=True)
tf.scalar_summary('learning_rate', lr)
opt = tf.train.GradientDescentOptimizer(lr)
# Track the moving averages of all trainable variables.
exp_moving_averager = tf.train.ExponentialMovingAverage(
MOVING_AVERAGE_DECAY, global_step)
variables_to_average = (
tf.trainable_variables() + tf.moving_average_variables())
opt = tf.train.SyncReplicasOptimizer(
opt,
replicas_to_aggregate=len(worker_hosts),
replica_id=FLAGS.task_id,
total_num_replicas=len(worker_hosts),
variable_averages=exp_moving_averager,
variables_to_average=variables_to_average)
# Compute gradients with respect to the loss.
grads = opt.compute_gradients(loss)
# Add histograms for gradients.
for grad, var in grads:
if grad is not None:
tf.histogram_summary(var.op.name + '/gradients', grad)
apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)
with tf.control_dependencies([apply_gradients_op]):
train_op = tf.identity(loss, name='train_op')
chief_queue_runners = [opt.get_chief_queue_runner()]
init_tokens_op = opt.get_init_tokens_op()
saver = tf.train.Saver()
# We run the summaries in the same thread as the training operations by
# passing in None for summary_op to avoid a summary_thread being started.
# Running summaries and training operations in parallel could run out of
# GPU memory.
sv = tf.train.Supervisor(is_chief=is_chief,
logdir=FLAGS.train_dir,
init_op=tf.initialize_all_variables(),
summary_op=tf.merge_all_summaries(),
global_step=global_step,
saver=saver,
save_model_secs=60)
tf.logging.info('%s Supervisor' % datetime.now())
sess_config = tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement)
#......... part of the code omitted here .........
Example 14: train
def train():
assert FLAGS.job_name in ['ps', 'worker'], 'job_name must be ps or worker'
ps_hosts = FLAGS.ps_hosts.split(',')
worker_hosts = FLAGS.worker_hosts.split(',')
tf.logging.info('PS hosts are %s ' % ps_hosts)
tf.logging.info('Worker hosts are %s ' % worker_hosts)
cluster_spec = tf.train.ClusterSpec({'ps': ps_hosts,
'worker': worker_hosts})
server = tf.train.Server(cluster_spec, job_name=FLAGS.job_name,
task_index=FLAGS.task_id)
if FLAGS.job_name == 'ps':
server.join()
else:
"""Train Inception on a dataset for a number of steps."""
# The number of workers and parameter servers is inferred from the worker and ps
# hosts string.
num_workers = len(cluster_spec.as_dict()['worker'])
num_parameter_servers = len(cluster_spec.as_dict()['ps'])
# If no value is given, num_replicas_to_aggregate defaults to the number of
# workers.
if FLAGS.num_replicas_to_aggregate == -1:
num_replicas_to_aggregate = num_workers
else:
num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate
# Both should be greater than 0 in a distributed training.
assert num_workers > 0 and num_parameter_servers > 0, (' num_workers and '
'num_parameter_servers'
' must be > 0.')
# Choose worker 0 as the chief. Note that any worker could be the chief
# but there should be only one chief.
is_chief = (FLAGS.task_id == 0)
# Ops are assigned to worker by default.
with tf.device(tf.train.replica_device_setter(worker_device='/job:worker/task:%d' % FLAGS.task_id,
cluster=cluster_spec)):
# Variables and their related init/assign ops are assigned to ps.
# with slim.scopes.arg_scope(
# [slim.variables.variable, slim.variables.global_step],
# device=slim.variables.VariableDeviceChooser(num_parameter_servers)):
# Create a variable to count the number of train() calls. This equals the
# number of updates applied to the variables.
#global_step = slim.variables.global_step()
global_step = tf.Variable(0, name='global_step', trainable=False)
num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
# Decay the learning rate exponentially based on the number of steps.
lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
global_step,
decay_steps,
LEARNING_RATE_DECAY_FACTOR,
staircase=True)
tf.scalar_summary('learning_rate', lr)
opt = tf.train.GradientDescentOptimizer(lr)
images, labels = image_two_stream.distorted_inputs()
logits = image_two_stream.inference_final(images)
total_loss = image_two_stream.loss(logits, labels)
# train_op = image.train(loss, global_step)
if is_chief:
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
losses = tf.get_collection('losses')
loss_averages_op = loss_averages.apply(losses + [total_loss])
for l in losses + [total_loss]:
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(l.op.name + ' (raw)', l)
tf.scalar_summary(l.op.name, loss_averages.average(l))
with tf.control_dependencies([loss_averages_op]):
total_loss = tf.identity(total_loss)
variable_averages = tf.train.ExponentialMovingAverage(
MOVING_AVERAGE_DECAY, global_step)
variables_averages_op = (tf.trainable_variables() + tf.moving_average_variables())
for var in variables_averages_op:
tf.histogram_summary(var.op.name, var)
opt = tf.train.SyncReplicasOptimizer(
opt,
replicas_to_aggregate=num_replicas_to_aggregate,
replica_id=FLAGS.task_id,
total_num_replicas=num_workers,
variable_averages=variable_averages,
variables_to_average=variables_averages_op)
#batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)
#assert batchnorm_updates, 'Batchnorm updates are missing'
# batchnorm_updates_op = tf.group(*batchnorm_updates)
## Add dependency to compute batchnorm_updates.
#......... part of the code omitted here .........