本文整理匯總了Python中tensorflow.contrib.nccl.all_sum方法的典型用法代碼示例。如果您正苦於以下問題:Python nccl.all_sum方法的具體用法?Python nccl.all_sum怎麼用?Python nccl.all_sum使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在模塊tensorflow.contrib.nccl的用法示例。
在下文中一共展示了nccl.all_sum方法的9個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: allreduce_grads
# 需要導入模塊: from tensorflow.contrib import nccl [as 別名]
# 或者: from tensorflow.contrib.nccl import all_sum [as 別名]
def allreduce_grads(all_grads, average):
    """
    Sum (and optionally average) every variable's gradient across K towers
    with NCCL. Each tower receives its own copy of the reduced gradient.

    Args:
        all_grads (K x N): per-tower lists of gradients. N is the number of
            variables.
        average (bool): when true, scale each summed gradient by 1/K.

    Returns:
        The input object itself when K == 1; otherwise a K-long list of
        N-tuples holding the reduced gradient for each (tower, variable).
    """
    num_towers = len(all_grads)
    if num_towers == 1:
        # A single tower has nothing to reduce against.
        return all_grads

    reduced_per_var = []  # N x K
    for grads_of_one_var in zip(*all_grads):
        per_tower = []  # K
        for reduced in nccl.all_sum(grads_of_one_var):
            # Keep the scaling op on the device that owns this copy.
            with tf.device(reduced.device):
                # tensorflow/benchmarks didn't average gradients
                per_tower.append(
                    tf.multiply(reduced, 1.0 / num_towers)
                    if average else reduced)
        reduced_per_var.append(per_tower)

    # Transpose N x K back to K x N.
    return list(zip(*reduced_per_var))
示例2: sum_grad_and_var_all_reduce
# 需要導入模塊: from tensorflow.contrib import nccl [as 別名]
# 或者: from tensorflow.contrib.nccl import all_sum [as 別名]
def sum_grad_and_var_all_reduce(grad_and_vars, num_workers, alg, gpu_indices,
                                aux_devices=None, num_shards=1):
    """Apply all-reduce algorithm over specified gradient tensors.

    Args:
      grad_and_vars: sequence of (gradient, variable) pairs, e.g.
        ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)).
      num_workers: forwarded to all_reduce.build_ring_all_reduce ('xring').
      alg: one of 'nccl', 'xring', 'nccl/xring', 'nccl/rechd', 'nccl/pscpu',
        'pscpu/pscpu', 'pscpu' or 'psgpu'.
      gpu_indices: forwarded to all_reduce.build_ring_all_reduce ('xring').
      aux_devices: forwarded to the shuffle-based builders; it is indexed
        (aux_devices[0]) for 'pscpu/pscpu', so it must not be None there.
      num_shards: forwarded to the ring-based builders.

    Returns:
      A list of [summed_gradient, variable] pairs, in the order of
      grad_and_vars.

    Raises:
      ValueError: if alg is not one of the supported names.
    """
    # Note that each grad_and_vars looks like the following:
    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
    scaled_grads = [g for g, _ in grad_and_vars]
    if alg == 'nccl':
        summed_grads = nccl.all_sum(scaled_grads)
    elif alg == 'xring':
        summed_grads = all_reduce.build_ring_all_reduce(
            scaled_grads, num_workers, num_shards, gpu_indices, tf.add)
    elif alg == 'nccl/xring':
        summed_grads = all_reduce.build_nccl_then_ring(
            scaled_grads, num_shards, tf.add)
    elif alg == 'nccl/rechd':
        summed_grads = all_reduce.build_nccl_then_recursive_hd(
            scaled_grads, tf.add)
    elif alg == 'nccl/pscpu':
        summed_grads = all_reduce.build_nccl_then_shuffle(
            scaled_grads, aux_devices, tf.add, tf.add_n)
    elif alg == 'pscpu/pscpu':
        summed_grads = all_reduce.build_shuffle_then_shuffle(
            scaled_grads, aux_devices,
            # TODO(tucker): devise a way of better specifying the device set
            # for the second level.
            [aux_devices[0]],
            tf.add_n)
    elif alg in ['pscpu', 'psgpu']:
        summed_grads = all_reduce.build_shuffle_all_reduce(
            scaled_grads, aux_devices, tf.add_n)
    else:
        # Format the name into the message: passing it as a second positional
        # argument makes str(exc) render as a tuple instead of a sentence.
        raise ValueError('unsupported all_reduce alg: {}'.format(alg))

    # Re-pair every summed gradient with the variable it belongs to.
    return [[g, v] for (_, v), g in zip(grad_and_vars, summed_grads)]
示例3: allreduce_grads
# 需要導入模塊: from tensorflow.contrib import nccl [as 別名]
# 或者: from tensorflow.contrib.nccl import all_sum [as 別名]
def allreduce_grads(all_grads, average):
    """
    All-reduce average the gradients among K devices. Results are broadcasted to all devices.

    Args:
        all_grads (K x N): List of list of gradients. N is the number of variables.
        average (bool): average gradients or not.

    Returns:
        K x N: same as input, but each grad is replaced by the sum (or, when
        ``average`` is true, the average) over K devices. With a single
        device the input object is returned untouched.
    """
    nr_tower = len(all_grads)
    if nr_tower == 1:
        return all_grads

    # Imported only once a reduction is really needed, so the single-tower
    # path keeps working on TensorFlow builds that ship without contrib NCCL.
    from tensorflow.contrib import nccl

    new_all_grads = []  # N x K
    for grads in zip(*all_grads):
        summed = nccl.all_sum(grads)

        grads_for_devices = []  # K
        for g in summed:
            # Place the optional scaling op next to the tensor it scales.
            with tf.device(g.device):
                # tensorflow/benchmarks didn't average gradients
                if average:
                    g = tf.multiply(g, 1.0 / nr_tower)
            grads_for_devices.append(g)
        new_all_grads.append(grads_for_devices)

    # transpose to K x N
    return list(zip(*new_all_grads))
示例4: allreduce_grads
# 需要導入模塊: from tensorflow.contrib import nccl [as 別名]
# 或者: from tensorflow.contrib.nccl import all_sum [as 別名]
def allreduce_grads(all_grads, average):
    """
    All-reduce average the gradients among K devices. Results are broadcasted to all devices.

    Args:
        all_grads (K x N): List of list of gradients. N is the number of variables.
        average (bool): average gradients or not.

    Returns:
        K x N: same as input, but each grad is replaced by the sum (or, when
        ``average`` is true, the average) over K devices. With a single
        device the input object is returned untouched.
    """
    nr_tower = len(all_grads)
    if nr_tower == 1:
        return all_grads

    # Resolve the NCCL module only when a reduction is really needed, so the
    # single-tower path neither probes the TF version nor requires NCCL ops.
    # The ops are taken from contrib when the version tuple is <= (1, 12)
    # and from tensorflow.python.ops otherwise.
    if get_tf_version_tuple() <= (1, 12):
        from tensorflow.contrib import nccl
    else:
        from tensorflow.python.ops import nccl_ops as nccl

    new_all_grads = []  # N x K
    for grads in zip(*all_grads):
        summed = nccl.all_sum(grads)

        grads_for_devices = []  # K
        for g in summed:
            # Place the optional scaling op next to the tensor it scales.
            with tf.device(g.device):
                # tensorflow/benchmarks didn't average gradients
                if average:
                    g = tf.multiply(g, 1.0 / nr_tower)
            grads_for_devices.append(g)
        new_all_grads.append(grads_for_devices)

    # transpose to K x N
    return list(zip(*new_all_grads))
示例5: _reduced_opt
# 需要導入模塊: from tensorflow.contrib import nccl [as 別名]
# 或者: from tensorflow.contrib.nccl import all_sum [as 別名]
def _reduced_opt(self, tower_grads_vars):
    """Sum gradients across towers with NCCL and build one train op per GPU.

    Reads ``self.optimizer``, ``self.lr`` and ``self.num_gpus``.

    Args:
        tower_grads_vars: per-tower lists of (gradient, variable) pairs.
            Presumably every tower lists the same variables in the same
            order, since entries are matched by position -- verify against
            the caller.

    Returns:
        A ``(tower_train_ops, tower_reduced_grads_vars)`` tuple:
        ``tower_train_ops`` holds one ``apply_gradients`` op per GPU, and
        ``tower_reduced_grads_vars`` holds, per variable, the list of
        (summed_gradient, variable) pairs across towers.

    Raises:
        ValueError: if ``self.optimizer`` is neither 'Momentum' nor 'Adam'.
    """
    # Reject an unknown optimizer before any op is created; previously this
    # surfaced later as an unbound ``opt`` variable.
    if self.optimizer not in ('Momentum', 'Adam'):
        raise ValueError(
            'unsupported optimizer: {}'.format(self.optimizer))

    tower_reduced_grads_vars = []
    for grads_vars in zip(*tower_grads_vars):
        grads = [g for g, _ in grads_vars]
        reduced_grads = nccl.all_sum(grads)
        # Re-attach each tower's variable to its summed gradient.
        reduced_grads_vars = [
            (g, v) for (_, v), g in zip(grads_vars, reduced_grads)]
        tower_reduced_grads_vars.append(reduced_grads_vars)

    # Optimizer
    tower_train_ops = []
    # Transpose from per-variable to per-tower lists.
    grad_state = [list(x) for x in zip(*tower_reduced_grads_vars)]
    # ``range`` rather than ``xrange`` so the loop also runs on Python 3.
    for device_id in range(self.num_gpus):
        with tf.device('/gpu:%d' % device_id):
            # Gradients of TOWER_(device_id)
            grads = grad_state[device_id]
            # Optimizer configure
            if self.optimizer == 'Momentum':
                opt = tf.train.MomentumOptimizer(self.lr, momentum=0.9)
            else:  # 'Adam' -- guaranteed by the validation above
                opt = tf.train.AdamOptimizer(
                    self.lr, beta1=0.5, beta2=0.999)
            # Tower train_ops
            tower_train_ops.append(opt.apply_gradients(grads))
            print('Optimizer %d has been configured.' % device_id)
    return tower_train_ops, tower_reduced_grads_vars
示例6: all_avg_gradients
# 需要導入模塊: from tensorflow.contrib import nccl [as 別名]
# 或者: from tensorflow.contrib.nccl import all_sum [as 別名]
def all_avg_gradients(
        tower_gradvars, devices, param_server_device='/gpu:0', usenccl=True):
    """Average each layer's gradient across devices.

    Args:
        tower_gradvars: per-device lists of (gradient, variable) pairs.
        devices: device names, one per tower.
        param_server_device: device on which the mean is computed when the
            NCCL path is not taken.
        usenccl: use ``nccl.all_sum`` when ``HAVE_NCCL`` is also truthy.

    Returns:
        ``tower_gradvars`` itself for a single device; otherwise a list with
        one tuple of (averaged_gradient, variable) pairs per device.
    """
    device_count = len(devices)
    if device_count == 1:
        return tower_gradvars

    per_layer = []
    for layer_gradvars in zip(*tower_gradvars):
        layer_grads, layer_vars = zip(*layer_gradvars)
        if HAVE_NCCL and usenccl:
            # Note: These nccl ops _must_ be run on all devices, else deadlock
            averaged = nccl.all_sum(layer_grads)
            for index, dev in enumerate(devices):
                # Turn the sum into a mean on the device holding that copy.
                with tf.device(dev):
                    averaged[index] *= 1. / device_count
        else:
            with tf.device(param_server_device):
                mean_grad = tf.reduce_mean(tf.stack(layer_grads), 0)
            # Every device shares the one tensor computed above.
            averaged = [mean_grad] * device_count
        per_layer.append(zip(averaged, layer_vars))

    # Transpose from per-layer back to per-device.
    return list(zip(*per_layer))
示例7: allreduce_grads
# 需要導入模塊: from tensorflow.contrib import nccl [as 別名]
# 或者: from tensorflow.contrib.nccl import all_sum [as 別名]
def allreduce_grads(all_grads, average=True):
    """
    REFERENCE : https://github.com/ppwwyyxx/tensorpack/blob/83e4e187af5765792408e7b7163efd4744d63628/tensorpack/graph_builder/utils.py
    All-reduce average the gradients among K devices. Results are broadcasted to all devices.

    Args:
        all_grads (K x N): List of list of gradients. N is the number of variables.
        average (bool): average gradients or not.

    Returns:
        K x N: same as input, but each grad is replaced by the sum (or, when
        ``average`` is true, the average) over K devices. With a single
        device the input object is returned untouched.
    """
    nr_tower = len(all_grads)
    if nr_tower == 1:
        return all_grads

    # Imported only once a reduction is really needed, so the single-tower
    # path keeps working on TensorFlow builds that ship without contrib NCCL.
    from tensorflow.contrib import nccl

    new_all_grads = []  # N x K
    for grads in zip(*all_grads):
        summed = nccl.all_sum(grads)

        grads_for_devices = []  # K
        for g in summed:
            # Place the optional scaling op next to the tensor it scales.
            with tf.device(g.device):
                # tensorflow/benchmarks didn't average gradients
                if average:
                    g = tf.multiply(g, 1.0 / nr_tower, name='allreduce_avg')
            grads_for_devices.append(g)
        new_all_grads.append(grads_for_devices)

    # transpose to K x N
    return list(zip(*new_all_grads))
示例8: all_avg_gradients
# 需要導入模塊: from tensorflow.contrib import nccl [as 別名]
# 或者: from tensorflow.contrib.nccl import all_sum [as 別名]
def all_avg_gradients(tower_gradvars, devices, param_server_device='/gpu:0'):
    """Average gradients across devices.

    With NCCL enabled (``have_nccl and FLAGS.nccl``), each device's gradients
    are flattened and concatenated into one tensor, summed with a single
    ``nccl.all_sum``, then split and reshaped back and divided by the device
    count. Otherwise each layer's mean is computed on
    ``param_server_device``.

    Args:
        tower_gradvars: per-device lists of (gradient, variable) pairs.
        devices: device names, one per tower.
        param_server_device: device used by the non-NCCL path.

    Returns:
        ``tower_gradvars`` itself for a single device. On the NCCL path, a
        list of per-device lists of (averaged_gradient, variable) pairs; on
        the fallback path, a list of per-device tuples of such pairs.
    """
    if len(devices) == 1:
        return tower_gradvars

    if not (have_nccl and FLAGS.nccl):
        tower_count = len(tower_gradvars)
        per_layer = []
        for layer_gradvars in zip(*tower_gradvars):
            layer_grads, layer_vars = zip(*layer_gradvars)
            with tf.device(param_server_device):
                mean_grad = tf.reduce_mean(tf.stack(layer_grads), 0)
            # Every device shares the one tensor computed above.
            shared = [mean_grad] * tower_count
            per_layer.append(zip(shared, layer_vars))
        # Transpose from per-layer back to per-device.
        return list(zip(*per_layer))

    averaged_towers = []
    packed = []
    # Pack each device's gradients into one contiguous 1-D tensor.
    for dev, gradvars in zip(devices, tower_gradvars):
        with tf.device(dev):
            flattened = [tf.reshape(g, [-1]) for (g, _) in gradvars]
            packed.append(tf.concat(flattened, 0))

    reduced = nccl.all_sum(packed)

    # Unpack the summed tensor into the original per-variable shapes.
    for dev, total, gradvars in zip(devices, reduced, tower_gradvars):
        with tf.device(dev):
            rebuilt = []
            lengths = [tf.size(g) for (g, _) in gradvars]
            pieces = tf.split(total, lengths)
            for piece, (old_grad, var) in zip(pieces, gradvars):
                piece = tf.reshape(piece, tf.shape(old_grad))
                piece *= 1. / len(devices)
                rebuilt.append((piece, var))
            averaged_towers.append(rebuilt)
    return averaged_towers
示例9: sum_grad_and_var_all_reduce
# 需要導入模塊: from tensorflow.contrib import nccl [as 別名]
# 或者: from tensorflow.contrib.nccl import all_sum [as 別名]
def sum_grad_and_var_all_reduce(grad_and_vars,
                                num_workers,
                                alg,
                                gpu_indices,
                                aux_devices=None,
                                num_shards=1):
    """Apply all-reduce algorithm over specified gradient tensors.

    The reduction is built inside the 'allreduce' name scope.

    Args:
      grad_and_vars: sequence of (gradient, variable) pairs, e.g.
        ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)).
      num_workers: forwarded to all_reduce.build_ring_all_reduce ('xring').
      alg: one of 'nccl', 'xring', 'nccl/xring', 'nccl/rechd', 'nccl/pscpu',
        'pscpu/pscpu', 'pscpu' or 'psgpu'.
      gpu_indices: forwarded to all_reduce.build_ring_all_reduce ('xring').
      aux_devices: forwarded to the shuffle-based builders; it is indexed
        (aux_devices[0]) for 'pscpu/pscpu', so it must not be None there.
      num_shards: forwarded to the ring-based builders.

    Returns:
      A list of [summed_gradient, variable] pairs, in the order of
      grad_and_vars.

    Raises:
      ValueError: if alg is not one of the supported names.
    """
    supported_algs = ('nccl', 'xring', 'nccl/xring', 'nccl/rechd',
                      'nccl/pscpu', 'pscpu/pscpu', 'pscpu', 'psgpu')
    # Fail fast, before any name scope is opened, and format the name into
    # the message: passing it as a second positional argument makes
    # str(exc) render as a tuple instead of a sentence.
    if alg not in supported_algs:
        raise ValueError('unsupported all_reduce alg: {}'.format(alg))

    with tf.name_scope('allreduce'):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        scaled_grads = [g for g, _ in grad_and_vars]
        if alg == 'nccl':
            summed_grads = nccl.all_sum(scaled_grads)
        elif alg == 'xring':
            summed_grads = all_reduce.build_ring_all_reduce(
                scaled_grads, num_workers, num_shards, gpu_indices, tf.add)
        elif alg == 'nccl/xring':
            summed_grads = all_reduce.build_nccl_then_ring(
                scaled_grads, num_shards, tf.add)
        elif alg == 'nccl/rechd':
            summed_grads = all_reduce.build_nccl_then_recursive_hd(
                scaled_grads, tf.add)
        elif alg == 'nccl/pscpu':
            summed_grads = all_reduce.build_nccl_then_shuffle(
                scaled_grads, aux_devices, tf.add, tf.add_n)
        elif alg == 'pscpu/pscpu':
            summed_grads = all_reduce.build_shuffle_then_shuffle(
                scaled_grads,
                aux_devices,
                # TODO(tucker): devise a way of better specifying the device
                # set for the second level.
                [aux_devices[0]],
                tf.add_n)
        else:
            # Only 'pscpu' and 'psgpu' remain after the validation above.
            summed_grads = all_reduce.build_shuffle_all_reduce(
                scaled_grads, aux_devices, tf.add_n)

        # Re-pair every summed gradient with the variable it belongs to.
        return [[g, v] for (_, v), g in zip(grad_and_vars, summed_grads)]