本文整理匯總了Python中torch.cuda.comm.broadcast_coalesced方法的典型用法代碼示例。如果您正苦於以下問題:Python comm.broadcast_coalesced方法的具體用法?Python comm.broadcast_coalesced怎麼用?Python comm.broadcast_coalesced使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類torch.cuda.comm的用法示例。
在下文中一共展示了comm.broadcast_coalesced方法的12個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: forward
# 需要導入模塊: from torch.cuda import comm [as 別名]
# 或者: from torch.cuda.comm import broadcast_coalesced [as 別名]
def forward(ctx, target_gpus, *inputs):
    """Broadcast CUDA tensors to every GPU in ``target_gpus``.

    Args:
        ctx: Autograd context; ``target_gpus``, ``num_inputs`` and
            ``input_device`` are stored on it.
        target_gpus: Devices handed to ``comm.broadcast_coalesced``.
        *inputs: Tensors to broadcast; each must report ``is_cuda``.

    Returns:
        A flat tuple holding the broadcast copies, device by device, or an
        empty tuple when no inputs are given.

    Raises:
        TypeError: If any input is not a CUDA tensor.
    """
    if any(not tensor.is_cuda for tensor in inputs):
        raise TypeError('Broadcast function not implemented for CPU tensors')

    ctx.target_gpus = target_gpus
    if not inputs:
        return ()

    ctx.num_inputs = len(inputs)
    ctx.input_device = inputs[0].get_device()
    outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus)

    # needs_input_grad[0] is skipped so that the remaining flags line up
    # with ``inputs`` (the first forward argument is ``target_gpus``).
    frozen = [
        replica[idx]
        for idx, wants_grad in enumerate(ctx.needs_input_grad[1:])
        if not wants_grad
        for replica in outputs
    ]
    ctx.mark_non_differentiable(*frozen)

    return tuple(t for replica in outputs for t in replica)
示例2: _sync_params
# 需要導入模塊: from torch.cuda import comm [as 別名]
# 或者: from torch.cuda.comm import broadcast_coalesced [as 別名]
def _sync_params(self):
    """Copy the wrapped module's parameters and buffers to its replicas.

    Parameters are broadcast over ``self.device_ids`` and written into
    every entry of ``self._module_copies`` except the first.  Buffers are
    first broadcast through ``dist`` from rank 0, then replicated to the
    local module copies the same way as the parameters.
    """
    source_params = [param.data for param in self.module.parameters()]
    param_copies = broadcast_coalesced(
        source_params, self.device_ids, self.broadcast_bucket_size)
    # Index 0 is skipped on both sides: only the remaining copies are updated.
    for copied, replica in zip(param_copies[1:], self._module_copies[1:]):
        for src, dst in zip(copied, replica.parameters()):
            dst.data.set_(src)

    # cross-node buffer sync
    local_buffers = list(self.module._all_buffers())
    flattened = _flatten_tensors(local_buffers)
    dist.broadcast(flattened, 0)
    restored = _unflatten_tensors(flattened, local_buffers)
    for target, received in zip(local_buffers, restored):
        target.copy_(received)

    # intra-node buffer sync
    buffer_copies = broadcast_coalesced(
        local_buffers, self.device_ids, self.broadcast_bucket_size)
    for copied, replica in zip(buffer_copies[1:], self._module_copies[1:]):
        for src, dst in zip(copied, replica._all_buffers()):
            dst.set_(src)
示例3: step
# 需要導入模塊: from torch.cuda import comm [as 別名]
# 或者: from torch.cuda.comm import broadcast_coalesced [as 別名]
def step(self, l2norm):
    """Run the optimizer step and push the updated weights to all replicas.

    ``self.param_group`` is read as ``(name, parameter)`` pairs and
    ``self.named_params`` as one name-to-parameter mapping per replica,
    whose position in the sequence is used as its device id.

    Args:
        l2norm: Accepted for interface compatibility; not used here.
    """
    self.optimizer.step()

    def _own_device_first(entry):
        # The replica whose index equals self.device sorts ahead of the rest.
        index = entry[0]
        return index if index != self.device else -1

    replicas = sorted(
        (
            (index, [(name, mapping[name]) for name, _ in self.param_group])
            for index, mapping in enumerate(self.named_params)
        ),
        key=_own_device_first,
    )

    # Refresh the leading replica from the optimizer-side parameters:
    # half-precision tensors are copied into, others share the storage.
    master = dict(self.param_group)
    for name, param in replicas[0][1]:
        if param.data.dtype == torch.half:
            param.data.copy_(master[name].data)
        else:
            param.data = master[name].data

    param_lists = [[param for _, param in members] for _, members in replicas]
    devices = [index for index, _ in replicas]

    outputs = broadcast_coalesced(param_lists[0], devices)
    for copies, targets in zip(outputs, param_lists):
        for src, dst in zip(copies, targets):
            dst.data.copy_(src.data)
示例4: backward
# 需要導入模塊: from torch.cuda import comm [as 別名]
# 或者: from torch.cuda.comm import broadcast_coalesced [as 別名]
def backward(ctx, dz):
    """Backward pass that averages the ``edz``/``eydz`` terms across devices.

    In training mode every device computes ``edz`` and ``eydz`` through
    ``_backend.edz_eydz``.  The master collects one pair per worker queue,
    sums them, divides by ``ctx.master_queue.maxsize + 1`` and broadcasts
    the result back; workers send their pair and wait for the reply.  In
    eval mode both terms are zeros of length ``dz.size(1)``.

    Args:
        ctx: Autograd context holding the saved tensors, flags and queues.
        dz: Gradient with respect to the forward output.

    Returns:
        ``(dx, dweight, dbias)`` followed by eight ``None`` placeholders;
        ``dweight`` and ``dbias`` are ``None`` when ``ctx.affine`` is false.
    """
    z, var, weight, bias = ctx.saved_tensors
    dz = dz.contiguous()

    # Undo activation
    _act_backward(ctx, z, dz)

    if not ctx.training:
        edz = dz.new_zeros(dz.size(1))
        eydz = dz.new_zeros(dz.size(1))
    else:
        edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)

        if not ctx.is_master:
            # Worker: hand the local terms to the master, wait for the average.
            ctx.master_queue.put((edz, eydz))
            edz, eydz = ctx.worker_queue.get()
            ctx.worker_queue.task_done()
        else:
            all_edz, all_eydz = [edz], [eydz]
            for _ in ctx.worker_queues:
                worker_edz, worker_eydz = ctx.master_queue.get()
                ctx.master_queue.task_done()
                all_edz.append(worker_edz)
                all_eydz.append(worker_eydz)

            edz = comm.reduce_add(all_edz) / (ctx.master_queue.maxsize + 1)
            eydz = comm.reduce_add(all_eydz) / (ctx.master_queue.maxsize + 1)

            devices = [edz.get_device()] + ctx.worker_ids
            replicas = comm.broadcast_coalesced((edz, eydz), devices)
            # replicas[0] stays on the master; the rest go to the workers.
            for payload, worker_queue in zip(replicas[1:], ctx.worker_queues):
                worker_queue.put(payload)

    dx, dweight, dbias = _backend.backward(
        z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
    if not ctx.affine:
        dweight = None
        dbias = None

    return (dx, dweight, dbias) + (None,) * 8
示例5: forward
# 需要導入模塊: from torch.cuda import comm [as 別名]
# 或者: from torch.cuda.comm import broadcast_coalesced [as 別名]
def forward(ctx, num_inputs, *inputs):
    """Sum groups of tensors across GPUs and broadcast the sums back.

    Args:
        ctx: Autograd context; ``num_inputs`` and ``target_gpus`` are
            stored on it.
        num_inputs: Number of tensors in each per-device group.
        *inputs: Flat sequence of tensors, ``num_inputs`` per device.

    Returns:
        A flat tuple of the reduced tensors, one group per target GPU.
    """
    ctx.num_inputs = num_inputs
    group_starts = range(0, len(inputs), num_inputs)
    # Devices are recorded in the caller's order, before sorting below.
    ctx.target_gpus = [inputs[start].get_device() for start in group_starts]
    grouped = [inputs[start:start + num_inputs] for start in group_starts]
    # sort before reduce sum
    grouped.sort(key=lambda group: group[0].get_device())
    reduced = comm.reduce_add_coalesced(grouped, ctx.target_gpus[0])
    replicas = comm.broadcast_coalesced(reduced, ctx.target_gpus)
    return tuple(t for per_device in replicas for t in per_device)
示例6: backward
# 需要導入模塊: from torch.cuda import comm [as 別名]
# 或者: from torch.cuda.comm import broadcast_coalesced [as 別名]
def backward(ctx, *inputs):
    """Sum the incoming gradients across GPUs and broadcast the sums back.

    Args:
        ctx: Autograd context providing ``num_inputs`` and ``target_gpus``.
        *inputs: Flat sequence of gradients, ``ctx.num_inputs`` per device.

    Returns:
        ``None`` (for the ``num_inputs`` argument of forward) followed by
        the reduced gradients wrapped in ``Variable``.
    """
    raw = [grad.data for grad in inputs]
    group_starts = range(0, len(raw), ctx.num_inputs)
    grouped = [raw[start:start + ctx.num_inputs] for start in group_starts]
    reduced = comm.reduce_add_coalesced(grouped, ctx.target_gpus[0])
    replicas = comm.broadcast_coalesced(reduced, ctx.target_gpus)
    grads = tuple(Variable(t) for per_device in replicas for t in per_device)
    return (None,) + grads
示例7: forward
# 需要導入模塊: from torch.cuda import comm [as 別名]
# 或者: from torch.cuda.comm import broadcast_coalesced [as 別名]
def forward(ctx, num_inputs, *inputs):
    """All-reduce: add up per-device tensor groups, then replicate the sums.

    Args:
        ctx: Autograd context; receives ``num_inputs`` and ``target_gpus``.
        num_inputs: How many tensors belong to each device's group.
        *inputs: All tensors, laid out as consecutive per-device groups.

    Returns:
        The broadcast results flattened into a single tuple.
    """
    ctx.num_inputs = num_inputs
    offsets = range(0, len(inputs), num_inputs)
    # The device of each group's first tensor identifies that group's GPU.
    ctx.target_gpus = [inputs[offset].get_device() for offset in offsets]
    chunks = [inputs[offset:offset + num_inputs] for offset in offsets]

    # sort before reduce sum
    ordered = sorted(chunks, key=lambda chunk: chunk[0].get_device())

    summed = comm.reduce_add_coalesced(ordered, ctx.target_gpus[0])
    outputs = comm.broadcast_coalesced(summed, ctx.target_gpus)

    flat = []
    for tensors in outputs:
        flat.extend(tensors)
    return tuple(flat)
示例8: backward
# 需要導入模塊: from torch.cuda import comm [as 別名]
# 或者: from torch.cuda.comm import broadcast_coalesced [as 別名]
def backward(ctx, *inputs):
    """All-reduce the gradients: add them up per group, then replicate.

    Args:
        ctx: Autograd context carrying ``num_inputs`` and ``target_gpus``.
        *inputs: Gradients laid out as consecutive per-device groups.

    Returns:
        A tuple starting with ``None`` (no gradient for ``num_inputs``),
        followed by the summed gradients as ``Variable`` objects.
    """
    payloads = [grad.data for grad in inputs]
    offsets = range(0, len(payloads), ctx.num_inputs)
    chunks = [payloads[offset:offset + ctx.num_inputs] for offset in offsets]

    summed = comm.reduce_add_coalesced(chunks, ctx.target_gpus[0])
    outputs = comm.broadcast_coalesced(summed, ctx.target_gpus)

    wrapped = []
    for tensors in outputs:
        for t in tensors:
            wrapped.append(Variable(t))
    return (None,) + tuple(wrapped)
示例9: backward
# 需要導入模塊: from torch.cuda import comm [as 別名]
# 或者: from torch.cuda.comm import broadcast_coalesced [as 別名]
def backward(ctx, dz):
    """Synchronized batch-norm backward pass.

    Each device computes two local sums with
    ``_backend.syncbn_backward_xhat``.  The master gathers the sums from
    ``ctx.master_queue``, adds them, divides by ``ctx.N`` and broadcasts
    the result to the workers; workers send their sums and wait for the
    global values.  The final gradients come from
    ``_backend.syncbn_backward``.

    Args:
        ctx: Autograd context with saved tensors, flags and queues.
        dz: Gradient with respect to the forward output.

    Returns:
        ``(dx, dweight, dbias)`` followed by six ``None`` placeholders.
    """
    x, weight, bias, mean, var = ctx.saved_tensors
    dz = dz.contiguous()

    # 1. compute \sum(\frac{dJ}{dy_i}) and \sum(\frac{dJ}{dy_i}*\hat{x_i})
    sum_dz, sum_dz_xhat = _backend.syncbn_backward_xhat(
        dz, x, mean, var, ctx.eps)

    if not ctx.is_master:
        # worker : send to master
        ctx.master_queue.put((sum_dz, sum_dz_xhat))
        # worker : get global stats
        sum_dz, sum_dz_xhat = ctx.worker_queue.get()
        ctx.worker_queue.task_done()
    else:
        gathered_dz, gathered_dz_xhat = [sum_dz], [sum_dz_xhat]
        # master : gather from workers
        for _ in range(ctx.master_queue.maxsize):
            worker_dz, worker_dz_xhat = ctx.master_queue.get()
            ctx.master_queue.task_done()
            gathered_dz.append(worker_dz)
            gathered_dz_xhat.append(worker_dz_xhat)

        # master : compute global stats
        sum_dz = comm.reduce_add(gathered_dz)
        sum_dz_xhat = comm.reduce_add(gathered_dz_xhat)
        sum_dz /= ctx.N
        sum_dz_xhat /= ctx.N

        # master : broadcast global stats
        devices = [mean.get_device()] + ctx.worker_ids
        replicas = comm.broadcast_coalesced((sum_dz, sum_dz_xhat), devices)
        # replicas[0] is the master's own copy; the rest go to the workers.
        for payload, worker_queue in zip(replicas[1:], ctx.worker_queues):
            worker_queue.put(payload)

    # do batch norm backward
    dx, dweight, dbias = _backend.syncbn_backward(
        dz, x, weight, bias, mean, var, sum_dz, sum_dz_xhat,
        ctx.affine, ctx.eps)

    return (dx, dweight, dbias) + (None,) * 6
示例10: backward
# 需要導入模塊: from torch.cuda import comm [as 別名]
# 或者: from torch.cuda.comm import broadcast_coalesced [as 別名]
def backward(ctx, dz):
    """Batch-norm backward pass with optional cross-GPU synchronization.

    The local gradients come from ``lib.gpu.batchnorm_backward``.  In
    training mode with ``ctx.sync`` set, the master gathers ``_dex`` and
    ``_dexs`` from every worker, averages them over the gathered dimension
    and broadcasts the averages back; workers send their values and wait
    for the reply.  The contribution from
    ``lib.gpu.expectation_backward`` is then added to ``dx``.

    Args:
        ctx: Autograd context with saved tensors, flags and queues.
        dz: Gradient with respect to the forward output.

    Returns:
        ``(dx, dgamma, dbeta)`` followed by nine ``None`` placeholders.

    Raises:
        NotImplementedError: If ``dz`` (or, in training mode, ``x``) is
            not a CUDA tensor.
    """
    x, _ex, _exs, gamma, beta = ctx.saved_tensors
    dz = dz.contiguous()

    # BN backward
    if not dz.is_cuda:
        # `NotImplemented` is a constant, not an exception class; raising
        # it fails with an unrelated TypeError instead of the intended error.
        raise NotImplementedError(
            'batchnorm_backward is only implemented for CUDA tensors')
    dx, _dex, _dexs, dgamma, dbeta = lib.gpu.batchnorm_backward(
        dz, x, _ex, _exs, gamma, beta, ctx.eps)

    if ctx.training:
        if ctx.sync:
            if ctx.is_master:
                _dex, _dexs = [_dex.unsqueeze(0)], [_dexs.unsqueeze(0)]
                for _ in range(ctx.master_queue.maxsize):
                    _dex_w, _dexs_w = ctx.master_queue.get()
                    ctx.master_queue.task_done()
                    _dex.append(_dex_w.unsqueeze(0))
                    _dexs.append(_dexs_w.unsqueeze(0))

                # Stack along the new leading dim, then average over it.
                _dex = comm.gather(_dex).mean(0)
                _dexs = comm.gather(_dexs).mean(0)

                tensors = comm.broadcast_coalesced(
                    (_dex, _dexs), [_dex.get_device()] + ctx.worker_ids)
                # tensors[0] stays on the master; the rest go to the workers.
                for ts, queue in zip(tensors[1:], ctx.worker_queues):
                    queue.put(ts)
            else:
                ctx.master_queue.put((_dex, _dexs))
                _dex, _dexs = ctx.worker_queue.get()
                ctx.worker_queue.task_done()

        if not x.is_cuda:
            raise NotImplementedError(
                'expectation_backward is only implemented for CUDA tensors')
        dx_ = lib.gpu.expectation_backward(x, _dex, _dexs)
        dx = dx + dx_

    return dx, dgamma, dbeta, None, None, None, None, None, None, None, None, None
示例11: backward
# 需要導入模塊: from torch.cuda import comm [as 別名]
# 或者: from torch.cuda.comm import broadcast_coalesced [as 別名]
def backward(ctx, dz):
    """Batch-norm backward pass with optional cross-GPU synchronization.

    Local gradients are produced by ``lib.gpu.batchnorm_backward``.  When
    ``ctx.training`` and ``ctx.sync`` are set, ``_dex`` and ``_dexs`` are
    averaged across devices: the master gathers one pair per slot of
    ``ctx.master_queue``, takes the mean and broadcasts it, while workers
    submit their pair and block on ``ctx.worker_queue``.  In training mode
    the result of ``lib.gpu.expectation_backward`` is added to ``dx``.

    Args:
        ctx: Autograd context with saved tensors, flags and queues.
        dz: Gradient with respect to the forward output.

    Returns:
        ``(dx, dgamma, dbeta)`` followed by nine ``None`` placeholders.

    Raises:
        NotImplementedError: If ``dz`` (or, in training mode, ``x``) is
            not a CUDA tensor.
    """
    x, _ex, _exs, gamma, beta = ctx.saved_tensors
    dz = dz.contiguous()

    # BN backward
    if not dz.is_cuda:
        # Raising the `NotImplemented` constant is itself a TypeError;
        # NotImplementedError is the exception meant for this situation.
        raise NotImplementedError(
            'batchnorm_backward is only implemented for CUDA tensors')
    dx, _dex, _dexs, dgamma, dbeta = lib.gpu.batchnorm_backward(
        dz, x, _ex, _exs, gamma, beta, ctx.eps)

    if ctx.training:
        if ctx.sync:
            if ctx.is_master:
                _dex, _dexs = [_dex.unsqueeze(0)], [_dexs.unsqueeze(0)]
                for _ in range(ctx.master_queue.maxsize):
                    _dex_w, _dexs_w = ctx.master_queue.get()
                    ctx.master_queue.task_done()
                    _dex.append(_dex_w.unsqueeze(0))
                    _dexs.append(_dexs_w.unsqueeze(0))

                # Gather onto one device and average over the leading dim.
                _dex = comm.gather(_dex).mean(0)
                _dexs = comm.gather(_dexs).mean(0)

                tensors = comm.broadcast_coalesced(
                    (_dex, _dexs), [_dex.get_device()] + ctx.worker_ids)
                # The first entry is the master's copy; skip it.
                for ts, queue in zip(tensors[1:], ctx.worker_queues):
                    queue.put(ts)
            else:
                ctx.master_queue.put((_dex, _dexs))
                _dex, _dexs = ctx.worker_queue.get()
                ctx.worker_queue.task_done()

        if not x.is_cuda:
            raise NotImplementedError(
                'expectation_backward is only implemented for CUDA tensors')
        dx_ = lib.gpu.expectation_backward(x, _dex, _dexs)
        dx = dx + dx_

    return dx, dgamma, dbeta, None, None, None, None, None, None, None, None, None
示例12: backward
# 需要導入模塊: from torch.cuda import comm [as 別名]
# 或者: from torch.cuda.comm import broadcast_coalesced [as 別名]
def backward(ctx, dz):
    """Batch-norm backward pass with optional cross-GPU synchronization.

    ``_C.batchnorm_backward`` yields the local gradients.  In training
    mode with ``ctx.sync`` set, the ``_dex``/``_dexs`` terms are averaged
    across devices through the master/worker queues, and the result of
    ``_C.expectation_backward`` is added to ``dx``.

    Args:
        ctx: Autograd context with saved tensors, flags and queues.
        dz: Gradient with respect to the forward output.

    Returns:
        ``(dx, dgamma, dbeta)`` followed by nine ``None`` placeholders.
    """
    x, _ex, _exs, gamma, beta = ctx.saved_tensors
    dz = dz.contiguous()

    # BN backward
    dx, _dex, _dexs, dgamma, dbeta = _C.batchnorm_backward(
        dz, x, _ex, _exs, gamma, beta, ctx.eps)

    if ctx.training:
        if ctx.sync:
            if not ctx.is_master:
                # Worker: submit local terms, then wait for the averages.
                ctx.master_queue.put((_dex, _dexs))
                _dex, _dexs = ctx.worker_queue.get()
                ctx.worker_queue.task_done()
            else:
                dex_parts = [_dex.unsqueeze(0)]
                dexs_parts = [_dexs.unsqueeze(0)]
                for _ in range(ctx.master_queue.maxsize):
                    worker_dex, worker_dexs = ctx.master_queue.get()
                    ctx.master_queue.task_done()
                    dex_parts.append(worker_dex.unsqueeze(0))
                    dexs_parts.append(worker_dexs.unsqueeze(0))

                # Gather onto one device and average over the leading dim.
                _dex = comm.gather(dex_parts).mean(0)
                _dexs = comm.gather(dexs_parts).mean(0)

                devices = [_dex.get_device()] + ctx.worker_ids
                replicas = comm.broadcast_coalesced((_dex, _dexs), devices)
                # replicas[0] is the master's own copy; skip it.
                for payload, worker_queue in zip(replicas[1:], ctx.worker_queues):
                    worker_queue.put(payload)

        dx = dx + _C.expectation_backward(x, _dex, _dexs)

    return (dx, dgamma, dbeta) + (None,) * 9