本文整理汇总了 Python 中 torch.distributed._backend 属性的典型用法代码示例。如果您正苦于以下问题:Python distributed._backend 的具体用法?Python distributed._backend 怎么用?Python distributed._backend 使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该属性所在模块 torch.distributed 的用法示例。
在下文中一共展示了 distributed._backend 属性的 11 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: __init__
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import _backend [as 别名]
def __init__(self, module):
    """Wrap ``module`` and register hooks that average gradients across workers.

    Every tensor found in the module's ``state_dict`` is broadcast from
    rank 0. Each trainable parameter then receives a gradient hook that
    queues ``allreduce_params`` via ``param._execution_engine.queue_callback``.

    :param module: module whose parameters and gradients are synchronized
    """
    super(DistributedDataParallel, self).__init__()
    # The backend-dependent expression is commented out in this variant, so
    # the half-precision warning is always armed.
    self.warn_on_half = True
    self.module = module

    # Overwrite every worker's tensors with rank 0's values.
    for tensor in self.module.state_dict().values():
        if torch.is_tensor(tensor):
            dist.broadcast(tensor, 0)

    def allreduce_params():
        # NOTE(review): ``needs_reduction`` is not set in this method;
        # presumably the class's forward pass sets it -- confirm.
        if not self.needs_reduction:
            return
        self.needs_reduction = False

        # Group parameters that currently hold a gradient by the Python
        # type of their data tensor.
        buckets = {}
        for p in self.module.parameters():
            if p.requires_grad and p.grad is not None:
                buckets.setdefault(type(p.data), []).append(p)

        # Warn at most once.
        if self.warn_on_half and torch.cuda.HalfTensor in buckets:
            print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                  " It is recommended to use the NCCL backend in this case.")
            self.warn_on_half = False

        # One flattened all-reduce per bucket, then divide by the world
        # size and copy the results back into the gradient buffers.
        for bucket in buckets.values():
            grads = [p.grad.data for p in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            coalesced /= dist.get_world_size()
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)

    for param in list(self.module.parameters()):
        if not param.requires_grad:
            continue

        def allreduce_hook(*unused):
            param._execution_engine.queue_callback(allreduce_params)

        param.register_hook(allreduce_hook)
示例2: reduce
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import _backend [as 别名]
def reduce(self, op):
    """
    Reduces average value over all workers.

    Does nothing when the world size is 1. Otherwise ``self.avg`` and
    ``self.sum`` are summed over all workers with ``all_reduce``; for
    ``'mean'`` both are then divided by the world size.

    :param op: 'sum' or 'mean', reduction operator
    :raises NotImplementedError: if ``op`` is neither 'sum' nor 'mean'
    """
    if op not in ('sum', 'mean'):
        raise NotImplementedError

    world_size = get_world_size()
    if world_size <= 1:
        return

    # The backend enum and the reduce-op enum were renamed across PyTorch
    # releases. Builds that provide ``get_backend`` may no longer have
    # ``dist.dist_backend``, so look up whichever spelling exists instead
    # of assuming the oldest one.
    if hasattr(dist, "get_backend"):
        backend = dist.get_backend()
        if hasattr(dist, "DistBackend"):
            backend_enum = dist.DistBackend
        elif hasattr(dist, "Backend"):
            backend_enum = dist.Backend
        else:
            backend_enum = dist.dist_backend
    else:
        backend = dist._backend
        backend_enum = dist.dist_backend

    # With NCCL the buffers are created as CUDA tensors.
    if backend == backend_enum.NCCL:
        avg = torch.cuda.FloatTensor([self.avg])
        _sum = torch.cuda.FloatTensor([self.sum])
    else:
        avg = torch.FloatTensor([self.avg])
        _sum = torch.FloatTensor([self.sum])

    reduce_op = dist.ReduceOp if hasattr(dist, "ReduceOp") else dist.reduce_op
    dist.all_reduce(avg, op=reduce_op.SUM)
    dist.all_reduce(_sum, op=reduce_op.SUM)
    self.avg = avg.item()
    self.sum = _sum.item()

    if op == 'mean':
        self.avg /= world_size
        self.sum /= world_size
示例3: __init__
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import _backend [as 别名]
def __init__(self, module):
    """Wrap ``module`` and hook its parameters for gradient all-reduce.

    The tensors of the module's ``state_dict`` are passed to
    ``flat_dist_call`` together with ``dist.broadcast`` and ``(0,)``.
    Every trainable parameter gets a hook that queues ``allreduce_params``
    on ``torch.autograd.Variable._execution_engine``.

    :param module: module whose parameters and gradients are synchronized
    :raises AssertionError: if the backend is NCCL and a state tensor is
        not a CUDA tensor
    """
    super(DistributedDataParallel, self).__init__()
    self.warn_on_half = bool(dist._backend == dist.dist_backend.GLOO)
    self.module = module

    state_tensors = [
        value for value in self.module.state_dict().values()
        if torch.is_tensor(value)
    ]
    if dist._backend == dist.dist_backend.NCCL:
        for tensor in state_tensors:
            assert tensor.is_cuda, "NCCL backend only supports model parameters to be on GPU."

    # broadcast parameters
    flat_dist_call(state_tensors, dist.broadcast, (0,))

    # all reduce gradient hook
    def allreduce_params():
        # NOTE(review): ``needs_reduction`` is not set in this method;
        # presumably it is set elsewhere in the class -- confirm.
        if not self.needs_reduction:
            return
        self.needs_reduction = False
        grads = [
            p.grad.data for p in self.module.parameters()
            if p.grad is not None
        ]
        flat_dist_call(grads, dist.all_reduce)

    # The hook does not depend on the parameter it is attached to, so a
    # single function object is registered on all of them.
    def allreduce_hook(*unused):
        torch.autograd.Variable._execution_engine.queue_callback(allreduce_params)

    for param in list(self.module.parameters()):
        if param.requires_grad:
            param.register_hook(allreduce_hook)
示例4: __getstate__
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import _backend [as 别名]
def __getstate__(self):
    """Return a shallow copy of the instance ``__dict__`` as pickled state.

    When the backend is anything other than NCCL, the ``bucket_streams``
    and ``bucket_events`` entries are left out of the returned mapping.
    The instance's own ``__dict__`` is not modified.

    :return: dict of attributes to serialize
    """
    attrs = copy.copy(self.__dict__)
    if self._backend != self.backend_enum_holder.NCCL:
        # ``__dict__`` is keyed by bare attribute names, so the keys are
        # 'bucket_streams' / 'bucket_events' without a 'self.' prefix.
        # ``pop`` with a default also tolerates instances on which these
        # attributes were never set.
        attrs.pop('bucket_streams', None)
        attrs.pop('bucket_events', None)
    return attrs
示例5: __init__
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import _backend [as 别名]
def __init__(self, module):
    """Wrap ``module`` and register hooks that average gradients across workers.

    Every tensor in the module's ``state_dict`` is broadcast from rank 0.
    Each trainable parameter gets a gradient hook that queues
    ``allreduce_params`` via ``param._execution_engine.queue_callback``.

    :param module: module whose parameters and gradients are synchronized
    """
    super(DistributedDataParallel, self).__init__()

    # fallback for PyTorch 0.3
    if not hasattr(dist, '_backend'):
        self.warn_on_half = True
    else:
        self.warn_on_half = dist._backend == dist.dist_backend.GLOO

    self.module = module

    for p in self.module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        # NOTE(review): ``needs_reduction`` is not set in this method;
        # presumably the class's forward pass sets it -- confirm.
        if not self.needs_reduction:
            return
        self.needs_reduction = False

        # Group parameters that currently hold a gradient by the Python
        # type of their data tensor.
        buckets = {}
        for param in self.module.parameters():
            if param.requires_grad and param.grad is not None:
                buckets.setdefault(type(param.data), []).append(param)

        # Warn at most once. The last fragment now starts with a space so
        # the text no longer reads "requiresPyTorch".
        if self.warn_on_half and torch.cuda.HalfTensor in buckets:
            print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                  " It is recommended to use the NCCL backend in this case. This currently requires" +
                  " PyTorch built from top of tree master.")
            self.warn_on_half = False

        # One flattened all-reduce per bucket, then divide by the world
        # size and copy the results back into the gradient buffers.
        for bucket in buckets.values():
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            coalesced /= dist.get_world_size()
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)

    for param in list(self.module.parameters()):
        def allreduce_hook(*unused):
            param._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)
示例6: apply_gradient_allreduce
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import _backend [as 别名]
def apply_gradient_allreduce(module):
    """Attach cross-worker gradient averaging to ``module`` in place.

    Broadcasts every tensor of the module's ``state_dict`` from rank 0,
    registers a gradient hook on each trainable parameter that queues
    ``allreduce_params`` on ``Variable._execution_engine``, and registers a
    forward hook that sets ``needs_reduction`` on the module.

    :param module: module to instrument; attributes ``warn_on_half`` and
        ``needs_reduction`` are stored on it
    :return: the same ``module`` object
    """
    if not hasattr(dist, '_backend'):
        module.warn_on_half = True
    else:
        module.warn_on_half = dist._backend == dist.dist_backend.GLOO

    for p in module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if not module.needs_reduction:
            return
        module.needs_reduction = False

        # Group parameters that currently hold a gradient by dtype.
        buckets = {}
        for param in module.parameters():
            if param.requires_grad and param.grad is not None:
                buckets.setdefault(param.data.dtype, []).append(param)

        # Bucket keys are torch.dtype objects, so the half check must look
        # for torch.float16; testing for the torch.cuda.HalfTensor class
        # could never match. Warn at most once.
        if module.warn_on_half and torch.float16 in buckets:
            print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                  " It is recommended to use the NCCL backend in this case. This currently requires" +
                  " PyTorch built from top of tree master.")
            module.warn_on_half = False

        # One flattened all-reduce per bucket, then divide by the world
        # size and copy the results back into the gradient buffers.
        for bucket in buckets.values():
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            coalesced /= dist.get_world_size()
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)

    # The hook does not depend on the parameter it is attached to, so a
    # single function object is registered on all of them.
    def allreduce_hook(*unused):
        Variable._execution_engine.queue_callback(allreduce_params)

    for param in list(module.parameters()):
        if param.requires_grad:
            param.register_hook(allreduce_hook)

    def set_needs_reduction(self, input, output):
        # Forward hook: flag that the next backward pass must reduce.
        self.needs_reduction = True

    module.register_forward_hook(set_needs_reduction)
    return module
示例7: __init__
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import _backend [as 别名]
def __init__(self, module):
    """Wrap ``module`` and register hooks that average gradients across workers.

    Every tensor in the module's ``state_dict`` is broadcast from rank 0.
    Each trainable parameter gets a gradient hook that queues
    ``allreduce_params`` via ``param._execution_engine.queue_callback``.

    :param module: module whose parameters and gradients are synchronized
    :raises AssertionError: if the backend is NCCL and a state tensor is
        not a CUDA tensor
    """
    super(DistributedDataParallel, self).__init__()
    self.warn_on_half = dist._backend == dist.dist_backend.GLOO
    self.module = module

    for p in self.module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        if dist._backend == dist.dist_backend.NCCL:
            assert p.is_cuda, "NCCL backend only supports model parameters to be on GPU."
        dist.broadcast(p, 0)

    def allreduce_params():
        # NOTE(review): ``needs_reduction`` is not set in this method;
        # presumably the class's forward pass sets it -- confirm.
        if not self.needs_reduction:
            return
        self.needs_reduction = False

        # Group parameters that currently hold a gradient by the type
        # string returned by ``Tensor.type()``.
        buckets = {}
        for param in self.module.parameters():
            if param.requires_grad and param.grad is not None:
                buckets.setdefault(param.data.type(), []).append(param)

        # Bucket keys are strings, so the half check compares against the
        # type name; testing for the torch.cuda.HalfTensor class itself
        # could never match. Warn at most once.
        if self.warn_on_half and 'torch.cuda.HalfTensor' in buckets:
            print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                  " It is recommended to use the NCCL backend in this case.")
            self.warn_on_half = False

        # One flattened all-reduce per bucket, then divide by the world
        # size and copy the results back into the gradient buffers.
        for bucket in buckets.values():
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            coalesced /= dist.get_world_size()
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)

    for param in list(self.module.parameters()):
        def allreduce_hook(*unused):
            param._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)
示例8: __init__
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import _backend [as 别名]
def __init__(self, module):
    """Wrap ``module`` and register hooks that average gradients across workers.

    A snapshot of the module's ``state_dict`` values is walked and every
    tensor in it is broadcast from rank 0. Each trainable parameter gets a
    gradient hook that queues ``allreduce_params`` via
    ``param._execution_engine.queue_callback``.

    :param module: module whose parameters and gradients are synchronized
    """
    super(DistributedDataParallel, self).__init__()

    # fallback for PyTorch 0.3
    if not hasattr(dist, '_backend'):
        self.warn_on_half = True
    else:
        self.warn_on_half = dist._backend == dist.dist_backend.GLOO

    self.module = module

    for p in list(self.module.state_dict().values()):
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        # NOTE(review): ``needs_reduction`` is not set in this method;
        # presumably the class's forward pass sets it -- confirm.
        if not self.needs_reduction:
            return
        self.needs_reduction = False

        # Group parameters that currently hold a gradient by the Python
        # type of their data tensor.
        buckets = {}
        for param in self.module.parameters():
            if param.requires_grad and param.grad is not None:
                buckets.setdefault(type(param.data), []).append(param)

        # Warn at most once. The last fragment now starts with a space so
        # the text no longer reads "requiresPyTorch".
        if self.warn_on_half and torch.cuda.HalfTensor in buckets:
            print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                  " It is recommended to use the NCCL backend in this case. This currently requires" +
                  " PyTorch built from top of tree master.")
            self.warn_on_half = False

        # One flattened all-reduce per bucket, then divide by the world
        # size and copy the results back into the gradient buffers.
        for bucket in buckets.values():
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            coalesced /= dist.get_world_size()
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)

    for param in list(self.module.parameters()):
        def allreduce_hook(*unused):
            param._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)
示例9: apply_gradient_allreduce
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import _backend [as 别名]
def apply_gradient_allreduce(module):
    """Attach cross-worker gradient averaging to ``module`` in place.

    Broadcasts every tensor of the module's ``state_dict`` from rank 0,
    registers a gradient hook on each trainable parameter that queues
    ``allreduce_params`` on ``Variable._execution_engine``, and registers a
    forward hook that sets ``needs_reduction`` on the module.

    :param module: module to instrument; attributes ``warn_on_half`` and
        ``needs_reduction`` are stored on it
    :return: the same ``module`` object
    """
    if not hasattr(dist, '_backend'):
        module.warn_on_half = True
    else:
        module.warn_on_half = dist._backend == dist.dist_backend.GLOO

    for p in list(module.state_dict().values()):
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if not module.needs_reduction:
            return
        module.needs_reduction = False

        # Group parameters that currently hold a gradient by the Python
        # type of their data tensor.
        buckets = {}
        for param in module.parameters():
            if param.requires_grad and param.grad is not None:
                buckets.setdefault(type(param.data), []).append(param)

        # Warn at most once. The last fragment now starts with a space so
        # the text no longer reads "requiresPyTorch".
        if module.warn_on_half and torch.cuda.HalfTensor in buckets:
            print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                  " It is recommended to use the NCCL backend in this case. This currently requires" +
                  " PyTorch built from top of tree master.")
            module.warn_on_half = False

        # One flattened all-reduce per bucket, then divide by the world
        # size and copy the results back into the gradient buffers.
        for bucket in buckets.values():
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            coalesced /= dist.get_world_size()
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)

    # The hook does not depend on the parameter it is attached to, so a
    # single function object is registered on all of them.
    def allreduce_hook(*unused):
        Variable._execution_engine.queue_callback(allreduce_params)

    for param in list(module.parameters()):
        if param.requires_grad:
            param.register_hook(allreduce_hook)

    def set_needs_reduction(self, input, output):
        # Forward hook: flag that the next backward pass must reduce.
        self.needs_reduction = True

    module.register_forward_hook(set_needs_reduction)
    return module
示例10: apply_gradient_allreduce
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import _backend [as 别名]
def apply_gradient_allreduce(module):
    """
    Modifies existing model to do gradient allreduce, but doesn't change class
    so you don't need "module"

    Broadcasts every tensor of the module's ``state_dict`` from rank 0,
    registers a gradient hook on each trainable parameter that queues
    ``allreduce_params`` on ``Variable._execution_engine``, and registers a
    forward hook that sets ``needs_reduction`` on the module.

    :param module: module to instrument; attributes ``warn_on_half`` and
        ``needs_reduction`` are stored on it
    :return: the same ``module`` object
    """
    if not hasattr(dist, '_backend'):
        module.warn_on_half = True
    else:
        module.warn_on_half = dist._backend == dist.dist_backend.GLOO

    for p in module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if not module.needs_reduction:
            return
        module.needs_reduction = False

        # Group parameters that currently hold a gradient by the Python
        # type of their data tensor.
        buckets = {}
        for param in module.parameters():
            if param.requires_grad and param.grad is not None:
                buckets.setdefault(type(param.data), []).append(param)

        # Warn at most once. The last fragment now starts with a space so
        # the text no longer reads "requiresPyTorch".
        if module.warn_on_half and torch.cuda.HalfTensor in buckets:
            print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                  " It is recommended to use the NCCL backend in this case. This currently requires" +
                  " PyTorch built from top of tree master.")
            module.warn_on_half = False

        # One flattened all-reduce per bucket, then divide by the world
        # size and copy the results back into the gradient buffers.
        for bucket in buckets.values():
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            coalesced /= dist.get_world_size()
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)

    # The hook does not depend on the parameter it is attached to, so a
    # single function object is registered on all of them.
    def allreduce_hook(*unused):
        Variable._execution_engine.queue_callback(allreduce_params)

    for param in list(module.parameters()):
        if param.requires_grad:
            param.register_hook(allreduce_hook)

    def set_needs_reduction(self, input, output):
        # Forward hook: flag that the next backward pass must reduce.
        self.needs_reduction = True

    module.register_forward_hook(set_needs_reduction)
    return module
示例11: reduce
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import _backend [as 别名]
def reduce(self, op):
    """
    Reduces average value over all workers.

    Nothing happens when the world size is not greater than 1. Otherwise
    ``self.avg`` and ``self.sum`` are summed over all workers with
    ``all_reduce``; for ``'mean'`` both are then divided by the world size.

    :param op: 'sum' or 'mean', reduction operator
    :raises NotImplementedError: if ``op`` is neither 'sum' nor 'mean'
    """
    if op not in ('sum', 'mean'):
        raise NotImplementedError

    if not get_world_size() > 1:
        return

    # Backward/forward compatibility around
    # https://github.com/pytorch/pytorch/commit/540ef9b1fc5506369a48491af8a285a686689b36 and
    # https://github.com/pytorch/pytorch/commit/044d00516ccd6572c0d6ab6d54587155b02a3b86
    # To accommodate change in Pytorch's distributed API
    if not hasattr(dist, "get_backend"):
        active_backend = dist._backend
        enum_holder = dist.dist_backend
    else:
        active_backend = dist.get_backend()
        enum_holder = dist.DistBackend if hasattr(dist, "DistBackend") else dist.Backend

    # With NCCL the buffers are created as CUDA tensors.
    on_cuda = active_backend == enum_holder.NCCL
    tensor_cls = torch.cuda.FloatTensor if on_cuda else torch.FloatTensor
    avg = tensor_cls([self.avg])
    _sum = tensor_cls([self.sum])

    if hasattr(dist, "reduce_op"):
        sum_op = dist.reduce_op.SUM
    else:
        sum_op = dist.ReduceOp.SUM
    dist.all_reduce(avg, op=sum_op)
    dist.all_reduce(_sum, op=sum_op)
    self.avg = avg.item()
    self.sum = _sum.item()

    if op == 'mean':
        self.avg /= get_world_size()
        self.sum /= get_world_size()