本文整理汇总了Python中torch.distributed.broadcast方法的典型用法代码示例。如果您正苦于以下问题：Python distributed.broadcast方法的具体用法？Python distributed.broadcast怎么用？Python distributed.broadcast使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块torch.distributed的用法示例。
在下文中一共展示了distributed.broadcast方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: sync
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import broadcast [as 别名]
def sync(self, src, grp=None, async_op=False):
    """Broadcast every tensor in ``self.state_dict()`` from rank ``src``.

    All broadcasts are issued with ``async_op=True`` so they are launched
    back to back. When the caller's ``async_op`` is false, this method then
    blocks until each returned handle has completed.

    Args:
        src: Rank whose tensors are sent to the other ranks.
        grp: Process group passed to ``dist.broadcast``. When ``None`` the
            call is made without a group argument.
        async_op: If true, return without waiting; the caller is expected
            to wait on the returned handles.

    Returns:
        A list with one entry per state-dict value, in state-dict order,
        holding whatever ``dist.broadcast`` returned for that tensor.
    """
    handles = []
    for t in self.state_dict().values():
        if grp is None:
            h = dist.broadcast(t, src, async_op=True)
        else:
            h = dist.broadcast(t, src, grp, async_op=True)
        handles.append(h)
    if not async_op:
        for h in handles:
            # torch.distributed.broadcast is documented to return None on
            # ranks that are not part of the group; there is nothing to
            # wait for in that case.
            if h is not None:
                h.wait()
    return handles
示例2: broadcast_obj
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import broadcast [as 别名]
def broadcast_obj(self, obj, src, group=None):
    """Broadcast a picklable object from rank ``src`` to all parties.

    The object is sent in two steps: first its pickled length as an int32
    scalar, then the pickled bytes as an int8 tensor of that length.

    Args:
        obj: Object to send. Must not be ``None`` on the ``src`` rank; the
            value passed on other ranks is ignored and replaced.
        src: Rank that provides the object.
        group: Process group to use; defaults to ``self.main_group``.

    Returns:
        ``obj`` itself on the ``src`` rank, and the deserialized copy on
        every other rank.

    Raises:
        AssertionError: If the ``src`` rank passes ``obj=None``.
    """
    if group is None:
        group = self.main_group
    if self.rank == src:
        assert obj is not None, "src party must provide obj for broadcast"
        buf = pickle.dumps(obj)
        # NOTE(review): the length travels as int32, so payloads of 2 GiB
        # or more cannot be represented -- confirm callers stay below that.
        size = torch.tensor(len(buf), dtype=torch.int32)
        # frombuffer() over immutable bytes yields a read-only array, which
        # torch.from_numpy warns about; copy so the tensor owns writable
        # memory.
        arr = torch.from_numpy(numpy.frombuffer(buf, dtype=numpy.int8).copy())
        dist.broadcast(size, src, group=group)
        dist.broadcast(arr, src, group=group)
    else:
        # Placeholder scalar that the first broadcast overwrites with the
        # real payload length.
        size = torch.tensor(1, dtype=torch.int32)
        dist.broadcast(size, src, group=group)
        # Use a plain Python int for the shape rather than a 0-d tensor.
        data = torch.empty(size=(int(size.item()),), dtype=torch.int8)
        dist.broadcast(data, src, group=group)
        buf = data.numpy().tobytes()
        # Deserialization is delegated to the project's serial module
        # (presumably a restricted unpickler -- verify before trusting
        # peers).
        obj = serial.restricted_loads(buf)
    return obj
示例3: broadcast
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import broadcast [as 别名]
def broadcast(self, input, src, batched=False):
    """Broadcast a tensor, or a list of tensors, from rank ``src``.

    Args:
        input: A single object exposing a tensor as ``.data`` or, when
            ``batched`` is true, a list of tensors.
        src: Rank that provides the data.
        batched: Treat ``input`` as a list and broadcast each element
            asynchronously, waiting for all of them before returning.

    Returns:
        ``input``, whose tensor(s) were passed to ``dist.broadcast``.

    Raises:
        AssertionError: If the process group is not initialized or
            ``input`` has the wrong type for the selected mode.
    """
    assert dist.is_initialized(), "initialize the communicator first"

    if not batched:
        payload = input.data
        assert torch.is_tensor(
            payload
        ), "unbatched input for reduce must be a torch tensor"
        dist.broadcast(payload, src, group=self.main_group)
        return input

    assert isinstance(input, list), "batched reduce input must be a list"
    # Launch every broadcast first, then wait on all of them.
    pending = [
        dist.broadcast(item, src, group=self.main_group, async_op=True)
        for item in input
    ]
    for work in pending:
        work.wait()
    return input
示例4: broadcast_coalesced
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import broadcast [as 别名]
def broadcast_coalesced(tensors, src=0, buffer_size=10 * MB):
    r"""
    Broadcast a sequence of tensors to the default group from rank ``src``.

    The tensors are split into groups by ``_take_tensors`` (presumably
    buckets of at most ``buffer_size`` bytes -- confirm against that
    helper). Each group is flattened into one buffer, broadcast with a
    single call, and unflattened back into the original tensors.

    tensors (sequence): tensors to broadcast. Each tensor needs to be on the
        same GPU.
    src (int): src rank. Default: 0.
    buffer_size (int): maximum size of the buffer for coalescing. Default: 10MB.
    """
    for bucket in _take_tensors(tensors, buffer_size):
        packed = _flatten_dense_tensors(bucket)
        dist.broadcast(packed, src)
        synced_views = _unflatten_dense_tensors(packed, bucket)
        # Rebind each tensor's storage to its slice of the broadcast buffer.
        for target, synced in zip(bucket, synced_views):
            target.data = synced
示例5: receive_tensor_helper
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import broadcast [as 别名]
def receive_tensor_helper(tensor, src_rank, group, tag, num_iterations,
                          broadcast):
    """Time repeated receives of ``tensor`` and print the throughput.

    Args:
        tensor: Destination tensor; its first dimension is used as the
            element count.
        src_rank: Rank the data comes from.
        group: Process group used on the broadcast path.
        tag: Message tag used on the point-to-point path.
        num_iterations: Number of receive operations to time.
        broadcast: Use ``dist.broadcast`` when true, ``dist.recv`` otherwise.
    """
    dist.barrier()
    began = time.time()
    for _ in range(num_iterations):
        if broadcast:
            dist.broadcast(tensor=tensor, group=group, src=src_rank)
        else:
            dist.recv(tensor=tensor.cpu(), src=src_rank, tag=tag)
    finished = time.time()
    dist.barrier()

    elapsed = finished - began
    # The factor 4. presumably assumes 4-byte (float32) elements -- TODO
    # confirm.
    payload_bytes = tensor.size()[0] * 4.
    throughput = (payload_bytes * num_iterations) / (
        elapsed * 10**9)
    print("Time to receive %s MB: %.3f seconds" %
          (payload_bytes / 10**6,
           elapsed / num_iterations))
    print("Throughput: %.3f GB/s" % throughput)
示例6: train_step
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import broadcast [as 别名]
def train_step(self, *args, **kwargs):
    """Run one training step, (re)building the model wrapper if required.

    When ``self.need_reinit`` is set, the raw model is either wrapped in
    ``DistributedDataParallel`` (after broadcasting its parameters from
    rank 0) or used directly when no process group is initialized. A fresh
    SGD optimizer is then created and the flag cleared. All arguments are
    forwarded to ``self._train_step``; its result is not returned.
    """
    # NOTE(review): the source's indentation was lost; the optimizer rebuild
    # and flag reset are assumed to belong to the re-init branch.
    if self.need_reinit:
        if not dist.is_initialized():
            # single worker mode
            # skip all reduce
            print("single worker mode")
            self._ddp_model = self._raw_model
        else:
            # parallel mode
            print("wait for barrier")
            dist.barrier()
            print("start to broadcast")
            for weight in self._raw_model.parameters():
                dist.broadcast(weight.data, 0)
            print("wrap with DDP")
            wrapped = nn.parallel.DistributedDataParallel(
                self._raw_model,
                broadcast_buffers=False,
                check_reduction=True,
            )
            self._ddp_model = wrapped

        self._optimizer = optim.SGD(self._ddp_model.parameters(), lr=1e-3)
        self.need_reinit = False

    self._train_step(*args, **kwargs)
示例7: collect_results_cpu
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import broadcast [as 别名]
def collect_results_cpu(result_part, size, tmpdir=None):
    """Gather per-rank results on rank 0 through files in a directory.

    Every rank dumps ``result_part`` to ``part_<rank>.pkl`` inside
    ``tmpdir``. After a barrier, rank 0 loads all parts, interleaves them
    element by element, truncates to ``size`` and removes the directory.

    Args:
        result_part: This rank's results.
        size: Number of results to keep after interleaving.
        tmpdir: Directory for the part files. When ``None``, rank 0 creates
            one with ``tempfile.mkdtemp`` and broadcasts its path.
            (Assumes the directory is visible to every rank -- TODO confirm.)

    Returns:
        The ordered result list on rank 0, ``None`` on all other ranks.
    """
    rank, world_size = get_dist_info()

    if tmpdir is not None:
        mmcv.mkdir_or_exist(tmpdir)
    else:
        # The path is shared as a fixed-length byte tensor padded with
        # spaces, which are stripped again after the broadcast.
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            created = tempfile.mkdtemp()
            encoded = torch.tensor(
                bytearray(created.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(encoded)] = encoded
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()

    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
    dist.barrier()

    # only rank 0 collects the parts
    if rank != 0:
        return None

    # load results of all parts from tmp dir
    part_list = [
        mmcv.load(osp.join(tmpdir, f'part_{i}.pkl'))
        for i in range(world_size)
    ]
    # interleave: first item of every part, then second item, and so on
    ordered_results = [
        sample for group in zip(*part_list) for sample in group
    ]
    # the dataloader may pad some samples
    ordered_results = ordered_results[:size]
    # remove tmp dir
    shutil.rmtree(tmpdir)
    return ordered_results
示例8: collect_results
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import broadcast [as 别名]
def collect_results(result_part, size, tmpdir=None):
    """Collect the results of all ranks on rank 0 via a shared directory.

    Each rank writes ``result_part`` to ``part_<rank>.pkl`` under
    ``tmpdir``. Once all ranks pass the barrier, rank 0 reads every part
    back, interleaves them, keeps the first ``size`` entries and deletes
    the directory.

    Args:
        result_part: Results produced by the calling rank.
        size: Number of entries to keep in the merged list.
        tmpdir: Directory for the part files; when ``None`` rank 0 creates
            a temporary one and broadcasts its name to the other ranks.

    Returns:
        The merged list on rank 0; ``None`` elsewhere.
    """
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        space = 32  # ASCII code used to pad the path buffer
        dir_tensor = torch.full((MAX_LEN, ),
                                space,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            path_bytes = bytearray(tempfile.mkdtemp().encode())
            path_tensor = torch.tensor(
                path_bytes, dtype=torch.uint8, device='cuda')
            dir_tensor[:len(path_tensor)] = path_tensor
        dist.broadcast(dir_tensor, 0)
        raw = dir_tensor.cpu().numpy().tobytes()
        # strip the padding to recover the path
        tmpdir = raw.decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)

    # dump the part result to the dir
    own_file = osp.join(tmpdir, f'part_{rank}.pkl')
    mmcv.dump(result_part, own_file)
    dist.barrier()

    # collect all parts
    if rank != 0:
        return None

    # load results of all parts from tmp dir
    part_list = []
    for idx in range(world_size):
        part_list.append(mmcv.load(osp.join(tmpdir, f'part_{idx}.pkl')))
    # sort the results
    ordered_results = []
    for group in zip(*part_list):
        ordered_results += group
    # the dataloader may pad some samples
    ordered_results = ordered_results[:size]
    # remove tmp dir
    shutil.rmtree(tmpdir)
    return ordered_results
示例9: collect_results
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import broadcast [as 别名]
def collect_results(result_part, size, tmpdir=None):
    """Merge per-rank result lists on rank 0 using files in ``tmpdir``.

    Args:
        result_part: Results of the calling rank; dumped to
            ``part_<rank>.pkl``.
        size: Length to which the merged list is truncated.
        tmpdir: Directory for the part files. If ``None``, rank 0 creates a
            temporary directory and its path is broadcast to all ranks.

    Returns:
        On rank 0, the parts interleaved element-wise and cut to ``size``;
        on every other rank, ``None``.
    """
    rank, world_size = get_dist_info()

    def part_path(index):
        # Location of the pickle written by rank ``index``.
        return osp.join(tmpdir, 'part_{}.pkl'.format(index))

    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            local_dir = tempfile.mkdtemp()
            local_dir = torch.tensor(
                bytearray(local_dir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(local_dir)] = local_dir
        dist.broadcast(dir_tensor, 0)
        # decode the broadcast bytes and drop the space padding
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)

    # dump the part result to the dir
    mmcv.dump(result_part, part_path(rank))
    dist.barrier()

    # collect all parts
    if rank == 0:
        # load results of all parts from tmp dir
        part_list = [mmcv.load(part_path(i)) for i in range(world_size)]
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(res)
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
    return None
示例10: __init__
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import broadcast [as 别名]
def __init__(self, module):
    """Wrap ``module`` and register gradient all-reduce hooks.

    Every tensor in the module's state dict is broadcast from rank 0. A
    hook is then registered on each parameter that requires grad; the hook
    queues ``allreduce_params`` as a callback on the parameter's execution
    engine.

    Args:
        module: The module to wrap.
    """
    super(DistributedDataParallel, self).__init__()
    # Always warn once about half tensors (a backend check was disabled in
    # the original source).
    self.warn_on_half = True
    self.module = module

    for entry in self.module.state_dict().values():
        if torch.is_tensor(entry):
            dist.broadcast(entry, 0)

    def allreduce_params():
        # Average gradients across ranks, one coalesced all-reduce per
        # tensor type. ``self.needs_reduction`` is set outside this
        # method; it is only read and cleared here.
        if not self.needs_reduction:
            return
        self.needs_reduction = False

        buckets = {}
        for weight in self.module.parameters():
            if weight.requires_grad and weight.grad is not None:
                buckets.setdefault(type(weight.data), []).append(weight)

        if self.warn_on_half and torch.cuda.HalfTensor in buckets:
            print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                  " It is recommended to use the NCCL backend in this case.")
            self.warn_on_half = False

        for bucket in buckets.values():
            grads = [weight.grad.data for weight in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            coalesced /= dist.get_world_size()
            # Copy the averaged values back into the original gradients.
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)

    for param in list(self.module.parameters()):
        if not param.requires_grad:
            continue

        # NOTE(review): the closure reads ``param`` when the hook fires
        # (late binding), so every hook uses the last parameter of the
        # loop. Kept as in the original.
        def allreduce_hook(*unused):
            param._execution_engine.queue_callback(allreduce_params)

        param.register_hook(allreduce_hook)
示例11: collect_results
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import broadcast [as 别名]
def collect_results(result_part, size, tmpdir=None):
    """Gather per-rank results on rank 0 through files in a directory.

    Every rank dumps ``result_part`` to ``part_<rank>.pkl`` inside
    ``tmpdir``. After a barrier, rank 0 loads all parts, interleaves them
    element by element, truncates to ``size`` and removes the directory.

    Args:
        result_part: This rank's results.
        size: Number of results to keep after interleaving.
        tmpdir: Directory for the part files. When ``None``, rank 0 creates
            one with ``tempfile.mkdtemp`` and broadcasts its path.
            (Assumes the directory is visible to every rank -- TODO confirm.)

    Returns:
        The ordered result list on rank 0, ``None`` on all other ranks.
    """
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN,), 32, dtype=torch.uint8, device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            # torch.tensor (the factory function) is required here: the
            # legacy torch.Tensor constructor does not accept dtype/device
            # keywords and raises TypeError.
            tmpdir = torch.tensor(bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        # decode the broadcast bytes and strip the space padding
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    # load results of all parts from tmp dir
    part_list = [
        mmcv.load(osp.join(tmpdir, 'part_{}.pkl'.format(i)))
        for i in range(world_size)
    ]
    # interleave: first item of every part, then second item, and so on
    ordered_results = []
    for res in zip(*part_list):
        ordered_results.extend(res)
    # the dataloader may pad some samples
    ordered_results = ordered_results[:size]
    # remove tmp dir
    shutil.rmtree(tmpdir)
    return ordered_results
示例12: _broadcast
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import broadcast [as 别名]
def _broadcast(rank, rows, columns):
    """Broadcast this rank's tensor from rank 0 and verify the result.

    The tensor is built by ``_get_tensor(rank, rows, columns)``, broadcast
    from rank 0, and compared against ``_get_tensor(0, rows, columns)``.

    Raises:
        AssertionError: If the tensor differs from rank 0's tensor after
            the broadcast.
    """
    source = 0
    local = _get_tensor(rank, rows, columns)

    logger.debug('Rank: {},\nTensor BEFORE broadcast: {}'.format(rank, local))
    dist.broadcast(local, src=source)
    logger.debug('Rank: {},\nTensor AFTER broadcast: {}\n'.format(rank, local))

    expected = _get_tensor(source, rows, columns)
    matches = torch.equal(local, expected)
    assert matches, \
        'Rank {}: Tensor was not equal to rank {} tensor after broadcast.'.format(rank, source)
示例13: add_args
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import broadcast [as 别名]
def add_args(parser):
    """Add optimizer-specific arguments to the parser."""
    # (flag, add_argument keyword options), registered in this order.
    option_specs = (
        (
            "--block-lr",
            dict(default=1, type=float, help="block learning rate for bmuf"),
        ),
        (
            "--block-momentum",
            dict(default=0.875, type=float, help="block momentum for bmuf"),
        ),
        (
            "--global-sync-iter",
            dict(default=50, type=int, help="Iteration for syncing global model"),
        ),
        (
            "--warmup-iterations",
            dict(
                default=500,
                type=int,
                help="warmup iterations for model to broadcast",
            ),
        ),
        (
            "--use-nbm",
            dict(
                default=False,
                action="store_true",
                help="Specify whether you want to use classical BM / Nesterov BM",
            ),
        ),
        (
            "--average-sync",
            dict(
                default=False,
                action="store_true",
                help="Specify whether you want to average the local momentum after each sync",
            ),
        ),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
示例14: _warmup_sync
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import broadcast [as 别名]
def _warmup_sync(self, root_rank=0):
    """Synchronize the model from ``root_rank`` and reset optimizer state.

    Does nothing when ``self.world_size`` is 1 or less. Otherwise every
    tensor in ``self.params`` is broadcast from ``root_rank``; the wrapped
    optimizer then either averages its parameters (``self.average_sync``)
    or reloads ``self.initial_state``, and local data is reset.

    Args:
        root_rank: Rank whose parameters are broadcast. Default: 0.
    """
    if self.world_size > 1:
        # Broadcast the local model to all gpus
        for weight in self.params:
            dist.broadcast(weight.data, src=root_rank)

        # Update local optimizer state
        if not self.average_sync:
            self._optimizer.load_state_dict(self.initial_state)
        else:
            self._optimizer.average_params()

        self._reset_local_data()
示例15: broadcast_params
# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import broadcast [as 别名]
def broadcast_params(model, src=0):
    """Broadcast the tensors in ``model.state_dict()`` from rank ``src``.

    Non-tensor state-dict values are skipped, because ``dist.broadcast``
    only accepts tensors.

    Args:
        model: Object exposing ``state_dict()``.
        src: Rank that provides the values. Default: 0.
    """
    import torch  # local import: this block did not previously use torch

    for p in model.state_dict().values():
        if torch.is_tensor(p):
            dist.broadcast(p, src)