This article collects typical usage examples of the Python function torch.distributed.get_world_size: what get_world_size does, how it is called, and what real code that uses it looks like.
The 15 code examples below are taken from open-source projects and are ordered by popularity.
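Before the excerpts, here is a minimal, self-contained sketch of how get_world_size is typically called once a process group has been initialized. The backend and init method below (gloo, env://) are assumptions chosen for portability, not something taken from the examples that follow.

import torch.distributed as dist

def main():
    # Expects MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE to be set in the
    # environment, e.g. by torchrun or another launcher.
    dist.init_process_group(backend="gloo", init_method="env://")
    world_size = dist.get_world_size()  # number of processes in the default group
    rank = dist.get_rank()              # this process's index, in [0, world_size)
    print("rank {} of {}".format(rank, world_size))
    dist.destroy_process_group()

if __name__ == "__main__":
    main()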
Example 1: flat_dist_call
def flat_dist_call(tensors, call, extra_args=None):
    flat_dist_call.warn_on_half = True
    buckets = {}
    for tensor in tensors:
        tp = tensor.type()
        if tp not in buckets:
            buckets[tp] = []
        buckets[tp].append(tensor)

    if flat_dist_call.warn_on_half:
        if torch.cuda.HalfTensor in buckets:
            print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                  " It is recommended to use the NCCL backend in this case.")
            flat_dist_call.warn_on_half = False

    for tp in buckets:
        bucket = buckets[tp]
        coalesced = _flatten_dense_tensors(bucket)
        if extra_args is not None:
            call(coalesced, *extra_args)
        else:
            call(coalesced)
        coalesced /= dist.get_world_size()
        for buf, synced in zip(bucket, _unflatten_dense_tensors(coalesced, bucket)):
            buf.copy_(synced)
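The helper above coalesces all tensors of the same type into a single flat buffer, applies the collective once, divides the result by get_world_size(), and copies the averaged values back into the original tensors. A hedged usage sketch for averaging a model's gradients across ranks (model and an initialized process group are assumed and are not part of the excerpt):

import torch.distributed as dist

# Gather every gradient tensor and average them in one coalesced all-reduce.
grads = [param.grad.data for param in model.parameters() if param.grad is not None]
flat_dist_call(grads, dist.all_reduce)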
Example 2: _process_batch
def _process_batch():
    dev_grad_batch, dev_events, job_event = queue.get()
    dev_coalesced = []
    # Coalesce the tensors on all devices and start a local reduction
    for dev_id, grad_batch, event, stream in zip(device_ids, dev_grad_batch, dev_events, reduction_streams):
        with torch.cuda.device(dev_id), torch.cuda.stream(stream):
            stream.wait_event(event)
            coalesced = _flatten_tensors(grad_batch)
            dev_coalesced.append(coalesced)
    # Wait for all copies to complete before starting the NCCL kernel
    for stream in reduction_streams:
        stream.synchronize()
    nccl.reduce(dev_coalesced, root=device_ids[0], streams=nccl_streams)

    # From now on we're only going to work on the first device (from device_ids)
    grad_batch = dev_grad_batch[0]
    coalesced = dev_coalesced[0]
    reduce_stream = reduction_streams[0]
    with torch.cuda.stream(reduce_stream):
        reduce_stream.wait_stream(nccl_streams[0])
        coalesced /= dist.get_world_size()
        dist.all_reduce(coalesced, group=group_id)
        for grad, reduced in zip(grad_batch, _unflatten_tensors(coalesced, grad_batch)):
            grad.copy_(reduced)
    job_event.set()
Example 3: test_send_recv
def test_send_recv(self):
    rank = dist.get_rank()
    tensor = _build_tensor(rank + 1)
    for dest in range(0, dist.get_world_size()):
        if dest == rank:
            continue
        dist.send(tensor, dest)

    for src in range(0, dist.get_world_size()):
        if src == rank:
            continue
        tensor = _build_tensor(src + 1, value=-1)
        expected_tensor = _build_tensor(src + 1)
        dist.recv(tensor, src)
        self.assertEqual(tensor, expected_tensor)

    self._barrier()
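_build_tensor is a helper from the PyTorch distributed test suite that is not shown in these excerpts; it builds a small cubic tensor filled with a constant, roughly along these lines (a sketch for readability, not the exact definition):

import torch

def _build_tensor(size, value=None):
    # A size x size x size tensor filled with `value` (defaults to `size`).
    if value is None:
        value = size
    return torch.FloatTensor(size, size, size).fill_(value)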
Example 4: test_send_recv_any_source
def test_send_recv_any_source(self):
    rank = dist.get_rank()
    tensor = _build_tensor(10, rank)
    for dest in range(0, dist.get_world_size()):
        if dest == rank:
            continue
        dist.send(tensor, dest)

    recv_ranks = set()
    for src in range(0, dist.get_world_size()):
        if src == rank:
            continue
        tensor = _build_tensor(10, value=-1)
        dist.recv(tensor)
        recv_ranks.add(tensor.resize_(1)[0])

    self.assertEqual(len(recv_ranks), dist.get_world_size() - 1)
    self._barrier()
Example 5: reduction_fn_nccl
def reduction_fn_nccl():
    # This function only needs to be called once
    if not self.need_reduction:
        return

    self.need_reduction = False
    all_grads = [[] for _ in range(len(self._module_copies))]
    all_grads_buckets_iters = []

    # Bucketing all the gradients
    for dev_idx, module in enumerate(self._module_copies):
        for param in module.parameters():
            if not param.requires_grad or param.grad is None:
                continue
            if param.grad.requires_grad:
                raise RuntimeError("DistributedDataParallel only works "
                                   "with gradients that don't require "
                                   "grad")
            # Adding the gradients for reduction
            all_grads[dev_idx].append(param.grad.data)

        # Now bucketing the parameters
        dev_grads_buckets = _take_tensors(all_grads[dev_idx],
                                          self.nccl_reduce_bucket_size)
        all_grads_buckets_iters.append(dev_grads_buckets)

    # Now reduce each bucket one after another
    for grads_batch in zip(*all_grads_buckets_iters):
        grads_batch_coalesced = []
        # Coalesce each bucket
        for dev_idx, dev_grads_batch in enumerate(grads_batch):
            dev_id = self.device_ids[dev_idx]
            with torch.cuda.device(dev_id):
                dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch)
                grads_batch_coalesced.append(dev_grads_batch_coalesced)

        # We will only use device 0's results, but this single op should be
        # faster than doing the following two operation sequentially:
        # (1) intra-node reduce to lead GPU, followed by
        # (2) inter-node allreduce for all the first lead GPUs in all nodes
        dist.all_reduce_multigpu(grads_batch_coalesced,
                                 group=self.nccl_reduction_group_id)

        # Now only work on the first device of self.device_ids, uncoalesce
        # the gradients for each bucket
        grads_batch_coalesced[0] /= dist.get_world_size()
        grads_batch_reduced = _unflatten_dense_tensors(grads_batch_coalesced[0], grads_batch[0])
        for grad, reduced in zip(grads_batch[0], grads_batch_reduced):
            grad.copy_(reduced)

    # clear the gradients and save memory for replicas
    for module in self._module_copies[1:]:
        for param in module.parameters():
            if param.requires_grad:
                param.grad = None
                param.data.set_()
Example 6: __init__
def __init__(self, num_replicas=None, rank=None):
    if num_replicas is None:
        num_replicas = get_world_size()
    if rank is None:
        rank = get_rank()
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.extra = 0
Example 7: test_mpi
def test_mpi():
    dist.init_process_group('mpi')
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    vector = [0] * world_size
    vector[rank] = 1
    vector = torch.DoubleTensor(vector)
    dist.all_reduce(vector, op=dist.reduce_op.SUM)
    print("Host {} : Rank {} : {}".format(get_hostname(), rank, vector))
Example 8: __init__
def __init__(self, dataset, num_replicas=None, rank=None):
    if num_replicas is None:
        num_replicas = get_world_size()
    if rank is None:
        rank = get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
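This constructor follows the standard distributed-sampler pattern: each of the num_replicas processes is assigned ceil(len(dataset) / num_replicas) indices, so every rank iterates over a same-sized shard of the data. A hedged usage sketch with the built-in torch.utils.data.distributed.DistributedSampler, which defaults num_replicas and rank in the same way (dataset and an initialized process group are assumed):

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

sampler = DistributedSampler(dataset)  # num_replicas/rank default to get_world_size()/get_rank()
loader = DataLoader(dataset, batch_size=32, sampler=sampler)

for epoch in range(10):
    sampler.set_epoch(epoch)  # vary the shuffling seed per epoch
    for batch in loader:
        pass  # training step goes here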
Example 9: allreduce_params
def allreduce_params():
    if self.needs_reduction:
        self.needs_reduction = False
        buckets = defaultdict(list)
        for param in self.module.parameters():
            if param.requires_grad and param.grad is not None:
                tp = type(param.data)
                buckets[tp].append(param)

        for bucket in buckets.values():
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            coalesced /= dist.get_world_size()
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)
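The closure sums each bucket of gradients with all_reduce and then divides by get_world_size(), leaving every rank with the average gradient. A minimal un-bucketed version of the same idea, called by hand between backward() and the optimizer step (a sketch; model, loss and optimizer are assumed):

import torch.distributed as dist

def average_gradients(model):
    # All-reduce (sum) every gradient, then divide by the number of ranks.
    world_size = float(dist.get_world_size())
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= world_size

loss.backward()
average_gradients(model)
optimizer.step()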
Example 10: _init_multigpu_helper
def _init_multigpu_helper(self):
    """Multi-GPU tests are designed to simulate multiple nodes, each with
    multiple GPUs. The NCCL backend requires an equal number of GPUs in each
    process, so on a single node all visible GPUs are evenly divided into
    subsets and each process uses only its own subset.
    """
    nGPUs = torch.cuda.device_count()
    world_size = dist.get_world_size()
    visible_devices = range(nGPUs)

    if BACKEND == 'nccl':
        apply_hack_for_nccl()

    nGPUs_per_process = nGPUs // world_size
    rank_to_GPU = {i: list(visible_devices[i * nGPUs_per_process: (i + 1) * nGPUs_per_process])
                   for i in range(world_size)}
    return rank_to_GPU
Example 11: config_pytorch
def config_pytorch(options):
    """Configure PyTorch.

    Fix the random seeds used by the relevant packages, initialize the
    distributed environment, and set up the CUDA environment for PyTorch.

    :param options: A global object containing the specified options.
    :type options: argparse.Namespace
    """
    # Setting `cudnn.deterministic = True` turns on the cuDNN deterministic
    # mode, which can slow down training considerably, and unexpected
    # behavior may be observed when restarting from checkpoints.
    # See: https://github.com/pytorch/examples/blob/master/imagenet/main.py
    if options.cudnn_deterministic:
        cudnn.deterministic = True
        log.warning('You have chosen to seed training. '
                    'This will turn on the CUDNN deterministic setting, '
                    'which can slow down your training considerably! '
                    'You may see unexpected behavior when restarting '
                    'from checkpoints.', 0)

    if options.seed is not None:
        random.seed(options.seed)
        torch.manual_seed(options.seed)

    # Define the graph for the computation.
    if options.use_cuda:
        assert torch.cuda.is_available()

    options.rank = dist.get_rank()
    options.world_size = dist.get_world_size()
    options.graph = FCGraph(options)

    # Enable the cuDNN accelerator if we are using CUDA.
    if options.use_cuda:
        options.graph.assigned_gpu_id()
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True

        if torch.backends.cudnn.version() is None:
            log.warning("CUDNN not found on device.")

    log.info("World size={}, Rank={}, hostname={}, cuda_available={}, cuda_device={}".format(
        options.world_size, options.rank, socket.gethostname(), torch.cuda.is_available(),
        torch.cuda.current_device()))
Example 12: __init__
def __init__(self, data_source, batch_size=1, num_replicas=None, rank=None):
    """
    Samples batches assuming they are in order of size to batch similarly sized samples together.
    """
    super(DistributedBucketingSampler, self).__init__(data_source)
    if num_replicas is None:
        num_replicas = get_world_size()
    if rank is None:
        rank = get_rank()
    self.data_source = data_source
    self.ids = list(range(0, len(data_source)))
    self.batch_size = batch_size
    self.bins = [self.ids[i:i + batch_size] for i in range(0, len(self.ids), batch_size)]
    self.num_replicas = num_replicas
    self.rank = rank
    self.num_samples = int(math.ceil(len(self.bins) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
Example 13: test_isend
def test_isend(self):
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    if rank == 0:
        requests = [
            dist.isend(_build_tensor(dest, 10), dest) for dest in range(1, world_size)
        ]
        for request in requests:
            request.wait()
            self.assertTrue(request.is_completed())
    else:
        tensor = _build_tensor(rank, -1)
        dist.recv(tensor, 0)
        self.assertEqual(tensor, _build_tensor(rank, 10))

    self._barrier()
Example 14: test_irecv
def test_irecv(self):
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    if rank == 0:
        expected_tensors = [_build_tensor(src, -1) for src in range(1, world_size)]
        requests = [
            dist.irecv(expected_tensors[src - 1], src) for src in range(1, world_size)
        ]
        for src in range(1, world_size):
            requests[src - 1].wait()
            self.assertTrue(requests[src - 1].is_completed())
            self.assertEqual(expected_tensors[src - 1], _build_tensor(src, 10))
    else:
        tensor = _build_tensor(rank, 10)
        dist.send(tensor, 0)

    self._barrier()
Example 15: test_get_rank
def test_get_rank(self):
    test_dir = os.path.join(TEMP_DIR, 'test_dir')
    pid = str(os.getpid())
    num_processes = dist.get_world_size()
    with open(os.path.join(test_dir, pid), 'w') as f:
        f.write(str(dist.get_rank()))

    self._barrier()

    all_ranks = set()
    for f_name in os.listdir(test_dir):
        with open(os.path.join(test_dir, f_name), 'r') as f:
            all_ranks.add(int(f.read()))
    self.assertEqual(len(all_ranks), num_processes)

    self._barrier()

    if dist.get_rank() == 0:
        for f_name in os.listdir(test_dir):
            os.unlink(os.path.join(test_dir, f_name))

    self._barrier()