当前位置: 首页>>代码示例>>Python>>正文


Python distributed.get_world_size函数代码示例

本文整理汇总了Python中torch.distributed.get_world_size函数的典型用法代码示例。如果您正苦于以下问题:Python get_world_size函数的具体用法?Python get_world_size怎么用?Python get_world_size使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了get_world_size函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: flat_dist_call

def flat_dist_call(tensors, call, extra_args=None):
    flat_dist_call.warn_on_half = True
    buckets = {}
    for tensor in tensors:
        tp = tensor.type()
        if tp not in buckets:
            buckets[tp] = []
        buckets[tp].append(tensor)
                    
    if flat_dist_call.warn_on_half:
        if torch.cuda.HalfTensor in buckets:
            print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                  " It is recommended to use the NCCL backend in this case.")
            flat_dist_call.warn_on_half = False

    for tp in buckets:
        bucket = buckets[tp]
        coalesced = _flatten_dense_tensors(bucket)
        if extra_args is not None:
            call(coalesced, *extra_args)
        else:
            call(coalesced)
        coalesced /= dist.get_world_size()
        for buf, synced in zip(bucket, _unflatten_dense_tensors(coalesced, bucket)):
            buf.copy_(synced)
开发者ID:Henley13,项目名称:imagenet-fast,代码行数:25,代码来源:distributed.py

示例2: _process_batch

        def _process_batch():
            dev_grad_batch, dev_events, job_event = queue.get()
            dev_coalesced = []
            # Coalesce the tensors on all devices and start a local reduction
            for dev_id, grad_batch, event, stream in zip(device_ids, dev_grad_batch, dev_events, reduction_streams):
                with torch.cuda.device(dev_id), torch.cuda.stream(stream):
                    stream.wait_event(event)
                    coalesced = _flatten_tensors(grad_batch)
                    dev_coalesced.append(coalesced)
            # Wait for all copies to complete before starting the NCCL kernel
            for stream in reduction_streams:
                stream.synchronize()
            nccl.reduce(dev_coalesced, root=device_ids[0], streams=nccl_streams)

            # From now on we're only going to work on the first device (from device_ids)
            grad_batch = dev_grad_batch[0]
            coalesced = dev_coalesced[0]
            reduce_stream = reduction_streams[0]
            with torch.cuda.stream(reduce_stream):
                reduce_stream.wait_stream(nccl_streams[0])
                coalesced /= dist.get_world_size()
                dist.all_reduce(coalesced, group=group_id)
                for grad, reduced in zip(grad_batch, _unflatten_tensors(coalesced, grad_batch)):
                    grad.copy_(reduced)
            job_event.set()
开发者ID:athiwatp,项目名称:pytorch,代码行数:25,代码来源:distributed.py

示例3: test_send_recv

    def test_send_recv(self):
        rank = dist.get_rank()
        tensor = _build_tensor(rank + 1)
        for dest in range(0, dist.get_world_size()):
            if dest == rank:
                continue
            dist.send(tensor, dest)

        for src in range(0, dist.get_world_size()):
            if src == rank:
                continue
            tensor = _build_tensor(src + 1, value=-1)
            expected_tensor = _build_tensor(src + 1)
            dist.recv(tensor, src)
            self.assertEqual(tensor, expected_tensor)

        self._barrier()
开发者ID:athiwatp,项目名称:pytorch,代码行数:17,代码来源:test_distributed.py

示例4: test_send_recv_any_source

    def test_send_recv_any_source(self):
        rank = dist.get_rank()
        tensor = _build_tensor(10, rank)
        for dest in range(0, dist.get_world_size()):
            if dest == rank:
                continue
            dist.send(tensor, dest)

        recv_ranks = set()
        for src in range(0, dist.get_world_size()):
            if src == rank:
                continue
            tensor = _build_tensor(10, value=-1)
            dist.recv(tensor)
            recv_ranks.add(tensor.resize_(1)[0])

        self.assertEqual(len(recv_ranks), dist.get_world_size() - 1)
        self._barrier()
开发者ID:athiwatp,项目名称:pytorch,代码行数:18,代码来源:test_distributed.py

示例5: reduction_fn_nccl

        def reduction_fn_nccl():
            # This function only needs to be called once
            if not self.need_reduction:
                return

            self.need_reduction = False
            all_grads = [[] for _ in range(len(self._module_copies))]
            all_grads_buckets_iters = []

            # Bucketing all the gradients
            for dev_idx, module in enumerate(self._module_copies):
                for param in module.parameters():
                    if not param.requires_grad or param.grad is None:
                        continue
                    if param.grad.requires_grad:
                        raise RuntimeError("DistributedDataParallel only works "
                                           "with gradients that don't require "
                                           "grad")
                    # Adding the gradients for reduction
                    all_grads[dev_idx].append(param.grad.data)

                # Now bucketing the parameters
                dev_grads_buckets = _take_tensors(all_grads[dev_idx],
                                                  self.nccl_reduce_bucket_size)

                all_grads_buckets_iters.append(dev_grads_buckets)

            # Now reduce each bucket one after another
            for grads_batch in zip(*all_grads_buckets_iters):
                grads_batch_coalesced = []
                # Coalesce each bucket
                for dev_idx, dev_grads_batch in enumerate(grads_batch):
                    dev_id = self.device_ids[dev_idx]
                    with torch.cuda.device(dev_id):
                        dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch)
                        grads_batch_coalesced.append(dev_grads_batch_coalesced)

                # We will only use device 0's results, but this single op should be
                # faster than doing the following two operation sequentially:
                # (1) intra-node reduce to lead GPU, followed by
                # (2) inter-node allreduce for all the first lead GPUs in all nodes
                dist.all_reduce_multigpu(grads_batch_coalesced,
                                         group=self.nccl_reduction_group_id)

                # Now only work on the first device of self.device_ids, uncoalesce
                # the gradients for each bucket
                grads_batch_coalesced[0] /= dist.get_world_size()
                grads_batch_reduced = _unflatten_dense_tensors(grads_batch_coalesced[0], grads_batch[0])
                for grad, reduced in zip(grads_batch[0], grads_batch_reduced):
                    grad.copy_(reduced)

            # clear the gradients and save memory for replicas
            for module in self._module_copies[1:]:
                for param in module.parameters():
                    if param.requires_grad:
                        param.grad = None
                        param.data.set_()
开发者ID:inkawhich,项目名称:pytorch,代码行数:57,代码来源:distributed.py

示例6: __init__

 def __init__(self, num_replicas=None, rank=None):
     if num_replicas is None:
         num_replicas = get_world_size()
     if rank is None:
         rank = get_rank()
     self.num_replicas = num_replicas
     self.rank = rank
     self.epoch = 0
     self.extra = 0
开发者ID:AhlamMD,项目名称:decaNLP,代码行数:9,代码来源:iterator.py

示例7: test_mpi

def test_mpi():
    dist.init_process_group('mpi')
    world_size = dist.get_world_size()
    rank = dist.get_rank()

    vector = [0] * world_size
    vector[rank] = 1
    vector = torch.DoubleTensor(vector)

    dist.all_reduce(vector, op=dist.reduce_op.SUM)
    print("Host {} : Rank {} : {}".format(get_hostname(), rank, vector))
开发者ID:mlbench,项目名称:mlbench,代码行数:11,代码来源:main.py

示例8: __init__

 def __init__(self, dataset, num_replicas=None, rank=None):
     if num_replicas is None:
         num_replicas = get_world_size()
     if rank is None:
         rank = get_rank()
     self.dataset = dataset
     self.num_replicas = num_replicas
     self.rank = rank
     self.epoch = 0
     self.num_samples = int(math.ceil(len(self.dataset) / self.num_replicas))
     self.total_size = self.num_samples * self.num_replicas
开发者ID:athiwatp,项目名称:pytorch,代码行数:11,代码来源:distributed.py

示例9: allreduce_params

        def allreduce_params():
            if self.needs_reduction:
                self.needs_reduction = False
                buckets = defaultdict(list)
                for param in self.module.parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = type(param.data)
                        buckets[tp].append(param)

                for bucket in buckets.values():
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    dist.all_reduce(coalesced)
                    coalesced /= dist.get_world_size()
                    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)
开发者ID:RichieMay,项目名称:pytorch,代码行数:16,代码来源:distributed_cpu.py

示例10: _init_multigpu_helper

    def _init_multigpu_helper(self):
        """Multigpu tests are designed to simulate the multi nodes with multi
        GPUs on each node. Nccl backend requires equal #GPUs in each process.
        On a single node, all visible GPUs are evenly
        divided to subsets, each process only uses a subset.
        """
        nGPUs = torch.cuda.device_count()
        world_size = dist.get_world_size()
        visible_devices = range(nGPUs)

        if BACKEND == 'nccl':
            apply_hack_for_nccl()

        nGPUs_per_process = nGPUs // world_size
        rank_to_GPU = {i: list(visible_devices[i * nGPUs_per_process: (i + 1) * nGPUs_per_process])
                       for i in range(world_size)}
        return rank_to_GPU
开发者ID:xiongyw,项目名称:pytorch,代码行数:17,代码来源:test_distributed.py

示例11: config_pytorch

def config_pytorch(options):
    """Config pytorch packages.

    Fix random number for packages and initialize distributed environment for pytorch.
    Setup cuda environment for pytorch.

    :param options: A global object containing specified options.
    :type options: argparse.Namespace
    """

    # Setting `cudnn.deterministic = True` will turn on
    # CUDNN deterministic setting which can slow down training considerably.
    # Unexpected behavior may also be observed from checkpoint.
    # See: https: // github.com/pytorch/examples/blob/master/imagenet/main.py
    if options.cudnn_deterministic:
        cudnn.deterministic = True
        log.warning('You have chosen to seed training. '
                    'This will turn on the CUDNN deterministic setting, '
                    'which can slow down your training considerably! '
                    'You may see unexpected behavior when restarting '
                    'from checkpoints.', 0)

    if options.seed is not None:
        random.seed(options.seed)
        torch.manual_seed(options.seed)

    # define the graph for the computation.
    if options.use_cuda:
        assert torch.cuda.is_available()

    options.rank = dist.get_rank()
    options.world_size = dist.get_world_size()
    options.graph = FCGraph(options)

    # enable cudnn accelerator if we are using cuda.
    if options.use_cuda:
        options.graph.assigned_gpu_id()
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True

        if torch.backends.cudnn.version() is None:
            log.warning("CUDNN not found on device.")

        log.info("World size={}, Rank={}, hostname={}, cuda_available={}, cuda_device={}".format(
            options.world_size, options.rank, socket.gethostname(), torch.cuda.is_available(),
            torch.cuda.current_device()))
开发者ID:mlbench,项目名称:mlbench,代码行数:46,代码来源:config.py

示例12: __init__

 def __init__(self, data_source, batch_size=1, num_replicas=None, rank=None):
     """
     Samples batches assuming they are in order of size to batch similarly sized samples together.
     """
     super(DistributedBucketingSampler, self).__init__(data_source)
     if num_replicas is None:
         num_replicas = get_world_size()
     if rank is None:
         rank = get_rank()
     self.data_source = data_source
     self.ids = list(range(0, len(data_source)))
     self.batch_size = batch_size
     self.bins = [self.ids[i:i + batch_size] for i in range(0, len(self.ids), batch_size)]
     self.num_replicas = num_replicas
     self.rank = rank
     self.num_samples = int(math.ceil(len(self.bins) * 1.0 / self.num_replicas))
     self.total_size = self.num_samples * self.num_replicas
开发者ID:vamsimynam,项目名称:deepspeech.pytorch,代码行数:17,代码来源:data_loader.py

示例13: test_isend

    def test_isend(self):
        rank = dist.get_rank()
        world_size = dist.get_world_size()

        if rank == 0:
            requests = [
                dist.isend(_build_tensor(dest, 10), dest) for dest in range(1, world_size)
            ]
            for request in requests:
                request.wait()
                self.assertTrue(request.is_completed())
        else:
            tensor = _build_tensor(rank, -1)
            dist.recv(tensor, 0)
            self.assertEqual(tensor, _build_tensor(rank, 10))

        self._barrier()
开发者ID:athiwatp,项目名称:pytorch,代码行数:17,代码来源:test_distributed.py

示例14: test_irecv

    def test_irecv(self):
        rank = dist.get_rank()
        world_size = dist.get_world_size()

        if rank == 0:
            expected_tensors = [_build_tensor(src, -1) for src in range(1, world_size)]
            requests = [
                dist.irecv(expected_tensors[src - 1], src) for src in range(1, world_size)
            ]

            for src in range(1, world_size):
                requests[src - 1].wait()
                self.assertTrue(requests[src - 1].is_completed())
                self.assertEqual(expected_tensors[src - 1], _build_tensor(src, 10))
        else:
            tensor = _build_tensor(rank, 10)
            dist.send(tensor, 0)

        self._barrier()
开发者ID:athiwatp,项目名称:pytorch,代码行数:19,代码来源:test_distributed.py

示例15: test_get_rank

    def test_get_rank(self):
        test_dir = os.path.join(TEMP_DIR, 'test_dir')
        pid = str(os.getpid())
        num_processes = dist.get_world_size()
        with open(os.path.join(test_dir, pid), 'w') as f:
            f.write(str(dist.get_rank()))

        self._barrier()

        all_ranks = set()
        for f_name in os.listdir(test_dir):
            with open(os.path.join(test_dir, f_name), 'r') as f:
                all_ranks.add(int(f.read()))
        self.assertEqual(len(all_ranks), num_processes)

        self._barrier()

        if dist.get_rank() == 0:
            for f_name in os.listdir(test_dir):
                os.unlink(os.path.join(test_dir, f_name))

        self._barrier()
开发者ID:athiwatp,项目名称:pytorch,代码行数:22,代码来源:test_distributed.py


注:本文中的torch.distributed.get_world_size函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。