Python distributed.get_world_size方法代码示例

本文整理汇总了Python中torch.distributed.get_world_size方法的典型用法代码示例。如果您正苦于以下问题：Python distributed.get_world_size方法的具体用法？Python distributed.get_world_size怎么用？Python distributed.get_world_size使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类torch.distributed的用法示例。

在下文中一共展示了distributed.get_world_size方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: allreduce_grads

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import get_world_size [as 别名]
def allreduce_grads(params, coalesce=True, bucket_size_mb=-1):
    """Average the gradients of ``params`` over every process.

    Only parameters that require a gradient and actually hold one take
    part; the rest are left untouched.

    Args:
        params (list[torch.Parameters]): Parameters whose gradients are
            reduced in place.
        coalesce (bool, optional): When True the gradients are handed to
            ``_allreduce_coalesced`` together; otherwise every gradient
            is reduced on its own. Defaults to True.
        bucket_size_mb (int, optional): Bucket size in MB, forwarded to
            ``_allreduce_coalesced``. Defaults to -1.
    """
    grads = []
    for param in params:
        if param.requires_grad and param.grad is not None:
            grads.append(param.grad.data)

    world_size = dist.get_world_size()
    if not coalesce:
        for grad in grads:
            # Pre-divide so that the (default SUM) reduction yields a mean.
            grad.div_(world_size)
            dist.all_reduce(grad)
        return
    _allreduce_coalesced(grads, world_size, bucket_size_mb)

开发者ID:open-mmlab，项目名称:mmdetection，代码行数:22，代码来源:dist_utils.py

示例2: __init__

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import get_world_size [as 别名]
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    """Record the sampler configuration and the per-replica sample count.

    Args:
        dataset: Sized collection to sample from (``len`` is taken).
        num_replicas: Number of participating processes; read from
            ``dist.get_world_size()`` when None.
        rank: Index of this process; read from ``dist.get_rank()``
            when None.
        shuffle: Stored as ``self.shuffle``.

    Raises:
        RuntimeError: If a value has to be looked up but the distributed
            package is not available.
    """

    def _require_dist():
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")

    if num_replicas is None:
        _require_dist()
        num_replicas = dist.get_world_size()
    if rank is None:
        _require_dist()
        rank = dist.get_rank()

    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    # Round up so that every replica gets the same number of samples.
    per_replica = len(self.dataset) * 1.0 / self.num_replicas
    self.num_samples = int(math.ceil(per_replica))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle

开发者ID:Res2Net，项目名称:Res2Net-maskrcnn，代码行数:18，代码来源:distributed.py

示例3: __init__

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import get_world_size [as 别名]
def __init__(self,
             dataset,
             samples_per_gpu=1,
             num_replicas=None,
             rank=None):
    """Store the sampler settings and pre-compute the sample counts.

    Args:
        dataset: Dataset exposing a ``flag`` attribute that is fed to
            ``np.bincount`` to obtain the size of every group.
        samples_per_gpu: Number of samples per GPU. Defaults to 1.
        num_replicas: Number of processes; ``get_world_size()`` is used
            when None.
        rank: Index of this process; ``get_rank()`` is used when None.
    """
    num_replicas = get_world_size() if num_replicas is None else num_replicas
    rank = get_rank() if rank is None else rank

    self.dataset = dataset
    self.samples_per_gpu = samples_per_gpu
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0

    assert hasattr(self.dataset, 'flag')
    self.flag = self.dataset.flag
    self.group_sizes = np.bincount(self.flag)

    # Each group contributes its per-replica share, rounded up to a whole
    # multiple of samples_per_gpu.
    self.num_samples = sum(
        int(math.ceil(size * 1.0 / self.samples_per_gpu /
                      self.num_replicas)) * self.samples_per_gpu
        for size in self.group_sizes)
    self.total_size = self.num_samples * self.num_replicas

开发者ID:dingjiansw101，项目名称:AerialDetection，代码行数:27，代码来源:sampler.py

示例4: forward

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import get_world_size [as 别名]
def forward(self, input):
    """Normalize ``input`` with statistics shared across processes.

    With a world size of 1, or outside training mode, the call is
    delegated to the parent class. Otherwise the per-channel mean and
    mean-of-squares are computed locally, passed through
    ``AllReduce.apply`` and divided by the world size; the running
    statistics are updated in place and the affine transform is applied.

    NOTE(review): reducing over dims [0, 2, 3] and reshaping to
    (1, -1, 1, 1) implies a 4-D, channel-second input -- confirm.
    NOTE(review): ``AllReduce`` is defined elsewhere; presumably it sums
    over processes -- confirm.
    """
    if get_world_size() == 1 or not self.training:
        return super().forward(input)

    assert input.shape[0] > 0, "SyncBatchNorm does not support empty inputs"
    num_channels = input.shape[1]
    reduce_dims = [0, 2, 3]
    local_mean = torch.mean(input, dim=reduce_dims)
    local_meansqr = torch.mean(input * input, dim=reduce_dims)

    # Pack both statistics into one vector so a single reduction suffices.
    stats = torch.cat([local_mean, local_meansqr], dim=0)
    stats = AllReduce.apply(stats) * (1.0 / dist.get_world_size())

    mean, meansqr = torch.split(stats, num_channels)
    var = meansqr - mean * mean
    self.running_mean += self.momentum * (mean.detach() - self.running_mean)
    self.running_var += self.momentum * (var.detach() - self.running_var)

    invstd = torch.rsqrt(var + self.eps)
    scale = self.weight * invstd
    bias = self.bias - mean * scale
    return input * scale.reshape(1, -1, 1, 1) + bias.reshape(1, -1, 1, 1)

开发者ID:soeaver，项目名称:Parsing-R-CNN，代码行数:25，代码来源:batch_norm.py

示例5: reduce_loss_dict

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import get_world_size [as 别名]
def reduce_loss_dict(loss_dict):
    """Reduce a dictionary of losses onto the rank-0 process.

    With fewer than two processes the input dictionary is returned as is.
    Otherwise the values are stacked in sorted-key order, reduced to rank
    0 and, on rank 0 only, divided by the world size. The returned
    dictionary has the same keys as ``loss_dict``.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        # Sorting the keys gives every process the same stacking order.
        names = sorted(loss_dict.keys())
        stacked = torch.stack([loss_dict[name] for name in names], dim=0)
        dist.reduce(stacked, dst=0)
        if dist.get_rank() == 0:
            # Only the destination rank receives the accumulated values,
            # so only there does the division produce the average.
            stacked /= world_size
        reduced_losses = dict(zip(names, stacked))
    return reduced_losses

开发者ID:AceCoooool，项目名称:LEDNet，代码行数:25，代码来源:parallel.py

示例6: __init__

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import get_world_size [as 别名]
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    """Initialise the parent class and store the sampler configuration.

    Args:
        dataset: Sized dataset, also passed to the parent initialiser.
        num_replicas: Number of processes; taken from
            ``dist.get_world_size()`` when None.
        rank: Index of this process; taken from ``dist.get_rank()``
            when None.
        shuffle: Stored as ``self.shuffle``.

    Raises:
        RuntimeError: If a value must be looked up but the distributed
            package is not available.
    """
    import torch.distributed as dist

    super().__init__(dataset)

    need_lookup = num_replicas is None or rank is None
    if need_lookup and not dist.is_available():  # pragma: no cover
        raise RuntimeError("Requires distributed package to be available")
    if num_replicas is None:  # pragma: no cover
        num_replicas = dist.get_world_size()
    if rank is None:  # pragma: no cover
        rank = dist.get_rank()

    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    # Ceiling division: every replica draws the same number of samples.
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_replicas * self.num_samples
    self.shuffle = shuffle

开发者ID:mars-project，项目名称:mars，代码行数:22，代码来源:sampler.py

示例7: _gather

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import get_world_size [as 别名]
def _gather(rank, rows, columns):
    """Run ``dist.gather`` towards rank 0 and verify the outcome.

    Every process contributes ``_get_tensor(rank, rows, columns)``. The
    destination rank checks that slot ``i`` of the gathered list equals
    ``_get_tensor(i, rows, columns)``; every rank checks that its own
    input tensor was left unmodified.

    Args:
        rank: Rank of the calling process.
        rows: Forwarded to the tensor helper functions.
        columns: Forwarded to the tensor helper functions.

    Raises:
        AssertionError: If a gathered tensor, or the local tensor after
            the call, differs from the expected value.
    """
    dest = 0
    tensor = _get_tensor(rank, rows, columns)
    if rank == dest:
        tensors_list = _get_zeros_tensors_list(rows, columns)
        logger.debug('Rank: {},\nTensor BEFORE gather: {}. tensors_list: {}'.format(
            rank, tensor, tensors_list))
        # dst is spelled out so both branches address the same root.
        dist.gather(tensor=tensor, gather_list=tensors_list, dst=dest)
        logger.debug('Rank: {},\nTensor AFTER gather: {}. tensors_list: {}\n'.format(
            rank, tensor, tensors_list))
        for i in range(dist.get_world_size()):
            # The message is now formatted, so the rank shows up in it.
            assert torch.equal(tensors_list[i], _get_tensor(i, rows, columns)), \
                'Rank {}: tensors lists are not the same after gather.'.format(rank)
    else:
        logger.debug('Rank: {},\nTensor BEFORE gather: {}\n'.format(rank, tensor))
        dist.gather(tensor=tensor, dst=dest)
        logger.debug('Rank: {},\nTensor AFTER gather: {}\n'.format(rank, tensor))

    # tensor shouldn't have changed
    assert torch.equal(tensor, _get_tensor(rank, rows, columns)), \
        'Rank {}: Tensor got changed after gather.'.format(rank)

开发者ID:aws，项目名称:sagemaker-pytorch-training-toolkit，代码行数:23，代码来源:distributed_operations.py

示例8: gather_tensors

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import get_world_size [as 别名]
def gather_tensors(input_array):
    """All-gather one array per process, allowing differing shapes.

    The shapes are exchanged first; every array is then flattened,
    zero-padded to the largest element count, gathered, and finally cut
    back and reshaped to its original shape.

    Args:
        input_array: Local array providing ``shape``, ``size`` and
            ``reshape`` (presumably a NumPy array -- confirm with callers).

    Returns:
        list: One NumPy array per process, in the order filled in by
        ``dist.all_gather``.

    NOTE(review): data travels through ``torch.Tensor(...)``, i.e. the
    default tensor type, so the input dtype is presumably not preserved
    -- confirm.
    """
    num_ranks = dist.get_world_size()

    # Step 1: exchange the shapes.
    local_shape = input_array.shape
    local_count = input_array.size
    shape_tensor = torch.Tensor(np.array(local_shape)).cuda()
    shape_buffers = [
        torch.Tensor(np.array(local_shape)).cuda() for _ in range(num_ranks)
    ]
    dist.all_gather(shape_buffers, shape_tensor)

    # Step 2: derive element counts and the padding target.
    shapes_np = [buf.cpu().numpy() for buf in shape_buffers]
    counts = [int(shape.prod()) for shape in shapes_np]
    shapes = [[int(dim) for dim in shape] for shape in shapes_np]
    largest = max(counts)

    # Step 3: pad the flattened local data and gather it.
    recv_buffers = [torch.Tensor(largest).cuda() for _ in range(num_ranks)]
    padded = np.zeros(largest)
    padded[:local_count] = input_array.reshape(-1)
    send_tensor = torch.Tensor(padded).cuda()
    dist.all_gather(recv_buffers, send_tensor)

    # Step 4: strip the padding and restore every original shape.
    gathered = [buf.cpu().numpy() for buf in recv_buffers]
    return [
        flat[:count].reshape(shape)
        for flat, count, shape in zip(gathered, counts, shapes)
    ]

开发者ID:XiaohangZhan，项目名称:conditional-motion-propagation，代码行数:25，代码来源:distributed_utils.py

示例9: __init__

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import get_world_size [as 别名]
def __init__(self, dataset, total_iter, batch_size, world_size=None, rank=None, last_iter=-1):
    """Store the sampler settings and build the index list.

    Args:
        dataset: Stored as ``self.dataset``.
        total_iter: Total number of iterations.
        batch_size: Batch size; ``total_size`` is ``total_iter * batch_size``.
        world_size: Number of processes; ``dist.get_world_size()`` when None.
        rank: Index of this process; ``dist.get_rank()`` when None.
        last_iter: Stored as ``self.last_iter``. Defaults to -1.
    """
    world_size = dist.get_world_size() if world_size is None else world_size
    rank = dist.get_rank() if rank is None else rank
    assert rank < world_size

    self.dataset = dataset
    self.total_iter = total_iter
    self.batch_size = batch_size
    self.world_size = world_size
    self.rank = rank
    self.last_iter = last_iter

    self.total_size = self.total_iter * self.batch_size

    # Built only after every setting above has been stored.
    self.indices = self.gen_new_list()
    self.call = 0

开发者ID:XiaohangZhan，项目名称:conditional-motion-propagation，代码行数:19，代码来源:distributed_utils.py

示例10: __init__

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import get_world_size [as 别名]
def __init__(self, params, dist_model=False):
    """Build the network, wrap it, and create its optimizer.

    Args:
        params (dict): Reads ``params['module']`` (including its
            ``'arch'`` entry), ``params['optim']``, ``params['lr']`` and,
            for Adam, ``params['beta1']``.
        dist_model (bool): When truthy the model is wrapped in
            ``utils.DistModule`` and the world size is read from
            ``dist``; otherwise ``models.modules.FixModule`` is used and
            the world size is 1.

    Raises:
        Exception: If ``params['optim']`` is neither 'SGD' nor 'Adam'.
    """
    model_params = params['module']
    build_arch = models.modules.__dict__[model_params['arch']]
    self.model = build_arch(model_params)
    utils.init_weights(self.model, init_type='xavier')
    self.model.cuda()

    if not dist_model:
        self.model = models.modules.FixModule(self.model)
        self.world_size = 1
    else:
        self.model = utils.DistModule(self.model)
        self.world_size = dist.get_world_size()

    optim_name = params['optim']
    if optim_name == 'SGD':
        self.optim = torch.optim.SGD(
            self.model.parameters(),
            lr=params['lr'],
            momentum=0.9,
            weight_decay=0.0001)
    elif optim_name == 'Adam':
        self.optim = torch.optim.Adam(
            self.model.parameters(),
            lr=params['lr'],
            betas=(params['beta1'], 0.999))
    else:
        raise Exception("No such optimizer: {}".format(optim_name))

    cudnn.benchmark = True

开发者ID:XiaohangZhan，项目名称:conditional-motion-propagation，代码行数:26，代码来源:single_stage_model.py

示例11: reduce_mean

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import get_world_size [as 别名]
def reduce_mean(tensor):
    """Return the mean of ``tensor`` over all processes.

    When the distributed package is unavailable or not initialised, the
    input tensor itself is returned. Otherwise a clone is divided by the
    world size, sum-reduced across processes and returned; the input is
    left unchanged.
    """
    if dist.is_available() and dist.is_initialized():
        averaged = tensor.clone()
        averaged.div_(dist.get_world_size())
        dist.all_reduce(averaged, op=dist.ReduceOp.SUM)
        return averaged
    return tensor

开发者ID:open-mmlab，项目名称:mmdetection，代码行数:8，代码来源:gfl_head.py

示例12: _parse_losses

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import get_world_size [as 别名]
def _parse_losses(self, losses):
    """Turn the raw network outputs into a total loss and log values.

    Args:
        losses (dict): Maps a name to a tensor or a list of tensors. A
            tensor is reduced with ``mean()``; a list contributes the sum
            of its elements' means.

    Returns:
        tuple[Tensor, dict]: ``(loss, log_vars)``. ``loss`` is the sum of
            all entries whose name contains ``'loss'``. ``log_vars`` maps
            every name, plus ``'loss'``, to a Python scalar; when
            distributed training is initialised each value is first
            averaged over all processes.

    Raises:
        TypeError: If a value is neither a tensor nor a list.
    """
    log_vars = OrderedDict()
    for loss_name, raw in losses.items():
        if isinstance(raw, torch.Tensor):
            log_vars[loss_name] = raw.mean()
            continue
        if isinstance(raw, list):
            log_vars[loss_name] = sum(part.mean() for part in raw)
            continue
        raise TypeError(
            f'{loss_name} is not a tensor or list of tensors')

    loss = sum(value for key, value in log_vars.items() if 'loss' in key)
    log_vars['loss'] = loss

    # Only existing keys are reassigned below, so iterating over items()
    # while writing back is safe.
    for key, value in log_vars.items():
        if dist.is_available() and dist.is_initialized():
            # Reduce a detached copy so the returned loss stays untouched.
            value = value.data.clone()
            dist.all_reduce(value.div_(dist.get_world_size()))
        log_vars[key] = value.item()

    return loss, log_vars

开发者ID:open-mmlab，项目名称:mmdetection，代码行数:36，代码来源:base.py

示例13: get_current_train_batch_size

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import get_world_size [as 别名]
def get_current_train_batch_size(self):
    """Return the training batch size this process should use.

    Outside distributed mode this is ``self.setting.train_batch_size``.
    In distributed mode that value is floor-divided by the world size,
    with a lower bound of 1.
    """
    if not self.in_distributed_mode():
        return self.setting.train_batch_size

    per_process = self.setting.train_batch_size // dist.get_world_size()
    return max(per_process, 1)

开发者ID:dolphin-zs，项目名称:Doc2EDAG，代码行数:9，代码来源:base_task.py

示例14: average_gradients

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import get_world_size [as 别名]
def average_gradients(model):
    """Average the gradients of ``model`` across all processes.

    Each available gradient is sum-reduced in place and then divided by
    the world size. Parameters without a gradient are skipped. A failure
    while reducing one parameter is logged and the loop moves on to the
    next one.

    Args:
        model: Object exposing ``named_parameters()``.
    """
    size = float(dist.get_world_size())
    for name, param in model.named_parameters():
        grad = param.grad
        if grad is None:
            # Nothing to reduce. Previously this case raised inside the
            # try block and then again inside the handler, which
            # dereferenced ``param.grad`` a second time.
            continue
        try:
            # ``dist.reduce_op`` is deprecated; ``ReduceOp`` replaces it.
            dist.all_reduce(grad.data, op=dist.ReduceOp.SUM)
            grad.data /= size
        except Exception as e:
            logger.error('Error when all_reduce parameter {}, size={}, grad_type={}, error message {}'.format(
                name, param.size(), grad.data.dtype, repr(e)
            ))

开发者ID:dolphin-zs，项目名称:Doc2EDAG，代码行数:13，代码来源:base_task.py

示例15: get_world_size

# 需要导入模块: from torch import distributed [as 别名]
# 或者: from torch.distributed import get_world_size [as 别名]
def get_world_size():
    """Return the number of distributed processes.

    Falls back to 1 when the distributed package is unavailable or has
    not been initialised.
    """
    if dist.is_available() and dist.is_initialized():
        return dist.get_world_size()
    return 1

开发者ID:Res2Net，项目名称:Res2Net-maskrcnn，代码行数:8，代码来源:comm.py

注：本文中的torch.distributed.get_world_size方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。