

Python distributed.get_backend method code examples

This article collects typical usage examples of the Python method torch.distributed.get_backend. If you are wondering what distributed.get_backend does, how to call it, or what real-world uses look like, the curated examples below should help. You can also explore further usage examples from the torch.distributed module itself.


Below are 15 code examples of the distributed.get_backend method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
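
Before working through the examples, here is a minimal, hedged sketch (not taken from any of the projects below) of the pattern most of them rely on: initialize the default process group, then call torch.distributed.get_backend() to decide whether tensors should live on the CPU (gloo) or on the GPU (nccl). The single-process setup, the environment-variable values, and the choice of the gloo backend are assumptions made purely for illustration.

import os

import torch
import torch.distributed as dist

# Minimal single-process setup; in a real job the launcher sets these variables.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")

# gloo is used here because it works without GPUs; nccl is typical for GPU training.
dist.init_process_group(backend="gloo", rank=0, world_size=1)

backend = dist.get_backend()  # backend name of the default group, e.g. "gloo" or "nccl"
device = torch.device("cpu" if backend == "gloo" else "cuda")
print(f"backend={backend}, placing tensors on {device}")

dist.destroy_process_group()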

Example 1: _serialize_to_tensor

# Required module import: from torch import distributed [as an alias]
# Alternatively: from torch.distributed import get_backend [as an alias]
def _serialize_to_tensor(data, group):
    backend = dist.get_backend(group)
    assert backend in ["gloo", "nccl"]
    device = torch.device("cpu" if backend == "gloo" else "cuda")

    buffer = pickle.dumps(data)
    if len(buffer) > 1024 ** 3:
        logger = logging.getLogger(__name__)
        logger.warning(
            "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
                get_rank(), len(buffer) / (1024 ** 3), device
            )
        )
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to(device=device)
    return tensor 
Developer: JDAI-CV, Project: fast-reid, Lines of code: 18, Source file: comm.py

Example 2: spmd_main

# Required module import: from torch import distributed [as an alias]
# Alternatively: from torch.distributed import get_backend [as an alias]
def spmd_main(local_world_size, local_rank):
    # These are the parameters used to initialize the process group
    env_dict = {
        key: os.environ[key]
        for key in ("MASTER_ADDR", "MASTER_PORT", "RANK", "WORLD_SIZE")
    }
    print(f"[{os.getpid()}] Initializing process group with: {env_dict}")
    dist.init_process_group(backend="nccl")
    print(
        f"[{os.getpid()}]: world_size = {dist.get_world_size()}, "
        + f"rank = {dist.get_rank()}, backend={dist.get_backend()}"
    )

    demo_basic(local_world_size, local_rank)

    # Tear down the process group
    dist.destroy_process_group() 
Developer: pytorch, Project: examples, Lines of code: 19, Source file: example.py

Example 3: _test__native_dist_model_create_from_backend_no_dist

# Required module import: from torch import distributed [as an alias]
# Alternatively: from torch.distributed import get_backend [as an alias]
def _test__native_dist_model_create_from_backend_no_dist(backend, true_device):
    from datetime import timedelta

    model = _NativeDistModel.create_from_backend(backend=backend, timeout=timedelta(seconds=20))

    assert dist.is_available() and dist.is_initialized()
    assert dist.get_backend() == backend

    _assert_model(
        model,
        {
            "device": true_device,
            "local_rank": 0,
            "rank": 0,
            "world_size": 1,
            "node_index": 0,
            "nnodes": 1,
            "nproc_per_node": 1,
        },
    )

    model.finalize() 
Developer: pytorch, Project: ignite, Lines of code: 24, Source file: test_native.py

Example 4: reduce

# Required module import: from torch import distributed [as an alias]
# Alternatively: from torch.distributed import get_backend [as an alias]
def reduce(self, op):
        """
        Reduces average value over all workers.

        :param op: 'sum' or 'mean', reduction operator
        """
        if op not in ('sum', 'mean'):
            raise NotImplementedError

        distributed = (get_world_size() > 1)
        if distributed:
            if hasattr(dist, "get_backend"):
                backend = dist.get_backend()
            else:
                backend = dist._backend

            cuda = (backend == dist.dist_backend.NCCL)

            if cuda:
                avg = torch.cuda.FloatTensor([self.avg])
                _sum = torch.cuda.FloatTensor([self.sum])
            else:
                avg = torch.FloatTensor([self.avg])
                _sum = torch.FloatTensor([self.sum])
            dist.all_reduce(avg, op=dist.reduce_op.SUM)
            dist.all_reduce(_sum, op=dist.reduce_op.SUM)
            self.avg = avg.item()
            self.sum = _sum.item()

            if op == 'mean':
                self.avg /= get_world_size()
                self.sum /= get_world_size() 
Developer: mlperf, Project: training_results_v0.5, Lines of code: 34, Source file: utils.py

Example 5: get_distributed_backend

# Required module import: from torch import distributed [as an alias]
# Alternatively: from torch.distributed import get_backend [as an alias]
def get_distributed_backend(self):
        """Returns name of torch.distributed backend used."""
        assert dist.is_initialized(), "initialize the communicator first"
        return dist.get_backend() 
Developer: facebookresearch, Project: CrypTen, Lines of code: 6, Source file: distributed_communicator.py

Example 6: all_gather_coalesced

# Required module import: from torch import distributed [as an alias]
# Alternatively: from torch.distributed import get_backend [as an alias]
def all_gather_coalesced(tensors, buffer_size=256 * MB):
        assert dist.get_backend() == dist.dist_backend.NCCL  # gloo gives some weird device error
        world_size = dist.get_world_size()
        rcv_lsts = [[] for _ in range(world_size)]
        for tensors in _take_tensors(tensors, buffer_size):
            flat_tensors = _flatten_dense_tensors(tensors)
            tmp_rcv_lst = [torch.empty_like(flat_tensors) for _ in range(world_size)]
            dist.all_gather(tmp_rcv_lst, flat_tensors)
            for i, rcv_flat_tensors in enumerate(tmp_rcv_lst):
                for rcv_t in _unflatten_dense_tensors(rcv_flat_tensors, tensors):
                    rcv_lsts[i].append(rcv_t)
        return rcv_lsts 
Developer: SsnL, Project: dataset-distillation, Lines of code: 14, Source file: distributed.py

Example 7: _get_global_gloo_group

# Required module import: from torch import distributed [as an alias]
# Alternatively: from torch.distributed import get_backend [as an alias]
def _get_global_gloo_group():
    """
    Return a process group based on the gloo backend, containing all the ranks.
    The result is cached.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD 
Developer: JDAI-CV, Project: fast-reid, Lines of code: 11, Source file: comm.py
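
A hedged usage sketch, not part of the fast-reid source: this helper is typically combined with _serialize_to_tensor from Example 1 (both live in comm.py) so that arbitrary picklable objects can be gathered over a gloo group even when the default backend is nccl, since gloo handles CPU byte tensors. In the upstream source the helper is usually wrapped in a caching decorator such as functools.lru_cache, which the excerpt above omits; the payload below is likewise made up for illustration.

# Hypothetical caller; assumes the default process group is initialized and that
# _get_global_gloo_group, _serialize_to_tensor, and get_rank from comm.py are in scope.
payload = {"rank": get_rank(), "metrics": [0.1, 0.2]}  # any picklable object
group = _get_global_gloo_group()               # gloo group, even if the default backend is nccl
tensor = _serialize_to_tensor(payload, group)  # CPU ByteTensor when the group backend is gloo
print("rank", get_rank(), "serialized", tensor.numel(), "bytes on", tensor.device)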

Example 8: _get_global_gloo_group

# Required module import: from torch import distributed [as an alias]
# Alternatively: from torch.distributed import get_backend [as an alias]
def _get_global_gloo_group():
    """
    Return a process group based on the gloo backend, containing all the ranks.
    The result is cached.
    Returns:
        (group): pytorch dist group.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD 
Developer: facebookresearch, Project: SlowFast, Lines of code: 13, Source file: distributed.py

Example 9: _serialize_to_tensor

# Required module import: from torch import distributed [as an alias]
# Alternatively: from torch.distributed import get_backend [as an alias]
def _serialize_to_tensor(data, group):
    """
    Serialize arbitrary picklable data to a ByteTensor. Note that only the
        `gloo` and `nccl` backends are supported.
    Args:
        data (data): data to be serialized.
        group (group): pytorch dist group.
    Returns:
        tensor (ByteTensor): tensor that serialized.
    """

    backend = dist.get_backend(group)
    assert backend in ["gloo", "nccl"]
    device = torch.device("cpu" if backend == "gloo" else "cuda")

    buffer = pickle.dumps(data)
    if len(buffer) > 1024 ** 3:
        logger = logging.getLogger(__name__)
        logger.warning(
            "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
                get_rank(), len(buffer) / (1024 ** 3), device
            )
        )
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to(device=device)
    return tensor 
Developer: facebookresearch, Project: SlowFast, Lines of code: 28, Source file: distributed.py

Example 10: backend

# Required module import: from torch import distributed [as an alias]
# Alternatively: from torch.distributed import get_backend [as an alias]
def backend(self) -> str:
            return dist.get_backend() 
Developer: pytorch, Project: ignite, Lines of code: 4, Source file: native.py

Example 11: _test__native_dist_model_create_from_backend_dist

# Required module import: from torch import distributed [as an alias]
# Alternatively: from torch.distributed import get_backend [as an alias]
def _test__native_dist_model_create_from_backend_dist(local_rank, rank, world_size, backend, true_device):
    import os
    from datetime import timedelta

    timeout = timedelta(seconds=20)
    os.environ["RANK"] = "{}".format(rank)

    assert "MASTER_ADDR" not in os.environ
    assert "MASTER_PORT" not in os.environ

    model = _NativeDistModel.create_from_backend(backend=backend, timeout=timeout)

    assert dist.is_available() and dist.is_initialized()
    assert dist.get_backend() == backend

    with pytest.raises(RuntimeError, match=r"Can not create new distributed process group if default one is"):
        _NativeDistModel.create_from_backend(backend=backend, timeout=timeout)

    _assert_model(
        model,
        {
            "device": true_device,
            "local_rank": local_rank,
            "rank": rank,
            "world_size": world_size,
            "node_index": 0,
            "nnodes": 1,
            "nproc_per_node": world_size,
        },
    )

    model.finalize()

    del os.environ["RANK"]

    assert "MASTER_ADDR" not in os.environ
    assert "MASTER_PORT" not in os.environ
    assert "RANK" not in os.environ 
Developer: pytorch, Project: ignite, Lines of code: 40, Source file: test_native.py

Example 12: _test_dist_spawn_fn

# Required module import: from torch import distributed [as an alias]
# Alternatively: from torch.distributed import get_backend [as an alias]
def _test_dist_spawn_fn(local_rank, backend, world_size, device):
    from ignite.distributed.utils import _model

    assert dist.is_available() and dist.is_initialized()
    assert dist.get_backend() == backend

    assert isinstance(_model, _NativeDistModel), "{} vs _NativeDistModel".format(type(_model))

    assert _model.get_local_rank() == local_rank
    assert _model.get_world_size() == world_size
    if backend == "nccl":
        assert _model.device() == torch.device("{}:{}".format(device, local_rank))
    elif backend == "gloo":
        assert _model.device() == torch.device(device) 
Developer: pytorch, Project: ignite, Lines of code: 16, Source file: test_native.py

Example 13: _serialize_to_tensor

# Required module import: from torch import distributed [as an alias]
# Alternatively: from torch.distributed import get_backend [as an alias]
def _serialize_to_tensor(data, group):
    backend = dist.get_backend(group)
    assert backend in ["gloo", "nccl"]
    device = torch.device("cpu" if backend == "gloo" else "cuda")

    buffer = pickle.dumps(data)
    if len(buffer) > 1024**3:
        logger.warning(
            "Rank {} trying to all-gather {:.2f} GB of data on device {}".
            format(get_rank(),
                   len(buffer) / (1024**3), device))
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to(device=device)
    return tensor 
Developer: MegviiDetection, Project: video_analyst, Lines of code: 16, Source file: dist_utils.py

Example 14: peak_memory_mb

# Required module import: from torch import distributed [as an alias]
# Alternatively: from torch.distributed import get_backend [as an alias]
def peak_memory_mb() -> Dict[int, float]:
    """
    Get peak memory usage for each worker, as measured by max-resident-set size:

    https://unix.stackexchange.com/questions/30940/getrusage-system-call-what-is-maximum-resident-set-size

    Only works on OSX and Linux, otherwise the result will be 0.0 for every worker.
    """
    if resource is None or sys.platform not in ("linux", "darwin"):
        peak_mb = 0.0
    else:
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        if sys.platform == "darwin":
            # On OSX the result is in bytes.
            peak_mb = peak / 1_000_000
        else:
            # On Linux the result is in kilobytes.
            peak_mb = peak / 1_000

    if is_distributed():
        global_rank = dist.get_rank()
        world_size = dist.get_world_size()

        peak_mb_tensor = torch.tensor([float(global_rank), peak_mb])
        # All of these tensors will be gathered into this list.
        gather_results = [torch.tensor([0.0, 0.0]) for _ in range(world_size)]

        # If the backend is 'nccl', this means we're training on GPUs, so these tensors
        # need to be on GPU.
        if dist.get_backend() == "nccl":
            peak_mb_tensor = peak_mb_tensor.cuda()
            gather_results = [x.cuda() for x in gather_results]

        dist.all_gather(gather_results, peak_mb_tensor)

        results_dict: Dict[int, float] = {}
        for peak_mb_tensor in gather_results:
            worker = int(peak_mb_tensor[0])
            peak_mb = round(float(peak_mb_tensor[1]), 3)
            results_dict[worker] = peak_mb

        return results_dict
    else:
        return {0: peak_mb} 
Developer: allenai, Project: allennlp, Lines of code: 46, Source file: util.py

Example 15: reduce

# Required module import: from torch import distributed [as an alias]
# Alternatively: from torch.distributed import get_backend [as an alias]
def reduce(self, op):
        """
        Reduces average value over all workers.

        :param op: 'sum' or 'mean', reduction operator
        """
        if op not in ('sum', 'mean'):
            raise NotImplementedError

        distributed = (get_world_size() > 1)
        if distributed:
            # Backward/forward compatibility around
            # https://github.com/pytorch/pytorch/commit/540ef9b1fc5506369a48491af8a285a686689b36 and
            # https://github.com/pytorch/pytorch/commit/044d00516ccd6572c0d6ab6d54587155b02a3b86
            # to accommodate the change in PyTorch's distributed API
            if hasattr(dist, "get_backend"):
                _backend = dist.get_backend()
                if hasattr(dist, "DistBackend"):
                    backend_enum_holder = dist.DistBackend
                else:
                    backend_enum_holder = dist.Backend
            else:
                _backend = dist._backend
                backend_enum_holder = dist.dist_backend

            cuda = _backend == backend_enum_holder.NCCL

            if cuda:
                avg = torch.cuda.FloatTensor([self.avg])
                _sum = torch.cuda.FloatTensor([self.sum])
            else:
                avg = torch.FloatTensor([self.avg])
                _sum = torch.FloatTensor([self.sum])

            _reduce_op = dist.reduce_op if hasattr(dist, "reduce_op") else dist.ReduceOp
            dist.all_reduce(avg, op=_reduce_op.SUM)
            dist.all_reduce(_sum, op=_reduce_op.SUM)
            self.avg = avg.item()
            self.sum = _sum.item()

            if op == 'mean':
                self.avg /= get_world_size()
                self.sum /= get_world_size() 
Developer: mlperf, Project: training, Lines of code: 45, Source file: utils.py
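
For comparison, here is a minimal sketch (not part of the mlperf source) of what the same reduction looks like on a recent PyTorch release, where dist.get_backend(), dist.Backend, and dist.ReduceOp are always available, so the compatibility branches above collapse away. The standalone-function form and the name reduce_meter are assumptions made for illustration.

import torch
import torch.distributed as dist

def reduce_meter(avg, total, op="mean"):
    """Sum `avg` and `total` across workers; divide by world size when op == 'mean'."""
    if op not in ("sum", "mean"):
        raise NotImplementedError
    if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1:
        # nccl requires CUDA tensors, while gloo reduces CPU tensors.
        device = "cuda" if dist.get_backend() == dist.Backend.NCCL else "cpu"
        buf = torch.tensor([avg, total], dtype=torch.float32, device=device)
        dist.all_reduce(buf, op=dist.ReduceOp.SUM)
        avg, total = buf[0].item(), buf[1].item()
        if op == "mean":
            avg /= dist.get_world_size()
            total /= dist.get_world_size()
    return avg, total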


Note: the torch.distributed.get_backend examples in this article were collected by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by various developers, and copyright of the source code remains with the original authors. Please refer to each project's License before distributing or reusing the code; do not reproduce this article without permission.