This article collects typical usage examples of the Python method torch.distributed.get_backend. If you are unsure what distributed.get_backend does, how to call it, or what real code using it looks like, the curated examples below should help. You can also read further about the containing module, torch.distributed.
A total of 15 code examples of distributed.get_backend are shown below, sorted by popularity by default.
Example 1: _serialize_to_tensor
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import get_backend [as alias]
def _serialize_to_tensor(data, group):
    backend = dist.get_backend(group)
    assert backend in ["gloo", "nccl"]
    device = torch.device("cpu" if backend == "gloo" else "cuda")

    buffer = pickle.dumps(data)
    if len(buffer) > 1024 ** 3:
        logger = logging.getLogger(__name__)
        logger.warning(
            "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
                get_rank(), len(buffer) / (1024 ** 3), device
            )
        )
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to(device=device)
    return tensor
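A minimal sketch (my addition, not part of the collected examples) of how a helper like _serialize_to_tensor is typically consumed: pair it with dist.all_gather to exchange arbitrary picklable objects between ranks. The wrapper name all_gather_objects and the padding logic are illustrative.

import pickle
import torch
import torch.distributed as dist

def all_gather_objects(data, group=None):
    """Illustrative driver: returns a list with every rank's `data`."""
    world_size = dist.get_world_size(group=group)
    if world_size == 1:
        return [data]
    tensor = _serialize_to_tensor(data, group)  # helper from Example 1

    # Payload sizes differ across ranks, so exchange sizes first.
    local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device)
    size_list = [torch.zeros(1, dtype=torch.int64, device=tensor.device) for _ in range(world_size)]
    dist.all_gather(size_list, local_size, group=group)
    max_size = int(max(s.item() for s in size_list))

    # Pad to the largest payload so all_gather sees equally shaped tensors.
    if tensor.numel() < max_size:
        padding = torch.zeros(max_size - tensor.numel(), dtype=torch.uint8, device=tensor.device)
        tensor = torch.cat((tensor, padding), dim=0)

    output = [torch.empty(max_size, dtype=torch.uint8, device=tensor.device) for _ in range(world_size)]
    dist.all_gather(output, tensor, group=group)

    return [
        pickle.loads(buf[: int(size.item())].cpu().numpy().tobytes())
        for size, buf in zip(size_list, output)
    ]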
Example 2: spmd_main
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import get_backend [as alias]
def spmd_main(local_world_size, local_rank):
    # These are the parameters used to initialize the process group
    env_dict = {
        key: os.environ[key]
        for key in ("MASTER_ADDR", "MASTER_PORT", "RANK", "WORLD_SIZE")
    }
    print(f"[{os.getpid()}] Initializing process group with: {env_dict}")
    dist.init_process_group(backend="nccl")
    print(
        f"[{os.getpid()}]: world_size = {dist.get_world_size()}, "
        + f"rank = {dist.get_rank()}, backend={dist.get_backend()}"
    )

    demo_basic(local_world_size, local_rank)

    # Tear down the process group
    dist.destroy_process_group()
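For context (an illustrative sketch, not from the collected source): an entry point like spmd_main relies on a launcher such as torchrun (or python -m torch.distributed.launch) to populate MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE and LOCAL_RANK before it runs. A hypothetical driver:

import argparse
import os

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_world_size", type=int, default=1)
    args = parser.parse_args()
    # LOCAL_RANK is set per process by the launcher; default to 0 for single-process runs.
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    spmd_main(args.local_world_size, local_rank)

# Launched, for example, with:
#   torchrun --nnodes=1 --nproc_per_node=4 example.py --local_world_size=4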
Example 3: _test__native_dist_model_create_from_backend_no_dist
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import get_backend [as alias]
def _test__native_dist_model_create_from_backend_no_dist(backend, true_device):
    from datetime import timedelta

    model = _NativeDistModel.create_from_backend(backend=backend, timeout=timedelta(seconds=20))

    assert dist.is_available() and dist.is_initialized()
    assert dist.get_backend() == backend

    _assert_model(
        model,
        {
            "device": true_device,
            "local_rank": 0,
            "rank": 0,
            "world_size": 1,
            "node_index": 0,
            "nnodes": 1,
            "nproc_per_node": 1,
        },
    )

    model.finalize()
Example 4: reduce
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import get_backend [as alias]
def reduce(self, op):
    """
    Reduces average value over all workers.

    :param op: 'sum' or 'mean', reduction operator
    """
    if op not in ('sum', 'mean'):
        raise NotImplementedError

    distributed = (get_world_size() > 1)
    if distributed:
        if hasattr(dist, "get_backend"):
            backend = dist.get_backend()
        else:
            backend = dist._backend
        cuda = (backend == dist.dist_backend.NCCL)

        if cuda:
            avg = torch.cuda.FloatTensor([self.avg])
            _sum = torch.cuda.FloatTensor([self.sum])
        else:
            avg = torch.FloatTensor([self.avg])
            _sum = torch.FloatTensor([self.sum])

        dist.all_reduce(avg, op=dist.reduce_op.SUM)
        dist.all_reduce(_sum, op=dist.reduce_op.SUM)
        self.avg = avg.item()
        self.sum = _sum.item()

        if op == 'mean':
            self.avg /= get_world_size()
            self.sum /= get_world_size()
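The backend probing above targets very old torch.distributed attributes (dist.dist_backend, dist.reduce_op). As a point of reference, here is a minimal standalone sketch (my own, assuming PyTorch 1.x or later) of the same idea written against the current API, where dist.get_backend(), dist.Backend.NCCL and dist.ReduceOp.SUM replace the legacy names:

import torch
import torch.distributed as dist

def reduce_scalar(value: float, op: str = "sum") -> float:
    """All-reduce a Python float across workers; 'mean' divides by the world size."""
    if not (dist.is_available() and dist.is_initialized()) or dist.get_world_size() == 1:
        return value
    # NCCL only communicates CUDA tensors, so pick the device from the backend.
    device = "cuda" if dist.get_backend() == dist.Backend.NCCL else "cpu"
    tensor = torch.tensor([value], dtype=torch.float32, device=device)
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
    result = tensor.item()
    if op == "mean":
        result /= dist.get_world_size()
    return result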
Example 5: get_distributed_backend
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import get_backend [as alias]
def get_distributed_backend(self):
    """Returns name of torch.distributed backend used."""
    assert dist.is_initialized(), "initialize the communicator first"
    return dist.get_backend()
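A small standalone sketch (my addition; the single-process gloo group and the TCP address are purely illustrative) showing what such a getter ultimately returns: dist.get_backend() yields a plain lowercase string that compares equal to the dist.Backend constants.

import torch.distributed as dist

if not dist.is_initialized():
    # One-process group just for demonstration purposes.
    dist.init_process_group(
        backend="gloo",
        init_method="tcp://127.0.0.1:29500",
        rank=0,
        world_size=1,
    )

backend = dist.get_backend()
print(backend)                       # "gloo"
print(backend == dist.Backend.GLOO)  # True: Backend members are lowercase strings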
Example 6: all_gather_coalesced
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import get_backend [as alias]
def all_gather_coalesced(tensors, buffer_size=256 * MB):
    assert dist.get_backend() == dist.dist_backend.NCCL  # gloo gives some weird device error
    world_size = dist.get_world_size()
    rcv_lsts = [[] for _ in range(world_size)]
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        tmp_rcv_lst = [torch.empty_like(flat_tensors) for _ in range(world_size)]
        dist.all_gather(tmp_rcv_lst, flat_tensors)
        for i, rcv_flat_tensors in enumerate(tmp_rcv_lst):
            for rcv_t in _unflatten_dense_tensors(rcv_flat_tensors, tensors):
                rcv_lsts[i].append(rcv_t)
    return rcv_lsts
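The snippet above leans on several private torch helpers and an MB constant that are not shown. A sketch (my assumption, not part of the original extract) of the module-level setup it presumably relies on, plus an illustrative call:

import torch
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _take_tensors, _unflatten_dense_tensors

MB = 1024 * 1024  # buffer_size is expressed in bytes

# Illustrative usage: every rank ends up with one list per source rank,
# e.g. gathered[0] holds rank 0's parameter tensors.
# gathered = all_gather_coalesced([p.data for p in model.parameters()])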
Example 7: _get_global_gloo_group
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import get_backend [as alias]
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks.
    The result is cached.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD
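Why this helper exists: collectives that move pickled Python objects operate on CPU byte tensors, which the nccl backend cannot handle, so a side gloo group spanning all ranks is created and reused. A minimal sketch (my own; the functools.lru_cache decorator supplies the caching the docstring mentions, and dist.all_gather_object assumes PyTorch 1.8+):

import functools
import torch.distributed as dist

@functools.lru_cache()  # cache so new_group() is only called once per process
def _get_global_gloo_group():
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    return dist.group.WORLD

def gather_any(obj):
    """Illustrative: gather arbitrary picklable objects via the gloo side group."""
    group = _get_global_gloo_group()
    output = [None] * dist.get_world_size(group=group)
    dist.all_gather_object(output, obj, group=group)
    return output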
Example 8: _get_global_gloo_group
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import get_backend [as alias]
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks.
    The result is cached.

    Returns:
        (group): pytorch dist group.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD
Example 9: _serialize_to_tensor
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import get_backend [as alias]
def _serialize_to_tensor(data, group):
    """
    Serialize the data to a ByteTensor. Note that only the `gloo` and `nccl`
    backends are supported.

    Args:
        data (data): data to be serialized.
        group (group): pytorch dist group.
    Returns:
        tensor (ByteTensor): the serialized tensor.
    """
    backend = dist.get_backend(group)
    assert backend in ["gloo", "nccl"]
    device = torch.device("cpu" if backend == "gloo" else "cuda")

    buffer = pickle.dumps(data)
    if len(buffer) > 1024 ** 3:
        logger = logging.getLogger(__name__)
        logger.warning(
            "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
                get_rank(), len(buffer) / (1024 ** 3), device
            )
        )
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to(device=device)
    return tensor
Example 10: backend
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import get_backend [as alias]
def backend(self) -> str:
    return dist.get_backend()
Example 11: _test__native_dist_model_create_from_backend_dist
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import get_backend [as alias]
def _test__native_dist_model_create_from_backend_dist(local_rank, rank, world_size, backend, true_device):
    import os
    from datetime import timedelta

    timeout = timedelta(seconds=20)
    os.environ["RANK"] = "{}".format(rank)

    assert "MASTER_ADDR" not in os.environ
    assert "MASTER_PORT" not in os.environ

    model = _NativeDistModel.create_from_backend(backend=backend, timeout=timeout)

    assert dist.is_available() and dist.is_initialized()
    assert dist.get_backend() == backend

    with pytest.raises(RuntimeError, match=r"Can not create new distributed process group if default one is"):
        _NativeDistModel.create_from_backend(backend=backend, timeout=timeout)

    _assert_model(
        model,
        {
            "device": true_device,
            "local_rank": local_rank,
            "rank": rank,
            "world_size": world_size,
            "node_index": 0,
            "nnodes": 1,
            "nproc_per_node": world_size,
        },
    )

    model.finalize()

    del os.environ["RANK"]

    assert "MASTER_ADDR" not in os.environ
    assert "MASTER_PORT" not in os.environ
    assert "RANK" not in os.environ
Example 12: _test_dist_spawn_fn
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import get_backend [as alias]
def _test_dist_spawn_fn(local_rank, backend, world_size, device):
    from ignite.distributed.utils import _model

    assert dist.is_available() and dist.is_initialized()
    assert dist.get_backend() == backend

    assert isinstance(_model, _NativeDistModel), "{} vs _NativeDistModel".format(type(_model))

    assert _model.get_local_rank() == local_rank
    assert _model.get_world_size() == world_size
    if backend == "nccl":
        assert _model.device() == torch.device("{}:{}".format(device, local_rank))
    elif backend == "gloo":
        assert _model.device() == torch.device(device)
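How a worker function like this is typically launched: a sketch only, assuming Ignite's ignite.distributed.spawn helper, which initializes the default process group in each child before invoking the function with its local rank. The "gloo" backend and nproc_per_node value here are illustrative.

import ignite.distributed as idist

nproc_per_node = 4
idist.spawn(
    "gloo",
    _test_dist_spawn_fn,
    args=("gloo", nproc_per_node, "cpu"),  # backend, world_size, device
    nproc_per_node=nproc_per_node,
)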
Example 13: _serialize_to_tensor
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import get_backend [as alias]
def _serialize_to_tensor(data, group):
    backend = dist.get_backend(group)
    assert backend in ["gloo", "nccl"]
    device = torch.device("cpu" if backend == "gloo" else "cuda")

    buffer = pickle.dumps(data)
    if len(buffer) > 1024**3:
        logger.warning(
            "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
                get_rank(), len(buffer) / (1024**3), device))
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to(device=device)
    return tensor
Example 14: peak_memory_mb
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import get_backend [as alias]
def peak_memory_mb() -> Dict[int, float]:
    """
    Get peak memory usage for each worker, as measured by max-resident-set size:
    https://unix.stackexchange.com/questions/30940/getrusage-system-call-what-is-maximum-resident-set-size
    Only works on OSX and Linux, otherwise the result will be 0.0 for every worker.
    """
    if resource is None or sys.platform not in ("linux", "darwin"):
        peak_mb = 0.0
    else:
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        if sys.platform == "darwin":
            # On OSX the result is in bytes.
            peak_mb = peak / 1_000_000
        else:
            # On Linux the result is in kilobytes.
            peak_mb = peak / 1_000
    if is_distributed():
        global_rank = dist.get_rank()
        world_size = dist.get_world_size()

        peak_mb_tensor = torch.tensor([float(global_rank), peak_mb])
        # All of these tensors will be gathered into this list.
        gather_results = [torch.tensor([0.0, 0.0]) for _ in range(world_size)]

        # If the backend is 'nccl', this means we're training on GPUs, so these tensors
        # need to be on GPU.
        if dist.get_backend() == "nccl":
            peak_mb_tensor = peak_mb_tensor.cuda()
            gather_results = [x.cuda() for x in gather_results]

        dist.all_gather(gather_results, peak_mb_tensor)

        results_dict: Dict[int, float] = {}
        for peak_mb_tensor in gather_results:
            worker = int(peak_mb_tensor[0])
            peak_mb = round(float(peak_mb_tensor[1]), 3)
            results_dict[worker] = peak_mb

        return results_dict
    else:
        return {0: peak_mb}
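A sketch of the kind of module-level setup the function above assumes (my reconstruction, not part of the extract): the resource module is POSIX-only and is imported behind a guard, and is_distributed() simply checks whether a default process group exists.

import sys
from typing import Dict

import torch
import torch.distributed as dist

try:
    import resource
except ImportError:
    # resource does not exist on Windows; peak_memory_mb then reports 0.0.
    resource = None

def is_distributed() -> bool:
    return dist.is_available() and dist.is_initialized()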
Example 15: reduce
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import get_backend [as alias]
def reduce(self, op):
    """
    Reduces average value over all workers.

    :param op: 'sum' or 'mean', reduction operator
    """
    if op not in ('sum', 'mean'):
        raise NotImplementedError

    distributed = (get_world_size() > 1)
    if distributed:
        # Backward/forward compatibility around
        # https://github.com/pytorch/pytorch/commit/540ef9b1fc5506369a48491af8a285a686689b36 and
        # https://github.com/pytorch/pytorch/commit/044d00516ccd6572c0d6ab6d54587155b02a3b86
        # To accommodate the change in PyTorch's distributed API
        if hasattr(dist, "get_backend"):
            _backend = dist.get_backend()
            if hasattr(dist, "DistBackend"):
                backend_enum_holder = dist.DistBackend
            else:
                backend_enum_holder = dist.Backend
        else:
            _backend = dist._backend
            backend_enum_holder = dist.dist_backend

        cuda = _backend == backend_enum_holder.NCCL

        if cuda:
            avg = torch.cuda.FloatTensor([self.avg])
            _sum = torch.cuda.FloatTensor([self.sum])
        else:
            avg = torch.FloatTensor([self.avg])
            _sum = torch.FloatTensor([self.sum])

        _reduce_op = dist.reduce_op if hasattr(dist, "reduce_op") else dist.ReduceOp
        dist.all_reduce(avg, op=_reduce_op.SUM)
        dist.all_reduce(_sum, op=_reduce_op.SUM)
        self.avg = avg.item()
        self.sum = _sum.item()

        if op == 'mean':
            self.avg /= get_world_size()
            self.sum /= get_world_size()