This article collects typical usage examples of Python's torch.distributed.barrier. If you are unsure how to use distributed.barrier, or are looking for concrete examples of it in practice, the curated code samples below may help. You can also explore further usage examples from the containing module, torch.distributed.
The following presents 15 code examples of distributed.barrier, sorted by popularity by default.
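Before the individual examples, here is a minimal, self-contained sketch of the most common barrier pattern: rank 0 performs some one-time setup while the other processes wait at the barrier. The process-group settings and the prepare_once() helper are illustrative assumptions, not taken from any of the examples below.

import torch.distributed as dist

def main():
    # assumes launch via torchrun, which sets RANK / WORLD_SIZE / MASTER_ADDR / MASTER_PORT
    dist.init_process_group(backend="gloo", init_method="env://")
    if dist.get_rank() == 0:
        prepare_once()  # hypothetical one-time setup (e.g. downloading a dataset)
    dist.barrier()      # all other ranks block here until rank 0 has finished
    # ... the actual distributed work happens on every rank from here on ...
    dist.destroy_process_group()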
Example 1: init_distributed_mode
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import barrier [as alias]
def init_distributed_mode(args):
    # prefer the environment variables set by torch.distributed launchers
    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])
    elif 'SLURM_PROCID' in os.environ:
        args.rank = int(os.environ['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()
    else:
        print('Not using distributed mode')
        args.distributed = False
        return

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('| distributed init (rank {}): {}'.format(
        args.rank, args.dist_url), flush=True)
    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                         world_size=args.world_size, rank=args.rank)
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)
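setup_for_distributed is called but not defined in this example. In the torchvision/DETR reference code it typically just silences print on non-master processes; a minimal sketch under that assumption:

import builtins

def setup_for_distributed(is_master):
    # sketch of the assumed helper: only the master process keeps printing
    builtin_print = builtins.print

    def wrapped_print(*args, **kwargs):
        force = kwargs.pop("force", False)
        if is_master or force:
            builtin_print(*args, **kwargs)

    builtins.print = wrapped_print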
Example 2: step
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import barrier [as alias]
def step(self, loss):
    self.optimizer.zero_grad()
    loss.backward()

    # all-reduce gradients across workers (optionally averaged below)
    dist.barrier()
    handles = []
    for param in self.network.parameters():
        handles.append(dist.all_reduce(param.grad, async_op=True))
    for handle in handles:
        handle.wait()
    if self.divide_grad:
        for param in self.network.parameters():
            param.grad.mul_(1.0 / self.world_sz)
    if self.grad_norm_clip:
        nn.utils.clip_grad_norm_(
            self.network.parameters(), self.grad_norm_clip
        )
    self.optimizer.step()
Example 3: step
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import barrier [as alias]
def step(self):
    print(f"learner {self.rank} step")

    # make sure exp_handles are done
    for handle in self.exp_handles:
        handle.wait()

    # batch together exp
    time.sleep(random.randint(0, 3))

    # update with other learners
    dist.barrier(self.learner_group)
    for p in self.network_grads:
        dist.all_reduce(p, group=self.learner_group)
    print(f"learner {self.rank} shared gradients")
    return True
Example 4: receive_tensor_helper
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import barrier [as alias]
def receive_tensor_helper(tensor, src_rank, group, tag, num_iterations,
                          broadcast):
    dist.barrier()
    start_time = time.time()
    for i in range(num_iterations):
        if broadcast:
            dist.broadcast(tensor=tensor, group=group, src=src_rank)
        else:
            dist.recv(tensor=tensor.cpu(), src=src_rank, tag=tag)
    end_time = time.time()
    dist.barrier()
    size = tensor.size()[0]
    # assumes 4-byte (float32) elements
    throughput = (size * 4. * num_iterations) / (
        (end_time - start_time) * 10**9)
    print("Time to receive %s MB: %.3f seconds" %
          ((size * 4.) / 10**6,
           (end_time - start_time) / num_iterations))
    print("Throughput: %.3f GB/s" % throughput)
Example 5: collect_results_cpu
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import barrier [as alias]
def collect_results_cpu(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, f'part_{i}.pkl')
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
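A hedged sketch of how a helper like this is normally driven from a distributed test loop; multi_gpu_test_sketch, model, and data_loader are placeholder names, not taken from the example:

def multi_gpu_test_sketch(model, data_loader, tmpdir=None):
    # each rank runs inference on its own shard of the dataset
    results = []
    for data in data_loader:
        with torch.no_grad():
            results.append(model(return_loss=False, rescale=True, **data))
    # merge the per-rank partial results; only rank 0 gets the full ordered list
    return collect_results_cpu(results, len(data_loader.dataset), tmpdir)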
Example 6: collect_results
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import barrier [as alias]
def collect_results(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, f'part_{i}.pkl')
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
Example 7: _init_summary_writer
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import barrier [as alias]
def _init_summary_writer(self):
    if self.is_master_node():
        self.logging('Init Summary Writer')
        current_time = datetime.now().strftime('%b%d_%H-%M-%S')
        sum_dir = '{}-{}'.format(self.setting.summary_dir_name, current_time)
        self.summary_writer = SummaryWriter(sum_dir)
        self.logging('Writing summary into {}'.format(sum_dir))
    if self.in_distributed_mode():
        # TODO: maybe this can be removed
        dist.barrier()
Example 8: synchronize
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import barrier [as alias]
def synchronize():
    """
    Helper function to synchronize (barrier) among all processes when
    using distributed training
    """
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    world_size = dist.get_world_size()
    if world_size == 1:
        return
    dist.barrier()
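A typical, purely illustrative call site: let rank 0 write a checkpoint and make every process wait for the file before reading it back. model and the checkpoint path are placeholders, not part of the example.

# illustrative only: "model" and the checkpoint path are placeholders
if not dist.is_initialized() or dist.get_rank() == 0:
    torch.save(model.state_dict(), "checkpoint.pth")
synchronize()  # no rank reads the file before rank 0 has finished writing it
state_dict = torch.load("checkpoint.pth", map_location="cpu")
model.load_state_dict(state_dict)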
Example 9: after_train_epoch
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import barrier [as alias]
def after_train_epoch(self, runner):
    if not self.every_n_epochs(runner, self.interval):
        return
    runner.model.eval()
    results = [None for _ in range(len(self.dataset))]
    if runner.rank == 0:
        prog_bar = mmcv.ProgressBar(len(self.dataset))
    for idx in range(runner.rank, len(self.dataset), runner.world_size):
        data = self.dataset[idx]
        data_gpu = scatter(
            collate([data], samples_per_gpu=1),
            [torch.cuda.current_device()])[0]

        # compute output
        with torch.no_grad():
            result = runner.model(
                return_loss=False, rescale=True, **data_gpu)
        results[idx] = result

        batch_size = runner.world_size
        if runner.rank == 0:
            for _ in range(batch_size):
                prog_bar.update()

    if runner.rank == 0:
        print('\n')
        dist.barrier()
        # rank 0 gathers the partial results dumped by the other ranks
        for i in range(1, runner.world_size):
            tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(i))
            tmp_results = mmcv.load(tmp_file)
            for idx in range(i, len(results), runner.world_size):
                results[idx] = tmp_results[idx]
            os.remove(tmp_file)
        self.evaluate(runner, results)
    else:
        # the other ranks dump their partial results and wait
        tmp_file = osp.join(runner.work_dir,
                            'temp_{}.pkl'.format(runner.rank))
        mmcv.dump(results, tmp_file)
        dist.barrier()
    dist.barrier()
Example 10: collect_results
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import barrier [as alias]
def collect_results(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
Example 11: summarize_mp
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import barrier [as alias]
def summarize_mp(predictions, annotations_file, img_list, log_dir, mask=False):
    # Write partial results to file (all workers)
    rank = dist.get_rank()
    with open(path.join(log_dir, "coco_ap_{:02d}.json".format(rank)), "w") as fid:
        json.dump(predictions, fid)
    with open(path.join(log_dir, "img_list_{:02d}.json".format(rank)), "w") as fid:
        json.dump(img_list, fid)
    dist.barrier()

    # Merge results from all workers and run evaluation (only rank 0)
    if rank == 0:
        predictions = []
        img_list = []
        for i in range(dist.get_world_size()):
            coco_ap_file = path.join(log_dir, "coco_ap_{:02d}.json".format(i))
            with open(coco_ap_file) as fid:
                predictions += json.load(fid)
            remove(coco_ap_file)

            img_list_file = path.join(log_dir, "img_list_{:02d}.json".format(i))
            with open(img_list_file) as fid:
                img_list += json.load(fid)
            remove(img_list_file)

        det_map, msk_map = summarize(predictions, annotations_file, img_list, mask)
    else:
        det_map, msk_map = 0, 0

    dist.barrier()
    return det_map, msk_map
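In this function only rank 0 ends up with the real metric values; the other ranks return zeros. If every rank needed the metrics (for example to drive a learning-rate schedule), one could, as an illustrative follow-up that is not part of the original code, broadcast them after the final barrier:

# hypothetical follow-up: share the metrics computed on rank 0 with every rank
maps = torch.tensor([det_map, msk_map], dtype=torch.float64, device="cuda")
dist.broadcast(maps, src=0)
det_map, msk_map = maps.tolist()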
Example 12: collect_results
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import barrier [as alias]
def collect_results(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN,), 32, dtype=torch.uint8, device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            # note: torch.tensor (lowercase) is required here; torch.Tensor does
            # not accept dtype/device keyword arguments
            tmpdir = torch.tensor(bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
Example 13: _barrier
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import barrier [as alias]
def _barrier(rank):
    logger.debug('Rank: {}, Waiting for other processes before the barrier.'.format(rank))
    dist.barrier()
    logger.debug('Rank: {}, Passing the barrier'.format(rank))
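Wrapping the barrier in a small logged helper like this is handy for fencing rank-0-only setup work; a hedged usage sketch with hypothetical helper names:

rank = dist.get_rank()
if rank == 0:
    download_and_cache_dataset()  # hypothetical one-time setup done by rank 0
_barrier(rank)                    # every other rank logs, waits, then continues
dataset = load_cached_dataset()   # hypothetical: all ranks read the cached copy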
Example 14: _do_validation
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import barrier [as alias]
def _do_validation(self):
    with self.ddp_model.no_sync():
        # the parameters are not updated during validation, so gradient
        # synchronization can be disabled
        self.callback_manager.on_valid_begin()
        eval_res = self.test_manager.on_valid_begin()
        eval_res = list(filter(lambda x: x is not None, eval_res))
        if len(eval_res):
            eval_res, is_better = list(zip(*eval_res))
            eval_res = eval_res[0]
            is_better = is_better[0]
        else:
            eval_res, is_better = None, None
        if self.metric_key is None and eval_res is not None:
            eval_res0 = list(eval_res.values())[0]
            self.metric_key = list(eval_res0.keys())[0]
        # logger.info('{}, {}'.format(eval_res, is_better))
        # save better model on master node
        if is_better is not None and self.cp_save_path:
            if is_better:
                self.save_check_point(self._best_save_name(), only_params=False)
    dist.barrier()

    if not self.is_master and self.metric_key is None:
        # the master process determined metric_key automatically, but the other
        # processes did not; recover it from the name of the best checkpoint
        prefix = 'best_' + self.model.__class__.__name__
        suffix = self.start_time
        fn_list = os.listdir(self.cp_save_path)
        fn_list = [fn for fn in fn_list if fn.startswith(prefix) and fn.endswith(suffix)]
        if len(fn_list) == 1:
            best_name = fn_list[0]
            self.metric_key = best_name[len(prefix):-len(suffix)].strip('_')
    # print('RANK {} metric_key {}'.format(self.rank, self.metric_key))

    self.callback_manager.on_valid_end(
        eval_res, self.metric_key, self.optimizer, is_better)
    self.ddp_model.train()
Example 15: synchronize_between_processes
# Required import: from torch import distributed [as alias]
# Or: from torch.distributed import barrier [as alias]
def synchronize_between_processes(self):
    """
    Warning: does not synchronize the deque!
    """
    if not is_dist_avail_and_initialized():
        return
    t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
    dist.barrier()
    dist.all_reduce(t)
    t = t.tolist()
    self.count = int(t[0])
    self.total = t[1]
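This method reads like part of a running-statistics meter that tracks a count/total pair (similar to SmoothedValue in the torchvision reference utilities); a minimal hedged sketch of how such a meter might be exercised, assuming the surrounding class also exposes an update(value) method:

meter = SmoothedValue()                # assumed meter class owning count and total
for loss in per_rank_losses:           # each rank accumulates its own local values
    meter.update(loss)
meter.synchronize_between_processes()  # sums count and total across all ranks
if dist.get_rank() == 0:
    print(f"global average loss: {meter.total / meter.count:.4f}")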