This article collects typical usage examples of the Python method horovod.torch.local_rank: what torch.local_rank does, how it is called, and how it is used in real projects. The curated code examples below should answer those questions; you can also explore the other methods of the horovod.torch module.
The following shows 15 code examples of torch.local_rank, ordered by popularity.
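Before the examples, here is a minimal, self-contained sketch of the pattern most of them share: initialize Horovod, then pin the current process to the GPU whose index matches its local rank. The sketch is illustrative rather than taken from any single project below; it only uses calls that appear in the examples (hvd.init, hvd.local_rank, hvd.rank, hvd.size, torch.cuda.set_device).

# Minimal usage sketch (illustrative; not from any single example below)
import torch
import horovod.torch as hvd

hvd.init()  # must run before any other hvd.* call
if torch.cuda.is_available():
    # local_rank() is this process's index among the processes on the same node,
    # which makes it the natural choice for assigning one GPU per process.
    torch.cuda.set_device(hvd.local_rank())

# rank() is the global index across all nodes and size() the total number of
# processes; rank 0 conventionally handles logging and checkpointing
# (see Examples 6 and 13 below).
print(hvd.local_rank(), hvd.rank(), hvd.size())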
Example 1: main
# Required import: from horovod import torch [as alias]
# Or: from horovod.torch import local_rank [as alias]
def main():
    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    hvd.init()
    local_rank = hvd.local_rank()
    torch.cuda.set_device(local_rank)
    main_worker(local_rank, 4, args)
Example 2: get_model
# Required import: from horovod import torch [as alias]
# Or: from horovod.torch import local_rank [as alias]
def get_model(conf, num_class=10, data_parallel=True):
    name = conf['type']

    if name == 'wresnet40_2':
        model = WideResNet(40, 2, dropout_rate=0.0, num_classes=num_class)
    elif name == 'wresnet28_2':
        model = WideResNet(28, 2, dropout_rate=0.0, num_classes=num_class)
    elif name == 'wresnet28_10':
        model = WideResNet(28, 10, dropout_rate=0.0, num_classes=num_class)
    else:
        raise NameError('no model named, %s' % name)

    if data_parallel:
        model = model.cuda()
        model = DataParallel(model)
    else:
        import horovod.torch as hvd
        device = torch.device('cuda', hvd.local_rank())
        model = model.to(device)
    cudnn.benchmark = True
    return model
Example 3: __init__
# Required import: from horovod import torch [as alias]
# Or: from horovod.torch import local_rank [as alias]
def __init__(self, accumulation_step=1):
    hvd.init()
    self.local_rank = hvd.local_rank()
    self.world_size = hvd.size()
    self.rank = hvd.rank()
    self.n_gpu = torch.cuda.device_count()
    self.node_count = self.world_size // self.n_gpu
    self.accumulation_step = accumulation_step
    self.count_down = accumulation_step - 1

    self._multi_node = self.node_count > 1
    if not self._multi_node:
        # use the PyTorch built-in NCCL backend for single-node training
        torch.distributed.init_process_group(
            backend="nccl",
            init_method="tcp://127.0.0.1:6000",
            world_size=self.n_gpu,
            rank=self.local_rank,
        )
Example 4: test_horovod_allreduce_multi_gpu
# Required import: from horovod import torch [as alias]
# Or: from horovod.torch import local_rank [as alias]
def test_horovod_allreduce_multi_gpu(self):
    """Test that the allreduce works on multiple GPUs."""
    # Only do this test if there are GPUs available.
    if not torch.cuda.is_available():
        return

    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()

    iter = 0
    dtypes = [torch.cuda.IntTensor, torch.cuda.LongTensor,
              torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        iter += 1
        torch.manual_seed(1234)
        tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
        device = local_rank * 2 + (iter + local_rank) % 2
        tensor = tensor.cuda(device).type(dtype)
        multiplied = tensor * size
        hvd.allreduce_(tensor, average=False)
        max_difference = tensor.sub(multiplied).max()

        # Threshold for floating point equality depends on number of
        # ranks, since we're comparing against precise multiplication.
        if size <= 3 or dtype in [torch.cuda.IntTensor, torch.cuda.LongTensor]:
            threshold = 0
        elif size < 10:
            threshold = 1e-4
        elif size < 15:
            threshold = 5e-4
        else:
            break

        assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
Example 5: __init__
# Required import: from horovod import torch [as alias]
# Or: from horovod.torch import local_rank [as alias]
def __init__(self, **kwargs):
    super().__init__(**kwargs)
    hvd.init()
    torch.set_num_threads(int(os.environ.get("OMP_NUM_THREADS", 1)))
    torch.cuda.set_device(hvd.local_rank())
    torch.backends.cudnn.benchmark = True
    self.avg_loss = AvgMeter(50)
    self.dtype = kwargs.get("dtype", None)  # just for test for now
Example 6: _checkpoint
# Required import: from horovod import torch [as alias]
# Or: from horovod.torch import local_rank [as alias]
def _checkpoint(self, epoch):
    if self.checkpoint_dir and hvd.local_rank() == 0:
        out_fname = '{:02d}.pth'.format(epoch)
        out_fname = os.path.join(self.checkpoint_dir, out_fname)
        state = {'model': self.model.state_dict(),
                 'optimizer': self.optimizer.state_dict()}
        torch.save(state, out_fname)
Example 7: _init_loaders
# Required import: from horovod import torch [as alias]
# Or: from horovod.torch import local_rank [as alias]
def _init_loaders(self):
    allreduce_batch_size = self.batch_size * self.batches_per_allreduce

    if hvd.local_rank() != 0:
        hvd.allreduce(torch.tensor(0), name='barrier')
    self.train_dataset = datasets.CIFAR10(
        root=self.dataset_path, download=(hvd.local_rank() == 0),
        train=True,
        transform=transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(32, 4),
            transforms.ToTensor(),
            self.normalize,
        ]))
    if hvd.local_rank() == 0:
        hvd.allreduce(torch.tensor(0), name='barrier')

    self.val_dataset = datasets.CIFAR10(
        root=self.dataset_path,
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            self.normalize,
        ]))

    self.train_loader = torch.utils.data.DataLoader(
        self.train_dataset, batch_size=allreduce_batch_size,
        shuffle=True, num_workers=8, pin_memory=True)
    self.val_loader = torch.utils.data.DataLoader(
        self.val_dataset, batch_size=allreduce_batch_size,
        shuffle=False, num_workers=8, pin_memory=True)
Example 8: evaluation_forward
# Required import: from horovod import torch [as alias]
# Or: from horovod.torch import local_rank [as alias]
def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: bool = False):
    # make dataloader_idx arg in validation_step optional
    args = [batch, batch_idx]

    if (test_mode and len(self.test_dataloaders) > 1) \
            or (not test_mode and len(self.val_dataloaders) > 1):
        args.append(dataloader_idx)

    # handle DP, DDP forward
    if self.use_ddp or self.use_dp or self.use_ddp2:
        output = model(*args)
        return output

    # Horovod
    if self.use_horovod and self.on_gpu:
        batch = self.transfer_batch_to_gpu(batch, hvd.local_rank())
        args[0] = batch

    # single GPU data transfer
    if self.single_gpu:
        # for single GPU put inputs on gpu manually
        root_gpu = 0
        if isinstance(self.data_parallel_device_ids, list):
            root_gpu = self.data_parallel_device_ids[0]
        batch = self.transfer_batch_to_gpu(batch, root_gpu)
        args[0] = batch

    # TPU data transfer
    if self.use_tpu:
        batch = self.transfer_batch_to_tpu(batch, self.tpu_id)
        args[0] = batch

    # CPU, TPU or GPU step
    if test_mode:
        output = model.test_step(*args)
    else:
        output = model.validation_step(*args)

    return output
Example 9: sync_horovod
# Required import: from horovod import torch [as alias]
# Or: from horovod.torch import local_rank [as alias]
def sync_horovod(self):
    if self.use_horovod:
        hvd.join(hvd.local_rank() if self.on_gpu else -1)
Example 10: _set_horovod_backend
# Required import: from horovod import torch [as alias]
# Or: from horovod.torch import local_rank [as alias]
def _set_horovod_backend(self):
    self.check_horovod()
    self.use_horovod = True

    # Initialize Horovod to get rank / size info
    hvd.init()
    if self.on_gpu:
        # Horovod assigns one local GPU per process
        self.root_gpu = hvd.local_rank()
Example 11: get_local_rank
# Required import: from horovod import torch [as alias]
# Or: from horovod.torch import local_rank [as alias]
def get_local_rank() -> int:
    # returns -1 if not distributed, else returns local rank
    # it works before dist.init_process_group
    if not is_distributed():
        return -1
    else:
        if is_horovod_available():
            import horovod.torch as hvd

            return hvd.local_rank()
        return int(get_environ('LOCAL_RANK', 0))
Example 12: __init__
# Required import: from horovod import torch [as alias]
# Or: from horovod.torch import local_rank [as alias]
def __init__(self, **kwargs):
    default_attr = dict(
        # Training options
        batch_size=32, base_lr=0.0125, momentum=0.9, wd=1e-4, epochs=90, warmup_epochs=5,
        stride=10, label_smoothing=-1.0, rand_target=False,
        # Validation options
        run_val=True,
        # Model/checkpoint options
        model=None, checkpoint_dir=None, dataset_path='/mnt/imagenet-test/',
        # Attack options
        attack=None, attack_backward_steps=0, attack_loss='avg', scale_eps=False, rand_init=True,
        # Communication options
        fp16_allreduce=False,
        # Logging options
        logger=None)
    default_attr.update(kwargs)
    for k in default_attr:
        setattr(self, k, default_attr[k])
    assert self.attack_loss in ['avg', 'adv_only', 'logsumexp', 'max']

    # Validate args
    assert self.model is not None

    # Set up checkpointing
    if self.checkpoint_dir is not None:
        os.makedirs(self.checkpoint_dir, exist_ok=True)

    self.cuda = True
    self.batches_per_allreduce = 1
    self.verbose = 1 if hvd.rank() == 0 else 0
    self.compression = hvd.Compression.fp16 if self.fp16_allreduce else hvd.Compression.none
    if self.verbose:
        print(self.model)

    torch.cuda.set_device(hvd.local_rank())
    if self.cuda:
        self.model.cuda()

    if self.attack:
        self.attack = self.attack()
        self.attack_backward_steps = self.attack.nb_backward_steps

    self.normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    self._init_loaders()
    self._init_optimizer()
    self._start_sync()
Example 13: horovod_train
# Required import: from horovod import torch [as alias]
# Or: from horovod.torch import local_rank [as alias]
def horovod_train(self, model):
    # call setup after the ddp process has connected
    self.setup('fit')
    if self.is_function_implemented('setup', model):
        model.setup('fit')

    if torch.cuda.is_available() and self.on_gpu:
        # Horovod: pin GPU to local rank
        assert self.root_gpu == hvd.local_rank()
        torch.cuda.set_device(self.root_gpu)
        model.cuda(self.root_gpu)

    # avoid duplicating progress bar
    if hvd.rank() != 0 and self.progress_bar_callback is not None:
        self.progress_bar_callback.disable()

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

    # Horovod: scale the learning rate by the number of workers to account for
    # increased total batch size
    for optimizer in self.optimizers:
        for param_group in optimizer.param_groups:
            param_group['lr'] *= hvd.size()

    if self.use_amp:
        model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
        self.optimizers = optimizers
        self.reinit_scheduler_properties(self.optimizers, self.lr_schedulers)

    # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    for optimizer in self.optimizers:
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    def filter_named_parameters(model, optimizer):
        opt_params = set([p for group in optimizer.param_groups for p in group.get('params', [])])
        return [(name, p) for name, p in model.named_parameters() if p in opt_params]

    # Horovod: wrap optimizers to perform gradient aggregation via allreduce
    self.optimizers = [
        hvd.DistributedOptimizer(optimizer, named_parameters=filter_named_parameters(model, optimizer))
        for optimizer in self.optimizers
    ]

    # Update logger rank info from Horovod to avoid race conditions from different ranks
    # creating directories / writing files in the same locations.
    self.global_rank = hvd.rank()
    rank_zero_only.rank = self.global_rank

    with ExitStack() as stack:
        for optimizer in self.optimizers:
            # Synchronization will be performed explicitly following backward()
            stack.enter_context(optimizer.skip_synchronize())

        self.run_pretrain_routine(model)

    # Make sure all workers have finished training before returning to the user
    hvd.join()
Example 14: spawn_ddp_children
# Required import: from horovod import torch [as alias]
# Or: from horovod.torch import local_rank [as alias]
def spawn_ddp_children(self, model):
    port = os.environ['MASTER_PORT']

    master_address = '127.0.0.1' if 'MASTER_ADDR' not in os.environ else os.environ['MASTER_ADDR']
    os.environ['MASTER_PORT'] = f'{port}'
    os.environ['MASTER_ADDR'] = f'{master_address}'

    # allow the user to pass the node rank
    node_rank = '0'
    if 'NODE_RANK' in os.environ:
        node_rank = os.environ['NODE_RANK']
    if 'GROUP_RANK' in os.environ:
        node_rank = os.environ['GROUP_RANK']

    os.environ['NODE_RANK'] = node_rank
    os.environ['LOCAL_RANK'] = '0'

    # when the user is using hydra, find the absolute path
    path_lib = abspath if not HYDRA_AVAILABLE else to_absolute_path

    # pull out the commands used to run the script and resolve the abs file path
    command = sys.argv
    try:
        full_path = path_lib(command[0])
    except Exception:
        full_path = abspath(command[0])
    command[0] = full_path

    # run the children with the same python interpreter as this process
    command = [sys.executable] + command

    # since this script sets the visible devices we replace the gpus flag with a number
    num_gpus = len(os.environ['CUDA_VISIBLE_DEVICES'].split(','))
    if '--gpus' in command:
        gpu_flag_idx = command.index('--gpus')
        command[gpu_flag_idx + 1] = f'{num_gpus}'

    os.environ['WORLD_SIZE'] = f'{num_gpus * self.num_nodes}'

    self.interactive_ddp_procs = []
    for local_rank in range(1, self.num_processes):
        env_copy = os.environ.copy()
        env_copy['LOCAL_RANK'] = f'{local_rank}'

        # start process
        proc = subprocess.Popen(command, env=env_copy)
        self.interactive_ddp_procs.append(proc)

        # starting all processes at once can cause issues with dataloaders,
        # so add a short random delay between launches
        delay = np.random.uniform(1, 5, 1)[0]
        sleep(delay)

    local_rank = 0
    self.ddp_train(local_rank, model, is_master=True)
Example 15: __init__
# Required import: from horovod import torch [as alias]
# Or: from horovod.torch import local_rank [as alias]
def __init__(
    self,
    language=Language.ENGLISH,
    num_labels=2,
    cache_dir=".",
    use_distributed=False,
):
    """
    Args:
        language: Language passed to pre-trained BERT model to pick the appropriate
            model
        num_labels: number of unique labels in train dataset
        cache_dir: cache_dir to load pre-trained BERT model. Defaults to "."
    """
    if num_labels < 2:
        raise ValueError("Number of labels should be at least 2.")

    self.language = language
    self.num_labels = num_labels
    self.cache_dir = cache_dir
    self.use_distributed = use_distributed

    # create classifier
    self.model = BertForSequenceClassification.from_pretrained(
        language.value, cache_dir=cache_dir, num_labels=num_labels
    )

    # define optimizer and model parameters
    param_optimizer = list(self.model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.01,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ]
        },
    ]
    self.optimizer_params = optimizer_grouped_parameters
    self.name_parameters = self.model.named_parameters()
    self.state_dict = self.model.state_dict()

    if use_distributed:
        hvd.init()
        if torch.cuda.is_available():
            torch.cuda.set_device(hvd.local_rank())
        else:
            warnings.warn("No GPU available! Using CPU.")