This article collects typical usage examples of the Python method apex.amp.master_params. If you are unsure exactly what amp.master_params does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also explore further usage examples from apex.amp, the module in which this method lives.
The following 14 code examples of amp.master_params are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
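Before the individual examples, here is a minimal sketch of the pattern most of them share: initialize mixed precision with amp.initialize, back-propagate through amp.scale_loss, and clip the FP32 master parameters returned by amp.master_params rather than the model's (possibly FP16) parameters. The tiny model, optimizer, and max_norm value below are illustrative placeholders, not taken from any of the examples.

import torch
from apex import amp

# Illustrative setup: any model/optimizer pair works here (requires a CUDA device).
model = torch.nn.Linear(10, 2).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

inputs = torch.randn(4, 10).cuda()
targets = torch.randint(0, 2, (4,)).cuda()
loss = torch.nn.functional.cross_entropy(model(inputs), targets)

# Scale the loss so small FP16 gradients do not underflow, then backprop.
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()

# Clip the FP32 master parameters, as the apex docs recommend for gradient clipping.
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm=1.0)
optimizer.step()
optimizer.zero_grad()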
Example 1: rescale_gradients
# Required import: from apex import amp [as alias]
# Or: from apex.amp import master_params [as alias]
def rescale_gradients(self) -> float:
    """
    Performs gradient rescaling. Is a no-op if gradient rescaling is not enabled.
    Returns the norm of the gradients.
    """
    if self._opt_level is not None:
        # See: https://nvidia.github.io/apex/advanced.html#gradient-clipping
        parameters_to_clip = [
            p for p in amp.master_params(self.optimizer) if p.grad is not None
        ]
    else:
        parameters_to_clip = [p for p in self.model.parameters() if p.grad is not None]
    if self._grad_norm:
        return clip_grad_norm_(parameters_to_clip, self._grad_norm)
    else:
        return torch.norm(
            torch.stack([torch.norm(p.grad.detach()) for p in parameters_to_clip])
        )
Example 2: _make_step
# Required import: from apex import amp [as alias]
# Or: from apex.amp import master_params [as alias]
def _make_step(self):
    data, target = self.state.input
    output = self.state.model(data)
    self.state.output = output
    loss = self.state.criterion(output, target)
    if self.state.is_train:
        with amp.scale_loss(loss / self.accumulate_steps, self.state.optimizer) as scaled_loss:
            scaled_loss.backward()
        if self.gradient_clip_val > 0:
            torch.nn.utils.clip_grad_norm_(amp.master_params(self.state.optimizer), self.gradient_clip_val)
        if self.state.step % self.accumulate_steps == 0:
            self.state.optimizer.step()
            self.state.optimizer.zero_grad()
        torch.cuda.synchronize()
    # update metrics
    self.state.loss_meter.update(to_numpy(loss))
    with torch.no_grad():
        for metric, meter in zip(self.state.metrics, self.state.metric_meters):
            meter.update(to_numpy(metric(output, target).squeeze()))
Example 3: save_state
# Required import: from apex import amp [as alias]
# Or: from apex.amp import master_params [as alias]
def save_state(self, save_directory: typing.Union[str, Path], epoch_id: int):
    save_directory = Path(save_directory)
    if not save_directory.exists():
        save_directory.mkdir()
    else:
        assert save_directory.is_dir(), "Save path should be a directory"
    model_to_save = getattr(self.model, 'module', self.model)
    model_to_save.save_pretrained(save_directory)
    optimizer_state: typing.Dict[str, typing.Any] = {
        'optimizer': self.optimizer.state_dict(),
        'scheduler': self.scheduler.state_dict(),
        'epoch': epoch_id}
    if APEX_FOUND:
        optimizer_state['master params'] = list(amp.master_params(self.optimizer))
        try:
            optimizer_state['amp'] = amp.state_dict()
        except AttributeError:
            pass
    torch.save(optimizer_state, save_directory / 'checkpoint.bin')
Example 4: backward_propagate
# Required import: from apex import amp [as alias]
# Or: from apex.amp import master_params [as alias]
def backward_propagate(self, loss, step):
    loss = self.adjust_loss(loss)
    if self.global_step % self.log_loss_every == 0 and self.local_rank in [-1, 0]:
        if self.local_rank in [-1, 0]:
            MlLogger.log_metrics(
                {"Train_loss_total": float(loss.detach().cpu().numpy())},
                step=self.global_step,
            )
            if self.log_learning_rate:
                MlLogger.log_metrics({"learning_rate": self.lr_schedule.get_last_lr()[0]},
                                     step=self.global_step)
    if self.use_amp:
        with amp.scale_loss(loss, self.optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    if step % self.grad_acc_steps == 0:
        if self.max_grad_norm is not None:
            if self.use_amp:
                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
        self.optimizer.step()
        self.optimizer.zero_grad()
        if self.lr_schedule:
            self.lr_schedule.step()
    return loss
Example 5: on_backward_end
# Required import: from apex import amp [as alias]
# Or: from apex.amp import master_params [as alias]
def on_backward_end(self):
    if self.step % self.update_every == 0:
        if self.parameters is None:
            if getattr(self.trainer, 'fp16', ''):
                _check_fp16()
                self.clip_fun(amp.master_params(self.optimizer), self.clip_value)
            else:
                self.clip_fun(self.model.parameters(), self.clip_value)
        else:
            self.clip_fun(self.parameters, self.clip_value)
Example 6: train
# Required import: from apex import amp [as alias]
# Or: from apex.amp import master_params [as alias]
def train(train_step):
    train_loader = train_corpus.batch_generator()
    start_time = time.time()
    for batch, item in enumerate(train_loader):
        net.train()
        data, targets, word_cnt, batch_len = get_batch(item)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        optimizer.zero_grad()
        # Network
        logits, new_targets = net(data, targets)
        loss = F.cross_entropy(logits.view(-1, nsampled + 1), new_targets)
        # AMP
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.clip)
        optimizer.step()
        scheduler.step(train_step)
        train_step += 1
        interval = 125
        if batch % interval == 0:
            elapsed = time.time() - start_time
            print('Epoch: {:3d} | {:5d}/{:5d} batches | lr {:.6f} | ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'
                  .format(epoch, batch, batch_len, scheduler.get_lr()[0], elapsed * 1000 / interval, loss.item(), math.exp(loss.item())))
            start_time = time.time()
            sys.stdout.flush()
    return train_step

# Load the saved model.
Example 7: optimize
# Required import: from apex import amp [as alias]
# Or: from apex.amp import master_params [as alias]
def optimize(self, loss):
    """
    Normalization on the loss (gradient accumulation or distributed training), followed by
    backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation).
    Also update the metrics for tensorboard.
    """
    # Check for NaN
    if (loss != loss).data.any():
        logger.error("NaN detected")
        exit()
    if self.multi_gpu:
        loss = loss.mean()
    if self.params.gradient_accumulation_steps > 1:
        loss = loss / self.params.gradient_accumulation_steps
    if self.fp16:
        from apex import amp
        with amp.scale_loss(loss, self.optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    self.iter()
    if self.n_iter % self.params.gradient_accumulation_steps == 0:
        if self.fp16:
            torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
        else:
            torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
        self.optimizer.step()
        self.optimizer.zero_grad()
        self.scheduler.step()
Example 8: resume_from_checkpoint
# Required import: from apex import amp [as alias]
# Or: from apex.amp import master_params [as alias]
def resume_from_checkpoint(self, checkpoint_dir: str) -> int:
    checkpoint = torch.load(
        os.path.join(checkpoint_dir, 'checkpoint.bin'), map_location=self.device)
    self.optimizer.load_state_dict(checkpoint['optimizer'])
    if self.fp16:
        self.optimizer._lazy_init_maybe_master_weights()
        self.optimizer._amp_stash.lazy_init_called = True
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        for param, saved in zip(
                amp.master_params(self.optimizer), checkpoint['master params']):
            param.data.copy_(saved.data)
        amp.load_state_dict(checkpoint['amp'])
    self.scheduler.load_state_dict(checkpoint['scheduler'])
    start_epoch = checkpoint['epoch'] + 1
    return start_epoch
Example 9: train_epoch
# Required import: from apex import amp [as alias]
# Or: from apex.amp import master_params [as alias]
def train_epoch(self, data):
    pbar = ProgressBar(n_total=len(data))
    tr_loss = AverageMeter()
    self.epoch_reset()
    for step, batch in enumerate(data):
        self.batch_reset()
        self.model.train()
        batch = tuple(t.to(self.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        print("input_ids, input_mask, segment_ids, label_ids SIZE: \n")
        print(input_ids.size(), input_mask.size(),
              segment_ids.size(), label_ids.size())
        logits = self.model(input_ids, input_mask, segment_ids)
        print("logits and label ids size: ",
              logits.size(), label_ids.size())
        loss = self.criterion(output=logits, target=label_ids)
        if len(self.n_gpu) >= 2:
            loss = loss.mean()
        if self.gradient_accumulation_steps > 1:
            loss = loss / self.gradient_accumulation_steps
        if self.fp16:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
            clip_grad_norm_(amp.master_params(
                self.optimizer), self.grad_clip)
        else:
            loss.backward()
            clip_grad_norm_(self.model.parameters(), self.grad_clip)
        if (step + 1) % self.gradient_accumulation_steps == 0:
            self.lr_scheduler.step()
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.global_step += 1
        if self.batch_metrics:
            for metric in self.batch_metrics:
                metric(logits=logits, target=label_ids)
                self.info[metric.name()] = metric.value()
        self.info['loss'] = loss.item()
        tr_loss.update(loss.item(), n=1)
        if self.verbose >= 1:
            pbar.batch_step(step=step, info=self.info, bar_type='Training')
        self.outputs.append(logits.cpu().detach())
        self.targets.append(label_ids.cpu().detach())
    print("\n------------- train result --------------")
    # epoch metric
    self.outputs = torch.cat(self.outputs, dim=0).cpu().detach()
    self.targets = torch.cat(self.targets, dim=0).cpu().detach()
    self.result['loss'] = tr_loss.avg
    if self.epoch_metrics:
        for metric in self.epoch_metrics:
            metric(logits=self.outputs, target=self.targets)
            value = metric.value()
            if value:
                self.result[f'{metric.name()}'] = value
    if "cuda" in str(self.device):
        torch.cuda.empty_cache()
    return self.result
Example 10: _train_batch
# Required import: from apex import amp [as alias]
# Or: from apex.amp import master_params [as alias]
def _train_batch(self, args, step, inputs, labels, masker, eval_dataset, eval_masker, model):
    inputs = self._to(args, inputs)
    labels = self._to(args, labels)
    model.train()
    loss = self._forward(args, inputs, labels, masker, model, backprop=True)
    self.tr_loss += loss.item()
    if (step + 1) % args.gradient_accumulation_steps == 0:
        if args.fp16:
            torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), args.max_grad_norm)
        else:
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        self.optimizer.step()
        self.scheduler.step()  # Update learning rate schedule
        model.zero_grad()
        self._post_training()
        self.global_step += 1
        if args.local_rank in [-1, 0] and args.logging_steps > 0 and self.global_step % args.logging_steps == 0:
            # Log metrics
            if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                results = self.evaluate(args, eval_dataset, eval_masker, model)
                for key, value in results.items():
                    self.tb_writer.add_scalar('eval_{}'.format(key), value, self.global_step)
            self._train_writer(args.logging_steps)
        if args.local_rank in [-1, 0] and args.save_steps > 0 and self.global_step % args.save_steps == 0:
            checkpoint_prefix = 'checkpoint'
            # Save model checkpoint
            output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, self.global_step))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
            model_to_save.save_pretrained(output_dir)
            self.tokenizer.save_pretrained(output_dir)
            torch.save(args, os.path.join(output_dir, 'training_args.bin'))
            logger.info("Saving model checkpoint to %s", output_dir)
            torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
            torch.save(self.scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
            logger.info("Saving optimizer and scheduler states to %s", output_dir)
            self._rotate_checkpoints(args, checkpoint_prefix)
Example 11: train_batch
# Required import: from apex import amp [as alias]
# Or: from apex.amp import master_params [as alias]
def train_batch(self, batch, batch_info=None):
    args = self.args
    model = self.model
    optimizer = self.optimizer
    step = batch_info["batch_idx"]
    model.train()
    batch = tuple(t.to(self.device) for t in batch)
    inputs = {
        "input_ids": batch[0],
        "attention_mask": batch[1],
        "labels": batch[3]
    }
    if args.model_type != "distilbert":
        # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
        inputs["token_type_ids"] = (batch[2] if args.model_type in [
            "bert", "xlnet", "albert"
        ] else None)
    outputs = model(**inputs)
    # model outputs are always tuple in transformers (see doc)
    loss = outputs[0]
    if args.gradient_accumulation_steps > 1:
        loss = loss / args.gradient_accumulation_steps
    if args.fp16:
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    batch_loss = loss.item()
    # last step in epoch but step is always smaller
    # than gradient_accumulation_steps
    ending = (self.train_data_len <= args.gradient_accumulation_steps
              and (step + 1) == self.train_data_len)
    if (step + 1) % args.gradient_accumulation_steps == 0 or ending:
        if args.fp16:
            torch.nn.utils.clip_grad_norm_(
                amp.master_params(optimizer), args.max_grad_norm)
        else:
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)
        self.optimizer.step()
        self._warmup_scheduler.step()  # Update learning rate schedule
        model.zero_grad()
        self._global_step += 1
    learning_rate_scalar = self._warmup_scheduler.get_lr()[0]
    return {"learning_rate": learning_rate_scalar, "loss": batch_loss}
Example 12: train_epoch
# Required import: from apex import amp [as alias]
# Or: from apex.amp import master_params [as alias]
def train_epoch(self, data):
    pbar = ProgressBar(n_total=len(data), desc='Training')
    tr_loss = AverageMeter()
    self.epoch_reset()
    for step, batch in enumerate(data):
        self.batch_reset()
        self.model.train()
        batch = tuple(t.to(self.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        logits = self.model(input_ids, segment_ids, input_mask)
        loss = self.criterion(output=logits, target=label_ids)
        if len(self.args.n_gpu) >= 2:
            loss = loss.mean()
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps
        if self.args.fp16:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
            clip_grad_norm_(amp.master_params(self.optimizer), self.args.grad_clip)
        else:
            loss.backward()
            clip_grad_norm_(self.model.parameters(), self.args.grad_clip)
        if (step + 1) % self.args.gradient_accumulation_steps == 0:
            self.scheduler.step()
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.global_step += 1
        if self.batch_metrics:
            for metric in self.batch_metrics:
                metric(logits=logits, target=label_ids)
                self.info[metric.name()] = metric.value()
        self.info['loss'] = loss.item()
        tr_loss.update(loss.item(), n=1)
        if self.verbose >= 1:
            pbar(step=step, info=self.info)
        self.outputs.append(logits.cpu().detach())
        self.targets.append(label_ids.cpu().detach())
    print("\n------------- train result --------------")
    # epoch metric
    self.outputs = torch.cat(self.outputs, dim=0).cpu().detach()
    self.targets = torch.cat(self.targets, dim=0).cpu().detach()
    self.result['loss'] = tr_loss.avg
    if self.epoch_metrics:
        for metric in self.epoch_metrics:
            metric(logits=self.outputs, target=self.targets)
            value = metric.value()
            if value:
                self.result[f'{metric.name()}'] = value
    if "cuda" in str(self.device):
        torch.cuda.empty_cache()
    return self.result
Example 13: _step_distributed_fp16
# Required import: from apex import amp [as alias]
# Or: from apex.amp import master_params [as alias]
def _step_distributed_fp16(self) -> None:
    # manually allreduce gradients after all accumulation steps
    # check for Inf/NaN
    # 1. allocate an uninitialized buffer for flattened gradient
    scaler = _amp_state.loss_scalers[0]
    master_grads = [p.grad for p in amp.master_params(self.optimizer) if p.grad is not None]
    flat_grad_size = sum(p.numel() for p in master_grads)
    # allreduce_dtype = torch.float16 if args.allreduce_post_accumulation_fp16 else \
    #     torch.float32
    allreduce_dtype = torch.float16
    flat_raw = torch.empty(flat_grad_size, device='cuda', dtype=allreduce_dtype)
    # 2. combine unflattening and predivision of unscaled 'raw' gradient
    allreduced_views = apex_C.unflatten(flat_raw, master_grads)
    self._overflow_buf.zero_()
    amp_C.multi_tensor_scale(
        65536,
        self._overflow_buf,
        [master_grads, allreduced_views],
        scaler.loss_scale() / (
            torch.distributed.get_world_size() * self.gradient_accumulation_steps))
    # 3. sum gradient across ranks. Because of the predivision, this averages the gradient
    torch.distributed.all_reduce(flat_raw)
    # 4. combine unscaling and unflattening of allreduced gradient
    self._overflow_buf.zero_()
    amp_C.multi_tensor_scale(
        65536,
        self._overflow_buf,
        [allreduced_views, master_grads],
        1. / scaler.loss_scale())
    # 5. update loss scale
    scaler = _amp_state.loss_scalers[0]
    old_overflow_buf = scaler._overflow_buf
    scaler._overflow_buf = self._overflow_buf
    had_overflow = scaler.update_scale()
    scaler._overflow_buf = old_overflow_buf  # restore the scaler's own overflow buffer
    # 6. call optimizer step function
    if had_overflow == 0:
        self._step()
    else:
        # Overflow detected, print message and clear gradients
        logger.info(f"Gradient overflow. Skipping step, reducing loss scale to "
                    f"{scaler.loss_scale()}")
        if _amp_state.opt_properties.master_weights:
            for param in self.optimizer._amp_stash.all_fp32_from_fp16_params:
                param.grad = None
    for param in self.model.parameters():
        param.grad = None
Example 14: train
# Required import: from apex import amp [as alias]
# Or: from apex.amp import master_params [as alias]
def train(epoch, model, optimizer, scheduler):
    global global_step
    epoch_loss = 0.0
    running_num = 0
    running_loss = np.zeros(3)
    train_sampler.set_epoch(epoch)
    model.train()
    bar = tqdm(train_loader) if args.local_rank == 0 else train_loader
    for batch_idx, (x, c) in enumerate(bar):
        scheduler.step()
        global_step += 1
        x, c = x.to(device, non_blocking=True), c.to(device, non_blocking=True)
        optimizer.zero_grad()
        log_p, logdet = model(x, c)
        log_p, logdet = torch.mean(log_p), torch.mean(logdet)
        loss = -(log_p + logdet)
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.)
        optimizer.step()
        running_num += 1
        running_loss[0] += loss.item()
        running_loss[1] += log_p.item()
        running_loss[2] += logdet.item()
        epoch_loss += loss.item()
        if args.local_rank == 0:
            bar.set_description('{}/{}, [Log pdf, Log p(z), Log Det] : {}'
                                .format(epoch, global_step, running_loss / running_num))
        if (batch_idx + 1) % 100 == 0:
            running_num = 0
            running_loss = np.zeros(3)
        del x, c, log_p, logdet, loss
    del running_loss
    gc.collect()
    print('{}/{}/{} Training Loss : {:.4f}'.format(epoch, global_step, args.local_rank, epoch_loss / (len(train_loader))))
    return epoch_loss / len(train_loader)