本文整理汇总了Python中apex.amp.scale_loss方法的典型用法代码示例。如果您正苦于以下问题：Python amp.scale_loss方法的具体用法？Python amp.scale_loss怎么用？Python amp.scale_loss使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块apex.amp的用法示例。
在下文中一共展示了amp.scale_loss方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _forward
# 需要导入模块: from apex import amp [as 别名]
# 或者: from apex.amp import scale_loss [as 别名]
def _forward(self, args, inputs, labels, masker, model, backprop=True):
    """Run one forward pass and, optionally, the matching backward pass.

    The model is called with ``masked_lm_labels=labels`` when ``args.mlm`` is
    truthy and with ``labels=labels`` otherwise. Item 0 of its output is used
    as the loss.

    Args:
        args: Object read for ``mlm``, ``n_gpu``,
            ``gradient_accumulation_steps`` and ``fp16``.
        inputs: Passed to ``model`` as its first positional argument.
        labels: Forwarded to ``model`` by keyword.
        masker: Not used by this method.
        model: Callable whose result is indexable, with the loss at index 0.
        backprop: When falsy, only the forward pass is executed.

    Returns:
        The loss. With ``backprop`` enabled this is the value that was
        back-propagated, i.e. after the optional ``mean()`` and division by
        the accumulation step count.
    """
    if args.mlm:
        outputs = model(inputs, masked_lm_labels=labels)
    else:
        outputs = model(inputs, labels=labels)
    # The original note says transformers models always return a tuple.
    loss = outputs[0]

    if not backprop:
        return loss

    if args.n_gpu > 1:
        # Collapse the per-GPU losses into a single value.
        loss = loss.mean()
    if args.gradient_accumulation_steps > 1:
        loss = loss / args.gradient_accumulation_steps

    if not args.fp16:
        loss.backward()
    else:
        with amp.scale_loss(loss, self.optimizer) as scaled_loss:
            scaled_loss.backward()

    self._post_step(args, outputs)
    return loss
示例2: test_gradient_accumulation_with_apex_amp
# 需要导入模块: from apex import amp [as 别名]
# 或者: from apex.amp import scale_loss [as 别名]
def test_gradient_accumulation_with_apex_amp(self, mocker):
    """Check that ``amp.scale_loss`` runs once per accumulated micro-batch.

    A range test with ``accumulation_steps`` micro-batches per iteration is
    expected to call ``amp.scale_loss`` ``accum_steps * num_iter`` times.
    """
    num_iter = 10
    accum_steps = 4
    desired_bs = 32
    real_bs = desired_bs // accum_steps

    task = mod_task.XORTask(batch_size=real_bs)

    # `amp.initialize` wraps the model and optimizer. Per the upstream note,
    # `amp` requires a CUDA GPU, so the model is moved to the task's device
    # before wrapping.
    model_on_device = task.model.to(task.device)
    task.model, task.optimizer = amp.initialize(model_on_device, task.optimizer)

    lr_finder = prepare_lr_finder(task)
    scale_loss_spy = mocker.spy(amp, "scale_loss")

    lr_finder.range_test(
        task.train_loader,
        num_iter=num_iter,
        accumulation_steps=accum_steps,
    )

    expected_calls = accum_steps * num_iter
    assert scale_loss_spy.call_count == expected_calls
示例3: test_mixed_precision
# 需要导入模块: from apex import amp [as 别名]
# 或者: from apex.amp import scale_loss [as 别名]
def test_mixed_precision(self, mocker):
    """Check that ``range_test`` calls ``amp.scale_loss`` once per iteration."""
    num_iter = 10
    task = mod_task.XORTask(batch_size=32)

    # `amp.initialize` wraps the model and optimizer. Per the upstream note,
    # `amp` requires a CUDA GPU, so the model is moved to the task's device
    # before wrapping.
    model_on_device = task.model.to(task.device)
    task.model, task.optimizer = amp.initialize(model_on_device, task.optimizer)
    assert hasattr(task.optimizer, "_amp_stash")

    lr_finder = prepare_lr_finder(task)
    scale_loss_spy = mocker.spy(amp, "scale_loss")
    lr_finder.range_test(task.train_loader, num_iter=num_iter)

    # NOTE: no gradient accumulation is requested here, so the number of
    # `amp.scale_loss` calls should equal `num_iter`.
    assert scale_loss_spy.call_count == num_iter
示例4: step
# 需要导入模块: from apex import amp [as 别名]
# 或者: from apex.amp import scale_loss [as 别名]
def step(self, batch):
    """Run one optimisation step on ``batch`` and update the running meters.

    Args:
        batch: A pair ``(img, target)``; both are moved with ``.cuda()``.

    Returns:
        ``(self.loss_avg.avg, self.acc_avg.avg)`` after both meters have been
        updated with this batch's loss and accuracy.
    """
    self.model.train()
    self.optim.zero_grad()

    images, labels = batch
    images = images.cuda()
    labels = labels.cuda()

    score, feat = self.model(images)
    loss = self.loss_func(score, feat, labels)

    if not self.mix_precision:
        loss.backward()
    else:
        with amp.scale_loss(loss, self.optim) as scaled_loss:
            scaled_loss.backward()
    self.optim.step()

    # Index of the largest score along dim 1, compared with the labels.
    _, predicted = score.max(1)
    acc = (predicted == labels).float().mean()

    self.loss_avg.update(loss.cpu().item())
    self.acc_avg.update(acc.cpu().item())
    return self.loss_avg.avg, self.acc_avg.avg
示例5: _make_step
# 需要导入模块: from apex import amp [as 别名]
# 或者: from apex.amp import scale_loss [as 别名]
def _make_step(self):
    """Run forward/backward for the current batch and update the meters.

    Reads the batch from ``self.state.input``, stores the model output in
    ``self.state.output`` and computes the loss with ``self.state.criterion``.
    When ``self.state.is_train`` is truthy, the loss divided by
    ``self.accumulate_steps`` is back-propagated through ``amp.scale_loss``.
    The optimizer steps (and its gradients are zeroed) whenever
    ``self.state.step`` is a multiple of ``self.accumulate_steps``.

    Gradient clipping is applied only on those stepping iterations, right
    before ``optimizer.step()``. Clipping after every micro-batch would clip
    partially accumulated gradients and change the update that is applied.

    The loss meter receives the unscaled, undivided loss.
    """
    state = self.state
    data, target = state.input
    output = state.model(data)
    state.output = output
    loss = state.criterion(output, target)

    if state.is_train:
        with amp.scale_loss(loss / self.accumulate_steps, state.optimizer) as scaled_loss:
            scaled_loss.backward()
        if state.step % self.accumulate_steps == 0:
            if self.gradient_clip_val > 0:
                # Clip the fully accumulated gradient exactly once per update.
                torch.nn.utils.clip_grad_norm_(
                    amp.master_params(state.optimizer), self.gradient_clip_val
                )
            state.optimizer.step()
            state.optimizer.zero_grad()
        torch.cuda.synchronize()

    # update metrics
    state.loss_meter.update(to_numpy(loss))
    with torch.no_grad():
        for metric, meter in zip(state.metrics, state.metric_meters):
            meter.update(to_numpy(metric(output, target).squeeze()))
示例6: compute_gradient_penalty
# 需要导入模块: from apex import amp [as 别名]
# 或者: from apex.amp import scale_loss [as 别名]
def compute_gradient_penalty(self, real_data, fake_data, condition, landmarks):
    """Back-propagate a gradient penalty for the discriminator and return it.

    A random per-sample weight mixes ``real_data`` with the detached
    ``fake_data``. The discriminator is evaluated on that mix and the
    gradient of its summed output with respect to the mix is taken. For each
    sample the penalty is ``(L2 norm of the gradient - 1) ** 2``. The summed
    penalty times 10 is back-propagated through ``amp.scale_loss`` with
    ``loss_id=1`` and ``retain_graph=True``.

    Args:
        real_data: Real batch; cast to ``fake_data``'s dtype.
        fake_data: Generated batch; its device and dtype are used throughout.
        condition: Forwarded unchanged to ``self.discriminator``.
        landmarks: Forwarded unchanged to ``self.discriminator``.

    Returns:
        The detached mean of the per-sample penalties.
    """
    device = fake_data.device
    dtype = fake_data.dtype

    # One random weight per sample, shaped to broadcast over the other dims.
    batch_size = real_data.shape[0]
    broadcast_dims = [1] * (real_data.dim() - 1)
    epsilon = torch.rand([batch_size] + broadcast_dims).to(device, dtype)

    real_data = real_data.to(dtype)
    x_hat = epsilon * real_data + (1 - epsilon) * fake_data.detach()
    x_hat.requires_grad = True

    logits = self.discriminator(x_hat, condition, landmarks).sum()
    seed = torch.ones(logits.shape).to(dtype).to(device)
    grads = torch.autograd.grad(
        outputs=logits,
        inputs=x_hat,
        grad_outputs=seed,
        create_graph=True,
    )
    grad = grads[0].view(x_hat.shape[0], -1)

    grad_norm = grad.norm(p=2, dim=1)
    gradient_pen = (grad_norm - 1) ** 2
    to_backward = gradient_pen.sum() * 10

    with amp.scale_loss(to_backward, self.d_optimizer, loss_id=1) as scaled_loss:
        scaled_loss.backward(retain_graph=True)
    return gradient_pen.detach().mean()
示例7: train
# 需要导入模块: from apex import amp [as 别名]
# 或者: from apex.amp import scale_loss [as 别名]
def train(model, lattice, optim, y, rot, trans=None, ctf_params=None, use_amp=False):
    """Run one optimisation step and return the MSE loss as a float.

    Only the lattice points selected by ``lattice.get_circular_mask(D // 2)``
    are evaluated and compared, rather than the whole image.

    Args:
        model: Module evaluated on the masked, rotated lattice coordinates.
        lattice: Object providing ``D``, ``coords``, ``freqs2d``,
            ``get_circular_mask`` and ``translate_ht``.
        optim: Optimizer; zeroed at the start and stepped at the end.
        y: Target batch; ``y.size(0)`` is used as the batch size.
        rot: Right-multiplied onto the masked coordinates.
        trans: Optional; when given, the masked target is passed through
            ``lattice.translate_ht``.
        ctf_params: Optional; when given, the prediction is multiplied by
            ``ctf.compute_ctf`` of the rescaled masked frequencies.
        use_amp: When truthy, back-propagate through ``amp.scale_loss``.

    Returns:
        ``loss.item()`` for this batch.
    """
    model.train()
    optim.zero_grad()

    batch_size = y.size(0)
    # reconstruct circle of pixels instead of whole image
    mask = lattice.get_circular_mask(lattice.D // 2)

    rotated_coords = lattice.coords[mask] @ rot
    yhat = model(rotated_coords).view(batch_size, -1)

    if ctf_params is not None:
        masked_freqs = lattice.freqs2d[mask]
        # Column 0 rescales the frequencies; the remaining columns are
        # passed on to compute_ctf one at a time.
        scale = ctf_params[:, 0].view(batch_size, 1, 1)
        freqs = masked_freqs.unsqueeze(0).expand(batch_size, *masked_freqs.shape) / scale
        ctf_args = torch.split(ctf_params[:, 1:], 1, 1)
        yhat *= ctf.compute_ctf(freqs, *ctf_args)

    y = y.view(batch_size, -1)[:, mask]
    if trans is not None:
        y = lattice.translate_ht(y, trans.unsqueeze(1), mask).view(batch_size, -1)

    loss = F.mse_loss(yhat, y)
    if not use_amp:
        loss.backward()
    else:
        with amp.scale_loss(loss, optim) as scaled_loss:
            scaled_loss.backward()
    optim.step()
    return loss.item()
示例8: test_larc_mixed_precision
# 需要导入模块: from apex import amp [as 别名]
# 或者: from apex.amp import scale_loss [as 别名]
def test_larc_mixed_precision(self):
    """Run one LARC-wrapped SGD step under each amp ``opt_level``.

    The test makes no explicit assertion; it passes when one forward,
    scaled backward and optimizer step complete for every level.
    """
    opt_levels = ("O0", "O1", "O2", "O3")
    for opt_level in opt_levels:
        model = MyModel(1)
        param_groups = [{"params": model.parameters(), "lr": 0.25}]
        base_optimizer = torch.optim.SGD(param_groups, momentum=0.125)
        optimizer = LARC(base_optimizer)

        model, optimizer = amp.initialize(
            model, optimizer, opt_level=opt_level, verbosity=0
        )

        optimizer.zero_grad()
        loss = model(self.x)
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()
示例9: backward_propagate
# 需要导入模块: from apex import amp [as 别名]
# 或者: from apex.amp import scale_loss [as 别名]
def backward_propagate(self, loss, step):
    """Back-propagate ``loss`` and, on accumulation boundaries, step.

    The loss is first passed through ``self.adjust_loss``. Every
    ``self.log_loss_every`` global steps, on ranks -1 or 0, the loss (and
    optionally the learning rate) is sent to ``MlLogger``. Gradients are
    then computed, through ``amp.scale_loss`` when ``self.use_amp`` is set.
    When ``step`` is a multiple of ``self.grad_acc_steps`` the gradients are
    optionally clipped to ``self.max_grad_norm``, the optimizer steps and
    is zeroed, and the LR schedule (if any) advances.

    Args:
        loss: Loss tensor for the current batch.
        step: Step counter used to decide when accumulated gradients are
            applied.

    Returns:
        The adjusted loss that was back-propagated.
    """
    loss = self.adjust_loss(loss)

    is_logging_rank = self.local_rank in [-1, 0]
    if self.global_step % self.log_loss_every == 0 and is_logging_rank:
        MlLogger.log_metrics(
            {"Train_loss_total": loss.item()},
            step=self.global_step,
        )
        # The schedule is optional (it is guarded before stepping below), so
        # only log a learning rate when there is a schedule to ask.
        if self.log_learning_rate and self.lr_schedule:
            MlLogger.log_metrics(
                {"learning_rate": self.lr_schedule.get_last_lr()[0]},
                step=self.global_step,
            )

    if self.use_amp:
        with amp.scale_loss(loss, self.optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()

    if step % self.grad_acc_steps == 0:
        if self.max_grad_norm is not None:
            # Under amp the optimizer's master params are the ones clipped.
            if self.use_amp:
                params = amp.master_params(self.optimizer)
            else:
                params = self.model.parameters()
            torch.nn.utils.clip_grad_norm_(params, self.max_grad_norm)
        self.optimizer.step()
        self.optimizer.zero_grad()
        if self.lr_schedule:
            self.lr_schedule.step()
    return loss
示例10: after_train_iter
# 需要导入模块: from apex import amp [as 别名]
# 或者: from apex.amp import scale_loss [as 别名]
def after_train_iter(self, runner):
    """Back-propagate the runner's loss, reduce gradients and step.

    Gradients on both ``runner.model`` and ``runner.optimizer`` are cleared.
    ``runner.outputs['loss']`` is back-propagated through
    ``amp.scale_loss`` and gradients are combined with ``all_reduce_grads``.
    They are clipped when ``self.grad_clip`` is set, then the optimizer
    steps.

    Args:
        runner: Object exposing ``model``, ``optimizer`` and ``outputs``.
    """
    model = runner.model
    optimizer = runner.optimizer

    model.zero_grad()
    optimizer.zero_grad()

    # Per the original note: without mixed precision the scaling below does
    # nothing; otherwise it applies loss scaling for the mixed-precision
    # recipe.
    loss = runner.outputs['loss']
    with amp.scale_loss(loss, optimizer) as scaled_losses:
        scaled_losses.backward()

    all_reduce_grads(model, self.coalesce, self.bucket_size_mb)

    if self.grad_clip is not None:
        self.clip_grads(model.parameters())
    optimizer.step()
示例11: apex_closure
# 需要导入模块: from apex import amp [as 别名]
# 或者: from apex.amp import scale_loss [as 别名]
def apex_closure():
    """Build a closure that runs one forward/backward pass with apex amp.

    ``apex`` is imported here, so it is only needed when this factory is
    called.

    Returns:
        A function taking the ``state`` mapping. It zeroes the optimizer's
        gradients, runs the forward pass, computes the loss and
        back-propagates it through ``amp.scale_loss``, invoking the
        ``on_forward``, ``on_criterion`` and ``on_backward`` callbacks after
        the corresponding stage.
    """
    from apex import amp
    def _apex_closure(state):
        """Run forward, loss and scaled backward on ``state`` in place."""
        # Zero grads
        state[torchbearer.OPTIMIZER].zero_grad()
        _forward_with_exceptions(torchbearer.X, torchbearer.MODEL, torchbearer.Y_PRED, state)
        state[torchbearer.CALLBACK_LIST].on_forward(state)
        # Loss Calculation
        # The criterion is first called with the whole state; if that raises
        # TypeError it is called again with the predictions and targets
        # unpacked as positional arguments.
        # NOTE(review): a TypeError raised inside the criterion itself also
        # triggers the fallback call.
        try:
            state[torchbearer.LOSS] = state[torchbearer.CRITERION](state)
        except TypeError:
            loss_function_params = _get_param_list(state[torchbearer.Y_PRED]) + _get_param_list(state[torchbearer.Y_TRUE])
            state[torchbearer.LOSS] = state[torchbearer.CRITERION](*loss_function_params)
        state[torchbearer.CALLBACK_LIST].on_criterion(state)
        # Backwards pass
        # Extra keyword arguments for backward() come from the state.
        with amp.scale_loss(state[torchbearer.LOSS], state[torchbearer.OPTIMIZER]) as scaled_loss:
            scaled_loss.backward(**state[torchbearer.BACKWARD_ARGS])
        state[torchbearer.CALLBACK_LIST].on_backward(state)
    return _apex_closure
示例12: create_supervised_trainer
# 需要导入模块: from apex import amp [as 别名]
# 或者: from apex.amp import scale_loss [as 别名]
def create_supervised_trainer(
    model, optimizer, loss_fn,
    device=None, non_blocking=False,
    prepare_batch=_prepare_batch,
    output_transform=lambda x, y, y_pred, loss: loss.item(),
    accumulation_steps: int = 1,
    fp16: bool = False,
):
    """Build an ``Engine`` whose update performs one supervised training step.

    Args:
        model: Module to train.
        optimizer: Optimizer stepped every ``accumulation_steps`` iterations.
        loss_fn: Called as ``loss_fn(y_pred, y)``.
        device: Forwarded to ``prepare_batch``.
        non_blocking: Forwarded to ``prepare_batch``.
        prepare_batch: Called as
            ``prepare_batch(batch, device=..., non_blocking=...)`` and
            expected to return ``(x, y)``.
        output_transform: Maps ``(x, y, y_pred, loss)`` to the value the
            engine's update returns; by default ``loss.item()``.
        accumulation_steps: The optimizer steps, and gradients are zeroed,
            when ``engine.state.iteration`` is a multiple of this value.
        fp16: When true, back-propagate through apex ``amp.scale_loss``.

    Returns:
        An ``Engine`` wrapping the update function.
    """
    def update_fn(engine, batch):
        """Run forward/backward on ``batch`` and step on accumulation boundaries."""
        model.train()
        inputs, targets = prepare_batch(batch, device=device, non_blocking=non_blocking)
        predictions = model(inputs)
        loss = loss_fn(predictions, targets)

        if not fp16:
            loss.backward()
        else:
            # apex is imported lazily, so it is only required for fp16 runs.
            from apex import amp
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

        at_boundary = engine.state.iteration % accumulation_steps == 0
        if at_boundary:
            optimizer.step()
            optimizer.zero_grad()
        return output_transform(inputs, targets, predictions, loss)

    return Engine(update_fn)
示例13: _train_batch
# 需要导入模块: from apex import amp [as 别名]
# 或者: from apex.amp import scale_loss [as 别名]
def _train_batch(self, train_iter, accumulation_steps, non_blocking_transfer=True):
    """Train on ``accumulation_steps`` micro-batches, then step once.

    Each micro-batch loss is divided by ``accumulation_steps`` before its
    backward pass. When apex amp is available and the optimizer carries an
    ``_amp_stash`` attribute, the backward pass goes through
    ``amp.scale_loss``, with unscaling delayed on every micro-batch except
    the last.

    Args:
        train_iter: Iterator yielding ``(inputs, labels)`` pairs; advanced
            ``accumulation_steps`` times.
        accumulation_steps: Number of micro-batches accumulated before the
            optimizer step.
        non_blocking_transfer: Forwarded to ``self._move_to_device`` as
            ``non_blocking``.

    Returns:
        The sum of the scaled micro-batch losses as a Python number.
    """
    self.model.train()
    total_loss = None  # for late initialization
    self.optimizer.zero_grad()

    for i in range(accumulation_steps):
        inputs, labels = next(train_iter)
        inputs, labels = self._move_to_device(
            inputs, labels, non_blocking=non_blocking_transfer
        )

        # Forward pass
        outputs = self.model(inputs)
        # Average the loss over the accumulation window. This is done out of
        # place so the criterion's output tensor is never mutated.
        loss = self.criterion(outputs, labels) / accumulation_steps

        # Backward pass
        if IS_AMP_AVAILABLE and hasattr(self.optimizer, "_amp_stash"):
            # For minor performance optimization, see also:
            # https://nvidia.github.io/apex/advanced.html#gradient-accumulation-across-iterations
            is_last_step = (i + 1) == accumulation_steps
            with amp.scale_loss(
                loss, self.optimizer, delay_unscale=not is_last_step
            ) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        # Track the running total on detached values. Adding graph-attached
        # losses in place would chain every micro-batch's autograd graph
        # together and keep them alive until the function returns.
        step_loss = loss.detach()
        total_loss = step_loss if total_loss is None else total_loss + step_loss

    self.optimizer.step()
    return total_loss.item()
示例14: train
# 需要导入模块: from apex import amp [as 别名]
# 或者: from apex.amp import scale_loss [as 别名]
def train(train_step):
    """Train over every batch of ``train_corpus`` and report progress.

    Relies on module-level ``train_corpus``, ``net``, ``optimizer``,
    ``scheduler``, ``args``, ``epoch`` and ``nsampled``. Each batch runs a
    forward pass, a cross-entropy loss, a backward pass through
    ``amp.scale_loss``, gradient clipping on the amp master params and an
    optimizer and scheduler step. A progress line is printed every 125
    batches.

    Args:
        train_step: Global step counter at entry; passed to
            ``scheduler.step`` and incremented once per batch.

    Returns:
        The updated global step counter.
    """
    interval = 125
    train_loader = train_corpus.batch_generator()
    start_time = time.time()

    for batch, item in enumerate(train_loader):
        net.train()
        data, targets, _, batch_len = get_batch(item)
        optimizer.zero_grad()

        # Network
        logits, new_targets = net(data, targets)
        loss = F.cross_entropy(logits.view(-1, nsampled + 1), new_targets)

        # AMP
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.clip)
        optimizer.step()
        scheduler.step(train_step)
        train_step += 1

        if batch % interval == 0:
            elapsed = time.time() - start_time
            # The first report covers a single batch; later ones cover a
            # full interval.
            batches_timed = 1 if batch == 0 else interval
            loss_value = loss.item()
            # math.exp raises OverflowError for very large losses; report an
            # infinite perplexity instead of aborting training.
            try:
                perplexity = math.exp(loss_value)
            except OverflowError:
                perplexity = float("inf")
            print('Epoch: {:3d} | {:5d}/{:5d} batches | lr {:.6f} | ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'
                  .format(epoch, batch, batch_len, scheduler.get_lr()[0],
                          elapsed * 1000 / batches_timed, loss_value, perplexity))
            start_time = time.time()
            sys.stdout.flush()
    return train_step
# Load the saved model.
示例15: update_core
# 需要导入模块: from apex import amp [as 别名]
# 或者: from apex.amp import scale_loss [as 别名]
def update_core(self):
    """Update the model.

    Accumulates gradients over ``self.accum_grad`` batches from the "main"
    iterator and reports the summed loss, nll and count against
    ``optimizer.target``. Gradients are clipped when ``self.gradclip`` is
    set, then the optimizer and the scheduler step.
    """
    # A single iterator and optimizer passed to StandardUpdater.__init__
    # are automatically registered under the name 'main'.
    train_iter = self.get_iterator("main")
    optimizer = self.get_optimizer("main")

    self.model.zero_grad()  # Clear the parameter gradients

    total_loss = 0.0
    total_nll = 0.0
    total_count = 0
    for _ in range(self.accum_grad):
        # Progress the dataset iterator for sentences at each iteration.
        batch = next(train_iter)
        # Concatenate the token IDs to matrices and send them to the device.
        x, t = concat_examples(batch, device=self.device[0], padding=(0, -100))

        if self.device[0] != -1:
            # apex does not support torch.nn.DataParallel
            loss, nll, count = data_parallel(self.model, (x, t), self.device)
        else:
            loss, nll, count = self.model(x, t)

        # backward
        loss = loss.mean() / self.accum_grad
        if not self.use_apex:
            loss.backward()  # Backprop
        else:
            from apex import amp
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

        # accumulate stats
        total_loss += float(loss)
        total_nll += float(nll.sum())
        total_count += int(count.sum())

    stats = (("loss", total_loss), ("nll", total_nll), ("count", total_count))
    for key, value in stats:
        reporter.report({key: value}, optimizer.target)

    if self.gradclip is not None:
        nn.utils.clip_grad_norm_(self.model.parameters(), self.gradclip)
    optimizer.step()  # Update the parameters
    self.scheduler.step(n_iter=self.iteration)