This article collects typical usage examples of the Python method fairseq.utils.clip_grad_norm_. If you are unsure what utils.clip_grad_norm_ does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore other members of the containing module, fairseq.utils.
The following 9 code examples of utils.clip_grad_norm_ are listed, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the site recommend better Python code samples.
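Before diving into the examples, here is a minimal sketch of the basic call. It assumes fairseq is installed and a recent version whose clip_grad_norm_ accepts an iterable of parameters (as in Example 1; the older versions shown further below operate on raw gradient tensors instead). The tiny linear model and the max-norm value of 0.1 are purely illustrative.

import torch
from fairseq import utils

# Illustrative model and loss; any module with gradients would do.
model = torch.nn.Linear(10, 2)
loss = model(torch.randn(4, 10)).sum()
loss.backward()

# Rescale all gradients so their combined L2 norm is at most 0.1.
# The return value is the norm *before* clipping, as a tensor.
grad_norm = utils.clip_grad_norm_(model.parameters(), 0.1)
print(grad_norm)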
Example 1: test_clip_grad_norm_
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import clip_grad_norm_ [as alias]
def test_clip_grad_norm_(self):
    params = torch.nn.Parameter(torch.zeros(5)).requires_grad_(False)
    grad_norm = utils.clip_grad_norm_(params, 1.0)
    self.assertTrue(torch.is_tensor(grad_norm))
    self.assertEqual(grad_norm, 0.0)

    params = [torch.nn.Parameter(torch.zeros(5)) for i in range(3)]
    for p in params:
        p.grad = torch.full((5,), fill_value=2.)
    grad_norm = utils.clip_grad_norm_(params, 1.0)
    exp_grad_norm = torch.full((15,), fill_value=2.).norm()
    self.assertTrue(torch.is_tensor(grad_norm))
    self.assertEqual(grad_norm, exp_grad_norm)

    grad_norm = utils.clip_grad_norm_(params, 1.0)
    self.assertAlmostEqual(grad_norm, torch.tensor(1.0))
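A quick sanity check on the numbers in Example 1: three 5-element gradients filled with 2.0 concatenate into a 15-element vector whose L2 norm is sqrt(15 * 4) ≈ 7.746, which is exp_grad_norm; clipping to max_norm=1.0 rescales every gradient by roughly 1/7.746, so a second call sees a norm of about 1.0, matching the final assertAlmostEqual. The short plain-torch sketch below reproduces that arithmetic; it mimics the usual clip-by-global-norm formula and is not the fairseq implementation itself.

import torch

grads = torch.full((15,), 2.0)                  # three 5-element grads, all equal to 2.0
total_norm = grads.norm()                       # sqrt(15 * 2**2) = sqrt(60) ≈ 7.746
clip_coef = min(1.0 / total_norm.item(), 1.0)   # max_norm / total_norm, capped at 1
clipped = grads * clip_coef

print(total_norm.item())                        # ≈ 7.746 -> exp_grad_norm in the test
print(clipped.norm().item())                    # ≈ 1.0   -> the value asserted after re-clipping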
Example 2: clip_grad_norm
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import clip_grad_norm_ [as alias]
def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
    """Clips gradient norm and updates dynamic loss scaler."""
    self._sync_fp16_grads_to_fp32()
    grad_norm = utils.clip_grad_norm_(self.fp32_params, max_norm, aggregate_norm_fn)

    # detect overflow and adjust loss scale
    if self.scaler is not None:
        overflow = DynamicLossScaler.has_overflow(grad_norm)
        prev_scale = self.scaler.loss_scale
        self.scaler.update_scale(overflow)
        if overflow:
            if self.scaler.loss_scale <= self.min_loss_scale:
                # Use FloatingPointError as an uncommon error that parent
                # functions can safely catch to stop training.
                self.scaler.loss_scale = prev_scale
                raise FloatingPointError((
                    'Minimum loss scale reached ({}). Your loss is probably exploding. '
                    'Try lowering the learning rate, using gradient clipping or '
                    'increasing the batch size.'
                ).format(self.min_loss_scale))
            raise OverflowError('setting loss scale to: ' + str(self.scaler.loss_scale))

    return grad_norm
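The two exceptions in Example 2 define a small protocol for the caller: OverflowError means the dynamic loss scale was just lowered and the current update should simply be skipped, while FloatingPointError means the scale has bottomed out and training should stop. The hypothetical helper below sketches how a training loop might consume them; safe_step and its arguments are illustrative, not part of the fairseq API.

def safe_step(optimizer, max_norm=0.1):
    """Run one clipped optimizer step with FP16 overflow handling (illustrative only)."""
    try:
        grad_norm = optimizer.clip_grad_norm(max_norm)
        optimizer.step()
        return grad_norm
    except OverflowError as e:
        # The loss scale was reduced; discard this batch's gradients and move on.
        print('| WARNING: gradient overflow detected,', e)
        optimizer.zero_grad()
        return None
    # FloatingPointError is deliberately not caught here: reaching the minimum
    # loss scale usually means the loss is diverging, so let it stop training.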
Example 3: test_clip_grad_norm_
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import clip_grad_norm_ [as alias]
def test_clip_grad_norm_(self):
    params = torch.nn.Parameter(torch.zeros(5)).requires_grad_(False)
    grad_norm = utils.clip_grad_norm_(params, 1.0)
    self.assertTrue(torch.is_tensor(grad_norm))
    self.assertEqual(grad_norm, 0.0)

    params = [torch.nn.Parameter(torch.zeros(5)) for i in range(3)]
    for p in params:
        p.grad = torch.full((5,), fill_value=2)
    grad_norm = utils.clip_grad_norm_(params, 1.0)
    exp_grad_norm = torch.full((15,), fill_value=2).norm()
    self.assertTrue(torch.is_tensor(grad_norm))
    self.assertEqual(grad_norm, exp_grad_norm)

    grad_norm = utils.clip_grad_norm_(params, 1.0)
    self.assertAlmostEqual(grad_norm, torch.tensor(1.0))
Example 4: clip_grad_norm
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import clip_grad_norm_ [as alias]
def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
    """Clips gradient norm and updates dynamic loss scaler."""
    self._sync_fp16_grads_to_fp32()
    grad_norm = utils.clip_grad_norm_(self.fp32_params, max_norm, aggregate_norm_fn)

    # detect overflow and adjust loss scale
    overflow = DynamicLossScaler.has_overflow(grad_norm)
    prev_scale = self.scaler.loss_scale
    self.scaler.update_scale(overflow)
    if overflow:
        if self.scaler.loss_scale <= self.min_loss_scale:
            # Use FloatingPointError as an uncommon error that parent
            # functions can safely catch to stop training.
            self.scaler.loss_scale = prev_scale
            raise FloatingPointError((
                'Minimum loss scale reached ({}). Your loss is probably exploding. '
                'Try lowering the learning rate, using gradient clipping or '
                'increasing the batch size.'
            ).format(self.min_loss_scale))
        raise OverflowError('setting loss scale to: ' + str(self.scaler.loss_scale))

    return grad_norm
Example 5: clip_grad_norm
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import clip_grad_norm_ [as alias]
def clip_grad_norm(self, max_norm):
    """Clips gradient norm and updates dynamic loss scaler."""
    self._sync_fp16_grads_to_fp32()
    grad_norm = utils.clip_grad_norm_(self.fp32_params.grad.data, max_norm)

    # detect overflow and adjust loss scale
    overflow = DynamicLossScaler.has_overflow(grad_norm)
    self.scaler.update_scale(overflow)
    if overflow:
        if self.scaler.loss_scale <= self.args.min_loss_scale:
            # Use FloatingPointError as an uncommon error that parent
            # functions can safely catch to stop training.
            raise FloatingPointError((
                'Minimum loss scale reached ({}). Your loss is probably exploding. '
                'Try lowering the learning rate, using gradient clipping or '
                'increasing the batch size.'
            ).format(self.args.min_loss_scale))
        raise OverflowError('setting loss scale to: ' + str(self.scaler.loss_scale))

    return grad_norm
Example 6: _all_reduce_and_rescale
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import clip_grad_norm_ [as alias]
def _all_reduce_and_rescale(self, grad_denom):
    # undo effect of dynamic loss scaling on gradients
    grad_denom *= self.scaler.loss_scale

    if self.args.distributed_world_size > 1:
        # flatten grads into a single buffer
        flat_grads = self._flat_grads = self._get_flat_grads(self._flat_grads)

        # scale gradients to avoid overflow in all-reduce
        flat_grads.div_(self.args.distributed_world_size)
        grad_denom /= self.args.distributed_world_size

        # all-reduce flat grads
        torch.distributed.all_reduce(flat_grads)

        # copy grads back to FP32
        self.fp32_params.grad.data.copy_(flat_grads)
    else:
        # single worker: copy grads directly to FP32
        self._get_flat_grads(out=self.fp32_params.grad.data)

    # rescale and clip grads
    self.fp32_params.grad.data.div_(grad_denom)
    grad_norm = utils.clip_grad_norm_(self.fp32_params.grad.data, self.args.clip_norm)

    # detect overflow and adjust loss scale
    overflow = DynamicLossScaler.has_overflow(grad_norm)
    self.scaler.update_scale(overflow)
    if overflow:
        raise OverflowError('setting loss scale to: ' + str(self.scaler.loss_scale))

    return grad_norm
Example 7: _all_reduce_and_rescale
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import clip_grad_norm_ [as alias]
def _all_reduce_and_rescale(self, grad_denom):
    # flatten grads into a single buffer and all-reduce
    flat_grads = self._flat_grads = self._get_flat_grads(self._flat_grads)
    if self.args.distributed_world_size > 1:
        torch.distributed.all_reduce(flat_grads)

    # rescale and clip gradients
    flat_grads.div_(grad_denom)
    grad_norm = utils.clip_grad_norm_(flat_grads, self.args.clip_norm)

    # copy grads back into model parameters
    self._set_flat_grads(flat_grads)

    return grad_norm
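Examples 6 and 7 (and Example 9 below) all follow the same flatten → all-reduce → rescale → clip → copy-back pattern, hidden behind the private helpers _get_flat_grads/_set_flat_grads. The sketch below spells the pattern out with plain torch operations so the data flow is visible; the function name and the hand-rolled clipping are illustrative rather than fairseq code, and the all-reduce only runs if a process group has been initialized.

import torch
import torch.distributed as dist

def all_reduce_rescale_clip(params, grad_denom, clip_norm):
    # 1) flatten all gradients into one contiguous buffer
    flat = torch.cat([p.grad.data.view(-1) for p in params if p.grad is not None])

    # 2) sum gradients across workers (no-op in single-process runs)
    if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1:
        dist.all_reduce(flat)

    # 3) rescale (e.g. by token count and/or loss scale), then clip by global norm
    flat.div_(grad_denom)
    grad_norm = flat.norm()
    if clip_norm > 0 and grad_norm > clip_norm:
        flat.mul_(clip_norm / (grad_norm + 1e-6))

    # 4) copy the processed gradients back into the parameters
    offset = 0
    for p in params:
        if p.grad is not None:
            numel = p.grad.numel()
            p.grad.data.copy_(flat[offset:offset + numel].view_as(p.grad))
            offset += numel

    return grad_norm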
Example 8: clip_grad_norm
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import clip_grad_norm_ [as alias]
def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
    """Clips gradient norm."""
    return utils.clip_grad_norm_(self.params, max_norm, aggregate_norm_fn)
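Example 8 (like Examples 2 and 4) forwards an optional aggregate_norm_fn to utils.clip_grad_norm_, a hook that lets the locally computed norm be combined across workers when each worker only holds a shard of the gradients. A hypothetical callback with the right shape is sketched below; it assumes an initialized torch.distributed process group and is not taken from fairseq itself.

import torch
import torch.distributed as dist

def aggregate_norm_fn(local_norm: torch.Tensor) -> torch.Tensor:
    # Combine per-worker L2 norms into a global L2 norm:
    # global_norm = sqrt(sum over workers of local_norm ** 2)
    squared = local_norm ** 2
    dist.all_reduce(squared)
    return squared.sqrt()

# e.g. optimizer.clip_grad_norm(max_norm=0.1, aggregate_norm_fn=aggregate_norm_fn)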
Example 9: _all_reduce_and_rescale
# Required import: from fairseq import utils [as alias]
# Or: from fairseq.utils import clip_grad_norm_ [as alias]
def _all_reduce_and_rescale(self, grad_denom, non_empty=True):
    # flatten grads into a single buffer and all-reduce
    flat_grads = self._flat_grads = self._get_flat_grads(out=self._flat_grads, has_grad=non_empty)
    if self.args.distributed_world_size > 1:
        torch.distributed.all_reduce(flat_grads)

    # rescale and clip gradients
    flat_grads.div_(grad_denom)
    grad_norm = utils.clip_grad_norm_(flat_grads, self.args.clip_norm)

    # copy grads back into model parameters
    self._set_flat_grads(flat_grads)

    return grad_norm