本文整理汇总了Python中transformers.AdamW类的典型用法代码示例。如果您正苦于以下问题：Python transformers.AdamW的具体用法？Python transformers.AdamW怎么用？Python transformers.AdamW使用的例子？那么恭喜您，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解AdamW所在模块transformers的用法示例。
在下文中一共展示了transformers.AdamW方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_default_optimizer
# 需要导入模块: import transformers [as 别名]
# 或者: from transformers import AdamW [as 别名]
def get_default_optimizer(model, weight_decay, learning_rate, adam_epsilon):
    """Build an AdamW optimizer with two parameter groups.

    Parameters whose name contains ``"bias"`` or ``"LayerNorm.weight"`` are
    placed in a group with a weight decay of 0.0; all remaining parameters
    are placed in a group that uses ``weight_decay``.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.
        weight_decay: Weight decay applied to the first (decayed) group.
        learning_rate: Forwarded to ``AdamW`` as ``lr``.
        adam_epsilon: Forwarded to ``AdamW`` as ``eps``.

    Returns:
        The constructed ``AdamW`` optimizer.
    """
    exempt_markers = ("bias", "LayerNorm.weight")

    def skips_decay(param_name):
        # Substring match, so e.g. "encoder.LayerNorm.weight" is exempt too.
        return any(marker in param_name for marker in exempt_markers)

    decayed = [
        param for name, param in model.named_parameters() if not skips_decay(name)
    ]
    undecayed = [
        param for name, param in model.named_parameters() if skips_decay(name)
    ]
    groups = [
        {"params": decayed, "weight_decay": weight_decay},
        {"params": undecayed, "weight_decay": 0.0},
    ]
    return AdamW(groups, lr=learning_rate, eps=adam_epsilon)
示例2: create_optimizer
# 需要导入模块: import transformers [as 别名]
# 或者: from transformers import AdamW [as 别名]
def create_optimizer(args, parameters, weight_decay=0.01):
    """Create an AdamW optimizer over ``parameters``.

    Args:
        args: Object whose ``lr`` attribute supplies the learning rate.
        parameters: Parameters (or parameter groups) passed to ``AdamW`` as
            ``params``.
        weight_decay: Weight-decay coefficient. Defaults to 0.01, the value
            that used to be hard-coded, so existing two-argument callers see
            no change.

    Returns:
        The constructed ``AdamW`` optimizer.
    """
    return AdamW(params=parameters, lr=args.lr, weight_decay=weight_decay)
# implementation is from DialoGPT repo
示例3: get_optimizer
# 需要导入模块: import transformers [as 别名]
# 或者: from transformers import AdamW [as 别名]
def get_optimizer(model, lr, weight_decay, model_type='siamese'):
    """Build an AdamW optimizer for the head plus the model's transformer(s).

    The head's parameter groups are built with ``lr``; each transformer's
    groups are built with ``lr / 100``. Which transformer attributes are
    used depends on ``model_type``:

    * ``'siamese'``: ``model.transformer``
    * ``'double'``: ``model.q_transformer`` and ``model.a_transformer``
    * anything else: only ``model.head`` is optimized

    Args:
        model: Object exposing ``head`` and the transformer attribute(s)
            required by ``model_type``.
        lr: Learning rate handed to ``get_optimizer_param_groups`` for the head.
        weight_decay: Weight decay handed to ``get_optimizer_param_groups``.
        model_type: ``'siamese'`` (default) or ``'double'``.

    Returns:
        ``AdamW`` constructed from the combined parameter groups.
    """
    if model_type == 'siamese':
        encoder_names = ('transformer',)
    elif model_type == 'double':
        encoder_names = ('q_transformer', 'a_transformer')
    else:
        encoder_names = ()

    groups = get_optimizer_param_groups(model.head, lr, weight_decay)
    for attr_name in encoder_names:
        encoder = getattr(model, attr_name)
        groups += get_optimizer_param_groups(encoder, lr / 100, weight_decay)

    # No lr is passed to AdamW here; presumably each group carries its own
    # "lr" entry -- confirm against get_optimizer_param_groups.
    return AdamW(groups)
示例4: test_adam_w
# 需要导入模块: import transformers [as 别名]
# 或者: from transformers import AdamW [as 别名]
def test_adam_w(self):
    """Run 100 AdamW steps on a 3-element tensor under MSE loss and check
    that it ends within 1e-2 of the target."""
    weights = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
    goal = torch.tensor([0.4, 0.2, -0.5])
    mse = torch.nn.MSELoss()
    # No warmup, constant schedule, no gradient clipping
    optimizer = AdamW(params=[weights], lr=2e-1, weight_decay=0.0)
    for _step in range(100):
        mse(weights, goal).backward()
        optimizer.step()
        # A plain tensor has no zero_grad(), so the gradient is reset by hand
        # to keep it from accumulating across iterations.
        grad = weights.grad
        grad.detach_()
        grad.zero_()
    self.assertListAlmostEqual(weights.tolist(), [0.4, 0.2, -0.5], tol=1e-2)
示例5: prepare_for_training
# 需要导入模块: import transformers [as 别名]
# 或者: from transformers import AdamW [as 别名]
def prepare_for_training(args, model, checkpoint_state_dict, amp):
    """Create the optimizer, restore checkpoint state, and wrap the model.

    Steps, in order:
      1. Build an ``AdamW`` optimizer with two parameter groups: parameters
         whose name contains ``'bias'`` or ``'LayerNorm.weight'`` get a
         weight decay of 0.0, the rest get ``args.weight_decay``.
      2. If ``amp`` is truthy, run ``amp.initialize`` on model and optimizer
         and, when a checkpoint is given, restore ``checkpoint['amp']``.
      3. If a checkpoint is given, load its ``'optimizer'`` and ``'model'``
         state dicts.
      4. Wrap the model in ``DataParallel`` when ``args.n_gpu > 1`` and in
         ``DistributedDataParallel`` when ``args.local_rank != -1``.

    Args:
        args: Object providing ``weight_decay``, ``learning_rate``,
            ``adam_epsilon``, ``n_gpu``, ``local_rank`` and, when ``amp`` is
            used, ``fp16_opt_level``.
        model: Module exposing ``named_parameters()`` and ``load_state_dict()``.
        checkpoint_state_dict: Falsy for a fresh run; otherwise a mapping with
            ``'optimizer'`` and ``'model'`` entries (plus ``'amp'`` when
            ``amp`` is used).
        amp: Mixed-precision helper offering ``initialize`` and
            ``load_state_dict`` (presumably ``apex.amp`` -- confirm with the
            caller), or a falsy value to skip mixed precision.

    Returns:
        Tuple ``(model, optimizer)``; the model may be wrapped.
    """
    decay_exempt = ['bias', 'LayerNorm.weight']

    def select(exempt):
        # Pick the parameters whose "is exempt from decay" status equals `exempt`.
        return [
            param
            for name, param in model.named_parameters()
            if any(tag in name for tag in decay_exempt) is exempt
        ]

    param_groups = [
        {'params': select(False), 'weight_decay': args.weight_decay},
        {'params': select(True), 'weight_decay': 0.0},
    ]
    optimizer = AdamW(param_groups, lr=args.learning_rate, eps=args.adam_epsilon)

    if amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
        # The amp state is restored only when amp is actually in use.
        if checkpoint_state_dict:
            amp.load_state_dict(checkpoint_state_dict['amp'])

    if checkpoint_state_dict:
        optimizer.load_state_dict(checkpoint_state_dict['optimizer'])
        model.load_state_dict(checkpoint_state_dict['model'])

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        ddp_options = {
            'device_ids': [args.local_rank],
            'output_device': args.local_rank,
            'find_unused_parameters': True,
        }
        model = torch.nn.parallel.DistributedDataParallel(model, **ddp_options)

    return model, optimizer