本文整理匯總了Python中torch.optim.Adam類的典型用法代碼示例。如果您正苦於以下問題：Python Adam類的具體用法？Python Adam怎麼用？Python Adam使用的例子？那麼，這裡精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了Adam類的15個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: pretrain
def pretrain(self, train_data, corrupter, tester):
    """Pre-train ``self.mdl`` with Adam on a pairwise loss and return the best test score.

    Args:
        train_data: Three tensors ``(src, rel, dst)``; every epoch all three
            are shuffled with the same random permutation of ``len(src)``.
        corrupter: Object whose ``corrupt(src, rel, dst)`` returns the
            corrupted ``(src, dst)`` tensors for the shuffled data.
        tester: Zero-argument callable returning a score; it is called every
            ``self.config.epoch_per_test`` epochs.

    Returns:
        The highest score returned by ``tester`` (``0`` if no score exceeded 0).
        The model is saved each time the score improves.
    """
    src, rel, dst = train_data
    n_train = len(src)
    optimizer = Adam(self.mdl.parameters())
    n_epoch = self.config.n_epoch
    n_batch = self.config.n_batch
    best_perf = 0
    for epoch in range(n_epoch):
        epoch_loss = 0
        # Reshuffle the triples, then draw fresh corruptions for this epoch.
        rand_idx = t.randperm(n_train)
        src = src[rand_idx]
        rel = rel[rand_idx]
        dst = dst[rand_idx]
        src_corrupted, dst_corrupted = corrupter.corrupt(src, rel, dst)
        src_cuda = src.cuda()
        rel_cuda = rel.cuda()
        dst_cuda = dst.cuda()
        src_corrupted = src_corrupted.cuda()
        dst_corrupted = dst_corrupted.cuda()
        batches = batch_by_num(n_batch, src_cuda, rel_cuda, dst_cuda, src_corrupted, dst_corrupted,
                               n_sample=n_train)
        for s0, r, t0, s1, t1 in batches:
            self.mdl.zero_grad()
            loss = t.sum(self.mdl.pair_loss(Variable(s0), Variable(r), Variable(t0), Variable(s1), Variable(t1)))
            loss.backward()
            optimizer.step()
            self.mdl.constraint()
            # ``loss`` is a scalar; ``.item()`` replaces ``loss.data[0]``, which
            # raises on 0-dim tensors in PyTorch >= 0.5.
            epoch_loss += loss.item()
        logging.info('Epoch %d/%d, Loss=%f', epoch + 1, n_epoch, epoch_loss / n_train)
        if (epoch + 1) % self.config.epoch_per_test == 0:
            test_perf = tester()
            if test_perf > best_perf:
                # Keep only the checkpoint with the best score seen so far.
                self.save(os.path.join(config().task.dir, self.config.model_file))
                best_perf = test_perf
    return best_perf
示例2: __init__
def __init__(self, memory, nb_status, nb_actions, action_noise=None,
             gamma=0.99, tau=0.001, normalize_observations=True,
             batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.),
             actor_lr=1e-4, critic_lr=1e-3):
    """Set up the actor/critic networks, their targets and Adam optimizers.

    Args:
        memory: Replay buffer object, stored as ``self.memory``.
        nb_status: First constructor argument of ``Actor`` and ``Critic``.
        nb_actions: Second constructor argument of ``Actor`` and ``Critic``.
        action_noise: Optional noise object, stored as ``self.action_noise``.
        gamma: Stored as ``self.discount``.
        tau: Stored as ``self.tau``.
        normalize_observations: When true, a ``RunningMeanStd`` is created as
            ``self.obs_rms``; otherwise ``self.obs_rms`` is ``None``.
        batch_size: Stored as ``self.batch_size``.
        observation_range: Stored as ``self.observation_range``.
        action_range: Stored as ``self.action_range``.
        actor_lr: Learning rate of the actor's Adam optimizer.
        critic_lr: Learning rate of the critic's Adam optimizer.
    """
    # Plain configuration values.
    self.nb_status, self.nb_actions = nb_status, nb_actions
    self.action_range, self.observation_range = action_range, observation_range
    self.normalize_observations = normalize_observations

    # Online network, target network and optimizer for the actor ...
    self.actor = Actor(self.nb_status, self.nb_actions)
    self.actor_target = Actor(self.nb_status, self.nb_actions)
    self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)

    # ... and the same trio for the critic.
    self.critic = Critic(self.nb_status, self.nb_actions)
    self.critic_target = Critic(self.nb_status, self.nb_actions)
    self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

    # Replay buffer and exploration noise are supplied by the caller.
    self.memory, self.action_noise = memory, action_noise

    # Hyper-parameters.
    self.batch_size, self.tau, self.discount = batch_size, tau, gamma

    self.obs_rms = RunningMeanStd() if self.normalize_observations else None
示例3: pretrain
def pretrain(self, train_data, corrupter, tester):
    """Pre-train ``self.mdl`` with Adam on a softmax loss and return the best test score.

    Args:
        train_data: Three tensors ``(src, rel, dst)``; they are shuffled with
            one shared random permutation of ``len(src)``.
        corrupter: Object whose ``corrupt(src, rel, dst)`` returns the
            ``(src, rel, dst)`` tensors that are fed to the loss.
        tester: Zero-argument callable returning a score; it is called every
            ``self.config.epoch_per_test`` epochs.

    Returns:
        The highest score returned by ``tester`` (``0`` if no score exceeded 0).
        The model is saved each time the score improves.
    """
    src, rel, dst = train_data
    n_train = len(src)
    n_epoch = self.config.n_epoch
    n_batch = self.config.n_batch
    optimizer = Adam(self.mdl.parameters(), weight_decay=self.weight_decay)
    best_perf = 0
    for epoch in range(n_epoch):
        epoch_loss = 0
        # Reshuffle and re-corrupt only every ``sample_freq`` epochs; the
        # epochs in between reuse the previous corrupted tensors.
        if epoch % self.config.sample_freq == 0:
            rand_idx = t.randperm(n_train)
            src = src[rand_idx]
            rel = rel[rand_idx]
            dst = dst[rand_idx]
            src_corrupted, rel_corrupted, dst_corrupted = corrupter.corrupt(src, rel, dst)
            src_corrupted = src_corrupted.cuda()
            rel_corrupted = rel_corrupted.cuda()
            dst_corrupted = dst_corrupted.cuda()
        for ss, rs, ts in batch_by_num(n_batch, src_corrupted, rel_corrupted, dst_corrupted, n_sample=n_train):
            self.mdl.zero_grad()
            # Every row uses class 0 as its target
            # (presumably the uncorrupted triple -- confirm against the corrupter).
            label = t.zeros(len(ss)).type(t.LongTensor).cuda()
            loss = t.sum(self.mdl.softmax_loss(Variable(ss), Variable(rs), Variable(ts), label))
            loss.backward()
            optimizer.step()
            # ``loss`` is a scalar; ``.item()`` replaces ``loss.data[0]``, which
            # raises on 0-dim tensors in PyTorch >= 0.5.
            epoch_loss += loss.item()
        logging.info('Epoch %d/%d, Loss=%f', epoch + 1, n_epoch, epoch_loss / n_train)
        if (epoch + 1) % self.config.epoch_per_test == 0:
            test_perf = tester()
            if test_perf > best_perf:
                # Keep only the checkpoint with the best score seen so far.
                self.save(os.path.join(config().task.dir, self.config.model_file))
                best_perf = test_perf
    return best_perf
示例4: _init_optimizers
def _init_optimizers(self):
    """Create the Adam optimizers for the generator and the critic on demand.

    Nothing happens when both ``self.generator_optim`` and
    ``self.critic_optim`` are already set. If either one is ``None``, both
    are (re)built over the parameters that have ``requires_grad`` enabled,
    using ``lr=0.0001`` and ``betas=(0, 0.9)``.
    """
    if self.generator_optim is not None and self.critic_optim is not None:
        return

    from torch.optim import Adam

    def trainable(module):
        # Lazily yield only the parameters that take part in training.
        return (param for param in module.parameters() if param.requires_grad)

    generator_params = trainable(self.generator)
    critic_params = trainable(self.critic)
    adam_settings = dict(lr=0.0001, betas=(0, 0.9))
    self.generator_optim = Adam(generator_params, **adam_settings)
    self.critic_optim = Adam(critic_params, **adam_settings)
示例5: __init__
def __init__(self, nb_status, nb_actions, args, writer):
    """Build networks, optimizers, replay memory and exploration state from ``args``.

    Args:
        nb_status: Base state size; multiplied by ``args.window_length``
            unless ``args.pic`` is set, in which case ``args.pic_status`` is
            used instead.
        nb_actions: Action size passed to the networks and the random process.
        args: Namespace providing the hyper-parameters read below.
        writer: Stored as ``self.writer``.
    """
    self.clip_actor_grad = args.clip_actor_grad
    self.nb_status = nb_status * args.window_length
    self.nb_actions = nb_actions
    self.discrete = args.discrete
    self.pic = args.pic
    self.writer = writer
    self.select_time = 0
    if self.pic:
        # Image input: the state size is replaced by ``args.pic_status``.
        self.nb_status = args.pic_status

    # Shared keyword arguments for the actor and critic networks.
    net_cfg = dict(
        hidden1=args.hidden1,
        hidden2=args.hidden2,
        use_bn=args.bn,
        init_method=args.init_method,
    )

    if args.pic:
        self.cnn = CNN(1, args.pic_status)
        self.cnn_target = CNN(1, args.pic_status)
        self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)

    dims = (self.nb_status, self.nb_actions)
    self.actor = Actor(*dims, **net_cfg)
    self.actor_target = Actor(*dims, **net_cfg)
    self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

    self.critic = Critic(*dims, **net_cfg)
    self.critic_target = Critic(*dims, **net_cfg)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

    # Make sure each target starts with the same weights as its online network.
    sync_pairs = [(self.actor_target, self.actor), (self.critic_target, self.critic)]
    if args.pic:
        sync_pairs.append((self.cnn_target, self.cnn))
    for target, source in sync_pairs:
        hard_update(target, source)

    # Replay buffer and exploration noise.
    self.memory = rpm(args.rmsize)
    self.random_process = Myrandom(size=nb_actions)

    # Hyper-parameters.
    self.batch_size = args.batch_size
    self.tau = args.tau
    self.discount = args.discount
    self.depsilon = 1.0 / args.epsilon

    self.epsilon = 1.0
    self.s_t = None  # Most recent state
    self.a_t = None  # Most recent action
    self.use_cuda = args.cuda

    if self.use_cuda:
        self.cuda()
示例6: __init__
def __init__(self, nb_status, nb_actions, args):
    """Build three actors, one critic, their targets, optimizers and replay memory.

    Args:
        nb_status: Base state size; multiplied by ``args.window_length``
            unless ``args.pic`` is set, in which case ``args.pic_status`` is
            used instead.
        nb_actions: Action size passed to the networks and the random process.
        args: Namespace providing the hyper-parameters read below.
    """
    self.num_actor = 3
    self.nb_status = nb_status * args.window_length
    self.nb_actions = nb_actions
    self.discrete = args.discrete
    self.pic = args.pic
    if self.pic:
        # Image input: the state size is replaced by ``args.pic_status``.
        self.nb_status = args.pic_status

    # Keyword arguments for the critic networks.
    net_cfg = dict(hidden1=args.hidden1, hidden2=args.hidden2, use_bn=args.bn)

    if args.pic:
        self.cnn = CNN(3, args.pic_status)
        self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)

    dims = (self.nb_status, self.nb_actions)
    # NOTE(review): unlike the critic, the actors are built without ``net_cfg``
    # -- confirm this is intentional.
    self.actors = [Actor(*dims) for _ in range(self.num_actor)]
    self.actor_targets = [Actor(*dims) for _ in range(self.num_actor)]
    self.actor_optims = [Adam(actor.parameters(), lr=args.prate) for actor in self.actors]

    self.critic = Critic(*dims, **net_cfg)
    self.critic_target = Critic(*dims, **net_cfg)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

    # Make sure each target starts with the same weights as its online network.
    for target, source in zip(self.actor_targets, self.actors):
        hard_update(target, source)
    hard_update(self.critic_target, self.critic)

    # Replay buffer and exploration noise.
    self.memory = rpm(args.rmsize)
    self.random_process = Myrandom(size=nb_actions)

    # Hyper-parameters.
    self.batch_size = args.batch_size
    self.tau = args.tau
    self.discount = args.discount
    self.depsilon = 1.0 / args.epsilon

    self.epsilon = 1.0
    self.s_t = None  # Most recent state
    self.a_t = None  # Most recent action
    self.use_cuda = args.cuda

    if self.use_cuda:
        self.cuda()
示例7: __init__
def __init__(self, gamma, tau, hidden_size, num_inputs, action_space):
    """Create actor and critic networks, their target copies and Adam optimizers.

    Args:
        gamma: Stored as ``self.gamma``.
        tau: Stored as ``self.tau``.
        hidden_size: First constructor argument of ``Actor`` and ``Critic``.
        num_inputs: Second constructor argument of ``Actor`` and ``Critic``.
        action_space: Third constructor argument of ``Actor`` and ``Critic``.
    """
    self.num_inputs, self.action_space = num_inputs, action_space

    net_args = (hidden_size, self.num_inputs, self.action_space)

    # The actor is optimized with lr=1e-4.
    self.actor = Actor(*net_args)
    self.actor_target = Actor(*net_args)
    self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

    # The critic is optimized with lr=1e-3.
    self.critic = Critic(*net_args)
    self.critic_target = Critic(*net_args)
    self.critic_optim = Adam(self.critic.parameters(), lr=1e-3)

    self.gamma, self.tau = gamma, tau

    # Make sure each target starts with the same weights as its online network.
    for target, source in ((self.actor_target, self.actor),
                           (self.critic_target, self.critic)):
        hard_update(target, source)
示例8: __init__
def __init__(self, gamma, tau, hidden_size, num_inputs, action_space):
    """Create the policy network, its target copy and an Adam optimizer.

    Args:
        gamma: Stored as ``self.gamma``.
        tau: Stored as ``self.tau``.
        hidden_size: First constructor argument of ``Policy``.
        num_inputs: Second constructor argument of ``Policy``.
        action_space: Third constructor argument of ``Policy``.
    """
    self.action_space, self.num_inputs = action_space, num_inputs

    policy_args = (hidden_size, num_inputs, action_space)
    self.model = Policy(*policy_args)
    self.target_model = Policy(*policy_args)
    self.optimizer = Adam(self.model.parameters(), lr=1e-3)

    self.gamma, self.tau = gamma, tau

    # Copy the online network into the target network via ``hard_update``.
    hard_update(self.target_model, self.model)
示例9: train
def train(self, data):
    """Train ``self.network`` on anchor/positive/negative batches and return it.

    Args:
        data: Mapping whose ``'data'`` entry holds the training samples.

    Returns:
        ``self.network`` after the iterations yielded by ``self._driver``.
    """
    samples = data['data']
    self.network.train()
    optimizer = Adam(trainable_parameters(self.network), lr=1e-5)

    for epoch, batch in self._driver(samples):
        self.network.zero_grad()

        # Anchors: one freshly selected batch.
        anchor_indices, anchor = self._select_batch(samples)
        anchor_embedding = self._apply_network_and_normalize(self._variable(anchor))

        # Negatives: a second, independently selected batch.
        _, negative = self._select_batch(samples)
        negative_embedding = self._apply_network_and_normalize(self._variable(negative))

        # Positives: the anchors passed through one randomly chosen deformation.
        deformation = choice(self.deformations)
        positive = deformation(anchor, samples[anchor_indices, ...]).astype(np.float32)
        positive_embedding = self._apply_network_and_normalize(self._variable(positive))

        error = self.loss.forward(anchor_embedding, positive_embedding, negative_embedding)
        error.backward()
        optimizer.step()

        error_value = float(error.data.cpu().numpy().squeeze())
        self.on_batch_complete(
            epoch=epoch,
            batch=batch,
            error=error_value,
            deformation=deformation.__name__)

    return self.network
示例10: learn
def learn(learning_rate, iterations, x, y, validation=None, stop_early=False, run_comment=''):
    """Fit a one-layer linear model with Adam and a summed BCE-with-logits loss.

    Args:
        learning_rate: Learning rate passed to Adam.
        iterations: Number of full-batch optimization steps.
        x: Training inputs; ``len(x[0])`` sets the layer's input width.
        y: Training targets; ``len(y[0])`` sets the layer's output width.
        validation: Optional ``(inputs, targets)`` pair. When given, the
            validation loss is logged every iteration.
        stop_early: When true and ``validation`` is given, stop as soon as the
            validation loss rises and restore the parameters from the
            previous iteration's validation check.
        run_comment: Comment passed to the ``SummaryWriter``.

    Returns:
        The trained model.
    """
    # Local import so the block does not depend on file-level imports.
    from copy import deepcopy

    writer = SummaryWriter(comment=run_comment)
    model = Sequential(
        Linear(len(x[0]), len(y[0]), bias=True)  # len(x[0]) inputs -> len(y[0]) outputs
    )

    loss_fn = BCEWithLogitsLoss(reduction='sum')  # reduction=mean converges slower.
    # TODO: Add an option to twiddle pos_weight, which lets us trade off precision and recall. Maybe also graph using add_pr_curve(), which can show how that tradeoff is going.
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if validation:
        validation_ins, validation_outs = validation
        previous_validation_loss = None
        previous_model = None
    with progressbar(range(iterations)) as bar:
        for t in bar:
            y_pred = model(x)  # Make predictions.
            loss = loss_fn(y_pred, y)
            writer.add_scalar('loss', loss, t)
            if validation:
                validation_loss = loss_fn(model(validation_ins), validation_outs)
                if stop_early:
                    if previous_validation_loss is not None and previous_validation_loss < validation_loss:
                        print('Stopping early at iteration {t} because validation error rose.'.format(t=t))
                        model.load_state_dict(previous_model)
                        break
                    else:
                        previous_validation_loss = validation_loss
                        # state_dict() returns references to the live parameter
                        # tensors, which optimizer.step() updates in place.
                        # Deep-copy so the snapshot really holds the parameters
                        # that produced ``previous_validation_loss``.
                        previous_model = deepcopy(model.state_dict())
                writer.add_scalar('validation_loss', validation_loss, t)
            writer.add_scalar('training_accuracy_per_tag', accuracy_per_tag(model, x, y), t)
            optimizer.zero_grad()  # Zero the gradients.
            loss.backward()  # Compute gradients.
            optimizer.step()

    # Horizontal axis is what confidence. Vertical is how many samples were that confidence.
    writer.add_histogram('confidence', confidences(model, x), t)
    writer.close()
    return model
示例11: train
def train(self, training_data: TrainingData) -> None:
    """Train an ``ElmoModel`` classifier until the training manager says stop.

    The dataset is preprocessed and batched, the model is built (and moved
    to the GPU when ``CUDA`` is set), and Adam optimizes the classifier
    parameters plus the ELMo scalar-mix parameters. Each loop iteration runs
    one training epoch and one validation epoch, then asks the manager
    whether to stop.

    Args:
        training_data: Raw training data handed to ``preprocess_dataset``.
    """
    (x_train, y_train, x_val, y_val,
     _vocab, class_to_i, i_to_class) = preprocess_dataset(training_data)
    self.class_to_i, self.i_to_class = class_to_i, i_to_class

    log.info('Batchifying data')
    train_batches = batchify(x_train, y_train, shuffle=True)
    val_batches = batchify(x_val, y_val, shuffle=False)

    self.model = ElmoModel(len(i_to_class), dropout=self.dropout)
    if CUDA:
        self.model = self.model.cuda()
    log.info(f'Parameters:\n{self.parameters()}')
    log.info(f'Model:\n{self.model}')

    # Only the classifier and the ELMo scalar mixes are handed to the optimizer.
    parameters = list(self.model.classifier.parameters())
    parameters += [param
                   for mix in self.model.elmo._scalar_mixes
                   for param in mix.parameters()]
    self.optimizer = Adam(parameters)
    self.criterion = nn.CrossEntropyLoss()
    # mode='max' because the scheduler is stepped with the validation accuracy.
    self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer, patience=5, verbose=True, mode='max')

    temp_prefix = get_tmp_filename()
    self.model_file = f'{temp_prefix}.pt'
    callbacks = [
        BaseLogger(log_func=log.info),
        TerminateOnNaN(),
        EarlyStopping(monitor='test_acc', patience=10, verbose=1),
        MaxEpochStopping(100),
        ModelCheckpoint(create_save_model(self.model), self.model_file, monitor='test_acc'),
    ]
    manager = TrainingManager(callbacks)

    log.info('Starting training')
    epoch = 0
    while True:
        self.model.train()
        train_acc, train_loss, train_time = self.run_epoch(train_batches)
        random.shuffle(train_batches)

        self.model.eval()
        test_acc, test_loss, test_time = self.run_epoch(val_batches, train=False)

        stop_training, reasons = manager.instruct(
            train_time, train_loss, train_acc,
            test_time, test_loss, test_acc
        )
        if stop_training:
            log.info(' '.join(reasons))
            break
        self.scheduler.step(test_acc)
        epoch += 1
示例12: __init__
def __init__(self, args, exp_model, logging_func):
    """Set up replay memory, the online and target DQNs and the Adam optimizer.

    Args:
        args: Namespace providing the settings read below.
        exp_model: Exploration model, stored and passed to ``ExpReplay``.
        logging_func: Mapping whose ``"log"`` entry is stored as ``self.log``.
    """
    self.args = args

    # Exploration model and logger.
    self.exp_model = exp_model
    self.log = logging_func["log"]

    # Experience replay.
    self.replay = ExpReplay(args.exp_replay_size, args.stale_limit, exp_model, args, priority=self.args.prioritized)

    # Online DQN and target DQN are built from the same model class.
    model = get_models(args.model)
    self.dqn = model(actions=args.actions, atoms=args.atoms)
    self.target_dqn = model(actions=args.actions, atoms=args.atoms)

    def element_count(shape):
        # Product of all dimensions; 1 for an empty shape.
        total = 1
        for extent in shape:
            total *= extent
        return total

    dqn_params = sum(element_count(weight.size()) for weight in self.dqn.parameters())
    print("Distrib DQN has {:,} parameters.".format(dqn_params))

    self.target_dqn.eval()

    if args.gpu:
        print("Moving models to GPU.")
        self.dqn.cuda()
        self.target_dqn.cuda()

    # Optimizer (created after the optional move to the GPU).
    self.optimizer = Adam(self.dqn.parameters(), lr=args.lr)

    self.T = 0
    self.target_sync_T = -self.args.t_max
示例13: DDPG
class DDPG(object):
    """DDPG agent with an optional CNN front end for image observations.

    Only part of the class is visible in this excerpt; ``update_policy`` is
    cut off after the critic optimizer step.
    """

    def __init__(self, nb_status, nb_actions, args, writer):
        """Build networks, optimizers, replay memory and exploration state from ``args``.

        Args:
            nb_status: Base state size; multiplied by ``args.window_length``
                unless ``args.pic`` is set, in which case ``args.pic_status``
                is used instead.
            nb_actions: Action size passed to the networks and the random process.
            args: Namespace providing the hyper-parameters read below.
            writer: Stored as ``self.writer``.
        """
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0
        if self.pic:
            self.nb_status = args.pic_status

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn,
            'init_method': args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)

        # Create replay buffer
        self.memory = rpm(args.rmsize)  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda

        if self.use_cuda:
            self.cuda()

    def normalize(self, pic):
        """Move the last axis of ``pic`` to the front: ``(a, b, c) -> (c, a, b)``."""
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self):
        """Sample a batch from memory and take one critic optimization step.

        The remainder of this method is omitted in the source excerpt.
        """
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            # Image input: reorder axes, then encode with the CNN (current
            # states) and the target CNN (next states).
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            state_batch = self.cnn(state_batch)
            next_state_batch = np.array([self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn_target(next_state_batch)
            next_q_values = self.critic_target([
                next_state_batch,
                self.actor_target(next_state_batch)
            ])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])
        next_q_values.volatile = False

        # ``np.float`` was only an alias of the builtin ``float`` and was
        # removed in NumPy 1.24; ``astype(float)`` yields the same dtype on
        # every NumPy version.
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic:
            self.cnn.zero_grad()
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        # ......... part of the code is omitted here .........
示例14: RnnGuesser
class RnnGuesser(AbstractGuesser):
    """Guesser that trains an ``RnnModel`` with Adam on the Quiz Bowl dataset.

    Only part of the class is visible in this excerpt; ``train`` is cut off
    inside its training loop.
    """

    def __init__(self, config_num):
        """Read the hyper-parameters for ``config_num`` and reset training state.

        When ``self.config_num`` is ``None`` no configuration is read and the
        hyper-parameter attributes are left unset.
        """
        super(RnnGuesser, self).__init__(config_num)
        if self.config_num is not None:
            guesser_conf = conf['guessers']['qanta.guesser.rnn.RnnGuesser'][self.config_num]
            self.gradient_clip = guesser_conf['gradient_clip']
            self.n_hidden_units = guesser_conf['n_hidden_units']
            self.n_hidden_layers = guesser_conf['n_hidden_layers']
            self.nn_dropout = guesser_conf['dropout']
            self.batch_size = guesser_conf['batch_size']
            self.use_wiki = guesser_conf['use_wiki']
            self.n_wiki_sentences = guesser_conf['n_wiki_sentences']
            self.wiki_title_replace_token = guesser_conf['wiki_title_replace_token']
            self.lowercase = guesser_conf['lowercase']
            self.random_seed = guesser_conf['random_seed']

        # Everything below is filled in by ``train``.
        self.page_field: Optional[Field] = None
        self.qanta_id_field: Optional[Field] = None
        self.text_field: Optional[Field] = None
        self.n_classes = None
        self.emb_dim = None
        self.model_file = None
        self.model: Optional[RnnModel] = None
        self.optimizer = None
        self.criterion = None
        self.scheduler = None

    @property
    def ans_to_i(self):
        """String-to-index mapping of the page field's vocabulary."""
        return self.page_field.vocab.stoi

    @property
    def i_to_ans(self):
        """Index-to-string mapping of the page field's vocabulary."""
        return self.page_field.vocab.itos

    def parameters(self):
        """Return the configuration entry selected by ``self.config_num``."""
        return conf['guessers']['qanta.guesser.rnn.RnnGuesser'][self.config_num]

    def train(self, training_data):
        """Build the data iterators and the model, then run the training loop.

        ``training_data`` is only used for logging its size in the visible
        code; the iterators come from ``QuizBowl.iters``. The end of this
        method is omitted in the source excerpt.
        """
        log.info('Loading Quiz Bowl dataset')
        train_iter, val_iter, dev_iter = QuizBowl.iters(
            batch_size=self.batch_size, lower=self.lowercase,
            use_wiki=self.use_wiki, n_wiki_sentences=self.n_wiki_sentences,
            replace_title_mentions=self.wiki_title_replace_token,
            sort_within_batch=True
        )
        log.info(f'Training Data={len(training_data[0])}')
        log.info(f'N Train={len(train_iter.dataset.examples)}')
        log.info(f'N Test={len(val_iter.dataset.examples)}')
        fields: Dict[str, Field] = train_iter.dataset.fields
        self.page_field = fields['page']
        # One output class per entry in the page vocabulary.
        self.n_classes = len(self.ans_to_i)
        self.qanta_id_field = fields['qanta_id']
        self.emb_dim = 300
        self.text_field = fields['text']
        log.info(f'Text Vocab={len(self.text_field.vocab)}')
        log.info('Initializing Model')
        self.model = RnnModel(
            self.n_classes,
            text_field=self.text_field,
            emb_dim=self.emb_dim,
            n_hidden_units=self.n_hidden_units, n_hidden_layers=self.n_hidden_layers,
            nn_dropout=self.nn_dropout
        )
        if CUDA:
            self.model = self.model.cuda()
        log.info(f'Parameters:\n{self.parameters()}')
        log.info(f'Model:\n{self.model}')
        self.optimizer = Adam(self.model.parameters())
        self.criterion = nn.CrossEntropyLoss()
        # mode='max': presumably stepped with the validation accuracy in the
        # omitted part of the loop -- confirm.
        self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer, patience=5, verbose=True, mode='max')
        temp_prefix = get_tmp_filename()
        self.model_file = f'{temp_prefix}.pt'
        # The manager's callbacks handle logging, NaN termination, early
        # stopping, the epoch cap and checkpointing.
        manager = TrainingManager([
            BaseLogger(log_func=log.info), TerminateOnNaN(), EarlyStopping(monitor='test_acc', patience=10, verbose=1),
            MaxEpochStopping(100), ModelCheckpoint(create_save_model(self.model), self.model_file, monitor='test_acc')
        ])
        log.info('Starting training')
        epoch = 0
        while True:
            # One training pass, then one evaluation pass over the validation iterator.
            self.model.train()
            train_acc, train_loss, train_time = self.run_epoch(train_iter)
            self.model.eval()
            test_acc, test_loss, test_time = self.run_epoch(val_iter)
            stop_training, reasons = manager.instruct(
                train_time, train_loss, train_acc,
                test_time, test_loss, test_acc
            )
            if stop_training:
                log.info(' '.join(reasons))
                # ......... part of the code is omitted here .........
示例15: DDPG
class DDPG(object):
    """DDPG agent holding actor/critic networks, their targets and a replay memory."""

    def __init__(self, memory, nb_status, nb_actions, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.),
                 actor_lr=1e-4, critic_lr=1e-3):
        """Create the networks and optimizers and store the hyper-parameters.

        Args:
            memory: Replay buffer providing ``append`` and ``sample``.
            nb_status: First constructor argument of ``Actor`` and ``Critic``.
            nb_actions: Second constructor argument of ``Actor`` and ``Critic``.
            action_noise: Optional callable noise source with a ``reset`` method.
            gamma: Discount factor, stored as ``self.discount``.
            tau: Coefficient passed to ``soft_update``.
            normalize_observations: When true, a ``RunningMeanStd`` is kept
                as ``self.obs_rms`` and updated in ``store_transition``.
            batch_size: Batch size used by ``train``.
            observation_range: Stored as ``self.observation_range``.
            action_range: ``(low, high)`` bounds used to clip actions in ``pi``.
            actor_lr: Learning rate of the actor's Adam optimizer.
            critic_lr: Learning rate of the critic's Adam optimizer.
        """
        self.nb_status = nb_status
        self.nb_actions = nb_actions
        self.action_range = action_range
        self.observation_range = observation_range
        self.normalize_observations = normalize_observations

        self.actor = Actor(self.nb_status, self.nb_actions)
        self.actor_target = Actor(self.nb_status, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)

        self.critic = Critic(self.nb_status, self.nb_actions)
        self.critic_target = Critic(self.nb_status, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        # Create replay buffer
        self.memory = memory  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.action_noise = action_noise

        # Hyper-parameters
        self.batch_size = batch_size
        self.tau = tau
        self.discount = gamma

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd()
        else:
            self.obs_rms = None

    def pi(self, obs, apply_noise=True, compute_Q=True):
        """Select an action for a single observation.

        Args:
            obs: One observation; it is wrapped into a batch of size one.
            apply_noise: Add ``self.action_noise()`` to the action when a
                noise source is configured.
            compute_Q: Also evaluate the critic for the un-noised action.

        Returns:
            ``(action, q)`` where ``action`` is clipped to
            ``self.action_range`` and ``q`` is the critic output ``[0][0]``,
            or ``None`` when ``compute_Q`` is false.
        """
        obs = np.array([obs])
        action = to_numpy(self.actor(to_tensor(obs))).squeeze(0)
        if compute_Q:
            q = self.critic([to_tensor(obs), to_tensor(action)]).cpu().data
        else:
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise

        action = np.clip(action, self.action_range[0], self.action_range[1])
        # Indexing ``None`` used to raise TypeError whenever compute_Q was false.
        return action, (q[0][0] if q is not None else None)

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        """Append one transition to memory and update the observation statistics."""
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        """Run one critic step, one actor step and a soft target update.

        Returns:
            ``(value_loss, policy_loss)`` read from the two loss tensors.
        """
        # NOTE(review): ``volatile`` and ``.data[0]`` are pre-0.4 PyTorch
        # idioms; they are left untouched because ``to_tensor`` is defined
        # outside this block -- confirm the targeted PyTorch version.
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        next_q_values = self.critic_target([
            to_tensor(batch['obs1'], volatile=True),
            self.actor_target(to_tensor(batch['obs1'], volatile=True))])
        next_q_values.volatile = False

        # Target: reward + discount * (1 - terminal) * Q_target(next obs, target action).
        target_q_batch = to_tensor(batch['rewards']) + \
            self.discount * to_tensor(1 - batch['terminals1'].astype('float32')) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(batch['obs0']), to_tensor(batch['actions'])])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update: minimize the negated mean critic value of the actor's actions.
        self.actor.zero_grad()
        policy_loss = -self.critic([to_tensor(batch['obs0']), self.actor(to_tensor(batch['obs0']))]).mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return value_loss.cpu().data[0], policy_loss.cpu().data[0]

    def initialize(self):
        """Copy the online networks into their targets via ``hard_update``."""
        hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

    def update_target_net(self):
        """Apply ``soft_update`` with ``self.tau`` to both target networks."""
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def reset(self):
        """Reset the action noise source, if one is configured."""
        if self.action_noise is not None:
            self.action_noise.reset()

    def cuda(self):
        """Call ``.cuda()`` on all four networks."""
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()