This page collects typical usage examples of the Python method baselines.logger.dump_tabular. If you are unsure exactly what logger.dump_tabular does, how to call it, or what it looks like in real code, the curated examples below should help. You can also browse further usage examples from its parent module, baselines.logger.
The following presents 13 code examples of logger.dump_tabular, sorted by popularity by default. Upvoting the examples you find useful helps surface better Python samples.
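Before the individual examples, the following minimal, self-contained sketch shows the pattern every one of them follows: record key/value pairs for the current iteration with logger.record_tabular, then flush them as a single table row with logger.dump_tabular. The output directory, format list, and key names here are illustrative only, and the snippet assumes the standard baselines logger.configure signature.

import numpy as np
from baselines import logger

# Send output to a directory; 'stdout' and 'csv' are standard baselines output
# formats (the CSV writer appends rows to progress.csv in that directory).
logger.configure(dir='/tmp/dump_tabular_demo', format_strs=['stdout', 'csv'])

for update in range(1, 4):
    rewards = np.random.randn(8)  # stand-in for per-environment returns
    logger.record_tabular('nupdates', update)
    logger.record_tabular('mean_reward', float(np.mean(rewards)))
    logger.dump_tabular()  # writes one row containing every recorded key, then clears them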
Example 1: main
# Required import: from baselines import logger [as alias]
# Or: from baselines.logger import dump_tabular [as alias]
def main(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'compute_Q': True,
        'rollout_batch_size': 1,
        'render': bool(render),
    }
    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    for _ in range(n_test_rollouts):
        evaluator.generate_rollouts()

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()
Example 2: call
# Required import: from baselines import logger [as alias]
# Or: from baselines.logger import dump_tabular [as alias]
def call(self, on_policy):
    runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
    if on_policy:
        enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
        self.episode_stats.feed(rewards, dones)
        if buffer is not None:
            buffer.put(enc_obs, actions, rewards, mus, dones, masks)
    else:
        # get obs, actions, rewards, mus, dones from buffer.
        obs, actions, rewards, mus, dones, masks = buffer.get()

    # reshape stuff correctly
    obs = obs.reshape(runner.batch_ob_shape)
    actions = actions.reshape([runner.nbatch])
    rewards = rewards.reshape([runner.nbatch])
    mus = mus.reshape([runner.nbatch, runner.nact])
    dones = dones.reshape([runner.nbatch])
    masks = masks.reshape([runner.batch_ob_shape[0]])

    names_ops, values_ops = model.train(obs, actions, rewards, dones, mus, model.initial_state, masks, steps)

    if on_policy and (int(steps/runner.nbatch) % self.log_interval == 0):
        logger.record_tabular("total_timesteps", steps)
        logger.record_tabular("fps", int(steps/(time.time() - self.tstart)))
        # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
        # Thus, this is mean until end of life, not end of episode.
        # For true episode rewards, see the monitor files in the log folder.
        logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
        logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
        for name, val in zip(names_ops, values_ops):
            logger.record_tabular(name, float(val))
        logger.dump_tabular()
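The comment in the example above points to the Monitor files for true per-episode rewards. Assuming the environments were wrapped with baselines.bench.Monitor, a short sketch like this can read them back for analysis; the log directory is hypothetical and load_results requires pandas.

from baselines.bench.monitor import load_results

# Collect every *.monitor.csv written under the (hypothetical) log directory
# into one pandas DataFrame with per-episode reward 'r' and length 'l'.
df = load_results('/tmp/acer_logs')
print(df[['r', 'l']].tail())
print('mean reward over last 100 episodes:', df['r'].tail(100).mean())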
Example 3: learn
# Required import: from baselines import logger [as alias]
# Or: from baselines.logger import dump_tabular [as alias]
def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)

    nbatch = nenvs*nsteps
    tstart = time.time()
    for update in range(1, total_timesteps//nbatch+1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        nseconds = time.time()-tstart
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()
    env.close()
Example 4: main
# Required import: from baselines import logger [as alias]
# Or: from baselines.logger import dump_tabular [as alias]
def main(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'compute_Q': True,
        'rollout_batch_size': 1,
        'render': bool(render),
    }
    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    for _ in range(n_test_rollouts):
        evaluator.generate_rollouts()

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()
Example 5: call
# Required import: from baselines import logger [as alias]
# Or: from baselines.logger import dump_tabular [as alias]
def call(self, on_policy):
    runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
    if on_policy:
        enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
        self.episode_stats.feed(rewards, dones)
        if buffer is not None:
            buffer.put(enc_obs, actions, rewards, mus, dones, masks)
    else:
        # get obs, actions, rewards, mus, dones from buffer.
        obs, actions, rewards, mus, dones, masks = buffer.get()

    # reshape stuff correctly
    obs = obs.reshape(runner.batch_ob_shape)
    actions = actions.reshape([runner.nbatch])
    rewards = rewards.reshape([runner.nbatch])
    mus = mus.reshape([runner.nbatch, runner.nact])
    dones = dones.reshape([runner.nbatch])
    masks = masks.reshape([runner.batch_ob_shape[0]])

    names_ops, values_ops = model.train(obs, actions, rewards, dones, mus, model.initial_state, masks, steps)

    if on_policy and (int(steps/runner.nbatch) % self.log_interval == 0):
        logger.record_tabular("total_timesteps", steps)
        logger.record_tabular("fps", int(steps/(time.time() - self.tstart)))
        # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
        # Thus, this is mean until end of life, not end of episode.
        # For true episode rewards, see the monitor files in the log folder.
        logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
        logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
        for name, val in zip(names_ops, values_ops):
            logger.record_tabular(name, float(val))
        logger.dump_tabular()
Example 6: log
# Required import: from baselines import logger [as alias]
# Or: from baselines.logger import dump_tabular [as alias]
def log(self):
    if self.t > 0 and self.print_freq is not None and len(self.episode_rewards) % self.print_freq == 0:
        mean_100ep_reward = np.mean(self.episode_rewards[-100:])
        num_episodes = len(self.episode_rewards)
        logger.record_tabular('steps', self.t)
        logger.record_tabular('episodes', num_episodes)
        logger.record_tabular('mean 100 episode reward', '{:.3f}'.format(mean_100ep_reward))
        logger.record_tabular('exploration (target)', '{:.3f} %'.format(100 * self.exploration_schedule.value(self.t)))
        logger.record_tabular('exploration (current)', '{:.3f} %'.format(100 * (1.0 - self.greedy_freq)))
        logger.dump_tabular()
Example 7: learn
# Required import: from baselines import logger [as alias]
# Or: from baselines.logger import dump_tabular [as alias]
def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)

    nbatch = nenvs*nsteps
    tstart = time.time()
    for update in range(1, total_timesteps//nbatch+1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        nseconds = time.time()-tstart
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()
    env.close()
    return model
Example 8: learn
# Required import: from baselines import logger [as alias]
# Or: from baselines.logger import dump_tabular [as alias]
def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100, sil_update=4, sil_beta=0.0):
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule, sil_update=sil_update, sil_beta=sil_beta)
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    episode_stats = EpisodeStats(nsteps, nenvs)

    nbatch = nenvs*nsteps
    tstart = time.time()
    for update in range(1, total_timesteps//nbatch+1):
        obs, states, rewards, masks, actions, values, raw_rewards = runner.run()
        episode_stats.feed(raw_rewards, masks)
        policy_loss, value_loss, policy_entropy, v_avg = model.train(obs, states, rewards, masks, actions, values)
        sil_loss, sil_adv, sil_samples, sil_nlogp = model.sil_train()
        nseconds = time.time()-tstart
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("episode_reward", episode_stats.mean_reward())
            logger.record_tabular("best_episode_reward", float(model.sil.get_best_reward()))
            if sil_update > 0:
                logger.record_tabular("sil_num_episodes", float(model.sil.num_episodes()))
                logger.record_tabular("sil_valid_samples", float(sil_samples))
                logger.record_tabular("sil_steps", float(model.sil.num_steps()))
            logger.dump_tabular()
    env.close()
    return model
Example 9: learn
# Required import: from baselines import logger [as alias]
# Or: from baselines.logger import dump_tabular [as alias]
def learn(policy, env, seed, nsteps=5, nstack=4, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_procs = len(env.remotes)  # HACK
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
    runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)

    nbatch = nenvs*nsteps
    tstart = time.time()
    for update in range(1, total_timesteps//nbatch+1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        nseconds = time.time()-tstart
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()
    env.close()
Example 10: learn
# Required import: from baselines import logger [as alias]
# Or: from baselines.logger import dump_tabular [as alias]
def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), v_mix_coef=0.5, ent_coef=0.01, max_grad_norm=0.5,
          lr_alpha=7e-4, lr_beta=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100,
          v_ex_coef=1.0, r_ex_coef=0.0, r_in_coef=1.0):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef,
                  v_ex_coef=v_ex_coef, max_grad_norm=max_grad_norm, lr_alpha=lr_alpha, lr_beta=lr_beta,
                  alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule,
                  v_mix_coef=v_mix_coef, r_ex_coef=r_ex_coef, r_in_coef=r_in_coef)
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma, r_ex_coef=r_ex_coef, r_in_coef=r_in_coef)

    nbatch = nenvs*nsteps
    tstart = time.time()
    epinfobuf = deque(maxlen=100)
    eprexbuf = deque(maxlen=100)
    eprinbuf = deque(maxlen=100)
    eplenbuf = deque(maxlen=100)
    for update in range(1, total_timesteps//nbatch+1):
        obs, ac, policy_states, r_in, r_ex, ret_ex, ret_mix, \
            v_ex, v_mix, last_v_ex, last_v_mix, masks, dones, \
            epinfo, ep_r_ex, ep_r_in, ep_len = runner.run()

        dis_v_mix_last = np.zeros([nbatch], np.float32)
        coef_mat = np.zeros([nbatch, nbatch], np.float32)
        for i in range(nbatch):
            dis_v_mix_last[i] = gamma ** (nsteps - i % nsteps) * last_v_mix[i // nsteps]
            coef = 1.0
            for j in range(i, nbatch):
                if j > i and j % nsteps == 0:
                    break
                coef_mat[i][j] = coef
                coef *= gamma
                if dones[j]:
                    dis_v_mix_last[i] = 0
                    break
        entropy = model.train(obs, policy_states[0], masks, ac, r_ex, ret_ex, v_ex, v_mix, dis_v_mix_last, coef_mat)
        nseconds = time.time()-tstart
        fps = int((update*nbatch)/nseconds)
        epinfobuf.extend(epinfo)
        eprexbuf.extend(ep_r_ex)
        eprinbuf.extend(ep_r_in)
        eplenbuf.extend(ep_len)
        if update % log_interval == 0 or update == 1:
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("entropy", float(entropy))
            v_ex_ev = explained_variance(v_ex, ret_ex)
            logger.record_tabular("v_ex_ev", float(v_ex_ev))
            v_mix_ev = explained_variance(v_mix, ret_mix)
            logger.record_tabular("v_mix_ev", float(v_mix_ev))
            logger.record_tabular("gamescoremean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("gamelenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    env.close()
Example 11: learn
# Required import: from baselines import logger [as alias]
# Or: from baselines.logger import dump_tabular [as alias]
def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
          ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
          kfac_clip=0.001, save_interval=None, lrschedule='linear'):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps=nsteps,
                               ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef, lr=lr,
                               max_grad_norm=max_grad_norm, kfac_clip=kfac_clip, lrschedule=lrschedule)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    nbatch = nenvs*nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    for update in range(1, total_timesteps//nbatch+1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time()-tstart
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
    coord.request_stop()
    coord.join(enqueue_threads)
    env.close()
Example 12: train
# Required import: from baselines import logger [as alias]
# Or: from baselines.logger import dump_tabular [as alias]
def train(policy, rollout_worker, evaluator,
          n_epochs, n_test_rollouts, n_cycles, n_batches, policy_save_interval,
          save_policies, **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()

    latest_policy_path = os.path.join(logger.get_dir(), 'policy_latest.pkl')
    best_policy_path = os.path.join(logger.get_dir(), 'policy_best.pkl')
    periodic_policy_path = os.path.join(logger.get_dir(), 'policy_{}.pkl')

    logger.info("Training...")
    best_success_rate = -1
    for epoch in range(n_epochs):
        # train
        rollout_worker.clear_history()
        for _ in range(n_cycles):
            episode = rollout_worker.generate_rollouts()
            policy.store_episode(episode)
            for _ in range(n_batches):
                policy.train()
            policy.update_target_net()

        # test
        evaluator.clear_history()
        for _ in range(n_test_rollouts):
            evaluator.generate_rollouts()

        # record logs
        logger.record_tabular('epoch', epoch)
        for key, val in evaluator.logs('test'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in rollout_worker.logs('train'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in policy.logs():
            logger.record_tabular(key, mpi_average(val))

        if rank == 0:
            logger.dump_tabular()

        # save the policy if it's better than the previous ones
        success_rate = mpi_average(evaluator.current_success_rate())
        if rank == 0 and success_rate >= best_success_rate and save_policies:
            best_success_rate = success_rate
            logger.info('New best success rate: {}. Saving policy to {} ...'.format(best_success_rate, best_policy_path))
            evaluator.save_policy(best_policy_path)
            evaluator.save_policy(latest_policy_path)
        if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_policies:
            policy_path = periodic_policy_path.format(epoch)
            logger.info('Saving periodic policy to {} ...'.format(policy_path))
            evaluator.save_policy(policy_path)

        # make sure that different threads have different seeds
        local_uniform = np.random.uniform(size=(1,))
        root_uniform = local_uniform.copy()
        MPI.COMM_WORLD.Bcast(root_uniform, root=0)
        if rank != 0:
            assert local_uniform[0] != root_uniform[0]
Example 13: learn
# Required import: from baselines import logger [as alias]
# Or: from baselines.logger import dump_tabular [as alias]
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
          ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
          kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, **network_kwargs):
    set_global_seeds(seed)

    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps=nsteps,
                               ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef, lr=lr,
                               max_grad_norm=max_grad_norm, kfac_clip=kfac_clip, lrschedule=lrschedule)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    nbatch = nenvs*nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    for update in range(1, total_timesteps//nbatch+1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time()-tstart
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
    coord.request_stop()
    coord.join(enqueue_threads)
    env.close()
    return model
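All of the examples above emit their rows through whatever output formats the logger was configured with. When a CSV format is among them, every dump_tabular call appends one row to progress.csv inside logger.get_dir(), which makes post-hoc analysis simple. The path below is hypothetical, and the snippet assumes the run was configured with a 'csv' output format.

import pandas as pd

# One row per dump_tabular() call; columns are whatever keys were recorded.
progress = pd.read_csv('/tmp/acktr_logs/progress.csv')
print(progress.columns.tolist())  # e.g. nupdates, total_timesteps, fps, ...
print(progress[['total_timesteps', 'explained_variance']].tail())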