This article collects typical usage examples of the Python method baselines.common.tf_util.save_variables. If you are unsure what tf_util.save_variables does or how to call it, the curated code examples below should help. You can also explore further usage examples from the module it belongs to, baselines.common.tf_util.
Four code examples of tf_util.save_variables are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
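Before the examples, here is a minimal round-trip sketch of how save_variables and its counterpart load_variables are typically called directly. This is an illustration only: it assumes the usual baselines signatures save_variables(save_path, variables=None, sess=None) and load_variables(load_path, variables=None, sess=None), and the checkpoint path /tmp/demo_vars is made up for the demo.

import tensorflow as tf
from baselines.common.tf_util import save_variables, load_variables

with tf.Session() as sess:
    w = tf.get_variable("w", shape=[2, 2], initializer=tf.ones_initializer())
    sess.run(tf.global_variables_initializer())

    # Persist the current values of the listed variables to disk.
    save_variables("/tmp/demo_vars", variables=[w], sess=sess)

    # Overwrite the variable, then restore the saved values.
    sess.run(w.assign(tf.zeros([2, 2])))
    load_variables("/tmp/demo_vars", variables=[w], sess=sess)
    print(sess.run(w))  # back to all ones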
Example 1: initialize
# Required import: from baselines.common import tf_util [as alias]
# Or alternatively: from baselines.common.tf_util import save_variables [as alias]
def initialize(self, sess):
    self.sess = sess
    self.sess.run(tf.global_variables_initializer())
    self.save = functools.partial(save_variables, sess=self.sess)
    self.load = functools.partial(load_variables, sess=self.sess)
    self.actor_optimizer.sync()
    self.critic_optimizer.sync()
    self.sess.run(self.target_init_updates)
Example 2: learn
# Required import: from baselines.common import tf_util [as alias]
# Or alternatively: from baselines.common.tf_util import save_variables [as alias]
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
          adam_epsilon=1e-5, optim_stepsize=3e-4,
          ckpt_dir=None, log_dir=None, task_name=None,
          verbose=False):
    val_per_iter = int(max_iters/10)
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    # placeholder
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(ac-pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic], [loss]+[U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
            logger.log("Training loss: {}, Validation loss: {}".format(train_loss, val_loss))

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        savedir_fname = osp.join(ckpt_dir, task_name)
    U.save_variables(savedir_fname, variables=pi.get_variables())
    return savedir_fname
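As a hedged follow-up to Example 2: the checkpoint written by U.save_variables can later be restored into a policy constructed the same way. The sketch below reuses policy_func, env, and savedir_fname from the example above, and assumes tf_util also exposes the matching load_variables(load_path, variables=None, sess=None) helper.

pi = policy_func("pi", env.observation_space, env.action_space)  # same architecture as before
U.initialize()
U.load_variables(savedir_fname, variables=pi.get_variables())  # restore the pretrained weights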
Example 3: save
# Required import: from baselines.common import tf_util [as alias]
# Or alternatively: from baselines.common.tf_util import save_variables [as alias]
def save(self, save_path):
    tf_util.save_variables(save_path)
Example 4: __init__
# Required import: from baselines.common import tf_util [as alias]
# Or alternatively: from baselines.common.tf_util import save_variables [as alias]
def __init__(self, policy, env, nsteps,
             ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
             alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):
    sess = tf_util.get_session()
    nenvs = env.num_envs
    nbatch = nenvs*nsteps

    with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
        step_model = policy(nenvs, 1, sess)
        train_model = policy(nbatch, nsteps, sess)

    A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    LR = tf.placeholder(tf.float32, [])

    neglogpac = train_model.pd.neglogp(A)
    entropy = tf.reduce_mean(train_model.pd.entropy())
    pg_loss = tf.reduce_mean(ADV * neglogpac)
    vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)
    loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef

    params = find_trainable_variables("a2c_model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, values):
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = lr.value()

        td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, _train],
            td_map
        )
        return policy_loss, value_loss, policy_entropy

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = functools.partial(tf_util.save_variables, sess=sess)
    self.load = functools.partial(tf_util.load_variables, sess=sess)
    tf.global_variables_initializer().run(session=sess)