本文整理匯總了Python中baselines.ppo2.ppo2.learn方法的典型用法代碼示例。如果您正苦於以下問題:Python ppo2.learn方法的具體用法?Python ppo2.learn怎麽用?Python ppo2.learn使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類baselines.ppo2.ppo2
的用法示例。
在下文中一共展示了ppo2.learn方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: train
# 需要導入模塊: from baselines.ppo2 import ppo2 [as 別名]
# 或者: from baselines.ppo2.ppo2 import learn [as 別名]
def train(env_id, num_timesteps, seed, policy):
ncpu = multiprocessing.cpu_count()
if sys.platform == 'darwin': ncpu //= 2
config = tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=ncpu,
inter_op_parallelism_threads=ncpu)
config.gpu_options.allow_growth = True #pylint: disable=E1101
tf.Session(config=config).__enter__()
env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
policy = {'cnn' : CnnPolicy, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy}[policy]
ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
ent_coef=.01,
lr=lambda f : f * 2.5e-4,
cliprange=lambda f : f * 0.1,
total_timesteps=int(num_timesteps * 1.1))
示例2: test_microbatches
# 需要導入模塊: from baselines.ppo2 import ppo2 [as 別名]
# 或者: from baselines.ppo2.ppo2 import learn [as 別名]
def test_microbatches():
def env_fn():
env = gym.make('CartPole-v0')
env.seed(0)
return env
learn_fn = partial(learn, network='mlp', nsteps=32, total_timesteps=32, seed=0)
env_ref = DummyVecEnv([env_fn])
sess_ref = make_session(make_default=True, graph=tf.Graph())
learn_fn(env=env_ref)
vars_ref = {v.name: sess_ref.run(v) for v in tf.trainable_variables()}
env_test = DummyVecEnv([env_fn])
sess_test = make_session(make_default=True, graph=tf.Graph())
learn_fn(env=env_test, model_fn=partial(MicrobatchedModel, microbatch_size=2))
vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()}
for v in vars_ref:
np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=1e-3)
示例3: train
# 需要導入模塊: from baselines.ppo2 import ppo2 [as 別名]
# 或者: from baselines.ppo2.ppo2 import learn [as 別名]
def train(env_id, num_timesteps, seed, policy, hparams):
ncpu = multiprocessing.cpu_count()
#if sys.platform == 'darwin': ncpu //= 2
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=hparams['gpu_fraction'])
config = tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=ncpu,
inter_op_parallelism_threads=ncpu,
gpu_options=gpu_options)
config.gpu_options.allow_growth = False #pylint: disable=E1101
tf.Session(config=config).__enter__()
video_log_dir = os.path.join(hparams['base_dir'], 'videos', hparams['experiment_name'])
env = VecFrameStack(make_atari_env(env_id, 8, seed, video_log_dir=video_log_dir, write_attention_video='attention' in policy, nsteps=128), 4)
policy = {'cnn' : CnnPolicy, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy, 'cnn_attention': CnnAttentionPolicy}[policy]
ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
ent_coef=.01,
lr=lambda f : f * 2.5e-4,
cliprange=lambda f : f * 0.1,
total_timesteps=int(num_timesteps * 1.1),
hparams=hparams)
示例4: main
# 需要導入模塊: from baselines.ppo2 import ppo2 [as 別名]
# 或者: from baselines.ppo2.ppo2 import learn [as 別名]
def main():
"""Run PPO until the environment throws an exception."""
config = tf.ConfigProto()
config.gpu_options.allow_growth = True # pylint: disable=E1101
with tf.Session(config=config):
# Take more timesteps than we need to be sure that
# we stop due to an exception.
ppo2.learn(policy=policies.CnnPolicy,
env=DummyVecEnv([make_env]),
nsteps=4096,
nminibatches=8,
lam=0.95,
gamma=0.99,
noptepochs=3,
log_interval=1,
ent_coef=0.01,
lr=lambda _: 2e-4,
cliprange=lambda _: 0.1,
total_timesteps=int(1e7),
load_path='./pretrain_model') # Set to None if no pretrained model
示例5: main
# 需要導入模塊: from baselines.ppo2 import ppo2 [as 別名]
# 或者: from baselines.ppo2.ppo2 import learn [as 別名]
def main():
"""Run PPO until the environment throws an exception."""
config = tf.ConfigProto()
config.gpu_options.allow_growth = True # pylint: disable=E1101
env_fns, env_names = create_envs()
with tf.Session(config=config):
# Take more timesteps than we need to be sure that
# we stop due to an exception.
ppo2.learn(policy=policies.CnnPolicy,
env=SubprocVecEnv(env_fns),
nsteps=4096,
nminibatches=8,
lam=0.95,
gamma=0.99,
noptepochs=3,
log_interval=1,
ent_coef=0.01,
lr=lambda _: 2e-4,
cliprange=lambda _: 0.1,
total_timesteps=int(1e9),
save_interval=10,
save_path='./checkpoints_joint_ppo2',
load_path=None)
示例6: train
# 需要導入模塊: from baselines.ppo2 import ppo2 [as 別名]
# 或者: from baselines.ppo2.ppo2 import learn [as 別名]
def train(env_id, num_timesteps, seed, policy):
ncpu = multiprocessing.cpu_count()
if sys.platform == 'darwin': ncpu //= 2
config = tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=ncpu,
inter_op_parallelism_threads=ncpu)
config.gpu_options.allow_growth = True #pylint: disable=E1101
tf.Session(config=config).__enter__()
env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
policy = {'cnn' : CnnPolicy, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy, 'mlp': MlpPolicy}[policy]
ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
ent_coef=.01,
lr=lambda f : f * 2.5e-4,
cliprange=lambda f : f * 0.1,
total_timesteps=int(num_timesteps * 1.1))
示例7: run
# 需要導入模塊: from baselines.ppo2 import ppo2 [as 別名]
# 或者: from baselines.ppo2.ppo2 import learn [as 別名]
def run(bsuite_id: str) -> str:
"""Runs a PPO agent on a given bsuite environment, logging to CSV."""
def _load_env():
raw_env = bsuite.load_and_record(
bsuite_id=bsuite_id,
save_path=FLAGS.save_path,
logging_mode=FLAGS.logging_mode,
overwrite=FLAGS.overwrite,
)
if FLAGS.verbose:
raw_env = terminal_logging.wrap_environment(raw_env, log_every=True) # pytype: disable=wrong-arg-types
return gym_wrapper.GymFromDMEnv(raw_env)
env = dummy_vec_env.DummyVecEnv([_load_env])
ppo2.learn(
env=env,
network=FLAGS.network,
lr=FLAGS.learning_rate,
total_timesteps=FLAGS.total_timesteps, # make sure to run enough steps
nsteps=FLAGS.nsteps,
gamma=FLAGS.agent_discount,
)
return bsuite_id
示例8: test_identity
# 需要導入模塊: from baselines.ppo2 import ppo2 [as 別名]
# 或者: from baselines.ppo2.ppo2 import learn [as 別名]
def test_identity(learn_func):
'''
Test if the algorithm (with a given policy)
can learn an identity transformation (i.e. return observation as an action)
'''
np.random.seed(0)
np_random.seed(0)
random.seed(0)
env = DummyVecEnv([lambda: IdentityEnv(10)])
with tf.Graph().as_default(), tf.Session().as_default():
tf.set_random_seed(0)
model = learn_func(env)
N_TRIALS = 1000
sum_rew = 0
obs = env.reset()
for i in range(N_TRIALS):
obs, rew, done, _ = env.step(model.step(obs)[0])
sum_rew += rew
assert sum_rew > 0.9 * N_TRIALS
示例9: test_microbatches
# 需要導入模塊: from baselines.ppo2 import ppo2 [as 別名]
# 或者: from baselines.ppo2.ppo2 import learn [as 別名]
def test_microbatches():
def env_fn():
env = gym.make('CartPole-v0')
env.seed(0)
return env
learn_fn = partial(learn, network='mlp', nsteps=32, total_timesteps=32, seed=0)
env_ref = DummyVecEnv([env_fn])
sess_ref = make_session(make_default=True, graph=tf.Graph())
learn_fn(env=env_ref)
vars_ref = {v.name: sess_ref.run(v) for v in tf.trainable_variables()}
env_test = DummyVecEnv([env_fn])
sess_test = make_session(make_default=True, graph=tf.Graph())
learn_fn(env=env_test, model_fn=partial(MicrobatchedModel, microbatch_size=2))
# learn_fn(env=env_test)
vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()}
for v in vars_ref:
np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=3e-3)
示例10: main
# 需要導入模塊: from baselines.ppo2 import ppo2 [as 別名]
# 或者: from baselines.ppo2.ppo2 import learn [as 別名]
def main():
"""Run PPO until the environment throws an exception."""
config = tf.ConfigProto()
config.gpu_options.allow_growth = True # pylint: disable=E1101
with tf.Session(config=config):
# Take more timesteps than we need to be sure that
# we stop due to an exception.
ppo2.learn(policy=policies.CnnPolicy,
env=DummyVecEnv([make_env]),
nsteps=4096,
nminibatches=8,
lam=0.95,
gamma=0.99,
noptepochs=3,
log_interval=1,
ent_coef=0.01,
lr=lambda _: 2e-4,
cliprange=lambda _: 0.1,
total_timesteps=int(1e7))
示例11: train
# 需要導入模塊: from baselines.ppo2 import ppo2 [as 別名]
# 或者: from baselines.ppo2.ppo2 import learn [as 別名]
def train(env_id, num_timesteps, seed, policy, r_ex_coef, r_in_coef, lr_alpha, lr_beta, reward_freq):
from baselines.common import set_global_seeds
from baselines.common.vec_env.vec_normalize import VecNormalize
from baselines.ppo2 import ppo2
from baselines.ppo2.policies import MlpPolicy, MlpPolicyIntrinsicReward
import gym
import tensorflow as tf
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
ncpu = 1
config = tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=ncpu,
inter_op_parallelism_threads=ncpu)
config.gpu_options.allow_growth = True
tf.Session(config=config).__enter__()
def make_env():
env = gym.make(env_id)
env = bench.Monitor(env, logger.get_dir())
return env
env = DummyVecEnv([make_env])
env = VecNormalize(env)
set_global_seeds(seed)
if policy == 'mlp':
policy = MlpPolicy
elif policy == 'mlp_int':
policy = MlpPolicyIntrinsicReward
else:
raise NotImplementedError
ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
ent_coef=0.0,
lr_alpha=lr_alpha,
cliprange=0.2,
total_timesteps=num_timesteps,
r_ex_coef=r_ex_coef,
r_in_coef=r_in_coef,
lr_beta=lr_beta,
reward_freq=reward_freq)
示例12: train
# 需要導入模塊: from baselines.ppo2 import ppo2 [as 別名]
# 或者: from baselines.ppo2.ppo2 import learn [as 別名]
def train(env_id, num_timesteps, seed):
from baselines.common import set_global_seeds
from baselines.common.vec_env.vec_normalize import VecNormalize
from baselines.ppo2 import ppo2
from baselines.ppo2.policies import MlpPolicy
import gym
import tensorflow as tf
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
ncpu = 1
config = tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=ncpu,
inter_op_parallelism_threads=ncpu)
tf.Session(config=config).__enter__()
def make_env():
env = gym.make(env_id)
env = bench.Monitor(env, logger.get_dir())
return env
env = DummyVecEnv([make_env])
env = VecNormalize(env)
set_global_seeds(seed)
policy = MlpPolicy
ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
ent_coef=0.0,
lr=3e-4,
cliprange=0.2,
total_timesteps=num_timesteps)
示例13: train
# 需要導入模塊: from baselines.ppo2 import ppo2 [as 別名]
# 或者: from baselines.ppo2.ppo2 import learn [as 別名]
def train(env_id, num_timesteps, seed):
from baselines.common import set_global_seeds
from baselines.common.vec_env.vec_normalize import VecNormalize
from baselines.ppo2 import ppo2
from baselines.ppo2.policies import MlpPolicy
import gym
import tensorflow as tf
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
ncpu = 1
config = tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=ncpu,
inter_op_parallelism_threads=ncpu)
tf.Session(config=config).__enter__()
def make_env():
env = gym.make(env_id)
env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
return env
env = DummyVecEnv([make_env])
env = VecNormalize(env)
set_global_seeds(seed)
policy = MlpPolicy
model = ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
ent_coef=0.0,
lr=3e-4,
cliprange=0.2,
total_timesteps=num_timesteps)
return model, env
示例14: train
# 需要導入模塊: from baselines.ppo2 import ppo2 [as 別名]
# 或者: from baselines.ppo2.ppo2 import learn [as 別名]
def train(_):
"""Trains a PPO2 policy."""
vec_env = SubprocVecEnv([
(lambda _i=i: create_single_football_env(_i))
for i in range(FLAGS.num_envs)
], context=None)
# Import tensorflow after we create environments. TF is not fork sake, and
# we could be using TF as part of environment if one of the players is
# controled by an already trained model.
import tensorflow.compat.v1 as tf
ncpu = multiprocessing.cpu_count()
config = tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=ncpu,
inter_op_parallelism_threads=ncpu)
config.gpu_options.allow_growth = True
tf.Session(config=config).__enter__()
ppo2.learn(network=FLAGS.policy,
total_timesteps=FLAGS.num_timesteps,
env=vec_env,
seed=FLAGS.seed,
nsteps=FLAGS.nsteps,
nminibatches=FLAGS.nminibatches,
noptepochs=FLAGS.noptepochs,
max_grad_norm=FLAGS.max_grad_norm,
gamma=FLAGS.gamma,
ent_coef=FLAGS.ent_coef,
lr=FLAGS.lr,
log_interval=1,
save_interval=FLAGS.save_interval,
cliprange=FLAGS.cliprange,
load_path=FLAGS.load_path)
示例15: train
# 需要導入模塊: from baselines.ppo2 import ppo2 [as 別名]
# 或者: from baselines.ppo2.ppo2 import learn [as 別名]
def train(env_id, num_timesteps, seed, policy):
from baselines.common import set_global_seeds
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from baselines.ppo2 import ppo2
from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy
import gym
import logging
import multiprocessing
import os.path as osp
import tensorflow as tf
ncpu = multiprocessing.cpu_count()
if sys.platform == 'darwin': ncpu //= 2
config = tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=ncpu,
inter_op_parallelism_threads=ncpu)
config.gpu_options.allow_growth = True #pylint: disable=E1101
gym.logger.setLevel(logging.WARN)
tf.Session(config=config).__enter__()
def make_env(rank):
def env_fn():
env = make_atari(env_id)
env.seed(seed + rank)
env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
return wrap_deepmind(env)
return env_fn
nenvs = 8
env = SubprocVecEnv([make_env(i) for i in range(nenvs)])
set_global_seeds(seed)
env = VecFrameStack(env, 4)
policy = {'cnn' : CnnPolicy, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy}[policy]
ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4,
lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
ent_coef=.01,
lr=lambda f : f * 2.5e-4,
cliprange=lambda f : f * 0.1,
total_timesteps=int(num_timesteps * 1.1))