This article collects typical usage examples of the Python method baselines.ppo1.mlp_policy.MlpPolicy. If you are unsure what mlp_policy.MlpPolicy does, how to call it, or what working code looks like, the curated examples below should help. You can also explore the containing module, baselines.ppo1.mlp_policy, for further context.
The following presents 12 code examples of mlp_policy.MlpPolicy, sorted by popularity by default.
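Before the examples, here is a minimal sketch of the pattern they all share: MlpPolicy is never instantiated directly; instead it is wrapped in a policy_fn factory that the learner calls with a variable-scope name and the environment's observation and action spaces. The environment id and timestep budget below are illustrative placeholders, not taken from any specific example.

import baselines.common.tf_util as U
from baselines.ppo1 import mlp_policy, pposgd_simple
from baselines.common.cmd_util import make_mujoco_env

def policy_fn(name, ob_space, ac_space):
    # the learner calls this factory with a scope name and the env's spaces
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=64, num_hid_layers=2)

U.make_session(num_cpu=1).__enter__()
env = make_mujoco_env('Hopper-v2', seed=0)  # 'Hopper-v2' is illustrative
pposgd_simple.learn(env, policy_fn,
                    max_timesteps=100000, timesteps_per_actorbatch=2048,
                    clip_param=0.2, entcoeff=0.0,
                    optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                    gamma=0.99, lam=0.95, schedule='linear')
env.close()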
Example 1: train
# Required import: from baselines.ppo1 import mlp_policy [as alias]
# or: from baselines.ppo1.mlp_policy import MlpPolicy [as alias]
def train(env_id, num_timesteps, seed):
    # module-level imports of MPI (mpi4py), logger, make_mujoco_env,
    # trpo_mpi and MlpPolicy are assumed by this excerpt
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        # suppress logging on non-root workers
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * rank
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)
    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01,
                   cg_iters=10, cg_damping=0.1, max_timesteps=num_timesteps,
                   gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    env.close()
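Functions like this are usually wrapped in a script entry point and launched under MPI so that each worker derives its own seed. A hypothetical invocation (the env id, budget, and worker count are assumptions, not part of the example):

if __name__ == '__main__':
    # e.g. launched as: mpirun -np 4 python run_trpo.py
    train('HalfCheetah-v2', num_timesteps=1000000, seed=0)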
Example 2: train
# Required import: from baselines.ppo1 import mlp_policy [as alias]
# or: from baselines.ppo1.mlp_policy import MlpPolicy [as alias]
def train(env_id, num_timesteps, seed):
    # module-level imports of MPI (mpi4py), mujoco_py, set_global_seeds
    # and make_robotics_env are assumed by this excerpt
    from baselines.ppo1 import mlp_policy, pposgd_simple
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    mujoco_py.ignore_mujoco_warnings().__enter__()
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = make_robotics_env(env_id, workerseed, rank=rank)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=256, num_hid_layers=3)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
Example 3: train
# Required import: from baselines.ppo1 import mlp_policy [as alias]
# or: from baselines.ppo1.mlp_policy import MlpPolicy [as alias]
def train(env_id, num_timesteps, seed):
    # module-level imports of MPI (mpi4py), gym, logging, bench, logger,
    # osp (os.path), set_global_seeds, trpo_mpi and MlpPolicy are assumed
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        # use the spaces passed in rather than closing over env
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01,
                   cg_iters=10, cg_damping=0.1, max_timesteps=num_timesteps,
                   gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    env.close()
Example 4: train
# Required import: from baselines.ppo1 import mlp_policy [as alias]
# or: from baselines.ppo1.mlp_policy import MlpPolicy [as alias]
def train(env_id, num_timesteps, seed):
    # module-level imports of gym, logging, bench, logger, set_global_seeds
    # and U (baselines.common.tf_util) are assumed
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir())
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
Example 5: train
# Required import: from baselines.ppo1 import mlp_policy [as alias]
# or: from baselines.ppo1.mlp_policy import MlpPolicy [as alias]
def train(env_id, num_timesteps, seed):
    # module-level imports of U (baselines.common.tf_util) and
    # make_mujoco_env are assumed
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)
    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
Example 6: train
# Required import: from baselines.ppo1 import mlp_policy [as alias]
# or: from baselines.ppo1.mlp_policy import MlpPolicy [as alias]
def train(num_timesteps, seed, model_path=None):
    env_id = 'Humanoid-v2'
    # module-level imports of U (baselines.common.tf_util), make_mujoco_env
    # and RewScale are assumed
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)
    env = make_mujoco_env(env_id, seed)
    # the parameters below were the best found in a simple random search;
    # they are good enough to make the humanoid walk, but may not be optimal
    env = RewScale(env, 0.1)
    pi = pposgd_simple.learn(env, policy_fn,
                             max_timesteps=num_timesteps,
                             timesteps_per_actorbatch=2048,
                             clip_param=0.2, entcoeff=0.0,
                             optim_epochs=10,
                             optim_stepsize=3e-4,
                             optim_batchsize=64,
                             gamma=0.99,
                             lam=0.95,
                             schedule='linear',
                             )
    env.close()
    if model_path:
        U.save_state(model_path)
    return pi
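RewScale is used above but its definition is not part of the excerpt. A minimal reward-scaling wrapper consistent with the RewScale(env, 0.1) call might look like the sketch below, assuming a standard gym.RewardWrapper:

import gym

class RewScale(gym.RewardWrapper):
    # scales every reward by a constant factor
    def __init__(self, env, scale):
        gym.RewardWrapper.__init__(self, env)
        self.scale = scale
    def reward(self, r):
        return r * self.scale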
Example 7: train
# Required import: from baselines.ppo1 import mlp_policy [as alias]
# or: from baselines.ppo1.mlp_policy import MlpPolicy [as alias]
def train(num_timesteps, seed, model_path=None):
    env_id = 'Humanoid-v2'
    # module-level imports of U (baselines.common.tf_util), logger,
    # make_mujoco_env and RewScale are assumed
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)
    env = make_mujoco_env(env_id, seed)
    # the parameters below were the best found in a simple random search;
    # they are good enough to make the humanoid walk, but may not be optimal
    env = RewScale(env, 0.1)
    logger.log("NOTE: reward will be scaled by a factor of 10 in logged stats. Check the monitor for unscaled reward.")
    pi = pposgd_simple.learn(env, policy_fn,
                             max_timesteps=num_timesteps,
                             timesteps_per_actorbatch=2048,
                             clip_param=0.1, entcoeff=0.0,
                             optim_epochs=10,
                             optim_stepsize=1e-4,
                             optim_batchsize=64,
                             gamma=0.99,
                             lam=0.95,
                             schedule='constant',
                             )
    env.close()
    if model_path:
        U.save_state(model_path)
    return pi
Example 8: train
# Required import: from baselines.ppo1 import mlp_policy [as alias]
# or: from baselines.ppo1.mlp_policy import MlpPolicy [as alias]
def train(env_id, num_timesteps, seed):
    # module-level imports of os, sys, datetime, MPI (mpi4py), logger,
    # energyplus_logbase_dir, make_energyplus_env, trpo_mpi and MlpPolicy
    # are assumed
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)
    # create a new base directory like /tmp/openai-2018-05-21-12-27-22-552435
    log_dir = os.path.join(energyplus_logbase_dir(),
                           datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
    if not os.path.exists(log_dir + '/output'):
        os.makedirs(log_dir + '/output')
    os.environ["ENERGYPLUS_LOG"] = log_dir
    model = os.getenv('ENERGYPLUS_MODEL')
    if model is None:
        print('Environment variable ENERGYPLUS_MODEL is not defined')
        sys.exit(1)  # os.exit() does not exist; sys.exit() is the correct call
    weather = os.getenv('ENERGYPLUS_WEATHER')
    if weather is None:
        print('Environment variable ENERGYPLUS_WEATHER is not defined')
        sys.exit(1)
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        print('train: init logger with dir={}'.format(log_dir))
        logger.configure(log_dir)
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    env = make_energyplus_env(env_id, workerseed)
    trpo_mpi.learn(env, policy_fn,
                   max_timesteps=num_timesteps,
                   timesteps_per_batch=16*1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
                   gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    env.close()
Example 9: main
# Required import: from baselines.ppo1 import mlp_policy [as alias]
# or: from baselines.ppo1.mlp_policy import MlpPolicy [as alias]
def main():
    # module-level imports of numpy as np, time, tf_set_seeds, Create2DockerEnv,
    # NormalizedEnv, plot_create2_docker, create_callback, U (tf_util),
    # MlpPolicy, learn, and multiprocessing's Process/Value/Manager are assumed
    # use a fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))
    # create the Create2 docker environment
    env = Create2DockerEnv(30, port='/dev/ttyUSB0', ir_window=20, ir_history=1,
                           obs_history=1, dt=0.045, random_state=rand_state)
    env = NormalizedEnv(env)
    # start environment processes
    env.start()
    # create the baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)
    # create and start the plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({"write_lock": False,
                                     "episodic_returns": [],
                                     "episodic_lengths": [], })
    pp = Process(target=plot_create2_docker, args=(env, 2048, shared_returns, plot_running))
    pp.start()
    # create a callback for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)
    # train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback
          )
    # safely terminate the plotting process
    plot_running.value = 0  # signal the plotting process to shut down
    time.sleep(2)
    pp.join()
    env.close()
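create_callback is also not shown in these excerpts. A sketch of what it might do, assuming that baselines' learn() invokes the callback with its local variables and that episode statistics live in the trajectory segment seg under ep_rets/ep_lens (both assumptions about the unshown helper):

def create_callback(shared_returns):
    def kindred_callback(locals_, globals_):
        seg = locals_.get('seg')
        if seg is None:  # first call happens before any rollout exists
            return
        # read-modify-write, since Manager dict values are returned by copy
        if not shared_returns['write_lock']:
            shared_returns['write_lock'] = True
            shared_returns['episodic_returns'] = shared_returns['episodic_returns'] + seg['ep_rets']
            shared_returns['episodic_lengths'] = shared_returns['episodic_lengths'] + seg['ep_lens']
            shared_returns['write_lock'] = False
    return kindred_callback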
Example 10: main
# Required import: from baselines.ppo1 import mlp_policy [as alias]
# or: from baselines.ppo1.mlp_policy import MlpPolicy [as alias]
def main():
    # module-level imports mirror Example 9: numpy as np, time, tf_set_seeds,
    # Create2MoverEnv, NormalizedEnv, plot_create2_mover, create_callback,
    # U (tf_util), MlpPolicy, learn, and Process/Value/Manager are assumed
    # use a fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))
    # create the Create2 mover environment
    env = Create2MoverEnv(90, port='/dev/ttyUSB0', obs_history=1, dt=0.15, random_state=rand_state)
    env = NormalizedEnv(env)
    # start environment processes
    env.start()
    # create the baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)
    # create and start the plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({"write_lock": False,
                                     "episodic_returns": [],
                                     "episodic_lengths": [], })
    pp = Process(target=plot_create2_mover, args=(env, 2048, shared_returns, plot_running))
    pp.start()
    # create a callback for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)
    # train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback
          )
    # safely terminate the plotting process
    plot_running.value = 0  # signal the plotting process to shut down
    time.sleep(2)
    pp.join()
    env.close()
Example 11: main
# Required import: from baselines.ppo1 import mlp_policy [as alias]
# or: from baselines.ppo1.mlp_policy import MlpPolicy [as alias]
def main():
    # module-level imports of numpy as np, time, tf_set_seeds,
    # DoubleInvertedPendulumEnv, plot_returns, create_callback, U (tf_util),
    # MlpPolicy, learn (ppo), and Process/Value/Manager are assumed
    # use a fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))
    # create an asynchronous simulation of the InvertedDoublePendulum-v2 mujoco environment
    env = DoubleInvertedPendulumEnv(agent_dt=0.005,
                                    sensor_dt=[0.01, 0.0033333],
                                    is_render=False,
                                    random_state=rand_state
                                    )
    # start environment processes
    env.start()
    # create the baselines PPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=64, num_hid_layers=2)
    # create and start the plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({"write_lock": False,
                                     "episodic_returns": [],
                                     "episodic_lengths": [], })
    pp = Process(target=plot_returns, args=(env, 2048, shared_returns, plot_running))
    pp.start()
    # create a callback for logging data from baselines PPO learn
    kindred_callback = create_callback(shared_returns)
    # train baselines PPO
    learn(env,
          policy_fn,
          max_timesteps=1e6,
          timesteps_per_actorbatch=2048,
          clip_param=0.2,
          entcoeff=0.0,
          optim_epochs=10,
          optim_stepsize=0.0001,
          optim_batchsize=64,
          gamma=0.995,
          lam=0.995,
          schedule="linear",
          callback=kindred_callback,
          )
    # safely terminate the plotting process
    plot_running.value = 0  # signal the plotting process to shut down
    time.sleep(2)
    pp.join()
    # shut down the environment
    env.close()
Example 12: train
# Required import: from baselines.ppo1 import mlp_policy [as alias]
# or: from baselines.ppo1.mlp_policy import MlpPolicy [as alias]
def train(env, num_timesteps, seed, ckpt_dir=None,
          render=False, ckpt_freq=0, restore_dir=None, optim_stepsize=3e-4,
          schedule="linear", gamma=0.99, optim_epochs=10, optim_batchsize=64,
          horizon=2048):
    from baselines.common.fc_learning_utils import FlightLog
    from mpi4py import MPI
    from baselines import logger
    from baselines.ppo1.mlp_policy import MlpPolicy
    from baselines.common import set_global_seeds
    from baselines.ppo1 import pposgd_simple
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 1000000 * rank
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)
    if render:
        env.render()
    env.seed(workerseed)
    set_global_seeds(workerseed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=horizon,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=optim_epochs,
                        optim_stepsize=optim_stepsize,
                        optim_batchsize=optim_batchsize,
                        # pass the gamma argument through instead of hardcoding 0.99
                        gamma=gamma, lam=0.95, schedule=schedule,
                        flight_log=None,
                        ckpt_dir=ckpt_dir,
                        restore_dir=restore_dir,
                        save_timestep_period=ckpt_freq
                        )
    env.close()
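A hypothetical invocation of this parameterized trainer (the gym id and checkpoint settings below are illustrative assumptions; note the function closes the environment itself):

import gym
env = gym.make('Pendulum-v0')
train(env, num_timesteps=500000, seed=0,
      ckpt_dir='/tmp/ppo_ckpts', ckpt_freq=100)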