本文整理汇总了Python中ray.rllib.evaluation.policy_evaluator.PolicyEvaluator.foreach_policy方法的典型用法代码示例。如果您正苦于以下问题:Python PolicyEvaluator.foreach_policy方法的具体用法?Python PolicyEvaluator.foreach_policy怎么用?Python PolicyEvaluator.foreach_policy使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类ray.rllib.evaluation.policy_evaluator.PolicyEvaluator的用法示例。
在下文中一共展示了PolicyEvaluator.foreach_policy方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _testWithOptimizer
# Required import: from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator [as alias]
# Note: foreach_policy is a method of PolicyEvaluator; call it on an evaluator instance (it cannot be imported on its own).
def _testWithOptimizer(self, optimizer_cls):
    """Train two policies on ``MultiCartpole(3)`` using ``optimizer_cls``.

    Agents are mapped alternately to the policies "p1" and "p2" via
    ``agent_id % 2``.  "p2" is always a DQNPolicyGraph; "p1" is a
    PGPolicyGraph unless ``optimizer_cls`` equals SyncReplayOptimizer, in
    which case it is a DQNPolicyGraph too.  One remote evaluator is created
    only when ``optimizer_cls`` equals AsyncGradientsOptimizer.

    Args:
        optimizer_cls: Optimizer class, instantiated as
            ``optimizer_cls(local_evaluator, remote_evaluators, {})``.

    Returns:
        None, as soon as ``episode_reward_mean`` reaches ``25 * 3``.

    Raises:
        Exception: If that reward is not reached within 200 optimizer steps.
    """
    num_agents = 3
    cartpole = gym.make("CartPole-v0")
    act_space, obs_space = cartpole.action_space, cartpole.observation_space
    dqn_config = {"gamma": 0.95, "n_step": 3}

    # TODO: support replay with non-DQN graphs. Currently this can't
    # happen since the replay buffer doesn't encode extra fields like
    # "advantages" that PG uses.
    if optimizer_cls == SyncReplayOptimizer:
        p1_cls, p1_config = DQNPolicyGraph, dqn_config
    else:
        p1_cls, p1_config = PGPolicyGraph, {}
    policies = {
        "p1": (p1_cls, obs_space, act_space, p1_config),
        "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
    }

    def policy_mapper(agent_id):
        # Even agent ids go to "p1", odd agent ids go to "p2".
        return ["p1", "p2"][agent_id % 2]

    # The local and the (optional) remote evaluator share one configuration.
    evaluator_kwargs = dict(
        env_creator=lambda _: MultiCartpole(num_agents),
        policy_graph=policies,
        policy_mapping_fn=policy_mapper,
        batch_steps=50)
    ev = PolicyEvaluator(**evaluator_kwargs)
    remote_evs = []
    if optimizer_cls == AsyncGradientsOptimizer:
        remote_evs.append(
            PolicyEvaluator.as_remote().remote(**evaluator_kwargs))
    optimizer = optimizer_cls(ev, remote_evs, {})

    def on_dqn_policies(action):
        # Apply `action` to every DQNPolicyGraph of the local evaluator;
        # policies of any other type are left untouched.
        ev.foreach_policy(
            lambda policy, _: action(policy)
            if isinstance(policy, DQNPolicyGraph) else None)

    for step in range(200):
        # Epsilon starts at 1, drops by 0.02 per step, floored at 0.02.
        epsilon = max(0.02, 1 - step * .02)
        on_dqn_policies(lambda policy: policy.set_epsilon(epsilon))
        optimizer.step()
        result = collect_metrics(ev, remote_evs)
        if step % 20 == 0:
            # Every 20th step: refresh DQN targets and report progress.
            on_dqn_policies(lambda policy: policy.update_target())
            print("Iter {}, rew {}".format(
                step, result["policy_reward_mean"]))
            print("Total reward", result["episode_reward_mean"])
        if result["episode_reward_mean"] >= 25 * num_agents:
            return
    # The loop ran out without hitting the target: dump the last metrics.
    print(result)
    raise Exception("failed to improve reward")