This page collects typical usage examples of the Python class env.Env. If you are wondering what env.Env does, how to use it, or what working code looks like, the curated examples here should help. You can also explore further usage examples from the env module it belongs to.
Four code examples of env.Env are shown below, sorted by popularity by default.
Example 1: test
# Required module: import env [as alias]
# Or: from env import Env [as alias]
import torch


def test(actor):
  with torch.no_grad():
    env = Env()
    state, done, total_reward = env.reset(), False, 0
    while not done:
      action = torch.clamp(actor(state), min=-1, max=1)  # Use purely exploitative policy at test time
      state, reward, done = env.step(action)
      total_reward += reward
    return total_reward
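The examples on this page assume an env.Env that exposes reset(), returning an initial state, and step(action), returning a (state, reward, done) triple. The project's real environment is not shown here; the sketch below is only a hypothetical stand-in with that interface (the state size, horizon, and reward are made up) so that test(actor) above can be exercised end to end.

import torch


class Env:
  """Hypothetical stand-in matching the reset()/step() contract used above."""

  def __init__(self, state_size=3, max_steps=200):  # Made-up dimensions and horizon
    self.state_size, self.max_steps, self.t = state_size, max_steps, 0

  def reset(self):
    self.t = 0
    return torch.zeros(1, self.state_size)  # Initial state, batch of one

  def step(self, action):
    self.t += 1
    state = torch.randn(1, self.state_size)  # Placeholder next state
    reward = float(-action.abs().sum())      # Placeholder reward signal
    done = self.t >= self.max_steps          # Fixed-horizon termination
    return state, reward, done

With this stand-in, test(torch.nn.Linear(3, 1)) would run and return a total reward, since the Linear module maps the (1, 3) state to a (1, 1) continuous action.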
Example 2: test
# Required module: import env [as alias]
# Or: from env import Env [as alias]
import torch


def test(agent):
  with torch.no_grad():
    env = Env()
    state, done, total_reward = env.reset(), False, 0
    while not done:
      action = agent(state).argmax(dim=1, keepdim=True)  # Use purely exploitative policy at test time
      state, reward, done = env.step(convert_discrete_to_continuous_action(action))
      total_reward += reward
    return total_reward
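Example 2 calls a helper convert_discrete_to_continuous_action that is not shown on this page. A plausible minimal version, assuming the agent picks one of a fixed number of action indices that should be spread evenly over [-1, 1], might look like this (the number of actions and the mapping itself are assumptions):

import torch


def convert_discrete_to_continuous_action(action, num_actions=5):  # num_actions is an assumption
  # action: LongTensor of shape (batch, 1) holding indices in [0, num_actions)
  return action.float() / (num_actions - 1) * 2 - 1  # Map {0, ..., N-1} evenly onto [-1, 1]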
Example 3: test
# Required module: import env [as alias]
# Or: from env import Env [as alias]
import torch


def test(actor):
  with torch.no_grad():
    env = Env()
    state, done, total_reward = env.reset(), False, 0
    while not done:
      action = actor(state).mean  # Use purely exploitative policy at test time
      state, reward, done = env.step(action)
      total_reward += reward
    return total_reward
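In Example 3 the actor's output exposes a .mean attribute, which suggests the policy returns a torch.distributions.Distribution rather than a raw action tensor. A minimal Gaussian actor of that shape (the layer sizes are placeholders) could be:

import torch
from torch import nn
from torch.distributions import Normal


class GaussianActor(nn.Module):
  """Hypothetical policy whose forward() returns a Normal distribution."""

  def __init__(self, state_size=3, action_size=1, hidden_size=32):  # Placeholder sizes
    super().__init__()
    self.net = nn.Sequential(nn.Linear(state_size, hidden_size), nn.Tanh(),
                             nn.Linear(hidden_size, action_size))
    self.log_std = nn.Parameter(torch.zeros(action_size))  # State-independent log std

  def forward(self, state):
    return Normal(self.net(state), self.log_std.exp())  # .mean gives the deterministic action

At test time the code above takes .mean, i.e. the mode of the Gaussian, instead of sampling from the distribution.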
Example 4: test
# Required module: import env [as alias]
# Or: from env import Env [as alias]
import os

import torch


def test(args, T, dqn, val_mem, metrics, results_dir, evaluate=False):
  env = Env(args)
  env.eval()
  metrics['steps'].append(T)
  T_rewards, T_Qs = [], []

  # Test performance over several episodes
  done = True
  for _ in range(args.evaluation_episodes):
    while True:
      if done:
        state, reward_sum, done = env.reset(), 0, False

      action = dqn.act_e_greedy(state)  # Choose an action ε-greedily
      state, reward, done = env.step(action)  # Step
      reward_sum += reward
      if args.render:
        env.render()

      if done:
        T_rewards.append(reward_sum)
        break
  env.close()

  # Test Q-values over validation memory
  for state in val_mem:  # Iterate over valid states
    T_Qs.append(dqn.evaluate_q(state))

  avg_reward, avg_Q = sum(T_rewards) / len(T_rewards), sum(T_Qs) / len(T_Qs)
  if not evaluate:
    # Save model parameters if improved
    if avg_reward > metrics['best_avg_reward']:
      metrics['best_avg_reward'] = avg_reward
      dqn.save(results_dir)

    # Append to results and save metrics
    metrics['rewards'].append(T_rewards)
    metrics['Qs'].append(T_Qs)
    torch.save(metrics, os.path.join(results_dir, 'metrics.pth'))

    # Plot
    _plot_line(metrics['steps'], metrics['rewards'], 'Reward', path=results_dir)
    _plot_line(metrics['steps'], metrics['Qs'], 'Q', path=results_dir)

  # Return average reward and Q-value
  return avg_reward, avg_Q
# Plots min, max and mean + standard deviation bars of a population over time
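_plot_line itself is not included in this excerpt; the comment above describes it as plotting the min, max and mean plus a standard-deviation band of a population over time, and Example 4 calls it with (steps, list of per-episode values, title, path). A hedged sketch with that behaviour, assuming matplotlib is available (the original project may use a different plotting backend) and at least two episodes per evaluation, is:

import os

import matplotlib
matplotlib.use('Agg')  # Assumption: render headlessly to an image file
import matplotlib.pyplot as plt
import torch


def _plot_line(xs, ys_population, title, path=''):
  ys = torch.tensor(ys_population, dtype=torch.float32)  # Shape: (len(xs), population size)
  ys_min, ys_max = ys.min(dim=1).values, ys.max(dim=1).values
  ys_mean, ys_std = ys.mean(dim=1), ys.std(dim=1)

  plt.figure()
  plt.fill_between(xs, (ys_mean - ys_std).tolist(), (ys_mean + ys_std).tolist(),
                   alpha=0.3, label='mean ± std')  # One-standard-deviation band
  plt.plot(xs, ys_mean.tolist(), label='mean')
  plt.plot(xs, ys_min.tolist(), linestyle='--', label='min')
  plt.plot(xs, ys_max.tolist(), linestyle='--', label='max')
  plt.xlabel('Step')
  plt.ylabel(title)
  plt.legend()
  plt.savefig(os.path.join(path, title + '.png'))
  plt.close()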