本文整理汇总了 Python 中 agent.Agent.get_action 方法的典型用法代码示例。如果您正苦于以下问题：Python Agent.get_action 方法的具体用法？Python Agent.get_action 怎么用？Python Agent.get_action 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 agent.Agent 的用法示例。
在下文中一共展示了 Agent.get_action 方法的 4 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: Environment
# 需要导入模块: from agent import Agent [as 别名]
# 或者: from agent.Agent import get_action [as 别名]
class Environment():
    """Training harness that drives an ``Agent`` on a monitored gym environment.

    ``ENV``, ``NUM_EPISODE`` and ``MAX_STEPS`` are module-level names defined
    outside this block.
    """

    def __init__(self):
        """Build the monitored environment and an agent sized to its spaces."""
        env = gym.make(ENV)
        self.env = wrappers.Monitor(env, '/tmp/gym/mountaincar_dqn', force=True)
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.n
        self.agent = Agent(self.num_states, self.num_actions)

    def _run_episode(self, episode, complete_episodes, render, output):
        """Play one episode and return the updated success streak.

        Args:
            episode: Index of the episode, forwarded to ``agent.get_action``.
            complete_episodes: Success streak before this episode.
            render: When true, ``env.render`` is called before every step.
            output: Open text file that receives one line when the episode ends.

        Returns:
            The success streak after this episode.
        """
        observation = self.env.reset()
        state = torch.from_numpy(observation).type(torch.FloatTensor)
        state = torch.unsqueeze(state, 0)

        for step in range(MAX_STEPS):
            if render:
                self.env.render(mode='rgb_array')

            action = self.agent.get_action(state, episode)
            observation_next, _, done, _ = self.env.step(action.item())

            state_next = torch.from_numpy(observation_next).type(torch.FloatTensor)
            state_next = torch.unsqueeze(state_next, 0)

            # The environment's own reward is discarded; the agent only sees
            # a terminal +1 / -1 and 0 on every other step.
            reward = torch.FloatTensor([0.0])
            if done:
                # A terminal transition is stored with no successor state.
                state_next = None
                if 199 <= step:
                    # Ended at step index 199 or later: penalise and reset the
                    # streak (presumably the time limit was hit -- confirm
                    # against ENV).
                    reward = torch.FloatTensor([-1.0])
                    complete_episodes = 0
                else:
                    reward = torch.FloatTensor([1.0])
                    complete_episodes = complete_episodes + 1

            self.agent.memory(state, action, state_next, reward)
            self.agent.update_q_function()
            state = state_next

            if done:
                message = 'episode: {0}, step: {1}'.format(episode, step)
                print(message)
                output.write(message + '\n')
                break

        return complete_episodes

    def run(self):
        """Train for up to ``NUM_EPISODE`` episodes, logging finished episodes.

        Each finished episode is printed and written to ``result.log``. The log
        file and the environment are released even when training raises.
        """
        complete_episodes = 0
        # Never becomes True in this block (the assignment further down is
        # commented out), so rendering and the early exit stay dormant.
        episode_final = False

        try:
            with open('result.log', 'w') as output:
                print(self.num_states, self.num_actions)
                for episode in range(NUM_EPISODE):
                    complete_episodes = self._run_episode(
                        episode, complete_episodes, episode_final, output)

                    if episode_final:
                        break

                    if 10 <= complete_episodes:
                        print('success 10 times in sequence')
                        # episode_final = True
        finally:
            # Closing in ``finally`` so the Monitor wrapper is always closed.
            self.env.close()
示例2: main
# 需要导入模块: from agent import Agent [as 别名]
# 或者: from agent.Agent import get_action [as 别名]
def main(env_name, render=False, monitor=True, load=False, seed=0):
    """Train an ``Agent`` on a gym environment for 1000 episodes of <= 200 steps.

    After every episode one tab-separated line is printed (episode index,
    reward sum, ``agent.epsilon``, ``agent.loss``, mean of the per-step ``q``
    values, ``agent.step``) and the model is saved.

    Args:
        env_name: Environment id passed to ``gym.make``; also used to build the
            video and model paths.
        render: When true, ``env.render()`` is called before every step.
        monitor: When true, ``env.monitor`` is started before training and
            closed afterwards.
        load: When true, the agent loads a model from the model path first.
        seed: Forwarded to ``Agent`` and to ``env.monitor.start``.

    Raises:
        ValueError: If the action space is neither ``Discrete`` nor ``Box``.
    """
    env = gym.make(env_name)
    view_path = "./video/" + env_name
    model_path = "./model/" + env_name + "_"

    n_st = env.observation_space.shape[0]

    action_space = env.action_space
    if isinstance(action_space, gym.spaces.discrete.Discrete):
        # CartPole-v0, Acrobot-v0, MountainCar-v0
        n_act = action_space.n
        action_list = range(0, n_act)
    elif isinstance(action_space, gym.spaces.box.Box):
        # Pendulum-v0: the continuous action is reduced to two fixed choices.
        action_list = [np.array([a]) for a in [-2.0, 2.0]]
        n_act = len(action_list)
    else:
        # Previously this fell through and failed later on an unbound name.
        raise ValueError(
            "unsupported action space: {0!r}".format(action_space))

    agent = Agent(n_st, n_act, seed)
    if load:
        agent.load_model(model_path)

    if monitor:
        env.monitor.start(view_path, video_callable=None, force=True, seed=seed)
    try:
        # ``range`` and ``print(...)`` with a single argument behave the same
        # on Python 2 and Python 3.
        for i_episode in range(1000):
            observation = env.reset()
            r_sum = 0
            q_list = []
            for t in range(200):
                if render:
                    env.render()
                state = observation.astype(np.float32).reshape((1, n_st))
                act_i, q = agent.get_action(state)
                q_list.append(q)
                action = action_list[act_i]
                observation, reward, ep_end, _ = env.step(action)
                state_dash = observation.astype(np.float32).reshape((1, n_st))
                agent.stock_experience(state, act_i, reward, state_dash, ep_end)
                agent.train()
                r_sum += reward
                if ep_end:
                    break
            # ``t`` is the index of the last step played, so t + 1 steps ran.
            print("\t".join(map(str, [
                i_episode,
                r_sum,
                agent.epsilon,
                agent.loss,
                sum(q_list) / float(t + 1),
                agent.step,
            ])))
            agent.save_model(model_path)
    finally:
        # Close the monitor even if training is interrupted.
        if monitor:
            env.monitor.close()
示例3: __init__
# 需要导入模块: from agent import Agent [as 别名]
# 或者: from agent.Agent import get_action [as 别名]
class Environment:
    """Training / playback harness for an ``Agent`` on a wrapped Atari env.

    ``Config``, ``Agent``, ``make_atari`` and ``wrap_deepmind`` are defined
    outside this block.
    """

    def __init__(self, config):
        """Create the frame-stacked environment and the agent.

        Args:
            config: Run configuration. ``model_type`` is overwritten with
                ``Config.MODEL_TYPE_CONV2D``. The agent loads a model when
                ``data_path`` differs from ``Config.DATA_PATH_DEFAULT``.
        """
        config.model_type = Config.MODEL_TYPE_CONV2D
        print(config.device)
        self.config = config
        self.env = wrap_deepmind(make_atari(config.env), frame_stack=True)
        self.num_states = self.env.observation_space.shape[-1]
        self.num_actions = self.env.action_space.n
        self.agent = Agent(config, self.num_states, self.num_actions, self.config.num_atoms)
        # Rolling window of the last 100 episode lengths (starts as zeros).
        self.total_step = np.zeros(100)
        self.data_path = config.data_path
        if self.data_path != Config.DATA_PATH_DEFAULT:
            self.agent.load_model()

    def prepro(self, observation):
        """Convert a channels-last frame stack to a channels-first array.

        Args:
            observation: Array-like whose last axis indexes the stacked frames,
                e.g. shape (84, 84, 4).

        Returns:
            A new C-contiguous ``float64`` array with the last axis moved to
            the front, e.g. (4, 84, 84). Every channel is kept; the sizes are
            no longer fixed to four frames of 84x84.
        """
        frames = np.asarray(observation)
        # np.array copies, so the result never aliases the input frames.
        return np.array(np.moveaxis(frames, -1, 0), dtype=np.float64, order='C')

    def run_episode(self, episode, steps_accumulated=0):
        """Play one episode and return the number of steps taken.

        Args:
            episode: Episode index, forwarded to ``agent.learn`` and printed.
            steps_accumulated: Offset added to the step index passed to
                ``agent.get_action``.

        Returns:
            ``step + 1`` when the environment reports ``done``, otherwise
            ``config.num_steps``.
        """
        start_time = time.time()
        total_reward = 0
        observation = self.prepro(self.env.reset())
        state = torch.from_numpy(observation).to(self.config.device, dtype=torch.uint8).unsqueeze(0)

        for step in range(self.config.num_steps):
            if self.config.is_render:
                # Slow playback down so the rendered frames can be watched.
                time.sleep(0.064)
                self.env.render()

            action = self.agent.get_action(state, step + steps_accumulated)
            observation_next, reward, done, _ = self.env.step(action.item())

            if done:
                # A terminal transition has no successor state.
                state_next = None
                # Drop the oldest episode length and append this one.
                self.total_step = np.hstack((self.total_step[1:], step + 1))
            else:
                state_next = self.prepro(observation_next)
                state_next = torch.from_numpy(state_next).to(self.config.device, dtype=torch.uint8).unsqueeze(0)

            total_reward += reward
            # NOTE(review): uint8 cannot represent a negative reward -- confirm
            # the wrapped environment never emits one, or widen the dtype
            # together with the Agent.
            reward = torch.tensor([reward], dtype=torch.uint8, device=self.config.device)

            # In render mode the agent neither stores transitions nor learns.
            # NOTE(review): the nesting of the learn step under this guard is
            # reconstructed from unindented source -- confirm.
            if not self.config.is_render:
                self.agent.observe(state, action, state_next, reward)
                if step % self.config.replay_interval == 0:
                    self.agent.learn(episode)

            state = state_next

            if done:
                elapsed_time = round(time.time() - start_time, 3)
                print('episode: {0}, steps: {1}, mean steps {2}, time: {3}, reward: {4}'.format(episode, step, self.total_step.mean(), elapsed_time, total_reward))
                return step + 1

        return self.config.num_steps

    def run(self):
        """Run the warm-up phase (unless rendering) and then all episodes.

        Warm-up episodes are played with episode index -1 until at least
        ``config.steps_learning_start`` steps have been taken. Afterwards the
        environment is closed and the model is saved when ``config.is_saved``.
        """
        if not self.config.is_render:
            steps = 0
            while True:
                steps += self.run_episode(-1)
                if self.config.steps_learning_start <= steps:
                    break

        # The step counter restarts at zero for the numbered episodes.
        steps = 0
        for episode in range(self.config.num_episodes):
            steps += self.run_episode(episode, steps)

        self.env.close()
        if self.config.is_saved:
            self.agent.save_model()
示例4: __init__
# 需要导入模块: from agent import Agent [as 别名]
# 或者: from agent.Agent import get_action [as 别名]
class Environment:
    """Training harness that runs an ``Agent`` on the gym environment ``ENV``.

    ``ENV``, ``MAX_STEPS``, ``NUM_STEPS_TO_SUCCEED`` and
    ``MEAN_STEPS_TO_SUCCEED`` are module-level names defined outside this block.
    """

    def __init__(self, config):
        """Create the environment and an agent sized to its spaces."""
        print(config.device)
        self.config = config
        self.env = gym.make(ENV)
        # Recording is disabled; the original kept this wrapper commented out:
        # self.env = wrappers.Monitor(self.env, '/tmp/gym/cartpole_dqn', force=True)
        observation_space = self.env.observation_space
        self.num_states = observation_space.shape[0]
        self.num_actions = self.env.action_space.n
        self.agent = Agent(config, self.num_states, self.num_actions, config.num_atoms)
        # Rolling window of the last 100 episode lengths (starts as zeros).
        self.total_step = np.zeros(100)

    def is_success_episode(self, step):
        """Return whether ``step`` reaches ``NUM_STEPS_TO_SUCCEED``."""
        return NUM_STEPS_TO_SUCCEED <= step

    def run_episode(self, episode, steps_accumulated=0):
        """Play one episode and return the number of steps taken.

        The environment's own reward is ignored: the agent observes 0 on every
        non-terminal step and +1 / -1 on the terminal one, depending on
        ``is_success_episode``.

        Args:
            episode: Episode index, forwarded to ``agent.learn`` and printed.
            steps_accumulated: Offset added to the step index passed to
                ``agent.get_action``.

        Returns:
            ``step + 1`` when the environment reports ``done``, otherwise
            ``MAX_STEPS``.
        """
        began_at = time.time()
        device = self.config.device

        def as_state(raw):
            # Observation array -> float32 tensor with a leading batch axis.
            return torch.from_numpy(raw).to(device, dtype=torch.float32).unsqueeze(0)

        def as_reward(value):
            return torch.tensor([value], dtype=torch.float32, device=device)

        state = as_state(self.env.reset())

        for step in range(MAX_STEPS):
            action = self.agent.get_action(state, step + steps_accumulated)
            observation_next, _, done, _ = self.env.step(action.item())

            if not done:
                reward = as_reward(0.0)
                state_next = as_state(observation_next)
            else:
                # A terminal transition has no successor state.
                state_next = None
                # Drop the oldest episode length and append this one.
                history = self.total_step[1:]
                self.total_step = np.hstack((history, step + 1))
                reward = as_reward(1.0 if self.is_success_episode(step) else -1.0)

            self.agent.observe(state, action, state_next, reward)

            replay_due = step % self.config.replay_interval == 0
            if replay_due:
                self.agent.learn(episode)

            state = state_next

            if done:
                elapsed_time = round(time.time() - began_at, 3)
                summary = 'episode: {0}, steps: {1}, mean steps {2}, time: {3}'.format(episode, step, self.total_step.mean(), elapsed_time)
                print(summary)
                return step + 1

        return MAX_STEPS

    def run(self):
        """Run warm-up episodes, then numbered episodes until success.

        Warm-up episodes use episode index -1 and continue until more than
        ``config.steps_learning_start`` steps have been taken. The numbered
        loop stops early once the mean of the last 100 episode lengths reaches
        ``MEAN_STEPS_TO_SUCCEED``.
        """
        # At least one warm-up episode is always played.
        warmup_steps = self.run_episode(-1)
        while not self.config.steps_learning_start < warmup_steps:
            warmup_steps += self.run_episode(-1)

        # The step counter restarts at zero for the numbered episodes.
        steps = 0
        for episode in range(self.config.num_episodes):
            if self.total_step.mean() >= MEAN_STEPS_TO_SUCCEED:
                print('over {0} steps of average last 100 episodes, last episode: {1}, steps: {2}'.format(MEAN_STEPS_TO_SUCCEED, episode, steps))
                break
            steps += self.run_episode(episode, steps)

        self.env.close()