This article collects typical usage examples of the Python method emulator.Emulator.next. If you are unsure exactly how to use Emulator.next, or are looking for working examples of it, the hand-picked example below may help. You can also read more about its containing class, emulator.Emulator.
One code example of Emulator.next is shown below; examples are ordered by popularity by default.
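Before the full example, here is a minimal sketch of how Emulator.next might be driven from a step loop. Everything in it is an assumption except what the example below documents: the action is a one-hot vector, and next(action) returns the next state, the reward, and a terminal flag. How the emulator is constructed and how the initial state is obtained are not shown on this page.
import numpy as np

def run_episode(emulator, num_actions, initial_state, max_steps=1000):
    """ Drive the emulator with random one-hot actions for one episode.
    `emulator` is assumed to be an already-constructed emulator.Emulator
    instance; obtaining the initial state is not covered by this page. """
    state = initial_state
    total_reward = 0.0
    for _ in range(max_steps):
        # Build a one-hot action vector, as ActorLearner.choose_next_action does.
        action = np.zeros(num_actions)
        action[np.random.randint(0, num_actions)] = 1
        # Emulator.next returns (next_state, reward, terminal); see execute_action below.
        state, reward, terminal = emulator.next(action)
        total_reward += reward
        if terminal:
            break
    return total_reward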
Example 1: ActorLearner
# Required import: from emulator import Emulator [as alias]
# Or: from emulator.Emulator import next [as alias]
#......... part of the code is omitted here .........
        self.global_score = alg_conf['global_score']
        self.global_score_placeholder = alg_conf['global_score_placeholder']
        self.update_global_score_op = alg_conf['update_global_score_op']
        self.global_score_summary = summary_conf['global_score_summary']
        self.thread_score = alg_conf['thread_score']
        self.thread_score_placeholder = alg_conf['thread_score_placeholder']
        self.update_thread_score_op = alg_conf['update_thread_score_op']
        self.rescale_rewards = alg_conf['rescale_rewards']
        if self.rescale_rewards:
            self.thread_max_reward = alg_conf['thread_max_reward']
            self.thread_max_reward_placeholder = \
                alg_conf['thread_max_reward_placeholder']
            self.update_max_reward_op = alg_conf['update_thread_max_reward_op']
            self.max_reward = self.session.run(self.thread_max_reward)
        # Updating the target network at regular intervals w.r.t. the global
        # step, as well as the global step and global scores themselves,
        # requires locking! Otherwise the global step and score are handled
        # asynchronously by TensorFlow; they ought to stay in lock step.
        self.lock = alg_conf['lock']
    def reduce_thread_epsilon(self):
        """ Linear annealing """
        if self.global_step <= self.max_epsilon_annealing_steps:
            self.epsilon = self.epsilon_init - ((self.global_step *
                (self.epsilon_init - self.epsilon_limit)) /
                self.max_epsilon_annealing_steps)
            self.session.run(self.update_thread_epsilon_op,
                feed_dict={self.epsilon_placeholder: self.epsilon})
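        # Illustration (not part of the original code), with hypothetical
        # values: if epsilon_init = 1.0, epsilon_limit = 0.1 and
        # max_epsilon_annealing_steps = 1,000,000, epsilon decays linearly to
        # 0.55 after 500,000 global steps, reaches 0.1 at step 1,000,000, and
        # is no longer updated after that.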
    def choose_next_action(self, state, policy_type):
        """ Epsilon greedy/direct policy """
        new_action = np.zeros([self.num_actions])
        if policy_type == 'e-greedy':
            if np.random.rand() <= self.epsilon:
                action_index = np.random.randint(0, self.num_actions)
            else:
                network_output = self.session.run(
                    self.local_network.output_layer,
                    feed_dict={self.local_network.input_placeholder: state})[0]
                action_index = np.argmax(network_output)
            self.reduce_thread_epsilon()
        elif policy_type == 'direct':
            network_output = self.session.run(
                self.local_network.output_layer_p,
                feed_dict={self.local_network.input_placeholder: state})[0]
            # print('softmax output:', network_output)
            action_index = np.random.choice(
                range(self.num_actions), p=network_output)
        new_action[action_index] = 1
        return new_action
    def execute_action(self, a):
        """ Returns the next state, reward and whether or not the next state
        is terminal. """
        return self.emulator.next(a)
    def apply_gradients_to_shared_network(self):
        """ Apply accumulated gradients to the shared network and clear
        accumulated gradients. """
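The source page cuts the example off here; the remaining method bodies are omitted. As a rough sketch of how the methods shown above might fit together, one possible inner loop of an actor-learner thread follows. The loop structure and the initial_state parameter are assumptions; only the calls to choose_next_action, execute_action and apply_gradients_to_shared_network come from the example itself, and the original project's training loop may differ.
def interaction_loop(learner, initial_state, max_local_steps=5):
    """ Hypothetical inner loop of one actor-learner thread (a sketch,
    not the original project's training loop). """
    state = initial_state
    terminal = False
    while not terminal:
        for _ in range(max_local_steps):
            # choose_next_action returns a one-hot action vector.
            action = learner.choose_next_action(state, 'e-greedy')
            # execute_action forwards the action to Emulator.next and
            # returns (next_state, reward, terminal).
            state, reward, terminal = learner.execute_action(action)
            if terminal:
                break
        # After a batch of local steps, the accumulated gradients would be
        # pushed to the shared network.
        learner.apply_gradients_to_shared_network()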