This article collects and summarizes typical usage examples of the constants.ENTROPY_BETA attribute in Python. If you are wondering exactly how constants.ENTROPY_BETA is used, how to call it, or what its usage looks like in practice, the hand-picked code examples below may help. You can also explore further usage examples from the constants module that the attribute belongs to.
The following presents 6 code examples of the constants.ENTROPY_BETA attribute, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
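For context, ENTROPY_BETA is the coefficient that weights the policy-entropy bonus in the A3C loss used throughout the examples below. A minimal sketch of how a constants module might define it is shown here; the value 0.01 is a common A3C default and is an assumption for illustration, not a value taken from any of these repositories.

# constants.py (hypothetical sketch; actual values are repository-specific)
ENTROPY_BETA = 0.01   # weight of the entropy-regularization term in the policy loss
ACTION_SIZE = 4       # number of discrete actions (assumed for illustration)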
Example 1: rl_loss
# Required import: import constants [as alias]
# Or: from constants import ENTROPY_BETA [as alias]
def rl_loss(self):
    with tf.variable_scope('a3c_loss'):
        action_size = self.pi.get_shape().as_list()[1]
        self.taken_action = tf.placeholder(tf.float32, [None, action_size], name='taken_action')

        # temporal difference (R - V), input for the policy loss
        self.td = tf.placeholder(tf.float32, [None], name='td_placeholder')

        # avoid NaN by clipping when a value in pi becomes zero
        log_pi = tf.log(tf.clip_by_value(self.pi, 1e-20, 1.0))

        # policy entropy
        entropy = -tf.reduce_sum(self.pi * log_pi, axis=1)

        # policy loss (output); the minus sign is added because the original paper's
        # objective is maximized with gradient ascent, while we use a gradient
        # descent optimizer
        self.policy_loss = -tf.reduce_mean(tf.reduce_sum(
            tf.multiply(log_pi, self.taken_action), axis=1) * self.td + entropy * constants.ENTROPY_BETA)

        # R (input for value)
        self.r = tf.placeholder(tf.float32, [None], name='reward_placeholder')

        # value loss (output)
        # (the critic's learning rate is half of the actor's, and another 0.5 comes
        # from the L2-loss convention, hence the 0.25 factor)
        self.value_loss = 0.25 * tf.losses.huber_loss(self.r, self.v)

        # gradients of policy and value are summed up
        self.rl_total_loss = self.policy_loss + self.value_loss
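To get a feel for what the entropy * constants.ENTROPY_BETA term contributes, the small NumPy sketch below reproduces the same entropy computation outside the TensorFlow graph. The action probabilities and the 0.01 value for ENTROPY_BETA are illustrative assumptions.

import numpy as np

ENTROPY_BETA = 0.01  # assumed value, for illustration only

# two example action distributions: one near-deterministic, one uniform
pi = np.array([[0.97, 0.01, 0.01, 0.01],
               [0.25, 0.25, 0.25, 0.25]])

log_pi = np.log(np.clip(pi, 1e-20, 1.0))   # same clipping as in the graph
entropy = -np.sum(pi * log_pi, axis=1)     # per-sample policy entropy

print(entropy)                 # ~[0.17, 1.39]: the uniform policy has higher entropy
print(ENTROPY_BETA * entropy)  # the bonus added to the policy objective

A larger ENTROPY_BETA therefore rewards spreading probability mass across actions, discouraging premature convergence to a single action.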
Example 2: __init__
# Required import: import constants [as alias]
# Or: from constants import ENTROPY_BETA [as alias]
def __init__(self,
             thread_index,
             global_network,
             initial_learning_rate,
             learning_rate_input,
             grad_applier,
             max_global_time_step,
             device,
             network_scope="network",
             scene_scope="scene",
             task_scope="task"):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.network_scope = network_scope
    self.scene_scope = scene_scope
    self.task_scope = task_scope
    self.scopes = [network_scope, scene_scope, task_scope]

    self.local_network = ActorCriticFFNetwork(
        action_size=ACTION_SIZE,
        device=device,
        network_scope=network_scope,
        scene_scopes=[scene_scope])
    self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(self.local_network.total_loss,
                                  self.local_network.get_vars())

    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    accum_grad_names = [self._local_var_name(x) for x in self.trainer.get_accum_grad_list()]
    global_net_vars = [x for x in global_network.get_vars() if self._get_accum_grad_name(x) in accum_grad_names]

    self.apply_gradients = grad_applier.apply_gradients(
        global_net_vars, self.trainer.get_accum_grad_list())

    self.sync = self.local_network.sync_from(global_network)

    self.env = None

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0
    self.episode_length = 0
    self.episode_max_q = -np.inf
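Example 2 only builds the graph; the entropy-regularized loss prepared with prepare_loss(ENTROPY_BETA, self.scopes) is actually minimized when the accumulate/apply/reset operations are run. The sketch below shows how such a training step is commonly driven; the session handle, the feed-dict contents, and the annealed learning-rate value are assumptions for illustration, not code from the repository.

# hypothetical training step for the trainer built in Example 2 (sketch only)
def accumulate_and_apply(self, sess, feed_dict, learning_rate):
    # start from zeroed gradient accumulators
    sess.run(self.reset_gradients)
    # accumulate gradients of the ENTROPY_BETA-regularized total loss over one
    # rollout (feed_dict carries the states, taken actions, td and R values)
    sess.run(self.accum_gradients, feed_dict=feed_dict)
    # push the accumulated gradients to the shared global network
    sess.run(self.apply_gradients,
             feed_dict={self.learning_rate_input: learning_rate})
    # refresh the local copy from the updated global weights
    sess.run(self.sync)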
Example 3: __init__
# Required import: import constants [as alias]
# Or: from constants import ENTROPY_BETA [as alias]
def __init__(self,
             thread_index,
             global_network,
             pinitial_learning_rate,
             plearning_rate_input,
             pgrad_applier,
             vinitial_learning_rate,
             vlearning_rate_input,
             vgrad_applier,
             max_global_time_step,
             device,
             task_index=""):
    self.thread_index = thread_index
    self.plearning_rate_input = plearning_rate_input
    self.vlearning_rate_input = vlearning_rate_input
    self.max_global_time_step = max_global_time_step

    self.game_state = GameState()
    state = self.game_state.reset()
    self.game_state.reset_gs(state)
    self.action_size = self.game_state.action_size
    self.state_size = self.game_state.state_size
    self.local_max_iter = self.game_state.local_max_iter

    if USE_LSTM:
        self.local_network = GameACLSTMNetwork(self.action_size, self.state_size,
                                               self.game_state.action_low,
                                               self.game_state.action_high,
                                               thread_index, device)
    else:
        self.local_network = GameACFFNetwork(self.action_size, self.state_size,
                                             self.game_state.action_low,
                                             self.game_state.action_high,
                                             thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        pvar_refs = [v._ref() for v in self.local_network.get_pvars()]
        self.policy_gradients = tf.gradients(
            self.local_network.policy_loss, pvar_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)
        vvar_refs = [v._ref() for v in self.local_network.get_vvars()]
        self.value_gradients = tf.gradients(
            self.local_network.value_loss, vvar_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    self.apply_policy_gradients = pgrad_applier.apply_gradients(
        self.local_network.get_pvars(),
        self.policy_gradients)
    self.apply_value_gradients = vgrad_applier.apply_gradients(
        self.local_network.get_vvars(),
        self.value_gradients)

    self.local_t = 0

    self.pinitial_learning_rate = pinitial_learning_rate
    self.vinitial_learning_rate = vinitial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
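Example 3 keeps separate learning-rate placeholders and gradient appliers for the policy and value heads, so each update has to feed both rates. A sketch of such an update step follows; the session handle, the feed-dict contents, and the plr/vlr values are assumptions for illustration, not code from the repository.

# hypothetical update step for the two-optimizer trainer in Example 3 (sketch only)
def apply_both_gradients(self, sess, feed_dict, plr, vlr):
    # policy gradients (the policy loss includes the ENTROPY_BETA entropy bonus)
    pfeed = dict(feed_dict)
    pfeed[self.plearning_rate_input] = plr
    sess.run(self.apply_policy_gradients, feed_dict=pfeed)

    # value gradients, applied with their own learning rate
    vfeed = dict(feed_dict)
    vfeed[self.vlearning_rate_input] = vlr
    sess.run(self.apply_value_gradients, feed_dict=vfeed)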
Example 4: __init__
# Required import: import constants [as alias]
# Or: from constants import ENTROPY_BETA [as alias]
def __init__(self,
             thread_index,
             global_network,
             initial_learning_rate,
             learning_rate_input,
             grad_applier,
             max_global_time_step,
             device,
             task_index=""):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
        self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = [v._ref() for v in self.local_network.get_vars()]
        self.gradients = tf.gradients(
            self.local_network.total_loss, var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    if global_network:
        # threading mode: apply gradients to the shared global network
        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.gradients)
        self.sync = self.local_network.sync_from(global_network)
        self.mode = "threading"
    else:
        # distributed TensorFlow mode: apply gradients to the local replica
        self.apply_gradients = grad_applier.apply_gradients(
            self.local_network.get_vars(),
            self.gradients)
        self.mode = "dist_tensor"

    if not task_index:
        self.game_state = GameState(113 * thread_index)
    else:
        self.game_state = GameState(113 * task_index)

    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
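Trainers like the one in Example 4 usually anneal the value fed into learning_rate_input linearly toward zero as the global step approaches max_global_time_step. The helper below shows that common pattern as a sketch; it is an assumption, not code taken from the repository.

# hypothetical linear learning-rate annealing (sketch only)
def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * \
        (self.max_global_time_step - global_time_step) / self.max_global_time_step
    # never let the learning rate go negative at the very end of training
    return max(learning_rate, 0.0)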
Example 5: __init__
# Required import: import constants [as alias]
# Or: from constants import ENTROPY_BETA [as alias]
def __init__(self,
             thread_index,
             global_network,
             initial_learning_rate,
             learning_rate_input,
             grad_applier,
             max_global_time_step,
             device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
        self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = [v._ref() for v in self.local_network.get_vars()]
        self.gradients = tf.gradients(
            self.local_network.total_loss, var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.gradients)

    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)

    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
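Examples 3 through 6 all call self.local_network.prepare_loss(ENTROPY_BETA), but the networks' implementation of that method is not shown on this page. The sketch below illustrates what such a method conventionally builds for a discrete-action actor-critic network with a total_loss attribute, as used in Example 5; the placeholder and attribute names (self.a, self.td, self.r, self.pi, self.v, self._device, self._action_size) are assumptions, not the repositories' actual code.

# hypothetical prepare_loss for a discrete-action actor-critic network (sketch only)
def prepare_loss(self, entropy_beta):
    with tf.device(self._device):
        self.a = tf.placeholder(tf.float32, [None, self._action_size])  # taken actions (one-hot)
        self.td = tf.placeholder(tf.float32, [None])                    # advantage estimate (R - V)
        self.r = tf.placeholder(tf.float32, [None])                     # discounted return R

        log_pi = tf.log(tf.clip_by_value(self.pi, 1e-20, 1.0))
        entropy = -tf.reduce_sum(self.pi * log_pi, axis=1)

        # policy loss with the entropy bonus weighted by entropy_beta
        policy_loss = -tf.reduce_sum(
            tf.reduce_sum(log_pi * self.a, axis=1) * self.td + entropy * entropy_beta)

        # value loss (0.5 from the L2 convention; the critic effectively learns at half rate)
        value_loss = 0.5 * tf.nn.l2_loss(self.r - self.v)

        self.total_loss = policy_loss + value_loss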
Example 6: __init__
# Required import: import constants [as alias]
# Or: from constants import ENTROPY_BETA [as alias]
def __init__(self,
             thread_index,
             global_network,
             initial_learning_rate,
             learning_rate_input,
             grad_applier,
             max_global_time_step,
             device,
             FLAGS="",
             task_index=""):
    self.thread_index = thread_index
    self.task_index = task_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.limit_global_time_step = 100 * 10**6

    if FLAGS.use_lstm:
        self.local_network = GameACPathNetLSTMNetwork(ACTION_SIZE, thread_index, device, FLAGS)
    else:
        self.local_network = GameACPathNetNetwork(ACTION_SIZE, thread_index, device, FLAGS)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = [v._ref() for v in self.local_network.get_vars()]
        self.gradients = tf.gradients(
            self.local_network.total_loss, var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
        self.local_network.get_vars(),
        self.gradients)

    self.game_state = GameState(113 * task_index)

    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
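Unlike the earlier examples, Example 6 reads FLAGS.use_lstm instead of the USE_LSTM constant. A minimal sketch of how such a FLAGS object could be defined with TensorFlow 1.x command-line flags is shown below; the task_index flag name is an assumption for illustration.

# hypothetical flag definitions for Example 6 (sketch only)
import tensorflow as tf

tf.app.flags.DEFINE_boolean("use_lstm", True,
                            "use the PathNet LSTM network instead of the feed-forward one")
tf.app.flags.DEFINE_integer("task_index", 0,
                            "worker task index in distributed training (assumed flag name)")

FLAGS = tf.app.flags.FLAGS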