本文整理汇总了Python中baselines.common.mpi_moments.mpi_moments方法的典型用法代码示例。如果您正苦于以下问题:Python mpi_moments.mpi_moments方法的具体用法?Python mpi_moments.mpi_moments怎么用?Python mpi_moments.mpi_moments使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类baselines.common.mpi_moments
的用法示例。
在下文中一共展示了mpi_moments.mpi_moments方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: mpi_average
# 需要导入模块: from baselines.common import mpi_moments [as 别名]
# 或者: from baselines.common.mpi_moments import mpi_moments [as 别名]
def mpi_average(value):
    """Return the per-element mean of *value* averaged over all MPI workers.

    Scalars are promoted to one-element lists, and an empty list is
    replaced by [0.] so mpi_moments always receives non-empty data.
    """
    vals = value if isinstance(value, list) else [value]
    if not vals:
        vals = [0.]
    return mpi_moments(np.array(vals))[0]
示例2: mpi_mean
# 需要导入模块: from baselines.common import mpi_moments [as 别名]
# 或者: from baselines.common.mpi_moments import mpi_moments [as 别名]
def mpi_mean(value):
    """Return the global scalar mean of *value* across all MPI workers."""
    # Promote a bare scalar to a list; an empty list becomes [0.] so the
    # mean of "nothing" is 0 rather than an error inside mpi_moments.
    if not isinstance(value, list):
        value = [value]
    elif value == []:
        value = [0.]
    # mpi_moments returns (mean, std, count); take the first mean entry.
    return mpi_moments(np.array(value))[0][0]
示例3: mpi_std
# 需要导入模块: from baselines.common import mpi_moments [as 别名]
# 或者: from baselines.common.mpi_moments import mpi_moments [as 别名]
def mpi_std(value):
    """Return the global scalar standard deviation of *value* over all MPI workers."""
    # Wrap scalars; an empty list falls back to [0.] (std of a single zero).
    data = [value] if not isinstance(value, list) else (value or [0.])
    # mpi_moments returns (mean, std, count); index [1][0] is the first std entry.
    return mpi_moments(np.array(data))[1][0]
示例4: mpi_average
# 需要导入模块: from baselines.common import mpi_moments [as 别名]
# 或者: from baselines.common.mpi_moments import mpi_moments [as 别名]
def mpi_average(value):
    """Return the MPI-averaged mean of *value*.

    Unlike the ==[] variant, this version collapses any all-falsy input
    (empty list OR all zeros/False) to [0.] before averaging.
    """
    vals = [value] if not isinstance(value, list) else value
    # any() is False for both an empty list and an all-zero list; keep that.
    vals = vals if any(vals) else [0.]
    return mpi_moments(np.array(vals))[0]
示例5: run
# 需要导入模块: from baselines.common import mpi_moments [as 别名]
# 或者: from baselines.common.mpi_moments import mpi_moments [as 别名]
def run(self):
    """Collect one rollout window of transitions and compute GAE advantages.

    Maintains a sliding buffer (``self.mb_stuff``) that keeps
    ``num_steps_to_cut_left + num_steps_to_cut_right`` extra steps around the
    ``nsteps`` training window so advantages near the window edges are
    well defined; steps in the left margin are masked out of the loss.

    Returns:
        Tuple of ``sf01``-flattened arrays
        (obs, increase_ent, advantages, dones, actions, values,
        neglogpacs, valids, returns), followed by the initial recurrent
        state and the list of episode-info dicts collected this call.
    """
    # Shift the buffer forward once it already holds a full window + margins.
    if len(self.mb_stuff[2]) >= self.nsteps + self.num_steps_to_cut_left + self.num_steps_to_cut_right:
        self.mb_stuff = [l[self.nsteps:] for l in self.mb_stuff]
    mb_obs, mb_increase_ent, mb_rewards, mb_reward_avg, mb_actions, mb_values, mb_valids, mb_random_resets, \
        mb_dones, mb_neglogpacs, mb_states = self.mb_stuff
    epinfos = []
    # Step the vectorized env until the buffer is full again.
    while len(mb_rewards) < self.nsteps + self.num_steps_to_cut_left + self.num_steps_to_cut_right:
        actions, values, states, neglogpacs = self.model.step(mb_obs[-1], mb_states[-1], mb_dones[-1], mb_increase_ent[-1])
        mb_actions.append(actions)
        mb_values.append(values)
        mb_states.append(states)
        mb_neglogpacs.append(neglogpacs)
        obs, rewards, dones, infos = self.env.step(actions)
        # np.cast[...] was removed in NumPy 2.0; np.asarray(..., dtype=...) is equivalent.
        mb_obs.append(np.asarray(obs, dtype=self.model.train_model.X.dtype.name))
        mb_increase_ent.append(np.asarray([info.get('increase_entropy', False) for info in infos], dtype=np.uint8))
        mb_rewards.append(rewards)
        mb_dones.append(dones)
        # Transitions flagged invalid by the replay-reset wrapper get no gradient.
        mb_valids.append([(not info.get('replay_reset.invalid_transition', False)) for info in infos])
        mb_random_resets.append(np.array([info.get('replay_reset.random_reset', False) for info in infos]))
        for info in infos:
            maybeepinfo = info.get('episode')
            if maybeepinfo:
                epinfos.append(maybeepinfo)
    # Generalized Advantage Estimation, computed backwards through time.
    mb_advs = [np.zeros_like(mb_values[0])] * (len(mb_rewards) + 1)
    for t in reversed(range(len(mb_rewards))):
        if t < self.num_steps_to_cut_left:
            # Left-margin steps only exist for context; mask them out.
            mb_valids[t] = np.zeros_like(mb_valids[t])
        else:
            if t == len(mb_values) - 1:
                # Bootstrap the value of the final observation.
                next_value = self.model.value(mb_obs[-1], mb_states[-1], mb_dones[-1])
            else:
                next_value = mb_values[t + 1]
            use_next = np.logical_not(mb_dones[t + 1])          # stop bootstrapping at episode ends
            adv_mask = np.logical_not(mb_random_resets[t + 1])  # zero advantages across random resets
            delta = mb_rewards[t] + self.gamma * use_next * next_value - mb_values[t]
            mb_advs[t] = adv_mask * (delta + self.gamma * self.lam * use_next * mb_advs[t + 1])
    # Extract the trainable slice (left margin + nsteps) as dense arrays.
    end = self.nsteps + self.num_steps_to_cut_left
    ar_mb_obs = np.asarray(mb_obs[:end], dtype=self.model.train_model.X.dtype.name)
    ar_mb_ent = np.stack(mb_increase_ent[:end], axis=0)
    ar_mb_valids = np.asarray(mb_valids[:end], dtype=np.float32)
    ar_mb_actions = np.asarray(mb_actions[:end])
    ar_mb_values = np.asarray(mb_values[:end], dtype=np.float32)
    ar_mb_neglogpacs = np.asarray(mb_neglogpacs[:end], dtype=np.float32)
    # np.bool was removed in NumPy 1.24; the builtin bool is the correct dtype.
    ar_mb_dones = np.asarray(mb_dones[:end], dtype=bool)
    ar_mb_advs = np.asarray(mb_advs[:end], dtype=np.float32)
    ar_mb_rets = ar_mb_values + ar_mb_advs
    if self.norm_adv:
        # Normalize advantages with moments pooled across all MPI workers.
        adv_mean, adv_std, _ = mpi_moments(ar_mb_advs.ravel())
        ar_mb_advs = (ar_mb_advs - adv_mean) / (adv_std + 1e-7)
    # obs, increase_ent, advantages, masks, actions, values, neglogpacs, valids, returns, states, epinfos = runner.run()
    return (*map(sf01, (ar_mb_obs, ar_mb_ent, ar_mb_advs, ar_mb_dones, ar_mb_actions, ar_mb_values, ar_mb_neglogpacs, ar_mb_valids, ar_mb_rets)),
            mb_states[0], epinfos)