This article collects typical code examples of the Python method memory.Memory.add. If you have been wondering what Memory.add does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also explore further usage examples of the class it belongs to, memory.Memory.
Three code examples of the Memory.add method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
Example 1: zip
# Required import: from memory import Memory [as alias]
# Or: from memory.Memory import add [as alias]
action, q = sess.run([train_actor_output, train_critic_current_action], feed_dict={k: [[v]] for k, v in zip(state_placeholders, env_state)})
action = action[0]
action = action if testing else eta_noise.reflected_ou(action * np.array([1, 1, 0, 1]), theta=[.15, .15, .75, .15], sigma=[.10, .10, .10, .10], min=-1, max=1)
assert action.shape == env.action_space.sample().shape, (action.shape, env.action_space.sample().shape)
max_xvel = 20
max_yvel = 8
max_yawrate = 0.2
max_altitude = 15
action = np.clip(action, -1, 1) * np.array([max_xvel, max_yvel, max_yawrate, max_altitude / 4.0]) - np.array([0, 0, 0, max_altitude])
env_next_state, env_reward, env_done, env_info = env.step(action)
replay_buffer.add(env_state, env_reward, action, env_done, priority=300)
env_state = env_next_state
total_reward += env_reward
if training:
    states_batch, action_batch, reward_batch, next_states_batch, done_batch, indexes = replay_buffer.sample(BATCH_SIZE, prioritized=True)
    feed = {
        action_placeholder: action_batch,
        reward_placeholder: reward_batch,
        done_placeholder: done_batch
    }
    feed.update({k: v for k, v in zip(state_placeholders, states_batch)})
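Example 1 only shows the call sites of the buffer: add() stores a transition with an explicit priority, and sample(BATCH_SIZE, prioritized=True) later returns a batch plus the sampled indexes. The buffer class itself is not part of the excerpt, so the following is only a rough, hypothetical sketch matching those calls; the class name, capacity handling, and per-sample state layout are assumptions (in the real project the state batch appears to be grouped per state component, since it is zipped with state_placeholders).

# Hypothetical prioritized replay buffer matching the calls above -- a sketch only.
# add() does not receive next_state, so it is reconstructed from the following entry.
import numpy as np

class PrioritizedReplayBuffer:
    def __init__(self, capacity=100000):
        self.capacity = capacity
        self.items = []        # list of (state, reward, action, done) tuples
        self.priorities = []   # one priority value per stored transition

    def add(self, state, reward, action, done, priority=1.0):
        if len(self.items) >= self.capacity:
            self.items.pop(0)
            self.priorities.pop(0)
        self.items.append((state, reward, action, done))
        self.priorities.append(float(priority))

    def sample(self, batch_size, prioritized=False):
        # never sample the newest entry: its successor (the next state) is not stored yet
        valid = len(self.items) - 1
        if prioritized:
            p = np.asarray(self.priorities[:valid])
            indexes = np.random.choice(valid, batch_size, p=p / p.sum())
        else:
            indexes = np.random.choice(valid, batch_size)
        states      = [self.items[i][0] for i in indexes]
        rewards     = [self.items[i][1] for i in indexes]
        actions     = [self.items[i][2] for i in indexes]
        dones       = [self.items[i][3] for i in indexes]
        next_states = [self.items[i + 1][0] for i in indexes]
        return states, actions, rewards, next_states, dones, indexes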
Example 2: __init__
# Required import: from memory import Memory [as alias]
# Or: from memory.Memory import add [as alias]
class DQN:

    def __init__(self, env, params):
        self.env = env
        params.actions = env.actions()
        self.num_actions = env.actions()
        self.episodes = params.episodes
        self.steps = params.steps
        self.train_steps = params.train_steps
        self.update_freq = params.update_freq
        self.save_weights = params.save_weights
        self.history_length = params.history_length
        self.discount = params.discount
        self.eps = params.init_eps
        self.eps_delta = (params.init_eps - params.final_eps) / params.final_eps_frame
        self.replay_start_size = params.replay_start_size
        self.eps_endt = params.final_eps_frame
        self.random_starts = params.random_starts
        self.batch_size = params.batch_size
        self.ckpt_file = params.ckpt_dir + '/' + params.game

        self.global_step = tf.Variable(0, trainable=False)
        if params.lr_anneal:
            self.lr = tf.train.exponential_decay(params.lr, self.global_step, params.lr_anneal, 0.96, staircase=True)
        else:
            self.lr = params.lr

        self.buffer = Buffer(params)
        self.memory = Memory(params.size, self.batch_size)

        with tf.variable_scope("train") as self.train_scope:
            self.train_net = ConvNet(params, trainable=True)
        with tf.variable_scope("target") as self.target_scope:
            self.target_net = ConvNet(params, trainable=False)

        self.optimizer = tf.train.RMSPropOptimizer(self.lr, params.decay_rate, 0.0, self.eps)

        self.actions = tf.placeholder(tf.float32, [None, self.num_actions])
        self.q_target = tf.placeholder(tf.float32, [None])
        self.q_train = tf.reduce_max(tf.mul(self.train_net.y, self.actions), reduction_indices=1)
        self.diff = tf.sub(self.q_target, self.q_train)

        half = tf.constant(0.5)
        if params.clip_delta > 0:
            abs_diff = tf.abs(self.diff)
            clipped_diff = tf.clip_by_value(abs_diff, 0, 1)
            linear_part = abs_diff - clipped_diff
            quadratic_part = tf.square(clipped_diff)
            self.diff_square = tf.mul(half, tf.add(quadratic_part, linear_part))
        else:
            self.diff_square = tf.mul(half, tf.square(self.diff))

        if params.accumulator == 'sum':
            self.loss = tf.reduce_sum(self.diff_square)
        else:
            self.loss = tf.reduce_mean(self.diff_square)

        # backprop with RMS loss
        self.task = self.optimizer.minimize(self.loss, global_step=self.global_step)

    def randomRestart(self):
        self.env.restart()
        for _ in range(self.random_starts):
            action = rand.randrange(self.num_actions)
            reward = self.env.act(action)
            state = self.env.getScreen()
            terminal = self.env.isTerminal()
            self.buffer.add(state)
            if terminal:
                self.env.restart()

    def trainEps(self, train_step):
        if train_step < self.eps_endt:
            return self.eps - train_step * self.eps_delta
        else:
            return self.eps_endt

    def observe(self, exploration_rate):
        if rand.random() < exploration_rate:
            a = rand.randrange(self.num_actions)
        else:
            x = self.buffer.getInput()
            action_values = self.train_net.y.eval(feed_dict={self.train_net.x: x})
            a = np.argmax(action_values)

        state = self.buffer.getState()
        action = np.zeros(self.num_actions)
        action[a] = 1.0

        reward = self.env.act(a)
        screen = self.env.getScreen()
        self.buffer.add(screen)
        next_state = self.buffer.getState()
        terminal = self.env.isTerminal()

        self.memory.add(state, action, reward, next_state, terminal)
        return state, action, reward, next_state, terminal

    def doMinibatch(self, sess, successes, failures):
# ......... part of the code omitted here .........
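Example 2 constructs the memory as Memory(params.size, self.batch_size) and stores complete (state, action, reward, next_state, terminal) transitions inside observe(). The Memory class itself is not shown in the excerpt; a minimal buffer compatible with that interface might look roughly like the sketch below (the sample() method and its return format are assumptions).

# Hypothetical transition memory matching Example 2's constructor and add() call -- a sketch only.
import random
from collections import deque

class Memory:
    def __init__(self, size, batch_size):
        self.batch_size = batch_size
        self.transitions = deque(maxlen=size)  # oldest transitions are evicted first

    def add(self, state, action, reward, next_state, terminal):
        # store one complete transition tuple
        self.transitions.append((state, action, reward, next_state, terminal))

    def sample(self):
        # uniform random minibatch, e.g. for a doMinibatch()-style training step
        batch = random.sample(self.transitions, self.batch_size)
        states, actions, rewards, next_states, terminals = zip(*batch)
        return states, actions, rewards, next_states, terminals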
Example 3: __init__
# Required import: from memory import Memory [as alias]
# Or: from memory.Memory import add [as alias]
class Agent:

    def __init__(self, env, model, epsilon=.9, min_epsilon=.1, epsilon_decay=1e-3):
        self.env = env
        self.model = model
        self.epsilon = epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay = epsilon_decay
        self.episode = 0
        self.positiveMemory = Memory(model=self.model, episode_max_size=20)
        self.negativeMemory = Memory(model=self.model, episode_max_size=10)

    def play(self):
        terminal = False
        observation = self.env.reset()
        X = np.zeros((2,) + observation.shape)
        X[0] = observation
        X[1] = observation
        total_reward = 0
        while terminal == False and total_reward < 200:
            y = self.model.predict(X)
            action = np.argmax(y)
            observation, reward, terminal, info = self.env.executeAction(action)
            total_reward += reward
            X[0] = X[1]
            X[1] = observation
        return total_reward

    def learn(self, overfit=False, games=1, warmup=0, skip_frames=4):
        self.episode += 1.
        epsilon = max(self.min_epsilon, self.epsilon - self.episode * self.epsilon_decay)
        total_reward = 0
        qs = []
        predictions = None
        if warmup > 0:
            print "Adding %d warmup games"%(warmup)
            games += warmup
        for game in range(1, games + 1):
            print "Game %d/%d..."%(game, games)
            terminal = False
            observation = self.env.reset()
            framebuffer = np.zeros((skip_frames,) + observation.shape)
            framebuffer[-1] = observation
            frame = 0
            action = np.random.randint(0, 2)
            episode = []
            while terminal == False:
                frame += 1
                if frame%skip_frames != 0:
                    observation, reward, terminal, info = self.env.executeAction(action)
                if frame%skip_frames == 0 or reward != 0 or terminal:
                    X = framebuffer.copy()
                    y = self.model.predict(X)
                    qs.append(max(y))
                    if predictions is None:
                        predictions = np.zeros_like(y)
                    predictions[np.argmax(y)] += 1
                    if frame%skip_frames == 0:
                        if np.random.rand() <= epsilon:
                            action = np.random.randint(0, len(y))
                        else:
                            action = np.argmax(y)
                        observation, reward, terminal, info = self.env.executeAction(action)
                    total_reward += reward
                    y[action] = 1. # encourage current action, for now
                    episode.append((X, y, action, reward, terminal))
                    if reward == 1:
                        self.positiveMemory.add(episode, positive=True)
                        episode = []
                    if reward == -1:
                        self.negativeMemory.add(episode, positive=False)
                        episode = []
                framebuffer[0:skip_frames-1] = framebuffer[1:]
                framebuffer[-1] = observation
        print "Score %.1f"%(total_reward / games)
        X_pos, y_pos = self.positiveMemory.sample(nbr_positive=(games-warmup)*25)
        X_neg, y_neg = self.negativeMemory.sample(nbr_negative=(games-warmup)*100)
        if not X_pos is None:
            print "Sample %d positive and %d negative memories"%(len(y_pos), len(y_neg))
            X_t = np.concatenate((X_pos, X_neg))
            y_t = np.concatenate((y_pos, y_neg))
        else:
            print "Sample %d negative memories"%(len(y_neg))
# ......... part of the code omitted here .........
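Example 3 uses a different flavour of Memory: add(episode, positive=...) receives a whole list of (X, y, action, reward, terminal) tuples, and sample(nbr_positive=...) / sample(nbr_negative=...) later returns training arrays X, y, or None when nothing is stored. Purely as an illustration of that interface, a hypothetical episode-level memory could be sketched as follows (the merely stored model argument and the uniform sampling are assumptions; the real class may use the model to recompute targets).

# Hypothetical episode-level memory matching Example 3's calls -- a sketch only.
import numpy as np
from collections import deque

class Memory:
    def __init__(self, model, episode_max_size):
        self.model = model                             # stored but unused in this sketch
        self.episodes = deque(maxlen=episode_max_size)  # keep only the newest episodes

    def add(self, episode, positive=True):
        # episode is a list of (X, y, action, reward, terminal) tuples
        if episode:
            self.episodes.append(list(episode))

    def sample(self, nbr_positive=None, nbr_negative=None):
        n = nbr_positive if nbr_positive is not None else nbr_negative
        steps = [step for ep in self.episodes for step in ep]
        if not steps or n is None or n <= 0:
            return None, None
        # draw up to n stored steps uniformly at random and stack them into arrays
        picks = [steps[i] for i in np.random.randint(0, len(steps), size=min(n, len(steps)))]
        X = np.stack([p[0] for p in picks])
        y = np.stack([p[1] for p in picks])
        return X, y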