This article collects typical usage examples of the Python method memory.Memory.sample. If you are wondering how to use Memory.sample in Python, or are looking for concrete examples of Memory.sample in practice, the hand-picked code examples below may help. You can also explore further usage examples of the containing class, memory.Memory.
The following presents 5 code examples of the Memory.sample method, sorted by popularity by default.
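Before the examples, here is a minimal, illustrative sketch of what a replay-buffer style Memory with a sample method often looks like. It is an assumption for orientation only: each example below uses its own project-specific memory.Memory with a different constructor and extra features (extend, prioritized sampling, positive/negative episode memories).

# Illustrative sketch only; not the memory.Memory used by the examples below.
import random
from collections import deque


class Memory:
    def __init__(self, maxsize=50000):
        self.buffer = deque(maxlen=maxsize)

    def store(self, exp):
        # exp is typically a tuple like (obs, action, reward, obs_next, done)
        self.buffer.append(exp)

    def sample(self, batch_size):
        # Uniformly sample up to batch_size stored experiences
        return random.sample(list(self.buffer), min(batch_size, len(self.buffer)))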
Example 1: __init__
# Required import: from memory import Memory [as alias]
# Or alternatively: from memory.Memory import sample [as alias]
import dynet as dy
import numpy as np

from memory import Memory
# Note: MLP is a project-local network wrapper; its import is not shown in this excerpt.


class DDPG:
    def __init__(self, obs_dim, action_dim, hiddens_actor, hiddens_critic, layer_norm=False, memory_size=50000):
        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.noise_stddev = 1.
        self.noise_stddev_decrease = 5e-4
        self.noise_stddev_lower = 5e-2
        actor_activations = [dy.tanh for _ in range(len(hiddens_actor))] + [dy.tanh]
        critic_activations = [dy.tanh for _ in range(len(hiddens_critic))] + [None]
        self.actor = MLP(inpt_shape=(obs_dim,), hiddens=hiddens_actor + [action_dim], activation=actor_activations,
                         layer_norm=layer_norm)
        self.critic = MLP(inpt_shape=(obs_dim + action_dim,), hiddens=hiddens_critic + [1],
                          activation=critic_activations, layer_norm=layer_norm)
        self.actor_target = MLP(inpt_shape=(obs_dim,), hiddens=hiddens_actor + [action_dim],
                                activation=actor_activations, layer_norm=layer_norm)
        self.critic_target = MLP(inpt_shape=(obs_dim + action_dim,), hiddens=hiddens_critic + [1],
                                 activation=critic_activations, layer_norm=layer_norm)
        self.actor_target.update(self.actor, soft=False)
        self.critic_target.update(self.critic, soft=False)
        self.trainer_actor = dy.AdamTrainer(self.actor.pc)
        self.trainer_critic = dy.AdamTrainer(self.critic.pc)
        self.trainer_actor.set_learning_rate(1e-4)
        self.trainer_critic.set_learning_rate(1e-3)
        self.memory = Memory(memory_size)

    def act(self, obs):
        dy.renew_cg()
        action = self.actor(obs).npvalue()
        if self.noise_stddev > 0:
            noise = np.random.randn(self.action_dim) * self.noise_stddev
            action += noise
        return np.clip(action, -1, 1)

    def store(self, exp):
        self.memory.store(exp)

    def learn(self, batch_size):
        exps = self.memory.sample(batch_size)
        obss, actions, rewards, obs_nexts, dones = self._process(exps)
        # Update critic
        dy.renew_cg()
        target_actions = self.actor_target(obs_nexts, batched=True)
        target_values = self.critic_target(dy.concatenate([dy.inputTensor(obs_nexts, batched=True), target_actions]),
                                           batched=True)
        target_values = rewards + 0.99 * target_values.npvalue() * (1 - dones)
        dy.renew_cg()
        values = self.critic(np.concatenate([obss, actions]), batched=True)
        loss = dy.mean_batches((values - dy.inputTensor(target_values, batched=True)) ** 2)
        loss_value_critic = loss.npvalue()
        loss.backward()
        self.trainer_critic.update()
        # Update actor
        dy.renew_cg()
        actions = self.actor(obss, batched=True)
        obs_and_actions = dy.concatenate([dy.inputTensor(obss, batched=True), actions])
        loss = -dy.mean_batches(self.critic(obs_and_actions, batched=True))
        loss_value_actor = loss.npvalue()
        loss.backward()
        self.trainer_actor.update()
        # Decay exploration noise until it reaches its lower bound
        self.noise_stddev = (self.noise_stddev - self.noise_stddev_decrease
                             if self.noise_stddev > self.noise_stddev_lower
                             else self.noise_stddev_lower)
        self.actor_target.update(self.actor, soft=True)
        self.critic_target.update(self.critic, soft=True)
        return loss_value_actor + loss_value_critic

    # data in memory: [memory_size, exp], exp: [obs, action, reward, obs_next, done]
    # output: [obss, actions, rewards, obs_nexts, dones], 'X's: [x, batch_size]
    @staticmethod
    def _process(exps):
        n = len(exps)
        ret = []
        for i in range(5):
            ret.append([])
            for j in range(n):
                ret[i].append(exps[j][i])
        ret = [np.transpose(arr) for arr in ret]
        return ret

    @property
    def epsilon(self):
        return self.noise_stddev
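As a usage sketch (not part of the original source): a gym-style loop that drives the class above might look roughly like this. The environment id, episode count, warm-up threshold and batch size are all assumptions.

# Hypothetical driver loop for the DDPG class above; constants and env id are placeholders.
import gym

env = gym.make('Pendulum-v1')   # any continuous-control task; the agent clips actions to [-1, 1]
agent = DDPG(obs_dim=env.observation_space.shape[0],
             action_dim=env.action_space.shape[0],
             hiddens_actor=[64, 64], hiddens_critic=[64, 64])

steps = 0
for episode in range(200):                      # episode count is arbitrary
    obs = env.reset()                           # classic 4-tuple gym API, as in the examples on this page
    done = False
    while not done:
        action = agent.act(obs)
        obs_next, reward, done, _ = env.step(action)
        # exp layout matches DDPG._process: [obs, action, reward, obs_next, done]
        agent.store([obs, action, reward, obs_next, done])
        steps += 1
        if steps > 1000:                        # crude warm-up before sampling the memory
            agent.learn(batch_size=64)
        obs = obs_next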
Example 2: dqn
# Required import: from memory import Memory [as alias]
# Or alternatively: from memory.Memory import sample [as alias]
import os
import pickle
import time

from memory import Memory
# Note: SummaryWriter (e.g. from tensorboardX) and ResultsBuffer come from project code
# that is not shown in this excerpt.


def dqn(env,
        model,
        base_path,
        batch_size=32,
        epsilon=0.01,
        save_model_every=1000,
        update_target_every=1000,
        learning_starts=200,
        memory_size=500000,
        num_iterations=6250000):
    events_path = os.path.join(base_path, 'events')
    models_path = os.path.join(base_path, 'models')
    if not os.path.exists(events_path):
        os.makedirs(events_path)
    if not os.path.exists(models_path):
        os.makedirs(models_path)

    model.load_model(models_path)
    summary_writer = SummaryWriter(events_path)

    rewards_history = []
    pkl_path = '{}/rewards.pkl'.format(base_path)
    if os.path.exists(pkl_path):
        with open(pkl_path, 'rb') as f:
            rewards_history = pickle.load(f)

    memory_buffer = Memory(memory_size)
    results_buffer = ResultsBuffer(rewards_history)
    global_step = model.get_global_step()

    try:
        # Fill the replay memory for `learning_starts` steps before training begins
        states = env.reset()
        for i in range(learning_starts):
            actions = model.get_action(states, epsilon)
            next_states, rewards, dones, info = env.step(actions)
            memory_buffer.extend(
                zip(states, actions, rewards, next_states, dones))
            states = next_states

        states = env.reset()
        start = time.time()
        for i in range(num_iterations):
            actions = model.get_action(states, epsilon)
            next_states, rewards, dones, info = env.step(actions)
            results_buffer.update_infos(info, global_step)
            memory_buffer.extend(
                zip(states, actions, rewards, next_states, dones))

            global_step, summaries = model.update(
                *memory_buffer.sample(batch_size))
            results_buffer.update_summaries(summaries)

            if global_step % update_target_every == 0:
                model.update_target()

            if global_step % save_model_every == 0:
                t = time.time() - start
                model.save_model(models_path)
                print("Save model, global_step: {}, delta_time: {}.".format(
                    global_step, t))
                results_buffer.add_summary(summary_writer, global_step, t)
                start = time.time()

            states = next_states

    except Exception as e:
        raise e

    finally:
        model.save_model(models_path)
        with open(pkl_path, 'wb') as f:
            pickle.dump(results_buffer.rewards_history, f)
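Note that this example unpacks the sampled batch directly into model.update(*memory_buffer.sample(batch_size)), so Memory.sample here is expected to return the batch column-wise (states, actions, rewards, next states, dones). The project's actual implementation is not shown; a hedged sketch of a sample function with that return shape is:

# Assumed return shape only; the project's Memory.sample is not shown in this excerpt.
import random
import numpy as np


def sample_columnwise(buffer, batch_size):
    # buffer holds (state, action, reward, next_state, done) tuples, as appended by
    # memory_buffer.extend(zip(states, actions, rewards, next_states, dones)) above
    batch = random.sample(list(buffer), batch_size)
    states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
    return states, actions, rewards, next_states, dones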
Example 3: DQNAgent
# Required import: from memory import Memory [as alias]
# Or alternatively: from memory.Memory import sample [as alias]
import random

import numpy as np
# Assuming the standalone Keras 2.x API (the excerpt uses Adam(lr=...))
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam

from memory import Memory


class DQNAgent(object):
    """ This agent uses DQN for making action decisions with 1-epsilon probability """

    def __init__(self, name, state_dim, action_dim, epsdecay=0.995,
                 buffersize=500000, samplesize=32, minsamples=10000,
                 gamma=0.99, state_norm_file='../params/state-stats.pkl', update_target_freq=600,
                 LEARNING_RATE=1e-4):
        """ Accepts a unique agent name, the number of variables in the state,
        the number of actions and the DQN parameters, then initializes the agent """
        # Unique name for the agent
        self.name = name
        # Number of state and action dimensions
        self.state_dim = state_dim
        self.action_dim = action_dim
        # Create buffer for experience replay
        self.memory = Memory(maxsize=buffersize)
        # Set initial epsilon to 1.0
        self.eps = 1.0
        # Minimum number of samples in the buffer to start learning
        self.minsamples = minsamples
        # Number of random samples to be drawn from the buffer for experience replay
        self.samplesize = samplesize
        # Decay factor for epsilon for each episode
        self.epsdecay = epsdecay
        # Discount factor for Q learning
        self.gamma = gamma
        self.LEARNING_RATE = LEARNING_RATE
        # Create the base predictor neural network
        # and, if required, the target neural network too.
        self._create_nns_()
        # Load the state variable normalizers from pickle file if exists
        self.update_target_freq = update_target_freq
        # Boolean flag indicating whether the agent has started learning or not
        self.started_learning = False
        # Keeps a count of the number of steps
        self.steps = 0

    def _preprocess_state_(self, instate):
        self.mean = np.zeros(self.state_dim)
        self.std = np.ones(self.state_dim)
        # Normalize the raw state vector by the mean and std normalizers
        return (instate - self.mean) / self.std

    def _create_nns_(self):
        # Create the predictor DQN and its target copy
        self.model = self._create_model_()
        self.target_model = self._create_model_()

    def _create_model_(self):
        model = Sequential()
        # First hidden layer; input_dim is the size of the state vector (batch size is unaffected)
        model.add(Dense(40, input_dim=self.state_dim))
        model.add(Activation('relu'))
        # The model now maps inputs of shape (*, state_dim) to outputs of shape (*, 40)
        # Second hidden layer
        model.add(Dense(32))
        model.add(Activation('relu'))
        # Third hidden layer
        model.add(Dense(16))
        model.add(Activation('relu'))
        # Output layer: one linear Q-value per action
        model.add(Dense(self.action_dim))
        model.add(Activation('linear'))
        # Adam optimizer with MSE loss
        adam = Adam(lr=self.LEARNING_RATE)
        model.compile(loss='mse', optimizer=adam)
        return model

    def _update_target_model_(self):
        # Copy weights from the predictor network to the target network
        self.target_model.set_weights(self.model.get_weights())

    def decide(self, curstate, testmode=False):
        """ Accepts the current state as input and returns the action to take """
        # Do not use the eps-greedy policy for test trials
        if not testmode:
            if (random.random() <= self.eps) or (not self.started_learning):
                return random.randint(0, self.action_dim - 1)
        # Convert the state to a matrix with one row
        s = np.array([self._preprocess_state_(curstate)])
        # Return the action with the maximum predicted Q value
        return np.argmax(self.model.predict(s)[0])

    def observe(self, prevstate, action, reward, curstate, done):
        """ Accepts an observation (s, a, r, s', done) as input and stores it in the
        memory buffer for experience replay """
        # Normalize both states
        prevstate_normalized = self._preprocess_state_(prevstate)
#......... the rest of the code is omitted here .........
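The excerpt is truncated before the experience-replay update. Purely as a hypothetical sketch (the method name, the Memory.sample return format and the target computation are all assumptions, not the omitted original code), such a step for an agent shaped like DQNAgent usually looks roughly like this:

# Hypothetical replay step, assuming it were added as a DQNAgent method and that
# self.memory.sample returns a list of (s, a, r, s2, done) tuples.
def _replay_(self):
    batch = self.memory.sample(self.samplesize)
    states = np.array([b[0] for b in batch])
    next_states = np.array([b[3] for b in batch])
    q = self.model.predict(states)
    q_next = self.target_model.predict(next_states)
    for i, (s, a, r, s2, done) in enumerate(batch):
        # Bellman target: reward plus discounted max target-Q of the next state
        q[i][a] = r if done else r + self.gamma * np.max(q_next[i])
    self.model.fit(states, q, epochs=1, verbose=0)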
Example 4: zip
# Required import: from memory import Memory [as alias]
# Or alternatively: from memory.Memory import sample [as alias]
import numpy as np

# env, replay_buffer, sess, the placeholders and the train/error ops below come from
# earlier code that is omitted in this excerpt.

max_xvel = 20
max_yvel = 8
max_yawrate = 0.2
max_altitude = 15

# Rescale the [-1, 1] policy output to the environment's action ranges
action = np.clip(action, -1, 1) * np.array([max_xvel, max_yvel, max_yawrate, max_altitude / 4.0]) - np.array([0, 0, 0, max_altitude])
env_next_state, env_reward, env_done, env_info = env.step(action)
replay_buffer.add(env_state, env_reward, action, env_done, priority=300)
env_state = env_next_state
total_reward += env_reward

if training:
    # Draw a prioritized batch and build the feed dict for the TensorFlow update
    states_batch, action_batch, reward_batch, next_states_batch, done_batch, indexes = replay_buffer.sample(BATCH_SIZE, prioritized=True)
    feed = {
        action_placeholder: action_batch,
        reward_placeholder: reward_batch,
        done_placeholder: done_batch
    }
    feed.update({k: v for k, v in zip(state_placeholders, states_batch)})
    feed.update({k: v for k, v in zip(next_state_placeholders, next_states_batch)})
    _, _, errors, critic_error = sess.run([train_critic, train_actor, q_error, q_error_batch], feed_dict=feed)
    # Feed the new TD errors back into the prioritized replay buffer
    replay_buffer.update(indexes, errors)
    print('q:{:5f} reward:{:5f} trainerror:{:5f}'.format(q[0], env_reward, critic_error))
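This fragment relies on a prioritized replay buffer: sample(BATCH_SIZE, prioritized=True) also returns the indexes of the drawn transitions, and update(indexes, errors) re-weights them with the new TD errors. The project's buffer is not shown (it stores states per field and handles next states internally); purely to illustrate the sample/update priority cycle, a tiny assumed version could be:

# Illustrative only; not the replay_buffer used in the fragment above.
import numpy as np


class TinyPrioritizedBuffer:
    def __init__(self):
        self.items, self.priorities = [], []

    def add(self, item, priority=1.0):
        self.items.append(item)
        self.priorities.append(float(priority))

    def sample(self, batch_size):
        p = np.asarray(self.priorities)
        p = p / p.sum()
        indexes = np.random.choice(len(self.items), size=batch_size, p=p)
        return [self.items[i] for i in indexes], indexes

    def update(self, indexes, errors):
        # Larger TD error -> higher priority on the next sample() call
        for i, e in zip(indexes, errors):
            self.priorities[i] = abs(float(e)) + 1e-6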
Example 5: __init__
# Required import: from memory import Memory [as alias]
# Or alternatively: from memory.Memory import sample [as alias]
import numpy as np

from memory import Memory


class Agent:
    def __init__(self, env, model, epsilon=.9, min_epsilon=.1, epsilon_decay=1e-3):
        self.env = env
        self.model = model
        self.epsilon = epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay = epsilon_decay
        self.episode = 0
        self.positiveMemory = Memory(model=self.model, episode_max_size=20)
        self.negativeMemory = Memory(model=self.model, episode_max_size=10)

    def play(self):
        terminal = False
        observation = self.env.reset()
        X = np.zeros((2,) + observation.shape)
        X[0] = observation
        X[1] = observation
        total_reward = 0
        while terminal == False and total_reward < 200:
            y = self.model.predict(X)
            action = np.argmax(y)
            observation, reward, terminal, info = self.env.executeAction(action)
            total_reward += reward
            X[0] = X[1]
            X[1] = observation
        return total_reward

    def learn(self, overfit=False, games=1, warmup=0, skip_frames=4):
        self.episode += 1.
        epsilon = max(self.min_epsilon, self.epsilon - self.episode * self.epsilon_decay)
        total_reward = 0
        qs = []
        predictions = None
        if warmup > 0:
            print("Adding %d warmup games" % (warmup))
            games += warmup
        for game in range(1, games + 1):
            print("Game %d/%d..." % (game, games))
            terminal = False
            observation = self.env.reset()
            framebuffer = np.zeros((skip_frames,) + observation.shape)
            framebuffer[-1] = observation
            frame = 0
            action = np.random.randint(0, 2)
            episode = []
            while terminal == False:
                frame += 1
                if frame % skip_frames != 0:
                    observation, reward, terminal, info = self.env.executeAction(action)
                if frame % skip_frames == 0 or reward != 0 or terminal:
                    X = framebuffer.copy()
                    y = self.model.predict(X)
                    qs.append(max(y))
                    if predictions is None:
                        predictions = np.zeros_like(y)
                    predictions[np.argmax(y)] += 1
                    if frame % skip_frames == 0:
                        if np.random.rand() <= epsilon:
                            action = np.random.randint(0, len(y))
                        else:
                            action = np.argmax(y)
                        observation, reward, terminal, info = self.env.executeAction(action)
                    total_reward += reward
                    y[action] = 1.  # encourage current action, for now
                    episode.append((X, y, action, reward, terminal))
                    if reward == 1:
                        self.positiveMemory.add(episode, positive=True)
                        episode = []
                    if reward == -1:
                        self.negativeMemory.add(episode, positive=False)
                        episode = []
                framebuffer[0:skip_frames - 1] = framebuffer[1:]
                framebuffer[-1] = observation
        print("Score %.1f" % (total_reward / games))
        X_pos, y_pos = self.positiveMemory.sample(nbr_positive=(games - warmup) * 25)
        X_neg, y_neg = self.negativeMemory.sample(nbr_negative=(games - warmup) * 100)
        if not X_pos is None:
            print("Sample %d positive and %d negative memories" % (len(y_pos), len(y_neg)))
            X_t = np.concatenate((X_pos, X_neg))
            y_t = np.concatenate((y_pos, y_neg))
        else:
            print("Sample %d negative memories" % (len(y_neg)))
#......... the rest of the code is omitted here .........