

Python Memory.sample Method Code Examples

This article collects typical usage examples of the Python memory.Memory.sample method. If you are looking for how Memory.sample is used in practice, the hand-picked code examples below may help. You can also explore further usage examples of the containing class, memory.Memory.


Five code examples of the Memory.sample method are shown below, sorted by popularity by default.
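
Before the examples, here is a minimal, self-contained sketch of the kind of replay-buffer Memory class these projects assume, with a store/sample interface. It is written for this article as an illustration and is not the implementation used by any of the projects below.

import random
from collections import deque

class Memory:
    """Minimal replay buffer: store transitions, sample a random mini-batch."""

    def __init__(self, maxsize):
        self.buffer = deque(maxlen=maxsize)

    def store(self, exp):
        # exp is typically a tuple (obs, action, reward, next_obs, done)
        self.buffer.append(exp)

    def __len__(self):
        return len(self.buffer)

    def sample(self, batch_size):
        # Uniform sampling without replacement from the stored transitions.
        return random.sample(list(self.buffer), min(batch_size, len(self.buffer)))

# Usage sketch
memory = Memory(maxsize=1000)
for t in range(100):
    memory.store((t, 0, 1.0, t + 1, False))
print(len(memory.sample(32)))  # -> 32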

Example 1: __init__

# Required import: from memory import Memory [as alias]
# Or: from memory.Memory import sample [as alias]
# Also needed by this snippet (assumed): numpy as np, dynet as dy, and the
# project's MLP helper class.
class DDPG:
    def __init__(self, obs_dim, action_dim, hiddens_actor, hiddens_critic, layer_norm=False, memory_size=50000):
        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.noise_stddev = 1.
        self.noise_stddev_decrease = 5e-4
        self.noise_stddev_lower = 5e-2

        actor_activations = [dy.tanh for _ in range(len(hiddens_actor))] + [dy.tanh]
        critic_activations = [dy.tanh for _ in range(len(hiddens_critic))] + [None]
        self.actor = MLP(inpt_shape=(obs_dim,), hiddens=hiddens_actor + [action_dim], activation=actor_activations,
                         layer_norm=layer_norm)
        self.critic = MLP(inpt_shape=(obs_dim + action_dim,), hiddens=hiddens_critic + [1],
                          activation=critic_activations, layer_norm=layer_norm)
        self.actor_target = MLP(inpt_shape=(obs_dim,), hiddens=hiddens_actor + [action_dim],
                                activation=actor_activations, layer_norm=layer_norm)
        self.critic_target = MLP(inpt_shape=(obs_dim + action_dim,), hiddens=hiddens_critic + [1],
                                 activation=critic_activations, layer_norm=layer_norm)
        self.actor_target.update(self.actor, soft=False)
        self.critic_target.update(self.critic, soft=False)

        self.trainer_actor = dy.AdamTrainer(self.actor.pc)
        self.trainer_critic = dy.AdamTrainer(self.critic.pc)
        self.trainer_actor.set_learning_rate(1e-4)
        self.trainer_critic.set_learning_rate(1e-3)

        self.memory = Memory(memory_size)

    def act(self, obs):
        dy.renew_cg()
        action = self.actor(obs).npvalue()
        if self.noise_stddev > 0:
            noise = np.random.randn(self.action_dim) * self.noise_stddev
            action += noise
        return np.clip(action, -1, 1)

    def store(self, exp):
        self.memory.store(exp)

    def learn(self, batch_size):
        exps = self.memory.sample(batch_size)
        obss, actions, rewards, obs_nexts, dones = self._process(exps)

        # Update critic
        dy.renew_cg()
        target_actions = self.actor_target(obs_nexts, batched=True)
        target_values = self.critic_target(dy.concatenate([dy.inputTensor(obs_nexts, batched=True), target_actions]),
                                           batched=True)
        target_values = rewards + 0.99 * target_values.npvalue() * (1 - dones)

        dy.renew_cg()
        values = self.critic(np.concatenate([obss, actions]), batched=True)
        loss = dy.mean_batches((values - dy.inputTensor(target_values, batched=True)) ** 2)
        loss_value_critic = loss.npvalue()
        loss.backward()
        self.trainer_critic.update()

        # update actor
        dy.renew_cg()
        actions = self.actor(obss, batched=True)
        obs_and_actions = dy.concatenate([dy.inputTensor(obss, batched=True), actions])
        loss = -dy.mean_batches(self.critic(obs_and_actions, batched=True))
        loss_value_actor = loss.npvalue()
        loss.backward()
        self.trainer_actor.update()

        if self.noise_stddev > self.noise_stddev_lower:
            self.noise_stddev -= self.noise_stddev_decrease
        else:
            self.noise_stddev = self.noise_stddev_lower

        self.actor_target.update(self.actor, soft=True)
        self.critic_target.update(self.critic, soft=True)

        return loss_value_actor + loss_value_critic

    # data in memory: [memory_size, exp], exp: [obs, action, reward, obs_next, done]
    # output: [obss, actions, rewards, obs_nexts, dones], 'X's: [x, batch_size]
    @staticmethod
    def _process(exps):
        n = len(exps)
        ret = []
        for i in range(5):
            ret.append([])
            for j in range(n):
                ret[i].append(exps[j][i])

        ret = [np.transpose(arr) for arr in ret]
        return ret

    @property
    def epsilon(self):
        return self.noise_stddev
Developer: danielhers, Project: cnn, Lines: 94, Source: ddpg.py
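
For context, a training loop for this DDPG class might look like the sketch below. The Gym environment, environment id, hyper-parameters, and loop structure are illustrative assumptions made for this article, not code from the danielhers/cnn project; the classic (obs-returning) Gym reset/step API is assumed.

import gym
import numpy as np

# Assumed wiring: a continuous-control Gym environment plus the DDPG class
# from Example 1. Every number below is an illustrative choice.
env = gym.make("Pendulum-v1")
agent = DDPG(obs_dim=env.observation_space.shape[0],
             action_dim=env.action_space.shape[0],
             hiddens_actor=[64, 64],
             hiddens_critic=[64, 64])

batch_size = 64
total_steps = 0
for episode in range(200):
    obs = env.reset()                        # classic Gym API assumed
    done = False
    while not done:
        action = agent.act(obs)              # act() clips actions to [-1, 1]
        obs_next, reward, done, _ = env.step(action * env.action_space.high)
        # store() fills the internal Memory; learn() later calls Memory.sample()
        agent.store((obs, action, reward, obs_next, float(done)))
        obs = obs_next
        total_steps += 1
        if total_steps > batch_size:
            agent.learn(batch_size)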

Example 2: dqn

# Required import: from memory import Memory [as alias]
# Or: from memory.Memory import sample [as alias]
# Also needed by this snippet (assumed): os, time, pickle, and the project's
# SummaryWriter and ResultsBuffer helpers.
def dqn(env,
        model,
        base_path,
        batch_size=32,
        epsilon=0.01,
        save_model_every=1000,
        update_target_every=1000,
        learning_starts=200,
        memory_size=500000,
        num_iterations=6250000):
    events_path = os.path.join(base_path, 'events')
    models_path = os.path.join(base_path, 'models')
    if not os.path.exists(events_path):
        os.makedirs(events_path)
    if not os.path.exists(models_path):
        os.makedirs(models_path)

    model.load_model(models_path)
    summary_writer = SummaryWriter(events_path)
    rewards_history = []
    pkl_path = '{}/rewards.pkl'.format(base_path)
    if os.path.exists(pkl_path):
        with open(pkl_path, 'rb') as f:
            rewards_history = pickle.load(f)

    memory_buffer = Memory(memory_size)
    results_buffer = ResultsBuffer(rewards_history)
    global_step = model.get_global_step()

    try:
        states = env.reset()
        for i in range(learning_starts):
            actions = model.get_action(states, epsilon)
            next_states, rewards, dones, info = env.step(actions)

            memory_buffer.extend(
                zip(states, actions, rewards, next_states, dones))
            states = next_states

        states = env.reset()
        start = time.time()
        for i in range(num_iterations):
            actions = model.get_action(states, epsilon)
            next_states, rewards, dones, info = env.step(actions)

            results_buffer.update_infos(info, global_step)
            memory_buffer.extend(
                zip(states, actions, rewards, next_states, dones))

            global_step, summaries = model.update(
                *memory_buffer.sample(batch_size))
            results_buffer.update_summaries(summaries)

            if global_step % update_target_every == 0:
                model.update_target()

            if global_step % save_model_every == 0:
                t = time.time() - start
                model.save_model(models_path)
                print("Save model, global_step: {}, delta_time: {}.".format(
                    global_step, t))
                results_buffer.add_summary(summary_writer, global_step, t)
                start = time.time()

            states = next_states

    except Exception as e:
        raise e

    finally:
        model.save_model(models_path)
        with open(pkl_path, 'wb') as f:
            pickle.dump(results_buffer.rewards_history, f)
Developer: liber145, Project: oh-my-q-learning, Lines: 75, Source: dqn.py
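
The call model.update(*memory_buffer.sample(batch_size)) above assumes that the buffer's sample returns the transition fields grouped column-wise, one array per field, so they can be unpacked straight into the update. The following self-contained illustration of that contract uses stand-in data and is not this project's Memory class.

import random
from collections import deque

import numpy as np

buffer = deque(maxlen=1000)
# Stand-in transitions: (state, action, reward, next_state, done)
for _ in range(200):
    buffer.append((np.random.rand(4), random.randrange(3), 0.0,
                   np.random.rand(4), False))

def sample(batch_size):
    batch = random.sample(list(buffer), batch_size)
    # Regroup row-wise transitions into five column-wise arrays, mirroring
    # what model.update(*memory_buffer.sample(batch_size)) expects.
    states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
    return states, actions, rewards, next_states, dones

states_b, actions_b, rewards_b, next_states_b, dones_b = sample(32)
print(states_b.shape, actions_b.shape)  # (32, 4) (32,)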

Example 3: DQNAgent

# Required import: from memory import Memory [as alias]
# Or: from memory.Memory import sample [as alias]
# Also needed by this snippet (assumed): random, numpy as np, and
# keras (Sequential, Dense, Activation, Adam).
class DQNAgent(object):
    """ This agent uses DQN for making action decisions with 1-epsilon probability """

    def __init__(self, name, state_dim, action_dim, epsdecay=0.995,
                 buffersize=500000, samplesize=32, minsamples=10000,
                 gamma=0.99, state_norm_file='../params/state-stats.pkl', update_target_freq=600,
                 LEARNING_RATE = 1e-4):
        """ Accepts a unique agent name, number of variables in the state,
            number of actions and parameters of DQN then initialize the agent"""

        # Unique name for the agent
        self.name       = name
        # no:of state and action dimensions
        self.state_dim  = state_dim
        self.action_dim = action_dim
        # Create buffer for experience replay
        self.memory     = Memory(maxsize=buffersize)


        # Set initial epsilon to 1.0
        self.eps        = 1.0
        # Minimum number of samples in the buffer to start learning
        self.minsamples = minsamples
        # Number of random samples to be drawn from the buffer for experience replay
        self.samplesize = samplesize
        # Decay factor for epsilon for each episode
        self.epsdecay   = epsdecay
        # Discount factor for Q learning
        self.gamma      = gamma
        self.LEARNING_RATE = LEARNING_RATE


        # Create the base predictor neural network
        # and if required the target neural network too.
        self._create_nns_()
        # Load the state variable normalizers from pickle file if exists
        self.update_target_freq = update_target_freq
        # Boolean flag indicating whether the agent started learning or not
        self.started_learning = False
        # Keeps a count of number of steps.
        self.steps = 0

    def _preprocess_state_(self, instate):
        # Identity normalizers here (zero mean, unit std); a full implementation
        # would load them from state_norm_file instead.
        self.mean = np.zeros(self.state_dim)
        self.std  = np.ones(self.state_dim)
        # Normalize the raw state vector by mean and std
        return (instate - self.mean) / self.std



    def _create_nns_(self):
        # Create predictor DQN
        self.model        = self._create_model_()
        self.target_model = self._create_model_()

    def _create_model_(self):
        model = Sequential()
        ## First hidden layer: takes the raw state vector (input_dim = state_dim) and has 40 units
        model.add(Dense(40, input_dim=self.state_dim))
        model.add(Activation('relu'))
        ## The model now maps inputs of shape (*, state_dim) to outputs of shape (*, 40),
        ## using the ReLU activation function

        # Second hidden layer
        model.add(Dense(32))
        model.add(Activation('relu'))

        # Third hidden layer
        model.add(Dense(16))
        model.add(Activation('relu'))

        # Output layer: one linear unit per action, giving that action's predicted Q value
        model.add(Dense(self.action_dim))
        model.add(Activation('linear'))

        ## Adam optimizer with MSE loss
        adam = Adam(lr=self.LEARNING_RATE)
        model.compile(loss='mse',optimizer=adam)
        return model

    def _update_target_model_(self):
        # Copy weights from predictor NN to target network.
        self.target_model.set_weights(self.model.get_weights())

    def decide(self, curstate, testmode=False):
        """ Accepts current state as input and returns action to take """
        # Do not do eps greedy policy for test trials
        if not testmode:
            if (random.random() <= self.eps) or (not self.started_learning):
                return random.randint(0, self.action_dim-1)
        # convert state to a matrix with one row
        s = np.array([self._preprocess_state_(curstate)])
        # Return the action with maximum predicted Q value.
        return np.argmax(self.model.predict(s)[0])

    def observe(self, prevstate, action, reward, curstate, done):
        """ Accepts an observation (s,a,r,s',done) as input, store them in memory buffer for
            experience replay """
        # Normalize both states
        prevstate_normalized = self._preprocess_state_(prevstate)
#......... rest of the code omitted .........
Developer: rding0731, Project: rding.github.io, Lines: 103, Source: agent.py
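
The excerpt is cut off before the experience-replay step. Purely as an illustration, and not the omitted code from the original agent, a typical replay step for such a Keras DQN would draw samplesize transitions with Memory.sample and fit the predictor network toward bootstrapped targets, roughly as follows (assuming Memory.sample(n) returns a list of the stored tuples):

import numpy as np

def _replay_(self):
    """Illustrative replay step; an assumption, not the original author's code."""
    # Memory.sample(n) is assumed to return n stored (s, a, r, s', done) tuples,
    # with the states already normalized by observe().
    batch = self.memory.sample(self.samplesize)
    states      = np.array([b[0] for b in batch])
    actions     = np.array([b[1] for b in batch])
    rewards     = np.array([b[2] for b in batch])
    next_states = np.array([b[3] for b in batch])
    dones       = np.array([b[4] for b in batch], dtype=float)

    # Bootstrapped Q-learning targets from the target network.
    q_current = self.model.predict(states)
    q_next    = self.target_model.predict(next_states)
    targets   = rewards + self.gamma * np.max(q_next, axis=1) * (1.0 - dones)
    q_current[np.arange(len(batch)), actions] = targets

    self.model.fit(states, q_current, epochs=1, verbose=0)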

Example 4: zip

# Required import: from memory import Memory [as alias]
# Or: from memory.Memory import sample [as alias]
# Note: this fragment starts mid-function and also assumes (not shown) numpy as np,
# a TensorFlow session `sess`, and the placeholders/ops defined earlier in the script.
        max_xvel = 20
        max_yvel = 8
        max_yawrate = 0.2
        max_altitude = 15
        action = np.clip(action, -1, 1) * np.array([max_xvel, max_yvel, max_yawrate, max_altitude / 4.0]) - np.array([0, 0, 0, max_altitude])

        env_next_state, env_reward, env_done, env_info = env.step(action)
        replay_buffer.add(env_state, env_reward, action, env_done, priority=300)

        env_state = env_next_state

        total_reward += env_reward

        if training:
            states_batch, action_batch, reward_batch, next_states_batch, done_batch, indexes = replay_buffer.sample(BATCH_SIZE, prioritized=True)

            feed = {
                action_placeholder: action_batch,
                reward_placeholder: reward_batch,
                done_placeholder: done_batch
            }

            feed.update({k: v for k, v in zip(state_placeholders, states_batch)})
            feed.update({k: v for k, v in zip(next_state_placeholders, next_states_batch)})

            _, _, errors, critic_error = sess.run([train_critic, train_actor, q_error, q_error_batch], feed_dict=feed)

            replay_buffer.update(indexes, errors)

            print 'q:{:5f} reward:{:5f} trainerror:{:5f}'.format(q[0], env_reward, critic_error)
Developer: superjax, Project: NNOA, Lines: 32, Source: ddpg.py
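
The fragment above relies on a prioritized replay buffer: add accepts a per-transition priority, sample(..., prioritized=True) additionally returns the sampled indexes, and update(indexes, errors) refreshes priorities from the TD errors. The toy sketch below mirrors that cycle for illustration only; it stores full (state, action, reward, next_state, done) tuples, whereas the buffer in the fragment stores (state, reward, action, done) and evidently derives next_state internally, and it is not the superjax/NNOA implementation.

import numpy as np

class PrioritizedMemory:
    """Toy prioritized buffer illustrating the add / sample / update cycle."""

    def __init__(self, maxsize):
        self.maxsize = maxsize
        self.data = []
        self.priorities = []

    def add(self, state, action, reward, next_state, done, priority=1.0):
        if len(self.data) >= self.maxsize:
            self.data.pop(0)
            self.priorities.pop(0)
        self.data.append((state, action, reward, next_state, done))
        self.priorities.append(float(priority))

    def sample(self, batch_size, prioritized=True):
        p = np.array(self.priorities)
        p = p / p.sum() if prioritized else None
        indexes = np.random.choice(len(self.data), size=batch_size,
                                   replace=False, p=p)
        columns = zip(*(self.data[i] for i in indexes))
        # One array per field, plus the indexes needed for priority updates.
        return (*[np.array(c) for c in columns], indexes)

    def update(self, indexes, errors):
        # Larger TD error -> higher chance of being sampled again.
        for i, e in zip(indexes, errors):
            self.priorities[i] = abs(float(e)) + 1e-6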

Example 5: __init__

# Required import: from memory import Memory [as alias]
# Or: from memory.Memory import sample [as alias]
# Also needed by this snippet (assumed): numpy as np.
class Agent:
	def __init__(self, env, model, epsilon=.9, min_epsilon=.1, epsilon_decay=1e-3):
		self.env = env
		self.model = model
		self.epsilon = epsilon
		self.min_epsilon = min_epsilon
		self.epsilon_decay = epsilon_decay
		self.episode = 0
		self.positiveMemory = Memory(model=self.model, episode_max_size=20)
		self.negativeMemory = Memory(model=self.model, episode_max_size=10)

	def play(self):
		terminal = False
		observation = self.env.reset()
		X = np.zeros((2,) + observation.shape)
		X[0] = observation
		X[1] = observation

		total_reward = 0
		while terminal == False and total_reward < 200:
			y = self.model.predict(X)
			action = np.argmax(y)

			observation, reward, terminal, info = self.env.executeAction(action)
			total_reward += reward

			X[0] = X[1]
			X[1] = observation

		return total_reward

	def learn(self, overfit=False, games=1, warmup=0, skip_frames=4):
		self.episode += 1.
		epsilon = max(self.min_epsilon, self.epsilon - self.episode * self.epsilon_decay)

		total_reward = 0
		qs = []
		predictions = None

		if warmup > 0:
			print "Adding %d warmup games"%(warmup)
			games += warmup

		for game in range(1, games + 1):
			print "Game %d/%d..."%(game, games)
			terminal = False
			observation = self.env.reset()
			framebuffer = np.zeros((skip_frames,) + observation.shape)
			framebuffer[-1] = observation
			frame = 0
			action = np.random.randint(0, 2)
			episode = []
			while terminal == False:
				frame += 1

				if frame%skip_frames != 0:
					observation, reward, terminal, info = self.env.executeAction(action)

				if frame%skip_frames == 0 or reward != 0 or terminal:
					X = framebuffer.copy()
					y = self.model.predict(X)
					qs.append(max(y))
					if predictions is None:
						predictions = np.zeros_like(y)
					predictions[np.argmax(y)] += 1

					if frame%skip_frames == 0:
						if np.random.rand() <= epsilon:
							action = np.random.randint(0, len(y))
						else:
							action = np.argmax(y)

						observation, reward, terminal, info = self.env.executeAction(action)

					total_reward += reward

					y[action] = 1. # encourage current action, for now
					episode.append((X, y, action, reward, terminal))

					if reward == 1:
						self.positiveMemory.add(episode, positive=True)
						episode = []
					if reward == -1:
						self.negativeMemory.add(episode, positive=False)
						episode = []

				framebuffer[0:skip_frames-1] = framebuffer[1:]
				framebuffer[-1] = observation

		print "Score %.1f"%(total_reward / games)

		X_pos, y_pos = self.positiveMemory.sample(nbr_positive=(games-warmup)*25)
		X_neg, y_neg = self.negativeMemory.sample(nbr_negative=(games-warmup)*100)

		if not X_pos is None:
			print "Sample %d positive and %d negative memories"%(len(y_pos), len(y_neg))
			X_t = np.concatenate((X_pos, X_neg))
			y_t = np.concatenate((y_pos, y_neg))
		else:
			print "Sample %d negative memories"%(len(y_neg))
#......... rest of the code omitted .........
Developer: blazer82, Project: ai, Lines: 103, Source: agent.py


Note: The memory.Memory.sample examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are drawn from open-source projects contributed by their authors; copyright remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not reproduce without permission.