This article collects typical usage examples of the Python method rl.policy.EpsGreedyQPolicy. If you have been wondering what policy.EpsGreedyQPolicy does, how to use it, or where to find examples of it, the curated code samples below should help. You can also explore the containing module, rl.policy, for further usage examples.
The following presents 13 code examples of policy.EpsGreedyQPolicy, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
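Before the examples, a quick orientation: EpsGreedyQPolicy implements plain epsilon-greedy action selection, i.e. it picks a uniformly random action with probability eps and the action with the highest Q-value otherwise. The following minimal sketch illustrates that idea; it is not keras-rl's actual source, and the function name eps_greedy_select is made up for this page.

import numpy as np

def eps_greedy_select(q_values, eps=0.1):
    """Epsilon-greedy: explore with probability eps, otherwise exploit."""
    nb_actions = q_values.shape[0]
    if np.random.uniform() < eps:
        return np.random.randint(0, nb_actions)  # random (exploring) action
    return int(np.argmax(q_values))              # greedy action

q = np.array([0.1, 0.7, 0.2])
print(eps_greedy_select(q, eps=0.1))  # usually 1, occasionally random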
Example 1: __init__
# Required import: from rl import policy [as alias]
# Or: from rl.policy import EpsGreedyQPolicy [as alias]
def __init__(self, model, nb_actions, policy=None, test_policy=None, gamma=.99, nb_steps_warmup=10,
             train_interval=1, delta_clip=np.inf, *args, **kwargs):
    super(SarsaAgent, self).__init__(*args, **kwargs)

    # Do not use defaults in constructor because that would mean that each instance shares the same
    # policy.
    if policy is None:
        policy = EpsGreedyQPolicy()
    if test_policy is None:
        test_policy = GreedyQPolicy()

    self.model = model
    self.nb_actions = nb_actions
    self.policy = policy
    self.test_policy = test_policy
    self.gamma = gamma
    self.nb_steps_warmup = nb_steps_warmup
    self.train_interval = train_interval
    self.delta_clip = delta_clip

    self.compiled = False
    self.actions = None
    self.observations = None
    self.rewards = None
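As the constructor shows, the epsilon-greedy default is only created when no policy is passed in, so each agent instance gets its own policy object. A minimal usage sketch, assuming model is a Keras model with one output unit per discrete action and env is a Gym environment (both are placeholders here):

from keras.optimizers import Adam
from rl.agents.sarsa import SarsaAgent
from rl.policy import EpsGreedyQPolicy

# `model` and `env` are assumed to be defined elsewhere.
sarsa = SarsaAgent(model=model, nb_actions=env.action_space.n,
                   policy=EpsGreedyQPolicy(eps=0.05))
sarsa.compile(Adam(lr=1e-3))
sarsa.fit(env, nb_steps=10000, verbose=1)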
Example 2: test_dqn
# Required import: from rl import policy [as alias]
# Or: from rl.policy import EpsGreedyQPolicy [as alias]
def test_dqn():
    env = TwoRoundDeterministicRewardEnv()
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.n

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Dense(16, input_shape=(1,)))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    memory = SequentialMemory(limit=1000, window_length=1)
    policy = EpsGreedyQPolicy(eps=.1)
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=50,
                   target_model_update=1e-1, policy=policy, enable_double_dqn=False)
    dqn.compile(Adam(lr=1e-3))

    dqn.fit(env, nb_steps=2000, visualize=False, verbose=0)
    policy.eps = 0.
    h = dqn.test(env, nb_episodes=20, visualize=False)
    assert_allclose(np.mean(h.history['episode_reward']), 3.)
Example 3: test_double_dqn
# Required import: from rl import policy [as alias]
# Or: from rl.policy import EpsGreedyQPolicy [as alias]
def test_double_dqn():
    env = TwoRoundDeterministicRewardEnv()
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.n

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Dense(16, input_shape=(1,)))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    memory = SequentialMemory(limit=1000, window_length=1)
    policy = EpsGreedyQPolicy(eps=.1)
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=50,
                   target_model_update=1e-1, policy=policy, enable_double_dqn=True)
    dqn.compile(Adam(lr=1e-3))

    dqn.fit(env, nb_steps=2000, visualize=False, verbose=0)
    policy.eps = 0.
    h = dqn.test(env, nb_episodes=20, visualize=False)
    assert_allclose(np.mean(h.history['episode_reward']), 3.)
Example 4: test_duel_dqn
# Required import: from rl import policy [as alias]
# Or: from rl.policy import EpsGreedyQPolicy [as alias]
def test_duel_dqn():
    env = TwoRoundDeterministicRewardEnv()
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.n

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Dense(16, input_shape=(1,)))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions, activation='linear'))

    memory = SequentialMemory(limit=1000, window_length=1)
    policy = EpsGreedyQPolicy(eps=.1)
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=50,
                   target_model_update=1e-1, policy=policy, enable_double_dqn=False,
                   enable_dueling_network=True)
    dqn.compile(Adam(lr=1e-3))

    dqn.fit(env, nb_steps=2000, visualize=False, verbose=0)
    policy.eps = 0.
    h = dqn.test(env, nb_episodes=20, visualize=False)
    assert_allclose(np.mean(h.history['episode_reward']), 3.)
Example 5: main
# Required import: from rl import policy [as alias]
# Or: from rl.policy import EpsGreedyQPolicy [as alias]
def main():
    ENV_NAME = 'LunarLander-v2'

    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(42)
    env.seed(42)
    num_actions = env.action_space.n
    state_space = env.observation_space.shape[0]
    print(num_actions)

    model = build_model(state_space, num_actions)
    memory = SequentialMemory(limit=50000, window_length=1)
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                                  value_test=.05, nb_steps=10000)
    dqn = DQNAgent(model=model, nb_actions=num_actions, memory=memory, nb_steps_warmup=10,
                   target_model_update=1e-2, policy=policy)
    dqn.compile(Adam(lr=0.00025), metrics=['mae'])

    callbacks = build_callbacks(ENV_NAME)
    dqn.fit(env, nb_steps=500000,
            visualize=False,
            verbose=2,
            callbacks=callbacks)

    # After training is done, we save the final weights.
    dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

    # Finally, evaluate our algorithm for 5 episodes.
    dqn.test(env, nb_episodes=5, visualize=True)
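The LinearAnnealedPolicy wrapper used above decays eps linearly from value_max to value_min over nb_steps during training (and uses the fixed value_test while testing). A small standalone sketch of the assumed schedule, using the same parameters as the example:

def annealed_eps(step, value_max=1.0, value_min=0.1, nb_steps=10000):
    """Linear annealing: eps falls from value_max to value_min, then stays flat."""
    slope = (value_min - value_max) / float(nb_steps)
    return max(value_min, value_max + slope * step)

for step in (0, 5000, 10000, 20000):
    print(step, annealed_eps(step))  # 1.0, 0.55, 0.1, 0.1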
Example 6: main
# Required import: from rl import policy [as alias]
# Or: from rl.policy import EpsGreedyQPolicy [as alias]
def main():
    ENV_NAME = 'BreakoutDeterministic-v4'
    INPUT_SHAPE = (84, 84)
    WINDOW_LENGTH = 4

    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(42)
    env.seed(42)
    num_actions = env.action_space.n
    input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE

    model = build_model(INPUT_SHAPE, num_actions)
    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    processor = AtariProcessor()
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                                  value_test=.05, nb_steps=1000000)
    dqn = DQNAgent(model=model, nb_actions=num_actions, policy=policy, memory=memory,
                   processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000,
                   train_interval=4, delta_clip=1.)
    dqn.compile(Adam(lr=.00025), metrics=['mae'])

    callbacks = build_callbacks(ENV_NAME)
    dqn.fit(env,
            nb_steps=1750000,
            log_interval=10000,
            visualize=False,
            verbose=2,
            callbacks=callbacks)

    # After training is done, we save the final weights.
    dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

    # Finally, evaluate the algorithm for 10 episodes.
    dqn.test(env, nb_episodes=10, visualize=True)
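Examples 6 and 9 rely on an AtariProcessor that is not shown in the snippet. A plausible minimal version, modeled on keras-rl's own Atari example (an assumption, since the original class is not included here), resizes frames to 84x84 grayscale, rescales lazily at batch time, and clips rewards:

from PIL import Image
import numpy as np
from rl.core import Processor

class AtariProcessor(Processor):
    def process_observation(self, observation):
        # Resize the raw RGB frame and convert it to grayscale.
        img = Image.fromarray(observation).resize((84, 84)).convert('L')
        return np.array(img).astype('uint8')

    def process_state_batch(self, batch):
        # Store uint8 frames in memory; rescale to [0, 1] only at batch time.
        return batch.astype('float32') / 255.

    def process_reward(self, reward):
        return np.clip(reward, -1., 1.)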
Example 7: main
# Required import: from rl import policy [as alias]
# Or: from rl.policy import EpsGreedyQPolicy [as alias]
def main():
    ENV_NAME = 'CartPole-v0'

    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(42)
    env.seed(42)
    num_actions = env.action_space.n
    state_space = env.observation_space.shape[0]
    print(num_actions)

    model = build_model(state_space, num_actions)
    memory = SequentialMemory(limit=50000, window_length=1)
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                                  value_test=.05, nb_steps=10000)
    dqn = DQNAgent(model=model, nb_actions=num_actions, memory=memory, nb_steps_warmup=10,
                   target_model_update=1e-2, policy=policy)
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])

    callbacks = build_callbacks(ENV_NAME)
    dqn.fit(env, nb_steps=50000,
            visualize=False,
            verbose=2,
            callbacks=callbacks)

    # After training is done, we save the final weights.
    dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

    # Finally, evaluate our algorithm for 5 episodes.
    dqn.test(env, nb_episodes=5, visualize=True)
Example 8: main
# Required import: from rl import policy [as alias]
# Or: from rl.policy import EpsGreedyQPolicy [as alias]
def main():
    ENV_NAME = 'LunarLander-v2'

    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(42)
    env.seed(42)
    num_actions = env.action_space.n
    state_space = env.observation_space.shape[0]
    print(num_actions)

    model = build_model(state_space, num_actions)
    memory = SequentialMemory(limit=50000, window_length=1)
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                                  value_test=.05, nb_steps=10000)
    dqn = DQNAgent(model=model, nb_actions=num_actions, memory=memory, nb_steps_warmup=10,
                   target_model_update=1e-2, policy=policy)
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])

    callbacks = build_callbacks(ENV_NAME)

    # Load previously saved weights instead of training from scratch.
    dqn.load_weights('dqn_LunarLander-v2_weights_510000.h5f')

    # Finally, evaluate the algorithm for 10 episodes.
    dqn.test(env, nb_episodes=10, visualize=True)
Example 9: main
# Required import: from rl import policy [as alias]
# Or: from rl.policy import EpsGreedyQPolicy [as alias]
def main():
    ENV_NAME = 'BreakoutDeterministic-v4'
    INPUT_SHAPE = (84, 84)
    WINDOW_LENGTH = 4

    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(42)
    env.seed(42)
    num_actions = env.action_space.n

    model = build_model(INPUT_SHAPE, num_actions)
    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    processor = AtariProcessor()
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                                  value_test=.05, nb_steps=1000000)
    dqn = DQNAgent(model=model, nb_actions=num_actions, policy=policy, memory=memory,
                   processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000,
                   train_interval=4, delta_clip=1.)
    dqn.compile(Adam(lr=.00025), metrics=['mae'])

    callbacks = build_callbacks(ENV_NAME)

    # Load previously saved weights instead of training from scratch.
    dqn.load_weights('dqn_BreakoutDeterministic-v4_weights_1750000.h5f')

    # Finally, evaluate the algorithm for 10 episodes.
    dqn.test(env, nb_episodes=10, visualize=True)
Example 10: __init__
# Required import: from rl import policy [as alias]
# Or: from rl.policy import EpsGreedyQPolicy [as alias]
def __init__(self, model, policy=None, test_policy=None, enable_double_dqn=False,
             enable_dueling_network=False, dueling_type='avg', *args, **kwargs):
    super(DQNAgent, self).__init__(*args, **kwargs)

    # Validate (important) input.
    if list(model.output.shape) != list((None, self.nb_actions)):
        raise ValueError('Model output "{}" has invalid shape. DQN expects a model that has one dimension for each action, in this case {}.'.format(model.output, self.nb_actions))

    # Parameters.
    self.enable_double_dqn = enable_double_dqn
    self.enable_dueling_network = enable_dueling_network
    self.dueling_type = dueling_type
    if self.enable_dueling_network:
        # Get the second-to-last layer of the model and discard the final layer.
        layer = model.layers[-2]
        nb_action = model.output.shape[-1]

        # Layer y has shape (nb_action + 1,):
        #   y[:, 0]  represents V(s; theta)
        #   y[:, 1:] represents A(s, a; theta)
        y = Dense(nb_action + 1, activation='linear')(layer.output)

        # Calculate Q(s, a; theta):
        #   dueling_type == 'avg':   Q(s,a;theta) = V(s;theta) + (A(s,a;theta) - Avg_a(A(s,a;theta)))
        #   dueling_type == 'max':   Q(s,a;theta) = V(s;theta) + (A(s,a;theta) - max_a(A(s,a;theta)))
        #   dueling_type == 'naive': Q(s,a;theta) = V(s;theta) + A(s,a;theta)
        if self.dueling_type == 'avg':
            outputlayer = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.mean(a[:, 1:], axis=1, keepdims=True), output_shape=(nb_action,))(y)
        elif self.dueling_type == 'max':
            outputlayer = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.max(a[:, 1:], axis=1, keepdims=True), output_shape=(nb_action,))(y)
        elif self.dueling_type == 'naive':
            outputlayer = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:], output_shape=(nb_action,))(y)
        else:
            assert False, "dueling_type must be one of {'avg','max','naive'}"

        model = Model(inputs=model.input, outputs=outputlayer)

    # Related objects.
    self.model = model
    if policy is None:
        policy = EpsGreedyQPolicy()
    if test_policy is None:
        test_policy = GreedyQPolicy()
    self.policy = policy
    self.test_policy = test_policy

    # State.
    self.reset_states()
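The three dueling_type branches above differ only in how the state value V(s) and the advantages A(s, a) are merged into Q-values. A standalone NumPy illustration of the same arithmetic, with toy numbers that are not from any trained model:

import numpy as np

# Toy output of the (nb_action + 1)-unit dense layer: [V(s), A(s,a_0), A(s,a_1), A(s,a_2)]
y = np.array([[2.0, 1.0, -1.0, 0.0]])  # batch of 1, nb_action = 3

v = y[:, :1]   # state value V(s; theta)
a = y[:, 1:]   # advantages A(s, a; theta)

q_avg = v + a - a.mean(axis=1, keepdims=True)  # dueling_type == 'avg'
q_max = v + a - a.max(axis=1, keepdims=True)   # dueling_type == 'max'
q_naive = v + a                                # dueling_type == 'naive'

print(q_avg)  # [[3. 1. 2.]]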
Example 11: __init__
# Required import: from rl import policy [as alias]
# Or: from rl.policy import EpsGreedyQPolicy [as alias]
def __init__(self, model, policy=None, test_policy=None, enable_double_dqn=False,
             enable_dueling_network=False, dueling_type='avg', *args, **kwargs):
    super(DQNAgent, self).__init__(*args, **kwargs)

    # Validate (important) input.
    if hasattr(model.output, '__len__') and len(model.output) > 1:
        raise ValueError('Model "{}" has more than one output. DQN expects a model that has a single output.'.format(model))
    if model.output._keras_shape != (None, self.nb_actions):
        raise ValueError('Model output "{}" has invalid shape. DQN expects a model that has one dimension for each action, in this case {}.'.format(model.output, self.nb_actions))

    # Parameters.
    self.enable_double_dqn = enable_double_dqn
    self.enable_dueling_network = enable_dueling_network
    self.dueling_type = dueling_type
    if self.enable_dueling_network:
        # Get the second-to-last layer of the model and discard the final layer.
        layer = model.layers[-2]
        nb_action = model.output._keras_shape[-1]

        # Layer y has shape (nb_action + 1,):
        #   y[:, 0]  represents V(s; theta)
        #   y[:, 1:] represents A(s, a; theta)
        y = Dense(nb_action + 1, activation='linear')(layer.output)

        # Calculate Q(s, a; theta):
        #   dueling_type == 'avg':   Q(s,a;theta) = V(s;theta) + (A(s,a;theta) - Avg_a(A(s,a;theta)))
        #   dueling_type == 'max':   Q(s,a;theta) = V(s;theta) + (A(s,a;theta) - max_a(A(s,a;theta)))
        #   dueling_type == 'naive': Q(s,a;theta) = V(s;theta) + A(s,a;theta)
        if self.dueling_type == 'avg':
            outputlayer = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.mean(a[:, 1:], axis=1, keepdims=True), output_shape=(nb_action,))(y)
        elif self.dueling_type == 'max':
            outputlayer = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.max(a[:, 1:], axis=1, keepdims=True), output_shape=(nb_action,))(y)
        elif self.dueling_type == 'naive':
            outputlayer = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:], output_shape=(nb_action,))(y)
        else:
            assert False, "dueling_type must be one of {'avg','max','naive'}"

        model = Model(inputs=model.input, outputs=outputlayer)

    # Related objects.
    self.model = model
    if policy is None:
        policy = EpsGreedyQPolicy()
    if test_policy is None:
        test_policy = GreedyQPolicy()
    self.policy = policy
    self.test_policy = test_policy

    # State.
    self.reset_states()
Example 12: training_game
# Required import: from rl import policy [as alias]
# Or: from rl.policy import EpsGreedyQPolicy [as alias]
def training_game():
    env = Environment()

    input_shape = (FLAGS.screen_size, FLAGS.screen_size, 1)
    nb_actions = 12  # Number of actions

    model = neural_network_model(input_shape, nb_actions)
    memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH)
    processor = SC2Proc()

    # Policy
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr="eps", value_max=1, value_min=0.7,
                                  value_test=.0, nb_steps=1e6)

    # Agent
    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   memory=memory,
                   enable_double_dqn=False,
                   nb_steps_warmup=500,
                   # nb_steps_warmup=1,
                   target_model_update=1e-2,
                   policy=policy,
                   batch_size=150,
                   processor=processor)
    dqn.compile(Adam(lr=.001), metrics=["mae"])

    # Tensorboard callback
    callbacks = keras.callbacks.TensorBoard(log_dir='./Graph', histogram_freq=0,
                                            write_graph=True, write_images=False)

    # Save the parameters and load them when needed
    name = FLAGS.mini_game
    w_file = "dqn_{}_weights.h5f".format(name)
    check_w_file = "train_w" + name + "_weights.h5f"
    if SAVE_MODEL:
        check_w_file = "train_w" + name + "_weights_{step}.h5f"
    log_file = "training_w_{}_log.json".format(name)

    if LOAD_MODEL:
        dqn.load_weights(w_file)

    dqn.fit(env, callbacks=callbacks, nb_steps=1e7, action_repetition=2,
            log_interval=1e4, verbose=2)
    dqn.save_weights(w_file, overwrite=True)
    dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)
Example 13: training_game
# Required import: from rl import policy [as alias]
# Or: from rl.policy import EpsGreedyQPolicy [as alias]
def training_game():
    env = Environment(map_name="HallucinIce", visualize=True, game_steps_per_episode=150,
                      agent_interface_format=features.AgentInterfaceFormat(
                          feature_dimensions=features.Dimensions(screen=64, minimap=32)))

    input_shape = (_SIZE, _SIZE, 1)
    nb_actions = _SIZE * _SIZE  # one action per screen position

    model = neural_network_model(input_shape, nb_actions)

    # Memory: how many subsequent observations should be provided to the network?
    memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH)
    processor = SC2Proc()

    ### Policy
    # The agent's behaviour function: how the agent picks its actions.
    # LinearAnnealedPolicy is a wrapper that anneals an attribute of the inner policy
    # linearly over nb_steps; here it wraps an epsilon-greedy policy.
    # EpsGreedyQPolicy selects a uniformly random action with probability eps and the
    # action with the highest Q-value otherwise.
    # BoltzmannQPolicy instead samples actions from a Boltzmann distribution over the
    # Q-values, i.e. the probability of an action is a function of its "energy" (Q-value).
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr="eps", value_max=1, value_min=0.7,
                                  value_test=.0, nb_steps=1e6)
    # policy = BoltzmannQPolicy(tau=1., clip=(-500, 500))  # clip bounds the values to -500 / 500

    ### Agent
    # Deep Q-learning: combines Q-learning (the Bellman equation) with a deep neural network.
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=500, target_model_update=1e-2, policy=policy,
                   batch_size=150, processor=processor)
    dqn.compile(Adam(lr=.001), metrics=["mae"])

    ## Save the parameters and load them when needed
    name = "HallucinIce"
    w_file = "dqn_{}_weights.h5f".format(name)
    check_w_file = "train_w" + name + "_weights.h5f"
    if SAVE_MODEL:
        check_w_file = "train_w" + name + "_weights_{step}.h5f"
    log_file = "training_w_{}_log.json".format(name)
    callbacks = [ModelIntervalCheckpoint(check_w_file, interval=1000)]
    callbacks += [FileLogger(log_file, interval=100)]

    if LOAD_MODEL:
        dqn.load_weights(w_file)

    dqn.fit(env, callbacks=callbacks, nb_steps=1e7, action_repetition=2,
            log_interval=1e4, verbose=2)
    dqn.save_weights(w_file, overwrite=True)
    dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)
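For comparison with the commented-out BoltzmannQPolicy line above, here is a minimal sketch of Boltzmann (softmax) action selection. The tau and clip parameters mirror that commented call; the function itself is an illustration of the idea, not keras-rl's source:

import numpy as np

def boltzmann_select(q_values, tau=1.0, clip=(-500., 500.)):
    """Sample an action with probability proportional to exp(Q / tau)."""
    exp_values = np.exp(np.clip(q_values / tau, clip[0], clip[1]))
    probs = exp_values / np.sum(exp_values)
    return np.random.choice(len(q_values), p=probs)

q = np.array([1.0, 2.0, 0.5])
print(boltzmann_select(q))  # action 1 is most likely, but any action can be drawn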