本文整理汇总了 Python 中 world.World.allMoveList 方法的典型用法代码示例。如果您正苦于以下问题：Python World.allMoveList 方法的具体用法？Python World.allMoveList 怎么用？Python World.allMoveList 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 world.World 的用法示例。
在下文中一共展示了World.allMoveList方法的4个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: Qlearning
# 需要导入模块: from world import World [as 别名]
# 或者: from world.World import allMoveList [as 别名]
def Qlearning(episodes, initialState, policy, alpha_pred=0.2, alpha_prey=0.5):
    """Train predator(s) and prey with independent tabular Q-learning.

    Every episode starts from a fresh ``World((5, 5), initialState)`` and
    runs until ``world.stopState()`` is true.  Both sides pick their action
    with the supplied ``policy`` callable and keep their own Q-table keyed
    by ``(state, action)``.

    Args:
        episodes: number of episodes to run.
        initialState: start configuration handed to ``World``.
        policy: callable invoked as
            ``policy(state, actions, Q, policyParam, initValue)``; must
            return one element of ``actions``.
        alpha_pred: learning rate of the predator table.
        alpha_prey: learning rate of the prey table.

    Returns:
        ``(steps, rewards)``: ``steps[i]`` is the number of moves made in
        episode ``i``; ``rewards[i]`` is 1 when the final ``reward[1]``
        (the predator component) was positive, otherwise 0.
    """
    initValue = 0        # default Q-value of an unseen (state, action) pair
    policyParam = 0.2    # epsilon for e-greedy / tau for softmax
    discount = 0.7

    Q_pred = {}
    Q_prey = {}
    steps = [0] * episodes
    rewards = [0] * episodes

    for i in range(episodes):
        iterations = 0
        world = World((5, 5), initialState)

        while True:
            state = world.position
            pred_action = policy(state, world.allMoveList(), Q_pred,
                                 policyParam, initValue)
            prey_action = policy(state, world.singleMoveList(), Q_prey,
                                 policyParam, initValue)
            reward = world.move(prey_action, pred_action)
            iterations += 1

            # Make sure both entries exist and remember their old values.
            pred_key = (state, pred_action)
            prey_key = (state, prey_action)
            old_pred = Q_pred.setdefault(pred_key, initValue)
            old_prey = Q_prey.setdefault(prey_key, initValue)

            if world.stopState():
                # Terminal transition: the successor is absorbing, so only
                # the immediate reward enters the target.
                Q_prey[prey_key] = old_prey + alpha_prey * (reward[0] - old_prey)
                Q_pred[pred_key] = old_pred + alpha_pred * (reward[1] - old_pred)
                break

            newState = world.position
            # Best value reachable from the successor state for each side.
            maxQ_pred = max(Q_pred.get((newState, nextAction), initValue)
                            for nextAction in world.allMoveList())
            maxQ_prey = max(Q_prey.get((newState, nextAction), initValue)
                            for nextAction in world.singleMoveList())

            # Non-terminal update.  The reward term is deliberately left out:
            # the original author notes the immediate reward is zero here.
            Q_pred[pred_key] = old_pred + alpha_pred * (discount * maxQ_pred - old_pred)
            Q_prey[prey_key] = old_prey + alpha_prey * (discount * maxQ_prey - old_prey)

        if i > 0 and i % 1000 == 0:
            # Same output as the Python 2 statement `print "Episode", i`,
            # but also valid under Python 3.
            print("Episode %d" % i)

        steps[i] = iterations
        if reward[1] > 0:
            rewards[i] = 1

    return steps, rewards
示例2: World
# 需要导入模块: from world import World [as 别名]
# 或者: from world.World import allMoveList [as 别名]
from world import World
import random
import pylab as pl
#world = World((5,5),[(10,0),(0,10),(0,0),(10,10),(10,1),(0,9),(0,1),(10,9)])
# Random-policy baseline: for 1..4 predators, play `runs` episodes in which
# both the prey and the predators move uniformly at random, and accumulate
# how often the prey ends with a negative reward and how many steps were used.
predatorLocations = [(0, 0), (0, 10), (10, 0), (10, 10)]
# Result accumulators; nothing in the visible part of the script fills them
# (presumably the code that follows does -- TODO confirm).
preds = []
iters = []
prey = []
for no in range(len(predatorLocations)):
    # Use the first no+1 predator start positions.
    world = World((5, 5), predatorLocations[:no + 1])
    allMoves = world.allMoveList()
    singleMoves = world.singleMoveList()
    runs = 1000
    totalCaughtPrey = 0
    totalIterations = 0
    for i in range(runs):
        world = World((5, 5), predatorLocations[:no + 1])
        iterations = 0
        # Reset per run so that a run which is already in a stop state does
        # not read an undefined or stale reward from the previous run.
        reward = None
        while not world.stopState():
            preyMove = random.choice(singleMoves)
            predatorMoves = random.choice(allMoves)
            reward = world.move(preyMove, predatorMoves)
            iterations += 1
        # reward[0] is the prey component; a negative value is counted as
        # "prey caught".
        if reward is not None and reward[0] < 0:
            totalCaughtPrey += 1
        totalIterations += iterations
示例3: policyHillClimbing
# 需要导入模块: from world import World [as 别名]
# 或者: from world.World import allMoveList [as 别名]
def _phc_init_state(state, actions, Q, pi, initValue, num_actions):
    """Give every action of a newly met state the default Q-value and a uniform policy."""
    uniform = 1 / float(num_actions)
    for action in actions:
        Q[(state, action)] = initValue
        pi[(state, action)] = uniform


def _phc_project_policy(pi, state, actions, num_actions, exploration=0.1):
    """Turn pi(state, .) back into a distribution, then mix in uniform exploration."""
    keys = [(state, action) for action in actions]
    # The hill-climbing step may push a probability below zero; clip first.
    for key in keys:
        if pi[key] < 0.0:
            pi[key] = 0.0
    total = sum(pi[key] for key in keys)
    uniform = 1.0 / num_actions
    for key in keys:
        # Fall back to uniform if every probability was clipped to zero.
        share = pi[key] / total if total > 0 else uniform
        pi[key] = (1.0 - exploration) * share + exploration * uniform


def policyHillClimbing(episodes, initial_state, gamma=0.5, delta=0.2, alpha_pred=0.4, alpha_prey=0.1):
    """Train predator(s) and prey with Policy Hill Climbing.

    Each side keeps a Q-table and a stochastic policy table ``pi``, both
    keyed by ``(state, action)``.  Actions are drawn by ``greedy_policy``
    from the current ``pi``; after every move the Q-value of the taken
    action is updated and its probability is moved up by ``delta`` when it
    is currently the best action in that state, down by
    ``delta / (num_actions - 1)`` otherwise.

    Args:
        episodes: number of episodes to run.
        initial_state: start configuration handed to ``World``.
        gamma: discount factor.
        delta: policy step size.
        alpha_pred: Q learning rate of the predator.
        alpha_prey: Q learning rate of the prey.

    Returns:
        ``(steps, rewards)``: per-episode move count and the final
        ``reward[0]`` (prey component) of each episode.
    """
    world = World((5, 5), initial_state)
    Q_pred = {}
    Q_prey = {}
    pi_pred = {}
    pi_prey = {}
    # States are already used inside dict keys, so they are hashable and a
    # set gives O(1) membership tests.
    seen_states = set()
    initValue = 0.0
    num_actions_prey = len(world.singleMoveList())
    num_actions_pred = len(world.allMoveList())
    steps = [0] * episodes
    rewards = [0] * episodes

    for i in range(episodes):
        world = World((5, 5), initial_state)
        iterations = 0
        state = world.position

        # Initialise the start state only the first time it is met, so that
        # values learned in earlier episodes are kept.
        if state not in seen_states:
            seen_states.add(state)
            _phc_init_state(state, world.singleMoveList(), Q_prey, pi_prey,
                            initValue, num_actions_prey)
            _phc_init_state(state, world.allMoveList(), Q_pred, pi_pred,
                            initValue, num_actions_pred)

        # (0, 0) is treated as an already-known state whose Q-values are 0.
        seen_states.add((0, 0))
        for action_p in world.singleMoveList():
            Q_prey[((0, 0), action_p)] = 0
        for action_p in world.allMoveList():
            Q_pred[((0, 0), action_p)] = 0

        while not world.stopState():
            state = world.position
            action_pred = greedy_policy(pi_pred, state, world.allMoveList())
            action_prey = greedy_policy(pi_prey, state, world.singleMoveList())
            reward = world.move(action_prey, action_pred)
            new_state = world.position
            iterations += 1

            pred_moves = world.allMoveList()
            prey_moves = world.singleMoveList()

            if new_state not in seen_states:
                seen_states.add(new_state)
                _phc_init_state(new_state, prey_moves, Q_prey, pi_prey,
                                initValue, num_actions_prey)
                _phc_init_state(new_state, pred_moves, Q_pred, pi_pred,
                                initValue, num_actions_pred)

            # Q update for the joint step that was just taken.
            best_Q_pred = max(Q_pred[(new_state, action)] for action in pred_moves)
            best_Q_prey = max(Q_prey[(new_state, action)] for action in prey_moves)
            pred_key = (state, action_pred)
            prey_key = (state, action_prey)
            Q_pred[pred_key] = ((1.0 - alpha_pred) * Q_pred[pred_key]
                                + alpha_pred * (reward[1] + gamma * best_Q_pred))
            Q_prey[prey_key] = ((1.0 - alpha_prey) * Q_prey[prey_key]
                                + alpha_prey * (reward[0] + gamma * best_Q_prey))

            # Hill-climbing step on the probability of the action just taken.
            if Q_pred[pred_key] == max(Q_pred[(state, action)] for action in pred_moves):
                pi_pred[pred_key] += delta
            else:
                pi_pred[pred_key] -= delta / (num_actions_pred - 1.0)
            if Q_prey[prey_key] == max(Q_prey[(state, action)] for action in prey_moves):
                pi_prey[prey_key] += delta
            else:
                pi_prey[prey_key] -= delta / (num_actions_prey - 1.0)

            # Restrict pi(state, .) to a probability distribution and keep it
            # epsilon-soft (0.1 of the mass is spread over all actions).
            # The normaliser is the sum of the policy entries themselves.
            _phc_project_policy(pi_pred, state, pred_moves, num_actions_pred)
            _phc_project_policy(pi_prey, state, prey_moves, num_actions_prey)

        rewards[i] = reward[0]
        steps[i] = iterations
        # Same output as the Python 2 statement `print "Episode", i`,
        # but also valid under Python 3.
        print("Episode %d" % i)

    return steps, rewards
示例4: minimax
# 需要导入模块: from world import World [as 别名]
# 或者: from world.World import allMoveList [as 别名]
def minimax(episodes,initial_state,epsilon, decay, gamma, alpha_pred=1.0, alpha_prey=1.0):
# initialization might be too expansive
Q_pred = dict()
Q_prey = dict()
V_pred = dict()
V_prey = dict()
pi_pred = dict()
pi_prey = dict()
initValue = 1.0
# initialisation
world = World((5,5),initial_state)
for state in world.allStates():
V_pred[state] = 1.0
V_prey[state] = 1.0
for action in world.allMoveList():
pi_pred[(state,action)]=1.0/len(world.allMoveList())
for prey_move in world.singleMoveList():
Q_pred[(state, action, prey_move)]=1.0
Q_prey[(state, action, prey_move)]=1.0
for action in world.singleMoveList():
pi_prey[(state,action)]=1.0/len(world.singleMoveList())
# absorbing states
terminal_state = tuple([(0,0)] * len(initial_state))
V_pred[terminal_state] = 0.0
V_prey[terminal_state] = 0.0
steps = [0]*episodes
rewards = [0]*episodes
for epi in range(episodes):
# initialize world
world = World((5,5),initial_state)
# print "Begin Pred", V_pred[world.position]
# print "End Prey", V_prey[world.position]
# for s in world.singleMoveList():
# print s, "Pred", V_pred[(s,)]
# print s, "Prey", V_pred[(s,)]
# for a in world.allMoveList():
# for a2 in world.singleMoveList():
# print s, "Q", a, a2, Q_pred[(state,a,a2)]
iterations =0
while not world.stopState():
state = world.position
# choose action
action_pred = minimax_policy(epsilon, pi_pred, state, world.allMoveList())
action_prey = minimax_policy(epsilon, pi_prey, state, world.singleMoveList())
reward = world.move(action_prey,action_pred)
iterations +=1
new_state = world.position
# update Q
# if (state,action_prey) not in Q_prey:
# Q_prey[state,action_prey] = initValue
# if (state,action_pred) not in Q_pred:
# Q_pred[state,action_pred] = initValue
Q_pred[(state,action_pred,action_prey)] = (1.0-alpha_pred)*Q_pred[(state,action_pred,action_prey)] + alpha_pred*(reward[1]+ gamma* V_pred[new_state])
Q_prey[(state,action_pred,action_prey)] = (1.0-alpha_prey)*Q_prey[(state,action_pred,action_prey)] + alpha_prey*(reward[0]+ gamma* V_prey[new_state])
# update pi
# adapted from example: http://abel.ee.ucla.edu/cvxopt/examples/tutorial/lp.html
## PREDATOR update
# constraint to minimize w.r.t. prey action
minConstr = [[1.0] + [-Q_pred[(state,a_pred,a_prey)] for a_pred in world.allMoveList()] for a_prey in world.singleMoveList()]
# constrinat to keep every pi(a) positive
posConstr = []
for i in range(1,len(world.allMoveList())+1):
new_row = [0.0] * (len(world.allMoveList())+1)
new_row[i] = -1.0
posConstr.append(new_row)
normGreater = [0.0] + [1.0] * len(world.allMoveList())
normSmaller = [0.0] + [-1.0] * len(world.allMoveList())
A = matrix([normGreater, normSmaller] + minConstr + posConstr).trans()
b = matrix([ 1.0, -1.0] + [0.0] * (len(world.singleMoveList()) + len(world.allMoveList())) )
# -1 V and 0 for all pi(s,a)
c = matrix([ -1.0 ] + [0.0] * len(world.allMoveList()))
sol=solvers.lp(c,A,b)
V_pred[state] = sol['x'][0]
for a_pred, x in zip(world.allMoveList(),sol['x'][1:]):
pi_pred[(state,a_pred)] = x
# ## PREY update
# constraint to minimize w.r.t. prey action
minConstr = [[1.0] + [-Q_prey[(state,a_pred,a_prey)] for a_prey in world.singleMoveList()] for a_pred in world.allMoveList()]
# # constriant to keep every pi(a) positive
posConstr = []
for i in range(1,len(world.singleMoveList())+1):
new_row = [0.0] * (len(world.singleMoveList())+1)
new_row[i] = -1.0
posConstr.append(new_row)
normGreater = [0.0] + [ 1.0] * len(world.singleMoveList())
normSmaller = [0.0] + [-1.0] * len(world.singleMoveList())
#.........这里部分代码省略.........