本文整理汇总了Python中world.World.moveList方法的典型用法代码示例。如果您正苦于以下问题:Python World.moveList方法的具体用法?Python World.moveList怎么用?Python World.moveList使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类world.World的用法示例。
在下文中一共展示了World.moveList方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from world import World [as 别名]
# 或者: from world.World import moveList [as 别名]
def __init__(self, policy):
    """Compute state values by value iteration and store them on the instance.

    Every state returned by ``World.allStates()`` is swept repeatedly until
    the largest change within one sweep is at most 1e-5.  A move whose
    resulting position is ``(0, 0)`` is worth 10; any other move is worth the
    discounted expectation of the current values over
    ``World.nextPreyStates()``.  The state value is the best move value.

    Args:
        policy: stored unchanged as ``self.bottomPolicy``; not used here.

    Attributes set:
        value: dict mapping state -> value.
        actionList, allList: empty lists.
        bottomPolicy: the ``policy`` argument.
        discount: the discount factor used (0.9).
    """
    # world object (the starting state is irrelevant: setState() is called
    # before every use)
    world = World((0, 0), (1, 1))
    value = dict.fromkeys(world.allStates(), 0)
    discount = 0.9
    tolerance = 0.00001
    delta = 1  # anything above the tolerance, to force the first sweep
    while delta > tolerance:
        delta = 0
        for state in world.allStates():
            world.setState(state)
            old = value[state]
            # we can start the maximum at 0 since every value is 0 or positive
            curMax = 0
            for move in world.moveList():
                if world.posAfterMove(move) == (0, 0):
                    probSum = 10
                else:
                    # NOTE(review): the source indentation was lost; the
                    # expectation loop is assumed to belong to this branch.
                    # NOTE(review): the world is left at `state` (the move is
                    # not applied) when nextPreyStates() is queried -- confirm
                    # this is intended.
                    probSum = 0
                    for nextState, prob in world.nextPreyStates():
                        probSum += prob * discount * value[nextState]
                curMax = max(curMax, probSum)
            # values are updated in place, so later states in the same sweep
            # already see this new value
            value[state] = curMax
            delta = max(delta, abs(old - curMax))
    value[(0, 0)] = 10
    self.value = value
    self.actionList = []
    self.allList = []
    self.bottomPolicy = policy
    self.discount = discount
示例2: Qlearning
# 需要导入模块: from world import World [as 别名]
# 或者: from world.World import moveList [as 别名]
def Qlearning(episodes, policy, startState=(-5,-5), initValue=15,policyParam=0.1, alpha=0.4,discount=0.9):
    """Run tabular Q-learning and return the episode lengths.

    Args:
        episodes: number of episodes to run.
        policy: callable ``policy(state, world, Q, policyParam)`` returning
            the action to take.
        startState: state the world is reset to at the start of each episode.
        initValue: initial value of every ``Q[state, move]`` entry.
        policyParam: single parameter handed to ``policy`` (e.g. epsilon for
            epsilon-greedy or tau for softmax).
        alpha: learning rate of the update rule.
        discount: discount factor applied to the best next-state value.

    Returns:
        List of length ``episodes``; entry ``i`` is the number of moves made
        in episode ``i`` before ``world.stopState()`` became true.
    """
    # world object (the starting state is irrelevant: it is reset per episode)
    world = World((0, 0), (1, 1))
    # Q value table
    Q = {}
    for state in world.allStates():
        for move in world.moveList():
            Q[state, move] = initValue
    steps = [0] * episodes
    for i in range(episodes):
        iterations = 0
        # initialize world
        world.setState(startState)
        while True:
            state = world.position
            # move the predator according to the policy
            action = policy(state, world, Q, policyParam)
            world.move(action)
            iterations += 1
            # check if the predator caught the prey
            if world.stopState():
                # the next state is absorbing: the target is the reward of 10
                Q[state, action] += alpha * (10 - Q[state, action])
                break
            # move the prey (stochastically)
            world.performPreyMove()
            newState = world.position
            # the maximum value the agent can have after another move
            maxQ = max(Q[newState, nextAction] for nextAction in world.moveList())
            # the immediate reward is zero, so the target is discount * maxQ
            Q[state, action] += alpha * (discount * maxQ - Q[state, action])
        # record the number of steps the predator took
        steps[i] = iterations
    return steps
示例3: isOptimal
# 需要导入模块: from world import World [as 别名]
# 或者: from world.World import moveList [as 别名]
def isOptimal(self,state, move):
    """Tell whether ``move`` is (nearly) as good as the best move in ``state``.

    Each move from ``World.moveList()`` is scored using ``self.value`` and
    ``self.discount``: 10 if it leads to position ``(0, 0)``, otherwise the
    discounted expected value over ``World.nextPreyStates()``.

    Args:
        state: state the world is set to before each move is tried.
        move: the move to judge.

    Returns:
        True when the score of ``move`` exceeds 97% of the best score.  When
        every move scores 0 the moves are all tied, so True is returned
        (the original expression raised ZeroDivisionError in that case).
    """
    world = World((0, 0), (1, 1))
    ourMove = 0
    bestMove = 0
    for nmove in world.moveList():
        # restart from `state` for every candidate move
        world.setState(state)
        world.move(nmove)
        if world.position == (0, 0):
            probSum = 10
        else:
            # NOTE(review): the source indentation was lost; the expectation
            # loop is assumed to belong to this branch.
            probSum = 0
            for nextState, prob in world.nextPreyStates():
                probSum += prob * self.discount * self.value[nextState]
        bestMove = max(bestMove, probSum)
        if nmove == move:
            ourMove = probSum
    if bestMove == 0:
        # no move has any value, so `move` ties with the best one
        return True
    return ourMove / bestMove > 0.97
示例4: MCon
# 需要导入模块: from world import World [as 别名]
# 或者: from world.World import moveList [as 别名]
def MCon(episodes, initValue=15,epsilon=0.1, alpha=0.5,discount=0.9):
    """Run Monte Carlo control with an epsilon-greedy policy.

    For every (state, action) pair only its first occurrence in an episode
    is used; Q is the mean of all returns collected for the pair so far.

    Args:
        episodes: number of episodes to run.
        initValue: initial value of every ``Q[state, move]`` entry.
        epsilon: parameter handed to ``epsGreedyPolicy``.
        alpha: unused; kept so existing callers keep working.
        discount: discount factor used to compute the returns.

    Returns:
        List of length ``episodes``; entry ``i`` is the number of moves made
        in episode ``i`` before ``world.stopState()`` became true.
    """
    # world object (the starting state is irrelevant: it is reset per episode)
    world = World((0, 0), (1, 1))
    # initialize Q value table and return list for every (s,a)-pair
    Q = {}
    R = {}
    for state in world.allStates():
        for move in world.moveList():
            Q[state, move] = initValue  # some value
            R[state, move] = []  # returns (cumulative discounted reward) seen so far
    steps = [0] * episodes  # number of iterations per episode
    for i in range(episodes):
        iterations = 0
        # initialize world
        world.setState((-5, -5))
        stateActionPairs = {}
        # generate an episode using the current policy
        while True:
            state = world.position
            # move the predator according to the policy
            action = epsGreedyPolicy(state, world, Q, epsilon)
            world.move(action)
            if (state, action) not in stateActionPairs:  # store first occurrence
                stateActionPairs[(state, action)] = iterations  # used for discounting
            iterations += 1
            # check if the predator caught the prey
            if world.stopState():
                break
            # move the prey (stochastically)
            world.performPreyMove()
        steps[i] = iterations  # number of iterations needed to catch the prey
        # update Q and R
        for pair in stateActionPairs:
            # the only non-zero reward is the 10 received when the episode ends
            # NOTE(review): with this exponent the capturing action itself
            # receives 10*discount rather than 10 -- confirm this is intended.
            firstReturn = 10.0 * discount ** (iterations - stateActionPairs[pair])
            R[pair].append(firstReturn)
            Q[pair] = np.mean(np.array(R[pair]))
        # the policy update happens implicitly: epsGreedyPolicy reads Q
    return steps
示例5: MCoff
# 需要导入模块: from world import World [as 别名]
# 或者: from world.World import moveList [as 别名]
def MCoff(episodes, behaPolicy, matches=None, initValue=15,discount=0.9):
    """Run off-policy Monte Carlo control and return greedy episode lengths.

    Episodes are generated with the behaviour policy ``behaPolicy``.  Q is
    updated from the tail of each episode in which the behaviour actions
    agree with the greedy actions under Q, using importance weights
    ``1 / behaPolicy[...]``.  After every update the greedy policy is run
    once (capped at 2001 moves) and its length is recorded.

    Args:
        episodes: number of episodes to run.
        behaPolicy: dict with keys ``(state, action)`` and value
            ``P(action | state)``.
        matches: optional list; one number is appended per episode, the
            count of trailing episode steps whose action was among the greedy
            actions.  A fresh list is used when omitted (the previous
            ``[]`` default was shared between calls).
        initValue: initial value of every ``Q[state, move]`` entry.
        discount: discount factor used to compute the returns.

    Returns:
        List of length ``episodes`` with the number of moves the greedy
        policy made in the evaluation run following each episode.
    """
    if matches is None:
        matches = []
    world = World((0, 0), (1, 1))
    movelist = world.moveList()

    def policy(world):
        # sample a move for the current position from the behaviour policy
        return world.pickElementWithProbs(
            [(move, behaPolicy[(world.position, move)]) for move in movelist])

    # initialize the Q value table and the weighted-average accumulators
    Q = {}
    num = {}
    denum = {}
    for state in world.allStates():
        for move in world.moveList():
            num[state, move] = 0.0
            denum[state, move] = 0.0
            Q[state, move] = float(initValue)  # some value
    steps = [0] * episodes  # number of iterations per evaluation run
    for epi in range(episodes):
        # initialize world
        world.setState((-5, -5))
        episode = []
        # generate an episode with the behaviour policy
        while True:
            action = policy(world)
            episode.append((world.position, action))
            if action is None:
                # diagnostic output; works as a statement in Python 2 and 3
                print("%s %s" % (action, world.position))
            world.move(action)
            if world.stopState():
                break
            world.performPreyMove()
        # pairs in the matching tail, mapped to their earliest index in it
        matchingHistory = {}
        # index just past the last step whose action was not greedy
        last = 0
        for i, (state, action) in enumerate(episode[::-1]):
            # NOTE(review): tuples are (value, action) here but
            # (action, value) in the evaluation loop below; maxIndices is not
            # visible from here -- confirm which order it expects.
            actionValues = [(Q[state, maction], maction) for maction in world.moveList()]
            bestActions = [actionValues[j][1] for j in maxIndices(actionValues)]
            matchingHistory[(state, action)] = len(episode) - i - 1
            if action not in bestActions:
                last = len(episode) - i
                break
        matches.append(len(episode) - last)
        for (state, action) in matchingHistory:
            t = matchingHistory[(state, action)]
            if t >= last - 1:
                w = np.prod([1.0 / behaPolicy[episode[j]] for j in range(t, len(episode))])
                # return is gamma^{T-t}*10; the original used gamma^t and
                # updated the stale `move` key instead of `action`
                num[(state, action)] += w * (10.0 * discount ** (len(episode) - t))
                denum[(state, action)] += w
                Q[(state, action)] = num[(state, action)] / float(denum[(state, action)])
        # evaluate the greedy policy once from the start state
        world.setState((-5, -5))
        iterations = 0
        while True:
            iterations += 1
            # look up Q for the current position, not a leftover loop variable
            state = world.position
            actionValues = [(maction, Q[state, maction]) for maction in world.moveList()]
            bestAction = random.choice([actionValues[j][0] for j in maxIndices(actionValues)])
            world.move(bestAction)
            if world.stopState() or iterations > 2000:
                break
            world.performPreyMove()
        steps[epi] = iterations
    return steps