| 
 | 
 
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册  
 
x
 
代码取自:《用Python动手学强化学习》第二章,基于价值最大化的贝尔曼方程 
 
 
def R(s):
    """Reward function for the 5-step up/down game.

    Args:
        s: state name (str); the terminal states are "happy_end" and
           "bad_end", every other string is a non-terminal state.

    Returns:
        int: +1 for the happy ending, -1 for the bad ending, 0 otherwise.
    """
    if s == "happy_end":
        return 1
    elif s == "bad_end":
        return -1
    else:
        # Non-terminal states yield no immediate reward.
        return 0
 
  
def transit_func(s, a):
    """Transition function T(s' | s, a) for the 5-step up/down game.

    A state encodes the action history: the base name followed by each
    taken action joined with underscores, e.g. "state_up_down". Once
    LIMIT_GAME_COUNT actions have been taken the episode ends
    deterministically: at least HAPPY_END_BORDER "up" actions lead to
    "happy_end", otherwise "bad_end". Before that, the intended action
    succeeds with probability MOVE_PROB and the opposite action occurs
    with probability 1 - MOVE_PROB.

    Args:
        s: current state string, e.g. "state" or "state_up_down".
        a: intended action, "up" or "down".

    Returns:
        dict mapping next-state string -> transition probability.
    """
    actions = s.split("_")[1:]
    LIMIT_GAME_COUNT = 5
    HAPPY_END_BORDER = 4
    MOVE_PROB = 0.9

    def next_state(state, action):
        # Append the taken action to the state's action history.
        return "_".join([state, action])

    if len(actions) == LIMIT_GAME_COUNT:
        # Episode is over: the outcome follows deterministically
        # from the recorded action history.
        up_count = sum([1 if a == "up" else 0 for a in actions])
        state = "happy_end" if up_count >= HAPPY_END_BORDER else "bad_end"
        prob = 1
        return {state: prob}
    else:
        # Stochastic move: the chosen action may be flipped to its opposite.
        opposite = "up" if a == "down" else "down"
        return {next_state(s, a): MOVE_PROB,
                next_state(s, opposite): 1 - MOVE_PROB}
 
 - def max_V_on_next_state(s):
 
 -     if s in ["happy_end", "bad_end"]:
 
 -         return 0
 
 -     actions = ["up", "down"]
 
 -     values = []
 
 -     for a in actions:
 
 -         transition_probs = transit_func(s, a)
 
 -         v = 0
 
 -         for next_state in transition_probs:
 
 -             prob = transition_probs[next_state]
 
 -             v += prob * V(next_state)
 
 -         values.append(v)
 
 -     return max(values)
 
  复制代码 |   
 
 
 
 |