|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
代码取自:《用Python动手学强化学习》第二章,基于价值最大化的贝尔曼方程
def R(s):
    """Reward function: +1 for reaching happy_end, -1 for bad_end, 0 elsewhere."""
    rewards = {"happy_end": 1, "bad_end": -1}
    return rewards.get(s, 0)
def transit_func(s, a):
    """Transition probabilities for the 5-step up/down game.

    The state string encodes the action history, e.g. "state_up_down";
    `a` is the intended action ("up" or "down").  The intended action
    succeeds with probability MOVE_PROB, otherwise the opposite action
    happens.  Once LIMIT_GAME_COUNT actions have been taken the episode
    terminates: "happy_end" if at least HAPPY_END_BORDER of them were
    "up", else "bad_end".

    Returns a dict mapping next state -> probability.
    """
    actions = s.split("_")[1:]
    LIMIT_GAME_COUNT = 5
    HAPPY_END_BORDER = 4
    MOVE_PROB = 0.9

    def next_state(state, action):
        return "_".join([state, action])

    if len(actions) == LIMIT_GAME_COUNT:
        # Terminal step.  Count "up" actions with a generator (no throwaway
        # list) and a loop variable that does not shadow parameter `a`.
        up_count = sum(1 for act in actions if act == "up")
        state = "happy_end" if up_count >= HAPPY_END_BORDER else "bad_end"
        return {state: 1}
    else:
        # Non-terminal: intended action with MOVE_PROB, opposite otherwise.
        opposite = "up" if a == "down" else "down"
        return {next_state(s, a): MOVE_PROB,
                next_state(s, opposite): 1 - MOVE_PROB}
-
def max_V_on_next_state(s):
    """Bellman optimality backup: the best expected value over both actions.

    Terminal states ("happy_end"/"bad_end") have value 0; otherwise this
    returns max over actions of sum_{s'} P(s'|s,a) * V(s').  Relies on the
    sibling `transit_func` and the mutually recursive `V` defined elsewhere.
    """
    if s in ["happy_end", "bad_end"]:
        return 0
    return max(
        sum(prob * V(nxt) for nxt, prob in transit_func(s, act).items())
        for act in ["up", "down"]
    )
复制代码 |
|