Code source: 《用Python动手学习强化学习》 (a hands-on introduction to reinforcement learning with Python), Chapter 2: value approximation learning based on dynamic programming (value iteration).
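For reference, each sweep of the plan loop below applies the standard value-iteration (Bellman optimality) backup; the notation here is mine, not from the original post. Note that in this chapter's formulation the reward depends only on the state being entered, which is why the code calls reward_func(next_state):

\[
V_{k+1}(s) \;=\; \max_{a} \sum_{s'} P(s' \mid s, a)\,\bigl[R(s') + \gamma\, V_k(s')\bigr]
\]

Iteration stops once \(\max_s \lvert V_{k+1}(s) - V_k(s)\rvert\) falls below threshold.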
class Planner():
    def __init__(self, env):
        self.env = env
        self.log = []  # snapshot of the value grid after each sweep

    def initialize(self):
        self.env.reset()
        self.log = []

    def plan(self, gamma=0.9, threshold=0.0001):
        raise NotImplementedError("Planner subclasses must implement the plan method.")

    def transitions_at(self, state, action):
        # Yield (probability, next_state, reward) for every transition
        # reachable by taking `action` in `state`.
        transition_probs = self.env.transit_func(state, action)
        for next_state in transition_probs:
            prob = transition_probs[next_state]
            reward, _ = self.env.reward_func(next_state)
            yield prob, next_state, reward

    def dict_to_grid(self, state_reward_dict):
        # Convert a {state: value} dict into a 2-D list laid out like the grid world.
        grid = []
        for i in range(self.env.row_length):
            row = [0] * self.env.column_length
            grid.append(row)
        for s in state_reward_dict:
            grid[s.row][s.column] = state_reward_dict[s]
        return grid


class ValueIterationPlanner(Planner):
    def __init__(self, env):
        super().__init__(env)

    def plan(self, gamma=0.9, threshold=0.0001):
        self.initialize()
        actions = self.env.actions
        V = {}
        for s in self.env.states:
            V[s] = 0  # start every state's value at zero

        while True:
            delta = 0  # largest value change seen in this sweep
            self.log.append(self.dict_to_grid(V))  # record the sweep for visualization
            for s in V:
                if not self.env.can_action_at(s):
                    continue  # terminal/blocked cells keep their value
                expected_rewards = []
                for a in actions:
                    r = 0
                    for prob, next_state, reward in self.transitions_at(s, a):
                        r += prob * (reward + gamma * V[next_state])
                    expected_rewards.append(r)

                # Greedy backup: adopt the best action's expected return.
                max_reward = max(expected_rewards)
                delta = max(delta, abs(max_reward - V[s]))
                V[s] = max_reward

            if delta < threshold:
                break  # values have converged

        V_grid = self.dict_to_grid(V)
        return V_grid
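The Planner expects the Environment class defined earlier in the same chapter (exposing states, actions, reset, transit_func, reward_func, can_action_at, row_length and column_length). That class isn't reproduced in this post, so below is a minimal hypothetical stand-in inferred from the calls the Planner makes: a one-row corridor whose rightmost cell is terminal and pays reward 1. ToyEnv and its reward scheme are my own sketch for demonstration, not the book's Environment:

from collections import namedtuple

# Grid states must expose .row and .column and be hashable (they are dict keys in V).
State = namedtuple("State", ["row", "column"])

class ToyEnv():
    def __init__(self, length=4):
        self.row_length = 1
        self.column_length = length
        self.actions = [-1, 1]  # move left / move right
        self.states = [State(0, c) for c in range(length)]

    def reset(self):
        pass  # no per-episode state to clear in this toy model

    def can_action_at(self, state):
        # The rightmost cell is terminal; its value stays at 0.
        return state.column != self.column_length - 1

    def transit_func(self, state, action):
        # Deterministic move, clipped at the left wall.
        next_col = min(max(state.column + action, 0), self.column_length - 1)
        return {State(0, next_col): 1.0}

    def reward_func(self, state):
        # Reward 1 for entering the terminal cell, 0 elsewhere.
        done = state.column == self.column_length - 1
        return (1.0 if done else 0.0), done

planner = ValueIterationPlanner(ToyEnv())
print(planner.plan(gamma=0.9))  # -> roughly [[0.81, 0.9, 1.0, 0]]

Because V is updated in place during a sweep, later states in the same sweep already see the refreshed values of earlier ones (a Gauss-Seidel-style update), which typically converges in fewer sweeps than a fully synchronous backup.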