Code source: 《用Python动手学习强化学习》 (Learning Reinforcement Learning with Python), Chapter 2: policy learning based on dynamic programming (policy iteration).
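Before the listing, a quick sketch of the two updates the code performs (notation assumed here: π is self.policy, T and R come from the (prob, next_state, reward) triples yielded by transitions_at, and γ is gamma). Policy evaluation (estimate_by_policy) repeatedly applies the Bellman expectation backup until the largest per-state change drops below threshold:

V_{k+1}(s) = \sum_{a} \pi(a \mid s) \sum_{s'} T(s' \mid s, a)\,\bigl[R(s, a, s') + \gamma V_k(s')\bigr]

Policy improvement (inside plan) then replaces the policy at each state with the greedy action under those values:

\pi'(s) = \operatorname*{arg\,max}_{a} \sum_{s'} T(s' \mid s, a)\,\bigl[R(s, a, s') + \gamma V(s')\bigr]

Policy iteration alternates these two steps until the greedy policy stops changing.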
class PolicyIterationPlanner(Planner):

    def __init__(self, env):
        super().__init__(env)
        self.policy = {}

    def initialize(self):
        super().initialize()
        self.policy = {}
        actions = self.env.actions
        states = self.env.states
        for s in states:
            self.policy[s] = {}
            for a in actions:
                # Initialize the policy.
                # Initially, every action has the same probability.
                self.policy[s][a] = 1 / len(actions)

    def estimate_by_policy(self, gamma, threshold):
        V = {}
        for s in self.env.states:
            # Initialize the expected reward of each state.
            V[s] = 0

        while True:
            delta = 0
            for s in V:
                expected_rewards = []
                for a in self.policy[s]:
                    action_prob = self.policy[s][a]
                    r = 0
                    for prob, next_state, reward in self.transitions_at(s, a):
                        r += action_prob * prob * \
                            (reward + gamma * V[next_state])
                    expected_rewards.append(r)
                value = sum(expected_rewards)
                delta = max(delta, abs(value - V[s]))
                V[s] = value
            if delta < threshold:
                break

        return V

    def plan(self, gamma=0.9, threshold=0.0001):
        self.initialize()
        states = self.env.states
        actions = self.env.actions

        def take_max_action(action_value_dict):
            return max(action_value_dict, key=action_value_dict.get)

        while True:
            update_stable = True
            # Estimate the expected rewards under the current policy.
            V = self.estimate_by_policy(gamma, threshold)
            self.log.append(self.dict_to_grid(V))

            for s in states:
                # Get the action suggested by the current policy.
                policy_action = take_max_action(self.policy[s])

                # Compare it with the other actions.
                action_rewards = {}
                for a in actions:
                    r = 0
                    for prob, next_state, reward in self.transitions_at(s, a):
                        r += prob * (reward + gamma * V[next_state])
                    action_rewards[a] = r
                best_action = take_max_action(action_rewards)
                if policy_action != best_action:
                    update_stable = False

                # Update the policy (greedy: best_action gets prob=1, all others 0).
                for a in self.policy[s]:
                    prob = 1 if a == best_action else 0
                    self.policy[s][a] = prob

            if update_stable:
                # The policy did not change, so stop iterating.
                break

        # Turn the value dictionary into a 2D grid.
        V_grid = self.dict_to_grid(V)
        return V_grid
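For completeness, a minimal usage sketch. It assumes the grid-world Environment class that accompanies this chapter of the book; the grid cell codes and the move_prob argument below follow that class's conventions, so adjust the import and arguments to your local copy:

# Hypothetical usage sketch; Environment is assumed to be the grid-world
# class shipped with the book's Chapter 2 code, not defined in this post.
from environment import Environment

grid = [
    [0, 0, 0, 1],   # 1: goal cell (reward +1)
    [0, 9, 0, -1],  # 9: blocked cell, -1: trap cell (reward -1)
    [0, 0, 0, 0],   # 0: ordinary cell
]
env = Environment(grid, move_prob=0.8)  # agent moves as intended with prob 0.8

planner = PolicyIterationPlanner(env)
v_grid = planner.plan(gamma=0.9, threshold=0.0001)
for row in v_grid:
    print([round(v, 3) for v in row])

After plan() returns, planner.policy holds the greedy (deterministic) policy found at convergence, and planner.log keeps one value grid per iteration, which is handy for visualizing how the estimates evolve.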