《用Python动手学习强化学习》【策略迭代】【动态规划】
# Code source: the book "Learn Reinforcement Learning Hands-On with Python"
# (title translated), chapter 2: policy learning based on dynamic
# programming (policy iteration).
class PolicyIterationPlanner(Planner):
    """Planner that derives a policy by policy iteration.

    The policy is stored as a nested mapping ``self.policy[state][action]``
    holding the probability of taking ``action`` in ``state``.

    NOTE(review): ``Planner`` is defined outside this excerpt. This class
    relies on it for ``self.env``, ``self.log``, ``transitions_at`` and
    ``dict_to_grid`` -- confirm against the base class.
    """

    def __init__(self, env):
        """Create the planner for ``env`` with an empty policy."""
        super().__init__(env)
        self.policy = {}

    def initialize(self):
        """Reset the planner and set a uniform random policy.

        Every action in ``self.env.actions`` gets the same probability
        in every state of ``self.env.states``.
        """
        super().initialize()
        self.policy = {}
        actions = self.env.actions
        states = self.env.states
        for s in states:
            self.policy[s] = {}
            for a in actions:
                # Initialize the policy: at the start every action is
                # equally likely.
                self.policy[s][a] = 1 / len(actions)

    def estimate_by_policy(self, gamma, threshold):
        """Evaluate the current policy.

        Repeatedly sweeps over all states, replacing each state's value
        with the expected discounted reward under ``self.policy``, until
        the largest change in one sweep is below ``threshold``.

        Args:
            gamma: Discount factor applied to the next state's value.
            threshold: Convergence bound on the largest per-sweep change.

        Returns:
            A dict mapping each state to its estimated value.
        """
        V = {}
        for s in self.env.states:
            # Initialize the expected reward of every state.
            V[s] = 0

        while True:
            delta = 0
            for s in V:
                expected_rewards = []
                for a in self.policy[s]:
                    action_prob = self.policy[s][a]
                    r = 0
                    for prob, next_state, reward in self.transitions_at(s, a):
                        r += action_prob * prob * \
                            (reward + gamma * V[next_state])
                    expected_rewards.append(r)
                value = sum(expected_rewards)
                delta = max(delta, abs(value - V[s]))
                # Updated in place, so later states in the same sweep
                # already see this new value.
                V[s] = value
            if delta < threshold:
                break

        return V

    def plan(self, gamma=0.9, threshold=0.0001):
        """Run policy iteration until the greedy policy stops changing.

        Each round evaluates the current policy, records the value grid
        in ``self.log``, and then makes the policy greedy with respect to
        the evaluated values.

        Args:
            gamma: Discount factor applied to the next state's value.
            threshold: Convergence bound passed to ``estimate_by_policy``.

        Returns:
            The final state values converted by ``self.dict_to_grid``.
        """
        self.initialize()
        states = self.env.states
        actions = self.env.actions

        def take_max_action(action_value_dict):
            # Key with the largest value; ties go to the first key in
            # iteration order.
            return max(action_value_dict, key=action_value_dict.get)

        while True:
            update_stable = True
            # Estimate the expected rewards under the current policy.
            V = self.estimate_by_policy(gamma, threshold)
            self.log.append(self.dict_to_grid(V))

            for s in states:
                # Action preferred by the current policy in this state.
                policy_action = take_max_action(self.policy[s])

                # Compare it with every available action.
                action_rewards = {}
                for a in actions:
                    r = 0
                    for prob, next_state, reward in self.transitions_at(s, a):
                        r += prob * (reward + gamma * V[next_state])
                    action_rewards[a] = r
                best_action = take_max_action(action_rewards)
                if policy_action != best_action:
                    update_stable = False

                # Update the policy greedily: best_action gets
                # probability 1, every other action 0.
                for a in self.policy[s]:
                    prob = 1 if a == best_action else 0
                    self.policy[s][a] = prob

            if update_stable:
                # The policy did not change in this round, so stop.
                break

        # Convert the value dictionary into a two-dimensional array.
        V_grid = self.dict_to_grid(V)
        return V_grid
页:
[1]