Code source: 《用Python动手学习强化学习》 (Hands-On Reinforcement Learning with Python), Chapter 2: value learning based on dynamic programming (value iteration).
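Value iteration repeatedly applies the Bellman optimality backup, V(s) <- max_a Σ_{s'} P(s'|s,a) * (R(s') + γ * V(s')), sweeping over all states until the largest change in a sweep (delta) falls below the given threshold. The planner below implements this on top of the book's grid-world Environment, which it expects to provide reset, states, actions, transit_func, reward_func, can_action_at, row_length and column_length.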
class Planner():

    def __init__(self, env):
        self.env = env
        self.log = []

    def initialize(self):
        self.env.reset()
        self.log = []

    def plan(self, gamma=0.9, threshold=0.0001):
        raise NotImplementedError("Planner subclasses must implement the plan method.")

    def transitions_at(self, state, action):
        # Yield (probability, next_state, reward) triples for taking
        # `action` in `state`, based on the environment's transition function.
        transition_probs = self.env.transit_func(state, action)
        for next_state in transition_probs:
            prob = transition_probs[next_state]
            reward, _ = self.env.reward_func(next_state)
            yield prob, next_state, reward

    def dict_to_grid(self, state_reward_dict):
        # Convert a {state: value} dict into a 2-D grid (list of rows)
        # matching the environment's layout.
        grid = []
        for i in range(self.env.row_length):
            row = [0] * self.env.column_length
            grid.append(row)
        for s in state_reward_dict:
            grid[s.row][s.column] = state_reward_dict[s]
        return grid


class ValueIterationPlanner(Planner):

    def __init__(self, env):
        super().__init__(env)

    def plan(self, gamma=0.9, threshold=0.0001):
        self.initialize()
        actions = self.env.actions
        V = {}
        for s in self.env.states:
            V[s] = 0  # initialize all state values to zero

        while True:
            delta = 0
            self.log.append(self.dict_to_grid(V))  # record V for later visualization
            for s in V:
                if not self.env.can_action_at(s):
                    continue
                expected_rewards = []
                for a in actions:
                    r = 0
                    for prob, next_state, reward in self.transitions_at(s, a):
                        r += prob * (reward + gamma * V[next_state])
                    expected_rewards.append(r)
                max_reward = max(expected_rewards)  # Bellman optimality backup
                delta = max(delta, abs(max_reward - V[s]))
                V[s] = max_reward
            if delta < threshold:  # stop when the largest update is small enough
                break

        V_grid = self.dict_to_grid(V)
        return V_grid
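A minimal usage sketch, assuming the book's Environment class, whose constructor takes a grid layout where 0 marks an ordinary cell, 9 a blocked cell, and 1/-1 the reward/penalty cells (the constructor signature and grid encoding are assumptions based on the book's example, not shown in this post):

# Usage sketch: build a small grid world, run value iteration, print the values.
grid = [
    [0, 0, 0, 1],
    [0, 9, 0, -1],
    [0, 0, 0, 0],
]
env = Environment(grid)                      # assumed constructor from the book
planner = ValueIterationPlanner(env)
v_grid = planner.plan(gamma=0.9, threshold=0.0001)
for row in v_grid:
    print([round(v, 3) for v in row])        # estimated value of each cell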