《用Python动手学习强化学习》 [Value Iteration] [Dynamic Programming]
Code source: 《用Python动手学习强化学习》, Chapter 2: value approximation learning based on dynamic programming (value iteration).
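The plan method below performs the value iteration update: every state's value is repeatedly replaced by the best expected one-step return, and the loop stops once the largest change in a sweep (delta) drops below threshold. Since the code looks up the reward of the next state, the update it computes is

$$V(s) \leftarrow \max_{a} \sum_{s'} T(s' \mid s, a)\,\bigl[R(s') + \gamma V(s')\bigr]$$

where T is the transition probability given by env.transit_func and gamma is the discount factor.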
class Planner():

    def __init__(self, env):
        self.env = env
        self.log = []

    def initialize(self):
        self.env.reset()
        self.log = []

    def plan(self, gamma=0.9, threshold=0.0001):
        raise Exception("Planner has to implement plan method.")

    def transitions_at(self, state, action):
        # Yield (probability, next_state, reward) for every reachable next state.
        transition_probs = self.env.transit_func(state, action)
        for next_state in transition_probs:
            prob = transition_probs[next_state]
            reward, _ = self.env.reward_func(next_state)
            yield prob, next_state, reward

    def dict_to_grid(self, state_reward_dict):
        # Convert a {state: value} dict into a 2D list matching the grid layout.
        grid = []
        for i in range(self.env.row_length):
            row = [0] * self.env.column_length
            grid.append(row)
        for s in state_reward_dict:
            grid[s.row][s.column] = state_reward_dict[s]
        return grid
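The Planner relies on an environment object exposing reset, states, actions, can_action_at, transit_func, reward_func, row_length and column_length, but that class is not included in this post. The following is only a minimal stand-in sketch so the code can be run end to end; the slip model (move_prob), the default step reward and the cell encoding are assumptions, not necessarily the book's exact implementation.

from collections import namedtuple
from enum import Enum

State = namedtuple("State", ["row", "column"])

class Action(Enum):
    UP = 1
    DOWN = -1
    LEFT = 2
    RIGHT = -2

class Environment:
    # Assumed stand-in for the book's environment class.
    # Cell encoding: 0 = ordinary, 1 = goal (+1), -1 = trap (-1), 9 = wall.
    def __init__(self, grid, move_prob=0.8):
        self.grid = grid
        self.move_prob = move_prob
        self.default_reward = -0.04

    @property
    def row_length(self):
        return len(self.grid)

    @property
    def column_length(self):
        return len(self.grid[0])

    @property
    def actions(self):
        return [Action.UP, Action.DOWN, Action.LEFT, Action.RIGHT]

    @property
    def states(self):
        # All non-wall cells are states.
        return [State(r, c)
                for r in range(self.row_length)
                for c in range(self.column_length)
                if self.grid[r][c] != 9]

    def reset(self):
        pass  # the planner only needs the model, not an agent position

    def can_action_at(self, state):
        return self.grid[state.row][state.column] == 0

    def _move(self, state, action):
        row, col = state.row, state.column
        if action == Action.UP:
            row -= 1
        elif action == Action.DOWN:
            row += 1
        elif action == Action.LEFT:
            col -= 1
        elif action == Action.RIGHT:
            col += 1
        # Bounce back if the move leaves the grid or hits a wall.
        if not (0 <= row < self.row_length and 0 <= col < self.column_length):
            return state
        if self.grid[row][col] == 9:
            return state
        return State(row, col)

    def transit_func(self, state, action):
        # Intended action with probability move_prob; otherwise slip to one of
        # the two perpendicular directions.
        transition_probs = {}
        if not self.can_action_at(state):
            return transition_probs
        opposite = Action(action.value * -1)
        for a in self.actions:
            if a == action:
                prob = self.move_prob
            elif a != opposite:
                prob = (1 - self.move_prob) / 2
            else:
                continue
            next_state = self._move(state, a)
            transition_probs[next_state] = transition_probs.get(next_state, 0) + prob
        return transition_probs

    def reward_func(self, state):
        cell = self.grid[state.row][state.column]
        if cell == 1:
            return 1, True     # goal
        if cell == -1:
            return -1, True    # trap
        return self.default_reward, False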
class ValueIterationPlanner(Planner):

    def __init__(self, env):
        super().__init__(env)

    def plan(self, gamma=0.9, threshold=0.0001):
        self.initialize()
        actions = self.env.actions
        V = {}
        for s in self.env.states:
            # Initialize the value of every state to 0.
            V[s] = 0

        while True:
            delta = 0
            self.log.append(self.dict_to_grid(V))
            for s in V:
                if not self.env.can_action_at(s):
                    continue
                expected_rewards = []
                for a in actions:
                    r = 0
                    for prob, next_state, reward in self.transitions_at(s, a):
                        r += prob * (reward + gamma * V[next_state])
                    expected_rewards.append(r)
                max_reward = max(expected_rewards)
                delta = max(delta, abs(max_reward - V[s]))
                V[s] = max_reward
            # Stop once the largest value change in a sweep is below the threshold.
            if delta < threshold:
                break

        V_grid = self.dict_to_grid(V)
        return V_grid
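With a stand-in like the Environment sketch above (or the book's own environment class), the planner can be exercised as follows; the specific grid layout here is just an illustrative assumption.

if __name__ == "__main__":
    grid = [
        [0, 0, 0, 1],    # 1: goal cell (+1 reward)
        [0, 9, 0, -1],   # 9: wall, -1: trap cell (-1 reward)
        [0, 0, 0, 0],
    ]
    env = Environment(grid)
    planner = ValueIterationPlanner(env)
    v_grid = planner.plan(gamma=0.9, threshold=0.0001)
    for row in v_grid:
        print([round(v, 3) for v in row])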