import numpy as np
import pandas as pd
import time
def create_environment(state_x, state_y, episode, step_counter, table):
    """Print the current state of the grid game to stdout.

    When ``state_x`` equals the sentinel value 1000, a one-line summary with
    the 1-based episode number and ``step_counter`` is printed, held for two
    seconds and then overwritten.  Otherwise the 4x4 board is printed row by
    row with ``'o'`` written at ``[state_x][state_y]``, followed by a pause
    of ``Fresh_time`` seconds (a module-level setting).

    ``table`` is accepted but not used by this function.
    """
    if state_x == 1000:
        summary = 'episode %d: total step_step_move: %d' % (episode + 1, step_counter)
        print('\r{}'.format(summary), end='')
        time.sleep(2)
        # Overwrite the summary line before the next episode starts drawing.
        print('\r ', end='')
        return

    board = [
        ['#', '#', '#', '#'],
        ['#', '&', '&', '#'],
        ['#', '&', '@', '#'],
        ['#', '#', '#', '#'],
    ]
    # Mark the agent's cell.
    board[state_x][state_y] = 'o'

    print("***************new way******************")
    print("\n")
    for row in board:
        print(row)
    print("\n")
    time.sleep(Fresh_time)
def _wrap_coordinate(value, size):
    """Return ``value`` folded back into ``0 .. size-1`` (too large -> 0, negative -> size-1)."""
    if value >= size:
        return 0
    if value < 0:
        return size - 1
    return value


def get_evn_feedback(state_x, state_y, A):
    """Apply action ``A`` at cell ``(state_x, state_y)``.

    Returns a tuple ``(next_state_x, next_state_y, R)`` where ``R`` is the
    reward.  The grid size is read from the module-level ``N_state_x`` and
    ``N_state_y``.  The rules are checked in this order:

    1. The given position is outside the grid: ``R = -1`` and every offending
       coordinate is folded back onto the grid (too large -> 0, negative ->
       last index).  The action itself is not applied.
    2. The position is a trap cell, one of (1, 1), (1, 2), (2, 1):
       ``R = -1`` and the agent moves one step in the direction of ``A``
       (no wrap-around is applied).
    3. The position is the goal cell (2, 2): ``R = 1`` and both coordinates
       become the terminal sentinel 1000.
    4. Any other cell: ``R = 0`` and the agent moves one step in the
       direction of ``A``, wrapping around at the grid edges.

    Directions: ``'up'`` decreases ``state_x``, ``'left'`` decreases
    ``state_y``, ``'right'`` increases ``state_y``; any other value of ``A``
    is treated as "down" and increases ``state_x``.
    """
    trap_cells = ((1, 1), (1, 2), (2, 1))
    goal_cell = (2, 2)
    terminal = 1000

    # One direction table for every branch, so a plain cell and a trap cell
    # agree on what 'left' and 'right' mean (right is +1 on state_y, the same
    # convention get_neibor_q uses for its neighbour order).
    steps = {'right': (0, 1), 'left': (0, -1), 'up': (-1, 0)}
    step_x, step_y = steps.get(A, (1, 0))

    inside_grid = 0 <= state_x < N_state_x and 0 <= state_y < N_state_y
    if not inside_grid:
        return (_wrap_coordinate(state_x, N_state_x),
                _wrap_coordinate(state_y, N_state_y),
                -1)

    if (state_x, state_y) in trap_cells:
        return state_x + step_x, state_y + step_y, -1

    if (state_x, state_y) == goal_cell:
        return terminal, terminal, 1

    # Ordinary cell: no reward, step with wrap-around at the borders.
    return (_wrap_coordinate(state_x + step_x, N_state_x),
            _wrap_coordinate(state_y + step_y, N_state_y),
            0)
def build_q_table(N_state_x, N_state_y):
    """Return a new ``N_state_x`` by ``N_state_y`` NumPy array filled with zeros."""
    grid_shape = (N_state_x, N_state_y)
    return np.zeros(grid_shape)
#print(build_q_table(N_state,Action))
def choose_action(state_x, state_y, q_table):
    """Pick the next action for the agent at ``(state_x, state_y)``.

    The values of the four neighbouring cells are obtained from
    ``get_neibor_q`` (a value of -100 marks a direction that leaves the
    grid).  The choice is then made in one of two ways:

    * Random: when a uniform random draw exceeds the module-level
      ``Greedy_plicy``, or when every reachable neighbour still has the
      value 0, an action is drawn uniformly from the directions that are
      not blocked.
    * Greedy: otherwise the action whose neighbour has the highest value is
      returned (the first one in ``Action`` order on ties).

    Returns an entry of the module-level ``Action`` list.

    Raises:
        ValueError: if all four directions are blocked, in which case no
            action could ever be drawn.
    """
    blocked = -100
    neighbour_scores, _ = get_neibor_q(state_x, state_y, q_table)

    if all(score == blocked for score in neighbour_scores):
        # Without this guard the random draw below would loop forever.
        raise ValueError(
            'no direction is available from state (%s, %s)' % (state_x, state_y)
        )

    # True while nothing has been learned about any reachable neighbour.
    nothing_learned = all(
        score == 0 or score == blocked for score in neighbour_scores
    )

    # The uniform draw is evaluated first on purpose: it keeps the sequence of
    # random numbers consumed per call stable for a fixed seed.
    if np.random.uniform() > Greedy_plicy or nothing_learned:
        # Blocked directions are replaced by a placeholder and re-drawn when
        # hit, so the pick is uniform over the directions that stay on the grid.
        candidates = [
            'prohibit' if score == blocked else name
            for name, score in zip(Action, neighbour_scores)
        ]
        action_name = np.random.choice(candidates)
        while action_name == 'prohibit':
            action_name = np.random.choice(candidates)
    else:
        # A blocked direction scores -100 and therefore cannot be the maximum
        # as long as at least one direction is open.
        best_index = neighbour_scores.index(max(neighbour_scores))
        action_name = Action[best_index]
    return action_name
def get_neibor_q(state_x, state_y, q_table):
    """Look up the table values of the four cells adjacent to ``(state_x, state_y)``.

    Returns ``(scores, positions)``, two lists of length four ordered
    up, down, left, right (``state_x - 1``, ``state_x + 1``, ``state_y - 1``,
    ``state_y + 1``).  ``positions`` holds ``[x, y]`` pairs and ``scores`` the
    matching ``q_table[x, y]`` entries.  A neighbour that falls outside the
    ``N_state_x`` by ``N_state_y`` grid is reported with the score -100 and
    the position ``[-100, -100]``, meaning that direction cannot be taken.
    """
    offsets = ((-1, 0), (1, 0), (0, -1), (0, 1))  # up, down, left, right
    neighbour_scores = []
    neighbour_cells = []
    for step_x, step_y in offsets:
        cell_x = state_x + step_x
        cell_y = state_y + step_y
        if 0 <= cell_x < N_state_x and 0 <= cell_y < N_state_y:
            neighbour_cells.append([cell_x, cell_y])
            neighbour_scores.append(q_table[cell_x, cell_y])
        else:
            # The current cell sits on the border; this direction is closed.
            neighbour_cells.append([-100, -100])
            neighbour_scores.append(-100)
    return neighbour_scores, neighbour_cells
def reforcement_learning():
    """Run the training loop and return the learned value table.

    A zero table is created with ``build_q_table`` and ``max_episodes``
    episodes are played.  Every episode starts at cell (0, 0) and ends when
    ``get_evn_feedback`` returns the terminal sentinel 1000 as the next
    ``state_x``.  After each action the entry of the cell the agent acted
    from is moved towards a target by a fraction ``Learning_rate``:

    * non-terminal step: ``R + discount_factor * max(neighbour values)``
    * terminal step: ``R`` alone, because there is no following state.

    The board is redrawn through ``create_environment`` at the start of each
    episode and after every step.

    Returns:
        The NumPy array built by ``build_q_table`` after all updates.
    """
    terminal = 1000
    table = build_q_table(N_state_x, N_state_y)
    for episode in range(max_episodes):
        state_x, state_y = 0, 0
        step_counter = 0
        create_environment(state_x, state_y, episode, step_counter, table)
        while state_x != terminal:
            A = choose_action(state_x, state_y, table)
            state_x_, state_y_, R = get_evn_feedback(state_x, state_y, A)
            q_predict = table[state_x, state_y]
            if state_x_ != terminal:
                # NOTE(review): the bootstrap term uses the neighbours of the
                # cell the agent acted from, not of the cell it moved into -
                # confirm this is the intended target.
                score_of_4_dir, _ = get_neibor_q(state_x, state_y, table)
                reality = R + discount_factor * max(score_of_4_dir)
            else:
                # Episode finished: no next state, so the target is the reward.
                reality = R
            # The update must index the cell the agent acted from.  The
            # sentinel coordinates are only adopted afterwards; indexing the
            # table with 1000 would be out of range.
            table[state_x, state_y] += Learning_rate * (reality - q_predict)
            state_x, state_y = state_x_, state_y_
            step_counter += 1
            create_environment(state_x, state_y, episode, step_counter, table)
    return table
# Grid and learning configuration.  These names are read as globals by the
# functions in this file, so they are bound at module level: that way the
# functions also work when the file is imported instead of run as a script.
N_state_x = 4  # grid size along the first (state_x) axis
N_state_y = 4  # grid size along the second (state_y) axis
Action = ['up', 'down', 'left', 'right']
Greedy_plicy = 0.9  # share of steps that pick the best-valued action; the rest are random
Learning_rate = 0.1  # step size of each table update
discount_factor = 0.9  # decay applied to future reward
max_episodes = 50  # number of episodes to play
Fresh_time = 0.3  # seconds to pause after drawing each step

if __name__ == "__main__":
    np.random.seed(2)  # fixed seed so a run is repeatable
    q_table = reforcement_learning()
    print(q_table)