Author: hhh5460
Article URL:
Special thanks: the three figures in this article all come from Morvan Zhou's tutorial https://morvanzhou.github.io/
pandas is built on top of numpy, but the two are operated on quite differently, so the implementation details of the above algorithms differ between them. This post records those differences.
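To make the difference concrete before diving in, here is a minimal sketch (not code from the post itself; the names Q_df and Q_np are only illustrative) of the same Q-table creation and one-cell update written both ways:

import numpy as np
import pandas as pd

states = range(6)
actions = ['left', 'right']

# pandas: the Q table is a DataFrame, addressed by state label and action name
Q_df = pd.DataFrame(0.0, index=states, columns=actions)
Q_df.loc[2, 'right'] += 0.1 * (1 + 0.9 * Q_df.loc[3].max() - Q_df.loc[2, 'right'])

# numpy: the Q table is a plain 2-D array, addressed by state number and action index
Q_np = np.zeros((len(states), len(actions)))
Q_np[2, actions.index('right')] += 0.1 * (1 + 0.9 * Q_np[3].max() - Q_np[2, actions.index('right')])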
A few notes:
1). To keep the discussion as clear as possible, the simplest example (Example 1) is used.
2). The environment and the agent are separated, using class-based programming.
3). The variables and functions of the environment and the agent have been repositioned so that the Agent class needs no changes at all!
4). The interaction logic between the agent and the environment is closer to how it works in practice.
0. Demo
I. pandas implementation
1. Q-learning
class RLQLearning(Agent):
    '''Subclass of Agent'''
    def __init__(self, env):
        super().__init__(env)

    def learn(self, alpha=0.01, gamma=0.9, episode=100, epsilon=0.4):
        '''Learn'''
        print('Q-learning algorithm')
        for _ in range(episode):
            s = self.env.reset()
            is_win = False
            while not is_win:
                a = self.observe(s, epsilon)
                r, s1, is_win = self.env.step(a)
                self.Q.loc[s, a] += alpha * (r + gamma * self.Q.loc[s1, self.env.get_valid_actions(s1)].max() - self.Q.loc[s, a])
                s = s1
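The core of the class is the single update line. Written out as a standalone helper (a hypothetical function for illustration only, mirroring the line above), the Q-learning target bootstraps off the best valid action in the next state, regardless of which action is actually taken next (off-policy):

import pandas as pd

def q_learning_update(Q, s, a, r, s1, valid_actions_s1, alpha=0.01, gamma=0.9):
    '''One Q-learning backup on a pandas Q table (illustrative helper, not part of the post's classes).'''
    target = r + gamma * Q.loc[s1, valid_actions_s1].max()   # off-policy: best valid action in s1
    Q.loc[s, a] += alpha * (target - Q.loc[s, a])

# tiny demo on a fresh table
Q = pd.DataFrame(0.0, index=range(6), columns=['left', 'right'])
q_learning_update(Q, s=4, a='right', r=1, s1=5, valid_actions_s1=['left'])
print(Q.loc[4, 'right'])   # 0.01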
2. SARSA
class RLSaras(Agent):
    '''Subclass of Agent'''
    def __init__(self, env):
        super().__init__(env)

    def learn(self, alpha=0.01, gamma=0.9, episode=100, epsilon=0.4):
        '''Learn'''
        print('SARSA algorithm')
        for _ in range(episode):
            s = self.env.reset()
            a = self.observe(s, epsilon)
            is_win = False
            while not is_win:
                r, s1, is_win = self.env.step(a)
                a1 = self.observe(s1, epsilon)
                self.Q.loc[s, a] += alpha * (r + gamma * self.Q.loc[s1, a1] - self.Q.loc[s, a])
                s, a = s1, a1
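By contrast with Q-learning, the SARSA backup uses the action a1 that the policy actually chose in s1 (on-policy). A sketch in the same hypothetical-helper style as above:

def sarsa_update(Q, s, a, r, s1, a1, alpha=0.01, gamma=0.9):
    '''One SARSA backup on a pandas Q table (illustrative helper, not part of the post's classes).'''
    target = r + gamma * Q.loc[s1, a1]   # on-policy: the action actually selected in s1
    Q.loc[s, a] += alpha * (target - Q.loc[s, a])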
3. SARSA(lambda)
class RLSarasLambda(Agent):
    '''Subclass of Agent'''
    def __init__(self, env):
        super().__init__(env)
        self.E = self.Q.copy()  # copy of the Q table (eligibility traces)

    def learn(self, alpha=0.01, gamma=0.9, lambda_=0.9, episode=100, epsilon=0.4):
        '''Learn'''
        print('SARSA(lambda) algorithm; lambda_ is the trace-decay rate')
        for _ in range(episode):
            self.E *= 0
            s = self.env.reset()
            a = self.observe(s, epsilon)
            is_win = False
            while not is_win:
                r, s1, is_win = self.env.step(a)
                a1 = self.observe(s1, epsilon)
                delta = r + gamma * self.Q.loc[s1, a1] - self.Q.loc[s, a]
                #self.E.loc[s, a] += 1  # works less well than the next two lines
                self.E.loc[s] *= 0
                self.E.loc[s, a] = 1
                for s_ in self.env.states:
                    for a_ in self.env.actions:
                        self.Q.loc[s_, a_] += alpha * delta * self.E.loc[s_, a_]
                        self.E.loc[s_, a_] *= gamma * lambda_
                s, a = s1, a1
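Here self.E is the eligibility-trace table. The commented-out line is the accumulating-trace variant; the two lines the author keeps are a replacing trace that also clears the other action of the visited state. A small sketch of the two variants, assuming a pandas E table of the same shape as Q (the helper name is only illustrative):

def mark_trace(E, s, a, variant='replacing'):
    '''Update the eligibility trace of the visited (s, a) pair (illustrative helper).'''
    if variant == 'accumulating':
        E.loc[s, a] += 1      # the commented-out line above
    else:                     # 'replacing', the variant used in the post
        E.loc[s] *= 0
        E.loc[s, a] = 1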
4. Complete code
import pandas as pd
import random
import time


'''
-o---T
# T is the position of the treasure, o is the position of the explorer
'''

# Author: hhh5460
# Date: 20181221
# Place: Tai Zi Miao

class Env(object):
    '''Environment'''
    def __init__(self):
        '''Initialize'''
        self.board = list('-----T')
        self.states = range(6)
        self.actions = ['left', 'right']
        self.rewards = [0, 0, 0, 0, 0, 1]

    def get_valid_actions(self, state):
        '''Get all legal actions in the current state'''
        valid_actions = []
        if state != 5:  # every state (position) except the last can move right
            valid_actions.append('right')
        if state != 0:  # every state (position) except the first can move left
            valid_actions.append('left')
        return valid_actions

    def _step(self, action):
        '''Execute the action and reach the new state'''
        if action == 'right' and self.state != self.states[-1]:  # move right (+1), except in the last state
            self.state += 1
        elif action == 'left' and self.state != self.states[0]:  # move left (-1), except in the first state
            self.state -= 1

    def reset(self):
        '''Reset the environment and return state 0'''
        self.board = list('-----T')
        self.state = 0
        self.board[self.state] = 'o'
        print('\r ', end='')
        print('\r{}'.format(''.join(self.board)), end='')
        return self.state

    def step(self, action, step_time=0.1):
        '''Execute the action; return reward, new state, win flag'''
        self.board[self.state] = '-'   # erase the old position 'o'
        self._step(action)             # reach the new position
        self.board[self.state] = 'o'   # draw the new position

        reward = self.rewards[self.state]        # reward
        is_win = self.state == self.states[-1]   # win flag
        if is_win:
            print('\r{} WIN!'.format(''.join(self.board)), end='')  # close-up shot on victory
        else:
            print('\r{}'.format(''.join(self.board)), end='')
        time.sleep(step_time)

        return reward, self.state, is_win


class Agent(object):
    '''Agent'''
    def __init__(self, env):
        '''Initialize'''
        # environment
        self.env = env
        # brain (Q table); initialized with floats so the updates keep a float dtype
        self.Q = pd.DataFrame(data=[[0. for _ in self.env.actions] for _ in self.env.states],
                              index=self.env.states,
                              columns=self.env.actions)

    def observe(self, state, epsilon=0.4):
        '''Observe'''
        # choose an action for the current state according to some policy
        if random.uniform(0, 1) < epsilon:  # greedy
            s = self.Q.loc[state].filter(items=self.env.get_valid_actions(state))
            action = random.choice(s[s == s.max()].index)  # there may be several maxima!
        else:  # explore
            action = random.choice(self.env.get_valid_actions(state))
        return action

    def learn(self, *args, **kw):
        '''Learn'''
        pass

    def play(self, step_time=0.5):
        '''Play'''
        # having learned enough
        s = self.env.reset()
        is_win = False
        while not is_win:
            a = self.observe(s, epsilon=1.)  # epsilon=1.: 100% greedy, i.e. pure exploitation
            _, s1, is_win = self.env.step(a, step_time)
            s = s1
        print()

class RLQLearning(Agent):
    '''Subclass of Agent'''
    def __init__(self, env):
        super().__init__(env)

    def learn(self, alpha=0.01, gamma=0.9, episode=100, epsilon=0.4):
        '''Learn'''
        print('Q-learning algorithm')
        for _ in range(episode):
            s = self.env.reset()
            is_win = False
            while not is_win:
                a = self.observe(s, epsilon)
                r, s1, is_win = self.env.step(a)
                self.Q.loc[s, a] += alpha * (r + gamma * self.Q.loc[s1, self.env.get_valid_actions(s1)].max() - self.Q.loc[s, a])
                s = s1

class RLSaras(Agent):
    '''Subclass of Agent'''
    def __init__(self, env):
        super().__init__(env)

    def learn(self, alpha=0.01, gamma=0.9, episode=100, epsilon=0.4):
        '''Learn'''
        print('SARSA algorithm')
        for _ in range(episode):
            s = self.env.reset()
            a = self.observe(s, epsilon)
            is_win = False
            while not is_win:
                r, s1, is_win = self.env.step(a)
                a1 = self.observe(s1, epsilon)
                self.Q.loc[s, a] += alpha * (r + gamma * self.Q.loc[s1, a1] - self.Q.loc[s, a])
                s, a = s1, a1

class RLSarasLambda(Agent):
    '''Subclass of Agent'''
    def __init__(self, env):
        super().__init__(env)
        self.E = self.Q.copy()  # copy of the Q table (eligibility traces)

    def learn(self, alpha=0.01, gamma=0.9, lambda_=0.9, episode=100, epsilon=0.4):
        '''Learn'''
        print('SARSA(lambda) algorithm; lambda_ is the trace-decay rate')
        for _ in range(episode):
            self.E *= 0
            s = self.env.reset()
            a = self.observe(s, epsilon)
            is_win = False
            while not is_win:
                r, s1, is_win = self.env.step(a)
                a1 = self.observe(s1, epsilon)
                delta = r + gamma * self.Q.loc[s1, a1] - self.Q.loc[s, a]
                #self.E.loc[s, a] += 1  # works less well than the next two lines
                self.E.loc[s] *= 0
                self.E.loc[s, a] = 1
                for s_ in self.env.states:
                    for a_ in self.env.actions:
                        self.Q.loc[s_, a_] += alpha * delta * self.E.loc[s_, a_]
                        self.E.loc[s_, a_] *= gamma * lambda_
                s, a = s1, a1


if __name__ == '__main__':
    env = Env()                   # environment

    agent = RLQLearning(env)      # agent
    agent.learn(episode=13)       # learn first
    agent.play()                  # then play

    agent2 = RLSaras(env)         # agent 2
    agent2.learn(episode=13)      # learn first
    agent2.play()                 # then play

    agent3 = RLSarasLambda(env)   # agent 3
    agent3.learn(episode=13)      # learn first
    agent3.play()                 # then play
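A side note on the SARSA(lambda) inner loop: because self.Q and self.E are DataFrames with the same index and columns, the two nested for-loops at the end of RLSarasLambda.learn could equivalently be written as two elementwise operations. This is only an alternative sketch, not the code the post actually uses:

# inside RLSarasLambda.learn, in place of the nested loops over states and actions
self.Q += alpha * delta * self.E   # every cell is nudged in proportion to its own trace
self.E *= gamma * lambda_          # then every trace decays by gamma * lambda_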
II. numpy implementation
1. Q-learning
2. SARSA
3. SARSA(lambda)
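The three classes keep exactly the same structure as in the pandas section; the only change is that the Q table is now a 2-D numpy array indexed by integers, so actions travel as indices and every pandas-style Q.loc[s, a] lookup becomes plain array indexing. For example, in the complete code below the Q-learning backup reads

self.Q[s, a] += alpha * (r + gamma * self.Q[s1, self.env.get_valid_actions(s1)].max() - self.Q[s, a])

and the SARSA backup becomes

self.Q[s, a] += alpha * (r + gamma * self.Q[s1, a1] - self.Q[s, a])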
4. Complete code
import numpy as np
import time


'''
-o---T
# T is the position of the treasure, o is the position of the explorer
'''

# Author: hhh5460
# Date: 20181221
# Place: Tai Zi Miao

class Env(object):
    '''Environment'''
    def __init__(self):
        '''Initialize'''
        self.board = list('-----T')
        self.states = range(6)
        self.actions = ['left', 'right']  # indices [0, 1]
        self.rewards = [0, 0, 0, 0, 0, 1]

    def get_valid_actions(self, state):
        '''Get all legal actions (as indices) in the current state'''
        valid_actions = []
        if state != self.states[0]:   # every state (position) except the first can move left
            valid_actions.append(self.actions.index('left'))
        if state != self.states[-1]:  # every state (position) except the last can move right
            valid_actions.append(self.actions.index('right'))
        return valid_actions

    def _step(self, action):
        '''Execute the action (an index) and reach the new state'''
        if self.actions[action] == 'left' and self.state > self.states[0]:       # move left (-1), except in the first state
            self.state = self.state - 1
        elif self.actions[action] == 'right' and self.state < self.states[-1]:   # move right (+1), except in the last state
            self.state = self.state + 1

    def reset(self):
        '''Reset the environment and return state 0'''
        self.board = list('-----T')
        self.state = 0
        self.board[self.state] = 'o'
        print('\r ', end='')
        print('\r{}'.format(''.join(self.board)), end='')
        return self.state

    def step(self, action, step_time=0.1):
        '''Execute the action; return reward, new state, win flag'''
        self.board[self.state] = '-'   # erase the old position 'o'
        self._step(action)             # reach the new position
        self.board[self.state] = 'o'   # draw the new position

        reward = self.rewards[self.state]        # reward
        is_win = self.state == self.states[-1]   # win flag
        if is_win:
            print('\r{} WIN!'.format(''.join(self.board)), end='')  # close-up shot on victory
        else:
            print('\r{}'.format(''.join(self.board)), end='')
        time.sleep(step_time)

        return reward, self.state, is_win


class Agent(object):
    '''Agent'''
    def __init__(self, env):
        '''Initialize'''
        # environment
        self.env = env
        # brain (Q table)
        self.Q = np.zeros((len(self.env.states), len(self.env.actions)), dtype=np.float32)

    def observe(self, state, epsilon=0.8):
        '''Observe'''
        # choose an action (index) for the current state according to some policy
        valid_actions = self.env.get_valid_actions(state)
        arr = self.Q[state, valid_actions]
        if (np.random.uniform() > epsilon
                or arr.max() == 0
                or len(arr[arr == arr.max()]) > 1):
            action = np.random.choice(valid_actions)  # explore
        else:
            action = self.Q[state].argmax()           # exploit
        return action

    def learn(self, alpha=0.01, gamma=0.9, episode=100, epsilon=0.8):
        '''Learn'''
        pass

    def play(self, step_time=0.5):
        '''Play'''
        # having learned enough
        s = self.env.reset()
        is_win = False
        while not is_win:
            a = self.observe(s, epsilon=1.)  # epsilon=1.: 100% greedy, i.e. pure exploitation
            _, s1, is_win = self.env.step(a, step_time)
            s = s1
        print()

class RLQLearning(Agent):
    '''Subclass of Agent'''
    def __init__(self, env):
        '''Initialize'''
        super().__init__(env)

    def learn(self, alpha=0.01, gamma=0.9, episode=100, epsilon=0.8):
        '''Learn'''
        print('Q-learning algorithm')
        for _ in range(episode):
            s = self.env.reset()
            is_win = False
            while not is_win:
                a = self.observe(s, epsilon)
                r, s1, is_win = self.env.step(a)
                self.Q[s, a] += alpha * (r + gamma * self.Q[s1, self.env.get_valid_actions(s1)].max() - self.Q[s, a])
                s = s1

class RLSaras(Agent):
    '''Subclass of Agent'''
    def __init__(self, env):
        super().__init__(env)

    def learn(self, alpha=0.01, gamma=0.9, episode=100, epsilon=0.4):
        '''Learn'''
        print('SARSA algorithm')
        for _ in range(episode):
            s = self.env.reset()
            a = self.observe(s, epsilon)
            is_win = False
            while not is_win:
                r, s1, is_win = self.env.step(a)
                a1 = self.observe(s1, epsilon)
                self.Q[s, a] += alpha * (r + gamma * self.Q[s1, a1] - self.Q[s, a])
                s, a = s1, a1

class RLSarasLambda(Agent):
    '''Subclass of Agent'''
    def __init__(self, env):
        super().__init__(env)
        self.E = self.Q.copy()  # copy of the Q table (eligibility traces)

    def learn(self, alpha=0.01, gamma=0.9, lambda_=0.9, episode=100, epsilon=0.4):
        '''Learn'''
        print('SARSA(lambda) algorithm; lambda_ is the trace-decay rate')
        for _ in range(episode):
            self.E *= 0
            s = self.env.reset()
            a = self.observe(s, epsilon)
            is_win = False
            while not is_win:
                r, s1, is_win = self.env.step(a)
                a1 = self.observe(s1, epsilon)
                delta = r + gamma * self.Q[s1, a1] - self.Q[s, a]
                #self.E[s, a] += 1  # works less well than the next two lines
                self.E[s] *= 0
                self.E[s, a] = 1
                for s_ in self.env.states:
                    for a_ in range(len(self.env.actions)):  # iterate over action indices!
                        self.Q[s_, a_] += alpha * delta * self.E[s_, a_]
                        self.E[s_, a_] *= gamma * lambda_
                s, a = s1, a1

if __name__ == '__main__':
    env = Env()                   # environment
    agent = RLQLearning(env)      # agent
    agent.learn(episode=13)       # learn first
    agent.play()                  # then play

    agent2 = RLSaras(env)         # agent 2
    agent2.learn(episode=13)      # learn first
    agent2.play()                 # then play

    agent3 = RLSarasLambda(env)   # agent 3
    agent3.learn(episode=13)      # learn first
    agent3.play()                 # then play
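One behavioural difference between the two implementations is worth noting: the pandas observe breaks ties between equally good valid actions by sampling uniformly among the maxima, while the numpy observe falls back to a fully random valid action whenever the maximum is still 0 or there is a tie. If uniform tie-breaking among the maxima is wanted in the numpy version as well, one possible sketch (a hypothetical helper, not the author's code) is:

import numpy as np

def greedy_among_valid(Q, state, valid_actions):
    '''Pick uniformly among the valid actions whose Q-value is maximal (illustrative helper).'''
    valid_actions = np.asarray(valid_actions)
    q = Q[state, valid_actions]
    best = valid_actions[q == q.max()]   # all tied maxima among the valid actions
    return np.random.choice(best)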