import time
import random
# Compared with Q-learning, the result is somewhat worse.
class Env():
    def __init__(self, length, height):
        # define the height and length of the map
        self.length = length
        self.height = height
        # define the agent's start position
        self.x = 0
        self.y = 0

    def render(self, frames=50):
        for i in range(self.height):
            if i == 0:  # the cliff is in line 0
                line = ['S'] + ['x'] * (self.length - 2) + ['T']  # 'S': start, 'T': terminal, 'x': the cliff
            else:
                line = ['.'] * self.length
            if self.x == i:
                line[self.y] = 'o'  # mark the agent's position as 'o'
            print(''.join(line))
        print('\033[' + str(self.height + 1) + 'A')  # move the cursor back to the top-left corner
        time.sleep(1.0 / frames)

    def step(self, action):
        """4 legal actions, 0:up, 1:down, 2:left, 3:right"""
        change = [[0, 1], [0, -1], [-1, 0], [1, 0]]
        self.x = min(self.height - 1, max(0, self.x + change[action][0]))
        self.y = min(self.length - 1, max(0, self.y + change[action][1]))
        states = [self.x, self.y]
        reward = -1  # reward for every step taken
        terminal = False
        if self.x == 0:  # agent is on the cliff line "SxxxxxT"
            if self.y > 0:  # agent is not on the start position
                terminal = True
                if self.y != self.length - 1:  # agent fell into the cliff
                    reward = -100  # reward for falling into the cliff
        return reward, states, terminal

    def reset(self):
        self.x = 0
        self.y = 0


class Q_table():
    def __init__(self, length, height, actions=4, alpha=0.1, gamma=0.9):
        self.table = [0] * actions * length * height  # initialize all Q(s,a) to zero
        self.actions = actions
        self.length = length
        self.height = height
        self.alpha = alpha
        self.gamma = gamma

    def _index(self, a, x, y):
        """Return the index of Q([x,y], a) in Q_table."""
        return a * self.height * self.length + x * self.length + y

    def _epsilon(self):
        return 0.1  # can be tuned
        # version for better convergence:
        # """At the beginning epsilon is 0.2, after 300 episodes it decays to 0.05, and eventually goes to 0."""
        # return 20. / (num_episode + 100)

    def take_action(self, x, y, num_episode):
        """Epsilon-greedy action selection (num_episode is only needed for the decaying-epsilon variant)."""
        if random.random() < self._epsilon():
            return int(random.random() * 4)
        else:
            actions_value = [self.table[self._index(a, x, y)] for a in range(self.actions)]
            return actions_value.index(max(actions_value))

    def epsilon_q(self, x, y):  # changed relative to Q-learning
        actions_value = [self.table[self._index(a, x, y)] for a in range(self.actions)]
        # bootstrap from an epsilon-greedily sampled action value instead of the greedy maximum
        return max(actions_value) if random.random() > self._epsilon() else actions_value[int(random.random() * 4)]

    def update(self, a, s0, s1, r, is_terminated):
        # both s0 and s1 have the form [x, y]
        q_predict = self.table[self._index(a, s0[0], s0[1])]
        if not is_terminated:
            q_target = r + self.gamma * self.epsilon_q(s1[0], s1[1])  # changed relative to Q-learning
        else:
            q_target = r
        self.table[self._index(a, s0[0], s0[1])] += self.alpha * (q_target - q_predict)


def cliff_walk():
    env = Env(length=12, height=4)
    table = Q_table(length=12, height=4)
    for num_episode in range(5000):
        # within the whole learning process
        episodic_reward = 0
        is_terminated = False
        s0 = [0, 0]
        while not is_terminated:
            # within one episode
            action = table.take_action(s0[0], s0[1], num_episode)
            r, s1, is_terminated = env.step(action)
            table.update(action, s0, s1, r, is_terminated)
            episodic_reward += r
            # env.render(frames=100)
            s0 = s1
        if num_episode % 20 == 0:  # print every 20 episodes, matching the log below
            print("Episode: {}, Score: {}".format(num_episode, episodic_reward))
        env.reset()


cliff_walk()
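For comparison with the Q-learning baseline mentioned in the comment at the top: the only line that differs is the bootstrap target. Here update() calls epsilon_q(), which samples the next-state value epsilon-greedily (a Sarsa-style, on-policy target), whereas Q-learning bootstraps from the greedy maximum. A minimal sketch of that variant, assuming the Q_table class defined above (the subclass name Q_learning_table is hypothetical):

class Q_learning_table(Q_table):
    def max_q(self, x, y):
        # greedy bootstrap value: max over a of Q([x, y], a)
        actions_value = [self.table[self._index(a, x, y)] for a in range(self.actions)]
        return max(actions_value)

    def update(self, a, s0, s1, r, is_terminated):
        # Q-learning target: r + gamma * max_a' Q(s1, a'), independent of the exploration policy
        q_predict = self.table[self._index(a, s0[0], s0[1])]
        q_target = r if is_terminated else r + self.gamma * self.max_q(s1[0], s1[1])
        self.table[self._index(a, s0[0], s0[1])] += self.alpha * (q_target - q_predict)

Swapping table = Q_table(length=12, height=4) for table = Q_learning_table(length=12, height=4) in cliff_walk() would give the Q-learning counterpart that the opening comment compares against.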

Episode: 0, Score: -100
Episode: 20, Score: -147
Episode: 40, Score: -48
Episode: 60, Score: -131
Episode: 80, Score: -54
Episode: 100, Score: -63
Episode: 120, Score: -39
Episode: 140, Score: -100
Episode: 160, Score: -38
Episode: 180, Score: -31
Episode: 200, Score: -28
Episode: 220, Score: -25
Episode: 240, Score: -17
Episode: 260, Score: -26
Episode: 280, Score: -103
Episode: 300, Score: -17
Episode: 320, Score: -100
Episode: 340, Score: -17
Episode: 360, Score: -21
Episode: 380, Score: -23
Episode: 400, Score: -19
Episode: 420, Score: -24
Episode: 440, Score: -23
Episode: 460, Score: -100
Episode: 480, Score: -16
Episode: 500, Score: -17
Episode: 520, Score: -28
Episode: 540, Score: -15
Episode: 560, Score: -15
Episode: 580, Score: -17
Episode: 600, Score: -100
Episode: 620, Score: -19
Episode: 640, Score: -19
Episode: 660, Score: -102
Episode: 680, Score: -17
Episode: 700, Score: -16
Episode: 720, Score: -17
Episode: 740, Score: -19
Episode: 760, Score: -115
Episode: 780, Score: -15
Episode: 800, Score: -17
Episode: 820, Score: -16
Episode: 840, Score: -15
Episode: 860, Score: -15
Episode: 880, Score: -17
Episode: 900, Score: -17
Episode: 920, Score: -19
Episode: 940, Score: -17
Episode: 960, Score: -18
Episode: 980, Score: -23
Episode: 1000, Score: -19
Episode: 1020, Score: -18
Episode: 1040, Score: -17
Episode: 1060, Score: -20
Episode: 1080, Score: -17
Episode: 1100, Score: -17
Episode: 1120, Score: -19
Episode: 1140, Score: -21
Episode: 1160, Score: -24
Episode: 1180, Score: -20
Episode: 1200, Score: -21
Episode: 1220, Score: -19
Episode: 1240, Score: -19
Episode: 1260, Score: -17
Episode: 1280, Score: -23
Episode: 1300, Score: -17
Episode: 1320, Score: -15
Episode: 1340, Score: -15
Episode: 1360, Score: -15
Episode: 1380, Score: -20
Episode: 1400, Score: -19
Episode: 1420, Score: -17
Episode: 1440, Score: -15
Episode: 1460, Score: -17
Episode: 1480, Score: -15
Episode: 1500, Score: -15
Episode: 1520, Score: -15
Episode: 1540, Score: -15
Episode: 1560, Score: -18
Episode: 1580, Score: -17
Episode: 1600, Score: -15
Episode: 1620, Score: -20
Episode: 1640, Score: -17
Episode: 1660, Score: -117
Episode: 1680, Score: -21
Episode: 1700, Score: -21
Episode: 1720, Score: -22
Episode: 1740, Score: -18
Episode: 1760, Score: -19
Episode: 1780, Score: -17
Episode: 1800, Score: -19
Episode: 1820, Score: -19
Episode: 1840, Score: -17
Episode: 1860, Score: -20
Episode: 1880, Score: -17
Episode: 1900, Score: -21
Episode: 1920, Score: -17
Episode: 1940, Score: -17
Episode: 1960, Score: -15
Episode: 1980, Score: -17
Episode: 2000, Score: -15
Episode: 2020, Score: -19
Episode: 2040, Score: -17
Episode: 2060, Score: -19
Episode: 2080, Score: -18
Episode: 2100, Score: -17
Episode: 2120, Score: -18
Episode: 2140, Score: -18
Episode: 2160, Score: -17
Episode: 2180, Score: -21
Episode: 2200, Score: -20
Episode: 2220, Score: -21
Episode: 2240, Score: -18
Episode: 2260, Score: -17
Episode: 2280, Score: -17
Episode: 2300, Score: -18
Episode: 2320, Score: -18
Episode: 2340, Score: -17
Episode: 2360, Score: -17
Episode: 2380, Score: -19
Episode: 2400, Score: -18
Episode: 2420, Score: -100
Episode: 2440, Score: -19
Episode: 2460, Score: -23
Episode: 2480, Score: -19
Episode: 2500, Score: -19
Episode: 2520, Score: -18
Episode: 2540, Score: -18
Episode: 2560, Score: -19
Episode: 2580, Score: -21
Episode: 2600, Score: -18
Episode: 2620, Score: -21
Episode: 2640, Score: -20
Episode: 2660, Score: -17
Episode: 2680, Score: -19
Episode: 2700, Score: -18
Episode: 2720, Score: -19
Episode: 2740, Score: -22
Episode: 2760, Score: -19
Episode: 2780, Score: -22
Episode: 2800, Score: -17
Episode: 2820, Score: -17
Episode: 2840, Score: -18
Episode: 2860, Score: -17
Episode: 2880, Score: -21
Episode: 2900, Score: -21
Episode: 2920, Score: -17
Episode: 2940, Score: -18
Episode: 2960, Score: -17
Episode: 2980, Score: -19
Episode: 3000, Score: -18
Episode: 3020, Score: -17
Episode: 3040, Score: -17
Episode: 3060, Score: -21
Episode: 3080, Score: -15
Episode: 3100, Score: -19
Episode: 3120, Score: -17
Episode: 3140, Score: -17
Episode: 3160, Score: -17
Episode: 3180, Score: -17
Episode: 3200, Score: -17
Episode: 3220, Score: -18
Episode: 3240, Score: -19
Episode: 3260, Score: -19
Episode: 3280, Score: -17
Episode: 3300, Score: -18
Episode: 3320, Score: -17
Episode: 3340, Score: -25
Episode: 3360, Score: -18
Episode: 3380, Score: -17
Episode: 3400, Score: -19
Episode: 3420, Score: -17
Episode: 3440, Score: -15
Episode: 3460, Score: -118
Episode: 3480, Score: -17
Episode: 3500, Score: -15
Episode: 3520, Score: -17
Episode: 3540, Score: -19
Episode: 3560, Score: -21
Episode: 3580, Score: -17
Episode: 3600, Score: -17
Episode: 3620, Score: -17
Episode: 3640, Score: -19
Episode: 3660, Score: -15
Episode: 3680, Score: -15
Episode: 3700, Score: -100
Episode: 3720, Score: -17
Episode: 3740, Score: -17
Episode: 3760, Score: -100
Episode: 3780, Score: -100
Episode: 3800, Score: -17
Episode: 3820, Score: -18
Episode: 3840, Score: -19
Episode: 3860, Score: -17
Episode: 3880, Score: -19
Episode: 3900, Score: -19
Episode: 3920, Score: -19
Episode: 3940, Score: -18
Episode: 3960, Score: -18
Episode: 3980, Score: -15
Episode: 4000, Score: -19
Episode: 4020, Score: -17
Episode: 4040, Score: -20
Episode: 4060, Score: -19
Episode: 4080, Score: -17
Episode: 4100, Score: -19
Episode: 4120, Score: -15
Episode: 4140, Score: -22
Episode: 4160, Score: -17
Episode: 4180, Score: -22
Episode: 4200, Score: -18
Episode: 4220, Score: -18
Episode: 4240, Score: -19
Episode: 4260, Score: -100
Episode: 4280, Score: -17
Episode: 4300, Score: -19
Episode: 4320, Score: -17
Episode: 4340, Score: -19
Episode: 4360, Score: -21
Episode: 4380, Score: -22
Episode: 4400, Score: -21
Episode: 4420, Score: -18
Episode: 4440, Score: -22
Episode: 4460, Score: -17
Episode: 4480, Score: -20
Episode: 4500, Score: -17
Episode: 4520, Score: -17
Episode: 4540, Score: -17
Episode: 4560, Score: -19
Episode: 4580, Score: -17
Episode: 4600, Score: -19
Episode: 4620, Score: -24
Episode: 4640, Score: -18
Episode: 4660, Score: -17
Episode: 4680, Score: -17
Episode: 4700, Score: -19
Episode: 4720, Score: -15
Episode: 4740, Score: -17
Episode: 4760, Score: -19
Episode: 4780, Score: -17
Episode: 4800, Score: -19
Episode: 4820, Score: -19
Episode: 4840, Score: -21
Episode: 4860, Score: -19
Episode: 4880, Score: -18
Episode: 4900, Score: -17
Episode: 4920, Score: -20
Episode: 4940, Score: -17
Episode: 4960, Score: -17
Episode: 4980, Score: -17
