This builds on the previous post (the modified code is on git).
At 50k steps the bird passes 6 pipes on average; at 57k steps (about 3.5 hours) it passes more than 300.

  • DQN: the eval network computes the Q-values of the current state (q_eval); the target network computes the Q-values of the next state; the maximum of those values times gamma plus the current reward r is used to update q_eval (the previous post did not use a target network).
  • DDQN: the eval network computes the Q-values of the current state (q_eval); the eval network also computes the Q-values of the next state and picks the action with the maximum value; the target network then computes the Q-value of that action in the next state (q_target); q_target times gamma plus the current reward r is used to update q_eval. A short sketch of both targets follows after this list.

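For comparison, here is a minimal sketch of the two update targets. The array names r, done, q_next_eval and q_next_target are assumptions for illustration; in the full script below the same computation is done inside learn().

import numpy as np

def dqn_target(r, done, gamma, q_next_target):
    # DQN: take the max over the target network's Q-values for the next state
    return r + (1 - done) * gamma * np.max(q_next_target, axis=1)

def ddqn_target(r, done, gamma, q_next_eval, q_next_target):
    # DDQN: the eval network picks the action, the target network evaluates it
    a_max = np.argmax(q_next_eval, axis=1)
    return r + (1 - done) * gamma * q_next_target[np.arange(len(r)), a_max]
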
It is normal that the loss does not converge, because new states keep being added to the replay memory (and the target network is periodically refreshed), so the regression target itself keeps moving.

from tensorflow import keras
import numpy as np, time, cv2, sys, tensorflow as tf, math

sys.path.append("game/")
import dqn_flappy_bird.game.wrapped_flappy_bird as game


# helper for using train_on_batch together with the TensorBoard callback
def named_logs(model, logs):
    result = {}
    for l in zip(model.metrics_names, logs):
        result[l[0]] = l[1]
    return result


class DoubleDQN:
    GAME = 'bird'  # the name of the game being played, for log files
    n_actions = 2  # number of valid actions
    # 4 stacked frames as input, so speed/direction information can be extracted
    n_features = 80 * 80 * 4  # number of features
    gamma = 0.99  # decay rate of past observations
    OBSERVE = 2.
    EXPLORE = 50000.  # frames over which to anneal epsilon
    FINAL_EPSILON = 0.0001  # final value of epsilon
    INITIAL_EPSILON = 1  # fully random exploration at the start
    # INITIAL_EPSILON = 0.0001  # after exploration, drop the randomness
    epsilon = INITIAL_EPSILON
    # at first choose an action only every 5 frames (do nothing for the other 4),
    # to collect reward = 1 transitions faster and more often
    FRAME_PER_ACTION = 5
    # FRAME_PER_ACTION = 1
    lr = 1e-4
    batch_size = 32  # size of batch
    memory_size = 7000  # number of previous transitions to remember
    memory_count = 0
    # reward = 1 is extremely rare early on, so successful transitions are stored
    # separately and are not overwritten
    batch_plus_size = 8  # size of success batch
    memory_plus_size = 1000
    memory_plus_count = 0
    # failed transitions presumably teach more, so make sure every batch contains
    # failures (and successes, as above)
    batch_minus_size = 8
    memory_minus_size = 1000
    memory_minus_count = 0
    update_model_step = 1e3
    ModelFP = 'models/model_diy.h5'
    # save the model every save_model_step steps
    save_model_step = 2e3
    # learn every learn_step steps; this parameter may not be necessary
    learn_step = 5
    step = 0
    # initializer for the weights of every layer
    kernel_initializer = 'truncated_normal'
    # run "tensorboard --logdir=**/logs" in a terminal and open localhost:6006 to see the training curves
    # sometimes logdir needs the full path, otherwise the training data is not found
    tensorboard = keras.callbacks.TensorBoard(
        log_dir='logs',  # directory where the TensorBoard files are saved
        batch_size=batch_size,
    )

    def __init__(self):
        self.memory = np.zeros((self.memory_size, 2 + 2 * self.n_features + self.n_actions))
        self.memory_plus = np.zeros((self.memory_plus_size, 2 + 2 * self.n_features + self.n_actions))
        self.memory_minus = np.zeros((self.memory_minus_size, 2 + 2 * self.n_features + self.n_actions))
        self.model = self._build_net((80, 80, 4))
        self.model_target = self._build_net((80, 80, 4))
        self.tensorboard.set_model(self.model)
        self.load_model()

    def _build_net(self, input_shape):
        model = keras.Sequential()
        conv1 = keras.layers.Conv2D(filters=32, input_shape=input_shape, kernel_size=(8, 8),
                                    kernel_initializer=self.kernel_initializer, activation="relu", padding="SAME")
        model.add(conv1)
        pool1 = keras.layers.MaxPooling2D(pool_size=(2, 2), padding='same', data_format='channels_last')
        model.add(pool1)
        conv2 = keras.layers.Conv2D(filters=64, kernel_size=(4, 4), activation="relu",
                                    kernel_initializer=self.kernel_initializer, padding="SAME")
        model.add(conv2)
        # pool2 = keras.layers.MaxPooling2D(pool_size=(2, 2), padding='same', data_format='channels_last')
        # model.add(pool2)
        conv3 = keras.layers.Conv2D(filters=64, kernel_size=(3, 3), activation="relu",
                                    kernel_initializer=self.kernel_initializer, padding="SAME")
        model.add(conv3)
        flat = keras.layers.Flatten()
        model.add(flat)
        fc1 = keras.layers.Dense(512, kernel_initializer=self.kernel_initializer, activation='relu')
        model.add(fc1)
        fc2 = keras.layers.Dense(self.n_actions, kernel_initializer=self.kernel_initializer)
        model.add(fc2)
        optimizer = keras.optimizers.Adam(learning_rate=self.lr)
        model.compile(loss='mse', optimizer=optimizer, metrics=['mse'])
        return model

    def preprocess_pic(self, pic):
        x_t = cv2.cvtColor(cv2.resize(pic, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
        # x = np.where(x_t[:, :63] == 255)
        # print("x", x)
        # cv2.imshow('x', x_t)
        # cv2.waitKey(0)
        return x_t / 255  # normalize the image

    # store success / failure / other transitions separately (separating them may not be necessary)
    def store_transition(self, s_t, action, reward, terminal, s_t_):
        data = np.hstack((s_t.flatten(), action, reward, terminal, s_t_.flatten()))
        if reward == 1:
            index = self.memory_plus_count % self.memory_plus_size
            self.memory_plus[index] = data
            self.memory_plus_count += 1
        elif reward <= -1:
            index = self.memory_minus_count % self.memory_minus_size
            self.memory_minus[index] = data
            self.memory_minus_count += 1
        else:
            index = self.memory_count % self.memory_size
            self.memory[index] = data
            self.memory_count += 1

    def choose_action(self, s_t):
        if self.step % self.FRAME_PER_ACTION:
            action = 0
        else:
            if np.random.uniform() > self.epsilon:
                s_t = s_t[np.newaxis, :]
                actions_value = self.model.predict(s_t)
                # action = np.random.choice(np.where(actions_value == np.max(actions_value))[1])
                action = np.argmax(actions_value, axis=1)[0]
                # print('\t{} <<<choose_action {}'.format(action, actions_value))
            else:
                action = np.random.randint(0, self.n_actions)
                # print('\t{} <<<random choose_action'.format(action))
        return tf.one_hot(action, self.n_actions)

    # sample from each memory separately, then merge and shuffle
    def choose_batch_data(self):
        sample_index = np.random.choice(min(self.memory_count, self.memory_size),
                                        self.batch_size - self.batch_plus_size - self.batch_minus_size)
        sample_plus_index = np.random.choice(min(self.memory_plus_count, self.memory_plus_size), self.batch_plus_size)
        sample_minus_index = np.random.choice(min(self.memory_minus_count, self.memory_minus_size),
                                              self.batch_minus_size)
        x1, x2, x3 = self.memory[sample_index], self.memory_plus[sample_plus_index], self.memory_minus[sample_minus_index]
        batch = np.vstack((x1, x2, x3))
        np.random.shuffle(batch)
        o_s = batch[:, :self.n_features].reshape(-1, 80, 80, 4)
        a_s = batch[:, self.n_features:self.n_features + self.n_actions]
        a_s = np.argmax(a_s, axis=1)
        r_s = batch[:, self.n_features + self.n_actions]
        t_s = batch[:, 1 + self.n_features + self.n_actions]
        o_s_ = batch[:, -self.n_features:].reshape(-1, 80, 80, 4)
        return o_s, a_s, r_s, t_s, o_s_

    def learn(self):
        if self.step % self.update_model_step == 0:
            self.model_target.set_weights(self.model.get_weights())
        o_s, a_s, r_s, t_s, o_s_ = self.choose_batch_data()
        q_eval = self.model.predict(o_s, batch_size=self.batch_size)
        q_next = self.model.predict(o_s_, batch_size=self.batch_size)
        q_next_target = self.model_target.predict(o_s_, batch_size=self.batch_size)
        q_target = q_eval.copy()
        max_next_action = np.argmax(q_next, axis=1)
        target_part = r_s + (1 - t_s) * self.gamma * q_next_target[range(self.batch_size), max_next_action]
        # target_part = r_s + (1 - t_s) * self.gamma * np.max(q_next, axis=1)
        q_target[range(self.batch_size), a_s] = target_part
        history = self.model.train_on_batch(o_s, q_target)
        # for train_on_batch + TensorBoard
        self.tensorboard.on_epoch_end(self.step, named_logs(self.model, history))
        if self.epsilon > self.FINAL_EPSILON:
            self.epsilon -= (self.INITIAL_EPSILON - self.FINAL_EPSILON) / self.EXPLORE

    def save_model(self):
        self.model.save_weights(self.ModelFP)
        print('>' * 88, 'model saved')

    def load_model(self):
        try:
            self.model.load_weights(self.ModelFP)
            self.model_target.load_weights(self.ModelFP)
        except Exception:
            print('No saved model found, not loading')

    def train(self):
        game_state = game.GameState()
        do_nothing = np.zeros(self.n_actions)
        do_nothing[0] = 1
        # image, reward, terminal
        x_t, _, _ = game_state.frame_step(do_nothing)
        x_t = self.preprocess_pic(x_t)
        s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
        while 1:
            action = self.choose_action(s_t)
            x_t_, reward, terminal = game_state.frame_step(action)
            # RL takes the action and gets the next observation and reward
            s_t_ = self.preprocess_pic(x_t_)
            s_t_ = s_t_.reshape((80, 80, 1))
            s_t_ = np.append(s_t_, s_t[:, :, :3], axis=2)
            # s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)
            # early on too many transitions end at the top of the screen; add an extra
            # penalty there to speed up training (may not be needed?)
            x1 = np.where(s_t_[16, :63, 0] == 1)[0]
            if 0 < len(x1) <= 2 and x1[0] < 10:  # the "0 <" guard avoids indexing an empty match
                reward -= 0.5
            # print(x1)
            self.store_transition(s_t, action, reward, terminal, s_t_)
            if (self.memory_plus_count >= self.OBSERVE) and (self.step % self.learn_step == 0):
                self.FRAME_PER_ACTION = max(1, math.ceil(self.FRAME_PER_ACTION * (1 - self.step / self.EXPLORE)))
                self.INITIAL_EPSILON = .5
                if self.epsilon > .5: self.epsilon = .5
                self.learn()
            s_t = s_t_
            self.step += 1
            if self.memory_plus_count < self.OBSERVE:
                state = "observe"
            elif self.memory_plus_count >= self.OBSERVE and self.step <= self.EXPLORE:
                state = "explore"
            else:
                state = "train"
            print("TIMESTEP", self.step, "/ STATE", state, "/ EPSILON", self.epsilon, "/ REWARD", reward)
            if self.epsilon > self.FINAL_EPSILON and self.memory_plus_count > self.OBSERVE:
                self.epsilon -= (self.INITIAL_EPSILON - self.FINAL_EPSILON) / self.EXPLORE
            if self.step % self.save_model_step == self.save_model_step - 1:
                self.save_model()


dqn = DoubleDQN()
dqn.train()
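
The script above only trains. As a usage note, a greedy evaluation run could look roughly like the sketch below; play_greedy is an assumed helper that is not part of the original script, and it simply reuses the class's own methods with exploration and frame skipping switched off.

# Hypothetical evaluation-only loop (not in the original code): reuse the trained
# DoubleDQN, act greedily on every frame, and never store transitions or learn.
def play_greedy():
    agent = DoubleDQN()                   # load_model() restores models/model_diy.h5 if it exists
    agent.epsilon = agent.FINAL_EPSILON   # practically no random actions
    agent.FRAME_PER_ACTION = 1            # choose an action on every frame
    game_state = game.GameState()
    do_nothing = np.zeros(agent.n_actions)
    do_nothing[0] = 1
    x_t, _, _ = game_state.frame_step(do_nothing)
    x_t = agent.preprocess_pic(x_t)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    while 1:
        action = agent.choose_action(s_t)
        x_t_, reward, terminal = game_state.frame_step(action)
        s_t_ = agent.preprocess_pic(x_t_).reshape((80, 80, 1))
        s_t = np.append(s_t_, s_t[:, :, :3], axis=2)  # keep the 4-frame stack rolling
        agent.step += 1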
