commit 2cf96936246aab67cd5772d6ddc174fcff0a94a6 Author: Koha9 Date: Tue Mar 5 19:11:38 2024 +0900 Source upload diff --git a/scrollball-main.ipynb b/scrollball-main.ipynb new file mode 100644 index 0000000..2e971c7 --- /dev/null +++ b/scrollball-main.ipynb @@ -0,0 +1,510 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 自由課題レポート\n", + "\n", + "## 1 課題概要\n", + " 今回使用するデータセットは自分で作成したゲーム環境が生成したエージェント観測データである。Double DQN(Deep Q-Learning)モデルを使用し、エージェントが観測した環境データによって最善の動作を予測する。\n", + "## 2 ゲーム環境\n", + "### 2.1 ゲーム概要\n", + " Ml-AgentsのサンプルオブジェクトScrollBallを模倣した、ターゲットを追いかけるゲームである。真ん中の球は自機エージェント、正方形はターゲットと設定する。ゲーム開始時にターゲットは自機エージェント以外のゲームエリア内にランダムで生成する。自機は上下左右各方向にコントロールすることができ、リアルにシミュレーションするため、Unityの物理エンジンを使用する。各方向に入力があった場合、直接に一定の速度でその方向に移動するわけではなく、一定の加速度を加えるように設定している。自機エージェントがターゲットと接触するとゲームクリアと判定し、8秒経過もしくはゲームエリアから落ちた場合は失敗判定とする。\n", + "### 2.2 観測情報\n", + " エージェントはターゲットの位置、および自機位置のx,z座標が観測できることとする。その他、自機の水平と垂直xz方向の速度と合わせてステップ毎に合計六つのデータが観測できる。\n", + "### 2.3 Reward設定\n", + " ステップ毎に環境からRewardが返されることになっている。これはそのステップの操作に対する評価である。自機がターゲットに前ステップより近づくとその距離値が正のRewardとして戻り、逆に前ステップより遠く離れると離れた距離値が負のRewardとして戻ってくる。ゲーム成功時にもらうRewardは10、失敗したときは-10と設定している。\n", + "### 2.4 動作確定\n", + " 2種類の動作が存在する、垂直及び水平になる。垂直では-1,0,1三つの値があり、それぞれ下、静止、上を意味する。水平も-1,0,1三つの値があり、それぞれ左、静止、右を意味する。\n", + "## 3 DoubleDQN\n", + "### 3.1 Q-Learning\n", + " Q-Learningはすべての環境状態を十分にサンプリングし、各状態でQualityが最も高い動作を実行する機械学習手法である。\n", + " Q-Learningは試行したすべての環境状態をQ-Tableに記録し、その環境状態で試行した動作が環境からもらうRewardをそのQ値として記録する。また同様な環境を観測した場合、Q-Tableに記録したQ値が最も高い動作を実行する。だが多くの場合、ゲームの環境状態は無限であり、すべての状況を記録することは難しく、しかもメモリも大量に消費する。\n", + "### 3.2 DQN(Deep Q-Learning)\n", + " DQNはQ-Tableの代わりにニューラルネットワークを使用して、観測した環境状態から各動作のQ値を予測する。今回使用するDouble DQNの構造及びコードは以下になる。\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"C:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\tensorflow_addons\\utils\\ensure_tf_install.py:53: UserWarning: Tensorflow Addons supports using Python ops for all Tensorflow versions above or equal to 2.4.0 and strictly below 2.7.0 (nightly versions are not supported). \n", + " The versions of TensorFlow you are currently using is 2.8.0 and is not supported. \n", + "Some things might work, some things might not.\n", + "If you were to encounter a bug, do not file an issue.\n", + "If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. \n", + "You can find the compatibility matrix in TensorFlow Addon's readme:\n", + "https://github.com/tensorflow/addons\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ML-Agents Version : 0.27.0\n", + "TensroFlow Version: 2.8.0\n" + ] + } + ], + "source": [ + "import mlagents_envs\n", + "from mlagents_envs.base_env import ActionTuple\n", + "from mlagents_envs.environment import UnityEnvironment\n", + "\n", + "import tensorflow as tf\n", + "import tensorflow_addons as tfa\n", + "import tensorboard\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import time\n", + "import datetime\n", + "from collections import deque\n", + "from IPython.display import clear_output\n", + "\n", + "print(\"ML-Agents Version :\",mlagents_envs.__version__)\n", + "print(\"TensroFlow Version:\",tf.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# 割り当てに必要なGPUメモリのみを割り当てようとする\n", + "physical_devices = tf.config.list_physical_devices('GPU')\n", + "tf.config.experimental.set_memory_growth(physical_devices[0], True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "以下環境を実行する際にディレクトリにすべて半角英数字符号となっていることが必要" + ] + }, + { + "cell_type": "code", + "execution_count": 3, 
+ "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ステップ毎に環境観測データ数 6\n", + "ステップ毎に実行可能な動作数 2\n" + ] + } + ], + "source": [ + "# 環境パラメータ\n", + "log_dir = \"ML-logs/\" + datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n", + "tb_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)\n", + "env_path = './ScrollBall-Build/ML-ScrollBall-Sample'\n", + "brain_name = 'RollerBallBrain'\n", + "\n", + "# ゲーム環境獲得\n", + "env = UnityEnvironment(file_name=env_path, seed=1, side_channels=[])\n", + "env.reset()\n", + "\n", + "# 環境スペック獲得\n", + "tracked_agent = -1\n", + "behavior_specs = env.behavior_specs\n", + "behavior_name = list(behavior_specs)[0]\n", + "spec = behavior_specs[behavior_name]\n", + "observation_specs = spec.observation_specs[0] # 観測spec\n", + "action_spec = spec.action_spec # 動作spec\n", + "\n", + "\n", + "ENV_Discrete_ACTION_SIZE = action_spec.discrete_size# 連続的な動作のSize\n", + "ENV_Continuous_ACTION_SIZE = action_spec.continuous_size# 離散的な動作のSize\n", + "STATE_SIZE = observation_specs.shape[0]# 環境観測データ数\n", + "SAVE_STEPS = 100 # SAVE_STEPS毎にNNを保存する\n", + "ACTION_SIZE = ENV_Discrete_ACTION_SIZE * 3#トータル動作数、一種類の動作に三つの動作が存在するため、*3とする\n", + "MAX_EXP_NUM = 2500 # ExperiencePoolに保存できる最大過去記録数\n", + "\n", + "EPSILON_CUT_STEP = 1300\n", + "EPISODES = 500\n", + "REPLACE_STEPS = 50\n", + "BATCH_SIZE = 256\n", + "LEARNING_RATE = 0.0005\n", + "GAMMA = 0.9\n", + "\n", + "epsilon = 1\n", + "epsilon_min = 0.01\n", + "\n", + "print(\"ステップ毎に環境観測データ数\",STATE_SIZE)\n", + "print(\"ステップ毎に実行可能な動作数\",ENV_Discrete_ACTION_SIZE)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Experience Pool\n", + "class experiencePool:\n", + " def __init__(self):\n", + " self.exp_pool = deque(maxlen=MAX_EXP_NUM)\n", + "\n", + " def add(self, state, action, reward, netx_state, done):\n", + " self.exp_pool.append((state, action, reward, netx_state, done))\n", + "\n", + " def get_random(self, 
num=1):\n", + " random_index = np.random.choice(len(self.exp_pool), num)\n", + " random_exps = [self.exp_pool[i] for i in random_index]\n", + " return random_exps\n", + "\n", + " def get_len(self):\n", + " return len(self.exp_pool)\n", + "\n", + "# DQNメソッド\n", + "class DQN:\n", + " def __init__(self,load,load_dir):\n", + " self.learning_rate = LEARNING_RATE\n", + " self.epsilon = 1\n", + " self.epsilon_min = 0.01\n", + " self.epsilon_cut = (1-self.epsilon_min)/EPSILON_CUT_STEP\n", + " self.gamma = GAMMA\n", + " \n", + " if load:\n", + " #既存NNデータをローディングする\n", + " self.epsilon = self.epsilon_min\n", + " main_load_dir = load_dir+\"main.h5\"\n", + " target_load_dir = load_dir+\"target.h5\"\n", + " self.main_net,self.target_net = self.loadNN(main_load_dir,target_load_dir)\n", + " else:\n", + " #新規mainとtarget NNを作成する\n", + " self.main_net = self.build_net()\n", + " self.target_net = self.build_net()\n", + " self.exp_pool = experiencePool()\n", + "\n", + " # ---------------------------------------------------------------------------------\n", + " def build_net(self):\n", + " # NNを作成\n", + " rectifiedAdam = tfa.optimizers.RectifiedAdam(learning_rate = self.learning_rate,weight_decay = 0.001)\n", + " #Adam = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)\n", + " neural_net = tf.keras.Sequential()\n", + " neural_net.add(tf.keras.layers.Dense(\n", + " units=128, activation='relu', input_dim=STATE_SIZE))\n", + " neural_net.add(tf.keras.layers.Dense(\n", + " units=256, activation='relu'))\n", + " neural_net.add(tf.keras.layers.Dense(\n", + " units=128, activation='relu'))\n", + " neural_net.add(tf.keras.layers.Dense(\n", + " units=64, activation='relu'))\n", + " neural_net.add(tf.keras.layers.Dense(\n", + " units=ACTION_SIZE, activation='elu'))\n", + "\n", + " neural_net.compile(optimizer=rectifiedAdam, loss='mse', metrics=['accuracy'])\n", + " \n", + " return neural_net\n", + "\n", + " def select_action(self, state):\n", + " # 動作Q値を予測と動作選択\n", + " random_num = 
np.random.sample()\n", + " \n", + " if random_num > self.epsilon:\n", + " # DQNをベースにし、動作を選択する\n", + " predictResult = self.main_net(state).numpy()[0]\n", + " actionX = np.argmax(predictResult[0:3])-1\n", + " actionZ = np.argmax(predictResult[3:])-1\n", + " action = np.array([actionX, actionZ], dtype=np.float32)\n", + " #print(\"action = \",action)\n", + " else:\n", + " # ランダムで動作を選択\n", + " actionX = np.random.randint(ACTION_SIZE/2)-1\n", + " actionY = np.random.randint(ACTION_SIZE/2)-1\n", + " action = np.array([actionX, actionY], dtype=np.float32)\n", + "\n", + " # 缩小epsilon\n", + " if self.epsilon > self.epsilon_min:\n", + " self.epsilon -= self.epsilon_cut\n", + "\n", + " return action\n", + "\n", + " def training(self):\n", + " # トレーニング開始\n", + " if self.exp_pool.get_len() >= BATCH_SIZE:\n", + " # トレーニング集を獲得\n", + " exp_set = self.exp_pool.get_random(num=BATCH_SIZE)\n", + " exp_state = [data[0] for data in exp_set] # EXP_Poolが記録した当時ラウンドの環境状態\n", + " exp_action = [data[1] for data in exp_set] # そのラウンドで選んだ動作\n", + " exp_reward = [data[2] for data in exp_set] # その動作に応じるreward\n", + " exp_next_state = [data[3] for data in exp_set] # その動作が実行した後の環境状態。\n", + " exp_done = [data[4] for data in exp_set] # 実行後にゲームが終了したか\n", + "\n", + " exp_state = np.asarray(exp_state).squeeze()\n", + " exp_action = np.asarray(exp_action).squeeze()\n", + " exp_next_state = np.asarray(exp_next_state).squeeze()\n", + "\n", + " # 各ネットでQ値予測\n", + " target_net_q = self.target_net(exp_next_state).numpy() # target_NN 未来状況のQ値を予測\n", + " main_net_q = self.main_net(exp_state).numpy() # main_NN 現在状況のQ値を予測\n", + "\n", + " # トレーニング用Q値、目標y、\n", + " y = main_net_q.copy() # (1,6)\n", + " \n", + " # Batch全体インデクス、[0,1,......,BATCH_SIZE]\n", + " batch_index = np.arange(BATCH_SIZE, dtype=np.int32)\n", + " \n", + " # 動作の値(-1,0,1)によってQ値のインデクス(Xは(0,1,2)、Zは(3,4,5),各自Shapeは(1,BATCH_SIZE))を作成\n", + " exp_actionX_index = exp_action[:,0] + 1\n", + " exp_actionZ_index = exp_action[:,1] + 4\n", + " exp_actionX_index = 
exp_actionX_index.astype(np.int)\n", + " exp_actionZ_index = exp_actionZ_index.astype(np.int)\n", + " \n", + " # target_NNが未来状況によって予測したQ値から 垂直/水平動作各自の最大値Q値を摘出\n", + " fixedX = np.max(target_net_q[:, :3], axis=1) # (batchsize,1)\n", + " fixedZ = np.max(target_net_q[:, -3:], axis=1) # (batchsize,1)\n", + " # そのラウンドで受けたreward+未来最大Q値の和で修正値となる\n", + " fixedX = exp_reward + self.gamma*fixedX \n", + " fixedZ = exp_reward + self.gamma*fixedZ\n", + " # ゲーム終了のラウンドでの修正値は元のreward,ゲーム続行する時の修正値はfixedXとfixedYとする\n", + " y_fixedX = np.where(exp_done,exp_reward,fixedX)\n", + " y_fixedZ = np.where(exp_done,exp_reward,fixedZ)\n", + " \n", + " # 修正値を応用\n", + " y[batch_index, exp_actionX_index] = y_fixedX\n", + " y[batch_index, exp_actionZ_index] = y_fixedZ\n", + "\n", + " # main_netに入れて、フィットする\n", + " self.main_net.fit(exp_state, y, epochs=5, verbose=0,callbacks = [tb_callback])\n", + " \n", + " def saveNN(self):\n", + " # 両NNを保存する\n", + " main_save_dir= \"ML-Model/\" + datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\")+\"main.h5\"\n", + " target_save_dir= \"ML-Model/\" + datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\")+\"target.h5\"\n", + " self.main_net.save(main_save_dir)\n", + " self.target_net.save(target_save_dir)\n", + " print(\"Model Saved\")\n", + "\n", + " def loadNN(self,main_load_dir,target_load_dir):\n", + " # 両NNをローディングする\n", + " main_net_loaded = tf.keras.models.load_model(main_load_dir)\n", + " target_net_loaded = tf.keras.models.load_model(target_load_dir)\n", + " print(\"Model Loaded\")\n", + " return main_net_loaded,target_net_loaded" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Episode 261 done,Reward = 10.0 mean steps = 13.049618320610687 exp_num = 2500 \n", + "epsilon = 0.009999999999982863\n", + "\n", + "Episode 262 done,Reward = -10.0 mean steps = 13.049429657794677 exp_num = 2500 \n", + "epsilon = 0.009999999999982863\n", + "target_net replaced\n", + "\n", + "Episode 263 done,Reward = -10.0 mean steps = 13.151515151515152 exp_num = 2500 \n", + "epsilon = 0.009999999999982863\n", + "\n", + "Episode 264 done,Reward = 10.0 mean steps = 13.124528301886793 exp_num = 2500 \n", + "epsilon = 0.009999999999982863\n", + "\n", + "Episode 265 done,Reward = 10.0 mean steps = 13.135338345864662 exp_num = 2500 \n", + "epsilon = 0.009999999999982863\n", + "target_net replaced\n", + "Model Saved\n", + "\n", + "Episode 266 done,Reward = 10.0 mean steps = 13.142322097378278 exp_num = 2500 \n", + "epsilon = 0.009999999999982863\n", + "\n", + "Episode 267 done,Reward = -10.0 mean steps = 13.156716417910447 exp_num = 2500 \n", + "epsilon = 0.009999999999982863\n", + "\n", + "Episode 268 done,Reward = 10.0 mean steps = 13.118959107806692 exp_num = 2500 \n", + "epsilon = 0.009999999999982863\n" + ] + }, + { + "ename": "UnityCommunicatorStoppedException", + "evalue": "Communicator has exited.", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mUnityCommunicatorStoppedException\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m~\\AppData\\Local\\Temp/ipykernel_13628/1299734337.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 51\u001b[0m \u001b[1;31m# 動作をゲーム環境に渡す。\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 52\u001b[0m 
\u001b[0menv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset_actions\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbehavior_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbehavior_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maction\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maction_Tuple\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 53\u001b[1;33m \u001b[0menv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 54\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 55\u001b[0m \u001b[1;31m# 環境が動作を実行し、次の環境状態を返す。\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\mlagents_envs\\timers.py\u001b[0m in \u001b[0;36mwrapped\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 303\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mwrapped\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 304\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mhierarchical_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__qualname__\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 305\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 306\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 307\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mwrapped\u001b[0m \u001b[1;31m# type: 
ignore\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\mlagents_envs\\environment.py\u001b[0m in \u001b[0;36mstep\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 333\u001b[0m \u001b[0moutputs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_communicator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexchange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstep_input\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_poll_process\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 334\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0moutputs\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 335\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mUnityCommunicatorStoppedException\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Communicator has exited.\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 336\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_update_behavior_specs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0moutputs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 337\u001b[0m \u001b[0mrl_output\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0moutputs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrl_output\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mUnityCommunicatorStoppedException\u001b[0m: Communicator has exited." 
+ ] + } + ], + "source": [ + "# エージェント作成\n", + "agent = DQN(load = False,load_dir=\"ML-Model/\" + \"20220205-051103\")\n", + "# トレーニング済みNNをローディングするにはこちらのコードを使用↓\n", + "#agent = DQN(load = True,load_dir=\"ML-Model/\" + \"FinalNN-\")\n", + "\n", + "total_steps = 0 \n", + "steps_list = []\n", + "successTimes = 0\n", + "failTimes = 0\n", + "successTimes_his = []\n", + "failTimes_his = []\n", + "this10TimesWin = 0\n", + "MeanWinPerin10Times = []\n", + "\n", + "for episode in range(EPISODES):\n", + " # episode 開始 \n", + " done = False #ゲーム終了状態をFalse\n", + " steps = 0 \n", + " # 環境初期状態を獲得\n", + " env.reset()\n", + " decision_steps, terminal_steps = env.get_steps(behavior_name)\n", + " state = decision_steps.obs[0]\n", + " state = np.reshape(state, [1, STATE_SIZE])\n", + " \n", + " # ゲームスタート\n", + " while True:\n", + " reward = 0\n", + " steps+=1\n", + " total_steps += 1\n", + " # エージェントナンバーをトラックする\n", + " if tracked_agent == -1 and len(decision_steps) >= 1:\n", + " tracked_agent = decision_steps.agent_id[0]\n", + " \n", + " # REPLACE_STEPS毎でtarget_net にmain_netで入れ替える\n", + " if total_steps % REPLACE_STEPS == 0 and total_steps !=0:\n", + " agent.target_net.set_weights(agent.main_net.get_weights())\n", + " print('target_net replaced')\n", + " \n", + " # SAVE_STEPS毎でNNを保存する\n", + " if total_steps % SAVE_STEPS ==0 and total_steps !=0:\n", + " agent.saveNN()\n", + " \n", + " # main_netで動作選択\n", + " action = agent.select_action(state=state)\n", + " continuous_actions = np.array([[0]], dtype=np.float)\n", + " discrete_actions = np.expand_dims(action,axis=0)\n", + " # 動作をML-Agentsが認識可能なActionTuple型に変換\n", + " action_Tuple = ActionTuple(\n", + " continuous=continuous_actions, discrete=discrete_actions)\n", + "\n", + " # 動作をゲーム環境に渡す。\n", + " env.set_actions(behavior_name=behavior_name, action=action_Tuple)\n", + " env.step()\n", + " \n", + " # 環境が動作を実行し、次の環境状態を返す。\n", + " decision_steps, terminal_steps = env.get_steps(behavior_name)\n", + " if tracked_agent in decision_steps: # 
ゲーム終了していない場合、環境状態がdecision_stepsに保存される\n", + " next_state = decision_steps[tracked_agent].obs[0]\n", + " next_state = np.reshape(next_state,[1,STATE_SIZE])\n", + " reward = decision_steps[tracked_agent].reward\n", + " if tracked_agent in terminal_steps: # ゲーム終了した場合、環境状態がterminal_stepsに保存される\n", + " next_state = terminal_steps[tracked_agent].obs[0]\n", + " next_state = np.reshape(next_state,[1,STATE_SIZE])\n", + " reward = terminal_steps[tracked_agent].reward\n", + " done = True\n", + " \n", + " # Experience_poolに保存\n", + " agent.exp_pool.add(state,action,reward,next_state,done)\n", + " #print(\"Reward = \",reward)\n", + " # 環境状態を次状態に変更。\n", + " state = next_state\n", + " \n", + " # ゲーム終了後処理\n", + " if done:\n", + " mean_steps = total_steps/(episode+1)\n", + " print(\"\\nEpisode\",episode,\"done,Reward =\",reward,\"mean steps =\",mean_steps,\"exp_num =\",agent.exp_pool.get_len(),\"\\nepsilon =\",agent.epsilon)\n", + " agent.training()\n", + " if(reward >=10):\n", + " successTimes+=1\n", + " this10TimesWin+=1\n", + " else:\n", + " failTimes+=1\n", + " successTimes_his.append(successTimes)\n", + " failTimes_his.append(failTimes)\n", + " if episode % 10 ==0 and episode !=0:\n", + " clear_output()\n", + " this10TimesWinPer = float(this10TimesWin/10)\n", + " this10TimesWin = 0\n", + " MeanWinPerin10Times.append(this10TimesWinPer)\n", + " # 合計成功数(緑)、合計失敗数(赤)を図で表示する\n", + " plt.figure(figsize = (15,10))\n", + " plt.plot(range(len(successTimes_his)), successTimes_his,color='green',linestyle='--', linewidth=1, label='TotalWinTimes')\n", + " plt.plot(range(len(successTimes_his)), failTimes_his,color='red', linewidth=1,marker='o', markersize=1, markerfacecolor='black',markeredgecolor='black',label='TotalFaildTimes')\n", + " # plt.savefig('output.jpg')\n", + " plt.legend()\n", + " plt.savefig(\"wintimes.png\")\n", + " plt.show()\n", + " \n", + " #10回実行した後の成功確率を図で表示する\n", + " plt.figure(figsize=(15,10))\n", + " plt.plot(MeanWinPerin10Times)\n", + " plt.savefig(\"steps.png\")\n", 
+ " plt.show()\n", + " break\n", + "env.close()\n", + "print(\"Finished~\")" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "c62a1b52b24525839a95f7ca2b53f501cc329096d80c6be9aea5c814c594ecdd" + }, + "kernelspec": { + "display_name": "Python 3.9.7 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "undefined.undefined.undefined" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/scrollball-main.py b/scrollball-main.py new file mode 100644 index 0000000..ad52ec3 --- /dev/null +++ b/scrollball-main.py @@ -0,0 +1,314 @@ +import mlagents_envs +from mlagents_envs.base_env import ActionTuple +from mlagents_envs.environment import UnityEnvironment + +import argparse +import tensorflow as tf +import tensorflow_addons as tfa +import tensorboard +import numpy as np +import matplotlib.pyplot as plt +import time +import datetime +from collections import deque +from IPython.display import clear_output + +print("ML-Agents Version :",mlagents_envs.__version__) +print("TensroFlow Version:",tf.__version__) + +# 環境パラメータ +log_dir = "ML-logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") +tb_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir) +env_path = './ScrollBall-Build/ML-ScrollBall-Sample' +brain_name = 'RollerBallBrain' + +parser = argparse.ArgumentParser() +parser.add_argument("--env_path", default=env_path) +parser.add_argument("-train", action="store_true",default=False) +parser.add_argument("-silent", action="store_true",default=False) + +args = parser.parse_args() + +# ゲーム環境獲得 +env = UnityEnvironment(file_name=args.env_path, seed=1, side_channels=[]) +env.reset() + +# 環境スペック獲得 +tracked_agent = -1 +behavior_specs = env.behavior_specs +behavior_name = 
list(behavior_specs)[0]  # right-hand side of the `behavior_name =` assignment begun on the previous line
spec = behavior_specs[behavior_name]
observation_specs = spec.observation_specs[0]  # observation spec
action_spec = spec.action_spec  # action spec

# NOTE: the original comments on the next two lines were swapped.
ENV_Discrete_ACTION_SIZE = action_spec.discrete_size  # size of the discrete action part
ENV_Continuous_ACTION_SIZE = action_spec.continuous_size  # size of the continuous action part
STATE_SIZE = observation_specs.shape[0]  # number of observed values per step
SAVE_STEPS = 100  # both networks are saved every SAVE_STEPS steps
# Total number of Q outputs: every action kind has three choices (-1, 0, 1).
ACTION_SIZE = ENV_Discrete_ACTION_SIZE * 3
MAX_EXP_NUM = 2500  # maximum number of transitions kept in the experience pool

EPSILON_CUT_STEP = 1300
EPISODES = 500
REPLACE_STEPS = 50
BATCH_SIZE = 256
LEARNING_RATE = 0.0005
GAMMA = 0.9

epsilon = 1
epsilon_min = 0.01

print("ステップ毎に環境観測データ数",STATE_SIZE)
print("ステップ毎に実行可能な動作数",ENV_Discrete_ACTION_SIZE)


# Experience Pool
class experiencePool:
    """Bounded replay buffer of (state, action, reward, next_state, done) tuples.

    The buffer is a ``deque`` with ``maxlen=MAX_EXP_NUM``, so the oldest
    transition is dropped automatically once the pool is full.
    """

    def __init__(self):
        self.exp_pool = deque(maxlen=MAX_EXP_NUM)

    def add(self, state, action, reward, netx_state, done):
        """Append one transition to the pool."""
        self.exp_pool.append((state, action, reward, netx_state, done))

    def get_random(self, num=1):
        """Return ``num`` transitions drawn uniformly at random.

        ``np.random.choice`` samples with replacement here, so the same
        transition may appear more than once in the result.
        """
        random_index = np.random.choice(len(self.exp_pool), num)
        return [self.exp_pool[i] for i in random_index]

    def get_len(self):
        """Return the number of transitions currently stored."""
        return len(self.exp_pool)


# DQN agent
class DQN:
    """Epsilon-greedy Q-learning agent with a main and a target network.

    The network emits ``ACTION_SIZE`` Q values. ``select_action`` and
    ``training`` treat outputs 0-2 as the first (X) action and the last
    three outputs as the second (Z) action, each with choices -1, 0, 1.

    NOTE(review): the training target takes the maximum of the *target*
    network's own predictions, i.e. DQN with a target network; it does not
    use the main network to pick the next action as Double DQN would.
    """

    def __init__(self,load,load_dir):
        """Create the agent.

        Args:
            load: when true, load both networks from disk and start with
                epsilon at its minimum; otherwise build fresh networks.
            load_dir: path prefix to which "main.h5" / "target.h5" are
                appended when loading. Unused when ``load`` is false.
        """
        self.learning_rate = LEARNING_RATE
        self.epsilon = 1
        self.epsilon_min = 0.01
        # Amount subtracted from epsilon on every select_action call.
        self.epsilon_cut = (1-self.epsilon_min)/EPSILON_CUT_STEP
        self.gamma = GAMMA

        if load:
            # Load existing network data.
            self.epsilon = self.epsilon_min
            main_load_dir = load_dir+"main.h5"
            target_load_dir = load_dir+"target.h5"
            self.main_net,self.target_net = self.loadNN(main_load_dir,target_load_dir)
        else:
            # Build new main and target networks.
            self.main_net = self.build_net()
            self.target_net = self.build_net()
        self.exp_pool = experiencePool()

    # ---------------------------------------------------------------------------------
    def build_net(self):
        """Build and compile the fully connected Q network."""
        rectifiedAdam = tfa.optimizers.RectifiedAdam(learning_rate = self.learning_rate,weight_decay = 0.001)
        neural_net = tf.keras.Sequential()
        neural_net.add(tf.keras.layers.Dense(
            units=128, activation='relu', input_dim=STATE_SIZE))
        neural_net.add(tf.keras.layers.Dense(
            units=256, activation='relu'))
        neural_net.add(tf.keras.layers.Dense(
            units=128, activation='relu'))
        neural_net.add(tf.keras.layers.Dense(
            units=64, activation='relu'))
        # NOTE(review): 'elu' bounds the outputs below at -1, while the
        # report describes a failure reward of -10 — confirm this output
        # activation is intended for Q-value regression.
        neural_net.add(tf.keras.layers.Dense(
            units=ACTION_SIZE, activation='elu'))

        neural_net.compile(optimizer=rectifiedAdam, loss='mse', metrics=['accuracy'])

        return neural_net

    def select_action(self, state):
        """Choose an action epsilon-greedily and decay epsilon.

        Args:
            state: observation passed directly to ``main_net``; only the
                first row of the prediction is used.

        Returns:
            ``np.float32`` array ``[actionX, actionZ]`` with each entry in
            {-1, 0, 1}.
        """
        random_num = np.random.sample()

        if random_num > self.epsilon:
            # Greedy choice based on the main network's Q values.
            predictResult = self.main_net(state).numpy()[0]
            actionX = np.argmax(predictResult[0:3])-1
            actionZ = np.argmax(predictResult[3:])-1
        else:
            # Random choice. Integer division keeps the bound an int
            # (the original passed the float ACTION_SIZE/2 to randint).
            actionX = np.random.randint(ACTION_SIZE // 2)-1
            actionZ = np.random.randint(ACTION_SIZE // 2)-1
        action = np.array([actionX, actionZ], dtype=np.float32)

        # Shrink epsilon until it reaches its minimum.
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_cut

        return action

    def training(self):
        """Fit ``main_net`` on one random batch from the experience pool.

        Does nothing until the pool holds at least ``BATCH_SIZE``
        transitions. For every sampled transition the Q values of the two
        chosen actions are replaced by ``reward + gamma * max(target Q)``,
        or by ``reward`` alone when the transition ended the game; all
        other outputs keep the main network's current prediction.
        """
        if self.exp_pool.get_len() < BATCH_SIZE:
            return

        # Collect the training batch.
        exp_set = self.exp_pool.get_random(num=BATCH_SIZE)
        exp_state = [data[0] for data in exp_set]       # state when the action was chosen
        exp_action = [data[1] for data in exp_set]      # chosen action
        exp_reward = [data[2] for data in exp_set]      # reward for that action
        exp_next_state = [data[3] for data in exp_set]  # state after the action
        exp_done = [data[4] for data in exp_set]        # whether the game ended

        exp_state = np.asarray(exp_state).squeeze()
        exp_action = np.asarray(exp_action).squeeze()
        exp_reward = np.asarray(exp_reward)
        exp_next_state = np.asarray(exp_next_state).squeeze()

        # Q predictions of both networks.
        target_net_q = self.target_net(exp_next_state).numpy()  # Q values of the next states
        main_net_q = self.main_net(exp_state).numpy()           # Q values of the current states

        # Training target, initialised with the current predictions.
        y = main_net_q.copy()

        # Row index of every sample in the batch: [0, 1, ..., BATCH_SIZE-1].
        batch_index = np.arange(BATCH_SIZE, dtype=np.int32)

        # Map action values (-1, 0, 1) to Q-output columns:
        # X -> (0, 1, 2), Z -> (3, 4, 5).
        # `np.int` was removed in NumPy 1.24; the builtin `int` is the
        # type it aliased.
        exp_actionX_index = (exp_action[:,0] + 1).astype(int)
        exp_actionZ_index = (exp_action[:,1] + 4).astype(int)

        # Largest target-network Q value for each of the two actions.
        fixedX = np.max(target_net_q[:, :3], axis=1)
        fixedZ = np.max(target_net_q[:, -3:], axis=1)
        # Corrected value: received reward plus discounted future maximum.
        fixedX = exp_reward + self.gamma*fixedX
        fixedZ = exp_reward + self.gamma*fixedZ
        # Terminal transitions use the bare reward instead.
        y_fixedX = np.where(exp_done,exp_reward,fixedX)
        y_fixedZ = np.where(exp_done,exp_reward,fixedZ)

        # Apply the corrected values to the chosen actions only.
        y[batch_index, exp_actionX_index] = y_fixedX
        y[batch_index, exp_actionZ_index] = y_fixedZ

        # Fit the main network towards the corrected targets.
        self.main_net.fit(exp_state, y, epochs=5, verbose=0,callbacks = [tb_callback])

    def saveNN(self):
        """Save both networks under "ML-Model/" with a shared timestamp.

        The timestamp is taken once so the main and target files always
        share the same prefix (two separate ``now()`` calls could straddle
        a second boundary and produce mismatched names).
        """
        save_prefix = "ML-Model/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        main_save_dir = save_prefix + "main.h5"
        target_save_dir = save_prefix + "target.h5"
        self.main_net.save(main_save_dir)
        self.target_net.save(target_save_dir)
        print("Model Saved")

    def loadNN(self,main_load_dir,target_load_dir):
        """Load and return ``(main_net, target_net)`` from the given files."""
        main_net_loaded = tf.keras.models.load_model(main_load_dir)
        target_net_loaded = tf.keras.models.load_model(target_load_dir)
        print("Model Loaded")
        return main_net_loaded,target_net_loaded


# Without -train the saved "FinalNN-" networks are loaded; with -train new
# networks are built (load_dir is then unused).
if not args.train:
    agent = DQN(load = True,load_dir="ML-Model/" + "FinalNN-")
else:
    agent = DQN(load = False,load_dir="ML-Model/" + "20220205-051103")

total_steps = 0
steps_list = []
successTimes = 0
failTimes = 0
successTimes_his = []
failTimes_his = []
this10TimesWin = 0
MeanWinPerin10Times = []

# Main loop: run EPISODES games, store every transition in the agent's
# experience pool and train once at the end of each game.
for episode in range(EPISODES):
    # Start of an episode.
    done = False  # game-over flag
    steps = 0
    # Fetch the initial state of the environment.
    env.reset()
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    state = decision_steps.obs[0]
    state = np.reshape(state, [1, STATE_SIZE])

    # Play the game.
    while True:
        reward = 0
        steps += 1
        total_steps += 1
        # Remember the id of the agent being tracked.
        if tracked_agent == -1 and len(decision_steps) >= 1:
            tracked_agent = decision_steps.agent_id[0]

        # Every REPLACE_STEPS steps, copy main_net's weights into target_net.
        if total_steps % REPLACE_STEPS == 0 and total_steps != 0:
            agent.target_net.set_weights(agent.main_net.get_weights())
            print('target_net replaced')

        # Every SAVE_STEPS steps, save the networks.
        if total_steps % SAVE_STEPS == 0 and total_steps != 0:
            agent.saveNN()

        # Choose an action with main_net.
        action = agent.select_action(state=state)
        # `np.float` was removed in NumPy 1.24; use a concrete float dtype.
        continuous_actions = np.array([[0]], dtype=np.float32)
        discrete_actions = np.expand_dims(action, axis=0)
        # Wrap the action in the ActionTuple type that ML-Agents accepts.
        action_Tuple = ActionTuple(
            continuous=continuous_actions, discrete=discrete_actions)

        # Hand the action to the game environment.
        env.set_actions(behavior_name=behavior_name, action=action_Tuple)
        env.step()

        # The environment has executed the action; read the next state.
        decision_steps, terminal_steps = env.get_steps(behavior_name)
        if tracked_agent in decision_steps:
            # Game still running: the state is found in decision_steps.
            next_state = decision_steps[tracked_agent].obs[0]
            next_state = np.reshape(next_state, [1, STATE_SIZE])
            reward = decision_steps[tracked_agent].reward
        if tracked_agent in terminal_steps:
            # Game over: the state is found in terminal_steps.
            next_state = terminal_steps[tracked_agent].obs[0]
            next_state = np.reshape(next_state, [1, STATE_SIZE])
            reward = terminal_steps[tracked_agent].reward
            done = True

        # Store the transition in the experience pool.
        agent.exp_pool.add(state, action, reward, next_state, done)
        # Advance to the next state.
        state = next_state

        # End-of-game handling.
        if done:
            mean_steps = total_steps/(episode+1)
            print("\nEpisode",episode,"done,Reward =",reward,"mean steps =",mean_steps,"exp_num =",agent.exp_pool.get_len(),"\nepsilon =",agent.epsilon)
            agent.training()
            if reward >= 10:
                successTimes += 1
                this10TimesWin += 1
            else:
                failTimes += 1
            successTimes_his.append(successTimes)
            failTimes_his.append(failTimes)
            # Close a window after every 10 finished episodes. The original
            # test (`episode % 10 == 0 and episode != 0`) made the first
            # window cover 11 episodes while still dividing by 10.
            if (episode + 1) % 10 == 0:
                clear_output()
                this10TimesWinPer = float(this10TimesWin/10)
                this10TimesWin = 0
                MeanWinPerin10Times.append(this10TimesWinPer)
                if not args.silent:
                    # Plot cumulative wins (green) and failures (red).
                    plt.figure(figsize = (15,10))
                    plt.plot(range(len(successTimes_his)), successTimes_his,color='green',linestyle='--', linewidth=1, label='TotalWinTimes')
                    plt.plot(range(len(successTimes_his)), failTimes_his,color='red', linewidth=1,marker='o', markersize=1, markerfacecolor='black',markeredgecolor='black',label='TotalFaildTimes')
                    plt.legend()
                    plt.savefig("wintimes.png")
                    plt.show()

                    # Plot the win rate of each 10-episode window.
                    plt.figure(figsize=(15,10))
                    plt.plot(MeanWinPerin10Times)
                    plt.savefig("steps.png")
                    plt.show()
            break
env.close()
print("Finished~")