MultiThread PPO First Commit

2023-11-23 15:25:34 +09:00
8 changed files with 307 additions and 293 deletions
--- a/Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb
+++ b/Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb
@ -81,184 +81,43 @@
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import argparse\n",
-    "import wandb\n",
-    "import time\n",
-    "import numpy as np\n",
-    "import random\n",
-    "import uuid\n",
-    "import torch\n",
-    "import torch.nn as nn\n",
-    "import torch.optim as optim\n",
-    "\n",
-    "from AimbotEnv import Aimbot\n",
-    "from tqdm import tqdm\n",
-    "from torch.distributions.normal import Normal\n",
-    "from torch.distributions.categorical import Categorical\n",
-    "from distutils.util import strtobool\n",
-    "from torch.utils.tensorboard import SummaryWriter\n",
-    "from mlagents_envs.environment import UnityEnvironment\n",
-    "from mlagents_envs.side_channel.side_channel import (\n",
-    "    SideChannel,\n",
-    "    IncomingMessage,\n",
-    "    OutgoingMessage,\n",
-    ")\n",
-    "from typing import List\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "AttributeError",
-     "evalue": "'aaa' object has no attribute 'outa'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
-      "Cell \u001b[1;32mIn[5], line 14\u001b[0m\n\u001b[0;32m     12\u001b[0m asd \u001b[39m=\u001b[39m aaa(outa, outb)\n\u001b[0;32m     13\u001b[0m asd\u001b[39m.\u001b[39mfunc()\n\u001b[1;32m---> 14\u001b[0m \u001b[39mprint\u001b[39m(asd\u001b[39m.\u001b[39;49mouta)  \u001b[39m# 输出 100\u001b[39;00m\n",
-      "\u001b[1;31mAttributeError\u001b[0m: 'aaa' object has no attribute 'outa'"
-     ]
-    }
-   ],
-   "source": [
-    "class aaa():\n",
-    "    def __init__(self, a, b):\n",
-    "        self.a = a\n",
-    "        self.b = b\n",
-    "\n",
-    "    def func(self):\n",
-    "        global outa\n",
-    "        outa = 100\n",
-    "\n",
-    "outa = 1\n",
-    "outb = 2\n",
-    "asd = aaa(outa, outb)\n",
-    "asd.func()\n",
-    "print(asd.outa)  # 输出 100"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "usage: ipykernel_launcher.py [-h] [--seed SEED]\n",
-      "ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme=\"hmac-sha256\" --Session.key=b\"46ef9317-59fb-4ab6-ae4e-6b35744fc423\" --shell=9002 --transport=\"tcp\" --iopub=9004 --f=c:\\Users\\UCUNI\\AppData\\Roaming\\jupyter\\runtime\\kernel-v2-311926K1uko38tdWb.json\n"
-     ]
-    },
-    {
-     "ename": "SystemExit",
-     "evalue": "2",
-     "output_type": "error",
-     "traceback": [
-      "An exception has occurred, use %tb to see the full traceback.\n",
-      "\u001b[1;31mSystemExit\u001b[0m\u001b[1;31m:\u001b[0m 2\n"
-     ]
-    }
-   ],
-   "source": [
-    "import argparse\n",
-    "\n",
-    "def parse_args():\n",
-    "    parser = argparse.ArgumentParser()\n",
-    "    parser.add_argument(\"--seed\", type=int, default=11,\n",
-    "                        help=\"seed of the experiment\")\n",
-    "    args = parser.parse_args()\n",
-    "    return args\n",
-    "\n",
-    "arggg = parse_args()\n",
-    "print(type(arggg))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "3"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "y=\"a;b;c\"\n",
-    "len(y.split(\";\"))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[2]\n"
+      "0\n",
+      "i =  0\n",
+      "i =  1\n",
+      "i =  2\n",
+      "i =  3\n",
+      "i =  4\n",
+      "i =  5\n",
+      "i =  6\n",
+      "i =  7\n",
+      "i =  8\n",
+      "i =  9\n",
+      "10\n"
     ]
    }
   ],
   "source": [
-    "a = np.array([1,2,3,4])\n",
-    "print(a[[False,True,False,False]])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{1, 2, 3, 4}"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "a = {1,2,3}\n",
-    "a.add(4)\n",
-    "a"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([3, 4])"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "a = np.array([[1,3],[2,4]])\n",
-    "a.max(axis=1)\n"
+    "import threading\n",
+    "\n",
+    "num = 0\n",
+    "\n",
+    "def print_numers():\n",
+    "    global num\n",
+    "    for i in range(10):\n",
+    "        num +=1\n",
+    "        print(\"i = \",i)\n",
+    "\n",
+    "thread = threading.Thread(target=print_numers)\n",
+    "\n",
+    "print(num)\n",
+    "thread.start()\n",
+    "thread.join()\n",
+    "print(num)"
   ]
  }
 ],
--- a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
+++ b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
@ -19,8 +19,8 @@ import torch.optim as optim
 # side channel uuid
 SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
 # tensorboard names
-GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel"
-GAME_TYPE = "GotoOnly-3.6-Level0123-newModel-Onehot"
+GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel_V2"
+GAME_TYPE = "GotoOnly-Level0123-new512Model"

 if __name__ == "__main__":
    args = parse_args()
--- a/Aimbot-PPO-Python/Pytorch/aimemory.py
+++ b/Aimbot-PPO-Python/Pytorch/aimemory.py
@ -58,7 +58,6 @@ class PPOMem:
            # print("Win! Broadcast reward!",rewardBF[-1])
            print(sum(thisRewardBF) / len(thisRewardBF))
            thisRewardBF[-1] = rewardBF[-1] - self.base_win_reward
-            # broadcast result reward, increase all reward in this round by remainTime * self.result_broadcast_ratio
            thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * self.result_broadcast_ratio)).tolist()
        else:
            print("!!!!!DIDNT GET RESULT REWARD!!!!!!", rewardBF[-1])
@ -89,7 +88,7 @@ class PPOMem:
                self.dones_bf[i].append(done[i])
                self.values_bf[i].append(value_cpu[i])
                if now_step % self.decision_period == 0:
-                    # on decision period, add last skiped round's reward, only affact in decision_period != 1
+                    # on a decision step, also add the reward from the last skipped round
                    self.rewards_bf[i].append(reward[i] + last_reward[i])
                else:
                    # not on decision period, only add this round's reward
--- a/Aimbot-PPO-Python/Pytorch/arguments-cn.md
+++ b/Aimbot-PPO-Python/Pytorch/arguments-cn.md
@ -1,56 +0,0 @@
-
-
-本项目使用以下命令行参数来配置运行环境和模型训练参数：
-
- `--seed <int>`：实验的随机种子。默认值为`9331`。
- `--path <str>`：环境路径。默认值为`"./Build/3.6/Aimbot-ParallelEnv"`。
- `--workerID <int>`：Unity worker ID。默认值为`1`。
- `--baseport <int>`：用于连接Unity环境的端口。默认值为`500`。
- `--lr <float>`：优化器的默认学习率。默认值为`5e-5`。
- `--cuda`：如果启用，将默认使用cuda。可以通过传入`true`或`false`来开启或关闭。
- `--total-timesteps <int>`：实验的总时间步数。默认值为`3150000`。
-
-### 模型参数
-
- `--train`：是否训练模型。默认启用。
- `--freeze-viewnet`：是否冻结视图网络(raycast)。默认为`False`。
- `--datasetSize <int>`：训练数据集的大小，当数据集收集足够的数据时开始训练。默认值为`6000`。
- `--minibatchSize <int>`：minibatch大小。默认值为`512`。
- `--epochs <int>`：更新策略的K次迭代。默认值为`3`。
- `--annealLR`：是否对策略和价值网络进行学习率退火。默认为`True`。
- `--wandb-track`：是否在wandb上跟踪。默认为`False`。
- `--save-model`：是否保存模型。默认为`False`。
- `--wandb-entity <str>`：wandb项目的实体。默认值为`"koha9"`。
- `--load-dir <str>`：模型加载目录。默认值为`None`。
- `--decision-period <int>`：Timestep之间的动作执行间隔。默认值为`1`。
- `--result-broadcast-ratio <float>`：当赢得回合时，对结果的reward进行broadcast的比例，默认值为`1/30`。
- `--target-lr <float>`：下调学习率的目标值。默认值为`1e-6`。
-
-### 损失函数参数
-
- `--policy-coef <float>`：策略损失的系数。默认值为`[0.8, 0.8, 0.8, 0.8]`。
- `--entropy-coef <float>`：熵损失的系数。默认值为`[0.05, 0.05, 0.05, 0.05]`。
- `--critic-coef <float>`：评论家损失的系数。默认值为`[1.0, 1.0, 1.0, 1.0]`。
- `--loss-coef <float>`：总损失的系数。默认值为`[1.0, 1.0, 1.0, 1.0]`。
-
-### GAE损失参数
-
- `--gae`：是否使用GAE进行优势计算。默认启用。
- `--norm-adv`：是否标准化优势。默认为`False`。
- `--gamma <float>`：折扣因子gamma。默认值为`0.999`。
- `--gaeLambda <float>`：GAE的lambda值。默认值为`0.95`。
- `--clip-coef <float>`：替代裁剪系数。默认值为`0.11`。
- `--clip-vloss`：是否使用论文中的裁剪价值函数损失。默认启用。
- `--max-grad-norm <float>`：梯度裁剪的最大范数。默认值为`0.5`。
-
-### 环境参数
-
- `--target-num <int>`：目标种类数量。默认值为`4`。
- `--env-timelimit <int>`：每轮的时间限制。默认值为`30`。
- `--base-win-reward <int>`：赢得回合的基础奖励。默认值为`999`。
- `--base-lose-reward <int>`：输掉回合的基础奖励。默认值为`-999`。
- `--target-state-size <int>`：target状态的大小。默认值为`6`。
- `--time-state-size <int>`：游戏剩余时间状态的大小。默认值为`1`。
- `--gun-state-size <int>`：枪状态的大小。默认值为`1`。
- `--my-state-size <int>`：我的状态大小。默认值为`4`。
- `--total-target-size <int>`：总target状态的大小。默认值为`12`。
--- a/Aimbot-PPO-Python/Pytorch/arguments-jp.md
+++ b/Aimbot-PPO-Python/Pytorch/arguments-jp.md
@ -1,52 +0,0 @@
- `--seed <int>`：実験の乱数Seed。デフォルト値は`9331`。
- `--path <str>`：環境パス。デフォルト値は`"./Build/3.6/Aimbot-ParallelEnv"`。
- `--workerID <int>`：Unity Worker ID。デフォルト値は`1`。
- `--baseport <int>`：Unity環境への接続用Port。デフォルト値は`500`。
- `--lr <float>`：Optimizerのデフォルト学習率。デフォルト値は`5e-5`。
- `--cuda`：有効にすると、デフォルトでcudaを使用します。`true`または`false`を渡すことで有効/無効を切り替えられます。
- `--total-timesteps <int>`：実験の合計タイムステップ数。デフォルト値は`3150000`。
-
-### モデルパラメータ
-
- `--train`：モデルを訓練するかどうか。デフォルトで有効。
- `--freeze-viewnet`：ビューネットワーク(raycast)をfreezeする。デフォルトは`False`。
- `--datasetSize <int>`：訓練データセットのサイズ。データセットが十分なデータを集めたら訓練を開始する。デフォルト値は`6000`。
- `--minibatchSize <int>`：minibatchのサイズ。デフォルト値は`512`。
- `--epochs <int>`：epochs。デフォルト値は`3`。
- `--annealLR`：ポリシーとバリューネットワークの学習率を退火するかどうか。デフォルトは`True`。
- `--wandb-track`：wandbでトラッキングするかどうか。デフォルトは`False`。
- `--save-model`：モデルを保存するかどうか。デフォルトは`False`。
- `--wandb-entity <str>`：wandbプロジェクトのエンティティ。デフォルト値は`"koha9"`。
- `--load-dir <str>`：モデルのロードディレクトリ。デフォルト値は`None`。
- `--decision-period <int>`：実際動作を実行する時のタイムステップの間隔。デフォルト値は`1`。
- `--result-broadcast-ratio <float>`：ラウンドに勝った場合の報酬のbroadcast ratio、デフォルト値は`1/30`。
- `--target-lr <float>`：学習率を下げる時の目標値。デフォルト値は`1e-6`。
-
-### 損失関数パラメータ
-
- `--policy-coef <float>`：policy損失の係数。デフォルト値は`[0.8, 0.8, 0.8, 0.8]`。
- `--entropy-coef <float>`：entropy損失の係数。デフォルト値は`[0.05, 0.05, 0.05, 0.05]`。
- `--critic-coef <float>`：critic損失の係数。デフォルト値は`[1.0, 1.0, 1.0, 1.0]`。
- `--loss-coef <float>`：全体の損失の係数。デフォルト値は`[1.0, 1.0, 1.0, 1.0]`。
-
-### GAE損失パラメータ
-
- `--gae`：GAEを使用してアドバンテージを計算するかどうか。デフォルトで有効。
- `--norm-adv`：アドバンテージを正規化するかどうか。デフォルトは`False`。
- `--gamma <float>`：割引因子gamma。デフォルト値は`0.999`。
- `--gaeLambda <float>`：GAEのlambda値。デフォルト値は`0.95`。
- `--clip-coef <float>`：代替クリッピング係数。デフォルト値は`0.11`。
- `--clip-vloss`：論文で述べられている価値関数の損失のクリッピングを使用するかどうか。デフォルトで有効。
- `--max-grad-norm <float>`：勾配のクリッピングの最大ノルム。デフォルト値は`0.5`。
-
-### 環境パラメータ
-
- `--target-num <int>`：Targetの種類数。デフォルト値は`4`。
- `--env-timelimit <int>`：ラウンドごとの時間制限。デフォルト値は`30`。
- `--base-win-reward <int>`：ラウンドに勝った場合の基本報酬。デフォルト値は`999`。
- `--base-lose-reward <int>`：ラウンドに負けた場合の基本報酬。デフォルト値は`-999`。
- `--target-state-size <int>`：Targetの状態サイズ。デフォルト値は`6`。
- `--time-state-size <int>`：ゲームの残り時間の状態サイズ。デフォルト値は`1`。
- `--gun-state-size <int>`：銃の状態サイズ。デフォルト値は`1`。
- `--my-state-size <int>`：自分の状態サイズ。デフォルト値は`4`。
- `--total-target-size <int>`：全Targetの状態サイズ。デフォルト値は`12`。
--- a/Aimbot-PPO-Python/Pytorch/arguments.py
+++ b/Aimbot-PPO-Python/Pytorch/arguments.py
@ -4,7 +4,7 @@ import uuid
 from distutils.util import strtobool

 DEFAULT_SEED = 9331
-ENV_PATH = "../Build/3.6/Aimbot-ParallelEnv"
+ENV_PATH = "../Build/3.5/Aimbot-ParallelEnv"
 WAND_ENTITY = "koha9"
 WORKER_ID = 1
 BASE_PORT = 1000
@ -16,15 +16,15 @@ TOTAL_STEPS = 3150000
 BATCH_SIZE = 512
 MAX_TRAINNING_DATASETS = 6000
 DECISION_PERIOD = 1
-LEARNING_RATE = 5e-5
-GAMMA = 0.999
+LEARNING_RATE = 1.5e-4
+GAMMA = 0.99
 GAE_LAMBDA = 0.95
 EPOCHS = 3
 CLIP_COEF = 0.11
 LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
 POLICY_COEF = [0.8, 0.8, 0.8, 0.8]
 ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
-CRITIC_COEF = [1.0, 1.0, 1.0, 1.0]
+CRITIC_COEF = [0.8, 0.8, 0.8, 0.8]
 TARGET_LEARNING_RATE = 1e-6

 FREEZE_VIEW_NETWORK = False
@ -35,7 +35,7 @@ TRAIN = True
 SAVE_MODEL = True
 WANDB_TACK = True
 LOAD_DIR = None
-# LOAD_DIR = "../PPO-Model/GotoOnly-Level1234_9331_1697122986/8.853553.pt"
+# LOAD_DIR = "../PPO-Model/GotoOnly-Level0123_9331_1696965321/5.1035867.pt"

 # Unity Environment Parameters
 TARGET_STATE_SIZE = 6
--- a/Aimbot-PPO-Python/Pytorch/multiThread-PPO.py
+++ b/Aimbot-PPO-Python/Pytorch/multiThread-PPO.py
@ -0,0 +1,255 @@
+import time
+import numpy as np
+import random
+import uuid
+import torch
+import atexit
+import os
+
+from aimbotEnv import Aimbot
+from aimbotEnv import AimbotSideChannel
+from ppoagent import PPOAgent
+from airecorder import WandbRecorder
+from aimemory import PPOMem
+from aimemory import Targets
+from arguments import parse_args
+from arguments import set_save_model, is_save_model
+import torch.optim as optim
+
+# side channel uuid
+SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
+# tensorboard names
+GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel_V2"
+GAME_TYPE = "GotoOnly-Level0123-new512Model"
+
+if __name__ == "__main__":
+    args = parse_args()
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+
+    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
+    best_reward = -1
+
+    # Initialize the environment, the agent, and the optimizer
+    aimbot_side_channel = AimbotSideChannel(SIDE_CHANNEL_UUID)
+    env = Aimbot(
+        env_path=args.path,
+        worker_id=args.workerID,
+        base_port=args.baseport,
+        side_channels=[aimbot_side_channel])
+    if args.load_dir is None:
+        agent = PPOAgent(
+            env=env,
+            this_args=args,
+            device=device,
+        ).to(device)
+    else:
+        agent = torch.load(args.load_dir)
+        # freeze
+        if args.freeze_viewnet:
+            # freeze the view network
+            print("FREEZE VIEW NETWORK is not compatible with Full MNN!")
+            raise NotImplementedError
+        print("Load Agent", args.load_dir)
+        print(agent.eval())
+    # optimizer
+    optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
+    # Tensorboard and WandB Recorder
+    run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
+    wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
+
+    # start the game
+    total_update_step = args.target_num * args.total_timesteps // args.datasetSize
+    target_steps = [0 for i in range(args.target_num)]
+    start_time = time.time()
+    state, _, done = env.reset()
+
+    # initialize AI memories
+    ppo_memories = PPOMem(
+        args=args,
+        unity_agent_num=env.unity_agent_num,
+        device=device,
+    )
+
+    # MAIN LOOP: run agent in environment
+    for total_steps in range(total_update_step):
+        # anneal the learning rate linearly; it reaches 0 on the last update step
+        if args.annealLR:
+            final_lr_ratio = args.target_lr / args.lr
+            frac = 1.0 - ((total_steps + 1.0) / total_update_step)
+            lr_now = frac * args.lr
+            optimizer.param_groups[0]["lr"] = lr_now
+        else:
+            lr_now = args.lr
+
+        # at the start of each episode, show the current learning rate
+        print("new episode", total_steps, "learning rate = ", lr_now)
+        step = 0
+        training = False
+        train_queue = []
+        last_reward = [0. for i in range(env.unity_agent_num)]
+        # INNER LOOP: step the environment until a training dataset is full (only breaks when args.train is set)
+        while True:
+            # Target type (state[0][0]) is stay (4): use the all-zero action
+            if state[0][0] == 4:
+                next_state, reward, next_done = env.step(env.all_zero_action)
+                state, done = next_state, next_done
+                continue
+            # On a decision step, when the target type (state[0][0]) is not stay (4), let the agent choose the action
+            if step % args.decision_period == 0:
+                step += 1
+                # Choose action by agent
+                with torch.no_grad():
+                    # predict actions
+                    action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
+                        torch.tensor(state,dtype=torch.float32).to(device)
+                    )
+                    value = value.flatten()
+
+                # variable from GPU to CPU
+                action_cpu = action.cpu().numpy()
+                dis_logprob_cpu = dis_logprob.cpu().numpy()
+                con_logprob_cpu = con_logprob.cpu().numpy()
+                value_cpu = value.cpu().numpy()
+                # Environment step
+                next_state, reward, next_done = env.step(action_cpu)
+
+                # save memories
+                if args.train:
+                    ppo_memories.save_memories(
+                        now_step=step,
+                        agent=agent,
+                        state=state,
+                        action_cpu=action_cpu,
+                        dis_logprob_cpu=dis_logprob_cpu,
+                        con_logprob_cpu=con_logprob_cpu,
+                        reward=reward,
+                        done=done,
+                        value_cpu=value_cpu,
+                        last_reward=last_reward,
+                        next_done=next_done,
+                        next_state=next_state,
+                    )
+                    # check if any training dataset is full and ready to train
+                    for i in range(args.target_num):
+                        if ppo_memories.obs[i].size()[0] >= args.datasetSize:
+                            # start train NN
+                            train_queue.append(i)
+                    if len(train_queue) > 0:
+                        # break while loop and start train
+                        break
+                    # update state
+                state, done = next_state, next_done
+            else:
+                step += 1
+                # not a decision step: reuse the last predicted action
+                next_state, reward, next_done = env.step(action_cpu)
+                # save memories
+                if args.train:
+                    ppo_memories.save_memories(
+                        now_step=step,
+                        agent=agent,
+                        state=state,
+                        action_cpu=action_cpu,
+                        dis_logprob_cpu=dis_logprob_cpu,
+                        con_logprob_cpu=con_logprob_cpu,
+                        reward=reward,
+                        done=done,
+                        value_cpu=value_cpu,
+                        last_reward=last_reward,
+                        next_done=next_done,
+                        next_state=next_state,
+                    )
+                    # update state
+                    state = next_state
+                    last_reward = reward
+
+        if args.train:
+            # train mode on
+            mean_reward_list = []  # for WANDB
+            # loop all training queue
+            for this_train_ind in train_queue:
+                # start time
+                start_time = time.time()
+                target_steps[this_train_ind] += 1
+                # train agent
+                (
+                    v_loss,
+                    dis_pg_loss,
+                    con_pg_loss,
+                    loss,
+                    entropy_loss
+                ) = agent.train_net(
+                    this_train_ind=this_train_ind,
+                    ppo_memories=ppo_memories,
+                    optimizer=optimizer
+                )
+                # record mean reward before clear history
+                print("done")
+                target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
+                mean_reward_list.append(target_reward_mean)
+                targetName = Targets(this_train_ind).name
+
+                # clear this target training set buffer
+                ppo_memories.clear_training_datasets(this_train_ind)
+                # record rewards for plotting purposes
+                wdb_recorder.add_target_scalar(
+                    targetName,
+                    this_train_ind,
+                    v_loss,
+                    dis_pg_loss,
+                    con_pg_loss,
+                    loss,
+                    entropy_loss,
+                    target_reward_mean,
+                    target_steps,
+                )
+                print(f"episode over Target{targetName} mean reward:", target_reward_mean)
+            TotalRewardMean = np.mean(mean_reward_list)
+            wdb_recorder.add_global_scalar(
+                TotalRewardMean,
+                optimizer.param_groups[0]["lr"],
+                total_steps,
+            )
+            # print the elapsed time in seconds
+            print("cost time:", time.time() - start_time)
+            # New Record! or save model
+            if ((is_save_model() or TotalRewardMean > best_reward) and args.save_model):
+                # create saveDir if it does not exist yet
+                saveDir = "../PPO-Model/" + run_name + "/"
+                if not os.path.isdir(saveDir):
+                    os.mkdir(saveDir)
+                best_reward = TotalRewardMean
+                torch.save(agent, saveDir + str(TotalRewardMean) + ".pt")
+                print("Model Saved!")
+                set_save_model(False)
+        else:
+            # train mode off
+            mean_reward_list = []  # for WANDB
+            # while not in training mode, clear the buffer
+            for this_train_ind in train_queue:
+                target_steps[this_train_ind] += 1
+                targetName = Targets(this_train_ind).name
+                target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
+                mean_reward_list.append(target_reward_mean)
+                print(target_steps[this_train_ind])
+
+                # clear this target training set buffer
+                ppo_memories.clear_training_datasets(this_train_ind)
+
+                # record rewards for plotting purposes
+                wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", target_reward_mean,
+                                               target_steps[this_train_ind])
+                wdb_recorder.add_win_ratio(targetName, target_steps[this_train_ind])
+                print(f"episode over Target{targetName} mean reward:", target_reward_mean)
+            TotalRewardMean = np.mean(mean_reward_list)
+            wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
+
+    saveDir = "../PPO-Model/" + run_name + "/"
+    if not os.path.isdir(saveDir):
+        os.mkdir(saveDir)
+    best_reward = target_reward_mean
+    torch.save(agent, saveDir + "_last.pt")
+    env.close()
+    wdb_recorder.writer.close()
--- a/Aimbot-PPO-Python/Pytorch/ppoagent.py
+++ b/Aimbot-PPO-Python/Pytorch/ppoagent.py
@ -8,14 +8,14 @@ from aimbotEnv import Aimbot
 from torch.distributions.normal import Normal
 from torch.distributions.categorical import Categorical

+firstLayerNum = 512
+secondLayerNum = 128

 def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    nn.init.orthogonal_(layer.weight, std)
    nn.init.constant_(layer.bias, bias_const)
    return layer

-neural_size_1 = 400
-neural_size_2 = 300

 class PPOAgent(nn.Module):
    def __init__(
@ -33,6 +33,15 @@ class PPOAgent(nn.Module):
        self.unity_action_size = env.unity_action_size
        self.state_size = self.unity_observation_shape[0]
        self.agent_num = env.unity_agent_num
+        self.target_size = self.args.target_state_size
+        self.time_state_size = self.args.time_state_size
+        self.gun_state_size = self.args.gun_state_size
+        self.my_state_size = self.args.my_state_size
+        self.ray_state_size = env.unity_observation_shape[0] - self.args.total_target_size
+        self.state_size_without_ray = self.args.total_target_size
+        self.head_input_size = (
+                env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size
+        )  # except target state input

        self.unity_discrete_type = env.unity_discrete_type
        self.discrete_size = env.unity_discrete_size
@ -42,9 +51,9 @@ class PPOAgent(nn.Module):
        self.hidden_networks = nn.ModuleList(
            [
                nn.Sequential(
-                    layer_init(nn.Linear(self.state_size, neural_size_1)),
+                    layer_init(nn.Linear(self.state_size, firstLayerNum)),
                    nn.LeakyReLU(),
-                    layer_init(nn.Linear(neural_size_1, neural_size_2)),
+                    layer_init(nn.Linear(firstLayerNum, secondLayerNum)),
                    nn.LeakyReLU(),
                    )
                for i in range(self.target_num)
@ -52,16 +61,16 @@ class PPOAgent(nn.Module):
        )

        self.actor_dis = nn.ModuleList(
-            [layer_init(nn.Linear(neural_size_2, self.discrete_size), std=0.5) for i in range(self.target_num)]
+            [layer_init(nn.Linear(secondLayerNum, self.discrete_size), std=0.5) for i in range(self.target_num)]
        )
        self.actor_mean = nn.ModuleList(
-            [layer_init(nn.Linear(neural_size_2, self.continuous_size), std=0) for i in range(self.target_num)]
+            [layer_init(nn.Linear(secondLayerNum, self.continuous_size), std=0) for i in range(self.target_num)]
        )
        self.actor_logstd = nn.ParameterList(
            [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
        )
        self.critic = nn.ModuleList(
-            [layer_init(nn.Linear(neural_size_2, 1), std=0) for i in range(self.target_num)]
+            [layer_init(nn.Linear(secondLayerNum, 1), std=0) for i in range(self.target_num)]
        )

    def get_value(self, state: torch.Tensor):