Compare commits

...

1 Commit

Author SHA1 Message Date
2ea8a5f104 MultiThread PPO First Commit 2023-11-23 15:25:34 +09:00
5 changed files with 296 additions and 180 deletions

View File

@@ -81,184 +81,43 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import argparse\n",
"import wandb\n",
"import time\n",
"import numpy as np\n",
"import random\n",
"import uuid\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"\n",
"from AimbotEnv import Aimbot\n",
"from tqdm import tqdm\n",
"from torch.distributions.normal import Normal\n",
"from torch.distributions.categorical import Categorical\n",
"from distutils.util import strtobool\n",
"from torch.utils.tensorboard import SummaryWriter\n",
"from mlagents_envs.environment import UnityEnvironment\n",
"from mlagents_envs.side_channel.side_channel import (\n",
" SideChannel,\n",
" IncomingMessage,\n",
" OutgoingMessage,\n",
")\n",
"from typing import List\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'aaa' object has no attribute 'outa'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[5], line 14\u001b[0m\n\u001b[0;32m 12\u001b[0m asd \u001b[39m=\u001b[39m aaa(outa, outb)\n\u001b[0;32m 13\u001b[0m asd\u001b[39m.\u001b[39mfunc()\n\u001b[1;32m---> 14\u001b[0m \u001b[39mprint\u001b[39m(asd\u001b[39m.\u001b[39;49mouta) \u001b[39m# 输出 100\u001b[39;00m\n",
"\u001b[1;31mAttributeError\u001b[0m: 'aaa' object has no attribute 'outa'"
]
}
],
"source": [
"class aaa():\n",
" def __init__(self, a, b):\n",
" self.a = a\n",
" self.b = b\n",
"\n",
" def func(self):\n",
" global outa\n",
" outa = 100\n",
"\n",
"outa = 1\n",
"outb = 2\n",
"asd = aaa(outa, outb)\n",
"asd.func()\n",
"print(asd.outa) # 输出 100"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"usage: ipykernel_launcher.py [-h] [--seed SEED]\n",
"ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme=\"hmac-sha256\" --Session.key=b\"46ef9317-59fb-4ab6-ae4e-6b35744fc423\" --shell=9002 --transport=\"tcp\" --iopub=9004 --f=c:\\Users\\UCUNI\\AppData\\Roaming\\jupyter\\runtime\\kernel-v2-311926K1uko38tdWb.json\n"
]
},
{
"ename": "SystemExit",
"evalue": "2",
"output_type": "error",
"traceback": [
"An exception has occurred, use %tb to see the full traceback.\n",
"\u001b[1;31mSystemExit\u001b[0m\u001b[1;31m:\u001b[0m 2\n"
]
}
],
"source": [
"import argparse\n",
"\n",
"def parse_args():\n",
" parser = argparse.ArgumentParser()\n",
" parser.add_argument(\"--seed\", type=int, default=11,\n",
" help=\"seed of the experiment\")\n",
" args = parser.parse_args()\n",
" return args\n",
"\n",
"arggg = parse_args()\n",
"print(type(arggg))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y=\"a;b;c\"\n",
"len(y.split(\";\"))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2]\n"
"0\n",
"i = 0\n",
"i = 1\n",
"i = 2\n",
"i = 3\n",
"i = 4\n",
"i = 5\n",
"i = 6\n",
"i = 7\n",
"i = 8\n",
"i = 9\n",
"10\n"
]
}
],
"source": [
"a = np.array([1,2,3,4])\n",
"print(a[[False,True,False,False]])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{1, 2, 3, 4}"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a = {1,2,3}\n",
"a.add(4)\n",
"a"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([3, 4])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a = np.array([[1,3],[2,4]])\n",
"a.max(axis=1)\n"
"import threading\n",
"\n",
"num = 0\n",
"\n",
"def print_numers():\n",
" global num\n",
" for i in range(10):\n",
" num +=1\n",
" print(\"i = \",i)\n",
"\n",
"thread = threading.Thread(target=print_numers)\n",
"\n",
"print(num)\n",
"thread.start()\n",
"thread.join()\n",
"print(num)"
]
}
],

View File

@@ -19,8 +19,8 @@ import torch.optim as optim
# side channel uuid
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
# tensorboard names
GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel"
GAME_TYPE = "GotoOnly-Level2345"
GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel_V2"
GAME_TYPE = "GotoOnly-Level0123-new512Model"
if __name__ == "__main__":
args = parse_args()

View File

@@ -4,7 +4,7 @@ import uuid
from distutils.util import strtobool
DEFAULT_SEED = 9331
ENV_PATH = "../Build/3.4/Aimbot-ParallelEnv"
ENV_PATH = "../Build/3.5/Aimbot-ParallelEnv"
WAND_ENTITY = "koha9"
WORKER_ID = 1
BASE_PORT = 1000
@@ -22,9 +22,9 @@ GAE_LAMBDA = 0.95
EPOCHS = 3
CLIP_COEF = 0.11
LOSS_COEF = [1.0, 1.0, 1.0, 1.0]  # per-target: free, go, attack, defence
POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
POLICY_COEF = [0.8, 0.8, 0.8, 0.8]
ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
CRITIC_COEF = [0.8, 0.8, 0.8, 0.8]
TARGET_LEARNING_RATE = 1e-6
FREEZE_VIEW_NETWORK = False
@@ -35,7 +35,7 @@ TRAIN = True
SAVE_MODEL = True
WANDB_TACK = True
LOAD_DIR = None
LOAD_DIR = "../PPO-Model/GotoOnly-Level1234_9331_1697122986/8.853553.pt"
# LOAD_DIR = "../PPO-Model/GotoOnly-Level0123_9331_1696965321/5.1035867.pt"
# Unity Environment Parameters
TARGET_STATE_SIZE = 6

View File

@@ -0,0 +1,255 @@
import time
import numpy as np
import random
import uuid
import torch
import atexit
import os
from aimbotEnv import Aimbot
from aimbotEnv import AimbotSideChannel
from ppoagent import PPOAgent
from airecorder import WandbRecorder
from aimemory import PPOMem
from aimemory import Targets
from arguments import parse_args
from arguments import set_save_model, is_save_model
import torch.optim as optim
# side channel uuid
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
# tensorboard names
GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel_V2"
GAME_TYPE = "GotoOnly-Level0123-new512Model"
if __name__ == "__main__":
args = parse_args()
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
best_reward = -1
# Initialize environment, agent, and optimizer
aimbot_side_channel = AimbotSideChannel(SIDE_CHANNEL_UUID)
env = Aimbot(
env_path=args.path,
worker_id=args.workerID,
base_port=args.baseport,
side_channels=[aimbot_side_channel])
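# Aimbot wraps the Unity environment; AimbotSideChannel (registered under SIDE_CHANNEL_UUID) carries side-channel messages between the game and this script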
if args.load_dir is None:
agent = PPOAgent(
env=env,
this_args=args,
device=device,
).to(device)
else:
agent = torch.load(args.load_dir)
# freeze
if args.freeze_viewnet:
# freeze the view network
print("FREEZE VIEW NETWORK is not compatible with Full MNN!")
raise NotImplementedError
print("Load Agent", args.load_dir)
print(agent.eval())
# optimizer
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
# Tensorboard and WandB Recorder
run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
# start the game
total_update_step = args.target_num * args.total_timesteps // args.datasetSize
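# each update consumes args.datasetSize samples of a single target, so the total number of updates is target_num * total_timesteps / datasetSize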
target_steps = [0 for i in range(args.target_num)]
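# per-target update counter, used as the step axis for the per-target scalars logged below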
start_time = time.time()
state, _, done = env.reset()
# initialize AI memories
ppo_memories = PPOMem(
args=args,
unity_agent_num=env.unity_agent_num,
device=device,
)
# MAIN LOOP: run agent in environment
for total_steps in range(total_update_step):
# anneal the learning rate; on the final update (total_steps + 1 == total_update_step) the lr reaches 0
if args.annealLR:
final_lr_ratio = args.target_lr / args.lr
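# note: final_lr_ratio is computed but not used below; the decay is purely linear from args.lr toward 0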
frac = 1.0 - ((total_steps + 1.0) / total_update_step)
lr_now = frac * args.lr
optimizer.param_groups[0]["lr"] = lr_now
else:
lr_now = args.lr
# show the learning rate at the start of each episode
print("new episode", total_steps, "learning rate = ", lr_now)
step = 0
training = False
train_queue = []
last_reward = [0. for i in range(env.unity_agent_num)]
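# per-agent reward from the previous step, passed to PPOMem.save_memories alongside the current reward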
# MAIN LOOP: run agent in environment
while True:
# if the target type (state[0][0]) is stay (4), send the all-zero action and skip the agent
if state[0][0] == 4:
next_state, reward, next_done = env.step(env.all_zero_action)
state, done = next_state, next_done
continue
# at a decision point (and the target type is not stay), let the agent choose the action
if step % args.decision_period == 0:
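# decision step: query the policy for a fresh action; the else branch below repeats the previous action between decision points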
step += 1
# Choose action by agent
with torch.no_grad():
# predict actions
action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
torch.tensor(state,dtype=torch.float32).to(device)
)
value = value.flatten()
# variable from GPU to CPU
action_cpu = action.cpu().numpy()
dis_logprob_cpu = dis_logprob.cpu().numpy()
con_logprob_cpu = con_logprob.cpu().numpy()
value_cpu = value.cpu().numpy()
# Environment step
next_state, reward, next_done = env.step(action_cpu)
# save memories
if args.train:
ppo_memories.save_memories(
now_step=step,
agent=agent,
state=state,
action_cpu=action_cpu,
dis_logprob_cpu=dis_logprob_cpu,
con_logprob_cpu=con_logprob_cpu,
reward=reward,
done=done,
value_cpu=value_cpu,
last_reward=last_reward,
next_done=next_done,
next_state=next_state,
)
# check if any training dataset is full and ready to train
for i in range(args.target_num):
if ppo_memories.obs[i].size()[0] >= args.datasetSize:
# start train NN
train_queue.append(i)
if len(train_queue) > 0:
# break while loop and start train
break
# update state
state, done = next_state, next_done
else:
step += 1
# between decision points: repeat the last predicted action
next_state, reward, next_done = env.step(action_cpu)
# save memories
if args.train:
ppo_memories.save_memories(
now_step=step,
agent=agent,
state=state,
action_cpu=action_cpu,
dis_logprob_cpu=dis_logprob_cpu,
con_logprob_cpu=con_logprob_cpu,
reward=reward,
done=done,
value_cpu=value_cpu,
last_reward=last_reward,
next_done=next_done,
next_state=next_state,
)
# update state
state = next_state
last_reward = reward
if args.train:
# train mode on
mean_reward_list = [] # for WANDB
# loop all training queue
for this_train_ind in train_queue:
# start time
start_time = time.time()
target_steps[this_train_ind] += 1
# train agent
(
v_loss,
dis_pg_loss,
con_pg_loss,
loss,
entropy_loss
) = agent.train_net(
this_train_ind=this_train_ind,
ppo_memories=ppo_memories,
optimizer=optimizer
)
# record mean reward before clear history
print("done")
target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
mean_reward_list.append(target_reward_mean)
targetName = Targets(this_train_ind).name
# clear this target training set buffer
ppo_memories.clear_training_datasets(this_train_ind)
# record rewards for plotting purposes
wdb_recorder.add_target_scalar(
targetName,
this_train_ind,
v_loss,
dis_pg_loss,
con_pg_loss,
loss,
entropy_loss,
target_reward_mean,
target_steps,
)
print(f"episode over Target{targetName} mean reward:", target_reward_mean)
TotalRewardMean = np.mean(mean_reward_list)
wdb_recorder.add_global_scalar(
TotalRewardMean,
optimizer.param_groups[0]["lr"],
total_steps,
)
# print cost time as seconds
print("cost time:", time.time() - start_time)
# New Record! or save model
if ((is_save_model() or TotalRewardMean > best_reward) and args.save_model):
# make sure saveDir exists
saveDir = "../PPO-Model/" + run_name + "/"
if not os.path.isdir(saveDir):
os.mkdir(saveDir)
best_reward = TotalRewardMean
torch.save(agent, saveDir + str(TotalRewardMean) + ".pt")
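# the whole agent module is pickled here, which is why loading above uses torch.load(args.load_dir) directly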
print("Model Saved!")
set_save_model(False)
else:
# train mode off
mean_reward_list = [] # for WANDB
# while not in training mode, clear the buffer
for this_train_ind in train_queue:
target_steps[this_train_ind] += 1
targetName = Targets(this_train_ind).name
target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
mean_reward_list.append(target_reward_mean)
print(target_steps[this_train_ind])
# clear this target training set buffer
ppo_memories.clear_training_datasets(this_train_ind)
# record rewards for plotting purposes
wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", target_reward_mean,
target_steps[this_train_ind])
wdb_recorder.add_win_ratio(targetName, target_steps[this_train_ind])
print(f"episode over Target{targetName} mean reward:", target_reward_mean)
TotalRewardMean = np.mean(mean_reward_list)
wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
saveDir = "../PPO-Model/" + run_name + "/"
if not os.path.isdir(saveDir):
os.mkdir(saveDir)
best_reward = target_reward_mean
torch.save(agent, saveDir + "_last.pt")
env.close()
wdb_recorder.writer.close()

View File

@@ -8,6 +8,8 @@ from aimbotEnv import Aimbot
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
firstLayerNum = 512
secondLayerNum = 128
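# hidden layer widths shared by every target-specific sub-network (the first layer was widened from 256 to 512)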
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
nn.init.orthogonal_(layer.weight, std)
@@ -49,9 +51,9 @@ class PPOAgent(nn.Module):
self.hidden_networks = nn.ModuleList(
[
nn.Sequential(
layer_init(nn.Linear(self.state_size, 256)),
layer_init(nn.Linear(self.state_size, firstLayerNum)),
nn.LeakyReLU(),
layer_init(nn.Linear(256, 128)),
layer_init(nn.Linear(firstLayerNum, secondLayerNum)),
nn.LeakyReLU(),
)
for i in range(self.target_num)
@@ -59,16 +61,16 @@
)
self.actor_dis = nn.ModuleList(
[layer_init(nn.Linear(128, self.discrete_size), std=0.5) for i in range(self.target_num)]
[layer_init(nn.Linear(secondLayerNum, self.discrete_size), std=0.5) for i in range(self.target_num)]
)
self.actor_mean = nn.ModuleList(
[layer_init(nn.Linear(128, self.continuous_size), std=0) for i in range(self.target_num)]
[layer_init(nn.Linear(secondLayerNum, self.continuous_size), std=0) for i in range(self.target_num)]
)
self.actor_logstd = nn.ParameterList(
[nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
)
self.critic = nn.ModuleList(
[layer_init(nn.Linear(128, 1), std=0) for i in range(self.target_num)]
[layer_init(nn.Linear(secondLayerNum, 1), std=0) for i in range(self.target_num)]
)
def get_value(self, state: torch.Tensor):