From 2ea8a5f104ace68af0e2fa0dcc4426b6951c37cc Mon Sep 17 00:00:00 2001 From: Koha9 Date: Thu, 23 Nov 2023 15:25:34 +0900 Subject: [PATCH] MultiThread PPO First Commit --- Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb | 197 ++------------ Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py | 4 +- Aimbot-PPO-Python/Pytorch/arguments.py | 8 +- Aimbot-PPO-Python/Pytorch/multiThread-PPO.py | 255 ++++++++++++++++++ Aimbot-PPO-Python/Pytorch/ppoagent.py | 12 +- 5 files changed, 296 insertions(+), 180 deletions(-) create mode 100644 Aimbot-PPO-Python/Pytorch/multiThread-PPO.py diff --git a/Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb b/Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb index 60f7284..72a4f5a 100644 --- a/Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb +++ b/Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb @@ -81,184 +81,43 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], - "source": [ - "import argparse\n", - "import wandb\n", - "import time\n", - "import numpy as np\n", - "import random\n", - "import uuid\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.optim as optim\n", - "\n", - "from AimbotEnv import Aimbot\n", - "from tqdm import tqdm\n", - "from torch.distributions.normal import Normal\n", - "from torch.distributions.categorical import Categorical\n", - "from distutils.util import strtobool\n", - "from torch.utils.tensorboard import SummaryWriter\n", - "from mlagents_envs.environment import UnityEnvironment\n", - "from mlagents_envs.side_channel.side_channel import (\n", - " SideChannel,\n", - " IncomingMessage,\n", - " OutgoingMessage,\n", - ")\n", - "from typing import List\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "'aaa' object has no attribute 'outa'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[5], line 14\u001b[0m\n\u001b[0;32m 12\u001b[0m asd \u001b[39m=\u001b[39m aaa(outa, outb)\n\u001b[0;32m 13\u001b[0m asd\u001b[39m.\u001b[39mfunc()\n\u001b[1;32m---> 14\u001b[0m \u001b[39mprint\u001b[39m(asd\u001b[39m.\u001b[39;49mouta) \u001b[39m# 输出 100\u001b[39;00m\n", - "\u001b[1;31mAttributeError\u001b[0m: 'aaa' object has no attribute 'outa'" - ] - } - ], - "source": [ - "class aaa():\n", - " def __init__(self, a, b):\n", - " self.a = a\n", - " self.b = b\n", - "\n", - " def func(self):\n", - " global outa\n", - " outa = 100\n", - "\n", - "outa = 1\n", - "outb = 2\n", - "asd = aaa(outa, outb)\n", - "asd.func()\n", - "print(asd.outa) # 输出 100" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "usage: ipykernel_launcher.py [-h] [--seed SEED]\n", - "ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme=\"hmac-sha256\" --Session.key=b\"46ef9317-59fb-4ab6-ae4e-6b35744fc423\" --shell=9002 --transport=\"tcp\" --iopub=9004 --f=c:\\Users\\UCUNI\\AppData\\Roaming\\jupyter\\runtime\\kernel-v2-311926K1uko38tdWb.json\n" - ] - }, - { - "ename": "SystemExit", - "evalue": "2", - "output_type": "error", - "traceback": [ - "An exception has occurred, use %tb to see the full traceback.\n", - "\u001b[1;31mSystemExit\u001b[0m\u001b[1;31m:\u001b[0m 2\n" - ] - } - ], - "source": [ - "import 
argparse\n", - "\n", - "def parse_args():\n", - " parser = argparse.ArgumentParser()\n", - " parser.add_argument(\"--seed\", type=int, default=11,\n", - " help=\"seed of the experiment\")\n", - " args = parser.parse_args()\n", - " return args\n", - "\n", - "arggg = parse_args()\n", - "print(type(arggg))" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "3" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y=\"a;b;c\"\n", - "len(y.split(\";\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[2]\n" + "0\n", + "i = 0\n", + "i = 1\n", + "i = 2\n", + "i = 3\n", + "i = 4\n", + "i = 5\n", + "i = 6\n", + "i = 7\n", + "i = 8\n", + "i = 9\n", + "10\n" ] } ], "source": [ - "a = np.array([1,2,3,4])\n", - "print(a[[False,True,False,False]])" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{1, 2, 3, 4}" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "a = {1,2,3}\n", - "a.add(4)\n", - "a" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([3, 4])" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "a = np.array([[1,3],[2,4]])\n", - "a.max(axis=1)\n" + "import threading\n", + "\n", + "num = 0\n", + "\n", + "def print_numers():\n", + " global num\n", + " for i in range(10):\n", + " num +=1\n", + " print(\"i = \",i)\n", + "\n", + "thread = threading.Thread(target=print_numers)\n", + "\n", + "print(num)\n", + "thread.start()\n", + "thread.join()\n", + "print(num)" ] } ], diff --git a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py index 84c56af..74867d2 100644 --- a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py +++ b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py @@ -19,8 +19,8 @@ import torch.optim as optim # side channel uuid SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e") # tensorboard names -GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel" -GAME_TYPE = "GotoOnly-Level2345" +GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel_V2" +GAME_TYPE = "GotoOnly-Level0123-new512Model" if __name__ == "__main__": args = parse_args() diff --git a/Aimbot-PPO-Python/Pytorch/arguments.py b/Aimbot-PPO-Python/Pytorch/arguments.py index bcfe8fb..0552cb9 100644 --- a/Aimbot-PPO-Python/Pytorch/arguments.py +++ b/Aimbot-PPO-Python/Pytorch/arguments.py @@ -4,7 +4,7 @@ import uuid from distutils.util import strtobool DEFAULT_SEED = 9331 -ENV_PATH = "../Build/3.4/Aimbot-ParallelEnv" +ENV_PATH = "../Build/3.5/Aimbot-ParallelEnv" WAND_ENTITY = "koha9" WORKER_ID = 1 BASE_PORT = 1000 @@ -22,9 +22,9 @@ GAE_LAMBDA = 0.95 EPOCHS = 3 CLIP_COEF = 0.11 LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence -POLICY_COEF = [1.0, 1.0, 1.0, 1.0] +POLICY_COEF = [0.8, 0.8, 0.8, 0.8] ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05] -CRITIC_COEF = [0.5, 0.5, 0.5, 0.5] +CRITIC_COEF = [0.8, 0.8, 0.8, 0.8] TARGET_LEARNING_RATE = 1e-6 FREEZE_VIEW_NETWORK = False @@ -35,7 +35,7 @@ TRAIN = True SAVE_MODEL = True WANDB_TACK = True LOAD_DIR = None -LOAD_DIR = "../PPO-Model/GotoOnly-Level1234_9331_1697122986/8.853553.pt" +# LOAD_DIR = "../PPO-Model/GotoOnly-Level0123_9331_1696965321/5.1035867.pt" 
# Unity Environment Parameters TARGET_STATE_SIZE = 6 diff --git a/Aimbot-PPO-Python/Pytorch/multiThread-PPO.py b/Aimbot-PPO-Python/Pytorch/multiThread-PPO.py new file mode 100644 index 0000000..74867d2 --- /dev/null +++ b/Aimbot-PPO-Python/Pytorch/multiThread-PPO.py @@ -0,0 +1,255 @@ +import time +import numpy as np +import random +import uuid +import torch +import atexit +import os + +from aimbotEnv import Aimbot +from aimbotEnv import AimbotSideChannel +from ppoagent import PPOAgent +from airecorder import WandbRecorder +from aimemory import PPOMem +from aimemory import Targets +from arguments import parse_args +from arguments import set_save_model, is_save_model +import torch.optim as optim + +# side channel uuid +SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e") +# tensorboard names +GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel_V2" +GAME_TYPE = "GotoOnly-Level0123-new512Model" + +if __name__ == "__main__": + args = parse_args() + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + + device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + best_reward = -1 + + # Initialize environment agent optimizer + aimbot_side_channel = AimbotSideChannel(SIDE_CHANNEL_UUID) + env = Aimbot( + env_path=args.path, + worker_id=args.workerID, + base_port=args.baseport, + side_channels=[aimbot_side_channel]) + if args.load_dir is None: + agent = PPOAgent( + env=env, + this_args=args, + device=device, + ).to(device) + else: + agent = torch.load(args.load_dir) + # freeze + if args.freeze_viewnet: + # freeze the view network + print("FREEZE VIEW NETWORK is not compatible with Full MNN!") + raise NotImplementedError + print("Load Agent", args.load_dir) + print(agent.eval()) + # optimizer + optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5) + # Tensorboard and WandB Recorder + run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}" + wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args) + + # start the game + total_update_step = args.target_num * args.total_timesteps // args.datasetSize + target_steps = [0 for i in range(args.target_num)] + start_time = time.time() + state, _, done = env.reset() + + # initialize AI memories + ppo_memories = PPOMem( + args=args, + unity_agent_num=env.unity_agent_num, + device=device, + ) + + # MAIN LOOP: run agent in environment + for total_steps in range(total_update_step): + # discount learning rate, while step == total_update_step lr will be 0 + if args.annealLR: + final_lr_ratio = args.target_lr / args.lr + frac = 1.0 - ((total_steps + 1.0) / total_update_step) + lr_now = frac * args.lr + optimizer.param_groups[0]["lr"] = lr_now + else: + lr_now = args.lr + + # episode start show learning rate + print("new episode", total_steps, "learning rate = ", lr_now) + step = 0 + training = False + train_queue = [] + last_reward = [0. 
for i in range(env.unity_agent_num)] + # MAIN LOOP: run agent in environment + while True: + # Target Type(state[0][0]) is stay(4),use all zero action + if state[0][0] == 4: + next_state, reward, next_done = env.step(env.all_zero_action) + state, done = next_state, next_done + continue + # On decision point, and Target Type(state[0][0]) is not stay(4) choose action by agent + if step % args.decision_period == 0: + step += 1 + # Choose action by agent + with torch.no_grad(): + # predict actions + action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value( + torch.tensor(state,dtype=torch.float32).to(device) + ) + value = value.flatten() + + # variable from GPU to CPU + action_cpu = action.cpu().numpy() + dis_logprob_cpu = dis_logprob.cpu().numpy() + con_logprob_cpu = con_logprob.cpu().numpy() + value_cpu = value.cpu().numpy() + # Environment step + next_state, reward, next_done = env.step(action_cpu) + + # save memories + if args.train: + ppo_memories.save_memories( + now_step=step, + agent=agent, + state=state, + action_cpu=action_cpu, + dis_logprob_cpu=dis_logprob_cpu, + con_logprob_cpu=con_logprob_cpu, + reward=reward, + done=done, + value_cpu=value_cpu, + last_reward=last_reward, + next_done=next_done, + next_state=next_state, + ) + # check if any training dataset is full and ready to train + for i in range(args.target_num): + if ppo_memories.obs[i].size()[0] >= args.datasetSize: + # start train NN + train_queue.append(i) + if len(train_queue) > 0: + # break while loop and start train + break + # update state + state, done = next_state, next_done + else: + step += 1 + # skip this step use last predict action + next_state, reward, next_done = env.step(action_cpu) + # save memories + if args.train: + ppo_memories.save_memories( + now_step=step, + agent=agent, + state=state, + action_cpu=action_cpu, + dis_logprob_cpu=dis_logprob_cpu, + con_logprob_cpu=con_logprob_cpu, + reward=reward, + done=done, + value_cpu=value_cpu, + last_reward=last_reward, + next_done=next_done, + next_state=next_state, + ) + # update state + state = next_state + last_reward = reward + + if args.train: + # train mode on + mean_reward_list = [] # for WANDB + # loop all training queue + for this_train_ind in train_queue: + # start time + start_time = time.time() + target_steps[this_train_ind] += 1 + # train agent + ( + v_loss, + dis_pg_loss, + con_pg_loss, + loss, + entropy_loss + ) = agent.train_net( + this_train_ind=this_train_ind, + ppo_memories=ppo_memories, + optimizer=optimizer + ) + # record mean reward before clear history + print("done") + target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy()) + mean_reward_list.append(target_reward_mean) + targetName = Targets(this_train_ind).name + + # clear this target training set buffer + ppo_memories.clear_training_datasets(this_train_ind) + # record rewards for plotting purposes + wdb_recorder.add_target_scalar( + targetName, + this_train_ind, + v_loss, + dis_pg_loss, + con_pg_loss, + loss, + entropy_loss, + target_reward_mean, + target_steps, + ) + print(f"episode over Target{targetName} mean reward:", target_reward_mean) + TotalRewardMean = np.mean(mean_reward_list) + wdb_recorder.add_global_scalar( + TotalRewardMean, + optimizer.param_groups[0]["lr"], + total_steps, + ) + # print cost time as seconds + print("cost time:", time.time() - start_time) + # New Record! 
or save model + if ((is_save_model() or TotalRewardMean > best_reward) and args.save_model): + # check saveDir is exist + saveDir = "../PPO-Model/" + run_name + "/" + if not os.path.isdir(saveDir): + os.mkdir(saveDir) + best_reward = TotalRewardMean + torch.save(agent, saveDir + str(TotalRewardMean) + ".pt") + print("Model Saved!") + set_save_model(False) + else: + # train mode off + mean_reward_list = [] # for WANDB + # while not in training mode, clear the buffer + for this_train_ind in train_queue: + target_steps[this_train_ind] += 1 + targetName = Targets(this_train_ind).name + target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy()) + mean_reward_list.append(target_reward_mean) + print(target_steps[this_train_ind]) + + # clear this target training set buffer + ppo_memories.clear_training_datasets(this_train_ind) + + # record rewards for plotting purposes + wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", target_reward_mean, + target_steps[this_train_ind]) + wdb_recorder.add_win_ratio(targetName, target_steps[this_train_ind]) + print(f"episode over Target{targetName} mean reward:", target_reward_mean) + TotalRewardMean = np.mean(mean_reward_list) + wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps) + + saveDir = "../PPO-Model/" + run_name + "/" + if not os.path.isdir(saveDir): + os.mkdir(saveDir) + best_reward = target_reward_mean + torch.save(agent, saveDir + "_last.pt") + env.close() + wdb_recorder.writer.close() diff --git a/Aimbot-PPO-Python/Pytorch/ppoagent.py b/Aimbot-PPO-Python/Pytorch/ppoagent.py index 74232a7..2554f5b 100644 --- a/Aimbot-PPO-Python/Pytorch/ppoagent.py +++ b/Aimbot-PPO-Python/Pytorch/ppoagent.py @@ -8,6 +8,8 @@ from aimbotEnv import Aimbot from torch.distributions.normal import Normal from torch.distributions.categorical import Categorical +firstLayerNum = 512 +secondLayerNum = 128 def layer_init(layer, std=np.sqrt(2), bias_const=0.0): nn.init.orthogonal_(layer.weight, std) @@ -49,9 +51,9 @@ class PPOAgent(nn.Module): self.hidden_networks = nn.ModuleList( [ nn.Sequential( - layer_init(nn.Linear(self.state_size, 256)), + layer_init(nn.Linear(self.state_size, firstLayerNum)), nn.LeakyReLU(), - layer_init(nn.Linear(256, 128)), + layer_init(nn.Linear(firstLayerNum, secondLayerNum)), nn.LeakyReLU(), ) for i in range(self.target_num) @@ -59,16 +61,16 @@ class PPOAgent(nn.Module): ) self.actor_dis = nn.ModuleList( - [layer_init(nn.Linear(128, self.discrete_size), std=0.5) for i in range(self.target_num)] + [layer_init(nn.Linear(secondLayerNum, self.discrete_size), std=0.5) for i in range(self.target_num)] ) self.actor_mean = nn.ModuleList( - [layer_init(nn.Linear(128, self.continuous_size), std=0) for i in range(self.target_num)] + [layer_init(nn.Linear(secondLayerNum, self.continuous_size), std=0) for i in range(self.target_num)] ) self.actor_logstd = nn.ParameterList( [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)] ) self.critic = nn.ModuleList( - [layer_init(nn.Linear(128, 1), std=0) for i in range(self.target_num)] + [layer_init(nn.Linear(secondLayerNum, 1), std=0) for i in range(self.target_num)] ) def get_value(self, state: torch.Tensor):
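
A note on the hyper-parameter changes in arguments.py above: POLICY_COEF drops from 1.0 to 0.8 and CRITIC_COEF rises from 0.5 to 0.8 for all four targets (free, go, attack, defence), while ENV_PATH moves to the 3.5 build and the previous LOAD_DIR checkpoint is commented out so training starts from a fresh agent. How these per-target lists enter the loss is decided inside PPOAgent.train_net, which this diff does not show; the sketch below is only an assumption about the conventional PPO-style weighting they suggest, and combine_ppo_losses is an illustrative name, not a function in this repository.

from typing import Sequence

def combine_ppo_losses(target_ind: int,
                       dis_pg_loss: float,
                       con_pg_loss: float,
                       entropy_loss: float,
                       v_loss: float,
                       policy_coef: Sequence[float],
                       entropy_coef: Sequence[float],
                       critic_coef: Sequence[float],
                       loss_coef: Sequence[float]) -> float:
    # Illustrative per-target weighting; the real combination lives in PPOAgent.train_net.
    policy_term = policy_coef[target_ind] * (dis_pg_loss + con_pg_loss)
    entropy_term = entropy_coef[target_ind] * entropy_loss  # subtracted to reward exploration
    value_term = critic_coef[target_ind] * v_loss
    return loss_coef[target_ind] * (policy_term - entropy_term + value_term)

# With the new defaults, e.g. for the "go" target (index 1):
# combine_ppo_losses(1, dis_pg, con_pg, ent, v, [0.8]*4, [0.05]*4, [0.8]*4, [1.0]*4)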
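
In multiThread-PPO.py, the anneal branch computes final_lr_ratio = args.target_lr / args.lr but never uses it, so lr_now decays linearly to 0 on the final update (as the in-line comment notes) rather than bottoming out at args.target_lr. If the intent is to floor the schedule at TARGET_LEARNING_RATE — an assumption about intent, not the author's stated design — the fraction can interpolate between 1 and final_lr_ratio instead:

def anneal_lr(update_idx: int, total_updates: int, lr: float, target_lr: float) -> float:
    """Linearly interpolate the learning rate from lr down to target_lr."""
    final_lr_ratio = target_lr / lr
    progress = (update_idx + 1.0) / total_updates       # grows from 1/total_updates to 1.0
    frac = 1.0 - progress * (1.0 - final_lr_ratio)      # shrinks from ~1.0 to final_lr_ratio
    return frac * lr

# On the last update (update_idx == total_updates - 1) this returns target_lr instead of 0:
# optimizer.param_groups[0]["lr"] = anneal_lr(total_steps, total_update_step, args.lr, args.target_lr)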
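
Finally, on the commit's title: multiThread-PPO.py as added here still collects rollouts and trains sequentially on one thread; only the reworked cell in test2.ipynb exercises threading.Thread with start()/join(). The sketch below is one possible direction, not the author's implementation: a background worker drains a queue of full target buffers and runs the update while the main thread keeps stepping the environment. The training_worker function, the train_requests queue, and the agent_lock are hypothetical names, and the sketch assumes PPOAgent.train_net can safely run concurrently with get_actions_value — plus the usual caveat that the GIL limits the benefit to overlapping GPU-bound updates with environment I/O.

import queue
import threading

train_requests: "queue.Queue[int]" = queue.Queue()  # indices of targets whose buffer is full
agent_lock = threading.Lock()                       # guards shared agent parameters

def training_worker(agent, ppo_memories, optimizer):
    # Hypothetical worker: train one target per request until a None sentinel arrives.
    while True:
        target_ind = train_requests.get()           # blocks until the main loop enqueues work
        if target_ind is None:
            break
        with agent_lock:                            # serialize updates against action inference
            agent.train_net(this_train_ind=target_ind,
                            ppo_memories=ppo_memories,
                            optimizer=optimizer)
            ppo_memories.clear_training_datasets(target_ind)

# In the main loop: start the worker once, and when ppo_memories.obs[i] reaches
# args.datasetSize call train_requests.put(i) instead of training in-line.
# worker = threading.Thread(target=training_worker, args=(agent, ppo_memories, optimizer), daemon=True)
# worker.start()
# ...
# train_requests.put(None); worker.join()   # clean shutdown before env.close()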