From a21fd724af84ccf50f94a012a901f816bfeed1bb Mon Sep 17 00:00:00 2001 From: Koha9 Date: Sat, 22 Jul 2023 19:26:39 +0900 Subject: [PATCH] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E6=95=B4=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 分离ppoagent,AI memory,AI Recorder 优化Aimbot Env 正规化各类命名 Archive不使用的package --- .vscode/settings.json | 4 +- Aimbot-PPO-Python/Pytorch/AimbotEnv.py | 107 +++++-- .../Pytorch/{ => Archive}/AimBotEnv-old.py | 0 .../Pytorch/{ => Archive}/graph.py | 0 .../Pytorch/{ => Archive}/ppo.py | 0 .../Pytorch/{ => Archive}/test2.ipynb | 57 ++++ .../Pytorch/{ => Archive}/testEnv.py | 0 .../Pytorch/{ => Archive}/testarea.ipynb | 0 Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py | 291 ++++++------------ Aimbot-PPO-Python/Pytorch/aimemory.py | 146 +++++++++ Aimbot-PPO-Python/Pytorch/ppoagent.py | 173 ++++------- 11 files changed, 438 insertions(+), 340 deletions(-) rename Aimbot-PPO-Python/Pytorch/{ => Archive}/AimBotEnv-old.py (100%) rename Aimbot-PPO-Python/Pytorch/{ => Archive}/graph.py (100%) rename Aimbot-PPO-Python/Pytorch/{ => Archive}/ppo.py (100%) rename Aimbot-PPO-Python/Pytorch/{ => Archive}/test2.ipynb (89%) rename Aimbot-PPO-Python/Pytorch/{ => Archive}/testEnv.py (100%) rename Aimbot-PPO-Python/Pytorch/{ => Archive}/testarea.ipynb (100%) create mode 100644 Aimbot-PPO-Python/Pytorch/aimemory.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 26df38b..780162f 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,5 @@ { - "python.linting.enabled": false + "python.linting.enabled": false, + "python.analysis.typeCheckingMode": "off", + "commentTranslate.source": "intellsmi.deepl-translate-deepl" } \ No newline at end of file diff --git a/Aimbot-PPO-Python/Pytorch/AimbotEnv.py b/Aimbot-PPO-Python/Pytorch/AimbotEnv.py index 0ca1a59..1a4baca 100644 --- a/Aimbot-PPO-Python/Pytorch/AimbotEnv.py +++ b/Aimbot-PPO-Python/Pytorch/AimbotEnv.py @@ -1,9 +1,16 @@ import gym import numpy as np - +import uuid +import airecorder from numpy import ndarray from mlagents_envs.base_env import ActionTuple from mlagents_envs.environment import UnityEnvironment +from typing import Tuple, List +from mlagents_envs.side_channel.side_channel import ( + SideChannel, + IncomingMessage, + OutgoingMessage, +) class Aimbot(gym.Env): @@ -61,7 +68,7 @@ class Aimbot(gym.Env): # agents number self.unity_agent_num = len(self.unity_agent_IDS) - def reset(self): + def reset(self)->Tuple[np.ndarray, List, List]: """reset enviroment and get observations Returns: @@ -69,7 +76,7 @@ class Aimbot(gym.Env): """ # reset env self.env.reset() - nextState, reward, done = self.getSteps() + nextState, reward, done = self.get_steps() return nextState, reward, done # TODO: @@ -80,7 +87,7 @@ class Aimbot(gym.Env): def step( self, actions: ndarray, - ): + )->Tuple[np.ndarray, List, List]: """change ations list to ActionTuple then send it to enviroment Args: @@ -114,10 +121,10 @@ class Aimbot(gym.Env): self.env.set_actions(behavior_name=self.unity_beha_name, action=thisActionTuple) self.env.step() # get nextState & reward & done after this action - nextStates, rewards, dones = self.getSteps() + nextStates, rewards, dones = self.get_steps() return nextStates, rewards, dones - def getSteps(self): + def get_steps(self)->Tuple[np.ndarray, List, List]: """get enviroment now observations. 
Include State, Reward, Done @@ -127,28 +134,92 @@ class Aimbot(gym.Env): ndarray: nextState, reward, done """ # get nextState & reward & done - decisionSteps, terminalSteps = self.env.get_steps(self.unity_beha_name) - nextStates = [] + decision_steps, terminal_steps = self.env.get_steps(self.unity_beha_name) + next_states = [] dones = [] rewards = [] - for thisAgentID in self.unity_agent_IDS: + for this_agent_ID in self.unity_agent_IDS: # while Episode over agentID will both in decisionSteps and terminalSteps. # avoid redundant state and reward, # use agentExist toggle to check if agent is already exist. - agentExist = False + agent_exist = False # game done - if thisAgentID in terminalSteps: - nextStates.append(terminalSteps[thisAgentID].obs[0]) + if this_agent_ID in terminal_steps: + next_states.append(terminal_steps[this_agent_ID].obs[0]) dones.append(True) - rewards.append(terminalSteps[thisAgentID].reward) - agentExist = True + rewards.append(terminal_steps[this_agent_ID].reward) + agent_exist = True # game not over yet and agent not in terminalSteps - if (thisAgentID in decisionSteps) and (not agentExist): - nextStates.append(decisionSteps[thisAgentID].obs[0]) + if (this_agent_ID in decision_steps) and (not agent_exist): + next_states.append(decision_steps[this_agent_ID].obs[0]) dones.append(False) - rewards.append(decisionSteps[thisAgentID].reward) + rewards.append(decision_steps[this_agent_ID].reward) - return np.asarray(nextStates), rewards, dones + return np.asarray(next_states), rewards, dones def close(self): self.env.close() + +class AimbotSideChannel(SideChannel): + def __init__(self, channel_id: uuid.UUID) -> None: + super().__init__(channel_id) + + def on_message_received(self, msg: IncomingMessage) -> None: + """ + Note: We must implement this method of the SideChannel interface to + receive messages from Unity + Message will be sent like this: + "Warning|Message1|Message2|Message3" or + "Error|Message1|Message2|Message3" + """ + this_message = msg.read_string() + this_result = this_message.split("|") + if(this_result[0] == "result"): + airecorder.total_rounds[this_result[1]]+=1 + if(this_result[2] == "Win"): + airecorder.win_rounds[this_result[1]]+=1 + #print(TotalRounds) + #print(WinRounds) + elif(this_result[0] == "Error"): + print(this_message) + # # while Message type is Warning + # if(thisResult[0] == "Warning"): + # # while Message1 is result means one game is over + # if (thisResult[1] == "Result"): + # TotalRounds[thisResult[2]]+=1 + # # while Message3 is Win means this agent win this game + # if(thisResult[3] == "Win"): + # WinRounds[thisResult[2]]+=1 + # # while Message1 is GameState means this game is just start + # # and tell python which game mode is + # elif (thisResult[1] == "GameState"): + # SCrecieved = 1 + # # while Message type is Error + # elif(thisResult[0] == "Error"): + # print(thisMessage) + # 发送函数 + def send_string(self, data: str) -> None: + # send a string toC# + msg = OutgoingMessage() + msg.write_string(data) + super().queue_message_to_send(msg) + + def send_bool(self, data: bool) -> None: + msg = OutgoingMessage() + msg.write_bool(data) + super().queue_message_to_send(msg) + + def send_int(self, data: int) -> None: + msg = OutgoingMessage() + msg.write_int32(data) + super().queue_message_to_send(msg) + + def send_float(self, data: float) -> None: + msg = OutgoingMessage() + msg.write_float32(data) + super().queue_message_to_send(msg) + + def send_float_list(self, data: List[float]) -> None: + msg = OutgoingMessage() + 
msg.write_float32_list(data) + super().queue_message_to_send(msg) \ No newline at end of file diff --git a/Aimbot-PPO-Python/Pytorch/AimBotEnv-old.py b/Aimbot-PPO-Python/Pytorch/Archive/AimBotEnv-old.py similarity index 100% rename from Aimbot-PPO-Python/Pytorch/AimBotEnv-old.py rename to Aimbot-PPO-Python/Pytorch/Archive/AimBotEnv-old.py diff --git a/Aimbot-PPO-Python/Pytorch/graph.py b/Aimbot-PPO-Python/Pytorch/Archive/graph.py similarity index 100% rename from Aimbot-PPO-Python/Pytorch/graph.py rename to Aimbot-PPO-Python/Pytorch/Archive/graph.py diff --git a/Aimbot-PPO-Python/Pytorch/ppo.py b/Aimbot-PPO-Python/Pytorch/Archive/ppo.py similarity index 100% rename from Aimbot-PPO-Python/Pytorch/ppo.py rename to Aimbot-PPO-Python/Pytorch/Archive/ppo.py diff --git a/Aimbot-PPO-Python/Pytorch/test2.ipynb b/Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb similarity index 89% rename from Aimbot-PPO-Python/Pytorch/test2.ipynb rename to Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb index 7cd4d47..caa3aaa 100644 --- a/Aimbot-PPO-Python/Pytorch/test2.ipynb +++ b/Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb @@ -141,6 +141,63 @@ "asd.func()\n", "print(asd.outa) # 输出 100" ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "usage: ipykernel_launcher.py [-h] [--seed SEED]\n", + "ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme=\"hmac-sha256\" --Session.key=b\"46ef9317-59fb-4ab6-ae4e-6b35744fc423\" --shell=9002 --transport=\"tcp\" --iopub=9004 --f=c:\\Users\\UCUNI\\AppData\\Roaming\\jupyter\\runtime\\kernel-v2-311926K1uko38tdWb.json\n" + ] + }, + { + "ename": "SystemExit", + "evalue": "2", + "output_type": "error", + "traceback": [ + "An exception has occurred, use %tb to see the full traceback.\n", + "\u001b[1;31mSystemExit\u001b[0m\u001b[1;31m:\u001b[0m 2\n" + ] + } + ], + "source": [ + "import argparse\n", + "\n", + "def parse_args():\n", + " parser = argparse.ArgumentParser()\n", + " parser.add_argument(\"--seed\", type=int, default=11,\n", + " help=\"seed of the experiment\")\n", + " args = parser.parse_args()\n", + " return args\n", + "\n", + "arggg = parse_args()\n", + "print(type(arggg))" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1.2, 3.2)\n", + "1.2\n" + ] + } + ], + "source": [ + "aaa = (1.2,3.2)\n", + "print(aaa)\n", + "print(aaa[0])" + ] } ], "metadata": { diff --git a/Aimbot-PPO-Python/Pytorch/testEnv.py b/Aimbot-PPO-Python/Pytorch/Archive/testEnv.py similarity index 100% rename from Aimbot-PPO-Python/Pytorch/testEnv.py rename to Aimbot-PPO-Python/Pytorch/Archive/testEnv.py diff --git a/Aimbot-PPO-Python/Pytorch/testarea.ipynb b/Aimbot-PPO-Python/Pytorch/Archive/testarea.ipynb similarity index 100% rename from Aimbot-PPO-Python/Pytorch/testarea.ipynb rename to Aimbot-PPO-Python/Pytorch/Archive/testarea.ipynb diff --git a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py index 97b6ab4..9e2e95e 100644 --- a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py +++ b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py @@ -10,16 +10,15 @@ import atexit from aimbotEnv import Aimbot +from aimbotEnv import AimbotSideChannel from ppoagent import PPOAgent -from ppoagent import GAE -from ppoagent import AimbotSideChannel from airecorder import WandbRecorder +from aimemory import PPOMem 
+from aimemory import Targets from enum import Enum from distutils.util import strtobool -bestReward = -1 - -SCrecieved = 0 +best_reward = -1 DEFAULT_SEED = 9331 ENV_PATH = "../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv" @@ -29,8 +28,8 @@ WORKER_ID = 1 BASE_PORT = 1000 # tensorboard names -game_name = "Aimbot_Target_Hybrid_PMNN_V3" -game_type = "Mix_Verification" +GAME_NAME = "Aimbot_Target_Hybrid_PMNN_V3" +GAME_TYPE = "Mix_Verification" # max round steps per agent is 2500/Decision_period, 25 seconds # !!!check every parameters before run!!! @@ -61,13 +60,6 @@ WANDB_TACK = False LOAD_DIR = None #LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt" -# public data -class Targets(Enum): - Free = 0 - Go = 1 - Attack = 2 - Defence = 3 - Num = 4 TARGET_STATE_SIZE = 6 INAREA_STATE_SIZE = 1 TIME_STATE_SIZE = 1 @@ -159,21 +151,6 @@ def parse_args(): return args -def broadCastEndReward(rewardBF:list,remainTime:float): - thisRewardBF = rewardBF - if (rewardBF[-1]<=-500): - # print("Lose DO NOT BROAD CAST",rewardBF[-1]) - thisRewardBF[-1] = rewardBF[-1]-BASE_LOSEREWARD - elif (rewardBF[-1]>=500): - # print("Win! Broadcast reward!",rewardBF[-1]) - print(sum(thisRewardBF)/len(thisRewardBF)) - thisRewardBF[-1] = rewardBF[-1]-BASE_WINREWARD - thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*args.result_broadcast_ratio)).tolist() - else: - print("!!!!!DIDNT GET RESULT REWARD!!!!!!",rewardBF[-1]) - return torch.Tensor(thisRewardBF).to(device) - - if __name__ == "__main__": args = parse_args() random.seed(args.seed) @@ -183,18 +160,20 @@ if __name__ == "__main__": device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") # Initialize environment anget optimizer - aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID); - env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimBotsideChannel]) + aimbot_sidechannel = AimbotSideChannel(SIDE_CHANNEL_UUID); + env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimbot_sidechannel]) if args.load_dir is None: agent = PPOAgent( env = env, - trainAgent=args.train, - targetNum=TARGETNUM, + this_args=args, + train_agent=args.train, + target_num=TARGETNUM, target_state_size= TARGET_STATE_SIZE, time_state_size=TIME_STATE_SIZE, gun_state_size=GUN_STATE_SIZE, my_state_size=MY_STATE_SIZE, total_t_size=TOTAL_T_SIZE, + device=device, ).to(device) else: agent = torch.load(args.load_dir) @@ -210,8 +189,8 @@ if __name__ == "__main__": optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5) # Tensorboard and WandB Recorder - run_name = f"{game_type}_{args.seed}_{int(time.time())}" - wdb_recorder = WandbRecorder(game_name, game_type, run_name, args) + run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}" + wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args) @atexit.register def save_model(): @@ -219,60 +198,49 @@ if __name__ == "__main__": env.close() if args.save_model: # save model while exit - saveDir = "../PPO-Model/"+ run_name + "_last.pt" - torch.save(agent, saveDir) - print("save model to " + saveDir) - - # Trajectory Buffer - ob_bf = [[] for i in range(env.unity_agent_num)] - act_bf = [[] for i in range(env.unity_agent_num)] - dis_logprobs_bf = [[] for i in range(env.unity_agent_num)] - con_logprobs_bf = [[] for i in range(env.unity_agent_num)] - rewards_bf = [[] for i in range(env.unity_agent_num)] - dones_bf = [[] for i in range(env.unity_agent_num)] - values_bf 
= [[] for i in range(env.unity_agent_num)] + save_dir = "../PPO-Model/"+ run_name + "_last.pt" + torch.save(agent, save_dir) + print("save model to " + save_dir) # start the game total_update_step = using_targets_num * args.total_timesteps // args.datasetSize target_steps = [0 for i in range(TARGETNUM)] start_time = time.time() state, _, done = env.reset() - # state = torch.Tensor(next_obs).to(device) - # next_done = torch.zeros(env.unity_agent_num).to(device) - # initialize empty training datasets - obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_observation_size) - actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_action_size) - dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) - con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) - rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) - values = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) - advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) - returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) + # initialize AI memories + ppo_memories = PPOMem( + env = env, + device = device, + args=args, + target_num = TARGETNUM, + target_state_size = TARGET_STATE_SIZE, + base_lose_reward = BASE_LOSEREWARD, + base_win_reward = BASE_WINREWARD, + ) for total_steps in range(total_update_step): # discunt learning rate, while step == total_update_step lr will be 0 if args.annealLR: - finalRatio = TARGET_LEARNING_RATE/args.lr + final_lr_ratio = TARGET_LEARNING_RATE/args.lr frac = 1.0 - ((total_steps + 1.0) / total_update_step) - lrnow = frac * args.lr - optimizer.param_groups[0]["lr"] = lrnow + lr_now = frac * args.lr + optimizer.param_groups[0]["lr"] = lr_now else: - lrnow = args.lr - print("new episode",total_steps,"learning rate = ",lrnow) + lr_now = args.lr + print("new episode",total_steps,"learning rate = ",lr_now) # MAIN LOOP: run agent in environment step = 0 training = False - trainQueue = [] + train_queue = [] last_reward = [0.for i in range(env.unity_agent_num)] while True: if step % args.decision_period == 0: step += 1 # Choose action by agent - with torch.no_grad(): # predict actions action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value( @@ -289,61 +257,27 @@ if __name__ == "__main__": next_state, reward, next_done = env.step(action_cpu) # save memories - for i in range(env.unity_agent_num): - # save memories to buffers - ob_bf[i].append(state[i]) - act_bf[i].append(action_cpu[i]) - dis_logprobs_bf[i].append(dis_logprob_cpu[i]) - con_logprobs_bf[i].append(con_logprob_cpu[i]) - rewards_bf[i].append(reward[i]+last_reward[i]) - dones_bf[i].append(done[i]) - values_bf[i].append(value_cpu[i]) - remainTime = state[i,TARGET_STATE_SIZE] - if next_done[i] == True: - # finished a round, send finished memories to training datasets - # compute advantage and discounted reward - #print(i,"over") - roundTargetType = int(state[i,0]) - thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime) - adv, rt = GAE( - agent, - args, - thisRewardsTensor, - torch.Tensor(dones_bf[i]).to(device), - torch.tensor(values_bf[i]).to(device), - torch.tensor(next_state[i]).to(device).unsqueeze(0), - torch.Tensor([next_done[i]]).to(device), - device, - ) - # send memories to training datasets - obs[roundTargetType] = torch.cat((obs[roundTargetType], 
torch.tensor(ob_bf[i]).to(device)), 0) - actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0) - dis_logprobs[roundTargetType] = torch.cat( - (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0 - ) - con_logprobs[roundTargetType] = torch.cat( - (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0 - ) - rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0) - values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0) - advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0) - returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0) - - # clear buffers - ob_bf[i] = [] - act_bf[i] = [] - dis_logprobs_bf[i] = [] - con_logprobs_bf[i] = [] - rewards_bf[i] = [] - dones_bf[i] = [] - values_bf[i] = [] - print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}") + ppo_memories.save_memories( + now_step = step, + agent = agent, + state = state, + action_cpu = action_cpu, + dis_logprob_cpu = dis_logprob_cpu, + con_logprob_cpu = con_logprob_cpu, + reward = reward, + done = done, + value_cpu = value_cpu, + last_reward = last_reward, + next_done = next_done, + next_state=next_state, + ) + # check if any training dataset is full and ready to train for i in range(TARGETNUM): - if obs[i].size()[0] >= args.datasetSize: + if ppo_memories.obs[i].size()[0] >= args.datasetSize: # start train NN - trainQueue.append(i) - if(len(trainQueue)>0): + train_queue.append(i) + if(len(train_queue)>0): break state, done = next_state, next_done else: @@ -351,76 +285,40 @@ if __name__ == "__main__": # skip this step use last predict action next_state, reward, next_done = env.step(action_cpu) # save memories - for i in range(env.unity_agent_num): - if next_done[i] == True: - #print(i,"over???") - # save memories to buffers - ob_bf[i].append(state[i]) - act_bf[i].append(action_cpu[i]) - dis_logprobs_bf[i].append(dis_logprob_cpu[i]) - con_logprobs_bf[i].append(con_logprob_cpu[i]) - rewards_bf[i].append(reward[i]) - dones_bf[i].append(done[i]) - values_bf[i].append(value_cpu[i]) - remainTime = state[i,TARGET_STATE_SIZE] - # finished a round, send finished memories to training datasets - # compute advantage and discounted reward - roundTargetType = int(state[i,0]) - thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime) - adv, rt = GAE( - agent, - args, - thisRewardsTensor, - torch.Tensor(dones_bf[i]).to(device), - torch.tensor(values_bf[i]).to(device), - torch.Tensor(next_state[i]).to(device).unsqueeze(dim = 0), - torch.Tensor([next_done[i]]).to(device), - device - ) - # send memories to training datasets - obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0) - actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0) - dis_logprobs[roundTargetType] = torch.cat( - (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0 - ) - con_logprobs[roundTargetType] = torch.cat( - (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0 - ) - rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0) - values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0) - advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0) - 
returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0) - - # clear buffers - ob_bf[i] = [] - act_bf[i] = [] - dis_logprobs_bf[i] = [] - con_logprobs_bf[i] = [] - rewards_bf[i] = [] - dones_bf[i] = [] - values_bf[i] = [] - print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}") + ppo_memories.save_memories( + now_step = step, + agent = agent, + state = state, + action_cpu = action_cpu, + dis_logprob_cpu = dis_logprob_cpu, + con_logprob_cpu = con_logprob_cpu, + reward = reward, + done = done, + value_cpu = value_cpu, + last_reward = last_reward, + next_done = next_done, + next_state=next_state, + ) state = next_state last_reward = reward - i += 1 if args.train: # train mode on - meanRewardList = [] # for WANDB + mean_reward_list = [] # for WANDB # loop all tarining queue - for thisT in trainQueue: + for thisT in train_queue: # sart time - startTime = time.time() + start_time = time.time() target_steps[thisT]+=1 # flatten the batch - b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape) - b_dis_logprobs = dis_logprobs[thisT].reshape(-1) - b_con_logprobs = con_logprobs[thisT].reshape(-1) - b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,)) - b_advantages = advantages[thisT].reshape(-1) - b_returns = returns[thisT].reshape(-1) - b_values = values[thisT].reshape(-1) + b_obs = ppo_memories.obs[thisT].reshape((-1,) + env.unity_observation_shape) + b_dis_logprobs = ppo_memories.dis_logprobs[thisT].reshape(-1) + b_con_logprobs = ppo_memories.con_logprobs[thisT].reshape(-1) + b_actions = ppo_memories.actions[thisT].reshape((-1,) + (env.unity_action_size,)) + b_advantages = ppo_memories.advantages[thisT].reshape(-1) + b_returns = ppo_memories.returns[thisT].reshape(-1) + b_values = ppo_memories.values[thisT].reshape(-1) b_size = b_obs.size()[0] # Optimizing the policy and value network b_inds = np.arange(b_size) @@ -529,19 +427,12 @@ if __name__ == "__main__": """ # record mean reward before clear history print("done") - targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy()) - meanRewardList.append(targetRewardMean) + targetRewardMean = np.mean(ppo_memories.rewards[thisT].to("cpu").detach().numpy().copy()) + mean_reward_list.append(targetRewardMean) targetName = Targets(thisT).name # clear this target trainning set buffer - obs[thisT] = torch.tensor([]).to(device) - actions[thisT] = torch.tensor([]).to(device) - dis_logprobs[thisT] = torch.tensor([]).to(device) - con_logprobs[thisT] = torch.tensor([]).to(device) - rewards[thisT] = torch.tensor([]).to(device) - values[thisT] = torch.tensor([]).to(device) - advantages[thisT] = torch.tensor([]).to(device) - returns[thisT] = torch.tensor([]).to(device) + ppo_memories.clear_training_datasets(thisT) # record rewards for plotting purposes wdb_recorder.add_target_scalar( @@ -556,7 +447,7 @@ if __name__ == "__main__": target_steps, ) print(f"episode over Target{targetName} mean reward:", targetRewardMean) - TotalRewardMean = np.mean(meanRewardList) + TotalRewardMean = np.mean(mean_reward_list) wdb_recorder.add_global_scalar( TotalRewardMean, optimizer.param_groups[0]["lr"], @@ -565,35 +456,29 @@ if __name__ == "__main__": # print cost time as seconds print("cost time:", time.time() - start_time) # New Record! 
- if TotalRewardMean > bestReward and args.save_model: - bestReward = targetRewardMean + if TotalRewardMean > best_reward and args.save_model: + best_reward = targetRewardMean saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) + ".pt" torch.save(agent, saveDir) else: # train mode off - meanRewardList = [] # for WANDB + mean_reward_list = [] # for WANDB # while not in training mode, clear the buffer - for thisT in trainQueue: + for thisT in train_queue: target_steps[thisT]+=1 targetName = Targets(thisT).name - targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy()) - meanRewardList.append(targetRewardMean) + targetRewardMean = np.mean(ppo_memories.rewards[thisT].to("cpu").detach().numpy().copy()) + mean_reward_list.append(targetRewardMean) print(target_steps[thisT]) - obs[thisT] = torch.tensor([]).to(device) - actions[thisT] = torch.tensor([]).to(device) - dis_logprobs[thisT] = torch.tensor([]).to(device) - con_logprobs[thisT] = torch.tensor([]).to(device) - rewards[thisT] = torch.tensor([]).to(device) - values[thisT] = torch.tensor([]).to(device) - advantages[thisT] = torch.tensor([]).to(device) - returns[thisT] = torch.tensor([]).to(device) + # clear this target trainning set buffer + ppo_memories.clear_training_datasets(thisT) # record rewards for plotting purposes wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT]) wdb_recorder.add_win_ratio(targetName,target_steps[thisT]) print(f"episode over Target{targetName} mean reward:", targetRewardMean) - TotalRewardMean = np.mean(meanRewardList) + TotalRewardMean = np.mean(mean_reward_list) wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps) saveDir = "../PPO-Model/"+ run_name + "_last.pt" diff --git a/Aimbot-PPO-Python/Pytorch/aimemory.py b/Aimbot-PPO-Python/Pytorch/aimemory.py new file mode 100644 index 0000000..9751c85 --- /dev/null +++ b/Aimbot-PPO-Python/Pytorch/aimemory.py @@ -0,0 +1,146 @@ +import torch +import numpy as np +import argparse +from aimbotEnv import Aimbot +from ppoagent import PPOAgent +from enum import Enum + +# public data +class Targets(Enum): + Free = 0 + Go = 1 + Attack = 2 + Defence = 3 + Num = 4 + +class PPOMem: + def __init__( + self, + env: Aimbot, + args: argparse.Namespace, + device: torch.device, + target_num: int, + target_state_size: int, + base_lose_reward: int, + base_win_reward: int, + ) -> None: + self.data_set_size = args.datasetSize + self.result_broadcast_ratio = args.result_broadcast_ratio + self.decision_period = args.decision_period + self.unity_agent_num = env.unity_agent_num + + self.base_lose_reward = base_lose_reward + self.base_win_reward = base_win_reward + self.target_state_size = target_state_size + self.device = device + + # Trajectory Buffer + self.ob_bf = [[] for i in range(env.unity_agent_num)] + self.act_bf = [[] for i in range(env.unity_agent_num)] + self.dis_logprobs_bf = [[] for i in range(env.unity_agent_num)] + self.con_logprobs_bf = [[] for i in range(env.unity_agent_num)] + self.rewards_bf = [[] for i in range(env.unity_agent_num)] + self.dones_bf = [[] for i in range(env.unity_agent_num)] + self.values_bf = [[] for i in range(env.unity_agent_num)] + + # initialize empty training datasets + self.obs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,env.unity_observation_size) + self.actions = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,env.unity_action_size) + self.dis_logprobs = [torch.tensor([]).to(device) 
for i in range(target_num)] # (TARGETNUM,n,1) + self.con_logprobs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1) + self.rewards = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1) + self.values = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1) + self.advantages = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1) + self.returns = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1) + + def broad_cast_end_reward(self, rewardBF: list, remainTime: float) -> torch.Tensor: + thisRewardBF = rewardBF.copy() + if rewardBF[-1] <= -500: + # print("Lose DO NOT BROAD CAST",rewardBF[-1]) + thisRewardBF[-1] = rewardBF[-1] - self.base_lose_reward + elif rewardBF[-1] >= 500: + # print("Win! Broadcast reward!",rewardBF[-1]) + print(sum(thisRewardBF) / len(thisRewardBF)) + thisRewardBF[-1] = rewardBF[-1] - self.base_win_reward + thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * self.result_broadcast_ratio)).tolist() + else: + print("!!!!!DIDNT GET RESULT REWARD!!!!!!", rewardBF[-1]) + return torch.Tensor(thisRewardBF).to(self.device) + + def save_memories( + self, + now_step: int, + agent: PPOAgent, + state: np.ndarray, + action_cpu: np.ndarray, + dis_logprob_cpu: np.ndarray, + con_logprob_cpu: np.ndarray, + reward: list, + done: list, + value_cpu: np.ndarray, + last_reward: list, + next_done: list, + next_state: np.ndarray, + ): + for i in range(self.unity_agent_num): + if now_step % self.decision_period == 0 or next_done[i] == True: + # only on decision period or finished a round, save memories to buffer + self.ob_bf[i].append(state[i]) + self.act_bf[i].append(action_cpu[i]) + self.dis_logprobs_bf[i].append(dis_logprob_cpu[i]) + self.con_logprobs_bf[i].append(con_logprob_cpu[i]) + self.dones_bf[i].append(done[i]) + self.values_bf[i].append(value_cpu[i]) + if now_step % self.decision_period == 0: + # on decision period, add last skiped round's reward + self.rewards_bf[i].append(reward[i] + last_reward[i]) + else: + # not on decision period, only add this round's reward + self.rewards_bf[i].append(reward[i]) + if next_done[i] == True: + # finished a round, send finished memories to training datasets + # compute advantage and discounted reward + remainTime = state[i, self.target_state_size] + roundTargetType = int(state[i, 0]) + thisRewardsTensor = self.broad_cast_end_reward(self.rewards_bf[i], remainTime) + adv, rt = agent.gae( + rewards=thisRewardsTensor, + dones=torch.Tensor(self.dones_bf[i]).to(self.device), + values=torch.tensor(self.values_bf[i]).to(self.device), + next_obs=torch.tensor(next_state[i]).to(self.device).unsqueeze(0), + next_done=torch.Tensor([next_done[i]]).to(self.device), + ) + # send memories to training datasets + self.obs[roundTargetType] = torch.cat((self.obs[roundTargetType], torch.tensor(self.ob_bf[i]).to(self.device)), 0) + self.actions[roundTargetType] = torch.cat((self.actions[roundTargetType], torch.tensor(self.act_bf[i]).to(self.device)), 0) + self.dis_logprobs[roundTargetType] = torch.cat((self.dis_logprobs[roundTargetType], torch.tensor(self.dis_logprobs_bf[i]).to(self.device)), 0) + self.con_logprobs[roundTargetType] = torch.cat((self.con_logprobs[roundTargetType], torch.tensor(self.con_logprobs_bf[i]).to(self.device)), 0) + self.rewards[roundTargetType] = torch.cat((self.rewards[roundTargetType], thisRewardsTensor), 0) + self.values[roundTargetType] = torch.cat((self.values[roundTargetType], 
torch.tensor(self.values_bf[i]).to(self.device)), 0) + self.advantages[roundTargetType] = torch.cat((self.advantages[roundTargetType], adv), 0) + self.returns[roundTargetType] = torch.cat((self.returns[roundTargetType], rt), 0) + + # clear buffers + self.clear_buffers(i) + print(f"train dataset {Targets(roundTargetType).name} added:{self.obs[roundTargetType].size()[0]}/{self.data_set_size}") + + def clear_buffers(self,ind:int): + # clear buffers + self.ob_bf[ind] = [] + self.act_bf[ind] = [] + self.dis_logprobs_bf[ind] = [] + self.con_logprobs_bf[ind] = [] + self.rewards_bf[ind] = [] + self.dones_bf[ind] = [] + self.values_bf[ind] = [] + + def clear_training_datasets(self,ind:int): + # clear training datasets + self.obs[ind] = torch.tensor([]).to(self.device) + self.actions[ind] = torch.tensor([]).to(self.device) + self.dis_logprobs[ind] = torch.tensor([]).to(self.device) + self.con_logprobs[ind] = torch.tensor([]).to(self.device) + self.rewards[ind] = torch.tensor([]).to(self.device) + self.values[ind] = torch.tensor([]).to(self.device) + self.advantages[ind] = torch.tensor([]).to(self.device) + self.returns[ind] = torch.tensor([]).to(self.device) \ No newline at end of file diff --git a/Aimbot-PPO-Python/Pytorch/ppoagent.py b/Aimbot-PPO-Python/Pytorch/ppoagent.py index d13bae0..917fc3e 100644 --- a/Aimbot-PPO-Python/Pytorch/ppoagent.py +++ b/Aimbot-PPO-Python/Pytorch/ppoagent.py @@ -1,17 +1,11 @@ import numpy as np import torch -import uuid -import airecorder +import argparse + from torch import nn -from typing import List from aimbotEnv import Aimbot from torch.distributions.normal import Normal from torch.distributions.categorical import Categorical -from mlagents_envs.side_channel.side_channel import ( - SideChannel, - IncomingMessage, - OutgoingMessage, -) def layer_init(layer, std=np.sqrt(2), bias_const=0.0): @@ -24,17 +18,21 @@ class PPOAgent(nn.Module): def __init__( self, env: Aimbot, - trainAgent: bool, - targetNum: int, + this_args:argparse.Namespace, + train_agent: bool, + target_num: int, target_state_size: int, time_state_size: int, gun_state_size: int, my_state_size: int, total_t_size: int, + device: torch.device, ): super(PPOAgent, self).__init__() - self.trainAgent = trainAgent - self.targetNum = targetNum + self.device = device + self.args = this_args + self.trainAgent = train_agent + self.targetNum = target_num self.stateSize = env.unity_observation_shape[0] self.agentNum = env.unity_agent_num self.targetSize = target_state_size @@ -56,28 +54,28 @@ class PPOAgent(nn.Module): self.targetNetworks = nn.ModuleList( [ nn.Sequential(layer_init(nn.Linear(self.nonRaySize, 100)), nn.LeakyReLU()) - for i in range(targetNum) + for i in range(target_num) ] ) self.middleNetworks = nn.ModuleList( [ nn.Sequential(layer_init(nn.Linear(300, 200)), nn.LeakyReLU()) - for i in range(targetNum) + for i in range(target_num) ] ) self.actor_dis = nn.ModuleList( - [layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(targetNum)] + [layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(target_num)] ) self.actor_mean = nn.ModuleList( - [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(targetNum)] + [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(target_num)] ) # self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)]) # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size)) self.actor_logstd = nn.ParameterList( - 
[nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(targetNum)] + [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(target_num)] ) # nn.Parameter(torch.zeros(1, self.continuous_size)) self.critic = nn.ModuleList( - [layer_init(nn.Linear(200, 1), std=1) for i in range(targetNum)] + [layer_init(nn.Linear(200, 1), std=1) for i in range(target_num)] ) def get_value(self, state: torch.Tensor): @@ -165,103 +163,42 @@ class PPOAgent(nn.Module): criticV, ) - -def GAE(agent, args, rewards, dones, values, next_obs, next_done, device): - # GAE - with torch.no_grad(): - next_value = agent.get_value(next_obs).reshape(1, -1) - data_size = rewards.size()[0] - if args.gae: - advantages = torch.zeros_like(rewards).to(device) - lastgaelam = 0 - for t in reversed(range(data_size)): - if t == data_size - 1: - nextnonterminal = 1.0 - next_done - nextvalues = next_value - else: - nextnonterminal = 1.0 - dones[t + 1] - nextvalues = values[t + 1] - delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t] - advantages[t] = lastgaelam = ( - delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam - ) - returns = advantages + values - else: - returns = torch.zeros_like(rewards).to(device) - for t in reversed(range(data_size)): - if t == data_size - 1: - nextnonterminal = 1.0 - next_done - next_return = next_value - else: - nextnonterminal = 1.0 - dones[t + 1] - next_return = returns[t + 1] - returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return - advantages = returns - values - return advantages, returns - - -class AimbotSideChannel(SideChannel): - def __init__(self, channel_id: uuid.UUID) -> None: - super().__init__(channel_id) - - def on_message_received(self, msg: IncomingMessage) -> None: - global SCrecieved # make sure this variable is global - """ - Note: We must implement this method of the SideChannel interface to - receive messages from Unity - Message will be sent like this: - "Warning|Message1|Message2|Message3" or - "Error|Message1|Message2|Message3" - """ - thisMessage = msg.read_string() - thisResult = thisMessage.split("|") - if(thisResult[0] == "result"): - airecorder.total_rounds[thisResult[1]]+=1 - if(thisResult[2] == "Win"): - airecorder.win_rounds[thisResult[1]]+=1 - #print(TotalRounds) - #print(WinRounds) - elif(thisResult[0] == "Error"): - print(thisMessage) - - # # while Message type is Warning - # if(thisResult[0] == "Warning"): - # # while Message1 is result means one game is over - # if (thisResult[1] == "Result"): - # TotalRounds[thisResult[2]]+=1 - # # while Message3 is Win means this agent win this game - # if(thisResult[3] == "Win"): - # WinRounds[thisResult[2]]+=1 - # # while Message1 is GameState means this game is just start - # # and tell python which game mode is - # elif (thisResult[1] == "GameState"): - # SCrecieved = 1 - # # while Message type is Error - # elif(thisResult[0] == "Error"): - # print(thisMessage) - # 发送函数 - def send_string(self, data: str) -> None: - # send a string toC# - msg = OutgoingMessage() - msg.write_string(data) - super().queue_message_to_send(msg) - - def send_bool(self, data: bool) -> None: - msg = OutgoingMessage() - msg.write_bool(data) - super().queue_message_to_send(msg) - - def send_int(self, data: int) -> None: - msg = OutgoingMessage() - msg.write_int32(data) - super().queue_message_to_send(msg) - - def send_float(self, data: float) -> None: - msg = OutgoingMessage() - msg.write_float32(data) - super().queue_message_to_send(msg) - - def send_float_list(self, data: 
List[float]) -> None: - msg = OutgoingMessage() - msg.write_float32_list(data) - super().queue_message_to_send(msg) \ No newline at end of file + def gae( + self, + rewards: torch.Tensor, + dones: torch.Tensor, + values: torch.tensor, + next_obs: torch.tensor, + next_done: torch.Tensor, + ) -> tuple: + # GAE + with torch.no_grad(): + next_value = self.get_value(next_obs).reshape(1, -1) + data_size = rewards.size()[0] + if self.args.gae: + advantages = torch.zeros_like(rewards).to(self.device) + last_gae_lam = 0 + for t in reversed(range(data_size)): + if t == data_size - 1: + nextnonterminal = 1.0 - next_done + next_values = next_value + else: + nextnonterminal = 1.0 - dones[t + 1] + next_values = values[t + 1] + delta = rewards[t] + self.args.gamma * next_values * nextnonterminal - values[t] + advantages[t] = last_gae_lam = ( + delta + self.args.gamma * self.args.gaeLambda * nextnonterminal * last_gae_lam + ) + returns = advantages + values + else: + returns = torch.zeros_like(rewards).to(self.device) + for t in reversed(range(data_size)): + if t == data_size - 1: + nextnonterminal = 1.0 - next_done + next_return = next_value + else: + nextnonterminal = 1.0 - dones[t + 1] + next_return = returns[t + 1] + returns[t] = rewards[t] + self.args.gamma * nextnonterminal * next_return + advantages = returns - values + return advantages, returns \ No newline at end of file
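
Below is a minimal usage sketch (not applied by this patch) of how the pieces it splits apart fit together: Aimbot and AimbotSideChannel from aimbotEnv.py, PPOMem from aimemory.py, and PPOAgent with its new gae method from ppoagent.py. It mirrors the call sites in MultiNN-PPO.py; the side-channel UUID, the argparse hyperparameter values, the gun/my/total state sizes, and the argument passed to get_actions_value are placeholders or abbreviations, not values taken from this patch.

import uuid
import argparse
import torch

from aimbotEnv import Aimbot, AimbotSideChannel
from ppoagent import PPOAgent
from aimemory import PPOMem, Targets

# Placeholder hyperparameters: only the fields read by PPOMem and PPOAgent.gae are filled in.
args = argparse.Namespace(
    datasetSize=6000,
    result_broadcast_ratio=1.0,
    decision_period=1,
    gae=True,
    gamma=0.99,
    gaeLambda=0.95,
    train=True,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Side channel: receives "result|<target>|Win" style messages from Unity and
# updates airecorder.total_rounds / airecorder.win_rounds.
side_channel = AimbotSideChannel(uuid.UUID("00000000-0000-0000-0000-000000000000"))  # placeholder UUID
env = Aimbot(
    envPath="../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv",
    workerID=1,
    basePort=1000,
    side_channels=[side_channel],
)

agent = PPOAgent(
    env=env,
    this_args=args,
    train_agent=args.train,
    target_num=Targets.Num.value,   # one network head per target type
    target_state_size=6,
    time_state_size=1,
    gun_state_size=1,               # placeholder; use GUN_STATE_SIZE from MultiNN-PPO.py
    my_state_size=4,                # placeholder; use MY_STATE_SIZE from MultiNN-PPO.py
    total_t_size=12,                # placeholder; use TOTAL_T_SIZE from MultiNN-PPO.py
    device=device,
).to(device)

# Per-agent trajectory buffers plus per-target training datasets.
ppo_memories = PPOMem(
    env=env,
    args=args,
    device=device,
    target_num=Targets.Num.value,
    target_state_size=6,
    base_lose_reward=-200,          # placeholder; use BASE_LOSEREWARD from MultiNN-PPO.py
    base_win_reward=200,            # placeholder; use BASE_WINREWARD from MultiNN-PPO.py
)

state, _, done = env.reset()
with torch.no_grad():
    # Argument list abbreviated; see PPOAgent.get_actions_value for the full signature.
    action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
        torch.Tensor(state).to(device)
    )
action_cpu = action.cpu().numpy()
next_state, reward, next_done = env.step(action_cpu)

# Buffer the transition; when an agent's round ends, save_memories calls agent.gae()
# and moves the finished trajectory into the per-target dataset used by the PPO update.
ppo_memories.save_memories(
    now_step=0,
    agent=agent,
    state=state,
    action_cpu=action_cpu,
    dis_logprob_cpu=dis_logprob.cpu().numpy(),
    con_logprob_cpu=con_logprob.cpu().numpy(),
    reward=reward,
    done=done,
    value_cpu=value.cpu().numpy(),
    last_reward=[0.0] * env.unity_agent_num,
    next_done=next_done,
    next_state=next_state,
)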