diff --git a/Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb b/Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb index caa3aaa..1d09dd1 100644 --- a/Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb +++ b/Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb @@ -185,18 +185,27 @@ "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "(1.2, 3.2)\n", - "1.2\n" + "Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mkoha9\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n" ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "aaa = (1.2,3.2)\n", - "print(aaa)\n", - "print(aaa[0])" + "import wandb\n", + "wandb.login()" ] } ], diff --git a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py index 9e2e95e..b390b6a 100644 --- a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py +++ b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py @@ -1,156 +1,28 @@ -import argparse import time import numpy as np import random import uuid import torch -import torch.nn as nn -import torch.optim as optim import atexit - from aimbotEnv import Aimbot from aimbotEnv import AimbotSideChannel from ppoagent import PPOAgent from airecorder import WandbRecorder from aimemory import PPOMem from aimemory import Targets -from enum import Enum -from distutils.util import strtobool +from arguments import parse_args +import torch.optim as optim -best_reward = -1 - -DEFAULT_SEED = 9331 -ENV_PATH = "../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv" +# side channel uuid SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e") -WAND_ENTITY = "koha9" -WORKER_ID = 1 -BASE_PORT = 1000 - # tensorboard names -GAME_NAME = "Aimbot_Target_Hybrid_PMNN_V3" +GAME_NAME = "Aimbot_Hybrid_V3" GAME_TYPE = "Mix_Verification" -# max round steps per agent is 2500/Decision_period, 25 seconds -# !!!check every parameters before run!!! - -TOTAL_STEPS = 3150000 -BATCH_SIZE = 512 -MAX_TRAINNING_DATASETS = 6000 -DECISION_PERIOD = 1 -LEARNING_RATE = 6.5e-4 -GAMMA = 0.99 -GAE_LAMBDA = 0.95 -EPOCHS = 3 -CLIP_COEF = 0.11 -LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence -POLICY_COEF = [1.0, 1.0, 1.0, 1.0] -ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05] -CRITIC_COEF = [0.5, 0.5, 0.5, 0.5] -TARGET_LEARNING_RATE = 1e-6 -FREEZE_VIEW_NETWORK = True - -BROADCASTREWARD = False -ANNEAL_LEARNING_RATE = True -CLIP_VLOSS = True -NORM_ADV = False -TRAIN = True -SAVE_MODEL = False -WANDB_TACK = False -LOAD_DIR = None -#LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt" - -TARGET_STATE_SIZE = 6 -INAREA_STATE_SIZE = 1 -TIME_STATE_SIZE = 1 -GUN_STATE_SIZE = 1 -MY_STATE_SIZE = 4 -TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE -BASE_WINREWARD = 999 -BASE_LOSEREWARD = -999 -TARGETNUM= 4 -ENV_TIMELIMIT = 30 -RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT - # !!!SPECIAL PARAMETERS!!! 
-# change it while program is finished using_targets_num = 3 - -def parse_args(): - # fmt: off - # pytorch and environment parameters - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=DEFAULT_SEED, - help="seed of the experiment") - parser.add_argument("--path", type=str, default=ENV_PATH, - help="enviroment path") - parser.add_argument("--workerID", type=int, default=WORKER_ID, - help="unity worker ID") - parser.add_argument("--baseport", type=int, default=BASE_PORT, - help="port to connect to Unity environment") - parser.add_argument("--lr", type=float, default=LEARNING_RATE, - help="the learning rate of optimizer") - parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, - help="if toggled, cuda will be enabled by default") - parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS, - help="total timesteps of the experiments") - - # model parameters - parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True, - help="Train Model or not") - parser.add_argument("--freeze-viewnet", type=lambda x: bool(strtobool(x)), default=FREEZE_VIEW_NETWORK, nargs="?", const=True, - help="freeze view network or not") - parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS, - help="training dataset size,start training while dataset collect enough data") - parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE, - help="nimi batch size") - parser.add_argument("--epochs", type=int, default=EPOCHS, - help="the K epochs to update the policy") - parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True, - help="Toggle learning rate annealing for policy and value networks") - parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True, - help="track on the wandb") - parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=SAVE_MODEL, nargs="?", const=True, - help="save model or not") - parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY, - help="the entity (team) of wandb's project") - parser.add_argument("--load-dir", type=str, default=LOAD_DIR, - help="load model directory") - parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD, - help="the number of steps to run in each environment per policy rollout") - parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO, - help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime") - parser.add_argument("--broadCastEndReward", type=lambda x: bool(strtobool(x)), default=BROADCASTREWARD, nargs="?", const=True, - help="save model or not") - # GAE loss - parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, - help="Use GAE for advantage computation") - parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True, - help="Toggles advantages normalization") - parser.add_argument("--gamma", type=float, default=GAMMA, - help="the discount factor gamma") - parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA, - help="the lambda for the general advantage estimation") - parser.add_argument("--clip-coef", type=float, default=CLIP_COEF, - help="the surrogate clipping coefficient") - parser.add_argument("--policy-coef", type=float, default=POLICY_COEF, - 
help="coefficient of the policy") - parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF, - help="coefficient of the entropy") - parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF, - help="coefficient of the value function") - parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True, - help="Toggles whether or not to use a clipped loss for the value function, as per the paper.") - parser.add_argument("--max-grad-norm", type=float, default=0.5, - help="the maximum norm for the gradient clipping") - parser.add_argument("--target-kl", type=float, default=None, - help="the target KL divergence threshold") - # fmt: on - args = parser.parse_args() - return args - - if __name__ == "__main__": args = parse_args() random.seed(args.seed) @@ -158,6 +30,7 @@ if __name__ == "__main__": torch.manual_seed(args.seed) device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + best_reward = -1 # Initialize environment anget optimizer aimbot_sidechannel = AimbotSideChannel(SIDE_CHANNEL_UUID); @@ -166,18 +39,11 @@ if __name__ == "__main__": agent = PPOAgent( env = env, this_args=args, - train_agent=args.train, - target_num=TARGETNUM, - target_state_size= TARGET_STATE_SIZE, - time_state_size=TIME_STATE_SIZE, - gun_state_size=GUN_STATE_SIZE, - my_state_size=MY_STATE_SIZE, - total_t_size=TOTAL_T_SIZE, device=device, ).to(device) else: agent = torch.load(args.load_dir) - # freeze + # freeze if args.freeze_viewnet: # freeze the view network for p in agent.viewNetwork.parameters(): @@ -185,9 +51,8 @@ if __name__ == "__main__": print("VIEW NETWORK FREEZED") print("Load Agent", args.load_dir) print(agent.eval()) - + # optimizer optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5) - # Tensorboard and WandB Recorder run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}" wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args) @@ -204,34 +69,30 @@ if __name__ == "__main__": # start the game total_update_step = using_targets_num * args.total_timesteps // args.datasetSize - target_steps = [0 for i in range(TARGETNUM)] + target_steps = [0 for i in range(args.target_num)] start_time = time.time() state, _, done = env.reset() # initialize AI memories ppo_memories = PPOMem( - env = env, - device = device, args=args, - target_num = TARGETNUM, - target_state_size = TARGET_STATE_SIZE, - base_lose_reward = BASE_LOSEREWARD, - base_win_reward = BASE_WINREWARD, + unity_agent_num=env.unity_agent_num, + device = device, ) + # MAIN LOOP: run agent in environment for total_steps in range(total_update_step): # discunt learning rate, while step == total_update_step lr will be 0 - if args.annealLR: - final_lr_ratio = TARGET_LEARNING_RATE/args.lr + final_lr_ratio = args.target_lr/args.lr frac = 1.0 - ((total_steps + 1.0) / total_update_step) lr_now = frac * args.lr optimizer.param_groups[0]["lr"] = lr_now else: lr_now = args.lr + + # episode start show learning rate print("new episode",total_steps,"learning rate = ",lr_now) - - # MAIN LOOP: run agent in environment step = 0 training = False @@ -271,14 +132,15 @@ if __name__ == "__main__": next_done = next_done, next_state=next_state, ) - # check if any training dataset is full and ready to train - for i in range(TARGETNUM): + for i in range(args.target_num): if ppo_memories.obs[i].size()[0] >= args.datasetSize: # start train NN train_queue.append(i) if(len(train_queue)>0): + # break while loop and start train break + # update state state, done = 
next_state, next_done else: step += 1 @@ -299,7 +161,7 @@ if __name__ == "__main__": next_done = next_done, next_state=next_state, ) - + # update state state = next_state last_reward = reward @@ -307,137 +169,34 @@ if __name__ == "__main__": # train mode on mean_reward_list = [] # for WANDB # loop all tarining queue - for thisT in train_queue: + for this_train_ind in train_queue: # sart time start_time = time.time() - target_steps[thisT]+=1 - # flatten the batch - b_obs = ppo_memories.obs[thisT].reshape((-1,) + env.unity_observation_shape) - b_dis_logprobs = ppo_memories.dis_logprobs[thisT].reshape(-1) - b_con_logprobs = ppo_memories.con_logprobs[thisT].reshape(-1) - b_actions = ppo_memories.actions[thisT].reshape((-1,) + (env.unity_action_size,)) - b_advantages = ppo_memories.advantages[thisT].reshape(-1) - b_returns = ppo_memories.returns[thisT].reshape(-1) - b_values = ppo_memories.values[thisT].reshape(-1) - b_size = b_obs.size()[0] - # Optimizing the policy and value network - b_inds = np.arange(b_size) - # clipfracs = [] - for epoch in range(args.epochs): - print(epoch,end="") - # shuffle all datasets - np.random.shuffle(b_inds) - for start in range(0, b_size, args.minibatchSize): - print(".",end="") - end = start + args.minibatchSize - mb_inds = b_inds[start:end] - if(np.size(mb_inds)<=1): - break - mb_advantages = b_advantages[mb_inds] - - # normalize advantages - if args.norm_adv: - mb_advantages = (mb_advantages - mb_advantages.mean()) / ( - mb_advantages.std() + 1e-8 - ) - - ( - _, - new_dis_logprob, - dis_entropy, - new_con_logprob, - con_entropy, - newvalue, - ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds]) - # discrete ratio - dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds] - dis_ratio = dis_logratio.exp() - # continuous ratio - con_logratio = new_con_logprob - b_con_logprobs[mb_inds] - con_ratio = con_logratio.exp() - - """ - # early stop - with torch.no_grad(): - # calculate approx_kl http://joschu.net/blog/kl-approx.html - old_approx_kl = (-logratio).mean() - approx_kl = ((ratio - 1) - logratio).mean() - clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()] - """ - - # discrete Policy loss - dis_pg_loss_orig = -mb_advantages * dis_ratio - dis_pg_loss_clip = -mb_advantages * torch.clamp( - dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef + target_steps[this_train_ind]+=1 + # train agent + ( + v_loss, + dis_pg_loss, + con_pg_loss, + loss, + entropy_loss + ) = agent.train_net( + this_train_ind=this_train_ind, + ppo_memories=ppo_memories, + optimizer=optimizer ) - dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean() - # continuous Policy loss - con_pg_loss_orig = -mb_advantages * con_ratio - con_pg_loss_clip = -mb_advantages * torch.clamp( - con_ratio, 1 - args.clip_coef, 1 + args.clip_coef - ) - con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean() - - # Value loss - newvalue = newvalue.view(-1) - if args.clip_vloss: - v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2 - v_clipped = b_values[mb_inds] + torch.clamp( - newvalue - b_values[mb_inds], - -args.clip_coef, - args.clip_coef, - ) - v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2 - v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) - v_loss = 0.5 * v_loss_max.mean() - else: - v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean() - - # total loss - entropy_loss = dis_entropy.mean() + con_entropy.mean() - loss = ( - dis_pg_loss * POLICY_COEF[thisT] - + con_pg_loss * POLICY_COEF[thisT] - + entropy_loss * ENTROPY_COEF[thisT] 
- + v_loss * CRITIC_COEF[thisT] - )*LOSS_COEF[thisT] - - if(torch.isnan(loss).any()): - print("LOSS Include NAN!!!") - if(torch.isnan(dis_pg_loss.any())): - print("dis_pg_loss include nan") - if(torch.isnan(con_pg_loss.any())): - print("con_pg_loss include nan") - if(torch.isnan(entropy_loss.any())): - print("entropy_loss include nan") - if(torch.isnan(v_loss.any())): - print("v_loss include nan") - raise - - optimizer.zero_grad() - loss.backward() - # Clips gradient norm of an iterable of parameters. - nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm) - optimizer.step() - - """ - if args.target_kl is not None: - if approx_kl > args.target_kl: - break - """ # record mean reward before clear history print("done") - targetRewardMean = np.mean(ppo_memories.rewards[thisT].to("cpu").detach().numpy().copy()) + targetRewardMean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy()) mean_reward_list.append(targetRewardMean) - targetName = Targets(thisT).name + targetName = Targets(this_train_ind).name # clear this target trainning set buffer - ppo_memories.clear_training_datasets(thisT) - + ppo_memories.clear_training_datasets(this_train_ind) # record rewards for plotting purposes wdb_recorder.add_target_scalar( targetName, - thisT, + this_train_ind, v_loss, dis_pg_loss, con_pg_loss, @@ -464,19 +223,19 @@ if __name__ == "__main__": # train mode off mean_reward_list = [] # for WANDB # while not in training mode, clear the buffer - for thisT in train_queue: - target_steps[thisT]+=1 - targetName = Targets(thisT).name - targetRewardMean = np.mean(ppo_memories.rewards[thisT].to("cpu").detach().numpy().copy()) + for this_train_ind in train_queue: + target_steps[this_train_ind]+=1 + targetName = Targets(this_train_ind).name + targetRewardMean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy()) mean_reward_list.append(targetRewardMean) - print(target_steps[thisT]) + print(target_steps[this_train_ind]) # clear this target trainning set buffer - ppo_memories.clear_training_datasets(thisT) + ppo_memories.clear_training_datasets(this_train_ind) # record rewards for plotting purposes - wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT]) - wdb_recorder.add_win_ratio(targetName,target_steps[thisT]) + wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[this_train_ind]) + wdb_recorder.add_win_ratio(targetName,target_steps[this_train_ind]) print(f"episode over Target{targetName} mean reward:", targetRewardMean) TotalRewardMean = np.mean(mean_reward_list) wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps) diff --git a/Aimbot-PPO-Python/Pytorch/aimemory.py b/Aimbot-PPO-Python/Pytorch/aimemory.py index 9751c85..89aad78 100644 --- a/Aimbot-PPO-Python/Pytorch/aimemory.py +++ b/Aimbot-PPO-Python/Pytorch/aimemory.py @@ -1,7 +1,6 @@ import torch import numpy as np import argparse -from aimbotEnv import Aimbot from ppoagent import PPOAgent from enum import Enum @@ -16,42 +15,39 @@ class Targets(Enum): class PPOMem: def __init__( self, - env: Aimbot, args: argparse.Namespace, + unity_agent_num: int, device: torch.device, - target_num: int, - target_state_size: int, - base_lose_reward: int, - base_win_reward: int, ) -> None: + self.target_num = args.target_num self.data_set_size = args.datasetSize self.result_broadcast_ratio = args.result_broadcast_ratio self.decision_period = args.decision_period - self.unity_agent_num = 
env.unity_agent_num + self.unity_agent_num = unity_agent_num - self.base_lose_reward = base_lose_reward - self.base_win_reward = base_win_reward - self.target_state_size = target_state_size + self.base_lose_reward = args.base_lose_reward + self.base_win_reward = args.base_win_reward + self.target_state_size = args.target_state_size self.device = device # Trajectory Buffer - self.ob_bf = [[] for i in range(env.unity_agent_num)] - self.act_bf = [[] for i in range(env.unity_agent_num)] - self.dis_logprobs_bf = [[] for i in range(env.unity_agent_num)] - self.con_logprobs_bf = [[] for i in range(env.unity_agent_num)] - self.rewards_bf = [[] for i in range(env.unity_agent_num)] - self.dones_bf = [[] for i in range(env.unity_agent_num)] - self.values_bf = [[] for i in range(env.unity_agent_num)] + self.ob_bf = [[] for i in range(self.unity_agent_num)] + self.act_bf = [[] for i in range(self.unity_agent_num)] + self.dis_logprobs_bf = [[] for i in range(self.unity_agent_num)] + self.con_logprobs_bf = [[] for i in range(self.unity_agent_num)] + self.rewards_bf = [[] for i in range(self.unity_agent_num)] + self.dones_bf = [[] for i in range(self.unity_agent_num)] + self.values_bf = [[] for i in range(self.unity_agent_num)] # initialize empty training datasets - self.obs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,env.unity_observation_size) - self.actions = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,env.unity_action_size) - self.dis_logprobs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1) - self.con_logprobs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1) - self.rewards = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1) - self.values = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1) - self.advantages = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1) - self.returns = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1) + self.obs = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,env.unity_observation_size) + self.actions = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,env.unity_action_size) + self.dis_logprobs = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1) + self.con_logprobs = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1) + self.rewards = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1) + self.values = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1) + self.advantages = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1) + self.returns = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1) def broad_cast_end_reward(self, rewardBF: list, remainTime: float) -> torch.Tensor: thisRewardBF = rewardBF.copy() diff --git a/Aimbot-PPO-Python/Pytorch/arguments.py b/Aimbot-PPO-Python/Pytorch/arguments.py new file mode 100644 index 0000000..78f58f4 --- /dev/null +++ b/Aimbot-PPO-Python/Pytorch/arguments.py @@ -0,0 +1,154 @@ +import argparse +import uuid + +from distutils.util import strtobool + +DEFAULT_SEED = 9331 +ENV_PATH = "../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv" +WAND_ENTITY = "koha9" +WORKER_ID = 1 +BASE_PORT = 1000 + +# tensorboard names +GAME_NAME = "Aimbot_Target_Hybrid_PMNN_V3" +GAME_TYPE = "Mix_Verification" + +# max round steps 
per agent is 2500/Decision_period, 25 seconds +TOTAL_STEPS = 3150000 +BATCH_SIZE = 512 +MAX_TRAINNING_DATASETS = 6000 +DECISION_PERIOD = 1 +LEARNING_RATE = 6.5e-4 +GAMMA = 0.99 +GAE_LAMBDA = 0.95 +EPOCHS = 3 +CLIP_COEF = 0.11 +LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence +POLICY_COEF = [1.0, 1.0, 1.0, 1.0] +ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05] +CRITIC_COEF = [0.5, 0.5, 0.5, 0.5] +TARGET_LEARNING_RATE = 1e-6 + +FREEZE_VIEW_NETWORK = False +BROADCASTREWARD = False +ANNEAL_LEARNING_RATE = True +CLIP_VLOSS = True +NORM_ADV = False +TRAIN = True +SAVE_MODEL = True +WANDB_TACK = True +LOAD_DIR = None +#LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt" + +# Unity Environment Parameters +TARGET_STATE_SIZE = 6 +INAREA_STATE_SIZE = 1 +TIME_STATE_SIZE = 1 +GUN_STATE_SIZE = 1 +MY_STATE_SIZE = 4 +TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE +BASE_WINREWARD = 999 +BASE_LOSEREWARD = -999 +TARGETNUM = 4 +ENV_TIMELIMIT = 30 +RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT + +def parse_args(): + # fmt: off + # pytorch and environment parameters + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=DEFAULT_SEED, + help="seed of the experiment") + parser.add_argument("--path", type=str, default=ENV_PATH, + help="environment path") + parser.add_argument("--workerID", type=int, default=WORKER_ID, + help="unity worker ID") + parser.add_argument("--baseport", type=int, default=BASE_PORT, + help="port to connect to Unity environment") + parser.add_argument("--lr", type=float, default=LEARNING_RATE, + help="the default learning rate of the optimizer") + parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, + help="if toggled, cuda will be enabled by default") + parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS, + help="total timesteps of the experiments") + + # model parameters + parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True, + help="train the model or not") + parser.add_argument("--freeze-viewnet", type=lambda x: bool(strtobool(x)), default=FREEZE_VIEW_NETWORK, nargs="?", const=True, + help="freeze view network or not") + parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS, + help="training dataset size; start training once the dataset has collected enough data") + parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE, + help="mini batch size") + parser.add_argument("--epochs", type=int, default=EPOCHS, + help="the K epochs to update the policy") + parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True, + help="Toggle learning rate annealing for policy and value networks") + parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True, + help="track the experiment with wandb") + parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=SAVE_MODEL, nargs="?", const=True, + help="save model or not") + parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY, + help="the entity (team) of wandb's project") + parser.add_argument("--load-dir", type=str, default=LOAD_DIR, + help="load model directory") + parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD, + help="the number of steps to run in each environment per policy rollout") +
parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO, + help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime") + parser.add_argument("--broadCastEndReward", type=lambda x: bool(strtobool(x)), default=BROADCASTREWARD, nargs="?", const=True, + help="save model or not") + # target_learning_rate + parser.add_argument("--target-lr", type=float, default=TARGET_LEARNING_RATE, + help="target value of downscaling the learning rate") + + # POLICY_COEF ENTROPY_COEF CRITIC_COEF LOSS_COEF + parser.add_argument("--policy-coef", type=float, default=POLICY_COEF, + help="coefficient of the policy loss") + parser.add_argument("--entropy-coef", type=float, default=ENTROPY_COEF, + help="coefficient of the entropy loss") + parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF, + help="coefficient of the critic loss") + parser.add_argument("--loss-coef", type=float, default=LOSS_COEF, + help="coefficient of the total loss") + + # GAE loss + parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, + help="Use GAE for advantage computation") + parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True, + help="Toggles advantages normalization") + parser.add_argument("--gamma", type=float, default=GAMMA, + help="the discount factor gamma") + parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA, + help="the lambda for the general advantage estimation") + parser.add_argument("--clip-coef", type=float, default=CLIP_COEF, + help="the surrogate clipping coefficient") + parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True, + help="Toggles whether or not to use a clipped loss for the value function, as per the paper.") + parser.add_argument("--max-grad-norm", type=float, default=0.5, + help="the maximum norm for the gradient clipping") + parser.add_argument("--target-kl", type=float, default=None, + help="the target KL divergence threshold") + # environment parameters + parser.add_argument("--target-num", type=int, default=TARGETNUM, + help="the number of targets") + parser.add_argument("--env-timelimit", type=int, default=ENV_TIMELIMIT, + help="the time limit of each round") + parser.add_argument("--base-win-reward", type=int, default=BASE_WINREWARD, + help="the base reward of win round") + parser.add_argument("--base-lose-reward", type=int, default=BASE_LOSEREWARD, + help="the base reward of lose round") + parser.add_argument("--target-state-size", type=int, default=TARGET_STATE_SIZE, + help="the size of target state") + parser.add_argument("--time-state-size", type=int, default=TIME_STATE_SIZE, + help="the size of time state") + parser.add_argument("--gun-state-size", type=int, default=GUN_STATE_SIZE, + help="the size of gun state") + parser.add_argument("--my-state-size", type=int, default=MY_STATE_SIZE, + help="the size of my state") + parser.add_argument("--total-target-size", type=int, default=TOTAL_T_SIZE, + help="the size of total target state") + # fmt: on + args = parser.parse_args() + return args \ No newline at end of file diff --git a/Aimbot-PPO-Python/Pytorch/ppoagent.py b/Aimbot-PPO-Python/Pytorch/ppoagent.py index 917fc3e..bcc041b 100644 --- a/Aimbot-PPO-Python/Pytorch/ppoagent.py +++ b/Aimbot-PPO-Python/Pytorch/ppoagent.py @@ -1,6 +1,7 @@ import numpy as np import torch import argparse +import time from torch import nn from aimbotEnv import Aimbot @@ 
-19,123 +20,118 @@ class PPOAgent(nn.Module): self, env: Aimbot, this_args:argparse.Namespace, - train_agent: bool, - target_num: int, - target_state_size: int, - time_state_size: int, - gun_state_size: int, - my_state_size: int, - total_t_size: int, device: torch.device, ): super(PPOAgent, self).__init__() self.device = device self.args = this_args - self.trainAgent = train_agent - self.targetNum = target_num - self.stateSize = env.unity_observation_shape[0] - self.agentNum = env.unity_agent_num - self.targetSize = target_state_size - self.timeSize = time_state_size - self.gunSize = gun_state_size - self.myStateSize = my_state_size - self.raySize = env.unity_observation_shape[0] - total_t_size - self.nonRaySize = total_t_size + self.train_agent = self.args.train + self.target_num = self.args.target_num + self.unity_observation_shape = env.unity_observation_shape + self.unity_action_size = env.unity_action_size + self.state_size = self.unity_observation_shape[0] + self.agent_num = env.unity_agent_num + self.target_size = self.args.target_state_size + self.time_state_size = self.args.time_state_size + self.gun_state_size = self.args.gun_state_size + self.my_state_size = self.args.my_state_size + self.ray_state_size = env.unity_observation_shape[0] - self.args.total_target_size + self.state_size_without_ray = self.args.total_target_size self.head_input_size = ( - env.unity_observation_shape[0] - self.targetSize - self.timeSize - self.gunSize + env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size ) # except target state input - self.unityDiscreteType = env.unity_discrete_type + self.unity_discrete_type = env.unity_discrete_type self.discrete_size = env.unity_discrete_size self.discrete_shape = list(env.unity_discrete_branches) self.continuous_size = env.unity_continuous_size - self.viewNetwork = nn.Sequential(layer_init(nn.Linear(self.raySize, 200)), nn.LeakyReLU()) - self.targetNetworks = nn.ModuleList( + self.view_network = nn.Sequential(layer_init(nn.Linear(self.ray_state_size, 200)), nn.LeakyReLU()) + self.target_networks = nn.ModuleList( [ - nn.Sequential(layer_init(nn.Linear(self.nonRaySize, 100)), nn.LeakyReLU()) - for i in range(target_num) + nn.Sequential(layer_init(nn.Linear(self.state_size_without_ray, 100)), nn.LeakyReLU()) + for i in range(self.target_num) ] ) - self.middleNetworks = nn.ModuleList( + self.middle_networks = nn.ModuleList( [ nn.Sequential(layer_init(nn.Linear(300, 200)), nn.LeakyReLU()) - for i in range(target_num) + for i in range(self.target_num) ] ) self.actor_dis = nn.ModuleList( - [layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(target_num)] + [layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(self.target_num)] ) self.actor_mean = nn.ModuleList( - [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(target_num)] + [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(self.target_num)] ) # self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)]) # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size)) self.actor_logstd = nn.ParameterList( - [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(target_num)] + [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)] ) # nn.Parameter(torch.zeros(1, self.continuous_size)) self.critic = nn.ModuleList( - [layer_init(nn.Linear(200, 1), std=1) for i in range(target_num)] + 
[layer_init(nn.Linear(200, 1), std=1) for i in range(self.target_num)] ) def get_value(self, state: torch.Tensor): target = state[:, 0].to(torch.int32) # int - thisStateNum = target.size()[0] - viewInput = state[:, -self.raySize :] # all ray input - targetInput = state[:, : self.nonRaySize] - viewLayer = self.viewNetwork(viewInput) - targetLayer = torch.stack( - [self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)] + this_state_num = target.size()[0] + view_input = state[:, -self.ray_state_size :] # all ray input + target_input = state[:, : self.state_size_without_ray] + view_layer = self.view_network(view_input) + target_layer = torch.stack( + [self.target_networks[target[i]](target_input[i]) for i in range(this_state_num)] ) - middleInput = torch.cat([viewLayer, targetLayer], dim=1) - middleLayer = torch.stack( - [self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)] + middle_input = torch.cat([view_layer, target_layer], dim=1) + middle_layer = torch.stack( + [self.middle_networks[target[i]](middle_input[i]) for i in range(this_state_num)] ) criticV = torch.stack( - [self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)] + [self.critic[target[i]](middle_layer[i]) for i in range(this_state_num)] ) # self.critic return criticV def get_actions_value(self, state: torch.Tensor, actions=None): target = state[:, 0].to(torch.int32) # int - thisStateNum = target.size()[0] - viewInput = state[:, -self.raySize :] # all ray input - targetInput = state[:, : self.nonRaySize] - viewLayer = self.viewNetwork(viewInput) - targetLayer = torch.stack( - [self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)] + this_state_num = target.size()[0] + view_input = state[:, -self.ray_state_size :] # all ray input + target_input = state[:, : self.state_size_without_ray] + view_layer = self.view_network(view_input) + target_layer = torch.stack( + [self.target_networks[target[i]](target_input[i]) for i in range(this_state_num)] ) - middleInput = torch.cat([viewLayer, targetLayer], dim=1) - middleLayer = torch.stack( - [self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)] + middle_input = torch.cat([view_layer, target_layer], dim=1) + middle_layer = torch.stack( + [self.middle_networks[target[i]](middle_input[i]) for i in range(this_state_num)] ) # discrete # 递归targets的数量,既agent数来实现根据target不同来选用对应的输出网络计算输出 dis_logits = torch.stack( - [self.actor_dis[target[i]](middleLayer[i]) for i in range(thisStateNum)] + [self.actor_dis[target[i]](middle_layer[i]) for i in range(this_state_num)] ) split_logits = torch.split(dis_logits, self.discrete_shape, dim=1) multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits] # continuous actions_mean = torch.stack( - [self.actor_mean[target[i]](middleLayer[i]) for i in range(thisStateNum)] + [self.actor_mean[target[i]](middle_layer[i]) for i in range(this_state_num)] ) # self.actor_mean(hidden) # action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_logstd(hidden) # action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean) action_logstd = torch.stack( - [torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(thisStateNum)] + [torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(this_state_num)] ) # print(action_logstd) action_std = torch.exp(action_logstd) # torch.exp(action_logstd) con_probs = Normal(actions_mean, action_std) # 
critic criticV = torch.stack( - [self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)] + [self.critic[target[i]](middle_layer[i]) for i in range(this_state_num)] ) # self.critic if actions is None: - if self.trainAgent: + if self.train_agent: # select actions base on probability distribution model disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals]) conAct = con_probs.sample() @@ -148,8 +144,8 @@ class PPOAgent(nn.Module): conAct = con_probs.sample() actions = torch.cat([disAct.T, conAct], dim=1) else: - disAct = actions[:, 0 : self.unityDiscreteType].T - conAct = actions[:, self.unityDiscreteType :] + disAct = actions[:, 0 : self.unity_discrete_type].T + conAct = actions[:, self.unity_discrete_type :] dis_log_prob = torch.stack( [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)] ) @@ -162,6 +158,123 @@ class PPOAgent(nn.Module): con_probs.entropy().sum(1), criticV, ) + def train_net(self, this_train_ind:int,ppo_memories,optimizer) -> tuple: + start_time = time.time() + # flatten the batch + b_obs = ppo_memories.obs[this_train_ind].reshape((-1,) + self.unity_observation_shape) + b_dis_logprobs = ppo_memories.dis_logprobs[this_train_ind].reshape(-1) + b_con_logprobs = ppo_memories.con_logprobs[this_train_ind].reshape(-1) + b_actions = ppo_memories.actions[this_train_ind].reshape((-1,) + (self.unity_action_size,)) + b_advantages = ppo_memories.advantages[this_train_ind].reshape(-1) + b_returns = ppo_memories.returns[this_train_ind].reshape(-1) + b_values = ppo_memories.values[this_train_ind].reshape(-1) + b_size = b_obs.size()[0] + # optimizing the policy and value network + b_inds = np.arange(b_size) + + for epoch in range(self.args.epochs): + print("epoch:",epoch,end="") + # shuffle all datasets + np.random.shuffle(b_inds) + for start in range(0, b_size, self.args.minibatchSize): + print(".",end="") + end = start + self.args.minibatchSize + mb_inds = b_inds[start:end] + if(np.size(mb_inds)<=1): + break + mb_advantages = b_advantages[mb_inds] + + # normalize advantages + if self.args.norm_adv: + mb_advantages = (mb_advantages - mb_advantages.mean()) / ( + mb_advantages.std() + 1e-8 + ) + + ( + _, + new_dis_logprob, + dis_entropy, + new_con_logprob, + con_entropy, + newvalue, + ) = self.get_actions_value(b_obs[mb_inds], b_actions[mb_inds]) + # discrete ratio + dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds] + dis_ratio = dis_logratio.exp() + # continuous ratio + con_logratio = new_con_logprob - b_con_logprobs[mb_inds] + con_ratio = con_logratio.exp() + + """ + # early stop + with torch.no_grad(): + # calculate approx_kl http://joschu.net/blog/kl-approx.html + old_approx_kl = (-logratio).mean() + approx_kl = ((ratio - 1) - logratio).mean() + clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()] + """ + + # discrete Policy loss + dis_pg_loss_orig = -mb_advantages * dis_ratio + dis_pg_loss_clip = -mb_advantages * torch.clamp( + dis_ratio, 1 - self.args.clip_coef, 1 + self.args.clip_coef + ) + dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean() + # continuous Policy loss + con_pg_loss_orig = -mb_advantages * con_ratio + con_pg_loss_clip = -mb_advantages * torch.clamp( + con_ratio, 1 - self.args.clip_coef, 1 + self.args.clip_coef + ) + con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean() + + # Value loss + newvalue = newvalue.view(-1) + if self.args.clip_vloss: + v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2 + v_clipped = b_values[mb_inds] + torch.clamp( + newvalue - 
b_values[mb_inds], + -self.args.clip_coef, + self.args.clip_coef, + ) + v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2 + v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) + v_loss = 0.5 * v_loss_max.mean() + else: + v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean() + + # total loss + entropy_loss = dis_entropy.mean() + con_entropy.mean() + loss = ( + dis_pg_loss * self.args.policy_coef[this_train_ind] + + con_pg_loss * self.args.policy_coef[this_train_ind] + + entropy_loss * self.args.entropy_coef[this_train_ind] + + v_loss * self.args.critic_coef[this_train_ind] + )*self.args.loss_coef[this_train_ind] + + if(torch.isnan(loss).any()): + print("LOSS includes NaN!!!") + if(torch.isnan(dis_pg_loss).any()): + print("dis_pg_loss includes NaN") + if(torch.isnan(con_pg_loss).any()): + print("con_pg_loss includes NaN") + if(torch.isnan(entropy_loss).any()): + print("entropy_loss includes NaN") + if(torch.isnan(v_loss).any()): + print("v_loss includes NaN") + raise ValueError("NaN detected in loss") + + optimizer.zero_grad() + loss.backward() + # Clips gradient norm of an iterable of parameters. + nn.utils.clip_grad_norm_(self.parameters(), self.args.max_grad_norm) + optimizer.step() + + """ + if args.target_kl is not None: + if approx_kl > args.target_kl: + break + """ + return (v_loss,dis_pg_loss,con_pg_loss,loss,entropy_loss) def gae( self, diff --git a/Aimbot-PPO-Python/Pytorch/pytorch_run_archive.zip b/Aimbot-PPO-Python/Pytorch/pytorch_run_archive.zip new file mode 100644 index 0000000..f085b11 Binary files /dev/null and b/Aimbot-PPO-Python/Pytorch/pytorch_run_archive.zip differ
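Note on the NaN guard in `PPOAgent.train_net` above: the reduction must be applied to the `isnan` mask, i.e. `torch.isnan(x).any()`, not `torch.isnan(x.any())`. The following is a minimal, self-contained sketch of that guard, not part of the patch; the helper name `assert_finite` and the dummy tensors are illustrative only.

```python
import torch

def assert_finite(**losses: torch.Tensor) -> None:
    # Reduce each per-element isnan mask to a single bool per loss term.
    bad = [name for name, value in losses.items() if torch.isnan(value).any()]
    if bad:
        raise ValueError(f"NaN detected in: {', '.join(bad)}")

# Example usage with dummy loss terms; in train_net the real
# v_loss / dis_pg_loss / con_pg_loss / entropy_loss would be passed.
v_loss = torch.tensor(0.5)
dis_pg_loss = torch.tensor(float("nan"))
assert_finite(v_loss=v_loss, dis_pg_loss=dis_pg_loss)  # raises ValueError
```

Wrapping the four per-term checks in one helper like this keeps the training loop body short while still reporting which loss term went non-finite.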