import argparse
import time
import numpy as np
import random
import uuid
import torch
import torch.nn as nn
import torch.optim as optim
import atexit

from aimbotEnv import Aimbot
from aimbotEnv import AimbotSideChannel
from ppoagent import PPOAgent
from airecorder import WandbRecorder
from aimemory import PPOMem
from aimemory import Targets
from enum import Enum
from distutils.util import strtobool

best_reward = -1

DEFAULT_SEED = 9331
ENV_PATH = "../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv"
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
WAND_ENTITY = "koha9"
WORKER_ID = 1
BASE_PORT = 1000

# tensorboard names
GAME_NAME = "Aimbot_Target_Hybrid_PMNN_V3"
GAME_TYPE = "Mix_Verification"

# max round steps per agent is 2500/Decision_period, 25 seconds
# !!!check every parameter before running!!!
TOTAL_STEPS = 3150000
BATCH_SIZE = 512
MAX_TRAINNING_DATASETS = 6000
DECISION_PERIOD = 1
LEARNING_RATE = 6.5e-4
GAMMA = 0.99
GAE_LAMBDA = 0.95
EPOCHS = 3
CLIP_COEF = 0.11
LOSS_COEF = [1.0, 1.0, 1.0, 1.0]  # free go attack defence
POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
TARGET_LEARNING_RATE = 1e-6

FREEZE_VIEW_NETWORK = True
BROADCASTREWARD = False
ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = False
TRAIN = True
SAVE_MODEL = False
WANDB_TACK = False
LOAD_DIR = None
# LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt"

TARGET_STATE_SIZE = 6
INAREA_STATE_SIZE = 1
TIME_STATE_SIZE = 1
GUN_STATE_SIZE = 1
MY_STATE_SIZE = 4
TOTAL_T_SIZE = TARGET_STATE_SIZE + INAREA_STATE_SIZE + TIME_STATE_SIZE + GUN_STATE_SIZE + MY_STATE_SIZE
BASE_WINREWARD = 999
BASE_LOSEREWARD = -999
TARGETNUM = 4
ENV_TIMELIMIT = 30
RESULT_BROADCAST_RATIO = 1 / ENV_TIMELIMIT

# !!!SPECIAL PARAMETERS!!!
# change this to match the number of targets actually used
using_targets_num = 3


def parse_args():
    # fmt: off
    # pytorch and environment parameters
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
                        help="seed of the experiment")
    parser.add_argument("--path", type=str, default=ENV_PATH,
                        help="environment path")
    parser.add_argument("--workerID", type=int, default=WORKER_ID,
                        help="unity worker ID")
    parser.add_argument("--baseport", type=int, default=BASE_PORT,
                        help="port to connect to Unity environment")
    parser.add_argument("--lr", type=float, default=LEARNING_RATE,
                        help="the learning rate of the optimizer")
    parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
                        help="if toggled, cuda will be enabled by default")
    parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
                        help="total timesteps of the experiment")
    # model parameters
    parser.add_argument("--train", type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
                        help="train the model or not")
    parser.add_argument("--freeze-viewnet", type=lambda x: bool(strtobool(x)), default=FREEZE_VIEW_NETWORK, nargs="?", const=True,
                        help="freeze the view network or not")
    parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
                        help="training dataset size; start training once the dataset has collected enough data")
    parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
                        help="mini batch size")
    parser.add_argument("--epochs", type=int, default=EPOCHS,
                        help="the K epochs to update the policy")
    parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
                        help="toggle learning rate annealing for policy and value networks")
    parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True,
                        help="track the run on wandb")
    parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=SAVE_MODEL, nargs="?", const=True,
                        help="save the model or not")
    parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY,
                        help="the entity (team) of wandb's project")
    parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
                        help="load model directory")
    parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
                        help="the number of steps to run in each environment per policy rollout")
    parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
                        help="broadcast result when a round is won, r = result-broadcast-ratio * remainTime")
    parser.add_argument("--broadCastEndReward", type=lambda x: bool(strtobool(x)), default=BROADCASTREWARD, nargs="?", const=True,
                        help="broadcast the end-of-round reward or not")
    # GAE loss
    parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
                        help="use GAE for advantage computation")
    parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
                        help="toggles advantage normalization")
    parser.add_argument("--gamma", type=float, default=GAMMA,
                        help="the discount factor gamma")
    parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
                        help="the lambda for generalized advantage estimation")
    parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
                        help="the surrogate clipping coefficient")
    parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
                        help="coefficient of the policy loss")
    parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF,
                        help="coefficient of the entropy")
    parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
                        help="coefficient of the value function")
    parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True,
                        help="toggles whether or not to use a clipped loss for the value function, as per the paper")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
                        help="the maximum norm for gradient clipping")
    parser.add_argument("--target-kl", type=float, default=None,
                        help="the target KL divergence threshold")
    # fmt: on
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

    # Initialize environment, agent and optimizer
    aimbot_sidechannel = AimbotSideChannel(SIDE_CHANNEL_UUID)
    env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport, side_channels=[aimbot_sidechannel])
    if args.load_dir is None:
        agent = PPOAgent(
            env=env,
            this_args=args,
            train_agent=args.train,
            target_num=TARGETNUM,
            target_state_size=TARGET_STATE_SIZE,
            time_state_size=TIME_STATE_SIZE,
            gun_state_size=GUN_STATE_SIZE,
            my_state_size=MY_STATE_SIZE,
            total_t_size=TOTAL_T_SIZE,
            device=device,
        ).to(device)
    else:
        agent = torch.load(args.load_dir)
        # freeze the view network if required
        if args.freeze_viewnet:
            for p in agent.viewNetwork.parameters():
                p.requires_grad = False
            print("VIEW NETWORK FROZEN")
        print("Load Agent", args.load_dir)
        print(agent.eval())

    optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)

    # Tensorboard and WandB Recorder
    run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
    wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)

    @atexit.register
    def save_model():
        # close env
        env.close()
        if args.save_model:
            # save model on exit
            save_dir = "../PPO-Model/" + run_name + "_last.pt"
            torch.save(agent, save_dir)
            print("save model to " + save_dir)

    # start the game
    total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
    target_steps = [0 for i in range(TARGETNUM)]
    start_time = time.time()
    state, _, done = env.reset()

    # initialize AI memories
    ppo_memories = PPOMem(
        env=env,
        device=device,
        args=args,
        target_num=TARGETNUM,
        target_state_size=TARGET_STATE_SIZE,
        base_lose_reward=BASE_LOSEREWARD,
        base_win_reward=BASE_WINREWARD,
    )

    for total_steps in range(total_update_step):
        # anneal the learning rate; when total_steps reaches total_update_step the lr will be 0
        if args.annealLR:
            final_lr_ratio = TARGET_LEARNING_RATE / args.lr
            frac = 1.0 - ((total_steps + 1.0) / total_update_step)
            lr_now = frac * args.lr
            optimizer.param_groups[0]["lr"] = lr_now
        else:
            lr_now = args.lr
        print("new episode", total_steps, "learning rate = ", lr_now)

        # MAIN LOOP: run agent in environment
        step = 0
        training = False
        train_queue = []
        last_reward = [0.0 for i in range(env.unity_agent_num)]
        while True:
            if step % args.decision_period == 0:
                step += 1
                # choose action by agent
                with torch.no_grad():
                    # predict actions
                    action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
                        torch.Tensor(state).to(device)
                    )
                    value = value.flatten()

                # move variables from GPU to CPU
                action_cpu = action.cpu().numpy()
                dis_logprob_cpu = dis_logprob.cpu().numpy()
                con_logprob_cpu = con_logprob.cpu().numpy()
                value_cpu = value.cpu().numpy()

                # environment step
                next_state, reward, next_done = env.step(action_cpu)

                # save memories
                ppo_memories.save_memories(
                    now_step=step,
                    agent=agent,
                    state=state,
                    action_cpu=action_cpu,
                    dis_logprob_cpu=dis_logprob_cpu,
                    con_logprob_cpu=con_logprob_cpu,
                    reward=reward,
                    done=done,
                    value_cpu=value_cpu,
                    last_reward=last_reward,
                    next_done=next_done,
                    next_state=next_state,
                )

                # check if any training dataset is full and ready to train
                for i in range(TARGETNUM):
                    if ppo_memories.obs[i].size()[0] >= args.datasetSize:
                        # start training the NN
                        train_queue.append(i)
                if len(train_queue) > 0:
                    break
                state, done = next_state, next_done
            else:
                step += 1
                # skip this step and reuse the last predicted action
                next_state, reward, next_done = env.step(action_cpu)
                # save memories
                ppo_memories.save_memories(
                    now_step=step,
                    agent=agent,
                    state=state,
                    action_cpu=action_cpu,
                    dis_logprob_cpu=dis_logprob_cpu,
                    con_logprob_cpu=con_logprob_cpu,
                    reward=reward,
                    done=done,
                    value_cpu=value_cpu,
                    last_reward=last_reward,
                    next_done=next_done,
                    next_state=next_state,
                )
                state = next_state
                last_reward = reward

        if args.train:
            # train mode on
            mean_reward_list = []  # for WANDB
            # loop over the training queue
            for thisT in train_queue:
                # start time
                start_time = time.time()
                target_steps[thisT] += 1
                # flatten the batch
                b_obs = ppo_memories.obs[thisT].reshape((-1,) + env.unity_observation_shape)
                b_dis_logprobs = ppo_memories.dis_logprobs[thisT].reshape(-1)
                b_con_logprobs = ppo_memories.con_logprobs[thisT].reshape(-1)
                b_actions = ppo_memories.actions[thisT].reshape((-1,) + (env.unity_action_size,))
                b_advantages = ppo_memories.advantages[thisT].reshape(-1)
                b_returns = ppo_memories.returns[thisT].reshape(-1)
                b_values = ppo_memories.values[thisT].reshape(-1)
                b_size = b_obs.size()[0]

                # Optimizing the policy and value network
                b_inds = np.arange(b_size)
                # clipfracs = []
                for epoch in range(args.epochs):
                    print(epoch, end="")
                    # shuffle all datasets
                    np.random.shuffle(b_inds)
                    for start in range(0, b_size, args.minibatchSize):
                        print(".", end="")
                        end = start + args.minibatchSize
                        mb_inds = b_inds[start:end]
                        if np.size(mb_inds) <= 1:
                            break
                        mb_advantages = b_advantages[mb_inds]

                        # normalize advantages
                        if args.norm_adv:
                            mb_advantages = (mb_advantages - mb_advantages.mean()) / (
                                mb_advantages.std() + 1e-8
                            )

                        (
                            _,
                            new_dis_logprob,
                            dis_entropy,
                            new_con_logprob,
                            con_entropy,
                            newvalue,
                        ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])

                        # discrete ratio
                        dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
                        dis_ratio = dis_logratio.exp()
                        # continuous ratio
                        con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
                        con_ratio = con_logratio.exp()

                        """
                        # early stop
                        with torch.no_grad():
                            # calculate approx_kl http://joschu.net/blog/kl-approx.html
                            old_approx_kl = (-logratio).mean()
                            approx_kl = ((ratio - 1) - logratio).mean()
                            clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
                        """

                        # discrete policy loss
                        dis_pg_loss_orig = -mb_advantages * dis_ratio
                        dis_pg_loss_clip = -mb_advantages * torch.clamp(
                            dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
                        )
                        dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
                        # continuous policy loss
                        con_pg_loss_orig = -mb_advantages * con_ratio
                        con_pg_loss_clip = -mb_advantages * torch.clamp(
                            con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
                        )
                        con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()

                        # value loss
                        newvalue = newvalue.view(-1)
                        if args.clip_vloss:
                            v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                            v_clipped = b_values[mb_inds] + torch.clamp(
                                newvalue - b_values[mb_inds],
                                -args.clip_coef,
                                args.clip_coef,
                            )
                            v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                            v_loss = 0.5 * v_loss_max.mean()
                        else:
                            v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

                        # total loss
                        entropy_loss = dis_entropy.mean() + con_entropy.mean()
                        loss = (
                            dis_pg_loss * POLICY_COEF[thisT]
                            + con_pg_loss * POLICY_COEF[thisT]
                            + entropy_loss * ENTROPY_COEF[thisT]
                            + v_loss * CRITIC_COEF[thisT]
                        ) * LOSS_COEF[thisT]

                        if torch.isnan(loss).any():
                            print("LOSS Include NAN!!!")
                            if torch.isnan(dis_pg_loss).any():
                                print("dis_pg_loss include nan")
                            if torch.isnan(con_pg_loss).any():
                                print("con_pg_loss include nan")
                            if torch.isnan(entropy_loss).any():
                                print("entropy_loss include nan")
                            if torch.isnan(v_loss).any():
                                print("v_loss include nan")
                            raise ValueError("loss contains NaN")

                        optimizer.zero_grad()
                        loss.backward()
                        # Clips gradient norm of an iterable of parameters.
                        nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                        optimizer.step()

                    """
                    if args.target_kl is not None:
                        if approx_kl > args.target_kl:
                            break
                    """

                # record mean reward before clearing history
                print("done")
                targetRewardMean = np.mean(ppo_memories.rewards[thisT].to("cpu").detach().numpy().copy())
                mean_reward_list.append(targetRewardMean)
                targetName = Targets(thisT).name

                # clear this target's training set buffer
                ppo_memories.clear_training_datasets(thisT)

                # record rewards for plotting purposes
                wdb_recorder.add_target_scalar(
                    targetName,
                    thisT,
                    v_loss,
                    dis_pg_loss,
                    con_pg_loss,
                    loss,
                    entropy_loss,
                    targetRewardMean,
                    target_steps,
                )
                print(f"episode over Target{targetName} mean reward:", targetRewardMean)

            TotalRewardMean = np.mean(mean_reward_list)
            wdb_recorder.add_global_scalar(
                TotalRewardMean,
                optimizer.param_groups[0]["lr"],
                total_steps,
            )
            # print elapsed time in seconds
            print("cost time:", time.time() - start_time)
            # New Record!
            if TotalRewardMean > best_reward and args.save_model:
                best_reward = TotalRewardMean
                saveDir = "../PPO-Model/" + run_name + "_" + str(TotalRewardMean) + ".pt"
                torch.save(agent, saveDir)
        else:
            # train mode off
            mean_reward_list = []  # for WANDB
            # while not in training mode, just clear the buffer
            for thisT in train_queue:
                target_steps[thisT] += 1
                targetName = Targets(thisT).name
                targetRewardMean = np.mean(ppo_memories.rewards[thisT].to("cpu").detach().numpy().copy())
                mean_reward_list.append(targetRewardMean)
                print(target_steps[thisT])

                # clear this target's training set buffer
                ppo_memories.clear_training_datasets(thisT)

                # record rewards for plotting purposes
                wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
                wdb_recorder.add_win_ratio(targetName, target_steps[thisT])
                print(f"episode over Target{targetName} mean reward:", targetRewardMean)

            TotalRewardMean = np.mean(mean_reward_list)
            wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)

    saveDir = "../PPO-Model/" + run_name + "_last.pt"
    torch.save(agent, saveDir)
    env.close()
    wdb_recorder.writer.close()