diff --git a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
index d64a08f..650ddda 100644
--- a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
+++ b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
@@ -10,6 +10,7 @@ import torch.optim as optim
 from AimbotEnv import Aimbot
 from tqdm import tqdm
+from enum import Enum
 from torch.distributions.normal import Normal
 from torch.distributions.categorical import Categorical
 from distutils.util import strtobool
@@ -34,11 +35,11 @@ BASE_PORT = 1001
 # max round steps per agent is 2500/Decision_period, 25 seconds
 # !!!check every parameters before run!!!
-TOTAL_STEPS = 6000000
+TOTAL_STEPS = 6750000
 BATCH_SIZE = 512
-MAX_TRAINNING_DATASETS = 8000
+MAX_TRAINNING_DATASETS = 3000
 DECISION_PERIOD = 1
-LEARNING_RATE = 8e-4
+LEARNING_RATE = 1e-3
 GAMMA = 0.99
 GAE_LAMBDA = 0.95
 EPOCHS = 4
@@ -54,17 +55,27 @@ NORM_ADV = True
 TRAIN = True
 WANDB_TACK = True
-#LOAD_DIR = None
-LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
+LOAD_DIR = None
+#LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"

 # public data
+class Targets(Enum):
+    Free = 0
+    Go = 1
+    Attack = 2
+    Defence = 3
+    Num = 4
 BASE_WINREWARD = 999
 BASE_LOSEREWARD = -999
 TARGETNUM= 4
 ENV_TIMELIMIT = 30
 RESULT_BROADCAST_RATIO = 2/ENV_TIMELIMIT
-TotalRounds = {"Go":0,"Attack":0,"Free":0}
-WinRounds = {"Go":0,"Attack":0,"Free":0}
+TotalRounds = {"Free":0,"Go":0,"Attack":0}
+WinRounds = {"Free":0,"Go":0,"Attack":0}
+
+# !!!SPECIAL PARAMETERS!!!
+# change this when the program is finished
+using_targets_num = 3


 def parse_args():
@@ -164,7 +175,7 @@ class PPOAgent(nn.Module):
     def get_actions_value(self, state: torch.Tensor, actions=None):
         hidden = self.network(state)
-        targets = state[:,0]
+        targets = state[:,0].to(torch.int32)

         # discrete
         # iterate over the targets (i.e. the agents) and use each target's corresponding output network to compute its outputs
@@ -321,8 +332,8 @@ if __name__ == "__main__":
     optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)

     # Tensorboard and WandB Recorder
-    game_name = "Aimbot_Target"
-    game_type = "OffPolicy_HMNN_EndBC"
+    game_name = "Aimbot_Target_Hybrid_Multi_Output"
+    game_type = "OffPolicy_EndBC"
     run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
     if args.wandb_track:
         wandb.init(
@@ -351,14 +362,24 @@ if __name__ == "__main__":
     dones_bf = [[] for i in range(env.unity_agent_num)]
     values_bf = [[] for i in range(env.unity_agent_num)]

-    # TRY NOT TO MODIFY: start the game
-    total_update_step = args.total_timesteps // args.datasetSize
-    global_step = 0
+    # start the game
+    total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
+    target_steps = [0 for i in range(TARGETNUM)]
     start_time = time.time()
     state, _, done = env.reset()
     # state = torch.Tensor(next_obs).to(device)
     # next_done = torch.zeros(env.unity_agent_num).to(device)
+    # initialize empty training datasets
+    obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_observation_size)
+    actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_action_size)
+    dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+    con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+    rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+    values = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+    advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+    returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+

     for total_steps in range(total_update_step):
         # discount learning rate; when step == total_update_step, lr will be 0
         print("new episode")
@@ -368,24 +389,15 @@ if __name__ == "__main__":
             lrnow = frac * args.lr
             optimizer.param_groups[0]["lr"] = lrnow

-        # initialize empty training datasets
-        obs = torch.tensor([]).to(device) # (n,env.unity_observation_size)
-        actions = torch.tensor([]).to(device) # (n,env.unity_action_size)
-        dis_logprobs = torch.tensor([]).to(device) # (n,1)
-        con_logprobs = torch.tensor([]).to(device) # (n,1)
-        rewards = torch.tensor([]).to(device) # (n,1)
-        values = torch.tensor([]).to(device) # (n,1)
-        advantages = torch.tensor([]).to(device) # (n,1)
-        returns = torch.tensor([]).to(device) # (n,1)

         # MAIN LOOP: run agent in environment
        i = 0
         training = False
+        trainQueue = []
         while True:
             if i % args.decision_period == 0:
                 step = round(i / args.decision_period)
                 # Choose action by agent
-                global_step += 1 * env.unity_agent_num

                 with torch.no_grad():
                     # predict actions
@@ -416,7 +428,8 @@ if __name__ == "__main__":
                     # finished a round, send finished memories to training datasets
                     # compute advantage and discounted reward
                     #print(i,"over")
-                    thisRewardsTensor = broadCastEndReward(rewards_bf[i],state[i,6])
+                    roundTargetType = int(state[i,0])
+                    thisRewardsTensor = broadCastEndReward(rewards_bf[i],roundTargetType)
                     adv, rt = GAE(
                         agent,
                         args,
@@ -427,18 +440,18 @@ if __name__ == "__main__":
                         torch.Tensor([next_done[i]]).to(device),
                     )
                     # send memories to training datasets
-                    obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
-                    actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
-                    dis_logprobs = torch.cat(
-                        (dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
+                    obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
+                    actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
+                    dis_logprobs[roundTargetType] = torch.cat(
+                        (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
                     )
-                    con_logprobs = torch.cat(
-                        (con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
+                    con_logprobs[roundTargetType] = torch.cat(
+                        (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
                     )
-                    rewards = torch.cat((rewards, thisRewardsTensor), 0)
-                    values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
-                    advantages = torch.cat((advantages, adv), 0)
-                    returns = torch.cat((returns, rt), 0)
+                    rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
+                    values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
+                    advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
+                    returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)

                     # clear buffers
                     ob_bf[i] = []
@@ -448,10 +461,13 @@ if __name__ == "__main__":
                     rewards_bf[i] = []
                     dones_bf[i] = []
                     values_bf[i] = []
-                    print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
+                    print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")

-                if obs.size()[0] >= args.datasetSize:
-                    # start train NN
+                for i in range(TARGETNUM):
+                    if obs[i].size()[0] >= args.datasetSize:
+                        # start train NN
+                        trainQueue.append(i)
+                if(len(trainQueue)>0):
                     break
                 state, done = next_state, next_done
             else:
@@ -507,128 +523,143 @@ if __name__ == "__main__":
             i += 1

         if args.train:
-            # flatten the batch
-            b_obs = obs.reshape((-1,) + env.unity_observation_shape)
-            b_dis_logprobs = dis_logprobs.reshape(-1)
-            b_con_logprobs = con_logprobs.reshape(-1)
-            b_actions = actions.reshape((-1,) + (env.unity_action_size,))
-            b_advantages = advantages.reshape(-1)
-            b_returns = returns.reshape(-1)
-            b_values = values.reshape(-1)
-            b_size = b_obs.size()[0]
-            # Optimizing the policy and value network
-            b_inds = np.arange(b_size)
-            # clipfracs = []
-            for epoch in range(args.epochs):
-                # shuffle all datasets
-                np.random.shuffle(b_inds)
-                for start in range(0, b_size, args.minibatchSize):
-                    end = start + args.minibatchSize
-                    mb_inds = b_inds[start:end]
-                    mb_advantages = b_advantages[mb_inds]
+            meanRewardList = [] # for WANDB
+            # loop over the training queue
+            for thisT in trainQueue:
+                target_steps[thisT]+=1
+                # flatten the batch
+                b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
+                b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
+                b_con_logprobs = con_logprobs[thisT].reshape(-1)
+                b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
+                b_advantages = advantages[thisT].reshape(-1)
+                b_returns = returns[thisT].reshape(-1)
+                b_values = values[thisT].reshape(-1)
+                b_size = b_obs[thisT].size()[0]
+                # Optimizing the policy and value network
+                b_inds = np.arange(b_size)
+                # clipfracs = []
+                for epoch in range(args.epochs):
+                    # shuffle all datasets
+                    np.random.shuffle(b_inds)
+                    for start in range(0, b_size, args.minibatchSize):
+                        end = start + args.minibatchSize
+                        mb_inds = b_inds[start:end]
+                        mb_advantages = b_advantages[mb_inds]
-                    # normalize advantages
-                    if args.norm_adv:
-                        mb_advantages = (mb_advantages - mb_advantages.mean()) / (
-                            mb_advantages.std() + 1e-8
+                        # normalize advantages
+                        if args.norm_adv:
+                            mb_advantages = (mb_advantages - mb_advantages.mean()) / (
+                                mb_advantages.std() + 1e-8
+                            )
+
+                        (
+                            _,
+                            new_dis_logprob,
+                            dis_entropy,
+                            new_con_logprob,
+                            con_entropy,
+                            newvalue,
+                        ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
+                        # discrete ratio
+                        dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
+                        dis_ratio = dis_logratio.exp()
+                        # continuous ratio
+                        con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
+                        con_ratio = con_logratio.exp()
+
+                        """
+                        # early stop
+                        with torch.no_grad():
+                            # calculate approx_kl http://joschu.net/blog/kl-approx.html
+                            old_approx_kl = (-logratio).mean()
+                            approx_kl = ((ratio - 1) - logratio).mean()
+                            clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
+                        """
+
+                        # discrete Policy loss
+                        dis_pg_loss_orig = -mb_advantages * dis_ratio
+                        dis_pg_loss_clip = -mb_advantages * torch.clamp(
+                            dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
+                        )
+                        dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
+                        # continuous Policy loss
+                        con_pg_loss_orig = -mb_advantages * con_ratio
+                        con_pg_loss_clip = -mb_advantages * torch.clamp(
+                            con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
+                        )
+                        con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
+
+                        # Value loss
+                        newvalue = newvalue.view(-1)
+                        if args.clip_vloss:
+                            v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
+                            v_clipped = b_values[mb_inds] + torch.clamp(
+                                newvalue - b_values[mb_inds],
+                                -args.clip_coef,
+                                args.clip_coef,
+                            )
+                            v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
+                            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
+                            v_loss = 0.5 * v_loss_max.mean()
+                        else:
+                            v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
+
+                        # total loss
+                        entropy_loss = dis_entropy.mean() + con_entropy.mean()
+                        loss = (
+                            dis_pg_loss * args.policy_coef
+                            + con_pg_loss * args.policy_coef
+                            - entropy_loss * args.ent_coef
+                            + v_loss * args.critic_coef
                        )
-                    (
-                        _,
-                        new_dis_logprob,
-                        dis_entropy,
-                        new_con_logprob,
-                        con_entropy,
-                        newvalue,
-                    ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
-                    # discrete ratio
-                    dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
-                    dis_ratio = dis_logratio.exp()
-                    # continuous ratio
-                    con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
-                    con_ratio = con_logratio.exp()
+                        optimizer.zero_grad()
+                        loss.backward()
+                        # Clips gradient norm of an iterable of parameters.
+                        nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
+                        optimizer.step()
                    """
-                    # early stop
-                    with torch.no_grad():
-                        # calculate approx_kl http://joschu.net/blog/kl-approx.html
-                        old_approx_kl = (-logratio).mean()
-                        approx_kl = ((ratio - 1) - logratio).mean()
-                        clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
+                    if args.target_kl is not None:
+                        if approx_kl > args.target_kl:
+                            break
                    """
+                # record mean reward before clear history
+                targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
+                meanRewardList.append(targetRewardMean)
+                targetName = Targets(thisT).name
-                    # discrete Policy loss
-                    dis_pg_loss_orig = -mb_advantages * dis_ratio
-                    dis_pg_loss_clip = -mb_advantages * torch.clamp(
-                        dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
-                    )
-                    dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
-                    # continuous Policy loss
-                    con_pg_loss_orig = -mb_advantages * con_ratio
-                    con_pg_loss_clip = -mb_advantages * torch.clamp(
-                        con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
-                    )
-                    con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
+                # clear this target's training set buffer
+                obs[thisT] = torch.tensor([]).to(device)
+                actions[thisT] = torch.tensor([]).to(device)
+                dis_logprobs[thisT] = torch.tensor([]).to(device)
+                con_logprobs[thisT] = torch.tensor([]).to(device)
+                rewards[thisT] = torch.tensor([]).to(device)
+                values[thisT] = torch.tensor([]).to(device)
+                advantages[thisT] = torch.tensor([]).to(device)
+                returns[thisT] = torch.tensor([]).to(device)
-                    # Value loss
-                    newvalue = newvalue.view(-1)
-                    if args.clip_vloss:
-                        v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
-                        v_clipped = b_values[mb_inds] + torch.clamp(
-                            newvalue - b_values[mb_inds],
-                            -args.clip_coef,
-                            args.clip_coef,
-                        )
-                        v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
-                        v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
-                        v_loss = 0.5 * v_loss_max.mean()
-                    else:
-                        v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
-
-                    # total loss
-                    entropy_loss = dis_entropy.mean() + con_entropy.mean()
-                    loss = (
-                        dis_pg_loss * args.policy_coef
-                        + con_pg_loss * args.policy_coef
-                        - entropy_loss * args.ent_coef
-                        + v_loss * args.critic_coef
-                    )
-
-                    optimizer.zero_grad()
-                    loss.backward()
-                    # Clips gradient norm of an iterable of parameters.
-                    nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
-                    optimizer.step()
-
-                """
-                if args.target_kl is not None:
-                    if approx_kl > args.target_kl:
-                        break
-                """
-            # record rewards for plotting purposes
-            rewardsMean = np.mean(rewards.to("cpu").detach().numpy().copy())
-            writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
-            writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
-            writer.add_scalar("losses/dis_policy_loss", dis_pg_loss.item(), global_step)
-            writer.add_scalar("losses/con_policy_loss", con_pg_loss.item(), global_step)
-            writer.add_scalar("losses/total_loss", loss.item(), global_step)
-            writer.add_scalar("losses/entropy_loss", entropy_loss.item(), global_step)
-            # writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
-            # writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
-            # writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
-            # print("SPS:", int(global_step / (time.time() - start_time)))
-            print("episode over mean reward:", rewardsMean)
-            writer.add_scalar(
-                "charts/SPS", int(global_step / (time.time() - start_time)), global_step
-            )
-            writer.add_scalar("charts/Reward", rewardsMean, global_step)
-            writer.add_scalar("charts/GoWinRatio", WinRounds["Go"]/TotalRounds["Go"], global_step)
-            writer.add_scalar("charts/AttackWinRatio", WinRounds["Attack"]/TotalRounds["Attack"], global_step)
-            writer.add_scalar("charts/FreeWinRatio", WinRounds["Free"]/TotalRounds["Free"], global_step)
-            if rewardsMean > bestReward:
-                bestReward = rewardsMean
-                saveDir = "../PPO-Model/Target-700-500-256-hybrid-" + str(rewardsMean) + ".pt"
+                # record rewards for plotting purposes
+                writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT])
+                print(f"episode over Target{targetName} mean reward:", targetRewardMean)
+            TotalRewardMean = np.mean(meanRewardList)
+            writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
+            writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps)
+            # New Record!
+            if TotalRewardMean > bestReward:
+                bestReward = targetRewardMean
+                saveDir = "../PPO-Model/Hybrid-MNN-500-300" + str(TotalRewardMean) + ".pt"
                 torch.save(agent, saveDir)
+            saveDir = "../PPO-Model/Hybrid-MNN-500-300-Last" + ".pt"
+            torch.save(agent, saveDir)

     env.close()
     writer.close()
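
Two reference sketches follow, restating the core of this change outside the diff. First, the collection-side change replaces the single experience buffer with one buffer per target type and only queues a target for training once its buffer holds at least args.datasetSize samples. The sketch below is a simplified illustration under stated assumptions: one flat list per target stands in for the eight per-target tensors, and the helper names add_finished_round and train_queue do not exist in the script.

# Minimal sketch of the per-target buffer routing used in the collection loop.
# Assumption: one flat list per target instead of the eight per-target tensors
# (obs, actions, dis_logprobs, con_logprobs, rewards, values, advantages, returns).
from typing import List

TARGETNUM = 4        # Free, Go, Attack, Defence
DATASET_SIZE = 3000  # plays the role of args.datasetSize

buffers: List[list] = [[] for _ in range(TARGETNUM)]

def add_finished_round(target_type: int, transitions: list) -> None:
    """Route a finished round's transitions to the buffer of its target type."""
    buffers[target_type].extend(transitions)

def train_queue() -> List[int]:
    """Targets whose buffers hold enough samples to be trained this episode."""
    return [t for t in range(TARGETNUM) if len(buffers[t]) >= DATASET_SIZE]

# Schematic use inside the collection loop:
#   add_finished_round(int(state[i, 0]), round_transitions)
#   if train_queue():
#       break  # leave the collection loop; train only the queued targets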
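
Second, the per-minibatch loss assembled in the training loop combines a clipped policy term for the discrete action head, a clipped policy term for the continuous head, an entropy bonus, and a value term. The sketch below restates that combination as a standalone function; it is a simplification, not the script's code: argument names mirror the script's variables, the coefficients correspond to args.clip_coef, args.policy_coef, args.ent_coef and args.critic_coef, and only the unclipped value-loss branch is shown.

# Sketch of the hybrid PPO loss computed in the minibatch loop above.
# All tensor arguments are 1-D minibatch tensors; the clip_vloss branch is omitted.
import torch

def hybrid_ppo_loss(
    mb_advantages: torch.Tensor,
    new_dis_logprob: torch.Tensor, old_dis_logprob: torch.Tensor,
    new_con_logprob: torch.Tensor, old_con_logprob: torch.Tensor,
    newvalue: torch.Tensor, returns: torch.Tensor,
    dis_entropy: torch.Tensor, con_entropy: torch.Tensor,
    clip_coef: float, policy_coef: float, ent_coef: float, critic_coef: float,
) -> torch.Tensor:
    def clipped_pg_loss(logratio: torch.Tensor) -> torch.Tensor:
        # Standard PPO clipped surrogate, written as a loss (hence the minus signs).
        ratio = logratio.exp()
        unclipped = -mb_advantages * ratio
        clipped = -mb_advantages * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef)
        return torch.max(unclipped, clipped).mean()

    dis_pg_loss = clipped_pg_loss(new_dis_logprob - old_dis_logprob)
    con_pg_loss = clipped_pg_loss(new_con_logprob - old_con_logprob)
    v_loss = 0.5 * ((newvalue.view(-1) - returns) ** 2).mean()
    entropy_loss = dis_entropy.mean() + con_entropy.mean()
    return (
        dis_pg_loss * policy_coef
        + con_pg_loss * policy_coef
        - entropy_loss * ent_coef
        + v_loss * critic_coef
    )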