From cbc385ca10f9be79f742db800b174c5dbb4e7bab Mon Sep 17 00:00:00 2001
From: Koha9
Date: Sat, 3 Dec 2022 07:54:38 +0900
Subject: [PATCH] Change training dataset storage method

Store the training data separately by target type. While training, the NN
is updated with a single target's training set only, so each backward pass
uses data from one target. This makes training at least 20 times faster
than the previous revision! An illustrative sketch of the per-target
buffering scheme follows the patch.
---
 Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py | 331 +++++++++++++----------
 1 file changed, 181 insertions(+), 150 deletions(-)

diff --git a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
index d64a08f..650ddda 100644
--- a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
+++ b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
@@ -10,6 +10,7 @@ import torch.optim as optim
 
 from AimbotEnv import Aimbot
 from tqdm import tqdm
+from enum import Enum
 from torch.distributions.normal import Normal
 from torch.distributions.categorical import Categorical
 from distutils.util import strtobool
@@ -34,11 +35,11 @@ BASE_PORT = 1001
 
 # max round steps per agent is 2500/Decision_period, 25 seconds
 # !!!check every parameter before running!!!
-TOTAL_STEPS = 6000000
+TOTAL_STEPS = 6750000
 BATCH_SIZE = 512
-MAX_TRAINNING_DATASETS = 8000
+MAX_TRAINNING_DATASETS = 3000
 DECISION_PERIOD = 1
-LEARNING_RATE = 8e-4
+LEARNING_RATE = 1e-3
 GAMMA = 0.99
 GAE_LAMBDA = 0.95
 EPOCHS = 4
@@ -54,17 +55,27 @@ NORM_ADV = True
 TRAIN = True
 
 WANDB_TACK = True
-#LOAD_DIR = None
-LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
+LOAD_DIR = None
+#LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
 
 # public data
+class Targets(Enum):
+    Free = 0
+    Go = 1
+    Attack = 2
+    Defence = 3
+    Num = 4
 BASE_WINREWARD = 999
 BASE_LOSEREWARD = -999
 TARGETNUM= 4
 ENV_TIMELIMIT = 30
 RESULT_BROADCAST_RATIO = 2/ENV_TIMELIMIT
-TotalRounds = {"Go":0,"Attack":0,"Free":0}
-WinRounds = {"Go":0,"Attack":0,"Free":0}
+TotalRounds = {"Free":0,"Go":0,"Attack":0}
+WinRounds = {"Free":0,"Go":0,"Attack":0}
+
+# !!!SPECIAL PARAMETERS!!!
+# change this once all target types are implemented
+using_targets_num = 3
 
 
 def parse_args():
@@ -164,7 +175,7 @@ class PPOAgent(nn.Module):
 
     def get_actions_value(self, state: torch.Tensor, actions=None):
         hidden = self.network(state)
-        targets = state[:,0]
+        targets = state[:,0].to(torch.int32)
 
         # discrete
         # iterate over the targets (i.e. over the agents) and pick each target's corresponding output network to compute its output
@@ -321,8 +332,8 @@ if __name__ == "__main__":
     optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
 
     # Tensorboard and WandB Recorder
-    game_name = "Aimbot_Target"
-    game_type = "OffPolicy_HMNN_EndBC"
+    game_name = "Aimbot_Target_Hybrid_Multi_Output"
+    game_type = "OffPolicy_EndBC"
     run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
     if args.wandb_track:
         wandb.init(
@@ -351,14 +362,24 @@ if __name__ == "__main__":
     dones_bf = [[] for i in range(env.unity_agent_num)]
     values_bf = [[] for i in range(env.unity_agent_num)]
 
-    # TRY NOT TO MODIFY: start the game
-    total_update_step = args.total_timesteps // args.datasetSize
-    global_step = 0
+    # start the game
+    total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
+    target_steps = [0 for i in range(TARGETNUM)]
     start_time = time.time()
     state, _, done = env.reset()
     # state = torch.Tensor(next_obs).to(device)
     # next_done = torch.zeros(env.unity_agent_num).to(device)
 
+    # initialize empty training datasets
+    obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,env.unity_observation_size)
+    actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,env.unity_action_size)
+    dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
+    con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
+    rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
+    values = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
+    advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
+    returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
+
     for total_steps in range(total_update_step):
         # discount learning rate; when step == total_update_step the lr will be 0
         print("new episode")
@@ -368,24 +389,15 @@ if __name__ == "__main__":
             lrnow = frac * args.lr
             optimizer.param_groups[0]["lr"] = lrnow
 
-        # initialize empty training datasets
-        obs = torch.tensor([]).to(device)  # (n,env.unity_observation_size)
-        actions = torch.tensor([]).to(device)  # (n,env.unity_action_size)
-        dis_logprobs = torch.tensor([]).to(device)  # (n,1)
-        con_logprobs = torch.tensor([]).to(device)  # (n,1)
-        rewards = torch.tensor([]).to(device)  # (n,1)
-        values = torch.tensor([]).to(device)  # (n,1)
-        advantages = torch.tensor([]).to(device)  # (n,1)
-        returns = torch.tensor([]).to(device)  # (n,1)
 
         # MAIN LOOP: run agent in environment
         i = 0
         training = False
+        trainQueue = []
         while True:
             if i % args.decision_period == 0:
                 step = round(i / args.decision_period)
                 # Choose action by agent
-                global_step += 1 * env.unity_agent_num
 
                 with torch.no_grad():
                     # predict actions
@@ -416,7 +428,8 @@ if __name__ == "__main__":
                     # finished a round, send finished memories to training datasets
                    # compute advantage and discounted reward
                     #print(i,"over")
-                    thisRewardsTensor = broadCastEndReward(rewards_bf[i],state[i,6])
+                    roundTargetType = int(state[i,0])
+                    thisRewardsTensor = broadCastEndReward(rewards_bf[i],roundTargetType)
                     adv, rt = GAE(
                         agent,
                         args,
                         torch.tensor(rewards_bf[i]).to(device),
                         torch.tensor(dones_bf[i]).to(device),
                         torch.tensor(values_bf[i]).to(device),
                         torch.Tensor([next_done[i]]).to(device),
                     )
                     # send memories to training datasets
-                    obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
-                    actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
-                    dis_logprobs = torch.cat(
-                        (dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
+                    obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
+                    actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
+                    dis_logprobs[roundTargetType] = torch.cat(
+                        (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
                     )
-                    con_logprobs = torch.cat(
-                        (con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
+                    con_logprobs[roundTargetType] = torch.cat(
+                        (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
                     )
-                    rewards = torch.cat((rewards, thisRewardsTensor), 0)
-                    values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
-                    advantages = torch.cat((advantages, adv), 0)
-                    returns = torch.cat((returns, rt), 0)
+                    rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
+                    values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
+                    advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
+                    returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
 
                     # clear buffers
                     ob_bf[i] = []
                     act_bf[i] = []
                     dis_logprobs_bf[i] = []
                     con_logprobs_bf[i] = []
                     rewards_bf[i] = []
                     dones_bf[i] = []
                     values_bf[i] = []
-                    print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
+                    print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
 
-            if obs.size()[0] >= args.datasetSize:
-                # start train NN
+            for i in range(TARGETNUM):
+                if obs[i].size()[0] >= args.datasetSize:
+                    # start train NN
+                    trainQueue.append(i)
+            if(len(trainQueue)>0):
                 break
             state, done = next_state, next_done
         else:
                     i += 1
 
         if args.train:
-            # flatten the batch
-            b_obs = obs.reshape((-1,) + env.unity_observation_shape)
-            b_dis_logprobs = dis_logprobs.reshape(-1)
-            b_con_logprobs = con_logprobs.reshape(-1)
-            b_actions = actions.reshape((-1,) + (env.unity_action_size,))
-            b_advantages = advantages.reshape(-1)
-            b_returns = returns.reshape(-1)
-            b_values = values.reshape(-1)
-            b_size = b_obs.size()[0]
-            # Optimizing the policy and value network
-            b_inds = np.arange(b_size)
-            # clipfracs = []
-            for epoch in range(args.epochs):
-                # shuffle all datasets
-                np.random.shuffle(b_inds)
-                for start in range(0, b_size, args.minibatchSize):
-                    end = start + args.minibatchSize
-                    mb_inds = b_inds[start:end]
-                    mb_advantages = b_advantages[mb_inds]
+            meanRewardList = []  # for WANDB
+            # loop over all targets in the training queue
+            for thisT in trainQueue:
+                target_steps[thisT]+=1
+                # flatten the batch
+                b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
+                b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
+                b_con_logprobs = con_logprobs[thisT].reshape(-1)
+                b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
+                b_advantages = advantages[thisT].reshape(-1)
+                b_returns = returns[thisT].reshape(-1)
+                b_values = values[thisT].reshape(-1)
+                b_size = b_obs.size()[0]
+                # Optimizing the policy and value network
+                b_inds = np.arange(b_size)
+                # clipfracs = []
+                for epoch in range(args.epochs):
+                    # shuffle all datasets
+                    np.random.shuffle(b_inds)
+                    for start in range(0, b_size, args.minibatchSize):
+                        end = start + args.minibatchSize
+                        mb_inds = b_inds[start:end]
+                        mb_advantages = b_advantages[mb_inds]
 
-                    # normalize advantages
-                    if args.norm_adv:
-                        mb_advantages = (mb_advantages - mb_advantages.mean()) / (
-                            mb_advantages.std() + 1e-8
+                        # normalize advantages
+                        if args.norm_adv:
+                            mb_advantages = (mb_advantages - mb_advantages.mean()) / (
+                                mb_advantages.std() + 1e-8
+                            )
+
+                        (
+                            _,
+                            new_dis_logprob,
+                            dis_entropy,
+                            new_con_logprob,
+                            con_entropy,
+                            newvalue,
+                        ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
+                        # discrete ratio
+                        dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
+                        dis_ratio = dis_logratio.exp()
+                        # continuous ratio
+                        con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
+                        con_ratio = con_logratio.exp()
+
+                        """
+                        # early stop
+                        with torch.no_grad():
+                            # calculate approx_kl http://joschu.net/blog/kl-approx.html
+                            old_approx_kl = (-logratio).mean()
+                            approx_kl = ((ratio - 1) - logratio).mean()
+                            clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
+                        """
+
+                        # discrete Policy loss
+                        dis_pg_loss_orig = -mb_advantages * dis_ratio
+                        dis_pg_loss_clip = -mb_advantages * torch.clamp(
+                            dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
+                        )
+                        dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
+                        # continuous Policy loss
+                        con_pg_loss_orig = -mb_advantages * con_ratio
+                        con_pg_loss_clip = -mb_advantages * torch.clamp(
+                            con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
+                        )
+                        con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
+
+                        # Value loss
+                        newvalue = newvalue.view(-1)
+                        if args.clip_vloss:
+                            v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
+                            v_clipped = b_values[mb_inds] + torch.clamp(
+                                newvalue - b_values[mb_inds],
+                                -args.clip_coef,
+                                args.clip_coef,
+                            )
+                            v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
+                            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
+                            v_loss = 0.5 * v_loss_max.mean()
+                        else:
+                            v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
+
+                        # total loss
+                        entropy_loss = dis_entropy.mean() + con_entropy.mean()
+                        loss = (
+                            dis_pg_loss * args.policy_coef
+                            + con_pg_loss * args.policy_coef
+                            - entropy_loss * args.ent_coef
+                            + v_loss * args.critic_coef
                         )
 
-                    (
-                        _,
-                        new_dis_logprob,
-                        dis_entropy,
-                        new_con_logprob,
-                        con_entropy,
-                        newvalue,
-                    ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
-                    # discrete ratio
-                    dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
-                    dis_ratio = dis_logratio.exp()
-                    # continuous ratio
-                    con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
-                    con_ratio = con_logratio.exp()
+                        optimizer.zero_grad()
+                        loss.backward()
+                        # Clips gradient norm of an iterable of parameters.
+                        nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
+                        optimizer.step()
+
+                        """
+                        if args.target_kl is not None:
+                            if approx_kl > args.target_kl:
+                                break
+                        """
+                # record mean reward before clearing history
+                targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
+                meanRewardList.append(targetRewardMean)
+                targetName = Targets(thisT).name
 
-                    # discrete Policy loss
-                    dis_pg_loss_orig = -mb_advantages * dis_ratio
-                    dis_pg_loss_clip = -mb_advantages * torch.clamp(
-                        dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
-                    )
-                    dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
+                # clear this target's training set buffer
+                obs[thisT] = torch.tensor([]).to(device)
+                actions[thisT] = torch.tensor([]).to(device)
+                dis_logprobs[thisT] = torch.tensor([]).to(device)
+                con_logprobs[thisT] = torch.tensor([]).to(device)
+                rewards[thisT] = torch.tensor([]).to(device)
+                values[thisT] = torch.tensor([]).to(device)
+                advantages[thisT] = torch.tensor([]).to(device)
+                returns[thisT] = torch.tensor([]).to(device)
 
-                    # Value loss
-                    newvalue = newvalue.view(-1)
-                    if args.clip_vloss:
-                        v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
-                        v_clipped = b_values[mb_inds] + torch.clamp(
-                            newvalue - b_values[mb_inds],
-                            -args.clip_coef,
-                            args.clip_coef,
-                        )
-                        v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
-                        v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
-                        v_loss = 0.5 * v_loss_max.mean()
-                    else:
-                        v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
-
-                    # total loss
-                    entropy_loss = dis_entropy.mean() + con_entropy.mean()
-                    loss = (
-                        dis_pg_loss * args.policy_coef
-                        + con_pg_loss * args.policy_coef
-                        - entropy_loss * args.ent_coef
-                        + v_loss * args.critic_coef
-                    )
-
-                    optimizer.zero_grad()
-                    loss.backward()
-                    # Clips gradient norm of an iterable of parameters.
-                    nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
-                    optimizer.step()
-
-                    """
-                    if args.target_kl is not None:
-                        if approx_kl > args.target_kl:
-                            break
-                    """
-            # record rewards for plotting purposes
-            rewardsMean = np.mean(rewards.to("cpu").detach().numpy().copy())
-            writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
-            writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
-            writer.add_scalar("losses/dis_policy_loss", dis_pg_loss.item(), global_step)
-            writer.add_scalar("losses/con_policy_loss", con_pg_loss.item(), global_step)
-            writer.add_scalar("losses/total_loss", loss.item(), global_step)
-            writer.add_scalar("losses/entropy_loss", entropy_loss.item(), global_step)
-            # writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
-            # writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
-            # writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
-            # print("SPS:", int(global_step / (time.time() - start_time)))
-            print("episode over mean reward:", rewardsMean)
-            writer.add_scalar(
-                "charts/SPS", int(global_step / (time.time() - start_time)), global_step
-            )
-            writer.add_scalar("charts/Reward", rewardsMean, global_step)
-            writer.add_scalar("charts/GoWinRatio", WinRounds["Go"]/TotalRounds["Go"], global_step)
-            writer.add_scalar("charts/AttackWinRatio", WinRounds["Attack"]/TotalRounds["Attack"], global_step)
-            writer.add_scalar("charts/FreeWinRatio", WinRounds["Free"]/TotalRounds["Free"], global_step)
-            if rewardsMean > bestReward:
-                bestReward = rewardsMean
-                saveDir = "../PPO-Model/Target-700-500-256-hybrid-" + str(rewardsMean) + ".pt"
+                # record rewards for plotting purposes
+                writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT])
+                print(f"episode over Target{targetName} mean reward:", targetRewardMean)
+            TotalRewardMean = np.mean(meanRewardList)
+            writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
+            writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps)
+            # New Record!
+            if TotalRewardMean > bestReward:
+                bestReward = TotalRewardMean
+                saveDir = "../PPO-Model/Hybrid-MNN-500-300" + str(TotalRewardMean) + ".pt"
                 torch.save(agent, saveDir)
+            saveDir = "../PPO-Model/Hybrid-MNN-500-300-Last" + ".pt"
+            torch.save(agent, saveDir)
 
     env.close()
     writer.close()
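Below is a minimal, self-contained sketch of the storage scheme this commit describes: one training buffer per target type, and a PPO update run only for those targets whose buffer has reached the dataset-size threshold. The names TargetBuffers, add_round, ready_targets and pop are invented for illustration and do not exist in MultiNN-PPO.py, and only observations and returns are buffered to keep the sketch short. In the patch itself the same idea is implemented with per-target tensor lists indexed by roundTargetType / thisT and a trainQueue of targets that reached args.datasetSize.

# sketch_per_target_buffers.py -- illustrative only, not part of the repository
from typing import List, Tuple

import torch


class TargetBuffers:
    def __init__(self, num_targets: int):
        # one list of finished-round tensors per target type (Free/Go/Attack/Defence)
        self._obs: List[List[torch.Tensor]] = [[] for _ in range(num_targets)]
        self._returns: List[List[torch.Tensor]] = [[] for _ in range(num_targets)]

    def add_round(self, target: int, obs: torch.Tensor, returns: torch.Tensor) -> None:
        # a finished round is appended only to the buffer of its own target type
        self._obs[target].append(obs)
        self._returns[target].append(returns)

    def size(self, target: int) -> int:
        # number of transitions collected so far for this target
        return sum(t.size(0) for t in self._obs[target])

    def ready_targets(self, dataset_size: int) -> List[int]:
        # targets that have collected enough samples for one training pass
        return [t for t in range(len(self._obs)) if self.size(t) >= dataset_size]

    def pop(self, target: int) -> Tuple[torch.Tensor, torch.Tensor]:
        # hand the whole single-target dataset to the trainer, then reset that buffer
        obs = torch.cat(self._obs[target], dim=0)
        returns = torch.cat(self._returns[target], dim=0)
        self._obs[target], self._returns[target] = [], []
        return obs, returns


if __name__ == "__main__":
    buffers = TargetBuffers(num_targets=4)
    # pretend two rounds of the "Go" target (index 1) just finished
    buffers.add_round(1, obs=torch.randn(6, 3), returns=torch.randn(6))
    buffers.add_round(1, obs=torch.randn(4, 3), returns=torch.randn(4))
    for t in buffers.ready_targets(dataset_size=8):
        obs_t, ret_t = buffers.pop(t)
        print(f"update network for target {t} on {obs_t.size(0)} samples")

Because each update consumes only one target's data and then clears that buffer, the other targets keep accumulating rollouts undisturbed, which is the source of the speed-up claimed in the commit message.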