From 52ccce88bc5f11fe093a2e3e1211f8994b6b67e6 Mon Sep 17 00:00:00 2001
From: Koha9
Date: Fri, 4 Aug 2023 03:49:49 +0900
Subject: [PATCH] Fix small bug in the prediction function, normalize naming
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix get_actions_value: when not in training mode it still sampled from the
action distributions instead of taking the best action.
Normalize naming.
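
For reference, a minimal illustrative sketch (not part of the diff; the helper
and its names are hypothetical) of the evaluation-mode selection this fix
restores: each discrete branch takes the argmax of its logits and the
continuous head uses the distribution mean, while training mode keeps sampling:

    import torch
    from torch.distributions import Categorical, Normal

    # Sketch only: mirrors the patched branch of get_actions_value as a
    # standalone helper, it is not code from this repository.
    def select_actions(split_logits, actions_mean, actions_logstd, deterministic):
        # split_logits: one [batch, n_choices] logits tensor per discrete branch
        # actions_mean / actions_logstd: [batch, continuous_size]
        con_probs = Normal(actions_mean, actions_logstd.exp())
        if deterministic:
            # evaluation: most probable discrete action, mean of the continuous head
            dis_act = torch.stack([logits.argmax(dim=1) for logits in split_logits])
            con_act = actions_mean
        else:
            # training: sample both heads to keep exploration
            dis_act = torch.stack([Categorical(logits=l).sample() for l in split_logits])
            con_act = con_probs.sample()
        return torch.cat([dis_act.T, con_act], dim=1)
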
---
 .../Pytorch/.idea/dictionaries/UCUNI.xml      |   3 +
 Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py      |   7 +-
 Aimbot-PPO-Python/Pytorch/ppoagent.py         | 136 +++++++++---------
 3 files changed, 69 insertions(+), 77 deletions(-)

diff --git a/Aimbot-PPO-Python/Pytorch/.idea/dictionaries/UCUNI.xml b/Aimbot-PPO-Python/Pytorch/.idea/dictionaries/UCUNI.xml
index 0a09ad1..84600fd 100644
--- a/Aimbot-PPO-Python/Pytorch/.idea/dictionaries/UCUNI.xml
+++ b/Aimbot-PPO-Python/Pytorch/.idea/dictionaries/UCUNI.xml
@@ -2,6 +2,9 @@
   <dictionary name="UCUNI">
     <words>
       <w>aimbot</w>
+      <w>logprobs</w>
+      <w>logstd</w>
+      <w>unclipped</w>
     </words>
   </dictionary>
 </component>
\ No newline at end of file
diff --git a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
index d9f8c5e..4664a7b 100644
--- a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
+++ b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
@@ -20,9 +20,6 @@ SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
 GAME_NAME = "Aimbot_Hybrid_V3"
 GAME_TYPE = "Mix_Verification"
 
-# !!!SPECIAL PARAMETERS!!!
-using_targets_num = 3
-
 if __name__ == "__main__":
     args = parse_args()
     random.seed(args.seed)
@@ -61,7 +58,6 @@ if __name__ == "__main__":
     run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
     wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
 
-
     @atexit.register
     def save_model():
         # close env
@@ -72,9 +68,8 @@ if __name__ == "__main__":
         torch.save(agent, save_dir)
         print("save model to " + save_dir)
 
-
     # start the game
-    total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
+    total_update_step = args.target_num * args.total_timesteps // args.datasetSize
     target_steps = [0 for i in range(args.target_num)]
     start_time = time.time()
     state, _, done = env.reset()
diff --git a/Aimbot-PPO-Python/Pytorch/ppoagent.py b/Aimbot-PPO-Python/Pytorch/ppoagent.py
index dcf405d..50ab701 100644
--- a/Aimbot-PPO-Python/Pytorch/ppoagent.py
+++ b/Aimbot-PPO-Python/Pytorch/ppoagent.py
@@ -17,10 +17,10 @@ def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
 
 class PPOAgent(nn.Module):
     def __init__(
-        self,
-        env: Aimbot,
-        this_args:argparse.Namespace,
-        device: torch.device,
+            self,
+            env: Aimbot,
+            this_args: argparse.Namespace,
+            device: torch.device,
     ):
         super(PPOAgent, self).__init__()
         self.device = device
@@ -38,7 +38,7 @@
         self.ray_state_size = env.unity_observation_shape[0] - self.args.total_target_size
         self.state_size_without_ray = self.args.total_target_size
         self.head_input_size = (
-            env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size
+                env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size
         )  # except target state input
         self.unity_discrete_type = env.unity_discrete_type
 
@@ -65,9 +65,6 @@ class PPOAgent(nn.Module):
         self.actor_mean = nn.ModuleList(
             [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(self.target_num)]
         )
-        # self.actor_logstd =
-        # nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
-        # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
         self.actor_logstd = nn.ParameterList(
             [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
         )  # nn.Parameter(torch.zeros(1, self.continuous_size))
@@ -78,7 +75,7 @@
     def get_value(self, state: torch.Tensor):
         target = state[:, 0].to(torch.int32)  # int
         this_state_num = target.size()[0]
-        view_input = state[:, -self.ray_state_size :]  # all ray input
+        view_input = state[:, -self.ray_state_size:]  # all ray input
         target_input = state[:, : self.state_size_without_ray]
         view_layer = self.view_network(view_input)
         target_layer = torch.stack(
@@ -96,7 +93,7 @@
     def get_actions_value(self, state: torch.Tensor, actions=None):
         target = state[:, 0].to(torch.int32)  # int
         this_state_num = target.size()[0]
-        view_input = state[:, -self.ray_state_size :]  # all ray input
+        view_input = state[:, -self.ray_state_size:]  # all ray input
         target_input = state[:, : self.state_size_without_ray]
         view_layer = self.view_network(view_input)
         target_layer = torch.stack(
@@ -118,8 +115,6 @@
         actions_mean = torch.stack(
             [self.actor_mean[target[i]](middle_layer[i]) for i in range(this_state_num)]
         )  # self.actor_mean(hidden)
-        # action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)])  # self.actor_logstd(hidden)
-        # action_logstd = self.actor_logstd.expand_as(actions_mean)  # self.actor_logstd.expand_as(actions_mean)
         action_logstd = torch.stack(
             [torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(this_state_num)]
         )
@@ -134,32 +129,31 @@
         if actions is None:
             if self.train_agent:
                 # select actions base on probability distribution model
-                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
-                conAct = con_probs.sample()
-                actions = torch.cat([disAct.T, conAct], dim=1)
+                dis_act = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
+                con_act = con_probs.sample()
+                actions = torch.cat([dis_act.T, con_act], dim=1)
             else:
                 # select actions base on best probability distribution
-                # disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
-                conAct = actions_mean
-                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
-                conAct = con_probs.sample()
-                actions = torch.cat([disAct.T, conAct], dim=1)
+                dis_act = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
+                con_act = actions_mean
+                actions = torch.cat([dis_act.T, con_act], dim=1)
         else:
-            disAct = actions[:, 0 : self.unity_discrete_type].T
-            conAct = actions[:, self.unity_discrete_type :]
+            dis_act = actions[:, 0: self.unity_discrete_type].T
+            con_act = actions[:, self.unity_discrete_type:]
         dis_log_prob = torch.stack(
-            [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
+            [ctgr.log_prob(act) for act, ctgr in zip(dis_act, multi_categoricals)]
         )
         dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
         return (
             actions,
             dis_log_prob.sum(0),
             dis_entropy.sum(0),
-            con_probs.log_prob(conAct).sum(1),
+            con_probs.log_prob(con_act).sum(1),
             con_probs.entropy().sum(1),
             criticV,
         )
-    def train_net(self, this_train_ind:int,ppo_memories,optimizer) -> tuple:
+
+    def train_net(self, this_train_ind: int, ppo_memories, optimizer) -> tuple:
         start_time = time.time()
         # flatten the batch
         b_obs = ppo_memories.obs[this_train_ind].reshape((-1,) + self.unity_observation_shape)
@@ -171,24 +165,24 @@
         b_values = ppo_memories.values[this_train_ind].reshape(-1)
         b_size = b_obs.size()[0]
         # optimizing the policy and value network
-        b_inds = np.arange(b_size)
-
+        b_index = np.arange(b_size)
+
         for epoch in range(self.args.epochs):
-            print("epoch:",epoch,end="")
+            print("epoch:", epoch, end="")
             # shuffle all datasets
-            np.random.shuffle(b_inds)
+            np.random.shuffle(b_index)
             for start in range(0, b_size, self.args.minibatchSize):
-                print(".",end="")
+                print(".", end="")
                 end = start + self.args.minibatchSize
-                mb_inds = b_inds[start:end]
-                if(np.size(mb_inds)<=1):
+                mb_index = b_index[start:end]
+                if np.size(mb_index) <= 1:
                     break
 
-                mb_advantages = b_advantages[mb_inds]
+                mb_advantages = b_advantages[mb_index]
                 # normalize advantages
                 if self.args.norm_adv:
                     mb_advantages = (mb_advantages - mb_advantages.mean()) / (
-                        mb_advantages.std() + 1e-8
+                            mb_advantages.std() + 1e-8
                     )
 
                 (
@@ -197,14 +191,14 @@
                     new_dis_logprob,
                     dis_entropy,
                     new_con_logprob,
                     con_entropy,
-                    newvalue,
-                ) = self.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
+                    new_value,
+                ) = self.get_actions_value(b_obs[mb_index], b_actions[mb_index])
                 # discrete ratio
-                dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
-                dis_ratio = dis_logratio.exp()
+                dis_log_ratio = new_dis_logprob - b_dis_logprobs[mb_index]
+                dis_ratio = dis_log_ratio.exp()
                 # continuous ratio
-                con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
-                con_ratio = con_logratio.exp()
+                con_log_ratio = new_con_logprob - b_con_logprobs[mb_index]
+                con_ratio = con_log_ratio.exp()
                 """
                 # early stop
@@ -229,38 +223,38 @@
                 con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
 
                 # Value loss
-                newvalue = newvalue.view(-1)
+                new_value = new_value.view(-1)
                 if self.args.clip_vloss:
-                    v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
-                    v_clipped = b_values[mb_inds] + torch.clamp(
-                        newvalue - b_values[mb_inds],
+                    v_loss_unclipped = (new_value - b_returns[mb_index]) ** 2
+                    v_clipped = b_values[mb_index] + torch.clamp(
+                        new_value - b_values[mb_index],
                         -self.args.clip_coef,
                         self.args.clip_coef,
                     )
-                    v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
+                    v_loss_clipped = (v_clipped - b_returns[mb_index]) ** 2
                     v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                     v_loss = 0.5 * v_loss_max.mean()
                 else:
-                    v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
+                    v_loss = 0.5 * ((new_value - b_returns[mb_index]) ** 2).mean()
 
                 # total loss
                 entropy_loss = dis_entropy.mean() + con_entropy.mean()
                 loss = (
-                    dis_pg_loss * self.args.policy_coef[this_train_ind]
-                    + con_pg_loss * self.args.policy_coef[this_train_ind]
-                    + entropy_loss * self.args.entropy_coef[this_train_ind]
-                    + v_loss * self.args.critic_coef[this_train_ind]
-                )*self.args.loss_coef[this_train_ind]
+                        dis_pg_loss * self.args.policy_coef[this_train_ind]
+                        + con_pg_loss * self.args.policy_coef[this_train_ind]
+                        + entropy_loss * self.args.entropy_coef[this_train_ind]
+                        + v_loss * self.args.critic_coef[this_train_ind]
+                ) * self.args.loss_coef[this_train_ind]
 
-                if(torch.isnan(loss).any()):
+                if torch.isnan(loss).any():
                     print("LOSS Include NAN!!!")
-                    if(torch.isnan(dis_pg_loss.any())):
+                    if torch.isnan(dis_pg_loss.any()):
                         print("dis_pg_loss include nan")
-                    if(torch.isnan(con_pg_loss.any())):
+                    if torch.isnan(con_pg_loss.any()):
                         print("con_pg_loss include nan")
-                    if(torch.isnan(entropy_loss.any())):
+                    if torch.isnan(entropy_loss.any()):
                         print("entropy_loss include nan")
-                    if(torch.isnan(v_loss.any())):
+                    if torch.isnan(v_loss.any()):
                         print("v_loss include nan")
                     raise
 
@@ -275,15 +269,15 @@
             if approx_kl > args.target_kl:
                 break
             """
-        return (v_loss,dis_pg_loss,con_pg_loss,loss,entropy_loss)
+        return v_loss, dis_pg_loss, con_pg_loss, loss, entropy_loss
 
     def gae(
-        self,
-        rewards: torch.Tensor,
-        dones: torch.Tensor,
-        values: torch.tensor,
-        next_obs: torch.tensor,
-        next_done: torch.Tensor,
+            self,
+            rewards: torch.Tensor,
+            dones: torch.Tensor,
+            values: torch.tensor,
+            next_obs: torch.tensor,
+            next_done: torch.Tensor,
     ) -> tuple:
         # GAE
         with torch.no_grad():
@@ -294,25 +288,25 @@
                 last_gae_lam = 0
                 for t in reversed(range(data_size)):
                     if t == data_size - 1:
-                        nextnonterminal = 1.0 - next_done
+                        next_non_terminal = 1.0 - next_done
                         next_values = next_value
                     else:
-                        nextnonterminal = 1.0 - dones[t + 1]
+                        next_non_terminal = 1.0 - dones[t + 1]
                        next_values = values[t + 1]
-                    delta = rewards[t] + self.args.gamma * next_values * nextnonterminal - values[t]
+                    delta = rewards[t] + self.args.gamma * next_values * next_non_terminal - values[t]
                     advantages[t] = last_gae_lam = (
-                        delta + self.args.gamma * self.args.gaeLambda * nextnonterminal * last_gae_lam
+                            delta + self.args.gamma * self.args.gaeLambda * next_non_terminal * last_gae_lam
                     )
                 returns = advantages + values
             else:
                 returns = torch.zeros_like(rewards).to(self.device)
                 for t in reversed(range(data_size)):
                     if t == data_size - 1:
-                        nextnonterminal = 1.0 - next_done
+                        next_non_terminal = 1.0 - next_done
                         next_return = next_value
                     else:
-                        nextnonterminal = 1.0 - dones[t + 1]
+                        next_non_terminal = 1.0 - dones[t + 1]
                         next_return = returns[t + 1]
-                    returns[t] = rewards[t] + self.args.gamma * nextnonterminal * next_return
+                    returns[t] = rewards[t] + self.args.gamma * next_non_terminal * next_return
             advantages = returns - values
-        return advantages, returns
\ No newline at end of file
+        return advantages, returns