From 474032d1e8a87f228f6ef463bf5c6483f23263fb Mon Sep 17 00:00:00 2001
From: Koha9
Date: Thu, 3 Nov 2022 07:16:18 +0900
Subject: [PATCH] hybrid dis-con action, save-load, convergence observed

Add discrete and continuous actions in the same NN model, plus model save
and load. Reward is increasing and convergence was observed. These two
models look good:
Aimbot_9331_1667423213_hybrid_train2
Aimbot_9331_1667389873_hybrid
---
 Aimbot-PPO-Python/Pytorch/AimbotEnv.py   |  27 ++--
 Aimbot-PPO-Python/Pytorch/ppo.py         | 171 ++++++++++++++++-------
 Aimbot-PPO-Python/Pytorch/testarea.ipynb | 128 +++++++++++++++++
 3 files changed, 264 insertions(+), 62 deletions(-)

diff --git a/Aimbot-PPO-Python/Pytorch/AimbotEnv.py b/Aimbot-PPO-Python/Pytorch/AimbotEnv.py
index ae1384b..7c8466d 100644
--- a/Aimbot-PPO-Python/Pytorch/AimbotEnv.py
+++ b/Aimbot-PPO-Python/Pytorch/AimbotEnv.py
@@ -48,6 +48,11 @@ class Aimbot(gym.Env):
         self.unity_discrete_type = self.unity_action_spec.discrete_size
         # environment discrete action type. int 3+3+2=8
         self.unity_discrete_size = sum(self.unity_discrete_branches)
+        # environment total action size. int 3+2=5
+        self.unity_action_size = self.unity_discrete_type + self.unity_continuous_size
+        # ActionExistBool
+        self.unity_dis_act_exist = self.unity_discrete_type != 0
+        self.unity_con_act_exist = self.unity_continuous_size != 0

         # AGENT SPECS
         # all agents ID
@@ -85,21 +90,23 @@
         """
         # take action to enviroment
         # return mextState,reward,done
-        if self.unity_discrete_size == 0:
+        # discrete action
+        if self.unity_dis_act_exist:
+            # create discrete action from actions list
+            discreteActions = actions[:, 0 : self.unity_discrete_type]
+        else:
             # create empty discrete action
             discreteActions = np.asarray([[0]])
+        # continuous action
+        if self.unity_con_act_exist:
+            # create continuous actions from actions list
+            continuousActions = actions[:, self.unity_discrete_type :]
         else:
-            # create discrete action from actions list
-            discreteActions = actions[:, 0 : self.unity_discrete_size]
-            """
-            if self.unity_continuous_size == 0:
             # create empty continuous action
             continuousActions = np.asanyarray([[0.0]])
-            else:
-                # create continuous actions from actions list
-                continuousActions = actions[:,self.unity_discrete_size :]
-            """
-        continuousActions = np.asanyarray([[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]])
+
+        # Dummy continuous action
+        # continuousActions = np.asanyarray([[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]])
         # create actionTuple
         thisActionTuple = ActionTuple(continuous=continuousActions, discrete=discreteActions)
         # take action to env
diff --git a/Aimbot-PPO-Python/Pytorch/ppo.py b/Aimbot-PPO-Python/Pytorch/ppo.py
index f01ca62..411b294 100644
--- a/Aimbot-PPO-Python/Pytorch/ppo.py
+++ b/Aimbot-PPO-Python/Pytorch/ppo.py
@@ -20,12 +20,12 @@
 WORKER_ID = 1
 BASE_PORT = 2002

-LEARNING_RATE = 2e-3
+LEARNING_RATE = 7e-4
 GAMMA = 0.99
 GAE_LAMBDA = 0.95
 TOTAL_STEPS = 2000000
-STEP_NUM = 128
-MINIBATCH_NUM = 4
+STEP_NUM = 256
+MINIBATCH_NUM = 1
 EPOCHS = 4
 CLIP_COEF = 0.1
 ENTROPY_COEF = 0.01
@@ -35,9 +35,13 @@
 ANNEAL_LEARNING_RATE = True
 CLIP_VLOSS = True
 NORM_ADV = True
+WANDB_TACK = True
+LOAD_DIR = "../PPO-Model/SmallArea-256-128-hybrid.pt"
+

 def parse_args():
     # fmt: off
+    # pytorch and environment parameters
     parser = argparse.ArgumentParser()
     parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
         help="seed of the experiment")
@@ -54,6 +58,7 @@ def parse_args():
     parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
         help="total timesteps of the
experiments") + # model parameters parser.add_argument("--stepNum", type=int, default=STEP_NUM, help="the number of steps to run in each environment per policy rollout") parser.add_argument("--minibatchesNum", type=int, default=MINIBATCH_NUM, @@ -62,8 +67,13 @@ def parse_args(): help="the K epochs to update the policy") parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True, help="Toggle learning rate annealing for policy and value networks") - parser.add_argument("--wandb-entity", type=str, default=None, + parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True, + help="track on the wandb") + parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY, help="the entity (team) of wandb's project") + parser.add_argument("--load-dir", type=str, default=LOAD_DIR, + help="load model directory") + # GAE parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, help="Use GAE for advantage computation") @@ -101,16 +111,17 @@ class PPOAgent(nn.Module): super(PPOAgent, self).__init__() self.discrete_size = env.unity_discrete_size self.discrete_shape = list(env.unity_discrete_branches) + self.continuous_size = env.unity_continuous_size self.network = nn.Sequential( - layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 128)), - nn.Tanh(), - layer_init(nn.Linear(128, 128)), + layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 256)), nn.ReLU(), - layer_init(nn.Linear(128, 128)), + layer_init(nn.Linear(256, 128)), nn.ReLU(), ) - self.dis_Actor = layer_init(nn.Linear(128, self.discrete_size), std=0.01) + self.actor_dis = layer_init(nn.Linear(128, self.discrete_size), std=0.01) + self.actor_mean = layer_init(nn.Linear(128, self.continuous_size), std=0.01) + self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size)) self.critic = layer_init(nn.Linear(128, 1), std=1) def get_value(self, state: torch.Tensor): @@ -118,16 +129,35 @@ class PPOAgent(nn.Module): def get_actions_value(self, state: torch.Tensor, actions=None): hidden = self.network(state) - dis_logits = self.dis_Actor(hidden) + # discrete + dis_logits = self.actor_dis(hidden) split_logits = torch.split(dis_logits, self.discrete_shape, dim=1) multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits] + # continuous + actions_mean = self.actor_mean(hidden) + action_logstd = self.actor_logstd.expand_as(actions_mean) + action_std = torch.exp(action_logstd) + con_probs = Normal(actions_mean, action_std) + if actions is None: - actions = torch.stack([ctgr.sample() for ctgr in multi_categoricals]) - log_prob = torch.stack( - [ctgr.log_prob(act) for act, ctgr in zip(actions, multi_categoricals)] + disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals]) + conAct = con_probs.sample() + actions = torch.cat([disAct.T, conAct], dim=1) + else: + disAct = actions[:, 0 : env.unity_discrete_type].T + conAct = actions[:, env.unity_discrete_type :] + dis_log_prob = torch.stack( + [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)] + ) + dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals]) + return ( + actions, + dis_log_prob.sum(0), + dis_entropy.sum(0), + con_probs.log_prob(conAct).sum(1), + con_probs.entropy().sum(1), + self.critic(hidden), ) - entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals]) - return actions.T, log_prob.sum(0), entropy.sum(0), 
self.critic(hidden) if __name__ == "__main__": @@ -140,21 +170,28 @@ if __name__ == "__main__": # Initialize environment anget optimizer env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport) - agent = PPOAgent(env).to(device) + if args.load_dir is None: + agent = PPOAgent(env).to(device) + else: + agent = torch.load(args.load_dir) + print("Load Agent", args.load_dir) + print(agent.eval()) + optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5) # Tensorboard and WandB Recorder game_name = "Aimbot" run_name = f"{game_name}__{args.seed}__{int(time.time())}" - wandb.init( - project=run_name, - entity=args.wandb_entity, - sync_tensorboard=True, - config=vars(args), - name=run_name, - monitor_gym=True, - save_code=True, - ) + if args.wandb_track: + wandb.init( + project=run_name, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args), + name=run_name, + monitor_gym=True, + save_code=True, + ) writer = SummaryWriter(f"runs/{run_name}") writer.add_text( @@ -165,10 +202,9 @@ if __name__ == "__main__": # Memory Record obs = torch.zeros((args.stepNum, env.unity_agent_num) + env.unity_observation_shape).to(device) - actions = torch.zeros((args.stepNum, env.unity_agent_num) + (env.unity_discrete_type,)).to( - device - ) - logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device) + actions = torch.zeros((args.stepNum, env.unity_agent_num) + (env.unity_action_size,)).to(device) + dis_logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device) + con_logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device) rewards = torch.zeros((args.stepNum, env.unity_agent_num)).to(device) dones = torch.zeros((args.stepNum, env.unity_agent_num)).to(device) values = torch.zeros((args.stepNum, env.unity_agent_num)).to(device) @@ -198,13 +234,14 @@ if __name__ == "__main__": with torch.no_grad(): # predict actions - action, logprob, _, value = agent.get_actions_value(next_obs) + action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(next_obs) value = value.flatten() next_obs, reward, done = env.step(action.cpu().numpy()) # save memories actions[step] = action - logprobs[step] = logprob + dis_logprobs[step] = dis_logprob + con_logprobs[step] = con_logprob values[step] = value rewards[step] = torch.tensor(reward).to(device).view(-1) next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device) @@ -241,15 +278,16 @@ if __name__ == "__main__": # flatten the batch b_obs = obs.reshape((-1,) + env.unity_observation_shape) - b_logprobs = logprobs.reshape(-1) - b_actions = actions.reshape((-1,) + (env.unity_discrete_type,)) + b_dis_logprobs = dis_logprobs.reshape(-1) + b_con_logprobs = con_logprobs.reshape(-1) + b_actions = actions.reshape((-1,) + (env.unity_action_size,)) b_advantages = advantages.reshape(-1) b_returns = returns.reshape(-1) b_values = values.reshape(-1) # Optimizing the policy and value network b_inds = np.arange(args.batch_size) - clipfracs = [] + #clipfracs = [] for epoch in range(args.epochs): # shuffle all datasets np.random.shuffle(b_inds) @@ -264,26 +302,42 @@ if __name__ == "__main__": mb_advantages.std() + 1e-8 ) - # ratio - _, newlogprob, entropy, newvalue = agent.get_actions_value( - b_obs[mb_inds], b_actions.long()[mb_inds].T - ) - logratio = newlogprob - b_logprobs[mb_inds] - ratio = logratio.exp() + ( + _, + new_dis_logprob, + dis_entropy, + new_con_logprob, + con_entropy, + newvalue, + ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds]) + # discrete ratio + 
dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds] + dis_ratio = dis_logratio.exp() + # continuous ratio + con_logratio = new_con_logprob - b_con_logprobs[mb_inds] + con_ratio = con_logratio.exp() + """ # early stop with torch.no_grad(): # calculate approx_kl http://joschu.net/blog/kl-approx.html old_approx_kl = (-logratio).mean() approx_kl = ((ratio - 1) - logratio).mean() clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()] + """ - # Policy loss - pg_loss1 = -mb_advantages * ratio - pg_loss2 = -mb_advantages * torch.clamp( - ratio, 1 - args.clip_coef, 1 + args.clip_coef + # discrete Policy loss + dis_pg_loss_orig = -mb_advantages * dis_ratio + dis_pg_loss_clip = -mb_advantages * torch.clamp( + dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef ) - pg_loss = torch.max(pg_loss1, pg_loss2).mean() + dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean() + # continuous Policy loss + con_pg_loss_orig = -mb_advantages * con_ratio + con_pg_loss_clip = -mb_advantages * torch.clamp( + con_ratio, 1 - args.clip_coef, 1 + args.clip_coef + ) + con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean() # Value loss newvalue = newvalue.view(-1) @@ -300,8 +354,14 @@ if __name__ == "__main__": else: v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean() - entropy_loss = entropy.mean() - loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.critic_coef + # total loss + entropy_loss = dis_entropy.mean() + con_entropy.mean() + loss = ( + dis_pg_loss + + con_pg_loss + - entropy_loss * args.ent_coef + + v_loss * args.critic_coef + ) optimizer.zero_grad() loss.backward() @@ -309,19 +369,26 @@ if __name__ == "__main__": nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm) optimizer.step() + """ if args.target_kl is not None: if approx_kl > args.target_kl: break + """ # record rewards for plotting purposes writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step) writer.add_scalar("losses/value_loss", v_loss.item(), global_step) - writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step) - writer.add_scalar("losses/entropy", entropy_loss.item(), global_step) - writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step) - writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step) - writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step) + writer.add_scalar("losses/dis_policy_loss", dis_pg_loss.item(), global_step) + writer.add_scalar("losses/con_policy_loss", con_pg_loss.item(), global_step) + writer.add_scalar("losses/total_loss", loss.item(), global_step) + writer.add_scalar("losses/entropy_loss", entropy_loss.item(), global_step) + # writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step) + # writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step) + #writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step) print("SPS:", int(global_step / (time.time() - start_time))) writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + writer.add_scalar( + "charts/Reward", np.mean(rewards.to("cpu").detach().numpy().copy()), global_step + ) env.close() writer.close() diff --git a/Aimbot-PPO-Python/Pytorch/testarea.ipynb b/Aimbot-PPO-Python/Pytorch/testarea.ipynb index 44a1f64..8801206 100644 --- a/Aimbot-PPO-Python/Pytorch/testarea.ipynb +++ b/Aimbot-PPO-Python/Pytorch/testarea.ipynb @@ -303,6 +303,134 @@ "(128, 4) + env.unity_observation_shape\n", "env.reset()" ] + }, + { + "cell_type": 
"code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[1, 2, 3],\n", + " [1, 2, 3],\n", + " [1, 2, 3],\n", + " [1, 2, 3]], device='cuda:0')\n", + "tensor([[1],\n", + " [2],\n", + " [3],\n", + " [4]], device='cuda:0')\n" + ] + }, + { + "data": { + "text/plain": [ + "tensor([[1, 2, 3, 1],\n", + " [1, 2, 3, 2],\n", + " [1, 2, 3, 3],\n", + " [1, 2, 3, 4]], device='cuda:0')" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch\n", + "aa = torch.tensor([[1,2,3],[1,2,3],[1,2,3],[1,2,3]]).to(\"cuda:0\")\n", + "bb = torch.tensor([[1],[2],[3],[4]]).to(\"cuda:0\")\n", + "print(aa)\n", + "print(bb)\n", + "torch.cat([aa,bb],axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "Can't get attribute 'PPOAgent' on ", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_31348\\1930153251.py\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mmymodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"../PPO-Model/SmallArea-256-128-hybrid.pt\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mmymodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0meval\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\torch\\serialization.py\u001b[0m in \u001b[0;36mload\u001b[1;34m(f, map_location, pickle_module, **pickle_load_args)\u001b[0m\n\u001b[0;32m 710\u001b[0m \u001b[0mopened_file\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mseek\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0morig_position\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 711\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopened_file\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 712\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_load\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopened_zipfile\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmap_location\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpickle_module\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mpickle_load_args\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 713\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0m_legacy_load\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopened_file\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmap_location\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpickle_module\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mpickle_load_args\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 714\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + 
"\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\torch\\serialization.py\u001b[0m in \u001b[0;36m_load\u001b[1;34m(zip_file, map_location, pickle_module, pickle_file, **pickle_load_args)\u001b[0m\n\u001b[0;32m 1047\u001b[0m \u001b[0munpickler\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mUnpicklerWrapper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_file\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mpickle_load_args\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1048\u001b[0m \u001b[0munpickler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpersistent_load\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpersistent_load\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1049\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0munpickler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1050\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1051\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_utils\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_loaded_sparse_tensors\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\torch\\serialization.py\u001b[0m in \u001b[0;36mfind_class\u001b[1;34m(self, mod_name, name)\u001b[0m\n\u001b[0;32m 1040\u001b[0m \u001b[1;32mpass\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1041\u001b[0m \u001b[0mmod_name\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mload_module_mapping\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmod_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmod_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1042\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmod_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1043\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1044\u001b[0m \u001b[1;31m# Load the data (which may in turn use `persistent_load` to load tensors)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mAttributeError\u001b[0m: Can't get attribute 'PPOAgent' on " + ] + } + ], + "source": [ + "import torch\n", + "\n", + "def layer_init(layer, std=np.sqrt(2), bias_const=0.0):\n", + " torch.nn.init.orthogonal_(layer.weight, std)\n", + " torch.nn.init.constant_(layer.bias, bias_const)\n", + " return layer\n", + "\n", + "class PPOAgent(nn.Module):\n", + " def __init__(self, env: Aimbot):\n", + " super(PPOAgent, self).__init__()\n", + " self.discrete_size = env.unity_discrete_size\n", + " self.discrete_shape = list(env.unity_discrete_branches)\n", + " self.continuous_size = env.unity_continuous_size\n", + "\n", + " self.network = nn.Sequential(\n", + " layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 256)),\n", + " nn.ReLU(),\n", + " layer_init(nn.Linear(256, 128)),\n", + " nn.ReLU(),\n", + " )\n", + " self.actor_dis = layer_init(nn.Linear(128, self.discrete_size), std=0.01)\n", + " self.actor_mean = layer_init(nn.Linear(128, 
self.continuous_size), std=0.01)\n", + " self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))\n", + " self.critic = layer_init(nn.Linear(128, 1), std=1)\n", + "\n", + " def get_value(self, state: torch.Tensor):\n", + " return self.critic(self.network(state))\n", + "\n", + " def get_actions_value(self, state: torch.Tensor, actions=None):\n", + " hidden = self.network(state)\n", + " # discrete\n", + " dis_logits = self.actor_dis(hidden)\n", + " split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)\n", + " multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]\n", + " # continuous\n", + " actions_mean = self.actor_mean(hidden)\n", + " action_logstd = self.actor_logstd.expand_as(actions_mean)\n", + " action_std = torch.exp(action_logstd)\n", + " con_probs = Normal(actions_mean, action_std)\n", + "\n", + " if actions is None:\n", + " disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])\n", + " conAct = con_probs.sample()\n", + " actions = torch.cat([disAct.T, conAct], dim=1)\n", + " else:\n", + " disAct = actions[:, 0 : env.unity_discrete_type].T\n", + " conAct = actions[:, env.unity_discrete_type :]\n", + " dis_log_prob = torch.stack(\n", + " [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]\n", + " )\n", + " dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])\n", + " return (\n", + " actions,\n", + " dis_log_prob.sum(0),\n", + " dis_entropy.sum(0),\n", + " con_probs.log_prob(conAct).sum(1),\n", + " con_probs.entropy().sum(1),\n", + " self.critic(hidden),\n", + " )\n", + "\n", + "\n", + "mymodel = torch.load(\"../PPO-Model/SmallArea-256-128-hybrid.pt\")\n", + "mymodel.eval()" + ] } ], "metadata": {