import argparse
import time
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim

from AimbotEnv import Aimbot
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from distutils.util import strtobool
from torch.utils.tensorboard import SummaryWriter

DEFAULT_SEED = 9331
ENV_PATH = "../Build-ParallelEnv/Aimbot-ParallelEnv"
WORKER_ID = 1
BASE_PORT = 2002

LEARNING_RATE = 2e-3
GAMMA = 0.99
GAE_LAMBDA = 0.95
TOTAL_STEPS = 2000000
STEP_NUM = 128
MINIBATCH_NUM = 4
EPOCHS = 4
CLIP_COEF = 0.1
ENTROPY_COEF = 0.01
CRITIC_COEF = 0.5

ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = True


def parse_args():
    # fmt: off
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
        help="seed of the experiment")
    parser.add_argument("--path", type=str, default=ENV_PATH,
        help="environment path")
    parser.add_argument("--workerID", type=int, default=WORKER_ID,
        help="unity worker ID")
    parser.add_argument("--baseport", type=int, default=BASE_PORT,
        help="port to connect to Unity environment")
    parser.add_argument("--lr", type=float, default=LEARNING_RATE,
        help="the learning rate of the optimizer")
    parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="if toggled, cuda will be enabled by default")
    parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
        help="total timesteps of the experiments")
    parser.add_argument("--stepNum", type=int, default=STEP_NUM,
        help="the number of steps to run in each environment per policy rollout")
    parser.add_argument("--minibatchesNum", type=int, default=MINIBATCH_NUM,
        help="the number of mini-batches")
    parser.add_argument("--epochs", type=int, default=EPOCHS,
        help="the K epochs to update the policy")
    parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
        help="Toggle learning rate annealing for policy and value networks")
    # GAE
    parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="Use GAE for advantage computation")
    parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
        help="Toggles advantages normalization")
    parser.add_argument("--gamma", type=float, default=GAMMA,
        help="the discount factor gamma")
    parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
        help="the lambda for the general advantage estimation")
    parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
        help="the surrogate clipping coefficient")
    parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF,
        help="coefficient of the entropy")
    parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
        help="coefficient of the value function")
    parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True,
        help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
        help="the maximum norm for the gradient clipping")
    parser.add_argument("--target-kl", type=float, default=None,
        help="the target KL divergence threshold")
    # fmt: on
    args = parser.parse_args()
    return args


def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    # Orthogonal weight init and constant bias init, as in the original PPO implementation.
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer
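
# The Unity Aimbot environment exposes a multi-discrete action space: the actor head
# below produces `unity_discrete_size` logits, which are split according to
# `unity_discrete_branches` into one Categorical distribution per action branch.
# The branches are sampled independently, so the joint log-probability and entropy
# of an action are the sums over branches.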

class PPOAgent(nn.Module):
    def __init__(self, env: Aimbot):
        super(PPOAgent, self).__init__()
        self.discrete_size = env.unity_discrete_size
        self.discrete_shape = list(env.unity_discrete_branches)

        self.network = nn.Sequential(
            layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, 128)),
            nn.ReLU(),
            layer_init(nn.Linear(128, 128)),
            nn.ReLU(),
        )
        self.dis_Actor = layer_init(nn.Linear(128, self.discrete_size), std=0.01)
        self.critic = layer_init(nn.Linear(128, 1), std=1)

    def get_value(self, state: torch.Tensor):
        return self.critic(self.network(state))

    def get_actions_value(self, state: torch.Tensor, actions=None):
        hidden = self.network(state)
        dis_logits = self.dis_Actor(hidden)
        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
        if actions is None:
            actions = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
        log_prob = torch.stack(
            [ctgr.log_prob(act) for act, ctgr in zip(actions, multi_categoricals)]
        )
        entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
        return actions.T, log_prob.sum(0), entropy.sum(0), self.critic(hidden)


if __name__ == "__main__":
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

    # Initialize environment, agent and optimizer
    env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport)
    agent = PPOAgent(env).to(device)
    optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)

    # Memory Record
    obs = torch.zeros((args.stepNum, env.unity_agent_num) + env.unity_observation_shape).to(device)
    actions = torch.zeros((args.stepNum, env.unity_agent_num) + (env.unity_discrete_type,)).to(
        device
    )
    logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
    rewards = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
    dones = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
    values = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)

    # TRY NOT TO MODIFY: start the game
    args.batch_size = int(env.unity_agent_num * args.stepNum)
    args.minibatch_size = int(args.batch_size // args.minibatchesNum)
    total_update_step = args.total_timesteps // args.batch_size
    global_step = 0
    start_time = time.time()
    next_obs, _, _ = env.reset()
    next_obs = torch.Tensor(next_obs).to(device)
    next_done = torch.zeros(env.unity_agent_num).to(device)

    for total_steps in range(total_update_step):
        # Anneal the learning rate linearly from args.lr toward 0 over training
        if args.annealLR:
            frac = 1.0 - total_steps / total_update_step
            lrnow = frac * args.lr
            optimizer.param_groups[0]["lr"] = lrnow

        # MAIN LOOP: run agent in environment
        for step in range(args.stepNum):
            print(step)
            global_step += 1 * env.unity_agent_num
            obs[step] = next_obs
            dones[step] = next_done

            with torch.no_grad():
                # predict actions
                action, logprob, _, value = agent.get_actions_value(next_obs)
                value = value.flatten()
            next_obs, reward, done = env.step(action.cpu().numpy())

            # save memories
            actions[step] = action
            logprobs[step] = logprob
            values[step] = value
            rewards[step] = torch.tensor(reward).to(device).view(-1)
            next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)
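
        # GAE: Generalized Advantage Estimation, computed backwards over the rollout.
        #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t)
        #   A_t     = delta_t + gamma * lambda * (1 - done_{t+1}) * A_{t+1}
        # Value targets are returns_t = A_t + V(s_t). With --gae disabled, plain
        # discounted returns bootstrapped from the final value estimate are used instead.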
        with torch.no_grad():
            next_value = agent.get_value(next_obs).reshape(1, -1)
            if args.gae:
                advantages = torch.zeros_like(rewards).to(device)
                lastgaelam = 0
                for t in reversed(range(args.stepNum)):
                    if t == args.stepNum - 1:
                        nextnonterminal = 1.0 - next_done
                        nextvalues = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        nextvalues = values[t + 1]
                    delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                    advantages[t] = lastgaelam = (
                        delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
                    )
                returns = advantages + values
            else:
                returns = torch.zeros_like(rewards).to(device)
                for t in reversed(range(args.stepNum)):
                    if t == args.stepNum - 1:
                        nextnonterminal = 1.0 - next_done
                        next_return = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        next_return = returns[t + 1]
                    returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
                advantages = returns - values

        # flatten the batch
        b_obs = obs.reshape((-1,) + env.unity_observation_shape)
        b_logprobs = logprobs.reshape(-1)
        b_actions = actions.reshape((-1,) + (env.unity_discrete_type,))
        b_advantages = advantages.reshape(-1)
        b_returns = returns.reshape(-1)
        b_values = values.reshape(-1)

        # Optimizing the policy and value network
        b_inds = np.arange(args.batch_size)
        clipfracs = []
        for epoch in range(args.epochs):
            # shuffle all datasets
            np.random.shuffle(b_inds)
            for start in range(0, args.batch_size, args.minibatch_size):
                end = start + args.minibatch_size
                mb_inds = b_inds[start:end]
                mb_advantages = b_advantages[mb_inds]

                # normalize advantages
                if args.norm_adv:
                    mb_advantages = (mb_advantages - mb_advantages.mean()) / (
                        mb_advantages.std() + 1e-8
                    )

                # ratio of new to old policy probabilities for the sampled actions
                _, newlogprob, entropy, newvalue = agent.get_actions_value(
                    b_obs[mb_inds], b_actions.long()[mb_inds].T
                )
                logratio = newlogprob - b_logprobs[mb_inds]
                ratio = logratio.exp()

                # early stop
                with torch.no_grad():
                    # calculate approx_kl http://joschu.net/blog/kl-approx.html
                    old_approx_kl = (-logratio).mean()
                    approx_kl = ((ratio - 1) - logratio).mean()
                    clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]

                # Policy loss (clipped surrogate objective)
                pg_loss1 = -mb_advantages * ratio
                pg_loss2 = -mb_advantages * torch.clamp(
                    ratio, 1 - args.clip_coef, 1 + args.clip_coef
                )
                pg_loss = torch.max(pg_loss1, pg_loss2).mean()

                # Value loss
                newvalue = newvalue.view(-1)
                if args.clip_vloss:
                    v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                    v_clipped = b_values[mb_inds] + torch.clamp(
                        newvalue - b_values[mb_inds],
                        -args.clip_coef,
                        args.clip_coef,
                    )
                    v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                    v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                    v_loss = 0.5 * v_loss_max.mean()
                else:
                    v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

                entropy_loss = entropy.mean()
                loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.critic_coef

                optimizer.zero_grad()
                loss.backward()
                # Clips gradient norm of an iterable of parameters.
                nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                optimizer.step()

            # Stop the policy update early once the approximate KL exceeds the target
            if args.target_kl is not None:
                if approx_kl > args.target_kl:
                    break
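
    # --- Optional logging sketch (not part of the training logic above). ---
    # SummaryWriter is imported but never used; one way to wire it up would be to
    # create a writer once before the update loop, e.g.
    #     writer = SummaryWriter(f"runs/Aimbot_{args.seed}_{int(start_time)}")
    # and, at the end of each update, record the latest statistics:
    #     writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
    #     writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
    #     writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
    #     writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)
    #     writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
    #     writer.add_scalar("charts/clipfrac", np.mean(clipfracs), global_step)
    # The run-name pattern "runs/Aimbot_..." is only an illustrative choice.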