# Aimbot-PPO/Aimbot-PPO-Python/Pytorch/ppo.py

import argparse
import wandb
import time
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from AimbotEnv import Aimbot
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from distutils.util import strtobool
from torch.utils.tensorboard import SummaryWriter

DEFAULT_SEED = 9331
ENV_PATH = "../Build-ParallelEnv/Aimbot-ParallelEnv"
WAND_ENTITY = "koha9"
WORKER_ID = 1
BASE_PORT = 2002
LEARNING_RATE = 2e-3
GAMMA = 0.99
GAE_LAMBDA = 0.95
TOTAL_STEPS = 2000000
STEP_NUM = 128
MINIBATCH_NUM = 4
EPOCHS = 4
CLIP_COEF = 0.1
ENTROPY_COEF = 0.01
CRITIC_COEF = 0.5
ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = True


def parse_args():
# fmt: off
parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
help="seed of the experiment")
parser.add_argument("--path", type=str, default=ENV_PATH,
help="enviroment path")
parser.add_argument("--workerID", type=int, default=WORKER_ID,
help="unity worker ID")
parser.add_argument("--baseport", type=int, default=BASE_PORT,
help="port to connect to Unity environment")
parser.add_argument("--lr", type=float, default=LEARNING_RATE,
help="the learning rate of optimizer")
parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="if toggled, cuda will be enabled by default")
parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
help="total timesteps of the experiments")
parser.add_argument("--stepNum", type=int, default=STEP_NUM,
help="the number of steps to run in each environment per policy rollout")
parser.add_argument("--minibatchesNum", type=int, default=MINIBATCH_NUM,
help="the number of mini-batches")
parser.add_argument("--epochs", type=int, default=EPOCHS,
help="the K epochs to update the policy")
parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
help="Toggle learning rate annealing for policy and value networks")
parser.add_argument("--wandb-entity", type=str, default=None,
help="the entity (team) of wandb's project")
# GAE
parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="Use GAE for advantage computation")
parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
help="Toggles advantages normalization")
parser.add_argument("--gamma", type=float, default=GAMMA,
help="the discount factor gamma")
parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
help="the lambda for the general advantage estimation")
parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
help="the surrogate clipping coefficient")
parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF,
help="coefficient of the entropy")
parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
help="coefficient of the value function")
parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True,
help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
parser.add_argument("--max-grad-norm", type=float, default=0.5,
help="the maximum norm for the gradient clipping")
parser.add_argument("--target-kl", type=float, default=None,
help="the target KL divergence threshold")
# fmt: on
args = parser.parse_args()
return args
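

# Orthogonal weight initialization, the standard choice in PPO implementations:
# hidden layers use a gain of sqrt(2), the policy head uses 0.01 so the initial
# action distribution is close to uniform, and the value head uses 1.0 (see the
# std arguments passed where dis_Actor and critic are built in PPOAgent below).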
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
torch.nn.init.orthogonal_(layer.weight, std)
torch.nn.init.constant_(layer.bias, bias_const)
    return layer


class PPOAgent(nn.Module):
def __init__(self, env: Aimbot):
super(PPOAgent, self).__init__()
self.discrete_size = env.unity_discrete_size
self.discrete_shape = list(env.unity_discrete_branches)
self.network = nn.Sequential(
layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 128)),
nn.Tanh(),
layer_init(nn.Linear(128, 128)),
nn.ReLU(),
layer_init(nn.Linear(128, 128)),
nn.ReLU(),
)
self.dis_Actor = layer_init(nn.Linear(128, self.discrete_size), std=0.01)
self.critic = layer_init(nn.Linear(128, 1), std=1)
def get_value(self, state: torch.Tensor):
return self.critic(self.network(state))
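    # The environment's action space is multi-discrete: dis_Actor outputs the
    # concatenated logits of all branches, which are split according to
    # unity_discrete_branches into independent Categorical distributions. The
    # joint log-probability and entropy are therefore sums over the branches.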
def get_actions_value(self, state: torch.Tensor, actions=None):
hidden = self.network(state)
dis_logits = self.dis_Actor(hidden)
split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
if actions is None:
actions = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
log_prob = torch.stack(
[ctgr.log_prob(act) for act, ctgr in zip(actions, multi_categoricals)]
)
entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
        return actions.T, log_prob.sum(0), entropy.sum(0), self.critic(hidden)


if __name__ == "__main__":
args = parse_args()
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
    # Initialize environment, agent and optimizer
env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport)
agent = PPOAgent(env).to(device)
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
# Tensorboard and WandB Recorder
game_name = "Aimbot"
run_name = f"{game_name}__{args.seed}__{int(time.time())}"
wandb.init(
        project=game_name,
entity=args.wandb_entity,
sync_tensorboard=True,
config=vars(args),
name=run_name,
monitor_gym=True,
save_code=True,
)
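    # sync_tensorboard=True makes wandb mirror everything written through the
    # SummaryWriter below into the same W&B run, so scalars are logged once here
    # and appear in both dashboards.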
writer = SummaryWriter(f"runs/{run_name}")
writer.add_text(
"hyperparameters",
"|param|value|\n|-|-|\n%s"
% ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
)
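    # Rollout buffers are shaped (stepNum, unity_agent_num, ...): every parallel
    # agent in the Unity build contributes one trajectory slice per update, so a
    # full rollout holds stepNum * unity_agent_num transitions (the batch size
    # computed below).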
# Memory Record
obs = torch.zeros((args.stepNum, env.unity_agent_num) + env.unity_observation_shape).to(device)
actions = torch.zeros((args.stepNum, env.unity_agent_num) + (env.unity_discrete_type,)).to(
device
)
logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
rewards = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
dones = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
values = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
# TRY NOT TO MODIFY: start the game
args.batch_size = int(env.unity_agent_num * args.stepNum)
args.minibatch_size = int(args.batch_size // args.minibatchesNum)
total_update_step = args.total_timesteps // args.batch_size
global_step = 0
start_time = time.time()
next_obs, _, _ = env.reset()
next_obs = torch.Tensor(next_obs).to(device)
next_done = torch.zeros(env.unity_agent_num).to(device)
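    # Each update iteration: (1) collect stepNum steps from all parallel agents,
    # (2) bootstrap the last state and compute advantages, (3) run `epochs`
    # passes of minibatch PPO updates over the flattened rollout.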
for total_steps in range(total_update_step):
        # Anneal the learning rate linearly towards 0 over the course of training
        # (total_steps runs from 0 to total_update_step - 1)
        if args.annealLR:
            frac = 1.0 - total_steps / total_update_step
            lrnow = frac * args.lr
            optimizer.param_groups[0]["lr"] = lrnow
# MAIN LOOP: run agent in environment
for step in range(args.stepNum):
global_step += 1 * env.unity_agent_num
obs[step] = next_obs
dones[step] = next_done
with torch.no_grad():
# predict actions
action, logprob, _, value = agent.get_actions_value(next_obs)
value = value.flatten()
next_obs, reward, done = env.step(action.cpu().numpy())
# save memories
actions[step] = action
logprobs[step] = logprob
values[step] = value
rewards[step] = torch.tensor(reward).to(device).view(-1)
next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)
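        # Generalized Advantage Estimation, bootstrapped with the value of next_obs:
        #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t)
        #   A_t     = delta_t + gamma * lambda * (1 - done_{t+1}) * A_{t+1}
        # returns_t = A_t + V(s_t) then serves as the value-function target.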
# GAE
with torch.no_grad():
next_value = agent.get_value(next_obs).reshape(1, -1)
if args.gae:
advantages = torch.zeros_like(rewards).to(device)
lastgaelam = 0
for t in reversed(range(args.stepNum)):
if t == args.stepNum - 1:
nextnonterminal = 1.0 - next_done
nextvalues = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
nextvalues = values[t + 1]
delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
advantages[t] = lastgaelam = (
delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
)
returns = advantages + values
else:
returns = torch.zeros_like(rewards).to(device)
for t in reversed(range(args.stepNum)):
if t == args.stepNum - 1:
nextnonterminal = 1.0 - next_done
next_return = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
next_return = returns[t + 1]
returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
advantages = returns - values
# flatten the batch
b_obs = obs.reshape((-1,) + env.unity_observation_shape)
b_logprobs = logprobs.reshape(-1)
b_actions = actions.reshape((-1,) + (env.unity_discrete_type,))
b_advantages = advantages.reshape(-1)
b_returns = returns.reshape(-1)
b_values = values.reshape(-1)
# Optimizing the policy and value network
b_inds = np.arange(args.batch_size)
clipfracs = []
for epoch in range(args.epochs):
# shuffle all datasets
np.random.shuffle(b_inds)
for start in range(0, args.batch_size, args.minibatch_size):
end = start + args.minibatch_size
mb_inds = b_inds[start:end]
mb_advantages = b_advantages[mb_inds]
# normalize advantages
if args.norm_adv:
mb_advantages = (mb_advantages - mb_advantages.mean()) / (
mb_advantages.std() + 1e-8
)
# ratio
_, newlogprob, entropy, newvalue = agent.get_actions_value(
b_obs[mb_inds], b_actions.long()[mb_inds].T
)
logratio = newlogprob - b_logprobs[mb_inds]
ratio = logratio.exp()
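                # Diagnostics below: approx_kl uses the low-variance estimator
                # E[(ratio - 1) - log(ratio)] from the blog post linked underneath,
                # and clipfrac tracks how often the ratio leaves the clip interval.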
# early stop
with torch.no_grad():
# calculate approx_kl http://joschu.net/blog/kl-approx.html
old_approx_kl = (-logratio).mean()
approx_kl = ((ratio - 1) - logratio).mean()
clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
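                # Clipped surrogate objective from the PPO paper, negated for gradient
                # descent: maximize E[min(ratio * A, clip(ratio, 1 - eps, 1 + eps) * A)].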
# Policy loss
pg_loss1 = -mb_advantages * ratio
pg_loss2 = -mb_advantages * torch.clamp(
ratio, 1 - args.clip_coef, 1 + args.clip_coef
)
pg_loss = torch.max(pg_loss1, pg_loss2).mean()
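                # Value loss; with clip_vloss the new value prediction may only move
                # clip_coef away from the old rollout value, mirroring the policy
                # clipping as in the original PPO implementation.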
# Value loss
newvalue = newvalue.view(-1)
if args.clip_vloss:
v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
v_clipped = b_values[mb_inds] + torch.clamp(
newvalue - b_values[mb_inds],
-args.clip_coef,
args.clip_coef,
)
v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
v_loss = 0.5 * v_loss_max.mean()
else:
v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
entropy_loss = entropy.mean()
loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.critic_coef
optimizer.zero_grad()
loss.backward()
# Clips gradient norm of an iterable of parameters.
nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
optimizer.step()
if args.target_kl is not None:
if approx_kl > args.target_kl:
break
# record rewards for plotting purposes
writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)
writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
print("SPS:", int(global_step / (time.time() - start_time)))
writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)
env.close()
writer.close()
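
# Example invocation (paths, ports and the wandb entity below are placeholders;
# adjust them to your own Unity build and account):
#   python ppo.py --path ../Build-ParallelEnv/Aimbot-ParallelEnv --workerID 1 \
#       --baseport 2002 --lr 2e-3 --stepNum 128 --epochs 4 --wandb-entity koha9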