import argparse
import wandb
import time
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim

from AimbotEnv import Aimbot
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from distutils.util import strtobool
from torch.utils.tensorboard import SummaryWriter

DEFAULT_SEED = 9331
ENV_PATH = "../Build-ParallelEnv/Aimbot-ParallelEnv"
WAND_ENTITY = "koha9"
WORKER_ID = 1
BASE_PORT = 2002

LEARNING_RATE = 2e-3
GAMMA = 0.99
GAE_LAMBDA = 0.95
TOTAL_STEPS = 2000000
STEP_NUM = 128
MINIBATCH_NUM = 4
EPOCHS = 4
CLIP_COEF = 0.1
ENTROPY_COEF = 0.01
CRITIC_COEF = 0.5

ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = True
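
# Example invocation (a hypothetical sketch; the script filename is an assumption,
# adjust the path and ports to your checkout):
#   python ppo_aimbot.py --seed 9331 --lr 2e-3 --total-timesteps 2000000 \
#       --path ../Build-ParallelEnv/Aimbot-ParallelEnv --workerID 1 --baseport 2002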
def parse_args():
    # fmt: off
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
        help="seed of the experiment")
    parser.add_argument("--path", type=str, default=ENV_PATH,
        help="environment path")
    parser.add_argument("--workerID", type=int, default=WORKER_ID,
        help="unity worker ID")
    parser.add_argument("--baseport", type=int, default=BASE_PORT,
        help="port to connect to Unity environment")
    parser.add_argument("--lr", type=float, default=LEARNING_RATE,
        help="the learning rate of the optimizer")
    parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="if toggled, cuda will be enabled by default")
    parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
        help="total timesteps of the experiment")

    parser.add_argument("--stepNum", type=int, default=STEP_NUM,
        help="the number of steps to run in each environment per policy rollout")
    parser.add_argument("--minibatchesNum", type=int, default=MINIBATCH_NUM,
        help="the number of mini-batches")
    parser.add_argument("--epochs", type=int, default=EPOCHS,
        help="the K epochs to update the policy")
    parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
        help="Toggle learning rate annealing for policy and value networks")
    parser.add_argument("--wandb-entity", type=str, default=None,
        help="the entity (team) of wandb's project")
    # GAE
    parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="Use GAE for advantage computation")
    parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
        help="Toggles advantage normalization")
    parser.add_argument("--gamma", type=float, default=GAMMA,
        help="the discount factor gamma")
    parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
        help="the lambda for the general advantage estimation")
    parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
        help="the surrogate clipping coefficient")
    parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF,
        help="coefficient of the entropy")
    parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
        help="coefficient of the value function")
    parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True,
        help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
        help="the maximum norm for the gradient clipping")
    parser.add_argument("--target-kl", type=float, default=None,
        help="the target KL divergence threshold")
    # fmt: on
    args = parser.parse_args()
    return args


def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    # Orthogonal weight initialization with a constant bias.
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer
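
# Note on the action space (explanatory comment, not original to this file): the Unity
# Aimbot env exposes a multi-discrete action space. PPOAgent below keeps one shared MLP
# backbone, splits the actor head's logits into one chunk per discrete branch
# (env.unity_discrete_branches), and treats each chunk as an independent Categorical;
# the joint log-probability and entropy are then the sums over branches.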
class PPOAgent(nn.Module):
    def __init__(self, env: Aimbot):
        super(PPOAgent, self).__init__()
        self.discrete_size = env.unity_discrete_size
        self.discrete_shape = list(env.unity_discrete_branches)

        # Shared backbone feeding both the discrete actor head and the critic head.
        self.network = nn.Sequential(
            layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, 128)),
            nn.ReLU(),
            layer_init(nn.Linear(128, 128)),
            nn.ReLU(),
        )
        self.dis_Actor = layer_init(nn.Linear(128, self.discrete_size), std=0.01)
        self.critic = layer_init(nn.Linear(128, 1), std=1)

    def get_value(self, state: torch.Tensor):
        return self.critic(self.network(state))

    def get_actions_value(self, state: torch.Tensor, actions=None):
        hidden = self.network(state)
        dis_logits = self.dis_Actor(hidden)
        # One Categorical distribution per discrete action branch.
        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
        if actions is None:
            actions = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
        log_prob = torch.stack(
            [ctgr.log_prob(act) for act, ctgr in zip(actions, multi_categoricals)]
        )
        entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
        return actions.T, log_prob.sum(0), entropy.sum(0), self.critic(hidden)
if __name__ == "__main__":
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

    # Initialize environment, agent and optimizer
    env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport)
    agent = PPOAgent(env).to(device)
    optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)

    # Tensorboard and WandB Recorder
    game_name = "Aimbot"
    run_name = f"{game_name}__{args.seed}__{int(time.time())}"
    wandb.init(
        project=run_name,
        entity=args.wandb_entity,
        sync_tensorboard=True,
        config=vars(args),
        name=run_name,
        monitor_gym=True,
        save_code=True,
    )

    writer = SummaryWriter(f"runs/{run_name}")
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s"
        % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )
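
    # Explanatory note (added comment): the rollout buffers below are laid out as
    # (stepNum, agent_num, *feature_shape) so that every parallel Unity agent contributes
    # one trajectory slice per update; flattening later yields a batch of
    # stepNum * agent_num transitions.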
    # Memory Record
    obs = torch.zeros((args.stepNum, env.unity_agent_num) + env.unity_observation_shape).to(device)
    actions = torch.zeros((args.stepNum, env.unity_agent_num) + (env.unity_discrete_type,)).to(
        device
    )
    logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
    rewards = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
    dones = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
    values = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)

    # TRY NOT TO MODIFY: start the game
    args.batch_size = int(env.unity_agent_num * args.stepNum)
    args.minibatch_size = int(args.batch_size // args.minibatchesNum)
    total_update_step = args.total_timesteps // args.batch_size
    global_step = 0
    start_time = time.time()
    next_obs, _, _ = env.reset()
    next_obs = torch.Tensor(next_obs).to(device)
    next_done = torch.zeros(env.unity_agent_num).to(device)

    for total_steps in range(total_update_step):
        # Anneal the learning rate linearly toward 0 over the course of training.
        if args.annealLR:
            frac = 1.0 - (total_steps - 1.0) / total_update_step
            lrnow = frac * args.lr
            optimizer.param_groups[0]["lr"] = lrnow

        # MAIN LOOP: run agent in environment
        for step in range(args.stepNum):
            global_step += 1 * env.unity_agent_num
            obs[step] = next_obs
            dones[step] = next_done

            with torch.no_grad():
                # predict actions
                action, logprob, _, value = agent.get_actions_value(next_obs)
                value = value.flatten()
            next_obs, reward, done = env.step(action.cpu().numpy())

            # save memories
            actions[step] = action
            logprobs[step] = logprob
            values[step] = value
            rewards[step] = torch.tensor(reward).to(device).view(-1)
            next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)
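
        # Explanatory note (added comment): the block below is the standard GAE(lambda)
        # recursion. With delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t),
        # advantages are accumulated backwards as
        #   A_t = delta_t + gamma * lambda * (1 - done_{t+1}) * A_{t+1},
        # and the value targets are returns_t = A_t + V(s_t).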
        # GAE
        with torch.no_grad():
            next_value = agent.get_value(next_obs).reshape(1, -1)
            if args.gae:
                advantages = torch.zeros_like(rewards).to(device)
                lastgaelam = 0
                for t in reversed(range(args.stepNum)):
                    if t == args.stepNum - 1:
                        nextnonterminal = 1.0 - next_done
                        nextvalues = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        nextvalues = values[t + 1]
                    delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                    advantages[t] = lastgaelam = (
                        delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
                    )
                returns = advantages + values
            else:
                returns = torch.zeros_like(rewards).to(device)
                for t in reversed(range(args.stepNum)):
                    if t == args.stepNum - 1:
                        nextnonterminal = 1.0 - next_done
                        next_return = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        next_return = returns[t + 1]
                    returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
                advantages = returns - values

        # flatten the batch
        b_obs = obs.reshape((-1,) + env.unity_observation_shape)
        b_logprobs = logprobs.reshape(-1)
        b_actions = actions.reshape((-1,) + (env.unity_discrete_type,))
        b_advantages = advantages.reshape(-1)
        b_returns = returns.reshape(-1)
        b_values = values.reshape(-1)

        # Optimizing the policy and value network
        b_inds = np.arange(args.batch_size)
        clipfracs = []
        for epoch in range(args.epochs):
            # shuffle all datasets
            np.random.shuffle(b_inds)
            for start in range(0, args.batch_size, args.minibatch_size):
                end = start + args.minibatch_size
                mb_inds = b_inds[start:end]
                mb_advantages = b_advantages[mb_inds]

                # normalize advantages
                if args.norm_adv:
                    mb_advantages = (mb_advantages - mb_advantages.mean()) / (
                        mb_advantages.std() + 1e-8
                    )

                # ratio
                _, newlogprob, entropy, newvalue = agent.get_actions_value(
                    b_obs[mb_inds], b_actions.long()[mb_inds].T
                )
                logratio = newlogprob - b_logprobs[mb_inds]
                ratio = logratio.exp()
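
                # Explanatory note (added comment): approx_kl uses the low-variance
                # estimator E[(r - 1) - log r] from http://joschu.net/blog/kl-approx.html,
                # while old_approx_kl is the naive E[-log r]; clipfrac tracks how often
                # the ratio leaves the [1 - clip_coef, 1 + clip_coef] interval.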
                # early stop
                with torch.no_grad():
                    # calculate approx_kl http://joschu.net/blog/kl-approx.html
                    old_approx_kl = (-logratio).mean()
                    approx_kl = ((ratio - 1) - logratio).mean()
                    clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
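
                # Explanatory note (added comment): this is the PPO clipped surrogate
                # objective, L = E[min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)],
                # written as the maximum of the two negated terms because the optimizer
                # minimizes the loss.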
                # Policy loss
                pg_loss1 = -mb_advantages * ratio
                pg_loss2 = -mb_advantages * torch.clamp(
                    ratio, 1 - args.clip_coef, 1 + args.clip_coef
                )
                pg_loss = torch.max(pg_loss1, pg_loss2).mean()
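
                # Explanatory note (added comment): when clip_vloss is enabled, the new value
                # prediction is clipped to within clip_coef of the old rollout value, and the
                # element-wise maximum of the clipped and unclipped squared errors is used,
                # mirroring the clipped value loss in common PPO implementations.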
                # Value loss
                newvalue = newvalue.view(-1)
                if args.clip_vloss:
                    v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                    v_clipped = b_values[mb_inds] + torch.clamp(
                        newvalue - b_values[mb_inds],
                        -args.clip_coef,
                        args.clip_coef,
                    )
                    v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                    v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                    v_loss = 0.5 * v_loss_max.mean()
                else:
                    v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

                entropy_loss = entropy.mean()
                loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.critic_coef

                optimizer.zero_grad()
                loss.backward()
                # Clips gradient norm of an iterable of parameters.
                nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                optimizer.step()

            if args.target_kl is not None:
                if approx_kl > args.target_kl:
                    break
        # record rewards for plotting purposes
        writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
        writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
        writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
        writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)
        writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
        writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
        writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
        print("SPS:", int(global_step / (time.time() - start_time)))
        writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)

    env.close()
    writer.close()