Add load & save function. Add train flag to test model. Add new action select function while in test mode. Add decision period to skip step.
import argparse
import wandb
import time
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim

from AimbotEnv import Aimbot
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from distutils.util import strtobool
from torch.utils.tensorboard import SummaryWriter

# best mean reward seen so far; used to decide when to save the model
bestReward = 0

# environment and experiment defaults
DEFAULT_SEED = 9331
ENV_PATH = "../Build/Build-ParallelEnv-BigArea-6Enemy/Aimbot-ParallelEnv"
WAND_ENTITY = "koha9"
WORKER_ID = 1
BASE_PORT = 1000

# PPO hyperparameters
TOTAL_STEPS = 2000000
STEP_NUM = 314
DECISION_PERIOD = 2
LEARNING_RATE = 7e-4
GAMMA = 0.99
GAE_LAMBDA = 0.95
MINIBATCH_NUM = 4
EPOCHS = 4
CLIP_COEF = 0.1
POLICY_COEF = 1.0
ENTROPY_COEF = 0.01
CRITIC_COEF = 0.5

# training toggles
ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = True
TRAIN = True

# wandb tracking and checkpoint loading
WANDB_TACK = False
LOAD_DIR = None
# LOAD_DIR = "../PPO-Model/SmallArea-256-128-hybrid-2nd-trainning.pt"


def parse_args():
    # fmt: off
    # pytorch and environment parameters
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
        help="seed of the experiment")
    parser.add_argument("--path", type=str, default=ENV_PATH,
        help="environment path")
    parser.add_argument("--workerID", type=int, default=WORKER_ID,
        help="unity worker ID")
    parser.add_argument("--baseport", type=int, default=BASE_PORT,
        help="port to connect to Unity environment")
    parser.add_argument("--lr", type=float, default=LEARNING_RATE,
        help="the learning rate of optimizer")
    parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="if toggled, cuda will be enabled by default")
    parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
        help="total timesteps of the experiments")

    # model parameters
    parser.add_argument("--train", type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
        help="train the model, or only run it in test mode")
    parser.add_argument("--stepNum", type=int, default=STEP_NUM,
        help="the number of steps to run in each environment per policy rollout")
    parser.add_argument("--minibatchesNum", type=int, default=MINIBATCH_NUM,
        help="the number of mini-batches")
    parser.add_argument("--epochs", type=int, default=EPOCHS,
        help="the K epochs to update the policy")
    parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
        help="Toggle learning rate annealing for policy and value networks")
    parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True,
        help="track the run on wandb")
    parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY,
        help="the entity (team) of wandb's project")
    parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
        help="directory to load a saved model from")
    parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
        help="the number of environment steps between two agent decisions")

    # GAE loss
    parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="Use GAE for advantage computation")
    parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
        help="Toggles advantages normalization")
    parser.add_argument("--gamma", type=float, default=GAMMA,
        help="the discount factor gamma")
    parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
        help="the lambda for the general advantage estimation")
    parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
        help="the surrogate clipping coefficient")
    parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
        help="coefficient of the policy loss")
    parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF,
        help="coefficient of the entropy bonus")
    parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
        help="coefficient of the value function loss")
    parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True,
        help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
        help="the maximum norm for the gradient clipping")
    parser.add_argument("--target-kl", type=float, default=None,
        help="the target KL divergence threshold")
    # fmt: on
    args = parser.parse_args()
    return args


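# Example invocation (illustrative only; the filename "ppo.py" is an assumption,
# and every flag maps to one of the arguments defined above, falling back to the
# module-level defaults when omitted):
#   python ppo.py --train True --wandb-track False --stepNum 314 --decision-period 2

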
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    # orthogonal weight initialization with a constant bias, as is common for PPO
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer


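# PPOAgent: a hybrid-action PPO network. A shared MLP trunk (obs -> 384 -> 256)
# feeds three heads: per-branch discrete logits, a Gaussian mean (with a
# state-independent log std) for the continuous actions, and a scalar critic.
# In test mode (args.train == False) actions are chosen deterministically.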
class PPOAgent(nn.Module):
    def __init__(self, env: Aimbot):
        super(PPOAgent, self).__init__()
        self.discrete_size = env.unity_discrete_size
        self.discrete_shape = list(env.unity_discrete_branches)
        self.continuous_size = env.unity_continuous_size

        # shared trunk
        self.network = nn.Sequential(
            layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 384)),
            nn.ReLU(),
            layer_init(nn.Linear(384, 256)),
            nn.ReLU(),
        )
        # heads: multi-discrete logits, continuous mean/log-std, and critic
        self.actor_dis = layer_init(nn.Linear(256, self.discrete_size), std=0.01)
        self.actor_mean = layer_init(nn.Linear(256, self.continuous_size), std=0.01)
        self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
        self.critic = layer_init(nn.Linear(256, 1), std=1)

    def get_value(self, state: torch.Tensor):
        return self.critic(self.network(state))

    def get_actions_value(self, state: torch.Tensor, actions=None):
        hidden = self.network(state)
        # discrete: one categorical distribution per action branch
        dis_logits = self.actor_dis(hidden)
        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
        # continuous: diagonal Gaussian
        actions_mean = self.actor_mean(hidden)
        action_logstd = self.actor_logstd.expand_as(actions_mean)
        action_std = torch.exp(action_logstd)
        con_probs = Normal(actions_mean, action_std)

        if actions is None:
            if args.train:
                # sample actions from the predicted probability distributions (training mode)
                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
                conAct = con_probs.sample()
                actions = torch.cat([disAct.T, conAct], dim=1)
            else:
                # pick the most probable actions (test mode): argmax for discrete, mean for continuous
                disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
                conAct = actions_mean
                actions = torch.cat([disAct.T, conAct], dim=1)
        else:
            disAct = actions[:, 0 : env.unity_discrete_type].T
            conAct = actions[:, env.unity_discrete_type :]
        dis_log_prob = torch.stack(
            [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
        )
        dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
        return (
            actions,
            dis_log_prob.sum(0),
            dis_entropy.sum(0),
            con_probs.log_prob(conAct).sum(1),
            con_probs.entropy().sum(1),
            self.critic(hidden),
        )


if __name__ == "__main__":
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

    # Initialize environment, agent and optimizer
    env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport)
    if args.load_dir is None:
        agent = PPOAgent(env).to(device)
    else:
        # load a previously saved model instead of building a fresh one
        agent = torch.load(args.load_dir)
        print("Load Agent", args.load_dir)
        print(agent.eval())

    optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)

    # Tensorboard and WandB Recorder
    game_name = "Aimbot"
    run_name = f"{game_name}_{args.seed}_{int(time.time())}"
    if args.wandb_track:
        wandb.init(
            project=run_name,
            entity=args.wandb_entity,
            sync_tensorboard=True,
            config=vars(args),
            name=run_name,
            monitor_gym=True,
            save_code=True,
        )

    writer = SummaryWriter(f"runs/{run_name}")
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s"
        % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )

    # Memory Record
    obs = torch.zeros((args.stepNum, env.unity_agent_num) + env.unity_observation_shape).to(device)
    actions = torch.zeros((args.stepNum, env.unity_agent_num) + (env.unity_action_size,)).to(device)
    dis_logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
    con_logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
    rewards = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
    dones = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
    values = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)

    # TRY NOT TO MODIFY: start the game
    args.batch_size = int(env.unity_agent_num * args.stepNum)
    args.minibatch_size = int(args.batch_size // args.minibatchesNum)
    total_update_step = args.total_timesteps // args.batch_size
    global_step = 0
    start_time = time.time()
    next_obs, _, _ = env.reset()
    next_obs = torch.Tensor(next_obs).to(device)
    next_done = torch.zeros(env.unity_agent_num).to(device)

    for total_steps in range(total_update_step):
        # anneal the learning rate linearly; it approaches 0 as total_steps approaches total_update_step
        if args.annealLR:
            frac = 1.0 - total_steps / total_update_step
            lrnow = frac * args.lr
            optimizer.param_groups[0]["lr"] = lrnow
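
        # Rollout with a decision period: the policy is only queried every
        # args.decision_period environment steps; on the frames in between the
        # previous action is repeated and those rewards/dones are not stored,
        # so the buffers still hold exactly args.stepNum decision steps.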
        # MAIN LOOP: run agent in environment
        for i in range(args.stepNum * args.decision_period):
            if i % args.decision_period == 0:
                step = round(i / args.decision_period)
                # Choose action by agent
                global_step += 1 * env.unity_agent_num
                obs[step] = next_obs
                dones[step] = next_done

                with torch.no_grad():
                    # predict actions
                    action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
                        next_obs
                    )
                    value = value.flatten()
                next_obs, reward, done = env.step(action.cpu().numpy())

                # save memories
                actions[step] = action
                dis_logprobs[step] = dis_logprob
                con_logprobs[step] = con_logprob
                values[step] = value
                rewards[step] = torch.tensor(reward).to(device).view(-1)
                next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(
                    device
                )
            else:
                # skip this step: repeat the last predicted action
                next_obs, reward, done = env.step(action.cpu().numpy())
                next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(
                    device
                )
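
        # Generalized Advantage Estimation over the stored decision steps:
        #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t)
        #   A_t     = delta_t + gamma * gaeLambda * (1 - done_{t+1}) * A_{t+1}
        # and returns_t = A_t + V(s_t); the non-GAE branch instead bootstraps
        # discounted returns and sets A_t = returns_t - V(s_t).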
        # GAE
        with torch.no_grad():
            next_value = agent.get_value(next_obs).reshape(1, -1)
            if args.gae:
                advantages = torch.zeros_like(rewards).to(device)
                lastgaelam = 0
                for t in reversed(range(args.stepNum)):
                    if t == args.stepNum - 1:
                        nextnonterminal = 1.0 - next_done
                        nextvalues = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        nextvalues = values[t + 1]
                    delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                    advantages[t] = lastgaelam = (
                        delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
                    )
                returns = advantages + values
            else:
                returns = torch.zeros_like(rewards).to(device)
                for t in reversed(range(args.stepNum)):
                    if t == args.stepNum - 1:
                        nextnonterminal = 1.0 - next_done
                        next_return = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        next_return = returns[t + 1]
                    returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
                advantages = returns - values
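
        # PPO update (training mode only). For both the discrete and continuous
        # heads the clipped surrogate loss is
        #   L_pg = mean( max( -A * ratio, -A * clip(ratio, 1 - clip_coef, 1 + clip_coef) ) )
        # with ratio = exp(new_logprob - old_logprob); the total loss combines the
        # two policy terms, the entropy bonus (subtracted) and the (optionally
        # clipped) value loss, each weighted by its coefficient.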
        if args.train:
            # flatten the batch
            b_obs = obs.reshape((-1,) + env.unity_observation_shape)
            b_dis_logprobs = dis_logprobs.reshape(-1)
            b_con_logprobs = con_logprobs.reshape(-1)
            b_actions = actions.reshape((-1,) + (env.unity_action_size,))
            b_advantages = advantages.reshape(-1)
            b_returns = returns.reshape(-1)
            b_values = values.reshape(-1)

            # Optimizing the policy and value network
            b_inds = np.arange(args.batch_size)
            # clipfracs = []
            for epoch in range(args.epochs):
                # shuffle all datasets
                np.random.shuffle(b_inds)
                for start in range(0, args.batch_size, args.minibatch_size):
                    end = start + args.minibatch_size
                    mb_inds = b_inds[start:end]
                    mb_advantages = b_advantages[mb_inds]

                    # normalize advantages
                    if args.norm_adv:
                        mb_advantages = (mb_advantages - mb_advantages.mean()) / (
                            mb_advantages.std() + 1e-8
                        )

                    (
                        _,
                        new_dis_logprob,
                        dis_entropy,
                        new_con_logprob,
                        con_entropy,
                        newvalue,
                    ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
                    # discrete ratio
                    dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
                    dis_ratio = dis_logratio.exp()
                    # continuous ratio
                    con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
                    con_ratio = con_logratio.exp()

                    """
                    # early stop
                    with torch.no_grad():
                        # calculate approx_kl http://joschu.net/blog/kl-approx.html
                        old_approx_kl = (-logratio).mean()
                        approx_kl = ((ratio - 1) - logratio).mean()
                        clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
                    """

                    # discrete Policy loss
                    dis_pg_loss_orig = -mb_advantages * dis_ratio
                    dis_pg_loss_clip = -mb_advantages * torch.clamp(
                        dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
                    )
                    dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
                    # continuous Policy loss
                    con_pg_loss_orig = -mb_advantages * con_ratio
                    con_pg_loss_clip = -mb_advantages * torch.clamp(
                        con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
                    )
                    con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()

                    # Value loss
                    newvalue = newvalue.view(-1)
                    if args.clip_vloss:
                        v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                        v_clipped = b_values[mb_inds] + torch.clamp(
                            newvalue - b_values[mb_inds],
                            -args.clip_coef,
                            args.clip_coef,
                        )
                        v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                        v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                        v_loss = 0.5 * v_loss_max.mean()
                    else:
                        v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

                    # total loss
                    entropy_loss = dis_entropy.mean() + con_entropy.mean()
                    loss = (
                        dis_pg_loss * args.policy_coef
                        + con_pg_loss * args.policy_coef
                        - entropy_loss * args.ent_coef
                        + v_loss * args.critic_coef
                    )

                    optimizer.zero_grad()
                    loss.backward()
                    # Clips gradient norm of an iterable of parameters.
                    nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                    optimizer.step()

                """
                if args.target_kl is not None:
                    if approx_kl > args.target_kl:
                        break
                """

            # record rewards for plotting purposes
            rewardsMean = np.mean(rewards.to("cpu").detach().numpy().copy())
            writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
            writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
            writer.add_scalar("losses/dis_policy_loss", dis_pg_loss.item(), global_step)
            writer.add_scalar("losses/con_policy_loss", con_pg_loss.item(), global_step)
            writer.add_scalar("losses/total_loss", loss.item(), global_step)
            writer.add_scalar("losses/entropy_loss", entropy_loss.item(), global_step)
            # writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
            # writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
            # writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
            # print("SPS:", int(global_step / (time.time() - start_time)))
            print("episode over mean reward:", rewardsMean)
            writer.add_scalar(
                "charts/SPS", int(global_step / (time.time() - start_time)), global_step
            )
            writer.add_scalar("charts/Reward", rewardsMean, global_step)
            # save the model whenever this rollout's mean reward beats the best so far
            if rewardsMean > bestReward:
                bestReward = rewardsMean
                saveDir = "../PPO-Model/bigArea-384-128-hybrid-" + str(rewardsMean) + ".pt"
                torch.save(agent, saveDir)

    env.close()
    writer.close()