Code cleanup

Separate args, standardize naming
Koha9 2023-07-24 16:48:47 +09:00
parent ef0ee495f2
commit efb5c61f0d
6 changed files with 401 additions and 370 deletions
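
This commit moves argument parsing out of the training entry point into a new arguments.py and replaces module-level constants (TARGETNUM, TARGET_LEARNING_RATE, BASE_WINREWARD, ...) with fields on the parsed args namespace. A minimal usage sketch under that assumption (the print calls below are illustrative only, not part of the commit):

from arguments import parse_args

if __name__ == "__main__":
    args = parse_args()
    # former module-level constants are now read from the parsed namespace
    print(args.target_num)       # replaces TARGETNUM
    print(args.target_lr)        # replaces TARGET_LEARNING_RATE
    print(args.base_win_reward)  # replaces BASE_WINREWARD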

View File

@@ -185,18 +185,27 @@
"metadata": {},
"outputs": [
{
"name": "stdout",
"name": "stderr",
"output_type": "stream",
"text": [
"(1.2, 3.2)\n",
"1.2\n"
"Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mkoha9\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"aaa = (1.2,3.2)\n",
"print(aaa)\n",
"print(aaa[0])"
"import wandb\n",
"wandb.login()"
]
}
],

View File

@@ -1,156 +1,28 @@
import argparse
import time
import numpy as np
import random
import uuid
import torch
import torch.nn as nn
import torch.optim as optim
import atexit
from aimbotEnv import Aimbot
from aimbotEnv import AimbotSideChannel
from ppoagent import PPOAgent
from airecorder import WandbRecorder
from aimemory import PPOMem
from aimemory import Targets
from enum import Enum
from distutils.util import strtobool
from arguments import parse_args
import torch.optim as optim
best_reward = -1
DEFAULT_SEED = 9331
ENV_PATH = "../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv"
# side channel uuid
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
WAND_ENTITY = "koha9"
WORKER_ID = 1
BASE_PORT = 1000
# tensorboard names
GAME_NAME = "Aimbot_Target_Hybrid_PMNN_V3"
GAME_NAME = "Aimbot_Hybrid_V3"
GAME_TYPE = "Mix_Verification"
# max round steps per agent is 2500/Decision_period, 25 seconds
# !!!check every parameter before running!!!
TOTAL_STEPS = 3150000
BATCH_SIZE = 512
MAX_TRAINNING_DATASETS = 6000
DECISION_PERIOD = 1
LEARNING_RATE = 6.5e-4
GAMMA = 0.99
GAE_LAMBDA = 0.95
EPOCHS = 3
CLIP_COEF = 0.11
LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
TARGET_LEARNING_RATE = 1e-6
FREEZE_VIEW_NETWORK = True
BROADCASTREWARD = False
ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = False
TRAIN = True
SAVE_MODEL = False
WANDB_TACK = False
LOAD_DIR = None
#LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt"
TARGET_STATE_SIZE = 6
INAREA_STATE_SIZE = 1
TIME_STATE_SIZE = 1
GUN_STATE_SIZE = 1
MY_STATE_SIZE = 4
TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
BASE_WINREWARD = 999
BASE_LOSEREWARD = -999
TARGETNUM= 4
ENV_TIMELIMIT = 30
RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
# !!!SPECIAL PARAMETERS!!!
# change it while program is finished
using_targets_num = 3
def parse_args():
# fmt: off
# pytorch and environment parameters
parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
help="seed of the experiment")
parser.add_argument("--path", type=str, default=ENV_PATH,
help="enviroment path")
parser.add_argument("--workerID", type=int, default=WORKER_ID,
help="unity worker ID")
parser.add_argument("--baseport", type=int, default=BASE_PORT,
help="port to connect to Unity environment")
parser.add_argument("--lr", type=float, default=LEARNING_RATE,
help="the learning rate of optimizer")
parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="if toggled, cuda will be enabled by default")
parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
help="total timesteps of the experiments")
# model parameters
parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
help="Train Model or not")
parser.add_argument("--freeze-viewnet", type=lambda x: bool(strtobool(x)), default=FREEZE_VIEW_NETWORK, nargs="?", const=True,
help="freeze view network or not")
parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
help="training dataset size,start training while dataset collect enough data")
parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
help="nimi batch size")
parser.add_argument("--epochs", type=int, default=EPOCHS,
help="the K epochs to update the policy")
parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
help="Toggle learning rate annealing for policy and value networks")
parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True,
help="track on the wandb")
parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=SAVE_MODEL, nargs="?", const=True,
help="save model or not")
parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY,
help="the entity (team) of wandb's project")
parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
help="load model directory")
parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
help="the number of steps to run in each environment per policy rollout")
parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
parser.add_argument("--broadCastEndReward", type=lambda x: bool(strtobool(x)), default=BROADCASTREWARD, nargs="?", const=True,
help="save model or not")
# GAE loss
parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="Use GAE for advantage computation")
parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
help="Toggles advantages normalization")
parser.add_argument("--gamma", type=float, default=GAMMA,
help="the discount factor gamma")
parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
help="the lambda for the general advantage estimation")
parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
help="the surrogate clipping coefficient")
parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
help="coefficient of the policy")
parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF,
help="coefficient of the entropy")
parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
help="coefficient of the value function")
parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True,
help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
parser.add_argument("--max-grad-norm", type=float, default=0.5,
help="the maximum norm for the gradient clipping")
parser.add_argument("--target-kl", type=float, default=None,
help="the target KL divergence threshold")
# fmt: on
args = parser.parse_args()
return args
if __name__ == "__main__":
args = parse_args()
random.seed(args.seed)
@@ -158,6 +30,7 @@ if __name__ == "__main__":
torch.manual_seed(args.seed)
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
best_reward = -1
# Initialize environment, agent and optimizer
aimbot_sidechannel = AimbotSideChannel(SIDE_CHANNEL_UUID);
@@ -166,18 +39,11 @@ if __name__ == "__main__":
agent = PPOAgent(
env = env,
this_args=args,
train_agent=args.train,
target_num=TARGETNUM,
target_state_size= TARGET_STATE_SIZE,
time_state_size=TIME_STATE_SIZE,
gun_state_size=GUN_STATE_SIZE,
my_state_size=MY_STATE_SIZE,
total_t_size=TOTAL_T_SIZE,
device=device,
).to(device)
else:
agent = torch.load(args.load_dir)
# freeze
# freeze
if args.freeze_viewnet:
# freeze the view network
for p in agent.viewNetwork.parameters():
@@ -185,9 +51,8 @@ if __name__ == "__main__":
print("VIEW NETWORK FREEZED")
print("Load Agent", args.load_dir)
print(agent.eval())
# optimizer
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
# Tensorboard and WandB Recorder
run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
@@ -204,34 +69,30 @@ if __name__ == "__main__":
# start the game
total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
target_steps = [0 for i in range(TARGETNUM)]
target_steps = [0 for i in range(args.target_num)]
start_time = time.time()
state, _, done = env.reset()
# initialize AI memories
ppo_memories = PPOMem(
env = env,
device = device,
args=args,
target_num = TARGETNUM,
target_state_size = TARGET_STATE_SIZE,
base_lose_reward = BASE_LOSEREWARD,
base_win_reward = BASE_WINREWARD,
unity_agent_num=env.unity_agent_num,
device = device,
)
# MAIN LOOP: run agent in environment
for total_steps in range(total_update_step):
# discount learning rate; when step == total_update_step, lr will be 0
if args.annealLR:
final_lr_ratio = TARGET_LEARNING_RATE/args.lr
final_lr_ratio = args.target_lr/args.lr
frac = 1.0 - ((total_steps + 1.0) / total_update_step)
lr_now = frac * args.lr
optimizer.param_groups[0]["lr"] = lr_now
else:
lr_now = args.lr
# episode start: show learning rate
print("new episode",total_steps,"learning rate = ",lr_now)
# MAIN LOOP: run agent in environment
step = 0
training = False
@@ -271,14 +132,15 @@ if __name__ == "__main__":
next_done = next_done,
next_state=next_state,
)
# check if any training dataset is full and ready to train
for i in range(TARGETNUM):
for i in range(args.target_num):
if ppo_memories.obs[i].size()[0] >= args.datasetSize:
# start train NN
train_queue.append(i)
if(len(train_queue)>0):
# break while loop and start train
break
# update state
state, done = next_state, next_done
else:
step += 1
@@ -299,7 +161,7 @@ if __name__ == "__main__":
next_done = next_done,
next_state=next_state,
)
# update state
state = next_state
last_reward = reward
@@ -307,137 +169,34 @@ if __name__ == "__main__":
# train mode on
mean_reward_list = [] # for WANDB
# loop over the training queue
for thisT in train_queue:
for this_train_ind in train_queue:
# start time
start_time = time.time()
target_steps[thisT]+=1
# flatten the batch
b_obs = ppo_memories.obs[thisT].reshape((-1,) + env.unity_observation_shape)
b_dis_logprobs = ppo_memories.dis_logprobs[thisT].reshape(-1)
b_con_logprobs = ppo_memories.con_logprobs[thisT].reshape(-1)
b_actions = ppo_memories.actions[thisT].reshape((-1,) + (env.unity_action_size,))
b_advantages = ppo_memories.advantages[thisT].reshape(-1)
b_returns = ppo_memories.returns[thisT].reshape(-1)
b_values = ppo_memories.values[thisT].reshape(-1)
b_size = b_obs.size()[0]
# Optimizing the policy and value network
b_inds = np.arange(b_size)
# clipfracs = []
for epoch in range(args.epochs):
print(epoch,end="")
# shuffle all datasets
np.random.shuffle(b_inds)
for start in range(0, b_size, args.minibatchSize):
print(".",end="")
end = start + args.minibatchSize
mb_inds = b_inds[start:end]
if(np.size(mb_inds)<=1):
break
mb_advantages = b_advantages[mb_inds]
# normalize advantages
if args.norm_adv:
mb_advantages = (mb_advantages - mb_advantages.mean()) / (
mb_advantages.std() + 1e-8
)
(
_,
new_dis_logprob,
dis_entropy,
new_con_logprob,
con_entropy,
newvalue,
) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
# discrete ratio
dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
dis_ratio = dis_logratio.exp()
# continuous ratio
con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
con_ratio = con_logratio.exp()
"""
# early stop
with torch.no_grad():
# calculate approx_kl http://joschu.net/blog/kl-approx.html
old_approx_kl = (-logratio).mean()
approx_kl = ((ratio - 1) - logratio).mean()
clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
"""
# discrete Policy loss
dis_pg_loss_orig = -mb_advantages * dis_ratio
dis_pg_loss_clip = -mb_advantages * torch.clamp(
dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
target_steps[this_train_ind]+=1
# train agent
(
v_loss,
dis_pg_loss,
con_pg_loss,
loss,
entropy_loss
) = agent.train_net(
this_train_ind=this_train_ind,
ppo_memories=ppo_memories,
optimizer=optimizer
)
dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
# continuous Policy loss
con_pg_loss_orig = -mb_advantages * con_ratio
con_pg_loss_clip = -mb_advantages * torch.clamp(
con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
)
con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
# Value loss
newvalue = newvalue.view(-1)
if args.clip_vloss:
v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
v_clipped = b_values[mb_inds] + torch.clamp(
newvalue - b_values[mb_inds],
-args.clip_coef,
args.clip_coef,
)
v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
v_loss = 0.5 * v_loss_max.mean()
else:
v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
# total loss
entropy_loss = dis_entropy.mean() + con_entropy.mean()
loss = (
dis_pg_loss * POLICY_COEF[thisT]
+ con_pg_loss * POLICY_COEF[thisT]
+ entropy_loss * ENTROPY_COEF[thisT]
+ v_loss * CRITIC_COEF[thisT]
)*LOSS_COEF[thisT]
if(torch.isnan(loss).any()):
print("LOSS Include NAN!!!")
if(torch.isnan(dis_pg_loss).any()):
print("dis_pg_loss include nan")
if(torch.isnan(con_pg_loss).any()):
print("con_pg_loss include nan")
if(torch.isnan(entropy_loss).any()):
print("entropy_loss include nan")
if(torch.isnan(v_loss).any()):
print("v_loss include nan")
raise ValueError("loss contains NaN")
optimizer.zero_grad()
loss.backward()
# Clips gradient norm of an iterable of parameters.
nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
optimizer.step()
"""
if args.target_kl is not None:
if approx_kl > args.target_kl:
break
"""
# record mean reward before clear history
print("done")
targetRewardMean = np.mean(ppo_memories.rewards[thisT].to("cpu").detach().numpy().copy())
targetRewardMean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
mean_reward_list.append(targetRewardMean)
targetName = Targets(thisT).name
targetName = Targets(this_train_ind).name
# clear this target's training set buffer
ppo_memories.clear_training_datasets(thisT)
ppo_memories.clear_training_datasets(this_train_ind)
# record rewards for plotting purposes
wdb_recorder.add_target_scalar(
targetName,
thisT,
this_train_ind,
v_loss,
dis_pg_loss,
con_pg_loss,
@@ -464,19 +223,19 @@ if __name__ == "__main__":
# train mode off
mean_reward_list = [] # for WANDB
# while not in training mode, clear the buffer
for thisT in train_queue:
target_steps[thisT]+=1
targetName = Targets(thisT).name
targetRewardMean = np.mean(ppo_memories.rewards[thisT].to("cpu").detach().numpy().copy())
for this_train_ind in train_queue:
target_steps[this_train_ind]+=1
targetName = Targets(this_train_ind).name
targetRewardMean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
mean_reward_list.append(targetRewardMean)
print(target_steps[thisT])
print(target_steps[this_train_ind])
# clear this target's training set buffer
ppo_memories.clear_training_datasets(thisT)
ppo_memories.clear_training_datasets(this_train_ind)
# record rewards for plotting purposes
wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
wdb_recorder.add_win_ratio(targetName,target_steps[thisT])
wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[this_train_ind])
wdb_recorder.add_win_ratio(targetName,target_steps[this_train_ind])
print(f"episode over Target{targetName} mean reward:", targetRewardMean)
TotalRewardMean = np.mean(mean_reward_list)
wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)

View File

@@ -1,7 +1,6 @@
import torch
import numpy as np
import argparse
from aimbotEnv import Aimbot
from ppoagent import PPOAgent
from enum import Enum
@@ -16,42 +15,39 @@ class Targets(Enum):
class PPOMem:
def __init__(
self,
env: Aimbot,
args: argparse.Namespace,
unity_agent_num: int,
device: torch.device,
target_num: int,
target_state_size: int,
base_lose_reward: int,
base_win_reward: int,
) -> None:
self.target_num = args.target_num
self.data_set_size = args.datasetSize
self.result_broadcast_ratio = args.result_broadcast_ratio
self.decision_period = args.decision_period
self.unity_agent_num = env.unity_agent_num
self.unity_agent_num = unity_agent_num
self.base_lose_reward = base_lose_reward
self.base_win_reward = base_win_reward
self.target_state_size = target_state_size
self.base_lose_reward = args.base_lose_reward
self.base_win_reward = args.base_win_reward
self.target_state_size = args.target_state_size
self.device = device
# Trajectory Buffer
self.ob_bf = [[] for i in range(env.unity_agent_num)]
self.act_bf = [[] for i in range(env.unity_agent_num)]
self.dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
self.con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
self.rewards_bf = [[] for i in range(env.unity_agent_num)]
self.dones_bf = [[] for i in range(env.unity_agent_num)]
self.values_bf = [[] for i in range(env.unity_agent_num)]
self.ob_bf = [[] for i in range(self.unity_agent_num)]
self.act_bf = [[] for i in range(self.unity_agent_num)]
self.dis_logprobs_bf = [[] for i in range(self.unity_agent_num)]
self.con_logprobs_bf = [[] for i in range(self.unity_agent_num)]
self.rewards_bf = [[] for i in range(self.unity_agent_num)]
self.dones_bf = [[] for i in range(self.unity_agent_num)]
self.values_bf = [[] for i in range(self.unity_agent_num)]
# initialize empty training datasets
self.obs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,env.unity_observation_size)
self.actions = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,env.unity_action_size)
self.dis_logprobs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
self.con_logprobs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
self.rewards = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
self.values = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
self.advantages = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
self.returns = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
self.obs = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,env.unity_observation_size)
self.actions = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,env.unity_action_size)
self.dis_logprobs = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
self.con_logprobs = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
self.rewards = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
self.values = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
self.advantages = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
self.returns = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
def broad_cast_end_reward(self, rewardBF: list, remainTime: float) -> torch.Tensor:
thisRewardBF = rewardBF.copy()

View File

@@ -0,0 +1,154 @@
import argparse
import uuid
from distutils.util import strtobool
DEFAULT_SEED = 9331
ENV_PATH = "../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv"
WAND_ENTITY = "koha9"
WORKER_ID = 1
BASE_PORT = 1000
# tensorboard names
GAME_NAME = "Aimbot_Target_Hybrid_PMNN_V3"
GAME_TYPE = "Mix_Verification"
# max round steps per agent is 2500/Decision_period, 25 seconds
TOTAL_STEPS = 3150000
BATCH_SIZE = 512
MAX_TRAINNING_DATASETS = 6000
DECISION_PERIOD = 1
LEARNING_RATE = 6.5e-4
GAMMA = 0.99
GAE_LAMBDA = 0.95
EPOCHS = 3
CLIP_COEF = 0.11
LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
TARGET_LEARNING_RATE = 1e-6
FREEZE_VIEW_NETWORK = False
BROADCASTREWARD = False
ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = False
TRAIN = True
SAVE_MODEL = True
WANDB_TACK = True
LOAD_DIR = None
#LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt"
# Unity Environment Parameters
TARGET_STATE_SIZE = 6
INAREA_STATE_SIZE = 1
TIME_STATE_SIZE = 1
GUN_STATE_SIZE = 1
MY_STATE_SIZE = 4
TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
BASE_WINREWARD = 999
BASE_LOSEREWARD = -999
TARGETNUM= 4
ENV_TIMELIMIT = 30
RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
def parse_args():
# fmt: off
# pytorch and environment parameters
parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
help="seed of the experiment")
parser.add_argument("--path", type=str, default=ENV_PATH,
help="enviroment path")
parser.add_argument("--workerID", type=int, default=WORKER_ID,
help="unity worker ID")
parser.add_argument("--baseport", type=int, default=BASE_PORT,
help="port to connect to Unity environment")
parser.add_argument("--lr", type=float, default=LEARNING_RATE,
help="the default learning rate of optimizer")
parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="if toggled, cuda will be enabled by default")
parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
help="total timesteps of the experiments")
# model parameters
parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
help="Train Model or not")
parser.add_argument("--freeze-viewnet", type=lambda x: bool(strtobool(x)), default=FREEZE_VIEW_NETWORK, nargs="?", const=True,
help="freeze view network or not")
parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
help="training dataset size,start training while dataset collect enough data")
parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
help="nimi batch size")
parser.add_argument("--epochs", type=int, default=EPOCHS,
help="the K epochs to update the policy")
parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
help="Toggle learning rate annealing for policy and value networks")
parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True,
help="track on the wandb")
parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=SAVE_MODEL, nargs="?", const=True,
help="save model or not")
parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY,
help="the entity (team) of wandb's project")
parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
help="load model directory")
parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
help="the number of steps to run in each environment per policy rollout")
parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
parser.add_argument("--broadCastEndReward", type=lambda x: bool(strtobool(x)), default=BROADCASTREWARD, nargs="?", const=True,
help="save model or not")
# target_learning_rate
parser.add_argument("--target-lr", type=float, default=TARGET_LEARNING_RATE,
help="target value of downscaling the learning rate")
# POLICY_COEF ENTROPY_COEF CRITIC_COEF LOSS_COEF
parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
help="coefficient of the policy loss")
parser.add_argument("--entropy-coef", type=float, default=ENTROPY_COEF,
help="coefficient of the entropy loss")
parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
help="coefficient of the critic loss")
parser.add_argument("--loss-coef", type=float, default=LOSS_COEF,
help="coefficient of the total loss")
# GAE loss
parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="Use GAE for advantage computation")
parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
help="Toggles advantages normalization")
parser.add_argument("--gamma", type=float, default=GAMMA,
help="the discount factor gamma")
parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
help="the lambda for the general advantage estimation")
parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
help="the surrogate clipping coefficient")
parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True,
help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
parser.add_argument("--max-grad-norm", type=float, default=0.5,
help="the maximum norm for the gradient clipping")
parser.add_argument("--target-kl", type=float, default=None,
help="the target KL divergence threshold")
# environment parameters
parser.add_argument("--target-num", type=int, default=TARGETNUM,
help="the number of targets")
parser.add_argument("--env-timelimit", type=int, default=ENV_TIMELIMIT,
help="the time limit of each round")
parser.add_argument("--base-win-reward", type=int, default=BASE_WINREWARD,
help="the base reward of win round")
parser.add_argument("--base-lose-reward", type=int, default=BASE_LOSEREWARD,
help="the base reward of lose round")
parser.add_argument("--target-state-size", type=int, default=TARGET_STATE_SIZE,
help="the size of target state")
parser.add_argument("--time-state-size", type=int, default=TIME_STATE_SIZE,
help="the size of time state")
parser.add_argument("--gun-state-size", type=int, default=GUN_STATE_SIZE,
help="the size of gun state")
parser.add_argument("--my-state-size", type=int, default=MY_STATE_SIZE,
help="the size of my state")
parser.add_argument("--total-target-size", type=int, default=TOTAL_T_SIZE,
help="the size of total target state")
# fmt: on
args = parser.parse_args()
return args

View File

@@ -1,6 +1,7 @@
import numpy as np
import torch
import argparse
import time
from torch import nn
from aimbotEnv import Aimbot
@@ -19,123 +20,118 @@ class PPOAgent(nn.Module):
self,
env: Aimbot,
this_args:argparse.Namespace,
train_agent: bool,
target_num: int,
target_state_size: int,
time_state_size: int,
gun_state_size: int,
my_state_size: int,
total_t_size: int,
device: torch.device,
):
super(PPOAgent, self).__init__()
self.device = device
self.args = this_args
self.trainAgent = train_agent
self.targetNum = target_num
self.stateSize = env.unity_observation_shape[0]
self.agentNum = env.unity_agent_num
self.targetSize = target_state_size
self.timeSize = time_state_size
self.gunSize = gun_state_size
self.myStateSize = my_state_size
self.raySize = env.unity_observation_shape[0] - total_t_size
self.nonRaySize = total_t_size
self.train_agent = self.args.train
self.target_num = self.args.target_num
self.unity_observation_shape = env.unity_observation_shape
self.unity_action_size = env.unity_action_size
self.state_size = self.unity_observation_shape[0]
self.agent_num = env.unity_agent_num
self.target_size = self.args.target_state_size
self.time_state_size = self.args.time_state_size
self.gun_state_size = self.args.gun_state_size
self.my_state_size = self.args.my_state_size
self.ray_state_size = env.unity_observation_shape[0] - self.args.total_target_size
self.state_size_without_ray = self.args.total_target_size
self.head_input_size = (
env.unity_observation_shape[0] - self.targetSize - self.timeSize - self.gunSize
env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size
) # except target state input
self.unityDiscreteType = env.unity_discrete_type
self.unity_discrete_type = env.unity_discrete_type
self.discrete_size = env.unity_discrete_size
self.discrete_shape = list(env.unity_discrete_branches)
self.continuous_size = env.unity_continuous_size
self.viewNetwork = nn.Sequential(layer_init(nn.Linear(self.raySize, 200)), nn.LeakyReLU())
self.targetNetworks = nn.ModuleList(
self.view_network = nn.Sequential(layer_init(nn.Linear(self.ray_state_size, 200)), nn.LeakyReLU())
self.target_networks = nn.ModuleList(
[
nn.Sequential(layer_init(nn.Linear(self.nonRaySize, 100)), nn.LeakyReLU())
for i in range(target_num)
nn.Sequential(layer_init(nn.Linear(self.state_size_without_ray, 100)), nn.LeakyReLU())
for i in range(self.target_num)
]
)
self.middleNetworks = nn.ModuleList(
self.middle_networks = nn.ModuleList(
[
nn.Sequential(layer_init(nn.Linear(300, 200)), nn.LeakyReLU())
for i in range(target_num)
for i in range(self.target_num)
]
)
self.actor_dis = nn.ModuleList(
[layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(target_num)]
[layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(self.target_num)]
)
self.actor_mean = nn.ModuleList(
[layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(target_num)]
[layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(self.target_num)]
)
# self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
# self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
self.actor_logstd = nn.ParameterList(
[nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(target_num)]
[nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
) # nn.Parameter(torch.zeros(1, self.continuous_size))
self.critic = nn.ModuleList(
[layer_init(nn.Linear(200, 1), std=1) for i in range(target_num)]
[layer_init(nn.Linear(200, 1), std=1) for i in range(self.target_num)]
)
def get_value(self, state: torch.Tensor):
target = state[:, 0].to(torch.int32) # int
thisStateNum = target.size()[0]
viewInput = state[:, -self.raySize :] # all ray input
targetInput = state[:, : self.nonRaySize]
viewLayer = self.viewNetwork(viewInput)
targetLayer = torch.stack(
[self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)]
this_state_num = target.size()[0]
view_input = state[:, -self.ray_state_size :] # all ray input
target_input = state[:, : self.state_size_without_ray]
view_layer = self.view_network(view_input)
target_layer = torch.stack(
[self.target_networks[target[i]](target_input[i]) for i in range(this_state_num)]
)
middleInput = torch.cat([viewLayer, targetLayer], dim=1)
middleLayer = torch.stack(
[self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)]
middle_input = torch.cat([view_layer, target_layer], dim=1)
middle_layer = torch.stack(
[self.middle_networks[target[i]](middle_input[i]) for i in range(this_state_num)]
)
criticV = torch.stack(
[self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]
[self.critic[target[i]](middle_layer[i]) for i in range(this_state_num)]
) # self.critic
return criticV
def get_actions_value(self, state: torch.Tensor, actions=None):
target = state[:, 0].to(torch.int32) # int
thisStateNum = target.size()[0]
viewInput = state[:, -self.raySize :] # all ray input
targetInput = state[:, : self.nonRaySize]
viewLayer = self.viewNetwork(viewInput)
targetLayer = torch.stack(
[self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)]
this_state_num = target.size()[0]
view_input = state[:, -self.ray_state_size :] # all ray input
target_input = state[:, : self.state_size_without_ray]
view_layer = self.view_network(view_input)
target_layer = torch.stack(
[self.target_networks[target[i]](target_input[i]) for i in range(this_state_num)]
)
middleInput = torch.cat([viewLayer, targetLayer], dim=1)
middleLayer = torch.stack(
[self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)]
middle_input = torch.cat([view_layer, target_layer], dim=1)
middle_layer = torch.stack(
[self.middle_networks[target[i]](middle_input[i]) for i in range(this_state_num)]
)
# discrete
# iterate over the targets (i.e. the agent count) so each agent's output is computed by the output network matching its target
dis_logits = torch.stack(
[self.actor_dis[target[i]](middleLayer[i]) for i in range(thisStateNum)]
[self.actor_dis[target[i]](middle_layer[i]) for i in range(this_state_num)]
)
split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
# continuous
actions_mean = torch.stack(
[self.actor_mean[target[i]](middleLayer[i]) for i in range(thisStateNum)]
[self.actor_mean[target[i]](middle_layer[i]) for i in range(this_state_num)]
) # self.actor_mean(hidden)
# action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_logstd(hidden)
# action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean)
action_logstd = torch.stack(
[torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(thisStateNum)]
[torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(this_state_num)]
)
# print(action_logstd)
action_std = torch.exp(action_logstd) # torch.exp(action_logstd)
con_probs = Normal(actions_mean, action_std)
# critic
criticV = torch.stack(
[self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]
[self.critic[target[i]](middle_layer[i]) for i in range(this_state_num)]
) # self.critic
if actions is None:
if self.trainAgent:
if self.train_agent:
# select actions base on probability distribution model
disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
conAct = con_probs.sample()
@@ -148,8 +144,8 @@ class PPOAgent(nn.Module):
conAct = con_probs.sample()
actions = torch.cat([disAct.T, conAct], dim=1)
else:
disAct = actions[:, 0 : self.unityDiscreteType].T
conAct = actions[:, self.unityDiscreteType :]
disAct = actions[:, 0 : self.unity_discrete_type].T
conAct = actions[:, self.unity_discrete_type :]
dis_log_prob = torch.stack(
[ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
)
@@ -162,6 +158,123 @@ class PPOAgent(nn.Module):
con_probs.entropy().sum(1),
criticV,
)
def train_net(self, this_train_ind:int,ppo_memories,optimizer) -> tuple:
start_time = time.time()
# flatten the batch
b_obs = ppo_memories.obs[this_train_ind].reshape((-1,) + self.unity_observation_shape)
b_dis_logprobs = ppo_memories.dis_logprobs[this_train_ind].reshape(-1)
b_con_logprobs = ppo_memories.con_logprobs[this_train_ind].reshape(-1)
b_actions = ppo_memories.actions[this_train_ind].reshape((-1,) + (self.unity_action_size,))
b_advantages = ppo_memories.advantages[this_train_ind].reshape(-1)
b_returns = ppo_memories.returns[this_train_ind].reshape(-1)
b_values = ppo_memories.values[this_train_ind].reshape(-1)
b_size = b_obs.size()[0]
# optimizing the policy and value network
b_inds = np.arange(b_size)
for epoch in range(self.args.epochs):
print("epoch:",epoch,end="")
# shuffle all datasets
np.random.shuffle(b_inds)
for start in range(0, b_size, self.args.minibatchSize):
print(".",end="")
end = start + self.args.minibatchSize
mb_inds = b_inds[start:end]
if(np.size(mb_inds)<=1):
break
mb_advantages = b_advantages[mb_inds]
# normalize advantages
if self.args.norm_adv:
mb_advantages = (mb_advantages - mb_advantages.mean()) / (
mb_advantages.std() + 1e-8
)
(
_,
new_dis_logprob,
dis_entropy,
new_con_logprob,
con_entropy,
newvalue,
) = self.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
# discrete ratio
dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
dis_ratio = dis_logratio.exp()
# continuous ratio
con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
con_ratio = con_logratio.exp()
"""
# early stop
with torch.no_grad():
# calculate approx_kl http://joschu.net/blog/kl-approx.html
old_approx_kl = (-logratio).mean()
approx_kl = ((ratio - 1) - logratio).mean()
clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
"""
# discrete Policy loss
dis_pg_loss_orig = -mb_advantages * dis_ratio
dis_pg_loss_clip = -mb_advantages * torch.clamp(
dis_ratio, 1 - self.args.clip_coef, 1 + self.args.clip_coef
)
dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
# continuous Policy loss
con_pg_loss_orig = -mb_advantages * con_ratio
con_pg_loss_clip = -mb_advantages * torch.clamp(
con_ratio, 1 - self.args.clip_coef, 1 + self.args.clip_coef
)
con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
# Value loss
newvalue = newvalue.view(-1)
if self.args.clip_vloss:
v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
v_clipped = b_values[mb_inds] + torch.clamp(
newvalue - b_values[mb_inds],
-self.args.clip_coef,
self.args.clip_coef,
)
v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
v_loss = 0.5 * v_loss_max.mean()
else:
v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
# total loss
entropy_loss = dis_entropy.mean() + con_entropy.mean()
loss = (
dis_pg_loss * self.args.policy_coef[this_train_ind]
+ con_pg_loss * self.args.policy_coef[this_train_ind]
+ entropy_loss * self.args.entropy_coef[this_train_ind]
+ v_loss * self.args.critic_coef[this_train_ind]
)*self.args.loss_coef[this_train_ind]
if(torch.isnan(loss).any()):
print("LOSS Include NAN!!!")
if(torch.isnan(dis_pg_loss).any()):
print("dis_pg_loss include nan")
if(torch.isnan(con_pg_loss).any()):
print("con_pg_loss include nan")
if(torch.isnan(entropy_loss).any()):
print("entropy_loss include nan")
if(torch.isnan(v_loss).any()):
print("v_loss include nan")
raise ValueError("loss contains NaN")
optimizer.zero_grad()
loss.backward()
# Clips gradient norm of an iterable of parameters.
nn.utils.clip_grad_norm_(self.parameters(), self.args.max_grad_norm)
optimizer.step()
"""
if args.target_kl is not None:
if approx_kl > args.target_kl:
break
"""
return (v_loss,dis_pg_loss,con_pg_loss,loss,entropy_loss)
def gae(
self,

Binary file not shown.