Code cleanup
Split args into a separate module and standardize naming
parent ef0ee495f2
commit efb5c61f0d
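For orientation: this commit moves every hyperparameter constant and the argparse setup out of the training entry point into the new Aimbot-PPO-Python/Pytorch/arguments.py, renames camelCase attributes to snake_case, and folds the inline PPO update loop into PPOAgent.train_net(). A minimal sketch of how the entry point is presumably wired after the change (reconstructed from the added lines below, not copied verbatim from the repository):

import random
import torch

from arguments import parse_args   # new module introduced by this commit

if __name__ == "__main__":
    args = parse_args()             # defaults now live in arguments.py, overridable from the CLI
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")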
@ -185,18 +185,27 @@
"metadata": {},
"outputs": [
{
- "name": "stdout",
+ "name": "stderr",
"output_type": "stream",
"text": [
- "(1.2, 3.2)\n",
+ "Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n",
- "1.2\n"
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mkoha9\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
- "aaa = (1.2,3.2)\n",
+ "import wandb\n",
- "print(aaa)\n",
+ "wandb.login()"
- "print(aaa[0])"
]
}
],
@ -1,156 +1,28 @@
- import argparse
import time
import numpy as np
import random
import uuid
import torch
- import torch.nn as nn
- import torch.optim as optim
import atexit


from aimbotEnv import Aimbot
from aimbotEnv import AimbotSideChannel
from ppoagent import PPOAgent
from airecorder import WandbRecorder
from aimemory import PPOMem
from aimemory import Targets
- from enum import Enum
+ from arguments import parse_args
- from distutils.util import strtobool
+ import torch.optim as optim

- best_reward = -1
+ # side channel uuid

- DEFAULT_SEED = 9331
- ENV_PATH = "../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv"
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
- WAND_ENTITY = "koha9"
- WORKER_ID = 1
- BASE_PORT = 1000

# tensorboard names
- GAME_NAME = "Aimbot_Target_Hybrid_PMNN_V3"
+ GAME_NAME = "Aimbot_Hybrid_V3"
GAME_TYPE = "Mix_Verification"

- # max round steps per agent is 2500/Decision_period, 25 seconds
- # !!!check every parameters before run!!!

- TOTAL_STEPS = 3150000
- BATCH_SIZE = 512
- MAX_TRAINNING_DATASETS = 6000
- DECISION_PERIOD = 1
- LEARNING_RATE = 6.5e-4
- GAMMA = 0.99
- GAE_LAMBDA = 0.95
- EPOCHS = 3
- CLIP_COEF = 0.11
- LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
- POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
- ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
- CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
- TARGET_LEARNING_RATE = 1e-6
- FREEZE_VIEW_NETWORK = True

- BROADCASTREWARD = False
- ANNEAL_LEARNING_RATE = True
- CLIP_VLOSS = True
- NORM_ADV = False
- TRAIN = True
- SAVE_MODEL = False
- WANDB_TACK = False
- LOAD_DIR = None
- #LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt"

- TARGET_STATE_SIZE = 6
- INAREA_STATE_SIZE = 1
- TIME_STATE_SIZE = 1
- GUN_STATE_SIZE = 1
- MY_STATE_SIZE = 4
- TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
- BASE_WINREWARD = 999
- BASE_LOSEREWARD = -999
- TARGETNUM= 4
- ENV_TIMELIMIT = 30
- RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT

# !!!SPECIAL PARAMETERS!!!
- # change it while program is finished
using_targets_num = 3


- def parse_args():
- # fmt: off
- # pytorch and environment parameters
- parser = argparse.ArgumentParser()
- parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
- help="seed of the experiment")
- parser.add_argument("--path", type=str, default=ENV_PATH,
- help="enviroment path")
- parser.add_argument("--workerID", type=int, default=WORKER_ID,
- help="unity worker ID")
- parser.add_argument("--baseport", type=int, default=BASE_PORT,
- help="port to connect to Unity environment")
- parser.add_argument("--lr", type=float, default=LEARNING_RATE,
- help="the learning rate of optimizer")
- parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
- help="if toggled, cuda will be enabled by default")
- parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
- help="total timesteps of the experiments")

- # model parameters
- parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
- help="Train Model or not")
- parser.add_argument("--freeze-viewnet", type=lambda x: bool(strtobool(x)), default=FREEZE_VIEW_NETWORK, nargs="?", const=True,
- help="freeze view network or not")
- parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
- help="training dataset size,start training while dataset collect enough data")
- parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
- help="nimi batch size")
- parser.add_argument("--epochs", type=int, default=EPOCHS,
- help="the K epochs to update the policy")
- parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
- help="Toggle learning rate annealing for policy and value networks")
- parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True,
- help="track on the wandb")
- parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=SAVE_MODEL, nargs="?", const=True,
- help="save model or not")
- parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY,
- help="the entity (team) of wandb's project")
- parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
- help="load model directory")
- parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
- help="the number of steps to run in each environment per policy rollout")
- parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
- help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
- parser.add_argument("--broadCastEndReward", type=lambda x: bool(strtobool(x)), default=BROADCASTREWARD, nargs="?", const=True,
- help="save model or not")
- # GAE loss
- parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
- help="Use GAE for advantage computation")
- parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
- help="Toggles advantages normalization")
- parser.add_argument("--gamma", type=float, default=GAMMA,
- help="the discount factor gamma")
- parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
- help="the lambda for the general advantage estimation")
- parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
- help="the surrogate clipping coefficient")
- parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
- help="coefficient of the policy")
- parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF,
- help="coefficient of the entropy")
- parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
- help="coefficient of the value function")
- parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True,
- help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
- parser.add_argument("--max-grad-norm", type=float, default=0.5,
- help="the maximum norm for the gradient clipping")
- parser.add_argument("--target-kl", type=float, default=None,
- help="the target KL divergence threshold")
- # fmt: on
- args = parser.parse_args()
- return args


if __name__ == "__main__":
args = parse_args()
random.seed(args.seed)
@ -158,6 +30,7 @@ if __name__ == "__main__":
torch.manual_seed(args.seed)

device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
+ best_reward = -1

# Initialize environment anget optimizer
aimbot_sidechannel = AimbotSideChannel(SIDE_CHANNEL_UUID);
@ -166,18 +39,11 @@
agent = PPOAgent(
env = env,
this_args=args,
- train_agent=args.train,
- target_num=TARGETNUM,
- target_state_size= TARGET_STATE_SIZE,
- time_state_size=TIME_STATE_SIZE,
- gun_state_size=GUN_STATE_SIZE,
- my_state_size=MY_STATE_SIZE,
- total_t_size=TOTAL_T_SIZE,
device=device,
).to(device)
else:
agent = torch.load(args.load_dir)
# freeze
if args.freeze_viewnet:
# freeze the view network
for p in agent.viewNetwork.parameters():
@ -185,9 +51,8 @@
print("VIEW NETWORK FREEZED")
print("Load Agent", args.load_dir)
print(agent.eval())
+ # optimizer
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)

# Tensorboard and WandB Recorder
run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
@ -204,34 +69,30 @@

# start the game
total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
- target_steps = [0 for i in range(TARGETNUM)]
+ target_steps = [0 for i in range(args.target_num)]
start_time = time.time()
state, _, done = env.reset()

# initialize AI memories
ppo_memories = PPOMem(
- env = env,
- device = device,
args=args,
- target_num = TARGETNUM,
+ unity_agent_num=env.unity_agent_num,
- target_state_size = TARGET_STATE_SIZE,
+ device = device,
- base_lose_reward = BASE_LOSEREWARD,
- base_win_reward = BASE_WINREWARD,
)

+ # MAIN LOOP: run agent in environment
for total_steps in range(total_update_step):
# discunt learning rate, while step == total_update_step lr will be 0

if args.annealLR:
- final_lr_ratio = TARGET_LEARNING_RATE/args.lr
+ final_lr_ratio = args.target_lr/args.lr
frac = 1.0 - ((total_steps + 1.0) / total_update_step)
lr_now = frac * args.lr
optimizer.param_groups[0]["lr"] = lr_now
else:
lr_now = args.lr

+ # episode start show learning rate
print("new episode",total_steps,"learning rate = ",lr_now)


# MAIN LOOP: run agent in environment
step = 0
training = False
@ -271,14 +132,15 @@
next_done = next_done,
next_state=next_state,
)

# check if any training dataset is full and ready to train
- for i in range(TARGETNUM):
+ for i in range(args.target_num):
if ppo_memories.obs[i].size()[0] >= args.datasetSize:
# start train NN
train_queue.append(i)
if(len(train_queue)>0):
+ # break while loop and start train
break
+ # update state
state, done = next_state, next_done
else:
step += 1
@ -299,7 +161,7 @@
next_done = next_done,
next_state=next_state,
)
+ # update state
state = next_state
last_reward = reward

@ -307,137 +169,34 @@
# train mode on
mean_reward_list = [] # for WANDB
# loop all tarining queue
- for thisT in train_queue:
+ for this_train_ind in train_queue:
# sart time
start_time = time.time()
- target_steps[thisT]+=1
+ target_steps[this_train_ind]+=1
- # flatten the batch
+ # train agent
- b_obs = ppo_memories.obs[thisT].reshape((-1,) + env.unity_observation_shape)
+ (
- b_dis_logprobs = ppo_memories.dis_logprobs[thisT].reshape(-1)
+ v_loss,
- b_con_logprobs = ppo_memories.con_logprobs[thisT].reshape(-1)
+ dis_pg_loss,
- b_actions = ppo_memories.actions[thisT].reshape((-1,) + (env.unity_action_size,))
+ con_pg_loss,
- b_advantages = ppo_memories.advantages[thisT].reshape(-1)
+ loss,
- b_returns = ppo_memories.returns[thisT].reshape(-1)
+ entropy_loss
- b_values = ppo_memories.values[thisT].reshape(-1)
+ ) = agent.train_net(
- b_size = b_obs.size()[0]
+ this_train_ind=this_train_ind,
- # Optimizing the policy and value network
+ ppo_memories=ppo_memories,
- b_inds = np.arange(b_size)
+ optimizer=optimizer
- # clipfracs = []
- for epoch in range(args.epochs):
- print(epoch,end="")
- # shuffle all datasets
- np.random.shuffle(b_inds)
- for start in range(0, b_size, args.minibatchSize):
- print(".",end="")
- end = start + args.minibatchSize
- mb_inds = b_inds[start:end]
- if(np.size(mb_inds)<=1):
- break
- mb_advantages = b_advantages[mb_inds]

- # normalize advantages
- if args.norm_adv:
- mb_advantages = (mb_advantages - mb_advantages.mean()) / (
- mb_advantages.std() + 1e-8
- )

- (
- _,
- new_dis_logprob,
- dis_entropy,
- new_con_logprob,
- con_entropy,
- newvalue,
- ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
- # discrete ratio
- dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
- dis_ratio = dis_logratio.exp()
- # continuous ratio
- con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
- con_ratio = con_logratio.exp()

- """
- # early stop
- with torch.no_grad():
- # calculate approx_kl http://joschu.net/blog/kl-approx.html
- old_approx_kl = (-logratio).mean()
- approx_kl = ((ratio - 1) - logratio).mean()
- clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
- """

- # discrete Policy loss
- dis_pg_loss_orig = -mb_advantages * dis_ratio
- dis_pg_loss_clip = -mb_advantages * torch.clamp(
- dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
)
- dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
- # continuous Policy loss
- con_pg_loss_orig = -mb_advantages * con_ratio
- con_pg_loss_clip = -mb_advantages * torch.clamp(
- con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
- )
- con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()

- # Value loss
- newvalue = newvalue.view(-1)
- if args.clip_vloss:
- v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
- v_clipped = b_values[mb_inds] + torch.clamp(
- newvalue - b_values[mb_inds],
- -args.clip_coef,
- args.clip_coef,
- )
- v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
- v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
- v_loss = 0.5 * v_loss_max.mean()
- else:
- v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

- # total loss
- entropy_loss = dis_entropy.mean() + con_entropy.mean()
- loss = (
- dis_pg_loss * POLICY_COEF[thisT]
- + con_pg_loss * POLICY_COEF[thisT]
- + entropy_loss * ENTROPY_COEF[thisT]
- + v_loss * CRITIC_COEF[thisT]
- )*LOSS_COEF[thisT]

- if(torch.isnan(loss).any()):
- print("LOSS Include NAN!!!")
- if(torch.isnan(dis_pg_loss.any())):
- print("dis_pg_loss include nan")
- if(torch.isnan(con_pg_loss.any())):
- print("con_pg_loss include nan")
- if(torch.isnan(entropy_loss.any())):
- print("entropy_loss include nan")
- if(torch.isnan(v_loss.any())):
- print("v_loss include nan")
- raise

- optimizer.zero_grad()
- loss.backward()
- # Clips gradient norm of an iterable of parameters.
- nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
- optimizer.step()

- """
- if args.target_kl is not None:
- if approx_kl > args.target_kl:
- break
- """
# record mean reward before clear history
print("done")
- targetRewardMean = np.mean(ppo_memories.rewards[thisT].to("cpu").detach().numpy().copy())
+ targetRewardMean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
mean_reward_list.append(targetRewardMean)
- targetName = Targets(thisT).name
+ targetName = Targets(this_train_ind).name

# clear this target trainning set buffer
- ppo_memories.clear_training_datasets(thisT)
+ ppo_memories.clear_training_datasets(this_train_ind)

# record rewards for plotting purposes
wdb_recorder.add_target_scalar(
targetName,
- thisT,
+ this_train_ind,
v_loss,
dis_pg_loss,
con_pg_loss,
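Taken together, the '+' lines above reduce the per-target update to a single call into the agent. A sketch of the resulting loop body, assembled from the added lines (the surrounding control flow is assumed to match the unchanged context):

for this_train_ind in train_queue:
    target_steps[this_train_ind] += 1
    # train agent: the minibatch/epoch loop now lives in PPOAgent.train_net
    (v_loss, dis_pg_loss, con_pg_loss, loss, entropy_loss) = agent.train_net(
        this_train_ind=this_train_ind,
        ppo_memories=ppo_memories,
        optimizer=optimizer,
    )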
@ -464,19 +223,19 @@
# train mode off
mean_reward_list = [] # for WANDB
# while not in training mode, clear the buffer
- for thisT in train_queue:
+ for this_train_ind in train_queue:
- target_steps[thisT]+=1
+ target_steps[this_train_ind]+=1
- targetName = Targets(thisT).name
+ targetName = Targets(this_train_ind).name
- targetRewardMean = np.mean(ppo_memories.rewards[thisT].to("cpu").detach().numpy().copy())
+ targetRewardMean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
mean_reward_list.append(targetRewardMean)
- print(target_steps[thisT])
+ print(target_steps[this_train_ind])

# clear this target trainning set buffer
- ppo_memories.clear_training_datasets(thisT)
+ ppo_memories.clear_training_datasets(this_train_ind)

# record rewards for plotting purposes
- wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
+ wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[this_train_ind])
- wdb_recorder.add_win_ratio(targetName,target_steps[thisT])
+ wdb_recorder.add_win_ratio(targetName,target_steps[this_train_ind])
print(f"episode over Target{targetName} mean reward:", targetRewardMean)
TotalRewardMean = np.mean(mean_reward_list)
wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
@ -1,7 +1,6 @@
import torch
import numpy as np
import argparse
- from aimbotEnv import Aimbot
from ppoagent import PPOAgent
from enum import Enum

@ -16,42 +15,39 @@ class Targets(Enum):
class PPOMem:
def __init__(
self,
- env: Aimbot,
args: argparse.Namespace,
+ unity_agent_num: int,
device: torch.device,
- target_num: int,
- target_state_size: int,
- base_lose_reward: int,
- base_win_reward: int,
) -> None:
+ self.target_num = args.target_num
self.data_set_size = args.datasetSize
self.result_broadcast_ratio = args.result_broadcast_ratio
self.decision_period = args.decision_period
- self.unity_agent_num = env.unity_agent_num
+ self.unity_agent_num = unity_agent_num

- self.base_lose_reward = base_lose_reward
+ self.base_lose_reward = args.base_lose_reward
- self.base_win_reward = base_win_reward
+ self.base_win_reward = args.base_win_reward
- self.target_state_size = target_state_size
+ self.target_state_size = args.target_state_size
self.device = device

# Trajectory Buffer
- self.ob_bf = [[] for i in range(env.unity_agent_num)]
+ self.ob_bf = [[] for i in range(self.unity_agent_num)]
- self.act_bf = [[] for i in range(env.unity_agent_num)]
+ self.act_bf = [[] for i in range(self.unity_agent_num)]
- self.dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
+ self.dis_logprobs_bf = [[] for i in range(self.unity_agent_num)]
- self.con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
+ self.con_logprobs_bf = [[] for i in range(self.unity_agent_num)]
- self.rewards_bf = [[] for i in range(env.unity_agent_num)]
+ self.rewards_bf = [[] for i in range(self.unity_agent_num)]
- self.dones_bf = [[] for i in range(env.unity_agent_num)]
+ self.dones_bf = [[] for i in range(self.unity_agent_num)]
- self.values_bf = [[] for i in range(env.unity_agent_num)]
+ self.values_bf = [[] for i in range(self.unity_agent_num)]

# initialize empty training datasets
- self.obs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,env.unity_observation_size)
+ self.obs = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,env.unity_observation_size)
- self.actions = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,env.unity_action_size)
+ self.actions = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,env.unity_action_size)
- self.dis_logprobs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
+ self.dis_logprobs = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
- self.con_logprobs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
+ self.con_logprobs = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
- self.rewards = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
+ self.rewards = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
- self.values = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
+ self.values = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
- self.advantages = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
+ self.advantages = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
- self.returns = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
+ self.returns = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)

def broad_cast_end_reward(self, rewardBF: list, remainTime: float) -> torch.Tensor:
thisRewardBF = rewardBF.copy()
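With the constructor above, PPOMem no longer needs the Aimbot environment object or the individual reward and state-size arguments. A sketch of the new call site, mirroring the main-script hunk earlier in this commit:

ppo_memories = PPOMem(
    args=args,                             # sizes and end-of-round rewards are read from args
    unity_agent_num=env.unity_agent_num,   # only the agent count is taken from the environment
    device=device,
)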
Aimbot-PPO-Python/Pytorch/arguments.py (new file, 154 lines)
@ -0,0 +1,154 @@
+ import argparse
+ import uuid
+
+ from distutils.util import strtobool
+
+ DEFAULT_SEED = 9331
+ ENV_PATH = "../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv"
+ WAND_ENTITY = "koha9"
+ WORKER_ID = 1
+ BASE_PORT = 1000
+
+ # tensorboard names
+ GAME_NAME = "Aimbot_Target_Hybrid_PMNN_V3"
+ GAME_TYPE = "Mix_Verification"
+
+ # max round steps per agent is 2500/Decision_period, 25 seconds
+ TOTAL_STEPS = 3150000
+ BATCH_SIZE = 512
+ MAX_TRAINNING_DATASETS = 6000
+ DECISION_PERIOD = 1
+ LEARNING_RATE = 6.5e-4
+ GAMMA = 0.99
+ GAE_LAMBDA = 0.95
+ EPOCHS = 3
+ CLIP_COEF = 0.11
+ LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
+ POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
+ ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
+ CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
+ TARGET_LEARNING_RATE = 1e-6
+
+ FREEZE_VIEW_NETWORK = False
+ BROADCASTREWARD = False
+ ANNEAL_LEARNING_RATE = True
+ CLIP_VLOSS = True
+ NORM_ADV = False
+ TRAIN = True
+ SAVE_MODEL = True
+ WANDB_TACK = True
+ LOAD_DIR = None
+ #LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt"
+
+ # Unity Environment Parameters
+ TARGET_STATE_SIZE = 6
+ INAREA_STATE_SIZE = 1
+ TIME_STATE_SIZE = 1
+ GUN_STATE_SIZE = 1
+ MY_STATE_SIZE = 4
+ TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
+ BASE_WINREWARD = 999
+ BASE_LOSEREWARD = -999
+ TARGETNUM= 4
+ ENV_TIMELIMIT = 30
+ RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
+
+ def parse_args():
+ # fmt: off
+ # pytorch and environment parameters
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
+ help="seed of the experiment")
+ parser.add_argument("--path", type=str, default=ENV_PATH,
+ help="enviroment path")
+ parser.add_argument("--workerID", type=int, default=WORKER_ID,
+ help="unity worker ID")
+ parser.add_argument("--baseport", type=int, default=BASE_PORT,
+ help="port to connect to Unity environment")
+ parser.add_argument("--lr", type=float, default=LEARNING_RATE,
+ help="the default learning rate of optimizer")
+ parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
+ help="if toggled, cuda will be enabled by default")
+ parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
+ help="total timesteps of the experiments")
+
+ # model parameters
+ parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
+ help="Train Model or not")
+ parser.add_argument("--freeze-viewnet", type=lambda x: bool(strtobool(x)), default=FREEZE_VIEW_NETWORK, nargs="?", const=True,
+ help="freeze view network or not")
+ parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
+ help="training dataset size,start training while dataset collect enough data")
+ parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
+ help="nimi batch size")
+ parser.add_argument("--epochs", type=int, default=EPOCHS,
+ help="the K epochs to update the policy")
+ parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
+ help="Toggle learning rate annealing for policy and value networks")
+ parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True,
+ help="track on the wandb")
+ parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=SAVE_MODEL, nargs="?", const=True,
+ help="save model or not")
+ parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY,
+ help="the entity (team) of wandb's project")
+ parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
+ help="load model directory")
+ parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
+ help="the number of steps to run in each environment per policy rollout")
+ parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
+ help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
+ parser.add_argument("--broadCastEndReward", type=lambda x: bool(strtobool(x)), default=BROADCASTREWARD, nargs="?", const=True,
+ help="save model or not")
+ # target_learning_rate
+ parser.add_argument("--target-lr", type=float, default=TARGET_LEARNING_RATE,
+ help="target value of downscaling the learning rate")
+
+ # POLICY_COEF ENTROPY_COEF CRITIC_COEF LOSS_COEF
+ parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
+ help="coefficient of the policy loss")
+ parser.add_argument("--entropy-coef", type=float, default=ENTROPY_COEF,
+ help="coefficient of the entropy loss")
+ parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
+ help="coefficient of the critic loss")
+ parser.add_argument("--loss-coef", type=float, default=LOSS_COEF,
+ help="coefficient of the total loss")
+
+ # GAE loss
+ parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
+ help="Use GAE for advantage computation")
+ parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
+ help="Toggles advantages normalization")
+ parser.add_argument("--gamma", type=float, default=GAMMA,
+ help="the discount factor gamma")
+ parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
+ help="the lambda for the general advantage estimation")
+ parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
+ help="the surrogate clipping coefficient")
+ parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True,
+ help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
+ parser.add_argument("--max-grad-norm", type=float, default=0.5,
+ help="the maximum norm for the gradient clipping")
+ parser.add_argument("--target-kl", type=float, default=None,
+ help="the target KL divergence threshold")
+ # environment parameters
+ parser.add_argument("--target-num", type=int, default=TARGETNUM,
+ help="the number of targets")
+ parser.add_argument("--env-timelimit", type=int, default=ENV_TIMELIMIT,
+ help="the time limit of each round")
+ parser.add_argument("--base-win-reward", type=int, default=BASE_WINREWARD,
+ help="the base reward of win round")
+ parser.add_argument("--base-lose-reward", type=int, default=BASE_LOSEREWARD,
+ help="the base reward of lose round")
+ parser.add_argument("--target-state-size", type=int, default=TARGET_STATE_SIZE,
+ help="the size of target state")
+ parser.add_argument("--time-state-size", type=int, default=TIME_STATE_SIZE,
+ help="the size of time state")
+ parser.add_argument("--gun-state-size", type=int, default=GUN_STATE_SIZE,
+ help="the size of gun state")
+ parser.add_argument("--my-state-size", type=int, default=MY_STATE_SIZE,
+ help="the size of my state")
+ parser.add_argument("--total-target-size", type=int, default=TOTAL_T_SIZE,
+ help="the size of total target state")
+ # fmt: on
+ args = parser.parse_args()
+ return args
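A minimal usage sketch for the new arguments module; the script name and flag values shown are illustrative assumptions, not part of this commit:

# e.g. python <training_script>.py --lr 1e-4 --wandb-track True --target-num 4
from arguments import parse_args

args = parse_args()
print(args.lr, args.target_num, args.policy_coef)   # defaults come from the constants above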
@ -1,6 +1,7 @@
import numpy as np
import torch
import argparse
+ import time

from torch import nn
from aimbotEnv import Aimbot

@ -19,123 +20,118 @@ class PPOAgent(nn.Module):
self,
env: Aimbot,
this_args:argparse.Namespace,
- train_agent: bool,
- target_num: int,
- target_state_size: int,
- time_state_size: int,
- gun_state_size: int,
- my_state_size: int,
- total_t_size: int,
device: torch.device,
):
super(PPOAgent, self).__init__()
self.device = device
self.args = this_args
- self.trainAgent = train_agent
+ self.train_agent = self.args.train
- self.targetNum = target_num
+ self.target_num = self.args.target_num
- self.stateSize = env.unity_observation_shape[0]
+ self.unity_observation_shape = env.unity_observation_shape
- self.agentNum = env.unity_agent_num
+ self.unity_action_size = env.unity_action_size
- self.targetSize = target_state_size
+ self.state_size = self.unity_observation_shape[0]
- self.timeSize = time_state_size
+ self.agent_num = env.unity_agent_num
- self.gunSize = gun_state_size
+ self.target_size = self.args.target_state_size
- self.myStateSize = my_state_size
+ self.time_state_size = self.args.time_state_size
- self.raySize = env.unity_observation_shape[0] - total_t_size
+ self.gun_state_size = self.args.gun_state_size
- self.nonRaySize = total_t_size
+ self.my_state_size = self.args.my_state_size
+ self.ray_state_size = env.unity_observation_shape[0] - self.args.total_target_size
+ self.state_size_without_ray = self.args.total_target_size
self.head_input_size = (
- env.unity_observation_shape[0] - self.targetSize - self.timeSize - self.gunSize
+ env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size
) # except target state input

- self.unityDiscreteType = env.unity_discrete_type
+ self.unity_discrete_type = env.unity_discrete_type
self.discrete_size = env.unity_discrete_size
self.discrete_shape = list(env.unity_discrete_branches)
self.continuous_size = env.unity_continuous_size

- self.viewNetwork = nn.Sequential(layer_init(nn.Linear(self.raySize, 200)), nn.LeakyReLU())
+ self.view_network = nn.Sequential(layer_init(nn.Linear(self.ray_state_size, 200)), nn.LeakyReLU())
- self.targetNetworks = nn.ModuleList(
+ self.target_networks = nn.ModuleList(
[
- nn.Sequential(layer_init(nn.Linear(self.nonRaySize, 100)), nn.LeakyReLU())
+ nn.Sequential(layer_init(nn.Linear(self.state_size_without_ray, 100)), nn.LeakyReLU())
- for i in range(target_num)
+ for i in range(self.target_num)
]
)
- self.middleNetworks = nn.ModuleList(
+ self.middle_networks = nn.ModuleList(
[
nn.Sequential(layer_init(nn.Linear(300, 200)), nn.LeakyReLU())
- for i in range(target_num)
+ for i in range(self.target_num)
]
)
self.actor_dis = nn.ModuleList(
- [layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(target_num)]
+ [layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(self.target_num)]
)
self.actor_mean = nn.ModuleList(
- [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(target_num)]
+ [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(self.target_num)]
)
# self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
# self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
self.actor_logstd = nn.ParameterList(
- [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(target_num)]
+ [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
) # nn.Parameter(torch.zeros(1, self.continuous_size))
self.critic = nn.ModuleList(
- [layer_init(nn.Linear(200, 1), std=1) for i in range(target_num)]
+ [layer_init(nn.Linear(200, 1), std=1) for i in range(self.target_num)]
)

def get_value(self, state: torch.Tensor):
target = state[:, 0].to(torch.int32) # int
- thisStateNum = target.size()[0]
+ this_state_num = target.size()[0]
- viewInput = state[:, -self.raySize :] # all ray input
+ view_input = state[:, -self.ray_state_size :] # all ray input
- targetInput = state[:, : self.nonRaySize]
+ target_input = state[:, : self.state_size_without_ray]
- viewLayer = self.viewNetwork(viewInput)
+ view_layer = self.view_network(view_input)
- targetLayer = torch.stack(
+ target_layer = torch.stack(
- [self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)]
+ [self.target_networks[target[i]](target_input[i]) for i in range(this_state_num)]
)
- middleInput = torch.cat([viewLayer, targetLayer], dim=1)
+ middle_input = torch.cat([view_layer, target_layer], dim=1)
- middleLayer = torch.stack(
+ middle_layer = torch.stack(
- [self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)]
+ [self.middle_networks[target[i]](middle_input[i]) for i in range(this_state_num)]
)
criticV = torch.stack(
- [self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]
+ [self.critic[target[i]](middle_layer[i]) for i in range(this_state_num)]
) # self.critic
return criticV

def get_actions_value(self, state: torch.Tensor, actions=None):
target = state[:, 0].to(torch.int32) # int
- thisStateNum = target.size()[0]
+ this_state_num = target.size()[0]
- viewInput = state[:, -self.raySize :] # all ray input
+ view_input = state[:, -self.ray_state_size :] # all ray input
- targetInput = state[:, : self.nonRaySize]
+ target_input = state[:, : self.state_size_without_ray]
- viewLayer = self.viewNetwork(viewInput)
+ view_layer = self.view_network(view_input)
- targetLayer = torch.stack(
+ target_layer = torch.stack(
- [self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)]
+ [self.target_networks[target[i]](target_input[i]) for i in range(this_state_num)]
)
- middleInput = torch.cat([viewLayer, targetLayer], dim=1)
+ middle_input = torch.cat([view_layer, target_layer], dim=1)
- middleLayer = torch.stack(
+ middle_layer = torch.stack(
- [self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)]
+ [self.middle_networks[target[i]](middle_input[i]) for i in range(this_state_num)]
)

# discrete
# loop over the number of targets (i.e. the agent count) so that each agent's output is computed by the output network that matches its target
dis_logits = torch.stack(
- [self.actor_dis[target[i]](middleLayer[i]) for i in range(thisStateNum)]
+ [self.actor_dis[target[i]](middle_layer[i]) for i in range(this_state_num)]
)
split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
# continuous
actions_mean = torch.stack(
- [self.actor_mean[target[i]](middleLayer[i]) for i in range(thisStateNum)]
+ [self.actor_mean[target[i]](middle_layer[i]) for i in range(this_state_num)]
) # self.actor_mean(hidden)
# action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_logstd(hidden)
# action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean)
action_logstd = torch.stack(
- [torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(thisStateNum)]
+ [torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(this_state_num)]
)
# print(action_logstd)
action_std = torch.exp(action_logstd) # torch.exp(action_logstd)
con_probs = Normal(actions_mean, action_std)
# critic
criticV = torch.stack(
- [self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]
+ [self.critic[target[i]](middle_layer[i]) for i in range(this_state_num)]
) # self.critic

if actions is None:
- if self.trainAgent:
+ if self.train_agent:
# select actions base on probability distribution model
disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
conAct = con_probs.sample()
@ -148,8 +144,8 @@ class PPOAgent(nn.Module):
conAct = con_probs.sample()
actions = torch.cat([disAct.T, conAct], dim=1)
else:
- disAct = actions[:, 0 : self.unityDiscreteType].T
+ disAct = actions[:, 0 : self.unity_discrete_type].T
- conAct = actions[:, self.unityDiscreteType :]
+ conAct = actions[:, self.unity_discrete_type :]
dis_log_prob = torch.stack(
[ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
)
@ -162,6 +158,123 @@ class PPOAgent(nn.Module):
con_probs.entropy().sum(1),
criticV,
)
+ def train_net(self, this_train_ind:int,ppo_memories,optimizer) -> tuple:
+ start_time = time.time()
+ # flatten the batch
+ b_obs = ppo_memories.obs[this_train_ind].reshape((-1,) + self.unity_observation_shape)
+ b_dis_logprobs = ppo_memories.dis_logprobs[this_train_ind].reshape(-1)
+ b_con_logprobs = ppo_memories.con_logprobs[this_train_ind].reshape(-1)
+ b_actions = ppo_memories.actions[this_train_ind].reshape((-1,) + (self.unity_action_size,))
+ b_advantages = ppo_memories.advantages[this_train_ind].reshape(-1)
+ b_returns = ppo_memories.returns[this_train_ind].reshape(-1)
+ b_values = ppo_memories.values[this_train_ind].reshape(-1)
+ b_size = b_obs.size()[0]
+ # optimizing the policy and value network
+ b_inds = np.arange(b_size)
+
+ for epoch in range(self.args.epochs):
+ print("epoch:",epoch,end="")
+ # shuffle all datasets
+ np.random.shuffle(b_inds)
+ for start in range(0, b_size, self.args.minibatchSize):
+ print(".",end="")
+ end = start + self.args.minibatchSize
+ mb_inds = b_inds[start:end]
+ if(np.size(mb_inds)<=1):
+ break
+ mb_advantages = b_advantages[mb_inds]
+
+ # normalize advantages
+ if self.args.norm_adv:
+ mb_advantages = (mb_advantages - mb_advantages.mean()) / (
+ mb_advantages.std() + 1e-8
+ )
+
+ (
+ _,
+ new_dis_logprob,
+ dis_entropy,
+ new_con_logprob,
+ con_entropy,
+ newvalue,
+ ) = self.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
+ # discrete ratio
+ dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
+ dis_ratio = dis_logratio.exp()
+ # continuous ratio
+ con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
+ con_ratio = con_logratio.exp()
+
+ """
+ # early stop
+ with torch.no_grad():
+ # calculate approx_kl http://joschu.net/blog/kl-approx.html
+ old_approx_kl = (-logratio).mean()
+ approx_kl = ((ratio - 1) - logratio).mean()
+ clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
+ """
+
+ # discrete Policy loss
+ dis_pg_loss_orig = -mb_advantages * dis_ratio
+ dis_pg_loss_clip = -mb_advantages * torch.clamp(
+ dis_ratio, 1 - self.args.clip_coef, 1 + self.args.clip_coef
+ )
+ dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
+ # continuous Policy loss
+ con_pg_loss_orig = -mb_advantages * con_ratio
+ con_pg_loss_clip = -mb_advantages * torch.clamp(
+ con_ratio, 1 - self.args.clip_coef, 1 + self.args.clip_coef
+ )
+ con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
+
+ # Value loss
+ newvalue = newvalue.view(-1)
+ if self.args.clip_vloss:
+ v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
+ v_clipped = b_values[mb_inds] + torch.clamp(
+ newvalue - b_values[mb_inds],
+ -self.args.clip_coef,
+ self.args.clip_coef,
+ )
+ v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
+ v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
+ v_loss = 0.5 * v_loss_max.mean()
+ else:
+ v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
+
+ # total loss
+ entropy_loss = dis_entropy.mean() + con_entropy.mean()
+ loss = (
+ dis_pg_loss * self.args.policy_coef[this_train_ind]
+ + con_pg_loss * self.args.policy_coef[this_train_ind]
+ + entropy_loss * self.args.entropy_coef[this_train_ind]
+ + v_loss * self.args.critic_coef[this_train_ind]
+ )*self.args.loss_coef[this_train_ind]
+
+ if(torch.isnan(loss).any()):
+ print("LOSS Include NAN!!!")
+ if(torch.isnan(dis_pg_loss.any())):
+ print("dis_pg_loss include nan")
+ if(torch.isnan(con_pg_loss.any())):
+ print("con_pg_loss include nan")
+ if(torch.isnan(entropy_loss.any())):
+ print("entropy_loss include nan")
+ if(torch.isnan(v_loss.any())):
+ print("v_loss include nan")
+ raise
+
+ optimizer.zero_grad()
+ loss.backward()
+ # Clips gradient norm of an iterable of parameters.
+ nn.utils.clip_grad_norm_(self.parameters(), self.args.max_grad_norm)
+ optimizer.step()
+
+ """
+ if args.target_kl is not None:
+ if approx_kl > args.target_kl:
+ break
+ """
+ return (v_loss,dis_pg_loss,con_pg_loss,loss,entropy_loss)

def gae(
self,
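Since the constructor now pulls the target count and all state sizes from this_args, building the agent only needs the environment, the parsed args, and a device. A sketch matching the main-script hunk earlier in this commit:

agent = PPOAgent(
    env=env,            # Aimbot environment wrapper
    this_args=args,     # produced by arguments.parse_args()
    device=device,
).to(device)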
BIN Aimbot-PPO-Python/Pytorch/pytorch_run_archive.zip (new file)
Binary file not shown.