import argparse
import wandb
import time
import numpy as np
import random
import uuid
import torch
import torch.nn as nn
import torch.optim as optim
import atexit

from torchviz import make_dot, make_dot_from_trace
from AimbotEnv import Aimbot
from tqdm import tqdm
from enum import Enum
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from distutils.util import strtobool
from torch.utils.tensorboard import SummaryWriter
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.side_channel import (
    SideChannel,
    IncomingMessage,
    OutgoingMessage,
)
from typing import List

bestReward = -1

DEFAULT_SEED = 9331
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv"
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
WAND_ENTITY = "koha9"
WORKER_ID = 2
BASE_PORT = 1111

# max round steps per agent is 2500/Decision_period, 25 seconds
# !!!check every parameter before running!!!
TOTAL_STEPS = 3150000
BATCH_SIZE = 1024
MAX_TRAINNING_DATASETS = 6000
DECISION_PERIOD = 1
LEARNING_RATE = 5e-4
GAMMA = 0.99
GAE_LAMBDA = 0.95
EPOCHS = 3
CLIP_COEF = 0.11
LOSS_COEF = [1.0, 1.0, 1.0, 1.0]  # free go attack defence
POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
TARGET_LEARNING_RATE = 1e-6

FREEZE_VIEW_NETWORK = False
ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = True
TRAIN = True

SAVE_MODEL = False
WANDB_TACK = False
LOAD_DIR = None
# LOAD_DIR = "../PPO-Model/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670986948-freeonly-20/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670986948_0.7949778.pt"

# public data
class Targets(Enum):
    Free = 0
    Go = 1
    Attack = 2
    Defence = 3
    Num = 4

TARGET_STATE_SIZE = 6
INAREA_STATE_SIZE = 1
TIME_STATE_SIZE = 1
GUN_STATE_SIZE = 1
MY_STATE_SIZE = 4
TOTAL_T_SIZE = TARGET_STATE_SIZE + INAREA_STATE_SIZE + TIME_STATE_SIZE + GUN_STATE_SIZE + MY_STATE_SIZE
BASE_WINREWARD = 999
BASE_LOSEREWARD = -999
TARGETNUM = 4
ENV_TIMELIMIT = 30
RESULT_BROADCAST_RATIO = 1 / ENV_TIMELIMIT
TotalRounds = {"Free": 0, "Go": 0, "Attack": 0}
WinRounds = {"Free": 0, "Go": 0, "Attack": 0}

# !!!SPECIAL PARAMETERS!!!
# change this once the remaining targets are implemented and used in training
using_targets_num = 3


def parse_args():
    # fmt: off
    # pytorch and environment parameters
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
        help="seed of the experiment")
    parser.add_argument("--path", type=str, default=ENV_PATH,
        help="environment path")
    parser.add_argument("--workerID", type=int, default=WORKER_ID,
        help="unity worker ID")
    parser.add_argument("--baseport", type=int, default=BASE_PORT,
        help="port to connect to Unity environment")
    parser.add_argument("--lr", type=float, default=LEARNING_RATE,
        help="the learning rate of the optimizer")
    parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="if toggled, cuda will be enabled by default")
    parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
        help="total timesteps of the experiments")

    # model parameters
    parser.add_argument("--train", type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
        help="train the model or not")
    parser.add_argument("--freeze-viewnet", type=lambda x: bool(strtobool(x)), default=FREEZE_VIEW_NETWORK, nargs="?", const=True,
        help="freeze the view network or not")
    parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
        help="training dataset size; start training once enough data has been collected")
    parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
        help="minibatch size")
    parser.add_argument("--epochs", type=int, default=EPOCHS,
        help="the K epochs to update the policy")
    parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
        help="Toggle learning rate annealing for policy and value networks")
    parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True,
        help="track the run on wandb")
    parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=SAVE_MODEL, nargs="?", const=True,
        help="save the model or not")
    parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY,
        help="the entity (team) of wandb's project")
    parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
        help="load model directory")
    parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
        help="the number of steps to run in each environment per policy rollout")
    parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
        help="broadcast the end-of-round result when a round is won: r = result-broadcast-ratio * remainTime")

    # GAE loss
    parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="Use GAE for advantage computation")
    parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
        help="Toggles advantages normalization")
    parser.add_argument("--gamma", type=float, default=GAMMA,
        help="the discount factor gamma")
    parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
        help="the lambda for the general advantage estimation")
    parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
        help="the surrogate clipping coefficient")
    parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
        help="coefficient of the policy loss")
    parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF,
        help="coefficient of the entropy loss")
    parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
        help="coefficient of the value function loss")
    parser.add_argument("--clip-vloss",
        type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True,
        help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
        help="the maximum norm for the gradient clipping")
    parser.add_argument("--target-kl", type=float, default=None,
        help="the target KL divergence threshold")
    # fmt: on
    args = parser.parse_args()
    return args


def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer


class PPOAgent(nn.Module):
    def __init__(self, env: Aimbot, targetNum: int):
        super(PPOAgent, self).__init__()
        self.targetNum = targetNum
        self.stateSize = env.unity_observation_shape[0]
        self.agentNum = env.unity_agent_num
        self.targetSize = TARGET_STATE_SIZE
        self.timeSize = TIME_STATE_SIZE
        self.gunSize = GUN_STATE_SIZE
        self.myStateSize = MY_STATE_SIZE
        self.raySize = env.unity_observation_shape[0] - TOTAL_T_SIZE
        self.nonRaySize = TOTAL_T_SIZE
        self.head_input_size = (
            env.unity_observation_shape[0] - self.targetSize - self.timeSize - self.gunSize
        )  # except target state input
        self.discrete_size = env.unity_discrete_size
        self.discrete_shape = list(env.unity_discrete_branches)
        self.continuous_size = env.unity_continuous_size

        self.viewNetwork = nn.Sequential(
            layer_init(nn.Linear(self.raySize, 200)),
            nn.Tanh(),
        )
        self.targetNetworks = nn.ModuleList(
            [
                nn.Sequential(
                    layer_init(nn.Linear(self.nonRaySize, 100)),
                    nn.Tanh(),
                )
                for i in range(targetNum)
            ]
        )
        self.middleNetworks = nn.ModuleList(
            [
                nn.Sequential(
                    layer_init(nn.Linear(300, 200)),
                    nn.Tanh(),
                )
                for i in range(targetNum)
            ]
        )
        self.actor_dis = nn.ModuleList(
            [layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(targetNum)]
        )
        self.actor_mean = nn.ModuleList(
            [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(targetNum)]
        )
        # self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
        # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
        self.actor_logstd = nn.ParameterList(
            [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(targetNum)]
        )  # nn.Parameter(torch.zeros(1, self.continuous_size))
        self.critic = nn.ModuleList(
            [layer_init(nn.Linear(200, 1), std=1) for i in range(targetNum)]
        )

    def get_value(self, state: torch.Tensor):
        target = state[:, 0].to(torch.int32)  # int
        thisStateNum = target.size()[0]
        viewInput = state[:, -self.raySize :]  # all ray input
        targetInput = state[:, : self.nonRaySize]
        viewLayer = self.viewNetwork(viewInput)
        targetLayer = torch.stack(
            [self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)]
        )
        middleInput = torch.cat([viewLayer, targetLayer], dim=1)
        middleLayer = torch.stack(
            [self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)]
        )
        criticV = torch.stack(
            [self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]
        )  # self.critic
        return criticV

    def get_actions_value(self, state: torch.Tensor, actions=None):
        target = state[:, 0].to(torch.int32)  # int
        thisStateNum = target.size()[0]
        viewInput = state[:, -self.raySize :]  # all ray input
        targetInput = state[:, : self.nonRaySize]
        viewLayer = self.viewNetwork(viewInput)
        targetLayer = torch.stack(
            [self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)]
        )
        middleInput = torch.cat([viewLayer, targetLayer], dim=1)
        middleLayer = torch.stack(
            [self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)]
        )

        # discrete
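        # The discrete head outputs one flat logit vector per sample; torch.split with
        # discrete_shape then slices it back into one block per action branch, and each
        # block parameterizes its own Categorical distribution.  For example, with an
        # assumed discrete_shape of [3, 3, 2] (the real branch sizes come from the Unity
        # env), the split would yield logit blocks of width 3, 3 and 2.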
        # iterate over the targets (i.e. over the agents in the batch) so that each sample
        # is routed through the output head that matches its target type
        dis_logits = torch.stack(
            [self.actor_dis[target[i]](middleLayer[i]) for i in range(thisStateNum)]
        )
        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
        # continuous
        actions_mean = torch.stack(
            [self.actor_mean[target[i]](middleLayer[i]) for i in range(thisStateNum)]
        )  # self.actor_mean(hidden)
        # action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)])  # self.actor_logstd(hidden)
        # action_logstd = self.actor_logstd.expand_as(actions_mean)  # self.actor_logstd.expand_as(actions_mean)
        action_logstd = torch.stack(
            [torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(thisStateNum)]
        )
        # print(action_logstd)
        action_std = torch.exp(action_logstd)  # torch.exp(action_logstd)
        con_probs = Normal(actions_mean, action_std)
        # critic
        criticV = torch.stack(
            [self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]
        )  # self.critic

        if actions is None:
            if args.train:
                # sample actions from the predicted probability distributions
                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
                conAct = con_probs.sample()
                actions = torch.cat([disAct.T, conAct], dim=1)
            else:
                # pick the most probable actions instead of sampling
                disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
                conAct = actions_mean
                actions = torch.cat([disAct.T, conAct], dim=1)
        else:
            # note: relies on the global `env` for the discrete/continuous split of the action vector
            disAct = actions[:, 0 : env.unity_discrete_type].T
            conAct = actions[:, env.unity_discrete_type :]
        dis_log_prob = torch.stack(
            [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
        )
        dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
        return (
            actions,
            dis_log_prob.sum(0),
            dis_entropy.sum(0),
            con_probs.log_prob(conAct).sum(1),
            con_probs.entropy().sum(1),
            criticV,
        )


def GAE(agent, args, rewards, dones, values, next_obs, next_done):
    # GAE
    with torch.no_grad():
        next_value = agent.get_value(next_obs).reshape(1, -1)
        data_size = rewards.size()[0]
        if args.gae:
            advantages = torch.zeros_like(rewards).to(device)
            lastgaelam = 0
            for t in reversed(range(data_size)):
                if t == data_size - 1:
                    nextnonterminal = 1.0 - next_done
                    nextvalues = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    nextvalues = values[t + 1]
                delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                advantages[t] = lastgaelam = (
                    delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
                )
            returns = advantages + values
        else:
            returns = torch.zeros_like(rewards).to(device)
            for t in reversed(range(data_size)):
                if t == data_size - 1:
                    nextnonterminal = 1.0 - next_done
                    next_return = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    next_return = returns[t + 1]
                returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
            advantages = returns - values
    return advantages, returns


class AimbotSideChannel(SideChannel):
    def __init__(self, channel_id: uuid.UUID) -> None:
        super().__init__(channel_id)

    def on_message_received(self, msg: IncomingMessage) -> None:
        """
        Note: We must implement this method of the SideChannel interface to
        receive messages from Unity.
        """
        thisMessage = msg.read_string()
        # print(thisMessage)
        thisResult = thisMessage.split("|")
        if thisResult[0] == "result":
            TotalRounds[thisResult[1]] += 1
            if thisResult[2] == "Win":
                WinRounds[thisResult[1]] += 1
            # print(TotalRounds)
            # print(WinRounds)
        elif thisResult[0] == "Error":
            print(thisMessage)

    # sender helpers
    def send_string(self, data: str) -> None:
        # send a string to C#
        msg = OutgoingMessage()
        msg.write_string(data)
        super().queue_message_to_send(msg)

    def send_bool(self, data: bool) -> None:
        msg = OutgoingMessage()
        msg.write_bool(data)
        super().queue_message_to_send(msg)

    def send_int(self, data: int) -> None:
        msg = OutgoingMessage()
        msg.write_int32(data)
        super().queue_message_to_send(msg)

    def send_float(self, data: float) -> None:
        msg = OutgoingMessage()
        msg.write_float32(data)
        super().queue_message_to_send(msg)

    def send_float_list(self, data: List[float]) -> None:
        msg = OutgoingMessage()
        msg.write_float32_list(data)
        super().queue_message_to_send(msg)


def broadCastEndReward(rewardBF: list, remainTime: float):
    thisRewardBF = rewardBF
    if rewardBF[-1] <= -500:
        # lost the round: remove the base lose reward, do not broadcast
        # print("Lose DO NOT BROAD CAST", rewardBF[-1])
        thisRewardBF[-1] = rewardBF[-1] - BASE_LOSEREWARD
    elif rewardBF[-1] >= 500:
        # won the round: remove the base win reward and broadcast a time bonus to every step
        # print("Win! Broadcast reward!", rewardBF[-1])
        thisRewardBF[-1] = rewardBF[-1] - BASE_WINREWARD
        thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * args.result_broadcast_ratio)).tolist()
    else:
        print("!!!!!DIDNT GET RESULT REWARD!!!!!!", rewardBF[-1])
    return torch.Tensor(thisRewardBF).to(device)


if __name__ == "__main__":
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

    # Initialize side channel, environment, agent and optimizer
    aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID)
    env = Aimbot(
        envPath=args.path,
        workerID=args.workerID,
        basePort=args.baseport,
        side_channels=[aimBotsideChannel],
    )
    if args.load_dir is None:
        agent = PPOAgent(env, TARGETNUM).to(device)
    else:
        agent = torch.load(args.load_dir)
        # freeze the view network if requested
        if args.freeze_viewnet:
            for p in agent.viewNetwork.parameters():
                p.requires_grad = False
            print("VIEW NETWORK FROZEN")
        print("Load Agent", args.load_dir)
        print(agent.eval())
    optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)

    # Tensorboard and WandB Recorder
    game_name = "Aimbot_Target_Hybrid_PMNN_V2"
    game_type = "OffPolicy_EndBC"
    run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
    if args.wandb_track:
        wandb.init(
            project=game_name,
            entity=args.wandb_entity,
            sync_tensorboard=True,
            config=vars(args),
            name=run_name,
            monitor_gym=True,
            save_code=True,
        )
    writer = SummaryWriter(f"runs/{run_name}")
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s"
        % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )

    @atexit.register
    def save_model():
        # save the model on exit
        saveDir = "../PPO-Model/" + run_name + "_last.pt"
        torch.save(agent, saveDir)
        print("save model to " + saveDir)

    # Trajectory Buffer
    ob_bf = [[] for i in range(env.unity_agent_num)]
    act_bf = [[] for i in range(env.unity_agent_num)]
    dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
    con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
    rewards_bf = [[] for i in range(env.unity_agent_num)]
    dones_bf = [[] for i in range(env.unity_agent_num)]
    values_bf = [[] for i in range(env.unity_agent_num)]

    # start the game
    total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
    target_steps = [0 for i in range(TARGETNUM)]
    start_time = time.time()
    state, _, done = env.reset()
    # state = torch.Tensor(next_obs).to(device)
    # next_done = torch.zeros(env.unity_agent_num).to(device)

    # initialize empty training datasets
    obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM, n, env.unity_observation_size)
    actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)]
    # shapes: actions is (TARGETNUM, n, env.unity_action_size); the remaining datasets are (TARGETNUM, n, 1)
    dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]
    con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]
    rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)]
    values = [torch.tensor([]).to(device) for i in range(TARGETNUM)]
    advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)]
    returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)]

    # Debug visualization of the compute graph.
    # NOTE: this block relies on older torchviz/torch tracing helpers and ends with a bare
    # `raise`, so it intentionally stops the script after rendering the graph; comment it
    # out for normal training.
    vis_graph = make_dot(
        agent.get_actions_value(torch.Tensor(state).to(device)),
        params=dict(agent.named_parameters()),
    )
    vis_graph.view()  # saves a "Digraph.gv.pdf" file in the current directory and opens it in the default browser
    with torch.onnx.set_training(agent, False):
        trace, _ = torch.jit.get_trace_graph(agent, args=(torch.Tensor(state).to(device),))
    make_dot_from_trace(trace)
    raise

    for total_steps in range(total_update_step):
        # anneal the learning rate; when total_steps reaches total_update_step the lr reaches 0
        # (finalRatio is computed but not applied: the schedule anneals linearly to 0, not to TARGET_LEARNING_RATE)
        if args.annealLR:
            finalRatio = TARGET_LEARNING_RATE / args.lr
            frac = 1.0 - ((total_steps + 1.0) / total_update_step)
            lrnow = frac * args.lr
            optimizer.param_groups[0]["lr"] = lrnow
        else:
            lrnow = args.lr
        print("new episode", total_steps, "learning rate = ", lrnow)

        # MAIN LOOP: run agent in environment
        step = 0
        training = False
        trainQueue = []
        last_reward = [0.0 for i in range(env.unity_agent_num)]
        while True:
            if step % args.decision_period == 0:
                step += 1
                # Choose action by agent
                with torch.no_grad():
                    # predict actions
                    action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
                        torch.Tensor(state).to(device)
                    )
                    value = value.flatten()

                # move variables from GPU to CPU
                action_cpu = action.cpu().numpy()
                dis_logprob_cpu = dis_logprob.cpu().numpy()
                con_logprob_cpu = con_logprob.cpu().numpy()
                value_cpu = value.cpu().numpy()

                # Environment step
                next_state, reward, next_done = env.step(action_cpu)

                # save memories
                for i in range(env.unity_agent_num):
                    # save memories to buffers
                    ob_bf[i].append(state[i])
                    act_bf[i].append(action_cpu[i])
                    dis_logprobs_bf[i].append(dis_logprob_cpu[i])
                    con_logprobs_bf[i].append(con_logprob_cpu[i])
                    rewards_bf[i].append(reward[i] + last_reward[i])
                    dones_bf[i].append(done[i])
                    values_bf[i].append(value_cpu[i])
                    remainTime = state[i, TARGET_STATE_SIZE]
                    if next_done[i] == True:
                        # finished a round, send the finished memories to the training datasets
                        # compute advantage and discounted reward
                        # print(i, "over")
                        roundTargetType = int(state[i, 0])
                        thisRewardsTensor = broadCastEndReward(rewards_bf[i], remainTime)
                        adv, rt = GAE(
                            agent,
                            args,
                            thisRewardsTensor,
                            torch.Tensor(dones_bf[i]).to(device),
                            torch.tensor(values_bf[i]).to(device),
                            torch.tensor(next_state[i]).to(device).unsqueeze(0),
                            torch.Tensor([next_done[i]]).to(device),
                        )
                        # send memories to training datasets
                        obs[roundTargetType] = torch.cat(
                            (obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0
                        )
                        actions[roundTargetType] = torch.cat(
                            (actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0
                        )
                        dis_logprobs[roundTargetType] = torch.cat(
                            (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
                        )
                        con_logprobs[roundTargetType] = torch.cat(
                            (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
                        )
                        rewards[roundTargetType] = torch.cat(
                            (rewards[roundTargetType], thisRewardsTensor), 0
                        )
                        values[roundTargetType] = torch.cat(
                            (values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0
                        )
                        advantages[roundTargetType] = torch.cat(
                            (advantages[roundTargetType], adv), 0
                        )
                        returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)

                        # clear buffers
                        ob_bf[i] = []
                        act_bf[i] = []
                        dis_logprobs_bf[i] = []
                        con_logprobs_bf[i] = []
                        rewards_bf[i] = []
                        dones_bf[i] = []
                        values_bf[i] = []
                        print(
                            f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}"
                        )

                for i in range(TARGETNUM):
                    if obs[i].size()[0] >= args.datasetSize:
                        # start training the NN
                        trainQueue.append(i)
                if len(trainQueue) > 0:
                    break
                state, done = next_state, next_done
            else:
                step += 1
                # skip this step and reuse the last predicted action
                next_state, reward, next_done = env.step(action_cpu)
                # save memories
                for i in range(env.unity_agent_num):
                    if next_done[i] == True:
                        # print(i, "over???")
                        # save memories to buffers
                        ob_bf[i].append(state[i])
                        act_bf[i].append(action_cpu[i])
                        dis_logprobs_bf[i].append(dis_logprob_cpu[i])
                        con_logprobs_bf[i].append(con_logprob_cpu[i])
                        rewards_bf[i].append(reward[i])
                        dones_bf[i].append(done[i])
                        values_bf[i].append(value_cpu[i])
                        remainTime = state[i, TARGET_STATE_SIZE]
                        # finished a round, send the finished memories to the training datasets
                        # compute advantage and discounted reward
                        roundTargetType = int(state[i, 0])
                        thisRewardsTensor = broadCastEndReward(rewards_bf[i], remainTime)
                        adv, rt = GAE(
                            agent,
                            args,
                            thisRewardsTensor,
                            torch.Tensor(dones_bf[i]).to(device),
                            torch.tensor(values_bf[i]).to(device),
                            torch.Tensor(next_state[i]).to(device).unsqueeze(dim=0),
                            torch.Tensor([next_done[i]]).to(device),
                        )
                        # send memories to training datasets
                        obs[roundTargetType] = torch.cat(
                            (obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0
                        )
                        actions[roundTargetType] = torch.cat(
                            (actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0
                        )
                        dis_logprobs[roundTargetType] = torch.cat(
                            (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
                        )
                        con_logprobs[roundTargetType] = torch.cat(
                            (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
                        )
                        rewards[roundTargetType] = torch.cat(
                            (rewards[roundTargetType], thisRewardsTensor), 0
                        )
                        values[roundTargetType] = torch.cat(
                            (values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0
                        )
                        advantages[roundTargetType] = torch.cat(
                            (advantages[roundTargetType], adv), 0
                        )
                        returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
                        # clear buffers
                        ob_bf[i] = []
                        act_bf[i] = []
                        dis_logprobs_bf[i] = []
                        con_logprobs_bf[i] = []
                        rewards_bf[i] = []
                        dones_bf[i] = []
                        values_bf[i] = []
                        print(
                            f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}"
                        )
                state = next_state
                last_reward = reward
                i += 1

        if args.train:
            meanRewardList = []  # for WANDB
            # loop over every target in the training queue
            for thisT in trainQueue:
                target_steps[thisT] += 1
                # flatten the batch
                b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
                b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
                b_con_logprobs = con_logprobs[thisT].reshape(-1)
                b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
                b_advantages = advantages[thisT].reshape(-1)
                b_returns = returns[thisT].reshape(-1)
                b_values = values[thisT].reshape(-1)
                b_size = b_obs.size()[0]

                # Optimizing the policy and value network
                b_inds = np.arange(b_size)
                # clipfracs = []
                for epoch in range(args.epochs):
                    print(epoch, end="")
                    # shuffle all datasets
                    np.random.shuffle(b_inds)
                    for start in range(0, b_size, args.minibatchSize):
                        print(".", end="")
                        end = start + args.minibatchSize
                        mb_inds = b_inds[start:end]
                        if np.size(mb_inds) <= 1:
                            break
                        mb_advantages = b_advantages[mb_inds]
                        # normalize advantages
                        if args.norm_adv:
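                            # Standardizing advantages within the minibatch (zero mean, unit
                            # variance) is a common PPO stabilization step; the 1e-8 term below
                            # guards against division by zero when the batch is nearly constant.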
                            mb_advantages = (mb_advantages - mb_advantages.mean()) / (
                                mb_advantages.std() + 1e-8
                            )

                        (
                            _,
                            new_dis_logprob,
                            dis_entropy,
                            new_con_logprob,
                            con_entropy,
                            newvalue,
                        ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
                        # discrete ratio
                        dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
                        dis_ratio = dis_logratio.exp()
                        # continuous ratio
                        con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
                        con_ratio = con_logratio.exp()

                        """
                        # early stop
                        with torch.no_grad():
                            # calculate approx_kl http://joschu.net/blog/kl-approx.html
                            old_approx_kl = (-logratio).mean()
                            approx_kl = ((ratio - 1) - logratio).mean()
                            clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
                        """

                        # discrete policy loss
                        dis_pg_loss_orig = -mb_advantages * dis_ratio
                        dis_pg_loss_clip = -mb_advantages * torch.clamp(
                            dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
                        )
                        dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
                        # continuous policy loss
                        con_pg_loss_orig = -mb_advantages * con_ratio
                        con_pg_loss_clip = -mb_advantages * torch.clamp(
                            con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
                        )
                        con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()

                        # Value loss
                        newvalue = newvalue.view(-1)
                        if args.clip_vloss:
                            v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                            v_clipped = b_values[mb_inds] + torch.clamp(
                                newvalue - b_values[mb_inds],
                                -args.clip_coef,
                                args.clip_coef,
                            )
                            v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                            v_loss = 0.5 * v_loss_max.mean()
                        else:
                            v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

                        # total loss
                        entropy_loss = dis_entropy.mean() + con_entropy.mean()
                        loss = (
                            dis_pg_loss * POLICY_COEF[thisT]
                            + con_pg_loss * POLICY_COEF[thisT]
                            + entropy_loss * ENTROPY_COEF[thisT]
                            + v_loss * CRITIC_COEF[thisT]
                        ) * LOSS_COEF[thisT]

                        if torch.isnan(loss).any():
                            print("LOSS Include NAN!!!")
                            if torch.isnan(dis_pg_loss).any():
                                print("dis_pg_loss include nan")
                            if torch.isnan(con_pg_loss).any():
                                print("con_pg_loss include nan")
                            if torch.isnan(entropy_loss).any():
                                print("entropy_loss include nan")
                            if torch.isnan(v_loss).any():
                                print("v_loss include nan")
                            raise

                        optimizer.zero_grad()
                        loss.backward()
                        # Clips gradient norm of an iterable of parameters.
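                        # (global-norm clipping: if the combined gradient norm exceeds
                        #  args.max_grad_norm, every gradient is rescaled so the total
                        #  norm equals max_grad_norm)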
                        nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                        optimizer.step()

                    """
                    if args.target_kl is not None:
                        if approx_kl > args.target_kl:
                            break
                    """

                # record mean reward before clearing the history
                print("done")
                targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
                meanRewardList.append(targetRewardMean)
                targetName = Targets(thisT).name

                # clear the training set buffer of this target
                obs[thisT] = torch.tensor([]).to(device)
                actions[thisT] = torch.tensor([]).to(device)
                dis_logprobs[thisT] = torch.tensor([]).to(device)
                con_logprobs[thisT] = torch.tensor([]).to(device)
                rewards[thisT] = torch.tensor([]).to(device)
                values[thisT] = torch.tensor([]).to(device)
                advantages[thisT] = torch.tensor([]).to(device)
                returns[thisT] = torch.tensor([]).to(device)

                # record rewards for plotting purposes
                writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
                writer.add_scalar(
                    f"Target{targetName}/WinRatio",
                    WinRounds[targetName] / TotalRounds[targetName],
                    target_steps[thisT],
                )
                print(f"episode over Target{targetName} mean reward:", targetRewardMean)

            TotalRewardMean = np.mean(meanRewardList)
            writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
            writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps)

            # New Record!
            if TotalRewardMean > bestReward and args.save_model:
                bestReward = TotalRewardMean
                saveDir = "../PPO-Model/" + run_name + "_" + str(TotalRewardMean) + ".pt"
                torch.save(agent, saveDir)

    saveDir = "../PPO-Model/" + run_name + "_last.pt"
    torch.save(agent, saveDir)
    env.close()
    writer.close()
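# ---------------------------------------------------------------------------
# Usage sketch (assumptions: the script file name "aimbot_ppo.py" is illustrative,
# and the Unity build referenced by ENV_PATH exists; adjust flags as needed):
#
#   python aimbot_ppo.py --seed 9331 --wandb-track False --save-model True
#
# Because the whole nn.Module is serialized with torch.save(agent, ...), a saved
# checkpoint can later be reloaded the same way --load-dir is handled above:
#
#   agent = torch.load("../PPO-Model/<run_name>_last.pt")
#   agent.eval()
# ---------------------------------------------------------------------------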