From 177974888a97087f6003e67d521fe7a55cea5969 Mon Sep 17 00:00:00 2001 From: Koha9 Date: Sat, 15 Jul 2023 20:37:23 +0900 Subject: [PATCH 1/2] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E6=95=B4=E7=90=86?= =?UTF-8?q?=EF=BC=8C=E4=B8=8D=E5=85=BC=E5=AE=B9=E8=BF=87=E5=8E=BB=E7=9A=84?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 代码整理,不兼容过去的模型 --- .vscode/settings.json | 3 + Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py | 303 ++++------------------- Aimbot-PPO-Python/Pytorch/airecorder.py | 82 ++++++ Aimbot-PPO-Python/Pytorch/ppoagent.py | 267 ++++++++++++++++++++ Aimbot-PPO-Python/Pytorch/test2.ipynb | 36 ++- Aimbot-PPO-Python/Pytorch/testarea.ipynb | 22 +- 6 files changed, 451 insertions(+), 262 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 Aimbot-PPO-Python/Pytorch/airecorder.py create mode 100644 Aimbot-PPO-Python/Pytorch/ppoagent.py diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..26df38b --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.linting.enabled": false +} \ No newline at end of file diff --git a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py index af97665..97b6ab4 100644 --- a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py +++ b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py @@ -1,5 +1,4 @@ import argparse -import wandb import time import numpy as np import random @@ -9,20 +8,14 @@ import torch.nn as nn import torch.optim as optim import atexit -from AimbotEnv import Aimbot -from tqdm import tqdm + +from aimbotEnv import Aimbot +from ppoagent import PPOAgent +from ppoagent import GAE +from ppoagent import AimbotSideChannel +from airecorder import WandbRecorder from enum import Enum -from torch.distributions.normal import Normal -from torch.distributions.categorical import Categorical from distutils.util import strtobool -from torch.utils.tensorboard import SummaryWriter -from mlagents_envs.environment import UnityEnvironment -from mlagents_envs.side_channel.side_channel import ( - SideChannel, - IncomingMessage, - OutgoingMessage, -) -from typing import List bestReward = -1 @@ -62,11 +55,11 @@ BROADCASTREWARD = False ANNEAL_LEARNING_RATE = True CLIP_VLOSS = True NORM_ADV = False -TRAIN = False +TRAIN = True SAVE_MODEL = False -WANDB_TACK = True +WANDB_TACK = False LOAD_DIR = None -LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt" +#LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt" # public data class Targets(Enum): @@ -86,8 +79,6 @@ BASE_LOSEREWARD = -999 TARGETNUM= 4 ENV_TIMELIMIT = 30 RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT -TotalRounds = {"Free":0,"Go":0,"Attack":0} -WinRounds = {"Free":0,"Go":0,"Attack":0} # !!!SPECIAL PARAMETERS!!! 
# change it while program is finished @@ -168,215 +159,6 @@ def parse_args(): return args -def layer_init(layer, std=np.sqrt(2), bias_const=0.0): - torch.nn.init.orthogonal_(layer.weight, std) - torch.nn.init.constant_(layer.bias, bias_const) - return layer - - -class PPOAgent(nn.Module): - def __init__(self, env: Aimbot,targetNum:int): - super(PPOAgent, self).__init__() - self.targetNum = targetNum - self.stateSize = env.unity_observation_shape[0] - self.agentNum = env.unity_agent_num - self.targetSize = TARGET_STATE_SIZE - self.timeSize = TIME_STATE_SIZE - self.gunSize = GUN_STATE_SIZE - self.myStateSize = MY_STATE_SIZE - self.raySize = env.unity_observation_shape[0] - TOTAL_T_SIZE - self.nonRaySize = TOTAL_T_SIZE - self.head_input_size = env.unity_observation_shape[0] - self.targetSize-self.timeSize-self.gunSize# except target state input - - self.discrete_size = env.unity_discrete_size - self.discrete_shape = list(env.unity_discrete_branches) - self.continuous_size = env.unity_continuous_size - - self.viewNetwork = nn.Sequential( - layer_init(nn.Linear(self.raySize, 200)), - nn.LeakyReLU() - ) - self.targetNetworks = nn.ModuleList([nn.Sequential( - layer_init(nn.Linear(self.nonRaySize, 100)), - nn.LeakyReLU() - )for i in range(targetNum)]) - self.middleNetworks = nn.ModuleList([nn.Sequential( - layer_init(nn.Linear(300,200)), - nn.LeakyReLU() - )for i in range(targetNum)]) - self.actor_dis = nn.ModuleList([layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(targetNum)]) - self.actor_mean = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(targetNum)]) - # self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)]) - # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size)) - self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1,self.continuous_size))for i in range(targetNum)]) # nn.Parameter(torch.zeros(1, self.continuous_size)) - self.critic = nn.ModuleList([layer_init(nn.Linear(200, 1), std=1)for i in range(targetNum)]) - - def get_value(self, state: torch.Tensor): - target = state[:,0].to(torch.int32) # int - thisStateNum = target.size()[0] - viewInput = state[:,-self.raySize:] # all ray input - targetInput = state[:,:self.nonRaySize] - viewLayer = self.viewNetwork(viewInput) - targetLayer = torch.stack([self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)]) - middleInput = torch.cat([viewLayer,targetLayer],dim = 1) - middleLayer = torch.stack([self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)]) - criticV = torch.stack([self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.critic - return criticV - - def get_actions_value(self, state: torch.Tensor, actions=None): - target = state[:,0].to(torch.int32) # int - thisStateNum = target.size()[0] - viewInput = state[:,-self.raySize:] # all ray input - targetInput = state[:,:self.nonRaySize] - viewLayer = self.viewNetwork(viewInput) - targetLayer = torch.stack([self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)]) - middleInput = torch.cat([viewLayer,targetLayer],dim = 1) - middleLayer = torch.stack([self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)]) - - # discrete - # 递归targets的数量,既agent数来实现根据target不同来选用对应的输出网络计算输出 - dis_logits = torch.stack([self.actor_dis[target[i]](middleLayer[i]) for i in range(thisStateNum)]) - split_logits = torch.split(dis_logits, self.discrete_shape, 
dim=1) - multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits] - # continuous - actions_mean = torch.stack([self.actor_mean[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_mean(hidden) - # action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_logstd(hidden) - # action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean) - action_logstd = torch.stack([torch.squeeze(self.actor_logstd[target[i]],0) for i in range(thisStateNum)]) - # print(action_logstd) - action_std = torch.exp(action_logstd) # torch.exp(action_logstd) - con_probs = Normal(actions_mean, action_std) - # critic - criticV = torch.stack([self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.critic - - if actions is None: - if args.train: - # select actions base on probability distribution model - disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals]) - conAct = con_probs.sample() - actions = torch.cat([disAct.T, conAct], dim=1) - else: - # select actions base on best probability distribution - # disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits]) - conAct = actions_mean - disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals]) - conAct = con_probs.sample() - actions = torch.cat([disAct.T, conAct], dim=1) - else: - disAct = actions[:, 0 : env.unity_discrete_type].T - conAct = actions[:, env.unity_discrete_type :] - dis_log_prob = torch.stack( - [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)] - ) - dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals]) - return ( - actions, - dis_log_prob.sum(0), - dis_entropy.sum(0), - con_probs.log_prob(conAct).sum(1), - con_probs.entropy().sum(1), - criticV, - ) - - -def GAE(agent, args, rewards, dones, values, next_obs, next_done): - # GAE - with torch.no_grad(): - next_value = agent.get_value(next_obs).reshape(1, -1) - data_size = rewards.size()[0] - if args.gae: - advantages = torch.zeros_like(rewards).to(device) - lastgaelam = 0 - for t in reversed(range(data_size)): - if t == data_size - 1: - nextnonterminal = 1.0 - next_done - nextvalues = next_value - else: - nextnonterminal = 1.0 - dones[t + 1] - nextvalues = values[t + 1] - delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t] - advantages[t] = lastgaelam = ( - delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam - ) - returns = advantages + values - else: - returns = torch.zeros_like(rewards).to(device) - for t in reversed(range(data_size)): - if t == data_size - 1: - nextnonterminal = 1.0 - next_done - next_return = next_value - else: - nextnonterminal = 1.0 - dones[t + 1] - next_return = returns[t + 1] - returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return - advantages = returns - values - return advantages, returns - -class AimbotSideChannel(SideChannel): - def __init__(self, channel_id: uuid.UUID) -> None: - super().__init__(channel_id) - def on_message_received(self, msg: IncomingMessage) -> None: - global SCrecieved # make sure this variable is global - """ - Note: We must implement this method of the SideChannel interface to - receive messages from Unity - Message will be sent like this: - "Warning|Message1|Message2|Message3" or - "Error|Message1|Message2|Message3" - """ - thisMessage = msg.read_string() - thisResult = thisMessage.split("|") - if(thisResult[0] == "result"): - TotalRounds[thisResult[1]]+=1 - 
if(thisResult[2] == "Win"): - WinRounds[thisResult[1]]+=1 - #print(TotalRounds) - #print(WinRounds) - elif(thisResult[0] == "Error"): - print(thisMessage) - - # # while Message type is Warning - # if(thisResult[0] == "Warning"): - # # while Message1 is result means one game is over - # if (thisResult[1] == "Result"): - # TotalRounds[thisResult[2]]+=1 - # # while Message3 is Win means this agent win this game - # if(thisResult[3] == "Win"): - # WinRounds[thisResult[2]]+=1 - # # while Message1 is GameState means this game is just start - # # and tell python which game mode is - # elif (thisResult[1] == "GameState"): - # SCrecieved = 1 - # # while Message type is Error - # elif(thisResult[0] == "Error"): - # print(thisMessage) - # 发送函数 - def send_string(self, data: str) -> None: - # send a string toC# - msg = OutgoingMessage() - msg.write_string(data) - super().queue_message_to_send(msg) - - def send_bool(self, data: bool) -> None: - msg = OutgoingMessage() - msg.write_bool(data) - super().queue_message_to_send(msg) - - def send_int(self, data: int) -> None: - msg = OutgoingMessage() - msg.write_int32(data) - super().queue_message_to_send(msg) - - def send_float(self, data: float) -> None: - msg = OutgoingMessage() - msg.write_float32(data) - super().queue_message_to_send(msg) - - def send_float_list(self, data: List[float]) -> None: - msg = OutgoingMessage() - msg.write_float32_list(data) - super().queue_message_to_send(msg) - def broadCastEndReward(rewardBF:list,remainTime:float): thisRewardBF = rewardBF if (rewardBF[-1]<=-500): @@ -404,7 +186,16 @@ if __name__ == "__main__": aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID); env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimBotsideChannel]) if args.load_dir is None: - agent = PPOAgent(env,TARGETNUM).to(device) + agent = PPOAgent( + env = env, + trainAgent=args.train, + targetNum=TARGETNUM, + target_state_size= TARGET_STATE_SIZE, + time_state_size=TIME_STATE_SIZE, + gun_state_size=GUN_STATE_SIZE, + my_state_size=MY_STATE_SIZE, + total_t_size=TOTAL_T_SIZE, + ).to(device) else: agent = torch.load(args.load_dir) # freeze @@ -420,23 +211,7 @@ if __name__ == "__main__": # Tensorboard and WandB Recorder run_name = f"{game_type}_{args.seed}_{int(time.time())}" - if args.wandb_track: - wandb.init( - project=game_name, - entity=args.wandb_entity, - sync_tensorboard=True, - config=vars(args), - name=run_name, - monitor_gym=True, - save_code=True, - ) - - writer = SummaryWriter(f"runs/{run_name}") - writer.add_text( - "hyperparameters", - "|param|value|\n|-|-|\n%s" - % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), - ) + wdb_recorder = WandbRecorder(game_name, game_type, run_name, args) @atexit.register def save_model(): @@ -538,6 +313,7 @@ if __name__ == "__main__": torch.tensor(values_bf[i]).to(device), torch.tensor(next_state[i]).to(device).unsqueeze(0), torch.Tensor([next_done[i]]).to(device), + device, ) # send memories to training datasets obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0) @@ -599,6 +375,7 @@ if __name__ == "__main__": torch.tensor(values_bf[i]).to(device), torch.Tensor(next_state[i]).to(device).unsqueeze(dim = 0), torch.Tensor([next_done[i]]).to(device), + device ) # send memories to training datasets obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0) @@ -629,6 +406,7 @@ if __name__ == "__main__": i += 1 if args.train: + # train mode on meanRewardList = [] # for 
WANDB # loop all tarining queue for thisT in trainQueue: @@ -766,17 +544,24 @@ if __name__ == "__main__": returns[thisT] = torch.tensor([]).to(device) # record rewards for plotting purposes - writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT]) - writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT]) - writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT]) - writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT]) - writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT]) - writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT]) - writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT]) + wdb_recorder.add_target_scalar( + targetName, + thisT, + v_loss, + dis_pg_loss, + con_pg_loss, + loss, + entropy_loss, + targetRewardMean, + target_steps, + ) print(f"episode over Target{targetName} mean reward:", targetRewardMean) TotalRewardMean = np.mean(meanRewardList) - writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps) - writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps) + wdb_recorder.add_global_scalar( + TotalRewardMean, + optimizer.param_groups[0]["lr"], + total_steps, + ) # print cost time as seconds print("cost time:", time.time() - start_time) # New Record! @@ -785,6 +570,7 @@ if __name__ == "__main__": saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) + ".pt" torch.save(agent, saveDir) else: + # train mode off meanRewardList = [] # for WANDB # while not in training mode, clear the buffer for thisT in trainQueue: @@ -804,14 +590,13 @@ if __name__ == "__main__": returns[thisT] = torch.tensor([]).to(device) # record rewards for plotting purposes - - writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT]) - writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT]) + wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT]) + wdb_recorder.add_win_ratio(targetName,target_steps[thisT]) print(f"episode over Target{targetName} mean reward:", targetRewardMean) TotalRewardMean = np.mean(meanRewardList) - writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps) + wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps) saveDir = "../PPO-Model/"+ run_name + "_last.pt" torch.save(agent, saveDir) env.close() - writer.close() + wdb_recorder.writer.close() diff --git a/Aimbot-PPO-Python/Pytorch/airecorder.py b/Aimbot-PPO-Python/Pytorch/airecorder.py new file mode 100644 index 0000000..3cea9df --- /dev/null +++ b/Aimbot-PPO-Python/Pytorch/airecorder.py @@ -0,0 +1,82 @@ +import wandb +import time +from torch.utils.tensorboard import SummaryWriter + + +total_rounds = {"Free": 0, "Go": 0, "Attack": 0} +win_rounds = {"Free": 0, "Go": 0, "Attack": 0} + + +# class for wandb recording +class WandbRecorder: + def __init__(self, game_name: str, game_type: str, run_name: str, _args) -> None: + # init wandb + self.game_name = game_name + self.game_type = game_type + self._args = _args + self.run_name = run_name + if self._args.wandb_track: + wandb.init( + project=self.game_name, + entity=self._args.wandb_entity, + sync_tensorboard=True, + config=vars(self._args), + 
name=self.run_name, + monitor_gym=True, + save_code=True, + ) + self.writer = SummaryWriter(f"runs/{self.run_name}") + self.writer.add_text( + "hyperparameters", + "|param|value|\n|-|-|\n%s" + % ("\n".join([f"|{key}|{value}|" for key, value in vars(self._args).items()])), + ) + + def add_target_scalar( + self, + target_name, + thisT, + v_loss, + dis_pg_loss, + con_pg_loss, + loss, + entropy_loss, + target_reward_mean, + target_steps, + ): + # fmt:off + self.writer.add_scalar( + f"Target{target_name}/value_loss", v_loss.item(), target_steps[thisT] + ) + self.writer.add_scalar( + f"Target{target_name}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT] + ) + self.writer.add_scalar( + f"Target{target_name}/con_policy_loss", con_pg_loss.item(), target_steps[thisT] + ) + self.writer.add_scalar( + f"Target{target_name}/total_loss", loss.item(), target_steps[thisT] + ) + self.writer.add_scalar( + f"Target{target_name}/entropy_loss", entropy_loss.item(), target_steps[thisT] + ) + self.writer.add_scalar( + f"Target{target_name}/Reward", target_reward_mean, target_steps[thisT] + ) + self.writer.add_scalar( + f"Target{target_name}/WinRatio", win_rounds[target_name] / total_rounds[target_name], target_steps[thisT], + ) + # fmt:on + + def add_global_scalar( + self, + total_reward_mean, + learning_rate, + total_steps, + ): + self.writer.add_scalar("GlobalCharts/TotalRewardMean", total_reward_mean, total_steps) + self.writer.add_scalar("GlobalCharts/learning_rate", learning_rate, total_steps) + def add_win_ratio(self, target_name, target_steps): + self.writer.add_scalar( + f"Target{target_name}/WinRatio", win_rounds[target_name] / total_rounds[target_name], target_steps, + ) diff --git a/Aimbot-PPO-Python/Pytorch/ppoagent.py b/Aimbot-PPO-Python/Pytorch/ppoagent.py new file mode 100644 index 0000000..d13bae0 --- /dev/null +++ b/Aimbot-PPO-Python/Pytorch/ppoagent.py @@ -0,0 +1,267 @@ +import numpy as np +import torch +import uuid +import airecorder +from torch import nn +from typing import List +from aimbotEnv import Aimbot +from torch.distributions.normal import Normal +from torch.distributions.categorical import Categorical +from mlagents_envs.side_channel.side_channel import ( + SideChannel, + IncomingMessage, + OutgoingMessage, +) + + +def layer_init(layer, std=np.sqrt(2), bias_const=0.0): + nn.init.orthogonal_(layer.weight, std) + nn.init.constant_(layer.bias, bias_const) + return layer + + +class PPOAgent(nn.Module): + def __init__( + self, + env: Aimbot, + trainAgent: bool, + targetNum: int, + target_state_size: int, + time_state_size: int, + gun_state_size: int, + my_state_size: int, + total_t_size: int, + ): + super(PPOAgent, self).__init__() + self.trainAgent = trainAgent + self.targetNum = targetNum + self.stateSize = env.unity_observation_shape[0] + self.agentNum = env.unity_agent_num + self.targetSize = target_state_size + self.timeSize = time_state_size + self.gunSize = gun_state_size + self.myStateSize = my_state_size + self.raySize = env.unity_observation_shape[0] - total_t_size + self.nonRaySize = total_t_size + self.head_input_size = ( + env.unity_observation_shape[0] - self.targetSize - self.timeSize - self.gunSize + ) # except target state input + + self.unityDiscreteType = env.unity_discrete_type + self.discrete_size = env.unity_discrete_size + self.discrete_shape = list(env.unity_discrete_branches) + self.continuous_size = env.unity_continuous_size + + self.viewNetwork = nn.Sequential(layer_init(nn.Linear(self.raySize, 200)), nn.LeakyReLU()) + self.targetNetworks = 
nn.ModuleList( + [ + nn.Sequential(layer_init(nn.Linear(self.nonRaySize, 100)), nn.LeakyReLU()) + for i in range(targetNum) + ] + ) + self.middleNetworks = nn.ModuleList( + [ + nn.Sequential(layer_init(nn.Linear(300, 200)), nn.LeakyReLU()) + for i in range(targetNum) + ] + ) + self.actor_dis = nn.ModuleList( + [layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(targetNum)] + ) + self.actor_mean = nn.ModuleList( + [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(targetNum)] + ) + # self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)]) + # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size)) + self.actor_logstd = nn.ParameterList( + [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(targetNum)] + ) # nn.Parameter(torch.zeros(1, self.continuous_size)) + self.critic = nn.ModuleList( + [layer_init(nn.Linear(200, 1), std=1) for i in range(targetNum)] + ) + + def get_value(self, state: torch.Tensor): + target = state[:, 0].to(torch.int32) # int + thisStateNum = target.size()[0] + viewInput = state[:, -self.raySize :] # all ray input + targetInput = state[:, : self.nonRaySize] + viewLayer = self.viewNetwork(viewInput) + targetLayer = torch.stack( + [self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)] + ) + middleInput = torch.cat([viewLayer, targetLayer], dim=1) + middleLayer = torch.stack( + [self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)] + ) + criticV = torch.stack( + [self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)] + ) # self.critic + return criticV + + def get_actions_value(self, state: torch.Tensor, actions=None): + target = state[:, 0].to(torch.int32) # int + thisStateNum = target.size()[0] + viewInput = state[:, -self.raySize :] # all ray input + targetInput = state[:, : self.nonRaySize] + viewLayer = self.viewNetwork(viewInput) + targetLayer = torch.stack( + [self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)] + ) + middleInput = torch.cat([viewLayer, targetLayer], dim=1) + middleLayer = torch.stack( + [self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)] + ) + + # discrete + # 递归targets的数量,既agent数来实现根据target不同来选用对应的输出网络计算输出 + dis_logits = torch.stack( + [self.actor_dis[target[i]](middleLayer[i]) for i in range(thisStateNum)] + ) + split_logits = torch.split(dis_logits, self.discrete_shape, dim=1) + multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits] + # continuous + actions_mean = torch.stack( + [self.actor_mean[target[i]](middleLayer[i]) for i in range(thisStateNum)] + ) # self.actor_mean(hidden) + # action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_logstd(hidden) + # action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean) + action_logstd = torch.stack( + [torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(thisStateNum)] + ) + # print(action_logstd) + action_std = torch.exp(action_logstd) # torch.exp(action_logstd) + con_probs = Normal(actions_mean, action_std) + # critic + criticV = torch.stack( + [self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)] + ) # self.critic + + if actions is None: + if self.trainAgent: + # select actions base on probability distribution model + disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals]) + conAct = 
con_probs.sample() + actions = torch.cat([disAct.T, conAct], dim=1) + else: + # select actions base on best probability distribution + # disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits]) + conAct = actions_mean + disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals]) + conAct = con_probs.sample() + actions = torch.cat([disAct.T, conAct], dim=1) + else: + disAct = actions[:, 0 : self.unityDiscreteType].T + conAct = actions[:, self.unityDiscreteType :] + dis_log_prob = torch.stack( + [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)] + ) + dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals]) + return ( + actions, + dis_log_prob.sum(0), + dis_entropy.sum(0), + con_probs.log_prob(conAct).sum(1), + con_probs.entropy().sum(1), + criticV, + ) + + +def GAE(agent, args, rewards, dones, values, next_obs, next_done, device): + # GAE + with torch.no_grad(): + next_value = agent.get_value(next_obs).reshape(1, -1) + data_size = rewards.size()[0] + if args.gae: + advantages = torch.zeros_like(rewards).to(device) + lastgaelam = 0 + for t in reversed(range(data_size)): + if t == data_size - 1: + nextnonterminal = 1.0 - next_done + nextvalues = next_value + else: + nextnonterminal = 1.0 - dones[t + 1] + nextvalues = values[t + 1] + delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t] + advantages[t] = lastgaelam = ( + delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam + ) + returns = advantages + values + else: + returns = torch.zeros_like(rewards).to(device) + for t in reversed(range(data_size)): + if t == data_size - 1: + nextnonterminal = 1.0 - next_done + next_return = next_value + else: + nextnonterminal = 1.0 - dones[t + 1] + next_return = returns[t + 1] + returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return + advantages = returns - values + return advantages, returns + + +class AimbotSideChannel(SideChannel): + def __init__(self, channel_id: uuid.UUID) -> None: + super().__init__(channel_id) + + def on_message_received(self, msg: IncomingMessage) -> None: + global SCrecieved # make sure this variable is global + """ + Note: We must implement this method of the SideChannel interface to + receive messages from Unity + Message will be sent like this: + "Warning|Message1|Message2|Message3" or + "Error|Message1|Message2|Message3" + """ + thisMessage = msg.read_string() + thisResult = thisMessage.split("|") + if(thisResult[0] == "result"): + airecorder.total_rounds[thisResult[1]]+=1 + if(thisResult[2] == "Win"): + airecorder.win_rounds[thisResult[1]]+=1 + #print(TotalRounds) + #print(WinRounds) + elif(thisResult[0] == "Error"): + print(thisMessage) + + # # while Message type is Warning + # if(thisResult[0] == "Warning"): + # # while Message1 is result means one game is over + # if (thisResult[1] == "Result"): + # TotalRounds[thisResult[2]]+=1 + # # while Message3 is Win means this agent win this game + # if(thisResult[3] == "Win"): + # WinRounds[thisResult[2]]+=1 + # # while Message1 is GameState means this game is just start + # # and tell python which game mode is + # elif (thisResult[1] == "GameState"): + # SCrecieved = 1 + # # while Message type is Error + # elif(thisResult[0] == "Error"): + # print(thisMessage) + # 发送函数 + def send_string(self, data: str) -> None: + # send a string toC# + msg = OutgoingMessage() + msg.write_string(data) + super().queue_message_to_send(msg) + + def send_bool(self, data: bool) -> None: + msg = OutgoingMessage() + msg.write_bool(data) + 
super().queue_message_to_send(msg) + + def send_int(self, data: int) -> None: + msg = OutgoingMessage() + msg.write_int32(data) + super().queue_message_to_send(msg) + + def send_float(self, data: float) -> None: + msg = OutgoingMessage() + msg.write_float32(data) + super().queue_message_to_send(msg) + + def send_float_list(self, data: List[float]) -> None: + msg = OutgoingMessage() + msg.write_float32_list(data) + super().queue_message_to_send(msg) \ No newline at end of file diff --git a/Aimbot-PPO-Python/Pytorch/test2.ipynb b/Aimbot-PPO-Python/Pytorch/test2.ipynb index dc895ad..7cd4d47 100644 --- a/Aimbot-PPO-Python/Pytorch/test2.ipynb +++ b/Aimbot-PPO-Python/Pytorch/test2.ipynb @@ -107,6 +107,40 @@ ")\n", "from typing import List\n" ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'aaa' object has no attribute 'outa'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[5], line 14\u001b[0m\n\u001b[0;32m 12\u001b[0m asd \u001b[39m=\u001b[39m aaa(outa, outb)\n\u001b[0;32m 13\u001b[0m asd\u001b[39m.\u001b[39mfunc()\n\u001b[1;32m---> 14\u001b[0m \u001b[39mprint\u001b[39m(asd\u001b[39m.\u001b[39;49mouta) \u001b[39m# 输出 100\u001b[39;00m\n", + "\u001b[1;31mAttributeError\u001b[0m: 'aaa' object has no attribute 'outa'" + ] + } + ], + "source": [ + "class aaa():\n", + " def __init__(self, a, b):\n", + " self.a = a\n", + " self.b = b\n", + "\n", + " def func(self):\n", + " global outa\n", + " outa = 100\n", + "\n", + "outa = 1\n", + "outb = 2\n", + "asd = aaa(outa, outb)\n", + "asd.func()\n", + "print(asd.outa) # 输出 100" + ] } ], "metadata": { @@ -125,7 +159,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.9.17" }, "orig_nbformat": 4 }, diff --git a/Aimbot-PPO-Python/Pytorch/testarea.ipynb b/Aimbot-PPO-Python/Pytorch/testarea.ipynb index 5b95a12..20d780a 100644 --- a/Aimbot-PPO-Python/Pytorch/testarea.ipynb +++ b/Aimbot-PPO-Python/Pytorch/testarea.ipynb @@ -62,7 +62,6 @@ "outputs": [], "source": [ "from mlagents_envs.environment import UnityEnvironment\n", - "from gym_unity.envs import UnityToGymWrapper\n", "import numpy as np\n", "\n", "ENV_PATH = \"../Build-ParallelEnv/Aimbot-ParallelEnv\"\n", @@ -368,6 +367,7 @@ ], "source": [ "import torch\n", + "from torch import nn\n", "\n", "def layer_init(layer, std=np.sqrt(2), bias_const=0.0):\n", " torch.nn.init.orthogonal_(layer.weight, std)\n", @@ -1248,6 +1248,24 @@ "saveDir = \"C:/Users/UCUNI/OneDrive/Unity/ML-Agents/Aimbot-PPO/Aimbot-PPO-Python/PPO-Model/Chimera-1677965178-1678547500.pt\"\n", "torch.save(badGotoAgent,saveDir)" ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n" + ] + } + ], + "source": [ + "import torch\n", + "print(torch.cuda.is_available())" + ] } ], "metadata": { @@ -1266,7 +1284,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.17" }, "orig_nbformat": 4, "vscode": { From a21fd724af84ccf50f94a012a901f816bfeed1bb Mon Sep 17 00:00:00 2001 From: Koha9 Date: Sat, 22 Jul 2023 19:26:39 +0900 Subject: [PATCH 2/2] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E6=95=B4=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit 分离ppoagent,AI memory,AI Recorder 优化Aimbot Env 正规化各类命名 Archive不使用的package --- .vscode/settings.json | 4 +- Aimbot-PPO-Python/Pytorch/AimbotEnv.py | 107 +++++-- .../Pytorch/{ => Archive}/AimBotEnv-old.py | 0 .../Pytorch/{ => Archive}/graph.py | 0 .../Pytorch/{ => Archive}/ppo.py | 0 .../Pytorch/{ => Archive}/test2.ipynb | 57 ++++ .../Pytorch/{ => Archive}/testEnv.py | 0 .../Pytorch/{ => Archive}/testarea.ipynb | 0 Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py | 291 ++++++------------ Aimbot-PPO-Python/Pytorch/aimemory.py | 146 +++++++++ Aimbot-PPO-Python/Pytorch/ppoagent.py | 173 ++++------- 11 files changed, 438 insertions(+), 340 deletions(-) rename Aimbot-PPO-Python/Pytorch/{ => Archive}/AimBotEnv-old.py (100%) rename Aimbot-PPO-Python/Pytorch/{ => Archive}/graph.py (100%) rename Aimbot-PPO-Python/Pytorch/{ => Archive}/ppo.py (100%) rename Aimbot-PPO-Python/Pytorch/{ => Archive}/test2.ipynb (89%) rename Aimbot-PPO-Python/Pytorch/{ => Archive}/testEnv.py (100%) rename Aimbot-PPO-Python/Pytorch/{ => Archive}/testarea.ipynb (100%) create mode 100644 Aimbot-PPO-Python/Pytorch/aimemory.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 26df38b..780162f 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,5 @@ { - "python.linting.enabled": false + "python.linting.enabled": false, + "python.analysis.typeCheckingMode": "off", + "commentTranslate.source": "intellsmi.deepl-translate-deepl" } \ No newline at end of file diff --git a/Aimbot-PPO-Python/Pytorch/AimbotEnv.py b/Aimbot-PPO-Python/Pytorch/AimbotEnv.py index 0ca1a59..1a4baca 100644 --- a/Aimbot-PPO-Python/Pytorch/AimbotEnv.py +++ b/Aimbot-PPO-Python/Pytorch/AimbotEnv.py @@ -1,9 +1,16 @@ import gym import numpy as np - +import uuid +import airecorder from numpy import ndarray from mlagents_envs.base_env import ActionTuple from mlagents_envs.environment import UnityEnvironment +from typing import Tuple, List +from mlagents_envs.side_channel.side_channel import ( + SideChannel, + IncomingMessage, + OutgoingMessage, +) class Aimbot(gym.Env): @@ -61,7 +68,7 @@ class Aimbot(gym.Env): # agents number self.unity_agent_num = len(self.unity_agent_IDS) - def reset(self): + def reset(self)->Tuple[np.ndarray, List, List]: """reset enviroment and get observations Returns: @@ -69,7 +76,7 @@ class Aimbot(gym.Env): """ # reset env self.env.reset() - nextState, reward, done = self.getSteps() + nextState, reward, done = self.get_steps() return nextState, reward, done # TODO: @@ -80,7 +87,7 @@ class Aimbot(gym.Env): def step( self, actions: ndarray, - ): + )->Tuple[np.ndarray, List, List]: """change ations list to ActionTuple then send it to enviroment Args: @@ -114,10 +121,10 @@ class Aimbot(gym.Env): self.env.set_actions(behavior_name=self.unity_beha_name, action=thisActionTuple) self.env.step() # get nextState & reward & done after this action - nextStates, rewards, dones = self.getSteps() + nextStates, rewards, dones = self.get_steps() return nextStates, rewards, dones - def getSteps(self): + def get_steps(self)->Tuple[np.ndarray, List, List]: """get enviroment now observations. 
Include State, Reward, Done @@ -127,28 +134,92 @@ class Aimbot(gym.Env): ndarray: nextState, reward, done """ # get nextState & reward & done - decisionSteps, terminalSteps = self.env.get_steps(self.unity_beha_name) - nextStates = [] + decision_steps, terminal_steps = self.env.get_steps(self.unity_beha_name) + next_states = [] dones = [] rewards = [] - for thisAgentID in self.unity_agent_IDS: + for this_agent_ID in self.unity_agent_IDS: # while Episode over agentID will both in decisionSteps and terminalSteps. # avoid redundant state and reward, # use agentExist toggle to check if agent is already exist. - agentExist = False + agent_exist = False # game done - if thisAgentID in terminalSteps: - nextStates.append(terminalSteps[thisAgentID].obs[0]) + if this_agent_ID in terminal_steps: + next_states.append(terminal_steps[this_agent_ID].obs[0]) dones.append(True) - rewards.append(terminalSteps[thisAgentID].reward) - agentExist = True + rewards.append(terminal_steps[this_agent_ID].reward) + agent_exist = True # game not over yet and agent not in terminalSteps - if (thisAgentID in decisionSteps) and (not agentExist): - nextStates.append(decisionSteps[thisAgentID].obs[0]) + if (this_agent_ID in decision_steps) and (not agent_exist): + next_states.append(decision_steps[this_agent_ID].obs[0]) dones.append(False) - rewards.append(decisionSteps[thisAgentID].reward) + rewards.append(decision_steps[this_agent_ID].reward) - return np.asarray(nextStates), rewards, dones + return np.asarray(next_states), rewards, dones def close(self): self.env.close() + +class AimbotSideChannel(SideChannel): + def __init__(self, channel_id: uuid.UUID) -> None: + super().__init__(channel_id) + + def on_message_received(self, msg: IncomingMessage) -> None: + """ + Note: We must implement this method of the SideChannel interface to + receive messages from Unity + Message will be sent like this: + "Warning|Message1|Message2|Message3" or + "Error|Message1|Message2|Message3" + """ + this_message = msg.read_string() + this_result = this_message.split("|") + if(this_result[0] == "result"): + airecorder.total_rounds[this_result[1]]+=1 + if(this_result[2] == "Win"): + airecorder.win_rounds[this_result[1]]+=1 + #print(TotalRounds) + #print(WinRounds) + elif(this_result[0] == "Error"): + print(this_message) + # # while Message type is Warning + # if(thisResult[0] == "Warning"): + # # while Message1 is result means one game is over + # if (thisResult[1] == "Result"): + # TotalRounds[thisResult[2]]+=1 + # # while Message3 is Win means this agent win this game + # if(thisResult[3] == "Win"): + # WinRounds[thisResult[2]]+=1 + # # while Message1 is GameState means this game is just start + # # and tell python which game mode is + # elif (thisResult[1] == "GameState"): + # SCrecieved = 1 + # # while Message type is Error + # elif(thisResult[0] == "Error"): + # print(thisMessage) + # 发送函数 + def send_string(self, data: str) -> None: + # send a string toC# + msg = OutgoingMessage() + msg.write_string(data) + super().queue_message_to_send(msg) + + def send_bool(self, data: bool) -> None: + msg = OutgoingMessage() + msg.write_bool(data) + super().queue_message_to_send(msg) + + def send_int(self, data: int) -> None: + msg = OutgoingMessage() + msg.write_int32(data) + super().queue_message_to_send(msg) + + def send_float(self, data: float) -> None: + msg = OutgoingMessage() + msg.write_float32(data) + super().queue_message_to_send(msg) + + def send_float_list(self, data: List[float]) -> None: + msg = OutgoingMessage() + 
msg.write_float32_list(data) + super().queue_message_to_send(msg) \ No newline at end of file diff --git a/Aimbot-PPO-Python/Pytorch/AimBotEnv-old.py b/Aimbot-PPO-Python/Pytorch/Archive/AimBotEnv-old.py similarity index 100% rename from Aimbot-PPO-Python/Pytorch/AimBotEnv-old.py rename to Aimbot-PPO-Python/Pytorch/Archive/AimBotEnv-old.py diff --git a/Aimbot-PPO-Python/Pytorch/graph.py b/Aimbot-PPO-Python/Pytorch/Archive/graph.py similarity index 100% rename from Aimbot-PPO-Python/Pytorch/graph.py rename to Aimbot-PPO-Python/Pytorch/Archive/graph.py diff --git a/Aimbot-PPO-Python/Pytorch/ppo.py b/Aimbot-PPO-Python/Pytorch/Archive/ppo.py similarity index 100% rename from Aimbot-PPO-Python/Pytorch/ppo.py rename to Aimbot-PPO-Python/Pytorch/Archive/ppo.py diff --git a/Aimbot-PPO-Python/Pytorch/test2.ipynb b/Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb similarity index 89% rename from Aimbot-PPO-Python/Pytorch/test2.ipynb rename to Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb index 7cd4d47..caa3aaa 100644 --- a/Aimbot-PPO-Python/Pytorch/test2.ipynb +++ b/Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb @@ -141,6 +141,63 @@ "asd.func()\n", "print(asd.outa) # 输出 100" ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "usage: ipykernel_launcher.py [-h] [--seed SEED]\n", + "ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme=\"hmac-sha256\" --Session.key=b\"46ef9317-59fb-4ab6-ae4e-6b35744fc423\" --shell=9002 --transport=\"tcp\" --iopub=9004 --f=c:\\Users\\UCUNI\\AppData\\Roaming\\jupyter\\runtime\\kernel-v2-311926K1uko38tdWb.json\n" + ] + }, + { + "ename": "SystemExit", + "evalue": "2", + "output_type": "error", + "traceback": [ + "An exception has occurred, use %tb to see the full traceback.\n", + "\u001b[1;31mSystemExit\u001b[0m\u001b[1;31m:\u001b[0m 2\n" + ] + } + ], + "source": [ + "import argparse\n", + "\n", + "def parse_args():\n", + " parser = argparse.ArgumentParser()\n", + " parser.add_argument(\"--seed\", type=int, default=11,\n", + " help=\"seed of the experiment\")\n", + " args = parser.parse_args()\n", + " return args\n", + "\n", + "arggg = parse_args()\n", + "print(type(arggg))" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1.2, 3.2)\n", + "1.2\n" + ] + } + ], + "source": [ + "aaa = (1.2,3.2)\n", + "print(aaa)\n", + "print(aaa[0])" + ] } ], "metadata": { diff --git a/Aimbot-PPO-Python/Pytorch/testEnv.py b/Aimbot-PPO-Python/Pytorch/Archive/testEnv.py similarity index 100% rename from Aimbot-PPO-Python/Pytorch/testEnv.py rename to Aimbot-PPO-Python/Pytorch/Archive/testEnv.py diff --git a/Aimbot-PPO-Python/Pytorch/testarea.ipynb b/Aimbot-PPO-Python/Pytorch/Archive/testarea.ipynb similarity index 100% rename from Aimbot-PPO-Python/Pytorch/testarea.ipynb rename to Aimbot-PPO-Python/Pytorch/Archive/testarea.ipynb diff --git a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py index 97b6ab4..9e2e95e 100644 --- a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py +++ b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py @@ -10,16 +10,15 @@ import atexit from aimbotEnv import Aimbot +from aimbotEnv import AimbotSideChannel from ppoagent import PPOAgent -from ppoagent import GAE -from ppoagent import AimbotSideChannel from airecorder import WandbRecorder +from aimemory import PPOMem 
+from aimemory import Targets from enum import Enum from distutils.util import strtobool -bestReward = -1 - -SCrecieved = 0 +best_reward = -1 DEFAULT_SEED = 9331 ENV_PATH = "../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv" @@ -29,8 +28,8 @@ WORKER_ID = 1 BASE_PORT = 1000 # tensorboard names -game_name = "Aimbot_Target_Hybrid_PMNN_V3" -game_type = "Mix_Verification" +GAME_NAME = "Aimbot_Target_Hybrid_PMNN_V3" +GAME_TYPE = "Mix_Verification" # max round steps per agent is 2500/Decision_period, 25 seconds # !!!check every parameters before run!!! @@ -61,13 +60,6 @@ WANDB_TACK = False LOAD_DIR = None #LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt" -# public data -class Targets(Enum): - Free = 0 - Go = 1 - Attack = 2 - Defence = 3 - Num = 4 TARGET_STATE_SIZE = 6 INAREA_STATE_SIZE = 1 TIME_STATE_SIZE = 1 @@ -159,21 +151,6 @@ def parse_args(): return args -def broadCastEndReward(rewardBF:list,remainTime:float): - thisRewardBF = rewardBF - if (rewardBF[-1]<=-500): - # print("Lose DO NOT BROAD CAST",rewardBF[-1]) - thisRewardBF[-1] = rewardBF[-1]-BASE_LOSEREWARD - elif (rewardBF[-1]>=500): - # print("Win! Broadcast reward!",rewardBF[-1]) - print(sum(thisRewardBF)/len(thisRewardBF)) - thisRewardBF[-1] = rewardBF[-1]-BASE_WINREWARD - thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*args.result_broadcast_ratio)).tolist() - else: - print("!!!!!DIDNT GET RESULT REWARD!!!!!!",rewardBF[-1]) - return torch.Tensor(thisRewardBF).to(device) - - if __name__ == "__main__": args = parse_args() random.seed(args.seed) @@ -183,18 +160,20 @@ if __name__ == "__main__": device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") # Initialize environment anget optimizer - aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID); - env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimBotsideChannel]) + aimbot_sidechannel = AimbotSideChannel(SIDE_CHANNEL_UUID); + env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimbot_sidechannel]) if args.load_dir is None: agent = PPOAgent( env = env, - trainAgent=args.train, - targetNum=TARGETNUM, + this_args=args, + train_agent=args.train, + target_num=TARGETNUM, target_state_size= TARGET_STATE_SIZE, time_state_size=TIME_STATE_SIZE, gun_state_size=GUN_STATE_SIZE, my_state_size=MY_STATE_SIZE, total_t_size=TOTAL_T_SIZE, + device=device, ).to(device) else: agent = torch.load(args.load_dir) @@ -210,8 +189,8 @@ if __name__ == "__main__": optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5) # Tensorboard and WandB Recorder - run_name = f"{game_type}_{args.seed}_{int(time.time())}" - wdb_recorder = WandbRecorder(game_name, game_type, run_name, args) + run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}" + wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args) @atexit.register def save_model(): @@ -219,60 +198,49 @@ if __name__ == "__main__": env.close() if args.save_model: # save model while exit - saveDir = "../PPO-Model/"+ run_name + "_last.pt" - torch.save(agent, saveDir) - print("save model to " + saveDir) - - # Trajectory Buffer - ob_bf = [[] for i in range(env.unity_agent_num)] - act_bf = [[] for i in range(env.unity_agent_num)] - dis_logprobs_bf = [[] for i in range(env.unity_agent_num)] - con_logprobs_bf = [[] for i in range(env.unity_agent_num)] - rewards_bf = [[] for i in range(env.unity_agent_num)] - dones_bf = [[] for i in range(env.unity_agent_num)] - values_bf 
= [[] for i in range(env.unity_agent_num)] + save_dir = "../PPO-Model/"+ run_name + "_last.pt" + torch.save(agent, save_dir) + print("save model to " + save_dir) # start the game total_update_step = using_targets_num * args.total_timesteps // args.datasetSize target_steps = [0 for i in range(TARGETNUM)] start_time = time.time() state, _, done = env.reset() - # state = torch.Tensor(next_obs).to(device) - # next_done = torch.zeros(env.unity_agent_num).to(device) - # initialize empty training datasets - obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_observation_size) - actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_action_size) - dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) - con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) - rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) - values = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) - advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) - returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) + # initialize AI memories + ppo_memories = PPOMem( + env = env, + device = device, + args=args, + target_num = TARGETNUM, + target_state_size = TARGET_STATE_SIZE, + base_lose_reward = BASE_LOSEREWARD, + base_win_reward = BASE_WINREWARD, + ) for total_steps in range(total_update_step): # discunt learning rate, while step == total_update_step lr will be 0 if args.annealLR: - finalRatio = TARGET_LEARNING_RATE/args.lr + final_lr_ratio = TARGET_LEARNING_RATE/args.lr frac = 1.0 - ((total_steps + 1.0) / total_update_step) - lrnow = frac * args.lr - optimizer.param_groups[0]["lr"] = lrnow + lr_now = frac * args.lr + optimizer.param_groups[0]["lr"] = lr_now else: - lrnow = args.lr - print("new episode",total_steps,"learning rate = ",lrnow) + lr_now = args.lr + print("new episode",total_steps,"learning rate = ",lr_now) # MAIN LOOP: run agent in environment step = 0 training = False - trainQueue = [] + train_queue = [] last_reward = [0.for i in range(env.unity_agent_num)] while True: if step % args.decision_period == 0: step += 1 # Choose action by agent - with torch.no_grad(): # predict actions action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value( @@ -289,61 +257,27 @@ if __name__ == "__main__": next_state, reward, next_done = env.step(action_cpu) # save memories - for i in range(env.unity_agent_num): - # save memories to buffers - ob_bf[i].append(state[i]) - act_bf[i].append(action_cpu[i]) - dis_logprobs_bf[i].append(dis_logprob_cpu[i]) - con_logprobs_bf[i].append(con_logprob_cpu[i]) - rewards_bf[i].append(reward[i]+last_reward[i]) - dones_bf[i].append(done[i]) - values_bf[i].append(value_cpu[i]) - remainTime = state[i,TARGET_STATE_SIZE] - if next_done[i] == True: - # finished a round, send finished memories to training datasets - # compute advantage and discounted reward - #print(i,"over") - roundTargetType = int(state[i,0]) - thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime) - adv, rt = GAE( - agent, - args, - thisRewardsTensor, - torch.Tensor(dones_bf[i]).to(device), - torch.tensor(values_bf[i]).to(device), - torch.tensor(next_state[i]).to(device).unsqueeze(0), - torch.Tensor([next_done[i]]).to(device), - device, - ) - # send memories to training datasets - obs[roundTargetType] = torch.cat((obs[roundTargetType], 
torch.tensor(ob_bf[i]).to(device)), 0) - actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0) - dis_logprobs[roundTargetType] = torch.cat( - (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0 - ) - con_logprobs[roundTargetType] = torch.cat( - (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0 - ) - rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0) - values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0) - advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0) - returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0) - - # clear buffers - ob_bf[i] = [] - act_bf[i] = [] - dis_logprobs_bf[i] = [] - con_logprobs_bf[i] = [] - rewards_bf[i] = [] - dones_bf[i] = [] - values_bf[i] = [] - print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}") + ppo_memories.save_memories( + now_step = step, + agent = agent, + state = state, + action_cpu = action_cpu, + dis_logprob_cpu = dis_logprob_cpu, + con_logprob_cpu = con_logprob_cpu, + reward = reward, + done = done, + value_cpu = value_cpu, + last_reward = last_reward, + next_done = next_done, + next_state=next_state, + ) + # check if any training dataset is full and ready to train for i in range(TARGETNUM): - if obs[i].size()[0] >= args.datasetSize: + if ppo_memories.obs[i].size()[0] >= args.datasetSize: # start train NN - trainQueue.append(i) - if(len(trainQueue)>0): + train_queue.append(i) + if(len(train_queue)>0): break state, done = next_state, next_done else: @@ -351,76 +285,40 @@ if __name__ == "__main__": # skip this step use last predict action next_state, reward, next_done = env.step(action_cpu) # save memories - for i in range(env.unity_agent_num): - if next_done[i] == True: - #print(i,"over???") - # save memories to buffers - ob_bf[i].append(state[i]) - act_bf[i].append(action_cpu[i]) - dis_logprobs_bf[i].append(dis_logprob_cpu[i]) - con_logprobs_bf[i].append(con_logprob_cpu[i]) - rewards_bf[i].append(reward[i]) - dones_bf[i].append(done[i]) - values_bf[i].append(value_cpu[i]) - remainTime = state[i,TARGET_STATE_SIZE] - # finished a round, send finished memories to training datasets - # compute advantage and discounted reward - roundTargetType = int(state[i,0]) - thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime) - adv, rt = GAE( - agent, - args, - thisRewardsTensor, - torch.Tensor(dones_bf[i]).to(device), - torch.tensor(values_bf[i]).to(device), - torch.Tensor(next_state[i]).to(device).unsqueeze(dim = 0), - torch.Tensor([next_done[i]]).to(device), - device - ) - # send memories to training datasets - obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0) - actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0) - dis_logprobs[roundTargetType] = torch.cat( - (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0 - ) - con_logprobs[roundTargetType] = torch.cat( - (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0 - ) - rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0) - values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0) - advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0) - 
returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0) - - # clear buffers - ob_bf[i] = [] - act_bf[i] = [] - dis_logprobs_bf[i] = [] - con_logprobs_bf[i] = [] - rewards_bf[i] = [] - dones_bf[i] = [] - values_bf[i] = [] - print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}") + ppo_memories.save_memories( + now_step = step, + agent = agent, + state = state, + action_cpu = action_cpu, + dis_logprob_cpu = dis_logprob_cpu, + con_logprob_cpu = con_logprob_cpu, + reward = reward, + done = done, + value_cpu = value_cpu, + last_reward = last_reward, + next_done = next_done, + next_state=next_state, + ) state = next_state last_reward = reward - i += 1 if args.train: # train mode on - meanRewardList = [] # for WANDB + mean_reward_list = [] # for WANDB # loop all tarining queue - for thisT in trainQueue: + for thisT in train_queue: # sart time - startTime = time.time() + start_time = time.time() target_steps[thisT]+=1 # flatten the batch - b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape) - b_dis_logprobs = dis_logprobs[thisT].reshape(-1) - b_con_logprobs = con_logprobs[thisT].reshape(-1) - b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,)) - b_advantages = advantages[thisT].reshape(-1) - b_returns = returns[thisT].reshape(-1) - b_values = values[thisT].reshape(-1) + b_obs = ppo_memories.obs[thisT].reshape((-1,) + env.unity_observation_shape) + b_dis_logprobs = ppo_memories.dis_logprobs[thisT].reshape(-1) + b_con_logprobs = ppo_memories.con_logprobs[thisT].reshape(-1) + b_actions = ppo_memories.actions[thisT].reshape((-1,) + (env.unity_action_size,)) + b_advantages = ppo_memories.advantages[thisT].reshape(-1) + b_returns = ppo_memories.returns[thisT].reshape(-1) + b_values = ppo_memories.values[thisT].reshape(-1) b_size = b_obs.size()[0] # Optimizing the policy and value network b_inds = np.arange(b_size) @@ -529,19 +427,12 @@ if __name__ == "__main__": """ # record mean reward before clear history print("done") - targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy()) - meanRewardList.append(targetRewardMean) + targetRewardMean = np.mean(ppo_memories.rewards[thisT].to("cpu").detach().numpy().copy()) + mean_reward_list.append(targetRewardMean) targetName = Targets(thisT).name # clear this target trainning set buffer - obs[thisT] = torch.tensor([]).to(device) - actions[thisT] = torch.tensor([]).to(device) - dis_logprobs[thisT] = torch.tensor([]).to(device) - con_logprobs[thisT] = torch.tensor([]).to(device) - rewards[thisT] = torch.tensor([]).to(device) - values[thisT] = torch.tensor([]).to(device) - advantages[thisT] = torch.tensor([]).to(device) - returns[thisT] = torch.tensor([]).to(device) + ppo_memories.clear_training_datasets(thisT) # record rewards for plotting purposes wdb_recorder.add_target_scalar( @@ -556,7 +447,7 @@ if __name__ == "__main__": target_steps, ) print(f"episode over Target{targetName} mean reward:", targetRewardMean) - TotalRewardMean = np.mean(meanRewardList) + TotalRewardMean = np.mean(mean_reward_list) wdb_recorder.add_global_scalar( TotalRewardMean, optimizer.param_groups[0]["lr"], @@ -565,35 +456,29 @@ if __name__ == "__main__": # print cost time as seconds print("cost time:", time.time() - start_time) # New Record! 
- if TotalRewardMean > bestReward and args.save_model: - bestReward = targetRewardMean + if TotalRewardMean > best_reward and args.save_model: + best_reward = targetRewardMean saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) + ".pt" torch.save(agent, saveDir) else: # train mode off - meanRewardList = [] # for WANDB + mean_reward_list = [] # for WANDB # while not in training mode, clear the buffer - for thisT in trainQueue: + for thisT in train_queue: target_steps[thisT]+=1 targetName = Targets(thisT).name - targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy()) - meanRewardList.append(targetRewardMean) + targetRewardMean = np.mean(ppo_memories.rewards[thisT].to("cpu").detach().numpy().copy()) + mean_reward_list.append(targetRewardMean) print(target_steps[thisT]) - obs[thisT] = torch.tensor([]).to(device) - actions[thisT] = torch.tensor([]).to(device) - dis_logprobs[thisT] = torch.tensor([]).to(device) - con_logprobs[thisT] = torch.tensor([]).to(device) - rewards[thisT] = torch.tensor([]).to(device) - values[thisT] = torch.tensor([]).to(device) - advantages[thisT] = torch.tensor([]).to(device) - returns[thisT] = torch.tensor([]).to(device) + # clear this target trainning set buffer + ppo_memories.clear_training_datasets(thisT) # record rewards for plotting purposes wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT]) wdb_recorder.add_win_ratio(targetName,target_steps[thisT]) print(f"episode over Target{targetName} mean reward:", targetRewardMean) - TotalRewardMean = np.mean(meanRewardList) + TotalRewardMean = np.mean(mean_reward_list) wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps) saveDir = "../PPO-Model/"+ run_name + "_last.pt" diff --git a/Aimbot-PPO-Python/Pytorch/aimemory.py b/Aimbot-PPO-Python/Pytorch/aimemory.py new file mode 100644 index 0000000..9751c85 --- /dev/null +++ b/Aimbot-PPO-Python/Pytorch/aimemory.py @@ -0,0 +1,146 @@ +import torch +import numpy as np +import argparse +from aimbotEnv import Aimbot +from ppoagent import PPOAgent +from enum import Enum + +# public data +class Targets(Enum): + Free = 0 + Go = 1 + Attack = 2 + Defence = 3 + Num = 4 + +class PPOMem: + def __init__( + self, + env: Aimbot, + args: argparse.Namespace, + device: torch.device, + target_num: int, + target_state_size: int, + base_lose_reward: int, + base_win_reward: int, + ) -> None: + self.data_set_size = args.datasetSize + self.result_broadcast_ratio = args.result_broadcast_ratio + self.decision_period = args.decision_period + self.unity_agent_num = env.unity_agent_num + + self.base_lose_reward = base_lose_reward + self.base_win_reward = base_win_reward + self.target_state_size = target_state_size + self.device = device + + # Trajectory Buffer + self.ob_bf = [[] for i in range(env.unity_agent_num)] + self.act_bf = [[] for i in range(env.unity_agent_num)] + self.dis_logprobs_bf = [[] for i in range(env.unity_agent_num)] + self.con_logprobs_bf = [[] for i in range(env.unity_agent_num)] + self.rewards_bf = [[] for i in range(env.unity_agent_num)] + self.dones_bf = [[] for i in range(env.unity_agent_num)] + self.values_bf = [[] for i in range(env.unity_agent_num)] + + # initialize empty training datasets + self.obs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,env.unity_observation_size) + self.actions = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,env.unity_action_size) + self.dis_logprobs = [torch.tensor([]).to(device) 
+        self.con_logprobs = [torch.tensor([]).to(device) for i in range(target_num)]  # (TARGETNUM,n,1)
+        self.rewards = [torch.tensor([]).to(device) for i in range(target_num)]  # (TARGETNUM,n,1)
+        self.values = [torch.tensor([]).to(device) for i in range(target_num)]  # (TARGETNUM,n,1)
+        self.advantages = [torch.tensor([]).to(device) for i in range(target_num)]  # (TARGETNUM,n,1)
+        self.returns = [torch.tensor([]).to(device) for i in range(target_num)]  # (TARGETNUM,n,1)
+
+    def broad_cast_end_reward(self, rewardBF: list, remainTime: float) -> torch.Tensor:
+        thisRewardBF = rewardBF.copy()
+        if rewardBF[-1] <= -500:
+            # print("Lose DO NOT BROAD CAST",rewardBF[-1])
+            thisRewardBF[-1] = rewardBF[-1] - self.base_lose_reward
+        elif rewardBF[-1] >= 500:
+            # print("Win! Broadcast reward!",rewardBF[-1])
+            print(sum(thisRewardBF) / len(thisRewardBF))
+            thisRewardBF[-1] = rewardBF[-1] - self.base_win_reward
+            thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * self.result_broadcast_ratio)).tolist()
+        else:
+            print("!!!!!DIDNT GET RESULT REWARD!!!!!!", rewardBF[-1])
+        return torch.Tensor(thisRewardBF).to(self.device)
+
+    def save_memories(
+        self,
+        now_step: int,
+        agent: PPOAgent,
+        state: np.ndarray,
+        action_cpu: np.ndarray,
+        dis_logprob_cpu: np.ndarray,
+        con_logprob_cpu: np.ndarray,
+        reward: list,
+        done: list,
+        value_cpu: np.ndarray,
+        last_reward: list,
+        next_done: list,
+        next_state: np.ndarray,
+    ):
+        for i in range(self.unity_agent_num):
+            if now_step % self.decision_period == 0 or next_done[i] == True:
+                # only on decision period or finished a round, save memories to buffer
+                self.ob_bf[i].append(state[i])
+                self.act_bf[i].append(action_cpu[i])
+                self.dis_logprobs_bf[i].append(dis_logprob_cpu[i])
+                self.con_logprobs_bf[i].append(con_logprob_cpu[i])
+                self.dones_bf[i].append(done[i])
+                self.values_bf[i].append(value_cpu[i])
+                if now_step % self.decision_period == 0:
+                    # on decision period, add last skipped round's reward
+                    self.rewards_bf[i].append(reward[i] + last_reward[i])
+                else:
+                    # not on decision period, only add this round's reward
+                    self.rewards_bf[i].append(reward[i])
+                if next_done[i] == True:
+                    # finished a round, send finished memories to training datasets
+                    # compute advantage and discounted reward
+                    remainTime = state[i, self.target_state_size]
+                    roundTargetType = int(state[i, 0])
+                    thisRewardsTensor = self.broad_cast_end_reward(self.rewards_bf[i], remainTime)
+                    adv, rt = agent.gae(
+                        rewards=thisRewardsTensor,
+                        dones=torch.Tensor(self.dones_bf[i]).to(self.device),
+                        values=torch.tensor(self.values_bf[i]).to(self.device),
+                        next_obs=torch.tensor(next_state[i]).to(self.device).unsqueeze(0),
+                        next_done=torch.Tensor([next_done[i]]).to(self.device),
+                    )
+                    # send memories to training datasets
+                    self.obs[roundTargetType] = torch.cat((self.obs[roundTargetType], torch.tensor(self.ob_bf[i]).to(self.device)), 0)
+                    self.actions[roundTargetType] = torch.cat((self.actions[roundTargetType], torch.tensor(self.act_bf[i]).to(self.device)), 0)
+                    self.dis_logprobs[roundTargetType] = torch.cat((self.dis_logprobs[roundTargetType], torch.tensor(self.dis_logprobs_bf[i]).to(self.device)), 0)
+                    self.con_logprobs[roundTargetType] = torch.cat((self.con_logprobs[roundTargetType], torch.tensor(self.con_logprobs_bf[i]).to(self.device)), 0)
+                    self.rewards[roundTargetType] = torch.cat((self.rewards[roundTargetType], thisRewardsTensor), 0)
+                    self.values[roundTargetType] = torch.cat((self.values[roundTargetType], torch.tensor(self.values_bf[i]).to(self.device)), 0)
+                    self.advantages[roundTargetType] = torch.cat((self.advantages[roundTargetType], adv), 0)
+                    self.returns[roundTargetType] = torch.cat((self.returns[roundTargetType], rt), 0)
+
+                    # clear buffers
+                    self.clear_buffers(i)
+                    print(f"train dataset {Targets(roundTargetType).name} added:{self.obs[roundTargetType].size()[0]}/{self.data_set_size}")
+
+    def clear_buffers(self,ind:int):
+        # clear buffers
+        self.ob_bf[ind] = []
+        self.act_bf[ind] = []
+        self.dis_logprobs_bf[ind] = []
+        self.con_logprobs_bf[ind] = []
+        self.rewards_bf[ind] = []
+        self.dones_bf[ind] = []
+        self.values_bf[ind] = []
+
+    def clear_training_datasets(self,ind:int):
+        # clear training datasets
+        self.obs[ind] = torch.tensor([]).to(self.device)
+        self.actions[ind] = torch.tensor([]).to(self.device)
+        self.dis_logprobs[ind] = torch.tensor([]).to(self.device)
+        self.con_logprobs[ind] = torch.tensor([]).to(self.device)
+        self.rewards[ind] = torch.tensor([]).to(self.device)
+        self.values[ind] = torch.tensor([]).to(self.device)
+        self.advantages[ind] = torch.tensor([]).to(self.device)
+        self.returns[ind] = torch.tensor([]).to(self.device)
\ No newline at end of file
diff --git a/Aimbot-PPO-Python/Pytorch/ppoagent.py b/Aimbot-PPO-Python/Pytorch/ppoagent.py
index d13bae0..917fc3e 100644
--- a/Aimbot-PPO-Python/Pytorch/ppoagent.py
+++ b/Aimbot-PPO-Python/Pytorch/ppoagent.py
@@ -1,17 +1,11 @@
 import numpy as np
 import torch
-import uuid
-import airecorder
+import argparse
+
 from torch import nn
-from typing import List
 from aimbotEnv import Aimbot
 from torch.distributions.normal import Normal
 from torch.distributions.categorical import Categorical
-from mlagents_envs.side_channel.side_channel import (
-    SideChannel,
-    IncomingMessage,
-    OutgoingMessage,
-)
 
 
 def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
@@ -24,17 +18,21 @@ class PPOAgent(nn.Module):
     def __init__(
         self,
         env: Aimbot,
-        trainAgent: bool,
-        targetNum: int,
+        this_args:argparse.Namespace,
+        train_agent: bool,
+        target_num: int,
         target_state_size: int,
         time_state_size: int,
         gun_state_size: int,
         my_state_size: int,
         total_t_size: int,
+        device: torch.device,
     ):
         super(PPOAgent, self).__init__()
-        self.trainAgent = trainAgent
-        self.targetNum = targetNum
+        self.device = device
+        self.args = this_args
+        self.trainAgent = train_agent
+        self.targetNum = target_num
         self.stateSize = env.unity_observation_shape[0]
         self.agentNum = env.unity_agent_num
         self.targetSize = target_state_size
@@ -56,28 +54,28 @@ class PPOAgent(nn.Module):
         self.targetNetworks = nn.ModuleList(
             [
                 nn.Sequential(layer_init(nn.Linear(self.nonRaySize, 100)), nn.LeakyReLU())
-                for i in range(targetNum)
+                for i in range(target_num)
             ]
         )
         self.middleNetworks = nn.ModuleList(
             [
                 nn.Sequential(layer_init(nn.Linear(300, 200)), nn.LeakyReLU())
-                for i in range(targetNum)
+                for i in range(target_num)
             ]
         )
         self.actor_dis = nn.ModuleList(
-            [layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(targetNum)]
+            [layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(target_num)]
         )
         self.actor_mean = nn.ModuleList(
-            [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(targetNum)]
+            [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(target_num)]
         )
         # self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
         # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
         self.actor_logstd = nn.ParameterList(
-            [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(targetNum)]
+            [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(target_num)]
         )  # nn.Parameter(torch.zeros(1, self.continuous_size))
         self.critic = nn.ModuleList(
-            [layer_init(nn.Linear(200, 1), std=1) for i in range(targetNum)]
+            [layer_init(nn.Linear(200, 1), std=1) for i in range(target_num)]
         )
 
     def get_value(self, state: torch.Tensor):
@@ -165,103 +163,42 @@ class PPOAgent(nn.Module):
             criticV,
         )
 
-
-def GAE(agent, args, rewards, dones, values, next_obs, next_done, device):
-    # GAE
-    with torch.no_grad():
-        next_value = agent.get_value(next_obs).reshape(1, -1)
-        data_size = rewards.size()[0]
-        if args.gae:
-            advantages = torch.zeros_like(rewards).to(device)
-            lastgaelam = 0
-            for t in reversed(range(data_size)):
-                if t == data_size - 1:
-                    nextnonterminal = 1.0 - next_done
-                    nextvalues = next_value
-                else:
-                    nextnonterminal = 1.0 - dones[t + 1]
-                    nextvalues = values[t + 1]
-                delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
-                advantages[t] = lastgaelam = (
-                    delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
-                )
-            returns = advantages + values
-        else:
-            returns = torch.zeros_like(rewards).to(device)
-            for t in reversed(range(data_size)):
-                if t == data_size - 1:
-                    nextnonterminal = 1.0 - next_done
-                    next_return = next_value
-                else:
-                    nextnonterminal = 1.0 - dones[t + 1]
-                    next_return = returns[t + 1]
-                returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
-            advantages = returns - values
-    return advantages, returns
-
-
-class AimbotSideChannel(SideChannel):
-    def __init__(self, channel_id: uuid.UUID) -> None:
-        super().__init__(channel_id)
-
-    def on_message_received(self, msg: IncomingMessage) -> None:
-        global SCrecieved  # make sure this variable is global
-        """
-        Note: We must implement this method of the SideChannel interface to
-        receive messages from Unity
-        Message will be sent like this:
-            "Warning|Message1|Message2|Message3" or
-            "Error|Message1|Message2|Message3"
-        """
-        thisMessage = msg.read_string()
-        thisResult = thisMessage.split("|")
-        if(thisResult[0] == "result"):
-            airecorder.total_rounds[thisResult[1]]+=1
-            if(thisResult[2] == "Win"):
-                airecorder.win_rounds[thisResult[1]]+=1
-            #print(TotalRounds)
-            #print(WinRounds)
-        elif(thisResult[0] == "Error"):
-            print(thisMessage)
-
-    # # while Message type is Warning
-    # if(thisResult[0] == "Warning"):
-    #     # while Message1 is result means one game is over
-    #     if (thisResult[1] == "Result"):
-    #         TotalRounds[thisResult[2]]+=1
-    #         # while Message3 is Win means this agent win this game
-    #         if(thisResult[3] == "Win"):
-    #             WinRounds[thisResult[2]]+=1
-    #     # while Message1 is GameState means this game is just start
-    #     # and tell python which game mode is
-    #     elif (thisResult[1] == "GameState"):
-    #         SCrecieved = 1
-    # # while Message type is Error
-    # elif(thisResult[0] == "Error"):
-    #     print(thisMessage)
-    # 发送函数
-    def send_string(self, data: str) -> None:
-        # send a string toC#
-        msg = OutgoingMessage()
-        msg.write_string(data)
-        super().queue_message_to_send(msg)
-
-    def send_bool(self, data: bool) -> None:
-        msg = OutgoingMessage()
-        msg.write_bool(data)
-        super().queue_message_to_send(msg)
-
-    def send_int(self, data: int) -> None:
-        msg = OutgoingMessage()
-        msg.write_int32(data)
-        super().queue_message_to_send(msg)
-
-    def send_float(self, data: float) -> None:
-        msg = OutgoingMessage()
-        msg.write_float32(data)
-        super().queue_message_to_send(msg)
-
-    def send_float_list(self, data: List[float]) -> None:
-        msg = OutgoingMessage()
-        msg.write_float32_list(data)
-        super().queue_message_to_send(msg)
\ No newline at end of file
+    def gae(
+        self,
+        rewards: torch.Tensor,
+        dones: torch.Tensor,
+        values: torch.Tensor,
+        next_obs: torch.Tensor,
+        next_done: torch.Tensor,
+    ) -> tuple:
+        # GAE
+        with torch.no_grad():
+            next_value = self.get_value(next_obs).reshape(1, -1)
+            data_size = rewards.size()[0]
+            if self.args.gae:
+                advantages = torch.zeros_like(rewards).to(self.device)
+                last_gae_lam = 0
+                for t in reversed(range(data_size)):
+                    if t == data_size - 1:
+                        nextnonterminal = 1.0 - next_done
+                        next_values = next_value
+                    else:
+                        nextnonterminal = 1.0 - dones[t + 1]
+                        next_values = values[t + 1]
+                    delta = rewards[t] + self.args.gamma * next_values * nextnonterminal - values[t]
+                    advantages[t] = last_gae_lam = (
+                        delta + self.args.gamma * self.args.gaeLambda * nextnonterminal * last_gae_lam
+                    )
+                returns = advantages + values
+            else:
+                returns = torch.zeros_like(rewards).to(self.device)
+                for t in reversed(range(data_size)):
+                    if t == data_size - 1:
+                        nextnonterminal = 1.0 - next_done
+                        next_return = next_value
+                    else:
+                        nextnonterminal = 1.0 - dones[t + 1]
+                        next_return = returns[t + 1]
+                    returns[t] = rewards[t] + self.args.gamma * nextnonterminal * next_return
+                advantages = returns - values
+        return advantages, returns
\ No newline at end of file
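
The gae method added above runs the standard backward Generalized Advantage Estimation recursion over one finished trajectory (when args.gae is enabled), which PPOMem.save_memories calls through agent.gae once an episode ends. The sketch below is a minimal standalone illustration of that same recursion, not part of the patch: the compute_gae helper, its gamma=0.99 and gae_lambda=0.95 defaults, and the toy inputs are assumptions for illustration only; the real values come from parse_args() in MultiNN-PPO.py.

# Minimal standalone sketch (illustration only, not project code): the GAE
# recursion used by PPOAgent.gae, exercised on toy tensors.
import torch

def compute_gae(rewards, dones, values, next_value, next_done, gamma=0.99, gae_lambda=0.95):
    # rewards/dones/values: 1-D tensors over one trajectory; next_value/next_done: bootstrap info
    advantages = torch.zeros_like(rewards)
    last_gae_lam = 0.0
    for t in reversed(range(rewards.size(0))):
        if t == rewards.size(0) - 1:
            nextnonterminal = 1.0 - next_done
            next_values = next_value
        else:
            nextnonterminal = 1.0 - dones[t + 1]
            next_values = values[t + 1]
        # TD error, then the exponentially weighted carry-over (zeroed at episode ends)
        delta = rewards[t] + gamma * next_values * nextnonterminal - values[t]
        advantages[t] = last_gae_lam = delta + gamma * gae_lambda * nextnonterminal * last_gae_lam
    returns = advantages + values
    return advantages, returns

rewards = torch.tensor([1.0, 0.0, 0.0, 5.0])   # assumed toy trajectory
dones   = torch.tensor([0.0, 0.0, 0.0, 1.0])   # episode terminates at the last step
values  = torch.tensor([0.5, 0.4, 0.3, 0.2])   # assumed critic estimates
adv, ret = compute_gae(rewards, dones, values,
                       next_value=torch.tensor(0.0), next_done=torch.tensor(1.0))
print(adv)  # adv[2] == rewards[2] - values[2] because dones[3] == 1 zeroes the bootstrap
print(ret)

A quick sanity check on any implementation of this recursion: wherever the next step is terminal, both the bootstrap term and the GAE carry-over vanish, so the advantage collapses to rewards[t] - values[t], as the printed adv[2] shows for the toy inputs.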