From 4b8ffeac6d3c87d6230ab486ef6228baf8fea5ff Mon Sep 17 00:00:00 2001 From: Koha9 Date: Sun, 4 Dec 2022 08:42:10 +0900 Subject: [PATCH] GAIL V0.1 save point GAIL V0.1 save point todo 1.human action record(GAILMem) debug 2.gail debug --- Aimbot-PPO-Python/Pytorch/GAIL.py | 679 ++++++++++++++++++++++ Aimbot-PPO-Python/Pytorch/GAILMem.py | 176 ++++++ Aimbot-PPO-Python/Pytorch/GAILRecorder.py | 97 ++++ Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py | 1 - Aimbot-PPO-Python/Pytorch/testarea.ipynb | 23 + 5 files changed, 975 insertions(+), 1 deletion(-) create mode 100644 Aimbot-PPO-Python/Pytorch/GAIL.py create mode 100644 Aimbot-PPO-Python/Pytorch/GAILMem.py create mode 100644 Aimbot-PPO-Python/Pytorch/GAILRecorder.py diff --git a/Aimbot-PPO-Python/Pytorch/GAIL.py b/Aimbot-PPO-Python/Pytorch/GAIL.py new file mode 100644 index 0000000..551a378 --- /dev/null +++ b/Aimbot-PPO-Python/Pytorch/GAIL.py @@ -0,0 +1,679 @@ +import os +import argparse +import wandb +import time +import numpy as np +import random +import uuid +import torch +import torch.nn as nn +import torch.optim as optim + +from AimbotEnv import Aimbot +from tqdm import tqdm +from enum import Enum +from torch.distributions.normal import Normal +from torch.distributions.categorical import Categorical +from distutils.util import strtobool +from torch.utils.tensorboard import SummaryWriter +from mlagents_envs.environment import UnityEnvironment +from mlagents_envs.side_channel.side_channel import ( + SideChannel, + IncomingMessage, + OutgoingMessage, +) +from typing import List +from GAILMem import GAILMem + +bestReward = 0 + +useCUDA = True +DEFAULT_SEED = 9331 +ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy/Aimbot-ParallelEnv" +EXPERT_PATH = "NAN" +SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e") +WAND_ENTITY = "koha9" +WORKER_ID = 2 +BASE_PORT = 1001 + +# max round steps per agent is 2500/Decision_period, 25 seconds +# !!!check every parameters before run!!! + +TOTAL_STEPS = 6750000 +BATCH_SIZE = 512 +MAX_TRAINNING_DATASETS = 3000 +DECISION_PERIOD = 1 +LEARNING_RATE = 1e-3 +GAMMA = 0.99 +GAE_LAMBDA = 0.95 +EPOCHS = 4 +CLIP_COEF = 0.1 +POLICY_COEF = 1.0 +ENTROPY_COEF = 0.01 +CRITIC_COEF = 0.5 +TARGET_LEARNING_RATE = 1e-5 +DSCM_ENHANCED = 1 + +ANNEAL_LEARNING_RATE = True +CLIP_VLOSS = True +NORM_ADV = True +TRAIN = True + +WANDB_TACK = True +LOAD_DIR = None +# LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt" + +# public data +class Targets(Enum): + Free = 0 + Go = 1 + Attack = 2 + Defence = 3 + Num = 4 + + +BASE_WINREWARD = 999 +BASE_LOSEREWARD = -999 +TARGETNUM = 4 +ENV_TIMELIMIT = 30 +RESULT_BROADCAST_RATIO = 2 / ENV_TIMELIMIT +TotalRounds = {"Free": 0, "Go": 0, "Attack": 0} +WinRounds = {"Free": 0, "Go": 0, "Attack": 0} +EPS = 1e-8 + +# !!!SPECIAL PARAMETERS!!! 
+# change it while program is finished +using_targets_num = 3 + + +def parse_args(): + # fmt: off + # pytorch and environment parameters + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=DEFAULT_SEED, + help="seed of the experiment") + parser.add_argument("--path", type=str, default=ENV_PATH, + help="enviroment path") + parser.add_argument("--workerID", type=int, default=WORKER_ID, + help="unity worker ID") + parser.add_argument("--baseport", type=int, default=BASE_PORT, + help="port to connect to Unity environment") + parser.add_argument("--lr", type=float, default=LEARNING_RATE, + help="the learning rate of optimizer") + parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, + help="if toggled, cuda will be enabled by default") + parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS, + help="total timesteps of the experiments") + + # model parameters + parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True, + help="Train Model or not") + parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS, + help="training dataset size,start training while dataset collect enough data") + parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE, + help="nimi batch size") + parser.add_argument("--epochs", type=int, default=EPOCHS, + help="the K epochs to update the policy") + parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True, + help="Toggle learning rate annealing for policy and value networks") + parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True, + help="track on the wandb") + parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY, + help="the entity (team) of wandb's project") + parser.add_argument("--load-dir", type=str, default=LOAD_DIR, + help="load model directory") + parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD, + help="the number of steps to run in each environment per policy rollout") + + # GAE loss + parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, + help="Use GAE for advantage computation") + parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True, + help="Toggles advantages normalization") + parser.add_argument("--gamma", type=float, default=GAMMA, + help="the discount factor gamma") + parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA, + help="the lambda for the general advantage estimation") + parser.add_argument("--clip-coef", type=float, default=CLIP_COEF, + help="the surrogate clipping coefficient") + parser.add_argument("--policy-coef", type=float, default=POLICY_COEF, + help="coefficient of the policy") + parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF, + help="coefficient of the entropy") + parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF, + help="coefficient of the value function") + parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True, + help="Toggles whether or not to use a clipped loss for the value function, as per the paper.") + parser.add_argument("--max-grad-norm", type=float, default=0.5, + help="the maximum norm for the gradient clipping") + parser.add_argument("--target-kl", type=float, default=None, + 
help="the target KL divergence threshold") + # fmt: on + args = parser.parse_args() + return args + + +def layer_init(layer, std=np.sqrt(2), bias_const=0.0): + torch.nn.init.orthogonal_(layer.weight, std) + torch.nn.init.constant_(layer.bias, bias_const) + return layer + + +class PPOAgent(nn.Module): + def __init__(self, env: Aimbot, targetNum: int): + super(PPOAgent, self).__init__() + self.targetNum = targetNum + self.discrete_size = env.unity_discrete_size + self.discrete_shape = list(env.unity_discrete_branches) + self.continuous_size = env.unity_continuous_size + + self.network = nn.ModuleList( + [ + nn.Sequential( + layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 300)), + nn.ReLU(), + layer_init(nn.Linear(300, 200)), + nn.ReLU(), + ) + for i in range(targetNum) + ] + ) + self.actor_dis = nn.ModuleList( + [layer_init(nn.Linear(200, self.discrete_size), std=0.01) for i in range(targetNum)] + ) + self.actor_mean = nn.ModuleList( + [layer_init(nn.Linear(200, self.continuous_size), std=0.01) for i in range(targetNum)] + ) + self.actor_logstd = nn.ParameterList( + [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(targetNum)] + ) + self.critic = nn.ModuleList( + [layer_init(nn.Linear(200, 1), std=1) for i in range(targetNum)] + ) + + def get_value(self, state: torch.Tensor): + targets = state[:, 0].to(torch.int32) + hidden = torch.stack([self.network[targets[i]](state[i]) for i in range(targets.size()[0])]) + return torch.stack([self.critic[targets[i]](hidden[i]) for i in range(targets.size()[0])]) + + def get_actions_value(self, state: torch.Tensor, actions=None): + targets = state[:, 0].to(torch.int32) + hidden = torch.stack([self.network[targets[i]](state[i]) for i in range(targets.size()[0])]) + + # discrete + # 递归targets的数量,既agent数来实现根据target不同来选用对应的输出网络计算输出 + dis_logits = torch.stack( + [self.actor_dis[targets[i]](hidden[i]) for i in range(targets.size()[0])] + ) + split_logits = torch.split(dis_logits, self.discrete_shape, dim=1) + multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits] + # continuous + actions_mean = torch.stack( + [self.actor_mean[targets[i]](hidden[i]) for i in range(targets.size()[0])] + ) # self.actor_mean(hidden) + # action_logstd = torch.stack([self.actor_logstd[targets[i]].expand_as(actions_mean) for i in range(targets.size()[0])]) # self.actor_logstd.expand_as(actions_mean) + # print(action_logstd) + action_std = torch.squeeze( + torch.stack( + [torch.exp(self.actor_logstd[targets[i]]) for i in range(targets.size()[0])] + ), + dim=-1, + ) # torch.exp(action_logstd) + con_probs = Normal(actions_mean, action_std) + # critic + criticV = torch.stack( + [self.critic[targets[i]](hidden[i]) for i in range(targets.size()[0])] + ) + + if actions is None: + if TRAIN: + # select actions base on probability distribution model + disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals]) + conAct = con_probs.sample() + actions = torch.cat([disAct.T, conAct], dim=1) + else: + # select actions base on best probability distribution + disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits]) + conAct = actions_mean + actions = torch.cat([disAct.T, conAct], dim=1) + else: + disAct = actions[:, 0 : env.unity_discrete_type].T + conAct = actions[:, env.unity_discrete_type :] + dis_log_prob = torch.stack( + [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)] + ) + dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals]) + + return ( + actions, + 
dis_log_prob.sum(0), + dis_entropy.sum(0), + con_probs.log_prob(conAct).sum(1), + con_probs.entropy().sum(1), + criticV, + ) + + +def GAE(agent, args, rewards, dones, values, next_obs, next_done): + # GAE + with torch.no_grad(): + next_value = agent.get_value(next_obs).reshape(1, -1) + data_size = rewards.size()[0] + if args.gae: + advantages = torch.zeros_like(rewards).to(device) + lastgaelam = 0 + for t in reversed(range(data_size)): + if t == data_size - 1: + nextnonterminal = 1.0 - next_done + nextvalues = next_value + else: + nextnonterminal = 1.0 - dones[t + 1] + nextvalues = values[t + 1] + delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t] + advantages[t] = lastgaelam = ( + delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam + ) + returns = advantages + values + else: + returns = torch.zeros_like(rewards).to(device) + for t in reversed(range(data_size)): + if t == data_size - 1: + nextnonterminal = 1.0 - next_done + next_return = next_value + else: + nextnonterminal = 1.0 - dones[t + 1] + next_return = returns[t + 1] + returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return + advantages = returns - values + return advantages, returns + + +class AimbotSideChannel(SideChannel): + def __init__(self, channel_id: uuid.UUID) -> None: + super().__init__(channel_id) + + def on_message_received(self, msg: IncomingMessage) -> None: + """ + Note: We must implement this method of the SideChannel interface to + receive messages from Unity + """ + thisMessage = msg.read_string() + # print(thisMessage) + thisResult = thisMessage.split("|") + if thisResult[0] == "result": + TotalRounds[thisResult[1]] += 1 + if thisResult[2] == "Win": + WinRounds[thisResult[1]] += 1 + # print(TotalRounds) + # print(WinRounds) + elif thisResult[0] == "Error": + print(thisMessage) + + # 发送函数 + def send_string(self, data: str) -> None: + # send a string toC# + msg = OutgoingMessage() + msg.write_string(data) + super().queue_message_to_send(msg) + + def send_bool(self, data: bool) -> None: + msg = OutgoingMessage() + msg.write_bool(data) + super().queue_message_to_send(msg) + + def send_int(self, data: int) -> None: + msg = OutgoingMessage() + msg.write_int32(data) + super().queue_message_to_send(msg) + + def send_float(self, data: float) -> None: + msg = OutgoingMessage() + msg.write_float32(data) + super().queue_message_to_send(msg) + + def send_float_list(self, data: List[float]) -> None: + msg = OutgoingMessage() + msg.write_float32_list(data) + super().queue_message_to_send(msg) + + +def broadCastEndReward(rewardBF: list, remainTime: float): + thisRewardBF = rewardBF + if rewardBF[-1] <= -500: + # print("Lose DO NOT BROAD CAST",rewardBF[-1]) + thisRewardBF[-1] = rewardBF[-1] - BASE_LOSEREWARD + thisRewardBF = (np.asarray(thisRewardBF)).tolist() + elif rewardBF[-1] >= 500: + # print("Win! 
Broadcast reward!",rewardBF[-1])
+        thisRewardBF[-1] = rewardBF[-1] - BASE_WINREWARD
+        thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * RESULT_BROADCAST_RATIO)).tolist()
+    else:
+        print("!!!!!DIDNT GET RESULT REWARD!!!!!!", rewardBF[-1])
+    return torch.Tensor(thisRewardBF).to(device)
+
+
+class Discriminator(nn.Module):
+    def __init__(self, env: Aimbot, targetNum: int):
+        super(Discriminator, self).__init__()
+        self.targetNum = targetNum
+        # discriminator input is the concatenated (state, action) pair
+        self.trajectory_size = np.array(env.unity_observation_shape).prod() + env.unity_action_size
+
+        self.network = nn.ModuleList(
+            [
+                nn.Sequential(
+                    layer_init(nn.Linear(self.trajectory_size, 300)),
+                    nn.ReLU(),
+                    layer_init(nn.Linear(300, 200)),
+                    nn.ReLU(),
+                )
+                for i in range(targetNum)
+            ]
+        )
+        # sigmoid keeps D(s,a) in (0,1) so the log-based discriminator loss is well defined
+        self.output = nn.ModuleList(
+            [
+                nn.Sequential(layer_init(nn.Linear(200, 1), std=0.01), nn.Sigmoid())
+                for i in range(targetNum)
+            ]
+        )
+
+    def get_value(self, state: torch.Tensor):
+        targets = state[:, 0].to(torch.int32)
+        hidden = torch.stack([self.network[targets[i]](state[i]) for i in range(targets.size()[0])])
+        return torch.stack([self.output[targets[i]](hidden[i]) for i in range(targets.size()[0])])
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    random.seed(DEFAULT_SEED)
+    np.random.seed(DEFAULT_SEED)
+    torch.manual_seed(DEFAULT_SEED)
+
+    device = torch.device("cuda" if torch.cuda.is_available() and useCUDA else "cpu")
+
+    aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID)
+    env = Aimbot(
+        envPath=ENV_PATH, workerID=WORKER_ID, basePort=BASE_PORT, side_channels=[aimBotsideChannel]
+    )
+    if LOAD_DIR is None:
+        agent = PPOAgent(env, TARGETNUM).to(device)
+        discriminator = Discriminator(env, TARGETNUM).to(device)
+    else:
+        print("NOT AVAILABLE")
+        agent = torch.load(LOAD_DIR)
+        print("Load Agent", LOAD_DIR)
+        print(agent.eval())
+
+    # Optimizers
+    PPOoptimizer = optim.Adam(agent.parameters(), lr=LEARNING_RATE, eps=1e-5)
+    DSCMoptimizer = optim.Adam(discriminator.parameters(), lr=LEARNING_RATE, eps=1e-5)
+
+    # Tensorboard and WandB Recorder
+    game_name = "Aimbot_GAIL"
+    game_type = "ORG"
+    run_name = f"{game_name}_{game_type}_{DEFAULT_SEED}_{int(time.time())}"
+    if WANDB_TACK:
+        wandb.init(
+            project=game_name,
+            entity=WAND_ENTITY,
+            sync_tensorboard=True,
+            config=vars(args),
+            name=run_name,
+            monitor_gym=True,
+            save_code=True,
+        )
+    writer = SummaryWriter(f"runs/{run_name}")
+    writer.add_text(
+        "hyperparameters",
+        "|param|value|\n|-|-|\n%s"
+        % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
+    )
+
+    agentMem = GAILMem(TARGETNUM)
+    expertMem = GAILMem(TARGETNUM)
+    expertMem.loadMemFile(EXPERT_PATH)
+
+    # start the game
+    total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
+    target_steps = [0 for i in range(TARGETNUM)]
+    start_time = time.time()
+    state, _, done = env.reset()
+    # Trajectory Buffer
+    ob_bf = [[] for i in range(env.unity_agent_num)]
+    act_bf = [[] for i in range(env.unity_agent_num)]
+    dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
+    con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
+    rewards_bf = [[] for i in range(env.unity_agent_num)]
+    dones_bf = [[] for i in range(env.unity_agent_num)]
+    values_bf = [[] for i in range(env.unity_agent_num)]
+
+    # initialize empty training datasets
+    obs = [
+        torch.tensor([]).to(device) for i in range(TARGETNUM)
+    ]  # (TARGETNUM,n,env.unity_observation_size)
+    actions = [
+        torch.tensor([]).to(device) for i in range(TARGETNUM)
+    ]  # (TARGETNUM,n,env.unity_action_size)
+    dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
+    con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
+    rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
+    values = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
+    dones = [torch.tensor([]).to(device) for i in range(TARGETNUM)]
+
+    next_state_buffer = [[] for i in range(TARGETNUM)]
+    next_done_buffer = [[] for i in range(TARGETNUM)]
+    for total_steps in range(total_update_step):
+        print("new episode")
+        trainQueue = []
+        while True:
+            with torch.no_grad():
+                # predict actions
+                action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
+                    torch.Tensor(state).to(device)
+                )
+                value = value.flatten()
+
+            # variable from GPU to CPU
+            action_cpu = action.cpu().numpy()
+            dis_logprob_cpu = dis_logprob.cpu().numpy()
+            con_logprob_cpu = con_logprob.cpu().numpy()
+            value_cpu = value.cpu().numpy()
+
+            # Environment step
+            next_state, reward, next_done = env.step(action_cpu)
+
+            # save mem
+            for i in range(env.unity_agent_num):
+                # save memories to buffers
+                ob_bf[i].append(state[i])
+                act_bf[i].append(action_cpu[i])
+                dis_logprobs_bf[i].append(dis_logprob_cpu[i])
+                con_logprobs_bf[i].append(con_logprob_cpu[i])
+                rewards_bf[i].append(reward[i])
+                dones_bf[i].append(done[i])
+                values_bf[i].append(value_cpu[i])
+                if next_done[i] == True:
+                    # finished a round, send finished memories to training datasets
+                    thisTarget = int(state[i, 0])
+                    next_state_buffer[thisTarget] = next_state
+                    next_done_buffer[thisTarget] = next_done
+                    obs[thisTarget] = torch.cat(
+                        (obs[thisTarget], torch.tensor(ob_bf[i]).to(device)), 0
+                    )
+                    actions[thisTarget] = torch.cat(
+                        (actions[thisTarget], torch.tensor(act_bf[i]).to(device)), 0
+                    )
+                    dis_logprobs[thisTarget] = torch.cat(
+                        (dis_logprobs[thisTarget], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
+                    )
+                    con_logprobs[thisTarget] = torch.cat(
+                        (con_logprobs[thisTarget], torch.tensor(con_logprobs_bf[i]).to(device)), 0
+                    )
+                    # keep the raw environment reward so per-target reward logging has data
+                    rewards[thisTarget] = torch.cat(
+                        (rewards[thisTarget], torch.tensor(rewards_bf[i]).to(device)), 0
+                    )
+                    values[thisTarget] = torch.cat(
+                        (values[thisTarget], torch.tensor(values_bf[i]).to(device)), 0
+                    )
+                    dones[thisTarget] = torch.cat(
+                        (dones[thisTarget], torch.tensor(dones_bf[i]).to(device)), 0
+                    )
+                    # clear this agent's per-round buffers
+                    ob_bf[i] = []
+                    act_bf[i] = []
+                    dis_logprobs_bf[i] = []
+                    con_logprobs_bf[i] = []
+                    rewards_bf[i] = []
+                    dones_bf[i] = []
+                    values_bf[i] = []
+            for i in range(TARGETNUM):
+                if obs[i].size()[0] >= args.datasetSize:
+                    # start train NN
+                    trainQueue.append(i)
+            if len(trainQueue) > 0:
+                break
+            state, done = next_state, next_done
+
+        if args.train:
+            meanRewardList = []
+            for thisT in trainQueue:
+                target_steps[thisT] += 1
+                # get agent training datasets
+                b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
+                b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
+                b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
+                b_con_logprobs = con_logprobs[thisT].reshape(-1)
+                b_values = values[thisT].reshape(-1)
+                b_dones = dones[thisT].reshape(-1)
+                dataNum = b_obs.size()[0]
+                # get expert training datasets from GAILMem (returned as ndarrays)
+                exp_obs, _, exp_actions, _, _ = expertMem.getRandomSample(dataNum, thisT)
+
+                # trajectory
+                agent_trajectory = torch.cat((b_obs, b_actions), dim=1)
+                exp_trajectory = torch.cat(
+                    (torch.Tensor(exp_obs).to(device), torch.Tensor(exp_actions).to(device)), dim=1
+                )
+
+                # discriminator ACC
+                with torch.no_grad():
+                    thisDSCM_agent_acc = torch.mean(discriminator.get_value(agent_trajectory))
+                    thisDSCM_expt_acc = torch.mean(discriminator.get_value(exp_trajectory))
+                # train discriminator
+                b_inds = np.arange(dataNum)
+                for epoch in range(args.epochs):
+                    np.random.shuffle(b_inds)
+                    # train discriminator
+                    for start in range(0, dataNum, args.minibatchSize):
+                        end = start + args.minibatchSize
+                        mb_inds = b_inds[start:end]
+                        exp_value = discriminator.get_value(exp_trajectory[mb_inds])
+                        agent_value = discriminator.get_value(agent_trajectory[mb_inds])
+                        # DSCM loss function: push D(expert) toward 1 and D(agent) toward 0
+                        exp_loss = torch.log(1.0 - exp_value + EPS)
+                        agent_loss = torch.log(agent_value + EPS)
+                        loss = (exp_loss + agent_loss).mean()
+                        # DSCM backward
+                        DSCMoptimizer.zero_grad()
+                        loss.backward()
+                        nn.utils.clip_grad_norm_(discriminator.parameters(), args.max_grad_norm)
+                        DSCMoptimizer.step()
+                # get discriminator reward: the agent is rewarded for looking like the expert
+                with torch.no_grad():
+                    DSCMReward = discriminator.get_value(agent_trajectory).view(-1) * DSCM_ENHANCED
+                advantages, rts = GAE(
+                    agent,
+                    args,
+                    DSCMReward,
+                    b_dones,
+                    b_values,
+                    torch.Tensor(next_state_buffer[thisT]).to(device),
+                    torch.Tensor(next_done_buffer[thisT]).to(device),
+                )
+                b_advantages = advantages.reshape(-1)
+                b_returns = rts.reshape(-1)
+
+                # train PPO agent
+                for epoch in range(args.epochs):
+                    np.random.shuffle(b_inds)
+                    for start in range(0, dataNum, args.minibatchSize):
+                        end = start + args.minibatchSize
+                        mb_inds = b_inds[start:end]
+                        mb_advantages = b_advantages[mb_inds]
+                        # normalize advantages
+                        if args.norm_adv:
+                            mb_advantages = (mb_advantages - mb_advantages.mean()) / (
+                                mb_advantages.std() + 1e-8
+                            )
+                        (
+                            _,
+                            new_dis_logprob,
+                            dis_entropy,
+                            new_con_logprob,
+                            con_entropy,
+                            newvalue,
+                        ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
+                        # discrete ratio
+                        dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
+                        dis_ratio = dis_logratio.exp()
+                        # continuous ratio
+                        con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
+                        con_ratio = con_logratio.exp()
+                        # discrete Policy loss
+                        dis_pg_loss_orig = -mb_advantages * dis_ratio
+                        dis_pg_loss_clip = -mb_advantages * torch.clamp(
+                            dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
+                        )
+                        dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
+                        # continuous Policy loss
+                        con_pg_loss_orig = -mb_advantages * con_ratio
+                        con_pg_loss_clip = -mb_advantages * torch.clamp(
+                            con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
+                        )
+                        con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
+                        # Value loss
+                        newvalue = newvalue.view(-1)
+                        if args.clip_vloss:
+                            v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
+                            v_clipped = b_values[mb_inds] + torch.clamp(
+                                newvalue - b_values[mb_inds],
+                                -args.clip_coef,
+                                args.clip_coef,
+                            )
+                            v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
+                            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
+                            v_loss = 0.5 * v_loss_max.mean()
+                        else:
+                            v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
+                        # total loss
+                        entropy_loss = dis_entropy.mean() + con_entropy.mean()
+                        loss = (
+                            dis_pg_loss * args.policy_coef
+                            + con_pg_loss * args.policy_coef
+                            - entropy_loss * args.ent_coef
+                            + v_loss * args.critic_coef
+                        )
+                        PPOoptimizer.zero_grad()
+                        loss.backward()
+                        # Clips gradient norm of an iterable of parameters.
+ nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm) + PPOoptimizer.step() + # record mean reward before clear history + targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy()) + meanRewardList.append(targetRewardMean) + targetName = Targets(thisT).name + + # clear this target trainning set buffer + obs[thisT] = torch.tensor([]).to(device) + actions[thisT] = torch.tensor([]).to(device) + dis_logprobs[thisT] = torch.tensor([]).to(device) + con_logprobs[thisT] = torch.tensor([]).to(device) + rewards[thisT] = torch.tensor([]).to(device) + values[thisT] = torch.tensor([]).to(device) + dones[thisT] = torch.tensor([]).to(device) + + # record rewards for plotting purposes + writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT]) + writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT]) + writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT]) + writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT]) + writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT]) + writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT]) + writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT]) + writer.add_scalar(f"Target{targetName}/Discriminator_EXP_ACC", thisDSCM_expt_acc,target_steps[thisT]) + writer.add_scalar(f"Target{targetName}/Discriminator_Agent_ACC", thisDSCM_agent_acc,target_steps[thisT]) + print(f"episode over Target{targetName} mean reward:", targetRewardMean) + TotalRewardMean = np.mean(meanRewardList) + writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps) + writer.add_scalar("GlobalCharts/learning_rate", PPOoptimizer.param_groups[0]["lr"], total_steps) + # New Record! 

+        if TotalRewardMean > bestReward:
+            bestReward = TotalRewardMean
+            saveDir = "../PPO-Model/Hybrid-MNN-500-300" + str(TotalRewardMean) + ".pt"
+            torch.save(agent, saveDir)
+
+        # train PPO
+        # record
diff --git a/Aimbot-PPO-Python/Pytorch/GAILMem.py b/Aimbot-PPO-Python/Pytorch/GAILMem.py
new file mode 100644
index 0000000..d2554de
--- /dev/null
+++ b/Aimbot-PPO-Python/Pytorch/GAILMem.py
@@ -0,0 +1,176 @@
+import os
+import random
+import numpy as np
+
+
+class GAILMem():
+    def __init__(self, targetNum):
+        self.targetNum = targetNum
+        self.states = [[] for i in range(self.targetNum)]
+        self.actorProbs = [[] for i in range(self.targetNum)]
+        self.actions = [[] for i in range(self.targetNum)]
+        self.rewards = [[] for i in range(self.targetNum)]
+        self.dones = [[] for i in range(self.targetNum)]
+        self.memNum = [0 for i in range(self.targetNum)]
+
+    def clearMem(self, targetType):
+        """clear memories of one target type"""
+        self.states[targetType] = []
+        self.actorProbs[targetType] = []
+        self.actions[targetType] = []
+        self.rewards[targetType] = []
+        self.dones[targetType] = []
+        self.memNum[targetType] = 0
+
+    def saveMemtoFile(self, dir: str):
+        """save memories as ndarrays to npz files, one file per target type
+
+        Args:
+            dir (str): save directory, like "GAIL-Expert-Data/", ends with "/"
+        """
+        for i in range(self.targetNum):
+            statesNP = np.asarray(self.states[i])
+            actorProbsNP = np.asarray(self.actorProbs[i])
+            actionsNP = np.asarray(self.actions[i])
+            rewardsNP = np.asarray(self.rewards[i])
+            donesNP = np.asarray(self.dones[i])
+            # file name only depends on the target index so loadMemFile can find it again
+            thisSaveDir = dir + "pack-" + str(i)
+            try:
+                np.savez(
+                    thisSaveDir,
+                    states=statesNP,
+                    actorProbs=actorProbsNP,
+                    actions=actionsNP,
+                    rewards=rewardsNP,
+                    dones=donesNP,
+                )
+            except FileNotFoundError:
+                os.mkdir(dir)
+                np.savez(
+                    thisSaveDir,
+                    states=statesNP,
+                    actorProbs=actorProbsNP,
+                    actions=actionsNP,
+                    rewards=rewardsNP,
+                    dones=donesNP,
+                )
+
+    def loadMemFile(self, dir: str):
+        """load memories from npz files
+
+        Args:
+            dir (str): load directory
+        """
+        for i in range(self.targetNum):
+            self.clearMem(i)
+            loadDir = dir + "pack-" + str(i) + ".npz"
+            memFile = np.load(loadDir, allow_pickle=True)
+            self.states[i] = memFile["states"].tolist()
+            self.actorProbs[i] = memFile["actorProbs"].tolist()
+            self.actions[i] = memFile["actions"].tolist()
+            self.rewards[i] = memFile["rewards"].tolist()
+            self.dones[i] = memFile["dones"].tolist()
+            self.memNum[i] = len(self.states[i])
+
+    def getRandomSample(self, sampleNum: int, targetType: int):
+        """get a random, unique sample set for one target type.
+
+        Args:
+            sampleNum (int): number of samples; 0 returns all samples.
+            targetType (int): target type index.
+
+        Returns:
+            tuple: (states, actorProbs, actions, rewards, dones)
+        """
+        if sampleNum == 0:
+            return (
+                self.getStates(targetType),
+                self.getActorProbs(targetType),
+                self.getActions(targetType),
+                self.getRewards(targetType),
+                self.getDones(targetType),
+            )
+        else:
+            randIndex = random.sample(range(0, self.memNum[targetType]), sampleNum)
+            return (
+                self.standDims(np.asarray(self.states[targetType])[randIndex]),
+                self.standDims(np.asarray(self.actorProbs[targetType])[randIndex]),
+                self.standDims(np.asarray(self.actions[targetType])[randIndex]),
+                self.standDims(np.asarray(self.rewards[targetType])[randIndex]),
+                self.standDims(np.asarray(self.dones[targetType])[randIndex]),
+            )
+
+    def getStates(self, targetType):
+        """get all States data as ndarray
+
+        Returns:
+            ndarray: ndarray type State data
+        """
+        return self.standDims(np.asarray(self.states[targetType]))
+
+    def getActorProbs(self, targetType):
+        """get all ActorProbs data as ndarray
+
+        Returns:
+            ndarray: ndarray type ActorProbs data
+        """
+        return self.standDims(np.asarray(self.actorProbs[targetType]))
+
+    def getActions(self, targetType):
+        """get all Actions data as ndarray
+
+        Returns:
+            ndarray: ndarray type Actions data
+        """
+        return self.standDims(np.asarray(self.actions[targetType]))
+
+    def getRewards(self, targetType):
+        """get all Rewards data as ndarray
+
+        Returns:
+            ndarray: ndarray type Rewards data
+        """
+        return self.standDims(np.asarray(self.rewards[targetType]))
+
+    def getDones(self, targetType):
+        """get all Dones data as ndarray
+
+        Returns:
+            ndarray: ndarray type Dones data
+        """
+        return self.standDims(np.asarray(self.dones[targetType]))
+
+    def standDims(self, data):
+        """standardize the data's dimensions
+
+        Args:
+            data (list): data list
+
+        Returns:
+            ndarray: ndarray type data
+        """
+        if np.ndim(data) > 2:
+            return np.squeeze(data, axis=1)
+        elif np.ndim(data) < 2:
+            return np.expand_dims(data, axis=1)
+        else:
+            return np.asarray(data)
+
+    def saveMems(self, state, actorProb, action, reward, done):
+        """save one step of memories
+
+        Args:
+            state (_type_): states
+            actorProb (_type_): actor predict result
+            action (_type_): action chosen by the actor
+            reward (_type_): reward
+            done (_type_): done flag
+        """
+        targetType = int(state[0, 0])
+        self.states[targetType].append(state)
+        self.actorProbs[targetType].append(actorProb)
+        self.actions[targetType].append(action)
+        self.rewards[targetType].append(reward)
+        self.dones[targetType].append(done)
+        self.memNum[targetType] += 1
diff --git a/Aimbot-PPO-Python/Pytorch/GAILRecorder.py b/Aimbot-PPO-Python/Pytorch/GAILRecorder.py
new file mode 100644
index 0000000..613743b
--- /dev/null
+++ b/Aimbot-PPO-Python/Pytorch/GAILRecorder.py
@@ -0,0 +1,97 @@
+import time
+import numpy as np
+from AimbotEnv import Aimbot
+from GAILMem import GAILMem
+import keyboard
+import mouse
+import math
+
+
+# Env
+ENV_PATH = "../Build/HUMAN-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy/Aimbot-ParallelEnv"
+WORKER_ID = 1
+BASE_PORT = 200
+
+# ENV Para
+MOUSEDISCOUNT = 20.0
+MAX_EP = 10000000
+STACKSTATESIZE = 3
+STACKINTERCE = 29
+
+
+class HumanActions:
+    def __init__(self, mouseDiscount: float = 10, screenW: int = 1920, screenH: int = 1080):
+        def multiPressed():
+            pass
+
+        keyboard.add_hotkey("w+a", multiPressed)
+        keyboard.add_hotkey("w+d", multiPressed)
+        keyboard.add_hotkey("s+a", multiPressed)
+        keyboard.add_hotkey("s+d", multiPressed)
+        self.screenW = screenW
+        self.screenH = screenH
+        self.MOUSEDISCOUNT = mouseDiscount
+        self.mouseSmooth = 5
+        self.mouseMax = 10
+
+    def getHumanActions(self):
+        x, _ = mouse.get_position()
+        xMovement = (x - self.screenW 
/ 2) / self.MOUSEDISCOUNT + xMovement = self.smoothMouseMovement(xMovement) + ws = 0 + ad = 0 + click = 0 + if keyboard.is_pressed("w"): + ws = 1 + elif keyboard.is_pressed("s"): + ws = 2 + if keyboard.is_pressed("d"): + ad = 1 + elif keyboard.is_pressed("a"): + ad = 2 + if keyboard.is_pressed("w+d"): + ws = 1 + ad = 1 + elif keyboard.is_pressed("w+a"): + ws = 1 + ad = 2 + elif keyboard.is_pressed("s+d"): + ws = 2 + ad = 1 + elif keyboard.is_pressed("s+a"): + ws = 2 + ad = 2 + if keyboard.is_pressed("0"): + click = 1 + + actions = np.asarray([[ws, ad, click, xMovement]]) + + mouse.move(self.screenW / 2, self.screenH / 2) + return actions + + def smoothMouseMovement(self, x: float): + out = (1 / (1 + math.exp(-x / self.mouseSmooth)) - 1 / 2) * self.mouseMax * 2 + return out + +if __name__ == "__main__": + env = Aimbot( + envPath=ENV_PATH, + workerID=WORKER_ID, + basePort=BASE_PORT, + side_channels=[], + ) + demoMem = GAILMem(4) + demoAct = HumanActions(mouseDiscount=MOUSEDISCOUNT) + + for ep in range(MAX_EP): + print("EP Start") + done = False + while not done: + actions = demoAct.getHumanActions() + nextState, r, done = env.step(actions=actions) + demoMem.saveMems(state=nextState, actorProb=None, action=actions, reward=None, done=None) + state = nextState + #nowMemNum = demoMem.memNum + saveSteps = 500 + lastMemCheckPoint = 0 + diff --git a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py index 32a6e5e..f42244c 100644 --- a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py +++ b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py @@ -646,7 +646,6 @@ if __name__ == "__main__": # record rewards for plotting purposes writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT]) - writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT]) writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT]) writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT]) writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT]) diff --git a/Aimbot-PPO-Python/Pytorch/testarea.ipynb b/Aimbot-PPO-Python/Pytorch/testarea.ipynb index 3efc30e..d255ad4 100644 --- a/Aimbot-PPO-Python/Pytorch/testarea.ipynb +++ b/Aimbot-PPO-Python/Pytorch/testarea.ipynb @@ -792,6 +792,29 @@ "source": [ "env.close()" ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "float" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch\n", + "\n", + "aa = torch.Tensor([[1,2,3],[2,2,3],[3,2,3],[4,2,3]]).to(\"cuda\")\n", + "type(torch.mean(aa).item())" + ] } ], "metadata": {
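
Side note (not part of the commit): a minimal sketch of the expert-data round trip that GAIL.py expects from GAILMem, assuming an illustrative "GAIL-Expert-Data/" directory, TARGETNUM = 4 and a sample size of 512; the directory name, sizes and the printed shapes are placeholders, not values taken from the patch.

# Hypothetical usage sketch: record expert steps, save them, then reload them for GAIL training.
from GAILMem import GAILMem

TARGETNUM = 4                     # assumed target count, matches TARGETNUM in GAIL.py
EXPERT_DIR = "GAIL-Expert-Data/"  # assumed save directory, must end with "/"

expertMem = GAILMem(TARGETNUM)
# during recording (see GAILRecorder.py), every step would be stored with something like:
# expertMem.saveMems(state=state, actorProb=None, action=actions, reward=None, done=done)
# where state is a (1, obs_size) observation whose first element encodes the target type.
expertMem.saveMemtoFile(EXPERT_DIR)

# later, GAIL.py points EXPERT_PATH at the same directory and samples expert trajectories:
loadedMem = GAILMem(TARGETNUM)
loadedMem.loadMemFile(EXPERT_DIR)
exp_obs, _, exp_actions, _, _ = loadedMem.getRandomSample(sampleNum=512, targetType=0)
print(exp_obs.shape, exp_actions.shape)  # expert (state, action) pairs fed to the discriminator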