diff --git a/.gitignore b/.gitignore index f29e4da..6edd870 100644 --- a/.gitignore +++ b/.gitignore @@ -76,6 +76,8 @@ crashlytics-build.properties /Aimbot-PPO-Python/.vscode/ /Aimbot-PPO-Python/.mypy_cache/ /Aimbot-PPO-Python/__pycache__/ +/Aimbot-PPO-Python/wandb/ +/Aimbot-PPO-Python/runs/ /Aimbot-PPO-Python/Tensorflow/__pycache__/ /Aimbot-PPO-Python/Pytorch/__pycache__/ /Aimbot-PPO-Python/Pytorch/runs/ diff --git a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py index 3339c24..67d0533 100644 --- a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py +++ b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py @@ -7,6 +7,7 @@ import uuid import torch import torch.nn as nn import torch.optim as optim +import atexit from AimbotEnv import Aimbot from tqdm import tqdm @@ -26,39 +27,44 @@ from typing import List bestReward = -1 DEFAULT_SEED = 9331 -ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv" +ENV_PATH = "../Build/3.0/Goto/Aimbot-ParallelEnv" SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e") WAND_ENTITY = "koha9" WORKER_ID = 1 BASE_PORT = 1000 +# tensorboard names +game_name = "Aimbot_Target_Hybrid_PMNN_V3" +game_type = "PList_Go_LeakyReLU" + # max round steps per agent is 2500/Decision_period, 25 seconds # !!!check every parameters before run!!! TOTAL_STEPS = 3150000 -BATCH_SIZE = 1024 +BATCH_SIZE = 512 MAX_TRAINNING_DATASETS = 6000 DECISION_PERIOD = 1 -LEARNING_RATE = 5e-4 +LEARNING_RATE = 1e-3 GAMMA = 0.99 GAE_LAMBDA = 0.95 EPOCHS = 3 CLIP_COEF = 0.11 LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence POLICY_COEF = [1.0, 1.0, 1.0, 1.0] -ENTROPY_COEF = [0.1, 0.1, 0.1, 0.1] +ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05] CRITIC_COEF = [0.5, 0.5, 0.5, 0.5] TARGET_LEARNING_RATE = 1e-6 FREEZE_VIEW_NETWORK = False +BROADCASTREWARD = False ANNEAL_LEARNING_RATE = True CLIP_VLOSS = True -NORM_ADV = True +NORM_ADV = False TRAIN = True - -WANDB_TACK = False +SAVE_MODEL = True +WANDB_TACK = True LOAD_DIR = None -# LOAD_DIR = "../PPO-Model/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670986948-freeonly-20/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670986948_0.7949778.pt" +#LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677696843_middle.pt" # public data class Targets(Enum): @@ -120,6 +126,8 @@ def parse_args(): help="Toggle learning rate annealing for policy and value networks") parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True, help="track on the wandb") + parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=SAVE_MODEL, nargs="?", const=True, + help="save model or not") parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY, help="the entity (team) of wandb's project") parser.add_argument("--load-dir", type=str, default=LOAD_DIR, @@ -128,7 +136,8 @@ def parse_args(): help="the number of steps to run in each environment per policy rollout") parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO, help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime") - + parser.add_argument("--broadCastEndReward", type=lambda x: bool(strtobool(x)), default=BROADCASTREWARD, nargs="?", const=True, + help="save model or not") # GAE loss parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, help="Use GAE for advantage computation") @@ -183,20 +192,21 @@ class PPOAgent(nn.Module): self.viewNetwork = nn.Sequential( layer_init(nn.Linear(self.raySize, 200)), - nn.Tanh() + nn.LeakyReLU() ) self.targetNetworks = nn.ModuleList([nn.Sequential( layer_init(nn.Linear(self.nonRaySize, 100)), - nn.Tanh() + nn.LeakyReLU() )for i in range(targetNum)]) self.middleNetworks = nn.ModuleList([nn.Sequential( layer_init(nn.Linear(300,200)), - nn.Tanh() + nn.LeakyReLU() )for i in range(targetNum)]) self.actor_dis = nn.ModuleList([layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(targetNum)]) self.actor_mean = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(targetNum)]) - # self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(256, self.continuous_size), std=1) for i in range(targetNum)]) - self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size)) + # self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)]) + # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size)) + self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1,self.continuous_size))for i in range(targetNum)]) # nn.Parameter(torch.zeros(1, self.continuous_size)) self.critic = nn.ModuleList([layer_init(nn.Linear(200, 1), std=1)for i in range(targetNum)]) def get_value(self, state: torch.Tensor): @@ -228,7 +238,9 @@ class PPOAgent(nn.Module): multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits] # continuous actions_mean = torch.stack([self.actor_mean[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_mean(hidden) - action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean) + # action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_logstd(hidden) + # action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean) + action_logstd = torch.stack([torch.squeeze(self.actor_logstd[target[i]],0) for i in range(thisStateNum)]) # print(action_logstd) action_std = torch.exp(action_logstd) # torch.exp(action_logstd) con_probs = Normal(actions_mean, action_std) @@ -243,8 +255,10 @@ class PPOAgent(nn.Module): actions = torch.cat([disAct.T, conAct], dim=1) else: # select actions base on best probability distribution - disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits]) + # disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits]) conAct = actions_mean + disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals]) + conAct = con_probs.sample() actions = torch.cat([disAct.T, conAct], dim=1) else: disAct = actions[:, 0 : env.unity_discrete_type].T @@ -347,9 +361,9 @@ def broadCastEndReward(rewardBF:list,remainTime:float): if (rewardBF[-1]<=-500): # print("Lose DO NOT BROAD CAST",rewardBF[-1]) thisRewardBF[-1] = rewardBF[-1]-BASE_LOSEREWARD - thisRewardBF = thisRewardBF elif (rewardBF[-1]>=500): # print("Win! Broadcast reward!",rewardBF[-1]) + print(sum(thisRewardBF)/len(thisRewardBF)) thisRewardBF[-1] = rewardBF[-1]-BASE_WINREWARD thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*args.result_broadcast_ratio)).tolist() else: @@ -384,9 +398,7 @@ if __name__ == "__main__": optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5) # Tensorboard and WandB Recorder - game_name = "Aimbot_Target_Hybrid_PMNN_V2" - game_type = "OffPolicy_EndBC" - run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}" + run_name = f"{game_type}_{args.seed}_{int(time.time())}" if args.wandb_track: wandb.init( project=game_name, @@ -405,6 +417,16 @@ if __name__ == "__main__": % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), ) + @atexit.register + def save_model(): + # close env + env.close() + if args.save_model: + # save model while exit + saveDir = "../PPO-Model/"+ run_name + "_last.pt" + torch.save(agent, saveDir) + print("save model to " + saveDir) + # Trajectory Buffer ob_bf = [[] for i in range(env.unity_agent_num)] act_bf = [[] for i in range(env.unity_agent_num)] @@ -589,6 +611,8 @@ if __name__ == "__main__": meanRewardList = [] # for WANDB # loop all tarining queue for thisT in trainQueue: + # sart time + startTime = time.time() target_steps[thisT]+=1 # flatten the batch b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape) @@ -610,6 +634,8 @@ if __name__ == "__main__": print(".",end="") end = start + args.minibatchSize mb_inds = b_inds[start:end] + if(np.size(mb_inds)<=1): + break mb_advantages = b_advantages[mb_inds] # normalize advantages @@ -730,11 +756,39 @@ if __name__ == "__main__": TotalRewardMean = np.mean(meanRewardList) writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps) writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps) + # print cost time as seconds + print("cost time:", time.time() - start_time) # New Record! - if TotalRewardMean > bestReward: + if TotalRewardMean > bestReward and args.save_model: bestReward = targetRewardMean saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) + ".pt" torch.save(agent, saveDir) + else: + meanRewardList = [] # for WANDB + # while not in training mode, clear the buffer + for thisT in trainQueue: + target_steps[thisT]+=1 + targetName = Targets(thisT).name + targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy()) + meanRewardList.append(targetRewardMean) + print(target_steps[thisT]) + + obs[thisT] = torch.tensor([]).to(device) + actions[thisT] = torch.tensor([]).to(device) + dis_logprobs[thisT] = torch.tensor([]).to(device) + con_logprobs[thisT] = torch.tensor([]).to(device) + rewards[thisT] = torch.tensor([]).to(device) + values[thisT] = torch.tensor([]).to(device) + advantages[thisT] = torch.tensor([]).to(device) + returns[thisT] = torch.tensor([]).to(device) + + # record rewards for plotting purposes + + writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT]) + writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT]) + print(f"episode over Target{targetName} mean reward:", targetRewardMean) + TotalRewardMean = np.mean(meanRewardList) + writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps) saveDir = "../PPO-Model/"+ run_name + "_last.pt" torch.save(agent, saveDir) diff --git a/Aimbot-PPO-Python/Pytorch/graph.py b/Aimbot-PPO-Python/Pytorch/graph.py new file mode 100644 index 0000000..2621cc0 --- /dev/null +++ b/Aimbot-PPO-Python/Pytorch/graph.py @@ -0,0 +1,769 @@ +import argparse +import wandb +import time +import numpy as np +import random +import uuid +import torch +import torch.nn as nn +import torch.optim as optim +import atexit + +from torchviz import make_dot, make_dot_from_trace +from AimbotEnv import Aimbot +from tqdm import tqdm +from enum import Enum +from torch.distributions.normal import Normal +from torch.distributions.categorical import Categorical +from distutils.util import strtobool +from torch.utils.tensorboard import SummaryWriter +from mlagents_envs.environment import UnityEnvironment +from mlagents_envs.side_channel.side_channel import ( + SideChannel, + IncomingMessage, + OutgoingMessage, +) +from typing import List + +bestReward = -1 + +DEFAULT_SEED = 9331 +ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv" +SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e") +WAND_ENTITY = "koha9" +WORKER_ID = 2 +BASE_PORT = 1111 + +# max round steps per agent is 2500/Decision_period, 25 seconds +# !!!check every parameters before run!!! + +TOTAL_STEPS = 3150000 +BATCH_SIZE = 1024 +MAX_TRAINNING_DATASETS = 6000 +DECISION_PERIOD = 1 +LEARNING_RATE = 5e-4 +GAMMA = 0.99 +GAE_LAMBDA = 0.95 +EPOCHS = 3 +CLIP_COEF = 0.11 +LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence +POLICY_COEF = [1.0, 1.0, 1.0, 1.0] +ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05] +CRITIC_COEF = [0.5, 0.5, 0.5, 0.5] +TARGET_LEARNING_RATE = 1e-6 +FREEZE_VIEW_NETWORK = False + +ANNEAL_LEARNING_RATE = True +CLIP_VLOSS = True +NORM_ADV = True +TRAIN = True + +SAVE_MODEL = False +WANDB_TACK = False +LOAD_DIR = None +#LOAD_DIR = "../PPO-Model/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670986948-freeonly-20/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670986948_0.7949778.pt" + +# public data +class Targets(Enum): + Free = 0 + Go = 1 + Attack = 2 + Defence = 3 + Num = 4 +TARGET_STATE_SIZE = 6 +INAREA_STATE_SIZE = 1 +TIME_STATE_SIZE = 1 +GUN_STATE_SIZE = 1 +MY_STATE_SIZE = 4 +TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE +BASE_WINREWARD = 999 +BASE_LOSEREWARD = -999 +TARGETNUM= 4 +ENV_TIMELIMIT = 30 +RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT +TotalRounds = {"Free":0,"Go":0,"Attack":0} +WinRounds = {"Free":0,"Go":0,"Attack":0} + +# !!!SPECIAL PARAMETERS!!! +# change it while program is finished +using_targets_num = 3 + + +def parse_args(): + # fmt: off + # pytorch and environment parameters + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=DEFAULT_SEED, + help="seed of the experiment") + parser.add_argument("--path", type=str, default=ENV_PATH, + help="enviroment path") + parser.add_argument("--workerID", type=int, default=WORKER_ID, + help="unity worker ID") + parser.add_argument("--baseport", type=int, default=BASE_PORT, + help="port to connect to Unity environment") + parser.add_argument("--lr", type=float, default=LEARNING_RATE, + help="the learning rate of optimizer") + parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, + help="if toggled, cuda will be enabled by default") + parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS, + help="total timesteps of the experiments") + + # model parameters + parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True, + help="Train Model or not") + parser.add_argument("--freeze-viewnet", type=lambda x: bool(strtobool(x)), default=FREEZE_VIEW_NETWORK, nargs="?", const=True, + help="freeze view network or not") + parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS, + help="training dataset size,start training while dataset collect enough data") + parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE, + help="nimi batch size") + parser.add_argument("--epochs", type=int, default=EPOCHS, + help="the K epochs to update the policy") + parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True, + help="Toggle learning rate annealing for policy and value networks") + parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True, + help="track on the wandb") + parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=SAVE_MODEL, nargs="?", const=True, + help="save model or not") + parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY, + help="the entity (team) of wandb's project") + parser.add_argument("--load-dir", type=str, default=LOAD_DIR, + help="load model directory") + parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD, + help="the number of steps to run in each environment per policy rollout") + parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO, + help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime") + + # GAE loss + parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, + help="Use GAE for advantage computation") + parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True, + help="Toggles advantages normalization") + parser.add_argument("--gamma", type=float, default=GAMMA, + help="the discount factor gamma") + parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA, + help="the lambda for the general advantage estimation") + parser.add_argument("--clip-coef", type=float, default=CLIP_COEF, + help="the surrogate clipping coefficient") + parser.add_argument("--policy-coef", type=float, default=POLICY_COEF, + help="coefficient of the policy") + parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF, + help="coefficient of the entropy") + parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF, + help="coefficient of the value function") + parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True, + help="Toggles whether or not to use a clipped loss for the value function, as per the paper.") + parser.add_argument("--max-grad-norm", type=float, default=0.5, + help="the maximum norm for the gradient clipping") + parser.add_argument("--target-kl", type=float, default=None, + help="the target KL divergence threshold") + # fmt: on + args = parser.parse_args() + return args + + +def layer_init(layer, std=np.sqrt(2), bias_const=0.0): + torch.nn.init.orthogonal_(layer.weight, std) + torch.nn.init.constant_(layer.bias, bias_const) + return layer + + +class PPOAgent(nn.Module): + def __init__(self, env: Aimbot,targetNum:int): + super(PPOAgent, self).__init__() + self.targetNum = targetNum + self.stateSize = env.unity_observation_shape[0] + self.agentNum = env.unity_agent_num + self.targetSize = TARGET_STATE_SIZE + self.timeSize = TIME_STATE_SIZE + self.gunSize = GUN_STATE_SIZE + self.myStateSize = MY_STATE_SIZE + self.raySize = env.unity_observation_shape[0] - TOTAL_T_SIZE + self.nonRaySize = TOTAL_T_SIZE + self.head_input_size = env.unity_observation_shape[0] - self.targetSize-self.timeSize-self.gunSize# except target state input + + self.discrete_size = env.unity_discrete_size + self.discrete_shape = list(env.unity_discrete_branches) + self.continuous_size = env.unity_continuous_size + + self.viewNetwork = nn.Sequential( + layer_init(nn.Linear(self.raySize, 200)), + nn.Tanh() + ) + self.targetNetworks = nn.ModuleList([nn.Sequential( + layer_init(nn.Linear(self.nonRaySize, 100)), + nn.Tanh() + )for i in range(targetNum)]) + self.middleNetworks = nn.ModuleList([nn.Sequential( + layer_init(nn.Linear(300,200)), + nn.Tanh() + )for i in range(targetNum)]) + self.actor_dis = nn.ModuleList([layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(targetNum)]) + self.actor_mean = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(targetNum)]) + # self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)]) + # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size)) + self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1,self.continuous_size))for i in range(targetNum)]) # nn.Parameter(torch.zeros(1, self.continuous_size)) + self.critic = nn.ModuleList([layer_init(nn.Linear(200, 1), std=1)for i in range(targetNum)]) + + def get_value(self, state: torch.Tensor): + target = state[:,0].to(torch.int32) # int + thisStateNum = target.size()[0] + viewInput = state[:,-self.raySize:] # all ray input + targetInput = state[:,:self.nonRaySize] + viewLayer = self.viewNetwork(viewInput) + targetLayer = torch.stack([self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)]) + middleInput = torch.cat([viewLayer,targetLayer],dim = 1) + middleLayer = torch.stack([self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)]) + criticV = torch.stack([self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.critic + return criticV + + def get_actions_value(self, state: torch.Tensor, actions=None): + target = state[:,0].to(torch.int32) # int + thisStateNum = target.size()[0] + viewInput = state[:,-self.raySize:] # all ray input + targetInput = state[:,:self.nonRaySize] + viewLayer = self.viewNetwork(viewInput) + targetLayer = torch.stack([self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)]) + middleInput = torch.cat([viewLayer,targetLayer],dim = 1) + middleLayer = torch.stack([self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)]) + + # discrete + # 递归targets的数量,既agent数来实现根据target不同来选用对应的输出网络计算输出 + dis_logits = torch.stack([self.actor_dis[target[i]](middleLayer[i]) for i in range(thisStateNum)]) + split_logits = torch.split(dis_logits, self.discrete_shape, dim=1) + multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits] + # continuous + actions_mean = torch.stack([self.actor_mean[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_mean(hidden) + # action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_logstd(hidden) + # action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean) + action_logstd = torch.stack([torch.squeeze(self.actor_logstd[target[i]],0) for i in range(thisStateNum)]) + # print(action_logstd) + action_std = torch.exp(action_logstd) # torch.exp(action_logstd) + con_probs = Normal(actions_mean, action_std) + # critic + criticV = torch.stack([self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.critic + + if actions is None: + if args.train: + # select actions base on probability distribution model + disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals]) + conAct = con_probs.sample() + actions = torch.cat([disAct.T, conAct], dim=1) + else: + # select actions base on best probability distribution + disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits]) + conAct = actions_mean + actions = torch.cat([disAct.T, conAct], dim=1) + else: + disAct = actions[:, 0 : env.unity_discrete_type].T + conAct = actions[:, env.unity_discrete_type :] + dis_log_prob = torch.stack( + [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)] + ) + dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals]) + return ( + actions, + dis_log_prob.sum(0), + dis_entropy.sum(0), + con_probs.log_prob(conAct).sum(1), + con_probs.entropy().sum(1), + criticV, + ) + + +def GAE(agent, args, rewards, dones, values, next_obs, next_done): + # GAE + with torch.no_grad(): + next_value = agent.get_value(next_obs).reshape(1, -1) + data_size = rewards.size()[0] + if args.gae: + advantages = torch.zeros_like(rewards).to(device) + lastgaelam = 0 + for t in reversed(range(data_size)): + if t == data_size - 1: + nextnonterminal = 1.0 - next_done + nextvalues = next_value + else: + nextnonterminal = 1.0 - dones[t + 1] + nextvalues = values[t + 1] + delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t] + advantages[t] = lastgaelam = ( + delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam + ) + returns = advantages + values + else: + returns = torch.zeros_like(rewards).to(device) + for t in reversed(range(data_size)): + if t == data_size - 1: + nextnonterminal = 1.0 - next_done + next_return = next_value + else: + nextnonterminal = 1.0 - dones[t + 1] + next_return = returns[t + 1] + returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return + advantages = returns - values + return advantages, returns + +class AimbotSideChannel(SideChannel): + def __init__(self, channel_id: uuid.UUID) -> None: + super().__init__(channel_id) + def on_message_received(self, msg: IncomingMessage) -> None: + """ + Note: We must implement this method of the SideChannel interface to + receive messages from Unity + """ + thisMessage = msg.read_string() + # print(thisMessage) + thisResult = thisMessage.split("|") + if(thisResult[0] == "result"): + TotalRounds[thisResult[1]]+=1 + if(thisResult[2] == "Win"): + WinRounds[thisResult[1]]+=1 + #print(TotalRounds) + #print(WinRounds) + elif(thisResult[0] == "Error"): + print(thisMessage) + # 发送函数 + def send_string(self, data: str) -> None: + # send a string toC# + msg = OutgoingMessage() + msg.write_string(data) + super().queue_message_to_send(msg) + + def send_bool(self, data: bool) -> None: + msg = OutgoingMessage() + msg.write_bool(data) + super().queue_message_to_send(msg) + + def send_int(self, data: int) -> None: + msg = OutgoingMessage() + msg.write_int32(data) + super().queue_message_to_send(msg) + + def send_float(self, data: float) -> None: + msg = OutgoingMessage() + msg.write_float32(data) + super().queue_message_to_send(msg) + + def send_float_list(self, data: List[float]) -> None: + msg = OutgoingMessage() + msg.write_float32_list(data) + super().queue_message_to_send(msg) + +def broadCastEndReward(rewardBF:list,remainTime:float): + thisRewardBF = rewardBF + if (rewardBF[-1]<=-500): + # print("Lose DO NOT BROAD CAST",rewardBF[-1]) + thisRewardBF[-1] = rewardBF[-1]-BASE_LOSEREWARD + thisRewardBF = thisRewardBF + elif (rewardBF[-1]>=500): + # print("Win! Broadcast reward!",rewardBF[-1]) + thisRewardBF[-1] = rewardBF[-1]-BASE_WINREWARD + thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*args.result_broadcast_ratio)).tolist() + else: + print("!!!!!DIDNT GET RESULT REWARD!!!!!!",rewardBF[-1]) + return torch.Tensor(thisRewardBF).to(device) + + +if __name__ == "__main__": + args = parse_args() + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + + device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + + # Initialize environment anget optimizer + aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID); + env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimBotsideChannel]) + if args.load_dir is None: + agent = PPOAgent(env,TARGETNUM).to(device) + else: + agent = torch.load(args.load_dir) + # freeze + if args.freeze_viewnet: + # freeze the view network + for p in agent.viewNetwork.parameters(): + p.requires_grad = False + print("VIEW NETWORK FREEZED") + print("Load Agent", args.load_dir) + print(agent.eval()) + + optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5) + + # Tensorboard and WandB Recorder + game_name = "Aimbot_Target_Hybrid_PMNN_V2" + game_type = "OffPolicy_EndBC" + run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}" + if args.wandb_track: + wandb.init( + project=game_name, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args), + name=run_name, + monitor_gym=True, + save_code=True, + ) + + writer = SummaryWriter(f"runs/{run_name}") + writer.add_text( + "hyperparameters", + "|param|value|\n|-|-|\n%s" + % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), + ) + + @atexit.register + def save_model(): + # save model while exit + saveDir = "../PPO-Model/"+ run_name + "_last.pt" + torch.save(agent, saveDir) + print("save model to " + saveDir) + + # Trajectory Buffer + ob_bf = [[] for i in range(env.unity_agent_num)] + act_bf = [[] for i in range(env.unity_agent_num)] + dis_logprobs_bf = [[] for i in range(env.unity_agent_num)] + con_logprobs_bf = [[] for i in range(env.unity_agent_num)] + rewards_bf = [[] for i in range(env.unity_agent_num)] + dones_bf = [[] for i in range(env.unity_agent_num)] + values_bf = [[] for i in range(env.unity_agent_num)] + + # start the game + total_update_step = using_targets_num * args.total_timesteps // args.datasetSize + target_steps = [0 for i in range(TARGETNUM)] + start_time = time.time() + state, _, done = env.reset() + # state = torch.Tensor(next_obs).to(device) + # next_done = torch.zeros(env.unity_agent_num).to(device) + + # initialize empty training datasets + obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_observation_size) + actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_action_size) + dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) + con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) + rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) + values = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) + advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) + returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) + + vis_graph = make_dot(agent.get_actions_value( + torch.Tensor(state).to(device) + ), params=dict(agent.named_parameters())) + vis_graph.view() # 会在当前目录下保存一个“Digraph.gv.pdf”文件,并在默认浏览器中打开 + + with torch.onnx.set_training(agent, False): + trace, _ = torch.jit.get_trace_graph(agent, args=(torch.Tensor(state).to(device),)) + make_dot_from_trace(trace) + raise + + for total_steps in range(total_update_step): + # discunt learning rate, while step == total_update_step lr will be 0 + + if args.annealLR: + finalRatio = TARGET_LEARNING_RATE/args.lr + frac = 1.0 - ((total_steps + 1.0) / total_update_step) + lrnow = frac * args.lr + optimizer.param_groups[0]["lr"] = lrnow + else: + lrnow = args.lr + print("new episode",total_steps,"learning rate = ",lrnow) + + + # MAIN LOOP: run agent in environment + step = 0 + training = False + trainQueue = [] + last_reward = [0.for i in range(env.unity_agent_num)] + while True: + if step % args.decision_period == 0: + step += 1 + # Choose action by agent + + with torch.no_grad(): + # predict actions + action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value( + torch.Tensor(state).to(device) + ) + value = value.flatten() + + # variable from GPU to CPU + action_cpu = action.cpu().numpy() + dis_logprob_cpu = dis_logprob.cpu().numpy() + con_logprob_cpu = con_logprob.cpu().numpy() + value_cpu = value.cpu().numpy() + # Environment step + next_state, reward, next_done = env.step(action_cpu) + + # save memories + for i in range(env.unity_agent_num): + # save memories to buffers + ob_bf[i].append(state[i]) + act_bf[i].append(action_cpu[i]) + dis_logprobs_bf[i].append(dis_logprob_cpu[i]) + con_logprobs_bf[i].append(con_logprob_cpu[i]) + rewards_bf[i].append(reward[i]+last_reward[i]) + dones_bf[i].append(done[i]) + values_bf[i].append(value_cpu[i]) + remainTime = state[i,TARGET_STATE_SIZE] + if next_done[i] == True: + # finished a round, send finished memories to training datasets + # compute advantage and discounted reward + #print(i,"over") + roundTargetType = int(state[i,0]) + thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime) + adv, rt = GAE( + agent, + args, + thisRewardsTensor, + torch.Tensor(dones_bf[i]).to(device), + torch.tensor(values_bf[i]).to(device), + torch.tensor(next_state[i]).to(device).unsqueeze(0), + torch.Tensor([next_done[i]]).to(device), + ) + # send memories to training datasets + obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0) + actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0) + dis_logprobs[roundTargetType] = torch.cat( + (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0 + ) + con_logprobs[roundTargetType] = torch.cat( + (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0 + ) + rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0) + values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0) + advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0) + returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0) + + # clear buffers + ob_bf[i] = [] + act_bf[i] = [] + dis_logprobs_bf[i] = [] + con_logprobs_bf[i] = [] + rewards_bf[i] = [] + dones_bf[i] = [] + values_bf[i] = [] + print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}") + + for i in range(TARGETNUM): + if obs[i].size()[0] >= args.datasetSize: + # start train NN + trainQueue.append(i) + if(len(trainQueue)>0): + break + state, done = next_state, next_done + else: + step += 1 + # skip this step use last predict action + next_state, reward, next_done = env.step(action_cpu) + # save memories + for i in range(env.unity_agent_num): + if next_done[i] == True: + #print(i,"over???") + # save memories to buffers + ob_bf[i].append(state[i]) + act_bf[i].append(action_cpu[i]) + dis_logprobs_bf[i].append(dis_logprob_cpu[i]) + con_logprobs_bf[i].append(con_logprob_cpu[i]) + rewards_bf[i].append(reward[i]) + dones_bf[i].append(done[i]) + values_bf[i].append(value_cpu[i]) + remainTime = state[i,TARGET_STATE_SIZE] + # finished a round, send finished memories to training datasets + # compute advantage and discounted reward + roundTargetType = int(state[i,0]) + thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime) + adv, rt = GAE( + agent, + args, + thisRewardsTensor, + torch.Tensor(dones_bf[i]).to(device), + torch.tensor(values_bf[i]).to(device), + torch.Tensor(next_state[i]).to(device).unsqueeze(dim = 0), + torch.Tensor([next_done[i]]).to(device), + ) + # send memories to training datasets + obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0) + actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0) + dis_logprobs[roundTargetType] = torch.cat( + (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0 + ) + con_logprobs[roundTargetType] = torch.cat( + (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0 + ) + rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0) + values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0) + advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0) + returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0) + + # clear buffers + ob_bf[i] = [] + act_bf[i] = [] + dis_logprobs_bf[i] = [] + con_logprobs_bf[i] = [] + rewards_bf[i] = [] + dones_bf[i] = [] + values_bf[i] = [] + print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}") + + state = next_state + last_reward = reward + i += 1 + + if args.train: + meanRewardList = [] # for WANDB + # loop all tarining queue + for thisT in trainQueue: + target_steps[thisT]+=1 + # flatten the batch + b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape) + b_dis_logprobs = dis_logprobs[thisT].reshape(-1) + b_con_logprobs = con_logprobs[thisT].reshape(-1) + b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,)) + b_advantages = advantages[thisT].reshape(-1) + b_returns = returns[thisT].reshape(-1) + b_values = values[thisT].reshape(-1) + b_size = b_obs.size()[0] + # Optimizing the policy and value network + b_inds = np.arange(b_size) + # clipfracs = [] + for epoch in range(args.epochs): + print(epoch,end="") + # shuffle all datasets + np.random.shuffle(b_inds) + for start in range(0, b_size, args.minibatchSize): + print(".",end="") + end = start + args.minibatchSize + mb_inds = b_inds[start:end] + if(np.size(mb_inds)<=1): + break + mb_advantages = b_advantages[mb_inds] + + # normalize advantages + if args.norm_adv: + mb_advantages = (mb_advantages - mb_advantages.mean()) / ( + mb_advantages.std() + 1e-8 + ) + + ( + _, + new_dis_logprob, + dis_entropy, + new_con_logprob, + con_entropy, + newvalue, + ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds]) + # discrete ratio + dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds] + dis_ratio = dis_logratio.exp() + # continuous ratio + con_logratio = new_con_logprob - b_con_logprobs[mb_inds] + con_ratio = con_logratio.exp() + + """ + # early stop + with torch.no_grad(): + # calculate approx_kl http://joschu.net/blog/kl-approx.html + old_approx_kl = (-logratio).mean() + approx_kl = ((ratio - 1) - logratio).mean() + clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()] + """ + + # discrete Policy loss + dis_pg_loss_orig = -mb_advantages * dis_ratio + dis_pg_loss_clip = -mb_advantages * torch.clamp( + dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef + ) + dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean() + # continuous Policy loss + con_pg_loss_orig = -mb_advantages * con_ratio + con_pg_loss_clip = -mb_advantages * torch.clamp( + con_ratio, 1 - args.clip_coef, 1 + args.clip_coef + ) + con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean() + + # Value loss + newvalue = newvalue.view(-1) + if args.clip_vloss: + v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2 + v_clipped = b_values[mb_inds] + torch.clamp( + newvalue - b_values[mb_inds], + -args.clip_coef, + args.clip_coef, + ) + v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2 + v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) + v_loss = 0.5 * v_loss_max.mean() + else: + v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean() + + # total loss + entropy_loss = dis_entropy.mean() + con_entropy.mean() + loss = ( + dis_pg_loss * POLICY_COEF[thisT] + + con_pg_loss * POLICY_COEF[thisT] + + entropy_loss * ENTROPY_COEF[thisT] + + v_loss * CRITIC_COEF[thisT] + )*LOSS_COEF[thisT] + + if(torch.isnan(loss).any()): + print("LOSS Include NAN!!!") + if(torch.isnan(dis_pg_loss.any())): + print("dis_pg_loss include nan") + if(torch.isnan(con_pg_loss.any())): + print("con_pg_loss include nan") + if(torch.isnan(entropy_loss.any())): + print("entropy_loss include nan") + if(torch.isnan(v_loss.any())): + print("v_loss include nan") + raise + + optimizer.zero_grad() + loss.backward() + # Clips gradient norm of an iterable of parameters. + nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm) + optimizer.step() + + """ + if args.target_kl is not None: + if approx_kl > args.target_kl: + break + """ + # record mean reward before clear history + print("done") + targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy()) + meanRewardList.append(targetRewardMean) + targetName = Targets(thisT).name + + # clear this target trainning set buffer + obs[thisT] = torch.tensor([]).to(device) + actions[thisT] = torch.tensor([]).to(device) + dis_logprobs[thisT] = torch.tensor([]).to(device) + con_logprobs[thisT] = torch.tensor([]).to(device) + rewards[thisT] = torch.tensor([]).to(device) + values[thisT] = torch.tensor([]).to(device) + advantages[thisT] = torch.tensor([]).to(device) + returns[thisT] = torch.tensor([]).to(device) + + # record rewards for plotting purposes + writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT]) + writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT]) + writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT]) + writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT]) + writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT]) + writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT]) + writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT]) + print(f"episode over Target{targetName} mean reward:", targetRewardMean) + TotalRewardMean = np.mean(meanRewardList) + writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps) + writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps) + # New Record! + if TotalRewardMean > bestReward and args.save_model: + bestReward = targetRewardMean + saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) + ".pt" + torch.save(agent, saveDir) + + saveDir = "../PPO-Model/"+ run_name + "_last.pt" + torch.save(agent, saveDir) + env.close() + writer.close() diff --git a/Aimbot-PPO-Python/Pytorch/testarea.ipynb b/Aimbot-PPO-Python/Pytorch/testarea.ipynb index e362898..33ce0c5 100644 --- a/Aimbot-PPO-Python/Pytorch/testarea.ipynb +++ b/Aimbot-PPO-Python/Pytorch/testarea.ipynb @@ -831,6 +831,152 @@ "torch.cat([aaa[:,1:3],aaa[:,4:]],dim=1)" ] }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torch.distributions.categorical import Categorical\n", + "\n", + "logits = torch.Tensor([[0.5,0.25]])\n", + "lgst = Categorical(logits=logits)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[ 0.0000, -0.2500]])\n", + "tensor([[1.0000, 0.7788]])\n", + "tensor([[1.7788]])\n", + "tensor([[0.5622, 0.4378]])\n" + ] + }, + { + "data": { + "text/plain": [ + "tensor([[0.6854]])" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# calculate entropy of log probability\n", + "def entropy(logits):\n", + " a0 = logits - logits.max(1, keepdim=True)[0]\n", + " print(a0)\n", + " ea0 = torch.exp(a0)\n", + " print(ea0)\n", + " z0 = ea0.sum(1, keepdim=True)\n", + " print(z0)\n", + " p0 = ea0 / z0\n", + " print(p0)\n", + " return (p0 * (torch.log(z0) - a0)).sum(1, keepdim=True)\n", + "entropy(logits)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[0.5000, 0.2500, 0.2500]])\n", + "tensor([[1.0397]])\n" + ] + }, + { + "data": { + "text/plain": [ + "tensor([1.0397])" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "probs = torch.Tensor([[0.5,0.25,0.25]])\n", + "print(probs)\n", + "# calculate entropy of probability\n", + "def entropy2(probs):\n", + " return -(probs * torch.log(probs)).sum(1, keepdim=True)\n", + "print(entropy2(probs))\n", + "lgst2 = Categorical(probs=probs)\n", + "lgst2.entropy()" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[2.1121]])\n" + ] + }, + { + "data": { + "text/plain": [ + "tensor([[2.1121]])" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from torch.distributions.normal import Normal\n", + "mu = torch.Tensor([[1]])\n", + "sigma = torch.Tensor([[2]])\n", + "# calculate entropy of Normal distribution\n", + "def entropy3(mu,sigma):\n", + " return 0.5 * (1 + torch.log(2 * sigma * sigma * 3.1415926))\n", + "\n", + "print(entropy3(mu,sigma))\n", + "nm = Normal(mu,sigma)\n", + "nm.entropy()" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([-1.0139])\n" + ] + } + ], + "source": [ + "logits = logits - logits.logsumexp(dim=-1, keepdim=True)\n", + "min_real = torch.finfo(logits.dtype).min\n", + "logits = torch.clamp(logits, min=min_real)\n", + "p_log_p = logits*logits\n", + "print(-p_log_p.sum(-1))" + ] + }, { "cell_type": "code", "execution_count": 2, @@ -840,18 +986,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "tensor([False, True, False], device='cuda:0')\n", - "tensor(True, device='cuda:0')\n" + "37\n" ] } ], "source": [ - "import torch\n", - "import numpy as np\n", - "from torch.distributions.normal import Normal\n", + "a = 13\n", + "b = 24\n", "\n", - "print(torch.isnan(torch.tensor([1,float('nan'),2]).to(\"cuda\")))\n", - "print(torch.isnan(torch.tensor([1,float('nan'),2]).to(\"cuda\")).any())" + "c = a + b\n", + "print(c)" ] } ], diff --git a/Aimbot-PPO-Python/testdebug.py b/Aimbot-PPO-Python/testdebug.py new file mode 100644 index 0000000..a4aaffb --- /dev/null +++ b/Aimbot-PPO-Python/testdebug.py @@ -0,0 +1,5 @@ +import numpy as np + +aa = np.array([1,2,3,4,5,6,7,8,9,10]) + +print(aa) \ No newline at end of file