From 9d9524429c54b43f462c39020d47d55f45eff81e Mon Sep 17 00:00:00 2001
From: Koha9
Date: Wed, 24 Jan 2024 17:07:45 +0900
Subject: [PATCH] Clean up unused variables and adapt to environment 3.6
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py |  2 +-
 Aimbot-PPO-Python/Pytorch/aimemory.py    |  3 ++-
 Aimbot-PPO-Python/Pytorch/arguments.py   | 12 ++++++------
 Aimbot-PPO-Python/Pytorch/ppoagent.py    | 21 +++++++--------------
 4 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
index e28eecf..ad965a3 100644
--- a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
+++ b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
@@ -20,7 +20,7 @@ import torch.optim as optim
 SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
 # tensorboard names
 GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel"
-GAME_TYPE = "GotoOnly-Level0123-newModel"
+GAME_TYPE = "GotoOnly-3.6-Level0123-newModel-Onehot"
 
 if __name__ == "__main__":
     args = parse_args()
diff --git a/Aimbot-PPO-Python/Pytorch/aimemory.py b/Aimbot-PPO-Python/Pytorch/aimemory.py
index 4ef2dbb..4041a14 100644
--- a/Aimbot-PPO-Python/Pytorch/aimemory.py
+++ b/Aimbot-PPO-Python/Pytorch/aimemory.py
@@ -58,6 +58,7 @@ class PPOMem:
                 # print("Win! Broadcast reward!",rewardBF[-1])
                 print(sum(thisRewardBF) / len(thisRewardBF))
                 thisRewardBF[-1] = rewardBF[-1] - self.base_win_reward
+                # broadcast the result reward: raise every reward in this round by remainTime * self.result_broadcast_ratio
                 thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * self.result_broadcast_ratio)).tolist()
             else:
                 print("!!!!!DIDNT GET RESULT REWARD!!!!!!", rewardBF[-1])
@@ -88,7 +89,7 @@ class PPOMem:
             self.dones_bf[i].append(done[i])
             self.values_bf[i].append(value_cpu[i])
             if now_step % self.decision_period == 0:
-                # on decision period, add last skiped round's reward
+                # on a decision period, add the last skipped round's reward; only takes effect when decision_period != 1
                 self.rewards_bf[i].append(reward[i] + last_reward[i])
             else:
                 # not on decision period, only add this round's reward
diff --git a/Aimbot-PPO-Python/Pytorch/arguments.py b/Aimbot-PPO-Python/Pytorch/arguments.py
index e729167..0f36890 100644
--- a/Aimbot-PPO-Python/Pytorch/arguments.py
+++ b/Aimbot-PPO-Python/Pytorch/arguments.py
@@ -4,7 +4,7 @@ import uuid
 from distutils.util import strtobool
 
 DEFAULT_SEED = 9331
-ENV_PATH = "../Build/3.4/Aimbot-ParallelEnv"
+ENV_PATH = "../Build/3.6/Aimbot-ParallelEnv"
 WAND_ENTITY = "koha9"
 WORKER_ID = 1
 BASE_PORT = 1000
@@ -16,19 +16,19 @@
 TOTAL_STEPS = 3150000
 BATCH_SIZE = 512
 MAX_TRAINNING_DATASETS = 6000
 DECISION_PERIOD = 1
-LEARNING_RATE = 1.5e-4
-GAMMA = 0.99
+LEARNING_RATE = 5e-5
+GAMMA = 0.999
 GAE_LAMBDA = 0.95
 EPOCHS = 3
 CLIP_COEF = 0.11
 LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
-POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
+POLICY_COEF = [0.8, 0.8, 0.8, 0.8]
 ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
-CRITIC_COEF = [0.8, 0.8, 0.8, 0.8]
+CRITIC_COEF = [1.0, 1.0, 1.0, 1.0]
 TARGET_LEARNING_RATE = 1e-6
 FREEZE_VIEW_NETWORK = False
-ANNEAL_LEARNING_RATE = True
+ANNEAL_LEARNING_RATE = False
 CLIP_VLOSS = True
 NORM_ADV = False
 TRAIN = True
diff --git a/Aimbot-PPO-Python/Pytorch/ppoagent.py b/Aimbot-PPO-Python/Pytorch/ppoagent.py
index 74232a7..6e6899b 100644
--- a/Aimbot-PPO-Python/Pytorch/ppoagent.py
+++ b/Aimbot-PPO-Python/Pytorch/ppoagent.py
@@ -14,6 +14,8 @@ def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
     nn.init.constant_(layer.bias, bias_const)
     return layer
 
+neural_size_1 = 400
+neural_size_2 = 300
 
 class PPOAgent(nn.Module):
     def __init__(
@@ -31,15 +33,6 @@ class PPOAgent(nn.Module):
         self.unity_action_size = env.unity_action_size
         self.state_size = self.unity_observation_shape[0]
         self.agent_num = env.unity_agent_num
-        self.target_size = self.args.target_state_size
-        self.time_state_size = self.args.time_state_size
-        self.gun_state_size = self.args.gun_state_size
-        self.my_state_size = self.args.my_state_size
-        self.ray_state_size = env.unity_observation_shape[0] - self.args.total_target_size
-        self.state_size_without_ray = self.args.total_target_size
-        self.head_input_size = (
-            env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size
-        ) # except target state input
         self.unity_discrete_type = env.unity_discrete_type
         self.discrete_size = env.unity_discrete_size
 
@@ -49,9 +42,9 @@ class PPOAgent(nn.Module):
         self.hidden_networks = nn.ModuleList(
             [
                 nn.Sequential(
-                    layer_init(nn.Linear(self.state_size, 256)),
+                    layer_init(nn.Linear(self.state_size, neural_size_1)),
                     nn.LeakyReLU(),
-                    layer_init(nn.Linear(256, 128)),
+                    layer_init(nn.Linear(neural_size_1, neural_size_2)),
                     nn.LeakyReLU(),
                 )
                 for i in range(self.target_num)
@@ -59,16 +52,16 @@ class PPOAgent(nn.Module):
         )
 
         self.actor_dis = nn.ModuleList(
-            [layer_init(nn.Linear(128, self.discrete_size), std=0.5) for i in range(self.target_num)]
+            [layer_init(nn.Linear(neural_size_2, self.discrete_size), std=0.5) for i in range(self.target_num)]
         )
         self.actor_mean = nn.ModuleList(
-            [layer_init(nn.Linear(128, self.continuous_size), std=0) for i in range(self.target_num)]
+            [layer_init(nn.Linear(neural_size_2, self.continuous_size), std=0) for i in range(self.target_num)]
         )
         self.actor_logstd = nn.ParameterList(
             [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
         )
         self.critic = nn.ModuleList(
-            [layer_init(nn.Linear(128, 1), std=0) for i in range(self.target_num)]
+            [layer_init(nn.Linear(neural_size_2, 1), std=0) for i in range(self.target_num)]
         )
 
     def get_value(self, state: torch.Tensor):
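
Note on the last two hunks: they replace the hard-coded 256/128 hidden widths with the new module-level constants neural_size_1 = 400 and neural_size_2 = 300. Below is a minimal, standalone sketch (not part of the patch) of the resized per-target heads for a quick shape check; the orthogonal weight init inside layer_init, and the state_size and target_num values, are illustrative assumptions rather than values taken from the Aimbot environment.

import numpy as np
import torch
import torch.nn as nn


def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    # assumed orthogonal init, matching the layer_init signature shown in the hunk above
    nn.init.orthogonal_(layer.weight, std)
    nn.init.constant_(layer.bias, bias_const)
    return layer


neural_size_1 = 400  # first hidden layer width (was 256)
neural_size_2 = 300  # second hidden layer width (was 128)

state_size = 90   # placeholder observation size, not the real Aimbot value
target_num = 4    # placeholder head count (free / go / attack / defence)

# per-target hidden networks, mirroring PPOAgent.__init__ after this patch
hidden_networks = nn.ModuleList(
    [
        nn.Sequential(
            layer_init(nn.Linear(state_size, neural_size_1)),
            nn.LeakyReLU(),
            layer_init(nn.Linear(neural_size_1, neural_size_2)),
            nn.LeakyReLU(),
        )
        for _ in range(target_num)
    ]
)

# every downstream head (actor_dis, actor_mean, critic) now consumes a
# neural_size_2-dimensional feature instead of a 128-dimensional one
features = hidden_networks[0](torch.zeros(2, state_size))
print(features.shape)  # torch.Size([2, 300])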