diff --git a/.gitignore b/.gitignore
index f29e4da..4801267 100644
--- a/.gitignore
+++ b/.gitignore
@@ -83,4 +83,5 @@ crashlytics-build.properties
 /Aimbot-PPO-Python/Backup/
 /Aimbot-PPO-Python/Build/
 /Aimbot-PPO-Python/PPO-Model/
-/Aimbot-PPO-Python/GAIL-Expert-Data/
\ No newline at end of file
+/Aimbot-PPO-Python/GAIL-Expert-Data/
+/Aimbot-PPO-Python/runs/
\ No newline at end of file
diff --git a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
index 7202ca6..05d2be8 100644
--- a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
+++ b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
@@ -23,40 +23,41 @@ from mlagents_envs.side_channel.side_channel import (
 )
 from typing import List
 
-bestReward = 0
+bestReward = -1
 
-DEFAULT_SEED = 933139
-ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy/Aimbot-ParallelEnv"
+DEFAULT_SEED = 9331
+ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv"
 SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
 WAND_ENTITY = "koha9"
-WORKER_ID = 2
-BASE_PORT = 1001
+WORKER_ID = 3
+BASE_PORT = 1002
 
 # max round steps per agent is 2500/Decision_period, 25 seconds
 # !!!check every parameters before run!!!
-TOTAL_STEPS = 6750000
-BATCH_SIZE = 512
-MAX_TRAINNING_DATASETS = 3000
+TOTAL_STEPS = 3150000
+BATCH_SIZE = 1024
+MAX_TRAINNING_DATASETS = 6000
 DECISION_PERIOD = 1
-LEARNING_RATE = 1e-3
+LEARNING_RATE = 5e-4
 GAMMA = 0.99
 GAE_LAMBDA = 0.95
-EPOCHS = 4
-CLIP_COEF = 0.1
-POLICY_COEF = 1.0
-ENTROPY_COEF = 0.01
-CRITIC_COEF = 0.5
-TARGET_LEARNING_RATE = 1e-5
+EPOCHS = 3
+CLIP_COEF = 0.11
+LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
+POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
+ENTROPY_COEF = [0.1, 0.1, 0.1, 0.1]
+CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
+TARGET_LEARNING_RATE = 1e-6
 
 ANNEAL_LEARNING_RATE = True
 CLIP_VLOSS = True
 NORM_ADV = True
 TRAIN = True
 
-WANDB_TACK = True
+WANDB_TACK = False
 LOAD_DIR = None
-#LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
+#LOAD_DIR = "../PPO-Model/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670522099-freeonly-12/Aimbot-target-last.pt"
 
 # public data
 class Targets(Enum):
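Note: the scalar PPO coefficients become per-target lists above (index order Free, Go, Attack, Defence), and further down in this diff they weight each target's own loss before its dedicated optimizer steps. A minimal sketch of that weighting, with placeholder loss values rather than real rollout numbers:

```python
import torch

# Per-target coefficient lists from the constants above: Free, Go, Attack, Defence.
LOSS_COEF    = [1.0, 1.0, 1.0, 1.0]
POLICY_COEF  = [1.0, 1.0, 1.0, 1.0]
ENTROPY_COEF = [0.1, 0.1, 0.1, 0.1]
CRITIC_COEF  = [0.5, 0.5, 0.5, 0.5]

def total_loss(dis_pg_loss, con_pg_loss, entropy_loss, v_loss, thisT):
    # Mirrors the loss assembly further down in this diff; note the entropy term
    # is now added with a positive sign rather than subtracted.
    return (
        dis_pg_loss * POLICY_COEF[thisT]
        + con_pg_loss * POLICY_COEF[thisT]
        + entropy_loss * ENTROPY_COEF[thisT]
        + v_loss * CRITIC_COEF[thisT]
    ) * LOSS_COEF[thisT]

# Illustrative scalar losses for target 0 (Free); real values come from the PPO update.
print(total_loss(torch.tensor(0.02), torch.tensor(0.03), torch.tensor(1.5), torch.tensor(0.4), thisT=0))
```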
@@ -65,12 +66,17 @@ class Targets(Enum):
     Attack = 2
     Defence = 3
     Num = 4
-STATE_REMAINTIME_POSITION = 6
+TARGET_STATE_SIZE = 6
+INAREA_STATE_SIZE = 1
+TIME_STATE_SIZE = 1
+GUN_STATE_SIZE = 1
+MY_STATE_SIZE = 4
+TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
 BASE_WINREWARD = 999
 BASE_LOSEREWARD = -999
 TARGETNUM= 4
 ENV_TIMELIMIT = 30
-RESULT_BROADCAST_RATIO = 2/ENV_TIMELIMIT
+RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
 
 TotalRounds = {"Free":0,"Go":0,"Attack":0}
 WinRounds = {"Free":0,"Go":0,"Attack":0}
@@ -117,6 +123,8 @@
                         help="load model directory")
     parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
                         help="the number of steps to run in each environment per policy rollout")
+    parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
+                        help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
     # GAE loss
     parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
@@ -156,42 +164,44 @@ class PPOAgent(nn.Module):
     def __init__(self, env: Aimbot,targetNum:int):
         super(PPOAgent, self).__init__()
         self.targetNum = targetNum
+        self.stateSize = env.unity_observation_shape[0]
+        self.targetSize = TARGET_STATE_SIZE
+        self.timeSize = TIME_STATE_SIZE
+        self.gunSize = GUN_STATE_SIZE
+        self.myStateSize = MY_STATE_SIZE
+        self.totalMiddleSize = TOTAL_T_SIZE
+        self.head_input_size = env.unity_observation_shape[0] - self.targetSize-self.timeSize-self.gunSize# except target state input
+
         self.discrete_size = env.unity_discrete_size
         self.discrete_shape = list(env.unity_discrete_branches)
         self.continuous_size = env.unity_continuous_size
 
-        self.network = nn.ModuleList([nn.Sequential(
-            layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 300)),
-            nn.ReLU(),
+        self.network = nn.Sequential(
+            layer_init(nn.Linear(env.unity_observation_shape[0], 300)),
+            nn.Tanh(),
             layer_init(nn.Linear(300, 200)),
-            nn.ReLU()) for i in range(targetNum)])
-        self.actor_dis = nn.ModuleList([layer_init(nn.Linear(200, self.discrete_size), std=0.01) for i in range(targetNum)])
-        self.actor_mean = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=0.01) for i in range(targetNum)])
-        self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(targetNum)])
-        self.critic = nn.ModuleList([layer_init(nn.Linear(200, 1), std=1)for i in range(targetNum)])
+            nn.Tanh(),
+        )
+        self.actor_dis = layer_init(nn.Linear(200, self.discrete_size), std=0.5)
+        self.actor_mean = layer_init(nn.Linear(200, self.continuous_size), std=0.5)
+        self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
+        self.critic = layer_init(nn.Linear(200, 1), std=1)
 
     def get_value(self, state: torch.Tensor):
-        targets = state[:,0].to(torch.int32)
-        hidden = torch.stack([self.network[targets[i]](state[i]) for i in range(targets.size()[0])])
-        return torch.stack([self.critic[targets[i]](hidden[i])for i in range(targets.size()[0])])
+        return self.critic(self.network(state))
 
     def get_actions_value(self, state: torch.Tensor, actions=None):
-        targets = state[:,0].to(torch.int32)
-        hidden = torch.stack([self.network[targets[i]](state[i]) for i in range(targets.size()[0])])
+        hidden = self.network(state)
 
         # discrete
-        # iterate over the targets (i.e. the agent count) so each agent uses the output head that matches its target
-        dis_logits = torch.stack([self.actor_dis[targets[i]](hidden[i]) for i in range(targets.size()[0])])
+        dis_logits = self.actor_dis(hidden)
         split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
         multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
         # continuous
-        actions_mean = torch.stack([self.actor_mean[targets[i]](hidden[i]) for i in range(targets.size()[0])]) # self.actor_mean(hidden)
-        # action_logstd = torch.stack([self.actor_logstd[targets[i]].expand_as(actions_mean) for i in range(targets.size()[0])]) # self.actor_logstd.expand_as(actions_mean)
-        # print(action_logstd)
-        action_std = torch.squeeze(torch.stack([torch.exp(self.actor_logstd[targets[i]]) for i in range(targets.size()[0])]),dim = -1) # torch.exp(action_logstd)
+        actions_mean = self.actor_mean(hidden)
+        action_logstd = self.actor_logstd.expand_as(actions_mean)
+        action_std = torch.exp(action_logstd)
         con_probs = Normal(actions_mean, action_std)
-        # critic
-        criticV = torch.stack([self.critic[targets[i]](hidden[i])for i in range(targets.size()[0])])
 
         if actions is None:
             if args.train:
@@ -211,14 +221,13 @@ class PPOAgent(nn.Module):
                 [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
             )
             dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
-
         return (
             actions,
             dis_log_prob.sum(0),
             dis_entropy.sum(0),
            con_probs.log_prob(conAct).sum(1),
             con_probs.entropy().sum(1),
-            criticV,
+            self.critic(hidden),
         )
@@ -306,11 +315,11 @@ def broadCastEndReward(rewardBF:list,remainTime:float):
     if (rewardBF[-1]<=-500):
         # print("Lose DO NOT BROAD CAST",rewardBF[-1])
         thisRewardBF[-1] = rewardBF[-1]-BASE_LOSEREWARD
-        thisRewardBF = (np.asarray(thisRewardBF)).tolist()
+        thisRewardBF = thisRewardBF
     elif (rewardBF[-1]>=500):
         # print("Win! Broadcast reward!",rewardBF[-1])
         thisRewardBF[-1] = rewardBF[-1]-BASE_WINREWARD
-        thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*RESULT_BROADCAST_RATIO)).tolist()
+        thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*args.result_broadcast_ratio)).tolist()
     else:
         print("!!!!!DIDNT GET RESULT REWARD!!!!!!",rewardBF[-1])
     return torch.Tensor(thisRewardBF).to(device)
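For reference, the hunk above switches the win-round broadcast from the hard-coded RESULT_BROADCAST_RATIO to the new --result-broadcast-ratio argument: every step of a won round gets an extra remainTime * ratio after the +/-999 end marker is stripped. A standalone sketch of that shaping (the function name and defaults here are illustrative; the script itself operates on its rewards_bf buffers):

```python
import numpy as np

BASE_WINREWARD, BASE_LOSEREWARD = 999, -999

def broadcast_end_reward(reward_buffer, remain_time, ratio):
    """Sketch of the reward shaping above: strip the +/-999 round-end marker and,
    on a win, spread remain_time * ratio over every step of the finished round."""
    rewards = list(reward_buffer)
    if rewards[-1] <= -500:            # lost round: remove the lose marker only
        rewards[-1] -= BASE_LOSEREWARD
    elif rewards[-1] >= 500:           # won round: remove the win marker, then broadcast the bonus
        rewards[-1] -= BASE_WINREWARD
        rewards = (np.asarray(rewards) + remain_time * ratio).tolist()
    return rewards

# A 4-step round won with 12 s left and ratio 1/30 gets +0.4 added to every step.
print(broadcast_end_reward([0.1, 0.0, 0.2, 999.5], remain_time=12.0, ratio=1 / 30))
```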
@@ -327,17 +336,22 @@ if __name__ == "__main__":
     # Initialize environment anget optimizer
     aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID);
     env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimBotsideChannel])
+    agentList = []
+    optimizers = []
     if args.load_dir is None:
-        agent = PPOAgent(env,TARGETNUM).to(device)
+        for i in range(using_targets_num):
+            agentList.append(PPOAgent(env,TARGETNUM).to(device))
+            optimizers.append(optim.Adam(agentList[i].parameters(), lr=args.lr, eps=1e-5))
     else:
-        agent = torch.load(args.load_dir)
-        print("Load Agent", args.load_dir)
-        print(agent.eval())
+        print("NAH")
+        # !!!not finished
+        # agent = torch.load(args.load_dir)
+        # print("Load Agent", args.load_dir)
+        # print(agent.eval())
 
-    optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
     # Tensorboard and WandB Recorder
-    game_name = "Aimbot_Target_Hybrid_Multi_Output"
+    game_name = "Aimbot_Target_Hybrid_PMNN_V2"
     game_type = "OffPolicy_EndBC"
     run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
     if args.wandb_track:
@@ -387,38 +401,51 @@
     for total_steps in range(total_update_step):
         # discunt learning rate, while step == total_update_step lr will be 0
-        print("new episode")
+
         if args.annealLR:
             finalRatio = TARGET_LEARNING_RATE/args.lr
-            frac = 1.0 - finalRatio*((total_steps - 1.0) / total_update_step)
+            frac = 1.0 - ((total_steps + 1.0) / total_update_step)
             lrnow = frac * args.lr
-            optimizer.param_groups[0]["lr"] = lrnow
+            for optimizer in optimizers:
+                optimizer.param_groups[0]["lr"] = lrnow
+        else:
+            lrnow = args.lr
+        print("new episode",total_steps,"learning rate = ",lrnow)
 
         # MAIN LOOP: run agent in environment
-        i = 0
+        step = 0
         training = False
         trainQueue = []
+        last_reward = [0.for i in range(env.unity_agent_num)]
+        action = torch.zeros((env.unity_agent_num,env.unity_discrete_type+env.unity_continuous_size))
+        dis_logprob = torch.zeros((env.unity_agent_num,env.unity_discrete_size))
+        con_logprob = torch.zeros((env.unity_agent_num,env.unity_continuous_size))
+        value = torch.zeros((env.unity_agent_num,1))
         while True:
-            if i % args.decision_period == 0:
-                step = round(i / args.decision_period)
+            if step % args.decision_period == 0:
+                step += 1
                 # Choose action by agent
                 with torch.no_grad():
                     # predict actions
-                    action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
-                        torch.Tensor(state).to(device)
-                    )
-                    value = value.flatten()
+                    for i in range(env.unity_agent_num):
+                        actTarget = int(state[i][0])
+                        act, dis_lgprb, _, con_lgprb, _, vl = agentList[actTarget].get_actions_value(
+                            torch.Tensor([state[i]]).to(device)
+                        )
+                        action[i] = act
+                        dis_logprob[i] = dis_lgprb.squeeze(0)
+                        con_logprob[i] = con_lgprb.squeeze(0)
+                        value[i] = vl.squeeze(0)
 
                 # variable from GPU to CPU
                 action_cpu = action.cpu().numpy()
                 dis_logprob_cpu = dis_logprob.cpu().numpy()
                 con_logprob_cpu = con_logprob.cpu().numpy()
-                value_cpu = value.cpu().numpy()
+                value_cpu = value.flatten().cpu().numpy()
                 # Environment step
                 next_state, reward, next_done = env.step(action_cpu)
-                remainTime = state[i,STATE_REMAINTIME_POSITION]
 
                 # save memories
                 for i in range(env.unity_agent_num):
                     # save memories to buffers
@@ -426,22 +453,24 @@ if __name__ == "__main__":
                     act_bf[i].append(action_cpu[i])
                     dis_logprobs_bf[i].append(dis_logprob_cpu[i])
                     con_logprobs_bf[i].append(con_logprob_cpu[i])
-                    rewards_bf[i].append(reward[i])
+                    rewards_bf[i].append(reward[i]+last_reward[i])
                    dones_bf[i].append(done[i])
                     values_bf[i].append(value_cpu[i])
+                    remainTime = state[i,TARGET_STATE_SIZE]
                     if next_done[i] == True:
                         # finished a round, send finished memories to training datasets
                         # compute advantage and discounted reward
                         #print(i,"over")
+                        endTarget = int(ob_bf[i][0][0])
                         roundTargetType = int(state[i,0])
                         thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
                         adv, rt = GAE(
-                            agent,
+                            agentList[endTarget],
                             args,
                             thisRewardsTensor,
                             torch.Tensor(dones_bf[i]).to(device),
                             torch.tensor(values_bf[i]).to(device),
-                            torch.tensor([next_state[i]]).to(device),
+                            torch.tensor(next_state[i]).to(device).unsqueeze(0),
                             torch.Tensor([next_done[i]]).to(device),
                         )
                         # send memories to training datasets
@@ -476,13 +505,14 @@
                        break
                state, done = next_state, next_done
            else:
+               step += 1
                # skip this step use last predict action
-               next_obs, reward, next_done = env.step(action_cpu)
+               next_state, reward, next_done = env.step(action_cpu)
                # save memories
                for i in range(env.unity_agent_num):
                    if next_done[i] == True:
                        #print(i,"over???")
-                       # save last memories to buffers
+                       # save memories to buffers
                        ob_bf[i].append(state[i])
                        act_bf[i].append(action_cpu[i])
                        dis_logprobs_bf[i].append(dis_logprob_cpu[i])
@@ -490,30 +520,33 @@
                        rewards_bf[i].append(reward[i])
                        dones_bf[i].append(done[i])
                        values_bf[i].append(value_cpu[i])
+                       remainTime = state[i,TARGET_STATE_SIZE]
                        # finished a round, send finished memories to training datasets
                        # compute advantage and discounted reward
+                       roundTargetType = int(state[i,0])
+                       thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
                        adv, rt = GAE(
-                           agent,
+                           agentList[roundTargetType],
                            args,
-                           torch.tensor(rewards_bf[i]).to(device),
+                           thisRewardsTensor,
                            torch.Tensor(dones_bf[i]).to(device),
                            torch.tensor(values_bf[i]).to(device),
-                           torch.tensor(next_state[i]).to(device),
+                           torch.Tensor(next_state[i]).to(device).unsqueeze(dim = 0),
                            torch.Tensor([next_done[i]]).to(device),
                        )
                        # send memories to training datasets
-                       obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
-                       actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
-                       dis_logprobs = torch.cat(
-                           (dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
+                       obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
+                       actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
+                       dis_logprobs[roundTargetType] = torch.cat(
+                           (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
                        )
-                       con_logprobs = torch.cat(
-                           (con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
+                       con_logprobs[roundTargetType] = torch.cat(
+                           (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
                        )
-                       rewards = torch.cat((rewards, torch.tensor(rewards_bf[i]).to(device)), 0)
-                       values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
-                       advantages = torch.cat((advantages, adv), 0)
-                       returns = torch.cat((returns, rt), 0)
+                       rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
+                       values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
+                       advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
+                       returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
 
                        # clear buffers
                        ob_bf[i] = []
@@ -523,8 +556,10 @@
                        rewards_bf[i] = []
                        dones_bf[i] = []
                        values_bf[i] = []
-                       print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
-            state, done = next_state, next_done
+                       print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
+
+            state = next_state
+            last_reward = reward
             i += 1
 
         if args.train:
@@ -540,14 +575,16 @@
                 b_advantages = advantages[thisT].reshape(-1)
                 b_returns = returns[thisT].reshape(-1)
                 b_values = values[thisT].reshape(-1)
-                b_size = b_obs[thisT].size()[0]
+                b_size = b_obs.size()[0]
                 # Optimizing the policy and value network
                 b_inds = np.arange(b_size)
                 # clipfracs = []
                 for epoch in range(args.epochs):
+                    print(epoch,end="")
                     # shuffle all datasets
                    np.random.shuffle(b_inds)
                     for start in range(0, b_size, args.minibatchSize):
+                        print(".",end="")
                         end = start + args.minibatchSize
                         mb_inds = b_inds[start:end]
                         mb_advantages = b_advantages[mb_inds]
@@ -565,7 +602,7 @@
                             new_con_logprob,
                             con_entropy,
                             newvalue,
-                        ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
+                        ) = agentList[thisT].get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
                         # discrete ratio
                         dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
                         dis_ratio = dis_logratio.exp()
@@ -613,17 +650,17 @@
                         # total loss
                         entropy_loss = dis_entropy.mean() + con_entropy.mean()
                         loss = (
-                            dis_pg_loss * args.policy_coef
-                            + con_pg_loss * args.policy_coef
-                            - entropy_loss * args.ent_coef
-                            + v_loss * args.critic_coef
-                        )
+                            dis_pg_loss * POLICY_COEF[thisT]
+                            + con_pg_loss * POLICY_COEF[thisT]
+                            + entropy_loss * ENTROPY_COEF[thisT]
+                            + v_loss * CRITIC_COEF[thisT]
+                        )*LOSS_COEF[thisT]
 
-                        optimizer.zero_grad()
+                        optimizers[thisT].zero_grad()
                        loss.backward()
                         # Clips gradient norm of an iterable of parameters.
-                        nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
-                        optimizer.step()
+                        nn.utils.clip_grad_norm_(agentList[thisT].parameters(), args.max_grad_norm)
+                        optimizers[thisT].step()
 
                 """
                 if args.target_kl is not None:
                     break
                 """
                 # record mean reward before clear history
+                print("done")
                 targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
                 meanRewardList.append(targetRewardMean)
                 targetName = Targets(thisT).name
@@ -660,10 +698,12 @@
             # New Record!
             if TotalRewardMean > bestReward:
                 bestReward = targetRewardMean
-                saveDir = "../PPO-Model/Hybrid-MNN-500-300" + str(TotalRewardMean) + ".pt"
-                torch.save(agent, saveDir)
+                for i in range(using_targets_num):
+                    saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) +"_"+ str(i)+".pt"
+                    torch.save(agentList[i], saveDir)
 
-    saveDir = "../PPO-Model/Hybrid-MNN-500-300-Last" + ".pt"
-    torch.save(agent, saveDir)
+    for i in range(using_targets_num):
+        saveDir = "../PPO-Model/"+ run_name +"_last_"+ str(i) + ".pt"
+        torch.save(agentList[i], saveDir)
     env.close()
     writer.close()
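Taken together, the MultiNN-PPO.py changes replace the single multi-head agent with one independent single-head PPOAgent per target type, each with its own optimizer and training buffers, selected at runtime from the target id stored in the first element of every observation. A self-contained sketch of that dispatch pattern with stand-in networks (the sizes and the tiny nn.Sequential policies are placeholders, not the script's real PPOAgent):

```python
import torch
import torch.nn as nn

# Placeholder sizes; the script derives these from the Unity environment.
NUM_TARGETS, NUM_AGENTS, OBS_SIZE, ACT_SIZE = 4, 8, 38, 3

# One independent policy and optimizer per target type (Free, Go, Attack, Defence).
policies = [nn.Sequential(nn.Linear(OBS_SIZE, 64), nn.Tanh(), nn.Linear(64, ACT_SIZE))
            for _ in range(NUM_TARGETS)]
optimizers = [torch.optim.Adam(p.parameters(), lr=5e-4, eps=1e-5) for p in policies]

# state[:, 0] carries each agent's current target id, so every observation is
# routed to the network that owns that target: one forward pass per agent.
state = torch.rand(NUM_AGENTS, OBS_SIZE)
state[:, 0] = torch.randint(0, NUM_TARGETS, (NUM_AGENTS,)).float()
with torch.no_grad():
    actions = torch.stack([policies[int(s[0])](s) for s in state])
print(actions.shape)  # torch.Size([8, 3])
```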
diff --git a/Aimbot-PPO-Python/Pytorch/testarea.ipynb b/Aimbot-PPO-Python/Pytorch/testarea.ipynb
index 3efc30e..85acfe9 100644
--- a/Aimbot-PPO-Python/Pytorch/testarea.ipynb
+++ b/Aimbot-PPO-Python/Pytorch/testarea.ipynb
@@ -601,13 +601,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
+    "import numpy as np\n",
     "import torch\n",
     "import torch.nn as nn\n",
     "import torch.optim as optim\n",
+    "from AimbotEnv import Aimbot\n",
     "from torch.distributions.normal import Normal\n",
     "from torch.distributions.categorical import Categorical\n",
     "device = torch.device(\"cuda\" if torch.cuda.is_available() and True else \"cpu\")\n",
@@ -620,39 +622,37 @@
     "class PPOAgent(nn.Module):\n",
     "    def __init__(self, env: Aimbot,targetNum:int):\n",
     "        super(PPOAgent, self).__init__()\n",
-    "        self.targetNum = targetNum\n",
+    "        self.stateSize = env.unity_observation_shape[0]\n",
+    "\n",
     "        self.discrete_size = env.unity_discrete_size\n",
     "        self.discrete_shape = list(env.unity_discrete_branches)\n",
     "        self.continuous_size = env.unity_continuous_size\n",
     "\n",
     "        self.network = nn.Sequential(\n",
-    "            layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 500)),\n",
-    "            nn.ReLU(),\n",
-    "            layer_init(nn.Linear(500, 300)),\n",
-    "            nn.ReLU(),\n",
+    "            layer_init(nn.Linear(env.unity_observation_shape[0], 300)),\n",
+    "            nn.Tanh(),\n",
+    "            layer_init(nn.Linear(300, 200)),\n",
+    "            nn.Tanh(),\n",
     "        )\n",
-    "        self.actor_dis = nn.ModuleList([layer_init(nn.Linear(300, self.discrete_size), std=0.01) for i in range(targetNum)])\n",
-    "        self.actor_mean = nn.ModuleList([layer_init(nn.Linear(300, self.continuous_size), std=0.01) for i in range(targetNum)])\n",
-    "        self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(targetNum)])\n",
-    "        self.critic = layer_init(nn.Linear(300, 1), std=1)\n",
+    "        self.actor_dis = layer_init(nn.Linear(200, self.discrete_size), std=0.5)\n",
+    "        self.actor_mean = layer_init(nn.Linear(200, self.continuous_size), std=0.5)\n",
+    "        self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))\n",
+    "        self.critic = layer_init(nn.Linear(200, 1), std=1)\n",
     "\n",
     "    def get_value(self, state: torch.Tensor):\n",
     "        return self.critic(self.network(state))\n",
     "\n",
     "    def get_actions_value(self, state: torch.Tensor, actions=None):\n",
     "        hidden = self.network(state)\n",
-    "        targets = torch.argmax(state[:,0:self.targetNum],dim=1)\n",
     "\n",
     "        # discrete\n",
-    "        # iterate over the targets (i.e. the agent count) so each agent uses the output head that matches its target\n",
-    "        dis_logits = torch.stack([self.actor_dis[targets[i]](hidden[i]) for i in range(targets.size()[0])])\n",
+    "        dis_logits = self.actor_dis(hidden)\n",
     "        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)\n",
     "        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]\n",
     "        # continuous\n",
-    "        actions_mean = torch.stack([self.actor_mean[targets[i]](hidden[i]) for i in range(targets.size()[0])]) # self.actor_mean(hidden)\n",
-    "        # action_logstd = torch.stack([self.actor_logstd[targets[i]].expand_as(actions_mean) for i in range(targets.size()[0])]) # self.actor_logstd.expand_as(actions_mean)\n",
-    "        # print(action_logstd)\n",
-    "        action_std = torch.squeeze(torch.stack([torch.exp(self.actor_logstd[targets[i]]) for i in range(targets.size()[0])]),dim = -1) # torch.exp(action_logstd)\n",
+    "        actions_mean = self.actor_mean(hidden)\n",
+    "        action_logstd = self.actor_logstd.expand_as(actions_mean)\n",
+    "        action_std = torch.exp(action_logstd)\n",
     "        con_probs = Normal(actions_mean, action_std)\n",
     "\n",
     "        if actions is None:\n",
@@ -680,117 +680,46 @@
     "            con_probs.log_prob(conAct).sum(1),\n",
     "            con_probs.entropy().sum(1),\n",
     "            self.critic(hidden),\n",
-    "        )\n",
-    "agent = PPOAgent(env,4).to(device)"
+    "        )"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ppp = \"../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv\"\n",
+    "env = Aimbot(envPath=ppp, workerID=1, basePort=1000,side_channels=[])\n",
+    "agent_list = []\n",
+    "optimizers = []\n",
+    "for i in range(3):\n",
+    "    agent_list.append(PPOAgent(env=env,targetNum=3).to('cuda'))\n",
+    "    optimizers.append(optim.Adam(agent_list[i].parameters(),lr=1e-4))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "array([[ 1. , -10.343613 , 0. , -7.367299 ,\n",
-       "  0. , 0. , 30. , -10.343662 ,\n",
-       "  1. , -33.708736 , 1. , 1. ,\n",
-       "  1. , 1. , 2. , 1. ,\n",
-       "  1. , 1. , 2. , 2. ,\n",
-       "  2. , 1. , 1. , 1. ,\n",
-       "  33.270493 , 39.50663 , 49.146526 , 32.595673 ,\n",
-       "  30.21616 , 21.163797 , 46.9299 , 1.3264331 ,\n",
-       "  1.2435672 , 1.2541904 , 30.08522 , 30.041445 ,\n",
-       "  21.072094 , 0. ],\n",
-       " [ 0. , 0. , 0. , 0. ,\n",
-       "  0. , 0. , 30. , -5.5892515 ,\n",
-       "  1. , -29.907726 , 1. , 1. ,\n",
-       "  1. , 1. , 2. , 1. ,\n",
-       "  1. , 1. , 1. , 1. ,\n",
-       "  1. , 1. , 1. , 1. ,\n",
-       "  41.408752 , 47.830173 , 45.03225 , 31.905174 ,\n",
-       "  41.849663 , 41.849648 , 43.001434 , 45.0322 ,\n",
-       "  47.48242 , 40.00285 , 41.668346 , 41.607723 ,\n",
-       "  41.668335 , 0. ],\n",
-       " [ 1. , 2.9582403 , 0. , -4.699738 ,\n",
-       "  0. , 0. , 30. , -5.412487 ,\n",
-       "  1. , -32.79967 , 1. , 2. ,\n",
-       "  1. , 1. , 1. , 1. ,\n",
-       "  1. , 1. , 1. , 2. ,\n",
-       "  1. , 1. , 1. , 1. ,\n",
-       "  20.17488 , 49.507687 , 48.162056 , 45.98998 ,\n",
-       "  44.75835 , 31.08564 , 32.865173 , 24.676666 ,\n",
-       "  12.952409 , 39.69923 , 44.564423 , 44.49966 ,\n",
-       "  44.564495 , 0. ],\n",
-       " [ 2. , -0.20171738, 0. , -10.340863 ,\n",
-       "  0. , 0. , 30. , -22.987915 ,\n",
-       "  1. , -34.37514 , 1. , 1. ,\n",
-       "  1. , 1. , 1. , 1. ,\n",
-       "  1. , 2. , 1. , 1. ,\n",
-       "  1. , 1. , 1. , 1. ,\n",
-       "  11.631058 , 13.872022 , 18.006863 , 27.457632 ,\n",
-       "  46.343067 , 46.343094 , 20.155125 , 49.867714 ,\n",
-       "  52.965984 , 56.775608 , 46.14223 , 46.075138 ,\n",
-       "  46.142246 , 0. ],\n",
-       " [ 2. , -14.687862 , 0. , -12.615574 ,\n",
-       "  0. , 0. , 30. , 15.125373 ,\n",
-       "  1. , -30.849268 , 1. , 1. ,\n",
-       "  1. , 1. , 1. , 1. ,\n",
-       "  1. , 1. , 1. , 1. ,\n",
-       "  1. , 1. , 1. , 2. ,\n",
-       "  52.430542 , 48.912865 , 46.05145 , 43.974594 ,\n",
-       "  42.796673 , 26.467875 , 11.072432 , 7.190229 ,\n",
-       "  5.483198 , 4.5500183 , 42.611244 , 42.549267 ,\n",
-       "  18.856438 , 0. ],\n",
-       " [ 0. , 0. , 0. , 0. ,\n",
-       "  0. , 0. , 30. , -4.0314903 ,\n",
-       "  1. , -29.164669 , 1. , 1. ,\n",
-       "  1. , 1. , 1. , 1. ,\n",
-       "  1. , 1. , 1. , 1. ,\n",
-       "  1. , 1. , 1. , 1. ,\n",
-       "  44.074184 , 46.9762 , 44.228096 , 42.2335 ,\n",
-       "  41.102253 , 41.102367 , 42.233757 , 44.22849 ,\n",
-       "  44.321827 , 37.335304 , 40.924183 , 40.86467 ,\n",
-       "  40.924236 , 0. ],\n",
-       " [ 0. , 0. , 0. , 0. ,\n",
-       "  0. , 0. , 30. , -18.603981 ,\n",
-       "  1. , -29.797592 , 1. , 1. ,\n",
-       "  1. , 1. , 1. , 1. ,\n",
-       "  1. , 1. , 1. , 1. ,\n",
-       "  1. , 2. , 2. , 2. ,\n",
-       "  19.134174 , 22.76088 , 29.468704 , 42.88739 ,\n",
-       "  41.738823 , 41.739002 , 42.88781 , 44.913647 ,\n",
-       "  47.704174 , 51.135338 , 20.418388 , 12.470214 ,\n",
-       "  12.670923 , 0. ],\n",
-       " [ 0. , 0. , 0. , 0. ,\n",
-       "  0. , 0. , 30. , -19.07032 ,\n",
-       "  1. , -30.246218 , 1. , 1. ,\n",
-       "  1. , 1. , 1. , 1. ,\n",
-       "  1. , 1. , 1. , 1. ,\n",
-       "  1. , 1. , 1. , 1. ,\n",
-       "  18.336487 , 21.81617 , 28.251017 , 42.977867 ,\n",
-       "  42.18994 , 42.19034 , 43.351707 , 45.399582 ,\n",
-       "  48.22037 , 51.68873 , 42.00719 , 41.94621 ,\n",
-       "  42.00739 , 0. ]], dtype=float32)"
+       "tensor([1., 2., 3., 4., 5.])"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "state,_,_ = env.getSteps()\n",
-    "state"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "env.close()"
+    "import torch\n",
+    "\n",
+    "aaa = torch.zeros((8,5))\n",
+    "aaa[0] = torch.Tensor([1,2,3,4,5])\n",
+    "aaa[0]"
    ]
   }
 ],
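Because the training script now writes one checkpoint per target (`.../<run_name>_last_<i>.pt`) while its `--load-dir` branch is still stubbed out ("NAH"), reloading the saved agents for evaluation could look roughly like the sketch below. The run name is a made-up example, the agent count is an assumption, and `torch.load` of a whole module requires the PPOAgent class to be importable:

```python
import torch

# Hypothetical run name; real names follow f"{game_name}_{game_type}_{seed}_{timestamp}".
run_name = "Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670000000"
using_targets_num = 4  # assumed number of per-target agents saved by the run

# torch.save() above stores the whole PPOAgent modules, so torch.load returns usable agents.
agent_list = [
    torch.load(f"../PPO-Model/{run_name}_last_{i}.pt", map_location="cpu")
    for i in range(using_targets_num)
]
for agent in agent_list:
    agent.eval()  # inference mode for evaluation rollouts
```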