Multi-Agent Per-Target-Type Action Selection Style

Switch to a per-target-type action-selection style: each target type now gets its own agent and optimizer (a minimal sketch of the dispatch pattern follows the change summary below). This took far too much time.
Koha9 2022-12-14 09:01:29 +09:00
parent 1787872e82
commit 34206b95c5
3 changed files with 179 additions and 209 deletions
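The heart of the change is visible in the training-script diff below: the single multi-head PPOAgent (a ModuleList of networks indexed by target) is replaced by one plain agent plus one Adam optimizer per target type, and every observation is routed to the agent whose index is stored in state[:, 0] (agentList[int(state[i][0])] in the rollout, optimizers[thisT] in the update). The snippet below is a minimal, self-contained sketch of that dispatch pattern only; the tiny Policy network, the state/action sizes, and the placeholder squared-output loss are illustrative assumptions standing in for the repo's actual PPOAgent and PPO losses.

import torch
import torch.nn as nn
import torch.optim as optim

TARGET_NUM = 4   # Free, Go, Attack, Defence (TARGETNUM in the script)
STATE_SIZE = 13  # illustrative observation width (assumption)
ACTION_SIZE = 5  # illustrative action width (assumption)

class Policy(nn.Module):
    """Toy stand-in for one per-target PPOAgent."""
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(STATE_SIZE, 64), nn.Tanh(), nn.Linear(64, ACTION_SIZE)
        )
    def forward(self, state: torch.Tensor) -> torch.Tensor:
        return self.net(state)

# one independent policy and optimizer per target type
policies = [Policy() for _ in range(TARGET_NUM)]
optimizers = [optim.Adam(p.parameters(), lr=5e-4) for p in policies]

# fake batch of agent observations; column 0 encodes the target type
states = torch.rand(8, STATE_SIZE)
states[:, 0] = torch.randint(0, TARGET_NUM, (8,)).float()

# rollout: route each agent's state to the policy matching its target type
actions = torch.zeros(8, ACTION_SIZE)
with torch.no_grad():
    for i in range(states.size(0)):
        target_type = int(states[i, 0])
        actions[i] = policies[target_type](states[i].unsqueeze(0)).squeeze(0)

# update: each target type keeps its own batch and trains its own policy
for t in range(TARGET_NUM):
    batch = states[states[:, 0].long() == t]
    if batch.size(0) == 0:
        continue
    loss = policies[t](batch).pow(2).mean()  # placeholder loss, not the PPO loss
    optimizers[t].zero_grad()
    loss.backward()
    optimizers[t].step()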

.gitignore (vendored)
View File

@@ -83,4 +83,5 @@ crashlytics-build.properties
/Aimbot-PPO-Python/Backup/
/Aimbot-PPO-Python/Build/
/Aimbot-PPO-Python/PPO-Model/
/Aimbot-PPO-Python/GAIL-Expert-Data/
/Aimbot-PPO-Python/GAIL-Expert-Data/
/Aimbot-PPO-Python/runs/

View File

@@ -23,40 +23,41 @@ from mlagents_envs.side_channel.side_channel import (
)
from typing import List
bestReward = 0
bestReward = -1
DEFAULT_SEED = 933139
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy/Aimbot-ParallelEnv"
DEFAULT_SEED = 9331
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv"
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
WAND_ENTITY = "koha9"
WORKER_ID = 2
BASE_PORT = 1001
WORKER_ID = 3
BASE_PORT = 1002
# max round steps per agent is 2500/Decision_period, 25 seconds
# !!!check every parameter before running!!!
TOTAL_STEPS = 6750000
BATCH_SIZE = 512
MAX_TRAINNING_DATASETS = 3000
TOTAL_STEPS = 3150000
BATCH_SIZE = 1024
MAX_TRAINNING_DATASETS = 6000
DECISION_PERIOD = 1
LEARNING_RATE = 1e-3
LEARNING_RATE = 5e-4
GAMMA = 0.99
GAE_LAMBDA = 0.95
EPOCHS = 4
CLIP_COEF = 0.1
POLICY_COEF = 1.0
ENTROPY_COEF = 0.01
CRITIC_COEF = 0.5
TARGET_LEARNING_RATE = 1e-5
EPOCHS = 3
CLIP_COEF = 0.11
LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
ENTROPY_COEF = [0.1, 0.1, 0.1, 0.1]
CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
TARGET_LEARNING_RATE = 1e-6
ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = True
TRAIN = True
WANDB_TACK = True
WANDB_TACK = False
LOAD_DIR = None
#LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
#LOAD_DIR = "../PPO-Model/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670522099-freeonly-12/Aimbot-target-last.pt"
# public data
class Targets(Enum):
@@ -65,12 +66,17 @@ class Targets(Enum):
Attack = 2
Defence = 3
Num = 4
STATE_REMAINTIME_POSITION = 6
TARGET_STATE_SIZE = 6
INAREA_STATE_SIZE = 1
TIME_STATE_SIZE = 1
GUN_STATE_SIZE = 1
MY_STATE_SIZE = 4
TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
BASE_WINREWARD = 999
BASE_LOSEREWARD = -999
TARGETNUM= 4
ENV_TIMELIMIT = 30
RESULT_BROADCAST_RATIO = 2/ENV_TIMELIMIT
RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
TotalRounds = {"Free":0,"Go":0,"Attack":0}
WinRounds = {"Free":0,"Go":0,"Attack":0}
@@ -117,6 +123,8 @@ def parse_args():
help="load model directory")
parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
help="the number of steps to run in each environment per policy rollout")
parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
# GAE loss
parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
@@ -156,42 +164,44 @@ class PPOAgent(nn.Module):
def __init__(self, env: Aimbot,targetNum:int):
super(PPOAgent, self).__init__()
self.targetNum = targetNum
self.stateSize = env.unity_observation_shape[0]
self.targetSize = TARGET_STATE_SIZE
self.timeSize = TIME_STATE_SIZE
self.gunSize = GUN_STATE_SIZE
self.myStateSize = MY_STATE_SIZE
self.totalMiddleSize = TOTAL_T_SIZE
self.head_input_size = env.unity_observation_shape[0] - self.targetSize-self.timeSize-self.gunSize# except target state input
self.discrete_size = env.unity_discrete_size
self.discrete_shape = list(env.unity_discrete_branches)
self.continuous_size = env.unity_continuous_size
self.network = nn.ModuleList([nn.Sequential(
layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 300)),
nn.ReLU(),
self.network = nn.Sequential(
layer_init(nn.Linear(env.unity_observation_shape[0], 300)),
nn.Tanh(),
layer_init(nn.Linear(300, 200)),
nn.ReLU()) for i in range(targetNum)])
self.actor_dis = nn.ModuleList([layer_init(nn.Linear(200, self.discrete_size), std=0.01) for i in range(targetNum)])
self.actor_mean = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=0.01) for i in range(targetNum)])
self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(targetNum)])
self.critic = nn.ModuleList([layer_init(nn.Linear(200, 1), std=1)for i in range(targetNum)])
nn.Tanh(),
)
self.actor_dis = layer_init(nn.Linear(200, self.discrete_size), std=0.5)
self.actor_mean = layer_init(nn.Linear(200, self.continuous_size), std=0.5)
self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
self.critic = layer_init(nn.Linear(200, 1), std=1)
def get_value(self, state: torch.Tensor):
targets = state[:,0].to(torch.int32)
hidden = torch.stack([self.network[targets[i]](state[i]) for i in range(targets.size()[0])])
return torch.stack([self.critic[targets[i]](hidden[i])for i in range(targets.size()[0])])
return self.critic(self.network(state))
def get_actions_value(self, state: torch.Tensor, actions=None):
targets = state[:,0].to(torch.int32)
hidden = torch.stack([self.network[targets[i]](state[i]) for i in range(targets.size()[0])])
hidden = self.network(state)
# discrete
# iterate over the targets (i.e. over the agents) so that each one uses the output network corresponding to its target
dis_logits = torch.stack([self.actor_dis[targets[i]](hidden[i]) for i in range(targets.size()[0])])
dis_logits = self.actor_dis(hidden)
split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
# continuous
actions_mean = torch.stack([self.actor_mean[targets[i]](hidden[i]) for i in range(targets.size()[0])]) # self.actor_mean(hidden)
# action_logstd = torch.stack([self.actor_logstd[targets[i]].expand_as(actions_mean) for i in range(targets.size()[0])]) # self.actor_logstd.expand_as(actions_mean)
# print(action_logstd)
action_std = torch.squeeze(torch.stack([torch.exp(self.actor_logstd[targets[i]]) for i in range(targets.size()[0])]),dim = -1) # torch.exp(action_logstd)
actions_mean = self.actor_mean(hidden)
action_logstd = self.actor_logstd.expand_as(actions_mean)
action_std = torch.exp(action_logstd)
con_probs = Normal(actions_mean, action_std)
# critic
criticV = torch.stack([self.critic[targets[i]](hidden[i])for i in range(targets.size()[0])])
if actions is None:
if args.train:
@@ -211,14 +221,13 @@ class PPOAgent(nn.Module):
[ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
)
dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
return (
actions,
dis_log_prob.sum(0),
dis_entropy.sum(0),
con_probs.log_prob(conAct).sum(1),
con_probs.entropy().sum(1),
criticV,
self.critic(hidden),
)
@@ -306,11 +315,11 @@ def broadCastEndReward(rewardBF:list,remainTime:float):
if (rewardBF[-1]<=-500):
# print("Lose DO NOT BROAD CAST",rewardBF[-1])
thisRewardBF[-1] = rewardBF[-1]-BASE_LOSEREWARD
thisRewardBF = (np.asarray(thisRewardBF)).tolist()
thisRewardBF = thisRewardBF
elif (rewardBF[-1]>=500):
# print("Win! Broadcast reward!",rewardBF[-1])
thisRewardBF[-1] = rewardBF[-1]-BASE_WINREWARD
thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*RESULT_BROADCAST_RATIO)).tolist()
thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*args.result_broadcast_ratio)).tolist()
else:
print("!!!!!DIDNT GET RESULT REWARD!!!!!!",rewardBF[-1])
return torch.Tensor(thisRewardBF).to(device)
@@ -327,17 +336,22 @@ if __name__ == "__main__":
# Initialize environment, agent and optimizer
aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID);
env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimBotsideChannel])
agentList = []
optimizers = []
if args.load_dir is None:
agent = PPOAgent(env,TARGETNUM).to(device)
for i in range(using_targets_num):
agentList.append(PPOAgent(env,TARGETNUM).to(device))
optimizers.append(optim.Adam(agentList[i].parameters(), lr=args.lr, eps=1e-5))
else:
agent = torch.load(args.load_dir)
print("Load Agent", args.load_dir)
print(agent.eval())
print("NAH")
# !!!not finished
# agent = torch.load(args.load_dir)
# print("Load Agent", args.load_dir)
# print(agent.eval())
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
# Tensorboard and WandB Recorder
game_name = "Aimbot_Target_Hybrid_Multi_Output"
game_name = "Aimbot_Target_Hybrid_PMNN_V2"
game_type = "OffPolicy_EndBC"
run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
if args.wandb_track:
@@ -387,38 +401,51 @@ if __name__ == "__main__":
for total_steps in range(total_update_step):
# discount learning rate; when total_steps == total_update_step, lr will be 0
print("new episode")
if args.annealLR:
finalRatio = TARGET_LEARNING_RATE/args.lr
frac = 1.0 - finalRatio*((total_steps - 1.0) / total_update_step)
frac = 1.0 - ((total_steps + 1.0) / total_update_step)
lrnow = frac * args.lr
optimizer.param_groups[0]["lr"] = lrnow
for optimizer in optimizers:
optimizer.param_groups[0]["lr"] = lrnow
else:
lrnow = args.lr
print("new episode",total_steps,"learning rate = ",lrnow)
# MAIN LOOP: run agent in environment
i = 0
step = 0
training = False
trainQueue = []
last_reward = [0. for i in range(env.unity_agent_num)]
action = torch.zeros((env.unity_agent_num,env.unity_discrete_type+env.unity_continuous_size))
dis_logprob = torch.zeros((env.unity_agent_num,env.unity_discrete_size))
con_logprob = torch.zeros((env.unity_agent_num,env.unity_continuous_size))
value = torch.zeros((env.unity_agent_num,1))
while True:
if i % args.decision_period == 0:
step = round(i / args.decision_period)
if step % args.decision_period == 0:
step += 1
# Choose action by agent
with torch.no_grad():
# predict actions
action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
torch.Tensor(state).to(device)
)
value = value.flatten()
for i in range(env.unity_agent_num):
actTarget = int(state[i][0])
act, dis_lgprb, _, con_lgprb, _, vl = agentList[actTarget].get_actions_value(
torch.Tensor([state[i]]).to(device)
)
action[i] = act
dis_logprob[i] = dis_lgprb.squeeze(0)
con_logprob[i] = con_lgprb.squeeze(0)
value[i] = vl.squeeze(0)
# variable from GPU to CPU
action_cpu = action.cpu().numpy()
dis_logprob_cpu = dis_logprob.cpu().numpy()
con_logprob_cpu = con_logprob.cpu().numpy()
value_cpu = value.cpu().numpy()
value_cpu = value.flatten().cpu().numpy()
# Environment step
next_state, reward, next_done = env.step(action_cpu)
remainTime = state[i,STATE_REMAINTIME_POSITION]
# save memories
for i in range(env.unity_agent_num):
# save memories to buffers
@@ -426,22 +453,24 @@ if __name__ == "__main__":
act_bf[i].append(action_cpu[i])
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
con_logprobs_bf[i].append(con_logprob_cpu[i])
rewards_bf[i].append(reward[i])
rewards_bf[i].append(reward[i]+last_reward[i])
dones_bf[i].append(done[i])
values_bf[i].append(value_cpu[i])
remainTime = state[i,TARGET_STATE_SIZE]
if next_done[i] == True:
# finished a round, send finished memories to training datasets
# compute advantage and discounted reward
#print(i,"over")
endTarget = int(ob_bf[i][0][0])
roundTargetType = int(state[i,0])
thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
adv, rt = GAE(
agent,
agentList[endTarget],
args,
thisRewardsTensor,
torch.Tensor(dones_bf[i]).to(device),
torch.tensor(values_bf[i]).to(device),
torch.tensor([next_state[i]]).to(device),
torch.tensor(next_state[i]).to(device).unsqueeze(0),
torch.Tensor([next_done[i]]).to(device),
)
# send memories to training datasets
@@ -476,13 +505,14 @@ if __name__ == "__main__":
break
state, done = next_state, next_done
else:
step += 1
# skip this step; reuse the last predicted action
next_obs, reward, next_done = env.step(action_cpu)
next_state, reward, next_done = env.step(action_cpu)
# save memories
for i in range(env.unity_agent_num):
if next_done[i] == True:
#print(i,"over???")
# save last memories to buffers
# save memories to buffers
ob_bf[i].append(state[i])
act_bf[i].append(action_cpu[i])
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
@@ -490,30 +520,33 @@ if __name__ == "__main__":
rewards_bf[i].append(reward[i])
dones_bf[i].append(done[i])
values_bf[i].append(value_cpu[i])
remainTime = state[i,TARGET_STATE_SIZE]
# finished a round, send finished memories to training datasets
# compute advantage and discounted reward
roundTargetType = int(state[i,0])
thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
adv, rt = GAE(
agent,
agentList[roundTargetType],
args,
torch.tensor(rewards_bf[i]).to(device),
thisRewardsTensor,
torch.Tensor(dones_bf[i]).to(device),
torch.tensor(values_bf[i]).to(device),
torch.tensor(next_state[i]).to(device),
torch.Tensor(next_state[i]).to(device).unsqueeze(dim = 0),
torch.Tensor([next_done[i]]).to(device),
)
# send memories to training datasets
obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
dis_logprobs = torch.cat(
(dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
dis_logprobs[roundTargetType] = torch.cat(
(dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
)
con_logprobs = torch.cat(
(con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
con_logprobs[roundTargetType] = torch.cat(
(con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
)
rewards = torch.cat((rewards, torch.tensor(rewards_bf[i]).to(device)), 0)
values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
advantages = torch.cat((advantages, adv), 0)
returns = torch.cat((returns, rt), 0)
rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
# clear buffers
ob_bf[i] = []
@@ -523,8 +556,10 @@ if __name__ == "__main__":
rewards_bf[i] = []
dones_bf[i] = []
values_bf[i] = []
print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
state, done = next_state, next_done
print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
state = next_state
last_reward = reward
i += 1
if args.train:
@@ -540,14 +575,16 @@ if __name__ == "__main__":
b_advantages = advantages[thisT].reshape(-1)
b_returns = returns[thisT].reshape(-1)
b_values = values[thisT].reshape(-1)
b_size = b_obs[thisT].size()[0]
b_size = b_obs.size()[0]
# Optimizing the policy and value network
b_inds = np.arange(b_size)
# clipfracs = []
for epoch in range(args.epochs):
print(epoch,end="")
# shuffle all datasets
np.random.shuffle(b_inds)
for start in range(0, b_size, args.minibatchSize):
print(".",end="")
end = start + args.minibatchSize
mb_inds = b_inds[start:end]
mb_advantages = b_advantages[mb_inds]
@@ -565,7 +602,7 @@ if __name__ == "__main__":
new_con_logprob,
con_entropy,
newvalue,
) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
) = agentList[thisT].get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
# discrete ratio
dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
dis_ratio = dis_logratio.exp()
@@ -613,17 +650,17 @@ if __name__ == "__main__":
# total loss
entropy_loss = dis_entropy.mean() + con_entropy.mean()
loss = (
dis_pg_loss * args.policy_coef
+ con_pg_loss * args.policy_coef
- entropy_loss * args.ent_coef
+ v_loss * args.critic_coef
)
dis_pg_loss * POLICY_COEF[thisT]
+ con_pg_loss * POLICY_COEF[thisT]
+ entropy_loss * ENTROPY_COEF[thisT]
+ v_loss * CRITIC_COEF[thisT]
)*LOSS_COEF[thisT]
optimizer.zero_grad()
optimizers[thisT].zero_grad()
loss.backward()
# Clips gradient norm of an iterable of parameters.
nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
optimizer.step()
nn.utils.clip_grad_norm_(agentList[thisT].parameters(), args.max_grad_norm)
optimizers[thisT].step()
"""
if args.target_kl is not None:
@@ -631,6 +668,7 @@ if __name__ == "__main__":
break
"""
# record mean reward before clearing history
print("done")
targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
meanRewardList.append(targetRewardMean)
targetName = Targets(thisT).name
@@ -660,10 +698,12 @@ if __name__ == "__main__":
# New Record!
if TotalRewardMean > bestReward:
bestReward = targetRewardMean
saveDir = "../PPO-Model/Hybrid-MNN-500-300" + str(TotalRewardMean) + ".pt"
torch.save(agent, saveDir)
for i in range(using_targets_num):
saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) +"_"+ str(i)+".pt"
torch.save(agentList[i], saveDir)
saveDir = "../PPO-Model/Hybrid-MNN-500-300-Last" + ".pt"
torch.save(agent, saveDir)
for i in range(using_targets_num):
saveDir = "../PPO-Model/"+ run_name +"_last_"+ str(i) + ".pt"
torch.save(agentList[i], saveDir)
env.close()
writer.close()
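One detail of the training script above that is easy to miss is the end-of-round reward handling in broadCastEndReward: the terminal reward carries a ±999 flag (BASE_WINREWARD / BASE_LOSEREWARD) that is stripped off, and on a win a bonus of remainTime * result_broadcast_ratio is added to every step of the finished round. The sketch below re-implements just that idea under those assumptions; it is not the repo's exact function (which additionally returns a tensor on the training device and prints a warning when no result flag is found).

import numpy as np

BASE_WINREWARD = 999
BASE_LOSEREWARD = -999
ENV_TIMELIMIT = 30                   # seconds per round
BROADCAST_RATIO = 1 / ENV_TIMELIMIT  # RESULT_BROADCAST_RATIO in the script

def broadcast_end_reward(rewards, remain_time):
    """Strip the win/lose flag from the last reward; on a win, spread a
    time-based bonus over every step of the round."""
    out = np.asarray(rewards, dtype=np.float32)
    if out[-1] <= -500:    # lost round: remove the lose flag only
        out[-1] -= BASE_LOSEREWARD
    elif out[-1] >= 500:   # won round: remove the win flag, then broadcast the bonus
        out[-1] -= BASE_WINREWARD
        out += remain_time * BROADCAST_RATIO
    return out

# Example: a 4-step round won with 12 s left adds 12/30 = 0.4 to every step.
print(broadcast_end_reward([0.0, 0.1, -0.2, 999.5], remain_time=12.0))
# -> approximately [0.4 0.5 0.2 0.9]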

View File

@@ -601,13 +601,15 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"from AimbotEnv import Aimbot\n",
"from torch.distributions.normal import Normal\n",
"from torch.distributions.categorical import Categorical\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() and True else \"cpu\")\n",
@@ -620,39 +622,37 @@
"class PPOAgent(nn.Module):\n",
" def __init__(self, env: Aimbot,targetNum:int):\n",
" super(PPOAgent, self).__init__()\n",
" self.targetNum = targetNum\n",
" self.stateSize = env.unity_observation_shape[0]\n",
"\n",
" self.discrete_size = env.unity_discrete_size\n",
" self.discrete_shape = list(env.unity_discrete_branches)\n",
" self.continuous_size = env.unity_continuous_size\n",
"\n",
" self.network = nn.Sequential(\n",
" layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 500)),\n",
" nn.ReLU(),\n",
" layer_init(nn.Linear(500, 300)),\n",
" nn.ReLU(),\n",
" layer_init(nn.Linear(env.unity_observation_shape[0], 300)),\n",
" nn.Tanh(),\n",
" layer_init(nn.Linear(300, 200)),\n",
" nn.Tanh(),\n",
" )\n",
" self.actor_dis = nn.ModuleList([layer_init(nn.Linear(300, self.discrete_size), std=0.01) for i in range(targetNum)])\n",
" self.actor_mean = nn.ModuleList([layer_init(nn.Linear(300, self.continuous_size), std=0.01) for i in range(targetNum)])\n",
" self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(targetNum)])\n",
" self.critic = layer_init(nn.Linear(300, 1), std=1)\n",
" self.actor_dis = layer_init(nn.Linear(200, self.discrete_size), std=0.5)\n",
" self.actor_mean = layer_init(nn.Linear(200, self.continuous_size), std=0.5)\n",
" self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))\n",
" self.critic = layer_init(nn.Linear(200, 1), std=1)\n",
"\n",
" def get_value(self, state: torch.Tensor):\n",
" return self.critic(self.network(state))\n",
"\n",
" def get_actions_value(self, state: torch.Tensor, actions=None):\n",
" hidden = self.network(state)\n",
" targets = torch.argmax(state[:,0:self.targetNum],dim=1)\n",
"\n",
" # discrete\n",
" # 递归targets的数量,既agent数来实现根据target不同来选用对应的输出网络计算输出\n",
" dis_logits = torch.stack([self.actor_dis[targets[i]](hidden[i]) for i in range(targets.size()[0])])\n",
" dis_logits = self.actor_dis(hidden)\n",
" split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)\n",
" multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]\n",
" # continuous\n",
" actions_mean = torch.stack([self.actor_mean[targets[i]](hidden[i]) for i in range(targets.size()[0])]) # self.actor_mean(hidden)\n",
" # action_logstd = torch.stack([self.actor_logstd[targets[i]].expand_as(actions_mean) for i in range(targets.size()[0])]) # self.actor_logstd.expand_as(actions_mean)\n",
" # print(action_logstd)\n",
" action_std = torch.squeeze(torch.stack([torch.exp(self.actor_logstd[targets[i]]) for i in range(targets.size()[0])]),dim = -1) # torch.exp(action_logstd)\n",
" actions_mean = self.actor_mean(hidden)\n",
" action_logstd = self.actor_logstd.expand_as(actions_mean)\n",
" action_std = torch.exp(action_logstd)\n",
" con_probs = Normal(actions_mean, action_std)\n",
"\n",
" if actions is None:\n",
@@ -680,117 +680,46 @@
" con_probs.log_prob(conAct).sum(1),\n",
" con_probs.entropy().sum(1),\n",
" self.critic(hidden),\n",
" )\n",
"agent = PPOAgent(env,4).to(device)"
" )"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"ppp = \"../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv\"\n",
"env = Aimbot(envPath=ppp, workerID=1, basePort=1000,side_channels=[])\n",
"agent_list = []\n",
"optimizers = []\n",
"for i in range(3):\n",
" agent_list.append(PPOAgent(env=env,targetNum=3).to('cuda'))\n",
" optimizers.append(optim.Adam(agent_list[i].parameters(),lr=1e-4))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1. , -10.343613 , 0. , -7.367299 ,\n",
" 0. , 0. , 30. , -10.343662 ,\n",
" 1. , -33.708736 , 1. , 1. ,\n",
" 1. , 1. , 2. , 1. ,\n",
" 1. , 1. , 2. , 2. ,\n",
" 2. , 1. , 1. , 1. ,\n",
" 33.270493 , 39.50663 , 49.146526 , 32.595673 ,\n",
" 30.21616 , 21.163797 , 46.9299 , 1.3264331 ,\n",
" 1.2435672 , 1.2541904 , 30.08522 , 30.041445 ,\n",
" 21.072094 , 0. ],\n",
" [ 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 30. , -5.5892515 ,\n",
" 1. , -29.907726 , 1. , 1. ,\n",
" 1. , 1. , 2. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 41.408752 , 47.830173 , 45.03225 , 31.905174 ,\n",
" 41.849663 , 41.849648 , 43.001434 , 45.0322 ,\n",
" 47.48242 , 40.00285 , 41.668346 , 41.607723 ,\n",
" 41.668335 , 0. ],\n",
" [ 1. , 2.9582403 , 0. , -4.699738 ,\n",
" 0. , 0. , 30. , -5.412487 ,\n",
" 1. , -32.79967 , 1. , 2. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 2. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 20.17488 , 49.507687 , 48.162056 , 45.98998 ,\n",
" 44.75835 , 31.08564 , 32.865173 , 24.676666 ,\n",
" 12.952409 , 39.69923 , 44.564423 , 44.49966 ,\n",
" 44.564495 , 0. ],\n",
" [ 2. , -0.20171738, 0. , -10.340863 ,\n",
" 0. , 0. , 30. , -22.987915 ,\n",
" 1. , -34.37514 , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 2. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 11.631058 , 13.872022 , 18.006863 , 27.457632 ,\n",
" 46.343067 , 46.343094 , 20.155125 , 49.867714 ,\n",
" 52.965984 , 56.775608 , 46.14223 , 46.075138 ,\n",
" 46.142246 , 0. ],\n",
" [ 2. , -14.687862 , 0. , -12.615574 ,\n",
" 0. , 0. , 30. , 15.125373 ,\n",
" 1. , -30.849268 , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 2. ,\n",
" 52.430542 , 48.912865 , 46.05145 , 43.974594 ,\n",
" 42.796673 , 26.467875 , 11.072432 , 7.190229 ,\n",
" 5.483198 , 4.5500183 , 42.611244 , 42.549267 ,\n",
" 18.856438 , 0. ],\n",
" [ 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 30. , -4.0314903 ,\n",
" 1. , -29.164669 , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 44.074184 , 46.9762 , 44.228096 , 42.2335 ,\n",
" 41.102253 , 41.102367 , 42.233757 , 44.22849 ,\n",
" 44.321827 , 37.335304 , 40.924183 , 40.86467 ,\n",
" 40.924236 , 0. ],\n",
" [ 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 30. , -18.603981 ,\n",
" 1. , -29.797592 , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 2. , 2. , 2. ,\n",
" 19.134174 , 22.76088 , 29.468704 , 42.88739 ,\n",
" 41.738823 , 41.739002 , 42.88781 , 44.913647 ,\n",
" 47.704174 , 51.135338 , 20.418388 , 12.470214 ,\n",
" 12.670923 , 0. ],\n",
" [ 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 30. , -19.07032 ,\n",
" 1. , -30.246218 , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 18.336487 , 21.81617 , 28.251017 , 42.977867 ,\n",
" 42.18994 , 42.19034 , 43.351707 , 45.399582 ,\n",
" 48.22037 , 51.68873 , 42.00719 , 41.94621 ,\n",
" 42.00739 , 0. ]], dtype=float32)"
"tensor([1., 2., 3., 4., 5.])"
]
},
"execution_count": 7,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"state,_,_ = env.getSteps()\n",
"state"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"env.close()"
"import torch\n",
"\n",
"aaa = torch.zeros((8,5))\n",
"aaa[0] = torch.Tensor([1,2,3,4,5])\n",
"aaa[0]"
]
}
],