Compare commits

...

24 Commits

Author SHA1 Message Date
573b09a920 Document arguments 2024-03-02 17:36:33 +09:00
9d9524429c Clean up unused variables; adapt to environment 3.6 2024-01-24 17:07:45 +09:00
5aa7e0936a Adjust the Critic_coef parameter 2023-11-23 15:29:13 +09:00
3bc5c30fd3 Add SideChannel support for Save_in_next_Trainning 2023-10-15 06:05:59 +09:00
2741d6d51a Change Tensor to tensor
Resolve the torch.Tensor vs. torch.tensor confusion and standardize tensor usage.
2023-08-08 20:47:56 +09:00
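A minimal sketch (assuming PyTorch; not taken from the diff) of the torch.Tensor vs. torch.tensor difference this commit standardizes: torch.Tensor is a legacy constructor that always returns float32, while torch.tensor infers or accepts a dtype.

import torch

data = [1, 2, 3]
a = torch.Tensor(data)   # legacy constructor: always float32, ignores the input dtype
b = torch.tensor(data)   # factory function: infers dtype from the data (int64 here)
c = torch.tensor(data, dtype=torch.float32)  # dtype can also be set explicitly
print(a.dtype, b.dtype, c.dtype)  # torch.float32 torch.int64 torch.float32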
9432eaa76e Fully separate the NNs
Use NNs that are fully separated by Target for prediction and training.
Revise the NN structure and the prediction algorithm to work with the Full Multi NN.
2023-08-04 05:13:32 +09:00
52ccce88bc Fix a small bug in the prediction function; standardize naming
Fix get_actions_value still sampling from the distribution instead of taking the best value when not in training mode.
Standardize naming.
2023-08-04 03:50:35 +09:00
15c1edb6c9 Support the V3.1.6 Play mode
Adjust for the Stay Target of V3.1.5.
2023-07-29 23:52:20 +09:00
f9ee51c256 Support the V3.1.6 training mode
Mainly modify the SideChannel to support the V3.1.6 training mode.
Standardize naming.
2023-07-29 22:40:03 +09:00
be1322381e Improve AIMemory runtime efficiency
For environment version 2.9.
Resolve the following warning: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor.
2023-07-29 04:05:02 +09:00
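A minimal sketch (assuming NumPy and PyTorch; the buffer name is hypothetical) of the fix this warning asks for: stack the list of per-step arrays into one ndarray before building the tensor.

import numpy as np
import torch

# hypothetical memory buffer: one ndarray per step
step_obs = [np.random.rand(93).astype(np.float32) for _ in range(100)]

# slow: torch.tensor(step_obs) converts 100 separate ndarrays one by one
# fast: merge them into a single ndarray first, then convert once
obs_tensor = torch.tensor(np.asarray(step_obs))
print(obs_tensor.shape)  # torch.Size([100, 93])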
efb5c61f0d Code cleanup
Split out args; standardize naming.
2023-07-24 16:48:47 +09:00
ef0ee495f2 Merge branch 'OffP-PartialMNN-review' into OffP-PartialMNN 2023-07-22 19:30:29 +09:00
a21fd724af Code cleanup
Split out ppoagent, AI memory, and AI Recorder.
Improve the Aimbot Env.
Standardize naming throughout.
Archive unused packages.
2023-07-22 19:26:39 +09:00
177974888a Code cleanup; not compatible with older models
Code cleanup; not compatible with older models.
2023-07-15 20:37:23 +09:00
bee609d160 Graduation backup
I am not sure what happened around graduation... it seems something side-channel related was changed.
2023-07-14 01:29:43 +09:00
cbcecfa9e9 Change to LeakyReLU
Change the activation function to LeakyReLU,
fix some bugs.
graph.py is for the thesis.
2023-03-09 18:11:00 +09:00
0e0d98d8b1 Change parameters based on a paper
Change parameters based on a paper, and it works!
2022-12-17 09:59:44 +09:00
3116831ae6 Change the network and fix a training-set bug
Change the network and fix a training-set bug.
2022-12-17 09:59:44 +09:00
bf77060456 Change the Critic NN to a Multi-NN
Change the Critic NN to a Multi-NN.
Wrong remaining-time fix

Fix the wrong remaining time, what a stupid mistake...
Also fix the doubled WANDB writer.
Deeper TargetNN

Deepen the target NN; it now receives the target state along with the hidden layer's output.
Change middle input

Route everything except the raycast input to the target network.
Change activation function to Tanh

Change the activation function to Tanh; it works a little better than before.
2022-12-17 09:59:44 +09:00
cbc385ca10 Change training dataset storage method
Save the training dataset by its target type.
During training, use a single-target training set to backpropagate through the NN.
This is at least 20 times faster than the last update!
2022-12-03 07:54:38 +09:00
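A minimal sketch (hypothetical names and sizes; assuming PyTorch) of the storage scheme this commit describes: finished rounds are appended to a per-target buffer, and each update uses only one target's buffer.

import torch

TARGET_NUM = 4   # e.g. Free, Go, Attack, Defence
STATE_SIZE = 93
# one training buffer per target type
obs_by_target = [torch.empty(0, STATE_SIZE) for _ in range(TARGET_NUM)]

def add_round(target_type: int, round_obs: torch.Tensor) -> None:
    # append one finished round's observations to its target's buffer
    obs_by_target[target_type] = torch.cat((obs_by_target[target_type], round_obs), dim=0)

add_round(1, torch.rand(50, STATE_SIZE))
# training then loops over targets, so every backward pass sees a single-target batch
print([buf.shape[0] for buf in obs_by_target])  # [0, 50, 0, 0]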
895cd5c118 Add EndReward Broadcast function
When a game ends, add remainTime/15 to every step's reward to increase that round's training weight.
Fix the bug where the target was still extracted from states with the one-hot decoder.
2022-12-03 03:58:19 +09:00
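A minimal sketch (hypothetical function name; assuming NumPy) of the broadcast rule described here; the diff below adds a fuller version as broadCastEndReward.

import numpy as np

def broadcast_end_reward(step_rewards, remain_time):
    # spread the end-of-round bonus (remain_time / 15) over every step of the round
    return np.asarray(step_rewards, dtype=np.float32) + remain_time / 15.0

# with 15 seconds left, every step of the winning round gains a bonus of 1.0
print(broadcast_end_reward([0.0, 0.5, 1.0], remain_time=15.0))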
3930bcd953 Add Multi-NN agent
Add multiple neural networks in the output layer;
use a different NN for each target type.
2022-12-01 19:55:51 +09:00
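A minimal sketch (hypothetical sizes and class name; assuming PyTorch) of per-target output heads: the target index stored in the state selects which head produces the output, much like the PPOAgent later in this diff.

import torch
import torch.nn as nn

class MultiHeadActor(nn.Module):
    def __init__(self, state_size=93, hidden_size=200, action_size=5, target_num=4):
        super().__init__()
        self.shared = nn.Sequential(nn.Linear(state_size, hidden_size), nn.Tanh())
        # one output head per target type
        self.heads = nn.ModuleList(nn.Linear(hidden_size, action_size) for _ in range(target_num))

    def forward(self, state):
        target = state[:, 0].to(torch.int64)  # target type is the first state feature
        hidden = self.shared(state)
        # pick each sample's head by its target index
        return torch.stack([self.heads[target[i]](hidden[i]) for i in range(state.size(0))])

state = torch.rand(8, 93)
state[:, 0] = torch.randint(0, 4, (8,)).float()
print(MultiHeadActor()(state).shape)  # torch.Size([8, 5])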
5631569b31 Side Channel added
Add a side channel to record the per-target win ratio.
Fix some bugs.
2022-11-30 06:45:07 +09:00
32d398dbef Change Learning timing
Change the learning timing to the end of each episode.
2022-11-16 19:40:57 +09:00
28 changed files with 3837 additions and 630 deletions

2
.gitignore vendored

@@ -76,6 +76,8 @@ crashlytics-build.properties
/Aimbot-PPO-Python/.vscode/
/Aimbot-PPO-Python/.mypy_cache/
/Aimbot-PPO-Python/__pycache__/
/Aimbot-PPO-Python/wandb/
/Aimbot-PPO-Python/runs/
/Aimbot-PPO-Python/Tensorflow/__pycache__/
/Aimbot-PPO-Python/Pytorch/__pycache__/
/Aimbot-PPO-Python/Pytorch/runs/

5
.vscode/settings.json vendored Normal file

@@ -0,0 +1,5 @@
{
"python.linting.enabled": false,
"python.analysis.typeCheckingMode": "off",
"commentTranslate.source": "intellsmi.deepl-translate-deepl"
}

3
Aimbot-PPO-Python/Pytorch/.idea/.gitignore generated vendored Normal file

@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

View File

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="mlagents39" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@@ -0,0 +1,10 @@
<component name="ProjectDictionaryState">
<dictionary name="UCUNI">
<words>
<w>aimbot</w>
<w>logprobs</w>
<w>logstd</w>
<w>unclipped</w>
</words>
</dictionary>
</component>

View File

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

View File

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="mlagents39" project-jdk-type="Python SDK" />
</project>

View File

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Pytorch.iml" filepath="$PROJECT_DIR$/.idea/Pytorch.iml" />
</modules>
</component>
</project>

6
Aimbot-PPO-Python/Pytorch/.idea/vcs.xml generated Normal file

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
</component>
</project>

View File

@@ -1,25 +1,34 @@
import gym
import numpy as np
import uuid
import airecorder
from numpy import ndarray
from mlagents_envs.base_env import ActionTuple
from mlagents_envs.environment import UnityEnvironment
from typing import Tuple, List
from mlagents_envs.side_channel.side_channel import (
SideChannel,
IncomingMessage,
OutgoingMessage,
)
from arguments import set_save_model
class Aimbot(gym.Env):
def __init__(
self,
envPath: str,
workerID: int = 1,
basePort: int = 100,
self,
env_path: str,
worker_id: int = 1,
base_port: int = 100,
side_channels: list = []
):
super(Aimbot, self).__init__()
self.env = UnityEnvironment(
file_name=envPath,
file_name=env_path,
seed=1,
side_channels=[],
worker_id=workerID,
base_port=basePort,
side_channels=side_channels,
worker_id=worker_id,
base_port=base_port,
)
self.env.reset()
# all behavior_specs
@@ -33,7 +42,7 @@ class Aimbot(gym.Env):
# environment action specs
self.unity_action_spec = self.unity_specs.action_spec
# environment sample observation
decisionSteps, _ = self.env.get_steps(self.unity_beha_name)
decision_steps, _ = self.env.get_steps(self.unity_beha_name)
# OBSERVATION SPECS
# environment state shape. like tuple:(93,)
@@ -56,31 +65,34 @@
# AGENT SPECS
# all agents ID
self.unity_agent_IDS = decisionSteps.agent_id
self.unity_agent_IDS = decision_steps.agent_id
# agents number
self.unity_agent_num = len(self.unity_agent_IDS)
def reset(self):
"""reset enviroment and get observations
# all zero action
self.all_zero_action = np.zeros((self.unity_agent_num, self.unity_action_size))
def reset(self) -> Tuple[np.ndarray, List, List]:
"""reset environment and get observations
Returns:
ndarray: nextState, reward, done, loadDir, saveNow
ndarray: next_state, reward, done, loadDir, saveNow
"""
# reset env
self.env.reset()
nextState, reward, done = self.getSteps()
return nextState, reward, done
next_state, reward, done = self.get_steps()
return next_state, reward, done
# TODO:
# delete all stack state DONE
# getstep State disassembly function DONE
# get-step State disassembly function DONE
# delete agent selection function DONE
# self.step action wrapper function DONE
def step(
self,
actions: ndarray,
):
"""change ations list to ActionTuple then send it to enviroment
self,
actions: ndarray,
) -> Tuple[np.ndarray, List, List]:
"""change actions list to ActionTuple then send it to environment
Args:
actions (ndarray): PPO chooseAction output action list.(agentNum,actionNum)
@@ -88,36 +100,36 @@ class Aimbot(gym.Env):
Returns:
ndarray: nextState, reward, done
"""
# take action to enviroment
# take action to environment
# return nextState, reward, done
# discrete action
if self.unity_dis_act_exist:
# create discrete action from actions list
discreteActions = actions[:, 0 : self.unity_discrete_type]
discrete_actions = actions[:, 0: self.unity_discrete_type]
else:
# create empty discrete action
discreteActions = np.asarray([[0]])
discrete_actions = np.asarray([[0]])
# continuous action
if self.unity_con_act_exist:
# create continuous actions from actions list
continuousActions = actions[:, self.unity_discrete_type :]
continuous_actions = actions[:, self.unity_discrete_type:]
else:
# create empty continuous action
continuousActions = np.asanyarray([[0.0]])
continuous_actions = np.asanyarray([[0.0]])
# Dummy continuous action
# continuousActions = np.asanyarray([[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]])
# create actionTuple
thisActionTuple = ActionTuple(continuous=continuousActions, discrete=discreteActions)
this_action_tuple = ActionTuple(continuous=continuous_actions, discrete=discrete_actions)
# take action to env
self.env.set_actions(behavior_name=self.unity_beha_name, action=thisActionTuple)
self.env.set_actions(behavior_name=self.unity_beha_name, action=this_action_tuple)
self.env.step()
# get nextState & reward & done after this action
nextStates, rewards, dones = self.getSteps()
return nextStates, rewards, dones
next_states, rewards, dones = self.get_steps()
return next_states, rewards, dones
def getSteps(self):
"""get enviroment now observations.
def get_steps(self) -> Tuple[np.ndarray, List, List]:
"""get environment now observations.
Include State, Reward, Done
Args:
@@ -126,28 +138,99 @@ class Aimbot(gym.Env):
ndarray: nextState, reward, done
"""
# get nextState & reward & done
decisionSteps, terminalSteps = self.env.get_steps(self.unity_beha_name)
nextStates = []
decision_steps, terminal_steps = self.env.get_steps(self.unity_beha_name)
next_states = []
dones = []
rewards = []
for thisAgentID in self.unity_agent_IDS:
for this_agent_ID in self.unity_agent_IDS:
# when an episode ends, an agent ID appears in both decisionSteps and terminalSteps.
# To avoid duplicated state and reward,
# use the agentExist flag to check whether the agent has already been handled.
agentExist = False
agent_exist = False
# game done
if thisAgentID in terminalSteps:
nextStates.append(terminalSteps[thisAgentID].obs[0])
if this_agent_ID in terminal_steps:
next_states.append(terminal_steps[this_agent_ID].obs[0])
dones.append(True)
rewards.append(terminalSteps[thisAgentID].reward)
agentExist = True
rewards.append(terminal_steps[this_agent_ID].reward)
agent_exist = True
# game not over yet and agent not in terminalSteps
if (thisAgentID in decisionSteps) and (not agentExist):
nextStates.append(decisionSteps[thisAgentID].obs[0])
if (this_agent_ID in decision_steps) and (not agent_exist):
next_states.append(decision_steps[this_agent_ID].obs[0])
dones.append(False)
rewards.append(decisionSteps[thisAgentID].reward)
rewards.append(decision_steps[this_agent_ID].reward)
return np.asarray(nextStates), rewards, dones
return np.asarray(next_states), rewards, dones
def close(self):
self.env.close()
class AimbotSideChannel(SideChannel):
def __init__(self, channel_id: uuid.UUID) -> None:
super().__init__(channel_id)
def on_message_received(self, msg: IncomingMessage) -> None:
"""
Note: We must implement this method of the SideChannel interface to
receive messages from Unity
Message will be sent like this:
"Warning|Message1|Message2|Message3" or
"Error|Message1|Message2|Message3"
"""
this_message_Original = msg.read_string()
this_message = this_message_Original.split("|")
print(this_message)
if this_message[0] == "Warning":
if this_message[1] == "Result":
airecorder.total_rounds[this_message[2]] += 1
if this_message[3] == "Win":
airecorder.win_rounds[this_message[2]] += 1
# print(TotalRounds)
# print(WinRounds)
if this_message[1] == "Command":
set_save_model(True)
print("Command: " + this_message_Original)
elif this_message[0] == "Error":
print(this_message_Original)
# # while Message type is Warning
# if(thisResult[0] == "Warning"):
# # while Message1 is result means one game is over
# if (thisResult[1] == "Result"):
# TotalRounds[thisResult[2]]+=1
# # while Message3 is Win means this agent win this game
# if(thisResult[3] == "Win"):
# WinRounds[thisResult[2]]+=1
# # while Message1 is GameState means this game is just start
# # and tell python which game mode is
# elif (thisResult[1] == "GameState"):
# SCrecieved = 1
# # while Message type is Error
# elif(thisResult[0] == "Error"):
# print(thisMessage)
# send functions
def send_string(self, data: str) -> None:
# send a string to C#
msg = OutgoingMessage()
msg.write_string(data)
super().queue_message_to_send(msg)
def send_bool(self, data: bool) -> None:
msg = OutgoingMessage()
msg.write_bool(data)
super().queue_message_to_send(msg)
def send_int(self, data: int) -> None:
msg = OutgoingMessage()
msg.write_int32(data)
super().queue_message_to_send(msg)
def send_float(self, data: float) -> None:
msg = OutgoingMessage()
msg.write_float32(data)
super().queue_message_to_send(msg)
def send_float_list(self, data: List[float]) -> None:
msg = OutgoingMessage()
msg.write_float32_list(data)
super().queue_message_to_send(msg)

View File

@@ -0,0 +1,769 @@
import argparse
import wandb
import time
import numpy as np
import random
import uuid
import torch
import torch.nn as nn
import torch.optim as optim
import atexit
from torchviz import make_dot, make_dot_from_trace
from AimbotEnv import Aimbot
from tqdm import tqdm
from enum import Enum
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from distutils.util import strtobool
from torch.utils.tensorboard import SummaryWriter
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.side_channel import (
SideChannel,
IncomingMessage,
OutgoingMessage,
)
from typing import List
bestReward = -1
DEFAULT_SEED = 9331
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv"
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
WAND_ENTITY = "koha9"
WORKER_ID = 2
BASE_PORT = 1111
# max round steps per agent is 2500/Decision_period, 25 seconds
# !!!check every parameters before run!!!
TOTAL_STEPS = 3150000
BATCH_SIZE = 1024
MAX_TRAINNING_DATASETS = 6000
DECISION_PERIOD = 1
LEARNING_RATE = 5e-4
GAMMA = 0.99
GAE_LAMBDA = 0.95
EPOCHS = 3
CLIP_COEF = 0.11
LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
TARGET_LEARNING_RATE = 1e-6
FREEZE_VIEW_NETWORK = False
ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = True
TRAIN = True
SAVE_MODEL = False
WANDB_TACK = False
LOAD_DIR = None
#LOAD_DIR = "../PPO-Model/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670986948-freeonly-20/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670986948_0.7949778.pt"
# public data
class Targets(Enum):
Free = 0
Go = 1
Attack = 2
Defence = 3
Num = 4
TARGET_STATE_SIZE = 6
INAREA_STATE_SIZE = 1
TIME_STATE_SIZE = 1
GUN_STATE_SIZE = 1
MY_STATE_SIZE = 4
TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
BASE_WINREWARD = 999
BASE_LOSEREWARD = -999
TARGETNUM= 4
ENV_TIMELIMIT = 30
RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
TotalRounds = {"Free":0,"Go":0,"Attack":0}
WinRounds = {"Free":0,"Go":0,"Attack":0}
# !!!SPECIAL PARAMETERS!!!
# change it while program is finished
using_targets_num = 3
def parse_args():
# fmt: off
# pytorch and environment parameters
parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
help="seed of the experiment")
parser.add_argument("--path", type=str, default=ENV_PATH,
help="environment path")
parser.add_argument("--workerID", type=int, default=WORKER_ID,
help="unity worker ID")
parser.add_argument("--baseport", type=int, default=BASE_PORT,
help="port to connect to Unity environment")
parser.add_argument("--lr", type=float, default=LEARNING_RATE,
help="the learning rate of optimizer")
parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="if toggled, cuda will be enabled by default")
parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
help="total timesteps of the experiments")
# model parameters
parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
help="Train Model or not")
parser.add_argument("--freeze-viewnet", type=lambda x: bool(strtobool(x)), default=FREEZE_VIEW_NETWORK, nargs="?", const=True,
help="freeze view network or not")
parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
help="training dataset size,start training while dataset collect enough data")
parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
help="mini batch size")
parser.add_argument("--epochs", type=int, default=EPOCHS,
help="the K epochs to update the policy")
parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
help="Toggle learning rate annealing for policy and value networks")
parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True,
help="track on the wandb")
parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=SAVE_MODEL, nargs="?", const=True,
help="save model or not")
parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY,
help="the entity (team) of wandb's project")
parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
help="load model directory")
parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
help="the number of steps to run in each environment per policy rollout")
parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
# GAE loss
parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="Use GAE for advantage computation")
parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
help="Toggles advantages normalization")
parser.add_argument("--gamma", type=float, default=GAMMA,
help="the discount factor gamma")
parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
help="the lambda for the general advantage estimation")
parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
help="the surrogate clipping coefficient")
parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
help="coefficient of the policy")
parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF,
help="coefficient of the entropy")
parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
help="coefficient of the value function")
parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True,
help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
parser.add_argument("--max-grad-norm", type=float, default=0.5,
help="the maximum norm for the gradient clipping")
parser.add_argument("--target-kl", type=float, default=None,
help="the target KL divergence threshold")
# fmt: on
args = parser.parse_args()
return args
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
torch.nn.init.orthogonal_(layer.weight, std)
torch.nn.init.constant_(layer.bias, bias_const)
return layer
class PPOAgent(nn.Module):
def __init__(self, env: Aimbot,targetNum:int):
super(PPOAgent, self).__init__()
self.targetNum = targetNum
self.stateSize = env.unity_observation_shape[0]
self.agentNum = env.unity_agent_num
self.targetSize = TARGET_STATE_SIZE
self.timeSize = TIME_STATE_SIZE
self.gunSize = GUN_STATE_SIZE
self.myStateSize = MY_STATE_SIZE
self.raySize = env.unity_observation_shape[0] - TOTAL_T_SIZE
self.nonRaySize = TOTAL_T_SIZE
self.head_input_size = env.unity_observation_shape[0] - self.targetSize-self.timeSize-self.gunSize# except target state input
self.discrete_size = env.unity_discrete_size
self.discrete_shape = list(env.unity_discrete_branches)
self.continuous_size = env.unity_continuous_size
self.viewNetwork = nn.Sequential(
layer_init(nn.Linear(self.raySize, 200)),
nn.Tanh()
)
self.targetNetworks = nn.ModuleList([nn.Sequential(
layer_init(nn.Linear(self.nonRaySize, 100)),
nn.Tanh()
)for i in range(targetNum)])
self.middleNetworks = nn.ModuleList([nn.Sequential(
layer_init(nn.Linear(300,200)),
nn.Tanh()
)for i in range(targetNum)])
self.actor_dis = nn.ModuleList([layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(targetNum)])
self.actor_mean = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(targetNum)])
# self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
# self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1,self.continuous_size))for i in range(targetNum)]) # nn.Parameter(torch.zeros(1, self.continuous_size))
self.critic = nn.ModuleList([layer_init(nn.Linear(200, 1), std=1)for i in range(targetNum)])
def get_value(self, state: torch.Tensor):
target = state[:,0].to(torch.int32) # int
thisStateNum = target.size()[0]
viewInput = state[:,-self.raySize:] # all ray input
targetInput = state[:,:self.nonRaySize]
viewLayer = self.viewNetwork(viewInput)
targetLayer = torch.stack([self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)])
middleInput = torch.cat([viewLayer,targetLayer],dim = 1)
middleLayer = torch.stack([self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)])
criticV = torch.stack([self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.critic
return criticV
def get_actions_value(self, state: torch.Tensor, actions=None):
target = state[:,0].to(torch.int32) # int
thisStateNum = target.size()[0]
viewInput = state[:,-self.raySize:] # all ray input
targetInput = state[:,:self.nonRaySize]
viewLayer = self.viewNetwork(viewInput)
targetLayer = torch.stack([self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)])
middleInput = torch.cat([viewLayer,targetLayer],dim = 1)
middleLayer = torch.stack([self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)])
# discrete
# iterate over the targets (i.e. over the agents) so each agent's output is computed by the network that matches its target
dis_logits = torch.stack([self.actor_dis[target[i]](middleLayer[i]) for i in range(thisStateNum)])
split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
# continuous
actions_mean = torch.stack([self.actor_mean[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_mean(hidden)
# action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_logstd(hidden)
# action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean)
action_logstd = torch.stack([torch.squeeze(self.actor_logstd[target[i]],0) for i in range(thisStateNum)])
# print(action_logstd)
action_std = torch.exp(action_logstd) # torch.exp(action_logstd)
con_probs = Normal(actions_mean, action_std)
# critic
criticV = torch.stack([self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.critic
if actions is None:
if args.train:
# select actions base on probability distribution model
disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
conAct = con_probs.sample()
actions = torch.cat([disAct.T, conAct], dim=1)
else:
# select actions base on best probability distribution
disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
conAct = actions_mean
actions = torch.cat([disAct.T, conAct], dim=1)
else:
disAct = actions[:, 0 : env.unity_discrete_type].T
conAct = actions[:, env.unity_discrete_type :]
dis_log_prob = torch.stack(
[ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
)
dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
return (
actions,
dis_log_prob.sum(0),
dis_entropy.sum(0),
con_probs.log_prob(conAct).sum(1),
con_probs.entropy().sum(1),
criticV,
)
def GAE(agent, args, rewards, dones, values, next_obs, next_done):
# GAE
with torch.no_grad():
next_value = agent.get_value(next_obs).reshape(1, -1)
data_size = rewards.size()[0]
if args.gae:
advantages = torch.zeros_like(rewards).to(device)
lastgaelam = 0
for t in reversed(range(data_size)):
if t == data_size - 1:
nextnonterminal = 1.0 - next_done
nextvalues = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
nextvalues = values[t + 1]
delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
advantages[t] = lastgaelam = (
delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
)
returns = advantages + values
else:
returns = torch.zeros_like(rewards).to(device)
for t in reversed(range(data_size)):
if t == data_size - 1:
nextnonterminal = 1.0 - next_done
next_return = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
next_return = returns[t + 1]
returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
advantages = returns - values
return advantages, returns
class AimbotSideChannel(SideChannel):
def __init__(self, channel_id: uuid.UUID) -> None:
super().__init__(channel_id)
def on_message_received(self, msg: IncomingMessage) -> None:
"""
Note: We must implement this method of the SideChannel interface to
receive messages from Unity
"""
thisMessage = msg.read_string()
# print(thisMessage)
thisResult = thisMessage.split("|")
if(thisResult[0] == "result"):
TotalRounds[thisResult[1]]+=1
if(thisResult[2] == "Win"):
WinRounds[thisResult[1]]+=1
#print(TotalRounds)
#print(WinRounds)
elif(thisResult[0] == "Error"):
print(thisMessage)
# send functions
def send_string(self, data: str) -> None:
# send a string to C#
msg = OutgoingMessage()
msg.write_string(data)
super().queue_message_to_send(msg)
def send_bool(self, data: bool) -> None:
msg = OutgoingMessage()
msg.write_bool(data)
super().queue_message_to_send(msg)
def send_int(self, data: int) -> None:
msg = OutgoingMessage()
msg.write_int32(data)
super().queue_message_to_send(msg)
def send_float(self, data: float) -> None:
msg = OutgoingMessage()
msg.write_float32(data)
super().queue_message_to_send(msg)
def send_float_list(self, data: List[float]) -> None:
msg = OutgoingMessage()
msg.write_float32_list(data)
super().queue_message_to_send(msg)
def broadCastEndReward(rewardBF:list,remainTime:float):
thisRewardBF = rewardBF
if (rewardBF[-1]<=-500):
# print("Lose DO NOT BROAD CAST",rewardBF[-1])
thisRewardBF[-1] = rewardBF[-1]-BASE_LOSEREWARD
thisRewardBF = thisRewardBF
elif (rewardBF[-1]>=500):
# print("Win! Broadcast reward!",rewardBF[-1])
thisRewardBF[-1] = rewardBF[-1]-BASE_WINREWARD
thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*args.result_broadcast_ratio)).tolist()
else:
print("!!!!!DIDNT GET RESULT REWARD!!!!!!",rewardBF[-1])
return torch.Tensor(thisRewardBF).to(device)
if __name__ == "__main__":
args = parse_args()
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
# Initialize environment, agent, and optimizer
aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID);
env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimBotsideChannel])
if args.load_dir is None:
agent = PPOAgent(env,TARGETNUM).to(device)
else:
agent = torch.load(args.load_dir)
# freeze
if args.freeze_viewnet:
# freeze the view network
for p in agent.viewNetwork.parameters():
p.requires_grad = False
print("VIEW NETWORK FREEZED")
print("Load Agent", args.load_dir)
print(agent.eval())
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
# Tensorboard and WandB Recorder
game_name = "Aimbot_Target_Hybrid_PMNN_V2"
game_type = "OffPolicy_EndBC"
run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
if args.wandb_track:
wandb.init(
project=game_name,
entity=args.wandb_entity,
sync_tensorboard=True,
config=vars(args),
name=run_name,
monitor_gym=True,
save_code=True,
)
writer = SummaryWriter(f"runs/{run_name}")
writer.add_text(
"hyperparameters",
"|param|value|\n|-|-|\n%s"
% ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
)
@atexit.register
def save_model():
# save model while exit
saveDir = "../PPO-Model/"+ run_name + "_last.pt"
torch.save(agent, saveDir)
print("save model to " + saveDir)
# Trajectory Buffer
ob_bf = [[] for i in range(env.unity_agent_num)]
act_bf = [[] for i in range(env.unity_agent_num)]
dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
rewards_bf = [[] for i in range(env.unity_agent_num)]
dones_bf = [[] for i in range(env.unity_agent_num)]
values_bf = [[] for i in range(env.unity_agent_num)]
# start the game
total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
target_steps = [0 for i in range(TARGETNUM)]
start_time = time.time()
state, _, done = env.reset()
# state = torch.Tensor(next_obs).to(device)
# next_done = torch.zeros(env.unity_agent_num).to(device)
# initialize empty training datasets
obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_observation_size)
actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_action_size)
dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
values = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
vis_graph = make_dot(agent.get_actions_value(
torch.Tensor(state).to(device)
), params=dict(agent.named_parameters()))
vis_graph.view()  # saves a "Digraph.gv.pdf" file in the current directory and opens it in the default browser
with torch.onnx.set_training(agent, False):
trace, _ = torch.jit.get_trace_graph(agent, args=(torch.Tensor(state).to(device),))
make_dot_from_trace(trace)
raise
for total_steps in range(total_update_step):
# decay the learning rate; when step == total_update_step the lr will be 0
if args.annealLR:
finalRatio = TARGET_LEARNING_RATE/args.lr
frac = 1.0 - ((total_steps + 1.0) / total_update_step)
lrnow = frac * args.lr
optimizer.param_groups[0]["lr"] = lrnow
else:
lrnow = args.lr
print("new episode",total_steps,"learning rate = ",lrnow)
# MAIN LOOP: run agent in environment
step = 0
training = False
trainQueue = []
last_reward = [0.for i in range(env.unity_agent_num)]
while True:
if step % args.decision_period == 0:
step += 1
# Choose action by agent
with torch.no_grad():
# predict actions
action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
torch.Tensor(state).to(device)
)
value = value.flatten()
# variable from GPU to CPU
action_cpu = action.cpu().numpy()
dis_logprob_cpu = dis_logprob.cpu().numpy()
con_logprob_cpu = con_logprob.cpu().numpy()
value_cpu = value.cpu().numpy()
# Environment step
next_state, reward, next_done = env.step(action_cpu)
# save memories
for i in range(env.unity_agent_num):
# save memories to buffers
ob_bf[i].append(state[i])
act_bf[i].append(action_cpu[i])
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
con_logprobs_bf[i].append(con_logprob_cpu[i])
rewards_bf[i].append(reward[i]+last_reward[i])
dones_bf[i].append(done[i])
values_bf[i].append(value_cpu[i])
remainTime = state[i,TARGET_STATE_SIZE]
if next_done[i] == True:
# finished a round, send finished memories to training datasets
# compute advantage and discounted reward
#print(i,"over")
roundTargetType = int(state[i,0])
thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
adv, rt = GAE(
agent,
args,
thisRewardsTensor,
torch.Tensor(dones_bf[i]).to(device),
torch.tensor(values_bf[i]).to(device),
torch.tensor(next_state[i]).to(device).unsqueeze(0),
torch.Tensor([next_done[i]]).to(device),
)
# send memories to training datasets
obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
dis_logprobs[roundTargetType] = torch.cat(
(dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
)
con_logprobs[roundTargetType] = torch.cat(
(con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
)
rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
# clear buffers
ob_bf[i] = []
act_bf[i] = []
dis_logprobs_bf[i] = []
con_logprobs_bf[i] = []
rewards_bf[i] = []
dones_bf[i] = []
values_bf[i] = []
print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
for i in range(TARGETNUM):
if obs[i].size()[0] >= args.datasetSize:
# start train NN
trainQueue.append(i)
if(len(trainQueue)>0):
break
state, done = next_state, next_done
else:
step += 1
# skip this step and reuse the last predicted action
next_state, reward, next_done = env.step(action_cpu)
# save memories
for i in range(env.unity_agent_num):
if next_done[i] == True:
#print(i,"over???")
# save memories to buffers
ob_bf[i].append(state[i])
act_bf[i].append(action_cpu[i])
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
con_logprobs_bf[i].append(con_logprob_cpu[i])
rewards_bf[i].append(reward[i])
dones_bf[i].append(done[i])
values_bf[i].append(value_cpu[i])
remainTime = state[i,TARGET_STATE_SIZE]
# finished a round, send finished memories to training datasets
# compute advantage and discounted reward
roundTargetType = int(state[i,0])
thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
adv, rt = GAE(
agent,
args,
thisRewardsTensor,
torch.Tensor(dones_bf[i]).to(device),
torch.tensor(values_bf[i]).to(device),
torch.Tensor(next_state[i]).to(device).unsqueeze(dim = 0),
torch.Tensor([next_done[i]]).to(device),
)
# send memories to training datasets
obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
dis_logprobs[roundTargetType] = torch.cat(
(dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
)
con_logprobs[roundTargetType] = torch.cat(
(con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
)
rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
# clear buffers
ob_bf[i] = []
act_bf[i] = []
dis_logprobs_bf[i] = []
con_logprobs_bf[i] = []
rewards_bf[i] = []
dones_bf[i] = []
values_bf[i] = []
print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
state = next_state
last_reward = reward
i += 1
if args.train:
meanRewardList = [] # for WANDB
# loop over the training queue
for thisT in trainQueue:
target_steps[thisT]+=1
# flatten the batch
b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
b_con_logprobs = con_logprobs[thisT].reshape(-1)
b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
b_advantages = advantages[thisT].reshape(-1)
b_returns = returns[thisT].reshape(-1)
b_values = values[thisT].reshape(-1)
b_size = b_obs.size()[0]
# Optimizing the policy and value network
b_inds = np.arange(b_size)
# clipfracs = []
for epoch in range(args.epochs):
print(epoch,end="")
# shuffle all datasets
np.random.shuffle(b_inds)
for start in range(0, b_size, args.minibatchSize):
print(".",end="")
end = start + args.minibatchSize
mb_inds = b_inds[start:end]
if(np.size(mb_inds)<=1):
break
mb_advantages = b_advantages[mb_inds]
# normalize advantages
if args.norm_adv:
mb_advantages = (mb_advantages - mb_advantages.mean()) / (
mb_advantages.std() + 1e-8
)
(
_,
new_dis_logprob,
dis_entropy,
new_con_logprob,
con_entropy,
newvalue,
) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
# discrete ratio
dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
dis_ratio = dis_logratio.exp()
# continuous ratio
con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
con_ratio = con_logratio.exp()
"""
# early stop
with torch.no_grad():
# calculate approx_kl http://joschu.net/blog/kl-approx.html
old_approx_kl = (-logratio).mean()
approx_kl = ((ratio - 1) - logratio).mean()
clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
"""
# discrete Policy loss
dis_pg_loss_orig = -mb_advantages * dis_ratio
dis_pg_loss_clip = -mb_advantages * torch.clamp(
dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
)
dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
# continuous Policy loss
con_pg_loss_orig = -mb_advantages * con_ratio
con_pg_loss_clip = -mb_advantages * torch.clamp(
con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
)
con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
# Value loss
newvalue = newvalue.view(-1)
if args.clip_vloss:
v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
v_clipped = b_values[mb_inds] + torch.clamp(
newvalue - b_values[mb_inds],
-args.clip_coef,
args.clip_coef,
)
v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
v_loss = 0.5 * v_loss_max.mean()
else:
v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
# total loss
entropy_loss = dis_entropy.mean() + con_entropy.mean()
loss = (
dis_pg_loss * POLICY_COEF[thisT]
+ con_pg_loss * POLICY_COEF[thisT]
+ entropy_loss * ENTROPY_COEF[thisT]
+ v_loss * CRITIC_COEF[thisT]
)*LOSS_COEF[thisT]
if(torch.isnan(loss).any()):
print("LOSS Include NAN!!!")
if(torch.isnan(dis_pg_loss.any())):
print("dis_pg_loss include nan")
if(torch.isnan(con_pg_loss.any())):
print("con_pg_loss include nan")
if(torch.isnan(entropy_loss.any())):
print("entropy_loss include nan")
if(torch.isnan(v_loss.any())):
print("v_loss include nan")
raise
optimizer.zero_grad()
loss.backward()
# Clips gradient norm of an iterable of parameters.
nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
optimizer.step()
"""
if args.target_kl is not None:
if approx_kl > args.target_kl:
break
"""
# record mean reward before clear history
print("done")
targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
meanRewardList.append(targetRewardMean)
targetName = Targets(thisT).name
# clear this target's training set buffer
obs[thisT] = torch.tensor([]).to(device)
actions[thisT] = torch.tensor([]).to(device)
dis_logprobs[thisT] = torch.tensor([]).to(device)
con_logprobs[thisT] = torch.tensor([]).to(device)
rewards[thisT] = torch.tensor([]).to(device)
values[thisT] = torch.tensor([]).to(device)
advantages[thisT] = torch.tensor([]).to(device)
returns[thisT] = torch.tensor([]).to(device)
# record rewards for plotting purposes
writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT])
print(f"episode over Target{targetName} mean reward:", targetRewardMean)
TotalRewardMean = np.mean(meanRewardList)
writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps)
# New Record!
if TotalRewardMean > bestReward and args.save_model:
bestReward = targetRewardMean
saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) + ".pt"
torch.save(agent, saveDir)
saveDir = "../PPO-Model/"+ run_name + "_last.pt"
torch.save(agent, saveDir)
env.close()
writer.close()

View File

@@ -3,37 +3,50 @@ import wandb
import time
import numpy as np
import random
import uuid
import torch
import torch.nn as nn
import torch.optim as optim
from AimbotEnv import Aimbot
from tqdm import tqdm
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from distutils.util import strtobool
from torch.utils.tensorboard import SummaryWriter
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.side_channel import (
SideChannel,
IncomingMessage,
OutgoingMessage,
)
from typing import List
bestReward = 0
DEFAULT_SEED = 9331
ENV_PATH = "../Build/Build-ParallelEnv-BigArea-6Enemy/Aimbot-ParallelEnv"
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-ExtremeReward/Aimbot-ParallelEnv"
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
WAND_ENTITY = "koha9"
WORKER_ID = 1
BASE_PORT = 1000
# max round steps per agent is 2500/Decision_period, 25 seconds
# !!!check every parameters before run!!!
TOTAL_STEPS = 2000000
STEP_NUM = 314
DECISION_PERIOD = 2
LEARNING_RATE = 7e-4
TOTAL_STEPS = 6000000
BATCH_SIZE = 512
MAX_TRAINNING_DATASETS = 8000
DECISION_PERIOD = 1
LEARNING_RATE = 1e-3
GAMMA = 0.99
GAE_LAMBDA = 0.95
MINIBATCH_NUM = 4
EPOCHS = 4
CLIP_COEF = 0.1
POLICY_COEF = 1.0
ENTROPY_COEF = 0.01
CRITIC_COEF = 0.5
TARGET_LEARNING_RATE = 5e-5
ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
@@ -41,8 +54,12 @@ NORM_ADV = True
TRAIN = True
WANDB_TACK = False
LOAD_DIR = None
# LOAD_DIR = "../PPO-Model/SmallArea-256-128-hybrid-2nd-trainning.pt"
#LOAD_DIR = None
LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
# public data
TotalRounds = {"Go":0,"Attack":0,"Free":0}
WinRounds = {"Go":0,"Attack":0,"Free":0}
def parse_args():
@@ -67,10 +84,10 @@ def parse_args():
# model parameters
parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
help="Train Model or not")
parser.add_argument("--stepNum", type=int, default=STEP_NUM,
help="the number of steps to run in each environment per policy rollout")
parser.add_argument("--minibatchesNum", type=int, default=MINIBATCH_NUM,
help="the number of mini-batches")
parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
help="training dataset size,start training while dataset collect enough data")
parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
help="mini batch size")
parser.add_argument("--epochs", type=int, default=EPOCHS,
help="the K epochs to update the policy")
parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
@@ -126,9 +143,11 @@ class PPOAgent(nn.Module):
self.continuous_size = env.unity_continuous_size
self.network = nn.Sequential(
layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 384)),
layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 700)),
nn.ReLU(),
layer_init(nn.Linear(384, 256)),
layer_init(nn.Linear(700, 500)),
nn.ReLU(),
layer_init(nn.Linear(500, 256)),
nn.ReLU(),
)
self.actor_dis = layer_init(nn.Linear(256, self.discrete_size), std=0.01)
@@ -179,6 +198,86 @@ class PPOAgent(nn.Module):
)
def GAE(agent, args, rewards, dones, values, next_obs, next_done):
# GAE
with torch.no_grad():
next_value = agent.get_value(next_obs).reshape(1, -1)
data_size = rewards.size()[0]
if args.gae:
advantages = torch.zeros_like(rewards).to(device)
lastgaelam = 0
for t in reversed(range(data_size)):
if t == data_size - 1:
nextnonterminal = 1.0 - next_done
nextvalues = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
nextvalues = values[t + 1]
delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
advantages[t] = lastgaelam = (
delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
)
returns = advantages + values
else:
returns = torch.zeros_like(rewards).to(device)
for t in reversed(range(data_size)):
if t == data_size - 1:
nextnonterminal = 1.0 - next_done
next_return = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
next_return = returns[t + 1]
returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
advantages = returns - values
return advantages, returns
class AimbotSideChannel(SideChannel):
def __init__(self, channel_id: uuid.UUID) -> None:
super().__init__(channel_id)
def on_message_received(self, msg: IncomingMessage) -> None:
"""
Note: We must implement this method of the SideChannel interface to
receive messages from Unity
"""
thisMessage = msg.read_string()
#print(thisMessage)
thisResult = thisMessage.split("|")
if(thisResult[0] == "result"):
TotalRounds[thisResult[1]]+=1
if(thisResult[2] == "Win"):
WinRounds[thisResult[1]]+=1
#print(TotalRounds)
#print(WinRounds)
elif(thisResult[0] == "Error"):
print(thisMessage)
# send functions
def send_string(self, data: str) -> None:
"""Send a string to C#"""
msg = OutgoingMessage()
msg.write_string(data)
super().queue_message_to_send(msg)
def send_bool(self, data: bool) -> None:
msg = OutgoingMessage()
msg.write_bool(data)
super().queue_message_to_send(msg)
def send_int(self, data: int) -> None:
msg = OutgoingMessage()
msg.write_int32(data)
super().queue_message_to_send(msg)
def send_float(self, data: float) -> None:
msg = OutgoingMessage()
msg.write_float32(data)
super().queue_message_to_send(msg)
def send_float_list(self, data: List[float]) -> None:
msg = OutgoingMessage()
msg.write_float32_list(data)
super().queue_message_to_send(msg)
if __name__ == "__main__":
args = parse_args()
random.seed(args.seed)
@@ -188,7 +287,8 @@ if __name__ == "__main__":
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
# Initialize environment, agent, and optimizer
env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport)
aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID);
env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimBotsideChannel])
if args.load_dir is None:
agent = PPOAgent(env).to(device)
else:
@@ -199,11 +299,12 @@ if __name__ == "__main__":
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
# Tensorboard and WandB Recorder
game_name = "Aimbot"
run_name = f"{game_name}_{args.seed}_{int(time.time())}"
game_name = "Aimbot_Target"
game_type = "OffPolicy"
run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
if args.wandb_track:
wandb.init(
project=run_name,
project=game_name,
entity=args.wandb_entity,
sync_tensorboard=True,
config=vars(args),
@@ -219,94 +320,168 @@ if __name__ == "__main__":
% ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
)
# Memory Record
obs = torch.zeros((args.stepNum, env.unity_agent_num) + env.unity_observation_shape).to(device)
actions = torch.zeros((args.stepNum, env.unity_agent_num) + (env.unity_action_size,)).to(device)
dis_logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
con_logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
rewards = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
dones = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
values = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
# Trajectory Buffer
ob_bf = [[] for i in range(env.unity_agent_num)]
act_bf = [[] for i in range(env.unity_agent_num)]
dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
rewards_bf = [[] for i in range(env.unity_agent_num)]
dones_bf = [[] for i in range(env.unity_agent_num)]
values_bf = [[] for i in range(env.unity_agent_num)]
# TRY NOT TO MODIFY: start the game
args.batch_size = int(env.unity_agent_num * args.stepNum)
args.minibatch_size = int(args.batch_size // args.minibatchesNum)
total_update_step = args.total_timesteps // args.batch_size
total_update_step = args.total_timesteps // args.datasetSize
global_step = 0
start_time = time.time()
next_obs, _, _ = env.reset()
next_obs = torch.Tensor(next_obs).to(device)
next_done = torch.zeros(env.unity_agent_num).to(device)
state, _, done = env.reset()
# state = torch.Tensor(next_obs).to(device)
# next_done = torch.zeros(env.unity_agent_num).to(device)
for total_steps in range(total_update_step):
# decay the learning rate; when step == total_update_step the lr will be 0
print("new episode")
if args.annealLR:
frac = 1.0 - (total_steps - 1.0) / total_update_step
finalRatio = TARGET_LEARNING_RATE/args.lr
frac = 1.0 - finalRatio*((total_steps - 1.0) / total_update_step)
lrnow = frac * args.lr
optimizer.param_groups[0]["lr"] = lrnow
# initialize empty training datasets
obs = torch.tensor([]).to(device) # (n,env.unity_observation_size)
actions = torch.tensor([]).to(device) # (n,env.unity_action_size)
dis_logprobs = torch.tensor([]).to(device) # (n,1)
con_logprobs = torch.tensor([]).to(device) # (n,1)
rewards = torch.tensor([]).to(device) # (n,1)
values = torch.tensor([]).to(device) # (n,1)
advantages = torch.tensor([]).to(device) # (n,1)
returns = torch.tensor([]).to(device) # (n,1)
# MAIN LOOP: run agent in environment
for i in range(args.stepNum * args.decision_period):
i = 0
training = False
while True:
if i % args.decision_period == 0:
step = round(i / args.decision_period)
# Choose action by agent
global_step += 1 * env.unity_agent_num
obs[step] = next_obs
dones[step] = next_done
with torch.no_grad():
# predict actions
action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
next_obs
torch.Tensor(state).to(device)
)
value = value.flatten()
next_obs, reward, done = env.step(action.cpu().numpy())
# variable from GPU to CPU
action_cpu = action.cpu().numpy()
dis_logprob_cpu = dis_logprob.cpu().numpy()
con_logprob_cpu = con_logprob.cpu().numpy()
value_cpu = value.cpu().numpy()
# Environment step
next_state, reward, next_done = env.step(action_cpu)
# save memories
actions[step] = action
dis_logprobs[step] = dis_logprob
con_logprobs[step] = con_logprob
values[step] = value
rewards[step] = torch.tensor(reward).to(device).view(-1)
next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(
device
)
for i in range(env.unity_agent_num):
# save memories to buffers
ob_bf[i].append(state[i])
act_bf[i].append(action_cpu[i])
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
con_logprobs_bf[i].append(con_logprob_cpu[i])
rewards_bf[i].append(reward[i])
dones_bf[i].append(done[i])
values_bf[i].append(value_cpu[i])
if next_done[i] == True:
# finished a round, send finished memories to training datasets
# compute advantage and discounted reward
#print(i,"over")
adv, rt = GAE(
agent,
args,
torch.tensor(rewards_bf[i]).to(device),
torch.Tensor(dones_bf[i]).to(device),
torch.tensor(values_bf[i]).to(device),
torch.tensor(next_state[i]).to(device),
torch.Tensor([next_done[i]]).to(device),
)
# send memories to training datasets
obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
dis_logprobs = torch.cat(
(dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
)
con_logprobs = torch.cat(
(con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
)
rewards = torch.cat((rewards, torch.tensor(rewards_bf[i]).to(device)), 0)
values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
advantages = torch.cat((advantages, adv), 0)
returns = torch.cat((returns, rt), 0)
# clear buffers
ob_bf[i] = []
act_bf[i] = []
dis_logprobs_bf[i] = []
con_logprobs_bf[i] = []
rewards_bf[i] = []
dones_bf[i] = []
values_bf[i] = []
print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
if obs.size()[0] >= args.datasetSize:
# start train NN
break
state, done = next_state, next_done
else:
# skip this step and reuse the last predicted action
next_obs, reward, done = env.step(action.cpu().numpy())
next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(
device
)
next_obs, reward, next_done = env.step(action_cpu)
# save memories
for i in range(env.unity_agent_num):
if next_done[i] == True:
#print(i,"over???")
# save last memories to buffers
ob_bf[i].append(state[i])
act_bf[i].append(action_cpu[i])
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
con_logprobs_bf[i].append(con_logprob_cpu[i])
rewards_bf[i].append(reward[i])
dones_bf[i].append(done[i])
values_bf[i].append(value_cpu[i])
# finished a round, send finished memories to training datasets
# compute advantage and discounted reward
adv, rt = GAE(
agent,
args,
torch.tensor(rewards_bf[i]).to(device),
torch.Tensor(dones_bf[i]).to(device),
torch.tensor(values_bf[i]).to(device),
torch.tensor(next_state[i]).to(device),
torch.Tensor([next_done[i]]).to(device),
)
# send memories to training datasets
obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
dis_logprobs = torch.cat(
(dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
)
con_logprobs = torch.cat(
(con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
)
rewards = torch.cat((rewards, torch.tensor(rewards_bf[i]).to(device)), 0)
values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
advantages = torch.cat((advantages, adv), 0)
returns = torch.cat((returns, rt), 0)
# GAE
with torch.no_grad():
next_value = agent.get_value(next_obs).reshape(1, -1)
if args.gae:
advantages = torch.zeros_like(rewards).to(device)
lastgaelam = 0
for t in reversed(range(args.stepNum)):
if t == args.stepNum - 1:
nextnonterminal = 1.0 - next_done
nextvalues = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
nextvalues = values[t + 1]
delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
advantages[t] = lastgaelam = (
delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
)
returns = advantages + values
else:
returns = torch.zeros_like(rewards).to(device)
for t in reversed(range(args.stepNum)):
if t == args.stepNum - 1:
nextnonterminal = 1.0 - next_done
next_return = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
next_return = returns[t + 1]
returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
advantages = returns - values
# clear buffers
ob_bf[i] = []
act_bf[i] = []
dis_logprobs_bf[i] = []
con_logprobs_bf[i] = []
rewards_bf[i] = []
dones_bf[i] = []
values_bf[i] = []
print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
state, done = next_state, next_done
i += 1
if args.train:
# flatten the batch
@@ -317,15 +492,15 @@ if __name__ == "__main__":
b_advantages = advantages.reshape(-1)
b_returns = returns.reshape(-1)
b_values = values.reshape(-1)
b_size = b_obs.size()[0]
# Optimizing the policy and value network
b_inds = np.arange(args.batch_size)
b_inds = np.arange(b_size)
# clipfracs = []
for epoch in range(args.epochs):
# shuffle all datasets
np.random.shuffle(b_inds)
for start in range(0, args.batch_size, args.minibatch_size):
end = start + args.minibatch_size
for start in range(0, b_size, args.minibatchSize):
end = start + args.minibatchSize
mb_inds = b_inds[start:end]
mb_advantages = b_advantages[mb_inds]
@@ -424,9 +599,12 @@ if __name__ == "__main__":
"charts/SPS", int(global_step / (time.time() - start_time)), global_step
)
writer.add_scalar("charts/Reward", rewardsMean, global_step)
writer.add_scalar("charts/GoWinRatio", WinRounds["Go"]/TotalRounds["Go"], global_step)
writer.add_scalar("charts/AttackWinRatio", WinRounds["Attack"]/TotalRounds["Attack"], global_step)
writer.add_scalar("charts/FreeWinRatio", WinRounds["Free"]/TotalRounds["Free"], global_step)
if rewardsMean > bestReward:
bestReward = rewardsMean
saveDir = "../PPO-Model/bigArea-384-128-hybrid-" + str(rewardsMean) + ".pt"
saveDir = "../PPO-Model/Target-700-500-256-hybrid-" + str(rewardsMean) + ".pt"
torch.save(agent, saveDir)
env.close()

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,255 @@
import time
import numpy as np
import random
import uuid
import torch
import atexit
import os
from aimbotEnv import Aimbot
from aimbotEnv import AimbotSideChannel
from ppoagent import PPOAgent
from airecorder import WandbRecorder
from aimemory import PPOMem
from aimemory import Targets
from arguments import parse_args
from arguments import set_save_model, is_save_model
import torch.optim as optim
# side channel uuid
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
# tensorboard names
GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel"
GAME_TYPE = "GotoOnly-3.6-Level0123-newModel-Onehot"
if __name__ == "__main__":
args = parse_args()
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
best_reward = -1
# initialize environment, agent and optimizer
aimbot_side_channel = AimbotSideChannel(SIDE_CHANNEL_UUID)
env = Aimbot(
env_path=args.path,
worker_id=args.workerID,
base_port=args.baseport,
side_channels=[aimbot_side_channel])
if args.load_dir is None:
agent = PPOAgent(
env=env,
this_args=args,
device=device,
).to(device)
else:
agent = torch.load(args.load_dir)
# freeze
if args.freeze_viewnet:
# freeze the view network
print("FREEZE VIEW NETWORK is not compatible with Full MNN!")
raise NotImplementedError
print("Load Agent", args.load_dir)
print(agent.eval())
# optimizer
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
# Tensorboard and WandB Recorder
run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
# start the game
total_update_step = args.target_num * args.total_timesteps // args.datasetSize
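# e.g. with the defaults shown in arguments.py (4 targets, 3150000 total timesteps, datasetSize 6000) this is 2100 updates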
target_steps = [0 for i in range(args.target_num)]
start_time = time.time()
state, _, done = env.reset()
# initialize AI memories
ppo_memories = PPOMem(
args=args,
unity_agent_num=env.unity_agent_num,
device=device,
)
# MAIN LOOP: run agent in environment
for total_steps in range(total_update_step):
# anneal the learning rate: decay linearly from args.lr towards args.target_lr over all updates
if args.annealLR:
final_lr_ratio = args.target_lr / args.lr
frac = 1.0 - ((total_steps + 1.0) / total_update_step)
lr_now = (final_lr_ratio + (1.0 - final_lr_ratio) * frac) * args.lr
optimizer.param_groups[0]["lr"] = lr_now
else:
lr_now = args.lr
# at episode start, print the current learning rate
print("new episode", total_steps, "learning rate = ", lr_now)
step = 0
training = False
train_queue = []
last_reward = [0. for i in range(env.unity_agent_num)]
# MAIN LOOP: run agent in environment
while True:
# if the target type (state[0][0]) is Stay (4), use the all-zero action
if state[0][0] == 4:
next_state, reward, next_done = env.step(env.all_zero_action)
state, done = next_state, next_done
continue
# on a decision step, and the target type (state[0][0]) is not Stay (4), let the agent choose an action
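# e.g. with --decision-period 4 the policy is queried on steps 0, 4, 8, ...;
# the else branch below repeats the last predicted action on the skipped steps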
if step % args.decision_period == 0:
step += 1
# Choose action by agent
with torch.no_grad():
# predict actions
action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
torch.tensor(state,dtype=torch.float32).to(device)
)
value = value.flatten()
# variable from GPU to CPU
action_cpu = action.cpu().numpy()
dis_logprob_cpu = dis_logprob.cpu().numpy()
con_logprob_cpu = con_logprob.cpu().numpy()
value_cpu = value.cpu().numpy()
# Environment step
next_state, reward, next_done = env.step(action_cpu)
# save memories
if args.train:
ppo_memories.save_memories(
now_step=step,
agent=agent,
state=state,
action_cpu=action_cpu,
dis_logprob_cpu=dis_logprob_cpu,
con_logprob_cpu=con_logprob_cpu,
reward=reward,
done=done,
value_cpu=value_cpu,
last_reward=last_reward,
next_done=next_done,
next_state=next_state,
)
# check if any training dataset is full and ready to train
for i in range(args.target_num):
if ppo_memories.obs[i].size()[0] >= args.datasetSize:
# start train NN
train_queue.append(i)
if len(train_queue) > 0:
# break while loop and start train
break
# update state
state, done = next_state, next_done
else:
step += 1
# skipped step: repeat the last predicted action
next_state, reward, next_done = env.step(action_cpu)
# save memories
if args.train:
ppo_memories.save_memories(
now_step=step,
agent=agent,
state=state,
action_cpu=action_cpu,
dis_logprob_cpu=dis_logprob_cpu,
con_logprob_cpu=con_logprob_cpu,
reward=reward,
done=done,
value_cpu=value_cpu,
last_reward=last_reward,
next_done=next_done,
next_state=next_state,
)
# update state
state = next_state
last_reward = reward
if args.train:
# train mode on
mean_reward_list = [] # for WANDB
# loop all training queue
for this_train_ind in train_queue:
# start time
start_time = time.time()
target_steps[this_train_ind] += 1
# train agent
(
v_loss,
dis_pg_loss,
con_pg_loss,
loss,
entropy_loss
) = agent.train_net(
this_train_ind=this_train_ind,
ppo_memories=ppo_memories,
optimizer=optimizer
)
# record the mean reward before clearing history
print("done")
target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
mean_reward_list.append(target_reward_mean)
targetName = Targets(this_train_ind).name
# clear this target training set buffer
ppo_memories.clear_training_datasets(this_train_ind)
# record rewards for plotting purposes
wdb_recorder.add_target_scalar(
targetName,
this_train_ind,
v_loss,
dis_pg_loss,
con_pg_loss,
loss,
entropy_loss,
target_reward_mean,
target_steps,
)
print(f"episode over Target{targetName} mean reward:", target_reward_mean)
TotalRewardMean = np.mean(mean_reward_list)
wdb_recorder.add_global_scalar(
TotalRewardMean,
optimizer.param_groups[0]["lr"],
total_steps,
)
# print cost time as seconds
print("cost time:", time.time() - start_time)
# New Record! or save model
if ((is_save_model() or TotalRewardMean > best_reward) and args.save_model):
# make sure saveDir exists
saveDir = "../PPO-Model/" + run_name + "/"
if not os.path.isdir(saveDir):
os.mkdir(saveDir)
best_reward = TotalRewardMean
torch.save(agent, saveDir + str(TotalRewardMean) + ".pt")
print("Model Saved!")
set_save_model(False)
else:
# train mode off
mean_reward_list = [] # for WANDB
# while not in training mode, clear the buffer
for this_train_ind in train_queue:
target_steps[this_train_ind] += 1
targetName = Targets(this_train_ind).name
target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
mean_reward_list.append(target_reward_mean)
print(target_steps[this_train_ind])
# clear this target training set buffer
ppo_memories.clear_training_datasets(this_train_ind)
# record rewards for plotting purposes
wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", target_reward_mean,
target_steps[this_train_ind])
wdb_recorder.add_win_ratio(targetName, target_steps[this_train_ind])
print(f"episode over Target{targetName} mean reward:", target_reward_mean)
TotalRewardMean = np.mean(mean_reward_list)
wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
saveDir = "../PPO-Model/" + run_name + "/"
if not os.path.isdir(saveDir):
os.mkdir(saveDir)
best_reward = target_reward_mean
torch.save(agent, saveDir + "_last.pt")
env.close()
wdb_recorder.writer.close()

View File

@ -0,0 +1,143 @@
import torch
import numpy as np
import argparse
from ppoagent import PPOAgent
from enum import Enum
# public data
class Targets(Enum):
Free = 0
Go = 1
Attack = 2
Defence = 3
Num = 4
class PPOMem:
def __init__(
self,
args: argparse.Namespace,
unity_agent_num: int,
device: torch.device,
) -> None:
self.target_num = args.target_num
self.data_set_size = args.datasetSize
self.result_broadcast_ratio = args.result_broadcast_ratio
self.decision_period = args.decision_period
self.unity_agent_num = unity_agent_num
self.base_lose_reward = args.base_lose_reward
self.base_win_reward = args.base_win_reward
self.target_state_size = args.target_state_size
self.device = device
# Trajectory Buffer
self.ob_bf = [[] for i in range(self.unity_agent_num)]
self.act_bf = [[] for i in range(self.unity_agent_num)]
self.dis_logprobs_bf = [[] for i in range(self.unity_agent_num)]
self.con_logprobs_bf = [[] for i in range(self.unity_agent_num)]
self.rewards_bf = [[] for i in range(self.unity_agent_num)]
self.dones_bf = [[] for i in range(self.unity_agent_num)]
self.values_bf = [[] for i in range(self.unity_agent_num)]
# initialize empty training datasets
self.obs = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,env.unity_observation_size)
self.actions = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,env.unity_action_size)
self.dis_logprobs = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
self.con_logprobs = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
self.rewards = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
self.values = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
self.advantages = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
self.returns = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
def broad_cast_end_reward(self, rewardBF: list, remainTime: float) -> torch.Tensor:
thisRewardBF = rewardBF.copy()
if rewardBF[-1] <= -500:
# print("Lose DO NOT BROAD CAST",rewardBF[-1])
thisRewardBF[-1] = rewardBF[-1] - self.base_lose_reward
elif rewardBF[-1] >= 500:
# print("Win! Broadcast reward!",rewardBF[-1])
print(sum(thisRewardBF) / len(thisRewardBF))
thisRewardBF[-1] = rewardBF[-1] - self.base_win_reward
# broadcast result reward, increase all reward in this round by remainTime * self.result_broadcast_ratio
thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * self.result_broadcast_ratio)).tolist()
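# e.g. with the default result_broadcast_ratio of 1/30 and remainTime = 15,
# every reward collected in this round is increased by 15 * (1/30) = 0.5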
else:
print("!!!!!DIDNT GET RESULT REWARD!!!!!!", rewardBF[-1])
return torch.tensor(thisRewardBF,dtype=torch.float32).to(self.device)
def save_memories(
self,
now_step: int,
agent: PPOAgent,
state: np.ndarray,
action_cpu: np.ndarray,
dis_logprob_cpu: np.ndarray,
con_logprob_cpu: np.ndarray,
reward: list,
done: list,
value_cpu: np.ndarray,
last_reward: list,
next_done: list,
next_state: np.ndarray,
):
for i in range(self.unity_agent_num):
if now_step % self.decision_period == 0 or next_done[i] == True:
# only on decision period or finished a round, save memories to buffer
self.ob_bf[i].append(state[i])
self.act_bf[i].append(action_cpu[i])
self.dis_logprobs_bf[i].append(dis_logprob_cpu[i])
self.con_logprobs_bf[i].append(con_logprob_cpu[i])
self.dones_bf[i].append(done[i])
self.values_bf[i].append(value_cpu[i])
if now_step % self.decision_period == 0:
# on a decision step, also add the reward skipped since the last decision; only has an effect when decision_period != 1
self.rewards_bf[i].append(reward[i] + last_reward[i])
else:
# not a decision step, only add this step's reward
self.rewards_bf[i].append(reward[i])
if next_done[i] == True:
# finished a round, send finished memories to training datasets
# compute advantage and discounted reward
remainTime = state[i, self.target_state_size]
roundTargetType = int(state[i, 0])
thisRewardsTensor = self.broad_cast_end_reward(self.rewards_bf[i], remainTime)
adv, rt = agent.gae(
rewards=thisRewardsTensor,
dones=torch.tensor(self.dones_bf[i],dtype=torch.float32).to(self.device),
values=torch.tensor(self.values_bf[i]).to(self.device),
next_obs=torch.tensor(next_state[i]).to(self.device).unsqueeze(0),
next_done=torch.tensor([next_done[i]],dtype=torch.float32).to(self.device),
)
# send memories to training datasets
self.obs[roundTargetType] = torch.cat((self.obs[roundTargetType], torch.tensor(np.array(self.ob_bf[i])).to(self.device)), 0)
self.actions[roundTargetType] = torch.cat((self.actions[roundTargetType], torch.tensor(np.array(self.act_bf[i])).to(self.device)), 0)
self.dis_logprobs[roundTargetType] = torch.cat((self.dis_logprobs[roundTargetType], torch.tensor(np.array(self.dis_logprobs_bf[i])).to(self.device)), 0)
self.con_logprobs[roundTargetType] = torch.cat((self.con_logprobs[roundTargetType], torch.tensor(np.array(self.con_logprobs_bf[i])).to(self.device)), 0)
self.rewards[roundTargetType] = torch.cat((self.rewards[roundTargetType], thisRewardsTensor), 0)
self.values[roundTargetType] = torch.cat((self.values[roundTargetType], torch.tensor(np.array(self.values_bf[i])).to(self.device)), 0)
self.advantages[roundTargetType] = torch.cat((self.advantages[roundTargetType], adv), 0)
self.returns[roundTargetType] = torch.cat((self.returns[roundTargetType], rt), 0)
# clear buffers
self.clear_buffers(i)
print(f"train dataset {Targets(roundTargetType).name} added:{self.obs[roundTargetType].size()[0]}/{self.data_set_size}")
def clear_buffers(self,ind:int):
# clear buffers
self.ob_bf[ind] = []
self.act_bf[ind] = []
self.dis_logprobs_bf[ind] = []
self.con_logprobs_bf[ind] = []
self.rewards_bf[ind] = []
self.dones_bf[ind] = []
self.values_bf[ind] = []
def clear_training_datasets(self,ind:int):
# clear training datasets
self.obs[ind] = torch.tensor([]).to(self.device)
self.actions[ind] = torch.tensor([]).to(self.device)
self.dis_logprobs[ind] = torch.tensor([]).to(self.device)
self.con_logprobs[ind] = torch.tensor([]).to(self.device)
self.rewards[ind] = torch.tensor([]).to(self.device)
self.values[ind] = torch.tensor([]).to(self.device)
self.advantages[ind] = torch.tensor([]).to(self.device)
self.returns[ind] = torch.tensor([]).to(self.device)

View File

@ -0,0 +1,81 @@
from torch.utils.tensorboard import SummaryWriter
import wandb
total_rounds = {"Free": 0, "Go": 0, "Attack": 0}
win_rounds = {"Free": 0, "Go": 0, "Attack": 0}
# class for wandb recording
class WandbRecorder:
def __init__(self, game_name: str, game_type: str, run_name: str, _args) -> None:
# init wandb
self.game_name = game_name
self.game_type = game_type
self._args = _args
self.run_name = run_name
if self._args.wandb_track:
wandb.init(
project=self.game_name,
entity=self._args.wandb_entity,
sync_tensorboard=True,
config=vars(self._args),
name=self.run_name,
monitor_gym=True,
save_code=True,
)
self.writer = SummaryWriter(f"runs/{self.run_name}")
self.writer.add_text(
"hyperparameters",
"|param|value|\n|-|-|\n%s"
% ("\n".join([f"|{key}|{value}|" for key, value in vars(self._args).items()])),
)
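# the hyperparameter text above renders in TensorBoard's Text tab as a two-column markdown table, e.g.:
# |param|value|
# |-|-|
# |lr|5e-05|
# |epochs|3|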
def add_target_scalar(
self,
target_name,
this_t,
v_loss,
dis_pg_loss,
con_pg_loss,
loss,
entropy_loss,
target_reward_mean,
target_steps,
):
# fmt:off
self.writer.add_scalar(
f"Target{target_name}/value_loss", v_loss.item(), target_steps[this_t]
)
self.writer.add_scalar(
f"Target{target_name}/dis_policy_loss", dis_pg_loss.item(), target_steps[this_t]
)
self.writer.add_scalar(
f"Target{target_name}/con_policy_loss", con_pg_loss.item(), target_steps[this_t]
)
self.writer.add_scalar(
f"Target{target_name}/total_loss", loss.item(), target_steps[this_t]
)
self.writer.add_scalar(
f"Target{target_name}/entropy_loss", entropy_loss.item(), target_steps[this_t]
)
self.writer.add_scalar(
f"Target{target_name}/Reward", target_reward_mean, target_steps[this_t]
)
self.writer.add_scalar(
f"Target{target_name}/WinRatio", win_rounds[target_name] / total_rounds[target_name], target_steps[this_t],
)
# fmt:on
def add_global_scalar(
self,
total_reward_mean,
learning_rate,
total_steps,
):
self.writer.add_scalar("GlobalCharts/TotalRewardMean", total_reward_mean, total_steps)
self.writer.add_scalar("GlobalCharts/learning_rate", learning_rate, total_steps)
def add_win_ratio(self, target_name, target_steps):
self.writer.add_scalar(
f"Target{target_name}/WinRatio", win_rounds[target_name] / total_rounds[target_name], target_steps,
)

View File

@ -0,0 +1,56 @@
This project uses the following command-line arguments to configure the runtime environment and the model-training parameters; a minimal usage sketch follows the list.
- `--seed <int>`: random seed of the experiment. Default: `9331`.
- `--path <str>`: environment path. Default: `"./Build/3.6/Aimbot-ParallelEnv"`.
- `--workerID <int>`: Unity worker ID. Default: `1`.
- `--baseport <int>`: port used to connect to the Unity environment. Default: `500`.
- `--lr <float>`: default learning rate of the optimizer. Default: `5e-5`.
- `--cuda`: if enabled, cuda is used by default. Can be toggled by passing `true` or `false`.
- `--total-timesteps <int>`: total timesteps of the experiment. Default: `3150000`.
### Model parameters
- `--train`: whether to train the model. Enabled by default.
- `--freeze-viewnet`: whether to freeze the view network (raycast). Default: `False`.
- `--datasetSize <int>`: size of the training dataset; training starts once the dataset has collected enough data. Default: `6000`.
- `--minibatchSize <int>`: minibatch size. Default: `512`.
- `--epochs <int>`: the K iterations used to update the policy. Default: `3`.
- `--annealLR`: whether to anneal the learning rate of the policy and value networks. Default: `True`.
- `--wandb-track`: whether to track the run on wandb. Default: `False`.
- `--save-model`: whether to save the model. Default: `False`.
- `--wandb-entity <str>`: entity of the wandb project. Default: `"koha9"`.
- `--load-dir <str>`: model load directory. Default: `None`.
- `--decision-period <int>`: interval of timesteps between executed actions. Default: `1`.
- `--result-broadcast-ratio <float>`: ratio used to broadcast the result reward when a round is won. Default: `1/30`.
- `--target-lr <float>`: target value when annealing the learning rate down. Default: `1e-6`.
### Loss-function parameters
- `--policy-coef <float>`: coefficient of the policy loss. Default: `[0.8, 0.8, 0.8, 0.8]`.
- `--entropy-coef <float>`: coefficient of the entropy loss. Default: `[0.05, 0.05, 0.05, 0.05]`.
- `--critic-coef <float>`: coefficient of the critic loss. Default: `[1.0, 1.0, 1.0, 1.0]`.
- `--loss-coef <float>`: coefficient of the total loss. Default: `[1.0, 1.0, 1.0, 1.0]`.
### GAE loss parameters
- `--gae`: whether to use GAE for advantage computation. Enabled by default.
- `--norm-adv`: whether to normalize advantages. Default: `False`.
- `--gamma <float>`: discount factor gamma. Default: `0.999`.
- `--gaeLambda <float>`: lambda for GAE. Default: `0.95`.
- `--clip-coef <float>`: surrogate clipping coefficient. Default: `0.11`.
- `--clip-vloss`: whether to use the clipped value-function loss from the paper. Enabled by default.
- `--max-grad-norm <float>`: maximum norm for gradient clipping. Default: `0.5`.
### Environment parameters
- `--target-num <int>`: number of target types. Default: `4`.
- `--env-timelimit <int>`: time limit of each round. Default: `30`.
- `--base-win-reward <int>`: base reward for winning a round. Default: `999`.
- `--base-lose-reward <int>`: base reward for losing a round. Default: `-999`.
- `--target-state-size <int>`: size of the target state. Default: `6`.
- `--time-state-size <int>`: size of the remaining-time state. Default: `1`.
- `--gun-state-size <int>`: size of the gun state. Default: `1`.
- `--my-state-size <int>`: size of my state. Default: `4`.
- `--total-target-size <int>`: total size of the target state. Default: `12`.
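A minimal sketch of how these flags are consumed (the entry-script name `main.py` below is only illustrative; `arguments.py` is the module shown later in this change set):

```python
# hypothetical invocation: python main.py --wandb-track false --lr 5e-5
from arguments import parse_args

args = parse_args()
print(args.lr, args.datasetSize, args.target_num)  # with the defaults above: 5e-05 6000 4
```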

View File

@ -0,0 +1,52 @@
- `--seed <int>`: random seed of the experiment. Default: `9331`.
- `--path <str>`: environment path. Default: `"./Build/3.6/Aimbot-ParallelEnv"`.
- `--workerID <int>`: Unity worker ID. Default: `1`.
- `--baseport <int>`: port used to connect to the Unity environment. Default: `500`.
- `--lr <float>`: default learning rate of the optimizer. Default: `5e-5`.
- `--cuda`: if enabled, cuda is used by default. Can be toggled by passing `true` or `false`.
- `--total-timesteps <int>`: total timesteps of the experiment. Default: `3150000`.
### Model parameters
- `--train`: whether to train the model. Enabled by default.
- `--freeze-viewnet`: freeze the view network (raycast). Default: `False`.
- `--datasetSize <int>`: size of the training dataset; training starts once the dataset has collected enough data. Default: `6000`.
- `--minibatchSize <int>`: minibatch size. Default: `512`.
- `--epochs <int>`: epochs. Default: `3`.
- `--annealLR`: whether to anneal the learning rate of the policy and value networks. Default: `True`.
- `--wandb-track`: whether to track the run with wandb. Default: `False`.
- `--save-model`: whether to save the model. Default: `False`.
- `--wandb-entity <str>`: entity of the wandb project. Default: `"koha9"`.
- `--load-dir <str>`: model load directory. Default: `None`.
- `--decision-period <int>`: interval of timesteps between executed actions. Default: `1`.
- `--result-broadcast-ratio <float>`: broadcast ratio of the reward when a round is won. Default: `1/30`.
- `--target-lr <float>`: target value when annealing the learning rate down. Default: `1e-6`.
### Loss-function parameters
- `--policy-coef <float>`: coefficient of the policy loss. Default: `[0.8, 0.8, 0.8, 0.8]`.
- `--entropy-coef <float>`: coefficient of the entropy loss. Default: `[0.05, 0.05, 0.05, 0.05]`.
- `--critic-coef <float>`: coefficient of the critic loss. Default: `[1.0, 1.0, 1.0, 1.0]`.
- `--loss-coef <float>`: coefficient of the total loss. Default: `[1.0, 1.0, 1.0, 1.0]`. The four entries of these lists are indexed per target type; see the sketch after this list.
### GAE loss parameters
- `--gae`: whether to use GAE for advantage computation. Enabled by default.
- `--norm-adv`: whether to normalize advantages. Default: `False`.
- `--gamma <float>`: discount factor gamma. Default: `0.999`.
- `--gaeLambda <float>`: lambda for GAE. Default: `0.95`.
- `--clip-coef <float>`: surrogate clipping coefficient. Default: `0.11`.
- `--clip-vloss`: whether to use the clipped value-function loss described in the paper. Enabled by default.
- `--max-grad-norm <float>`: maximum norm for gradient clipping. Default: `0.5`.
### Environment parameters
- `--target-num <int>`: number of target types. Default: `4`.
- `--env-timelimit <int>`: time limit of each round. Default: `30`.
- `--base-win-reward <int>`: base reward for winning a round. Default: `999`.
- `--base-lose-reward <int>`: base reward for losing a round. Default: `-999`.
- `--target-state-size <int>`: size of the target state. Default: `6`.
- `--time-state-size <int>`: size of the remaining-time state. Default: `1`.
- `--gun-state-size <int>`: size of the gun state. Default: `1`.
- `--my-state-size <int>`: size of my state. Default: `4`.
- `--total-target-size <int>`: total size of the target state. Default: `12`.
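The four-element coefficient lists above are indexed by target type, in the same order as the `Targets` enum defined in `aimemory.py`; a minimal sketch with illustrative values:

```python
# hypothetical sketch of the per-target coefficient lookup used in PPOAgent.train_net
from enum import Enum

class Targets(Enum):  # same ordering as aimemory.py
    Free = 0
    Go = 1
    Attack = 2
    Defence = 3
    Num = 4

policy_coef = [0.8, 0.8, 0.8, 0.8]
this_train_ind = Targets.Go.value   # training the "Go" target
print(policy_coef[this_train_ind])  # -> 0.8
```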

View File

@ -0,0 +1,159 @@
import argparse
import uuid
from distutils.util import strtobool
DEFAULT_SEED = 9331
ENV_PATH = "../Build/3.6/Aimbot-ParallelEnv"
WAND_ENTITY = "koha9"
WORKER_ID = 1
BASE_PORT = 1000
# tensorboard names
# max round steps per agent is 2500/Decision_period, 25 seconds
TOTAL_STEPS = 3150000
BATCH_SIZE = 512
MAX_TRAINNING_DATASETS = 6000
DECISION_PERIOD = 1
LEARNING_RATE = 5e-5
GAMMA = 0.999
GAE_LAMBDA = 0.95
EPOCHS = 3
CLIP_COEF = 0.11
LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
POLICY_COEF = [0.8, 0.8, 0.8, 0.8]
ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
CRITIC_COEF = [1.0, 1.0, 1.0, 1.0]
TARGET_LEARNING_RATE = 1e-6
FREEZE_VIEW_NETWORK = False
ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = False
TRAIN = True
SAVE_MODEL = True
WANDB_TACK = True
LOAD_DIR = None
# LOAD_DIR = "../PPO-Model/GotoOnly-Level1234_9331_1697122986/8.853553.pt"
# Unity Environment Parameters
TARGET_STATE_SIZE = 6
INAREA_STATE_SIZE = 1
TIME_STATE_SIZE = 1
GUN_STATE_SIZE = 1
MY_STATE_SIZE = 4
TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
BASE_WINREWARD = 999
BASE_LOSEREWARD = -999
TARGETNUM= 4
ENV_TIMELIMIT = 30
RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
save_model_this_episode = False
def is_save_model():
global save_model_this_episode
return save_model_this_episode
def set_save_model(save_model:bool):
print("set save model to ",save_model)
global save_model_this_episode
save_model_this_episode = save_model
def parse_args():
# fmt: off
# pytorch and environment parameters
parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
help="seed of the experiment")
parser.add_argument("--path", type=str, default=ENV_PATH,
help="enviroment path")
parser.add_argument("--workerID", type=int, default=WORKER_ID,
help="unity worker ID")
parser.add_argument("--baseport", type=int, default=BASE_PORT,
help="port to connect to Unity environment")
parser.add_argument("--lr", type=float, default=LEARNING_RATE,
help="the default learning rate of optimizer")
parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="if toggled, cuda will be enabled by default")
parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
help="total timesteps of the experiments")
# model parameters
parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
help="Train Model or not")
parser.add_argument("--freeze-viewnet", type=lambda x: bool(strtobool(x)), default=FREEZE_VIEW_NETWORK, nargs="?", const=True,
help="freeze view network or not")
parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
help="training dataset size,start training while dataset collect enough data")
parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
help="nimi batch size")
parser.add_argument("--epochs", type=int, default=EPOCHS,
help="the K epochs to update the policy")
parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
help="Toggle learning rate annealing for policy and value networks")
parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True,
help="track on the wandb")
parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=SAVE_MODEL, nargs="?", const=True,
help="save model or not")
parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY,
help="the entity (team) of wandb's project")
parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
help="load model directory")
parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
help="the number of steps to run in each environment per policy rollout")
parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
# target_learning_rate
parser.add_argument("--target-lr", type=float, default=TARGET_LEARNING_RATE,
help="target value of downscaling the learning rate")
# POLICY_COEF ENTROPY_COEF CRITIC_COEF LOSS_COEF
parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
help="coefficient of the policy loss")
parser.add_argument("--entropy-coef", type=float, default=ENTROPY_COEF,
help="coefficient of the entropy loss")
parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
help="coefficient of the critic loss")
parser.add_argument("--loss-coef", type=float, default=LOSS_COEF,
help="coefficient of the total loss")
# GAE loss
parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="Use GAE for advantage computation")
parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
help="Toggles advantages normalization")
parser.add_argument("--gamma", type=float, default=GAMMA,
help="the discount factor gamma")
parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
help="the lambda for the general advantage estimation")
parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
help="the surrogate clipping coefficient")
parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True,
help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
parser.add_argument("--max-grad-norm", type=float, default=0.5,
help="the maximum norm for the gradient clipping")
parser.add_argument("--target-kl", type=float, default=None,
help="the target KL divergence threshold")
# environment parameters
parser.add_argument("--target-num", type=int, default=TARGETNUM,
help="the number of targets")
parser.add_argument("--env-timelimit", type=int, default=ENV_TIMELIMIT,
help="the time limit of each round")
parser.add_argument("--base-win-reward", type=int, default=BASE_WINREWARD,
help="the base reward of win round")
parser.add_argument("--base-lose-reward", type=int, default=BASE_LOSEREWARD,
help="the base reward of lose round")
parser.add_argument("--target-state-size", type=int, default=TARGET_STATE_SIZE,
help="the size of target state")
parser.add_argument("--time-state-size", type=int, default=TIME_STATE_SIZE,
help="the size of time state")
parser.add_argument("--gun-state-size", type=int, default=GUN_STATE_SIZE,
help="the size of gun state")
parser.add_argument("--my-state-size", type=int, default=MY_STATE_SIZE,
help="the size of my state")
parser.add_argument("--total-target-size", type=int, default=TOTAL_T_SIZE,
help="the size of total target state")
# fmt: on
args = parser.parse_args()
return args

View File

@ -0,0 +1,291 @@
import numpy as np
import torch
import argparse
import time
from torch import nn
from aimbotEnv import Aimbot
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
nn.init.orthogonal_(layer.weight, std)
nn.init.constant_(layer.bias, bias_const)
return layer
neural_size_1 = 400
neural_size_2 = 300
class PPOAgent(nn.Module):
def __init__(
self,
env: Aimbot,
this_args: argparse.Namespace,
device: torch.device,
):
super(PPOAgent, self).__init__()
self.device = device
self.args = this_args
self.train_agent = self.args.train
self.target_num = self.args.target_num
self.unity_observation_shape = env.unity_observation_shape
self.unity_action_size = env.unity_action_size
self.state_size = self.unity_observation_shape[0]
self.agent_num = env.unity_agent_num
self.unity_discrete_type = env.unity_discrete_type
self.discrete_size = env.unity_discrete_size
self.discrete_shape = list(env.unity_discrete_branches)
self.continuous_size = env.unity_continuous_size
self.hidden_networks = nn.ModuleList(
[
nn.Sequential(
layer_init(nn.Linear(self.state_size, neural_size_1)),
nn.LeakyReLU(),
layer_init(nn.Linear(neural_size_1, neural_size_2)),
nn.LeakyReLU(),
)
for i in range(self.target_num)
]
)
self.actor_dis = nn.ModuleList(
[layer_init(nn.Linear(neural_size_2, self.discrete_size), std=0.5) for i in range(self.target_num)]
)
self.actor_mean = nn.ModuleList(
[layer_init(nn.Linear(neural_size_2, self.continuous_size), std=0) for i in range(self.target_num)]
)
self.actor_logstd = nn.ParameterList(
[nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
)
self.critic = nn.ModuleList(
[layer_init(nn.Linear(neural_size_2, 1), std=0) for i in range(self.target_num)]
)
def get_value(self, state: torch.Tensor):
# get critic value
# state.size()[0] is batch_size
target = state[:, 0].to(torch.int32) # int
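# e.g. a sample whose first state element is 2 (Targets.Attack) is routed through
# hidden_networks[2] and critic[2] below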
hidden_output = torch.stack(
[self.hidden_networks[target[i]](state[i]) for i in range(state.size()[0])]
)
criticV = torch.stack(
[self.critic[target[i]](hidden_output[i]) for i in range(state.size()[0])]
)
return criticV
def get_actions_value(self, state: torch.Tensor, actions=None):
# get actions and value
target = state[:, 0].to(torch.int32) # int
hidden_output = torch.stack(
[self.hidden_networks[target[i]](state[i]) for i in range(target.size()[0])]
)
# discrete
# iterate over the batch (one entry per agent) and pick the output head matching each sample's target
dis_logits = torch.stack(
[self.actor_dis[target[i]](hidden_output[i]) for i in range(target.size()[0])]
)
split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
# continuous
actions_mean = torch.stack(
[self.actor_mean[target[i]](hidden_output[i]) for i in range(target.size()[0])]
) # self.actor_mean(hidden)
action_logstd = torch.stack(
[torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(target.size()[0])]
)
# print(action_logstd)
action_std = torch.exp(action_logstd) # torch.exp(action_logstd)
con_probs = Normal(actions_mean, action_std)
# critic
criticV = torch.stack(
[self.critic[target[i]](hidden_output[i]) for i in range(target.size()[0])]
) # self.critic
if actions is None:
if self.train_agent:
# sample actions from the probability distributions
dis_act = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
con_act = con_probs.sample()
actions = torch.cat([dis_act.T, con_act], dim=1)
else:
# select the most probable actions (greedy, for evaluation)
dis_act = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
con_act = actions_mean
actions = torch.cat([dis_act.T, con_act], dim=1)
else:
dis_act = actions[:, 0: self.unity_discrete_type].T
con_act = actions[:, self.unity_discrete_type:]
dis_log_prob = torch.stack(
[ctgr.log_prob(act) for act, ctgr in zip(dis_act, multi_categoricals)]
)
dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
return (
actions,
dis_log_prob.sum(0),
dis_entropy.sum(0),
con_probs.log_prob(con_act).sum(1),
con_probs.entropy().sum(1),
criticV,
)
def train_net(self, this_train_ind: int, ppo_memories, optimizer) -> tuple:
start_time = time.time()
# flatten the batch
b_obs = ppo_memories.obs[this_train_ind].reshape((-1,) + self.unity_observation_shape)
b_dis_logprobs = ppo_memories.dis_logprobs[this_train_ind].reshape(-1)
b_con_logprobs = ppo_memories.con_logprobs[this_train_ind].reshape(-1)
b_actions = ppo_memories.actions[this_train_ind].reshape((-1,) + (self.unity_action_size,))
b_advantages = ppo_memories.advantages[this_train_ind].reshape(-1)
b_returns = ppo_memories.returns[this_train_ind].reshape(-1)
b_values = ppo_memories.values[this_train_ind].reshape(-1)
b_size = b_obs.size()[0]
# optimizing the policy and value network
b_index = np.arange(b_size)
for epoch in range(self.args.epochs):
print("epoch:", epoch, end="")
# shuffle all datasets
np.random.shuffle(b_index)
for start in range(0, b_size, self.args.minibatchSize):
print(".", end="")
end = start + self.args.minibatchSize
mb_index = b_index[start:end]
if np.size(mb_index) <= 1:
break
mb_advantages = b_advantages[mb_index]
# normalize advantages
if self.args.norm_adv:
mb_advantages = (mb_advantages - mb_advantages.mean()) / (
mb_advantages.std() + 1e-8
)
(
_,
new_dis_logprob,
dis_entropy,
new_con_logprob,
con_entropy,
new_value,
) = self.get_actions_value(b_obs[mb_index], b_actions[mb_index])
# discrete ratio
dis_log_ratio = new_dis_logprob - b_dis_logprobs[mb_index]
dis_ratio = dis_log_ratio.exp()
# continuous ratio
con_log_ratio = new_con_logprob - b_con_logprobs[mb_index]
con_ratio = con_log_ratio.exp()
"""
# early stop
with torch.no_grad():
# calculate approx_kl http://joschu.net/blog/kl-approx.html
old_approx_kl = (-logratio).mean()
approx_kl = ((ratio - 1) - logratio).mean()
clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
"""
# discrete Policy loss
dis_pg_loss_orig = -mb_advantages * dis_ratio
dis_pg_loss_clip = -mb_advantages * torch.clamp(
dis_ratio, 1 - self.args.clip_coef, 1 + self.args.clip_coef
)
dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
# continuous Policy loss
con_pg_loss_orig = -mb_advantages * con_ratio
con_pg_loss_clip = -mb_advantages * torch.clamp(
con_ratio, 1 - self.args.clip_coef, 1 + self.args.clip_coef
)
con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
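# both branches above use the PPO clipped surrogate objective:
#   L_clip = mean( max( -A_t * r_t, -A_t * clip(r_t, 1 - clip_coef, 1 + clip_coef) ) )
# where r_t = exp(new_logprob - old_logprob) for the discrete / continuous branch respectively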
# Value loss
new_value = new_value.view(-1)
if self.args.clip_vloss:
v_loss_unclipped = (new_value - b_returns[mb_index]) ** 2
v_clipped = b_values[mb_index] + torch.clamp(
new_value - b_values[mb_index],
-self.args.clip_coef,
self.args.clip_coef,
)
v_loss_clipped = (v_clipped - b_returns[mb_index]) ** 2
v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
v_loss = 0.5 * v_loss_max.mean()
else:
v_loss = 0.5 * ((new_value - b_returns[mb_index]) ** 2).mean()
# total loss
entropy_loss = dis_entropy.mean() + con_entropy.mean()
loss = (
dis_pg_loss * self.args.policy_coef[this_train_ind]
+ con_pg_loss * self.args.policy_coef[this_train_ind]
+ entropy_loss * self.args.entropy_coef[this_train_ind]
+ v_loss * self.args.critic_coef[this_train_ind]
) * self.args.loss_coef[this_train_ind]
if torch.isnan(loss).any():
print("LOSS includes NaN!!!")
if torch.isnan(dis_pg_loss).any():
print("dis_pg_loss includes NaN")
if torch.isnan(con_pg_loss).any():
print("con_pg_loss includes NaN")
if torch.isnan(entropy_loss).any():
print("entropy_loss includes NaN")
if torch.isnan(v_loss).any():
print("v_loss includes NaN")
raise ValueError("loss contains NaN")
optimizer.zero_grad()
loss.backward()
# Clips gradient norm of an iterable of parameters.
nn.utils.clip_grad_norm_(self.parameters(), self.args.max_grad_norm)
optimizer.step()
"""
if args.target_kl is not None:
if approx_kl > args.target_kl:
break
"""
return v_loss, dis_pg_loss, con_pg_loss, loss, entropy_loss
def gae(
self,
rewards: torch.Tensor,
dones: torch.Tensor,
values: torch.Tensor,
next_obs: torch.Tensor,
next_done: torch.Tensor,
) -> tuple:
# GAE
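# standard generalized advantage estimation, computed backwards over the trajectory:
#   delta_t  = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t)
#   A_t      = delta_t + gamma * gaeLambda * (1 - done_{t+1}) * A_{t+1}
#   return_t = A_t + V(s_t)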
with torch.no_grad():
next_value = self.get_value(next_obs).reshape(1, -1)
data_size = rewards.size()[0]
if self.args.gae:
advantages = torch.zeros_like(rewards).to(self.device)
last_gae_lam = 0
for t in reversed(range(data_size)):
if t == data_size - 1:
next_non_terminal = 1.0 - next_done
next_values = next_value
else:
next_non_terminal = 1.0 - dones[t + 1]
next_values = values[t + 1]
delta = rewards[t] + self.args.gamma * next_values * next_non_terminal - values[t]
advantages[t] = last_gae_lam = (
delta + self.args.gamma * self.args.gaeLambda * next_non_terminal * last_gae_lam
)
returns = advantages + values
else:
returns = torch.zeros_like(rewards).to(self.device)
for t in reversed(range(data_size)):
if t == data_size - 1:
next_non_terminal = 1.0 - next_done
next_return = next_value
else:
next_non_terminal = 1.0 - dones[t + 1]
next_return = returns[t + 1]
returns[t] = rewards[t] + self.args.gamma * next_non_terminal * next_return
advantages = returns - values
return advantages, returns

Binary file not shown.

View File

@ -1,502 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Action, 1 continuous ctrl 2.1\n",
"Action, 0 continuous ctrl -1.1\n"
]
}
],
"source": [
"import gym\n",
"from gym.spaces import Dict, Discrete, Box, Tuple\n",
"import numpy as np\n",
"\n",
"\n",
"class SampleGym(gym.Env):\n",
" def __init__(self, config={}):\n",
" self.config = config\n",
" self.action_space = Tuple((Discrete(2), Box(-10, 10, (2,))))\n",
" self.observation_space = Box(-10, 10, (2, 2))\n",
" self.p_done = config.get(\"p_done\", 0.1)\n",
"\n",
" def reset(self):\n",
" return self.observation_space.sample()\n",
"\n",
" def step(self, action):\n",
" chosen_action = action[0]\n",
" cnt_control = action[1][chosen_action]\n",
"\n",
" if chosen_action == 0:\n",
" reward = cnt_control\n",
" else:\n",
" reward = -cnt_control - 1\n",
"\n",
" print(f\"Action, {chosen_action} continuous ctrl {cnt_control}\")\n",
" return (\n",
" self.observation_space.sample(),\n",
" reward,\n",
" bool(np.random.choice([True, False], p=[self.p_done, 1.0 - self.p_done])),\n",
" {},\n",
" )\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" env = SampleGym()\n",
" env.reset()\n",
" env.step((1, [-1, 2.1])) # should say use action 1 with 2.1\n",
" env.step((0, [-1.1, 2.1])) # should say use action 0 with -1.1"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from mlagents_envs.environment import UnityEnvironment\n",
"from gym_unity.envs import UnityToGymWrapper\n",
"import numpy as np\n",
"\n",
"ENV_PATH = \"../Build-ParallelEnv/Aimbot-ParallelEnv\"\n",
"WORKER_ID = 1\n",
"BASE_PORT = 2002\n",
"\n",
"env = UnityEnvironment(\n",
" file_name=ENV_PATH,\n",
" seed=1,\n",
" side_channels=[],\n",
" worker_id=WORKER_ID,\n",
" base_port=BASE_PORT,\n",
")\n",
"\n",
"trackedAgent = 0\n",
"env.reset()\n",
"BEHA_SPECS = env.behavior_specs\n",
"BEHA_NAME = list(BEHA_SPECS)[0]\n",
"SPEC = BEHA_SPECS[BEHA_NAME]\n",
"print(SPEC)\n",
"\n",
"decisionSteps, terminalSteps = env.get_steps(BEHA_NAME)\n",
"\n",
"if trackedAgent in decisionSteps: # ゲーム終了していない場合、環境状態がdecision_stepsに保存される\n",
" nextState = decisionSteps[trackedAgent].obs[0]\n",
" reward = decisionSteps[trackedAgent].reward\n",
" done = False\n",
"if trackedAgent in terminalSteps: # ゲーム終了した場合、環境状態がterminal_stepsに保存される\n",
" nextState = terminalSteps[trackedAgent].obs[0]\n",
" reward = terminalSteps[trackedAgent].reward\n",
" done = True\n",
"print(decisionSteps.agent_id)\n",
"print(terminalSteps)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"decisionSteps.agent_id [1 2 5 7]\n",
"decisionSteps.agent_id_to_index {1: 0, 2: 1, 5: 2, 7: 3}\n",
"decisionSteps.reward [0. 0. 0. 0.]\n",
"decisionSteps.action_mask [array([[False, False, False],\n",
" [False, False, False],\n",
" [False, False, False],\n",
" [False, False, False]]), array([[False, False, False],\n",
" [False, False, False],\n",
" [False, False, False],\n",
" [False, False, False]]), array([[False, False],\n",
" [False, False],\n",
" [False, False],\n",
" [False, False]])]\n",
"decisionSteps.obs [ 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0.\n",
" 0. 0. -15.994009 1. -26.322788 1.\n",
" 1. 1. 1. 1. 1. 2.\n",
" 1. 1. 1. 1. 1. 1.\n",
" 1. 1.3519633 1.6946528 2.3051548 3.673389 9.067246\n",
" 17.521473 21.727095 22.753294 24.167128 25.905216 18.35725\n",
" 21.02278 21.053417 0. ]\n"
]
},
{
"data": {
"text/plain": [
"'decisionSteps.obs [array([[-15.994009 , 1. , -26.322788 , 1. , 1. ,\\n 1. , 1. , 1. , 1. , 2. ,\\n 1. , 1. , 1. , 1. , 1. ,\\n 1. , 1. , 1.3519633, 1.6946528, 2.3051548,\\n 3.673389 , 9.067246 , 17.521473 , 21.727095 , 22.753294 ,\\n 24.167128 , 25.905216 , 18.35725 , 21.02278 , 21.053417 ,\\n 0. ],\\n [ -1.8809433, 1. , -25.66834 , 1. , 2. ,\\n 1. , 1. , 1. , 1. , 1. ,\\n 1. , 1. , 1. , 1. , 1. ,\\n 1. , 1. , 16.768637 , 23.414627 , 22.04486 ,\\n 21.050663 , 20.486784 , 20.486784 , 21.050665 , 15.049731 ,\\n 11.578419 , 9.695194 , 20.398016 , 20.368341 , 20.398016 ,\\n...\\n 20.551746 , 20.00118 , 20.001116 , 20.551594 , 21.5222 ,\\n 17.707508 , 14.86889 , 19.914494 , 19.885508 , 19.914463 ,\\n 0. ]], dtype=float32)]'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(\"decisionSteps.agent_id\",decisionSteps.agent_id)\n",
"# decisionSteps.agent_id [1 2 5 7]\n",
"print(\"decisionSteps.agent_id_to_index\",decisionSteps.agent_id_to_index)\n",
"# decisionSteps.agent_id_to_index {1: 0, 2: 1, 5: 2, 7: 3}\n",
"print(\"decisionSteps.reward\",decisionSteps.reward)\n",
"# decisionSteps.reward [0. 0. 0. 0.]\n",
"print(\"decisionSteps.action_mask\",decisionSteps.action_mask)\n",
"'''\n",
"decisionSteps.action_mask [array([[False, False, False],\n",
" [False, False, False],\n",
" [False, False, False],\n",
" [False, False, False]]), array([[False, False, False],\n",
" [False, False, False],\n",
" [False, False, False],\n",
" [False, False, False]]), array([[False, False],\n",
" [False, False],\n",
" [False, False],\n",
" [False, False]])]\n",
"'''\n",
"print(\"decisionSteps.obs\", decisionSteps.obs[0][0])\n",
"'''decisionSteps.obs [array([[-15.994009 , 1. , -26.322788 , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. , 2. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1.3519633, 1.6946528, 2.3051548,\n",
" 3.673389 , 9.067246 , 17.521473 , 21.727095 , 22.753294 ,\n",
" 24.167128 , 25.905216 , 18.35725 , 21.02278 , 21.053417 ,\n",
" 0. ],\n",
" [ -1.8809433, 1. , -25.66834 , 1. , 2. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 16.768637 , 23.414627 , 22.04486 ,\n",
" 21.050663 , 20.486784 , 20.486784 , 21.050665 , 15.049731 ,\n",
" 11.578419 , 9.695194 , 20.398016 , 20.368341 , 20.398016 ,\n",
"...\n",
" 20.551746 , 20.00118 , 20.001116 , 20.551594 , 21.5222 ,\n",
" 17.707508 , 14.86889 , 19.914494 , 19.885508 , 19.914463 ,\n",
" 0. ]], dtype=float32)]'''\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from AimbotEnv import Aimbot\n",
"\n",
"ENV_PATH = \"../Build-ParallelEnv/Aimbot-ParallelEnv\"\n",
"WORKER_ID = 1\n",
"BASE_PORT = 2002\n",
"\n",
"env = Aimbot(envPath=ENV_PATH,workerID= WORKER_ID,basePort= BASE_PORT)\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([[ 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , -15.994009 , 1. , -26.322788 , 1. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 2. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1.3519633, 1.6946528,\n",
" 2.3051548, 3.673389 , 9.067246 , 17.521473 , 21.727095 ,\n",
" 22.753294 , 24.167128 , 25.905216 , 18.35725 , 21.02278 ,\n",
" 21.053417 , 0. , -15.994003 , 1. , -26.322784 ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. , 1.3519667,\n",
" 1.6946585, 2.3051722, 3.6734192, 9.067533 , 21.145092 ,\n",
" 21.727148 , 22.753365 , 24.167217 , 25.905317 , 18.358263 ,\n",
" 21.022812 , 21.053455 , 0. ],\n",
" [ 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , -1.8809433, 1. , -25.66834 , 1. ,\n",
" 2. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 16.768637 , 23.414627 ,\n",
" 22.04486 , 21.050663 , 20.486784 , 20.486784 , 21.050665 ,\n",
" 15.049731 , 11.578419 , 9.695194 , 20.398016 , 20.368341 ,\n",
" 20.398016 , 0. , -1.8809433, 1. , -25.66834 ,\n",
" 1. , 1. , 2. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. , 2. ,\n",
" 2. , 1. , 1. , 1. , 25.098585 ,\n",
" 15.749494 , 22.044899 , 21.050697 , 20.486813 , 20.486813 ,\n",
" 21.050694 , 15.049746 , 3.872317 , 3.789325 , 20.398046 ,\n",
" 20.368372 , 20.398046 , 0. ],\n",
" [ 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , -13.672583 , 1. , -26.479263 , 1. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 5.3249803, 6.401276 ,\n",
" 8.374101 , 12.8657875, 21.302414 , 21.30242 , 21.888742 ,\n",
" 22.92251 , 24.346794 , 26.09773 , 21.210114 , 21.179258 ,\n",
" 21.210117 , 0. , -13.672583 , 1. , -26.479263 ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 2. , 1. , 1. ,\n",
" 2. , 1. , 1. , 2. , 5.3249855,\n",
" 6.4012837, 8.374114 , 12.865807 , 21.302446 , 21.30245 ,\n",
" 16.168503 , 22.922543 , 24.346823 , 7.1110754, 21.210148 ,\n",
" 21.17929 , 12.495141 , 0. ],\n",
" [ 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , -4.9038744, 1. , -25.185507 , 1. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 20.33171 , 22.859762 ,\n",
" 21.522427 , 20.551746 , 20.00118 , 20.001116 , 20.551594 ,\n",
" 21.5222 , 17.707508 , 14.86889 , 19.914494 , 19.885508 ,\n",
" 19.914463 , 0. , -4.9038773, 1. , -25.185507 ,\n",
" 1. , 2. , 1. , 2. , 1. ,\n",
" 1. , 1. , 1. , 2. , 1. ,\n",
" 1. , 1. , 1. , 1. , 15.905993 ,\n",
" 22.85977 , 11.566693 , 20.551773 , 20.00121 , 20.001146 ,\n",
" 20.551619 , 7.135157 , 17.707582 , 14.868943 , 19.914528 ,\n",
" 19.88554 , 19.914494 , 0. ]], dtype=float32),\n",
" [[-0.05], [-0.05], [-0.05], [-0.05]],\n",
" [[False], [False], [False], [False]])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"env.unity_observation_shape\n",
"(128, 4) + env.unity_observation_shape\n",
"env.reset()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[1, 2, 3],\n",
" [1, 2, 3],\n",
" [1, 2, 3],\n",
" [1, 2, 3]], device='cuda:0')\n",
"tensor([[1],\n",
" [2],\n",
" [3],\n",
" [4]], device='cuda:0')\n"
]
},
{
"data": {
"text/plain": [
"tensor([[1, 2, 3, 1],\n",
" [1, 2, 3, 2],\n",
" [1, 2, 3, 3],\n",
" [1, 2, 3, 4]], device='cuda:0')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import torch\n",
"aa = torch.tensor([[1,2,3],[1,2,3],[1,2,3],[1,2,3]]).to(\"cuda:0\")\n",
"bb = torch.tensor([[1],[2],[3],[4]]).to(\"cuda:0\")\n",
"print(aa)\n",
"print(bb)\n",
"torch.cat([aa,bb],axis = 1)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "Can't get attribute 'PPOAgent' on <module '__main__'>",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_31348\\1930153251.py\u001b[0m in \u001b[0;36m<cell line: 2>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mmymodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"../PPO-Model/SmallArea-256-128-hybrid.pt\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mmymodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0meval\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\torch\\serialization.py\u001b[0m in \u001b[0;36mload\u001b[1;34m(f, map_location, pickle_module, **pickle_load_args)\u001b[0m\n\u001b[0;32m 710\u001b[0m \u001b[0mopened_file\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mseek\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0morig_position\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 711\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopened_file\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 712\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_load\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopened_zipfile\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmap_location\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpickle_module\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mpickle_load_args\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 713\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0m_legacy_load\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopened_file\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmap_location\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpickle_module\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mpickle_load_args\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 714\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\torch\\serialization.py\u001b[0m in \u001b[0;36m_load\u001b[1;34m(zip_file, map_location, pickle_module, pickle_file, **pickle_load_args)\u001b[0m\n\u001b[0;32m 1047\u001b[0m \u001b[0munpickler\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mUnpicklerWrapper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_file\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mpickle_load_args\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1048\u001b[0m \u001b[0munpickler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpersistent_load\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpersistent_load\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1049\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0munpickler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1050\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1051\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_utils\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_loaded_sparse_tensors\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\torch\\serialization.py\u001b[0m in \u001b[0;36mfind_class\u001b[1;34m(self, mod_name, name)\u001b[0m\n\u001b[0;32m 1040\u001b[0m \u001b[1;32mpass\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1041\u001b[0m \u001b[0mmod_name\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mload_module_mapping\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmod_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmod_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1042\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmod_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1043\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1044\u001b[0m \u001b[1;31m# Load the data (which may in turn use `persistent_load` to load tensors)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mAttributeError\u001b[0m: Can't get attribute 'PPOAgent' on <module '__main__'>"
]
}
],
"source": [
"import torch\n",
"\n",
"def layer_init(layer, std=np.sqrt(2), bias_const=0.0):\n",
" torch.nn.init.orthogonal_(layer.weight, std)\n",
" torch.nn.init.constant_(layer.bias, bias_const)\n",
" return layer\n",
"\n",
"class PPOAgent(nn.Module):\n",
" def __init__(self, env: Aimbot):\n",
" super(PPOAgent, self).__init__()\n",
" self.discrete_size = env.unity_discrete_size\n",
" self.discrete_shape = list(env.unity_discrete_branches)\n",
" self.continuous_size = env.unity_continuous_size\n",
"\n",
" self.network = nn.Sequential(\n",
" layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 256)),\n",
" nn.ReLU(),\n",
" layer_init(nn.Linear(256, 128)),\n",
" nn.ReLU(),\n",
" )\n",
" self.actor_dis = layer_init(nn.Linear(128, self.discrete_size), std=0.01)\n",
" self.actor_mean = layer_init(nn.Linear(128, self.continuous_size), std=0.01)\n",
" self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))\n",
" self.critic = layer_init(nn.Linear(128, 1), std=1)\n",
"\n",
" def get_value(self, state: torch.Tensor):\n",
" return self.critic(self.network(state))\n",
"\n",
" def get_actions_value(self, state: torch.Tensor, actions=None):\n",
" hidden = self.network(state)\n",
" # discrete\n",
" dis_logits = self.actor_dis(hidden)\n",
" split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)\n",
" multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]\n",
" # continuous\n",
" actions_mean = self.actor_mean(hidden)\n",
" action_logstd = self.actor_logstd.expand_as(actions_mean)\n",
" action_std = torch.exp(action_logstd)\n",
" con_probs = Normal(actions_mean, action_std)\n",
"\n",
" if actions is None:\n",
" disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])\n",
" conAct = con_probs.sample()\n",
" actions = torch.cat([disAct.T, conAct], dim=1)\n",
" else:\n",
" disAct = actions[:, 0 : env.unity_discrete_type].T\n",
" conAct = actions[:, env.unity_discrete_type :]\n",
" dis_log_prob = torch.stack(\n",
" [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]\n",
" )\n",
" dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])\n",
" return (\n",
" actions,\n",
" dis_log_prob.sum(0),\n",
" dis_entropy.sum(0),\n",
" con_probs.log_prob(conAct).sum(1),\n",
" con_probs.entropy().sum(1),\n",
" self.critic(hidden),\n",
" )\n",
"\n",
"\n",
"mymodel = torch.load(\"../PPO-Model/SmallArea-256-128-hybrid.pt\")\n",
"mymodel.eval()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"x : torch.Size([2, 3, 4])\n",
"x : torch.Size([6, 2, 3, 4])\n",
"x : torch.Size([6, 2, 3, 4])\n"
]
}
],
"source": [
"import torch\n",
"#1\n",
"x = torch.randn(2, 1, 1)#为1可以扩展为3和4\n",
"x = x.expand(2, 3, 4)\n",
"print('x :', x.size())\n",
"\n",
"#2\n",
"#扩展一个新的维度必须在最前面,否则会报错\n",
"#x = x.expand(2, 3, 4, 6)\n",
"\n",
"x = x.expand(6, 2, 3, 4)\n",
"print('x :', x.size())\n",
"\n",
"#3\n",
"#某一个维度为-1表示不改变该维度的大小\n",
"x = x.expand(6, -1, -1, -1)\n",
"print('x :', x.size())\n",
"\n",
"x : torch.Size([2, 3, 4])\n",
"x : torch.Size([6, 2, 3, 4])\n",
"x : torch.Size([6, 2, 3, 4])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.7 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "86e2db13b09bd6be22cb599ea60c1572b9ef36ebeaa27a4c8e961d6df315ac32"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,5 @@
import numpy as np
aa = np.array([1,2,3,4,5,6,7,8,9,10])
print(aa)