Update PPO class, add Python human control

Python:
Update PPO class
Add Python human control
Unity:
Add FP/TP camera selection button
This commit is contained in:
Koha9 2022-10-11 06:40:15 +09:00
parent de066f3a65
commit ae8a1ba8e2
26 changed files with 3639 additions and 990 deletions

View File

@ -0,0 +1,8 @@
fileFormatVersion: 2
guid: d65d9ca7ae1253341b6790f3a23e3a11
folderAsset: yes
DefaultImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

View File

@ -0,0 +1,10 @@
fileFormatVersion: 2
guid: 39a127fc79ed92d4e88aec711f545d5f
ScriptedImporter:
internalIDToNameTable: []
externalObjects: {}
serializedVersion: 2
userData: ' (Unity.MLAgents.Demonstrations.DemonstrationSummary)'
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}

View File

@ -1 +1 @@
{"count":1,"self":33.6679968,"total":34.5046305,"children":{"InitializeActuators":{"count":2,"self":0.0010002,"total":0.0010002,"children":null},"InitializeSensors":{"count":2,"self":0.0010004,"total":0.0010004,"children":null},"AgentSendState":{"count":1489,"self":0.011503399999999999,"total":0.2010688,"children":{"CollectObservations":{"count":1489,"self":0.1780647,"total":0.1780647,"children":null},"WriteActionMask":{"count":1488,"self":0.0019993999999999997,"total":0.0019993999999999997,"children":null},"RequestDecision":{"count":1488,"self":0.009501299999999999,"total":0.009501299999999999,"children":null}}},"DecideAction":{"count":1488,"self":0.0117408,"total":0.0117408,"children":null},"AgentAct":{"count":1488,"self":0.6208231,"total":0.6208231,"children":null}},"gauges":{},"metadata":{"timer_format_version":"0.1.0","start_time_seconds":"1663089804","unity_version":"2020.3.19f1","command_line_arguments":"C:\\Program Files\\Unity\\Hub\\Editor\\2020.3.19f1\\Editor\\Unity.exe -projectpath C:\\Users\\UCUNI\\OneDrive\\Unity\\ML-Agents\\Aimbot-PPO\\Aimbot-PPO-MultiScene -useHub -hubIPC -cloudEnvironment production -licensingIpc LicenseClient-UCUNI -hubSessionId 4cf980b0-326c-11ed-87c2-a7333acffe7c -accessToken j61gZPw8-vc4ZH7TJMvrSAAPQLV9SK6U72z_dek2xhw00ef","communication_protocol_version":"1.5.0","com.unity.ml-agents_version":"2.0.0","scene_name":"InGame","end_time_seconds":"1663089838"}}
{"count":1,"self":42.3855296,"total":42.4020608,"children":{"InitializeActuators":{"count":2,"self":0.0015155,"total":0.0015155,"children":null},"InitializeSensors":{"count":2,"self":0.0015017,"total":0.0015017,"children":null},"AgentSendState":{"count":1898,"self":0.0025031999999999997,"total":0.0025031999999999997,"children":null},"DecideAction":{"count":1898,"self":0.0070091999999999993,"total":0.0070091999999999993,"children":null},"AgentAct":{"count":1898,"self":0.0030023,"total":0.0030023,"children":null}},"gauges":{},"metadata":{"timer_format_version":"0.1.0","start_time_seconds":"1665414279","unity_version":"2020.3.19f1","command_line_arguments":"C:\\Program Files\\Unity\\Hub\\Editor\\2020.3.19f1\\Editor\\Unity.exe -projectpath C:\\Users\\UCUNI\\OneDrive\\Unity\\ML-Agents\\Aimbot-PPO\\Aimbot-PPO-MultiScene -useHub -hubIPC -cloudEnvironment production -licensingIpc LicenseClient-UCUNI -hubSessionId 39022900-48a5-11ed-b848-09be5949a456 -accessToken _47qt9I_MF3bhL7JS735Xdmfj8A4dGBOdRNKR0X2L_w00ef","communication_protocol_version":"1.5.0","com.unity.ml-agents_version":"2.0.0","scene_name":"InGame","end_time_seconds":"1665414322"}}

View File

@ -1 +1 @@
{"count":1,"self":114.25904639999999,"total":114.62062499999999,"children":{"InitializeActuators":{"count":2,"self":0.0010000999999999999,"total":0.0010000999999999999,"children":null},"InitializeSensors":{"count":2,"self":0.0010002,"total":0.0010002,"children":null},"AgentSendState":{"count":1382,"self":0.0080028,"total":0.0195053,"children":{"CollectObservations":{"count":1382,"self":0.0070022999999999995,"total":0.0070022999999999995,"children":null},"WriteActionMask":{"count":1382,"self":0.0004994,"total":0.0004994,"children":null},"RequestDecision":{"count":1382,"self":0.0040008,"total":0.0040008,"children":null}}},"DecideAction":{"count":1382,"self":0.0110034,"total":0.0110034,"children":null},"AgentAct":{"count":1382,"self":0.3290731,"total":0.3290731,"children":null}},"gauges":{},"metadata":{"timer_format_version":"0.1.0","start_time_seconds":"1662500099","unity_version":"2020.3.19f1","command_line_arguments":"C:\\Program Files\\Unity\\Hub\\Editor\\2020.3.19f1\\Editor\\Unity.exe -projectpath C:\\Users\\UCUNI\\OneDrive\\Unity\\ML-Agents\\Aimbot-PPO\\Aimbot-PPO-MultiScene -useHub -hubIPC -cloudEnvironment production -licensingIpc LicenseClient-UCUNI -hubSessionId 209fdf30-2c1f-11ed-916f-33e85f4223cc -accessToken 78EBbrn-dg5kE__h3rNOqQVTDU3b1xUmmwWF1c5sFLc00ef","communication_protocol_version":"1.5.0","com.unity.ml-agents_version":"2.0.0","scene_name":"Start","end_time_seconds":"1662500214"}}
{"count":1,"self":100.7007424,"total":102.0526476,"children":{"InitializeActuators":{"count":2,"self":0.0015004999999999999,"total":0.0015004999999999999,"children":null},"InitializeSensors":{"count":2,"self":0.0010015,"total":0.0010015,"children":null},"AgentSendState":{"count":2851,"self":0.0227973,"total":0.3594312,"children":{"CollectObservations":{"count":2851,"self":0.3230326,"total":0.3230326,"children":null},"WriteActionMask":{"count":2850,"self":0.0040877,"total":0.0040877,"children":null},"RequestDecision":{"count":2850,"self":0.0095135999999999988,"total":0.0095135999999999988,"children":null}}},"DecideAction":{"count":2850,"self":0.0184923,"total":0.0184923,"children":null},"AgentAct":{"count":2850,"self":0.971482,"total":0.971482,"children":null}},"gauges":{"AKMAgent.CumulativeReward":{"count":1,"max":0,"min":0,"runningAverage":0,"value":0,"weightedAverage":0}},"metadata":{"timer_format_version":"0.1.0","start_time_seconds":"1665340408","unity_version":"2020.3.19f1","command_line_arguments":"C:\\Program Files\\Unity\\Hub\\Editor\\2020.3.19f1\\Editor\\Unity.exe -projectpath C:\\Users\\UCUNI\\OneDrive\\Unity\\ML-Agents\\Aimbot-PPO\\Aimbot-PPO-MultiScene -useHub -hubIPC -cloudEnvironment production -licensingIpc LicenseClient-8AgJBC01I23iOtjIDvezn -hubSessionId a2bff0f0-47ee-11ed-98ba-e72fca9de6f1 -accessToken VHkJOvWIH11sBEzC18rl6YA9y6y2sRMQj2zrOyZdNeE00ef","communication_protocol_version":"1.5.0","com.unity.ml-agents_version":"2.0.0","scene_name":"Start","end_time_seconds":"1665340510"}}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -21,6 +21,8 @@ public class AgentWithGun : Agent
public Camera thisCam;
public CharacterController PlayerController;
public GameObject enemyPrefab;
public GameObject cameraChangerOBJ;
[Header("Rewards")]
[Tooltip("Nothing happened reward")]
@ -76,12 +78,15 @@ public class AgentWithGun : Agent
private string LoadDirTime;
private float LoadDirDateF;
private float loadDirTimeF;
public bool defaultTPCamera = true;
private StartSeneData DataTransfer;
private UIController UICon;
private HistoryRecorder HistoryRec;
private RaySensors rayScript;
private CameraChange camChanger;
[System.NonSerialized]public float nonReward;
[System.NonSerialized] public float nonReward;
[System.NonSerialized] public float shootReward;
[System.NonSerialized] public float shootWithoutReadyReward;
[System.NonSerialized] public float hitReward;
@ -118,6 +123,8 @@ public class AgentWithGun : Agent
killRewardDefault = DataTransfer.killReward;
winRewardDefault = DataTransfer.winReward;
loseRewardDefault = DataTransfer.loseReward;
lockMouse = DataTransfer.lockMouse;
defaultTPCamera = DataTransfer.defaultTPCamera;
// change Decision Period & Take Actions Between Decisions
transform.GetComponent<DecisionRequester>().DecisionPeriod = DataTransfer.DecisionPeriod;
@ -156,6 +163,7 @@ public class AgentWithGun : Agent
UICon = transform.GetComponent<UIController>();
HistoryRec = transform.GetComponent<HistoryRecorder>();
rayScript = GetComponent<RaySensors>();
camChanger = cameraChangerOBJ.GetComponent<CameraChange>();
// assign the default reward values to the reward fields that will be used.
nonReward = nonRewardDefault;
@ -167,6 +175,15 @@ public class AgentWithGun : Agent
killReward = killRewardDefault;
//initialize remainTime
remainTime = (int)(timeLimit - Time.time + startTime);
// change default camera view
if (defaultTPCamera)
{
camChanger.ShowTPSView();
}
else
{
camChanger.ShowFPSView();
}
}
}
@ -203,27 +220,9 @@ public class AgentWithGun : Agent
// ------------Action handling--------------
// moveAgent simulates Input.GetAxis-style movement
public void moveAgent(int kW, int kS,int kA,int kD)
public void moveAgent(int vertical, int horizontal)
{
Vector3 thisMovement;
int horizontal = 0;
int vertical = 0;
if (kW==1 && kS != 1)
{
vertical = 1;
}
else if (kS==1 && kW!=1)
{
vertical = -1;
}
if (kD==1 && kA!=1)
{
horizontal = 1;
}
else if (kA ==1 && kD!=1)
{
horizontal = -1;
}
if (horizontal != 0)// key pressed (horizontal direction)
{
@ -295,7 +294,7 @@ public class AgentWithGun : Agent
// ------------Action handling--------------
// cameraControl controls the Agent's camera rotation
public void cameraControl(float Mouse_X,float Mouse_Y)
public void cameraControl(float Mouse_X, float Mouse_Y)
{
//Mouse_X = Input.GetAxis("Mouse X") * MouseSensitivity * Time.deltaTime;
//Debug.Log(Input.GetAxis("Mouse X"));
@ -359,7 +358,7 @@ public class AgentWithGun : Agent
RaycastHit hit;
Debug.DrawRay(ray.origin, ray.direction * 100, Color.blue);
bool isGunReady = gunReady();
UICon.updateShootKeyViewer(shoot,isGunReady);
UICon.updateShootKeyViewer(shoot, isGunReady);
//left mouse button pressed
if (shoot != 0 && isGunReady == true)
{
@ -420,12 +419,12 @@ public class AgentWithGun : Agent
{
GameObject[] EnemyGameObjs;
EnemyGameObjs = GameObject.FindGameObjectsWithTag("Enemy");
if(EnemyGameObjs.Length <= 1)
if (EnemyGameObjs.Length <= 1)
{
//all Enemies killed successfully
return 1;
}
else if(Time.time - startTime >= timeLimit)
else if (Time.time - startTime >= timeLimit)
{
//timed out: lose
return 2;
@ -477,9 +476,9 @@ public class AgentWithGun : Agent
{
float epreward = 0f;
// kill reward check
if(enemyKillCount > 0)
if (enemyKillCount > 0)
{
for(int i = 0;i < enemyKillCount; i++)
for (int i = 0; i < enemyKillCount; i++)
{
epreward += killReward;
}
@ -506,7 +505,7 @@ public class AgentWithGun : Agent
}
if (lockMouse)
{
Cursor.lockState = CursorLockMode.Locked; // 隐藏并且锁定鼠标
Cursor.lockState = CursorLockMode.Locked; // hide and lock the mouse
}
//iniCharts();
thisAgentObj.name = thisAgentObj.GetInstanceID().ToString();
@ -549,35 +548,26 @@ public class AgentWithGun : Agent
public override void OnActionReceived(ActionBuffers actionBuffers)
{
//get inputs
int kW = actionBuffers.DiscreteActions[0];
int kS = actionBuffers.DiscreteActions[1];
int kA = actionBuffers.DiscreteActions[2];
int kD = actionBuffers.DiscreteActions[3];
int mouseShoot = actionBuffers.DiscreteActions[4];
int vertical = actionBuffers.DiscreteActions[0];
int horizontal = actionBuffers.DiscreteActions[1];
int mouseShoot = actionBuffers.DiscreteActions[2];
float Mouse_X = actionBuffers.ContinuousActions[0];
//float Mouse_Y = actionBuffers.ContinuousActions[1];
//int timeLimitControl = (int)actionBuffers.ContinuousActions[2];
//float nonRewardIn = actionBuffers.ContinuousActions[1];
//float shootRewardIn = actionBuffers.ContinuousActions[2];
//float shootWithoutReadyRewardIn = actionBuffers.ContinuousActions[3];
//float hitRewardIn = actionBuffers.ContinuousActions[4];
//float winRewardIn = actionBuffers.ContinuousActions[5];
// loseRewardIn = actionBuffers.ContinuousActions[6];
//float killRewardIn = actionBuffers.ContinuousActions[7];
//Rewards Update
if (vertical == 2) vertical = -1;
if (horizontal == 2) horizontal = -1;
remainTime = (int)(timeLimit - Time.time + startTime);
//apply inputs
shoot = mouseShoot;
HistoryRec.realTimeKeyCounter(kW, kS, kA, kD, shoot);
HistoryRec.realTimeKeyCounter(vertical, horizontal, shoot);
(int kWCount, int kSCount, int kACount, int kDCount, int shootCount) = HistoryRec.getKeyCount();
UICon.updateRemainTime(remainTime);
UICon.updateWASDKeyViewer(kW, kS, kA, kD);
UICon.updateRemainEnemy(enemyNum);
UICon.updateWASDKeyViewer(vertical, horizontal);
UICon.updateKeyCounterChart(kWCount, kSCount, kACount, kDCount, shootCount);
UICon.updateMouseMovementViewer(Mouse_X);
UICon.updateRewardViewer(nonReward, shootReward, shootWithoutReadyReward, hitReward, winReward, loseReward, killReward);
cameraControl(Mouse_X, 0);
moveAgent(kW, kS, kA, kD);
moveAgent(vertical, horizontal);
float thisRoundReward = rewardCalculate();
//check whether the episode has finished
@ -595,7 +585,7 @@ public class AgentWithGun : Agent
Debug.Log("reward = " + winReward);
EndEpisode();
}
else if(finished == 2)
else if (finished == 2)
{
//Lose Finished
HistoryRec.addRealTimeReward(loseReward);
@ -628,37 +618,45 @@ public class AgentWithGun : Agent
ActionSegment<float> continuousActions = actionsOut.ContinuousActions;
ActionSegment<int> discreteActions = actionsOut.DiscreteActions;
int kW = 0;
int kS = 0;
int kA = 0;
int kD = 0;
if (Input.GetKey(KeyCode.W))
int vertical = 0;
int horizontal = 0;
if (Input.GetKey(KeyCode.W) && !Input.GetKey(KeyCode.S))
{
kW = 1;
vertical = 1;
}
if (Input.GetKey(KeyCode.S))
else if (Input.GetKey(KeyCode.S) && !Input.GetKey(KeyCode.W))
{
kS = 1;
vertical = -1;
}
if (Input.GetKey(KeyCode.A))
else
{
kA = 1;
vertical = 0;
}
if (Input.GetKey(KeyCode.D))
if (Input.GetKey(KeyCode.D) && !Input.GetKey(KeyCode.A))
{
kD = 1;
horizontal = 1;
}
else if (Input.GetKey(KeyCode.A) && !Input.GetKey(KeyCode.D))
{
horizontal = -1;
}
else
{
horizontal = 0;
}
discreteActions[0] = kW;
discreteActions[1] = kS;
discreteActions[2] = kA;
discreteActions[3] = kD;
if (Input.GetMouseButton(0))
{
// Debug.Log("mousebuttonhit");
shoot = 1;
}
discreteActions[4] = shoot;
else
{
shoot = 0;
}
discreteActions[0] = vertical;
discreteActions[1] = horizontal;
discreteActions[2] = shoot;
//^^^^^^^^^^^^^^^^^^^^^discrete-Control^^^^^^^^^^^^^^^^^^^^^^
//vvvvvvvvvvvvvvvvvvvvvvvvvvvvvcontinuous-Controlvvvvvvvvvvvvvvvvvvvvvv
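
For reference, here is a small Python sketch (illustrative only, not part of this commit) of the collapsed discrete action layout the agent now uses: three branches [vertical, horizontal, shoot], where OnActionReceived maps a branch value of 2 to -1.

def _to_axis(v: int) -> int:
    # Mirror OnActionReceived: branch values {0, 1, 2} map to axis values {0, 1, -1}.
    return -1 if v == 2 else v

def decode_discrete(vertical: int, horizontal: int, shoot: int):
    """Decode the three discrete branches into movement axes plus the shoot flag."""
    return _to_axis(vertical), _to_axis(horizontal), shoot

# Example: W + A pressed, no shot -> branches (1, 2, 0) -> axes (1, -1, 0)
assert decode_discrete(1, 2, 0) == (1, -1, 0)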

View File

@ -31,24 +31,24 @@ public class HistoryRecorder : MonoBehaviour
{
EPTotalShootCount.Add(TotalShootCount);
}
public void realTimeKeyCounter(int kW, int kS, int kA, int kD, int shoot)
public void realTimeKeyCounter(int vertical, int horizontal, int shoot)
{
if (kW == 1)
if (vertical == 1)
{
realTimeWKeyCount += 1;
}
if (kS == 1)
else if (vertical == -1)
{
realTimeSKeyCount += 1;
}
if (kA == 1)
{
realTimeAKeyCount += 1;
}
if (kD == 1)
if (horizontal == 1)
{
realTimeDKeyCount += 1;
}
else if (horizontal == -1)
{
realTimeAKeyCount += 1;
}
if (shoot == 1)
{
realTimeShootCount += 1;

View File

@ -69,39 +69,37 @@ public class UIController : MonoBehaviour
}
//------------Key Viewer----------
public void updateWASDKeyViewer(int kW,int kS,int kA,int kD)
public void updateWASDKeyViewer(int vertical,int horizontal)
{
if (kW == 1)
if (vertical == 1)
{
upText.color = Color.red;
downText.color = Color.black;
}
else
{
upText.color = Color.black;
}
if (kS == 1)
else if (vertical == -1)
{
downText.color = Color.red;
upText.color = Color.black;
}
else
{
downText.color = Color.black;
upText.color = Color.black;
}
if(kA == 1)
{
leftText.color = Color.red;
}
else
{
leftText.color = Color.black;
}
if( kD == 1)
if (horizontal == 1)
{
rightText.color = Color.red;
leftText.color = Color.black;
}
else if (horizontal == -1)
{
leftText.color = Color.red;
rightText.color = Color.black;
}
else
{
rightText.color = Color.black;
downText.color = Color.black;
upText.color = Color.black;
}
}
public void updateShootKeyViewer(int shoot,bool isGunReady)

View File

@ -0,0 +1,27 @@
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
public class gameFlowController : MonoBehaviour
{
public GameObject Agent;
AgentWithGun agentWithGun;
// Start is called before the first frame update
void Start()
{
agentWithGun = Agent.GetComponent<AgentWithGun>();
}
// Update is called once per frame
void Update()
{
if (Input.GetKey(KeyCode.Escape))
{
Application.Quit();
}
if (Input.GetKey(KeyCode.L))
{
agentWithGun.lockMouse = !agentWithGun.lockMouse;
}
}
}

View File

@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 9a8fb4d12d4b8fc4784f3e142e7fdcf8
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@ -19,6 +19,21 @@ public class EnvArgsChanger : MonoBehaviour
public Text DecisionPeriodDataText;
public Toggle TakeActionsBetweenDecisionsToggle;
[Header("Lock Mouse")]
public Toggle LockMouseToggle;
[Header("Default Camera")]
public Toggle FPToggle;
public Text FPText;
public Toggle TPToggle;
public Text TPText;
private StartSeneData startSeneData;
private void Start()
{
startSeneData = DataTransfer.GetComponent<StartSeneData>();
}
public void onEnemynumValueChanged()
{
@ -30,7 +45,7 @@ public class EnvArgsChanger : MonoBehaviour
else
{
EnemyNumText.color = Color.yellow;
DataTransfer.GetComponent<StartSeneData>().EnemyNum = Math.Abs(int.Parse(EnemyNumInput.GetComponent<InputField>().text));
startSeneData.EnemyNum = Math.Abs(int.Parse(EnemyNumInput.GetComponent<InputField>().text));
}
}
@ -44,19 +59,48 @@ public class EnvArgsChanger : MonoBehaviour
else
{
TimeLimText.color = Color.yellow;
DataTransfer.GetComponent<StartSeneData>().Timelim = Math.Abs(int.Parse(TimelimInput.GetComponent<InputField>().text));
startSeneData.Timelim = Math.Abs(int.Parse(TimelimInput.GetComponent<InputField>().text));
}
}
public void onDPSlideValueChanged()
{
// DecisionPeriod(DP) value Control
DataTransfer.GetComponent<StartSeneData>().DecisionPeriod = (int)(DecisionPeriodSlide.GetComponent<Slider>().value);
DecisionPeriodDataText.text = DataTransfer.GetComponent<StartSeneData>().DecisionPeriod.ToString();
startSeneData.DecisionPeriod = (int)(DecisionPeriodSlide.GetComponent<Slider>().value);
DecisionPeriodDataText.text = startSeneData.DecisionPeriod.ToString();
}
public void onABDToggleChanged()
{
// Actions Between Decisions(ABD) Toggle Control
DataTransfer.GetComponent<StartSeneData>().ActionsBetweenDecisions = TakeActionsBetweenDecisionsToggle.isOn;
startSeneData.ActionsBetweenDecisions = TakeActionsBetweenDecisionsToggle.isOn;
}
public void onLockMouseToggleChanged()
{
// lock mouse or not
startSeneData.lockMouse = LockMouseToggle.isOn;
}
public void onTPCamToggleChanged()
{
startSeneData.defaultTPCamera = true;
FPToggle.interactable = true;
FPToggle.SetIsOnWithoutNotify(false);
FPText.color = Color.gray;
TPToggle.SetIsOnWithoutNotify(true);
TPToggle.interactable = false;
TPText.color = Color.green;
}
public void onFPCameToggleChanged()
{
startSeneData.defaultTPCamera = false;
TPToggle.interactable = true;
TPToggle.SetIsOnWithoutNotify(false);
TPText.color = Color.gray;
FPToggle.SetIsOnWithoutNotify(true);
FPToggle.interactable = false;
FPText.color = Color.green;
}
}

View File

@ -13,6 +13,8 @@ public class StartSeneData : MonoBehaviour
public float killRewardDefault = 10.0f;
public float winRewardDefault = 20.0f;
public float loseRewardDefault = -10.0f;
public bool lockMouse = false;
public bool defaultTPCamera = true;
// LoadDir
[System.NonSerialized]public string LoadDirDate = "0";

View File

@ -0,0 +1,90 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"√√√√√Enviroment Initialized Success√√√√√\n"
]
}
],
"source": [
"import time\n",
"import aimBotEnv\n",
"from HumanAction import HumanActions\n",
"\n",
"# Env\n",
"ENV_PATH = \"./Build-CloseEnemyCut/Aimbot-PPO\"\n",
"WORKER_ID = 1\n",
"BASE_PORT = 200\n",
"\n",
"MOUSEDISCOUNT = 8.0\n",
"MAX_EP = 10000000\n",
"\n",
"env = aimBotEnv.makeEnv(envPath=ENV_PATH, workerID=WORKER_ID, basePort=BASE_PORT)\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"ename": "UnityCommunicatorStoppedException",
"evalue": "Communicator has exited.",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mUnityCommunicatorStoppedException\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m~\\AppData\\Local\\Temp/ipykernel_37248/645561173.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mdone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[0mactions\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdemoAct\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetHumanActions\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 7\u001b[1;33m \u001b[0menv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mactions\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mactions\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32mc:\\Users\\UCUNI\\OneDrive\\Unity\\ML-Agents\\Aimbot-PPO\\Aimbot-PPO-Python\\aimBotEnv.py\u001b[0m in \u001b[0;36mstep\u001b[1;34m(self, actions, behaviorName, trackedAgent)\u001b[0m\n\u001b[0;32m 72\u001b[0m \u001b[1;31m# take action to env\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 73\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0menv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset_actions\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbehavior_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbehaviorName\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maction\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mthisActionTuple\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 74\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0menv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 75\u001b[0m \u001b[1;31m# get nextState & reward & done after this action\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 76\u001b[0m \u001b[0mnextState\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreward\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mloadDir\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msaveNow\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetSteps\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbehaviorName\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtrackedAgent\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\mlagents_envs\\timers.py\u001b[0m in \u001b[0;36mwrapped\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 303\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mwrapped\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 304\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mhierarchical_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__qualname__\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 305\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 306\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 307\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mwrapped\u001b[0m \u001b[1;31m# type: ignore\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\mlagents_envs\\environment.py\u001b[0m in \u001b[0;36mstep\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 333\u001b[0m \u001b[0moutputs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_communicator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexchange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstep_input\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_poll_process\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 334\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0moutputs\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 335\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mUnityCommunicatorStoppedException\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Communicator has exited.\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 336\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_update_behavior_specs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0moutputs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 337\u001b[0m \u001b[0mrl_output\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0moutputs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrl_output\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mUnityCommunicatorStoppedException\u001b[0m: Communicator has exited."
]
}
],
"source": [
"done = False\n",
"env.reset()\n",
"demoAct = HumanActions(mouseDiscount=MOUSEDISCOUNT)\n",
"for ep in range(MAX_EP):\n",
" while not done:\n",
" actions = demoAct.getHumanActions()\n",
" env.step(actions=actions)6\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.7 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "86e2db13b09bd6be22cb599ea60c1572b9ef36ebeaa27a4c8e961d6df315ac32"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,51 @@
import keyboard
import mouse
class HumanActions:
def __init__(self, mouseDiscount: float = 10, screenW: int = 1920, screenH: int = 1080):
def multiPressed():
pass
keyboard.add_hotkey("w+a", multiPressed)
keyboard.add_hotkey("w+d", multiPressed)
keyboard.add_hotkey("s+a", multiPressed)
keyboard.add_hotkey("s+d", multiPressed)
self.screenW = screenW
self.screenH = screenH
self.MOUSEDISCOUNT = mouseDiscount
def getHumanActions(self):
x, _ = mouse.get_position()
xMovement = (x - self.screenW / 2) / self.MOUSEDISCOUNT
ws = 0
ad = 0
click = 0
if keyboard.is_pressed("w"):
ws = 1
elif keyboard.is_pressed("s"):
ws = 2
if keyboard.is_pressed("d"):
ad = 1
elif keyboard.is_pressed("a"):
ad = 2
if keyboard.is_pressed("w+d"):
ws = 1
ad = 1
elif keyboard.is_pressed("w+a"):
ws = 1
ad = 2
elif keyboard.is_pressed("s+d"):
ws = 2
ad = 1
elif keyboard.is_pressed("s+a"):
ws = 2
ad = 2
if mouse.is_pressed(button="left"):
click = 1
actions = [ws, ad, click, [xMovement]]
mouse.move(self.screenW / 2, self.screenH / 2)
return actions
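
Below is a hedged usage sketch of HumanActions, mirroring the notebook earlier in this commit; the tuple returned by env.step is inferred from the aimBotEnv traceback above and is an assumption, not a documented API.

import aimBotEnv
from HumanAction import HumanActions

# same arguments as the notebook above
env = aimBotEnv.makeEnv(envPath="./Build-CloseEnemyCut/Aimbot-PPO", workerID=1, basePort=200)
demoAct = HumanActions(mouseDiscount=8.0)

env.reset()
done = False
while not done:
    # actions = [ws, ad, click, [xMovement]]; ws/ad use 0/1/2, where 2 means S/A (the negative direction)
    actions = demoAct.getHumanActions()
    # assumption: step() returns (nextState, reward, done, loadDir, saveNow), as the traceback suggests
    nextState, reward, done, loadDir, saveNow = env.step(actions=actions)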

File diff suppressed because it is too large

View File

@ -1,108 +1,213 @@
import tensorflow as tf
from tensorflow.python.ops.numpy_ops import ndarray
import tensorflow_probability as tfp
import numpy as np
import time
import math
import copy
import datetime
import os
from PPOConfig import PPOConfig
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from keras_radam import RAdam
EPS = 1e-10
class PPO(object):
"""Create PPO Agent
"""
def __init__(
self,
stateSize: int,
disActShape: list,
conActSize: int,
conActRange: float,
PPOConfig: PPOConfig,
):
"""initialize PPO
def __init__(self, stateSize, disActShape, conActSize, conActRange, criticLR, actorLR, gamma, epsilon, entropyWeight, saveDir, loadModelDir):
# check disActShape is correct(greater than 1)
try:
if np.any(np.array(disActShape)<=1):
raise ValueError("disActShape error,disActShape should greater than 1 but get",disActShape)
except ValueError as e:
raise
Args:
stateSize (int): environment state size
disActShape (list): discrete action shape,
e.g. [3,2] means 2 discrete action branches with 3 and 2 choices each;
use [0] if there is no discrete action output.
conActSize (int): continuous action size; use 0 if there is no continuous action output.
conActRange (float): continuous action range. -conActRange to +conActRange
PPOConfig (PPOConfig): PPO configuration
"""
# check use dis action or not.
if disActShape == [0]:
# non dis action output
self.disActSize = 0
self.disOutputSize = 0
else:
# make sure disActShape greater than 1
try:
if np.any(np.array(disActShape) <= 1):
raise ValueError(
"disActShape error,disActShape should greater than 1 but get", disActShape
)
except ValueError:
raise
self.disActSize = len(disActShape)
self.disOutputSize = sum(disActShape)
self.stateSize = stateSize
# self.actionSize = actionSize
self.disActShape = disActShape # shape of discrete action output. like [3,3,2]
self.disActSize = len(disActShape)
self.disActShape = disActShape
self.conActSize = conActSize
self.conActRange = conActRange
self.criticLR = criticLR
self.actorLR = actorLR
self.GAMMA = gamma
self.EPSILON = epsilon
self.saveDir = saveDir
self.entropyWeight = entropyWeight
self.muSigSize = 2
self.conOutputSize = conActSize * self.muSigSize
self.disOutputSize = sum(disActShape)
self.conOutputSize = conActSize * 2
# config
self.NNShape = PPOConfig.NNShape
self.criticLR = PPOConfig.criticLR
self.actorLR = PPOConfig.actorLR
self.gamma = PPOConfig.gamma
self.lmbda = PPOConfig.lmbda
self.clipRange = PPOConfig.clipRange
self.entropyWeight = PPOConfig.entropyWeight
self.trainEpochs = PPOConfig.trainEpochs
self.saveDir = PPOConfig.saveDir
self.loadModelDir = PPOConfig.loadModelDir
print("---------thisPPO Params---------")
print("self.stateSize = ", self.stateSize)
print("self.disActShape = ", self.disActShape)
print("self.disActSize", self.disActSize)
print("self.disOutputSize", self.disOutputSize)
print("self.conActSize = ", self.conActSize)
print("self.conActRange = ", self.conActRange)
print("self.conOutputSize = ", self.conOutputSize)
if loadModelDir == None:
# config
print("---------thisPPO config---------")
print("self.NNShape = ", self.NNShape)
print("self.criticLR = ", self.criticLR)
print("self.actorLR = ", self.actorLR)
print("self.gamma = ", self.gamma)
print("self.lmbda = ", self.lmbda)
print("self.clipRange = ", self.clipRange)
print("self.entropyWeight = ", self.entropyWeight)
print("self.trainEpochs = ", self.trainEpochs)
print("self.saveDir = ", self.saveDir)
print("self.loadModelDir = ", self.loadModelDir)
# load NN or not
if self.loadModelDir is None:
# critc NN
self.critic = self.buildCriticNet(self.stateSize, 1, compileModel = True)
self.critic = self.buildCriticNet(self.stateSize, 1, compileModel=True)
# actor NN
self.actor = self.buildActorNet(self.stateSize, self.conActRange, compileModel = True)
self.actor = self.buildActorNet(self.stateSize, compileModel=True)
print("---------Actor Model Create Success---------")
self.actor.summary()
print("---------Critic Model Create Success---------")
self.critic.summary()
else:
# critc NN
self.critic = self.buildCriticNet(self.stateSize, 1, compileModel=True)
# actor NN
self.actor = self.buildActorNet(self.stateSize, self.conActRange, compileModel=True)
self.actor = self.buildActorNet(self.stateSize, compileModel=True)
# load weight to Critic&Actor NN
self.loadWeightToModels(loadModelDir)
self.loadWeightToModels(self.loadModelDir)
print("---------Actor Model Load Success---------")
self.actor.summary()
print("---------Critic Model Load Success---------")
self.critic.summary()
# Build Net
def buildActorNet(self, inputSize, continuousActionRange,compileModel):
def buildActorNet(self, inputSize: int, compileModel: bool):
"""build Actor Nueral Net and compile.Output:[disAct1,disAct2,disAct3,mu,sigma]
Args:
inputSize (int): InputLayer Neural size.
continuousActionRange (float): continuous Action's max Range.
compileModel (bool): compile Model or not.
Returns:
keras.Model: return Actor NN
"""
stateInput = layers.Input(shape=(inputSize,), name='stateInput')
dense0 = layers.Dense(500, activation='relu',name='dense0',)(stateInput)
dense1 = layers.Dense(200, activation='relu',name='dense1',)(dense0)
dense2 = layers.Dense(100, activation='relu', name='dense2')(dense1)
# -----------Input Layers-----------
stateInput = layers.Input(shape=(inputSize,), name="stateInput")
disAct1 = layers.Dense(3, activation='softmax',name='WSAction')(dense2) # WS
disAct2 = layers.Dense(3, activation='softmax',name='ADAction')(dense2) # AD
disAct3 = layers.Dense(2, activation='softmax',name='ShootAction')(dense2) # Mouse shoot
mu = continuousActionRange * layers.Dense(1, activation='tanh', name='muOut')(dense2) # mu, the mean of the normal distribution
sigma = 1e-8 + layers.Dense(1, activation='softplus',name='sigmaOut')(dense2) # sigma of the normal distribution
# musig = layers.concatenate([mu,sigma],name = 'musig')
totalOut = layers.concatenate(
[disAct1, disAct2, disAct3, mu, sigma], name='totalOut') # package
# -------Intermediate layers--------
interLayers = []
interLayersIndex = 0
for neuralUnit in self.NNShape:
thisLayerName = "dense" + str(interLayersIndex)
if interLayersIndex == 0:
interLayers.append(
layers.Dense(neuralUnit, activation="relu", name=thisLayerName)(stateInput)
)
else:
interLayers.append(
layers.Dense(neuralUnit, activation="relu", name=thisLayerName)(interLayers[-1])
)
interLayersIndex += 1
# ----------Output Layers-----------
outputLayersList = []
if self.disActSize != 0:
# while NN have discrete action output.
disActIndex = 0
for thisDisActDepth in self.disActShape:
thisDisActName = "disAct" + str(disActIndex)
outputLayersList.append(
layers.Dense(thisDisActDepth, activation="softmax", name=thisDisActName)(
interLayers[-1]
)
)
disActIndex += 1
if self.conActSize != 0:
# while NN have continuous action output.
mu = tf.multiply(
layers.Dense(1, activation="tanh", name="muOut")(interLayers[-1]), self.conActRange
) # mu, the location parameter of the normal distribution
sigma = tf.add(
layers.Dense(1, activation="softplus", name="sigmaOut")(interLayers[-1]), EPS
) # sigma, the scale parameter of the normal distribution
outputLayersList.append(mu)
outputLayersList.append(sigma)
totalOut = layers.concatenate(outputLayersList, name="totalOut") # package
# ----------Model Compile-----------
model = keras.Model(inputs=stateInput, outputs=totalOut)
#actorOPT = optimizers.Adam(learning_rate = self.actorLR)
if compileModel:
actorOPT = RAdam(self.actorLR)
if compileModel: # Compile Model
actorOPT = optimizers.Adam(learning_rate=self.actorLR)
model.compile(optimizer=actorOPT, loss=self.aLoss())
return model
def buildCriticNet(self, inputSize, outputSize,compileModel):
def buildCriticNet(self, inputSize: int, outputSize: int, compileModel: bool):
"""build Critic Nueral Net and compile.Output:[Q]
Args:
inputSize (int): InputLayer Neural Size
outputSize (float): Q size
inputSize (int): input size
outputSize (int): output size
compileModel (bool): compile Model or not.
Returns:
keras.Model: return Critic NN
"""
stateInput = keras.Input(shape=(inputSize,))
dense0 = layers.Dense(500, activation='relu',
name='dense0',)(stateInput)
dense1 = layers.Dense(200, activation='relu')(dense0)
dense2 = layers.Dense(100, activation='relu')(dense1)
output = layers.Dense(outputSize)(dense2)
# -----------Input Layers-----------
stateInput = keras.Input(shape=(inputSize,), name="stateInput")
# -------Intermediate layers--------
interLayers = []
interLayersIndex = 0
for neuralUnit in self.NNShape:
thisLayerName = "dense" + str(interLayersIndex)
if interLayersIndex == 0:
interLayers.append(
layers.Dense(neuralUnit, activation="relu", name=thisLayerName)(stateInput)
)
else:
interLayers.append(
layers.Dense(neuralUnit, activation="relu", name=thisLayerName)(interLayers[-1])
)
interLayersIndex += 1
# ----------Output Layers-----------
output = layers.Dense(outputSize, activation=None)(interLayers[-1])
# ----------Model Compile-----------
model = keras.Model(inputs=stateInput, outputs=output)
if compileModel:
criticOPT = optimizers.Adam(learning_rate=self.criticLR)
@ -110,39 +215,53 @@ class PPO(object):
return model
# loss Function
# critic loss
def cLoss(self):
"""Critic Loss function
"""
"""Critic Loss function"""
def loss(y_true, y_pred):
# y_true: discountedR
# y_pred: critcV = model.predict(states)
advantage = y_true - y_pred # TD error
loss = tf.reduce_mean(tf.square(advantage))
adv = y_true - y_pred # TD error
loss = tf.reduce_mean(tf.square(adv))
return loss
return loss
# actor loss
def aLoss(self):
def getDiscreteALoss(nowProbs,oldProbs,advantage):
"""Actor Loss function"""
def getDiscreteALoss(nowProbs, oldProbs, disOneHotAct, actShape, advantage):
"""get Discrete Action Loss
Args:
nowProbs (tf.constant): (length,actionSize)
oldProbs (tf.constant): (length,actionSize)
nowProbs (tf.constant): (length,actionProbSize)
oldProbs (tf.constant): (length,actionProbSize)
disOneHotAct (tf.constant): (length,actionProbSize) one-hot encoded discrete actions
actShape (int): number of choices in this discrete action branch
advantage (tf.constant): (length,)
Returns:
tf.constant: (length,)
"""
entropy = tf.reduce_mean(tf.math.multiply(nowProbs,tf.math.log(nowProbs+1e-6)))
ratio = tf.math.divide(nowProbs,oldProbs+1e-6)
value = tf.math.multiply(ratio,tf.expand_dims(advantage,axis = 1))
clipRatio = tf.clip_by_value(ratio,1. - self.EPSILON,1.+self.EPSILON)
clipValue = tf.math.multiply(clipRatio,tf.expand_dims(advantage,axis = 1))
loss = -tf.reduce_mean(tf.math.minimum(value,clipValue)) + self.entropyWeight * entropy
entropy = tf.negative(
tf.reduce_mean(tf.math.multiply(nowProbs, tf.math.log(nowProbs + EPS)))
)
nowSingleProbs = tf.reduce_mean(tf.multiply(nowProbs, disOneHotAct), axis=1)
nowSingleProbs = tf.multiply(nowSingleProbs, actShape)
oldSingleProbs = tf.reduce_mean(tf.multiply(oldProbs, disOneHotAct), axis=1)
oldSingleProbs = tf.multiply(oldSingleProbs, actShape)
ratio = tf.math.divide(nowSingleProbs, oldSingleProbs + EPS)
value = tf.math.multiply(ratio, advantage)
clipRatio = tf.clip_by_value(ratio, 1.0 - self.clipRange, 1.0 + self.clipRange)
clipValue = tf.math.multiply(clipRatio, advantage)
loss = tf.math.negative(
tf.reduce_mean(tf.math.minimum(value, clipValue))
- tf.multiply(self.entropyWeight, entropy)
)
return loss
def getContinuousALoss(musig,actions,oldProbs,advantage):
def getContinuousALoss(musig, actions, oldProbs, advantage):
"""get Continuous Action Loss
Args:
@ -154,150 +273,376 @@ class PPO(object):
Returns:
tf.constant: (length,)
"""
mu = musig[:,0]
sigma = musig[:,1]
dist = tfp.distributions.Normal(mu,sigma)
mu = musig[:, 0]
sigma = musig[:, 1]
dist = tfp.distributions.Normal(mu, sigma)
nowProbs = dist.prob(actions)
ratio = tf.math.divide(nowProbs,oldProbs+1e-6)
entropy = tf.reduce_mean(dist.entropy())
value = tf.math.multiply(ratio,tf.expand_dims(advantage,axis = 1))
clipValue = tf.clip_by_value(ratio,1. - self.EPSILON,1.+self.EPSILON) * advantage
loss = -tf.reduce_mean(tf.math.minimum(value,clipValue)) + self.entropyWeight * entropy
ratio = tf.math.divide(nowProbs, oldProbs + EPS)
value = tf.math.multiply(ratio, advantage)
clipRatio = tf.clip_by_value(ratio, 1.0 - self.clipRange, 1.0 + self.clipRange)
clipValue = tf.math.multiply(clipRatio, advantage)
loss = tf.negative(
tf.reduce_mean(tf.math.minimum(value, clipValue))
- tf.multiply(self.entropyWeight, entropy)
)
return loss
def loss(y_true, y_pred):
# y_true: [[disAct1, disAct2, disAct3, mu, sigma]]
# y_pred: muSigma = self.actor(state) =
# [[disAct1, disAct2, disAct3, mu, sigma]]
oldDisProbs = y_true[:,0:self.disOutputSize]
oldConMusigs = y_true[:,self.disOutputSize:self.disOutputSize+self.conActSize]
conActions = y_true[:,self.disOutputSize+self.conActSize:self.disOutputSize+(self.conActSize*2)]
advantage = y_true[:,-1]
nowDisProbs = y_pred[:,0:self.disOutputSize] # [disAct1, disAct2, disAct3]
nowConMusigs = y_pred[:,self.disOutputSize:] #[musig1,musig2]
totalALoss = tf.constant([0.])
# y_true: [[disActProb..., conActProbs..., disOneHotActs..., conAct..., advantage]]
# y_pred: [[disActProb..., mu, sigma...]]
totalALoss = 0
totalActionNum = 0
advantage = tf.expand_dims(y_true[:, -1], axis=1)
# for nowProb,oldProb in zip(tf.transpose(nowDisProbs,perm=[1,0,2]),tf.transpose(oldDisProbs,perm=[1,0,2])):
lastDisActShape = 0
for shape in self.disActShape:
thisNowDisProbs = nowDisProbs[:,lastDisActShape:lastDisActShape+shape]
thisOldDisProbs = oldDisProbs[:,lastDisActShape:lastDisActShape+shape]
discreteALoss = getDiscreteALoss(thisNowDisProbs,thisOldDisProbs,advantage)
lastDisActShape += shape
totalALoss += discreteALoss
totalActionNum += 1
# for nowConMusig,conAction,oldPiProb in zip(tf.transpose(nowConMusigs,perm=[1,0,2]),conActions,oldPiProbs):
lastConAct = 0
for act in range(self.conActSize):
thisNowConMusig = nowConMusigs[:,lastConAct:lastConAct+((act+1)*2)]
thisOldConMusig = oldConMusigs[:,lastConAct:lastConAct+((act+1)*2)]
thisConAction = conActions[:,act]
continuousAloss = getContinuousALoss(thisNowConMusig,thisConAction,thisOldConMusig,advantage)
totalALoss += continuousAloss
totalActionNum += 1
loss = tf.divide(totalALoss,totalActionNum)
if self.disActSize != 0:
# while NN have discrete action output.
oldDisProbs = y_true[:, 0 : self.disOutputSize]
nowDisProbs = y_pred[:, 0 : self.disOutputSize] # [disAct1, disAct2, disAct3]
disOneHotActs = y_true[
:,
self.disOutputSize
+ self.conActSize : self.disOutputSize
+ self.conActSize
+ self.disOutputSize,
]
lastDisActShape = 0
for thisShape in self.disActShape:
thisNowDisProbs = nowDisProbs[:, lastDisActShape : lastDisActShape + thisShape]
thisOldDisProbs = oldDisProbs[:, lastDisActShape : lastDisActShape + thisShape]
thisDisOneHotActs = disOneHotActs[
:, lastDisActShape : lastDisActShape + thisShape
]
discreteALoss = getDiscreteALoss(
thisNowDisProbs, thisOldDisProbs, thisDisOneHotActs, thisShape, advantage
)
lastDisActShape += thisShape
totalALoss += discreteALoss
totalActionNum += 1.0
if self.conActSize != 0:
# while NN have continuous action output.
oldConProbs = y_true[:, self.disOutputSize : self.disOutputSize + self.conActSize]
conActions = y_true[
:,
self.disOutputSize
+ self.conActSize : self.disOutputSize
+ self.conActSize
+ self.conActSize,
]
nowConMusigs = y_pred[:, self.disOutputSize :] # [musig1,musig2]
lastConAct = 0
for conAct in range(self.conActSize):
thisNowConMusig = nowConMusigs[:, lastConAct : lastConAct + self.muSigSize]
thisOldConProb = oldConProbs[:, conAct : conAct + 1]
thisConAction = conActions[:, conAct]
continuousAloss = getContinuousALoss(
thisNowConMusig, thisConAction, thisOldConProb, advantage
)
totalALoss += continuousAloss
totalActionNum += 1.0
lastConAct += self.muSigSize
loss = tf.divide(totalALoss, totalActionNum)
return loss
return loss
# get Action&V
def chooseAction(self, state):
# get Actions&values
def chooseAction(self, state: ndarray):
"""Agent choose action to take
Args:
state (np.array): enviroment state
state (ndarray): environment state
Returns:
np.array:
disAct1,
discreteAction1
disAct2,
discreteAction2
disAct3,
discreteAction3
conAction,
continuousAction
actions,
actions list, 2 dims like [[0],[1],[1.5]]
predictResult,
actor NN predict Result output
"""
# let actor choose action,use the normal distribution
# state = np.expand_dims(state,0)
# check state dimension is [1,statesize]
if state.ndim!=2:
state = state.reshape([1,self.stateSize])
# check state dimension is [stateNum,statesize]
if state.ndim != 2:
stateNum = int(len(state) / self.stateSize)
state = state.reshape([stateNum, self.stateSize])
predictResult = self.actor(state) # get predict result [[disAct1, disAct2, disAct3, musig]]
predictResult = predictResult.numpy()
disAct1Prob = predictResult[0][0:3]
disAct2Prob = predictResult[0][3:6]
disAct3Prob = predictResult[0][6:8]
mu = predictResult[0][8]
sigma = predictResult[0][9]
if math.isnan(mu) or math.isnan(sigma):
# check mu or sigma is nan
print("mu or sigma is nan")
# print("predictResult",predictResult)
# predictResult = predictResult.numpy()
actions = []
if self.disActSize != 0:
# while NN have discrete action output.
lastDisActShape = 0
for shape in self.disActShape:
thisDisActProbs = predictResult[:, lastDisActShape : lastDisActShape + shape]
dist = tfp.distributions.Categorical(probs=thisDisActProbs, dtype=tf.float32)
action = int(dist.sample().numpy()[0])
# action = np.argmax(thisDisActProbs)
actions.append(action)
lastDisActShape += shape
if self.conActSize != 0:
# while NN have continuous action output.
lastConAct = 0
for actIndex in range(self.conActSize):
thisMu = predictResult[:, self.disOutputSize + lastConAct]
thisSig = predictResult[:, self.disOutputSize + lastConAct + 1]
if math.isnan(thisMu) or math.isnan(thisSig):
# check mu or sigma is nan
print("chooseAction:mu or sigma is nan")
thisDist = np.random.normal(loc=thisMu, scale=thisSig)
actions.append(np.clip(thisDist, -self.conActRange, self.conActRange))
lastConAct += 2
return actions, predictResult
disAct1 = np.argmax(disAct1Prob) # WS 0 or 1 or 2
disAct2 = np.argmax(disAct2Prob) # AD 0 or 1 or 2
disAct3 = np.argmax(disAct3Prob) # mouse shoot 0 or 1
normDist = np.random.normal(loc=mu, scale=sigma) # normalDistribution
conAction = np.clip(normDist, -self.conActRange,
self.conActRange) # randomly sample an action from the normal distribution
return disAct1, disAct2, disAct3, conAction, predictResult
def trainCritcActor(
self,
states: ndarray,
oldActorResult: ndarray,
actions: ndarray,
rewards: ndarray,
dones: ndarray,
nextState: ndarray,
epochs: int = None,
):
"""train critic&actor use PPO ways
def getCriticV(self, state):
Args:
states (ndarray): states
oldActorResult (ndarray): actor predict result
actions (ndarray): predicted actions, including both discrete and continuous actions
rewards (ndarray): rewards from environment
dones (ndarray): dones from environment
nextState (ndarray): next state from environment
epochs (int, optional): train epochs; if None, uses the value from PPOConfig.
Returns:
tf.constant: criticLoss, actorLoss
"""
if epochs == None:
epochs = self.trainEpochs
criticValues = self.getCriticV(state=states)
discountedR = self.discountReward(nextState, criticValues, dones, rewards)
advantage = self.getGAE(discountedR, criticValues)
criticLoss = self.trainCritic(states, discountedR, epochs)
actorLoss = self.trainActor(states, oldActorResult, actions, advantage, epochs)
# print("A_Loss:", actorLoss, "C_Loss:", criticLoss)
return criticLoss, actorLoss
def trainCritic(self, states: ndarray, discountedR: ndarray, epochs: int = None):
"""critic NN trainning function
Args:
states (ndarray): states
discountedR (ndarray): discounted rewards
epochs (int, optional): train epochs,default to ppoConfig. Defaults to None.
Returns:
tf.constant: all critic losses
"""
if epochs == None:
epochs = self.trainEpochs
his = self.critic.fit(x=states, y=discountedR, epochs=epochs, verbose=0)
return his.history["loss"]
def trainActor(
self,
states: ndarray,
oldActorResult: ndarray,
actions: ndarray,
advantage: ndarray,
epochs: int = None,
):
"""actor NN trainning function
Args:
states (ndarray): states
oldActorResult (ndarray): actor predict results
actions (ndarray): acotor predict actions
advantage (ndarray): GAE advantage
epochs (int, optional): train epochs,default to ppoConfig. Defaults to None.
Returns:
tf.constant: all actor losses
"""
# Train Actor
# states: Buffer States
# actions: Buffer Actions
# discountedR: Discounted Rewards
# Epochs: just Epochs
if epochs == None:
epochs = self.trainEpochs
actions = np.asarray(actions, dtype=np.float32)
disActions = actions[:, 0 : self.disActSize]
conActions = actions[:, self.disActSize :]
oldDisProbs = oldActorResult[:, 0 : self.disOutputSize] # [disAct1, disAct2, disAct3]
oldConMusigs = oldActorResult[:, self.disOutputSize :] # [musig1,musig2]
if self.disActSize != 0:
disOneHotActs = self.getOneHotActs(disActions)
if self.conActSize != 0:
# while NN has both discrete & continuous action outputs.
oldPiProbs = self.conProb(oldConMusigs[:, 0], oldConMusigs[:, 1], conActions)
# pack [oldDisProbs,oldPiProbs,conActions,advantage] as y_true
y_true = np.hstack((oldDisProbs, oldPiProbs, disOneHotActs, conActions, advantage))
else:
# while NN have only discrete actions output.
# pack [oldDisProbs,advantage] as y_true
y_true = np.hstack((oldDisProbs, disOneHotActs, advantage))
else:
if self.conActSize != 0:
# while NN have only continuous action output.
oldPiProbs = self.conProb(oldConMusigs[:, 0], oldConMusigs[:, 1], conActions)
# pack [oldPiProbs,conActions,advantage] as y_true
y_true = np.hstack((oldPiProbs, conActions, advantage))
else:
print("trainActor:disActSize & conActSize error")
time.sleep(999999)
# assembly Actions history
# train start
if np.any(tf.math.is_nan(y_true)):
print("y_true got nan")
print("y_true", y_true)
his = self.actor.fit(x=states, y=y_true, epochs=epochs, verbose=0)
if np.any(tf.math.is_nan(his.history["loss"])):
print("his.history['loss'] is nan!")
print(his.history["loss"])
return his.history["loss"]
def saveWeights(self, score: float):
"""save now NN's Weight. Use "models.save_weights" method.
Save as "tf" format "ckpt" file.
Args:
score (float): now score
"""
actor_save_dir = (
self.saveDir + datetime.datetime.now().strftime("%H%M%S") + "/actor/" + "actor.ckpt"
)
critic_save_dir = (
self.saveDir + datetime.datetime.now().strftime("%H%M%S") + "/critic/" + "critic.ckpt"
)
self.actor.save_weights(actor_save_dir, save_format="tf")
self.critic.save_weights(critic_save_dir, save_format="tf")
# create an empty file named after the score to record it
score_dir = (
self.saveDir + datetime.datetime.now().strftime("%H%M%S") + "/" + str(round(score))
)
scorefile = open(score_dir, "w")
scorefile.close()
print("Model's Weights Saved")
def loadWeightToModels(self, loadDir: str):
"""load NN Model. Use "models.load_weights()" method.
Load "tf" format "ckpt" file.
Args:
loadDir (str): Model dir
"""
actorDir = loadDir + "/actor/" + "actor.ckpt"
criticDir = loadDir + "/critic/" + "critic.ckpt"
self.actor.load_weights(actorDir)
self.critic.load_weights(criticDir)
print("++++++++++++++++++++++++++++++++++++")
print("++++++++++++Model Loaded++++++++++++")
print(loadDir)
print("++++++++++++++++++++++++++++++++++++")
def getCriticV(self, state: ndarray):
"""get Critic predict V value
Args:
state (np.array): Env state
state (ndarray): Env state
Returns:
tensor: return Critic predict result
"""
# if state.ndim < 2:
# state = np.expand_dims(state,0)
if state.ndim!=2:
state = state.reshape([1,self.stateSize])
if state.ndim != 2:
stateNum = int(len(state) / self.stateSize)
state = state.reshape([stateNum, self.stateSize])
return self.critic.predict(state)
def discountReward(self, nextState, rewards):
def discountReward(self, nextState: ndarray, values: ndarray, dones: ndarray, rewards: ndarray):
"""Discount future rewards
Args:
nextState (np.array): next Env state
rewards (np.array): reward list of this episode
nextState (ndarray): next Env state
values (ndarray): critic predict values
dones (ndarray): dones from environment
rewards (ndarray): reward list of this episode
Returns:
np.array: discounted rewards list,same shape as rewards that input
ndarray: discounted rewards list, same shape as the input rewards
"""
"""
nextV = self.getCriticV(nextState)
dones = 1 - dones
discountedRewards = []
for i in reversed(range(len(rewards))):
nextV = rewards[i] + dones[i] * self.gamma * nextV
discountedRewards.append(nextV)
discountedRewards.reverse() # reverse
discountedRewards = np.squeeze(discountedRewards)
discountedRewards = np.expand_dims(discountedRewards, axis=1)
# discountedRewards = np.array(discountedRewards)[:, np.newaxis]
return discountedRewards
"""
"""
# discount future rewards
nextV = self.getCriticV(nextState)
discountedRewards = []
for r in rewards[::-1]:
nextV = r + self.GAMMA*nextV
nextV = r + self.gamma * nextV
discountedRewards.append(nextV)
discountedRewards.reverse() # \ESREVER/
discountedRewards.reverse() # reverse
discountedRewards = np.squeeze(discountedRewards)
discountedRewards = np.expand_dims(discountedRewards, axis=1)
#discountedRewards = np.array(discountedRewards)[:, np.newaxis]
# discountedRewards = np.array(discountedRewards)[:, np.newaxis]
print(discountedRewards)
return discountedRewards
"""
g = 0
discountedRewards = []
lastValue = self.getCriticV(nextState)
values = np.append(values, lastValue, axis=0)
dones = 1 - dones
for i in reversed(range(len(rewards))):
delta = rewards[i] + self.gamma * values[i + 1] * dones[i] - values[i]
g = delta + self.gamma * self.lmbda * dones[i] * g
discountedRewards.append(g + values[i])
discountedRewards.reverse()
return np.asarray(discountedRewards)
def conProb(self, mu, sig, x):
def getGAE(self, discountedRewards: ndarray, values: ndarray):
"""compute GAE adcantage
Args:
discountedRewards (ndarray): discounted rewards
values (ndarray): critic predict values
Returns:
ndarray: GAE advantage
"""
advantage = discountedRewards - values
advantage = (advantage - np.mean(advantage)) / (np.std(advantage) + EPS)
return advantage
def conProb(self, mu: ndarray, sig: ndarray, x: ndarray):
"""calculate probability when x in Normal distribution(mu,sigma)
Args:
mu (np,array): mu
sig (np.array): sigma
x (np.array): x
mu (ndarray): mu
sig (ndarray): sigma
x (ndarray): x
Returns:
np.array: probabilities
ndarray: probability
"""
# probability of taking value x under the normal distribution (mu, sig)
# return shape : (length,1)
@ -309,120 +654,62 @@ class PPO(object):
prob = dist.prob(x)
prob = np.reshape(prob, (np.size(x), 1))
#dist = 1./(tf.sqrt(2.*np.pi)*sig)
#prob = dist*tf.exp(-tf.square(x-mu)/(2.*tf.square(sig)))
# dist = 1./(tf.sqrt(2.*np.pi)*sig)
# prob = dist*tf.exp(-tf.square(x-mu)/(2.*tf.square(sig)))
return prob
def trainCritcActor(self, states, actions, rewards, nextState, criticEpochs, actorEpochs):
# Train ActorNN and CriticNN
# states: Buffer States
# actions: Buffer Actions
# rewards: Buffer Rewards, not yet discounted
# nextState: the single next state
# criticEpochs: just criticNN'Epochs
# acotrEpochs: just acotrNN'Epochs
discountedR = self.discountReward(nextState, rewards)
criticMeanLoss = self.trainCritic(states, discountedR, criticEpochs)
actorMeanLoss = self.trainActor(
states, actions, discountedR, actorEpochs)
print("A_Loss:", actorMeanLoss, "C_Loss:", criticMeanLoss)
return actorMeanLoss, criticMeanLoss
def trainCritic(self, states, discountedR, epochs):
# Trian Critic
# states: Buffer States
# discountedR: Discounted Rewards
# Epochs: just Epochs
# IDK why this should be list...It just work...
# If discountR in np.array type it will throw 'Failed to find data adapter that can handle'
# discountedR = discountedR.tolist()
his = self.critic.fit(x=states, y=discountedR,
epochs=epochs, verbose=0)
return np.mean(his.history['loss'])
def trainActor(self, states, actions, discountedR, epochs):
"""Actor NN trainning function
def getOneHotActs(self, disActions):
"""one hot action encoder
Args:
states (np.array): Env states
actions (np.array): action history
discountedR (np.array): discountedR
epochs (int): epochs,how many time NN learning
disActions (ndarray): discrete actions
Returns:
Average actor loss: this learning round's average actor loss
ndarray: one hot actions
"""
# Trian Actor
# states: Buffer States
# actions: Buffer Actions
# discountedR: Discounted Rewards
# Epochs: just Epochs
actIndex = 0
for thisShape in self.disActShape:
thisActs = disActions[:, actIndex]
thisOneHotAct = tf.squeeze(tf.one_hot(thisActs, thisShape)).numpy()
if actIndex == 0:
oneHotActs = thisOneHotAct
else:
oneHotActs = np.append(oneHotActs, thisOneHotAct, axis=1)
actIndex += 1
return oneHotActs
states = np.asarray(states)
actions = np.asarray(actions, dtype=np.float32)
# predict with old Actor NN
oldActorResult = self.actor.predict(states)
# assembly Actions history
disActions = actions[:,0:self.disActSize]
conActions = actions[:,self.disActSize:]
# assembly predictResult as old Actor's Result
oldDisProbs = oldActorResult[:,0:self.disOutputSize] # [disAct1, disAct2, disAct3]
oldConMusigs = oldActorResult[:,self.disOutputSize:] # [musig1,musig2]
oldPiProbs = self.conProb(oldConMusigs[:, 0], oldConMusigs[:, 1], conActions)
criticV = self.critic.predict(states)
advantage = copy.deepcopy(discountedR - criticV)
# pack [oldDisProbs,oldPiProbs,conActions,advantage] as y_true
y_true = np.hstack((oldDisProbs,oldPiProbs,conActions,advantage))
# train start
if np.any(tf.math.is_nan(y_true)):
print("y_true got nan")
print("oldConMusigs",oldConMusigs)
print("oldPiProbs",oldPiProbs)
print("conActions",conActions)
print("oldConMusigs",oldConMusigs)
his = self.actor.fit(x=states, y=y_true, epochs=epochs, verbose=0)
if np.any(tf.math.is_nan(his.history['loss'])):
print("his.history['loss'] is nan!")
print(his.history['loss'])
return np.mean(his.history['loss'])
def saveWeights(self,score = None):
"""save now NN's Weight. Use "models.save_weights" method.
Save as "tf" format "ckpt" file.
def getAverageEntropy(self, probs: ndarray):
"""get average dis&con ACT Entropys
Args:
score (int): now score
probs (ndarray): actor NN predict result
Returns:
float: average total entropy
list: discrete entropys
list: continuous entropys
"""
actor_save_dir = self.saveDir+datetime.datetime.now().strftime("%H%M%S") + "/actor/" + "actor.ckpt"
critic_save_dir = self.saveDir+datetime.datetime.now().strftime("%H%M%S") + "/critic/" + "critic.ckpt"
self.actor.save_weights(actor_save_dir, save_format="tf")
self.critic.save_weights(critic_save_dir, save_format="tf")
if score is not None:
# create an empty file named after the score to record it
score_dir = self.saveDir+datetime.datetime.now().strftime("%H%M%S") + "/" + str(round(score))
scorefile = open(score_dir,'w')
scorefile.close()
print("Model's Weights Saved")
def loadWeightToModels(self,loadDir):
"""load NN Model. Use "models.load_weights()" method.
Load "tf" format "ckpt" file.
Args:
loadDir (string): Model dir
"""
actorDir = loadDir + "/actor/" + "actor.ckpt"
criticDir = loadDir + "/critic/" + "critic.ckpt"
self.actor.load_weights(actorDir)
self.critic.load_weights(criticDir)
print("++++++++++++++++++++++++++++++++++++")
print("++++++++++++Model Loaded++++++++++++")
print(loadDir)
print("++++++++++++++++++++++++++++++++++++")
def getAverageEntropy(self, probs: ndarray):
"""get the average entropy of the discrete & continuous actions
Args:
probs (ndarray): actor NN prediction result
Returns:
float: average total entropy
list: discrete entropies
list: continuous entropies
"""
discreteEntropys = []
continuousEntropys = []
if self.disActSize != 0:
disProbs = probs[:, 0 : self.disOutputSize]
lastDisActIndex = 0
for actShape in self.disActShape:
thisDisProbs = disProbs[:, lastDisActIndex : lastDisActIndex + actShape]
lastDisActIndex += actShape
discreteEntropys.append(
tf.negative(
tf.reduce_mean(
tf.math.multiply(thisDisProbs, tf.math.log(thisDisProbs + EPS))
)
)
)
if self.conActSize != 0:
conProbs = probs[:, self.disOutputSize :]
conActIndex = 0
for i in range(self.conActSize):
thisConProbs = conProbs[:, conActIndex : conActIndex + 2]
conActIndex += 2
continuousEntropys.append(tf.reduce_mean(thisConProbs[:, 1]))
averageEntropy = np.mean([np.mean(discreteEntropys), np.mean(continuousEntropys)])
return averageEntropy, discreteEntropys, continuousEntropys
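
The discrete term above is the element-wise mean of -p*log(p + EPS) over a branch, mirroring the tf.reduce_mean call; a NumPy-only illustration with made-up probabilities (EPS is assumed to be 1e-8 here, the module defines its own):

    import numpy as np

    EPS = 1e-8                                   # assumed small constant to avoid log(0)
    probs = np.array([[0.7, 0.2, 0.1],
                      [0.4, 0.4, 0.2]])          # one discrete branch, batch of two
    print(-np.mean(probs * np.log(probs + EPS)))
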

View File

@ -0,0 +1,65 @@
import numpy as np
class PPOBuffer(object):
def __init__(self):
self.states = []
self.actorProbs = []
self.actions = []
self.rewards = []
self.dones = []
print("√√√√√Buffer Initialized Success√√√√√")
def clearBuffer(self):
self.states = []
self.actorProbs = []
self.actions = []
self.rewards = []
self.dones = []
def getStates(self):
return self.standDims(np.asarray(self.states))
def getActorProbs(self):
return self.standDims(np.asarray(self.actorProbs))
def getActions(self):
return self.standDims(np.asarray(self.actions))
def getRewards(self):
return self.standDims(np.asarray(self.rewards))
def getDones(self):
return self.standDims(np.asarray(self.dones))
def saveState(self, state):
self.states.append(state)
def saveAction(self, action):
self.actions.append(action)
def saveReward(self, reward):
self.rewards.append(reward)
def standDims(self, data):
# standardize the data's dimensions to (batch, feature)
if np.ndim(data) > 2:
return np.squeeze(data, axis=1)
elif np.ndim(data) < 2:
return np.expand_dims(data, axis=1)
else:
return np.asarray(data)
def saveBuffers(self, state, actorProb, action, reward, done):
self.states.append(state)
self.actorProbs.append(actorProb)
self.actions.append(action)
self.rewards.append(reward)
self.dones.append(done)
"""
print("self.states", self.states)
print("self.actions", self.actions)
print("self.rewards", self.rewards)
print("self.dones", self.dones)
print("self.values", self.values)
"""

View File

@ -0,0 +1,15 @@
import datetime
from typing import NamedTuple, Optional
class PPOConfig(NamedTuple):
NNShape: list = [256, 256, 128]
actorLR: float = 2e-3 # Actor net learning rate
criticLR: float = 2e-3 # Critic net learning rate
gamma: float = 0.99
lmbda: float = 0.95
clipRange: float = 0.20
entropyWeight: float = 1e-2
trainEpochs: int = 8
saveDir: str = "PPO-Model/" + datetime.datetime.now().strftime("%m%d-%H%M") + "/"
loadModelDir: Optional[str] = None
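
Since PPOConfig is a NamedTuple, defaults are overridden at construction (or with _replace); a minimal usage sketch, assuming the class defined above is in scope:

    config = PPOConfig(actorLR=1e-4, clipRange=0.1)   # override two fields, keep the other defaults
    print(config.NNShape, config.trainEpochs, config.saveDir)
    config = config._replace(trainEpochs=4)           # NamedTuples are immutable; _replace returns a copy
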

View File

@ -0,0 +1,58 @@
import matplotlib.pyplot as plt
class PPOHistory(object):
def __init__(self):
self.meanRewards = []
self.entropys = []
self.actorLosses = []
self.criticLosses = []
def saveHis(self, rewards, entropys, aLosses, cLosses):
self.meanRewards.extend([rewards])
self.entropys.extend([entropys])
self.actorLosses.extend(aLosses)
self.criticLosses.extend(cLosses)
def drawHis(self):
plt.figure(figsize=(21, 13), facecolor="#011627")
ax = plt.subplot(2, 2, 1)
ax.set_facecolor("#011627")
ax.spines["top"].set_color("#c9d2df")
ax.spines["bottom"].set_color("#c9d2df")
ax.spines["left"].set_color("#c9d2df")
ax.spines["right"].set_color("#c9d2df")
ax.plot(
range(len(self.meanRewards)), self.meanRewards, color="#c9d2df", label="AverageRewards"
)
ax.set_title("meanRewards", color="#c9d2df")
ax = plt.subplot(2, 2, 2)
ax.set_facecolor("#011627")
ax.spines["top"].set_color("#c9d2df")
ax.spines["bottom"].set_color("#c9d2df")
ax.spines["left"].set_color("#c9d2df")
ax.spines["right"].set_color("#c9d2df")
ax.plot(range(len(self.entropys)), self.entropys, color="#c9d2df", label="AverageEntropys")
ax.set_title("entropys", color="#c9d2df")
ax = plt.subplot(2, 2, 3)
ax.set_facecolor("#011627")
ax.spines["top"].set_color("#c9d2df")
ax.spines["bottom"].set_color("#c9d2df")
ax.spines["left"].set_color("#c9d2df")
ax.spines["right"].set_color("#c9d2df")
ax.plot(
range(len(self.actorLosses)), self.actorLosses, color="#c9d2df", label="actorLosses"
)
ax.set_title("actorLosses", color="#c9d2df")
ax = plt.subplot(2, 2, 4)
ax.set_facecolor("#011627")
ax.spines["top"].set_color("#c9d2df")
ax.spines["bottom"].set_color("#c9d2df")
ax.spines["left"].set_color("#c9d2df")
ax.spines["right"].set_color("#c9d2df")
ax.plot(
range(len(self.criticLosses)), self.criticLosses, color="#c9d2df", label="criticLosses"
)
ax.set_title("criticLosses", color="#c9d2df")
plt.show()
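
Typical per-episode usage, assuming the class above is in scope (the numbers are made up): saveHis takes the episode's mean reward and entropy as scalars and the per-update losses as lists.

    his = PPOHistory()
    his.saveHis(rewards=12.5, entropys=0.8, aLosses=[0.42, 0.31], cLosses=[1.2, 0.9])
    his.drawHis()   # shows the 2x2 matplotlib summary defined above
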

View File

@ -1,8 +1,8 @@
import mlagents_envs
from mlagents_envs.base_env import ActionTuple
from mlagents_envs.environment import UnityEnvironment
import numpy as np
from numpy import ndarray
class makeEnv(object):
@ -22,69 +22,71 @@ class makeEnv(object):
self.BEHA_SPECS = self.env.behavior_specs
self.BEHA_NAME = list(self.BEHA_SPECS)[0]
self.SPEC = self.BEHA_SPECS[self.BEHA_NAME]
self.OBSERVATION_SPECS = self.SPEC.observation_specs[0] # observation spec
self.ACTION_SPEC = self.SPEC.action_spec # action specs
self.DISCRETE_SIZE = self.ACTION_SPEC.discrete_size # number of discrete actions
self.DISCRETE_SHAPE = list(self.ACTION_SPEC.discrete_branches)
self.CONTINUOUS_SIZE = self.ACTION_SPEC.continuous_size # number of continuous actions
self.STATE_SIZE = self.OBSERVATION_SPECS.shape[0] - self.LOAD_DIR_SIZE_IN_STATE # number of environment observation values
print("√√√√√Enviroment Initialized Success√√√√√")
def step(
self,
actions: list,
behaviorName: ndarray = None,
trackedAgent: ndarray = None,
):
"""change ations list to ActionTuple then send it to enviroment
Args:
actions (list): PPO chooseAction output action list
behaviorName (ndarray, optional): behaviorName. Defaults to None.
trackedAgent (ndarray, optional): trackedAgentID. Defaults to None.
Returns:
ndarray: nextState, reward, done, loadDir, saveNow
"""
# apply the action to the environment
# returns nextState, reward, done
if self.DISCRETE_SIZE == 0:
# create empty discrete action
discreteActions = np.asarray([[0]])
else:
# create discrete action from actions list
discreteActions = np.asanyarray([actions[0:self.DISCRETE_SIZE]])
if self.CONTINUOUS_SIZE == 0:
# create empty continuous action
continuousActions = np.asanyarray([[0.0]])
else:
# create continuous actions from actions list
continuousActions = np.asanyarray(actions[self.DISCRETE_SIZE:])
# check whether either action array is None or contains None
try:
isDisNone = discreteActions.any() is None
if discreteActions.all() is None:
print("step() Error!:discreteActions include None")
except:
isDisNone = True
try:
isConNone = continuousActions.any() is None
if continuousActions.all() is None:
print("step() Error!:continuousActions include None")
except:
isConNone = True
if isDisNone:
# if discreteActions is empty, pass the placeholder [[0]] to the environment
discreteActions = np.array([[0]], dtype=np.int32)
if isConNone:
# if continuousActions is empty, pass the placeholder [[0]] to the environment
continuousActions = np.array([[0]], dtype=np.float32)
if behaviorName is None:
behaviorName = self.BEHA_NAME
if trackedAgent is None:
trackedAgent = self.TRACKED_AGENT
# create actionTuple
thisActionTuple = ActionTuple(continuous=continuousActions, discrete=discreteActions)
# take action to env
self.env.set_actions(behavior_name=behaviorName, action=thisActionTuple)
self.env.step()
# get nextState & reward & done after this action
nextState, reward, done, loadDir, saveNow = self.getSteps(behaviorName, trackedAgent)
return nextState, reward, done, loadDir, saveNow
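
The slicing above splits the flat chooseAction output into the two ActionTuple parts; as a standalone illustration with assumed sizes (3 discrete branches, 1 continuous action):

    import numpy as np

    DISCRETE_SIZE = 3
    actions = [2, 0, 1, 0.35]                                     # flat action list from the agent
    discreteActions = np.asanyarray([actions[0:DISCRETE_SIZE]])   # shape (1, 3)
    continuousActions = np.asanyarray(actions[DISCRETE_SIZE:])    # shape (1,)
    print(discreteActions, continuousActions)
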
def getSteps(self, behaviorName=None, trackedAgent=None):
"""get the environment's current observations.
Includes State, Reward, Done, LoadDir, SaveNow
Args:
behaviorName (_type_, optional): behaviorName. Defaults to None.
trackedAgent (_type_, optional): trackedAgent. Defaults to None.
Returns:
ndarray: nextState, reward, done, loadDir, saveNow
"""
# get nextState & reward & done
if behaviorName is None:
behaviorName = self.BEHA_NAME
@ -94,25 +96,17 @@ class makeEnv(object):
if trackedAgent is None:
trackedAgent = self.TRACKED_AGENT
if trackedAgent in decisionSteps: # if the episode has not ended, the environment state is stored in decision_steps
nextState = decisionSteps[trackedAgent].obs[0]
nextState = np.reshape(nextState, [1, self.STATE_SIZE + self.LOAD_DIR_SIZE_IN_STATE])
saveNow = nextState[0][-1]
loadDir = nextState[0][-3:-1]
nextState = nextState[0][:-3]
reward = decisionSteps[trackedAgent].reward
done = False
if trackedAgent in terminalSteps: # if the episode has ended, the environment state is stored in terminal_steps
nextState = terminalSteps[trackedAgent].obs[0]
nextState = np.reshape(nextState, [1, self.STATE_SIZE + self.LOAD_DIR_SIZE_IN_STATE])
saveNow = nextState[0][-1]
loadDir = nextState[0][-3:-1]
nextState = nextState[0][:-3]
@ -121,9 +115,16 @@ class makeEnv(object):
return nextState, reward, done, loadDir, saveNow
def reset(self):
"""reset enviroment and get observations
Returns:
ndarray: nextState, reward, done, loadDir, saveNow
"""
self.env.reset()
nextState, reward, done, loadDir, saveNow = self.getSteps()
return nextState, reward, done, loadDir, saveNow
def render(self):
"""render enviroment
"""
self.env.render()

View File

@ -1,29 +0,0 @@
import numpy as np
class buffer(object):
def __init__(self):
self.states = []
self.actions = []
self.rewards = []
print("√√√√√Buffer Initialized Success√√√√√")
def clearBuffer(self):
self.states = []
self.actions = []
self.rewards = []
def getStates(self):
return np.asarray(self.states)
def getActions(self):
return np.asarray(self.actions)
def getRewards(self):
return np.asarray(self.rewards)
def saveState(self,state):
self.states.append(state)
def saveAction(self,action):
self.actions.append(action)
def saveReward(self,reward):
self.rewards.append(reward)
def saveBuffers(self,state,action,reward):
self.states.append(state)
self.actions.append(action)
self.rewards.append(reward)

View File

@ -1,356 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import aimBotEnv\n",
"import PPO\n",
"import buffer\n",
"import numpy as np\n",
"\n",
"import tensorflow as tf\n",
"import time\n",
"import datetime\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Attempts to allocate only the GPU memory needed for allocation\n",
"physical_devices = tf.config.list_physical_devices('GPU')\n",
"tf.config.experimental.set_memory_growth(physical_devices[0], True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Env\n",
"ENV_PATH = \"./Build-CloseEnemyCut/Aimbot-PPO\"\n",
"WORKER_ID = 1\n",
"BASE_PORT = 200\n",
"\n",
"MAX_EP = 1000\n",
"EP_LENGTH = 100000\n",
"GAMMA = 0.99 # discount future reward (UP?)\n",
"EPSILON = 0.2 # clip Ratio range[1-EPSILON,1+EPSILON]\n",
"ACTOR_LR = 1e-5 # LR\n",
"CRITIC_LR = 2e-5 # LR\n",
"BATCH = 256 # learning step\n",
"ACTOR_EPOCH = 15 # epoch\n",
"CRITIC_EPOCH = 15 # epoch\n",
"ENTROPY_WHEIGHT = 0.001 # sigma's entropy in Actor loss\n",
"ACTION_INTERVAL = 1 # take action every ACTION_INTERVAL steps\n",
"\n",
"\n",
"TRAIN = True\n",
"SAVE_DIR = \"PPO-Model/\" + datetime.datetime.now().strftime(\"%m%d%H%M\") + \"/\"\n",
"LOAD_DIR = None\n",
"\n",
"CTN_ACTION_RANGE = 10\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"√√√√√Enviroment Initialized Success√√√√√\n",
"√√√√√Buffer Initialized Success√√√√√\n",
"No loadDir specified,Create a New Model\n",
"CONTINUOUS_SIZE 1\n",
"DISCRETE_SIZE 5\n",
"STATE_SIZE 30\n"
]
}
],
"source": [
"# initialize enviroment & buffer class\n",
"env = aimBotEnv.makeEnv(\n",
" envPath=ENV_PATH, workerID=WORKER_ID, basePort=BASE_PORT\n",
")\n",
"epBuffer = buffer.buffer()\n",
"\n",
"STATE_SIZE = env.STATE_SIZE\n",
"CONTINUOUS_SIZE = env.CONTINUOUS_SIZE\n",
"DISCRETE_SIZE = env.DISCRETE_SIZE\n",
"_, _, _, loadDir, _ = env.getSteps()\n",
"\n",
"# check load model or not\n",
"if np.any(loadDir == 0):\n",
" # create a new model\n",
" print(\"No loadDir specified,Create a New Model\")\n",
" LOAD_DIR = None\n",
"else:\n",
" # load model\n",
" loadDirDateSTR = str(int(loadDir[0]))\n",
" loadDirTimeSTR = str(int(loadDir[1]))\n",
" if len(loadDirDateSTR) != 8:\n",
" # fill lost 0 while converse float to string\n",
" for _ in range(8 - len(loadDirDateSTR)):\n",
" loadDirDateSTR = \"0\" + loadDirDateSTR\n",
" if len(loadDirTimeSTR) != 6:\n",
" # fill lost 0 while converse float to string\n",
" for _ in range(6 - len(loadDirTimeSTR)):\n",
" loadDirTimeSTR = \"0\" + loadDirTimeSTR\n",
" LOAD_DIR = \"PPO-Model/\" + loadDirDateSTR + \"/\" + loadDirTimeSTR\n",
" print(\"Load Model:\")\n",
" print(LOAD_DIR)\n",
"\n",
"print(\"CONTINUOUS_SIZE\", CONTINUOUS_SIZE)\n",
"print(\"DISCRETE_SIZE\", DISCRETE_SIZE)\n",
"print(\"STATE_SIZE\", STATE_SIZE)\n",
"\n",
"disActShape = [3, 3, 2]\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def actToKey(disAct1,disAct2,disAct3,conAct):\n",
" kW = 0\n",
" kS = 0\n",
" kA = 0\n",
" kD = 0\n",
" mouseShoot = 0\n",
" if disAct1 == 0:\n",
" kW = 0\n",
" kS = 1\n",
" elif disAct1 == 1:\n",
" kW = 0\n",
" kS = 0\n",
" elif disAct1 == 2:\n",
" kW = 1\n",
" kS = 0\n",
" if disAct2 == 0:\n",
" kA = 0\n",
" kD = 1\n",
" elif disAct2 == 1:\n",
" kA = 0\n",
" kD = 0\n",
" elif disAct2 == 2:\n",
" kA = 1\n",
" kD = 0\n",
" mouseShoot = disAct3\n",
" return kW,kS,kA,kD,mouseShoot,conAct"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"EP 0 START\n",
"√√√√√Buffer Initialized Success√√√√√\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\numpy\\core\\fromnumeric.py:3474: RuntimeWarning: Mean of empty slice.\n",
" return _methods._mean(a, axis=axis, dtype=dtype,\n",
"c:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\numpy\\core\\_methods.py:189: RuntimeWarning: invalid value encountered in double_scalars\n",
" ret = ret.dtype.type(ret / rcount)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"A_Loss: 0.4477495511372884 C_Loss: 3.155759557088216\n",
"A_Loss: 0.14549287557601928 C_Loss: 0.5123071213563283\n",
"A_Loss: 0.055241942902406055 C_Loss: 0.13002794484297434\n",
"A_Loss: 0.057325509190559384 C_Loss: 0.11068039039770762\n",
"A_Loss: 0.04376962607105573 C_Loss: 0.03923700377345085\n"
]
}
],
"source": [
"bestScore = 200.0\n",
"stopTrainCounter = 0\n",
"\n",
"totalRewardHis = []\n",
"totalActorLossHis = []\n",
"totalCriticLossHis = []\n",
"epHis = []\n",
"maxTotalReward = -99999999999\n",
"\n",
"for ep in range(MAX_EP):\n",
" print(\"EP \", ep, \" START\")\n",
" # first time run game\n",
" s, _, _, _, _ = env.reset()\n",
" if ep == 0:\n",
" epBuffer = buffer.buffer()\n",
" s = s.reshape([STATE_SIZE])\n",
" agent = PPO.PPO(\n",
" stateSize=STATE_SIZE,\n",
" disActShape=disActShape,\n",
" conActSize=1,\n",
" conActRange=CTN_ACTION_RANGE,\n",
" criticLR=CRITIC_LR,\n",
" actorLR=ACTOR_LR,\n",
" gamma=GAMMA,\n",
" epsilon=EPSILON,\n",
" entropyWeight=ENTROPY_WHEIGHT,\n",
" saveDir=SAVE_DIR,\n",
" loadModelDir=LOAD_DIR,\n",
" )\n",
" step = 0\n",
" done = False\n",
" stopTrainCounter -= 1\n",
" epHis.append(ep)\n",
"\n",
" # reset total reward\n",
" epTotalReward = 0\n",
"\n",
" # Recorder list\n",
" epStepHis = []\n",
" epRewardHis = []\n",
" epActorLossHis = []\n",
" epCriticLossHis = []\n",
"\n",
" # save weight immediately?\n",
" saveNow = 0\n",
"\n",
" while not done:\n",
" step += 1\n",
" if (\n",
" step % ACTION_INTERVAL == 0\n",
" ): # take action every ACTION_INTERVAL steps\n",
" epStepHis.append(step)\n",
" (\n",
" disAct1,\n",
" disAct2,\n",
" disAct3,\n",
" conAct,\n",
" predictResult,\n",
" ) = agent.chooseAction(s)\n",
" kW, kS, kA, kD, mouseShoot, mouseMove = actToKey(\n",
" disAct1, disAct2, disAct3, conAct\n",
" )\n",
"\n",
" nextState, thisReward, done, _, saveNow = env.step(\n",
" discreteActions=np.array([[kW, kS, kA, kD, mouseShoot]]),\n",
" continuousActions=np.array([[mouseMove]]),\n",
" )\n",
"\n",
" epTotalReward += thisReward\n",
" epBuffer.saveBuffers(\n",
" s, [disAct1, disAct2, disAct3, conAct], thisReward\n",
" )\n",
" else:\n",
" disActs = np.array([[0, 0, 0, 0, 0]])\n",
" conActs = np.array([[0]])\n",
"\n",
" nextState, thisReward, done, _, saveNow = env.step(\n",
" discreteActions=disActs, continuousActions=conActs\n",
" )\n",
" epTotalReward += thisReward\n",
" nextState = nextState.reshape([STATE_SIZE])\n",
" s = nextState\n",
"\n",
" if done:\n",
" print(\"EP OVER!\")\n",
" if saveNow != 0:\n",
" print(\"SAVENOW!\")\n",
" saveNow = 0\n",
" agent.saveWeights()\n",
" # update PPO after Batch step or GameOver\n",
" if (step + 1) % BATCH == 0 or done:\n",
" bs = epBuffer.getStates()\n",
" ba = epBuffer.getActions()\n",
" br = epBuffer.getRewards()\n",
" epBuffer.clearBuffer()\n",
" if TRAIN:\n",
" epActorLoss, epCriticLoss = agent.trainCritcActor(\n",
" bs, ba, br, s, CRITIC_EPOCH, ACTOR_EPOCH\n",
" )\n",
" epActorLossHis.append(epActorLoss)\n",
" epCriticLossHis.append(epCriticLoss)\n",
" # update History Recorder\n",
" totalActorLossHis.append(np.mean(epActorLossHis))\n",
" totalCriticLossHis.append(np.mean(epCriticLossHis))\n",
" totalRewardHis.append(epTotalReward)\n",
"\n",
" if epTotalReward > maxTotalReward and epTotalReward != 0:\n",
" maxTotalReward = epTotalReward\n",
" agent.saveWeights(epTotalReward)\n",
" print(\"New Record! Save NN\", epTotalReward)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"aaa = 0\n",
"aaa = 1\n",
"aaa = 2\n",
"aaa = 3\n",
"aaa = 4\n",
"aaa = 5\n",
"aaa = 6\n",
"aaa = 7\n",
"aaa = 8\n",
"aaa = 9\n"
]
}
],
"source": [
"aaa = 0\n",
"while aaa<10:\n",
" print(\"aaa = \",aaa)\n",
" aaa+=1"
]
}
],
"metadata": {
"interpreter": {
"hash": "86e2db13b09bd6be22cb599ea60c1572b9ef36ebeaa27a4c8e961d6df315ac32"
},
"kernelspec": {
"display_name": "Python 3.9.7 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,25 +0,0 @@
import aimBotEnv
import PPO
ENV_PATH = './Build/Aimbot-PPO'
WORKER_ID = 100
MAX_EP = 1000
EP_LENGTH = 400
GAMMA = 0.99 # discount future reward (UP?)
EPSILON = 0.2 # clip Ratio range[1-EPSILON,1+EPSILON]
ACTOR_LR = 1e-5 # LR
CRITIC_LR = 2e-5 # LR
BATCH = 32 # learning step
ACTOR_EPOCH = 10 # epoch
CRITIC_EPOCH = 10 # epoch
ENTROPY_WHEIGHT = 0.01 # sigma's entropy in Actor loss
ACTION_INTERVAL = 1 # take action every ACTION_INTERVAL steps
TRAIN = True
env = aimBotEnv.makeEnv(envPath = ENV_PATH,workerID = WORKER_ID)
STATE_SIZE = env.STATE_SIZE
CONTINUOUS_SIZE = env.CONTINUOUS_SIZE
DISCRETE_SIZE = env.DISCRETE_SIZE
CTN_ACTION_RANGE = 2

View File

@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@ -73,7 +73,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {},
"outputs": [
{
@ -118,7 +118,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -182,7 +182,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
@ -205,7 +205,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
@ -240,16 +240,16 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
"array([[0.]])"
]
},
"execution_count": 20,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@ -259,28 +259,119 @@
"\n",
"a = np.array([10, 20, 30, 0])\n",
"\n",
"np.any(a == 0)\n"
"np.asarray([[0.]])\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
"1.5"
]
},
"execution_count": 1,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"asd = \"adsf\"\n",
"len(asd)"
"import numpy as np\n",
"\n",
"asd = [1,2,3,np.array([0.5]),np.array([0.5])]\n",
"\n",
"asd[3:]\n",
"len(asd)\n",
"\n",
"np.mean([1,2])"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0\n",
"0.0\n"
]
}
],
"source": [
"import time\n",
"import pyautogui as pag\n",
"\n",
"from pynput.mouse import Button, Controller\n",
"\n",
"w = pag.size().width\n",
"h = pag.size().height\n",
"mouse = Controller()\n",
"\n",
"nowt = time.time()\n",
"\n",
"middletime = time.time() - nowt\n",
"print(middletime)\n",
"# print(nowPos-(w/2))\n",
"\n",
"print(time.time() - middletime - nowt)\n",
"while True:\n",
" x,_ = mouse.position\n",
" #print(mouse.press)\n",
" #print(mouse.position)\n",
" \n",
" mouse.position = (w / 2, h / 2)\n",
" time.sleep(1/60)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import pyautogui as pag\n",
"\n",
"import mouse\n",
"\n",
"w = pag.size().width\n",
"h = pag.size().height\n",
"\n",
"nowt = time.time()\n",
"\n",
"middletime = time.time() - nowt\n",
"print(middletime)\n",
"# print(nowPos-(w/2))\n",
"\n",
"print(time.time() - middletime - nowt)\n",
"while True:\n",
" x = mouse.get_position()\n",
" print(x)\n",
" #print(mouse.position)\n",
" \n",
" mouse.move(w / 2, h / 2)\n",
" time.sleep(1/60)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import keyboard\n",
"\n",
"while True:\n",
" if keyboard.is_pressed(\"w\"):\n",
" print(\"w\")\n",
" elif keyboard.is_pressed(\"s\"):\n",
" print(\"s\")"
]
}
],