Update PPO class, add Python human control

Python:
Update PPO class
add Python human control
Unity: 
add FP/TP camera selection button
This commit is contained in:
Koha9 2022-10-11 06:40:15 +09:00
parent de066f3a65
commit ae8a1ba8e2
26 changed files with 3639 additions and 990 deletions

View File

@ -0,0 +1,8 @@
fileFormatVersion: 2
guid: d65d9ca7ae1253341b6790f3a23e3a11
folderAsset: yes
DefaultImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

View File

@ -0,0 +1,10 @@
fileFormatVersion: 2
guid: 39a127fc79ed92d4e88aec711f545d5f
ScriptedImporter:
internalIDToNameTable: []
externalObjects: {}
serializedVersion: 2
userData: ' (Unity.MLAgents.Demonstrations.DemonstrationSummary)'
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}

View File

@ -1 +1 @@
{"count":1,"self":33.6679968,"total":34.5046305,"children":{"InitializeActuators":{"count":2,"self":0.0010002,"total":0.0010002,"children":null},"InitializeSensors":{"count":2,"self":0.0010004,"total":0.0010004,"children":null},"AgentSendState":{"count":1489,"self":0.011503399999999999,"total":0.2010688,"children":{"CollectObservations":{"count":1489,"self":0.1780647,"total":0.1780647,"children":null},"WriteActionMask":{"count":1488,"self":0.0019993999999999997,"total":0.0019993999999999997,"children":null},"RequestDecision":{"count":1488,"self":0.009501299999999999,"total":0.009501299999999999,"children":null}}},"DecideAction":{"count":1488,"self":0.0117408,"total":0.0117408,"children":null},"AgentAct":{"count":1488,"self":0.6208231,"total":0.6208231,"children":null}},"gauges":{},"metadata":{"timer_format_version":"0.1.0","start_time_seconds":"1663089804","unity_version":"2020.3.19f1","command_line_arguments":"C:\\Program Files\\Unity\\Hub\\Editor\\2020.3.19f1\\Editor\\Unity.exe -projectpath C:\\Users\\UCUNI\\OneDrive\\Unity\\ML-Agents\\Aimbot-PPO\\Aimbot-PPO-MultiScene -useHub -hubIPC -cloudEnvironment production -licensingIpc LicenseClient-UCUNI -hubSessionId 4cf980b0-326c-11ed-87c2-a7333acffe7c -accessToken j61gZPw8-vc4ZH7TJMvrSAAPQLV9SK6U72z_dek2xhw00ef","communication_protocol_version":"1.5.0","com.unity.ml-agents_version":"2.0.0","scene_name":"InGame","end_time_seconds":"1663089838"}} {"count":1,"self":42.3855296,"total":42.4020608,"children":{"InitializeActuators":{"count":2,"self":0.0015155,"total":0.0015155,"children":null},"InitializeSensors":{"count":2,"self":0.0015017,"total":0.0015017,"children":null},"AgentSendState":{"count":1898,"self":0.0025031999999999997,"total":0.0025031999999999997,"children":null},"DecideAction":{"count":1898,"self":0.0070091999999999993,"total":0.0070091999999999993,"children":null},"AgentAct":{"count":1898,"self":0.0030023,"total":0.0030023,"children":null}},"gauges":{},"metadata":{"timer_format_version":"0.1.0","start_time_seconds":"1665414279","unity_version":"2020.3.19f1","command_line_arguments":"C:\\Program Files\\Unity\\Hub\\Editor\\2020.3.19f1\\Editor\\Unity.exe -projectpath C:\\Users\\UCUNI\\OneDrive\\Unity\\ML-Agents\\Aimbot-PPO\\Aimbot-PPO-MultiScene -useHub -hubIPC -cloudEnvironment production -licensingIpc LicenseClient-UCUNI -hubSessionId 39022900-48a5-11ed-b848-09be5949a456 -accessToken _47qt9I_MF3bhL7JS735Xdmfj8A4dGBOdRNKR0X2L_w00ef","communication_protocol_version":"1.5.0","com.unity.ml-agents_version":"2.0.0","scene_name":"InGame","end_time_seconds":"1665414322"}}

View File

@ -1 +1 @@
{"count":1,"self":114.25904639999999,"total":114.62062499999999,"children":{"InitializeActuators":{"count":2,"self":0.0010000999999999999,"total":0.0010000999999999999,"children":null},"InitializeSensors":{"count":2,"self":0.0010002,"total":0.0010002,"children":null},"AgentSendState":{"count":1382,"self":0.0080028,"total":0.0195053,"children":{"CollectObservations":{"count":1382,"self":0.0070022999999999995,"total":0.0070022999999999995,"children":null},"WriteActionMask":{"count":1382,"self":0.0004994,"total":0.0004994,"children":null},"RequestDecision":{"count":1382,"self":0.0040008,"total":0.0040008,"children":null}}},"DecideAction":{"count":1382,"self":0.0110034,"total":0.0110034,"children":null},"AgentAct":{"count":1382,"self":0.3290731,"total":0.3290731,"children":null}},"gauges":{},"metadata":{"timer_format_version":"0.1.0","start_time_seconds":"1662500099","unity_version":"2020.3.19f1","command_line_arguments":"C:\\Program Files\\Unity\\Hub\\Editor\\2020.3.19f1\\Editor\\Unity.exe -projectpath C:\\Users\\UCUNI\\OneDrive\\Unity\\ML-Agents\\Aimbot-PPO\\Aimbot-PPO-MultiScene -useHub -hubIPC -cloudEnvironment production -licensingIpc LicenseClient-UCUNI -hubSessionId 209fdf30-2c1f-11ed-916f-33e85f4223cc -accessToken 78EBbrn-dg5kE__h3rNOqQVTDU3b1xUmmwWF1c5sFLc00ef","communication_protocol_version":"1.5.0","com.unity.ml-agents_version":"2.0.0","scene_name":"Start","end_time_seconds":"1662500214"}} {"count":1,"self":100.7007424,"total":102.0526476,"children":{"InitializeActuators":{"count":2,"self":0.0015004999999999999,"total":0.0015004999999999999,"children":null},"InitializeSensors":{"count":2,"self":0.0010015,"total":0.0010015,"children":null},"AgentSendState":{"count":2851,"self":0.0227973,"total":0.3594312,"children":{"CollectObservations":{"count":2851,"self":0.3230326,"total":0.3230326,"children":null},"WriteActionMask":{"count":2850,"self":0.0040877,"total":0.0040877,"children":null},"RequestDecision":{"count":2850,"self":0.0095135999999999988,"total":0.0095135999999999988,"children":null}}},"DecideAction":{"count":2850,"self":0.0184923,"total":0.0184923,"children":null},"AgentAct":{"count":2850,"self":0.971482,"total":0.971482,"children":null}},"gauges":{"AKMAgent.CumulativeReward":{"count":1,"max":0,"min":0,"runningAverage":0,"value":0,"weightedAverage":0}},"metadata":{"timer_format_version":"0.1.0","start_time_seconds":"1665340408","unity_version":"2020.3.19f1","command_line_arguments":"C:\\Program Files\\Unity\\Hub\\Editor\\2020.3.19f1\\Editor\\Unity.exe -projectpath C:\\Users\\UCUNI\\OneDrive\\Unity\\ML-Agents\\Aimbot-PPO\\Aimbot-PPO-MultiScene -useHub -hubIPC -cloudEnvironment production -licensingIpc LicenseClient-8AgJBC01I23iOtjIDvezn -hubSessionId a2bff0f0-47ee-11ed-98ba-e72fca9de6f1 -accessToken VHkJOvWIH11sBEzC18rl6YA9y6y2sRMQj2zrOyZdNeE00ef","communication_protocol_version":"1.5.0","com.unity.ml-agents_version":"2.0.0","scene_name":"Start","end_time_seconds":"1665340510"}}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -21,6 +21,8 @@ public class AgentWithGun : Agent
public Camera thisCam; public Camera thisCam;
public CharacterController PlayerController; public CharacterController PlayerController;
public GameObject enemyPrefab; public GameObject enemyPrefab;
public GameObject cameraChangerOBJ;
[Header("Rewards")] [Header("Rewards")]
[Tooltip("Nothing happened reward")] [Tooltip("Nothing happened reward")]
@ -76,10 +78,13 @@ public class AgentWithGun : Agent
private string LoadDirTime; private string LoadDirTime;
private float LoadDirDateF; private float LoadDirDateF;
private float loadDirTimeF; private float loadDirTimeF;
public bool defaultTPCamera = true;
private StartSeneData DataTransfer; private StartSeneData DataTransfer;
private UIController UICon; private UIController UICon;
private HistoryRecorder HistoryRec; private HistoryRecorder HistoryRec;
private RaySensors rayScript; private RaySensors rayScript;
private CameraChange camChanger;
[System.NonSerialized] public float nonReward; [System.NonSerialized] public float nonReward;
[System.NonSerialized] public float shootReward; [System.NonSerialized] public float shootReward;
@ -118,6 +123,8 @@ public class AgentWithGun : Agent
killRewardDefault = DataTransfer.killReward; killRewardDefault = DataTransfer.killReward;
winRewardDefault = DataTransfer.winReward; winRewardDefault = DataTransfer.winReward;
loseRewardDefault = DataTransfer.loseReward; loseRewardDefault = DataTransfer.loseReward;
lockMouse = DataTransfer.lockMouse;
defaultTPCamera = DataTransfer.defaultTPCamera;
// change Decision Period & Take Actions Between Decisions // change Decision Period & Take Actions Between Decisions
transform.GetComponent<DecisionRequester>().DecisionPeriod = DataTransfer.DecisionPeriod; transform.GetComponent<DecisionRequester>().DecisionPeriod = DataTransfer.DecisionPeriod;
@ -156,6 +163,7 @@ public class AgentWithGun : Agent
UICon = transform.GetComponent<UIController>(); UICon = transform.GetComponent<UIController>();
HistoryRec = transform.GetComponent<HistoryRecorder>(); HistoryRec = transform.GetComponent<HistoryRecorder>();
rayScript = GetComponent<RaySensors>(); rayScript = GetComponent<RaySensors>();
camChanger = cameraChangerOBJ.GetComponent<CameraChange>();
// give default Reward to Reward value will be used. // give default Reward to Reward value will be used.
nonReward = nonRewardDefault; nonReward = nonRewardDefault;
@ -167,6 +175,15 @@ public class AgentWithGun : Agent
killReward = killRewardDefault; killReward = killRewardDefault;
//initialize remainTime //initialize remainTime
remainTime = (int)(timeLimit - Time.time + startTime); remainTime = (int)(timeLimit - Time.time + startTime);
// change default camera view
if (defaultTPCamera)
{
camChanger.ShowTPSView();
}
else
{
camChanger.ShowFPSView();
}
} }
} }
@ -203,27 +220,9 @@ public class AgentWithGun : Agent
// ------------action handling-------------- // ------------action handling--------------
// moveAgent simulates Input.GetAxis-style movement // moveAgent simulates Input.GetAxis-style movement
public void moveAgent(int kW, int kS,int kA,int kD) public void moveAgent(int vertical, int horizontal)
{ {
Vector3 thisMovement; Vector3 thisMovement;
int horizontal = 0;
int vertical = 0;
if (kW==1 && kS != 1)
{
vertical = 1;
}
else if (kS==1 && kW!=1)
{
vertical = -1;
}
if (kD==1 && kA!=1)
{
horizontal = 1;
}
else if (kA ==1 && kD!=1)
{
horizontal = -1;
}
if (horizontal != 0)// a horizontal key is pressed if (horizontal != 0)// a horizontal key is pressed
{ {
@ -506,7 +505,7 @@ public class AgentWithGun : Agent
} }
if (lockMouse) if (lockMouse)
{ {
Cursor.lockState = CursorLockMode.Locked; // 隐藏并且锁定鼠标 Cursor.lockState = CursorLockMode.Locked; // hide and lock the mouse
} }
//iniCharts(); //iniCharts();
thisAgentObj.name = thisAgentObj.GetInstanceID().ToString(); thisAgentObj.name = thisAgentObj.GetInstanceID().ToString();
@ -549,35 +548,26 @@ public class AgentWithGun : Agent
public override void OnActionReceived(ActionBuffers actionBuffers) public override void OnActionReceived(ActionBuffers actionBuffers)
{ {
// get the inputs // get the inputs
int kW = actionBuffers.DiscreteActions[0]; int vertical = actionBuffers.DiscreteActions[0];
int kS = actionBuffers.DiscreteActions[1]; int horizontal = actionBuffers.DiscreteActions[1];
int kA = actionBuffers.DiscreteActions[2]; int mouseShoot = actionBuffers.DiscreteActions[2];
int kD = actionBuffers.DiscreteActions[3];
int mouseShoot = actionBuffers.DiscreteActions[4];
float Mouse_X = actionBuffers.ContinuousActions[0]; float Mouse_X = actionBuffers.ContinuousActions[0];
//float Mouse_Y = actionBuffers.ContinuousActions[1]; if (vertical == 2) vertical = -1;
//int timeLimitControl = (int)actionBuffers.ContinuousActions[2]; if (horizontal == 2) horizontal = -1;
//float nonRewardIn = actionBuffers.ContinuousActions[1];
//float shootRewardIn = actionBuffers.ContinuousActions[2];
//float shootWithoutReadyRewardIn = actionBuffers.ContinuousActions[3];
//float hitRewardIn = actionBuffers.ContinuousActions[4];
//float winRewardIn = actionBuffers.ContinuousActions[5];
// loseRewardIn = actionBuffers.ContinuousActions[6];
//float killRewardIn = actionBuffers.ContinuousActions[7];
//Rewards Update
remainTime = (int)(timeLimit - Time.time + startTime); remainTime = (int)(timeLimit - Time.time + startTime);
// apply the inputs // apply the inputs
shoot = mouseShoot; shoot = mouseShoot;
HistoryRec.realTimeKeyCounter(kW, kS, kA, kD, shoot); HistoryRec.realTimeKeyCounter(vertical, horizontal, shoot);
(int kWCount, int kSCount, int kACount, int kDCount, int shootCount) = HistoryRec.getKeyCount(); (int kWCount, int kSCount, int kACount, int kDCount, int shootCount) = HistoryRec.getKeyCount();
UICon.updateRemainTime(remainTime); UICon.updateRemainTime(remainTime);
UICon.updateWASDKeyViewer(kW, kS, kA, kD); UICon.updateRemainEnemy(enemyNum);
UICon.updateWASDKeyViewer(vertical, horizontal);
UICon.updateKeyCounterChart(kWCount, kSCount, kACount, kDCount, shootCount); UICon.updateKeyCounterChart(kWCount, kSCount, kACount, kDCount, shootCount);
UICon.updateMouseMovementViewer(Mouse_X); UICon.updateMouseMovementViewer(Mouse_X);
UICon.updateRewardViewer(nonReward, shootReward, shootWithoutReadyReward, hitReward, winReward, loseReward, killReward); UICon.updateRewardViewer(nonReward, shootReward, shootWithoutReadyReward, hitReward, winReward, loseReward, killReward);
cameraControl(Mouse_X, 0); cameraControl(Mouse_X, 0);
moveAgent(kW, kS, kA, kD); moveAgent(vertical, horizontal);
float thisRoundReward = rewardCalculate(); float thisRoundReward = rewardCalculate();
// check whether the episode is over // check whether the episode is over
@ -628,37 +618,45 @@ public class AgentWithGun : Agent
ActionSegment<float> continuousActions = actionsOut.ContinuousActions; ActionSegment<float> continuousActions = actionsOut.ContinuousActions;
ActionSegment<int> discreteActions = actionsOut.DiscreteActions; ActionSegment<int> discreteActions = actionsOut.DiscreteActions;
int kW = 0; int vertical = 0;
int kS = 0; int horizontal = 0;
int kA = 0; if (Input.GetKey(KeyCode.W) && !Input.GetKey(KeyCode.S))
int kD = 0;
if (Input.GetKey(KeyCode.W))
{ {
kW = 1; vertical = 1;
} }
if (Input.GetKey(KeyCode.S)) else if (Input.GetKey(KeyCode.S) && !Input.GetKey(KeyCode.W))
{ {
kS = 1; vertical = -1;
} }
if (Input.GetKey(KeyCode.A)) else
{ {
kA = 1; vertical = 0;
} }
if (Input.GetKey(KeyCode.D)) if (Input.GetKey(KeyCode.D) && !Input.GetKey(KeyCode.A))
{ {
kD = 1; horizontal = 1;
}
else if (Input.GetKey(KeyCode.A) && !Input.GetKey(KeyCode.D))
{
horizontal = -1;
}
else
{
horizontal = 0;
} }
discreteActions[0] = kW;
discreteActions[1] = kS;
discreteActions[2] = kA;
discreteActions[3] = kD;
if (Input.GetMouseButton(0)) if (Input.GetMouseButton(0))
{ {
// Debug.Log("mousebuttonhit"); // Debug.Log("mousebuttonhit");
shoot = 1; shoot = 1;
} }
discreteActions[4] = shoot; else
{
shoot = 0;
}
discreteActions[0] = vertical;
discreteActions[1] = horizontal;
discreteActions[2] = shoot;
//^^^^^^^^^^^^^^^^^^^^^discrete-Control^^^^^^^^^^^^^^^^^^^^^^ //^^^^^^^^^^^^^^^^^^^^^discrete-Control^^^^^^^^^^^^^^^^^^^^^^
//vvvvvvvvvvvvvvvvvvvvvvvvvvvvvcontinuous-Controlvvvvvvvvvvvvvvvvvvvvvv //vvvvvvvvvvvvvvvvvvvvvvvvvvvvvcontinuous-Controlvvvvvvvvvvvvvvvvvvvvvv
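
The refactored OnActionReceived and Heuristic above collapse the five W/S/A/D/shoot branches into three discrete branches (vertical, horizontal, shoot), with a branch value of 2 standing in for -1. A minimal sketch of the same decoding on the Python side (the function and names here are illustrative, not part of the commit):

```python
# Decode the agent's 3-branch discrete action (branch sizes assumed to be [3, 3, 2]):
# vertical/horizontal in {0, 1, 2} with 2 meaning -1, shoot in {0, 1}.
def decode_discrete_action(vertical: int, horizontal: int, shoot: int):
    signed = {0: 0, 1: 1, 2: -1}
    return signed[vertical], signed[horizontal], shoot

# Example: W + A pressed, no shot -> (1, -1, 0)
print(decode_discrete_action(1, 2, 0))
```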

View File

@ -31,24 +31,24 @@ public class HistoryRecorder : MonoBehaviour
{ {
EPTotalShootCount.Add(TotalShootCount); EPTotalShootCount.Add(TotalShootCount);
} }
public void realTimeKeyCounter(int kW, int kS, int kA, int kD, int shoot) public void realTimeKeyCounter(int vertical, int horizontal, int shoot)
{ {
if (kW == 1) if (vertical == 1)
{ {
realTimeWKeyCount += 1; realTimeWKeyCount += 1;
} }
if (kS == 1) else if (vertical == -1)
{ {
realTimeSKeyCount += 1; realTimeSKeyCount += 1;
} }
if (kA == 1) if (horizontal == 1)
{
realTimeAKeyCount += 1;
}
if (kD == 1)
{ {
realTimeDKeyCount += 1; realTimeDKeyCount += 1;
} }
else if (horizontal == -1)
{
realTimeAKeyCount += 1;
}
if (shoot == 1) if (shoot == 1)
{ {
realTimeShootCount += 1; realTimeShootCount += 1;

View File

@ -69,39 +69,37 @@ public class UIController : MonoBehaviour
} }
//------------Key Viewer---------- //------------Key Viewer----------
public void updateWASDKeyViewer(int kW,int kS,int kA,int kD) public void updateWASDKeyViewer(int vertical,int horizontal)
{ {
if (kW == 1) if (vertical == 1)
{ {
upText.color = Color.red; upText.color = Color.red;
downText.color = Color.black;
} }
else else if (vertical == -1)
{
upText.color = Color.black;
}
if (kS == 1)
{ {
downText.color = Color.red; downText.color = Color.red;
upText.color = Color.black;
} }
else else
{ {
downText.color = Color.black; downText.color = Color.black;
upText.color = Color.black;
} }
if(kA == 1) if (horizontal == 1)
{
leftText.color = Color.red;
}
else
{
leftText.color = Color.black;
}
if( kD == 1)
{ {
rightText.color = Color.red; rightText.color = Color.red;
leftText.color = Color.black;
}
else if (horizontal == -1)
{
leftText.color = Color.red;
rightText.color = Color.black;
} }
else else
{ {
rightText.color = Color.black; downText.color = Color.black;
upText.color = Color.black;
} }
} }
public void updateShootKeyViewer(int shoot,bool isGunReady) public void updateShootKeyViewer(int shoot,bool isGunReady)

View File

@ -0,0 +1,27 @@
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
public class gameFlowController : MonoBehaviour
{
public GameObject Agent;
AgentWithGun agentWithGun;
// Start is called before the first frame update
void Start()
{
agentWithGun = Agent.GetComponent<AgentWithGun>();
}
// Update is called once per frame
void Update()
{
if (Input.GetKey(KeyCode.Escape))
{
Application.Quit();
}
if (Input.GetKey(KeyCode.L))
{
agentWithGun.lockMouse = !agentWithGun.lockMouse;
}
}
}

View File

@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 9a8fb4d12d4b8fc4784f3e142e7fdcf8
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@ -19,6 +19,21 @@ public class EnvArgsChanger : MonoBehaviour
public Text DecisionPeriodDataText; public Text DecisionPeriodDataText;
public Toggle TakeActionsBetweenDecisionsToggle; public Toggle TakeActionsBetweenDecisionsToggle;
[Header("Lock Mouse")]
public Toggle LockMouseToggle;
[Header("Default Camera")]
public Toggle FPToggle;
public Text FPText;
public Toggle TPToggle;
public Text TPText;
private StartSeneData startSeneData;
private void Start()
{
startSeneData = DataTransfer.GetComponent<StartSeneData>();
}
public void onEnemynumValueChanged() public void onEnemynumValueChanged()
{ {
@ -30,7 +45,7 @@ public class EnvArgsChanger : MonoBehaviour
else else
{ {
EnemyNumText.color = Color.yellow; EnemyNumText.color = Color.yellow;
DataTransfer.GetComponent<StartSeneData>().EnemyNum = Math.Abs(int.Parse(EnemyNumInput.GetComponent<InputField>().text)); startSeneData.EnemyNum = Math.Abs(int.Parse(EnemyNumInput.GetComponent<InputField>().text));
} }
} }
@ -44,19 +59,48 @@ public class EnvArgsChanger : MonoBehaviour
else else
{ {
TimeLimText.color = Color.yellow; TimeLimText.color = Color.yellow;
DataTransfer.GetComponent<StartSeneData>().Timelim = Math.Abs(int.Parse(TimelimInput.GetComponent<InputField>().text)); startSeneData.Timelim = Math.Abs(int.Parse(TimelimInput.GetComponent<InputField>().text));
} }
} }
public void onDPSlideValueChanged() public void onDPSlideValueChanged()
{ {
// DecisionPeriod(DP) value Control // DecisionPeriod(DP) value Control
DataTransfer.GetComponent<StartSeneData>().DecisionPeriod = (int)(DecisionPeriodSlide.GetComponent<Slider>().value); startSeneData.DecisionPeriod = (int)(DecisionPeriodSlide.GetComponent<Slider>().value);
DecisionPeriodDataText.text = DataTransfer.GetComponent<StartSeneData>().DecisionPeriod.ToString(); DecisionPeriodDataText.text = startSeneData.DecisionPeriod.ToString();
} }
public void onABDToggleChanged() public void onABDToggleChanged()
{ {
// Actions Between Decisions(ABD) Toggle Control // Actions Between Decisions(ABD) Toggle Control
DataTransfer.GetComponent<StartSeneData>().ActionsBetweenDecisions = TakeActionsBetweenDecisionsToggle.isOn; startSeneData.ActionsBetweenDecisions = TakeActionsBetweenDecisionsToggle.isOn;
}
public void onLockMouseToggleChanged()
{
// lock mouse or not
startSeneData.lockMouse = LockMouseToggle.isOn;
}
public void onTPCamToggleChanged()
{
startSeneData.defaultTPCamera = true;
FPToggle.interactable = true;
FPToggle.SetIsOnWithoutNotify(false);
FPText.color = Color.gray;
TPToggle.SetIsOnWithoutNotify(true);
TPToggle.interactable = false;
TPText.color = Color.green;
}
public void onFPCameToggleChanged()
{
startSeneData.defaultTPCamera = false;
TPToggle.interactable = true;
TPToggle.SetIsOnWithoutNotify(false);
TPText.color = Color.gray;
FPToggle.SetIsOnWithoutNotify(true);
FPToggle.interactable = false;
FPText.color = Color.green;
} }
} }

View File

@ -13,6 +13,8 @@ public class StartSeneData : MonoBehaviour
public float killRewardDefault = 10.0f; public float killRewardDefault = 10.0f;
public float winRewardDefault = 20.0f; public float winRewardDefault = 20.0f;
public float loseRewardDefault = -10.0f; public float loseRewardDefault = -10.0f;
public bool lockMouse = false;
public bool defaultTPCamera = true;
// LoadDir // LoadDir
[System.NonSerialized]public string LoadDirDate = "0"; [System.NonSerialized]public string LoadDirDate = "0";

View File

@ -0,0 +1,90 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"√√√√√Enviroment Initialized Success√√√√√\n"
]
}
],
"source": [
"import time\n",
"import aimBotEnv\n",
"from HumanAction import HumanActions\n",
"\n",
"# Env\n",
"ENV_PATH = \"./Build-CloseEnemyCut/Aimbot-PPO\"\n",
"WORKER_ID = 1\n",
"BASE_PORT = 200\n",
"\n",
"MOUSEDISCOUNT = 8.0\n",
"MAX_EP = 10000000\n",
"\n",
"env = aimBotEnv.makeEnv(envPath=ENV_PATH, workerID=WORKER_ID, basePort=BASE_PORT)\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"ename": "UnityCommunicatorStoppedException",
"evalue": "Communicator has exited.",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mUnityCommunicatorStoppedException\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m~\\AppData\\Local\\Temp/ipykernel_37248/645561173.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mdone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[0mactions\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdemoAct\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetHumanActions\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 7\u001b[1;33m \u001b[0menv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mactions\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mactions\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32mc:\\Users\\UCUNI\\OneDrive\\Unity\\ML-Agents\\Aimbot-PPO\\Aimbot-PPO-Python\\aimBotEnv.py\u001b[0m in \u001b[0;36mstep\u001b[1;34m(self, actions, behaviorName, trackedAgent)\u001b[0m\n\u001b[0;32m 72\u001b[0m \u001b[1;31m# take action to env\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 73\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0menv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset_actions\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbehavior_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbehaviorName\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maction\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mthisActionTuple\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 74\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0menv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 75\u001b[0m \u001b[1;31m# get nextState & reward & done after this action\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 76\u001b[0m \u001b[0mnextState\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreward\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mloadDir\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msaveNow\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetSteps\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbehaviorName\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtrackedAgent\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\mlagents_envs\\timers.py\u001b[0m in \u001b[0;36mwrapped\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 303\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mwrapped\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 304\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mhierarchical_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__qualname__\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 305\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 306\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 307\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mwrapped\u001b[0m \u001b[1;31m# type: ignore\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\mlagents_envs\\environment.py\u001b[0m in \u001b[0;36mstep\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 333\u001b[0m \u001b[0moutputs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_communicator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexchange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstep_input\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_poll_process\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 334\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0moutputs\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 335\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mUnityCommunicatorStoppedException\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Communicator has exited.\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 336\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_update_behavior_specs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0moutputs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 337\u001b[0m \u001b[0mrl_output\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0moutputs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrl_output\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mUnityCommunicatorStoppedException\u001b[0m: Communicator has exited."
]
}
],
"source": [
"done = False\n",
"env.reset()\n",
"demoAct = HumanActions(mouseDiscount=MOUSEDISCOUNT)\n",
"for ep in range(MAX_EP):\n",
" while not done:\n",
" actions = demoAct.getHumanActions()\n",
" env.step(actions=actions)6\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.7 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "86e2db13b09bd6be22cb599ea60c1572b9ef36ebeaa27a4c8e961d6df315ac32"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
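
The demo cell above never refreshes `done` from the environment, so the episode boundary is lost. A hedged sketch of a human-control loop that also consumes the step return, assuming `env.step(...)` returns `(nextState, reward, done, loadDir, saveNow)` as suggested by the aimBotEnv.py frames in the traceback above:

```python
# Hedged sketch of a human-control episode loop; assumes env.step(actions=...)
# returns (nextState, reward, done, loadDir, saveNow) as in aimBotEnv.py above.
import aimBotEnv
from HumanAction import HumanActions

env = aimBotEnv.makeEnv(envPath="./Build-CloseEnemyCut/Aimbot-PPO", workerID=1, basePort=200)
demoAct = HumanActions(mouseDiscount=8.0)

for ep in range(10):
    env.reset()
    done = False
    while not done:
        actions = demoAct.getHumanActions()
        nextState, reward, done, loadDir, saveNow = env.step(actions=actions)
```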

View File

@ -0,0 +1,51 @@
import keyboard
import mouse
class HumanActions:
    def __init__(self, mouseDiscount: float = 10, screenW: int = 1920, screenH: int = 1080):
        def multiPressed():
            pass

        keyboard.add_hotkey("w+a", multiPressed)
        keyboard.add_hotkey("w+d", multiPressed)
        keyboard.add_hotkey("s+a", multiPressed)
        keyboard.add_hotkey("s+d", multiPressed)
        self.screenW = screenW
        self.screenH = screenH
        self.MOUSEDISCOUNT = mouseDiscount

    def getHumanActions(self):
        x, _ = mouse.get_position()
        xMovement = (x - self.screenW / 2) / self.MOUSEDISCOUNT
        ws = 0
        ad = 0
        click = 0
        if keyboard.is_pressed("w"):
            ws = 1
        elif keyboard.is_pressed("s"):
            ws = 2
        if keyboard.is_pressed("d"):
            ad = 1
        elif keyboard.is_pressed("a"):
            ad = 2
        if keyboard.is_pressed("w+d"):
            ws = 1
            ad = 1
        elif keyboard.is_pressed("w+a"):
            ws = 1
            ad = 2
        elif keyboard.is_pressed("s+d"):
            ws = 2
            ad = 1
        elif keyboard.is_pressed("s+a"):
            ws = 2
            ad = 2
        if mouse.is_pressed(button="left"):
            click = 1
        actions = [ws, ad, click, [xMovement]]
        mouse.move(self.screenW / 2, self.screenH / 2)
        return actions
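
getHumanActions recenters the cursor on every call, so xMovement is the per-step horizontal displacement scaled down by mouseDiscount, and the returned list [ws, ad, click, [xMovement]] lines up with the agent's three discrete branches plus the single continuous mouse-X action. A hedged sketch of turning that list into an ML-Agents action, assuming the standard mlagents_envs ActionTuple layout of one row per agent (aimBotEnv.step presumably does this packing internally):

```python
import numpy as np
from mlagents_envs.base_env import ActionTuple

def to_action_tuple(human_action):
    # [ws, ad, click, [xMovement]] -> one discrete row of 3 branches + one continuous row.
    ws, ad, click, (x_movement,) = human_action
    discrete = np.array([[ws, ad, click]], dtype=np.int32)
    continuous = np.array([[x_movement]], dtype=np.float32)
    return ActionTuple(continuous=continuous, discrete=discrete)
```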

File diff suppressed because it is too large

View File

@ -1,108 +1,213 @@
import tensorflow as tf import tensorflow as tf
from tensorflow.python.ops.numpy_ops import ndarray
import tensorflow_probability as tfp import tensorflow_probability as tfp
import numpy as np import numpy as np
import time
import math import math
import copy
import datetime import datetime
import os from PPOConfig import PPOConfig
from tensorflow import keras from tensorflow import keras
from tensorflow.keras import layers from tensorflow.keras import layers
from tensorflow.keras import optimizers from tensorflow.keras import optimizers
from keras_radam import RAdam
EPS = 1e-10
class PPO(object): class PPO(object):
"""Create PPO Agent def __init__(
self,
stateSize: int,
disActShape: list,
conActSize: int,
conActRange: float,
PPOConfig: PPOConfig,
):
"""initialize PPO
Args:
stateSize (int): enviroment state size
disActShape (numpy): discrete Action shape.
just like [3,2],means 2 type of dis actions,each act include 3 and 2 types
if no discrete action output then use [0].
conActSize (int): continuous Action Size. if no continuous action output then use 0.
conActRange (float): continuous action range. -conActRange to +conActRange
PPOConfig (PPOConfig): PPO configuration
""" """
# check use dis action or not.
def __init__(self, stateSize, disActShape, conActSize, conActRange, criticLR, actorLR, gamma, epsilon, entropyWeight, saveDir, loadModelDir): if disActShape == [0]:
# non dis action output
# check disActShape is correct(greater than 1) self.disActSize = 0
self.disOutputSize = 0
else:
# make sure disActShape greater than 1
try: try:
if np.any(np.array(disActShape) <= 1): if np.any(np.array(disActShape) <= 1):
raise ValueError("disActShape error,disActShape should greater than 1 but get",disActShape) raise ValueError(
except ValueError as e: "disActShape error,disActShape should greater than 1 but get", disActShape
)
except ValueError:
raise raise
self.disActSize = len(disActShape)
self.disOutputSize = sum(disActShape)
self.stateSize = stateSize self.stateSize = stateSize
# self.actionSize = actionSize self.disActShape = disActShape
self.disActShape = disActShape # shape of discrete action output. like [3,3,2]
self.disActSize = len(disActShape)
self.conActSize = conActSize self.conActSize = conActSize
self.conActRange = conActRange self.conActRange = conActRange
self.criticLR = criticLR self.muSigSize = 2
self.actorLR = actorLR self.conOutputSize = conActSize * self.muSigSize
self.GAMMA = gamma
self.EPSILON = epsilon
self.saveDir = saveDir
self.entropyWeight = entropyWeight
self.disOutputSize = sum(disActShape) # config
self.conOutputSize = conActSize * 2 self.NNShape = PPOConfig.NNShape
self.criticLR = PPOConfig.criticLR
self.actorLR = PPOConfig.actorLR
self.gamma = PPOConfig.gamma
self.lmbda = PPOConfig.lmbda
self.clipRange = PPOConfig.clipRange
self.entropyWeight = PPOConfig.entropyWeight
self.trainEpochs = PPOConfig.trainEpochs
self.saveDir = PPOConfig.saveDir
self.loadModelDir = PPOConfig.loadModelDir
print("---------thisPPO Params---------")
print("self.stateSize = ", self.stateSize)
print("self.disActShape = ", self.disActShape)
print("self.disActSize", self.disActSize)
print("self.disOutputSize", self.disOutputSize)
print("self.conActSize = ", self.conActSize)
print("self.conActRange = ", self.conActRange)
print("self.conOutputSize = ", self.conOutputSize)
if loadModelDir == None: # config
print("---------thisPPO config---------")
print("self.NNShape = ", self.NNShape)
print("self.criticLR = ", self.criticLR)
print("self.actorLR = ", self.actorLR)
print("self.gamma = ", self.gamma)
print("self.lmbda = ", self.lmbda)
print("self.clipRange = ", self.clipRange)
print("self.entropyWeight = ", self.entropyWeight)
print("self.trainEpochs = ", self.trainEpochs)
print("self.saveDir = ", self.saveDir)
print("self.loadModelDir = ", self.loadModelDir)
# load NN or not
if self.loadModelDir is None:
# critc NN # critc NN
self.critic = self.buildCriticNet(self.stateSize, 1, compileModel=True) self.critic = self.buildCriticNet(self.stateSize, 1, compileModel=True)
# actor NN # actor NN
self.actor = self.buildActorNet(self.stateSize, self.conActRange, compileModel = True) self.actor = self.buildActorNet(self.stateSize, compileModel=True)
print("---------Actor Model Create Success---------")
self.actor.summary()
print("---------Critic Model Create Success---------")
self.critic.summary()
else: else:
# critc NN # critc NN
self.critic = self.buildCriticNet(self.stateSize, 1, compileModel=True) self.critic = self.buildCriticNet(self.stateSize, 1, compileModel=True)
# actor NN # actor NN
self.actor = self.buildActorNet(self.stateSize, self.conActRange, compileModel=True) self.actor = self.buildActorNet(self.stateSize, compileModel=True)
# load weight to Critic&Actor NN # load weight to Critic&Actor NN
self.loadWeightToModels(loadModelDir) self.loadWeightToModels(self.loadModelDir)
print("---------Actor Model Load Success---------")
self.actor.summary()
print("---------Critic Model Load Success---------")
self.critic.summary()
# Build Net # Build Net
def buildActorNet(self, inputSize, continuousActionRange,compileModel): def buildActorNet(self, inputSize: int, compileModel: bool):
"""build Actor Nueral Net and compile.Output:[disAct1,disAct2,disAct3,mu,sigma] """build Actor Nueral Net and compile.Output:[disAct1,disAct2,disAct3,mu,sigma]
Args: Args:
inputSize (int): InputLayer Nueral size. inputSize (int): InputLayer Nueral size.
continuousActionRange (foat): continuous Action's max Range. compileModel (bool): compile Model or not.
Returns: Returns:
keras.Model: return Actor NN keras.Model: return Actor NN
""" """
stateInput = layers.Input(shape=(inputSize,), name='stateInput') # -----------Input Layers-----------
dense0 = layers.Dense(500, activation='relu',name='dense0',)(stateInput) stateInput = layers.Input(shape=(inputSize,), name="stateInput")
dense1 = layers.Dense(200, activation='relu',name='dense1',)(dense0)
dense2 = layers.Dense(100, activation='relu', name='dense2')(dense1)
disAct1 = layers.Dense(3, activation='softmax',name='WSAction')(dense2) # WS # -------Intermediate layers--------
disAct2 = layers.Dense(3, activation='softmax',name='ADAction')(dense2) # AD interLayers = []
disAct3 = layers.Dense(2, activation='softmax',name='ShootAction')(dense2) # Mouse shoot interLayersIndex = 0
mu = continuousActionRange * layers.Dense(1, activation='tanh', name='muOut')(dense2) # mu, the normal distribution's mean for neuralUnit in self.NNShape:
sigma = 1e-8 + layers.Dense(1, activation='softplus',name='sigmaOut')(dense2) # sigma, the normal distribution's sigma thisLayerName = "dense" + str(interLayersIndex)
# musig = layers.concatenate([mu,sigma],name = 'musig') if interLayersIndex == 0:
totalOut = layers.concatenate( interLayers.append(
[disAct1, disAct2, disAct3, mu, sigma], name='totalOut') # package layers.Dense(neuralUnit, activation="relu", name=thisLayerName)(stateInput)
)
else:
interLayers.append(
layers.Dense(neuralUnit, activation="relu", name=thisLayerName)(interLayers[-1])
)
interLayersIndex += 1
# ----------Output Layers-----------
outputLayersList = []
if self.disActSize != 0:
# while NN have discrete action output.
disActIndex = 0
for thisDisActDepth in self.disActShape:
thisDisActName = "disAct" + str(disActIndex)
outputLayersList.append(
layers.Dense(thisDisActDepth, activation="softmax", name=thisDisActName)(
interLayers[-1]
)
)
disActIndex += 1
if self.conActSize != 0:
# while NN have continuous action output.
mu = tf.multiply(
layers.Dense(1, activation="tanh", name="muOut")(interLayers[-1]), self.conActRange
) # mu, the normal distribution's location parameter
sigma = tf.add(
layers.Dense(1, activation="softplus", name="sigmaOut")(interLayers[-1]), EPS
) # sigma, the normal distribution's scale parameter
outputLayersList.append(mu)
outputLayersList.append(sigma)
totalOut = layers.concatenate(outputLayersList, name="totalOut") # package
# ----------Model Compile-----------
model = keras.Model(inputs=stateInput, outputs=totalOut) model = keras.Model(inputs=stateInput, outputs=totalOut)
#actorOPT = optimizers.Adam(learning_rate = self.actorLR) if compileModel: # Compile Model
if compileModel: actorOPT = optimizers.Adam(learning_rate=self.actorLR)
actorOPT = RAdam(self.actorLR)
model.compile(optimizer=actorOPT, loss=self.aLoss()) model.compile(optimizer=actorOPT, loss=self.aLoss())
return model return model
def buildCriticNet(self, inputSize, outputSize,compileModel): def buildCriticNet(self, inputSize: int, outputSize: int, compileModel: bool):
"""build Critic Nueral Net and compile.Output:[Q] """build Critic Nueral Net and compile.Output:[Q]
Args: Args:
inputSize (int): InputLayer Neural Size inputSize (int): input size
outputSize (float): Q size outputSize (int): output size
compileModel (bool): compile Model or not.
Returns: Returns:
keras.Model: return Critic NN keras.Model: return Critic NN
""" """
stateInput = keras.Input(shape=(inputSize,)) # -----------Input Layers-----------
dense0 = layers.Dense(500, activation='relu', stateInput = keras.Input(shape=(inputSize,), name="stateInput")
name='dense0',)(stateInput)
dense1 = layers.Dense(200, activation='relu')(dense0) # -------Intermediate layers--------
dense2 = layers.Dense(100, activation='relu')(dense1) interLayers = []
output = layers.Dense(outputSize)(dense2) interLayersIndex = 0
for neuralUnit in self.NNShape:
thisLayerName = "dense" + str(interLayersIndex)
if interLayersIndex == 0:
interLayers.append(
layers.Dense(neuralUnit, activation="relu", name=thisLayerName)(stateInput)
)
else:
interLayers.append(
layers.Dense(neuralUnit, activation="relu", name=thisLayerName)(interLayers[-1])
)
interLayersIndex += 1
# ----------Output Layers-----------
output = layers.Dense(outputSize, activation=None)(interLayers[-1])
# ----------Model Compile-----------
model = keras.Model(inputs=stateInput, outputs=output) model = keras.Model(inputs=stateInput, outputs=output)
if compileModel: if compileModel:
criticOPT = optimizers.Adam(learning_rate=self.criticLR) criticOPT = optimizers.Adam(learning_rate=self.criticLR)
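
In the hunk above, both buildActorNet and buildCriticNet now derive their hidden stack from PPOConfig.NNShape instead of the hard-coded 500/200/100 layers. A minimal standalone sketch of that pattern (the state size and layer widths below are placeholder values):

```python
from tensorflow import keras
from tensorflow.keras import layers

def build_mlp_body(state_size: int, nn_shape):
    # Shared NNShape-driven MLP body used by both the actor and critic heads.
    state_input = layers.Input(shape=(state_size,), name="stateInput")
    x = state_input
    for i, units in enumerate(nn_shape):
        x = layers.Dense(units, activation="relu", name="dense" + str(i))(x)
    return state_input, x

# Example: a critic head on top of the shared body.
state_input, body = build_mlp_body(state_size=30, nn_shape=[500, 200, 100])
critic = keras.Model(inputs=state_input, outputs=layers.Dense(1, activation=None)(body))
```
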
@ -110,36 +215,50 @@ class PPO(object):
return model return model
# loss Function # loss Function
# critic loss
def cLoss(self): def cLoss(self):
"""Critic Loss function """Critic Loss function"""
"""
def loss(y_true, y_pred): def loss(y_true, y_pred):
# y_true: discountedR # y_true: discountedR
# y_pred: critcV = model.predict(states) # y_pred: critcV = model.predict(states)
advantage = y_true - y_pred # TD error adv = y_true - y_pred # TD error
loss = tf.reduce_mean(tf.square(advantage)) loss = tf.reduce_mean(tf.square(adv))
return loss
return loss return loss
return loss
# actor loss
def aLoss(self): def aLoss(self):
def getDiscreteALoss(nowProbs,oldProbs,advantage): """Actor Loss function"""
def getDiscreteALoss(nowProbs, oldProbs, disOneHotAct, actShape, advantage):
"""get Discrete Action Loss """get Discrete Action Loss
Args: Args:
nowProbs (tf.constant): (length,actionSize) nowProbs (tf.constant): (length,actionProbSize)
oldProbs (tf.constant): (length,actionSize) oldProbs (tf.constant): (length,actionProbSize)
advantage (tf.constant): (length,) advantage (tf.constant): (length,)
Returns: Returns:
tf.constant: (length,) tf.constant: (length,)
""" """
entropy = tf.reduce_mean(tf.math.multiply(nowProbs,tf.math.log(nowProbs+1e-6))) entropy = tf.negative(
ratio = tf.math.divide(nowProbs,oldProbs+1e-6) tf.reduce_mean(tf.math.multiply(nowProbs, tf.math.log(nowProbs + EPS)))
value = tf.math.multiply(ratio,tf.expand_dims(advantage,axis = 1)) )
clipRatio = tf.clip_by_value(ratio,1. - self.EPSILON,1.+self.EPSILON) nowSingleProbs = tf.reduce_mean(tf.multiply(nowProbs, disOneHotAct), axis=1)
clipValue = tf.math.multiply(clipRatio,tf.expand_dims(advantage,axis = 1)) nowSingleProbs = tf.multiply(nowSingleProbs, actShape)
loss = -tf.reduce_mean(tf.math.minimum(value,clipValue)) + self.entropyWeight * entropy oldSingleProbs = tf.reduce_mean(tf.multiply(oldProbs, disOneHotAct), axis=1)
oldSingleProbs = tf.multiply(oldSingleProbs, actShape)
ratio = tf.math.divide(nowSingleProbs, oldSingleProbs + EPS)
value = tf.math.multiply(ratio, advantage)
clipRatio = tf.clip_by_value(ratio, 1.0 - self.clipRange, 1.0 + self.clipRange)
clipValue = tf.math.multiply(clipRatio, advantage)
loss = tf.math.negative(
tf.reduce_mean(tf.math.minimum(value, clipValue))
- tf.multiply(self.entropyWeight, entropy)
)
return loss return loss
def getContinuousALoss(musig, actions, oldProbs, advantage): def getContinuousALoss(musig, actions, oldProbs, advantage):
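
getDiscreteALoss now masks out the probability of the action actually taken (via the one-hot columns) before forming the PPO ratio and clipping it to [1 - clipRange, 1 + clipRange]. A small numeric sketch of that clipped-surrogate term with made-up values (entropy bonus omitted):

```python
import numpy as np

clip_range = 0.2
now_prob = np.array([0.60, 0.10])   # pi_new(a|s) for two samples
old_prob = np.array([0.50, 0.40])   # pi_old(a|s)
advantage = np.array([1.0, -1.0])

ratio = now_prob / (old_prob + 1e-10)
clipped_ratio = np.clip(ratio, 1.0 - clip_range, 1.0 + clip_range)
surrogate = np.minimum(ratio * advantage, clipped_ratio * advantage)
loss = -np.mean(surrogate)
print(ratio, surrogate, loss)   # ratio ~[1.2, 0.25], surrogate [1.2, -0.8], loss -0.2
```
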
@ -159,103 +278,286 @@ class PPO(object):
dist = tfp.distributions.Normal(mu, sigma) dist = tfp.distributions.Normal(mu, sigma)
nowProbs = dist.prob(actions) nowProbs = dist.prob(actions)
ratio = tf.math.divide(nowProbs,oldProbs+1e-6)
entropy = tf.reduce_mean(dist.entropy()) entropy = tf.reduce_mean(dist.entropy())
value = tf.math.multiply(ratio,tf.expand_dims(advantage,axis = 1)) ratio = tf.math.divide(nowProbs, oldProbs + EPS)
clipValue = tf.clip_by_value(ratio,1. - self.EPSILON,1.+self.EPSILON) * advantage value = tf.math.multiply(ratio, advantage)
loss = -tf.reduce_mean(tf.math.minimum(value,clipValue)) + self.entropyWeight * entropy clipRatio = tf.clip_by_value(ratio, 1.0 - self.clipRange, 1.0 + self.clipRange)
clipValue = tf.math.multiply(clipRatio, advantage)
loss = tf.negative(
tf.reduce_mean(tf.math.minimum(value, clipValue))
- tf.multiply(self.entropyWeight, entropy)
)
return loss return loss
def loss(y_true, y_pred): def loss(y_true, y_pred):
# y_true: [[disAct1, disAct2, disAct3, mu, sigma]] # y_true: [[disActProb..., conActProbs..., disOneHotActs..., conAct..., advantage]]
# y_pred: muSigma = self.actor(state) = # y_pred: [[disActProb..., mu, sigma...]]
# [[disAct1, disAct2, disAct3, mu, sigma]] totalALoss = 0
oldDisProbs = y_true[:,0:self.disOutputSize]
oldConMusigs = y_true[:,self.disOutputSize:self.disOutputSize+self.conActSize]
conActions = y_true[:,self.disOutputSize+self.conActSize:self.disOutputSize+(self.conActSize*2)]
advantage = y_true[:,-1]
nowDisProbs = y_pred[:,0:self.disOutputSize] # [disAct1, disAct2, disAct3]
nowConMusigs = y_pred[:,self.disOutputSize:] #[musig1,musig2]
totalALoss = tf.constant([0.])
totalActionNum = 0 totalActionNum = 0
advantage = tf.expand_dims(y_true[:, -1], axis=1)
# for nowProb,oldProb in zip(tf.transpose(nowDisProbs,perm=[1,0,2]),tf.transpose(oldDisProbs,perm=[1,0,2])): if self.disActSize != 0:
# while NN have discrete action output.
oldDisProbs = y_true[:, 0 : self.disOutputSize]
nowDisProbs = y_pred[:, 0 : self.disOutputSize] # [disAct1, disAct2, disAct3]
disOneHotActs = y_true[
:,
self.disOutputSize
+ self.conActSize : self.disOutputSize
+ self.conActSize
+ self.disOutputSize,
]
lastDisActShape = 0 lastDisActShape = 0
for shape in self.disActShape: for thisShape in self.disActShape:
thisNowDisProbs = nowDisProbs[:,lastDisActShape:lastDisActShape+shape] thisNowDisProbs = nowDisProbs[:, lastDisActShape : lastDisActShape + thisShape]
thisOldDisProbs = oldDisProbs[:,lastDisActShape:lastDisActShape+shape] thisOldDisProbs = oldDisProbs[:, lastDisActShape : lastDisActShape + thisShape]
discreteALoss = getDiscreteALoss(thisNowDisProbs,thisOldDisProbs,advantage) thisDisOneHotActs = disOneHotActs[
lastDisActShape += shape :, lastDisActShape : lastDisActShape + thisShape
]
discreteALoss = getDiscreteALoss(
thisNowDisProbs, thisOldDisProbs, thisDisOneHotActs, thisShape, advantage
)
lastDisActShape += thisShape
totalALoss += discreteALoss totalALoss += discreteALoss
totalActionNum += 1 totalActionNum += 1.0
# for nowConMusig,conAction,oldPiProb in zip(tf.transpose(nowConMusigs,perm=[1,0,2]),conActions,oldPiProbs): if self.conActSize != 0:
# while NN have continuous action output.
oldConProbs = y_true[:, self.disOutputSize : self.disOutputSize + self.conActSize]
conActions = y_true[
:,
self.disOutputSize
+ self.conActSize : self.disOutputSize
+ self.conActSize
+ self.conActSize,
]
nowConMusigs = y_pred[:, self.disOutputSize :] # [musig1,musig2]
lastConAct = 0 lastConAct = 0
for act in range(self.conActSize): for conAct in range(self.conActSize):
thisNowConMusig = nowConMusigs[:,lastConAct:lastConAct+((act+1)*2)] thisNowConMusig = nowConMusigs[:, lastConAct : lastConAct + self.muSigSize]
thisOldConMusig = oldConMusigs[:,lastConAct:lastConAct+((act+1)*2)] thisOldConProb = oldConProbs[:, conAct : conAct + 1]
thisConAction = conActions[:,act] thisConAction = conActions[:, conAct]
continuousAloss = getContinuousALoss(thisNowConMusig,thisConAction,thisOldConMusig,advantage) continuousAloss = getContinuousALoss(
thisNowConMusig, thisConAction, thisOldConProb, advantage
)
totalALoss += continuousAloss totalALoss += continuousAloss
totalActionNum += 1 totalActionNum += 1.0
lastConAct += self.muSigSize
loss = tf.divide(totalALoss, totalActionNum) loss = tf.divide(totalALoss, totalActionNum)
return loss return loss
return loss return loss
# get Action&V # get Actions&values
def chooseAction(self, state): def chooseAction(self, state: ndarray):
"""Agent choose action to take """Agent choose action to take
Args: Args:
state (np.array): enviroment state state (ndarray): enviroment state
Returns: Returns:
np.array: np.array:
disAct1, actions,
discreteAction1 actions list,2dims like [[0],[1],[1.5]]
disAct2,
discreteAction2
disAct3,
discreteAction3
conAction,
continuousAction
predictResult, predictResult,
actor NN predict Result output actor NN predict Result output
""" """
# let actor choose action,use the normal distribution # let actor choose action,use the normal distribution
# state = np.expand_dims(state,0) # state = np.expand_dims(state,0)
# check state dimension is [1,statesize] # check state dimension is [stateNum,statesize]
if state.ndim != 2: if state.ndim != 2:
state = state.reshape([1,self.stateSize]) stateNum = int(len(state) / self.stateSize)
state = state.reshape([stateNum, self.stateSize])
predictResult = self.actor(state) # get predict result [[disAct1, disAct2, disAct3, musig]] predictResult = self.actor(state) # get predict result [[disAct1, disAct2, disAct3, musig]]
predictResult = predictResult.numpy() # print("predictResult",predictResult)
disAct1Prob = predictResult[0][0:3] # predictResult = predictResult.numpy()
disAct2Prob = predictResult[0][3:6] actions = []
disAct3Prob = predictResult[0][6:8] if self.disActSize != 0:
mu = predictResult[0][8] # while NN have discrete action output.
sigma = predictResult[0][9] lastDisActShape = 0
if math.isnan(mu) or math.isnan(sigma): for shape in self.disActShape:
thisDisActProbs = predictResult[:, lastDisActShape : lastDisActShape + shape]
dist = tfp.distributions.Categorical(probs=thisDisActProbs, dtype=tf.float32)
action = int(dist.sample().numpy()[0])
# action = np.argmax(thisDisActProbs)
actions.append(action)
lastDisActShape += shape
if self.conActSize != 0:
# while NN have continuous action output.
lastConAct = 0
for actIndex in range(self.conActSize):
thisMu = predictResult[:, self.disOutputSize + lastConAct]
thisSig = predictResult[:, self.disOutputSize + lastConAct + 1]
if math.isnan(thisMu) or math.isnan(thisSig):
# check mu or sigma is nan # check mu or sigma is nan
print("mu or sigma is nan") print("chooseAction:mu or sigma is nan")
thisDist = np.random.normal(loc=thisMu, scale=thisSig)
actions.append(np.clip(thisDist, -self.conActRange, self.conActRange))
lastConAct += 2
return actions, predictResult
disAct1 = np.argmax(disAct1Prob) # WS 0 or 1 or 2 def trainCritcActor(
disAct2 = np.argmax(disAct2Prob) # AD 0 or 1 or 2 self,
disAct3 = np.argmax(disAct3Prob) # mouse shoot 0 or 1 states: ndarray,
normDist = np.random.normal(loc=mu, scale=sigma) # normalDistribution oldActorResult: ndarray,
conAction = np.clip(normDist, -self.conActRange, actions: ndarray,
self.conActRange) # randomly sample an action from the normal distribution rewards: ndarray,
return disAct1, disAct2, disAct3, conAction, predictResult dones: ndarray,
nextState: ndarray,
epochs: int = None,
):
"""train critic&actor use PPO ways
def getCriticV(self, state): Args:
states (ndarray): states
oldActorResult (ndarray): actor predict result
actions (ndarray): predicted actions include both discrete actions and continuous actions
rewards (ndarray): rewards from enviroment
dones (ndarray): dones from enviroment
nextState (ndarray): next state from enviroment
epochs (int, optional): train epochs,default to ppoConfig. Defaults to None.
Returns:
tf.constant: criticLoss, actorLoss
"""
if epochs == None:
epochs = self.trainEpochs
criticValues = self.getCriticV(state=states)
discountedR = self.discountReward(nextState, criticValues, dones, rewards)
advantage = self.getGAE(discountedR, criticValues)
criticLoss = self.trainCritic(states, discountedR, epochs)
actorLoss = self.trainActor(states, oldActorResult, actions, advantage, epochs)
# print("A_Loss:", actorLoss, "C_Loss:", criticLoss)
return criticLoss, actorLoss
def trainCritic(self, states: ndarray, discountedR: ndarray, epochs: int = None):
"""critic NN trainning function
Args:
states (ndarray): states
discountedR (ndarray): discounted rewards
epochs (int, optional): train epochs,default to ppoConfig. Defaults to None.
Returns:
tf.constant: all critic losses
"""
if epochs == None:
epochs = self.trainEpochs
his = self.critic.fit(x=states, y=discountedR, epochs=epochs, verbose=0)
return his.history["loss"]
def trainActor(
self,
states: ndarray,
oldActorResult: ndarray,
actions: ndarray,
advantage: ndarray,
epochs: int = None,
):
"""actor NN trainning function
Args:
states (ndarray): states
oldActorResult (ndarray): actor predict results
actions (ndarray): acotor predict actions
advantage (ndarray): GAE advantage
epochs (int, optional): train epochs,default to ppoConfig. Defaults to None.
Returns:
tf.constant: all actor losses
"""
# Trian Actor
# states: Buffer States
# actions: Buffer Actions
# discountedR: Discounted Rewards
# Epochs: just Epochs
if epochs == None:
epochs = self.trainEpochs
actions = np.asarray(actions, dtype=np.float32)
disActions = actions[:, 0 : self.disActSize]
conActions = actions[:, self.disActSize :]
oldDisProbs = oldActorResult[:, 0 : self.disOutputSize] # [disAct1, disAct2, disAct3]
oldConMusigs = oldActorResult[:, self.disOutputSize :] # [musig1,musig2]
if self.disActSize != 0:
disOneHotActs = self.getOneHotActs(disActions)
if self.conActSize != 0:
# while NN have discrete6 & continuous actions output.
oldPiProbs = self.conProb(oldConMusigs[:, 0], oldConMusigs[:, 1], conActions)
# pack [oldDisProbs,oldPiProbs,conActions,advantage] as y_true
y_true = np.hstack((oldDisProbs, oldPiProbs, disOneHotActs, conActions, advantage))
else:
# while NN have only discrete actions output.
# pack [oldDisProbs,advantage] as y_true
y_true = np.hstack((oldDisProbs, disOneHotActs, advantage))
else:
if self.conActSize != 0:
# while NN have only continuous action output.
oldPiProbs = self.conProb(oldConMusigs[:, 0], oldConMusigs[:, 1], conActions)
# pack [oldPiProbs,conActions,advantage] as y_true
y_true = np.hstack((oldPiProbs, conActions, advantage))
else:
print("trainActor:disActSize & conActSize error")
time.sleep(999999)
# assembly Actions history
# train start
if np.any(tf.math.is_nan(y_true)):
print("y_true got nan")
print("y_true", y_true)
his = self.actor.fit(x=states, y=y_true, epochs=epochs, verbose=0)
if np.any(tf.math.is_nan(his.history["loss"])):
print("his.history['loss'] is nan!")
print(his.history["loss"])
return his.history["loss"]
def saveWeights(self, score: float):
"""save now NN's Weight. Use "models.save_weights" method.
Save as "tf" format "ckpt" file.
Args:
score (float): now score
"""
actor_save_dir = (
self.saveDir + datetime.datetime.now().strftime("%H%M%S") + "/actor/" + "actor.ckpt"
)
critic_save_dir = (
self.saveDir + datetime.datetime.now().strftime("%H%M%S") + "/critic/" + "critic.ckpt"
)
self.actor.save_weights(actor_save_dir, save_format="tf")
self.critic.save_weights(critic_save_dir, save_format="tf")
# create an empty file named as score to recored score
score_dir = (
self.saveDir + datetime.datetime.now().strftime("%H%M%S") + "/" + str(round(score))
)
scorefile = open(score_dir, "w")
scorefile.close()
print("Model's Weights Saved")
def loadWeightToModels(self, loadDir: str):
"""load NN Model. Use "models.load_weights()" method.
Load "tf" format "ckpt" file.
Args:
loadDir (str): Model dir
"""
actorDir = loadDir + "/actor/" + "actor.ckpt"
criticDir = loadDir + "/critic/" + "critic.ckpt"
self.actor.load_weights(actorDir)
self.critic.load_weights(criticDir)
print("++++++++++++++++++++++++++++++++++++")
print("++++++++++++Model Loaded++++++++++++")
print(loadDir)
print("++++++++++++++++++++++++++++++++++++")
def getCriticV(self, state: ndarray):
"""get Critic predict V value """get Critic predict V value
Args: Args:
state (np.array): Env state state (ndarray): Env state
Returns: Returns:
tensor: retrun Critic predict result tensor: retrun Critic predict result
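
In the hunk above, trainActor packs everything the custom aLoss needs into one y_true matrix: old discrete probabilities, old continuous action probabilities, one-hot discrete actions, continuous actions, and the advantage column. A shape-check sketch for this project's disActShape of [3, 3, 2] with one continuous action (random placeholder values):

```python
import numpy as np

N = 4                                    # batch size
dis_output_size = 3 + 3 + 2              # sum(disActShape)
old_dis_probs = np.random.rand(N, dis_output_size)
old_con_probs = np.random.rand(N, 1)     # probability of the taken continuous action
dis_onehot_acts = np.random.rand(N, dis_output_size)
con_actions = np.random.rand(N, 1)
advantage = np.random.rand(N, 1)

y_true = np.hstack((old_dis_probs, old_con_probs, dis_onehot_acts, con_actions, advantage))
print(y_true.shape)                      # (4, 19) = 8 + 1 + 8 + 1 + 1 columns
```
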
@ -263,41 +565,84 @@ class PPO(object):
# if state.ndim < 2: # if state.ndim < 2:
# state = np.expand_dims(state,0) # state = np.expand_dims(state,0)
if state.ndim != 2: if state.ndim != 2:
state = state.reshape([1,self.stateSize]) stateNum = int(len(state) / self.stateSize)
state = state.reshape([stateNum, self.stateSize])
return self.critic.predict(state) return self.critic.predict(state)
def discountReward(self, nextState, rewards): def discountReward(self, nextState: ndarray, values: ndarray, dones: ndarray, rewards: ndarray):
"""Discount future rewards """Discount future rewards
Args: Args:
nextState (np.array): next Env state nextState (ndarray): next Env state
rewards (np.array): reward list of this episode values (ndarray): critic predict values
dones (ndarray): dones from enviroment
rewards (ndarray): reward list of this episode
Returns: Returns:
np.array: discounted rewards list,same shape as rewards that input ndarray: discounted rewards list,same shape as rewards that input
"""
""" """
# discount future rewards
nextV = self.getCriticV(nextState) nextV = self.getCriticV(nextState)
dones = 1 - dones
discountedRewards = [] discountedRewards = []
for r in rewards[::-1]: for i in reversed(range(len(rewards))):
nextV = r + self.GAMMA*nextV nextV = rewards[i] + dones[i] * self.gamma * nextV
discountedRewards.append(nextV) discountedRewards.append(nextV)
discountedRewards.reverse() # \ESREVER/ discountedRewards.reverse() # reverse
discountedRewards = np.squeeze(discountedRewards) discountedRewards = np.squeeze(discountedRewards)
discountedRewards = np.expand_dims(discountedRewards, axis=1) discountedRewards = np.expand_dims(discountedRewards, axis=1)
# discountedRewards = np.array(discountedRewards)[:, np.newaxis] # discountedRewards = np.array(discountedRewards)[:, np.newaxis]
return discountedRewards return discountedRewards
"""
"""
nextV = self.getCriticV(nextState)
discountedRewards = []
for r in rewards[::-1]:
nextV = r + self.gamma * nextV
discountedRewards.append(nextV)
discountedRewards.reverse() # reverse
discountedRewards = np.squeeze(discountedRewards)
discountedRewards = np.expand_dims(discountedRewards, axis=1)
# discountedRewards = np.array(discountedRewards)[:, np.newaxis]
print(discountedRewards)
return discountedRewards
"""
g = 0
discountedRewards = []
lastValue = self.getCriticV(nextState)
values = np.append(values, lastValue, axis=0)
dones = 1 - dones
for i in reversed(range(len(rewards))):
delta = rewards[i] + self.gamma * values[i + 1] * dones[i] - values[i]
g = delta + self.gamma * self.lmbda * dones[i] * g
discountedRewards.append(g + values[i])
discountedRewards.reverse()
return np.asarray(discountedRewards)
def conProb(self, mu, sig, x): def getGAE(self, discountedRewards: ndarray, values: ndarray):
"""compute GAE adcantage
Args:
discountedRewards (ndarray): discounted rewards
values (ndarray): critic predict values
Returns:
ndarray: GAE advantage
"""
advantage = discountedRewards - values
advantage = (advantage - np.mean(advantage)) / (np.std(advantage) + EPS)
return advantage
def conProb(self, mu: ndarray, sig: ndarray, x: ndarray):
"""calculate probability when x in Normal distribution(mu,sigma) """calculate probability when x in Normal distribution(mu,sigma)
Args: Args:
mu (np,array): mu mu (ndarray): mu
sig (np.array): sigma sig (ndarray): sigma
x (np.array): x x (ndarray): x
Returns: Returns:
np.array: probabilities ndarray: probability
""" """
# 获取在正态分布mu,sig下当取x值时的概率 # 获取在正态分布mu,sig下当取x值时的概率
# return shape : (length,1) # return shape : (length,1)
@ -313,116 +658,58 @@ class PPO(object):
# prob = dist*tf.exp(-tf.square(x-mu)/(2.*tf.square(sig)))
return prob
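The body of conProb is elided by the hunk above; for reference, the density of x under Normal(mu, sigma) is usually computed as in the plain-NumPy sketch below (an illustration, not necessarily the exact TensorFlow implementation used in this class):

import numpy as np

def normalPdf(mu, sig, x, eps=1e-8):
    # 1 / (sqrt(2*pi) * sigma) * exp(-(x - mu)^2 / (2 * sigma^2))
    coeff = 1.0 / (np.sqrt(2.0 * np.pi) * sig + eps)
    return coeff * np.exp(-np.square(x - mu) / (2.0 * np.square(sig) + eps))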
def getOneHotActs(self, disActions):
"""one hot action encoder
Args:
disActions (ndarray): discrete actions
Returns:
ndarray: one hot actions
"""
actIndex = 0
for thisShape in self.disActShape:
thisActs = disActions[:, actIndex]
thisOneHotAct = tf.squeeze(tf.one_hot(thisActs, thisShape)).numpy()
if actIndex == 0:
oneHotActs = thisOneHotAct
else:
oneHotActs = np.append(oneHotActs, thisOneHotAct, axis=1)
actIndex += 1
return oneHotActs
def getAverageEntropy(self, probs: ndarray):
"""get average discrete & continuous action entropies
Args:
probs (ndarray): actor NN predict result
Returns:
float: average total entropy
list: discrete entropies
list: continuous entropies
"""
discreteEntropys = []
continuousEntropys = []
if self.disActSize != 0:
disProbs = probs[:, 0 : self.disOutputSize]
lastDisActIndex = 0
for actShape in self.disActShape:
thisDisProbs = disProbs[:, lastDisActIndex : lastDisActIndex + actShape]
lastDisActIndex += actShape
discreteEntropys.append(
tf.negative(
tf.reduce_mean(
tf.math.multiply(thisDisProbs, tf.math.log(thisDisProbs + EPS))
)
)
)
if self.conActSize != 0:
conProbs = probs[:, self.disOutputSize :]
conActIndex = 0
for i in range(self.conActSize):
thisConProbs = conProbs[:, conActIndex : conActIndex + 2]
conActIndex += 2
continuousEntropys.append(tf.reduce_mean(thisConProbs[:, 1]))
averageEntropy = np.mean([np.mean(discreteEntropys), np.mean(continuousEntropys)])
return averageEntropy, discreteEntropys, continuousEntropys

View File

@ -0,0 +1,65 @@
import numpy as np
class PPOBuffer(object):
def __init__(self):
self.states = []
self.actorProbs = []
self.actions = []
self.rewards = []
self.dones = []
print("√√√√√Buffer Initialized Success√√√√√")
def clearBuffer(self):
self.states = []
self.actorProbs = []
self.actions = []
self.rewards = []
self.dones = []
def getStates(self):
return self.standDims(np.asarray(self.states))
def getActorProbs(self):
return self.standDims(np.asarray(self.actorProbs))
def getActions(self):
return self.standDims(np.asarray(self.actions))
def getRewards(self):
return self.standDims(np.asarray(self.rewards))
def getDones(self):
return self.standDims(np.asarray(self.dones))
def saveState(self, state):
self.states.append(state)
def saveAction(self, action):
self.actions.append(action)
def saveReward(self, reward):
self.rewards.append(reward)
def standDims(self, data):
# standardize the data's dimensions
if np.ndim(data) > 2:
return np.squeeze(data, axis=1)
elif np.ndim(data) < 2:
return np.expand_dims(data, axis=1)
else:
return np.asarray(data)
def saveBuffers(self, state, actorProb, action, reward, done):
self.states.append(state)
self.actorProbs.append(actorProb)
self.actions.append(action)
self.rewards.append(reward)
self.dones.append(done)
"""
print("self.states", self.states)
print("self.actions", self.actions)
print("self.rewards", self.rewards)
print("self.dones", self.dones)
print("self.values", self.values)
"""

View File

@ -0,0 +1,15 @@
import datetime
from typing import NamedTuple, Optional
class PPOConfig(NamedTuple):
NNShape: list = [256, 256, 128]
actorLR: float = 2e-3 # Actor Net Learning Rate
criticLR: float = 2e-3 # Critic Net Learning Rate
gamma: float = 0.99
lmbda: float = 0.95
clipRange: float = 0.20
entropyWeight: float = 1e-2
trainEpochs: int = 8
saveDir: str = "PPO-Model/" + datetime.datetime.now().strftime("%m%d-%H%M") + "/"
loadModelDir: Optional[str] = None
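Because PPOConfig is a NamedTuple, individual hyperparameters can be overridden at construction time, e.g. (a minimal sketch, assuming PPOConfig is imported from this module):

config = PPOConfig(actorLR=1e-4, trainEpochs=10)
print(config.gamma)    # 0.99, unchanged default
print(config.saveDir)  # e.g. "PPO-Model/1011-0640/", fixed when the module is imported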

View File

@ -0,0 +1,58 @@
import matplotlib.pyplot as plt
class PPOHistory(object):
def __init__(self):
self.meanRewards = []
self.entropys = []
self.actorLosses = []
self.criticLosses = []
def saveHis(self, rewards, entropys, aLosses, cLosses):
self.meanRewards.extend([rewards])
self.entropys.extend([entropys])
self.actorLosses.extend(aLosses)
self.criticLosses.extend(cLosses)
def drawHis(self):
plt.figure(figsize=(21, 13), facecolor="#011627")
ax = plt.subplot(2, 2, 1)
ax.set_facecolor("#011627")
ax.spines["top"].set_color("#c9d2df")
ax.spines["bottom"].set_color("#c9d2df")
ax.spines["left"].set_color("#c9d2df")
ax.spines["right"].set_color("#c9d2df")
ax.plot(
range(len(self.meanRewards)), self.meanRewards, color="#c9d2df", label="AverageRewards"
)
ax.set_title("meanRewards", color="#c9d2df")
ax = plt.subplot(2, 2, 2)
ax.set_facecolor("#011627")
ax.spines["top"].set_color("#c9d2df")
ax.spines["bottom"].set_color("#c9d2df")
ax.spines["left"].set_color("#c9d2df")
ax.spines["right"].set_color("#c9d2df")
ax.plot(range(len(self.entropys)), self.entropys, color="#c9d2df", label="AverageEntropys")
ax.set_title("entropys", color="#c9d2df")
ax = plt.subplot(2, 2, 3)
ax.set_facecolor("#011627")
ax.spines["top"].set_color("#c9d2df")
ax.spines["bottom"].set_color("#c9d2df")
ax.spines["left"].set_color("#c9d2df")
ax.spines["right"].set_color("#c9d2df")
ax.plot(
range(len(self.actorLosses)), self.actorLosses, color="#c9d2df", label="actorLosses"
)
ax.set_title("actorLosses", color="#c9d2df")
ax = plt.subplot(2, 2, 4)
ax.set_facecolor("#011627")
ax.spines["top"].set_color("#c9d2df")
ax.spines["bottom"].set_color("#c9d2df")
ax.spines["left"].set_color("#c9d2df")
ax.spines["right"].set_color("#c9d2df")
ax.plot(
range(len(self.criticLosses)), self.criticLosses, color="#c9d2df", label="criticLosses"
)
ax.set_title("criticLosses", color="#c9d2df")
plt.show()
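The four subplots repeat the same styling; one optional way to factor it out (a refactor sketch, not part of this commit) would be:

import matplotlib.pyplot as plt

def styledAxes(index, title, fg="#c9d2df", bg="#011627"):
    # one subplot with the shared dark theme used by drawHis
    ax = plt.subplot(2, 2, index)
    ax.set_facecolor(bg)
    for side in ("top", "bottom", "left", "right"):
        ax.spines[side].set_color(fg)
    ax.set_title(title, color=fg)
    return ax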

View File

@ -1,8 +1,8 @@
from mlagents_envs.base_env import ActionTuple
from mlagents_envs.environment import UnityEnvironment
import numpy as np
from numpy import ndarray
class makeEnv(object):
@ -22,69 +22,71 @@ class makeEnv(object):
self.BEHA_SPECS = self.env.behavior_specs
self.BEHA_NAME = list(self.BEHA_SPECS)[0]
self.SPEC = self.BEHA_SPECS[self.BEHA_NAME]
self.OBSERVATION_SPECS = self.SPEC.observation_specs[0]  # observation spec
self.ACTION_SPEC = self.SPEC.action_spec  # action specs
self.DISCRETE_SIZE = self.ACTION_SPEC.discrete_size  # size of the discrete action branch
self.DISCRETE_SHAPE = list(self.ACTION_SPEC.discrete_branches)
self.CONTINUOUS_SIZE = self.ACTION_SPEC.continuous_size  # size of the continuous actions
self.STATE_SIZE = self.OBSERVATION_SPECS.shape[0] - self.LOAD_DIR_SIZE_IN_STATE  # number of environment observation values
print("√√√√√Enviroment Initialized Success√√√√√")
def step(
self,
actions: list,
behaviorName: ndarray = None,
trackedAgent: ndarray = None,
):
"""convert the actions list to an ActionTuple, then send it to the environment
Args:
actions (list): PPO chooseAction output action list
behaviorName (ndarray, optional): behaviorName. Defaults to None.
trackedAgent (ndarray, optional): trackedAgent ID. Defaults to None.
Returns:
ndarray: nextState, reward, done, loadDir, saveNow
"""
# take the action in the environment
# return nextState, reward, done
if self.DISCRETE_SIZE == 0:
# create an empty discrete action
discreteActions = np.asarray([[0]])
else:
# create discrete actions from the actions list
discreteActions = np.asanyarray([actions[0:self.DISCRETE_SIZE]])
if self.CONTINUOUS_SIZE == 0:
# create an empty continuous action
continuousActions = np.asanyarray([[0.0]])
else:
# create continuous actions from the actions list
continuousActions = np.asanyarray(actions[self.DISCRETE_SIZE:])
if behaviorName is None:
behaviorName = self.BEHA_NAME
if trackedAgent is None:
trackedAgent = self.TRACKED_AGENT
# create the ActionTuple
thisActionTuple = ActionTuple(continuous=continuousActions, discrete=discreteActions)
# send the action to the env
self.env.set_actions(behavior_name=behaviorName, action=thisActionTuple)
self.env.step()
# get nextState & reward & done after this action
nextState, reward, done, loadDir, saveNow = self.getSteps(behaviorName, trackedAgent)
return nextState, reward, done, loadDir, saveNow
def getSteps(self, behaviorName=None, trackedAgent=None):
"""get the environment's current observations:
State, Reward, Done, LoadDir, SaveNow
Args:
behaviorName (_type_, optional): behaviorName. Defaults to None.
trackedAgent (_type_, optional): trackedAgent. Defaults to None.
Returns:
ndarray: nextState, reward, done, loadDir, saveNow
"""
# get nextState & reward & done
if behaviorName is None:
behaviorName = self.BEHA_NAME
@ -94,25 +96,17 @@ class makeEnv(object):
if trackedAgent is None:
trackedAgent = self.TRACKED_AGENT
if trackedAgent in decisionSteps:  # while the episode is still running, the state is stored in decision_steps
nextState = decisionSteps[trackedAgent].obs[0]
nextState = np.reshape(nextState, [1, self.STATE_SIZE + self.LOAD_DIR_SIZE_IN_STATE])
saveNow = nextState[0][-1]
loadDir = nextState[0][-3:-1]
nextState = nextState[0][:-3]
reward = decisionSteps[trackedAgent].reward
done = False
if trackedAgent in terminalSteps:  # when the episode has ended, the state is stored in terminal_steps
nextState = terminalSteps[trackedAgent].obs[0]
nextState = np.reshape(nextState, [1, self.STATE_SIZE + self.LOAD_DIR_SIZE_IN_STATE])
saveNow = nextState[0][-1]
loadDir = nextState[0][-3:-1]
nextState = nextState[0][:-3]
@ -121,9 +115,16 @@ class makeEnv(object):
return nextState, reward, done, loadDir, saveNow
def reset(self):
"""reset the environment and get the initial observations
Returns:
ndarray: nextState, reward, done, loadDir, saveNow
"""
self.env.reset()
nextState, reward, done, loadDir, saveNow = self.getSteps()
return nextState, reward, done, loadDir, saveNow
def render(self):
"""render the environment"""
self.env.render()
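A minimal driving loop for the wrapper (build path, ports and the random action choice are placeholders for illustration):

import numpy as np
import aimBotEnv

env = aimBotEnv.makeEnv(envPath="./Build/Aimbot-PPO", workerID=1, basePort=200)
state, reward, done, loadDir, saveNow = env.reset()

while not done:
    # actions = [discrete branch choices..., continuous values...]
    actions = list(np.random.randint(0, 2, env.DISCRETE_SIZE)) + list(
        np.random.uniform(-1.0, 1.0, env.CONTINUOUS_SIZE)
    )
    state, reward, done, loadDir, saveNow = env.step(actions)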

View File

@ -1,29 +0,0 @@
import numpy as np
class buffer(object):
def __init__(self):
self.states = []
self.actions = []
self.rewards = []
print("√√√√√Buffer Initialized Success√√√√√")
def clearBuffer(self):
self.states = []
self.actions = []
self.rewards = []
def getStates(self):
return np.asarray(self.states)
def getActions(self):
return np.asarray(self.actions)
def getRewards(self):
return np.asarray(self.rewards)
def saveState(self,state):
self.states.append(state)
def saveAction(self,action):
self.actions.append(action)
def saveReward(self,reward):
self.rewards.append(reward)
def saveBuffers(self,state,action,reward):
self.states.append(state)
self.actions.append(action)
self.rewards.append(reward)

View File

@ -1,356 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import aimBotEnv\n",
"import PPO\n",
"import buffer\n",
"import numpy as np\n",
"\n",
"import tensorflow as tf\n",
"import time\n",
"import datetime\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Attempts to allocate only the GPU memory needed for allocation\n",
"physical_devices = tf.config.list_physical_devices('GPU')\n",
"tf.config.experimental.set_memory_growth(physical_devices[0], True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Env\n",
"ENV_PATH = \"./Build-CloseEnemyCut/Aimbot-PPO\"\n",
"WORKER_ID = 1\n",
"BASE_PORT = 200\n",
"\n",
"MAX_EP = 1000\n",
"EP_LENGTH = 100000\n",
"GAMMA = 0.99 # discount future reward (UP?)\n",
"EPSILON = 0.2 # clip Ratio range[1-EPSILON,1+EPSILON]\n",
"ACTOR_LR = 1e-5 # LR\n",
"CRITIC_LR = 2e-5 # LR\n",
"BATCH = 256 # learning step\n",
"ACTOR_EPOCH = 15 # epoch\n",
"CRITIC_EPOCH = 15 # epoch\n",
"ENTROPY_WHEIGHT = 0.001 # sigma's entropy in Actor loss\n",
"ACTION_INTERVAL = 1 # take action every ACTION_INTERVAL steps\n",
"\n",
"\n",
"TRAIN = True\n",
"SAVE_DIR = \"PPO-Model/\" + datetime.datetime.now().strftime(\"%m%d%H%M\") + \"/\"\n",
"LOAD_DIR = None\n",
"\n",
"CTN_ACTION_RANGE = 10\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"√√√√√Enviroment Initialized Success√√√√√\n",
"√√√√√Buffer Initialized Success√√√√√\n",
"No loadDir specified,Create a New Model\n",
"CONTINUOUS_SIZE 1\n",
"DISCRETE_SIZE 5\n",
"STATE_SIZE 30\n"
]
}
],
"source": [
"# initialize enviroment & buffer class\n",
"env = aimBotEnv.makeEnv(\n",
" envPath=ENV_PATH, workerID=WORKER_ID, basePort=BASE_PORT\n",
")\n",
"epBuffer = buffer.buffer()\n",
"\n",
"STATE_SIZE = env.STATE_SIZE\n",
"CONTINUOUS_SIZE = env.CONTINUOUS_SIZE\n",
"DISCRETE_SIZE = env.DISCRETE_SIZE\n",
"_, _, _, loadDir, _ = env.getSteps()\n",
"\n",
"# check load model or not\n",
"if np.any(loadDir == 0):\n",
" # create a new model\n",
" print(\"No loadDir specified,Create a New Model\")\n",
" LOAD_DIR = None\n",
"else:\n",
" # load model\n",
" loadDirDateSTR = str(int(loadDir[0]))\n",
" loadDirTimeSTR = str(int(loadDir[1]))\n",
" if len(loadDirDateSTR) != 8:\n",
" # fill lost 0 while converse float to string\n",
" for _ in range(8 - len(loadDirDateSTR)):\n",
" loadDirDateSTR = \"0\" + loadDirDateSTR\n",
" if len(loadDirTimeSTR) != 6:\n",
" # fill lost 0 while converse float to string\n",
" for _ in range(6 - len(loadDirTimeSTR)):\n",
" loadDirTimeSTR = \"0\" + loadDirTimeSTR\n",
" LOAD_DIR = \"PPO-Model/\" + loadDirDateSTR + \"/\" + loadDirTimeSTR\n",
" print(\"Load Model:\")\n",
" print(LOAD_DIR)\n",
"\n",
"print(\"CONTINUOUS_SIZE\", CONTINUOUS_SIZE)\n",
"print(\"DISCRETE_SIZE\", DISCRETE_SIZE)\n",
"print(\"STATE_SIZE\", STATE_SIZE)\n",
"\n",
"disActShape = [3, 3, 2]\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def actToKey(disAct1,disAct2,disAct3,conAct):\n",
" kW = 0\n",
" kS = 0\n",
" kA = 0\n",
" kD = 0\n",
" mouseShoot = 0\n",
" if disAct1 == 0:\n",
" kW = 0\n",
" kS = 1\n",
" elif disAct1 == 1:\n",
" kW = 0\n",
" kS = 0\n",
" elif disAct1 == 2:\n",
" kW = 1\n",
" kS = 0\n",
" if disAct2 == 0:\n",
" kA = 0\n",
" kD = 1\n",
" elif disAct2 == 1:\n",
" kA = 0\n",
" kD = 0\n",
" elif disAct2 == 2:\n",
" kA = 1\n",
" kD = 0\n",
" mouseShoot = disAct3\n",
" return kW,kS,kA,kD,mouseShoot,conAct"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"EP 0 START\n",
"√√√√√Buffer Initialized Success√√√√√\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\numpy\\core\\fromnumeric.py:3474: RuntimeWarning: Mean of empty slice.\n",
" return _methods._mean(a, axis=axis, dtype=dtype,\n",
"c:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\numpy\\core\\_methods.py:189: RuntimeWarning: invalid value encountered in double_scalars\n",
" ret = ret.dtype.type(ret / rcount)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"A_Loss: 0.4477495511372884 C_Loss: 3.155759557088216\n",
"A_Loss: 0.14549287557601928 C_Loss: 0.5123071213563283\n",
"A_Loss: 0.055241942902406055 C_Loss: 0.13002794484297434\n",
"A_Loss: 0.057325509190559384 C_Loss: 0.11068039039770762\n",
"A_Loss: 0.04376962607105573 C_Loss: 0.03923700377345085\n"
]
}
],
"source": [
"bestScore = 200.0\n",
"stopTrainCounter = 0\n",
"\n",
"totalRewardHis = []\n",
"totalActorLossHis = []\n",
"totalCriticLossHis = []\n",
"epHis = []\n",
"maxTotalReward = -99999999999\n",
"\n",
"for ep in range(MAX_EP):\n",
" print(\"EP \", ep, \" START\")\n",
" # first time run game\n",
" s, _, _, _, _ = env.reset()\n",
" if ep == 0:\n",
" epBuffer = buffer.buffer()\n",
" s = s.reshape([STATE_SIZE])\n",
" agent = PPO.PPO(\n",
" stateSize=STATE_SIZE,\n",
" disActShape=disActShape,\n",
" conActSize=1,\n",
" conActRange=CTN_ACTION_RANGE,\n",
" criticLR=CRITIC_LR,\n",
" actorLR=ACTOR_LR,\n",
" gamma=GAMMA,\n",
" epsilon=EPSILON,\n",
" entropyWeight=ENTROPY_WHEIGHT,\n",
" saveDir=SAVE_DIR,\n",
" loadModelDir=LOAD_DIR,\n",
" )\n",
" step = 0\n",
" done = False\n",
" stopTrainCounter -= 1\n",
" epHis.append(ep)\n",
"\n",
" # reset total reward\n",
" epTotalReward = 0\n",
"\n",
" # Recorder list\n",
" epStepHis = []\n",
" epRewardHis = []\n",
" epActorLossHis = []\n",
" epCriticLossHis = []\n",
"\n",
" # save weight immediately?\n",
" saveNow = 0\n",
"\n",
" while not done:\n",
" step += 1\n",
" if (\n",
" step % ACTION_INTERVAL == 0\n",
" ): # take action every ACTION_INTERVAL steps\n",
" epStepHis.append(step)\n",
" (\n",
" disAct1,\n",
" disAct2,\n",
" disAct3,\n",
" conAct,\n",
" predictResult,\n",
" ) = agent.chooseAction(s)\n",
" kW, kS, kA, kD, mouseShoot, mouseMove = actToKey(\n",
" disAct1, disAct2, disAct3, conAct\n",
" )\n",
"\n",
" nextState, thisReward, done, _, saveNow = env.step(\n",
" discreteActions=np.array([[kW, kS, kA, kD, mouseShoot]]),\n",
" continuousActions=np.array([[mouseMove]]),\n",
" )\n",
"\n",
" epTotalReward += thisReward\n",
" epBuffer.saveBuffers(\n",
" s, [disAct1, disAct2, disAct3, conAct], thisReward\n",
" )\n",
" else:\n",
" disActs = np.array([[0, 0, 0, 0, 0]])\n",
" conActs = np.array([[0]])\n",
"\n",
" nextState, thisReward, done, _, saveNow = env.step(\n",
" discreteActions=disActs, continuousActions=conActs\n",
" )\n",
" epTotalReward += thisReward\n",
" nextState = nextState.reshape([STATE_SIZE])\n",
" s = nextState\n",
"\n",
" if done:\n",
" print(\"EP OVER!\")\n",
" if saveNow != 0:\n",
" print(\"SAVENOW!\")\n",
" saveNow = 0\n",
" agent.saveWeights()\n",
" # update PPO after Batch step or GameOver\n",
" if (step + 1) % BATCH == 0 or done:\n",
" bs = epBuffer.getStates()\n",
" ba = epBuffer.getActions()\n",
" br = epBuffer.getRewards()\n",
" epBuffer.clearBuffer()\n",
" if TRAIN:\n",
" epActorLoss, epCriticLoss = agent.trainCritcActor(\n",
" bs, ba, br, s, CRITIC_EPOCH, ACTOR_EPOCH\n",
" )\n",
" epActorLossHis.append(epActorLoss)\n",
" epCriticLossHis.append(epCriticLoss)\n",
" # update History Recorder\n",
" totalActorLossHis.append(np.mean(epActorLossHis))\n",
" totalCriticLossHis.append(np.mean(epCriticLossHis))\n",
" totalRewardHis.append(epTotalReward)\n",
"\n",
" if epTotalReward > maxTotalReward and epTotalReward != 0:\n",
" maxTotalReward = epTotalReward\n",
" agent.saveWeights(epTotalReward)\n",
" print(\"New Record! Save NN\", epTotalReward)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"aaa = 0\n",
"aaa = 1\n",
"aaa = 2\n",
"aaa = 3\n",
"aaa = 4\n",
"aaa = 5\n",
"aaa = 6\n",
"aaa = 7\n",
"aaa = 8\n",
"aaa = 9\n"
]
}
],
"source": [
"aaa = 0\n",
"while aaa<10:\n",
" print(\"aaa = \",aaa)\n",
" aaa+=1"
]
}
],
"metadata": {
"interpreter": {
"hash": "86e2db13b09bd6be22cb599ea60c1572b9ef36ebeaa27a4c8e961d6df315ac32"
},
"kernelspec": {
"display_name": "Python 3.9.7 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,25 +0,0 @@
import aimBotEnv
import PPO
ENV_PATH = './Build/Aimbot-PPO'
WORKER_ID = 100
MAX_EP = 1000
EP_LENGTH = 400
GAMMA = 0.99 # discount future reward (UP?)
EPSILON = 0.2 # clip Ratio range[1-EPSILON,1+EPSILON]
ACTOR_LR = 1e-5 # LR
CRITIC_LR = 2e-5 # LR
BATCH = 32 # learning step
ACTOR_EPOCH = 10 # epoch
CRITIC_EPOCH = 10 # epoch
ENTROPY_WHEIGHT = 0.01 # sigma's entropy in Actor loss
ACTION_INTERVAL = 1 # take action every ACTION_INTERVAL steps
TRAIN = True
env = aimBotEnv.makeEnv(envPath = ENV_PATH,workerID = WORKER_ID)
STATE_SIZE = env.STATE_SIZE
CONTINUOUS_SIZE = env.CONTINUOUS_SIZE
DISCRETE_SIZE = env.DISCRETE_SIZE
CTN_ACTION_RANGE = 2

View File

@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@ -73,7 +73,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
@ -118,7 +118,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -182,7 +182,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
@ -205,7 +205,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
@ -240,16 +240,16 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.]])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@ -259,28 +259,119 @@
"\n",
"a = np.array([10, 20, 30, 0])\n",
"\n",
"np.asarray([[0.]])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.5"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"\n",
"asd = [1,2,3,np.array([0.5]),np.array([0.5])]\n",
"\n",
"asd[3:]\n",
"len(asd)\n",
"\n",
"np.mean([1,2])"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0\n",
"0.0\n"
]
}
],
"source": [
"import time\n",
"import pyautogui as pag\n",
"\n",
"from pynput.mouse import Button, Controller\n",
"\n",
"w = pag.size().width\n",
"h = pag.size().height\n",
"mouse = Controller()\n",
"\n",
"nowt = time.time()\n",
"\n",
"middletime = time.time() - nowt\n",
"print(middletime)\n",
"# print(nowPos-(w/2))\n",
"\n",
"print(time.time() - middletime - nowt)\n",
"while True:\n",
" x,_ = mouse.position\n",
" #print(mouse.press)\n",
" #print(mouse.position)\n",
" \n",
" mouse.position = (w / 2, h / 2)\n",
" time.sleep(1/60)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import pyautogui as pag\n",
"\n",
"import mouse\n",
"\n",
"w = pag.size().width\n",
"h = pag.size().height\n",
"\n",
"nowt = time.time()\n",
"\n",
"middletime = time.time() - nowt\n",
"print(middletime)\n",
"# print(nowPos-(w/2))\n",
"\n",
"print(time.time() - middletime - nowt)\n",
"while True:\n",
" x = mouse.get_position()\n",
" print(x)\n",
" #print(mouse.position)\n",
" \n",
" mouse.move(w / 2, h / 2)\n",
" time.sleep(1/60)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import keyboard\n",
"\n",
"while True:\n",
" if keyboard.is_pressed(\"w\"):\n",
" print(\"w\")\n",
" elif keyboard.is_pressed(\"s\"):\n",
" print(\"s\")"
]
}
],