diff --git a/Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb b/Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb index caa3aaa..1d09dd1 100644 --- a/Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb +++ b/Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb @@ -185,18 +185,27 @@ "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "(1.2, 3.2)\n", - "1.2\n" + "Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mkoha9\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n" ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "aaa = (1.2,3.2)\n", - "print(aaa)\n", - "print(aaa[0])" + "import wandb\n", + "wandb.login()" ] } ], diff --git a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py index 9e2e95e..b390b6a 100644 --- a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py +++ b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py @@ -1,156 +1,28 @@ -import argparse import time import numpy as np import random import uuid import torch -import torch.nn as nn -import torch.optim as optim import atexit - from aimbotEnv import Aimbot from aimbotEnv import AimbotSideChannel from ppoagent import PPOAgent from airecorder import WandbRecorder from aimemory import PPOMem from aimemory import Targets -from enum import Enum -from distutils.util import strtobool +from arguments import parse_args +import torch.optim as optim -best_reward = -1 - -DEFAULT_SEED = 9331 -ENV_PATH = "../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv" +# side channel uuid SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e") -WAND_ENTITY = "koha9" -WORKER_ID = 1 -BASE_PORT = 1000 - # tensorboard names -GAME_NAME = "Aimbot_Target_Hybrid_PMNN_V3" +GAME_NAME = "Aimbot_Hybrid_V3" GAME_TYPE = "Mix_Verification" -# max round steps per agent is 2500/Decision_period, 25 seconds -# !!!check every parameters before run!!! - -TOTAL_STEPS = 3150000 -BATCH_SIZE = 512 -MAX_TRAINNING_DATASETS = 6000 -DECISION_PERIOD = 1 -LEARNING_RATE = 6.5e-4 -GAMMA = 0.99 -GAE_LAMBDA = 0.95 -EPOCHS = 3 -CLIP_COEF = 0.11 -LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence -POLICY_COEF = [1.0, 1.0, 1.0, 1.0] -ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05] -CRITIC_COEF = [0.5, 0.5, 0.5, 0.5] -TARGET_LEARNING_RATE = 1e-6 -FREEZE_VIEW_NETWORK = True - -BROADCASTREWARD = False -ANNEAL_LEARNING_RATE = True -CLIP_VLOSS = True -NORM_ADV = False -TRAIN = True -SAVE_MODEL = False -WANDB_TACK = False -LOAD_DIR = None -#LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt" - -TARGET_STATE_SIZE = 6 -INAREA_STATE_SIZE = 1 -TIME_STATE_SIZE = 1 -GUN_STATE_SIZE = 1 -MY_STATE_SIZE = 4 -TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE -BASE_WINREWARD = 999 -BASE_LOSEREWARD = -999 -TARGETNUM= 4 -ENV_TIMELIMIT = 30 -RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT - # !!!SPECIAL PARAMETERS!!! 
-# change it while program is finished using_targets_num = 3 - -def parse_args(): - # fmt: off - # pytorch and environment parameters - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=DEFAULT_SEED, - help="seed of the experiment") - parser.add_argument("--path", type=str, default=ENV_PATH, - help="enviroment path") - parser.add_argument("--workerID", type=int, default=WORKER_ID, - help="unity worker ID") - parser.add_argument("--baseport", type=int, default=BASE_PORT, - help="port to connect to Unity environment") - parser.add_argument("--lr", type=float, default=LEARNING_RATE, - help="the learning rate of optimizer") - parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, - help="if toggled, cuda will be enabled by default") - parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS, - help="total timesteps of the experiments") - - # model parameters - parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True, - help="Train Model or not") - parser.add_argument("--freeze-viewnet", type=lambda x: bool(strtobool(x)), default=FREEZE_VIEW_NETWORK, nargs="?", const=True, - help="freeze view network or not") - parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS, - help="training dataset size,start training while dataset collect enough data") - parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE, - help="nimi batch size") - parser.add_argument("--epochs", type=int, default=EPOCHS, - help="the K epochs to update the policy") - parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True, - help="Toggle learning rate annealing for policy and value networks") - parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True, - help="track on the wandb") - parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=SAVE_MODEL, nargs="?", const=True, - help="save model or not") - parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY, - help="the entity (team) of wandb's project") - parser.add_argument("--load-dir", type=str, default=LOAD_DIR, - help="load model directory") - parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD, - help="the number of steps to run in each environment per policy rollout") - parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO, - help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime") - parser.add_argument("--broadCastEndReward", type=lambda x: bool(strtobool(x)), default=BROADCASTREWARD, nargs="?", const=True, - help="save model or not") - # GAE loss - parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, - help="Use GAE for advantage computation") - parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True, - help="Toggles advantages normalization") - parser.add_argument("--gamma", type=float, default=GAMMA, - help="the discount factor gamma") - parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA, - help="the lambda for the general advantage estimation") - parser.add_argument("--clip-coef", type=float, default=CLIP_COEF, - help="the surrogate clipping coefficient") - parser.add_argument("--policy-coef", type=float, default=POLICY_COEF, - 
help="coefficient of the policy") - parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF, - help="coefficient of the entropy") - parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF, - help="coefficient of the value function") - parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True, - help="Toggles whether or not to use a clipped loss for the value function, as per the paper.") - parser.add_argument("--max-grad-norm", type=float, default=0.5, - help="the maximum norm for the gradient clipping") - parser.add_argument("--target-kl", type=float, default=None, - help="the target KL divergence threshold") - # fmt: on - args = parser.parse_args() - return args - - if __name__ == "__main__": args = parse_args() random.seed(args.seed) @@ -158,6 +30,7 @@ if __name__ == "__main__": torch.manual_seed(args.seed) device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + best_reward = -1 # Initialize environment anget optimizer aimbot_sidechannel = AimbotSideChannel(SIDE_CHANNEL_UUID); @@ -166,18 +39,11 @@ if __name__ == "__main__": agent = PPOAgent( env = env, this_args=args, - train_agent=args.train, - target_num=TARGETNUM, - target_state_size= TARGET_STATE_SIZE, - time_state_size=TIME_STATE_SIZE, - gun_state_size=GUN_STATE_SIZE, - my_state_size=MY_STATE_SIZE, - total_t_size=TOTAL_T_SIZE, device=device, ).to(device) else: agent = torch.load(args.load_dir) - # freeze + # freeze if args.freeze_viewnet: # freeze the view network for p in agent.viewNetwork.parameters(): @@ -185,9 +51,8 @@ if __name__ == "__main__": print("VIEW NETWORK FREEZED") print("Load Agent", args.load_dir) print(agent.eval()) - + # optimizer optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5) - # Tensorboard and WandB Recorder run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}" wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args) @@ -204,34 +69,30 @@ if __name__ == "__main__": # start the game total_update_step = using_targets_num * args.total_timesteps // args.datasetSize - target_steps = [0 for i in range(TARGETNUM)] + target_steps = [0 for i in range(args.target_num)] start_time = time.time() state, _, done = env.reset() # initialize AI memories ppo_memories = PPOMem( - env = env, - device = device, args=args, - target_num = TARGETNUM, - target_state_size = TARGET_STATE_SIZE, - base_lose_reward = BASE_LOSEREWARD, - base_win_reward = BASE_WINREWARD, + unity_agent_num=env.unity_agent_num, + device = device, ) + # MAIN LOOP: run agent in environment for total_steps in range(total_update_step): # discunt learning rate, while step == total_update_step lr will be 0 - if args.annealLR: - final_lr_ratio = TARGET_LEARNING_RATE/args.lr + final_lr_ratio = args.target_lr/args.lr frac = 1.0 - ((total_steps + 1.0) / total_update_step) lr_now = frac * args.lr optimizer.param_groups[0]["lr"] = lr_now else: lr_now = args.lr + + # episode start show learning rate print("new episode",total_steps,"learning rate = ",lr_now) - - # MAIN LOOP: run agent in environment step = 0 training = False @@ -271,14 +132,15 @@ if __name__ == "__main__": next_done = next_done, next_state=next_state, ) - # check if any training dataset is full and ready to train - for i in range(TARGETNUM): + for i in range(args.target_num): if ppo_memories.obs[i].size()[0] >= args.datasetSize: # start train NN train_queue.append(i) if(len(train_queue)>0): + # break while loop and start train break + # update state state, done = 
next_state, next_done else: step += 1 @@ -299,7 +161,7 @@ if __name__ == "__main__": next_done = next_done, next_state=next_state, ) - + # update state state = next_state last_reward = reward @@ -307,137 +169,34 @@ if __name__ == "__main__": # train mode on mean_reward_list = [] # for WANDB # loop all tarining queue - for thisT in train_queue: + for this_train_ind in train_queue: # sart time start_time = time.time() - target_steps[thisT]+=1 - # flatten the batch - b_obs = ppo_memories.obs[thisT].reshape((-1,) + env.unity_observation_shape) - b_dis_logprobs = ppo_memories.dis_logprobs[thisT].reshape(-1) - b_con_logprobs = ppo_memories.con_logprobs[thisT].reshape(-1) - b_actions = ppo_memories.actions[thisT].reshape((-1,) + (env.unity_action_size,)) - b_advantages = ppo_memories.advantages[thisT].reshape(-1) - b_returns = ppo_memories.returns[thisT].reshape(-1) - b_values = ppo_memories.values[thisT].reshape(-1) - b_size = b_obs.size()[0] - # Optimizing the policy and value network - b_inds = np.arange(b_size) - # clipfracs = [] - for epoch in range(args.epochs): - print(epoch,end="") - # shuffle all datasets - np.random.shuffle(b_inds) - for start in range(0, b_size, args.minibatchSize): - print(".",end="") - end = start + args.minibatchSize - mb_inds = b_inds[start:end] - if(np.size(mb_inds)<=1): - break - mb_advantages = b_advantages[mb_inds] - - # normalize advantages - if args.norm_adv: - mb_advantages = (mb_advantages - mb_advantages.mean()) / ( - mb_advantages.std() + 1e-8 - ) - - ( - _, - new_dis_logprob, - dis_entropy, - new_con_logprob, - con_entropy, - newvalue, - ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds]) - # discrete ratio - dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds] - dis_ratio = dis_logratio.exp() - # continuous ratio - con_logratio = new_con_logprob - b_con_logprobs[mb_inds] - con_ratio = con_logratio.exp() - - """ - # early stop - with torch.no_grad(): - # calculate approx_kl http://joschu.net/blog/kl-approx.html - old_approx_kl = (-logratio).mean() - approx_kl = ((ratio - 1) - logratio).mean() - clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()] - """ - - # discrete Policy loss - dis_pg_loss_orig = -mb_advantages * dis_ratio - dis_pg_loss_clip = -mb_advantages * torch.clamp( - dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef + target_steps[this_train_ind]+=1 + # train agent + ( + v_loss, + dis_pg_loss, + con_pg_loss, + loss, + entropy_loss + ) = agent.train_net( + this_train_ind=this_train_ind, + ppo_memories=ppo_memories, + optimizer=optimizer ) - dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean() - # continuous Policy loss - con_pg_loss_orig = -mb_advantages * con_ratio - con_pg_loss_clip = -mb_advantages * torch.clamp( - con_ratio, 1 - args.clip_coef, 1 + args.clip_coef - ) - con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean() - - # Value loss - newvalue = newvalue.view(-1) - if args.clip_vloss: - v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2 - v_clipped = b_values[mb_inds] + torch.clamp( - newvalue - b_values[mb_inds], - -args.clip_coef, - args.clip_coef, - ) - v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2 - v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) - v_loss = 0.5 * v_loss_max.mean() - else: - v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean() - - # total loss - entropy_loss = dis_entropy.mean() + con_entropy.mean() - loss = ( - dis_pg_loss * POLICY_COEF[thisT] - + con_pg_loss * POLICY_COEF[thisT] - + entropy_loss * ENTROPY_COEF[thisT] 
- + v_loss * CRITIC_COEF[thisT] - )*LOSS_COEF[thisT] - - if(torch.isnan(loss).any()): - print("LOSS Include NAN!!!") - if(torch.isnan(dis_pg_loss.any())): - print("dis_pg_loss include nan") - if(torch.isnan(con_pg_loss.any())): - print("con_pg_loss include nan") - if(torch.isnan(entropy_loss.any())): - print("entropy_loss include nan") - if(torch.isnan(v_loss.any())): - print("v_loss include nan") - raise - - optimizer.zero_grad() - loss.backward() - # Clips gradient norm of an iterable of parameters. - nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm) - optimizer.step() - - """ - if args.target_kl is not None: - if approx_kl > args.target_kl: - break - """ # record mean reward before clear history print("done") - targetRewardMean = np.mean(ppo_memories.rewards[thisT].to("cpu").detach().numpy().copy()) + targetRewardMean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy()) mean_reward_list.append(targetRewardMean) - targetName = Targets(thisT).name + targetName = Targets(this_train_ind).name # clear this target trainning set buffer - ppo_memories.clear_training_datasets(thisT) - + ppo_memories.clear_training_datasets(this_train_ind) # record rewards for plotting purposes wdb_recorder.add_target_scalar( targetName, - thisT, + this_train_ind, v_loss, dis_pg_loss, con_pg_loss, @@ -464,19 +223,19 @@ if __name__ == "__main__": # train mode off mean_reward_list = [] # for WANDB # while not in training mode, clear the buffer - for thisT in train_queue: - target_steps[thisT]+=1 - targetName = Targets(thisT).name - targetRewardMean = np.mean(ppo_memories.rewards[thisT].to("cpu").detach().numpy().copy()) + for this_train_ind in train_queue: + target_steps[this_train_ind]+=1 + targetName = Targets(this_train_ind).name + targetRewardMean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy()) mean_reward_list.append(targetRewardMean) - print(target_steps[thisT]) + print(target_steps[this_train_ind]) # clear this target trainning set buffer - ppo_memories.clear_training_datasets(thisT) + ppo_memories.clear_training_datasets(this_train_ind) # record rewards for plotting purposes - wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT]) - wdb_recorder.add_win_ratio(targetName,target_steps[thisT]) + wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[this_train_ind]) + wdb_recorder.add_win_ratio(targetName,target_steps[this_train_ind]) print(f"episode over Target{targetName} mean reward:", targetRewardMean) TotalRewardMean = np.mean(mean_reward_list) wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps) diff --git a/Aimbot-PPO-Python/Pytorch/aimemory.py b/Aimbot-PPO-Python/Pytorch/aimemory.py index 9751c85..89aad78 100644 --- a/Aimbot-PPO-Python/Pytorch/aimemory.py +++ b/Aimbot-PPO-Python/Pytorch/aimemory.py @@ -1,7 +1,6 @@ import torch import numpy as np import argparse -from aimbotEnv import Aimbot from ppoagent import PPOAgent from enum import Enum @@ -16,42 +15,39 @@ class Targets(Enum): class PPOMem: def __init__( self, - env: Aimbot, args: argparse.Namespace, + unity_agent_num: int, device: torch.device, - target_num: int, - target_state_size: int, - base_lose_reward: int, - base_win_reward: int, ) -> None: + self.target_num = args.target_num self.data_set_size = args.datasetSize self.result_broadcast_ratio = args.result_broadcast_ratio self.decision_period = args.decision_period - self.unity_agent_num = 
env.unity_agent_num + self.unity_agent_num = unity_agent_num - self.base_lose_reward = base_lose_reward - self.base_win_reward = base_win_reward - self.target_state_size = target_state_size + self.base_lose_reward = args.base_lose_reward + self.base_win_reward = args.base_win_reward + self.target_state_size = args.target_state_size self.device = device # Trajectory Buffer - self.ob_bf = [[] for i in range(env.unity_agent_num)] - self.act_bf = [[] for i in range(env.unity_agent_num)] - self.dis_logprobs_bf = [[] for i in range(env.unity_agent_num)] - self.con_logprobs_bf = [[] for i in range(env.unity_agent_num)] - self.rewards_bf = [[] for i in range(env.unity_agent_num)] - self.dones_bf = [[] for i in range(env.unity_agent_num)] - self.values_bf = [[] for i in range(env.unity_agent_num)] + self.ob_bf = [[] for i in range(self.unity_agent_num)] + self.act_bf = [[] for i in range(self.unity_agent_num)] + self.dis_logprobs_bf = [[] for i in range(self.unity_agent_num)] + self.con_logprobs_bf = [[] for i in range(self.unity_agent_num)] + self.rewards_bf = [[] for i in range(self.unity_agent_num)] + self.dones_bf = [[] for i in range(self.unity_agent_num)] + self.values_bf = [[] for i in range(self.unity_agent_num)] # initialize empty training datasets - self.obs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,env.unity_observation_size) - self.actions = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,env.unity_action_size) - self.dis_logprobs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1) - self.con_logprobs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1) - self.rewards = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1) - self.values = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1) - self.advantages = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1) - self.returns = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1) + self.obs = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,env.unity_observation_size) + self.actions = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,env.unity_action_size) + self.dis_logprobs = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1) + self.con_logprobs = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1) + self.rewards = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1) + self.values = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1) + self.advantages = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1) + self.returns = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1) def broad_cast_end_reward(self, rewardBF: list, remainTime: float) -> torch.Tensor: thisRewardBF = rewardBF.copy() diff --git a/Aimbot-PPO-Python/Pytorch/arguments.py b/Aimbot-PPO-Python/Pytorch/arguments.py new file mode 100644 index 0000000..78f58f4 --- /dev/null +++ b/Aimbot-PPO-Python/Pytorch/arguments.py @@ -0,0 +1,154 @@ +import argparse +import uuid + +from distutils.util import strtobool + +DEFAULT_SEED = 9331 +ENV_PATH = "../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv" +WAND_ENTITY = "koha9" +WORKER_ID = 1 +BASE_PORT = 1000 + +# tensorboard names +GAME_NAME = "Aimbot_Target_Hybrid_PMNN_V3" +GAME_TYPE = "Mix_Verification" + +# max round steps 
per agent is 2500/Decision_period, 25 seconds +TOTAL_STEPS = 3150000 +BATCH_SIZE = 512 +MAX_TRAINNING_DATASETS = 6000 +DECISION_PERIOD = 1 +LEARNING_RATE = 6.5e-4 +GAMMA = 0.99 +GAE_LAMBDA = 0.95 +EPOCHS = 3 +CLIP_COEF = 0.11 +LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence +POLICY_COEF = [1.0, 1.0, 1.0, 1.0] +ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05] +CRITIC_COEF = [0.5, 0.5, 0.5, 0.5] +TARGET_LEARNING_RATE = 1e-6 + +FREEZE_VIEW_NETWORK = False +BROADCASTREWARD = False +ANNEAL_LEARNING_RATE = True +CLIP_VLOSS = True +NORM_ADV = False +TRAIN = True +SAVE_MODEL = True +WANDB_TACK = True +LOAD_DIR = None +#LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt" + +# Unity Environment Parameters +TARGET_STATE_SIZE = 6 +INAREA_STATE_SIZE = 1 +TIME_STATE_SIZE = 1 +GUN_STATE_SIZE = 1 +MY_STATE_SIZE = 4 +TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE +BASE_WINREWARD = 999 +BASE_LOSEREWARD = -999 +TARGETNUM = 4 +ENV_TIMELIMIT = 30 +RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT + +def parse_args(): + # fmt: off + # pytorch and environment parameters + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=DEFAULT_SEED, + help="seed of the experiment") + parser.add_argument("--path", type=str, default=ENV_PATH, + help="environment path") + parser.add_argument("--workerID", type=int, default=WORKER_ID, + help="unity worker ID") + parser.add_argument("--baseport", type=int, default=BASE_PORT, + help="port to connect to Unity environment") + parser.add_argument("--lr", type=float, default=LEARNING_RATE, + help="the default learning rate of the optimizer") + parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, + help="if toggled, cuda will be enabled by default") + parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS, + help="total timesteps of the experiments") + + # model parameters + parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True, + help="train the model or not") + parser.add_argument("--freeze-viewnet", type=lambda x: bool(strtobool(x)), default=FREEZE_VIEW_NETWORK, nargs="?", const=True, + help="freeze view network or not") + parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS, + help="training dataset size; start training once the dataset has collected enough data") + parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE, + help="mini batch size") + parser.add_argument("--epochs", type=int, default=EPOCHS, + help="the K epochs to update the policy") + parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True, + help="Toggle learning rate annealing for policy and value networks") + parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True, + help="track the experiment with wandb") + parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=SAVE_MODEL, nargs="?", const=True, + help="save model or not") + parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY, + help="the entity (team) of wandb's project") + parser.add_argument("--load-dir", type=str, default=LOAD_DIR, + help="load model directory") + parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD, + help="the number of steps to run in each environment per policy rollout") +
parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO, + help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime") + parser.add_argument("--broadCastEndReward", type=lambda x: bool(strtobool(x)), default=BROADCASTREWARD, nargs="?", const=True, + help="save model or not") + # target_learning_rate + parser.add_argument("--target-lr", type=float, default=TARGET_LEARNING_RATE, + help="target value of downscaling the learning rate") + + # POLICY_COEF ENTROPY_COEF CRITIC_COEF LOSS_COEF + parser.add_argument("--policy-coef", type=float, default=POLICY_COEF, + help="coefficient of the policy loss") + parser.add_argument("--entropy-coef", type=float, default=ENTROPY_COEF, + help="coefficient of the entropy loss") + parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF, + help="coefficient of the critic loss") + parser.add_argument("--loss-coef", type=float, default=LOSS_COEF, + help="coefficient of the total loss") + + # GAE loss + parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True, + help="Use GAE for advantage computation") + parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True, + help="Toggles advantages normalization") + parser.add_argument("--gamma", type=float, default=GAMMA, + help="the discount factor gamma") + parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA, + help="the lambda for the general advantage estimation") + parser.add_argument("--clip-coef", type=float, default=CLIP_COEF, + help="the surrogate clipping coefficient") + parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True, + help="Toggles whether or not to use a clipped loss for the value function, as per the paper.") + parser.add_argument("--max-grad-norm", type=float, default=0.5, + help="the maximum norm for the gradient clipping") + parser.add_argument("--target-kl", type=float, default=None, + help="the target KL divergence threshold") + # environment parameters + parser.add_argument("--target-num", type=int, default=TARGETNUM, + help="the number of targets") + parser.add_argument("--env-timelimit", type=int, default=ENV_TIMELIMIT, + help="the time limit of each round") + parser.add_argument("--base-win-reward", type=int, default=BASE_WINREWARD, + help="the base reward of win round") + parser.add_argument("--base-lose-reward", type=int, default=BASE_LOSEREWARD, + help="the base reward of lose round") + parser.add_argument("--target-state-size", type=int, default=TARGET_STATE_SIZE, + help="the size of target state") + parser.add_argument("--time-state-size", type=int, default=TIME_STATE_SIZE, + help="the size of time state") + parser.add_argument("--gun-state-size", type=int, default=GUN_STATE_SIZE, + help="the size of gun state") + parser.add_argument("--my-state-size", type=int, default=MY_STATE_SIZE, + help="the size of my state") + parser.add_argument("--total-target-size", type=int, default=TOTAL_T_SIZE, + help="the size of total target state") + # fmt: on + args = parser.parse_args() + return args \ No newline at end of file diff --git a/Aimbot-PPO-Python/Pytorch/ppoagent.py b/Aimbot-PPO-Python/Pytorch/ppoagent.py index 917fc3e..bcc041b 100644 --- a/Aimbot-PPO-Python/Pytorch/ppoagent.py +++ b/Aimbot-PPO-Python/Pytorch/ppoagent.py @@ -1,6 +1,7 @@ import numpy as np import torch import argparse +import time from torch import nn from aimbotEnv import Aimbot @@ 
-19,123 +20,118 @@ class PPOAgent(nn.Module): self, env: Aimbot, this_args:argparse.Namespace, - train_agent: bool, - target_num: int, - target_state_size: int, - time_state_size: int, - gun_state_size: int, - my_state_size: int, - total_t_size: int, device: torch.device, ): super(PPOAgent, self).__init__() self.device = device self.args = this_args - self.trainAgent = train_agent - self.targetNum = target_num - self.stateSize = env.unity_observation_shape[0] - self.agentNum = env.unity_agent_num - self.targetSize = target_state_size - self.timeSize = time_state_size - self.gunSize = gun_state_size - self.myStateSize = my_state_size - self.raySize = env.unity_observation_shape[0] - total_t_size - self.nonRaySize = total_t_size + self.train_agent = self.args.train + self.target_num = self.args.target_num + self.unity_observation_shape = env.unity_observation_shape + self.unity_action_size = env.unity_action_size + self.state_size = self.unity_observation_shape[0] + self.agent_num = env.unity_agent_num + self.target_size = self.args.target_state_size + self.time_state_size = self.args.time_state_size + self.gun_state_size = self.args.gun_state_size + self.my_state_size = self.args.my_state_size + self.ray_state_size = env.unity_observation_shape[0] - self.args.total_target_size + self.state_size_without_ray = self.args.total_target_size self.head_input_size = ( - env.unity_observation_shape[0] - self.targetSize - self.timeSize - self.gunSize + env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size ) # except target state input - self.unityDiscreteType = env.unity_discrete_type + self.unity_discrete_type = env.unity_discrete_type self.discrete_size = env.unity_discrete_size self.discrete_shape = list(env.unity_discrete_branches) self.continuous_size = env.unity_continuous_size - self.viewNetwork = nn.Sequential(layer_init(nn.Linear(self.raySize, 200)), nn.LeakyReLU()) - self.targetNetworks = nn.ModuleList( + self.view_network = nn.Sequential(layer_init(nn.Linear(self.ray_state_size, 200)), nn.LeakyReLU()) + self.target_networks = nn.ModuleList( [ - nn.Sequential(layer_init(nn.Linear(self.nonRaySize, 100)), nn.LeakyReLU()) - for i in range(target_num) + nn.Sequential(layer_init(nn.Linear(self.state_size_without_ray, 100)), nn.LeakyReLU()) + for i in range(self.target_num) ] ) - self.middleNetworks = nn.ModuleList( + self.middle_networks = nn.ModuleList( [ nn.Sequential(layer_init(nn.Linear(300, 200)), nn.LeakyReLU()) - for i in range(target_num) + for i in range(self.target_num) ] ) self.actor_dis = nn.ModuleList( - [layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(target_num)] + [layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(self.target_num)] ) self.actor_mean = nn.ModuleList( - [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(target_num)] + [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(self.target_num)] ) # self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)]) # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size)) self.actor_logstd = nn.ParameterList( - [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(target_num)] + [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)] ) # nn.Parameter(torch.zeros(1, self.continuous_size)) self.critic = nn.ModuleList( - [layer_init(nn.Linear(200, 1), std=1) for i in range(target_num)] + 
[layer_init(nn.Linear(200, 1), std=1) for i in range(self.target_num)] ) def get_value(self, state: torch.Tensor): target = state[:, 0].to(torch.int32) # int - thisStateNum = target.size()[0] - viewInput = state[:, -self.raySize :] # all ray input - targetInput = state[:, : self.nonRaySize] - viewLayer = self.viewNetwork(viewInput) - targetLayer = torch.stack( - [self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)] + this_state_num = target.size()[0] + view_input = state[:, -self.ray_state_size :] # all ray input + target_input = state[:, : self.state_size_without_ray] + view_layer = self.view_network(view_input) + target_layer = torch.stack( + [self.target_networks[target[i]](target_input[i]) for i in range(this_state_num)] ) - middleInput = torch.cat([viewLayer, targetLayer], dim=1) - middleLayer = torch.stack( - [self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)] + middle_input = torch.cat([view_layer, target_layer], dim=1) + middle_layer = torch.stack( + [self.middle_networks[target[i]](middle_input[i]) for i in range(this_state_num)] ) criticV = torch.stack( - [self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)] + [self.critic[target[i]](middle_layer[i]) for i in range(this_state_num)] ) # self.critic return criticV def get_actions_value(self, state: torch.Tensor, actions=None): target = state[:, 0].to(torch.int32) # int - thisStateNum = target.size()[0] - viewInput = state[:, -self.raySize :] # all ray input - targetInput = state[:, : self.nonRaySize] - viewLayer = self.viewNetwork(viewInput) - targetLayer = torch.stack( - [self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)] + this_state_num = target.size()[0] + view_input = state[:, -self.ray_state_size :] # all ray input + target_input = state[:, : self.state_size_without_ray] + view_layer = self.view_network(view_input) + target_layer = torch.stack( + [self.target_networks[target[i]](target_input[i]) for i in range(this_state_num)] ) - middleInput = torch.cat([viewLayer, targetLayer], dim=1) - middleLayer = torch.stack( - [self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)] + middle_input = torch.cat([view_layer, target_layer], dim=1) + middle_layer = torch.stack( + [self.middle_networks[target[i]](middle_input[i]) for i in range(this_state_num)] ) # discrete # 递归targets的数量,既agent数来实现根据target不同来选用对应的输出网络计算输出 dis_logits = torch.stack( - [self.actor_dis[target[i]](middleLayer[i]) for i in range(thisStateNum)] + [self.actor_dis[target[i]](middle_layer[i]) for i in range(this_state_num)] ) split_logits = torch.split(dis_logits, self.discrete_shape, dim=1) multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits] # continuous actions_mean = torch.stack( - [self.actor_mean[target[i]](middleLayer[i]) for i in range(thisStateNum)] + [self.actor_mean[target[i]](middle_layer[i]) for i in range(this_state_num)] ) # self.actor_mean(hidden) # action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_logstd(hidden) # action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean) action_logstd = torch.stack( - [torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(thisStateNum)] + [torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(this_state_num)] ) # print(action_logstd) action_std = torch.exp(action_logstd) # torch.exp(action_logstd) con_probs = Normal(actions_mean, action_std) # 
critic criticV = torch.stack( - [self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)] + [self.critic[target[i]](middle_layer[i]) for i in range(this_state_num)] ) # self.critic if actions is None: - if self.trainAgent: + if self.train_agent: # select actions base on probability distribution model disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals]) conAct = con_probs.sample() @@ -148,8 +144,8 @@ class PPOAgent(nn.Module): conAct = con_probs.sample() actions = torch.cat([disAct.T, conAct], dim=1) else: - disAct = actions[:, 0 : self.unityDiscreteType].T - conAct = actions[:, self.unityDiscreteType :] + disAct = actions[:, 0 : self.unity_discrete_type].T + conAct = actions[:, self.unity_discrete_type :] dis_log_prob = torch.stack( [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)] ) @@ -162,6 +158,123 @@ class PPOAgent(nn.Module): con_probs.entropy().sum(1), criticV, ) + def train_net(self, this_train_ind:int,ppo_memories,optimizer) -> tuple: + start_time = time.time() + # flatten the batch + b_obs = ppo_memories.obs[this_train_ind].reshape((-1,) + self.unity_observation_shape) + b_dis_logprobs = ppo_memories.dis_logprobs[this_train_ind].reshape(-1) + b_con_logprobs = ppo_memories.con_logprobs[this_train_ind].reshape(-1) + b_actions = ppo_memories.actions[this_train_ind].reshape((-1,) + (self.unity_action_size,)) + b_advantages = ppo_memories.advantages[this_train_ind].reshape(-1) + b_returns = ppo_memories.returns[this_train_ind].reshape(-1) + b_values = ppo_memories.values[this_train_ind].reshape(-1) + b_size = b_obs.size()[0] + # optimizing the policy and value network + b_inds = np.arange(b_size) + + for epoch in range(self.args.epochs): + print("epoch:",epoch,end="") + # shuffle all datasets + np.random.shuffle(b_inds) + for start in range(0, b_size, self.args.minibatchSize): + print(".",end="") + end = start + self.args.minibatchSize + mb_inds = b_inds[start:end] + if(np.size(mb_inds)<=1): + break + mb_advantages = b_advantages[mb_inds] + + # normalize advantages + if self.args.norm_adv: + mb_advantages = (mb_advantages - mb_advantages.mean()) / ( + mb_advantages.std() + 1e-8 + ) + + ( + _, + new_dis_logprob, + dis_entropy, + new_con_logprob, + con_entropy, + newvalue, + ) = self.get_actions_value(b_obs[mb_inds], b_actions[mb_inds]) + # discrete ratio + dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds] + dis_ratio = dis_logratio.exp() + # continuous ratio + con_logratio = new_con_logprob - b_con_logprobs[mb_inds] + con_ratio = con_logratio.exp() + + """ + # early stop + with torch.no_grad(): + # calculate approx_kl http://joschu.net/blog/kl-approx.html + old_approx_kl = (-logratio).mean() + approx_kl = ((ratio - 1) - logratio).mean() + clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()] + """ + + # discrete Policy loss + dis_pg_loss_orig = -mb_advantages * dis_ratio + dis_pg_loss_clip = -mb_advantages * torch.clamp( + dis_ratio, 1 - self.args.clip_coef, 1 + self.args.clip_coef + ) + dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean() + # continuous Policy loss + con_pg_loss_orig = -mb_advantages * con_ratio + con_pg_loss_clip = -mb_advantages * torch.clamp( + con_ratio, 1 - self.args.clip_coef, 1 + self.args.clip_coef + ) + con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean() + + # Value loss + newvalue = newvalue.view(-1) + if self.args.clip_vloss: + v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2 + v_clipped = b_values[mb_inds] + torch.clamp( + newvalue - 
b_values[mb_inds], + -self.args.clip_coef, + self.args.clip_coef, + ) + v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2 + v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) + v_loss = 0.5 * v_loss_max.mean() + else: + v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean() + + # total loss + entropy_loss = dis_entropy.mean() + con_entropy.mean() + loss = ( + dis_pg_loss * self.args.policy_coef[this_train_ind] + + con_pg_loss * self.args.policy_coef[this_train_ind] + + entropy_loss * self.args.entropy_coef[this_train_ind] + + v_loss * self.args.critic_coef[this_train_ind] + )*self.args.loss_coef[this_train_ind] + + if(torch.isnan(loss).any()): + print("LOSS includes NaN!!!") + if(torch.isnan(dis_pg_loss).any()): + print("dis_pg_loss includes NaN") + if(torch.isnan(con_pg_loss).any()): + print("con_pg_loss includes NaN") + if(torch.isnan(entropy_loss).any()): + print("entropy_loss includes NaN") + if(torch.isnan(v_loss).any()): + print("v_loss includes NaN") + raise ValueError("NaN detected in loss") + + optimizer.zero_grad() + loss.backward() + # Clips gradient norm of an iterable of parameters. + nn.utils.clip_grad_norm_(self.parameters(), self.args.max_grad_norm) + optimizer.step() + + """ + if args.target_kl is not None: + if approx_kl > args.target_kl: + break + """ + return (v_loss,dis_pg_loss,con_pg_loss,loss,entropy_loss) def gae( self, diff --git a/Aimbot-PPO-Python/Pytorch/pytorch_run_archive.zip b/Aimbot-PPO-Python/Pytorch/pytorch_run_archive.zip new file mode 100644 index 0000000..f085b11 Binary files /dev/null and b/Aimbot-PPO-Python/Pytorch/pytorch_run_archive.zip differ
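Note on the NaN guard in `PPOAgent.train_net` above: the reduction must be applied to the `isnan` mask, i.e. `torch.isnan(x).any()`, not `torch.isnan(x.any())`. The following is a minimal, self-contained sketch of that guard, not part of the patch; the helper name `assert_finite` and the dummy tensors are illustrative only.

```python
import torch

def assert_finite(**losses: torch.Tensor) -> None:
    # Reduce each per-element isnan mask to a single bool per loss term.
    bad = [name for name, value in losses.items() if torch.isnan(value).any()]
    if bad:
        raise ValueError(f"NaN detected in: {', '.join(bad)}")

# Example usage with dummy loss terms; in train_net the real
# v_loss / dis_pg_loss / con_pg_loss / entropy_loss would be passed.
v_loss = torch.tensor(0.5)
dis_pg_loss = torch.tensor(float("nan"))
assert_finite(v_loss=v_loss, dis_pg_loss=dis_pg_loss)  # raises ValueError
```

Wrapping the four per-term checks in one helper like this keeps the training loop body short while still reporting which loss term went non-finite.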