Compare commits

...

6 Commits

SHA1 Message Date
573b09a920 Document the arguments 2024-03-02 17:36:33 +09:00
9d9524429c Clean up unused variables; adapt to environment 3.6 2024-01-24 17:07:45 +09:00
5aa7e0936a Modify the Critic_coef parameter 2023-11-23 15:29:13 +09:00
3bc5c30fd3 Add SideChannel support for Save_in_next_Trainning 2023-10-15 06:05:59 +09:00
2741d6d51a Change Tensor to tensor
Resolve the Tensor vs. tensor confusion and standardize tensor usage.
2023-08-08 20:47:56 +09:00
9432eaa76e Fully separate the NNs
Use NNs fully separated by Target for prediction and training.
Rework the NN structure and the prediction algorithm to fit the Full Multi NN.
2023-08-04 05:13:32 +09:00
8 changed files with 266 additions and 116 deletions

View File

@@ -11,6 +11,7 @@ from mlagents_envs.side_channel.side_channel import (
     IncomingMessage,
     OutgoingMessage,
 )
+from arguments import set_save_model


 class Aimbot(gym.Env):

@@ -176,18 +177,21 @@ class AimbotSideChannel(SideChannel):
        "Warning|Message1|Message2|Message3" or
        "Error|Message1|Message2|Message3"
        """
-        this_message = msg.read_string()
-        this_result = this_message.split("|")
-        print(this_result)
-        if this_result[0] == "Warning":
-            if this_result[1] == "Result":
-                airecorder.total_rounds[this_result[2]] += 1
-                if this_result[3] == "Win":
-                    airecorder.win_rounds[this_result[2]] += 1
+        this_message_Original = msg.read_string()
+        this_message = this_message_Original.split("|")
+        print(this_message)
+        if this_message[0] == "Warning":
+            if this_message[1] == "Result":
+                airecorder.total_rounds[this_message[2]] += 1
+                if this_message[3] == "Win":
+                    airecorder.win_rounds[this_message[2]] += 1
            # print(TotalRounds)
            # print(WinRounds)
-        elif this_result[0] == "Error":
-            print(this_message)
+            if this_message[1] == "Command":
+                set_save_model(True)
+                print("Command: " + this_message_Original)
+        elif this_message[0] == "Error":
+            print(this_message_Original)
        # # while Message type is Warning
        # if(thisResult[0] == "Warning"):
        #     # while Message1 is result means one game is over
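The handler above now recognizes a second Warning subtype, Command, which asks the trainer to checkpoint the model via set_save_model. For reference, a minimal sketch of the pipe-delimited convention it parses; the concrete message strings are illustrative assumptions, not taken from the Unity build:

    # Minimal sketch of the pipe-delimited side-channel messages handled above.
    # The example strings are assumptions for illustration only.
    def handle(message: str) -> str:
        fields = message.split("|")
        if fields[0] == "Warning" and fields[1] == "Result":
            return "round over for target " + fields[2] + ": " + fields[3]
        if fields[0] == "Warning" and fields[1] == "Command":
            return "save requested"  # the real handler calls set_save_model(True)
        if fields[0] == "Error":
            return "error: " + message
        return "ignored"

    print(handle("Warning|Result|0|Win"))       # round over for target 0: Win
    print(handle("Warning|Command|SaveModel"))  # save requested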

View File

@@ -181,30 +181,84 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "array([[0., 0., 0., 0.],\n",
-       "       [0., 0., 0., 0.],\n",
-       "       [0., 0., 0., 0.],\n",
-       "       [0., 0., 0., 0.],\n",
-       "       [0., 0., 0., 0.],\n",
-       "       [0., 0., 0., 0.],\n",
-       "       [0., 0., 0., 0.],\n",
-       "       [0., 0., 0., 0.]])"
+       "3"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "import numpy as np\n",
-    "np.zeros((8, 4))"
+    "y=\"a;b;c\"\n",
+    "len(y.split(\";\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2]\n"
+     ]
+    }
+   ],
+   "source": [
+    "a = np.array([1,2,3,4])\n",
+    "print(a[[False,True,False,False]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{1, 2, 3, 4}"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "a = {1,2,3}\n",
+    "a.add(4)\n",
+    "a"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([3, 4])"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "a = np.array([[1,3],[2,4]])\n",
+    "a.max(axis=1)\n"
    ]
   }
  ],

View File

@@ -4,6 +4,7 @@ import random
 import uuid
 import torch
 import atexit
+import os

 from aimbotEnv import Aimbot
 from aimbotEnv import AimbotSideChannel

@@ -12,13 +13,14 @@ from airecorder import WandbRecorder
 from aimemory import PPOMem
 from aimemory import Targets
 from arguments import parse_args
+from arguments import set_save_model, is_save_model
 import torch.optim as optim

 # side channel uuid
 SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
 # tensorboard names
-GAME_NAME = "Aimbot_Hybrid_V3"
-GAME_TYPE = "Mix_Verification"
+GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel"
+GAME_TYPE = "GotoOnly-3.6-Level0123-newModel-Onehot"

 if __name__ == "__main__":
     args = parse_args()

@@ -47,9 +49,8 @@ if __name__ == "__main__":
     # freeze
     if args.freeze_viewnet:
         # freeze the view network
-        for p in agent.viewNetwork.parameters():
-            p.requires_grad = False
-        print("VIEW NETWORK FREEZE")
+        print("FREEZE VIEW NETWORK is not compatible with Full MNN!")
+        raise NotImplementedError
     print("Load Agent", args.load_dir)
     print(agent.eval())
     # optimizer

@@ -58,16 +59,6 @@ if __name__ == "__main__":
     run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
     wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)

-    @atexit.register
-    def save_model():
-        # close env
-        env.close()
-        if args.save_model:
-            # save model while exit
-            save_dir = "../PPO-Model/" + run_name + "_last.pt"
-            torch.save(agent, save_dir)
-            print("save model to " + save_dir)
-
     # start the game
     total_update_step = args.target_num * args.total_timesteps // args.datasetSize
     target_steps = [0 for i in range(args.target_num)]

@@ -112,7 +103,7 @@ if __name__ == "__main__":
         with torch.no_grad():
             # predict actions
             action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
-                torch.Tensor(state).to(device)
+                torch.tensor(state,dtype=torch.float32).to(device)
             )
             value = value.flatten()

@@ -223,11 +214,16 @@ if __name__ == "__main__":
            )
            # print cost time as seconds
            print("cost time:", time.time() - start_time)
-            # New Record!
-            if TotalRewardMean > best_reward and args.save_model:
-                best_reward = target_reward_mean
-                saveDir = "../PPO-Model/" + run_name + "_" + str(TotalRewardMean) + ".pt"
-                torch.save(agent, saveDir)
+            # New Record! or save model
+            if ((is_save_model() or TotalRewardMean > best_reward) and args.save_model):
+                # check saveDir is exist
+                saveDir = "../PPO-Model/" + run_name + "/"
+                if not os.path.isdir(saveDir):
+                    os.mkdir(saveDir)
+                best_reward = TotalRewardMean
+                torch.save(agent, saveDir + str(TotalRewardMean) + ".pt")
+                print("Model Saved!")
+                set_save_model(False)
        else:
            # train mode off
            mean_reward_list = [] # for WANDB

@@ -250,7 +246,10 @@ if __name__ == "__main__":
     TotalRewardMean = np.mean(mean_reward_list)
     wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)

-    saveDir = "../PPO-Model/" + run_name + "_last.pt"
-    torch.save(agent, saveDir)
+    saveDir = "../PPO-Model/" + run_name + "/"
+    if not os.path.isdir(saveDir):
+        os.mkdir(saveDir)
+    best_reward = target_reward_mean
+    torch.save(agent, saveDir + "_last.pt")
     env.close()
     wdb_recorder.writer.close()
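The torch.Tensor(state) to torch.tensor(state, dtype=torch.float32) change above (and the matching edits in PPOMem below) makes the dtype explicit. A quick sketch of why the two constructors differ:

    # torch.Tensor always produces float32, while torch.tensor infers the dtype from its
    # input, so passing dtype=torch.float32 preserves the old behaviour explicitly.
    import torch

    print(torch.Tensor([1, 2, 3]).dtype)                       # torch.float32
    print(torch.tensor([1, 2, 3]).dtype)                       # torch.int64 (inferred)
    print(torch.tensor([1, 2, 3], dtype=torch.float32).dtype)  # torch.float32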

View File

@@ -58,10 +58,11 @@ class PPOMem:
                # print("Win! Broadcast reward!",rewardBF[-1])
                print(sum(thisRewardBF) / len(thisRewardBF))
                thisRewardBF[-1] = rewardBF[-1] - self.base_win_reward
+                # broadcast result reward, increase all reward in this round by remainTime * self.result_broadcast_ratio
                thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * self.result_broadcast_ratio)).tolist()
            else:
                print("!!!!!DIDNT GET RESULT REWARD!!!!!!", rewardBF[-1])
-        return torch.Tensor(thisRewardBF).to(self.device)
+        return torch.tensor(thisRewardBF,dtype=torch.float32).to(self.device)

    def save_memories(
        self,

@@ -88,7 +89,7 @@ class PPOMem:
            self.dones_bf[i].append(done[i])
            self.values_bf[i].append(value_cpu[i])
            if now_step % self.decision_period == 0:
-                # on decision period, add last skiped round's reward
+                # on decision period, add last skiped round's reward, only affact in decision_period != 1
                self.rewards_bf[i].append(reward[i] + last_reward[i])
            else:
                # not on decision period, only add this round's reward

@@ -101,10 +102,10 @@ class PPOMem:
                thisRewardsTensor = self.broad_cast_end_reward(self.rewards_bf[i], remainTime)
                adv, rt = agent.gae(
                    rewards=thisRewardsTensor,
-                    dones=torch.Tensor(self.dones_bf[i]).to(self.device),
+                    dones=torch.tensor(self.dones_bf[i],dtype=torch.float32).to(self.device),
                    values=torch.tensor(self.values_bf[i]).to(self.device),
                    next_obs=torch.tensor(next_state[i]).to(self.device).unsqueeze(0),
-                    next_done=torch.Tensor([next_done[i]]).to(self.device),
+                    next_done=torch.tensor([next_done[i]],dtype=torch.float32).to(self.device),
                )
                # send memories to training datasets
                self.obs[roundTargetType] = torch.cat((self.obs[roundTargetType], torch.tensor(np.array(self.ob_bf[i])).to(self.device)), 0)
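A small worked example of the end-of-round broadcast that the new comment describes; base_win_reward = 999 and result_broadcast_ratio = 1/30 match the defaults in arguments.py, while the per-step rewards and remaining time are made up:

    # Sketch of broad_cast_end_reward: subtract the base win reward from the final step,
    # then add remainTime * result_broadcast_ratio to every step of the round.
    import numpy as np

    base_win_reward = 999
    result_broadcast_ratio = 1 / 30      # RESULT_BROADCAST_RATIO = 1 / ENV_TIMELIMIT
    this_reward_bf = [0.1, 0.2, 999.0]   # last entry is the raw win reward (illustrative)
    remain_time = 15.0

    this_reward_bf[-1] = this_reward_bf[-1] - base_win_reward
    this_reward_bf = (np.asarray(this_reward_bf) + remain_time * result_broadcast_ratio).tolist()
    print(this_reward_bf)                # each step gains +0.5 for winning with 15 s left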

View File

@@ -0,0 +1,56 @@
This project uses the following command-line arguments to configure the runtime environment and model training. A usage sketch follows the list.

- `--seed <int>`: random seed for the experiment. Default: `9331`.
- `--path <str>`: path to the environment. Default: `"./Build/3.6/Aimbot-ParallelEnv"`.
- `--workerID <int>`: Unity worker ID. Default: `1`.
- `--baseport <int>`: port used to connect to the Unity environment. Default: `500`.
- `--lr <float>`: default learning rate of the optimizer. Default: `5e-5`.
- `--cuda`: if enabled, CUDA is used by default. Pass `true` or `false` to turn it on or off.
- `--total-timesteps <int>`: total number of timesteps of the experiment. Default: `3150000`.

### Model parameters
- `--train`: whether to train the model. Enabled by default.
- `--freeze-viewnet`: whether to freeze the view network (raycast). Default: `False`.
- `--datasetSize <int>`: size of the training dataset; training starts once enough data has been collected. Default: `6000`.
- `--minibatchSize <int>`: minibatch size. Default: `512`.
- `--epochs <int>`: number of iterations K for updating the policy. Default: `3`.
- `--annealLR`: whether to anneal the learning rate of the policy and value networks. Default: `True`.
- `--wandb-track`: whether to track the run on wandb. Default: `False`.
- `--save-model`: whether to save the model. Default: `False`.
- `--wandb-entity <str>`: entity of the wandb project. Default: `"koha9"`.
- `--load-dir <str>`: directory to load the model from. Default: `None`.
- `--decision-period <int>`: timestep interval between executed actions. Default: `1`.
- `--result-broadcast-ratio <float>`: ratio used to broadcast the result reward when a round is won. Default: `1/30`.
- `--target-lr <float>`: target value when downscaling the learning rate. Default: `1e-6`.

### Loss function parameters
- `--policy-coef <float>`: coefficient of the policy loss. Default: `[0.8, 0.8, 0.8, 0.8]`.
- `--entropy-coef <float>`: coefficient of the entropy loss. Default: `[0.05, 0.05, 0.05, 0.05]`.
- `--critic-coef <float>`: coefficient of the critic loss. Default: `[1.0, 1.0, 1.0, 1.0]`.
- `--loss-coef <float>`: coefficient of the total loss. Default: `[1.0, 1.0, 1.0, 1.0]`.

### GAE loss parameters
- `--gae`: whether to use GAE for advantage computation. Enabled by default.
- `--norm-adv`: whether to normalize advantages. Default: `False`.
- `--gamma <float>`: discount factor gamma. Default: `0.999`.
- `--gaeLambda <float>`: lambda for GAE. Default: `0.95`.
- `--clip-coef <float>`: surrogate clipping coefficient. Default: `0.11`.
- `--clip-vloss`: whether to use the clipped value-function loss from the paper. Enabled by default.
- `--max-grad-norm <float>`: maximum norm for gradient clipping. Default: `0.5`.

### Environment parameters
- `--target-num <int>`: number of target types. Default: `4`.
- `--env-timelimit <int>`: time limit per round. Default: `30`.
- `--base-win-reward <int>`: base reward for winning a round. Default: `999`.
- `--base-lose-reward <int>`: base reward for losing a round. Default: `-999`.
- `--target-state-size <int>`: size of the target state. Default: `6`.
- `--time-state-size <int>`: size of the remaining-time state. Default: `1`.
- `--gun-state-size <int>`: size of the gun state. Default: `1`.
- `--my-state-size <int>`: size of the agent's own state. Default: `4`.
- `--total-target-size <int>`: total size of the target state. Default: `12`.
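As referenced above, a usage sketch. The flag names and defaults come from the list; the entry-point file name is not shown in this compare, so "train.py" is a placeholder:

    # Usage sketch (hypothetical entry point "train.py"); parse_args is the helper in arguments.py.
    import sys
    from arguments import parse_args

    sys.argv = ["train.py", "--path", "./Build/3.6/Aimbot-ParallelEnv",
                "--lr", "5e-5", "--wandb-track", "true", "--save-model", "true"]
    args = parse_args()
    print(args.lr, args.wandb_track, args.save_model)  # expected: 5e-05 True True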

View File

@ -0,0 +1,52 @@
- `--seed <int>`実験の乱数Seed。デフォルト値は`9331`。
- `--path <str>`:環境パス。デフォルト値は`"./Build/3.6/Aimbot-ParallelEnv"`。
- `--workerID <int>`Unity Worker ID。デフォルト値は`1`。
- `--baseport <int>`Unity環境への接続用Port。デフォルト値は`500`。
- `--lr <float>`Optimizerのデフォルト学習率。デフォルト値は`5e-5`。
- `--cuda`有効にすると、デフォルトでcudaを使用します。`true`または`false`を渡すことで有効/無効を切り替えられます。
- `--total-timesteps <int>`:実験の合計タイムステップ数。デフォルト値は`3150000`。
### モデルパラメータ
- `--train`:モデルを訓練するかどうか。デフォルトで有効。
- `--freeze-viewnet`:ビューネットワーク(raycast)をfreezeする。デフォルトは`False`。
- `--datasetSize <int>`:訓練データセットのサイズ。データセットが十分なデータを集めたら訓練を開始する。デフォルト値は`6000`。
- `--minibatchSize <int>`minibatchのサイズ。デフォルト値は`512`。
- `--epochs <int>`epochs。デフォルト値は`3`。
- `--annealLR`:ポリシーとバリューネットワークの学習率を退火するかどうか。デフォルトは`True`。
- `--wandb-track`wandbでトラッキングするかどうか。デフォルトは`False`。
- `--save-model`:モデルを保存するかどうか。デフォルトは`False`。
- `--wandb-entity <str>`wandbプロジェクトのエンティティ。デフォルト値は`"koha9"`。
- `--load-dir <str>`:モデルのロードディレクトリ。デフォルト値は`None`。
- `--decision-period <int>`:実際動作を実行する時のタイムステップの間隔。デフォルト値は`1`。
- `--result-broadcast-ratio <float>`ラウンドに勝った場合の報酬のbroadcast ratio、デフォルト値は`1/30`。
- `--target-lr <float>`:学習率を下げる時の目標値。デフォルト値は`1e-6`。
### 損失関数パラメータ
- `--policy-coef <float>`policy損失の係数。デフォルト値は`[0.8, 0.8, 0.8, 0.8]`。
- `--entropy-coef <float>`entropy損失の係数。デフォルト値は`[0.05, 0.05, 0.05, 0.05]`。
- `--critic-coef <float>`critic損失の係数。デフォルト値は`[1.0, 1.0, 1.0, 1.0]`。
- `--loss-coef <float>`:全体の損失の係数。デフォルト値は`[1.0, 1.0, 1.0, 1.0]`。
### GAE損失パラメータ
- `--gae`GAEを使用してアドバンテージを計算するかどうか。デフォルトで有効。
- `--norm-adv`:アドバンテージを正規化するかどうか。デフォルトは`False`。
- `--gamma <float>`割引因子gamma。デフォルト値は`0.999`。
- `--gaeLambda <float>`GAEのlambda値。デフォルト値は`0.95`。
- `--clip-coef <float>`:代替クリッピング係数。デフォルト値は`0.11`。
- `--clip-vloss`:論文で述べられている価値関数の損失のクリッピングを使用するかどうか。デフォルトで有効。
- `--max-grad-norm <float>`:勾配のクリッピングの最大ノルム。デフォルト値は`0.5`。
### 環境パラメータ
- `--target-num <int>`Targetの種類数。デフォルト値は`4`。
- `--env-timelimit <int>`:ラウンドごとの時間制限。デフォルト値は`30`。
- `--base-win-reward <int>`:ラウンドに勝った場合の基本報酬。デフォルト値は`999`。
- `--base-lose-reward <int>`:ラウンドに負けた場合の基本報酬。デフォルト値は`-999`。
- `--target-state-size <int>`Targetの状態サイズ。デフォルト値は`6`。
- `--time-state-size <int>`:ゲームの残り時間の状態サイズ。デフォルト値は`1`。
- `--gun-state-size <int>`:銃の状態サイズ。デフォルト値は`1`。
- `--my-state-size <int>`:自分の状態サイズ。デフォルト値は`4`。
- `--total-target-size <int>`全Targetの状態サイズ。デフォルト値は`12`。
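The true/false toggles above (`--cuda`, `--train`, `--annealLR`, and similar) follow the strtobool pattern used in arguments.py. A standalone sketch of how such a flag behaves; the add_argument line is illustrative rather than copied from the repository:

    # Sketch of a strtobool-style boolean toggle like the ones documented above.
    import argparse
    from distutils.util import strtobool

    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)),
                        default=True, nargs="?", const=True)

    print(parser.parse_args([]).cuda)                   # True  (default)
    print(parser.parse_args(["--cuda"]).cuda)           # True  (bare flag)
    print(parser.parse_args(["--cuda", "false"]).cuda)  # False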

View File

@@ -4,41 +4,38 @@ import uuid
 from distutils.util import strtobool

 DEFAULT_SEED = 9331
-ENV_PATH = "../Build/3.1.6/Aimbot-ParallelEnv"
+ENV_PATH = "../Build/3.6/Aimbot-ParallelEnv"
 WAND_ENTITY = "koha9"
 WORKER_ID = 1
 BASE_PORT = 1000

 # tensorboard names
-GAME_NAME = "Aimbot_Target_Hybrid_PMNN_V3"
-GAME_TYPE = "Mix_Verification"
 # max round steps per agent is 2500/Decision_period, 25 seconds
 TOTAL_STEPS = 3150000
 BATCH_SIZE = 512
 MAX_TRAINNING_DATASETS = 6000
 DECISION_PERIOD = 1
-LEARNING_RATE = 6.5e-4
-GAMMA = 0.99
+LEARNING_RATE = 5e-5
+GAMMA = 0.999
 GAE_LAMBDA = 0.95
 EPOCHS = 3
 CLIP_COEF = 0.11
 LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
-POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
+POLICY_COEF = [0.8, 0.8, 0.8, 0.8]
 ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
-CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
+CRITIC_COEF = [1.0, 1.0, 1.0, 1.0]
 TARGET_LEARNING_RATE = 1e-6

 FREEZE_VIEW_NETWORK = False
-BROADCASTREWARD = False
 ANNEAL_LEARNING_RATE = True
 CLIP_VLOSS = True
 NORM_ADV = False
-TRAIN = False
-SAVE_MODEL = False
-WANDB_TACK = False
+TRAIN = True
+SAVE_MODEL = True
+WANDB_TACK = True
 LOAD_DIR = None
-#LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt"
+# LOAD_DIR = "../PPO-Model/GotoOnly-Level1234_9331_1697122986/8.853553.pt"


 # Unity Environment Parameters
 TARGET_STATE_SIZE = 6

@@ -53,6 +50,16 @@ TARGETNUM= 4
 ENV_TIMELIMIT = 30
 RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT

+save_model_this_episode = False
+
+def is_save_model():
+    global save_model_this_episode
+    return save_model_this_episode
+
+def set_save_model(save_model:bool):
+    print("set save model to ",save_model)
+    global save_model_this_episode
+    save_model_this_episode = save_model

 def parse_args():
     # fmt: off
     # pytorch and environment parameters

@@ -97,8 +104,6 @@ def parse_args():
                        help="the number of steps to run in each environment per policy rollout")
    parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
                        help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
-    parser.add_argument("--broadCastEndReward", type=lambda x: bool(strtobool(x)), default=BROADCASTREWARD, nargs="?", const=True,
-                        help="save model or not")
    # target_learning_rate
    parser.add_argument("--target-lr", type=float, default=TARGET_LEARNING_RATE,
                        help="target value of downscaling the learning rate")

View File

@@ -14,6 +14,8 @@ def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
     nn.init.constant_(layer.bias, bias_const)
     return layer

+neural_size_1 = 400
+neural_size_2 = 300

 class PPOAgent(nn.Module):
     def __init__(

@@ -31,99 +33,76 @@ class PPOAgent(nn.Module):
         self.unity_action_size = env.unity_action_size
         self.state_size = self.unity_observation_shape[0]
         self.agent_num = env.unity_agent_num
-        self.target_size = self.args.target_state_size
-        self.time_state_size = self.args.time_state_size
-        self.gun_state_size = self.args.gun_state_size
-        self.my_state_size = self.args.my_state_size
-        self.ray_state_size = env.unity_observation_shape[0] - self.args.total_target_size
-        self.state_size_without_ray = self.args.total_target_size
-        self.head_input_size = (
-            env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size
-        ) # except target state input
         self.unity_discrete_type = env.unity_discrete_type
         self.discrete_size = env.unity_discrete_size
         self.discrete_shape = list(env.unity_discrete_branches)
         self.continuous_size = env.unity_continuous_size

-        self.view_network = nn.Sequential(layer_init(nn.Linear(self.ray_state_size, 200)), nn.LeakyReLU())
-        self.target_networks = nn.ModuleList(
+        self.hidden_networks = nn.ModuleList(
            [
-                nn.Sequential(layer_init(nn.Linear(self.state_size_without_ray, 100)), nn.LeakyReLU())
-                for i in range(self.target_num)
-            ]
-        )
-        self.middle_networks = nn.ModuleList(
-            [
-                nn.Sequential(layer_init(nn.Linear(300, 200)), nn.LeakyReLU())
+                nn.Sequential(
+                    layer_init(nn.Linear(self.state_size, neural_size_1)),
+                    nn.LeakyReLU(),
+                    layer_init(nn.Linear(neural_size_1, neural_size_2)),
+                    nn.LeakyReLU(),
+                )
                for i in range(self.target_num)
            ]
        )
        self.actor_dis = nn.ModuleList(
-            [layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(self.target_num)]
+            [layer_init(nn.Linear(neural_size_2, self.discrete_size), std=0.5) for i in range(self.target_num)]
        )
        self.actor_mean = nn.ModuleList(
-            [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(self.target_num)]
+            [layer_init(nn.Linear(neural_size_2, self.continuous_size), std=0) for i in range(self.target_num)]
        )
        self.actor_logstd = nn.ParameterList(
            [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
-        ) # nn.Parameter(torch.zeros(1, self.continuous_size))
+        )
        self.critic = nn.ModuleList(
-            [layer_init(nn.Linear(200, 1), std=1) for i in range(self.target_num)]
+            [layer_init(nn.Linear(neural_size_2, 1), std=0) for i in range(self.target_num)]
        )

    def get_value(self, state: torch.Tensor):
-        # get critic value
-        # state.size()[0] is batch_size
        target = state[:, 0].to(torch.int32) # int
-        this_state_num = target.size()[0]
-        view_input = state[:, -self.ray_state_size:] # all ray input
-        target_input = state[:, : self.state_size_without_ray]
-        view_layer = self.view_network(view_input)
-        target_layer = torch.stack(
-            [self.target_networks[target[i]](target_input[i]) for i in range(this_state_num)]
-        )
-        middle_input = torch.cat([view_layer, target_layer], dim=1)
-        middle_layer = torch.stack(
-            [self.middle_networks[target[i]](middle_input[i]) for i in range(this_state_num)]
+        hidden_output = torch.stack(
+            [self.hidden_networks[target[i]](state[i]) for i in range(state.size()[0])]
        )
        criticV = torch.stack(
-            [self.critic[target[i]](middle_layer[i]) for i in range(this_state_num)]
-        ) # self.critic
+            [self.critic[target[i]](hidden_output[i]) for i in range(state.size()[0])]
+        )
        return criticV

    def get_actions_value(self, state: torch.Tensor, actions=None):
-        # get actions and value
        target = state[:, 0].to(torch.int32) # int
-        this_state_num = target.size()[0]
-        view_input = state[:, -self.ray_state_size:] # all ray input
-        target_input = state[:, : self.state_size_without_ray]
-        view_layer = self.view_network(view_input)
-        target_layer = torch.stack(
-            [self.target_networks[target[i]](target_input[i]) for i in range(this_state_num)]
-        )
-        middle_input = torch.cat([view_layer, target_layer], dim=1)
-        middle_layer = torch.stack(
-            [self.middle_networks[target[i]](middle_input[i]) for i in range(this_state_num)]
+        hidden_output = torch.stack(
+            [self.hidden_networks[target[i]](state[i]) for i in range(target.size()[0])]
        )
        # discrete
        # iterate over the targets (i.e. the agents) so that each sample uses the output network matching its target
        dis_logits = torch.stack(
-            [self.actor_dis[target[i]](middle_layer[i]) for i in range(this_state_num)]
+            [self.actor_dis[target[i]](hidden_output[i]) for i in range(target.size()[0])]
        )
        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
        # continuous
        actions_mean = torch.stack(
-            [self.actor_mean[target[i]](middle_layer[i]) for i in range(this_state_num)]
+            [self.actor_mean[target[i]](hidden_output[i]) for i in range(target.size()[0])]
        ) # self.actor_mean(hidden)
        action_logstd = torch.stack(
-            [torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(this_state_num)]
+            [torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(target.size()[0])]
        )
        # print(action_logstd)
        action_std = torch.exp(action_logstd) # torch.exp(action_logstd)
        con_probs = Normal(actions_mean, action_std)
        # critic
        criticV = torch.stack(
-            [self.critic[target[i]](middle_layer[i]) for i in range(this_state_num)]
+            [self.critic[target[i]](hidden_output[i]) for i in range(target.size()[0])]
        ) # self.critic

        if actions is None:

@@ -275,8 +254,8 @@ class PPOAgent(nn.Module):
        self,
        rewards: torch.Tensor,
        dones: torch.Tensor,
-        values: torch.tensor,
-        next_obs: torch.tensor,
+        values: torch.Tensor,
+        next_obs: torch.Tensor,
        next_done: torch.Tensor,
    ) -> tuple:
        # GAE
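The restructured agent routes every sample through one of target_num fully separated sub-networks, selected by the target id stored in the first state feature. A standalone sketch of that dispatch; the sizes are illustrative, not the real observation shape:

    # Minimal sketch of the per-target dispatch used by hidden_networks/actor/critic above:
    # feature 0 of each state row holds the target id, which selects the sub-network.
    import torch
    import torch.nn as nn

    target_num, state_size, hidden = 4, 10, 8
    nets = nn.ModuleList(
        [nn.Sequential(nn.Linear(state_size, hidden), nn.LeakyReLU()) for _ in range(target_num)]
    )

    state = torch.rand(5, state_size)
    state[:, 0] = torch.randint(0, target_num, (5,)).float()  # target id in feature 0
    target = state[:, 0].to(torch.int32)
    out = torch.stack([nets[target[i]](state[i]) for i in range(state.size(0))])
    print(out.shape)  # torch.Size([5, 8])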