Compare commits
3 Commits

MultiThrea... → OffP-FullM...

| Author | SHA1 | Date |
|---|---|---|
| | 573b09a920 | |
| | 9d9524429c | |
| | 5aa7e0936a | |
@@ -81,43 +81,184 @@
 "cell_type": "code",
 "execution_count": 1,
 "metadata": {},
+"outputs": [],
+"source": [
+"import argparse\n",
+"import wandb\n",
+"import time\n",
+"import numpy as np\n",
+"import random\n",
+"import uuid\n",
+"import torch\n",
+"import torch.nn as nn\n",
+"import torch.optim as optim\n",
+"\n",
+"from AimbotEnv import Aimbot\n",
+"from tqdm import tqdm\n",
+"from torch.distributions.normal import Normal\n",
+"from torch.distributions.categorical import Categorical\n",
+"from distutils.util import strtobool\n",
+"from torch.utils.tensorboard import SummaryWriter\n",
+"from mlagents_envs.environment import UnityEnvironment\n",
+"from mlagents_envs.side_channel.side_channel import (\n",
+"    SideChannel,\n",
+"    IncomingMessage,\n",
+"    OutgoingMessage,\n",
+")\n",
+"from typing import List\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 5,
+"metadata": {},
+"outputs": [
+{
+"ename": "AttributeError",
+"evalue": "'aaa' object has no attribute 'outa'",
+"output_type": "error",
+"traceback": [
+"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
+"Cell \u001b[1;32mIn[5], line 14\u001b[0m\n\u001b[0;32m 12\u001b[0m asd \u001b[39m=\u001b[39m aaa(outa, outb)\n\u001b[0;32m 13\u001b[0m asd\u001b[39m.\u001b[39mfunc()\n\u001b[1;32m---> 14\u001b[0m \u001b[39mprint\u001b[39m(asd\u001b[39m.\u001b[39;49mouta) \u001b[39m# 输出 100\u001b[39;00m\n",
+"\u001b[1;31mAttributeError\u001b[0m: 'aaa' object has no attribute 'outa'"
+]
+}
+],
+"source": [
+"class aaa():\n",
+"    def __init__(self, a, b):\n",
+"        self.a = a\n",
+"        self.b = b\n",
+"\n",
+"    def func(self):\n",
+"        global outa\n",
+"        outa = 100\n",
+"\n",
+"outa = 1\n",
+"outb = 2\n",
+"asd = aaa(outa, outb)\n",
+"asd.func()\n",
+"print(asd.outa) # 输出 100"
+]
+},
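The AttributeError above is the point of the cell: `func()` rebinds the module-level global `outa` rather than creating an instance attribute, so `asd.outa` never exists while the global name `outa` has become 100. A minimal sketch of the same behaviour, reproduced here outside the notebook for clarity:

```python
class aaa:
    def __init__(self, a, b):
        self.a = a
        self.b = b

    def func(self):
        global outa      # rebinds the module-level name, does NOT create self.outa
        outa = 100

outa = 1
asd = aaa(outa, 2)
asd.func()
print(outa)        # 100: the global was updated
# print(asd.outa)  # would raise AttributeError: 'aaa' object has no attribute 'outa'
```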
+{
+"cell_type": "code",
+"execution_count": 2,
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"usage: ipykernel_launcher.py [-h] [--seed SEED]\n",
+"ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme=\"hmac-sha256\" --Session.key=b\"46ef9317-59fb-4ab6-ae4e-6b35744fc423\" --shell=9002 --transport=\"tcp\" --iopub=9004 --f=c:\\Users\\UCUNI\\AppData\\Roaming\\jupyter\\runtime\\kernel-v2-311926K1uko38tdWb.json\n"
+]
+},
+{
+"ename": "SystemExit",
+"evalue": "2",
+"output_type": "error",
+"traceback": [
+"An exception has occurred, use %tb to see the full traceback.\n",
+"\u001b[1;31mSystemExit\u001b[0m\u001b[1;31m:\u001b[0m 2\n"
+]
+}
+],
+"source": [
+"import argparse\n",
+"\n",
+"def parse_args():\n",
+"    parser = argparse.ArgumentParser()\n",
+"    parser.add_argument(\"--seed\", type=int, default=11,\n",
+"        help=\"seed of the experiment\")\n",
+"    args = parser.parse_args()\n",
+"    return args\n",
+"\n",
+"arggg = parse_args()\n",
+"print(type(arggg))"
+]
+},
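The SystemExit in this cell is the expected behaviour of `argparse` inside a Jupyter kernel: the kernel's own flags (`--ip`, `--stdin`, and so on) are present in `sys.argv` and the parser rejects them. A common workaround, shown here as a sketch rather than as the repository's code, is to pass an explicit argument list or fall back to `parse_known_args()`:

```python
import argparse

def parse_args(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=11,
                        help="seed of the experiment")
    # parse_known_args ignores the kernel's extra flags instead of calling sys.exit()
    args, _unknown = parser.parse_known_args(argv)
    return args

args = parse_args([])            # an explicit (empty) argv also works inside a notebook
print(type(args), args.seed)     # <class 'argparse.Namespace'> 11
```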
+{
+"cell_type": "code",
+"execution_count": 4,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"3"
+]
+},
+"execution_count": 4,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"y=\"a;b;c\"\n",
+"len(y.split(\";\"))"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 5,
+"metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"0\n",
-"i = 0\n",
-"i = 1\n",
-"i = 2\n",
-"i = 3\n",
-"i = 4\n",
-"i = 5\n",
-"i = 6\n",
-"i = 7\n",
-"i = 8\n",
-"i = 9\n",
-"10\n"
+"[2]\n"
 ]
 }
 ],
 "source": [
-"import threading\n",
-"\n",
-"num = 0\n",
-"\n",
-"def print_numers():\n",
-"    global num\n",
-"    for i in range(10):\n",
-"        num +=1\n",
-"        print(\"i = \",i)\n",
-"\n",
-"thread = threading.Thread(target=print_numers)\n",
-"\n",
-"print(num)\n",
-"thread.start()\n",
-"thread.join()\n",
-"print(num)"
+"a = np.array([1,2,3,4])\n",
+"print(a[[False,True,False,False]])"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 16,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"{1, 2, 3, 4}"
+]
+},
+"execution_count": 16,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"a = {1,2,3}\n",
+"a.add(4)\n",
+"a"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 9,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"array([3, 4])"
+]
+},
+"execution_count": 9,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"a = np.array([[1,3],[2,4]])\n",
+"a.max(axis=1)\n"
 ]
 }
 ],
@@ -19,8 +19,8 @@ import torch.optim as optim
 # side channel uuid
 SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
 # tensorboard names
-GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel_V2"
-GAME_TYPE = "GotoOnly-Level0123-new512Model"
+GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel"
+GAME_TYPE = "GotoOnly-3.6-Level0123-newModel-Onehot"
 
 if __name__ == "__main__":
     args = parse_args()
@@ -58,6 +58,7 @@ class PPOMem:
                 # print("Win! Broadcast reward!",rewardBF[-1])
                 print(sum(thisRewardBF) / len(thisRewardBF))
                 thisRewardBF[-1] = rewardBF[-1] - self.base_win_reward
+                # broadcast result reward, increase all reward in this round by remainTime * self.result_broadcast_ratio
                 thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * self.result_broadcast_ratio)).tolist()
             else:
                 print("!!!!!DIDNT GET RESULT REWARD!!!!!!", rewardBF[-1])
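The comment added in this hunk documents the win branch of the reward bookkeeping: the base win reward is stripped from the final step, and every reward of the round is then raised by `remainTime * result_broadcast_ratio`. A self-contained sketch of that broadcast with made-up numbers (the real method operates on per-agent buffers):

```python
import numpy as np

result_broadcast_ratio = 1 / 30      # documented default of --result-broadcast-ratio
base_win_reward = 999                # documented default of --base-win-reward
remainTime = 12.0                    # hypothetical seconds left when the round was won
rewardBF = [0.1, 0.2, 0.3, 999.5]    # hypothetical per-step rewards; the last step contains the win reward

thisRewardBF = list(rewardBF)
thisRewardBF[-1] = rewardBF[-1] - base_win_reward   # keep only the margin above the base win reward
thisRewardBF = (np.asarray(thisRewardBF) + remainTime * result_broadcast_ratio).tolist()
print(thisRewardBF)  # every step in the round gained remainTime * ratio = 0.4
```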
@@ -88,7 +89,7 @@ class PPOMem:
             self.dones_bf[i].append(done[i])
             self.values_bf[i].append(value_cpu[i])
             if now_step % self.decision_period == 0:
-                # on decision period, add last skiped round's reward
+                # on decision period, add last skiped round's reward, only affact in decision_period != 1
                 self.rewards_bf[i].append(reward[i] + last_reward[i])
             else:
                 # not on decision period, only add this round's reward
Aimbot-PPO-Python/Pytorch/arguments-cn.md (new file, 56 lines)
@@ -0,0 +1,56 @@

This project uses the following command-line arguments to configure the runtime environment and the training parameters:

- `--seed <int>`: Random seed of the experiment. Default: `9331`.
- `--path <str>`: Path of the environment. Default: `"./Build/3.6/Aimbot-ParallelEnv"`.
- `--workerID <int>`: Unity worker ID. Default: `1`.
- `--baseport <int>`: Port used to connect to the Unity environment. Default: `500`.
- `--lr <float>`: Default learning rate of the optimizer. Default: `5e-5`.
- `--cuda`: If enabled, CUDA is used by default. Pass `true` or `false` to toggle it.
- `--total-timesteps <int>`: Total number of timesteps of the experiment. Default: `3150000`.

### Model parameters

- `--train`: Whether to train the model. Enabled by default.
- `--freeze-viewnet`: Whether to freeze the view network (raycast). Default: `False`.
- `--datasetSize <int>`: Size of the training dataset; training starts once enough data has been collected. Default: `6000`.
- `--minibatchSize <int>`: Minibatch size. Default: `512`.
- `--epochs <int>`: Number of K iterations used to update the policy. Default: `3`.
- `--annealLR`: Whether to anneal the learning rate of the policy and value networks. Default: `True`.
- `--wandb-track`: Whether to track the run on wandb. Default: `False`.
- `--save-model`: Whether to save the model. Default: `False`.
- `--wandb-entity <str>`: Entity of the wandb project. Default: `"koha9"`.
- `--load-dir <str>`: Directory to load the model from. Default: `None`.
- `--decision-period <int>`: Interval in timesteps between executed actions. Default: `1`.
- `--result-broadcast-ratio <float>`: Ratio used to broadcast the result reward when a round is won. Default: `1/30`.
- `--target-lr <float>`: Target value when annealing the learning rate downward. Default: `1e-6`.

### Loss function parameters

- `--policy-coef <float>`: Coefficient of the policy loss. Default: `[0.8, 0.8, 0.8, 0.8]`.
- `--entropy-coef <float>`: Coefficient of the entropy loss. Default: `[0.05, 0.05, 0.05, 0.05]`.
- `--critic-coef <float>`: Coefficient of the critic loss. Default: `[1.0, 1.0, 1.0, 1.0]`.
- `--loss-coef <float>`: Coefficient of the total loss. Default: `[1.0, 1.0, 1.0, 1.0]`.

### GAE loss parameters

- `--gae`: Whether to use GAE for advantage computation. Enabled by default.
- `--norm-adv`: Whether to normalize advantages. Default: `False`.
- `--gamma <float>`: Discount factor gamma. Default: `0.999`.
- `--gaeLambda <float>`: Lambda value of GAE. Default: `0.95`.
- `--clip-coef <float>`: Surrogate clipping coefficient. Default: `0.11`.
- `--clip-vloss`: Whether to use the clipped value-function loss from the paper. Enabled by default.
- `--max-grad-norm <float>`: Maximum norm for gradient clipping. Default: `0.5`.

### Environment parameters

- `--target-num <int>`: Number of target types. Default: `4`.
- `--env-timelimit <int>`: Time limit per round. Default: `30`.
- `--base-win-reward <int>`: Base reward for winning a round. Default: `999`.
- `--base-lose-reward <int>`: Base reward for losing a round. Default: `-999`.
- `--target-state-size <int>`: Size of the target state. Default: `6`.
- `--time-state-size <int>`: Size of the remaining-game-time state. Default: `1`.
- `--gun-state-size <int>`: Size of the gun state. Default: `1`.
- `--my-state-size <int>`: Size of the agent's own state. Default: `4`.
- `--total-target-size <int>`: Total size of the target state. Default: `12`.
Aimbot-PPO-Python/Pytorch/arguments-jp.md (new file, 52 lines)
@@ -0,0 +1,52 @@

- `--seed <int>`: Random seed of the experiment. Default: `9331`.
- `--path <str>`: Path of the environment. Default: `"./Build/3.6/Aimbot-ParallelEnv"`.
- `--workerID <int>`: Unity worker ID. Default: `1`.
- `--baseport <int>`: Port for connecting to the Unity environment. Default: `500`.
- `--lr <float>`: Default learning rate of the optimizer. Default: `5e-5`.
- `--cuda`: When enabled, CUDA is used by default. Pass `true` or `false` to toggle it.
- `--total-timesteps <int>`: Total number of timesteps of the experiment. Default: `3150000`.

### Model parameters

- `--train`: Whether to train the model. Enabled by default.
- `--freeze-viewnet`: Freeze the view network (raycast). Default: `False`.
- `--datasetSize <int>`: Size of the training dataset; training starts once enough data has been collected. Default: `6000`.
- `--minibatchSize <int>`: Minibatch size. Default: `512`.
- `--epochs <int>`: Number of epochs. Default: `3`.
- `--annealLR`: Whether to anneal the learning rate of the policy and value networks. Default: `True`.
- `--wandb-track`: Whether to track the run with wandb. Default: `False`.
- `--save-model`: Whether to save the model. Default: `False`.
- `--wandb-entity <str>`: Entity of the wandb project. Default: `"koha9"`.
- `--load-dir <str>`: Directory to load the model from. Default: `None`.
- `--decision-period <int>`: Timestep interval between actually executed actions. Default: `1`.
- `--result-broadcast-ratio <float>`: Broadcast ratio applied to the reward when a round is won. Default: `1/30`.
- `--target-lr <float>`: Target value when lowering the learning rate. Default: `1e-6`.

### Loss function parameters

- `--policy-coef <float>`: Coefficient of the policy loss. Default: `[0.8, 0.8, 0.8, 0.8]`.
- `--entropy-coef <float>`: Coefficient of the entropy loss. Default: `[0.05, 0.05, 0.05, 0.05]`.
- `--critic-coef <float>`: Coefficient of the critic loss. Default: `[1.0, 1.0, 1.0, 1.0]`.
- `--loss-coef <float>`: Coefficient of the total loss. Default: `[1.0, 1.0, 1.0, 1.0]`.

### GAE loss parameters

- `--gae`: Whether to use GAE for advantage computation. Enabled by default.
- `--norm-adv`: Whether to normalize advantages. Default: `False`.
- `--gamma <float>`: Discount factor gamma. Default: `0.999`.
- `--gaeLambda <float>`: Lambda value of GAE. Default: `0.95`.
- `--clip-coef <float>`: Surrogate clipping coefficient. Default: `0.11`.
- `--clip-vloss`: Whether to use the clipped value-function loss described in the paper. Enabled by default.
- `--max-grad-norm <float>`: Maximum norm for gradient clipping. Default: `0.5`.

### Environment parameters

- `--target-num <int>`: Number of target types. Default: `4`.
- `--env-timelimit <int>`: Time limit per round. Default: `30`.
- `--base-win-reward <int>`: Base reward for winning a round. Default: `999`.
- `--base-lose-reward <int>`: Base reward for losing a round. Default: `-999`.
- `--target-state-size <int>`: Size of the target state. Default: `6`.
- `--time-state-size <int>`: Size of the remaining-game-time state. Default: `1`.
- `--gun-state-size <int>`: Size of the gun state. Default: `1`.
- `--my-state-size <int>`: Size of the agent's own state. Default: `4`.
- `--total-target-size <int>`: Total size of the target state. Default: `12`.
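Both argument documents expose the same GAE knobs (`--gamma`, `--gaeLambda`). For reference, this is a generic sketch of how those two values combine in Generalized Advantage Estimation; it is textbook code, not taken from this repository:

```python
import numpy as np

def compute_gae(rewards, values, dones, last_value, gamma=0.999, gae_lambda=0.95):
    """Generic GAE over one rollout; dones[t] == 1.0 means the episode ended at step t."""
    advantages = np.zeros_like(rewards, dtype=np.float64)
    last_gae = 0.0
    for t in reversed(range(len(rewards))):
        next_value = last_value if t == len(rewards) - 1 else values[t + 1]
        next_nonterminal = 1.0 - dones[t]          # do not bootstrap across an episode boundary
        delta = rewards[t] + gamma * next_value * next_nonterminal - values[t]
        last_gae = delta + gamma * gae_lambda * next_nonterminal * last_gae
        advantages[t] = last_gae
    return advantages, advantages + values          # advantages and value targets (returns)

rewards = np.array([0.0, 0.0, 1.0])
values = np.array([0.2, 0.4, 0.6])
dones = np.array([0.0, 0.0, 1.0])
print(compute_gae(rewards, values, dones, last_value=0.0))
```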
@@ -4,7 +4,7 @@ import uuid
 from distutils.util import strtobool
 
 DEFAULT_SEED = 9331
-ENV_PATH = "../Build/3.5/Aimbot-ParallelEnv"
+ENV_PATH = "../Build/3.6/Aimbot-ParallelEnv"
 WAND_ENTITY = "koha9"
 WORKER_ID = 1
 BASE_PORT = 1000
@@ -16,15 +16,15 @@ TOTAL_STEPS = 3150000
 BATCH_SIZE = 512
 MAX_TRAINNING_DATASETS = 6000
 DECISION_PERIOD = 1
-LEARNING_RATE = 1.5e-4
-GAMMA = 0.99
+LEARNING_RATE = 5e-5
+GAMMA = 0.999
 GAE_LAMBDA = 0.95
 EPOCHS = 3
 CLIP_COEF = 0.11
 LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
 POLICY_COEF = [0.8, 0.8, 0.8, 0.8]
 ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
-CRITIC_COEF = [0.8, 0.8, 0.8, 0.8]
+CRITIC_COEF = [1.0, 1.0, 1.0, 1.0]
 TARGET_LEARNING_RATE = 1e-6
 
 FREEZE_VIEW_NETWORK = False
@@ -35,7 +35,7 @@ TRAIN = True
 SAVE_MODEL = True
 WANDB_TACK = True
 LOAD_DIR = None
-# LOAD_DIR = "../PPO-Model/GotoOnly-Level0123_9331_1696965321/5.1035867.pt"
+# LOAD_DIR = "../PPO-Model/GotoOnly-Level1234_9331_1697122986/8.853553.pt"
 
 # Unity Environment Parameters
 TARGET_STATE_SIZE = 6
@@ -1,255 +0,0 @@
import time
import numpy as np
import random
import uuid
import torch
import atexit
import os

from aimbotEnv import Aimbot
from aimbotEnv import AimbotSideChannel
from ppoagent import PPOAgent
from airecorder import WandbRecorder
from aimemory import PPOMem
from aimemory import Targets
from arguments import parse_args
from arguments import set_save_model, is_save_model
import torch.optim as optim

# side channel uuid
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
# tensorboard names
GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel_V2"
GAME_TYPE = "GotoOnly-Level0123-new512Model"

if __name__ == "__main__":
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
    best_reward = -1

    # Initialize environment agent optimizer
    aimbot_side_channel = AimbotSideChannel(SIDE_CHANNEL_UUID)
    env = Aimbot(
        env_path=args.path,
        worker_id=args.workerID,
        base_port=args.baseport,
        side_channels=[aimbot_side_channel])
    if args.load_dir is None:
        agent = PPOAgent(
            env=env,
            this_args=args,
            device=device,
        ).to(device)
    else:
        agent = torch.load(args.load_dir)
        # freeze
        if args.freeze_viewnet:
            # freeze the view network
            print("FREEZE VIEW NETWORK is not compatible with Full MNN!")
            raise NotImplementedError
        print("Load Agent", args.load_dir)
        print(agent.eval())
    # optimizer
    optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
    # Tensorboard and WandB Recorder
    run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
    wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)

    # start the game
    total_update_step = args.target_num * args.total_timesteps // args.datasetSize
    target_steps = [0 for i in range(args.target_num)]
    start_time = time.time()
    state, _, done = env.reset()

    # initialize AI memories
    ppo_memories = PPOMem(
        args=args,
        unity_agent_num=env.unity_agent_num,
        device=device,
    )

    # MAIN LOOP: run agent in environment
    for total_steps in range(total_update_step):
        # discount learning rate, while step == total_update_step lr will be 0
        if args.annealLR:
            final_lr_ratio = args.target_lr / args.lr
            frac = 1.0 - ((total_steps + 1.0) / total_update_step)
            lr_now = frac * args.lr
            optimizer.param_groups[0]["lr"] = lr_now
        else:
            lr_now = args.lr

        # episode start show learning rate
        print("new episode", total_steps, "learning rate = ", lr_now)
        step = 0
        training = False
        train_queue = []
        last_reward = [0. for i in range(env.unity_agent_num)]
        # MAIN LOOP: run agent in environment
        while True:
            # Target Type(state[0][0]) is stay(4),use all zero action
            if state[0][0] == 4:
                next_state, reward, next_done = env.step(env.all_zero_action)
                state, done = next_state, next_done
                continue
            # On decision point, and Target Type(state[0][0]) is not stay(4) choose action by agent
            if step % args.decision_period == 0:
                step += 1
                # Choose action by agent
                with torch.no_grad():
                    # predict actions
                    action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
                        torch.tensor(state, dtype=torch.float32).to(device)
                    )
                    value = value.flatten()

                # variable from GPU to CPU
                action_cpu = action.cpu().numpy()
                dis_logprob_cpu = dis_logprob.cpu().numpy()
                con_logprob_cpu = con_logprob.cpu().numpy()
                value_cpu = value.cpu().numpy()
                # Environment step
                next_state, reward, next_done = env.step(action_cpu)

                # save memories
                if args.train:
                    ppo_memories.save_memories(
                        now_step=step,
                        agent=agent,
                        state=state,
                        action_cpu=action_cpu,
                        dis_logprob_cpu=dis_logprob_cpu,
                        con_logprob_cpu=con_logprob_cpu,
                        reward=reward,
                        done=done,
                        value_cpu=value_cpu,
                        last_reward=last_reward,
                        next_done=next_done,
                        next_state=next_state,
                    )
                    # check if any training dataset is full and ready to train
                    for i in range(args.target_num):
                        if ppo_memories.obs[i].size()[0] >= args.datasetSize:
                            # start train NN
                            train_queue.append(i)
                    if len(train_queue) > 0:
                        # break while loop and start train
                        break
                # update state
                state, done = next_state, next_done
            else:
                step += 1
                # skip this step use last predict action
                next_state, reward, next_done = env.step(action_cpu)
                # save memories
                if args.train:
                    ppo_memories.save_memories(
                        now_step=step,
                        agent=agent,
                        state=state,
                        action_cpu=action_cpu,
                        dis_logprob_cpu=dis_logprob_cpu,
                        con_logprob_cpu=con_logprob_cpu,
                        reward=reward,
                        done=done,
                        value_cpu=value_cpu,
                        last_reward=last_reward,
                        next_done=next_done,
                        next_state=next_state,
                    )
                # update state
                state = next_state
                last_reward = reward

        if args.train:
            # train mode on
            mean_reward_list = []  # for WANDB
            # loop all training queue
            for this_train_ind in train_queue:
                # start time
                start_time = time.time()
                target_steps[this_train_ind] += 1
                # train agent
                (
                    v_loss,
                    dis_pg_loss,
                    con_pg_loss,
                    loss,
                    entropy_loss
                ) = agent.train_net(
                    this_train_ind=this_train_ind,
                    ppo_memories=ppo_memories,
                    optimizer=optimizer
                )
                # record mean reward before clear history
                print("done")
                target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
                mean_reward_list.append(target_reward_mean)
                targetName = Targets(this_train_ind).name

                # clear this target training set buffer
                ppo_memories.clear_training_datasets(this_train_ind)
                # record rewards for plotting purposes
                wdb_recorder.add_target_scalar(
                    targetName,
                    this_train_ind,
                    v_loss,
                    dis_pg_loss,
                    con_pg_loss,
                    loss,
                    entropy_loss,
                    target_reward_mean,
                    target_steps,
                )
                print(f"episode over Target{targetName} mean reward:", target_reward_mean)
            TotalRewardMean = np.mean(mean_reward_list)
            wdb_recorder.add_global_scalar(
                TotalRewardMean,
                optimizer.param_groups[0]["lr"],
                total_steps,
            )
            # print cost time as seconds
            print("cost time:", time.time() - start_time)
            # New Record! or save model
            if ((is_save_model() or TotalRewardMean > best_reward) and args.save_model):
                # check saveDir is exist
                saveDir = "../PPO-Model/" + run_name + "/"
                if not os.path.isdir(saveDir):
                    os.mkdir(saveDir)
                best_reward = TotalRewardMean
                torch.save(agent, saveDir + str(TotalRewardMean) + ".pt")
                print("Model Saved!")
                set_save_model(False)
        else:
            # train mode off
            mean_reward_list = []  # for WANDB
            # while not in training mode, clear the buffer
            for this_train_ind in train_queue:
                target_steps[this_train_ind] += 1
                targetName = Targets(this_train_ind).name
                target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
                mean_reward_list.append(target_reward_mean)
                print(target_steps[this_train_ind])

                # clear this target training set buffer
                ppo_memories.clear_training_datasets(this_train_ind)

                # record rewards for plotting purposes
                wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", target_reward_mean,
                                               target_steps[this_train_ind])
                wdb_recorder.add_win_ratio(targetName, target_steps[this_train_ind])
                print(f"episode over Target{targetName} mean reward:", target_reward_mean)
            TotalRewardMean = np.mean(mean_reward_list)
            wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)

    saveDir = "../PPO-Model/" + run_name + "/"
    if not os.path.isdir(saveDir):
        os.mkdir(saveDir)
    best_reward = target_reward_mean
    torch.save(agent, saveDir + "_last.pt")
    env.close()
    wdb_recorder.writer.close()
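One detail of the removed script worth noting: the anneal branch computes `final_lr_ratio = args.target_lr / args.lr` but never uses it, so `lr_now = frac * args.lr` decays linearly toward 0 rather than toward `--target-lr`. A small sketch of an anneal that actually bottoms out at the target value (illustrative only, not the replacement code):

```python
def anneal_lr(update_idx: int, total_updates: int, lr: float = 5e-5, target_lr: float = 1e-6) -> float:
    """Linearly decay from lr down to target_lr instead of all the way to 0."""
    final_lr_ratio = target_lr / lr
    frac = 1.0 - (update_idx + 1.0) / total_updates
    return (final_lr_ratio + (1.0 - final_lr_ratio) * frac) * lr

# The first update stays close to lr, the last update lands exactly on target_lr rather than 0.
print(anneal_lr(0, 100), anneal_lr(99, 100))
```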
@@ -8,14 +8,14 @@ from aimbotEnv import Aimbot
 from torch.distributions.normal import Normal
 from torch.distributions.categorical import Categorical
 
-firstLayerNum = 512
-secondLayerNum = 128
-
 def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
     nn.init.orthogonal_(layer.weight, std)
     nn.init.constant_(layer.bias, bias_const)
     return layer
 
+neural_size_1 = 400
+neural_size_2 = 300
+
 class PPOAgent(nn.Module):
     def __init__(
@@ -33,15 +33,6 @@ class PPOAgent(nn.Module):
         self.unity_action_size = env.unity_action_size
         self.state_size = self.unity_observation_shape[0]
         self.agent_num = env.unity_agent_num
-        self.target_size = self.args.target_state_size
-        self.time_state_size = self.args.time_state_size
-        self.gun_state_size = self.args.gun_state_size
-        self.my_state_size = self.args.my_state_size
-        self.ray_state_size = env.unity_observation_shape[0] - self.args.total_target_size
-        self.state_size_without_ray = self.args.total_target_size
-        self.head_input_size = (
-            env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size
-        ) # except target state input
 
         self.unity_discrete_type = env.unity_discrete_type
         self.discrete_size = env.unity_discrete_size
@@ -51,9 +42,9 @@ class PPOAgent(nn.Module):
         self.hidden_networks = nn.ModuleList(
             [
                 nn.Sequential(
-                    layer_init(nn.Linear(self.state_size, firstLayerNum)),
+                    layer_init(nn.Linear(self.state_size, neural_size_1)),
                     nn.LeakyReLU(),
-                    layer_init(nn.Linear(firstLayerNum, secondLayerNum)),
+                    layer_init(nn.Linear(neural_size_1, neural_size_2)),
                     nn.LeakyReLU(),
                 )
                 for i in range(self.target_num)
@@ -61,16 +52,16 @@ class PPOAgent(nn.Module):
         )
 
         self.actor_dis = nn.ModuleList(
-            [layer_init(nn.Linear(secondLayerNum, self.discrete_size), std=0.5) for i in range(self.target_num)]
+            [layer_init(nn.Linear(neural_size_2, self.discrete_size), std=0.5) for i in range(self.target_num)]
         )
         self.actor_mean = nn.ModuleList(
-            [layer_init(nn.Linear(secondLayerNum, self.continuous_size), std=0) for i in range(self.target_num)]
+            [layer_init(nn.Linear(neural_size_2, self.continuous_size), std=0) for i in range(self.target_num)]
         )
         self.actor_logstd = nn.ParameterList(
             [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
        )
         self.critic = nn.ModuleList(
-            [layer_init(nn.Linear(secondLayerNum, 1), std=0) for i in range(self.target_num)]
+            [layer_init(nn.Linear(neural_size_2, 1), std=0) for i in range(self.target_num)]
         )
 
     def get_value(self, state: torch.Tensor):
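The hunks above swap the 512/128 hidden sizes for module-level `neural_size_1 = 400` and `neural_size_2 = 300` while keeping one hidden trunk plus per-target actor and critic heads. A stripped-down sketch of that per-target layout, with an assumed `get_value` interface rather than the real `PPOAgent` signature:

```python
import torch
import torch.nn as nn

neural_size_1, neural_size_2 = 400, 300

class MultiTargetHeads(nn.Module):
    """One hidden trunk and one critic head per target type (illustrative only)."""
    def __init__(self, state_size: int, target_num: int):
        super().__init__()
        self.hidden_networks = nn.ModuleList(
            nn.Sequential(
                nn.Linear(state_size, neural_size_1), nn.LeakyReLU(),
                nn.Linear(neural_size_1, neural_size_2), nn.LeakyReLU(),
            )
            for _ in range(target_num)
        )
        self.critic = nn.ModuleList(nn.Linear(neural_size_2, 1) for _ in range(target_num))

    def get_value(self, state: torch.Tensor, target_index: int) -> torch.Tensor:
        hidden = self.hidden_networks[target_index](state)   # pick the trunk matching the target type
        return self.critic[target_index](hidden)

model = MultiTargetHeads(state_size=30, target_num=4)
print(model.get_value(torch.zeros(2, 30), target_index=1).shape)  # torch.Size([2, 1])
```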