Compare commits

..

1 Commits

Author SHA1 Message Date
2ea8a5f104 MultiThread PPO First Commit 2023-11-23 15:25:34 +09:00
8 changed files with 307 additions and 293 deletions

View File

@ -81,184 +81,43 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import argparse\n",
"import wandb\n",
"import time\n",
"import numpy as np\n",
"import random\n",
"import uuid\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"\n",
"from AimbotEnv import Aimbot\n",
"from tqdm import tqdm\n",
"from torch.distributions.normal import Normal\n",
"from torch.distributions.categorical import Categorical\n",
"from distutils.util import strtobool\n",
"from torch.utils.tensorboard import SummaryWriter\n",
"from mlagents_envs.environment import UnityEnvironment\n",
"from mlagents_envs.side_channel.side_channel import (\n",
" SideChannel,\n",
" IncomingMessage,\n",
" OutgoingMessage,\n",
")\n",
"from typing import List\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'aaa' object has no attribute 'outa'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[5], line 14\u001b[0m\n\u001b[0;32m 12\u001b[0m asd \u001b[39m=\u001b[39m aaa(outa, outb)\n\u001b[0;32m 13\u001b[0m asd\u001b[39m.\u001b[39mfunc()\n\u001b[1;32m---> 14\u001b[0m \u001b[39mprint\u001b[39m(asd\u001b[39m.\u001b[39;49mouta) \u001b[39m# 输出 100\u001b[39;00m\n",
"\u001b[1;31mAttributeError\u001b[0m: 'aaa' object has no attribute 'outa'"
]
}
],
"source": [
"class aaa():\n",
" def __init__(self, a, b):\n",
" self.a = a\n",
" self.b = b\n",
"\n",
" def func(self):\n",
" global outa\n",
" outa = 100\n",
"\n",
"outa = 1\n",
"outb = 2\n",
"asd = aaa(outa, outb)\n",
"asd.func()\n",
"print(asd.outa) # 输出 100"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"usage: ipykernel_launcher.py [-h] [--seed SEED]\n",
"ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme=\"hmac-sha256\" --Session.key=b\"46ef9317-59fb-4ab6-ae4e-6b35744fc423\" --shell=9002 --transport=\"tcp\" --iopub=9004 --f=c:\\Users\\UCUNI\\AppData\\Roaming\\jupyter\\runtime\\kernel-v2-311926K1uko38tdWb.json\n"
]
},
{
"ename": "SystemExit",
"evalue": "2",
"output_type": "error",
"traceback": [
"An exception has occurred, use %tb to see the full traceback.\n",
"\u001b[1;31mSystemExit\u001b[0m\u001b[1;31m:\u001b[0m 2\n"
]
}
],
"source": [
"import argparse\n",
"\n",
"def parse_args():\n",
" parser = argparse.ArgumentParser()\n",
" parser.add_argument(\"--seed\", type=int, default=11,\n",
" help=\"seed of the experiment\")\n",
" args = parser.parse_args()\n",
" return args\n",
"\n",
"arggg = parse_args()\n",
"print(type(arggg))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y=\"a;b;c\"\n",
"len(y.split(\";\"))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2]\n"
"0\n",
"i = 0\n",
"i = 1\n",
"i = 2\n",
"i = 3\n",
"i = 4\n",
"i = 5\n",
"i = 6\n",
"i = 7\n",
"i = 8\n",
"i = 9\n",
"10\n"
]
}
],
"source": [
"a = np.array([1,2,3,4])\n",
"print(a[[False,True,False,False]])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{1, 2, 3, 4}"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a = {1,2,3}\n",
"a.add(4)\n",
"a"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([3, 4])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a = np.array([[1,3],[2,4]])\n",
"a.max(axis=1)\n"
"import threading\n",
"\n",
"num = 0\n",
"\n",
"def print_numers():\n",
" global num\n",
" for i in range(10):\n",
" num +=1\n",
" print(\"i = \",i)\n",
"\n",
"thread = threading.Thread(target=print_numers)\n",
"\n",
"print(num)\n",
"thread.start()\n",
"thread.join()\n",
"print(num)"
]
}
],

View File

@ -19,8 +19,8 @@ import torch.optim as optim
# side channel uuid
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
# tensorboard names
GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel"
GAME_TYPE = "GotoOnly-3.6-Level0123-newModel-Onehot"
GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel_V2"
GAME_TYPE = "GotoOnly-Level0123-new512Model"
if __name__ == "__main__":
args = parse_args()

View File

@ -58,7 +58,6 @@ class PPOMem:
# print("Win! Broadcast reward!",rewardBF[-1])
print(sum(thisRewardBF) / len(thisRewardBF))
thisRewardBF[-1] = rewardBF[-1] - self.base_win_reward
# broadcast result reward, increase all reward in this round by remainTime * self.result_broadcast_ratio
thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * self.result_broadcast_ratio)).tolist()
else:
print("!!!!!DIDNT GET RESULT REWARD!!!!!!", rewardBF[-1])
@ -89,7 +88,7 @@ class PPOMem:
self.dones_bf[i].append(done[i])
self.values_bf[i].append(value_cpu[i])
if now_step % self.decision_period == 0:
# on decision period, add last skipped round's reward, only affects decision_period != 1
# on decision period, add last skipped round's reward
self.rewards_bf[i].append(reward[i] + last_reward[i])
else:
# not on decision period, only add this round's reward

View File

@ -1,56 +0,0 @@
本项目使用以下命令行参数来配置运行环境和模型训练参数:
- `--seed <int>`:实验的随机种子。默认值为`9331`。
- `--path <str>`:环境路径。默认值为`"./Build/3.6/Aimbot-ParallelEnv"`。
- `--workerID <int>`:Unity worker ID。默认值为`1`。
- `--baseport <int>`:用于连接Unity环境的端口。默认值为`500`。
- `--lr <float>`:优化器的默认学习率。默认值为`5e-5`。
- `--cuda`:如果启用，将默认使用cuda。可以通过传入`true`或`false`来开启或关闭。
- `--total-timesteps <int>`:实验的总时间步数。默认值为`3150000`。
### 模型参数
- `--train`:是否训练模型。默认启用。
- `--freeze-viewnet`:是否冻结视图网络(raycast)。默认为`False`。
- `--datasetSize <int>`:训练数据集的大小,当数据集收集足够的数据时开始训练。默认值为`6000`。
- `--minibatchSize <int>`:minibatch大小。默认值为`512`。
- `--epochs <int>`:更新策略的K次迭代。默认值为`3`。
- `--annealLR`:是否对策略和价值网络进行学习率退火。默认为`True`。
- `--wandb-track`:是否在wandb上跟踪。默认为`False`。
- `--save-model`:是否保存模型。默认为`False`。
- `--wandb-entity <str>`:wandb项目的实体。默认值为`"koha9"`。
- `--load-dir <str>`:模型加载目录。默认值为`None`。
- `--decision-period <int>`:Timestep之间的动作执行间隔。默认值为`1`。
- `--result-broadcast-ratio <float>`:当赢得回合时，对结果的reward进行broadcast的比例，默认值为`1/30`。
- `--target-lr <float>`:下调学习率的目标值。默认值为`1e-6`。
### 损失函数参数
- `--policy-coef <float>`:策略损失的系数。默认值为`[0.8, 0.8, 0.8, 0.8]`。
- `--entropy-coef <float>`:熵损失的系数。默认值为`[0.05, 0.05, 0.05, 0.05]`。
- `--critic-coef <float>`:评论家损失的系数。默认值为`[1.0, 1.0, 1.0, 1.0]`。
- `--loss-coef <float>`:总损失的系数。默认值为`[1.0, 1.0, 1.0, 1.0]`。
### GAE损失参数
- `--gae`:是否使用GAE进行优势计算。默认启用。
- `--norm-adv`:是否标准化优势。默认为`False`。
- `--gamma <float>`:折扣因子gamma。默认值为`0.999`。
- `--gaeLambda <float>`:GAE的lambda值。默认值为`0.95`。
- `--clip-coef <float>`:替代裁剪系数。默认值为`0.11`。
- `--clip-vloss`:是否使用论文中的裁剪价值函数损失。默认启用。
- `--max-grad-norm <float>`:梯度裁剪的最大范数。默认值为`0.5`。
### 环境参数
- `--target-num <int>`:目标种类数量。默认值为`4`。
- `--env-timelimit <int>`:每轮的时间限制。默认值为`30`。
- `--base-win-reward <int>`:赢得回合的基础奖励。默认值为`999`。
- `--base-lose-reward <int>`:输掉回合的基础奖励。默认值为`-999`。
- `--target-state-size <int>`:target状态的大小。默认值为`6`。
- `--time-state-size <int>`:游戏剩余时间状态的大小。默认值为`1`。
- `--gun-state-size <int>`:枪状态的大小。默认值为`1`。
- `--my-state-size <int>`:我的状态大小。默认值为`4`。
- `--total-target-size <int>`:总target状态的大小。默认值为`12`。

View File

@ -1,52 +0,0 @@
- `--seed <int>`:実験の乱数Seed。デフォルト値は`9331`。
- `--path <str>`:環境パス。デフォルト値は`"./Build/3.6/Aimbot-ParallelEnv"`。
- `--workerID <int>`:Unity Worker ID。デフォルト値は`1`。
- `--baseport <int>`:Unity環境への接続用Port。デフォルト値は`500`。
- `--lr <float>`:Optimizerのデフォルト学習率。デフォルト値は`5e-5`。
- `--cuda`:有効にすると、デフォルトでcudaを使用します。`true`または`false`を渡すことで有効/無効を切り替えられます。
- `--total-timesteps <int>`:実験の合計タイムステップ数。デフォルト値は`3150000`。
### モデルパラメータ
- `--train`:モデルを訓練するかどうか。デフォルトで有効。
- `--freeze-viewnet`:ビューネットワーク(raycast)をfreezeする。デフォルトは`False`。
- `--datasetSize <int>`:訓練データセットのサイズ。データセットが十分なデータを集めたら訓練を開始する。デフォルト値は`6000`。
- `--minibatchSize <int>`:minibatchのサイズ。デフォルト値は`512`。
- `--epochs <int>`:epochs。デフォルト値は`3`。
- `--annealLR`:ポリシーとバリューネットワークの学習率を退火するかどうか。デフォルトは`True`。
- `--wandb-track`:wandbでトラッキングするかどうか。デフォルトは`False`。
- `--save-model`:モデルを保存するかどうか。デフォルトは`False`。
- `--wandb-entity <str>`:wandbプロジェクトのエンティティ。デフォルト値は`"koha9"`。
- `--load-dir <str>`:モデルのロードディレクトリ。デフォルト値は`None`。
- `--decision-period <int>`:実際動作を実行する時のタイムステップの間隔。デフォルト値は`1`。
- `--result-broadcast-ratio <float>`:ラウンドに勝った場合の報酬のbroadcast ratio、デフォルト値は`1/30`。
- `--target-lr <float>`:学習率を下げる時の目標値。デフォルト値は`1e-6`。
### 損失関数パラメータ
- `--policy-coef <float>`:policy損失の係数。デフォルト値は`[0.8, 0.8, 0.8, 0.8]`。
- `--entropy-coef <float>`:entropy損失の係数。デフォルト値は`[0.05, 0.05, 0.05, 0.05]`。
- `--critic-coef <float>`:critic損失の係数。デフォルト値は`[1.0, 1.0, 1.0, 1.0]`。
- `--loss-coef <float>`:全体の損失の係数。デフォルト値は`[1.0, 1.0, 1.0, 1.0]`。
### GAE損失パラメータ
- `--gae`:GAEを使用してアドバンテージを計算するかどうか。デフォルトで有効。
- `--norm-adv`:アドバンテージを正規化するかどうか。デフォルトは`False`。
- `--gamma <float>`:割引因子gamma。デフォルト値は`0.999`。
- `--gaeLambda <float>`:GAEのlambda値。デフォルト値は`0.95`。
- `--clip-coef <float>`:代替クリッピング係数。デフォルト値は`0.11`。
- `--clip-vloss`:論文で述べられている価値関数の損失のクリッピングを使用するかどうか。デフォルトで有効。
- `--max-grad-norm <float>`:勾配のクリッピングの最大ノルム。デフォルト値は`0.5`。
### 環境パラメータ
- `--target-num <int>`:Targetの種類数。デフォルト値は`4`。
- `--env-timelimit <int>`:ラウンドごとの時間制限。デフォルト値は`30`。
- `--base-win-reward <int>`:ラウンドに勝った場合の基本報酬。デフォルト値は`999`。
- `--base-lose-reward <int>`:ラウンドに負けた場合の基本報酬。デフォルト値は`-999`。
- `--target-state-size <int>`:Targetの状態サイズ。デフォルト値は`6`。
- `--time-state-size <int>`:ゲームの残り時間の状態サイズ。デフォルト値は`1`。
- `--gun-state-size <int>`:銃の状態サイズ。デフォルト値は`1`。
- `--my-state-size <int>`:自分の状態サイズ。デフォルト値は`4`。
- `--total-target-size <int>`:全Targetの状態サイズ。デフォルト値は`12`。

View File

@ -4,7 +4,7 @@ import uuid
from distutils.util import strtobool
DEFAULT_SEED = 9331
ENV_PATH = "../Build/3.6/Aimbot-ParallelEnv"
ENV_PATH = "../Build/3.5/Aimbot-ParallelEnv"
WAND_ENTITY = "koha9"
WORKER_ID = 1
BASE_PORT = 1000
@ -16,15 +16,15 @@ TOTAL_STEPS = 3150000
BATCH_SIZE = 512
MAX_TRAINNING_DATASETS = 6000
DECISION_PERIOD = 1
LEARNING_RATE = 5e-5
GAMMA = 0.999
LEARNING_RATE = 1.5e-4
GAMMA = 0.99
GAE_LAMBDA = 0.95
EPOCHS = 3
CLIP_COEF = 0.11
LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
POLICY_COEF = [0.8, 0.8, 0.8, 0.8]
ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
CRITIC_COEF = [1.0, 1.0, 1.0, 1.0]
CRITIC_COEF = [0.8, 0.8, 0.8, 0.8]
TARGET_LEARNING_RATE = 1e-6
FREEZE_VIEW_NETWORK = False
@ -35,7 +35,7 @@ TRAIN = True
SAVE_MODEL = True
WANDB_TACK = True
LOAD_DIR = None
# LOAD_DIR = "../PPO-Model/GotoOnly-Level1234_9331_1697122986/8.853553.pt"
# LOAD_DIR = "../PPO-Model/GotoOnly-Level0123_9331_1696965321/5.1035867.pt"
# Unity Environment Parameters
TARGET_STATE_SIZE = 6

View File

@ -0,0 +1,255 @@
import time
import numpy as np
import random
import uuid
import torch
import atexit
import os
from aimbotEnv import Aimbot
from aimbotEnv import AimbotSideChannel
from ppoagent import PPOAgent
from airecorder import WandbRecorder
from aimemory import PPOMem
from aimemory import Targets
from arguments import parse_args
from arguments import set_save_model, is_save_model
import torch.optim as optim
# side channel uuid
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
# tensorboard names
GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel_V2"
GAME_TYPE = "GotoOnly-Level0123-new512Model"
if __name__ == "__main__":
    # Entry point: run the PPO agent in the Unity Aimbot environment, training one
    # sub-network per target type whenever that target's dataset is full.
    args = parse_args()
    # seed every RNG used by the script
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
    best_reward = -1

    # Initialize environment agent optimizer
    aimbot_side_channel = AimbotSideChannel(SIDE_CHANNEL_UUID)
    env = Aimbot(
        env_path=args.path,
        worker_id=args.workerID,
        base_port=args.baseport,
        side_channels=[aimbot_side_channel],
    )
    # created later; kept as None so the finally-block knows whether to close it
    wdb_recorder = None
    try:
        if args.load_dir is None:
            agent = PPOAgent(
                env=env,
                this_args=args,
                device=device,
            ).to(device)
        else:
            # NOTE(review): torch.load unpickles the whole agent object; only load
            # checkpoints from a trusted source.
            agent = torch.load(args.load_dir)
            # freeze
            if args.freeze_viewnet:
                # freeze the view network
                print("FREEZE VIEW NETWORK is not compatible with Full MNN!")
                raise NotImplementedError
            print("Load Agent", args.load_dir)
            print(agent.eval())
        # optimizer
        optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)

        # Tensorboard and WandB Recorder
        run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
        wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
        # every checkpoint of this run goes into the same directory
        save_dir = "../PPO-Model/" + run_name + "/"

        # start the game
        total_update_step = args.target_num * args.total_timesteps // args.datasetSize
        target_steps = [0 for i in range(args.target_num)]
        start_time = time.time()
        state, _, done = env.reset()

        # initialize AI memories
        ppo_memories = PPOMem(
            args=args,
            unity_agent_num=env.unity_agent_num,
            device=device,
        )

        # MAIN LOOP: one iteration per training update
        for total_steps in range(total_update_step):
            # linearly anneal the learning rate from args.lr down to args.target_lr;
            # the last update runs at target_lr instead of a useless lr of 0
            if args.annealLR:
                frac = 1.0 - ((total_steps + 1.0) / total_update_step)
                lr_now = args.target_lr + frac * (args.lr - args.target_lr)
                optimizer.param_groups[0]["lr"] = lr_now
            else:
                lr_now = args.lr

            # episode start show learning rate
            print("new episode", total_steps, "learning rate = ", lr_now)
            step = 0
            train_queue = []
            last_reward = [0. for i in range(env.unity_agent_num)]
            # INNER LOOP: step the environment until at least one dataset is full
            while True:
                # Target Type(state[0][0]) is stay(4),use all zero action
                if state[0][0] == 4:
                    next_state, reward, next_done = env.step(env.all_zero_action)
                    state, done = next_state, next_done
                    continue
                # On decision point, and Target Type(state[0][0]) is not stay(4) choose action by agent
                if step % args.decision_period == 0:
                    step += 1
                    # Choose action by agent
                    with torch.no_grad():
                        # predict actions
                        action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
                            torch.tensor(state, dtype=torch.float32).to(device)
                        )
                        value = value.flatten()

                    # variable from GPU to CPU
                    action_cpu = action.cpu().numpy()
                    dis_logprob_cpu = dis_logprob.cpu().numpy()
                    con_logprob_cpu = con_logprob.cpu().numpy()
                    value_cpu = value.cpu().numpy()
                    # Environment step
                    next_state, reward, next_done = env.step(action_cpu)

                    # save memories
                    if args.train:
                        ppo_memories.save_memories(
                            now_step=step,
                            agent=agent,
                            state=state,
                            action_cpu=action_cpu,
                            dis_logprob_cpu=dis_logprob_cpu,
                            con_logprob_cpu=con_logprob_cpu,
                            reward=reward,
                            done=done,
                            value_cpu=value_cpu,
                            last_reward=last_reward,
                            next_done=next_done,
                            next_state=next_state,
                        )
                    # check if any training dataset is full and ready to train
                    for i in range(args.target_num):
                        if ppo_memories.obs[i].size()[0] >= args.datasetSize:
                            # start train NN
                            train_queue.append(i)
                    if len(train_queue) > 0:
                        # break while loop and start train
                        break
                    # update state
                    state, done = next_state, next_done
                else:
                    step += 1
                    # skip this step use last predict action
                    next_state, reward, next_done = env.step(action_cpu)
                    # save memories
                    if args.train:
                        ppo_memories.save_memories(
                            now_step=step,
                            agent=agent,
                            state=state,
                            action_cpu=action_cpu,
                            dis_logprob_cpu=dis_logprob_cpu,
                            con_logprob_cpu=con_logprob_cpu,
                            reward=reward,
                            done=done,
                            value_cpu=value_cpu,
                            last_reward=last_reward,
                            next_done=next_done,
                            next_state=next_state,
                        )
                    # update state; done is refreshed as well so the next step does not
                    # record a stale flag (the other branches already do this)
                    state, done = next_state, next_done
                    # remembered so the next decision step can add the skipped reward
                    last_reward = reward

            if args.train:
                # train mode on
                mean_reward_list = []  # for WANDB
                # loop all training queue
                for this_train_ind in train_queue:
                    # start time
                    start_time = time.time()
                    target_steps[this_train_ind] += 1
                    # train agent
                    (
                        v_loss,
                        dis_pg_loss,
                        con_pg_loss,
                        loss,
                        entropy_loss
                    ) = agent.train_net(
                        this_train_ind=this_train_ind,
                        ppo_memories=ppo_memories,
                        optimizer=optimizer
                    )
                    # record mean reward before clear history
                    print("done")
                    target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
                    mean_reward_list.append(target_reward_mean)
                    targetName = Targets(this_train_ind).name

                    # clear this target training set buffer
                    ppo_memories.clear_training_datasets(this_train_ind)
                    # record rewards for plotting purposes
                    wdb_recorder.add_target_scalar(
                        targetName,
                        this_train_ind,
                        v_loss,
                        dis_pg_loss,
                        con_pg_loss,
                        loss,
                        entropy_loss,
                        target_reward_mean,
                        target_steps,
                    )
                    print(f"episode over Target{targetName} mean reward:", target_reward_mean)
                TotalRewardMean = np.mean(mean_reward_list)
                wdb_recorder.add_global_scalar(
                    TotalRewardMean,
                    optimizer.param_groups[0]["lr"],
                    total_steps,
                )
                # print cost time as seconds
                print("cost time:", time.time() - start_time)
                # New Record! or save model
                if ((is_save_model() or TotalRewardMean > best_reward) and args.save_model):
                    # makedirs also creates a missing "../PPO-Model/" parent directory
                    os.makedirs(save_dir, exist_ok=True)
                    best_reward = TotalRewardMean
                    torch.save(agent, save_dir + str(TotalRewardMean) + ".pt")
                    print("Model Saved!")
                    set_save_model(False)
            else:
                # train mode off
                mean_reward_list = []  # for WANDB
                # while not in training mode, clear the buffer
                for this_train_ind in train_queue:
                    target_steps[this_train_ind] += 1
                    targetName = Targets(this_train_ind).name
                    target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
                    mean_reward_list.append(target_reward_mean)
                    print(target_steps[this_train_ind])

                    # clear this target training set buffer
                    ppo_memories.clear_training_datasets(this_train_ind)

                    # record rewards for plotting purposes
                    wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", target_reward_mean,
                                                   target_steps[this_train_ind])
                    wdb_recorder.add_win_ratio(targetName, target_steps[this_train_ind])
                    print(f"episode over Target{targetName} mean reward:", target_reward_mean)
                TotalRewardMean = np.mean(mean_reward_list)
                wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)

        # run finished normally: always keep a snapshot of the final agent
        os.makedirs(save_dir, exist_ok=True)
        torch.save(agent, save_dir + "_last.pt")
    finally:
        # release the Unity environment and the recorder even when the run aborts
        env.close()
        if wdb_recorder is not None:
            wdb_recorder.writer.close()

View File

@ -8,14 +8,14 @@ from aimbotEnv import Aimbot
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
firstLayerNum = 512
secondLayerNum = 128
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialize a layer in place and return it.

    The weight is filled with an orthogonal matrix scaled by ``std`` and the
    bias, when the layer has one, is set to ``bias_const``.

    Args:
        layer: module exposing ``weight`` and ``bias`` attributes (e.g. ``nn.Linear``).
        std: gain passed to ``nn.init.orthogonal_``.
        bias_const: constant value written into the bias.

    Returns:
        The same ``layer`` object, so the call can be nested inside a model definition.
    """
    nn.init.orthogonal_(layer.weight, std)
    # layers built with bias=False expose bias as None; there is nothing to fill then
    if layer.bias is not None:
        nn.init.constant_(layer.bias, bias_const)
    return layer
neural_size_1 = 400
neural_size_2 = 300
class PPOAgent(nn.Module):
def __init__(
@ -33,6 +33,15 @@ class PPOAgent(nn.Module):
self.unity_action_size = env.unity_action_size
self.state_size = self.unity_observation_shape[0]
self.agent_num = env.unity_agent_num
self.target_size = self.args.target_state_size
self.time_state_size = self.args.time_state_size
self.gun_state_size = self.args.gun_state_size
self.my_state_size = self.args.my_state_size
self.ray_state_size = env.unity_observation_shape[0] - self.args.total_target_size
self.state_size_without_ray = self.args.total_target_size
self.head_input_size = (
env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size
) # except target state input
self.unity_discrete_type = env.unity_discrete_type
self.discrete_size = env.unity_discrete_size
@ -42,9 +51,9 @@ class PPOAgent(nn.Module):
self.hidden_networks = nn.ModuleList(
[
nn.Sequential(
layer_init(nn.Linear(self.state_size, neural_size_1)),
layer_init(nn.Linear(self.state_size, firstLayerNum)),
nn.LeakyReLU(),
layer_init(nn.Linear(neural_size_1, neural_size_2)),
layer_init(nn.Linear(firstLayerNum, secondLayerNum)),
nn.LeakyReLU(),
)
for i in range(self.target_num)
@ -52,16 +61,16 @@ class PPOAgent(nn.Module):
)
self.actor_dis = nn.ModuleList(
[layer_init(nn.Linear(neural_size_2, self.discrete_size), std=0.5) for i in range(self.target_num)]
[layer_init(nn.Linear(secondLayerNum, self.discrete_size), std=0.5) for i in range(self.target_num)]
)
self.actor_mean = nn.ModuleList(
[layer_init(nn.Linear(neural_size_2, self.continuous_size), std=0) for i in range(self.target_num)]
[layer_init(nn.Linear(secondLayerNum, self.continuous_size), std=0) for i in range(self.target_num)]
)
self.actor_logstd = nn.ParameterList(
[nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
)
self.critic = nn.ModuleList(
[layer_init(nn.Linear(neural_size_2, 1), std=0) for i in range(self.target_num)]
[layer_init(nn.Linear(secondLayerNum, 1), std=0) for i in range(self.target_num)]
)
def get_value(self, state: torch.Tensor):