Compare commits
6 Commits
OffP-Parti
...
OffP-FullM
Author | SHA1 | Date | |
---|---|---|---|
573b09a920 | |||
9d9524429c | |||
5aa7e0936a | |||
3bc5c30fd3 | |||
2741d6d51a | |||
9432eaa76e |
@ -11,6 +11,7 @@ from mlagents_envs.side_channel.side_channel import (
|
||||
IncomingMessage,
|
||||
OutgoingMessage,
|
||||
)
|
||||
from arguments import set_save_model
|
||||
|
||||
|
||||
class Aimbot(gym.Env):
|
||||
@ -176,18 +177,21 @@ class AimbotSideChannel(SideChannel):
|
||||
"Warning|Message1|Message2|Message3" or
|
||||
"Error|Message1|Message2|Message3"
|
||||
"""
|
||||
this_message = msg.read_string()
|
||||
this_result = this_message.split("|")
|
||||
print(this_result)
|
||||
if this_result[0] == "Warning":
|
||||
if this_result[1] == "Result":
|
||||
airecorder.total_rounds[this_result[2]] += 1
|
||||
if this_result[3] == "Win":
|
||||
airecorder.win_rounds[this_result[2]] += 1
|
||||
this_message_Original = msg.read_string()
|
||||
this_message = this_message_Original.split("|")
|
||||
print(this_message)
|
||||
if this_message[0] == "Warning":
|
||||
if this_message[1] == "Result":
|
||||
airecorder.total_rounds[this_message[2]] += 1
|
||||
if this_message[3] == "Win":
|
||||
airecorder.win_rounds[this_message[2]] += 1
|
||||
# print(TotalRounds)
|
||||
# print(WinRounds)
|
||||
elif this_result[0] == "Error":
|
||||
print(this_message)
|
||||
if this_message[1] == "Command":
|
||||
set_save_model(True)
|
||||
print("Command: " + this_message_Original)
|
||||
elif this_message[0] == "Error":
|
||||
print(this_message_Original)
|
||||
# # while Message type is Warning
|
||||
# if(thisResult[0] == "Warning"):
|
||||
# # while Message1 is result means one game is over
|
||||
|
@ -181,30 +181,84 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([[0., 0., 0., 0.],\n",
|
||||
" [0., 0., 0., 0.],\n",
|
||||
" [0., 0., 0., 0.],\n",
|
||||
" [0., 0., 0., 0.],\n",
|
||||
" [0., 0., 0., 0.],\n",
|
||||
" [0., 0., 0., 0.],\n",
|
||||
" [0., 0., 0., 0.],\n",
|
||||
" [0., 0., 0., 0.]])"
|
||||
"3"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"np.zeros((8, 4))"
|
||||
"y=\"a;b;c\"\n",
|
||||
"len(y.split(\";\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[2]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"a = np.array([1,2,3,4])\n",
|
||||
"print(a[[False,True,False,False]])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{1, 2, 3, 4}"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"a = {1,2,3}\n",
|
||||
"a.add(4)\n",
|
||||
"a"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([3, 4])"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"a = np.array([[1,3],[2,4]])\n",
|
||||
"a.max(axis=1)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -4,6 +4,7 @@ import random
|
||||
import uuid
|
||||
import torch
|
||||
import atexit
|
||||
import os
|
||||
|
||||
from aimbotEnv import Aimbot
|
||||
from aimbotEnv import AimbotSideChannel
|
||||
@ -12,13 +13,14 @@ from airecorder import WandbRecorder
|
||||
from aimemory import PPOMem
|
||||
from aimemory import Targets
|
||||
from arguments import parse_args
|
||||
from arguments import set_save_model, is_save_model
|
||||
import torch.optim as optim
|
||||
|
||||
# side channel uuid
|
||||
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
|
||||
# tensorboard names
|
||||
GAME_NAME = "Aimbot_Hybrid_V3"
|
||||
GAME_TYPE = "Mix_Verification"
|
||||
GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel"
|
||||
GAME_TYPE = "GotoOnly-3.6-Level0123-newModel-Onehot"
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
@ -47,9 +49,8 @@ if __name__ == "__main__":
|
||||
# freeze
|
||||
if args.freeze_viewnet:
|
||||
# freeze the view network
|
||||
for p in agent.viewNetwork.parameters():
|
||||
p.requires_grad = False
|
||||
print("VIEW NETWORK FREEZE")
|
||||
print("FREEZE VIEW NETWORK is not compatible with Full MNN!")
|
||||
raise NotImplementedError
|
||||
print("Load Agent", args.load_dir)
|
||||
print(agent.eval())
|
||||
# optimizer
|
||||
@ -58,16 +59,6 @@ if __name__ == "__main__":
|
||||
run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
|
||||
wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
|
||||
|
||||
@atexit.register
|
||||
def save_model():
|
||||
# close env
|
||||
env.close()
|
||||
if args.save_model:
|
||||
# save model while exit
|
||||
save_dir = "../PPO-Model/" + run_name + "_last.pt"
|
||||
torch.save(agent, save_dir)
|
||||
print("save model to " + save_dir)
|
||||
|
||||
# start the game
|
||||
total_update_step = args.target_num * args.total_timesteps // args.datasetSize
|
||||
target_steps = [0 for i in range(args.target_num)]
|
||||
@ -223,11 +214,16 @@ if __name__ == "__main__":
|
||||
)
|
||||
# print cost time as seconds
|
||||
print("cost time:", time.time() - start_time)
|
||||
# New Record!
|
||||
if TotalRewardMean > best_reward and args.save_model:
|
||||
best_reward = target_reward_mean
|
||||
saveDir = "../PPO-Model/" + run_name + "_" + str(TotalRewardMean) + ".pt"
|
||||
torch.save(agent, saveDir)
|
||||
# New Record! or save model
|
||||
if ((is_save_model() or TotalRewardMean > best_reward) and args.save_model):
|
||||
# check saveDir is exist
|
||||
saveDir = "../PPO-Model/" + run_name + "/"
|
||||
if not os.path.isdir(saveDir):
|
||||
os.mkdir(saveDir)
|
||||
best_reward = TotalRewardMean
|
||||
torch.save(agent, saveDir + str(TotalRewardMean) + ".pt")
|
||||
print("Model Saved!")
|
||||
set_save_model(False)
|
||||
else:
|
||||
# train mode off
|
||||
mean_reward_list = [] # for WANDB
|
||||
@ -250,7 +246,10 @@ if __name__ == "__main__":
|
||||
TotalRewardMean = np.mean(mean_reward_list)
|
||||
wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
|
||||
|
||||
saveDir = "../PPO-Model/" + run_name + "_last.pt"
|
||||
torch.save(agent, saveDir)
|
||||
saveDir = "../PPO-Model/" + run_name + "/"
|
||||
if not os.path.isdir(saveDir):
|
||||
os.mkdir(saveDir)
|
||||
best_reward = target_reward_mean
|
||||
torch.save(agent, saveDir + "_last.pt")
|
||||
env.close()
|
||||
wdb_recorder.writer.close()
|
||||
|
@ -58,6 +58,7 @@ class PPOMem:
|
||||
# print("Win! Broadcast reward!",rewardBF[-1])
|
||||
print(sum(thisRewardBF) / len(thisRewardBF))
|
||||
thisRewardBF[-1] = rewardBF[-1] - self.base_win_reward
|
||||
# broadcast result reward, increase all reward in this round by remainTime * self.result_broadcast_ratio
|
||||
thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * self.result_broadcast_ratio)).tolist()
|
||||
else:
|
||||
print("!!!!!DIDNT GET RESULT REWARD!!!!!!", rewardBF[-1])
|
||||
@ -88,7 +89,7 @@ class PPOMem:
|
||||
self.dones_bf[i].append(done[i])
|
||||
self.values_bf[i].append(value_cpu[i])
|
||||
if now_step % self.decision_period == 0:
|
||||
# on decision period, add last skiped round's reward
|
||||
# on decision period, add the last skipped round's reward; this only has an effect when decision_period != 1
|
||||
self.rewards_bf[i].append(reward[i] + last_reward[i])
|
||||
else:
|
||||
# not on decision period, only add this round's reward
|
||||
|
56
Aimbot-PPO-Python/Pytorch/arguments-cn.md
Normal file
56
Aimbot-PPO-Python/Pytorch/arguments-cn.md
Normal file
@ -0,0 +1,56 @@
|
||||
|
||||
|
||||
本项目使用以下命令行参数来配置运行环境和模型训练参数:
|
||||
|
||||
- `--seed <int>`:实验的随机种子。默认值为`9331`。
|
||||
- `--path <str>`:环境路径。默认值为`"../Build/3.6/Aimbot-ParallelEnv"`。
|
||||
- `--workerID <int>`:Unity worker ID。默认值为`1`。
|
||||
- `--baseport <int>`:用于连接Unity环境的端口。默认值为`1000`。
|
||||
- `--lr <float>`:优化器的默认学习率。默认值为`5e-5`。
|
||||
- `--cuda`:如果启用,将默认使用cuda。可以通过传入`true`或`false`来开启或关闭。
|
||||
- `--total-timesteps <int>`:实验的总时间步数。默认值为`3150000`。
|
||||
|
||||
### 模型参数
|
||||
|
||||
- `--train`:是否训练模型。默认启用。
|
||||
- `--freeze-viewnet`:是否冻结视图网络(raycast)。默认为`False`。
|
||||
- `--datasetSize <int>`:训练数据集的大小,当数据集收集足够的数据时开始训练。默认值为`6000`。
|
||||
- `--minibatchSize <int>`:minibatch大小。默认值为`512`。
|
||||
- `--epochs <int>`:每次更新策略时的训练轮数(K个epoch)。默认值为`3`。
|
||||
- `--annealLR`:是否对策略和价值网络进行学习率退火。默认为`True`。
|
||||
- `--wandb-track`:是否在wandb上跟踪。默认为`True`。
|
||||
- `--save-model`:是否保存模型。默认为`True`。
|
||||
- `--wandb-entity <str>`:wandb项目的实体。默认值为`"koha9"`。
|
||||
- `--load-dir <str>`:模型加载目录。默认值为`None`。
|
||||
- `--decision-period <int>`:Timestep之间的动作执行间隔。默认值为`1`。
|
||||
- `--result-broadcast-ratio <float>`:赢得回合时,将结果奖励广播到该回合所有reward的比例(每步增加的奖励 = result-broadcast-ratio × 剩余时间)。默认值为`1/30`。
|
||||
- `--target-lr <float>`:下调学习率的目标值。默认值为`1e-6`。
|
||||
|
||||
### 损失函数参数
|
||||
|
||||
- `--policy-coef <float>`:策略损失的系数。默认值为`[0.8, 0.8, 0.8, 0.8]`。
|
||||
- `--entropy-coef <float>`:熵损失的系数。默认值为`[0.05, 0.05, 0.05, 0.05]`。
|
||||
- `--critic-coef <float>`:评论家损失的系数。默认值为`[1.0, 1.0, 1.0, 1.0]`。
|
||||
- `--loss-coef <float>`:总损失的系数。默认值为`[1.0, 1.0, 1.0, 1.0]`。
|
||||
|
||||
### GAE损失参数
|
||||
|
||||
- `--gae`:是否使用GAE进行优势计算。默认启用。
|
||||
- `--norm-adv`:是否标准化优势。默认为`False`。
|
||||
- `--gamma <float>`:折扣因子gamma。默认值为`0.999`。
|
||||
- `--gaeLambda <float>`:GAE的lambda值。默认值为`0.95`。
|
||||
- `--clip-coef <float>`:替代裁剪系数。默认值为`0.11`。
|
||||
- `--clip-vloss`:是否使用论文中的裁剪价值函数损失。默认启用。
|
||||
- `--max-grad-norm <float>`:梯度裁剪的最大范数。默认值为`0.5`。
|
||||
|
||||
### 环境参数
|
||||
|
||||
- `--target-num <int>`:目标种类数量。默认值为`4`。
|
||||
- `--env-timelimit <int>`:每轮的时间限制。默认值为`30`。
|
||||
- `--base-win-reward <int>`:赢得回合的基础奖励。默认值为`999`。
|
||||
- `--base-lose-reward <int>`:输掉回合的基础奖励。默认值为`-999`。
|
||||
- `--target-state-size <int>`:target状态的大小。默认值为`6`。
|
||||
- `--time-state-size <int>`:游戏剩余时间状态的大小。默认值为`1`。
|
||||
- `--gun-state-size <int>`:枪状态的大小。默认值为`1`。
|
||||
- `--my-state-size <int>`:我的状态大小。默认值为`4`。
|
||||
- `--total-target-size <int>`:总target状态的大小。默认值为`12`。
|
52
Aimbot-PPO-Python/Pytorch/arguments-jp.md
Normal file
52
Aimbot-PPO-Python/Pytorch/arguments-jp.md
Normal file
@ -0,0 +1,52 @@
|
||||
- `--seed <int>`:実験の乱数Seed。デフォルト値は`9331`。
|
||||
- `--path <str>`:環境パス。デフォルト値は`"../Build/3.6/Aimbot-ParallelEnv"`。
|
||||
- `--workerID <int>`:Unity Worker ID。デフォルト値は`1`。
|
||||
- `--baseport <int>`:Unity環境への接続用Port。デフォルト値は`1000`。
|
||||
- `--lr <float>`:Optimizerのデフォルト学習率。デフォルト値は`5e-5`。
|
||||
- `--cuda`:有効にすると、デフォルトでcudaを使用します。`true`または`false`を渡すことで有効/無効を切り替えられます。
|
||||
- `--total-timesteps <int>`:実験の合計タイムステップ数。デフォルト値は`3150000`。
|
||||
|
||||
### モデルパラメータ
|
||||
|
||||
- `--train`:モデルを訓練するかどうか。デフォルトで有効。
|
||||
- `--freeze-viewnet`:ビューネットワーク(raycast)をfreezeする。デフォルトは`False`。
|
||||
- `--datasetSize <int>`:訓練データセットのサイズ。データセットが十分なデータを集めたら訓練を開始する。デフォルト値は`6000`。
|
||||
- `--minibatchSize <int>`:minibatchのサイズ。デフォルト値は`512`。
|
||||
- `--epochs <int>`:ポリシーを更新する際のエポック数(K回)。デフォルト値は`3`。
|
||||
- `--annealLR`:ポリシーとバリューネットワークの学習率をアニーリング(徐々に減衰)するかどうか。デフォルトは`True`。
|
||||
- `--wandb-track`:wandbでトラッキングするかどうか。デフォルトは`True`。
|
||||
- `--save-model`:モデルを保存するかどうか。デフォルトは`True`。
|
||||
- `--wandb-entity <str>`:wandbプロジェクトのエンティティ。デフォルト値は`"koha9"`。
|
||||
- `--load-dir <str>`:モデルのロードディレクトリ。デフォルト値は`None`。
|
||||
- `--decision-period <int>`:動作を実際に実行するタイムステップの間隔。デフォルト値は`1`。
|
||||
- `--result-broadcast-ratio <float>`:ラウンドに勝った場合の報酬のbroadcast ratio、デフォルト値は`1/30`。
|
||||
- `--target-lr <float>`:学習率を下げる時の目標値。デフォルト値は`1e-6`。
|
||||
|
||||
### 損失関数パラメータ
|
||||
|
||||
- `--policy-coef <float>`:policy損失の係数。デフォルト値は`[0.8, 0.8, 0.8, 0.8]`。
|
||||
- `--entropy-coef <float>`:entropy損失の係数。デフォルト値は`[0.05, 0.05, 0.05, 0.05]`。
|
||||
- `--critic-coef <float>`:critic損失の係数。デフォルト値は`[1.0, 1.0, 1.0, 1.0]`。
|
||||
- `--loss-coef <float>`:全体の損失の係数。デフォルト値は`[1.0, 1.0, 1.0, 1.0]`。
|
||||
|
||||
### GAE損失パラメータ
|
||||
|
||||
- `--gae`:GAEを使用してアドバンテージを計算するかどうか。デフォルトで有効。
|
||||
- `--norm-adv`:アドバンテージを正規化するかどうか。デフォルトは`False`。
|
||||
- `--gamma <float>`:割引因子gamma。デフォルト値は`0.999`。
|
||||
- `--gaeLambda <float>`:GAEのlambda値。デフォルト値は`0.95`。
|
||||
- `--clip-coef <float>`:代替クリッピング係数。デフォルト値は`0.11`。
|
||||
- `--clip-vloss`:論文で述べられている価値関数の損失のクリッピングを使用するかどうか。デフォルトで有効。
|
||||
- `--max-grad-norm <float>`:勾配のクリッピングの最大ノルム。デフォルト値は`0.5`。
|
||||
|
||||
### 環境パラメータ
|
||||
|
||||
- `--target-num <int>`:Targetの種類数。デフォルト値は`4`。
|
||||
- `--env-timelimit <int>`:ラウンドごとの時間制限。デフォルト値は`30`。
|
||||
- `--base-win-reward <int>`:ラウンドに勝った場合の基本報酬。デフォルト値は`999`。
|
||||
- `--base-lose-reward <int>`:ラウンドに負けた場合の基本報酬。デフォルト値は`-999`。
|
||||
- `--target-state-size <int>`:Targetの状態サイズ。デフォルト値は`6`。
|
||||
- `--time-state-size <int>`:ゲームの残り時間の状態サイズ。デフォルト値は`1`。
|
||||
- `--gun-state-size <int>`:銃の状態サイズ。デフォルト値は`1`。
|
||||
- `--my-state-size <int>`:自分の状態サイズ。デフォルト値は`4`。
|
||||
- `--total-target-size <int>`:全Targetの状態サイズ。デフォルト値は`12`。
|
@ -4,41 +4,38 @@ import uuid
|
||||
from distutils.util import strtobool
|
||||
|
||||
DEFAULT_SEED = 9331
|
||||
ENV_PATH = "../Build/3.1.6/Aimbot-ParallelEnv"
|
||||
ENV_PATH = "../Build/3.6/Aimbot-ParallelEnv"
|
||||
WAND_ENTITY = "koha9"
|
||||
WORKER_ID = 1
|
||||
BASE_PORT = 1000
|
||||
|
||||
# tensorboard names
|
||||
GAME_NAME = "Aimbot_Target_Hybrid_PMNN_V3"
|
||||
GAME_TYPE = "Mix_Verification"
|
||||
|
||||
# max round steps per agent is 2500/Decision_period, 25 seconds
|
||||
TOTAL_STEPS = 3150000
|
||||
BATCH_SIZE = 512
|
||||
MAX_TRAINNING_DATASETS = 6000
|
||||
DECISION_PERIOD = 1
|
||||
LEARNING_RATE = 6.5e-4
|
||||
GAMMA = 0.99
|
||||
LEARNING_RATE = 5e-5
|
||||
GAMMA = 0.999
|
||||
GAE_LAMBDA = 0.95
|
||||
EPOCHS = 3
|
||||
CLIP_COEF = 0.11
|
||||
LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
|
||||
POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
|
||||
POLICY_COEF = [0.8, 0.8, 0.8, 0.8]
|
||||
ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
|
||||
CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
|
||||
CRITIC_COEF = [1.0, 1.0, 1.0, 1.0]
|
||||
TARGET_LEARNING_RATE = 1e-6
|
||||
|
||||
FREEZE_VIEW_NETWORK = False
|
||||
BROADCASTREWARD = False
|
||||
ANNEAL_LEARNING_RATE = True
|
||||
CLIP_VLOSS = True
|
||||
NORM_ADV = False
|
||||
TRAIN = True
|
||||
SAVE_MODEL = False
|
||||
WANDB_TACK = False
|
||||
SAVE_MODEL = True
|
||||
WANDB_TACK = True
|
||||
LOAD_DIR = None
|
||||
#LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt"
|
||||
# LOAD_DIR = "../PPO-Model/GotoOnly-Level1234_9331_1697122986/8.853553.pt"
|
||||
|
||||
# Unity Environment Parameters
|
||||
TARGET_STATE_SIZE = 6
|
||||
@ -53,6 +50,16 @@ TARGETNUM= 4
|
||||
ENV_TIMELIMIT = 30
|
||||
RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
|
||||
|
||||
save_model_this_episode = False
|
||||
|
||||
def is_save_model():
|
||||
global save_model_this_episode
|
||||
return save_model_this_episode
|
||||
def set_save_model(save_model:bool):
|
||||
print("set save model to ",save_model)
|
||||
global save_model_this_episode
|
||||
save_model_this_episode = save_model
|
||||
|
||||
def parse_args():
|
||||
# fmt: off
|
||||
# pytorch and environment parameters
|
||||
@ -97,8 +104,6 @@ def parse_args():
|
||||
help="the number of steps to run in each environment per policy rollout")
|
||||
parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
|
||||
help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
|
||||
parser.add_argument("--broadCastEndReward", type=lambda x: bool(strtobool(x)), default=BROADCASTREWARD, nargs="?", const=True,
|
||||
help="save model or not")
|
||||
# target_learning_rate
|
||||
parser.add_argument("--target-lr", type=float, default=TARGET_LEARNING_RATE,
|
||||
help="target value of downscaling the learning rate")
|
||||
|
@ -14,6 +14,8 @@ def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
|
||||
nn.init.constant_(layer.bias, bias_const)
|
||||
return layer
|
||||
|
||||
neural_size_1 = 400
|
||||
neural_size_2 = 300
|
||||
|
||||
class PPOAgent(nn.Module):
|
||||
def __init__(
|
||||
@ -31,99 +33,76 @@ class PPOAgent(nn.Module):
|
||||
self.unity_action_size = env.unity_action_size
|
||||
self.state_size = self.unity_observation_shape[0]
|
||||
self.agent_num = env.unity_agent_num
|
||||
self.target_size = self.args.target_state_size
|
||||
self.time_state_size = self.args.time_state_size
|
||||
self.gun_state_size = self.args.gun_state_size
|
||||
self.my_state_size = self.args.my_state_size
|
||||
self.ray_state_size = env.unity_observation_shape[0] - self.args.total_target_size
|
||||
self.state_size_without_ray = self.args.total_target_size
|
||||
self.head_input_size = (
|
||||
env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size
|
||||
) # except target state input
|
||||
|
||||
self.unity_discrete_type = env.unity_discrete_type
|
||||
self.discrete_size = env.unity_discrete_size
|
||||
self.discrete_shape = list(env.unity_discrete_branches)
|
||||
self.continuous_size = env.unity_continuous_size
|
||||
|
||||
self.view_network = nn.Sequential(layer_init(nn.Linear(self.ray_state_size, 200)), nn.LeakyReLU())
|
||||
self.target_networks = nn.ModuleList(
|
||||
self.hidden_networks = nn.ModuleList(
|
||||
[
|
||||
nn.Sequential(layer_init(nn.Linear(self.state_size_without_ray, 100)), nn.LeakyReLU())
|
||||
for i in range(self.target_num)
|
||||
]
|
||||
)
|
||||
self.middle_networks = nn.ModuleList(
|
||||
[
|
||||
nn.Sequential(layer_init(nn.Linear(300, 200)), nn.LeakyReLU())
|
||||
nn.Sequential(
|
||||
layer_init(nn.Linear(self.state_size, neural_size_1)),
|
||||
nn.LeakyReLU(),
|
||||
layer_init(nn.Linear(neural_size_1, neural_size_2)),
|
||||
nn.LeakyReLU(),
|
||||
)
|
||||
for i in range(self.target_num)
|
||||
]
|
||||
)
|
||||
|
||||
self.actor_dis = nn.ModuleList(
|
||||
[layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(self.target_num)]
|
||||
[layer_init(nn.Linear(neural_size_2, self.discrete_size), std=0.5) for i in range(self.target_num)]
|
||||
)
|
||||
self.actor_mean = nn.ModuleList(
|
||||
[layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(self.target_num)]
|
||||
[layer_init(nn.Linear(neural_size_2, self.continuous_size), std=0) for i in range(self.target_num)]
|
||||
)
|
||||
self.actor_logstd = nn.ParameterList(
|
||||
[nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
|
||||
) # nn.Parameter(torch.zeros(1, self.continuous_size))
|
||||
)
|
||||
self.critic = nn.ModuleList(
|
||||
[layer_init(nn.Linear(200, 1), std=1) for i in range(self.target_num)]
|
||||
[layer_init(nn.Linear(neural_size_2, 1), std=0) for i in range(self.target_num)]
|
||||
)
|
||||
|
||||
def get_value(self, state: torch.Tensor):
|
||||
# get critic value
|
||||
# state.size()[0] is batch_size
|
||||
target = state[:, 0].to(torch.int32) # int
|
||||
this_state_num = target.size()[0]
|
||||
view_input = state[:, -self.ray_state_size:] # all ray input
|
||||
target_input = state[:, : self.state_size_without_ray]
|
||||
view_layer = self.view_network(view_input)
|
||||
target_layer = torch.stack(
|
||||
[self.target_networks[target[i]](target_input[i]) for i in range(this_state_num)]
|
||||
)
|
||||
middle_input = torch.cat([view_layer, target_layer], dim=1)
|
||||
middle_layer = torch.stack(
|
||||
[self.middle_networks[target[i]](middle_input[i]) for i in range(this_state_num)]
|
||||
hidden_output = torch.stack(
|
||||
[self.hidden_networks[target[i]](state[i]) for i in range(state.size()[0])]
|
||||
)
|
||||
criticV = torch.stack(
|
||||
[self.critic[target[i]](middle_layer[i]) for i in range(this_state_num)]
|
||||
) # self.critic
|
||||
[self.critic[target[i]](hidden_output[i]) for i in range(state.size()[0])]
|
||||
)
|
||||
return criticV
|
||||
|
||||
def get_actions_value(self, state: torch.Tensor, actions=None):
|
||||
# get actions and value
|
||||
target = state[:, 0].to(torch.int32) # int
|
||||
this_state_num = target.size()[0]
|
||||
view_input = state[:, -self.ray_state_size:] # all ray input
|
||||
target_input = state[:, : self.state_size_without_ray]
|
||||
view_layer = self.view_network(view_input)
|
||||
target_layer = torch.stack(
|
||||
[self.target_networks[target[i]](target_input[i]) for i in range(this_state_num)]
|
||||
)
|
||||
middle_input = torch.cat([view_layer, target_layer], dim=1)
|
||||
middle_layer = torch.stack(
|
||||
[self.middle_networks[target[i]](middle_input[i]) for i in range(this_state_num)]
|
||||
hidden_output = torch.stack(
|
||||
[self.hidden_networks[target[i]](state[i]) for i in range(target.size()[0])]
|
||||
)
|
||||
|
||||
# discrete
|
||||
# iterate over the targets (i.e. one per agent) so that each sample is passed through the output network matching its target
|
||||
dis_logits = torch.stack(
|
||||
[self.actor_dis[target[i]](middle_layer[i]) for i in range(this_state_num)]
|
||||
[self.actor_dis[target[i]](hidden_output[i]) for i in range(target.size()[0])]
|
||||
)
|
||||
split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
|
||||
multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
|
||||
# continuous
|
||||
actions_mean = torch.stack(
|
||||
[self.actor_mean[target[i]](middle_layer[i]) for i in range(this_state_num)]
|
||||
[self.actor_mean[target[i]](hidden_output[i]) for i in range(target.size()[0])]
|
||||
) # self.actor_mean(hidden)
|
||||
action_logstd = torch.stack(
|
||||
[torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(this_state_num)]
|
||||
[torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(target.size()[0])]
|
||||
)
|
||||
# print(action_logstd)
|
||||
action_std = torch.exp(action_logstd) # torch.exp(action_logstd)
|
||||
con_probs = Normal(actions_mean, action_std)
|
||||
# critic
|
||||
criticV = torch.stack(
|
||||
[self.critic[target[i]](middle_layer[i]) for i in range(this_state_num)]
|
||||
[self.critic[target[i]](hidden_output[i]) for i in range(target.size()[0])]
|
||||
) # self.critic
|
||||
|
||||
if actions is None:
|
||||
|
Loading…
Reference in New Issue
Block a user