Change training dataset storage method
Store training data separately by target type; when training the NN, run the backward pass on a single target's dataset at a time. This makes training at least 20 times faster than the previous update.
parent 895cd5c118
commit cbc385ca10
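The mechanism, roughly, is one growing buffer per target type: each finished round is appended to the buffer matching its target, and a network update runs on a single target's buffer once that buffer reaches the dataset size. A minimal sketch of that idea, with illustrative names (TARGET_NUM, DATASET_SIZE, add_round are not the script's own identifiers):

import torch

TARGET_NUM = 4        # Free / Go / Attack / Defence, as in the Targets enum below
DATASET_SIZE = 3000   # per-target threshold, mirroring MAX_TRAINNING_DATASETS
device = "cpu"

# one growing tensor per target type instead of a single shared dataset
obs_buffers = [torch.tensor([]).to(device) for _ in range(TARGET_NUM)]

def add_round(target_type, round_obs):
    # append one finished round's observations to its target's buffer
    obs_buffers[target_type] = torch.cat((obs_buffers[target_type], round_obs), 0)

def targets_ready_to_train():
    # targets whose buffers have reached the dataset size and can be trained on alone
    return [t for t in range(TARGET_NUM) if obs_buffers[t].size(0) >= DATASET_SIZE]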
@@ -10,6 +10,7 @@ import torch.optim as optim
 from AimbotEnv import Aimbot
 from tqdm import tqdm
+from enum import Enum
 from torch.distributions.normal import Normal
 from torch.distributions.categorical import Categorical
 from distutils.util import strtobool
@@ -34,11 +35,11 @@ BASE_PORT = 1001
 # max round steps per agent is 2500/Decision_period, 25 seconds
 # !!!check every parameters before run!!!

-TOTAL_STEPS = 6000000
+TOTAL_STEPS = 6750000
 BATCH_SIZE = 512
-MAX_TRAINNING_DATASETS = 8000
+MAX_TRAINNING_DATASETS = 3000
 DECISION_PERIOD = 1
-LEARNING_RATE = 8e-4
+LEARNING_RATE = 1e-3
 GAMMA = 0.99
 GAE_LAMBDA = 0.95
 EPOCHS = 4
@@ -54,17 +55,27 @@ NORM_ADV = True
 TRAIN = True

 WANDB_TACK = True
-#LOAD_DIR = None
-LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
+LOAD_DIR = None
+#LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"

 # public data
+class Targets(Enum):
+    Free = 0
+    Go = 1
+    Attack = 2
+    Defence = 3
+    Num = 4
 BASE_WINREWARD = 999
 BASE_LOSEREWARD = -999
+TARGETNUM= 4
 ENV_TIMELIMIT = 30
 RESULT_BROADCAST_RATIO = 2/ENV_TIMELIMIT
-TotalRounds = {"Go":0,"Attack":0,"Free":0}
-WinRounds = {"Go":0,"Attack":0,"Free":0}
+TotalRounds = {"Free":0,"Go":0,"Attack":0}
+WinRounds = {"Free":0,"Go":0,"Attack":0}

+# !!!SPECIAL PARAMETERS!!!
+# change it while program is finished
+using_targets_num = 3


 def parse_args():
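For reference, the new Targets enum makes the target index stored in the observation readable by name; the convention that element 0 of the state encodes the target type is the one used later in the diff (state[:,0], state[i,0]). A small, self-contained example:

from enum import Enum
import torch

class Targets(Enum):
    Free = 0
    Go = 1
    Attack = 2
    Defence = 3
    Num = 4

state = torch.tensor([2.0, 0.5, -1.0])   # toy state vector; element 0 is the target type
round_target_type = int(state[0])        # index used to pick the per-target buffer
print(Targets(round_target_type).name)   # -> "Attack"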
@@ -164,7 +175,7 @@ class PPOAgent(nn.Module):

     def get_actions_value(self, state: torch.Tensor, actions=None):
         hidden = self.network(state)
-        targets = state[:,0]
+        targets = state[:,0].to(torch.int32)

         # discrete
         # iterate over the targets (i.e. the agents) and use each target's corresponding output network to compute its output
@@ -321,8 +332,8 @@ if __name__ == "__main__":
 optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)

 # Tensorboard and WandB Recorder
-game_name = "Aimbot_Target"
-game_type = "OffPolicy_HMNN_EndBC"
+game_name = "Aimbot_Target_Hybrid_Multi_Output"
+game_type = "OffPolicy_EndBC"
 run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
 if args.wandb_track:
 wandb.init(
@@ -351,14 +362,24 @@ if __name__ == "__main__":
 dones_bf = [[] for i in range(env.unity_agent_num)]
 values_bf = [[] for i in range(env.unity_agent_num)]

-# TRY NOT TO MODIFY: start the game
-total_update_step = args.total_timesteps // args.datasetSize
 global_step = 0
+# start the game
+total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
+target_steps = [0 for i in range(TARGETNUM)]
 start_time = time.time()
 state, _, done = env.reset()
 # state = torch.Tensor(next_obs).to(device)
 # next_done = torch.zeros(env.unity_agent_num).to(device)

+# initialize empty training datasets
+obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,env.unity_observation_size)
+actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,env.unity_action_size)
+dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
+con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
+rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
+values = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
+advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
+returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)

 for total_steps in range(total_update_step):
 # discunt learning rate, while step == total_update_step lr will be 0
 print("new episode")
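The outer-loop budget is now scaled by using_targets_num, presumably because each update now consumes only one target's dataset rather than a shared one. Under the values set in this commit the arithmetic works out as follows (a quick check of the diff's own formula, nothing more):

TOTAL_STEPS = 6750000
DATASET_SIZE = 3000      # MAX_TRAINNING_DATASETS
using_targets_num = 3    # targets actually played during collection

total_update_step = using_targets_num * TOTAL_STEPS // DATASET_SIZE
print(total_update_step)  # 6750 outer-loop iterations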
@@ -368,24 +389,15 @@ if __name__ == "__main__":
 lrnow = frac * args.lr
 optimizer.param_groups[0]["lr"] = lrnow

-# initialize empty training datasets
-obs = torch.tensor([]).to(device)  # (n,env.unity_observation_size)
-actions = torch.tensor([]).to(device)  # (n,env.unity_action_size)
-dis_logprobs = torch.tensor([]).to(device)  # (n,1)
-con_logprobs = torch.tensor([]).to(device)  # (n,1)
-rewards = torch.tensor([]).to(device)  # (n,1)
-values = torch.tensor([]).to(device)  # (n,1)
-advantages = torch.tensor([]).to(device)  # (n,1)
-returns = torch.tensor([]).to(device)  # (n,1)
-
 # MAIN LOOP: run agent in environment
 i = 0
 training = False
+trainQueue = []
 while True:
 if i % args.decision_period == 0:
 step = round(i / args.decision_period)
 # Choose action by agent
 global_step += 1 * env.unity_agent_num

 with torch.no_grad():
 # predict actions
@@ -416,7 +428,8 @@ if __name__ == "__main__":
 # finished a round, send finished memories to training datasets
 # compute advantage and discounted reward
 #print(i,"over")
-thisRewardsTensor = broadCastEndReward(rewards_bf[i],state[i,6])
+roundTargetType = int(state[i,0])
+thisRewardsTensor = broadCastEndReward(rewards_bf[i],roundTargetType)
 adv, rt = GAE(
 agent,
 args,
@@ -427,18 +440,18 @@ if __name__ == "__main__":
 torch.Tensor([next_done[i]]).to(device),
 )
 # send memories to training datasets
-obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
-actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
-dis_logprobs = torch.cat(
-(dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
+obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
+actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
+dis_logprobs[roundTargetType] = torch.cat(
+(dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
 )
-con_logprobs = torch.cat(
-(con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
+con_logprobs[roundTargetType] = torch.cat(
+(con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
 )
-rewards = torch.cat((rewards, thisRewardsTensor), 0)
-values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
-advantages = torch.cat((advantages, adv), 0)
-returns = torch.cat((returns, rt), 0)
+rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
+values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
+advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
+returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)

 # clear buffers
 ob_bf[i] = []
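Condensed, the routing above amounts to: every per-round tensor is concatenated onto the buffer indexed by that round's target type. A sketch with a hypothetical route_round helper (not the script's API):

import torch

TARGET_NUM = 4
device = "cpu"
keys = ("obs", "actions", "dis_logprobs", "con_logprobs", "rewards", "values", "advantages", "returns")

# one list of per-target buffers for each stored quantity
buffers = {k: [torch.tensor([]).to(device) for _ in range(TARGET_NUM)] for k in keys}

def route_round(target_type, round_data):
    # append every tensor of one finished round to the buffers of its target type
    for k, t in round_data.items():
        buffers[k][target_type] = torch.cat((buffers[k][target_type], t), 0)

# e.g. an 8-step "Attack" (index 2) round with 30-dimensional observations
route_round(2, {"obs": torch.rand(8, 30), "rewards": torch.rand(8)})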
@@ -448,10 +461,13 @@ if __name__ == "__main__":
 rewards_bf[i] = []
 dones_bf[i] = []
 values_bf[i] = []
-print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
+print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")

-if obs.size()[0] >= args.datasetSize:
+for i in range(TARGETNUM):
+if obs[i].size()[0] >= args.datasetSize:
 # start train NN
+trainQueue.append(i)
+if(len(trainQueue)>0):
 break
 state, done = next_state, next_done
 else:
@@ -507,15 +523,19 @@ if __name__ == "__main__":
 i += 1

 if args.train:
+meanRewardList = [] # for WANDB
+# loop all tarining queue
+for thisT in trainQueue:
+target_steps[thisT]+=1
 # flatten the batch
-b_obs = obs.reshape((-1,) + env.unity_observation_shape)
-b_dis_logprobs = dis_logprobs.reshape(-1)
-b_con_logprobs = con_logprobs.reshape(-1)
-b_actions = actions.reshape((-1,) + (env.unity_action_size,))
-b_advantages = advantages.reshape(-1)
-b_returns = returns.reshape(-1)
-b_values = values.reshape(-1)
-b_size = b_obs.size()[0]
+b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
+b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
+b_con_logprobs = con_logprobs[thisT].reshape(-1)
+b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
+b_advantages = advantages[thisT].reshape(-1)
+b_returns = returns[thisT].reshape(-1)
+b_values = values[thisT].reshape(-1)
+b_size = b_obs[thisT].size()[0]
 # Optimizing the policy and value network
 b_inds = np.arange(b_size)
 # clipfracs = []
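Flattening now happens per target: one target's buffers become the whole batch, which is then cut into shuffled minibatches of BATCH_SIZE for the PPO epochs. A sketch of that indexing pattern with toy data (shapes are placeholders):

import numpy as np
import torch

BATCH_SIZE = 512
b_obs = torch.rand(3000, 30)       # one target's flattened observations (toy shape)
b_advantages = torch.rand(3000)

b_size = b_obs.size()[0]
b_inds = np.arange(b_size)
np.random.shuffle(b_inds)
for start in range(0, b_size, BATCH_SIZE):
    mb_inds = b_inds[start:start + BATCH_SIZE]
    mb_obs = b_obs[mb_inds]                    # minibatch fed to the policy/value heads
    mb_advantages = b_advantages[mb_inds]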
@@ -605,30 +625,41 @@ if __name__ == "__main__":
 if approx_kl > args.target_kl:
 break
 """
+# record mean reward before clear history
+targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
+meanRewardList.append(targetRewardMean)
+targetName = Targets(thisT).name
+
+# clear this target trainning set buffer
+obs[thisT] = torch.tensor([]).to(device)
+actions[thisT] = torch.tensor([]).to(device)
+dis_logprobs[thisT] = torch.tensor([]).to(device)
+con_logprobs[thisT] = torch.tensor([]).to(device)
+rewards[thisT] = torch.tensor([]).to(device)
+values[thisT] = torch.tensor([]).to(device)
+advantages[thisT] = torch.tensor([]).to(device)
+returns[thisT] = torch.tensor([]).to(device)
+
-# record rewards for plotting purposes
-rewardsMean = np.mean(rewards.to("cpu").detach().numpy().copy())
-writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
-writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
-writer.add_scalar("losses/dis_policy_loss", dis_pg_loss.item(), global_step)
-writer.add_scalar("losses/con_policy_loss", con_pg_loss.item(), global_step)
-writer.add_scalar("losses/total_loss", loss.item(), global_step)
-writer.add_scalar("losses/entropy_loss", entropy_loss.item(), global_step)
-# writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
-# writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
-# writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
-# print("SPS:", int(global_step / (time.time() - start_time)))
-print("episode over mean reward:", rewardsMean)
-writer.add_scalar(
-"charts/SPS", int(global_step / (time.time() - start_time)), global_step
-)
-writer.add_scalar("charts/Reward", rewardsMean, global_step)
-writer.add_scalar("charts/GoWinRatio", WinRounds["Go"]/TotalRounds["Go"], global_step)
-writer.add_scalar("charts/AttackWinRatio", WinRounds["Attack"]/TotalRounds["Attack"], global_step)
-writer.add_scalar("charts/FreeWinRatio", WinRounds["Free"]/TotalRounds["Free"], global_step)
-if rewardsMean > bestReward:
-bestReward = rewardsMean
-saveDir = "../PPO-Model/Target-700-500-256-hybrid-" + str(rewardsMean) + ".pt"
+writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT])
+print(f"episode over Target{targetName} mean reward:", targetRewardMean)
+TotalRewardMean = np.mean(meanRewardList)
+writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
+writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps)
+# New Record!
+if TotalRewardMean > bestReward:
+bestReward = targetRewardMean
+saveDir = "../PPO-Model/Hybrid-MNN-500-300" + str(TotalRewardMean) + ".pt"
 torch.save(agent, saveDir)

+saveDir = "../PPO-Model/Hybrid-MNN-500-300-Last" + ".pt"
 torch.save(agent, saveDir)
 env.close()
 writer.close()
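Logging follows the same split: each target gets its own tag prefix and is stepped by its own update counter (target_steps[thisT]) instead of global_step. A small illustration with placeholder values; log_target is a hypothetical helper, not part of the script:

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("runs/per-target-demo")
target_steps = [0, 0, 0, 0]

def log_target(target_name, target_index, reward_mean, win_ratio):
    writer.add_scalar(f"Target{target_name}/Reward", reward_mean, target_steps[target_index])
    writer.add_scalar(f"Target{target_name}/WinRatio", win_ratio, target_steps[target_index])

target_steps[2] += 1
log_target("Attack", 2, reward_mean=0.37, win_ratio=0.55)
writer.close()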