Change training dataset storage method

Store the training dataset separately by target type.
While training, the NN runs its backward pass on a single-target training set.
This makes training at least 20 times faster than the previous update!
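A minimal sketch of the storage scheme, for illustration only (TARGETNUM and the 3000-sample threshold mirror the script; obs_size, the random round data, and the helper names are assumptions of this sketch, not code from the commit):

import torch

TARGETNUM = 4          # Free, Go, Attack, Defence
dataset_size = 3000    # MAX_TRAINNING_DATASETS in the script
obs_size = 8           # illustrative observation width
device = "cpu"

# one growing buffer per target type instead of a single mixed dataset
obs = [torch.empty((0, obs_size), device=device) for _ in range(TARGETNUM)]

def add_round(round_obs, target_type):
    # append a finished round's observations to that target's buffer
    obs[target_type] = torch.cat((obs[target_type], round_obs.to(device)), 0)

def ready_targets():
    # targets whose buffer alone is large enough to train on
    return [t for t in range(TARGETNUM) if obs[t].size()[0] >= dataset_size]

# usage: file each round under its own target, train on every target that
# has a full single-target batch, then clear that target's buffer
add_round(torch.rand(16, obs_size), target_type=2)   # e.g. an "Attack" round
for t in ready_targets():
    # the backward pass would use obs[t] only; reset the buffer afterwards
    obs[t] = torch.empty((0, obs_size), device=device)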
Koha9 2022-12-03 07:54:38 +09:00
parent 895cd5c118
commit cbc385ca10


@@ -10,6 +10,7 @@ import torch.optim as optim
from AimbotEnv import Aimbot
from tqdm import tqdm
from enum import Enum
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from distutils.util import strtobool
@@ -34,11 +35,11 @@ BASE_PORT = 1001
# max round steps per agent is 2500/DECISION_PERIOD, i.e. 25 seconds
# !!!check every parameter before running!!!
TOTAL_STEPS = 6000000
TOTAL_STEPS = 6750000
BATCH_SIZE = 512
MAX_TRAINNING_DATASETS = 8000
MAX_TRAINNING_DATASETS = 3000
DECISION_PERIOD = 1
LEARNING_RATE = 8e-4
LEARNING_RATE = 1e-3
GAMMA = 0.99
GAE_LAMBDA = 0.95
EPOCHS = 4
@@ -54,17 +55,27 @@ NORM_ADV = True
TRAIN = True
WANDB_TACK = True
#LOAD_DIR = None
LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
LOAD_DIR = None
#LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
# public data
class Targets(Enum):
Free = 0
Go = 1
Attack = 2
Defence = 3
Num = 4
BASE_WINREWARD = 999
BASE_LOSEREWARD = -999
TARGETNUM= 4
ENV_TIMELIMIT = 30
RESULT_BROADCAST_RATIO = 2/ENV_TIMELIMIT
TotalRounds = {"Go":0,"Attack":0,"Free":0}
WinRounds = {"Go":0,"Attack":0,"Free":0}
TotalRounds = {"Free":0,"Go":0,"Attack":0}
WinRounds = {"Free":0,"Go":0,"Attack":0}
# !!!SPECIAL PARAMETERS!!!
# change this once the program is finalized
using_targets_num = 3
def parse_args():
@@ -164,7 +175,7 @@ class PPOAgent(nn.Module):
def get_actions_value(self, state: torch.Tensor, actions=None):
hidden = self.network(state)
targets = state[:,0]
targets = state[:,0].to(torch.int32)
# discrete
# iterate over the targets (i.e. the agents) and use each target's own output network to compute its outputs
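# Illustrative sketch (not from this commit): one way to route each sample's
# hidden features through its target's own output head, as described above.
# The names heads, hidden_dim and out_dim are assumptions of this sketch.
import torch
import torch.nn as nn

TARGETNUM, hidden_dim, out_dim = 4, 16, 5
heads = nn.ModuleList([nn.Linear(hidden_dim, out_dim) for _ in range(TARGETNUM)])
hidden = torch.rand(3, hidden_dim)                     # batch of hidden features
targets = torch.tensor([0, 2, 2], dtype=torch.int32)   # per-sample target ids
out = torch.stack([heads[int(t)](h) for t, h in zip(targets, hidden)])  # (3, out_dim)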
@@ -321,8 +332,8 @@ if __name__ == "__main__":
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
# Tensorboard and WandB Recorder
game_name = "Aimbot_Target"
game_type = "OffPolicy_HMNN_EndBC"
game_name = "Aimbot_Target_Hybrid_Multi_Output"
game_type = "OffPolicy_EndBC"
run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
if args.wandb_track:
wandb.init(
@@ -351,14 +362,24 @@ if __name__ == "__main__":
dones_bf = [[] for i in range(env.unity_agent_num)]
values_bf = [[] for i in range(env.unity_agent_num)]
# TRY NOT TO MODIFY: start the game
total_update_step = args.total_timesteps // args.datasetSize
global_step = 0
# start the game
total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
target_steps = [0 for i in range(TARGETNUM)]
start_time = time.time()
state, _, done = env.reset()
# state = torch.Tensor(next_obs).to(device)
# next_done = torch.zeros(env.unity_agent_num).to(device)
# initialize empty training datasets
obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_observation_size)
actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_action_size)
dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
values = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
for total_steps in range(total_update_step):
# decay the learning rate; when step == total_update_step, lr reaches 0
print("new episode")
@@ -368,24 +389,15 @@ if __name__ == "__main__":
lrnow = frac * args.lr
optimizer.param_groups[0]["lr"] = lrnow
# initialize empty training datasets
obs = torch.tensor([]).to(device) # (n,env.unity_observation_size)
actions = torch.tensor([]).to(device) # (n,env.unity_action_size)
dis_logprobs = torch.tensor([]).to(device) # (n,1)
con_logprobs = torch.tensor([]).to(device) # (n,1)
rewards = torch.tensor([]).to(device) # (n,1)
values = torch.tensor([]).to(device) # (n,1)
advantages = torch.tensor([]).to(device) # (n,1)
returns = torch.tensor([]).to(device) # (n,1)
# MAIN LOOP: run agent in environment
i = 0
training = False
trainQueue = []
while True:
if i % args.decision_period == 0:
step = round(i / args.decision_period)
# Choose action by agent
global_step += 1 * env.unity_agent_num
with torch.no_grad():
# predict actions
@@ -416,7 +428,8 @@
# finished a round, send finished memories to training datasets
# compute advantage and discounted reward
#print(i,"over")
thisRewardsTensor = broadCastEndReward(rewards_bf[i],state[i,6])
roundTargetType = int(state[i,0])
thisRewardsTensor = broadCastEndReward(rewards_bf[i],roundTargetType)
adv, rt = GAE(
agent,
args,
@@ -427,18 +440,18 @@
torch.Tensor([next_done[i]]).to(device),
)
# send memories to training datasets
obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
dis_logprobs = torch.cat(
(dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
dis_logprobs[roundTargetType] = torch.cat(
(dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
)
con_logprobs = torch.cat(
(con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
con_logprobs[roundTargetType] = torch.cat(
(con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
)
rewards = torch.cat((rewards, thisRewardsTensor), 0)
values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
advantages = torch.cat((advantages, adv), 0)
returns = torch.cat((returns, rt), 0)
rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
# clear buffers
ob_bf[i] = []
@@ -448,10 +461,13 @@ if __name__ == "__main__":
rewards_bf[i] = []
dones_bf[i] = []
values_bf[i] = []
print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
if obs.size()[0] >= args.datasetSize:
for i in range(TARGETNUM):
if obs[i].size()[0] >= args.datasetSize:
# start train NN
trainQueue.append(i)
if(len(trainQueue)>0):
break
state, done = next_state, next_done
else:
@@ -507,15 +523,19 @@ if __name__ == "__main__":
i += 1
if args.train:
meanRewardList = [] # for WANDB
# loop over every target in the training queue
for thisT in trainQueue:
target_steps[thisT]+=1
# flatten the batch
b_obs = obs.reshape((-1,) + env.unity_observation_shape)
b_dis_logprobs = dis_logprobs.reshape(-1)
b_con_logprobs = con_logprobs.reshape(-1)
b_actions = actions.reshape((-1,) + (env.unity_action_size,))
b_advantages = advantages.reshape(-1)
b_returns = returns.reshape(-1)
b_values = values.reshape(-1)
b_size = b_obs.size()[0]
b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
b_con_logprobs = con_logprobs[thisT].reshape(-1)
b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
b_advantages = advantages[thisT].reshape(-1)
b_returns = returns[thisT].reshape(-1)
b_values = values[thisT].reshape(-1)
b_size = b_obs.size()[0]
# Optimizing the policy and value network
b_inds = np.arange(b_size)
# clipfracs = []
@@ -605,30 +625,41 @@ if __name__ == "__main__":
if approx_kl > args.target_kl:
break
"""
# record the mean reward before clearing the history
targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
meanRewardList.append(targetRewardMean)
targetName = Targets(thisT).name
# clear this target's training set buffer
obs[thisT] = torch.tensor([]).to(device)
actions[thisT] = torch.tensor([]).to(device)
dis_logprobs[thisT] = torch.tensor([]).to(device)
con_logprobs[thisT] = torch.tensor([]).to(device)
rewards[thisT] = torch.tensor([]).to(device)
values[thisT] = torch.tensor([]).to(device)
advantages[thisT] = torch.tensor([]).to(device)
returns[thisT] = torch.tensor([]).to(device)
# record rewards for plotting purposes
rewardsMean = np.mean(rewards.to("cpu").detach().numpy().copy())
writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
writer.add_scalar("losses/dis_policy_loss", dis_pg_loss.item(), global_step)
writer.add_scalar("losses/con_policy_loss", con_pg_loss.item(), global_step)
writer.add_scalar("losses/total_loss", loss.item(), global_step)
writer.add_scalar("losses/entropy_loss", entropy_loss.item(), global_step)
# writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
# writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
# writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
# print("SPS:", int(global_step / (time.time() - start_time)))
print("episode over mean reward:", rewardsMean)
writer.add_scalar(
"charts/SPS", int(global_step / (time.time() - start_time)), global_step
)
writer.add_scalar("charts/Reward", rewardsMean, global_step)
writer.add_scalar("charts/GoWinRatio", WinRounds["Go"]/TotalRounds["Go"], global_step)
writer.add_scalar("charts/AttackWinRatio", WinRounds["Attack"]/TotalRounds["Attack"], global_step)
writer.add_scalar("charts/FreeWinRatio", WinRounds["Free"]/TotalRounds["Free"], global_step)
if rewardsMean > bestReward:
bestReward = rewardsMean
saveDir = "../PPO-Model/Target-700-500-256-hybrid-" + str(rewardsMean) + ".pt"
writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT])
print(f"episode over Target{targetName} mean reward:", targetRewardMean)
TotalRewardMean = np.mean(meanRewardList)
writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps)
# New Record!
if TotalRewardMean > bestReward:
bestReward = TotalRewardMean
saveDir = "../PPO-Model/Hybrid-MNN-500-300" + str(TotalRewardMean) + ".pt"
torch.save(agent, saveDir)
saveDir = "../PPO-Model/Hybrid-MNN-500-300-Last" + ".pt"
torch.save(agent, saveDir)
env.close()
writer.close()