Change training dataset storage method

Store the training dataset separately for each target type.
While training, the NN is updated (backward pass) with a single target's training set at a time.
This makes training at least 20 times faster than the last update!
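
In plain terms: the single mixed rollout buffer becomes one buffer per target type, and each episode only the targets whose buffer has reached the dataset-size threshold get a PPO update. Below is a minimal sketch of that idea, not the script's exact code; the names (obs_buffers, add_round, train_queue) and the threshold constant are illustrative only:

import torch

TARGETNUM = 4        # number of target types (Free, Go, Attack, Defence)
DATASET_SIZE = 3000  # rows a target's buffer needs before it is trained on
device = "cuda" if torch.cuda.is_available() else "cpu"

# one observation buffer per target type instead of a single mixed buffer
obs_buffers = [torch.tensor([]).to(device) for _ in range(TARGETNUM)]

def add_round(target_type: int, round_obs: torch.Tensor) -> None:
    # append a finished round's observations to the buffer of its own target type
    buf = obs_buffers[target_type]
    obs_buffers[target_type] = round_obs.to(device) if buf.numel() == 0 else torch.cat((buf, round_obs.to(device)), 0)

def train_queue() -> list:
    # only targets whose buffer reached the threshold get a backward pass this episode
    return [t for t in range(TARGETNUM) if obs_buffers[t].size(0) >= DATASET_SIZE]

The diff below applies the same per-target split to actions, log-probs, rewards, values, advantages, and returns, and clears only the buffers that were just trained on.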
This commit is contained in:
Koha9 2022-12-03 07:54:38 +09:00
parent 895cd5c118
commit cbc385ca10


@@ -10,6 +10,7 @@ import torch.optim as optim
 from AimbotEnv import Aimbot
 from tqdm import tqdm
+from enum import Enum
 from torch.distributions.normal import Normal
 from torch.distributions.categorical import Categorical
 from distutils.util import strtobool
@@ -34,11 +35,11 @@ BASE_PORT = 1001
 # max round steps per agent is 2500/Decision_period, 25 seconds
 # !!!check every parameters before run!!!
-TOTAL_STEPS = 6000000
+TOTAL_STEPS = 6750000
 BATCH_SIZE = 512
-MAX_TRAINNING_DATASETS = 8000
+MAX_TRAINNING_DATASETS = 3000
 DECISION_PERIOD = 1
-LEARNING_RATE = 8e-4
+LEARNING_RATE = 1e-3
 GAMMA = 0.99
 GAE_LAMBDA = 0.95
 EPOCHS = 4
@@ -54,17 +55,27 @@ NORM_ADV = True
 TRAIN = True
 WANDB_TACK = True
-#LOAD_DIR = None
-LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
+LOAD_DIR = None
+#LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
 # public data
+class Targets(Enum):
+Free = 0
+Go = 1
+Attack = 2
+Defence = 3
+Num = 4
 BASE_WINREWARD = 999
 BASE_LOSEREWARD = -999
 TARGETNUM= 4
 ENV_TIMELIMIT = 30
 RESULT_BROADCAST_RATIO = 2/ENV_TIMELIMIT
-TotalRounds = {"Go":0,"Attack":0,"Free":0}
-WinRounds = {"Go":0,"Attack":0,"Free":0}
+TotalRounds = {"Free":0,"Go":0,"Attack":0}
+WinRounds = {"Free":0,"Go":0,"Attack":0}
+# !!!SPECIAL PARAMETERS!!!
+# change it while program is finished
+using_targets_num = 3
 def parse_args():
@@ -164,7 +175,7 @@ class PPOAgent(nn.Module):
 def get_actions_value(self, state: torch.Tensor, actions=None):
 hidden = self.network(state)
-targets = state[:,0]
+targets = state[:,0].to(torch.int32)
 # discrete
 # iterate over the number of targets (i.e. the number of agents) so that each target uses its corresponding output network to compute the output
@@ -321,8 +332,8 @@ if __name__ == "__main__":
 optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
 # Tensorboard and WandB Recorder
-game_name = "Aimbot_Target"
-game_type = "OffPolicy_HMNN_EndBC"
+game_name = "Aimbot_Target_Hybrid_Multi_Output"
+game_type = "OffPolicy_EndBC"
 run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
 if args.wandb_track:
 wandb.init(
@@ -351,14 +362,24 @@ if __name__ == "__main__":
 dones_bf = [[] for i in range(env.unity_agent_num)]
 values_bf = [[] for i in range(env.unity_agent_num)]
-# TRY NOT TO MODIFY: start the game
-total_update_step = args.total_timesteps // args.datasetSize
-global_step = 0
+# start the game
+total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
+target_steps = [0 for i in range(TARGETNUM)]
 start_time = time.time()
 state, _, done = env.reset()
 # state = torch.Tensor(next_obs).to(device)
 # next_done = torch.zeros(env.unity_agent_num).to(device)
+# initialize empty training datasets
+obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_observation_size)
+actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_action_size)
+dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+values = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
 for total_steps in range(total_update_step):
 # discunt learning rate, while step == total_update_step lr will be 0
 print("new episode")
@@ -368,24 +389,15 @@ if __name__ == "__main__":
 lrnow = frac * args.lr
 optimizer.param_groups[0]["lr"] = lrnow
-# initialize empty training datasets
-obs = torch.tensor([]).to(device) # (n,env.unity_observation_size)
-actions = torch.tensor([]).to(device) # (n,env.unity_action_size)
-dis_logprobs = torch.tensor([]).to(device) # (n,1)
-con_logprobs = torch.tensor([]).to(device) # (n,1)
-rewards = torch.tensor([]).to(device) # (n,1)
-values = torch.tensor([]).to(device) # (n,1)
-advantages = torch.tensor([]).to(device) # (n,1)
-returns = torch.tensor([]).to(device) # (n,1)
 # MAIN LOOP: run agent in environment
 i = 0
 training = False
+trainQueue = []
 while True:
 if i % args.decision_period == 0:
 step = round(i / args.decision_period)
 # Choose action by agent
-global_step += 1 * env.unity_agent_num
 with torch.no_grad():
 # predict actions
@@ -416,7 +428,8 @@ if __name__ == "__main__":
 # finished a round, send finished memories to training datasets
 # compute advantage and discounted reward
 #print(i,"over")
-thisRewardsTensor = broadCastEndReward(rewards_bf[i],state[i,6])
+roundTargetType = int(state[i,0])
+thisRewardsTensor = broadCastEndReward(rewards_bf[i],roundTargetType)
 adv, rt = GAE(
 agent,
 args,
@@ -427,18 +440,18 @@ if __name__ == "__main__":
 torch.Tensor([next_done[i]]).to(device),
 )
 # send memories to training datasets
-obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
-actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
-dis_logprobs = torch.cat(
-(dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
+obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
+actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
+dis_logprobs[roundTargetType] = torch.cat(
+(dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
 )
-con_logprobs = torch.cat(
-(con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
+con_logprobs[roundTargetType] = torch.cat(
+(con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
 )
-rewards = torch.cat((rewards, thisRewardsTensor), 0)
-values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
-advantages = torch.cat((advantages, adv), 0)
-returns = torch.cat((returns, rt), 0)
+rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
+values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
+advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
+returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
 # clear buffers
 ob_bf[i] = []
@@ -448,10 +461,13 @@ if __name__ == "__main__":
 rewards_bf[i] = []
 dones_bf[i] = []
 values_bf[i] = []
-print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
-if obs.size()[0] >= args.datasetSize:
-# start train NN
+print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
+for i in range(TARGETNUM):
+if obs[i].size()[0] >= args.datasetSize:
+# start train NN
+trainQueue.append(i)
+if(len(trainQueue)>0):
 break
 state, done = next_state, next_done
 else:
@@ -507,128 +523,143 @@ if __name__ == "__main__":
 i += 1
 if args.train:
-# flatten the batch
-b_obs = obs.reshape((-1,) + env.unity_observation_shape)
-b_dis_logprobs = dis_logprobs.reshape(-1)
-b_con_logprobs = con_logprobs.reshape(-1)
-b_actions = actions.reshape((-1,) + (env.unity_action_size,))
-b_advantages = advantages.reshape(-1)
-b_returns = returns.reshape(-1)
-b_values = values.reshape(-1)
-b_size = b_obs.size()[0]
-# Optimizing the policy and value network
-b_inds = np.arange(b_size)
-# clipfracs = []
-for epoch in range(args.epochs):
-# shuffle all datasets
-np.random.shuffle(b_inds)
-for start in range(0, b_size, args.minibatchSize):
-end = start + args.minibatchSize
-mb_inds = b_inds[start:end]
-mb_advantages = b_advantages[mb_inds]
-# normalize advantages
-if args.norm_adv:
-mb_advantages = (mb_advantages - mb_advantages.mean()) / (
-mb_advantages.std() + 1e-8
-)
-(
-_,
-new_dis_logprob,
-dis_entropy,
-new_con_logprob,
-con_entropy,
-newvalue,
-) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
-# discrete ratio
-dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
-dis_ratio = dis_logratio.exp()
-# continuous ratio
-con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
-con_ratio = con_logratio.exp()
-"""
-# early stop
-with torch.no_grad():
-# calculate approx_kl http://joschu.net/blog/kl-approx.html
-old_approx_kl = (-logratio).mean()
-approx_kl = ((ratio - 1) - logratio).mean()
-clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
-"""
-# discrete Policy loss
-dis_pg_loss_orig = -mb_advantages * dis_ratio
-dis_pg_loss_clip = -mb_advantages * torch.clamp(
-dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
-)
-dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
-# continuous Policy loss
-con_pg_loss_orig = -mb_advantages * con_ratio
-con_pg_loss_clip = -mb_advantages * torch.clamp(
-con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
-)
-con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
-# Value loss
-newvalue = newvalue.view(-1)
-if args.clip_vloss:
-v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
-v_clipped = b_values[mb_inds] + torch.clamp(
-newvalue - b_values[mb_inds],
--args.clip_coef,
-args.clip_coef,
-)
-v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
-v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
-v_loss = 0.5 * v_loss_max.mean()
-else:
-v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
-# total loss
-entropy_loss = dis_entropy.mean() + con_entropy.mean()
-loss = (
-dis_pg_loss * args.policy_coef
-+ con_pg_loss * args.policy_coef
-- entropy_loss * args.ent_coef
-+ v_loss * args.critic_coef
-)
-optimizer.zero_grad()
-loss.backward()
-# Clips gradient norm of an iterable of parameters.
-nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
-optimizer.step()
-"""
-if args.target_kl is not None:
-if approx_kl > args.target_kl:
-break
-"""
-# record rewards for plotting purposes
-rewardsMean = np.mean(rewards.to("cpu").detach().numpy().copy())
-writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
-writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
-writer.add_scalar("losses/dis_policy_loss", dis_pg_loss.item(), global_step)
-writer.add_scalar("losses/con_policy_loss", con_pg_loss.item(), global_step)
-writer.add_scalar("losses/total_loss", loss.item(), global_step)
-writer.add_scalar("losses/entropy_loss", entropy_loss.item(), global_step)
-# writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
-# writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
-# writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
-# print("SPS:", int(global_step / (time.time() - start_time)))
-print("episode over mean reward:", rewardsMean)
-writer.add_scalar(
-"charts/SPS", int(global_step / (time.time() - start_time)), global_step
-)
-writer.add_scalar("charts/Reward", rewardsMean, global_step)
-writer.add_scalar("charts/GoWinRatio", WinRounds["Go"]/TotalRounds["Go"], global_step)
-writer.add_scalar("charts/AttackWinRatio", WinRounds["Attack"]/TotalRounds["Attack"], global_step)
-writer.add_scalar("charts/FreeWinRatio", WinRounds["Free"]/TotalRounds["Free"], global_step)
-if rewardsMean > bestReward:
-bestReward = rewardsMean
-saveDir = "../PPO-Model/Target-700-500-256-hybrid-" + str(rewardsMean) + ".pt"
-torch.save(agent, saveDir)
+meanRewardList = [] # for WANDB
+# loop all tarining queue
+for thisT in trainQueue:
+target_steps[thisT]+=1
+# flatten the batch
+b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
+b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
+b_con_logprobs = con_logprobs[thisT].reshape(-1)
+b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
+b_advantages = advantages[thisT].reshape(-1)
+b_returns = returns[thisT].reshape(-1)
+b_values = values[thisT].reshape(-1)
+b_size = b_obs[thisT].size()[0]
+# Optimizing the policy and value network
+b_inds = np.arange(b_size)
+# clipfracs = []
+for epoch in range(args.epochs):
+# shuffle all datasets
+np.random.shuffle(b_inds)
+for start in range(0, b_size, args.minibatchSize):
+end = start + args.minibatchSize
+mb_inds = b_inds[start:end]
+mb_advantages = b_advantages[mb_inds]
+# normalize advantages
+if args.norm_adv:
+mb_advantages = (mb_advantages - mb_advantages.mean()) / (
+mb_advantages.std() + 1e-8
+)
+(
+_,
+new_dis_logprob,
+dis_entropy,
+new_con_logprob,
+con_entropy,
+newvalue,
+) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
+# discrete ratio
+dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
+dis_ratio = dis_logratio.exp()
+# continuous ratio
+con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
+con_ratio = con_logratio.exp()
+"""
+# early stop
+with torch.no_grad():
+# calculate approx_kl http://joschu.net/blog/kl-approx.html
+old_approx_kl = (-logratio).mean()
+approx_kl = ((ratio - 1) - logratio).mean()
+clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
+"""
+# discrete Policy loss
+dis_pg_loss_orig = -mb_advantages * dis_ratio
+dis_pg_loss_clip = -mb_advantages * torch.clamp(
+dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
+)
+dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
+# continuous Policy loss
+con_pg_loss_orig = -mb_advantages * con_ratio
+con_pg_loss_clip = -mb_advantages * torch.clamp(
+con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
+)
+con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
+# Value loss
+newvalue = newvalue.view(-1)
+if args.clip_vloss:
+v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
+v_clipped = b_values[mb_inds] + torch.clamp(
+newvalue - b_values[mb_inds],
+-args.clip_coef,
+args.clip_coef,
+)
+v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
+v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
+v_loss = 0.5 * v_loss_max.mean()
+else:
+v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
+# total loss
+entropy_loss = dis_entropy.mean() + con_entropy.mean()
+loss = (
+dis_pg_loss * args.policy_coef
++ con_pg_loss * args.policy_coef
+- entropy_loss * args.ent_coef
++ v_loss * args.critic_coef
+)
+optimizer.zero_grad()
+loss.backward()
+# Clips gradient norm of an iterable of parameters.
+nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
+optimizer.step()
+"""
+if args.target_kl is not None:
+if approx_kl > args.target_kl:
+break
+"""
+# record mean reward before clear history
+targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
+meanRewardList.append(targetRewardMean)
+targetName = Targets(thisT).name
+# clear this target trainning set buffer
+obs[thisT] = torch.tensor([]).to(device)
+actions[thisT] = torch.tensor([]).to(device)
+dis_logprobs[thisT] = torch.tensor([]).to(device)
+con_logprobs[thisT] = torch.tensor([]).to(device)
+rewards[thisT] = torch.tensor([]).to(device)
+values[thisT] = torch.tensor([]).to(device)
+advantages[thisT] = torch.tensor([]).to(device)
+returns[thisT] = torch.tensor([]).to(device)
+# record rewards for plotting purposes
+writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT])
+print(f"episode over Target{targetName} mean reward:", targetRewardMean)
+TotalRewardMean = np.mean(meanRewardList)
+writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
+writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps)
+# New Record!
+if TotalRewardMean > bestReward:
+bestReward = targetRewardMean
+saveDir = "../PPO-Model/Hybrid-MNN-500-300" + str(TotalRewardMean) + ".pt"
+torch.save(agent, saveDir)
+saveDir = "../PPO-Model/Hybrid-MNN-500-300-Last" + ".pt"
+torch.save(agent, saveDir)
 env.close()
 writer.close()