Change training dataset storage method
Save the training dataset by its target type. During training, the NN does each backward pass on a single-target training set. This is at least 20 times faster than the last update!
parent 895cd5c118
commit cbc385ca10
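In outline, the change keeps one training buffer per target type, appends each finished round to the buffer of that round's target, and trains only on buffers that have reached the dataset size. Below is a minimal stand-alone sketch of that idea, not the repository code: TARGET_NUM, DATASET_SIZE, add_round, ready_targets and the random round data are illustrative stand-ins for TARGETNUM, args.datasetSize, the per-target buffer lists and trainQueue that appear in the diff.

import torch

TARGET_NUM = 4    # Free, Go, Attack, Defence
DATASET_SIZE = 8  # tiny threshold, just for the demo
device = "cuda" if torch.cuda.is_available() else "cpu"

# one empty observation buffer per target type
obs_buffers = [torch.tensor([]).to(device) for _ in range(TARGET_NUM)]

def add_round(target_type, round_obs):
    # append a finished round's observations to the buffer of its target type
    obs_buffers[target_type] = torch.cat((obs_buffers[target_type], round_obs.to(device)), 0)

def ready_targets():
    # targets whose buffer is large enough to train on (the "trainQueue" idea)
    return [t for t in range(TARGET_NUM) if obs_buffers[t].size(0) >= DATASET_SIZE]

# simulate a few finished rounds with random data
for target in (0, 2, 0, 2, 0):
    add_round(target, torch.randn(3, 5))  # 3 steps, 5-dim observation

for t in ready_targets():
    batch = obs_buffers[t]                # single-target batch -> one backward pass
    print(f"train target {t} on {batch.size(0)} samples")
    obs_buffers[t] = torch.tensor([]).to(device)  # clear after training

As in the diff itself, concatenating onto an empty 1-D tensor relies on torch.cat accepting 1-D empty tensors.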
@@ -10,6 +10,7 @@ import torch.optim as optim
 
 from AimbotEnv import Aimbot
 from tqdm import tqdm
+from enum import Enum
 from torch.distributions.normal import Normal
 from torch.distributions.categorical import Categorical
 from distutils.util import strtobool
@@ -34,11 +35,11 @@ BASE_PORT = 1001
 # max round steps per agent is 2500/Decision_period, 25 seconds
 # !!!check every parameters before run!!!
 
-TOTAL_STEPS = 6000000
+TOTAL_STEPS = 6750000
 BATCH_SIZE = 512
-MAX_TRAINNING_DATASETS = 8000
+MAX_TRAINNING_DATASETS = 3000
 DECISION_PERIOD = 1
-LEARNING_RATE = 8e-4
+LEARNING_RATE = 1e-3
 GAMMA = 0.99
 GAE_LAMBDA = 0.95
 EPOCHS = 4
@@ -54,17 +55,27 @@ NORM_ADV = True
 TRAIN = True
 
 WANDB_TACK = True
-#LOAD_DIR = None
-LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
+LOAD_DIR = None
+#LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
 
 # public data
+class Targets(Enum):
+    Free = 0
+    Go = 1
+    Attack = 2
+    Defence = 3
+    Num = 4
 BASE_WINREWARD = 999
 BASE_LOSEREWARD = -999
 TARGETNUM= 4
 ENV_TIMELIMIT = 30
 RESULT_BROADCAST_RATIO = 2/ENV_TIMELIMIT
-TotalRounds = {"Go":0,"Attack":0,"Free":0}
-WinRounds = {"Go":0,"Attack":0,"Free":0}
+TotalRounds = {"Free":0,"Go":0,"Attack":0}
+WinRounds = {"Free":0,"Go":0,"Attack":0}
 
+# !!!SPECIAL PARAMETERS!!!
+# change it while program is finished
+using_targets_num = 3
+
 
 def parse_args():
@@ -164,7 +175,7 @@ class PPOAgent(nn.Module):
 
     def get_actions_value(self, state: torch.Tensor, actions=None):
        hidden = self.network(state)
-        targets = state[:,0]
+        targets = state[:,0].to(torch.int32)
 
         # discrete
         # iterate over the number of targets (i.e. the number of agents) so that, depending on the target, the corresponding output network is used to compute the output
@@ -321,8 +332,8 @@ if __name__ == "__main__":
     optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
 
     # Tensorboard and WandB Recorder
-    game_name = "Aimbot_Target"
-    game_type = "OffPolicy_HMNN_EndBC"
+    game_name = "Aimbot_Target_Hybrid_Multi_Output"
+    game_type = "OffPolicy_EndBC"
     run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
     if args.wandb_track:
         wandb.init(
@@ -351,14 +362,24 @@ if __name__ == "__main__":
     dones_bf = [[] for i in range(env.unity_agent_num)]
     values_bf = [[] for i in range(env.unity_agent_num)]
 
-    # TRY NOT TO MODIFY: start the game
-    total_update_step = args.total_timesteps // args.datasetSize
-    global_step = 0
+    # start the game
+    total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
+    target_steps = [0 for i in range(TARGETNUM)]
     start_time = time.time()
     state, _, done = env.reset()
     # state = torch.Tensor(next_obs).to(device)
     # next_done = torch.zeros(env.unity_agent_num).to(device)
 
+    # initialize empty training datasets
+    obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_observation_size)
+    actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_action_size)
+    dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+    con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+    rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+    values = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+    advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+    returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+
     for total_steps in range(total_update_step):
         # discunt learning rate, while step == total_update_step lr will be 0
         print("new episode")
@@ -368,24 +389,15 @@ if __name__ == "__main__":
             lrnow = frac * args.lr
             optimizer.param_groups[0]["lr"] = lrnow
 
-        # initialize empty training datasets
-        obs = torch.tensor([]).to(device) # (n,env.unity_observation_size)
-        actions = torch.tensor([]).to(device) # (n,env.unity_action_size)
-        dis_logprobs = torch.tensor([]).to(device) # (n,1)
-        con_logprobs = torch.tensor([]).to(device) # (n,1)
-        rewards = torch.tensor([]).to(device) # (n,1)
-        values = torch.tensor([]).to(device) # (n,1)
-        advantages = torch.tensor([]).to(device) # (n,1)
-        returns = torch.tensor([]).to(device) # (n,1)
 
         # MAIN LOOP: run agent in environment
         i = 0
         training = False
+        trainQueue = []
         while True:
             if i % args.decision_period == 0:
                 step = round(i / args.decision_period)
                 # Choose action by agent
-                global_step += 1 * env.unity_agent_num
 
                 with torch.no_grad():
                     # predict actions
@@ -416,7 +428,8 @@ if __name__ == "__main__":
                         # finished a round, send finished memories to training datasets
                         # compute advantage and discounted reward
                         #print(i,"over")
-                        thisRewardsTensor = broadCastEndReward(rewards_bf[i],state[i,6])
+                        roundTargetType = int(state[i,0])
+                        thisRewardsTensor = broadCastEndReward(rewards_bf[i],roundTargetType)
                         adv, rt = GAE(
                             agent,
                             args,
@@ -427,18 +440,18 @@ if __name__ == "__main__":
                             torch.Tensor([next_done[i]]).to(device),
                         )
                         # send memories to training datasets
-                        obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
-                        actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
-                        dis_logprobs = torch.cat(
-                            (dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
+                        obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
+                        actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
+                        dis_logprobs[roundTargetType] = torch.cat(
+                            (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
                         )
-                        con_logprobs = torch.cat(
-                            (con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
+                        con_logprobs[roundTargetType] = torch.cat(
+                            (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
                         )
-                        rewards = torch.cat((rewards, thisRewardsTensor), 0)
-                        values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
-                        advantages = torch.cat((advantages, adv), 0)
-                        returns = torch.cat((returns, rt), 0)
+                        rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
+                        values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
+                        advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
+                        returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
 
                         # clear buffers
                         ob_bf[i] = []
@@ -448,10 +461,13 @@ if __name__ == "__main__":
                         rewards_bf[i] = []
                         dones_bf[i] = []
                         values_bf[i] = []
-                        print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
+                        print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
 
-                if obs.size()[0] >= args.datasetSize:
-                    # start train NN
+                for i in range(TARGETNUM):
+                    if obs[i].size()[0] >= args.datasetSize:
+                        # start train NN
+                        trainQueue.append(i)
+                if(len(trainQueue)>0):
                     break
                 state, done = next_state, next_done
             else:
@@ -507,128 +523,143 @@ if __name__ == "__main__":
             i += 1
 
         if args.train:
-            # flatten the batch
-            b_obs = obs.reshape((-1,) + env.unity_observation_shape)
-            b_dis_logprobs = dis_logprobs.reshape(-1)
-            b_con_logprobs = con_logprobs.reshape(-1)
-            b_actions = actions.reshape((-1,) + (env.unity_action_size,))
-            b_advantages = advantages.reshape(-1)
-            b_returns = returns.reshape(-1)
-            b_values = values.reshape(-1)
-            b_size = b_obs.size()[0]
-            # Optimizing the policy and value network
-            b_inds = np.arange(b_size)
-            # clipfracs = []
-            for epoch in range(args.epochs):
-                # shuffle all datasets
-                np.random.shuffle(b_inds)
-                for start in range(0, b_size, args.minibatchSize):
-                    end = start + args.minibatchSize
-                    mb_inds = b_inds[start:end]
-                    mb_advantages = b_advantages[mb_inds]
+            meanRewardList = [] # for WANDB
+            # loop all tarining queue
+            for thisT in trainQueue:
+                target_steps[thisT]+=1
+                # flatten the batch
+                b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
+                b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
+                b_con_logprobs = con_logprobs[thisT].reshape(-1)
+                b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
+                b_advantages = advantages[thisT].reshape(-1)
+                b_returns = returns[thisT].reshape(-1)
+                b_values = values[thisT].reshape(-1)
+                b_size = b_obs[thisT].size()[0]
+                # Optimizing the policy and value network
+                b_inds = np.arange(b_size)
+                # clipfracs = []
+                for epoch in range(args.epochs):
+                    # shuffle all datasets
+                    np.random.shuffle(b_inds)
+                    for start in range(0, b_size, args.minibatchSize):
+                        end = start + args.minibatchSize
+                        mb_inds = b_inds[start:end]
+                        mb_advantages = b_advantages[mb_inds]
 
                         # normalize advantages
                         if args.norm_adv:
                             mb_advantages = (mb_advantages - mb_advantages.mean()) / (
                                 mb_advantages.std() + 1e-8
+                            )
 
+                        (
+                            _,
+                            new_dis_logprob,
+                            dis_entropy,
+                            new_con_logprob,
+                            con_entropy,
+                            newvalue,
+                        ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
+                        # discrete ratio
+                        dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
+                        dis_ratio = dis_logratio.exp()
+                        # continuous ratio
+                        con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
+                        con_ratio = con_logratio.exp()
+
+                        """
+                        # early stop
+                        with torch.no_grad():
+                            # calculate approx_kl http://joschu.net/blog/kl-approx.html
+                            old_approx_kl = (-logratio).mean()
+                            approx_kl = ((ratio - 1) - logratio).mean()
+                            clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
+                        """
+
+                        # discrete Policy loss
+                        dis_pg_loss_orig = -mb_advantages * dis_ratio
+                        dis_pg_loss_clip = -mb_advantages * torch.clamp(
+                            dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
+                        )
+                        dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
+                        # continuous Policy loss
+                        con_pg_loss_orig = -mb_advantages * con_ratio
+                        con_pg_loss_clip = -mb_advantages * torch.clamp(
+                            con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
+                        )
+                        con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
+
+                        # Value loss
+                        newvalue = newvalue.view(-1)
+                        if args.clip_vloss:
+                            v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
+                            v_clipped = b_values[mb_inds] + torch.clamp(
+                                newvalue - b_values[mb_inds],
+                                -args.clip_coef,
+                                args.clip_coef,
+                            )
+                            v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
+                            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
+                            v_loss = 0.5 * v_loss_max.mean()
+                        else:
+                            v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
+
+                        # total loss
+                        entropy_loss = dis_entropy.mean() + con_entropy.mean()
+                        loss = (
+                            dis_pg_loss * args.policy_coef
+                            + con_pg_loss * args.policy_coef
+                            - entropy_loss * args.ent_coef
+                            + v_loss * args.critic_coef
                         )
 
-                    (
-                        _,
-                        new_dis_logprob,
-                        dis_entropy,
-                        new_con_logprob,
-                        con_entropy,
-                        newvalue,
-                    ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
-                    # discrete ratio
-                    dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
-                    dis_ratio = dis_logratio.exp()
-                    # continuous ratio
-                    con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
-                    con_ratio = con_logratio.exp()
+                        optimizer.zero_grad()
+                        loss.backward()
+                        # Clips gradient norm of an iterable of parameters.
+                        nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
+                        optimizer.step()
 
                     """
-                    # early stop
-                    with torch.no_grad():
-                        # calculate approx_kl http://joschu.net/blog/kl-approx.html
-                        old_approx_kl = (-logratio).mean()
-                        approx_kl = ((ratio - 1) - logratio).mean()
-                        clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
+                    if args.target_kl is not None:
+                        if approx_kl > args.target_kl:
+                            break
                     """
+                # record mean reward before clear history
+                targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
+                meanRewardList.append(targetRewardMean)
+                targetName = Targets(thisT).name
 
-                    # discrete Policy loss
-                    dis_pg_loss_orig = -mb_advantages * dis_ratio
-                    dis_pg_loss_clip = -mb_advantages * torch.clamp(
-                        dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
-                    )
-                    dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
-                    # continuous Policy loss
-                    con_pg_loss_orig = -mb_advantages * con_ratio
-                    con_pg_loss_clip = -mb_advantages * torch.clamp(
-                        con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
-                    )
-                    con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
+                # clear this target trainning set buffer
+                obs[thisT] = torch.tensor([]).to(device)
+                actions[thisT] = torch.tensor([]).to(device)
+                dis_logprobs[thisT] = torch.tensor([]).to(device)
+                con_logprobs[thisT] = torch.tensor([]).to(device)
+                rewards[thisT] = torch.tensor([]).to(device)
+                values[thisT] = torch.tensor([]).to(device)
+                advantages[thisT] = torch.tensor([]).to(device)
+                returns[thisT] = torch.tensor([]).to(device)
 
-                    # Value loss
-                    newvalue = newvalue.view(-1)
-                    if args.clip_vloss:
-                        v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
-                        v_clipped = b_values[mb_inds] + torch.clamp(
-                            newvalue - b_values[mb_inds],
-                            -args.clip_coef,
-                            args.clip_coef,
-                        )
-                        v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
-                        v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
-                        v_loss = 0.5 * v_loss_max.mean()
-                    else:
-                        v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
-                    # total loss
-                    entropy_loss = dis_entropy.mean() + con_entropy.mean()
-                    loss = (
-                        dis_pg_loss * args.policy_coef
-                        + con_pg_loss * args.policy_coef
-                        - entropy_loss * args.ent_coef
-                        + v_loss * args.critic_coef
-                    )
-
-                    optimizer.zero_grad()
-                    loss.backward()
-                    # Clips gradient norm of an iterable of parameters.
-                    nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
-                    optimizer.step()
-
-                """
-                if args.target_kl is not None:
-                    if approx_kl > args.target_kl:
-                        break
-                """
-            # record rewards for plotting purposes
-            rewardsMean = np.mean(rewards.to("cpu").detach().numpy().copy())
-            writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
-            writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
-            writer.add_scalar("losses/dis_policy_loss", dis_pg_loss.item(), global_step)
-            writer.add_scalar("losses/con_policy_loss", con_pg_loss.item(), global_step)
-            writer.add_scalar("losses/total_loss", loss.item(), global_step)
-            writer.add_scalar("losses/entropy_loss", entropy_loss.item(), global_step)
-            # writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
-            # writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
-            # writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
-            # print("SPS:", int(global_step / (time.time() - start_time)))
-            print("episode over mean reward:", rewardsMean)
-            writer.add_scalar(
-                "charts/SPS", int(global_step / (time.time() - start_time)), global_step
-            )
-            writer.add_scalar("charts/Reward", rewardsMean, global_step)
-            writer.add_scalar("charts/GoWinRatio", WinRounds["Go"]/TotalRounds["Go"], global_step)
-            writer.add_scalar("charts/AttackWinRatio", WinRounds["Attack"]/TotalRounds["Attack"], global_step)
-            writer.add_scalar("charts/FreeWinRatio", WinRounds["Free"]/TotalRounds["Free"], global_step)
-            if rewardsMean > bestReward:
-                bestReward = rewardsMean
-                saveDir = "../PPO-Model/Target-700-500-256-hybrid-" + str(rewardsMean) + ".pt"
+                # record rewards for plotting purposes
+                writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
+                writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT])
+                print(f"episode over Target{targetName} mean reward:", targetRewardMean)
+            TotalRewardMean = np.mean(meanRewardList)
+            writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
+            writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps)
+            # New Record!
+            if TotalRewardMean > bestReward:
+                bestReward = targetRewardMean
+                saveDir = "../PPO-Model/Hybrid-MNN-500-300" + str(TotalRewardMean) + ".pt"
                 torch.save(agent, saveDir)
 
+    saveDir = "../PPO-Model/Hybrid-MNN-500-300-Last" + ".pt"
+    torch.save(agent, saveDir)
     env.close()
     writer.close()