Fix a small bug in the prediction function and normalize naming

Fix get_actions_value still sampling from the distribution instead of taking the best value when not in training mode
Normalize naming
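
In evaluation mode the policy should act greedily rather than sample. A minimal sketch of the corrected selection logic, written as a hypothetical standalone helper (select_actions is not part of the repository; in the diff below this logic lives inside PPOAgent.get_actions_value):

import torch
from torch.distributions import Categorical, Normal

def select_actions(split_logits, actions_mean, actions_std, training: bool):
    # one Categorical per discrete action branch, one Normal for the continuous part
    multi_categoricals = [Categorical(logits=logits) for logits in split_logits]
    con_probs = Normal(actions_mean, actions_std)
    if training:
        # training: sample from the discrete and continuous distributions
        dis_act = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
        con_act = con_probs.sample()
    else:
        # evaluation: most likely discrete action per head, continuous mean
        dis_act = torch.stack([torch.argmax(logits, dim=1) for logits in split_logits])
        con_act = actions_mean
    return torch.cat([dis_act.T, con_act], dim=1)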
This commit is contained in:
Koha9 2023-08-04 03:49:49 +09:00
parent 15c1edb6c9
commit 52ccce88bc
3 changed files with 69 additions and 77 deletions

View File

@@ -2,6 +2,9 @@
<dictionary name="UCUNI">
<words>
<w>aimbot</w>
<w>logprobs</w>
<w>logstd</w>
<w>unclipped</w>
</words>
</dictionary>
</component>

View File

@@ -20,9 +20,6 @@ SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
GAME_NAME = "Aimbot_Hybrid_V3"
GAME_TYPE = "Mix_Verification"
# !!!SPECIAL PARAMETERS!!!
using_targets_num = 3
if __name__ == "__main__":
args = parse_args()
random.seed(args.seed)
@@ -61,7 +58,6 @@ if __name__ == "__main__":
run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
@atexit.register
def save_model():
# close env
@@ -72,9 +68,8 @@ if __name__ == "__main__":
torch.save(agent, save_dir)
print("save model to " + save_dir)
# start the game
total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
total_update_step = args.target_num * args.total_timesteps // args.datasetSize
target_steps = [0 for i in range(args.target_num)]
start_time = time.time()
state, _, done = env.reset()

View File

@@ -17,10 +17,10 @@ def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
class PPOAgent(nn.Module):
def __init__(
self,
env: Aimbot,
this_args:argparse.Namespace,
device: torch.device,
self,
env: Aimbot,
this_args: argparse.Namespace,
device: torch.device,
):
super(PPOAgent, self).__init__()
self.device = device
@@ -38,7 +38,7 @@ class PPOAgent(nn.Module):
self.ray_state_size = env.unity_observation_shape[0] - self.args.total_target_size
self.state_size_without_ray = self.args.total_target_size
self.head_input_size = (
env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size
env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size
) # except target state input
self.unity_discrete_type = env.unity_discrete_type
@@ -65,9 +65,6 @@ class PPOAgent(nn.Module):
self.actor_mean = nn.ModuleList(
[layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(self.target_num)]
)
# self.actor_logstd =
# nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
# self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
self.actor_logstd = nn.ParameterList(
[nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
) # nn.Parameter(torch.zeros(1, self.continuous_size))
@@ -78,7 +75,7 @@ class PPOAgent(nn.Module):
def get_value(self, state: torch.Tensor):
target = state[:, 0].to(torch.int32) # int
this_state_num = target.size()[0]
view_input = state[:, -self.ray_state_size :] # all ray input
view_input = state[:, -self.ray_state_size:] # all ray input
target_input = state[:, : self.state_size_without_ray]
view_layer = self.view_network(view_input)
target_layer = torch.stack(
@@ -96,7 +93,7 @@ class PPOAgent(nn.Module):
def get_actions_value(self, state: torch.Tensor, actions=None):
target = state[:, 0].to(torch.int32) # int
this_state_num = target.size()[0]
view_input = state[:, -self.ray_state_size :] # all ray input
view_input = state[:, -self.ray_state_size:] # all ray input
target_input = state[:, : self.state_size_without_ray]
view_layer = self.view_network(view_input)
target_layer = torch.stack(
@@ -118,8 +115,6 @@ class PPOAgent(nn.Module):
actions_mean = torch.stack(
[self.actor_mean[target[i]](middle_layer[i]) for i in range(this_state_num)]
) # self.actor_mean(hidden)
# action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_logstd(hidden)
# action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean)
action_logstd = torch.stack(
[torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(this_state_num)]
)
@@ -134,32 +129,31 @@ class PPOAgent(nn.Module):
if actions is None:
if self.train_agent:
# select actions base on probability distribution model
disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
conAct = con_probs.sample()
actions = torch.cat([disAct.T, conAct], dim=1)
dis_act = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
con_act = con_probs.sample()
actions = torch.cat([dis_act.T, con_act], dim=1)
else:
# select actions base on best probability distribution
# disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
conAct = actions_mean
disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
conAct = con_probs.sample()
actions = torch.cat([disAct.T, conAct], dim=1)
dis_act = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
con_act = actions_mean
actions = torch.cat([dis_act.T, con_act], dim=1)
else:
disAct = actions[:, 0 : self.unity_discrete_type].T
conAct = actions[:, self.unity_discrete_type :]
dis_act = actions[:, 0: self.unity_discrete_type].T
con_act = actions[:, self.unity_discrete_type:]
dis_log_prob = torch.stack(
[ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
[ctgr.log_prob(act) for act, ctgr in zip(dis_act, multi_categoricals)]
)
dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
return (
actions,
dis_log_prob.sum(0),
dis_entropy.sum(0),
con_probs.log_prob(conAct).sum(1),
con_probs.log_prob(con_act).sum(1),
con_probs.entropy().sum(1),
criticV,
)
def train_net(self, this_train_ind:int,ppo_memories,optimizer) -> tuple:
def train_net(self, this_train_ind: int, ppo_memories, optimizer) -> tuple:
start_time = time.time()
# flatten the batch
b_obs = ppo_memories.obs[this_train_ind].reshape((-1,) + self.unity_observation_shape)
@@ -171,24 +165,24 @@ class PPOAgent(nn.Module):
b_values = ppo_memories.values[this_train_ind].reshape(-1)
b_size = b_obs.size()[0]
# optimizing the policy and value network
b_inds = np.arange(b_size)
b_index = np.arange(b_size)
for epoch in range(self.args.epochs):
print("epoch:",epoch,end="")
print("epoch:", epoch, end="")
# shuffle all datasets
np.random.shuffle(b_inds)
np.random.shuffle(b_index)
for start in range(0, b_size, self.args.minibatchSize):
print(".",end="")
print(".", end="")
end = start + self.args.minibatchSize
mb_inds = b_inds[start:end]
if(np.size(mb_inds)<=1):
mb_index = b_index[start:end]
if np.size(mb_index) <= 1:
break
mb_advantages = b_advantages[mb_inds]
mb_advantages = b_advantages[mb_index]
# normalize advantages
if self.args.norm_adv:
mb_advantages = (mb_advantages - mb_advantages.mean()) / (
mb_advantages.std() + 1e-8
mb_advantages.std() + 1e-8
)
(
@@ -197,14 +191,14 @@ class PPOAgent(nn.Module):
dis_entropy,
new_con_logprob,
con_entropy,
newvalue,
) = self.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
new_value,
) = self.get_actions_value(b_obs[mb_index], b_actions[mb_index])
# discrete ratio
dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
dis_ratio = dis_logratio.exp()
dis_log_ratio = new_dis_logprob - b_dis_logprobs[mb_index]
dis_ratio = dis_log_ratio.exp()
# continuous ratio
con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
con_ratio = con_logratio.exp()
con_log_ratio = new_con_logprob - b_con_logprobs[mb_index]
con_ratio = con_log_ratio.exp()
"""
# early stop
@@ -229,38 +223,38 @@ class PPOAgent(nn.Module):
con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
# Value loss
newvalue = newvalue.view(-1)
new_value = new_value.view(-1)
if self.args.clip_vloss:
v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
v_clipped = b_values[mb_inds] + torch.clamp(
newvalue - b_values[mb_inds],
v_loss_unclipped = (new_value - b_returns[mb_index]) ** 2
v_clipped = b_values[mb_index] + torch.clamp(
new_value - b_values[mb_index],
-self.args.clip_coef,
self.args.clip_coef,
)
v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
v_loss_clipped = (v_clipped - b_returns[mb_index]) ** 2
v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
v_loss = 0.5 * v_loss_max.mean()
else:
v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
v_loss = 0.5 * ((new_value - b_returns[mb_index]) ** 2).mean()
# total loss
entropy_loss = dis_entropy.mean() + con_entropy.mean()
loss = (
dis_pg_loss * self.args.policy_coef[this_train_ind]
+ con_pg_loss * self.args.policy_coef[this_train_ind]
+ entropy_loss * self.args.entropy_coef[this_train_ind]
+ v_loss * self.args.critic_coef[this_train_ind]
)*self.args.loss_coef[this_train_ind]
dis_pg_loss * self.args.policy_coef[this_train_ind]
+ con_pg_loss * self.args.policy_coef[this_train_ind]
+ entropy_loss * self.args.entropy_coef[this_train_ind]
+ v_loss * self.args.critic_coef[this_train_ind]
) * self.args.loss_coef[this_train_ind]
if(torch.isnan(loss).any()):
if torch.isnan(loss).any():
print("LOSS Include NAN!!!")
if(torch.isnan(dis_pg_loss.any())):
if torch.isnan(dis_pg_loss.any()):
print("dis_pg_loss include nan")
if(torch.isnan(con_pg_loss.any())):
if torch.isnan(con_pg_loss.any()):
print("con_pg_loss include nan")
if(torch.isnan(entropy_loss.any())):
if torch.isnan(entropy_loss.any()):
print("entropy_loss include nan")
if(torch.isnan(v_loss.any())):
if torch.isnan(v_loss.any()):
print("v_loss include nan")
raise
@@ -275,15 +269,15 @@ class PPOAgent(nn.Module):
if approx_kl > args.target_kl:
break
"""
return (v_loss,dis_pg_loss,con_pg_loss,loss,entropy_loss)
return v_loss, dis_pg_loss, con_pg_loss, loss, entropy_loss
def gae(
self,
rewards: torch.Tensor,
dones: torch.Tensor,
values: torch.tensor,
next_obs: torch.tensor,
next_done: torch.Tensor,
self,
rewards: torch.Tensor,
dones: torch.Tensor,
values: torch.tensor,
next_obs: torch.tensor,
next_done: torch.Tensor,
) -> tuple:
# GAE
with torch.no_grad():
@@ -294,25 +288,25 @@ class PPOAgent(nn.Module):
last_gae_lam = 0
for t in reversed(range(data_size)):
if t == data_size - 1:
nextnonterminal = 1.0 - next_done
next_non_terminal = 1.0 - next_done
next_values = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
next_non_terminal = 1.0 - dones[t + 1]
next_values = values[t + 1]
delta = rewards[t] + self.args.gamma * next_values * nextnonterminal - values[t]
delta = rewards[t] + self.args.gamma * next_values * next_non_terminal - values[t]
advantages[t] = last_gae_lam = (
delta + self.args.gamma * self.args.gaeLambda * nextnonterminal * last_gae_lam
delta + self.args.gamma * self.args.gaeLambda * next_non_terminal * last_gae_lam
)
returns = advantages + values
else:
returns = torch.zeros_like(rewards).to(self.device)
for t in reversed(range(data_size)):
if t == data_size - 1:
nextnonterminal = 1.0 - next_done
next_non_terminal = 1.0 - next_done
next_return = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
next_non_terminal = 1.0 - dones[t + 1]
next_return = returns[t + 1]
returns[t] = rewards[t] + self.args.gamma * nextnonterminal * next_return
returns[t] = rewards[t] + self.args.gamma * next_non_terminal * next_return
advantages = returns - values
return advantages, returns
return advantages, returns
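
For reference, the recursion that gae() implements (with the renamed next_non_terminal) can be written as a standalone helper. A minimal sketch, assuming 1-D reward/value/done tensors of length T and the same gamma / gaeLambda hyperparameters as in args:

import torch

def compute_gae(rewards, values, dones, next_value, next_done, gamma, gae_lambda):
    # Generalized Advantage Estimation:
    #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t)
    #   A_t     = delta_t + gamma * lambda * (1 - done_{t+1}) * A_{t+1}
    T = rewards.size(0)
    advantages = torch.zeros_like(rewards)
    last_gae_lam = 0.0
    for t in reversed(range(T)):
        if t == T - 1:
            next_non_terminal = 1.0 - next_done
            next_values = next_value
        else:
            next_non_terminal = 1.0 - dones[t + 1]
            next_values = values[t + 1]
        delta = rewards[t] + gamma * next_values * next_non_terminal - values[t]
        last_gae_lam = delta + gamma * gae_lambda * next_non_terminal * last_gae_lam
        advantages[t] = last_gae_lam
    returns = advantages + values
    return advantages, returns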