Fix a minor bug in the prediction function; standardize naming
Fix get_actions_value sampling from the distributions instead of taking the best action when not in training mode; standardize naming
parent 15c1edb6c9
commit 52ccce88bc
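Note on the behavior change in get_actions_value: when the agent is not in training mode it should act greedily (argmax over each discrete branch, the Gaussian mean for the continuous part) instead of sampling. A minimal sketch of the intended selection logic, using the same tensor layout as this diff; select_actions and its arguments are illustrative names rather than code from this repository, and it reads the logits off each Categorical instead of the raw split_logits used in the diff:

import torch

def select_actions(multi_categoricals, con_probs, actions_mean, train_agent: bool):
    if train_agent:
        # training: explore by sampling both the discrete and continuous heads
        dis_act = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
        con_act = con_probs.sample()
    else:
        # evaluation: act greedily, argmax of each discrete branch and the Gaussian mean
        dis_act = torch.stack([torch.argmax(ctgr.logits, dim=1) for ctgr in multi_categoricals])
        con_act = actions_mean
    # concatenate to the (batch, discrete + continuous) layout expected by the env
    return torch.cat([dis_act.T, con_act], dim=1)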
@@ -2,6 +2,9 @@
   <dictionary name="UCUNI">
     <words>
       <w>aimbot</w>
+      <w>logprobs</w>
+      <w>logstd</w>
+      <w>unclipped</w>
     </words>
   </dictionary>
 </component>
@@ -20,9 +20,6 @@ SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
 GAME_NAME = "Aimbot_Hybrid_V3"
 GAME_TYPE = "Mix_Verification"

-# !!!SPECIAL PARAMETERS!!!
-using_targets_num = 3
-
 if __name__ == "__main__":
     args = parse_args()
     random.seed(args.seed)
@@ -61,7 +58,6 @@ if __name__ == "__main__":
     run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
     wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)

-
     @atexit.register
     def save_model():
         # close env
@@ -72,9 +68,8 @@ if __name__ == "__main__":
         torch.save(agent, save_dir)
         print("save model to " + save_dir)

-
     # start the game
-    total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
+    total_update_step = args.target_num * args.total_timesteps // args.datasetSize
     target_steps = [0 for i in range(args.target_num)]
     start_time = time.time()
     state, _, done = env.reset()
@@ -17,10 +17,10 @@ def layer_init(layer, std=np.sqrt(2), bias_const=0.0):

 class PPOAgent(nn.Module):
     def __init__(
-        self,
-        env: Aimbot,
-        this_args:argparse.Namespace,
-        device: torch.device,
+        self,
+        env: Aimbot,
+        this_args: argparse.Namespace,
+        device: torch.device,
     ):
         super(PPOAgent, self).__init__()
         self.device = device
@@ -38,7 +38,7 @@ class PPOAgent(nn.Module):
         self.ray_state_size = env.unity_observation_shape[0] - self.args.total_target_size
         self.state_size_without_ray = self.args.total_target_size
         self.head_input_size = (
-            env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size
+            env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size
         ) # except target state input

         self.unity_discrete_type = env.unity_discrete_type
@@ -65,9 +65,6 @@ class PPOAgent(nn.Module):
         self.actor_mean = nn.ModuleList(
             [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(self.target_num)]
         )
-        # self.actor_logstd =
-        # nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
-        # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
         self.actor_logstd = nn.ParameterList(
             [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
         ) # nn.Parameter(torch.zeros(1, self.continuous_size))
@@ -78,7 +75,7 @@ class PPOAgent(nn.Module):
     def get_value(self, state: torch.Tensor):
         target = state[:, 0].to(torch.int32) # int
         this_state_num = target.size()[0]
-        view_input = state[:, -self.ray_state_size :] # all ray input
+        view_input = state[:, -self.ray_state_size:] # all ray input
         target_input = state[:, : self.state_size_without_ray]
         view_layer = self.view_network(view_input)
         target_layer = torch.stack(
@@ -96,7 +93,7 @@ class PPOAgent(nn.Module):
     def get_actions_value(self, state: torch.Tensor, actions=None):
         target = state[:, 0].to(torch.int32) # int
         this_state_num = target.size()[0]
-        view_input = state[:, -self.ray_state_size :] # all ray input
+        view_input = state[:, -self.ray_state_size:] # all ray input
         target_input = state[:, : self.state_size_without_ray]
         view_layer = self.view_network(view_input)
         target_layer = torch.stack(
@@ -118,8 +115,6 @@ class PPOAgent(nn.Module):
         actions_mean = torch.stack(
             [self.actor_mean[target[i]](middle_layer[i]) for i in range(this_state_num)]
         ) # self.actor_mean(hidden)
-        # action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_logstd(hidden)
-        # action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean)
         action_logstd = torch.stack(
             [torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(this_state_num)]
         )
@@ -134,32 +129,31 @@ class PPOAgent(nn.Module):
         if actions is None:
             if self.train_agent:
                 # select actions base on probability distribution model
-                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
-                conAct = con_probs.sample()
-                actions = torch.cat([disAct.T, conAct], dim=1)
+                dis_act = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
+                con_act = con_probs.sample()
+                actions = torch.cat([dis_act.T, con_act], dim=1)
             else:
                 # select actions base on best probability distribution
-                # disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
-                conAct = actions_mean
-                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
-                conAct = con_probs.sample()
-                actions = torch.cat([disAct.T, conAct], dim=1)
+                dis_act = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
+                con_act = actions_mean
+                actions = torch.cat([dis_act.T, con_act], dim=1)
         else:
-            disAct = actions[:, 0 : self.unity_discrete_type].T
-            conAct = actions[:, self.unity_discrete_type :]
+            dis_act = actions[:, 0: self.unity_discrete_type].T
+            con_act = actions[:, self.unity_discrete_type:]
         dis_log_prob = torch.stack(
-            [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
+            [ctgr.log_prob(act) for act, ctgr in zip(dis_act, multi_categoricals)]
         )
         dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
         return (
             actions,
             dis_log_prob.sum(0),
             dis_entropy.sum(0),
-            con_probs.log_prob(conAct).sum(1),
+            con_probs.log_prob(con_act).sum(1),
             con_probs.entropy().sum(1),
             criticV,
         )
-    def train_net(self, this_train_ind:int,ppo_memories,optimizer) -> tuple:
+
+    def train_net(self, this_train_ind: int, ppo_memories, optimizer) -> tuple:
         start_time = time.time()
         # flatten the batch
         b_obs = ppo_memories.obs[this_train_ind].reshape((-1,) + self.unity_observation_shape)
@@ -171,24 +165,24 @@ class PPOAgent(nn.Module):
         b_values = ppo_memories.values[this_train_ind].reshape(-1)
         b_size = b_obs.size()[0]
         # optimizing the policy and value network
-        b_inds = np.arange(b_size)
-
+        b_index = np.arange(b_size)
+
         for epoch in range(self.args.epochs):
-            print("epoch:",epoch,end="")
+            print("epoch:", epoch, end="")
             # shuffle all datasets
-            np.random.shuffle(b_inds)
+            np.random.shuffle(b_index)
             for start in range(0, b_size, self.args.minibatchSize):
-                print(".",end="")
+                print(".", end="")
                 end = start + self.args.minibatchSize
-                mb_inds = b_inds[start:end]
-                if(np.size(mb_inds)<=1):
+                mb_index = b_index[start:end]
+                if np.size(mb_index) <= 1:
                     break
-                mb_advantages = b_advantages[mb_inds]
+                mb_advantages = b_advantages[mb_index]

                 # normalize advantages
                 if self.args.norm_adv:
                     mb_advantages = (mb_advantages - mb_advantages.mean()) / (
-                        mb_advantages.std() + 1e-8
+                        mb_advantages.std() + 1e-8
                     )

                 (
@@ -197,14 +191,14 @@ class PPOAgent(nn.Module):
                     dis_entropy,
                     new_con_logprob,
                     con_entropy,
-                    newvalue,
-                ) = self.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
+                    new_value,
+                ) = self.get_actions_value(b_obs[mb_index], b_actions[mb_index])
                 # discrete ratio
-                dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
-                dis_ratio = dis_logratio.exp()
+                dis_log_ratio = new_dis_logprob - b_dis_logprobs[mb_index]
+                dis_ratio = dis_log_ratio.exp()
                 # continuous ratio
-                con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
-                con_ratio = con_logratio.exp()
+                con_log_ratio = new_con_logprob - b_con_logprobs[mb_index]
+                con_ratio = con_log_ratio.exp()

                 """
                 # early stop
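The renamed dis_log_ratio / con_log_ratio feed the usual PPO clipped surrogate (the *_pg_loss_orig / *_pg_loss_clip terms referenced in the next hunk, whose construction lies outside this diff). A minimal sketch of that step for one ratio; clipped_pg_loss is an illustrative helper and clip_coef stands in for self.args.clip_coef:

import torch

def clipped_pg_loss(log_ratio: torch.Tensor, advantages: torch.Tensor, clip_coef: float) -> torch.Tensor:
    # PPO surrogate written as a loss: max(-A * r, -A * clip(r, 1 - eps, 1 + eps)), averaged over the minibatch
    ratio = log_ratio.exp()
    pg_loss_orig = -advantages * ratio
    pg_loss_clip = -advantages * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef)
    return torch.max(pg_loss_orig, pg_loss_clip).mean()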
@@ -229,38 +223,38 @@ class PPOAgent(nn.Module):
                 con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()

                 # Value loss
-                newvalue = newvalue.view(-1)
+                new_value = new_value.view(-1)
                 if self.args.clip_vloss:
-                    v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
-                    v_clipped = b_values[mb_inds] + torch.clamp(
-                        newvalue - b_values[mb_inds],
+                    v_loss_unclipped = (new_value - b_returns[mb_index]) ** 2
+                    v_clipped = b_values[mb_index] + torch.clamp(
+                        new_value - b_values[mb_index],
                         -self.args.clip_coef,
                         self.args.clip_coef,
                     )
-                    v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
+                    v_loss_clipped = (v_clipped - b_returns[mb_index]) ** 2
                     v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                     v_loss = 0.5 * v_loss_max.mean()
                 else:
-                    v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
+                    v_loss = 0.5 * ((new_value - b_returns[mb_index]) ** 2).mean()

                 # total loss
                 entropy_loss = dis_entropy.mean() + con_entropy.mean()
                 loss = (
-                    dis_pg_loss * self.args.policy_coef[this_train_ind]
-                    + con_pg_loss * self.args.policy_coef[this_train_ind]
-                    + entropy_loss * self.args.entropy_coef[this_train_ind]
-                    + v_loss * self.args.critic_coef[this_train_ind]
-                )*self.args.loss_coef[this_train_ind]
+                    dis_pg_loss * self.args.policy_coef[this_train_ind]
+                    + con_pg_loss * self.args.policy_coef[this_train_ind]
+                    + entropy_loss * self.args.entropy_coef[this_train_ind]
+                    + v_loss * self.args.critic_coef[this_train_ind]
+                ) * self.args.loss_coef[this_train_ind]

-                if(torch.isnan(loss).any()):
+                if torch.isnan(loss).any():
                     print("LOSS Include NAN!!!")
-                    if(torch.isnan(dis_pg_loss.any())):
+                    if torch.isnan(dis_pg_loss.any()):
                         print("dis_pg_loss include nan")
-                    if(torch.isnan(con_pg_loss.any())):
+                    if torch.isnan(con_pg_loss.any()):
                         print("con_pg_loss include nan")
-                    if(torch.isnan(entropy_loss.any())):
+                    if torch.isnan(entropy_loss.any()):
                         print("entropy_loss include nan")
-                    if(torch.isnan(v_loss.any())):
+                    if torch.isnan(v_loss.any()):
                         print("v_loss include nan")
                     raise

@@ -275,15 +269,15 @@ class PPOAgent(nn.Module):
                 if approx_kl > args.target_kl:
                     break
                 """
-        return (v_loss,dis_pg_loss,con_pg_loss,loss,entropy_loss)
+        return v_loss, dis_pg_loss, con_pg_loss, loss, entropy_loss

     def gae(
-        self,
-        rewards: torch.Tensor,
-        dones: torch.Tensor,
-        values: torch.tensor,
-        next_obs: torch.tensor,
-        next_done: torch.Tensor,
+        self,
+        rewards: torch.Tensor,
+        dones: torch.Tensor,
+        values: torch.tensor,
+        next_obs: torch.tensor,
+        next_done: torch.Tensor,
     ) -> tuple:
         # GAE
         with torch.no_grad():
@@ -294,25 +288,25 @@ class PPOAgent(nn.Module):
                 last_gae_lam = 0
                 for t in reversed(range(data_size)):
                     if t == data_size - 1:
-                        nextnonterminal = 1.0 - next_done
+                        next_non_terminal = 1.0 - next_done
                         next_values = next_value
                     else:
-                        nextnonterminal = 1.0 - dones[t + 1]
+                        next_non_terminal = 1.0 - dones[t + 1]
                         next_values = values[t + 1]
-                    delta = rewards[t] + self.args.gamma * next_values * nextnonterminal - values[t]
+                    delta = rewards[t] + self.args.gamma * next_values * next_non_terminal - values[t]
                     advantages[t] = last_gae_lam = (
-                        delta + self.args.gamma * self.args.gaeLambda * nextnonterminal * last_gae_lam
+                        delta + self.args.gamma * self.args.gaeLambda * next_non_terminal * last_gae_lam
                     )
                 returns = advantages + values
             else:
                 returns = torch.zeros_like(rewards).to(self.device)
                 for t in reversed(range(data_size)):
                     if t == data_size - 1:
-                        nextnonterminal = 1.0 - next_done
+                        next_non_terminal = 1.0 - next_done
                         next_return = next_value
                     else:
-                        nextnonterminal = 1.0 - dones[t + 1]
+                        next_non_terminal = 1.0 - dones[t + 1]
                         next_return = returns[t + 1]
-                    returns[t] = rewards[t] + self.args.gamma * nextnonterminal * next_return
+                    returns[t] = rewards[t] + self.args.gamma * next_non_terminal * next_return
                 advantages = returns - values
-        return advantages, returns
+        return advantages, returns
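For reference, the renamed next_non_terminal is the (1 - done) mask in the standard GAE recursion that this method already implements (a restatement of the existing code, not new behavior), with gamma and gaeLambda as the discount and GAE parameters:

\delta_t = r_t + \gamma V(s_{t+1}) (1 - d_{t+1}) - V(s_t),
\qquad
\hat{A}_t = \delta_t + \gamma \lambda (1 - d_{t+1}) \hat{A}_{t+1}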