import argparse

import numpy as np
import torch
from torch import nn
from torch.distributions.categorical import Categorical
from torch.distributions.normal import Normal

from aimbotEnv import Aimbot


def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialize the layer with orthogonal weights (gain `std`) and a constant bias."""
    nn.init.orthogonal_(layer.weight, std)
    nn.init.constant_(layer.bias, bias_const)
    return layer


class PPOAgent(nn.Module):
    """Multi-head PPO agent that routes each sample through the sub-networks matching its target type."""

    def __init__(
        self,
        env: Aimbot,
        this_args: argparse.Namespace,
        train_agent: bool,
        target_num: int,
        target_state_size: int,
        time_state_size: int,
        gun_state_size: int,
        my_state_size: int,
        total_t_size: int,
        device: torch.device,
    ):
        super(PPOAgent, self).__init__()
        self.device = device
        self.args = this_args
        self.trainAgent = train_agent
        self.targetNum = target_num
        self.stateSize = env.unity_observation_shape[0]
        self.agentNum = env.unity_agent_num
        self.targetSize = target_state_size
        self.timeSize = time_state_size
        self.gunSize = gun_state_size
        self.myStateSize = my_state_size
        self.raySize = env.unity_observation_shape[0] - total_t_size
        self.nonRaySize = total_t_size
        self.head_input_size = (
            env.unity_observation_shape[0] - self.targetSize - self.timeSize - self.gunSize
        )  # observation size minus the target, time, and gun state inputs

        self.unityDiscreteType = env.unity_discrete_type
        self.discrete_size = env.unity_discrete_size
        self.discrete_shape = list(env.unity_discrete_branches)
        self.continuous_size = env.unity_continuous_size

        self.viewNetwork = nn.Sequential(layer_init(nn.Linear(self.raySize, 200)), nn.LeakyReLU())
        self.targetNetworks = nn.ModuleList(
            [
                nn.Sequential(layer_init(nn.Linear(self.nonRaySize, 100)), nn.LeakyReLU())
                for i in range(target_num)
            ]
        )
        self.middleNetworks = nn.ModuleList(
            [
                nn.Sequential(layer_init(nn.Linear(300, 200)), nn.LeakyReLU())
                for i in range(target_num)
            ]
        )
        self.actor_dis = nn.ModuleList(
            [layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(target_num)]
        )
        self.actor_mean = nn.ModuleList(
            [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(target_num)]
        )
        # self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
        # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
        self.actor_logstd = nn.ParameterList(
            [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(target_num)]
        )  # one learnable log-std vector per target type
        self.critic = nn.ModuleList(
            [layer_init(nn.Linear(200, 1), std=1) for i in range(target_num)]
        )
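        # Architecture overview (descriptive note): rays go through the shared
        # viewNetwork (raySize -> 200); the non-ray state goes through one of
        # `target_num` targetNetworks (nonRaySize -> 100); both are concatenated
        # and fed to the matching middleNetwork (300 -> 200), whose output feeds
        # the per-target discrete head, continuous head, and critic head.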

    def get_value(self, state: torch.Tensor):
        """Return the critic value of each state, using the critic head that matches its target type."""
        target = state[:, 0].to(torch.int32)  # target type index of each sample
        thisStateNum = target.size()[0]
        viewInput = state[:, -self.raySize :]  # all ray inputs
        targetInput = state[:, : self.nonRaySize]
        viewLayer = self.viewNetwork(viewInput)
        targetLayer = torch.stack(
            [self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)]
        )
        middleInput = torch.cat([viewLayer, targetLayer], dim=1)
        middleLayer = torch.stack(
            [self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)]
        )
        criticV = torch.stack(
            [self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]
        )
        return criticV
    def get_actions_value(self, state: torch.Tensor, actions=None):
        """Sample or evaluate hybrid discrete/continuous actions and return log-probs, entropies, and values."""
        target = state[:, 0].to(torch.int32)  # target type index of each sample
        thisStateNum = target.size()[0]
        viewInput = state[:, -self.raySize :]  # all ray inputs
        targetInput = state[:, : self.nonRaySize]
        viewLayer = self.viewNetwork(viewInput)
        targetLayer = torch.stack(
            [self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)]
        )
        middleInput = torch.cat([viewLayer, targetLayer], dim=1)
        middleLayer = torch.stack(
            [self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)]
        )

        # discrete
        # iterate over the samples (one per agent) and route each one through the
        # output head that matches its target type
        dis_logits = torch.stack(
            [self.actor_dis[target[i]](middleLayer[i]) for i in range(thisStateNum)]
        )
        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
        # continuous
        actions_mean = torch.stack(
            [self.actor_mean[target[i]](middleLayer[i]) for i in range(thisStateNum)]
        )
        # action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)])
        # action_logstd = self.actor_logstd.expand_as(actions_mean)
        action_logstd = torch.stack(
            [torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(thisStateNum)]
        )
        action_std = torch.exp(action_logstd)
        con_probs = Normal(actions_mean, action_std)
        # critic
        criticV = torch.stack(
            [self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]
        )

        if actions is None:
            if self.trainAgent:
                # training: sample actions from the probability distributions
                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
                conAct = con_probs.sample()
                actions = torch.cat([disAct.T, conAct], dim=1)
            else:
                # inference: currently also samples from the distributions
                # (a deterministic variant would take the argmax logits and actions_mean instead)
                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
                conAct = con_probs.sample()
                actions = torch.cat([disAct.T, conAct], dim=1)
        else:
            disAct = actions[:, 0 : self.unityDiscreteType].T
            conAct = actions[:, self.unityDiscreteType :]
        dis_log_prob = torch.stack(
            [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
        )
        dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
        return (
            actions,
            dis_log_prob.sum(0),
            dis_entropy.sum(0),
            con_probs.log_prob(conAct).sum(1),
            con_probs.entropy().sum(1),
            criticV,
        )

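    # The gae() method below follows the standard GAE recursion (descriptive note):
    #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t)
    #   A_t     = delta_t + gamma * lambda * (1 - done_{t+1}) * A_{t+1}
    # with returns_t = A_t + V(s_t); when args.gae is disabled it falls back to
    # plain discounted returns with advantages = returns - values.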
    def gae(
        self,
        rewards: torch.Tensor,
        dones: torch.Tensor,
        values: torch.Tensor,
        next_obs: torch.Tensor,
        next_done: torch.Tensor,
    ) -> tuple:
        """Compute advantages and returns (GAE when args.gae is set, otherwise plain discounted returns)."""
        with torch.no_grad():
            next_value = self.get_value(next_obs).reshape(1, -1)
            data_size = rewards.size()[0]
            if self.args.gae:
                advantages = torch.zeros_like(rewards).to(self.device)
                last_gae_lam = 0
                for t in reversed(range(data_size)):
                    if t == data_size - 1:
                        nextnonterminal = 1.0 - next_done
                        next_values = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        next_values = values[t + 1]
                    delta = rewards[t] + self.args.gamma * next_values * nextnonterminal - values[t]
                    advantages[t] = last_gae_lam = (
                        delta + self.args.gamma * self.args.gaeLambda * nextnonterminal * last_gae_lam
                    )
                returns = advantages + values
            else:
                returns = torch.zeros_like(rewards).to(self.device)
                for t in reversed(range(data_size)):
                    if t == data_size - 1:
                        nextnonterminal = 1.0 - next_done
                        next_return = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        next_return = returns[t + 1]
                    returns[t] = rewards[t] + self.args.gamma * nextnonterminal * next_return
                advantages = returns - values
        return advantages, returns
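

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It smoke-tests
# PPOAgent's forward passes with a hypothetical DummyEnv and a SimpleNamespace
# standing in for the real Aimbot environment and the training argparse flags;
# all sizes below are made up, and only the attribute names are taken from how
# this file reads `env` and `self.args`.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    class DummyEnv:
        # Hypothetical stand-in exposing only the attributes PPOAgent reads.
        unity_observation_shape = (40,)
        unity_agent_num = 8
        unity_discrete_type = 3              # number of discrete action branches
        unity_discrete_branches = (3, 3, 2)
        unity_discrete_size = 8              # sum of the branch sizes
        unity_continuous_size = 2

    args = SimpleNamespace(gae=True, gamma=0.99, gaeLambda=0.95)
    device = torch.device("cpu")
    agent = PPOAgent(
        env=DummyEnv(),
        this_args=args,
        train_agent=True,
        target_num=4,
        target_state_size=4,
        time_state_size=1,
        gun_state_size=1,
        my_state_size=4,
        total_t_size=10,
        device=device,
    ).to(device)

    # Fake observations: column 0 is the target type index, the first 10 values
    # are the non-ray state, and the last 30 values are the ray input.
    state = torch.rand(8, 40)
    state[:, 0] = torch.randint(0, 4, (8,)).float()
    actions, dis_logp, dis_ent, con_logp, con_ent, value = agent.get_actions_value(state)
    print(actions.shape, value.shape)  # expected: (8, 5) and (8, 1)

    # Minimal GAE call over a fake rollout of 16 steps for the 8 agents.
    rewards = torch.rand(16, 8)
    dones = torch.zeros(16, 8)
    values = torch.rand(16, 8)
    advantages, returns = agent.gae(rewards, dones, values, next_obs=state, next_done=torch.zeros(8))
    print(advantages.shape, returns.shape)  # expected: (16, 8) for both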