Compare commits

...

3 Commits

34206b95c5  Multi Agent Each Type Action Select Style  2022-12-14 09:01:29 +09:00
    Multi Agent Each Type Action Select Style.
    waste too much time

1787872e82  wrong remain Time Fix  2022-12-04 09:20:05 +09:00
    wrong remain Time Fix, what a stupid mistake...
    and fix doubled WANDB writer

ad9817e7a4  Totally disparate NN by target  2022-12-03 21:35:33 +09:00
    Totally disparate NN by target.
3 changed files with 176 additions and 202 deletions

3
.gitignore vendored

@@ -83,4 +83,5 @@ crashlytics-build.properties
 /Aimbot-PPO-Python/Backup/
 /Aimbot-PPO-Python/Build/
 /Aimbot-PPO-Python/PPO-Model/
 /Aimbot-PPO-Python/GAIL-Expert-Data/
+/Aimbot-PPO-Python/runs/


@@ -23,40 +23,41 @@ from mlagents_envs.side_channel.side_channel import (
 )
 from typing import List
-bestReward = 0
-DEFAULT_SEED = 933139
-ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy/Aimbot-ParallelEnv"
+bestReward = -1
+DEFAULT_SEED = 9331
+ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv"
 SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
 WAND_ENTITY = "koha9"
-WORKER_ID = 2
-BASE_PORT = 1001
+WORKER_ID = 3
+BASE_PORT = 1002
 # max round steps per agent is 2500/Decision_period, 25 seconds
 # !!!check every parameters before run!!!
-TOTAL_STEPS = 6750000
-BATCH_SIZE = 512
-MAX_TRAINNING_DATASETS = 3000
+TOTAL_STEPS = 3150000
+BATCH_SIZE = 1024
+MAX_TRAINNING_DATASETS = 6000
 DECISION_PERIOD = 1
-LEARNING_RATE = 1e-3
+LEARNING_RATE = 5e-4
 GAMMA = 0.99
 GAE_LAMBDA = 0.95
-EPOCHS = 4
-CLIP_COEF = 0.1
-POLICY_COEF = 1.0
-ENTROPY_COEF = 0.01
-CRITIC_COEF = 0.5
-TARGET_LEARNING_RATE = 5e-5
+EPOCHS = 3
+CLIP_COEF = 0.11
+LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
+POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
+ENTROPY_COEF = [0.1, 0.1, 0.1, 0.1]
+CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
+TARGET_LEARNING_RATE = 1e-6
 ANNEAL_LEARNING_RATE = True
 CLIP_VLOSS = True
 NORM_ADV = True
 TRAIN = True
-WANDB_TACK = True
+WANDB_TACK = False
 LOAD_DIR = None
-#LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
+#LOAD_DIR = "../PPO-Model/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670522099-freeonly-12/Aimbot-target-last.pt"
 # public data
 class Targets(Enum):
@@ -65,11 +66,17 @@ class Targets(Enum):
 Attack = 2
 Defence = 3
 Num = 4
+TARGET_STATE_SIZE = 6
+INAREA_STATE_SIZE = 1
+TIME_STATE_SIZE = 1
+GUN_STATE_SIZE = 1
+MY_STATE_SIZE = 4
+TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
 BASE_WINREWARD = 999
 BASE_LOSEREWARD = -999
 TARGETNUM= 4
 ENV_TIMELIMIT = 30
-RESULT_BROADCAST_RATIO = 2/ENV_TIMELIMIT
+RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
 TotalRounds = {"Free":0,"Go":0,"Attack":0}
 WinRounds = {"Free":0,"Go":0,"Attack":0}
@@ -116,6 +123,8 @@ def parse_args():
 help="load model directory")
 parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
 help="the number of steps to run in each environment per policy rollout")
+parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
+help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
 # GAE loss
 parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
@@ -155,38 +164,43 @@ class PPOAgent(nn.Module):
 def __init__(self, env: Aimbot,targetNum:int):
 super(PPOAgent, self).__init__()
 self.targetNum = targetNum
+self.stateSize = env.unity_observation_shape[0]
+self.targetSize = TARGET_STATE_SIZE
+self.timeSize = TIME_STATE_SIZE
+self.gunSize = GUN_STATE_SIZE
+self.myStateSize = MY_STATE_SIZE
+self.totalMiddleSize = TOTAL_T_SIZE
+self.head_input_size = env.unity_observation_shape[0] - self.targetSize-self.timeSize-self.gunSize# except target state input
 self.discrete_size = env.unity_discrete_size
 self.discrete_shape = list(env.unity_discrete_branches)
 self.continuous_size = env.unity_continuous_size
 self.network = nn.Sequential(
-layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 500)),
-nn.ReLU(),
-layer_init(nn.Linear(500, 300)),
-nn.ReLU(),
+layer_init(nn.Linear(env.unity_observation_shape[0], 300)),
+nn.Tanh(),
+layer_init(nn.Linear(300, 200)),
+nn.Tanh(),
 )
-self.actor_dis = nn.ModuleList([layer_init(nn.Linear(300, self.discrete_size), std=0.01) for i in range(targetNum)])
-self.actor_mean = nn.ModuleList([layer_init(nn.Linear(300, self.continuous_size), std=0.01) for i in range(targetNum)])
-self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(targetNum)])
-self.critic = layer_init(nn.Linear(300, 1), std=1)
+self.actor_dis = layer_init(nn.Linear(200, self.discrete_size), std=0.5)
+self.actor_mean = layer_init(nn.Linear(200, self.continuous_size), std=0.5)
+self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
+self.critic = layer_init(nn.Linear(200, 1), std=1)
 def get_value(self, state: torch.Tensor):
 return self.critic(self.network(state))
 def get_actions_value(self, state: torch.Tensor, actions=None):
 hidden = self.network(state)
-targets = state[:,0].to(torch.int32)
 # discrete
-# loop over the targets (i.e. over the agents) so that each one uses the output head matching its target
-dis_logits = torch.stack([self.actor_dis[targets[i]](hidden[i]) for i in range(targets.size()[0])])
+dis_logits = self.actor_dis(hidden)
 split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
 multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
 # continuous
-actions_mean = torch.stack([self.actor_mean[targets[i]](hidden[i]) for i in range(targets.size()[0])]) # self.actor_mean(hidden)
-# action_logstd = torch.stack([self.actor_logstd[targets[i]].expand_as(actions_mean) for i in range(targets.size()[0])]) # self.actor_logstd.expand_as(actions_mean)
-# print(action_logstd)
-action_std = torch.squeeze(torch.stack([torch.exp(self.actor_logstd[targets[i]]) for i in range(targets.size()[0])]),dim = -1) # torch.exp(action_logstd)
+actions_mean = self.actor_mean(hidden)
+action_logstd = self.actor_logstd.expand_as(actions_mean)
+action_std = torch.exp(action_logstd)
 con_probs = Normal(actions_mean, action_std)
 if actions is None:
@@ -301,11 +315,11 @@ def broadCastEndReward(rewardBF:list,remainTime:float):
 if (rewardBF[-1]<=-500):
 # print("Lose DO NOT BROAD CAST",rewardBF[-1])
 thisRewardBF[-1] = rewardBF[-1]-BASE_LOSEREWARD
-thisRewardBF = (np.asarray(thisRewardBF)).tolist()
+thisRewardBF = thisRewardBF
 elif (rewardBF[-1]>=500):
 # print("Win! Broadcast reward!",rewardBF[-1])
 thisRewardBF[-1] = rewardBF[-1]-BASE_WINREWARD
-thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*RESULT_BROADCAST_RATIO)).tolist()
+thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*args.result_broadcast_ratio)).tolist()
 else:
 print("!!!!!DIDNT GET RESULT REWARD!!!!!!",rewardBF[-1])
 return torch.Tensor(thisRewardBF).to(device)
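The reward broadcast this hunk parameterizes can be read as: strip the large win/lose marker from the round's final reward and, on a win, spread a bonus proportional to the time left on the clock over every step. A rough standalone sketch, with the base rewards and ratio treated as free parameters rather than the repository's exact helper:

import numpy as np

BASE_WINREWARD = 999
BASE_LOSEREWARD = -999

def broadcast_end_reward(rewards, remain_time, broadcast_ratio):
    """Rewrite the rewards of one finished round; the final entry carries the +/-999 marker."""
    out = list(rewards)
    if out[-1] <= -500:                       # lost round: remove the lose marker only
        out[-1] = out[-1] - BASE_LOSEREWARD
    elif out[-1] >= 500:                      # won round: remove the win marker and
        out[-1] = out[-1] - BASE_WINREWARD    # add a time bonus to every step
        out = (np.asarray(out) + remain_time * broadcast_ratio).tolist()
    return out

# e.g. a 30 s round won with 12 s to spare, ratio 1/30:
print(broadcast_end_reward([0.1, -0.2, 999.5], 12.0, 1 / 30))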
@@ -322,17 +336,22 @@ if __name__ == "__main__":
 # Initialize environment anget optimizer
 aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID);
 env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimBotsideChannel])
+agentList = []
+optimizers = []
 if args.load_dir is None:
-agent = PPOAgent(env,TARGETNUM).to(device)
+for i in range(using_targets_num):
+agentList.append(PPOAgent(env,TARGETNUM).to(device))
+optimizers.append(optim.Adam(agentList[i].parameters(), lr=args.lr, eps=1e-5))
 else:
-agent = torch.load(args.load_dir)
-print("Load Agent", args.load_dir)
-print(agent.eval())
-optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
+print("NAH")
+# !!!not finished
+# agent = torch.load(args.load_dir)
+# print("Load Agent", args.load_dir)
+# print(agent.eval())
 # Tensorboard and WandB Recorder
-game_name = "Aimbot_Target_Hybrid_Multi_Output"
+game_name = "Aimbot_Target_Hybrid_PMNN_V2"
 game_type = "OffPolicy_EndBC"
 run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
 if args.wandb_track:
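The pattern introduced here, one independent network plus optimizer per task type, reduces to the following sketch; the PolicyNet class and its sizes are placeholders, not the repository's PPOAgent:

import torch
import torch.nn as nn
import torch.optim as optim

class PolicyNet(nn.Module):
    # Stand-in for PPOAgent: any small actor-critic would do here.
    def __init__(self, obs_size=36, act_size=5):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(obs_size, 64), nn.Tanh(), nn.Linear(64, act_size))

    def forward(self, x):
        return self.body(x)

TARGET_NUM = 4           # Free / Go / Attack / Defence
LR = 5e-4

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent_list = [PolicyNet().to(device) for _ in range(TARGET_NUM)]
optimizers = [optim.Adam(agent.parameters(), lr=LR, eps=1e-5) for agent in agent_list]

# Each target type later trains only its own network:
# optimizers[t].zero_grad(); loss_t.backward(); optimizers[t].step()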
@@ -382,38 +401,51 @@ if __name__ == "__main__":
 for total_steps in range(total_update_step):
 # discunt learning rate, while step == total_update_step lr will be 0
-print("new episode")
 if args.annealLR:
 finalRatio = TARGET_LEARNING_RATE/args.lr
-frac = 1.0 - finalRatio*((total_steps - 1.0) / total_update_step)
+frac = 1.0 - ((total_steps + 1.0) / total_update_step)
 lrnow = frac * args.lr
-optimizer.param_groups[0]["lr"] = lrnow
+for optimizer in optimizers:
+optimizer.param_groups[0]["lr"] = lrnow
+else:
+lrnow = args.lr
+print("new episode",total_steps,"learning rate = ",lrnow)
 # MAIN LOOP: run agent in environment
-i = 0
+step = 0
 training = False
 trainQueue = []
+last_reward = [0.for i in range(env.unity_agent_num)]
+action = torch.zeros((env.unity_agent_num,env.unity_discrete_type+env.unity_continuous_size))
+dis_logprob = torch.zeros((env.unity_agent_num,env.unity_discrete_size))
+con_logprob = torch.zeros((env.unity_agent_num,env.unity_continuous_size))
+value = torch.zeros((env.unity_agent_num,1))
 while True:
-if i % args.decision_period == 0:
-step = round(i / args.decision_period)
+if step % args.decision_period == 0:
+step += 1
 # Choose action by agent
 with torch.no_grad():
 # predict actions
-action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
-torch.Tensor(state).to(device)
-)
-value = value.flatten()
+for i in range(env.unity_agent_num):
+actTarget = int(state[i][0])
+act, dis_lgprb, _, con_lgprb, _, vl = agentList[actTarget].get_actions_value(
+torch.Tensor([state[i]]).to(device)
+)
+action[i] = act
+dis_logprob[i] = dis_lgprb.squeeze(0)
+con_logprob[i] = con_lgprb.squeeze(0)
+value[i] = vl.squeeze(0)
 # variable from GPU to CPU
 action_cpu = action.cpu().numpy()
 dis_logprob_cpu = dis_logprob.cpu().numpy()
 con_logprob_cpu = con_logprob.cpu().numpy()
-value_cpu = value.cpu().numpy()
+value_cpu = value.flatten().cpu().numpy()
 # Environment step
 next_state, reward, next_done = env.step(action_cpu)
 # save memories
 for i in range(env.unity_agent_num):
 # save memories to buffers
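Because the first observation feature encodes the round's target type, the rollout loop above routes each agent's state to the matching network one at a time. A compact sketch of that routing, with a toy two-network setup standing in for agentList:

import torch
import torch.nn as nn

torch.manual_seed(0)
agent_num, obs_size = 8, 6
# Two toy "policies"; indices 0 and 1 stand for two target types.
policies = [nn.Linear(obs_size, 3), nn.Linear(obs_size, 3)]

# Column 0 of each observation is the integer target id.
state = torch.rand(agent_num, obs_size)
state[:, 0] = torch.randint(0, 2, (agent_num,)).float()

actions = torch.zeros(agent_num, 3)
with torch.no_grad():
    for i in range(agent_num):
        target = int(state[i][0])                    # which policy owns this agent
        actions[i] = policies[target](state[i].unsqueeze(0)).squeeze(0)
print(actions.shape)  # torch.Size([8, 3])

Batching all agents that share a target into one forward pass would be the natural optimization of this per-agent loop.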
@@ -421,22 +453,24 @@ if __name__ == "__main__":
 act_bf[i].append(action_cpu[i])
 dis_logprobs_bf[i].append(dis_logprob_cpu[i])
 con_logprobs_bf[i].append(con_logprob_cpu[i])
-rewards_bf[i].append(reward[i])
+rewards_bf[i].append(reward[i]+last_reward[i])
 dones_bf[i].append(done[i])
 values_bf[i].append(value_cpu[i])
+remainTime = state[i,TARGET_STATE_SIZE]
 if next_done[i] == True:
 # finished a round, send finished memories to training datasets
 # compute advantage and discounted reward
 #print(i,"over")
+endTarget = int(ob_bf[i][0][0])
 roundTargetType = int(state[i,0])
-thisRewardsTensor = broadCastEndReward(rewards_bf[i],roundTargetType)
+thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
 adv, rt = GAE(
-agent,
+agentList[endTarget],
 args,
 thisRewardsTensor,
 torch.Tensor(dones_bf[i]).to(device),
 torch.tensor(values_bf[i]).to(device),
-torch.tensor(next_state[i]).to(device),
+torch.tensor(next_state[i]).to(device).unsqueeze(0),
 torch.Tensor([next_done[i]]).to(device),
 )
 # send memories to training datasets
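For reference, the advantage and return computation that GAE(...) is being fed here is the standard generalized advantage estimation recurrence; the following is a minimal sketch of that recurrence, not the repository's GAE helper, whose exact signature differs:

import torch

def gae(rewards, dones, values, next_value, next_done, gamma=0.99, lam=0.95):
    """Standard GAE over one agent's finished trajectory (1-D tensors)."""
    T = rewards.size(0)
    advantages = torch.zeros_like(rewards)
    last_gae = 0.0
    for t in reversed(range(T)):
        if t == T - 1:
            next_nonterminal = 1.0 - next_done
            next_val = next_value
        else:
            next_nonterminal = 1.0 - dones[t + 1]
            next_val = values[t + 1]
        delta = rewards[t] + gamma * next_val * next_nonterminal - values[t]
        last_gae = delta + gamma * lam * next_nonterminal * last_gae
        advantages[t] = last_gae
    returns = advantages + values
    return advantages, returns

adv, ret = gae(torch.rand(5), torch.zeros(5), torch.rand(5), torch.tensor(0.5), torch.tensor(0.0))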
@@ -471,13 +505,14 @@ if __name__ == "__main__":
 break
 state, done = next_state, next_done
 else:
+step += 1
 # skip this step use last predict action
-next_obs, reward, next_done = env.step(action_cpu)
+next_state, reward, next_done = env.step(action_cpu)
 # save memories
 for i in range(env.unity_agent_num):
 if next_done[i] == True:
 #print(i,"over???")
-# save last memories to buffers
+# save memories to buffers
 ob_bf[i].append(state[i])
 act_bf[i].append(action_cpu[i])
 dis_logprobs_bf[i].append(dis_logprob_cpu[i])
@@ -485,30 +520,33 @@ if __name__ == "__main__":
 rewards_bf[i].append(reward[i])
 dones_bf[i].append(done[i])
 values_bf[i].append(value_cpu[i])
+remainTime = state[i,TARGET_STATE_SIZE]
 # finished a round, send finished memories to training datasets
 # compute advantage and discounted reward
+roundTargetType = int(state[i,0])
+thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
 adv, rt = GAE(
-agent,
+agentList[roundTargetType],
 args,
-torch.tensor(rewards_bf[i]).to(device),
+thisRewardsTensor,
 torch.Tensor(dones_bf[i]).to(device),
 torch.tensor(values_bf[i]).to(device),
-torch.tensor(next_state[i]).to(device),
+torch.Tensor(next_state[i]).to(device).unsqueeze(dim = 0),
 torch.Tensor([next_done[i]]).to(device),
 )
 # send memories to training datasets
-obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
-actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
-dis_logprobs = torch.cat(
-(dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
+obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
+actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
+dis_logprobs[roundTargetType] = torch.cat(
+(dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
 )
-con_logprobs = torch.cat(
-(con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
+con_logprobs[roundTargetType] = torch.cat(
+(con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
 )
-rewards = torch.cat((rewards, torch.tensor(rewards_bf[i]).to(device)), 0)
-values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
-advantages = torch.cat((advantages, adv), 0)
-returns = torch.cat((returns, rt), 0)
+rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
+values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
+advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
+returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
 # clear buffers
 ob_bf[i] = []
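The buffer change in this hunk amounts to keeping one growing tensor per target type instead of a single shared pool; a small sketch of that bookkeeping, with list-of-tensors storage indexed by target id (purely illustrative):

import torch

TARGET_NUM = 4
obs_dim = 6

# One (0, obs_dim) tensor per target type; finished rounds are appended to the
# slot matching the target they were played under.
obs = [torch.zeros((0, obs_dim)) for _ in range(TARGET_NUM)]
rewards = [torch.zeros(0) for _ in range(TARGET_NUM)]

def add_round(target_id, round_obs, round_rewards):
    obs[target_id] = torch.cat((obs[target_id], round_obs), 0)
    rewards[target_id] = torch.cat((rewards[target_id], round_rewards), 0)

add_round(2, torch.rand(10, obs_dim), torch.rand(10))  # an "Attack" round of 10 steps
print(obs[2].size()[0], "samples collected for target 2")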
@@ -518,8 +556,10 @@ if __name__ == "__main__":
 rewards_bf[i] = []
 dones_bf[i] = []
 values_bf[i] = []
-print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
-state, done = next_state, next_done
+print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
+state = next_state
+last_reward = reward
 i += 1
 if args.train:
@@ -535,14 +575,16 @@ if __name__ == "__main__":
 b_advantages = advantages[thisT].reshape(-1)
 b_returns = returns[thisT].reshape(-1)
 b_values = values[thisT].reshape(-1)
-b_size = b_obs[thisT].size()[0]
+b_size = b_obs.size()[0]
 # Optimizing the policy and value network
 b_inds = np.arange(b_size)
 # clipfracs = []
 for epoch in range(args.epochs):
+print(epoch,end="")
 # shuffle all datasets
 np.random.shuffle(b_inds)
 for start in range(0, b_size, args.minibatchSize):
+print(".",end="")
 end = start + args.minibatchSize
 mb_inds = b_inds[start:end]
 mb_advantages = b_advantages[mb_inds]
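The epoch and minibatch loop around these lines is the usual shuffled-index pattern; roughly, and with made-up sizes:

import numpy as np

b_size, minibatch_size, epochs = 6000, 512, 3
b_inds = np.arange(b_size)
for epoch in range(epochs):
    np.random.shuffle(b_inds)                    # reshuffle the whole dataset each epoch
    for start in range(0, b_size, minibatch_size):
        end = start + minibatch_size
        mb_inds = b_inds[start:end]              # indices of one minibatch
        # ... index b_obs / b_actions / b_advantages with mb_inds and do one update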
@@ -560,7 +602,7 @@ if __name__ == "__main__":
 new_con_logprob,
 con_entropy,
 newvalue,
-) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
+) = agentList[thisT].get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
 # discrete ratio
 dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
 dis_ratio = dis_logratio.exp()
@@ -608,17 +650,17 @@ if __name__ == "__main__":
 # total loss
 entropy_loss = dis_entropy.mean() + con_entropy.mean()
 loss = (
-dis_pg_loss * args.policy_coef
-+ con_pg_loss * args.policy_coef
-- entropy_loss * args.ent_coef
-+ v_loss * args.critic_coef
-)
-optimizer.zero_grad()
+dis_pg_loss * POLICY_COEF[thisT]
++ con_pg_loss * POLICY_COEF[thisT]
++ entropy_loss * ENTROPY_COEF[thisT]
++ v_loss * CRITIC_COEF[thisT]
+)*LOSS_COEF[thisT]
+optimizers[thisT].zero_grad()
 loss.backward()
 # Clips gradient norm of an iterable of parameters.
-nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
-optimizer.step()
+nn.utils.clip_grad_norm_(agentList[thisT].parameters(), args.max_grad_norm)
+optimizers[thisT].step()
 """
 if args.target_kl is not None:
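The loss assembly switches from global argparse coefficients to per-target coefficient lists; in isolation the combination being computed looks like the sketch below (the tensor values are made up). Note also that, relative to the earlier version, the entropy term's sign changes from minus to plus in this hunk.

import torch

# Per-target coefficient tables, indexed by the target id being trained.
LOSS_COEF    = [1.0, 1.0, 1.0, 1.0]
POLICY_COEF  = [1.0, 1.0, 1.0, 1.0]
ENTROPY_COEF = [0.1, 0.1, 0.1, 0.1]
CRITIC_COEF  = [0.5, 0.5, 0.5, 0.5]

thisT = 1                          # target currently being optimized
dis_pg_loss = torch.tensor(0.30)   # discrete-action policy loss
con_pg_loss = torch.tensor(0.20)   # continuous-action policy loss
entropy_loss = torch.tensor(0.05)
v_loss = torch.tensor(0.40)

loss = (
    dis_pg_loss * POLICY_COEF[thisT]
    + con_pg_loss * POLICY_COEF[thisT]
    + entropy_loss * ENTROPY_COEF[thisT]
    + v_loss * CRITIC_COEF[thisT]
) * LOSS_COEF[thisT]
print(loss)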
@@ -626,6 +668,7 @@ if __name__ == "__main__":
 break
 """
 # record mean reward before clear history
+print("done")
 targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
 meanRewardList.append(targetRewardMean)
 targetName = Targets(thisT).name
@@ -642,7 +685,6 @@ if __name__ == "__main__":
 # record rewards for plotting purposes
 writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
-writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
 writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
 writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
 writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
@@ -656,10 +698,12 @@ if __name__ == "__main__":
 # New Record!
 if TotalRewardMean > bestReward:
 bestReward = targetRewardMean
-saveDir = "../PPO-Model/Hybrid-MNN-500-300" + str(TotalRewardMean) + ".pt"
-torch.save(agent, saveDir)
+for i in range(using_targets_num):
+saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) +"_"+ str(i)+".pt"
+torch.save(agentList[i], saveDir)
-saveDir = "../PPO-Model/Hybrid-MNN-500-300-Last" + ".pt"
-torch.save(agent, saveDir)
+for i in range(using_targets_num):
+saveDir = "../PPO-Model/"+ run_name +"_last_"+ str(i) + ".pt"
+torch.save(agentList[i], saveDir)
 env.close()
 writer.close()


@@ -601,13 +601,15 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": 2,
 "metadata": {},
 "outputs": [],
 "source": [
+"import numpy as np\n",
 "import torch\n",
 "import torch.nn as nn\n",
 "import torch.optim as optim\n",
+"from AimbotEnv import Aimbot\n",
 "from torch.distributions.normal import Normal\n",
 "from torch.distributions.categorical import Categorical\n",
 "device = torch.device(\"cuda\" if torch.cuda.is_available() and True else \"cpu\")\n",
@@ -620,39 +622,37 @@
 "class PPOAgent(nn.Module):\n",
 " def __init__(self, env: Aimbot,targetNum:int):\n",
 " super(PPOAgent, self).__init__()\n",
-" self.targetNum = targetNum\n",
+" self.stateSize = env.unity_observation_shape[0]\n",
+"\n",
 " self.discrete_size = env.unity_discrete_size\n",
 " self.discrete_shape = list(env.unity_discrete_branches)\n",
 " self.continuous_size = env.unity_continuous_size\n",
 "\n",
 " self.network = nn.Sequential(\n",
-" layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 500)),\n",
-" nn.ReLU(),\n",
-" layer_init(nn.Linear(500, 300)),\n",
-" nn.ReLU(),\n",
+" layer_init(nn.Linear(env.unity_observation_shape[0], 300)),\n",
+" nn.Tanh(),\n",
+" layer_init(nn.Linear(300, 200)),\n",
+" nn.Tanh(),\n",
 " )\n",
-" self.actor_dis = nn.ModuleList([layer_init(nn.Linear(300, self.discrete_size), std=0.01) for i in range(targetNum)])\n",
-" self.actor_mean = nn.ModuleList([layer_init(nn.Linear(300, self.continuous_size), std=0.01) for i in range(targetNum)])\n",
-" self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(targetNum)])\n",
-" self.critic = layer_init(nn.Linear(300, 1), std=1)\n",
+" self.actor_dis = layer_init(nn.Linear(200, self.discrete_size), std=0.5)\n",
+" self.actor_mean = layer_init(nn.Linear(200, self.continuous_size), std=0.5)\n",
+" self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))\n",
+" self.critic = layer_init(nn.Linear(200, 1), std=1)\n",
 "\n",
 " def get_value(self, state: torch.Tensor):\n",
 " return self.critic(self.network(state))\n",
 "\n",
 " def get_actions_value(self, state: torch.Tensor, actions=None):\n",
 " hidden = self.network(state)\n",
-" targets = torch.argmax(state[:,0:self.targetNum],dim=1)\n",
 "\n",
 " # discrete\n",
-" # loop over the targets (i.e. over the agents) so that each one uses the output head matching its target\n",
-" dis_logits = torch.stack([self.actor_dis[targets[i]](hidden[i]) for i in range(targets.size()[0])])\n",
+" dis_logits = self.actor_dis(hidden)\n",
 " split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)\n",
 " multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]\n",
 " # continuous\n",
-" actions_mean = torch.stack([self.actor_mean[targets[i]](hidden[i]) for i in range(targets.size()[0])]) # self.actor_mean(hidden)\n",
-" # action_logstd = torch.stack([self.actor_logstd[targets[i]].expand_as(actions_mean) for i in range(targets.size()[0])]) # self.actor_logstd.expand_as(actions_mean)\n",
-" # print(action_logstd)\n",
-" action_std = torch.squeeze(torch.stack([torch.exp(self.actor_logstd[targets[i]]) for i in range(targets.size()[0])]),dim = -1) # torch.exp(action_logstd)\n",
+" actions_mean = self.actor_mean(hidden)\n",
+" action_logstd = self.actor_logstd.expand_as(actions_mean)\n",
+" action_std = torch.exp(action_logstd)\n",
 " con_probs = Normal(actions_mean, action_std)\n",
 "\n",
 " if actions is None:\n",
@@ -680,117 +680,46 @@
 " con_probs.log_prob(conAct).sum(1),\n",
 " con_probs.entropy().sum(1),\n",
 " self.critic(hidden),\n",
-" )\n",
-"agent = PPOAgent(env,4).to(device)"
+" )"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": 3,
+"metadata": {},
+"outputs": [],
+"source": [
+"ppp = \"../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv\"\n",
+"env = Aimbot(envPath=ppp, workerID=1, basePort=1000,side_channels=[])\n",
+"agent_list = []\n",
+"optimizers = []\n",
+"for i in range(3):\n",
+" agent_list.append(PPOAgent(env=env,targetNum=3).to('cuda'))\n",
+" optimizers.append(optim.Adam(agent_list[i].parameters(),lr=1e-4))"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 5,
 "metadata": {},
 "outputs": [
 {
 "data": {
 "text/plain": [
-"array([[ 1. , -10.343613 , 0. , -7.367299 ,\n",
" 0. , 0. , 30. , -10.343662 ,\n",
" 1. , -33.708736 , 1. , 1. ,\n",
" 1. , 1. , 2. , 1. ,\n",
" 1. , 1. , 2. , 2. ,\n",
" 2. , 1. , 1. , 1. ,\n",
" 33.270493 , 39.50663 , 49.146526 , 32.595673 ,\n",
" 30.21616 , 21.163797 , 46.9299 , 1.3264331 ,\n",
" 1.2435672 , 1.2541904 , 30.08522 , 30.041445 ,\n",
" 21.072094 , 0. ],\n",
" [ 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 30. , -5.5892515 ,\n",
" 1. , -29.907726 , 1. , 1. ,\n",
" 1. , 1. , 2. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 41.408752 , 47.830173 , 45.03225 , 31.905174 ,\n",
" 41.849663 , 41.849648 , 43.001434 , 45.0322 ,\n",
" 47.48242 , 40.00285 , 41.668346 , 41.607723 ,\n",
" 41.668335 , 0. ],\n",
" [ 1. , 2.9582403 , 0. , -4.699738 ,\n",
" 0. , 0. , 30. , -5.412487 ,\n",
" 1. , -32.79967 , 1. , 2. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 2. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 20.17488 , 49.507687 , 48.162056 , 45.98998 ,\n",
" 44.75835 , 31.08564 , 32.865173 , 24.676666 ,\n",
" 12.952409 , 39.69923 , 44.564423 , 44.49966 ,\n",
" 44.564495 , 0. ],\n",
" [ 2. , -0.20171738, 0. , -10.340863 ,\n",
" 0. , 0. , 30. , -22.987915 ,\n",
" 1. , -34.37514 , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 2. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 11.631058 , 13.872022 , 18.006863 , 27.457632 ,\n",
" 46.343067 , 46.343094 , 20.155125 , 49.867714 ,\n",
" 52.965984 , 56.775608 , 46.14223 , 46.075138 ,\n",
" 46.142246 , 0. ],\n",
" [ 2. , -14.687862 , 0. , -12.615574 ,\n",
" 0. , 0. , 30. , 15.125373 ,\n",
" 1. , -30.849268 , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 2. ,\n",
" 52.430542 , 48.912865 , 46.05145 , 43.974594 ,\n",
" 42.796673 , 26.467875 , 11.072432 , 7.190229 ,\n",
" 5.483198 , 4.5500183 , 42.611244 , 42.549267 ,\n",
" 18.856438 , 0. ],\n",
" [ 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 30. , -4.0314903 ,\n",
" 1. , -29.164669 , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 44.074184 , 46.9762 , 44.228096 , 42.2335 ,\n",
" 41.102253 , 41.102367 , 42.233757 , 44.22849 ,\n",
" 44.321827 , 37.335304 , 40.924183 , 40.86467 ,\n",
" 40.924236 , 0. ],\n",
" [ 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 30. , -18.603981 ,\n",
" 1. , -29.797592 , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 2. , 2. , 2. ,\n",
" 19.134174 , 22.76088 , 29.468704 , 42.88739 ,\n",
" 41.738823 , 41.739002 , 42.88781 , 44.913647 ,\n",
" 47.704174 , 51.135338 , 20.418388 , 12.470214 ,\n",
" 12.670923 , 0. ],\n",
" [ 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 30. , -19.07032 ,\n",
" 1. , -30.246218 , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. ,\n",
" 18.336487 , 21.81617 , 28.251017 , 42.977867 ,\n",
" 42.18994 , 42.19034 , 43.351707 , 45.399582 ,\n",
" 48.22037 , 51.68873 , 42.00719 , 41.94621 ,\n",
" 42.00739 , 0. ]], dtype=float32)"
+"tensor([1., 2., 3., 4., 5.])"
 ]
 },
-"execution_count": 7,
+"execution_count": 5,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
 "source": [
-"state,_,_ = env.getSteps()\n",
-"state"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 8,
-"metadata": {},
-"outputs": [],
-"source": [
-"env.close()"
+"import torch\n",
+"\n",
+"aaa = torch.zeros((8,5))\n",
+"aaa[0] = torch.Tensor([1,2,3,4,5])\n",
+"aaa[0]"
 ]
 }
 ],