Compare commits

...

2 Commits

Author SHA1 Message Date
4b8ffeac6d GAIL V0.1 save point
GAIL V0.1 save point
todo 
1.human action record(GAILMem) debug
2.gail debug
2022-12-04 08:42:10 +09:00
ad9817e7a4 Totally disparate NN by target
Totally disparate NN by target.
2022-12-03 21:35:33 +09:00
5 changed files with 992 additions and 14 deletions

View File

@ -0,0 +1,679 @@
import os
import argparse
import wandb
import time
import numpy as np
import random
import uuid
import torch
import torch.nn as nn
import torch.optim as optim
from AimbotEnv import Aimbot
from tqdm import tqdm
from enum import Enum
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from distutils.util import strtobool
from torch.utils.tensorboard import SummaryWriter
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.side_channel import (
SideChannel,
IncomingMessage,
OutgoingMessage,
)
from typing import List
from GAILMem import GAILMem
# Best mean reward seen so far; the training loop compares against it before saving a checkpoint.
bestReward = 0
# CUDA is used only when this is True and torch reports that CUDA is available.
useCUDA = True
DEFAULT_SEED = 9331
# Path of the Unity build handed to Aimbot(envPath=...).
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy/Aimbot-ParallelEnv"
# Passed to expertMem.loadMemFile(); "NAN" looks like a placeholder — TODO set a real directory.
EXPERT_PATH = "NAN"
# Channel id used to construct AimbotSideChannel.
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
WAND_ENTITY = "koha9"
WORKER_ID = 2
BASE_PORT = 1001
# max round steps per agent is 2500/Decision_period, 25 seconds
# !!!check every parameters before run!!!
# The values below are the defaults of the matching command-line options in parse_args().
TOTAL_STEPS = 6750000
BATCH_SIZE = 512
# Per-target sample count that triggers a training pass (default of --datasetSize).
MAX_TRAINNING_DATASETS = 3000
DECISION_PERIOD = 1
LEARNING_RATE = 1e-3
GAMMA = 0.99
GAE_LAMBDA = 0.95
EPOCHS = 4
CLIP_COEF = 0.1
POLICY_COEF = 1.0
ENTROPY_COEF = 0.01
CRITIC_COEF = 0.5
TARGET_LEARNING_RATE = 1e-5
# Multiplier applied to the discriminator output when it is used as the reward.
DSCM_ENHANCED = 1
ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = True
# Read directly by PPOAgent.get_actions_value: True samples actions, False takes argmax / mean.
TRAIN = True
# Enables wandb.init() in the main script (also the default of --wandb-track).
WANDB_TACK = True
# None builds a fresh agent; otherwise the path given to torch.load().
LOAD_DIR = None
# LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
# public data
class Targets(Enum):
    """Target modes of the environment, keyed by the integer stored in state column 0.

    The training loop uses ``Targets(index).name`` to label per-target charts.
    """

    Free = 0
    Go = 1
    Attack = 2
    Defence = 3
    # Not a real target: its value equals the count of the members above.
    Num = 4
# Offsets subtracted from the final reward of a round in broadCastEndReward().
BASE_WINREWARD = 999
BASE_LOSEREWARD = -999
# Number of per-target sub-networks and per-target training buffers that are created.
TARGETNUM = 4
ENV_TIMELIMIT = 30
# Factor applied to the remaining time when a win bonus is spread over a round's rewards.
RESULT_BROADCAST_RATIO = 2 / ENV_TIMELIMIT
# Round counters keyed by target name; updated by AimbotSideChannel.on_message_received().
# NOTE(review): there is no "Defence" key although Targets defines that member — confirm.
TotalRounds = {"Free": 0, "Go": 0, "Attack": 0}
WinRounds = {"Free": 0, "Go": 0, "Attack": 0}
# Small constant added inside torch.log() in the discriminator loss.
EPS = 1e-8
# !!!SPECIAL PARAMETERS!!!
# change it while program is finished
# Used as a multiplier when computing total_update_step in the main script.
using_targets_num = 3
def parse_args():
    """Parse the command-line options of the training script.

    Every default comes from the module-level constants. Boolean options take an
    optional textual value that is converted with ``strtobool``; giving the bare
    flag without a value yields ``True``.

    Returns:
        argparse.Namespace: the parsed options.
    """

    def to_bool(text):
        return bool(strtobool(text))

    # keyword arguments shared by every boolean option
    toggle = dict(type=to_bool, nargs="?", const=True)

    cli = argparse.ArgumentParser()
    # fmt: off
    # pytorch and environment parameters
    cli.add_argument("--seed", type=int, default=DEFAULT_SEED, help="seed of the experiment")
    cli.add_argument("--path", type=str, default=ENV_PATH, help="enviroment path")
    cli.add_argument("--workerID", type=int, default=WORKER_ID, help="unity worker ID")
    cli.add_argument("--baseport", type=int, default=BASE_PORT, help="port to connect to Unity environment")
    cli.add_argument("--lr", type=float, default=LEARNING_RATE, help="the learning rate of optimizer")
    cli.add_argument("--cuda", default=True, help="if toggled, cuda will be enabled by default", **toggle)
    cli.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS, help="total timesteps of the experiments")

    # model parameters
    cli.add_argument("--train", default=TRAIN, help="Train Model or not", **toggle)
    cli.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
                     help="training dataset size,start training while dataset collect enough data")
    cli.add_argument("--minibatchSize", type=int, default=BATCH_SIZE, help="nimi batch size")
    cli.add_argument("--epochs", type=int, default=EPOCHS, help="the K epochs to update the policy")
    cli.add_argument("--annealLR", default=ANNEAL_LEARNING_RATE,
                     help="Toggle learning rate annealing for policy and value networks", **toggle)
    cli.add_argument("--wandb-track", default=WANDB_TACK, help="track on the wandb", **toggle)
    cli.add_argument("--wandb-entity", type=str, default=WAND_ENTITY, help="the entity (team) of wandb's project")
    cli.add_argument("--load-dir", type=str, default=LOAD_DIR, help="load model directory")
    cli.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
                     help="the number of steps to run in each environment per policy rollout")

    # GAE loss
    cli.add_argument("--gae", default=True, help="Use GAE for advantage computation", **toggle)
    cli.add_argument("--norm-adv", default=NORM_ADV, help="Toggles advantages normalization", **toggle)
    cli.add_argument("--gamma", type=float, default=GAMMA, help="the discount factor gamma")
    cli.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
                     help="the lambda for the general advantage estimation")
    cli.add_argument("--clip-coef", type=float, default=CLIP_COEF, help="the surrogate clipping coefficient")
    cli.add_argument("--policy-coef", type=float, default=POLICY_COEF, help="coefficient of the policy")
    cli.add_argument("--ent-coef", type=float, default=ENTROPY_COEF, help="coefficient of the entropy")
    cli.add_argument("--critic-coef", type=float, default=CRITIC_COEF, help="coefficient of the value function")
    cli.add_argument("--clip-vloss", default=CLIP_VLOSS,
                     help="Toggles whether or not to use a clipped loss for the value function, as per the paper.",
                     **toggle)
    cli.add_argument("--max-grad-norm", type=float, default=0.5, help="the maximum norm for the gradient clipping")
    cli.add_argument("--target-kl", type=float, default=None, help="the target KL divergence threshold")
    # fmt: on
    return cli.parse_args()
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise ``layer`` in place and hand it back.

    Args:
        layer: module exposing ``weight`` and ``bias`` tensors (e.g. ``nn.Linear``).
        std: gain given to the orthogonal weight initialiser.
        bias_const: constant every bias entry is set to.

    Returns:
        The same ``layer`` object, so the call can be nested in a constructor.
    """
    initialiser = torch.nn.init
    initialiser.orthogonal_(layer.weight, gain=std)
    initialiser.constant_(layer.bias, val=bias_const)
    return layer
class PPOAgent(nn.Module):
    """Hybrid (discrete + continuous) actor-critic with one independent network per target.

    Column 0 of every state row is read as the target index; that index selects which of
    the ``targetNum`` backbones, actor heads, log-std parameters and critics handles the row.
    """

    def __init__(self, env: Aimbot, targetNum: int):
        """Build ``targetNum`` independent backbones and heads sized from ``env``.

        Args:
            env: environment wrapper; its ``unity_*`` attributes give the layer sizes.
            targetNum: number of per-target sub-networks to create.
        """
        super(PPOAgent, self).__init__()
        self.targetNum = targetNum
        self.discrete_size = env.unity_discrete_size
        self.discrete_shape = list(env.unity_discrete_branches)
        # Number of discrete action columns at the front of an action row. Stored here so
        # get_actions_value() no longer depends on a module-global ``env``.
        self.discrete_type = env.unity_discrete_type
        self.continuous_size = env.unity_continuous_size

        self.network = nn.ModuleList(
            [
                nn.Sequential(
                    layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 300)),
                    nn.ReLU(),
                    layer_init(nn.Linear(300, 200)),
                    nn.ReLU(),
                )
                for i in range(targetNum)
            ]
        )
        self.actor_dis = nn.ModuleList(
            [layer_init(nn.Linear(200, self.discrete_size), std=0.01) for i in range(targetNum)]
        )
        self.actor_mean = nn.ModuleList(
            [layer_init(nn.Linear(200, self.continuous_size), std=0.01) for i in range(targetNum)]
        )
        self.actor_logstd = nn.ParameterList(
            [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(targetNum)]
        )
        self.critic = nn.ModuleList(
            [layer_init(nn.Linear(200, 1), std=1) for i in range(targetNum)]
        )

    @staticmethod
    def _route(heads, target_ids, rows):
        """Apply ``heads[target_ids[i]]`` to ``rows[i]`` and stack the results along dim 0."""
        return torch.stack([heads[t](row) for t, row in zip(target_ids, rows)])

    def get_value(self, state: torch.Tensor):
        """Return the critic value of every state row, using each row's own target network."""
        target_ids = state[:, 0].to(torch.int32).tolist()
        hidden = self._route(self.network, target_ids, state)
        return self._route(self.critic, target_ids, hidden)

    def get_actions_value(self, state: torch.Tensor, actions=None):
        """Evaluate the policy and critic on a batch of states.

        Args:
            state: 2-D batch of states; column 0 holds the target index.
            actions: optional batch of previously taken actions (discrete columns first,
                then continuous ones). When ``None`` new actions are produced: sampled
                if the module constant ``TRAIN`` is true, otherwise argmax / mean.

        Returns:
            tuple: (actions, summed discrete log-prob, summed discrete entropy,
            summed continuous log-prob, summed continuous entropy, critic values).
        """
        target_ids = state[:, 0].to(torch.int32).tolist()
        hidden = self._route(self.network, target_ids, state)

        # discrete
        # Loop over the rows (i.e. the agents) so every row goes through the output
        # head that belongs to its own target.
        dis_logits = self._route(self.actor_dis, target_ids, hidden)
        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]

        # continuous
        actions_mean = self._route(self.actor_mean, target_ids, hidden)
        # Each log-std parameter is (1, continuous_size), so the stack is
        # (batch, 1, continuous_size). Dropping dim 1 gives (batch, continuous_size) for
        # any continuous_size; the former squeeze(dim=-1) only worked for size 1.
        action_std = torch.stack(
            [torch.exp(self.actor_logstd[t]) for t in target_ids]
        ).squeeze(dim=1)
        con_probs = Normal(actions_mean, action_std)

        # critic
        criticV = self._route(self.critic, target_ids, hidden)

        if actions is None:
            if TRAIN:
                # select actions base on probability distribution model
                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
                conAct = con_probs.sample()
            else:
                # select actions base on best probability distribution
                disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
                conAct = actions_mean
            actions = torch.cat([disAct.T, conAct], dim=1)
        else:
            disAct = actions[:, 0 : self.discrete_type].T
            conAct = actions[:, self.discrete_type :]

        dis_log_prob = torch.stack(
            [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
        )
        dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
        return (
            actions,
            dis_log_prob.sum(0),
            dis_entropy.sum(0),
            con_probs.log_prob(conAct).sum(1),
            con_probs.entropy().sum(1),
            criticV,
        )
def GAE(agent, args, rewards, dones, values, next_obs, next_done):
    """Compute advantages and returns for one trajectory buffer.

    Args:
        agent: object whose ``get_value(next_obs)`` gives the bootstrap value.
        args: options providing ``gae``, ``gamma`` and ``gaeLambda``.
        rewards: per-step rewards, indexed along dim 0.
        dones: per-step done flags aligned with ``rewards``.
        values: per-step critic values aligned with ``rewards``.
        next_obs: observation following the last stored step.
        next_done: done flag following the last stored step.

    Returns:
        tuple: ``(advantages, returns)``, allocated with ``torch.zeros_like(rewards)``.
    """
    # Buffers are created on the device of ``rewards`` instead of the module-global
    # ``device``, which only exists when the file runs as a script.
    with torch.no_grad():
        next_value = agent.get_value(next_obs).reshape(1, -1)
        data_size = rewards.size()[0]
        last_step = data_size - 1
        if args.gae:
            advantages = torch.zeros_like(rewards)
            lastgaelam = 0
            for t in reversed(range(data_size)):
                if t == last_step:
                    # bootstrap from the state that follows the buffer
                    nextnonterminal = 1.0 - next_done
                    nextvalues = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    nextvalues = values[t + 1]
                delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                advantages[t] = lastgaelam = (
                    delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
                )
            returns = advantages + values
        else:
            returns = torch.zeros_like(rewards)
            for t in reversed(range(data_size)):
                if t == last_step:
                    nextnonterminal = 1.0 - next_done
                    next_return = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    next_return = returns[t + 1]
                returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
            advantages = returns - values
    return advantages, returns
class AimbotSideChannel(SideChannel):
    """Side channel that counts round results sent by Unity and sends typed values back."""

    def __init__(self, channel_id: uuid.UUID) -> None:
        super().__init__(channel_id)

    def on_message_received(self, msg: IncomingMessage) -> None:
        """
        Note: We must implement this method of the SideChannel interface to
        receive messages from Unity

        Messages are "|"-separated strings. ``result|<target>|<outcome>`` increments
        ``TotalRounds[<target>]`` and, when the outcome is "Win", ``WinRounds[<target>]``.
        Messages whose first field is "Error" are printed; anything else is ignored.
        """
        thisMessage = msg.read_string()
        thisResult = thisMessage.split("|")
        if thisResult[0] == "result":
            if len(thisResult) < 3:
                # a short message would otherwise raise IndexError inside the callback
                print("Malformed result message:", thisMessage)
                return
            targetName = thisResult[1]
            # Counters are created on demand so a target name missing from the initial
            # dicts cannot raise KeyError.
            TotalRounds[targetName] = TotalRounds.get(targetName, 0) + 1
            WinRounds.setdefault(targetName, 0)
            if thisResult[2] == "Win":
                WinRounds[targetName] += 1
        elif thisResult[0] == "Error":
            print(thisMessage)

    # send functions
    def send_string(self, data: str) -> None:
        """Queue a string message for the C# side."""
        msg = OutgoingMessage()
        msg.write_string(data)
        super().queue_message_to_send(msg)

    def send_bool(self, data: bool) -> None:
        """Queue a boolean message."""
        msg = OutgoingMessage()
        msg.write_bool(data)
        super().queue_message_to_send(msg)

    def send_int(self, data: int) -> None:
        """Queue a 32-bit integer message."""
        msg = OutgoingMessage()
        msg.write_int32(data)
        super().queue_message_to_send(msg)

    def send_float(self, data: float) -> None:
        """Queue a 32-bit float message."""
        msg = OutgoingMessage()
        msg.write_float32(data)
        super().queue_message_to_send(msg)

    def send_float_list(self, data: List[float]) -> None:
        """Queue a list of 32-bit floats."""
        msg = OutgoingMessage()
        msg.write_float32_list(data)
        super().queue_message_to_send(msg)
def broadCastEndReward(rewardBF: list, remainTime: float):
    """Strip the win/lose marker from a round's last reward and spread a win bonus.

    Args:
        rewardBF: per-step rewards of one finished round; the last entry carries the
            result marker (>= 500 win, <= -500 lose). Must not be empty.
        remainTime: time left when the round ended; scales the win bonus.

    Returns:
        torch.Tensor: adjusted rewards on the module-level ``device``.
    """
    # Work on a copy: the previous version aliased ``rewardBF`` and rewrote the
    # caller's last element in place.
    thisRewardBF = list(rewardBF)
    finalReward = thisRewardBF[-1]
    if finalReward <= -500:
        # lost round: only remove the marker offset, nothing is broadcast
        thisRewardBF[-1] = finalReward - BASE_LOSEREWARD
        thisRewardBF = (np.asarray(thisRewardBF)).tolist()
    elif finalReward >= 500:
        # won round: remove the marker offset, then add the time bonus to every step
        thisRewardBF[-1] = finalReward - BASE_WINREWARD
        thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * RESULT_BROADCAST_RATIO)).tolist()
    else:
        print("!!!!!DIDNT GET RESULT REWARD!!!!!!", finalReward)
    return torch.Tensor(thisRewardBF).to(device)
class Discriminator(nn.Module):
    """Per-target GAIL discriminator over concatenated (observation, action) rows.

    Column 0 of each input row is read as the target index and selects the sub-network.
    The output lies in (0, 1); the training loss takes ``log(1 - D(expert))`` and
    ``log(D(agent))``, which is only defined for values in that range.
    """

    def __init__(self, env: Aimbot, targetNum: int):
        """Build ``targetNum`` independent discriminators sized from ``env``.

        Args:
            env: environment wrapper providing ``unity_observation_shape`` and
                ``unity_action_size``.
            targetNum: number of per-target sub-networks.
        """
        super(Discriminator, self).__init__()
        self.targetNum = targetNum
        # Flattened observation width plus action width. The former expression added
        # the shape tuple to the integer action size, which raises TypeError.
        self.trajectory_size = int(np.prod(env.unity_observation_shape)) + int(
            env.unity_action_size
        )
        self.network = nn.ModuleList(
            [
                nn.Sequential(
                    layer_init(nn.Linear(self.trajectory_size, 300)),
                    nn.ReLU(),
                    layer_init(nn.Linear(300, 200)),
                    nn.ReLU(),
                )
                for i in range(targetNum)
            ]
        )
        # ``nn.Relu`` does not exist and an activation cannot wrap a layer as a
        # constructor argument; a sigmoid head keeps the score inside (0, 1).
        self.output = nn.ModuleList(
            [
                nn.Sequential(layer_init(nn.Linear(200, 1), std=0.01), nn.Sigmoid())
                for i in range(targetNum)
            ]
        )

    def get_value(self, state: torch.Tensor):
        """Score each trajectory row with the sub-network of its own target."""
        target_ids = state[:, 0].to(torch.int32).tolist()
        hidden = torch.stack(
            [self.network[t](row) for t, row in zip(target_ids, state)]
        )
        return torch.stack(
            [self.output[t](row) for t, row in zip(target_ids, hidden)]
        )
if __name__ == "__main__":
    # Training entry point: collect rollouts per target, train the discriminator on
    # agent vs. expert trajectories, then run PPO with the discriminator score as reward.
    args = parse_args()
    random.seed(DEFAULT_SEED)
    np.random.seed(DEFAULT_SEED)
    torch.manual_seed(DEFAULT_SEED)

    device = torch.device("cuda" if torch.cuda.is_available() and useCUDA else "cpu")

    aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID)
    env = Aimbot(
        envPath=ENV_PATH, workerID=WORKER_ID, basePort=BASE_PORT, side_channels=[aimBotsideChannel]
    )
    if LOAD_DIR is None:
        agent = PPOAgent(env, TARGETNUM).to(device)
    else:
        print("NOT AVALABLE")
        agent = torch.load(LOAD_DIR)
        print("Load Agent", LOAD_DIR)
        print(agent.eval())
    # The discriminator is needed whichever way the agent was obtained; it used to be
    # created only in the LOAD_DIR-is-None branch.
    discriminator = Discriminator(env, TARGETNUM).to(device)

    # Optimizers
    PPOoptimizer = optim.Adam(agent.parameters(), lr=LEARNING_RATE, eps=1e-5)
    DSCMoptimizer = optim.Adam(discriminator.parameters(), lr=LEARNING_RATE, eps=1e-5)

    # Tensorboard and WandB Recorder
    game_name = "Aimbot_GAIL"
    game_type = "ORG"
    run_name = f"{game_name}_{game_type}_{DEFAULT_SEED}_{int(time.time())}"
    if WANDB_TACK:
        wandb.init(
            project=game_name,
            entity=WAND_ENTITY,
            sync_tensorboard=True,
            config=vars(args),
            name=run_name,
            monitor_gym=True,
            save_code=True,
        )

    writer = SummaryWriter(f"runs/{run_name}")
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s"
        % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )

    # GAILMem requires the number of targets (it was called without arguments).
    agentMem = GAILMem(TARGETNUM)
    expertMem = GAILMem(TARGETNUM)
    expertMem.loadMemFile(EXPERT_PATH)

    # start the game
    total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
    target_steps = [0 for i in range(TARGETNUM)]
    start_time = time.time()
    state, _, done = env.reset()

    # Trajectory Buffer (one list per agent, flushed when that agent's round ends)
    ob_bf = [[] for i in range(env.unity_agent_num)]
    act_bf = [[] for i in range(env.unity_agent_num)]
    dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
    con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
    rewards_bf = [[] for i in range(env.unity_agent_num)]
    dones_bf = [[] for i in range(env.unity_agent_num)]
    values_bf = [[] for i in range(env.unity_agent_num)]

    # initialize empty training datasets
    obs = [
        torch.tensor([]).to(device) for i in range(TARGETNUM)
    ]  # (TARGETNUM,n,env.unity_observation_size)
    actions = [
        torch.tensor([]).to(device) for i in range(TARGETNUM)
    ]  # (TARGETNUM,n,env.unity_action_size)
    dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
    con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
    rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
    values = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
    dones = [torch.tensor([]).to(device) for i in range(TARGETNUM)]
    next_state_buffer = [[] for i in range(TARGETNUM)]
    next_done_buffer = [[] for i in range(TARGETNUM)]

    for total_steps in range(total_update_step):
        print("new episode")
        trainQueue = []
        while True:
            with torch.no_grad():
                # predict actions
                action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
                    torch.Tensor(state).to(device)
                )
                value = value.flatten()

            # variable from GPU to CPU
            action_cpu = action.cpu().numpy()
            dis_logprob_cpu = dis_logprob.cpu().numpy()
            con_logprob_cpu = con_logprob.cpu().numpy()
            value_cpu = value.cpu().numpy()
            # Environment step
            next_state, reward, next_done = env.step(action_cpu)

            # save mem
            for i in range(env.unity_agent_num):
                # save memories to buffers
                ob_bf[i].append(state[i])
                act_bf[i].append(action_cpu[i])
                dis_logprobs_bf[i].append(dis_logprob_cpu[i])
                con_logprobs_bf[i].append(con_logprob_cpu[i])
                rewards_bf[i].append(reward[i])
                dones_bf[i].append(done[i])
                values_bf[i].append(value_cpu[i])
                if next_done[i] == True:
                    # finished a round, send finished memories to training datasets
                    thisTarget = int(state[i, 0])
                    # Keep only THIS agent's follow-up state/done as the GAE bootstrap
                    # input (the whole multi-agent arrays used to be stored here).
                    next_state_buffer[thisTarget] = torch.tensor([next_state[i]]).to(device)
                    next_done_buffer[thisTarget] = torch.Tensor([next_done[i]]).to(device)
                    obs[thisTarget] = torch.cat(
                        (obs[thisTarget], torch.tensor(ob_bf[i]).to(device)), 0
                    )
                    actions[thisTarget] = torch.cat(
                        (actions[thisTarget], torch.tensor(act_bf[i]).to(device)), 0
                    )
                    dis_logprobs[thisTarget] = torch.cat(
                        (dis_logprobs[thisTarget], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
                    )
                    con_logprobs[thisTarget] = torch.cat(
                        (con_logprobs[thisTarget], torch.tensor(con_logprobs_bf[i]).to(device)), 0
                    )
                    # environment rewards are only used for the reward chart below
                    rewards[thisTarget] = torch.cat(
                        (rewards[thisTarget], torch.tensor(rewards_bf[i]).to(device)), 0
                    )
                    values[thisTarget] = torch.cat(
                        (values[thisTarget], torch.tensor(values_bf[i]).to(device)), 0
                    )
                    dones[thisTarget] = torch.cat(
                        (dones[thisTarget], torch.tensor(dones_bf[i]).to(device)), 0
                    )
                    # Reset this agent's buffers; otherwise the next finished round would
                    # re-append the whole history under a possibly different target.
                    ob_bf[i] = []
                    act_bf[i] = []
                    dis_logprobs_bf[i] = []
                    con_logprobs_bf[i] = []
                    rewards_bf[i] = []
                    dones_bf[i] = []
                    values_bf[i] = []

            # Advance BEFORE the dataset check so the step just taken is not replayed
            # from a stale state after the training pass.
            state, done = next_state, next_done

            # start train NN for every target that has collected enough samples
            trainQueue = [t for t in range(TARGETNUM) if obs[t].size()[0] >= args.datasetSize]
            if len(trainQueue) > 0:
                break

        if args.train:
            meanRewardList = []
            for thisT in trainQueue:
                target_steps[thisT] += 1
                # get agent training datasets
                b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
                b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
                b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
                b_con_logprobs = con_logprobs[thisT].reshape(-1)
                b_values = values[thisT].reshape(-1)
                b_dones = dones[thisT].reshape(-1)
                # number of rows in the batch (was b_obs[thisT].size()[0], i.e. the
                # width of a single row)
                dataNum = b_obs.size()[0]
                # get expert training datasets from GAILMem
                exp_obs, _, exp_actions, _, _ = expertMem.getRandomSample(dataNum, thisT)

                # trajectory
                agent_trajectory = torch.cat((b_obs, b_actions), dim=1)
                # expert samples may arrive as numpy arrays; convert before concatenating
                exp_trajectory = torch.cat(
                    (
                        torch.as_tensor(exp_obs, dtype=agent_trajectory.dtype, device=device),
                        torch.as_tensor(exp_actions, dtype=agent_trajectory.dtype, device=device),
                    ),
                    dim=1,
                )

                # discriminator ACC
                with torch.no_grad():
                    thisDSCM_agent_acc = torch.mean(discriminator.get_value(agent_trajectory))
                    thisDSCM_expt_acc = torch.mean(discriminator.get_value(exp_trajectory))

                # train discriminator
                b_inds = np.arange(dataNum)
                for epoch in range(args.epochs):
                    np.random.shuffle(b_inds)
                    for start in range(0, dataNum, args.minibatchSize):
                        end = start + args.minibatchSize
                        mb_inds = b_inds[start:end]
                        exp_value = discriminator.get_value(exp_trajectory[mb_inds])
                        agent_value = discriminator.get_value(agent_trajectory[mb_inds])
                        # DSCM loss function; reduced to a scalar because backward()
                        # cannot be called on a per-sample tensor
                        exp_loss = torch.log(1.0 - exp_value + EPS).mean()
                        agent_loss = torch.log(agent_value + EPS).mean()
                        loss = exp_loss + agent_loss
                        # DSCM backward
                        DSCMoptimizer.zero_grad()
                        loss.backward()
                        nn.utils.clip_grad_norm_(discriminator.parameters(), args.max_grad_norm)
                        DSCMoptimizer.step()

                # get discriminator reward, flattened to match b_values / b_dones
                with torch.no_grad():
                    DSCMReward = discriminator.get_value(agent_trajectory).reshape(-1) * DSCM_ENHANCED
                advantages, rts = GAE(
                    agent,
                    args,
                    DSCMReward,
                    b_dones,
                    b_values,
                    # index with the target being trained, not the stale ``thisTarget``
                    next_state_buffer[thisT],
                    next_done_buffer[thisT],
                )
                b_advantages = advantages.reshape(-1)
                b_returns = rts.reshape(-1)

                # train PPO agent
                for epoch in range(args.epochs):
                    np.random.shuffle(b_inds)
                    for start in range(0, dataNum, args.minibatchSize):
                        end = start + args.minibatchSize
                        mb_inds = b_inds[start:end]
                        mb_advantages = b_advantages[mb_inds]

                        # normalize advantages
                        if args.norm_adv:
                            mb_advantages = (mb_advantages - mb_advantages.mean()) / (
                                mb_advantages.std() + 1e-8
                            )

                        (
                            _,
                            new_dis_logprob,
                            dis_entropy,
                            new_con_logprob,
                            con_entropy,
                            newvalue,
                        ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
                        # discrete ratio
                        dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
                        dis_ratio = dis_logratio.exp()
                        # continuous ratio
                        con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
                        con_ratio = con_logratio.exp()

                        # discrete Policy loss
                        dis_pg_loss_orig = -mb_advantages * dis_ratio
                        dis_pg_loss_clip = -mb_advantages * torch.clamp(
                            dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
                        )
                        dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
                        # continuous Policy loss
                        con_pg_loss_orig = -mb_advantages * con_ratio
                        con_pg_loss_clip = -mb_advantages * torch.clamp(
                            con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
                        )
                        con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()

                        # Value loss
                        newvalue = newvalue.view(-1)
                        if args.clip_vloss:
                            v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                            v_clipped = b_values[mb_inds] + torch.clamp(
                                newvalue - b_values[mb_inds],
                                -args.clip_coef,
                                args.clip_coef,
                            )
                            v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                            v_loss = 0.5 * v_loss_max.mean()
                        else:
                            v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

                        # total loss
                        entropy_loss = dis_entropy.mean() + con_entropy.mean()
                        loss = (
                            dis_pg_loss * args.policy_coef
                            + con_pg_loss * args.policy_coef
                            - entropy_loss * args.ent_coef
                            + v_loss * args.critic_coef
                        )

                        PPOoptimizer.zero_grad()
                        loss.backward()
                        # Clips gradient norm of an iterable of parameters.
                        nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                        PPOoptimizer.step()

                # record mean reward before clear history
                targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
                meanRewardList.append(targetRewardMean)
                targetName = Targets(thisT).name

                # clear this target trainning set buffer
                obs[thisT] = torch.tensor([]).to(device)
                actions[thisT] = torch.tensor([]).to(device)
                dis_logprobs[thisT] = torch.tensor([]).to(device)
                con_logprobs[thisT] = torch.tensor([]).to(device)
                rewards[thisT] = torch.tensor([]).to(device)
                values[thisT] = torch.tensor([]).to(device)
                dones[thisT] = torch.tensor([]).to(device)

                # record rewards for plotting purposes
                writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
                # skip the ratio until a result for this target has been reported, so a
                # missing key or a zero round count cannot crash the run
                roundsPlayed = TotalRounds.get(targetName, 0)
                if roundsPlayed > 0:
                    writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds.get(targetName, 0) / roundsPlayed, target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/Discriminator_EXP_ACC", thisDSCM_expt_acc, target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/Discriminator_Agent_ACC", thisDSCM_agent_acc, target_steps[thisT])
                print(f"episode over Target{targetName} mean reward:", targetRewardMean)

            TotalRewardMean = np.mean(meanRewardList)
            writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
            writer.add_scalar("GlobalCharts/learning_rate", PPOoptimizer.param_groups[0]["lr"], total_steps)
            # New Record!
            if TotalRewardMean > bestReward:
                # remember the value that was compared (was: the last target's mean)
                bestReward = TotalRewardMean
                saveDir = "../PPO-Model/Hybrid-MNN-500-300" + str(TotalRewardMean) + ".pt"
                torch.save(agent, saveDir)
# train PPO
# record

View File

@ -0,0 +1,176 @@
import os
import random
import numpy as np
class GAILMem():
    """Per-target storage for (state, actorProb, action, reward, done) records.

    Every field is a list with one sub-list per target type; ``memNum[t]`` is the
    number of records stored for target ``t``.
    """

    def __init__(self, targetNum):
        """Create empty memories for ``targetNum`` target types."""
        self.targetNum = targetNum
        self.states = [[] for i in range(self.targetNum)]
        self.actorProbs = [[] for i in range(self.targetNum)]
        self.actions = [[] for i in range(self.targetNum)]
        self.rewards = [[] for i in range(self.targetNum)]
        self.dones = [[] for i in range(self.targetNum)]
        self.memNum = [0 for i in range(self.targetNum)]

    def clearMem(self, targetType):
        """Drop every record stored for ``targetType``."""
        self.states[targetType] = []
        self.actorProbs[targetType] = []
        self.actions[targetType] = []
        self.rewards[targetType] = []
        self.dones[targetType] = []
        self.memNum[targetType] = 0

    def _packPath(self, dir: str, targetType: int) -> str:
        """Return the extension-less pack file path of one target inside ``dir``."""
        # The name depends only on the target index so saveMemtoFile and loadMemFile
        # agree; it used to embed the whole memNum list, which differs between the
        # two calls.
        return os.path.join(dir, "pack-" + str(targetType))

    def saveMemtoFile(self, dir: str):
        """Save the memories of every target to ``<dir>/pack-<target>.npz``.

        Args:
            dir (str): destination directory, e.g. "GAIL-Expert-Data/"; it is created
                when missing.
        """
        if dir:
            os.makedirs(dir, exist_ok=True)
        for i in range(self.targetNum):
            np.savez(
                self._packPath(dir, i),
                states=np.asarray(self.states[i]),
                actorProbs=np.asarray(self.actorProbs[i]),
                actions=np.asarray(self.actions[i]),
                rewards=np.asarray(self.rewards[i]),
                dones=np.asarray(self.dones[i]),
            )

    def loadMemFile(self, dir: str):
        """Replace the memories of every target with the packs found in ``dir``.

        Args:
            dir (str): directory previously written by ``saveMemtoFile``.
        """
        for i in range(self.targetNum):
            self.clearMem(i)
            # NOTE: allow_pickle=True is required for object arrays (e.g. fields saved
            # as None) but executes pickled data; only load files from trusted sources.
            memFile = np.load(self._packPath(dir, i) + ".npz", allow_pickle=True)
            self.states[i] = memFile["states"].tolist()
            self.actorProbs[i] = memFile["actorProbs"].tolist()
            self.actions[i] = memFile["actions"].tolist()
            self.rewards[i] = memFile["rewards"].tolist()
            self.dones[i] = memFile["dones"].tolist()
            # update this target's counter only (the whole list used to be overwritten)
            self.memNum[i] = len(self.states[i])

    def getRandomSample(self, sampleNum: int, targetType: int):
        """Get a random sample set without repetition for one target.

        Args:
            sampleNum (int): number of samples; 0 returns every stored sample.
            targetType (int): target whose memories are sampled.

        Returns:
            tuple: (states,actorProbs,actions,rewards,dones)

        Raises:
            ValueError: from ``random.sample`` when ``sampleNum`` exceeds the number of
                records stored for ``targetType``.
        """
        if sampleNum == 0:
            return (
                self.getStates(targetType),
                self.getActorProbs(targetType),
                self.getActions(targetType),
                self.getRewards(targetType),
                self.getDones(targetType),
            )
        randIndex = random.sample(range(0, self.memNum[targetType]), sampleNum)
        fields = (self.states, self.actorProbs, self.actions, self.rewards, self.dones)
        return tuple(
            self.standDims(np.asarray(field[targetType])[randIndex]) for field in fields
        )

    def getStates(self, targetType):
        """Return every stored state of ``targetType`` as an ndarray."""
        return self.standDims(np.asarray(self.states[targetType]))

    def getActorProbs(self, targetType):
        """Return every stored actorProb of ``targetType`` as an ndarray."""
        return self.standDims(np.asarray(self.actorProbs[targetType]))

    def getActions(self, targetType):
        """Return every stored action of ``targetType`` as an ndarray."""
        return self.standDims(np.asarray(self.actions[targetType]))

    def getRewards(self, targetType):
        """Return every stored reward of ``targetType`` as an ndarray."""
        return self.standDims(np.asarray(self.rewards[targetType]))

    def getDones(self, targetType):
        """Return every stored done flag of ``targetType`` as an ndarray."""
        return self.standDims(np.asarray(self.dones[targetType]))

    def standDims(self, data):
        """Standardise data to two dimensions.

        Args:
            data (list): data list

        Returns:
            ndarray: axis 1 squeezed away when ndim > 2, an axis added when ndim < 2,
            otherwise the data unchanged.
        """
        if np.ndim(data) > 2:
            return np.squeeze(data, axis=1)
        elif np.ndim(data) < 2:
            return np.expand_dims(data, axis=1)
        else:
            return np.asarray(data)

    def saveMems(self, state, actorProb, action, reward, done):
        """Append one record; the target type is read from ``state[0, 0]``.

        Args:
            state: state array, indexed as ``state[0, 0]``
            actorProb: actor prediction result
            action: action chosen by the actor
            reward: reward
            done: done flag
        """
        targetType = int(state[0, 0])
        self.states[targetType].append(state)
        self.actorProbs[targetType].append(actorProb)
        self.actions[targetType].append(action)
        self.rewards[targetType].append(reward)
        self.dones[targetType].append(done)
        self.memNum[targetType] += 1

View File

@ -0,0 +1,97 @@
import time
import numpy as np
from AimbotEnv import Aimbot
from GAILMem import GAILMem
import keyboard
import mouse
import math
# Env
# Unity build, worker id and port handed to Aimbot(...) in the recording script.
ENV_PATH = "../Build/HUMAN-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy/Aimbot-ParallelEnv"
WORKER_ID = 1
BASE_PORT = 200
# ENV Para
# Divisor applied to the horizontal mouse offset (passed as HumanActions(mouseDiscount=...)).
MOUSEDISCOUNT = 20.0
# Number of episodes the recording loop iterates over.
MAX_EP = 10000000
# NOTE(review): the two constants below are not referenced in the visible code.
STACKSTATESIZE = 3
STACKINTERCE = 29
class HumanActions:
    """Turns the current keyboard / mouse state into one environment action row."""

    def __init__(self, mouseDiscount: float = 10, screenW: int = 1920, screenH: int = 1080):
        """Register the diagonal-movement hotkeys and remember the screen geometry.

        Args:
            mouseDiscount: divisor applied to the horizontal mouse offset.
            screenW: screen width in pixels; the cursor is re-centred at half of it.
            screenH: screen height in pixels; the cursor is re-centred at half of it.
        """

        def multiPressed():
            pass

        # the hotkeys are registered with a do-nothing callback
        for combo in ("w+a", "w+d", "s+a", "s+d"):
            keyboard.add_hotkey(combo, multiPressed)

        self.screenW = screenW
        self.screenH = screenH
        self.MOUSEDISCOUNT = mouseDiscount
        self.mouseSmooth = 5
        self.mouseMax = 10

    def getHumanActions(self):
        """Read the input devices once and re-centre the cursor.

        Returns:
            ndarray: shape (1, 4) holding ``[ws, ad, click, xMovement]`` where ws is
            0 / 1 (w) / 2 (s), ad is 0 / 1 (d) / 2 (a) and click is 1 while "0" is held.
        """
        pressed = keyboard.is_pressed
        cursorX, _ = mouse.get_position()
        xMovement = self.smoothMouseMovement((cursorX - self.screenW / 2) / self.MOUSEDISCOUNT)

        # single keys: "w" wins over "s", "d" wins over "a"
        ws = 1 if pressed("w") else (2 if pressed("s") else 0)
        ad = 1 if pressed("d") else (2 if pressed("a") else 0)

        # the first combination found pressed overrides both axes
        for combo, comboWs, comboAd in (
            ("w+d", 1, 1),
            ("w+a", 1, 2),
            ("s+d", 2, 1),
            ("s+a", 2, 2),
        ):
            if pressed(combo):
                ws, ad = comboWs, comboAd
                break

        click = 1 if pressed("0") else 0

        actions = np.asarray([[ws, ad, click, xMovement]])
        mouse.move(self.screenW / 2, self.screenH / 2)
        return actions

    def smoothMouseMovement(self, x: float):
        """Squash ``x`` with a logistic curve into (-mouseMax, mouseMax); 0 maps to 0."""
        logistic = 1 / (1 + math.exp(-x / self.mouseSmooth))
        return (logistic - 1 / 2) * self.mouseMax * 2
if __name__ == "__main__":
    # Human demonstration recorder: drives the environment with keyboard/mouse input
    # and stores the (state, action) pairs in a GAILMem.
    env = Aimbot(
        envPath=ENV_PATH,
        workerID=WORKER_ID,
        basePort=BASE_PORT,
        side_channels=[],
    )
    demoMem = GAILMem(4)
    demoAct = HumanActions(mouseDiscount=MOUSEDISCOUNT)

    # TODO: periodic saving is not implemented yet; these two values are not used.
    # nowMemNum = demoMem.memNum
    saveSteps = 500
    lastMemCheckPoint = 0

    # Fetch the first observation so every action is stored with the state it was
    # chosen in; the observation returned AFTER the step used to be recorded instead.
    state, _, _ = env.reset()
    for ep in range(MAX_EP):
        print("EP Start")
        done = False
        while not done:
            actions = demoAct.getHumanActions()
            nextState, r, done = env.step(actions=actions)
            demoMem.saveMems(state=state, actorProb=None, action=actions, reward=None, done=None)
            state = nextState

View File

@ -47,7 +47,7 @@ CLIP_COEF = 0.1
POLICY_COEF = 1.0
ENTROPY_COEF = 0.01
CRITIC_COEF = 0.5
TARGET_LEARNING_RATE = 5e-5
TARGET_LEARNING_RATE = 1e-5
ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
@ -159,23 +159,24 @@ class PPOAgent(nn.Module):
self.discrete_shape = list(env.unity_discrete_branches)
self.continuous_size = env.unity_continuous_size
self.network = nn.Sequential(
layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 500)),
self.network = nn.ModuleList([nn.Sequential(
layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 300)),
nn.ReLU(),
layer_init(nn.Linear(500, 300)),
nn.ReLU(),
)
self.actor_dis = nn.ModuleList([layer_init(nn.Linear(300, self.discrete_size), std=0.01) for i in range(targetNum)])
self.actor_mean = nn.ModuleList([layer_init(nn.Linear(300, self.continuous_size), std=0.01) for i in range(targetNum)])
layer_init(nn.Linear(300, 200)),
nn.ReLU()) for i in range(targetNum)])
self.actor_dis = nn.ModuleList([layer_init(nn.Linear(200, self.discrete_size), std=0.01) for i in range(targetNum)])
self.actor_mean = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=0.01) for i in range(targetNum)])
self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(targetNum)])
self.critic = layer_init(nn.Linear(300, 1), std=1)
self.critic = nn.ModuleList([layer_init(nn.Linear(200, 1), std=1)for i in range(targetNum)])
def get_value(self, state: torch.Tensor):
return self.critic(self.network(state))
targets = state[:,0].to(torch.int32)
hidden = torch.stack([self.network[targets[i]](state[i]) for i in range(targets.size()[0])])
return torch.stack([self.critic[targets[i]](hidden[i])for i in range(targets.size()[0])])
def get_actions_value(self, state: torch.Tensor, actions=None):
hidden = self.network(state)
targets = state[:,0].to(torch.int32)
hidden = torch.stack([self.network[targets[i]](state[i]) for i in range(targets.size()[0])])
# discrete
# 递归targets的数量,既agent数来实现根据target不同来选用对应的输出网络计算输出
@ -188,6 +189,8 @@ class PPOAgent(nn.Module):
# print(action_logstd)
action_std = torch.squeeze(torch.stack([torch.exp(self.actor_logstd[targets[i]]) for i in range(targets.size()[0])]),dim = -1) # torch.exp(action_logstd)
con_probs = Normal(actions_mean, action_std)
# critic
criticV = torch.stack([self.critic[targets[i]](hidden[i])for i in range(targets.size()[0])])
if actions is None:
if args.train:
@ -207,13 +210,14 @@ class PPOAgent(nn.Module):
[ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
)
dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
return (
actions,
dis_log_prob.sum(0),
dis_entropy.sum(0),
con_probs.log_prob(conAct).sum(1),
con_probs.entropy().sum(1),
self.critic(hidden),
criticV,
)
@ -436,7 +440,7 @@ if __name__ == "__main__":
thisRewardsTensor,
torch.Tensor(dones_bf[i]).to(device),
torch.tensor(values_bf[i]).to(device),
torch.tensor(next_state[i]).to(device),
torch.tensor([next_state[i]]).to(device),
torch.Tensor([next_done[i]]).to(device),
)
# send memories to training datasets
@ -642,7 +646,6 @@ if __name__ == "__main__":
# record rewards for plotting purposes
writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])

View File

@ -792,6 +792,29 @@
"source": [
"env.close()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"float"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import torch\n",
"\n",
"aa = torch.Tensor([[1,2,3],[2,2,3],[3,2,3],[4,2,3]]).to(\"cuda\")\n",
"type(torch.mean(aa).item())"
]
}
],
"metadata": {