Compare commits: OffP-FullM...GAIL (2 commits: 4b8ffeac6d, ad9817e7a4)

Aimbot-PPO-Python/Pytorch/GAIL.py (new file, 679 lines)
@@ -0,0 +1,679 @@
import os
import argparse
import wandb
import time
import numpy as np
import random
import uuid
import torch
import torch.nn as nn
import torch.optim as optim

from AimbotEnv import Aimbot
from tqdm import tqdm
from enum import Enum
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from distutils.util import strtobool
from torch.utils.tensorboard import SummaryWriter
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.side_channel import (
    SideChannel,
    IncomingMessage,
    OutgoingMessage,
)
from typing import List
from GAILMem import GAILMem

bestReward = 0

useCUDA = True
DEFAULT_SEED = 9331
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy/Aimbot-ParallelEnv"
EXPERT_PATH = "NAN"
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
WAND_ENTITY = "koha9"
WORKER_ID = 2
BASE_PORT = 1001

# max round steps per agent is 2500/Decision_period, 25 seconds
# !!!check every parameter before running!!!

TOTAL_STEPS = 6750000
BATCH_SIZE = 512
MAX_TRAINNING_DATASETS = 3000
DECISION_PERIOD = 1
LEARNING_RATE = 1e-3
GAMMA = 0.99
GAE_LAMBDA = 0.95
EPOCHS = 4
CLIP_COEF = 0.1
POLICY_COEF = 1.0
ENTROPY_COEF = 0.01
CRITIC_COEF = 0.5
TARGET_LEARNING_RATE = 1e-5
DSCM_ENHANCED = 1

ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = True
TRAIN = True

WANDB_TACK = True
LOAD_DIR = None
# LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"


# public data
class Targets(Enum):
    Free = 0
    Go = 1
    Attack = 2
    Defence = 3
    Num = 4


BASE_WINREWARD = 999
BASE_LOSEREWARD = -999
TARGETNUM = 4
ENV_TIMELIMIT = 30
RESULT_BROADCAST_RATIO = 2 / ENV_TIMELIMIT
TotalRounds = {"Free": 0, "Go": 0, "Attack": 0}
WinRounds = {"Free": 0, "Go": 0, "Attack": 0}
EPS = 1e-8

# !!!SPECIAL PARAMETERS!!!
# change these when the program is finished
using_targets_num = 3

def parse_args():
    # fmt: off
    # pytorch and environment parameters
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
        help="seed of the experiment")
    parser.add_argument("--path", type=str, default=ENV_PATH,
        help="environment path")
    parser.add_argument("--workerID", type=int, default=WORKER_ID,
        help="unity worker ID")
    parser.add_argument("--baseport", type=int, default=BASE_PORT,
        help="port to connect to the Unity environment")
    parser.add_argument("--lr", type=float, default=LEARNING_RATE,
        help="the learning rate of the optimizer")
    parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="if toggled, cuda will be enabled by default")
    parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
        help="total timesteps of the experiments")

    # model parameters
    parser.add_argument("--train", type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
        help="train the model or not")
    parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
        help="training dataset size; start training once the dataset has collected enough data")
    parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
        help="minibatch size")
    parser.add_argument("--epochs", type=int, default=EPOCHS,
        help="the K epochs to update the policy")
    parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
        help="toggle learning rate annealing for the policy and value networks")
    parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True,
        help="track the run on wandb")
    parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY,
        help="the entity (team) of wandb's project")
    parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
        help="load model directory")
    parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
        help="the number of steps to run in each environment per policy rollout")

    # GAE loss
    parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="use GAE for advantage computation")
    parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
        help="toggles advantages normalization")
    parser.add_argument("--gamma", type=float, default=GAMMA,
        help="the discount factor gamma")
    parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
        help="the lambda for the general advantage estimation")
    parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
        help="the surrogate clipping coefficient")
    parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
        help="coefficient of the policy loss")
    parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF,
        help="coefficient of the entropy loss")
    parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
        help="coefficient of the value function loss")
    parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True,
        help="toggles whether or not to use a clipped loss for the value function, as per the paper")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
        help="the maximum norm for gradient clipping")
    parser.add_argument("--target-kl", type=float, default=None,
        help="the target KL divergence threshold")
    # fmt: on
    args = parser.parse_args()
    return args


def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer

class PPOAgent(nn.Module):
    def __init__(self, env: Aimbot, targetNum: int):
        super(PPOAgent, self).__init__()
        self.targetNum = targetNum
        self.discrete_size = env.unity_discrete_size
        self.discrete_shape = list(env.unity_discrete_branches)
        self.continuous_size = env.unity_continuous_size

        self.network = nn.ModuleList(
            [
                nn.Sequential(
                    layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 300)),
                    nn.ReLU(),
                    layer_init(nn.Linear(300, 200)),
                    nn.ReLU(),
                )
                for i in range(targetNum)
            ]
        )
        self.actor_dis = nn.ModuleList(
            [layer_init(nn.Linear(200, self.discrete_size), std=0.01) for i in range(targetNum)]
        )
        self.actor_mean = nn.ModuleList(
            [layer_init(nn.Linear(200, self.continuous_size), std=0.01) for i in range(targetNum)]
        )
        self.actor_logstd = nn.ParameterList(
            [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(targetNum)]
        )
        self.critic = nn.ModuleList(
            [layer_init(nn.Linear(200, 1), std=1) for i in range(targetNum)]
        )

    def get_value(self, state: torch.Tensor):
        targets = state[:, 0].to(torch.int32)
        hidden = torch.stack([self.network[targets[i]](state[i]) for i in range(targets.size()[0])])
        return torch.stack([self.critic[targets[i]](hidden[i]) for i in range(targets.size()[0])])

    def get_actions_value(self, state: torch.Tensor, actions=None):
        targets = state[:, 0].to(torch.int32)
        hidden = torch.stack([self.network[targets[i]](state[i]) for i in range(targets.size()[0])])

        # discrete
        # iterate over the targets (i.e. the number of agents) so that each agent's target
        # selects the matching output head
        dis_logits = torch.stack(
            [self.actor_dis[targets[i]](hidden[i]) for i in range(targets.size()[0])]
        )
        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
        # continuous
        actions_mean = torch.stack(
            [self.actor_mean[targets[i]](hidden[i]) for i in range(targets.size()[0])]
        )  # self.actor_mean(hidden)
        # action_logstd = torch.stack([self.actor_logstd[targets[i]].expand_as(actions_mean) for i in range(targets.size()[0])])  # self.actor_logstd.expand_as(actions_mean)
        # print(action_logstd)
        action_std = torch.squeeze(
            torch.stack(
                [torch.exp(self.actor_logstd[targets[i]]) for i in range(targets.size()[0])]
            ),
            dim=-1,
        )  # torch.exp(action_logstd)
        con_probs = Normal(actions_mean, action_std)
        # critic
        criticV = torch.stack(
            [self.critic[targets[i]](hidden[i]) for i in range(targets.size()[0])]
        )

        if actions is None:
            if TRAIN:
                # sample actions from the predicted probability distributions
                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
                conAct = con_probs.sample()
                actions = torch.cat([disAct.T, conAct], dim=1)
            else:
                # pick the most probable actions
                disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
                conAct = actions_mean
                actions = torch.cat([disAct.T, conAct], dim=1)
        else:
            disAct = actions[:, 0 : env.unity_discrete_type].T
            conAct = actions[:, env.unity_discrete_type :]
        dis_log_prob = torch.stack(
            [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
        )
        dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])

        return (
            actions,
            dis_log_prob.sum(0),
            dis_entropy.sum(0),
            con_probs.log_prob(conAct).sum(1),
            con_probs.entropy().sum(1),
            criticV,
        )

def GAE(agent, args, rewards, dones, values, next_obs, next_done):
    # GAE
    with torch.no_grad():
        next_value = agent.get_value(next_obs).reshape(1, -1)
        data_size = rewards.size()[0]
        if args.gae:
            advantages = torch.zeros_like(rewards).to(device)
            lastgaelam = 0
            for t in reversed(range(data_size)):
                if t == data_size - 1:
                    nextnonterminal = 1.0 - next_done
                    nextvalues = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    nextvalues = values[t + 1]
                delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                advantages[t] = lastgaelam = (
                    delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
                )
            returns = advantages + values
        else:
            returns = torch.zeros_like(rewards).to(device)
            for t in reversed(range(data_size)):
                if t == data_size - 1:
                    nextnonterminal = 1.0 - next_done
                    next_return = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    next_return = returns[t + 1]
                returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
            advantages = returns - values
    return advantages, returns

class AimbotSideChannel(SideChannel):
    def __init__(self, channel_id: uuid.UUID) -> None:
        super().__init__(channel_id)

    def on_message_received(self, msg: IncomingMessage) -> None:
        """
        Note: We must implement this method of the SideChannel interface to
        receive messages from Unity
        """
        thisMessage = msg.read_string()
        # print(thisMessage)
        thisResult = thisMessage.split("|")
        if thisResult[0] == "result":
            TotalRounds[thisResult[1]] += 1
            if thisResult[2] == "Win":
                WinRounds[thisResult[1]] += 1
            # print(TotalRounds)
            # print(WinRounds)
        elif thisResult[0] == "Error":
            print(thisMessage)

    # send functions
    def send_string(self, data: str) -> None:
        # send a string to C#
        msg = OutgoingMessage()
        msg.write_string(data)
        super().queue_message_to_send(msg)

    def send_bool(self, data: bool) -> None:
        msg = OutgoingMessage()
        msg.write_bool(data)
        super().queue_message_to_send(msg)

    def send_int(self, data: int) -> None:
        msg = OutgoingMessage()
        msg.write_int32(data)
        super().queue_message_to_send(msg)

    def send_float(self, data: float) -> None:
        msg = OutgoingMessage()
        msg.write_float32(data)
        super().queue_message_to_send(msg)

    def send_float_list(self, data: List[float]) -> None:
        msg = OutgoingMessage()
        msg.write_float32_list(data)
        super().queue_message_to_send(msg)

def broadCastEndReward(rewardBF: list, remainTime: float):
    thisRewardBF = rewardBF
    if rewardBF[-1] <= -500:
        # print("Lose DO NOT BROAD CAST", rewardBF[-1])
        thisRewardBF[-1] = rewardBF[-1] - BASE_LOSEREWARD
        thisRewardBF = (np.asarray(thisRewardBF)).tolist()
    elif rewardBF[-1] >= 500:
        # print("Win! Broadcast reward!", rewardBF[-1])
        thisRewardBF[-1] = rewardBF[-1] - BASE_WINREWARD
        thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * RESULT_BROADCAST_RATIO)).tolist()
    else:
        print("!!!!!DIDNT GET RESULT REWARD!!!!!!", rewardBF[-1])
    return torch.Tensor(thisRewardBF).to(device)

class Discriminator(nn.Module):
    def __init__(self, env: Aimbot, targetNum: int):
        super(Discriminator, self).__init__()
        self.targetNum = targetNum
        # flattened observation size plus action size: the discriminator scores (state, action) pairs
        self.trajectory_size = int(np.array(env.unity_observation_shape).prod() + env.unity_action_size)

        self.network = nn.ModuleList(
            [
                nn.Sequential(
                    layer_init(nn.Linear(self.trajectory_size, 300)),
                    nn.ReLU(),
                    layer_init(nn.Linear(300, 200)),
                    nn.ReLU(),
                )
                for i in range(targetNum)
            ]
        )
        # sigmoid keeps the discriminator output in (0, 1), as expected by the log-loss used in training
        self.output = nn.ModuleList(
            [
                nn.Sequential(layer_init(nn.Linear(200, 1), std=0.01), nn.Sigmoid())
                for i in range(targetNum)
            ]
        )

    def get_value(self, state: torch.Tensor):
        targets = state[:, 0].to(torch.int32)
        hidden = torch.stack([self.network[targets[i]](state[i]) for i in range(targets.size()[0])])
        return torch.stack([self.output[targets[i]](hidden[i]) for i in range(targets.size()[0])])

if __name__ == "__main__":
    args = parse_args()
    random.seed(DEFAULT_SEED)
    np.random.seed(DEFAULT_SEED)
    torch.manual_seed(DEFAULT_SEED)

    device = torch.device("cuda" if torch.cuda.is_available() and useCUDA else "cpu")

    aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID)
    env = Aimbot(
        envPath=ENV_PATH, workerID=WORKER_ID, basePort=BASE_PORT, side_channels=[aimBotsideChannel]
    )
    if LOAD_DIR is None:
        agent = PPOAgent(env, TARGETNUM).to(device)
        discriminator = Discriminator(env, TARGETNUM).to(device)
    else:
        print("NOT AVAILABLE")
        agent = torch.load(LOAD_DIR)
        print("Load Agent", LOAD_DIR)
        print(agent.eval())

    # Optimizers
    PPOoptimizer = optim.Adam(agent.parameters(), lr=LEARNING_RATE, eps=1e-5)
    DSCMoptimizer = optim.Adam(discriminator.parameters(), lr=LEARNING_RATE, eps=1e-5)

    # Tensorboard and WandB Recorder
    game_name = "Aimbot_GAIL"
    game_type = "ORG"
    run_name = f"{game_name}_{game_type}_{DEFAULT_SEED}_{int(time.time())}"
    if WANDB_TACK:
        wandb.init(
            project=game_name,
            entity=WAND_ENTITY,
            sync_tensorboard=True,
            config=vars(args),
            name=run_name,
            monitor_gym=True,
            save_code=True,
        )
    writer = SummaryWriter(f"runs/{run_name}")
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s"
        % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )

    agentMem = GAILMem(TARGETNUM)
    expertMem = GAILMem(TARGETNUM)
    expertMem.loadMemFile(EXPERT_PATH)

    # start the game
    total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
    target_steps = [0 for i in range(TARGETNUM)]
    start_time = time.time()
    state, _, done = env.reset()
    # Trajectory Buffer (one per agent, flushed into the datasets when a round ends)
    ob_bf = [[] for i in range(env.unity_agent_num)]
    act_bf = [[] for i in range(env.unity_agent_num)]
    dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
    con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
    rewards_bf = [[] for i in range(env.unity_agent_num)]
    dones_bf = [[] for i in range(env.unity_agent_num)]
    values_bf = [[] for i in range(env.unity_agent_num)]

    # initialize empty training datasets
    obs = [
        torch.tensor([]).to(device) for i in range(TARGETNUM)
    ]  # (TARGETNUM,n,env.unity_observation_size)
    actions = [
        torch.tensor([]).to(device) for i in range(TARGETNUM)
    ]  # (TARGETNUM,n,env.unity_action_size)
    dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
    con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
    rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
    values = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
    dones = [torch.tensor([]).to(device) for i in range(TARGETNUM)]

    next_state_buffer = [[] for i in range(TARGETNUM)]
    next_done_buffer = [[] for i in range(TARGETNUM)]
    for total_steps in range(total_update_step):
        print("new episode")
        trainQueue = []
        while True:
            with torch.no_grad():
                # predict actions
                action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
                    torch.Tensor(state).to(device)
                )
                value = value.flatten()

            # move variables from GPU to CPU
            action_cpu = action.cpu().numpy()
            dis_logprob_cpu = dis_logprob.cpu().numpy()
            con_logprob_cpu = con_logprob.cpu().numpy()
            value_cpu = value.cpu().numpy()

            # Environment step
            next_state, reward, next_done = env.step(action_cpu)

            # save mem
            for i in range(env.unity_agent_num):
                # save memories to buffers
                ob_bf[i].append(state[i])
                act_bf[i].append(action_cpu[i])
                dis_logprobs_bf[i].append(dis_logprob_cpu[i])
                con_logprobs_bf[i].append(con_logprob_cpu[i])
                rewards_bf[i].append(reward[i])
                dones_bf[i].append(done[i])
                values_bf[i].append(value_cpu[i])
                if next_done[i]:
                    # finished a round, send the finished memories to the training datasets
                    thisTarget = int(state[i, 0])
                    next_state_buffer[thisTarget] = next_state
                    next_done_buffer[thisTarget] = next_done
                    obs[thisTarget] = torch.cat(
                        (obs[thisTarget], torch.tensor(ob_bf[i]).to(device)), 0
                    )
                    actions[thisTarget] = torch.cat(
                        (actions[thisTarget], torch.tensor(act_bf[i]).to(device)), 0
                    )
                    dis_logprobs[thisTarget] = torch.cat(
                        (dis_logprobs[thisTarget], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
                    )
                    con_logprobs[thisTarget] = torch.cat(
                        (con_logprobs[thisTarget], torch.tensor(con_logprobs_bf[i]).to(device)), 0
                    )
                    # environment rewards are kept per target only for the reward logging below
                    rewards[thisTarget] = torch.cat(
                        (rewards[thisTarget], torch.tensor(rewards_bf[i]).to(device)), 0
                    )
                    values[thisTarget] = torch.cat(
                        (values[thisTarget], torch.tensor(values_bf[i]).to(device)), 0
                    )
                    dones[thisTarget] = torch.cat(
                        (dones[thisTarget], torch.tensor(dones_bf[i]).to(device)), 0
                    )
                    # reset this agent's per-round buffers so finished rounds are not
                    # appended to the datasets again (assumed)
                    ob_bf[i] = []
                    act_bf[i] = []
                    dis_logprobs_bf[i] = []
                    con_logprobs_bf[i] = []
                    rewards_bf[i] = []
                    dones_bf[i] = []
                    values_bf[i] = []
            for i in range(TARGETNUM):
                if obs[i].size()[0] >= args.datasetSize:
                    # this target has collected enough data, start training the NN
                    trainQueue.append(i)
            if len(trainQueue) > 0:
                break
            state, done = next_state, next_done

        if args.train:
            meanRewardList = []
            for thisT in trainQueue:
                target_steps[thisT] += 1
                # get agent training datasets
                b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
                b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
                b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
                b_con_logprobs = con_logprobs[thisT].reshape(-1)
                b_values = values[thisT].reshape(-1)
                b_dones = dones[thisT].reshape(-1)
                dataNum = b_obs.size()[0]
                # get expert training datasets from GAILMem
                exp_obs, _, exp_actions, _, _ = expertMem.getRandomSample(dataNum, thisT)
                # expert samples come back as ndarrays; move them onto the training device
                exp_obs = torch.tensor(exp_obs, dtype=torch.float32).to(device)
                exp_actions = torch.tensor(exp_actions, dtype=torch.float32).to(device)

                # trajectories: (state, action) pairs for the discriminator
                agent_trajectory = torch.cat((b_obs, b_actions), dim=1)
                exp_trajectory = torch.cat((exp_obs, exp_actions), dim=1)

                # discriminator ACC
                with torch.no_grad():
                    thisDSCM_agent_acc = torch.mean(discriminator.get_value(agent_trajectory))
                    thisDSCM_expt_acc = torch.mean(discriminator.get_value(exp_trajectory))
                # train discriminator
                b_inds = np.arange(dataNum)
                for epoch in range(args.epochs):
                    np.random.shuffle(b_inds)
                    for start in range(0, dataNum, args.minibatchSize):
                        end = start + args.minibatchSize
                        mb_inds = b_inds[start:end]
                        exp_value = discriminator.get_value(exp_trajectory[mb_inds])
                        agent_value = discriminator.get_value(agent_trajectory[mb_inds])
                        # DSCM loss function: push D(expert) towards 1 and D(agent) towards 0
                        exp_loss = torch.log(1.0 - exp_value + EPS)
                        agent_loss = torch.log(agent_value + EPS)
                        loss = (exp_loss + agent_loss).mean()
                        # DSCM backward
                        DSCMoptimizer.zero_grad()
                        loss.backward()
                        nn.utils.clip_grad_norm_(discriminator.parameters(), args.max_grad_norm)
                        DSCMoptimizer.step()
                # get discriminator reward
                with torch.no_grad():
                    DSCMReward = discriminator.get_value(agent_trajectory).squeeze(-1) * DSCM_ENHANCED
                # NOTE: next_state_buffer[thisT] holds the whole parallel-env observation batch,
                # and the per-target dataset is treated as one concatenated trajectory here
                advantages, rts = GAE(
                    agent,
                    args,
                    DSCMReward,
                    b_dones,
                    b_values,
                    torch.Tensor(next_state_buffer[thisT]).to(device),
                    torch.Tensor(next_done_buffer[thisT]).to(device),
                )
                b_advantages = advantages.reshape(-1)
                b_returns = rts.reshape(-1)

                # train PPO agent
                for epoch in range(args.epochs):
                    np.random.shuffle(b_inds)
                    for start in range(0, dataNum, args.minibatchSize):
                        end = start + args.minibatchSize
                        mb_inds = b_inds[start:end]
                        mb_advantages = b_advantages[mb_inds]
                        # normalize advantages
                        if args.norm_adv:
                            mb_advantages = (mb_advantages - mb_advantages.mean()) / (
                                mb_advantages.std() + 1e-8
                            )
                        (
                            _,
                            new_dis_logprob,
                            dis_entropy,
                            new_con_logprob,
                            con_entropy,
                            newvalue,
                        ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
                        # discrete ratio
                        dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
                        dis_ratio = dis_logratio.exp()
                        # continuous ratio
                        con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
                        con_ratio = con_logratio.exp()
                        # discrete policy loss
                        dis_pg_loss_orig = -mb_advantages * dis_ratio
                        dis_pg_loss_clip = -mb_advantages * torch.clamp(
                            dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
                        )
                        dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
                        # continuous policy loss
                        con_pg_loss_orig = -mb_advantages * con_ratio
                        con_pg_loss_clip = -mb_advantages * torch.clamp(
                            con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
                        )
                        con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
                        # value loss
                        newvalue = newvalue.view(-1)
                        if args.clip_vloss:
                            v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                            v_clipped = b_values[mb_inds] + torch.clamp(
                                newvalue - b_values[mb_inds],
                                -args.clip_coef,
                                args.clip_coef,
                            )
                            v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                            v_loss = 0.5 * v_loss_max.mean()
                        else:
                            v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
                        # total loss
                        entropy_loss = dis_entropy.mean() + con_entropy.mean()
                        loss = (
                            dis_pg_loss * args.policy_coef
                            + con_pg_loss * args.policy_coef
                            - entropy_loss * args.ent_coef
                            + v_loss * args.critic_coef
                        )
                        PPOoptimizer.zero_grad()
                        loss.backward()
                        # clip the gradient norm of the agent's parameters
                        nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                        PPOoptimizer.step()
                # record the mean reward before clearing the history
                targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
                meanRewardList.append(targetRewardMean)
                targetName = Targets(thisT).name

                # clear this target's training set buffer
                obs[thisT] = torch.tensor([]).to(device)
                actions[thisT] = torch.tensor([]).to(device)
                dis_logprobs[thisT] = torch.tensor([]).to(device)
                con_logprobs[thisT] = torch.tensor([]).to(device)
                rewards[thisT] = torch.tensor([]).to(device)
                values[thisT] = torch.tensor([]).to(device)
                dones[thisT] = torch.tensor([]).to(device)

                # record rewards for plotting purposes
                writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName] / TotalRounds[targetName], target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/Discriminator_EXP_ACC", thisDSCM_expt_acc, target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/Discriminator_Agent_ACC", thisDSCM_agent_acc, target_steps[thisT])
                print(f"episode over Target{targetName} mean reward:", targetRewardMean)
            TotalRewardMean = np.mean(meanRewardList)
            writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
            writer.add_scalar("GlobalCharts/learning_rate", PPOoptimizer.param_groups[0]["lr"], total_steps)
            # New Record!
            if TotalRewardMean > bestReward:
                bestReward = TotalRewardMean
                saveDir = "../PPO-Model/Hybrid-MNN-500-300" + str(TotalRewardMean) + ".pt"
                torch.save(agent, saveDir)

    # train PPO
    # record
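Note (reviewer sketch, not part of the diff): the discriminator update in GAIL.py minimizes log(1 - D(expert)) + log(D(agent)), which drives D(expert) toward 1 and D(agent) toward 0; the policy is then rewarded with D(agent trajectory), so the reward grows as the policy becomes harder to tell apart from the expert. Assuming the sigmoid output head used above, an equivalent formulation with PyTorch's built-in binary cross-entropy (same optimum, numerically more stable) would be:

import torch
import torch.nn.functional as F

def discriminator_loss(exp_value: torch.Tensor, agent_value: torch.Tensor) -> torch.Tensor:
    # expert samples labelled 1, agent samples labelled 0
    exp_loss = F.binary_cross_entropy(exp_value, torch.ones_like(exp_value))
    agent_loss = F.binary_cross_entropy(agent_value, torch.zeros_like(agent_value))
    return exp_loss + agent_loss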
Aimbot-PPO-Python/Pytorch/GAILMem.py (new file, 176 lines)
@@ -0,0 +1,176 @@
import os
import random
import numpy as np


class GAILMem():
    def __init__(self, targetNum):
        self.targetNum = targetNum
        self.states = [[] for i in range(self.targetNum)]
        self.actorProbs = [[] for i in range(self.targetNum)]
        self.actions = [[] for i in range(self.targetNum)]
        self.rewards = [[] for i in range(self.targetNum)]
        self.dones = [[] for i in range(self.targetNum)]
        self.memNum = [0 for i in range(self.targetNum)]

    def clearMem(self, targetType):
        """clear the memories of one target type"""
        self.states[targetType] = []
        self.actorProbs[targetType] = []
        self.actions[targetType] = []
        self.rewards[targetType] = []
        self.dones[targetType] = []
        self.memNum[targetType] = 0

    def saveMemtoFile(self, dir: str):
        """save memories as ndarrays to npz files

        Args:
            dir (str): save directory, like "GAIL-Expert-Data/", ending with "/"
        """
        for i in range(self.targetNum):
            statesNP = np.asarray(self.states[i])
            actorProbsNP = np.asarray(self.actorProbs[i])
            actionsNP = np.asarray(self.actions[i])
            rewardsNP = np.asarray(self.rewards[i])
            donesNP = np.asarray(self.dones[i])
            # one pack per target type (naming assumed so that save and load stay consistent)
            thisSaveDir = dir + "pack-" + str(i)
            try:
                np.savez(
                    thisSaveDir,
                    states=statesNP,
                    actorProbs=actorProbsNP,
                    actions=actionsNP,
                    rewards=rewardsNP,
                    dones=donesNP,
                )
            except FileNotFoundError:
                os.mkdir(dir)
                np.savez(
                    thisSaveDir,
                    states=statesNP,
                    actorProbs=actorProbsNP,
                    actions=actionsNP,
                    rewards=rewardsNP,
                    dones=donesNP,
                )

    def loadMemFile(self, dir: str):
        """load memories from npz files

        Args:
            dir (str): file directory
        """
        for i in range(self.targetNum):
            self.clearMem(i)
            loadDir = dir + "pack-" + str(i) + ".npz"
            memFile = np.load(loadDir, allow_pickle=True)
            self.states[i] = memFile["states"].tolist()
            self.actorProbs[i] = memFile["actorProbs"].tolist()
            self.actions[i] = memFile["actions"].tolist()
            self.rewards[i] = memFile["rewards"].tolist()
            self.dones[i] = memFile["dones"].tolist()
            self.memNum[i] = len(self.states[i])

    def getRandomSample(self, sampleNum: int, targetType: int):
        """get a random sample set without replacement.

        Args:
            sampleNum (int): sample number; 0 returns all samples of this target type.
            targetType (int): target type to sample from.

        Returns:
            tuple: (states, actorProbs, actions, rewards, dones)
        """
        if sampleNum == 0:
            return (
                self.getStates(targetType),
                self.getActorProbs(targetType),
                self.getActions(targetType),
                self.getRewards(targetType),
                self.getDones(targetType),
            )
        else:
            randIndex = random.sample(range(0, self.memNum[targetType]), sampleNum)
            return (
                self.standDims(np.asarray(self.states[targetType])[randIndex]),
                self.standDims(np.asarray(self.actorProbs[targetType])[randIndex]),
                self.standDims(np.asarray(self.actions[targetType])[randIndex]),
                self.standDims(np.asarray(self.rewards[targetType])[randIndex]),
                self.standDims(np.asarray(self.dones[targetType])[randIndex]),
            )

    def getStates(self, targetType):
        """get all States data as ndarray

        Returns:
            ndarray: ndarray type State data
        """
        return self.standDims(np.asarray(self.states[targetType]))

    def getActorProbs(self, targetType):
        """get all ActorProbs data as ndarray

        Returns:
            ndarray: ndarray type ActorProbs data
        """
        return self.standDims(np.asarray(self.actorProbs[targetType]))

    def getActions(self, targetType):
        """get all Actions data as ndarray

        Returns:
            ndarray: ndarray type Actions data
        """
        return self.standDims(np.asarray(self.actions[targetType]))

    def getRewards(self, targetType):
        """get all Rewards data as ndarray

        Returns:
            ndarray: ndarray type Rewards data
        """
        return self.standDims(np.asarray(self.rewards[targetType]))

    def getDones(self, targetType):
        """get all Dones data as ndarray

        Returns:
            ndarray: ndarray type Dones data
        """
        return self.standDims(np.asarray(self.dones[targetType]))

    def standDims(self, data):
        """standardize the data's dimensions

        Args:
            data (list): data list

        Returns:
            ndarray: ndarray type data
        """
        if np.ndim(data) > 2:
            return np.squeeze(data, axis=1)
        elif np.ndim(data) < 2:
            return np.expand_dims(data, axis=1)
        else:
            return np.asarray(data)

    def saveMems(self, state, actorProb, action, reward, done):
        """save one step of memories

        Args:
            state (_type_): states
            actorProb (_type_): actor prediction result
            action (_type_): action chosen by the actor
            reward (_type_): reward
            done (_type_): done flag
        """
        targetType = int(state[0, 0])
        self.states[targetType].append(state)
        self.actorProbs[targetType].append(actorProb)
        self.actions[targetType].append(action)
        self.rewards[targetType].append(reward)
        self.dones[targetType].append(done)
        self.memNum[targetType] += 1
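Note (usage sketch, not part of the diff): GAILMem keeps one buffer per target type and round-trips them through .npz files. A minimal example, assuming the per-target "pack-<i>.npz" naming used above and hypothetical observation/action sizes:

import numpy as np
from GAILMem import GAILMem

mem = GAILMem(targetNum=4)
state = np.zeros((1, 30), dtype=np.float32)   # hypothetical observation; element [0, 0] encodes the target type
action = np.zeros((1, 4), dtype=np.float32)   # hypothetical action vector
mem.saveMems(state=state, actorProb=None, action=action, reward=0.0, done=False)
mem.saveMemtoFile("GAIL-Expert-Data/")        # writes one pack-<i>.npz per target type

expert = GAILMem(targetNum=4)
expert.loadMemFile("GAIL-Expert-Data/")       # restores the buffers and memNum per target
states, _, actions, _, _ = expert.getRandomSample(sampleNum=1, targetType=0)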
Aimbot-PPO-Python/Pytorch/GAILRecorder.py (new file, 97 lines)
@@ -0,0 +1,97 @@
import time
import numpy as np
from AimbotEnv import Aimbot
from GAILMem import GAILMem
import keyboard
import mouse
import math


# Env
ENV_PATH = "../Build/HUMAN-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy/Aimbot-ParallelEnv"
WORKER_ID = 1
BASE_PORT = 200

# ENV Para
MOUSEDISCOUNT = 20.0
MAX_EP = 10000000
STACKSTATESIZE = 3
STACKINTERCE = 29


class HumanActions:
    def __init__(self, mouseDiscount: float = 10, screenW: int = 1920, screenH: int = 1080):
        def multiPressed():
            pass

        keyboard.add_hotkey("w+a", multiPressed)
        keyboard.add_hotkey("w+d", multiPressed)
        keyboard.add_hotkey("s+a", multiPressed)
        keyboard.add_hotkey("s+d", multiPressed)
        self.screenW = screenW
        self.screenH = screenH
        self.MOUSEDISCOUNT = mouseDiscount
        self.mouseSmooth = 5
        self.mouseMax = 10

    def getHumanActions(self):
        x, _ = mouse.get_position()
        xMovement = (x - self.screenW / 2) / self.MOUSEDISCOUNT
        xMovement = self.smoothMouseMovement(xMovement)
        ws = 0
        ad = 0
        click = 0
        if keyboard.is_pressed("w"):
            ws = 1
        elif keyboard.is_pressed("s"):
            ws = 2
        if keyboard.is_pressed("d"):
            ad = 1
        elif keyboard.is_pressed("a"):
            ad = 2
        if keyboard.is_pressed("w+d"):
            ws = 1
            ad = 1
        elif keyboard.is_pressed("w+a"):
            ws = 1
            ad = 2
        elif keyboard.is_pressed("s+d"):
            ws = 2
            ad = 1
        elif keyboard.is_pressed("s+a"):
            ws = 2
            ad = 2
        if keyboard.is_pressed("0"):
            click = 1

        actions = np.asarray([[ws, ad, click, xMovement]])

        mouse.move(self.screenW / 2, self.screenH / 2)
        return actions

    def smoothMouseMovement(self, x: float):
        out = (1 / (1 + math.exp(-x / self.mouseSmooth)) - 1 / 2) * self.mouseMax * 2
        return out


if __name__ == "__main__":
    env = Aimbot(
        envPath=ENV_PATH,
        workerID=WORKER_ID,
        basePort=BASE_PORT,
        side_channels=[],
    )
    demoMem = GAILMem(4)
    demoAct = HumanActions(mouseDiscount=MOUSEDISCOUNT)

    for ep in range(MAX_EP):
        print("EP Start")
        done = False
        while not done:
            actions = demoAct.getHumanActions()
            nextState, r, done = env.step(actions=actions)
            demoMem.saveMems(state=nextState, actorProb=None, action=actions, reward=None, done=None)
            state = nextState
            # nowMemNum = demoMem.memNum
            saveSteps = 500
            lastMemCheckPoint = 0
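Note (sketch, not part of the diff): smoothMouseMovement in GAILRecorder.py is a scaled, centred sigmoid, so arbitrarily large mouse deltas are squashed into (-mouseMax, +mouseMax) before being recorded as the continuous action:

import math

def smooth(x: float, mouseSmooth: float = 5, mouseMax: float = 10) -> float:
    # same formula as HumanActions.smoothMouseMovement
    return (1 / (1 + math.exp(-x / mouseSmooth)) - 1 / 2) * mouseMax * 2

print(smooth(0.0))     # 0.0: no mouse movement maps to no turn
print(smooth(100.0))   # ~10.0: saturates near +mouseMax
print(smooth(-100.0))  # ~-10.0: saturates near -mouseMax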
@@ -47,7 +47,7 @@ CLIP_COEF = 0.1
 POLICY_COEF = 1.0
 ENTROPY_COEF = 0.01
 CRITIC_COEF = 0.5
-TARGET_LEARNING_RATE = 5e-5
+TARGET_LEARNING_RATE = 1e-5
 
 ANNEAL_LEARNING_RATE = True
 CLIP_VLOSS = True
@@ -159,23 +159,24 @@ class PPOAgent(nn.Module):
         self.discrete_shape = list(env.unity_discrete_branches)
         self.continuous_size = env.unity_continuous_size
 
-        self.network = nn.Sequential(
-            layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 500)),
+        self.network = nn.ModuleList([nn.Sequential(
+            layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 300)),
             nn.ReLU(),
-            layer_init(nn.Linear(500, 300)),
-            nn.ReLU(),
-        )
-        self.actor_dis = nn.ModuleList([layer_init(nn.Linear(300, self.discrete_size), std=0.01) for i in range(targetNum)])
-        self.actor_mean = nn.ModuleList([layer_init(nn.Linear(300, self.continuous_size), std=0.01) for i in range(targetNum)])
+            layer_init(nn.Linear(300, 200)),
+            nn.ReLU()) for i in range(targetNum)])
+        self.actor_dis = nn.ModuleList([layer_init(nn.Linear(200, self.discrete_size), std=0.01) for i in range(targetNum)])
+        self.actor_mean = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=0.01) for i in range(targetNum)])
         self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(targetNum)])
-        self.critic = layer_init(nn.Linear(300, 1), std=1)
+        self.critic = nn.ModuleList([layer_init(nn.Linear(200, 1), std=1)for i in range(targetNum)])
 
     def get_value(self, state: torch.Tensor):
-        return self.critic(self.network(state))
+        targets = state[:,0].to(torch.int32)
+        hidden = torch.stack([self.network[targets[i]](state[i]) for i in range(targets.size()[0])])
+        return torch.stack([self.critic[targets[i]](hidden[i])for i in range(targets.size()[0])])
 
     def get_actions_value(self, state: torch.Tensor, actions=None):
-        hidden = self.network(state)
+        targets = state[:,0].to(torch.int32)
+        hidden = torch.stack([self.network[targets[i]](state[i]) for i in range(targets.size()[0])])
 
         # discrete
         # iterate over the targets (i.e. the number of agents) so that each agent's target selects the matching output head
@@ -188,6 +189,8 @@ class PPOAgent(nn.Module):
         # print(action_logstd)
         action_std = torch.squeeze(torch.stack([torch.exp(self.actor_logstd[targets[i]]) for i in range(targets.size()[0])]),dim = -1) # torch.exp(action_logstd)
         con_probs = Normal(actions_mean, action_std)
+        # critic
+        criticV = torch.stack([self.critic[targets[i]](hidden[i])for i in range(targets.size()[0])])
 
         if actions is None:
             if args.train:
@@ -207,13 +210,14 @@ class PPOAgent(nn.Module):
             [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
         )
         dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
 
         return (
             actions,
             dis_log_prob.sum(0),
             dis_entropy.sum(0),
             con_probs.log_prob(conAct).sum(1),
             con_probs.entropy().sum(1),
-            self.critic(hidden),
+            criticV,
         )
 
@@ -436,7 +440,7 @@ if __name__ == "__main__":
                         thisRewardsTensor,
                         torch.Tensor(dones_bf[i]).to(device),
                         torch.tensor(values_bf[i]).to(device),
-                        torch.tensor(next_state[i]).to(device),
+                        torch.tensor([next_state[i]]).to(device),
                         torch.Tensor([next_done[i]]).to(device),
                     )
                     # send memories to training datasets
@@ -642,7 +646,6 @@
 
             # record rewards for plotting purposes
-            writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
             writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
             writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
             writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
             writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
@@ -792,6 +792,29 @@
    "source": [
     "env.close()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "float"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "\n",
+    "aa = torch.Tensor([[1,2,3],[2,2,3],[3,2,3],[4,2,3]]).to(\"cuda\")\n",
+    "type(torch.mean(aa).item())"
+   ]
+  }
 ],
 "metadata": {