import argparse
import wandb
import time
import numpy as np
import random
import uuid
import torch
import torch.nn as nn
import torch.optim as optim

from AimbotEnv import Aimbot
from tqdm import tqdm
from enum import Enum
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from distutils.util import strtobool
from torch.utils.tensorboard import SummaryWriter
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.side_channel import (
    SideChannel,
    IncomingMessage,
    OutgoingMessage,
)
from typing import List
bestReward = -1

DEFAULT_SEED = 9331
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.5-FreeOnly-NormalMapSize/Aimbot-ParallelEnv"
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
WAND_ENTITY = "koha9"
WORKER_ID = 1
BASE_PORT = 1000

# max round steps per agent is 2500/DECISION_PERIOD, i.e. 25 seconds
# !!!check every parameter before running!!!
TOTAL_STEPS = 3150000
BATCH_SIZE = 256
MAX_TRAINNING_DATASETS = 6000
DECISION_PERIOD = 1
LEARNING_RATE = 5e-4
GAMMA = 0.99
GAE_LAMBDA = 0.95
EPOCHS = 4
CLIP_COEF = 0.11
# per-target coefficients, ordered Free / Go / Attack / Defence
LOSS_COEF = [1.0, 1.0, 1.0, 1.0]
POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
ENTROPY_COEF = [0.1, 0.1, 0.1, 0.1]
CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
TARGET_LEARNING_RATE = 1e-6

ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = True
TRAIN = True
WANDB_TACK = True

LOAD_DIR = None
# LOAD_DIR = "../PPO-Model/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670522099-freeonly-12/Aimbot-target-last.pt"
# public data
class Targets(Enum):
    Free = 0
    Go = 1
    Attack = 2
    Defence = 3
    Num = 4


TARGET_STATE_SIZE = 7  # 6 + 1
TIME_STATE_SIZE = 1
GUN_STATE_SIZE = 1
MY_STATE_SIZE = 4
TOTAL_MIDDLE_STATE_SIZE = TARGET_STATE_SIZE + TIME_STATE_SIZE + GUN_STATE_SIZE + MY_STATE_SIZE
BASE_WINREWARD = 999
BASE_LOSEREWARD = -999
TARGETNUM = 4
ENV_TIMELIMIT = 30
RESULT_BROADCAST_RATIO = 1 / ENV_TIMELIMIT
TotalRounds = {"Free": 0, "Go": 0, "Attack": 0}
WinRounds = {"Free": 0, "Go": 0, "Attack": 0}
# !!!SPECIAL PARAMETERS!!!
# change this when the program is finished
using_targets_num = 3
def parse_args():
    # fmt: off
    # pytorch and environment parameters
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
        help="seed of the experiment")
    parser.add_argument("--path", type=str, default=ENV_PATH,
        help="environment path")
    parser.add_argument("--workerID", type=int, default=WORKER_ID,
        help="unity worker ID")
    parser.add_argument("--baseport", type=int, default=BASE_PORT,
        help="port to connect to Unity environment")
    parser.add_argument("--lr", type=float, default=LEARNING_RATE,
        help="the learning rate of the optimizer")
    parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="if toggled, cuda will be enabled by default")
    parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
        help="total timesteps of the experiments")
    # model parameters
    parser.add_argument("--train", type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
        help="train the model or not")
    parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
        help="training dataset size; start training once the dataset has collected enough data")
    parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
        help="mini batch size")
    parser.add_argument("--epochs", type=int, default=EPOCHS,
        help="the K epochs to update the policy")
    parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
        help="toggle learning rate annealing for policy and value networks")
    parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True,
        help="track the run with wandb")
    parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY,
        help="the entity (team) of wandb's project")
    parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
        help="load model directory")
    parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
        help="the number of steps to run in each environment per policy rollout")
    parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
        help="broadcast the round result when a round is won, r = result-broadcast-ratio * remainTime")
    # GAE loss
    parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="use GAE for advantage computation")
    parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
        help="toggles advantage normalization")
    parser.add_argument("--gamma", type=float, default=GAMMA,
        help="the discount factor gamma")
    parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
        help="the lambda for the general advantage estimation")
    parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
        help="the surrogate clipping coefficient")
    parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
        help="coefficient of the policy loss")
    parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF,
        help="coefficient of the entropy loss")
    parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
        help="coefficient of the value function loss")
    parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True,
        help="toggles whether or not to use a clipped loss for the value function, as per the paper")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
        help="the maximum norm for gradient clipping")
    parser.add_argument("--target-kl", type=float, default=None,
        help="the target KL divergence threshold")
    # fmt: on
    args = parser.parse_args()
    return args


def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    # orthogonal weight init and constant bias, the usual PPO initialization
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer
class PPOAgent(nn.Module):
    def __init__(self, env: Aimbot, targetNum: int):
        super(PPOAgent, self).__init__()
        self.targetNum = targetNum
        self.targetSize = TARGET_STATE_SIZE
        self.timeSize = TIME_STATE_SIZE
        self.gunSize = GUN_STATE_SIZE
        self.myStateSize = MY_STATE_SIZE
        self.totalMiddleSize = TOTAL_MIDDLE_STATE_SIZE
        # head input size: the raw observation without the target/time/gun state
        self.head_input_size = (
            env.unity_observation_shape[0] - self.targetSize - self.timeSize - self.gunSize
        )
        self.discrete_size = env.unity_discrete_size
        self.discrete_shape = list(env.unity_discrete_branches)
        self.continuous_size = env.unity_continuous_size

        # shared feature extractor
        self.network = nn.Sequential(
            layer_init(nn.Linear(self.head_input_size, 256)),
            nn.Tanh(),
            layer_init(nn.Linear(256, 200)),
            nn.Tanh(),
        )
        # one middle network, actor head and critic head per target type
        self.targetNetwork = nn.ModuleList(
            [
                nn.Sequential(
                    layer_init(nn.Linear(self.totalMiddleSize + 200, 128)),
                    nn.Tanh(),
                    layer_init(nn.Linear(128, 64)),
                    nn.Tanh(),
                )
                for i in range(targetNum)
            ]
        )
        self.actor_dis = nn.ModuleList(
            [layer_init(nn.Linear(64, self.discrete_size), std=0.01) for i in range(targetNum)]
        )
        self.actor_mean = nn.ModuleList(
            [layer_init(nn.Linear(64, self.continuous_size), std=0.01) for i in range(targetNum)]
        )
        self.actor_logstd = nn.ParameterList(
            [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(targetNum)]
        )
        self.critic = nn.ModuleList(
            [layer_init(nn.Linear(64, 1), std=1) for i in range(targetNum)]
        )

    def get_value(self, state: torch.Tensor):
        headInput = state[:, -self.head_input_size:]  # raycast input, without the target state
        hidden = self.network(headInput)  # (n, 200)
        targets = state[:, 0].to(torch.int32)  # target type of each agent
        middleInput = state[:, 0:self.totalMiddleSize]  # (n, totalMiddleSize)
        middleInput = torch.cat([middleInput, hidden], dim=1)  # (n, totalMiddleSize + 200)
        middleLayer = torch.stack(
            [self.targetNetwork[targets[i]](middleInput[i]) for i in range(targets.size()[0])]
        )
        return torch.stack(
            [self.critic[targets[i]](middleLayer[i]) for i in range(targets.size()[0])]
        )

    def get_actions_value(self, state: torch.Tensor, actions=None):
        headInput = state[:, -self.head_input_size:]  # raycast input, without the target state
        hidden = self.network(headInput)  # (n, 200)
        targets = state[:, 0].to(torch.int32)  # target type of each agent
        middleInput = state[:, 0:self.totalMiddleSize]  # (n, totalMiddleSize)
        middleInput = torch.cat([middleInput, hidden], dim=1)  # (n, totalMiddleSize + 200)
        middleLayer = torch.stack(
            [self.targetNetwork[targets[i]](middleInput[i]) for i in range(targets.size()[0])]
        )

        # discrete
        # loop over the targets (i.e. the agents) so that each agent is routed through
        # the output network that matches its target type
        dis_logits = torch.stack(
            [self.actor_dis[targets[i]](middleLayer[i]) for i in range(targets.size()[0])]
        )
        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
        # continuous
        actions_mean = torch.stack(
            [self.actor_mean[targets[i]](middleLayer[i]) for i in range(targets.size()[0])]
        )
        action_std = torch.squeeze(
            torch.stack([torch.exp(self.actor_logstd[targets[i]]) for i in range(targets.size()[0])]),
            dim=-1,
        )
        con_probs = Normal(actions_mean, action_std)
        # critic
        criticV = torch.stack(
            [self.critic[targets[i]](middleLayer[i]) for i in range(targets.size()[0])]
        )

        if actions is None:
            if args.train:
                # sample actions from the probability distributions
                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
                conAct = con_probs.sample()
                actions = torch.cat([disAct.T, conAct], dim=1)
            else:
                # pick the most probable actions
                disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
                conAct = actions_mean
                actions = torch.cat([disAct.T, conAct], dim=1)
        else:
            disAct = actions[:, 0:env.unity_discrete_type].T
            conAct = actions[:, env.unity_discrete_type:]
        dis_log_prob = torch.stack(
            [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
        )
        dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
        return (
            actions,
            dis_log_prob.sum(0),
            dis_entropy.sum(0),
            con_probs.log_prob(conAct).sum(1),
            con_probs.entropy().sum(1),
            criticV,
        )
def GAE(agent, args, rewards, dones, values, next_obs, next_done):
    # generalized advantage estimation
    with torch.no_grad():
        next_value = agent.get_value(next_obs).reshape(1, -1)
        data_size = rewards.size()[0]
        if args.gae:
            advantages = torch.zeros_like(rewards).to(device)
            lastgaelam = 0
            for t in reversed(range(data_size)):
                if t == data_size - 1:
                    nextnonterminal = 1.0 - next_done
                    nextvalues = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    nextvalues = values[t + 1]
                delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                advantages[t] = lastgaelam = (
                    delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
                )
            returns = advantages + values
        else:
            returns = torch.zeros_like(rewards).to(device)
            for t in reversed(range(data_size)):
                if t == data_size - 1:
                    nextnonterminal = 1.0 - next_done
                    next_return = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    next_return = returns[t + 1]
                returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
            advantages = returns - values
    return advantages, returns
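# For reference, the loop above implements the standard GAE(gamma, lambda) recursion,
# where nextnonterminal = 1 - dones[t + 1] stops bootstrapping across round boundaries:
#     delta_t       = rewards[t] + gamma * values[t + 1] * nextnonterminal - values[t]
#     advantages[t] = delta_t + gamma * gaeLambda * nextnonterminal * advantages[t + 1]
# and returns = advantages + values, which the value loss later regresses against.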
class AimbotSideChannel(SideChannel):
    def __init__(self, channel_id: uuid.UUID) -> None:
        super().__init__(channel_id)

    def on_message_received(self, msg: IncomingMessage) -> None:
        """
        Note: we must implement this method of the SideChannel interface to
        receive messages from Unity.
        """
        thisMessage = msg.read_string()
        # print(thisMessage)
        thisResult = thisMessage.split("|")
        if thisResult[0] == "result":
            TotalRounds[thisResult[1]] += 1
            if thisResult[2] == "Win":
                WinRounds[thisResult[1]] += 1
            # print(TotalRounds)
            # print(WinRounds)
        elif thisResult[0] == "Error":
            print(thisMessage)

    # helpers for sending data to the Unity (C#) side
    def send_string(self, data: str) -> None:
        msg = OutgoingMessage()
        msg.write_string(data)
        super().queue_message_to_send(msg)

    def send_bool(self, data: bool) -> None:
        msg = OutgoingMessage()
        msg.write_bool(data)
        super().queue_message_to_send(msg)

    def send_int(self, data: int) -> None:
        msg = OutgoingMessage()
        msg.write_int32(data)
        super().queue_message_to_send(msg)

    def send_float(self, data: float) -> None:
        msg = OutgoingMessage()
        msg.write_float32(data)
        super().queue_message_to_send(msg)

    def send_float_list(self, data: List[float]) -> None:
        msg = OutgoingMessage()
        msg.write_float32_list(data)
        super().queue_message_to_send(msg)
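# The messages handled above are produced on the Unity/C# side of the side channel as
# pipe-separated strings, e.g. "result|Free|Win" (example value, illustrative only):
# the second field must match a key of TotalRounds/WinRounds and the third field decides
# whether the win counter is incremented.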
def broadCastEndReward(rewardBF: list, remainTime: float):
    thisRewardBF = rewardBF
    if rewardBF[-1] <= -500:
        # lost round: strip the base lose reward, do not broadcast a bonus
        thisRewardBF[-1] = rewardBF[-1] - BASE_LOSEREWARD
    elif rewardBF[-1] >= 500:
        # won round: strip the base win reward and broadcast a time bonus over every step
        thisRewardBF[-1] = rewardBF[-1] - BASE_WINREWARD
        thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * args.result_broadcast_ratio)).tolist()
    else:
        print("!!!!!DIDN'T GET RESULT REWARD!!!!!!", rewardBF[-1])
    return torch.Tensor(thisRewardBF).to(device)
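# A quick worked example of the broadcast above (the 12 s value is illustrative): with the
# default --result-broadcast-ratio of 1/ENV_TIMELIMIT = 1/30, winning with 12 seconds left
# adds 12 * (1/30) = 0.4 to every step's reward of the finished round, after BASE_WINREWARD
# has been subtracted from the final step; a lost round only has BASE_LOSEREWARD stripped.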
if __name__ == "__main__":
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

    # Initialize environment, agent and optimizer
    aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID)
    env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport, side_channels=[aimBotsideChannel])
    if args.load_dir is None:
        agent = PPOAgent(env, TARGETNUM).to(device)
    else:
        agent = torch.load(args.load_dir)
        print("Load Agent", args.load_dir)
        print(agent.eval())
    optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)

    # TensorBoard and WandB recorders
    game_name = "Aimbot_Target_Hybrid_PMNN_V2"
    game_type = "OffPolicy_EndBC"
    run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
    if args.wandb_track:
        wandb.init(
            project=game_name,
            entity=args.wandb_entity,
            sync_tensorboard=True,
            config=vars(args),
            name=run_name,
            monitor_gym=True,
            save_code=True,
        )
    writer = SummaryWriter(f"runs/{run_name}")
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s"
        % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )
    # trajectory buffers, one per agent
    ob_bf = [[] for i in range(env.unity_agent_num)]
    act_bf = [[] for i in range(env.unity_agent_num)]
    dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
    con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
    rewards_bf = [[] for i in range(env.unity_agent_num)]
    dones_bf = [[] for i in range(env.unity_agent_num)]
    values_bf = [[] for i in range(env.unity_agent_num)]

    # start the game
    total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
    target_steps = [0 for i in range(TARGETNUM)]
    start_time = time.time()
    state, _, done = env.reset()
    # state = torch.Tensor(next_obs).to(device)
    # next_done = torch.zeros(env.unity_agent_num).to(device)

    # initialize empty training datasets, one per target type
    obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM, n, env.unity_observation_size)
    actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM, n, env.unity_action_size)
    dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM, n, 1)
    con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM, n, 1)
    rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM, n, 1)
    values = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM, n, 1)
    advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM, n, 1)
    returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM, n, 1)
    for total_steps in range(total_update_step):
        # anneal the learning rate; at the last update step it decays to 0
        if args.annealLR:
            finalRatio = TARGET_LEARNING_RATE / args.lr
            frac = 1.0 - ((total_steps + 1.0) / total_update_step)
            lrnow = frac * args.lr
            optimizer.param_groups[0]["lr"] = lrnow
        else:
            lrnow = args.lr
        print("new episode", total_steps, "learning rate =", lrnow)

        # MAIN LOOP: run agent in environment
        i = 0
        training = False
        trainQueue = []
        while True:
            if i % args.decision_period == 0:
                step = round(i / args.decision_period)
                # choose actions with the agent
                with torch.no_grad():
                    # predict actions
                    action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
                        torch.Tensor(state).to(device)
                    )
                    value = value.flatten()

                # move variables from GPU to CPU
                action_cpu = action.cpu().numpy()
                dis_logprob_cpu = dis_logprob.cpu().numpy()
                con_logprob_cpu = con_logprob.cpu().numpy()
                value_cpu = value.cpu().numpy()
                # environment step
                next_state, reward, next_done = env.step(action_cpu)

                # save memories
                for i in range(env.unity_agent_num):
                    # save memories to buffers
                    ob_bf[i].append(state[i])
                    act_bf[i].append(action_cpu[i])
                    dis_logprobs_bf[i].append(dis_logprob_cpu[i])
                    con_logprobs_bf[i].append(con_logprob_cpu[i])
                    rewards_bf[i].append(reward[i])
                    dones_bf[i].append(done[i])
                    values_bf[i].append(value_cpu[i])
                    remainTime = state[i, TARGET_STATE_SIZE]
                    if next_done[i]:
                        # this agent finished a round: compute advantages and discounted
                        # returns, then move its memories to the per-target training dataset
                        roundTargetType = int(state[i, 0])
                        thisRewardsTensor = broadCastEndReward(rewards_bf[i], remainTime)
                        adv, rt = GAE(
                            agent,
                            args,
                            thisRewardsTensor,
                            torch.Tensor(dones_bf[i]).to(device),
                            torch.tensor(values_bf[i]).to(device),
                            torch.tensor([next_state[i]]).to(device),
                            torch.Tensor([next_done[i]]).to(device),
                        )
                        # send memories to the training datasets
                        obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
                        actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
                        dis_logprobs[roundTargetType] = torch.cat(
                            (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
                        )
                        con_logprobs[roundTargetType] = torch.cat(
                            (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
                        )
                        rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
                        values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
                        advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
                        returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)

                        # clear buffers
                        ob_bf[i] = []
                        act_bf[i] = []
                        dis_logprobs_bf[i] = []
                        con_logprobs_bf[i] = []
                        rewards_bf[i] = []
                        dones_bf[i] = []
                        values_bf[i] = []
                        print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")

                # queue every target whose dataset has collected enough data
                for i in range(TARGETNUM):
                    if obs[i].size()[0] >= args.datasetSize:
                        # start training the NN
                        trainQueue.append(i)
                if len(trainQueue) > 0:
                    break
                state, done = next_state, next_done
            else:
                # skip this step and reuse the last predicted action
                next_state, reward, next_done = env.step(action_cpu)
                # save memories
                for i in range(env.unity_agent_num):
                    if next_done[i]:
                        # this agent finished a round on a skipped step: save the last
                        # memories, then move them to the per-target training dataset
                        ob_bf[i].append(state[i])
                        act_bf[i].append(action_cpu[i])
                        dis_logprobs_bf[i].append(dis_logprob_cpu[i])
                        con_logprobs_bf[i].append(con_logprob_cpu[i])
                        rewards_bf[i].append(reward[i])
                        dones_bf[i].append(done[i])
                        values_bf[i].append(value_cpu[i])
                        # compute advantage and discounted reward
                        roundTargetType = int(state[i, 0])
                        adv, rt = GAE(
                            agent,
                            args,
                            torch.tensor(rewards_bf[i]).to(device),
                            torch.Tensor(dones_bf[i]).to(device),
                            torch.tensor(values_bf[i]).to(device),
                            torch.tensor([next_state[i]]).to(device),
                            torch.Tensor([next_done[i]]).to(device),
                        )
                        # send memories to the training datasets
                        obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
                        actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
                        dis_logprobs[roundTargetType] = torch.cat(
                            (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
                        )
                        con_logprobs[roundTargetType] = torch.cat(
                            (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
                        )
                        rewards[roundTargetType] = torch.cat((rewards[roundTargetType], torch.tensor(rewards_bf[i]).to(device)), 0)
                        values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
                        advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
                        returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
                        # clear buffers
                        ob_bf[i] = []
                        act_bf[i] = []
                        dis_logprobs_bf[i] = []
                        con_logprobs_bf[i] = []
                        rewards_bf[i] = []
                        dones_bf[i] = []
                        values_bf[i] = []
                        # print(f"train dataset added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")

                state, done = next_state, next_done
            i += 1
        if args.train:
            meanRewardList = []  # for WandB logging
            # loop over every target queued for training
            for thisT in trainQueue:
                target_steps[thisT] += 1
                # flatten the batch
                b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
                b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
                b_con_logprobs = con_logprobs[thisT].reshape(-1)
                b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
                b_advantages = advantages[thisT].reshape(-1)
                b_returns = returns[thisT].reshape(-1)
                b_values = values[thisT].reshape(-1)
                b_size = b_obs.size()[0]
                # optimize the policy and value network
                b_inds = np.arange(b_size)
                # clipfracs = []
                for epoch in range(args.epochs):
                    # shuffle the whole dataset
                    np.random.shuffle(b_inds)
                    for start in range(0, b_size, args.minibatchSize):
                        end = start + args.minibatchSize
                        mb_inds = b_inds[start:end]
                        mb_advantages = b_advantages[mb_inds]
                        # normalize advantages
                        if args.norm_adv:
                            mb_advantages = (mb_advantages - mb_advantages.mean()) / (
                                mb_advantages.std() + 1e-8
                            )
                        (
                            _,
                            new_dis_logprob,
                            dis_entropy,
                            new_con_logprob,
                            con_entropy,
                            newvalue,
                        ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
                        # discrete ratio
                        dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
                        dis_ratio = dis_logratio.exp()
                        # continuous ratio
                        con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
                        con_ratio = con_logratio.exp()
                        """
                        # early stop
                        with torch.no_grad():
                            # calculate approx_kl http://joschu.net/blog/kl-approx.html
                            old_approx_kl = (-logratio).mean()
                            approx_kl = ((ratio - 1) - logratio).mean()
                            clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
                        """
                        # discrete policy loss
                        dis_pg_loss_orig = -mb_advantages * dis_ratio
                        dis_pg_loss_clip = -mb_advantages * torch.clamp(
                            dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
                        )
                        dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
                        # continuous policy loss
                        con_pg_loss_orig = -mb_advantages * con_ratio
                        con_pg_loss_clip = -mb_advantages * torch.clamp(
                            con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
                        )
                        con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
                        # value loss
                        newvalue = newvalue.view(-1)
                        if args.clip_vloss:
                            v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                            v_clipped = b_values[mb_inds] + torch.clamp(
                                newvalue - b_values[mb_inds],
                                -args.clip_coef,
                                args.clip_coef,
                            )
                            v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                            v_loss = 0.5 * v_loss_max.mean()
                        else:
                            v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
                        # total loss, weighted per target type
                        entropy_loss = dis_entropy.mean() + con_entropy.mean()
                        loss = (
                            dis_pg_loss * POLICY_COEF[thisT]
                            + con_pg_loss * POLICY_COEF[thisT]
                            - entropy_loss * ENTROPY_COEF[thisT]
                            + v_loss * CRITIC_COEF[thisT]
                        ) * LOSS_COEF[thisT]

                        optimizer.zero_grad()
                        loss.backward()
                        # clip the gradient norm of all parameters
                        nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                        optimizer.step()
                    """
                    if args.target_kl is not None:
                        if approx_kl > args.target_kl:
                            break
                    """

                # record the mean reward before clearing the history
                targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
                meanRewardList.append(targetRewardMean)
                targetName = Targets(thisT).name
                # clear this target's training set buffer
                obs[thisT] = torch.tensor([]).to(device)
                actions[thisT] = torch.tensor([]).to(device)
                dis_logprobs[thisT] = torch.tensor([]).to(device)
                con_logprobs[thisT] = torch.tensor([]).to(device)
                rewards[thisT] = torch.tensor([]).to(device)
                values[thisT] = torch.tensor([]).to(device)
                advantages[thisT] = torch.tensor([]).to(device)
                returns[thisT] = torch.tensor([]).to(device)
                # record losses and rewards for plotting purposes
                writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName] / TotalRounds[targetName], target_steps[thisT])
                print(f"episode over Target{targetName} mean reward:", targetRewardMean)
            TotalRewardMean = np.mean(meanRewardList)
            writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
            writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps)
            # new record: save the best model so far
            if TotalRewardMean > bestReward:
                bestReward = TotalRewardMean
                saveDir = "../PPO-Model/" + run_name + "_" + str(TotalRewardMean) + ".pt"
                torch.save(agent, saveDir)

    # save the final model and shut down
    saveDir = "../PPO-Model/" + run_name + "_last.pt"
    torch.save(agent, saveDir)
    env.close()
    writer.close()