import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import math
import copy
import datetime
import os

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from keras_radam import RAdam


class PPO(object):
    """Create PPO Agent
    """

    def __init__(self, stateSize, disActShape, conActSize, conActRange,
                 criticLR, actorLR, gamma, epsilon, entropyWeight,
                 saveDir, loadModelDir):
        """Initialize the PPO agent and build (or load) the Actor & Critic networks.

        Args:
            stateSize (int): size of the environment state vector
            disActShape (list): branch sizes of the discrete action outputs, e.g. [3, 3, 2]
            conActSize (int): number of continuous actions
            conActRange (float): max range of the continuous action
            criticLR (float): Critic learning rate
            actorLR (float): Actor learning rate
            gamma (float): reward discount factor
            epsilon (float): PPO clip ratio
            entropyWeight (float): weight of the entropy term in the actor loss
            saveDir (string): directory to save models/weights to
            loadModelDir (string): directory to load weights from, or None to start fresh
        """
        # every discrete action branch must offer at least 2 choices
        if np.any(np.array(disActShape) <= 1):
            raise ValueError(
                "disActShape error: every entry of disActShape should be greater than 1, got {}".format(disActShape))

        self.stateSize = stateSize
        # self.actionSize = actionSize
        self.disActShape = disActShape  # shape of discrete action output, like [3, 3, 2]
        self.disActSize = len(disActShape)
        self.conActSize = conActSize
        self.conActRange = conActRange
        self.criticLR = criticLR
        self.actorLR = actorLR
        self.GAMMA = gamma
        self.EPSILON = epsilon
        self.saveDir = saveDir
        self.entropyWeight = entropyWeight

        self.disOutputSize = sum(disActShape)
        self.conOutputSize = conActSize * 2

        # critic NN
        self.critic = self.buildCriticNet(self.stateSize, 1, compileModel=True)
        # actor NN
        self.actor = self.buildActorNet(self.stateSize, self.conActRange, compileModel=True)
        if loadModelDir is not None:
            # load saved weights into Critic & Actor NN
            self.loadWeightToModels(loadModelDir)
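
    # The actor packs all of its output heads into one flat vector so that a single
    # custom loss can see them together:
    #   [disAct1(3) | disAct2(3) | disAct3(2) | mu(1) | sigma(1)]
    # i.e. disOutputSize = sum(disActShape) discrete probabilities followed by
    # conOutputSize = conActSize * 2 continuous distribution parameters.
    # Note that buildActorNet and chooseAction hard-code the [3, 3, 2] layout even
    # though disActShape is configurable.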

    # Build Net
    def buildActorNet(self, inputSize, continuousActionRange, compileModel):
        """build Actor Neural Net and compile. Output: [disAct1, disAct2, disAct3, mu, sigma]

        Args:
            inputSize (int): InputLayer neural size.
            continuousActionRange (float): continuous action's max range.
            compileModel (bool): whether to compile the model with its optimizer and loss.

        Returns:
            keras.Model: return Actor NN
        """
        stateInput = layers.Input(shape=(inputSize,), name='stateInput')
        dense0 = layers.Dense(500, activation='relu', name='dense0')(stateInput)
        dense1 = layers.Dense(200, activation='relu', name='dense1')(dense0)
        dense2 = layers.Dense(100, activation='relu', name='dense2')(dense1)

        disAct1 = layers.Dense(3, activation='softmax', name='WSAction')(dense2)     # WS
        disAct2 = layers.Dense(3, activation='softmax', name='ADAction')(dense2)     # AD
        disAct3 = layers.Dense(2, activation='softmax', name='ShootAction')(dense2)  # Mouse shoot
        mu = continuousActionRange * layers.Dense(1, activation='tanh', name='muOut')(dense2)  # mu, mean of the normal distribution
        sigma = 1e-8 + layers.Dense(1, activation='softplus', name='sigmaOut')(dense2)         # sigma, std of the normal distribution
        totalOut = layers.concatenate(
            [disAct1, disAct2, disAct3, mu, sigma], name='totalOut')  # package

        model = keras.Model(inputs=stateInput, outputs=totalOut)
        if compileModel:
            # actorOPT = optimizers.Adam(learning_rate=self.actorLR)
            actorOPT = RAdam(self.actorLR)
            model.compile(optimizer=actorOPT, loss=self.aLoss())
        return model

    def buildCriticNet(self, inputSize, outputSize, compileModel):
        """build Critic Neural Net and compile. Output: [Q]

        Args:
            inputSize (int): InputLayer neural size
            outputSize (int): Q size
            compileModel (bool): whether to compile the model with its optimizer and loss.

        Returns:
            keras.Model: return Critic NN
        """
        stateInput = keras.Input(shape=(inputSize,))
        dense0 = layers.Dense(500, activation='relu', name='dense0')(stateInput)
        dense1 = layers.Dense(200, activation='relu')(dense0)
        dense2 = layers.Dense(100, activation='relu')(dense1)
        output = layers.Dense(outputSize)(dense2)
        model = keras.Model(inputs=stateInput, outputs=output)
        if compileModel:
            criticOPT = optimizers.Adam(learning_rate=self.criticLR)
            model.compile(optimizer=criticOPT, loss=self.cLoss())
        return model

    # loss Function
    def cLoss(self):
        """Critic Loss function
        """
        def loss(y_true, y_pred):
            # y_true: discountedR
            # y_pred: criticV = model.predict(states)
            advantage = y_true - y_pred  # TD error
            loss = tf.reduce_mean(tf.square(advantage))
            return loss
        return loss
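
    # aLoss() below implements the standard PPO clipped surrogate objective:
    #   L_clip = -E[ min(r_t * A_t, clip(r_t, 1 - epsilon, 1 + epsilon) * A_t) ]
    # where r_t is the probability ratio pi_new(a|s) / pi_old(a|s) and A_t is the
    # advantage (discounted return minus the critic's value estimate). An entropy
    # regularization term weighted by entropyWeight is mixed into each branch's loss.
    # The old probabilities, continuous actions and advantages are smuggled in
    # through y_true (packed in trainActor), since a Keras loss only receives
    # (y_true, y_pred).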

    def aLoss(self):
        """Actor Loss function (PPO clipped surrogate, shared by all action branches)
        """
        def getDiscreteALoss(nowProbs, oldProbs, advantage):
            """get Discrete Action Loss

            Args:
                nowProbs (tf.constant): (length, actionSize)
                oldProbs (tf.constant): (length, actionSize)
                advantage (tf.constant): (length,)

            Returns:
                tf.constant: scalar loss for this discrete action branch
            """
            # mean(p * log p) is the negative entropy of the current policy
            entropy = tf.reduce_mean(tf.math.multiply(nowProbs, tf.math.log(nowProbs + 1e-6)))
            ratio = tf.math.divide(nowProbs, oldProbs + 1e-6)
            value = tf.math.multiply(ratio, tf.expand_dims(advantage, axis=1))
            clipRatio = tf.clip_by_value(ratio, 1. - self.EPSILON, 1. + self.EPSILON)
            clipValue = tf.math.multiply(clipRatio, tf.expand_dims(advantage, axis=1))
            loss = -tf.reduce_mean(tf.math.minimum(value, clipValue)) + self.entropyWeight * entropy
            return loss

        def getContinuousALoss(musig, actions, oldProbs, advantage):
            """get Continuous Action Loss

            Args:
                musig (tf.constant): (length, 2)
                actions (tf.constant): (length,)
                oldProbs (tf.constant): (length, 1)
                advantage (tf.constant): (length,)

            Returns:
                tf.constant: scalar loss for this continuous action
            """
            mu = musig[:, 0]
            sigma = musig[:, 1]
            dist = tfp.distributions.Normal(mu, sigma)
            nowProbs = dist.prob(actions)
            # flatten oldProbs to (length,) so the ratio stays element-wise
            # instead of broadcasting to (length, length)
            ratio = tf.math.divide(nowProbs, tf.reshape(oldProbs, [-1]) + 1e-6)
            entropy = tf.reduce_mean(dist.entropy())

            value = tf.math.multiply(ratio, advantage)
            clipValue = tf.clip_by_value(ratio, 1. - self.EPSILON, 1. + self.EPSILON) * advantage
            loss = -tf.reduce_mean(tf.math.minimum(value, clipValue)) + self.entropyWeight * entropy
            return loss

        def loss(y_true, y_pred):
            # y_true: [oldDisProbs | oldConProbs | conActions | advantage], packed in trainActor
            # y_pred: actor output = [disAct1, disAct2, disAct3, mu, sigma]
            oldDisProbs = y_true[:, 0:self.disOutputSize]
            oldConProbs = y_true[:, self.disOutputSize:self.disOutputSize + self.conActSize]
            conActions = y_true[:, self.disOutputSize + self.conActSize:self.disOutputSize + (self.conActSize * 2)]
            advantage = y_true[:, -1]

            nowDisProbs = y_pred[:, 0:self.disOutputSize]  # [disAct1, disAct2, disAct3]
            nowConMusigs = y_pred[:, self.disOutputSize:]  # [musig1, musig2]

            totalALoss = tf.constant([0.])
            totalActionNum = 0

            # discrete branches: walk through nowDisProbs/oldDisProbs branch by branch
            lastDisActShape = 0
            for shape in self.disActShape:
                thisNowDisProbs = nowDisProbs[:, lastDisActShape:lastDisActShape + shape]
                thisOldDisProbs = oldDisProbs[:, lastDisActShape:lastDisActShape + shape]
                discreteALoss = getDiscreteALoss(thisNowDisProbs, thisOldDisProbs, advantage)
                lastDisActShape += shape
                totalALoss += discreteALoss
                totalActionNum += 1

            # continuous actions: each action has a (mu, sigma) pair and one old probability
            for act in range(self.conActSize):
                thisNowConMusig = nowConMusigs[:, act * 2:(act + 1) * 2]
                thisOldConProb = oldConProbs[:, act:act + 1]
                thisConAction = conActions[:, act]
                continuousAloss = getContinuousALoss(thisNowConMusig, thisConAction, thisOldConProb, advantage)
                totalALoss += continuousAloss
                totalActionNum += 1

            loss = tf.divide(totalALoss, totalActionNum)
            return loss
        return loss

    # get Action & V
    def chooseAction(self, state):
        """Agent chooses an action to take

        Args:
            state (np.array): environment state

        Returns:
            disAct1: discrete action 1 (WS)
            disAct2: discrete action 2 (AD)
            disAct3: discrete action 3 (shoot)
            conAction: continuous action
            predictResult: raw actor NN prediction output
        """
        # check state dimension is [1, stateSize]
        if state.ndim != 2:
            state = state.reshape([1, self.stateSize])
        predictResult = self.actor(state)  # [[disAct1, disAct2, disAct3, mu, sigma]]
        predictResult = predictResult.numpy()
        disAct1Prob = predictResult[0][0:3]
        disAct2Prob = predictResult[0][3:6]
        disAct3Prob = predictResult[0][6:8]
        mu = predictResult[0][8]
        sigma = predictResult[0][9]
        if math.isnan(mu) or math.isnan(sigma):
            # check whether mu or sigma is nan
            print("mu or sigma is nan")
        disAct1 = np.argmax(disAct1Prob)  # WS 0 or 1 or 2
        disAct2 = np.argmax(disAct2Prob)  # AD 0 or 1 or 2
        disAct3 = np.argmax(disAct3Prob)  # mouse shoot 0 or 1
        normDist = np.random.normal(loc=mu, scale=sigma)  # sample from Normal(mu, sigma)
        conAction = np.clip(normDist, -self.conActRange, self.conActRange)  # clip the sampled continuous action into range
        return disAct1, disAct2, disAct3, conAction, predictResult

    def getCriticV(self, state):
        """get Critic predicted V value

        Args:
            state (np.array): Env state

        Returns:
            tensor: return Critic predict result
        """
        if state.ndim != 2:
            state = state.reshape([1, self.stateSize])
        return self.critic.predict(state)
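
    # discountReward() computes bootstrapped discounted returns for the buffer:
    #   R_t = r_t + GAMMA * R_{t+1}, with the last return initialized from the
    # critic's estimate V(nextState). The critic is regressed onto these returns,
    # and (R_t - V(s_t)) serves as the advantage in the actor loss.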

    def discountReward(self, nextState, rewards):
        """Discount future rewards

        Args:
            nextState (np.array): next Env state
            rewards (np.array): reward list of this episode

        Returns:
            np.array: discounted rewards list, same shape as the input rewards
        """
        # discount future rewards, bootstrapping from the critic's value of nextState
        nextV = self.getCriticV(nextState)
        discountedRewards = []
        for r in rewards[::-1]:
            nextV = r + self.GAMMA * nextV
            discountedRewards.append(nextV)
        discountedRewards.reverse()  # back to chronological order
        discountedRewards = np.squeeze(discountedRewards)
        discountedRewards = np.expand_dims(discountedRewards, axis=1)
        return discountedRewards

    def conProb(self, mu, sig, x):
        """calculate the probability of x under Normal(mu, sigma)

        Args:
            mu (np.array): mu
            sig (np.array): sigma
            x (np.array): x

        Returns:
            np.array: probabilities, shape (length, 1)
        """
        mu = np.reshape(mu, (np.size(mu),))
        sig = np.reshape(sig, (np.size(sig),))
        x = np.reshape(x, (np.size(x),))
        dist = tfp.distributions.Normal(mu, sig)
        prob = dist.prob(x)
        prob = np.reshape(prob, (np.size(x), 1))
        return prob

    def trainCritcActor(self, states, actions, rewards, nextState, criticEpochs, actorEpochs):
        """Train ActorNN and CriticNN

        Args:
            states: buffer states
            actions: buffer actions
            rewards: buffer rewards (not yet discounted)
            nextState: the single next state
            criticEpochs: critic NN epochs
            actorEpochs: actor NN epochs
        """
        discountedR = self.discountReward(nextState, rewards)
        criticMeanLoss = self.trainCritic(states, discountedR, criticEpochs)
        actorMeanLoss = self.trainActor(states, actions, discountedR, actorEpochs)
        print("A_Loss:", actorMeanLoss, "C_Loss:", criticMeanLoss)
        return actorMeanLoss, criticMeanLoss

    def trainCritic(self, states, discountedR, epochs):
        """Train Critic

        Args:
            states: buffer states
            discountedR: discounted rewards
            epochs: critic NN epochs
        """
        # Note: if fit() raises 'Failed to find data adapter that can handle',
        # convert discountedR with discountedR.tolist() before passing it in.
        his = self.critic.fit(x=states, y=discountedR, epochs=epochs, verbose=0)
        return np.mean(his.history['loss'])

    def trainActor(self, states, actions, discountedR, epochs):
        """Actor NN training function

        Args:
            states (np.array): Env states
            actions (np.array): action history
            discountedR (np.array): discounted rewards
            epochs (int): epochs, how many times the NN learns from this buffer

        Returns:
            Average actor loss of this training round
        """
        states = np.asarray(states)
        actions = np.asarray(actions, dtype=np.float32)
        # predict with the old Actor NN
        oldActorResult = self.actor.predict(states)

        # split the action history into discrete and continuous parts
        disActions = actions[:, 0:self.disActSize]
        conActions = actions[:, self.disActSize:]
        # split the old Actor's prediction
        oldDisProbs = oldActorResult[:, 0:self.disOutputSize]  # [disAct1, disAct2, disAct3]
        oldConMusigs = oldActorResult[:, self.disOutputSize:]  # [musig1, musig2]
        oldPiProbs = self.conProb(oldConMusigs[:, 0], oldConMusigs[:, 1], conActions)

        criticV = self.critic.predict(states)
        advantage = copy.deepcopy(discountedR - criticV)

        # pack [oldDisProbs, oldPiProbs, conActions, advantage] as y_true for aLoss
        y_true = np.hstack((oldDisProbs, oldPiProbs, conActions, advantage))

        # train start
        if np.any(tf.math.is_nan(y_true)):
            print("y_true got nan")
            print("oldConMusigs", oldConMusigs)
            print("oldPiProbs", oldPiProbs)
            print("conActions", conActions)
        his = self.actor.fit(x=states, y=y_true, epochs=epochs, verbose=0)
        if np.any(tf.math.is_nan(his.history['loss'])):
            print("his.history['loss'] is nan!")
            print(his.history['loss'])
        return np.mean(his.history['loss'])

    def saveWeights(self, score):
        """save the current NNs' weights. Uses the "model.save_weights" method, "tf" format ("ckpt" files).

        Args:
            score (int): current score
        """
        now = datetime.datetime.now().strftime("%H%M%S")
        actor_save_dir = self.saveDir + now + "/actor/" + "actor.ckpt"
        critic_save_dir = self.saveDir + now + "/critic/" + "critic.ckpt"
        score_dir = self.saveDir + now + "/" + str(round(score))
        self.actor.save_weights(actor_save_dir, save_format="tf")
        self.critic.save_weights(critic_save_dir, save_format="tf")
        # create an empty file named after the score to record it
        scorefile = open(score_dir, 'w')
        scorefile.close()
        print("Model's Weights Saved")

    def loadWeightToModels(self, loadDir):
        """load NN weights. Uses the "model.load_weights()" method, "tf" format "ckpt" files.

        Args:
            loadDir (string): Model dir
        """
        actorDir = loadDir + "/actor/" + "actor.ckpt"
        criticDir = loadDir + "/critic/" + "critic.ckpt"
        self.actor.load_weights(actorDir)
        self.critic.load_weights(criticDir)

        print("++++++++++++++++++++++++++++++++++++")
        print("++++++++++++Model Loaded++++++++++++")
        print(loadDir)
        print("++++++++++++++++++++++++++++++++++++")

    def saveModel(self, score):
        """save the current NN models. Uses the "model.save()" method.

        Args:
            score (int): current score
        """
        score = "_" + str(round(score))
        now = datetime.datetime.now().strftime("%H%M%S")
        actor_save_dir = self.saveDir + now + score + "/actor.h5"
        critic_save_dir = self.saveDir + now + score + "/critic.h5"
        self.actor.save(actor_save_dir)
        self.critic.save(critic_save_dir)
        print("Model Saved")

    def loadModel(self, loadDir):
        """load NN models. Uses the "models.load_model()" method.

        Args:
            loadDir (string): Model dir

        Returns:
            tf.keras.models: return compiled models.
        """
        actorDir = loadDir + "/actor.h5"
        criticDir = loadDir + "/critic.h5"
        actor_net_loaded = tf.keras.models.load_model(actorDir)
        critic_net_loaded = tf.keras.models.load_model(criticDir)

        print("++++++++++++++++++++++++++++++++++++")
        print("++++++++++++Model Loaded++++++++++++")
        print(loadDir)
        print("++++++++++++++++++++++++++++++++++++")
        return actor_net_loaded, critic_net_loaded
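

# --- Minimal usage sketch (hypothetical, not part of the training pipeline). ---
# It only exercises the class above; the dimensions (stateSize=30, a buffer of
# 8 random transitions) and hyperparameters are illustrative assumptions.
if __name__ == "__main__":
    agent = PPO(stateSize=30, disActShape=[3, 3, 2], conActSize=1, conActRange=10.0,
                criticLR=1e-3, actorLR=1e-4, gamma=0.99, epsilon=0.2,
                entropyWeight=1e-2, saveDir="PPO-Model/", loadModelDir=None)

    # fake rollout buffer: random states, actions from the agent, zero rewards
    states = np.random.randn(8, 30).astype(np.float32)
    nextState = np.random.randn(30).astype(np.float32)
    rewards = [0.0] * 8
    actions = []
    for s in states:
        d1, d2, d3, conAct, _ = agent.chooseAction(s)
        actions.append([d1, d2, d3, conAct])

    # one PPO update over the fake buffer
    agent.trainCritcActor(states, np.array(actions, dtype=np.float32),
                          rewards, nextState, criticEpochs=1, actorEpochs=1)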