import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import math
import copy
import datetime
import os

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from keras_radam import RAdam


class PPO(object):
    """Create PPO Agent
    """

    def __init__(self, stateSize, disActShape, conActSize, conActRange, criticLR, actorLR, gamma, epsilon, entropyWeight, saveDir, loadModelDir):

        # check disActShape is valid (every discrete action branch needs more than 1 option)
        if np.any(np.array(disActShape) <= 1):
            raise ValueError("disActShape error: every element of disActShape should be greater than 1, but got", disActShape)

        self.stateSize = stateSize
        # self.actionSize = actionSize
        self.disActShape = disActShape  # shape of discrete action output, e.g. [3, 3, 2]
        self.disActSize = len(disActShape)
        self.conActSize = conActSize
        self.conActRange = conActRange
        self.criticLR = criticLR
        self.actorLR = actorLR
        self.GAMMA = gamma
        self.EPSILON = epsilon
        self.saveDir = saveDir
        self.entropyWeight = entropyWeight

        self.disOutputSize = sum(disActShape)
        self.conOutputSize = conActSize * 2

        if loadModelDir is None:
            # critic NN
            self.critic = self.buildCriticNet(self.stateSize, 1, compileModel=True)
            # actor NN
            self.actor = self.buildActorNet(self.stateSize, self.conActRange, compileModel=True)
        else:
            # critic NN
            self.critic = self.buildCriticNet(self.stateSize, 1, compileModel=True)
            # actor NN
            self.actor = self.buildActorNet(self.stateSize, self.conActRange, compileModel=True)
            # load weights into Critic & Actor NN
            self.loadWeightToModels(loadModelDir)

    # Build Net
    def buildActorNet(self, inputSize, continuousActionRange, compileModel):
        """Build the Actor neural net and compile it. Output: [disAct1, disAct2, disAct3, mu, sigma]

        Args:
            inputSize (int): input layer size.
            continuousActionRange (float): max range of the continuous action.

        Returns:
            keras.Model: the Actor NN
        """
        stateInput = layers.Input(shape=(inputSize,), name='stateInput')
        dense0 = layers.Dense(500, activation='relu', name='dense0')(stateInput)
        dense1 = layers.Dense(200, activation='relu', name='dense1')(dense0)
        dense2 = layers.Dense(100, activation='relu', name='dense2')(dense1)

        disAct1 = layers.Dense(3, activation='softmax', name='WSAction')(dense2)  # WS
        disAct2 = layers.Dense(3, activation='softmax', name='ADAction')(dense2)  # AD
        disAct3 = layers.Dense(2, activation='softmax', name='ShootAction')(dense2)  # Mouse shoot
        mu = continuousActionRange * layers.Dense(1, activation='tanh', name='muOut')(dense2)  # mu, i.e. the mean of the normal distribution
        sigma = 1e-8 + layers.Dense(1, activation='softplus', name='sigmaOut')(dense2)  # sigma, i.e. the std of the normal distribution
        # musig = layers.concatenate([mu,sigma],name = 'musig')
        totalOut = layers.concatenate(
            [disAct1, disAct2, disAct3, mu, sigma], name='totalOut')  # package

        model = keras.Model(inputs=stateInput, outputs=totalOut)
        # actorOPT = optimizers.Adam(learning_rate = self.actorLR)
        if compileModel:
            actorOPT = RAdam(self.actorLR)
            model.compile(optimizer=actorOPT, loss=self.aLoss())
        return model

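    # Output layout of the actor above (derived from the layers it concatenates):
    #   totalOut columns: [ 3 WS probs | 3 AD probs | 2 Shoot probs | mu | sigma ]  -> 10 values per state
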
    def buildCriticNet(self, inputSize, outputSize, compileModel):
        """Build the Critic neural net and compile it. Output: [Q]

        Args:
            inputSize (int): input layer size
            outputSize (int): Q output size

        Returns:
            keras.Model: the Critic NN
        """
        stateInput = keras.Input(shape=(inputSize,))
        dense0 = layers.Dense(500, activation='relu', name='dense0')(stateInput)
        dense1 = layers.Dense(200, activation='relu')(dense0)
        dense2 = layers.Dense(100, activation='relu')(dense1)
        output = layers.Dense(outputSize)(dense2)
        model = keras.Model(inputs=stateInput, outputs=output)
        if compileModel:
            criticOPT = optimizers.Adam(learning_rate=self.criticLR)
            model.compile(optimizer=criticOPT, loss=self.cLoss())
        return model

    # loss Function
    def cLoss(self):
        """Critic loss function
        """
        def loss(y_true, y_pred):
            # y_true: discountedR
            # y_pred: criticV = model.predict(states)

            advantage = y_true - y_pred  # TD error
            loss = tf.reduce_mean(tf.square(advantage))
            return loss
        return loss

    def aLoss(self):
        def getDiscreteALoss(nowProbs, oldProbs, advantage):
            """get Discrete Action Loss

            Args:
                nowProbs (tf.constant): (length,actionSize)
                oldProbs (tf.constant): (length,actionSize)
                advantage (tf.constant): (length,)

            Returns:
                tf.constant: scalar loss
            """
            # note: this is the negative entropy (mean of p*log(p))
            entropy = tf.reduce_mean(tf.math.multiply(nowProbs, tf.math.log(nowProbs + 1e-6)))
            ratio = tf.math.divide(nowProbs, oldProbs + 1e-6)
            value = tf.math.multiply(ratio, tf.expand_dims(advantage, axis=1))
            clipRatio = tf.clip_by_value(ratio, 1. - self.EPSILON, 1. + self.EPSILON)
            clipValue = tf.math.multiply(clipRatio, tf.expand_dims(advantage, axis=1))
            loss = -tf.reduce_mean(tf.math.minimum(value, clipValue)) + self.entropyWeight * entropy
            return loss

        def getContinuousALoss(musig, actions, oldProbs, advantage):
            """get Continuous Action Loss

            Args:
                musig (tf.constant): (length,2)
                actions (tf.constant): (length,)
                oldProbs (tf.constant): (length,)
                advantage (tf.constant): (length,)

            Returns:
                tf.constant: scalar loss
            """
            mu = musig[:, 0]
            sigma = musig[:, 1]
            dist = tfp.distributions.Normal(mu, sigma)

            nowProbs = dist.prob(actions)
            ratio = tf.math.divide(nowProbs, oldProbs + 1e-6)
            entropy = tf.reduce_mean(dist.entropy())

            value = tf.math.multiply(ratio, advantage)
            clipValue = tf.clip_by_value(ratio, 1. - self.EPSILON, 1. + self.EPSILON) * advantage
            loss = -tf.reduce_mean(tf.math.minimum(value, clipValue)) + self.entropyWeight * entropy
            return loss

        def loss(y_true, y_pred):
            # y_true: [oldDisProbs, oldConProbs, conActions, advantage], packed in trainActor
            # y_pred: actor output = [disAct1, disAct2, disAct3, mu, sigma]
            oldDisProbs = y_true[:, 0:self.disOutputSize]
            oldConProbs = y_true[:, self.disOutputSize:self.disOutputSize + self.conActSize]
            conActions = y_true[:, self.disOutputSize + self.conActSize:self.disOutputSize + (self.conActSize * 2)]
            advantage = y_true[:, -1]

            nowDisProbs = y_pred[:, 0:self.disOutputSize]  # [disAct1, disAct2, disAct3]
            nowConMusigs = y_pred[:, self.disOutputSize:]  # [musig1,musig2]

            totalALoss = tf.constant([0.])
            totalActionNum = 0

            # loop over every discrete action branch
            lastDisActShape = 0
            for shape in self.disActShape:
                thisNowDisProbs = nowDisProbs[:, lastDisActShape:lastDisActShape + shape]
                thisOldDisProbs = oldDisProbs[:, lastDisActShape:lastDisActShape + shape]
                discreteALoss = getDiscreteALoss(thisNowDisProbs, thisOldDisProbs, advantage)
                lastDisActShape += shape
                totalALoss += discreteALoss
                totalActionNum += 1
            # loop over every continuous action
            for act in range(self.conActSize):
                thisNowConMusig = nowConMusigs[:, act * 2:(act + 1) * 2]
                thisOldConProb = oldConProbs[:, act]
                thisConAction = conActions[:, act]
                continuousAloss = getContinuousALoss(thisNowConMusig, thisConAction, thisOldConProb, advantage)
                totalALoss += continuousAloss
                totalActionNum += 1

            loss = tf.divide(totalALoss, totalActionNum)
            return loss
        return loss

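    # Layout sketch for the custom actor loss above (illustrative example, assuming the
    # disActShape = [3, 3, 2] and conActSize = 1 used by buildActorNet in this file):
    #   y_true columns: [ 8 old discrete probs | 1 old continuous prob | 1 continuous action | 1 advantage ]
    #   y_pred columns: [ 8 new discrete probs | mu | sigma ]
    # y_true is packed this way by trainActor via np.hstack; see that method below.
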
    # get Action&V
    def chooseAction(self, state):
        """Agent chooses an action to take

        Args:
            state (np.array): environment state

        Returns:
            disAct1: discrete action 1
            disAct2: discrete action 2
            disAct3: discrete action 3
            conAction: continuous action
            predictResult: raw actor NN predict output
        """
        # let the actor choose an action; the continuous part uses a normal distribution
        # state = np.expand_dims(state,0)

        # make sure state shape is [1, stateSize]
        if state.ndim != 2:
            state = state.reshape([1, self.stateSize])

        predictResult = self.actor(state)  # get predict result [[disAct1, disAct2, disAct3, musig]]
        predictResult = predictResult.numpy()
        disAct1Prob = predictResult[0][0:3]
        disAct2Prob = predictResult[0][3:6]
        disAct3Prob = predictResult[0][6:8]
        mu = predictResult[0][8]
        sigma = predictResult[0][9]
        if math.isnan(mu) or math.isnan(sigma):
            # check whether mu or sigma is nan
            print("mu or sigma is nan")

        disAct1 = np.argmax(disAct1Prob)  # WS 0 or 1 or 2
        disAct2 = np.argmax(disAct2Prob)  # AD 0 or 1 or 2
        disAct3 = np.argmax(disAct3Prob)  # mouse shoot 0 or 1
        normDist = np.random.normal(loc=mu, scale=sigma)  # sample from Normal(mu, sigma)
        conAction = np.clip(normDist, -self.conActRange,
                            self.conActRange)  # clip the sampled continuous action to the valid range
        return disAct1, disAct2, disAct3, conAction, predictResult

    def getCriticV(self, state):
        """get the Critic's predicted V value

        Args:
            state (np.array): Env state

        Returns:
            tensor: Critic predict result
        """
        # if state.ndim < 2:
        #     state = np.expand_dims(state,0)
        if state.ndim != 2:
            state = state.reshape([1, self.stateSize])
        return self.critic.predict(state)

    def discountReward(self, nextState, rewards):
        """Discount future rewards

        Args:
            nextState (np.array): next Env state
            rewards (np.array): reward list of this episode

        Returns:
            np.array: discounted rewards list, same length as the input rewards
        """
        # bootstrap from the critic's value of the next state, then discount backwards
        nextV = self.getCriticV(nextState)
        discountedRewards = []
        for r in rewards[::-1]:
            nextV = r + self.GAMMA * nextV
            discountedRewards.append(nextV)
        discountedRewards.reverse()  # back to chronological order
        discountedRewards = np.squeeze(discountedRewards)
        discountedRewards = np.expand_dims(discountedRewards, axis=1)
        # discountedRewards = np.array(discountedRewards)[:, np.newaxis]
        return discountedRewards

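    # Worked example for discountReward (hypothetical numbers, not from the original code):
    # rewards = [1, 2], GAMMA = 0.9, critic V(nextState) = 0.5
    #   step back from the end: 2 + 0.9 * 0.5  = 2.45
    #                           1 + 0.9 * 2.45 = 3.205
    # -> discountedRewards = [[3.205], [2.45]]  (shape (2, 1) after expand_dims)
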
    def conProb(self, mu, sig, x):
        """probability density of x under the Normal distribution (mu, sigma)

        Args:
            mu (np.array): mu
            sig (np.array): sigma
            x (np.array): x

        Returns:
            np.array: probabilities, shape (length, 1)
        """
        mu = np.reshape(mu, (np.size(mu),))
        sig = np.reshape(sig, (np.size(sig),))
        x = np.reshape(x, (np.size(x),))

        dist = tfp.distributions.Normal(mu, sig)
        prob = dist.prob(x)

        prob = np.reshape(prob, (np.size(x), 1))
        # dist = 1./(tf.sqrt(2.*np.pi)*sig)
        # prob = dist*tf.exp(-tf.square(x-mu)/(2.*tf.square(sig)))
        return prob

    def trainCritcActor(self, states, actions, rewards, nextState, criticEpochs, actorEpochs):
        # Train ActorNN and CriticNN
        # states: buffer states
        # actions: buffer actions
        # rewards: buffer rewards, not yet discounted
        # nextState: the single next state
        # criticEpochs: critic NN epochs
        # actorEpochs: actor NN epochs
        discountedR = self.discountReward(nextState, rewards)

        criticMeanLoss = self.trainCritic(states, discountedR, criticEpochs)
        actorMeanLoss = self.trainActor(
            states, actions, discountedR, actorEpochs)
        print("A_Loss:", actorMeanLoss, "C_Loss:", criticMeanLoss)
        return actorMeanLoss, criticMeanLoss

    def trainCritic(self, states, discountedR, epochs):
        # Train Critic
        # states: buffer states
        # discountedR: discounted rewards
        # epochs: training epochs

        # IDK why this should be a list... it just works...
        # If discountedR is an np.array it may throw 'Failed to find data adapter that can handle'
        # discountedR = discountedR.tolist()
        his = self.critic.fit(x=states, y=discountedR,
                              epochs=epochs, verbose=0)
        return np.mean(his.history['loss'])

    def trainActor(self, states, actions, discountedR, epochs):
        """Actor NN training function

        Args:
            states (np.array): Env states
            actions (np.array): action history
            discountedR (np.array): discounted rewards
            epochs (int): how many epochs to train

        Returns:
            Average actor loss: this training round's average actor loss
        """
        # Train Actor
        # states: buffer states
        # actions: buffer actions
        # discountedR: discounted rewards
        # epochs: training epochs

        states = np.asarray(states)
        actions = np.asarray(actions, dtype=np.float32)
        # predict with the old Actor NN
        oldActorResult = self.actor.predict(states)

        # split the action history
        disActions = actions[:, 0:self.disActSize]
        conActions = actions[:, self.disActSize:]
        # split the old Actor's predict result
        oldDisProbs = oldActorResult[:, 0:self.disOutputSize]  # [disAct1, disAct2, disAct3]
        oldConMusigs = oldActorResult[:, self.disOutputSize:]  # [musig1,musig2]
        oldPiProbs = self.conProb(oldConMusigs[:, 0], oldConMusigs[:, 1], conActions)

        criticV = self.critic.predict(states)
        advantage = copy.deepcopy(discountedR - criticV)

        # pack [oldDisProbs, oldPiProbs, conActions, advantage] as y_true
        y_true = np.hstack((oldDisProbs, oldPiProbs, conActions, advantage))

        # train start
        if np.any(tf.math.is_nan(y_true)):
            print("y_true got nan")
            print("oldConMusigs", oldConMusigs)
            print("oldPiProbs", oldPiProbs)
            print("conActions", conActions)
        his = self.actor.fit(x=states, y=y_true, epochs=epochs, verbose=0)
        if np.any(tf.math.is_nan(his.history['loss'])):
            print("his.history['loss'] is nan!")
            print(his.history['loss'])
        return np.mean(his.history['loss'])

    def saveWeights(self, score):
        """Save the current NNs' weights with "model.save_weights".
        Saved in "tf" format as "ckpt" files.

        Args:
            score (int): current score
        """
        # use a single timestamp so actor, critic and score land in the same folder
        timestamp = datetime.datetime.now().strftime("%H%M%S")
        actor_save_dir = self.saveDir + timestamp + "/actor/" + "actor.ckpt"
        critic_save_dir = self.saveDir + timestamp + "/critic/" + "critic.ckpt"
        score_dir = self.saveDir + timestamp + "/" + str(round(score))
        self.actor.save_weights(actor_save_dir, save_format="tf")
        self.critic.save_weights(critic_save_dir, save_format="tf")
        # create an empty file named after the score to record it
        scorefile = open(score_dir, 'w')
        scorefile.close()
        print("Model's Weights Saved")

    def loadWeightToModels(self, loadDir):
        """Load NN weights with "model.load_weights()".
        Loads "tf" format "ckpt" files.

        Args:
            loadDir (string): model dir
        """
        actorDir = loadDir + "/actor/" + "actor.ckpt"
        criticDir = loadDir + "/critic/" + "critic.ckpt"
        self.actor.load_weights(actorDir)
        self.critic.load_weights(criticDir)

        print("++++++++++++++++++++++++++++++++++++")
        print("++++++++++++Model Loaded++++++++++++")
        print(loadDir)
        print("++++++++++++++++++++++++++++++++++++")

    def saveModel(self, score):
        """Save the current NN models with "model.save()".

        Args:
            score (int): current score
        """
        score = "_" + str(round(score))
        # use a single timestamp so actor and critic land in the same folder
        timestamp = datetime.datetime.now().strftime("%H%M%S")
        actor_save_dir = self.saveDir + timestamp + score + "/actor.h5"
        critic_save_dir = self.saveDir + timestamp + score + "/critic.h5"
        self.actor.save(actor_save_dir)
        self.critic.save(critic_save_dir)
        print("Model Saved")

    def loadModel(self, loadDir):
        """Load NN models with "tf.keras.models.load_model()".

        Args:
            loadDir (string): model dir

        Returns:
            tf.keras.models: the loaded (compiled) actor and critic models.
        """
        actorDir = loadDir + "/actor.h5"
        criticDir = loadDir + "/critic.h5"
        actor_net_loaded = tf.keras.models.load_model(actorDir)
        critic_net_loaded = tf.keras.models.load_model(criticDir)

        print("++++++++++++++++++++++++++++++++++++")
        print("++++++++++++Model Loaded++++++++++++")
        print(loadDir)
        print("++++++++++++++++++++++++++++++++++++")
        return actor_net_loaded, critic_net_loaded
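

# Minimal usage sketch (not part of the original file): shows how this PPO class is wired up.
# All hyperparameter values below are illustrative placeholders, not values from the project.
if __name__ == "__main__":
    agent = PPO(
        stateSize=30,            # placeholder: size of the env's observation vector
        disActShape=[3, 3, 2],   # matches the actor's WS / AD / Shoot softmax heads
        conActSize=1,            # one continuous action, i.e. one (mu, sigma) pair
        conActRange=10,          # placeholder: continuous action clipped to [-10, 10]
        criticLR=1e-3,           # placeholder learning rates
        actorLR=1e-4,
        gamma=0.99,
        epsilon=0.2,             # PPO clip range
        entropyWeight=0.01,
        saveDir="PPO-Model/",    # placeholder save directory
        loadModelDir=None,       # None -> build fresh networks instead of loading weights
    )
    # one interaction step with a dummy all-zero state
    dummyState = np.zeros((1, 30), dtype=np.float32)
    disAct1, disAct2, disAct3, conAct, predictResult = agent.chooseAction(dummyState)
    print("chosen actions:", disAct1, disAct2, disAct3, conAct)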