import tensorflow as tf
from tensorflow.python.ops.numpy_ops import ndarray
import tensorflow_probability as tfp
import numpy as np
import time
import math
import datetime
from PPOConfig import PPOConfig

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers

EPS = 1e-10


class PPO(object):
    def __init__(
        self,
        stateSize: int,
        disActShape: list,
        conActSize: int,
        conActRange: float,
        PPOConfig: PPOConfig,
    ):
        """Initialize PPO.

        Args:
            stateSize (int): environment state size
            disActShape (list): discrete action shape.
                e.g. [3,2] means 2 discrete action branches with 3 and 2 choices respectively.
                If there is no discrete action output, use [0].
            conActSize (int): continuous action size. If there is no continuous action output, use 0.
            conActRange (float): continuous action range, -conActRange to +conActRange
            PPOConfig (PPOConfig): PPO configuration
        """
        # check whether discrete actions are used
        if disActShape == [0]:
            # no discrete action output
            self.disActSize = 0
            self.disOutputSize = 0
        else:
            # make sure every entry of disActShape is greater than 1
            try:
                if np.any(np.array(disActShape) <= 1):
                    raise ValueError(
                        "disActShape error, every entry of disActShape should be greater than 1 but got", disActShape
                    )
            except ValueError:
                raise
            self.disActSize = len(disActShape)
            self.disOutputSize = sum(disActShape)

        self.stateSize = stateSize
        self.disActShape = disActShape
        self.conActSize = conActSize
        self.conActRange = conActRange
        self.muSigSize = 2
        self.conOutputSize = conActSize * self.muSigSize

        # config
        self.NNShape = PPOConfig.NNShape
        self.criticLR = PPOConfig.criticLR
        self.actorLR = PPOConfig.actorLR
        self.gamma = PPOConfig.gamma
        self.lmbda = PPOConfig.lmbda
        self.clipRange = PPOConfig.clipRange
        self.entropyWeight = PPOConfig.entropyWeight
        self.trainEpochs = PPOConfig.trainEpochs
        self.saveDir = PPOConfig.saveDir
        self.loadModelDir = PPOConfig.loadModelDir

        print("---------thisPPO Params---------")
        print("self.stateSize = ", self.stateSize)
        print("self.disActShape = ", self.disActShape)
        print("self.disActSize = ", self.disActSize)
        print("self.disOutputSize = ", self.disOutputSize)
        print("self.conActSize = ", self.conActSize)
        print("self.conActRange = ", self.conActRange)
        print("self.conOutputSize = ", self.conOutputSize)

        # config
        print("---------thisPPO config---------")
        print("self.NNShape = ", self.NNShape)
        print("self.criticLR = ", self.criticLR)
        print("self.actorLR = ", self.actorLR)
        print("self.gamma = ", self.gamma)
        print("self.lmbda = ", self.lmbda)
        print("self.clipRange = ", self.clipRange)
        print("self.entropyWeight = ", self.entropyWeight)
        print("self.trainEpochs = ", self.trainEpochs)
        print("self.saveDir = ", self.saveDir)
        print("self.loadModelDir = ", self.loadModelDir)

        # build fresh NNs or load saved weights
        if self.loadModelDir is None:
            # critic NN
            self.critic = self.buildCriticNet(self.stateSize, 1, compileModel=True)
            # actor NN
            self.actor = self.buildActorNet(self.stateSize, compileModel=True)
            print("---------Actor Model Create Success---------")
            self.actor.summary()
            print("---------Critic Model Create Success---------")
            self.critic.summary()
        else:
            # critic NN
            self.critic = self.buildCriticNet(self.stateSize, 1, compileModel=True)
            # actor NN
            self.actor = self.buildActorNet(self.stateSize, compileModel=True)
            # load weights into the Critic & Actor NN
            self.loadWeightToModels(self.loadModelDir)
            print("---------Actor Model Load Success---------")
            self.actor.summary()
            print("---------Critic Model Load Success---------")
            self.critic.summary()
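
    # A minimal usage sketch (illustrative only): it assumes a PPOConfig instance
    # named `config` exposing the fields read above, plus hypothetical sizes.
    #
    #   agent = PPO(stateSize=4, disActShape=[3, 2], conActSize=1,
    #               conActRange=2.0, PPOConfig=config)
    #   actions, oldActorResult = agent.chooseAction(state)
    #   criticLoss, actorLoss = agent.trainCritcActor(
    #       states, oldActorResult, actions, rewards, dones, nextState)
    #
    # With disActShape=[3, 2] and conActSize=1, the actor emits 3 + 2 softmax
    # probabilities followed by one (mu, sigma) pair.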

    # Build Net
    def buildActorNet(self, inputSize: int, compileModel: bool):
        """Build the Actor neural net and optionally compile it.
        Output: [disAct1, disAct2, disAct3, mu, sigma]

        Args:
            inputSize (int): InputLayer neuron size.
            compileModel (bool): compile the model or not.

        Returns:
            keras.Model: the Actor NN
        """
        # -----------Input Layers-----------
        stateInput = layers.Input(shape=(inputSize,), name="stateInput")

        # -------Intermediate layers--------
        interLayers = []
        interLayersIndex = 0
        for neuralUnit in self.NNShape:
            thisLayerName = "dense" + str(interLayersIndex)
            if interLayersIndex == 0:
                interLayers.append(
                    layers.Dense(neuralUnit, activation="relu", name=thisLayerName)(stateInput)
                )
            else:
                interLayers.append(
                    layers.Dense(neuralUnit, activation="relu", name=thisLayerName)(interLayers[-1])
                )
            interLayersIndex += 1

        # ----------Output Layers-----------
        outputLayersList = []
        if self.disActSize != 0:
            # the NN has discrete action outputs.
            disActIndex = 0
            for thisDisActDepth in self.disActShape:
                thisDisActName = "disAct" + str(disActIndex)
                outputLayersList.append(
                    layers.Dense(thisDisActDepth, activation="softmax", name=thisDisActName)(
                        interLayers[-1]
                    )
                )
                disActIndex += 1
        if self.conActSize != 0:
            # the NN has continuous action outputs.
            mu = tf.multiply(
                layers.Dense(1, activation="tanh", name="muOut")(interLayers[-1]), self.conActRange
            )  # mu, i.e. the location parameter of the normal distribution
            sigma = tf.add(
                layers.Dense(1, activation="softplus", name="sigmaOut")(interLayers[-1]), EPS
            )  # sigma, i.e. the scale parameter of the normal distribution
            outputLayersList.append(mu)
            outputLayersList.append(sigma)
        totalOut = layers.concatenate(outputLayersList, name="totalOut")  # package all outputs

        # ----------Model Compile-----------
        model = keras.Model(inputs=stateInput, outputs=totalOut)
        if compileModel:  # Compile Model
            actorOPT = optimizers.Adam(learning_rate=self.actorLR)
            model.compile(optimizer=actorOPT, loss=self.aLoss())
        return model
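
    # Illustrative note (assumes the hypothetical sizes disActShape=[3, 2] and
    # conActSize=1): totalOut then has width 3 + 2 + 2 = 7, laid out as
    #   [disAct0 softmax (3), disAct1 softmax (2), mu (1), sigma (1)]
    # which is the column order that chooseAction and aLoss slice below.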

    def buildCriticNet(self, inputSize: int, outputSize: int, compileModel: bool):
        """Build the Critic neural net and optionally compile it. Output: [Q]

        Args:
            inputSize (int): input size
            outputSize (int): output size
            compileModel (bool): compile the model or not.

        Returns:
            keras.Model: the Critic NN
        """
        # -----------Input Layers-----------
        stateInput = keras.Input(shape=(inputSize,), name="stateInput")

        # -------Intermediate layers--------
        interLayers = []
        interLayersIndex = 0
        for neuralUnit in self.NNShape:
            thisLayerName = "dense" + str(interLayersIndex)
            if interLayersIndex == 0:
                interLayers.append(
                    layers.Dense(neuralUnit, activation="relu", name=thisLayerName)(stateInput)
                )
            else:
                interLayers.append(
                    layers.Dense(neuralUnit, activation="relu", name=thisLayerName)(interLayers[-1])
                )
            interLayersIndex += 1

        # ----------Output Layers-----------
        output = layers.Dense(outputSize, activation=None)(interLayers[-1])

        # ----------Model Compile-----------
        model = keras.Model(inputs=stateInput, outputs=output)
        if compileModel:
            criticOPT = optimizers.Adam(learning_rate=self.criticLR)
            model.compile(optimizer=criticOPT, loss=self.cLoss())
        return model

    # loss Function
    # critic loss
    def cLoss(self):
        """Critic loss function"""

        def loss(y_true, y_pred):
            # y_true: discountedR
            # y_pred: criticV = model.predict(states)
            adv = y_true - y_pred  # TD error
            loss = tf.reduce_mean(tf.square(adv))
            return loss

        return loss
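
    # In other words, the critic is fit by plain mean squared error between the
    # return target computed in discountReward and the predicted value:
    #   L_critic = mean((discountedR - V(s))^2)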

    # actor loss
    def aLoss(self):
        """Actor loss function"""

        def getDiscreteALoss(nowProbs, oldProbs, disOneHotAct, actShape, advantage):
            """get the discrete-action loss

            Args:
                nowProbs (tf.constant): (length, actionProbSize)
                oldProbs (tf.constant): (length, actionProbSize)
                disOneHotAct (tf.constant): one-hot taken actions, (length, actionProbSize)
                actShape (int): number of choices of this discrete action branch
                advantage (tf.constant): (length,)

            Returns:
                tf.constant: (length,)
            """
            entropy = tf.negative(
                tf.reduce_mean(tf.math.multiply(nowProbs, tf.math.log(nowProbs + EPS)))
            )
            nowSingleProbs = tf.reduce_mean(tf.multiply(nowProbs, disOneHotAct), axis=1)
            nowSingleProbs = tf.multiply(nowSingleProbs, actShape)
            oldSingleProbs = tf.reduce_mean(tf.multiply(oldProbs, disOneHotAct), axis=1)
            oldSingleProbs = tf.multiply(oldSingleProbs, actShape)
            ratio = tf.math.divide(nowSingleProbs, oldSingleProbs + EPS)
            value = tf.math.multiply(ratio, advantage)
            clipRatio = tf.clip_by_value(ratio, 1.0 - self.clipRange, 1.0 + self.clipRange)
            clipValue = tf.math.multiply(clipRatio, advantage)
            loss = tf.math.negative(
                tf.reduce_mean(tf.math.minimum(value, clipValue))
                - tf.multiply(self.entropyWeight, entropy)
            )
            return loss

        def getContinuousALoss(musig, actions, oldProbs, advantage):
            """get the continuous-action loss

            Args:
                musig (tf.constant): (length,2)
                actions (tf.constant): (length,)
                oldProbs (tf.constant): (length,)
                advantage (tf.constant): (length,)

            Returns:
                tf.constant: (length,)
            """
            mu = musig[:, 0]
            sigma = musig[:, 1]
            dist = tfp.distributions.Normal(mu, sigma)

            nowProbs = dist.prob(actions)
            entropy = tf.reduce_mean(dist.entropy())

            ratio = tf.math.divide(nowProbs, oldProbs + EPS)
            value = tf.math.multiply(ratio, advantage)
            clipRatio = tf.clip_by_value(ratio, 1.0 - self.clipRange, 1.0 + self.clipRange)
            clipValue = tf.math.multiply(clipRatio, advantage)
            loss = tf.negative(
                tf.reduce_mean(tf.math.minimum(value, clipValue))
                - tf.multiply(self.entropyWeight, entropy)
            )
            return loss

        def loss(y_true, y_pred):
            # y_true: [[disActProbs..., conActProbs..., disOneHotActs..., conActs..., advantage]]
            # y_pred: [[disActProbs..., mu, sigma...]]
            totalALoss = 0
            totalActionNum = 0
            advantage = tf.expand_dims(y_true[:, -1], axis=1)

            if self.disActSize != 0:
                # the NN has discrete action outputs.
                oldDisProbs = y_true[:, 0 : self.disOutputSize]
                nowDisProbs = y_pred[:, 0 : self.disOutputSize]  # [disAct1, disAct2, disAct3]
                disOneHotActs = y_true[
                    :,
                    self.disOutputSize
                    + self.conActSize : self.disOutputSize
                    + self.conActSize
                    + self.disOutputSize,
                ]
                lastDisActShape = 0
                for thisShape in self.disActShape:
                    thisNowDisProbs = nowDisProbs[:, lastDisActShape : lastDisActShape + thisShape]
                    thisOldDisProbs = oldDisProbs[:, lastDisActShape : lastDisActShape + thisShape]
                    thisDisOneHotActs = disOneHotActs[
                        :, lastDisActShape : lastDisActShape + thisShape
                    ]
                    discreteALoss = getDiscreteALoss(
                        thisNowDisProbs, thisOldDisProbs, thisDisOneHotActs, thisShape, advantage
                    )
                    lastDisActShape += thisShape
                    totalALoss += discreteALoss
                    totalActionNum += 1.0
            if self.conActSize != 0:
                # the NN has continuous action outputs.
                oldConProbs = y_true[:, self.disOutputSize : self.disOutputSize + self.conActSize]
                # continuous actions sit after the one-hot block (which is empty when
                # disOutputSize == 0), matching the hstack order built in trainActor.
                conActions = y_true[
                    :,
                    self.disOutputSize
                    + self.conActSize
                    + self.disOutputSize : self.disOutputSize
                    + self.conActSize
                    + self.disOutputSize
                    + self.conActSize,
                ]
                nowConMusigs = y_pred[:, self.disOutputSize :]  # [musig1, musig2]
                lastConAct = 0
                for conAct in range(self.conActSize):
                    thisNowConMusig = nowConMusigs[:, lastConAct : lastConAct + self.muSigSize]
                    thisOldConProb = oldConProbs[:, conAct : conAct + 1]
                    thisConAction = conActions[:, conAct]
                    continuousAloss = getContinuousALoss(
                        thisNowConMusig, thisConAction, thisOldConProb, advantage
                    )
                    totalALoss += continuousAloss
                    totalActionNum += 1.0
                    lastConAct += self.muSigSize
            loss = tf.divide(totalALoss, totalActionNum)
            return loss

        return loss
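
    # For reference, each per-action term above is the standard PPO clipped
    # surrogate with an entropy bonus:
    #   L = -E[ min(r_t * A_t, clip(r_t, 1 - clipRange, 1 + clipRange) * A_t) ]
    #       - entropyWeight * H(pi)
    # where r_t = pi_now(a_t|s_t) / pi_old(a_t|s_t).
    # Column layout of y_true under the hypothetical sizes disActShape=[3, 2],
    # conActSize=1 (13 columns): 5 old discrete probs, 1 old continuous prob,
    # 5 one-hot discrete actions, 1 continuous action, 1 advantage.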

    # get Actions & values
    def chooseAction(self, state: ndarray):
        """Let the agent choose an action to take.

        Args:
            state (ndarray): environment state

        Returns:
            actions (list): actions list, 2 dims, e.g. [[0],[1],[1.5]]
            predictResult: raw actor NN predict result
        """
        # let the actor choose an action, using the normal distribution
        # state = np.expand_dims(state,0)

        # make sure the state dimension is [stateNum, stateSize]
        if state.ndim != 2:
            stateNum = int(len(state) / self.stateSize)
            state = state.reshape([stateNum, self.stateSize])
        predictResult = self.actor(state)  # get predict result [[disAct1, disAct2, disAct3, musig]]
        # print("predictResult",predictResult)
        # predictResult = predictResult.numpy()
        actions = []
        if self.disActSize != 0:
            # the NN has discrete action outputs.
            lastDisActShape = 0
            for shape in self.disActShape:
                thisDisActProbs = predictResult[:, lastDisActShape : lastDisActShape + shape]
                dist = tfp.distributions.Categorical(probs=thisDisActProbs, dtype=tf.float32)
                action = int(dist.sample().numpy()[0])
                # action = np.argmax(thisDisActProbs)
                actions.append(action)
                lastDisActShape += shape
        if self.conActSize != 0:
            # the NN has continuous action outputs.
            lastConAct = 0
            for actIndex in range(self.conActSize):
                thisMu = predictResult[:, self.disOutputSize + lastConAct]
                thisSig = predictResult[:, self.disOutputSize + lastConAct + 1]
                if math.isnan(thisMu) or math.isnan(thisSig):
                    # check whether mu or sigma is nan
                    print("chooseAction:mu or sigma is nan")
                thisDist = np.random.normal(loc=thisMu, scale=thisSig)
                actions.append(np.clip(thisDist, -self.conActRange, self.conActRange))
                lastConAct += 2
        return actions, predictResult
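
    # Sampling note: each discrete branch is sampled from a Categorical over its
    # softmax outputs, and each continuous action is drawn from Normal(mu, sigma)
    # and clipped to [-conActRange, conActRange]. A toy call (hypothetical state
    # size of 4):
    #   actions, predictResult = agent.chooseAction(np.zeros(4, dtype=np.float32))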

    def trainCritcActor(
        self,
        states: ndarray,
        oldActorResult: ndarray,
        actions: ndarray,
        rewards: ndarray,
        dones: ndarray,
        nextState: ndarray,
        epochs: int = None,
    ):
        """Train critic & actor the PPO way.

        Args:
            states (ndarray): states
            oldActorResult (ndarray): actor predict result
            actions (ndarray): predicted actions, both discrete and continuous
            rewards (ndarray): rewards from the environment
            dones (ndarray): dones from the environment
            nextState (ndarray): next state from the environment
            epochs (int, optional): train epochs, defaults to the ppoConfig value. Defaults to None.

        Returns:
            tf.constant: criticLoss, actorLoss
        """
        if epochs is None:
            epochs = self.trainEpochs
        criticValues = self.getCriticV(state=states)
        discountedR = self.discountReward(nextState, criticValues, dones, rewards)
        advantage = self.getGAE(discountedR, criticValues)

        criticLoss = self.trainCritic(states, discountedR, epochs)
        actorLoss = self.trainActor(states, oldActorResult, actions, advantage, epochs)
        # print("A_Loss:", actorLoss, "C_Loss:", criticLoss)
        return criticLoss, actorLoss
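
    # A rollout-then-update sketch (illustrative only; the buffer lists, the
    # `agent` instance and the `env` object are hypothetical, not part of
    # this class):
    #
    #   states, actorResults, actions, rewards, dones = [], [], [], [], []
    #   state = env.reset()
    #   for step in range(bufferSize):
    #       act, res = agent.chooseAction(state)
    #       nextState, reward, done = env.step(act)
    #       states.append(state); actorResults.append(res[0])
    #       actions.append(act); rewards.append(reward); dones.append(done)
    #       state = nextState
    #   agent.trainCritcActor(np.array(states), np.array(actorResults),
    #                         np.array(actions), np.array(rewards),
    #                         np.array(dones), nextState)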

    def trainCritic(self, states: ndarray, discountedR: ndarray, epochs: int = None):
        """Critic NN training function.

        Args:
            states (ndarray): states
            discountedR (ndarray): discounted rewards
            epochs (int, optional): train epochs, defaults to the ppoConfig value. Defaults to None.

        Returns:
            tf.constant: all critic losses
        """
        if epochs is None:
            epochs = self.trainEpochs
        his = self.critic.fit(x=states, y=discountedR, epochs=epochs, verbose=0)
        return his.history["loss"]

    def trainActor(
        self,
        states: ndarray,
        oldActorResult: ndarray,
        actions: ndarray,
        advantage: ndarray,
        epochs: int = None,
    ):
        """Actor NN training function.

        Args:
            states (ndarray): states
            oldActorResult (ndarray): actor predict results
            actions (ndarray): actor predicted actions
            advantage (ndarray): GAE advantage
            epochs (int, optional): train epochs, defaults to the ppoConfig value. Defaults to None.

        Returns:
            tf.constant: all actor losses
        """
        # Train Actor
        # states: Buffer States
        # actions: Buffer Actions
        # advantage: GAE advantages
        # epochs: just epochs
        if epochs is None:
            epochs = self.trainEpochs
        actions = np.asarray(actions, dtype=np.float32)

        disActions = actions[:, 0 : self.disActSize]
        conActions = actions[:, self.disActSize :]
        oldDisProbs = oldActorResult[:, 0 : self.disOutputSize]  # [disAct1, disAct2, disAct3]
        oldConMusigs = oldActorResult[:, self.disOutputSize :]  # [musig1, musig2]
        if self.disActSize != 0:
            disOneHotActs = self.getOneHotActs(disActions)
            if self.conActSize != 0:
                # the NN has both discrete & continuous action outputs.
                oldPiProbs = self.conProb(oldConMusigs[:, 0], oldConMusigs[:, 1], conActions)
                # pack [oldDisProbs, oldPiProbs, disOneHotActs, conActions, advantage] as y_true
                y_true = np.hstack((oldDisProbs, oldPiProbs, disOneHotActs, conActions, advantage))
            else:
                # the NN has only discrete action outputs.
                # pack [oldDisProbs, disOneHotActs, advantage] as y_true
                y_true = np.hstack((oldDisProbs, disOneHotActs, advantage))
        else:
            if self.conActSize != 0:
                # the NN has only continuous action outputs.
                oldPiProbs = self.conProb(oldConMusigs[:, 0], oldConMusigs[:, 1], conActions)
                # pack [oldPiProbs, conActions, advantage] as y_true
                y_true = np.hstack((oldPiProbs, conActions, advantage))
            else:
                print("trainActor:disActSize & conActSize error")
                time.sleep(999999)
        # assemble the action history
        # train start
        if np.any(tf.math.is_nan(y_true)):
            print("y_true got nan")
            print("y_true", y_true)
        his = self.actor.fit(x=states, y=y_true, epochs=epochs, verbose=0)
        if np.any(tf.math.is_nan(his.history["loss"])):
            print("his.history['loss'] is nan!")
            print(his.history["loss"])
        return his.history["loss"]

    def saveWeights(self, score: float):
        """Save the current NNs' weights, using the "model.save_weights" method.
        Saved as "tf" format "ckpt" files.

        Args:
            score (float): current score
        """
        actor_save_dir = (
            self.saveDir + datetime.datetime.now().strftime("%H%M%S") + "/actor/" + "actor.ckpt"
        )
        critic_save_dir = (
            self.saveDir + datetime.datetime.now().strftime("%H%M%S") + "/critic/" + "critic.ckpt"
        )
        self.actor.save_weights(actor_save_dir, save_format="tf")
        self.critic.save_weights(critic_save_dir, save_format="tf")
        # create an empty file named after the score to record it
        score_dir = (
            self.saveDir + datetime.datetime.now().strftime("%H%M%S") + "/" + str(round(score))
        )
        scorefile = open(score_dir, "w")
        scorefile.close()
        print("Model's Weights Saved")

    def loadWeightToModels(self, loadDir: str):
        """Load NN weights, using the "model.load_weights()" method.
        Loads "tf" format "ckpt" files.

        Args:
            loadDir (str): model directory
        """
        actorDir = loadDir + "/actor/" + "actor.ckpt"
        criticDir = loadDir + "/critic/" + "critic.ckpt"
        self.actor.load_weights(actorDir)
        self.critic.load_weights(criticDir)

        print("++++++++++++++++++++++++++++++++++++")
        print("++++++++++++Model Loaded++++++++++++")
        print(loadDir)
        print("++++++++++++++++++++++++++++++++++++")

    def getCriticV(self, state: ndarray):
        """Get the Critic's predicted V value.

        Args:
            state (ndarray): Env state

        Returns:
            tensor: the Critic predict result
        """
        # if state.ndim < 2:
        #     state = np.expand_dims(state,0)
        if state.ndim != 2:
            stateNum = int(len(state) / self.stateSize)
            state = state.reshape([stateNum, self.stateSize])
        return self.critic.predict(state)

    def discountReward(self, nextState: ndarray, values: ndarray, dones: ndarray, rewards: ndarray):
        """Discount future rewards.

        Args:
            nextState (ndarray): next Env state
            values (ndarray): critic predict values
            dones (ndarray): dones from the environment
            rewards (ndarray): reward list of this episode

        Returns:
            ndarray: discounted rewards list, same shape as the input rewards
        """
        """
        nextV = self.getCriticV(nextState)
        dones = 1 - dones
        discountedRewards = []
        for i in reversed(range(len(rewards))):
            nextV = rewards[i] + dones[i] * self.gamma * nextV
            discountedRewards.append(nextV)
        discountedRewards.reverse()  # reverse
        discountedRewards = np.squeeze(discountedRewards)
        discountedRewards = np.expand_dims(discountedRewards, axis=1)
        # discountedRewards = np.array(discountedRewards)[:, np.newaxis]
        return discountedRewards
        """
        """
        nextV = self.getCriticV(nextState)
        discountedRewards = []
        for r in rewards[::-1]:
            nextV = r + self.gamma * nextV
            discountedRewards.append(nextV)
        discountedRewards.reverse()  # reverse
        discountedRewards = np.squeeze(discountedRewards)
        discountedRewards = np.expand_dims(discountedRewards, axis=1)
        # discountedRewards = np.array(discountedRewards)[:, np.newaxis]
        print(discountedRewards)
        return discountedRewards
        """
        g = 0
        discountedRewards = []
        lastValue = self.getCriticV(nextState)
        values = np.append(values, lastValue, axis=0)
        dones = 1 - dones
        for i in reversed(range(len(rewards))):
            delta = rewards[i] + self.gamma * values[i + 1] * dones[i] - values[i]
            g = delta + self.gamma * self.lmbda * dones[i] * g
            discountedRewards.append(g + values[i])
        discountedRewards.reverse()
        return np.asarray(discountedRewards)
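
    # The active branch above is the GAE(lambda) return: with
    #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    # it accumulates g_t = delta_t + gamma * lambda * (1 - done_t) * g_{t+1}
    # and returns g_t + V(s_t), so getGAE's "discountedRewards - values"
    # recovers the advantage g_t. Tiny sanity check (assumed numbers): with
    # gamma=0.99, lmbda=0.95, a single step with r=1, V(s)=0, V(s')=0 and
    # done=1 gives delta = 1, g = 1, and a returned target of 1.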

    def getGAE(self, discountedRewards: ndarray, values: ndarray):
        """Compute the GAE advantage.

        Args:
            discountedRewards (ndarray): discounted rewards
            values (ndarray): critic predict values

        Returns:
            ndarray: GAE advantage
        """
        advantage = discountedRewards - values
        advantage = (advantage - np.mean(advantage)) / (np.std(advantage) + EPS)
        return advantage

    def conProb(self, mu: ndarray, sig: ndarray, x: ndarray):
        """Calculate the probability density of x under Normal(mu, sigma).

        Args:
            mu (ndarray): mu
            sig (ndarray): sigma
            x (ndarray): x

        Returns:
            ndarray: probability
        """
        # probability density of x under the normal distribution (mu, sig)
        # return shape : (length,1)
        mu = np.reshape(mu, (np.size(mu),))
        sig = np.reshape(sig, (np.size(sig),))
        x = np.reshape(x, (np.size(x),))

        dist = tfp.distributions.Normal(mu, sig)
        prob = dist.prob(x)

        prob = np.reshape(prob, (np.size(x), 1))
        # dist = 1./(tf.sqrt(2.*np.pi)*sig)
        # prob = dist*tf.exp(-tf.square(x-mu)/(2.*tf.square(sig)))
        return prob
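
    # dist.prob(x) above is the usual Gaussian density, the same formula as the
    # commented-out manual version:
    #   p(x) = 1 / (sigma * sqrt(2 * pi)) * exp(-(x - mu)^2 / (2 * sigma^2))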

    def getOneHotActs(self, disActions):
        """One-hot action encoder.

        Args:
            disActions (ndarray): discrete actions

        Returns:
            ndarray: one hot actions
        """
        actIndex = 0
        for thisShape in self.disActShape:
            thisActs = disActions[:, actIndex]
            # tf.one_hot expects integer indices, so cast the float action ids
            thisOneHotAct = tf.squeeze(tf.one_hot(thisActs.astype(np.int32), thisShape)).numpy()
            if actIndex == 0:
                oneHotActs = thisOneHotAct
            else:
                oneHotActs = np.append(oneHotActs, thisOneHotAct, axis=1)
            actIndex += 1
        return oneHotActs
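
    # Encoding example (assumed sizes): with disActShape=[3, 2], an action row
    # [2, 0] becomes [0, 0, 1, 1, 0] -- a depth-3 one-hot followed by a depth-2
    # one-hot, concatenated along axis 1.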

    def getAverageEntropy(self, probs: ndarray):
        """Get the average discrete & continuous action entropies.

        Args:
            probs (ndarray): actor NN predict result

        Returns:
            float: average total entropy
            list: discrete entropies
            list: continuous entropies
        """
        discreteEntropys = []
        continuousEntropys = []
        if self.disActSize != 0:
            disProbs = probs[:, 0 : self.disOutputSize]
            lastDisActIndex = 0
            for actShape in self.disActShape:
                thisDisProbs = disProbs[:, lastDisActIndex : lastDisActIndex + actShape]
                lastDisActIndex += actShape
                discreteEntropys.append(
                    tf.negative(
                        tf.reduce_mean(
                            tf.math.multiply(thisDisProbs, tf.math.log(thisDisProbs + EPS))
                        )
                    )
                )
        if self.conActSize != 0:
            conProbs = probs[:, self.disOutputSize :]
            conActIndex = 0
            for i in range(self.conActSize):
                thisConProbs = conProbs[:, conActIndex : conActIndex + 2]
                conActIndex += 2
                continuousEntropys.append(tf.reduce_mean(thisConProbs[:, 1]))
        averageEntropy = np.mean([np.mean(discreteEntropys), np.mean(continuousEntropys)])
        return averageEntropy, discreteEntropys, continuousEntropys