import tensorflow as tf
from tensorflow.python.ops.numpy_ops import ndarray
import tensorflow_probability as tfp
import numpy as np
import time
import math
import datetime
from PPOConfig import PPOConfig

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers

EPS = 1e-10


class PPO(object):
    def __init__(
        self,
        stateSize: int,
        disActShape: list,
        conActSize: int,
        conActRange: float,
        PPOConfig: PPOConfig,
    ):
        """Initialize PPO.

        Args:
            stateSize (int): environment state size
            disActShape (list): discrete action shape.
                e.g. [3,2] means 2 discrete action branches with 3 and 2 choices respectively.
                If there is no discrete action output, use [0].
            conActSize (int): continuous action size. If there is no continuous action output, use 0.
            conActRange (float): continuous action range, -conActRange to +conActRange
            PPOConfig (PPOConfig): PPO configuration
        """
        # check whether discrete actions are used
        if disActShape == [0]:
            # no discrete action output
            self.disActSize = 0
            self.disOutputSize = 0
        else:
            # make sure every entry of disActShape is greater than 1
            try:
                if np.any(np.array(disActShape) <= 1):
                    raise ValueError(
                        "disActShape error, every entry of disActShape should be greater than 1 but got", disActShape
                    )
            except ValueError:
                raise
            self.disActSize = len(disActShape)
            self.disOutputSize = sum(disActShape)

        self.stateSize = stateSize
        self.disActShape = disActShape
        self.conActSize = conActSize
        self.conActRange = conActRange
        self.muSigSize = 2
        self.conOutputSize = conActSize * self.muSigSize

        # config
        self.NNShape = PPOConfig.NNShape
        self.criticLR = PPOConfig.criticLR
        self.actorLR = PPOConfig.actorLR
        self.gamma = PPOConfig.gamma
        self.lmbda = PPOConfig.lmbda
        self.clipRange = PPOConfig.clipRange
        self.entropyWeight = PPOConfig.entropyWeight
        self.trainEpochs = PPOConfig.trainEpochs
        self.saveDir = PPOConfig.saveDir
        self.loadModelDir = PPOConfig.loadModelDir

        print("---------thisPPO Params---------")
        print("self.stateSize = ", self.stateSize)
        print("self.disActShape = ", self.disActShape)
        print("self.disActSize = ", self.disActSize)
        print("self.disOutputSize = ", self.disOutputSize)
        print("self.conActSize = ", self.conActSize)
        print("self.conActRange = ", self.conActRange)
        print("self.conOutputSize = ", self.conOutputSize)

        # config
        print("---------thisPPO config---------")
        print("self.NNShape = ", self.NNShape)
        print("self.criticLR = ", self.criticLR)
        print("self.actorLR = ", self.actorLR)
        print("self.gamma = ", self.gamma)
        print("self.lmbda = ", self.lmbda)
        print("self.clipRange = ", self.clipRange)
        print("self.entropyWeight = ", self.entropyWeight)
        print("self.trainEpochs = ", self.trainEpochs)
        print("self.saveDir = ", self.saveDir)
        print("self.loadModelDir = ", self.loadModelDir)

        # build fresh NNs or load saved weights
        if self.loadModelDir is None:
            # critic NN
            self.critic = self.buildCriticNet(self.stateSize, 1, compileModel=True)
            # actor NN
            self.actor = self.buildActorNet(self.stateSize, compileModel=True)
            print("---------Actor Model Create Success---------")
            self.actor.summary()
            print("---------Critic Model Create Success---------")
            self.critic.summary()
        else:
            # critic NN
            self.critic = self.buildCriticNet(self.stateSize, 1, compileModel=True)
            # actor NN
            self.actor = self.buildActorNet(self.stateSize, compileModel=True)
            # load weights into the Critic & Actor NN
            self.loadWeightToModels(self.loadModelDir)
            print("---------Actor Model Load Success---------")
            self.actor.summary()
            print("---------Critic Model Load Success---------")
            self.critic.summary()
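
    # A minimal usage sketch (illustrative only): it assumes a PPOConfig instance
    # named `config` exposing the fields read above, plus hypothetical sizes.
    #
    #   agent = PPO(stateSize=4, disActShape=[3, 2], conActSize=1,
    #               conActRange=2.0, PPOConfig=config)
    #   actions, oldActorResult = agent.chooseAction(state)
    #   criticLoss, actorLoss = agent.trainCritcActor(
    #       states, oldActorResult, actions, rewards, dones, nextState)
    #
    # With disActShape=[3, 2] and conActSize=1, the actor emits 3 + 2 softmax
    # probabilities followed by one (mu, sigma) pair.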

    # Build Net
    def buildActorNet(self, inputSize: int, compileModel: bool):
        """Build the Actor neural net and optionally compile it.
        Output: [disAct1, disAct2, disAct3, mu, sigma]

        Args:
            inputSize (int): InputLayer neuron size.
            compileModel (bool): compile the model or not.

        Returns:
            keras.Model: the Actor NN
        """
        # -----------Input Layers-----------
        stateInput = layers.Input(shape=(inputSize,), name="stateInput")

        # -------Intermediate layers--------
        interLayers = []
        interLayersIndex = 0
        for neuralUnit in self.NNShape:
            thisLayerName = "dense" + str(interLayersIndex)
            if interLayersIndex == 0:
                interLayers.append(
                    layers.Dense(neuralUnit, activation="relu", name=thisLayerName)(stateInput)
                )
            else:
                interLayers.append(
                    layers.Dense(neuralUnit, activation="relu", name=thisLayerName)(interLayers[-1])
                )
            interLayersIndex += 1

        # ----------Output Layers-----------
        outputLayersList = []
        if self.disActSize != 0:
            # the NN has discrete action outputs.
            disActIndex = 0
            for thisDisActDepth in self.disActShape:
                thisDisActName = "disAct" + str(disActIndex)
                outputLayersList.append(
                    layers.Dense(thisDisActDepth, activation="softmax", name=thisDisActName)(
                        interLayers[-1]
                    )
                )
                disActIndex += 1
        if self.conActSize != 0:
            # the NN has continuous action outputs.
            mu = tf.multiply(
                layers.Dense(1, activation="tanh", name="muOut")(interLayers[-1]), self.conActRange
            )  # mu, i.e. the location parameter of the normal distribution
            sigma = tf.add(
                layers.Dense(1, activation="softplus", name="sigmaOut")(interLayers[-1]), EPS
            )  # sigma, i.e. the scale parameter of the normal distribution
            outputLayersList.append(mu)
            outputLayersList.append(sigma)
        totalOut = layers.concatenate(outputLayersList, name="totalOut")  # package all outputs

        # ----------Model Compile-----------
        model = keras.Model(inputs=stateInput, outputs=totalOut)
        if compileModel:  # Compile Model
            actorOPT = optimizers.Adam(learning_rate=self.actorLR)
            model.compile(optimizer=actorOPT, loss=self.aLoss())
        return model
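
    # Illustrative note (assumes the hypothetical sizes disActShape=[3, 2] and
    # conActSize=1): totalOut then has width 3 + 2 + 2 = 7, laid out as
    #   [disAct0 softmax (3), disAct1 softmax (2), mu (1), sigma (1)]
    # which is the column order that chooseAction and aLoss slice below.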

    def buildCriticNet(self, inputSize: int, outputSize: int, compileModel: bool):
        """Build the Critic neural net and optionally compile it. Output: [Q]

        Args:
            inputSize (int): input size
            outputSize (int): output size
            compileModel (bool): compile the model or not.

        Returns:
            keras.Model: the Critic NN
        """
        # -----------Input Layers-----------
        stateInput = keras.Input(shape=(inputSize,), name="stateInput")

        # -------Intermediate layers--------
        interLayers = []
        interLayersIndex = 0
        for neuralUnit in self.NNShape:
            thisLayerName = "dense" + str(interLayersIndex)
            if interLayersIndex == 0:
                interLayers.append(
                    layers.Dense(neuralUnit, activation="relu", name=thisLayerName)(stateInput)
                )
            else:
                interLayers.append(
                    layers.Dense(neuralUnit, activation="relu", name=thisLayerName)(interLayers[-1])
                )
            interLayersIndex += 1

        # ----------Output Layers-----------
        output = layers.Dense(outputSize, activation=None)(interLayers[-1])

        # ----------Model Compile-----------
        model = keras.Model(inputs=stateInput, outputs=output)
        if compileModel:
            criticOPT = optimizers.Adam(learning_rate=self.criticLR)
            model.compile(optimizer=criticOPT, loss=self.cLoss())
        return model

    # loss Function
    # critic loss
    def cLoss(self):
        """Critic loss function"""

        def loss(y_true, y_pred):
            # y_true: discountedR
            # y_pred: criticV = model.predict(states)
            adv = y_true - y_pred  # TD error
            loss = tf.reduce_mean(tf.square(adv))
            return loss

        return loss
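
    # In other words, the critic is fit by plain mean squared error between the
    # return target computed in discountReward and the predicted value:
    #   L_critic = mean((discountedR - V(s))^2)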

    # actor loss
    def aLoss(self):
        """Actor loss function"""

        def getDiscreteALoss(nowProbs, oldProbs, disOneHotAct, actShape, advantage):
            """get the discrete-action loss

            Args:
                nowProbs (tf.constant): (length, actionProbSize)
                oldProbs (tf.constant): (length, actionProbSize)
                disOneHotAct (tf.constant): one-hot taken actions, (length, actionProbSize)
                actShape (int): number of choices of this discrete action branch
                advantage (tf.constant): (length,)

            Returns:
                tf.constant: (length,)
            """
            entropy = tf.negative(
                tf.reduce_mean(tf.math.multiply(nowProbs, tf.math.log(nowProbs + EPS)))
            )
            nowSingleProbs = tf.reduce_mean(tf.multiply(nowProbs, disOneHotAct), axis=1)
            nowSingleProbs = tf.multiply(nowSingleProbs, actShape)
            oldSingleProbs = tf.reduce_mean(tf.multiply(oldProbs, disOneHotAct), axis=1)
            oldSingleProbs = tf.multiply(oldSingleProbs, actShape)
            ratio = tf.math.divide(nowSingleProbs, oldSingleProbs + EPS)
            value = tf.math.multiply(ratio, advantage)
            clipRatio = tf.clip_by_value(ratio, 1.0 - self.clipRange, 1.0 + self.clipRange)
            clipValue = tf.math.multiply(clipRatio, advantage)
            loss = tf.math.negative(
                tf.reduce_mean(tf.math.minimum(value, clipValue))
                - tf.multiply(self.entropyWeight, entropy)
            )
            return loss

        def getContinuousALoss(musig, actions, oldProbs, advantage):
            """get the continuous-action loss

            Args:
                musig (tf.constant): (length,2)
                actions (tf.constant): (length,)
                oldProbs (tf.constant): (length,)
                advantage (tf.constant): (length,)

            Returns:
                tf.constant: (length,)
            """
            mu = musig[:, 0]
            sigma = musig[:, 1]
            dist = tfp.distributions.Normal(mu, sigma)

            nowProbs = dist.prob(actions)
            entropy = tf.reduce_mean(dist.entropy())

            ratio = tf.math.divide(nowProbs, oldProbs + EPS)
            value = tf.math.multiply(ratio, advantage)
            clipRatio = tf.clip_by_value(ratio, 1.0 - self.clipRange, 1.0 + self.clipRange)
            clipValue = tf.math.multiply(clipRatio, advantage)
            loss = tf.negative(
                tf.reduce_mean(tf.math.minimum(value, clipValue))
                - tf.multiply(self.entropyWeight, entropy)
            )
            return loss

        def loss(y_true, y_pred):
            # y_true: [[disActProbs..., conActProbs..., disOneHotActs..., conActs..., advantage]]
            # y_pred: [[disActProbs..., mu, sigma...]]
            totalALoss = 0
            totalActionNum = 0
            advantage = tf.expand_dims(y_true[:, -1], axis=1)

            if self.disActSize != 0:
                # the NN has discrete action outputs.
                oldDisProbs = y_true[:, 0 : self.disOutputSize]
                nowDisProbs = y_pred[:, 0 : self.disOutputSize]  # [disAct1, disAct2, disAct3]
                disOneHotActs = y_true[
                    :,
                    self.disOutputSize
                    + self.conActSize : self.disOutputSize
                    + self.conActSize
                    + self.disOutputSize,
                ]
                lastDisActShape = 0
                for thisShape in self.disActShape:
                    thisNowDisProbs = nowDisProbs[:, lastDisActShape : lastDisActShape + thisShape]
                    thisOldDisProbs = oldDisProbs[:, lastDisActShape : lastDisActShape + thisShape]
                    thisDisOneHotActs = disOneHotActs[
                        :, lastDisActShape : lastDisActShape + thisShape
                    ]
                    discreteALoss = getDiscreteALoss(
                        thisNowDisProbs, thisOldDisProbs, thisDisOneHotActs, thisShape, advantage
                    )
                    lastDisActShape += thisShape
                    totalALoss += discreteALoss
                    totalActionNum += 1.0
            if self.conActSize != 0:
                # the NN has continuous action outputs.
                oldConProbs = y_true[:, self.disOutputSize : self.disOutputSize + self.conActSize]
                # continuous actions sit after the one-hot block (which is empty when
                # disOutputSize == 0), matching the hstack order built in trainActor.
                conActions = y_true[
                    :,
                    self.disOutputSize
                    + self.conActSize
                    + self.disOutputSize : self.disOutputSize
                    + self.conActSize
                    + self.disOutputSize
                    + self.conActSize,
                ]
                nowConMusigs = y_pred[:, self.disOutputSize :]  # [musig1, musig2]
                lastConAct = 0
                for conAct in range(self.conActSize):
                    thisNowConMusig = nowConMusigs[:, lastConAct : lastConAct + self.muSigSize]
                    thisOldConProb = oldConProbs[:, conAct : conAct + 1]
                    thisConAction = conActions[:, conAct]
                    continuousAloss = getContinuousALoss(
                        thisNowConMusig, thisConAction, thisOldConProb, advantage
                    )
                    totalALoss += continuousAloss
                    totalActionNum += 1.0
                    lastConAct += self.muSigSize
            loss = tf.divide(totalALoss, totalActionNum)
            return loss

        return loss
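
    # For reference, each per-action term above is the standard PPO clipped
    # surrogate with an entropy bonus:
    #   L = -E[ min(r_t * A_t, clip(r_t, 1 - clipRange, 1 + clipRange) * A_t) ]
    #       - entropyWeight * H(pi)
    # where r_t = pi_now(a_t|s_t) / pi_old(a_t|s_t).
    # Column layout of y_true under the hypothetical sizes disActShape=[3, 2],
    # conActSize=1 (13 columns): 5 old discrete probs, 1 old continuous prob,
    # 5 one-hot discrete actions, 1 continuous action, 1 advantage.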

    # get Actions & values
    def chooseAction(self, state: ndarray):
        """Let the agent choose an action to take.

        Args:
            state (ndarray): environment state

        Returns:
            actions (list): actions list, 2 dims, e.g. [[0],[1],[1.5]]
            predictResult: raw actor NN predict result
        """
        # let the actor choose an action, using the normal distribution
        # state = np.expand_dims(state,0)

        # make sure the state dimension is [stateNum, stateSize]
        if state.ndim != 2:
            stateNum = int(len(state) / self.stateSize)
            state = state.reshape([stateNum, self.stateSize])
        predictResult = self.actor(state)  # get predict result [[disAct1, disAct2, disAct3, musig]]
        # print("predictResult",predictResult)
        # predictResult = predictResult.numpy()
        actions = []
        if self.disActSize != 0:
            # the NN has discrete action outputs.
            lastDisActShape = 0
            for shape in self.disActShape:
                thisDisActProbs = predictResult[:, lastDisActShape : lastDisActShape + shape]
                dist = tfp.distributions.Categorical(probs=thisDisActProbs, dtype=tf.float32)
                action = int(dist.sample().numpy()[0])
                # action = np.argmax(thisDisActProbs)
                actions.append(action)
                lastDisActShape += shape
        if self.conActSize != 0:
            # the NN has continuous action outputs.
            lastConAct = 0
            for actIndex in range(self.conActSize):
                thisMu = predictResult[:, self.disOutputSize + lastConAct]
                thisSig = predictResult[:, self.disOutputSize + lastConAct + 1]
                if math.isnan(thisMu) or math.isnan(thisSig):
                    # check whether mu or sigma is nan
                    print("chooseAction:mu or sigma is nan")
                thisDist = np.random.normal(loc=thisMu, scale=thisSig)
                actions.append(np.clip(thisDist, -self.conActRange, self.conActRange))
                lastConAct += 2
        return actions, predictResult
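
    # Sampling note: each discrete branch is sampled from a Categorical over its
    # softmax outputs, and each continuous action is drawn from Normal(mu, sigma)
    # and clipped to [-conActRange, conActRange]. A toy call (hypothetical state
    # size of 4):
    #   actions, predictResult = agent.chooseAction(np.zeros(4, dtype=np.float32))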

    def trainCritcActor(
        self,
        states: ndarray,
        oldActorResult: ndarray,
        actions: ndarray,
        rewards: ndarray,
        dones: ndarray,
        nextState: ndarray,
        epochs: int = None,
    ):
        """Train critic & actor the PPO way.

        Args:
            states (ndarray): states
            oldActorResult (ndarray): actor predict result
            actions (ndarray): predicted actions, both discrete and continuous
            rewards (ndarray): rewards from the environment
            dones (ndarray): dones from the environment
            nextState (ndarray): next state from the environment
            epochs (int, optional): train epochs, defaults to the ppoConfig value. Defaults to None.

        Returns:
            tf.constant: criticLoss, actorLoss
        """
        if epochs is None:
            epochs = self.trainEpochs
        criticValues = self.getCriticV(state=states)
        discountedR = self.discountReward(nextState, criticValues, dones, rewards)
        advantage = self.getGAE(discountedR, criticValues)

        criticLoss = self.trainCritic(states, discountedR, epochs)
        actorLoss = self.trainActor(states, oldActorResult, actions, advantage, epochs)
        # print("A_Loss:", actorLoss, "C_Loss:", criticLoss)
        return criticLoss, actorLoss
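
    # A rollout-then-update sketch (illustrative only; the buffer lists, the
    # `agent` instance and the `env` object are hypothetical, not part of
    # this class):
    #
    #   states, actorResults, actions, rewards, dones = [], [], [], [], []
    #   state = env.reset()
    #   for step in range(bufferSize):
    #       act, res = agent.chooseAction(state)
    #       nextState, reward, done = env.step(act)
    #       states.append(state); actorResults.append(res[0])
    #       actions.append(act); rewards.append(reward); dones.append(done)
    #       state = nextState
    #   agent.trainCritcActor(np.array(states), np.array(actorResults),
    #                         np.array(actions), np.array(rewards),
    #                         np.array(dones), nextState)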

    def trainCritic(self, states: ndarray, discountedR: ndarray, epochs: int = None):
        """Critic NN training function.

        Args:
            states (ndarray): states
            discountedR (ndarray): discounted rewards
            epochs (int, optional): train epochs, defaults to the ppoConfig value. Defaults to None.

        Returns:
            tf.constant: all critic losses
        """
        if epochs is None:
            epochs = self.trainEpochs
        his = self.critic.fit(x=states, y=discountedR, epochs=epochs, verbose=0)
        return his.history["loss"]

    def trainActor(
        self,
        states: ndarray,
        oldActorResult: ndarray,
        actions: ndarray,
        advantage: ndarray,
        epochs: int = None,
    ):
        """Actor NN training function.

        Args:
            states (ndarray): states
            oldActorResult (ndarray): actor predict results
            actions (ndarray): actor predicted actions
            advantage (ndarray): GAE advantage
            epochs (int, optional): train epochs, defaults to the ppoConfig value. Defaults to None.

        Returns:
            tf.constant: all actor losses
        """
        # Train Actor
        # states: Buffer States
        # actions: Buffer Actions
        # advantage: GAE advantages
        # epochs: just epochs
        if epochs is None:
            epochs = self.trainEpochs
        actions = np.asarray(actions, dtype=np.float32)

        disActions = actions[:, 0 : self.disActSize]
        conActions = actions[:, self.disActSize :]
        oldDisProbs = oldActorResult[:, 0 : self.disOutputSize]  # [disAct1, disAct2, disAct3]
        oldConMusigs = oldActorResult[:, self.disOutputSize :]  # [musig1, musig2]
        if self.disActSize != 0:
            disOneHotActs = self.getOneHotActs(disActions)
            if self.conActSize != 0:
                # the NN has both discrete & continuous action outputs.
                oldPiProbs = self.conProb(oldConMusigs[:, 0], oldConMusigs[:, 1], conActions)
                # pack [oldDisProbs, oldPiProbs, disOneHotActs, conActions, advantage] as y_true
                y_true = np.hstack((oldDisProbs, oldPiProbs, disOneHotActs, conActions, advantage))
            else:
                # the NN has only discrete action outputs.
                # pack [oldDisProbs, disOneHotActs, advantage] as y_true
                y_true = np.hstack((oldDisProbs, disOneHotActs, advantage))
        else:
            if self.conActSize != 0:
                # the NN has only continuous action outputs.
                oldPiProbs = self.conProb(oldConMusigs[:, 0], oldConMusigs[:, 1], conActions)
                # pack [oldPiProbs, conActions, advantage] as y_true
                y_true = np.hstack((oldPiProbs, conActions, advantage))
            else:
                print("trainActor:disActSize & conActSize error")
                time.sleep(999999)
        # assemble the action history
        # train start
        if np.any(tf.math.is_nan(y_true)):
            print("y_true got nan")
            print("y_true", y_true)
        his = self.actor.fit(x=states, y=y_true, epochs=epochs, verbose=0)
        if np.any(tf.math.is_nan(his.history["loss"])):
            print("his.history['loss'] is nan!")
            print(his.history["loss"])
        return his.history["loss"]

    def saveWeights(self, score: float):
        """Save the current NNs' weights, using the "model.save_weights" method.
        Saved as "tf" format "ckpt" files.

        Args:
            score (float): current score
        """
        actor_save_dir = (
            self.saveDir + datetime.datetime.now().strftime("%H%M%S") + "/actor/" + "actor.ckpt"
        )
        critic_save_dir = (
            self.saveDir + datetime.datetime.now().strftime("%H%M%S") + "/critic/" + "critic.ckpt"
        )
        self.actor.save_weights(actor_save_dir, save_format="tf")
        self.critic.save_weights(critic_save_dir, save_format="tf")
        # create an empty file named after the score to record it
        score_dir = (
            self.saveDir + datetime.datetime.now().strftime("%H%M%S") + "/" + str(round(score))
        )
        scorefile = open(score_dir, "w")
        scorefile.close()
        print("Model's Weights Saved")

    def loadWeightToModels(self, loadDir: str):
        """Load NN weights, using the "model.load_weights()" method.
        Loads "tf" format "ckpt" files.

        Args:
            loadDir (str): model directory
        """
        actorDir = loadDir + "/actor/" + "actor.ckpt"
        criticDir = loadDir + "/critic/" + "critic.ckpt"
        self.actor.load_weights(actorDir)
        self.critic.load_weights(criticDir)

        print("++++++++++++++++++++++++++++++++++++")
        print("++++++++++++Model Loaded++++++++++++")
        print(loadDir)
        print("++++++++++++++++++++++++++++++++++++")

    def getCriticV(self, state: ndarray):
        """Get the Critic's predicted V value.

        Args:
            state (ndarray): Env state

        Returns:
            tensor: the Critic predict result
        """
        # if state.ndim < 2:
        #     state = np.expand_dims(state,0)
        if state.ndim != 2:
            stateNum = int(len(state) / self.stateSize)
            state = state.reshape([stateNum, self.stateSize])
        return self.critic.predict(state)

    def discountReward(self, nextState: ndarray, values: ndarray, dones: ndarray, rewards: ndarray):
        """Discount future rewards.

        Args:
            nextState (ndarray): next Env state
            values (ndarray): critic predict values
            dones (ndarray): dones from the environment
            rewards (ndarray): reward list of this episode

        Returns:
            ndarray: discounted rewards list, same shape as the input rewards
        """
        """
        nextV = self.getCriticV(nextState)
        dones = 1 - dones
        discountedRewards = []
        for i in reversed(range(len(rewards))):
            nextV = rewards[i] + dones[i] * self.gamma * nextV
            discountedRewards.append(nextV)
        discountedRewards.reverse()  # reverse
        discountedRewards = np.squeeze(discountedRewards)
        discountedRewards = np.expand_dims(discountedRewards, axis=1)
        # discountedRewards = np.array(discountedRewards)[:, np.newaxis]
        return discountedRewards
        """
        """
        nextV = self.getCriticV(nextState)
        discountedRewards = []
        for r in rewards[::-1]:
            nextV = r + self.gamma * nextV
            discountedRewards.append(nextV)
        discountedRewards.reverse()  # reverse
        discountedRewards = np.squeeze(discountedRewards)
        discountedRewards = np.expand_dims(discountedRewards, axis=1)
        # discountedRewards = np.array(discountedRewards)[:, np.newaxis]
        print(discountedRewards)
        return discountedRewards
        """
        g = 0
        discountedRewards = []
        lastValue = self.getCriticV(nextState)
        values = np.append(values, lastValue, axis=0)
        dones = 1 - dones
        for i in reversed(range(len(rewards))):
            delta = rewards[i] + self.gamma * values[i + 1] * dones[i] - values[i]
            g = delta + self.gamma * self.lmbda * dones[i] * g
            discountedRewards.append(g + values[i])
        discountedRewards.reverse()
        return np.asarray(discountedRewards)
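
    # The active branch above is the GAE(lambda) return: with
    #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    # it accumulates g_t = delta_t + gamma * lambda * (1 - done_t) * g_{t+1}
    # and returns g_t + V(s_t), so getGAE's "discountedRewards - values"
    # recovers the advantage g_t. Tiny sanity check (assumed numbers): with
    # gamma=0.99, lmbda=0.95, a single step with r=1, V(s)=0, V(s')=0 and
    # done=1 gives delta = 1, g = 1, and a returned target of 1.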

    def getGAE(self, discountedRewards: ndarray, values: ndarray):
        """Compute the GAE advantage.

        Args:
            discountedRewards (ndarray): discounted rewards
            values (ndarray): critic predict values

        Returns:
            ndarray: GAE advantage
        """
        advantage = discountedRewards - values
        advantage = (advantage - np.mean(advantage)) / (np.std(advantage) + EPS)
        return advantage

    def conProb(self, mu: ndarray, sig: ndarray, x: ndarray):
        """Calculate the probability density of x under Normal(mu, sigma).

        Args:
            mu (ndarray): mu
            sig (ndarray): sigma
            x (ndarray): x

        Returns:
            ndarray: probability
        """
        # probability density of x under the normal distribution (mu, sig)
        # return shape : (length,1)
        mu = np.reshape(mu, (np.size(mu),))
        sig = np.reshape(sig, (np.size(sig),))
        x = np.reshape(x, (np.size(x),))

        dist = tfp.distributions.Normal(mu, sig)
        prob = dist.prob(x)

        prob = np.reshape(prob, (np.size(x), 1))
        # dist = 1./(tf.sqrt(2.*np.pi)*sig)
        # prob = dist*tf.exp(-tf.square(x-mu)/(2.*tf.square(sig)))
        return prob
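
    # dist.prob(x) above is the usual Gaussian density, the same formula as the
    # commented-out manual version:
    #   p(x) = 1 / (sigma * sqrt(2 * pi)) * exp(-(x - mu)^2 / (2 * sigma^2))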

    def getOneHotActs(self, disActions):
        """One-hot action encoder.

        Args:
            disActions (ndarray): discrete actions

        Returns:
            ndarray: one hot actions
        """
        actIndex = 0
        for thisShape in self.disActShape:
            thisActs = disActions[:, actIndex]
            # tf.one_hot expects integer indices, so cast the float action ids
            thisOneHotAct = tf.squeeze(tf.one_hot(thisActs.astype(np.int32), thisShape)).numpy()
            if actIndex == 0:
                oneHotActs = thisOneHotAct
            else:
                oneHotActs = np.append(oneHotActs, thisOneHotAct, axis=1)
            actIndex += 1
        return oneHotActs
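
    # Encoding example (assumed sizes): with disActShape=[3, 2], an action row
    # [2, 0] becomes [0, 0, 1, 1, 0] -- a depth-3 one-hot followed by a depth-2
    # one-hot, concatenated along axis 1.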

    def getAverageEntropy(self, probs: ndarray):
        """Get the average discrete & continuous action entropies.

        Args:
            probs (ndarray): actor NN predict result

        Returns:
            float: average total entropy
            list: discrete entropies
            list: continuous entropies
        """
        discreteEntropys = []
        continuousEntropys = []
        if self.disActSize != 0:
            disProbs = probs[:, 0 : self.disOutputSize]
            lastDisActIndex = 0
            for actShape in self.disActShape:
                thisDisProbs = disProbs[:, lastDisActIndex : lastDisActIndex + actShape]
                lastDisActIndex += actShape
                discreteEntropys.append(
                    tf.negative(
                        tf.reduce_mean(
                            tf.math.multiply(thisDisProbs, tf.math.log(thisDisProbs + EPS))
                        )
                    )
                )
        if self.conActSize != 0:
            conProbs = probs[:, self.disOutputSize :]
            conActIndex = 0
            for i in range(self.conActSize):
                thisConProbs = conProbs[:, conActIndex : conActIndex + 2]
                conActIndex += 2
                continuousEntropys.append(tf.reduce_mean(thisConProbs[:, 1]))
        averageEntropy = np.mean([np.mean(discreteEntropys), np.mean(continuousEntropys)])
        return averageEntropy, discreteEntropys, continuousEntropys