Aimbot-PPO/Aimbot-PPO-Python/PPO.py


import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import math
import copy
import datetime
import os
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from keras_radam import RAdam
class PPO(object):
    """PPO agent with a hybrid action space: several discrete action branches plus continuous actions."""
def __init__(self, stateSize, disActShape, conActSize, conActRange, criticLR, actorLR, gamma, epsilon, entropyWeight, saveDir, loadModelDir):
        # every discrete action branch must offer more than one choice
        if np.any(np.array(disActShape) <= 1):
            raise ValueError("disActShape error: every entry of disActShape should be greater than 1, but got {}".format(disActShape))
self.stateSize = stateSize
# self.actionSize = actionSize
self.disActShape = disActShape # shape of discrete action output. like [3,3,2]
self.disActSize = len(disActShape)
self.conActSize = conActSize
self.conActRange = conActRange
self.criticLR = criticLR
self.actorLR = actorLR
self.GAMMA = gamma
self.EPSILON = epsilon
self.saveDir = saveDir
self.entropyWeight = entropyWeight
self.disOutputSize = sum(disActShape)
self.conOutputSize = conActSize * 2
        # critic NN
        self.critic = self.buildCriticNet(self.stateSize, 1, compileModel=True)
        # actor NN
        self.actor = self.buildActorNet(self.stateSize, self.conActRange, compileModel=True)
        if loadModelDir is not None:
            # load saved weights into Critic & Actor NN
            self.loadWeightToModels(loadModelDir)
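        # Actor output layout (with the heads hard-coded in buildActorNet, disActShape is
        # expected to be [3, 3, 2] and conActSize to be 1):
        #   columns 0 .. disOutputSize-1 : softmax probabilities of the discrete branches
        #   next column                  : mu of the continuous action
        #   last column                  : sigma of the continuous action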
# Build Net
    def buildActorNet(self, inputSize, continuousActionRange, compileModel):
        """Build the actor neural net and optionally compile it.
        Output: [disAct1, disAct2, disAct3, mu, sigma]
        Args:
            inputSize (int): input layer size.
            continuousActionRange (float): max range of the continuous action.
            compileModel (bool): whether to compile the model with the actor loss.
        Returns:
            keras.Model: the actor NN
        """
stateInput = layers.Input(shape=(inputSize,), name='stateInput')
dense0 = layers.Dense(500, activation='relu',name='dense0',)(stateInput)
dense1 = layers.Dense(200, activation='relu',name='dense1',)(dense0)
dense2 = layers.Dense(100, activation='relu', name='dense2')(dense1)
disAct1 = layers.Dense(3, activation='softmax',name='WSAction')(dense2) # WS
disAct2 = layers.Dense(3, activation='softmax',name='ADAction')(dense2) # AD
disAct3 = layers.Dense(2, activation='softmax',name='ShootAction')(dense2) # Mouse shoot
        mu = continuousActionRange * layers.Dense(1, activation='tanh', name='muOut')(dense2)  # mu: mean of the normal distribution
        sigma = 1e-8 + layers.Dense(1, activation='softplus', name='sigmaOut')(dense2)  # sigma: standard deviation of the normal distribution
# musig = layers.concatenate([mu,sigma],name = 'musig')
totalOut = layers.concatenate(
[disAct1, disAct2, disAct3, mu, sigma], name='totalOut') # package
model = keras.Model(inputs=stateInput, outputs=totalOut)
#actorOPT = optimizers.Adam(learning_rate = self.actorLR)
if compileModel:
actorOPT = RAdam(self.actorLR)
model.compile(optimizer=actorOPT, loss=self.aLoss())
return model
    def buildCriticNet(self, inputSize, outputSize, compileModel):
        """Build the critic neural net and optionally compile it.
        Output: [Q]
        Args:
            inputSize (int): input layer size
            outputSize (int): size of the value output
            compileModel (bool): whether to compile the model with the critic loss
        Returns:
            keras.Model: the critic NN
        """
stateInput = keras.Input(shape=(inputSize,))
dense0 = layers.Dense(500, activation='relu',
name='dense0',)(stateInput)
dense1 = layers.Dense(200, activation='relu')(dense0)
dense2 = layers.Dense(100, activation='relu')(dense1)
output = layers.Dense(outputSize)(dense2)
model = keras.Model(inputs=stateInput, outputs=output)
if compileModel:
criticOPT = optimizers.Adam(learning_rate=self.criticLR)
model.compile(optimizer=criticOPT, loss=self.cLoss())
return model
# loss Function
def cLoss(self):
"""Critic Loss function
"""
def loss(y_true, y_pred):
# y_true: discountedR
            # y_pred: criticV = model.predict(states)
advantage = y_true - y_pred # TD error
loss = tf.reduce_mean(tf.square(advantage))
return loss
return loss
def aLoss(self):
        def getDiscreteALoss(nowProbs, oldProbs, advantage):
            """Get the clipped-surrogate loss of one discrete action branch.
            Args:
                nowProbs (tf.Tensor): (length, actionSize)
                oldProbs (tf.Tensor): (length, actionSize)
                advantage (tf.Tensor): (length,)
            Returns:
                tf.Tensor: scalar loss for this branch
            """
            # mean of p*log(p) is the negative entropy; adding it to the loss acts as an entropy bonus
            entropy = tf.reduce_mean(tf.math.multiply(nowProbs, tf.math.log(nowProbs + 1e-6)))
            ratio = tf.math.divide(nowProbs, oldProbs + 1e-6)
            value = tf.math.multiply(ratio, tf.expand_dims(advantage, axis=1))
            clipRatio = tf.clip_by_value(ratio, 1. - self.EPSILON, 1. + self.EPSILON)
            clipValue = tf.math.multiply(clipRatio, tf.expand_dims(advantage, axis=1))
            loss = -tf.reduce_mean(tf.math.minimum(value, clipValue)) + self.entropyWeight * entropy
            return loss
        def getContinuousALoss(musig, actions, oldProbs, advantage):
            """Get the clipped-surrogate loss of one continuous action.
            Args:
                musig (tf.Tensor): (length, 2), columns are [mu, sigma]
                actions (tf.Tensor): (length,)
                oldProbs (tf.Tensor): (length,)
                advantage (tf.Tensor): (length,)
            Returns:
                tf.Tensor: scalar loss for this action
            """
            mu = musig[:, 0]
            sigma = musig[:, 1]
            dist = tfp.distributions.Normal(mu, sigma)
            nowProbs = dist.prob(actions)
            ratio = tf.math.divide(nowProbs, oldProbs + 1e-6)
            entropy = tf.reduce_mean(dist.entropy())
            # ratio and advantage are both (length,); keep the shapes consistent for the clipped objective
            value = tf.math.multiply(ratio, advantage)
            clipValue = tf.clip_by_value(ratio, 1. - self.EPSILON, 1. + self.EPSILON) * advantage
            loss = -tf.reduce_mean(tf.math.minimum(value, clipValue)) + self.entropyWeight * entropy
            return loss
        def loss(y_true, y_pred):
            # y_true: [oldDisProbs..., oldPiProbs, conActions, advantage] packed by trainActor
            # y_pred: self.actor(state) = [[disAct1, disAct2, disAct3, mu, sigma]]
            oldDisProbs = y_true[:, 0:self.disOutputSize]
            oldPiProbs = y_true[:, self.disOutputSize:self.disOutputSize + self.conActSize]  # old probabilities of the taken continuous actions
            conActions = y_true[:, self.disOutputSize + self.conActSize:self.disOutputSize + (self.conActSize * 2)]
            advantage = y_true[:, -1]
            nowDisProbs = y_pred[:, 0:self.disOutputSize]  # [disAct1, disAct2, disAct3]
            nowConMusigs = y_pred[:, self.disOutputSize:]  # [mu, sigma] per continuous action
            totalALoss = tf.constant([0.])
            totalActionNum = 0
            # accumulate the clipped loss of every discrete action branch
            lastDisActShape = 0
            for shape in self.disActShape:
                thisNowDisProbs = nowDisProbs[:, lastDisActShape:lastDisActShape + shape]
                thisOldDisProbs = oldDisProbs[:, lastDisActShape:lastDisActShape + shape]
                discreteALoss = getDiscreteALoss(thisNowDisProbs, thisOldDisProbs, advantage)
                lastDisActShape += shape
                totalALoss += discreteALoss
                totalActionNum += 1
            # accumulate the clipped loss of every continuous action
            lastConAct = 0
            for act in range(self.conActSize):
                thisNowConMusig = nowConMusigs[:, lastConAct:lastConAct + 2]
                thisOldPiProb = oldPiProbs[:, act]
                thisConAction = conActions[:, act]
                continuousALoss = getContinuousALoss(thisNowConMusig, thisConAction, thisOldPiProb, advantage)
                totalALoss += continuousALoss
                totalActionNum += 1
                lastConAct += 2
            loss = tf.divide(totalALoss, totalActionNum)
            return loss
return loss
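    # Reference: the actor loss above computes the standard PPO clipped surrogate objective.
    # With the probability ratio
    #     r_t = pi_theta(a_t | s_t) / pi_theta_old(a_t | s_t)
    # each action branch contributes
    #     L = -E[ min(r_t * A_t, clip(r_t, 1 - EPSILON, 1 + EPSILON) * A_t) ] + entropyWeight * (entropy term)
    # and the per-branch losses are averaged over all discrete branches and continuous actions.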
# get Action&V
    def chooseAction(self, state):
        """Agent chooses an action for the given state.
        Args:
            state (np.array): environment state
        Returns:
            disAct1 (int): discrete action 1 (WS)
            disAct2 (int): discrete action 2 (AD)
            disAct3 (int): discrete action 3 (shoot)
            conAction (float): continuous action, sampled from Normal(mu, sigma) and clipped to conActRange
            predictResult (np.array): raw actor NN prediction output
        """
# let actor choose action,use the normal distribution
# state = np.expand_dims(state,0)
# check state dimension is [1,statesize]
if state.ndim!=2:
state = state.reshape([1,self.stateSize])
predictResult = self.actor(state) # get predict result [[disAct1, disAct2, disAct3, musig]]
predictResult = predictResult.numpy()
disAct1Prob = predictResult[0][0:3]
disAct2Prob = predictResult[0][3:6]
disAct3Prob = predictResult[0][6:8]
mu = predictResult[0][8]
sigma = predictResult[0][9]
if math.isnan(mu) or math.isnan(sigma):
# check mu or sigma is nan
print("mu or sigma is nan")
disAct1 = np.argmax(disAct1Prob) # WS 0 or 1 or 2
disAct2 = np.argmax(disAct2Prob) # AD 0 or 1 or 2
disAct3 = np.argmax(disAct3Prob) # mouse shoot 0 or 1
        normDist = np.random.normal(loc=mu, scale=sigma)  # sample from the normal distribution
        conAction = np.clip(normDist, -self.conActRange,
                            self.conActRange)  # clip the sampled continuous action into the valid range
return disAct1, disAct2, disAct3, conAction, predictResult
    def getCriticV(self, state):
        """Get the critic's predicted V value.
        Args:
            state (np.array): Env state
        Returns:
            np.array: critic prediction
        """
# if state.ndim < 2:
# state = np.expand_dims(state,0)
if state.ndim!=2:
state = state.reshape([1,self.stateSize])
return self.critic.predict(state)
    def discountReward(self, nextState, rewards):
        """Discount future rewards.
        Args:
            nextState (np.array): next Env state
            rewards (np.array): reward list of this episode
        Returns:
            np.array: discounted rewards, shape (len(rewards), 1)
        """
        # bootstrap from the critic's value of nextState, then discount backwards through the episode
        nextV = self.getCriticV(nextState)
discountedRewards = []
for r in rewards[::-1]:
nextV = r + self.GAMMA*nextV
discountedRewards.append(nextV)
        discountedRewards.reverse()  # restore chronological order
discountedRewards = np.squeeze(discountedRewards)
discountedRewards = np.expand_dims(discountedRewards, axis=1)
#discountedRewards = np.array(discountedRewards)[:, np.newaxis]
return discountedRewards
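    # The loop above implements the bootstrapped return recursion
    #     R_T = r_T + GAMMA * V(nextState),    R_t = r_t + GAMMA * R_{t+1}
    # so every reward is discounted back from the critic's estimate of the next state's value.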
    def conProb(self, mu, sig, x):
        """Probability density of x under Normal(mu, sigma).
        Args:
            mu (np.array): mu
            sig (np.array): sigma
            x (np.array): x
        Returns:
            np.array: probability densities
        """
        # probability density of taking value x under the normal distribution (mu, sig)
        # return shape: (length, 1)
mu = np.reshape(mu, (np.size(mu),))
sig = np.reshape(sig, (np.size(sig),))
x = np.reshape(x, (np.size(x),))
dist = tfp.distributions.Normal(mu, sig)
prob = dist.prob(x)
prob = np.reshape(prob, (np.size(x), 1))
#dist = 1./(tf.sqrt(2.*np.pi)*sig)
#prob = dist*tf.exp(-tf.square(x-mu)/(2.*tf.square(sig)))
return prob
    def trainCritcActor(self, states, actions, rewards, nextState, criticEpochs, actorEpochs):
        # Train ActorNN and CriticNN
        # states: buffer states
        # actions: buffer actions
        # rewards: buffer rewards (not yet discounted)
        # nextState: the single next state
        # criticEpochs: critic NN epochs
        # actorEpochs: actor NN epochs
discountedR = self.discountReward(nextState, rewards)
criticMeanLoss = self.trainCritic(states, discountedR, criticEpochs)
actorMeanLoss = self.trainActor(
states, actions, discountedR, actorEpochs)
print("A_Loss:", actorMeanLoss, "C_Loss:", criticMeanLoss)
return actorMeanLoss, criticMeanLoss
    def trainCritic(self, states, discountedR, epochs):
        # Train Critic
        # states: buffer states
        # discountedR: discounted rewards
        # epochs: training epochs
        # Note: if model.fit rejects discountedR with a
        # 'Failed to find data adapter that can handle' error, convert it with .tolist() first
        # discountedR = discountedR.tolist()
his = self.critic.fit(x=states, y=discountedR,
epochs=epochs, verbose=0)
return np.mean(his.history['loss'])
    def trainActor(self, states, actions, discountedR, epochs):
        """Actor NN training function.
        Args:
            states (np.array): Env states
            actions (np.array): action history
            discountedR (np.array): discounted rewards
            epochs (int): number of training epochs
        Returns:
            float: average actor loss of this training round
        """
states = np.asarray(states)
actions = np.asarray(actions, dtype=np.float32)
# predict with old Actor NN
oldActorResult = self.actor.predict(states)
# assembly Actions history
disActions = actions[:,0:self.disActSize]
conActions = actions[:,self.disActSize:]
# assembly predictResult as old Actor's Result
oldDisProbs = oldActorResult[:,0:self.disOutputSize] # [disAct1, disAct2, disAct3]
oldConMusigs = oldActorResult[:,self.disOutputSize:] # [musig1,musig2]
oldPiProbs = self.conProb(oldConMusigs[:, 0], oldConMusigs[:, 1], conActions)
criticV = self.critic.predict(states)
advantage = copy.deepcopy(discountedR - criticV)
# pack [oldDisProbs,oldPiProbs,conActions,advantage] as y_true
y_true = np.hstack((oldDisProbs,oldPiProbs,conActions,advantage))
# train start
        if np.any(tf.math.is_nan(y_true)):
            print("y_true got nan")
            print("oldConMusigs", oldConMusigs)
            print("oldPiProbs", oldPiProbs)
            print("conActions", conActions)
his = self.actor.fit(x=states, y=y_true, epochs=epochs, verbose=0)
if np.any(tf.math.is_nan(his.history['loss'])):
print("his.history['loss'] is nan!")
print(his.history['loss'])
return np.mean(his.history['loss'])
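    # y_true column layout packed by trainActor above and consumed by the actor loss:
    #     [ oldDisProbs (disOutputSize cols) | oldPiProbs (conActSize cols) | conActions (conActSize cols) | advantage (1 col) ]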
    def saveWeights(self, score):
        """Save the current NN weights with "model.save_weights".
        Saved in "tf" format as "ckpt" files.
        Args:
            score (int): current score
        """
        # use a single timestamp so actor, critic and the score file share the same folder
        now = datetime.datetime.now().strftime("%H%M%S")
        actor_save_dir = self.saveDir + now + "/actor/" + "actor.ckpt"
        critic_save_dir = self.saveDir + now + "/critic/" + "critic.ckpt"
        score_dir = self.saveDir + now + "/" + str(round(score))
        self.actor.save_weights(actor_save_dir, save_format="tf")
        self.critic.save_weights(critic_save_dir, save_format="tf")
        # create an empty file named after the score to record it
        scorefile = open(score_dir, 'w')
        scorefile.close()
        print("Model's Weights Saved")
    def loadWeightToModels(self, loadDir):
        """Load NN weights with "model.load_weights()".
        Loads "tf" format "ckpt" files.
        Args:
            loadDir (string): model dir
        """
actorDir = loadDir + "/actor/" + "actor.ckpt"
criticDir = loadDir + "/critic/" + "critic.ckpt"
self.actor.load_weights(actorDir)
self.critic.load_weights(criticDir)
print("++++++++++++++++++++++++++++++++++++")
print("++++++++++++Model Loaded++++++++++++")
print(loadDir)
print("++++++++++++++++++++++++++++++++++++")
    def saveModel(self, score):
        """Save the current NN models with "model.save()".
        Args:
            score (int): current score
        """
        score = "_" + str(round(score))
        # use a single timestamp so actor and critic are saved in the same folder
        now = datetime.datetime.now().strftime("%H%M%S")
        actor_save_dir = self.saveDir + now + score + "/actor.h5"
        critic_save_dir = self.saveDir + now + score + "/critic.h5"
self.actor.save(actor_save_dir)
self.critic.save(critic_save_dir)
print("Model Saved")
    def loadModel(self, loadDir):
        """Load NN models with "models.load_model()".
        Args:
            loadDir (string): model dir
        Returns:
            tf.keras.Model: the loaded actor and critic models
        """
actorDir = loadDir+"/actor.h5"
criticDir = loadDir+"/critic.h5"
actor_net_loaded = tf.keras.models.load_model(actorDir)
critic_net_loaded = tf.keras.models.load_model(criticDir)
print("++++++++++++++++++++++++++++++++++++")
print("++++++++++++Model Loaded++++++++++++")
print(loadDir)
print("++++++++++++++++++++++++++++++++++++")
return actor_net_loaded, critic_net_loaded
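# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). The state size, hyper-parameters
# and the random "environment" below are assumptions made for demonstration;
# a real caller would plug in the environment that produces states and rewards.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    agent = PPO(
        stateSize=30,                # assumed observation length
        disActShape=[3, 3, 2],       # WS / AD / shoot branches, matching buildActorNet
        conActSize=1,                # one continuous action
        conActRange=10,              # assumed continuous action range
        criticLR=1e-3,
        actorLR=1e-4,
        gamma=0.99,
        epsilon=0.2,
        entropyWeight=0.01,
        saveDir="PPO-Model/",
        loadModelDir=None,
    )

    # collect a tiny fake rollout with random states and rewards
    states, actions, rewards = [], [], []
    state = np.random.rand(30)
    for _ in range(8):
        disAct1, disAct2, disAct3, conAct, _ = agent.chooseAction(state)
        nextState = np.random.rand(30)
        reward = np.random.rand()
        states.append(state)
        actions.append([disAct1, disAct2, disAct3, conAct])
        rewards.append(reward)
        state = nextState

    # one PPO update on the collected rollout
    agent.trainCritcActor(np.array(states), np.array(actions), np.array(rewards),
                          state, criticEpochs=5, actorEpochs=5)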