Compare commits
19 Commits
OffP-FullM
...
OffP-FullM
Author | SHA1 | Date | |
---|---|---|---|
573b09a920 | |||
9d9524429c | |||
5aa7e0936a | |||
3bc5c30fd3 | |||
2741d6d51a | |||
9432eaa76e | |||
52ccce88bc | |||
15c1edb6c9 | |||
f9ee51c256 | |||
be1322381e | |||
efb5c61f0d | |||
ef0ee495f2 | |||
a21fd724af | |||
177974888a | |||
bee609d160 | |||
cbcecfa9e9 | |||
0e0d98d8b1 | |||
3116831ae6 | |||
bf77060456 |
3
.gitignore
vendored
3
.gitignore
vendored
@ -76,6 +76,8 @@ crashlytics-build.properties
|
|||||||
/Aimbot-PPO-Python/.vscode/
|
/Aimbot-PPO-Python/.vscode/
|
||||||
/Aimbot-PPO-Python/.mypy_cache/
|
/Aimbot-PPO-Python/.mypy_cache/
|
||||||
/Aimbot-PPO-Python/__pycache__/
|
/Aimbot-PPO-Python/__pycache__/
|
||||||
|
/Aimbot-PPO-Python/wandb/
|
||||||
|
/Aimbot-PPO-Python/runs/
|
||||||
/Aimbot-PPO-Python/Tensorflow/__pycache__/
|
/Aimbot-PPO-Python/Tensorflow/__pycache__/
|
||||||
/Aimbot-PPO-Python/Pytorch/__pycache__/
|
/Aimbot-PPO-Python/Pytorch/__pycache__/
|
||||||
/Aimbot-PPO-Python/Pytorch/runs/
|
/Aimbot-PPO-Python/Pytorch/runs/
|
||||||
@ -84,4 +86,3 @@ crashlytics-build.properties
|
|||||||
/Aimbot-PPO-Python/Build/
|
/Aimbot-PPO-Python/Build/
|
||||||
/Aimbot-PPO-Python/PPO-Model/
|
/Aimbot-PPO-Python/PPO-Model/
|
||||||
/Aimbot-PPO-Python/GAIL-Expert-Data/
|
/Aimbot-PPO-Python/GAIL-Expert-Data/
|
||||||
/Aimbot-PPO-Python/runs/
|
|
5
.vscode/settings.json
vendored
Normal file
5
.vscode/settings.json
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
{
|
||||||
|
"python.linting.enabled": false,
|
||||||
|
"python.analysis.typeCheckingMode": "off",
|
||||||
|
"commentTranslate.source": "intellsmi.deepl-translate-deepl"
|
||||||
|
}
|
3
Aimbot-PPO-Python/Pytorch/.idea/.gitignore
generated
vendored
Normal file
3
Aimbot-PPO-Python/Pytorch/.idea/.gitignore
generated
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
# Default ignored files
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
8
Aimbot-PPO-Python/Pytorch/.idea/Pytorch.iml
generated
Normal file
8
Aimbot-PPO-Python/Pytorch/.idea/Pytorch.iml
generated
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="PYTHON_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager">
|
||||||
|
<content url="file://$MODULE_DIR$" />
|
||||||
|
<orderEntry type="jdk" jdkName="mlagents39" jdkType="Python SDK" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
</module>
|
10
Aimbot-PPO-Python/Pytorch/.idea/dictionaries/UCUNI.xml
generated
Normal file
10
Aimbot-PPO-Python/Pytorch/.idea/dictionaries/UCUNI.xml
generated
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<component name="ProjectDictionaryState">
|
||||||
|
<dictionary name="UCUNI">
|
||||||
|
<words>
|
||||||
|
<w>aimbot</w>
|
||||||
|
<w>logprobs</w>
|
||||||
|
<w>logstd</w>
|
||||||
|
<w>unclipped</w>
|
||||||
|
</words>
|
||||||
|
</dictionary>
|
||||||
|
</component>
|
6
Aimbot-PPO-Python/Pytorch/.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
6
Aimbot-PPO-Python/Pytorch/.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
<component name="InspectionProjectProfileManager">
|
||||||
|
<settings>
|
||||||
|
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||||
|
<version value="1.0" />
|
||||||
|
</settings>
|
||||||
|
</component>
|
4
Aimbot-PPO-Python/Pytorch/.idea/misc.xml
generated
Normal file
4
Aimbot-PPO-Python/Pytorch/.idea/misc.xml
generated
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectRootManager" version="2" project-jdk-name="mlagents39" project-jdk-type="Python SDK" />
|
||||||
|
</project>
|
8
Aimbot-PPO-Python/Pytorch/.idea/modules.xml
generated
Normal file
8
Aimbot-PPO-Python/Pytorch/.idea/modules.xml
generated
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/Pytorch.iml" filepath="$PROJECT_DIR$/.idea/Pytorch.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
6
Aimbot-PPO-Python/Pytorch/.idea/vcs.xml
generated
Normal file
6
Aimbot-PPO-Python/Pytorch/.idea/vcs.xml
generated
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
@ -1,26 +1,34 @@
|
|||||||
import gym
|
import gym
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import uuid
|
||||||
|
import airecorder
|
||||||
from numpy import ndarray
|
from numpy import ndarray
|
||||||
from mlagents_envs.base_env import ActionTuple
|
from mlagents_envs.base_env import ActionTuple
|
||||||
from mlagents_envs.environment import UnityEnvironment
|
from mlagents_envs.environment import UnityEnvironment
|
||||||
|
from typing import Tuple, List
|
||||||
|
from mlagents_envs.side_channel.side_channel import (
|
||||||
|
SideChannel,
|
||||||
|
IncomingMessage,
|
||||||
|
OutgoingMessage,
|
||||||
|
)
|
||||||
|
from arguments import set_save_model
|
||||||
|
|
||||||
|
|
||||||
class Aimbot(gym.Env):
|
class Aimbot(gym.Env):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
envPath: str,
|
env_path: str,
|
||||||
workerID: int = 1,
|
worker_id: int = 1,
|
||||||
basePort: int = 100,
|
base_port: int = 100,
|
||||||
side_channels: list = []
|
side_channels: list = []
|
||||||
):
|
):
|
||||||
super(Aimbot, self).__init__()
|
super(Aimbot, self).__init__()
|
||||||
self.env = UnityEnvironment(
|
self.env = UnityEnvironment(
|
||||||
file_name=envPath,
|
file_name=env_path,
|
||||||
seed=1,
|
seed=1,
|
||||||
side_channels=side_channels,
|
side_channels=side_channels,
|
||||||
worker_id=workerID,
|
worker_id=worker_id,
|
||||||
base_port=basePort,
|
base_port=base_port,
|
||||||
)
|
)
|
||||||
self.env.reset()
|
self.env.reset()
|
||||||
# all behavior_specs
|
# all behavior_specs
|
||||||
@ -34,7 +42,7 @@ class Aimbot(gym.Env):
|
|||||||
# environment action specs
|
# environment action specs
|
||||||
self.unity_action_spec = self.unity_specs.action_spec
|
self.unity_action_spec = self.unity_specs.action_spec
|
||||||
# environment sample observation
|
# environment sample observation
|
||||||
decisionSteps, _ = self.env.get_steps(self.unity_beha_name)
|
decision_steps, _ = self.env.get_steps(self.unity_beha_name)
|
||||||
|
|
||||||
# OBSERVATION SPECS
|
# OBSERVATION SPECS
|
||||||
# environment state shape. like tuple:(93,)
|
# environment state shape. like tuple:(93,)
|
||||||
@ -57,31 +65,34 @@ class Aimbot(gym.Env):
|
|||||||
|
|
||||||
# AGENT SPECS
|
# AGENT SPECS
|
||||||
# all agents ID
|
# all agents ID
|
||||||
self.unity_agent_IDS = decisionSteps.agent_id
|
self.unity_agent_IDS = decision_steps.agent_id
|
||||||
# agents number
|
# agents number
|
||||||
self.unity_agent_num = len(self.unity_agent_IDS)
|
self.unity_agent_num = len(self.unity_agent_IDS)
|
||||||
|
|
||||||
def reset(self):
|
# all zero action
|
||||||
"""reset enviroment and get observations
|
self.all_zero_action = np.zeros((self.unity_agent_num, self.unity_action_size))
|
||||||
|
|
||||||
|
def reset(self) -> Tuple[np.ndarray, List, List]:
|
||||||
|
"""reset environment and get observations
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
ndarray: nextState, reward, done, loadDir, saveNow
|
ndarray: next_state, reward, done, loadDir, saveNow
|
||||||
"""
|
"""
|
||||||
# reset env
|
# reset env
|
||||||
self.env.reset()
|
self.env.reset()
|
||||||
nextState, reward, done = self.getSteps()
|
next_state, reward, done = self.get_steps()
|
||||||
return nextState, reward, done
|
return next_state, reward, done
|
||||||
|
|
||||||
# TODO:
|
# TODO:
|
||||||
# delete all stack state DONE
|
# delete all stack state DONE
|
||||||
# getstep State disassembly function DONE
|
# get-step State disassembly function DONE
|
||||||
# delete agent selection function DONE
|
# delete agent selection function DONE
|
||||||
# self.step action wrapper function DONE
|
# self.step action wrapper function DONE
|
||||||
def step(
|
def step(
|
||||||
self,
|
self,
|
||||||
actions: ndarray,
|
actions: ndarray,
|
||||||
):
|
) -> Tuple[np.ndarray, List, List]:
|
||||||
"""change ations list to ActionTuple then send it to enviroment
|
"""change actions list to ActionTuple then send it to environment
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
actions (ndarray): PPO chooseAction output action list.(agentNum,actionNum)
|
actions (ndarray): PPO chooseAction output action list.(agentNum,actionNum)
|
||||||
@ -89,36 +100,36 @@ class Aimbot(gym.Env):
|
|||||||
Returns:
|
Returns:
|
||||||
ndarray: nextState, reward, done
|
ndarray: nextState, reward, done
|
||||||
"""
|
"""
|
||||||
# take action to enviroment
|
# take action to environment
|
||||||
# return mextState,reward,done
|
# return mextState,reward,done
|
||||||
# discrete action
|
# discrete action
|
||||||
if self.unity_dis_act_exist:
|
if self.unity_dis_act_exist:
|
||||||
# create discrete action from actions list
|
# create discrete action from actions list
|
||||||
discreteActions = actions[:, 0 : self.unity_discrete_type]
|
discrete_actions = actions[:, 0: self.unity_discrete_type]
|
||||||
else:
|
else:
|
||||||
# create empty discrete action
|
# create empty discrete action
|
||||||
discreteActions = np.asarray([[0]])
|
discrete_actions = np.asarray([[0]])
|
||||||
# continuous action
|
# continuous action
|
||||||
if self.unity_con_act_exist:
|
if self.unity_con_act_exist:
|
||||||
# create continuous actions from actions list
|
# create continuous actions from actions list
|
||||||
continuousActions = actions[:, self.unity_discrete_type :]
|
continuous_actions = actions[:, self.unity_discrete_type:]
|
||||||
else:
|
else:
|
||||||
# create empty continuous action
|
# create empty continuous action
|
||||||
continuousActions = np.asanyarray([[0.0]])
|
continuous_actions = np.asanyarray([[0.0]])
|
||||||
|
|
||||||
# Dummy continuous action
|
# Dummy continuous action
|
||||||
# continuousActions = np.asanyarray([[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]])
|
# continuousActions = np.asanyarray([[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]])
|
||||||
# create actionTuple
|
# create actionTuple
|
||||||
thisActionTuple = ActionTuple(continuous=continuousActions, discrete=discreteActions)
|
this_action_tuple = ActionTuple(continuous=continuous_actions, discrete=discrete_actions)
|
||||||
# take action to env
|
# take action to env
|
||||||
self.env.set_actions(behavior_name=self.unity_beha_name, action=thisActionTuple)
|
self.env.set_actions(behavior_name=self.unity_beha_name, action=this_action_tuple)
|
||||||
self.env.step()
|
self.env.step()
|
||||||
# get nextState & reward & done after this action
|
# get nextState & reward & done after this action
|
||||||
nextStates, rewards, dones = self.getSteps()
|
next_states, rewards, dones = self.get_steps()
|
||||||
return nextStates, rewards, dones
|
return next_states, rewards, dones
|
||||||
|
|
||||||
def getSteps(self):
|
def get_steps(self) -> Tuple[np.ndarray, List, List]:
|
||||||
"""get enviroment now observations.
|
"""get environment now observations.
|
||||||
Include State, Reward, Done
|
Include State, Reward, Done
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -127,28 +138,99 @@ class Aimbot(gym.Env):
|
|||||||
ndarray: nextState, reward, done
|
ndarray: nextState, reward, done
|
||||||
"""
|
"""
|
||||||
# get nextState & reward & done
|
# get nextState & reward & done
|
||||||
decisionSteps, terminalSteps = self.env.get_steps(self.unity_beha_name)
|
decision_steps, terminal_steps = self.env.get_steps(self.unity_beha_name)
|
||||||
nextStates = []
|
next_states = []
|
||||||
dones = []
|
dones = []
|
||||||
rewards = []
|
rewards = []
|
||||||
for thisAgentID in self.unity_agent_IDS:
|
for this_agent_ID in self.unity_agent_IDS:
|
||||||
# while Episode over agentID will both in decisionSteps and terminalSteps.
|
# while Episode over agentID will both in decisionSteps and terminalSteps.
|
||||||
# avoid redundant state and reward,
|
# avoid redundant state and reward,
|
||||||
# use agentExist toggle to check if agent is already exist.
|
# use agentExist toggle to check if agent is already exist.
|
||||||
agentExist = False
|
agent_exist = False
|
||||||
# game done
|
# game done
|
||||||
if thisAgentID in terminalSteps:
|
if this_agent_ID in terminal_steps:
|
||||||
nextStates.append(terminalSteps[thisAgentID].obs[0])
|
next_states.append(terminal_steps[this_agent_ID].obs[0])
|
||||||
dones.append(True)
|
dones.append(True)
|
||||||
rewards.append(terminalSteps[thisAgentID].reward)
|
rewards.append(terminal_steps[this_agent_ID].reward)
|
||||||
agentExist = True
|
agent_exist = True
|
||||||
# game not over yet and agent not in terminalSteps
|
# game not over yet and agent not in terminalSteps
|
||||||
if (thisAgentID in decisionSteps) and (not agentExist):
|
if (this_agent_ID in decision_steps) and (not agent_exist):
|
||||||
nextStates.append(decisionSteps[thisAgentID].obs[0])
|
next_states.append(decision_steps[this_agent_ID].obs[0])
|
||||||
dones.append(False)
|
dones.append(False)
|
||||||
rewards.append(decisionSteps[thisAgentID].reward)
|
rewards.append(decision_steps[this_agent_ID].reward)
|
||||||
|
|
||||||
return np.asarray(nextStates), rewards, dones
|
return np.asarray(next_states), rewards, dones
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
self.env.close()
|
self.env.close()
|
||||||
|
|
||||||
|
|
||||||
|
class AimbotSideChannel(SideChannel):
    """Side channel that exchanges messages with the Unity environment.

    Incoming messages are ``|``-separated strings handled by
    ``on_message_received``; the ``send_*`` methods queue a single typed
    value for delivery to the C# side.
    """

    def __init__(self, channel_id: uuid.UUID) -> None:
        super().__init__(channel_id)

    def on_message_received(self, msg: IncomingMessage) -> None:
        """Handle one message sent by Unity.

        This method has to be implemented to satisfy the SideChannel
        interface. Messages are expected to look like
        "Warning|Message1|Message2|Message3" or
        "Error|Message1|Message2|Message3".
        """
        raw_text = msg.read_string()
        fields = raw_text.split("|")
        print(fields)

        level = fields[0]
        if level == "Error":
            print(raw_text)
            return
        if level != "Warning":
            # any other message type is ignored
            return

        topic = fields[1]
        if topic == "Result":
            # fields[2] is the key into airecorder's round counters
            round_key = fields[2]
            airecorder.total_rounds[round_key] += 1
            if fields[3] == "Win":
                airecorder.win_rounds[round_key] += 1
        if topic == "Command":
            set_save_model(True)
            print("Command: " + raw_text)

    # send functions
    def send_string(self, data: str) -> None:
        """Queue a string to be sent to the C# side."""
        outgoing = OutgoingMessage()
        outgoing.write_string(data)
        super().queue_message_to_send(outgoing)

    def send_bool(self, data: bool) -> None:
        """Queue a boolean to be sent to the C# side."""
        outgoing = OutgoingMessage()
        outgoing.write_bool(data)
        super().queue_message_to_send(outgoing)

    def send_int(self, data: int) -> None:
        """Queue a 32-bit integer to be sent to the C# side."""
        outgoing = OutgoingMessage()
        outgoing.write_int32(data)
        super().queue_message_to_send(outgoing)

    def send_float(self, data: float) -> None:
        """Queue a 32-bit float to be sent to the C# side."""
        outgoing = OutgoingMessage()
        outgoing.write_float32(data)
        super().queue_message_to_send(outgoing)

    def send_float_list(self, data: List[float]) -> None:
        """Queue a list of 32-bit floats to be sent to the C# side."""
        outgoing = OutgoingMessage()
        outgoing.write_float32_list(data)
        super().queue_message_to_send(outgoing)
|
||||||
|
769
Aimbot-PPO-Python/Pytorch/Archive/graph.py
Normal file
769
Aimbot-PPO-Python/Pytorch/Archive/graph.py
Normal file
@ -0,0 +1,769 @@
|
|||||||
|
import argparse
|
||||||
|
import wandb
|
||||||
|
import time
|
||||||
|
import numpy as np
|
||||||
|
import random
|
||||||
|
import uuid
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.optim as optim
|
||||||
|
import atexit
|
||||||
|
|
||||||
|
from torchviz import make_dot, make_dot_from_trace
|
||||||
|
from AimbotEnv import Aimbot
|
||||||
|
from tqdm import tqdm
|
||||||
|
from enum import Enum
|
||||||
|
from torch.distributions.normal import Normal
|
||||||
|
from torch.distributions.categorical import Categorical
|
||||||
|
from distutils.util import strtobool
|
||||||
|
from torch.utils.tensorboard import SummaryWriter
|
||||||
|
from mlagents_envs.environment import UnityEnvironment
|
||||||
|
from mlagents_envs.side_channel.side_channel import (
|
||||||
|
SideChannel,
|
||||||
|
IncomingMessage,
|
||||||
|
OutgoingMessage,
|
||||||
|
)
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
# NOTE(review): presumably the best reward seen so far — confirm against the training loop.
bestReward = -1

# Environment / connection settings. DEFAULT_SEED, ENV_PATH, WAND_ENTITY,
# WORKER_ID and BASE_PORT are the argparse defaults used in parse_args().
DEFAULT_SEED = 9331
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv"
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
WAND_ENTITY = "koha9"
WORKER_ID = 2
BASE_PORT = 1111

# max round steps per agent is 2500/Decision_period, 25 seconds
# !!!check every parameters before run!!!

# Training hyperparameters; most of these are argparse defaults in parse_args().
TOTAL_STEPS = 3150000
BATCH_SIZE = 1024
MAX_TRAINNING_DATASETS = 6000
DECISION_PERIOD = 1
LEARNING_RATE = 5e-4
GAMMA = 0.99
GAE_LAMBDA = 0.95
EPOCHS = 3
CLIP_COEF = 0.11
# The four-element lists below hold one coefficient per target type.
LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
TARGET_LEARNING_RATE = 1e-6
FREEZE_VIEW_NETWORK = False

# Boolean switches (defaults for the matching parse_args() flags).
ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = True
TRAIN = True

SAVE_MODEL = False
WANDB_TACK = False
# Default for --load-dir; None means no model path is supplied by default.
LOAD_DIR = None
#LOAD_DIR = "../PPO-Model/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670986948-freeonly-20/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670986948_0.7949778.pt"
|
||||||
|
|
||||||
|
# public data
|
||||||
|
class Targets(Enum):
    # Target types; values 0-3 follow the "free go attack defence" order
    # noted next to LOSS_COEF.
    Free = 0
    Go = 1
    Attack = 2
    Defence = 3
    # NOTE(review): presumably a count sentinel rather than a real target — confirm.
    Num = 4
|
||||||
|
# Sizes of the non-ray segments of the observation vector. PPOAgent treats the
# first TOTAL_T_SIZE entries as non-ray input and the remaining entries as ray input.
TARGET_STATE_SIZE = 6
INAREA_STATE_SIZE = 1
TIME_STATE_SIZE = 1
GUN_STATE_SIZE = 1
MY_STATE_SIZE = 4
TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
# NOTE(review): presumably the end-of-round reward magnitudes sent by the environment — confirm.
BASE_WINREWARD = 999
BASE_LOSEREWARD = -999
TARGETNUM= 4
ENV_TIMELIMIT = 30
# Default for --result-broadcast-ratio in parse_args().
RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
# Per-target round counters, incremented by AimbotSideChannel.on_message_received.
TotalRounds = {"Free":0,"Go":0,"Attack":0}
WinRounds = {"Free":0,"Go":0,"Attack":0}

# !!!SPECIAL PARAMETERS!!!
# change it while program is finished
using_targets_num = 3
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Build the command-line parser for the training script and parse argv.

    Defaults come from the module-level constants, so running the script
    without any flags uses the values configured at the top of the file.

    Returns:
        argparse.Namespace: the parsed options.
    """
    # Deliberately a lambda: argparse puts the converter's __name__ into its
    # "invalid ... value" error message, so a named function would change it.
    to_bool = lambda x: bool(strtobool(x))  # noqa: E731
    # Shared settings for boolean switches: a bare flag means True.
    toggle = {"type": to_bool, "nargs": "?", "const": True}

    parser = argparse.ArgumentParser()
    add = parser.add_argument

    # fmt: off
    # pytorch and environment parameters
    add("--seed", type=int, default=DEFAULT_SEED, help="seed of the experiment")
    add("--path", type=str, default=ENV_PATH, help="enviroment path")
    add("--workerID", type=int, default=WORKER_ID, help="unity worker ID")
    add("--baseport", type=int, default=BASE_PORT, help="port to connect to Unity environment")
    add("--lr", type=float, default=LEARNING_RATE, help="the learning rate of optimizer")
    add("--cuda", default=True, help="if toggled, cuda will be enabled by default", **toggle)
    add("--total-timesteps", type=int, default=TOTAL_STEPS, help="total timesteps of the experiments")

    # model parameters
    add("--train", default=TRAIN, help="Train Model or not", **toggle)
    add("--freeze-viewnet", default=FREEZE_VIEW_NETWORK, help="freeze view network or not", **toggle)
    add("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
        help="training dataset size,start training while dataset collect enough data")
    add("--minibatchSize", type=int, default=BATCH_SIZE, help="nimi batch size")
    add("--epochs", type=int, default=EPOCHS, help="the K epochs to update the policy")
    add("--annealLR", default=ANNEAL_LEARNING_RATE,
        help="Toggle learning rate annealing for policy and value networks", **toggle)
    add("--wandb-track", default=WANDB_TACK, help="track on the wandb", **toggle)
    add("--save-model", default=SAVE_MODEL, help="save model or not", **toggle)
    add("--wandb-entity", type=str, default=WAND_ENTITY, help="the entity (team) of wandb's project")
    add("--load-dir", type=str, default=LOAD_DIR, help="load model directory")
    add("--decision-period", type=int, default=DECISION_PERIOD,
        help="the number of steps to run in each environment per policy rollout")
    add("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
        help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")

    # GAE loss
    add("--gae", default=True, help="Use GAE for advantage computation", **toggle)
    add("--norm-adv", default=NORM_ADV, help="Toggles advantages normalization", **toggle)
    add("--gamma", type=float, default=GAMMA, help="the discount factor gamma")
    add("--gaeLambda", type=float, default=GAE_LAMBDA, help="the lambda for the general advantage estimation")
    add("--clip-coef", type=float, default=CLIP_COEF, help="the surrogate clipping coefficient")
    add("--policy-coef", type=float, default=POLICY_COEF, help="coefficient of the policy")
    add("--ent-coef", type=float, default=ENTROPY_COEF, help="coefficient of the entropy")
    add("--critic-coef", type=float, default=CRITIC_COEF, help="coefficient of the value function")
    add("--clip-vloss", default=CLIP_VLOSS,
        help="Toggles whether or not to use a clipped loss for the value function, as per the paper.", **toggle)
    add("--max-grad-norm", type=float, default=0.5, help="the maximum norm for the gradient clipping")
    add("--target-kl", type=float, default=None, help="the target KL divergence threshold")
    # fmt: on

    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Orthogonally initialise ``layer.weight`` and fill ``layer.bias`` with a constant.

    Args:
        layer: module exposing ``weight`` and ``bias`` (e.g. ``nn.Linear``).
        std: gain passed to ``torch.nn.init.orthogonal_``.
        bias_const: value written to every bias element.

    Returns:
        The same ``layer`` object, initialised in place.
    """
    torch.nn.init.orthogonal_(layer.weight, gain=std)
    # Layers built with bias=False expose ``bias`` as None; skip the fill
    # instead of crashing on it.
    if layer.bias is not None:
        torch.nn.init.constant_(layer.bias, val=bias_const)
    return layer
|
||||||
|
|
||||||
|
|
||||||
|
class PPOAgent(nn.Module):
    """Actor-critic network with a separate set of heads per target type.

    One shared view network encodes the ray part of the observation. The
    target network, middle network, actor heads and critic head are chosen
    per sample from the target index stored in column 0 of the state.
    """

    def __init__(self, env: Aimbot, targetNum: int):
        super(PPOAgent, self).__init__()
        obs_size = env.unity_observation_shape[0]

        self.targetNum = targetNum
        self.stateSize = obs_size
        self.agentNum = env.unity_agent_num
        self.targetSize = TARGET_STATE_SIZE
        self.timeSize = TIME_STATE_SIZE
        self.gunSize = GUN_STATE_SIZE
        self.myStateSize = MY_STATE_SIZE
        self.raySize = obs_size - TOTAL_T_SIZE
        self.nonRaySize = TOTAL_T_SIZE
        # observation size without the target, time and gun state entries
        self.head_input_size = obs_size - self.targetSize - self.timeSize - self.gunSize

        self.discrete_size = env.unity_discrete_size
        self.discrete_shape = list(env.unity_discrete_branches)
        self.continuous_size = env.unity_continuous_size

        # Layers are created in a fixed order (view, target, middle, actors,
        # log-std, critic) so initialisation draws and parameter order stay stable.
        self.viewNetwork = nn.Sequential(
            layer_init(nn.Linear(self.raySize, 200)),
            nn.Tanh(),
        )
        self.targetNetworks = nn.ModuleList(
            [
                nn.Sequential(layer_init(nn.Linear(self.nonRaySize, 100)), nn.Tanh())
                for _ in range(targetNum)
            ]
        )
        self.middleNetworks = nn.ModuleList(
            [
                nn.Sequential(layer_init(nn.Linear(300, 200)), nn.Tanh())
                for _ in range(targetNum)
            ]
        )
        self.actor_dis = nn.ModuleList(
            [layer_init(nn.Linear(200, self.discrete_size), std=0.5) for _ in range(targetNum)]
        )
        self.actor_mean = nn.ModuleList(
            [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for _ in range(targetNum)]
        )
        # one state-independent log-std row per target
        self.actor_logstd = nn.ParameterList(
            [nn.Parameter(torch.zeros(1, self.continuous_size)) for _ in range(targetNum)]
        )
        self.critic = nn.ModuleList(
            [layer_init(nn.Linear(200, 1), std=1) for _ in range(targetNum)]
        )

    def _route(self, branches, target, rows):
        """Apply ``branches[target[i]]`` to ``rows[i]`` for every sample and stack the results."""
        return torch.stack([branches[target[i]](rows[i]) for i in range(target.size()[0])])

    def _encode(self, state: torch.Tensor):
        """Return the per-sample target indices and middle-layer features."""
        target = state[:, 0].to(torch.int32)  # int
        ray_input = state[:, -self.raySize:]  # all ray input
        non_ray_input = state[:, :self.nonRaySize]
        view_features = self.viewNetwork(ray_input)
        target_features = self._route(self.targetNetworks, target, non_ray_input)
        merged = torch.cat([view_features, target_features], dim=1)
        hidden = self._route(self.middleNetworks, target, merged)
        return target, hidden

    def get_value(self, state: torch.Tensor):
        """Return the critic output for each state row, using the head for that row's target."""
        target, hidden = self._encode(state)
        return self._route(self.critic, target, hidden)

    def get_actions_value(self, state: torch.Tensor, actions=None):
        """Evaluate the policy and critic on ``state``.

        When ``actions`` is None, actions are chosen here: sampled from the
        distributions if ``args.train`` is set, otherwise the arg-max discrete
        action and the mean continuous action. When ``actions`` is given, it
        is split into its discrete and continuous parts and scored.

        NOTE: ``args`` and ``env`` used below are module-level names, not
        attributes of this object.

        Returns:
            tuple: (actions, summed discrete log-prob, summed discrete entropy,
            summed continuous log-prob, summed continuous entropy, critic value)
        """
        target, hidden = self._encode(state)
        sample_ids = range(target.size()[0])

        # discrete
        # walk over the targets (one per agent) so each sample goes through
        # the output network that matches its target
        dis_logits = self._route(self.actor_dis, target, hidden)
        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
        multi_categoricals = [Categorical(logits=branch_logits) for branch_logits in split_logits]
        # continuous
        actions_mean = self._route(self.actor_mean, target, hidden)
        action_logstd = torch.stack(
            [torch.squeeze(self.actor_logstd[target[i]], 0) for i in sample_ids]
        )
        action_std = torch.exp(action_logstd)
        con_probs = Normal(actions_mean, action_std)
        # critic
        criticV = self._route(self.critic, target, hidden)

        if actions is not None:
            # split the supplied actions into discrete and continuous columns
            disAct = actions[:, 0 : env.unity_discrete_type].T
            conAct = actions[:, env.unity_discrete_type :]
        else:
            if args.train:
                # select actions base on probability distribution model
                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
                conAct = con_probs.sample()
            else:
                # select actions base on best probability distribution
                disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
                conAct = actions_mean
            actions = torch.cat([disAct.T, conAct], dim=1)

        dis_log_prob = torch.stack(
            [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
        )
        dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
        return (
            actions,
            dis_log_prob.sum(0),
            dis_entropy.sum(0),
            con_probs.log_prob(conAct).sum(1),
            con_probs.entropy().sum(1),
            criticV,
        )
|
||||||
|
|
||||||
|
|
||||||
|
def GAE(agent, args, rewards, dones, values, next_obs, next_done):
    """Compute advantages and returns for one collected trajectory.

    With ``args.gae`` set, advantages come from generalized advantage
    estimation (using ``args.gamma`` and ``args.gaeLambda``) and returns are
    ``advantages + values``. Otherwise returns are discounted reward sums
    bootstrapped from the value of ``next_obs`` and advantages are
    ``returns - values``. Everything runs under ``torch.no_grad()``.

    NOTE: result buffers are moved to the module-level ``device``.

    Returns:
        tuple: (advantages, returns)
    """
    with torch.no_grad():
        bootstrap_value = agent.get_value(next_obs).reshape(1, -1)
        num_steps = rewards.size()[0]
        final_step = num_steps - 1

        if args.gae:
            advantages = torch.zeros_like(rewards).to(device)
            running_adv = 0
            # walk backwards so each step can reuse the advantage of its successor
            for t in range(final_step, -1, -1):
                is_final = t == final_step
                not_done = 1.0 - (next_done if is_final else dones[t + 1])
                value_after = bootstrap_value if is_final else values[t + 1]
                td_error = rewards[t] + args.gamma * value_after * not_done - values[t]
                running_adv = td_error + args.gamma * args.gaeLambda * not_done * running_adv
                advantages[t] = running_adv
            returns = advantages + values
        else:
            returns = torch.zeros_like(rewards).to(device)
            for t in range(final_step, -1, -1):
                is_final = t == final_step
                not_done = 1.0 - (next_done if is_final else dones[t + 1])
                return_after = bootstrap_value if is_final else returns[t + 1]
                returns[t] = rewards[t] + args.gamma * not_done * return_after
            advantages = returns - values
    return advantages, returns
|
||||||
|
|
||||||
|
class AimbotSideChannel(SideChannel):
    """Side channel that receives round results from Unity and sends primitive values back."""

    def __init__(self, channel_id: uuid.UUID) -> None:
        super().__init__(channel_id)

    def on_message_received(self, msg: IncomingMessage) -> None:
        """
        Note: We must implement this method of the SideChannel interface to
        receive messages from Unity.

        The message is a ``|``-separated string. ``result|<target>|<outcome>``
        increments the module-level ``TotalRounds`` counter for ``<target>``
        and, when ``<outcome>`` is ``Win``, the ``WinRounds`` counter too.
        Messages whose first field is ``Error`` are printed.
        """
        thisMessage = msg.read_string()
        thisResult = thisMessage.split("|")
        if thisResult[0] == "result":
            if len(thisResult) < 3:
                # malformed result message: report it instead of raising
                # IndexError from inside the message callback
                print(thisMessage)
                return
            target = thisResult[1]
            # .get() tolerates a target name that was not pre-seeded in the
            # counters (assumes the counters are dict-like -- TODO confirm)
            TotalRounds[target] = TotalRounds.get(target, 0) + 1
            if thisResult[2] == "Win":
                WinRounds[target] = WinRounds.get(target, 0) + 1
        elif thisResult[0] == "Error":
            print(thisMessage)

    # send helpers
    def send_string(self, data: str) -> None:
        """Queue a string message for the C# side."""
        msg = OutgoingMessage()
        msg.write_string(data)
        super().queue_message_to_send(msg)

    def send_bool(self, data: bool) -> None:
        """Queue a boolean message."""
        msg = OutgoingMessage()
        msg.write_bool(data)
        super().queue_message_to_send(msg)

    def send_int(self, data: int) -> None:
        """Queue a 32-bit integer message."""
        msg = OutgoingMessage()
        msg.write_int32(data)
        super().queue_message_to_send(msg)

    def send_float(self, data: float) -> None:
        """Queue a 32-bit float message."""
        msg = OutgoingMessage()
        msg.write_float32(data)
        super().queue_message_to_send(msg)

    def send_float_list(self, data: List[float]) -> None:
        """Queue a list of 32-bit floats."""
        msg = OutgoingMessage()
        msg.write_float32_list(data)
        super().queue_message_to_send(msg)
|
||||||
|
|
||||||
|
def broadCastEndReward(rewardBF:list,remainTime:float):
    """Turn one round's reward buffer into a reward tensor, spreading the win bonus.

    The last entry of ``rewardBF`` carries the round result:

    * ``<= -500`` (lose): ``BASE_LOSEREWARD`` is subtracted from the last entry.
    * ``>= 500`` (win): ``BASE_WINREWARD`` is subtracted from the last entry and
      ``remainTime * args.result_broadcast_ratio`` is added to every entry.
    * otherwise a warning is printed and the rewards are used unchanged.

    Args:
        rewardBF: per-step rewards of one finished round (not modified).
        remainTime: remaining round time used to scale the broadcast bonus.

    Returns:
        ``torch.Tensor`` of the adjusted rewards on the module-level ``device``.
    """
    # work on a copy: the original aliased the caller's list and overwrote
    # its last element in place
    thisRewardBF = list(rewardBF)
    if rewardBF[-1] <= -500:
        # lose: strip the base penalty, no broadcast
        thisRewardBF[-1] = rewardBF[-1] - BASE_LOSEREWARD
    elif rewardBF[-1] >= 500:
        # win: strip the base bonus, then broadcast the time bonus to all steps
        thisRewardBF[-1] = rewardBF[-1] - BASE_WINREWARD
        thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * args.result_broadcast_ratio)).tolist()
    else:
        print("!!!!!DIDNT GET RESULT REWARD!!!!!!", rewardBF[-1])
    return torch.Tensor(thisRewardBF).to(device)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
args = parse_args()
|
||||||
|
random.seed(args.seed)
|
||||||
|
np.random.seed(args.seed)
|
||||||
|
torch.manual_seed(args.seed)
|
||||||
|
|
||||||
|
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
|
||||||
|
|
||||||
|
# Initialize environment anget optimizer
|
||||||
|
aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID);
|
||||||
|
env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimBotsideChannel])
|
||||||
|
if args.load_dir is None:
|
||||||
|
agent = PPOAgent(env,TARGETNUM).to(device)
|
||||||
|
else:
|
||||||
|
agent = torch.load(args.load_dir)
|
||||||
|
# freeze
|
||||||
|
if args.freeze_viewnet:
|
||||||
|
# freeze the view network
|
||||||
|
for p in agent.viewNetwork.parameters():
|
||||||
|
p.requires_grad = False
|
||||||
|
print("VIEW NETWORK FREEZED")
|
||||||
|
print("Load Agent", args.load_dir)
|
||||||
|
print(agent.eval())
|
||||||
|
|
||||||
|
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
|
||||||
|
|
||||||
|
# Tensorboard and WandB Recorder
|
||||||
|
game_name = "Aimbot_Target_Hybrid_PMNN_V2"
|
||||||
|
game_type = "OffPolicy_EndBC"
|
||||||
|
run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
|
||||||
|
if args.wandb_track:
|
||||||
|
wandb.init(
|
||||||
|
project=game_name,
|
||||||
|
entity=args.wandb_entity,
|
||||||
|
sync_tensorboard=True,
|
||||||
|
config=vars(args),
|
||||||
|
name=run_name,
|
||||||
|
monitor_gym=True,
|
||||||
|
save_code=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
writer = SummaryWriter(f"runs/{run_name}")
|
||||||
|
writer.add_text(
|
||||||
|
"hyperparameters",
|
||||||
|
"|param|value|\n|-|-|\n%s"
|
||||||
|
% ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
|
||||||
|
)
|
||||||
|
|
||||||
|
@atexit.register
|
||||||
|
def save_model():
|
||||||
|
# save model while exit
|
||||||
|
saveDir = "../PPO-Model/"+ run_name + "_last.pt"
|
||||||
|
torch.save(agent, saveDir)
|
||||||
|
print("save model to " + saveDir)
|
||||||
|
|
||||||
|
# Trajectory Buffer
|
||||||
|
ob_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
|
act_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
|
dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
|
con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
|
rewards_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
|
dones_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
|
values_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
|
|
||||||
|
# start the game
|
||||||
|
total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
|
||||||
|
target_steps = [0 for i in range(TARGETNUM)]
|
||||||
|
start_time = time.time()
|
||||||
|
state, _, done = env.reset()
|
||||||
|
# state = torch.Tensor(next_obs).to(device)
|
||||||
|
# next_done = torch.zeros(env.unity_agent_num).to(device)
|
||||||
|
|
||||||
|
# initialize empty training datasets
|
||||||
|
obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_observation_size)
|
||||||
|
actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_action_size)
|
||||||
|
dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
|
||||||
|
con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
|
||||||
|
rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
|
||||||
|
values = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
|
||||||
|
advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
|
||||||
|
returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
|
||||||
|
|
||||||
|
vis_graph = make_dot(agent.get_actions_value(
|
||||||
|
torch.Tensor(state).to(device)
|
||||||
|
), params=dict(agent.named_parameters()))
|
||||||
|
vis_graph.view() # 会在当前目录下保存一个“Digraph.gv.pdf”文件,并在默认浏览器中打开
|
||||||
|
|
||||||
|
with torch.onnx.set_training(agent, False):
|
||||||
|
trace, _ = torch.jit.get_trace_graph(agent, args=(torch.Tensor(state).to(device),))
|
||||||
|
make_dot_from_trace(trace)
|
||||||
|
raise
|
||||||
|
|
||||||
|
for total_steps in range(total_update_step):
|
||||||
|
# discunt learning rate, while step == total_update_step lr will be 0
|
||||||
|
|
||||||
|
if args.annealLR:
|
||||||
|
finalRatio = TARGET_LEARNING_RATE/args.lr
|
||||||
|
frac = 1.0 - ((total_steps + 1.0) / total_update_step)
|
||||||
|
lrnow = frac * args.lr
|
||||||
|
optimizer.param_groups[0]["lr"] = lrnow
|
||||||
|
else:
|
||||||
|
lrnow = args.lr
|
||||||
|
print("new episode",total_steps,"learning rate = ",lrnow)
|
||||||
|
|
||||||
|
|
||||||
|
# MAIN LOOP: run agent in environment
|
||||||
|
step = 0
|
||||||
|
training = False
|
||||||
|
trainQueue = []
|
||||||
|
# one float slot per agent; written as `0.0 for` because a numeric literal
# directly followed by a keyword (`0.for`) is deprecated since Python 3.10
last_reward = [0.0 for _ in range(env.unity_agent_num)]
|
||||||
|
while True:
|
||||||
|
if step % args.decision_period == 0:
|
||||||
|
step += 1
|
||||||
|
# Choose action by agent
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
# predict actions
|
||||||
|
action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
|
||||||
|
torch.Tensor(state).to(device)
|
||||||
|
)
|
||||||
|
value = value.flatten()
|
||||||
|
|
||||||
|
# variable from GPU to CPU
|
||||||
|
action_cpu = action.cpu().numpy()
|
||||||
|
dis_logprob_cpu = dis_logprob.cpu().numpy()
|
||||||
|
con_logprob_cpu = con_logprob.cpu().numpy()
|
||||||
|
value_cpu = value.cpu().numpy()
|
||||||
|
# Environment step
|
||||||
|
next_state, reward, next_done = env.step(action_cpu)
|
||||||
|
|
||||||
|
# save memories
|
||||||
|
for i in range(env.unity_agent_num):
|
||||||
|
# save memories to buffers
|
||||||
|
ob_bf[i].append(state[i])
|
||||||
|
act_bf[i].append(action_cpu[i])
|
||||||
|
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
|
||||||
|
con_logprobs_bf[i].append(con_logprob_cpu[i])
|
||||||
|
rewards_bf[i].append(reward[i]+last_reward[i])
|
||||||
|
dones_bf[i].append(done[i])
|
||||||
|
values_bf[i].append(value_cpu[i])
|
||||||
|
remainTime = state[i,TARGET_STATE_SIZE]
|
||||||
|
if next_done[i] == True:
|
||||||
|
# finished a round, send finished memories to training datasets
|
||||||
|
# compute advantage and discounted reward
|
||||||
|
#print(i,"over")
|
||||||
|
roundTargetType = int(state[i,0])
|
||||||
|
thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
|
||||||
|
adv, rt = GAE(
|
||||||
|
agent,
|
||||||
|
args,
|
||||||
|
thisRewardsTensor,
|
||||||
|
torch.Tensor(dones_bf[i]).to(device),
|
||||||
|
torch.tensor(values_bf[i]).to(device),
|
||||||
|
torch.tensor(next_state[i]).to(device).unsqueeze(0),
|
||||||
|
torch.Tensor([next_done[i]]).to(device),
|
||||||
|
)
|
||||||
|
# send memories to training datasets
|
||||||
|
obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
|
||||||
|
actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
|
||||||
|
dis_logprobs[roundTargetType] = torch.cat(
|
||||||
|
(dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
|
||||||
|
)
|
||||||
|
con_logprobs[roundTargetType] = torch.cat(
|
||||||
|
(con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
|
||||||
|
)
|
||||||
|
rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
|
||||||
|
values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
|
||||||
|
advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
|
||||||
|
returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
|
||||||
|
|
||||||
|
# clear buffers
|
||||||
|
ob_bf[i] = []
|
||||||
|
act_bf[i] = []
|
||||||
|
dis_logprobs_bf[i] = []
|
||||||
|
con_logprobs_bf[i] = []
|
||||||
|
rewards_bf[i] = []
|
||||||
|
dones_bf[i] = []
|
||||||
|
values_bf[i] = []
|
||||||
|
print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
|
||||||
|
|
||||||
|
for i in range(TARGETNUM):
|
||||||
|
if obs[i].size()[0] >= args.datasetSize:
|
||||||
|
# start train NN
|
||||||
|
trainQueue.append(i)
|
||||||
|
if(len(trainQueue)>0):
|
||||||
|
break
|
||||||
|
state, done = next_state, next_done
|
||||||
|
else:
|
||||||
|
step += 1
|
||||||
|
# skip this step use last predict action
|
||||||
|
next_state, reward, next_done = env.step(action_cpu)
|
||||||
|
# save memories
|
||||||
|
for i in range(env.unity_agent_num):
|
||||||
|
if next_done[i] == True:
|
||||||
|
#print(i,"over???")
|
||||||
|
# save memories to buffers
|
||||||
|
ob_bf[i].append(state[i])
|
||||||
|
act_bf[i].append(action_cpu[i])
|
||||||
|
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
|
||||||
|
con_logprobs_bf[i].append(con_logprob_cpu[i])
|
||||||
|
rewards_bf[i].append(reward[i])
|
||||||
|
dones_bf[i].append(done[i])
|
||||||
|
values_bf[i].append(value_cpu[i])
|
||||||
|
remainTime = state[i,TARGET_STATE_SIZE]
|
||||||
|
# finished a round, send finished memories to training datasets
|
||||||
|
# compute advantage and discounted reward
|
||||||
|
roundTargetType = int(state[i,0])
|
||||||
|
thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
|
||||||
|
adv, rt = GAE(
|
||||||
|
agent,
|
||||||
|
args,
|
||||||
|
thisRewardsTensor,
|
||||||
|
torch.Tensor(dones_bf[i]).to(device),
|
||||||
|
torch.tensor(values_bf[i]).to(device),
|
||||||
|
torch.Tensor(next_state[i]).to(device).unsqueeze(dim = 0),
|
||||||
|
torch.Tensor([next_done[i]]).to(device),
|
||||||
|
)
|
||||||
|
# send memories to training datasets
|
||||||
|
obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
|
||||||
|
actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
|
||||||
|
dis_logprobs[roundTargetType] = torch.cat(
|
||||||
|
(dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
|
||||||
|
)
|
||||||
|
con_logprobs[roundTargetType] = torch.cat(
|
||||||
|
(con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
|
||||||
|
)
|
||||||
|
rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
|
||||||
|
values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
|
||||||
|
advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
|
||||||
|
returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
|
||||||
|
|
||||||
|
# clear buffers
|
||||||
|
ob_bf[i] = []
|
||||||
|
act_bf[i] = []
|
||||||
|
dis_logprobs_bf[i] = []
|
||||||
|
con_logprobs_bf[i] = []
|
||||||
|
rewards_bf[i] = []
|
||||||
|
dones_bf[i] = []
|
||||||
|
values_bf[i] = []
|
||||||
|
print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
|
||||||
|
|
||||||
|
state = next_state
|
||||||
|
last_reward = reward
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
if args.train:
|
||||||
|
meanRewardList = [] # for WANDB
|
||||||
|
# loop all tarining queue
|
||||||
|
for thisT in trainQueue:
|
||||||
|
target_steps[thisT]+=1
|
||||||
|
# flatten the batch
|
||||||
|
b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
|
||||||
|
b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
|
||||||
|
b_con_logprobs = con_logprobs[thisT].reshape(-1)
|
||||||
|
b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
|
||||||
|
b_advantages = advantages[thisT].reshape(-1)
|
||||||
|
b_returns = returns[thisT].reshape(-1)
|
||||||
|
b_values = values[thisT].reshape(-1)
|
||||||
|
b_size = b_obs.size()[0]
|
||||||
|
# Optimizing the policy and value network
|
||||||
|
b_inds = np.arange(b_size)
|
||||||
|
# clipfracs = []
|
||||||
|
for epoch in range(args.epochs):
|
||||||
|
print(epoch,end="")
|
||||||
|
# shuffle all datasets
|
||||||
|
np.random.shuffle(b_inds)
|
||||||
|
for start in range(0, b_size, args.minibatchSize):
|
||||||
|
print(".",end="")
|
||||||
|
end = start + args.minibatchSize
|
||||||
|
mb_inds = b_inds[start:end]
|
||||||
|
if(np.size(mb_inds)<=1):
|
||||||
|
break
|
||||||
|
mb_advantages = b_advantages[mb_inds]
|
||||||
|
|
||||||
|
# normalize advantages
|
||||||
|
if args.norm_adv:
|
||||||
|
mb_advantages = (mb_advantages - mb_advantages.mean()) / (
|
||||||
|
mb_advantages.std() + 1e-8
|
||||||
|
)
|
||||||
|
|
||||||
|
(
|
||||||
|
_,
|
||||||
|
new_dis_logprob,
|
||||||
|
dis_entropy,
|
||||||
|
new_con_logprob,
|
||||||
|
con_entropy,
|
||||||
|
newvalue,
|
||||||
|
) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
|
||||||
|
# discrete ratio
|
||||||
|
dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
|
||||||
|
dis_ratio = dis_logratio.exp()
|
||||||
|
# continuous ratio
|
||||||
|
con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
|
||||||
|
con_ratio = con_logratio.exp()
|
||||||
|
|
||||||
|
"""
|
||||||
|
# early stop
|
||||||
|
with torch.no_grad():
|
||||||
|
# calculate approx_kl http://joschu.net/blog/kl-approx.html
|
||||||
|
old_approx_kl = (-logratio).mean()
|
||||||
|
approx_kl = ((ratio - 1) - logratio).mean()
|
||||||
|
clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
|
||||||
|
"""
|
||||||
|
|
||||||
|
# discrete Policy loss
|
||||||
|
dis_pg_loss_orig = -mb_advantages * dis_ratio
|
||||||
|
dis_pg_loss_clip = -mb_advantages * torch.clamp(
|
||||||
|
dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
|
||||||
|
)
|
||||||
|
dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
|
||||||
|
# continuous Policy loss
|
||||||
|
con_pg_loss_orig = -mb_advantages * con_ratio
|
||||||
|
con_pg_loss_clip = -mb_advantages * torch.clamp(
|
||||||
|
con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
|
||||||
|
)
|
||||||
|
con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
|
||||||
|
|
||||||
|
# Value loss
|
||||||
|
newvalue = newvalue.view(-1)
|
||||||
|
if args.clip_vloss:
|
||||||
|
v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
|
||||||
|
v_clipped = b_values[mb_inds] + torch.clamp(
|
||||||
|
newvalue - b_values[mb_inds],
|
||||||
|
-args.clip_coef,
|
||||||
|
args.clip_coef,
|
||||||
|
)
|
||||||
|
v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
|
||||||
|
v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
|
||||||
|
v_loss = 0.5 * v_loss_max.mean()
|
||||||
|
else:
|
||||||
|
v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
|
||||||
|
|
||||||
|
# total loss
|
||||||
|
entropy_loss = dis_entropy.mean() + con_entropy.mean()
|
||||||
|
loss = (
|
||||||
|
dis_pg_loss * POLICY_COEF[thisT]
|
||||||
|
+ con_pg_loss * POLICY_COEF[thisT]
|
||||||
|
+ entropy_loss * ENTROPY_COEF[thisT]
|
||||||
|
+ v_loss * CRITIC_COEF[thisT]
|
||||||
|
)*LOSS_COEF[thisT]
|
||||||
|
|
||||||
|
if torch.isnan(loss).any():
    print("LOSS Include NAN!!!")
    # isnan() has to be applied to the tensor itself; isnan(x.any()) tests a
    # boolean and is always False, so the offending term was never reported
    if torch.isnan(dis_pg_loss).any():
        print("dis_pg_loss include nan")
    if torch.isnan(con_pg_loss).any():
        print("con_pg_loss include nan")
    if torch.isnan(entropy_loss).any():
        print("entropy_loss include nan")
    if torch.isnan(v_loss).any():
        print("v_loss include nan")
    # a bare `raise` with no active exception surfaces as an uninformative
    # RuntimeError; raise the same type with an explicit message
    raise RuntimeError("loss contains NaN")
|
||||||
|
|
||||||
|
optimizer.zero_grad()
|
||||||
|
loss.backward()
|
||||||
|
# Clips gradient norm of an iterable of parameters.
|
||||||
|
nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
|
||||||
|
optimizer.step()
|
||||||
|
|
||||||
|
"""
|
||||||
|
if args.target_kl is not None:
|
||||||
|
if approx_kl > args.target_kl:
|
||||||
|
break
|
||||||
|
"""
|
||||||
|
# record mean reward before clear history
|
||||||
|
print("done")
|
||||||
|
targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
|
||||||
|
meanRewardList.append(targetRewardMean)
|
||||||
|
targetName = Targets(thisT).name
|
||||||
|
|
||||||
|
# clear this target trainning set buffer
|
||||||
|
obs[thisT] = torch.tensor([]).to(device)
|
||||||
|
actions[thisT] = torch.tensor([]).to(device)
|
||||||
|
dis_logprobs[thisT] = torch.tensor([]).to(device)
|
||||||
|
con_logprobs[thisT] = torch.tensor([]).to(device)
|
||||||
|
rewards[thisT] = torch.tensor([]).to(device)
|
||||||
|
values[thisT] = torch.tensor([]).to(device)
|
||||||
|
advantages[thisT] = torch.tensor([]).to(device)
|
||||||
|
returns[thisT] = torch.tensor([]).to(device)
|
||||||
|
|
||||||
|
# record rewards for plotting purposes
|
||||||
|
writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
|
||||||
|
writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
|
||||||
|
writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
|
||||||
|
writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
|
||||||
|
writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
|
||||||
|
writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
|
||||||
|
writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT])
|
||||||
|
print(f"episode over Target{targetName} mean reward:", targetRewardMean)
|
||||||
|
TotalRewardMean = np.mean(meanRewardList)
|
||||||
|
writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
|
||||||
|
writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps)
|
||||||
|
# New Record!
|
||||||
|
if TotalRewardMean > bestReward and args.save_model:
    # store the value that was compared (mean over all trained targets),
    # not the mean of whichever target happened to be trained last
    bestReward = TotalRewardMean
    saveDir = "../PPO-Model/" + run_name + "_" + str(TotalRewardMean) + ".pt"
    torch.save(agent, saveDir)
|
||||||
|
|
||||||
|
saveDir = "../PPO-Model/"+ run_name + "_last.pt"
|
||||||
|
torch.save(agent, saveDir)
|
||||||
|
env.close()
|
||||||
|
writer.close()
|
287
Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb
Normal file
287
Aimbot-PPO-Python/Pytorch/Archive/test2.ipynb
Normal file
File diff suppressed because one or more lines are too long
1298
Aimbot-PPO-Python/Pytorch/Archive/testarea.ipynb
Normal file
1298
Aimbot-PPO-Python/Pytorch/Archive/testarea.ipynb
Normal file
File diff suppressed because one or more lines are too long
@ -1,329 +1,26 @@
|
|||||||
import argparse
|
|
||||||
import wandb
|
|
||||||
import time
|
import time
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import random
|
import random
|
||||||
import uuid
|
import uuid
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import atexit
|
||||||
|
import os
|
||||||
|
|
||||||
|
from aimbotEnv import Aimbot
|
||||||
|
from aimbotEnv import AimbotSideChannel
|
||||||
|
from ppoagent import PPOAgent
|
||||||
|
from airecorder import WandbRecorder
|
||||||
|
from aimemory import PPOMem
|
||||||
|
from aimemory import Targets
|
||||||
|
from arguments import parse_args
|
||||||
|
from arguments import set_save_model, is_save_model
|
||||||
import torch.optim as optim
|
import torch.optim as optim
|
||||||
|
|
||||||
from AimbotEnv import Aimbot
|
# side channel uuid
|
||||||
from tqdm import tqdm
|
|
||||||
from enum import Enum
|
|
||||||
from torch.distributions.normal import Normal
|
|
||||||
from torch.distributions.categorical import Categorical
|
|
||||||
from distutils.util import strtobool
|
|
||||||
from torch.utils.tensorboard import SummaryWriter
|
|
||||||
from mlagents_envs.environment import UnityEnvironment
|
|
||||||
from mlagents_envs.side_channel.side_channel import (
|
|
||||||
SideChannel,
|
|
||||||
IncomingMessage,
|
|
||||||
OutgoingMessage,
|
|
||||||
)
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
bestReward = -1
|
|
||||||
|
|
||||||
DEFAULT_SEED = 9331
|
|
||||||
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv"
|
|
||||||
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
|
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
|
||||||
WAND_ENTITY = "koha9"
|
# tensorboard names
|
||||||
WORKER_ID = 3
|
GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel"
|
||||||
BASE_PORT = 1002
|
GAME_TYPE = "GotoOnly-3.6-Level0123-newModel-Onehot"
|
||||||
|
|
||||||
# max round steps per agent is 2500/Decision_period, 25 seconds
|
|
||||||
# !!!check every parameters before run!!!
|
|
||||||
|
|
||||||
TOTAL_STEPS = 3150000
|
|
||||||
BATCH_SIZE = 1024
|
|
||||||
MAX_TRAINNING_DATASETS = 6000
|
|
||||||
DECISION_PERIOD = 1
|
|
||||||
LEARNING_RATE = 5e-4
|
|
||||||
GAMMA = 0.99
|
|
||||||
GAE_LAMBDA = 0.95
|
|
||||||
EPOCHS = 3
|
|
||||||
CLIP_COEF = 0.11
|
|
||||||
LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
|
|
||||||
POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
|
|
||||||
ENTROPY_COEF = [0.1, 0.1, 0.1, 0.1]
|
|
||||||
CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
|
|
||||||
TARGET_LEARNING_RATE = 1e-6
|
|
||||||
|
|
||||||
ANNEAL_LEARNING_RATE = True
|
|
||||||
CLIP_VLOSS = True
|
|
||||||
NORM_ADV = True
|
|
||||||
TRAIN = True
|
|
||||||
|
|
||||||
WANDB_TACK = False
|
|
||||||
LOAD_DIR = None
|
|
||||||
#LOAD_DIR = "../PPO-Model/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670522099-freeonly-12/Aimbot-target-last.pt"
|
|
||||||
|
|
||||||
# public data
|
|
||||||
class Targets(Enum):
|
|
||||||
Free = 0
|
|
||||||
Go = 1
|
|
||||||
Attack = 2
|
|
||||||
Defence = 3
|
|
||||||
Num = 4
|
|
||||||
TARGET_STATE_SIZE = 6
|
|
||||||
INAREA_STATE_SIZE = 1
|
|
||||||
TIME_STATE_SIZE = 1
|
|
||||||
GUN_STATE_SIZE = 1
|
|
||||||
MY_STATE_SIZE = 4
|
|
||||||
TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
|
|
||||||
BASE_WINREWARD = 999
|
|
||||||
BASE_LOSEREWARD = -999
|
|
||||||
TARGETNUM= 4
|
|
||||||
ENV_TIMELIMIT = 30
|
|
||||||
RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
|
|
||||||
TotalRounds = {"Free":0,"Go":0,"Attack":0}
|
|
||||||
WinRounds = {"Free":0,"Go":0,"Attack":0}
|
|
||||||
|
|
||||||
# !!!SPECIAL PARAMETERS!!!
|
|
||||||
# change it while program is finished
|
|
||||||
using_targets_num = 3
|
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
    """Parse the command-line options of the training script.

    Defaults come from the module-level constants (``DEFAULT_SEED``,
    ``ENV_PATH``, ``LEARNING_RATE`` ...). Boolean options accept an optional
    value parsed with ``strtobool``; given without a value they become ``True``.

    Returns:
        The ``argparse.Namespace`` produced by ``ArgumentParser.parse_args()``.
    """
    cli = argparse.ArgumentParser()
    # kept as a lambda so argparse's "invalid <lambda> value" message is unchanged
    as_bool = lambda x: bool(strtobool(x))

    def option(flag, kind, default, help_text):
        cli.add_argument(flag, type=kind, default=default, help=help_text)

    def toggle(flag, default, help_text):
        cli.add_argument(flag, type=as_bool, default=default, nargs="?", const=True, help=help_text)

    # fmt: off
    # pytorch and environment parameters
    option("--seed", int, DEFAULT_SEED, "seed of the experiment")
    option("--path", str, ENV_PATH, "enviroment path")
    option("--workerID", int, WORKER_ID, "unity worker ID")
    option("--baseport", int, BASE_PORT, "port to connect to Unity environment")
    option("--lr", float, LEARNING_RATE, "the learning rate of optimizer")
    toggle("--cuda", True, "if toggled, cuda will be enabled by default")
    option("--total-timesteps", int, TOTAL_STEPS, "total timesteps of the experiments")

    # model parameters
    toggle("--train", TRAIN, "Train Model or not")
    option("--datasetSize", int, MAX_TRAINNING_DATASETS,
           "training dataset size,start training while dataset collect enough data")
    option("--minibatchSize", int, BATCH_SIZE, "nimi batch size")
    option("--epochs", int, EPOCHS, "the K epochs to update the policy")
    toggle("--annealLR", ANNEAL_LEARNING_RATE,
           "Toggle learning rate annealing for policy and value networks")
    toggle("--wandb-track", WANDB_TACK, "track on the wandb")
    option("--wandb-entity", str, WAND_ENTITY, "the entity (team) of wandb's project")
    option("--load-dir", str, LOAD_DIR, "load model directory")
    option("--decision-period", int, DECISION_PERIOD,
           "the number of steps to run in each environment per policy rollout")
    option("--result-broadcast-ratio", float, RESULT_BROADCAST_RATIO,
           "broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")

    # GAE loss
    toggle("--gae", True, "Use GAE for advantage computation")
    toggle("--norm-adv", NORM_ADV, "Toggles advantages normalization")
    option("--gamma", float, GAMMA, "the discount factor gamma")
    option("--gaeLambda", float, GAE_LAMBDA, "the lambda for the general advantage estimation")
    option("--clip-coef", float, CLIP_COEF, "the surrogate clipping coefficient")
    option("--policy-coef", float, POLICY_COEF, "coefficient of the policy")
    option("--ent-coef", float, ENTROPY_COEF, "coefficient of the entropy")
    option("--critic-coef", float, CRITIC_COEF, "coefficient of the value function")
    toggle("--clip-vloss", CLIP_VLOSS,
           "Toggles whether or not to use a clipped loss for the value function, as per the paper.")
    option("--max-grad-norm", float, 0.5, "the maximum norm for the gradient clipping")
    option("--target-kl", float, None, "the target KL divergence threshold")
    # fmt: on
    return cli.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise ``layer`` in place and return it.

    The weight gets an orthogonal initialisation with gain ``std``; the bias
    is filled with ``bias_const``.
    """
    init = torch.nn.init
    init.orthogonal_(layer.weight, gain=std)
    init.constant_(layer.bias, val=bias_const)
    return layer
|
|
||||||
|
|
||||||
|
|
||||||
class PPOAgent(nn.Module):
    """Actor-critic module with a shared trunk, discrete and continuous action heads, and a value head.

    NOTE(review): ``get_actions_value`` reads the module-level names ``env``
    (for ``unity_discrete_type``) and ``args`` (for ``train``) rather than
    instance state.
    """

    def __init__(self, env: Aimbot,targetNum:int):
        super(PPOAgent, self).__init__()
        obs_dim = env.unity_observation_shape[0]

        # bookkeeping sizes
        self.targetNum = targetNum
        self.stateSize = obs_dim
        self.targetSize = TARGET_STATE_SIZE
        self.timeSize = TIME_STATE_SIZE
        self.gunSize = GUN_STATE_SIZE
        self.myStateSize = MY_STATE_SIZE
        self.totalMiddleSize = TOTAL_T_SIZE
        # observation width except target state input
        self.head_input_size = obs_dim - self.targetSize - self.timeSize - self.gunSize

        # action space description taken from the environment
        self.discrete_size = env.unity_discrete_size
        self.discrete_shape = list(env.unity_discrete_branches)
        self.continuous_size = env.unity_continuous_size

        # shared trunk: obs -> 300 -> 200, tanh activations
        self.network = nn.Sequential(
            layer_init(nn.Linear(obs_dim, 300)),
            nn.Tanh(),
            layer_init(nn.Linear(300, 200)),
            nn.Tanh(),
        )
        # heads (creation order kept: discrete, mean, log-std, critic)
        self.actor_dis = layer_init(nn.Linear(200, self.discrete_size), std=0.5)
        self.actor_mean = layer_init(nn.Linear(200, self.continuous_size), std=0.5)
        self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
        self.critic = layer_init(nn.Linear(200, 1), std=1)

    def get_value(self, state: torch.Tensor):
        """Return the critic output for ``state``."""
        hidden = self.network(state)
        return self.critic(hidden)

    def get_actions_value(self, state: torch.Tensor, actions=None):
        """Choose or evaluate actions for ``state``.

        With ``actions=None`` an action is produced: sampled from the
        distributions when ``args.train`` is truthy, otherwise the arg-max of
        each discrete branch and the continuous mean. With ``actions`` given,
        those actions are evaluated instead.

        Returns:
            ``(actions, discrete log-prob summed over branches, discrete
            entropy summed over branches, continuous log-prob summed over
            dim 1, continuous entropy summed over dim 1, critic value)``.
        """
        hidden = self.network(state)

        # discrete: one Categorical per branch
        branch_logits = torch.split(self.actor_dis(hidden), self.discrete_shape, dim=1)
        multi_categoricals = [Categorical(logits=logits) for logits in branch_logits]

        # continuous: Normal with a state-independent log-std
        mean = self.actor_mean(hidden)
        std = torch.exp(self.actor_logstd.expand_as(mean))
        con_probs = Normal(mean, std)

        if actions is not None:
            # split the given actions back into discrete and continuous parts
            disAct = actions[:, 0 : env.unity_discrete_type].T
            conAct = actions[:, env.unity_discrete_type :]
        else:
            if args.train:
                # select actions based on the probability distributions
                disAct = torch.stack([dist.sample() for dist in multi_categoricals])
                conAct = con_probs.sample()
            else:
                # select the most likely action of every head
                disAct = torch.stack([torch.argmax(logits, dim=1) for logits in branch_logits])
                conAct = mean
            actions = torch.cat([disAct.T, conAct], dim=1)

        dis_log_prob = torch.stack(
            [dist.log_prob(act) for act, dist in zip(disAct, multi_categoricals)]
        )
        dis_entropy = torch.stack([dist.entropy() for dist in multi_categoricals])
        return (
            actions,
            dis_log_prob.sum(0),
            dis_entropy.sum(0),
            con_probs.log_prob(conAct).sum(1),
            con_probs.entropy().sum(1),
            self.critic(hidden),
        )
|
|
||||||
|
|
||||||
|
|
||||||
def GAE(agent, args, rewards, dones, values, next_obs, next_done):
    """Compute advantages and returns for one finished trajectory.

    Args:
        agent: object exposing ``get_value(next_obs)``, used to bootstrap the
            value of the state that follows the last step.
        args: namespace providing ``gae`` (use Generalized Advantage
            Estimation when truthy), ``gamma`` and ``gaeLambda``.
        rewards: tensor of per-step rewards; its first dimension is the
            number of steps.
        dones: per-step done flags, indexed like ``rewards``.
        values: per-step value estimates, indexed like ``rewards``.
        next_obs: observation following the last step.
        next_done: done flag following the last step.

    Returns:
        ``(advantages, returns)``.
    """
    with torch.no_grad():
        next_value = agent.get_value(next_obs).reshape(1, -1)
        data_size = rewards.size()[0]
        if args.gae:
            # zeros_like already allocates on the device of ``rewards``, so no
            # module-level ``device`` lookup is needed here.
            advantages = torch.zeros_like(rewards)
            lastgaelam = 0
            for t in reversed(range(data_size)):
                if t == data_size - 1:
                    # last step bootstraps from the state after the trajectory
                    nextnonterminal = 1.0 - next_done
                    nextvalues = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    nextvalues = values[t + 1]
                delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                advantages[t] = lastgaelam = (
                    delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
                )
            returns = advantages + values
        else:
            # plain discounted returns, accumulated backwards in time
            returns = torch.zeros_like(rewards)
            for t in reversed(range(data_size)):
                if t == data_size - 1:
                    nextnonterminal = 1.0 - next_done
                    next_return = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    next_return = returns[t + 1]
                returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
            advantages = returns - values
    return advantages, returns
|
|
||||||
|
|
||||||
class AimbotSideChannel(SideChannel):
    """Side channel used to exchange messages with the Unity environment."""

    def __init__(self, channel_id: uuid.UUID) -> None:
        super().__init__(channel_id)

    def on_message_received(self, msg: IncomingMessage) -> None:
        """Handle a string message received from Unity.

        The message is split on ``"|"``. When the first field is ``"result"``
        the round counter keyed by the second field is incremented in
        ``TotalRounds``, and also in ``WinRounds`` if the third field is
        ``"Win"``. When the first field is ``"Error"`` the whole message is
        printed. Any other message is ignored.
        """
        text = msg.read_string()
        fields = text.split("|")
        kind = fields[0]
        if kind == "result":
            TotalRounds[fields[1]] += 1
            if fields[2] == "Win":
                WinRounds[fields[1]] += 1
        elif kind == "Error":
            print(text)

    # send functions
    def send_string(self, data: str) -> None:
        """Queue a string to be sent to the C# side."""
        outgoing = OutgoingMessage()
        outgoing.write_string(data)
        super().queue_message_to_send(outgoing)

    def send_bool(self, data: bool) -> None:
        """Queue a boolean to be sent."""
        outgoing = OutgoingMessage()
        outgoing.write_bool(data)
        super().queue_message_to_send(outgoing)

    def send_int(self, data: int) -> None:
        """Queue a 32-bit integer to be sent."""
        outgoing = OutgoingMessage()
        outgoing.write_int32(data)
        super().queue_message_to_send(outgoing)

    def send_float(self, data: float) -> None:
        """Queue a 32-bit float to be sent."""
        outgoing = OutgoingMessage()
        outgoing.write_float32(data)
        super().queue_message_to_send(outgoing)

    def send_float_list(self, data: List[float]) -> None:
        """Queue a list of 32-bit floats to be sent."""
        outgoing = OutgoingMessage()
        outgoing.write_float32_list(data)
        super().queue_message_to_send(outgoing)
|
|
||||||
|
|
||||||
def broadCastEndReward(rewardBF: list, remainTime: float):
    """Adjust a finished round's rewards according to its final reward.

    The last entry of ``rewardBF`` decides the outcome:

    * ``<= -500``: ``BASE_LOSEREWARD`` is subtracted from the last entry;
      nothing is broadcast to the other steps.
    * ``>= 500``: ``BASE_WINREWARD`` is subtracted from the last entry, then
      ``remainTime * args.result_broadcast_ratio`` is added to every step.
    * otherwise: a warning is printed and the rewards are used unchanged.

    Args:
        rewardBF: per-step rewards of the round. The list is not modified.
        remainTime: value scaled by ``args.result_broadcast_ratio`` to form
            the broadcast bonus.

    Returns:
        The adjusted rewards as a ``torch.Tensor`` moved to ``device``.
    """
    # Work on a copy: assigning the list directly would alias the caller's
    # buffer and silently rewrite its last element.
    thisRewardBF = list(rewardBF)
    final_reward = rewardBF[-1]
    if final_reward <= -500:
        thisRewardBF[-1] = final_reward - BASE_LOSEREWARD
    elif final_reward >= 500:
        thisRewardBF[-1] = final_reward - BASE_WINREWARD
        thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * args.result_broadcast_ratio)).tolist()
    else:
        print("!!!!!DIDNT GET RESULT REWARD!!!!!!", final_reward)
    return torch.Tensor(thisRewardBF).to(device)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
@ -332,378 +29,227 @@ if __name__ == "__main__":
|
|||||||
torch.manual_seed(args.seed)
|
torch.manual_seed(args.seed)
|
||||||
|
|
||||||
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
|
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
|
||||||
|
best_reward = -1
|
||||||
|
|
||||||
# Initialize environment anget optimizer
|
# Initialize environment agent optimizer
|
||||||
aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID);
|
aimbot_side_channel = AimbotSideChannel(SIDE_CHANNEL_UUID)
|
||||||
env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimBotsideChannel])
|
env = Aimbot(
|
||||||
agentList = []
|
env_path=args.path,
|
||||||
optimizers = []
|
worker_id=args.workerID,
|
||||||
|
base_port=args.baseport,
|
||||||
|
side_channels=[aimbot_side_channel])
|
||||||
if args.load_dir is None:
|
if args.load_dir is None:
|
||||||
for i in range(using_targets_num):
|
agent = PPOAgent(
|
||||||
agentList.append(PPOAgent(env,TARGETNUM).to(device))
|
env=env,
|
||||||
optimizers.append(optim.Adam(agentList[i].parameters(), lr=args.lr, eps=1e-5))
|
this_args=args,
|
||||||
|
device=device,
|
||||||
|
).to(device)
|
||||||
else:
|
else:
|
||||||
print("NAH")
|
agent = torch.load(args.load_dir)
|
||||||
# !!!not finished
|
# freeze
|
||||||
# agent = torch.load(args.load_dir)
|
if args.freeze_viewnet:
|
||||||
# print("Load Agent", args.load_dir)
|
# freeze the view network
|
||||||
# print(agent.eval())
|
print("FREEZE VIEW NETWORK is not compatible with Full MNN!")
|
||||||
|
raise NotImplementedError
|
||||||
|
print("Load Agent", args.load_dir)
|
||||||
|
print(agent.eval())
|
||||||
|
# optimizer
|
||||||
|
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
|
||||||
# Tensorboard and WandB Recorder
|
# Tensorboard and WandB Recorder
|
||||||
game_name = "Aimbot_Target_Hybrid_PMNN_V2"
|
run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
|
||||||
game_type = "OffPolicy_EndBC"
|
wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
|
||||||
run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
|
|
||||||
if args.wandb_track:
|
|
||||||
wandb.init(
|
|
||||||
project=game_name,
|
|
||||||
entity=args.wandb_entity,
|
|
||||||
sync_tensorboard=True,
|
|
||||||
config=vars(args),
|
|
||||||
name=run_name,
|
|
||||||
monitor_gym=True,
|
|
||||||
save_code=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
writer = SummaryWriter(f"runs/{run_name}")
|
|
||||||
writer.add_text(
|
|
||||||
"hyperparameters",
|
|
||||||
"|param|value|\n|-|-|\n%s"
|
|
||||||
% ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
|
|
||||||
)
|
|
||||||
|
|
||||||
# Trajectory Buffer
|
|
||||||
ob_bf = [[] for i in range(env.unity_agent_num)]
|
|
||||||
act_bf = [[] for i in range(env.unity_agent_num)]
|
|
||||||
dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
|
|
||||||
con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
|
|
||||||
rewards_bf = [[] for i in range(env.unity_agent_num)]
|
|
||||||
dones_bf = [[] for i in range(env.unity_agent_num)]
|
|
||||||
values_bf = [[] for i in range(env.unity_agent_num)]
|
|
||||||
|
|
||||||
# start the game
|
# start the game
|
||||||
total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
|
total_update_step = args.target_num * args.total_timesteps // args.datasetSize
|
||||||
target_steps = [0 for i in range(TARGETNUM)]
|
target_steps = [0 for i in range(args.target_num)]
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
state, _, done = env.reset()
|
state, _, done = env.reset()
|
||||||
# state = torch.Tensor(next_obs).to(device)
|
|
||||||
# next_done = torch.zeros(env.unity_agent_num).to(device)
|
|
||||||
|
|
||||||
# initialize empty training datasets
|
|
||||||
obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_observation_size)
|
|
||||||
actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_action_size)
|
|
||||||
dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
|
|
||||||
con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
|
|
||||||
rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
|
|
||||||
values = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
|
|
||||||
advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
|
|
||||||
returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
|
|
||||||
|
|
||||||
for total_steps in range(total_update_step):
|
|
||||||
# discunt learning rate, while step == total_update_step lr will be 0
|
|
||||||
|
|
||||||
if args.annealLR:
|
|
||||||
finalRatio = TARGET_LEARNING_RATE/args.lr
|
|
||||||
frac = 1.0 - ((total_steps + 1.0) / total_update_step)
|
|
||||||
lrnow = frac * args.lr
|
|
||||||
for optimizer in optimizers:
|
|
||||||
optimizer.param_groups[0]["lr"] = lrnow
|
|
||||||
else:
|
|
||||||
lrnow = args.lr
|
|
||||||
print("new episode",total_steps,"learning rate = ",lrnow)
|
|
||||||
|
|
||||||
|
# initialize AI memories
|
||||||
|
ppo_memories = PPOMem(
|
||||||
|
args=args,
|
||||||
|
unity_agent_num=env.unity_agent_num,
|
||||||
|
device=device,
|
||||||
|
)
|
||||||
|
|
||||||
# MAIN LOOP: run agent in environment
|
# MAIN LOOP: run agent in environment
|
||||||
|
for total_steps in range(total_update_step):
|
||||||
|
# discount learning rate, while step == total_update_step lr will be 0
|
||||||
|
if args.annealLR:
|
||||||
|
final_lr_ratio = args.target_lr / args.lr
|
||||||
|
frac = 1.0 - ((total_steps + 1.0) / total_update_step)
|
||||||
|
lr_now = frac * args.lr
|
||||||
|
optimizer.param_groups[0]["lr"] = lr_now
|
||||||
|
else:
|
||||||
|
lr_now = args.lr
|
||||||
|
|
||||||
|
# episode start show learning rate
|
||||||
|
print("new episode", total_steps, "learning rate = ", lr_now)
|
||||||
step = 0
|
step = 0
|
||||||
training = False
|
training = False
|
||||||
trainQueue = []
|
train_queue = []
|
||||||
last_reward = [0.for i in range(env.unity_agent_num)]
|
last_reward = [0. for i in range(env.unity_agent_num)]
|
||||||
action = torch.zeros((env.unity_agent_num,env.unity_discrete_type+env.unity_continuous_size))
|
# MAIN LOOP: run agent in environment
|
||||||
dis_logprob = torch.zeros((env.unity_agent_num,env.unity_discrete_size))
|
|
||||||
con_logprob = torch.zeros((env.unity_agent_num,env.unity_continuous_size))
|
|
||||||
value = torch.zeros((env.unity_agent_num,1))
|
|
||||||
while True:
|
while True:
|
||||||
|
# Target Type(state[0][0]) is stay(4),use all zero action
|
||||||
|
if state[0][0] == 4:
|
||||||
|
next_state, reward, next_done = env.step(env.all_zero_action)
|
||||||
|
state, done = next_state, next_done
|
||||||
|
continue
|
||||||
|
# On decision point, and Target Type(state[0][0]) is not stay(4) choose action by agent
|
||||||
if step % args.decision_period == 0:
|
if step % args.decision_period == 0:
|
||||||
step += 1
|
step += 1
|
||||||
# Choose action by agent
|
# Choose action by agent
|
||||||
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
# predict actions
|
# predict actions
|
||||||
for i in range(env.unity_agent_num):
|
action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
|
||||||
actTarget = int(state[i][0])
|
torch.tensor(state,dtype=torch.float32).to(device)
|
||||||
act, dis_lgprb, _, con_lgprb, _, vl = agentList[actTarget].get_actions_value(
|
|
||||||
torch.Tensor([state[i]]).to(device)
|
|
||||||
)
|
)
|
||||||
action[i] = act
|
value = value.flatten()
|
||||||
dis_logprob[i] = dis_lgprb.squeeze(0)
|
|
||||||
con_logprob[i] = con_lgprb.squeeze(0)
|
|
||||||
value[i] = vl.squeeze(0)
|
|
||||||
|
|
||||||
# variable from GPU to CPU
|
# variable from GPU to CPU
|
||||||
action_cpu = action.cpu().numpy()
|
action_cpu = action.cpu().numpy()
|
||||||
dis_logprob_cpu = dis_logprob.cpu().numpy()
|
dis_logprob_cpu = dis_logprob.cpu().numpy()
|
||||||
con_logprob_cpu = con_logprob.cpu().numpy()
|
con_logprob_cpu = con_logprob.cpu().numpy()
|
||||||
value_cpu = value.flatten().cpu().numpy()
|
value_cpu = value.cpu().numpy()
|
||||||
# Environment step
|
# Environment step
|
||||||
next_state, reward, next_done = env.step(action_cpu)
|
next_state, reward, next_done = env.step(action_cpu)
|
||||||
|
|
||||||
# save memories
|
# save memories
|
||||||
for i in range(env.unity_agent_num):
|
if args.train:
|
||||||
# save memories to buffers
|
ppo_memories.save_memories(
|
||||||
ob_bf[i].append(state[i])
|
now_step=step,
|
||||||
act_bf[i].append(action_cpu[i])
|
agent=agent,
|
||||||
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
|
state=state,
|
||||||
con_logprobs_bf[i].append(con_logprob_cpu[i])
|
action_cpu=action_cpu,
|
||||||
rewards_bf[i].append(reward[i]+last_reward[i])
|
dis_logprob_cpu=dis_logprob_cpu,
|
||||||
dones_bf[i].append(done[i])
|
con_logprob_cpu=con_logprob_cpu,
|
||||||
values_bf[i].append(value_cpu[i])
|
reward=reward,
|
||||||
remainTime = state[i,TARGET_STATE_SIZE]
|
done=done,
|
||||||
if next_done[i] == True:
|
value_cpu=value_cpu,
|
||||||
# finished a round, send finished memories to training datasets
|
last_reward=last_reward,
|
||||||
# compute advantage and discounted reward
|
next_done=next_done,
|
||||||
#print(i,"over")
|
next_state=next_state,
|
||||||
endTarget = int(ob_bf[i][0][0])
|
|
||||||
roundTargetType = int(state[i,0])
|
|
||||||
thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
|
|
||||||
adv, rt = GAE(
|
|
||||||
agentList[endTarget],
|
|
||||||
args,
|
|
||||||
thisRewardsTensor,
|
|
||||||
torch.Tensor(dones_bf[i]).to(device),
|
|
||||||
torch.tensor(values_bf[i]).to(device),
|
|
||||||
torch.tensor(next_state[i]).to(device).unsqueeze(0),
|
|
||||||
torch.Tensor([next_done[i]]).to(device),
|
|
||||||
)
|
)
|
||||||
# send memories to training datasets
|
# check if any training dataset is full and ready to train
|
||||||
obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
|
for i in range(args.target_num):
|
||||||
actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
|
if ppo_memories.obs[i].size()[0] >= args.datasetSize:
|
||||||
dis_logprobs[roundTargetType] = torch.cat(
|
|
||||||
(dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
|
|
||||||
)
|
|
||||||
con_logprobs[roundTargetType] = torch.cat(
|
|
||||||
(con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
|
|
||||||
)
|
|
||||||
rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
|
|
||||||
values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
|
|
||||||
advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
|
|
||||||
returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
|
|
||||||
|
|
||||||
# clear buffers
|
|
||||||
ob_bf[i] = []
|
|
||||||
act_bf[i] = []
|
|
||||||
dis_logprobs_bf[i] = []
|
|
||||||
con_logprobs_bf[i] = []
|
|
||||||
rewards_bf[i] = []
|
|
||||||
dones_bf[i] = []
|
|
||||||
values_bf[i] = []
|
|
||||||
print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
|
|
||||||
|
|
||||||
for i in range(TARGETNUM):
|
|
||||||
if obs[i].size()[0] >= args.datasetSize:
|
|
||||||
# start train NN
|
# start train NN
|
||||||
trainQueue.append(i)
|
train_queue.append(i)
|
||||||
if(len(trainQueue)>0):
|
if len(train_queue) > 0:
|
||||||
|
# break while loop and start train
|
||||||
break
|
break
|
||||||
|
# update state
|
||||||
state, done = next_state, next_done
|
state, done = next_state, next_done
|
||||||
else:
|
else:
|
||||||
step += 1
|
step += 1
|
||||||
# skip this step use last predict action
|
# skip this step use last predict action
|
||||||
next_state, reward, next_done = env.step(action_cpu)
|
next_state, reward, next_done = env.step(action_cpu)
|
||||||
# save memories
|
# save memories
|
||||||
for i in range(env.unity_agent_num):
|
if args.train:
|
||||||
if next_done[i] == True:
|
ppo_memories.save_memories(
|
||||||
#print(i,"over???")
|
now_step=step,
|
||||||
# save memories to buffers
|
agent=agent,
|
||||||
ob_bf[i].append(state[i])
|
state=state,
|
||||||
act_bf[i].append(action_cpu[i])
|
action_cpu=action_cpu,
|
||||||
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
|
dis_logprob_cpu=dis_logprob_cpu,
|
||||||
con_logprobs_bf[i].append(con_logprob_cpu[i])
|
con_logprob_cpu=con_logprob_cpu,
|
||||||
rewards_bf[i].append(reward[i])
|
reward=reward,
|
||||||
dones_bf[i].append(done[i])
|
done=done,
|
||||||
values_bf[i].append(value_cpu[i])
|
value_cpu=value_cpu,
|
||||||
remainTime = state[i,TARGET_STATE_SIZE]
|
last_reward=last_reward,
|
||||||
# finished a round, send finished memories to training datasets
|
next_done=next_done,
|
||||||
# compute advantage and discounted reward
|
next_state=next_state,
|
||||||
roundTargetType = int(state[i,0])
|
|
||||||
thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
|
|
||||||
adv, rt = GAE(
|
|
||||||
agentList[roundTargetType],
|
|
||||||
args,
|
|
||||||
thisRewardsTensor,
|
|
||||||
torch.Tensor(dones_bf[i]).to(device),
|
|
||||||
torch.tensor(values_bf[i]).to(device),
|
|
||||||
torch.Tensor(next_state[i]).to(device).unsqueeze(dim = 0),
|
|
||||||
torch.Tensor([next_done[i]]).to(device),
|
|
||||||
)
|
)
|
||||||
# send memories to training datasets
|
# update state
|
||||||
obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
|
|
||||||
actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
|
|
||||||
dis_logprobs[roundTargetType] = torch.cat(
|
|
||||||
(dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
|
|
||||||
)
|
|
||||||
con_logprobs[roundTargetType] = torch.cat(
|
|
||||||
(con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
|
|
||||||
)
|
|
||||||
rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
|
|
||||||
values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
|
|
||||||
advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
|
|
||||||
returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
|
|
||||||
|
|
||||||
# clear buffers
|
|
||||||
ob_bf[i] = []
|
|
||||||
act_bf[i] = []
|
|
||||||
dis_logprobs_bf[i] = []
|
|
||||||
con_logprobs_bf[i] = []
|
|
||||||
rewards_bf[i] = []
|
|
||||||
dones_bf[i] = []
|
|
||||||
values_bf[i] = []
|
|
||||||
print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
|
|
||||||
|
|
||||||
state = next_state
|
state = next_state
|
||||||
last_reward = reward
|
last_reward = reward
|
||||||
i += 1
|
|
||||||
|
|
||||||
if args.train:
|
if args.train:
|
||||||
meanRewardList = [] # for WANDB
|
# train mode on
|
||||||
# loop all tarining queue
|
mean_reward_list = [] # for WANDB
|
||||||
for thisT in trainQueue:
|
# loop all training queue
|
||||||
target_steps[thisT]+=1
|
for this_train_ind in train_queue:
|
||||||
# flatten the batch
|
# start time
|
||||||
b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
|
start_time = time.time()
|
||||||
b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
|
target_steps[this_train_ind] += 1
|
||||||
b_con_logprobs = con_logprobs[thisT].reshape(-1)
|
# train agent
|
||||||
b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
|
|
||||||
b_advantages = advantages[thisT].reshape(-1)
|
|
||||||
b_returns = returns[thisT].reshape(-1)
|
|
||||||
b_values = values[thisT].reshape(-1)
|
|
||||||
b_size = b_obs.size()[0]
|
|
||||||
# Optimizing the policy and value network
|
|
||||||
b_inds = np.arange(b_size)
|
|
||||||
# clipfracs = []
|
|
||||||
for epoch in range(args.epochs):
|
|
||||||
print(epoch,end="")
|
|
||||||
# shuffle all datasets
|
|
||||||
np.random.shuffle(b_inds)
|
|
||||||
for start in range(0, b_size, args.minibatchSize):
|
|
||||||
print(".",end="")
|
|
||||||
end = start + args.minibatchSize
|
|
||||||
mb_inds = b_inds[start:end]
|
|
||||||
mb_advantages = b_advantages[mb_inds]
|
|
||||||
|
|
||||||
# normalize advantages
|
|
||||||
if args.norm_adv:
|
|
||||||
mb_advantages = (mb_advantages - mb_advantages.mean()) / (
|
|
||||||
mb_advantages.std() + 1e-8
|
|
||||||
)
|
|
||||||
|
|
||||||
(
|
(
|
||||||
_,
|
v_loss,
|
||||||
new_dis_logprob,
|
dis_pg_loss,
|
||||||
dis_entropy,
|
con_pg_loss,
|
||||||
new_con_logprob,
|
loss,
|
||||||
con_entropy,
|
entropy_loss
|
||||||
newvalue,
|
) = agent.train_net(
|
||||||
) = agentList[thisT].get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
|
this_train_ind=this_train_ind,
|
||||||
# discrete ratio
|
ppo_memories=ppo_memories,
|
||||||
dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
|
optimizer=optimizer
|
||||||
dis_ratio = dis_logratio.exp()
|
|
||||||
# continuous ratio
|
|
||||||
con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
|
|
||||||
con_ratio = con_logratio.exp()
|
|
||||||
|
|
||||||
"""
|
|
||||||
# early stop
|
|
||||||
with torch.no_grad():
|
|
||||||
# calculate approx_kl http://joschu.net/blog/kl-approx.html
|
|
||||||
old_approx_kl = (-logratio).mean()
|
|
||||||
approx_kl = ((ratio - 1) - logratio).mean()
|
|
||||||
clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
|
|
||||||
"""
|
|
||||||
|
|
||||||
# discrete Policy loss
|
|
||||||
dis_pg_loss_orig = -mb_advantages * dis_ratio
|
|
||||||
dis_pg_loss_clip = -mb_advantages * torch.clamp(
|
|
||||||
dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
|
|
||||||
)
|
)
|
||||||
dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
|
|
||||||
# continuous Policy loss
|
|
||||||
con_pg_loss_orig = -mb_advantages * con_ratio
|
|
||||||
con_pg_loss_clip = -mb_advantages * torch.clamp(
|
|
||||||
con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
|
|
||||||
)
|
|
||||||
con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
|
|
||||||
|
|
||||||
# Value loss
|
|
||||||
newvalue = newvalue.view(-1)
|
|
||||||
if args.clip_vloss:
|
|
||||||
v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
|
|
||||||
v_clipped = b_values[mb_inds] + torch.clamp(
|
|
||||||
newvalue - b_values[mb_inds],
|
|
||||||
-args.clip_coef,
|
|
||||||
args.clip_coef,
|
|
||||||
)
|
|
||||||
v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
|
|
||||||
v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
|
|
||||||
v_loss = 0.5 * v_loss_max.mean()
|
|
||||||
else:
|
|
||||||
v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
|
|
||||||
|
|
||||||
# total loss
|
|
||||||
entropy_loss = dis_entropy.mean() + con_entropy.mean()
|
|
||||||
loss = (
|
|
||||||
dis_pg_loss * POLICY_COEF[thisT]
|
|
||||||
+ con_pg_loss * POLICY_COEF[thisT]
|
|
||||||
+ entropy_loss * ENTROPY_COEF[thisT]
|
|
||||||
+ v_loss * CRITIC_COEF[thisT]
|
|
||||||
)*LOSS_COEF[thisT]
|
|
||||||
|
|
||||||
optimizers[thisT].zero_grad()
|
|
||||||
loss.backward()
|
|
||||||
# Clips gradient norm of an iterable of parameters.
|
|
||||||
nn.utils.clip_grad_norm_(agentList[thisT].parameters(), args.max_grad_norm)
|
|
||||||
optimizers[thisT].step()
|
|
||||||
|
|
||||||
"""
|
|
||||||
if args.target_kl is not None:
|
|
||||||
if approx_kl > args.target_kl:
|
|
||||||
break
|
|
||||||
"""
|
|
||||||
# record mean reward before clear history
|
# record mean reward before clear history
|
||||||
print("done")
|
print("done")
|
||||||
targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
|
target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
|
||||||
meanRewardList.append(targetRewardMean)
|
mean_reward_list.append(target_reward_mean)
|
||||||
targetName = Targets(thisT).name
|
targetName = Targets(this_train_ind).name
|
||||||
|
|
||||||
# clear this target trainning set buffer
|
# clear this target training set buffer
|
||||||
obs[thisT] = torch.tensor([]).to(device)
|
ppo_memories.clear_training_datasets(this_train_ind)
|
||||||
actions[thisT] = torch.tensor([]).to(device)
|
# record rewards for plotting purposes
|
||||||
dis_logprobs[thisT] = torch.tensor([]).to(device)
|
wdb_recorder.add_target_scalar(
|
||||||
con_logprobs[thisT] = torch.tensor([]).to(device)
|
targetName,
|
||||||
rewards[thisT] = torch.tensor([]).to(device)
|
this_train_ind,
|
||||||
values[thisT] = torch.tensor([]).to(device)
|
v_loss,
|
||||||
advantages[thisT] = torch.tensor([]).to(device)
|
dis_pg_loss,
|
||||||
returns[thisT] = torch.tensor([]).to(device)
|
con_pg_loss,
|
||||||
|
loss,
|
||||||
|
entropy_loss,
|
||||||
|
target_reward_mean,
|
||||||
|
target_steps,
|
||||||
|
)
|
||||||
|
print(f"episode over Target{targetName} mean reward:", target_reward_mean)
|
||||||
|
TotalRewardMean = np.mean(mean_reward_list)
|
||||||
|
wdb_recorder.add_global_scalar(
|
||||||
|
TotalRewardMean,
|
||||||
|
optimizer.param_groups[0]["lr"],
|
||||||
|
total_steps,
|
||||||
|
)
|
||||||
|
# print cost time as seconds
|
||||||
|
print("cost time:", time.time() - start_time)
|
||||||
|
# New Record! or save model
|
||||||
|
if ((is_save_model() or TotalRewardMean > best_reward) and args.save_model):
|
||||||
|
# check saveDir is exist
|
||||||
|
saveDir = "../PPO-Model/" + run_name + "/"
|
||||||
|
if not os.path.isdir(saveDir):
|
||||||
|
os.mkdir(saveDir)
|
||||||
|
best_reward = TotalRewardMean
|
||||||
|
torch.save(agent, saveDir + str(TotalRewardMean) + ".pt")
|
||||||
|
print("Model Saved!")
|
||||||
|
set_save_model(False)
|
||||||
|
else:
|
||||||
|
# train mode off
|
||||||
|
mean_reward_list = [] # for WANDB
|
||||||
|
# while not in training mode, clear the buffer
|
||||||
|
for this_train_ind in train_queue:
|
||||||
|
target_steps[this_train_ind] += 1
|
||||||
|
targetName = Targets(this_train_ind).name
|
||||||
|
target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
|
||||||
|
mean_reward_list.append(target_reward_mean)
|
||||||
|
print(target_steps[this_train_ind])
|
||||||
|
|
||||||
|
# clear this target training set buffer
|
||||||
|
ppo_memories.clear_training_datasets(this_train_ind)
|
||||||
|
|
||||||
# record rewards for plotting purposes
|
# record rewards for plotting purposes
|
||||||
writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
|
wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", target_reward_mean,
|
||||||
writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
|
target_steps[this_train_ind])
|
||||||
writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
|
wdb_recorder.add_win_ratio(targetName, target_steps[this_train_ind])
|
||||||
writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
|
print(f"episode over Target{targetName} mean reward:", target_reward_mean)
|
||||||
writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
|
TotalRewardMean = np.mean(mean_reward_list)
|
||||||
writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
|
wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
|
||||||
writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT])
|
|
||||||
print(f"episode over Target{targetName} mean reward:", targetRewardMean)
|
|
||||||
TotalRewardMean = np.mean(meanRewardList)
|
|
||||||
writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
|
|
||||||
writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps)
|
|
||||||
# New Record!
|
|
||||||
if TotalRewardMean > bestReward:
|
|
||||||
bestReward = targetRewardMean
|
|
||||||
for i in range(using_targets_num):
|
|
||||||
saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) +"_"+ str(i)+".pt"
|
|
||||||
torch.save(agentList[i], saveDir)
|
|
||||||
|
|
||||||
for i in range(using_targets_num):
|
saveDir = "../PPO-Model/" + run_name + "/"
|
||||||
saveDir = "../PPO-Model/"+ run_name +"_last_"+ str(i) + ".pt"
|
if not os.path.isdir(saveDir):
|
||||||
torch.save(agentList[i], saveDir)
|
os.mkdir(saveDir)
|
||||||
|
best_reward = target_reward_mean
|
||||||
|
torch.save(agent, saveDir + "_last.pt")
|
||||||
env.close()
|
env.close()
|
||||||
writer.close()
|
wdb_recorder.writer.close()
|
||||||
|
143
Aimbot-PPO-Python/Pytorch/aimemory.py
Normal file
143
Aimbot-PPO-Python/Pytorch/aimemory.py
Normal file
@ -0,0 +1,143 @@
|
|||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
import argparse
|
||||||
|
from ppoagent import PPOAgent
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
# public data
|
||||||
|
class Targets(Enum):
    """Integer identifiers of the target types."""

    Free = 0
    Go = 1
    Attack = 2
    Defence = 3
    # presumably the count of the target types listed above -- TODO confirm
    Num = 4
|
||||||
|
|
||||||
|
class PPOMem:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
args: argparse.Namespace,
|
||||||
|
unity_agent_num: int,
|
||||||
|
device: torch.device,
|
||||||
|
) -> None:
|
||||||
|
self.target_num = args.target_num
|
||||||
|
self.data_set_size = args.datasetSize
|
||||||
|
self.result_broadcast_ratio = args.result_broadcast_ratio
|
||||||
|
self.decision_period = args.decision_period
|
||||||
|
self.unity_agent_num = unity_agent_num
|
||||||
|
|
||||||
|
self.base_lose_reward = args.base_lose_reward
|
||||||
|
self.base_win_reward = args.base_win_reward
|
||||||
|
self.target_state_size = args.target_state_size
|
||||||
|
self.device = device
|
||||||
|
|
||||||
|
# Trajectory Buffer
|
||||||
|
self.ob_bf = [[] for i in range(self.unity_agent_num)]
|
||||||
|
self.act_bf = [[] for i in range(self.unity_agent_num)]
|
||||||
|
self.dis_logprobs_bf = [[] for i in range(self.unity_agent_num)]
|
||||||
|
self.con_logprobs_bf = [[] for i in range(self.unity_agent_num)]
|
||||||
|
self.rewards_bf = [[] for i in range(self.unity_agent_num)]
|
||||||
|
self.dones_bf = [[] for i in range(self.unity_agent_num)]
|
||||||
|
self.values_bf = [[] for i in range(self.unity_agent_num)]
|
||||||
|
|
||||||
|
# initialize empty training datasets
|
||||||
|
self.obs = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,env.unity_observation_size)
|
||||||
|
self.actions = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,env.unity_action_size)
|
||||||
|
self.dis_logprobs = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
|
||||||
|
self.con_logprobs = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
|
||||||
|
self.rewards = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
|
||||||
|
self.values = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
|
||||||
|
self.advantages = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
|
||||||
|
self.returns = [torch.tensor([]).to(device) for i in range(self.target_num)] # (TARGETNUM,n,1)
|
||||||
|
|
||||||
|
def broad_cast_end_reward(self, rewardBF: list, remainTime: float) -> torch.Tensor:
    """Adjust a finished round's rewards according to its final reward.

    Works on a copy of ``rewardBF``. If the last reward is ``>= 500`` the
    mean of the rewards is printed, ``self.base_win_reward`` is subtracted
    from the last entry and ``remainTime * self.result_broadcast_ratio`` is
    added to every entry. If it is ``<= -500`` only ``self.base_lose_reward``
    is subtracted from the last entry. Otherwise a warning is printed and the
    rewards are left as they are.

    Returns:
        The adjusted rewards as a float32 tensor on ``self.device``.
    """
    adjusted = rewardBF.copy()
    final_reward = rewardBF[-1]
    if final_reward >= 500:
        # win: strip the base win reward, then broadcast the time bonus
        print(sum(adjusted) / len(adjusted))
        adjusted[-1] = final_reward - self.base_win_reward
        bonus = remainTime * self.result_broadcast_ratio
        adjusted = (np.asarray(adjusted) + bonus).tolist()
    elif final_reward <= -500:
        # lose: strip the base lose reward, nothing is broadcast
        adjusted[-1] = final_reward - self.base_lose_reward
    else:
        print("!!!!!DIDNT GET RESULT REWARD!!!!!!", final_reward)
    return torch.tensor(adjusted, dtype=torch.float32).to(self.device)
|
||||||
|
|
||||||
|
def save_memories(
    self,
    now_step: int,
    agent: PPOAgent,
    state: np.ndarray,
    action_cpu: np.ndarray,
    dis_logprob_cpu: np.ndarray,
    con_logprob_cpu: np.ndarray,
    reward: list,
    done: list,
    value_cpu: np.ndarray,
    last_reward: list,
    next_done: list,
    next_state: np.ndarray,
):
    """Buffer one environment step per agent and flush finished rounds.

    For every agent index, the step is appended to the per-agent buffers
    when ``now_step`` falls on a decision period or the agent's round just
    finished (``next_done[i]``). When the round finished, the buffered
    rewards are post-processed by ``broad_cast_end_reward``, advantages and
    returns are computed with ``agent.gae``, everything is concatenated
    onto the training dataset selected by ``int(state[i, 0])``, and the
    agent's buffers are cleared.

    Args:
        now_step: current step counter, tested against ``self.decision_period``.
        agent: object providing ``gae(rewards, dones, values, next_obs, next_done)``.
        state: per-agent observations; ``state[i, 0]`` is used as the target
            type index and ``state[i, self.target_state_size]`` as the
            remaining time.
        action_cpu: per-agent actions.
        dis_logprob_cpu: per-agent discrete log-probabilities.
        con_logprob_cpu: per-agent continuous log-probabilities.
        reward: per-agent rewards of this step.
        done: per-agent done flags of this step.
        value_cpu: per-agent critic values.
        last_reward: per-agent rewards carried over from skipped steps;
            only added on decision steps.
        next_done: per-agent flags telling whether the round just finished.
        next_state: per-agent next observations, used to bootstrap GAE.
    """
    # The decision-step test does not depend on the agent: compute it once.
    on_decision_step = now_step % self.decision_period == 0

    for i in range(self.unity_agent_num):
        round_finished = bool(next_done[i])
        if not (on_decision_step or round_finished):
            # Skipped step: nothing is recorded for this agent.
            continue

        self.ob_bf[i].append(state[i])
        self.act_bf[i].append(action_cpu[i])
        self.dis_logprobs_bf[i].append(dis_logprob_cpu[i])
        self.con_logprobs_bf[i].append(con_logprob_cpu[i])
        self.dones_bf[i].append(done[i])
        self.values_bf[i].append(value_cpu[i])
        if on_decision_step:
            # Include the reward of the skipped steps (only matters when
            # decision_period != 1).
            self.rewards_bf[i].append(reward[i] + last_reward[i])
        else:
            self.rewards_bf[i].append(reward[i])

        if not round_finished:
            continue

        # Round finished: turn the buffers into training data.
        remain_time = state[i, self.target_state_size]
        target_type = int(state[i, 0])
        rewards_tensor = self.broad_cast_end_reward(self.rewards_bf[i], remain_time)
        # Go through np.array first: torch.tensor() on a list of ndarrays
        # takes a slow element-wise path.
        values_tensor = torch.tensor(np.array(self.values_bf[i])).to(self.device)
        adv, rt = agent.gae(
            rewards=rewards_tensor,
            dones=torch.tensor(self.dones_bf[i], dtype=torch.float32).to(self.device),
            values=values_tensor,
            next_obs=torch.tensor(next_state[i]).to(self.device).unsqueeze(0),
            next_done=torch.tensor([next_done[i]], dtype=torch.float32).to(self.device),
        )

        # Append the round to the dataset of its target type.
        round_data = {
            "obs": torch.tensor(np.array(self.ob_bf[i])).to(self.device),
            "actions": torch.tensor(np.array(self.act_bf[i])).to(self.device),
            "dis_logprobs": torch.tensor(np.array(self.dis_logprobs_bf[i])).to(self.device),
            "con_logprobs": torch.tensor(np.array(self.con_logprobs_bf[i])).to(self.device),
            "rewards": rewards_tensor,
            "values": values_tensor,
            "advantages": adv,
            "returns": rt,
        }
        for dataset_name, chunk in round_data.items():
            dataset = getattr(self, dataset_name)
            dataset[target_type] = torch.cat((dataset[target_type], chunk), 0)

        self.clear_buffers(i)
        print(f"train dataset {Targets(target_type).name} added:{self.obs[target_type].size()[0]}/{self.data_set_size}")
|
||||||
|
|
||||||
|
def clear_buffers(self, ind: int):
    """Replace every rollout buffer of agent ``ind`` with a new empty list."""
    buffer_names = (
        "ob_bf",
        "act_bf",
        "dis_logprobs_bf",
        "con_logprobs_bf",
        "rewards_bf",
        "dones_bf",
        "values_bf",
    )
    for buffer_name in buffer_names:
        # A fresh list per buffer so the buffers never alias each other.
        getattr(self, buffer_name)[ind] = []
|
||||||
|
|
||||||
|
def clear_training_datasets(self, ind: int):
    """Reset every training dataset of target ``ind`` to an empty tensor on ``self.device``."""
    dataset_names = (
        "obs",
        "actions",
        "dis_logprobs",
        "con_logprobs",
        "rewards",
        "values",
        "advantages",
        "returns",
    )
    for dataset_name in dataset_names:
        # Each dataset gets its own new empty tensor.
        getattr(self, dataset_name)[ind] = torch.tensor([]).to(self.device)
|
81
Aimbot-PPO-Python/Pytorch/airecorder.py
Normal file
81
Aimbot-PPO-Python/Pytorch/airecorder.py
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
from torch.utils.tensorboard import SummaryWriter
|
||||||
|
|
||||||
|
import wandb
|
||||||
|
|
||||||
|
# Per-target round counters, keyed by target name. They are read by
# WandbRecorder to compute win ratios.
total_rounds = {"Free": 0, "Go": 0, "Attack": 0}
win_rounds = {"Free": 0, "Go": 0, "Attack": 0}


# class for wandb recording
class WandbRecorder:
    """TensorBoard writer with optional Weights & Biases syncing.

    A ``SummaryWriter`` is always created under ``runs/<run_name>``; wandb is
    initialised (with ``sync_tensorboard=True``) only when
    ``_args.wandb_track`` is truthy.
    """

    def __init__(self, game_name: str, game_type: str, run_name: str, _args) -> None:
        """Create the writer and log the hyperparameters as a markdown table.

        Args:
            game_name: used as the wandb project name.
            game_type: stored on the instance.
            run_name: wandb run name and TensorBoard sub-directory.
            _args: namespace of run arguments; must provide ``wandb_track``
                and ``wandb_entity``. All of its attributes are logged.
        """
        # init wandb
        self.game_name = game_name
        self.game_type = game_type
        self._args = _args
        self.run_name = run_name
        if self._args.wandb_track:
            wandb.init(
                project=self.game_name,
                entity=self._args.wandb_entity,
                sync_tensorboard=True,
                config=vars(self._args),
                name=self.run_name,
                monitor_gym=True,
                save_code=True,
            )
        self.writer = SummaryWriter(f"runs/{self.run_name}")
        self.writer.add_text(
            "hyperparameters",
            "|param|value|\n|-|-|\n%s"
            % ("\n".join([f"|{key}|{value}|" for key, value in vars(self._args).items()])),
        )

    @staticmethod
    def _win_ratio(target_name) -> float:
        """Return wins / total rounds for ``target_name``.

        Returns 0.0 when no round has been played yet or the target has no
        counter, instead of raising ZeroDivisionError / KeyError.
        """
        played = total_rounds.get(target_name, 0)
        if not played:
            return 0.0
        return win_rounds.get(target_name, 0) / played

    def add_target_scalar(
        self,
        target_name,
        this_t,
        v_loss,
        dis_pg_loss,
        con_pg_loss,
        loss,
        entropy_loss,
        target_reward_mean,
        target_steps,
    ):
        """Log the training scalars of one target under ``Target<target_name>/``.

        Args:
            target_name: target name used in the scalar tags and as the key
                of the round counters.
            this_t: index into ``target_steps`` selecting the global step.
            v_loss, dis_pg_loss, con_pg_loss, loss, entropy_loss: objects
                exposing ``.item()`` (e.g. scalar tensors).
            target_reward_mean: mean reward, logged as-is.
            target_steps: indexable collection of per-target step counters.
        """
        step = target_steps[this_t]
        prefix = f"Target{target_name}"
        self.writer.add_scalar(f"{prefix}/value_loss", v_loss.item(), step)
        self.writer.add_scalar(f"{prefix}/dis_policy_loss", dis_pg_loss.item(), step)
        self.writer.add_scalar(f"{prefix}/con_policy_loss", con_pg_loss.item(), step)
        self.writer.add_scalar(f"{prefix}/total_loss", loss.item(), step)
        self.writer.add_scalar(f"{prefix}/entropy_loss", entropy_loss.item(), step)
        self.writer.add_scalar(f"{prefix}/Reward", target_reward_mean, step)
        self.add_win_ratio(target_name, step)

    def add_global_scalar(
        self,
        total_reward_mean,
        learning_rate,
        total_steps,
    ):
        """Log the global mean reward and the learning rate at ``total_steps``."""
        self.writer.add_scalar("GlobalCharts/TotalRewardMean", total_reward_mean, total_steps)
        self.writer.add_scalar("GlobalCharts/learning_rate", learning_rate, total_steps)

    def add_win_ratio(self, target_name, target_steps):
        """Log the current win ratio of ``target_name`` at step ``target_steps``."""
        self.writer.add_scalar(
            f"Target{target_name}/WinRatio", self._win_ratio(target_name), target_steps,
        )
|
56
Aimbot-PPO-Python/Pytorch/arguments-cn.md
Normal file
56
Aimbot-PPO-Python/Pytorch/arguments-cn.md
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
|
||||||
|
|
||||||
|
本项目使用以下命令行参数来配置运行环境和模型训练参数:
|
||||||
|
|
||||||
|
- `--seed <int>`:实验的随机种子。默认值为`9331`。
|
||||||
|
- `--path <str>`:环境路径。默认值为`"../Build/3.6/Aimbot-ParallelEnv"`。
|
||||||
|
- `--workerID <int>`:Unity worker ID。默认值为`1`。
|
||||||
|
- `--baseport <int>`:用于连接Unity环境的端口。默认值为`1000`。
|
||||||
|
- `--lr <float>`:优化器的默认学习率。默认值为`5e-5`。
|
||||||
|
- `--cuda`:如果启用,将默认使用cuda。可以通过传入`true`或`false`来开启或关闭。
|
||||||
|
- `--total-timesteps <int>`:实验的总时间步数。默认值为`3150000`。
|
||||||
|
|
||||||
|
### 模型参数
|
||||||
|
|
||||||
|
- `--train`:是否训练模型。默认启用。
|
||||||
|
- `--freeze-viewnet`:是否冻结视图网络(raycast)。默认为`False`。
|
||||||
|
- `--datasetSize <int>`:训练数据集的大小,当数据集收集足够的数据时开始训练。默认值为`6000`。
|
||||||
|
- `--minibatchSize <int>`:minibatch大小。默认值为`512`。
|
||||||
|
- `--epochs <int>`:更新策略的K次迭代。默认值为`3`。
|
||||||
|
- `--annealLR`:是否对策略和价值网络进行学习率退火。默认为`True`。
|
||||||
|
- `--wandb-track`:是否在wandb上跟踪。默认为`True`。
|
||||||
|
- `--save-model`:是否保存模型。默认为`True`。
|
||||||
|
- `--wandb-entity <str>`:wandb项目的实体。默认值为`"koha9"`。
|
||||||
|
- `--load-dir <str>`:模型加载目录。默认值为`None`。
|
||||||
|
- `--decision-period <int>`:Timestep之间的动作执行间隔。默认值为`1`。
|
||||||
|
- `--result-broadcast-ratio <float>`:当赢得回合时,对结果的reward进行broadcast的比例,默认值为`1/30`。
|
||||||
|
- `--target-lr <float>`:下调学习率的目标值。默认值为`1e-6`。
|
||||||
|
|
||||||
|
### 损失函数参数
|
||||||
|
|
||||||
|
- `--policy-coef <float>`:策略损失的系数。默认值为`[0.8, 0.8, 0.8, 0.8]`。
|
||||||
|
- `--entropy-coef <float>`:熵损失的系数。默认值为`[0.05, 0.05, 0.05, 0.05]`。
|
||||||
|
- `--critic-coef <float>`:评论家损失的系数。默认值为`[1.0, 1.0, 1.0, 1.0]`。
|
||||||
|
- `--loss-coef <float>`:总损失的系数。默认值为`[1.0, 1.0, 1.0, 1.0]`。
|
||||||
|
|
||||||
|
### GAE损失参数
|
||||||
|
|
||||||
|
- `--gae`:是否使用GAE进行优势计算。默认启用。
|
||||||
|
- `--norm-adv`:是否标准化优势。默认为`False`。
|
||||||
|
- `--gamma <float>`:折扣因子gamma。默认值为`0.999`。
|
||||||
|
- `--gaeLambda <float>`:GAE的lambda值。默认值为`0.95`。
|
||||||
|
- `--clip-coef <float>`:替代裁剪系数。默认值为`0.11`。
|
||||||
|
- `--clip-vloss`:是否使用论文中的裁剪价值函数损失。默认启用。
|
||||||
|
- `--max-grad-norm <float>`:梯度裁剪的最大范数。默认值为`0.5`。
|
||||||
|
|
||||||
|
### 环境参数
|
||||||
|
|
||||||
|
- `--target-num <int>`:目标种类数量。默认值为`4`。
|
||||||
|
- `--env-timelimit <int>`:每轮的时间限制。默认值为`30`。
|
||||||
|
- `--base-win-reward <int>`:赢得回合的基础奖励。默认值为`999`。
|
||||||
|
- `--base-lose-reward <int>`:输掉回合的基础奖励。默认值为`-999`。
|
||||||
|
- `--target-state-size <int>`:target状态的大小。默认值为`6`。
|
||||||
|
- `--time-state-size <int>`:游戏剩余时间状态的大小。默认值为`1`。
|
||||||
|
- `--gun-state-size <int>`:枪状态的大小。默认值为`1`。
|
||||||
|
- `--my-state-size <int>`:我的状态大小。默认值为`4`。
|
||||||
|
- `--total-target-size <int>`:总target状态的大小。默认值为`13`。
|
52
Aimbot-PPO-Python/Pytorch/arguments-jp.md
Normal file
52
Aimbot-PPO-Python/Pytorch/arguments-jp.md
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
- `--seed <int>`:実験の乱数Seed。デフォルト値は`9331`。
|
||||||
|
- `--path <str>`:環境パス。デフォルト値は`"../Build/3.6/Aimbot-ParallelEnv"`。
|
||||||
|
- `--workerID <int>`:Unity Worker ID。デフォルト値は`1`。
|
||||||
|
- `--baseport <int>`:Unity環境への接続用Port。デフォルト値は`1000`。
|
||||||
|
- `--lr <float>`:Optimizerのデフォルト学習率。デフォルト値は`5e-5`。
|
||||||
|
- `--cuda`:有効にすると、デフォルトでcudaを使用します。`true`または`false`を渡すことで有効/無効を切り替えられます。
|
||||||
|
- `--total-timesteps <int>`:実験の合計タイムステップ数。デフォルト値は`3150000`。
|
||||||
|
|
||||||
|
### モデルパラメータ
|
||||||
|
|
||||||
|
- `--train`:モデルを訓練するかどうか。デフォルトで有効。
|
||||||
|
- `--freeze-viewnet`:ビューネットワーク(raycast)をfreezeする。デフォルトは`False`。
|
||||||
|
- `--datasetSize <int>`:訓練データセットのサイズ。データセットが十分なデータを集めたら訓練を開始する。デフォルト値は`6000`。
|
||||||
|
- `--minibatchSize <int>`:minibatchのサイズ。デフォルト値は`512`。
|
||||||
|
- `--epochs <int>`:epochs。デフォルト値は`3`。
|
||||||
|
- `--annealLR`:ポリシーとバリューネットワークの学習率を退火するかどうか。デフォルトは`True`。
|
||||||
|
- `--wandb-track`:wandbでトラッキングするかどうか。デフォルトは`True`。
|
||||||
|
- `--save-model`:モデルを保存するかどうか。デフォルトは`True`。
|
||||||
|
- `--wandb-entity <str>`:wandbプロジェクトのエンティティ。デフォルト値は`"koha9"`。
|
||||||
|
- `--load-dir <str>`:モデルのロードディレクトリ。デフォルト値は`None`。
|
||||||
|
- `--decision-period <int>`:実際動作を実行する時のタイムステップの間隔。デフォルト値は`1`。
|
||||||
|
- `--result-broadcast-ratio <float>`:ラウンドに勝った場合の報酬のbroadcast ratio、デフォルト値は`1/30`。
|
||||||
|
- `--target-lr <float>`:学習率を下げる時の目標値。デフォルト値は`1e-6`。
|
||||||
|
|
||||||
|
### 損失関数パラメータ
|
||||||
|
|
||||||
|
- `--policy-coef <float>`:policy損失の係数。デフォルト値は`[0.8, 0.8, 0.8, 0.8]`。
|
||||||
|
- `--entropy-coef <float>`:entropy損失の係数。デフォルト値は`[0.05, 0.05, 0.05, 0.05]`。
|
||||||
|
- `--critic-coef <float>`:critic損失の係数。デフォルト値は`[1.0, 1.0, 1.0, 1.0]`。
|
||||||
|
- `--loss-coef <float>`:全体の損失の係数。デフォルト値は`[1.0, 1.0, 1.0, 1.0]`。
|
||||||
|
|
||||||
|
### GAE損失パラメータ
|
||||||
|
|
||||||
|
- `--gae`:GAEを使用してアドバンテージを計算するかどうか。デフォルトで有効。
|
||||||
|
- `--norm-adv`:アドバンテージを正規化するかどうか。デフォルトは`False`。
|
||||||
|
- `--gamma <float>`:割引因子gamma。デフォルト値は`0.999`。
|
||||||
|
- `--gaeLambda <float>`:GAEのlambda値。デフォルト値は`0.95`。
|
||||||
|
- `--clip-coef <float>`:代替クリッピング係数。デフォルト値は`0.11`。
|
||||||
|
- `--clip-vloss`:論文で述べられている価値関数の損失のクリッピングを使用するかどうか。デフォルトで有効。
|
||||||
|
- `--max-grad-norm <float>`:勾配のクリッピングの最大ノルム。デフォルト値は`0.5`。
|
||||||
|
|
||||||
|
### 環境パラメータ
|
||||||
|
|
||||||
|
- `--target-num <int>`:Targetの種類数。デフォルト値は`4`。
|
||||||
|
- `--env-timelimit <int>`:ラウンドごとの時間制限。デフォルト値は`30`。
|
||||||
|
- `--base-win-reward <int>`:ラウンドに勝った場合の基本報酬。デフォルト値は`999`。
|
||||||
|
- `--base-lose-reward <int>`:ラウンドに負けた場合の基本報酬。デフォルト値は`-999`。
|
||||||
|
- `--target-state-size <int>`:Targetの状態サイズ。デフォルト値は`6`。
|
||||||
|
- `--time-state-size <int>`:ゲームの残り時間の状態サイズ。デフォルト値は`1`。
|
||||||
|
- `--gun-state-size <int>`:銃の状態サイズ。デフォルト値は`1`。
|
||||||
|
- `--my-state-size <int>`:自分の状態サイズ。デフォルト値は`4`。
|
||||||
|
- `--total-target-size <int>`:全Targetの状態サイズ。デフォルト値は`13`。
|
159
Aimbot-PPO-Python/Pytorch/arguments.py
Normal file
159
Aimbot-PPO-Python/Pytorch/arguments.py
Normal file
@ -0,0 +1,159 @@
|
|||||||
|
import argparse
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
from distutils.util import strtobool
|
||||||
|
|
||||||
|
DEFAULT_SEED = 9331
|
||||||
|
ENV_PATH = "../Build/3.6/Aimbot-ParallelEnv"
|
||||||
|
WAND_ENTITY = "koha9"
|
||||||
|
WORKER_ID = 1
|
||||||
|
BASE_PORT = 1000
|
||||||
|
|
||||||
|
# tensorboard names
|
||||||
|
|
||||||
|
# max round steps per agent is 2500/Decision_period, 25 seconds
|
||||||
|
TOTAL_STEPS = 3150000
|
||||||
|
BATCH_SIZE = 512
|
||||||
|
MAX_TRAINNING_DATASETS = 6000
|
||||||
|
DECISION_PERIOD = 1
|
||||||
|
LEARNING_RATE = 5e-5
|
||||||
|
GAMMA = 0.999
|
||||||
|
GAE_LAMBDA = 0.95
|
||||||
|
EPOCHS = 3
|
||||||
|
CLIP_COEF = 0.11
|
||||||
|
LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
|
||||||
|
POLICY_COEF = [0.8, 0.8, 0.8, 0.8]
|
||||||
|
ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
|
||||||
|
CRITIC_COEF = [1.0, 1.0, 1.0, 1.0]
|
||||||
|
TARGET_LEARNING_RATE = 1e-6
|
||||||
|
|
||||||
|
FREEZE_VIEW_NETWORK = False
|
||||||
|
ANNEAL_LEARNING_RATE = True
|
||||||
|
CLIP_VLOSS = True
|
||||||
|
NORM_ADV = False
|
||||||
|
TRAIN = True
|
||||||
|
SAVE_MODEL = True
|
||||||
|
WANDB_TACK = True
|
||||||
|
LOAD_DIR = None
|
||||||
|
# LOAD_DIR = "../PPO-Model/GotoOnly-Level1234_9331_1697122986/8.853553.pt"
|
||||||
|
|
||||||
|
# Unity Environment Parameters
|
||||||
|
TARGET_STATE_SIZE = 6
|
||||||
|
INAREA_STATE_SIZE = 1
|
||||||
|
TIME_STATE_SIZE = 1
|
||||||
|
GUN_STATE_SIZE = 1
|
||||||
|
MY_STATE_SIZE = 4
|
||||||
|
TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
|
||||||
|
BASE_WINREWARD = 999
|
||||||
|
BASE_LOSEREWARD = -999
|
||||||
|
TARGETNUM= 4
|
||||||
|
ENV_TIMELIMIT = 30
|
||||||
|
RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
|
||||||
|
|
||||||
|
save_model_this_episode = False
|
||||||
|
|
||||||
|
def is_save_model():
    """Return the current value of the module-level ``save_model_this_episode`` flag."""
    # Reading a module global needs no ``global`` declaration.
    return save_model_this_episode
|
||||||
|
def set_save_model(save_model: bool):
    """Set the module-level ``save_model_this_episode`` flag and announce the new value."""
    global save_model_this_episode
    print("set save model to ", save_model)
    save_model_this_episode = save_model
|
||||||
|
|
||||||
|
def parse_args(argv=None):
    """Parse the command-line arguments of a training run.

    Args:
        argv: optional list of argument strings. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, as before.

    Returns:
        The ``argparse.Namespace`` of parsed values. The four per-target
        coefficient options (``policy_coef``, ``entropy_coef``,
        ``critic_coef``, ``loss_coef``) are always lists of floats with at
        least ``target_num`` entries.
    """

    def str_to_flag(value):
        # Accepts the spellings understood by strtobool ("true", "0", ...).
        return bool(strtobool(value))

    parser = argparse.ArgumentParser()

    def add_flag(name, default, help_text):
        # Boolean option: a bare "--flag" means True, "--flag false" turns it off.
        parser.add_argument(name, type=str_to_flag, default=default, nargs="?", const=True,
            help=help_text)

    def add_coef(name, default, help_text):
        # Per-target coefficient: one value per target type, or a single
        # value that is broadcast to every target after parsing.
        parser.add_argument(name, type=float, nargs="+", default=default,
            help=help_text)

    # fmt: off
    # pytorch and environment parameters
    parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
        help="seed of the experiment")
    parser.add_argument("--path", type=str, default=ENV_PATH,
        help="enviroment path")
    parser.add_argument("--workerID", type=int, default=WORKER_ID,
        help="unity worker ID")
    parser.add_argument("--baseport", type=int, default=BASE_PORT,
        help="port to connect to Unity environment")
    parser.add_argument("--lr", type=float, default=LEARNING_RATE,
        help="the default learning rate of optimizer")
    add_flag("--cuda", True,
        "if toggled, cuda will be enabled by default")
    parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
        help="total timesteps of the experiments")

    # model parameters
    add_flag("--train", TRAIN,
        "Train Model or not")
    add_flag("--freeze-viewnet", FREEZE_VIEW_NETWORK,
        "freeze view network or not")
    parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
        help="training dataset size,start training while dataset collect enough data")
    parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
        help="nimi batch size")
    parser.add_argument("--epochs", type=int, default=EPOCHS,
        help="the K epochs to update the policy")
    add_flag("--annealLR", ANNEAL_LEARNING_RATE,
        "Toggle learning rate annealing for policy and value networks")
    add_flag("--wandb-track", WANDB_TACK,
        "track on the wandb")
    add_flag("--save-model", SAVE_MODEL,
        "save model or not")
    parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY,
        help="the entity (team) of wandb's project")
    parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
        help="load model directory")
    parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
        help="the number of steps to run in each environment per policy rollout")
    parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
        help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
    # target_learning_rate
    parser.add_argument("--target-lr", type=float, default=TARGET_LEARNING_RATE,
        help="target value of downscaling the learning rate")

    # POLICY_COEF ENTROPY_COEF CRITIC_COEF LOSS_COEF
    add_coef("--policy-coef", POLICY_COEF,
        "coefficient of the policy loss")
    add_coef("--entropy-coef", ENTROPY_COEF,
        "coefficient of the entropy loss")
    add_coef("--critic-coef", CRITIC_COEF,
        "coefficient of the critic loss")
    add_coef("--loss-coef", LOSS_COEF,
        "coefficient of the total loss")

    # GAE loss
    add_flag("--gae", True,
        "Use GAE for advantage computation")
    add_flag("--norm-adv", NORM_ADV,
        "Toggles advantages normalization")
    parser.add_argument("--gamma", type=float, default=GAMMA,
        help="the discount factor gamma")
    parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
        help="the lambda for the general advantage estimation")
    parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
        help="the surrogate clipping coefficient")
    add_flag("--clip-vloss", CLIP_VLOSS,
        "Toggles whether or not to use a clipped loss for the value function, as per the paper.")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
        help="the maximum norm for the gradient clipping")
    parser.add_argument("--target-kl", type=float, default=None,
        help="the target KL divergence threshold")
    # environment parameters
    parser.add_argument("--target-num", type=int, default=TARGETNUM,
        help="the number of targets")
    parser.add_argument("--env-timelimit", type=int, default=ENV_TIMELIMIT,
        help="the time limit of each round")
    parser.add_argument("--base-win-reward", type=int, default=BASE_WINREWARD,
        help="the base reward of win round")
    parser.add_argument("--base-lose-reward", type=int, default=BASE_LOSEREWARD,
        help="the base reward of lose round")
    parser.add_argument("--target-state-size", type=int, default=TARGET_STATE_SIZE,
        help="the size of target state")
    parser.add_argument("--time-state-size", type=int, default=TIME_STATE_SIZE,
        help="the size of time state")
    parser.add_argument("--gun-state-size", type=int, default=GUN_STATE_SIZE,
        help="the size of gun state")
    parser.add_argument("--my-state-size", type=int, default=MY_STATE_SIZE,
        help="the size of my state")
    parser.add_argument("--total-target-size", type=int, default=TOTAL_T_SIZE,
        help="the size of total target state")
    # fmt: on
    args = parser.parse_args(argv)

    # The training code indexes these coefficients by target type, so each
    # one must be a list covering every target.
    for coef_name in ("policy_coef", "entropy_coef", "critic_coef", "loss_coef"):
        coefs = list(getattr(args, coef_name))
        if len(coefs) == 1:
            coefs = coefs * args.target_num
        elif len(coefs) < args.target_num:
            parser.error(
                f"--{coef_name.replace('_', '-')} needs 1 or at least "
                f"{args.target_num} values, got {len(coefs)}"
            )
        setattr(args, coef_name, coefs)
    return args
|
291
Aimbot-PPO-Python/Pytorch/ppoagent.py
Normal file
291
Aimbot-PPO-Python/Pytorch/ppoagent.py
Normal file
@ -0,0 +1,291 @@
|
|||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import argparse
|
||||||
|
import time
|
||||||
|
|
||||||
|
from torch import nn
|
||||||
|
from aimbotEnv import Aimbot
|
||||||
|
from torch.distributions.normal import Normal
|
||||||
|
from torch.distributions.categorical import Categorical
|
||||||
|
|
||||||
|
|
||||||
|
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise ``layer`` in place and return it.

    The weight is filled with an orthogonal matrix scaled by ``std`` and the
    bias with the constant ``bias_const``.
    """
    weight, bias = layer.weight, layer.bias
    nn.init.orthogonal_(weight, gain=std)
    nn.init.constant_(bias, val=bias_const)
    return layer
|
||||||
|
|
||||||
|
neural_size_1 = 400
|
||||||
|
neural_size_2 = 300
|
||||||
|
|
||||||
|
class PPOAgent(nn.Module):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
env: Aimbot,
|
||||||
|
this_args: argparse.Namespace,
|
||||||
|
device: torch.device,
|
||||||
|
):
|
||||||
|
super(PPOAgent, self).__init__()
|
||||||
|
self.device = device
|
||||||
|
self.args = this_args
|
||||||
|
self.train_agent = self.args.train
|
||||||
|
self.target_num = self.args.target_num
|
||||||
|
self.unity_observation_shape = env.unity_observation_shape
|
||||||
|
self.unity_action_size = env.unity_action_size
|
||||||
|
self.state_size = self.unity_observation_shape[0]
|
||||||
|
self.agent_num = env.unity_agent_num
|
||||||
|
|
||||||
|
self.unity_discrete_type = env.unity_discrete_type
|
||||||
|
self.discrete_size = env.unity_discrete_size
|
||||||
|
self.discrete_shape = list(env.unity_discrete_branches)
|
||||||
|
self.continuous_size = env.unity_continuous_size
|
||||||
|
|
||||||
|
self.hidden_networks = nn.ModuleList(
|
||||||
|
[
|
||||||
|
nn.Sequential(
|
||||||
|
layer_init(nn.Linear(self.state_size, neural_size_1)),
|
||||||
|
nn.LeakyReLU(),
|
||||||
|
layer_init(nn.Linear(neural_size_1, neural_size_2)),
|
||||||
|
nn.LeakyReLU(),
|
||||||
|
)
|
||||||
|
for i in range(self.target_num)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
self.actor_dis = nn.ModuleList(
|
||||||
|
[layer_init(nn.Linear(neural_size_2, self.discrete_size), std=0.5) for i in range(self.target_num)]
|
||||||
|
)
|
||||||
|
self.actor_mean = nn.ModuleList(
|
||||||
|
[layer_init(nn.Linear(neural_size_2, self.continuous_size), std=0) for i in range(self.target_num)]
|
||||||
|
)
|
||||||
|
self.actor_logstd = nn.ParameterList(
|
||||||
|
[nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
|
||||||
|
)
|
||||||
|
self.critic = nn.ModuleList(
|
||||||
|
[layer_init(nn.Linear(neural_size_2, 1), std=0) for i in range(self.target_num)]
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_value(self, state: torch.Tensor):
|
||||||
|
# get critic value
|
||||||
|
# state.size()[0] is batch_size
|
||||||
|
target = state[:, 0].to(torch.int32) # int
|
||||||
|
hidden_output = torch.stack(
|
||||||
|
[self.hidden_networks[target[i]](state[i]) for i in range(state.size()[0])]
|
||||||
|
)
|
||||||
|
criticV = torch.stack(
|
||||||
|
[self.critic[target[i]](hidden_output[i]) for i in range(state.size()[0])]
|
||||||
|
)
|
||||||
|
return criticV
|
||||||
|
|
||||||
|
def get_actions_value(self, state: torch.Tensor, actions=None):
|
||||||
|
# get actions and value
|
||||||
|
target = state[:, 0].to(torch.int32) # int
|
||||||
|
hidden_output = torch.stack(
|
||||||
|
[self.hidden_networks[target[i]](state[i]) for i in range(target.size()[0])]
|
||||||
|
)
|
||||||
|
|
||||||
|
# discrete
|
||||||
|
# 递归targets的数量,既agent数来实现根据target不同来选用对应的输出网络计算输出
|
||||||
|
dis_logits = torch.stack(
|
||||||
|
[self.actor_dis[target[i]](hidden_output[i]) for i in range(target.size()[0])]
|
||||||
|
)
|
||||||
|
split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
|
||||||
|
multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
|
||||||
|
# continuous
|
||||||
|
actions_mean = torch.stack(
|
||||||
|
[self.actor_mean[target[i]](hidden_output[i]) for i in range(target.size()[0])]
|
||||||
|
) # self.actor_mean(hidden)
|
||||||
|
action_logstd = torch.stack(
|
||||||
|
[torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(target.size()[0])]
|
||||||
|
)
|
||||||
|
# print(action_logstd)
|
||||||
|
action_std = torch.exp(action_logstd) # torch.exp(action_logstd)
|
||||||
|
con_probs = Normal(actions_mean, action_std)
|
||||||
|
# critic
|
||||||
|
criticV = torch.stack(
|
||||||
|
[self.critic[target[i]](hidden_output[i]) for i in range(target.size()[0])]
|
||||||
|
) # self.critic
|
||||||
|
|
||||||
|
if actions is None:
|
||||||
|
if self.train_agent:
|
||||||
|
# select actions base on probability distribution model
|
||||||
|
dis_act = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
|
||||||
|
con_act = con_probs.sample()
|
||||||
|
actions = torch.cat([dis_act.T, con_act], dim=1)
|
||||||
|
else:
|
||||||
|
# select actions base on best probability distribution
|
||||||
|
dis_act = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
|
||||||
|
con_act = actions_mean
|
||||||
|
actions = torch.cat([dis_act.T, con_act], dim=1)
|
||||||
|
else:
|
||||||
|
dis_act = actions[:, 0: self.unity_discrete_type].T
|
||||||
|
con_act = actions[:, self.unity_discrete_type:]
|
||||||
|
dis_log_prob = torch.stack(
|
||||||
|
[ctgr.log_prob(act) for act, ctgr in zip(dis_act, multi_categoricals)]
|
||||||
|
)
|
||||||
|
dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
|
||||||
|
return (
|
||||||
|
actions,
|
||||||
|
dis_log_prob.sum(0),
|
||||||
|
dis_entropy.sum(0),
|
||||||
|
con_probs.log_prob(con_act).sum(1),
|
||||||
|
con_probs.entropy().sum(1),
|
||||||
|
criticV,
|
||||||
|
)
|
||||||
|
|
||||||
|
def train_net(self, this_train_ind: int, ppo_memories, optimizer) -> tuple:
|
||||||
|
start_time = time.time()
|
||||||
|
# flatten the batch
|
||||||
|
b_obs = ppo_memories.obs[this_train_ind].reshape((-1,) + self.unity_observation_shape)
|
||||||
|
b_dis_logprobs = ppo_memories.dis_logprobs[this_train_ind].reshape(-1)
|
||||||
|
b_con_logprobs = ppo_memories.con_logprobs[this_train_ind].reshape(-1)
|
||||||
|
b_actions = ppo_memories.actions[this_train_ind].reshape((-1,) + (self.unity_action_size,))
|
||||||
|
b_advantages = ppo_memories.advantages[this_train_ind].reshape(-1)
|
||||||
|
b_returns = ppo_memories.returns[this_train_ind].reshape(-1)
|
||||||
|
b_values = ppo_memories.values[this_train_ind].reshape(-1)
|
||||||
|
b_size = b_obs.size()[0]
|
||||||
|
# optimizing the policy and value network
|
||||||
|
b_index = np.arange(b_size)
|
||||||
|
|
||||||
|
for epoch in range(self.args.epochs):
|
||||||
|
print("epoch:", epoch, end="")
|
||||||
|
# shuffle all datasets
|
||||||
|
np.random.shuffle(b_index)
|
||||||
|
for start in range(0, b_size, self.args.minibatchSize):
|
||||||
|
print(".", end="")
|
||||||
|
end = start + self.args.minibatchSize
|
||||||
|
mb_index = b_index[start:end]
|
||||||
|
if np.size(mb_index) <= 1:
|
||||||
|
break
|
||||||
|
mb_advantages = b_advantages[mb_index]
|
||||||
|
|
||||||
|
# normalize advantages
|
||||||
|
if self.args.norm_adv:
|
||||||
|
mb_advantages = (mb_advantages - mb_advantages.mean()) / (
|
||||||
|
mb_advantages.std() + 1e-8
|
||||||
|
)
|
||||||
|
|
||||||
|
(
|
||||||
|
_,
|
||||||
|
new_dis_logprob,
|
||||||
|
dis_entropy,
|
||||||
|
new_con_logprob,
|
||||||
|
con_entropy,
|
||||||
|
new_value,
|
||||||
|
) = self.get_actions_value(b_obs[mb_index], b_actions[mb_index])
|
||||||
|
# discrete ratio
|
||||||
|
dis_log_ratio = new_dis_logprob - b_dis_logprobs[mb_index]
|
||||||
|
dis_ratio = dis_log_ratio.exp()
|
||||||
|
# continuous ratio
|
||||||
|
con_log_ratio = new_con_logprob - b_con_logprobs[mb_index]
|
||||||
|
con_ratio = con_log_ratio.exp()
|
||||||
|
|
||||||
|
"""
|
||||||
|
# early stop
|
||||||
|
with torch.no_grad():
|
||||||
|
# calculate approx_kl http://joschu.net/blog/kl-approx.html
|
||||||
|
old_approx_kl = (-logratio).mean()
|
||||||
|
approx_kl = ((ratio - 1) - logratio).mean()
|
||||||
|
clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
|
||||||
|
"""
|
||||||
|
|
||||||
|
# discrete Policy loss
|
||||||
|
dis_pg_loss_orig = -mb_advantages * dis_ratio
|
||||||
|
dis_pg_loss_clip = -mb_advantages * torch.clamp(
|
||||||
|
dis_ratio, 1 - self.args.clip_coef, 1 + self.args.clip_coef
|
||||||
|
)
|
||||||
|
dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
|
||||||
|
# continuous Policy loss
|
||||||
|
con_pg_loss_orig = -mb_advantages * con_ratio
|
||||||
|
con_pg_loss_clip = -mb_advantages * torch.clamp(
|
||||||
|
con_ratio, 1 - self.args.clip_coef, 1 + self.args.clip_coef
|
||||||
|
)
|
||||||
|
con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
|
||||||
|
|
||||||
|
# Value loss
|
||||||
|
new_value = new_value.view(-1)
|
||||||
|
if self.args.clip_vloss:
|
||||||
|
v_loss_unclipped = (new_value - b_returns[mb_index]) ** 2
|
||||||
|
v_clipped = b_values[mb_index] + torch.clamp(
|
||||||
|
new_value - b_values[mb_index],
|
||||||
|
-self.args.clip_coef,
|
||||||
|
self.args.clip_coef,
|
||||||
|
)
|
||||||
|
v_loss_clipped = (v_clipped - b_returns[mb_index]) ** 2
|
||||||
|
v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
|
||||||
|
v_loss = 0.5 * v_loss_max.mean()
|
||||||
|
else:
|
||||||
|
v_loss = 0.5 * ((new_value - b_returns[mb_index]) ** 2).mean()
|
||||||
|
|
||||||
|
# total loss
|
||||||
|
entropy_loss = dis_entropy.mean() + con_entropy.mean()
|
||||||
|
loss = (
|
||||||
|
dis_pg_loss * self.args.policy_coef[this_train_ind]
|
||||||
|
+ con_pg_loss * self.args.policy_coef[this_train_ind]
|
||||||
|
+ entropy_loss * self.args.entropy_coef[this_train_ind]
|
||||||
|
+ v_loss * self.args.critic_coef[this_train_ind]
|
||||||
|
) * self.args.loss_coef[this_train_ind]
|
||||||
|
|
||||||
|
if torch.isnan(loss).any():
|
||||||
|
print("LOSS Include NAN!!!")
|
||||||
|
if torch.isnan(dis_pg_loss.any()):
|
||||||
|
print("dis_pg_loss include nan")
|
||||||
|
if torch.isnan(con_pg_loss.any()):
|
||||||
|
print("con_pg_loss include nan")
|
||||||
|
if torch.isnan(entropy_loss.any()):
|
||||||
|
print("entropy_loss include nan")
|
||||||
|
if torch.isnan(v_loss.any()):
|
||||||
|
print("v_loss include nan")
|
||||||
|
raise
|
||||||
|
|
||||||
|
optimizer.zero_grad()
|
||||||
|
loss.backward()
|
||||||
|
# Clips gradient norm of an iterable of parameters.
|
||||||
|
nn.utils.clip_grad_norm_(self.parameters(), self.args.max_grad_norm)
|
||||||
|
optimizer.step()
|
||||||
|
|
||||||
|
"""
|
||||||
|
if args.target_kl is not None:
|
||||||
|
if approx_kl > args.target_kl:
|
||||||
|
break
|
||||||
|
"""
|
||||||
|
return v_loss, dis_pg_loss, con_pg_loss, loss, entropy_loss
|
||||||
|
|
||||||
|
def gae(
    self,
    rewards: torch.Tensor,
    dones: torch.Tensor,
    values: torch.Tensor,
    next_obs: torch.Tensor,
    next_done: torch.Tensor,
) -> tuple:
    """Compute advantages and returns for one collected trajectory.

    The trajectory is walked backwards along the leading axis of
    ``rewards``. The value of ``next_obs`` (from ``self.get_value``) is
    used to bootstrap the final step.

    When ``self.args.gae`` is truthy, advantages follow the Generalized
    Advantage Estimation recursion with discount ``self.args.gamma`` and
    smoothing factor ``self.args.gaeLambda``, and returns are
    ``advantages + values``. Otherwise returns are plain discounted
    returns and advantages are ``returns - values``.

    Args:
        rewards: Per-step rewards, indexed by step on the first axis.
        dones: Per-step done flags; ``dones[t + 1]`` masks the bootstrap
            for step ``t``.
        values: Per-step value estimates, indexed like ``rewards``.
        next_obs: Observation following the last stored step.
        next_done: Done flag that goes with ``next_obs``.

    Returns:
        A tuple ``(advantages, returns)``.
    """
    gamma = self.args.gamma
    # NOTE(review): the whole computation is kept under no_grad, matching
    # the usual PPO layout - confirm against the original indentation.
    with torch.no_grad():
        # Bootstrap value for the state after the final stored step.
        bootstrap_value = self.get_value(next_obs).reshape(1, -1)
        last_step = rewards.size()[0] - 1

        if self.args.gae:
            decay = gamma * self.args.gaeLambda
            advantages = torch.zeros_like(rewards).to(self.device)
            running_adv = 0
            for t in range(last_step, -1, -1):
                if t < last_step:
                    not_done = 1.0 - dones[t + 1]
                    value_ahead = values[t + 1]
                else:
                    not_done = 1.0 - next_done
                    value_ahead = bootstrap_value
                # One-step TD error for step t.
                td_error = rewards[t] + gamma * value_ahead * not_done - values[t]
                running_adv = td_error + decay * not_done * running_adv
                advantages[t] = running_adv
            returns = advantages + values
        else:
            returns = torch.zeros_like(rewards).to(self.device)
            for t in range(last_step, -1, -1):
                if t < last_step:
                    not_done = 1.0 - dones[t + 1]
                    return_ahead = returns[t + 1]
                else:
                    not_done = 1.0 - next_done
                    return_ahead = bootstrap_value
                returns[t] = rewards[t] + gamma * not_done * return_ahead
            advantages = returns - values
    return advantages, returns
|
BIN
Aimbot-PPO-Python/Pytorch/pytorch_run_archive.zip
Normal file
BIN
Aimbot-PPO-Python/Pytorch/pytorch_run_archive.zip
Normal file
Binary file not shown.
@ -1,753 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Action, 1 continuous ctrl 2.1\n",
|
|
||||||
"Action, 0 continuous ctrl -1.1\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import gym\n",
|
|
||||||
"from gym.spaces import Dict, Discrete, Box, Tuple\n",
|
|
||||||
"import numpy as np\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"class SampleGym(gym.Env):\n",
|
|
||||||
" def __init__(self, config={}):\n",
|
|
||||||
" self.config = config\n",
|
|
||||||
" self.action_space = Tuple((Discrete(2), Box(-10, 10, (2,))))\n",
|
|
||||||
" self.observation_space = Box(-10, 10, (2, 2))\n",
|
|
||||||
" self.p_done = config.get(\"p_done\", 0.1)\n",
|
|
||||||
"\n",
|
|
||||||
" def reset(self):\n",
|
|
||||||
" return self.observation_space.sample()\n",
|
|
||||||
"\n",
|
|
||||||
" def step(self, action):\n",
|
|
||||||
" chosen_action = action[0]\n",
|
|
||||||
" cnt_control = action[1][chosen_action]\n",
|
|
||||||
"\n",
|
|
||||||
" if chosen_action == 0:\n",
|
|
||||||
" reward = cnt_control\n",
|
|
||||||
" else:\n",
|
|
||||||
" reward = -cnt_control - 1\n",
|
|
||||||
"\n",
|
|
||||||
" print(f\"Action, {chosen_action} continuous ctrl {cnt_control}\")\n",
|
|
||||||
" return (\n",
|
|
||||||
" self.observation_space.sample(),\n",
|
|
||||||
" reward,\n",
|
|
||||||
" bool(np.random.choice([True, False], p=[self.p_done, 1.0 - self.p_done])),\n",
|
|
||||||
" {},\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"if __name__ == \"__main__\":\n",
|
|
||||||
" env = SampleGym()\n",
|
|
||||||
" env.reset()\n",
|
|
||||||
" env.step((1, [-1, 2.1])) # should say use action 1 with 2.1\n",
|
|
||||||
" env.step((0, [-1.1, 2.1])) # should say use action 0 with -1.1"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from mlagents_envs.environment import UnityEnvironment\n",
|
|
||||||
"from gym_unity.envs import UnityToGymWrapper\n",
|
|
||||||
"import numpy as np\n",
|
|
||||||
"\n",
|
|
||||||
"ENV_PATH = \"../Build-ParallelEnv/Aimbot-ParallelEnv\"\n",
|
|
||||||
"WORKER_ID = 1\n",
|
|
||||||
"BASE_PORT = 2002\n",
|
|
||||||
"\n",
|
|
||||||
"env = UnityEnvironment(\n",
|
|
||||||
" file_name=ENV_PATH,\n",
|
|
||||||
" seed=1,\n",
|
|
||||||
" side_channels=[],\n",
|
|
||||||
" worker_id=WORKER_ID,\n",
|
|
||||||
" base_port=BASE_PORT,\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"trackedAgent = 0\n",
|
|
||||||
"env.reset()\n",
|
|
||||||
"BEHA_SPECS = env.behavior_specs\n",
|
|
||||||
"BEHA_NAME = list(BEHA_SPECS)[0]\n",
|
|
||||||
"SPEC = BEHA_SPECS[BEHA_NAME]\n",
|
|
||||||
"print(SPEC)\n",
|
|
||||||
"\n",
|
|
||||||
"decisionSteps, terminalSteps = env.get_steps(BEHA_NAME)\n",
|
|
||||||
"\n",
|
|
||||||
"if trackedAgent in decisionSteps: # ゲーム終了していない場合、環境状態がdecision_stepsに保存される\n",
|
|
||||||
" nextState = decisionSteps[trackedAgent].obs[0]\n",
|
|
||||||
" reward = decisionSteps[trackedAgent].reward\n",
|
|
||||||
" done = False\n",
|
|
||||||
"if trackedAgent in terminalSteps: # ゲーム終了した場合、環境状態がterminal_stepsに保存される\n",
|
|
||||||
" nextState = terminalSteps[trackedAgent].obs[0]\n",
|
|
||||||
" reward = terminalSteps[trackedAgent].reward\n",
|
|
||||||
" done = True\n",
|
|
||||||
"print(decisionSteps.agent_id)\n",
|
|
||||||
"print(terminalSteps)\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"decisionSteps.agent_id [1 2 5 7]\n",
|
|
||||||
"decisionSteps.agent_id_to_index {1: 0, 2: 1, 5: 2, 7: 3}\n",
|
|
||||||
"decisionSteps.reward [0. 0. 0. 0.]\n",
|
|
||||||
"decisionSteps.action_mask [array([[False, False, False],\n",
|
|
||||||
" [False, False, False],\n",
|
|
||||||
" [False, False, False],\n",
|
|
||||||
" [False, False, False]]), array([[False, False, False],\n",
|
|
||||||
" [False, False, False],\n",
|
|
||||||
" [False, False, False],\n",
|
|
||||||
" [False, False, False]]), array([[False, False],\n",
|
|
||||||
" [False, False],\n",
|
|
||||||
" [False, False],\n",
|
|
||||||
" [False, False]])]\n",
|
|
||||||
"decisionSteps.obs [ 0. 0. 0. 0. 0. 0.\n",
|
|
||||||
" 0. 0. 0. 0. 0. 0.\n",
|
|
||||||
" 0. 0. 0. 0. 0. 0.\n",
|
|
||||||
" 0. 0. 0. 0. 0. 0.\n",
|
|
||||||
" 0. 0. 0. 0. 0. 0.\n",
|
|
||||||
" 0. 0. 0. 0. 0. 0.\n",
|
|
||||||
" 0. 0. 0. 0. 0. 0.\n",
|
|
||||||
" 0. 0. 0. 0. 0. 0.\n",
|
|
||||||
" 0. 0. 0. 0. 0. 0.\n",
|
|
||||||
" 0. 0. 0. 0. 0. 0.\n",
|
|
||||||
" 0. 0. -15.994009 1. -26.322788 1.\n",
|
|
||||||
" 1. 1. 1. 1. 1. 2.\n",
|
|
||||||
" 1. 1. 1. 1. 1. 1.\n",
|
|
||||||
" 1. 1.3519633 1.6946528 2.3051548 3.673389 9.067246\n",
|
|
||||||
" 17.521473 21.727095 22.753294 24.167128 25.905216 18.35725\n",
|
|
||||||
" 21.02278 21.053417 0. ]\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"'decisionSteps.obs [array([[-15.994009 , 1. , -26.322788 , 1. , 1. ,\\n 1. , 1. , 1. , 1. , 2. ,\\n 1. , 1. , 1. , 1. , 1. ,\\n 1. , 1. , 1.3519633, 1.6946528, 2.3051548,\\n 3.673389 , 9.067246 , 17.521473 , 21.727095 , 22.753294 ,\\n 24.167128 , 25.905216 , 18.35725 , 21.02278 , 21.053417 ,\\n 0. ],\\n [ -1.8809433, 1. , -25.66834 , 1. , 2. ,\\n 1. , 1. , 1. , 1. , 1. ,\\n 1. , 1. , 1. , 1. , 1. ,\\n 1. , 1. , 16.768637 , 23.414627 , 22.04486 ,\\n 21.050663 , 20.486784 , 20.486784 , 21.050665 , 15.049731 ,\\n 11.578419 , 9.695194 , 20.398016 , 20.368341 , 20.398016 ,\\n...\\n 20.551746 , 20.00118 , 20.001116 , 20.551594 , 21.5222 ,\\n 17.707508 , 14.86889 , 19.914494 , 19.885508 , 19.914463 ,\\n 0. ]], dtype=float32)]'"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 9,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"print(\"decisionSteps.agent_id\",decisionSteps.agent_id)\n",
|
|
||||||
"# decisionSteps.agent_id [1 2 5 7]\n",
|
|
||||||
"print(\"decisionSteps.agent_id_to_index\",decisionSteps.agent_id_to_index)\n",
|
|
||||||
"# decisionSteps.agent_id_to_index {1: 0, 2: 1, 5: 2, 7: 3}\n",
|
|
||||||
"print(\"decisionSteps.reward\",decisionSteps.reward)\n",
|
|
||||||
"# decisionSteps.reward [0. 0. 0. 0.]\n",
|
|
||||||
"print(\"decisionSteps.action_mask\",decisionSteps.action_mask)\n",
|
|
||||||
"'''\n",
|
|
||||||
"decisionSteps.action_mask [array([[False, False, False],\n",
|
|
||||||
" [False, False, False],\n",
|
|
||||||
" [False, False, False],\n",
|
|
||||||
" [False, False, False]]), array([[False, False, False],\n",
|
|
||||||
" [False, False, False],\n",
|
|
||||||
" [False, False, False],\n",
|
|
||||||
" [False, False, False]]), array([[False, False],\n",
|
|
||||||
" [False, False],\n",
|
|
||||||
" [False, False],\n",
|
|
||||||
" [False, False]])]\n",
|
|
||||||
"'''\n",
|
|
||||||
"print(\"decisionSteps.obs\", decisionSteps.obs[0][0])\n",
|
|
||||||
"'''decisionSteps.obs [array([[-15.994009 , 1. , -26.322788 , 1. , 1. ,\n",
|
|
||||||
" 1. , 1. , 1. , 1. , 2. ,\n",
|
|
||||||
" 1. , 1. , 1. , 1. , 1. ,\n",
|
|
||||||
" 1. , 1. , 1.3519633, 1.6946528, 2.3051548,\n",
|
|
||||||
" 3.673389 , 9.067246 , 17.521473 , 21.727095 , 22.753294 ,\n",
|
|
||||||
" 24.167128 , 25.905216 , 18.35725 , 21.02278 , 21.053417 ,\n",
|
|
||||||
" 0. ],\n",
|
|
||||||
" [ -1.8809433, 1. , -25.66834 , 1. , 2. ,\n",
|
|
||||||
" 1. , 1. , 1. , 1. , 1. ,\n",
|
|
||||||
" 1. , 1. , 1. , 1. , 1. ,\n",
|
|
||||||
" 1. , 1. , 16.768637 , 23.414627 , 22.04486 ,\n",
|
|
||||||
" 21.050663 , 20.486784 , 20.486784 , 21.050665 , 15.049731 ,\n",
|
|
||||||
" 11.578419 , 9.695194 , 20.398016 , 20.368341 , 20.398016 ,\n",
|
|
||||||
"...\n",
|
|
||||||
" 20.551746 , 20.00118 , 20.001116 , 20.551594 , 21.5222 ,\n",
|
|
||||||
" 17.707508 , 14.86889 , 19.914494 , 19.885508 , 19.914463 ,\n",
|
|
||||||
" 0. ]], dtype=float32)]'''\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from AimbotEnv import Aimbot\n",
|
|
||||||
"\n",
|
|
||||||
"ENV_PATH = \"../Build-ParallelEnv/Aimbot-ParallelEnv\"\n",
|
|
||||||
"WORKER_ID = 1\n",
|
|
||||||
"BASE_PORT = 2002\n",
|
|
||||||
"\n",
|
|
||||||
"env = Aimbot(envPath=ENV_PATH,workerID= WORKER_ID,basePort= BASE_PORT)\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 8,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"(array([[ 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , -15.994009 , 1. , -26.322788 , 1. ,\n",
|
|
||||||
" 1. , 1. , 1. , 1. , 1. ,\n",
|
|
||||||
" 2. , 1. , 1. , 1. , 1. ,\n",
|
|
||||||
" 1. , 1. , 1. , 1.3519633, 1.6946528,\n",
|
|
||||||
" 2.3051548, 3.673389 , 9.067246 , 17.521473 , 21.727095 ,\n",
|
|
||||||
" 22.753294 , 24.167128 , 25.905216 , 18.35725 , 21.02278 ,\n",
|
|
||||||
" 21.053417 , 0. , -15.994003 , 1. , -26.322784 ,\n",
|
|
||||||
" 1. , 1. , 1. , 1. , 1. ,\n",
|
|
||||||
" 1. , 1. , 1. , 1. , 1. ,\n",
|
|
||||||
" 1. , 1. , 1. , 1. , 1.3519667,\n",
|
|
||||||
" 1.6946585, 2.3051722, 3.6734192, 9.067533 , 21.145092 ,\n",
|
|
||||||
" 21.727148 , 22.753365 , 24.167217 , 25.905317 , 18.358263 ,\n",
|
|
||||||
" 21.022812 , 21.053455 , 0. ],\n",
|
|
||||||
" [ 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , -1.8809433, 1. , -25.66834 , 1. ,\n",
|
|
||||||
" 2. , 1. , 1. , 1. , 1. ,\n",
|
|
||||||
" 1. , 1. , 1. , 1. , 1. ,\n",
|
|
||||||
" 1. , 1. , 1. , 16.768637 , 23.414627 ,\n",
|
|
||||||
" 22.04486 , 21.050663 , 20.486784 , 20.486784 , 21.050665 ,\n",
|
|
||||||
" 15.049731 , 11.578419 , 9.695194 , 20.398016 , 20.368341 ,\n",
|
|
||||||
" 20.398016 , 0. , -1.8809433, 1. , -25.66834 ,\n",
|
|
||||||
" 1. , 1. , 2. , 1. , 1. ,\n",
|
|
||||||
" 1. , 1. , 1. , 1. , 2. ,\n",
|
|
||||||
" 2. , 1. , 1. , 1. , 25.098585 ,\n",
|
|
||||||
" 15.749494 , 22.044899 , 21.050697 , 20.486813 , 20.486813 ,\n",
|
|
||||||
" 21.050694 , 15.049746 , 3.872317 , 3.789325 , 20.398046 ,\n",
|
|
||||||
" 20.368372 , 20.398046 , 0. ],\n",
|
|
||||||
" [ 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , -13.672583 , 1. , -26.479263 , 1. ,\n",
|
|
||||||
" 1. , 1. , 1. , 1. , 1. ,\n",
|
|
||||||
" 1. , 1. , 1. , 1. , 1. ,\n",
|
|
||||||
" 1. , 1. , 1. , 5.3249803, 6.401276 ,\n",
|
|
||||||
" 8.374101 , 12.8657875, 21.302414 , 21.30242 , 21.888742 ,\n",
|
|
||||||
" 22.92251 , 24.346794 , 26.09773 , 21.210114 , 21.179258 ,\n",
|
|
||||||
" 21.210117 , 0. , -13.672583 , 1. , -26.479263 ,\n",
|
|
||||||
" 1. , 1. , 1. , 1. , 1. ,\n",
|
|
||||||
" 1. , 1. , 2. , 1. , 1. ,\n",
|
|
||||||
" 2. , 1. , 1. , 2. , 5.3249855,\n",
|
|
||||||
" 6.4012837, 8.374114 , 12.865807 , 21.302446 , 21.30245 ,\n",
|
|
||||||
" 16.168503 , 22.922543 , 24.346823 , 7.1110754, 21.210148 ,\n",
|
|
||||||
" 21.17929 , 12.495141 , 0. ],\n",
|
|
||||||
" [ 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , 0. , 0. , 0. , 0. ,\n",
|
|
||||||
" 0. , -4.9038744, 1. , -25.185507 , 1. ,\n",
|
|
||||||
" 1. , 1. , 1. , 1. , 1. ,\n",
|
|
||||||
" 1. , 1. , 1. , 1. , 1. ,\n",
|
|
||||||
" 1. , 1. , 1. , 20.33171 , 22.859762 ,\n",
|
|
||||||
" 21.522427 , 20.551746 , 20.00118 , 20.001116 , 20.551594 ,\n",
|
|
||||||
" 21.5222 , 17.707508 , 14.86889 , 19.914494 , 19.885508 ,\n",
|
|
||||||
" 19.914463 , 0. , -4.9038773, 1. , -25.185507 ,\n",
|
|
||||||
" 1. , 2. , 1. , 2. , 1. ,\n",
|
|
||||||
" 1. , 1. , 1. , 2. , 1. ,\n",
|
|
||||||
" 1. , 1. , 1. , 1. , 15.905993 ,\n",
|
|
||||||
" 22.85977 , 11.566693 , 20.551773 , 20.00121 , 20.001146 ,\n",
|
|
||||||
" 20.551619 , 7.135157 , 17.707582 , 14.868943 , 19.914528 ,\n",
|
|
||||||
" 19.88554 , 19.914494 , 0. ]], dtype=float32),\n",
|
|
||||||
" [[-0.05], [-0.05], [-0.05], [-0.05]],\n",
|
|
||||||
" [[False], [False], [False], [False]])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 8,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import numpy as np\n",
|
|
||||||
"env.unity_observation_shape\n",
|
|
||||||
"(128, 4) + env.unity_observation_shape\n",
|
|
||||||
"env.reset()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"tensor([[1, 2, 3],\n",
|
|
||||||
" [1, 2, 3],\n",
|
|
||||||
" [1, 2, 3],\n",
|
|
||||||
" [1, 2, 3]], device='cuda:0')\n",
|
|
||||||
"tensor([[1],\n",
|
|
||||||
" [2],\n",
|
|
||||||
" [3],\n",
|
|
||||||
" [4]], device='cuda:0')\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"tensor([[1, 2, 3, 1],\n",
|
|
||||||
" [1, 2, 3, 2],\n",
|
|
||||||
" [1, 2, 3, 3],\n",
|
|
||||||
" [1, 2, 3, 4]], device='cuda:0')"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import torch\n",
|
|
||||||
"aa = torch.tensor([[1,2,3],[1,2,3],[1,2,3],[1,2,3]]).to(\"cuda:0\")\n",
|
|
||||||
"bb = torch.tensor([[1],[2],[3],[4]]).to(\"cuda:0\")\n",
|
|
||||||
"print(aa)\n",
|
|
||||||
"print(bb)\n",
|
|
||||||
"torch.cat([aa,bb],axis = 1)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 7,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"ename": "AttributeError",
|
|
||||||
"evalue": "Can't get attribute 'PPOAgent' on <module '__main__'>",
|
|
||||||
"output_type": "error",
|
|
||||||
"traceback": [
|
|
||||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
|
||||||
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
|
|
||||||
"\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_31348\\1930153251.py\u001b[0m in \u001b[0;36m<cell line: 2>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mmymodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"../PPO-Model/SmallArea-256-128-hybrid.pt\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mmymodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0meval\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
|
||||||
"\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\torch\\serialization.py\u001b[0m in \u001b[0;36mload\u001b[1;34m(f, map_location, pickle_module, **pickle_load_args)\u001b[0m\n\u001b[0;32m 710\u001b[0m \u001b[0mopened_file\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mseek\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0morig_position\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 711\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopened_file\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 712\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_load\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopened_zipfile\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmap_location\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpickle_module\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mpickle_load_args\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 713\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0m_legacy_load\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopened_file\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmap_location\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpickle_module\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mpickle_load_args\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 714\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
|
|
||||||
"\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\torch\\serialization.py\u001b[0m in \u001b[0;36m_load\u001b[1;34m(zip_file, map_location, pickle_module, pickle_file, **pickle_load_args)\u001b[0m\n\u001b[0;32m 1047\u001b[0m \u001b[0munpickler\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mUnpicklerWrapper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_file\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mpickle_load_args\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1048\u001b[0m \u001b[0munpickler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpersistent_load\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpersistent_load\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1049\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0munpickler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1050\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1051\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_utils\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_loaded_sparse_tensors\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
|
||||||
"\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\torch\\serialization.py\u001b[0m in \u001b[0;36mfind_class\u001b[1;34m(self, mod_name, name)\u001b[0m\n\u001b[0;32m 1040\u001b[0m \u001b[1;32mpass\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1041\u001b[0m \u001b[0mmod_name\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mload_module_mapping\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmod_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmod_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1042\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmod_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1043\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1044\u001b[0m \u001b[1;31m# Load the data (which may in turn use `persistent_load` to load tensors)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
|
||||||
"\u001b[1;31mAttributeError\u001b[0m: Can't get attribute 'PPOAgent' on <module '__main__'>"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import torch\n",
|
|
||||||
"\n",
|
|
||||||
"def layer_init(layer, std=np.sqrt(2), bias_const=0.0):\n",
|
|
||||||
" torch.nn.init.orthogonal_(layer.weight, std)\n",
|
|
||||||
" torch.nn.init.constant_(layer.bias, bias_const)\n",
|
|
||||||
" return layer\n",
|
|
||||||
"\n",
|
|
||||||
"class PPOAgent(nn.Module):\n",
|
|
||||||
" def __init__(self, env: Aimbot):\n",
|
|
||||||
" super(PPOAgent, self).__init__()\n",
|
|
||||||
" self.discrete_size = env.unity_discrete_size\n",
|
|
||||||
" self.discrete_shape = list(env.unity_discrete_branches)\n",
|
|
||||||
" self.continuous_size = env.unity_continuous_size\n",
|
|
||||||
"\n",
|
|
||||||
" self.network = nn.Sequential(\n",
|
|
||||||
" layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 256)),\n",
|
|
||||||
" nn.ReLU(),\n",
|
|
||||||
" layer_init(nn.Linear(256, 128)),\n",
|
|
||||||
" nn.ReLU(),\n",
|
|
||||||
" )\n",
|
|
||||||
" self.actor_dis = layer_init(nn.Linear(128, self.discrete_size), std=0.01)\n",
|
|
||||||
" self.actor_mean = layer_init(nn.Linear(128, self.continuous_size), std=0.01)\n",
|
|
||||||
" self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))\n",
|
|
||||||
" self.critic = layer_init(nn.Linear(128, 1), std=1)\n",
|
|
||||||
"\n",
|
|
||||||
" def get_value(self, state: torch.Tensor):\n",
|
|
||||||
" return self.critic(self.network(state))\n",
|
|
||||||
"\n",
|
|
||||||
" def get_actions_value(self, state: torch.Tensor, actions=None):\n",
|
|
||||||
" hidden = self.network(state)\n",
|
|
||||||
" # discrete\n",
|
|
||||||
" dis_logits = self.actor_dis(hidden)\n",
|
|
||||||
" split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)\n",
|
|
||||||
" multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]\n",
|
|
||||||
" # continuous\n",
|
|
||||||
" actions_mean = self.actor_mean(hidden)\n",
|
|
||||||
" action_logstd = self.actor_logstd.expand_as(actions_mean)\n",
|
|
||||||
" action_std = torch.exp(action_logstd)\n",
|
|
||||||
" con_probs = Normal(actions_mean, action_std)\n",
|
|
||||||
"\n",
|
|
||||||
" if actions is None:\n",
|
|
||||||
" disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])\n",
|
|
||||||
" conAct = con_probs.sample()\n",
|
|
||||||
" actions = torch.cat([disAct.T, conAct], dim=1)\n",
|
|
||||||
" else:\n",
|
|
||||||
" disAct = actions[:, 0 : env.unity_discrete_type].T\n",
|
|
||||||
" conAct = actions[:, env.unity_discrete_type :]\n",
|
|
||||||
" dis_log_prob = torch.stack(\n",
|
|
||||||
" [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]\n",
|
|
||||||
" )\n",
|
|
||||||
" dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])\n",
|
|
||||||
" return (\n",
|
|
||||||
" actions,\n",
|
|
||||||
" dis_log_prob.sum(0),\n",
|
|
||||||
" dis_entropy.sum(0),\n",
|
|
||||||
" con_probs.log_prob(conAct).sum(1),\n",
|
|
||||||
" con_probs.entropy().sum(1),\n",
|
|
||||||
" self.critic(hidden),\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"mymodel = torch.load(\"../PPO-Model/SmallArea-256-128-hybrid.pt\")\n",
|
|
||||||
"mymodel.eval()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import torch\n",
|
|
||||||
"import numpy as np\n",
|
|
||||||
"\n",
|
|
||||||
"x = torch.randn(2, 3).to(\"cuda\")\n",
|
|
||||||
"print(x)\n",
|
|
||||||
"print(torch.cat((x, x, x), 0))\n",
|
|
||||||
"print(torch.cat((x, x, x), 1))\n",
|
|
||||||
"\n",
|
|
||||||
"aa = torch.empty(0).to(\"cuda\")\n",
|
|
||||||
"torch.cat([aa,x])\n",
|
|
||||||
"bb = [[]]*2\n",
|
|
||||||
"print(bb)\n",
|
|
||||||
"bb.append(x.to(\"cpu\").tolist())\n",
|
|
||||||
"bb.append(x.to(\"cpu\").tolist())\n",
|
|
||||||
"print(bb)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 64,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"tensor([[-1.1090, 0.4686, 0.6883],\n",
|
|
||||||
" [-0.1862, -0.3943, -0.0202],\n",
|
|
||||||
" [ 0.1436, -0.9444, -1.2079],\n",
|
|
||||||
" [-2.9434, -2.5989, -0.6653],\n",
|
|
||||||
" [ 0.4668, 0.8548, -0.4641],\n",
|
|
||||||
" [-0.3956, -0.2832, -0.1889],\n",
|
|
||||||
" [-0.2801, -0.2092, 1.7254],\n",
|
|
||||||
" [ 2.7938, -0.7742, 0.7053]], device='cuda:0')\n",
|
|
||||||
"(8, 0)\n",
|
|
||||||
"---\n",
|
|
||||||
"[[array([-1.1090169, 0.4685607, 0.6883437], dtype=float32)], [array([-0.1861974 , -0.39429024, -0.02016036], dtype=float32)], [array([ 0.14360362, -0.9443668 , -1.2079065 ], dtype=float32)], [array([-2.9433894 , -2.598913 , -0.66532046], dtype=float32)], [array([ 0.46684313, 0.8547877 , -0.46408093], dtype=float32)], [array([-0.39563984, -0.2831819 , -0.18891 ], dtype=float32)], [array([-0.28008553, -0.20918302, 1.7253567 ], dtype=float32)], [array([ 2.7938051, -0.7742478, 0.705279 ], dtype=float32)]]\n",
|
|
||||||
"[[array([-1.1090169, 0.4685607, 0.6883437], dtype=float32)], [], [array([ 0.14360362, -0.9443668 , -1.2079065 ], dtype=float32)], [array([-2.9433894 , -2.598913 , -0.66532046], dtype=float32)], [array([ 0.46684313, 0.8547877 , -0.46408093], dtype=float32)], [array([-0.39563984, -0.2831819 , -0.18891 ], dtype=float32)], [array([-0.28008553, -0.20918302, 1.7253567 ], dtype=float32)], [array([ 2.7938051, -0.7742478, 0.705279 ], dtype=float32)]]\n",
|
|
||||||
"---\n",
|
|
||||||
"[array([-1.1090169, 0.4685607, 0.6883437], dtype=float32), array([-1.1090169, 0.4685607, 0.6883437], dtype=float32)]\n",
|
|
||||||
"vvv tensor([[-1.1090, 0.4686, 0.6883],\n",
|
|
||||||
" [-1.1090, 0.4686, 0.6883]], device='cuda:0')\n",
|
|
||||||
"tensor([[-1.1090, 0.4686, 0.6883],\n",
|
|
||||||
" [-1.1090, 0.4686, 0.6883]], device='cuda:0')\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"True"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 64,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import numpy as np\n",
|
|
||||||
"import torch\n",
|
|
||||||
"\n",
|
|
||||||
"agent_num = 8\n",
|
|
||||||
"ob_buffer = [[]for i in range(agent_num)]\n",
|
|
||||||
"obs = torch.randn(8, 3).to(\"cuda\")\n",
|
|
||||||
"print(obs)\n",
|
|
||||||
"print(np.shape(np.array(ob_buffer)))\n",
|
|
||||||
"print('---')\n",
|
|
||||||
"obs_cpu = obs.to(\"cpu\").numpy()\n",
|
|
||||||
"for i in range(agent_num):\n",
|
|
||||||
" ob_buffer[i].append(obs_cpu[i])\n",
|
|
||||||
"print(ob_buffer)\n",
|
|
||||||
"ob_buffer[1] = []\n",
|
|
||||||
"print(ob_buffer)\n",
|
|
||||||
"print('---')\n",
|
|
||||||
"for i in range(agent_num):\n",
|
|
||||||
" ob_buffer[i].append(obs_cpu[i])\n",
|
|
||||||
"print(ob_buffer[0])\n",
|
|
||||||
"vvv = torch.tensor(ob_buffer[0]).to(\"cuda\")\n",
|
|
||||||
"print(\"vvv\",vvv)\n",
|
|
||||||
"empt = torch.tensor([]).to(\"cuda\")\n",
|
|
||||||
"vvvv = torch.cat((empt,vvv),0)\n",
|
|
||||||
"print(vvvv)\n",
|
|
||||||
"vvvv.size()[0]>0"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 6,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from AimbotEnv import Aimbot\n",
|
|
||||||
"from enum import Enum\n",
|
|
||||||
"import uuid\n",
|
|
||||||
"from mlagents_envs.side_channel.side_channel import (\n",
|
|
||||||
" SideChannel,\n",
|
|
||||||
" IncomingMessage,\n",
|
|
||||||
" OutgoingMessage,\n",
|
|
||||||
")\n",
|
|
||||||
"from typing import List\n",
|
|
||||||
"\n",
|
|
||||||
"class Targets(Enum):\n",
|
|
||||||
" Free = 0\n",
|
|
||||||
" Go = 1\n",
|
|
||||||
" Attack = 2\n",
|
|
||||||
" Num = 3\n",
|
|
||||||
"TotalRounds = {\"Go\":0,\"Attack\":0,\"Free\":0}\n",
|
|
||||||
"WinRounds = {\"Go\":0,\"Attack\":0,\"Free\":0}\n",
|
|
||||||
"\n",
|
|
||||||
"class AimbotSideChannel(SideChannel):\n",
"    \"\"\"Side channel carrying messages between this notebook and Unity.\n",
"\n",
"    Received messages are strings split on '|'. A message of the form\n",
"    'result|<mode>|<outcome>' increments TotalRounds[<mode>], and also\n",
"    WinRounds[<mode>] when <outcome> is 'Win'. A message whose first field\n",
"    is 'Error' is printed unchanged. Any other message is ignored.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, channel_id: uuid.UUID) -> None:\n",
"        super().__init__(channel_id)\n",
"\n",
"    def on_message_received(self, msg: IncomingMessage) -> None:\n",
"        \"\"\"\n",
"        Note: We must implement this method of the SideChannel interface to\n",
"        receive messages from Unity\n",
"        \"\"\"\n",
"        thisMessage = msg.read_string()\n",
"        thisResult = thisMessage.split(\"|\")\n",
"        if thisResult[0] == \"result\":\n",
"            # A truncated message or an unknown mode used to raise\n",
"            # IndexError/KeyError from inside this callback; report it instead.\n",
"            mode = thisResult[1] if len(thisResult) >= 3 else None\n",
"            if mode not in TotalRounds or mode not in WinRounds:\n",
"                print(\"Malformed result message: \" + thisMessage)\n",
"                return\n",
"            TotalRounds[mode] += 1\n",
"            if thisResult[2] == \"Win\":\n",
"                WinRounds[mode] += 1\n",
"        elif thisResult[0] == \"Error\":\n",
"            print(thisMessage)\n",
"\n",
"    # Send functions\n",
"    def send_string(self, data: str) -> None:\n",
"        \"\"\"Send a string to C#.\"\"\"\n",
"        msg = OutgoingMessage()\n",
"        msg.write_string(data)\n",
"        super().queue_message_to_send(msg)\n",
"\n",
"    def send_bool(self, data: bool) -> None:\n",
"        \"\"\"Queue a message containing a single bool.\"\"\"\n",
"        msg = OutgoingMessage()\n",
"        msg.write_bool(data)\n",
"        super().queue_message_to_send(msg)\n",
"\n",
"    def send_int(self, data: int) -> None:\n",
"        \"\"\"Queue a message containing a single int32.\"\"\"\n",
"        msg = OutgoingMessage()\n",
"        msg.write_int32(data)\n",
"        super().queue_message_to_send(msg)\n",
"\n",
"    def send_float(self, data: float) -> None:\n",
"        \"\"\"Queue a message containing a single float32.\"\"\"\n",
"        msg = OutgoingMessage()\n",
"        msg.write_float32(data)\n",
"        super().queue_message_to_send(msg)\n",
"\n",
"    def send_float_list(self, data: List[float]) -> None:\n",
"        \"\"\"Queue a message containing a list of float32 values.\"\"\"\n",
"        msg = OutgoingMessage()\n",
"        msg.write_float32_list(data)\n",
"        super().queue_message_to_send(msg)\n",
|
|
||||||
" \n",
|
|
||||||
"# Side-channel id, environment build location, and the environment itself.\n",
"SIDE_CHANNEL_UUID = uuid.UUID(\"8bbfb62a-99b4-457c-879d-b78b69066b5e\")\n",
"ENV_PATH = \"../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward/Aimbot-ParallelEnv\"\n",
"aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID)\n",
"env = Aimbot(\n",
"    envPath=ENV_PATH,\n",
"    workerID=123,\n",
"    basePort=999,\n",
"    side_channels=[aimBotsideChannel],\n",
")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import numpy as np\n",
|
|
||||||
"import torch\n",
|
|
||||||
"import torch.nn as nn\n",
|
|
||||||
"import torch.optim as optim\n",
|
|
||||||
"from AimbotEnv import Aimbot\n",
|
|
||||||
"from torch.distributions.normal import Normal\n",
|
|
||||||
"from torch.distributions.categorical import Categorical\n",
|
|
||||||
"# Run on the GPU when CUDA is available, otherwise on the CPU.\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
|
||||||
"\n",
|
|
||||||
"def layer_init(layer, std=np.sqrt(2), bias_const=0.0):\n",
"    \"\"\"Orthogonally initialise ``layer.weight`` and fill ``layer.bias`` with a constant.\n",
"\n",
"    Args:\n",
"        layer: module exposing ``weight`` and ``bias`` (e.g. ``nn.Linear``).\n",
"        std: gain passed to ``torch.nn.init.orthogonal_``.\n",
"        bias_const: value written to every bias element.\n",
"\n",
"    Returns:\n",
"        The same ``layer`` object, initialised in place.\n",
"    \"\"\"\n",
"    torch.nn.init.orthogonal_(layer.weight, std)\n",
"    # Layers built with bias=False expose ``bias`` as None; skip them instead of raising.\n",
"    if layer.bias is not None:\n",
"        torch.nn.init.constant_(layer.bias, bias_const)\n",
"    return layer\n",
|
|
||||||
"\n",
|
|
||||||
"class PPOAgent(nn.Module):\n",
"    \"\"\"Actor-critic network with discrete and continuous action heads.\n",
"\n",
"    A shared two-layer Tanh trunk feeds a discrete-logit head, a continuous\n",
"    mean head (paired with a learnable, state-independent log-std) and a\n",
"    scalar critic head.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, env: Aimbot,targetNum:int):\n",
"        \"\"\"Size the network from the dimensions reported by ``env``.\n",
"\n",
"        ``targetNum`` is kept for caller compatibility; it is not used here.\n",
"        \"\"\"\n",
"        super(PPOAgent, self).__init__()\n",
"        self.stateSize = env.unity_observation_shape[0]\n",
"\n",
"        self.discrete_size = env.unity_discrete_size\n",
"        self.discrete_shape = list(env.unity_discrete_branches)\n",
"        # Number of leading action columns holding discrete choices. Stored on\n",
"        # the instance so evaluation no longer depends on a global ``env``.\n",
"        self.discrete_type = env.unity_discrete_type\n",
"        self.continuous_size = env.unity_continuous_size\n",
"\n",
"        self.network = nn.Sequential(\n",
"            layer_init(nn.Linear(self.stateSize, 300)),\n",
"            nn.Tanh(),\n",
"            layer_init(nn.Linear(300, 200)),\n",
"            nn.Tanh(),\n",
"        )\n",
"        self.actor_dis = layer_init(nn.Linear(200, self.discrete_size), std=0.5)\n",
"        self.actor_mean = layer_init(nn.Linear(200, self.continuous_size), std=0.5)\n",
"        self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))\n",
"        self.critic = layer_init(nn.Linear(200, 1), std=1)\n",
"\n",
"    def get_value(self, state: torch.Tensor):\n",
"        \"\"\"Return the critic output for ``state``.\"\"\"\n",
"        return self.critic(self.network(state))\n",
"\n",
"    def get_actions_value(self, state: torch.Tensor, actions=None, deterministic: bool = False):\n",
"        \"\"\"Evaluate the policy heads and the critic for a batch of states.\n",
"\n",
"        Args:\n",
"            state: batch of observations fed to the shared trunk.\n",
"            actions: optional batch of actions to evaluate, discrete columns\n",
"                first and continuous columns after. When None, new actions\n",
"                are chosen.\n",
"            deterministic: only used when ``actions`` is None. If True, pick\n",
"                the argmax of each discrete branch and the Gaussian mean\n",
"                instead of sampling. Defaults to sampling.\n",
"\n",
"        Returns:\n",
"            Tuple of (actions, discrete log-prob summed over branches,\n",
"            discrete entropy summed over branches, continuous log-prob summed\n",
"            over dimensions, continuous entropy summed over dimensions,\n",
"            critic output).\n",
"        \"\"\"\n",
"        hidden = self.network(state)\n",
"\n",
"        # discrete\n",
"        dis_logits = self.actor_dis(hidden)\n",
"        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)\n",
"        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]\n",
"        # continuous\n",
"        actions_mean = self.actor_mean(hidden)\n",
"        action_logstd = self.actor_logstd.expand_as(actions_mean)\n",
"        action_std = torch.exp(action_logstd)\n",
"        con_probs = Normal(actions_mean, action_std)\n",
"\n",
"        if actions is None:\n",
"            if deterministic:\n",
"                # select actions base on best probability distribution\n",
"                disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])\n",
"                conAct = actions_mean\n",
"            else:\n",
"                # select actions base on probability distribution model\n",
"                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])\n",
"                conAct = con_probs.sample()\n",
"            actions = torch.cat([disAct.T, conAct], dim=1)\n",
"        else:\n",
"            disAct = actions[:, 0 : self.discrete_type].T\n",
"            conAct = actions[:, self.discrete_type :]\n",
"        dis_log_prob = torch.stack(\n",
"            [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]\n",
"        )\n",
"        dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])\n",
"        return (\n",
"            actions,\n",
"            dis_log_prob.sum(0),\n",
"            dis_entropy.sum(0),\n",
"            con_probs.log_prob(conAct).sum(1),\n",
"            con_probs.entropy().sum(1),\n",
"            self.critic(hidden),\n",
"        )"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 3,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"ppp = \"../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv\"\n",
"env = Aimbot(envPath=ppp, workerID=1, basePort=1000,side_channels=[])\n",
"agent_list = []\n",
"optimizers = []\n",
"# Build three independent agents, each with its own Adam optimizer.\n",
"for _ in range(3):\n",
"    # Use the notebook-wide `device` rather than a hard-coded 'cuda',\n",
"    # which fails on hosts without a GPU.\n",
"    agent = PPOAgent(env=env,targetNum=3).to(device)\n",
"    agent_list.append(agent)\n",
"    optimizers.append(optim.Adam(agent.parameters(),lr=1e-4))"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 5,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"tensor([1., 2., 3., 4., 5.])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 5,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import torch\n",
|
|
||||||
"\n",
|
|
||||||
"# Scratch check: overwrite the first row of an 8x5 zero tensor, then display that row.\n",
"aaa = torch.zeros(8, 5)\n",
"aaa[0] = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0])\n",
"aaa[0]"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3.9.7 64-bit",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.9.7"
|
|
||||||
},
|
|
||||||
"orig_nbformat": 4,
|
|
||||||
"vscode": {
|
|
||||||
"interpreter": {
|
|
||||||
"hash": "86e2db13b09bd6be22cb599ea60c1572b9ef36ebeaa27a4c8e961d6df315ac32"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 2
|
|
||||||
}
|
|
Binary file not shown.
Binary file not shown.
5
Aimbot-PPO-Python/testdebug.py
Normal file
5
Aimbot-PPO-Python/testdebug.py
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# Build the integers 1 through 10 and print the resulting array.
aa = np.arange(1, 11)
print(aa)
|
Loading…
Reference in New Issue
Block a user