diff --git a/Aimbot-PPO-Python/Pytorch/.idea/.gitignore b/Aimbot-PPO-Python/Pytorch/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/Aimbot-PPO-Python/Pytorch/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/Aimbot-PPO-Python/Pytorch/.idea/Pytorch.iml b/Aimbot-PPO-Python/Pytorch/.idea/Pytorch.iml
new file mode 100644
index 0000000..c322a37
--- /dev/null
+++ b/Aimbot-PPO-Python/Pytorch/.idea/Pytorch.iml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/Aimbot-PPO-Python/Pytorch/.idea/dictionaries/UCUNI.xml b/Aimbot-PPO-Python/Pytorch/.idea/dictionaries/UCUNI.xml
new file mode 100644
index 0000000..0a09ad1
--- /dev/null
+++ b/Aimbot-PPO-Python/Pytorch/.idea/dictionaries/UCUNI.xml
@@ -0,0 +1,7 @@
+<component name="ProjectDictionaryState">
+  <dictionary name="UCUNI">
+    <words>
+      <w>aimbot</w>
+    </words>
+  </dictionary>
+</component>
\ No newline at end of file
diff --git a/Aimbot-PPO-Python/Pytorch/.idea/inspectionProfiles/profiles_settings.xml b/Aimbot-PPO-Python/Pytorch/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/Aimbot-PPO-Python/Pytorch/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
\ No newline at end of file
diff --git a/Aimbot-PPO-Python/Pytorch/.idea/misc.xml b/Aimbot-PPO-Python/Pytorch/.idea/misc.xml
new file mode 100644
index 0000000..8093b2d
--- /dev/null
+++ b/Aimbot-PPO-Python/Pytorch/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/Aimbot-PPO-Python/Pytorch/.idea/modules.xml b/Aimbot-PPO-Python/Pytorch/.idea/modules.xml
new file mode 100644
index 0000000..45a8e5b
--- /dev/null
+++ b/Aimbot-PPO-Python/Pytorch/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/Pytorch.iml" filepath="$PROJECT_DIR$/.idea/Pytorch.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/Aimbot-PPO-Python/Pytorch/.idea/vcs.xml b/Aimbot-PPO-Python/Pytorch/.idea/vcs.xml
new file mode 100644
index 0000000..b2bdec2
--- /dev/null
+++ b/Aimbot-PPO-Python/Pytorch/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/Aimbot-PPO-Python/Pytorch/AimbotEnv.py b/Aimbot-PPO-Python/Pytorch/AimbotEnv.py
index 1a4baca..6f43799 100644
--- a/Aimbot-PPO-Python/Pytorch/AimbotEnv.py
+++ b/Aimbot-PPO-Python/Pytorch/AimbotEnv.py
@@ -15,19 +15,19 @@ from mlagents_envs.side_channel.side_channel import (
class Aimbot(gym.Env):
def __init__(
- self,
- envPath: str,
- workerID: int = 1,
- basePort: int = 100,
- side_channels: list = []
+ self,
+ env_path: str,
+ worker_id: int = 1,
+ base_port: int = 100,
+ side_channels: list = []
):
super(Aimbot, self).__init__()
self.env = UnityEnvironment(
- file_name=envPath,
+ file_name=env_path,
seed=1,
side_channels=side_channels,
- worker_id=workerID,
- base_port=basePort,
+ worker_id=worker_id,
+ base_port=base_port,
)
self.env.reset()
# all behavior_specs
@@ -41,7 +41,7 @@ class Aimbot(gym.Env):
# environment action specs
self.unity_action_spec = self.unity_specs.action_spec
# environment sample observation
- decisionSteps, _ = self.env.get_steps(self.unity_beha_name)
+ decision_steps, _ = self.env.get_steps(self.unity_beha_name)
# OBSERVATION SPECS
# environment state shape. like tuple:(93,)
@@ -64,31 +64,31 @@ class Aimbot(gym.Env):
# AGENT SPECS
# all agents ID
- self.unity_agent_IDS = decisionSteps.agent_id
+ self.unity_agent_IDS = decision_steps.agent_id
# agents number
self.unity_agent_num = len(self.unity_agent_IDS)
- def reset(self)->Tuple[np.ndarray, List, List]:
- """reset enviroment and get observations
+ def reset(self) -> Tuple[np.ndarray, List, List]:
+ """reset environment and get observations
Returns:
- ndarray: nextState, reward, done, loadDir, saveNow
+            ndarray: next_state, reward, done
"""
# reset env
self.env.reset()
- nextState, reward, done = self.get_steps()
- return nextState, reward, done
+ next_state, reward, done = self.get_steps()
+ return next_state, reward, done
# TODO:
# delete all stack state DONE
- # getstep State disassembly function DONE
+ # get-step State disassembly function DONE
# delete agent selection function DONE
# self.step action wrapper function DONE
def step(
- self,
- actions: ndarray,
- )->Tuple[np.ndarray, List, List]:
- """change ations list to ActionTuple then send it to enviroment
+ self,
+ actions: ndarray,
+ ) -> Tuple[np.ndarray, List, List]:
+ """change actions list to ActionTuple then send it to environment
Args:
actions (ndarray): PPO chooseAction output action list.(agentNum,actionNum)
@@ -96,36 +96,36 @@ class Aimbot(gym.Env):
Returns:
ndarray: nextState, reward, done
"""
- # take action to enviroment
+ # take action to environment
# return mextState,reward,done
# discrete action
if self.unity_dis_act_exist:
# create discrete action from actions list
- discreteActions = actions[:, 0 : self.unity_discrete_type]
+ discrete_actions = actions[:, 0: self.unity_discrete_type]
else:
# create empty discrete action
- discreteActions = np.asarray([[0]])
+ discrete_actions = np.asarray([[0]])
# continuous action
if self.unity_con_act_exist:
# create continuous actions from actions list
- continuousActions = actions[:, self.unity_discrete_type :]
+ continuous_actions = actions[:, self.unity_discrete_type:]
else:
# create empty continuous action
- continuousActions = np.asanyarray([[0.0]])
+ continuous_actions = np.asanyarray([[0.0]])
# Dummy continuous action
# continuousActions = np.asanyarray([[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]])
# create actionTuple
- thisActionTuple = ActionTuple(continuous=continuousActions, discrete=discreteActions)
+ this_action_tuple = ActionTuple(continuous=continuous_actions, discrete=discrete_actions)
# take action to env
- self.env.set_actions(behavior_name=self.unity_beha_name, action=thisActionTuple)
+ self.env.set_actions(behavior_name=self.unity_beha_name, action=this_action_tuple)
self.env.step()
# get nextState & reward & done after this action
- nextStates, rewards, dones = self.get_steps()
- return nextStates, rewards, dones
+ next_states, rewards, dones = self.get_steps()
+ return next_states, rewards, dones
- def get_steps(self)->Tuple[np.ndarray, List, List]:
- """get enviroment now observations.
+ def get_steps(self) -> Tuple[np.ndarray, List, List]:
+        """get current environment observations.
Include State, Reward, Done
Args:
@@ -160,6 +160,7 @@ class Aimbot(gym.Env):
def close(self):
self.env.close()
+
class AimbotSideChannel(SideChannel):
def __init__(self, channel_id: uuid.UUID) -> None:
super().__init__(channel_id)
@@ -174,13 +175,15 @@ class AimbotSideChannel(SideChannel):
"""
this_message = msg.read_string()
this_result = this_message.split("|")
- if(this_result[0] == "result"):
- airecorder.total_rounds[this_result[1]]+=1
- if(this_result[2] == "Win"):
- airecorder.win_rounds[this_result[1]]+=1
- #print(TotalRounds)
- #print(WinRounds)
- elif(this_result[0] == "Error"):
+ print(this_result)
+ if this_result[0] == "Warning":
+ if this_result[1] == "Result":
+ airecorder.total_rounds[this_result[2]] += 1
+ if this_result[3] == "Win":
+ airecorder.win_rounds[this_result[2]] += 1
+ # print(TotalRounds)
+ # print(WinRounds)
+ elif this_result[0] == "Error":
print(this_message)
# # while Message type is Warning
# if(thisResult[0] == "Warning"):
@@ -197,7 +200,8 @@ class AimbotSideChannel(SideChannel):
# # while Message type is Error
# elif(thisResult[0] == "Error"):
# print(thisMessage)
- # 发送函数
+
+    # send function
def send_string(self, data: str) -> None:
# send a string toC#
msg = OutgoingMessage()
@@ -222,4 +226,4 @@ class AimbotSideChannel(SideChannel):
def send_float_list(self, data: List[float]) -> None:
msg = OutgoingMessage()
msg.write_float32_list(data)
- super().queue_message_to_send(msg)
\ No newline at end of file
+ super().queue_message_to_send(msg)
diff --git a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
index b390b6a..db157d7 100644
--- a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
+++ b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
@@ -32,15 +32,18 @@ if __name__ == "__main__":
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
best_reward = -1
- # Initialize environment anget optimizer
- aimbot_sidechannel = AimbotSideChannel(SIDE_CHANNEL_UUID);
- env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimbot_sidechannel])
+    # Initialize environment, agent and optimizer
+ aimbot_side_channel = AimbotSideChannel(SIDE_CHANNEL_UUID)
+ env = Aimbot(env_path=args.path,
+ worker_id=args.workerID,
+ base_port=args.baseport,
+ side_channels=[aimbot_side_channel])
if args.load_dir is None:
agent = PPOAgent(
- env = env,
+ env=env,
this_args=args,
device=device,
- ).to(device)
+ ).to(device)
else:
agent = torch.load(args.load_dir)
# freeze
@@ -48,7 +51,7 @@ if __name__ == "__main__":
# freeze the view network
for p in agent.viewNetwork.parameters():
p.requires_grad = False
- print("VIEW NETWORK FREEZED")
+            print("VIEW NETWORK FROZEN")
print("Load Agent", args.load_dir)
print(agent.eval())
# optimizer
@@ -57,16 +60,18 @@ if __name__ == "__main__":
run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
+
@atexit.register
def save_model():
# close env
env.close()
if args.save_model:
# save model while exit
- save_dir = "../PPO-Model/"+ run_name + "_last.pt"
+ save_dir = "../PPO-Model/" + run_name + "_last.pt"
torch.save(agent, save_dir)
print("save model to " + save_dir)
+
# start the game
total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
target_steps = [0 for i in range(args.target_num)]
@@ -77,14 +82,14 @@ if __name__ == "__main__":
ppo_memories = PPOMem(
args=args,
unity_agent_num=env.unity_agent_num,
- device = device,
+ device=device,
)
# MAIN LOOP: run agent in environment
for total_steps in range(total_update_step):
- # discunt learning rate, while step == total_update_step lr will be 0
+        # anneal learning rate; lr reaches 0 when step == total_update_step
if args.annealLR:
- final_lr_ratio = args.target_lr/args.lr
+ final_lr_ratio = args.target_lr / args.lr
frac = 1.0 - ((total_steps + 1.0) / total_update_step)
lr_now = frac * args.lr
optimizer.param_groups[0]["lr"] = lr_now
@@ -92,13 +97,14 @@ if __name__ == "__main__":
lr_now = args.lr
# episode start show learning rate
- print("new episode",total_steps,"learning rate = ",lr_now)
- # MAIN LOOP: run agent in environment
+ print("new episode", total_steps, "learning rate = ", lr_now)
step = 0
training = False
train_queue = []
- last_reward = [0.for i in range(env.unity_agent_num)]
+ last_reward = [0. for i in range(env.unity_agent_num)]
+ # MAIN LOOP: run agent in environment
while True:
+            # At each decision point, choose action by agent
if step % args.decision_period == 0:
step += 1
# Choose action by agent
@@ -119,17 +125,17 @@ if __name__ == "__main__":
# save memories
ppo_memories.save_memories(
- now_step = step,
- agent = agent,
- state = state,
- action_cpu = action_cpu,
- dis_logprob_cpu = dis_logprob_cpu,
- con_logprob_cpu = con_logprob_cpu,
- reward = reward,
- done = done,
- value_cpu = value_cpu,
- last_reward = last_reward,
- next_done = next_done,
+ now_step=step,
+ agent=agent,
+ state=state,
+ action_cpu=action_cpu,
+ dis_logprob_cpu=dis_logprob_cpu,
+ con_logprob_cpu=con_logprob_cpu,
+ reward=reward,
+ done=done,
+ value_cpu=value_cpu,
+ last_reward=last_reward,
+ next_done=next_done,
next_state=next_state,
)
# check if any training dataset is full and ready to train
@@ -137,7 +143,7 @@ if __name__ == "__main__":
if ppo_memories.obs[i].size()[0] >= args.datasetSize:
# start train NN
train_queue.append(i)
- if(len(train_queue)>0):
+ if len(train_queue) > 0:
# break while loop and start train
break
# update state
@@ -148,17 +154,17 @@ if __name__ == "__main__":
next_state, reward, next_done = env.step(action_cpu)
# save memories
ppo_memories.save_memories(
- now_step = step,
- agent = agent,
- state = state,
- action_cpu = action_cpu,
- dis_logprob_cpu = dis_logprob_cpu,
- con_logprob_cpu = con_logprob_cpu,
- reward = reward,
- done = done,
- value_cpu = value_cpu,
- last_reward = last_reward,
- next_done = next_done,
+ now_step=step,
+ agent=agent,
+ state=state,
+ action_cpu=action_cpu,
+ dis_logprob_cpu=dis_logprob_cpu,
+ con_logprob_cpu=con_logprob_cpu,
+ reward=reward,
+ done=done,
+ value_cpu=value_cpu,
+ last_reward=last_reward,
+ next_done=next_done,
next_state=next_state,
)
# update state
@@ -167,12 +173,12 @@ if __name__ == "__main__":
if args.train:
# train mode on
- mean_reward_list = [] # for WANDB
- # loop all tarining queue
+ mean_reward_list = [] # for WANDB
+ # loop all training queue
for this_train_ind in train_queue:
- # sart time
+ # start time
start_time = time.time()
- target_steps[this_train_ind]+=1
+ target_steps[this_train_ind] += 1
# train agent
(
v_loss,
@@ -180,18 +186,18 @@ if __name__ == "__main__":
con_pg_loss,
loss,
entropy_loss
- ) = agent.train_net(
- this_train_ind=this_train_ind,
- ppo_memories=ppo_memories,
- optimizer=optimizer
- )
+ ) = agent.train_net(
+ this_train_ind=this_train_ind,
+ ppo_memories=ppo_memories,
+ optimizer=optimizer
+ )
# record mean reward before clear history
print("done")
- targetRewardMean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
- mean_reward_list.append(targetRewardMean)
+ target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
+ mean_reward_list.append(target_reward_mean)
targetName = Targets(this_train_ind).name
- # clear this target trainning set buffer
+ # clear this target training set buffer
ppo_memories.clear_training_datasets(this_train_ind)
# record rewards for plotting purposes
wdb_recorder.add_target_scalar(
@@ -202,10 +208,10 @@ if __name__ == "__main__":
con_pg_loss,
loss,
entropy_loss,
- targetRewardMean,
+ target_reward_mean,
target_steps,
)
- print(f"episode over Target{targetName} mean reward:", targetRewardMean)
+ print(f"episode over Target{targetName} mean reward:", target_reward_mean)
TotalRewardMean = np.mean(mean_reward_list)
wdb_recorder.add_global_scalar(
TotalRewardMean,
@@ -216,31 +222,32 @@ if __name__ == "__main__":
print("cost time:", time.time() - start_time)
# New Record!
if TotalRewardMean > best_reward and args.save_model:
- best_reward = targetRewardMean
- saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) + ".pt"
+                best_reward = TotalRewardMean
+ saveDir = "../PPO-Model/" + run_name + "_" + str(TotalRewardMean) + ".pt"
torch.save(agent, saveDir)
else:
# train mode off
- mean_reward_list = [] # for WANDB
+ mean_reward_list = [] # for WANDB
# while not in training mode, clear the buffer
for this_train_ind in train_queue:
- target_steps[this_train_ind]+=1
+ target_steps[this_train_ind] += 1
targetName = Targets(this_train_ind).name
- targetRewardMean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
- mean_reward_list.append(targetRewardMean)
+ target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
+ mean_reward_list.append(target_reward_mean)
print(target_steps[this_train_ind])
- # clear this target trainning set buffer
+ # clear this target training set buffer
ppo_memories.clear_training_datasets(this_train_ind)
# record rewards for plotting purposes
- wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[this_train_ind])
- wdb_recorder.add_win_ratio(targetName,target_steps[this_train_ind])
- print(f"episode over Target{targetName} mean reward:", targetRewardMean)
+ wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", target_reward_mean,
+ target_steps[this_train_ind])
+ wdb_recorder.add_win_ratio(targetName, target_steps[this_train_ind])
+ print(f"episode over Target{targetName} mean reward:", target_reward_mean)
TotalRewardMean = np.mean(mean_reward_list)
wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
- saveDir = "../PPO-Model/"+ run_name + "_last.pt"
+ saveDir = "../PPO-Model/" + run_name + "_last.pt"
torch.save(agent, saveDir)
env.close()
wdb_recorder.writer.close()
diff --git a/Aimbot-PPO-Python/Pytorch/airecorder.py b/Aimbot-PPO-Python/Pytorch/airecorder.py
index 3cea9df..4218687 100644
--- a/Aimbot-PPO-Python/Pytorch/airecorder.py
+++ b/Aimbot-PPO-Python/Pytorch/airecorder.py
@@ -1,7 +1,6 @@
-import wandb
-import time
from torch.utils.tensorboard import SummaryWriter
+import wandb
total_rounds = {"Free": 0, "Go": 0, "Attack": 0}
win_rounds = {"Free": 0, "Go": 0, "Attack": 0}
@@ -35,7 +34,7 @@ class WandbRecorder:
def add_target_scalar(
self,
target_name,
- thisT,
+ this_t,
v_loss,
dis_pg_loss,
con_pg_loss,
@@ -46,25 +45,25 @@ class WandbRecorder:
):
# fmt:off
self.writer.add_scalar(
- f"Target{target_name}/value_loss", v_loss.item(), target_steps[thisT]
+ f"Target{target_name}/value_loss", v_loss.item(), target_steps[this_t]
)
self.writer.add_scalar(
- f"Target{target_name}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT]
+ f"Target{target_name}/dis_policy_loss", dis_pg_loss.item(), target_steps[this_t]
)
self.writer.add_scalar(
- f"Target{target_name}/con_policy_loss", con_pg_loss.item(), target_steps[thisT]
+ f"Target{target_name}/con_policy_loss", con_pg_loss.item(), target_steps[this_t]
)
self.writer.add_scalar(
- f"Target{target_name}/total_loss", loss.item(), target_steps[thisT]
+ f"Target{target_name}/total_loss", loss.item(), target_steps[this_t]
)
self.writer.add_scalar(
- f"Target{target_name}/entropy_loss", entropy_loss.item(), target_steps[thisT]
+ f"Target{target_name}/entropy_loss", entropy_loss.item(), target_steps[this_t]
)
self.writer.add_scalar(
- f"Target{target_name}/Reward", target_reward_mean, target_steps[thisT]
+ f"Target{target_name}/Reward", target_reward_mean, target_steps[this_t]
)
self.writer.add_scalar(
- f"Target{target_name}/WinRatio", win_rounds[target_name] / total_rounds[target_name], target_steps[thisT],
+ f"Target{target_name}/WinRatio", win_rounds[target_name] / total_rounds[target_name], target_steps[this_t],
)
# fmt:on
diff --git a/Aimbot-PPO-Python/Pytorch/arguments.py b/Aimbot-PPO-Python/Pytorch/arguments.py
index 78f58f4..65d8540 100644
--- a/Aimbot-PPO-Python/Pytorch/arguments.py
+++ b/Aimbot-PPO-Python/Pytorch/arguments.py
@@ -4,7 +4,7 @@ import uuid
from distutils.util import strtobool
DEFAULT_SEED = 9331
-ENV_PATH = "../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv"
+ENV_PATH = "../Build/3.1.6/Aimbot-ParallelEnv"
WAND_ENTITY = "koha9"
WORKER_ID = 1
BASE_PORT = 1000
diff --git a/Aimbot-PPO-Python/Pytorch/ppoagent.py b/Aimbot-PPO-Python/Pytorch/ppoagent.py
index bcc041b..dcf405d 100644
--- a/Aimbot-PPO-Python/Pytorch/ppoagent.py
+++ b/Aimbot-PPO-Python/Pytorch/ppoagent.py
@@ -65,7 +65,8 @@ class PPOAgent(nn.Module):
self.actor_mean = nn.ModuleList(
[layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(self.target_num)]
)
- # self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
+ # self.actor_logstd =
+ # nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
# self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
self.actor_logstd = nn.ParameterList(
[nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]