diff --git a/Aimbot-PPO-Python/Pytorch/.idea/.gitignore b/Aimbot-PPO-Python/Pytorch/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/Aimbot-PPO-Python/Pytorch/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/Aimbot-PPO-Python/Pytorch/.idea/Pytorch.iml b/Aimbot-PPO-Python/Pytorch/.idea/Pytorch.iml new file mode 100644 index 0000000..c322a37 --- /dev/null +++ b/Aimbot-PPO-Python/Pytorch/.idea/Pytorch.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/Aimbot-PPO-Python/Pytorch/.idea/dictionaries/UCUNI.xml b/Aimbot-PPO-Python/Pytorch/.idea/dictionaries/UCUNI.xml new file mode 100644 index 0000000..0a09ad1 --- /dev/null +++ b/Aimbot-PPO-Python/Pytorch/.idea/dictionaries/UCUNI.xml @@ -0,0 +1,7 @@ + + + + aimbot + + + \ No newline at end of file diff --git a/Aimbot-PPO-Python/Pytorch/.idea/inspectionProfiles/profiles_settings.xml b/Aimbot-PPO-Python/Pytorch/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/Aimbot-PPO-Python/Pytorch/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/Aimbot-PPO-Python/Pytorch/.idea/misc.xml b/Aimbot-PPO-Python/Pytorch/.idea/misc.xml new file mode 100644 index 0000000..8093b2d --- /dev/null +++ b/Aimbot-PPO-Python/Pytorch/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/Aimbot-PPO-Python/Pytorch/.idea/modules.xml b/Aimbot-PPO-Python/Pytorch/.idea/modules.xml new file mode 100644 index 0000000..45a8e5b --- /dev/null +++ b/Aimbot-PPO-Python/Pytorch/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/Aimbot-PPO-Python/Pytorch/.idea/vcs.xml b/Aimbot-PPO-Python/Pytorch/.idea/vcs.xml new file mode 100644 index 0000000..b2bdec2 --- /dev/null +++ b/Aimbot-PPO-Python/Pytorch/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/Aimbot-PPO-Python/Pytorch/AimbotEnv.py b/Aimbot-PPO-Python/Pytorch/AimbotEnv.py index 1a4baca..6f43799 100644 --- a/Aimbot-PPO-Python/Pytorch/AimbotEnv.py +++ b/Aimbot-PPO-Python/Pytorch/AimbotEnv.py @@ -15,19 +15,19 @@ from mlagents_envs.side_channel.side_channel import ( class Aimbot(gym.Env): def __init__( - self, - envPath: str, - workerID: int = 1, - basePort: int = 100, - side_channels: list = [] + self, + env_path: str, + worker_id: int = 1, + base_port: int = 100, + side_channels: list = [] ): super(Aimbot, self).__init__() self.env = UnityEnvironment( - file_name=envPath, + file_name=env_path, seed=1, side_channels=side_channels, - worker_id=workerID, - base_port=basePort, + worker_id=worker_id, + base_port=base_port, ) self.env.reset() # all behavior_specs @@ -41,7 +41,7 @@ class Aimbot(gym.Env): # environment action specs self.unity_action_spec = self.unity_specs.action_spec # environment sample observation - decisionSteps, _ = self.env.get_steps(self.unity_beha_name) + decision_steps, _ = self.env.get_steps(self.unity_beha_name) # OBSERVATION SPECS # environment state shape. like tuple:(93,) @@ -64,31 +64,31 @@ class Aimbot(gym.Env): # AGENT SPECS # all agents ID - self.unity_agent_IDS = decisionSteps.agent_id + self.unity_agent_IDS = decision_steps.agent_id # agents number self.unity_agent_num = len(self.unity_agent_IDS) - def reset(self)->Tuple[np.ndarray, List, List]: - """reset enviroment and get observations + def reset(self) -> Tuple[np.ndarray, List, List]: + """reset environment and get observations Returns: - ndarray: nextState, reward, done, loadDir, saveNow + ndarray: next_state, reward, done, loadDir, saveNow """ # reset env self.env.reset() - nextState, reward, done = self.get_steps() - return nextState, reward, done + next_state, reward, done = self.get_steps() + return next_state, reward, done # TODO: # delete all stack state DONE - # getstep State disassembly function DONE + # get-step State disassembly function DONE # delete agent selection function DONE # self.step action wrapper function DONE def step( - self, - actions: ndarray, - )->Tuple[np.ndarray, List, List]: - """change ations list to ActionTuple then send it to enviroment + self, + actions: ndarray, + ) -> Tuple[np.ndarray, List, List]: + """change actions list to ActionTuple then send it to environment Args: actions (ndarray): PPO chooseAction output action list.(agentNum,actionNum) @@ -96,36 +96,36 @@ class Aimbot(gym.Env): Returns: ndarray: nextState, reward, done """ - # take action to enviroment + # take action to environment # return mextState,reward,done # discrete action if self.unity_dis_act_exist: # create discrete action from actions list - discreteActions = actions[:, 0 : self.unity_discrete_type] + discrete_actions = actions[:, 0: self.unity_discrete_type] else: # create empty discrete action - discreteActions = np.asarray([[0]]) + discrete_actions = np.asarray([[0]]) # continuous action if self.unity_con_act_exist: # create continuous actions from actions list - continuousActions = actions[:, self.unity_discrete_type :] + continuous_actions = actions[:, self.unity_discrete_type:] else: # create empty continuous action - continuousActions = np.asanyarray([[0.0]]) + continuous_actions = np.asanyarray([[0.0]]) # Dummy continuous action # continuousActions = np.asanyarray([[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]) # create actionTuple - thisActionTuple = ActionTuple(continuous=continuousActions, discrete=discreteActions) + this_action_tuple = ActionTuple(continuous=continuous_actions, discrete=discrete_actions) # take action to env - self.env.set_actions(behavior_name=self.unity_beha_name, action=thisActionTuple) + self.env.set_actions(behavior_name=self.unity_beha_name, action=this_action_tuple) self.env.step() # get nextState & reward & done after this action - nextStates, rewards, dones = self.get_steps() - return nextStates, rewards, dones + next_states, rewards, dones = self.get_steps() + return next_states, rewards, dones - def get_steps(self)->Tuple[np.ndarray, List, List]: - """get enviroment now observations. + def get_steps(self) -> Tuple[np.ndarray, List, List]: + """get environment now observations. Include State, Reward, Done Args: @@ -160,6 +160,7 @@ class Aimbot(gym.Env): def close(self): self.env.close() + class AimbotSideChannel(SideChannel): def __init__(self, channel_id: uuid.UUID) -> None: super().__init__(channel_id) @@ -174,13 +175,15 @@ class AimbotSideChannel(SideChannel): """ this_message = msg.read_string() this_result = this_message.split("|") - if(this_result[0] == "result"): - airecorder.total_rounds[this_result[1]]+=1 - if(this_result[2] == "Win"): - airecorder.win_rounds[this_result[1]]+=1 - #print(TotalRounds) - #print(WinRounds) - elif(this_result[0] == "Error"): + print(this_result) + if this_result[0] == "Warning": + if this_result[1] == "Result": + airecorder.total_rounds[this_result[2]] += 1 + if this_result[3] == "Win": + airecorder.win_rounds[this_result[2]] += 1 + # print(TotalRounds) + # print(WinRounds) + elif this_result[0] == "Error": print(this_message) # # while Message type is Warning # if(thisResult[0] == "Warning"): @@ -197,7 +200,8 @@ class AimbotSideChannel(SideChannel): # # while Message type is Error # elif(thisResult[0] == "Error"): # print(thisMessage) - # 发送函数 + + # 发送函数 def send_string(self, data: str) -> None: # send a string toC# msg = OutgoingMessage() @@ -222,4 +226,4 @@ class AimbotSideChannel(SideChannel): def send_float_list(self, data: List[float]) -> None: msg = OutgoingMessage() msg.write_float32_list(data) - super().queue_message_to_send(msg) \ No newline at end of file + super().queue_message_to_send(msg) diff --git a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py index b390b6a..db157d7 100644 --- a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py +++ b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py @@ -32,15 +32,18 @@ if __name__ == "__main__": device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") best_reward = -1 - # Initialize environment anget optimizer - aimbot_sidechannel = AimbotSideChannel(SIDE_CHANNEL_UUID); - env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimbot_sidechannel]) + # Initialize environment agent optimizer + aimbot_side_channel = AimbotSideChannel(SIDE_CHANNEL_UUID) + env = Aimbot(env_path=args.path, + worker_id=args.workerID, + base_port=args.baseport, + side_channels=[aimbot_side_channel]) if args.load_dir is None: agent = PPOAgent( - env = env, + env=env, this_args=args, device=device, - ).to(device) + ).to(device) else: agent = torch.load(args.load_dir) # freeze @@ -48,7 +51,7 @@ if __name__ == "__main__": # freeze the view network for p in agent.viewNetwork.parameters(): p.requires_grad = False - print("VIEW NETWORK FREEZED") + print("VIEW NETWORK FREEZE") print("Load Agent", args.load_dir) print(agent.eval()) # optimizer @@ -57,16 +60,18 @@ if __name__ == "__main__": run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}" wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args) + @atexit.register def save_model(): # close env env.close() if args.save_model: # save model while exit - save_dir = "../PPO-Model/"+ run_name + "_last.pt" + save_dir = "../PPO-Model/" + run_name + "_last.pt" torch.save(agent, save_dir) print("save model to " + save_dir) + # start the game total_update_step = using_targets_num * args.total_timesteps // args.datasetSize target_steps = [0 for i in range(args.target_num)] @@ -77,14 +82,14 @@ if __name__ == "__main__": ppo_memories = PPOMem( args=args, unity_agent_num=env.unity_agent_num, - device = device, + device=device, ) # MAIN LOOP: run agent in environment for total_steps in range(total_update_step): - # discunt learning rate, while step == total_update_step lr will be 0 + # discount learning rate, while step == total_update_step lr will be 0 if args.annealLR: - final_lr_ratio = args.target_lr/args.lr + final_lr_ratio = args.target_lr / args.lr frac = 1.0 - ((total_steps + 1.0) / total_update_step) lr_now = frac * args.lr optimizer.param_groups[0]["lr"] = lr_now @@ -92,13 +97,14 @@ if __name__ == "__main__": lr_now = args.lr # episode start show learning rate - print("new episode",total_steps,"learning rate = ",lr_now) - # MAIN LOOP: run agent in environment + print("new episode", total_steps, "learning rate = ", lr_now) step = 0 training = False train_queue = [] - last_reward = [0.for i in range(env.unity_agent_num)] + last_reward = [0. for i in range(env.unity_agent_num)] + # MAIN LOOP: run agent in environment while True: + # On decision point, choose action by agent if step % args.decision_period == 0: step += 1 # Choose action by agent @@ -119,17 +125,17 @@ if __name__ == "__main__": # save memories ppo_memories.save_memories( - now_step = step, - agent = agent, - state = state, - action_cpu = action_cpu, - dis_logprob_cpu = dis_logprob_cpu, - con_logprob_cpu = con_logprob_cpu, - reward = reward, - done = done, - value_cpu = value_cpu, - last_reward = last_reward, - next_done = next_done, + now_step=step, + agent=agent, + state=state, + action_cpu=action_cpu, + dis_logprob_cpu=dis_logprob_cpu, + con_logprob_cpu=con_logprob_cpu, + reward=reward, + done=done, + value_cpu=value_cpu, + last_reward=last_reward, + next_done=next_done, next_state=next_state, ) # check if any training dataset is full and ready to train @@ -137,7 +143,7 @@ if __name__ == "__main__": if ppo_memories.obs[i].size()[0] >= args.datasetSize: # start train NN train_queue.append(i) - if(len(train_queue)>0): + if len(train_queue) > 0: # break while loop and start train break # update state @@ -148,17 +154,17 @@ if __name__ == "__main__": next_state, reward, next_done = env.step(action_cpu) # save memories ppo_memories.save_memories( - now_step = step, - agent = agent, - state = state, - action_cpu = action_cpu, - dis_logprob_cpu = dis_logprob_cpu, - con_logprob_cpu = con_logprob_cpu, - reward = reward, - done = done, - value_cpu = value_cpu, - last_reward = last_reward, - next_done = next_done, + now_step=step, + agent=agent, + state=state, + action_cpu=action_cpu, + dis_logprob_cpu=dis_logprob_cpu, + con_logprob_cpu=con_logprob_cpu, + reward=reward, + done=done, + value_cpu=value_cpu, + last_reward=last_reward, + next_done=next_done, next_state=next_state, ) # update state @@ -167,12 +173,12 @@ if __name__ == "__main__": if args.train: # train mode on - mean_reward_list = [] # for WANDB - # loop all tarining queue + mean_reward_list = [] # for WANDB + # loop all training queue for this_train_ind in train_queue: - # sart time + # start time start_time = time.time() - target_steps[this_train_ind]+=1 + target_steps[this_train_ind] += 1 # train agent ( v_loss, @@ -180,18 +186,18 @@ if __name__ == "__main__": con_pg_loss, loss, entropy_loss - ) = agent.train_net( - this_train_ind=this_train_ind, - ppo_memories=ppo_memories, - optimizer=optimizer - ) + ) = agent.train_net( + this_train_ind=this_train_ind, + ppo_memories=ppo_memories, + optimizer=optimizer + ) # record mean reward before clear history print("done") - targetRewardMean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy()) - mean_reward_list.append(targetRewardMean) + target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy()) + mean_reward_list.append(target_reward_mean) targetName = Targets(this_train_ind).name - # clear this target trainning set buffer + # clear this target training set buffer ppo_memories.clear_training_datasets(this_train_ind) # record rewards for plotting purposes wdb_recorder.add_target_scalar( @@ -202,10 +208,10 @@ if __name__ == "__main__": con_pg_loss, loss, entropy_loss, - targetRewardMean, + target_reward_mean, target_steps, ) - print(f"episode over Target{targetName} mean reward:", targetRewardMean) + print(f"episode over Target{targetName} mean reward:", target_reward_mean) TotalRewardMean = np.mean(mean_reward_list) wdb_recorder.add_global_scalar( TotalRewardMean, @@ -216,31 +222,32 @@ if __name__ == "__main__": print("cost time:", time.time() - start_time) # New Record! if TotalRewardMean > best_reward and args.save_model: - best_reward = targetRewardMean - saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) + ".pt" + best_reward = target_reward_mean + saveDir = "../PPO-Model/" + run_name + "_" + str(TotalRewardMean) + ".pt" torch.save(agent, saveDir) else: # train mode off - mean_reward_list = [] # for WANDB + mean_reward_list = [] # for WANDB # while not in training mode, clear the buffer for this_train_ind in train_queue: - target_steps[this_train_ind]+=1 + target_steps[this_train_ind] += 1 targetName = Targets(this_train_ind).name - targetRewardMean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy()) - mean_reward_list.append(targetRewardMean) + target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy()) + mean_reward_list.append(target_reward_mean) print(target_steps[this_train_ind]) - # clear this target trainning set buffer + # clear this target training set buffer ppo_memories.clear_training_datasets(this_train_ind) # record rewards for plotting purposes - wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[this_train_ind]) - wdb_recorder.add_win_ratio(targetName,target_steps[this_train_ind]) - print(f"episode over Target{targetName} mean reward:", targetRewardMean) + wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", target_reward_mean, + target_steps[this_train_ind]) + wdb_recorder.add_win_ratio(targetName, target_steps[this_train_ind]) + print(f"episode over Target{targetName} mean reward:", target_reward_mean) TotalRewardMean = np.mean(mean_reward_list) wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps) - saveDir = "../PPO-Model/"+ run_name + "_last.pt" + saveDir = "../PPO-Model/" + run_name + "_last.pt" torch.save(agent, saveDir) env.close() wdb_recorder.writer.close() diff --git a/Aimbot-PPO-Python/Pytorch/airecorder.py b/Aimbot-PPO-Python/Pytorch/airecorder.py index 3cea9df..4218687 100644 --- a/Aimbot-PPO-Python/Pytorch/airecorder.py +++ b/Aimbot-PPO-Python/Pytorch/airecorder.py @@ -1,7 +1,6 @@ -import wandb -import time from torch.utils.tensorboard import SummaryWriter +import wandb total_rounds = {"Free": 0, "Go": 0, "Attack": 0} win_rounds = {"Free": 0, "Go": 0, "Attack": 0} @@ -35,7 +34,7 @@ class WandbRecorder: def add_target_scalar( self, target_name, - thisT, + this_t, v_loss, dis_pg_loss, con_pg_loss, @@ -46,25 +45,25 @@ class WandbRecorder: ): # fmt:off self.writer.add_scalar( - f"Target{target_name}/value_loss", v_loss.item(), target_steps[thisT] + f"Target{target_name}/value_loss", v_loss.item(), target_steps[this_t] ) self.writer.add_scalar( - f"Target{target_name}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT] + f"Target{target_name}/dis_policy_loss", dis_pg_loss.item(), target_steps[this_t] ) self.writer.add_scalar( - f"Target{target_name}/con_policy_loss", con_pg_loss.item(), target_steps[thisT] + f"Target{target_name}/con_policy_loss", con_pg_loss.item(), target_steps[this_t] ) self.writer.add_scalar( - f"Target{target_name}/total_loss", loss.item(), target_steps[thisT] + f"Target{target_name}/total_loss", loss.item(), target_steps[this_t] ) self.writer.add_scalar( - f"Target{target_name}/entropy_loss", entropy_loss.item(), target_steps[thisT] + f"Target{target_name}/entropy_loss", entropy_loss.item(), target_steps[this_t] ) self.writer.add_scalar( - f"Target{target_name}/Reward", target_reward_mean, target_steps[thisT] + f"Target{target_name}/Reward", target_reward_mean, target_steps[this_t] ) self.writer.add_scalar( - f"Target{target_name}/WinRatio", win_rounds[target_name] / total_rounds[target_name], target_steps[thisT], + f"Target{target_name}/WinRatio", win_rounds[target_name] / total_rounds[target_name], target_steps[this_t], ) # fmt:on diff --git a/Aimbot-PPO-Python/Pytorch/arguments.py b/Aimbot-PPO-Python/Pytorch/arguments.py index 78f58f4..65d8540 100644 --- a/Aimbot-PPO-Python/Pytorch/arguments.py +++ b/Aimbot-PPO-Python/Pytorch/arguments.py @@ -4,7 +4,7 @@ import uuid from distutils.util import strtobool DEFAULT_SEED = 9331 -ENV_PATH = "../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv" +ENV_PATH = "../Build/3.1.6/Aimbot-ParallelEnv" WAND_ENTITY = "koha9" WORKER_ID = 1 BASE_PORT = 1000 diff --git a/Aimbot-PPO-Python/Pytorch/ppoagent.py b/Aimbot-PPO-Python/Pytorch/ppoagent.py index bcc041b..dcf405d 100644 --- a/Aimbot-PPO-Python/Pytorch/ppoagent.py +++ b/Aimbot-PPO-Python/Pytorch/ppoagent.py @@ -65,7 +65,8 @@ class PPOAgent(nn.Module): self.actor_mean = nn.ModuleList( [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(self.target_num)] ) - # self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)]) + # self.actor_logstd = + # nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)]) # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size)) self.actor_logstd = nn.ParameterList( [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]