Match the V3.1.6 training mode
Mainly modifies the SideChannel to match the V3.1.6 training mode, and regularizes naming (camelCase to snake_case)
parent be1322381e
commit f9ee51c256
Aimbot-PPO-Python/Pytorch/.idea/.gitignore (generated, vendored, new file, +3)
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
Aimbot-PPO-Python/Pytorch/.idea/Pytorch.iml (generated, new file, +8)
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="jdk" jdkName="mlagents39" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
Aimbot-PPO-Python/Pytorch/.idea/dictionaries/UCUNI.xml (generated, new file, +7)
@@ -0,0 +1,7 @@
+<component name="ProjectDictionaryState">
+  <dictionary name="UCUNI">
+    <words>
+      <w>aimbot</w>
+    </words>
+  </dictionary>
+</component>
Aimbot-PPO-Python/Pytorch/.idea/inspectionProfiles/profiles_settings.xml (generated, new file, +6)
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
Aimbot-PPO-Python/Pytorch/.idea/misc.xml (generated, new file, +4)
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="mlagents39" project-jdk-type="Python SDK" />
+</project>
Aimbot-PPO-Python/Pytorch/.idea/modules.xml (generated, new file, +8)
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/Pytorch.iml" filepath="$PROJECT_DIR$/.idea/Pytorch.iml" />
+    </modules>
+  </component>
+</project>
Aimbot-PPO-Python/Pytorch/.idea/vcs.xml (generated, new file, +6)
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
+  </component>
+</project>
@@ -15,19 +15,19 @@ from mlagents_envs.side_channel.side_channel import (

 class Aimbot(gym.Env):
     def __init__(
-        self,
-        envPath: str,
-        workerID: int = 1,
-        basePort: int = 100,
-        side_channels: list = []
+        self,
+        env_path: str,
+        worker_id: int = 1,
+        base_port: int = 100,
+        side_channels: list = []
     ):
         super(Aimbot, self).__init__()
         self.env = UnityEnvironment(
-            file_name=envPath,
+            file_name=env_path,
             seed=1,
             side_channels=side_channels,
-            worker_id=workerID,
-            base_port=basePort,
+            worker_id=worker_id,
+            base_port=base_port,
         )
         self.env.reset()
         # all behavior_specs
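For reference, a minimal usage sketch of the renamed constructor and reset(), assembled from the defaults in the arguments diff further down; the import locations of Aimbot, AimbotSideChannel, and SIDE_CHANNEL_UUID are not shown in this commit and are assumed here:

# assumed: Aimbot and AimbotSideChannel are imported from this repo's environment
# module, SIDE_CHANNEL_UUID from its arguments module
aimbot_side_channel = AimbotSideChannel(SIDE_CHANNEL_UUID)
env = Aimbot(env_path="../Build/3.1.6/Aimbot-ParallelEnv",   # ENV_PATH default
             worker_id=1,                                     # WORKER_ID default
             base_port=1000,                                   # BASE_PORT default
             side_channels=[aimbot_side_channel])
next_state, reward, done = env.reset()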
@@ -41,7 +41,7 @@ class Aimbot(gym.Env):
         # environment action specs
         self.unity_action_spec = self.unity_specs.action_spec
         # environment sample observation
-        decisionSteps, _ = self.env.get_steps(self.unity_beha_name)
+        decision_steps, _ = self.env.get_steps(self.unity_beha_name)

         # OBSERVATION SPECS
         # environment state shape. like tuple:(93,)
@@ -64,31 +64,31 @@ class Aimbot(gym.Env):

         # AGENT SPECS
         # all agents ID
-        self.unity_agent_IDS = decisionSteps.agent_id
+        self.unity_agent_IDS = decision_steps.agent_id
         # agents number
         self.unity_agent_num = len(self.unity_agent_IDS)

-    def reset(self)->Tuple[np.ndarray, List, List]:
-        """reset enviroment and get observations
+    def reset(self) -> Tuple[np.ndarray, List, List]:
+        """reset environment and get observations

         Returns:
-            ndarray: nextState, reward, done, loadDir, saveNow
+            ndarray: next_state, reward, done, loadDir, saveNow
         """
         # reset env
         self.env.reset()
-        nextState, reward, done = self.get_steps()
-        return nextState, reward, done
+        next_state, reward, done = self.get_steps()
+        return next_state, reward, done

     # TODO:
     # delete all stack state DONE
-    # getstep State disassembly function DONE
+    # get-step State disassembly function DONE
     # delete agent selection function DONE
     # self.step action wrapper function DONE
     def step(
-        self,
-        actions: ndarray,
-    )->Tuple[np.ndarray, List, List]:
-        """change ations list to ActionTuple then send it to enviroment
+        self,
+        actions: ndarray,
+    ) -> Tuple[np.ndarray, List, List]:
+        """change actions list to ActionTuple then send it to environment

         Args:
             actions (ndarray): PPO chooseAction output action list.(agentNum,actionNum)
@@ -96,36 +96,36 @@ class Aimbot(gym.Env):
         Returns:
             ndarray: nextState, reward, done
         """
-        # take action to enviroment
+        # take action to environment
         # return mextState,reward,done
         # discrete action
         if self.unity_dis_act_exist:
             # create discrete action from actions list
-            discreteActions = actions[:, 0 : self.unity_discrete_type]
+            discrete_actions = actions[:, 0: self.unity_discrete_type]
         else:
             # create empty discrete action
-            discreteActions = np.asarray([[0]])
+            discrete_actions = np.asarray([[0]])
         # continuous action
         if self.unity_con_act_exist:
             # create continuous actions from actions list
-            continuousActions = actions[:, self.unity_discrete_type :]
+            continuous_actions = actions[:, self.unity_discrete_type:]
         else:
             # create empty continuous action
-            continuousActions = np.asanyarray([[0.0]])
+            continuous_actions = np.asanyarray([[0.0]])

         # Dummy continuous action
         # continuousActions = np.asanyarray([[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]])
         # create actionTuple
-        thisActionTuple = ActionTuple(continuous=continuousActions, discrete=discreteActions)
+        this_action_tuple = ActionTuple(continuous=continuous_actions, discrete=discrete_actions)
         # take action to env
-        self.env.set_actions(behavior_name=self.unity_beha_name, action=thisActionTuple)
+        self.env.set_actions(behavior_name=self.unity_beha_name, action=this_action_tuple)
         self.env.step()
         # get nextState & reward & done after this action
-        nextStates, rewards, dones = self.get_steps()
-        return nextStates, rewards, dones
+        next_states, rewards, dones = self.get_steps()
+        return next_states, rewards, dones

-    def get_steps(self)->Tuple[np.ndarray, List, List]:
-        """get enviroment now observations.
+    def get_steps(self) -> Tuple[np.ndarray, List, List]:
+        """get environment now observations.
         Include State, Reward, Done

         Args:
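To make the action layout concrete: step() expects one row per agent, with the discrete branches in the first self.unity_discrete_type columns and the continuous values after them. A small illustrative sketch; the 2-agent, 4-discrete, 2-continuous split is an assumed example, not a value from the commit:

import numpy as np

# assumed example: 2 agents, 4 discrete branches, 2 continuous values per agent
unity_discrete_type = 4
actions = np.array([[1, 0, 2, 1, 0.3, -0.7],
                    [0, 1, 0, 2, -0.1, 0.9]])
discrete_actions = actions[:, 0:unity_discrete_type]    # shape (2, 4)
continuous_actions = actions[:, unity_discrete_type:]   # shape (2, 2)
# these two arrays are what ActionTuple(continuous=..., discrete=...) receives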
@@ -160,6 +160,7 @@ class Aimbot(gym.Env):
     def close(self):
         self.env.close()

+
 class AimbotSideChannel(SideChannel):
     def __init__(self, channel_id: uuid.UUID) -> None:
         super().__init__(channel_id)
@@ -174,13 +175,15 @@ class AimbotSideChannel(SideChannel):
         """
         this_message = msg.read_string()
         this_result = this_message.split("|")
-        if(this_result[0] == "result"):
-            airecorder.total_rounds[this_result[1]]+=1
-            if(this_result[2] == "Win"):
-                airecorder.win_rounds[this_result[1]]+=1
-            #print(TotalRounds)
-            #print(WinRounds)
-        elif(this_result[0] == "Error"):
-            print(this_result)
+        if this_result[0] == "Warning":
+            if this_result[1] == "Result":
+                airecorder.total_rounds[this_result[2]] += 1
+                if this_result[3] == "Win":
+                    airecorder.win_rounds[this_result[2]] += 1
+            # print(TotalRounds)
+            # print(WinRounds)
+        elif this_result[0] == "Error":
+            print(this_message)
         # # while Message type is Warning
         # if(thisResult[0] == "Warning"):
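In other words, the V3.1.6 build now reports round results as "Warning|Result|&lt;target&gt;|&lt;outcome&gt;" strings instead of the old "result|&lt;target&gt;|&lt;outcome&gt;" form. A minimal sketch of how such a message is counted; the "Go" payload is an example, and the dictionary keys come from airecorder's total_rounds/win_rounds shown later in this commit:

total_rounds = {"Free": 0, "Go": 0, "Attack": 0}
win_rounds = {"Free": 0, "Go": 0, "Attack": 0}

this_message = "Warning|Result|Go|Win"   # example payload in the new format
this_result = this_message.split("|")
if this_result[0] == "Warning" and this_result[1] == "Result":
    total_rounds[this_result[2]] += 1
    if this_result[3] == "Win":
        win_rounds[this_result[2]] += 1
print(total_rounds, win_rounds)   # {'Free': 0, 'Go': 1, 'Attack': 0} {'Free': 0, 'Go': 1, 'Attack': 0}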
@@ -197,7 +200,8 @@ class AimbotSideChannel(SideChannel):
     # # while Message type is Error
     # elif(thisResult[0] == "Error"):
     #     print(thisMessage)
-    # 发送函数
+
+    # 发送函数
     def send_string(self, data: str) -> None:
         # send a string toC#
         msg = OutgoingMessage()
@@ -222,4 +226,4 @@ class AimbotSideChannel(SideChannel):
     def send_float_list(self, data: List[float]) -> None:
         msg = OutgoingMessage()
         msg.write_float32_list(data)
-        super().queue_message_to_send(msg)
+        super().queue_message_to_send(msg)
@@ -32,15 +32,18 @@ if __name__ == "__main__":
     device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
     best_reward = -1

-    # Initialize environment anget optimizer
-    aimbot_sidechannel = AimbotSideChannel(SIDE_CHANNEL_UUID);
-    env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimbot_sidechannel])
+    # Initialize environment agent optimizer
+    aimbot_side_channel = AimbotSideChannel(SIDE_CHANNEL_UUID)
+    env = Aimbot(env_path=args.path,
+                 worker_id=args.workerID,
+                 base_port=args.baseport,
+                 side_channels=[aimbot_side_channel])
     if args.load_dir is None:
         agent = PPOAgent(
-            env = env,
+            env=env,
             this_args=args,
             device=device,
-        ).to(device)
+        ).to(device)
     else:
         agent = torch.load(args.load_dir)
         # freeze
@@ -48,7 +51,7 @@ if __name__ == "__main__":
         # freeze the view network
         for p in agent.viewNetwork.parameters():
             p.requires_grad = False
-        print("VIEW NETWORK FREEZED")
+        print("VIEW NETWORK FREEZE")
         print("Load Agent", args.load_dir)
         print(agent.eval())
     # optimizer
@@ -57,16 +60,18 @@ if __name__ == "__main__":
     run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
     wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)

+
     @atexit.register
     def save_model():
         # close env
         env.close()
         if args.save_model:
             # save model while exit
-            save_dir = "../PPO-Model/"+ run_name + "_last.pt"
+            save_dir = "../PPO-Model/" + run_name + "_last.pt"
             torch.save(agent, save_dir)
             print("save model to " + save_dir)

+
     # start the game
     total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
     target_steps = [0 for i in range(args.target_num)]
@@ -77,14 +82,14 @@ if __name__ == "__main__":
     ppo_memories = PPOMem(
         args=args,
         unity_agent_num=env.unity_agent_num,
-        device = device,
+        device=device,
     )

     # MAIN LOOP: run agent in environment
     for total_steps in range(total_update_step):
-        # discunt learning rate, while step == total_update_step lr will be 0
+        # discount learning rate, while step == total_update_step lr will be 0
         if args.annealLR:
-            final_lr_ratio = args.target_lr/args.lr
+            final_lr_ratio = args.target_lr / args.lr
             frac = 1.0 - ((total_steps + 1.0) / total_update_step)
             lr_now = frac * args.lr
             optimizer.param_groups[0]["lr"] = lr_now
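For a quick sanity check of the linear annealing shown above (illustrative numbers, not values from the commit): with args.lr = 1e-3 and total_update_step = 100, the schedule decays to zero on the final update:

lr = 1e-3
total_update_step = 100
for total_steps in (0, 49, 99):
    frac = 1.0 - ((total_steps + 1.0) / total_update_step)
    print(total_steps, frac * lr)   # approx. 9.9e-04, 5.0e-04, 0.0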
@@ -92,13 +97,14 @@ if __name__ == "__main__":
             lr_now = args.lr

         # episode start show learning rate
-        print("new episode",total_steps,"learning rate = ",lr_now)
-        # MAIN LOOP: run agent in environment
+        print("new episode", total_steps, "learning rate = ", lr_now)
         step = 0
         training = False
         train_queue = []
-        last_reward = [0.for i in range(env.unity_agent_num)]
+        last_reward = [0. for i in range(env.unity_agent_num)]
+        # MAIN LOOP: run agent in environment
         while True:
             # On decision point, choose action by agent
             if step % args.decision_period == 0:
                 step += 1
                 # Choose action by agent
@@ -119,17 +125,17 @@

                 # save memories
                 ppo_memories.save_memories(
-                    now_step = step,
-                    agent = agent,
-                    state = state,
-                    action_cpu = action_cpu,
-                    dis_logprob_cpu = dis_logprob_cpu,
-                    con_logprob_cpu = con_logprob_cpu,
-                    reward = reward,
-                    done = done,
-                    value_cpu = value_cpu,
-                    last_reward = last_reward,
-                    next_done = next_done,
+                    now_step=step,
+                    agent=agent,
+                    state=state,
+                    action_cpu=action_cpu,
+                    dis_logprob_cpu=dis_logprob_cpu,
+                    con_logprob_cpu=con_logprob_cpu,
+                    reward=reward,
+                    done=done,
+                    value_cpu=value_cpu,
+                    last_reward=last_reward,
+                    next_done=next_done,
                     next_state=next_state,
                 )
                 # check if any training dataset is full and ready to train
@@ -137,7 +143,7 @@ if __name__ == "__main__":
                     if ppo_memories.obs[i].size()[0] >= args.datasetSize:
                         # start train NN
                         train_queue.append(i)
-                if(len(train_queue)>0):
+                if len(train_queue) > 0:
                     # break while loop and start train
                     break
                 # update state
@@ -148,17 +154,17 @@ if __name__ == "__main__":
                 next_state, reward, next_done = env.step(action_cpu)
                 # save memories
                 ppo_memories.save_memories(
-                    now_step = step,
-                    agent = agent,
-                    state = state,
-                    action_cpu = action_cpu,
-                    dis_logprob_cpu = dis_logprob_cpu,
-                    con_logprob_cpu = con_logprob_cpu,
-                    reward = reward,
-                    done = done,
-                    value_cpu = value_cpu,
-                    last_reward = last_reward,
-                    next_done = next_done,
+                    now_step=step,
+                    agent=agent,
+                    state=state,
+                    action_cpu=action_cpu,
+                    dis_logprob_cpu=dis_logprob_cpu,
+                    con_logprob_cpu=con_logprob_cpu,
+                    reward=reward,
+                    done=done,
+                    value_cpu=value_cpu,
+                    last_reward=last_reward,
+                    next_done=next_done,
                     next_state=next_state,
                 )
                 # update state
@@ -167,12 +173,12 @@ if __name__ == "__main__":

         if args.train:
             # train mode on
-            mean_reward_list = [] # for WANDB
-            # loop all tarining queue
+            mean_reward_list = []  # for WANDB
+            # loop all training queue
             for this_train_ind in train_queue:
-                # sart time
+                # start time
                 start_time = time.time()
-                target_steps[this_train_ind]+=1
+                target_steps[this_train_ind] += 1
                 # train agent
                 (
                     v_loss,
@@ -180,18 +186,18 @@ if __name__ == "__main__":
                     con_pg_loss,
                     loss,
                     entropy_loss
-                ) = agent.train_net(
-                    this_train_ind=this_train_ind,
-                    ppo_memories=ppo_memories,
-                    optimizer=optimizer
-                )
+                ) = agent.train_net(
+                    this_train_ind=this_train_ind,
+                    ppo_memories=ppo_memories,
+                    optimizer=optimizer
+                )
                 # record mean reward before clear history
                 print("done")
-                targetRewardMean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
-                mean_reward_list.append(targetRewardMean)
+                target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
+                mean_reward_list.append(target_reward_mean)
                 targetName = Targets(this_train_ind).name

-                # clear this target trainning set buffer
+                # clear this target training set buffer
                 ppo_memories.clear_training_datasets(this_train_ind)
                 # record rewards for plotting purposes
                 wdb_recorder.add_target_scalar(
@@ -202,10 +208,10 @@ if __name__ == "__main__":
                     con_pg_loss,
                     loss,
                     entropy_loss,
-                    targetRewardMean,
+                    target_reward_mean,
                     target_steps,
                 )
-                print(f"episode over Target{targetName} mean reward:", targetRewardMean)
+                print(f"episode over Target{targetName} mean reward:", target_reward_mean)
                 TotalRewardMean = np.mean(mean_reward_list)
                 wdb_recorder.add_global_scalar(
                     TotalRewardMean,
@@ -216,31 +222,32 @@ if __name__ == "__main__":
                 print("cost time:", time.time() - start_time)
             # New Record!
             if TotalRewardMean > best_reward and args.save_model:
-                best_reward = targetRewardMean
-                saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) + ".pt"
+                best_reward = target_reward_mean
+                saveDir = "../PPO-Model/" + run_name + "_" + str(TotalRewardMean) + ".pt"
                 torch.save(agent, saveDir)
         else:
             # train mode off
-            mean_reward_list = [] # for WANDB
+            mean_reward_list = []  # for WANDB
             # while not in training mode, clear the buffer
             for this_train_ind in train_queue:
-                target_steps[this_train_ind]+=1
+                target_steps[this_train_ind] += 1
                 targetName = Targets(this_train_ind).name
-                targetRewardMean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
-                mean_reward_list.append(targetRewardMean)
+                target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
+                mean_reward_list.append(target_reward_mean)
                 print(target_steps[this_train_ind])

-                # clear this target trainning set buffer
+                # clear this target training set buffer
                 ppo_memories.clear_training_datasets(this_train_ind)

                 # record rewards for plotting purposes
-                wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[this_train_ind])
-                wdb_recorder.add_win_ratio(targetName,target_steps[this_train_ind])
-                print(f"episode over Target{targetName} mean reward:", targetRewardMean)
+                wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", target_reward_mean,
+                                               target_steps[this_train_ind])
+                wdb_recorder.add_win_ratio(targetName, target_steps[this_train_ind])
+                print(f"episode over Target{targetName} mean reward:", target_reward_mean)
             TotalRewardMean = np.mean(mean_reward_list)
             wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)

-    saveDir = "../PPO-Model/"+ run_name + "_last.pt"
+    saveDir = "../PPO-Model/" + run_name + "_last.pt"
     torch.save(agent, saveDir)
     env.close()
     wdb_recorder.writer.close()
@@ -1,7 +1,6 @@
-import wandb
 import time
 from torch.utils.tensorboard import SummaryWriter

 import wandb

 total_rounds = {"Free": 0, "Go": 0, "Attack": 0}
 win_rounds = {"Free": 0, "Go": 0, "Attack": 0}
@@ -35,7 +34,7 @@ class WandbRecorder:
     def add_target_scalar(
         self,
         target_name,
-        thisT,
+        this_t,
         v_loss,
         dis_pg_loss,
         con_pg_loss,
@@ -46,25 +45,25 @@ class WandbRecorder:
     ):
         # fmt:off
         self.writer.add_scalar(
-            f"Target{target_name}/value_loss", v_loss.item(), target_steps[thisT]
+            f"Target{target_name}/value_loss", v_loss.item(), target_steps[this_t]
         )
         self.writer.add_scalar(
-            f"Target{target_name}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT]
+            f"Target{target_name}/dis_policy_loss", dis_pg_loss.item(), target_steps[this_t]
         )
         self.writer.add_scalar(
-            f"Target{target_name}/con_policy_loss", con_pg_loss.item(), target_steps[thisT]
+            f"Target{target_name}/con_policy_loss", con_pg_loss.item(), target_steps[this_t]
         )
         self.writer.add_scalar(
-            f"Target{target_name}/total_loss", loss.item(), target_steps[thisT]
+            f"Target{target_name}/total_loss", loss.item(), target_steps[this_t]
         )
         self.writer.add_scalar(
-            f"Target{target_name}/entropy_loss", entropy_loss.item(), target_steps[thisT]
+            f"Target{target_name}/entropy_loss", entropy_loss.item(), target_steps[this_t]
         )
         self.writer.add_scalar(
-            f"Target{target_name}/Reward", target_reward_mean, target_steps[thisT]
+            f"Target{target_name}/Reward", target_reward_mean, target_steps[this_t]
         )
         self.writer.add_scalar(
-            f"Target{target_name}/WinRatio", win_rounds[target_name] / total_rounds[target_name], target_steps[thisT],
+            f"Target{target_name}/WinRatio", win_rounds[target_name] / total_rounds[target_name], target_steps[this_t],
         )
         # fmt:on
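The WinRatio series is simply win_rounds over total_rounds for the target, fed by the side-channel counters shown earlier. A tiny illustrative calculation; the counts here are made up:

win_rounds = {"Free": 2, "Go": 6, "Attack": 1}     # example counters
total_rounds = {"Free": 5, "Go": 10, "Attack": 4}
target_name = "Go"
print(win_rounds[target_name] / total_rounds[target_name])   # 0.6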
@@ -4,7 +4,7 @@ import uuid
 from distutils.util import strtobool

 DEFAULT_SEED = 9331
-ENV_PATH = "../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv"
+ENV_PATH = "../Build/3.1.6/Aimbot-ParallelEnv"
 WAND_ENTITY = "koha9"
 WORKER_ID = 1
 BASE_PORT = 1000
@@ -65,7 +65,8 @@ class PPOAgent(nn.Module):
         self.actor_mean = nn.ModuleList(
             [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(self.target_num)]
         )
-        # self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
+        # self.actor_logstd =
+        # nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
         # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
         self.actor_logstd = nn.ParameterList(
             [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
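For context, this head layout keeps one mean layer and one log-std parameter per target. A hedged sketch of how such per-target heads are typically sampled; the 200-dim feature size comes from the hunk above, while the sizes, target index, and Normal sampling are assumptions about the surrounding forward pass, not code from this commit:

import torch
from torch.distributions import Normal

target_num, continuous_size = 4, 3          # assumed sizes for illustration
actor_mean = torch.nn.ModuleList(
    [torch.nn.Linear(200, continuous_size) for _ in range(target_num)]
)
actor_logstd = torch.nn.ParameterList(
    [torch.nn.Parameter(torch.zeros(1, continuous_size)) for _ in range(target_num)]
)

features = torch.randn(8, 200)              # assumed 200-dim hidden features for 8 agents
target = 1                                  # assumed target index
mean = actor_mean[target](features)
std = actor_logstd[target].expand_as(mean).exp()
continuous_action = Normal(mean, std).sample()   # one continuous action vector per agent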