Adapt to the V3.1.6 training mode

Mainly updates the SideChannel to match the V3.1.6 training mode.
Standardizes naming (snake_case).
This commit is contained in:
Koha9 2023-07-29 22:40:03 +09:00
parent be1322381e
commit f9ee51c256
12 changed files with 166 additions and 113 deletions
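
As context for the diffs below, a minimal sketch of how the renamed environment wrapper is constructed after this commit. The module name AimbotEnv and the uuid4() channel id are assumptions for illustration (the real script imports a fixed SIDE_CHANNEL_UUID from its arguments module); the build path matches the new ENV_PATH constant further down.

import uuid

from AimbotEnv import Aimbot, AimbotSideChannel   # module name assumed for illustration

# The training script shares a fixed SIDE_CHANNEL_UUID with the Unity side; uuid4() is a stand-in here.
side_channel = AimbotSideChannel(uuid.uuid4())
env = Aimbot(
    env_path="../Build/3.1.6/Aimbot-ParallelEnv",   # was envPath
    worker_id=1,                                    # was workerID
    base_port=1000,                                 # was basePort
    side_channels=[side_channel],
)
state, reward, done = env.reset()                   # reset() now returns next_state, reward, done
env.close()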

Aimbot-PPO-Python/Pytorch/.idea/.gitignore generated vendored Normal file
View File

@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="mlagents39" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@ -0,0 +1,7 @@
<component name="ProjectDictionaryState">
<dictionary name="UCUNI">
<words>
<w>aimbot</w>
</words>
</dictionary>
</component>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="mlagents39" project-jdk-type="Python SDK" />
</project>

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Pytorch.iml" filepath="$PROJECT_DIR$/.idea/Pytorch.iml" />
</modules>
</component>
</project>

Aimbot-PPO-Python/Pytorch/.idea/vcs.xml generated Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
</component>
</project>

View File

@ -15,19 +15,19 @@ from mlagents_envs.side_channel.side_channel import (
class Aimbot(gym.Env): class Aimbot(gym.Env):
def __init__( def __init__(
self, self,
envPath: str, env_path: str,
workerID: int = 1, worker_id: int = 1,
basePort: int = 100, base_port: int = 100,
side_channels: list = [] side_channels: list = []
): ):
super(Aimbot, self).__init__() super(Aimbot, self).__init__()
self.env = UnityEnvironment( self.env = UnityEnvironment(
file_name=envPath, file_name=env_path,
seed=1, seed=1,
side_channels=side_channels, side_channels=side_channels,
worker_id=workerID, worker_id=worker_id,
base_port=basePort, base_port=base_port,
) )
self.env.reset() self.env.reset()
# all behavior_specs # all behavior_specs
@ -41,7 +41,7 @@ class Aimbot(gym.Env):
# environment action specs # environment action specs
self.unity_action_spec = self.unity_specs.action_spec self.unity_action_spec = self.unity_specs.action_spec
# environment sample observation # environment sample observation
decisionSteps, _ = self.env.get_steps(self.unity_beha_name) decision_steps, _ = self.env.get_steps(self.unity_beha_name)
# OBSERVATION SPECS # OBSERVATION SPECS
# environment state shape. like tuple:(93,) # environment state shape. like tuple:(93,)
@ -64,31 +64,31 @@ class Aimbot(gym.Env):
# AGENT SPECS # AGENT SPECS
# all agents ID # all agents ID
self.unity_agent_IDS = decisionSteps.agent_id self.unity_agent_IDS = decision_steps.agent_id
# agents number # agents number
self.unity_agent_num = len(self.unity_agent_IDS) self.unity_agent_num = len(self.unity_agent_IDS)
def reset(self)->Tuple[np.ndarray, List, List]: def reset(self) -> Tuple[np.ndarray, List, List]:
"""reset enviroment and get observations """reset environment and get observations
Returns: Returns:
ndarray: nextState, reward, done, loadDir, saveNow ndarray: next_state, reward, done, loadDir, saveNow
""" """
# reset env # reset env
self.env.reset() self.env.reset()
nextState, reward, done = self.get_steps() next_state, reward, done = self.get_steps()
return nextState, reward, done return next_state, reward, done
# TODO: # TODO:
# delete all stack state DONE # delete all stack state DONE
# getstep State disassembly function DONE # get-step State disassembly function DONE
# delete agent selection function DONE # delete agent selection function DONE
# self.step action wrapper function DONE # self.step action wrapper function DONE
def step( def step(
self, self,
actions: ndarray, actions: ndarray,
)->Tuple[np.ndarray, List, List]: ) -> Tuple[np.ndarray, List, List]:
"""change ations list to ActionTuple then send it to enviroment """change actions list to ActionTuple then send it to environment
Args: Args:
actions (ndarray): PPO chooseAction output action list.(agentNum,actionNum) actions (ndarray): PPO chooseAction output action list.(agentNum,actionNum)
@ -96,36 +96,36 @@ class Aimbot(gym.Env):
Returns: Returns:
ndarray: nextState, reward, done ndarray: nextState, reward, done
""" """
# take action to enviroment # take action to environment
# return mextState,reward,done # return next_state, reward, done
# discrete action # discrete action
if self.unity_dis_act_exist: if self.unity_dis_act_exist:
# create discrete action from actions list # create discrete action from actions list
discreteActions = actions[:, 0 : self.unity_discrete_type] discrete_actions = actions[:, 0: self.unity_discrete_type]
else: else:
# create empty discrete action # create empty discrete action
discreteActions = np.asarray([[0]]) discrete_actions = np.asarray([[0]])
# continuous action # continuous action
if self.unity_con_act_exist: if self.unity_con_act_exist:
# create continuous actions from actions list # create continuous actions from actions list
continuousActions = actions[:, self.unity_discrete_type :] continuous_actions = actions[:, self.unity_discrete_type:]
else: else:
# create empty continuous action # create empty continuous action
continuousActions = np.asanyarray([[0.0]]) continuous_actions = np.asanyarray([[0.0]])
# Dummy continuous action # Dummy continuous action
# continuousActions = np.asanyarray([[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]) # continuousActions = np.asanyarray([[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]])
# create actionTuple # create actionTuple
thisActionTuple = ActionTuple(continuous=continuousActions, discrete=discreteActions) this_action_tuple = ActionTuple(continuous=continuous_actions, discrete=discrete_actions)
# take action to env # take action to env
self.env.set_actions(behavior_name=self.unity_beha_name, action=thisActionTuple) self.env.set_actions(behavior_name=self.unity_beha_name, action=this_action_tuple)
self.env.step() self.env.step()
# get nextState & reward & done after this action # get nextState & reward & done after this action
nextStates, rewards, dones = self.get_steps() next_states, rewards, dones = self.get_steps()
return nextStates, rewards, dones return next_states, rewards, dones
def get_steps(self)->Tuple[np.ndarray, List, List]: def get_steps(self) -> Tuple[np.ndarray, List, List]:
"""get enviroment now observations. """get environment now observations.
Include State, Reward, Done Include State, Reward, Done
Args: Args:
@ -160,6 +160,7 @@ class Aimbot(gym.Env):
def close(self): def close(self):
self.env.close() self.env.close()
class AimbotSideChannel(SideChannel): class AimbotSideChannel(SideChannel):
def __init__(self, channel_id: uuid.UUID) -> None: def __init__(self, channel_id: uuid.UUID) -> None:
super().__init__(channel_id) super().__init__(channel_id)
@ -174,13 +175,15 @@ class AimbotSideChannel(SideChannel):
""" """
this_message = msg.read_string() this_message = msg.read_string()
this_result = this_message.split("|") this_result = this_message.split("|")
if(this_result[0] == "result"): print(this_result)
airecorder.total_rounds[this_result[1]]+=1 if this_result[0] == "Warning":
if(this_result[2] == "Win"): if this_result[1] == "Result":
airecorder.win_rounds[this_result[1]]+=1 airecorder.total_rounds[this_result[2]] += 1
#print(TotalRounds) if this_result[3] == "Win":
#print(WinRounds) airecorder.win_rounds[this_result[2]] += 1
elif(this_result[0] == "Error"): # print(TotalRounds)
# print(WinRounds)
elif this_result[0] == "Error":
print(this_message) print(this_message)
# # while Message type is Warning # # while Message type is Warning
# if(thisResult[0] == "Warning"): # if(thisResult[0] == "Warning"):
@ -197,7 +200,8 @@ class AimbotSideChannel(SideChannel):
# # while Message type is Error # # while Message type is Error
# elif(thisResult[0] == "Error"): # elif(thisResult[0] == "Error"):
# print(thisMessage) # print(thisMessage)
# send function
# send function
def send_string(self, data: str) -> None: def send_string(self, data: str) -> None:
# send a string toC# # send a string toC#
msg = OutgoingMessage() msg = OutgoingMessage()
@ -222,4 +226,4 @@ class AimbotSideChannel(SideChannel):
def send_float_list(self, data: List[float]) -> None: def send_float_list(self, data: List[float]) -> None:
msg = OutgoingMessage() msg = OutgoingMessage()
msg.write_float32_list(data) msg.write_float32_list(data)
super().queue_message_to_send(msg) super().queue_message_to_send(msg)
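
The hunk above changes the side-channel protocol: messages are now pipe-separated strings whose first field is a severity tag ("Warning"/"Error") and whose second field is a message type such as "Result". Below is a condensed, standalone sketch of the new branch logic; the helper name record_round_result is hypothetical (the repository keeps this inline in AimbotSideChannel.on_message_received), and the example message string is inferred from the parsing code rather than taken from the Unity side.

from typing import Dict

def record_round_result(message: str,
                        total_rounds: Dict[str, int],
                        win_rounds: Dict[str, int]) -> None:
    """Parse '<Severity>|<Type>|<TargetName>|<Outcome>' and update the round counters."""
    fields = message.split("|")
    if fields[0] == "Warning" and fields[1] == "Result":
        target = fields[2]                                    # e.g. "Free", "Go", "Attack"
        total_rounds[target] = total_rounds.get(target, 0) + 1
        if fields[3] == "Win":
            win_rounds[target] = win_rounds.get(target, 0) + 1
    elif fields[0] == "Error":
        print(message)

# usage, mirroring airecorder.total_rounds / airecorder.win_rounds
total = {"Free": 0, "Go": 0, "Attack": 0}
wins = {"Free": 0, "Go": 0, "Attack": 0}
record_round_result("Warning|Result|Go|Win", total, wins)
assert total["Go"] == 1 and wins["Go"] == 1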

View File

@ -32,15 +32,18 @@ if __name__ == "__main__":
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
best_reward = -1 best_reward = -1
# Initialize environment anget optimizer # Initialize environment agent optimizer
aimbot_sidechannel = AimbotSideChannel(SIDE_CHANNEL_UUID); aimbot_side_channel = AimbotSideChannel(SIDE_CHANNEL_UUID)
env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimbot_sidechannel]) env = Aimbot(env_path=args.path,
worker_id=args.workerID,
base_port=args.baseport,
side_channels=[aimbot_side_channel])
if args.load_dir is None: if args.load_dir is None:
agent = PPOAgent( agent = PPOAgent(
env = env, env=env,
this_args=args, this_args=args,
device=device, device=device,
).to(device) ).to(device)
else: else:
agent = torch.load(args.load_dir) agent = torch.load(args.load_dir)
# freeze # freeze
@ -48,7 +51,7 @@ if __name__ == "__main__":
# freeze the view network # freeze the view network
for p in agent.viewNetwork.parameters(): for p in agent.viewNetwork.parameters():
p.requires_grad = False p.requires_grad = False
print("VIEW NETWORK FREEZED") print("VIEW NETWORK FREEZE")
print("Load Agent", args.load_dir) print("Load Agent", args.load_dir)
print(agent.eval()) print(agent.eval())
# optimizer # optimizer
@ -57,16 +60,18 @@ if __name__ == "__main__":
run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}" run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args) wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
@atexit.register @atexit.register
def save_model(): def save_model():
# close env # close env
env.close() env.close()
if args.save_model: if args.save_model:
# save model while exit # save model while exit
save_dir = "../PPO-Model/"+ run_name + "_last.pt" save_dir = "../PPO-Model/" + run_name + "_last.pt"
torch.save(agent, save_dir) torch.save(agent, save_dir)
print("save model to " + save_dir) print("save model to " + save_dir)
# start the game # start the game
total_update_step = using_targets_num * args.total_timesteps // args.datasetSize total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
target_steps = [0 for i in range(args.target_num)] target_steps = [0 for i in range(args.target_num)]
@ -77,14 +82,14 @@ if __name__ == "__main__":
ppo_memories = PPOMem( ppo_memories = PPOMem(
args=args, args=args,
unity_agent_num=env.unity_agent_num, unity_agent_num=env.unity_agent_num,
device = device, device=device,
) )
# MAIN LOOP: run agent in environment # MAIN LOOP: run agent in environment
for total_steps in range(total_update_step): for total_steps in range(total_update_step):
# discunt learning rate, while step == total_update_step lr will be 0 # discount learning rate, while step == total_update_step lr will be 0
if args.annealLR: if args.annealLR:
final_lr_ratio = args.target_lr/args.lr final_lr_ratio = args.target_lr / args.lr
frac = 1.0 - ((total_steps + 1.0) / total_update_step) frac = 1.0 - ((total_steps + 1.0) / total_update_step)
lr_now = frac * args.lr lr_now = frac * args.lr
optimizer.param_groups[0]["lr"] = lr_now optimizer.param_groups[0]["lr"] = lr_now
@ -92,13 +97,14 @@ if __name__ == "__main__":
lr_now = args.lr lr_now = args.lr
# episode start show learning rate # episode start show learning rate
print("new episode",total_steps,"learning rate = ",lr_now) print("new episode", total_steps, "learning rate = ", lr_now)
# MAIN LOOP: run agent in environment
step = 0 step = 0
training = False training = False
train_queue = [] train_queue = []
last_reward = [0.for i in range(env.unity_agent_num)] last_reward = [0. for i in range(env.unity_agent_num)]
# MAIN LOOP: run agent in environment
while True: while True:
# On decision point, choose action by agent
if step % args.decision_period == 0: if step % args.decision_period == 0:
step += 1 step += 1
# Choose action by agent # Choose action by agent
@ -119,17 +125,17 @@ if __name__ == "__main__":
# save memories # save memories
ppo_memories.save_memories( ppo_memories.save_memories(
now_step = step, now_step=step,
agent = agent, agent=agent,
state = state, state=state,
action_cpu = action_cpu, action_cpu=action_cpu,
dis_logprob_cpu = dis_logprob_cpu, dis_logprob_cpu=dis_logprob_cpu,
con_logprob_cpu = con_logprob_cpu, con_logprob_cpu=con_logprob_cpu,
reward = reward, reward=reward,
done = done, done=done,
value_cpu = value_cpu, value_cpu=value_cpu,
last_reward = last_reward, last_reward=last_reward,
next_done = next_done, next_done=next_done,
next_state=next_state, next_state=next_state,
) )
# check if any training dataset is full and ready to train # check if any training dataset is full and ready to train
@ -137,7 +143,7 @@ if __name__ == "__main__":
if ppo_memories.obs[i].size()[0] >= args.datasetSize: if ppo_memories.obs[i].size()[0] >= args.datasetSize:
# start train NN # start train NN
train_queue.append(i) train_queue.append(i)
if(len(train_queue)>0): if len(train_queue) > 0:
# break while loop and start train # break while loop and start train
break break
# update state # update state
@ -148,17 +154,17 @@ if __name__ == "__main__":
next_state, reward, next_done = env.step(action_cpu) next_state, reward, next_done = env.step(action_cpu)
# save memories # save memories
ppo_memories.save_memories( ppo_memories.save_memories(
now_step = step, now_step=step,
agent = agent, agent=agent,
state = state, state=state,
action_cpu = action_cpu, action_cpu=action_cpu,
dis_logprob_cpu = dis_logprob_cpu, dis_logprob_cpu=dis_logprob_cpu,
con_logprob_cpu = con_logprob_cpu, con_logprob_cpu=con_logprob_cpu,
reward = reward, reward=reward,
done = done, done=done,
value_cpu = value_cpu, value_cpu=value_cpu,
last_reward = last_reward, last_reward=last_reward,
next_done = next_done, next_done=next_done,
next_state=next_state, next_state=next_state,
) )
# update state # update state
@ -167,12 +173,12 @@ if __name__ == "__main__":
if args.train: if args.train:
# train mode on # train mode on
mean_reward_list = [] # for WANDB mean_reward_list = [] # for WANDB
# loop all tarining queue # loop all training queue
for this_train_ind in train_queue: for this_train_ind in train_queue:
# sart time # start time
start_time = time.time() start_time = time.time()
target_steps[this_train_ind]+=1 target_steps[this_train_ind] += 1
# train agent # train agent
( (
v_loss, v_loss,
@ -180,18 +186,18 @@ if __name__ == "__main__":
con_pg_loss, con_pg_loss,
loss, loss,
entropy_loss entropy_loss
) = agent.train_net( ) = agent.train_net(
this_train_ind=this_train_ind, this_train_ind=this_train_ind,
ppo_memories=ppo_memories, ppo_memories=ppo_memories,
optimizer=optimizer optimizer=optimizer
) )
# record mean reward before clear history # record mean reward before clear history
print("done") print("done")
targetRewardMean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy()) target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
mean_reward_list.append(targetRewardMean) mean_reward_list.append(target_reward_mean)
targetName = Targets(this_train_ind).name targetName = Targets(this_train_ind).name
# clear this target trainning set buffer # clear this target training set buffer
ppo_memories.clear_training_datasets(this_train_ind) ppo_memories.clear_training_datasets(this_train_ind)
# record rewards for plotting purposes # record rewards for plotting purposes
wdb_recorder.add_target_scalar( wdb_recorder.add_target_scalar(
@ -202,10 +208,10 @@ if __name__ == "__main__":
con_pg_loss, con_pg_loss,
loss, loss,
entropy_loss, entropy_loss,
targetRewardMean, target_reward_mean,
target_steps, target_steps,
) )
print(f"episode over Target{targetName} mean reward:", targetRewardMean) print(f"episode over Target{targetName} mean reward:", target_reward_mean)
TotalRewardMean = np.mean(mean_reward_list) TotalRewardMean = np.mean(mean_reward_list)
wdb_recorder.add_global_scalar( wdb_recorder.add_global_scalar(
TotalRewardMean, TotalRewardMean,
@ -216,31 +222,32 @@ if __name__ == "__main__":
print("cost time:", time.time() - start_time) print("cost time:", time.time() - start_time)
# New Record! # New Record!
if TotalRewardMean > best_reward and args.save_model: if TotalRewardMean > best_reward and args.save_model:
best_reward = targetRewardMean best_reward = TotalRewardMean
saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) + ".pt" saveDir = "../PPO-Model/" + run_name + "_" + str(TotalRewardMean) + ".pt"
torch.save(agent, saveDir) torch.save(agent, saveDir)
else: else:
# train mode off # train mode off
mean_reward_list = [] # for WANDB mean_reward_list = [] # for WANDB
# while not in training mode, clear the buffer # while not in training mode, clear the buffer
for this_train_ind in train_queue: for this_train_ind in train_queue:
target_steps[this_train_ind]+=1 target_steps[this_train_ind] += 1
targetName = Targets(this_train_ind).name targetName = Targets(this_train_ind).name
targetRewardMean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy()) target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
mean_reward_list.append(targetRewardMean) mean_reward_list.append(target_reward_mean)
print(target_steps[this_train_ind]) print(target_steps[this_train_ind])
# clear this target trainning set buffer # clear this target training set buffer
ppo_memories.clear_training_datasets(this_train_ind) ppo_memories.clear_training_datasets(this_train_ind)
# record rewards for plotting purposes # record rewards for plotting purposes
wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[this_train_ind]) wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", target_reward_mean,
wdb_recorder.add_win_ratio(targetName,target_steps[this_train_ind]) target_steps[this_train_ind])
print(f"episode over Target{targetName} mean reward:", targetRewardMean) wdb_recorder.add_win_ratio(targetName, target_steps[this_train_ind])
print(f"episode over Target{targetName} mean reward:", target_reward_mean)
TotalRewardMean = np.mean(mean_reward_list) TotalRewardMean = np.mean(mean_reward_list)
wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps) wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
saveDir = "../PPO-Model/"+ run_name + "_last.pt" saveDir = "../PPO-Model/" + run_name + "_last.pt"
torch.save(agent, saveDir) torch.save(agent, saveDir)
env.close() env.close()
wdb_recorder.writer.close() wdb_recorder.writer.close()
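
One detail worth calling out from the training-loop diff above is the learning-rate annealing branch. The following self-contained sketch reproduces that schedule with placeholder numbers (lr, target_lr, and the step count come from argparse in the real script); note that in the lines shown final_lr_ratio is computed but does not feed into lr_now, so the visible schedule decays linearly to zero at the last update.

# Linear LR annealing as it appears in the diff: at the final update step the LR reaches 0.
lr = 1e-4                 # args.lr (placeholder value)
target_lr = 1e-6          # args.target_lr (placeholder value)
total_update_step = 100   # using_targets_num * args.total_timesteps // args.datasetSize

for total_steps in range(total_update_step):
    final_lr_ratio = target_lr / lr                         # computed in the diff, unused in the shown lines
    frac = 1.0 - ((total_steps + 1.0) / total_update_step)  # 1.0 -> 0.0 over the run
    lr_now = frac * lr
    # optimizer.param_groups[0]["lr"] = lr_now              # applied to the torch optimizer in the real script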

View File

@ -1,7 +1,6 @@
import wandb
import time
from torch.utils.tensorboard import SummaryWriter from torch.utils.tensorboard import SummaryWriter
import wandb
total_rounds = {"Free": 0, "Go": 0, "Attack": 0} total_rounds = {"Free": 0, "Go": 0, "Attack": 0}
win_rounds = {"Free": 0, "Go": 0, "Attack": 0} win_rounds = {"Free": 0, "Go": 0, "Attack": 0}
@ -35,7 +34,7 @@ class WandbRecorder:
def add_target_scalar( def add_target_scalar(
self, self,
target_name, target_name,
thisT, this_t,
v_loss, v_loss,
dis_pg_loss, dis_pg_loss,
con_pg_loss, con_pg_loss,
@ -46,25 +45,25 @@ class WandbRecorder:
): ):
# fmt:off # fmt:off
self.writer.add_scalar( self.writer.add_scalar(
f"Target{target_name}/value_loss", v_loss.item(), target_steps[thisT] f"Target{target_name}/value_loss", v_loss.item(), target_steps[this_t]
) )
self.writer.add_scalar( self.writer.add_scalar(
f"Target{target_name}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT] f"Target{target_name}/dis_policy_loss", dis_pg_loss.item(), target_steps[this_t]
) )
self.writer.add_scalar( self.writer.add_scalar(
f"Target{target_name}/con_policy_loss", con_pg_loss.item(), target_steps[thisT] f"Target{target_name}/con_policy_loss", con_pg_loss.item(), target_steps[this_t]
) )
self.writer.add_scalar( self.writer.add_scalar(
f"Target{target_name}/total_loss", loss.item(), target_steps[thisT] f"Target{target_name}/total_loss", loss.item(), target_steps[this_t]
) )
self.writer.add_scalar( self.writer.add_scalar(
f"Target{target_name}/entropy_loss", entropy_loss.item(), target_steps[thisT] f"Target{target_name}/entropy_loss", entropy_loss.item(), target_steps[this_t]
) )
self.writer.add_scalar( self.writer.add_scalar(
f"Target{target_name}/Reward", target_reward_mean, target_steps[thisT] f"Target{target_name}/Reward", target_reward_mean, target_steps[this_t]
) )
self.writer.add_scalar( self.writer.add_scalar(
f"Target{target_name}/WinRatio", win_rounds[target_name] / total_rounds[target_name], target_steps[thisT], f"Target{target_name}/WinRatio", win_rounds[target_name] / total_rounds[target_name], target_steps[this_t],
) )
# fmt:on # fmt:on
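
The recorder diff above derives the per-target win ratio directly from the module-level win_rounds / total_rounds dictionaries that the side channel updates. A small sketch of that computation follows; the divide-by-zero guard is an addition for safety here, whereas the code shown in add_target_scalar divides unconditionally.

from torch.utils.tensorboard import SummaryWriter

total_rounds = {"Free": 0, "Go": 0, "Attack": 0}
win_rounds = {"Free": 0, "Go": 0, "Attack": 0}

def log_win_ratio(writer: SummaryWriter, target_name: str, step: int) -> None:
    # Guarded version of the WinRatio scalar written in add_target_scalar / add_win_ratio.
    rounds = total_rounds[target_name]
    ratio = win_rounds[target_name] / rounds if rounds > 0 else 0.0
    writer.add_scalar(f"Target{target_name}/WinRatio", ratio, step)

# usage
writer = SummaryWriter(log_dir="runs/example")   # log_dir is a placeholder
win_rounds["Go"], total_rounds["Go"] = 3, 4
log_win_ratio(writer, "Go", step=1)              # logs 0.75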

View File

@ -4,7 +4,7 @@ import uuid
from distutils.util import strtobool from distutils.util import strtobool
DEFAULT_SEED = 9331 DEFAULT_SEED = 9331
ENV_PATH = "../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv" ENV_PATH = "../Build/3.1.6/Aimbot-ParallelEnv"
WAND_ENTITY = "koha9" WAND_ENTITY = "koha9"
WORKER_ID = 1 WORKER_ID = 1
BASE_PORT = 1000 BASE_PORT = 1000
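
For reference, a hedged sketch of how constants like these typically feed the argument parser in this script; the flag names mirror the attributes used in the training loop above (args.seed, args.path, args.workerID, args.baseport), but the real parser defines more options and may differ in details.

import argparse
from distutils.util import strtobool

DEFAULT_SEED = 9331
ENV_PATH = "../Build/3.1.6/Aimbot-ParallelEnv"
WORKER_ID = 1
BASE_PORT = 1000

def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=DEFAULT_SEED, help="random seed")
    parser.add_argument("--path", type=str, default=ENV_PATH, help="path to the Unity build")
    parser.add_argument("--workerID", type=int, default=WORKER_ID, help="ML-Agents worker id")
    parser.add_argument("--baseport", type=int, default=BASE_PORT, help="ML-Agents base port")
    # boolean flags in this repo are parsed with strtobool, e.g.:
    parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True)
    return parser.parse_args()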

View File

@ -65,7 +65,8 @@ class PPOAgent(nn.Module):
self.actor_mean = nn.ModuleList( self.actor_mean = nn.ModuleList(
[layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(self.target_num)] [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(self.target_num)]
) )
# self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)]) # self.actor_logstd =
# nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
# self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size)) # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
self.actor_logstd = nn.ParameterList( self.actor_logstd = nn.ParameterList(
[nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)] [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
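
To make the per-target continuous head above concrete, here is a minimal sketch of how a mean layer plus a learned log-std parameter is usually turned into a Gaussian policy in PPO. layer_init is re-implemented with a plain orthogonal init because its exact definition lives elsewhere in the repo, and the sizes other than the 200-unit feature width are placeholders.

import torch
import torch.nn as nn
from torch.distributions import Normal

def layer_init(layer: nn.Linear, std: float = 1.0, bias_const: float = 0.0) -> nn.Linear:
    # Assumed implementation: orthogonal weight init scaled by std, constant bias.
    nn.init.orthogonal_(layer.weight, std)
    nn.init.constant_(layer.bias, bias_const)
    return layer

target_num, continuous_size = 3, 2           # placeholder sizes
actor_mean = nn.ModuleList(
    [layer_init(nn.Linear(200, continuous_size), std=0.5) for _ in range(target_num)]
)
actor_logstd = nn.ParameterList(
    [nn.Parameter(torch.zeros(1, continuous_size)) for _ in range(target_num)]
)

def sample_continuous(hidden: torch.Tensor, target_index: int) -> torch.Tensor:
    # hidden: (batch, 200) features for the agents assigned to this target
    mean = actor_mean[target_index](hidden)
    std = actor_logstd[target_index].expand_as(mean).exp()
    return Normal(mean, std).sample()

actions = sample_continuous(torch.randn(4, 200), target_index=1)   # -> shape (4, continuous_size)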