Add load & save functions. Add a train flag so the model can be run in test mode. Add a new action-selection path for test mode. Add a decision period to skip steps.
parent 474032d1e8
commit a0895c7449
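Two of the mechanisms named in the commit message are easy to miss inside the diff below: the decision period makes the rollout query the policy only every N environment steps and repeat the last action in between, and the train flag switches action selection from sampling the policy's distributions to taking the greedy choice for test runs. The following self-contained Python sketch uses hypothetical names (select_action, rollout, the dummy policy and env_step), not the repository's classes; it only illustrates those two ideas.

# Sketch only: decision period + train/test action selection, with made-up names.
import torch
from torch.distributions import Categorical, Normal


def select_action(dis_logits, con_mean, con_std, train=True):
    # Training: sample both heads; testing: take the argmax / mean action.
    if train:
        dis_act = Categorical(logits=dis_logits).sample()
        con_act = Normal(con_mean, con_std).sample()
    else:
        dis_act = torch.argmax(dis_logits, dim=-1)
        con_act = con_mean
    return dis_act, con_act


def rollout(policy, env_step, obs, total_steps=8, decision_period=2, train=True):
    action = None
    for step in range(total_steps):
        if step % decision_period == 0:
            # Decision step: ask the policy for a fresh action.
            dis_logits, con_mean, con_std = policy(obs)
            action = select_action(dis_logits, con_mean, con_std, train)
        # Every other step simply repeats the most recent action.
        obs = env_step(action)
    return obs


if __name__ == "__main__":
    dummy_policy = lambda obs: (torch.zeros(3), torch.zeros(2), torch.ones(2))
    dummy_env_step = lambda action: torch.zeros(4)
    rollout(dummy_policy, dummy_env_step, torch.zeros(4), train=False)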
.gitignore (vendored): 4 changed lines

@@ -81,8 +81,6 @@ crashlytics-build.properties
 /Aimbot-PPO-Python/Pytorch/runs/
 /Aimbot-PPO-Python/Pytorch/wandb/
 /Aimbot-PPO-Python/Backup/
-/Aimbot-PPO-Python/Build-MultiScene-WithLoad/
-/Aimbot-PPO-Python/Build-CloseEnemyCut/
-/Aimbot-PPO-Python/Build-ParallelEnv/
+/Aimbot-PPO-Python/Build/
 /Aimbot-PPO-Python/PPO-Model/
 /Aimbot-PPO-Python/GAIL-Expert-Data/

@@ -13,30 +13,36 @@ from torch.distributions.categorical import Categorical
 from distutils.util import strtobool
 from torch.utils.tensorboard import SummaryWriter
 
+bestReward = 0
+
 DEFAULT_SEED = 9331
-ENV_PATH = "../Build-ParallelEnv/Aimbot-ParallelEnv"
+ENV_PATH = "../Build/Build-ParallelEnv-BigArea-6Enemy/Aimbot-ParallelEnv"
 WAND_ENTITY = "koha9"
 WORKER_ID = 1
-BASE_PORT = 2002
+BASE_PORT = 1000
 
 
+TOTAL_STEPS = 2000000
+STEP_NUM = 314
+DECISION_PERIOD = 2
 LEARNING_RATE = 7e-4
 GAMMA = 0.99
 GAE_LAMBDA = 0.95
-TOTAL_STEPS = 2000000
-STEP_NUM = 256
-MINIBATCH_NUM = 1
+MINIBATCH_NUM = 4
 EPOCHS = 4
 CLIP_COEF = 0.1
+POLICY_COEF = 1.0
 ENTROPY_COEF = 0.01
 CRITIC_COEF = 0.5
 
 ANNEAL_LEARNING_RATE = True
 CLIP_VLOSS = True
 NORM_ADV = True
+TRAIN = True
 
-WANDB_TACK = True
-LOAD_DIR = "../PPO-Model/SmallArea-256-128-hybrid.pt"
+WANDB_TACK = False
+LOAD_DIR = None
+# LOAD_DIR = "../PPO-Model/SmallArea-256-128-hybrid-2nd-trainning.pt"
 
 
 def parse_args():
@@ -59,6 +65,8 @@ def parse_args():
         help="total timesteps of the experiments")
 
     # model parameters
+    parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
+        help="Train Model or not")
     parser.add_argument("--stepNum", type=int, default=STEP_NUM,
         help="the number of steps to run in each environment per policy rollout")
    parser.add_argument("--minibatchesNum", type=int, default=MINIBATCH_NUM,
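The new --train option follows the same boolean-flag pattern as the parser's other switches: type=lambda x: bool(strtobool(x)) accepts values such as true/false/1/0, while nargs="?" together with const=True lets a bare --train mean True. A standalone illustration of just that pattern (not the repository's file); distutils.util.strtobool is deprecated in recent Python versions, the sketch simply mirrors the script's existing import.

import argparse
from distutils.util import strtobool

parser = argparse.ArgumentParser()
parser.add_argument("--train", type=lambda x: bool(strtobool(x)),
                    default=True, nargs="?", const=True,
                    help="train the model (sample actions) or test it (greedy actions)")

print(parser.parse_args([]).train)                    # True  (default)
print(parser.parse_args(["--train"]).train)           # True  (bare flag uses const)
print(parser.parse_args(["--train", "false"]).train)  # False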
@@ -73,8 +81,10 @@ def parse_args():
         help="the entity (team) of wandb's project")
     parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
         help="load model directory")
+    parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
+        help="the number of steps to run in each environment per policy rollout")
 
-    # GAE
+    # GAE loss
     parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
         help="Use GAE for advantage computation")
     parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
@@ -85,6 +95,8 @@ def parse_args():
         help="the lambda for the general advantage estimation")
     parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
         help="the surrogate clipping coefficient")
+    parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
+        help="coefficient of the policy")
     parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF,
         help="coefficient of the entropy")
     parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
@@ -114,15 +126,15 @@ class PPOAgent(nn.Module):
         self.continuous_size = env.unity_continuous_size
 
         self.network = nn.Sequential(
-            layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 256)),
+            layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 384)),
             nn.ReLU(),
-            layer_init(nn.Linear(256, 128)),
+            layer_init(nn.Linear(384, 256)),
             nn.ReLU(),
         )
-        self.actor_dis = layer_init(nn.Linear(128, self.discrete_size), std=0.01)
-        self.actor_mean = layer_init(nn.Linear(128, self.continuous_size), std=0.01)
+        self.actor_dis = layer_init(nn.Linear(256, self.discrete_size), std=0.01)
+        self.actor_mean = layer_init(nn.Linear(256, self.continuous_size), std=0.01)
         self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
-        self.critic = layer_init(nn.Linear(128, 1), std=1)
+        self.critic = layer_init(nn.Linear(256, 1), std=1)
 
     def get_value(self, state: torch.Tensor):
         return self.critic(self.network(state))
@@ -140,9 +152,16 @@ class PPOAgent(nn.Module):
         con_probs = Normal(actions_mean, action_std)
 
         if actions is None:
-            disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
-            conAct = con_probs.sample()
-            actions = torch.cat([disAct.T, conAct], dim=1)
+            if args.train:
+                # select actions base on probability distribution model
+                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
+                conAct = con_probs.sample()
+                actions = torch.cat([disAct.T, conAct], dim=1)
+            else:
+                # select actions base on best probability distribution
+                disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
+                conAct = actions_mean
+                actions = torch.cat([disAct.T, conAct], dim=1)
         else:
             disAct = actions[:, 0 : env.unity_discrete_type].T
             conAct = actions[:, env.unity_discrete_type :]
@@ -181,7 +200,7 @@ if __name__ == "__main__":
 
     # Tensorboard and WandB Recorder
     game_name = "Aimbot"
-    run_name = f"{game_name}__{args.seed}__{int(time.time())}"
+    run_name = f"{game_name}_{args.seed}_{int(time.time())}"
     if args.wandb_track:
         wandb.init(
             project=run_name,
@@ -227,24 +246,37 @@ if __name__ == "__main__":
            optimizer.param_groups[0]["lr"] = lrnow
 
        # MAIN LOOP: run agent in environment
-        for step in range(args.stepNum):
-            global_step += 1 * env.unity_agent_num
-            obs[step] = next_obs
-            dones[step] = next_done
-
-            with torch.no_grad():
-                # predict actions
-                action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(next_obs)
-                value = value.flatten()
-            next_obs, reward, done = env.step(action.cpu().numpy())
-
-            # save memories
-            actions[step] = action
-            dis_logprobs[step] = dis_logprob
-            con_logprobs[step] = con_logprob
-            values[step] = value
-            rewards[step] = torch.tensor(reward).to(device).view(-1)
-            next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)
+        for i in range(args.stepNum * args.decision_period):
+            if i % args.decision_period == 0:
+                step = round(i / args.decision_period)
+                # Choose action by agent
+                global_step += 1 * env.unity_agent_num
+                obs[step] = next_obs
+                dones[step] = next_done
+
+                with torch.no_grad():
+                    # predict actions
+                    action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
+                        next_obs
+                    )
+                    value = value.flatten()
+                next_obs, reward, done = env.step(action.cpu().numpy())
+
+                # save memories
+                actions[step] = action
+                dis_logprobs[step] = dis_logprob
+                con_logprobs[step] = con_logprob
+                values[step] = value
+                rewards[step] = torch.tensor(reward).to(device).view(-1)
+                next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(
+                    device
+                )
+            else:
+                # skip this step use last predict action
+                next_obs, reward, done = env.step(action.cpu().numpy())
+                next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(
+                    device
+                )
 
        # GAE
        with torch.no_grad():
@@ -276,119 +308,126 @@ if __name__ == "__main__":
                    returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
            advantages = returns - values
 
-        # flatten the batch
-        b_obs = obs.reshape((-1,) + env.unity_observation_shape)
-        b_dis_logprobs = dis_logprobs.reshape(-1)
-        b_con_logprobs = con_logprobs.reshape(-1)
-        b_actions = actions.reshape((-1,) + (env.unity_action_size,))
-        b_advantages = advantages.reshape(-1)
-        b_returns = returns.reshape(-1)
-        b_values = values.reshape(-1)
-
-        # Optimizing the policy and value network
-        b_inds = np.arange(args.batch_size)
-        #clipfracs = []
-        for epoch in range(args.epochs):
-            # shuffle all datasets
-            np.random.shuffle(b_inds)
-            for start in range(0, args.batch_size, args.minibatch_size):
-                end = start + args.minibatch_size
-                mb_inds = b_inds[start:end]
-                mb_advantages = b_advantages[mb_inds]
-
-                # normalize advantages
-                if args.norm_adv:
-                    mb_advantages = (mb_advantages - mb_advantages.mean()) / (
-                        mb_advantages.std() + 1e-8
-                    )
-
-                (
-                    _,
-                    new_dis_logprob,
-                    dis_entropy,
-                    new_con_logprob,
-                    con_entropy,
-                    newvalue,
-                ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
-                # discrete ratio
-                dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
-                dis_ratio = dis_logratio.exp()
-                # continuous ratio
-                con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
-                con_ratio = con_logratio.exp()
-
-                """
-                # early stop
-                with torch.no_grad():
-                    # calculate approx_kl http://joschu.net/blog/kl-approx.html
-                    old_approx_kl = (-logratio).mean()
-                    approx_kl = ((ratio - 1) - logratio).mean()
-                    clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
-                """
-
-                # discrete Policy loss
-                dis_pg_loss_orig = -mb_advantages * dis_ratio
-                dis_pg_loss_clip = -mb_advantages * torch.clamp(
-                    dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
-                )
-                dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
-                # continuous Policy loss
-                con_pg_loss_orig = -mb_advantages * con_ratio
-                con_pg_loss_clip = -mb_advantages * torch.clamp(
-                    con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
-                )
-                con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
-
-                # Value loss
-                newvalue = newvalue.view(-1)
-                if args.clip_vloss:
-                    v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
-                    v_clipped = b_values[mb_inds] + torch.clamp(
-                        newvalue - b_values[mb_inds],
-                        -args.clip_coef,
-                        args.clip_coef,
-                    )
-                    v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
-                    v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
-                    v_loss = 0.5 * v_loss_max.mean()
-                else:
-                    v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
-
-                # total loss
-                entropy_loss = dis_entropy.mean() + con_entropy.mean()
-                loss = (
-                    dis_pg_loss
-                    + con_pg_loss
-                    - entropy_loss * args.ent_coef
-                    + v_loss * args.critic_coef
-                )
-
-                optimizer.zero_grad()
-                loss.backward()
-                # Clips gradient norm of an iterable of parameters.
-                nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
-                optimizer.step()
-
-            """
-            if args.target_kl is not None:
-                if approx_kl > args.target_kl:
-                    break
-            """
-        # record rewards for plotting purposes
-        writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
-        writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
-        writer.add_scalar("losses/dis_policy_loss", dis_pg_loss.item(), global_step)
-        writer.add_scalar("losses/con_policy_loss", con_pg_loss.item(), global_step)
-        writer.add_scalar("losses/total_loss", loss.item(), global_step)
-        writer.add_scalar("losses/entropy_loss", entropy_loss.item(), global_step)
-        # writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
-        # writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
-        #writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
-        print("SPS:", int(global_step / (time.time() - start_time)))
-        writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)
-        writer.add_scalar(
-            "charts/Reward", np.mean(rewards.to("cpu").detach().numpy().copy()), global_step
-        )
+        if args.train:
+            # flatten the batch
+            b_obs = obs.reshape((-1,) + env.unity_observation_shape)
+            b_dis_logprobs = dis_logprobs.reshape(-1)
+            b_con_logprobs = con_logprobs.reshape(-1)
+            b_actions = actions.reshape((-1,) + (env.unity_action_size,))
+            b_advantages = advantages.reshape(-1)
+            b_returns = returns.reshape(-1)
+            b_values = values.reshape(-1)
+
+            # Optimizing the policy and value network
+            b_inds = np.arange(args.batch_size)
+            # clipfracs = []
+            for epoch in range(args.epochs):
+                # shuffle all datasets
+                np.random.shuffle(b_inds)
+                for start in range(0, args.batch_size, args.minibatch_size):
+                    end = start + args.minibatch_size
+                    mb_inds = b_inds[start:end]
+                    mb_advantages = b_advantages[mb_inds]
+
+                    # normalize advantages
+                    if args.norm_adv:
+                        mb_advantages = (mb_advantages - mb_advantages.mean()) / (
+                            mb_advantages.std() + 1e-8
+                        )
+
+                    (
+                        _,
+                        new_dis_logprob,
+                        dis_entropy,
+                        new_con_logprob,
+                        con_entropy,
+                        newvalue,
+                    ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
+                    # discrete ratio
+                    dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
+                    dis_ratio = dis_logratio.exp()
+                    # continuous ratio
+                    con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
+                    con_ratio = con_logratio.exp()
+
+                    """
+                    # early stop
+                    with torch.no_grad():
+                        # calculate approx_kl http://joschu.net/blog/kl-approx.html
+                        old_approx_kl = (-logratio).mean()
+                        approx_kl = ((ratio - 1) - logratio).mean()
+                        clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
+                    """
+
+                    # discrete Policy loss
+                    dis_pg_loss_orig = -mb_advantages * dis_ratio
+                    dis_pg_loss_clip = -mb_advantages * torch.clamp(
+                        dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
+                    )
+                    dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
+                    # continuous Policy loss
+                    con_pg_loss_orig = -mb_advantages * con_ratio
+                    con_pg_loss_clip = -mb_advantages * torch.clamp(
+                        con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
+                    )
+                    con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
+
+                    # Value loss
+                    newvalue = newvalue.view(-1)
+                    if args.clip_vloss:
+                        v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
+                        v_clipped = b_values[mb_inds] + torch.clamp(
+                            newvalue - b_values[mb_inds],
+                            -args.clip_coef,
+                            args.clip_coef,
+                        )
+                        v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
+                        v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
+                        v_loss = 0.5 * v_loss_max.mean()
+                    else:
+                        v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
+
+                    # total loss
+                    entropy_loss = dis_entropy.mean() + con_entropy.mean()
+                    loss = (
+                        dis_pg_loss * args.policy_coef
+                        + con_pg_loss * args.policy_coef
+                        - entropy_loss * args.ent_coef
+                        + v_loss * args.critic_coef
+                    )
+
+                    optimizer.zero_grad()
+                    loss.backward()
+                    # Clips gradient norm of an iterable of parameters.
+                    nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
+                    optimizer.step()
+
+                """
+                if args.target_kl is not None:
+                    if approx_kl > args.target_kl:
+                        break
+                """
+            # record rewards for plotting purposes
+            rewardsMean = np.mean(rewards.to("cpu").detach().numpy().copy())
+            writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
+            writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
+            writer.add_scalar("losses/dis_policy_loss", dis_pg_loss.item(), global_step)
+            writer.add_scalar("losses/con_policy_loss", con_pg_loss.item(), global_step)
+            writer.add_scalar("losses/total_loss", loss.item(), global_step)
+            writer.add_scalar("losses/entropy_loss", entropy_loss.item(), global_step)
+            # writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
+            # writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
+            # writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
+            # print("SPS:", int(global_step / (time.time() - start_time)))
+            print("episode over mean reward:", rewardsMean)
+            writer.add_scalar(
+                "charts/SPS", int(global_step / (time.time() - start_time)), global_step
+            )
+            writer.add_scalar("charts/Reward", rewardsMean, global_step)
+            if rewardsMean > bestReward:
+                bestReward = rewardsMean
+                saveDir = "../PPO-Model/bigArea-384-128-hybrid-" + str(rewardsMean) + ".pt"
+                torch.save(agent, saveDir)
 
    env.close()
    writer.close()
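The load & save part of the commit comes down to two pieces visible above: the whole agent module is pickled with torch.save(agent, path) whenever the mean reward beats the best seen so far, and the --load-dir argument (default None) decides whether a run starts from such a checkpoint or from a freshly built agent. A minimal sketch of that pattern with illustrative names only (maybe_save and load_or_init are not functions from the repository):

import torch
import torch.nn as nn

best_reward = float("-inf")


def maybe_save(agent, mean_reward, prefix="checkpoint"):
    # Keep only the best-scoring snapshot; the reward is embedded in the file name.
    global best_reward
    if mean_reward > best_reward:
        best_reward = mean_reward
        torch.save(agent, f"{prefix}-{mean_reward:.3f}.pt")


def load_or_init(load_dir, init_fn):
    # load_dir=None falls back to a freshly initialised agent.
    if load_dir is None:
        return init_fn()
    try:
        # Recent PyTorch defaults to weights_only=True, which cannot unpickle a full module.
        return torch.load(load_dir, weights_only=False)
    except TypeError:
        # Older PyTorch versions have no weights_only argument.
        return torch.load(load_dir)


if __name__ == "__main__":
    agent = nn.Linear(4, 2)            # stand-in for the PPO agent
    maybe_save(agent, 1.23, "demo")    # writes demo-1.230.pt
    restored = load_or_init("demo-1.230.pt", lambda: nn.Linear(4, 2))
    print(restored)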

@@ -431,6 +431,45 @@
     "mymodel = torch.load(\"../PPO-Model/SmallArea-256-128-hybrid.pt\")\n",
     "mymodel.eval()"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "x : torch.Size([2, 3, 4])\n",
+      "x : torch.Size([6, 2, 3, 4])\n",
+      "x : torch.Size([6, 2, 3, 4])\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "#1\n",
+    "x = torch.randn(2, 1, 1)  # size-1 dimensions can be expanded, here to 3 and 4\n",
+    "x = x.expand(2, 3, 4)\n",
+    "print('x :', x.size())\n",
+    "\n",
+    "#2\n",
+    "# a new dimension can only be added at the front, otherwise expand raises an error\n",
+    "#x = x.expand(2, 3, 4, 6)\n",
+    "\n",
+    "x = x.expand(6, 2, 3, 4)\n",
+    "print('x :', x.size())\n",
+    "\n",
+    "#3\n",
+    "# a size of -1 keeps that dimension unchanged\n",
+    "x = x.expand(6, -1, -1, -1)\n",
+    "print('x :', x.size())\n",
+    "\n",
+    "x : torch.Size([2, 3, 4])\n",
+    "x : torch.Size([6, 2, 3, 4])\n",
+    "x : torch.Size([6, 2, 3, 4])"
+   ]
   }
  ],
 "metadata": {
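A side note on the expand() demo added in this notebook cell: Tensor.expand returns a view over the original storage and never copies data, which is why only size-1 dimensions (or newly added leading dimensions) can be expanded, and why -1 leaves a dimension unchanged. A short self-contained check of that behaviour, assuming nothing beyond stock PyTorch:

import torch

x = torch.randn(2, 1, 1)
y = x.expand(2, 3, 4)                 # a view: no data is copied
print(y.size())                       # torch.Size([2, 3, 4])
print(y.data_ptr() == x.data_ptr())   # True: both share one storage
z = y.clone()                         # materialise a writable copy when needed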