Compare commits: OffP-FullM ... OffP-FullM
3 Commits: 34206b95c5, 1787872e82, ad9817e7a4
.gitignore (vendored)

@@ -83,4 +83,5 @@ crashlytics-build.properties
 /Aimbot-PPO-Python/Backup/
 /Aimbot-PPO-Python/Build/
 /Aimbot-PPO-Python/PPO-Model/
 /Aimbot-PPO-Python/GAIL-Expert-Data/
+/Aimbot-PPO-Python/runs/
@@ -23,40 +23,41 @@ from mlagents_envs.side_channel.side_channel import (
 )
 from typing import List
 
-bestReward = 0
+bestReward = -1
 
-DEFAULT_SEED = 933139
+DEFAULT_SEED = 9331
-ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy/Aimbot-ParallelEnv"
+ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv"
 SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
 WAND_ENTITY = "koha9"
-WORKER_ID = 2
+WORKER_ID = 3
-BASE_PORT = 1001
+BASE_PORT = 1002
 
 # max round steps per agent is 2500/Decision_period, 25 seconds
 # !!!check every parameters before run!!!
 
-TOTAL_STEPS = 6750000
+TOTAL_STEPS = 3150000
-BATCH_SIZE = 512
+BATCH_SIZE = 1024
-MAX_TRAINNING_DATASETS = 3000
+MAX_TRAINNING_DATASETS = 6000
 DECISION_PERIOD = 1
-LEARNING_RATE = 1e-3
+LEARNING_RATE = 5e-4
 GAMMA = 0.99
 GAE_LAMBDA = 0.95
-EPOCHS = 4
+EPOCHS = 3
-CLIP_COEF = 0.1
+CLIP_COEF = 0.11
-POLICY_COEF = 1.0
-ENTROPY_COEF = 0.01
-CRITIC_COEF = 0.5
-TARGET_LEARNING_RATE = 5e-5
+LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
+POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
+ENTROPY_COEF = [0.1, 0.1, 0.1, 0.1]
+CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
+TARGET_LEARNING_RATE = 1e-6
 
 ANNEAL_LEARNING_RATE = True
 CLIP_VLOSS = True
 NORM_ADV = True
 TRAIN = True
 
-WANDB_TACK = True
+WANDB_TACK = False
 LOAD_DIR = None
-#LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
+#LOAD_DIR = "../PPO-Model/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670522099-freeonly-12/Aimbot-target-last.pt"
 
 # public data
 class Targets(Enum):
@@ -65,11 +66,17 @@ class Targets(Enum):
 Attack = 2
 Defence = 3
 Num = 4
+TARGET_STATE_SIZE = 6
+INAREA_STATE_SIZE = 1
+TIME_STATE_SIZE = 1
+GUN_STATE_SIZE = 1
+MY_STATE_SIZE = 4
+TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
 BASE_WINREWARD = 999
 BASE_LOSEREWARD = -999
 TARGETNUM= 4
 ENV_TIMELIMIT = 30
-RESULT_BROADCAST_RATIO = 2/ENV_TIMELIMIT
+RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
 TotalRounds = {"Free":0,"Go":0,"Attack":0}
 WinRounds = {"Free":0,"Go":0,"Attack":0}
 
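The new state-size constants fix how the training loop indexes each agent's observation vector: index 0 carries the target id and index TARGET_STATE_SIZE carries the remaining round time, as used further down in this diff. A minimal sketch under that assumption (the helper name read_header and the raycast width of 25 are made up for illustration):

import numpy as np

# Constants as added in this commit.
TARGET_STATE_SIZE = 6
INAREA_STATE_SIZE = 1
TIME_STATE_SIZE = 1
GUN_STATE_SIZE = 1
MY_STATE_SIZE = 4
TOTAL_T_SIZE = TARGET_STATE_SIZE + INAREA_STATE_SIZE + TIME_STATE_SIZE + GUN_STATE_SIZE + MY_STATE_SIZE  # 13

def read_header(state_row: np.ndarray):
    """Hypothetical helper: pull out the two header fields the training loop relies on."""
    target_id = int(state_row[0])                      # which target/task this agent is running
    remain_time = float(state_row[TARGET_STATE_SIZE])  # read the same way as remainTime below
    return target_id, remain_time

# Example: an 8-agent observation batch of width TOTAL_T_SIZE plus assumed raycast features.
state = np.zeros((8, TOTAL_T_SIZE + 25), dtype=np.float32)
print(read_header(state[0]))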
@@ -116,6 +123,8 @@ def parse_args():
 help="load model directory")
 parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
 help="the number of steps to run in each environment per policy rollout")
+parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
+help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
 
 # GAE loss
 parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
@@ -155,38 +164,43 @@ class PPOAgent(nn.Module):
 def __init__(self, env: Aimbot,targetNum:int):
 super(PPOAgent, self).__init__()
 self.targetNum = targetNum
+self.stateSize = env.unity_observation_shape[0]
+self.targetSize = TARGET_STATE_SIZE
+self.timeSize = TIME_STATE_SIZE
+self.gunSize = GUN_STATE_SIZE
+self.myStateSize = MY_STATE_SIZE
+self.totalMiddleSize = TOTAL_T_SIZE
+self.head_input_size = env.unity_observation_shape[0] - self.targetSize-self.timeSize-self.gunSize# except target state input
 
 self.discrete_size = env.unity_discrete_size
 self.discrete_shape = list(env.unity_discrete_branches)
 self.continuous_size = env.unity_continuous_size
 
 self.network = nn.Sequential(
-layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 500)),
+layer_init(nn.Linear(env.unity_observation_shape[0], 300)),
-nn.ReLU(),
+nn.Tanh(),
-layer_init(nn.Linear(500, 300)),
+layer_init(nn.Linear(300, 200)),
-nn.ReLU(),
+nn.Tanh(),
 )
-self.actor_dis = nn.ModuleList([layer_init(nn.Linear(300, self.discrete_size), std=0.01) for i in range(targetNum)])
+self.actor_dis = layer_init(nn.Linear(200, self.discrete_size), std=0.5)
-self.actor_mean = nn.ModuleList([layer_init(nn.Linear(300, self.continuous_size), std=0.01) for i in range(targetNum)])
+self.actor_mean = layer_init(nn.Linear(200, self.continuous_size), std=0.5)
-self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(targetNum)])
+self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
-self.critic = layer_init(nn.Linear(300, 1), std=1)
+self.critic = layer_init(nn.Linear(200, 1), std=1)
 
 def get_value(self, state: torch.Tensor):
 return self.critic(self.network(state))
 
 def get_actions_value(self, state: torch.Tensor, actions=None):
 hidden = self.network(state)
-targets = state[:,0].to(torch.int32)
 
 # discrete
-# iterate over the targets (i.e. the agent count) so each target uses its corresponding output head
-dis_logits = torch.stack([self.actor_dis[targets[i]](hidden[i]) for i in range(targets.size()[0])])
+dis_logits = self.actor_dis(hidden)
 split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
 multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
 # continuous
-actions_mean = torch.stack([self.actor_mean[targets[i]](hidden[i]) for i in range(targets.size()[0])]) # self.actor_mean(hidden)
+actions_mean = self.actor_mean(hidden)
-# action_logstd = torch.stack([self.actor_logstd[targets[i]].expand_as(actions_mean) for i in range(targets.size()[0])]) # self.actor_logstd.expand_as(actions_mean)
+action_logstd = self.actor_logstd.expand_as(actions_mean)
-# print(action_logstd)
+action_std = torch.exp(action_logstd)
-action_std = torch.squeeze(torch.stack([torch.exp(self.actor_logstd[targets[i]]) for i in range(targets.size()[0])]),dim = -1) # torch.exp(action_logstd)
 con_probs = Normal(actions_mean, action_std)
 
 if actions is None:
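This hunk collapses the per-target ModuleList heads into a single discrete head, a single continuous head and a single critic, and switches the trunk from ReLU 500-300 to Tanh 300-200; the per-target specialisation now lives in having one whole agent per target (see agentList further down). A minimal self-contained sketch of the simplified agent, assuming only the layer sizes visible in this diff (the sizes in the example call are made up):

import torch
import torch.nn as nn
from torch.distributions import Categorical, Normal

def layer_init(layer, std=1.0, bias_const=0.0):
    # orthogonal init, as commonly used with PPO
    nn.init.orthogonal_(layer.weight, std)
    nn.init.constant_(layer.bias, bias_const)
    return layer

class PPOAgentSketch(nn.Module):
    """Sketch of the single-head agent architecture used in this commit."""

    def __init__(self, obs_size: int, discrete_branches: list, continuous_size: int):
        super().__init__()
        self.discrete_shape = list(discrete_branches)
        self.discrete_size = sum(discrete_branches)
        self.network = nn.Sequential(
            layer_init(nn.Linear(obs_size, 300)), nn.Tanh(),
            layer_init(nn.Linear(300, 200)), nn.Tanh(),
        )
        self.actor_dis = layer_init(nn.Linear(200, self.discrete_size), std=0.5)
        self.actor_mean = layer_init(nn.Linear(200, continuous_size), std=0.5)
        self.actor_logstd = nn.Parameter(torch.zeros(1, continuous_size))
        self.critic = layer_init(nn.Linear(200, 1), std=1)

    def get_actions_value(self, state: torch.Tensor):
        hidden = self.network(state)
        # one logits vector, split per discrete action branch
        dis_logits = torch.split(self.actor_dis(hidden), self.discrete_shape, dim=1)
        cats = [Categorical(logits=lg) for lg in dis_logits]
        dis_act = torch.stack([c.sample() for c in cats], dim=1)
        # one shared Gaussian head for the continuous part
        mean = self.actor_mean(hidden)
        std = torch.exp(self.actor_logstd.expand_as(mean))
        con_act = Normal(mean, std).sample()
        return dis_act, con_act, self.critic(hidden)

# Usage with made-up sizes: 39 observations, 3 discrete branches, 1 continuous action.
agent = PPOAgentSketch(39, [3, 3, 2], 1)
dis_act, con_act, value = agent.get_actions_value(torch.zeros(8, 39))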
@@ -301,11 +315,11 @@ def broadCastEndReward(rewardBF:list,remainTime:float):
 if (rewardBF[-1]<=-500):
 # print("Lose DO NOT BROAD CAST",rewardBF[-1])
 thisRewardBF[-1] = rewardBF[-1]-BASE_LOSEREWARD
-thisRewardBF = (np.asarray(thisRewardBF)).tolist()
+thisRewardBF = thisRewardBF
 elif (rewardBF[-1]>=500):
 # print("Win! Broadcast reward!",rewardBF[-1])
 thisRewardBF[-1] = rewardBF[-1]-BASE_WINREWARD
-thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*RESULT_BROADCAST_RATIO)).tolist()
+thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*args.result_broadcast_ratio)).tolist()
 else:
 print("!!!!!DIDNT GET RESULT REWARD!!!!!!",rewardBF[-1])
 return torch.Tensor(thisRewardBF).to(device)
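broadCastEndReward now scales the win broadcast with the command-line --result-broadcast-ratio instead of the constant: on a win the terminal marker (plus/minus 999) is stripped from the last reward and every step in the round receives an extra remainTime * ratio, while a loss only has the marker removed. A standalone sketch of that behaviour, with the surrounding globals passed in as parameters (names chosen here for illustration):

import numpy as np

BASE_WINREWARD = 999
BASE_LOSEREWARD = -999

def broadcast_end_reward(round_rewards, remain_time, ratio):
    """Sketch of broadCastEndReward: spread part of the win bonus over the whole round."""
    out = list(round_rewards)
    if out[-1] <= -500:            # lost round: just remove the big lose marker
        out[-1] = out[-1] - BASE_LOSEREWARD
    elif out[-1] >= 500:           # won round: remove the win marker, then broadcast
        out[-1] = out[-1] - BASE_WINREWARD
        out = (np.asarray(out) + remain_time * ratio).tolist()
    else:
        print("round ended without a result reward", out[-1])
    return out

# Example: a short winning round with 12 seconds left and ratio 1/30.
print(broadcast_end_reward([0.1, -0.2, 999.5], remain_time=12.0, ratio=1 / 30))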
@@ -322,17 +336,22 @@ if __name__ == "__main__":
 # Initialize environment anget optimizer
 aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID);
 env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimBotsideChannel])
+agentList = []
+optimizers = []
 if args.load_dir is None:
-agent = PPOAgent(env,TARGETNUM).to(device)
+for i in range(using_targets_num):
+agentList.append(PPOAgent(env,TARGETNUM).to(device))
+optimizers.append(optim.Adam(agentList[i].parameters(), lr=args.lr, eps=1e-5))
 else:
-agent = torch.load(args.load_dir)
-print("Load Agent", args.load_dir)
-print(agent.eval())
+print("NAH")
+# !!!not finished
+# agent = torch.load(args.load_dir)
+# print("Load Agent", args.load_dir)
+# print(agent.eval())
 
-optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
 
 # Tensorboard and WandB Recorder
-game_name = "Aimbot_Target_Hybrid_Multi_Output"
+game_name = "Aimbot_Target_Hybrid_PMNN_V2"
 game_type = "OffPolicy_EndBC"
 run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
 if args.wandb_track:
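Instead of one agent with per-target heads, the loader now builds one PPOAgent and one Adam optimizer per target and keeps them in parallel lists indexed by target id (loading from a checkpoint is left unfinished in this commit). A minimal sketch of that pattern, with a stand-in network and a made-up using_targets_num:

import torch.nn as nn
import torch.optim as optim

using_targets_num = 4        # assumption: one network per target (Free/Go/Attack/Defence)
lr = 5e-4

def make_agent(obs_size=39, act_size=9):
    # stand-in for PPOAgent(env, TARGETNUM); only the parameter list matters here
    return nn.Sequential(nn.Linear(obs_size, 200), nn.Tanh(), nn.Linear(200, act_size))

agent_list, optimizers = [], []
for t in range(using_targets_num):
    agent_list.append(make_agent())
    optimizers.append(optim.Adam(agent_list[t].parameters(), lr=lr, eps=1e-5))

# Later, data collected for target t only ever updates agent_list[t]:
# optimizers[t].zero_grad(); loss.backward(); optimizers[t].step()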
@@ -382,38 +401,51 @@ if __name__ == "__main__":
 
 for total_steps in range(total_update_step):
 # discunt learning rate, while step == total_update_step lr will be 0
-print("new episode")
 if args.annealLR:
 finalRatio = TARGET_LEARNING_RATE/args.lr
-frac = 1.0 - finalRatio*((total_steps - 1.0) / total_update_step)
+frac = 1.0 - ((total_steps + 1.0) / total_update_step)
 lrnow = frac * args.lr
-optimizer.param_groups[0]["lr"] = lrnow
+for optimizer in optimizers:
+optimizer.param_groups[0]["lr"] = lrnow
+else:
+lrnow = args.lr
+print("new episode",total_steps,"learning rate = ",lrnow)
 
 
 # MAIN LOOP: run agent in environment
-i = 0
+step = 0
 training = False
 trainQueue = []
+last_reward = [0. for i in range(env.unity_agent_num)]
+action = torch.zeros((env.unity_agent_num,env.unity_discrete_type+env.unity_continuous_size))
+dis_logprob = torch.zeros((env.unity_agent_num,env.unity_discrete_size))
+con_logprob = torch.zeros((env.unity_agent_num,env.unity_continuous_size))
+value = torch.zeros((env.unity_agent_num,1))
 while True:
-if i % args.decision_period == 0:
+if step % args.decision_period == 0:
-step = round(i / args.decision_period)
+step += 1
 # Choose action by agent
 
 with torch.no_grad():
 # predict actions
-action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
-torch.Tensor(state).to(device)
-)
-value = value.flatten()
+for i in range(env.unity_agent_num):
+actTarget = int(state[i][0])
+act, dis_lgprb, _, con_lgprb, _, vl = agentList[actTarget].get_actions_value(
+torch.Tensor([state[i]]).to(device)
+)
+action[i] = act
+dis_logprob[i] = dis_lgprb.squeeze(0)
+con_logprob[i] = con_lgprb.squeeze(0)
+value[i] = vl.squeeze(0)
 
 # variable from GPU to CPU
 action_cpu = action.cpu().numpy()
 dis_logprob_cpu = dis_logprob.cpu().numpy()
 con_logprob_cpu = con_logprob.cpu().numpy()
-value_cpu = value.cpu().numpy()
+value_cpu = value.flatten().cpu().numpy()
 # Environment step
 next_state, reward, next_done = env.step(action_cpu)
 
 # save memories
 for i in range(env.unity_agent_num):
 # save memories to buffers
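The action-selection step now loops over the parallel environment agents, reads the target id from the first observation entry and queries the matching network, writing the results into pre-allocated batch tensors. A compact sketch of that dispatch, with made-up sizes and a stub policy in place of get_actions_value:

import torch

agent_num, obs_size, act_size, target_num = 8, 39, 4, 4

# stub per-target policies; in the real script these are the PPOAgent instances in agentList
policies = [torch.nn.Linear(obs_size, act_size) for _ in range(target_num)]

state = torch.zeros(agent_num, obs_size)
state[:, 0] = torch.randint(0, target_num, (agent_num,)).float()  # target id in slot 0

action = torch.zeros(agent_num, act_size)
value = torch.zeros(agent_num, 1)
with torch.no_grad():
    for i in range(agent_num):
        t = int(state[i][0])                      # pick the network for this agent's target
        out = policies[t](state[i].unsqueeze(0))  # batch of one, like torch.Tensor([state[i]])
        action[i] = out.squeeze(0)
        value[i] = out.mean()                     # placeholder for the critic value
print(action.shape, value.shape)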
@@ -421,22 +453,24 @@ if __name__ == "__main__":
 act_bf[i].append(action_cpu[i])
 dis_logprobs_bf[i].append(dis_logprob_cpu[i])
 con_logprobs_bf[i].append(con_logprob_cpu[i])
-rewards_bf[i].append(reward[i])
+rewards_bf[i].append(reward[i]+last_reward[i])
 dones_bf[i].append(done[i])
 values_bf[i].append(value_cpu[i])
+remainTime = state[i,TARGET_STATE_SIZE]
 if next_done[i] == True:
 # finished a round, send finished memories to training datasets
 # compute advantage and discounted reward
 #print(i,"over")
+endTarget = int(ob_bf[i][0][0])
 roundTargetType = int(state[i,0])
-thisRewardsTensor = broadCastEndReward(rewards_bf[i],roundTargetType)
+thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
 adv, rt = GAE(
-agent,
+agentList[endTarget],
 args,
 thisRewardsTensor,
 torch.Tensor(dones_bf[i]).to(device),
 torch.tensor(values_bf[i]).to(device),
-torch.tensor(next_state[i]).to(device),
+torch.tensor(next_state[i]).to(device).unsqueeze(0),
 torch.Tensor([next_done[i]]).to(device),
 )
 # send memories to training datasets
@@ -471,13 +505,14 @@ if __name__ == "__main__":
 break
 state, done = next_state, next_done
 else:
+step += 1
 # skip this step use last predict action
-next_obs, reward, next_done = env.step(action_cpu)
+next_state, reward, next_done = env.step(action_cpu)
 # save memories
 for i in range(env.unity_agent_num):
 if next_done[i] == True:
 #print(i,"over???")
-# save last memories to buffers
+# save memories to buffers
 ob_bf[i].append(state[i])
 act_bf[i].append(action_cpu[i])
 dis_logprobs_bf[i].append(dis_logprob_cpu[i])
@@ -485,30 +520,33 @@ if __name__ == "__main__":
 rewards_bf[i].append(reward[i])
 dones_bf[i].append(done[i])
 values_bf[i].append(value_cpu[i])
+remainTime = state[i,TARGET_STATE_SIZE]
 # finished a round, send finished memories to training datasets
 # compute advantage and discounted reward
+roundTargetType = int(state[i,0])
+thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
 adv, rt = GAE(
-agent,
+agentList[roundTargetType],
 args,
-torch.tensor(rewards_bf[i]).to(device),
+thisRewardsTensor,
 torch.Tensor(dones_bf[i]).to(device),
 torch.tensor(values_bf[i]).to(device),
-torch.tensor(next_state[i]).to(device),
+torch.Tensor(next_state[i]).to(device).unsqueeze(dim = 0),
 torch.Tensor([next_done[i]]).to(device),
 )
 # send memories to training datasets
-obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
+obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
-actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
+actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
-dis_logprobs = torch.cat(
+dis_logprobs[roundTargetType] = torch.cat(
-(dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
+(dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
 )
-con_logprobs = torch.cat(
+con_logprobs[roundTargetType] = torch.cat(
-(con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
+(con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
 )
-rewards = torch.cat((rewards, torch.tensor(rewards_bf[i]).to(device)), 0)
+rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
-values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
+values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
-advantages = torch.cat((advantages, adv), 0)
+advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
-returns = torch.cat((returns, rt), 0)
+returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
 
 # clear buffers
 ob_bf[i] = []
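Finished rounds are now filed into per-target training datasets: every buffer (obs, actions, log-probs, rewards, values, advantages, returns) becomes a list indexed by the round's target, and each round's memories are concatenated onto that target's entry only. A minimal sketch of that bookkeeping with made-up shapes:

import torch

target_num, obs_size = 4, 39

# one growing dataset per target, mirroring obs[roundTargetType] = torch.cat(...)
obs = [torch.zeros(0, obs_size) for _ in range(target_num)]
rewards = [torch.zeros(0) for _ in range(target_num)]

def add_round(target_id, round_obs, round_rewards):
    """Append one finished round to the datasets of its target."""
    obs[target_id] = torch.cat((obs[target_id], round_obs), 0)
    rewards[target_id] = torch.cat((rewards[target_id], round_rewards), 0)

add_round(2, torch.zeros(17, obs_size), torch.zeros(17))   # a 17-step round for target 2
print([o.size(0) for o in obs])  # -> [0, 0, 17, 0]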
@@ -518,8 +556,10 @@ if __name__ == "__main__":
 rewards_bf[i] = []
 dones_bf[i] = []
 values_bf[i] = []
-print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
+print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
-state, done = next_state, next_done
+state = next_state
+last_reward = reward
 i += 1
 
 if args.train:
@@ -535,14 +575,16 @@ if __name__ == "__main__":
 b_advantages = advantages[thisT].reshape(-1)
 b_returns = returns[thisT].reshape(-1)
 b_values = values[thisT].reshape(-1)
-b_size = b_obs[thisT].size()[0]
+b_size = b_obs.size()[0]
 # Optimizing the policy and value network
 b_inds = np.arange(b_size)
 # clipfracs = []
 for epoch in range(args.epochs):
+print(epoch,end="")
 # shuffle all datasets
 np.random.shuffle(b_inds)
 for start in range(0, b_size, args.minibatchSize):
+print(".",end="")
 end = start + args.minibatchSize
 mb_inds = b_inds[start:end]
 mb_advantages = b_advantages[mb_inds]
@@ -560,7 +602,7 @@ if __name__ == "__main__":
 new_con_logprob,
 con_entropy,
 newvalue,
-) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
+) = agentList[thisT].get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
 # discrete ratio
 dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
 dis_ratio = dis_logratio.exp()
@@ -608,17 +650,17 @@ if __name__ == "__main__":
 # total loss
 entropy_loss = dis_entropy.mean() + con_entropy.mean()
 loss = (
-dis_pg_loss * args.policy_coef
+dis_pg_loss * POLICY_COEF[thisT]
-+ con_pg_loss * args.policy_coef
++ con_pg_loss * POLICY_COEF[thisT]
-- entropy_loss * args.ent_coef
++ entropy_loss * ENTROPY_COEF[thisT]
-+ v_loss * args.critic_coef
++ v_loss * CRITIC_COEF[thisT]
-)
+)*LOSS_COEF[thisT]
 
-optimizer.zero_grad()
+optimizers[thisT].zero_grad()
 loss.backward()
 # Clips gradient norm of an iterable of parameters.
-nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
+nn.utils.clip_grad_norm_(agentList[thisT].parameters(), args.max_grad_norm)
-optimizer.step()
+optimizers[thisT].step()
 
 """
 if args.target_kl is not None:
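The loss is now weighted per target: the policy, entropy and critic terms use the list coefficients indexed by thisT, the whole sum is scaled by LOSS_COEF[thisT], and only that target's optimizer and network are updated. Note the entropy term is added with a positive sign in this commit, unlike the usual subtracted entropy bonus. A short sketch of the update step under those assumptions (the loss tensors and the gradient-norm value here are placeholders):

import torch
import torch.nn as nn

# per-target coefficients as introduced in this commit (free, go, attack, defence)
LOSS_COEF = [1.0, 1.0, 1.0, 1.0]
POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
ENTROPY_COEF = [0.1, 0.1, 0.1, 0.1]
CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
MAX_GRAD_NORM = 0.5   # assumed stand-in for args.max_grad_norm

net = nn.Linear(4, 2)                       # stand-in for agentList[thisT]
optimizer = torch.optim.Adam(net.parameters(), lr=5e-4)
thisT = 1                                   # which target's dataset is being trained

out = net(torch.randn(8, 4))
dis_pg_loss = out.mean()                    # placeholder policy/entropy/value losses
con_pg_loss = out.abs().mean()
entropy_loss = out.var()
v_loss = (out ** 2).mean()

loss = (
    dis_pg_loss * POLICY_COEF[thisT]
    + con_pg_loss * POLICY_COEF[thisT]
    + entropy_loss * ENTROPY_COEF[thisT]    # sign follows the new code
    + v_loss * CRITIC_COEF[thisT]
) * LOSS_COEF[thisT]

optimizer.zero_grad()
loss.backward()
nn.utils.clip_grad_norm_(net.parameters(), MAX_GRAD_NORM)
optimizer.step()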
@@ -626,6 +668,7 @@ if __name__ == "__main__":
 break
 """
 # record mean reward before clear history
+print("done")
 targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
 meanRewardList.append(targetRewardMean)
 targetName = Targets(thisT).name
@@ -642,7 +685,6 @@ if __name__ == "__main__":
 
 # record rewards for plotting purposes
 writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
-writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
 writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
 writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
 writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
@@ -656,10 +698,12 @@ if __name__ == "__main__":
 # New Record!
 if TotalRewardMean > bestReward:
 bestReward = targetRewardMean
-saveDir = "../PPO-Model/Hybrid-MNN-500-300" + str(TotalRewardMean) + ".pt"
-torch.save(agent, saveDir)
+for i in range(using_targets_num):
+saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) +"_"+ str(i)+".pt"
+torch.save(agentList[i], saveDir)
 
-saveDir = "../PPO-Model/Hybrid-MNN-500-300-Last" + ".pt"
-torch.save(agent, saveDir)
+for i in range(using_targets_num):
+saveDir = "../PPO-Model/"+ run_name +"_last_"+ str(i) + ".pt"
+torch.save(agentList[i], saveDir)
 env.close()
 writer.close()
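Checkpoints are likewise written one file per target, both for new best scores and for the final "last" snapshot, with the run name and target index in the file name. A tiny sketch of that naming scheme (the run_name and stand-in networks are illustrative):

import torch
import torch.nn as nn

run_name = "Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_0000000000"  # example only
using_targets_num = 4
agent_list = [nn.Linear(4, 2) for _ in range(using_targets_num)]  # stand-ins for the agents

def save_all(tag: str, out_dir: str = "."):
    """Write one checkpoint per target, e.g. <run_name>_last_0.pt ... _3.pt."""
    for i in range(using_targets_num):
        torch.save(agent_list[i], f"{out_dir}/{run_name}_{tag}_{i}.pt")

save_all("last")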
@@ -601,13 +601,15 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": 2,
 "metadata": {},
 "outputs": [],
 "source": [
+"import numpy as np\n",
 "import torch\n",
 "import torch.nn as nn\n",
 "import torch.optim as optim\n",
+"from AimbotEnv import Aimbot\n",
 "from torch.distributions.normal import Normal\n",
 "from torch.distributions.categorical import Categorical\n",
 "device = torch.device(\"cuda\" if torch.cuda.is_available() and True else \"cpu\")\n",
@@ -620,39 +622,37 @@
 "class PPOAgent(nn.Module):\n",
 " def __init__(self, env: Aimbot,targetNum:int):\n",
 " super(PPOAgent, self).__init__()\n",
-" self.targetNum = targetNum\n",
+" self.stateSize = env.unity_observation_shape[0]\n",
+"\n",
 " self.discrete_size = env.unity_discrete_size\n",
 " self.discrete_shape = list(env.unity_discrete_branches)\n",
 " self.continuous_size = env.unity_continuous_size\n",
 "\n",
 " self.network = nn.Sequential(\n",
-" layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 500)),\n",
+" layer_init(nn.Linear(env.unity_observation_shape[0], 300)),\n",
-" nn.ReLU(),\n",
+" nn.Tanh(),\n",
-" layer_init(nn.Linear(500, 300)),\n",
+" layer_init(nn.Linear(300, 200)),\n",
-" nn.ReLU(),\n",
+" nn.Tanh(),\n",
 " )\n",
-" self.actor_dis = nn.ModuleList([layer_init(nn.Linear(300, self.discrete_size), std=0.01) for i in range(targetNum)])\n",
+" self.actor_dis = layer_init(nn.Linear(200, self.discrete_size), std=0.5)\n",
-" self.actor_mean = nn.ModuleList([layer_init(nn.Linear(300, self.continuous_size), std=0.01) for i in range(targetNum)])\n",
+" self.actor_mean = layer_init(nn.Linear(200, self.continuous_size), std=0.5)\n",
-" self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(targetNum)])\n",
+" self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))\n",
-" self.critic = layer_init(nn.Linear(300, 1), std=1)\n",
+" self.critic = layer_init(nn.Linear(200, 1), std=1)\n",
 "\n",
 " def get_value(self, state: torch.Tensor):\n",
 " return self.critic(self.network(state))\n",
 "\n",
 " def get_actions_value(self, state: torch.Tensor, actions=None):\n",
 " hidden = self.network(state)\n",
-" targets = torch.argmax(state[:,0:self.targetNum],dim=1)\n",
 "\n",
 " # discrete\n",
-" # iterate over the targets (i.e. the agent count) so each target uses its corresponding output head\n",
-" dis_logits = torch.stack([self.actor_dis[targets[i]](hidden[i]) for i in range(targets.size()[0])])\n",
+" dis_logits = self.actor_dis(hidden)\n",
 " split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)\n",
 " multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]\n",
 " # continuous\n",
-" actions_mean = torch.stack([self.actor_mean[targets[i]](hidden[i]) for i in range(targets.size()[0])]) # self.actor_mean(hidden)\n",
+" actions_mean = self.actor_mean(hidden)\n",
-" # action_logstd = torch.stack([self.actor_logstd[targets[i]].expand_as(actions_mean) for i in range(targets.size()[0])]) # self.actor_logstd.expand_as(actions_mean)\n",
+" action_logstd = self.actor_logstd.expand_as(actions_mean)\n",
-" # print(action_logstd)\n",
+" action_std = torch.exp(action_logstd)\n",
-" action_std = torch.squeeze(torch.stack([torch.exp(self.actor_logstd[targets[i]]) for i in range(targets.size()[0])]),dim = -1) # torch.exp(action_logstd)\n",
 " con_probs = Normal(actions_mean, action_std)\n",
 "\n",
 " if actions is None:\n",
@@ -680,117 +680,46 @@
 " con_probs.log_prob(conAct).sum(1),\n",
 " con_probs.entropy().sum(1),\n",
 " self.critic(hidden),\n",
-" )\n",
-"agent = PPOAgent(env,4).to(device)"
+" )"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": 3,
+"metadata": {},
+"outputs": [],
+"source": [
+"ppp = \"../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv\"\n",
+"env = Aimbot(envPath=ppp, workerID=1, basePort=1000,side_channels=[])\n",
+"agent_list = []\n",
+"optimizers = []\n",
+"for i in range(3):\n",
+" agent_list.append(PPOAgent(env=env,targetNum=3).to('cuda'))\n",
+" optimizers.append(optim.Adam(agent_list[i].parameters(),lr=1e-4))"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 5,
 "metadata": {},
 "outputs": [
 {
 "data": {
 "text/plain": [
-"array([[ 1. , -10.343613 , 0. , -7.367299 ,\n",
-[... remaining lines of the old 8-row observation-array output, removed in this commit ...]
+"tensor([1., 2., 3., 4., 5.])"
 ]
 },
-"execution_count": 7,
+"execution_count": 5,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
 "source": [
-"state,_,_ = env.getSteps()\n",
-"state"
+"import torch\n",
+"\n",
+"aaa = torch.zeros((8,5))\n",
+"aaa[0] = torch.Tensor([1,2,3,4,5])\n",
+"aaa[0]"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 8,
-"metadata": {},
-"outputs": [],
-"source": [
-"env.close()"
 ]
 }
 ],