Parallel Environment Discrete PPO finish
Parallel Environment Discrete PPO finish. Runnable.
parent 742529ccd7
commit 7497ffcb0f
3	.gitignore (vendored)
@@ -76,8 +76,11 @@ crashlytics-build.properties
/Aimbot-PPO-Python/.vscode/
/Aimbot-PPO-Python/.mypy_cache/
/Aimbot-PPO-Python/__pycache__/
/Aimbot-PPO-Python/Tensorflow/__pycache__/
/Aimbot-PPO-Python/Pytorch/__pycache__/
/Aimbot-PPO-Python/Backup/
/Aimbot-PPO-Python/Build-MultiScene-WithLoad/
/Aimbot-PPO-Python/Build-CloseEnemyCut/
/Aimbot-PPO-Python/Build-ParallelEnv/
/Aimbot-PPO-Python/PPO-Model/
/Aimbot-PPO-Python/GAIL-Expert-Data/
161	Aimbot-PPO-Python/Pytorch/AimBotEnv-old.py	Normal file
@@ -0,0 +1,161 @@
from mlagents_envs.base_env import ActionTuple
from mlagents_envs.environment import UnityEnvironment

import numpy as np
from numpy import ndarray


class makeEnv(object):
    def __init__(
        self,
        envPath: str,
        workerID: int = 1,
        basePort: int = 100,
        stackSize: int = 1,
        stackIntercal: int = 0,
    ):
        self.env = UnityEnvironment(
            file_name=envPath,
            seed=1,
            side_channels=[],
            worker_id=workerID,
            base_port=basePort,
        )
        self.env.reset()

        # get environment specs
        self.LOAD_DIR_SIZE_IN_STATE = 3
        self.TRACKED_AGENT = -1
        self.BEHA_SPECS = self.env.behavior_specs
        self.BEHA_NAME = list(self.BEHA_SPECS)[0]
        self.SPEC = self.BEHA_SPECS[self.BEHA_NAME]
        self.OBSERVATION_SPECS = self.SPEC.observation_specs[0]  # observation spec
        self.ACTION_SPEC = self.SPEC.action_spec  # action spec

        self.DISCRETE_SIZE = self.ACTION_SPEC.discrete_size
        self.DISCRETE_SHAPE = list(self.ACTION_SPEC.discrete_branches)
        self.CONTINUOUS_SIZE = self.ACTION_SPEC.continuous_size
        self.SINGLE_STATE_SIZE = self.OBSERVATION_SPECS.shape[0] - self.LOAD_DIR_SIZE_IN_STATE
        self.STATE_SIZE = self.SINGLE_STATE_SIZE * stackSize

        # stacked state
        self.STACK_SIZE = stackSize
        self.STATE_BUFFER_SIZE = stackSize + ((stackSize - 1) * stackIntercal)
        self.STACK_INDEX = list(range(0, self.STATE_BUFFER_SIZE, stackIntercal + 1))
        self.statesBuffer = np.array([[0.0] * self.SINGLE_STATE_SIZE] * self.STATE_BUFFER_SIZE)
        print("√√√√√Environment Initialized Successfully√√√√√")

    def step(
        self,
        actions: list,
        behaviorName: ndarray = None,
        trackedAgent: int = None,
    ):
        """Convert the actions list to an ActionTuple, then send it to the environment.

        Args:
            actions (list): action list from the PPO chooseAction output
            behaviorName (ndarray, optional): behaviorName. Defaults to None.
            trackedAgent (int, optional): tracked agent ID. Defaults to None.

        Returns:
            ndarray: nextState, reward, done, loadDir, saveNow
        """
        # send the action to the environment
        # return nextState, reward, done
        if self.DISCRETE_SIZE == 0:
            # create empty discrete action
            discreteActions = np.asarray([[0]])
        else:
            # create discrete action from actions list
            discreteActions = np.asanyarray([actions[0 : self.DISCRETE_SIZE]])
        if self.CONTINUOUS_SIZE == 0:
            # create empty continuous action
            continuousActions = np.asanyarray([[0.0]])
        else:
            # create continuous actions from actions list
            continuousActions = np.asanyarray([actions[self.DISCRETE_SIZE :]])

        if behaviorName is None:
            behaviorName = self.BEHA_NAME
        if trackedAgent is None:
            trackedAgent = self.TRACKED_AGENT

        # create actionTuple
        thisActionTuple = ActionTuple(continuous=continuousActions, discrete=discreteActions)
        # send action to env
        self.env.set_actions(behavior_name=behaviorName, action=thisActionTuple)
        self.env.step()
        # get nextState & reward & done after this action
        nextState, reward, done, loadDir, saveNow = self.getSteps(behaviorName, trackedAgent)
        return nextState, reward, done, loadDir, saveNow

    def getSteps(self, behaviorName=None, trackedAgent=None):
        """Get the environment's current observations.
        Includes State, Reward, Done, LoadDir, SaveNow

        Args:
            behaviorName (_type_, optional): behaviorName. Defaults to None.
            trackedAgent (_type_, optional): trackedAgent. Defaults to None.

        Returns:
            ndarray: nextState, reward, done, loadDir, saveNow
        """
        # get nextState & reward & done
        if behaviorName is None:
            behaviorName = self.BEHA_NAME
        decisionSteps, terminalSteps = self.env.get_steps(behaviorName)
        if self.TRACKED_AGENT == -1 and len(decisionSteps) >= 1:
            self.TRACKED_AGENT = decisionSteps.agent_id[0]
        if trackedAgent is None:
            trackedAgent = self.TRACKED_AGENT

        if trackedAgent in decisionSteps:  # if the episode is not over, the state is stored in decision_steps
            nextState = decisionSteps[trackedAgent].obs[0]
            nextState = np.reshape(
                nextState, [self.SINGLE_STATE_SIZE + self.LOAD_DIR_SIZE_IN_STATE]
            )
            saveNow = nextState[-1]
            loadDir = nextState[-3:-1]
            nextState = nextState[:-3]
            reward = decisionSteps[trackedAgent].reward
            done = False
        if trackedAgent in terminalSteps:  # if the episode is over, the state is stored in terminal_steps
            nextState = terminalSteps[trackedAgent].obs[0]
            nextState = np.reshape(
                nextState, [self.SINGLE_STATE_SIZE + self.LOAD_DIR_SIZE_IN_STATE]
            )
            saveNow = nextState[-1]
            loadDir = nextState[-3:-1]
            nextState = nextState[:-3]
            reward = terminalSteps[trackedAgent].reward
            done = True

        # stack state
        stackedStates = self.stackStates(nextState)
        return stackedStates, reward, done, loadDir, saveNow

    def reset(self):
        """Reset the environment and get observations.

        Returns:
            ndarray: nextState, reward, done, loadDir, saveNow
        """
        # reset buffer
        self.statesBuffer = np.array([[0.0] * self.SINGLE_STATE_SIZE] * self.STATE_BUFFER_SIZE)
        # reset env
        self.env.reset()
        nextState, reward, done, loadDir, saveNow = self.getSteps()
        return nextState, reward, done, loadDir, saveNow

    def stackStates(self, state):
        # push the new state into the buffer (oldest frame drops out)
        self.statesBuffer[0:-1] = self.statesBuffer[1:]
        self.statesBuffer[-1] = state

        # return stacked states
        return np.reshape(self.statesBuffer[self.STACK_INDEX], (self.STATE_SIZE))

    def render(self):
        """render environment"""
        self.env.render()
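As a side note on the stacked-state logic in makeEnv above: the buffer holds the last STATE_BUFFER_SIZE single frames, and STACK_INDEX picks out stackSize of them spaced stackIntercal frames apart. A minimal standalone sketch, not part of the commit, using hypothetical sizes (stackSize=3, stackIntercal=2, 4 features per frame):

# Standalone sketch (hypothetical sizes, not values from the commit): how the
# makeEnv stacked-state buffer is laid out and read back.
import numpy as np

stackSize, stackIntercal, singleStateSize = 3, 2, 4
bufferSize = stackSize + (stackSize - 1) * stackIntercal      # 3 + 2*2 = 7
stackIndex = list(range(0, bufferSize, stackIntercal + 1))    # [0, 3, 6]

buffer = np.zeros((bufferSize, singleStateSize))
for t in range(10):                       # push 10 fake observations
    buffer[0:-1] = buffer[1:]             # shift older frames toward index 0
    buffer[-1] = np.full(singleStateSize, t)

stacked = np.reshape(buffer[stackIndex], stackSize * singleStateSize)
print(stacked)  # frames t=3, 6, 9 concatenated: oldest to newest, 3 steps apart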
146	Aimbot-PPO-Python/Pytorch/AimbotEnv.py	Normal file
@@ -0,0 +1,146 @@
import gym
import numpy as np

from numpy import ndarray
from mlagents_envs.base_env import ActionTuple
from mlagents_envs.environment import UnityEnvironment


class Aimbot(gym.Env):
    def __init__(
        self,
        envPath: str,
        workerID: int = 1,
        basePort: int = 100,
    ):
        super(Aimbot, self).__init__()
        self.env = UnityEnvironment(
            file_name=envPath,
            seed=1,
            side_channels=[],
            worker_id=workerID,
            base_port=basePort,
        )
        self.env.reset()
        # all behavior_specs
        self.unity_specs = self.env.behavior_specs
        # environment behavior name
        self.unity_beha_name = list(self.unity_specs)[0]
        # environment behavior spec
        self.unity_specs = self.unity_specs[self.unity_beha_name]
        # environment observation_space
        self.unity_obs_specs = self.unity_specs.observation_specs[0]
        # environment action specs
        self.unity_action_spec = self.unity_specs.action_spec
        # environment sample observation
        decisionSteps, _ = self.env.get_steps(self.unity_beha_name)

        # OBSERVATION SPECS
        # environment state shape, e.g. the tuple (93,)
        self.unity_observation_shape = self.unity_obs_specs.shape

        # ACTION SPECS
        # number of continuous actions, int
        self.unity_continuous_size = self.unity_action_spec.continuous_size
        # discrete action branch sizes, e.g. (3, 3, 2)
        self.unity_discrete_branches = self.unity_action_spec.discrete_branches
        # number of discrete action branches, e.g. 3
        self.unity_discrete_type = self.unity_action_spec.discrete_size
        # total number of discrete action logits, e.g. 3 + 3 + 2 = 8
        self.unity_discrete_size = sum(self.unity_discrete_branches)

        # AGENT SPECS
        # all agent IDs
        self.unity_agent_IDS = decisionSteps.agent_id
        # number of agents
        self.unity_agent_num = len(self.unity_agent_IDS)

    def reset(self):
        """Reset the environment and get observations.

        Returns:
            ndarray: nextState, reward, done
        """
        # reset env
        self.env.reset()
        nextState, reward, done = self.getSteps()
        return nextState, reward, done

    # TODO:
    # delete all stack state                   DONE
    # getSteps state disassembly function      DONE
    # delete agent selection function          DONE
    # self.step action wrapper function        DONE
    def step(
        self,
        actions: ndarray,
    ):
        """Convert the actions array to an ActionTuple, then send it to the environment.

        Args:
            actions (ndarray): PPO chooseAction output, shape (agentNum, actionNum)

        Returns:
            ndarray: nextState, reward, done
        """
        # send the action to the environment
        # return nextState, reward, done
        if self.unity_discrete_size == 0:
            # create empty discrete action
            discreteActions = np.asarray([[0]])
        else:
            # create discrete action from actions list
            discreteActions = actions[:, 0 : self.unity_discrete_size]
        """
        if self.unity_continuous_size == 0:
            # create empty continuous action
            continuousActions = np.asanyarray([[0.0]])
        else:
            # create continuous actions from actions list
            continuousActions = actions[:, self.unity_discrete_size :]
        """
        # dummy continuous actions (currently hard-coded for 4 agents)
        continuousActions = np.asanyarray([[0.0], [0.0], [0.0], [0.0]])
        # create actionTuple
        thisActionTuple = ActionTuple(continuous=continuousActions, discrete=discreteActions)
        # send action to env
        self.env.set_actions(behavior_name=self.unity_beha_name, action=thisActionTuple)
        self.env.step()
        # get nextState & reward & done after this action
        nextStates, rewards, dones = self.getSteps()
        return nextStates, rewards, dones

    def getSteps(self):
        """Get the environment's current observations.
        Includes State, Reward, Done

        Returns:
            ndarray: nextState, reward, done
        """
        # get nextState & reward & done
        decisionSteps, terminalSteps = self.env.get_steps(self.unity_beha_name)
        nextStates = []
        dones = []
        rewards = []
        for thisAgentID in self.unity_agent_IDS:
            # when an episode ends, an agentID can appear in both decisionSteps and terminalSteps.
            # to avoid a redundant state and reward,
            # use the agentExist flag to check whether this agent was already handled.
            agentExist = False
            # episode over
            if thisAgentID in terminalSteps:
                nextStates.append(terminalSteps[thisAgentID].obs[0])
                dones.append(True)
                rewards.append(terminalSteps[thisAgentID].reward)
                agentExist = True
            # episode not over yet and agent not in terminalSteps
            if (thisAgentID in decisionSteps) and (not agentExist):
                nextStates.append(decisionSteps[thisAgentID].obs[0])
                dones.append(False)
                rewards.append(decisionSteps[thisAgentID].reward)

        return np.asarray(nextStates), rewards, dones

    def close(self):
        self.env.close()
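A minimal usage sketch for the Aimbot wrapper above, mirroring the notebook cells later in this commit. Assumptions (not guaranteed by the source): the Build-ParallelEnv binary exposes 4 agents and 3 discrete action branches, matching the hard-coded dummy continuous actions; the path and port are illustrative.

# Usage sketch (assumptions: 4 agents, 3 discrete branches; illustrative path/port)
import numpy as np
from AimbotEnv import Aimbot

env = Aimbot(envPath="../Build-ParallelEnv/Aimbot-ParallelEnv", workerID=1, basePort=2002)

states, rewards, dones = env.reset()          # states: (agentNum, obs_size)
print(env.unity_agent_num, env.unity_discrete_branches)

# one discrete choice per branch, per agent
actions = np.zeros((env.unity_agent_num, env.unity_discrete_type), dtype=np.int64)
states, rewards, dones = env.step(actions)    # rewards/dones: one entry per agent
env.close()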
291	Aimbot-PPO-Python/Pytorch/ppo.py	Normal file
@@ -0,0 +1,291 @@
import argparse
import time
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim

from AimbotEnv import Aimbot
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from distutils.util import strtobool
from torch.utils.tensorboard import SummaryWriter

DEFAULT_SEED = 9331
ENV_PATH = "../Build-ParallelEnv/Aimbot-ParallelEnv"
WORKER_ID = 1
BASE_PORT = 2002


LEARNING_RATE = 2e-3
GAMMA = 0.99
GAE_LAMBDA = 0.95
TOTAL_STEPS = 2000000
STEP_NUM = 128
MINIBATCH_NUM = 4
EPOCHS = 4
CLIP_COEF = 0.1
ENTROPY_COEF = 0.01
CRITIC_COEF = 0.5

ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = True


def parse_args():
    # fmt: off
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
        help="seed of the experiment")
    parser.add_argument("--path", type=str, default=ENV_PATH,
        help="environment path")
    parser.add_argument("--workerID", type=int, default=WORKER_ID,
        help="unity worker ID")
    parser.add_argument("--baseport", type=int, default=BASE_PORT,
        help="port to connect to Unity environment")
    parser.add_argument("--lr", type=float, default=LEARNING_RATE,
        help="the learning rate of optimizer")
    parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="if toggled, cuda will be enabled by default")
    parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
        help="total timesteps of the experiments")

    parser.add_argument("--stepNum", type=int, default=STEP_NUM,
        help="the number of steps to run in each environment per policy rollout")
    parser.add_argument("--minibatchesNum", type=int, default=MINIBATCH_NUM,
        help="the number of mini-batches")
    parser.add_argument("--epochs", type=int, default=EPOCHS,
        help="the K epochs to update the policy")
    parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
        help="Toggle learning rate annealing for policy and value networks")
    # GAE
    parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="Use GAE for advantage computation")
    parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
        help="Toggles advantages normalization")
    parser.add_argument("--gamma", type=float, default=GAMMA,
        help="the discount factor gamma")
    parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
        help="the lambda for the general advantage estimation")
    parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
        help="the surrogate clipping coefficient")
    parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF,
        help="coefficient of the entropy")
    parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
        help="coefficient of the value function")
    parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True,
        help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
        help="the maximum norm for the gradient clipping")
    parser.add_argument("--target-kl", type=float, default=None,
        help="the target KL divergence threshold")
    # fmt: on
    args = parser.parse_args()
    return args


def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer


class PPOAgent(nn.Module):
    def __init__(self, env: Aimbot):
        super(PPOAgent, self).__init__()
        self.discrete_size = env.unity_discrete_size
        self.discrete_shape = list(env.unity_discrete_branches)

        self.network = nn.Sequential(
            layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, 128)),
            nn.ReLU(),
            layer_init(nn.Linear(128, 128)),
            nn.ReLU(),
        )
        self.dis_Actor = layer_init(nn.Linear(128, self.discrete_size), std=0.01)
        self.critic = layer_init(nn.Linear(128, 1), std=1)

    def get_value(self, state: torch.Tensor):
        return self.critic(self.network(state))

    def get_actions_value(self, state: torch.Tensor, actions=None):
        hidden = self.network(state)
        dis_logits = self.dis_Actor(hidden)
        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
        if actions is None:
            actions = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
        log_prob = torch.stack(
            [ctgr.log_prob(act) for act, ctgr in zip(actions, multi_categoricals)]
        )
        entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
        return actions.T, log_prob.sum(0), entropy.sum(0), self.critic(hidden)


if __name__ == "__main__":
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

    # Initialize environment, agent and optimizer
    env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport)
    agent = PPOAgent(env).to(device)
    optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)

    # Memory Record
    obs = torch.zeros((args.stepNum, env.unity_agent_num) + env.unity_observation_shape).to(device)
    actions = torch.zeros((args.stepNum, env.unity_agent_num) + (env.unity_discrete_type,)).to(
        device
    )
    logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
    rewards = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
    dones = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
    values = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)

    # TRY NOT TO MODIFY: start the game
    args.batch_size = int(env.unity_agent_num * args.stepNum)
    args.minibatch_size = int(args.batch_size // args.minibatchesNum)
    total_update_step = args.total_timesteps // args.batch_size
    global_step = 0
    start_time = time.time()
    next_obs, _, _ = env.reset()
    next_obs = torch.Tensor(next_obs).to(device)
    next_done = torch.zeros(env.unity_agent_num).to(device)

    for total_steps in range(total_update_step):
        # anneal the learning rate; it decays linearly toward 0 over total_update_step updates
        if args.annealLR:
            frac = 1.0 - (total_steps - 1.0) / total_update_step
            lrnow = frac * args.lr
            optimizer.param_groups[0]["lr"] = lrnow

        # MAIN LOOP: run agent in environment
        for step in range(args.stepNum):
            print(step)
            global_step += 1 * env.unity_agent_num
            obs[step] = next_obs
            dones[step] = next_done

            with torch.no_grad():
                # predict actions
                action, logprob, _, value = agent.get_actions_value(next_obs)
                value = value.flatten()
            next_obs, reward, done = env.step(action.cpu().numpy())

            # save memories
            actions[step] = action
            logprobs[step] = logprob
            values[step] = value
            rewards[step] = torch.tensor(reward).to(device).view(-1)
            next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)

        # GAE
        with torch.no_grad():
            next_value = agent.get_value(next_obs).reshape(1, -1)
            if args.gae:
                advantages = torch.zeros_like(rewards).to(device)
                lastgaelam = 0
                for t in reversed(range(args.stepNum)):
                    if t == args.stepNum - 1:
                        nextnonterminal = 1.0 - next_done
                        nextvalues = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        nextvalues = values[t + 1]
                    delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                    advantages[t] = lastgaelam = (
                        delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
                    )
                returns = advantages + values
            else:
                returns = torch.zeros_like(rewards).to(device)
                for t in reversed(range(args.stepNum)):
                    if t == args.stepNum - 1:
                        nextnonterminal = 1.0 - next_done
                        next_return = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        next_return = returns[t + 1]
                    returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
                advantages = returns - values

        # flatten the batch
        b_obs = obs.reshape((-1,) + env.unity_observation_shape)
        b_logprobs = logprobs.reshape(-1)
        b_actions = actions.reshape((-1,) + (env.unity_discrete_type,))
        b_advantages = advantages.reshape(-1)
        b_returns = returns.reshape(-1)
        b_values = values.reshape(-1)

        # Optimizing the policy and value network
        b_inds = np.arange(args.batch_size)
        clipfracs = []
        for epoch in range(args.epochs):
            # shuffle all datasets
            np.random.shuffle(b_inds)
            for start in range(0, args.batch_size, args.minibatch_size):
                end = start + args.minibatch_size
                mb_inds = b_inds[start:end]
                mb_advantages = b_advantages[mb_inds]

                # normalize advantages
                if args.norm_adv:
                    mb_advantages = (mb_advantages - mb_advantages.mean()) / (
                        mb_advantages.std() + 1e-8
                    )

                # ratio
                _, newlogprob, entropy, newvalue = agent.get_actions_value(
                    b_obs[mb_inds], b_actions.long()[mb_inds].T
                )
                logratio = newlogprob - b_logprobs[mb_inds]
                ratio = logratio.exp()

                # early stop
                with torch.no_grad():
                    # calculate approx_kl http://joschu.net/blog/kl-approx.html
                    old_approx_kl = (-logratio).mean()
                    approx_kl = ((ratio - 1) - logratio).mean()
                    clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]

                # Policy loss
                pg_loss1 = -mb_advantages * ratio
                pg_loss2 = -mb_advantages * torch.clamp(
                    ratio, 1 - args.clip_coef, 1 + args.clip_coef
                )
                pg_loss = torch.max(pg_loss1, pg_loss2).mean()

                # Value loss
                newvalue = newvalue.view(-1)
                if args.clip_vloss:
                    v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                    v_clipped = b_values[mb_inds] + torch.clamp(
                        newvalue - b_values[mb_inds],
                        -args.clip_coef,
                        args.clip_coef,
                    )
                    v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                    v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                    v_loss = 0.5 * v_loss_max.mean()
                else:
                    v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

                entropy_loss = entropy.mean()
                loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.critic_coef

                optimizer.zero_grad()
                loss.backward()
                # Clips gradient norm of an iterable of parameters.
                nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                optimizer.step()

            if args.target_kl is not None:
                if approx_kl > args.target_kl:
                    break
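The core of PPOAgent.get_actions_value above is a multi-discrete policy head: one logit vector is split per action branch, each branch gets its own Categorical, and per-branch log-probabilities and entropies are summed. A standalone sketch of that mechanism, not part of the commit, assuming branch sizes (3, 3, 2) and a batch of 4 agents for illustration:

# Standalone sketch of the multi-discrete Categorical head (assumed branch
# sizes (3, 3, 2) and batch size 4; random logits stand in for the actor output)
import torch
from torch.distributions.categorical import Categorical

batch, branches = 4, [3, 3, 2]
dis_logits = torch.randn(batch, sum(branches))            # actor output, shape (4, 8)
split_logits = torch.split(dis_logits, branches, dim=1)   # shapes (4,3), (4,3), (4,2)
dists = [Categorical(logits=l) for l in split_logits]

actions = torch.stack([d.sample() for d in dists])         # (num_branches, batch)
log_prob = torch.stack([d.log_prob(a) for a, d in zip(actions, dists)]).sum(0)
entropy = torch.stack([d.entropy() for d in dists]).sum(0)
print(actions.T.shape, log_prob.shape, entropy.shape)      # (4, 3), (4,), (4,)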
7	Aimbot-PPO-Python/Pytorch/testEnv.py	Normal file
@@ -0,0 +1,7 @@
from AimbotGym import Aimbot

ENV_PATH = "../Build-ParallelEnv/Aimbot-ParallelEnv"
WORKER_ID = 1
BASE_PORT = 2002

env = Aimbot(envPath=ENV_PATH, workerID=WORKER_ID, basePort=BASE_PORT)
453	Aimbot-PPO-Python/Pytorch/testarea.ipynb	Normal file
@@ -0,0 +1,453 @@
Jupyter notebook (Python 3.9.7 kernel), six code cells; the long execution outputs (raw per-agent observation arrays) are summarized as short placeholders in the comments below.

# Cell 1 — toy gym.Env with a mixed Tuple(Discrete, Box) action space
import gym
from gym.spaces import Dict, Discrete, Box, Tuple
import numpy as np


class SampleGym(gym.Env):
    def __init__(self, config={}):
        self.config = config
        self.action_space = Tuple((Discrete(2), Box(-10, 10, (2,))))
        self.observation_space = Box(-10, 10, (2, 2))
        self.p_done = config.get("p_done", 0.1)

    def reset(self):
        return self.observation_space.sample()

    def step(self, action):
        chosen_action = action[0]
        cnt_control = action[1][chosen_action]

        if chosen_action == 0:
            reward = cnt_control
        else:
            reward = -cnt_control - 1

        print(f"Action, {chosen_action} continuous ctrl {cnt_control}")
        return (
            self.observation_space.sample(),
            reward,
            bool(np.random.choice([True, False], p=[self.p_done, 1.0 - self.p_done])),
            {},
        )


if __name__ == "__main__":
    env = SampleGym()
    env.reset()
    env.step((1, [-1, 2.1]))    # should say use action 1 with 2.1
    env.step((0, [-1.1, 2.1]))  # should say use action 0 with -1.1

# Output:
# Action, 1 continuous ctrl 2.1
# Action, 0 continuous ctrl -1.1

# Cell 2 — connect to the Unity build directly and inspect decision/terminal steps
from mlagents_envs.environment import UnityEnvironment
from gym_unity.envs import UnityToGymWrapper
import numpy as np

ENV_PATH = "../Build-ParallelEnv/Aimbot-ParallelEnv"
WORKER_ID = 1
BASE_PORT = 2002

env = UnityEnvironment(
    file_name=ENV_PATH,
    seed=1,
    side_channels=[],
    worker_id=WORKER_ID,
    base_port=BASE_PORT,
)

trackedAgent = 0
env.reset()
BEHA_SPECS = env.behavior_specs
BEHA_NAME = list(BEHA_SPECS)[0]
SPEC = BEHA_SPECS[BEHA_NAME]
print(SPEC)

decisionSteps, terminalSteps = env.get_steps(BEHA_NAME)

if trackedAgent in decisionSteps:  # if the episode is not over, the state is stored in decision_steps
    nextState = decisionSteps[trackedAgent].obs[0]
    reward = decisionSteps[trackedAgent].reward
    done = False
if trackedAgent in terminalSteps:  # if the episode is over, the state is stored in terminal_steps
    nextState = terminalSteps[trackedAgent].obs[0]
    reward = terminalSteps[trackedAgent].reward
    done = True
print(decisionSteps.agent_id)
print(terminalSteps)

# Cell 3 — inspect the DecisionSteps fields
print("decisionSteps.agent_id", decisionSteps.agent_id)
# decisionSteps.agent_id [1 2 5 7]
print("decisionSteps.agent_id_to_index", decisionSteps.agent_id_to_index)
# decisionSteps.agent_id_to_index {1: 0, 2: 1, 5: 2, 7: 3}
print("decisionSteps.reward", decisionSteps.reward)
# decisionSteps.reward [0. 0. 0. 0.]
print("decisionSteps.action_mask", decisionSteps.action_mask)
'''
decisionSteps.action_mask: three boolean arrays, one per discrete branch, each of
shape (4, branch_size) and entirely False (no masked actions)
'''
print("decisionSteps.obs", decisionSteps.obs[0][0])
'''decisionSteps.obs [array([[-15.994009, 1., -26.322788, ...]], dtype=float32)]
(one float32 observation vector per agent; full dump elided with ... as above)'''
# Output: the agent_id / agent_id_to_index / reward values quoted in the comments,
# plus the full action_mask and observation array dumps.

# Cell 4 — build the Aimbot wrapper from AimbotEnv.py
from AimbotEnv import Aimbot

ENV_PATH = "../Build-ParallelEnv/Aimbot-ParallelEnv"
WORKER_ID = 1
BASE_PORT = 2002

env = Aimbot(envPath=ENV_PATH, workerID=WORKER_ID, basePort=BASE_PORT)

# Cell 5 — reset the wrapper
import numpy as np
env.unity_observation_shape
(128, 4) + env.unity_observation_shape
env.reset()
# Output: a (4, obs_size) float32 observation array for the 4 agents,
# rewards [[-0.05], [-0.05], [-0.05], [-0.05]], dones [[False], [False], [False], [False]]

# Cell 6 — step the wrapper with all-zero discrete actions
import numpy as np
actions = np.zeros_like(np.arange(16).reshape(4, 4))
print(actions)
env.step(actions)
# Output: the printed 4x4 zero action array, then the next observations for the
# 4 agents, rewards [[-0.05], [-0.05], [-0.05], [-0.05]], dones [[False], [False], [False], [False]]
File diff suppressed because one or more lines are too long