# ScrollBall-DQN/scrollball-main.py

import mlagents_envs
from mlagents_envs.base_env import ActionTuple
from mlagents_envs.environment import UnityEnvironment
import argparse
import tensorflow as tf
import tensorflow_addons as tfa
import tensorboard
import numpy as np
import matplotlib.pyplot as plt
import time
import datetime
from collections import deque
from IPython.display import clear_output
print("ML-Agents Version :",mlagents_envs.__version__)
print("TensroFlow Version:",tf.__version__)
# 環境パラメータ
log_dir = "ML-logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tb_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
env_path = './ScrollBall-Build/ML-ScrollBall-Sample'
brain_name = 'RollerBallBrain'
parser = argparse.ArgumentParser()
parser.add_argument("--env_path", default=env_path)
parser.add_argument("-train", action="store_true",default=False)
parser.add_argument("-silent", action="store_true",default=False)
args = parser.parse_args()
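
# Example invocations (a sketch; the build path is the repo default and may need to
# be adjusted to your own Unity build):
#   python scrollball-main.py -train                  # train from scratch
#   python scrollball-main.py                         # load the saved "FinalNN-" networks (epsilon starts at epsilon_min)
#   python scrollball-main.py -train -silent          # train without drawing the matplotlib figures
#   python scrollball-main.py -train --env_path ./ScrollBall-Build/ML-ScrollBall-Sample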
# Create the game environment
env = UnityEnvironment(file_name=args.env_path, seed=1, side_channels=[])
env.reset()
# Get the environment specs
tracked_agent = -1
behavior_specs = env.behavior_specs
behavior_name = list(behavior_specs)[0]
spec = behavior_specs[behavior_name]
observation_specs = spec.observation_specs[0]  # observation spec
action_spec = spec.action_spec  # action spec
ENV_Discrete_ACTION_SIZE = action_spec.discrete_size  # number of discrete action branches
ENV_Continuous_ACTION_SIZE = action_spec.continuous_size  # number of continuous actions
STATE_SIZE = observation_specs.shape[0]  # number of observation values per step
SAVE_STEPS = 100  # save the networks every SAVE_STEPS steps
ACTION_SIZE = ENV_Discrete_ACTION_SIZE * 3  # total number of network outputs: each discrete branch has three options, hence *3
MAX_EXP_NUM = 2500  # maximum number of past transitions kept in the experience pool
EPSILON_CUT_STEP = 1300
EPISODES = 500
REPLACE_STEPS = 50
BATCH_SIZE = 256
LEARNING_RATE = 0.0005
GAMMA = 0.9
epsilon = 1
epsilon_min = 0.01
print("ステップ毎に環境観測データ数",STATE_SIZE)
print("ステップ毎に実行可能な動作数",ENV_Discrete_ACTION_SIZE)
# Experience Pool
class experiencePool:
    def __init__(self):
        self.exp_pool = deque(maxlen=MAX_EXP_NUM)

    def add(self, state, action, reward, next_state, done):
        self.exp_pool.append((state, action, reward, next_state, done))

    def get_random(self, num=1):
        random_index = np.random.choice(len(self.exp_pool), num)
        random_exps = [self.exp_pool[i] for i in random_index]
        return random_exps

    def get_len(self):
        return len(self.exp_pool)

# DQN agent
class DQN:
    def __init__(self, load, load_dir):
        self.learning_rate = LEARNING_RATE
        self.epsilon = 1
        self.epsilon_min = 0.01
        self.epsilon_cut = (1 - self.epsilon_min) / EPSILON_CUT_STEP
        self.gamma = GAMMA
        if load:
            # Load existing network weights
            self.epsilon = self.epsilon_min
            main_load_dir = load_dir + "main.h5"
            target_load_dir = load_dir + "target.h5"
            self.main_net, self.target_net = self.loadNN(main_load_dir, target_load_dir)
        else:
            # Build fresh main and target networks
            self.main_net = self.build_net()
            self.target_net = self.build_net()
        self.exp_pool = experiencePool()

    # ---------------------------------------------------------------------------------
    def build_net(self):
        # Build the Q-network
        rectifiedAdam = tfa.optimizers.RectifiedAdam(learning_rate=self.learning_rate, weight_decay=0.001)
        # Adam = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        neural_net = tf.keras.Sequential()
        neural_net.add(tf.keras.layers.Dense(units=128, activation='relu', input_dim=STATE_SIZE))
        neural_net.add(tf.keras.layers.Dense(units=256, activation='relu'))
        neural_net.add(tf.keras.layers.Dense(units=128, activation='relu'))
        neural_net.add(tf.keras.layers.Dense(units=64, activation='relu'))
        neural_net.add(tf.keras.layers.Dense(units=ACTION_SIZE, activation='elu'))
        neural_net.compile(optimizer=rectifiedAdam, loss='mse', metrics=['accuracy'])
        return neural_net

    def select_action(self, state):
        # Predict action Q-values and pick an action (epsilon-greedy)
        random_num = np.random.sample()
        if random_num > self.epsilon:
            # Choose the action greedily from the DQN
            predictResult = self.main_net(state).numpy()[0]
            actionX = np.argmax(predictResult[0:3]) - 1
            actionZ = np.argmax(predictResult[3:]) - 1
            action = np.array([actionX, actionZ], dtype=np.float32)
            # print("action = ", action)
        else:
            # Choose a random action
            actionX = np.random.randint(ACTION_SIZE // 2) - 1
            actionZ = np.random.randint(ACTION_SIZE // 2) - 1
            action = np.array([actionX, actionZ], dtype=np.float32)
        # Decay epsilon
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_cut
        return action

    def training(self):
        # Start training
        if self.exp_pool.get_len() >= BATCH_SIZE:
            # Sample a training batch
            exp_set = self.exp_pool.get_random(num=BATCH_SIZE)
            exp_state = [data[0] for data in exp_set]       # environment state recorded at that step
            exp_action = [data[1] for data in exp_set]      # action chosen at that step
            exp_reward = [data[2] for data in exp_set]      # reward received for that action
            exp_next_state = [data[3] for data in exp_set]  # environment state after the action was executed
            exp_done = [data[4] for data in exp_set]        # whether the episode ended after the action
            exp_state = np.asarray(exp_state).squeeze()
            exp_action = np.asarray(exp_action).squeeze()
            exp_next_state = np.asarray(exp_next_state).squeeze()
            # Predict Q-values with each network
            target_net_q = self.target_net(exp_next_state).numpy()  # target net: Q-values of the next states
            main_net_q = self.main_net(exp_state).numpy()           # main net: Q-values of the current states
            # Training targets y
            y = main_net_q.copy()  # shape (BATCH_SIZE, ACTION_SIZE)
            # Indices over the whole batch, [0, 1, ..., BATCH_SIZE - 1]
            batch_index = np.arange(BATCH_SIZE, dtype=np.int32)
            # Map the action values (-1, 0, 1) to Q-value indices (X -> (0, 1, 2), Z -> (3, 4, 5)), each of shape (BATCH_SIZE,)
            exp_actionX_index = exp_action[:, 0] + 1
            exp_actionZ_index = exp_action[:, 1] + 4
            exp_actionX_index = exp_actionX_index.astype(np.int32)
            exp_actionZ_index = exp_actionZ_index.astype(np.int32)
            # Take the maximum future Q-value of each branch from the target net's predictions for the next states
            fixedX = np.max(target_net_q[:, :3], axis=1)   # (BATCH_SIZE,)
            fixedZ = np.max(target_net_q[:, -3:], axis=1)  # (BATCH_SIZE,)
            # Target = reward received at that step + discounted maximum future Q-value
            fixedX = exp_reward + self.gamma * fixedX
            fixedZ = exp_reward + self.gamma * fixedZ
            # On terminal steps the target is just the reward; otherwise use fixedX / fixedZ
            y_fixedX = np.where(exp_done, exp_reward, fixedX)
            y_fixedZ = np.where(exp_done, exp_reward, fixedZ)
            # Write the corrected values into y
            y[batch_index, exp_actionX_index] = y_fixedX
            y[batch_index, exp_actionZ_index] = y_fixedZ
            # Fit the main net to the targets
            self.main_net.fit(exp_state, y, epochs=5, verbose=0, callbacks=[tb_callback])
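
    # Q-learning target used in training(), applied independently to each branch:
    #   y = r                                     if the episode ended on that step
    #   y = r + GAMMA * max_a' Q_target(s', a')   otherwise
    # Only the entries of the actions actually taken are overwritten in y; the other
    # entries keep the main net's current predictions as their targets.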

    def saveNN(self):
        # Save both networks
        main_save_dir = "ML-Model/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "main.h5"
        target_save_dir = "ML-Model/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "target.h5"
        self.main_net.save(main_save_dir)
        self.target_net.save(target_save_dir)
        print("Model Saved")

    def loadNN(self, main_load_dir, target_load_dir):
        # Load both networks
        main_net_loaded = tf.keras.models.load_model(main_load_dir)
        target_net_loaded = tf.keras.models.load_model(target_load_dir)
        print("Model Loaded")
        return main_net_loaded, target_net_loaded

if not args.train:
    agent = DQN(load=True, load_dir="ML-Model/" + "FinalNN-")
else:
    agent = DQN(load=False, load_dir="ML-Model/" + "20220205-051103")
total_steps = 0
steps_list = []
successTimes = 0
failTimes = 0
successTimes_his = []
failTimes_his = []
this10TimesWin = 0
MeanWinPerin10Times = []
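
# Main loop: every episode resets the Unity environment, steps the agent with the
# epsilon-greedy policy, stores each transition in the experience pool, and runs one
# training pass when the episode ends. The target net is synced with the main net
# every REPLACE_STEPS environment steps, and both networks are saved every SAVE_STEPS.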
for episode in range(EPISODES):
    # Episode start
    done = False  # reset the episode-finished flag
    steps = 0
    # Get the initial environment state
    env.reset()
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    state = decision_steps.obs[0]
    state = np.reshape(state, [1, STATE_SIZE])
    # Game start
    while True:
        reward = 0
        steps += 1
        total_steps += 1
        # Track the agent id
        if tracked_agent == -1 and len(decision_steps) >= 1:
            tracked_agent = decision_steps.agent_id[0]
        # Every REPLACE_STEPS steps, copy the main net weights into the target net
        if total_steps % REPLACE_STEPS == 0 and total_steps != 0:
            agent.target_net.set_weights(agent.main_net.get_weights())
            print('target_net replaced')
        # Every SAVE_STEPS steps, save the networks
        if total_steps % SAVE_STEPS == 0 and total_steps != 0:
            agent.saveNN()
        # Select an action with the main net
        action = agent.select_action(state=state)
        continuous_actions = np.array([[0]], dtype=np.float32)
        discrete_actions = np.expand_dims(action, axis=0)
        # Convert the action into an ActionTuple that ML-Agents understands
        action_Tuple = ActionTuple(
            continuous=continuous_actions, discrete=discrete_actions)
        # Pass the action to the game environment
        env.set_actions(behavior_name=behavior_name, action=action_Tuple)
        env.step()
        # The environment executes the action and returns the next state
        decision_steps, terminal_steps = env.get_steps(behavior_name)
        if tracked_agent in decision_steps:  # if the episode has not ended, the state is stored in decision_steps
            next_state = decision_steps[tracked_agent].obs[0]
            next_state = np.reshape(next_state, [1, STATE_SIZE])
            reward = decision_steps[tracked_agent].reward
        if tracked_agent in terminal_steps:  # if the episode has ended, the state is stored in terminal_steps
            next_state = terminal_steps[tracked_agent].obs[0]
            next_state = np.reshape(next_state, [1, STATE_SIZE])
            reward = terminal_steps[tracked_agent].reward
            done = True
        # Store the transition in the experience pool
        agent.exp_pool.add(state, action, reward, next_state, done)
        # print("Reward = ", reward)
        # Move on to the next state
        state = next_state
        # End-of-episode handling
        if done:
            mean_steps = total_steps / (episode + 1)
            print("\nEpisode", episode, "done, Reward =", reward, "mean steps =", mean_steps,
                  "exp_num =", agent.exp_pool.get_len(), "\nepsilon =", agent.epsilon)
            agent.training()
            if reward >= 10:
                successTimes += 1
                this10TimesWin += 1
            else:
                failTimes += 1
            successTimes_his.append(successTimes)
            failTimes_his.append(failTimes)
            if episode % 10 == 0 and episode != 0:
                clear_output()
                this10TimesWinPer = float(this10TimesWin / 10)
                this10TimesWin = 0
                MeanWinPerin10Times.append(this10TimesWinPer)
                # Plot the total number of successes (green) and failures (red)
                if not args.silent:
                    plt.figure(figsize=(15, 10))
                    plt.plot(range(len(successTimes_his)), successTimes_his, color='green', linestyle='--', linewidth=1, label='TotalWinTimes')
                    plt.plot(range(len(successTimes_his)), failTimes_his, color='red', linewidth=1, marker='o', markersize=1, markerfacecolor='black', markeredgecolor='black', label='TotalFaildTimes')
                    # plt.savefig('output.jpg')
                    plt.legend()
                    plt.savefig("wintimes.png")
                    plt.show()
                    # Plot the success rate over each block of 10 episodes
                    plt.figure(figsize=(15, 10))
                    plt.plot(MeanWinPerin10Times)
                    plt.savefig("steps.png")
                    plt.show()
            break
env.close()
print("Finished~")