diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..80908f4
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+ "python.pythonPath": "/anaconda3/envs/rlcode/bin/python"
+}
\ No newline at end of file
diff --git a/1-grid-world/1-policy-iteration/environment.py b/1-grid-world/1-policy-iteration/environment.py
index c6f7b6f..45c9962 100644
--- a/1-grid-world/1-policy-iteration/environment.py
+++ b/1-grid-world/1-policy-iteration/environment.py
@@ -182,7 +182,7 @@ def draw_from_policy(self, policy_table):
def print_value_table(self, value_table):
for i in range(WIDTH):
for j in range(HEIGHT):
- self.text_value(i, j, value_table[i][j])
+ self.text_value(i, j, round(value_table[i][j], 2))
def render(self):
time.sleep(0.1)
diff --git a/1-grid-world/1-policy-iteration/policy_iteration.py b/1-grid-world/1-policy-iteration/policy_iteration.py
index 617d764..6af6197 100644
--- a/1-grid-world/1-policy-iteration/policy_iteration.py
+++ b/1-grid-world/1-policy-iteration/policy_iteration.py
@@ -1,5 +1,4 @@
-# -*- coding: utf-8 -*-
-import random
+import numpy as np
from environment import GraphicDisplay, Env
@@ -11,17 +10,17 @@ def __init__(self, env):
self.value_table = [[0.0] * env.width for _ in range(env.height)]
# 상 하 좌 우 동일한 확률로 정책 초기화
self.policy_table = [[[0.25, 0.25, 0.25, 0.25]] * env.width
- for _ in range(env.height)]
+ for _ in range(env.height)]
# 마침 상태의 설정
self.policy_table[2][2] = []
- # 감가율
+ # 할인율
self.discount_factor = 0.9
+ # 벨만 기대 방정식을 통해 다음 가치함수를 계산하는 정책 평가
def policy_evaluation(self):
-
# 다음 가치함수 초기화
next_value_table = [[0.00] * self.env.width
- for _ in range(self.env.height)]
+ for _ in range(self.env.height)]
# 모든 상태에 대해서 벨만 기대방정식을 계산
for state in self.env.get_all_states():
@@ -39,7 +38,7 @@ def policy_evaluation(self):
value += (self.get_policy(state)[action] *
(reward + self.discount_factor * next_value))
- next_value_table[state[0]][state[1]] = round(value, 2)
+ next_value_table[state[0]][state[1]] = value
self.value_table = next_value_table
@@ -49,59 +48,45 @@ def policy_improvement(self):
for state in self.env.get_all_states():
if state == [2, 2]:
continue
- value = -99999
- max_index = []
+
+ value_list = []
# 반환할 정책 초기화
result = [0.0, 0.0, 0.0, 0.0]
- # 모든 행동에 대해서 [보상 + (감가율 * 다음 상태 가치함수)] 계산
+ # 모든 행동에 대해서 [보상 + (할인율 * 다음 상태 가치함수)] 계산
for index, action in enumerate(self.env.possible_actions):
next_state = self.env.state_after_action(state, action)
reward = self.env.get_reward(state, action)
next_value = self.get_value(next_state)
- temp = reward + self.discount_factor * next_value
-
- # 받을 보상이 최대인 행동의 index(최대가 복수라면 모두)를 추출
- if temp == value:
- max_index.append(index)
- elif temp > value:
- value = temp
- max_index.clear()
- max_index.append(index)
+ value = reward + self.discount_factor * next_value
+ value_list.append(value)
- # 행동의 확률 계산
- prob = 1 / len(max_index)
+ # 받을 보상이 최대인 행동들에 대해 탐욕 정책 발전
+ max_idx_list = np.argwhere(value_list == np.amax(value_list))
+ max_idx_list = max_idx_list.flatten().tolist()
+ prob = 1 / len(max_idx_list)
- for index in max_index:
- result[index] = prob
+ for idx in max_idx_list:
+ result[idx] = prob
next_policy[state[0]][state[1]] = result
self.policy_table = next_policy
- # 특정 상태에서 정책에 따른 행동을 반환
+ # 특정 상태에서 정책에 따라 무작위로 행동을 반환
def get_action(self, state):
- # 0 ~ 1 사이의 값을 무작위로 추출
- random_pick = random.randrange(100) / 100
-
policy = self.get_policy(state)
- policy_sum = 0.0
- # 정책에 담긴 행동 중에 무작위로 한 행동을 추출
- for index, value in enumerate(policy):
- policy_sum += value
- if random_pick < policy_sum:
- return index
+ policy = np.array(policy)
+ return np.random.choice(4, 1, p=policy)[0]
# 상태에 따른 정책 반환
def get_policy(self, state):
- if state == [2, 2]:
- return 0.0
return self.policy_table[state[0]][state[1]]
# 가치 함수의 값을 반환
def get_value(self, state):
- # 소숫점 둘째 자리까지만 계산
- return round(self.value_table[state[0]][state[1]], 2)
+ return self.value_table[state[0]][state[1]]
+
if __name__ == "__main__":
env = Env()
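
A note on the tie-breaking pattern used in policy_improvement above: np.argwhere(value_list == np.amax(value_list)) collects every action whose backed-up value ties for the maximum, and the improved policy splits probability evenly among them. A minimal standalone sketch, not part of the patch (the action values are made up for illustration):

import numpy as np

# hypothetical action values for one state; actions 1 and 2 tie for the max
value_list = [0.5, 0.7, 0.7, 0.3]

# indices of all maximizing actions
max_idx_list = np.argwhere(value_list == np.amax(value_list)).flatten().tolist()

# spread the probability mass evenly over the tied actions
result = [0.0] * len(value_list)
prob = 1 / len(max_idx_list)
for idx in max_idx_list:
    result[idx] = prob

print(max_idx_list)  # [1, 2]
print(result)        # [0.0, 0.5, 0.5, 0.0]
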
diff --git a/1-grid-world/2-value-iteration/environment.py b/1-grid-world/2-value-iteration/environment.py
index c467a92..76ebf21 100644
--- a/1-grid-world/2-value-iteration/environment.py
+++ b/1-grid-world/2-value-iteration/environment.py
@@ -197,7 +197,7 @@ def draw_from_values(self, state, action_list):
def print_values(self, values):
for i in range(WIDTH):
for j in range(HEIGHT):
- self.text_value(i, j, values[i][j])
+ self.text_value(i, j, round(values[i][j], 2))
def render(self):
time.sleep(0.1)
diff --git a/1-grid-world/2-value-iteration/value_iteration.py b/1-grid-world/2-value-iteration/value_iteration.py
index 136fc4a..e388b3f 100644
--- a/1-grid-world/2-value-iteration/value_iteration.py
+++ b/1-grid-world/2-value-iteration/value_iteration.py
@@ -1,65 +1,64 @@
-# -*- coding: utf-8 -*-
+import numpy as np
from environment import GraphicDisplay, Env
+
class ValueIteration:
def __init__(self, env):
- # 환경 객체 생성
+ # 환경에 대한 객체 선언
self.env = env
# 가치 함수를 2차원 리스트로 초기화
self.value_table = [[0.0] * env.width for _ in range(env.height)]
- # 감가율
+ # 할인율
self.discount_factor = 0.9
- # 가치 이터레이션
# 벨만 최적 방정식을 통해 다음 가치 함수 계산
def value_iteration(self):
- next_value_table = [[0.0] * self.env.width for _ in
- range(self.env.height)]
+ # 다음 가치함수 초기화
+ next_value_table = [[0.0] * self.env.width
+ for _ in range(self.env.height)]
+
+ # 모든 상태에 대해서 벨만 최적방정식을 계산
for state in self.env.get_all_states():
+ # 마침 상태의 가치 함수 = 0
if state == [2, 2]:
next_value_table[state[0]][state[1]] = 0.0
continue
- # 가치 함수를 위한 빈 리스트
- value_list = []
- # 가능한 모든 행동에 대해 계산
+ # 벨만 최적 방정식
+ value_list = []
for action in self.env.possible_actions:
next_state = self.env.state_after_action(state, action)
reward = self.env.get_reward(state, action)
next_value = self.get_value(next_state)
value_list.append((reward + self.discount_factor * next_value))
+
# 최댓값을 다음 가치 함수로 대입
- next_value_table[state[0]][state[1]] = round(max(value_list), 2)
+ next_value_table[state[0]][state[1]] = max(value_list)
+
self.value_table = next_value_table
# 현재 가치 함수로부터 행동을 반환
def get_action(self, state):
- action_list = []
- max_value = -99999
-
if state == [2, 2]:
return []
# 모든 행동에 대해 큐함수 (보상 + (감가율 * 다음 상태 가치함수))를 계산
- # 최대 큐 함수를 가진 행동(복수일 경우 여러 개)을 반환
+ value_list = []
for action in self.env.possible_actions:
-
next_state = self.env.state_after_action(state, action)
reward = self.env.get_reward(state, action)
next_value = self.get_value(next_state)
value = (reward + self.discount_factor * next_value)
+ value_list.append(value)
- if value > max_value:
- action_list.clear()
- action_list.append(action)
- max_value = value
- elif value == max_value:
- action_list.append(action)
-
+ # 최대 큐 함수를 가진 행동(복수일 경우 여러 개)을 반환
+ max_idx_list = np.argwhere(value_list == np.amax(value_list))
+ action_list = max_idx_list.flatten().tolist()
return action_list
def get_value(self, state):
- return round(self.value_table[state[0]][state[1]], 2)
+ return self.value_table[state[0]][state[1]]
+
if __name__ == "__main__":
env = Env()
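
For reference, the core of value_iteration above is a single Bellman optimality backup per state: compute reward + discount_factor * V(next_state) for every action and keep the maximum. A one-state arithmetic sketch with the same discount factor of 0.9 (the reward and next-state values are invented):

discount_factor = 0.9

# hypothetical (reward, value of next state) pairs for the four actions of one state
backups = [(0.0, 0.5), (1.0, 0.0), (0.0, -0.2), (0.0, 0.5)]

value_list = [reward + discount_factor * next_value for reward, next_value in backups]
print(value_list)       # ~[0.45, 1.0, -0.18, 0.45]
print(max(value_list))  # 1.0 -> becomes the new value of this state
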
diff --git a/1-grid-world/3-monte-carlo/environment.py b/1-grid-world/3-monte-carlo/environment.py
deleted file mode 100644
index f1ce8e6..0000000
--- a/1-grid-world/3-monte-carlo/environment.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import time
-import numpy as np
-import tkinter as tk
-from PIL import ImageTk, Image
-
-np.random.seed(1)
-PhotoImage = ImageTk.PhotoImage
-UNIT = 100 # 픽셀 수
-HEIGHT = 5 # 그리드 월드 세로
-WIDTH = 5 # 그리드 월드 가로
-
-
-class Env(tk.Tk):
- def __init__(self):
- super(Env, self).__init__()
- self.action_space = ['u', 'd', 'l', 'r']
- self.n_actions = len(self.action_space)
- self.title('monte carlo')
- self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT))
- self.shapes = self.load_images()
- self.canvas = self._build_canvas()
- self.texts = []
-
- def _build_canvas(self):
- canvas = tk.Canvas(self, bg='white',
- height=HEIGHT * UNIT,
- width=WIDTH * UNIT)
- # 그리드 생성
- for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80
- x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
- canvas.create_line(x0, y0, x1, y1)
- for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80
- x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r
- canvas.create_line(x0, y0, x1, y1)
-
- # 캔버스에 이미지 추가
- self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
- self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1])
- self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1])
- self.circle = canvas.create_image(250, 250, image=self.shapes[2])
-
- canvas.pack()
-
- return canvas
-
- def load_images(self):
- rectangle = PhotoImage(
- Image.open("../img/rectangle.png").resize((65, 65)))
- triangle = PhotoImage(
- Image.open("../img/triangle.png").resize((65, 65)))
- circle = PhotoImage(
- Image.open("../img/circle.png").resize((65, 65)))
-
- return rectangle, triangle, circle
-
- @staticmethod
- def coords_to_state(coords):
- x = int((coords[0] - 50) / 100)
- y = int((coords[1] - 50) / 100)
- return [x, y]
-
- def reset(self):
- self.update()
- time.sleep(0.5)
- x, y = self.canvas.coords(self.rectangle)
- self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
- return self.coords_to_state(self.canvas.coords(self.rectangle))
-
- def step(self, action):
- state = self.canvas.coords(self.rectangle)
- base_action = np.array([0, 0])
- self.render()
-
- if action == 0: # 상
- if state[1] > UNIT:
- base_action[1] -= UNIT
- elif action == 1: # 하
- if state[1] < (HEIGHT - 1) * UNIT:
- base_action[1] += UNIT
- elif action == 2: # 좌
- if state[0] > UNIT:
- base_action[0] -= UNIT
- elif action == 3: # 우
- if state[0] < (WIDTH - 1) * UNIT:
- base_action[0] += UNIT
- # 에이전트 이동
- self.canvas.move(self.rectangle, base_action[0], base_action[1])
- # 에이전트(빨간 네모)를 가장 상위로 배치
- self.canvas.tag_raise(self.rectangle)
-
- next_state = self.canvas.coords(self.rectangle)
-
- # 보상 함수
- if next_state == self.canvas.coords(self.circle):
- reward = 100
- done = True
- elif next_state in [self.canvas.coords(self.triangle1),
- self.canvas.coords(self.triangle2)]:
- reward = -100
- done = True
- else:
- reward = 0
- done = False
-
- next_state = self.coords_to_state(next_state)
-
- return next_state, reward, done
-
- def render(self):
- time.sleep(0.03)
- self.update()
diff --git a/1-grid-world/3-monte-carlo/mc_agent.py b/1-grid-world/3-monte-carlo/mc_agent.py
deleted file mode 100644
index 83dfa51..0000000
--- a/1-grid-world/3-monte-carlo/mc_agent.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import numpy as np
-import random
-from collections import defaultdict
-from environment import Env
-
-
-# 몬테카를로 에이전트 (모든 에피소드 각각의 샘플로 부터 학습)
-class MCAgent:
- def __init__(self, actions):
- self.width = 5
- self.height = 5
- self.actions = actions
- self.learning_rate = 0.01
- self.discount_factor = 0.9
- self.epsilon = 0.1
- self.samples = []
- self.value_table = defaultdict(float)
-
- # 메모리에 샘플을 추가
- def save_sample(self, state, reward, done):
- self.samples.append([state, reward, done])
-
- # 모든 에피소드에서 에이전트가 방문한 상태의 큐 함수를 업데이트
- def update(self):
- G_t = 0
- visit_state = []
- for reward in reversed(self.samples):
- state = str(reward[0])
- if state not in visit_state:
- visit_state.append(state)
- G_t = reward[1] + self.discount_factor * G_t
- value = self.value_table[state]
- self.value_table[state] = (value +
- self.learning_rate * (G_t - value))
-
- # 큐 함수에 따라서 행동을 반환
- # 입실론 탐욕 정책에 따라서 행동을 반환
- def get_action(self, state):
- if np.random.rand() < self.epsilon:
- # 랜덤 행동
- action = np.random.choice(self.actions)
- else:
- # 큐 함수에 따른 행동
- next_state = self.possible_next_state(state)
- action = self.arg_max(next_state)
- return int(action)
-
- # 후보가 여럿이면 arg_max를 계산하고 무작위로 하나를 반환
- @staticmethod
- def arg_max(next_state):
- max_index_list = []
- max_value = next_state[0]
- for index, value in enumerate(next_state):
- if value > max_value:
- max_index_list.clear()
- max_value = value
- max_index_list.append(index)
- elif value == max_value:
- max_index_list.append(index)
- return random.choice(max_index_list)
-
- # 가능한 다음 모든 상태들을 반환
- def possible_next_state(self, state):
- col, row = state
- next_state = [0.0] * 4
-
- if row != 0:
- next_state[0] = self.value_table[str([col, row - 1])]
- else:
- next_state[0] = self.value_table[str(state)]
- if row != self.height - 1:
- next_state[1] = self.value_table[str([col, row + 1])]
- else:
- next_state[1] = self.value_table[str(state)]
- if col != 0:
- next_state[2] = self.value_table[str([col - 1, row])]
- else:
- next_state[2] = self.value_table[str(state)]
- if col != self.width - 1:
- next_state[3] = self.value_table[str([col + 1, row])]
- else:
- next_state[3] = self.value_table[str(state)]
-
- return next_state
-
-
-# 메인 함수
-if __name__ == "__main__":
- env = Env()
- agent = MCAgent(actions=list(range(env.n_actions)))
-
- for episode in range(1000):
- state = env.reset()
- action = agent.get_action(state)
-
- while True:
- env.render()
-
- # 다음 상태로 이동
- # 보상은 숫자이고, 완료 여부는 boolean
- next_state, reward, done = env.step(action)
- agent.save_sample(next_state, reward, done)
-
- # 다음 행동 받아옴
- action = agent.get_action(next_state)
-
- # 에피소드가 완료됐을 때, 큐 함수 업데이트
- if done:
- agent.update()
- agent.samples.clear()
- break
diff --git a/1-grid-world/4-sarsa/sarsa_agent.py b/1-grid-world/3-sarsa/agent.py
similarity index 64%
rename from 1-grid-world/4-sarsa/sarsa_agent.py
rename to 1-grid-world/3-sarsa/agent.py
index 1668471..014d032 100644
--- a/1-grid-world/4-sarsa/sarsa_agent.py
+++ b/1-grid-world/3-sarsa/agent.py
@@ -7,17 +7,19 @@
class SARSAgent:
def __init__(self, actions):
self.actions = actions
- self.learning_rate = 0.01
+ self.step_size = 0.01
self.discount_factor = 0.9
self.epsilon = 0.1
+ # 0을 초기값으로 가지는 큐함수 테이블 생성
self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])
    # <s, a, r, s', a'>의 샘플로부터 큐함수를 업데이트
def learn(self, state, action, reward, next_state, next_action):
+ state, next_state = str(state), str(next_state)
current_q = self.q_table[state][action]
next_state_q = self.q_table[next_state][next_action]
- new_q = (current_q + self.learning_rate *
- (reward + self.discount_factor * next_state_q - current_q))
+ td = reward + self.discount_factor * next_state_q - current_q
+ new_q = current_q + self.step_size * td
self.q_table[state][action] = new_q
# 입실론 탐욕 정책에 따라서 행동을 반환
@@ -27,22 +29,18 @@ def get_action(self, state):
action = np.random.choice(self.actions)
else:
# 큐함수에 따른 행동 반환
- state_action = self.q_table[state]
- action = self.arg_max(state_action)
+ state = str(state)
+ q_list = self.q_table[state]
+ action = arg_max(q_list)
return action
- @staticmethod
- def arg_max(state_action):
- max_index_list = []
- max_value = state_action[0]
- for index, value in enumerate(state_action):
- if value > max_value:
- max_index_list.clear()
- max_value = value
- max_index_list.append(index)
- elif value == max_value:
- max_index_list.append(index)
- return random.choice(max_index_list)
+
+# 큐함수의 값에 따라 최적의 행동을 반환
+def arg_max(q_list):
+ max_idx_list = np.argwhere(q_list == np.amax(q_list))
+ max_idx_list = max_idx_list.flatten().tolist()
+ return random.choice(max_idx_list)
+
if __name__ == "__main__":
env = Env()
@@ -52,7 +50,7 @@ def arg_max(state_action):
# 게임 환경과 상태를 초기화
state = env.reset()
# 현재 상태에 대한 행동을 선택
- action = agent.get_action(str(state))
+ action = agent.get_action(state)
while True:
env.render()
@@ -60,10 +58,9 @@ def arg_max(state_action):
        # 행동을 취한 후 다음상태 보상 에피소드의 종료 여부를 받아옴
next_state, reward, done = env.step(action)
# 다음 상태에서의 다음 행동 선택
- next_action = agent.get_action(str(next_state))
-
+ next_action = agent.get_action(next_state)
        # <s,a,r,s',a'>로 큐함수를 업데이트
- agent.learn(str(state), action, reward, str(next_state), next_action)
+ agent.learn(state, action, reward, next_state, next_action)
state = next_state
action = next_action
@@ -72,5 +69,4 @@ def arg_max(state_action):
env.print_value_all(agent.q_table)
if done:
- break
-
+ break
\ No newline at end of file
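
The learn() method above is the tabular SARSA update Q(s,a) <- Q(s,a) + alpha * [r + gamma * Q(s',a') - Q(s,a)] applied to a defaultdict-backed table. A standalone sketch of just that update, assuming the same step size and discount factor (the sample transition is made up):

from collections import defaultdict

step_size, discount_factor = 0.01, 0.9
q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])

def sarsa_learn(state, action, reward, next_state, next_action):
    # bootstrap from the action actually taken next (on-policy)
    current_q = q_table[state][action]
    next_q = q_table[next_state][next_action]
    td_error = reward + discount_factor * next_q - current_q
    q_table[state][action] = current_q + step_size * td_error

sarsa_learn(str([1, 2]), 3, -1, str([2, 2]), 0)
print(q_table[str([1, 2])])  # [0.0, 0.0, 0.0, -0.01]
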
diff --git a/1-grid-world/4-sarsa/environment.py b/1-grid-world/3-sarsa/environment.py
similarity index 98%
rename from 1-grid-world/4-sarsa/environment.py
rename to 1-grid-world/3-sarsa/environment.py
index d8fe3eb..de34ed8 100644
--- a/1-grid-world/4-sarsa/environment.py
+++ b/1-grid-world/3-sarsa/environment.py
@@ -80,7 +80,7 @@ def print_value_all(self, q_table):
state = [x, y]
if str(state) in q_table.keys():
temp = q_table[str(state)][action]
- self.text_value(y, x, round(temp, 2), action)
+ self.text_value(y, x, round(temp, 3), action)
def coords_to_state(self, coords):
x = int((coords[0] - 50) / 100)
@@ -132,11 +132,8 @@ def step(self, action):
done = False
next_state = self.coords_to_state(next_state)
-
-
-
return next_state, reward, done
def render(self):
time.sleep(0.03)
- self.update()
+ self.update()
\ No newline at end of file
diff --git a/1-grid-world/5-q-learning/q_learning_agent.py b/1-grid-world/4-q-learning/agent.py
similarity index 65%
rename from 1-grid-world/5-q-learning/q_learning_agent.py
rename to 1-grid-world/4-q-learning/agent.py
index 496aeaf..811b00b 100644
--- a/1-grid-world/5-q-learning/q_learning_agent.py
+++ b/1-grid-world/4-q-learning/agent.py
@@ -3,21 +3,22 @@
from environment import Env
from collections import defaultdict
+
class QLearningAgent:
def __init__(self, actions):
- # 행동 = [0, 1, 2, 3] 순서대로 상, 하, 좌, 우
self.actions = actions
- self.learning_rate = 0.01
+ self.step_size = 0.01
self.discount_factor = 0.9
self.epsilon = 0.9
self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])
    # <s, a, r, s'> 샘플로부터 큐함수 업데이트
def learn(self, state, action, reward, next_state):
+ state, next_state = str(state), str(next_state)
q_1 = self.q_table[state][action]
# 벨만 최적 방정식을 사용한 큐함수의 업데이트
q_2 = reward + self.discount_factor * max(self.q_table[next_state])
- self.q_table[state][action] += self.learning_rate * (q_2 - q_1)
+ self.q_table[state][action] += self.step_size * (q_2 - q_1)
# 큐함수에 의거하여 입실론 탐욕 정책에 따라서 행동을 반환
def get_action(self, state):
@@ -26,22 +27,18 @@ def get_action(self, state):
action = np.random.choice(self.actions)
else:
# 큐함수에 따른 행동 반환
- state_action = self.q_table[state]
- action = self.arg_max(state_action)
+ state = str(state)
+ q_list = self.q_table[state]
+ action = arg_max(q_list)
return action
- @staticmethod
- def arg_max(state_action):
- max_index_list = []
- max_value = state_action[0]
- for index, value in enumerate(state_action):
- if value > max_value:
- max_index_list.clear()
- max_value = value
- max_index_list.append(index)
- elif value == max_value:
- max_index_list.append(index)
- return random.choice(max_index_list)
+
+# 큐함수의 값에 따라 최적의 행동을 반환
+def arg_max(q_list):
+ max_idx_list = np.argwhere(q_list == np.amax(q_list))
+ max_idx_list = max_idx_list.flatten().tolist()
+ return random.choice(max_idx_list)
+
if __name__ == "__main__":
env = Env()
@@ -51,16 +48,17 @@ def arg_max(state_action):
state = env.reset()
while True:
+ # 게임 환경과 상태를 초기화
env.render()
-
# 현재 상태에 대한 행동 선택
- action = agent.get_action(str(state))
+ action = agent.get_action(state)
# 행동을 취한 후 다음 상태, 보상 에피소드의 종료여부를 받아옴
next_state, reward, done = env.step(action)
-
        # <s,a,r,s'>로 큐함수를 업데이트
- agent.learn(str(state), action, reward, str(next_state))
+ agent.learn(state, action, reward, next_state)
+
state = next_state
+
# 모든 큐함수를 화면에 표시
env.print_value_all(agent.q_table)
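
Q-learning's learn() above differs from SARSA in one place: the target bootstraps from max(Q(s', .)) rather than from the next action actually taken, which is what makes it off-policy. A standalone sketch with invented Q-values:

from collections import defaultdict

step_size, discount_factor = 0.01, 0.9
q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])
q_table['next'] = [0.0, 0.2, -0.1, 0.05]   # pretend these were already learned

def q_learn(state, action, reward, next_state):
    q_1 = q_table[state][action]
    # greedy bootstrap over the next state's actions (off-policy)
    q_2 = reward + discount_factor * max(q_table[next_state])
    q_table[state][action] += step_size * (q_2 - q_1)

q_learn('cur', 0, 1.0, 'next')
print(q_table['cur'][0])  # ~0.0118 = 0.01 * (1.0 + 0.9 * 0.2)
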
diff --git a/1-grid-world/5-q-learning/environment.py b/1-grid-world/4-q-learning/environment.py
similarity index 98%
rename from 1-grid-world/5-q-learning/environment.py
rename to 1-grid-world/4-q-learning/environment.py
index 1accc84..f4e0793 100644
--- a/1-grid-world/5-q-learning/environment.py
+++ b/1-grid-world/4-q-learning/environment.py
@@ -81,7 +81,7 @@ def print_value_all(self, q_table):
state = [i, j]
if str(state) in q_table.keys():
temp = q_table[str(state)][action]
- self.text_value(j, i, round(temp, 2), action)
+ self.text_value(j, i, round(temp, 3), action)
def coords_to_state(self, coords):
x = int((coords[0] - 50) / 100)
diff --git a/1-grid-world/4-sarsa/.python-version b/1-grid-world/4-sarsa/.python-version
deleted file mode 100644
index 1545d96..0000000
--- a/1-grid-world/4-sarsa/.python-version
+++ /dev/null
@@ -1 +0,0 @@
-3.5.0
diff --git a/1-grid-world/6-deep-sarsa/environment.py b/1-grid-world/5-deep-sarsa/environment.py
similarity index 98%
rename from 1-grid-world/6-deep-sarsa/environment.py
rename to 1-grid-world/5-deep-sarsa/environment.py
index 2e47dd0..8239b5c 100755
--- a/1-grid-world/6-deep-sarsa/environment.py
+++ b/1-grid-world/5-deep-sarsa/environment.py
@@ -12,8 +12,9 @@
class Env(tk.Tk):
- def __init__(self):
+ def __init__(self, render_speed=0.01):
super(Env, self).__init__()
+ self.render_speed=render_speed
self.action_space = ['u', 'd', 'l', 'r']
self.action_size = len(self.action_space)
self.title('DeepSARSA')
@@ -102,7 +103,6 @@ def set_reward(self, state, reward):
self.rewards.append(temp)
# new methods
-
def check_if_reward(self, state):
check_list = dict()
check_list['if_goal'] = False
@@ -232,5 +232,5 @@ def move(self, target, action):
def render(self):
# 게임 속도 조정
- time.sleep(0.05)
+ time.sleep(self.render_speed)
self.update()
diff --git a/1-grid-world/5-deep-sarsa/save_graph/graph_trained.png b/1-grid-world/5-deep-sarsa/save_graph/graph_trained.png
new file mode 100644
index 0000000..b002992
Binary files /dev/null and b/1-grid-world/5-deep-sarsa/save_graph/graph_trained.png differ
diff --git a/1-grid-world/5-deep-sarsa/save_model/checkpoint b/1-grid-world/5-deep-sarsa/save_model/checkpoint
new file mode 100644
index 0000000..a6e034f
--- /dev/null
+++ b/1-grid-world/5-deep-sarsa/save_model/checkpoint
@@ -0,0 +1,2 @@
+model_checkpoint_path: "model"
+all_model_checkpoint_paths: "model"
diff --git a/1-grid-world/5-deep-sarsa/save_model/model.data-00000-of-00001 b/1-grid-world/5-deep-sarsa/save_model/model.data-00000-of-00001
new file mode 100644
index 0000000..5df8913
Binary files /dev/null and b/1-grid-world/5-deep-sarsa/save_model/model.data-00000-of-00001 differ
diff --git a/1-grid-world/5-deep-sarsa/save_model/model.index b/1-grid-world/5-deep-sarsa/save_model/model.index
new file mode 100644
index 0000000..cec03a1
Binary files /dev/null and b/1-grid-world/5-deep-sarsa/save_model/model.index differ
diff --git a/1-grid-world/5-deep-sarsa/save_model/trained/checkpoint b/1-grid-world/5-deep-sarsa/save_model/trained/checkpoint
new file mode 100644
index 0000000..a6e034f
--- /dev/null
+++ b/1-grid-world/5-deep-sarsa/save_model/trained/checkpoint
@@ -0,0 +1,2 @@
+model_checkpoint_path: "model"
+all_model_checkpoint_paths: "model"
diff --git a/1-grid-world/5-deep-sarsa/save_model/trained/model.data-00000-of-00001 b/1-grid-world/5-deep-sarsa/save_model/trained/model.data-00000-of-00001
new file mode 100644
index 0000000..64ef600
Binary files /dev/null and b/1-grid-world/5-deep-sarsa/save_model/trained/model.data-00000-of-00001 differ
diff --git a/1-grid-world/5-deep-sarsa/save_model/trained/model.index b/1-grid-world/5-deep-sarsa/save_model/trained/model.index
new file mode 100644
index 0000000..42d21e2
Binary files /dev/null and b/1-grid-world/5-deep-sarsa/save_model/trained/model.index differ
diff --git a/1-grid-world/5-deep-sarsa/test.py b/1-grid-world/5-deep-sarsa/test.py
new file mode 100644
index 0000000..1e193ae
--- /dev/null
+++ b/1-grid-world/5-deep-sarsa/test.py
@@ -0,0 +1,74 @@
+import random
+import numpy as np
+from environment import Env
+import tensorflow as tf
+from tensorflow.keras.layers import Dense
+
+
+# 딥살사 인공신경망
+class DeepSARSA(tf.keras.Model):
+ def __init__(self, action_size):
+ super(DeepSARSA, self).__init__()
+ self.fc1 = Dense(30, activation='relu')
+ self.fc2 = Dense(30, activation='relu')
+ self.fc_out = Dense(action_size)
+
+ def call(self, x):
+ x = self.fc1(x)
+ x = self.fc2(x)
+ q = self.fc_out(x)
+ return q
+
+
+# 그리드월드 예제에서의 딥살사 에이전트
+class DeepSARSAgent:
+ def __init__(self, state_size, action_size):
+ # 상태의 크기와 행동의 크기 정의
+ self.state_size = state_size
+ self.action_size = action_size
+
+ self.epsilon = 0.01
+ self.model = DeepSARSA(self.action_size)
+ self.model.load_weights('save_model/trained/model')
+
+ # 입실론 탐욕 정책으로 행동 선택
+ def get_action(self, state):
+ if np.random.rand() <= self.epsilon:
+ return random.randrange(self.action_size)
+ else:
+ q_values = self.model(state)
+ return np.argmax(q_values[0])
+
+
+if __name__ == "__main__":
+ # 환경과 에이전트 생성
+ env = Env(render_speed=0.05)
+ state_size = 15
+ action_space = [0, 1, 2, 3, 4]
+ action_size = len(action_space)
+ agent = DeepSARSAgent(state_size, action_size)
+
+ scores, episodes = [], []
+
+ EPISODES = 10
+ for e in range(EPISODES):
+ score = 0
+ done = False
+ # env 초기화
+ state = env.reset()
+ state = np.reshape(state, [1, state_size])
+
+ while not done:
+ # 현재 상태에 대한 행동 선택
+ action = agent.get_action(state)
+
+ # 선택한 행동으로 환경에서 한 타임스텝 진행 후 샘플 수집
+ next_state, reward, done = env.step(action)
+ next_state = np.reshape(next_state, [1, state_size])
+
+ state = next_state
+ score += reward
+
+ if done:
+ # 에피소드마다 학습 결과 출력
+ print("episode: {:3d} | score: {:3d}".format(e, score))
\ No newline at end of file
diff --git a/1-grid-world/5-deep-sarsa/train.py b/1-grid-world/5-deep-sarsa/train.py
new file mode 100644
index 0000000..99f530d
--- /dev/null
+++ b/1-grid-world/5-deep-sarsa/train.py
@@ -0,0 +1,123 @@
+import copy
+import pylab
+import random
+import numpy as np
+from environment import Env
+import tensorflow as tf
+from tensorflow.keras.layers import Dense
+from tensorflow.keras.optimizers import Adam
+
+
+# 딥살사 인공신경망
+class DeepSARSA(tf.keras.Model):
+ def __init__(self, action_size):
+ super(DeepSARSA, self).__init__()
+ self.fc1 = Dense(30, activation='relu')
+ self.fc2 = Dense(30, activation='relu')
+ self.fc_out = Dense(action_size)
+
+ def call(self, x):
+ x = self.fc1(x)
+ x = self.fc2(x)
+ q = self.fc_out(x)
+ return q
+
+
+# 그리드월드 예제에서의 딥살사 에이전트
+class DeepSARSAgent:
+ def __init__(self, state_size, action_size):
+ # 상태의 크기와 행동의 크기 정의
+ self.state_size = state_size
+ self.action_size = action_size
+
+ # 딥살사 하이퍼 파라메터
+ self.discount_factor = 0.99
+ self.learning_rate = 0.001
+ self.epsilon = 1.
+ self.epsilon_decay = .9999
+ self.epsilon_min = 0.01
+ self.model = DeepSARSA(self.action_size)
+ self.optimizer = Adam(lr=self.learning_rate)
+
+ # 입실론 탐욕 정책으로 행동 선택
+ def get_action(self, state):
+ if np.random.rand() <= self.epsilon:
+ return random.randrange(self.action_size)
+ else:
+ q_values = self.model(state)
+ return np.argmax(q_values[0])
+
+    # <s, a, r, s', a'>의 샘플로부터 모델 업데이트
+ def train_model(self, state, action, reward, next_state, next_action, done):
+ if self.epsilon > self.epsilon_min:
+ self.epsilon *= self.epsilon_decay
+
+ # 학습 파라메터
+ model_params = self.model.trainable_variables
+ with tf.GradientTape() as tape:
+ tape.watch(model_params)
+ predict = self.model(state)[0]
+ one_hot_action = tf.one_hot([action], self.action_size)
+ predict = tf.reduce_sum(one_hot_action * predict, axis=1)
+
+ # done = True 일 경우 에피소드가 끝나서 다음 상태가 없음
+ next_q = self.model(next_state)[0][next_action]
+ target = reward + (1 - done) * self.discount_factor * next_q
+
+ # MSE 오류 함수 계산
+ loss = tf.reduce_mean(tf.square(target - predict))
+
+ # 오류함수를 줄이는 방향으로 모델 업데이트
+ grads = tape.gradient(loss, model_params)
+ self.optimizer.apply_gradients(zip(grads, model_params))
+
+
+if __name__ == "__main__":
+ # 환경과 에이전트 생성
+ env = Env(render_speed=0.01)
+ state_size = 15
+ action_space = [0, 1, 2, 3, 4]
+ action_size = len(action_space)
+ agent = DeepSARSAgent(state_size, action_size)
+
+ scores, episodes = [], []
+
+ EPISODES = 1000
+ for e in range(EPISODES):
+ done = False
+ score = 0
+ # env 초기화
+ state = env.reset()
+ state = np.reshape(state, [1, state_size])
+
+ while not done:
+ # 현재 상태에 대한 행동 선택
+ action = agent.get_action(state)
+
+ # 선택한 행동으로 환경에서 한 타임스텝 진행 후 샘플 수집
+ next_state, reward, done = env.step(action)
+ next_state = np.reshape(next_state, [1, state_size])
+ next_action = agent.get_action(next_state)
+
+ # 샘플로 모델 학습
+ agent.train_model(state, action, reward, next_state,
+ next_action, done)
+ score += reward
+ state = next_state
+
+ if done:
+ # 에피소드마다 학습 결과 출력
+ print("episode: {:3d} | score: {:3d} | epsilon: {:.3f}".format(
+ e, score, agent.epsilon))
+
+ scores.append(score)
+ episodes.append(e)
+ pylab.plot(episodes, scores, 'b')
+ pylab.xlabel("episode")
+ pylab.ylabel("score")
+ pylab.savefig("./save_graph/graph.png")
+
+
+ # 100 에피소드마다 모델 저장
+ if e % 100 == 0:
+ agent.model.save_weights('save_model/model', save_format='tf')
\ No newline at end of file
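
One detail worth isolating from train_model above: the Q-value of the chosen action is picked out with a one-hot mask and tf.reduce_sum, and the same pattern reappears in the DQN training code below, where it selects one Q-value per sample across a whole batch. A small sketch of that pattern (assumes TensorFlow 2.x; the Q-values are arbitrary):

import tensorflow as tf

action_size = 5
predict = tf.constant([[0.1, 0.4, -0.2, 0.0, 0.3]])  # model output, shape (1, 5)
action = 1                                           # index of the chosen action

one_hot_action = tf.one_hot([action], action_size)   # [[0., 1., 0., 0., 0.]]
q_of_action = tf.reduce_sum(one_hot_action * predict, axis=1)
print(q_of_action.numpy())  # [0.4]
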
diff --git a/1-grid-world/5-q-learning/.python-version b/1-grid-world/5-q-learning/.python-version
deleted file mode 100644
index 1545d96..0000000
--- a/1-grid-world/5-q-learning/.python-version
+++ /dev/null
@@ -1 +0,0 @@
-3.5.0
diff --git a/1-grid-world/6-deep-sarsa/deep_sarsa_agent.py b/1-grid-world/6-deep-sarsa/deep_sarsa_agent.py
deleted file mode 100755
index 1af7fda..0000000
--- a/1-grid-world/6-deep-sarsa/deep_sarsa_agent.py
+++ /dev/null
@@ -1,118 +0,0 @@
-import copy
-import pylab
-import random
-import numpy as np
-from environment import Env
-from keras.layers import Dense
-from keras.optimizers import Adam
-from keras.models import Sequential
-
-EPISODES = 1000
-
-
-# 그리드월드 예제에서의 딥살사 에이전트
-class DeepSARSAgent:
- def __init__(self):
- self.load_model = False
- # 에이전트가 가능한 모든 행동 정의
- self.action_space = [0, 1, 2, 3, 4]
- # 상태의 크기와 행동의 크기 정의
- self.action_size = len(self.action_space)
- self.state_size = 15
- self.discount_factor = 0.99
- self.learning_rate = 0.001
-
- self.epsilon = 1. # exploration
- self.epsilon_decay = .9999
- self.epsilon_min = 0.01
- self.model = self.build_model()
-
- if self.load_model:
- self.epsilon = 0.05
- self.model.load_weights('./save_model/deep_sarsa_trained.h5')
-
- # 상태가 입력 큐함수가 출력인 인공신경망 생성
- def build_model(self):
- model = Sequential()
- model.add(Dense(30, input_dim=self.state_size, activation='relu'))
- model.add(Dense(30, activation='relu'))
- model.add(Dense(self.action_size, activation='linear'))
- model.summary()
- model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
- return model
-
- # 입실론 탐욕 방법으로 행동 선택
- def get_action(self, state):
- if np.random.rand() <= self.epsilon:
- # 무작위 행동 반환
- return random.randrange(self.action_size)
- else:
- # 모델로부터 행동 산출
- state = np.float32(state)
- q_values = self.model.predict(state)
- return np.argmax(q_values[0])
-
- def train_model(self, state, action, reward, next_state, next_action, done):
- if self.epsilon > self.epsilon_min:
- self.epsilon *= self.epsilon_decay
-
- state = np.float32(state)
- next_state = np.float32(next_state)
- target = self.model.predict(state)[0]
- # 살사의 큐함수 업데이트 식
- if done:
- target[action] = reward
- else:
- target[action] = (reward + self.discount_factor *
- self.model.predict(next_state)[0][next_action])
-
- # 출력 값 reshape
- target = np.reshape(target, [1, 5])
- # 인공신경망 업데이트
- self.model.fit(state, target, epochs=1, verbose=0)
-
-
-if __name__ == "__main__":
- # 환경과 에이전트 생성
- env = Env()
- agent = DeepSARSAgent()
-
- global_step = 0
- scores, episodes = [], []
-
- for e in range(EPISODES):
- done = False
- score = 0
- state = env.reset()
- state = np.reshape(state, [1, 15])
-
- while not done:
- # env 초기화
- global_step += 1
-
- # 현재 상태에 대한 행동 선택
- action = agent.get_action(state)
- # 선택한 행동으로 환경에서 한 타임스텝 진행 후 샘플 수집
- next_state, reward, done = env.step(action)
- next_state = np.reshape(next_state, [1, 15])
- next_action = agent.get_action(next_state)
- # 샘플로 모델 학습
- agent.train_model(state, action, reward, next_state, next_action,
- done)
- state = next_state
- score += reward
-
- state = copy.deepcopy(next_state)
-
- if done:
- # 에피소드마다 학습 결과 출력
- scores.append(score)
- episodes.append(e)
- pylab.plot(episodes, scores, 'b')
- pylab.savefig("./save_graph/deep_sarsa_.png")
- print("episode:", e, " score:", score, "global_step",
- global_step, " epsilon:", agent.epsilon)
-
- # 100 에피소드마다 모델 저장
- if e % 100 == 0:
- agent.model.save_weights("./save_model/deep_sarsa.h5")
diff --git a/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png b/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png
deleted file mode 100644
index 8dec1d0..0000000
Binary files a/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png and /dev/null differ
diff --git a/1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5 b/1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5
deleted file mode 100644
index 23ba39c..0000000
Binary files a/1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5 and /dev/null differ
diff --git a/1-grid-world/7-reinforce/environment.py b/1-grid-world/6-reinforce/environment.py
similarity index 94%
rename from 1-grid-world/7-reinforce/environment.py
rename to 1-grid-world/6-reinforce/environment.py
index 816c6f0..613a2b2 100644
--- a/1-grid-world/7-reinforce/environment.py
+++ b/1-grid-world/6-reinforce/environment.py
@@ -5,18 +5,19 @@
PhotoImage = ImageTk.PhotoImage
UNIT = 50 # 픽셀 수
-HEIGHT = 5 # 그리드월드 세로
-WIDTH = 5 # 그리드월드 가로
+HEIGHT = 5 # 그리드 세로
+WIDTH = 5 # 그리드 가로
np.random.seed(1)
class Env(tk.Tk):
- def __init__(self):
+ def __init__(self, render_speed=0.01):
super(Env, self).__init__()
+ self.render_speed=render_speed
self.action_space = ['u', 'd', 'l', 'r']
self.action_size = len(self.action_space)
- self.title('Reinforce')
+ self.title('REINFORCE')
self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT))
self.shapes = self.load_images()
self.canvas = self._build_canvas()
@@ -27,7 +28,7 @@ def __init__(self):
self.set_reward([0, 1], -1)
self.set_reward([1, 2], -1)
self.set_reward([2, 3], -1)
- # 목표지점 설정
+ # 목표 지점 설정
self.set_reward([4, 4], 1)
def _build_canvas(self):
@@ -73,7 +74,7 @@ def reset_reward(self):
self.set_reward([1, 2], -1)
self.set_reward([2, 3], -1)
- # 목표 지점
+ # #goal
self.set_reward([4, 4], 1)
def set_reward(self, state, reward):
@@ -101,6 +102,7 @@ def set_reward(self, state, reward):
temp['state'] = state
self.rewards.append(temp)
+ # new methods
def check_if_reward(self, state):
check_list = dict()
check_list['if_goal'] = False
@@ -109,7 +111,7 @@ def check_if_reward(self, state):
for reward in self.rewards:
if reward['state'] == state:
rewards += reward['reward']
- if reward['reward'] > 0:
+ if reward['reward'] == 1:
check_list['if_goal'] = True
check_list['rewards'] = rewards
@@ -123,6 +125,7 @@ def coords_to_state(self, coords):
def reset(self):
self.update()
+ time.sleep(0.5)
x, y = self.canvas.coords(self.rectangle)
self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
self.reset_reward()
@@ -139,7 +142,7 @@ def step(self, action):
check = self.check_if_reward(self.coords_to_state(next_coords))
done = check['if_goal']
reward = check['rewards']
- reward -= 0.1
+
self.canvas.tag_raise(self.rectangle)
s_ = self.get_state()
@@ -169,7 +172,7 @@ def get_state(self):
def move_rewards(self):
new_rewards = []
for temp in self.rewards:
- if temp['reward'] > 0:
+ if temp['reward'] == 1:
new_rewards.append(temp)
continue
temp['coords'] = self.move_const(temp)
@@ -218,7 +221,7 @@ def move(self, target, action):
if s[0] < (WIDTH - 1) * UNIT:
base_action[0] += UNIT
elif action == 3: # 좌
- if s[0] > UNIT:
+ if s[0] > UNIT:
base_action[0] -= UNIT
self.canvas.move(target, base_action[0], base_action[1])
@@ -229,5 +232,5 @@ def move(self, target, action):
def render(self):
# 게임 속도 조정
- time.sleep(0.07)
+ time.sleep(self.render_speed)
self.update()
diff --git a/1-grid-world/6-reinforce/save_graph/graph_trained.png b/1-grid-world/6-reinforce/save_graph/graph_trained.png
new file mode 100644
index 0000000..69f0fb2
Binary files /dev/null and b/1-grid-world/6-reinforce/save_graph/graph_trained.png differ
diff --git a/1-grid-world/6-reinforce/save_model/trained/checkpoint b/1-grid-world/6-reinforce/save_model/trained/checkpoint
new file mode 100644
index 0000000..a6e034f
--- /dev/null
+++ b/1-grid-world/6-reinforce/save_model/trained/checkpoint
@@ -0,0 +1,2 @@
+model_checkpoint_path: "model"
+all_model_checkpoint_paths: "model"
diff --git a/1-grid-world/6-reinforce/save_model/trained/model.data-00000-of-00001 b/1-grid-world/6-reinforce/save_model/trained/model.data-00000-of-00001
new file mode 100644
index 0000000..2ca52a5
Binary files /dev/null and b/1-grid-world/6-reinforce/save_model/trained/model.data-00000-of-00001 differ
diff --git a/1-grid-world/6-reinforce/save_model/trained/model.index b/1-grid-world/6-reinforce/save_model/trained/model.index
new file mode 100644
index 0000000..2878aef
Binary files /dev/null and b/1-grid-world/6-reinforce/save_model/trained/model.index differ
diff --git a/1-grid-world/6-reinforce/test.py b/1-grid-world/6-reinforce/test.py
new file mode 100644
index 0000000..7c1bacc
--- /dev/null
+++ b/1-grid-world/6-reinforce/test.py
@@ -0,0 +1,72 @@
+import copy
+import pylab
+import random
+import numpy as np
+from environment import Env
+import tensorflow as tf
+from tensorflow.keras.layers import Dense
+from tensorflow.keras.optimizers import Adam
+
+
+# 상태가 입력, 각 행동의 확률이 출력인 인공신경망 생성
+class REINFORCE(tf.keras.Model):
+ def __init__(self, action_size):
+ super(REINFORCE, self).__init__()
+ self.fc1 = Dense(24, activation='relu')
+ self.fc2 = Dense(24, activation='relu')
+ self.fc_out = Dense(action_size, activation='softmax')
+
+ def call(self, x):
+ x = self.fc1(x)
+ x = self.fc2(x)
+ policy = self.fc_out(x)
+ return policy
+
+
+# 그리드월드 예제에서의 REINFORCE 에이전트
+class REINFORCEAgent:
+ def __init__(self, state_size, action_size):
+ # 상태의 크기와 행동의 크기 정의
+ self.state_size = state_size
+ self.action_size = action_size
+
+ self.model = REINFORCE(self.action_size)
+ self.model.load_weights('save_model/trained/model')
+
+ # 정책신경망으로 행동 선택
+ def get_action(self, state):
+ policy = self.model(state)[0]
+ policy = np.array(policy)
+ return np.random.choice(self.action_size, 1, p=policy)[0]
+
+
+if __name__ == "__main__":
+ # 환경과 에이전트 생성
+ env = Env(render_speed=0.05)
+ state_size = 15
+ action_space = [0, 1, 2, 3, 4]
+ action_size = len(action_space)
+ agent = REINFORCEAgent(state_size, action_size)
+
+ EPISODES = 10
+ for e in range(EPISODES):
+ done = False
+ score = 0
+ # env 초기화
+ state = env.reset()
+ state = np.reshape(state, [1, state_size])
+
+ while not done:
+ # 현재 상태에 대한 행동 선택
+ action = agent.get_action(state)
+
+ # 선택한 행동으로 환경에서 한 타임스텝 진행 후 샘플 수집
+ next_state, reward, done = env.step(action)
+ next_state = np.reshape(next_state, [1, state_size])
+
+ score += reward
+
+ state = next_state
+
+ if done:
+ print("episode: {:3d} | score: {:3d}".format(e, score))
\ No newline at end of file
diff --git a/1-grid-world/6-reinforce/train.py b/1-grid-world/6-reinforce/train.py
new file mode 100644
index 0000000..0eacae2
--- /dev/null
+++ b/1-grid-world/6-reinforce/train.py
@@ -0,0 +1,136 @@
+import copy
+import pylab
+import random
+import numpy as np
+from environment import Env
+import tensorflow as tf
+from tensorflow.keras.layers import Dense
+from tensorflow.keras.optimizers import Adam
+
+
+# 상태가 입력, 각 행동의 확률이 출력인 인공신경망 생성
+class REINFORCE(tf.keras.Model):
+ def __init__(self, action_size):
+ super(REINFORCE, self).__init__()
+ self.fc1 = Dense(24, activation='relu')
+ self.fc2 = Dense(24, activation='relu')
+ self.fc_out = Dense(action_size, activation='softmax')
+
+ def call(self, x):
+ x = self.fc1(x)
+ x = self.fc2(x)
+ policy = self.fc_out(x)
+ return policy
+
+
+# 그리드월드 예제에서의 REINFORCE 에이전트
+class REINFORCEAgent:
+ def __init__(self, state_size, action_size):
+ # 상태의 크기와 행동의 크기 정의
+ self.state_size = state_size
+ self.action_size = action_size
+
+ # REINFORCE 하이퍼 파라메터
+ self.discount_factor = 0.99
+ self.learning_rate = 0.001
+
+ self.model = REINFORCE(self.action_size)
+ self.optimizer = Adam(lr=self.learning_rate)
+ self.states, self.actions, self.rewards = [], [], []
+
+ # 정책신경망으로 행동 선택
+ def get_action(self, state):
+ policy = self.model(state)[0]
+ policy = np.array(policy)
+ return np.random.choice(self.action_size, 1, p=policy)[0]
+
+ # 반환값 계산
+ def discount_rewards(self, rewards):
+ discounted_rewards = np.zeros_like(rewards)
+ running_add = 0
+ for t in reversed(range(0, len(rewards))):
+ running_add = running_add * self.discount_factor + rewards[t]
+ discounted_rewards[t] = running_add
+ return discounted_rewards
+
+ # 한 에피소드 동안의 상태, 행동, 보상을 저장
+ def append_sample(self, state, action, reward):
+ self.states.append(state[0])
+ self.rewards.append(reward)
+ act = np.zeros(self.action_size)
+ act[action] = 1
+ self.actions.append(act)
+
+ # 정책신경망 업데이트
+ def train_model(self):
+ discounted_rewards = np.float32(self.discount_rewards(self.rewards))
+ discounted_rewards -= np.mean(discounted_rewards)
+ discounted_rewards /= np.std(discounted_rewards)
+
+ # 크로스 엔트로피 오류함수 계산
+ model_params = self.model.trainable_variables
+ with tf.GradientTape() as tape:
+ tape.watch(model_params)
+ policies = self.model(np.array(self.states))
+ actions = np.array(self.actions)
+ action_prob = tf.reduce_sum(actions * policies, axis=1)
+ cross_entropy = - tf.math.log(action_prob + 1e-5)
+ loss = tf.reduce_sum(cross_entropy * discounted_rewards)
+ entropy = - policies * tf.math.log(policies)
+
+ # 오류함수를 줄이는 방향으로 모델 업데이트
+ grads = tape.gradient(loss, model_params)
+ self.optimizer.apply_gradients(zip(grads, model_params))
+ self.states, self.actions, self.rewards = [], [], []
+ return np.mean(entropy)
+
+
+if __name__ == "__main__":
+ # 환경과 에이전트 생성
+ env = Env(render_speed=0.01)
+ state_size = 15
+ action_space = [0, 1, 2, 3, 4]
+ action_size = len(action_space)
+ agent = REINFORCEAgent(state_size, action_size)
+
+ scores, episodes = [], []
+
+ EPISODES = 200
+ for e in range(EPISODES):
+ done = False
+ score = 0
+ # env 초기화
+ state = env.reset()
+ state = np.reshape(state, [1, state_size])
+
+ while not done:
+ # 현재 상태에 대한 행동 선택
+ action = agent.get_action(state)
+
+ # 선택한 행동으로 환경에서 한 타임스텝 진행 후 샘플 수집
+ next_state, reward, done = env.step(action)
+ next_state = np.reshape(next_state, [1, state_size])
+
+ agent.append_sample(state, action, reward)
+ score += reward
+
+ state = next_state
+
+ if done:
+ # 에피소드마다 정책신경망 업데이트
+ entropy = agent.train_model()
+ # 에피소드마다 학습 결과 출력
+ print("episode: {:3d} | score: {:3d} | entropy: {:.3f}".format(
+ e, score, entropy))
+
+ scores.append(score)
+ episodes.append(e)
+ pylab.plot(episodes, scores, 'b')
+ pylab.xlabel("episode")
+ pylab.ylabel("score")
+ pylab.savefig("./save_graph/graph.png")
+
+
+ # 100 에피소드마다 모델 저장
+ if e % 100 == 0:
+ agent.model.save_weights('save_model/model', save_format='tf')
\ No newline at end of file
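
discount_rewards() above turns one episode's per-step rewards into returns by sweeping backwards, so each entry accumulates the discounted future rewards; train_model then normalizes these returns before weighting the cross-entropy loss. A worked example of the backward sweep with discount factor 0.99 (the reward sequence is made up):

import numpy as np

def discount_rewards(rewards, discount_factor=0.99):
    discounted_rewards = np.zeros_like(rewards)
    running_add = 0
    for t in reversed(range(len(rewards))):
        running_add = running_add * discount_factor + rewards[t]
        discounted_rewards[t] = running_add
    return discounted_rewards

print(discount_rewards([0.0, -1.0, 1.0]))
# ~[-0.0099 -0.01  1.  ]  (the final reward is discounted as it propagates back)
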
diff --git a/1-grid-world/7-reinforce/reinforce_agent.py b/1-grid-world/7-reinforce/reinforce_agent.py
deleted file mode 100644
index 6c2aa4e..0000000
--- a/1-grid-world/7-reinforce/reinforce_agent.py
+++ /dev/null
@@ -1,131 +0,0 @@
-import copy
-import pylab
-import numpy as np
-from environment import Env
-from keras.layers import Dense
-from keras.optimizers import Adam
-from keras.models import Sequential
-from keras import backend as K
-
-EPISODES = 2500
-
-# 그리드월드 예제에서의 REINFORCE 에이전트
-class ReinforceAgent:
- def __init__(self):
- self.load_model = False
- # 가능한 모든 행동 정의
- self.action_space = [0, 1, 2, 3, 4]
- # 상태와 행동의 크기 정의
- self.action_size = len(self.action_space)
- self.state_size = 15
- self.discount_factor = 0.99
- self.learning_rate = 0.001
-
- self.model = self.build_model()
- self.optimizer = self.optimizer()
- self.states, self.actions, self.rewards = [], [], []
-
- if self.load_model:
- self.model.load_weights('./save_model/reinforce_trained.h5')
-
- # 상태가 입력, 각 행동의 확률이 출력인 인공신경망 생성
- def build_model(self):
- model = Sequential()
- model.add(Dense(24, input_dim=self.state_size, activation='relu'))
- model.add(Dense(24, activation='relu'))
- model.add(Dense(self.action_size, activation='softmax'))
- model.summary()
- return model
-
- # 정책신경망을 업데이트 하기 위한 오류함수와 훈련함수의 생성
- def optimizer(self):
- action = K.placeholder(shape=[None, 5])
- discounted_rewards = K.placeholder(shape=[None, ])
-
- # 크로스 엔트로피 오류함수 계산
- action_prob = K.sum(action * self.model.output, axis=1)
- cross_entropy = K.log(action_prob) * discounted_rewards
- loss = -K.sum(cross_entropy)
-
- # 정책신경망을 업데이트하는 훈련함수 생성
- optimizer = Adam(lr=self.learning_rate)
- updates = optimizer.get_updates(self.model.trainable_weights,[],
- loss)
- train = K.function([self.model.input, action, discounted_rewards], [],
- updates=updates)
-
- return train
-
- # 정책신경망으로 행동 선택
- def get_action(self, state):
- policy = self.model.predict(state)[0]
- return np.random.choice(self.action_size, 1, p=policy)[0]
-
- # 반환값 계산
- def discount_rewards(self, rewards):
- discounted_rewards = np.zeros_like(rewards)
- running_add = 0
- for t in reversed(range(0, len(rewards))):
- running_add = running_add * self.discount_factor + rewards[t]
- discounted_rewards[t] = running_add
- return discounted_rewards
-
- # 한 에피소드 동안의 상태, 행동, 보상을 저장
- def append_sample(self, state, action, reward):
- self.states.append(state[0])
- self.rewards.append(reward)
- act = np.zeros(self.action_size)
- act[action] = 1
- self.actions.append(act)
-
- # 정책신경망 업데이트
- def train_model(self):
- discounted_rewards = np.float32(self.discount_rewards(self.rewards))
- discounted_rewards -= np.mean(discounted_rewards)
- discounted_rewards /= np.std(discounted_rewards)
-
- self.optimizer([self.states, self.actions, discounted_rewards])
- self.states, self.actions, self.rewards = [], [], []
-
-
-if __name__ == "__main__":
- # 환경과 에이전트의 생성
- env = Env()
- agent = ReinforceAgent()
-
- global_step = 0
- scores, episodes = [], []
-
- for e in range(EPISODES):
- done = False
- score = 0
- # env 초기화
- state = env.reset()
- state = np.reshape(state, [1, 15])
-
- while not done:
- global_step += 1
- # 현재 상태에 대한 행동 선택
- action = agent.get_action(state)
- # 선택한 행동으로 환경에서 한 타임스탭 진행 후 샘플 수집
- next_state, reward, done = env.step(action)
- next_state = np.reshape(next_state, [1, 15])
-
- agent.append_sample(state, action, reward)
- score += reward
- state = copy.deepcopy(next_state)
-
- if done:
- # 에피소드마다 정책신경망 업데이트
- agent.train_model()
- scores.append(score)
- episodes.append(e)
- score = round(score,2)
- print("episode:", e, " score:", score, " time_step:",
- global_step)
-
- # 100 에피소드마다 학습 결과 출력 및 모델 저장
- if e % 100 == 0:
- pylab.plot(episodes, scores, 'b')
- pylab.savefig("./save_graph/reinforce.png")
- agent.model.save_weights("./save_model/reinforce.h5")
diff --git a/1-grid-world/7-reinforce/save_graph/reinforce_trained.png b/1-grid-world/7-reinforce/save_graph/reinforce_trained.png
deleted file mode 100644
index 3be9edb..0000000
Binary files a/1-grid-world/7-reinforce/save_graph/reinforce_trained.png and /dev/null differ
diff --git a/1-grid-world/7-reinforce/save_model/reinforce_trained.h5 b/1-grid-world/7-reinforce/save_model/reinforce_trained.h5
deleted file mode 100644
index cb206f5..0000000
Binary files a/1-grid-world/7-reinforce/save_model/reinforce_trained.h5 and /dev/null differ
diff --git a/1-grid-world/README.md b/1-grid-world/README.md
deleted file mode 100644
index a955308..0000000
--- a/1-grid-world/README.md
+++ /dev/null
@@ -1,39 +0,0 @@
-# Grid World with Reinforcement Learning
-This is Grid World example that we made for the simple algorithm test
-The game is simple. The red rectangle must arrive in the circle, avoiding triangle.
-
-

-
-
-
-
-
-## Dynamic Programming
-**1. Policy Iteration**
-
-**2. Value Iteration**
-
-
-
-## Reinforcement Learning Fundamental Algorithms
-**3. Monte-Carlo**
-
-**4. SARSA**
-
-**5. Q-Learning**
-
-
-
-## Futher Reinforcement Learning Algorithms
->we have changed Grid World so the obstacles are moving. To solve this problem, we have to use function approximator.
-We used Neural Network as function approximator
-
-
-
-
-
-**6. DQN**
-
-**7. Policy Gradient**
-
-
diff --git a/1-grid-world/gridworld.png b/1-grid-world/gridworld.png
deleted file mode 100644
index 71468d4..0000000
Binary files a/1-grid-world/gridworld.png and /dev/null differ
diff --git a/2-cartpole/1-dqn/save_graph/cartpole_dqn.png b/2-cartpole/1-dqn/save_graph/cartpole_dqn.png
deleted file mode 100644
index 384fef6..0000000
Binary files a/2-cartpole/1-dqn/save_graph/cartpole_dqn.png and /dev/null differ
diff --git a/2-cartpole/1-dqn/save_graph/graph_trained.png b/2-cartpole/1-dqn/save_graph/graph_trained.png
new file mode 100644
index 0000000..05d6c45
Binary files /dev/null and b/2-cartpole/1-dqn/save_graph/graph_trained.png differ
diff --git a/2-cartpole/1-dqn/save_model/cartpole_dqn_trained.h5 b/2-cartpole/1-dqn/save_model/cartpole_dqn_trained.h5
deleted file mode 100644
index 50edb6e..0000000
Binary files a/2-cartpole/1-dqn/save_model/cartpole_dqn_trained.h5 and /dev/null differ
diff --git a/2-cartpole/1-dqn/save_model/trained/checkpoint b/2-cartpole/1-dqn/save_model/trained/checkpoint
new file mode 100644
index 0000000..a6e034f
--- /dev/null
+++ b/2-cartpole/1-dqn/save_model/trained/checkpoint
@@ -0,0 +1,2 @@
+model_checkpoint_path: "model"
+all_model_checkpoint_paths: "model"
diff --git a/2-cartpole/1-dqn/save_model/trained/model.data-00000-of-00001 b/2-cartpole/1-dqn/save_model/trained/model.data-00000-of-00001
new file mode 100644
index 0000000..5476cf1
Binary files /dev/null and b/2-cartpole/1-dqn/save_model/trained/model.data-00000-of-00001 differ
diff --git a/2-cartpole/1-dqn/save_model/trained/model.index b/2-cartpole/1-dqn/save_model/trained/model.index
new file mode 100644
index 0000000..b21f192
Binary files /dev/null and b/2-cartpole/1-dqn/save_model/trained/model.index differ
diff --git a/2-cartpole/1-dqn/test.py b/2-cartpole/1-dqn/test.py
new file mode 100644
index 0000000..2e92ec0
--- /dev/null
+++ b/2-cartpole/1-dqn/test.py
@@ -0,0 +1,76 @@
+import sys
+import gym
+import pylab
+import random
+import numpy as np
+from collections import deque
+import tensorflow as tf
+from tensorflow.keras.layers import Dense
+from tensorflow.keras.initializers import RandomUniform
+
+
+# 상태가 입력, 큐함수가 출력인 인공신경망 생성
+class DQN(tf.keras.Model):
+ def __init__(self, action_size):
+ super(DQN, self).__init__()
+ self.fc1 = Dense(24, activation='relu')
+ self.fc2 = Dense(24, activation='relu')
+ self.fc_out = Dense(action_size,
+ kernel_initializer=RandomUniform(-1e-3, 1e-3))
+
+ def call(self, x):
+ x = self.fc1(x)
+ x = self.fc2(x)
+ q = self.fc_out(x)
+ return q
+
+
+# 카트폴 예제에서의 DQN 에이전트
+class DQNAgent:
+ def __init__(self, state_size, action_size):
+ # 상태와 행동의 크기 정의
+ self.state_size = state_size
+ self.action_size = action_size
+
+ # 모델과 타깃 모델 생성
+ self.model = DQN(action_size)
+ self.model.load_weights("./save_model/trained/model")
+
+ # 입실론 탐욕 정책으로 행동 선택
+ def get_action(self, state):
+ q_value = self.model(state)
+ return np.argmax(q_value[0])
+
+
+if __name__ == "__main__":
+ # CartPole-v1 환경, 최대 타임스텝 수가 500
+ env = gym.make('CartPole-v1')
+ state_size = env.observation_space.shape[0]
+ action_size = env.action_space.n
+
+ # DQN 에이전트 생성
+ agent = DQNAgent(state_size, action_size)
+
+ num_episode = 10
+ for e in range(num_episode):
+ done = False
+ score = 0
+ # env 초기화
+ state = env.reset()
+ state = np.reshape(state, [1, state_size])
+
+ while not done:
+ env.render()
+
+ # 현재 상태로 행동을 선택
+ action = agent.get_action(state)
+ # 선택한 행동으로 환경에서 한 타임스텝 진행
+ next_state, reward, done, info = env.step(action)
+ next_state = np.reshape(next_state, [1, state_size])
+
+ score += reward
+ state = next_state
+
+ if done:
+ # 에피소드마다 학습 결과 출력
+ print("episode: {:3d} | score: {:.3f} ".format(e, score))
\ No newline at end of file
diff --git a/2-cartpole/1-dqn/cartpole_dqn.py b/2-cartpole/1-dqn/train.py
similarity index 50%
rename from 2-cartpole/1-dqn/cartpole_dqn.py
rename to 2-cartpole/1-dqn/train.py
index 8b7c332..be04f42 100644
--- a/2-cartpole/1-dqn/cartpole_dqn.py
+++ b/2-cartpole/1-dqn/train.py
@@ -1,21 +1,36 @@
+import os
import sys
import gym
import pylab
import random
import numpy as np
from collections import deque
-from keras.layers import Dense
-from keras.optimizers import Adam
-from keras.models import Sequential
+import tensorflow as tf
+from tensorflow.keras.layers import Dense
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.initializers import RandomUniform
-EPISODES = 300
+
+# 상태가 입력, 큐함수가 출력인 인공신경망 생성
+class DQN(tf.keras.Model):
+ def __init__(self, action_size):
+ super(DQN, self).__init__()
+ self.fc1 = Dense(24, activation='relu')
+ self.fc2 = Dense(24, activation='relu')
+ self.fc_out = Dense(action_size,
+ kernel_initializer=RandomUniform(-1e-3, 1e-3))
+
+ def call(self, x):
+ x = self.fc1(x)
+ x = self.fc2(x)
+ q = self.fc_out(x)
+ return q
# 카트폴 예제에서의 DQN 에이전트
class DQNAgent:
def __init__(self, state_size, action_size):
self.render = False
- self.load_model = False
# 상태와 행동의 크기 정의
self.state_size = state_size
@@ -34,28 +49,13 @@ def __init__(self, state_size, action_size):
self.memory = deque(maxlen=2000)
# 모델과 타깃 모델 생성
- self.model = self.build_model()
- self.target_model = self.build_model()
+ self.model = DQN(action_size)
+ self.target_model = DQN(action_size)
+ self.optimizer = Adam(lr=self.learning_rate)
# 타깃 모델 초기화
self.update_target_model()
- if self.load_model:
- self.model.load_weights("./save_model/cartpole_dqn_trained.h5")
-
- # 상태가 입력, 큐함수가 출력인 인공신경망 생성
- def build_model(self):
- model = Sequential()
- model.add(Dense(24, input_dim=self.state_size, activation='relu',
- kernel_initializer='he_uniform'))
- model.add(Dense(24, activation='relu',
- kernel_initializer='he_uniform'))
- model.add(Dense(self.action_size, activation='linear',
- kernel_initializer='he_uniform'))
- model.summary()
- model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
- return model
-
# 타깃 모델을 모델의 가중치로 업데이트
def update_target_model(self):
self.target_model.set_weights(self.model.get_weights())
@@ -65,7 +65,7 @@ def get_action(self, state):
if np.random.rand() <= self.epsilon:
return random.randrange(self.action_size)
else:
- q_value = self.model.predict(state)
+ q_value = self.model(state)
return np.argmax(q_value[0])
    # 샘플 <s, a, r, s'>을 리플레이 메모리에 저장
@@ -80,32 +80,32 @@ def train_model(self):
# 메모리에서 배치 크기만큼 무작위로 샘플 추출
mini_batch = random.sample(self.memory, self.batch_size)
- states = np.zeros((self.batch_size, self.state_size))
- next_states = np.zeros((self.batch_size, self.state_size))
- actions, rewards, dones = [], [], []
+ states = np.array([sample[0][0] for sample in mini_batch])
+ actions = np.array([sample[1] for sample in mini_batch])
+ rewards = np.array([sample[2] for sample in mini_batch])
+ next_states = np.array([sample[3][0] for sample in mini_batch])
+ dones = np.array([sample[4] for sample in mini_batch])
- for i in range(self.batch_size):
- states[i] = mini_batch[i][0]
- actions.append(mini_batch[i][1])
- rewards.append(mini_batch[i][2])
- next_states[i] = mini_batch[i][3]
- dones.append(mini_batch[i][4])
+ # 학습 파라메터
+ model_params = self.model.trainable_variables
+ with tf.GradientTape() as tape:
+ # 현재 상태에 대한 모델의 큐함수
+ predicts = self.model(states)
+ one_hot_action = tf.one_hot(actions, self.action_size)
+ predicts = tf.reduce_sum(one_hot_action * predicts, axis=1)
- # 현재 상태에 대한 모델의 큐함수
- # 다음 상태에 대한 타깃 모델의 큐함수
- target = self.model.predict(states)
- target_val = self.target_model.predict(next_states)
+ # 다음 상태에 대한 타깃 모델의 큐함수
+ target_predicts = self.target_model(next_states)
+ target_predicts = tf.stop_gradient(target_predicts)
- # 벨만 최적 방정식을 이용한 업데이트 타깃
- for i in range(self.batch_size):
- if dones[i]:
- target[i][actions[i]] = rewards[i]
- else:
- target[i][actions[i]] = rewards[i] + self.discount_factor * (
- np.amax(target_val[i]))
+ # 벨만 최적 방정식을 이용한 업데이트 타깃
+ max_q = np.amax(target_predicts, axis=-1)
+ targets = rewards + (1 - dones) * self.discount_factor * max_q
+ loss = tf.reduce_mean(tf.square(targets - predicts))
- self.model.fit(states, target, batch_size=self.batch_size,
- epochs=1, verbose=0)
+ # 오류함수를 줄이는 방향으로 모델 업데이트
+ grads = tape.gradient(loss, model_params)
+ self.optimizer.apply_gradients(zip(grads, model_params))
if __name__ == "__main__":
@@ -118,8 +118,10 @@ def train_model(self):
agent = DQNAgent(state_size, action_size)
scores, episodes = [], []
+ score_avg = 0
- for e in range(EPISODES):
+ num_episode = 300
+ for e in range(num_episode):
done = False
score = 0
# env 초기화
@@ -135,8 +137,10 @@ def train_model(self):
# 선택한 행동으로 환경에서 한 타임스텝 진행
next_state, reward, done, info = env.step(action)
next_state = np.reshape(next_state, [1, state_size])
- # 에피소드가 중간에 끝나면 -100 보상
- reward = reward if not done or score == 499 else -100
+
+ # 타임스텝마다 보상 0.1, 에피소드가 중간에 끝나면 -1 보상
+ score += reward
+ reward = 0.1 if not done or score == 500 else -1
# 리플레이 메모리에 샘플 저장
agent.append_sample(state, action, reward, next_state, done)
@@ -144,23 +148,25 @@ def train_model(self):
if len(agent.memory) >= agent.train_start:
agent.train_model()
- score += reward
state = next_state
if done:
# 각 에피소드마다 타깃 모델을 모델의 가중치로 업데이트
agent.update_target_model()
-
- score = score if score == 500 else score + 100
# 에피소드마다 학습 결과 출력
- scores.append(score)
+ score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
+ print("episode: {:3d} | score avg: {:3.2f} | memory length: {:4d} | epsilon: {:.4f}".format(
+ e, score_avg, len(agent.memory), agent.epsilon))
+
+ # 에피소드마다 학습 결과 그래프로 저장
+ scores.append(score_avg)
episodes.append(e)
pylab.plot(episodes, scores, 'b')
- pylab.savefig("./save_graph/cartpole_dqn.png")
- print("episode:", e, " score:", score, " memory length:",
- len(agent.memory), " epsilon:", agent.epsilon)
-
- # 이전 10개 에피소드의 점수 평균이 490보다 크면 학습 중단
- if np.mean(scores[-min(10, len(scores)):]) > 490:
- agent.model.save_weights("./save_model/cartpole_dqn.h5")
- sys.exit()
+ pylab.xlabel("episode")
+ pylab.ylabel("average score")
+ pylab.savefig("./save_graph/graph.png")
+
+ # 이동 평균이 400 이상일 때 종료
+ if score_avg > 400:
+ agent.model.save_weights("./save_model/model", save_format="tf")
+ sys.exit()
\ No newline at end of file
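
The rewritten train_model above replaces the per-sample Python loop and model.fit with one vectorized update inside tf.GradientTape. Below is a minimal sketch of the new target/loss arithmetic on a synthetic batch; the shapes and values are made up for illustration, only the computation mirrors the diff.

import numpy as np
import tensorflow as tf

batch_size, action_size, discount_factor = 4, 2, 0.99

# stand-ins for a replay-memory batch and for the two networks' outputs
actions = np.random.randint(action_size, size=batch_size)
rewards = np.random.rand(batch_size).astype(np.float32)
dones = np.random.randint(2, size=batch_size).astype(np.float32)
predicts_all = tf.random.uniform((batch_size, action_size))     # model(states)
target_predicts = tf.random.uniform((batch_size, action_size))  # target_model(next_states)

# Q(s, a) of the actions that were actually taken
one_hot_action = tf.one_hot(actions, action_size)
predicts = tf.reduce_sum(one_hot_action * predicts_all, axis=1)

# Bellman optimality target: r + (1 - done) * gamma * max_a' Q_target(s', a')
max_q = np.amax(target_predicts, axis=-1)
targets = rewards + (1 - dones) * discount_factor * max_q

# mean squared TD error, minimized with apply_gradients in the diff
loss = tf.reduce_mean(tf.square(targets - predicts))

The (1 - dones) mask and tf.stop_gradient on the target network output reproduce the old if/else branches in a single expression: terminal samples keep only the reward, and no gradient flows into the target model.
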
diff --git a/2-cartpole/2-actor-critic/cartpole_a2c.py b/2-cartpole/2-actor-critic/cartpole_a2c.py
deleted file mode 100644
index 7121fa8..0000000
--- a/2-cartpole/2-actor-critic/cartpole_a2c.py
+++ /dev/null
@@ -1,159 +0,0 @@
-import sys
-import gym
-import pylab
-import numpy as np
-from keras.layers import Dense
-from keras.models import Sequential
-from keras.optimizers import Adam
-from keras import backend as K
-
-EPISODES = 1000
-
-
-# 카트폴 예제에서의 액터-크리틱(A2C) 에이전트
-class A2CAgent:
- def __init__(self, state_size, action_size):
- self.render = False
- self.load_model = False
- # 상태와 행동의 크기 정의
- self.state_size = state_size
- self.action_size = action_size
- self.value_size = 1
-
- # 액터-크리틱 하이퍼파라미터
- self.discount_factor = 0.99
- self.actor_lr = 0.001
- self.critic_lr = 0.005
-
- # 정책신경망과 가치신경망 생성
- self.actor = self.build_actor()
- self.critic = self.build_critic()
- self.actor_updater = self.actor_optimizer()
- self.critic_updater = self.critic_optimizer()
-
- if self.load_model:
- self.actor.load_weights("./save_model/cartpole_actor_trained.h5")
- self.critic.load_weights("./save_model/cartpole_critic_trained.h5")
-
- # actor: 상태를 받아 각 행동의 확률을 계산
- def build_actor(self):
- actor = Sequential()
- actor.add(Dense(24, input_dim=self.state_size, activation='relu',
- kernel_initializer='he_uniform'))
- actor.add(Dense(self.action_size, activation='softmax',
- kernel_initializer='he_uniform'))
- actor.summary()
- return actor
-
- # critic: 상태를 받아서 상태의 가치를 계산
- def build_critic(self):
- critic = Sequential()
- critic.add(Dense(24, input_dim=self.state_size, activation='relu',
- kernel_initializer='he_uniform'))
- critic.add(Dense(24, input_dim=self.state_size, activation='relu',
- kernel_initializer='he_uniform'))
- critic.add(Dense(self.value_size, activation='linear',
- kernel_initializer='he_uniform'))
- critic.summary()
- return critic
-
- # 정책신경망의 출력을 받아 확률적으로 행동을 선택
- def get_action(self, state):
- policy = self.actor.predict(state, batch_size=1).flatten()
- return np.random.choice(self.action_size, 1, p=policy)[0]
-
- # 정책신경망을 업데이트하는 함수
- def actor_optimizer(self):
- action = K.placeholder(shape=[None, self.action_size])
- advantage = K.placeholder(shape=[None, ])
-
- action_prob = K.sum(action * self.actor.output, axis=1)
- cross_entropy = K.log(action_prob) * advantage
- loss = -K.sum(cross_entropy)
-
- optimizer = Adam(lr=self.actor_lr)
- updates = optimizer.get_updates(self.actor.trainable_weights, [], loss)
- train = K.function([self.actor.input, action, advantage], [],
- updates=updates)
- return train
-
- # 가치신경망을 업데이트하는 함수
- def critic_optimizer(self):
- target = K.placeholder(shape=[None, ])
-
- loss = K.mean(K.square(target - self.critic.output))
-
- optimizer = Adam(lr=self.critic_lr)
- updates = optimizer.get_updates(self.critic.trainable_weights, [], loss)
- train = K.function([self.critic.input, target], [], updates=updates)
-
- return train
-
- # 각 타임스텝마다 정책신경망과 가치신경망을 업데이트
- def train_model(self, state, action, reward, next_state, done):
- value = self.critic.predict(state)[0]
- next_value = self.critic.predict(next_state)[0]
-
- act = np.zeros([1, self.action_size])
- act[0][action] = 1
-
- # 벨만 기대 방정식를 이용한 어드벤티지와 업데이트 타깃
- if done:
- advantage = reward - value
- target = [reward]
- else:
- advantage = (reward + self.discount_factor * next_value) - value
- target = reward + self.discount_factor * next_value
-
- self.actor_updater([state, act, advantage])
- self.critic_updater([state, target])
-
-
-if __name__ == "__main__":
- # CartPole-v1 환경, 최대 타임스텝 수가 500
- env = gym.make('CartPole-v1')
- # 환경으로부터 상태와 행동의 크기를 받아옴
- state_size = env.observation_space.shape[0]
- action_size = env.action_space.n
-
- # 액터-크리틱(A2C) 에이전트 생성
- agent = A2CAgent(state_size, action_size)
-
- scores, episodes = [], []
-
- for e in range(EPISODES):
- done = False
- score = 0
- state = env.reset()
- state = np.reshape(state, [1, state_size])
-
- while not done:
- if agent.render:
- env.render()
-
- action = agent.get_action(state)
- next_state, reward, done, info = env.step(action)
- next_state = np.reshape(next_state, [1, state_size])
- # 에피소드가 중간에 끝나면 -100 보상
- reward = reward if not done or score == 499 else -100
-
- agent.train_model(state, action, reward, next_state, done)
-
- score += reward
- state = next_state
-
- if done:
- # 에피소드마다 학습 결과 출력
- score = score if score == 500.0 else score + 100
- scores.append(score)
- episodes.append(e)
- pylab.plot(episodes, scores, 'b')
- pylab.savefig("./save_graph/cartpole_a2c.png")
- print("episode:", e, " score:", score)
-
- # 이전 10개 에피소드의 점수 평균이 490보다 크면 학습 중단
- if np.mean(scores[-min(10, len(scores)):]) > 490:
- agent.actor.save_weights("./save_model/cartpole_actor.h5")
- agent.critic.save_weights(
- "./save_model/cartpole_critic.h5")
- sys.exit()
diff --git a/2-cartpole/2-actor-critic/save_graph/cartpole_a2c.png b/2-cartpole/2-actor-critic/save_graph/cartpole_a2c.png
deleted file mode 100644
index 92fdb86..0000000
Binary files a/2-cartpole/2-actor-critic/save_graph/cartpole_a2c.png and /dev/null differ
diff --git a/2-cartpole/2-actor-critic/save_graph/graph_trained.png b/2-cartpole/2-actor-critic/save_graph/graph_trained.png
new file mode 100644
index 0000000..19d61d1
Binary files /dev/null and b/2-cartpole/2-actor-critic/save_graph/graph_trained.png differ
diff --git a/2-cartpole/2-actor-critic/save_model/cartpole_actor_trained.h5 b/2-cartpole/2-actor-critic/save_model/cartpole_actor_trained.h5
deleted file mode 100644
index 19a3aa7..0000000
Binary files a/2-cartpole/2-actor-critic/save_model/cartpole_actor_trained.h5 and /dev/null differ
diff --git a/2-cartpole/2-actor-critic/save_model/cartpole_critic_trained.h5 b/2-cartpole/2-actor-critic/save_model/cartpole_critic_trained.h5
deleted file mode 100644
index d16fafd..0000000
Binary files a/2-cartpole/2-actor-critic/save_model/cartpole_critic_trained.h5 and /dev/null differ
diff --git a/2-cartpole/2-actor-critic/save_model/trained/checkpoint b/2-cartpole/2-actor-critic/save_model/trained/checkpoint
new file mode 100644
index 0000000..a6e034f
--- /dev/null
+++ b/2-cartpole/2-actor-critic/save_model/trained/checkpoint
@@ -0,0 +1,2 @@
+model_checkpoint_path: "model"
+all_model_checkpoint_paths: "model"
diff --git a/2-cartpole/2-actor-critic/save_model/trained/model.data-00000-of-00001 b/2-cartpole/2-actor-critic/save_model/trained/model.data-00000-of-00001
new file mode 100644
index 0000000..8ef32fd
Binary files /dev/null and b/2-cartpole/2-actor-critic/save_model/trained/model.data-00000-of-00001 differ
diff --git a/2-cartpole/2-actor-critic/save_model/trained/model.index b/2-cartpole/2-actor-critic/save_model/trained/model.index
new file mode 100644
index 0000000..145c7c3
Binary files /dev/null and b/2-cartpole/2-actor-critic/save_model/trained/model.index differ
diff --git a/2-cartpole/2-actor-critic/test.py b/2-cartpole/2-actor-critic/test.py
new file mode 100644
index 0000000..2adf3a9
--- /dev/null
+++ b/2-cartpole/2-actor-critic/test.py
@@ -0,0 +1,78 @@
+import sys
+import gym
+import pylab
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras.layers import Dense
+from tensorflow.keras.initializers import RandomUniform
+
+
+# 정책 신경망과 가치 신경망 생성
+class A2C(tf.keras.Model):
+ def __init__(self, action_size):
+ super(A2C, self).__init__()
+ self.actor_fc = Dense(24, activation='tanh')
+ self.actor_out = Dense(action_size, activation='softmax',
+ kernel_initializer=RandomUniform(-1e-3, 1e-3))
+ self.critic_fc1 = Dense(24, activation='tanh')
+ self.critic_fc2 = Dense(24, activation='tanh')
+ self.critic_out = Dense(1,
+ kernel_initializer=RandomUniform(-1e-3, 1e-3))
+
+ def call(self, x):
+ actor_x = self.actor_fc(x)
+ policy = self.actor_out(actor_x)
+
+ critic_x = self.critic_fc1(x)
+ critic_x = self.critic_fc2(critic_x)
+ value = self.critic_out(critic_x)
+ return policy, value
+
+
+
+# 카트폴 예제에서의 액터-크리틱(A2C) 에이전트
+class A2CAgent:
+ def __init__(self, action_size):
+ # 행동의 크기 정의
+ self.action_size = action_size
+
+ # 정책신경망과 가치신경망 생성
+ self.model = A2C(self.action_size)
+ self.model.load_weights("./save_model/trained/model")
+
+ # 정책신경망의 출력을 받아 확률적으로 행동을 선택
+ def get_action(self, state):
+ policy, _ = self.model(state)
+ policy = np.array(policy[0])
+ return np.random.choice(self.action_size, 1, p=policy)[0]
+
+
+if __name__ == "__main__":
+ # CartPole-v1 환경, 최대 타임스텝 수가 500
+ env = gym.make('CartPole-v1')
+ # 환경으로부터 상태와 행동의 크기를 받아옴
+ state_size = env.observation_space.shape[0]
+ action_size = env.action_space.n
+
+ # 액터-크리틱(A2C) 에이전트 생성
+ agent = A2CAgent(action_size)
+
+ num_episode = 10
+ for e in range(num_episode):
+ done = False
+ score = 0
+ state = env.reset()
+ state = np.reshape(state, [1, state_size])
+
+ while not done:
+ env.render()
+
+ action = agent.get_action(state)
+ next_state, reward, done, info = env.step(action)
+ next_state = np.reshape(next_state, [1, state_size])
+
+ score += reward
+ state = next_state
+
+ if done:
+ print("episode: {:3d} | score: {:3d}".format(e, int(score)))
\ No newline at end of file
diff --git a/2-cartpole/2-actor-critic/train.py b/2-cartpole/2-actor-critic/train.py
new file mode 100644
index 0000000..eb7afb2
--- /dev/null
+++ b/2-cartpole/2-actor-critic/train.py
@@ -0,0 +1,139 @@
+import sys
+import gym
+import pylab
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras.layers import Dense
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.initializers import RandomUniform
+
+
+# 정책 신경망과 가치 신경망 생성
+class A2C(tf.keras.Model):
+ def __init__(self, action_size):
+ super(A2C, self).__init__()
+ self.actor_fc = Dense(24, activation='tanh')
+ self.actor_out = Dense(action_size, activation='softmax',
+ kernel_initializer=RandomUniform(-1e-3, 1e-3))
+ self.critic_fc1 = Dense(24, activation='tanh')
+ self.critic_fc2 = Dense(24, activation='tanh')
+ self.critic_out = Dense(1,
+ kernel_initializer=RandomUniform(-1e-3, 1e-3))
+
+ def call(self, x):
+ actor_x = self.actor_fc(x)
+ policy = self.actor_out(actor_x)
+
+ critic_x = self.critic_fc1(x)
+ critic_x = self.critic_fc2(critic_x)
+ value = self.critic_out(critic_x)
+ return policy, value
+
+
+# 카트폴 예제에서의 액터-크리틱(A2C) 에이전트
+class A2CAgent:
+ def __init__(self, action_size):
+ self.render = False
+
+ # 행동의 크기 정의
+ self.action_size = action_size
+
+ # 액터-크리틱 하이퍼파라미터
+ self.discount_factor = 0.99
+ self.learning_rate = 0.001
+
+ # 정책신경망과 가치신경망 생성
+ self.model = A2C(self.action_size)
+ # 최적화 알고리즘 설정, 미분값이 너무 커지는 현상을 막기 위해 clipnorm 설정
+ self.optimizer = Adam(lr=self.learning_rate, clipnorm=5.0)
+
+ # 정책신경망의 출력을 받아 확률적으로 행동을 선택
+ def get_action(self, state):
+ policy, _ = self.model(state)
+ policy = np.array(policy[0])
+ return np.random.choice(self.action_size, 1, p=policy)[0]
+
+ # 각 타임스텝마다 정책신경망과 가치신경망을 업데이트
+ def train_model(self, state, action, reward, next_state, done):
+ model_params = self.model.trainable_variables
+ with tf.GradientTape() as tape:
+ policy, value = self.model(state)
+ _, next_value = self.model(next_state)
+ target = reward + (1 - done) * self.discount_factor * next_value[0]
+
+ # 정책 신경망 오류 함수 구하기
+ one_hot_action = tf.one_hot([action], self.action_size)
+ action_prob = tf.reduce_sum(one_hot_action * policy, axis=1)
+ cross_entropy = - tf.math.log(action_prob + 1e-5)
+ advantage = tf.stop_gradient(target - value[0])
+ actor_loss = tf.reduce_mean(cross_entropy * advantage)
+
+ # 가치 신경망 오류 함수 구하기
+ critic_loss = 0.5 * tf.square(tf.stop_gradient(target) - value[0])
+ critic_loss = tf.reduce_mean(critic_loss)
+
+ # 하나의 오류 함수로 만들기
+ loss = 0.2 * actor_loss + critic_loss
+
+ # 오류함수를 줄이는 방향으로 모델 업데이트
+ grads = tape.gradient(loss, model_params)
+ self.optimizer.apply_gradients(zip(grads, model_params))
+ return np.array(loss)
+
+
+if __name__ == "__main__":
+ # CartPole-v1 환경, 최대 타임스텝 수가 500
+ env = gym.make('CartPole-v1')
+ # 환경으로부터 상태와 행동의 크기를 받아옴
+ state_size = env.observation_space.shape[0]
+ action_size = env.action_space.n
+
+ # 액터-크리틱(A2C) 에이전트 생성
+ agent = A2CAgent(action_size)
+
+ scores, episodes = [], []
+ score_avg = 0
+
+ num_episode = 1000
+ for e in range(num_episode):
+ done = False
+ score = 0
+ loss_list = []
+ state = env.reset()
+ state = np.reshape(state, [1, state_size])
+
+ while not done:
+ if agent.render:
+ env.render()
+
+ action = agent.get_action(state)
+ next_state, reward, done, info = env.step(action)
+ next_state = np.reshape(next_state, [1, state_size])
+
+ # 타임스텝마다 보상 0.1, 에피소드가 중간에 끝나면 -1 보상
+ score += reward
+ reward = 0.1 if not done or score == 500 else -1
+
+ # 매 타임스텝마다 학습
+ loss = agent.train_model(state, action, reward, next_state, done)
+ loss_list.append(loss)
+ state = next_state
+
+ if done:
+ # 에피소드마다 학습 결과 출력
+ score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
+ print("episode: {:3d} | score avg: {:3.2f} | loss: {:.3f}".format(
+ e, score_avg, np.mean(loss_list)))
+
+ # 에피소드마다 학습 결과 그래프로 저장
+ scores.append(score_avg)
+ episodes.append(e)
+ pylab.plot(episodes, scores, 'b')
+ pylab.xlabel("episode")
+ pylab.ylabel("average score")
+ pylab.savefig("./save_graph/graph.png")
+
+ # 이동 평균이 400 이상일 때 종료
+ if score_avg > 400:
+ agent.model.save_weights("./save_model/model", save_format="tf")
+ sys.exit()
\ No newline at end of file
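
train.py above computes the actor and critic losses in a single GradientTape pass. The sketch below reproduces that loss arithmetic for one transition; all numbers are placeholders, not outputs of a trained network.

import tensorflow as tf

action_size, discount_factor = 2, 0.99

# placeholder outputs for one transition
policy = tf.constant([[0.3, 0.7]])     # actor head, softmax over 2 actions
value = tf.constant([[0.5]])           # critic head for the current state
next_value = tf.constant([[0.4]])      # critic head for the next state
action, reward, done = 1, 0.1, False

# 1-step bootstrap target and advantage
target = reward + (1 - done) * discount_factor * next_value[0]
advantage = tf.stop_gradient(target - value[0])

# actor loss: -log pi(a|s) * advantage
one_hot_action = tf.one_hot([action], action_size)
action_prob = tf.reduce_sum(one_hot_action * policy, axis=1)
actor_loss = tf.reduce_mean(-tf.math.log(action_prob + 1e-5) * advantage)

# critic loss: squared error against the fixed bootstrap target
critic_loss = tf.reduce_mean(0.5 * tf.square(tf.stop_gradient(target) - value[0]))

# combined loss, with the same 0.2 weighting used in train.py
loss = 0.2 * actor_loss + critic_loss

tf.stop_gradient keeps the advantage and the bootstrap target fixed during backpropagation, so the actor term only updates the policy head and the critic term regresses the value head toward a fixed target.
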
diff --git a/2-cartpole/3-continuous-actor-critic/env.py b/2-cartpole/3-continuous-actor-critic/env.py
new file mode 100644
index 0000000..4c36a24
--- /dev/null
+++ b/2-cartpole/3-continuous-actor-critic/env.py
@@ -0,0 +1,163 @@
+"""
+Classic cart-pole system implemented by Rich Sutton et al.
+Copied from http://incompleteideas.net/sutton/book/code/pole.c
+permalink: https://perma.cc/C9ZM-652R
+"""
+
+import math
+import gym
+from gym import spaces, logger
+from gym.utils import seeding
+import numpy as np
+
+
+class ContinuousCartPoleEnv(gym.Env):
+ metadata = {
+ 'render.modes': ['human', 'rgb_array'],
+ 'video.frames_per_second': 50
+ }
+
+ def __init__(self):
+ self.gravity = 9.8
+ self.masscart = 1.0
+ self.masspole = 0.1
+ self.total_mass = (self.masspole + self.masscart)
+ self.length = 0.5 # actually half the pole's length
+ self.polemass_length = (self.masspole * self.length)
+ self.force_mag = 10.0
+ self.tau = 0.02 # seconds between state updates
+ self.max_action = 3.0
+ self.kinematics_integrator = 'euler'
+
+ # Angle at which to fail the episode
+ self.theta_threshold_radians = 12 * 2 * math.pi / 360
+ self.x_threshold = 2.4
+
+ # Angle limit set to 2 * theta_threshold_radians so failing observation
+ # is still within bounds
+ high = np.array([
+ self.x_threshold * 2,
+ np.finfo(np.float32).max,
+ self.theta_threshold_radians * 2,
+ np.finfo(np.float32).max])
+
+ self.action_space = spaces.Box(
+ low=-self.max_action,
+ high=self.max_action,
+ shape=(1,)
+ )
+ self.observation_space = spaces.Box(-high, high)
+
+ self.seed()
+ self.viewer = None
+ self.state = None
+
+ self.steps_beyond_done = None
+
+ def seed(self, seed=None):
+ self.np_random, seed = seeding.np_random(seed)
+ return [seed]
+
+ def step(self, action):
+ assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action))
+ state = self.state
+ x, x_dot, theta, theta_dot = state
+ force = self.force_mag * float(action)
+ costheta = math.cos(theta)
+ sintheta = math.sin(theta)
+ temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta) / self.total_mass
+ thetaacc = (self.gravity * sintheta - costheta* temp) / (self.length * (4.0/3.0 - self.masspole * costheta * costheta / self.total_mass))
+ xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass
+ if self.kinematics_integrator == 'euler':
+ x = x + self.tau * x_dot
+ x_dot = x_dot + self.tau * xacc
+ theta = theta + self.tau * theta_dot
+ theta_dot = theta_dot + self.tau * thetaacc
+ else: # semi-implicit euler
+ x_dot = x_dot + self.tau * xacc
+ x = x + self.tau * x_dot
+ theta_dot = theta_dot + self.tau * thetaacc
+ theta = theta + self.tau * theta_dot
+ self.state = (x,x_dot,theta,theta_dot)
+ done = x < -self.x_threshold \
+ or x > self.x_threshold \
+ or theta < -self.theta_threshold_radians \
+ or theta > self.theta_threshold_radians
+ done = bool(done)
+
+ if not done:
+ reward = 1.0
+ elif self.steps_beyond_done is None:
+ # Pole just fell!
+ self.steps_beyond_done = 0
+ reward = 1.0
+ else:
+ if self.steps_beyond_done == 0:
+ logger.warn("You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.")
+ self.steps_beyond_done += 1
+ reward = 0.0
+
+ return np.array(self.state, dtype=np.float32), reward, done, {}
+
+ def reset(self):
+ self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
+ self.steps_beyond_done = None
+ return np.array(self.state, dtype=np.float32)
+
+ def render(self, mode='human'):
+ screen_width = 600
+ screen_height = 400
+
+ world_width = self.x_threshold * 2
+ scale = screen_width /world_width
+ carty = 100 # TOP OF CART
+ polewidth = 10.0
+ polelen = scale * 1.0
+ cartwidth = 50.0
+ cartheight = 30.0
+
+ if self.viewer is None:
+ from gym.envs.classic_control import rendering
+ self.viewer = rendering.Viewer(screen_width, screen_height)
+ l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2
+ axleoffset = cartheight / 4.0
+ cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
+ self.carttrans = rendering.Transform()
+ cart.add_attr(self.carttrans)
+ self.viewer.add_geom(cart)
+ l, r, t, b = -polewidth / 2, polewidth / 2, polelen-polewidth / 2, -polewidth / 2
+ pole = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
+ pole.set_color(.8, .6, .4)
+ self.poletrans = rendering.Transform(translation=(0, axleoffset))
+ pole.add_attr(self.poletrans)
+ pole.add_attr(self.carttrans)
+ self.viewer.add_geom(pole)
+ self.axle = rendering.make_circle(polewidth / 2)
+ self.axle.add_attr(self.poletrans)
+ self.axle.add_attr(self.carttrans)
+ self.axle.set_color(.5, .5, .8)
+ self.viewer.add_geom(self.axle)
+ self.track = rendering.Line((0, carty), (screen_width, carty))
+ self.track.set_color(0, 0, 0)
+ self.viewer.add_geom(self.track)
+
+ self._pole_geom = pole
+
+ if self.state is None: return None
+
+ # Edit the pole polygon vertex
+ pole = self._pole_geom
+ l,r,t,b = -polewidth/2,polewidth/2,polelen-polewidth/2,-polewidth/2
+ pole.v = [(l,b), (l,t), (r,t), (r,b)]
+
+ x = self.state
+ cartx = x[0] * scale + screen_width / 2.0 # MIDDLE OF CART
+ self.carttrans.set_translation(cartx, carty)
+ self.poletrans.set_rotation(-x[2])
+
+ return self.viewer.render(return_rgb_array=(mode == 'rgb_array'))
+
+ def close(self):
+ if self.viewer:
+ self.viewer.close()
+ self.viewer = None
\ No newline at end of file
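
env.py above is the stock cart-pole dynamics with a continuous force input instead of two discrete actions. A minimal usage sketch, assuming the file is saved as env.py next to the script so the 'env:ContinuousCartPoleEnv' entry point resolves:

import gym

# register the continuous cart-pole defined in env.py under a gym id
gym.envs.register(
    id='CartPoleContinuous-v0',
    entry_point='env:ContinuousCartPoleEnv',
    max_episode_steps=500)

env = gym.make('CartPoleContinuous-v0')
state = env.reset()
for _ in range(10):
    # actions are continuous forces in [-max_action, max_action] with shape (1,)
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    if done:
        state = env.reset()
env.close()
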
diff --git a/2-cartpole/3-continuous-actor-critic/save_graph/graph_trained.png b/2-cartpole/3-continuous-actor-critic/save_graph/graph_trained.png
new file mode 100644
index 0000000..ca942e3
Binary files /dev/null and b/2-cartpole/3-continuous-actor-critic/save_graph/graph_trained.png differ
diff --git a/2-cartpole/3-continuous-actor-critic/save_model/trained/checkpoint b/2-cartpole/3-continuous-actor-critic/save_model/trained/checkpoint
new file mode 100644
index 0000000..a6e034f
--- /dev/null
+++ b/2-cartpole/3-continuous-actor-critic/save_model/trained/checkpoint
@@ -0,0 +1,2 @@
+model_checkpoint_path: "model"
+all_model_checkpoint_paths: "model"
diff --git a/2-cartpole/3-continuous-actor-critic/save_model/trained/model.data-00000-of-00001 b/2-cartpole/3-continuous-actor-critic/save_model/trained/model.data-00000-of-00001
new file mode 100644
index 0000000..4c56d7a
Binary files /dev/null and b/2-cartpole/3-continuous-actor-critic/save_model/trained/model.data-00000-of-00001 differ
diff --git a/2-cartpole/3-continuous-actor-critic/save_model/trained/model.index b/2-cartpole/3-continuous-actor-critic/save_model/trained/model.index
new file mode 100644
index 0000000..7954513
Binary files /dev/null and b/2-cartpole/3-continuous-actor-critic/save_model/trained/model.index differ
diff --git a/2-cartpole/3-continuous-actor-critic/test.py b/2-cartpole/3-continuous-actor-critic/test.py
new file mode 100644
index 0000000..62fcde3
--- /dev/null
+++ b/2-cartpole/3-continuous-actor-critic/test.py
@@ -0,0 +1,95 @@
+import sys
+import gym
+import pylab
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras.layers import Dense
+from tensorflow.keras.initializers import RandomUniform
+from tensorflow_probability import distributions as tfd
+
+
+# 정책 신경망과 가치 신경망 생성
+class ContinuousA2C(tf.keras.Model):
+ def __init__(self, action_size):
+ super(ContinuousA2C, self).__init__()
+ self.actor_fc1 = Dense(24, activation='tanh')
+ self.actor_mu = Dense(action_size,
+ kernel_initializer=RandomUniform(-1e-3, 1e-3))
+ self.actor_sigma = Dense(action_size, activation='softplus',
+ kernel_initializer=RandomUniform(-1e-3, 1e-3))
+
+ self.critic_fc1 = Dense(24, activation='tanh')
+ self.critic_fc2 = Dense(24, activation='tanh')
+ self.critic_out = Dense(1,
+ kernel_initializer=RandomUniform(-1e-3, 1e-3))
+
+ def call(self, x):
+ actor_x = self.actor_fc1(x)
+ mu = self.actor_mu(actor_x)
+ sigma = self.actor_sigma(actor_x)
+ sigma = sigma + 1e-5
+
+ critic_x = self.critic_fc1(x)
+ critic_x = self.critic_fc2(critic_x)
+ value = self.critic_out(critic_x)
+ return mu, sigma, value
+
+
+# 카트폴 예제에서의 액터-크리틱(A2C) 에이전트
+class ContinuousA2CAgent:
+ def __init__(self, action_size, max_action):
+ # 행동의 크기 정의
+ self.action_size = action_size
+ self.max_action = max_action
+
+ # 정책신경망과 가치신경망 생성
+ self.model = ContinuousA2C(self.action_size)
+ self.model.load_weights("./save_model/trained/model")
+
+ # 정책신경망의 출력을 받아 확률적으로 행동을 선택
+ def get_action(self, state):
+ mu, sigma, _ = self.model(state)
+ dist = tfd.Normal(loc=mu[0], scale=sigma[0])
+ action = dist.sample([1])[0]
+ action = np.clip(action, -self.max_action, self.max_action)
+ return action
+
+
+if __name__ == "__main__":
+ # 연속적 행동의 CartPoleContinuous-v0 환경 등록, 최대 타임스텝 수가 500
+ gym.envs.register(
+ id='CartPoleContinuous-v0',
+ entry_point='env:ContinuousCartPoleEnv',
+ max_episode_steps=500,
+ reward_threshold=475.0)
+
+ env = gym.make('CartPoleContinuous-v0')
+ # 환경으로부터 상태와 행동의 크기를 받아옴
+ state_size = env.observation_space.shape[0]
+ action_size = env.action_space.shape[0]
+ max_action = env.action_space.high[0]
+
+ # 액터-크리틱(A2C) 에이전트 생성
+ agent = ContinuousA2CAgent(action_size, max_action)
+
+ scores, episodes = [], []
+
+ num_episode = 10
+ for e in range(num_episode):
+ done = False
+ score = 0
+ state = env.reset()
+ state = np.reshape(state, [1, state_size])
+
+ while not done:
+ env.render()
+
+ action = agent.get_action(state)
+ next_state, reward, done, info = env.step(action)
+ next_state = np.reshape(next_state, [1, state_size])
+
+ score += reward
+ state = next_state
+
+ if done:
+ print("episode: {:3d} | score: {:3d}".format(e, int(score)))
\ No newline at end of file
diff --git a/2-cartpole/3-continuous-actor-critic/train.py b/2-cartpole/3-continuous-actor-critic/train.py
new file mode 100644
index 0000000..bfce749
--- /dev/null
+++ b/2-cartpole/3-continuous-actor-critic/train.py
@@ -0,0 +1,154 @@
+import sys
+import gym
+import pylab
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras.layers import Dense
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.initializers import RandomUniform
+from tensorflow_probability import distributions as tfd
+
+
+# 정책 신경망과 가치 신경망 생성
+class ContinuousA2C(tf.keras.Model):
+ def __init__(self, action_size):
+ super(ContinuousA2C, self).__init__()
+ self.actor_fc1 = Dense(24, activation='tanh')
+ self.actor_mu = Dense(action_size,
+ kernel_initializer=RandomUniform(-1e-3, 1e-3))
+ self.actor_sigma = Dense(action_size, activation='sigmoid',
+ kernel_initializer=RandomUniform(-1e-3, 1e-3))
+
+ self.critic_fc1 = Dense(24, activation='tanh')
+ self.critic_fc2 = Dense(24, activation='tanh')
+ self.critic_out = Dense(1,
+ kernel_initializer=RandomUniform(-1e-3, 1e-3))
+
+ def call(self, x):
+ actor_x = self.actor_fc1(x)
+ mu = self.actor_mu(actor_x)
+ sigma = self.actor_sigma(actor_x)
+ sigma = sigma + 1e-5
+
+ critic_x = self.critic_fc1(x)
+ critic_x = self.critic_fc2(critic_x)
+ value = self.critic_out(critic_x)
+ return mu, sigma, value
+
+
+# 카트폴 예제에서의 연속적 액터-크리틱(A2C) 에이전트
+class ContinuousA2CAgent:
+ def __init__(self, action_size, max_action):
+ self.render = False
+
+ # 행동의 크기 정의
+ self.action_size = action_size
+ self.max_action = max_action
+
+ # 액터-크리틱 하이퍼파라미터
+ self.discount_factor = 0.99
+ self.learning_rate = 0.001
+
+ # 정책신경망과 가치신경망 생성
+ self.model = ContinuousA2C(self.action_size)
+ # 최적화 알고리즘 설정, 미분값이 너무 커지는 현상을 막기 위해 clipnorm 설정
+ self.optimizer = Adam(lr=self.learning_rate, clipnorm=1.0)
+
+ # 정책신경망의 출력을 받아 확률적으로 행동을 선택
+ def get_action(self, state):
+ mu, sigma, _ = self.model(state)
+ dist = tfd.Normal(loc=mu[0], scale=sigma[0])
+ action = dist.sample([1])[0]
+ action = np.clip(action, -self.max_action, self.max_action)
+ return action
+
+ # 각 타임스텝마다 정책신경망과 가치신경망을 업데이트
+ def train_model(self, state, action, reward, next_state, done):
+ model_params = self.model.trainable_variables
+ with tf.GradientTape() as tape:
+ mu, sigma, value = self.model(state)
+ _, _, next_value = self.model(next_state)
+ target = reward + (1 - done) * self.discount_factor * next_value[0]
+
+ # 정책 신경망 오류 함수 구하기
+ advantage = tf.stop_gradient(target - value[0])
+ dist = tfd.Normal(loc=mu, scale=sigma)
+ action_prob = dist.prob([action])[0]
+ cross_entropy = - tf.math.log(action_prob + 1e-5)
+ actor_loss = tf.reduce_mean(cross_entropy * advantage)
+
+ # 가치 신경망 오류 함수 구하기
+ critic_loss = 0.5 * tf.square(tf.stop_gradient(target) - value[0])
+ critic_loss = tf.reduce_mean(critic_loss)
+
+ # 하나의 오류 함수로 만들기
+ loss = 0.1 * actor_loss + critic_loss
+
+ # 오류함수를 줄이는 방향으로 모델 업데이트
+ grads = tape.gradient(loss, model_params)
+ self.optimizer.apply_gradients(zip(grads, model_params))
+ return loss, sigma
+
+
+if __name__ == "__main__":
+ # 연속적 행동의 CartPoleContinuous-v0 환경 등록, 최대 타임스텝 수가 500
+ gym.envs.register(
+ id='CartPoleContinuous-v0',
+ entry_point='env:ContinuousCartPoleEnv',
+ max_episode_steps=500,
+ reward_threshold=475.0)
+
+ env = gym.make('CartPoleContinuous-v0')
+ # 환경으로부터 상태와 행동의 크기를 받아옴
+ state_size = env.observation_space.shape[0]
+ action_size = env.action_space.shape[0]
+ max_action = env.action_space.high[0]
+
+ # 액터-크리틱(A2C) 에이전트 생성
+ agent = ContinuousA2CAgent(action_size, max_action)
+ scores, episodes = [], []
+ score_avg = 0
+
+ num_episode = 1000
+ for e in range(num_episode):
+ done = False
+ score = 0
+ loss_list, sigma_list = [], []
+ state = env.reset()
+ state = np.reshape(state, [1, state_size])
+
+ while not done:
+ if agent.render:
+ env.render()
+
+ action = agent.get_action(state)
+ next_state, reward, done, info = env.step(action)
+ next_state = np.reshape(next_state, [1, state_size])
+
+ # 타임스텝마다 보상 0.1, 에피소드가 중간에 끝나면 -1 보상
+ score += reward
+ reward = 0.1 if not done or score == 500 else -1
+
+ # 매 타임스텝마다 학습
+ loss, sigma = agent.train_model(state, action, reward, next_state, done)
+ loss_list.append(loss)
+ sigma_list.append(sigma)
+ state = next_state
+
+ if done:
+ # 에피소드마다 학습 결과 출력
+ score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
+ print("episode: {:3d} | score avg: {:3.2f} | loss: {:.3f} | sigma: {:.3f}".format(
+ e, score_avg, np.mean(loss_list), np.mean(sigma_list)))
+
+ scores.append(score_avg)
+ episodes.append(e)
+ pylab.plot(episodes, scores, 'b')
+ pylab.xlabel("episode")
+ pylab.ylabel("average score")
+ pylab.savefig("./save_graph/graph.png")
+
+ # 이동 평균이 400 이상일 때 종료
+ if score_avg > 400:
+ agent.model.save_weights("./save_model/model", save_format="tf")
+ sys.exit()
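
The continuous agent above parameterizes a Normal distribution with the actor's mu and sigma heads and uses tensorflow_probability both to sample actions and to compute the probability term in the loss. A small sketch of that piece with placeholder values (not taken from a trained model):

import numpy as np
import tensorflow as tf
from tensorflow_probability import distributions as tfd

max_action = 3.0
# placeholder actor outputs for one state
mu = tf.constant([[0.2]])
sigma = tf.constant([[0.5]]) + 1e-5

# sample a continuous action from N(mu, sigma) and clip it to the valid range
dist = tfd.Normal(loc=mu[0], scale=sigma[0])
action = np.clip(dist.sample([1])[0], -max_action, max_action)

# policy-gradient term: -log p(action) * advantage, with the advantage held fixed
advantage = tf.constant(0.3)
action_prob = dist.prob(action)
actor_loss = tf.reduce_mean(-tf.math.log(action_prob + 1e-5) * advantage)

The softplus (or sigmoid) activation on the sigma head plus the 1e-5 offset keeps the scale strictly positive, so the density and its log are always well defined.
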
diff --git a/2-cartpole/LICENSE b/2-cartpole/LICENSE
deleted file mode 100644
index 5c61d8a..0000000
--- a/2-cartpole/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2017 Keon Kim
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/2-cartpole/README.md b/2-cartpole/README.md
deleted file mode 100644
index 49a4d30..0000000
--- a/2-cartpole/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# OpenAI gym Cartpole
-
-
-Various reinforcement learning algorithms for Cartpole example.
-
diff --git a/2-cartpole/cartpole.png b/2-cartpole/cartpole.png
deleted file mode 100644
index c8a7aea..0000000
Binary files a/2-cartpole/cartpole.png and /dev/null differ
diff --git a/3-atari/1-breakout-dqn/save_model/trained/checkpoint b/3-atari/1-breakout-dqn/save_model/trained/checkpoint
new file mode 100644
index 0000000..a6e034f
--- /dev/null
+++ b/3-atari/1-breakout-dqn/save_model/trained/checkpoint
@@ -0,0 +1,2 @@
+model_checkpoint_path: "model"
+all_model_checkpoint_paths: "model"
diff --git a/3-atari/1-breakout-dqn/save_model/trained/model.data-00000-of-00002 b/3-atari/1-breakout-dqn/save_model/trained/model.data-00000-of-00002
new file mode 100644
index 0000000..de8bcf5
Binary files /dev/null and b/3-atari/1-breakout-dqn/save_model/trained/model.data-00000-of-00002 differ
diff --git a/3-atari/1-breakout-dqn/save_model/trained/model.data-00001-of-00002 b/3-atari/1-breakout-dqn/save_model/trained/model.data-00001-of-00002
new file mode 100644
index 0000000..cdf6a3d
Binary files /dev/null and b/3-atari/1-breakout-dqn/save_model/trained/model.data-00001-of-00002 differ
diff --git a/3-atari/1-breakout-dqn/save_model/trained/model.index b/3-atari/1-breakout-dqn/save_model/trained/model.index
new file mode 100644
index 0000000..d106e23
Binary files /dev/null and b/3-atari/1-breakout-dqn/save_model/trained/model.index differ
diff --git a/3-atari/1-breakout-dqn/summary/breakout_dqn/events.out.tfevents.1583677266.cqcpu3.11796.5.v2 b/3-atari/1-breakout-dqn/summary/breakout_dqn/events.out.tfevents.1583677266.cqcpu3.11796.5.v2
new file mode 100644
index 0000000..3010474
Binary files /dev/null and b/3-atari/1-breakout-dqn/summary/breakout_dqn/events.out.tfevents.1583677266.cqcpu3.11796.5.v2 differ
diff --git a/3-atari/1-breakout-dqn/test.py b/3-atari/1-breakout-dqn/test.py
new file mode 100644
index 0000000..27ce076
--- /dev/null
+++ b/3-atari/1-breakout-dqn/test.py
@@ -0,0 +1,133 @@
+import gym
+import time
+import random
+import numpy as np
+import tensorflow as tf
+
+from skimage.color import rgb2gray
+from skimage.transform import resize
+
+from tensorflow.keras.layers import Conv2D, Dense, Flatten
+
+
+# 상태가 입력, 큐함수가 출력인 인공신경망 생성
+class DQN(tf.keras.Model):
+ def __init__(self, action_size, state_size):
+ super(DQN, self).__init__()
+ self.conv1 = Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
+ input_shape=state_size)
+ self.conv2 = Conv2D(64, (4, 4), strides=(2, 2), activation='relu')
+ self.conv3 = Conv2D(64, (3, 3), strides=(1, 1), activation='relu')
+ self.flatten = Flatten()
+ self.fc = Dense(512, activation='relu')
+ self.fc_out = Dense(action_size)
+
+ def call(self, x):
+ x = self.conv1(x)
+ x = self.conv2(x)
+ x = self.conv3(x)
+ x = self.flatten(x)
+ x = self.fc(x)
+ q = self.fc_out(x)
+ return q
+
+
+# 브레이크아웃 예제에서의 DQN 에이전트
+class DQNAgent:
+ def __init__(self, action_size, state_size, model_path):
+ self.render = False
+
+ # 상태와 행동의 크기 정의
+ self.state_size = state_size
+ self.action_size = action_size
+
+ self.epsilon = 0.02
+
+ # 모델과 타깃 모델 생성
+ self.model = DQN(action_size, state_size)
+ self.model.load_weights(model_path)
+
+ # 입실론 탐욕 정책으로 행동 선택
+ def get_action(self, history):
+ history = np.float32(history / 255.0)
+ if np.random.rand() <= self.epsilon:
+ return random.randrange(self.action_size)
+ else:
+ q_value = self.model(history)
+ return np.argmax(q_value[0])
+
+
+def pre_processing(observe):
+ processed_observe = np.uint8(
+ resize(rgb2gray(observe), (84, 84), mode='constant') * 255)
+ return processed_observe
+
+
+if __name__ == "__main__":
+ # 환경 세팅
+ env = gym.make("BreakoutDeterministic-v4")
+ render = True
+
+ # 테스트를 위한 에이전트 생성
+ state_size = (84, 84, 4)
+ action_size = 3
+ model_path = './save_model/trained/model'
+ agent = DQNAgent(action_size, state_size, model_path)
+
+ # 불필요한 행동을 없애주기 위한 딕셔너리 선언
+ action_dict = {0:1, 1:2, 2:3, 3:3}
+
+ num_episode = 10
+ for e in range(num_episode):
+ done = False
+ dead = False
+
+ score, start_life = 0, 5
+ # env 초기화
+ observe = env.reset()
+
+ # 랜덤으로 뽑힌 값만큼의 프레임 동안 움직이지 않음
+ for _ in range(random.randint(1, 30)):
+ observe, _, _, _ = env.step(1)
+
+ # 프레임을 전처리 한 후 4개의 상태를 쌓아서 입력값으로 사용.
+ state = pre_processing(observe)
+ history = np.stack([state, state, state, state], axis=2)
+ history = np.reshape([history], (1, 84, 84, 4))
+
+ while not done:
+ if render:
+ env.render()
+ time.sleep(0.05)
+
+ # 바로 전 history를 입력으로 받아 행동을 선택
+ action = agent.get_action(history)
+ # 1: 정지, 2: 왼쪽, 3: 오른쪽
+ real_action = action_dict[action]
+
+ # 죽었을 때 시작하기 위해 발사 행동을 함
+ if dead:
+ action, real_action, dead = 0, 1, False
+
+ # 선택한 행동으로 환경에서 한 타임스텝 진행
+ observe, reward, done, info = env.step(real_action)
+ # 각 타임스텝마다 상태 전처리
+ next_state = pre_processing(observe)
+ next_state = np.reshape([next_state], (1, 84, 84, 1))
+ next_history = np.append(next_state, history[:, :, :, :3], axis=3)
+
+ if start_life > info['ale.lives']:
+ dead, start_life = True, info['ale.lives']
+
+ score += reward
+
+ if dead:
+ history = np.stack((next_state, next_state,
+ next_state, next_state), axis=2)
+ history = np.reshape([history], (1, 84, 84, 4))
+ else:
+ history = next_history
+
+ if done:
+ # 각 에피소드 당 테스트 정보를 기록
+ print("episode: {:3d} | score : {:4.1f}".format(e, score))
diff --git a/3-atari/1-breakout-dqn/train.py b/3-atari/1-breakout-dqn/train.py
new file mode 100644
index 0000000..db0a03d
--- /dev/null
+++ b/3-atari/1-breakout-dqn/train.py
@@ -0,0 +1,252 @@
+import os
+import gym
+import random
+import numpy as np
+import tensorflow as tf
+from collections import deque
+
+from skimage.color import rgb2gray
+from skimage.transform import resize
+
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.layers import Conv2D, Dense, Flatten
+
+
+# 상태가 입력, 큐함수가 출력인 인공신경망 생성
+class DQN(tf.keras.Model):
+ def __init__(self, action_size, state_size):
+ super(DQN, self).__init__()
+ self.conv1 = Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
+ input_shape=state_size)
+ self.conv2 = Conv2D(64, (4, 4), strides=(2, 2), activation='relu')
+ self.conv3 = Conv2D(64, (3, 3), strides=(1, 1), activation='relu')
+ self.flatten = Flatten()
+ self.fc = Dense(512, activation='relu')
+ self.fc_out = Dense(action_size)
+
+ def call(self, x):
+ x = self.conv1(x)
+ x = self.conv2(x)
+ x = self.conv3(x)
+ x = self.flatten(x)
+ x = self.fc(x)
+ q = self.fc_out(x)
+ return q
+
+
+# 브레이크아웃 예제에서의 DQN 에이전트
+class DQNAgent:
+ def __init__(self, action_size, state_size=(84, 84, 4)):
+ self.render = False
+
+ # 상태와 행동의 크기 정의
+ self.state_size = state_size
+ self.action_size = action_size
+
+ # DQN 하이퍼파라미터
+ self.discount_factor = 0.99
+ self.learning_rate = 1e-4
+ self.epsilon = 1.
+ self.epsilon_start, self.epsilon_end = 1.0, 0.02
+ self.exploration_steps = 1000000.
+ self.epsilon_decay_step = self.epsilon_start - self.epsilon_end
+ self.epsilon_decay_step /= self.exploration_steps
+ self.batch_size = 32
+ self.train_start = 50000
+ self.update_target_rate = 10000
+
+ # 리플레이 메모리, 최대 크기 100,000
+ self.memory = deque(maxlen=100000)
+ # 게임 시작 후 랜덤하게 움직이지 않는 것에 대한 옵션
+ self.no_op_steps = 30
+
+ # 모델과 타깃 모델 생성
+ self.model = DQN(action_size, state_size)
+ self.target_model = DQN(action_size, state_size)
+ self.optimizer = Adam(self.learning_rate, clipnorm=10.)
+ # 타깃 모델 초기화
+ self.update_target_model()
+
+ self.avg_q_max, self.avg_loss = 0, 0
+
+ self.writer = tf.summary.create_file_writer('summary/breakout_dqn')
+ self.model_path = os.path.join(os.getcwd(), 'save_model', 'model')
+
+ # 타깃 모델을 모델의 가중치로 업데이트
+ def update_target_model(self):
+ self.target_model.set_weights(self.model.get_weights())
+
+ # 입실론 탐욕 정책으로 행동 선택
+ def get_action(self, history):
+ history = np.float32(history / 255.0)
+ if np.random.rand() <= self.epsilon:
+ return random.randrange(self.action_size)
+ else:
+ q_value = self.model(history)
+ return np.argmax(q_value[0])
+
+ # 샘플을 리플레이 메모리에 저장
+ def append_sample(self, history, action, reward, next_history, dead):
+ self.memory.append((history, action, reward, next_history, dead))
+
+ # 텐서보드에 학습 정보를 기록
+ def draw_tensorboard(self, score, step, episode):
+ with self.writer.as_default():
+ tf.summary.scalar('Total Reward/Episode', score, step=episode)
+ tf.summary.scalar('Average Max Q/Episode',
+ self.avg_q_max / float(step), step=episode)
+ tf.summary.scalar('Duration/Episode', step, step=episode)
+ tf.summary.scalar('Average Loss/Episode',
+ self.avg_loss / float(step), step=episode)
+
+ # 리플레이 메모리에서 무작위로 추출한 배치로 모델 학습
+ def train_model(self):
+ if self.epsilon > self.epsilon_end:
+ self.epsilon -= self.epsilon_decay_step
+
+ # 메모리에서 배치 크기만큼 무작위로 샘플 추출
+ batch = random.sample(self.memory, self.batch_size)
+
+ history = np.array([sample[0][0] / 255. for sample in batch],
+ dtype=np.float32)
+ actions = np.array([sample[1] for sample in batch])
+ rewards = np.array([sample[2] for sample in batch])
+ next_history = np.array([sample[3][0] / 255. for sample in batch],
+ dtype=np.float32)
+ dones = np.array([sample[4] for sample in batch])
+
+ # 학습 파라미터
+ model_params = self.model.trainable_variables
+ with tf.GradientTape() as tape:
+ # 현재 상태에 대한 모델의 큐함수
+ predicts = self.model(history)
+ one_hot_action = tf.one_hot(actions, self.action_size)
+ predicts = tf.reduce_sum(one_hot_action * predicts, axis=1)
+
+ # 다음 상태에 대한 타깃 모델의 큐함수
+ target_predicts = self.target_model(next_history)
+
+ # 벨만 최적 방정식을 구성하기 위한 타깃과 큐함수의 최대 값 계산
+ max_q = np.amax(target_predicts, axis=1)
+ targets = rewards + (1 - dones) * self.discount_factor * max_q
+
+ # 후버로스 계산
+ error = tf.abs(targets - predicts)
+ quadratic_part = tf.clip_by_value(error, 0.0, 1.0)
+ linear_part = error - quadratic_part
+ loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part)
+
+ self.avg_loss += loss.numpy()
+
+ # 오류함수를 줄이는 방향으로 모델 업데이트
+ grads = tape.gradient(loss, model_params)
+ self.optimizer.apply_gradients(zip(grads, model_params))
+
+
+# 학습속도를 높이기 위해 흑백화면으로 전처리
+def pre_processing(observe):
+ processed_observe = np.uint8(
+ resize(rgb2gray(observe), (84, 84), mode='constant') * 255)
+ return processed_observe
+
+
+if __name__ == "__main__":
+ # 환경과 DQN 에이전트 생성
+ env = gym.make('BreakoutDeterministic-v4')
+ agent = DQNAgent(action_size=3)
+
+ global_step = 0
+ score_avg = 0
+ score_max = 0
+
+ # 불필요한 행동을 없애주기 위한 딕셔너리 선언
+ action_dict = {0:1, 1:2, 2:3, 3:3}
+
+ num_episode = 50000
+ for e in range(num_episode):
+ done = False
+ dead = False
+
+ step, score, start_life = 0, 0, 5
+ # env 초기화
+ observe = env.reset()
+
+ # 랜덤으로 뽑힌 값만큼의 프레임 동안 움직이지 않음
+ for _ in range(random.randint(1, agent.no_op_steps)):
+ observe, _, _, _ = env.step(1)
+
+ # 프레임을 전처리 한 후 4개의 상태를 쌓아서 입력값으로 사용.
+ state = pre_processing(observe)
+ history = np.stack((state, state, state, state), axis=2)
+ history = np.reshape([history], (1, 84, 84, 4))
+
+ while not done:
+ if agent.render:
+ env.render()
+ global_step += 1
+ step += 1
+
+ # 바로 전 history를 입력으로 받아 행동을 선택
+ action = agent.get_action(history)
+ # 1: 정지, 2: 왼쪽, 3: 오른쪽
+ real_action = action_dict[action]
+
+ # 죽었을 때 시작하기 위해 발사 행동을 함
+ if dead:
+ action, real_action, dead = 0, 1, False
+
+ # 선택한 행동으로 환경에서 한 타임스텝 진행
+ observe, reward, done, info = env.step(real_action)
+ # 각 타임스텝마다 상태 전처리
+ next_state = pre_processing(observe)
+ next_state = np.reshape([next_state], (1, 84, 84, 1))
+ next_history = np.append(next_state, history[:, :, :, :3], axis=3)
+
+ agent.avg_q_max += np.amax(agent.model(np.float32(history / 255.))[0])
+
+ if start_life > info['ale.lives']:
+ dead = True
+ start_life = info['ale.lives']
+
+ score += reward
+ reward = np.clip(reward, -1., 1.)
+ # 샘플을 리플레이 메모리에 저장 후 학습
+ agent.append_sample(history, action, reward, next_history, dead)
+
+ # 리플레이 메모리 크기가 정해놓은 수치에 도달한 시점부터 모델 학습 시작
+ if len(agent.memory) >= agent.train_start:
+ agent.train_model()
+ # 일정 시간마다 타깃 모델을 모델의 가중치로 업데이트
+ if global_step % agent.update_target_rate == 0:
+ agent.update_target_model()
+
+ if dead:
+ history = np.stack((next_state, next_state,
+ next_state, next_state), axis=2)
+ history = np.reshape([history], (1, 84, 84, 4))
+ else:
+ history = next_history
+
+ if done:
+ # 각 에피소드 당 학습 정보를 기록
+ if global_step > agent.train_start:
+ agent.draw_tensorboard(score, step, e)
+
+ score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
+ score_max = score if score > score_max else score_max
+
+ log = "episode: {:5d} | ".format(e)
+ log += "score: {:4.1f} | ".format(score)
+ log += "score max : {:4.1f} | ".format(score_max)
+ log += "score avg: {:4.1f} | ".format(score_avg)
+ log += "memory length: {:5d} | ".format(len(agent.memory))
+ log += "epsilon: {:.3f} | ".format(agent.epsilon)
+ log += "q avg : {:3.2f} | ".format(agent.avg_q_max / float(step))
+ log += "avg loss : {:3.2f}".format(agent.avg_loss / float(step))
+ print(log)
+
+ agent.avg_q_max, agent.avg_loss = 0, 0
+
+ # 1000 에피소드마다 모델 저장
+ if e % 1000 == 0:
+ agent.model.save_weights("./save_model/model", save_format="tf")
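
train.py above swaps the plain squared error of the cartpole example for a Huber-style loss built by hand from clip_by_value. A minimal sketch of just that loss on illustrative numbers:

import tensorflow as tf

# illustrative targets and Q-value predictions
targets = tf.constant([1.0, 0.0, 2.5])
predicts = tf.constant([0.2, 0.1, 0.3])

# Huber-style loss with delta = 1: quadratic for small errors, linear beyond 1
error = tf.abs(targets - predicts)
quadratic_part = tf.clip_by_value(error, 0.0, 1.0)
linear_part = error - quadratic_part
loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part)

Clipping the quadratic part at 1.0 makes the loss linear for TD errors above 1, which bounds the gradient magnitude on rare large errors.
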
diff --git a/3-atari/1-breakout/breakout_a3c.py b/3-atari/1-breakout/breakout_a3c.py
deleted file mode 100644
index f6db1c0..0000000
--- a/3-atari/1-breakout/breakout_a3c.py
+++ /dev/null
@@ -1,378 +0,0 @@
-from skimage.color import rgb2gray
-from skimage.transform import resize
-from keras.layers import Dense, Flatten, Input
-from keras.layers.convolutional import Conv2D
-from keras.optimizers import RMSprop
-from keras import backend as K
-from keras.models import Model
-import tensorflow as tf
-import numpy as np
-import threading
-import random
-import time
-import gym
-
-# 멀티쓰레딩을 위한 글로벌 변수
-global episode
-episode = 0
-EPISODES = 8000000
-# 환경 생성
-env_name = "BreakoutDeterministic-v4"
-
-
-# 브레이크아웃에서의 A3CAgent 클래스(글로벌신경망)
-class A3CAgent:
- def __init__(self, action_size):
- # 상태크기와 행동크기를 갖고옴
- self.state_size = (84, 84, 4)
- self.action_size = action_size
- # A3C 하이퍼파라미터
- self.discount_factor = 0.99
- self.no_op_steps = 30
- self.actor_lr = 2.5e-4
- self.critic_lr = 2.5e-4
- # 쓰레드의 갯수
- self.threads = 8
-
- # 정책신경망과 가치신경망을 생성
- self.actor, self.critic = self.build_model()
- # 정책신경망과 가치신경망을 업데이트하는 함수 생성
- self.optimizer = [self.actor_optimizer(), self.critic_optimizer()]
-
- # 텐서보드 설정
- self.sess = tf.InteractiveSession()
- K.set_session(self.sess)
- self.sess.run(tf.global_variables_initializer())
-
- self.summary_placeholders, self.update_ops, self.summary_op = \
- self.setup_summary()
- self.summary_writer = \
- tf.summary.FileWriter('summary/breakout_a3c', self.sess.graph)
-
- # 쓰레드를 만들어 학습을 하는 함수
- def train(self):
- # 쓰레드 수만큼 Agent 클래스 생성
- agents = [Agent(self.action_size, self.state_size,
- [self.actor, self.critic], self.sess,
- self.optimizer, self.discount_factor,
- [self.summary_op, self.summary_placeholders,
- self.update_ops, self.summary_writer])
- for _ in range(self.threads)]
-
- # 각 쓰레드 시작
- for agent in agents:
- time.sleep(1)
- agent.start()
-
- # 10분(600초)에 한번씩 모델을 저장
- while True:
- time.sleep(60 * 10)
- self.save_model("./save_model/breakout_a3c")
-
- # 정책신경망과 가치신경망을 생성
- def build_model(self):
- input = Input(shape=self.state_size)
- conv = Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(input)
- conv = Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv)
- conv = Flatten()(conv)
- fc = Dense(256, activation='relu')(conv)
-
- policy = Dense(self.action_size, activation='softmax')(fc)
- value = Dense(1, activation='linear')(fc)
-
- actor = Model(inputs=input, outputs=policy)
- critic = Model(inputs=input, outputs=value)
-
- # 가치와 정책을 예측하는 함수를 만들어냄
- actor._make_predict_function()
- critic._make_predict_function()
-
- actor.summary()
- critic.summary()
-
- return actor, critic
-
- # 정책신경망을 업데이트하는 함수
- def actor_optimizer(self):
- action = K.placeholder(shape=[None, self.action_size])
- advantages = K.placeholder(shape=[None, ])
-
- policy = self.actor.output
-
- # 정책 크로스 엔트로피 오류함수
- action_prob = K.sum(action * policy, axis=1)
- cross_entropy = K.log(action_prob + 1e-10) * advantages
- cross_entropy = -K.sum(cross_entropy)
-
- # 탐색을 지속적으로 하기 위한 엔트로피 오류
- entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)
- entropy = K.sum(entropy)
-
- # 두 오류함수를 더해 최종 오류함수를 만듬
- loss = cross_entropy + 0.01 * entropy
-
- optimizer = RMSprop(lr=self.actor_lr, rho=0.99, epsilon=0.01)
- updates = optimizer.get_updates(self.actor.trainable_weights, [],loss)
- train = K.function([self.actor.input, action, advantages],
- [loss], updates=updates)
- return train
-
- # 가치신경망을 업데이트하는 함수
- def critic_optimizer(self):
- discounted_prediction = K.placeholder(shape=(None,))
-
- value = self.critic.output
-
- # [반환값 - 가치]의 제곱을 오류함수로 함
- loss = K.mean(K.square(discounted_prediction - value))
-
- optimizer = RMSprop(lr=self.critic_lr, rho=0.99, epsilon=0.01)
- updates = optimizer.get_updates(self.critic.trainable_weights, [],loss)
- train = K.function([self.critic.input, discounted_prediction],
- [loss], updates=updates)
- return train
-
- def load_model(self, name):
- self.actor.load_weights(name + "_actor.h5")
- self.critic.load_weights(name + "_critic.h5")
-
- def save_model(self, name):
- self.actor.save_weights(name + "_actor.h5")
- self.critic.save_weights(name + "_critic.h5")
-
- # 각 에피소드 당 학습 정보를 기록
- def setup_summary(self):
- episode_total_reward = tf.Variable(0.)
- episode_avg_max_q = tf.Variable(0.)
- episode_duration = tf.Variable(0.)
-
- tf.summary.scalar('Total Reward/Episode', episode_total_reward)
- tf.summary.scalar('Average Max Prob/Episode', episode_avg_max_q)
- tf.summary.scalar('Duration/Episode', episode_duration)
-
- summary_vars = [episode_total_reward,
- episode_avg_max_q,
- episode_duration]
-
- summary_placeholders = [tf.placeholder(tf.float32)
- for _ in range(len(summary_vars))]
- update_ops = [summary_vars[i].assign(summary_placeholders[i])
- for i in range(len(summary_vars))]
- summary_op = tf.summary.merge_all()
- return summary_placeholders, update_ops, summary_op
-
-
-# 액터러너 클래스(쓰레드)
-class Agent(threading.Thread):
- def __init__(self, action_size, state_size, model, sess,
- optimizer, discount_factor, summary_ops):
- threading.Thread.__init__(self)
-
- # A3CAgent 클래스에서 상속
- self.action_size = action_size
- self.state_size = state_size
- self.actor, self.critic = model
- self.sess = sess
- self.optimizer = optimizer
- self.discount_factor = discount_factor
- [self.summary_op, self.summary_placeholders,
- self.update_ops, self.summary_writer] = summary_ops
-
- # 지정된 타임스텝동안 샘플을 저장할 리스트
- self.states, self.actions, self.rewards = [], [], []
-
- # 로컬 모델 생성
- self.local_actor, self.local_critic = self.build_local_model()
-
- self.avg_p_max = 0
- self.avg_loss = 0
-
- # 모델 업데이트 주기
- self.t_max = 20
- self.t = 0
-
- def run(self):
- global episode
- env = gym.make(env_name)
-
- step = 0
-
- while episode < EPISODES:
- done = False
- dead = False
-
- score, start_life = 0, 5
- observe = env.reset()
- next_observe = observe
-
- # 0~30 상태동안 정지
- for _ in range(random.randint(1, 30)):
- observe = next_observe
- next_observe, _, _, _ = env.step(1)
-
- state = pre_processing(next_observe, observe)
- history = np.stack((state, state, state, state), axis=2)
- history = np.reshape([history], (1, 84, 84, 4))
-
- while not done:
- step += 1
- self.t += 1
- observe = next_observe
- action, policy = self.get_action(history)
-
- # 1: 정지, 2: 왼쪽, 3: 오른쪽
- if action == 0:
- real_action = 1
- elif action == 1:
- real_action = 2
- else:
- real_action = 3
-
- # 죽었을 때 시작하기 위해 발사 행동을 함
- if dead:
- action = 0
- real_action = 1
- dead = False
-
- # 선택한 행동으로 한 스텝을 실행
- next_observe, reward, done, info = env.step(real_action)
-
- # 각 타임스텝마다 상태 전처리
- next_state = pre_processing(next_observe, observe)
- next_state = np.reshape([next_state], (1, 84, 84, 1))
- next_history = np.append(next_state, history[:, :, :, :3],
- axis=3)
-
- # 정책의 최대값
- self.avg_p_max += np.amax(self.actor.predict(
- np.float32(history / 255.)))
-
- if start_life > info['ale.lives']:
- dead = True
- start_life = info['ale.lives']
-
- score += reward
- reward = np.clip(reward, -1., 1.)
-
- # 샘플을 저장
- self.append_sample(history, action, reward)
-
- if dead:
- history = np.stack((next_state, next_state,
- next_state, next_state), axis=2)
- history = np.reshape([history], (1, 84, 84, 4))
- else:
- history = next_history
-
- # 에피소드가 끝나거나 최대 타임스텝 수에 도달하면 학습을 진행
- if self.t >= self.t_max or done:
- self.train_model(done)
- self.update_local_model()
- self.t = 0
-
- if done:
- # 각 에피소드 당 학습 정보를 기록
- episode += 1
- print("episode:", episode, " score:", score, " step:",
- step)
-
- stats = [score, self.avg_p_max / float(step),
- step]
- for i in range(len(stats)):
- self.sess.run(self.update_ops[i], feed_dict={
- self.summary_placeholders[i]: float(stats[i])
- })
- summary_str = self.sess.run(self.summary_op)
- self.summary_writer.add_summary(summary_str, episode + 1)
- self.avg_p_max = 0
- self.avg_loss = 0
- step = 0
-
- # k-스텝 prediction 계산
- def discounted_prediction(self, rewards, done):
- discounted_prediction = np.zeros_like(rewards)
- running_add = 0
-
- if not done:
- running_add = self.local_critic.predict(np.float32(
- self.states[-1] / 255.))[0]
-
- for t in reversed(range(0, len(rewards))):
- running_add = running_add * self.discount_factor + rewards[t]
- discounted_prediction[t] = running_add
- return discounted_prediction
-
- # 정책신경망과 가치신경망을 업데이트
- def train_model(self, done):
- discounted_prediction = self.discounted_prediction(self.rewards, done)
-
- states = np.zeros((len(self.states), 84, 84, 4))
- for i in range(len(self.states)):
- states[i] = self.states[i]
-
- states = np.float32(states / 255.)
-
- values = self.local_critic.predict(states)
- values = np.reshape(values, len(values))
-
- advantages = discounted_prediction - values
-
- self.optimizer[0]([states, self.actions, advantages])
- self.optimizer[1]([states, discounted_prediction])
- self.states, self.actions, self.rewards = [], [], []
-
- # 로컬신경망을 생성하는 함수
- def build_local_model(self):
- input = Input(shape=self.state_size)
- conv = Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(input)
- conv = Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv)
- conv = Flatten()(conv)
- fc = Dense(256, activation='relu')(conv)
- policy = Dense(self.action_size, activation='softmax')(fc)
- value = Dense(1, activation='linear')(fc)
-
- local_actor = Model(inputs=input, outputs=policy)
- local_critic = Model(inputs=input, outputs=value)
-
- local_actor._make_predict_function()
- local_critic._make_predict_function()
-
- local_actor.set_weights(self.actor.get_weights())
- local_critic.set_weights(self.critic.get_weights())
-
- local_actor.summary()
- local_critic.summary()
-
- return local_actor, local_critic
-
- # 로컬신경망을 글로벌신경망으로 업데이트
- def update_local_model(self):
- self.local_actor.set_weights(self.actor.get_weights())
- self.local_critic.set_weights(self.critic.get_weights())
-
- # 정책신경망의 출력을 받아서 확률적으로 행동을 선택
- def get_action(self, history):
- history = np.float32(history / 255.)
- policy = self.local_actor.predict(history)[0]
- action_index = np.random.choice(self.action_size, 1, p=policy)[0]
- return action_index, policy
-
- # 샘플을 저장
- def append_sample(self, history, action, reward):
- self.states.append(history)
- act = np.zeros(self.action_size)
- act[action] = 1
- self.actions.append(act)
- self.rewards.append(reward)
-
-
-# 학습속도를 높이기 위해 흑백화면으로 전처리
-def pre_processing(next_observe, observe):
- processed_observe = np.maximum(next_observe, observe)
- processed_observe = np.uint8(
- resize(rgb2gray(processed_observe), (84, 84), mode='constant') * 255)
- return processed_observe
-
-if __name__ == "__main__":
- global_agent = A3CAgent(action_size=3)
- global_agent.train()
diff --git a/3-atari/1-breakout/breakout_dqn.py b/3-atari/1-breakout/breakout_dqn.py
deleted file mode 100644
index eafe83a..0000000
--- a/3-atari/1-breakout/breakout_dqn.py
+++ /dev/null
@@ -1,263 +0,0 @@
-from keras.layers.convolutional import Conv2D
-from keras.layers import Dense, Flatten
-from keras.optimizers import RMSprop
-from keras.models import Sequential
-from skimage.transform import resize
-from skimage.color import rgb2gray
-from collections import deque
-from keras import backend as K
-import tensorflow as tf
-import numpy as np
-import random
-import gym
-
-EPISODES = 50000
-
-
-# 브레이크아웃에서의 DQN 에이전트
-class DQNAgent:
- def __init__(self, action_size):
- self.render = False
- self.load_model = False
- # 상태와 행동의 크기 정의
- self.state_size = (84, 84, 4)
- self.action_size = action_size
- # DQN 하이퍼파라미터
- self.epsilon = 1.
- self.epsilon_start, self.epsilon_end = 1.0, 0.1
- self.exploration_steps = 1000000.
- self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) \
- / self.exploration_steps
- self.batch_size = 32
- self.train_start = 50000
- self.update_target_rate = 10000
- self.discount_factor = 0.99
- # 리플레이 메모리, 최대 크기 400000
- self.memory = deque(maxlen=400000)
- self.no_op_steps = 30
- # 모델과 타겟모델을 생성하고 타겟모델 초기화
- self.model = self.build_model()
- self.target_model = self.build_model()
- self.update_target_model()
-
- self.optimizer = self.optimizer()
-
- # 텐서보드 설정
- self.sess = tf.InteractiveSession()
- K.set_session(self.sess)
-
- self.avg_q_max, self.avg_loss = 0, 0
- self.summary_placeholders, self.update_ops, self.summary_op = \
- self.setup_summary()
- self.summary_writer = tf.summary.FileWriter(
- 'summary/breakout_dqn', self.sess.graph)
- self.sess.run(tf.global_variables_initializer())
-
- if self.load_model:
- self.model.load_weights("./save_model/breakout_dqn.h5")
-
- # Huber Loss를 이용하기 위해 최적화 함수를 직접 정의
- def optimizer(self):
- a = K.placeholder(shape=(None,), dtype='int32')
- y = K.placeholder(shape=(None,), dtype='float32')
-
- prediction = self.model.output
-
- a_one_hot = K.one_hot(a, self.action_size)
- q_value = K.sum(prediction * a_one_hot, axis=1)
- error = K.abs(y - q_value)
-
- quadratic_part = K.clip(error, 0.0, 1.0)
- linear_part = error - quadratic_part
- loss = K.mean(0.5 * K.square(quadratic_part) + linear_part)
-
- optimizer = RMSprop(lr=0.00025, epsilon=0.01)
- updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
- train = K.function([self.model.input, a, y], [loss], updates=updates)
-
- return train
-
- # 상태가 입력, 큐함수가 출력인 인공신경망 생성
- def build_model(self):
- model = Sequential()
- model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
- input_shape=self.state_size))
- model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu'))
- model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu'))
- model.add(Flatten())
- model.add(Dense(512, activation='relu'))
- model.add(Dense(self.action_size))
- model.summary()
- return model
-
- # 타겟 모델을 모델의 가중치로 업데이트
- def update_target_model(self):
- self.target_model.set_weights(self.model.get_weights())
-
- # 입실론 탐욕 정책으로 행동 선택
- def get_action(self, history):
- history = np.float32(history / 255.0)
- if np.random.rand() <= self.epsilon:
- return random.randrange(self.action_size)
- else:
- q_value = self.model.predict(history)
- return np.argmax(q_value[0])
-
- # 샘플 을 리플레이 메모리에 저장
- def append_sample(self, history, action, reward, next_history, dead):
- self.memory.append((history, action, reward, next_history, dead))
-
- # 리플레이 메모리에서 무작위로 추출한 배치로 모델 학습
- def train_model(self):
- if self.epsilon > self.epsilon_end:
- self.epsilon -= self.epsilon_decay_step
-
- mini_batch = random.sample(self.memory, self.batch_size)
-
- history = np.zeros((self.batch_size, self.state_size[0],
- self.state_size[1], self.state_size[2]))
- next_history = np.zeros((self.batch_size, self.state_size[0],
- self.state_size[1], self.state_size[2]))
- target = np.zeros((self.batch_size,))
- action, reward, dead = [], [], []
-
- for i in range(self.batch_size):
- history[i] = np.float32(mini_batch[i][0] / 255.)
- next_history[i] = np.float32(mini_batch[i][3] / 255.)
- action.append(mini_batch[i][1])
- reward.append(mini_batch[i][2])
- dead.append(mini_batch[i][4])
-
- target_value = self.target_model.predict(next_history)
-
- for i in range(self.batch_size):
- if dead[i]:
- target[i] = reward[i]
- else:
- target[i] = reward[i] + self.discount_factor * \
- np.amax(target_value[i])
-
- loss = self.optimizer([history, action, target])
- self.avg_loss += loss[0]
-
- # 각 에피소드 당 학습 정보를 기록
- def setup_summary(self):
- episode_total_reward = tf.Variable(0.)
- episode_avg_max_q = tf.Variable(0.)
- episode_duration = tf.Variable(0.)
- episode_avg_loss = tf.Variable(0.)
-
- tf.summary.scalar('Total Reward/Episode', episode_total_reward)
- tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q)
- tf.summary.scalar('Duration/Episode', episode_duration)
- tf.summary.scalar('Average Loss/Episode', episode_avg_loss)
-
- summary_vars = [episode_total_reward, episode_avg_max_q,
- episode_duration, episode_avg_loss]
- summary_placeholders = [tf.placeholder(tf.float32) for _ in
- range(len(summary_vars))]
- update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in
- range(len(summary_vars))]
- summary_op = tf.summary.merge_all()
- return summary_placeholders, update_ops, summary_op
-
-
-# 학습속도를 높이기 위해 흑백화면으로 전처리
-def pre_processing(observe):
- processed_observe = np.uint8(
- resize(rgb2gray(observe), (84, 84), mode='constant') * 255)
- return processed_observe
-
-
-if __name__ == "__main__":
- # 환경과 DQN 에이전트 생성
- env = gym.make('BreakoutDeterministic-v4')
- agent = DQNAgent(action_size=3)
-
- scores, episodes, global_step = [], [], 0
-
- for e in range(EPISODES):
- done = False
- dead = False
-
- step, score, start_life = 0, 0, 5
- observe = env.reset()
-
- for _ in range(random.randint(1, agent.no_op_steps)):
- observe, _, _, _ = env.step(1)
-
- state = pre_processing(observe)
- history = np.stack((state, state, state, state), axis=2)
- history = np.reshape([history], (1, 84, 84, 4))
-
- while not done:
- if agent.render:
- env.render()
- global_step += 1
- step += 1
-
- # 바로 전 4개의 상태로 행동을 선택
- action = agent.get_action(history)
- # 1: 정지, 2: 왼쪽, 3: 오른쪽
- if action == 0:
- real_action = 1
- elif action == 1:
- real_action = 2
- else:
- real_action = 3
-
- # 선택한 행동으로 환경에서 한 타임스텝 진행
- observe, reward, done, info = env.step(real_action)
- # 각 타임스텝마다 상태 전처리
- next_state = pre_processing(observe)
- next_state = np.reshape([next_state], (1, 84, 84, 1))
- next_history = np.append(next_state, history[:, :, :, :3], axis=3)
-
- agent.avg_q_max += np.amax(
- agent.model.predict(np.float32(history / 255.))[0])
-
- if start_life > info['ale.lives']:
- dead = True
- start_life = info['ale.lives']
-
- reward = np.clip(reward, -1., 1.)
- # 샘플 을 리플레이 메모리에 저장 후 학습
- agent.append_sample(history, action, reward, next_history, dead)
-
- if len(agent.memory) >= agent.train_start:
- agent.train_model()
-
- # 일정 시간마다 타겟모델을 모델의 가중치로 업데이트
- if global_step % agent.update_target_rate == 0:
- agent.update_target_model()
-
- score += reward
-
- if dead:
- dead = False
- else:
- history = next_history
-
- if done:
- # 각 에피소드 당 학습 정보를 기록
- if global_step > agent.train_start:
- stats = [score, agent.avg_q_max / float(step), step,
- agent.avg_loss / float(step)]
- for i in range(len(stats)):
- agent.sess.run(agent.update_ops[i], feed_dict={
- agent.summary_placeholders[i]: float(stats[i])
- })
- summary_str = agent.sess.run(agent.summary_op)
- agent.summary_writer.add_summary(summary_str, e + 1)
-
- print("episode:", e, " score:", score, " memory length:",
- len(agent.memory), " epsilon:", agent.epsilon,
- " global_step:", global_step, " average_q:",
- agent.avg_q_max / float(step), " average loss:",
- agent.avg_loss / float(step))
-
- agent.avg_q_max, agent.avg_loss = 0, 0
-
- # 1000 에피소드마다 모델 저장
- if e % 1000 == 0:
- agent.model.save_weights("./save_model/breakout_dqn.h5")
diff --git a/3-atari/1-breakout/play_a3c_model.py b/3-atari/1-breakout/play_a3c_model.py
deleted file mode 100644
index bde3088..0000000
--- a/3-atari/1-breakout/play_a3c_model.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import gym
-import random
-import numpy as np
-from skimage.color import rgb2gray
-from skimage.transform import resize
-from keras.models import Model
-from keras.layers import Dense, Flatten, Input
-from keras.layers.convolutional import Conv2D
-
-global episode
-episode = 0
-EPISODES = 100
-env_name = "BreakoutDeterministic-v4"
-
-class TestAgent:
- def __init__(self, action_size):
- self.state_size = (84, 84, 4)
- self.action_size = action_size
-
- self.discount_factor = 0.99
- self.no_op_steps = 30
-
- self.actor, self.critic = self.build_model()
-
- def build_model(self):
- input = Input(shape=self.state_size)
- conv = Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(input)
- conv = Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv)
- conv = Flatten()(conv)
- fc = Dense(256, activation='relu')(conv)
- policy = Dense(self.action_size, activation='softmax')(fc)
- value = Dense(1, activation='linear')(fc)
-
- actor = Model(inputs=input, outputs=policy)
- critic = Model(inputs=input, outputs=value)
-
- actor.summary()
- critic.summary()
-
- return actor, critic
-
- def get_action(self, history):
- history = np.float32(history / 255.)
- policy = self.actor.predict(history)[0]
-
- action_index = np.argmax(policy)
- return action_index
-
- def load_model(self, name):
- self.actor.load_weights(name)
-
-def pre_processing(next_observe, observe):
- processed_observe = np.maximum(next_observe, observe)
- processed_observe = np.uint8(
- resize(rgb2gray(processed_observe), (84, 84), mode='constant') * 255)
- return processed_observe
-
-
-if __name__ == "__main__":
- env = gym.make(env_name)
- agent = TestAgent(action_size=3)
- agent.load_model("save_model/breakout_a3c_5_actor.h5")
-
- step = 0
-
- while episode < EPISODES:
- done = False
- dead = False
-
- score, start_life = 0, 5
- observe = env.reset()
- next_observe = observe
-
- for _ in range(random.randint(1, agent.no_op_steps)):
- observe = next_observe
- next_observe, _, _, _ = env.step(1)
-
- state = pre_processing(next_observe, observe)
- history = np.stack((state, state, state, state), axis=2)
- history = np.reshape([history], (1, 84, 84, 4))
-
- while not done:
- env.render()
- step += 1
- observe = next_observe
-
- action = agent.get_action(history)
-
- if action == 1:
- fake_action = 2
- elif action == 2:
- fake_action = 3
- else:
- fake_action = 1
-
- if dead:
- fake_action = 1
- dead = False
-
- next_observe, reward, done, info = env.step(fake_action)
-
- next_state = pre_processing(next_observe, observe)
- next_state = np.reshape([next_state], (1, 84, 84, 1))
- next_history = np.append(next_state, history[:, :, :, :3], axis=3)
-
- if start_life > info['ale.lives']:
- dead = True
- reward = -1
- start_life = info['ale.lives']
-
- score += reward
-
- # if agent is dead, then reset the history
- if dead:
- history = np.stack(
- (next_state, next_state, next_state, next_state), axis=2)
- history = np.reshape([history], (1, 84, 84, 4))
- else:
- history = next_history
-
- # if done, plot the score over episodes
- if done:
- episode += 1
- print("episode:", episode, " score:", score, " step:", step)
- step = 0
\ No newline at end of file
diff --git a/3-atari/1-breakout/play_dqn_model.py b/3-atari/1-breakout/play_dqn_model.py
deleted file mode 100644
index 2710d58..0000000
--- a/3-atari/1-breakout/play_dqn_model.py
+++ /dev/null
@@ -1,109 +0,0 @@
-import gym
-import random
-import numpy as np
-import tensorflow as tf
-from skimage.color import rgb2gray
-from skimage.transform import resize
-from keras.models import Sequential
-from keras.layers import Dense, Flatten
-from keras.layers.convolutional import Conv2D
-from keras import backend as K
-
-EPISODES = 50000
-
-class TestAgent:
- def __init__(self, action_size):
- self.state_size = (84, 84, 4)
- self.action_size = action_size
- self.no_op_steps = 20
-
- self.model = self.build_model()
-
- self.sess = tf.InteractiveSession()
- K.set_session(self.sess)
-
- self.avg_q_max, self.avg_loss = 0, 0
- self.sess.run(tf.global_variables_initializer())
-
- def build_model(self):
- model = Sequential()
- model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
- input_shape=self.state_size))
- model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu'))
- model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu'))
- model.add(Flatten())
- model.add(Dense(512, activation='relu'))
- model.add(Dense(self.action_size))
- model.summary()
-
- return model
-
- def get_action(self, history):
- if np.random.random() < 0.01:
- return random.randrange(3)
- history = np.float32(history / 255.0)
- q_value = self.model.predict(history)
- return np.argmax(q_value[0])
-
- def load_model(self, filename):
- self.model.load_weights(filename)
-
-def pre_processing(observe):
- processed_observe = np.uint8(
- resize(rgb2gray(observe), (84, 84), mode='constant') * 255)
- return processed_observe
-
-
-if __name__ == "__main__":
- env = gym.make('BreakoutDeterministic-v4')
- agent = TestAgent(action_size=3)
- agent.load_model("./save_model/breakout_dqn_5.h5")
-
- for e in range(EPISODES):
- done = False
- dead = False
-
- step, score, start_life = 0, 0, 5
- observe = env.reset()
-
- for _ in range(random.randint(1, agent.no_op_steps)):
- observe, _, _, _ = env.step(1)
-
- state = pre_processing(observe)
- history = np.stack((state, state, state, state), axis=2)
- history = np.reshape([history], (1, 84, 84, 4))
-
- while not done:
- env.render()
- step += 1
-
- action = agent.get_action(history)
-
- if action == 0:
- real_action = 1
- elif action == 1:
- real_action = 2
- else:
- real_action = 3
-
- if dead:
- real_action = 1
- dead = False
-
- observe, reward, done, info = env.step(real_action)
-
- next_state = pre_processing(observe)
- next_state = np.reshape([next_state], (1, 84, 84, 1))
- next_history = np.append(next_state, history[:, :, :, :3], axis=3)
-
- if start_life > info['ale.lives']:
- dead = True
- start_life = info['ale.lives']
-
- score += reward
-
- history = next_history
-
- if done:
- print("episode:", e, " score:", score)
-
diff --git a/3-atari/1-breakout/save_model/breakout_a3c_1_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_1_actor.h5
deleted file mode 100644
index 37a6a1a..0000000
Binary files a/3-atari/1-breakout/save_model/breakout_a3c_1_actor.h5 and /dev/null differ
diff --git a/3-atari/1-breakout/save_model/breakout_a3c_1_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_1_critic.h5
deleted file mode 100644
index 3d3394a..0000000
Binary files a/3-atari/1-breakout/save_model/breakout_a3c_1_critic.h5 and /dev/null differ
diff --git a/3-atari/1-breakout/save_model/breakout_a3c_2_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_2_actor.h5
deleted file mode 100644
index 21207c0..0000000
Binary files a/3-atari/1-breakout/save_model/breakout_a3c_2_actor.h5 and /dev/null differ
diff --git a/3-atari/1-breakout/save_model/breakout_a3c_2_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_2_critic.h5
deleted file mode 100644
index a26f7d8..0000000
Binary files a/3-atari/1-breakout/save_model/breakout_a3c_2_critic.h5 and /dev/null differ
diff --git a/3-atari/1-breakout/save_model/breakout_a3c_3_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_3_actor.h5
deleted file mode 100644
index a27e766..0000000
Binary files a/3-atari/1-breakout/save_model/breakout_a3c_3_actor.h5 and /dev/null differ
diff --git a/3-atari/1-breakout/save_model/breakout_a3c_3_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_3_critic.h5
deleted file mode 100644
index 62236fc..0000000
Binary files a/3-atari/1-breakout/save_model/breakout_a3c_3_critic.h5 and /dev/null differ
diff --git a/3-atari/1-breakout/save_model/breakout_a3c_4_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_4_actor.h5
deleted file mode 100644
index db855b2..0000000
Binary files a/3-atari/1-breakout/save_model/breakout_a3c_4_actor.h5 and /dev/null differ
diff --git a/3-atari/1-breakout/save_model/breakout_a3c_4_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_4_critic.h5
deleted file mode 100644
index 3636d02..0000000
Binary files a/3-atari/1-breakout/save_model/breakout_a3c_4_critic.h5 and /dev/null differ
diff --git a/3-atari/1-breakout/save_model/breakout_a3c_5_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_5_actor.h5
deleted file mode 100644
index a993bbc..0000000
Binary files a/3-atari/1-breakout/save_model/breakout_a3c_5_actor.h5 and /dev/null differ
diff --git a/3-atari/1-breakout/save_model/breakout_a3c_5_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_5_critic.h5
deleted file mode 100644
index 983e6c6..0000000
Binary files a/3-atari/1-breakout/save_model/breakout_a3c_5_critic.h5 and /dev/null differ
diff --git a/3-atari/1-breakout/save_model/breakout_dqn.h5 b/3-atari/1-breakout/save_model/breakout_dqn.h5
deleted file mode 100644
index fec0537..0000000
Binary files a/3-atari/1-breakout/save_model/breakout_dqn.h5 and /dev/null differ
diff --git a/3-atari/1-breakout/save_model/breakout_dqn_1.h5 b/3-atari/1-breakout/save_model/breakout_dqn_1.h5
deleted file mode 100644
index c6e636a..0000000
Binary files a/3-atari/1-breakout/save_model/breakout_dqn_1.h5 and /dev/null differ
diff --git a/3-atari/1-breakout/save_model/breakout_dqn_2.h5 b/3-atari/1-breakout/save_model/breakout_dqn_2.h5
deleted file mode 100644
index 85544cc..0000000
Binary files a/3-atari/1-breakout/save_model/breakout_dqn_2.h5 and /dev/null differ
diff --git a/3-atari/1-breakout/save_model/breakout_dqn_3.h5 b/3-atari/1-breakout/save_model/breakout_dqn_3.h5
deleted file mode 100644
index 11bad3e..0000000
Binary files a/3-atari/1-breakout/save_model/breakout_dqn_3.h5 and /dev/null differ
diff --git a/3-atari/1-breakout/save_model/breakout_dqn_4.h5 b/3-atari/1-breakout/save_model/breakout_dqn_4.h5
deleted file mode 100644
index f871888..0000000
Binary files a/3-atari/1-breakout/save_model/breakout_dqn_4.h5 and /dev/null differ
diff --git a/3-atari/1-breakout/save_model/breakout_dqn_5.h5 b/3-atari/1-breakout/save_model/breakout_dqn_5.h5
deleted file mode 100644
index f82ad02..0000000
Binary files a/3-atari/1-breakout/save_model/breakout_dqn_5.h5 and /dev/null differ
diff --git a/3-atari/1-breakout/summary/breakout_a3c/events.out.tfevents.1497264638 b/3-atari/1-breakout/summary/breakout_a3c/events.out.tfevents.1497264638
deleted file mode 100644
index 1eb4343..0000000
Binary files a/3-atari/1-breakout/summary/breakout_a3c/events.out.tfevents.1497264638 and /dev/null differ
diff --git a/3-atari/1-breakout/summary/breakout_dqn/events.out.tfevents.1496968668.young-System-Product-Name b/3-atari/1-breakout/summary/breakout_dqn/events.out.tfevents.1496968668.young-System-Product-Name
deleted file mode 100644
index 2e394ad..0000000
Binary files a/3-atari/1-breakout/summary/breakout_dqn/events.out.tfevents.1496968668.young-System-Product-Name and /dev/null differ
diff --git a/3-atari/2-breakout-a3c/save_model/trained/checkpoint b/3-atari/2-breakout-a3c/save_model/trained/checkpoint
new file mode 100644
index 0000000..a6e034f
--- /dev/null
+++ b/3-atari/2-breakout-a3c/save_model/trained/checkpoint
@@ -0,0 +1,2 @@
+model_checkpoint_path: "model"
+all_model_checkpoint_paths: "model"
diff --git a/3-atari/2-breakout-a3c/save_model/trained/model.data-00000-of-00002 b/3-atari/2-breakout-a3c/save_model/trained/model.data-00000-of-00002
new file mode 100644
index 0000000..5ac027a
Binary files /dev/null and b/3-atari/2-breakout-a3c/save_model/trained/model.data-00000-of-00002 differ
diff --git a/3-atari/2-breakout-a3c/save_model/trained/model.data-00001-of-00002 b/3-atari/2-breakout-a3c/save_model/trained/model.data-00001-of-00002
new file mode 100644
index 0000000..8a3a04c
Binary files /dev/null and b/3-atari/2-breakout-a3c/save_model/trained/model.data-00001-of-00002 differ
diff --git a/3-atari/2-breakout-a3c/save_model/trained/model.index b/3-atari/2-breakout-a3c/save_model/trained/model.index
new file mode 100644
index 0000000..98da0da
Binary files /dev/null and b/3-atari/2-breakout-a3c/save_model/trained/model.index differ
diff --git a/3-atari/2-breakout-a3c/summary/breakout_a3c/events.out.tfevents.1583895598.cqcpu3.12464.143.v2 b/3-atari/2-breakout-a3c/summary/breakout_a3c/events.out.tfevents.1583895598.cqcpu3.12464.143.v2
new file mode 100644
index 0000000..beeeab0
Binary files /dev/null and b/3-atari/2-breakout-a3c/summary/breakout_a3c/events.out.tfevents.1583895598.cqcpu3.12464.143.v2 differ
diff --git a/3-atari/2-breakout-a3c/test.py b/3-atari/2-breakout-a3c/test.py
new file mode 100644
index 0000000..7787a24
--- /dev/null
+++ b/3-atari/2-breakout-a3c/test.py
@@ -0,0 +1,125 @@
+import gym
+import time
+import random
+import numpy as np
+import tensorflow as tf
+
+from skimage.color import rgb2gray
+from skimage.transform import resize
+
+from tensorflow.keras.layers import Conv2D, Flatten, Dense
+
+
+# ActorCritic neural network
+class ActorCritic(tf.keras.Model):
+ def __init__(self, action_size, state_size):
+ super(ActorCritic, self).__init__()
+
+ self.conv1 = Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
+ input_shape=state_size)
+ self.conv2 = Conv2D(64, (4, 4), strides=(2, 2), activation='relu')
+ self.conv3 = Conv2D(64, (3, 3), strides=(1, 1), activation='relu')
+ self.flatten = Flatten()
+ self.shared_fc = Dense(512, activation='relu')
+
+ self.policy = Dense(action_size, activation='linear')
+ self.value = Dense(1, activation='linear')
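+        # Both heads share the conv/FC trunk above. The policy head uses a
+        # linear activation, so call() returns raw logits; tf.nn.softmax is
+        # applied later in get_action.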
+
+ def call(self, x):
+ x = self.conv1(x)
+ x = self.conv2(x)
+        x = self.conv3(x)
+        x = self.flatten(x)
+ x = self.shared_fc(x)
+
+ policy = self.policy(x)
+ value = self.value(x)
+ return policy, value
+
+
+# A3C agent class for testing on Breakout
+class A3CTestAgent:
+ def __init__(self, action_size, state_size, model_path):
+ self.action_size = action_size
+
+ self.model = ActorCritic(action_size, state_size)
+ self.model.load_weights(model_path)
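+        # model_path is a TensorFlow checkpoint prefix, i.e. it points at the
+        # model.index / model.data-* files added under save_model/trained/,
+        # not at a single .h5 file.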
+
+ def get_action(self, history):
+ history = np.float32(history / 255.)
+ policy = self.model(history)[0][0]
+ policy = tf.nn.softmax(policy)
+ action_index = np.random.choice(self.action_size, 1, p=policy.numpy())[0]
+ return action_index, policy
+
+
+def pre_processing(observe):
+ processed_observe = np.uint8(
+ resize(rgb2gray(observe), (84, 84), mode='constant') * 255)
+ return processed_observe
+
+
+if __name__ == "__main__":
+    # create the environment and model for testing
+ env = gym.make("BreakoutDeterministic-v4")
+ state_size = (84, 84, 4)
+ action_size = 3
+ model_path = './save_model/trained/model'
+ render = True
+
+ agent = A3CTestAgent(action_size, state_size, model_path)
+ action_dict = {0:1, 1:2, 2:3, 3:3}
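+    # The agent only learns 3 actions; action_dict maps them onto the ALE
+    # action set (assumed here to be 0: NOOP, 1: FIRE, 2: RIGHT, 3: LEFT),
+    # e.g. agent action 0 -> env action 1, which launches the ball.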
+
+ num_episode = 10
+ for e in range(num_episode):
+ done = False
+ dead = False
+
+ score, start_life = 0, 5
+ observe = env.reset()
+
+        # do not move for a randomly chosen number of initial frames
+ for _ in range(random.randint(1, 30)):
+ observe, _, _, _ = env.step(1)
+
+        # preprocess the frame, then stack 4 states to use as the input
+ state = pre_processing(observe)
+ history = np.stack([state, state, state, state], axis=2)
+ history = np.reshape([history], (1, 84, 84, 4))
+
+ while not done:
+ if render:
+ env.render()
+ time.sleep(0.05)
+
+            # select an action according to the policy probabilities
+ action, policy = agent.get_action(history)
+            # ALE actions: 1 = fire, 2 = right, 3 = left
+ real_action = action_dict[action]
+            # fire to restart after losing a life
+ if dead:
+ action, real_action, dead = 0, 1, False
+
+            # advance the environment one timestep with the chosen action
+ observe, reward, done, info = env.step(real_action)
+
+            # preprocess the state at every timestep
+ next_state = pre_processing(observe)
+ next_state = np.reshape([next_state], (1, 84, 84, 1))
+ next_history = np.append(next_state, history[:, :, :, :3], axis=3)
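+            # next_history is a sliding window over the 4 most recent frames:
+            # the new frame goes in front and the oldest frame is dropped.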
+
+ if start_life > info['ale.lives']:
+ dead, start_life = True, info['ale.lives']
+
+ score += reward
+
+ if dead:
+ history = np.stack((next_state, next_state,
+ next_state, next_state), axis=2)
+ history = np.reshape([history], (1, 84, 84, 4))
+ else:
+ history = next_history
+
+ if done:
+                # log the result of each episode
+ print("episode: {:3d} | score : {:4.1f}".format(e, score))
diff --git a/3-atari/2-breakout-a3c/train.py b/3-atari/2-breakout-a3c/train.py
new file mode 100644
index 0000000..dd01000
--- /dev/null
+++ b/3-atari/2-breakout-a3c/train.py
@@ -0,0 +1,309 @@
+import os
+import gym
+import time
+import threading
+import random
+import numpy as np
+import tensorflow as tf
+
+from skimage.color import rgb2gray
+from skimage.transform import resize
+
+from tensorflow.compat.v1.train import AdamOptimizer
+from tensorflow.keras.layers import Conv2D, Flatten, Dense
+
+# global variables for multithreading
+global episode, score_avg, score_max
+episode, score_avg, score_max = 0, 0, 0
+num_episode = 8000000
+
+
+# ActorCritic neural network
+class ActorCritic(tf.keras.Model):
+ def __init__(self, action_size, state_size):
+ super(ActorCritic, self).__init__()
+
+ self.conv1 = Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
+ input_shape=state_size)
+ self.conv2 = Conv2D(64, (4, 4), strides=(2, 2), activation='relu')
+ self.conv3 = Conv2D(64, (3, 3), strides=(1, 1), activation='relu')
+ self.flatten = Flatten()
+ self.shared_fc = Dense(512, activation='relu')
+
+ self.policy = Dense(action_size, activation='linear')
+ self.value = Dense(1, activation='linear')
+
+ def call(self, x):
+ x = self.conv1(x)
+ x = self.conv2(x)
+        x = self.conv3(x)
+        x = self.flatten(x)
+ x = self.shared_fc(x)
+
+ policy = self.policy(x)
+ value = self.value(x)
+ return policy, value
+
+
+# A3CAgent class for Breakout (holds the global network)
+class A3CAgent():
+ def __init__(self, action_size, env_name):
+ self.env_name = env_name
+        # define the state and action sizes
+ self.state_size = (84, 84, 4)
+ self.action_size = action_size
+        # A3C hyperparameters
+ self.discount_factor = 0.99
+ self.no_op_steps = 30
+ self.lr = 1e-4
+        # number of worker threads
+ self.threads = 16
+
+        # create the global network
+ self.global_model = ActorCritic(self.action_size, self.state_size)
+        # initialize the global network's weights
+ self.global_model.build(tf.TensorShape((None, *self.state_size)))
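+        # build() with input shape (None, 84, 84, 4) creates the weights up
+        # front so they can be saved and copied to the worker threads before
+        # any forward pass has run.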
+
+        # create the optimizer that updates the network
+ self.optimizer = AdamOptimizer(self.lr, use_locking=True)
+
+        # TensorBoard setup
+ self.writer = tf.summary.create_file_writer('summary/breakout_a3c')
+        # path where the trained global model will be saved
+ self.model_path = os.path.join(os.getcwd(), 'save_model', 'model')
+
+    # create the worker threads and run training
+ def train(self):
+        # create as many Runner instances as there are threads
+ runners = [Runner(self.action_size, self.state_size,
+ self.global_model, self.optimizer,
+ self.discount_factor, self.env_name,
+ self.writer) for i in range(self.threads)]
+
+        # start each thread
+ for i, runner in enumerate(runners):
+ print("Start worker #{:d}".format(i))
+ runner.start()
+
+        # save the model every 10 minutes (600 seconds)
+ while True:
+ self.global_model.save_weights(self.model_path, save_format="tf")
+ time.sleep(60 * 10)
+
+
+# actor-runner class (one per thread)
+class Runner(threading.Thread):
+ global_episode = 0
+
+ def __init__(self, action_size, state_size, global_model,
+ optimizer, discount_factor, env_name, writer):
+ threading.Thread.__init__(self)
+
+        # hyperparameters passed in from the A3CAgent class
+ self.action_size = action_size
+ self.state_size = state_size
+ self.global_model = global_model
+ self.optimizer = optimizer
+ self.discount_factor = discount_factor
+
+ self.states, self.actions, self.rewards = [], [], []
+
+        # create the environment, local network, and TensorBoard writer
+ self.local_model = ActorCritic(action_size, state_size)
+ self.env = gym.make(env_name)
+ self.writer = writer
+
+        # variables for recording training info
+ self.avg_p_max = 0
+ self.avg_loss = 0
+        # set the k-timestep (rollout length) value
+ self.t_max = 20
+ self.t = 0
+        # dictionary that removes unnecessary actions
+ self.action_dict = {0:1, 1:2, 2:3, 3:3}
+
+    # record training info in TensorBoard
+ def draw_tensorboard(self, score, step, e):
+ avg_p_max = self.avg_p_max / float(step)
+ with self.writer.as_default():
+ tf.summary.scalar('Total Reward/Episode', score, step=e)
+ tf.summary.scalar('Average Max Prob/Episode', avg_p_max, step=e)
+ tf.summary.scalar('Duration/Episode', step, step=e)
+
+    # select an action stochastically from the policy network output
+ def get_action(self, history):
+ history = np.float32(history / 255.)
+ policy = self.local_model(history)[0][0]
+ policy = tf.nn.softmax(policy)
+ action_index = np.random.choice(self.action_size, 1, p=policy.numpy())[0]
+ return action_index, policy
+
+    # store a sample
+ def append_sample(self, history, action, reward):
+ self.states.append(history)
+ act = np.zeros(self.action_size)
+ act[action] = 1
+ self.actions.append(act)
+ self.rewards.append(reward)
+
+    # compute the k-step discounted returns
+ def discounted_prediction(self, rewards, done):
+ discounted_prediction = np.zeros_like(rewards)
+ running_add = 0
+
+ if not done:
+ # value function
+ last_state = np.float32(self.states[-1] / 255.)
+ running_add = self.local_model(last_state)[-1][0].numpy()
+
+ for t in reversed(range(0, len(rewards))):
+ running_add = running_add * self.discount_factor + rewards[t]
+ discounted_prediction[t] = running_add
+ return discounted_prediction
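+        # For example, with rewards [r0, r1, r2], discount_factor 0.99 and a
+        # bootstrap value V(s3) from the last state of an unfinished rollout:
+        #   R2 = r2 + 0.99 * V(s3)
+        #   R1 = r1 + 0.99 * R2
+        #   R0 = r0 + 0.99 * R1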
+
+    # compute the A3C loss from the stored samples
+ def compute_loss(self, done):
+
+ discounted_prediction = self.discounted_prediction(self.rewards, done)
+ discounted_prediction = tf.convert_to_tensor(discounted_prediction[:, None],
+ dtype=tf.float32)
+
+ states = np.zeros((len(self.states), 84, 84, 4))
+
+ for i in range(len(self.states)):
+ states[i] = self.states[i]
+ states = np.float32(states / 255.)
+
+ policy, values = self.local_model(states)
+
+        # value network (critic) update
+ advantages = discounted_prediction - values
+ critic_loss = 0.5 * tf.reduce_sum(tf.square(advantages))
+
+        # policy network (actor) update
+ action = tf.convert_to_tensor(self.actions, dtype=tf.float32)
+ policy_prob = tf.nn.softmax(policy)
+ action_prob = tf.reduce_sum(action * policy_prob, axis=1, keepdims=True)
+ cross_entropy = - tf.math.log(action_prob + 1e-10)
+ actor_loss = tf.reduce_sum(cross_entropy * tf.stop_gradient(advantages))
+
+ entropy = tf.reduce_sum(policy_prob * tf.math.log(policy_prob + 1e-10), axis=1)
+ entropy = tf.reduce_sum(entropy)
+ actor_loss += 0.01 * entropy
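+        # 'entropy' above is sum(p * log p), i.e. the negative entropy, so
+        # adding it with weight 0.01 and then minimizing the total loss pushes
+        # the policy toward higher entropy (more exploration).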
+
+ total_loss = 0.5 * critic_loss + actor_loss
+
+ return total_loss
+
+    # compute gradients with the local network and update the global network with them
+ def train_model(self, done):
+
+ global_params = self.global_model.trainable_variables
+ local_params = self.local_model.trainable_variables
+
+ with tf.GradientTape() as tape:
+ total_loss = self.compute_loss(done)
+
+        # compute the local network's gradients
+ grads = tape.gradient(total_loss, local_params)
+        # gradient clipping for stable training
+ grads, _ = tf.clip_by_global_norm(grads, 40.0)
+        # update the global network in the direction that reduces the local network's loss
+ self.optimizer.apply_gradients(zip(grads, global_params))
+        # update the local network's weights with the global network's weights
+ self.local_model.set_weights(self.global_model.get_weights())
+        # clear the stored samples after the update
+ self.states, self.actions, self.rewards = [], [], []
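+        # This is the asynchronous A3C update: gradients are computed on the
+        # local copy, applied to the shared global model, and the local copy
+        # is then re-synced from the global weights.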
+
+ def run(self):
+        # global variables shared among the actor-runners
+ global episode, score_avg, score_max
+
+ step = 0
+ while episode < num_episode:
+ done = False
+ dead = False
+
+ score, start_life = 0, 5
+ observe = self.env.reset()
+
+            # do not move for a randomly chosen number of initial frames
+ for _ in range(random.randint(1, 30)):
+ observe, _, _, _ = self.env.step(1)
+
+            # preprocess the frame, then stack 4 states to use as the input
+ state = pre_processing(observe)
+ history = np.stack([state, state, state, state], axis=2)
+ history = np.reshape([history], (1, 84, 84, 4))
+
+ while not done:
+ step += 1
+ self.t += 1
+
+                # select an action according to the policy probabilities
+ action, policy = self.get_action(history)
+                # ALE actions: 1 = fire, 2 = right, 3 = left
+ real_action = self.action_dict[action]
+                # fire to restart after losing a life
+ if dead:
+ action, real_action, dead = 0, 1, False
+
+                # advance the environment one timestep with the chosen action
+ observe, reward, done, info = self.env.step(real_action)
+
+                # preprocess the state at every timestep
+ next_state = pre_processing(observe)
+ next_state = np.reshape([next_state], (1, 84, 84, 1))
+ next_history = np.append(next_state, history[:, :, :, :3], axis=3)
+
+                # maximum of the policy probabilities
+ self.avg_p_max += np.amax(policy.numpy())
+
+ if start_life > info['ale.lives']:
+ dead = True
+ start_life = info['ale.lives']
+
+ score += reward
+ reward = np.clip(reward, -1., 1.)
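+                # the clipped reward is only used as the training signal;
+                # 'score' above accumulates the raw game reward for logging.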
+
+                # store the sample
+ self.append_sample(history, action, reward)
+
+ if dead:
+ history = np.stack((next_state, next_state,
+ next_state, next_state), axis=2)
+ history = np.reshape([history], (1, 84, 84, 4))
+ else:
+ history = next_history
+
+                # train when the episode ends or the maximum number of timesteps is reached
+ if self.t >= self.t_max or done:
+ self.train_model(done)
+ self.t = 0
+
+ if done:
+                    # record training info for each episode
+ episode += 1
+ score_max = score if score > score_max else score_max
+ score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
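+                    # score_avg is an exponential moving average of the score,
+                    # seeded with the first episode's score.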
+
+ log = "episode: {:5d} | score : {:4.1f} | ".format(episode, score)
+ log += "score max : {:4.1f} | ".format(score_max)
+ log += "score avg : {:.3f}".format(score_avg)
+ print(log)
+
+ self.draw_tensorboard(score, step, episode)
+
+ self.avg_p_max = 0
+ step = 0
+
+
+# preprocess to a grayscale frame to speed up training
+def pre_processing(observe):
+ processed_observe = np.uint8(
+ resize(rgb2gray(observe), (84, 84), mode='constant') * 255)
+ return processed_observe
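+    # rgb2gray/resize return floats in [0, 1]; scaling by 255 and casting to
+    # uint8 keeps the stored frame histories compact, and states are rescaled
+    # with /255. before being fed to the network.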
+
+
+if __name__ == "__main__":
+ global_agent = A3CAgent(action_size=3, env_name="BreakoutDeterministic-v4")
+ global_agent.train()
diff --git a/3-atari/LICENSE b/3-atari/LICENSE
deleted file mode 100644
index 5c61d8a..0000000
--- a/3-atari/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2017 Keon Kim
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.