diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..80908f4 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.pythonPath": "/anaconda3/envs/rlcode/bin/python" +} \ No newline at end of file diff --git a/1-grid-world/1-policy-iteration/environment.py b/1-grid-world/1-policy-iteration/environment.py index c6f7b6f..45c9962 100644 --- a/1-grid-world/1-policy-iteration/environment.py +++ b/1-grid-world/1-policy-iteration/environment.py @@ -182,7 +182,7 @@ def draw_from_policy(self, policy_table): def print_value_table(self, value_table): for i in range(WIDTH): for j in range(HEIGHT): - self.text_value(i, j, value_table[i][j]) + self.text_value(i, j, round(value_table[i][j], 2)) def render(self): time.sleep(0.1) diff --git a/1-grid-world/1-policy-iteration/policy_iteration.py b/1-grid-world/1-policy-iteration/policy_iteration.py index 617d764..6af6197 100644 --- a/1-grid-world/1-policy-iteration/policy_iteration.py +++ b/1-grid-world/1-policy-iteration/policy_iteration.py @@ -1,5 +1,4 @@ -# -*- coding: utf-8 -*- -import random +import numpy as np from environment import GraphicDisplay, Env @@ -11,17 +10,17 @@ def __init__(self, env): self.value_table = [[0.0] * env.width for _ in range(env.height)] # 상 하 좌 우 동일한 확률로 정책 초기화 self.policy_table = [[[0.25, 0.25, 0.25, 0.25]] * env.width - for _ in range(env.height)] + for _ in range(env.height)] # 마침 상태의 설정 self.policy_table[2][2] = [] - # 감가율 + # 할인율 self.discount_factor = 0.9 + # 벨만 기대 방정식을 통해 다음 가치함수를 계산하는 정책 평가 def policy_evaluation(self): - # 다음 가치함수 초기화 next_value_table = [[0.00] * self.env.width - for _ in range(self.env.height)] + for _ in range(self.env.height)] # 모든 상태에 대해서 벨만 기대방정식을 계산 for state in self.env.get_all_states(): @@ -39,7 +38,7 @@ def policy_evaluation(self): value += (self.get_policy(state)[action] * (reward + self.discount_factor * next_value)) - next_value_table[state[0]][state[1]] = round(value, 2) + next_value_table[state[0]][state[1]] = value self.value_table = next_value_table @@ -49,59 +48,45 @@ def policy_improvement(self): for state in self.env.get_all_states(): if state == [2, 2]: continue - value = -99999 - max_index = [] + + value_list = [] # 반환할 정책 초기화 result = [0.0, 0.0, 0.0, 0.0] - # 모든 행동에 대해서 [보상 + (감가율 * 다음 상태 가치함수)] 계산 + # 모든 행동에 대해서 [보상 + (할인율 * 다음 상태 가치함수)] 계산 for index, action in enumerate(self.env.possible_actions): next_state = self.env.state_after_action(state, action) reward = self.env.get_reward(state, action) next_value = self.get_value(next_state) - temp = reward + self.discount_factor * next_value - - # 받을 보상이 최대인 행동의 index(최대가 복수라면 모두)를 추출 - if temp == value: - max_index.append(index) - elif temp > value: - value = temp - max_index.clear() - max_index.append(index) + value = reward + self.discount_factor * next_value + value_list.append(value) - # 행동의 확률 계산 - prob = 1 / len(max_index) + # 받을 보상이 최대인 행동들에 대해 탐욕 정책 발전 + max_idx_list = np.argwhere(value_list == np.amax(value_list)) + max_idx_list = max_idx_list.flatten().tolist() + prob = 1 / len(max_idx_list) - for index in max_index: - result[index] = prob + for idx in max_idx_list: + result[idx] = prob next_policy[state[0]][state[1]] = result self.policy_table = next_policy - # 특정 상태에서 정책에 따른 행동을 반환 + # 특정 상태에서 정책에 따라 무작위로 행동을 반환 def get_action(self, state): - # 0 ~ 1 사이의 값을 무작위로 추출 - random_pick = random.randrange(100) / 100 - policy = self.get_policy(state) - policy_sum = 0.0 - # 정책에 담긴 행동 중에 무작위로 한 행동을 추출 - for index, value in enumerate(policy): - policy_sum += value - if 
random_pick < policy_sum: - return index + policy = np.array(policy) + return np.random.choice(4, 1, p=policy)[0] # 상태에 따른 정책 반환 def get_policy(self, state): - if state == [2, 2]: - return 0.0 return self.policy_table[state[0]][state[1]] # 가치 함수의 값을 반환 def get_value(self, state): - # 소숫점 둘째 자리까지만 계산 - return round(self.value_table[state[0]][state[1]], 2) + return self.value_table[state[0]][state[1]] + if __name__ == "__main__": env = Env() diff --git a/1-grid-world/2-value-iteration/environment.py b/1-grid-world/2-value-iteration/environment.py index c467a92..76ebf21 100644 --- a/1-grid-world/2-value-iteration/environment.py +++ b/1-grid-world/2-value-iteration/environment.py @@ -197,7 +197,7 @@ def draw_from_values(self, state, action_list): def print_values(self, values): for i in range(WIDTH): for j in range(HEIGHT): - self.text_value(i, j, values[i][j]) + self.text_value(i, j, round(values[i][j], 2)) def render(self): time.sleep(0.1) diff --git a/1-grid-world/2-value-iteration/value_iteration.py b/1-grid-world/2-value-iteration/value_iteration.py index 136fc4a..e388b3f 100644 --- a/1-grid-world/2-value-iteration/value_iteration.py +++ b/1-grid-world/2-value-iteration/value_iteration.py @@ -1,65 +1,64 @@ -# -*- coding: utf-8 -*- +import numpy as np from environment import GraphicDisplay, Env + class ValueIteration: def __init__(self, env): - # 환경 객체 생성 + # 환경에 대한 객체 선언 self.env = env # 가치 함수를 2차원 리스트로 초기화 self.value_table = [[0.0] * env.width for _ in range(env.height)] - # 감가율 + # 할인율 self.discount_factor = 0.9 - # 가치 이터레이션 # 벨만 최적 방정식을 통해 다음 가치 함수 계산 def value_iteration(self): - next_value_table = [[0.0] * self.env.width for _ in - range(self.env.height)] + # 다음 가치함수 초기화 + next_value_table = [[0.0] * self.env.width + for _ in range(self.env.height)] + + # 모든 상태에 대해서 벨만 최적방정식을 계산 for state in self.env.get_all_states(): + # 마침 상태의 가치 함수 = 0 if state == [2, 2]: next_value_table[state[0]][state[1]] = 0.0 continue - # 가치 함수를 위한 빈 리스트 - value_list = [] - # 가능한 모든 행동에 대해 계산 + # 벨만 최적 방정식 + value_list = [] for action in self.env.possible_actions: next_state = self.env.state_after_action(state, action) reward = self.env.get_reward(state, action) next_value = self.get_value(next_state) value_list.append((reward + self.discount_factor * next_value)) + # 최댓값을 다음 가치 함수로 대입 - next_value_table[state[0]][state[1]] = round(max(value_list), 2) + next_value_table[state[0]][state[1]] = max(value_list) + self.value_table = next_value_table # 현재 가치 함수로부터 행동을 반환 def get_action(self, state): - action_list = [] - max_value = -99999 - if state == [2, 2]: return [] # 모든 행동에 대해 큐함수 (보상 + (감가율 * 다음 상태 가치함수))를 계산 - # 최대 큐 함수를 가진 행동(복수일 경우 여러 개)을 반환 + value_list = [] for action in self.env.possible_actions: - next_state = self.env.state_after_action(state, action) reward = self.env.get_reward(state, action) next_value = self.get_value(next_state) value = (reward + self.discount_factor * next_value) + value_list.append(value) - if value > max_value: - action_list.clear() - action_list.append(action) - max_value = value - elif value == max_value: - action_list.append(action) - + # 최대 큐 함수를 가진 행동(복수일 경우 여러 개)을 반환 + max_idx_list = np.argwhere(value_list == np.amax(value_list)) + action_list = max_idx_list.flatten().tolist() return action_list def get_value(self, state): - return round(self.value_table[state[0]][state[1]], 2) + return self.value_table[state[0]][state[1]] + if __name__ == "__main__": env = Env() diff --git a/1-grid-world/3-monte-carlo/environment.py b/1-grid-world/3-monte-carlo/environment.py deleted 
file mode 100644 index f1ce8e6..0000000 --- a/1-grid-world/3-monte-carlo/environment.py +++ /dev/null @@ -1,111 +0,0 @@ -import time -import numpy as np -import tkinter as tk -from PIL import ImageTk, Image - -np.random.seed(1) -PhotoImage = ImageTk.PhotoImage -UNIT = 100 # 픽셀 수 -HEIGHT = 5 # 그리드 월드 세로 -WIDTH = 5 # 그리드 월드 가로 - - -class Env(tk.Tk): - def __init__(self): - super(Env, self).__init__() - self.action_space = ['u', 'd', 'l', 'r'] - self.n_actions = len(self.action_space) - self.title('monte carlo') - self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) - self.shapes = self.load_images() - self.canvas = self._build_canvas() - self.texts = [] - - def _build_canvas(self): - canvas = tk.Canvas(self, bg='white', - height=HEIGHT * UNIT, - width=WIDTH * UNIT) - # 그리드 생성 - for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 - x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT - canvas.create_line(x0, y0, x1, y1) - for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 - x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r - canvas.create_line(x0, y0, x1, y1) - - # 캔버스에 이미지 추가 - self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) - self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1]) - self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1]) - self.circle = canvas.create_image(250, 250, image=self.shapes[2]) - - canvas.pack() - - return canvas - - def load_images(self): - rectangle = PhotoImage( - Image.open("../img/rectangle.png").resize((65, 65))) - triangle = PhotoImage( - Image.open("../img/triangle.png").resize((65, 65))) - circle = PhotoImage( - Image.open("../img/circle.png").resize((65, 65))) - - return rectangle, triangle, circle - - @staticmethod - def coords_to_state(coords): - x = int((coords[0] - 50) / 100) - y = int((coords[1] - 50) / 100) - return [x, y] - - def reset(self): - self.update() - time.sleep(0.5) - x, y = self.canvas.coords(self.rectangle) - self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) - return self.coords_to_state(self.canvas.coords(self.rectangle)) - - def step(self, action): - state = self.canvas.coords(self.rectangle) - base_action = np.array([0, 0]) - self.render() - - if action == 0: # 상 - if state[1] > UNIT: - base_action[1] -= UNIT - elif action == 1: # 하 - if state[1] < (HEIGHT - 1) * UNIT: - base_action[1] += UNIT - elif action == 2: # 좌 - if state[0] > UNIT: - base_action[0] -= UNIT - elif action == 3: # 우 - if state[0] < (WIDTH - 1) * UNIT: - base_action[0] += UNIT - # 에이전트 이동 - self.canvas.move(self.rectangle, base_action[0], base_action[1]) - # 에이전트(빨간 네모)를 가장 상위로 배치 - self.canvas.tag_raise(self.rectangle) - - next_state = self.canvas.coords(self.rectangle) - - # 보상 함수 - if next_state == self.canvas.coords(self.circle): - reward = 100 - done = True - elif next_state in [self.canvas.coords(self.triangle1), - self.canvas.coords(self.triangle2)]: - reward = -100 - done = True - else: - reward = 0 - done = False - - next_state = self.coords_to_state(next_state) - - return next_state, reward, done - - def render(self): - time.sleep(0.03) - self.update() diff --git a/1-grid-world/3-monte-carlo/mc_agent.py b/1-grid-world/3-monte-carlo/mc_agent.py deleted file mode 100644 index 83dfa51..0000000 --- a/1-grid-world/3-monte-carlo/mc_agent.py +++ /dev/null @@ -1,111 +0,0 @@ -import numpy as np -import random -from collections import defaultdict -from environment import Env - - -# 몬테카를로 에이전트 (모든 에피소드 각각의 샘플로 부터 학습) -class MCAgent: - def __init__(self, actions): - self.width = 5 - self.height = 5 - self.actions = 
actions - self.learning_rate = 0.01 - self.discount_factor = 0.9 - self.epsilon = 0.1 - self.samples = [] - self.value_table = defaultdict(float) - - # 메모리에 샘플을 추가 - def save_sample(self, state, reward, done): - self.samples.append([state, reward, done]) - - # 모든 에피소드에서 에이전트가 방문한 상태의 큐 함수를 업데이트 - def update(self): - G_t = 0 - visit_state = [] - for reward in reversed(self.samples): - state = str(reward[0]) - if state not in visit_state: - visit_state.append(state) - G_t = reward[1] + self.discount_factor * G_t - value = self.value_table[state] - self.value_table[state] = (value + - self.learning_rate * (G_t - value)) - - # 큐 함수에 따라서 행동을 반환 - # 입실론 탐욕 정책에 따라서 행동을 반환 - def get_action(self, state): - if np.random.rand() < self.epsilon: - # 랜덤 행동 - action = np.random.choice(self.actions) - else: - # 큐 함수에 따른 행동 - next_state = self.possible_next_state(state) - action = self.arg_max(next_state) - return int(action) - - # 후보가 여럿이면 arg_max를 계산하고 무작위로 하나를 반환 - @staticmethod - def arg_max(next_state): - max_index_list = [] - max_value = next_state[0] - for index, value in enumerate(next_state): - if value > max_value: - max_index_list.clear() - max_value = value - max_index_list.append(index) - elif value == max_value: - max_index_list.append(index) - return random.choice(max_index_list) - - # 가능한 다음 모든 상태들을 반환 - def possible_next_state(self, state): - col, row = state - next_state = [0.0] * 4 - - if row != 0: - next_state[0] = self.value_table[str([col, row - 1])] - else: - next_state[0] = self.value_table[str(state)] - if row != self.height - 1: - next_state[1] = self.value_table[str([col, row + 1])] - else: - next_state[1] = self.value_table[str(state)] - if col != 0: - next_state[2] = self.value_table[str([col - 1, row])] - else: - next_state[2] = self.value_table[str(state)] - if col != self.width - 1: - next_state[3] = self.value_table[str([col + 1, row])] - else: - next_state[3] = self.value_table[str(state)] - - return next_state - - -# 메인 함수 -if __name__ == "__main__": - env = Env() - agent = MCAgent(actions=list(range(env.n_actions))) - - for episode in range(1000): - state = env.reset() - action = agent.get_action(state) - - while True: - env.render() - - # 다음 상태로 이동 - # 보상은 숫자이고, 완료 여부는 boolean - next_state, reward, done = env.step(action) - agent.save_sample(next_state, reward, done) - - # 다음 행동 받아옴 - action = agent.get_action(next_state) - - # 에피소드가 완료됐을 때, 큐 함수 업데이트 - if done: - agent.update() - agent.samples.clear() - break diff --git a/1-grid-world/4-sarsa/sarsa_agent.py b/1-grid-world/3-sarsa/agent.py similarity index 64% rename from 1-grid-world/4-sarsa/sarsa_agent.py rename to 1-grid-world/3-sarsa/agent.py index 1668471..014d032 100644 --- a/1-grid-world/4-sarsa/sarsa_agent.py +++ b/1-grid-world/3-sarsa/agent.py @@ -7,17 +7,19 @@ class SARSAgent: def __init__(self, actions): self.actions = actions - self.learning_rate = 0.01 + self.step_size = 0.01 self.discount_factor = 0.9 self.epsilon = 0.1 + # 0을 초기값으로 가지는 큐함수 테이블 생성 self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0]) # 의 샘플로부터 큐함수를 업데이트 def learn(self, state, action, reward, next_state, next_action): + state, next_state = str(state), str(next_state) current_q = self.q_table[state][action] next_state_q = self.q_table[next_state][next_action] - new_q = (current_q + self.learning_rate * - (reward + self.discount_factor * next_state_q - current_q)) + td = reward + self.discount_factor * next_state_q - current_q + new_q = current_q + self.step_size * td self.q_table[state][action] = new_q # 입실론 탐욕 정책에 따라서 행동을 반환 @@ -27,22 
+29,18 @@ def get_action(self, state): action = np.random.choice(self.actions) else: # 큐함수에 따른 행동 반환 - state_action = self.q_table[state] - action = self.arg_max(state_action) + state = str(state) + q_list = self.q_table[state] + action = arg_max(q_list) return action - @staticmethod - def arg_max(state_action): - max_index_list = [] - max_value = state_action[0] - for index, value in enumerate(state_action): - if value > max_value: - max_index_list.clear() - max_value = value - max_index_list.append(index) - elif value == max_value: - max_index_list.append(index) - return random.choice(max_index_list) + +# 큐함수의 값에 따라 최적의 행동을 반환 +def arg_max(q_list): + max_idx_list = np.argwhere(q_list == np.amax(q_list)) + max_idx_list = max_idx_list.flatten().tolist() + return random.choice(max_idx_list) + if __name__ == "__main__": env = Env() @@ -52,7 +50,7 @@ def arg_max(state_action): # 게임 환경과 상태를 초기화 state = env.reset() # 현재 상태에 대한 행동을 선택 - action = agent.get_action(str(state)) + action = agent.get_action(state) while True: env.render() @@ -60,10 +58,9 @@ def arg_max(state_action): # 행동을 위한 후 다음상태 보상 에피소드의 종료 여부를 받아옴 next_state, reward, done = env.step(action) # 다음 상태에서의 다음 행동 선택 - next_action = agent.get_action(str(next_state)) - + next_action = agent.get_action(next_state) # 로 큐함수를 업데이트 - agent.learn(str(state), action, reward, str(next_state), next_action) + agent.learn(state, action, reward, next_state, next_action) state = next_state action = next_action @@ -72,5 +69,4 @@ def arg_max(state_action): env.print_value_all(agent.q_table) if done: - break - + break \ No newline at end of file diff --git a/1-grid-world/4-sarsa/environment.py b/1-grid-world/3-sarsa/environment.py similarity index 98% rename from 1-grid-world/4-sarsa/environment.py rename to 1-grid-world/3-sarsa/environment.py index d8fe3eb..de34ed8 100644 --- a/1-grid-world/4-sarsa/environment.py +++ b/1-grid-world/3-sarsa/environment.py @@ -80,7 +80,7 @@ def print_value_all(self, q_table): state = [x, y] if str(state) in q_table.keys(): temp = q_table[str(state)][action] - self.text_value(y, x, round(temp, 2), action) + self.text_value(y, x, round(temp, 3), action) def coords_to_state(self, coords): x = int((coords[0] - 50) / 100) @@ -132,11 +132,8 @@ def step(self, action): done = False next_state = self.coords_to_state(next_state) - - - return next_state, reward, done def render(self): time.sleep(0.03) - self.update() + self.update() \ No newline at end of file diff --git a/1-grid-world/5-q-learning/q_learning_agent.py b/1-grid-world/4-q-learning/agent.py similarity index 65% rename from 1-grid-world/5-q-learning/q_learning_agent.py rename to 1-grid-world/4-q-learning/agent.py index 496aeaf..811b00b 100644 --- a/1-grid-world/5-q-learning/q_learning_agent.py +++ b/1-grid-world/4-q-learning/agent.py @@ -3,21 +3,22 @@ from environment import Env from collections import defaultdict + class QLearningAgent: def __init__(self, actions): - # 행동 = [0, 1, 2, 3] 순서대로 상, 하, 좌, 우 self.actions = actions - self.learning_rate = 0.01 + self.step_size = 0.01 self.discount_factor = 0.9 self.epsilon = 0.9 self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0]) # 샘플로부터 큐함수 업데이트 def learn(self, state, action, reward, next_state): + state, next_state = str(state), str(next_state) q_1 = self.q_table[state][action] # 벨만 최적 방정식을 사용한 큐함수의 업데이트 q_2 = reward + self.discount_factor * max(self.q_table[next_state]) - self.q_table[state][action] += self.learning_rate * (q_2 - q_1) + self.q_table[state][action] += self.step_size * (q_2 - q_1) # 큐함수에 의거하여 입실론 탐욕 정책에 
따라서 행동을 반환 def get_action(self, state): @@ -26,22 +27,18 @@ def get_action(self, state): action = np.random.choice(self.actions) else: # 큐함수에 따른 행동 반환 - state_action = self.q_table[state] - action = self.arg_max(state_action) + state = str(state) + q_list = self.q_table[state] + action = arg_max(q_list) return action - @staticmethod - def arg_max(state_action): - max_index_list = [] - max_value = state_action[0] - for index, value in enumerate(state_action): - if value > max_value: - max_index_list.clear() - max_value = value - max_index_list.append(index) - elif value == max_value: - max_index_list.append(index) - return random.choice(max_index_list) + +# 큐함수의 값에 따라 최적의 행동을 반환 +def arg_max(q_list): + max_idx_list = np.argwhere(q_list == np.amax(q_list)) + max_idx_list = max_idx_list.flatten().tolist() + return random.choice(max_idx_list) + if __name__ == "__main__": env = Env() @@ -51,16 +48,17 @@ def arg_max(state_action): state = env.reset() while True: + # 게임 환경과 상태를 초기화 env.render() - # 현재 상태에 대한 행동 선택 - action = agent.get_action(str(state)) + action = agent.get_action(state) # 행동을 취한 후 다음 상태, 보상 에피소드의 종료여부를 받아옴 next_state, reward, done = env.step(action) - # 로 큐함수를 업데이트 - agent.learn(str(state), action, reward, str(next_state)) + agent.learn(state, action, reward, next_state) + state = next_state + # 모든 큐함수를 화면에 표시 env.print_value_all(agent.q_table) diff --git a/1-grid-world/5-q-learning/environment.py b/1-grid-world/4-q-learning/environment.py similarity index 98% rename from 1-grid-world/5-q-learning/environment.py rename to 1-grid-world/4-q-learning/environment.py index 1accc84..f4e0793 100644 --- a/1-grid-world/5-q-learning/environment.py +++ b/1-grid-world/4-q-learning/environment.py @@ -81,7 +81,7 @@ def print_value_all(self, q_table): state = [i, j] if str(state) in q_table.keys(): temp = q_table[str(state)][action] - self.text_value(j, i, round(temp, 2), action) + self.text_value(j, i, round(temp, 3), action) def coords_to_state(self, coords): x = int((coords[0] - 50) / 100) diff --git a/1-grid-world/4-sarsa/.python-version b/1-grid-world/4-sarsa/.python-version deleted file mode 100644 index 1545d96..0000000 --- a/1-grid-world/4-sarsa/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.5.0 diff --git a/1-grid-world/6-deep-sarsa/environment.py b/1-grid-world/5-deep-sarsa/environment.py similarity index 98% rename from 1-grid-world/6-deep-sarsa/environment.py rename to 1-grid-world/5-deep-sarsa/environment.py index 2e47dd0..8239b5c 100755 --- a/1-grid-world/6-deep-sarsa/environment.py +++ b/1-grid-world/5-deep-sarsa/environment.py @@ -12,8 +12,9 @@ class Env(tk.Tk): - def __init__(self): + def __init__(self, render_speed=0.01): super(Env, self).__init__() + self.render_speed=render_speed self.action_space = ['u', 'd', 'l', 'r'] self.action_size = len(self.action_space) self.title('DeepSARSA') @@ -102,7 +103,6 @@ def set_reward(self, state, reward): self.rewards.append(temp) # new methods - def check_if_reward(self, state): check_list = dict() check_list['if_goal'] = False @@ -232,5 +232,5 @@ def move(self, target, action): def render(self): # 게임 속도 조정 - time.sleep(0.05) + time.sleep(self.render_speed) self.update() diff --git a/1-grid-world/5-deep-sarsa/save_graph/graph_trained.png b/1-grid-world/5-deep-sarsa/save_graph/graph_trained.png new file mode 100644 index 0000000..b002992 Binary files /dev/null and b/1-grid-world/5-deep-sarsa/save_graph/graph_trained.png differ diff --git a/1-grid-world/5-deep-sarsa/save_model/checkpoint b/1-grid-world/5-deep-sarsa/save_model/checkpoint 
new file mode 100644 index 0000000..a6e034f --- /dev/null +++ b/1-grid-world/5-deep-sarsa/save_model/checkpoint @@ -0,0 +1,2 @@ +model_checkpoint_path: "model" +all_model_checkpoint_paths: "model" diff --git a/1-grid-world/5-deep-sarsa/save_model/model.data-00000-of-00001 b/1-grid-world/5-deep-sarsa/save_model/model.data-00000-of-00001 new file mode 100644 index 0000000..5df8913 Binary files /dev/null and b/1-grid-world/5-deep-sarsa/save_model/model.data-00000-of-00001 differ diff --git a/1-grid-world/5-deep-sarsa/save_model/model.index b/1-grid-world/5-deep-sarsa/save_model/model.index new file mode 100644 index 0000000..cec03a1 Binary files /dev/null and b/1-grid-world/5-deep-sarsa/save_model/model.index differ diff --git a/1-grid-world/5-deep-sarsa/save_model/trained/checkpoint b/1-grid-world/5-deep-sarsa/save_model/trained/checkpoint new file mode 100644 index 0000000..a6e034f --- /dev/null +++ b/1-grid-world/5-deep-sarsa/save_model/trained/checkpoint @@ -0,0 +1,2 @@ +model_checkpoint_path: "model" +all_model_checkpoint_paths: "model" diff --git a/1-grid-world/5-deep-sarsa/save_model/trained/model.data-00000-of-00001 b/1-grid-world/5-deep-sarsa/save_model/trained/model.data-00000-of-00001 new file mode 100644 index 0000000..64ef600 Binary files /dev/null and b/1-grid-world/5-deep-sarsa/save_model/trained/model.data-00000-of-00001 differ diff --git a/1-grid-world/5-deep-sarsa/save_model/trained/model.index b/1-grid-world/5-deep-sarsa/save_model/trained/model.index new file mode 100644 index 0000000..42d21e2 Binary files /dev/null and b/1-grid-world/5-deep-sarsa/save_model/trained/model.index differ diff --git a/1-grid-world/5-deep-sarsa/test.py b/1-grid-world/5-deep-sarsa/test.py new file mode 100644 index 0000000..1e193ae --- /dev/null +++ b/1-grid-world/5-deep-sarsa/test.py @@ -0,0 +1,74 @@ +import random +import numpy as np +from environment import Env +import tensorflow as tf +from tensorflow.keras.layers import Dense + + +# 딥살사 인공신경망 +class DeepSARSA(tf.keras.Model): + def __init__(self, action_size): + super(DeepSARSA, self).__init__() + self.fc1 = Dense(30, activation='relu') + self.fc2 = Dense(30, activation='relu') + self.fc_out = Dense(action_size) + + def call(self, x): + x = self.fc1(x) + x = self.fc2(x) + q = self.fc_out(x) + return q + + +# 그리드월드 예제에서의 딥살사 에이전트 +class DeepSARSAgent: + def __init__(self, state_size, action_size): + # 상태의 크기와 행동의 크기 정의 + self.state_size = state_size + self.action_size = action_size + + self.epsilon = 0.01 + self.model = DeepSARSA(self.action_size) + self.model.load_weights('save_model/trained/model') + + # 입실론 탐욕 정책으로 행동 선택 + def get_action(self, state): + if np.random.rand() <= self.epsilon: + return random.randrange(self.action_size) + else: + q_values = self.model(state) + return np.argmax(q_values[0]) + + +if __name__ == "__main__": + # 환경과 에이전트 생성 + env = Env(render_speed=0.05) + state_size = 15 + action_space = [0, 1, 2, 3, 4] + action_size = len(action_space) + agent = DeepSARSAgent(state_size, action_size) + + scores, episodes = [], [] + + EPISODES = 10 + for e in range(EPISODES): + score = 0 + done = False + # env 초기화 + state = env.reset() + state = np.reshape(state, [1, state_size]) + + while not done: + # 현재 상태에 대한 행동 선택 + action = agent.get_action(state) + + # 선택한 행동으로 환경에서 한 타임스텝 진행 후 샘플 수집 + next_state, reward, done = env.step(action) + next_state = np.reshape(next_state, [1, state_size]) + + state = next_state + score += reward + + if done: + # 에피소드마다 학습 결과 출력 + print("episode: {:3d} | score: {:3d}".format(e, score)) \ No 
newline at end of file diff --git a/1-grid-world/5-deep-sarsa/train.py b/1-grid-world/5-deep-sarsa/train.py new file mode 100644 index 0000000..99f530d --- /dev/null +++ b/1-grid-world/5-deep-sarsa/train.py @@ -0,0 +1,123 @@ +import copy +import pylab +import random +import numpy as np +from environment import Env +import tensorflow as tf +from tensorflow.keras.layers import Dense +from tensorflow.keras.optimizers import Adam + + +# 딥살사 인공신경망 +class DeepSARSA(tf.keras.Model): + def __init__(self, action_size): + super(DeepSARSA, self).__init__() + self.fc1 = Dense(30, activation='relu') + self.fc2 = Dense(30, activation='relu') + self.fc_out = Dense(action_size) + + def call(self, x): + x = self.fc1(x) + x = self.fc2(x) + q = self.fc_out(x) + return q + + +# 그리드월드 예제에서의 딥살사 에이전트 +class DeepSARSAgent: + def __init__(self, state_size, action_size): + # 상태의 크기와 행동의 크기 정의 + self.state_size = state_size + self.action_size = action_size + + # 딥살사 하이퍼 파라메터 + self.discount_factor = 0.99 + self.learning_rate = 0.001 + self.epsilon = 1. + self.epsilon_decay = .9999 + self.epsilon_min = 0.01 + self.model = DeepSARSA(self.action_size) + self.optimizer = Adam(lr=self.learning_rate) + + # 입실론 탐욕 정책으로 행동 선택 + def get_action(self, state): + if np.random.rand() <= self.epsilon: + return random.randrange(self.action_size) + else: + q_values = self.model(state) + return np.argmax(q_values[0]) + + # 의 샘플로부터 모델 업데이트 + def train_model(self, state, action, reward, next_state, next_action, done): + if self.epsilon > self.epsilon_min: + self.epsilon *= self.epsilon_decay + + # 학습 파라메터 + model_params = self.model.trainable_variables + with tf.GradientTape() as tape: + tape.watch(model_params) + predict = self.model(state)[0] + one_hot_action = tf.one_hot([action], self.action_size) + predict = tf.reduce_sum(one_hot_action * predict, axis=1) + + # done = True 일 경우 에피소드가 끝나서 다음 상태가 없음 + next_q = self.model(next_state)[0][next_action] + target = reward + (1 - done) * self.discount_factor * next_q + + # MSE 오류 함수 계산 + loss = tf.reduce_mean(tf.square(target - predict)) + + # 오류함수를 줄이는 방향으로 모델 업데이트 + grads = tape.gradient(loss, model_params) + self.optimizer.apply_gradients(zip(grads, model_params)) + + +if __name__ == "__main__": + # 환경과 에이전트 생성 + env = Env(render_speed=0.01) + state_size = 15 + action_space = [0, 1, 2, 3, 4] + action_size = len(action_space) + agent = DeepSARSAgent(state_size, action_size) + + scores, episodes = [], [] + + EPISODES = 1000 + for e in range(EPISODES): + done = False + score = 0 + # env 초기화 + state = env.reset() + state = np.reshape(state, [1, state_size]) + + while not done: + # 현재 상태에 대한 행동 선택 + action = agent.get_action(state) + + # 선택한 행동으로 환경에서 한 타임스텝 진행 후 샘플 수집 + next_state, reward, done = env.step(action) + next_state = np.reshape(next_state, [1, state_size]) + next_action = agent.get_action(next_state) + + # 샘플로 모델 학습 + agent.train_model(state, action, reward, next_state, + next_action, done) + score += reward + state = next_state + + if done: + # 에피소드마다 학습 결과 출력 + print("episode: {:3d} | score: {:3d} | epsilon: {:.3f}".format( + e, score, agent.epsilon)) + + scores.append(score) + episodes.append(e) + pylab.plot(episodes, scores, 'b') + pylab.xlabel("episode") + pylab.ylabel("score") + pylab.savefig("./save_graph/graph.png") + + + # 100 에피소드마다 모델 저장 + if e % 100 == 0: + agent.model.save_weights('save_model/model', save_format='tf') \ No newline at end of file diff --git a/1-grid-world/5-q-learning/.python-version b/1-grid-world/5-q-learning/.python-version deleted file mode 
100644 index 1545d96..0000000 --- a/1-grid-world/5-q-learning/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.5.0 diff --git a/1-grid-world/6-deep-sarsa/deep_sarsa_agent.py b/1-grid-world/6-deep-sarsa/deep_sarsa_agent.py deleted file mode 100755 index 1af7fda..0000000 --- a/1-grid-world/6-deep-sarsa/deep_sarsa_agent.py +++ /dev/null @@ -1,118 +0,0 @@ -import copy -import pylab -import random -import numpy as np -from environment import Env -from keras.layers import Dense -from keras.optimizers import Adam -from keras.models import Sequential - -EPISODES = 1000 - - -# 그리드월드 예제에서의 딥살사 에이전트 -class DeepSARSAgent: - def __init__(self): - self.load_model = False - # 에이전트가 가능한 모든 행동 정의 - self.action_space = [0, 1, 2, 3, 4] - # 상태의 크기와 행동의 크기 정의 - self.action_size = len(self.action_space) - self.state_size = 15 - self.discount_factor = 0.99 - self.learning_rate = 0.001 - - self.epsilon = 1. # exploration - self.epsilon_decay = .9999 - self.epsilon_min = 0.01 - self.model = self.build_model() - - if self.load_model: - self.epsilon = 0.05 - self.model.load_weights('./save_model/deep_sarsa_trained.h5') - - # 상태가 입력 큐함수가 출력인 인공신경망 생성 - def build_model(self): - model = Sequential() - model.add(Dense(30, input_dim=self.state_size, activation='relu')) - model.add(Dense(30, activation='relu')) - model.add(Dense(self.action_size, activation='linear')) - model.summary() - model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) - return model - - # 입실론 탐욕 방법으로 행동 선택 - def get_action(self, state): - if np.random.rand() <= self.epsilon: - # 무작위 행동 반환 - return random.randrange(self.action_size) - else: - # 모델로부터 행동 산출 - state = np.float32(state) - q_values = self.model.predict(state) - return np.argmax(q_values[0]) - - def train_model(self, state, action, reward, next_state, next_action, done): - if self.epsilon > self.epsilon_min: - self.epsilon *= self.epsilon_decay - - state = np.float32(state) - next_state = np.float32(next_state) - target = self.model.predict(state)[0] - # 살사의 큐함수 업데이트 식 - if done: - target[action] = reward - else: - target[action] = (reward + self.discount_factor * - self.model.predict(next_state)[0][next_action]) - - # 출력 값 reshape - target = np.reshape(target, [1, 5]) - # 인공신경망 업데이트 - self.model.fit(state, target, epochs=1, verbose=0) - - -if __name__ == "__main__": - # 환경과 에이전트 생성 - env = Env() - agent = DeepSARSAgent() - - global_step = 0 - scores, episodes = [], [] - - for e in range(EPISODES): - done = False - score = 0 - state = env.reset() - state = np.reshape(state, [1, 15]) - - while not done: - # env 초기화 - global_step += 1 - - # 현재 상태에 대한 행동 선택 - action = agent.get_action(state) - # 선택한 행동으로 환경에서 한 타임스텝 진행 후 샘플 수집 - next_state, reward, done = env.step(action) - next_state = np.reshape(next_state, [1, 15]) - next_action = agent.get_action(next_state) - # 샘플로 모델 학습 - agent.train_model(state, action, reward, next_state, next_action, - done) - state = next_state - score += reward - - state = copy.deepcopy(next_state) - - if done: - # 에피소드마다 학습 결과 출력 - scores.append(score) - episodes.append(e) - pylab.plot(episodes, scores, 'b') - pylab.savefig("./save_graph/deep_sarsa_.png") - print("episode:", e, " score:", score, "global_step", - global_step, " epsilon:", agent.epsilon) - - # 100 에피소드마다 모델 저장 - if e % 100 == 0: - agent.model.save_weights("./save_model/deep_sarsa.h5") diff --git a/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png b/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png deleted file mode 100644 index 8dec1d0..0000000 Binary files 
a/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png and /dev/null differ diff --git a/1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5 b/1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5 deleted file mode 100644 index 23ba39c..0000000 Binary files a/1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5 and /dev/null differ diff --git a/1-grid-world/7-reinforce/environment.py b/1-grid-world/6-reinforce/environment.py similarity index 94% rename from 1-grid-world/7-reinforce/environment.py rename to 1-grid-world/6-reinforce/environment.py index 816c6f0..613a2b2 100644 --- a/1-grid-world/7-reinforce/environment.py +++ b/1-grid-world/6-reinforce/environment.py @@ -5,18 +5,19 @@ PhotoImage = ImageTk.PhotoImage UNIT = 50 # 픽셀 수 -HEIGHT = 5 # 그리드월드 세로 -WIDTH = 5 # 그리드월드 가로 +HEIGHT = 5 # 그리드 세로 +WIDTH = 5 # 그리드 가로 np.random.seed(1) class Env(tk.Tk): - def __init__(self): + def __init__(self, render_speed=0.01): super(Env, self).__init__() + self.render_speed=render_speed self.action_space = ['u', 'd', 'l', 'r'] self.action_size = len(self.action_space) - self.title('Reinforce') + self.title('REINFORCE') self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) self.shapes = self.load_images() self.canvas = self._build_canvas() @@ -27,7 +28,7 @@ def __init__(self): self.set_reward([0, 1], -1) self.set_reward([1, 2], -1) self.set_reward([2, 3], -1) - # 목표지점 설정 + # 목표 지점 설정 self.set_reward([4, 4], 1) def _build_canvas(self): @@ -73,7 +74,7 @@ def reset_reward(self): self.set_reward([1, 2], -1) self.set_reward([2, 3], -1) - # 목표 지점 + # #goal self.set_reward([4, 4], 1) def set_reward(self, state, reward): @@ -101,6 +102,7 @@ def set_reward(self, state, reward): temp['state'] = state self.rewards.append(temp) + # new methods def check_if_reward(self, state): check_list = dict() check_list['if_goal'] = False @@ -109,7 +111,7 @@ def check_if_reward(self, state): for reward in self.rewards: if reward['state'] == state: rewards += reward['reward'] - if reward['reward'] > 0: + if reward['reward'] == 1: check_list['if_goal'] = True check_list['rewards'] = rewards @@ -123,6 +125,7 @@ def coords_to_state(self, coords): def reset(self): self.update() + time.sleep(0.5) x, y = self.canvas.coords(self.rectangle) self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) self.reset_reward() @@ -139,7 +142,7 @@ def step(self, action): check = self.check_if_reward(self.coords_to_state(next_coords)) done = check['if_goal'] reward = check['rewards'] - reward -= 0.1 + self.canvas.tag_raise(self.rectangle) s_ = self.get_state() @@ -169,7 +172,7 @@ def get_state(self): def move_rewards(self): new_rewards = [] for temp in self.rewards: - if temp['reward'] > 0: + if temp['reward'] == 1: new_rewards.append(temp) continue temp['coords'] = self.move_const(temp) @@ -218,7 +221,7 @@ def move(self, target, action): if s[0] < (WIDTH - 1) * UNIT: base_action[0] += UNIT elif action == 3: # 좌 - if s[0] > UNIT: + if s[0] > UNIT: base_action[0] -= UNIT self.canvas.move(target, base_action[0], base_action[1]) @@ -229,5 +232,5 @@ def move(self, target, action): def render(self): # 게임 속도 조정 - time.sleep(0.07) + time.sleep(self.render_speed) self.update() diff --git a/1-grid-world/6-reinforce/save_graph/graph_trained.png b/1-grid-world/6-reinforce/save_graph/graph_trained.png new file mode 100644 index 0000000..69f0fb2 Binary files /dev/null and b/1-grid-world/6-reinforce/save_graph/graph_trained.png differ diff --git a/1-grid-world/6-reinforce/save_model/trained/checkpoint 
b/1-grid-world/6-reinforce/save_model/trained/checkpoint new file mode 100644 index 0000000..a6e034f --- /dev/null +++ b/1-grid-world/6-reinforce/save_model/trained/checkpoint @@ -0,0 +1,2 @@ +model_checkpoint_path: "model" +all_model_checkpoint_paths: "model" diff --git a/1-grid-world/6-reinforce/save_model/trained/model.data-00000-of-00001 b/1-grid-world/6-reinforce/save_model/trained/model.data-00000-of-00001 new file mode 100644 index 0000000..2ca52a5 Binary files /dev/null and b/1-grid-world/6-reinforce/save_model/trained/model.data-00000-of-00001 differ diff --git a/1-grid-world/6-reinforce/save_model/trained/model.index b/1-grid-world/6-reinforce/save_model/trained/model.index new file mode 100644 index 0000000..2878aef Binary files /dev/null and b/1-grid-world/6-reinforce/save_model/trained/model.index differ diff --git a/1-grid-world/6-reinforce/test.py b/1-grid-world/6-reinforce/test.py new file mode 100644 index 0000000..7c1bacc --- /dev/null +++ b/1-grid-world/6-reinforce/test.py @@ -0,0 +1,72 @@ +import copy +import pylab +import random +import numpy as np +from environment import Env +import tensorflow as tf +from tensorflow.keras.layers import Dense +from tensorflow.keras.optimizers import Adam + + +# 상태가 입력, 각 행동의 확률이 출력인 인공신경망 생성 +class REINFORCE(tf.keras.Model): + def __init__(self, action_size): + super(REINFORCE, self).__init__() + self.fc1 = Dense(24, activation='relu') + self.fc2 = Dense(24, activation='relu') + self.fc_out = Dense(action_size, activation='softmax') + + def call(self, x): + x = self.fc1(x) + x = self.fc2(x) + policy = self.fc_out(x) + return policy + + +# 그리드월드 예제에서의 REINFORCE 에이전트 +class REINFORCEAgent: + def __init__(self, state_size, action_size): + # 상태의 크기와 행동의 크기 정의 + self.state_size = state_size + self.action_size = action_size + + self.model = REINFORCE(self.action_size) + self.model.load_weights('save_model/trained/model') + + # 정책신경망으로 행동 선택 + def get_action(self, state): + policy = self.model(state)[0] + policy = np.array(policy) + return np.random.choice(self.action_size, 1, p=policy)[0] + + +if __name__ == "__main__": + # 환경과 에이전트 생성 + env = Env(render_speed=0.05) + state_size = 15 + action_space = [0, 1, 2, 3, 4] + action_size = len(action_space) + agent = REINFORCEAgent(state_size, action_size) + + EPISODES = 10 + for e in range(EPISODES): + done = False + score = 0 + # env 초기화 + state = env.reset() + state = np.reshape(state, [1, state_size]) + + while not done: + # 현재 상태에 대한 행동 선택 + action = agent.get_action(state) + + # 선택한 행동으로 환경에서 한 타임스텝 진행 후 샘플 수집 + next_state, reward, done = env.step(action) + next_state = np.reshape(next_state, [1, state_size]) + + score += reward + + state = next_state + + if done: + print("episode: {:3d} | score: {:3d}".format(e, score)) \ No newline at end of file diff --git a/1-grid-world/6-reinforce/train.py b/1-grid-world/6-reinforce/train.py new file mode 100644 index 0000000..0eacae2 --- /dev/null +++ b/1-grid-world/6-reinforce/train.py @@ -0,0 +1,136 @@ +import copy +import pylab +import random +import numpy as np +from environment import Env +import tensorflow as tf +from tensorflow.keras.layers import Dense +from tensorflow.keras.optimizers import Adam + + +# 상태가 입력, 각 행동의 확률이 출력인 인공신경망 생성 +class REINFORCE(tf.keras.Model): + def __init__(self, action_size): + super(REINFORCE, self).__init__() + self.fc1 = Dense(24, activation='relu') + self.fc2 = Dense(24, activation='relu') + self.fc_out = Dense(action_size, activation='softmax') + + def call(self, x): + x = self.fc1(x) + x = self.fc2(x) + policy = 
self.fc_out(x) + return policy + + +# 그리드월드 예제에서의 REINFORCE 에이전트 +class REINFORCEAgent: + def __init__(self, state_size, action_size): + # 상태의 크기와 행동의 크기 정의 + self.state_size = state_size + self.action_size = action_size + + # REINFORCE 하이퍼 파라메터 + self.discount_factor = 0.99 + self.learning_rate = 0.001 + + self.model = REINFORCE(self.action_size) + self.optimizer = Adam(lr=self.learning_rate) + self.states, self.actions, self.rewards = [], [], [] + + # 정책신경망으로 행동 선택 + def get_action(self, state): + policy = self.model(state)[0] + policy = np.array(policy) + return np.random.choice(self.action_size, 1, p=policy)[0] + + # 반환값 계산 + def discount_rewards(self, rewards): + discounted_rewards = np.zeros_like(rewards) + running_add = 0 + for t in reversed(range(0, len(rewards))): + running_add = running_add * self.discount_factor + rewards[t] + discounted_rewards[t] = running_add + return discounted_rewards + + # 한 에피소드 동안의 상태, 행동, 보상을 저장 + def append_sample(self, state, action, reward): + self.states.append(state[0]) + self.rewards.append(reward) + act = np.zeros(self.action_size) + act[action] = 1 + self.actions.append(act) + + # 정책신경망 업데이트 + def train_model(self): + discounted_rewards = np.float32(self.discount_rewards(self.rewards)) + discounted_rewards -= np.mean(discounted_rewards) + discounted_rewards /= np.std(discounted_rewards) + + # 크로스 엔트로피 오류함수 계산 + model_params = self.model.trainable_variables + with tf.GradientTape() as tape: + tape.watch(model_params) + policies = self.model(np.array(self.states)) + actions = np.array(self.actions) + action_prob = tf.reduce_sum(actions * policies, axis=1) + cross_entropy = - tf.math.log(action_prob + 1e-5) + loss = tf.reduce_sum(cross_entropy * discounted_rewards) + entropy = - policies * tf.math.log(policies) + + # 오류함수를 줄이는 방향으로 모델 업데이트 + grads = tape.gradient(loss, model_params) + self.optimizer.apply_gradients(zip(grads, model_params)) + self.states, self.actions, self.rewards = [], [], [] + return np.mean(entropy) + + +if __name__ == "__main__": + # 환경과 에이전트 생성 + env = Env(render_speed=0.01) + state_size = 15 + action_space = [0, 1, 2, 3, 4] + action_size = len(action_space) + agent = REINFORCEAgent(state_size, action_size) + + scores, episodes = [], [] + + EPISODES = 200 + for e in range(EPISODES): + done = False + score = 0 + # env 초기화 + state = env.reset() + state = np.reshape(state, [1, state_size]) + + while not done: + # 현재 상태에 대한 행동 선택 + action = agent.get_action(state) + + # 선택한 행동으로 환경에서 한 타임스텝 진행 후 샘플 수집 + next_state, reward, done = env.step(action) + next_state = np.reshape(next_state, [1, state_size]) + + agent.append_sample(state, action, reward) + score += reward + + state = next_state + + if done: + # 에피소드마다 정책신경망 업데이트 + entropy = agent.train_model() + # 에피소드마다 학습 결과 출력 + print("episode: {:3d} | score: {:3d} | entropy: {:.3f}".format( + e, score, entropy)) + + scores.append(score) + episodes.append(e) + pylab.plot(episodes, scores, 'b') + pylab.xlabel("episode") + pylab.ylabel("score") + pylab.savefig("./save_graph/graph.png") + + + # 100 에피소드마다 모델 저장 + if e % 100 == 0: + agent.model.save_weights('save_model/model', save_format='tf') \ No newline at end of file diff --git a/1-grid-world/7-reinforce/reinforce_agent.py b/1-grid-world/7-reinforce/reinforce_agent.py deleted file mode 100644 index 6c2aa4e..0000000 --- a/1-grid-world/7-reinforce/reinforce_agent.py +++ /dev/null @@ -1,131 +0,0 @@ -import copy -import pylab -import numpy as np -from environment import Env -from keras.layers import Dense -from keras.optimizers import 
Adam -from keras.models import Sequential -from keras import backend as K - -EPISODES = 2500 - -# 그리드월드 예제에서의 REINFORCE 에이전트 -class ReinforceAgent: - def __init__(self): - self.load_model = False - # 가능한 모든 행동 정의 - self.action_space = [0, 1, 2, 3, 4] - # 상태와 행동의 크기 정의 - self.action_size = len(self.action_space) - self.state_size = 15 - self.discount_factor = 0.99 - self.learning_rate = 0.001 - - self.model = self.build_model() - self.optimizer = self.optimizer() - self.states, self.actions, self.rewards = [], [], [] - - if self.load_model: - self.model.load_weights('./save_model/reinforce_trained.h5') - - # 상태가 입력, 각 행동의 확률이 출력인 인공신경망 생성 - def build_model(self): - model = Sequential() - model.add(Dense(24, input_dim=self.state_size, activation='relu')) - model.add(Dense(24, activation='relu')) - model.add(Dense(self.action_size, activation='softmax')) - model.summary() - return model - - # 정책신경망을 업데이트 하기 위한 오류함수와 훈련함수의 생성 - def optimizer(self): - action = K.placeholder(shape=[None, 5]) - discounted_rewards = K.placeholder(shape=[None, ]) - - # 크로스 엔트로피 오류함수 계산 - action_prob = K.sum(action * self.model.output, axis=1) - cross_entropy = K.log(action_prob) * discounted_rewards - loss = -K.sum(cross_entropy) - - # 정책신경망을 업데이트하는 훈련함수 생성 - optimizer = Adam(lr=self.learning_rate) - updates = optimizer.get_updates(self.model.trainable_weights,[], - loss) - train = K.function([self.model.input, action, discounted_rewards], [], - updates=updates) - - return train - - # 정책신경망으로 행동 선택 - def get_action(self, state): - policy = self.model.predict(state)[0] - return np.random.choice(self.action_size, 1, p=policy)[0] - - # 반환값 계산 - def discount_rewards(self, rewards): - discounted_rewards = np.zeros_like(rewards) - running_add = 0 - for t in reversed(range(0, len(rewards))): - running_add = running_add * self.discount_factor + rewards[t] - discounted_rewards[t] = running_add - return discounted_rewards - - # 한 에피소드 동안의 상태, 행동, 보상을 저장 - def append_sample(self, state, action, reward): - self.states.append(state[0]) - self.rewards.append(reward) - act = np.zeros(self.action_size) - act[action] = 1 - self.actions.append(act) - - # 정책신경망 업데이트 - def train_model(self): - discounted_rewards = np.float32(self.discount_rewards(self.rewards)) - discounted_rewards -= np.mean(discounted_rewards) - discounted_rewards /= np.std(discounted_rewards) - - self.optimizer([self.states, self.actions, discounted_rewards]) - self.states, self.actions, self.rewards = [], [], [] - - -if __name__ == "__main__": - # 환경과 에이전트의 생성 - env = Env() - agent = ReinforceAgent() - - global_step = 0 - scores, episodes = [], [] - - for e in range(EPISODES): - done = False - score = 0 - # env 초기화 - state = env.reset() - state = np.reshape(state, [1, 15]) - - while not done: - global_step += 1 - # 현재 상태에 대한 행동 선택 - action = agent.get_action(state) - # 선택한 행동으로 환경에서 한 타임스탭 진행 후 샘플 수집 - next_state, reward, done = env.step(action) - next_state = np.reshape(next_state, [1, 15]) - - agent.append_sample(state, action, reward) - score += reward - state = copy.deepcopy(next_state) - - if done: - # 에피소드마다 정책신경망 업데이트 - agent.train_model() - scores.append(score) - episodes.append(e) - score = round(score,2) - print("episode:", e, " score:", score, " time_step:", - global_step) - - # 100 에피소드마다 학습 결과 출력 및 모델 저장 - if e % 100 == 0: - pylab.plot(episodes, scores, 'b') - pylab.savefig("./save_graph/reinforce.png") - agent.model.save_weights("./save_model/reinforce.h5") diff --git a/1-grid-world/7-reinforce/save_graph/reinforce_trained.png 
b/1-grid-world/7-reinforce/save_graph/reinforce_trained.png deleted file mode 100644 index 3be9edb..0000000 Binary files a/1-grid-world/7-reinforce/save_graph/reinforce_trained.png and /dev/null differ diff --git a/1-grid-world/7-reinforce/save_model/reinforce_trained.h5 b/1-grid-world/7-reinforce/save_model/reinforce_trained.h5 deleted file mode 100644 index cb206f5..0000000 Binary files a/1-grid-world/7-reinforce/save_model/reinforce_trained.h5 and /dev/null differ diff --git a/1-grid-world/README.md b/1-grid-world/README.md deleted file mode 100644 index a955308..0000000 --- a/1-grid-world/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# Grid World with Reinforcement Learning -This is Grid World example that we made for the simple algorithm test -The game is simple. The red rectangle must arrive in the circle, avoiding triangle. - -
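Every tabular example in this patch drives the same Tkinter grid world through `reset()`, `step()` and `render()`, with `step()` returning `(next_state, reward, done)` exactly as in the `environment.py` hunks above. A minimal sketch of that loop, assuming the `n_actions` attribute of the tabular environments and using a random stand-in policy:

```python
import random
from environment import Env  # any tabular grid-world environment.py in this repo

env = Env()
state = env.reset()                            # [x, y] grid coordinates
done = False
while not done:
    action = random.randrange(env.n_actions)   # stand-in for an agent's policy
    state, reward, done = env.step(action)     # +100 at the circle, -100 at a triangle, else 0
    env.render()
```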
- - - -## Dynamic Programming -**1. Policy Iteration** - -**2. Value Iteration** - -
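Both policy iteration and value iteration in this patch replace the hand-rolled max-tracking loop with a numpy argmax over tied maxima. The recurring pattern, pulled out as a standalone sketch (the helper name `greedy_indices` is illustrative, not part of the repo):

```python
import numpy as np

def greedy_indices(value_list):
    # indices of every action whose value ties for the maximum,
    # mirroring the np.argwhere / np.amax pattern used in the diff
    value_list = np.array(value_list)
    max_idx_list = np.argwhere(value_list == np.amax(value_list))
    return max_idx_list.flatten().tolist()

# e.g. greedy_indices([0.0, 0.9, 0.9, 0.5]) -> [1, 2]; policy improvement then
# assigns each tied action probability 1 / len(max_idx_list)
```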
- -## Reinforcement Learning Fundamental Algorithms -**3. Monte-Carlo** - -**4. SARSA** - -**5. Q-Learning** - -
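The renamed `agent.py` files for SARSA and Q-learning keep the same tabular TD updates, now written through an explicit `step_size`. Condensed sketches of the two `learn()` methods (the function names are illustrative; `q_table` is the defaultdict keyed by `str(state)` as in the diff):

```python
def sarsa_learn(q_table, state, action, reward, next_state, next_action,
                step_size=0.01, discount_factor=0.9):
    # on-policy TD(0): bootstrap from the action actually taken next
    td = reward + discount_factor * q_table[next_state][next_action] - q_table[state][action]
    q_table[state][action] += step_size * td

def q_learning_learn(q_table, state, action, reward, next_state,
                     step_size=0.01, discount_factor=0.9):
    # off-policy: bootstrap from the greedy value of the next state
    td = reward + discount_factor * max(q_table[next_state]) - q_table[state][action]
    q_table[state][action] += step_size * td
```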
- -## Futher Reinforcement Learning Algorithms ->we have changed Grid World so the obstacles are moving. To solve this problem, we have to use function approximator. -We used Neural Network as function approximator - -
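In this rewrite the function approximator is a small `tf.keras.Model` subclass trained with `GradientTape` (see `5-deep-sarsa/train.py` and `6-reinforce/train.py`), replacing the old `Sequential` plus `K.function` setup. The shared network layout, sketched under an illustrative name (`QNetwork` is not in the repo; REINFORCE swaps the output layer for a softmax over actions):

```python
import tensorflow as tf
from tensorflow.keras.layers import Dense

class QNetwork(tf.keras.Model):
    # two hidden ReLU layers and one linear output per action,
    # the layout used by the DeepSARSA and DQN models added in this patch
    def __init__(self, action_size, hidden_units=30):
        super(QNetwork, self).__init__()
        self.fc1 = Dense(hidden_units, activation='relu')
        self.fc2 = Dense(hidden_units, activation='relu')
        self.fc_out = Dense(action_size)

    def call(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        return self.fc_out(x)
```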
- -**6. DQN** - -**7. Policy Gradient** - - diff --git a/1-grid-world/gridworld.png b/1-grid-world/gridworld.png deleted file mode 100644 index 71468d4..0000000 Binary files a/1-grid-world/gridworld.png and /dev/null differ diff --git a/2-cartpole/1-dqn/save_graph/cartpole_dqn.png b/2-cartpole/1-dqn/save_graph/cartpole_dqn.png deleted file mode 100644 index 384fef6..0000000 Binary files a/2-cartpole/1-dqn/save_graph/cartpole_dqn.png and /dev/null differ diff --git a/2-cartpole/1-dqn/save_graph/graph_trained.png b/2-cartpole/1-dqn/save_graph/graph_trained.png new file mode 100644 index 0000000..05d6c45 Binary files /dev/null and b/2-cartpole/1-dqn/save_graph/graph_trained.png differ diff --git a/2-cartpole/1-dqn/save_model/cartpole_dqn_trained.h5 b/2-cartpole/1-dqn/save_model/cartpole_dqn_trained.h5 deleted file mode 100644 index 50edb6e..0000000 Binary files a/2-cartpole/1-dqn/save_model/cartpole_dqn_trained.h5 and /dev/null differ diff --git a/2-cartpole/1-dqn/save_model/trained/checkpoint b/2-cartpole/1-dqn/save_model/trained/checkpoint new file mode 100644 index 0000000..a6e034f --- /dev/null +++ b/2-cartpole/1-dqn/save_model/trained/checkpoint @@ -0,0 +1,2 @@ +model_checkpoint_path: "model" +all_model_checkpoint_paths: "model" diff --git a/2-cartpole/1-dqn/save_model/trained/model.data-00000-of-00001 b/2-cartpole/1-dqn/save_model/trained/model.data-00000-of-00001 new file mode 100644 index 0000000..5476cf1 Binary files /dev/null and b/2-cartpole/1-dqn/save_model/trained/model.data-00000-of-00001 differ diff --git a/2-cartpole/1-dqn/save_model/trained/model.index b/2-cartpole/1-dqn/save_model/trained/model.index new file mode 100644 index 0000000..b21f192 Binary files /dev/null and b/2-cartpole/1-dqn/save_model/trained/model.index differ diff --git a/2-cartpole/1-dqn/test.py b/2-cartpole/1-dqn/test.py new file mode 100644 index 0000000..2e92ec0 --- /dev/null +++ b/2-cartpole/1-dqn/test.py @@ -0,0 +1,76 @@ +import sys +import gym +import pylab +import random +import numpy as np +from collections import deque +import tensorflow as tf +from tensorflow.keras.layers import Dense +from tensorflow.keras.initializers import RandomUniform + + +# 상태가 입력, 큐함수가 출력인 인공신경망 생성 +class DQN(tf.keras.Model): + def __init__(self, action_size): + super(DQN, self).__init__() + self.fc1 = Dense(24, activation='relu') + self.fc2 = Dense(24, activation='relu') + self.fc_out = Dense(action_size, + kernel_initializer=RandomUniform(-1e-3, 1e-3)) + + def call(self, x): + x = self.fc1(x) + x = self.fc2(x) + q = self.fc_out(x) + return q + + +# 카트폴 예제에서의 DQN 에이전트 +class DQNAgent: + def __init__(self, state_size, action_size): + # 상태와 행동의 크기 정의 + self.state_size = state_size + self.action_size = action_size + + # 모델과 타깃 모델 생성 + self.model = DQN(action_size) + self.model.load_weights("./save_model/trained/model") + + # 입실론 탐욕 정책으로 행동 선택 + def get_action(self, state): + q_value = self.model(state) + return np.argmax(q_value[0]) + + +if __name__ == "__main__": + # CartPole-v1 환경, 최대 타임스텝 수가 500 + env = gym.make('CartPole-v1') + state_size = env.observation_space.shape[0] + action_size = env.action_space.n + + # DQN 에이전트 생성 + agent = DQNAgent(state_size, action_size) + + num_episode = 10 + for e in range(num_episode): + done = False + score = 0 + # env 초기화 + state = env.reset() + state = np.reshape(state, [1, state_size]) + + while not done: + env.render() + + # 현재 상태로 행동을 선택 + action = agent.get_action(state) + # 선택한 행동으로 환경에서 한 타임스텝 진행 + next_state, reward, done, info = env.step(action) + next_state = 
np.reshape(next_state, [1, state_size]) + + score += reward + state = next_state + + if done: + # 에피소드마다 학습 결과 출력 + print("episode: {:3d} | score: {:.3f} ".format(e, score)) \ No newline at end of file diff --git a/2-cartpole/1-dqn/cartpole_dqn.py b/2-cartpole/1-dqn/train.py similarity index 50% rename from 2-cartpole/1-dqn/cartpole_dqn.py rename to 2-cartpole/1-dqn/train.py index 8b7c332..be04f42 100644 --- a/2-cartpole/1-dqn/cartpole_dqn.py +++ b/2-cartpole/1-dqn/train.py @@ -1,21 +1,36 @@ +import os import sys import gym import pylab import random import numpy as np from collections import deque -from keras.layers import Dense -from keras.optimizers import Adam -from keras.models import Sequential +import tensorflow as tf +from tensorflow.keras.layers import Dense +from tensorflow.keras.optimizers import Adam +from tensorflow.keras.initializers import RandomUniform -EPISODES = 300 + +# 상태가 입력, 큐함수가 출력인 인공신경망 생성 +class DQN(tf.keras.Model): + def __init__(self, action_size): + super(DQN, self).__init__() + self.fc1 = Dense(24, activation='relu') + self.fc2 = Dense(24, activation='relu') + self.fc_out = Dense(action_size, + kernel_initializer=RandomUniform(-1e-3, 1e-3)) + + def call(self, x): + x = self.fc1(x) + x = self.fc2(x) + q = self.fc_out(x) + return q # 카트폴 예제에서의 DQN 에이전트 class DQNAgent: def __init__(self, state_size, action_size): self.render = False - self.load_model = False # 상태와 행동의 크기 정의 self.state_size = state_size @@ -34,28 +49,13 @@ def __init__(self, state_size, action_size): self.memory = deque(maxlen=2000) # 모델과 타깃 모델 생성 - self.model = self.build_model() - self.target_model = self.build_model() + self.model = DQN(action_size) + self.target_model = DQN(action_size) + self.optimizer = Adam(lr=self.learning_rate) # 타깃 모델 초기화 self.update_target_model() - if self.load_model: - self.model.load_weights("./save_model/cartpole_dqn_trained.h5") - - # 상태가 입력, 큐함수가 출력인 인공신경망 생성 - def build_model(self): - model = Sequential() - model.add(Dense(24, input_dim=self.state_size, activation='relu', - kernel_initializer='he_uniform')) - model.add(Dense(24, activation='relu', - kernel_initializer='he_uniform')) - model.add(Dense(self.action_size, activation='linear', - kernel_initializer='he_uniform')) - model.summary() - model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) - return model - # 타깃 모델을 모델의 가중치로 업데이트 def update_target_model(self): self.target_model.set_weights(self.model.get_weights()) @@ -65,7 +65,7 @@ def get_action(self, state): if np.random.rand() <= self.epsilon: return random.randrange(self.action_size) else: - q_value = self.model.predict(state) + q_value = self.model(state) return np.argmax(q_value[0]) # 샘플 을 리플레이 메모리에 저장 @@ -80,32 +80,32 @@ def train_model(self): # 메모리에서 배치 크기만큼 무작위로 샘플 추출 mini_batch = random.sample(self.memory, self.batch_size) - states = np.zeros((self.batch_size, self.state_size)) - next_states = np.zeros((self.batch_size, self.state_size)) - actions, rewards, dones = [], [], [] + states = np.array([sample[0][0] for sample in mini_batch]) + actions = np.array([sample[1] for sample in mini_batch]) + rewards = np.array([sample[2] for sample in mini_batch]) + next_states = np.array([sample[3][0] for sample in mini_batch]) + dones = np.array([sample[4] for sample in mini_batch]) - for i in range(self.batch_size): - states[i] = mini_batch[i][0] - actions.append(mini_batch[i][1]) - rewards.append(mini_batch[i][2]) - next_states[i] = mini_batch[i][3] - dones.append(mini_batch[i][4]) + # 학습 파라메터 + model_params = self.model.trainable_variables + 
with tf.GradientTape() as tape: + # 현재 상태에 대한 모델의 큐함수 + predicts = self.model(states) + one_hot_action = tf.one_hot(actions, self.action_size) + predicts = tf.reduce_sum(one_hot_action * predicts, axis=1) - # 현재 상태에 대한 모델의 큐함수 - # 다음 상태에 대한 타깃 모델의 큐함수 - target = self.model.predict(states) - target_val = self.target_model.predict(next_states) + # 다음 상태에 대한 타깃 모델의 큐함수 + target_predicts = self.target_model(next_states) + target_predicts = tf.stop_gradient(target_predicts) - # 벨만 최적 방정식을 이용한 업데이트 타깃 - for i in range(self.batch_size): - if dones[i]: - target[i][actions[i]] = rewards[i] - else: - target[i][actions[i]] = rewards[i] + self.discount_factor * ( - np.amax(target_val[i])) + # 벨만 최적 방정식을 이용한 업데이트 타깃 + max_q = np.amax(target_predicts, axis=-1) + targets = rewards + (1 - dones) * self.discount_factor * max_q + loss = tf.reduce_mean(tf.square(targets - predicts)) - self.model.fit(states, target, batch_size=self.batch_size, - epochs=1, verbose=0) + # 오류함수를 줄이는 방향으로 모델 업데이트 + grads = tape.gradient(loss, model_params) + self.optimizer.apply_gradients(zip(grads, model_params)) if __name__ == "__main__": @@ -118,8 +118,10 @@ def train_model(self): agent = DQNAgent(state_size, action_size) scores, episodes = [], [] + score_avg = 0 - for e in range(EPISODES): + num_episode = 300 + for e in range(num_episode): done = False score = 0 # env 초기화 @@ -135,8 +137,10 @@ def train_model(self): # 선택한 행동으로 환경에서 한 타임스텝 진행 next_state, reward, done, info = env.step(action) next_state = np.reshape(next_state, [1, state_size]) - # 에피소드가 중간에 끝나면 -100 보상 - reward = reward if not done or score == 499 else -100 + + # 타임스텝마다 보상 0.1, 에피소드가 중간에 끝나면 -1 보상 + score += reward + reward = 0.1 if not done or score == 500 else -1 # 리플레이 메모리에 샘플 저장 agent.append_sample(state, action, reward, next_state, done) @@ -144,23 +148,25 @@ def train_model(self): if len(agent.memory) >= agent.train_start: agent.train_model() - score += reward state = next_state if done: # 각 에피소드마다 타깃 모델을 모델의 가중치로 업데이트 agent.update_target_model() - - score = score if score == 500 else score + 100 # 에피소드마다 학습 결과 출력 - scores.append(score) + score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score + print("episode: {:3d} | score avg: {:3.2f} | memory length: {:4d} | epsilon: {:.4f}".format( + e, score_avg, len(agent.memory), agent.epsilon)) + + # 에피소드마다 학습 결과 그래프로 저장 + scores.append(score_avg) episodes.append(e) pylab.plot(episodes, scores, 'b') - pylab.savefig("./save_graph/cartpole_dqn.png") - print("episode:", e, " score:", score, " memory length:", - len(agent.memory), " epsilon:", agent.epsilon) - - # 이전 10개 에피소드의 점수 평균이 490보다 크면 학습 중단 - if np.mean(scores[-min(10, len(scores)):]) > 490: - agent.model.save_weights("./save_model/cartpole_dqn.h5") - sys.exit() + pylab.xlabel("episode") + pylab.ylabel("average score") + pylab.savefig("./save_graph/graph.png") + + # 이동 평균이 400 이상일 때 종료 + if score_avg > 400: + agent.model.save_weights("./save_model/model", save_format="tf") + sys.exit() \ No newline at end of file diff --git a/2-cartpole/2-actor-critic/cartpole_a2c.py b/2-cartpole/2-actor-critic/cartpole_a2c.py deleted file mode 100644 index 7121fa8..0000000 --- a/2-cartpole/2-actor-critic/cartpole_a2c.py +++ /dev/null @@ -1,159 +0,0 @@ -import sys -import gym -import pylab -import numpy as np -from keras.layers import Dense -from keras.models import Sequential -from keras.optimizers import Adam -from keras import backend as K - -EPISODES = 1000 - - -# 카트폴 예제에서의 액터-크리틱(A2C) 에이전트 -class A2CAgent: - def __init__(self, state_size, action_size): - 
self.render = False - self.load_model = False - # 상태와 행동의 크기 정의 - self.state_size = state_size - self.action_size = action_size - self.value_size = 1 - - # 액터-크리틱 하이퍼파라미터 - self.discount_factor = 0.99 - self.actor_lr = 0.001 - self.critic_lr = 0.005 - - # 정책신경망과 가치신경망 생성 - self.actor = self.build_actor() - self.critic = self.build_critic() - self.actor_updater = self.actor_optimizer() - self.critic_updater = self.critic_optimizer() - - if self.load_model: - self.actor.load_weights("./save_model/cartpole_actor_trained.h5") - self.critic.load_weights("./save_model/cartpole_critic_trained.h5") - - # actor: 상태를 받아 각 행동의 확률을 계산 - def build_actor(self): - actor = Sequential() - actor.add(Dense(24, input_dim=self.state_size, activation='relu', - kernel_initializer='he_uniform')) - actor.add(Dense(self.action_size, activation='softmax', - kernel_initializer='he_uniform')) - actor.summary() - return actor - - # critic: 상태를 받아서 상태의 가치를 계산 - def build_critic(self): - critic = Sequential() - critic.add(Dense(24, input_dim=self.state_size, activation='relu', - kernel_initializer='he_uniform')) - critic.add(Dense(24, input_dim=self.state_size, activation='relu', - kernel_initializer='he_uniform')) - critic.add(Dense(self.value_size, activation='linear', - kernel_initializer='he_uniform')) - critic.summary() - return critic - - # 정책신경망의 출력을 받아 확률적으로 행동을 선택 - def get_action(self, state): - policy = self.actor.predict(state, batch_size=1).flatten() - return np.random.choice(self.action_size, 1, p=policy)[0] - - # 정책신경망을 업데이트하는 함수 - def actor_optimizer(self): - action = K.placeholder(shape=[None, self.action_size]) - advantage = K.placeholder(shape=[None, ]) - - action_prob = K.sum(action * self.actor.output, axis=1) - cross_entropy = K.log(action_prob) * advantage - loss = -K.sum(cross_entropy) - - optimizer = Adam(lr=self.actor_lr) - updates = optimizer.get_updates(self.actor.trainable_weights, [], loss) - train = K.function([self.actor.input, action, advantage], [], - updates=updates) - return train - - # 가치신경망을 업데이트하는 함수 - def critic_optimizer(self): - target = K.placeholder(shape=[None, ]) - - loss = K.mean(K.square(target - self.critic.output)) - - optimizer = Adam(lr=self.critic_lr) - updates = optimizer.get_updates(self.critic.trainable_weights, [], loss) - train = K.function([self.critic.input, target], [], updates=updates) - - return train - - # 각 타임스텝마다 정책신경망과 가치신경망을 업데이트 - def train_model(self, state, action, reward, next_state, done): - value = self.critic.predict(state)[0] - next_value = self.critic.predict(next_state)[0] - - act = np.zeros([1, self.action_size]) - act[0][action] = 1 - - # 벨만 기대 방정식를 이용한 어드벤티지와 업데이트 타깃 - if done: - advantage = reward - value - target = [reward] - else: - advantage = (reward + self.discount_factor * next_value) - value - target = reward + self.discount_factor * next_value - - self.actor_updater([state, act, advantage]) - self.critic_updater([state, target]) - - -if __name__ == "__main__": - # CartPole-v1 환경, 최대 타임스텝 수가 500 - env = gym.make('CartPole-v1') - # 환경으로부터 상태와 행동의 크기를 받아옴 - state_size = env.observation_space.shape[0] - action_size = env.action_space.n - - # 액터-크리틱(A2C) 에이전트 생성 - agent = A2CAgent(state_size, action_size) - - scores, episodes = [], [] - - for e in range(EPISODES): - done = False - score = 0 - state = env.reset() - state = np.reshape(state, [1, state_size]) - - while not done: - if agent.render: - env.render() - - action = agent.get_action(state) - next_state, reward, done, info = env.step(action) - next_state = np.reshape(next_state, [1, 
state_size]) - # 에피소드가 중간에 끝나면 -100 보상 - reward = reward if not done or score == 499 else -100 - - agent.train_model(state, action, reward, next_state, done) - - score += reward - state = next_state - - if done: - # 에피소드마다 학습 결과 출력 - score = score if score == 500.0 else score + 100 - scores.append(score) - episodes.append(e) - pylab.plot(episodes, scores, 'b') - pylab.savefig("./save_graph/cartpole_a2c.png") - print("episode:", e, " score:", score) - - # 이전 10개 에피소드의 점수 평균이 490보다 크면 학습 중단 - if np.mean(scores[-min(10, len(scores)):]) > 490: - agent.actor.save_weights("./save_model/cartpole_actor.h5") - agent.critic.save_weights( - "./save_model/cartpole_critic.h5") - sys.exit() diff --git a/2-cartpole/2-actor-critic/save_graph/cartpole_a2c.png b/2-cartpole/2-actor-critic/save_graph/cartpole_a2c.png deleted file mode 100644 index 92fdb86..0000000 Binary files a/2-cartpole/2-actor-critic/save_graph/cartpole_a2c.png and /dev/null differ diff --git a/2-cartpole/2-actor-critic/save_graph/graph_trained.png b/2-cartpole/2-actor-critic/save_graph/graph_trained.png new file mode 100644 index 0000000..19d61d1 Binary files /dev/null and b/2-cartpole/2-actor-critic/save_graph/graph_trained.png differ diff --git a/2-cartpole/2-actor-critic/save_model/cartpole_actor_trained.h5 b/2-cartpole/2-actor-critic/save_model/cartpole_actor_trained.h5 deleted file mode 100644 index 19a3aa7..0000000 Binary files a/2-cartpole/2-actor-critic/save_model/cartpole_actor_trained.h5 and /dev/null differ diff --git a/2-cartpole/2-actor-critic/save_model/cartpole_critic_trained.h5 b/2-cartpole/2-actor-critic/save_model/cartpole_critic_trained.h5 deleted file mode 100644 index d16fafd..0000000 Binary files a/2-cartpole/2-actor-critic/save_model/cartpole_critic_trained.h5 and /dev/null differ diff --git a/2-cartpole/2-actor-critic/save_model/trained/checkpoint b/2-cartpole/2-actor-critic/save_model/trained/checkpoint new file mode 100644 index 0000000..a6e034f --- /dev/null +++ b/2-cartpole/2-actor-critic/save_model/trained/checkpoint @@ -0,0 +1,2 @@ +model_checkpoint_path: "model" +all_model_checkpoint_paths: "model" diff --git a/2-cartpole/2-actor-critic/save_model/trained/model.data-00000-of-00001 b/2-cartpole/2-actor-critic/save_model/trained/model.data-00000-of-00001 new file mode 100644 index 0000000..8ef32fd Binary files /dev/null and b/2-cartpole/2-actor-critic/save_model/trained/model.data-00000-of-00001 differ diff --git a/2-cartpole/2-actor-critic/save_model/trained/model.index b/2-cartpole/2-actor-critic/save_model/trained/model.index new file mode 100644 index 0000000..145c7c3 Binary files /dev/null and b/2-cartpole/2-actor-critic/save_model/trained/model.index differ diff --git a/2-cartpole/2-actor-critic/test.py b/2-cartpole/2-actor-critic/test.py new file mode 100644 index 0000000..2adf3a9 --- /dev/null +++ b/2-cartpole/2-actor-critic/test.py @@ -0,0 +1,78 @@ +import sys +import gym +import pylab +import numpy as np +import tensorflow as tf +from tensorflow.keras.layers import Dense +from tensorflow.keras.initializers import RandomUniform + + +# 정책 신경망과 가치 신경망 생성 +class A2C(tf.keras.Model): + def __init__(self, action_size): + super(A2C, self).__init__() + self.actor_fc = Dense(24, activation='tanh') + self.actor_out = Dense(action_size, activation='softmax', + kernel_initializer=RandomUniform(-1e-3, 1e-3)) + self.critic_fc1 = Dense(24, activation='tanh') + self.critic_fc2 = Dense(24, activation='tanh') + self.critic_out = Dense(1, + kernel_initializer=RandomUniform(-1e-3, 1e-3)) + + def call(self, x): + 
actor_x = self.actor_fc(x) + policy = self.actor_out(actor_x) + + critic_x = self.critic_fc1(x) + critic_x = self.critic_fc2(critic_x) + value = self.critic_out(critic_x) + return policy, value + + + +# 카트폴 예제에서의 액터-크리틱(A2C) 에이전트 +class A2CAgent: + def __init__(self, action_size): + # 행동의 크기 정의 + self.action_size = action_size + + # 정책신경망과 가치신경망 생성 + self.model = A2C(self.action_size) + self.model.load_weights("./save_model/trained/model") + + # 정책신경망의 출력을 받아 확률적으로 행동을 선택 + def get_action(self, state): + policy, _ = self.model(state) + policy = np.array(policy[0]) + return np.random.choice(self.action_size, 1, p=policy)[0] + + +if __name__ == "__main__": + # CartPole-v1 환경, 최대 타임스텝 수가 500 + env = gym.make('CartPole-v1') + # 환경으로부터 상태와 행동의 크기를 받아옴 + state_size = env.observation_space.shape[0] + action_size = env.action_space.n + + # 액터-크리틱(A2C) 에이전트 생성 + agent = A2CAgent(action_size) + + num_episode = 10 + for e in range(num_episode): + done = False + score = 0 + state = env.reset() + state = np.reshape(state, [1, state_size]) + + while not done: + env.render() + + action = agent.get_action(state) + next_state, reward, done, info = env.step(action) + next_state = np.reshape(next_state, [1, state_size]) + + score += reward + state = next_state + + if done: + print("episode: {:3d} | score: {:3d}".format(e, int(score))) \ No newline at end of file diff --git a/2-cartpole/2-actor-critic/train.py b/2-cartpole/2-actor-critic/train.py new file mode 100644 index 0000000..eb7afb2 --- /dev/null +++ b/2-cartpole/2-actor-critic/train.py @@ -0,0 +1,139 @@ +import sys +import gym +import pylab +import numpy as np +import tensorflow as tf +from tensorflow.keras.layers import Dense +from tensorflow.keras.optimizers import Adam +from tensorflow.keras.initializers import RandomUniform + + +# 정책 신경망과 가치 신경망 생성 +class A2C(tf.keras.Model): + def __init__(self, action_size): + super(A2C, self).__init__() + self.actor_fc = Dense(24, activation='tanh') + self.actor_out = Dense(action_size, activation='softmax', + kernel_initializer=RandomUniform(-1e-3, 1e-3)) + self.critic_fc1 = Dense(24, activation='tanh') + self.critic_fc2 = Dense(24, activation='tanh') + self.critic_out = Dense(1, + kernel_initializer=RandomUniform(-1e-3, 1e-3)) + + def call(self, x): + actor_x = self.actor_fc(x) + policy = self.actor_out(actor_x) + + critic_x = self.critic_fc1(x) + critic_x = self.critic_fc2(critic_x) + value = self.critic_out(critic_x) + return policy, value + + +# 카트폴 예제에서의 액터-크리틱(A2C) 에이전트 +class A2CAgent: + def __init__(self, action_size): + self.render = False + + # 행동의 크기 정의 + self.action_size = action_size + + # 액터-크리틱 하이퍼파라미터 + self.discount_factor = 0.99 + self.learning_rate = 0.001 + + # 정책신경망과 가치신경망 생성 + self.model = A2C(self.action_size) + # 최적화 알고리즘 설정, 미분값이 너무 커지는 현상을 막기 위해 clipnorm 설정 + self.optimizer = Adam(lr=self.learning_rate, clipnorm=5.0) + + # 정책신경망의 출력을 받아 확률적으로 행동을 선택 + def get_action(self, state): + policy, _ = self.model(state) + policy = np.array(policy[0]) + return np.random.choice(self.action_size, 1, p=policy)[0] + + # 각 타임스텝마다 정책신경망과 가치신경망을 업데이트 + def train_model(self, state, action, reward, next_state, done): + model_params = self.model.trainable_variables + with tf.GradientTape() as tape: + policy, value = self.model(state) + _, next_value = self.model(next_state) + target = reward + (1 - done) * self.discount_factor * next_value[0] + + # 정책 신경망 오류 함수 구하기 + one_hot_action = tf.one_hot([action], self.action_size) + action_prob = tf.reduce_sum(one_hot_action * policy, axis=1) + cross_entropy = - 
tf.math.log(action_prob + 1e-5) + advantage = tf.stop_gradient(target - value[0]) + actor_loss = tf.reduce_mean(cross_entropy * advantage) + + # 가치 신경망 오류 함수 구하기 + critic_loss = 0.5 * tf.square(tf.stop_gradient(target) - value[0]) + critic_loss = tf.reduce_mean(critic_loss) + + # 하나의 오류 함수로 만들기 + loss = 0.2 * actor_loss + critic_loss + + # 오류함수를 줄이는 방향으로 모델 업데이트 + grads = tape.gradient(loss, model_params) + self.optimizer.apply_gradients(zip(grads, model_params)) + return np.array(loss) + + +if __name__ == "__main__": + # CartPole-v1 환경, 최대 타임스텝 수가 500 + env = gym.make('CartPole-v1') + # 환경으로부터 상태와 행동의 크기를 받아옴 + state_size = env.observation_space.shape[0] + action_size = env.action_space.n + + # 액터-크리틱(A2C) 에이전트 생성 + agent = A2CAgent(action_size) + + scores, episodes = [], [] + score_avg = 0 + + num_episode = 1000 + for e in range(num_episode): + done = False + score = 0 + loss_list = [] + state = env.reset() + state = np.reshape(state, [1, state_size]) + + while not done: + if agent.render: + env.render() + + action = agent.get_action(state) + next_state, reward, done, info = env.step(action) + next_state = np.reshape(next_state, [1, state_size]) + + # 타임스텝마다 보상 0.1, 에피소드가 중간에 끝나면 -1 보상 + score += reward + reward = 0.1 if not done or score == 500 else -1 + + # 매 타임스텝마다 학습 + loss = agent.train_model(state, action, reward, next_state, done) + loss_list.append(loss) + state = next_state + + if done: + # 에피소드마다 학습 결과 출력 + score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score + print("episode: {:3d} | score avg: {:3.2f} | loss: {:.3f}".format( + e, score_avg, np.mean(loss_list))) + + # 에피소드마다 학습 결과 그래프로 저장 + scores.append(score_avg) + episodes.append(e) + pylab.plot(episodes, scores, 'b') + pylab.xlabel("episode") + pylab.ylabel("average score") + pylab.savefig("./save_graph/graph.png") + + # 이동 평균이 400 이상일 때 종료 + if score_avg > 400: + agent.model.save_weights("./save_model/model", save_format="tf") + sys.exit() \ No newline at end of file diff --git a/2-cartpole/3-continuous-actor-critic/env.py b/2-cartpole/3-continuous-actor-critic/env.py new file mode 100644 index 0000000..4c36a24 --- /dev/null +++ b/2-cartpole/3-continuous-actor-critic/env.py @@ -0,0 +1,163 @@ +""" +Classic cart-pole system implemented by Rich Sutton et al. 
+Copied from http://incompleteideas.net/sutton/book/code/pole.c +permalink: https://perma.cc/C9ZM-652R +""" + +import math +import gym +from gym import spaces, logger +from gym.utils import seeding +import numpy as np + + +class ContinuousCartPoleEnv(gym.Env): + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second': 50 + } + + def __init__(self): + self.gravity = 9.8 + self.masscart = 1.0 + self.masspole = 0.1 + self.total_mass = (self.masspole + self.masscart) + self.length = 0.5 # actually half the pole's length + self.polemass_length = (self.masspole * self.length) + self.force_mag = 10.0 + self.tau = 0.02 # seconds between state updates + self.max_action = 3.0 + self.kinematics_integrator = 'euler' + + # Angle at which to fail the episode + self.theta_threshold_radians = 12 * 2 * math.pi / 360 + self.x_threshold = 2.4 + + # Angle limit set to 2 * theta_threshold_radians so failing observation + # is still within bounds + high = np.array([ + self.x_threshold * 2, + np.finfo(np.float32).max, + self.theta_threshold_radians * 2, + np.finfo(np.float32).max]) + + self.action_space = spaces.Box( + low=-self.max_action, + high=self.max_action, + shape=(1,) + ) + self.observation_space = spaces.Box(-high, high) + + self.seed() + self.viewer = None + self.state = None + + self.steps_beyond_done = None + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def step(self, action): + assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action)) + state = self.state + x, x_dot, theta, theta_dot = state + force = self.force_mag * float(action) + costheta = math.cos(theta) + sintheta = math.sin(theta) + temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta) / self.total_mass + thetaacc = (self.gravity * sintheta - costheta* temp) / (self.length * (4.0/3.0 - self.masspole * costheta * costheta / self.total_mass)) + xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass + if self.kinematics_integrator == 'euler': + x = x + self.tau * x_dot + x_dot = x_dot + self.tau * xacc + theta = theta + self.tau * theta_dot + theta_dot = theta_dot + self.tau * thetaacc + else: # semi-implicit euler + x_dot = x_dot + self.tau * xacc + x = x + self.tau * x_dot + theta_dot = theta_dot + self.tau * thetaacc + theta = theta + self.tau * theta_dot + self.state = (x,x_dot,theta,theta_dot) + done = x < -self.x_threshold \ + or x > self.x_threshold \ + or theta < -self.theta_threshold_radians \ + or theta > self.theta_threshold_radians + done = bool(done) + + if not done: + reward = 1.0 + elif self.steps_beyond_done is None: + # Pole just fell! + self.steps_beyond_done = 0 + reward = 1.0 + else: + if self.steps_beyond_done == 0: + logger.warn("You are calling 'step()' even though this environment has already returned done = True. 
You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.") + self.steps_beyond_done += 1 + reward = 0.0 + + return np.array(self.state, dtype=np.float32), reward, done, {} + + def reset(self): + self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,)) + self.steps_beyond_done = None + return np.array(self.state, dtype=np.float32) + + def render(self, mode='human'): + screen_width = 600 + screen_height = 400 + + world_width = self.x_threshold * 2 + scale = screen_width /world_width + carty = 100 # TOP OF CART + polewidth = 10.0 + polelen = scale * 1.0 + cartwidth = 50.0 + cartheight = 30.0 + + if self.viewer is None: + from gym.envs.classic_control import rendering + self.viewer = rendering.Viewer(screen_width, screen_height) + l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2 + axleoffset = cartheight / 4.0 + cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)]) + self.carttrans = rendering.Transform() + cart.add_attr(self.carttrans) + self.viewer.add_geom(cart) + l, r, t, b = -polewidth / 2, polewidth / 2, polelen-polewidth / 2, -polewidth / 2 + pole = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)]) + pole.set_color(.8, .6, .4) + self.poletrans = rendering.Transform(translation=(0, axleoffset)) + pole.add_attr(self.poletrans) + pole.add_attr(self.carttrans) + self.viewer.add_geom(pole) + self.axle = rendering.make_circle(polewidth / 2) + self.axle.add_attr(self.poletrans) + self.axle.add_attr(self.carttrans) + self.axle.set_color(.5, .5, .8) + self.viewer.add_geom(self.axle) + self.track = rendering.Line((0, carty), (screen_width, carty)) + self.track.set_color(0, 0, 0) + self.viewer.add_geom(self.track) + + self._pole_geom = pole + + if self.state is None: return None + + # Edit the pole polygon vertex + pole = self._pole_geom + l,r,t,b = -polewidth/2,polewidth/2,polelen-polewidth/2,-polewidth/2 + pole.v = [(l,b), (l,t), (r,t), (r,b)] + + x = self.state + cartx = x[0] * scale + screen_width / 2.0 # MIDDLE OF CART + self.carttrans.set_translation(cartx, carty) + self.poletrans.set_rotation(-x[2]) + + return self.viewer.render(return_rgb_array=(mode == 'rgb_array')) + + def close(self): + if self.viewer: + self.viewer.close() + self.viewer = None \ No newline at end of file diff --git a/2-cartpole/3-continuous-actor-critic/save_graph/graph_trained.png b/2-cartpole/3-continuous-actor-critic/save_graph/graph_trained.png new file mode 100644 index 0000000..ca942e3 Binary files /dev/null and b/2-cartpole/3-continuous-actor-critic/save_graph/graph_trained.png differ diff --git a/2-cartpole/3-continuous-actor-critic/save_model/trained/checkpoint b/2-cartpole/3-continuous-actor-critic/save_model/trained/checkpoint new file mode 100644 index 0000000..a6e034f --- /dev/null +++ b/2-cartpole/3-continuous-actor-critic/save_model/trained/checkpoint @@ -0,0 +1,2 @@ +model_checkpoint_path: "model" +all_model_checkpoint_paths: "model" diff --git a/2-cartpole/3-continuous-actor-critic/save_model/trained/model.data-00000-of-00001 b/2-cartpole/3-continuous-actor-critic/save_model/trained/model.data-00000-of-00001 new file mode 100644 index 0000000..4c56d7a Binary files /dev/null and b/2-cartpole/3-continuous-actor-critic/save_model/trained/model.data-00000-of-00001 differ diff --git a/2-cartpole/3-continuous-actor-critic/save_model/trained/model.index b/2-cartpole/3-continuous-actor-critic/save_model/trained/model.index new file mode 100644 index 0000000..7954513 Binary files /dev/null and 
b/2-cartpole/3-continuous-actor-critic/save_model/trained/model.index differ diff --git a/2-cartpole/3-continuous-actor-critic/test.py b/2-cartpole/3-continuous-actor-critic/test.py new file mode 100644 index 0000000..62fcde3 --- /dev/null +++ b/2-cartpole/3-continuous-actor-critic/test.py @@ -0,0 +1,95 @@ +import sys +import gym +import pylab +import numpy as np +import tensorflow as tf +from tensorflow.keras.layers import Dense +from tensorflow.keras.initializers import RandomUniform +from tensorflow_probability import distributions as tfd + + +# 정책 신경망과 가치 신경망 생성 +class ContinuousA2C(tf.keras.Model): + def __init__(self, action_size): + super(ContinuousA2C, self).__init__() + self.actor_fc1 = Dense(24, activation='tanh') + self.actor_mu = Dense(action_size, + kernel_initializer=RandomUniform(-1e-3, 1e-3)) + self.actor_sigma = Dense(action_size, activation='softplus', + kernel_initializer=RandomUniform(-1e-3, 1e-3)) + + self.critic_fc1 = Dense(24, activation='tanh') + self.critic_fc2 = Dense(24, activation='tanh') + self.critic_out = Dense(1, + kernel_initializer=RandomUniform(-1e-3, 1e-3)) + + def call(self, x): + actor_x = self.actor_fc1(x) + mu = self.actor_mu(actor_x) + sigma = self.actor_sigma(actor_x) + sigma = sigma + 1e-5 + + critic_x = self.critic_fc1(x) + critic_x = self.critic_fc2(critic_x) + value = self.critic_out(critic_x) + return mu, sigma, value + + +# 카트폴 예제에서의 액터-크리틱(A2C) 에이전트 +class ContinuousA2CAgent: + def __init__(self, action_size, max_action): + # 행동의 크기 정의 + self.action_size = action_size + self.max_action = max_action + + # 정책신경망과 가치신경망 생성 + self.model = ContinuousA2C(self.action_size) + self.model.load_weights("./save_model/trained/model") + + # 정책신경망의 출력을 받아 확률적으로 행동을 선택 + def get_action(self, state): + mu, sigma, _ = self.model(state) + dist = tfd.Normal(loc=mu[0], scale=sigma[0]) + action = dist.sample([1])[0] + action = np.clip(action, -self.max_action, self.max_action) + return action + + +if __name__ == "__main__": + # CartPole-v1 환경, 최대 타임스텝 수가 500 + gym.envs.register( + id='CartPoleContinuous-v0', + entry_point='env:ContinuousCartPoleEnv', + max_episode_steps=500, + reward_threshold=475.0) + + env = gym.make('CartPoleContinuous-v0') + # 환경으로부터 상태와 행동의 크기를 받아옴 + state_size = env.observation_space.shape[0] + action_size = env.action_space.shape[0] + max_action = env.action_space.high[0] + + # 액터-크리틱(A2C) 에이전트 생성 + agent = ContinuousA2CAgent(action_size, max_action) + + scores, episodes = [], [] + + num_episode = 10 + for e in range(num_episode): + done = False + score = 0 + state = env.reset() + state = np.reshape(state, [1, state_size]) + + while not done: + env.render() + + action = agent.get_action(state) + next_state, reward, done, info = env.step(action) + next_state = np.reshape(next_state, [1, state_size]) + + score += reward + state = next_state + + if done: + print("episode: {:3d} | score: {:3d}".format(e, int(score))) \ No newline at end of file diff --git a/2-cartpole/3-continuous-actor-critic/train.py b/2-cartpole/3-continuous-actor-critic/train.py new file mode 100644 index 0000000..bfce749 --- /dev/null +++ b/2-cartpole/3-continuous-actor-critic/train.py @@ -0,0 +1,154 @@ +import sys +import gym +import pylab +import numpy as np +import tensorflow as tf +from tensorflow.keras.layers import Dense +from tensorflow.keras.optimizers import Adam +from tensorflow.keras.initializers import RandomUniform +from tensorflow_probability import distributions as tfd + + +# 정책 신경망과 가치 신경망 생성 +class ContinuousA2C(tf.keras.Model): + def __init__(self, 
action_size): + super(ContinuousA2C, self).__init__() + self.actor_fc1 = Dense(24, activation='tanh') + self.actor_mu = Dense(action_size, + kernel_initializer=RandomUniform(-1e-3, 1e-3)) + self.actor_sigma = Dense(action_size, activation='sigmoid', + kernel_initializer=RandomUniform(-1e-3, 1e-3)) + + self.critic_fc1 = Dense(24, activation='tanh') + self.critic_fc2 = Dense(24, activation='tanh') + self.critic_out = Dense(1, + kernel_initializer=RandomUniform(-1e-3, 1e-3)) + + def call(self, x): + actor_x = self.actor_fc1(x) + mu = self.actor_mu(actor_x) + sigma = self.actor_sigma(actor_x) + sigma = sigma + 1e-5 + + critic_x = self.critic_fc1(x) + critic_x = self.critic_fc2(critic_x) + value = self.critic_out(critic_x) + return mu, sigma, value + + +# 카트폴 예제에서의 연속적 액터-크리틱(A2C) 에이전트 +class ContinuousA2CAgent: + def __init__(self, action_size, max_action): + self.render = False + + # 행동의 크기 정의 + self.action_size = action_size + self.max_action = max_action + + # 액터-크리틱 하이퍼파라미터 + self.discount_factor = 0.99 + self.learning_rate = 0.001 + + # 정책신경망과 가치신경망 생성 + self.model = ContinuousA2C(self.action_size) + # 최적화 알고리즘 설정, 미분값이 너무 커지는 현상을 막기 위해 clipnorm 설정 + self.optimizer = Adam(lr=self.learning_rate, clipnorm=1.0) + + # 정책신경망의 출력을 받아 확률적으로 행동을 선택 + def get_action(self, state): + mu, sigma, _ = self.model(state) + dist = tfd.Normal(loc=mu[0], scale=sigma[0]) + action = dist.sample([1])[0] + action = np.clip(action, -self.max_action, self.max_action) + return action + + # 각 타임스텝마다 정책신경망과 가치신경망을 업데이트 + def train_model(self, state, action, reward, next_state, done): + model_params = self.model.trainable_variables + with tf.GradientTape() as tape: + mu, sigma, value = self.model(state) + _, _, next_value = self.model(next_state) + target = reward + (1 - done) * self.discount_factor * next_value[0] + + # 정책 신경망 오류 함수 구하기 + advantage = tf.stop_gradient(target - value[0]) + dist = tfd.Normal(loc=mu, scale=sigma) + action_prob = dist.prob([action])[0] + cross_entropy = - tf.math.log(action_prob + 1e-5) + actor_loss = tf.reduce_mean(cross_entropy * advantage) + + # 가치 신경망 오류 함수 구하기 + critic_loss = 0.5 * tf.square(tf.stop_gradient(target) - value[0]) + critic_loss = tf.reduce_mean(critic_loss) + + # 하나의 오류 함수로 만들기 + loss = 0.1 * actor_loss + critic_loss + + # 오류함수를 줄이는 방향으로 모델 업데이트 + grads = tape.gradient(loss, model_params) + self.optimizer.apply_gradients(zip(grads, model_params)) + return loss, sigma + + +if __name__ == "__main__": + # CartPole-v1 환경, 최대 타임스텝 수가 500 + gym.envs.register( + id='CartPoleContinuous-v0', + entry_point='env:ContinuousCartPoleEnv', + max_episode_steps=500, + reward_threshold=475.0) + + env = gym.make('CartPoleContinuous-v0') + # 환경으로부터 상태와 행동의 크기를 받아옴 + state_size = env.observation_space.shape[0] + action_size = env.action_space.shape[0] + max_action = env.action_space.high[0] + + # 액터-크리틱(A2C) 에이전트 생성 + agent = ContinuousA2CAgent(action_size, max_action) + scores, episodes = [], [] + score_avg = 0 + + num_episode = 1000 + for e in range(num_episode): + done = False + score = 0 + loss_list, sigma_list = [], [] + state = env.reset() + state = np.reshape(state, [1, state_size]) + + while not done: + if agent.render: + env.render() + + action = agent.get_action(state) + next_state, reward, done, info = env.step(action) + next_state = np.reshape(next_state, [1, state_size]) + + # 타임스텝마다 보상 0.1, 에피소드가 중간에 끝나면 -1 보상 + score += reward + reward = 0.1 if not done or score == 500 else -1 + + # 매 타임스텝마다 학습 + loss, sigma = agent.train_model(state, action, reward, next_state, done) + 
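# Note on train_model just above: the actor head outputs (mu, sigma) of a Normal
# distribution, the executed action is a sample from it, and the policy loss is
# -log N(action | mu, sigma) weighted by the advantage (wrapped in stop_gradient
# so it acts as a constant). A shrinking sigma in the per-episode log below
# therefore indicates the policy is becoming more deterministic.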
loss_list.append(loss) + sigma_list.append(sigma) + state = next_state + + if done: + # 에피소드마다 학습 결과 출력 + score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score + print("episode: {:3d} | score avg: {:3.2f} | loss: {:.3f} | sigma: {:.3f}".format( + e, score_avg, np.mean(loss_list), np.mean(sigma))) + + scores.append(score_avg) + episodes.append(e) + pylab.plot(episodes, scores, 'b') + pylab.xlabel("episode") + pylab.ylabel("average score") + pylab.savefig("./save_graph/graph.png") + + # 이동 평균이 400 이상일 때 종료 + if score_avg > 400: + agent.model.save_weights("./save_model/model", save_format="tf") + sys.exit() diff --git a/2-cartpole/LICENSE b/2-cartpole/LICENSE deleted file mode 100644 index 5c61d8a..0000000 --- a/2-cartpole/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2017 Keon Kim - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/2-cartpole/README.md b/2-cartpole/README.md deleted file mode 100644 index 49a4d30..0000000 --- a/2-cartpole/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# OpenAI gym Cartpole - - -Various reinforcement learning algorithms for Cartpole example. -
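# The train scripts above all track progress with the same exponential moving
# average, score_avg = 0.9 * score_avg + 0.1 * score, seeded with the first
# episode's score, and stop once it exceeds 400. A minimal standalone sketch of
# that bookkeeping follows; the helper name and the list of episode scores are
# made up for illustration and are not part of the patch.

def update_score_avg(score_avg, score):
    # First episode: take the raw score; afterwards blend 90% old / 10% new.
    return score if score_avg == 0 else 0.9 * score_avg + 0.1 * score

if __name__ == "__main__":
    score_avg = 0
    for score in [20, 35, 120, 480, 500, 500]:   # hypothetical episode scores
        score_avg = update_score_avg(score_avg, score)
        print("score avg: {:3.2f}".format(score_avg))
        if score_avg > 400:
            print("moving average above 400 -> training would stop here")
            break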

diff --git a/2-cartpole/cartpole.png b/2-cartpole/cartpole.png deleted file mode 100644 index c8a7aea..0000000 Binary files a/2-cartpole/cartpole.png and /dev/null differ diff --git a/3-atari/1-breakout-dqn/save_model/trained/checkpoint b/3-atari/1-breakout-dqn/save_model/trained/checkpoint new file mode 100644 index 0000000..a6e034f --- /dev/null +++ b/3-atari/1-breakout-dqn/save_model/trained/checkpoint @@ -0,0 +1,2 @@ +model_checkpoint_path: "model" +all_model_checkpoint_paths: "model" diff --git a/3-atari/1-breakout-dqn/save_model/trained/model.data-00000-of-00002 b/3-atari/1-breakout-dqn/save_model/trained/model.data-00000-of-00002 new file mode 100644 index 0000000..de8bcf5 Binary files /dev/null and b/3-atari/1-breakout-dqn/save_model/trained/model.data-00000-of-00002 differ diff --git a/3-atari/1-breakout-dqn/save_model/trained/model.data-00001-of-00002 b/3-atari/1-breakout-dqn/save_model/trained/model.data-00001-of-00002 new file mode 100644 index 0000000..cdf6a3d Binary files /dev/null and b/3-atari/1-breakout-dqn/save_model/trained/model.data-00001-of-00002 differ diff --git a/3-atari/1-breakout-dqn/save_model/trained/model.index b/3-atari/1-breakout-dqn/save_model/trained/model.index new file mode 100644 index 0000000..d106e23 Binary files /dev/null and b/3-atari/1-breakout-dqn/save_model/trained/model.index differ diff --git a/3-atari/1-breakout-dqn/summary/breakout_dqn/events.out.tfevents.1583677266.cqcpu3.11796.5.v2 b/3-atari/1-breakout-dqn/summary/breakout_dqn/events.out.tfevents.1583677266.cqcpu3.11796.5.v2 new file mode 100644 index 0000000..3010474 Binary files /dev/null and b/3-atari/1-breakout-dqn/summary/breakout_dqn/events.out.tfevents.1583677266.cqcpu3.11796.5.v2 differ diff --git a/3-atari/1-breakout-dqn/test.py b/3-atari/1-breakout-dqn/test.py new file mode 100644 index 0000000..27ce076 --- /dev/null +++ b/3-atari/1-breakout-dqn/test.py @@ -0,0 +1,133 @@ +import gym +import time +import random +import numpy as np +import tensorflow as tf + +from skimage.color import rgb2gray +from skimage.transform import resize + +from tensorflow.keras.layers import Conv2D, Dense, Flatten + + +# 상태가 입력, 큐함수가 출력인 인공신경망 생성 +class DQN(tf.keras.Model): + def __init__(self, action_size, state_size): + super(DQN, self).__init__() + self.conv1 = Conv2D(32, (8, 8), strides=(4, 4), activation='relu', + input_shape=state_size) + self.conv2 = Conv2D(64, (4, 4), strides=(2, 2), activation='relu') + self.conv3 = Conv2D(64, (3, 3), strides=(1, 1), activation='relu') + self.flatten = Flatten() + self.fc = Dense(512, activation='relu') + self.fc_out = Dense(action_size) + + def call(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + x = self.flatten(x) + x = self.fc(x) + q = self.fc_out(x) + return q + + +# 브레이크아웃 예제에서의 DQN 에이전트 +class DQNAgent: + def __init__(self, action_size, state_size, model_path): + self.render = False + + # 상태와 행동의 크기 정의 + self.state_size = state_size + self.action_size = action_size + + self.epsilon = 0.02 + + # 모델과 타깃 모델 생성 + self.model = DQN(action_size, state_size) + self.model.load_weights(model_path) + + # 입실론 탐욕 정책으로 행동 선택 + def get_action(self, history): + history = np.float32(history / 255.0) + if np.random.rand() <= self.epsilon: + return random.randrange(self.action_size) + else: + q_value = self.model(history) + return np.argmax(q_value[0]) + + +def pre_processing(observe): + processed_observe = np.uint8( + resize(rgb2gray(observe), (84, 84), mode='constant') * 255) + return processed_observe + + +if __name__ == "__main__": + # 환경 세팅 + env 
= gym.make("BreakoutDeterministic-v4") + render = True + + # 테스트를 위한 에이전트 생성 + state_size = (84, 84, 4) + action_size = 3 + model_path = './save_model/trained/model' + agent = DQNAgent(action_size, state_size, model_path) + + # 불필요한 행동을 없애주기 위한 딕셔너리 선언 + action_dict = {0:1, 1:2, 2:3, 3:3} + + num_episode = 10 + for e in range(num_episode): + done = False + dead = False + + score, start_life = 0, 5 + # env 초기화 + observe = env.reset() + + # 랜덤으로 뽑힌 값 만큼의 프레임동안 움직이지 않음 + for _ in range(random.randint(1, 30)): + observe, _, _, _ = env.step(1) + + # 프레임을 전처리 한 후 4개의 상태를 쌓아서 입력값으로 사용. + state = pre_processing(observe) + history = np.stack([state, state, state, state], axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + + while not done: + if render: + env.render() + time.sleep(0.05) + + # 바로 전 history를 입력으로 받아 행동을 선택 + action = agent.get_action(history) + # 1: 정지, 2: 왼쪽, 3: 오른쪽 + real_action = action_dict[action] + + # 죽었을 때 시작하기 위해 발사 행동을 함 + if dead: + action, real_action, dead = 0, 1, False + + # 선택한 행동으로 환경에서 한 타임스텝 진행 + observe, reward, done, info = env.step(real_action) + # 각 타임스텝마다 상태 전처리 + next_state = pre_processing(observe) + next_state = np.reshape([next_state], (1, 84, 84, 1)) + next_history = np.append(next_state, history[:, :, :, :3], axis=3) + + if start_life > info['ale.lives']: + dead, start_life = True, info['ale.lives'] + + score += reward + + if dead: + history = np.stack((next_state, next_state, + next_state, next_state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + else: + history = next_history + + if done: + # 각 에피소드 당 테스트 정보를 기록 + print("episode: {:3d} | score : {:4.1f}".format(e, score)) diff --git a/3-atari/1-breakout-dqn/train.py b/3-atari/1-breakout-dqn/train.py new file mode 100644 index 0000000..db0a03d --- /dev/null +++ b/3-atari/1-breakout-dqn/train.py @@ -0,0 +1,252 @@ +import os +import gym +import random +import numpy as np +import tensorflow as tf +from collections import deque + +from skimage.color import rgb2gray +from skimage.transform import resize + +from tensorflow.keras.optimizers import Adam +from tensorflow.keras.layers import Conv2D, Dense, Flatten + + +# 상태가 입력, 큐함수가 출력인 인공신경망 생성 +class DQN(tf.keras.Model): + def __init__(self, action_size, state_size): + super(DQN, self).__init__() + self.conv1 = Conv2D(32, (8, 8), strides=(4, 4), activation='relu', + input_shape=state_size) + self.conv2 = Conv2D(64, (4, 4), strides=(2, 2), activation='relu') + self.conv3 = Conv2D(64, (3, 3), strides=(1, 1), activation='relu') + self.flatten = Flatten() + self.fc = Dense(512, activation='relu') + self.fc_out = Dense(action_size) + + def call(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + x = self.flatten(x) + x = self.fc(x) + q = self.fc_out(x) + return q + + +# 브레이크아웃 예제에서의 DQN 에이전트 +class DQNAgent: + def __init__(self, action_size, state_size=(84, 84, 4)): + self.render = False + + # 상태와 행동의 크기 정의 + self.state_size = state_size + self.action_size = action_size + + # DQN 하이퍼파라미터 + self.discount_factor = 0.99 + self.learning_rate = 1e-4 + self.epsilon = 1. + self.epsilon_start, self.epsilon_end = 1.0, 0.02 + self.exploration_steps = 1000000. 
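# The two lines below spread the epsilon schedule linearly over the exploration
# steps: (1.0 - 0.02) / 1,000,000 = 9.8e-7 is subtracted from epsilon on every
# call to train_model, so epsilon reaches its floor of 0.02 after roughly one
# million training updates.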
+ self.epsilon_decay_step = self.epsilon_start - self.epsilon_end + self.epsilon_decay_step /= self.exploration_steps + self.batch_size = 32 + self.train_start = 50000 + self.update_target_rate = 10000 + + # 리플레이 메모리, 최대 크기 100,000 + self.memory = deque(maxlen=100000) + # 게임 시작 후 랜덤하게 움직이지 않는 것에 대한 옵션 + self.no_op_steps = 30 + + # 모델과 타깃 모델 생성 + self.model = DQN(action_size, state_size) + self.target_model = DQN(action_size, state_size) + self.optimizer = Adam(self.learning_rate, clipnorm=10.) + # 타깃 모델 초기화 + self.update_target_model() + + self.avg_q_max, self.avg_loss = 0, 0 + + self.writer = tf.summary.create_file_writer('summary/breakout_dqn') + self.model_path = os.path.join(os.getcwd(), 'save_model', 'model') + + # 타깃 모델을 모델의 가중치로 업데이트 + def update_target_model(self): + self.target_model.set_weights(self.model.get_weights()) + + # 입실론 탐욕 정책으로 행동 선택 + def get_action(self, history): + history = np.float32(history / 255.0) + if np.random.rand() <= self.epsilon: + return random.randrange(self.action_size) + else: + q_value = self.model(history) + return np.argmax(q_value[0]) + + # 샘플 을 리플레이 메모리에 저장 + def append_sample(self, history, action, reward, next_history, dead): + self.memory.append((history, action, reward, next_history, dead)) + + # 텐서보드에 학습 정보를 기록 + def draw_tensorboard(self, score, step, episode): + with self.writer.as_default(): + tf.summary.scalar('Total Reward/Episode', score, step=episode) + tf.summary.scalar('Average Max Q/Episode', + self.avg_q_max / float(step), step=episode) + tf.summary.scalar('Duration/Episode', step, step=episode) + tf.summary.scalar('Average Loss/Episode', + self.avg_loss / float(step), step=episode) + + # 리플레이 메모리에서 무작위로 추출한 배치로 모델 학습 + def train_model(self): + if self.epsilon > self.epsilon_end: + self.epsilon -= self.epsilon_decay_step + + # 메모리에서 배치 크기만큼 무작위로 샘플 추출 + batch = random.sample(self.memory, self.batch_size) + + history = np.array([sample[0][0] / 255. for sample in batch], + dtype=np.float32) + actions = np.array([sample[1] for sample in batch]) + rewards = np.array([sample[2] for sample in batch]) + next_history = np.array([sample[3][0] / 255. 
for sample in batch], + dtype=np.float32) + dones = np.array([sample[4] for sample in batch]) + + # 학습 파라메터 + model_params = self.model.trainable_variables + with tf.GradientTape() as tape: + # 현재 상태에 대한 모델의 큐함수 + predicts = self.model(history) + one_hot_action = tf.one_hot(actions, self.action_size) + predicts = tf.reduce_sum(one_hot_action * predicts, axis=1) + + # 다음 상태에 대한 타깃 모델의 큐함수 + target_predicts = self.target_model(next_history) + + # 벨만 최적 방정식을 구성하기 위한 타깃과 큐함수의 최대 값 계산 + max_q = np.amax(target_predicts, axis=1) + targets = rewards + (1 - dones) * self.discount_factor * max_q + + # 후버로스 계산 + error = tf.abs(targets - predicts) + quadratic_part = tf.clip_by_value(error, 0.0, 1.0) + linear_part = error - quadratic_part + loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part) + + self.avg_loss += loss.numpy() + + # 오류함수를 줄이는 방향으로 모델 업데이트 + grads = tape.gradient(loss, model_params) + self.optimizer.apply_gradients(zip(grads, model_params)) + + +# 학습속도를 높이기 위해 흑백화면으로 전처리 +def pre_processing(observe): + processed_observe = np.uint8( + resize(rgb2gray(observe), (84, 84), mode='constant') * 255) + return processed_observe + + +if __name__ == "__main__": + # 환경과 DQN 에이전트 생성 + env = gym.make('BreakoutDeterministic-v4') + agent = DQNAgent(action_size=3) + + global_step = 0 + score_avg = 0 + score_max = 0 + + # 불필요한 행동을 없애주기 위한 딕셔너리 선언 + action_dict = {0:1, 1:2, 2:3, 3:3} + + num_episode = 50000 + for e in range(num_episode): + done = False + dead = False + + step, score, start_life = 0, 0, 5 + # env 초기화 + observe = env.reset() + + # 랜덤으로 뽑힌 값 만큼의 프레임동안 움직이지 않음 + for _ in range(random.randint(1, agent.no_op_steps)): + observe, _, _, _ = env.step(1) + + # 프레임을 전처리 한 후 4개의 상태를 쌓아서 입력값으로 사용. + state = pre_processing(observe) + history = np.stack((state, state, state, state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + + while not done: + if agent.render: + env.render() + global_step += 1 + step += 1 + + # 바로 전 history를 입력으로 받아 행동을 선택 + action = agent.get_action(history) + # 1: 정지, 2: 왼쪽, 3: 오른쪽 + real_action = action_dict[action] + + # 죽었을 때 시작하기 위해 발사 행동을 함 + if dead: + action, real_action, dead = 0, 1, False + + # 선택한 행동으로 환경에서 한 타임스텝 진행 + observe, reward, done, info = env.step(real_action) + # 각 타임스텝마다 상태 전처리 + next_state = pre_processing(observe) + next_state = np.reshape([next_state], (1, 84, 84, 1)) + next_history = np.append(next_state, history[:, :, :, :3], axis=3) + + agent.avg_q_max += np.amax(agent.model(np.float32(history / 255.))[0]) + + if start_life > info['ale.lives']: + dead = True + start_life = info['ale.lives'] + + score += reward + reward = np.clip(reward, -1., 1.) 
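# Rewards are clipped to [-1, 1] in the DQN style so the scale of the TD error
# stays comparable across games, while the unclipped reward has already been
# added to `score` above for logging. This pairs with the Huber loss in
# train_model, which is quadratic for errors below 1 and linear beyond,
# limiting the size of any single gradient step.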
+ # 샘플 을 리플레이 메모리에 저장 후 학습 + agent.append_sample(history, action, reward, next_history, dead) + + # 리플레이 메모리 크기가 정해놓은 수치에 도달한 시점부터 모델 학습 시작 + if len(agent.memory) >= agent.train_start: + agent.train_model() + # 일정 시간마다 타겟모델을 모델의 가중치로 업데이트 + if global_step % agent.update_target_rate == 0: + agent.update_target_model() + + if dead: + history = np.stack((next_state, next_state, + next_state, next_state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + else: + history = next_history + + if done: + # 각 에피소드 당 학습 정보를 기록 + if global_step > agent.train_start: + agent.draw_tensorboard(score, step, e) + + score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score + score_max = score if score > score_max else score_max + + log = "episode: {:5d} | ".format(e) + log += "score: {:4.1f} | ".format(score) + log += "score max : {:4.1f} | ".format(score_max) + log += "score avg: {:4.1f} | ".format(score_avg) + log += "memory length: {:5d} | ".format(len(agent.memory)) + log += "epsilon: {:.3f} | ".format(agent.epsilon) + log += "q avg : {:3.2f} | ".format(agent.avg_q_max / float(step)) + log += "avg loss : {:3.2f}".format(agent.avg_loss / float(step)) + print(log) + + agent.avg_q_max, agent.avg_loss = 0, 0 + + # 1000 에피소드마다 모델 저장 + if e % 1000 == 0: + agent.model.save_weights("./save_model/model", save_format="tf") diff --git a/3-atari/1-breakout/breakout_a3c.py b/3-atari/1-breakout/breakout_a3c.py deleted file mode 100644 index f6db1c0..0000000 --- a/3-atari/1-breakout/breakout_a3c.py +++ /dev/null @@ -1,378 +0,0 @@ -from skimage.color import rgb2gray -from skimage.transform import resize -from keras.layers import Dense, Flatten, Input -from keras.layers.convolutional import Conv2D -from keras.optimizers import RMSprop -from keras import backend as K -from keras.models import Model -import tensorflow as tf -import numpy as np -import threading -import random -import time -import gym - -# 멀티쓰레딩을 위한 글로벌 변수 -global episode -episode = 0 -EPISODES = 8000000 -# 환경 생성 -env_name = "BreakoutDeterministic-v4" - - -# 브레이크아웃에서의 A3CAgent 클래스(글로벌신경망) -class A3CAgent: - def __init__(self, action_size): - # 상태크기와 행동크기를 갖고옴 - self.state_size = (84, 84, 4) - self.action_size = action_size - # A3C 하이퍼파라미터 - self.discount_factor = 0.99 - self.no_op_steps = 30 - self.actor_lr = 2.5e-4 - self.critic_lr = 2.5e-4 - # 쓰레드의 갯수 - self.threads = 8 - - # 정책신경망과 가치신경망을 생성 - self.actor, self.critic = self.build_model() - # 정책신경망과 가치신경망을 업데이트하는 함수 생성 - self.optimizer = [self.actor_optimizer(), self.critic_optimizer()] - - # 텐서보드 설정 - self.sess = tf.InteractiveSession() - K.set_session(self.sess) - self.sess.run(tf.global_variables_initializer()) - - self.summary_placeholders, self.update_ops, self.summary_op = \ - self.setup_summary() - self.summary_writer = \ - tf.summary.FileWriter('summary/breakout_a3c', self.sess.graph) - - # 쓰레드를 만들어 학습을 하는 함수 - def train(self): - # 쓰레드 수만큼 Agent 클래스 생성 - agents = [Agent(self.action_size, self.state_size, - [self.actor, self.critic], self.sess, - self.optimizer, self.discount_factor, - [self.summary_op, self.summary_placeholders, - self.update_ops, self.summary_writer]) - for _ in range(self.threads)] - - # 각 쓰레드 시작 - for agent in agents: - time.sleep(1) - agent.start() - - # 10분(600초)에 한번씩 모델을 저장 - while True: - time.sleep(60 * 10) - self.save_model("./save_model/breakout_a3c") - - # 정책신경망과 가치신경망을 생성 - def build_model(self): - input = Input(shape=self.state_size) - conv = Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(input) - conv = Conv2D(32, (4, 4), strides=(2, 2), 
activation='relu')(conv) - conv = Flatten()(conv) - fc = Dense(256, activation='relu')(conv) - - policy = Dense(self.action_size, activation='softmax')(fc) - value = Dense(1, activation='linear')(fc) - - actor = Model(inputs=input, outputs=policy) - critic = Model(inputs=input, outputs=value) - - # 가치와 정책을 예측하는 함수를 만들어냄 - actor._make_predict_function() - critic._make_predict_function() - - actor.summary() - critic.summary() - - return actor, critic - - # 정책신경망을 업데이트하는 함수 - def actor_optimizer(self): - action = K.placeholder(shape=[None, self.action_size]) - advantages = K.placeholder(shape=[None, ]) - - policy = self.actor.output - - # 정책 크로스 엔트로피 오류함수 - action_prob = K.sum(action * policy, axis=1) - cross_entropy = K.log(action_prob + 1e-10) * advantages - cross_entropy = -K.sum(cross_entropy) - - # 탐색을 지속적으로 하기 위한 엔트로피 오류 - entropy = K.sum(policy * K.log(policy + 1e-10), axis=1) - entropy = K.sum(entropy) - - # 두 오류함수를 더해 최종 오류함수를 만듬 - loss = cross_entropy + 0.01 * entropy - - optimizer = RMSprop(lr=self.actor_lr, rho=0.99, epsilon=0.01) - updates = optimizer.get_updates(self.actor.trainable_weights, [],loss) - train = K.function([self.actor.input, action, advantages], - [loss], updates=updates) - return train - - # 가치신경망을 업데이트하는 함수 - def critic_optimizer(self): - discounted_prediction = K.placeholder(shape=(None,)) - - value = self.critic.output - - # [반환값 - 가치]의 제곱을 오류함수로 함 - loss = K.mean(K.square(discounted_prediction - value)) - - optimizer = RMSprop(lr=self.critic_lr, rho=0.99, epsilon=0.01) - updates = optimizer.get_updates(self.critic.trainable_weights, [],loss) - train = K.function([self.critic.input, discounted_prediction], - [loss], updates=updates) - return train - - def load_model(self, name): - self.actor.load_weights(name + "_actor.h5") - self.critic.load_weights(name + "_critic.h5") - - def save_model(self, name): - self.actor.save_weights(name + "_actor.h5") - self.critic.save_weights(name + "_critic.h5") - - # 각 에피소드 당 학습 정보를 기록 - def setup_summary(self): - episode_total_reward = tf.Variable(0.) - episode_avg_max_q = tf.Variable(0.) - episode_duration = tf.Variable(0.) 
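# Recap of actor_optimizer above: the policy loss is the advantage-weighted
# log-probability of the chosen action, plus 0.01 times the entropy term. Note
# the sign convention: the code's `entropy` is sum(p * log p), i.e. the
# *negative* entropy, so adding it with a positive coefficient penalizes
# low-entropy (over-confident) policies and keeps the threads exploring.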
- - tf.summary.scalar('Total Reward/Episode', episode_total_reward) - tf.summary.scalar('Average Max Prob/Episode', episode_avg_max_q) - tf.summary.scalar('Duration/Episode', episode_duration) - - summary_vars = [episode_total_reward, - episode_avg_max_q, - episode_duration] - - summary_placeholders = [tf.placeholder(tf.float32) - for _ in range(len(summary_vars))] - update_ops = [summary_vars[i].assign(summary_placeholders[i]) - for i in range(len(summary_vars))] - summary_op = tf.summary.merge_all() - return summary_placeholders, update_ops, summary_op - - -# 액터러너 클래스(쓰레드) -class Agent(threading.Thread): - def __init__(self, action_size, state_size, model, sess, - optimizer, discount_factor, summary_ops): - threading.Thread.__init__(self) - - # A3CAgent 클래스에서 상속 - self.action_size = action_size - self.state_size = state_size - self.actor, self.critic = model - self.sess = sess - self.optimizer = optimizer - self.discount_factor = discount_factor - [self.summary_op, self.summary_placeholders, - self.update_ops, self.summary_writer] = summary_ops - - # 지정된 타임스텝동안 샘플을 저장할 리스트 - self.states, self.actions, self.rewards = [], [], [] - - # 로컬 모델 생성 - self.local_actor, self.local_critic = self.build_local_model() - - self.avg_p_max = 0 - self.avg_loss = 0 - - # 모델 업데이트 주기 - self.t_max = 20 - self.t = 0 - - def run(self): - global episode - env = gym.make(env_name) - - step = 0 - - while episode < EPISODES: - done = False - dead = False - - score, start_life = 0, 5 - observe = env.reset() - next_observe = observe - - # 0~30 상태동안 정지 - for _ in range(random.randint(1, 30)): - observe = next_observe - next_observe, _, _, _ = env.step(1) - - state = pre_processing(next_observe, observe) - history = np.stack((state, state, state, state), axis=2) - history = np.reshape([history], (1, 84, 84, 4)) - - while not done: - step += 1 - self.t += 1 - observe = next_observe - action, policy = self.get_action(history) - - # 1: 정지, 2: 왼쪽, 3: 오른쪽 - if action == 0: - real_action = 1 - elif action == 1: - real_action = 2 - else: - real_action = 3 - - # 죽었을 때 시작하기 위해 발사 행동을 함 - if dead: - action = 0 - real_action = 1 - dead = False - - # 선택한 행동으로 한 스텝을 실행 - next_observe, reward, done, info = env.step(real_action) - - # 각 타임스텝마다 상태 전처리 - next_state = pre_processing(next_observe, observe) - next_state = np.reshape([next_state], (1, 84, 84, 1)) - next_history = np.append(next_state, history[:, :, :, :3], - axis=3) - - # 정책의 최대값 - self.avg_p_max += np.amax(self.actor.predict( - np.float32(history / 255.))) - - if start_life > info['ale.lives']: - dead = True - start_life = info['ale.lives'] - - score += reward - reward = np.clip(reward, -1., 1.) 
- - # 샘플을 저장 - self.append_sample(history, action, reward) - - if dead: - history = np.stack((next_state, next_state, - next_state, next_state), axis=2) - history = np.reshape([history], (1, 84, 84, 4)) - else: - history = next_history - - # 에피소드가 끝나거나 최대 타임스텝 수에 도달하면 학습을 진행 - if self.t >= self.t_max or done: - self.train_model(done) - self.update_local_model() - self.t = 0 - - if done: - # 각 에피소드 당 학습 정보를 기록 - episode += 1 - print("episode:", episode, " score:", score, " step:", - step) - - stats = [score, self.avg_p_max / float(step), - step] - for i in range(len(stats)): - self.sess.run(self.update_ops[i], feed_dict={ - self.summary_placeholders[i]: float(stats[i]) - }) - summary_str = self.sess.run(self.summary_op) - self.summary_writer.add_summary(summary_str, episode + 1) - self.avg_p_max = 0 - self.avg_loss = 0 - step = 0 - - # k-스텝 prediction 계산 - def discounted_prediction(self, rewards, done): - discounted_prediction = np.zeros_like(rewards) - running_add = 0 - - if not done: - running_add = self.local_critic.predict(np.float32( - self.states[-1] / 255.))[0] - - for t in reversed(range(0, len(rewards))): - running_add = running_add * self.discount_factor + rewards[t] - discounted_prediction[t] = running_add - return discounted_prediction - - # 정책신경망과 가치신경망을 업데이트 - def train_model(self, done): - discounted_prediction = self.discounted_prediction(self.rewards, done) - - states = np.zeros((len(self.states), 84, 84, 4)) - for i in range(len(self.states)): - states[i] = self.states[i] - - states = np.float32(states / 255.) - - values = self.local_critic.predict(states) - values = np.reshape(values, len(values)) - - advantages = discounted_prediction - values - - self.optimizer[0]([states, self.actions, advantages]) - self.optimizer[1]([states, discounted_prediction]) - self.states, self.actions, self.rewards = [], [], [] - - # 로컬신경망을 생성하는 함수 - def build_local_model(self): - input = Input(shape=self.state_size) - conv = Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(input) - conv = Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv) - conv = Flatten()(conv) - fc = Dense(256, activation='relu')(conv) - policy = Dense(self.action_size, activation='softmax')(fc) - value = Dense(1, activation='linear')(fc) - - local_actor = Model(inputs=input, outputs=policy) - local_critic = Model(inputs=input, outputs=value) - - local_actor._make_predict_function() - local_critic._make_predict_function() - - local_actor.set_weights(self.actor.get_weights()) - local_critic.set_weights(self.critic.get_weights()) - - local_actor.summary() - local_critic.summary() - - return local_actor, local_critic - - # 로컬신경망을 글로벌신경망으로 업데이트 - def update_local_model(self): - self.local_actor.set_weights(self.actor.get_weights()) - self.local_critic.set_weights(self.critic.get_weights()) - - # 정책신경망의 출력을 받아서 확률적으로 행동을 선택 - def get_action(self, history): - history = np.float32(history / 255.) 
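# discounted_prediction (defined earlier in this class) builds k-step returns
# backwards: starting from the critic's value of the last stored state (0 if
# the episode ended) it repeatedly applies running_add = gamma * running_add + r_t.
# E.g. with gamma = 0.99, rewards [1, 0, 1] and a bootstrap value of 2:
#   t=2: 0.99*2      + 1 = 2.98
#   t=1: 0.99*2.98   + 0 = 2.9502
#   t=0: 0.99*2.9502 + 1 = 3.920698
# These returns minus the critic's values give the advantages for the actor
# update, while the returns themselves are the critic's regression targets.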
- policy = self.local_actor.predict(history)[0] - action_index = np.random.choice(self.action_size, 1, p=policy)[0] - return action_index, policy - - # 샘플을 저장 - def append_sample(self, history, action, reward): - self.states.append(history) - act = np.zeros(self.action_size) - act[action] = 1 - self.actions.append(act) - self.rewards.append(reward) - - -# 학습속도를 높이기 위해 흑백화면으로 전처리 -def pre_processing(next_observe, observe): - processed_observe = np.maximum(next_observe, observe) - processed_observe = np.uint8( - resize(rgb2gray(processed_observe), (84, 84), mode='constant') * 255) - return processed_observe - -if __name__ == "__main__": - global_agent = A3CAgent(action_size=3) - global_agent.train() diff --git a/3-atari/1-breakout/breakout_dqn.py b/3-atari/1-breakout/breakout_dqn.py deleted file mode 100644 index eafe83a..0000000 --- a/3-atari/1-breakout/breakout_dqn.py +++ /dev/null @@ -1,263 +0,0 @@ -from keras.layers.convolutional import Conv2D -from keras.layers import Dense, Flatten -from keras.optimizers import RMSprop -from keras.models import Sequential -from skimage.transform import resize -from skimage.color import rgb2gray -from collections import deque -from keras import backend as K -import tensorflow as tf -import numpy as np -import random -import gym - -EPISODES = 50000 - - -# 브레이크아웃에서의 DQN 에이전트 -class DQNAgent: - def __init__(self, action_size): - self.render = False - self.load_model = False - # 상태와 행동의 크기 정의 - self.state_size = (84, 84, 4) - self.action_size = action_size - # DQN 하이퍼파라미터 - self.epsilon = 1. - self.epsilon_start, self.epsilon_end = 1.0, 0.1 - self.exploration_steps = 1000000. - self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) \ - / self.exploration_steps - self.batch_size = 32 - self.train_start = 50000 - self.update_target_rate = 10000 - self.discount_factor = 0.99 - # 리플레이 메모리, 최대 크기 400000 - self.memory = deque(maxlen=400000) - self.no_op_steps = 30 - # 모델과 타겟모델을 생성하고 타겟모델 초기화 - self.model = self.build_model() - self.target_model = self.build_model() - self.update_target_model() - - self.optimizer = self.optimizer() - - # 텐서보드 설정 - self.sess = tf.InteractiveSession() - K.set_session(self.sess) - - self.avg_q_max, self.avg_loss = 0, 0 - self.summary_placeholders, self.update_ops, self.summary_op = \ - self.setup_summary() - self.summary_writer = tf.summary.FileWriter( - 'summary/breakout_dqn', self.sess.graph) - self.sess.run(tf.global_variables_initializer()) - - if self.load_model: - self.model.load_weights("./save_model/breakout_dqn.h5") - - # Huber Loss를 이용하기 위해 최적화 함수를 직접 정의 - def optimizer(self): - a = K.placeholder(shape=(None,), dtype='int32') - y = K.placeholder(shape=(None,), dtype='float32') - - prediction = self.model.output - - a_one_hot = K.one_hot(a, self.action_size) - q_value = K.sum(prediction * a_one_hot, axis=1) - error = K.abs(y - q_value) - - quadratic_part = K.clip(error, 0.0, 1.0) - linear_part = error - quadratic_part - loss = K.mean(0.5 * K.square(quadratic_part) + linear_part) - - optimizer = RMSprop(lr=0.00025, epsilon=0.01) - updates = optimizer.get_updates(self.model.trainable_weights, [], loss) - train = K.function([self.model.input, a, y], [loss], updates=updates) - - return train - - # 상태가 입력, 큐함수가 출력인 인공신경망 생성 - def build_model(self): - model = Sequential() - model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu', - input_shape=self.state_size)) - model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu')) - model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu')) - 
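# Shape check for the convolution stack above (valid padding):
# 84x84 -> (84-8)/4+1 = 20, 20x20 -> (20-4)/2+1 = 9, 9x9 -> (9-3)/1+1 = 7,
# so Flatten() below sees a 7x7x64 = 3136-dimensional feature vector feeding
# the 512-unit dense layer.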
model.add(Flatten()) - model.add(Dense(512, activation='relu')) - model.add(Dense(self.action_size)) - model.summary() - return model - - # 타겟 모델을 모델의 가중치로 업데이트 - def update_target_model(self): - self.target_model.set_weights(self.model.get_weights()) - - # 입실론 탐욕 정책으로 행동 선택 - def get_action(self, history): - history = np.float32(history / 255.0) - if np.random.rand() <= self.epsilon: - return random.randrange(self.action_size) - else: - q_value = self.model.predict(history) - return np.argmax(q_value[0]) - - # 샘플 을 리플레이 메모리에 저장 - def append_sample(self, history, action, reward, next_history, dead): - self.memory.append((history, action, reward, next_history, dead)) - - # 리플레이 메모리에서 무작위로 추출한 배치로 모델 학습 - def train_model(self): - if self.epsilon > self.epsilon_end: - self.epsilon -= self.epsilon_decay_step - - mini_batch = random.sample(self.memory, self.batch_size) - - history = np.zeros((self.batch_size, self.state_size[0], - self.state_size[1], self.state_size[2])) - next_history = np.zeros((self.batch_size, self.state_size[0], - self.state_size[1], self.state_size[2])) - target = np.zeros((self.batch_size,)) - action, reward, dead = [], [], [] - - for i in range(self.batch_size): - history[i] = np.float32(mini_batch[i][0] / 255.) - next_history[i] = np.float32(mini_batch[i][3] / 255.) - action.append(mini_batch[i][1]) - reward.append(mini_batch[i][2]) - dead.append(mini_batch[i][4]) - - target_value = self.target_model.predict(next_history) - - for i in range(self.batch_size): - if dead[i]: - target[i] = reward[i] - else: - target[i] = reward[i] + self.discount_factor * \ - np.amax(target_value[i]) - - loss = self.optimizer([history, action, target]) - self.avg_loss += loss[0] - - # 각 에피소드 당 학습 정보를 기록 - def setup_summary(self): - episode_total_reward = tf.Variable(0.) - episode_avg_max_q = tf.Variable(0.) - episode_duration = tf.Variable(0.) - episode_avg_loss = tf.Variable(0.) 
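# Recap of the target construction in train_model above: for terminal ("dead")
# samples the regression target is just the clipped reward, otherwise it is
# reward + 0.99 * max_a' Q_target(next_history, a'), i.e. the sampled Bellman
# optimality backup evaluated with the frozen target network. For example, a
# reward of 1.0 and a best target-network Q of 2.5 give a target of
# 1.0 + 0.99 * 2.5 = 3.475 for the taken action.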
- - tf.summary.scalar('Total Reward/Episode', episode_total_reward) - tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q) - tf.summary.scalar('Duration/Episode', episode_duration) - tf.summary.scalar('Average Loss/Episode', episode_avg_loss) - - summary_vars = [episode_total_reward, episode_avg_max_q, - episode_duration, episode_avg_loss] - summary_placeholders = [tf.placeholder(tf.float32) for _ in - range(len(summary_vars))] - update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in - range(len(summary_vars))] - summary_op = tf.summary.merge_all() - return summary_placeholders, update_ops, summary_op - - -# 학습속도를 높이기 위해 흑백화면으로 전처리 -def pre_processing(observe): - processed_observe = np.uint8( - resize(rgb2gray(observe), (84, 84), mode='constant') * 255) - return processed_observe - - -if __name__ == "__main__": - # 환경과 DQN 에이전트 생성 - env = gym.make('BreakoutDeterministic-v4') - agent = DQNAgent(action_size=3) - - scores, episodes, global_step = [], [], 0 - - for e in range(EPISODES): - done = False - dead = False - - step, score, start_life = 0, 0, 5 - observe = env.reset() - - for _ in range(random.randint(1, agent.no_op_steps)): - observe, _, _, _ = env.step(1) - - state = pre_processing(observe) - history = np.stack((state, state, state, state), axis=2) - history = np.reshape([history], (1, 84, 84, 4)) - - while not done: - if agent.render: - env.render() - global_step += 1 - step += 1 - - # 바로 전 4개의 상태로 행동을 선택 - action = agent.get_action(history) - # 1: 정지, 2: 왼쪽, 3: 오른쪽 - if action == 0: - real_action = 1 - elif action == 1: - real_action = 2 - else: - real_action = 3 - - # 선택한 행동으로 환경에서 한 타임스텝 진행 - observe, reward, done, info = env.step(real_action) - # 각 타임스텝마다 상태 전처리 - next_state = pre_processing(observe) - next_state = np.reshape([next_state], (1, 84, 84, 1)) - next_history = np.append(next_state, history[:, :, :, :3], axis=3) - - agent.avg_q_max += np.amax( - agent.model.predict(np.float32(history / 255.))[0]) - - if start_life > info['ale.lives']: - dead = True - start_life = info['ale.lives'] - - reward = np.clip(reward, -1., 1.) 
- # 샘플 을 리플레이 메모리에 저장 후 학습 - agent.append_sample(history, action, reward, next_history, dead) - - if len(agent.memory) >= agent.train_start: - agent.train_model() - - # 일정 시간마다 타겟모델을 모델의 가중치로 업데이트 - if global_step % agent.update_target_rate == 0: - agent.update_target_model() - - score += reward - - if dead: - dead = False - else: - history = next_history - - if done: - # 각 에피소드 당 학습 정보를 기록 - if global_step > agent.train_start: - stats = [score, agent.avg_q_max / float(step), step, - agent.avg_loss / float(step)] - for i in range(len(stats)): - agent.sess.run(agent.update_ops[i], feed_dict={ - agent.summary_placeholders[i]: float(stats[i]) - }) - summary_str = agent.sess.run(agent.summary_op) - agent.summary_writer.add_summary(summary_str, e + 1) - - print("episode:", e, " score:", score, " memory length:", - len(agent.memory), " epsilon:", agent.epsilon, - " global_step:", global_step, " average_q:", - agent.avg_q_max / float(step), " average loss:", - agent.avg_loss / float(step)) - - agent.avg_q_max, agent.avg_loss = 0, 0 - - # 1000 에피소드마다 모델 저장 - if e % 1000 == 0: - agent.model.save_weights("./save_model/breakout_dqn.h5") diff --git a/3-atari/1-breakout/play_a3c_model.py b/3-atari/1-breakout/play_a3c_model.py deleted file mode 100644 index bde3088..0000000 --- a/3-atari/1-breakout/play_a3c_model.py +++ /dev/null @@ -1,125 +0,0 @@ -import gym -import random -import numpy as np -from skimage.color import rgb2gray -from skimage.transform import resize -from keras.models import Model -from keras.layers import Dense, Flatten, Input -from keras.layers.convolutional import Conv2D - -global episode -episode = 0 -EPISODES = 100 -env_name = "BreakoutDeterministic-v4" - -class TestAgent: - def __init__(self, action_size): - self.state_size = (84, 84, 4) - self.action_size = action_size - - self.discount_factor = 0.99 - self.no_op_steps = 30 - - self.actor, self.critic = self.build_model() - - def build_model(self): - input = Input(shape=self.state_size) - conv = Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(input) - conv = Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv) - conv = Flatten()(conv) - fc = Dense(256, activation='relu')(conv) - policy = Dense(self.action_size, activation='softmax')(fc) - value = Dense(1, activation='linear')(fc) - - actor = Model(inputs=input, outputs=policy) - critic = Model(inputs=input, outputs=value) - - actor.summary() - critic.summary() - - return actor, critic - - def get_action(self, history): - history = np.float32(history / 255.) 
- policy = self.actor.predict(history)[0] - - action_index = np.argmax(policy) - return action_index - - def load_model(self, name): - self.actor.load_weights(name) - -def pre_processing(next_observe, observe): - processed_observe = np.maximum(next_observe, observe) - processed_observe = np.uint8( - resize(rgb2gray(processed_observe), (84, 84), mode='constant') * 255) - return processed_observe - - -if __name__ == "__main__": - env = gym.make(env_name) - agent = TestAgent(action_size=3) - agent.load_model("save_model/breakout_a3c_5_actor.h5") - - step = 0 - - while episode < EPISODES: - done = False - dead = False - - score, start_life = 0, 5 - observe = env.reset() - next_observe = observe - - for _ in range(random.randint(1, agent.no_op_steps)): - observe = next_observe - next_observe, _, _, _ = env.step(1) - - state = pre_processing(next_observe, observe) - history = np.stack((state, state, state, state), axis=2) - history = np.reshape([history], (1, 84, 84, 4)) - - while not done: - env.render() - step += 1 - observe = next_observe - - action = agent.get_action(history) - - if action == 1: - fake_action = 2 - elif action == 2: - fake_action = 3 - else: - fake_action = 1 - - if dead: - fake_action = 1 - dead = False - - next_observe, reward, done, info = env.step(fake_action) - - next_state = pre_processing(next_observe, observe) - next_state = np.reshape([next_state], (1, 84, 84, 1)) - next_history = np.append(next_state, history[:, :, :, :3], axis=3) - - if start_life > info['ale.lives']: - dead = True - reward = -1 - start_life = info['ale.lives'] - - score += reward - - # if agent is dead, then reset the history - if dead: - history = np.stack( - (next_state, next_state, next_state, next_state), axis=2) - history = np.reshape([history], (1, 84, 84, 4)) - else: - history = next_history - - # if done, plot the score over episodes - if done: - episode += 1 - print("episode:", episode, " score:", score, " step:", step) - step = 0 \ No newline at end of file diff --git a/3-atari/1-breakout/play_dqn_model.py b/3-atari/1-breakout/play_dqn_model.py deleted file mode 100644 index 2710d58..0000000 --- a/3-atari/1-breakout/play_dqn_model.py +++ /dev/null @@ -1,109 +0,0 @@ -import gym -import random -import numpy as np -import tensorflow as tf -from skimage.color import rgb2gray -from skimage.transform import resize -from keras.models import Sequential -from keras.layers import Dense, Flatten -from keras.layers.convolutional import Conv2D -from keras import backend as K - -EPISODES = 50000 - -class TestAgent: - def __init__(self, action_size): - self.state_size = (84, 84, 4) - self.action_size = action_size - self.no_op_steps = 20 - - self.model = self.build_model() - - self.sess = tf.InteractiveSession() - K.set_session(self.sess) - - self.avg_q_max, self.avg_loss = 0, 0 - self.sess.run(tf.global_variables_initializer()) - - def build_model(self): - model = Sequential() - model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu', - input_shape=self.state_size)) - model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu')) - model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu')) - model.add(Flatten()) - model.add(Dense(512, activation='relu')) - model.add(Dense(self.action_size)) - model.summary() - - return model - - def get_action(self, history): - if np.random.random() < 0.01: - return random.randrange(3) - history = np.float32(history / 255.0) - q_value = self.model.predict(history) - return np.argmax(q_value[0]) - - def load_model(self, filename): - 
self.model.load_weights(filename) - -def pre_processing(observe): - processed_observe = np.uint8( - resize(rgb2gray(observe), (84, 84), mode='constant') * 255) - return processed_observe - - -if __name__ == "__main__": - env = gym.make('BreakoutDeterministic-v4') - agent = TestAgent(action_size=3) - agent.load_model("./save_model/breakout_dqn_5.h5") - - for e in range(EPISODES): - done = False - dead = False - - step, score, start_life = 0, 0, 5 - observe = env.reset() - - for _ in range(random.randint(1, agent.no_op_steps)): - observe, _, _, _ = env.step(1) - - state = pre_processing(observe) - history = np.stack((state, state, state, state), axis=2) - history = np.reshape([history], (1, 84, 84, 4)) - - while not done: - env.render() - step += 1 - - action = agent.get_action(history) - - if action == 0: - real_action = 1 - elif action == 1: - real_action = 2 - else: - real_action = 3 - - if dead: - real_action = 1 - dead = False - - observe, reward, done, info = env.step(real_action) - - next_state = pre_processing(observe) - next_state = np.reshape([next_state], (1, 84, 84, 1)) - next_history = np.append(next_state, history[:, :, :, :3], axis=3) - - if start_life > info['ale.lives']: - dead = True - start_life = info['ale.lives'] - - score += reward - - history = next_history - - if done: - print("episode:", e, " score:", score) - diff --git a/3-atari/1-breakout/save_model/breakout_a3c_1_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_1_actor.h5 deleted file mode 100644 index 37a6a1a..0000000 Binary files a/3-atari/1-breakout/save_model/breakout_a3c_1_actor.h5 and /dev/null differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_1_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_1_critic.h5 deleted file mode 100644 index 3d3394a..0000000 Binary files a/3-atari/1-breakout/save_model/breakout_a3c_1_critic.h5 and /dev/null differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_2_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_2_actor.h5 deleted file mode 100644 index 21207c0..0000000 Binary files a/3-atari/1-breakout/save_model/breakout_a3c_2_actor.h5 and /dev/null differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_2_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_2_critic.h5 deleted file mode 100644 index a26f7d8..0000000 Binary files a/3-atari/1-breakout/save_model/breakout_a3c_2_critic.h5 and /dev/null differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_3_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_3_actor.h5 deleted file mode 100644 index a27e766..0000000 Binary files a/3-atari/1-breakout/save_model/breakout_a3c_3_actor.h5 and /dev/null differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_3_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_3_critic.h5 deleted file mode 100644 index 62236fc..0000000 Binary files a/3-atari/1-breakout/save_model/breakout_a3c_3_critic.h5 and /dev/null differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_4_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_4_actor.h5 deleted file mode 100644 index db855b2..0000000 Binary files a/3-atari/1-breakout/save_model/breakout_a3c_4_actor.h5 and /dev/null differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_4_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_4_critic.h5 deleted file mode 100644 index 3636d02..0000000 Binary files a/3-atari/1-breakout/save_model/breakout_a3c_4_critic.h5 and /dev/null differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_5_actor.h5 
b/3-atari/1-breakout/save_model/breakout_a3c_5_actor.h5 deleted file mode 100644 index a993bbc..0000000 Binary files a/3-atari/1-breakout/save_model/breakout_a3c_5_actor.h5 and /dev/null differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_5_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_5_critic.h5 deleted file mode 100644 index 983e6c6..0000000 Binary files a/3-atari/1-breakout/save_model/breakout_a3c_5_critic.h5 and /dev/null differ diff --git a/3-atari/1-breakout/save_model/breakout_dqn.h5 b/3-atari/1-breakout/save_model/breakout_dqn.h5 deleted file mode 100644 index fec0537..0000000 Binary files a/3-atari/1-breakout/save_model/breakout_dqn.h5 and /dev/null differ diff --git a/3-atari/1-breakout/save_model/breakout_dqn_1.h5 b/3-atari/1-breakout/save_model/breakout_dqn_1.h5 deleted file mode 100644 index c6e636a..0000000 Binary files a/3-atari/1-breakout/save_model/breakout_dqn_1.h5 and /dev/null differ diff --git a/3-atari/1-breakout/save_model/breakout_dqn_2.h5 b/3-atari/1-breakout/save_model/breakout_dqn_2.h5 deleted file mode 100644 index 85544cc..0000000 Binary files a/3-atari/1-breakout/save_model/breakout_dqn_2.h5 and /dev/null differ diff --git a/3-atari/1-breakout/save_model/breakout_dqn_3.h5 b/3-atari/1-breakout/save_model/breakout_dqn_3.h5 deleted file mode 100644 index 11bad3e..0000000 Binary files a/3-atari/1-breakout/save_model/breakout_dqn_3.h5 and /dev/null differ diff --git a/3-atari/1-breakout/save_model/breakout_dqn_4.h5 b/3-atari/1-breakout/save_model/breakout_dqn_4.h5 deleted file mode 100644 index f871888..0000000 Binary files a/3-atari/1-breakout/save_model/breakout_dqn_4.h5 and /dev/null differ diff --git a/3-atari/1-breakout/save_model/breakout_dqn_5.h5 b/3-atari/1-breakout/save_model/breakout_dqn_5.h5 deleted file mode 100644 index f82ad02..0000000 Binary files a/3-atari/1-breakout/save_model/breakout_dqn_5.h5 and /dev/null differ diff --git a/3-atari/1-breakout/summary/breakout_a3c/events.out.tfevents.1497264638 b/3-atari/1-breakout/summary/breakout_a3c/events.out.tfevents.1497264638 deleted file mode 100644 index 1eb4343..0000000 Binary files a/3-atari/1-breakout/summary/breakout_a3c/events.out.tfevents.1497264638 and /dev/null differ diff --git a/3-atari/1-breakout/summary/breakout_dqn/events.out.tfevents.1496968668.young-System-Product-Name b/3-atari/1-breakout/summary/breakout_dqn/events.out.tfevents.1496968668.young-System-Product-Name deleted file mode 100644 index 2e394ad..0000000 Binary files a/3-atari/1-breakout/summary/breakout_dqn/events.out.tfevents.1496968668.young-System-Product-Name and /dev/null differ diff --git a/3-atari/2-breakout-a3c/save_model/trained/checkpoint b/3-atari/2-breakout-a3c/save_model/trained/checkpoint new file mode 100644 index 0000000..a6e034f --- /dev/null +++ b/3-atari/2-breakout-a3c/save_model/trained/checkpoint @@ -0,0 +1,2 @@ +model_checkpoint_path: "model" +all_model_checkpoint_paths: "model" diff --git a/3-atari/2-breakout-a3c/save_model/trained/model.data-00000-of-00002 b/3-atari/2-breakout-a3c/save_model/trained/model.data-00000-of-00002 new file mode 100644 index 0000000..5ac027a Binary files /dev/null and b/3-atari/2-breakout-a3c/save_model/trained/model.data-00000-of-00002 differ diff --git a/3-atari/2-breakout-a3c/save_model/trained/model.data-00001-of-00002 b/3-atari/2-breakout-a3c/save_model/trained/model.data-00001-of-00002 new file mode 100644 index 0000000..8a3a04c Binary files /dev/null and b/3-atari/2-breakout-a3c/save_model/trained/model.data-00001-of-00002 differ diff --git 
a/3-atari/2-breakout-a3c/save_model/trained/model.index b/3-atari/2-breakout-a3c/save_model/trained/model.index
new file mode 100644
index 0000000..98da0da
Binary files /dev/null and b/3-atari/2-breakout-a3c/save_model/trained/model.index differ
diff --git a/3-atari/2-breakout-a3c/summary/breakout_a3c/events.out.tfevents.1583895598.cqcpu3.12464.143.v2 b/3-atari/2-breakout-a3c/summary/breakout_a3c/events.out.tfevents.1583895598.cqcpu3.12464.143.v2
new file mode 100644
index 0000000..beeeab0
Binary files /dev/null and b/3-atari/2-breakout-a3c/summary/breakout_a3c/events.out.tfevents.1583895598.cqcpu3.12464.143.v2 differ
diff --git a/3-atari/2-breakout-a3c/test.py b/3-atari/2-breakout-a3c/test.py
new file mode 100644
index 0000000..7787a24
--- /dev/null
+++ b/3-atari/2-breakout-a3c/test.py
@@ -0,0 +1,125 @@
+import gym
+import time
+import random
+import numpy as np
+import tensorflow as tf
+
+from skimage.color import rgb2gray
+from skimage.transform import resize
+
+from tensorflow.keras.layers import Conv2D, Flatten, Dense
+
+
+# ActorCritic 인공신경망
+class ActorCritic(tf.keras.Model):
+    def __init__(self, action_size, state_size):
+        super(ActorCritic, self).__init__()
+
+        self.conv1 = Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
+                            input_shape=state_size)
+        self.conv2 = Conv2D(64, (4, 4), strides=(2, 2), activation='relu')
+        self.conv3 = Conv2D(64, (3, 3), strides=(1, 1), activation='relu')
+        self.flatten = Flatten()
+        self.shared_fc = Dense(512, activation='relu')
+
+        self.policy = Dense(action_size, activation='linear')
+        self.value = Dense(1, activation='linear')
+
+    def call(self, x):
+        x = self.conv1(x)
+        x = self.conv2(x)
+        x = self.conv3(x)
+        x = self.flatten(x)
+        x = self.shared_fc(x)
+
+        policy = self.policy(x)
+        value = self.value(x)
+        return policy, value
+
+# 브레이크아웃에서의 테스트를 위한 A3C 에이전트 클래스
+
+
+class A3CTestAgent:
+    def __init__(self, action_size, state_size, model_path):
+        self.action_size = action_size
+
+        self.model = ActorCritic(action_size, state_size)
+        self.model.load_weights(model_path)
+
+    def get_action(self, history):
+        history = np.float32(history / 255.)
+        policy = self.model(history)[0][0]
+        policy = tf.nn.softmax(policy)
+        action_index = np.random.choice(self.action_size, 1, p=policy.numpy())[0]
+        return action_index, policy
+
+
+def pre_processing(observe):
+    processed_observe = np.uint8(
+        resize(rgb2gray(observe), (84, 84), mode='constant') * 255)
+    return processed_observe
+
+
+if __name__ == "__main__":
+    # 테스트를 위한 환경, 모델 생성
+    env = gym.make("BreakoutDeterministic-v4")
+    state_size = (84, 84, 4)
+    action_size = 3
+    model_path = './save_model/trained/model'
+    render = True
+
+    agent = A3CTestAgent(action_size, state_size, model_path)
+    action_dict = {0:1, 1:2, 2:3, 3:3}
+
+    num_episode = 10
+    for e in range(num_episode):
+        done = False
+        dead = False
+
+        score, start_life = 0, 5
+        observe = env.reset()
+
+        # 랜덤으로 뽑힌 값 만큼의 프레임동안 움직이지 않음
+        for _ in range(random.randint(1, 30)):
+            observe, _, _, _ = env.step(1)
+
+        # 프레임을 전처리 한 후 4개의 상태를 쌓아서 입력값으로 사용.
+        state = pre_processing(observe)
+        history = np.stack([state, state, state, state], axis=2)
+        history = np.reshape([history], (1, 84, 84, 4))
+
+        while not done:
+            if render:
+                env.render()
+                time.sleep(0.05)
+
+            # 정책 확률에 따라 행동을 선택
+            action, policy = agent.get_action(history)
+            # 1: 정지, 2: 왼쪽, 3: 오른쪽
+            real_action = action_dict[action]
+            # 죽었을 때 시작하기 위해 발사 행동을 함
+            if dead:
+                action, real_action, dead = 0, 1, False
+
+            # 선택한 행동으로 환경에서 한 타임스텝 진행
+            observe, reward, done, info = env.step(real_action)
+
+            # 각 타임스텝마다 상태 전처리
+            next_state = pre_processing(observe)
+            next_state = np.reshape([next_state], (1, 84, 84, 1))
+            next_history = np.append(next_state, history[:, :, :, :3], axis=3)
+
+            if start_life > info['ale.lives']:
+                dead, start_life = True, info['ale.lives']
+
+            score += reward
+
+            if dead:
+                history = np.stack((next_state, next_state,
+                                    next_state, next_state), axis=2)
+                history = np.reshape([history], (1, 84, 84, 4))
+            else:
+                history = next_history
+
+            if done:
+                # 각 에피소드 당 학습 정보를 기록
+                print("episode: {:3d} | score : {:4.1f}".format(e, score))
diff --git a/3-atari/2-breakout-a3c/train.py b/3-atari/2-breakout-a3c/train.py
new file mode 100644
index 0000000..dd01000
--- /dev/null
+++ b/3-atari/2-breakout-a3c/train.py
@@ -0,0 +1,309 @@
+import os
+import gym
+import time
+import threading
+import random
+import numpy as np
+import tensorflow as tf
+
+from skimage.color import rgb2gray
+from skimage.transform import resize
+
+from tensorflow.compat.v1.train import AdamOptimizer
+from tensorflow.keras.layers import Conv2D, Flatten, Dense
+
+# 멀티쓰레딩을 위한 글로벌 변수
+global episode, score_avg, score_max
+episode, score_avg, score_max = 0, 0, 0
+num_episode = 8000000
+
+
+# ActorCritic 인공신경망
+class ActorCritic(tf.keras.Model):
+    def __init__(self, action_size, state_size):
+        super(ActorCritic, self).__init__()
+
+        self.conv1 = Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
+                            input_shape=state_size)
+        self.conv2 = Conv2D(64, (4, 4), strides=(2, 2), activation='relu')
+        self.conv3 = Conv2D(64, (3, 3), strides=(1, 1), activation='relu')
+        self.flatten = Flatten()
+        self.shared_fc = Dense(512, activation='relu')
+
+        self.policy = Dense(action_size, activation='linear')
+        self.value = Dense(1, activation='linear')
+
+    def call(self, x):
+        x = self.conv1(x)
+        x = self.conv2(x)
+        x = self.conv3(x)
+        x = self.flatten(x)
+        x = self.shared_fc(x)
+
+        policy = self.policy(x)
+        value = self.value(x)
+        return policy, value
+
+
+# 브레이크아웃에서의 A3CAgent 클래스 (글로벌신경망)
+class A3CAgent():
+    def __init__(self, action_size, env_name):
+        self.env_name = env_name
+        # 상태와 행동의 크기 정의
+        self.state_size = (84, 84, 4)
+        self.action_size = action_size
+        # A3C 하이퍼파라미터
+        self.discount_factor = 0.99
+        self.no_op_steps = 30
+        self.lr = 1e-4
+        # 쓰레드의 갯수
+        self.threads = 16
+
+        # 글로벌 인공신경망 생성
+        self.global_model = ActorCritic(self.action_size, self.state_size)
+        # 글로벌 인공신경망의 가중치 초기화
+        self.global_model.build(tf.TensorShape((None, *self.state_size)))
+
+        # 인공신경망 업데이트하는 옵티마이저 함수 생성
+        self.optimizer = AdamOptimizer(self.lr, use_locking=True)
+
+        # 텐서보드 설정
+        self.writer = tf.summary.create_file_writer('summary/breakout_a3c')
+        # 학습된 글로벌신경망 모델을 저장할 경로 설정
+        self.model_path = os.path.join(os.getcwd(), 'save_model', 'model')
+
+    # 쓰레드를 만들어 학습을 하는 함수
+    def train(self):
+        # 쓰레드 수 만큼 Runner 클래스 생성
+        runners = [Runner(self.action_size, self.state_size,
+                          self.global_model, self.optimizer,
+                          self.discount_factor, self.env_name,
+                          self.writer) for i in range(self.threads)]
+
+        # 각 쓰레드 시작
+        for i, runner in enumerate(runners):
+            print("Start worker #{:d}".format(i))
+            runner.start()
+
+        # 10분 (600초)에 한 번씩 모델을 저장
+        while True:
+            self.global_model.save_weights(self.model_path, save_format="tf")
+            time.sleep(60 * 10)
+
+
+# 액터러너 클래스 (쓰레드)
+class Runner(threading.Thread):
+    global_episode = 0
+
+    def __init__(self, action_size, state_size, global_model,
+                 optimizer, discount_factor, env_name, writer):
+        threading.Thread.__init__(self)
+
+        # A3CAgent 클래스에서 넘겨준 하이퍼파라미터 설정
+        self.action_size = action_size
+        self.state_size = state_size
+        self.global_model = global_model
+        self.optimizer = optimizer
+        self.discount_factor = discount_factor
+
+        self.states, self.actions, self.rewards = [], [], []
+
+        # 환경, 로컬신경망, 텐서보드 생성
+        self.local_model = ActorCritic(action_size, state_size)
+        self.env = gym.make(env_name)
+        self.writer = writer
+
+        # 학습 정보를 기록할 변수
+        self.avg_p_max = 0
+        self.avg_loss = 0
+        # k-타임스텝 값 설정
+        self.t_max = 20
+        self.t = 0
+        # 불필요한 행동을 줄여주기 위한 dictionary
+        self.action_dict = {0:1, 1:2, 2:3, 3:3}
+
+    # 텐서보드에 학습 정보를 기록
+    def draw_tensorboard(self, score, step, e):
+        avg_p_max = self.avg_p_max / float(step)
+        with self.writer.as_default():
+            tf.summary.scalar('Total Reward/Episode', score, step=e)
+            tf.summary.scalar('Average Max Prob/Episode', avg_p_max, step=e)
+            tf.summary.scalar('Duration/Episode', step, step=e)
+
+    # 정책신경망의 출력을 받아 확률적으로 행동을 선택
+    def get_action(self, history):
+        history = np.float32(history / 255.)
+        policy = self.local_model(history)[0][0]
+        policy = tf.nn.softmax(policy)
+        action_index = np.random.choice(self.action_size, 1, p=policy.numpy())[0]
+        return action_index, policy
+
+    # 샘플을 저장
+    def append_sample(self, history, action, reward):
+        self.states.append(history)
+        act = np.zeros(self.action_size)
+        act[action] = 1
+        self.actions.append(act)
+        self.rewards.append(reward)
+
+    # k-타임스텝의 prediction 계산
+    def discounted_prediction(self, rewards, done):
+        discounted_prediction = np.zeros_like(rewards)
+        running_add = 0
+
+        if not done:
+            # value function
+            last_state = np.float32(self.states[-1] / 255.)
+            running_add = self.local_model(last_state)[-1][0].numpy()
+
+        for t in reversed(range(0, len(rewards))):
+            running_add = running_add * self.discount_factor + rewards[t]
+            discounted_prediction[t] = running_add
+        return discounted_prediction
+
+    # 저장된 샘플들로 A3C의 오류함수를 계산
+    def compute_loss(self, done):
+
+        discounted_prediction = self.discounted_prediction(self.rewards, done)
+        discounted_prediction = tf.convert_to_tensor(discounted_prediction[:, None],
+                                                     dtype=tf.float32)
+
+        states = np.zeros((len(self.states), 84, 84, 4))
+
+        for i in range(len(self.states)):
+            states[i] = self.states[i]
+        states = np.float32(states / 255.)
+
+        policy, values = self.local_model(states)
+
+        # 가치 신경망 업데이트
+        advantages = discounted_prediction - values
+        critic_loss = 0.5 * tf.reduce_sum(tf.square(advantages))
+
+        # 정책 신경망 업데이트
+        action = tf.convert_to_tensor(self.actions, dtype=tf.float32)
+        policy_prob = tf.nn.softmax(policy)
+        action_prob = tf.reduce_sum(action * policy_prob, axis=1, keepdims=True)
+        cross_entropy = - tf.math.log(action_prob + 1e-10)
+        actor_loss = tf.reduce_sum(cross_entropy * tf.stop_gradient(advantages))
+
+        entropy = tf.reduce_sum(policy_prob * tf.math.log(policy_prob + 1e-10), axis=1)
+        entropy = tf.reduce_sum(entropy)
+        actor_loss += 0.01 * entropy
+
+        total_loss = 0.5 * critic_loss + actor_loss
+
+        return total_loss
+
+    # 로컬신경망을 통해 그레이디언트를 계산하고, 글로벌 신경망을 계산된 그레이디언트로 업데이트
+    def train_model(self, done):
+
+        global_params = self.global_model.trainable_variables
+        local_params = self.local_model.trainable_variables
+
+        with tf.GradientTape() as tape:
+            total_loss = self.compute_loss(done)
+
+        # 로컬신경망의 그레이디언트 계산
+        grads = tape.gradient(total_loss, local_params)
+        # 안정적인 학습을 위한 그레이디언트 클리핑
+        grads, _ = tf.clip_by_global_norm(grads, 40.0)
+        # 로컬신경망의 오류함수를 줄이는 방향으로 글로벌신경망을 업데이트
+        self.optimizer.apply_gradients(zip(grads, global_params))
+        # 로컬신경망의 가중치를 글로벌신경망의 가중치로 업데이트
+        self.local_model.set_weights(self.global_model.get_weights())
+        # 업데이트 후 저장된 샘플 초기화
+        self.states, self.actions, self.rewards = [], [], []
+
+    def run(self):
+        # 액터러너끼리 공유해야하는 글로벌 변수
+        global episode, score_avg, score_max
+
+        step = 0
+        while episode < num_episode:
+            done = False
+            dead = False
+
+            score, start_life = 0, 5
+            observe = self.env.reset()
+
+            # 랜덤으로 뽑힌 값 만큼의 프레임동안 움직이지 않음
+            for _ in range(random.randint(1, 30)):
+                observe, _, _, _ = self.env.step(1)
+
+            # 프레임을 전처리 한 후 4개의 상태를 쌓아서 입력값으로 사용.
+            state = pre_processing(observe)
+            history = np.stack([state, state, state, state], axis=2)
+            history = np.reshape([history], (1, 84, 84, 4))
+
+            while not done:
+                step += 1
+                self.t += 1
+
+                # 정책 확률에 따라 행동을 선택
+                action, policy = self.get_action(history)
+                # 1: 정지, 2: 왼쪽, 3: 오른쪽
+                real_action = self.action_dict[action]
+                # 죽었을 때 시작하기 위해 발사 행동을 함
+                if dead:
+                    action, real_action, dead = 0, 1, False
+
+                # 선택한 행동으로 환경에서 한 타임스텝 진행
+                observe, reward, done, info = self.env.step(real_action)
+
+                # 각 타임스텝마다 상태 전처리
+                next_state = pre_processing(observe)
+                next_state = np.reshape([next_state], (1, 84, 84, 1))
+                next_history = np.append(next_state, history[:, :, :, :3], axis=3)
+
+                # 정책확률의 최대값
+                self.avg_p_max += np.amax(policy.numpy())
+
+                if start_life > info['ale.lives']:
+                    dead = True
+                    start_life = info['ale.lives']
+
+                score += reward
+                reward = np.clip(reward, -1., 1.)
+
+                # 샘플을 저장
+                self.append_sample(history, action, reward)
+
+                if dead:
+                    history = np.stack((next_state, next_state,
+                                        next_state, next_state), axis=2)
+                    history = np.reshape([history], (1, 84, 84, 4))
+                else:
+                    history = next_history
+
+                # 에피소드가 끝나거나 최대 타임스텝 수에 도달하면 학습을 진행
+                if self.t >= self.t_max or done:
+                    self.train_model(done)
+                    self.t = 0
+
+                if done:
+                    # 각 에피소드 당 학습 정보를 기록
+                    episode += 1
+                    score_max = score if score > score_max else score_max
+                    score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
+
+                    log = "episode: {:5d} | score : {:4.1f} | ".format(episode, score)
+                    log += "score max : {:4.1f} | ".format(score_max)
+                    log += "score avg : {:.3f}".format(score_avg)
+                    print(log)
+
+                    self.draw_tensorboard(score, step, episode)
+
+                    self.avg_p_max = 0
+                    step = 0
+
+
+# 학습속도를 높이기 위해 흑백화면으로 전처리
+def pre_processing(observe):
+    processed_observe = np.uint8(
+        resize(rgb2gray(observe), (84, 84), mode='constant') * 255)
+    return processed_observe
+
+
+if __name__ == "__main__":
+    global_agent = A3CAgent(action_size=3, env_name="BreakoutDeterministic-v4")
+    global_agent.train()
diff --git a/3-atari/LICENSE b/3-atari/LICENSE
deleted file mode 100644
index 5c61d8a..0000000
--- a/3-atari/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2017 Keon Kim
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
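The k-step return computed in Runner.discounted_prediction of the new train.py folds the stored rewards backwards through R_t = r_t + discount_factor * R_{t+1}, seeding the accumulator with the local critic's value of the last stored state when the episode has not yet terminated. A minimal standalone sketch of the same arithmetic, assuming NumPy only; the function name and the example rewards and bootstrap value below are illustrative and not part of the repository:

import numpy as np

def discounted_returns(rewards, bootstrap_value, discount_factor=0.99):
    # R_t = r_t + discount_factor * R_{t+1}, accumulated backwards;
    # bootstrap_value stands in for the critic's estimate of the state
    # after the last stored reward (use 0 if the episode ended).
    returns = np.zeros(len(rewards), dtype=np.float32)
    running_add = bootstrap_value
    for t in reversed(range(len(rewards))):
        running_add = running_add * discount_factor + rewards[t]
        returns[t] = running_add
    return returns

# Example: three clipped rewards and a hypothetical critic value of 0.5
print(discounted_returns([0.0, 1.0, 0.0], bootstrap_value=0.5))
# -> approximately [1.475, 1.490, 0.495]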