Commit d19d2f6 — Added CNN network for ddpg
1 parent f26da43
15 files changed: +255 −26 lines

Algorithms/ddpg/core.py  (+152)

@@ -2,6 +2,10 @@
 import torch.nn as nn
 import torch

+##########################################################################################################
+## MLP ACTOR-CRITIC ##
+##########################################################################################################
+
 def mlp(sizes, activation, output_activation=nn.Identity):
     '''
     Create a multi-layer perceptron model from input sizes and activations
@@ -87,6 +91,154 @@ def __init__(self, observation_space, action_space, hidden_sizes=(256, 256), act
         self.pi = MLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit).to(device)
         self.q = MLPCritic(obs_dim, act_dim, hidden_sizes, activation).to(device)

+    def act(self, obs):
+        with torch.no_grad():
+            return self.pi(obs).cpu().numpy()
+
+
+##########################################################################################################
+## CNN ACTOR-CRITIC ##
+##########################################################################################################
+
+def cnn(in_channels, conv_layer_sizes, activation, batchnorm=True):
+    '''
+    Create a Convolutional Neural Network from the given list of layer specifications.
+    Args:
+        in_channels (int): number of incoming channels
+        conv_layer_sizes (list): list of 3-tuples (output_channel, kernel_size, stride),
+            one per convolutional layer
+        activation (nn.modules.activation): activation function after each conv layer
+        batchnorm (bool): if True, add a BatchNorm2d layer after each activation layer
+    Returns:
+        nn.Sequential module for the CNN
+    '''
+    layers = []
+    for out_channel, kernel, stride in conv_layer_sizes:
+        layers += [nn.Conv2d(in_channels, out_channel, kernel, stride),
+                   activation()]
+        if batchnorm:
+            layers += [nn.BatchNorm2d(out_channel)]
+        in_channels = out_channel
+    return nn.Sequential(*layers)
+
+class CNNActor(nn.Module):
+    def __init__(self, obs_dim, act_dim, conv_layer_sizes, hidden_sizes, activation, act_limit):
+        '''
+        A Convolutional Neural Net for the Actor network.
+        Network architecture: (input) -> CNN -> MLP -> (output)
+        Assumes the input is an image of shape (H, W, C), e.g. (128, 128, 3)
+        Args:
+            obs_dim (tuple): observation dimension of the environment in the form (H, W, C)
+            act_dim (int): action dimension of the environment
+            conv_layer_sizes (list): list of 3-tuples (output_channel, kernel_size, stride)
+                that describes the CNN architecture
+            hidden_sizes (list): number of neurons in each layer of the MLP after the CNN output
+            activation (nn.modules.activation): activation function for each layer of the MLP
+            act_limit (float): the greatest magnitude possible for the action in the environment
+        '''
+        super().__init__()
+        self.pi_cnn = cnn(obs_dim[2], conv_layer_sizes, nn.ReLU, batchnorm=True)
+        self.start_dim = self.calc_shape(obs_dim, self.pi_cnn)
+        mlp_sizes = [self.start_dim] + list(hidden_sizes) + [act_dim]
+        self.pi_mlp = mlp(mlp_sizes, activation, output_activation=nn.Tanh)
+        self.act_limit = act_limit
+
+    def calc_shape(self, obs_dim, pi_cnn):
+        '''
+        Determine the flattened size of the data after the conv layers,
+        i.e. how many input neurons the MLP needs.
+        '''
+        H, W, C = obs_dim
+        dummy_input = torch.randn(1, C, H, W)
+        with torch.no_grad():
+            cnn_out = pi_cnn(dummy_input)
+        return cnn_out.view(-1).shape[0]
+
+    def forward(self, obs):
+        '''
+        Forward propagation for the actor network
+        Args:
+            obs (Tensor [n, C, H, W]): batch of observations from the environment
+        Returns:
+            output of the actor network scaled by act_limit
+        '''
+        obs = self.pi_cnn(obs)
+        obs = obs.view(-1, self.start_dim)
+        obs = self.pi_mlp(obs)
+        return obs * self.act_limit
+
+class CNNCritic(nn.Module):
+    def __init__(self, obs_dim, act_dim, conv_layer_sizes, hidden_sizes, activation):
+        '''
+        A Convolutional Neural Net for the Critic network
+        Args:
+            obs_dim (tuple): observation dimension of the environment in the form (H, W, C)
+            act_dim (int): action dimension of the environment
+            conv_layer_sizes (list): list of 3-tuples (output_channel, kernel_size, stride)
+                that describes the CNN architecture
+            hidden_sizes (list): number of neurons in each layer of the MLP
+            activation (nn.modules.activation): activation function for each layer of the MLP
+        '''
+        super().__init__()
+        self.q_cnn = cnn(obs_dim[2], conv_layer_sizes, nn.ReLU, batchnorm=True)
+        self.start_dim = self.calc_shape(obs_dim, self.q_cnn)
+        self.q_mlp = mlp([self.start_dim + act_dim] + list(hidden_sizes) + [1], activation)
+
+    def calc_shape(self, obs_dim, q_cnn):
+        '''
+        Determine the flattened size of the data after the conv layers,
+        i.e. how many input neurons the MLP needs.
+        '''
+        H, W, C = obs_dim
+        dummy_input = torch.randn(1, C, H, W)
+        with torch.no_grad():
+            cnn_out = q_cnn(dummy_input)
+        return cnn_out.view(-1).shape[0]
+
+    def forward(self, obs, act):
+        '''
+        Forward propagation for the critic network
+        Args:
+            obs (Tensor [n, C, H, W]): batch of observations from the environment
+            act (Tensor [n, act_dim]): batch of actions taken by the actor
+        '''
+        obs = self.q_cnn(obs)
+        obs = obs.view(-1, self.start_dim)
+        q = self.q_mlp(torch.cat([obs, act], dim=-1))
+        return torch.squeeze(q, -1)  # ensure q has the right shape
+
+class CNNActorCritic(nn.Module):
+    def __init__(self, observation_space, action_space, conv_layer_sizes, hidden_sizes=(256, 256), activation=nn.ReLU, device='cpu'):
+        '''
+        A Convolutional Neural Net for the Actor-Critic network
+        Args:
+            observation_space (gym.spaces): observation space of the environment
+            action_space (gym.spaces): action space of the environment
+            conv_layer_sizes (list): list of 3-tuples (output_channel, kernel_size, stride)
+                that describes the CNN architecture
+            hidden_sizes (tuple): number of neurons in each layer of the MLP
+            activation (nn.modules.activation): activation function for each layer of the MLP
+            device (str): whether to run the model on cpu or gpu
+        '''
+        super().__init__()
+        obs_dim = observation_space.shape
+        act_dim = action_space.shape[0]
+        act_limit = action_space.high[0]
+
+        # Create Actor and Critic networks
+        self.pi = CNNActor(obs_dim, act_dim, conv_layer_sizes, hidden_sizes, activation, act_limit).to(device)
+        self.q = CNNCritic(obs_dim, act_dim, conv_layer_sizes, hidden_sizes, activation).to(device)
+
     def act(self, obs):
         with torch.no_grad():
             return self.pi(obs).cpu().numpy()
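A quick sanity check of the new CNN path (not part of the commit): the sketch below mirrors the cnn()/calc_shape() logic above, using the conv specs from ddpg_config_cnn.json further down. build_cnn is an illustrative stand-in, not a repo function.

import torch
import torch.nn as nn

# Conv specs as in ddpg_config_cnn.json: (output_channel, kernel_size, stride)
conv_layer_sizes = [(16, 5, 2), (32, 5, 2), (64, 5, 2), (64, 3, 1)]

def build_cnn(in_channels, specs):
    # Mirrors core.cnn(): Conv2d -> ReLU -> BatchNorm2d per spec tuple
    layers = []
    for out_channel, kernel, stride in specs:
        layers += [nn.Conv2d(in_channels, out_channel, kernel, stride),
                   nn.ReLU(),
                   nn.BatchNorm2d(out_channel)]
        in_channels = out_channel
    return nn.Sequential(*layers)

net = build_cnn(3, conv_layer_sizes)
with torch.no_grad():
    out = net(torch.randn(1, 3, 128, 128))  # dummy (N, C, H, W) input, as in calc_shape
print(out.shape)           # torch.Size([1, 64, 11, 11]) for a 128x128 input
print(out.view(-1).shape)  # 64 * 11 * 11 = 7744 neurons feed the first MLP layer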

Algorithms/ddpg/ddpg.py  (+2 −2)

@@ -93,7 +93,7 @@ def __init__(self, env_fn, save_dir, actor_critic=MLPActorCritic, ac_kwargs=dict
         self.gamma = gamma
         self.tau = tau
         self.act_noise = act_noise
-        self.obs_dim = self.env.observation_space.shape[0]
+        # self.obs_dim = self.env.observation_space.shape[0]
         self.act_dim = self.env.action_space.shape[0]
         self.num_test_episodes = num_test_episodes
         self.max_ep_len = self.env.spec.max_episode_steps if self.env.spec.max_episode_steps is not None else max_ep_len
@@ -267,7 +267,7 @@ def load_weights(self, best=True, load_buffer=True):

             env_pkl_path = os.path.join(self.save_dir, "env.pickle")
             if os.path.isfile(env_pkl_path):
-                self.env = Normalize_Observation.load(env_pkl_path)
+                self.env = self.env.__class__.load(env_pkl_path)
                 print("Environment loaded")

             print('checkpoint loaded at {}'.format(checkpoint_path))
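The load_weights change above (repeated in ppo.py, td3.py and trpo.py below) works because every wrapper in Wrappers/ exposes the same classmethod load, so dispatching through self.env.__class__ restores whichever wrapper class actually wrote env.pickle. A minimal sketch of the pattern, with Wrapped as a hypothetical stand-in:

import pickle

class Wrapped:  # hypothetical stand-in for Normalize_Observation, RLBench_Wrapper, ...
    def save(self, fname):
        with open(fname, 'wb') as f:
            pickle.dump(self, f)

    @classmethod
    def load(cls, filename):
        with open(filename, 'rb') as f:
            return pickle.load(f)

env = Wrapped()
env.save('env.pickle')
env = env.__class__.load('env.pickle')  # resolves to Wrapped.load, no hard-coded class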

Algorithms/ddpg/ddpg_config_cnn.json  (+21)

@@ -0,0 +1,21 @@
+{
+    "ac_kwargs": {
+        "hidden_sizes": [512, 256],
+        "conv_layer_sizes": [[16, 5, 2],
+                             [32, 5, 2],
+                             [64, 5, 2],
+                             [64, 3, 1]]
+    },
+    "replay_size": 1e6,
+    "gamma": 0.99,
+    "tau": 0.995,
+    "pi_lr": 1e-3,
+    "q_lr": 1e-3,
+    "batch_size": 100,
+    "start_steps": 1000,
+    "update_after": 1000,
+    "update_every": 50,
+    "act_noise": 0.1,
+    "max_ep_len": 1000,
+    "save_freq": 1
+}
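For reference, assuming the 128x128 input mentioned in core.py and no padding (the Conv2d default), each layer in this config shrinks the spatial size as out = (in − kernel) // stride + 1; a tiny sketch to verify:

# Spatial size after each no-padding Conv2d: out = (in - kernel) // stride + 1
size = 128
for out_ch, kernel, stride in [(16, 5, 2), (32, 5, 2), (64, 5, 2), (64, 3, 1)]:
    size = (size - kernel) // stride + 1
    print(out_ch, 'x', size)  # 16 x 62, 32 x 29, 64 x 13, 64 x 11
# Final feature map: 64 channels of 11x11 = 7744 inputs to the [512, 256] MLP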

Algorithms/ppo/ppo.py  (+1 −1)

@@ -198,7 +198,7 @@ def load_weights(self, best=True):

             env_pkl_path = os.path.join(self.save_dir, "env.pickle")
             if os.path.isfile(env_pkl_path):
-                self.env = Normalize_Observation.load(env_pkl_path)
+                self.env = self.env.__class__.load(env_pkl_path)
                 print("Environment loaded")
             print('checkpoint loaded at {}'.format(checkpoint_path))
         else:
File renamed without changes.

Algorithms/td3/td3.py  (+1 −1)

@@ -289,7 +289,7 @@ def load_weights(self, best=True, load_buffer=True):

             env_pkl_path = os.path.join(self.save_dir, "env.pickle")
             if os.path.isfile(env_pkl_path):
-                self.env = Normalize_Observation.load(env_pkl_path)
+                self.env = self.env.__class__.load(env_pkl_path)
                 print("Environment loaded")

             print('checkpoint loaded at {}'.format(checkpoint_path))
File renamed without changes.

Algorithms/trpo/trpo.py  (+1 −1)

@@ -296,7 +296,7 @@ def load_weights(self, best=True):

             env_pkl_path = os.path.join(self.save_dir, "env.pickle")
             if os.path.isfile(env_pkl_path):
-                self.env = Normalize_Observation.load(env_pkl_path)
+                self.env = self.env.__class__.load(env_pkl_path)
                 print("Environment loaded")

             print('checkpoint loaded at {}'.format(checkpoint_path))

Wrappers/normalize_observation.py  (−1)

@@ -3,7 +3,6 @@
 import pickle
 from typing import Tuple

-K = n = Ex = Ex2 = 0.0
 class Running_Stat:
     '''
     Class to store variables required to compute 1st and 2nd order statistics

Wrappers/rlbench_wrapper.py  (+36)

@@ -0,0 +1,36 @@
+import gym
+import numpy as np
+import pickle
+from typing import Tuple
+
+class RLBench_Wrapper(gym.ObservationWrapper):
+    '''
+    Observation Wrapper for the RLBench environment to output only one of the
+    camera views during training/testing instead of a dictionary of all camera views
+    '''
+    def __init__(self, env, view):
+        '''
+        Args:
+            view (str): dictionary key specifying which camera view to use.
+                An RLBench observation comes as a dictionary with keys
+                ['state', 'left_shoulder_rgb', 'right_shoulder_rgb', 'wrist_rgb', 'front_rgb']
+        '''
+        super(RLBench_Wrapper, self).__init__(env)
+        self.view = view
+        self.observation_space = self.observation_space[view]
+
+    def reset(self, **kwargs):
+        observation = self.env.reset(**kwargs)
+        return self.observation(observation)
+
+    def observation(self, observation):
+        return observation[self.view]
+
+    def save(self, fname):
+        with open(fname, 'wb') as f:
+            pickle.dump(self, f)
+
+    @classmethod
+    def load(cls, filename):
+        with open(filename, 'rb') as f:
+            return pickle.load(f)
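A hypothetical usage sketch (the rlbench.gym import and env id follow RLBench's documented conventions, but are assumptions, not part of this commit):

import gym
import rlbench.gym  # registers the RLBench gym environments (assumed installed)
from Wrappers.rlbench_wrapper import RLBench_Wrapper

# Observations become a single (H, W, C) image instead of a dict of camera views
env = RLBench_Wrapper(gym.make('reach_target-vision-v0'), view='wrist_rgb')
obs = env.reset()
print(obs.shape)  # e.g. (128, 128, 3), matching the CNNActor input assumption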

Wrappers/serialize_env.py  (+20)

@@ -0,0 +1,20 @@
+import gym
+import numpy as np
+import pickle
+from typing import Tuple
+
+class Serialize_Env(gym.ObservationWrapper):
+    '''
+    Simple wrapper to add save and load functionality to an environment
+    '''
+    def __init__(self, env, training=True):
+        super(Serialize_Env, self).__init__(env)
+
+    def save(self, fname):
+        with open(fname, 'wb') as f:
+            pickle.dump(self, f)
+
+    @classmethod
+    def load(cls, filename):
+        with open(filename, 'rb') as f:
+            return pickle.load(f)
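A minimal round-trip sketch, assuming the wrapped environment itself is pickleable (true for simple classic-control envs, not guaranteed for every env):

import gym
from Wrappers.serialize_env import Serialize_Env

env = Serialize_Env(gym.make('Pendulum-v0'))
env.save('env.pickle')                       # pickles the whole wrapper + env
restored = Serialize_Env.load('env.pickle')  # same object graph back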

test.py  (+4 −8)

@@ -67,6 +67,10 @@ def parse_arguments():

 def main():
     args = parse_arguments()
+
+    save_dir = os.path.join("Model_Weights", args.env, args.agent.lower())
+    config_path = os.path.join(save_dir, args.agent.lower() + "_config.json")
+
     if args.agent.lower() == 'random':
         save_dir = os.path.join("Model_Weights", args.env) if args.gif else None
         if not os.path.isdir(save_dir):
@@ -77,8 +81,6 @@ def main():

     elif args.agent.lower() == 'ddpg':
         from Algorithms.ddpg.ddpg import DDPG
-        save_dir = os.path.join("Model_Weights", args.env, "ddpg")
-        config_path = os.path.join(save_dir, "ddpg_config.json")
         logger_kwargs = {
             "output_dir": save_dir
         }
@@ -89,8 +91,6 @@ def main():
         model.load_weights(load_buffer=False)
     elif args.agent.lower() == 'td3':
         from Algorithms.td3.td3 import TD3
-        save_dir = os.path.join("Model_Weights", args.env, "td3")
-        config_path = os.path.join(save_dir, "td3_config.json")
         logger_kwargs = {
             "output_dir": save_dir
         }
@@ -101,8 +101,6 @@ def main():
         model.load_weights(load_buffer=False)
     elif args.agent.lower() == 'trpo':
         from Algorithms.trpo.trpo import TRPO
-        save_dir = os.path.join("Model_Weights", args.env, "trpo")
-        config_path = os.path.join(save_dir, "trpo_config.json")
         logger_kwargs = {
             "output_dir": save_dir
         }
@@ -113,8 +111,6 @@ def main():
         model.load_weights()
     elif args.agent.lower() == 'ppo':
        from Algorithms.ppo.ppo import PPO
-        save_dir = os.path.join("Model_Weights", args.env, "ppo")
-        config_path = os.path.join(save_dir, "ppo_config.json")
         logger_kwargs = {
             "output_dir": save_dir
         }
