From 5e12336801a18bd001a942ac9a0f384703119cec Mon Sep 17 00:00:00 2001
From: Reuben Feinman
Date: Wed, 22 Jul 2015 14:23:52 -0700
Subject: [PATCH] Update weight initialization scheme in mlp.py

The sparse initialization scheme of Martens (2010) is considered
state-of-the-art in random weight initialization for MLPs. In this scheme we
hard-limit the number of non-zero incoming connection weights to each unit
(we used 15 in our experiments) and set the biases to 0 (or 0.5 for tanh
units). This keeps the units both highly differentiated and unsaturated,
avoiding the problem with dense initializations, where the connection
weights must all be scaled very small to prevent saturation, leading to poor
differentiation between units.
---
 code/mlp.py | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/code/mlp.py b/code/mlp.py
index 18f34e7c..4379dd14 100644
--- a/code/mlp.py
+++ b/code/mlp.py
@@ -66,10 +66,9 @@ def __init__(self, rng, input, n_in, n_out, W=None, b=None,
         self.input = input
         # end-snippet-1

-        # `W` is initialized with `W_values` which is uniformely sampled
-        # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden))
-        # for tanh activation function
-        # the output of uniform if converted using asarray to dtype
+        # Sparse initialization scheme from section 5 of Martens (2010):
+        # http://www.icml2010.org/papers/458.pdf
+        # the weight matrix is created with dtype
         # theano.config.floatX so that the code is runable on GPU
         # Note : optimal initialization of weights is dependent on the
         #        activation function used (among other things).
@@ -78,22 +77,25 @@ def __init__(self, rng, input, n_in, n_out, W=None, b=None,
         #        compared to tanh
         #        We have no info for other function, so we use the same as
         #        tanh.
+        num_connections = min(15, n_in)
         if W is None:
-            W_values = numpy.asarray(
-                rng.uniform(
-                    low=-numpy.sqrt(6. / (n_in + n_out)),
-                    high=numpy.sqrt(6. / (n_in + n_out)),
-                    size=(n_in, n_out)
-                ),
-                dtype=theano.config.floatX
-            )
+            indices = numpy.arange(n_in)
+            W_values = numpy.zeros((n_in, n_out), dtype=theano.config.floatX)
+            for i in range(n_out):
+                rng.shuffle(indices)
+                for j in indices[:num_connections]:
+                    W_values[j, i] = rng.normal(0.0, 0.8)
+
             if activation == theano.tensor.nnet.sigmoid:
                 W_values *= 4

             W = theano.shared(value=W_values, name='W', borrow=True)

         if b is None:
-            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
+            if activation == theano.tensor.tanh:
+                b_values = 0.5 * numpy.ones((n_out,), dtype=theano.config.floatX)
+            else:
+                b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
             b = theano.shared(value=b_values, name='b', borrow=True)

         self.W = W
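
Note (not part of the patch): for readers who want to see the idea outside of
the Theano HiddenLayer, below is a minimal standalone NumPy sketch of the same
sparse initialization. The function name sparse_init and its defaults are
illustrative assumptions, not code from this commit.

    import numpy

    def sparse_init(rng, n_in, n_out, num_connections=15, scale=0.8):
        """Illustrative sparse initialization (Martens, 2010, section 5).

        Each output unit (column) gets `num_connections` non-zero incoming
        weights drawn from a zero-mean Gaussian; all other entries are zero.
        """
        num_connections = min(num_connections, n_in)
        W = numpy.zeros((n_in, n_out), dtype='float32')
        for i in range(n_out):
            # choose which input units connect to output unit i
            picked = rng.choice(n_in, size=num_connections, replace=False)
            W[picked, i] = rng.normal(0.0, scale, size=num_connections)
        return W

    # usage: each column ends up with exactly 15 non-zero weights
    rng = numpy.random.RandomState(1234)
    W = sparse_init(rng, n_in=784, n_out=500)
    print((W != 0).sum(axis=0)[:5])   # -> [15 15 15 15 15]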