diff --git a/sklearn/neural_network/_base.py b/sklearn/neural_network/_base.py
index 7a1c17c6ff75e..7977a5702b638 100644
--- a/sklearn/neural_network/_base.py
+++ b/sklearn/neural_network/_base.py
@@ -75,6 +75,26 @@ def relu(X):
     return X
 
 
+def leaky_relu(X, alpha=0.01):
+    """Compute the leaky rectified linear unit function inplace.
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix}, shape (n_samples, n_features)
+        The input data.
+
+    alpha : float, default 0.01
+        The slope of the function for x < 0.
+
+    Returns
+    -------
+    X_new : {array-like, sparse matrix}, shape (n_samples, n_features)
+        The transformed data.
+    """
+    np.clip(X, alpha * X, np.finfo(X.dtype).max, out=X)
+    return X
+
+
 def softmax(X):
     """Compute the K-way softmax function inplace.
 
@@ -96,7 +116,7 @@ def softmax(X):
 
 
 ACTIVATIONS = {'identity': identity, 'tanh': tanh, 'logistic': logistic,
-               'relu': relu, 'softmax': softmax}
+               'relu': relu, 'leaky_relu': leaky_relu, 'softmax': softmax}
 
 
 def inplace_identity_derivative(Z, delta):
@@ -169,10 +189,32 @@ def inplace_relu_derivative(Z, delta):
     delta[Z == 0] = 0
 
 
+def inplace_leaky_relu_derivative(Z, delta, alpha=0.01):
+    """Apply the derivative of the leaky relu function.
+
+    It exploits the fact that the derivative is a simple function of the
+    output value of the leaky rectified linear units activation function.
+
+    Parameters
+    ----------
+    Z : {array-like, sparse matrix}, shape (n_samples, n_features)
+        The data which was output from the leaky rectified linear units
+        activation function during the forward pass.
+
+    delta : {array-like}, shape (n_samples, n_features)
+        The backpropagated error signal to be modified inplace.
+
+    alpha : float, default 0.01
+        The slope of the function for x < 0.
+    """
+    delta[Z < 0] *= alpha
+
+
 DERIVATIVES = {'identity': inplace_identity_derivative,
                'tanh': inplace_tanh_derivative,
                'logistic': inplace_logistic_derivative,
-               'relu': inplace_relu_derivative}
+               'relu': inplace_relu_derivative,
+               'leaky_relu': inplace_leaky_relu_derivative}
 
 
 def squared_loss(y_true, y_pred):
diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py
index c8c3e18708b9d..c8108fe2c00b0 100644
--- a/sklearn/neural_network/multilayer_perceptron.py
+++ b/sklearn/neural_network/multilayer_perceptron.py
@@ -419,7 +419,8 @@ def _validate_hyperparameters(self):
                              % self.n_iter_no_change)
 
         # raise ValueError if not registered
-        supported_activations = ('identity', 'logistic', 'tanh', 'relu')
+        supported_activations = ('identity', 'logistic', 'tanh',
+                                 'relu', 'leaky_relu')
         if self.activation not in supported_activations:
             raise ValueError("The activation '%s' is not supported. Supported "
                              "activations are %s." % (self.activation,
@@ -699,7 +700,8 @@ class MLPClassifier(BaseMultilayerPerceptron, ClassifierMixin):
         The ith element represents the number of neurons in the ith
         hidden layer.
 
-    activation : {'identity', 'logistic', 'tanh', 'relu'}, default 'relu'
+    activation : {'identity', 'logistic', 'tanh', 'relu', 'leaky_relu'},
+        default 'relu'
         Activation function for the hidden layer.
 
         - 'identity', no-op activation, useful to implement linear bottleneck,
@@ -713,6 +715,9 @@ class MLPClassifier(BaseMultilayerPerceptron, ClassifierMixin):
 
         - 'relu', the rectified linear unit function,
           returns f(x) = max(0, x)
+
+        - 'leaky_relu', the leaky rectified linear unit function,
+          returns f(x) = max(alpha*x, x)
 
     solver : {'lbfgs', 'sgd', 'adam'}, default 'adam'
         The solver for weight optimization.
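For reference, here is a minimal standalone sketch (plain NumPy, not part of the patch) of the forward and backward behaviour the new functions are meant to implement. The `leaky_relu` and `inplace_leaky_relu_derivative` names simply mirror the diff above; the bodies below are illustrative, not the patched scikit-learn code.

```python
import numpy as np


def leaky_relu(X, alpha=0.01):
    # f(x) = max(x, alpha * x); for 0 < alpha < 1 this is the same as
    # clipping each value from below at alpha * x, as the patch does
    # with np.clip.
    np.maximum(X, alpha * X, out=X)
    return X


def inplace_leaky_relu_derivative(Z, delta, alpha=0.01):
    # f'(x) = 1 for x > 0 and alpha for x < 0. Since leaky relu preserves
    # the sign of its input, Z < 0 marks exactly the inputs that were
    # negative, so the backpropagated error is scaled by alpha there.
    delta[Z < 0] *= alpha


X = np.array([[-2.0, -0.5, 0.0, 0.5, 2.0]])
Z = leaky_relu(X.copy())
print(Z)       # negative entries become alpha * x: [-0.02, -0.005, 0., 0.5, 2.]

delta = np.ones_like(Z)
inplace_leaky_relu_derivative(Z, delta)
print(delta)   # gradient scaled by alpha where Z < 0: [0.01, 0.01, 1., 1., 1.]
```

With the patch applied, `MLPClassifier(activation='leaky_relu')` should pass `_validate_hyperparameters`. Note that `alpha` stays at its default of 0.01, since `ACTIVATIONS` and `DERIVATIVES` are invoked without it and the estimators do not currently expose it as a hyperparameter.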