Gradient of MLPs #27996

Closed

Description

@jondo

Here is my gradient implementation for MLPs, and a test case. Would you be interested in a pull request to make this a BaseMultilayerPerceptron method?

gradient.py:

# Authors: Issam H. Laradji <issam.laradji@gmail.com>
#          Andreas Mueller
#          Jiyuan Qian
#          Robert Pollak <robert.pollak@jku.at>
# License: BSD 3 clause

import numpy as np
from sklearn.neural_network._base import DERIVATIVES
safe_sparse_dot = np.matmul  # dense stand-in for sklearn.utils.extmath.safe_sparse_dot


# Local backpropagation.
# Based on code from Scikit-Learn 1.3.2.
def get_gradient(mlp, X):
    """Gradient of the summed network outputs with respect to the inputs X."""

    # See BaseMultilayerPerceptron._fit:

    n_samples, n_features = X.shape

    # hidden_layer_sizes may be a plain int; normalize it as _fit does.
    hidden_layer_sizes = mlp.hidden_layer_sizes
    if not hasattr(hidden_layer_sizes, "__iter__"):
        hidden_layer_sizes = [hidden_layer_sizes]
    layer_units = [n_features] + list(hidden_layer_sizes) + [mlp.n_outputs_]

    # Initialize the per-layer activation and delta lists.
    activations = [X] + [None] * (len(layer_units) - 1)
    deltas = [None] * (len(activations) - 1)
    
    
    # See BaseMultilayerPerceptron._backprop:
        
    # Forward propagate
    activations = mlp._forward_pass(activations)
    
    # Backward propagate
    
    last = mlp.n_layers_ - 2
    
    # Seed the output layer with a gradient of one per output unit.
    # (With multiple outputs this yields the gradient of their sum.)
    #
    # The docstring of _backprop says:
    #> deltas are gradients of loss with respect to z
    #> in each layer, where z = wx + b is the value of a particular layer
    #> before passing through the activation function
    deltas[last] = np.ones(activations[-1].shape)
    
    inplace_derivative = DERIVATIVES[mlp.activation]
    # Iterate over the hidden layers, propagating the gradient backwards.
    for i in range(mlp.n_layers_ - 2, 0, -1):
        deltas[i - 1] = safe_sparse_dot(deltas[i], mlp.coefs_[i].T)
        # Multiplies deltas[i - 1] in place by the activation derivative
        # evaluated at activations[i].
        inplace_derivative(activations[i], deltas[i - 1])
    
    
    # Get the input gradient.
    first_layer = 0
    input_gradient = safe_sparse_dot(deltas[first_layer], mlp.coefs_[first_layer].T)

    return input_gradient
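
As a sanity check, the analytic gradient can be compared against a central finite-difference approximation of the prediction. Here is a minimal sketch; the helper name fd_gradient is mine and not part of the proposal:

import numpy as np


def fd_gradient(mlp, X, eps=1e-6):
    # Central finite differences of the (summed) outputs w.r.t. each feature.
    X = np.asarray(X, dtype=float)
    grad = np.zeros_like(X)
    for j in range(X.shape[1]):
        X_plus, X_minus = X.copy(), X.copy()
        X_plus[:, j] += eps
        X_minus[:, j] -= eps
        diff = mlp.predict(X_plus) - mlp.predict(X_minus)
        if diff.ndim > 1:
            # Multiple outputs: match get_gradient, which sums over them.
            diff = diff.sum(axis=1)
        grad[:, j] = diff / (2 * eps)
    return grad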

Test case:

import numpy as np
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt

from gradient import get_gradient


def plot(X, y, title):
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    ax.view_init(elev=30, azim=-100)
    ax.scatter(X[:,0], X[:,1], y)
    ax.set_xlabel('X0')
    ax.set_ylabel('X1')
    ax.set_zlabel('y')
    plt.title(title)
    ax.set_xlim(round(X[:,0].min()), round(X[:,0].max()))
    ax.set_ylim(round(X[:,1].min()), round(X[:,1].max()))
    ax.set_zlim(round(y.min()), round(y.max()))
    fig.tight_layout()
    

#%% Test two hidden layers
regr = MLPRegressor(hidden_layer_sizes=(2,1), solver='lbfgs', max_iter=200, random_state=4)

n = 12

X_1d = np.linspace(-1, 4, n)
X = np.array(np.meshgrid(X_1d, X_1d)).T.reshape(-1, 2)
y = np.maximum(X.min(axis=1), 0)
#plot(X, y, 'ground truth')

regr.fit(X, y)
yp = regr.predict(X)
plot(X, yp, 'prediction')
    
np.testing.assert_allclose(regr.predict([[2, 4], [-1, -1]]), [2, 0], atol=1e-4)

X_grad = np.array([[2, 4], [4, 2]])
gradient = get_gradient(regr, X_grad)
np.testing.assert_allclose(gradient, [[1, 0], [0, 1]], atol=1e-3)
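
With the finite-difference helper sketched above (my own addition, not part of gradient.py), the same points can be cross-checked numerically:

np.testing.assert_allclose(gradient, fd_gradient(regr, X_grad), atol=1e-4)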

[figure: the 'prediction' 3D scatter plot produced by the test script]

Originally posted by @jondo in #27465 (comment)
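
For concreteness, the proposed method could look roughly like the sketch below. The method name, docstring, and validation calls are assumptions for illustration, not a settled scikit-learn API:

from sklearn.utils.validation import check_is_fitted


# Hypothetical addition to BaseMultilayerPerceptron (illustration only):
def gradient(self, X):
    """Gradient of the summed outputs with respect to the inputs X.

    Returns an ndarray of shape (n_samples, n_features).
    """
    check_is_fitted(self)
    X = self._validate_data(X, reset=False)
    return get_gradient(self, X)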
