16 changes: 15 additions & 1 deletion sklearn/linear_model/_glm/glm.py
@@ -271,6 +271,12 @@ def fit(self, X, y, sample_weight=None):
else:
coef = np.zeros(n_features, dtype=loss_dtype)

# To save some memory, we preallocate a ndarray used as per row loss and
# gradient inside of LinearLoss, e.g. by LinearLoss.base_loss.gradient (and
# others).
Comment on lines +274 to +276
Is it worth being a bit more explicit?

Suggested change
# To save some memory, we preallocate a ndarray used as per row loss and
# gradient inside of LinearLoss, e.g. by LinearLoss.base_loss.gradient (and
# others).
# To save some memory, we preallocate two ndarrays used respectively
# as per row loss, gradient inside of LinearLoss by several methods
# e.g. by LinearLoss.base_loss.{loss,gradient,gradient_hessian_product}.

per_sample_loss_out = np.empty_like(y)
per_sample_gradient_out = np.empty_like(y)

# Algorithms for optimization:
# Note again that our losses implement 1/2 * deviance.
if solver == "lbfgs":
@@ -289,7 +295,15 @@ def fit(self, X, y, sample_weight=None):
"gtol": self.tol,
"ftol": 1e3 * np.finfo(float).eps,
},
args=(X, y, sample_weight, l2_reg_strength, n_threads),
args=(
X,
y,
sample_weight,
l2_reg_strength,
n_threads,
per_sample_loss_out,
per_sample_gradient_out,
),
)
self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
coef = opt_res.x
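For context, the pattern above, where the per-sample buffers are allocated once and then forwarded to the objective through the args tuple of scipy.optimize.minimize so that every L-BFGS iteration writes into the same memory, can be sketched in isolation. The following is a minimal standalone example with a plain least-squares objective, not the scikit-learn code itself; residual_buf plays the role of the preallocated arrays:

import numpy as np
from scipy.optimize import minimize

def loss_and_grad(w, X, y, residual_buf):
    # residual_buf is allocated once by the caller and reused on every call,
    # so no new (n_samples,) array is created per iteration.
    np.subtract(X @ w, y, out=residual_buf)
    loss = 0.5 * residual_buf @ residual_buf
    grad = X.T @ residual_buf
    return loss, grad

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = X @ np.array([1.0, -2.0, 0.5])
residual_buf = np.empty_like(y)
opt_res = minimize(
    loss_and_grad,
    np.zeros(X.shape[1]),
    method="L-BFGS-B",
    jac=True,
    args=(X, y, residual_buf),
)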
72 changes: 68 additions & 4 deletions sklearn/linear_model/_linear_loss.py
@@ -112,7 +112,16 @@ def _w_intercept_raw(self, coef, X):

return weights, intercept, raw_prediction

def loss(self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1):
def loss(
self,
coef,
X,
y,
sample_weight=None,
l2_reg_strength=0.0,
n_threads=1,
per_sample_loss_out=None,
):
"""Compute the loss as sum over point-wise losses.

Parameters
@@ -132,6 +141,10 @@ def loss(self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1)
L2 regularization strength
n_threads : int, default=1
Number of OpenMP threads to use.
per_sample_loss_out : None or C-contiguous array of shape (n_samples,), \
default=None
A location into which the per sample loss is stored. If None, a new array
might be created. Providing such an array can save a little memory.

Returns
-------
@@ -144,6 +157,7 @@ def loss(self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1)
y_true=y,
raw_prediction=raw_prediction,
sample_weight=sample_weight,
loss_out=per_sample_loss_out,
n_threads=n_threads,
)
loss = loss.sum()
@@ -152,7 +166,15 @@ def loss(self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1)
return loss + 0.5 * l2_reg_strength * norm2_w
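The per_sample_loss_out parameter documented here follows the usual NumPy out-argument convention: the caller may pass a preallocated array that receives the per-sample values, and a new array is only created when None is given. A hypothetical, self-contained illustration of that convention (a toy squared loss, not the actual base_loss implementation):

import numpy as np

def per_sample_squared_loss(y_true, y_pred, loss_out=None):
    # Write the per-sample losses into the supplied buffer if there is one,
    # otherwise allocate a fresh array of matching shape and dtype.
    if loss_out is None:
        loss_out = np.empty_like(y_true)
    np.subtract(y_pred, y_true, out=loss_out)
    np.square(loss_out, out=loss_out)
    return loss_out

y_true = np.array([0.0, 1.0, 2.0])
buf = np.empty_like(y_true)
per_sample_squared_loss(y_true, np.array([0.5, 1.0, 1.5]), loss_out=buf)
# buf now holds [0.25, 0.0, 0.25]; summing it gives the total loss.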

def loss_gradient(
self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1
self,
coef,
X,
y,
sample_weight=None,
l2_reg_strength=0.0,
n_threads=1,
per_sample_loss_out=None,
per_sample_gradient_out=None,
):
"""Computes the sum of loss and gradient w.r.t. coef.

@@ -173,6 +195,14 @@ def loss_gradient(
L2 regularization strength
n_threads : int, default=1
Number of OpenMP threads to use.
per_sample_loss_out : None or C-contiguous array of shape (n_samples,), \
default=None
A location into which the per sample loss is stored. If None, a new array
might be created. Providing such an array can save a little memory.
per_sample_gradient_out : None or C-contiguous array of shape (n_samples,) or
array of shape (n_samples, n_classes), default=None
A location into which the per sample gradient is stored. If None, a new
array might be created. Providing such an array can save a little memory.

Returns
-------
@@ -190,6 +220,8 @@ def loss_gradient(
y_true=y,
raw_prediction=raw_prediction,
sample_weight=sample_weight,
loss_out=per_sample_loss_out,
gradient_out=per_sample_gradient_out,
n_threads=n_threads,
)
loss = loss.sum()
@@ -213,7 +245,14 @@ def loss_gradient(
return loss, grad

def gradient(
self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1
self,
coef,
X,
y,
sample_weight=None,
l2_reg_strength=0.0,
n_threads=1,
per_sample_gradient_out=None,
):
"""Computes the gradient w.r.t. coef.

@@ -234,6 +273,10 @@ def gradient(
L2 regularization strength
n_threads : int, default=1
Number of OpenMP threads to use.
per_sample_gradient_out : None or C-contiguous array of shape (n_samples,) or
array of shape (n_samples, n_classes), default=None
A location into which the per sample gradient is stored. If None, a new
array might be created. Providing such an array can save a little memory.

Returns
-------
@@ -248,6 +291,7 @@ def gradient(
y_true=y,
raw_prediction=raw_prediction,
sample_weight=sample_weight,
gradient_out=per_sample_gradient_out,
n_threads=n_threads,
)

@@ -269,7 +313,15 @@ def gradient(
return grad

def gradient_hessian_product(
self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1
self,
coef,
X,
y,
sample_weight=None,
l2_reg_strength=0.0,
n_threads=1,
per_sample_gradient_out=None,
per_sample_hessian_out=None,
):
"""Computes gradient and hessp (hessian product function) w.r.t. coef.

@@ -290,6 +342,14 @@ def gradient_hessian_product(
L2 regularization strength
n_threads : int, default=1
Number of OpenMP threads to use.
per_sample_gradient_out : None or C-contiguous array of shape (n_samples,) or
array of shape (n_samples, n_classes), default=None
A location into which the per sample gradient is stored. If None, a new
array might be created. Providing such an array can save a little memory.
per_sample_hessian_out : None or C-contiguous array of shape (n_samples,) or
array of shape (n_samples, n_classes), default=None
A location into which the per sample hessian is stored. If None, a new
array might be created. Providing such an array can save a little memory.

Returns
-------
@@ -309,6 +369,8 @@ def gradient_hessian_product(
y_true=y,
raw_prediction=raw_prediction,
sample_weight=sample_weight,
gradient_out=per_sample_gradient_out,
hessian_out=per_sample_hessian_out,
n_threads=n_threads,
)
grad = np.empty_like(coef, dtype=weights.dtype)
@@ -356,6 +418,8 @@ def hessp(s):
y_true=y,
raw_prediction=raw_prediction,
sample_weight=sample_weight,
gradient_out=per_sample_gradient_out,
proba_out=per_sample_hessian_out,
n_threads=n_threads,
)
grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
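The gradient_hessian_product method above returns the gradient together with a hessp closure (hess = [gradient, hessp]), which is the form Newton-CG style solvers consume. A bare-bones sketch of that return convention for a plain ridge objective, as an illustration only and not the LinearModelLoss code:

import numpy as np

def gradient_hessp(w, X, y, l2_reg_strength=0.0):
    residual = X @ w - y
    grad = X.T @ residual + l2_reg_strength * w

    def hessp(s):
        # The Hessian of 0.5 * ||X @ w - y||**2 + 0.5 * l2 * ||w||**2 is
        # X.T @ X + l2 * I, so its product with s can be computed without
        # ever materializing the Hessian matrix.
        return X.T @ (X @ s) + l2_reg_strength * s

    return grad, hessp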
60 changes: 47 additions & 13 deletions sklearn/linear_model/_logistic.py
@@ -10,6 +10,7 @@
# Simon Wu <s8wu@uwaterloo.ca>
# Arthur Mensch <arthur.mensch@m4x.org

import functools
import numbers
import warnings

@@ -397,34 +398,67 @@ def _logistic_regression_path(
# reconstructs the 2d-array via w0.reshape((n_classes, -1), order="F").
# As w0 is F-contiguous, ravel(order="F") also avoids a copy.
w0 = w0.ravel(order="F")
loss = LinearModelLoss(
linear_loss = LinearModelLoss(
base_loss=HalfMultinomialLoss(n_classes=classes.size),
fit_intercept=fit_intercept,
)
target = Y_multi
if solver in "lbfgs":
func = loss.loss_gradient
elif solver == "newton-cg":
func = loss.loss
grad = loss.gradient
hess = loss.gradient_hessian_product # hess = [gradient, hessp]
warm_start_sag = {"coef": w0.T}
else:
target = y_bin
if solver == "lbfgs":
loss = LinearModelLoss(
linear_loss = LinearModelLoss(
base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept
)
func = loss.loss_gradient
elif solver == "newton-cg":
loss = LinearModelLoss(
linear_loss = LinearModelLoss(
base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept
)
func = loss.loss
grad = loss.gradient
hess = loss.gradient_hessian_product # hess = [gradient, hessp]
warm_start_sag = {"coef": np.expand_dims(w0, axis=1)}

if solver == "lbfgs":
# To save some memory, we preallocate a ndarray used as per row loss and
# gradient inside of LinearLoss, e.g. by LinearLoss.base_loss.gradient (and
# others).
per_sample_loss_out = np.empty_like(target)
if linear_loss.base_loss.is_multiclass:
per_sample_gradient_out = np.empty(
shape=(X.shape[0], classes.size), dtype=X.dtype, order="C"
)
else:
per_sample_gradient_out = np.empty_like(target, order="C")

func = functools.partial(
linear_loss.loss_gradient,
per_sample_loss_out=per_sample_loss_out,
per_sample_gradient_out=per_sample_gradient_out,
)
elif solver == "newton-cg":
# To save some memory, we preallocate a ndarray used as per row loss and
# gradient inside of LinearLoss, e.g. by LinearLoss.base_loss.gradient (and
# others).
per_sample_loss_out = np.empty_like(target)
if linear_loss.base_loss.is_multiclass:
per_sample_gradient_out = np.empty(
shape=(X.shape[0], classes.size), dtype=X.dtype, order="C"
)
else:
per_sample_gradient_out = np.empty_like(target, order="C")
Comment on lines +419 to +446
Can this be boiled down to this? Note that I have also specified dtype=X.dtype when creating per_sample_gradient_out and changed the comment.

Suggested change
if solver == "lbfgs":
# To save some memory, we preallocate a ndarray used as per row loss and
# gradient inside of LinearLoss, e.g. by LinearLoss.base_loss.gradient (and
# others).
per_sample_loss_out = np.empty_like(target)
if linear_loss.base_loss.is_multiclass:
per_sample_gradient_out = np.empty(
shape=(X.shape[0], classes.size), dtype=X.dtype, order="C"
)
else:
per_sample_gradient_out = np.empty_like(target, order="C")
func = functools.partial(
linear_loss.loss_gradient,
per_sample_loss_out=per_sample_loss_out,
per_sample_gradient_out=per_sample_gradient_out,
)
elif solver == "newton-cg":
# To save some memory, we preallocate a ndarray used as per row loss and
# gradient inside of LinearLoss, e.g. by LinearLoss.base_loss.gradient (and
# others).
per_sample_loss_out = np.empty_like(target)
if linear_loss.base_loss.is_multiclass:
per_sample_gradient_out = np.empty(
shape=(X.shape[0], classes.size), dtype=X.dtype, order="C"
)
else:
per_sample_gradient_out = np.empty_like(target, order="C")
# To save some memory, we preallocate two ndarrays used respectively
# as per row loss, gradient inside of LinearLoss by several methods
# e.g. by LinearLoss.base_loss.{loss,gradient,gradient_hessian_product}.
per_sample_loss_out = np.empty_like(target)
if linear_loss.base_loss.is_multiclass:
per_sample_gradient_out = np.empty(
shape=(X.shape[0], classes.size), dtype=X.dtype, order="C"
)
else:
per_sample_gradient_out = np.empty_like(target, dtype=X.dtype, order="C")
if solver == "lbfgs":
func = functools.partial(
linear_loss.loss_gradient,
per_sample_loss_out=per_sample_loss_out,
per_sample_gradient_out=per_sample_gradient_out,
)
elif solver == "newton-cg":
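On the dtype remark in the suggestion above: np.empty_like inherits the prototype's dtype unless an explicit dtype is passed, so a buffer created from the encoded target does not automatically match X.dtype. A standalone NumPy illustration of that behaviour (toy arrays, not code from this PR):

import numpy as np

target = np.array([0, 1, 1, 0], dtype=np.int64)     # stand-in for encoded labels
X = np.ones((4, 2), dtype=np.float32)

buf_default = np.empty_like(target)                  # inherits int64 from target
buf_matched = np.empty_like(target, dtype=X.dtype)   # float32, matching X
print(buf_default.dtype, buf_matched.dtype)          # int64 float32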

per_sample_hessian_out = np.empty_like(per_sample_gradient_out)

func = functools.partial(
linear_loss.loss,
per_sample_loss_out=per_sample_loss_out,
)
grad = functools.partial(
linear_loss.gradient, per_sample_gradient_out=per_sample_gradient_out
)
hess = functools.partial(
linear_loss.gradient_hessian_product, # hess = [gradient, hessp]
per_sample_gradient_out=per_sample_gradient_out,
per_sample_hessian_out=per_sample_hessian_out,
)
Comment on lines +456 to +460

Suggested change
hess = functools.partial(
linear_loss.gradient_hessian_product, # hess = [gradient, hessp]
per_sample_gradient_out=per_sample_gradient_out,
per_sample_hessian_out=per_sample_hessian_out,
)
# hess = [gradient, hessp]
hess = functools.partial(
linear_loss.gradient_hessian_product,
per_sample_gradient_out=per_sample_gradient_out,
per_sample_hessian_out=per_sample_hessian_out,
)
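The functools.partial calls in this hunk bind the preallocated buffers as keyword arguments once, so the solver keeps calling a plain function of the coefficients while every evaluation writes into the same arrays. A small standalone sketch of that binding pattern, with illustrative names rather than the scikit-learn objects:

import functools
import numpy as np

def loss_gradient(w, X, y, per_sample_gradient_out=None):
    if per_sample_gradient_out is None:
        per_sample_gradient_out = np.empty_like(y)
    # The per-sample residuals are written into the bound buffer in place.
    np.subtract(X @ w, y, out=per_sample_gradient_out)
    loss = 0.5 * per_sample_gradient_out @ per_sample_gradient_out
    grad = X.T @ per_sample_gradient_out
    return loss, grad

X = np.arange(6.0).reshape(3, 2)
y = np.array([1.0, 2.0, 3.0])
buf = np.empty_like(y)
func = functools.partial(loss_gradient, per_sample_gradient_out=buf)
loss, grad = func(np.zeros(2), X, y)  # buf is reused on this and every later call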


coefs = list()
n_iter = np.zeros(len(Cs), dtype=np.int32)
for i, C in enumerate(Cs):