FIX fixes memory leak seen in PyPy in C losses #27670

Merged (7 commits) on Oct 31, 2023
3 changes: 3 additions & 0 deletions doc/whats_new/v1.4.rst
@@ -35,6 +35,9 @@ random sampling procedures.
solvers (when fit on the same data again). The amount of change depends on the
specified `tol`, for small values you will get more precise results.

- |Fix| fixes a memory leak seen in PyPy for estimators using the Cython loss functions.
:pr:`27670` by :user:`Guillaume Lemaitre <glemaitre>`.

Changes impacting all modules
-----------------------------

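For context on the changelog entry above, a minimal sketch of the calling convention this pull request moves to; the names below (half_squared_loss_kernel, loss) are illustrative only, not the scikit-learn API. The low-level routine fills a caller-provided buffer and returns nothing, so the per-call np.asarray(...) wrapping of output memoryviews, removed from the Cython methods in the diffs below, is no longer needed; only the Python wrapper returns the buffer.

import numpy as np

def half_squared_loss_kernel(y_true, raw_prediction, loss_out):
    # Stand-in for a Cython kernel: write the element-wise loss into the
    # caller-provided buffer in place and return nothing.
    np.multiply(0.5, (y_true - raw_prediction) ** 2, out=loss_out)

def loss(y_true, raw_prediction, loss_out=None):
    # Python-level wrapper: allocate the buffer only if the caller did not
    # pass one, delegate to the kernel, then hand the filled buffer back.
    if loss_out is None:
        loss_out = np.empty_like(raw_prediction)
    half_squared_loss_kernel(y_true, raw_prediction, loss_out)
    return loss_out

y_true = np.array([1.0, 2.0, 3.0])
raw_prediction = np.array([1.5, 2.0, 2.5])
print(loss(y_true, raw_prediction))  # [0.125 0.    0.125]

The wrapper/kernel split mirrors the division between sklearn/_loss/loss.py and the Cython template in the diffs below.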
57 changes: 11 additions & 46 deletions sklearn/_loss/_loss.pyx.tp
@@ -870,7 +870,9 @@ cdef class CyLossFunction:
floating_out[::1] loss_out, # OUT
int n_threads=1
):
"""Compute the pointwise loss value for each input.
"""Compute the point-wise loss value for each input.

The point-wise loss is written to `loss_out` and no array is returned.

Parameters
----------
@@ -884,11 +886,6 @@
A location into which the result is stored.
n_threads : int
Number of threads used by OpenMP (if any).

Returns
-------
loss : array of shape (n_samples,)
Element-wise loss function.
"""
pass

@@ -902,6 +899,8 @@
):
"""Compute gradient of loss w.r.t raw_prediction for each input.

The gradient is written to `gradient_out` and no array is returned.

Parameters
----------
y_true : array of shape (n_samples,)
Expand All @@ -914,11 +913,6 @@ cdef class CyLossFunction:
A location into which the result is stored.
n_threads : int
Number of threads used by OpenMP (if any).

Returns
-------
gradient : array of shape (n_samples,)
Element-wise gradients.
"""
pass

@@ -933,6 +927,9 @@
):
"""Compute loss and gradient of loss w.r.t raw_prediction.

The loss and gradient are written to `loss_out` and `gradient_out` and no arrays
are returned.

Parameters
----------
y_true : array of shape (n_samples,)
@@ -947,18 +944,9 @@
A location into which the gradient is stored.
n_threads : int
Number of threads used by OpenMP (if any).

Returns
-------
loss : array of shape (n_samples,)
Element-wise loss function.

gradient : array of shape (n_samples,)
Element-wise gradients.
"""
self.loss(y_true, raw_prediction, sample_weight, loss_out, n_threads)
self.gradient(y_true, raw_prediction, sample_weight, gradient_out, n_threads)
return np.asarray(loss_out), np.asarray(gradient_out)

def gradient_hessian(
self,
@@ -971,6 +959,9 @@
):
"""Compute gradient and hessian of loss w.r.t raw_prediction.

The gradient and hessian are written to `gradient_out` and `hessian_out` and no
arrays are returned.

Parameters
----------
y_true : array of shape (n_samples,)
@@ -985,14 +976,6 @@
A location into which the hessian is stored.
n_threads : int
Number of threads used by OpenMP (if any).

Returns
-------
gradient : array of shape (n_samples,)
Element-wise gradients.

hessian : array of shape (n_samples,)
Element-wise hessians.
"""
pass

@@ -1045,8 +1028,6 @@ cdef class {{name}}(CyLossFunction):
):
loss_out[i] = sample_weight[i] * {{closs}}(y_true[i], raw_prediction[i]{{with_param}})

return np.asarray(loss_out)

{{if closs_grad is not None}}
def loss_gradient(
self,
@@ -1077,7 +1058,6 @@ cdef class {{name}}(CyLossFunction):
loss_out[i] = sample_weight[i] * dbl2.val1
gradient_out[i] = sample_weight[i] * dbl2.val2

return np.asarray(loss_out), np.asarray(gradient_out)
{{endif}}

def gradient(
@@ -1103,8 +1083,6 @@
):
gradient_out[i] = sample_weight[i] * {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}})

return np.asarray(gradient_out)

def gradient_hessian(
self,
const floating_in[::1] y_true, # IN
@@ -1134,8 +1112,6 @@ cdef class {{name}}(CyLossFunction):
gradient_out[i] = sample_weight[i] * dbl2.val1
hessian_out[i] = sample_weight[i] * dbl2.val2

return np.asarray(gradient_out), np.asarray(hessian_out)

{{endfor}}


@@ -1216,8 +1192,6 @@ cdef class CyHalfMultinomialLoss(CyLossFunction):

free(p)

return np.asarray(loss_out)

def loss_gradient(
self,
const floating_in[::1] y_true, # IN
@@ -1278,8 +1252,6 @@ cdef class CyHalfMultinomialLoss(CyLossFunction):

free(p)

return np.asarray(loss_out), np.asarray(gradient_out)

def gradient(
self,
const floating_in[::1] y_true, # IN
@@ -1327,8 +1299,6 @@ cdef class CyHalfMultinomialLoss(CyLossFunction):

free(p)

return np.asarray(gradient_out)

def gradient_hessian(
self,
const floating_in[::1] y_true, # IN
Expand Down Expand Up @@ -1381,9 +1351,6 @@ cdef class CyHalfMultinomialLoss(CyLossFunction):

free(p)

return np.asarray(gradient_out), np.asarray(hessian_out)


# This method simplifies the implementation of hessp in linear models,
# i.e. the matrix-vector product of the full hessian, not only of the
# diagonal (in the classes) approximation as implemented above.
@@ -1434,5 +1401,3 @@ cdef class CyHalfMultinomialLoss(CyLossFunction):
gradient_out[i, k] = (proba_out[i, k] - (y_true[i] == k)) * sample_weight[i]

free(p)

return np.asarray(gradient_out), np.asarray(proba_out)
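The docstring changes above all describe the same in-place contract: each Cython method fills its *_out buffer and returns nothing, and the base class composes them (see loss_gradient in the first hunks). A rough, runnable Python stand-in for that structure, assuming a half squared error as the concrete loss; the class names below are hypothetical, not the real Cython types.

import numpy as np

class PointwiseLossSketch:
    # Base class: concrete losses fill the output buffers in place.
    def loss(self, y_true, raw_prediction, sample_weight, loss_out):
        raise NotImplementedError

    def gradient(self, y_true, raw_prediction, sample_weight, gradient_out):
        raise NotImplementedError

    def loss_gradient(self, y_true, raw_prediction, sample_weight, loss_out, gradient_out):
        # Default composition, as in CyLossFunction above: fill both buffers
        # by delegating, and return nothing.
        self.loss(y_true, raw_prediction, sample_weight, loss_out)
        self.gradient(y_true, raw_prediction, sample_weight, gradient_out)

class HalfSquaredErrorSketch(PointwiseLossSketch):
    def loss(self, y_true, raw_prediction, sample_weight, loss_out):
        np.multiply(0.5, (raw_prediction - y_true) ** 2, out=loss_out)
        if sample_weight is not None:
            loss_out *= sample_weight

    def gradient(self, y_true, raw_prediction, sample_weight, gradient_out):
        np.subtract(raw_prediction, y_true, out=gradient_out)
        if sample_weight is not None:
            gradient_out *= sample_weight

y_true = np.array([1.0, 2.0])
raw_prediction = np.array([1.5, 1.0])
loss_out = np.empty_like(raw_prediction)
gradient_out = np.empty_like(raw_prediction)
HalfSquaredErrorSketch().loss_gradient(y_true, raw_prediction, None, loss_out, gradient_out)
print(loss_out, gradient_out)  # loss: [0.125 0.5], gradient: [0.5 -1.0]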
15 changes: 10 additions & 5 deletions sklearn/_loss/loss.py
@@ -189,13 +189,14 @@ def loss(
if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
raw_prediction = raw_prediction.squeeze(1)

return self.closs.loss(
self.closs.loss(
y_true=y_true,
raw_prediction=raw_prediction,
sample_weight=sample_weight,
loss_out=loss_out,
n_threads=n_threads,
)
return loss_out

def loss_gradient(
self,
@@ -250,14 +251,15 @@ def loss_gradient(
if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
gradient_out = gradient_out.squeeze(1)

return self.closs.loss_gradient(
self.closs.loss_gradient(
y_true=y_true,
raw_prediction=raw_prediction,
sample_weight=sample_weight,
loss_out=loss_out,
gradient_out=gradient_out,
n_threads=n_threads,
)
return loss_out, gradient_out

def gradient(
self,
@@ -299,13 +301,14 @@ def gradient(
if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
gradient_out = gradient_out.squeeze(1)

return self.closs.gradient(
self.closs.gradient(
y_true=y_true,
raw_prediction=raw_prediction,
sample_weight=sample_weight,
gradient_out=gradient_out,
n_threads=n_threads,
)
return gradient_out

def gradient_hessian(
self,
@@ -363,14 +366,15 @@ def gradient_hessian(
if hessian_out.ndim == 2 and hessian_out.shape[1] == 1:
hessian_out = hessian_out.squeeze(1)

return self.closs.gradient_hessian(
self.closs.gradient_hessian(
y_true=y_true,
raw_prediction=raw_prediction,
sample_weight=sample_weight,
gradient_out=gradient_out,
hessian_out=hessian_out,
n_threads=n_threads,
)
return gradient_out, hessian_out

def __call__(self, y_true, raw_prediction, sample_weight=None, n_threads=1):
"""Compute the weighted average loss.
@@ -1075,14 +1079,15 @@ def gradient_proba(
elif proba_out is None:
proba_out = np.empty_like(gradient_out)

return self.closs.gradient_proba(
self.closs.gradient_proba(
y_true=y_true,
raw_prediction=raw_prediction,
sample_weight=sample_weight,
gradient_out=gradient_out,
proba_out=proba_out,
n_threads=n_threads,
)
return gradient_out, proba_out


class ExponentialLoss(BaseLoss):
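With the wrapper changes above, callers of the Python-level API still receive arrays; only the Cython layer stops returning them. A usage sketch against sklearn._loss.loss (a private, internal module, so treat the exact API as illustrative):

import numpy as np
from sklearn._loss.loss import HalfSquaredError  # internal module

loss = HalfSquaredError()
y_true = np.array([1.0, 2.0, 3.0])
raw_prediction = np.array([1.5, 2.0, 2.5])

# Pass a preallocated buffer: the wrapper fills it through the Cython kernel
# and returns it (per the `return loss_out` added above).
loss_out = np.empty_like(raw_prediction)
result = loss.loss(y_true=y_true, raw_prediction=raw_prediction, loss_out=loss_out)
print(result is loss_out, result)  # True [0.125 0.    0.125]

# Omit the buffer and the wrapper allocates one before returning it.
print(loss.loss(y_true=y_true, raw_prediction=raw_prediction))

The same pattern applies to gradient, loss_gradient, gradient_hessian, and gradient_proba, each returning the buffer(s) it filled.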
50 changes: 24 additions & 26 deletions sklearn/_loss/tests/test_loss.py
@@ -383,34 +383,32 @@ def test_loss_same_as_C_functions(loss, sample_weight):
out_g2 = np.empty_like(raw_prediction)
out_h1 = np.empty_like(raw_prediction)
out_h2 = np.empty_like(raw_prediction)
assert_allclose(
    loss.loss(
        y_true=y_true,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
        loss_out=out_l1,
    ),
    loss.closs.loss(
        y_true=y_true,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
        loss_out=out_l2,
    ),
)
assert_allclose(
    loss.gradient(
        y_true=y_true,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
        gradient_out=out_g1,
    ),
    loss.closs.gradient(
        y_true=y_true,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
        gradient_out=out_g2,
    ),
)
loss.loss(
    y_true=y_true,
    raw_prediction=raw_prediction,
    sample_weight=sample_weight,
    loss_out=out_l1,
)
loss.closs.loss(
    y_true=y_true,
    raw_prediction=raw_prediction,
    sample_weight=sample_weight,
    loss_out=out_l2,
)
assert_allclose(out_l1, out_l2)
loss.gradient(
    y_true=y_true,
    raw_prediction=raw_prediction,
    sample_weight=sample_weight,
    gradient_out=out_g1,
)
loss.closs.gradient(
    y_true=y_true,
    raw_prediction=raw_prediction,
    sample_weight=sample_weight,
    gradient_out=out_g2,
)
assert_allclose(out_g1, out_g2)
loss.closs.loss_gradient(
y_true=y_true,
raw_prediction=raw_prediction,