
MAINT better fused type names in loss module #27330

Merged (1 commit) on Sep 11, 2023
12 changes: 8 additions & 4 deletions sklearn/_loss/_loss.pxd
@@ -1,13 +1,17 @@
# cython: language_level=3

# Fused types for y_true, y_pred, raw_prediction
ctypedef fused Y_DTYPE_C:
# Fused types for input like y_true, raw_prediction, sample_weights.
ctypedef fused floating_in:
double
float


# Fused types for gradient and hessian
ctypedef fused G_DTYPE_C:
# Fused types for output like gradient and hessian
# We use different fused types for input (floating_in) and output (floating_out), such
# that input and output can have different dtypes in the same function call. A single
# fused type can only take on one single value (type) for all arguments in one function
# call.
ctypedef fused floating_out:
double
float

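Aside (an illustrative sketch, not part of this diff): the reason for two separate fused types is that a single Cython fused type must resolve to one concrete type for all arguments of a given call. With distinct floating_in and floating_out, a caller can pass float32 inputs while writing into a float64 output. A minimal hypothetical function using the new names:

    cdef void cy_scale(
        const floating_in[::1] x,   # IN, may resolve to float or double
        floating_out[::1] out,      # OUT, resolved independently of x
    ) noexcept nogil:
        # Because x and out use different fused types, x=float32 together with
        # out=float64 is a valid specialization; with one shared fused type for
        # both arguments it would not be.
        cdef int i
        for i in range(x.shape[0]):
            out[i] = 2.0 * x[i]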
168 changes: 85 additions & 83 deletions sklearn/_loss/_loss.pyx.tp
@@ -268,8 +268,8 @@ cdef inline double log1pexp(double x) noexcept nogil:

cdef inline void sum_exp_minus_max(
const int i,
const Y_DTYPE_C[:, :] raw_prediction, # IN
Y_DTYPE_C *p # OUT
const floating_in[:, :] raw_prediction, # IN
floating_in *p # OUT
) noexcept nogil:
# Thread local buffers are used to store the results of this function via p.
# The results are stored as follows:
@@ -744,7 +744,7 @@ cdef inline double_pair cgrad_hess_half_binomial(
double raw_prediction
) noexcept nogil:
# with y_pred = expit(raw)
# hessian = y_pred * (1 - y_pred) = exp(raw) / (1 + exp(raw))**2
# hessian = y_pred * (1 - y_pred) = exp( raw) / (1 + exp( raw))**2
# = exp(-raw) / (1 + exp(-raw))**2
cdef double_pair gh
gh.val2 = exp(-raw_prediction) # used as temporary
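A quick check of the identity in that comment (not part of the diff): with y_pred = expit(raw) = 1 / (1 + exp(-raw)), one has 1 - y_pred = exp(-raw) / (1 + exp(-raw)), hence

    y_pred * (1 - y_pred) = exp(-raw) / (1 + exp(-raw))**2 = exp(raw) / (1 + exp(raw))**2,

which is exactly the quantity the snippet above builds from the temporary gh.val2 = exp(-raw_prediction).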
@@ -835,7 +835,9 @@ cdef class CyLossFunction:
"""
pass

cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil:
cdef double_pair cy_grad_hess(
self, double y_true, double raw_prediction
) noexcept nogil:
"""Compute gradient and hessian.

Gradient and hessian of loss w.r.t. raw_prediction for a single sample.
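For intuition (a hedged sketch, not from the diff; assuming the first field of double_pair is named val1, matching the val2 field used above): a concrete subclass fills both fields, e.g. for the half squared error loss 0.5 * (raw_prediction - y_true)**2 the override could look like:

    cdef double_pair cy_grad_hess(
        self, double y_true, double raw_prediction
    ) noexcept nogil:
        cdef double_pair gh
        gh.val1 = raw_prediction - y_true   # gradient of the loss w.r.t. raw_prediction
        gh.val2 = 1.0                       # hessian is constant for squared error
        return gh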
@@ -862,10 +864,10 @@ cdef class CyLossFunction:

def loss(
self,
const Y_DTYPE_C[::1] y_true, # IN
const Y_DTYPE_C[::1] raw_prediction, # IN
const Y_DTYPE_C[::1] sample_weight, # IN
G_DTYPE_C[::1] loss_out, # OUT
const floating_in[::1] y_true, # IN
const floating_in[::1] raw_prediction, # IN
const floating_in[::1] sample_weight, # IN
floating_out[::1] loss_out, # OUT
int n_threads=1
):
"""Compute the pointwise loss value for each input.
@@ -892,10 +894,10 @@ cdef class CyLossFunction:

def gradient(
self,
const Y_DTYPE_C[::1] y_true, # IN
const Y_DTYPE_C[::1] raw_prediction, # IN
const Y_DTYPE_C[::1] sample_weight, # IN
G_DTYPE_C[::1] gradient_out, # OUT
const floating_in[::1] y_true, # IN
const floating_in[::1] raw_prediction, # IN
const floating_in[::1] sample_weight, # IN
floating_out[::1] gradient_out, # OUT
int n_threads=1
):
"""Compute gradient of loss w.r.t raw_prediction for each input.
@@ -922,11 +924,11 @@ cdef class CyLossFunction:

def loss_gradient(
self,
const Y_DTYPE_C[::1] y_true, # IN
const Y_DTYPE_C[::1] raw_prediction, # IN
const Y_DTYPE_C[::1] sample_weight, # IN
G_DTYPE_C[::1] loss_out, # OUT
G_DTYPE_C[::1] gradient_out, # OUT
const floating_in[::1] y_true, # IN
const floating_in[::1] raw_prediction, # IN
const floating_in[::1] sample_weight, # IN
floating_out[::1] loss_out, # OUT
floating_out[::1] gradient_out, # OUT
int n_threads=1
):
"""Compute loss and gradient of loss w.r.t raw_prediction.
@@ -960,11 +962,11 @@ cdef class CyLossFunction:

def gradient_hessian(
self,
const Y_DTYPE_C[::1] y_true, # IN
const Y_DTYPE_C[::1] raw_prediction, # IN
const Y_DTYPE_C[::1] sample_weight, # IN
G_DTYPE_C[::1] gradient_out, # OUT
G_DTYPE_C[::1] hessian_out, # OUT
const floating_in[::1] y_true, # IN
const floating_in[::1] raw_prediction, # IN
const floating_in[::1] sample_weight, # IN
floating_out[::1] gradient_out, # OUT
floating_out[::1] hessian_out, # OUT
int n_threads=1
):
"""Compute gradient and hessian of loss w.r.t raw_prediction.
@@ -1022,10 +1024,10 @@ cdef class {{name}}(CyLossFunction):

def loss(
self,
const Y_DTYPE_C[::1] y_true, # IN
const Y_DTYPE_C[::1] raw_prediction, # IN
const Y_DTYPE_C[::1] sample_weight, # IN
G_DTYPE_C[::1] loss_out, # OUT
const floating_in[::1] y_true, # IN
const floating_in[::1] raw_prediction, # IN
const floating_in[::1] sample_weight, # IN
floating_out[::1] loss_out, # OUT
int n_threads=1
):
cdef:
@@ -1048,11 +1050,11 @@ cdef class {{name}}(CyLossFunction):
{{if closs_grad is not None}}
def loss_gradient(
self,
const Y_DTYPE_C[::1] y_true, # IN
const Y_DTYPE_C[::1] raw_prediction, # IN
const Y_DTYPE_C[::1] sample_weight, # IN
G_DTYPE_C[::1] loss_out, # OUT
G_DTYPE_C[::1] gradient_out, # OUT
const floating_in[::1] y_true, # IN
const floating_in[::1] raw_prediction, # IN
const floating_in[::1] sample_weight, # IN
floating_out[::1] loss_out, # OUT
floating_out[::1] gradient_out, # OUT
int n_threads=1
):
cdef:
@@ -1080,10 +1082,10 @@ cdef class {{name}}(CyLossFunction):

def gradient(
self,
const Y_DTYPE_C[::1] y_true, # IN
const Y_DTYPE_C[::1] raw_prediction, # IN
const Y_DTYPE_C[::1] sample_weight, # IN
G_DTYPE_C[::1] gradient_out, # OUT
const floating_in[::1] y_true, # IN
const floating_in[::1] raw_prediction, # IN
const floating_in[::1] sample_weight, # IN
floating_out[::1] gradient_out, # OUT
int n_threads=1
):
cdef:
@@ -1105,11 +1107,11 @@ cdef class {{name}}(CyLossFunction):

def gradient_hessian(
self,
const Y_DTYPE_C[::1] y_true, # IN
const Y_DTYPE_C[::1] raw_prediction, # IN
const Y_DTYPE_C[::1] sample_weight, # IN
G_DTYPE_C[::1] gradient_out, # OUT
G_DTYPE_C[::1] hessian_out, # OUT
const floating_in[::1] y_true, # IN
const floating_in[::1] raw_prediction, # IN
const floating_in[::1] sample_weight, # IN
floating_out[::1] gradient_out, # OUT
floating_out[::1] hessian_out, # OUT
int n_threads=1
):
cdef:
@@ -1158,18 +1160,18 @@ cdef class CyHalfMultinomialLoss(CyLossFunction):
# opposite are welcome.
def loss(
self,
const Y_DTYPE_C[::1] y_true, # IN
const Y_DTYPE_C[:, :] raw_prediction, # IN
const Y_DTYPE_C[::1] sample_weight, # IN
G_DTYPE_C[::1] loss_out, # OUT
const floating_in[::1] y_true, # IN
const floating_in[:, :] raw_prediction, # IN
const floating_in[::1] sample_weight, # IN
floating_out[::1] loss_out, # OUT
int n_threads=1
):
cdef:
int i, k
int n_samples = y_true.shape[0]
int n_classes = raw_prediction.shape[1]
Y_DTYPE_C max_value, sum_exps
Y_DTYPE_C* p # temporary buffer
floating_in max_value, sum_exps
floating_in* p # temporary buffer

# We assume n_samples > n_classes. In this case having the inner loop
# over n_classes is a good default.
@@ -1181,7 +1183,7 @@ cdef class CyHalfMultinomialLoss(CyLossFunction):
with nogil, parallel(num_threads=n_threads):
# Define private buffer variables as each thread might use its
# own.
p = <Y_DTYPE_C *> malloc(sizeof(Y_DTYPE_C) * (n_classes + 2))
p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

for i in prange(n_samples, schedule='static'):
sum_exp_minus_max(i, raw_prediction, p)
@@ -1197,7 +1199,7 @@ cdef class CyHalfMultinomialLoss(CyLossFunction):
free(p)
else:
with nogil, parallel(num_threads=n_threads):
p = <Y_DTYPE_C *> malloc(sizeof(Y_DTYPE_C) * (n_classes + 2))
p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

for i in prange(n_samples, schedule='static'):
sum_exp_minus_max(i, raw_prediction, p)
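The hunks above and below all touch the same parallel pattern; as an isolated sketch (illustrative excerpt only, with p declared as floating_in* in the function's cdef block as shown in the diff): each OpenMP thread allocates its own scratch buffer of the renamed floating_in type, uses it inside prange, and frees it afterwards:

    with nogil, parallel(num_threads=n_threads):
        # one private scratch buffer per thread
        p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))
        for i in prange(n_samples, schedule='static'):
            sum_exp_minus_max(i, raw_prediction, p)
            # ... read the per-thread results from p and write the output arrays ...
        free(p)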
@@ -1218,26 +1220,26 @@ cdef class CyHalfMultinomialLoss(CyLossFunction):

def loss_gradient(
self,
const Y_DTYPE_C[::1] y_true, # IN
const Y_DTYPE_C[:, :] raw_prediction, # IN
const Y_DTYPE_C[::1] sample_weight, # IN
G_DTYPE_C[::1] loss_out, # OUT
G_DTYPE_C[:, :] gradient_out, # OUT
const floating_in[::1] y_true, # IN
const floating_in[:, :] raw_prediction, # IN
const floating_in[::1] sample_weight, # IN
floating_out[::1] loss_out, # OUT
floating_out[:, :] gradient_out, # OUT
int n_threads=1
):
cdef:
int i, k
int n_samples = y_true.shape[0]
int n_classes = raw_prediction.shape[1]
Y_DTYPE_C max_value, sum_exps
Y_DTYPE_C* p # temporary buffer
floating_in max_value, sum_exps
floating_in* p # temporary buffer

if sample_weight is None:
# inner loop over n_classes
with nogil, parallel(num_threads=n_threads):
# Define private buffer variables as each thread might use its
# own.
p = <Y_DTYPE_C *> malloc(sizeof(Y_DTYPE_C) * (n_classes + 2))
p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

for i in prange(n_samples, schedule='static'):
sum_exp_minus_max(i, raw_prediction, p)
@@ -1256,7 +1258,7 @@ cdef class CyHalfMultinomialLoss(CyLossFunction):
free(p)
else:
with nogil, parallel(num_threads=n_threads):
p = <Y_DTYPE_C *> malloc(sizeof(Y_DTYPE_C) * (n_classes + 2))
p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

for i in prange(n_samples, schedule='static'):
sum_exp_minus_max(i, raw_prediction, p)
@@ -1280,25 +1282,25 @@ cdef class CyHalfMultinomialLoss(CyLossFunction):

def gradient(
self,
const Y_DTYPE_C[::1] y_true, # IN
const Y_DTYPE_C[:, :] raw_prediction, # IN
const Y_DTYPE_C[::1] sample_weight, # IN
G_DTYPE_C[:, :] gradient_out, # OUT
const floating_in[::1] y_true, # IN
const floating_in[:, :] raw_prediction, # IN
const floating_in[::1] sample_weight, # IN
floating_out[:, :] gradient_out, # OUT
int n_threads=1
):
cdef:
int i, k
int n_samples = y_true.shape[0]
int n_classes = raw_prediction.shape[1]
Y_DTYPE_C sum_exps
Y_DTYPE_C* p # temporary buffer
floating_in sum_exps
floating_in* p # temporary buffer

if sample_weight is None:
# inner loop over n_classes
with nogil, parallel(num_threads=n_threads):
# Define private buffer variables as each thread might use its
# own.
p = <Y_DTYPE_C *> malloc(sizeof(Y_DTYPE_C) * (n_classes + 2))
p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

for i in prange(n_samples, schedule='static'):
sum_exp_minus_max(i, raw_prediction, p)
@@ -1312,7 +1314,7 @@ cdef class CyHalfMultinomialLoss(CyLossFunction):
free(p)
else:
with nogil, parallel(num_threads=n_threads):
p = <Y_DTYPE_C *> malloc(sizeof(Y_DTYPE_C) * (n_classes + 2))
p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

for i in prange(n_samples, schedule='static'):
sum_exp_minus_max(i, raw_prediction, p)
@@ -1329,26 +1331,26 @@ cdef class CyHalfMultinomialLoss(CyLossFunction):

def gradient_hessian(
self,
const Y_DTYPE_C[::1] y_true, # IN
const Y_DTYPE_C[:, :] raw_prediction, # IN
const Y_DTYPE_C[::1] sample_weight, # IN
G_DTYPE_C[:, :] gradient_out, # OUT
G_DTYPE_C[:, :] hessian_out, # OUT
const floating_in[::1] y_true, # IN
const floating_in[:, :] raw_prediction, # IN
const floating_in[::1] sample_weight, # IN
floating_out[:, :] gradient_out, # OUT
floating_out[:, :] hessian_out, # OUT
int n_threads=1
):
cdef:
int i, k
int n_samples = y_true.shape[0]
int n_classes = raw_prediction.shape[1]
Y_DTYPE_C sum_exps
Y_DTYPE_C* p # temporary buffer
floating_in sum_exps
floating_in* p # temporary buffer

if sample_weight is None:
# inner loop over n_classes
with nogil, parallel(num_threads=n_threads):
# Define private buffer variables as each thread might use its
# own.
p = <Y_DTYPE_C *> malloc(sizeof(Y_DTYPE_C) * (n_classes + 2))
p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

for i in prange(n_samples, schedule='static'):
sum_exp_minus_max(i, raw_prediction, p)
@@ -1364,7 +1366,7 @@ cdef class CyHalfMultinomialLoss(CyLossFunction):
free(p)
else:
with nogil, parallel(num_threads=n_threads):
p = <Y_DTYPE_C *> malloc(sizeof(Y_DTYPE_C) * (n_classes + 2))
p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

for i in prange(n_samples, schedule='static'):
sum_exp_minus_max(i, raw_prediction, p)
@@ -1387,26 +1389,26 @@ cdef class CyHalfMultinomialLoss(CyLossFunction):
# diagonal (in the classes) approximation as implemented above.
def gradient_proba(
self,
const Y_DTYPE_C[::1] y_true, # IN
const Y_DTYPE_C[:, :] raw_prediction, # IN
const Y_DTYPE_C[::1] sample_weight, # IN
G_DTYPE_C[:, :] gradient_out, # OUT
G_DTYPE_C[:, :] proba_out, # OUT
const floating_in[::1] y_true, # IN
const floating_in[:, :] raw_prediction, # IN
const floating_in[::1] sample_weight, # IN
floating_out[:, :] gradient_out, # OUT
floating_out[:, :] proba_out, # OUT
int n_threads=1
):
cdef:
int i, k
int n_samples = y_true.shape[0]
int n_classes = raw_prediction.shape[1]
Y_DTYPE_C sum_exps
Y_DTYPE_C* p # temporary buffer
floating_in sum_exps
floating_in* p # temporary buffer

if sample_weight is None:
# inner loop over n_classes
with nogil, parallel(num_threads=n_threads):
# Define private buffer variables as each thread might use its
# own.
p = <Y_DTYPE_C *> malloc(sizeof(Y_DTYPE_C) * (n_classes + 2))
p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

for i in prange(n_samples, schedule='static'):
sum_exp_minus_max(i, raw_prediction, p)
@@ -1420,7 +1422,7 @@ cdef class CyHalfMultinomialLoss(CyLossFunction):
free(p)
else:
with nogil, parallel(num_threads=n_threads):
p = <Y_DTYPE_C *> malloc(sizeof(Y_DTYPE_C) * (n_classes + 2))
p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

for i in prange(n_samples, schedule='static'):
sum_exp_minus_max(i, raw_prediction, p)
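Finally, a usage-level sketch of what the two fused types allow (hypothetical: the class name and the mixed-precision call illustrate the private API and are not taken from this PR):

    import numpy as np
    from sklearn._loss._loss import CyHalfSquaredError  # generated from the template; name assumed

    loss = CyHalfSquaredError()
    y_true = np.array([0.0, 1.0, 2.0], dtype=np.float32)   # floating_in resolves to float
    raw = np.array([0.1, 0.8, 1.5], dtype=np.float32)      # floating_in resolves to float
    out = np.empty(3, dtype=np.float64)                    # floating_out resolves to double
    loss.loss(y_true=y_true, raw_prediction=raw, sample_weight=None,
              loss_out=out, n_threads=1)

Before this PR the same mixed-dtype calls were possible with Y_DTYPE_C and G_DTYPE_C; the rename to floating_in and floating_out only makes the input/output roles explicit.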