
Commit fe927e3

lucascolley and mdhaber authored
ENH: stats.boxcox_llf: vectorize for n-D arrays (scipy#21233)
* ENH: `stats.boxcox_llf`: vectorize for n-D arrays

Co-authored-by: Matt Haberland <mhaberla@calpoly.edu>
1 parent 78239d8 commit fe927e3
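For orientation, a minimal usage sketch (not part of the commit; the array shape, seed, and lambda value are arbitrary) of what the vectorized function computes for 2-D NumPy input, based on the new signature `boxcox_llf(lmb, data, *, axis=0, keepdims=False, nan_policy='propagate')` added below:

# Illustrative only: one log-likelihood per column when reducing along axis=0.
import numpy as np
from scipy import stats

rng = np.random.default_rng(12345)
data = rng.exponential(size=(100, 3))          # three columns of positive data

llf = stats.boxcox_llf(1.5, data, axis=0)      # shape (3,)

# Should agree with evaluating each column separately.
expected = np.asarray([stats.boxcox_llf(1.5, data[:, j]) for j in range(3)])
print(np.allclose(llf, expected))              # True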

File tree: 3 files changed, +74 -18 lines changed

scipy/stats/_morestats.py

Lines changed: 51 additions & 17 deletions
@@ -854,25 +854,28 @@ def ppcc_plot(x, a, b, dist='tukeylambda', plot=None, N=80):
     return svals, ppcc
 
 
-def _log_mean(logx):
+def _log_mean(logx, axis):
     # compute log of mean of x from log(x)
-    res = special.logsumexp(logx, axis=0) - math.log(logx.shape[0])
-    return res
+    return (
+        special.logsumexp(logx, axis=axis, keepdims=True)
+        - math.log(logx.shape[axis])
+    )
 
 
-def _log_var(logx, xp):
+def _log_var(logx, xp, axis):
     # compute log of variance of x from log(x)
-    logmean = _log_mean(logx)
+    logmean = _log_mean(logx, axis=axis)
     # get complex dtype with component dtypes same as `logx` dtype;
     dtype = xp.result_type(logx.dtype, 1j)
     pij = xp.full(logx.shape, pi * 1j, dtype=dtype)
     logxmu = special.logsumexp(xp.stack((logx, logmean + pij)), axis=0)
-    res = (xp.real(xp.asarray(special.logsumexp(2 * logxmu, axis=0)))
-           - math.log(logx.shape[0]))
-    return res
+    return (
+        xp.real(xp.asarray(special.logsumexp(2 * logxmu, axis=axis)))
+        - math.log(logx.shape[axis])
+    )
 
 
-def boxcox_llf(lmb, data):
+def boxcox_llf(lmb, data, *, axis=0, keepdims=False, nan_policy='propagate'):
     r"""The boxcox log-likelihood function.
 
     Parameters
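The rewritten `_log_mean` relies on the identity log(mean(x)) = logsumexp(log x, axis) - log n, now evaluated along an arbitrary axis with keepdims=True so the result broadcasts against `logx` inside `_log_var`. A standalone check of that identity (illustrative, not from the commit; the array values and axis are arbitrary):

import numpy as np
from scipy.special import logsumexp

rng = np.random.default_rng(0)
x = rng.uniform(0.1, 10.0, size=(4, 5))
logx = np.log(x)

axis = 1
# log of the mean computed entirely in log space, as _log_mean does
log_mean = logsumexp(logx, axis=axis, keepdims=True) - np.log(x.shape[axis])
print(np.allclose(log_mean, np.log(np.mean(x, axis=axis, keepdims=True))))  # True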
@@ -883,6 +886,26 @@ def boxcox_llf(lmb, data):
         Data to calculate Box-Cox log-likelihood for. If `data` is
         multi-dimensional, the log-likelihood is calculated along the first
         axis.
+    axis : int, default: 0
+        If an int, the axis of the input along which to compute the statistic.
+        The statistic of each axis-slice (e.g. row) of the input will appear in a
+        corresponding element of the output.
+        If ``None``, the input will be raveled before computing the statistic.
+    nan_policy : {'propagate', 'omit', 'raise'}
+        Defines how to handle input NaNs.
+
+        - ``propagate``: if a NaN is present in the axis slice (e.g. row) along
+          which the statistic is computed, the corresponding entry of the output
+          will be NaN.
+        - ``omit``: NaNs will be omitted when performing the calculation.
+          If insufficient data remains in the axis slice along which the
+          statistic is computed, the corresponding entry of the output will be
+          NaN.
+        - ``raise``: if a NaN is present, a ``ValueError`` will be raised.
+    keepdims : bool, default: False
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the input array.
 
     Returns
     -------
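A sketch of how the documented keywords compose for NumPy input (illustrative, not from the commit; per the comment in the next hunk, `nan_policy` and `keepdims` are only forwarded to the decorated helper, which currently supports them for NumPy arrays only). The data values and lmb=2.0 are arbitrary:

import numpy as np
from scipy import stats

data = np.array([[1.0, 2.0, 3.0, 4.0],
                 [2.0, np.nan, 6.0, 8.0]])

# Default nan_policy='propagate': the row containing NaN produces NaN.
print(stats.boxcox_llf(2.0, data, axis=1))

# nan_policy='omit': the NaN is dropped before computing that row's statistic.
print(stats.boxcox_llf(2.0, data, axis=1, nan_policy='omit'))

# keepdims=True keeps the reduced axis as size one: shape (2, 1) here,
# so the result broadcasts against `data`.
print(stats.boxcox_llf(2.0, data, axis=1, keepdims=True).shape)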
@@ -955,28 +978,39 @@ def boxcox_llf(lmb, data):
     >>> plt.show()
 
     """
+    # _axis_nan_policy decorator does not currently support these for non-NumPy arrays
+    kwargs = {}
+    if keepdims is not False:
+        kwargs['keepdims'] = keepdims
+    if nan_policy != 'propagate':
+        kwargs['nan_policy'] = nan_policy
+    return _boxcox_llf(data, lmb=lmb, axis=axis, **kwargs)
+
+
+@_axis_nan_policy_factory(lambda x: x, n_outputs=1, default_axis=0,
+                          result_to_tuple=lambda x: (x,))
+def _boxcox_llf(data, axis=0, *, lmb):
     xp = array_namespace(data)
     data = xp_promote(data, force_floating=True, xp=xp)
-
-    N = data.shape[0]
+    N = data.shape[axis]
     if N == 0:
-        return xp.nan
+        return _get_nan(data, xp=xp)
 
     logdata = xp.log(data)
 
     # Compute the variance of the transformed data.
     if lmb == 0:
-        logvar = xp.log(xp.var(logdata, axis=0))
+        logvar = xp.log(xp.var(logdata, axis=axis))
     else:
         # Transform without the constant offset 1/lmb. The offset does
         # not affect the variance, and the subtraction of the offset can
         # lead to loss of precision.
         # Division by lmb can be factored out to enhance numerical stability.
         logx = lmb * logdata
-        logvar = _log_var(logx, xp) - 2 * math.log(abs(lmb))
+        logvar = _log_var(logx, xp, axis) - 2 * math.log(abs(lmb))
 
-    res = (lmb - 1) * xp.sum(logdata, axis=0) - N/2 * logvar
-    res = xp.astype(res, data.dtype, copy=False)
+    res = (lmb - 1) * xp.sum(logdata, axis=axis) - N/2 * logvar
+    res = xp.astype(res, data.dtype)
     res = res[()] if res.ndim == 0 else res
     return res
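The refactored core still evaluates the Box-Cox log-likelihood exactly as written in the last lines of the hunk, llf = (lmb - 1) * sum(log x) - N/2 * log(var(y)) with y the transformed data; the log-space helpers above exist only to compute log(var(y)) without overflow or cancellation. A naive reference evaluation for comparison (illustrative, ignores those stability concerns; the data and lambda values are arbitrary):

import numpy as np
from scipy import stats

def boxcox_llf_naive(lmb, x, axis=0):
    # Direct transcription of the formula, transforming the data explicitly.
    x = np.asarray(x, dtype=float)
    n = x.shape[axis]
    y = np.log(x) if lmb == 0 else (x**lmb - 1) / lmb
    return ((lmb - 1) * np.sum(np.log(x), axis=axis)
            - n / 2 * np.log(np.var(y, axis=axis)))

rng = np.random.default_rng(1)
data = rng.uniform(0.5, 5.0, size=(50, 4))
for lmb in (-0.5, 0.0, 1.3):
    print(np.allclose(boxcox_llf_naive(lmb, data, axis=0),
                      stats.boxcox_llf(lmb, data, axis=0)))   # True each time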

@@ -1081,7 +1115,7 @@ def boxcox(x, lmbda=None, alpha=None, optimizer=None):
     Notes
     -----
     The Box-Cox transform is given by:
-
+
     .. math::
 
         y =

scipy/stats/tests/test_axis_nan_policy.py

Lines changed: 1 addition & 0 deletions
@@ -173,6 +173,7 @@ def weightedtau_weighted(x, y, rank, **kwargs):
     (gstd, tuple(), dict(), 1, 1, False, lambda x: (x,)),
     (stats.power_divergence, tuple(), dict(), 1, 2, False, None),
     (stats.chisquare, tuple(), dict(), 1, 2, False, None),
+    (stats._morestats._boxcox_llf, tuple(), dict(lmb=1.5), 1, 1, False, lambda x: (x,)),
 ]
 
 # If the message is one of those expected, put nans in

scipy/stats/tests/test_morestats.py

Lines changed: 22 additions & 1 deletion
@@ -5,6 +5,7 @@
 import math
 import warnings
 import sys
+import contextlib
 from functools import partial
 
 import numpy as np
@@ -2024,7 +2025,11 @@ def test_2d_input(self, xp):
         xp_assert_close(xp.asarray([llf, llf]), xp.asarray(llf2), rtol=1e-12)
 
     def test_empty(self, xp):
-        assert xp.isnan(xp.asarray(stats.boxcox_llf(1, xp.asarray([]))))
+        message = "One or more sample arguments is too small..."
+        context = (pytest.warns(SmallSampleWarning, match=message) if is_numpy(xp)
+                   else contextlib.nullcontext())
+        with context:
+            assert xp.isnan(xp.asarray(stats.boxcox_llf(1, xp.asarray([]))))
 
     def test_gh_6873(self, xp):
         # Regression test for gh-6873.
@@ -2041,6 +2046,22 @@ def test_instability_gh20021(self, xp):
         # expect float64 output for integer input
         xp_assert_close(llf, xp.asarray(-15.32401272869016598, dtype=xp.float64))
 
+    def test_axis(self, xp):
+        data = xp.asarray([[100, 200], [300, 400]])
+        llf_axis_0 = stats.boxcox_llf(1, data, axis=0)
+        data_axes_swapped = xp.moveaxis(data, 0, -1)
+        llf_0 = xp.asarray([
+            stats.boxcox_llf(1, data_axes_swapped[0, :]),
+            stats.boxcox_llf(1, data_axes_swapped[1, :]),
+        ])
+        xp_assert_close(llf_axis_0, llf_0)
+        llf_axis_1 = stats.boxcox_llf(1, data, axis=1)
+        llf_1 = xp.asarray([
+            stats.boxcox_llf(1, data[0, :]),
+            stats.boxcox_llf(1, data[1, :]),
+        ])
+        xp_assert_close(llf_axis_1, llf_1)
+
 
 # This is the data from GitHub user Qukaiyi, given as an example
 # of a data set that caused boxcox to fail.
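The new `test_axis` case pins down the axis semantics on a tiny 2x2 array. The same property can be stated generically for NumPy input (illustrative, not part of the test suite) via `np.apply_along_axis`:

import numpy as np
from scipy import stats

data = np.array([[100.0, 200.0], [300.0, 400.0]])
for axis in (0, 1):
    vectorized = stats.boxcox_llf(1, data, axis=axis)
    looped = np.apply_along_axis(lambda s: stats.boxcox_llf(1, s), axis, data)
    print(np.allclose(vectorized, looped))   # True for both axes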
