diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index c0c64b4cc791c..6d8b4bc36c3d6 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -26,6 +26,14 @@ Changelog (regression introduced in 0.21) :issue:`13910` by :user:`Jérémie du Boisberranger `. +:mod:`sklearn.utils.sparsefuncs` +................................ + +- |Fix| Fixed a bug where :func:`min_max_axis` would fail on 32-bit systems + for certain large inputs. This affects :class:`preprocessing.MaxAbsScaler`, + :func:`preprocessing.normalize` and :class:`preprocessing.LabelBinarizer`. + :pr:`13741` by :user:`Roddy MacSween `. + .. _changes_0_21_1: Version 0.21.1 diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py index 918f32e6da3e5..92b4f8dbfae19 100644 --- a/sklearn/utils/sparsefuncs.py +++ b/sklearn/utils/sparsefuncs.py @@ -341,6 +341,11 @@ def inplace_swap_column(X, m, n): def _minor_reduce(X, ufunc): major_index = np.flatnonzero(np.diff(X.indptr)) + + # reduceat tries to cast X.indptr to intp, which errors + # if it is int64 on a 32-bit system. 
+ # Reinitializing prevents this where possible, see #13737 + X = type(X)((X.data, X.indices, X.indptr), shape=X.shape) value = ufunc.reduceat(X.data, X.indptr[major_index]) return major_index, value diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index 8011854f3270b..31118b2a921f3 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -393,14 +393,18 @@ def test_inplace_swap_column(): [(0, np.min, np.max, False), (np.nan, np.nanmin, np.nanmax, True)] ) +@pytest.mark.parametrize("large_indices", [True, False]) def test_min_max(dtype, axis, sparse_format, missing_values, min_func, - max_func, ignore_nan): + max_func, ignore_nan, large_indices): X = np.array([[0, 3, 0], [2, -1, missing_values], [0, 0, 0], [9, missing_values, 7], [4, 0, 5]], dtype=dtype) X_sparse = sparse_format(X) + if large_indices: + X_sparse.indices = X_sparse.indices.astype('int64') + X_sparse.indptr = X_sparse.indptr.astype('int64') mins_sparse, maxs_sparse = min_max_axis(X_sparse, axis=axis, ignore_nan=ignore_nan)