FIX downcast large matrix indices where possible in sparsefuncs._minor_reduce (fix scikit-learn#13737) (scikit-learn#13741)

rlms · jnothman · commit e833c923461e · 2019-05-23T23:12:01.000+10:00
diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
@@ -33,6 +33,14 @@ Changelog
   `drop` parameter was not reflected in `get_feature_names`. :pr:`13894`
   by :user:`James Myatt <jamesmyatt>`.
 
+:mod:`sklearn.utils.sparsefuncs`
+................................
+
+- |Fix| Fixed a bug where :func:`min_max_axis` would fail on 32-bit systems
+  for certain large inputs. This affects :class:`preprocessing.MaxAbsScaler`,
+  :func:`preprocessing.normalize` and :class:`preprocessing.LabelBinarizer`.
+  :pr:`13741` by :user:`Roddy MacSween <rlms>`.
+
 .. _changes_0_21_1:
 
 Version 0.21.1
diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py
@@ -341,6 +341,11 @@ def inplace_swap_column(X, m, n):
 
 def _minor_reduce(X, ufunc):
     major_index = np.flatnonzero(np.diff(X.indptr))
+
+    # reduceat tries to cast X.indptr to intp, which errors
+    # if it is int64 on a 32-bit system.
+    # Reinitializing downcasts the indices where possible, see #13737
+    X = type(X)((X.data, X.indices, X.indptr), shape=X.shape)
     value = ufunc.reduceat(X.data, X.indptr[major_index])
     return major_index, value
 
diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py
@@ -393,14 +393,18 @@ def test_inplace_swap_column():
     [(0, np.min, np.max, False),
      (np.nan, np.nanmin, np.nanmax, True)]
 )
+@pytest.mark.parametrize("large_indices", [True, False])
 def test_min_max(dtype, axis, sparse_format, missing_values, min_func,
-                 max_func, ignore_nan):
+                 max_func, ignore_nan, large_indices):
     X = np.array([[0, 3, 0],
                   [2, -1, missing_values],
                   [0, 0, 0],
                   [9, missing_values, 7],
                   [4, 0, 5]], dtype=dtype)
     X_sparse = sparse_format(X)
+    if large_indices:
+        X_sparse.indices = X_sparse.indices.astype('int64')
+        X_sparse.indptr = X_sparse.indptr.astype('int64')
 
     mins_sparse, maxs_sparse = min_max_axis(X_sparse, axis=axis,
                                             ignore_nan=ignore_nan)