Skip to content

Commit e833c92

Browse files
rlmsjnothman
authored and committed
FIX downcast large matrix indices where possible in sparsefuncs._minor_reduce (fix scikit-learn#13737) (scikit-learn#13741)
1 parent 45ab4c6 commit e833c92

File tree

3 files changed

+18
-1
lines changed

3 files changed

+18
-1
lines changed

doc/whats_new/v0.21.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,14 @@ Changelog
3333
`drop` parameter was not reflected in `get_feature_names`. :pr:`13894`
3434
by :user:`James Myatt <jamesmyatt>`.
3535

36+
:mod:`sklearn.utils.sparsefuncs`
37+
................................
38+
39+
- |Fix| Fixed a bug where :func:`min_max_axis` would fail on 32-bit systems
40+
for certain large inputs. This affects :class:`preprocessing.MaxAbsScaler`,
41+
:func:`preprocessing.normalize` and :class:`preprocessing.LabelBinarizer`.
42+
:pr:`13741` by :user:`Roddy MacSween <rlms>`.
43+
3644
.. _changes_0_21_1:
3745

3846
Version 0.21.1

sklearn/utils/sparsefuncs.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,11 @@ def inplace_swap_column(X, m, n):
341341

342342
def _minor_reduce(X, ufunc):
343343
major_index = np.flatnonzero(np.diff(X.indptr))
344+
345+
# reduceat tries to cast X.indptr to intp, which errors
346+
# if it is int64 on a 32-bit system.
347+
# Reinitializing prevents this where possible, see #13737
348+
X = type(X)((X.data, X.indices, X.indptr), shape=X.shape)
344349
value = ufunc.reduceat(X.data, X.indptr[major_index])
345350
return major_index, value
346351

sklearn/utils/tests/test_sparsefuncs.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,14 +393,18 @@ def test_inplace_swap_column():
393393
[(0, np.min, np.max, False),
394394
(np.nan, np.nanmin, np.nanmax, True)]
395395
)
396+
@pytest.mark.parametrize("large_indices", [True, False])
396397
def test_min_max(dtype, axis, sparse_format, missing_values, min_func,
397-
max_func, ignore_nan):
398+
max_func, ignore_nan, large_indices):
398399
X = np.array([[0, 3, 0],
399400
[2, -1, missing_values],
400401
[0, 0, 0],
401402
[9, missing_values, 7],
402403
[4, 0, 5]], dtype=dtype)
403404
X_sparse = sparse_format(X)
405+
if large_indices:
406+
X_sparse.indices = X_sparse.indices.astype('int64')
407+
X_sparse.indptr = X_sparse.indptr.astype('int64')
404408

405409
mins_sparse, maxs_sparse = min_max_axis(X_sparse, axis=axis,
406410
ignore_nan=ignore_nan)

0 commit comments

Comments
 (0)