From 35afc42639e00bb5afbe3437deabe40f57cddab1 Mon Sep 17 00:00:00 2001 From: Roddy Date: Tue, 23 Apr 2019 16:47:32 +0100 Subject: [PATCH 01/13] Changed VarianceThreshold behaviour when threshold is zero. See #13691 --- sklearn/feature_selection/variance_threshold.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/sklearn/feature_selection/variance_threshold.py b/sklearn/feature_selection/variance_threshold.py index c9e018d94a84e..361711edde572 100644 --- a/sklearn/feature_selection/variance_threshold.py +++ b/sklearn/feature_selection/variance_threshold.py @@ -5,7 +5,7 @@ from ..base import BaseEstimator from .base import SelectorMixin from ..utils import check_array -from ..utils.sparsefuncs import mean_variance_axis +from ..utils.sparsefuncs import mean_variance_axis, min_max_axis from ..utils.validation import check_is_fitted @@ -64,9 +64,16 @@ def fit(self, X, y=None): X = check_array(X, ('csr', 'csc'), dtype=np.float64) if hasattr(X, "toarray"): # sparse matrix - _, self.variances_ = mean_variance_axis(X, axis=0) + if self.threshold == 0.: + mins, maxes = min_max_axis(X, axis=0) + self.variances_ = maxes - mins + else: + _, self.variances_ = mean_variance_axis(X, axis=0) else: - self.variances_ = np.var(X, axis=0) + if self.threshold == 0.: + self.variances_ = np.ptp(X, axis=0) + else: + self.variances_ = np.var(X, axis=0) if np.all(self.variances_ <= self.threshold): msg = "No feature in X meets the variance threshold {0:.5f}" From 330d5de7bc58748aaee4ebef148bc99bd01b03b1 Mon Sep 17 00:00:00 2001 From: Roddy Date: Wed, 24 Apr 2019 12:23:43 +0100 Subject: [PATCH 02/13] Slightly modified change to VarianceThreshold and added test --- .../tests/test_variance_threshold.py | 18 +++++++++++++++++- .../feature_selection/variance_threshold.py | 11 +++++------ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/sklearn/feature_selection/tests/test_variance_threshold.py b/sklearn/feature_selection/tests/test_variance_threshold.py index a40491302f350..674e2b8f3671b 100644 --- a/sklearn/feature_selection/tests/test_variance_threshold.py +++ b/sklearn/feature_selection/tests/test_variance_threshold.py @@ -1,5 +1,7 @@ +import numpy as np + from sklearn.utils.testing import (assert_array_equal, assert_equal, - assert_raises) + assert_not_equal, assert_raises) from scipy.sparse import bsr_matrix, csc_matrix, csr_matrix @@ -26,3 +28,17 @@ def test_variance_threshold(): for X in [data, csr_matrix(data)]: X = VarianceThreshold(threshold=.4).fit_transform(X) assert_equal((len(data), 1), X.shape) + + +def test_zero_variance_floating_point_error(): + # Test that VarianceThreshold(0.0).fit eliminates features that have + # the same value in every sample, even when floating point errors + # cause np.var not to be 0 for the feature. + # See #13691 + + data = [[-0.13725701]] * 10 + assert_not_equal(np.var(data), 0.) + for X in [data, csr_matrix(data), csc_matrix(data), bsr_matrix(data)]: + assert_raises(ValueError, VarianceThreshold().fit, X) + + diff --git a/sklearn/feature_selection/variance_threshold.py b/sklearn/feature_selection/variance_threshold.py index 361711edde572..a18e37c1a79f1 100644 --- a/sklearn/feature_selection/variance_threshold.py +++ b/sklearn/feature_selection/variance_threshold.py @@ -64,16 +64,15 @@ def fit(self, X, y=None): X = check_array(X, ('csr', 'csc'), dtype=np.float64) if hasattr(X, "toarray"): # sparse matrix + _, self.variances_ = mean_variance_axis(X, axis=0) if self.threshold == 0.: mins, maxes = min_max_axis(X, axis=0) - self.variances_ = maxes - mins - else: - _, self.variances_ = mean_variance_axis(X, axis=0) + self.variances_ = np.minimum(self.variances_, maxes - mins) else: + self.variances_ = np.var(X, axis=0) if self.threshold == 0.: - self.variances_ = np.ptp(X, axis=0) - else: - self.variances_ = np.var(X, axis=0) + peak_to_peaks = np.ptp(X, axis=0) + self.variances_ = np.minimum(self.variances_, peak_to_peaks) if np.all(self.variances_ <= self.threshold): msg = "No feature in X meets the variance threshold {0:.5f}" From d00257b27892560ef9fa5834c002ea0ef9521fb7 Mon Sep 17 00:00:00 2001 From: Roddy Date: Wed, 24 Apr 2019 12:28:16 +0100 Subject: [PATCH 03/13] Removed blank lines from end of file --- sklearn/feature_selection/tests/test_variance_threshold.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/feature_selection/tests/test_variance_threshold.py b/sklearn/feature_selection/tests/test_variance_threshold.py index 674e2b8f3671b..0e95d255ff4f2 100644 --- a/sklearn/feature_selection/tests/test_variance_threshold.py +++ b/sklearn/feature_selection/tests/test_variance_threshold.py @@ -40,5 +40,3 @@ def test_zero_variance_floating_point_error(): assert_not_equal(np.var(data), 0.) for X in [data, csr_matrix(data), csc_matrix(data), bsr_matrix(data)]: assert_raises(ValueError, VarianceThreshold().fit, X) - - From 77bda7948da22d772b85b9e4297f02b534a66276 Mon Sep 17 00:00:00 2001 From: Roddy Date: Wed, 24 Apr 2019 13:09:30 +0100 Subject: [PATCH 04/13] Minor changes to new VarianceThreshold behaviour --- sklearn/feature_selection/variance_threshold.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_selection/variance_threshold.py b/sklearn/feature_selection/variance_threshold.py index a18e37c1a79f1..80698aa68eaf5 100644 --- a/sklearn/feature_selection/variance_threshold.py +++ b/sklearn/feature_selection/variance_threshold.py @@ -67,12 +67,16 @@ def fit(self, X, y=None): _, self.variances_ = mean_variance_axis(X, axis=0) if self.threshold == 0.: mins, maxes = min_max_axis(X, axis=0) - self.variances_ = np.minimum(self.variances_, maxes - mins) + peak_to_peaks = maxes - mins else: self.variances_ = np.var(X, axis=0) if self.threshold == 0.: peak_to_peaks = np.ptp(X, axis=0) - self.variances_ = np.minimum(self.variances_, peak_to_peaks) + + if self.threshold == 0: + # Use peak-to-peak to avoid numeric precision issues + # for constant features + self.variances_ = np.minimum(self.variances_, peak_to_peaks) if np.all(self.variances_ <= self.threshold): msg = "No feature in X meets the variance threshold {0:.5f}" From 4ac45040d7197382b3eaa61a170de6b356b935cd Mon Sep 17 00:00:00 2001 From: Roddy Date: Wed, 24 Apr 2019 13:13:19 +0100 Subject: [PATCH 05/13] Commented test --- sklearn/feature_selection/tests/test_variance_threshold.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/feature_selection/tests/test_variance_threshold.py b/sklearn/feature_selection/tests/test_variance_threshold.py index 0e95d255ff4f2..9a94d8bfa0355 100644 --- a/sklearn/feature_selection/tests/test_variance_threshold.py +++ b/sklearn/feature_selection/tests/test_variance_threshold.py @@ -39,4 +39,5 @@ def test_zero_variance_floating_point_error(): data = [[-0.13725701]] * 10 assert_not_equal(np.var(data), 0.) for X in [data, csr_matrix(data), csc_matrix(data), bsr_matrix(data)]: + # All features removed, so exception should be thrown assert_raises(ValueError, VarianceThreshold().fit, X) From ea02681b74773e0b519f5db1c9a3a4fdd9d0c1f5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 24 Apr 2019 16:18:34 +0100 Subject: [PATCH 06/13] Update sklearn/feature_selection/variance_threshold.py Co-Authored-By: rlms --- sklearn/feature_selection/variance_threshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/variance_threshold.py b/sklearn/feature_selection/variance_threshold.py index 80698aa68eaf5..924ff14647bcf 100644 --- a/sklearn/feature_selection/variance_threshold.py +++ b/sklearn/feature_selection/variance_threshold.py @@ -70,7 +70,7 @@ def fit(self, X, y=None): peak_to_peaks = maxes - mins else: self.variances_ = np.var(X, axis=0) - if self.threshold == 0.: + if self.threshold == 0: peak_to_peaks = np.ptp(X, axis=0) if self.threshold == 0: From 52ca4f5a1fe49c0ce0095b63d4d52c4ab2dcdbe9 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 24 Apr 2019 16:18:43 +0100 Subject: [PATCH 07/13] Update sklearn/feature_selection/variance_threshold.py Co-Authored-By: rlms --- sklearn/feature_selection/variance_threshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/variance_threshold.py b/sklearn/feature_selection/variance_threshold.py index 924ff14647bcf..7d98de82c9711 100644 --- a/sklearn/feature_selection/variance_threshold.py +++ b/sklearn/feature_selection/variance_threshold.py @@ -65,7 +65,7 @@ def fit(self, X, y=None): if hasattr(X, "toarray"): # sparse matrix _, self.variances_ = mean_variance_axis(X, axis=0) - if self.threshold == 0.: + if self.threshold == 0: mins, maxes = min_max_axis(X, axis=0) peak_to_peaks = maxes - mins else: From b6fd15ac6415b7010564eede3f479a02d95763f6 Mon Sep 17 00:00:00 2001 From: Roddy Date: Wed, 24 Apr 2019 16:24:29 +0100 Subject: [PATCH 08/13] Changed test format --- sklearn/feature_selection/tests/test_variance_threshold.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_selection/tests/test_variance_threshold.py b/sklearn/feature_selection/tests/test_variance_threshold.py index 9a94d8bfa0355..0d3185153d538 100644 --- a/sklearn/feature_selection/tests/test_variance_threshold.py +++ b/sklearn/feature_selection/tests/test_variance_threshold.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from sklearn.utils.testing import (assert_array_equal, assert_equal, assert_not_equal, assert_raises) @@ -39,5 +40,6 @@ def test_zero_variance_floating_point_error(): data = [[-0.13725701]] * 10 assert_not_equal(np.var(data), 0.) for X in [data, csr_matrix(data), csc_matrix(data), bsr_matrix(data)]: - # All features removed, so exception should be thrown - assert_raises(ValueError, VarianceThreshold().fit, X) + msg = "No feature in X meets the variance threshold 0.00000" + with pytest.raises(ValueError, match=msg): + VarianceThreshold().fit(X) From f23d5ab00d0e7f894e8a0c50e37b9a93c4df97e5 Mon Sep 17 00:00:00 2001 From: Roddy Date: Wed, 24 Apr 2019 17:10:54 +0100 Subject: [PATCH 09/13] Reformatted assertion in test --- sklearn/feature_selection/tests/test_variance_threshold.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_selection/tests/test_variance_threshold.py b/sklearn/feature_selection/tests/test_variance_threshold.py index 0d3185153d538..fba4478a28e2f 100644 --- a/sklearn/feature_selection/tests/test_variance_threshold.py +++ b/sklearn/feature_selection/tests/test_variance_threshold.py @@ -2,7 +2,7 @@ import pytest from sklearn.utils.testing import (assert_array_equal, assert_equal, - assert_not_equal, assert_raises) + assert_raises) from scipy.sparse import bsr_matrix, csc_matrix, csr_matrix @@ -38,7 +38,7 @@ def test_zero_variance_floating_point_error(): # See #13691 data = [[-0.13725701]] * 10 - assert_not_equal(np.var(data), 0.) + assert np.var(data) != 0 for X in [data, csr_matrix(data), csc_matrix(data), bsr_matrix(data)]: msg = "No feature in X meets the variance threshold 0.00000" with pytest.raises(ValueError, match=msg): From 40b0c1a823cc6ef00097e929418c44af110c2c40 Mon Sep 17 00:00:00 2001 From: Roddy Date: Sun, 28 Apr 2019 23:12:09 +0100 Subject: [PATCH 10/13] Added test --- sklearn/utils/sparsefuncs.py | 5 +++++ sklearn/utils/tests/test_sparsefuncs.py | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py index 918f32e6da3e5..220fd2e2bfaaf 100644 --- a/sklearn/utils/sparsefuncs.py +++ b/sklearn/utils/sparsefuncs.py @@ -341,6 +341,11 @@ def inplace_swap_column(X, m, n): def _minor_reduce(X, ufunc): major_index = np.flatnonzero(np.diff(X.indptr)) + + # reduceat tries casts X.indptr to intp, which errors + # if it is int64 on a 32 bit system. + # Reinitializing prevents this where possible, see #13737 + # X = type(X)((X.data, X.indices, X.indptr), shape=X.shape) value = ufunc.reduceat(X.data, X.indptr[major_index]) return major_index, value diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index 8011854f3270b..31118b2a921f3 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -393,14 +393,18 @@ def test_inplace_swap_column(): [(0, np.min, np.max, False), (np.nan, np.nanmin, np.nanmax, True)] ) +@pytest.mark.parametrize("large_indices", [True, False]) def test_min_max(dtype, axis, sparse_format, missing_values, min_func, - max_func, ignore_nan): + max_func, ignore_nan, large_indices): X = np.array([[0, 3, 0], [2, -1, missing_values], [0, 0, 0], [9, missing_values, 7], [4, 0, 5]], dtype=dtype) X_sparse = sparse_format(X) + if large_indices: + X_sparse.indices = X_sparse.indices.astype('int64') + X_sparse.indptr = X_sparse.indptr.astype('int64') mins_sparse, maxs_sparse = min_max_axis(X_sparse, axis=axis, ignore_nan=ignore_nan) From 137c4c6cf09bdcd009130f796711e01396707b05 Mon Sep 17 00:00:00 2001 From: Roddy Date: Sun, 28 Apr 2019 23:29:13 +0100 Subject: [PATCH 11/13] Rolled back changes from other branch --- .../tests/test_variance_threshold.py | 17 ----------------- sklearn/feature_selection/variance_threshold.py | 12 +----------- 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/sklearn/feature_selection/tests/test_variance_threshold.py b/sklearn/feature_selection/tests/test_variance_threshold.py index fba4478a28e2f..a40491302f350 100644 --- a/sklearn/feature_selection/tests/test_variance_threshold.py +++ b/sklearn/feature_selection/tests/test_variance_threshold.py @@ -1,6 +1,3 @@ -import numpy as np -import pytest - from sklearn.utils.testing import (assert_array_equal, assert_equal, assert_raises) @@ -29,17 +26,3 @@ def test_variance_threshold(): for X in [data, csr_matrix(data)]: X = VarianceThreshold(threshold=.4).fit_transform(X) assert_equal((len(data), 1), X.shape) - - -def test_zero_variance_floating_point_error(): - # Test that VarianceThreshold(0.0).fit eliminates features that have - # the same value in every sample, even when floating point errors - # cause np.var not to be 0 for the feature. - # See #13691 - - data = [[-0.13725701]] * 10 - assert np.var(data) != 0 - for X in [data, csr_matrix(data), csc_matrix(data), bsr_matrix(data)]: - msg = "No feature in X meets the variance threshold 0.00000" - with pytest.raises(ValueError, match=msg): - VarianceThreshold().fit(X) diff --git a/sklearn/feature_selection/variance_threshold.py b/sklearn/feature_selection/variance_threshold.py index 7d98de82c9711..c9e018d94a84e 100644 --- a/sklearn/feature_selection/variance_threshold.py +++ b/sklearn/feature_selection/variance_threshold.py @@ -5,7 +5,7 @@ from ..base import BaseEstimator from .base import SelectorMixin from ..utils import check_array -from ..utils.sparsefuncs import mean_variance_axis, min_max_axis +from ..utils.sparsefuncs import mean_variance_axis from ..utils.validation import check_is_fitted @@ -65,18 +65,8 @@ def fit(self, X, y=None): if hasattr(X, "toarray"): # sparse matrix _, self.variances_ = mean_variance_axis(X, axis=0) - if self.threshold == 0: - mins, maxes = min_max_axis(X, axis=0) - peak_to_peaks = maxes - mins else: self.variances_ = np.var(X, axis=0) - if self.threshold == 0: - peak_to_peaks = np.ptp(X, axis=0) - - if self.threshold == 0: - # Use peak-to-peak to avoid numeric precision issues - # for constant features - self.variances_ = np.minimum(self.variances_, peak_to_peaks) if np.all(self.variances_ <= self.threshold): msg = "No feature in X meets the variance threshold {0:.5f}" From 35c9b8297aa17df52597a07968ad121b8aba12ed Mon Sep 17 00:00:00 2001 From: Roddy Date: Mon, 29 Apr 2019 00:00:58 +0100 Subject: [PATCH 12/13] Uncommented change --- sklearn/utils/sparsefuncs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py index 220fd2e2bfaaf..92b4f8dbfae19 100644 --- a/sklearn/utils/sparsefuncs.py +++ b/sklearn/utils/sparsefuncs.py @@ -345,7 +345,7 @@ def _minor_reduce(X, ufunc): # reduceat tries casts X.indptr to intp, which errors # if it is int64 on a 32 bit system. # Reinitializing prevents this where possible, see #13737 - # X = type(X)((X.data, X.indices, X.indptr), shape=X.shape) + X = type(X)((X.data, X.indices, X.indptr), shape=X.shape) value = ufunc.reduceat(X.data, X.indptr[major_index]) return major_index, value From d4e09cfcea390153e55fef2a26062a6a74445147 Mon Sep 17 00:00:00 2001 From: Roddy Date: Thu, 23 May 2019 12:27:45 +0100 Subject: [PATCH 13/13] Updated change log --- doc/whats_new/v0.21.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index c0c64b4cc791c..6d8b4bc36c3d6 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -26,6 +26,14 @@ Changelog (regression introduced in 0.21) :issue:`13910` by :user:`Jérémie du Boisberranger `. +:mod:`sklearn.utils.sparsefuncs` +................................ + +- |Fix| Fixed a bug where :func:`min_max_axis` would fail on 32-bit systems + for certain large inputs. This affects :class:`preprocessing.MaxAbsScaler`, + :func:`preprocessing.normalize` and :class:`preprocessing.LabelBinarizer`. + :pr:`13741` by :user:`Roddy MacSween `. + .. _changes_0_21_1: Version 0.21.1