Skip to content

Commit be03467

Browse files
rlms authored and jnothman committed
FIX Changed VarianceThreshold behaviour when threshold is zero. See #13691 (#13704)
1 parent 2a7194d commit be03467

File tree

3 files changed

+35
-3
lines changed

3 files changed

+35
-3
lines changed

doc/whats_new/v0.22.rst

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,21 @@ Changelog
6565
``decision_function_shape='ovr'``, and the number of target classes > 2.
6666
:pr:`12557` by `Adrin Jalali`_.
6767

68-
6968
:mod:`sklearn.cluster`
7069
..................
7170

7271
- |Enhancement| :class:`cluster.SpectralClustering` now accepts a ``n_components``
7372
parameter. This parameter extends `SpectralClustering` class functionality to
7473
match `spectral_clustering`.
7574
:pr:`13726` by :user:`Shuzhe Xiao <fdas3213>`.
75+
76+
:mod:`sklearn.feature_selection`
77+
................................
78+
- |Fix| Fixed a bug where :class:`feature_selection.VarianceThreshold` with ``threshold=0`` did not
79+
remove constant features due to numerical instability, by using range
80+
rather than variance in this case.
81+
:pr:`13704` by :user:`Roddy MacSween <rlms>`.
7682

77-
7883
Miscellaneous
7984
.............
8085

sklearn/feature_selection/tests/test_variance_threshold.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
import numpy as np
2+
import pytest
3+
14
from sklearn.utils.testing import (assert_array_equal, assert_equal,
25
assert_raises)
36

@@ -26,3 +29,17 @@ def test_variance_threshold():
2629
for X in [data, csr_matrix(data)]:
2730
X = VarianceThreshold(threshold=.4).fit_transform(X)
2831
assert_equal((len(data), 1), X.shape)
32+
33+
34+
def test_zero_variance_floating_point_error():
35+
# Test that VarianceThreshold(0.0).fit eliminates features that have
36+
# the same value in every sample, even when floating point errors
37+
# cause np.var not to be 0 for the feature.
38+
# See #13691
39+
40+
data = [[-0.13725701]] * 10
41+
assert np.var(data) != 0
42+
for X in [data, csr_matrix(data), csc_matrix(data), bsr_matrix(data)]:
43+
msg = "No feature in X meets the variance threshold 0.00000"
44+
with pytest.raises(ValueError, match=msg):
45+
VarianceThreshold().fit(X)

sklearn/feature_selection/variance_threshold.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from ..base import BaseEstimator
66
from .base import SelectorMixin
77
from ..utils import check_array
8-
from ..utils.sparsefuncs import mean_variance_axis
8+
from ..utils.sparsefuncs import mean_variance_axis, min_max_axis
99
from ..utils.validation import check_is_fitted
1010

1111

@@ -65,8 +65,18 @@ def fit(self, X, y=None):
6565

6666
if hasattr(X, "toarray"): # sparse matrix
6767
_, self.variances_ = mean_variance_axis(X, axis=0)
68+
if self.threshold == 0:
69+
mins, maxes = min_max_axis(X, axis=0)
70+
peak_to_peaks = maxes - mins
6871
else:
6972
self.variances_ = np.var(X, axis=0)
73+
if self.threshold == 0:
74+
peak_to_peaks = np.ptp(X, axis=0)
75+
76+
if self.threshold == 0:
77+
# Use peak-to-peak to avoid numeric precision issues
78+
# for constant features
79+
self.variances_ = np.minimum(self.variances_, peak_to_peaks)
7080

7181
if np.all(self.variances_ <= self.threshold):
7282
msg = "No feature in X meets the variance threshold {0:.5f}"

0 commit comments

Comments
 (0)