@@ -2,6 +2,7 @@
 import numpy as np
 import numpy.linalg as la
 from scipy import sparse
+from distutils.version import LooseVersion

 from sklearn.utils.testing import assert_almost_equal, clean_warning_registry
 from sklearn.utils.testing import assert_array_almost_equal
@@ -14,6 +15,7 @@
 from sklearn.utils.testing import assert_true
 from sklearn.utils.testing import assert_false
 from sklearn.utils.testing import assert_warns_message
+from sklearn.utils.testing import assert_no_warnings

 from sklearn.utils.sparsefuncs import mean_variance_axis
 from sklearn.preprocessing.data import _transform_selected
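A quick aside on the new `LooseVersion` import (a standalone sketch, not part of the diff): it lets the new test below branch on the installed numpy version instead of comparing version strings lexically, which breaks once a component reaches two digits.

```python
from distutils.version import LooseVersion
import numpy as np

# Plain string comparison gets multi-digit components wrong:
print('1.10' >= '1.9')                              # False (lexicographic)
print(LooseVersion('1.10') >= LooseVersion('1.9'))  # True (numeric components)

# The new test uses the same comparison to branch on the installed numpy:
print(LooseVersion(np.__version__) >= LooseVersion('1.9'))
```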
@@ -99,6 +101,45 @@ def test_scaler_1d():
     assert_array_equal(scale(X, with_mean=False), X)


+def test_standard_scaler_numerical_stability():
+    """Test numerical stability of scaling"""
+    # np.log(1e-5) is taken because its floating point representation
+    # was empirically found to cause numerical problems with np.mean & np.std.
+
+    x = np.zeros(8, dtype=np.float64) + np.log(1e-5, dtype=np.float64)
+    if LooseVersion(np.__version__) >= LooseVersion('1.9'):
+        # This does not raise a warning as the number of samples is too low
+        # to trigger the problem in recent numpy
+        x_scaled = assert_no_warnings(scale, x)
+        assert_array_almost_equal(x_scaled, np.zeros(8))
+    else:
+        w = "standard deviation of the data is probably very close to 0"
+        x_scaled = assert_warns_message(UserWarning, w, scale, x)
+        assert_array_almost_equal(x_scaled, np.zeros(8))
+
+    # with 2 more samples, the std computation runs into numerical issues:
+    x = np.zeros(10, dtype=np.float64) + np.log(1e-5, dtype=np.float64)
+    w = "standard deviation of the data is probably very close to 0"
+    x_scaled = assert_warns_message(UserWarning, w, scale, x)
+    assert_array_almost_equal(x_scaled, np.zeros(10))
+
+    x = np.ones(10, dtype=np.float64) * 1e-100
+    x_small_scaled = assert_no_warnings(scale, x)
+    assert_array_almost_equal(x_small_scaled, np.zeros(10))
+
+    # Large values can cause (often recoverable) numerical stability issues:
+    x_big = np.ones(10, dtype=np.float64) * 1e100
+    w = "Dataset may contain too large values"
+    x_big_scaled = assert_warns_message(UserWarning, w, scale, x_big)
+    assert_array_almost_equal(x_big_scaled, np.zeros(10))
+    assert_array_almost_equal(x_big_scaled, x_small_scaled)
+
+    x_big_centered = assert_warns_message(UserWarning, w, scale, x_big,
+                                          with_std=False)
+    assert_array_almost_equal(x_big_centered, np.zeros(10))
+    assert_array_almost_equal(x_big_centered, x_small_scaled)
+
+
 def test_scaler_2d_arrays():
     # Test scaling of 2d array along first axis
     rng = np.random.RandomState(0)
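Why a constant vector trips `np.std` at all: the true standard deviation of a constant array is exactly zero, but summing the samples and dividing back does not always reproduce the input value in float64, so the computed deviations can come out around 1e-15 instead of 0, and `scale` then divides by that near-zero "std". A standalone sketch of the failure mode (illustrative only, not part of the diff; the exact outcome depends on the numpy version, which is what the `LooseVersion` branch in the test above accounts for):

```python
import numpy as np

x = np.full(10, np.log(1e-5))  # constant array: the true std is exactly 0.0
std = x.std()
print("computed std:", std)    # often ~1e-15 rather than 0.0

# Dividing by a rounding-error "std" turns an all-zero result into
# values of order 1, which is the situation the warning guards against.
if std > 0:
    print("naively scaled:", (x - x.mean()) / std)
else:
    print("std came out exactly 0.0 on this numpy build")
```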
@@ -735,6 +776,7 @@ def test_one_hot_encoder_sparse():
     enc.fit([[0], [1]])
     assert_raises(ValueError, enc.transform, [[0], [-1]])

+
 def test_one_hot_encoder_dense():
     # check for sparse=False
     X = [[3, 2, 1], [0, 1, 1]]
@@ -828,7 +870,7 @@ def test_one_hot_encoder_unknown_transform():
     oh.fit(X)
     assert_array_equal(
         oh.transform(y).toarray(),
-        np.array([[ 0., 0., 0., 0., 1., 0., 0.]])
+        np.array([[0., 0., 0., 0., 1., 0., 0.]])
     )

     # Raise error if handle_unknown is neither ignore or error.
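For readers of the expected array above: the seven columns come from the one-hot slots of the three fitted features, and with `handle_unknown='ignore'` any feature value unseen during `fit` encodes as an all-zero block. The `X` and `y` used by the test sit outside this hunk, so the values in this sketch are a reconstruction chosen to reproduce the expected row, not necessarily the test's literal data:

```python
import numpy as np
from sklearn.preprocessing import OneHotEncoder

oh = OneHotEncoder(handle_unknown='ignore')
# Active categories per feature: {0, 1}, {0, 2}, {1, 2, 3} -> 2 + 2 + 3 = 7 columns
# (values reconstructed to match the expected row above, not the test's literal X/y)
oh.fit(np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]))

# Features 0 and 1 take values never seen in fit (4 and 1): all-zero blocks.
# Feature 2's value 1 lights up the first of its three slots.
print(oh.transform(np.array([[4, 1, 1]])).toarray())
# [[ 0.  0.  0.  0.  1.  0.  0.]]
```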