
Commit 574ebfd

Merge pull request #4436 from ogrisel/rebased-pr-3747
[MRG+2] FIX make StandardScaler & scale more numerically stable
2 parents: 12b2f16 + b596dfc

4 files changed: +79, -2 lines changed


doc/whats_new.rst

+5
@@ -229,6 +229,9 @@ Enhancements
    - Significant performance and memory usage improvements in
      :class:`preprocessing.PolynomialFeatures`. By `Eric Martin`_.
 
+   - Numerical stability improvements for :class:`preprocessing.StandardScaler`
+     and :func:`preprocessing.scale`. By `Nicolas Goix`_
+
 Documentation improvements
 ..........................
 
@@ -3376,3 +3379,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 .. _Dan Blanchard: https://github.com/dan-blanchard
 
 .. _Eric Martin: http://ericmart.in
+
+.. _Nicolas Goix: https://webperso.telecom-paristech.fr/front/frontoffice.php?SP_ID=241

sklearn/preprocessing/_weights.py

+1
@@ -1,6 +1,7 @@
 import numpy as np
 from ..utils.fixes import bincount
 
+
 def _balance_weights(y):
     """Compute sample weights such that the class distribution of y becomes
     balanced.

sklearn/preprocessing/data.py

+30-1
@@ -7,6 +7,7 @@
 
 from itertools import chain, combinations
 import numbers
+import warnings
 
 import numpy as np
 from scipy import sparse
@@ -18,6 +19,7 @@
 from ..utils.extmath import row_norms
 from ..utils.fixes import (combinations_with_replacement as combinations_w_r,
                            bincount)
+from ..utils.fixes import isclose
 from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
                                       inplace_csr_row_normalize_l2)
 from ..utils.sparsefuncs import (inplace_column_scale, mean_variance_axis)
@@ -57,7 +59,7 @@ def _mean_and_std(X, axis=0, with_mean=True, with_std=True):
     if with_std:
         std_ = Xr.std(axis=0)
         if isinstance(std_, np.ndarray):
-            std_[std_ == 0.0] = 1.0
+            std_[std_ == 0.] = 1.0
         elif std_ == 0.:
             std_ = 1.
     else:
@@ -141,8 +143,35 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
         Xr = np.rollaxis(X, axis)
         if with_mean:
             Xr -= mean_
+            mean_1 = Xr.mean(axis=0)
+            # Verify that mean_1 is 'close to zero'. If X contains very
+            # large values, mean_1 can also be very large, due to a lack of
+            # precision of mean_. In this case, a pre-scaling of the
+            # concerned feature is efficient, for instance by its mean or
+            # maximum.
+            if not np.allclose(mean_1, 0):
+                warnings.warn("Numerical issues were encountered "
+                              "when centering the data "
+                              "and might not be solved. Dataset may "
+                              "contain too large values. You may need "
+                              "to prescale your features.")
+                Xr -= mean_1
         if with_std:
             Xr /= std_
+            if with_mean:
+                mean_2 = Xr.mean(axis=0)
+                # If mean_2 is not 'close to zero', it comes from the fact that
+                # std_ is very small so that mean_2 = mean_1/std_ > 0, even if
+                # mean_1 was close to zero. The problem is thus essentially due
+                # to the lack of precision of mean_. A solution is then to
+                # subtract the mean again:
+                if not np.allclose(mean_2, 0):
+                    warnings.warn("Numerical issues were encountered "
+                                  "when scaling the data "
+                                  "and might not be solved. The standard "
+                                  "deviation of the data is probably "
+                                  "very close to 0. ")
+                    Xr -= mean_2
     return X
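For illustration only, here is a minimal standalone sketch of the failure mode this hunk guards against and of the extra centering passes it adds. The sample values and variable names are assumptions made for the example, not code from the pull request, and whether the residuals are exactly zero depends on the NumPy version and platform.

import numpy as np

# Illustrative data: ten identical float64 values whose mean cannot be
# recovered exactly, so naive centering can leave a tiny non-zero residual.
x = np.full(10, np.log(1e-5))

mean_ = x.mean()
std_ = x.std()
if std_ == 0.0:                # same guard as in _mean_and_std
    std_ = 1.0

xr = x - mean_
mean_1 = xr.mean()
if not np.allclose(mean_1, 0):
    # With very large inputs, the centering residual is large in absolute
    # terms; the patch warns and subtracts it before scaling.
    xr -= mean_1

xr /= std_
mean_2 = xr.mean()
if not np.allclose(mean_2, 0):
    # Dividing by a near-zero std_ can amplify the residual again;
    # the patch warns and re-centers one more time.
    xr -= mean_2

print(np.allclose(xr.mean(), 0))   # expected to print True after the extra passes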

sklearn/preprocessing/tests/test_data.py

+43-1
@@ -2,6 +2,7 @@
 import numpy as np
 import numpy.linalg as la
 from scipy import sparse
+from distutils.version import LooseVersion
 
 from sklearn.utils.testing import assert_almost_equal, clean_warning_registry
 from sklearn.utils.testing import assert_array_almost_equal
@@ -14,6 +15,7 @@
 from sklearn.utils.testing import assert_true
 from sklearn.utils.testing import assert_false
 from sklearn.utils.testing import assert_warns_message
+from sklearn.utils.testing import assert_no_warnings
 
 from sklearn.utils.sparsefuncs import mean_variance_axis
 from sklearn.preprocessing.data import _transform_selected
@@ -99,6 +101,45 @@ def test_scaler_1d():
     assert_array_equal(scale(X, with_mean=False), X)
 
 
+def test_standard_scaler_numerical_stability():
+    """Test numerical stability of scaling"""
+    # np.log(1e-5) is taken because its floating point representation
+    # was empirically found to cause numerical problems with np.mean & np.std.
+
+    x = np.zeros(8, dtype=np.float64) + np.log(1e-5, dtype=np.float64)
+    if LooseVersion(np.__version__) >= LooseVersion('1.9'):
+        # This does not raise a warning as the number of samples is too low
+        # to trigger the problem in recent numpy
+        x_scaled = assert_no_warnings(scale, x)
+        assert_array_almost_equal(scale(x), np.zeros(8))
+    else:
+        w = "standard deviation of the data is probably very close to 0"
+        x_scaled = assert_warns_message(UserWarning, w, scale, x)
+        assert_array_almost_equal(x_scaled, np.zeros(8))
+
+    # with 2 more samples, the std computation runs into numerical issues:
+    x = np.zeros(10, dtype=np.float64) + np.log(1e-5, dtype=np.float64)
+    w = "standard deviation of the data is probably very close to 0"
+    x_scaled = assert_warns_message(UserWarning, w, scale, x)
+    assert_array_almost_equal(x_scaled, np.zeros(10))
+
+    x = np.ones(10, dtype=np.float64) * 1e-100
+    x_small_scaled = assert_no_warnings(scale, x)
+    assert_array_almost_equal(x_small_scaled, np.zeros(10))
+
+    # Large values can cause (often recoverable) numerical stability issues:
+    x_big = np.ones(10, dtype=np.float64) * 1e100
+    w = "Dataset may contain too large values"
+    x_big_scaled = assert_warns_message(UserWarning, w, scale, x_big)
+    assert_array_almost_equal(x_big_scaled, np.zeros(10))
+    assert_array_almost_equal(x_big_scaled, x_small_scaled)
+
+    x_big_centered = assert_warns_message(UserWarning, w, scale, x_big,
+                                          with_std=False)
+    assert_array_almost_equal(x_big_centered, np.zeros(10))
+    assert_array_almost_equal(x_big_centered, x_small_scaled)
+
+
 def test_scaler_2d_arrays():
     # Test scaling of 2d array along first axis
     rng = np.random.RandomState(0)
@@ -735,6 +776,7 @@ def test_one_hot_encoder_sparse():
     enc.fit([[0], [1]])
     assert_raises(ValueError, enc.transform, [[0], [-1]])
 
+
 def test_one_hot_encoder_dense():
     # check for sparse=False
     X = [[3, 2, 1], [0, 1, 1]]
@@ -828,7 +870,7 @@ def test_one_hot_encoder_unknown_transform():
     oh.fit(X)
     assert_array_equal(
         oh.transform(y).toarray(),
-        np.array([[ 0., 0., 0., 0., 1., 0., 0.]])
+        np.array([[0., 0., 0., 0., 1., 0., 0.]])
     )
 
     # Raise error if handle_unknown is neither ignore or error.
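For readers who want to see the new warnings from the user side, here is a hypothetical usage sketch mirroring the inputs of the test above; whether each call actually warns depends on the NumPy version, as the version check in the test suggests.

import warnings

import numpy as np
from sklearn.preprocessing import scale

# Inputs chosen to mirror the new test cases.
x_tiny_std = np.full(10, np.log(1e-5))   # near-zero std after centering
x_huge = np.full(10, 1e100)              # very large magnitudes

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    scale(x_tiny_std)   # may warn: standard deviation probably very close to 0
    scale(x_huge)       # may warn: dataset may contain too large values

for w in caught:
    print(w.category.__name__, w.message)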
