
Commit 574ebfd

Merge pull request #4436 from ogrisel/rebased-pr-3747
[MRG+2] FIX make StandardScaler & scale more numerically stable
2 parents: 12b2f16 + b596dfc

4 files changed: +79, -2 lines changed


doc/whats_new.rst

+5
@@ -229,6 +229,9 @@ Enhancements
    - Significant performance and memory usage improvements in
      :class:`preprocessing.PolynomialFeatures`. By `Eric Martin`_.
 
+   - Numerical stability improvements for :class:`preprocessing.StandardScaler`
+     and :func:`preprocessing.scale`. By `Nicolas Goix`_
+
 Documentation improvements
 ..........................
 
@@ -3376,3 +3379,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 .. _Dan Blanchard: https://github.com/dan-blanchard
 
 .. _Eric Martin: http://ericmart.in
+
+.. _Nicolas Goix: https://webperso.telecom-paristech.fr/front/frontoffice.php?SP_ID=241

sklearn/preprocessing/_weights.py

+1
@@ -1,6 +1,7 @@
 import numpy as np
 from ..utils.fixes import bincount
 
+
 def _balance_weights(y):
     """Compute sample weights such that the class distribution of y becomes
     balanced.

sklearn/preprocessing/data.py

+30-1
@@ -7,6 +7,7 @@
 
 from itertools import chain, combinations
 import numbers
+import warnings
 
 import numpy as np
 from scipy import sparse
@@ -18,6 +19,7 @@
 from ..utils.extmath import row_norms
 from ..utils.fixes import (combinations_with_replacement as combinations_w_r,
                            bincount)
+from ..utils.fixes import isclose
 from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
                                       inplace_csr_row_normalize_l2)
 from ..utils.sparsefuncs import (inplace_column_scale, mean_variance_axis)
@@ -57,7 +59,7 @@ def _mean_and_std(X, axis=0, with_mean=True, with_std=True):
     if with_std:
         std_ = Xr.std(axis=0)
         if isinstance(std_, np.ndarray):
-            std_[std_ == 0.0] = 1.0
+            std_[std_ == 0.] = 1.0
         elif std_ == 0.:
             std_ = 1.
     else:
@@ -141,8 +143,35 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
         Xr = np.rollaxis(X, axis)
         if with_mean:
             Xr -= mean_
+            mean_1 = Xr.mean(axis=0)
+            # Verify that mean_1 is 'close to zero'. If X contains very
+            # large values, mean_1 can also be very large, due to a lack of
+            # precision of mean_. In this case, a pre-scaling of the
+            # concerned feature is efficient, for instance by its mean or
+            # maximum.
+            if not np.allclose(mean_1, 0):
+                warnings.warn("Numerical issues were encountered "
+                              "when centering the data "
+                              "and might not be solved. Dataset may "
+                              "contain too large values. You may need "
+                              "to prescale your features.")
+                Xr -= mean_1
         if with_std:
             Xr /= std_
+            if with_mean:
+                mean_2 = Xr.mean(axis=0)
+                # If mean_2 is not 'close to zero', it comes from the fact that
+                # std_ is very small so that mean_2 = mean_1/std_ > 0, even if
+                # mean_1 was close to zero. The problem is thus essentially due
+                # to the lack of precision of mean_. A solution is then to
+                # subtract the mean again:
+                if not np.allclose(mean_2, 0):
+                    warnings.warn("Numerical issues were encountered "
+                                  "when scaling the data "
+                                  "and might not be solved. The standard "
+                                  "deviation of the data is probably "
+                                  "very close to 0. ")
+                    Xr -= mean_2
     return X
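For illustration only, here is a minimal standalone sketch of the failure mode this hunk guards against and of the extra centering passes it adds. The sample values and variable names are assumptions made for the example, not code from the pull request, and whether the residuals are exactly zero depends on the NumPy version and platform.

import numpy as np

# Illustrative data: ten identical float64 values whose mean cannot be
# recovered exactly, so naive centering can leave a tiny non-zero residual.
x = np.full(10, np.log(1e-5))

mean_ = x.mean()
std_ = x.std()
if std_ == 0.0:                # same guard as in _mean_and_std
    std_ = 1.0

xr = x - mean_
mean_1 = xr.mean()
if not np.allclose(mean_1, 0):
    # With very large inputs, the centering residual is large in absolute
    # terms; the patch warns and subtracts it before scaling.
    xr -= mean_1

xr /= std_
mean_2 = xr.mean()
if not np.allclose(mean_2, 0):
    # Dividing by a near-zero std_ can amplify the residual again;
    # the patch warns and re-centers one more time.
    xr -= mean_2

print(np.allclose(xr.mean(), 0))   # expected to print True after the extra passes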

sklearn/preprocessing/tests/test_data.py

+43-1
@@ -2,6 +2,7 @@
 import numpy as np
 import numpy.linalg as la
 from scipy import sparse
+from distutils.version import LooseVersion
 
 from sklearn.utils.testing import assert_almost_equal, clean_warning_registry
 from sklearn.utils.testing import assert_array_almost_equal
@@ -14,6 +15,7 @@
 from sklearn.utils.testing import assert_true
 from sklearn.utils.testing import assert_false
 from sklearn.utils.testing import assert_warns_message
+from sklearn.utils.testing import assert_no_warnings
 
 from sklearn.utils.sparsefuncs import mean_variance_axis
 from sklearn.preprocessing.data import _transform_selected
@@ -99,6 +101,45 @@ def test_scaler_1d():
     assert_array_equal(scale(X, with_mean=False), X)
 
 
+def test_standard_scaler_numerical_stability():
+    """Test numerical stability of scaling"""
+    # np.log(1e-5) is taken because its floating point representation
+    # was empirically found to cause numerical problems with np.mean & np.std.
+
+    x = np.zeros(8, dtype=np.float64) + np.log(1e-5, dtype=np.float64)
+    if LooseVersion(np.__version__) >= LooseVersion('1.9'):
+        # This does not raise a warning as the number of samples is too low
+        # to trigger the problem in recent numpy
+        x_scaled = assert_no_warnings(scale, x)
+        assert_array_almost_equal(scale(x), np.zeros(8))
+    else:
+        w = "standard deviation of the data is probably very close to 0"
+        x_scaled = assert_warns_message(UserWarning, w, scale, x)
+        assert_array_almost_equal(x_scaled, np.zeros(8))
+
+    # with 2 more samples, the std computation runs into numerical issues:
+    x = np.zeros(10, dtype=np.float64) + np.log(1e-5, dtype=np.float64)
+    w = "standard deviation of the data is probably very close to 0"
+    x_scaled = assert_warns_message(UserWarning, w, scale, x)
+    assert_array_almost_equal(x_scaled, np.zeros(10))
+
+    x = np.ones(10, dtype=np.float64) * 1e-100
+    x_small_scaled = assert_no_warnings(scale, x)
+    assert_array_almost_equal(x_small_scaled, np.zeros(10))
+
+    # Large values can cause (often recoverable) numerical stability issues:
+    x_big = np.ones(10, dtype=np.float64) * 1e100
+    w = "Dataset may contain too large values"
+    x_big_scaled = assert_warns_message(UserWarning, w, scale, x_big)
+    assert_array_almost_equal(x_big_scaled, np.zeros(10))
+    assert_array_almost_equal(x_big_scaled, x_small_scaled)
+
+    x_big_centered = assert_warns_message(UserWarning, w, scale, x_big,
+                                          with_std=False)
+    assert_array_almost_equal(x_big_centered, np.zeros(10))
+    assert_array_almost_equal(x_big_centered, x_small_scaled)
+
+
 def test_scaler_2d_arrays():
     # Test scaling of 2d array along first axis
     rng = np.random.RandomState(0)
@@ -735,6 +776,7 @@ def test_one_hot_encoder_sparse():
     enc.fit([[0], [1]])
     assert_raises(ValueError, enc.transform, [[0], [-1]])
 
+
 def test_one_hot_encoder_dense():
     # check for sparse=False
     X = [[3, 2, 1], [0, 1, 1]]
@@ -828,7 +870,7 @@ def test_one_hot_encoder_unknown_transform():
     oh.fit(X)
     assert_array_equal(
         oh.transform(y).toarray(),
-        np.array([[ 0., 0., 0., 0., 1., 0., 0.]])
+        np.array([[0., 0., 0., 0., 1., 0., 0.]])
     )
 
     # Raise error if handle_unknown is neither ignore or error.
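For readers who want to see the new warnings from the user side, here is a hypothetical usage sketch mirroring the inputs of the test above; whether each call actually warns depends on the NumPy version, as the version check in the test suggests.

import warnings

import numpy as np
from sklearn.preprocessing import scale

# Inputs chosen to mirror the new test cases.
x_tiny_std = np.full(10, np.log(1e-5))   # near-zero std after centering
x_huge = np.full(10, 1e100)              # very large magnitudes

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    scale(x_tiny_std)   # may warn: standard deviation probably very close to 0
    scale(x_huge)       # may warn: dataset may contain too large values

for w in caught:
    print(w.category.__name__, w.message)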
