
MNT Avoid catastrophic cancellation in mean_variance_axis #19766


Merged
doc/whats_new/v1.0.rst (11 changes: 9 additions & 2 deletions)
@@ -288,12 +288,19 @@ Changelog
   :user:`Clifford Akai-Nettey<cliffordEmmanuel>`.
 
 :mod:`sklearn.calibration`
-............................
+..........................
 
 - |Fix| The predict and predict_proba methods of
-  :class:`calibration.CalibratedClassifierCV can now properly be used on
+  :class:`calibration.CalibratedClassifierCV` can now properly be used on
   prefitted pipelines. :pr:`19641` by :user:`Alek Lefebvre <AlekLefebvre>`
 
+:mod:`sklearn.utils`
+....................
+
+- |Fix| Fixed a bug in :func:`utils.sparsefuncs.mean_variance_axis` where the
+  precision of the computed variance was very poor when the real variance is
+  exactly zero. :pr:`19766` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
 Code and Documentation Contributors
 -----------------------------------
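To see the failure mode this changelog entry describes: the variance correction for a column's implicit zeros multiplies the difference of two floating-point weight sums by the squared mean, and when those sums should be equal, their rounding residual gets amplified. A standalone NumPy illustration of this effect (not code from this PR; the numbers are only indicative):

    import numpy as np

    rng = np.random.RandomState(0)
    weights = rng.rand(1000).astype(np.float32)

    # Two mathematically identical totals, accumulated in different orders:
    total = np.sum(weights)      # NumPy's pairwise summation
    running = np.float32(0.0)
    for w in weights:            # sequential accumulation, as in a tight loop
        running += w

    residual = total - running   # usually a tiny nonzero rounding residual
    mean = np.float32(100.0)

    # Multiplying the residual by mean**2 turns rounding noise into a
    # visibly wrong "variance" for a column whose true variance is 0.
    print(residual, residual * mean**2)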
sklearn/utils/sparsefuncs_fast.pyx (62 changes: 42 additions & 20 deletions)
@@ -124,23 +124,32 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
     variances = np.zeros_like(means, dtype=dtype)
 
     cdef:
-        np.ndarray[floating, ndim=1] sum_weights = \
-            np.full(fill_value=np.sum(weights), shape=n_features, dtype=dtype)
-        np.ndarray[floating, ndim=1] sum_weights_nan = \
-            np.zeros(shape=n_features, dtype=dtype)
-        np.ndarray[floating, ndim=1] sum_weights_nz = \
-            np.zeros(shape=n_features, dtype=dtype)
+        np.ndarray[floating, ndim=1] sum_weights = np.full(
+            fill_value=np.sum(weights), shape=n_features, dtype=dtype)
+        np.ndarray[floating, ndim=1] sum_weights_nz = np.zeros(
+            shape=n_features, dtype=dtype)
+
--- Review thread on the `counts` declaration below ---

thomasjpfan (Member): A more explicit name:

Suggested change:
-        np.ndarray[np.uint64_t, ndim=1] counts = np.full(
+        np.ndarray[np.uint64_t, ndim=1] counts_nan = np.full(

jeremiedbb (Member, Author):
    sum_weights is the sum of all weights where X is not nan
    sum_weights_nan is the sum of weights where X is nan
    sum_weights_nz is the sum of weights where X is non-zero

Following the same scheme:
    counts is the number of elements which are not nan
    counts_nz is the number of elements which are non-zero

I'd rather keep that. Maybe you missed that the increment is negative
(counts[col_ind] -= 1). Let me try to reorder the code so that the match is
clearer (and remove sum_weights_nan; we actually need only 2 out of the 3
arrays).

thomasjpfan (Member, Mar 30, 2021): Yes, I was mistaken with the suggestion.
I was thinking about the negation of it, so the suggestion should have been
counts_non_nan. If we do not change the name, I think we should at least put
a comment above counts to say that it is the number of elements which are
not nan.

jeremiedbb (Member, Author): I added comments to describe the different
arrays.

--- End of review thread ---
+        np.ndarray[np.uint64_t, ndim=1] counts = np.full(
+            fill_value=weights.shape[0], shape=n_features, dtype=np.uint64)
+        np.ndarray[np.uint64_t, ndim=1] counts_nz = np.zeros(
+            shape=n_features, dtype=np.uint64)
 
     for row_ind in range(len(X_indptr) - 1):
         for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]):
             col_ind = X_indices[i]
             if not isnan(X_data[i]):
                 means[col_ind] += (X_data[i] * weights[row_ind])
+                # sum of weights where X[:, col_ind] is non-zero
+                sum_weights_nz[col_ind] += weights[row_ind]
+                # number of non-zero elements of X[:, col_ind]
+                counts_nz[col_ind] += 1
             else:
-                sum_weights_nan[col_ind] += weights[row_ind]
+                # sum of weights where X[:, col_ind] is not nan
+                sum_weights[col_ind] -= weights[row_ind]
+                # number of non nan elements of X[:, col_ind]
+                counts[col_ind] -= 1
 
     for i in range(n_features):
-        sum_weights[i] -= sum_weights_nan[i]
         means[i] /= sum_weights[i]
 
     for row_ind in range(len(X_indptr) - 1):
@@ -149,10 +158,12 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
             if not isnan(X_data[i]):
                 diff = X_data[i] - means[col_ind]
                 variances[col_ind] += diff * diff * weights[row_ind]
-                sum_weights_nz[col_ind] += weights[row_ind]
 
     for i in range(n_features):
-        variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2
+        if counts[i] != counts_nz[i]:
+            # only compute it when it's guaranteed to be non-zero to avoid
+            # catastrophic cancellation.
+            variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2
         variances[i] /= sum_weights[i]
 
     return means, variances, sum_weights
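To make the bookkeeping in the diff above (and the naming scheme discussed in the review thread) easier to follow outside Cython, here is a rough pure-Python sketch of the fixed per-column logic. The function name and argument layout are invented for illustration; only the variable names mirror the Cython code:

    import numpy as np

    def column_mean_var(values, rows, weights, n_samples):
        # values: explicitly stored entries of one sparse column (may hold NaN)
        # rows:   row index of each stored entry; weights: one weight per row
        sum_weights = weights.sum()  # will become the total non-NaN weight
        counts = n_samples           # will become the number of non-NaN elements
        sum_weights_nz = 0.0         # total weight of the stored entries
        counts_nz = 0                # number of stored entries
        mean = 0.0

        for v, r in zip(values, rows):
            if np.isnan(v):
                sum_weights -= weights[r]   # NaNs are excluded entirely
                counts -= 1
            else:
                mean += v * weights[r]
                sum_weights_nz += weights[r]
                counts_nz += 1
        mean /= sum_weights

        var = 0.0
        for v, r in zip(values, rows):
            if not np.isnan(v):
                var += weights[r] * (v - mean) ** 2

        # The integer counts say *exactly* whether implicit zeros exist, so the
        # correction term is added only when it is truly non-zero; a rounding
        # residual in (sum_weights - sum_weights_nz) can no longer leak in.
        if counts != counts_nz:
            var += (sum_weights - sum_weights_nz) * mean ** 2
        return mean, var / sum_weights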
@@ -228,23 +239,32 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
     variances = np.zeros_like(means, dtype=dtype)
 
     cdef:
-        np.ndarray[floating, ndim=1] sum_weights = \
-            np.full(fill_value=np.sum(weights), shape=n_features, dtype=dtype)
-        np.ndarray[floating, ndim=1] sum_weights_nan = \
-            np.zeros(shape=n_features, dtype=dtype)
-        np.ndarray[floating, ndim=1] sum_weights_nz = \
-            np.zeros(shape=n_features, dtype=dtype)
+        np.ndarray[floating, ndim=1] sum_weights = np.full(
+            fill_value=np.sum(weights), shape=n_features, dtype=dtype)
+        np.ndarray[floating, ndim=1] sum_weights_nz = np.zeros(
+            shape=n_features, dtype=dtype)
+
+        np.ndarray[np.uint64_t, ndim=1] counts = np.full(
+            fill_value=weights.shape[0], shape=n_features, dtype=np.uint64)
+        np.ndarray[np.uint64_t, ndim=1] counts_nz = np.zeros(
+            shape=n_features, dtype=np.uint64)
 
     for col_ind in range(n_features):
         for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]):
             row_ind = X_indices[i]
             if not isnan(X_data[i]):
                 means[col_ind] += (X_data[i] * weights[row_ind])
+                # sum of weights where X[:, col_ind] is non-zero
+                sum_weights_nz[col_ind] += weights[row_ind]
+                # number of non-zero elements of X[:, col_ind]
+                counts_nz[col_ind] += 1
             else:
-                sum_weights_nan[col_ind] += weights[row_ind]
+                # sum of weights where X[:, col_ind] is not nan
+                sum_weights[col_ind] -= weights[row_ind]
+                # number of non nan elements of X[:, col_ind]
+                counts[col_ind] -= 1
 
     for i in range(n_features):
-        sum_weights[i] -= sum_weights_nan[i]
         means[i] /= sum_weights[i]
 
     for col_ind in range(n_features):
@@ -253,10 +273,12 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
             if not isnan(X_data[i]):
                 diff = X_data[i] - means[col_ind]
                 variances[col_ind] += diff * diff * weights[row_ind]
-                sum_weights_nz[col_ind] += weights[row_ind]
 
     for i in range(n_features):
-        variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2
+        if counts[i] != counts_nz[i]:
+            # only compute it when it's guaranteed to be non-zero to avoid
+            # catastrophic cancellation.
+            variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2
         variances[i] /= sum_weights[i]
 
     return means, variances, sum_weights
sklearn/utils/tests/test_sparsefuncs.py (20 changes: 20 additions & 0 deletions)
@@ -53,6 +53,26 @@ def test_mean_variance_axis0():
     assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
 
 
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+@pytest.mark.parametrize("sparse_constructor", [sp.csr_matrix, sp.csc_matrix])
+def test_mean_variance_axis0_precision(dtype, sparse_constructor):
+    # Check that there's no big loss of precision when the real variance is
+    # exactly 0. (#19766)
+    rng = np.random.RandomState(0)
+    X = np.full(fill_value=100., shape=(1000, 1), dtype=dtype)
+    # Add some missing records which should be ignored:
+    missing_indices = rng.choice(np.arange(X.shape[0]), 10, replace=False)
+    X[missing_indices, 0] = np.nan
+    X = sparse_constructor(X)
+
+    # Random positive weights:
+    sample_weight = rng.rand(X.shape[0]).astype(dtype)
+
+    _, var = mean_variance_axis(X, weights=sample_weight, axis=0)
+
+    assert var < np.finfo(dtype).eps
+
+
 def test_mean_variance_axis1():
     X, _ = make_classification(5, 4, random_state=0)
     # Sparsify the array a little bit
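For reference, the behaviour the new test locks in can also be observed through the public helper it exercises; a small standalone usage sketch (output values are illustrative):

    import numpy as np
    import scipy.sparse as sp
    from sklearn.utils.sparsefuncs import mean_variance_axis

    rng = np.random.RandomState(0)
    X = sp.csr_matrix(np.full((1000, 1), 100.0, dtype=np.float32))
    weights = rng.rand(1000).astype(np.float32)

    _, var = mean_variance_axis(X, axis=0, weights=weights)

    # The column is constant, so its true variance is exactly 0. With the
    # guarded correction term the result stays below float32 eps instead of
    # being inflated by the cancellation described in the changelog entry.
    print(var, var < np.finfo(np.float32).eps)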