diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index be894774f5a27..2b108d2f0e197 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -288,12 +288,19 @@ Changelog
   :user:`Clifford Akai-Nettey`.
 
 :mod:`sklearn.calibration`
-............................
+..........................
 
 - |Fix| The predict and predict_proba methods of
-  :class:`calibration.CalibratedClassifierCV can now properly be used on
+  :class:`calibration.CalibratedClassifierCV` can now properly be used on
   prefitted pipelines. :pr:`19641` by :user:`Alek Lefebvre <AlekLefebvre>`
 
+:mod:`sklearn.utils`
+....................
+
+- |Fix| Fixed a bug in :func:`utils.sparsefuncs.mean_variance_axis` where the
+  precision of the computed variance was very poor when the real variance is
+  exactly zero. :pr:`19766` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
 
 Code and Documentation Contributors
 -----------------------------------
diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx
index e89599918ec5e..4a84c03eff86b 100644
--- a/sklearn/utils/sparsefuncs_fast.pyx
+++ b/sklearn/utils/sparsefuncs_fast.pyx
@@ -124,23 +124,32 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
     variances = np.zeros_like(means, dtype=dtype)
 
     cdef:
-        np.ndarray[floating, ndim=1] sum_weights = \
-            np.full(fill_value=np.sum(weights), shape=n_features, dtype=dtype)
-        np.ndarray[floating, ndim=1] sum_weights_nan = \
-            np.zeros(shape=n_features, dtype=dtype)
-        np.ndarray[floating, ndim=1] sum_weights_nz = \
-            np.zeros(shape=n_features, dtype=dtype)
+        np.ndarray[floating, ndim=1] sum_weights = np.full(
+            fill_value=np.sum(weights), shape=n_features, dtype=dtype)
+        np.ndarray[floating, ndim=1] sum_weights_nz = np.zeros(
+            shape=n_features, dtype=dtype)
+
+        np.ndarray[np.uint64_t, ndim=1] counts = np.full(
+            fill_value=weights.shape[0], shape=n_features, dtype=np.uint64)
+        np.ndarray[np.uint64_t, ndim=1] counts_nz = np.zeros(
+            shape=n_features, dtype=np.uint64)
 
     for row_ind in range(len(X_indptr) - 1):
         for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]):
             col_ind = X_indices[i]
             if not isnan(X_data[i]):
                 means[col_ind] += (X_data[i] * weights[row_ind])
+                # sum of weights where X[:, col_ind] is non-zero
+                sum_weights_nz[col_ind] += weights[row_ind]
+                # number of non-zero elements of X[:, col_ind]
+                counts_nz[col_ind] += 1
             else:
-                sum_weights_nan[col_ind] += weights[row_ind]
+                # sum of weights where X[:, col_ind] is not nan
+                sum_weights[col_ind] -= weights[row_ind]
+                # number of non nan elements of X[:, col_ind]
+                counts[col_ind] -= 1
 
     for i in range(n_features):
-        sum_weights[i] -= sum_weights_nan[i]
         means[i] /= sum_weights[i]
 
     for row_ind in range(len(X_indptr) - 1):
@@ -149,10 +158,12 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
             if not isnan(X_data[i]):
                 diff = X_data[i] - means[col_ind]
                 variances[col_ind] += diff * diff * weights[row_ind]
-                sum_weights_nz[col_ind] += weights[row_ind]
 
     for i in range(n_features):
-        variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2
+        if counts[i] != counts_nz[i]:
+            # only compute it when it's guaranteed to be non-zero to avoid
+            # catastrophic cancellation.
+            variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2
         variances[i] /= sum_weights[i]
 
     return means, variances, sum_weights
 
@@ -228,23 +239,32 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
     variances = np.zeros_like(means, dtype=dtype)
 
     cdef:
-        np.ndarray[floating, ndim=1] sum_weights = \
-            np.full(fill_value=np.sum(weights), shape=n_features, dtype=dtype)
-        np.ndarray[floating, ndim=1] sum_weights_nan = \
-            np.zeros(shape=n_features, dtype=dtype)
-        np.ndarray[floating, ndim=1] sum_weights_nz = \
-            np.zeros(shape=n_features, dtype=dtype)
+        np.ndarray[floating, ndim=1] sum_weights = np.full(
+            fill_value=np.sum(weights), shape=n_features, dtype=dtype)
+        np.ndarray[floating, ndim=1] sum_weights_nz = np.zeros(
+            shape=n_features, dtype=dtype)
+
+        np.ndarray[np.uint64_t, ndim=1] counts = np.full(
+            fill_value=weights.shape[0], shape=n_features, dtype=np.uint64)
+        np.ndarray[np.uint64_t, ndim=1] counts_nz = np.zeros(
+            shape=n_features, dtype=np.uint64)
 
     for col_ind in range(n_features):
         for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]):
             row_ind = X_indices[i]
             if not isnan(X_data[i]):
                 means[col_ind] += (X_data[i] * weights[row_ind])
+                # sum of weights where X[:, col_ind] is non-zero
+                sum_weights_nz[col_ind] += weights[row_ind]
+                # number of non-zero elements of X[:, col_ind]
+                counts_nz[col_ind] += 1
             else:
-                sum_weights_nan[col_ind] += weights[row_ind]
+                # sum of weights where X[:, col_ind] is not nan
+                sum_weights[col_ind] -= weights[row_ind]
+                # number of non nan elements of X[:, col_ind]
+                counts[col_ind] -= 1
 
     for i in range(n_features):
-        sum_weights[i] -= sum_weights_nan[i]
         means[i] /= sum_weights[i]
 
     for col_ind in range(n_features):
@@ -253,10 +273,12 @@
             if not isnan(X_data[i]):
                 diff = X_data[i] - means[col_ind]
                 variances[col_ind] += diff * diff * weights[row_ind]
-                sum_weights_nz[col_ind] += weights[row_ind]
 
     for i in range(n_features):
-        variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2
+        if counts[i] != counts_nz[i]:
+            # only compute it when it's guaranteed to be non-zero to avoid
+            # catastrophic cancellation.
+            variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2
         variances[i] /= sum_weights[i]
 
     return means, variances, sum_weights
diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py
index 8366aabd751ad..8b087145c3d36 100644
--- a/sklearn/utils/tests/test_sparsefuncs.py
+++ b/sklearn/utils/tests/test_sparsefuncs.py
@@ -53,6 +53,26 @@ def test_mean_variance_axis0():
     assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
 
 
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+@pytest.mark.parametrize("sparse_constructor", [sp.csr_matrix, sp.csc_matrix])
+def test_mean_variance_axis0_precision(dtype, sparse_constructor):
+    # Check that there's no big loss of precision when the real variance is
+    # exactly 0. (#19766)
+    rng = np.random.RandomState(0)
+    X = np.full(fill_value=100., shape=(1000, 1), dtype=dtype)
+    # Add some missing records which should be ignored:
+    missing_indices = rng.choice(np.arange(X.shape[0]), 10, replace=False)
+    X[missing_indices, 0] = np.nan
+    X = sparse_constructor(X)
+
+    # Random positive weights:
+    sample_weight = rng.rand(X.shape[0]).astype(dtype)
+
+    _, var = mean_variance_axis(X, weights=sample_weight, axis=0)
+
+    assert var < np.finfo(dtype).eps
+
+
 def test_mean_variance_axis1():
     X, _ = make_classification(5, 4, random_state=0)
     # Sparsify the array a little bit
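
Why the patch gates the correction term on `counts[i] != counts_nz[i]`: the variance of a
sparse column is accumulated in two parts, the contribution of the explicitly stored values
(`diff * diff * weights[row_ind]`) and the contribution of the implicit zeros
(`(sum_weights[i] - sum_weights_nz[i]) * means[i]**2`). When a column has no implicit zeros,
`sum_weights[i]` and `sum_weights_nz[i]` are mathematically equal, but they are accumulated
in different orders, so their floating-point difference is a tiny non-zero residual; multiplied
by `means[i]**2` it can dwarf a true variance of exactly zero. The `uint64` counts are exact,
so comparing them detects the "no implicit zeros" case reliably. The following self-contained
NumPy sketch reproduces the effect outside the library; the names `W`, `W_nz`, `var_naive`
and `var_fixed` are ad hoc for illustration, not scikit-learn API.

    import numpy as np

    # A dense column of constant value 100 in float32 with random positive
    # weights, mirroring the new test above: the true variance is exactly 0.
    rng = np.random.RandomState(0)
    w = rng.rand(1000).astype(np.float32)
    x = np.full(1000, 100., dtype=np.float32)

    # Weighted mean of the column.
    mean = np.sum(x * w) / np.sum(w)

    # W is accumulated over the whole weight vector at once, W_nz entry by
    # entry while walking the stored values, as the Cython code does:
    # mathematically equal, but rounded differently.
    W = np.sum(w)
    W_nz = np.float32(0.)
    for wi in w:
        W_nz += wi

    explicit = np.sum(w * (x - mean) ** 2)

    # Old formula: always add the implicit-zeros term (W - W_nz) * mean**2.
    var_naive = (explicit + (W - W_nz) * mean ** 2) / W
    # Fixed formula: the exact integer counts show there are no implicit
    # zeros, so the correction term is skipped entirely.
    var_fixed = explicit / W

    print(W - W_nz)   # tiny but generally non-zero residual
    print(var_naive)  # typically orders of magnitude above float32 eps
    print(var_fixed)  # ~0, below np.finfo(np.float32).eps

The same reasoning explains why the patch tracks `counts`/`counts_nz` as exact `uint64`
integers instead of comparing the two float sums directly: the integer comparison cannot
suffer rounding.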