diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index bb245aa466152..5e7c89437a631 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -483,6 +483,10 @@ Changelog
   The `sample_interval_` attribute is deprecated and will be removed in 1.5.
   :pr:`25190` by :user:`Vincent Maladière `.
 
+- |Fix| :class:`PowerTransformer` no longer causes overflow for certain input data when
+  fitting with the Yeo-Johnson method.
+  :pr:`26188` by :user:`Laurent Sorber `.
+
 :mod:`sklearn.tree`
 ...................
 
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py
index deaf1422705e6..a41813ecf847a 100644
--- a/sklearn/preprocessing/_data.py
+++ b/sklearn/preprocessing/_data.py
@@ -3273,11 +3273,11 @@ def _yeo_johnson_inverse_transform(self, x, lmbda):
         if abs(lmbda) < np.spacing(1.0):
             x_inv[pos] = np.exp(x[pos]) - 1
         else:  # lmbda != 0
-            x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1
+            x_inv[pos] = np.exp(np.log1p(x[pos] * lmbda) / lmbda) - 1
 
         # when x < 0
         if abs(lmbda - 2) > np.spacing(1.0):
-            x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1, 1 / (2 - lmbda))
+            x_inv[~pos] = 1 - np.exp(np.log1p(-(2 - lmbda) * x[~pos]) / (2 - lmbda))
         else:  # lmbda == 2
             x_inv[~pos] = 1 - np.exp(-x[~pos])
 
@@ -3295,11 +3295,11 @@ def _yeo_johnson_transform(self, x, lmbda):
         if abs(lmbda) < np.spacing(1.0):
             out[pos] = np.log1p(x[pos])
         else:  # lmbda != 0
-            out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda
+            out[pos] = (np.exp(np.log1p(x[pos]) * lmbda) - 1) / lmbda
 
         # when x < 0
         if abs(lmbda - 2) > np.spacing(1.0):
-            out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
+            out[~pos] = -(np.exp(np.log1p(-x[~pos]) * (2 - lmbda)) - 1) / (2 - lmbda)
         else:  # lmbda == 2
             out[~pos] = -np.log1p(-x[~pos])
 
@@ -3340,6 +3340,11 @@ def _neg_log_likelihood(lmbda):
             loglike = -n_samples / 2 * log_var
             loglike += (lmbda - 1) * (np.sign(x) * np.log1p(np.abs(x))).sum()
 
+            # Regularize the exponents to avoid blowing them up for a marginal gain in
+            # log likelihood
+            x_trans_exp = np.log(np.abs(x_trans[x_trans != 0]))
+            loglike -= 1e-3 * np.sum(np.abs(x_trans_exp[np.isfinite(x_trans_exp)]))
+
             return -loglike
 
         # the computation of lambda is influenced by NaNs so we need to
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index b5b016e8289f9..0fdbed32514b3 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -2654,3 +2654,25 @@ def test_kernel_centerer_feature_names_out():
     names_out = centerer.get_feature_names_out()
     samples_out2 = X_pairwise.shape[1]
     assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(samples_out2)])
+
+
+def test_yeo_johnson_overflow():
+    """Test that Yeo-Johnson doesn't trigger overflow.
+
+    Non-regression test for https://github.com/scikit-learn/scikit-learn/issues/23319
+    """
+    x = np.array(
+        [2003.0, 1950.0, 1997.0, 2000.0, 2009.0, 2009.0, 1980.0, 1999.0, 2007.0, 1991.0]
+    )
+    with np.errstate(over="raise"):
+        # Attempt to trigger overflow in `x_trans_var = x_trans.var()`. This overflow is
+        # mitigated by replacing instances of np.power with np.exp.
+        pt = PowerTransformer(method="yeo-johnson")
+        X1_trans = pt.fit_transform(x[:, np.newaxis])
+        assert np.abs(X1_trans).max() < 3
+        # Attempt to trigger overflow in `out[pos] = (np.power(x[pos] + 1, lmbda) - 1) /
+        # lmbda`. Without regularization, the exponent of the transformed output could
+        # blow up for a marginal gain in log likelihood.
+        pt = PowerTransformer(method="yeo-johnson")
+        X2_trans = pt.fit_transform(x[:5, np.newaxis])
+        assert np.abs(X2_trans).max() < 3
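As a quick sanity check of the rewrite in `_yeo_johnson_transform`, here is a minimal standalone sketch (illustration only, not part of the patch): for `x > -1`, `np.power(x + 1, lmbda)` and `np.exp(np.log1p(x) * lmbda)` compute the same value, which is the identity the patched code relies on. The sample values and `lmbda` below are arbitrary.

```python
import numpy as np

# Illustration only: check that the power-based and exp/log1p-based
# formulations of the Yeo-Johnson transform agree for x >= 0.
x = np.array([2003.0, 1950.0, 1997.0, 2000.0])  # a few values from the test above
lmbda = 1.5  # arbitrary example lambda

power_form = (np.power(x + 1, lmbda) - 1) / lmbda
exp_form = (np.exp(np.log1p(x) * lmbda) - 1) / lmbda

# Both forms agree to floating-point precision on well-behaved inputs; the
# exp/log1p form keeps the exponentiation in log space.
assert np.allclose(power_form, exp_form)
```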