diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py
index 74d7b1909c4e1..3d729de8e14c1 100644
--- a/sklearn/preprocessing/_data.py
+++ b/sklearn/preprocessing/_data.py
@@ -10,6 +10,7 @@
 from scipy.special import boxcox, inv_boxcox
 
 from sklearn.utils import metadata_routing
+from sklearn.utils.stats import _averaged_weighted_percentile
 
 from ..base import (
     BaseEstimator,
@@ -2726,7 +2727,7 @@ def __init__(
         self.random_state = random_state
         self.copy = copy
 
-    def _dense_fit(self, X, random_state):
+    def _dense_fit(self, X, random_state, sample_weight=None):
         """Compute percentiles for dense matrices.
 
         Parameters
@@ -2748,8 +2749,47 @@ def _dense_fit(self, X, random_state):
-            X = resample(
-                X, replace=False, n_samples=self.subsample, random_state=random_state
-            )
+            # `resample` draws one set of row indices for every array it is
+            # given, so each retained row keeps its own weight.
+            if sample_weight is None:
+                X = resample(
+                    X,
+                    replace=False,
+                    n_samples=self.subsample,
+                    random_state=random_state,
+                )
+            else:
+                X, sample_weight = resample(
+                    X,
+                    sample_weight,
+                    replace=False,
+                    n_samples=self.subsample,
+                    random_state=random_state,
+                )
+
+        if sample_weight is not None:
+            sample_weight = np.asarray(sample_weight)
+
+        # `references` holds percentile ranks on the 0-100 scale. Columns made
+        # only of NaN keep NaN quantiles.
+        self.quantiles_ = np.full((len(references), X.shape[1]), np.nan)
+        for i, col in enumerate(X.T):
+            mask = ~np.isnan(col)
+            if not mask.any():
+                continue
+            col_clean = col[mask]
+            if sample_weight is None:
+                self.quantiles_[:, i] = np.percentile(
+                    col_clean, references, method="averaged_inverted_cdf"
+                )
+            else:
+                weights_clean = sample_weight[mask]
+                # The weighted helper is evaluated one rank at a time through
+                # its `percentile_rank` argument (in percent).
+                self.quantiles_[:, i] = [
+                    _averaged_weighted_percentile(
+                        col_clean, weights_clean, percentile_rank=rank
+                    )
+                    for rank in references
+                ]
 
-        self.quantiles_ = np.nanpercentile(X, references, axis=0)
         # Due to floating-point precision error in `np.nanpercentile`,
         # make sure that quantiles are monotonically increasing.
         # Upstream issue in numpy:
@@ -2802,7 +2842,7 @@ def _sparse_fit(self, X, random_state):
         self.quantiles_ = np.maximum.accumulate(self.quantiles_)
 
     @_fit_context(prefer_skip_nested_validation=True)
-    def fit(self, X, y=None):
+    def fit(self, X, y=None, sample_weight=None):
         """Compute the quantiles used for transforming.
 
         Parameters
@@ -2844,9 +2884,11 @@ def fit(self, X, y=None):
         # Create the quantiles of reference
         self.references_ = np.linspace(0, 1, self.n_quantiles_, endpoint=True)
         if sparse.issparse(X):
+            if sample_weight is not None:
+                raise ValueError("sample_weight is not supported for sparse input.")
             self._sparse_fit(X, rng)
         else:
-            self._dense_fit(X, rng)
+            self._dense_fit(X, rng, sample_weight=sample_weight)
 
         return self
 
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index ac303a1c93e96..5521651ea846e 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -1528,6 +1528,38 @@ def test_quantile_transformer_sorted_quantiles(array_type):
     assert all(np.diff(quantiles) >= 0)
 
 
+def test_quantile_transformer_with_sample_weight():
+    rng = np.random.RandomState(42)
+    X = rng.randn(100, 1)
+    sample_weight = np.linspace(1, 2, X.shape[0])  # increasing weights
+
+    qt_weighted = QuantileTransformer(n_quantiles=10, random_state=0)
+    qt_unweighted = QuantileTransformer(n_quantiles=10, random_state=0)
+
+    Xt_weighted = qt_weighted.fit_transform(X, sample_weight=sample_weight)
+    Xt_unweighted = qt_unweighted.fit_transform(X)
+
+    # Non-uniform weights must change the learned mapping.
+    assert not np.allclose(Xt_weighted, Xt_unweighted)
+
+    # Inverse transform round-trip check
+    X_roundtrip = qt_weighted.inverse_transform(Xt_weighted)
+    np.testing.assert_allclose(X, X_roundtrip, rtol=1e-2, atol=1e-2)
+
+
+def test_quantile_transformer_sample_weight_subsample():
+    # Rows must keep their own weight when `subsample` drops some of them.
+    X = np.arange(200, dtype=np.float64).reshape(-1, 1)
+    sample_weight = (X.ravel() >= 100).astype(np.float64)
+
+    qt = QuantileTransformer(n_quantiles=11, subsample=100, random_state=0)
+    qt.fit(X, sample_weight=sample_weight)
+
+    # Only rows with values >= 100 carry weight, so the weighted median of any
+    # subsample containing at least one of them is at least 100.
+    assert qt.quantiles_[5, 0] >= 100
+
+
 def test_robust_scaler_invalid_range():
     for range_ in [
         (-1, 90),
diff --git a/sklearn/utils/_test_common/instance_generator.py b/sklearn/utils/_test_common/instance_generator.py
index e619deab1c93e..0d3b3e5fdd040 100644
--- a/sklearn/utils/_test_common/instance_generator.py
+++ b/sklearn/utils/_test_common/instance_generator.py
@@ -162,6 +162,7 @@
     StandardScaler,
     TargetEncoder,
 )
+from sklearn.preprocessing._data import QuantileTransformer
 from sklearn.random_projection import (
     GaussianRandomProjection,
     SparseRandomProjection,
@@ -1255,6 +1256,11 @@ def _yield_instances_for_check(check, estimator_orig):
             " tests are added for TunedThresholdClassifierCV specifically."
         ),
     },
+    QuantileTransformer: {
+        "check_sample_weight_equivalence_on_sparse_data": (
+            "QuantileTransformer does not yet support sample_weight on sparse data."
+        ),
+    },
 }
 
 # TODO: remove when scipy min version >= 1.11