-
-
Notifications
You must be signed in to change notification settings - Fork 26k
Add sample_weight support for QuantileTransformer when fit on dense data #31147
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
841ed26
eb78c16
7d51a7d
0b0f0e6
06960cf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -10,6 +10,7 @@ | |||||||||
from scipy.special import boxcox, inv_boxcox | ||||||||||
|
||||||||||
from sklearn.utils import metadata_routing | ||||||||||
from sklearn.utils.stats import _averaged_weighted_percentile | ||||||||||
|
||||||||||
from ..base import ( | ||||||||||
BaseEstimator, | ||||||||||
|
@@ -2726,7 +2727,7 @@ def __init__( | |||||||||
self.random_state = random_state | ||||||||||
self.copy = copy | ||||||||||
|
||||||||||
def _dense_fit(self, X, random_state): | ||||||||||
def _dense_fit(self, X, random_state, sample_weight=None): | ||||||||||
"""Compute percentiles for dense matrices. | ||||||||||
|
||||||||||
Parameters | ||||||||||
|
@@ -2748,8 +2749,30 @@ def _dense_fit(self, X, random_state): | |||||||||
X = resample( | ||||||||||
X, replace=False, n_samples=self.subsample, random_state=random_state | ||||||||||
) | ||||||||||
if sample_weight is not None: | ||||||||||
sample_weight = sample_weight[: X.shape[0]] | ||||||||||
|
||||||||||
self.quantiles_ = np.zeros((len(references), n_features)) | ||||||||||
|
||||||||||
for i in range(n_features): | ||||||||||
col = X[:, i] | ||||||||||
mask = ~np.isnan(col) | ||||||||||
col_clean = col[mask] | ||||||||||
|
||||||||||
if col_clean.size == 0: | ||||||||||
self.quantiles_[:, i] = np.nan | ||||||||||
continue | ||||||||||
|
||||||||||
if sample_weight is not None: | ||||||||||
weights_clean = sample_weight[mask] | ||||||||||
self.quantiles_[:, i] = _averaged_weighted_percentile( | ||||||||||
col_clean, sample_weight=weights_clean, quantile=references / 100.0 | ||||||||||
) | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please check the source code of In particular:
|
||||||||||
else: | ||||||||||
self.quantiles_[:, i] = np.nanquantile( | ||||||||||
col_clean, references / 100.0, method="averaged_inverted_cdf" | ||||||||||
) | ||||||||||
|
||||||||||
self.quantiles_ = np.nanpercentile(X, references, axis=0) | ||||||||||
# Due to floating-point precision error in `np.nanpercentile`, | ||||||||||
# make sure that quantiles are monotonically increasing. | ||||||||||
# Upstream issue in numpy: | ||||||||||
|
@@ -2802,7 +2825,7 @@ def _sparse_fit(self, X, random_state): | |||||||||
self.quantiles_ = np.maximum.accumulate(self.quantiles_) | ||||||||||
|
||||||||||
@_fit_context(prefer_skip_nested_validation=True) | ||||||||||
def fit(self, X, y=None): | ||||||||||
def fit(self, X, y=None, sample_weight=None): | ||||||||||
"""Compute the quantiles used for transforming. | ||||||||||
|
||||||||||
Parameters | ||||||||||
|
@@ -2844,9 +2867,11 @@ def fit(self, X, y=None): | |||||||||
# Create the quantiles of reference | ||||||||||
self.references_ = np.linspace(0, 1, self.n_quantiles_, endpoint=True) | ||||||||||
if sparse.issparse(X): | ||||||||||
if sample_weight is not None: | ||||||||||
raise ValueError("sample_weight is not supported for sparse input.") | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||
self._sparse_fit(X, rng) | ||||||||||
else: | ||||||||||
self._dense_fit(X, rng) | ||||||||||
self._dense_fit(X, rng, sample_weight=sample_weight) | ||||||||||
|
||||||||||
return self | ||||||||||
|
||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1528,6 +1528,28 @@ def test_quantile_transformer_sorted_quantiles(array_type): | |
assert all(np.diff(quantiles) >= 0) | ||
|
||
|
||
def test_quantile_transformer_with_sample_weight(): | ||
rng = np.random.RandomState(42) | ||
X = rng.randn(100, 1) | ||
sample_weight = np.linspace(1, 2, X.shape[0]) # increasing weights | ||
|
||
qt_weighted = QuantileTransformer(n_quantiles=10, random_state=0) | ||
qt_unweighted = QuantileTransformer(n_quantiles=10, random_state=0) | ||
|
||
Xt_weighted = qt_weighted.fit_transform(X, sample_weight=sample_weight) | ||
Xt_unweighted = qt_unweighted.fit_transform(X) | ||
|
||
# Should not be equal | ||
with pytest.raises(AssertionError): | ||
np.testing.assert_allclose(Xt_weighted, Xt_unweighted) | ||
|
||
# Inverse transform round-trip check | ||
X_roundtrip = qt_weighted.inverse_transform(Xt_weighted) | ||
np.testing.assert_allclose( | ||
X[~np.isnan(X)], X_roundtrip[~np.isnan(X)], rtol=1e-2, atol=1e-2 | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this test is not necessary since there is already a common test
|
||
|
||
|
||
def test_robust_scaler_invalid_range(): | ||
for range_ in [ | ||
(-1, 90), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please rename `i` to `feature_idx` to make the code easier to follow.