Skip to content

Commit 57a7222

Browse files
authored
MAINT Clean up deprecations for 1.5: in KBinsDiscretizer (#28817)
1 parent 311c6e2 commit 57a7222

File tree

7 files changed

+14
-59
lines changed

7 files changed

+14
-59
lines changed

examples/cluster/plot_face_compress.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,6 @@
8181
encode="ordinal",
8282
strategy="uniform",
8383
random_state=0,
84-
subsample=200_000,
8584
)
8685
compressed_raccoon_uniform = encoder.fit_transform(raccoon_face.reshape(-1, 1)).reshape(
8786
raccoon_face.shape
@@ -130,7 +129,6 @@
130129
encode="ordinal",
131130
strategy="kmeans",
132131
random_state=0,
133-
subsample=200_000,
134132
)
135133
compressed_raccoon_kmeans = encoder.fit_transform(raccoon_face.reshape(-1, 1)).reshape(
136134
raccoon_face.shape

examples/linear_model/plot_poisson_regression_non_normal_loss.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@
112112
("passthrough_numeric", "passthrough", ["BonusMalus"]),
113113
(
114114
"binned_numeric",
115-
KBinsDiscretizer(n_bins=10, subsample=int(2e5), random_state=0),
115+
KBinsDiscretizer(n_bins=10, random_state=0),
116116
["VehAge", "DrivAge"],
117117
),
118118
("log_scaled_numeric", log_scale_transformer, ["Density"]),

examples/linear_model/plot_tweedie_regression_insurance_claims.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ def score_estimator(
241241
[
242242
(
243243
"binned_numeric",
244-
KBinsDiscretizer(n_bins=10, subsample=int(2e5), random_state=0),
244+
KBinsDiscretizer(n_bins=10, random_state=0),
245245
["VehAge", "DrivAge"],
246246
),
247247
(

examples/preprocessing/plot_discretization_strategies.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,7 @@
7676
i += 1
7777
# transform the dataset with KBinsDiscretizer
7878
for strategy in strategies:
79-
enc = KBinsDiscretizer(
80-
n_bins=4, encode="ordinal", strategy=strategy, subsample=200_000
81-
)
79+
enc = KBinsDiscretizer(n_bins=4, encode="ordinal", strategy=strategy)
8280
enc.fit(X)
8381
grid_encoded = enc.transform(grid)
8482

sklearn/preprocessing/_discretization.py

Lines changed: 11 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
from ..base import BaseEstimator, TransformerMixin, _fit_context
1313
from ..utils import resample
14-
from ..utils._param_validation import Hidden, Interval, Options, StrOptions
14+
from ..utils._param_validation import Interval, Options, StrOptions
1515
from ..utils.stats import _weighted_percentile
1616
from ..utils.validation import (
1717
_check_feature_names_in,
@@ -64,10 +64,9 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator):
6464
6565
.. versionadded:: 0.24
6666
67-
subsample : int or None, default='warn'
67+
subsample : int or None, default=200_000
6868
Maximum number of samples, used to fit the model, for computational
69-
efficiency. Defaults to 200_000 when `strategy='quantile'` and to `None`
70-
when `strategy='uniform'` or `strategy='kmeans'`.
69+
efficiency.
7170
`subsample=None` means that all the training samples are used when
7271
computing the quantiles that determine the binning thresholds.
7372
Since quantile computation relies on sorting each column of `X` and
@@ -147,7 +146,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator):
147146
... [ 0, 3, -2, 0.5],
148147
... [ 1, 4, -1, 2]]
149148
>>> est = KBinsDiscretizer(
150-
... n_bins=3, encode='ordinal', strategy='uniform', subsample=None
149+
... n_bins=3, encode='ordinal', strategy='uniform'
151150
... )
152151
>>> est.fit(X)
153152
KBinsDiscretizer(...)
@@ -177,11 +176,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator):
177176
"encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})],
178177
"strategy": [StrOptions({"uniform", "quantile", "kmeans"})],
179178
"dtype": [Options(type, {np.float64, np.float32}), None],
180-
"subsample": [
181-
Interval(Integral, 1, None, closed="left"),
182-
None,
183-
Hidden(StrOptions({"warn"})),
184-
],
179+
"subsample": [Interval(Integral, 1, None, closed="left"), None],
185180
"random_state": ["random_state"],
186181
}
187182

@@ -192,7 +187,7 @@ def __init__(
192187
encode="onehot",
193188
strategy="quantile",
194189
dtype=None,
195-
subsample="warn",
190+
subsample=200_000,
196191
random_state=None,
197192
):
198193
self.n_bins = n_bins
@@ -243,24 +238,13 @@ def fit(self, X, y=None, sample_weight=None):
243238
f"{self.strategy!r} instead."
244239
)
245240

246-
if self.strategy in ("uniform", "kmeans") and self.subsample == "warn":
247-
warnings.warn(
248-
(
249-
"In version 1.5 onwards, subsample=200_000 "
250-
"will be used by default. Set subsample explicitly to "
251-
"silence this warning in the mean time. Set "
252-
"subsample=None to disable subsampling explicitly."
253-
),
254-
FutureWarning,
255-
)
256-
257-
subsample = self.subsample
258-
if subsample == "warn":
259-
subsample = 200000 if self.strategy == "quantile" else None
260-
if subsample is not None and n_samples > subsample:
241+
if self.subsample is not None and n_samples > self.subsample:
261242
# Take a subsample of `X`
262243
X = resample(
263-
X, replace=False, n_samples=subsample, random_state=self.random_state
244+
X,
245+
replace=False,
246+
n_samples=self.subsample,
247+
random_state=self.random_state,
264248
)
265249

266250
n_features = X.shape[1]

sklearn/preprocessing/tests/test_discretization.py

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,6 @@
4949
),
5050
],
5151
)
52-
# TODO(1.5) remove warning filter when kbd's subsample default is changed
53-
@pytest.mark.filterwarnings("ignore:In version 1.5 onwards, subsample=200_000")
5452
def test_fit_transform(strategy, expected, sample_weight):
5553
est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy)
5654
est.fit(X, sample_weight=sample_weight)
@@ -149,8 +147,6 @@ def test_invalid_n_bins_array():
149147
),
150148
],
151149
)
152-
# TODO(1.5) remove warning filter when kbd's subsample default is changed
153-
@pytest.mark.filterwarnings("ignore:In version 1.5 onwards, subsample=200_000")
154150
def test_fit_transform_n_bins_array(strategy, expected, sample_weight):
155151
est = KBinsDiscretizer(
156152
n_bins=[2, 3, 3, 3], encode="ordinal", strategy=strategy
@@ -176,8 +172,6 @@ def test_kbinsdiscretizer_effect_sample_weight():
176172
assert_allclose(est.transform(X), [[0.0], [1.0], [2.0], [2.0], [2.0], [2.0]])
177173

178174

179-
# TODO(1.5) remove warning filter when kbd's subsample default is changed
180-
@pytest.mark.filterwarnings("ignore:In version 1.5 onwards, subsample=200_000")
181175
@pytest.mark.parametrize("strategy", ["kmeans", "quantile"])
182176
def test_kbinsdiscretizer_no_mutating_sample_weight(strategy):
183177
"""Make sure that `sample_weight` is not changed in place."""
@@ -258,8 +252,6 @@ def test_encode_options():
258252
("quantile", [0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2], [0, 1, 2, 3, 4, 4]),
259253
],
260254
)
261-
# TODO(1.5) remove warning filter when kbd's subsample default is changed
262-
@pytest.mark.filterwarnings("ignore:In version 1.5 onwards, subsample=200_000")
263255
def test_nonuniform_strategies(
264256
strategy, expected_2bins, expected_3bins, expected_5bins
265257
):
@@ -313,8 +305,6 @@ def test_nonuniform_strategies(
313305
),
314306
],
315307
)
316-
# TODO(1.5) remove warning filter when kbd's subsample default is changed
317-
@pytest.mark.filterwarnings("ignore:In version 1.5 onwards, subsample=200_000")
318308
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
319309
def test_inverse_transform(strategy, encode, expected_inv):
320310
kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
@@ -323,8 +313,6 @@ def test_inverse_transform(strategy, encode, expected_inv):
323313
assert_array_almost_equal(expected_inv, Xinv)
324314

325315

326-
# TODO(1.5) remove warning filter when kbd's subsample default is changed
327-
@pytest.mark.filterwarnings("ignore:In version 1.5 onwards, subsample=200_000")
328316
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
329317
def test_transform_outside_fit_range(strategy):
330318
X = np.array([0, 1, 2, 3])[:, None]
@@ -490,14 +478,3 @@ def test_kbinsdiscretizer_subsample(strategy, global_random_seed):
490478
assert_allclose(
491479
kbd_subsampling.bin_edges_[0], kbd_no_subsampling.bin_edges_[0], rtol=1e-2
492480
)
493-
494-
495-
# TODO(1.5) remove this test
496-
@pytest.mark.parametrize("strategy", ["uniform", "kmeans"])
497-
def test_kbd_subsample_warning(strategy):
498-
# Check the future warning for the change of default of subsample
499-
X = np.random.RandomState(0).random_sample((100, 1))
500-
501-
kbd = KBinsDiscretizer(strategy=strategy, random_state=0)
502-
with pytest.warns(FutureWarning, match="subsample=200_000 will be used by default"):
503-
kbd.fit(X)

sklearn/preprocessing/tests/test_target_encoder.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -586,8 +586,6 @@ def test_invariance_of_encoding_under_label_permutation(smooth, global_random_se
586586
assert_allclose(X_test_encoded, X_test_permuted_encoded)
587587

588588

589-
# TODO(1.5) remove warning filter when kbd's subsample default is changed
590-
@pytest.mark.filterwarnings("ignore:In version 1.5 onwards, subsample=200_000")
591589
@pytest.mark.parametrize("smooth", [0.0, "auto"])
592590
def test_target_encoding_for_linear_regression(smooth, global_random_seed):
593591
# Check some expected statistical properties when fitting a linear

0 commit comments

Comments (0)