Skip to content

Commit 57a7222

Browse files
authored
MAINT Clean up deprecations for 1.5: in KBinsDiscretizer (#28817)
1 parent 311c6e2 commit 57a7222

File tree

7 files changed

+14
-59
lines changed

7 files changed

+14
-59
lines changed

examples/cluster/plot_face_compress.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,6 @@
8181
encode="ordinal",
8282
strategy="uniform",
8383
random_state=0,
84-
subsample=200_000,
8584
)
8685
compressed_raccoon_uniform = encoder.fit_transform(raccoon_face.reshape(-1, 1)).reshape(
8786
raccoon_face.shape
@@ -130,7 +129,6 @@
130129
encode="ordinal",
131130
strategy="kmeans",
132131
random_state=0,
133-
subsample=200_000,
134132
)
135133
compressed_raccoon_kmeans = encoder.fit_transform(raccoon_face.reshape(-1, 1)).reshape(
136134
raccoon_face.shape

examples/linear_model/plot_poisson_regression_non_normal_loss.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@
112112
("passthrough_numeric", "passthrough", ["BonusMalus"]),
113113
(
114114
"binned_numeric",
115-
KBinsDiscretizer(n_bins=10, subsample=int(2e5), random_state=0),
115+
KBinsDiscretizer(n_bins=10, random_state=0),
116116
["VehAge", "DrivAge"],
117117
),
118118
("log_scaled_numeric", log_scale_transformer, ["Density"]),

examples/linear_model/plot_tweedie_regression_insurance_claims.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ def score_estimator(
241241
[
242242
(
243243
"binned_numeric",
244-
KBinsDiscretizer(n_bins=10, subsample=int(2e5), random_state=0),
244+
KBinsDiscretizer(n_bins=10, random_state=0),
245245
["VehAge", "DrivAge"],
246246
),
247247
(

examples/preprocessing/plot_discretization_strategies.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,7 @@
7676
i += 1
7777
# transform the dataset with KBinsDiscretizer
7878
for strategy in strategies:
79-
enc = KBinsDiscretizer(
80-
n_bins=4, encode="ordinal", strategy=strategy, subsample=200_000
81-
)
79+
enc = KBinsDiscretizer(n_bins=4, encode="ordinal", strategy=strategy)
8280
enc.fit(X)
8381
grid_encoded = enc.transform(grid)
8482

sklearn/preprocessing/_discretization.py

Lines changed: 11 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
from ..base import BaseEstimator, TransformerMixin, _fit_context
1313
from ..utils import resample
14-
from ..utils._param_validation import Hidden, Interval, Options, StrOptions
14+
from ..utils._param_validation import Interval, Options, StrOptions
1515
from ..utils.stats import _weighted_percentile
1616
from ..utils.validation import (
1717
_check_feature_names_in,
@@ -64,10 +64,9 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator):
6464
6565
.. versionadded:: 0.24
6666
67-
subsample : int or None, default='warn'
67+
subsample : int or None, default=200_000
6868
Maximum number of samples, used to fit the model, for computational
69-
efficiency. Defaults to 200_000 when `strategy='quantile'` and to `None`
70-
when `strategy='uniform'` or `strategy='kmeans'`.
69+
efficiency.
7170
`subsample=None` means that all the training samples are used when
7271
computing the quantiles that determine the binning thresholds.
7372
Since quantile computation relies on sorting each column of `X` and
@@ -147,7 +146,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator):
147146
... [ 0, 3, -2, 0.5],
148147
... [ 1, 4, -1, 2]]
149148
>>> est = KBinsDiscretizer(
150-
... n_bins=3, encode='ordinal', strategy='uniform', subsample=None
149+
... n_bins=3, encode='ordinal', strategy='uniform'
151150
... )
152151
>>> est.fit(X)
153152
KBinsDiscretizer(...)
@@ -177,11 +176,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator):
177176
"encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})],
178177
"strategy": [StrOptions({"uniform", "quantile", "kmeans"})],
179178
"dtype": [Options(type, {np.float64, np.float32}), None],
180-
"subsample": [
181-
Interval(Integral, 1, None, closed="left"),
182-
None,
183-
Hidden(StrOptions({"warn"})),
184-
],
179+
"subsample": [Interval(Integral, 1, None, closed="left"), None],
185180
"random_state": ["random_state"],
186181
}
187182

@@ -192,7 +187,7 @@ def __init__(
192187
encode="onehot",
193188
strategy="quantile",
194189
dtype=None,
195-
subsample="warn",
190+
subsample=200_000,
196191
random_state=None,
197192
):
198193
self.n_bins = n_bins
@@ -243,24 +238,13 @@ def fit(self, X, y=None, sample_weight=None):
243238
f"{self.strategy!r} instead."
244239
)
245240

246-
if self.strategy in ("uniform", "kmeans") and self.subsample == "warn":
247-
warnings.warn(
248-
(
249-
"In version 1.5 onwards, subsample=200_000 "
250-
"will be used by default. Set subsample explicitly to "
251-
"silence this warning in the mean time. Set "
252-
"subsample=None to disable subsampling explicitly."
253-
),
254-
FutureWarning,
255-
)
256-
257-
subsample = self.subsample
258-
if subsample == "warn":
259-
subsample = 200000 if self.strategy == "quantile" else None
260-
if subsample is not None and n_samples > subsample:
241+
if self.subsample is not None and n_samples > self.subsample:
261242
# Take a subsample of `X`
262243
X = resample(
263-
X, replace=False, n_samples=subsample, random_state=self.random_state
244+
X,
245+
replace=False,
246+
n_samples=self.subsample,
247+
random_state=self.random_state,
264248
)
265249

266250
n_features = X.shape[1]

sklearn/preprocessing/tests/test_discretization.py

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,6 @@
4949
),
5050
],
5151
)
52-
# TODO(1.5) remove warning filter when kbd's subsample default is changed
53-
@pytest.mark.filterwarnings("ignore:In version 1.5 onwards, subsample=200_000")
5452
def test_fit_transform(strategy, expected, sample_weight):
5553
est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy)
5654
est.fit(X, sample_weight=sample_weight)
@@ -149,8 +147,6 @@ def test_invalid_n_bins_array():
149147
),
150148
],
151149
)
152-
# TODO(1.5) remove warning filter when kbd's subsample default is changed
153-
@pytest.mark.filterwarnings("ignore:In version 1.5 onwards, subsample=200_000")
154150
def test_fit_transform_n_bins_array(strategy, expected, sample_weight):
155151
est = KBinsDiscretizer(
156152
n_bins=[2, 3, 3, 3], encode="ordinal", strategy=strategy
@@ -176,8 +172,6 @@ def test_kbinsdiscretizer_effect_sample_weight():
176172
assert_allclose(est.transform(X), [[0.0], [1.0], [2.0], [2.0], [2.0], [2.0]])
177173

178174

179-
# TODO(1.5) remove warning filter when kbd's subsample default is changed
180-
@pytest.mark.filterwarnings("ignore:In version 1.5 onwards, subsample=200_000")
181175
@pytest.mark.parametrize("strategy", ["kmeans", "quantile"])
182176
def test_kbinsdiscretizer_no_mutating_sample_weight(strategy):
183177
"""Make sure that `sample_weight` is not changed in place."""
@@ -258,8 +252,6 @@ def test_encode_options():
258252
("quantile", [0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2], [0, 1, 2, 3, 4, 4]),
259253
],
260254
)
261-
# TODO(1.5) remove warning filter when kbd's subsample default is changed
262-
@pytest.mark.filterwarnings("ignore:In version 1.5 onwards, subsample=200_000")
263255
def test_nonuniform_strategies(
264256
strategy, expected_2bins, expected_3bins, expected_5bins
265257
):
@@ -313,8 +305,6 @@ def test_nonuniform_strategies(
313305
),
314306
],
315307
)
316-
# TODO(1.5) remove warning filter when kbd's subsample default is changed
317-
@pytest.mark.filterwarnings("ignore:In version 1.5 onwards, subsample=200_000")
318308
@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"])
319309
def test_inverse_transform(strategy, encode, expected_inv):
320310
kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
@@ -323,8 +313,6 @@ def test_inverse_transform(strategy, encode, expected_inv):
323313
assert_array_almost_equal(expected_inv, Xinv)
324314

325315

326-
# TODO(1.5) remove warning filter when kbd's subsample default is changed
327-
@pytest.mark.filterwarnings("ignore:In version 1.5 onwards, subsample=200_000")
328316
@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"])
329317
def test_transform_outside_fit_range(strategy):
330318
X = np.array([0, 1, 2, 3])[:, None]
@@ -490,14 +478,3 @@ def test_kbinsdiscretizer_subsample(strategy, global_random_seed):
490478
assert_allclose(
491479
kbd_subsampling.bin_edges_[0], kbd_no_subsampling.bin_edges_[0], rtol=1e-2
492480
)
493-
494-
495-
# TODO(1.5) remove this test
496-
@pytest.mark.parametrize("strategy", ["uniform", "kmeans"])
497-
def test_kbd_subsample_warning(strategy):
498-
# Check the future warning for the change of default of subsample
499-
X = np.random.RandomState(0).random_sample((100, 1))
500-
501-
kbd = KBinsDiscretizer(strategy=strategy, random_state=0)
502-
with pytest.warns(FutureWarning, match="subsample=200_000 will be used by default"):
503-
kbd.fit(X)

sklearn/preprocessing/tests/test_target_encoder.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -586,8 +586,6 @@ def test_invariance_of_encoding_under_label_permutation(smooth, global_random_se
586586
assert_allclose(X_test_encoded, X_test_permuted_encoded)
587587

588588

589-
# TODO(1.5) remove warning filter when kbd's subsample default is changed
590-
@pytest.mark.filterwarnings("ignore:In version 1.5 onwards, subsample=200_000")
591589
@pytest.mark.parametrize("smooth", [0.0, "auto"])
592590
def test_target_encoding_for_linear_regression(smooth, global_random_seed):
593591
# Check some expected statistical properties when fitting a linear

0 commit comments

Comments (0)