From df282ad8b12df1cb964c2cf6c73b5c779936d338 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Wed, 9 Apr 2025 16:23:18 +0200 Subject: [PATCH 01/18] use sample weight for sampling --- sklearn/ensemble/_bagging.py | 111 ++++++++++--------------- sklearn/ensemble/tests/test_bagging.py | 22 ----- 2 files changed, 45 insertions(+), 88 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 901c63c9250bc..8b622e90a9113 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -30,7 +30,6 @@ MethodMapping, _raise_for_params, _routing_enabled, - get_routing_for_object, process_routing, ) from ..utils.metaestimators import available_if @@ -74,6 +73,7 @@ def _generate_bagging_indices( n_samples, max_features, max_samples, + sample_weight, ): """Randomly draw feature and sample indices.""" # Get valid random state @@ -83,10 +83,18 @@ def _generate_bagging_indices( feature_indices = _generate_indices( random_state, bootstrap_features, n_features, max_features ) - sample_indices = _generate_indices( - random_state, bootstrap_samples, n_samples, max_samples - ) - + if sample_weight is None: + sample_indices = random_state.choice( + n_samples, max_samples, replace=bootstrap_samples + ) + else: + normalized_sample_weight = sample_weight / np.sum(sample_weight) + sample_indices = random_state.choice( + n_samples, + max_samples, + replace=bootstrap_samples, + p=normalized_sample_weight, + ) return feature_indices, sample_indices @@ -95,6 +103,7 @@ def _parallel_build_estimators( ensemble, X, y, + sample_weight, seeds, total_n_estimators, verbose, @@ -115,17 +124,6 @@ def _parallel_build_estimators( estimators = [] estimators_features = [] - # TODO: (slep6) remove if condition for unrouted sample_weight when metadata - # routing can't be disabled. - support_sample_weight = has_fit_parameter(ensemble.estimator_, "sample_weight") - if not _routing_enabled() and ( - not support_sample_weight and fit_params.get("sample_weight") is not None - ): - raise ValueError( - "The base estimator doesn't support sample weight, but sample_weight is " - "passed to the fit method." - ) - for i in range(n_estimators): if verbose > 1: print( @@ -150,51 +148,17 @@ def _parallel_build_estimators( n_samples, max_features, max_samples, + sample_weight, ) fit_params_ = fit_params.copy() - # TODO(SLEP6): remove if condition for unrouted sample_weight when metadata - # routing can't be disabled. - # 1. If routing is enabled, we will check if the routing supports sample - # weight and use it if it does. - # 2. If routing is not enabled, we will check if the base - # estimator supports sample_weight and use it if it does. - - # Note: Row sampling can be achieved either through setting sample_weight or - # by indexing. The former is more efficient. Therefore, use this method - # if possible, otherwise use indexing. 
- if _routing_enabled(): - request_or_router = get_routing_for_object(ensemble.estimator_) - consumes_sample_weight = request_or_router.consumes( - "fit", ("sample_weight",) - ) - else: - consumes_sample_weight = support_sample_weight - if consumes_sample_weight: - # Draw sub samples, using sample weights, and then fit - curr_sample_weight = _check_sample_weight( - fit_params_.pop("sample_weight", None), X - ).copy() - - if bootstrap: - sample_counts = np.bincount(indices, minlength=n_samples) - curr_sample_weight *= sample_counts - else: - not_indices_mask = ~indices_to_mask(indices, n_samples) - curr_sample_weight[not_indices_mask] = 0 - - fit_params_["sample_weight"] = curr_sample_weight - X_ = X[:, features] if requires_feature_indexing else X - estimator_fit(X_, y, **fit_params_) - else: - # cannot use sample_weight, so use indexing - y_ = _safe_indexing(y, indices) - X_ = _safe_indexing(X, indices) - fit_params_ = _check_method_params(X, params=fit_params_, indices=indices) - if requires_feature_indexing: - X_ = X_[:, features] - estimator_fit(X_, y_, **fit_params_) + y_ = _safe_indexing(y, indices) + X_ = _safe_indexing(X, indices) + fit_params_ = _check_method_params(X, params=fit_params, indices=indices) + if requires_feature_indexing: + X_ = X_[:, features] + estimator_fit(X_, y_, **fit_params_) estimators.append(estimator) estimators_features.append(features) @@ -361,8 +325,12 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, then samples are equally weighted. - Note that this is supported only if the base estimator supports - sample weighting. + Used as probabilties to draw the samples. + + .. versionchanged:: 1.8 + The sample weights are used to draw the samples and are no + longer forwarded to the underlying estimators. It is now okay + to use a base estimator that does not support sample weight. **fit_params : dict Parameters to pass to the underlying estimators. @@ -395,9 +363,14 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, dtype=None) - fit_params["sample_weight"] = sample_weight - return self._fit(X, y, max_samples=self.max_samples, **fit_params) + return self._fit( + X, + y, + sample_weight=sample_weight, + max_samples=self.max_samples, + **fit_params, + ) def _parallel_args(self): return {} @@ -406,6 +379,7 @@ def _fit( self, X, y, + sample_weight=None, max_samples=None, max_depth=None, check_input=True, @@ -424,6 +398,10 @@ def _fit( The target values (class labels in classification, real numbers in regression). + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + Used as probabilties to draw the samples. + max_samples : int or float, default=None Argument to use instead of self.max_samples. 
@@ -453,6 +431,9 @@ def _fit( self._n_samples = n_samples y = self._validate_y(y) + # Store sample_weight for _get_estimators_indices + self._sample_weight = sample_weight + # Check parameters self._validate_estimator(self._get_estimator()) @@ -461,10 +442,6 @@ def _fit( else: routed_params = Bunch() routed_params.estimator = Bunch(fit=fit_params) - if "sample_weight" in fit_params: - routed_params.estimator.fit["sample_weight"] = fit_params[ - "sample_weight" - ] if max_depth is not None: self.estimator_.max_depth = max_depth @@ -522,7 +499,7 @@ def _fit( elif n_more_estimators == 0: warn( "Warm-start fitting without increasing n_estimators does not " - "fit new trees." + "fit new estimators." ) return self @@ -548,6 +525,7 @@ def _fit( self, X, y, + sample_weight, seeds[starts[i] : starts[i + 1]], total_n_estimators, verbose=self.verbose, @@ -592,6 +570,7 @@ def _get_estimators_indices(self): self._n_samples, self._max_features, self._max_samples, + self._sample_weight, ) yield feature_indices, sample_indices diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index 2cb9336bfd759..9a8bcc5f3a7ea 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -589,28 +589,6 @@ def test_bagging_with_pipeline(): assert isinstance(estimator[0].steps[-1][1].random_state, int) -class DummyZeroEstimator(BaseEstimator): - def fit(self, X, y): - self.classes_ = np.unique(y) - return self - - def predict(self, X): - return self.classes_[np.zeros(X.shape[0], dtype=int)] - - -def test_bagging_sample_weight_unsupported_but_passed(): - estimator = BaggingClassifier(DummyZeroEstimator()) - rng = check_random_state(0) - - estimator.fit(iris.data, iris.target).predict(iris.data) - with pytest.raises(ValueError): - estimator.fit( - iris.data, - iris.target, - sample_weight=rng.randint(10, size=(iris.data.shape[0])), - ) - - def test_warm_start(random_state=42): # Test if fitting incrementally with warm start gives a forest of the # right size and the same results as a normal fit. 
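[Editor's note] The core of patch 01 is the weighted draw in `_generate_bagging_indices`: when `sample_weight` is given, bagging indices are selected with probability proportional to the weights. A minimal standalone sketch of that behavior, assuming only NumPy (the weight values below are invented for illustration):

import numpy as np

# Mirror of the patched branch: normalize the weights and draw indices
# with probability proportional to them, with replacement (bootstrap).
rng = np.random.RandomState(0)
n_samples, max_samples = 8, 8
sample_weight = np.array([0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 4.0, 4.0])

normalized_sample_weight = sample_weight / np.sum(sample_weight)
sample_indices = rng.choice(
    n_samples, max_samples, replace=True, p=normalized_sample_weight
)
# Rows 0 and 1 have zero weight and can never be drawn; rows 6 and 7
# are four times as likely as the others to appear in the sample.
print(sample_indices)

Because the weights are consumed here, at index-drawing time, the sub-estimators no longer need to accept `sample_weight` themselves.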
From ecf3bcf259d45d291ea12012edd3c0fe304b49f5 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Wed, 9 Apr 2025 16:55:44 +0200 Subject: [PATCH 02/18] fix call in IsolationForest --- sklearn/ensemble/_iforest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 4e5287af7f699..808ba0a275c28 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -350,7 +350,7 @@ def fit(self, X, y=None, sample_weight=None): super()._fit( X, y, - max_samples, + max_samples=max_samples, max_depth=max_depth, sample_weight=sample_weight, check_input=False, From 1b6392062b7ac24f61d55a747acb719ec8375509 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Wed, 9 Apr 2025 17:13:14 +0200 Subject: [PATCH 03/18] changelog --- doc/whats_new/upcoming_changes/sklearn.ensemble/31165.fix.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 doc/whats_new/upcoming_changes/sklearn.ensemble/31165.fix.rst diff --git a/doc/whats_new/upcoming_changes/sklearn.ensemble/31165.fix.rst b/doc/whats_new/upcoming_changes/sklearn.ensemble/31165.fix.rst new file mode 100644 index 0000000000000..91317deb0f009 --- /dev/null +++ b/doc/whats_new/upcoming_changes/sklearn.ensemble/31165.fix.rst @@ -0,0 +1,4 @@ +- :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor` + and :class:`ensemble.isolationForest` now use `sample_weight` to draw + the samples instead of forwarding them to the underlying estimators. + By :user:`Antoine Baker `. From 084f64d5d955fa4d918122a1cf4139efdcfe8ed8 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Wed, 9 Apr 2025 17:36:48 +0200 Subject: [PATCH 04/18] validate sample weight --- sklearn/ensemble/_iforest.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 808ba0a275c28..31c5491ccb6c9 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -20,7 +20,12 @@ from ..utils._chunking import get_chunk_n_rows from ..utils._param_validation import Interval, RealNotInt, StrOptions from ..utils.parallel import Parallel, delayed -from ..utils.validation import _num_samples, check_is_fitted, validate_data +from ..utils.validation import ( + _check_sample_weight, + _num_samples, + check_is_fitted, + validate_data, +) from ._bagging import BaseBagging __all__ = ["IsolationForest"] @@ -317,6 +322,10 @@ def fit(self, X, y=None, sample_weight=None): X = validate_data( self, X, accept_sparse=["csc"], dtype=tree_dtype, ensure_all_finite=False ) + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=None) + if issparse(X): # Pre-sort indices to avoid that each individual tree of the # ensemble sorts the indices.
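[Editor's note] With the validation added in patch 04, `IsolationForest.fit` checks the weights once up front and then lets them drive which rows each tree subsamples, rather than forwarding them to the trees. A hedged usage sketch; the data and weights are made up for illustration:

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)
X = rng.randn(100, 2)
# _check_sample_weight validates shape and dtype; any array-like of
# length n_samples is accepted. Here the second half is up-weighted.
sample_weight = np.ones(100)
sample_weight[50:] = 5.0

iso = IsolationForest(n_estimators=10, random_state=0)
iso.fit(X, sample_weight=sample_weight)
# score_samples returns values <= 0; closer to 0 means more normal.
print(iso.score_samples(X)[:5])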
From db1c1e84c5a78262781e96b6f03441859cd9ad42 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Wed, 9 Apr 2025 17:41:00 +0200 Subject: [PATCH 05/18] harmonize kwargs order --- sklearn/ensemble/_iforest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 31c5491ccb6c9..5fed54c3b6430 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -359,9 +359,9 @@ def fit(self, X, y=None, sample_weight=None): super()._fit( X, y, + sample_weight=sample_weight, max_samples=max_samples, max_depth=max_depth, - sample_weight=sample_weight, check_input=False, ) From 85c7e8cf88a544e6cce3adc8210b9fd154260bb3 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Wed, 9 Apr 2025 18:09:53 +0200 Subject: [PATCH 06/18] add more samples --- sklearn/ensemble/_bagging.py | 2 ++ sklearn/ensemble/tests/test_bagging.py | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 8b622e90a9113..ec460dd186a6f 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -95,6 +95,8 @@ def _generate_bagging_indices( replace=bootstrap_samples, p=normalized_sample_weight, ) + print(f"{feature_indices=}") + print(f"{sample_indices=}") return feature_indices, sample_indices diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index 9a8bcc5f3a7ea..6fc11e0970a04 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -887,8 +887,9 @@ def test_bagging_classifier_with_missing_inputs(): def test_bagging_small_max_features(): # Check that Bagging estimator can accept low fractional max_features - X = np.array([[1, 2], [3, 4]]) - y = np.array([1, 0]) + rng = np.random.RandomState(42) + X = rng.randn(10, 2) + y = rng.randint(2, size=X.shape[0]) bagging = BaggingClassifier(LogisticRegression(), max_features=0.3, random_state=1) bagging.fit(X, y) From df49db3c208fdcbbaf326371e85909361a4f9e31 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Wed, 9 Apr 2025 18:11:30 +0200 Subject: [PATCH 07/18] oops forgot print statements --- sklearn/ensemble/_bagging.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index ec460dd186a6f..8b622e90a9113 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -95,8 +95,6 @@ def _generate_bagging_indices( replace=bootstrap_samples, p=normalized_sample_weight, ) - print(f"{feature_indices=}") - print(f"{sample_indices=}") return feature_indices, sample_indices From 7863d5e719963c236e1e6f9c1e180a4ec59f7e2f Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Thu, 10 Apr 2025 10:35:56 +0200 Subject: [PATCH 08/18] doc --- sklearn/ensemble/_bagging.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 8b622e90a9113..7f920c7ec460f 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -325,9 +325,9 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, then samples are equally weighted. - Used as probabilties to draw the samples. + Used as probabilities to draw the samples. - .. versionchanged:: 1.8 + .. versionchanged:: 1.7 The sample weights are used to draw the samples and are no longer forwarded to the underlying estimators. 
It is now okay to use a base estimator that does not support sample weight. @@ -400,7 +400,7 @@ def _fit( sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, then samples are equally weighted. - Used as probabilties to draw the samples. + Used as probabilities to draw the samples. From 75afc7354cf7833583e8fda52c97eacfa14ae9dc Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Thu, 10 Apr 2025 14:00:46 +0200 Subject: [PATCH 09/18] doc --- sklearn/ensemble/_bagging.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 7f920c7ec460f..6022722ef179c 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -328,9 +328,10 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): Used as probabilities to draw the samples. .. versionchanged:: 1.7 - The sample weights are used to draw the samples and are no - longer forwarded to the underlying estimators. It is now okay - to use a base estimator that does not support sample weight. + + The sample weights are used to draw the samples and are no + longer forwarded to the underlying estimators. It is now okay + to use a base estimator that does not support sample weight. **fit_params : dict Parameters to pass to the underlying estimators. From 6a1af69d34f9e6bbefafc7699a6ca8264a979c20 Mon Sep 17 00:00:00 2001 From: antoinebaker Date: Fri, 11 Apr 2025 14:55:24 +0200 Subject: [PATCH 10/18] Update doc/whats_new/upcoming_changes/sklearn.ensemble/31165.fix.rst Co-authored-by: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com> --- doc/whats_new/upcoming_changes/sklearn.ensemble/31165.fix.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/upcoming_changes/sklearn.ensemble/31165.fix.rst b/doc/whats_new/upcoming_changes/sklearn.ensemble/31165.fix.rst index 91317deb0f009..22a11f0d276d8 100644 --- a/doc/whats_new/upcoming_changes/sklearn.ensemble/31165.fix.rst +++ b/doc/whats_new/upcoming_changes/sklearn.ensemble/31165.fix.rst @@ -1,4 +1,4 @@ - :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor` - and :class:`ensemble.isolationForest` now use `sample_weight` to draw + and :class:`ensemble.IsolationForest` now use `sample_weight` to draw the samples instead of forwarding them to the underlying estimators. By :user:`Antoine Baker `. From 55ce89173f29b3668beea5cd838db807f9833504 Mon Sep 17 00:00:00 2001 From: antoinebaker Date: Fri, 11 Apr 2025 14:56:03 +0200 Subject: [PATCH 11/18] Update doc/whats_new/upcoming_changes/sklearn.ensemble/31165.fix.rst Co-authored-by: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com> From 1de2b07d047f2a8869a8257bb764a8e3a818bcc8 Mon Sep 17 00:00:00 2001 From: antoinebaker Date: Wed, 16 Apr 2025 10:22:20 +0200 Subject: [PATCH 12/18] Update sklearn/ensemble/_bagging.py Co-authored-by: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com> --- sklearn/ensemble/_bagging.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 6022722ef179c..a3c33ebefaf86 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -325,7 +325,8 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, then samples are equally weighted.
- Used as probabilities to draw the samples. + Used as probabilities to draw the samples that are used to fit the + sub-estimators. .. versionchanged:: 1.7 From 29c19443c22c8d099275c838dd8aaf0beed89efc Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Wed, 16 Apr 2025 10:50:21 +0200 Subject: [PATCH 13/18] fix test metadata --- .../test_metaestimators_metadata_routing.py | 50 +++++++++++++++---- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index ae2a186a3c5c2..49381b5732a77 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -330,7 +330,7 @@ "y": y, "preserves_metadata": False, "estimator_routing_methods": [ - "fit", + ("fit", ["metadata"]), "predict", "predict_proba", "predict_log_proba", @@ -349,7 +349,7 @@ "X": X, "y": y, "preserves_metadata": False, - "estimator_routing_methods": ["fit", "predict"], + "estimator_routing_methods": [("fit", ["metadata"]), "predict"], }, { "metaestimator": RidgeCV, @@ -459,7 +459,13 @@ - X: X-data to fit and predict - y: y-data to fit - estimator_routing_methods: list of all methods to check for routing metadata - to the sub-estimator + to the sub-estimator. Each value is either a str or a tuple: + - str: the name of the method, all metadata in this method must be routed to the + sub-estimator + - tuple: the name of the method, the second element is a list of metadata keys + to be passed to the sub-estimator. This is useful if certain metadata such as + `sample_weight` are never routed and only consumed, such as in `BaggingClassifier` + and `BaggingRegressor`. - preserves_metadata: - True (default): the metaestimator passes the metadata to the sub-estimator without modification. We check that the values recorded by @@ -562,6 +568,23 @@ def get_init_args(metaestimator_info, sub_estimator_consumes): ) +def process_routing_methods(estimator_routing_methods): + """Process estimator_routing_methods and return a dict. + + The dictionary is of the form {"method": ["metadata", ...]}. + By default, the list includes `sample_weight` and `metadata`. + """ + res = dict() + for method_spec in estimator_routing_methods: + if isinstance(method_spec, str): + method = method_spec + metadata = ["sample_weight", "metadata"] + else: + method, metadata = method_spec + res[method] = metadata + return res + + def set_requests(obj, *, method_mapping, methods, metadata_name, value=True): """Call `set_{method}_request` on a list of methods from the sub-estimator. 
@@ -662,10 +685,12 @@ def test_error_on_missing_requests_for_sub_estimator(metaestimator): metaestimator_class = metaestimator["metaestimator"] X = metaestimator["X"] y = metaestimator["y"] - routing_methods = metaestimator["estimator_routing_methods"] + routing_methods = process_routing_methods( + metaestimator["estimator_routing_methods"] + ) - for method_name in routing_methods: - for key in ["sample_weight", "metadata"]: + for method_name, metadata_keys in routing_methods.items(): + for key in metadata_keys: kwargs, (estimator, _), (scorer, _), *_ = get_init_args( metaestimator, sub_estimator_consumes=True ) @@ -721,12 +746,14 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): metaestimator_class = metaestimator["metaestimator"] X = metaestimator["X"] y = metaestimator["y"] - routing_methods = metaestimator["estimator_routing_methods"] + routing_methods = process_routing_methods( + metaestimator["estimator_routing_methods"] + ) method_mapping = metaestimator.get("method_mapping", {}) preserves_metadata = metaestimator.get("preserves_metadata", True) - for method_name in routing_methods: - for key in ["sample_weight", "metadata"]: + for method_name, metadata_keys in routing_methods.items(): + for key in metadata_keys: val = {"sample_weight": sample_weight, "metadata": metadata}[key] method_kwargs = {key: val} @@ -797,8 +824,9 @@ def set_request(estimator, method_name): metaestimator_class = metaestimator["metaestimator"] X = metaestimator["X"] y = metaestimator["y"] - routing_methods = metaestimator["estimator_routing_methods"] - + routing_methods = process_routing_methods( + metaestimator["estimator_routing_methods"] + ) for method_name in routing_methods: kwargs, (estimator, _), (_, _), (_, _) = get_init_args( metaestimator, sub_estimator_consumes=False From c4541fd7c756c41b0de75faf65287396e31c5e7d Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Wed, 16 Apr 2025 11:20:05 +0200 Subject: [PATCH 14/18] add warnings --- sklearn/ensemble/_bagging.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index a3c33ebefaf86..6753f1a30c85b 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -325,8 +325,9 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, then samples are equally weighted. - Used as probabilities to draw the samples that are used to fit the - sub-estimators. + Used as probabilities to draw the samples that are used to fit the + sub-estimators. It is strongly recommended to use bootstrap=True + (draw with replacement) for statistical soundness. .. versionchanged:: 1.7 @@ -365,6 +366,11 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, dtype=None) + if sample_weight is not None and not self.bootstrap: + warn( + f"When fitting {self.__class__.__name__} with sample_weight " + f"it is recommended to use bootstrap=True, got {self.bootstrap}." + ) return self._fit( X, @@ -715,7 +721,9 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): bootstrap : bool, default=True Whether samples are drawn with replacement. If False, sampling - without replacement is performed. + without replacement is performed. When fitting with `sample_weight` + it is strongly recommended to use bootstrap=True for statistical + soundness. 
bootstrap_features : bool, default=False Whether features are drawn with replacement. @@ -1223,7 +1231,9 @@ class BaggingRegressor(RegressorMixin, BaseBagging): bootstrap : bool, default=True Whether samples are drawn with replacement. If False, sampling - without replacement is performed. + without replacement is performed. When fitting with `sample_weight` + it is strongly recommended to use bootstrap=True for statistical + soundness. bootstrap_features : bool, default=False Whether features are drawn with replacement. From a8368bd99a2db8383419df005a391abb5a35110a Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Thu, 17 Apr 2025 10:56:08 +0200 Subject: [PATCH 15/18] fix doctest --- sklearn/ensemble/_bagging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index dfa417eeacec4..ab040ddfde53d 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -1337,7 +1337,7 @@ class BaggingRegressor(RegressorMixin, BaseBagging): >>> regr = BaggingRegressor(estimator=SVR(), ... n_estimators=10, random_state=0).fit(X, y) >>> regr.predict([[0, 0, 0, 0]]) - array([-2.8720...]) + array([-2.8948...]) """ def __init__( From b118ffe1ac1d874a5440d3980dd9fa48745c4206 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Tue, 22 Apr 2025 10:40:53 +0200 Subject: [PATCH 16/18] renaming --- .../test_metaestimators_metadata_routing.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 49381b5732a77..0ae60ae12c14c 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -568,11 +568,20 @@ def get_init_args(metaestimator_info, sub_estimator_consumes): ) -def process_routing_methods(estimator_routing_methods): +def filter_metadata_in_routing_methods(estimator_routing_methods): """Process estimator_routing_methods and return a dict. - The dictionary is of the form {"method": ["metadata", ...]}. - By default, the list includes `sample_weight` and `metadata`. + Parameters + ---------- + estimator_routing_methods : list of str or tuple + The estimator_routing_methods info from METAESTIMATORS. + + Returns + ------- + routing_methods : dict + The dictionary is of the form {"method": ["metadata", ...]}. + It specifies the list of metadata keys for each routing method. + By default the list includes `sample_weight` and `metadata`. 
""" res = dict() for method_spec in estimator_routing_methods: @@ -685,7 +694,7 @@ def test_error_on_missing_requests_for_sub_estimator(metaestimator): metaestimator_class = metaestimator["metaestimator"] X = metaestimator["X"] y = metaestimator["y"] - routing_methods = process_routing_methods( + routing_methods = filter_metadata_in_routing_methods( metaestimator["estimator_routing_methods"] ) @@ -746,7 +755,7 @@ def test_setting_request_on_sub_estimator_removes_error(metaestimator): metaestimator_class = metaestimator["metaestimator"] X = metaestimator["X"] y = metaestimator["y"] - routing_methods = process_routing_methods( + routing_methods = filter_metadata_in_routing_methods( metaestimator["estimator_routing_methods"] ) method_mapping = metaestimator.get("method_mapping", {}) @@ -824,7 +833,7 @@ def set_request(estimator, method_name): metaestimator_class = metaestimator["metaestimator"] X = metaestimator["X"] y = metaestimator["y"] - routing_methods = process_routing_methods( + routing_methods = filter_metadata_in_routing_methods( metaestimator["estimator_routing_methods"] ) for method_name in routing_methods: From bf6977d4fec3ea3ccc0eb2c808a27aff2d247602 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Wed, 21 May 2025 11:36:01 +0200 Subject: [PATCH 17/18] define sampling_strategy --- sklearn/ensemble/_bagging.py | 154 +++++++++++++++++++++++++++++------ 1 file changed, 129 insertions(+), 25 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index ab040ddfde53d..048c7154ed971 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -22,13 +22,14 @@ column_or_1d, ) from ..utils._mask import indices_to_mask -from ..utils._param_validation import HasMethods, Interval, RealNotInt +from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions from ..utils._tags import get_tags from ..utils.metadata_routing import ( MetadataRouter, MethodMapping, _raise_for_params, _routing_enabled, + get_routing_for_object, process_routing, ) from ..utils.metaestimators import available_if @@ -82,8 +83,8 @@ def _generate_bagging_indices( random_state, bootstrap_features, n_features, max_features ) if sample_weight is None: - sample_indices = random_state.choice( - n_samples, max_samples, replace=bootstrap_samples + sample_indices = _generate_indices( + random_state, bootstrap_samples, n_samples, max_samples ) else: normalized_sample_weight = sample_weight / np.sum(sample_weight) @@ -96,6 +97,22 @@ def _generate_bagging_indices( return feature_indices, sample_indices +def _consumes_sample_weight(estimator): + # TODO(SLEP6): remove if condition for unrouted sample_weight when metadata + # routing can't be disabled. + # 1. If routing is enabled, we will check if the routing supports sample + # weight and use it if it does. + # 2. If routing is not enabled, we will check if the base + # estimator supports sample_weight and use it if it does. 
support_sample_weight = has_fit_parameter(estimator, "sample_weight") + if _routing_enabled(): + request_or_router = get_routing_for_object(estimator) + consumes_sample_weight = request_or_router.consumes("fit", ("sample_weight",)) + else: + consumes_sample_weight = support_sample_weight + return consumes_sample_weight + + def _parallel_build_estimators( n_estimators, ensemble, @@ -105,6 +122,7 @@ def _parallel_build_estimators( seeds, total_n_estimators, verbose, + sampling_strategy, check_input, fit_params, ): @@ -138,6 +156,9 @@ def _parallel_build_estimators( estimator_fit = estimator.fit # Draw random feature, sample indices + sample_weight_in_indices = ( + sample_weight if sampling_strategy == "indexing" else None + ) features, indices = _generate_bagging_indices( random_state, bootstrap_features, @@ -146,17 +167,36 @@ def _parallel_build_estimators( n_samples, max_features, max_samples, - sample_weight, + sample_weight_in_indices, ) fit_params_ = fit_params.copy() + # Note: Row sampling can be achieved either through setting sample_weight or + # by indexing, controlled by the `sampling_strategy` argument. + if sampling_strategy == "weighting": + # Draw sub samples, using sample weights, and then fit + curr_sample_weight = _check_sample_weight(sample_weight, X).copy() + + if bootstrap: + sample_counts = np.bincount(indices, minlength=n_samples) + curr_sample_weight *= sample_counts + else: + not_indices_mask = ~indices_to_mask(indices, n_samples) + curr_sample_weight[not_indices_mask] = 0 + + fit_params_["sample_weight"] = curr_sample_weight + X_ = X[:, features] if requires_feature_indexing else X + estimator_fit(X_, y, **fit_params_) + elif sampling_strategy == "indexing": + # cannot use sample_weight, so use indexing + y_ = _safe_indexing(y, indices) + X_ = _safe_indexing(X, indices) + fit_params_ = _check_method_params(X, params=fit_params_, indices=indices) + if requires_feature_indexing: + X_ = X_[:, features] + estimator_fit(X_, y_, **fit_params_) + else: + raise ValueError(f"{sampling_strategy=} must be 'indexing' or 'weighting'.") - y_ = _safe_indexing(y, indices) - X_ = _safe_indexing(X, indices) - fit_params_ = _check_method_params(X, params=fit_params, indices=indices) - if requires_feature_indexing: - X_ = X_[:, features] - estimator_fit(X_, y_, **fit_params_) estimators.append(estimator) estimators_features.append(features) @@ -268,6 +308,7 @@ class BaseBagging(BaseEnsemble, metaclass=ABCMeta): "n_jobs": [None, Integral], "random_state": ["random_state"], "verbose": ["verbose"], + "sampling_strategy": [StrOptions({"auto", "indexing", "weighting"})], } @abstractmethod @@ -285,6 +326,7 @@ def __init__( n_jobs=None, random_state=None, verbose=0, + sampling_strategy="auto", ): super().__init__( estimator=estimator, @@ -299,6 +341,7 @@ def __init__( self.n_jobs = n_jobs self.random_state = random_state self.verbose = verbose + self.sampling_strategy = sampling_strategy @_fit_context( # BaseBagging.estimator is not validated yet @@ -365,6 +408,11 @@ def fit(self, X, y, sample_weight=None, **fit_params): f"When fitting {self.__class__.__name__} with sample_weight " f"it is recommended to use bootstrap=True, got {self.bootstrap}." ) + if sample_weight is not None and self.sampling_strategy == "weighting": + warn( + f"When fitting {self.__class__.__name__} with sample_weight " + f"it is recommended to use sampling_strategy='indexing' or 'auto', got {self.sampling_strategy}."
) return self._fit( X, @@ -417,11 +465,6 @@ def _fit( If the meta-estimator already checks the input, set this value to False to prevent redundant input validation. - sample_weight : array-like of shape (n_samples,), default=None - Sample weights. If None, then samples are equally weighted. - Note that this is supported only if the base estimator supports - sample weighting. - **fit_params : dict, default=None Parameters to pass to the :term:`fit` method of the underlying estimator. @@ -438,15 +481,9 @@ def _fit( self._n_samples = n_samples y = self._validate_y(y) - # Store sample_weight for _get_estimators_indices - self._sample_weight = sample_weight - # Check parameters self._validate_estimator(self._get_estimator()) - if sample_weight is not None: - fit_params["sample_weight"] = sample_weight - if _routing_enabled(): routed_params = process_routing(self, "fit", **fit_params) else: @@ -482,6 +519,28 @@ def _fit( # Store validated integer feature sampling value self._max_features = max_features + # Validate sampling_strategy + consumes_sample_weight = _consumes_sample_weight(self.estimator_) + sampling_strategy = self.sampling_strategy + if sampling_strategy == "auto": + if consumes_sample_weight and sample_weight is None: + sampling_strategy = "weighting" + else: + sampling_strategy = "indexing" + if (sampling_strategy == "weighting") and not consumes_sample_weight: + raise ValueError( + "The base estimator doesn't support sample weight, which is " + "required when sampling_strategy='weighting'." + ) + + # Store validated sampling_strategy + self._sampling_strategy = sampling_strategy + + # Store sample_weight_in_indices for _get_estimators_indices + self._sample_weight_in_indices = ( + sample_weight if sampling_strategy == "indexing" else None + ) + # Other checks if not self.bootstrap and self.oob_score: raise ValueError("Out of bag estimation only available if bootstrap=True") @@ -539,6 +598,7 @@ def _fit( seeds[starts[i] : starts[i + 1]], total_n_estimators, verbose=self.verbose, + sampling_strategy=sampling_strategy, check_input=check_input, fit_params=routed_params.estimator.fit, ) @@ -580,7 +640,7 @@ def _get_estimators_indices(self): self._n_samples, self._max_features, self._max_samples, - self._sample_weight, + self._sample_weight_in_indices, ) yield feature_indices, sample_indices @@ -749,7 +809,7 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): processors. See :term:`Glossary ` for more details. random_state : int, RandomState instance or None, default=None Controls the random resampling of the original dataset (sample wise and feature wise). If the base estimator accepts a `random_state` attribute, a different seed is generated for each instance in the ensemble. @@ -759,6 +819,26 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): verbose : int, default=0 Controls the verbosity when fitting and predicting. + sampling_strategy : {'auto', 'indexing', 'weighting'}, default='auto' + How to handle the samples drawn from the original dataset. + + - 'indexing' explicitly indexes the original dataset. On the downside, + it creates copies and has therefore a memory overhead. On the upside, + it does not require the base estimator to support `sample_weight`. + - 'weighting' does not index the original dataset and is + therefore more memory efficient. Instead it passes the selected indices + as `sample_weight` to the base estimator, which must therefore + support `sample_weight`.
- 'auto' will select 'indexing' if fitting the bagging estimator with + `sample_weight` or if the base estimator does not support `sample_weight`, + and `weighting` otherwise. + + .. warning:: + Only the 'indexing' option along with bootstrap=True gives statistically + correct results when fitting the bagging estimator with `sample_weight`. + + .. versionadded:: 1.8 + Attributes ---------- estimator_ : estimator @@ -853,6 +933,7 @@ def __init__( n_jobs=None, random_state=None, verbose=0, + sampling_strategy="auto", ): super().__init__( estimator=estimator, @@ -866,6 +947,7 @@ def __init__( n_jobs=n_jobs, random_state=random_state, verbose=verbose, + sampling_strategy=sampling_strategy, ) def _get_estimator(self): @@ -1256,7 +1338,7 @@ class BaggingRegressor(RegressorMixin, BaseBagging): processors. See :term:`Glossary ` for more details. random_state : int, RandomState instance or None, default=None Controls the random resampling of the original dataset (sample wise and feature wise). If the base estimator accepts a `random_state` attribute, a different seed is generated for each instance in the ensemble. @@ -1266,6 +1348,26 @@ class BaggingRegressor(RegressorMixin, BaseBagging): verbose : int, default=0 Controls the verbosity when fitting and predicting. + sampling_strategy : {'auto', 'indexing', 'weighting'}, default='auto' + How to handle the samples drawn from the original dataset. + + - 'indexing' explicitly indexes the original dataset. On the downside, + it creates copies and has therefore a memory overhead. On the upside, + it does not require the base estimator to support `sample_weight`. + - 'weighting' does not index the original dataset and is + therefore more memory efficient. Instead it passes the selected indices + as `sample_weight` to the base estimator, which must therefore + support `sample_weight`. + - 'auto' will select 'indexing' if fitting the bagging estimator with + `sample_weight` or if the base estimator does not support `sample_weight`, + and `weighting` otherwise. + + .. warning:: + Only the 'indexing' option along with bootstrap=True gives statistically + correct results when fitting the bagging estimator with `sample_weight`. + + .. versionadded:: 1.8 + Attributes ---------- estimator_ : estimator @@ -1354,6 +1456,7 @@ def __init__( n_jobs=None, random_state=None, verbose=0, + sampling_strategy="auto", ): super().__init__( estimator=estimator, @@ -1367,6 +1470,7 @@ def __init__( n_jobs=n_jobs, random_state=random_state, verbose=verbose, + sampling_strategy=sampling_strategy, ) def predict(self, X, **params): From 575a6bd41ff0b612d9df430eb0fe0ca0271e62d2 Mon Sep 17 00:00:00 2001 From: Antoine Baker Date: Wed, 21 May 2025 12:25:41 +0200 Subject: [PATCH 18/18] bullet list indent --- sklearn/ensemble/_bagging.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 22c740a3d0415..9efd1540f15fe 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -824,15 +824,15 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): How to handle the samples drawn from the original dataset. - 'indexing' explicitly indexes the original dataset. On the downside, - it creates copies and has therefore a memory overhead.
On the upside, + it does not require the base estimator to support `sample_weight`. - 'weighting' does not index the original dataset and is - therefore more memory efficient. Instead it passes the selected indices - as `sample_weight` to the base estimator, which must therefore - support `sample_weight`. + therefore more memory efficient. Instead it passes the selected indices + as `sample_weight` to the base estimator, which must therefore + support `sample_weight`. - 'auto' will select 'indexing' if fitting the bagging estimator with - `sample_weight` or if the base estimator does not support `sample_weight`, - and `weighting` otherwise. + `sample_weight` or if the base estimator does not support `sample_weight`, + and `weighting` otherwise. .. warning:: Only the 'indexing' option along with bootstrap=True gives statistically correct results when fitting the bagging estimator with `sample_weight`. @@ -1353,15 +1353,15 @@ class BaggingRegressor(RegressorMixin, BaseBagging): How to handle the samples drawn from the original dataset. - 'indexing' explicitly indexes the original dataset. On the downside, - it creates copies and has therefore a memory overhead. On the upside, - it does not require the base estimator to support `sample_weight`. + it creates copies and has therefore a memory overhead. On the upside, + it does not require the base estimator to support `sample_weight`. - 'weighting' does not index the original dataset and is - therefore more memory efficient. Instead it passes the selected indices - as `sample_weight` to the base estimator, which must therefore - support `sample_weight`. + therefore more memory efficient. Instead it passes the selected indices + as `sample_weight` to the base estimator, which must therefore + support `sample_weight`. - 'auto' will select 'indexing' if fitting the bagging estimator with - `sample_weight` or if the base estimator does not support `sample_weight`, - and `weighting` otherwise. + `sample_weight` or if the base estimator does not support `sample_weight`, + and `weighting` otherwise. .. warning:: Only the 'indexing' option along with bootstrap=True gives statistically correct results when fitting the bagging estimator with `sample_weight`.
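[Editor's note] For a weight-aware base estimator, the two strategies documented above coincide on a bootstrap draw: fitting on the indexed rows ('indexing') is equivalent to fitting on the full data with the per-row repetition counts passed as `sample_weight` ('weighting'). A sketch of that equivalence on an invented least-squares problem, where it holds exactly:

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X = rng.randn(50, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.randn(50)

indices = rng.choice(50, 50, replace=True)   # one bootstrap draw
counts = np.bincount(indices, minlength=50)  # per-row repetition counts

by_indexing = LinearRegression().fit(X[indices], y[indices])
by_weighting = LinearRegression().fit(X, y, sample_weight=counts)

# Identical fitted coefficients: repeating a row k times is the same
# as giving it weight k in the least-squares objective.
print(np.allclose(by_indexing.coef_, by_weighting.coef_))  # True

This is why 'weighting' can avoid copying the data, and also why it is only available when the base estimator consumes `sample_weight`.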