Skip to content

ENH Add sample_weight parameter to OneHotEncoder's .fit #26330

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
5 changes: 5 additions & 0 deletions doc/whats_new/v1.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -871,6 +871,11 @@ Changelog
combines input arguments `(input_feature, category)` to a string.
:pr:`22506` by :user:`Mario Kostelac <mariokostelac>`.

- |Enhancement| Adds support for `sample_weight` in
  :class:`preprocessing.OneHotEncoder`. When `sample_weight` is provided, `min_frequency`
  and `max_categories` filter categories according to the sum of the sample weights for
  each category instead of the sample count. :pr:`26330` by :user:`Carlo Lemos <vitaliset>`.

- |Enhancement| Added support for `sample_weight` in
:class:`preprocessing.KBinsDiscretizer`. This allows specifying the parameter
`sample_weight` for each sample to be used while fitting. The option is only
Expand Down
49 changes: 38 additions & 11 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@
from ..utils._missing import is_scalar_nan
from ..utils._param_validation import Interval, RealNotInt, StrOptions
from ..utils._set_output import _get_output_config
from ..utils.validation import _check_feature_names_in, check_is_fitted
from ..utils.validation import (
_check_feature_names_in,
_check_sample_weight,
check_is_fitted,
)

__all__ = ["OneHotEncoder", "OrdinalEncoder"]

Expand Down Expand Up @@ -68,6 +72,7 @@ def _check_X(self, X, force_all_finite=True):
def _fit(
self,
X,
sample_weight=None,
handle_unknown="error",
force_all_finite=True,
return_counts=False,
Expand All @@ -80,6 +85,13 @@ def _fit(
X, force_all_finite=force_all_finite
)
self.n_features_in_ = n_features
if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X)
# Filtering rows with sample_weight equals zero so we don't get extra dummy
# columns.
X_list = [Xi[sample_weight != 0] for Xi in X_list]
sample_weight = sample_weight[sample_weight != 0]
n_samples = np.sum(sample_weight)

if self.categories != "auto":
if len(self.categories) != n_features:
Expand All @@ -96,7 +108,9 @@ def _fit(
Xi = X_list[i]

if self.categories == "auto":
result = _unique(Xi, return_counts=compute_counts)
result = _unique(
Xi, sample_weight=sample_weight, return_counts=compute_counts
)
if compute_counts:
cats, counts = result
category_counts.append(counts)
Expand Down Expand Up @@ -159,7 +173,7 @@ def _fit(
)
raise ValueError(msg)
if compute_counts:
category_counts.append(_get_counts(Xi, cats))
category_counts.append(_get_counts(Xi, cats, sample_weight))

self.categories_.append(cats)

Expand Down Expand Up @@ -277,11 +291,13 @@ def _identify_infrequent(self, category_count, n_samples, col_idx):

Parameters
----------
category_count : ndarray of shape (n_cardinality,)
Category counts.
category_count : array-like of shape (n_cardinality,)
Category counts or sum of `sample_weight` for the samples from the
category when `sample_weight` is different from `None`.

n_samples : int
Number of samples.
Number of samples in training set or total sum of `sample_weight`
for all samples when `sample_weight` is different from `None`.

col_idx : int
Index of the current category. Only used for the error message.
Expand Down Expand Up @@ -344,7 +360,8 @@ def _fit_infrequent_category_mapping(
Parameters
----------
n_samples : int
Number of samples in training set.
Number of samples in training set or total sum of `sample_weight`
for all samples when `sample_weight` is different from `None`.
category_counts: list of ndarray
`category_counts[i]` is the category counts corresponding to
`self.categories_[i]`.
Expand Down Expand Up @@ -565,13 +582,15 @@ class OneHotEncoder(_BaseEncoder):

min_frequency : int or float, default=None
Specifies the minimum frequency below which a category will be
considered infrequent.
considered infrequent. If during fit `sample_weight` is different from
default, then count will be done with sum of samples' weight.

- If `int`, categories with a smaller cardinality will be considered
infrequent.

- If `float`, categories with a smaller cardinality than
`min_frequency * n_samples` will be considered infrequent.
`min_frequency * n_samples` will be considered infrequent. If
`sample_weight` is different from `None`, `n_samples = sum(sample_weight)`.

.. versionadded:: 1.1
Read more in the :ref:`User Guide <encoder_infrequent_categories>`.
Expand Down Expand Up @@ -955,7 +974,7 @@ def _compute_n_features_outs(self):
return output

@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
def fit(self, X, y=None, sample_weight=None):
"""
Fit OneHotEncoder to X.

Expand All @@ -968,13 +987,21 @@ def fit(self, X, y=None):
Ignored. This parameter exists only for compatibility with
:class:`~sklearn.pipeline.Pipeline`.

sample_weight : array-like of shape (n_samples,), default=None
Sample weights used to weight the categories when filtering
categories with `max_categories` and `min_frequency`. If `None`,
then samples are equally weighted. If both `max_categories` and
`min_frequency` are left at their default values, then `sample_weight`
is ignored.

Returns
-------
self
Fitted encoder.
"""
self._fit(
X,
sample_weight=sample_weight,
handle_unknown=self.handle_unknown,
force_all_finite="allow-nan",
)
Expand Down Expand Up @@ -1294,7 +1321,7 @@ class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder):
infrequent.

- If `float`, categories with a smaller cardinality than
`min_frequency * n_samples` will be considered infrequent.
`min_frequency * n_samples` will be considered infrequent.

.. versionadded:: 1.3
Read more in the :ref:`User Guide <encoder_infrequent_categories>`.
Expand Down
92 changes: 92 additions & 0 deletions sklearn/preprocessing/tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -2323,6 +2323,98 @@ def test_ordinal_encoder_missing_appears_infrequent():
assert_allclose(X_trans, [[2, 1], [2, 0], [np.nan, 0], [1, 0], [0, 1]])


@pytest.mark.parametrize(
    "X, sample_weight, expected_shape",
    [
        (
            [
                ["car", 3],
                ["bike", 3],
                ["car", 1],
                ["bike", 3],
                ["boat", 2],
                ["airplane", 4],
            ],
            np.array([2, 2.5, 0.5, 0.1, 0, 0]),
            (6, 4),  # columns: car, bike, 3, infrequent (1)
        ),
        (
            [["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]],
            np.array([5, 5, 0.1, 0.3, 4, 0.9]),
            (6, 3),
        ),
    ],
)
def test_one_hot_encoder_sample_weight_min_frequency(X, sample_weight, expected_shape):
    """Check that `min_frequency` filters categories by summed sample weight.

    The expected shapes are computed for ``min_frequency=2``: zero-weight rows
    must not contribute categories, and categories whose total weight falls
    below the threshold are grouped into a single infrequent column.
    """
    # NOTE: a previous version parametrized `min_frequency` over [0.3, 0.9, 2]
    # but never used the value — the expected shapes only hold for
    # `min_frequency=2`, so the redundant parametrization is removed.
    ohe = OneHotEncoder(min_frequency=2, handle_unknown="infrequent_if_exist")
    X_trans = ohe.fit_transform(X, sample_weight=sample_weight)
    # Shapes are exact integers; compare directly instead of via allclose.
    assert X_trans.shape == expected_shape


@pytest.mark.parametrize(
    "X, sample_weight, expected_shape",
    [
        (
            [
                ["car", 3],
                ["bike", 3],
                ["car", 1],
                ["bike", 3],
                ["boat", 2],
                ["airplane", 4],
            ],
            np.array([2, 2.5, 0.5, 0.1, 0, 0]),
            (6, 4),
        ),
        (
            [["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]],
            np.array([5, 5, 0.1, 0.3, 4, 0.9]),
            (6, 2),
        ),
    ],
)
def test_one_hot_encoder_sample_weight_max_categories(X, sample_weight, expected_shape):
    """Check that `max_categories` keeps the most frequent categories ranked
    by summed sample weight (zero-weight rows contribute no categories)."""
    ohe = OneHotEncoder(max_categories=2, handle_unknown="ignore")
    X_trans = ohe.fit_transform(X, sample_weight=sample_weight)
    # Shapes are exact integers; compare directly instead of via allclose.
    assert X_trans.shape == expected_shape


@pytest.mark.parametrize(
    "X",
    [[["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]]],
)
@pytest.mark.parametrize(
    "min_frequency",
    [0.1, 0.3, 0.5, 0.9],
)
def test_one_hot_encoder_sample_weight_constant(X, min_frequency):
    """Uniform sample weights must yield the same encoding as no weights at all,
    regardless of the constant's value."""
    ohe = OneHotEncoder(min_frequency=min_frequency)
    baseline = ohe.fit_transform(X, sample_weight=None).toarray()

    for constant_weights in (np.ones(len(X)), 5 * np.ones(len(X))):
        encoded = ohe.fit_transform(X, sample_weight=constant_weights).toarray()
        assert_array_equal(baseline, encoded)


@pytest.mark.parametrize(
    "X",
    [[["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]]],
)
def test_one_hot_encoder_sample_weight_is_ignored(X):
    """With default `max_categories`/`min_frequency`, passing unit sample
    weights must not change the encoding."""
    encoder = OneHotEncoder()
    unweighted = encoder.fit_transform(X).toarray()
    weighted = encoder.fit_transform(X, sample_weight=np.ones(len(X))).toarray()

    assert_array_equal(unweighted, weighted)


@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder])
def test_encoder_not_fitted(Encoder):
"""Check that we raise a `NotFittedError` by calling transform before fit with
Expand Down
Loading
Loading