FEA TargetEncoder should respect sample_weights #29110

Status: Closed · wants to merge 7 commits
7 changes: 7 additions & 0 deletions doc/whats_new/v1.6.rst
@@ -88,6 +88,13 @@ Changelog
whether to raise an exception if a subset of the scorers in multimetric scoring fails
or to return an error code. :pr:`28992` by :user:`Stefanie Senger <StefanieSenger>`.

+:mod:`sklearn.preprocessing`
+............................

+- |Feature| :class:`preprocessing.TargetEncoder` now supports the `sample_weight`
+  parameter in the `fit` and `fit_transform` methods. :pr:`29110`
+  by :user:`Duarte São José <DuarteSJ>` and :user:`Miguel Parece <MiguelParece>`.

Thanks to everyone who has contributed to the maintenance and improvement of
the project since version 1.5, including:

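For context, here is a minimal usage sketch of the feature this PR adds (the data and weights are invented for illustration; the `sample_weight` argument to `fit`/`fit_transform` is the new API):

```python
import numpy as np
from sklearn.preprocessing import TargetEncoder

X = np.array([["dog"], ["dog"], ["cat"], ["cat"], ["cat"]], dtype=object)
y = np.array([10.0, 20.0, 30.0, 40.0, 50.0])
# Up-weight the "cat" rows: the per-category encodings and the global
# prior target_mean_ both shift toward the heavily weighted samples.
sample_weight = np.array([0.5, 0.5, 2.0, 2.0, 2.0])

enc = TargetEncoder(smooth=5.0, random_state=0)
enc.fit(X, y, sample_weight=sample_weight)
print(enc.target_mean_)  # weighted mean of y (~36.4), not the plain mean (30.0)
print(enc.transform(X))
```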
48 changes: 36 additions & 12 deletions sklearn/preprocessing/_target_encoder.py
@@ -7,6 +7,7 @@
from ..utils.multiclass import type_of_target
from ..utils.validation import (
_check_feature_names_in,
+_check_sample_weight,
_check_y,
check_consistent_length,
check_is_fitted,
@@ -209,7 +210,7 @@ def __init__(
self.random_state = random_state

@_fit_context(prefer_skip_nested_validation=True)
-def fit(self, X, y):
+def fit(self, X, y, sample_weight=None):
"""Fit the :class:`TargetEncoder` to X and y.

Parameters
@@ -220,16 +221,19 @@ def fit(self, X, y):
y : array-like of shape (n_samples,)
The target data used to encode the categories.

+sample_weight : array-like of shape (n_samples,), default=None
+Individual weights for each sample. If `None`, all samples are weighted equally.

Returns
-------
self : object
Fitted encoder.
"""
-self._fit_encodings_all(X, y)
+self._fit_encodings_all(X, y, sample_weight)
return self

@_fit_context(prefer_skip_nested_validation=True)
-def fit_transform(self, X, y):
+def fit_transform(self, X, y, sample_weight=None):
"""Fit :class:`TargetEncoder` and transform X with the target encoding.

.. note::
@@ -245,6 +249,9 @@ def fit_transform(self, X, y):
y : array-like of shape (n_samples,)
The target data used to encode the categories.

+sample_weight : array-like of shape (n_samples,), default=None
+Individual weights for each sample. If `None`, all samples are weighted equally.

Returns
-------
X_trans : ndarray of shape (n_samples, n_features) or \
@@ -253,11 +260,14 @@
"""
from ..model_selection import KFold, StratifiedKFold # avoid circular import

-X_ordinal, X_known_mask, y_encoded, n_categories = self._fit_encodings_all(X, y)
+X_ordinal, X_known_mask, y_encoded, n_categories = self._fit_encodings_all(
+X, y, sample_weight
+)

# The cv splitter is voluntarily restricted to *KFold to enforce non
# overlapping validation folds, otherwise the fit_transform output will
# not be well-specified.

if self.target_type_ == "continuous":
cv = KFold(self.cv, shuffle=self.shuffle, random_state=self.random_state)
else:
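(Hunk truncated here.) As the comment above explains, `fit_transform` cross-fits with non-overlapping `*KFold` folds, so each row is encoded using encodings learned on folds that exclude it, whereas `fit(...).transform(...)` applies encodings learned on all rows. A sketch of that difference on random data (invented for illustration):

```python
import numpy as np
from sklearn.preprocessing import TargetEncoder

rng = np.random.default_rng(0)
X = rng.choice(["a", "b", "c"], size=(50, 1))
y = rng.normal(size=50)
w = rng.uniform(0.5, 2.0, size=50)

enc = TargetEncoder(random_state=0)
Xt_cv = enc.fit_transform(X, y, sample_weight=w)       # cross-fitted encodings
Xt_full = enc.fit(X, y, sample_weight=w).transform(X)  # full-data encodings
print(np.allclose(Xt_cv, Xt_full))  # typically False, by design
```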
@@ -273,24 +283,26 @@
)
else:
X_out = np.empty_like(X_ordinal, dtype=np.float64)

+sample_weight = _check_sample_weight(sample_weight, X)
for train_idx, test_idx in cv.split(X, y):
X_train, y_train = X_ordinal[train_idx, :], y_encoded[train_idx]
-y_train_mean = np.mean(y_train, axis=0)
+sample_weight_train = sample_weight[train_idx]
+y_train_mean = np.average(y_train, weights=sample_weight_train, axis=0)
if self.target_type_ == "multiclass":
encodings = self._fit_encoding_multiclass(
X_train,
y_train,
n_categories,
y_train_mean,
+sample_weight_train,
)
else:
encodings = self._fit_encoding_binary_or_continuous(
X_train,
y_train,
n_categories,
y_train_mean,
+sample_weight_train,
)
self._transform_X_ordinal(
X_out,
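(Hunk truncated here.) `_check_sample_weight` is scikit-learn's standard validation helper: it turns `None` into an all-ones weight vector of length `n_samples` and validates shape and dtype, which is why the weighted code paths never special-case `None`. A small illustration (a private helper, so its exact behavior is subject to change):

```python
import numpy as np
from sklearn.utils.validation import _check_sample_weight

X = np.zeros((4, 2))
print(_check_sample_weight(None, X))          # [1. 1. 1. 1.]
print(_check_sample_weight([1, 2, 3, 4], X))  # [1. 2. 3. 4.] as float64
```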
@@ -344,7 +356,7 @@ def transform(self, X):
)
return X_out

-def _fit_encodings_all(self, X, y):
+def _fit_encodings_all(self, X, y, sample_weight=None):
"""Fit a target encoding with all the data."""
# avoid circular import
from ..preprocessing import (
@@ -353,6 +365,7 @@
)

check_consistent_length(X, y)
+sample_weight = _check_sample_weight(sample_weight, X)
self._fit(X, handle_unknown="ignore", force_all_finite="allow-nan")

if self.target_type == "auto":
@@ -380,7 +393,7 @@
else: # continuous
y = _check_y(y, y_numeric=True, estimator=self)

-self.target_mean_ = np.mean(y, axis=0)
+self.target_mean_ = np.average(y, weights=sample_weight, axis=0)

X_ordinal, X_known_mask = self._transform(
X, handle_unknown="ignore", force_all_finite="allow-nan"
@@ -396,42 +409,52 @@
y,
n_categories,
self.target_mean_,
+sample_weight,
)
else:
encodings = self._fit_encoding_binary_or_continuous(
X_ordinal,
y,
n_categories,
self.target_mean_,
+sample_weight,
)
self.encodings_ = encodings

return X_ordinal, X_known_mask, y, n_categories

def _fit_encoding_binary_or_continuous(
-self, X_ordinal, y, n_categories, target_mean
+self, X_ordinal, y, n_categories, target_mean, sample_weight
):
"""Learn target encodings."""
if self.smooth == "auto":
-y_variance = np.var(y)
+y_variance = np.sum(sample_weight * (y - target_mean) ** 2) / (
+np.sum(sample_weight)
+)

encodings = _fit_encoding_fast_auto_smooth(
X_ordinal,
y,
+sample_weight,
n_categories,
target_mean,
y_variance,
)
else:

encodings = _fit_encoding_fast(
X_ordinal,
y,
+sample_weight,
n_categories,
self.smooth,
target_mean,
)
return encodings
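With `smooth="auto"`, the unweighted `np.var(y)` above is replaced by a weighted variance around the weighted `target_mean`. A quick sanity check that the new expression reduces to the old one for unit weights (illustrative only):

```python
import numpy as np

y = np.array([1.0, 2.0, 3.0, 4.0])
sample_weight = np.ones_like(y)
target_mean = np.average(y, weights=sample_weight)
y_variance = np.sum(sample_weight * (y - target_mean) ** 2) / np.sum(sample_weight)
assert np.isclose(y_variance, np.var(y))  # unit weights recover np.var(y)
```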

-def _fit_encoding_multiclass(self, X_ordinal, y, n_categories, target_mean):
+def _fit_encoding_multiclass(
+self, X_ordinal, y, n_categories, target_mean, sample_weight
+):
"""Learn multiclass encodings.

Learn encodings for each class (c) then reorder encodings such that
@@ -452,6 +475,7 @@ def _fit_encoding_multiclass(self, X_ordinal, y, n_categories, target_mean):
y_class,
n_categories,
target_mean[i],
+sample_weight,
)
encodings.extend(encoding)

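In the multiclass path, each class column is encoded as an independent binary target while the same `sample_weight` vector is reused for every class, so per class the prior is a weighted class frequency. A rough numpy sketch of that idea (names invented; not the actual helper):

```python
import numpy as np
from sklearn.preprocessing import LabelBinarizer

y = np.array(["a", "b", "a", "c"])
sample_weight = np.array([1.0, 2.0, 1.0, 0.5])
Y_bin = LabelBinarizer().fit_transform(y)  # shape (n_samples, n_classes)
# Weighted prior per class, playing the role of target_mean[i] above.
print(np.average(Y_bin, weights=sample_weight, axis=0))
```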
31 changes: 19 additions & 12 deletions sklearn/preprocessing/_target_encoder_fast.pyx
@@ -15,11 +15,17 @@ ctypedef fused Y_DTYPE:
int32_t
float64_t
float32_t
+ctypedef fused W_DTYPE:
+int64_t
+int32_t
+float64_t
+float32_t


def _fit_encoding_fast(
INT_DTYPE[:, ::1] X_int,
const Y_DTYPE[:] y,
+const W_DTYPE[:] sample_weight,
int64_t[::1] n_categories,
double smooth,
double y_mean,
@@ -65,8 +71,8 @@
# -1 are unknown categories, which are not counted
if X_int_tmp == -1:
continue
-sums[X_int_tmp] += y[sample_idx]
-counts[X_int_tmp] += 1.0
+sums[X_int_tmp] += y[sample_idx] * sample_weight[sample_idx]
+counts[X_int_tmp] += sample_weight[sample_idx]

for cat_idx in range(n_cats):
if counts[cat_idx] == 0:
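(Hunk truncated here.) With a fixed `smooth`, the loop above now accumulates weighted target sums and weighted counts, so each category's encoding becomes a weighted mean shrunk toward the global mean. A numpy sketch of the per-category formula for a single feature (unknown categories assumed already filtered out; the function name and layout are mine, not the Cython implementation's):

```python
import numpy as np

def weighted_smoothed_encoding(codes, y, w, n_cats, smooth, y_mean):
    """(weighted sum + smooth * y_mean) / (weighted count + smooth) per category."""
    sums = np.zeros(n_cats)
    counts = np.zeros(n_cats)
    np.add.at(sums, codes, y * w)  # weighted target sums per category
    np.add.at(counts, codes, w)    # weighted counts per category
    return (sums + smooth * y_mean) / (counts + smooth)

codes = np.array([0, 0, 1, 1, 1])  # ordinal-encoded categories
y = np.array([10.0, 20.0, 30.0, 40.0, 50.0])
w = np.array([0.5, 0.5, 2.0, 2.0, 2.0])
print(weighted_smoothed_encoding(codes, y, w, n_cats=2, smooth=5.0,
                                 y_mean=np.average(y, weights=w)))
```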
@@ -80,6 +86,7 @@
def _fit_encoding_fast_auto_smooth(
INT_DTYPE[:, ::1] X_int,
const Y_DTYPE[:] y,
+const W_DTYPE[:] sample_weight,
int64_t[::1] n_categories,
double y_mean,
double y_variance,
@@ -99,7 +106,7 @@
int n_features = X_int.shape[1]
int64_t max_n_cats = np.max(n_categories)
double[::1] means = np.empty(max_n_cats, dtype=np.float64)
-int64_t[::1] counts = np.empty(max_n_cats, dtype=np.int64)
+double[::1] weighted_counts = np.empty(max_n_cats, dtype=np.float64)
double[::1] sum_of_squared_diffs = np.empty(max_n_cats, dtype=np.float64)
double lambda_
list encodings = []
@@ -124,35 +131,35 @@

for cat_idx in range(n_cats):
means[cat_idx] = 0.0
-counts[cat_idx] = 0
+weighted_counts[cat_idx] = 0.0
sum_of_squared_diffs[cat_idx] = 0.0

-# first pass to compute the mean
+# first pass to compute the weighted mean
for sample_idx in range(n_samples):
X_int_tmp = X_int[sample_idx, feat_idx]

# -1 are unknown categories, which are not counted
if X_int_tmp == -1:
continue
-counts[X_int_tmp] += 1
-means[X_int_tmp] += y[sample_idx]
+weighted_counts[X_int_tmp] += sample_weight[sample_idx]
+means[X_int_tmp] += y[sample_idx] * sample_weight[sample_idx]

for cat_idx in range(n_cats):
-means[cat_idx] /= counts[cat_idx]
+means[cat_idx] /= weighted_counts[cat_idx]

# second pass to compute the sum of squared differences
for sample_idx in range(n_samples):
X_int_tmp = X_int[sample_idx, feat_idx]
if X_int_tmp == -1:
continue
diff = y[sample_idx] - means[X_int_tmp]
-sum_of_squared_diffs[X_int_tmp] += diff * diff
+sum_of_squared_diffs[X_int_tmp] += diff * diff * sample_weight[sample_idx]

for cat_idx in range(n_cats):
lambda_ = (
-y_variance * counts[cat_idx] /
-(y_variance * counts[cat_idx] + sum_of_squared_diffs[cat_idx] /
-counts[cat_idx])
+y_variance * weighted_counts[cat_idx] /
+(y_variance * weighted_counts[cat_idx] + sum_of_squared_diffs[cat_idx] /
+weighted_counts[cat_idx])
)
if isnan(lambda_):
# A nan can happen when:
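The expression above is an empirical-Bayes shrinkage factor: per category, lambda = s2 * n_c / (s2 * n_c + s_c2), where s2 is the global weighted target variance, n_c the category's weighted count, and s_c2 = sum_of_squared_diffs / n_c its weighted within-category variance; the final encoding is lambda * category_mean + (1 - lambda) * y_mean. A numpy sketch of the whole weighted auto-smooth computation (variable names invented):

```python
import numpy as np

def auto_smooth_encoding(codes, y, w, n_cats, y_mean, y_variance):
    means = np.zeros(n_cats)
    wc = np.zeros(n_cats)   # weighted counts per category
    ssd = np.zeros(n_cats)  # weighted sums of squared diffs per category
    np.add.at(wc, codes, w)
    np.add.at(means, codes, y * w)
    means /= wc  # first pass: weighted category means
    np.add.at(ssd, codes, w * (y - means[codes]) ** 2)  # second pass
    lam = y_variance * wc / (y_variance * wc + ssd / wc)
    return lam * means + (1.0 - lam) * y_mean

codes = np.array([0, 0, 1, 1, 1])
y = np.array([10.0, 20.0, 30.0, 40.0, 50.0])
w = np.array([0.5, 0.5, 2.0, 2.0, 2.0])
y_mean = np.average(y, weights=w)
y_var = np.sum(w * (y - y_mean) ** 2) / np.sum(w)
print(auto_smooth_encoding(codes, y, w, 2, y_mean, y_var))
```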