feat: Add support for sample_weights in TargetEncoder #31324

Open · wants to merge 1 commit into base: main
@@ -0,0 +1,2 @@
+- `TargetEncoder` now supports sample weights, so that differing observation frequencies are respected during encoding,
+by :user:`DuarteSJ <DuarteSJ>`.
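As a usage sketch of the new parameter (assuming this branch is installed; the toy data and `smooth` value are arbitrary):

```python
import numpy as np
from sklearn.preprocessing import TargetEncoder

# one categorical feature, binary target; enough rows per class for the
# default 5-fold cross fitting used by fit_transform
X = np.array([["dog"], ["dog"], ["cat"], ["cat"], ["cat"], ["fish"]] * 2, dtype=object)
y = np.array([1, 0, 1, 1, 0, 0] * 2)

# the first sample should count three times, as if its row were repeated
sample_weight = np.ones(len(y))
sample_weight[0] = 3.0

enc = TargetEncoder(smooth=5.0, random_state=0)
X_trans = enc.fit_transform(X, y, sample_weight=sample_weight)  # cross-fitted encoding
X_full = enc.transform(X)  # uses the full-data encodings learned during fit
```

A weight of `k` should behave like repeating the corresponding row `k` times, and all-ones weights should reproduce the current unweighted output; both make natural invariance tests for this PR.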
sklearn/preprocessing/_target_encoder.py (69 changes: 53 additions & 16 deletions)
@@ -1,7 +1,7 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

-from numbers import Integral, Real
+from numbers import Real

import numpy as np

@@ -10,6 +10,7 @@
from ..utils.multiclass import type_of_target
from ..utils.validation import (
_check_feature_names_in,
+_check_sample_weight,
_check_y,
check_consistent_length,
check_is_fitted,
@@ -91,10 +92,24 @@ class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder):
more weight on the global target mean.
If `"auto"`, then `smooth` is set to an empirical Bayes estimate.

-cv : int, default=5
-Determines the number of folds in the :term:`cross fitting` strategy used in
-:meth:`fit_transform`. For classification targets, `StratifiedKFold` is used
-and for continuous targets, `KFold` is used.
+cv : int, cross-validation generator or an iterable, default=None
+Determines the cross-validation splitting strategy.
+Possible inputs for cv are:
+
+- None, to use the default 5-fold cross validation,
+- integer, to specify the number of folds in a `(Stratified)KFold`,
+- :term:`CV splitter`,
+- an iterable yielding (train, test) splits as arrays of indices.
+
+For integer/None inputs, if ``y`` is either binary or multiclass,
+:class:`~sklearn.model_selection.StratifiedKFold` is used. In all other
+cases, :class:`~sklearn.model_selection.KFold` is used. These splitters
+are instantiated with `shuffle=False` so the splits will be the same
+across calls.
+
+Refer to the :ref:`User Guide <cross_validation>` for the various
+cross-validation strategies that can be used here.

shuffle : bool, default=True
Whether to shuffle the data in :meth:`fit_transform` before splitting into
@@ -190,7 +205,7 @@ class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder):
"categories": [StrOptions({"auto"}), list],
"target_type": [StrOptions({"auto", "continuous", "binary", "multiclass"})],
"smooth": [StrOptions({"auto"}), Interval(Real, 0, None, closed="left")],
"cv": [Interval(Integral, 2, None, closed="left")],
"cv": ["cv_object"],
"shuffle": ["boolean"],
"random_state": ["random_state"],
}
@@ -212,7 +227,7 @@ def __init__(
self.random_state = random_state

@_fit_context(prefer_skip_nested_validation=True)
-def fit(self, X, y):
+def fit(self, X, y, sample_weight=None):
"""Fit the :class:`TargetEncoder` to X and y.

Parameters
@@ -223,16 +238,19 @@ def fit(self, X, y):
y : array-like of shape (n_samples,)
The target data used to encode the categories.

+sample_weight : array-like of shape (n_samples,), default=None
+Individual weights for each sample. If `None`, samples are equally weighted.

Returns
-------
self : object
Fitted encoder.
"""
-self._fit_encodings_all(X, y)
+self._fit_encodings_all(X, y, sample_weight)
return self

@_fit_context(prefer_skip_nested_validation=True)
-def fit_transform(self, X, y):
+def fit_transform(self, X, y, sample_weight=None):
"""Fit :class:`TargetEncoder` and transform X with the target encoding.

.. note::
@@ -248,6 +266,9 @@ def fit_transform(self, X, y):
y : array-like of shape (n_samples,)
The target data used to encode the categories.

+sample_weight : array-like of shape (n_samples,), default=None
+Individual weights for each sample. If `None`, samples are equally weighted.

Returns
-------
X_trans : ndarray of shape (n_samples, n_features) or \
@@ -256,7 +277,9 @@
"""
from ..model_selection import KFold, StratifiedKFold # avoid circular import

-X_ordinal, X_known_mask, y_encoded, n_categories = self._fit_encodings_all(X, y)
+X_ordinal, X_known_mask, y_encoded, n_categories = self._fit_encodings_all(
+X, y, sample_weight
+)

# The cv splitter is voluntarily restricted to *KFold to enforce non
# overlapping validation folds, otherwise the fit_transform output will
@@ -277,23 +300,27 @@
else:
X_out = np.empty_like(X_ordinal, dtype=np.float64)

+sample_weight = _check_sample_weight(sample_weight, X)
for train_idx, test_idx in cv.split(X, y):
X_train, y_train = X_ordinal[train_idx, :], y_encoded[train_idx]
-y_train_mean = np.mean(y_train, axis=0)
+sample_weight_train = sample_weight[train_idx]
+y_train_mean = np.average(y_train, weights=sample_weight_train, axis=0)

if self.target_type_ == "multiclass":
encodings = self._fit_encoding_multiclass(
X_train,
y_train,
n_categories,
y_train_mean,
+sample_weight_train,
)
else:
encodings = self._fit_encoding_binary_or_continuous(
X_train,
y_train,
n_categories,
y_train_mean,
+sample_weight_train,
)
self._transform_X_ordinal(
X_out,
@@ -347,7 +374,7 @@ def transform(self, X):
)
return X_out

-def _fit_encodings_all(self, X, y):
+def _fit_encodings_all(self, X, y, sample_weight=None):
"""Fit a target encoding with all the data."""
# avoid circular import
from ..preprocessing import (
@@ -356,6 +383,7 @@ def _fit_encodings_all(self, X, y):
)

check_consistent_length(X, y)
+sample_weight = _check_sample_weight(sample_weight, X)
self._fit(X, handle_unknown="ignore", ensure_all_finite="allow-nan")

if self.target_type == "auto":
@@ -383,7 +411,7 @@ def _fit_encodings_all(self, X, y):
else: # continuous
y = _check_y(y, y_numeric=True, estimator=self)

-self.target_mean_ = np.mean(y, axis=0)
+self.target_mean_ = np.average(y, weights=sample_weight, axis=0)

X_ordinal, X_known_mask = self._transform(
X, handle_unknown="ignore", ensure_all_finite="allow-nan"
@@ -399,27 +427,32 @@ def _fit_encodings_all(self, X, y):
y,
n_categories,
self.target_mean_,
+sample_weight,
)
else:
encodings = self._fit_encoding_binary_or_continuous(
X_ordinal,
y,
n_categories,
self.target_mean_,
+sample_weight,
)
self.encodings_ = encodings

return X_ordinal, X_known_mask, y, n_categories

def _fit_encoding_binary_or_continuous(
-self, X_ordinal, y, n_categories, target_mean
+self, X_ordinal, y, n_categories, target_mean, sample_weight
):
"""Learn target encodings."""
if self.smooth == "auto":
-y_variance = np.var(y)
+y_variance = np.sum(sample_weight * (y - target_mean) ** 2) / np.sum(
+sample_weight
+)
encodings = _fit_encoding_fast_auto_smooth(
X_ordinal,
y,
+sample_weight,
n_categories,
target_mean,
y_variance,
@@ -428,13 +461,16 @@ def _fit_encoding_binary_or_continuous(
encodings = _fit_encoding_fast(
X_ordinal,
y,
+sample_weight,
n_categories,
self.smooth,
target_mean,
)
return encodings

-def _fit_encoding_multiclass(self, X_ordinal, y, n_categories, target_mean):
+def _fit_encoding_multiclass(
+self, X_ordinal, y, n_categories, target_mean, sample_weight
+):
"""Learn multiclass encodings.

Learn encodings for each class (c) then reorder encodings such that
@@ -455,6 +491,7 @@ def _fit_encoding_multiclass(self, X_ordinal, y, n_categories, target_mean):
y_class,
n_categories,
target_mean[i],
+sample_weight,
)
encodings.extend(encoding)

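To make the review easier, here is a rough NumPy transcription of what the weighted fixed-`smooth` path now computes for one feature. The helper name `weighted_encoding` is invented for illustration, and the final shrinkage step is reproduced from the pre-existing code, since the corresponding lines sit below the truncated hunk:

```python
import numpy as np

def weighted_encoding(codes, y, sample_weight, n_cats, smooth, y_mean):
    """Weighted per-category target means, shrunk toward the global mean."""
    sums = np.zeros(n_cats)
    counts = np.zeros(n_cats)
    for code, target, w in zip(codes, y, sample_weight):
        if code == -1:  # unknown categories are not counted
            continue
        sums[code] += w * target  # weighted sum replaces the plain sum
        counts[code] += w         # weighted count replaces the plain count
    # categories never seen in the training split fall back to the global mean
    encoding = np.full(n_cats, y_mean)
    seen = counts > 0
    encoding[seen] = (sums[seen] + smooth * y_mean) / (counts[seen] + smooth)
    return encoding
```

With `sample_weight` all ones this reduces exactly to the previous unweighted encoding.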
sklearn/preprocessing/_target_encoder_fast.pyx (32 changes: 20 additions & 12 deletions)
@@ -16,10 +16,17 @@ ctypedef fused Y_DTYPE:
float64_t
float32_t

+ctypedef fused WEIGHT_DTYPE:
+int64_t
+int32_t
+float64_t
+float32_t


def _fit_encoding_fast(
INT_DTYPE[:, ::1] X_int,
const Y_DTYPE[:] y,
+const WEIGHT_DTYPE[:] sample_weight,
int64_t[::1] n_categories,
double smooth,
double y_mean,
@@ -65,8 +72,8 @@ def _fit_encoding_fast(
# -1 are unknown categories, which are not counted
if X_int_tmp == -1:
continue
-sums[X_int_tmp] += y[sample_idx]
-counts[X_int_tmp] += 1.0
+sums[X_int_tmp] += y[sample_idx] * sample_weight[sample_idx]
+counts[X_int_tmp] += sample_weight[sample_idx]

for cat_idx in range(n_cats):
if counts[cat_idx] == 0:
@@ -80,6 +87,7 @@
def _fit_encoding_fast_auto_smooth(
INT_DTYPE[:, ::1] X_int,
const Y_DTYPE[:] y,
+const WEIGHT_DTYPE[:] sample_weight,
int64_t[::1] n_categories,
double y_mean,
double y_variance,
@@ -99,7 +107,7 @@ def _fit_encoding_fast_auto_smooth(
int n_features = X_int.shape[1]
int64_t max_n_cats = np.max(n_categories)
double[::1] means = np.empty(max_n_cats, dtype=np.float64)
-int64_t[::1] counts = np.empty(max_n_cats, dtype=np.int64)
+double[::1] weighed_counts = np.empty(max_n_cats, dtype=np.float64)
double[::1] sum_of_squared_diffs = np.empty(max_n_cats, dtype=np.float64)
double lambda_
list encodings = []
@@ -124,35 +132,35 @@

for cat_idx in range(n_cats):
means[cat_idx] = 0.0
-counts[cat_idx] = 0
+weighed_counts[cat_idx] = 0.0
sum_of_squared_diffs[cat_idx] = 0.0

-# first pass to compute the mean
+# first pass to compute the weighted mean
for sample_idx in range(n_samples):
X_int_tmp = X_int[sample_idx, feat_idx]

# -1 are unknown categories, which are not counted
if X_int_tmp == -1:
continue
-counts[X_int_tmp] += 1
-means[X_int_tmp] += y[sample_idx]
+weighed_counts[X_int_tmp] += sample_weight[sample_idx]
+means[X_int_tmp] += y[sample_idx] * sample_weight[sample_idx]

for cat_idx in range(n_cats):
-means[cat_idx] /= counts[cat_idx]
+means[cat_idx] /= weighed_counts[cat_idx]

# second pass to compute the sum of squared differences
for sample_idx in range(n_samples):
X_int_tmp = X_int[sample_idx, feat_idx]
if X_int_tmp == -1:
continue
diff = y[sample_idx] - means[X_int_tmp]
-sum_of_squared_diffs[X_int_tmp] += diff * diff
+sum_of_squared_diffs[X_int_tmp] += diff * diff * sample_weight[sample_idx]

for cat_idx in range(n_cats):
lambda_ = (
-y_variance * counts[cat_idx] /
-(y_variance * counts[cat_idx] + sum_of_squared_diffs[cat_idx] /
-counts[cat_idx])
+y_variance * weighed_counts[cat_idx] /
+(y_variance * weighed_counts[cat_idx] + sum_of_squared_diffs[cat_idx] /
+weighed_counts[cat_idx])
)
if isnan(lambda_):
# A nan can happen when:
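Summarizing the weighted auto-smooth path, with per-sample weights w_i the quantities above combine as follows for a category c. The final blending line falls outside the shown hunk, so it is inferred from the existing unweighted implementation:

```latex
% sigma_y^2 is the weighted y_variance computed on the Python side
n_c = \sum_{i \in c} w_i, \qquad
\mu_c = \frac{\sum_{i \in c} w_i\, y_i}{n_c}, \qquad
s_c = \sum_{i \in c} w_i\, (y_i - \mu_c)^2

\lambda_c = \frac{\sigma_y^2\, n_c}{\sigma_y^2\, n_c + s_c / n_c}, \qquad
\mathrm{encoding}_c = \lambda_c\, \mu_c + (1 - \lambda_c)\, \bar{y}
```

With unit weights, n_c is the plain category count and these formulas reduce exactly to the current behaviour.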