feat: Add support for sample_weights in TargetEncoder #31324

Open · wants to merge 1 commit into base: main
@@ -0,0 +1,2 @@
+- `TargetEncoder` now supports sample weights, so that differing observation frequencies are respected during encoding,
+by :user:`DuarteSJ <DuarteSJ>`.
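As a usage sketch of the new parameter (assuming this branch is installed; the toy data and `smooth` value are arbitrary):

```python
import numpy as np
from sklearn.preprocessing import TargetEncoder

# one categorical feature, binary target; enough rows per class for the
# default 5-fold cross fitting used by fit_transform
X = np.array([["dog"], ["dog"], ["cat"], ["cat"], ["cat"], ["fish"]] * 2, dtype=object)
y = np.array([1, 0, 1, 1, 0, 0] * 2)

# the first sample should count three times, as if its row were repeated
sample_weight = np.ones(len(y))
sample_weight[0] = 3.0

enc = TargetEncoder(smooth=5.0, random_state=0)
X_trans = enc.fit_transform(X, y, sample_weight=sample_weight)  # cross-fitted encoding
X_full = enc.transform(X)  # uses the full-data encodings learned during fit
```

A weight of `k` should behave like repeating the corresponding row `k` times, and all-ones weights should reproduce the current unweighted output; both make natural invariance tests for this PR.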
sklearn/preprocessing/_target_encoder.py (69 changes: 53 additions & 16 deletions)
@@ -1,7 +1,7 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

-from numbers import Integral, Real
+from numbers import Real

import numpy as np

@@ -10,6 +10,7 @@
from ..utils.multiclass import type_of_target
from ..utils.validation import (
_check_feature_names_in,
+_check_sample_weight,
_check_y,
check_consistent_length,
check_is_fitted,
@@ -91,10 +92,24 @@ class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder):
more weight on the global target mean.
If `"auto"`, then `smooth` is set to an empirical Bayes estimate.

-cv : int, default=5
-Determines the number of folds in the :term:`cross fitting` strategy used in
-:meth:`fit_transform`. For classification targets, `StratifiedKFold` is used
-and for continuous targets, `KFold` is used.
+cv : int, cross-validation generator or an iterable, default=None
+Determines the cross-validation splitting strategy.
+Possible inputs for cv are:
+
+- None, to use the default 5-fold cross validation,
+- integer, to specify the number of folds in a `(Stratified)KFold`,
+- :term:`CV splitter`,
+- an iterable yielding (train, test) splits as arrays of indices.
+
+For integer/None inputs, if ``y`` is either binary or multiclass,
+:class:`~sklearn.model_selection.StratifiedKFold` is used. In all other
+cases, :class:`~sklearn.model_selection.KFold` is used. These splitters
+are instantiated with `shuffle=False` so the splits will be the same
+across calls.
+
+Refer to the :ref:`User Guide <cross_validation>` for the various
+cross-validation strategies that can be used here.

shuffle : bool, default=True
Whether to shuffle the data in :meth:`fit_transform` before splitting into
@@ -190,7 +205,7 @@ class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder):
"categories": [StrOptions({"auto"}), list],
"target_type": [StrOptions({"auto", "continuous", "binary", "multiclass"})],
"smooth": [StrOptions({"auto"}), Interval(Real, 0, None, closed="left")],
"cv": [Interval(Integral, 2, None, closed="left")],
"cv": ["cv_object"],
"shuffle": ["boolean"],
"random_state": ["random_state"],
}
@@ -212,7 +227,7 @@ def __init__(
self.random_state = random_state

@_fit_context(prefer_skip_nested_validation=True)
-def fit(self, X, y):
+def fit(self, X, y, sample_weight=None):
"""Fit the :class:`TargetEncoder` to X and y.

Parameters
@@ -223,16 +238,19 @@ def fit(self, X, y):
y : array-like of shape (n_samples,)
The target data used to encode the categories.

+sample_weight : array-like of shape (n_samples,), default=None
+Individual weights for each sample. If `None`, samples are equally weighted.

Returns
-------
self : object
Fitted encoder.
"""
-self._fit_encodings_all(X, y)
+self._fit_encodings_all(X, y, sample_weight)
return self

@_fit_context(prefer_skip_nested_validation=True)
-def fit_transform(self, X, y):
+def fit_transform(self, X, y, sample_weight=None):
"""Fit :class:`TargetEncoder` and transform X with the target encoding.

.. note::
@@ -248,6 +266,9 @@ def fit_transform(self, X, y):
y : array-like of shape (n_samples,)
The target data used to encode the categories.

+sample_weight : array-like of shape (n_samples,), default=None
+Individual weights for each sample. If `None`, samples are equally weighted.

Returns
-------
X_trans : ndarray of shape (n_samples, n_features) or \
@@ -256,7 +277,9 @@
"""
from ..model_selection import KFold, StratifiedKFold # avoid circular import

-X_ordinal, X_known_mask, y_encoded, n_categories = self._fit_encodings_all(X, y)
+X_ordinal, X_known_mask, y_encoded, n_categories = self._fit_encodings_all(
+X, y, sample_weight
+)

# The cv splitter is voluntarily restricted to *KFold to enforce non
# overlapping validation folds, otherwise the fit_transform output will
@@ -277,23 +300,27 @@
else:
X_out = np.empty_like(X_ordinal, dtype=np.float64)

+sample_weight = _check_sample_weight(sample_weight, X)
for train_idx, test_idx in cv.split(X, y):
X_train, y_train = X_ordinal[train_idx, :], y_encoded[train_idx]
-y_train_mean = np.mean(y_train, axis=0)
+sample_weight_train = sample_weight[train_idx]
+y_train_mean = np.average(y_train, weights=sample_weight_train, axis=0)

if self.target_type_ == "multiclass":
encodings = self._fit_encoding_multiclass(
X_train,
y_train,
n_categories,
y_train_mean,
+sample_weight_train,
)
else:
encodings = self._fit_encoding_binary_or_continuous(
X_train,
y_train,
n_categories,
y_train_mean,
+sample_weight_train,
)
self._transform_X_ordinal(
X_out,
@@ -347,7 +374,7 @@ def transform(self, X):
)
return X_out

-def _fit_encodings_all(self, X, y):
+def _fit_encodings_all(self, X, y, sample_weight=None):
"""Fit a target encoding with all the data."""
# avoid circular import
from ..preprocessing import (
@@ -356,6 +383,7 @@ def _fit_encodings_all(self, X, y):
)

check_consistent_length(X, y)
+sample_weight = _check_sample_weight(sample_weight, X)
self._fit(X, handle_unknown="ignore", ensure_all_finite="allow-nan")

if self.target_type == "auto":
@@ -383,7 +411,7 @@ def _fit_encodings_all(self, X, y):
else: # continuous
y = _check_y(y, y_numeric=True, estimator=self)

-self.target_mean_ = np.mean(y, axis=0)
+self.target_mean_ = np.average(y, weights=sample_weight, axis=0)

X_ordinal, X_known_mask = self._transform(
X, handle_unknown="ignore", ensure_all_finite="allow-nan"
@@ -399,27 +427,32 @@ def _fit_encodings_all(self, X, y):
y,
n_categories,
self.target_mean_,
+sample_weight,
)
else:
encodings = self._fit_encoding_binary_or_continuous(
X_ordinal,
y,
n_categories,
self.target_mean_,
+sample_weight,
)
self.encodings_ = encodings

return X_ordinal, X_known_mask, y, n_categories

def _fit_encoding_binary_or_continuous(
-self, X_ordinal, y, n_categories, target_mean
+self, X_ordinal, y, n_categories, target_mean, sample_weight
):
"""Learn target encodings."""
if self.smooth == "auto":
-y_variance = np.var(y)
+y_variance = np.sum(sample_weight * (y - target_mean) ** 2) / np.sum(
+sample_weight
+)
encodings = _fit_encoding_fast_auto_smooth(
X_ordinal,
y,
+sample_weight,
n_categories,
target_mean,
y_variance,
@@ -428,13 +461,16 @@ def _fit_encoding_binary_or_continuous(
encodings = _fit_encoding_fast(
X_ordinal,
y,
+sample_weight,
n_categories,
self.smooth,
target_mean,
)
return encodings

-def _fit_encoding_multiclass(self, X_ordinal, y, n_categories, target_mean):
+def _fit_encoding_multiclass(
+self, X_ordinal, y, n_categories, target_mean, sample_weight
+):
"""Learn multiclass encodings.

Learn encodings for each class (c) then reorder encodings such that
@@ -455,6 +491,7 @@ def _fit_encoding_multiclass(self, X_ordinal, y, n_categories, target_mean):
y_class,
n_categories,
target_mean[i],
+sample_weight,
)
encodings.extend(encoding)

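To make the review easier, here is a rough NumPy transcription of what the weighted fixed-`smooth` path now computes for one feature. The helper name `weighted_encoding` is invented for illustration, and the final shrinkage step is reproduced from the pre-existing code, since the corresponding lines sit below the truncated hunk:

```python
import numpy as np

def weighted_encoding(codes, y, sample_weight, n_cats, smooth, y_mean):
    """Weighted per-category target means, shrunk toward the global mean."""
    sums = np.zeros(n_cats)
    counts = np.zeros(n_cats)
    for code, target, w in zip(codes, y, sample_weight):
        if code == -1:  # unknown categories are not counted
            continue
        sums[code] += w * target  # weighted sum replaces the plain sum
        counts[code] += w         # weighted count replaces the plain count
    # categories never seen in the training split fall back to the global mean
    encoding = np.full(n_cats, y_mean)
    seen = counts > 0
    encoding[seen] = (sums[seen] + smooth * y_mean) / (counts[seen] + smooth)
    return encoding
```

With `sample_weight` all ones this reduces exactly to the previous unweighted encoding.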
sklearn/preprocessing/_target_encoder_fast.pyx (32 changes: 20 additions & 12 deletions)
@@ -16,10 +16,17 @@ ctypedef fused Y_DTYPE:
float64_t
float32_t

+ctypedef fused WEIGHT_DTYPE:
+int64_t
+int32_t
+float64_t
+float32_t


def _fit_encoding_fast(
INT_DTYPE[:, ::1] X_int,
const Y_DTYPE[:] y,
+const WEIGHT_DTYPE[:] sample_weight,
int64_t[::1] n_categories,
double smooth,
double y_mean,
@@ -65,8 +72,8 @@ def _fit_encoding_fast(
# -1 are unknown categories, which are not counted
if X_int_tmp == -1:
continue
-sums[X_int_tmp] += y[sample_idx]
-counts[X_int_tmp] += 1.0
+sums[X_int_tmp] += y[sample_idx] * sample_weight[sample_idx]
+counts[X_int_tmp] += sample_weight[sample_idx]

for cat_idx in range(n_cats):
if counts[cat_idx] == 0:
@@ -80,6 +87,7 @@
def _fit_encoding_fast_auto_smooth(
INT_DTYPE[:, ::1] X_int,
const Y_DTYPE[:] y,
+const WEIGHT_DTYPE[:] sample_weight,
int64_t[::1] n_categories,
double y_mean,
double y_variance,
@@ -99,7 +107,7 @@ def _fit_encoding_fast_auto_smooth(
int n_features = X_int.shape[1]
int64_t max_n_cats = np.max(n_categories)
double[::1] means = np.empty(max_n_cats, dtype=np.float64)
-int64_t[::1] counts = np.empty(max_n_cats, dtype=np.int64)
+double[::1] weighed_counts = np.empty(max_n_cats, dtype=np.float64)
double[::1] sum_of_squared_diffs = np.empty(max_n_cats, dtype=np.float64)
double lambda_
list encodings = []
@@ -124,35 +132,35 @@

for cat_idx in range(n_cats):
means[cat_idx] = 0.0
-counts[cat_idx] = 0
+weighed_counts[cat_idx] = 0.0
sum_of_squared_diffs[cat_idx] = 0.0

-# first pass to compute the mean
+# first pass to compute the weighted mean
for sample_idx in range(n_samples):
X_int_tmp = X_int[sample_idx, feat_idx]

# -1 are unknown categories, which are not counted
if X_int_tmp == -1:
continue
-counts[X_int_tmp] += 1
-means[X_int_tmp] += y[sample_idx]
+weighed_counts[X_int_tmp] += sample_weight[sample_idx]
+means[X_int_tmp] += y[sample_idx] * sample_weight[sample_idx]

for cat_idx in range(n_cats):
-means[cat_idx] /= counts[cat_idx]
+means[cat_idx] /= weighed_counts[cat_idx]

# second pass to compute the sum of squared differences
for sample_idx in range(n_samples):
X_int_tmp = X_int[sample_idx, feat_idx]
if X_int_tmp == -1:
continue
diff = y[sample_idx] - means[X_int_tmp]
-sum_of_squared_diffs[X_int_tmp] += diff * diff
+sum_of_squared_diffs[X_int_tmp] += diff * diff * sample_weight[sample_idx]

for cat_idx in range(n_cats):
lambda_ = (
-y_variance * counts[cat_idx] /
-(y_variance * counts[cat_idx] + sum_of_squared_diffs[cat_idx] /
-counts[cat_idx])
+y_variance * weighed_counts[cat_idx] /
+(y_variance * weighed_counts[cat_idx] + sum_of_squared_diffs[cat_idx] /
+weighed_counts[cat_idx])
)
if isnan(lambda_):
# A nan can happen when:
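Summarizing the weighted auto-smooth path, with per-sample weights w_i the quantities above combine as follows for a category c. The final blending line falls outside the shown hunk, so it is inferred from the existing unweighted implementation:

```latex
% sigma_y^2 is the weighted y_variance computed on the Python side
n_c = \sum_{i \in c} w_i, \qquad
\mu_c = \frac{\sum_{i \in c} w_i\, y_i}{n_c}, \qquad
s_c = \sum_{i \in c} w_i\, (y_i - \mu_c)^2

\lambda_c = \frac{\sigma_y^2\, n_c}{\sigma_y^2\, n_c + s_c / n_c}, \qquad
\mathrm{encoding}_c = \lambda_c\, \mu_c + (1 - \lambda_c)\, \bar{y}
```

With unit weights, n_c is the plain category count and these formulas reduce exactly to the current behaviour.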