FEA TargetEncoder should respect sample_weights #29110

Status: Closed · wants to merge 7 commits
7 changes: 7 additions & 0 deletions doc/whats_new/v1.6.rst
@@ -88,6 +88,13 @@ Changelog
whether to raise an exception if a subset of the scorers in multimetric scoring fails
or to return an error code. :pr:`28992` by :user:`Stefanie Senger <StefanieSenger>`.

+:mod:`sklearn.preprocessing`
+............................

+- |Feature| :class:`preprocessing.TargetEncoder` now supports the `sample_weight`
+  parameter in the `fit` and `fit_transform` methods. :pr:`29110`
+  by :user:`Duarte São José <DuarteSJ>` and :user:`Miguel Parece <MiguelParece>`.

Thanks to everyone who has contributed to the maintenance and improvement of
the project since version 1.5, including:

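For context, here is a minimal usage sketch of the feature this PR adds (the data and weights are invented for illustration; the `sample_weight` argument to `fit`/`fit_transform` is the new API):

```python
import numpy as np
from sklearn.preprocessing import TargetEncoder

X = np.array([["dog"], ["dog"], ["cat"], ["cat"], ["cat"]], dtype=object)
y = np.array([10.0, 20.0, 30.0, 40.0, 50.0])
# Up-weight the "cat" rows: the per-category encodings and the global
# prior target_mean_ both shift toward the heavily weighted samples.
sample_weight = np.array([0.5, 0.5, 2.0, 2.0, 2.0])

enc = TargetEncoder(smooth=5.0, random_state=0)
enc.fit(X, y, sample_weight=sample_weight)
print(enc.target_mean_)  # weighted mean of y (~36.4), not the plain mean (30.0)
print(enc.transform(X))
```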
48 changes: 36 additions & 12 deletions sklearn/preprocessing/_target_encoder.py
@@ -7,6 +7,7 @@
from ..utils.multiclass import type_of_target
from ..utils.validation import (
_check_feature_names_in,
+_check_sample_weight,
_check_y,
check_consistent_length,
check_is_fitted,
@@ -209,7 +210,7 @@ def __init__(
self.random_state = random_state

@_fit_context(prefer_skip_nested_validation=True)
-def fit(self, X, y):
+def fit(self, X, y, sample_weight=None):
"""Fit the :class:`TargetEncoder` to X and y.

Parameters
@@ -220,16 +221,19 @@ def fit(self, X, y):
y : array-like of shape (n_samples,)
The target data used to encode the categories.

+sample_weight : array-like of shape (n_samples,), default=None
+Individual weights for each sample. If `None`, all samples are weighted equally.

Returns
-------
self : object
Fitted encoder.
"""
-self._fit_encodings_all(X, y)
+self._fit_encodings_all(X, y, sample_weight)
return self

@_fit_context(prefer_skip_nested_validation=True)
-def fit_transform(self, X, y):
+def fit_transform(self, X, y, sample_weight=None):
"""Fit :class:`TargetEncoder` and transform X with the target encoding.

.. note::
@@ -245,6 +249,9 @@ def fit_transform(self, X, y):
y : array-like of shape (n_samples,)
The target data used to encode the categories.

+sample_weight : array-like of shape (n_samples,), default=None
+Individual weights for each sample. If `None`, all samples are weighted equally.

Returns
-------
X_trans : ndarray of shape (n_samples, n_features) or \
@@ -253,11 +260,14 @@
"""
from ..model_selection import KFold, StratifiedKFold # avoid circular import

-X_ordinal, X_known_mask, y_encoded, n_categories = self._fit_encodings_all(X, y)
+X_ordinal, X_known_mask, y_encoded, n_categories = self._fit_encodings_all(
+X, y, sample_weight
+)

# The cv splitter is voluntarily restricted to *KFold to enforce non
# overlapping validation folds, otherwise the fit_transform output will
# not be well-specified.

if self.target_type_ == "continuous":
cv = KFold(self.cv, shuffle=self.shuffle, random_state=self.random_state)
else:
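(Hunk truncated here.) As the comment above explains, `fit_transform` cross-fits with non-overlapping `*KFold` folds, so each row is encoded using encodings learned on folds that exclude it, whereas `fit(...).transform(...)` applies encodings learned on all rows. A sketch of that difference on random data (invented for illustration):

```python
import numpy as np
from sklearn.preprocessing import TargetEncoder

rng = np.random.default_rng(0)
X = rng.choice(["a", "b", "c"], size=(50, 1))
y = rng.normal(size=50)
w = rng.uniform(0.5, 2.0, size=50)

enc = TargetEncoder(random_state=0)
Xt_cv = enc.fit_transform(X, y, sample_weight=w)       # cross-fitted encodings
Xt_full = enc.fit(X, y, sample_weight=w).transform(X)  # full-data encodings
print(np.allclose(Xt_cv, Xt_full))  # typically False, by design
```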
@@ -273,24 +283,26 @@
)
else:
X_out = np.empty_like(X_ordinal, dtype=np.float64)

+sample_weight = _check_sample_weight(sample_weight, X)
for train_idx, test_idx in cv.split(X, y):
X_train, y_train = X_ordinal[train_idx, :], y_encoded[train_idx]
-y_train_mean = np.mean(y_train, axis=0)
+sample_weight_train = sample_weight[train_idx]
+y_train_mean = np.average(y_train, weights=sample_weight_train, axis=0)
if self.target_type_ == "multiclass":
encodings = self._fit_encoding_multiclass(
X_train,
y_train,
n_categories,
y_train_mean,
+sample_weight_train,
)
else:
encodings = self._fit_encoding_binary_or_continuous(
X_train,
y_train,
n_categories,
y_train_mean,
+sample_weight_train,
)
self._transform_X_ordinal(
X_out,
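(Hunk truncated here.) `_check_sample_weight` is scikit-learn's standard validation helper: it turns `None` into an all-ones weight vector of length `n_samples` and validates shape and dtype, which is why the weighted code paths never special-case `None`. A small illustration (a private helper, so its exact behavior is subject to change):

```python
import numpy as np
from sklearn.utils.validation import _check_sample_weight

X = np.zeros((4, 2))
print(_check_sample_weight(None, X))          # [1. 1. 1. 1.]
print(_check_sample_weight([1, 2, 3, 4], X))  # [1. 2. 3. 4.] as float64
```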
@@ -344,7 +356,7 @@ def transform(self, X):
)
return X_out

-def _fit_encodings_all(self, X, y):
+def _fit_encodings_all(self, X, y, sample_weight=None):
"""Fit a target encoding with all the data."""
# avoid circular import
from ..preprocessing import (
@@ -353,6 +365,7 @@
)

check_consistent_length(X, y)
+sample_weight = _check_sample_weight(sample_weight, X)
self._fit(X, handle_unknown="ignore", force_all_finite="allow-nan")

if self.target_type == "auto":
@@ -380,7 +393,7 @@
else: # continuous
y = _check_y(y, y_numeric=True, estimator=self)

-self.target_mean_ = np.mean(y, axis=0)
+self.target_mean_ = np.average(y, weights=sample_weight, axis=0)

X_ordinal, X_known_mask = self._transform(
X, handle_unknown="ignore", force_all_finite="allow-nan"
@@ -396,42 +409,52 @@
y,
n_categories,
self.target_mean_,
+sample_weight,
)
else:
encodings = self._fit_encoding_binary_or_continuous(
X_ordinal,
y,
n_categories,
self.target_mean_,
+sample_weight,
)
self.encodings_ = encodings

return X_ordinal, X_known_mask, y, n_categories

def _fit_encoding_binary_or_continuous(
-self, X_ordinal, y, n_categories, target_mean
+self, X_ordinal, y, n_categories, target_mean, sample_weight
):
"""Learn target encodings."""
if self.smooth == "auto":
-y_variance = np.var(y)
+y_variance = np.sum(sample_weight * (y - target_mean) ** 2) / (
+np.sum(sample_weight)
+)

encodings = _fit_encoding_fast_auto_smooth(
X_ordinal,
y,
+sample_weight,
n_categories,
target_mean,
y_variance,
)
else:

encodings = _fit_encoding_fast(
X_ordinal,
y,
+sample_weight,
n_categories,
self.smooth,
target_mean,
)
return encodings
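With `smooth="auto"`, the unweighted `np.var(y)` above is replaced by a weighted variance around the weighted `target_mean`. A quick sanity check that the new expression reduces to the old one for unit weights (illustrative only):

```python
import numpy as np

y = np.array([1.0, 2.0, 3.0, 4.0])
sample_weight = np.ones_like(y)
target_mean = np.average(y, weights=sample_weight)
y_variance = np.sum(sample_weight * (y - target_mean) ** 2) / np.sum(sample_weight)
assert np.isclose(y_variance, np.var(y))  # unit weights recover np.var(y)
```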

-def _fit_encoding_multiclass(self, X_ordinal, y, n_categories, target_mean):
+def _fit_encoding_multiclass(
+self, X_ordinal, y, n_categories, target_mean, sample_weight
+):
"""Learn multiclass encodings.

Learn encodings for each class (c) then reorder encodings such that
@@ -452,6 +475,7 @@ def _fit_encoding_multiclass(self, X_ordinal, y, n_categories, target_mean):
y_class,
n_categories,
target_mean[i],
+sample_weight,
)
encodings.extend(encoding)

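In the multiclass path, each class column is encoded as an independent binary target while the same `sample_weight` vector is reused for every class, so per class the prior is a weighted class frequency. A rough numpy sketch of that idea (names invented; not the actual helper):

```python
import numpy as np
from sklearn.preprocessing import LabelBinarizer

y = np.array(["a", "b", "a", "c"])
sample_weight = np.array([1.0, 2.0, 1.0, 0.5])
Y_bin = LabelBinarizer().fit_transform(y)  # shape (n_samples, n_classes)
# Weighted prior per class, playing the role of target_mean[i] above.
print(np.average(Y_bin, weights=sample_weight, axis=0))
```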
31 changes: 19 additions & 12 deletions sklearn/preprocessing/_target_encoder_fast.pyx
@@ -15,11 +15,17 @@ ctypedef fused Y_DTYPE:
int32_t
float64_t
float32_t
+ctypedef fused W_DTYPE:
+int64_t
+int32_t
+float64_t
+float32_t


def _fit_encoding_fast(
INT_DTYPE[:, ::1] X_int,
const Y_DTYPE[:] y,
+const W_DTYPE[:] sample_weight,
int64_t[::1] n_categories,
double smooth,
double y_mean,
@@ -65,8 +71,8 @@
# -1 are unknown categories, which are not counted
if X_int_tmp == -1:
continue
-sums[X_int_tmp] += y[sample_idx]
-counts[X_int_tmp] += 1.0
+sums[X_int_tmp] += y[sample_idx] * sample_weight[sample_idx]
+counts[X_int_tmp] += sample_weight[sample_idx]

for cat_idx in range(n_cats):
if counts[cat_idx] == 0:
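(Hunk truncated here.) With a fixed `smooth`, the loop above now accumulates weighted target sums and weighted counts, so each category's encoding becomes a weighted mean shrunk toward the global mean. A numpy sketch of the per-category formula for a single feature (unknown categories assumed already filtered out; the function name and layout are mine, not the Cython implementation's):

```python
import numpy as np

def weighted_smoothed_encoding(codes, y, w, n_cats, smooth, y_mean):
    """(weighted sum + smooth * y_mean) / (weighted count + smooth) per category."""
    sums = np.zeros(n_cats)
    counts = np.zeros(n_cats)
    np.add.at(sums, codes, y * w)  # weighted target sums per category
    np.add.at(counts, codes, w)    # weighted counts per category
    return (sums + smooth * y_mean) / (counts + smooth)

codes = np.array([0, 0, 1, 1, 1])  # ordinal-encoded categories
y = np.array([10.0, 20.0, 30.0, 40.0, 50.0])
w = np.array([0.5, 0.5, 2.0, 2.0, 2.0])
print(weighted_smoothed_encoding(codes, y, w, n_cats=2, smooth=5.0,
                                 y_mean=np.average(y, weights=w)))
```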
@@ -80,6 +86,7 @@
def _fit_encoding_fast_auto_smooth(
INT_DTYPE[:, ::1] X_int,
const Y_DTYPE[:] y,
+const W_DTYPE[:] sample_weight,
int64_t[::1] n_categories,
double y_mean,
double y_variance,
@@ -99,7 +106,7 @@
int n_features = X_int.shape[1]
int64_t max_n_cats = np.max(n_categories)
double[::1] means = np.empty(max_n_cats, dtype=np.float64)
-int64_t[::1] counts = np.empty(max_n_cats, dtype=np.int64)
+double[::1] weighted_counts = np.empty(max_n_cats, dtype=np.float64)
double[::1] sum_of_squared_diffs = np.empty(max_n_cats, dtype=np.float64)
double lambda_
list encodings = []
@@ -124,35 +131,35 @@

for cat_idx in range(n_cats):
means[cat_idx] = 0.0
-counts[cat_idx] = 0
+weighted_counts[cat_idx] = 0.0
sum_of_squared_diffs[cat_idx] = 0.0

-# first pass to compute the mean
+# first pass to compute the weighted mean
for sample_idx in range(n_samples):
X_int_tmp = X_int[sample_idx, feat_idx]

# -1 are unknown categories, which are not counted
if X_int_tmp == -1:
continue
-counts[X_int_tmp] += 1
-means[X_int_tmp] += y[sample_idx]
+weighted_counts[X_int_tmp] += sample_weight[sample_idx]
+means[X_int_tmp] += y[sample_idx] * sample_weight[sample_idx]

for cat_idx in range(n_cats):
-means[cat_idx] /= counts[cat_idx]
+means[cat_idx] /= weighted_counts[cat_idx]

# second pass to compute the sum of squared differences
for sample_idx in range(n_samples):
X_int_tmp = X_int[sample_idx, feat_idx]
if X_int_tmp == -1:
continue
diff = y[sample_idx] - means[X_int_tmp]
-sum_of_squared_diffs[X_int_tmp] += diff * diff
+sum_of_squared_diffs[X_int_tmp] += diff * diff * sample_weight[sample_idx]

for cat_idx in range(n_cats):
lambda_ = (
-y_variance * counts[cat_idx] /
-(y_variance * counts[cat_idx] + sum_of_squared_diffs[cat_idx] /
-counts[cat_idx])
+y_variance * weighted_counts[cat_idx] /
+(y_variance * weighted_counts[cat_idx] + sum_of_squared_diffs[cat_idx] /
+weighted_counts[cat_idx])
)
if isnan(lambda_):
# A nan can happen when:
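The expression above is an empirical-Bayes shrinkage factor: per category, lambda = s2 * n_c / (s2 * n_c + s_c2), where s2 is the global weighted target variance, n_c the category's weighted count, and s_c2 = sum_of_squared_diffs / n_c its weighted within-category variance; the final encoding is lambda * category_mean + (1 - lambda) * y_mean. A numpy sketch of the whole weighted auto-smooth computation (variable names invented):

```python
import numpy as np

def auto_smooth_encoding(codes, y, w, n_cats, y_mean, y_variance):
    means = np.zeros(n_cats)
    wc = np.zeros(n_cats)   # weighted counts per category
    ssd = np.zeros(n_cats)  # weighted sums of squared diffs per category
    np.add.at(wc, codes, w)
    np.add.at(means, codes, y * w)
    means /= wc  # first pass: weighted category means
    np.add.at(ssd, codes, w * (y - means[codes]) ** 2)  # second pass
    lam = y_variance * wc / (y_variance * wc + ssd / wc)
    return lam * means + (1.0 - lam) * y_mean

codes = np.array([0, 0, 1, 1, 1])
y = np.array([10.0, 20.0, 30.0, 40.0, 50.0])
w = np.array([0.5, 0.5, 2.0, 2.0, 2.0])
y_mean = np.average(y, weights=w)
y_var = np.sum(w * (y - y_mean) ** 2) / np.sum(w)
print(auto_smooth_encoding(codes, y, w, 2, y_mean, y_var))
```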