Skip to content

ENH Add sample_weight parameter to OneHotEncoder's .fit #26330

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
5 changes: 5 additions & 0 deletions doc/whats_new/v1.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -871,6 +871,11 @@ Changelog
combines input arguments `(input_feature, category)` to a string.
:pr:`22506` by :user:`Mario Kostelac <mariokostelac>`.

- |Enhancement| Adds support for `sample_weight` in
  :class:`preprocessing.OneHotEncoder`. When `sample_weight` is provided, `min_frequency`
  and `max_categories` filter categories according to the sum of the sample weights for
  each category instead of the sample count. :pr:`26330` by :user:`Carlo Lemos <vitaliset>`.

- |Enhancement| Added support for `sample_weight` in
:class:`preprocessing.KBinsDiscretizer`. This allows specifying the parameter
`sample_weight` for each sample to be used while fitting. The option is only
Expand Down
49 changes: 38 additions & 11 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@
from ..utils._missing import is_scalar_nan
from ..utils._param_validation import Interval, RealNotInt, StrOptions
from ..utils._set_output import _get_output_config
from ..utils.validation import _check_feature_names_in, check_is_fitted
from ..utils.validation import (
_check_feature_names_in,
_check_sample_weight,
check_is_fitted,
)

__all__ = ["OneHotEncoder", "OrdinalEncoder"]

Expand Down Expand Up @@ -68,6 +72,7 @@ def _check_X(self, X, force_all_finite=True):
def _fit(
self,
X,
sample_weight=None,
handle_unknown="error",
force_all_finite=True,
return_counts=False,
Expand All @@ -80,6 +85,13 @@ def _fit(
X, force_all_finite=force_all_finite
)
self.n_features_in_ = n_features
if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X)
# Filtering rows with sample_weight equals zero so we don't get extra dummy
# columns.
X_list = [Xi[sample_weight != 0] for Xi in X_list]
sample_weight = sample_weight[sample_weight != 0]
n_samples = np.sum(sample_weight)

if self.categories != "auto":
if len(self.categories) != n_features:
Expand All @@ -96,7 +108,9 @@ def _fit(
Xi = X_list[i]

if self.categories == "auto":
result = _unique(Xi, return_counts=compute_counts)
result = _unique(
Xi, sample_weight=sample_weight, return_counts=compute_counts
)
if compute_counts:
cats, counts = result
category_counts.append(counts)
Expand Down Expand Up @@ -159,7 +173,7 @@ def _fit(
)
raise ValueError(msg)
if compute_counts:
category_counts.append(_get_counts(Xi, cats))
category_counts.append(_get_counts(Xi, cats, sample_weight))

self.categories_.append(cats)

Expand Down Expand Up @@ -277,11 +291,13 @@ def _identify_infrequent(self, category_count, n_samples, col_idx):

Parameters
----------
category_count : ndarray of shape (n_cardinality,)
Category counts.
category_count : array-like of shape (n_cardinality,)
Category counts or sum of `sample_weight` for the samples from the
category when `sample_weight` is different from `None`.

n_samples : int
Number of samples.
Number of samples in training set or total sum of `sample_weight`
for all samples when `sample_weight` is different from `None`.

col_idx : int
Index of the current category. Only used for the error message.
Expand Down Expand Up @@ -344,7 +360,8 @@ def _fit_infrequent_category_mapping(
Parameters
----------
n_samples : int
Number of samples in training set.
Number of samples in training set or total sum of `sample_weight`
for all samples when `sample_weight` is different from `None`.
category_counts: list of ndarray
`category_counts[i]` is the category counts corresponding to
`self.categories_[i]`.
Expand Down Expand Up @@ -565,13 +582,15 @@ class OneHotEncoder(_BaseEncoder):

min_frequency : int or float, default=None
Specifies the minimum frequency below which a category will be
considered infrequent.
considered infrequent. If during fit `sample_weight` is different from
default, then count will be done with sum of samples' weight.

- If `int`, categories with a smaller cardinality will be considered
infrequent.

- If `float`, categories with a smaller cardinality than
`min_frequency * n_samples` will be considered infrequent.
`min_frequency * n_samples` will be considered infrequent. If
`sample_weight` is different from `None`, `n_samples = sum(sample_weight)`.

.. versionadded:: 1.1
Read more in the :ref:`User Guide <encoder_infrequent_categories>`.
Expand Down Expand Up @@ -955,7 +974,7 @@ def _compute_n_features_outs(self):
return output

@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
def fit(self, X, y=None, sample_weight=None):
"""
Fit OneHotEncoder to X.

Expand All @@ -968,13 +987,21 @@ def fit(self, X, y=None):
Ignored. This parameter exists only for compatibility with
:class:`~sklearn.pipeline.Pipeline`.

sample_weight : array-like of shape (n_samples,), default=None
Sample weights used to weight the categories when filtering
categories with `max_categories` and `min_frequency`. If `None`,
then samples are equally weighted. If both `max_categories` and
`min_frequency` are left at their default values, then `sample_weight`
is ignored.

Returns
-------
self
Fitted encoder.
"""
self._fit(
X,
sample_weight=sample_weight,
handle_unknown=self.handle_unknown,
force_all_finite="allow-nan",
)
Expand Down Expand Up @@ -1294,7 +1321,7 @@ class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder):
infrequent.

- If `float`, categories with a smaller cardinality than
`min_frequency * n_samples` will be considered infrequent.
`min_frequency * n_samples` will be considered infrequent.

.. versionadded:: 1.3
Read more in the :ref:`User Guide <encoder_infrequent_categories>`.
Expand Down
92 changes: 92 additions & 0 deletions sklearn/preprocessing/tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -2323,6 +2323,98 @@ def test_ordinal_encoder_missing_appears_infrequent():
assert_allclose(X_trans, [[2, 1], [2, 0], [np.nan, 0], [1, 0], [0, 1]])


@pytest.mark.parametrize(
    "X, sample_weight, expected_shape",
    [
        (
            [
                ["car", 3],
                ["bike", 3],
                ["car", 1],
                ["bike", 3],
                ["boat", 2],
                ["airplane", 4],
            ],
            np.array([2, 2.5, 0.5, 0.1, 0, 0]),
            (6, 4),  # columns: car, bike, 3, infrequent (1)
        ),
        (
            [["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]],
            np.array([5, 5, 0.1, 0.3, 4, 0.9]),
            (6, 3),
        ),
    ],
)
def test_one_hot_encoder_sample_weight_min_frequency(X, sample_weight, expected_shape):
    """Check that `min_frequency` filters categories by summed sample weight.

    The expected shapes are computed for ``min_frequency=2``: zero-weight rows
    must not contribute categories, and categories whose total weight falls
    below the threshold are grouped into a single infrequent column.
    """
    # NOTE: a previous version parametrized `min_frequency` over [0.3, 0.9, 2]
    # but never used the value — the expected shapes only hold for
    # `min_frequency=2`, so the redundant parametrization is removed.
    ohe = OneHotEncoder(min_frequency=2, handle_unknown="infrequent_if_exist")
    X_trans = ohe.fit_transform(X, sample_weight=sample_weight)
    # Shapes are exact integers; compare directly instead of via allclose.
    assert X_trans.shape == expected_shape


@pytest.mark.parametrize(
    "X, sample_weight, expected_shape",
    [
        (
            [
                ["car", 3],
                ["bike", 3],
                ["car", 1],
                ["bike", 3],
                ["boat", 2],
                ["airplane", 4],
            ],
            np.array([2, 2.5, 0.5, 0.1, 0, 0]),
            (6, 4),
        ),
        (
            [["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]],
            np.array([5, 5, 0.1, 0.3, 4, 0.9]),
            (6, 2),
        ),
    ],
)
def test_one_hot_encoder_sample_weight_max_categories(X, sample_weight, expected_shape):
    """Check that `max_categories` keeps the most frequent categories ranked
    by summed sample weight (zero-weight rows contribute no categories)."""
    ohe = OneHotEncoder(max_categories=2, handle_unknown="ignore")
    X_trans = ohe.fit_transform(X, sample_weight=sample_weight)
    # Shapes are exact integers; compare directly instead of via allclose.
    assert X_trans.shape == expected_shape


@pytest.mark.parametrize(
    "X",
    [[["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]]],
)
@pytest.mark.parametrize(
    "min_frequency",
    [0.1, 0.3, 0.5, 0.9],
)
def test_one_hot_encoder_sample_weight_constant(X, min_frequency):
    """Uniform sample weights must yield the same encoding as no weights at all,
    regardless of the constant's value."""
    ohe = OneHotEncoder(min_frequency=min_frequency)
    baseline = ohe.fit_transform(X, sample_weight=None).toarray()

    for constant_weights in (np.ones(len(X)), 5 * np.ones(len(X))):
        encoded = ohe.fit_transform(X, sample_weight=constant_weights).toarray()
        assert_array_equal(baseline, encoded)


@pytest.mark.parametrize(
    "X",
    [[["car"], ["car"], ["bike"], ["bike"], ["boat"], ["airplane"]]],
)
def test_one_hot_encoder_sample_weight_is_ignored(X):
    """With default `max_categories`/`min_frequency`, passing unit sample
    weights must not change the encoding."""
    encoder = OneHotEncoder()
    unweighted = encoder.fit_transform(X).toarray()
    weighted = encoder.fit_transform(X, sample_weight=np.ones(len(X))).toarray()

    assert_array_equal(unweighted, weighted)


@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder])
def test_encoder_not_fitted(Encoder):
"""Check that we raise a `NotFittedError` by calling transform before fit with
Expand Down
Loading
Loading