Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/whats_new/v0.24.rst
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,8 @@ Changelog
``use_encoded_value`` option, along with a new ``unknown_value`` parameter,
to :class:`preprocessing.OrdinalEncoder` to allow unknown categories during
transform and set the encoded value of the unknown categories.
:pr:`17406` by :user:`Felix Wick <FelixWick>`.
:pr:`17406` by :user:`Felix Wick <FelixWick>` and :pr:`18406` by
`Nicolas Hug`_.

- |Feature| Add ``clip`` parameter to :class:`preprocessing.MinMaxScaler`,
which clips the transformed values of test data to ``feature_range``.
Expand Down
28 changes: 19 additions & 9 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import numbers

from ..base import BaseEstimator, TransformerMixin
from ..utils import check_array
from ..utils import check_array, is_scalar_nan
from ..utils.validation import check_is_fitted
from ..utils.validation import _deprecate_positional_args

Expand Down Expand Up @@ -631,11 +631,12 @@ class OrdinalEncoder(_BaseEncoder):

.. versionadded:: 0.24

unknown_value : int, default=None
unknown_value : int or np.nan, default=None
When the parameter handle_unknown is set to 'use_encoded_value', this
parameter is required and will set the encoded value of unknown
categories. It has to be distinct from the values used to encode any of
the categories in `fit`.
the categories in `fit`. If set to np.nan, the `dtype` parameter must
be a float dtype.

.. versionadded:: 0.24

Expand Down Expand Up @@ -699,13 +700,21 @@ def fit(self, X, y=None):
self
"""
if self.handle_unknown == 'use_encoded_value':
if not isinstance(self.unknown_value, numbers.Integral):
raise TypeError(f"unknown_value should be an integer when "
f"`handle_unknown is 'use_encoded_value'`, "
if is_scalar_nan(self.unknown_value):
if np.dtype(self.dtype).kind != 'f':
raise ValueError(
f"When unknown_value is np.nan, the dtype "
"parameter should be "
f"a float dtype. Got {self.dtype}."
)
elif not isinstance(self.unknown_value, numbers.Integral):
raise TypeError(f"unknown_value should be an integer or "
f"np.nan when "
f"handle_unknown is 'use_encoded_value', "
f"got {self.unknown_value}.")
elif self.unknown_value is not None:
raise TypeError(f"unknown_value should only be set when "
f"`handle_unknown is 'use_encoded_value'`, "
f"handle_unknown is 'use_encoded_value', "
f"got {self.unknown_value}.")

self._fit(X)
Expand Down Expand Up @@ -735,11 +744,12 @@ def transform(self, X):
Transformed input.
"""
X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
X_trans = X_int.astype(self.dtype, copy=False)

# create separate category for unknown values
if self.handle_unknown == 'use_encoded_value':
X_int[~X_mask] = self.unknown_value
return X_int.astype(self.dtype, copy=False)
X_trans[~X_mask] = self.unknown_value
return X_trans

def inverse_transform(self, X):
"""
Expand Down
36 changes: 30 additions & 6 deletions sklearn/preprocessing/tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,21 +589,21 @@ def test_ordinal_encoder_handle_unknowns_raise():
X = np.array([['a', 'x'], ['b', 'y']], dtype=object)

enc = OrdinalEncoder(handle_unknown='use_encoded_value')
msg = ("unknown_value should be an integer when `handle_unknown is "
"'use_encoded_value'`, got None.")
msg = ("unknown_value should be an integer or np.nan when handle_unknown "
"is 'use_encoded_value', got None.")
with pytest.raises(TypeError, match=msg):
enc.fit(X)

enc = OrdinalEncoder(unknown_value=-2)
msg = ("unknown_value should only be set when `handle_unknown is "
"'use_encoded_value'`, got -2.")
msg = ("unknown_value should only be set when handle_unknown is "
"'use_encoded_value', got -2.")
with pytest.raises(TypeError, match=msg):
enc.fit(X)

enc = OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value='bla')
msg = ("unknown_value should be an integer when `handle_unknown is "
"'use_encoded_value'`, got bla.")
msg = ("unknown_value should be an integer or np.nan when handle_unknown "
"is 'use_encoded_value', got bla.")
with pytest.raises(TypeError, match=msg):
enc.fit(X)

Expand All @@ -614,6 +614,30 @@ def test_ordinal_encoder_handle_unknowns_raise():
enc.fit(X)


def test_ordinal_encoder_handle_unknowns_nan():
    """Check that ``unknown_value=np.nan`` is accepted and used in transform.

    The encoder is fitted on the categories 1, 2 and 3 and then asked to
    transform a sample containing 4, which was not present during ``fit``.
    """
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
                             unknown_value=np.nan)
    encoder.fit(np.array([[1], [2], [3]]))

    encoded = encoder.transform([[1], [2], [4]])

    # The unseen category (4) is expected to come back as NaN; the
    # comparison below relies on assert_array_equal treating NaNs that sit
    # at the same position as equal.
    expected = [[0], [1], [np.nan]]
    assert_array_equal(encoded, expected)


def test_ordinal_encoder_handle_unknowns_nan_non_float_dtype():
    """Check that ``fit`` raises for ``unknown_value=np.nan`` with int dtype.

    The encoder is built with ``dtype=int``; calling ``fit`` is expected to
    raise a ``ValueError`` whose message mentions the float dtype
    requirement.
    """
    training_data = np.array([[1], [2], [3]])
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
                             unknown_value=np.nan, dtype=int)

    expected_msg = "dtype parameter should be a float dtype"
    with pytest.raises(ValueError, match=expected_msg):
        encoder.fit(training_data)


def test_ordinal_encoder_raise_categories_shape():

X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T
Expand Down