diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 98ccc5d143bcb..6127e0a8cba9c 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -537,7 +537,8 @@ Changelog ``use_encoded_value`` option, along with a new ``unknown_value`` parameter, to :class:`preprocessing.OrdinalEncoder` to allow unknown categories during transform and set the encoded value of the unknown categories. - :pr:`17406` by :user:`Felix Wick <FelixWick>`. + :pr:`17406` by :user:`Felix Wick <FelixWick>` and :pr:`18406` by + `Nicolas Hug`_. - |Feature| Add ``clip`` parameter to :class:`preprocessing.MinMaxScaler`, which clips the transformed values of test data to ``feature_range``. diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index a1f762110032f..c19ccfbcc88b5 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -7,7 +7,7 @@ import numbers from ..base import BaseEstimator, TransformerMixin -from ..utils import check_array +from ..utils import check_array, is_scalar_nan from ..utils.validation import check_is_fitted from ..utils.validation import _deprecate_positional_args @@ -631,11 +631,12 @@ class OrdinalEncoder(_BaseEncoder): .. versionadded:: 0.24 - unknown_value : int, default=None + unknown_value : int or np.nan, default=None When the parameter handle_unknown is set to 'use_encoded_value', this parameter is required and will set the encoded value of unknown categories. It has to be distinct from the values used to encode any of - the categories in `fit`. + the categories in `fit`. If set to np.nan, the `dtype` parameter must + be a float dtype. .. 
versionadded:: 0.24 @@ -699,13 +700,21 @@ def fit(self, X, y=None): self """ if self.handle_unknown == 'use_encoded_value': - if not isinstance(self.unknown_value, numbers.Integral): - raise TypeError(f"unknown_value should be an integer when " - f"`handle_unknown is 'use_encoded_value'`, " + if is_scalar_nan(self.unknown_value): + if np.dtype(self.dtype).kind != 'f': + raise ValueError( + f"When unknown_value is np.nan, the dtype " + "parameter should be " + f"a float dtype. Got {self.dtype}." + ) + elif not isinstance(self.unknown_value, numbers.Integral): + raise TypeError(f"unknown_value should be an integer or " + f"np.nan when " + f"handle_unknown is 'use_encoded_value', " f"got {self.unknown_value}.") elif self.unknown_value is not None: raise TypeError(f"unknown_value should only be set when " - f"`handle_unknown is 'use_encoded_value'`, " + f"handle_unknown is 'use_encoded_value', " f"got {self.unknown_value}.") self._fit(X) @@ -735,11 +744,12 @@ def transform(self, X): Transformed input. 
""" X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) + X_trans = X_int.astype(self.dtype, copy=False) # create separate category for unknown values if self.handle_unknown == 'use_encoded_value': - X_int[~X_mask] = self.unknown_value - return X_int.astype(self.dtype, copy=False) + X_trans[~X_mask] = self.unknown_value + return X_trans def inverse_transform(self, X): """ diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index f030406e070fa..239d388ebd9d1 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -589,21 +589,21 @@ def test_ordinal_encoder_handle_unknowns_raise(): X = np.array([['a', 'x'], ['b', 'y']], dtype=object) enc = OrdinalEncoder(handle_unknown='use_encoded_value') - msg = ("unknown_value should be an integer when `handle_unknown is " - "'use_encoded_value'`, got None.") + msg = ("unknown_value should be an integer or np.nan when handle_unknown " + "is 'use_encoded_value', got None.") with pytest.raises(TypeError, match=msg): enc.fit(X) enc = OrdinalEncoder(unknown_value=-2) - msg = ("unknown_value should only be set when `handle_unknown is " - "'use_encoded_value'`, got -2.") + msg = ("unknown_value should only be set when handle_unknown is " + "'use_encoded_value', got -2.") with pytest.raises(TypeError, match=msg): enc.fit(X) enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value='bla') - msg = ("unknown_value should be an integer when `handle_unknown is " - "'use_encoded_value'`, got bla.") + msg = ("unknown_value should be an integer or np.nan when handle_unknown " + "is 'use_encoded_value', got bla.") with pytest.raises(TypeError, match=msg): enc.fit(X) @@ -614,6 +614,30 @@ def test_ordinal_encoder_handle_unknowns_raise(): enc.fit(X) +def test_ordinal_encoder_handle_unknowns_nan(): + # Make sure unknown_value=np.nan properly works + + enc = OrdinalEncoder(handle_unknown='use_encoded_value', + 
unknown_value=np.nan) + + X_fit = np.array([[1], [2], [3]]) + enc.fit(X_fit) + X_trans = enc.transform([[1], [2], [4]]) + assert_array_equal(X_trans, [[0], [1], [np.nan]]) + + +def test_ordinal_encoder_handle_unknowns_nan_non_float_dtype(): + # Make sure an error is raised when unknown_value=np.nan and the dtype + # isn't a float dtype + enc = OrdinalEncoder(handle_unknown='use_encoded_value', + unknown_value=np.nan, dtype=int) + + X_fit = np.array([[1], [2], [3]]) + with pytest.raises(ValueError, + match="dtype parameter should be a float dtype"): + enc.fit(X_fit) + + def test_ordinal_encoder_raise_categories_shape(): X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T