From b495815077068e6213a03b8b2e9f9639196c7fe5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 15 Sep 2020 20:54:36 -0400 Subject: [PATCH 1/4] support np.nan --- sklearn/preprocessing/_encoders.py | 30 +++++++++++----- sklearn/preprocessing/tests/test_encoders.py | 37 ++++++++++++++++---- 2 files changed, 53 insertions(+), 14 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index a1f762110032f..5e7f53392375d 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -631,11 +631,12 @@ class OrdinalEncoder(_BaseEncoder): .. versionadded:: 0.24 - unknown_value : int, default=None + unknown_value : int or np.nan, default=None When the parameter handle_unknown is set to 'use_encoded_value', this parameter is required and will set the encoded value of unknown categories. It has to be distinct from the values used to encode any of - the categories in `fit`. + the categories in `fit`. If set to np.nan, the `dtype` parameter must + be either a float dtype or `object`. .. versionadded:: 0.24 @@ -699,13 +700,25 @@ def fit(self, X, y=None): self """ if self.handle_unknown == 'use_encoded_value': - if not isinstance(self.unknown_value, numbers.Integral): - raise TypeError(f"unknown_value should be an integer when " - f"`handle_unknown is 'use_encoded_value'`, " + try: + isnan = np.isnan(self.unknown_value) + except TypeError: + isnan = False + + if isnan: + if np.dtype(self.dtype).kind not in ('f', 'O'): + raise ValueError( + f"When unknown_value is np.nan, dtype should be " + f"either float or object. Got {self.dtype}." 
+ ) + elif not isinstance(self.unknown_value, numbers.Integral): + raise TypeError(f"unknown_value should be an integer or " + f"np.nan when " + f"handle_unknown is 'use_encoded_value', " f"got {self.unknown_value}.") elif self.unknown_value is not None: raise TypeError(f"unknown_value should only be set when " - f"`handle_unknown is 'use_encoded_value'`, " + f"handle_unknown is 'use_encoded_value', " f"got {self.unknown_value}.") self._fit(X) @@ -735,11 +748,12 @@ def transform(self, X): Transformed input. """ X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) + X_trans = X_int.astype(self.dtype, copy=False) # create separate category for unknown values if self.handle_unknown == 'use_encoded_value': - X_int[~X_mask] = self.unknown_value - return X_int.astype(self.dtype, copy=False) + X_trans[~X_mask] = self.unknown_value + return X_trans def inverse_transform(self, X): """ diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index f030406e070fa..59b1d7a590b69 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -589,21 +589,21 @@ def test_ordinal_encoder_handle_unknowns_raise(): X = np.array([['a', 'x'], ['b', 'y']], dtype=object) enc = OrdinalEncoder(handle_unknown='use_encoded_value') - msg = ("unknown_value should be an integer when `handle_unknown is " - "'use_encoded_value'`, got None.") + msg = ("unknown_value should be an integer or np.nan when handle_unknown " + "is 'use_encoded_value', got None.") with pytest.raises(TypeError, match=msg): enc.fit(X) enc = OrdinalEncoder(unknown_value=-2) - msg = ("unknown_value should only be set when `handle_unknown is " - "'use_encoded_value'`, got -2.") + msg = ("unknown_value should only be set when handle_unknown is " + "'use_encoded_value', got -2.") with pytest.raises(TypeError, match=msg): enc.fit(X) enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value='bla') - msg = 
("unknown_value should be an integer when `handle_unknown is " - "'use_encoded_value'`, got bla.") + msg = ("unknown_value should be an integer or np.nan when handle_unknown " + "is 'use_encoded_value', got bla.") with pytest.raises(TypeError, match=msg): enc.fit(X) @@ -614,6 +614,31 @@ def test_ordinal_encoder_handle_unknowns_raise(): enc.fit(X) +@pytest.mark.parametrize('dtype', (float, object)) +def test_ordinal_encoder_handle_unknowns_nan(dtype): + enc = OrdinalEncoder(handle_unknown='use_encoded_value', + unknown_value=np.nan, dtype=dtype) + + X_fit = np.array([[1], [2], [3]]) + enc.fit(X_fit) + X_trans = enc.transform([[1], [2], [4]]) + assert X_trans.dtype == dtype + # Convert to float because assert_array_equal fails when comparing object + # dtypes arrays that contain nans + X_trans = X_trans.astype(float) + assert_array_equal(X_trans, [[0], [1], [np.nan]]) + + +def test_ordinal_encoder_handle_unknowns_nan_int(): + enc = OrdinalEncoder(handle_unknown='use_encoded_value', + unknown_value=np.nan, dtype=int) + + X_fit = np.array([[1], [2], [3]]) + with pytest.raises(ValueError, + match="dtype should be either float or object"): + enc.fit(X_fit) + + def test_ordinal_encoder_raise_categories_shape(): X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T From 4fc2758b7171ef5421c69aabe728536af94892c3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 15 Sep 2020 22:21:48 -0400 Subject: [PATCH 2/4] use is_scalar_nan --- sklearn/preprocessing/_encoders.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 5e7f53392375d..cfcac3b986837 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -7,7 +7,7 @@ import numbers from ..base import BaseEstimator, TransformerMixin -from ..utils import check_array +from ..utils import check_array, is_scalar_nan from ..utils.validation import check_is_fitted from 
..utils.validation import _deprecate_positional_args @@ -700,12 +700,7 @@ def fit(self, X, y=None): self """ if self.handle_unknown == 'use_encoded_value': - try: - isnan = np.isnan(self.unknown_value) - except TypeError: - isnan = False - - if isnan: + if is_scalar_nan(self.unknown_value): if np.dtype(self.dtype).kind not in ('f', 'O'): raise ValueError( f"When unknown_value is np.nan, dtype should be " From 28bf0db4a866ba0bbdd6ee22cbe196f01c045865 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 17 Sep 2020 14:31:08 -0400 Subject: [PATCH 3/4] disallow object dtype --- sklearn/preprocessing/_encoders.py | 9 +++++---- sklearn/preprocessing/tests/test_encoders.py | 17 ++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index cfcac3b986837..c19ccfbcc88b5 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -636,7 +636,7 @@ class OrdinalEncoder(_BaseEncoder): parameter is required and will set the encoded value of unknown categories. It has to be distinct from the values used to encode any of the categories in `fit`. If set to np.nan, the `dtype` parameter must - be either a float dtype or `object`. + be a float dtype. .. versionadded:: 0.24 @@ -701,10 +701,11 @@ def fit(self, X, y=None): """ if self.handle_unknown == 'use_encoded_value': if is_scalar_nan(self.unknown_value): - if np.dtype(self.dtype).kind not in ('f', 'O'): + if np.dtype(self.dtype).kind != 'f': raise ValueError( - f"When unknown_value is np.nan, dtype should be " - f"either float or object. Got {self.dtype}." + f"When unknown_value is np.nan, the dtype " + "parameter should be " + f"a float dtype. Got {self.dtype}." 
) elif not isinstance(self.unknown_value, numbers.Integral): raise TypeError(f"unknown_value should be an integer or " diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 59b1d7a590b69..239d388ebd9d1 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -614,28 +614,27 @@ def test_ordinal_encoder_handle_unknowns_raise(): enc.fit(X) -@pytest.mark.parametrize('dtype', (float, object)) -def test_ordinal_encoder_handle_unknowns_nan(dtype): +def test_ordinal_encoder_handle_unknowns_nan(): + # Make sure unknown_value=np.nan properly works + enc = OrdinalEncoder(handle_unknown='use_encoded_value', - unknown_value=np.nan, dtype=dtype) + unknown_value=np.nan) X_fit = np.array([[1], [2], [3]]) enc.fit(X_fit) X_trans = enc.transform([[1], [2], [4]]) - assert X_trans.dtype == dtype - # Convert to float because assert_array_equal fails when comparing object - # dtypes arrays that contain nans - X_trans = X_trans.astype(float) assert_array_equal(X_trans, [[0], [1], [np.nan]]) -def test_ordinal_encoder_handle_unknowns_nan_int(): +def test_ordinal_encoder_handle_unknowns_nan_non_float_dtype(): + # Make sure an error is raised when unknown_value=np.nan and the dtype + # isn't a float dtype enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan, dtype=int) X_fit = np.array([[1], [2], [3]]) with pytest.raises(ValueError, - match="dtype should be either float or object"): + match="dtype parameter should be a float dtype"): enc.fit(X_fit) From acea21caea89f088c073f8a56d46d7b46f26f873 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 22 Sep 2020 11:02:29 -0400 Subject: [PATCH 4/4] whatsnew --- doc/whats_new/v0.24.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 98ccc5d143bcb..6127e0a8cba9c 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -537,7 
+537,8 @@ Changelog ``use_encoded_value`` option, along with a new ``unknown_value`` parameter, to :class:`preprocessing.OrdinalEncoder` to allow unknown categories during transform and set the encoded value of the unknown categories. - :pr:`17406` by :user:`Felix Wick <FelixWick>`. + :pr:`17406` by :user:`Felix Wick <FelixWick>` and :pr:`18406` by + `Nicolas Hug`_. - |Feature| Add ``clip`` parameter to :class:`preprocessing.MinMaxScaler`, which clips the transformed values of test data to ``feature_range``.