From b495815077068e6213a03b8b2e9f9639196c7fe5 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Tue, 15 Sep 2020 20:54:36 -0400
Subject: [PATCH 1/4] support np.nan

---
 sklearn/preprocessing/_encoders.py           | 30 +++++++++++-----
 sklearn/preprocessing/tests/test_encoders.py | 37 ++++++++++++++++----
 2 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index a1f762110032f..5e7f53392375d 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -631,11 +631,12 @@ class OrdinalEncoder(_BaseEncoder):
 
         .. versionadded:: 0.24
 
-    unknown_value : int, default=None
+    unknown_value : int or np.nan, default=None
         When the parameter handle_unknown is set to 'use_encoded_value', this
         parameter is required and will set the encoded value of unknown
         categories. It has to be distinct from the values used to encode any of
-        the categories in `fit`.
+        the categories in `fit`. If set to np.nan, the `dtype` parameter must
+        be either a float dtype or `object`.
 
         .. versionadded:: 0.24
 
@@ -699,13 +700,25 @@ def fit(self, X, y=None):
         self
         """
         if self.handle_unknown == 'use_encoded_value':
-            if not isinstance(self.unknown_value, numbers.Integral):
-                raise TypeError(f"unknown_value should be an integer when "
-                                f"`handle_unknown is 'use_encoded_value'`, "
+            try:
+                isnan = np.isnan(self.unknown_value)
+            except TypeError:
+                isnan = False
+
+            if isnan:
+                if np.dtype(self.dtype).kind not in ('f', 'O'):
+                    raise ValueError(
+                        f"When unknown_value is np.nan, dtype should be "
+                        f"either float or object. Got {self.dtype}."
+                    )
+            elif not isinstance(self.unknown_value, numbers.Integral):
+                raise TypeError(f"unknown_value should be an integer or "
+                                f"np.nan when "
+                                f"handle_unknown is 'use_encoded_value', "
                                 f"got {self.unknown_value}.")
         elif self.unknown_value is not None:
             raise TypeError(f"unknown_value should only be set when "
-                            f"`handle_unknown is 'use_encoded_value'`, "
+                            f"handle_unknown is 'use_encoded_value', "
                             f"got {self.unknown_value}.")
 
         self._fit(X)
@@ -735,11 +748,12 @@ def transform(self, X):
             Transformed input.
         """
         X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
+        X_trans = X_int.astype(self.dtype, copy=False)
 
         # create separate category for unknown values
         if self.handle_unknown == 'use_encoded_value':
-            X_int[~X_mask] = self.unknown_value
-        return X_int.astype(self.dtype, copy=False)
+            X_trans[~X_mask] = self.unknown_value
+        return X_trans
 
     def inverse_transform(self, X):
         """
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index f030406e070fa..59b1d7a590b69 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -589,21 +589,21 @@ def test_ordinal_encoder_handle_unknowns_raise():
     X = np.array([['a', 'x'], ['b', 'y']], dtype=object)
 
     enc = OrdinalEncoder(handle_unknown='use_encoded_value')
-    msg = ("unknown_value should be an integer when `handle_unknown is "
-           "'use_encoded_value'`, got None.")
+    msg = ("unknown_value should be an integer or np.nan when handle_unknown "
+           "is 'use_encoded_value', got None.")
     with pytest.raises(TypeError, match=msg):
         enc.fit(X)
 
     enc = OrdinalEncoder(unknown_value=-2)
-    msg = ("unknown_value should only be set when `handle_unknown is "
-           "'use_encoded_value'`, got -2.")
+    msg = ("unknown_value should only be set when handle_unknown is "
+           "'use_encoded_value', got -2.")
     with pytest.raises(TypeError, match=msg):
         enc.fit(X)
 
     enc = OrdinalEncoder(handle_unknown='use_encoded_value',
                          unknown_value='bla')
-    msg = ("unknown_value should be an integer when `handle_unknown is "
-           "'use_encoded_value'`, got bla.")
+    msg = ("unknown_value should be an integer or np.nan when handle_unknown "
+           "is 'use_encoded_value', got bla.")
     with pytest.raises(TypeError, match=msg):
         enc.fit(X)
 
@@ -614,6 +614,31 @@ def test_ordinal_encoder_handle_unknowns_raise():
         enc.fit(X)
 
 
+@pytest.mark.parametrize('dtype', (float, object))
+def test_ordinal_encoder_handle_unknowns_nan(dtype):
+    enc = OrdinalEncoder(handle_unknown='use_encoded_value',
+                         unknown_value=np.nan, dtype=dtype)
+
+    X_fit = np.array([[1], [2], [3]])
+    enc.fit(X_fit)
+    X_trans = enc.transform([[1], [2], [4]])
+    assert X_trans.dtype == dtype
+    # Convert to float because assert_array_equal fails when comparing object
+    # dtypes arrays that contain nans
+    X_trans = X_trans.astype(float)
+    assert_array_equal(X_trans, [[0], [1], [np.nan]])
+
+
+def test_ordinal_encoder_handle_unknowns_nan_int():
+    enc = OrdinalEncoder(handle_unknown='use_encoded_value',
+                         unknown_value=np.nan, dtype=int)
+
+    X_fit = np.array([[1], [2], [3]])
+    with pytest.raises(ValueError,
+                       match="dtype should be either float or object"):
+        enc.fit(X_fit)
+
+
 def test_ordinal_encoder_raise_categories_shape():
 
     X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T

From 4fc2758b7171ef5421c69aabe728536af94892c3 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Tue, 15 Sep 2020 22:21:48 -0400
Subject: [PATCH 2/4] use is_scalar_nan

---
 sklearn/preprocessing/_encoders.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 5e7f53392375d..cfcac3b986837 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -7,7 +7,7 @@
 import numbers
 
 from ..base import BaseEstimator, TransformerMixin
-from ..utils import check_array
+from ..utils import check_array, is_scalar_nan
 from ..utils.validation import check_is_fitted
 from ..utils.validation import _deprecate_positional_args
 
@@ -700,12 +700,7 @@ def fit(self, X, y=None):
         self
         """
         if self.handle_unknown == 'use_encoded_value':
-            try:
-                isnan = np.isnan(self.unknown_value)
-            except TypeError:
-                isnan = False
-
-            if isnan:
+            if is_scalar_nan(self.unknown_value):
                 if np.dtype(self.dtype).kind not in ('f', 'O'):
                     raise ValueError(
                         f"When unknown_value is np.nan, dtype should be "

From 28bf0db4a866ba0bbdd6ee22cbe196f01c045865 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Thu, 17 Sep 2020 14:31:08 -0400
Subject: [PATCH 3/4] disallow object dtype

---
 sklearn/preprocessing/_encoders.py           |  9 +++++----
 sklearn/preprocessing/tests/test_encoders.py | 17 ++++++++---------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index cfcac3b986837..c19ccfbcc88b5 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -636,7 +636,7 @@ class OrdinalEncoder(_BaseEncoder):
         parameter is required and will set the encoded value of unknown
         categories. It has to be distinct from the values used to encode any of
         the categories in `fit`. If set to np.nan, the `dtype` parameter must
-        be either a float dtype or `object`.
+        be a float dtype.
 
         .. versionadded:: 0.24
 
@@ -701,10 +701,11 @@ def fit(self, X, y=None):
         """
         if self.handle_unknown == 'use_encoded_value':
             if is_scalar_nan(self.unknown_value):
-                if np.dtype(self.dtype).kind not in ('f', 'O'):
+                if np.dtype(self.dtype).kind != 'f':
                     raise ValueError(
-                        f"When unknown_value is np.nan, dtype should be "
-                        f"either float or object. Got {self.dtype}."
+                        f"When unknown_value is np.nan, the dtype "
+                        "parameter should be "
+                        f"a float dtype. Got {self.dtype}."
                     )
             elif not isinstance(self.unknown_value, numbers.Integral):
                 raise TypeError(f"unknown_value should be an integer or "
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 59b1d7a590b69..239d388ebd9d1 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -614,28 +614,27 @@ def test_ordinal_encoder_handle_unknowns_raise():
         enc.fit(X)
 
 
-@pytest.mark.parametrize('dtype', (float, object))
-def test_ordinal_encoder_handle_unknowns_nan(dtype):
+def test_ordinal_encoder_handle_unknowns_nan():
+    # Make sure unknown_value=np.nan works properly
+
     enc = OrdinalEncoder(handle_unknown='use_encoded_value',
-                         unknown_value=np.nan, dtype=dtype)
+                         unknown_value=np.nan)
 
     X_fit = np.array([[1], [2], [3]])
     enc.fit(X_fit)
     X_trans = enc.transform([[1], [2], [4]])
-    assert X_trans.dtype == dtype
-    # Convert to float because assert_array_equal fails when comparing object
-    # dtypes arrays that contain nans
-    X_trans = X_trans.astype(float)
     assert_array_equal(X_trans, [[0], [1], [np.nan]])
 
 
-def test_ordinal_encoder_handle_unknowns_nan_int():
+def test_ordinal_encoder_handle_unknowns_nan_non_float_dtype():
+    # Make sure an error is raised when unknown_value=np.nan and the dtype
+    # isn't a float dtype
     enc = OrdinalEncoder(handle_unknown='use_encoded_value',
                          unknown_value=np.nan, dtype=int)
 
     X_fit = np.array([[1], [2], [3]])
     with pytest.raises(ValueError,
-                       match="dtype should be either float or object"):
+                       match="dtype parameter should be a float dtype"):
         enc.fit(X_fit)
 
 

From acea21caea89f088c073f8a56d46d7b46f26f873 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Tue, 22 Sep 2020 11:02:29 -0400
Subject: [PATCH 4/4] whatsnew

---
 doc/whats_new/v0.24.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 98ccc5d143bcb..6127e0a8cba9c 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -537,7 +537,8 @@ Changelog
   ``use_encoded_value`` option, along with a new ``unknown_value`` parameter,
   to :class:`preprocessing.OrdinalEncoder` to allow unknown categories during
   transform and set the encoded value of the unknown categories.
-  :pr:`17406` by :user:`Felix Wick <FelixWick>`.
+  :pr:`17406` by :user:`Felix Wick <FelixWick>` and :pr:`18406` by
+  `Nicolas Hug`_.
 
 - |Feature| Add ``clip`` parameter to :class:`preprocessing.MinMaxScaler`,
   which clips the transformed values of test data to ``feature_range``.