From 815a25dce7359c22ba496ec74c38fa6c8e1bb094 Mon Sep 17 00:00:00 2001
From: nathanielmhld <nathanielmhld@gmail.com>
Date: Wed, 31 Jul 2019 16:46:14 -0400
Subject: [PATCH 01/10] Allowing Virtual Category instead of error for
 OrdinalEncoder

---
 sklearn/preprocessing/_encoders.py | 47 ++++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index c1d3b1e80c352..e0fa486ae3870 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -122,6 +122,17 @@ def _transform(self, X, handle_unknown='error'):
                     msg = ("Found unknown categories {0} in column {1}"
                            " during transform".format(diff, i))
                     raise ValueError(msg)
+                elif handle_unknown == 'virtual':
+                    # Set the problematic rows to None and if necessary
+                    # append the None category to the category list.
+                    # This will create a virtual category that maps
+                    # back to None.
+                    if not None in self.categories_[i]:
+                        self.categories_[i] = np.append(
+                            self.categories_[i],
+                            np.array(None)
+                        )
+                    Xi[~valid_mask] = None
                 else:
                     # Set the problematic rows to an acceptable value and
                     # continue `The rows are marked `X_mask` and will be
@@ -552,6 +563,14 @@ class OrdinalEncoder(_BaseEncoder):
     dtype : number type, default np.float64
         Desired dtype of output.
 
+    handle_unknown : 'error' or 'virtual', default='error'.
+        Whether to raise an error or to create a virtual cagetory if an unknown 
+        categorical feature is present during transform (default is to raise). 
+        When this parameter is set to 'virtual' and an unknown category is 
+        encountered during transform, an additional ordinal value will be
+        appended to the existing values, at the final ordinal positon. In the 
+        inverse transform, an unknown category will be denoted as None.
+
     Attributes
     ----------
     categories_ : list of arrays
@@ -579,6 +598,24 @@ class OrdinalEncoder(_BaseEncoder):
     array([['Male', 1],
            ['Female', 2]], dtype=object)
 
+    >>> encvirtual = OrdinalEncoder(handle_unknown='virtual')
+    >>> X = [["Red","Coffee"], ["Green","Tea"],  ["Blue","Water"]]
+    >>> encvirtual.fit(X)
+    ... # doctest: +ELLIPSIS
+    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
+        handle_unknown='virtual')
+    >>> encvirtual.transform([["Red","Coffee"], ["Green","Tea"]])
+    array([[2., 0.],
+           [1., 1.]])
+    >>> encvirtual.transform([["Purple","Coffee"], ["Green","Tea"]])
+    array([[3., 0.],
+           [1., 1.]])
+
+    >>> encvirtual.inverse_transform([[3, 0], [1, 1]])
+    array([[None, "Coffee"],
+           ["Green","Tea"]], dtype=object)
+
+
     See also
     --------
     sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
@@ -587,9 +624,10 @@ class OrdinalEncoder(_BaseEncoder):
       between 0 and n_classes-1.
     """
 
-    def __init__(self, categories='auto', dtype=np.float64):
+    def __init__(self, categories='auto', dtype=np.float64, handle_unknown="error"):
         self.categories = categories
         self.dtype = dtype
+        self.handle_unknown = handle_unknown
 
     def fit(self, X, y=None):
         """Fit the OrdinalEncoder to X.
@@ -622,7 +660,12 @@ def transform(self, X):
             Transformed input.
 
         """
-        X_int, _ = self._transform(X)
+        if self.handle_unknown not in ('error', 'virtual'):
+            msg = ("handle_unknown should be either 'error' or 'virtual', "
+                   "got {0}.".format(self.handle_unknown))
+            raise ValueError(msg)
+            
+        X_int, _ = self._transform(X, handle_unknown=self.handle_unknown)
         return X_int.astype(self.dtype, copy=False)
 
     def inverse_transform(self, X):

From cccdd8cac633ff6b46bd1af4f4edb8b1fd1627b8 Mon Sep 17 00:00:00 2001
From: Nathaniel <nathanielmhld@berkeley.edu>
Date: Thu, 1 Aug 2019 16:53:05 -0400
Subject: [PATCH 02/10] Removed line with whitespace

---
 sklearn/preprocessing/_encoders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index e0fa486ae3870..59425ccc09ec2 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -664,7 +664,7 @@ def transform(self, X):
             msg = ("handle_unknown should be either 'error' or 'virtual', "
                    "got {0}.".format(self.handle_unknown))
             raise ValueError(msg)
-            
+
         X_int, _ = self._transform(X, handle_unknown=self.handle_unknown)
         return X_int.astype(self.dtype, copy=False)
 

From 1190e2957f743ec8b139666588d52806c6173001 Mon Sep 17 00:00:00 2001
From: nathanielmhld <nathanielmhld@gmail.com>
Date: Wed, 7 Aug 2019 15:32:51 -0400
Subject: [PATCH 03/10] categories_ no longer modified in transform

---
 sklearn/preprocessing/_encoders.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index e0fa486ae3870..1fc7cee5b3d97 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -96,6 +96,8 @@ def _fit(self, X, handle_unknown='error'):
                         msg = ("Found unknown categories {0} in column {1}"
                                " during fit".format(diff, i))
                         raise ValueError(msg)
+                if handle_unknown == 'virtual':
+                    cats.append([None])
             self.categories_.append(cats)
 
     def _transform(self, X, handle_unknown='error'):
@@ -127,11 +129,6 @@ def _transform(self, X, handle_unknown='error'):
                     # append the None category to the category list.
                     # This will create a virtual category that maps
                     # back to None.
-                    if not None in self.categories_[i]:
-                        self.categories_[i] = np.append(
-                            self.categories_[i],
-                            np.array(None)
-                        )
                     Xi[~valid_mask] = None
                 else:
                     # Set the problematic rows to an acceptable value and
@@ -664,7 +661,7 @@ def transform(self, X):
             msg = ("handle_unknown should be either 'error' or 'virtual', "
                    "got {0}.".format(self.handle_unknown))
             raise ValueError(msg)
-            
+
         X_int, _ = self._transform(X, handle_unknown=self.handle_unknown)
         return X_int.astype(self.dtype, copy=False)
 

From 6d5a63de633d788c61c6bf581e5763a61f0e906a Mon Sep 17 00:00:00 2001
From: nathanielmhld <nathanielmhld@gmail.com>
Date: Wed, 7 Aug 2019 16:28:29 -0400
Subject: [PATCH 04/10] Now is inline with local tests and flake8

---
 sklearn/preprocessing/_encoders.py | 39 ++++++++++++++++++------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 1fc7cee5b3d97..20855ceed613f 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -96,8 +96,8 @@ def _fit(self, X, handle_unknown='error'):
                         msg = ("Found unknown categories {0} in column {1}"
                                " during fit".format(diff, i))
                         raise ValueError(msg)
-                if handle_unknown == 'virtual':
-                    cats.append([None])
+            if handle_unknown == 'virtual':
+                cats = np.append(cats, None)
             self.categories_.append(cats)
 
     def _transform(self, X, handle_unknown='error'):
@@ -561,11 +561,11 @@ class OrdinalEncoder(_BaseEncoder):
         Desired dtype of output.
 
     handle_unknown : 'error' or 'virtual', default='error'.
-        Whether to raise an error or to create a virtual cagetory if an unknown 
-        categorical feature is present during transform (default is to raise). 
-        When this parameter is set to 'virtual' and an unknown category is 
+        Whether to raise an error or to create a virtual cagetory if an unknown
+        categorical feature is present during transform (default is to raise).
+        When this parameter is set to 'virtual' and an unknown category is
         encountered during transform, an additional ordinal value will be
-        appended to the existing values, at the final ordinal positon. In the 
+        appended to the existing values, at the final ordinal positon. In the
         inverse transform, an unknown category will be denoted as None.
 
     Attributes
@@ -596,21 +596,20 @@ class OrdinalEncoder(_BaseEncoder):
            ['Female', 2]], dtype=object)
 
     >>> encvirtual = OrdinalEncoder(handle_unknown='virtual')
-    >>> X = [["Red","Coffee"], ["Green","Tea"],  ["Blue","Water"]]
+    >>> X = [['Red','Coffee'], ['Green','Tea'],  ['Blue','Water']]
     >>> encvirtual.fit(X)
     ... # doctest: +ELLIPSIS
-    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
-        handle_unknown='virtual')
-    >>> encvirtual.transform([["Red","Coffee"], ["Green","Tea"]])
+    OrdinalEncoder(handle_unknown='virtual')
+    >>> encvirtual.transform([['Red','Coffee'], ['Green','Tea']])
     array([[2., 0.],
            [1., 1.]])
-    >>> encvirtual.transform([["Purple","Coffee"], ["Green","Tea"]])
+    >>> encvirtual.transform([['Purple','Coffee'], ['Green','Tea']])
     array([[3., 0.],
            [1., 1.]])
 
     >>> encvirtual.inverse_transform([[3, 0], [1, 1]])
-    array([[None, "Coffee"],
-           ["Green","Tea"]], dtype=object)
+    array([[None, 'Coffee'],
+           ['Green', 'Tea']], dtype=object)
 
 
     See also
@@ -621,7 +620,12 @@ class OrdinalEncoder(_BaseEncoder):
       between 0 and n_classes-1.
     """
 
-    def __init__(self, categories='auto', dtype=np.float64, handle_unknown="error"):
+    def __init__(
+            self,
+            categories='auto',
+            dtype=np.float64,
+            handle_unknown="error"
+            ):
         self.categories = categories
         self.dtype = dtype
         self.handle_unknown = handle_unknown
@@ -639,7 +643,12 @@ def fit(self, X, y=None):
         self
 
         """
-        self._fit(X)
+        if self.handle_unknown not in ('error', 'virtual'):
+            msg = ("handle_unknown should be either 'error' or 'virtual', "
+                   "got {0}.".format(self.handle_unknown))
+            raise ValueError(msg)
+
+        self._fit(X, handle_unknown=self.handle_unknown)
 
         return self
 

From 3dc067b0eb9c8317c3ba63fa5f7c4daa4ecd88ac Mon Sep 17 00:00:00 2001
From: nathanielmhld <nathanielmhld@gmail.com>
Date: Wed, 7 Aug 2019 16:37:39 -0400
Subject: [PATCH 05/10] Updated docs to reflect new implementation

---
 sklearn/preprocessing/_encoders.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 20855ceed613f..36ce694ab0f59 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -97,6 +97,7 @@ def _fit(self, X, handle_unknown='error'):
                                " during fit".format(diff, i))
                         raise ValueError(msg)
             if handle_unknown == 'virtual':
+                # Add the None virtual category
                 cats = np.append(cats, None)
             self.categories_.append(cats)
 
@@ -125,10 +126,7 @@ def _transform(self, X, handle_unknown='error'):
                            " during transform".format(diff, i))
                     raise ValueError(msg)
                 elif handle_unknown == 'virtual':
-                    # Set the problematic rows to None and if necessary
-                    # append the None category to the category list.
-                    # This will create a virtual category that maps
-                    # back to None.
+                    # Set the problematic rows to None
                     Xi[~valid_mask] = None
                 else:
                     # Set the problematic rows to an acceptable value and
@@ -561,12 +559,13 @@ class OrdinalEncoder(_BaseEncoder):
         Desired dtype of output.
 
     handle_unknown : 'error' or 'virtual', default='error'.
-        Whether to raise an error or to create a virtual cagetory if an unknown
-        categorical feature is present during transform (default is to raise).
-        When this parameter is set to 'virtual' and an unknown category is
-        encountered during transform, an additional ordinal value will be
-        appended to the existing values, at the final ordinal positon. In the
-        inverse transform, an unknown category will be denoted as None.
+        Whether to raise an error if an unknown categorical feature is present
+        during transform, or to create a virtual category ``None`` when the
+        encoder is fit and map unknown categorical features to that category
+        (default is to raise). When this parameter is set to 'virtual', an
+        additional ordinal value is appended to the existing values during
+        fit, at the final ordinal position. In the inverse transform, an
+        unknown category will be denoted as None.
 
     Attributes
     ----------

From 3c1526d0386c0740bcfbf46d804c8942ca21ada3 Mon Sep 17 00:00:00 2001
From: nathanielmhld <nathanielmhld@gmail.com>
Date: Wed, 7 Aug 2019 17:19:35 -0400
Subject: [PATCH 06/10] Added a test for the behavior

---
 sklearn/preprocessing/tests/test_encoders.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 8e1a61781544a..591a86b655129 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -523,6 +523,20 @@ def test_ordinal_encoder_inverse():
     assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr)
 
 
+def test_ordinal_encoder_handle_unknown():
+    X = [["Red", "Coffee"], ["Green", "Tea"],  ["Blue", "Water"]]
+    Y = [["Purple", "Coffee"], ["Green", "Tea"]]
+    enc = OrdinalEncoder()
+    enc.fit(X)
+    with pytest.raises(ValueError, match="Found unknown categories"):
+        enc.transform(Y)
+    encVirtual = OrdinalEncoder(handle_unknown='virtual')
+    encVirtual.fit(X)
+    Y_tr = encVirtual.transform(Y)
+    exp = np.array([[None, 'Coffee'], ['Green', 'Tea']], dtype=object)
+    assert_array_equal(encVirtual.inverse_transform(Y_tr), exp)
+
+
 @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
                                np.array([['a', np.nan]], dtype=object).T],
                          ids=['numeric', 'object'])

From 56ce00f79f61bd26df272635ae873404868ec724 Mon Sep 17 00:00:00 2001
From: nathanielmhld <nathanielmhld@gmail.com>
Date: Wed, 7 Aug 2019 19:38:00 -0400
Subject: [PATCH 07/10] Added another test to comply with the umbrella thing

---
 sklearn/preprocessing/tests/test_encoders.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 591a86b655129..6862111cc23ef 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -537,6 +537,26 @@ def test_ordinal_encoder_handle_unknown():
     assert_array_equal(encVirtual.inverse_transform(Y_tr), exp)
 
 
+def test_ordinal_encoder_categories_():
+    X = [["Red", "Coffee"], ["Green", "Tea"],  ["Blue", "Water"]]
+    enc = OrdinalEncoder()
+    enc.fit(X)
+    assert_array_equal(
+                        enc.categories_[0],
+                        np.array(['Blue', 'Green', 'Red'], dtype=object))
+    assert_array_equal(
+                        enc.categories_[1],
+                        np.array(['Coffee', 'Tea', 'Water'], dtype=object))
+    encVirtual = OrdinalEncoder(handle_unknown='virtual')
+    encVirtual.fit(X)
+    assert_array_equal(
+                        encVirtual.categories_[0],
+                        np.array(['Blue', 'Green', 'Red', None], dtype=object))
+    assert_array_equal(
+                        encVirtual.categories_[1],
+                        np.array(['Coffee', 'Tea', 'Water', None], dtype=object))
+
+
 @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
                                np.array([['a', np.nan]], dtype=object).T],
                          ids=['numeric', 'object'])

From 6aa053d41d430f76eabbaea818bfcb1e657cd80a Mon Sep 17 00:00:00 2001
From: nathanielmhld <nathanielmhld@gmail.com>
Date: Wed, 7 Aug 2019 22:26:37 -0400
Subject: [PATCH 08/10] Nagging flake8 errors solved

---
 sklearn/preprocessing/tests/test_encoders.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 6862111cc23ef..231af0c0094fe 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -551,10 +551,12 @@ def test_ordinal_encoder_categories_():
     encVirtual.fit(X)
     assert_array_equal(
                         encVirtual.categories_[0],
-                        np.array(['Blue', 'Green', 'Red', None], dtype=object))
+                        np.array(['Blue', 'Green', 'Red', None],
+                                 dtype=object))
     assert_array_equal(
                         encVirtual.categories_[1],
-                        np.array(['Coffee', 'Tea', 'Water', None], dtype=object))
+                        np.array(['Coffee', 'Tea', 'Water', None],
+                                 dtype=object))
 
 
 @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,

From d4c241fbc8aa5fbdbe7cc23c34d10d41be4febd6 Mon Sep 17 00:00:00 2001
From: nathanielmhld <nathanielmhld@gmail.com>
Date: Thu, 8 Aug 2019 09:34:49 -0400
Subject: [PATCH 09/10] Removed a small amount of dead code and improve test
 coverage aiming at that 100

---
 sklearn/preprocessing/_encoders.py           | 4 ----
 sklearn/preprocessing/tests/test_encoders.py | 4 ++++
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 36ce694ab0f59..ba2216a36aaf7 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -665,10 +665,6 @@ def transform(self, X):
             Transformed input.
 
         """
-        if self.handle_unknown not in ('error', 'virtual'):
-            msg = ("handle_unknown should be either 'error' or 'virtual', "
-                   "got {0}.".format(self.handle_unknown))
-            raise ValueError(msg)
 
         X_int, _ = self._transform(X, handle_unknown=self.handle_unknown)
         return X_int.astype(self.dtype, copy=False)
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 231af0c0094fe..bcfbfefc38e5b 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -535,6 +535,9 @@ def test_ordinal_encoder_handle_unknown():
     Y_tr = encVirtual.transform(Y)
     exp = np.array([[None, 'Coffee'], ['Green', 'Tea']], dtype=object)
     assert_array_equal(encVirtual.inverse_transform(Y_tr), exp)
+    encIssue = OrdinalEncoder(handle_unknown='wrong')
+    with pytest.raises(ValueError, match="handle_unknown should be"):
+        encIssue.fit(X)
 
 
 def test_ordinal_encoder_categories_():
@@ -587,6 +590,7 @@ def test_ordinal_encoder_raise_categories_shape():
     with pytest.raises(ValueError, match=msg):
         enc.fit(X)
 
+
 def test_encoder_dtypes():
     # check that dtypes are preserved when determining categories
     enc = OneHotEncoder(categories='auto')

From 39be03102be3b735df8d5a228dad0edb3b7ff37b Mon Sep 17 00:00:00 2001
From: nathanielmhld <nathanielmhld@gmail.com>
Date: Mon, 10 Feb 2020 16:10:26 -0500
Subject: [PATCH 10/10] Accidentally had 2 see alsos

---
 sklearn/preprocessing/_encoders.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 44047486b4893..a640be511b8db 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -624,13 +624,6 @@ class OrdinalEncoder(_BaseEncoder):
         (in order of the features in X and corresponding with the output
         of ``transform``).
 
-    See Also
-    --------
-    sklearn.preprocessing.OneHotEncoder : Performs a one-hot encoding of
-      categorical features.
-    sklearn.preprocessing.LabelEncoder : Encodes target labels with values
-      between 0 and n_classes-1.
-
     Examples
     --------
     Given a dataset with two features, we let the encoder find the unique