From 815a25dce7359c22ba496ec74c38fa6c8e1bb094 Mon Sep 17 00:00:00 2001 From: nathanielmhld Date: Wed, 31 Jul 2019 16:46:14 -0400 Subject: [PATCH 01/10] Allowing Virtual Category instead of error for OrdinalEncoder --- sklearn/preprocessing/_encoders.py | 47 ++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index c1d3b1e80c352..e0fa486ae3870 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -122,6 +122,17 @@ def _transform(self, X, handle_unknown='error'): msg = ("Found unknown categories {0} in column {1}" " during transform".format(diff, i)) raise ValueError(msg) + elif handle_unknown == 'virtual': + # Set the problematic rows to None and if necessary + # append the None category to the category list. + # This will create a virtual category that maps + # back to None. + if not None in self.categories_[i]: + self.categories_[i] = np.append( + self.categories_[i], + np.array(None) + ) + Xi[~valid_mask] = None else: # Set the problematic rows to an acceptable value and # continue `The rows are marked `X_mask` and will be @@ -552,6 +563,14 @@ class OrdinalEncoder(_BaseEncoder): dtype : number type, default np.float64 Desired dtype of output. + handle_unknown : 'error' or 'virtual', default='error'. + Whether to raise an error or to create a virtual cagetory if an unknown + categorical feature is present during transform (default is to raise). + When this parameter is set to 'virtual' and an unknown category is + encountered during transform, an additional ordinal value will be + appended to the existing values, at the final ordinal positon. In the + inverse transform, an unknown category will be denoted as None. 
+ Attributes ---------- categories_ : list of arrays @@ -579,6 +598,24 @@ class OrdinalEncoder(_BaseEncoder): array([['Male', 1], ['Female', 2]], dtype=object) + >>> encvirtual = OrdinalEncoder(handle_unknown='virtual') + >>> X = [["Red","Coffee"], ["Green","Tea"], ["Blue","Water"]] + >>> encvirtual.fit(X) + ... # doctest: +ELLIPSIS + OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>, + handle_unknown='virtual') + >>> encvirtual.transform([["Red","Coffee"], ["Green","Tea"]]) + array([[2., 0.], + [1., 1.]]) + >>> encvirtual.transform([["Purple","Coffee"], ["Green","Tea"]]) + array([[3., 0.], + [1., 1.]]) + + >>> encvirtual.inverse_transform([[3, 0], [1, 1]]) + array([[None, "Coffee"], + ["Green","Tea"]], dtype=object) + + See also -------- sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of @@ -587,9 +624,10 @@ class OrdinalEncoder(_BaseEncoder): between 0 and n_classes-1. """ - def __init__(self, categories='auto', dtype=np.float64): + def __init__(self, categories='auto', dtype=np.float64, handle_unknown="error"): self.categories = categories self.dtype = dtype + self.handle_unknown = handle_unknown def fit(self, X, y=None): """Fit the OrdinalEncoder to X. @@ -622,7 +660,12 @@ def transform(self, X): Transformed input. 
""" - X_int, _ = self._transform(X) + if self.handle_unknown not in ('error', 'virtual'): + msg = ("handle_unknown should be either 'error' or 'virtual', " + "got {0}.".format(self.handle_unknown)) + raise ValueError(msg) + + X_int, _ = self._transform(X, handle_unknown=self.handle_unknown) return X_int.astype(self.dtype, copy=False) def inverse_transform(self, X): From cccdd8cac633ff6b46bd1af4f4edb8b1fd1627b8 Mon Sep 17 00:00:00 2001 From: Nathaniel Date: Thu, 1 Aug 2019 16:53:05 -0400 Subject: [PATCH 02/10] Removed line with whitespace --- sklearn/preprocessing/_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index e0fa486ae3870..59425ccc09ec2 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -664,7 +664,7 @@ def transform(self, X): msg = ("handle_unknown should be either 'error' or 'virtual', " "got {0}.".format(self.handle_unknown)) raise ValueError(msg) - + X_int, _ = self._transform(X, handle_unknown=self.handle_unknown) return X_int.astype(self.dtype, copy=False) From 1190e2957f743ec8b139666588d52806c6173001 Mon Sep 17 00:00:00 2001 From: nathanielmhld Date: Wed, 7 Aug 2019 15:32:51 -0400 Subject: [PATCH 03/10] categories_ no longer modified in transform --- sklearn/preprocessing/_encoders.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index e0fa486ae3870..1fc7cee5b3d97 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -96,6 +96,8 @@ def _fit(self, X, handle_unknown='error'): msg = ("Found unknown categories {0} in column {1}" " during fit".format(diff, i)) raise ValueError(msg) + if handle_unknown == 'virtual': + cats.append([None]) self.categories_.append(cats) def _transform(self, X, handle_unknown='error'): @@ -127,11 +129,6 @@ def _transform(self, X, 
handle_unknown='error'): # append the None category to the category list. # This will create a virtual category that maps # back to None. - if not None in self.categories_[i]: - self.categories_[i] = np.append( - self.categories_[i], - np.array(None) - ) Xi[~valid_mask] = None else: # Set the problematic rows to an acceptable value and @@ -664,7 +661,7 @@ def transform(self, X): msg = ("handle_unknown should be either 'error' or 'virtual', " "got {0}.".format(self.handle_unknown)) raise ValueError(msg) - + X_int, _ = self._transform(X, handle_unknown=self.handle_unknown) return X_int.astype(self.dtype, copy=False) From 6d5a63de633d788c61c6bf581e5763a61f0e906a Mon Sep 17 00:00:00 2001 From: nathanielmhld Date: Wed, 7 Aug 2019 16:28:29 -0400 Subject: [PATCH 04/10] Now is inline with local tests and flake8 --- sklearn/preprocessing/_encoders.py | 39 ++++++++++++++++++------------ 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 1fc7cee5b3d97..20855ceed613f 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -96,8 +96,8 @@ def _fit(self, X, handle_unknown='error'): msg = ("Found unknown categories {0} in column {1}" " during fit".format(diff, i)) raise ValueError(msg) - if handle_unknown == 'virtual': - cats.append([None]) + if handle_unknown == 'virtual': + cats = np.append(cats, None) self.categories_.append(cats) def _transform(self, X, handle_unknown='error'): @@ -561,11 +561,11 @@ class OrdinalEncoder(_BaseEncoder): Desired dtype of output. handle_unknown : 'error' or 'virtual', default='error'. - Whether to raise an error or to create a virtual cagetory if an unknown - categorical feature is present during transform (default is to raise). 
- When this parameter is set to 'virtual' and an unknown category is + Whether to raise an error or to create a virtual cagetory if an unknown + categorical feature is present during transform (default is to raise). + When this parameter is set to 'virtual' and an unknown category is encountered during transform, an additional ordinal value will be - appended to the existing values, at the final ordinal positon. In the + appended to the existing values, at the final ordinal positon. In the inverse transform, an unknown category will be denoted as None. Attributes @@ -596,21 +596,20 @@ class OrdinalEncoder(_BaseEncoder): ['Female', 2]], dtype=object) >>> encvirtual = OrdinalEncoder(handle_unknown='virtual') - >>> X = [["Red","Coffee"], ["Green","Tea"], ["Blue","Water"]] + >>> X = [['Red','Coffee'], ['Green','Tea'], ['Blue','Water']] >>> encvirtual.fit(X) ... # doctest: +ELLIPSIS - OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>, - handle_unknown='virtual') - >>> encvirtual.transform([["Red","Coffee"], ["Green","Tea"]]) + OrdinalEncoder(handle_unknown='virtual') + >>> encvirtual.transform([['Red','Coffee'], ['Green','Tea']]) array([[2., 0.], [1., 1.]]) - >>> encvirtual.transform([["Purple","Coffee"], ["Green","Tea"]]) + >>> encvirtual.transform([['Purple','Coffee'], ['Green','Tea']]) array([[3., 0.], [1., 1.]]) >>> encvirtual.inverse_transform([[3, 0], [1, 1]]) - array([[None, "Coffee"], - ["Green","Tea"]], dtype=object) + array([[None, 'Coffee'], + ['Green', 'Tea']], dtype=object) See also @@ -621,7 +620,12 @@ class OrdinalEncoder(_BaseEncoder): between 0 and n_classes-1. 
""" - def __init__(self, categories='auto', dtype=np.float64, handle_unknown="error"): + def __init__( + self, + categories='auto', + dtype=np.float64, + handle_unknown="error" + ): self.categories = categories self.dtype = dtype self.handle_unknown = handle_unknown @@ -639,7 +643,12 @@ def fit(self, X, y=None): self """ - self._fit(X) + if self.handle_unknown not in ('error', 'virtual'): + msg = ("handle_unknown should be either 'error' or 'virtual', " + "got {0}.".format(self.handle_unknown)) + raise ValueError(msg) + + self._fit(X, handle_unknown=self.handle_unknown) return self From 3dc067b0eb9c8317c3ba63fa5f7c4daa4ecd88ac Mon Sep 17 00:00:00 2001 From: nathanielmhld Date: Wed, 7 Aug 2019 16:37:39 -0400 Subject: [PATCH 05/10] Updated docs to reflect new implementation --- sklearn/preprocessing/_encoders.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 20855ceed613f..36ce694ab0f59 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -97,6 +97,7 @@ def _fit(self, X, handle_unknown='error'): " during fit".format(diff, i)) raise ValueError(msg) if handle_unknown == 'virtual': + # Add the None virtual category cats = np.append(cats, None) self.categories_.append(cats) @@ -125,10 +126,7 @@ def _transform(self, X, handle_unknown='error'): " during transform".format(diff, i)) raise ValueError(msg) elif handle_unknown == 'virtual': - # Set the problematic rows to None and if necessary - # append the None category to the category list. - # This will create a virtual category that maps - # back to None. + # Set the problematic rows to None Xi[~valid_mask] = None else: # Set the problematic rows to an acceptable value and @@ -561,12 +559,13 @@ class OrdinalEncoder(_BaseEncoder): Desired dtype of output. handle_unknown : 'error' or 'virtual', default='error'. 
- Whether to raise an error or to create a virtual cagetory if an unknown - categorical feature is present during transform (default is to raise). - When this parameter is set to 'virtual' and an unknown category is - encountered during transform, an additional ordinal value will be - appended to the existing values, at the final ordinal positon. In the - inverse transform, an unknown category will be denoted as None. + Whether to raise an error if an unknown categorical feature is present + during transform, or to create a virtual category "None" at + fit time, and pass unknown categorical features to that "None" + category (default is to raise). When this parameter is set to 'virtual' + an additional ordinal value will be appended to the existing values at + fit, at the final ordinal position. In the inverse transform, an unknown + category will be denoted as None. Attributes ---------- From 3c1526d0386c0740bcfbf46d804c8942ca21ada3 Mon Sep 17 00:00:00 2001 From: nathanielmhld Date: Wed, 7 Aug 2019 17:19:35 -0400 Subject: [PATCH 06/10] Added a test for the behavior --- sklearn/preprocessing/tests/test_encoders.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 8e1a61781544a..591a86b655129 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -523,6 +523,20 @@ def test_ordinal_encoder_inverse(): assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr) +def test_ordinal_encoder_handle_unknown(): + X = [["Red", "Coffee"], ["Green", "Tea"], ["Blue", "Water"]] + Y = [["Purple", "Coffee"], ["Green", "Tea"]] + enc = OrdinalEncoder() + enc.fit(X) + with pytest.raises(ValueError, match="Found unknown categories"): + enc.transform(Y) + encVirtual = OrdinalEncoder(handle_unknown='virtual') + encVirtual.fit(X) + Y_tr = encVirtual.transform(Y) + exp = np.array([[None, 'Coffee'], ['Green', 
'Tea']], dtype=object) + assert_array_equal(encVirtual.inverse_transform(Y_tr), exp) + + @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T, np.array([['a', np.nan]], dtype=object).T], ids=['numeric', 'object']) From 56ce00f79f61bd26df272635ae873404868ec724 Mon Sep 17 00:00:00 2001 From: nathanielmhld Date: Wed, 7 Aug 2019 19:38:00 -0400 Subject: [PATCH 07/10] Added another test to comply with the umbrella thing --- sklearn/preprocessing/tests/test_encoders.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 591a86b655129..6862111cc23ef 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -537,6 +537,26 @@ def test_ordinal_encoder_handle_unknown(): assert_array_equal(encVirtual.inverse_transform(Y_tr), exp) +def test_ordinal_encoder_categories_(): + X = [["Red", "Coffee"], ["Green", "Tea"], ["Blue", "Water"]] + enc = OrdinalEncoder() + enc.fit(X) + assert_array_equal( + enc.categories_[0], + np.array(['Blue', 'Green', 'Red'], dtype=object)) + assert_array_equal( + enc.categories_[1], + np.array(['Coffee', 'Tea', 'Water'], dtype=object)) + encVirtual = OrdinalEncoder(handle_unknown='virtual') + encVirtual.fit(X) + assert_array_equal( + encVirtual.categories_[0], + np.array(['Blue', 'Green', 'Red', None], dtype=object)) + assert_array_equal( + encVirtual.categories_[1], + np.array(['Coffee', 'Tea', 'Water', None], dtype=object)) + + @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T, np.array([['a', np.nan]], dtype=object).T], ids=['numeric', 'object']) From 6aa053d41d430f76eabbaea818bfcb1e657cd80a Mon Sep 17 00:00:00 2001 From: nathanielmhld Date: Wed, 7 Aug 2019 22:26:37 -0400 Subject: [PATCH 08/10] Nagging flake8 errors solved --- sklearn/preprocessing/tests/test_encoders.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 6862111cc23ef..231af0c0094fe 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -551,10 +551,12 @@ def test_ordinal_encoder_categories_(): encVirtual.fit(X) assert_array_equal( encVirtual.categories_[0], - np.array(['Blue', 'Green', 'Red', None], dtype=object)) + np.array(['Blue', 'Green', 'Red', None], + dtype=object)) assert_array_equal( encVirtual.categories_[1], - np.array(['Coffee', 'Tea', 'Water', None], dtype=object)) + np.array(['Coffee', 'Tea', 'Water', None], + dtype=object)) @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T, From d4c241fbc8aa5fbdbe7cc23c34d10d41be4febd6 Mon Sep 17 00:00:00 2001 From: nathanielmhld Date: Thu, 8 Aug 2019 09:34:49 -0400 Subject: [PATCH 09/10] Removed a small amount of dead code and improve test coverage aiming at that 100 --- sklearn/preprocessing/_encoders.py | 4 ---- sklearn/preprocessing/tests/test_encoders.py | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 36ce694ab0f59..ba2216a36aaf7 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -665,10 +665,6 @@ def transform(self, X): Transformed input. 
""" - if self.handle_unknown not in ('error', 'virtual'): - msg = ("handle_unknown should be either 'error' or 'virtual', " - "got {0}.".format(self.handle_unknown)) - raise ValueError(msg) X_int, _ = self._transform(X, handle_unknown=self.handle_unknown) return X_int.astype(self.dtype, copy=False) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 231af0c0094fe..bcfbfefc38e5b 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -535,6 +535,9 @@ def test_ordinal_encoder_handle_unknown(): Y_tr = encVirtual.transform(Y) exp = np.array([[None, 'Coffee'], ['Green', 'Tea']], dtype=object) assert_array_equal(encVirtual.inverse_transform(Y_tr), exp) + encIssue = OrdinalEncoder(handle_unknown='wrong') + with pytest.raises(ValueError, match="handle_unknown should be"): + encIssue.fit(X) def test_ordinal_encoder_categories_(): @@ -587,6 +590,7 @@ def test_ordinal_encoder_raise_categories_shape(): with pytest.raises(ValueError, match=msg): enc.fit(X) + def test_encoder_dtypes(): # check that dtypes are preserved when determining categories enc = OneHotEncoder(categories='auto') From 39be03102be3b735df8d5a228dad0edb3b7ff37b Mon Sep 17 00:00:00 2001 From: nathanielmhld Date: Mon, 10 Feb 2020 16:10:26 -0500 Subject: [PATCH 10/10] Accidentally had 2 see alsos --- sklearn/preprocessing/_encoders.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 44047486b4893..a640be511b8db 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -624,13 +624,6 @@ class OrdinalEncoder(_BaseEncoder): (in order of the features in X and corresponding with the output of ``transform``). - See Also - -------- - sklearn.preprocessing.OneHotEncoder : Performs a one-hot encoding of - categorical features. 
- sklearn.preprocessing.LabelEncoder : Encodes target labels with values - between 0 and n_classes-1. - Examples -------- Given a dataset with two features, we let the encoder find the unique