diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 33b53073cfc31..a640be511b8db 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -96,6 +96,9 @@ def _fit(self, X, handle_unknown='error'): msg = ("Found unknown categories {0} in column {1}" " during fit".format(diff, i)) raise ValueError(msg) + if handle_unknown == 'virtual': + # Add the None virtual category + cats = np.append(cats, None) self.categories_.append(cats) def _transform(self, X, handle_unknown='error'): @@ -122,6 +125,9 @@ def _transform(self, X, handle_unknown='error'): msg = ("Found unknown categories {0} in column {1}" " during transform".format(diff, i)) raise ValueError(msg) + elif handle_unknown == 'virtual': + # Set the problematic rows to None + Xi[~valid_mask] = None else: # Set the problematic rows to an acceptable value and # continue `The rows are marked `X_mask` and will be @@ -602,6 +608,15 @@ class OrdinalEncoder(_BaseEncoder): dtype : number type, default np.float64 Desired dtype of output. + handle_unknown : 'error' or 'virtual', default='error'. + Whether to raise an error if an unknown categorical feature is present + during transform, or to create a virtual category "None" at + fit time, and pass unknown categorical features to that "None" + category (default is to raise). When this parameter is set to 'virtual' + an additional ordinal value will be appended to the existing values at + fit, at the final ordinal position. In the inverse transform, an unknown + category will be denoted as None. + Attributes ---------- categories_ : list of arrays @@ -609,13 +624,6 @@ class OrdinalEncoder(_BaseEncoder): (in order of the features in X and corresponding with the output of ``transform``). - See Also - -------- - sklearn.preprocessing.OneHotEncoder : Performs a one-hot encoding of - categorical features. - sklearn.preprocessing.LabelEncoder : Encodes target labels with values - between 0 and n_classes-1. 
- Examples -------- Given a dataset with two features, we let the encoder find the unique @@ -635,11 +643,41 @@ class OrdinalEncoder(_BaseEncoder): >>> enc.inverse_transform([[1, 0], [0, 1]]) array([['Male', 1], ['Female', 2]], dtype=object) + + >>> encvirtual = OrdinalEncoder(handle_unknown='virtual') + >>> X = [['Red','Coffee'], ['Green','Tea'], ['Blue','Water']] + >>> encvirtual.fit(X) + ... # doctest: +ELLIPSIS + OrdinalEncoder(handle_unknown='virtual') + >>> encvirtual.transform([['Red','Coffee'], ['Green','Tea']]) + array([[2., 0.], + [1., 1.]]) + >>> encvirtual.transform([['Purple','Coffee'], ['Green','Tea']]) + array([[3., 0.], + [1., 1.]]) + + >>> encvirtual.inverse_transform([[3, 0], [1, 1]]) + array([[None, 'Coffee'], + ['Green', 'Tea']], dtype=object) + + + See also + -------- + sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of + categorical features. + sklearn.preprocessing.LabelEncoder : encodes target labels with values + between 0 and n_classes-1. """ - def __init__(self, categories='auto', dtype=np.float64): + def __init__( + self, + categories='auto', + dtype=np.float64, + handle_unknown="error" + ): self.categories = categories self.dtype = dtype + self.handle_unknown = handle_unknown def fit(self, X, y=None): """ @@ -658,7 +696,12 @@ def fit(self, X, y=None): ------- self """ - self._fit(X) + if self.handle_unknown not in ('error', 'virtual'): + msg = ("handle_unknown should be either 'error' or 'virtual', " + "got {0}.".format(self.handle_unknown)) + raise ValueError(msg) + + self._fit(X, handle_unknown=self.handle_unknown) return self @@ -676,7 +719,8 @@ def transform(self, X): X_out : sparse matrix or a 2-d array Transformed input. 
""" - X_int, _ = self._transform(X) + + X_int, _ = self._transform(X, handle_unknown=self.handle_unknown) return X_int.astype(self.dtype, copy=False) def inverse_transform(self, X): diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index f6283c71313b6..7c74165211bc7 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -523,6 +523,45 @@ def test_ordinal_encoder_inverse(): enc.inverse_transform(X_tr) +def test_ordinal_encoder_handle_unknown(): + X = [["Red", "Coffee"], ["Green", "Tea"], ["Blue", "Water"]] + Y = [["Purple", "Coffee"], ["Green", "Tea"]] + enc = OrdinalEncoder() + enc.fit(X) + with pytest.raises(ValueError, match="Found unknown categories"): + enc.transform(Y) + encVirtual = OrdinalEncoder(handle_unknown='virtual') + encVirtual.fit(X) + Y_tr = encVirtual.transform(Y) + exp = np.array([[None, 'Coffee'], ['Green', 'Tea']], dtype=object) + assert_array_equal(encVirtual.inverse_transform(Y_tr), exp) + encIssue = OrdinalEncoder(handle_unknown='wrong') + with pytest.raises(ValueError, match="handle_unknown should be"): + encIssue.fit(X) + + +def test_ordinal_encoder_categories_(): + X = [["Red", "Coffee"], ["Green", "Tea"], ["Blue", "Water"]] + enc = OrdinalEncoder() + enc.fit(X) + assert_array_equal( + enc.categories_[0], + np.array(['Blue', 'Green', 'Red'], dtype=object)) + assert_array_equal( + enc.categories_[1], + np.array(['Coffee', 'Tea', 'Water'], dtype=object)) + encVirtual = OrdinalEncoder(handle_unknown='virtual') + encVirtual.fit(X) + assert_array_equal( + encVirtual.categories_[0], + np.array(['Blue', 'Green', 'Red', None], + dtype=object)) + assert_array_equal( + encVirtual.categories_[1], + np.array(['Coffee', 'Tea', 'Water', None], + dtype=object)) + + @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T, np.array([['a', np.nan]], dtype=object).T], ids=['numeric', 'object'])