Skip to content
Closed
64 changes: 54 additions & 10 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ def _fit(self, X, handle_unknown='error'):
msg = ("Found unknown categories {0} in column {1}"
" during fit".format(diff, i))
raise ValueError(msg)
if handle_unknown == 'virtual':
# Add the None virtual category
cats = np.append(cats, None)
self.categories_.append(cats)

def _transform(self, X, handle_unknown='error'):
Expand All @@ -122,6 +125,9 @@ def _transform(self, X, handle_unknown='error'):
msg = ("Found unknown categories {0} in column {1}"
" during transform".format(diff, i))
raise ValueError(msg)
elif handle_unknown == 'virtual':
# Set the problematic rows to None
Xi[~valid_mask] = None
else:
# Set the problematic rows to an acceptable value and
# continue. The rows are marked `X_mask` and will be
Expand Down Expand Up @@ -602,20 +608,22 @@ class OrdinalEncoder(_BaseEncoder):
dtype : number type, default np.float64
Desired dtype of output.

handle_unknown : {'error', 'virtual'}, default='error'
Whether to raise an error when an unknown category is encountered
during transform, or to map it to a virtual "None" category (default
is to raise). When this parameter is set to 'virtual', an additional
category None is appended to the categories of each feature during
fit, at the final ordinal position, and unknown categories are encoded
with that ordinal value. In the inverse transform, an unknown category
is denoted as None.

Attributes
----------
categories_ : list of arrays
The categories of each feature determined during fitting
(in order of the features in X and corresponding with the output
of ``transform``).

See Also
--------
sklearn.preprocessing.OneHotEncoder : Performs a one-hot encoding of
categorical features.
sklearn.preprocessing.LabelEncoder : Encodes target labels with values
between 0 and n_classes-1.

Examples
--------
Given a dataset with two features, we let the encoder find the unique
Expand All @@ -635,11 +643,41 @@ class OrdinalEncoder(_BaseEncoder):
>>> enc.inverse_transform([[1, 0], [0, 1]])
array([['Male', 1],
['Female', 2]], dtype=object)

>>> encvirtual = OrdinalEncoder(handle_unknown='virtual')
>>> X = [['Red','Coffee'], ['Green','Tea'], ['Blue','Water']]
>>> encvirtual.fit(X)
... # doctest: +ELLIPSIS
OrdinalEncoder(handle_unknown='virtual')
>>> encvirtual.transform([['Red','Coffee'], ['Green','Tea']])
array([[2., 0.],
[1., 1.]])
>>> encvirtual.transform([['Purple','Coffee'], ['Green','Tea']])
array([[3., 0.],
[1., 1.]])

>>> encvirtual.inverse_transform([[3, 0], [1, 1]])
array([[None, 'Coffee'],
['Green', 'Tea']], dtype=object)


See also
--------
sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
categorical features.
sklearn.preprocessing.LabelEncoder : encodes target labels with values
between 0 and n_classes-1.
"""

def __init__(self, categories='auto', dtype=np.float64):
def __init__(self, categories='auto', dtype=np.float64,
             handle_unknown="error"):
    # Constructor arguments are stored untouched; the value of
    # ``handle_unknown`` is only checked later, when ``fit`` is called.
    self.handle_unknown = handle_unknown
    self.dtype = dtype
    self.categories = categories

def fit(self, X, y=None):
"""
Expand All @@ -658,7 +696,12 @@ def fit(self, X, y=None):
-------
self
"""
self._fit(X)
if self.handle_unknown not in ('error', 'virtual'):
msg = ("handle_unknown should be either 'error' or 'virtual', "
"got {0}.".format(self.handle_unknown))
raise ValueError(msg)

self._fit(X, handle_unknown=self.handle_unknown)

return self

Expand All @@ -676,7 +719,8 @@ def transform(self, X):
X_out : sparse matrix or a 2-d array
Transformed input.
"""
X_int, _ = self._transform(X)

X_int, _ = self._transform(X, handle_unknown=self.handle_unknown)
return X_int.astype(self.dtype, copy=False)

def inverse_transform(self, X):
Expand Down
39 changes: 39 additions & 0 deletions sklearn/preprocessing/tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,45 @@ def test_ordinal_encoder_inverse():
enc.inverse_transform(X_tr)


def test_ordinal_encoder_handle_unknown():
    """Check the three ``handle_unknown`` code paths of OrdinalEncoder.

    * default ('error'): transforming an unseen category raises;
    * 'virtual': an unseen category round-trips to ``None``;
    * any other value: ``fit`` rejects the parameter.
    """
    train = [["Red", "Coffee"], ["Green", "Tea"], ["Blue", "Water"]]
    unseen = [["Purple", "Coffee"], ["Green", "Tea"]]

    # Default behaviour: "Purple" was never seen during fit.
    strict = OrdinalEncoder().fit(train)
    with pytest.raises(ValueError, match="Found unknown categories"):
        strict.transform(unseen)

    # 'virtual': the unknown value comes back as None after a round trip.
    lenient = OrdinalEncoder(handle_unknown='virtual').fit(train)
    round_trip = lenient.inverse_transform(lenient.transform(unseen))
    expected = np.array([[None, 'Coffee'], ['Green', 'Tea']], dtype=object)
    assert_array_equal(round_trip, expected)

    # An unsupported option is reported when fitting.
    invalid = OrdinalEncoder(handle_unknown='wrong')
    with pytest.raises(ValueError, match="handle_unknown should be"):
        invalid.fit(train)


def test_ordinal_encoder_categories_():
    """Check ``categories_`` with and without the virtual None category."""
    X = [["Red", "Coffee"], ["Green", "Tea"], ["Blue", "Water"]]
    # Categories expected per column, in the order the encoder stores them.
    learned = [['Blue', 'Green', 'Red'], ['Coffee', 'Tea', 'Water']]

    # Default encoder: exactly the categories seen during fit.
    plain = OrdinalEncoder().fit(X)
    for col, names in enumerate(learned):
        assert_array_equal(plain.categories_[col],
                           np.array(names, dtype=object))

    # 'virtual' encoder: the same categories followed by a trailing None.
    virtual = OrdinalEncoder(handle_unknown='virtual').fit(X)
    for col, names in enumerate(learned):
        assert_array_equal(virtual.categories_[col],
                           np.array(names + [None], dtype=object))


@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
np.array([['a', np.nan]], dtype=object).T],
ids=['numeric', 'object'])
Expand Down