Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions doc/whats_new/v0.24.rst
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,12 @@ Changelog
which clips the transformed values of test data to ``feature_range``.
:pr:`17833` by :user:`Yashika Sharma <yashika51>`.

- |Enhancement| Add a new ``handle_unknown`` parameter with a
``use_encoded_value`` option, along with a new ``unknown_value`` parameter,
to :class:`preprocessing.OrdinalEncoder` to allow unknown categories during
transform and set the encoded value of the unknown categories.
:pr:`17406` by :user:`Felix Wick <FelixWick>`.

:mod:`sklearn.svm`
..................

Expand Down
72 changes: 65 additions & 7 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import numpy as np
from scipy import sparse
import numbers

from ..base import BaseEstimator, TransformerMixin
from ..utils import check_array
Expand Down Expand Up @@ -115,7 +116,7 @@ def _transform(self, X, handle_unknown='error'):
for i in range(n_features):
Xi = X_list[i]
diff, valid_mask = _check_unknown(Xi, self.categories_[i],
return_mask=True)
return_mask=True)

if not np.all(valid_mask):
if handle_unknown == 'error':
Expand Down Expand Up @@ -621,12 +622,29 @@ class OrdinalEncoder(_BaseEncoder):
dtype : number type, default np.float64
Desired dtype of output.

handle_unknown : {'error', 'use_encoded_value}', default='error'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo:

handle_unknown : {'error', 'use_encoded_value'}, default='error'

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thx, fixed ;)

When set to 'error' an error will be raised in case an unknown
categorical feature is present during transform. When set to
'use_encoded_value', the encoded value of unknown categories will be
set to the value given for the parameter `unknown_value`. In
:meth:`inverse_transform`, an unknown category will be denoted as None.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please use .. versionadded

.. versionadded:: 0.24

unknown_value : int, default=None
When the parameter handle_unknown is set to 'use_encoded_value', this
parameter is required and will set the encoded value of unknown
categories. It has to be distinct from the values used to encode any of
the categories in `fit`.

.. versionadded:: 0.24

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please use .. versionadded::

Attributes
----------
categories_ : list of arrays
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This says that categories_ will correspond with the output of transform. I suspect this is no longer true. Is there a way we can make them line up?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct, in case of unknown categories there will be outputs of transform that are not in categories_, namely unknown_value. Are you suggesting a code change or just another description here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure how to change the code here. But the description, yes.

The categories of each feature determined during fitting
(in order of the features in X and corresponding with the output
of ``transform``).
The categories of each feature determined during ``fit`` (in order of
the features in X and corresponding with the output of ``transform``).
This does not include categories that weren't seen during ``fit``.

See Also
--------
Expand Down Expand Up @@ -657,9 +675,12 @@ class OrdinalEncoder(_BaseEncoder):
"""

@_deprecate_positional_args
def __init__(self, *, categories='auto', dtype=np.float64):
def __init__(self, *, categories='auto', dtype=np.float64,
handle_unknown='error', unknown_value=None):
self.categories = categories
self.dtype = dtype
self.handle_unknown = handle_unknown
self.unknown_value = unknown_value

def fit(self, X, y=None):
"""
Expand All @@ -678,8 +699,26 @@ def fit(self, X, y=None):
-------
self
"""
if self.handle_unknown == 'use_encoded_value':
if not isinstance(self.unknown_value, numbers.Integral):
raise TypeError(f"unknown_value should be an integer when "
f"`handle_unknown is 'use_encoded_value'`, "
f"got {self.unknown_value}.")
elif self.unknown_value is not None:
raise TypeError(f"unknown_value should only be set when "
f"`handle_unknown is 'use_encoded_value'`, "
f"got {self.unknown_value}.")

self._fit(X)

if self.handle_unknown == 'use_encoded_value':
for feature_cats in self.categories_:
if 0 <= self.unknown_value < len(feature_cats):
raise ValueError(f"The used value for unknown_value "
f"{self.unknown_value} is one of the "
f"values already used for encoding the "
f"seen categories.")

return self

def transform(self, X):
Expand All @@ -696,7 +735,11 @@ def transform(self, X):
X_out : sparse matrix or a 2-d array
Transformed input.
"""
X_int, _ = self._transform(X)
X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)

# create separate category for unknown values
if self.handle_unknown == 'use_encoded_value':
X_int[~X_mask] = self.unknown_value
return X_int.astype(self.dtype, copy=False)

def inverse_transform(self, X):
Expand Down Expand Up @@ -729,8 +772,23 @@ def inverse_transform(self, X):
dt = np.find_common_type([cat.dtype for cat in self.categories_], [])
X_tr = np.empty((n_samples, n_features), dtype=dt)

found_unknown = {}

for i in range(n_features):
labels = X[:, i].astype('int64', copy=False)
X_tr[:, i] = self.categories_[i][labels]
if self.handle_unknown == 'use_encoded_value':
unknown_labels = labels == self.unknown_value
X_tr[:, i] = self.categories_[i][np.where(
unknown_labels, 0, labels)]
found_unknown[i] = unknown_labels
Comment on lines +781 to +783
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we just do X_tr[:, i] = self.categories_[i][labels] as in else: case?
Since these would be overridden later anyway?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately not, as this results in an out of bounds error.

else:
X_tr[:, i] = self.categories_[i][labels]

# insert None values for unknown values
if found_unknown:
X_tr = X_tr.astype(object, copy=False)

for idx, mask in found_unknown.items():
X_tr[mask, idx] = None

return X_tr
61 changes: 61 additions & 0 deletions sklearn/preprocessing/tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,67 @@ def test_ordinal_encoder_raise_missing(X):
ohe.transform(X)


def test_ordinal_encoder_handle_unknowns_string():
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-2)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should test that unknown_value=0 or 1 is also valid (but not for inverse_transform).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure what you mean. Using unknown_value=0 or 1 would usually lead to an error during fit, because these are values already used for encoding the seen categories.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why would that be an error? What's wrong with the user wanting to conflate unknowns with a specific known category? I had thought we would want to handle that case. If we won't allow that case then we should certainly test that an error is raised if it is attempted.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a test already for checking correct raise in case of using an unknown_value that is already used for encoding one of the categories in the fit.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And nothing wrong with conflating a specific known category with unknowns. I think it is just a matter of taste whether we want to allow conflating or not. However, I would suggest to allow it only in mode use_category (Which we said we would include in a later version to keep it small for now, see #16959.), not in use_encoded_value, because it would be clearer to explicitly state the category to be conflated. But I am fine to allow it here for use_encoded_value as well, if you prefer that (It just means deleting a few lines of code ;).).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TBH, I don't mind whether we allow or disallow it at this point, but we need to explicitly do one or the other, and test it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. Please let me know if the test I added for this (last check in test_ordinal_encoder_handle_unknowns_raise) is not sufficient.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@thomasjpfan can you confirm whether it was your expectation to allow/disallow unknown_value=0?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm personally fine with being conservative and not allowing this for now. We can allow it later without breaking backward compatibility.

The test is good

X_fit = np.array([['a', 'x'], ['b', 'y'], ['c', 'z']], dtype=object)
X_trans = np.array([['c', 'xy'], ['bla', 'y'], ['a', 'x']], dtype=object)
enc.fit(X_fit)

X_trans_enc = enc.transform(X_trans)
exp = np.array([[2, -2], [-2, 1], [0, 0]], dtype='int64')
assert_array_equal(X_trans_enc, exp)

X_trans_inv = enc.inverse_transform(X_trans_enc)
inv_exp = np.array([['c', None], [None, 'y'], ['a', 'x']], dtype=object)
assert_array_equal(X_trans_inv, inv_exp)


@pytest.mark.parametrize('dtype', [float, int])
def test_ordinal_encoder_handle_unknowns_numeric(dtype):
enc = OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value=-999)
X_fit = np.array([[1, 7], [2, 8], [3, 9]], dtype=dtype)
X_trans = np.array([[3, 12], [23, 8], [1, 7]], dtype=dtype)
enc.fit(X_fit)

X_trans_enc = enc.transform(X_trans)
exp = np.array([[2, -999], [-999, 1], [0, 0]], dtype='int64')
assert_array_equal(X_trans_enc, exp)

X_trans_inv = enc.inverse_transform(X_trans_enc)
inv_exp = np.array([[3, None], [None, 8], [1, 7]], dtype=object)
assert_array_equal(X_trans_inv, inv_exp)


def test_ordinal_encoder_handle_unknowns_raise():
X = np.array([['a', 'x'], ['b', 'y']], dtype=object)

enc = OrdinalEncoder(handle_unknown='use_encoded_value')
msg = ("unknown_value should be an integer when `handle_unknown is "
"'use_encoded_value'`, got None.")
with pytest.raises(TypeError, match=msg):
enc.fit(X)

enc = OrdinalEncoder(unknown_value=-2)
msg = ("unknown_value should only be set when `handle_unknown is "
"'use_encoded_value'`, got -2.")
with pytest.raises(TypeError, match=msg):
enc.fit(X)

enc = OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value='bla')
msg = ("unknown_value should be an integer when `handle_unknown is "
"'use_encoded_value'`, got bla.")
with pytest.raises(TypeError, match=msg):
enc.fit(X)

enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=1)
msg = ("The used value for unknown_value (1) is one of the values already "
"used for encoding the seen categories.")
with pytest.raises(ValueError, match=msg):
enc.fit(X)


def test_ordinal_encoder_raise_categories_shape():

X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T
Expand Down