-
-
Notifications
You must be signed in to change notification settings - Fork 26.2k
FEA allow unknowns in OrdinalEncoder transform #17406
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
04b263a
38456cc
82e31d3
439b08e
ce75639
274b999
3394490
6be44fa
19e9474
1d84ec4
72a7b75
c86a834
9b5359e
d02c410
215ec88
3127688
41fb313
8c8b565
fd7c9fe
46f8f1a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
|
||
import numpy as np | ||
from scipy import sparse | ||
import numbers | ||
|
||
from ..base import BaseEstimator, TransformerMixin | ||
from ..utils import check_array | ||
|
@@ -115,7 +116,7 @@ def _transform(self, X, handle_unknown='error'): | |
for i in range(n_features): | ||
Xi = X_list[i] | ||
diff, valid_mask = _check_unknown(Xi, self.categories_[i], | ||
return_mask=True) | ||
return_mask=True) | ||
|
||
if not np.all(valid_mask): | ||
if handle_unknown == 'error': | ||
|
@@ -621,12 +622,29 @@ class OrdinalEncoder(_BaseEncoder): | |
dtype : number type, default np.float64 | ||
Desired dtype of output. | ||
|
||
handle_unknown : {'error', 'use_encoded_value}', default='error' | ||
When set to 'error' an error will be raised in case an unknown | ||
categorical feature is present during transform. When set to | ||
'use_encoded_value', the encoded value of unknown categories will be | ||
set to the value given for the parameter `unknown_value`. In | ||
:meth:`inverse_transform`, an unknown category will be denoted as None. | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please use |
||
.. versionadded:: 0.24 | ||
|
||
unknown_value : int, default=None | ||
When the parameter handle_unknown is set to 'use_encoded_value', this | ||
parameter is required and will set the encoded value of unknown | ||
categories. It has to be distinct from the values used to encode any of | ||
the categories in `fit`. | ||
|
||
.. versionadded:: 0.24 | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please use |
||
Attributes | ||
---------- | ||
categories_ : list of arrays | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This says that categories_ will correspond with the output of transform. I suspect this is no longer true. Is there a way we can make them line up? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Correct, in case of unknown categories there will be outputs of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure how to change the code here. But the description, yes. |
||
The categories of each feature determined during fitting | ||
(in order of the features in X and corresponding with the output | ||
of ``transform``). | ||
The categories of each feature determined during ``fit`` (in order of | ||
the features in X and corresponding with the output of ``transform``). | ||
This does not include categories that weren't seen during ``fit``. | ||
|
||
See Also | ||
-------- | ||
|
@@ -657,9 +675,12 @@ class OrdinalEncoder(_BaseEncoder): | |
""" | ||
|
||
@_deprecate_positional_args | ||
def __init__(self, *, categories='auto', dtype=np.float64): | ||
def __init__(self, *, categories='auto', dtype=np.float64, | ||
handle_unknown='error', unknown_value=None): | ||
self.categories = categories | ||
self.dtype = dtype | ||
self.handle_unknown = handle_unknown | ||
self.unknown_value = unknown_value | ||
|
||
def fit(self, X, y=None): | ||
""" | ||
|
@@ -678,8 +699,26 @@ def fit(self, X, y=None): | |
------- | ||
self | ||
""" | ||
if self.handle_unknown == 'use_encoded_value': | ||
if not isinstance(self.unknown_value, numbers.Integral): | ||
raise TypeError(f"unknown_value should be an integer when " | ||
f"`handle_unknown is 'use_encoded_value'`, " | ||
f"got {self.unknown_value}.") | ||
elif self.unknown_value is not None: | ||
raise TypeError(f"unknown_value should only be set when " | ||
f"`handle_unknown is 'use_encoded_value'`, " | ||
f"got {self.unknown_value}.") | ||
|
||
NicolasHug marked this conversation as resolved.
Show resolved
Hide resolved
|
||
self._fit(X) | ||
|
||
if self.handle_unknown == 'use_encoded_value': | ||
for feature_cats in self.categories_: | ||
if 0 <= self.unknown_value < len(feature_cats): | ||
raise ValueError(f"The used value for unknown_value " | ||
f"{self.unknown_value} is one of the " | ||
f"values already used for encoding the " | ||
f"seen categories.") | ||
|
||
return self | ||
|
||
def transform(self, X): | ||
|
@@ -696,7 +735,11 @@ def transform(self, X): | |
X_out : sparse matrix or a 2-d array | ||
Transformed input. | ||
""" | ||
X_int, _ = self._transform(X) | ||
X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) | ||
|
||
# create separate category for unknown values | ||
if self.handle_unknown == 'use_encoded_value': | ||
X_int[~X_mask] = self.unknown_value | ||
return X_int.astype(self.dtype, copy=False) | ||
|
||
def inverse_transform(self, X): | ||
|
@@ -729,8 +772,23 @@ def inverse_transform(self, X): | |
dt = np.find_common_type([cat.dtype for cat in self.categories_], []) | ||
X_tr = np.empty((n_samples, n_features), dtype=dt) | ||
|
||
found_unknown = {} | ||
|
||
for i in range(n_features): | ||
labels = X[:, i].astype('int64', copy=False) | ||
X_tr[:, i] = self.categories_[i][labels] | ||
if self.handle_unknown == 'use_encoded_value': | ||
unknown_labels = labels == self.unknown_value | ||
X_tr[:, i] = self.categories_[i][np.where( | ||
unknown_labels, 0, labels)] | ||
found_unknown[i] = unknown_labels | ||
Comment on lines
+781
to
+783
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we just do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately not, as this results in an out of bounds error. |
||
else: | ||
X_tr[:, i] = self.categories_[i][labels] | ||
|
||
# insert None values for unknown values | ||
if found_unknown: | ||
X_tr = X_tr.astype(object, copy=False) | ||
|
||
for idx, mask in found_unknown.items(): | ||
X_tr[mask, idx] = None | ||
|
||
return X_tr |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -553,6 +553,67 @@ def test_ordinal_encoder_raise_missing(X): | |
ohe.transform(X) | ||
|
||
|
||
def test_ordinal_encoder_handle_unknowns_string(): | ||
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-2) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we should test that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure what you mean. Using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why would that be an error? What's wrong with the user wanting to conflate unknowns with a specific known category? I had thought we would want to handle that case. If we won't allow that case then we should certainly test that an error is raised if it is attempted. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is a test already for checking correct raise in case of using an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And nothing wrong with conflating a specific known category with unknowns. I think it is just a matter of taste whether we want to allow conflating or not. However, I would suggest to allow it only in mode There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TBH, I don't mind whether we allow or disallow it at this point, but we need to explicitly do one or the other, and test it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed. Please let me know if the test I added for this (last check in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @thomasjpfan can you confirm whether it was your expectation to allow/disallow There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm personally fine with being conservative and not allowing this for now. We can allow it later without breaking backward compatibility. The test is good |
||
X_fit = np.array([['a', 'x'], ['b', 'y'], ['c', 'z']], dtype=object) | ||
X_trans = np.array([['c', 'xy'], ['bla', 'y'], ['a', 'x']], dtype=object) | ||
enc.fit(X_fit) | ||
|
||
X_trans_enc = enc.transform(X_trans) | ||
exp = np.array([[2, -2], [-2, 1], [0, 0]], dtype='int64') | ||
assert_array_equal(X_trans_enc, exp) | ||
|
||
X_trans_inv = enc.inverse_transform(X_trans_enc) | ||
inv_exp = np.array([['c', None], [None, 'y'], ['a', 'x']], dtype=object) | ||
assert_array_equal(X_trans_inv, inv_exp) | ||
|
||
|
||
@pytest.mark.parametrize('dtype', [float, int]) | ||
def test_ordinal_encoder_handle_unknowns_numeric(dtype): | ||
enc = OrdinalEncoder(handle_unknown='use_encoded_value', | ||
unknown_value=-999) | ||
X_fit = np.array([[1, 7], [2, 8], [3, 9]], dtype=dtype) | ||
X_trans = np.array([[3, 12], [23, 8], [1, 7]], dtype=dtype) | ||
enc.fit(X_fit) | ||
|
||
X_trans_enc = enc.transform(X_trans) | ||
exp = np.array([[2, -999], [-999, 1], [0, 0]], dtype='int64') | ||
assert_array_equal(X_trans_enc, exp) | ||
|
||
X_trans_inv = enc.inverse_transform(X_trans_enc) | ||
inv_exp = np.array([[3, None], [None, 8], [1, 7]], dtype=object) | ||
assert_array_equal(X_trans_inv, inv_exp) | ||
|
||
|
||
def test_ordinal_encoder_handle_unknowns_raise(): | ||
X = np.array([['a', 'x'], ['b', 'y']], dtype=object) | ||
|
||
enc = OrdinalEncoder(handle_unknown='use_encoded_value') | ||
msg = ("unknown_value should be an integer when `handle_unknown is " | ||
"'use_encoded_value'`, got None.") | ||
with pytest.raises(TypeError, match=msg): | ||
enc.fit(X) | ||
|
||
enc = OrdinalEncoder(unknown_value=-2) | ||
msg = ("unknown_value should only be set when `handle_unknown is " | ||
"'use_encoded_value'`, got -2.") | ||
with pytest.raises(TypeError, match=msg): | ||
enc.fit(X) | ||
|
||
enc = OrdinalEncoder(handle_unknown='use_encoded_value', | ||
unknown_value='bla') | ||
msg = ("unknown_value should be an integer when `handle_unknown is " | ||
"'use_encoded_value'`, got bla.") | ||
with pytest.raises(TypeError, match=msg): | ||
enc.fit(X) | ||
|
||
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=1) | ||
msg = ("The used value for unknown_value (1) is one of the values already " | ||
"used for encoding the seen categories.") | ||
with pytest.raises(ValueError, match=msg): | ||
enc.fit(X) | ||
|
||
|
||
def test_ordinal_encoder_raise_categories_shape(): | ||
|
||
X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Typo:
handle_unknown : {'error', 'use_encoded_value'}, default='error'
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thx, fixed ;)