Skip to content

Handle unseen labels in LabelEncoder #13423

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 63 additions & 18 deletions sklearn/preprocessing/label.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
]


def _encode_numpy(values, uniques=None, encode=False):
def _encode_numpy(values, uniques=None, encode=False, impute_value=None):
# only used in _encode below, see docstring there for details
if uniques is None:
if encode:
Expand All @@ -43,8 +43,10 @@ def _encode_numpy(values, uniques=None, encode=False):
# unique sorts
return np.unique(values)
if encode:
diff = _encode_check_unknown(values, uniques)
if diff:
diff, mask = _encode_check_unknown(values, uniques, return_mask=True)
if impute_value:
values[~mask] = impute_value
elif diff:
raise ValueError("y contains previously unseen labels: %s"
% str(diff))
encoded = np.searchsorted(uniques, values)
Expand All @@ -53,24 +55,28 @@ def _encode_numpy(values, uniques=None, encode=False):
return uniques


def _encode_python(values, uniques=None, encode=False, impute_value=None):
    """Pure-python factorize/encode helper.

    Only used in _encode below, see docstring there for details.

    Parameters
    ----------
    values : array
        Values to factorize or encode.
    uniques : array, optional
        Known unique values. When None, they are computed from ``values``
        (sorted).
    encode : bool, default False
        If True, also return the integer codes of ``values``.
    impute_value : str, int or float, optional
        If not None, values missing from ``uniques`` are encoded with the
        code of this value instead of raising.

    Returns
    -------
    uniques : array
        Returned alone when ``encode`` is False.
    (uniques, encoded) : tuple of arrays
        Returned when ``encode`` is True.

    Raises
    ------
    ValueError
        If ``encode`` is True, ``impute_value`` is None and ``values``
        contains entries that are not in ``uniques``.
    """
    if uniques is None:
        uniques = sorted(set(values))
        uniques = np.array(uniques, dtype=values.dtype)
    if not encode:
        return uniques

    table = {val: i for i, val in enumerate(uniques)}
    # Compare against None instead of relying on truthiness: falsy labels
    # such as 0 or '' are perfectly valid values to impute with.
    if impute_value is not None:
        encoded = np.array([table[v] if v in table else table[impute_value]
                            for v in values])
    else:
        try:
            encoded = np.array([table[v] for v in values])
        except KeyError as e:
            raise ValueError("y contains previously unseen labels: %s"
                             % str(e))
    return uniques, encoded


def _encode(values, uniques=None, encode=False):
def _encode(values, uniques=None, encode=False, impute_value=None):
"""Helper function to factorize (find uniques) and encode values.

Uses pure python method for object dtype, and numpy method for
Expand All @@ -90,6 +96,9 @@ def _encode(values, uniques=None, encode=False):
already have been determined in fit).
encode : bool, default False
If True, also encode the values into integer codes based on `uniques`.
impute_value : str, int or float, optional
If given, values that are not present in `uniques` are replaced by this
value during encoding instead of raising a ValueError.

Returns
-------
Expand All @@ -102,12 +111,12 @@ def _encode(values, uniques=None, encode=False):
"""
if values.dtype == object:
try:
res = _encode_python(values, uniques, encode)
res = _encode_python(values, uniques, encode, impute_value)
except TypeError:
raise TypeError("argument must be a string or number")
return res
else:
return _encode_numpy(values, uniques, encode)
return _encode_numpy(values, uniques, encode, impute_value)


def _encode_check_unknown(values, uniques, return_mask=False):
Expand Down Expand Up @@ -165,6 +174,13 @@ class LabelEncoder(BaseEstimator, TransformerMixin):

Read more in the :ref:`User Guide <preprocessing_targets>`.

Parameters
----------
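impute_method : {'most_common'} or None, default=None
Method used to encode labels passed to transform that were not seen
during fit. With 'most_common', unseen labels are encoded as the most
frequent label seen during fit. By default (None) no imputation is done
and a ValueError is raised if any previously unseen labels are found.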

Attributes
----------
classes_ : array of shape (n_class,)
Expand All @@ -177,7 +193,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
>>> from sklearn import preprocessing
>>> le = preprocessing.LabelEncoder()
>>> le.fit([1, 2, 2, 6])
LabelEncoder()
LabelEncoder(impute_method=None)
>>> le.classes_
array([1, 2, 6])
>>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS
Expand All @@ -190,7 +206,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):

>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder()
LabelEncoder(impute_method=None)
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS
Expand All @@ -204,6 +220,10 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
using a one-hot or ordinal encoding scheme.
"""

def __init__(self, impute_method=None):
    # Store the parameter as given; it is only checked when
    # _calculate_impute_value runs during fit / fit_transform.
    self.impute_method = impute_method
    # __init__ must not return a value, so the parent initializer is called
    # as a plain statement rather than being returned.
    super().__init__()

def fit(self, y):
"""Fit label encoder

Expand All @@ -217,7 +237,9 @@ def fit(self, y):
self : returns an instance of self.
"""
y = column_or_1d(y, warn=True)
self.classes_ = _encode(y)
self.classes_, y = _encode(y, encode=True)
if self.impute_method:
self.impute_value = self._calculate_impute_value(y)
return self

def fit_transform(self, y):
Expand All @@ -234,6 +256,8 @@ def fit_transform(self, y):
"""
y = column_or_1d(y, warn=True)
self.classes_, y = _encode(y, encode=True)
if self.impute_method:
self.impute_value = self._calculate_impute_value(y)
return y

def transform(self, y):
Expand All @@ -253,8 +277,9 @@ def transform(self, y):
# transform of empty array is empty array
if _num_samples(y) == 0:
return np.array([])

_, y = _encode(y, uniques=self.classes_, encode=True)
impute_value = self.impute_value if self.impute_method else None
_, y = _encode(y, uniques=self.classes_,
encode=True, impute_value=impute_value)
return y

def inverse_transform(self, y):
Expand All @@ -278,13 +303,33 @@ def inverse_transform(self, y):
diff = np.setdiff1d(y, np.arange(len(self.classes_)))
if len(diff):
raise ValueError(
"y contains previously unseen labels: %s" % str(diff))
"y contains previously unseen labels: %s" % str(diff)
)
y = np.asarray(y)
return self.classes_[y]

def _more_tags(self):
    # Declares '1dlabels' as this estimator's X type; presumably consumed by
    # scikit-learn's estimator-tag machinery -- confirm against BaseEstimator.
    return {'X_types': ['1dlabels']}

def _calculate_impute_value(self, y):
    """Calculate the label used to replace unseen labels.

    Parameters
    ----------
    y : numpy array of shape [n_samples]
        Integer codes of the training labels, i.e. indices into
        ``self.classes_`` (not the raw labels).

    Returns
    -------
    impute_value : str, int or float
        The entry of ``self.classes_`` whose code occurs most often in
        ``y``. Ties are resolved in favour of the smallest code.

    Raises
    ------
    ValueError
        If ``self.impute_method`` is not 'most_common', or if ``y`` is
        empty so that no most common label exists.
    """
    if self.impute_method != 'most_common':
        raise ValueError('impute_method can only be "most_common" or None')
    if np.size(y) == 0:
        # np.argmax would fail below with an unrelated-looking message.
        raise ValueError("y is empty: cannot determine the most common "
                         "label to impute")
    # np.unique returns sorted codes, and np.argmax picks the first maximum.
    codes, counts = np.unique(y, return_counts=True)
    return self.classes_[codes[np.argmax(counts)]]


class LabelBinarizer(BaseEstimator, TransformerMixin):
"""Binarize labels in a one-vs-all fashion
Expand Down
32 changes: 32 additions & 0 deletions sklearn/preprocessing/tests/test_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,38 @@ def test_label_encoder(values, classes, unknown):
with pytest.raises(ValueError, match="unseen labels"):
le.transform(unknown)

with pytest.raises(ValueError, match="impute_method"):
LabelEncoder(impute_method='test').fit(values)

with pytest.raises(ValueError, match="impute_method"):
LabelEncoder(impute_method='test').fit_transform(values)


@pytest.mark.parametrize(
    "values, classes, unknown",
    [(np.array([2, 1, 3, 1, 3, 3], dtype='int64'),
      np.array([1, 2, 3], dtype='int64'),
      np.array([4], dtype='int64')),
     (np.array(['b', 'a', 'c', 'a', 'c', 'c'], dtype=object),
      np.array(['a', 'b', 'c'], dtype=object),
      np.array(['d'], dtype=object)),
     (np.array(['b', 'a', 'c', 'a', 'c', 'c']),
      np.array(['a', 'b', 'c']),
      np.array(['d']))],
    ids=['int64', 'object', 'str'])
def test_label_encoder_impute_most_common(values, classes, unknown):
    # With impute_method='most_common', a label unseen during fit must be
    # encoded as the most frequent training label instead of raising.
    expected_codes = [1, 0, 2, 0, 2, 2]
    # The last class occurs three times in `values`, so it is the most
    # common one and its code (2) is what unseen labels map to.
    imputed_codes = [2]

    encoder = LabelEncoder(impute_method='most_common')

    # Path 1: fit followed by transform.
    encoder.fit(values)
    assert_array_equal(encoder.classes_, classes)
    assert_array_equal(encoder.transform(values), expected_codes)
    assert_array_equal(encoder.inverse_transform(expected_codes), values)
    assert_array_equal(encoder.transform(unknown), imputed_codes)

    # Path 2: fit_transform must derive the same imputation value.
    refit_codes = encoder.fit_transform(values)
    assert_array_equal(refit_codes, expected_codes)
    assert_array_equal(encoder.transform(unknown), imputed_codes)


def test_label_encoder_negative_ints():
le = LabelEncoder()
Expand Down