[MRG] Label Encoder Unseen Labels #3599

Closed
wants to merge 18 commits into from
21 changes: 17 additions & 4 deletions doc/modules/preprocessing.rst
@@ -397,9 +397,9 @@ follows::
>>> from sklearn import preprocessing
>>> le = preprocessing.LabelEncoder()
>>> le.fit([1, 2, 2, 6])
LabelEncoder()
>>> le.classes_
array([1, 2, 6])
LabelEncoder(classes=None, new_labels=None)
>>> list(le.classes_)
[1, 2, 6]
>>> le.transform([1, 1, 2, 6])
array([0, 0, 1, 2])
>>> le.inverse_transform([0, 0, 1, 2])
@@ -410,14 +410,27 @@ hashable and comparable) to numerical labels::

>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder()
LabelEncoder(classes=None, new_labels=None)
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris"])
array([2, 2, 1])
>>> list(le.inverse_transform([2, 2, 1]))
['tokyo', 'tokyo', 'paris']

By default, ``LabelEncoder`` raises a ``ValueError`` when ``transform`` is
given labels that were not seen during ``fit``. This behavior can be changed
with the ``new_labels`` parameter, which assigns an arbitrary, user-specified
integer label to the unseen values (``-1`` in the example below)::

>>> le = preprocessing.LabelEncoder(new_labels=-1)
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder(classes=None, new_labels=-1)
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris", "rome"])
array([ 2, 2, 1, -1])
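
Under the hood the re-labeling is plain NumPy: unseen values first receive
their ``searchsorted`` position and are then overwritten with ``new_labels``.
A rough standalone equivalent (a sketch only; ``encode_with_fallback`` is a
hypothetical helper, not part of scikit-learn)::

    import numpy as np

    def encode_with_fallback(values, classes, fallback=-1):
        classes = np.asarray(classes)           # assumed unique and sorted
        encoded = np.searchsorted(classes, values)
        unseen = np.setdiff1d(values, classes)  # labels never seen at fit time
        encoded[np.in1d(values, unseen)] = fallback
        return encoded

    encode_with_fallback(["tokyo", "rome"], ["amsterdam", "paris", "tokyo"])
    # -> array([ 2, -1])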

Imputation of missing values
============================
74 changes: 63 additions & 11 deletions sklearn/preprocessing/label.py
@@ -4,6 +4,7 @@
# Andreas Mueller <amueller@ais.uni-bonn.de>
# Joel Nothman <joel.nothman@gmail.com>
# Hamzeh Alsalhi <ha258@cornell.edu>
# Michael Bommarito <michael@bommaritollc.com>
# License: BSD 3 clause

from collections import defaultdict
@@ -53,6 +54,22 @@ def _check_numpy_unicode_bug(labels):
class LabelEncoder(BaseEstimator, TransformerMixin):
"""Encode labels with value between 0 and n_classes-1.

Parameters
----------
classes : array-like of shape [n_class], optional (default: None)
List of unique, sorted labels to encode the target data against.
Passing this parameter at initialization allows ``transform`` to be
called without a prior call to ``fit``.

new_labels : int, optional (default: None)
Integer label to assign to values not seen during ``fit`` when they
are passed to ``transform``. Note that regular encoded labels start
at 0, so a non-negative value here may collide with an encoded
class. ``inverse_transform`` will also fail for values that are not
part of the ``fit``-time label set.

Attributes
----------
`classes_` : array of shape (n_class,)
@@ -65,7 +82,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
>>> from sklearn import preprocessing
>>> le = preprocessing.LabelEncoder()
>>> le.fit([1, 2, 2, 6])
LabelEncoder()
LabelEncoder(classes=None, new_labels=None)
>>> le.classes_
array([1, 2, 6])
>>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS
@@ -78,7 +95,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):

>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder()
LabelEncoder(classes=None, new_labels=None)
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS
@@ -88,6 +105,17 @@ class LabelEncoder(BaseEstimator, TransformerMixin):

"""

def __init__(self, classes=None, new_labels=None):
if classes is not None:
self.classes_ = np.asarray(classes)

if new_labels is not None and type(new_labels) is not int:
raise ValueError("Value of argument `new_labels`={0} is "
"unknown and not an "
"integer.".format(new_labels))

self.new_labels = new_labels

def _check_fitted(self):
if not hasattr(self, "classes_"):
raise ValueError("LabelEncoder was not fitted yet.")
@@ -126,26 +154,41 @@ def fit_transform(self, y):
self.classes_, y = np.unique(y, return_inverse=True)
return y

def transform(self, y):
def transform(self, y, classes=None):
"""Transform labels to normalized encoding.

Parameters
----------
y : array-like of shape [n_samples]
Target values.

classes : array-like of shape [n_class], optional (default: None)
List of unique sorted labels to encode the target data against.
If None, the LabelEncoder must already have been fitted and the
classes learned during ``fit`` are used.

Returns
-------
y : array-like of shape [n_samples]
"""
self._check_fitted()
if classes is None:
self._check_fitted()
classes = self.classes_

classes = np.unique(y)
_check_numpy_unicode_bug(classes)
if len(np.intersect1d(classes, self.classes_)) < len(classes):
diff = np.setdiff1d(classes, self.classes_)
raise ValueError("y contains new labels: %s" % str(diff))
return np.searchsorted(self.classes_, y)
y_classes = np.unique(y)
_check_numpy_unicode_bug(y_classes)
if len(np.intersect1d(y_classes, classes)) < len(y_classes):
# Get the new labels
unseen = np.setdiff1d(y_classes, classes)

if type(self.new_labels) is int:
ret = np.searchsorted(classes, y)
ret[np.in1d(y, unseen)] = self.new_labels
return ret
elif self.new_labels is None:
raise ValueError("y contains new label(s): %s" % str(unseen))

return np.searchsorted(classes, y)

def inverse_transform(self, y):
"""Transform labels back to original encoding.
@@ -161,6 +204,14 @@ def inverse_transform(self, y):
"""
self._check_fitted()

if type(self.new_labels) is int:
warnings.warn('When ``new_labels`` uses an integer '
're-labeling strategy, ``inverse_transform`` is '
'not necessarily a one-to-one mapping; any '
'labels not present during the initial ``fit`` '
'will not be mapped.',
UserWarning)

y = np.asarray(y)
return self.classes_[y]

@@ -446,7 +497,8 @@ def label_binarize(y, classes, neg_label=0, pos_label=1,
allow for fitting to classes independently of the transform operation
"""
if not isinstance(y, list):
# XXX Workaround that will be removed when list of list format is dropped
# XXX Workaround that will be removed when list of list format is
# dropped
y = check_array(y, accept_sparse='csr', ensure_2d=False)
if neg_label >= pos_label:
raise ValueError("neg_label={0} must be strictly less than "
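Taken together, the new constructor arguments allow usage along these lines
(a sketch based on the diff above; exact array reprs may differ, and
``inverse_transform`` emits the ``UserWarning`` discussed in the docstring)::

    from sklearn.preprocessing import LabelEncoder

    # Pre-seeding ``classes`` makes the encoder usable without calling ``fit``
    le = LabelEncoder(classes=["amsterdam", "paris", "tokyo"], new_labels=-1)

    le.transform(["paris", "rome"])   # array([ 1, -1]); "rome" was never seen
    le.inverse_transform([1, 0])      # array(['paris', 'amsterdam'], ...), plus a UserWarning
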
33 changes: 33 additions & 0 deletions sklearn/preprocessing/tests/test_label.py
@@ -210,6 +210,37 @@ def test_label_encoder():
assert_raises(ValueError, le.transform, [0, 6])


def test_label_encoder_new_label_replace():
"""Test LabelEncoder's transform on new labels"""
le = LabelEncoder(new_labels=-99)
le.fit(["a", "b", "b", "c"])
assert_array_equal(le.classes_, ["a", "b", "c"])
assert_array_equal(le.transform(["a", "a", "c"]), [0, 0, 2])
assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"])
assert_array_equal(le.transform(["b", "c", "d"]), [1, 2, -99])


def test_label_encoder_classes_parameter():
"""Test LabelEncoder's classes parameter"""
le = LabelEncoder(classes=["a", "b", "c"], new_labels=None)
assert_array_equal(le.classes_, ["a", "b", "c"])
assert_array_equal(le.transform(["c", "a", "b", "c"],), [2, 0, 1, 2])
assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"])
assert_raises(ValueError, le.transform, ["b", "c", "d"])


def test_label_encoder_transform_classes_parameter():
"""Test LabelEncoder's transform using the classes parameter"""
le = LabelEncoder(new_labels=None)
le.fit(["a", "b", "b", "c"])
assert_array_equal(le.classes_, ["a", "b", "c"])
assert_array_equal(le.transform(["d", "f", "e", "e"],
classes=["d", "e", "f"]),
[0, 2, 1, 1])
assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"])
assert_raises(ValueError, le.transform, ["b", "c", "d"])


def test_label_encoder_fit_transform():
"""Test fit_transform"""
le = LabelEncoder()
@@ -226,6 +257,8 @@ def test_label_encoder_errors():
le = LabelEncoder()
assert_raises(ValueError, le.transform, [])
assert_raises(ValueError, le.inverse_transform, [])
# Fail on an invalid (non-integer) value for the 'new_labels' parameter
assert_raises(ValueError, LabelEncoder, new_labels="xyz")


def test_sparse_output_multilabel_binarizer():
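
One behaviour the new tests do not cover is the ``inverse_transform`` warning.
A test along these lines could pin it down (hypothetical, assuming
``assert_warns`` from ``sklearn.utils.testing`` and the test module's
existing imports)::

    from sklearn.utils.testing import assert_warns

    def test_label_encoder_new_label_inverse_warning():
        """Test that inverse_transform warns when an integer new_labels is set"""
        le = LabelEncoder(new_labels=-1)
        le.fit(["a", "b", "c"])
        encoded = le.transform(["a", "d"])  # "d" is unseen and becomes -1
        # -1 silently indexes the last class, hence the UserWarning
        assert_warns(UserWarning, le.inverse_transform, encoded)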