diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 4d3b04ade3c7b..6f4b2d66b6dfc 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -397,9 +397,9 @@ follows:: >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder() - >>> le.classes_ - array([1, 2, 6]) + LabelEncoder(classes=None, new_labels=None) + >>> list(le.classes_) + [1, 2, 6] >>> le.transform([1, 1, 2, 6]) array([0, 0, 1, 2]) >>> le.inverse_transform([0, 0, 1, 2]) @@ -410,7 +410,7 @@ hashable and comparable) to numerical labels:: >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder() + LabelEncoder(classes=None, new_labels=None) >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) @@ -418,6 +418,19 @@ hashable and comparable) to numerical labels:: >>> list(le.inverse_transform([2, 2, 1])) ['tokyo', 'tokyo', 'paris'] +By default, ``LabelEncoder`` will throw a ``ValueError`` in the event that +labels are passed in ``transform`` that were not seen in ``fit``. This +behavior can be handled with the ``new_labels`` parameter, which will assign +the unseen values an arbitrary, user-specified integer label (e.g., ``-1`` +below). 
+ + >>> le = preprocessing.LabelEncoder(new_labels=-1) + >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) + LabelEncoder(classes=None, new_labels=-1) + >>> list(le.classes_) + ['amsterdam', 'paris', 'tokyo'] + >>> le.transform(["tokyo", "tokyo", "paris", "rome"]) + array([ 2, 2, 1, -1]) Imputation of missing values ============================ diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index f304bf6104cae..5ef1cafcb2c3b 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -4,6 +4,7 @@ # Andreas Mueller # Joel Nothman # Hamzeh Alsalhi +# Michael Bommarito # License: BSD 3 clause from collections import defaultdict @@ -53,6 +54,22 @@ def _check_numpy_unicode_bug(labels): class LabelEncoder(BaseEstimator, TransformerMixin): """Encode labels with value between 0 and n_classes-1. + Parameters + ---------- + + classes : array-like of shape [n_class], optional (default: None) + Holds the label for each class. List of unique sorted labels to encode + the target data against. Using this parameter in initialization will + allow skipping a call to fit before calling transform. + + + new_labels : int, optional (default: None) + Re-label any labels not seen during ``fit`` with this value. + N.B. that default values are in [0, 1, ...], so caution should be + taken if a non-negative value is passed to not accidentally + intersect. Additionally, ``inverse_transform`` will fail for a + value that does not intersect with the ``fit``-time label set. 
+ Attributes ---------- `classes_` : array of shape (n_class,) @@ -65,7 +82,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder() + LabelEncoder(classes=None, new_labels=None) >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS @@ -78,7 +95,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder() + LabelEncoder(classes=None, new_labels=None) >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS @@ -88,6 +105,17 @@ class LabelEncoder(BaseEstimator, TransformerMixin): """ + def __init__(self, classes=None, new_labels=None): + if classes is not None: + self.classes_ = np.asarray(classes) + + if new_labels is not None and type(new_labels) is not int: + raise ValueError("Value of argument `new_labels`={0} is " + "unknown and not an " + "integer.".format(new_labels)) + + self.new_labels = new_labels + def _check_fitted(self): if not hasattr(self, "classes_"): raise ValueError("LabelEncoder was not fitted yet.") @@ -126,7 +154,7 @@ def fit_transform(self, y): self.classes_, y = np.unique(y, return_inverse=True) return y - def transform(self, y): + def transform(self, y, classes=None): """Transform labels to normalized encoding. Parameters @@ -134,18 +162,33 @@ def transform(self, y): y : array-like of shape [n_samples] Target values. + classes : array-like of shape [n_class], optional (default: None) + List of unique sorted labels to encode the target data against. + If None the LabelEncoder must have already been fit and the unique + labels from the fit will be used. 
+ Returns ------- y : array-like of shape [n_samples] """ - self._check_fitted() + if classes is None: + self._check_fitted() + classes = self.classes_ - classes = np.unique(y) - _check_numpy_unicode_bug(classes) - if len(np.intersect1d(classes, self.classes_)) < len(classes): - diff = np.setdiff1d(classes, self.classes_) - raise ValueError("y contains new labels: %s" % str(diff)) - return np.searchsorted(self.classes_, y) + y_classes = np.unique(y) + _check_numpy_unicode_bug(y_classes) + if len(np.intersect1d(y_classes, classes)) < len(y_classes): + # Get the new labels + unseen = np.setdiff1d(y_classes, classes) + + if type(self.new_labels) is int: + ret = np.searchsorted(classes, y) + ret[np.in1d(y, unseen)] = self.new_labels + return ret + elif self.new_labels is None: + raise ValueError("y contains new label(s): %s" % str(unseen)) + + return np.searchsorted(classes, y) def inverse_transform(self, y): """Transform labels back to original encoding. @@ -161,6 +204,14 @@ def inverse_transform(self, y): """ self._check_fitted() + if type(self.new_labels) is int: + warnings.warn('When ``new_labels`` uses an integer ' + 're-labeling strategy, the ``inverse_transform`` ' + 'is not necessarily one-to-one mapping; any ' + 'labels not present during initial ``fit`` will ' + 'not be mapped.', + UserWarning) + y = np.asarray(y) return self.classes_[y] @@ -446,7 +497,8 @@ def label_binarize(y, classes, neg_label=0, pos_label=1, allow for fitting to classes independently of the transform operation """ if not isinstance(y, list): - # XXX Workaround that will be removed when list of list format is dropped + # XXX Workaround that will be removed when list of list format is + # dropped y = check_array(y, accept_sparse='csr', ensure_2d=False) if neg_label >= pos_label: raise ValueError("neg_label={0} must be strictly less than " diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index dfdb2d23a2134..60521a670b1cb 100644 --- 
a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -210,6 +210,37 @@ def test_label_encoder(): assert_raises(ValueError, le.transform, [0, 6]) +def test_label_encoder_new_label_replace(): + """Test LabelEncoder's transform on new labels""" + le = LabelEncoder(new_labels=-99) + le.fit(["a", "b", "b", "c"]) + assert_array_equal(le.classes_, ["a", "b", "c"]) + assert_array_equal(le.transform(["a", "a", "c"]), [0, 0, 2]) + assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"]) + assert_array_equal(le.transform(["b", "c", "d"]), [1, 2, -99]) + + +def test_label_encoder_classes_parameter(): + """Test LabelEncoder's classes parameter""" + le = LabelEncoder(classes=["a", "b", "c"], new_labels=None) + assert_array_equal(le.classes_, ["a", "b", "c"]) + assert_array_equal(le.transform(["c", "a", "b", "c"],), [2, 0, 1, 2]) + assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"]) + assert_raises(ValueError, le.transform, ["b", "c", "d"]) + + +def test_label_encoder_transform_classes_parameter(): + """Test LabelEncoder's transform using the classes parameter""" + le = LabelEncoder(new_labels=None) + le.fit(["a", "b", "b", "c"]) + assert_array_equal(le.classes_, ["a", "b", "c"]) + assert_array_equal(le.transform(["d", "f", "e", "e"], + classes=["d", "e", "f"]), + [0, 2, 1, 1]) + assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"]) + assert_raises(ValueError, le.transform, ["b", "c", "d"]) + + def test_label_encoder_fit_transform(): """Test fit_transform""" le = LabelEncoder() @@ -226,6 +257,8 @@ def test_label_encoder_errors(): le = LabelEncoder() assert_raises(ValueError, le.transform, []) assert_raises(ValueError, le.inverse_transform, []) + # Fail on unrecognized value for the 'new_labels' parameter + assert_raises(ValueError, LabelEncoder, new_labels="xyz") def test_sparse_output_multilabel_binarizer():