[MRG] Label Encoder Unseen Labels #3599

Closed
wants to merge 18 commits into from
21 changes: 17 additions & 4 deletions doc/modules/preprocessing.rst
@@ -397,9 +397,9 @@ follows::
>>> from sklearn import preprocessing
>>> le = preprocessing.LabelEncoder()
>>> le.fit([1, 2, 2, 6])
LabelEncoder()
>>> le.classes_
array([1, 2, 6])
LabelEncoder(classes=None, new_labels=None)
>>> list(le.classes_)
[1, 2, 6]
>>> le.transform([1, 1, 2, 6])
array([0, 0, 1, 2])
>>> le.inverse_transform([0, 0, 1, 2])
@@ -410,14 +410,27 @@ hashable and comparable) to numerical labels::

>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder()
LabelEncoder(classes=None, new_labels=None)
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris"])
array([2, 2, 1])
>>> list(le.inverse_transform([2, 2, 1]))
['tokyo', 'tokyo', 'paris']

By default, ``LabelEncoder`` raises a ``ValueError`` when ``transform`` is
given labels that were not seen during ``fit``. This behavior can be changed
with the ``new_labels`` parameter, which assigns an arbitrary, user-specified
integer label to the unseen values (``-1`` in the example below)::

>>> le = preprocessing.LabelEncoder(new_labels=-1)
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder(classes=None, new_labels=-1)
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris", "rome"])
array([ 2, 2, 1, -1])
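
Under the hood the re-labeling is plain NumPy: unseen values first receive
their ``searchsorted`` position and are then overwritten with ``new_labels``.
A rough standalone equivalent (a sketch only; ``encode_with_fallback`` is a
hypothetical helper, not part of scikit-learn)::

    import numpy as np

    def encode_with_fallback(values, classes, fallback=-1):
        classes = np.asarray(classes)           # assumed unique and sorted
        encoded = np.searchsorted(classes, values)
        unseen = np.setdiff1d(values, classes)  # labels never seen at fit time
        encoded[np.in1d(values, unseen)] = fallback
        return encoded

    encode_with_fallback(["tokyo", "rome"], ["amsterdam", "paris", "tokyo"])
    # -> array([ 2, -1])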

Imputation of missing values
============================
74 changes: 63 additions & 11 deletions sklearn/preprocessing/label.py
@@ -4,6 +4,7 @@
# Andreas Mueller <amueller@ais.uni-bonn.de>
# Joel Nothman <joel.nothman@gmail.com>
# Hamzeh Alsalhi <ha258@cornell.edu>
# Michael Bommarito <michael@bommaritollc.com>
# License: BSD 3 clause

from collections import defaultdict
@@ -53,6 +54,22 @@ def _check_numpy_unicode_bug(labels):
class LabelEncoder(BaseEstimator, TransformerMixin):
"""Encode labels with value between 0 and n_classes-1.

Parameters
----------
classes : array-like of shape [n_class], optional (default: None)
List of unique, sorted labels to encode the target data against.
Passing this parameter at initialization allows ``transform`` to be
called without a prior call to ``fit``.

new_labels : int, optional (default: None)
Integer label to assign to values not seen during ``fit`` when they
are passed to ``transform``. Note that regular encoded labels start
at 0, so a non-negative value here may collide with an encoded
class. ``inverse_transform`` will also fail for values that are not
part of the ``fit``-time label set.

Attributes
----------
`classes_` : array of shape (n_class,)
@@ -65,7 +82,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
>>> from sklearn import preprocessing
>>> le = preprocessing.LabelEncoder()
>>> le.fit([1, 2, 2, 6])
LabelEncoder()
LabelEncoder(classes=None, new_labels=None)
>>> le.classes_
array([1, 2, 6])
>>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS
@@ -78,7 +95,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):

>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder()
LabelEncoder(classes=None, new_labels=None)
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS
@@ -88,6 +105,17 @@ class LabelEncoder(BaseEstimator, TransformerMixin):

"""

def __init__(self, classes=None, new_labels=None):
if classes is not None:
self.classes_ = np.asarray(classes)

if new_labels is not None and type(new_labels) is not int:
raise ValueError("Value of argument `new_labels`={0} is "
"unknown and not an "
"integer.".format(new_labels))

self.new_labels = new_labels

def _check_fitted(self):
if not hasattr(self, "classes_"):
raise ValueError("LabelEncoder was not fitted yet.")
@@ -126,26 +154,41 @@ def fit_transform(self, y):
self.classes_, y = np.unique(y, return_inverse=True)
return y

def transform(self, y):
def transform(self, y, classes=None):
"""Transform labels to normalized encoding.

Parameters
----------
y : array-like of shape [n_samples]
Target values.

classes : array-like of shape [n_class], optional (default: None)
List of unique sorted labels to encode the target data against.
If None, the LabelEncoder must already have been fitted and the
classes learned during ``fit`` are used.

Returns
-------
y : array-like of shape [n_samples]
"""
self._check_fitted()
if classes is None:
self._check_fitted()
classes = self.classes_

classes = np.unique(y)
_check_numpy_unicode_bug(classes)
if len(np.intersect1d(classes, self.classes_)) < len(classes):
diff = np.setdiff1d(classes, self.classes_)
raise ValueError("y contains new labels: %s" % str(diff))
return np.searchsorted(self.classes_, y)
y_classes = np.unique(y)
_check_numpy_unicode_bug(y_classes)
if len(np.intersect1d(y_classes, classes)) < len(y_classes):
# Get the new labels
unseen = np.setdiff1d(y_classes, classes)

if type(self.new_labels) is int:
ret = np.searchsorted(classes, y)
ret[np.in1d(y, unseen)] = self.new_labels
return ret
elif self.new_labels is None:
raise ValueError("y contains new label(s): %s" % str(unseen))

return np.searchsorted(classes, y)

def inverse_transform(self, y):
"""Transform labels back to original encoding.
@@ -161,6 +204,14 @@ def inverse_transform(self, y):
"""
self._check_fitted()

if type(self.new_labels) is int:
warnings.warn('When ``new_labels`` uses an integer '
're-labeling strategy, ``inverse_transform`` is '
'not necessarily a one-to-one mapping; any '
'labels not present during the initial ``fit`` '
'will not be mapped.',
UserWarning)

y = np.asarray(y)
return self.classes_[y]

@@ -446,7 +497,8 @@ def label_binarize(y, classes, neg_label=0, pos_label=1,
allow for fitting to classes independently of the transform operation
"""
if not isinstance(y, list):
# XXX Workaround that will be removed when list of list format is dropped
# XXX Workaround that will be removed when list of list format is
# dropped
y = check_array(y, accept_sparse='csr', ensure_2d=False)
if neg_label >= pos_label:
raise ValueError("neg_label={0} must be strictly less than "
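Taken together, the new constructor arguments allow usage along these lines
(a sketch based on the diff above; exact array reprs may differ, and
``inverse_transform`` emits the ``UserWarning`` discussed in the docstring)::

    from sklearn.preprocessing import LabelEncoder

    # Pre-seeding ``classes`` makes the encoder usable without calling ``fit``
    le = LabelEncoder(classes=["amsterdam", "paris", "tokyo"], new_labels=-1)

    le.transform(["paris", "rome"])   # array([ 1, -1]); "rome" was never seen
    le.inverse_transform([1, 0])      # array(['paris', 'amsterdam'], ...), plus a UserWarning
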
33 changes: 33 additions & 0 deletions sklearn/preprocessing/tests/test_label.py
@@ -210,6 +210,37 @@ def test_label_encoder():
assert_raises(ValueError, le.transform, [0, 6])


def test_label_encoder_new_label_replace():
"""Test LabelEncoder's transform on new labels"""
le = LabelEncoder(new_labels=-99)
le.fit(["a", "b", "b", "c"])
assert_array_equal(le.classes_, ["a", "b", "c"])
assert_array_equal(le.transform(["a", "a", "c"]), [0, 0, 2])
assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"])
assert_array_equal(le.transform(["b", "c", "d"]), [1, 2, -99])


def test_label_encoder_classes_parameter():
"""Test LabelEncoder's classes parameter"""
le = LabelEncoder(classes=["a", "b", "c"], new_labels=None)
assert_array_equal(le.classes_, ["a", "b", "c"])
assert_array_equal(le.transform(["c", "a", "b", "c"],), [2, 0, 1, 2])
assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"])
assert_raises(ValueError, le.transform, ["b", "c", "d"])


def test_label_encoder_transform_classes_parameter():
"""Test LabelEncoder's transform using the classes parameter"""
le = LabelEncoder(new_labels=None)
le.fit(["a", "b", "b", "c"])
assert_array_equal(le.classes_, ["a", "b", "c"])
assert_array_equal(le.transform(["d", "f", "e", "e"],
classes=["d", "e", "f"]),
[0, 2, 1, 1])
assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"])
assert_raises(ValueError, le.transform, ["b", "c", "d"])


def test_label_encoder_fit_transform():
"""Test fit_transform"""
le = LabelEncoder()
@@ -226,6 +257,8 @@ def test_label_encoder_errors():
le = LabelEncoder()
assert_raises(ValueError, le.transform, [])
assert_raises(ValueError, le.inverse_transform, [])
# Fail on an invalid (non-integer) value for the 'new_labels' parameter
assert_raises(ValueError, LabelEncoder, new_labels="xyz")


def test_sparse_output_multilabel_binarizer():
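
One behaviour the new tests do not cover is the ``inverse_transform`` warning.
A test along these lines could pin it down (hypothetical, assuming
``assert_warns`` from ``sklearn.utils.testing`` and the test module's
existing imports)::

    from sklearn.utils.testing import assert_warns

    def test_label_encoder_new_label_inverse_warning():
        """Test that inverse_transform warns when an integer new_labels is set"""
        le = LabelEncoder(new_labels=-1)
        le.fit(["a", "b", "c"])
        encoded = le.transform(["a", "d"])  # "d" is unseen and becomes -1
        # -1 silently indexes the last class, hence the UserWarning
        assert_warns(UserWarning, le.inverse_transform, encoded)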