
[MRG-0] Make LabelEncoder more friendly to new labels #3483


Closed
25 changes: 21 additions & 4 deletions doc/modules/preprocessing.rst
@@ -397,9 +397,9 @@ follows::
>>> from sklearn import preprocessing
>>> le = preprocessing.LabelEncoder()
>>> le.fit([1, 2, 2, 6])
LabelEncoder()
>>> le.classes_
array([1, 2, 6])
LabelEncoder(new_labels='raise')
>>> list(le.classes_)
[1, 2, 6]
>>> le.transform([1, 1, 2, 6])
array([0, 0, 1, 2])
>>> le.inverse_transform([0, 0, 1, 2])
@@ -410,14 +410,31 @@ hashable and comparable) to numerical labels::

>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder()
LabelEncoder(new_labels='raise')
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris"])
array([2, 2, 1])
>>> list(le.inverse_transform([2, 2, 1]))
['tokyo', 'tokyo', 'paris']

By default, ``LabelEncoder`` raises a ``ValueError`` when ``transform`` is
passed labels that were not seen during ``fit``. This behavior can be
controlled with the ``new_labels`` parameter, which supports ``"raise"``,
``"update"``, and integer strategies for handling new labels. For example,
the integer strategy assigns unseen values an arbitrary, user-specified
integer label (e.g., ``-1`` below).

>>> le = preprocessing.LabelEncoder(new_labels=-1)
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder(new_labels=-1)
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris", "rome"])
array([ 2, 2, 1, -1])
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo', 'rome']
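The two non-default strategies can also be sketched independently of the
proposed API; ``TinyEncoder`` below is a hypothetical, dict-based
illustration of the intended ``"update"`` and integer semantics, not
scikit-learn code:

```python
class TinyEncoder:
    """Minimal sketch of the proposed ``new_labels`` semantics."""

    def __init__(self, new_labels="raise"):
        self.new_labels = new_labels
        self.mapping = {}

    def fit(self, labels):
        # Classes are sorted and numbered 0..n_classes-1, as in LabelEncoder.
        self.mapping = {c: i for i, c in enumerate(sorted(set(labels)))}
        return self

    def transform(self, labels):
        out = []
        for label in labels:
            if label in self.mapping:
                out.append(self.mapping[label])
            elif self.new_labels == "update":
                # "update": append the new class after the known ones.
                self.mapping[label] = len(self.mapping)
                out.append(self.mapping[label])
            elif isinstance(self.new_labels, int):
                # Integer strategy: every unseen label gets the sentinel.
                out.append(self.new_labels)
            else:
                raise ValueError("y contains new labels: %r" % (label,))
        return out

enc = TinyEncoder(new_labels=-1).fit(["paris", "paris", "tokyo", "amsterdam"])
print(enc.transform(["tokyo", "tokyo", "paris", "rome"]))  # [2, 2, 1, -1]
```

Note that with the ``"update"`` strategy the encoder mutates its mapping at
transform time, which is why the reviewers below compare it to a
``partial_fit``.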

Imputation of missing values
============================
159 changes: 146 additions & 13 deletions sklearn/preprocessing/label.py
@@ -1,16 +1,18 @@
# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Mathieu Blondel <mathieu@mblondel.org>
# Olivier Grisel <olivier.grisel@ensta.org>
# Mathieu Blondel <mathieu@mblondel.org>
# Olivier Grisel <olivier.grisel@ensta.org>
Member: Could you undo those changes?

# Andreas Mueller <amueller@ais.uni-bonn.de>
# Joel Nothman <joel.nothman@gmail.com>
# Hamzeh Alsalhi <ha258@cornell.edu>
# Michael Bommarito <michael@bommaritollc.com>
# License: BSD 3 clause

from collections import defaultdict
import itertools
import array
import warnings

import operator
import numpy as np
import scipy.sparse as sp

@@ -53,10 +55,34 @@ def _check_numpy_unicode_bug(labels):
class LabelEncoder(BaseEstimator, TransformerMixin):
"""Encode labels with value between 0 and n_classes-1.

Parameters
----------

new_labels : string or int, optional (default: "raise")
Determines how to handle new labels, i.e., labels
not seen in the training domain.

- If ``"raise"``, then raise ValueError.
- If ``"update"``, then re-map the new labels to
classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels.
- If an integer value is passed, then re-label with this value.
Member: Could it work with a string label?

N.B.: default encoded values are in ``[0, 1, ...]``, so caution should be
taken if a non-negative integer is passed, to avoid accidentally
intersecting an existing class index. Additionally, ``inverse_transform``
will fail for a value that does not intersect with the ``fit``-time label
set.

Attributes
----------
`classes_` : array of shape (n_class,)
Holds the label for each class.
Property that holds the label for each class seen at ``fit`` time.
See ``get_classes()`` and ``set_classes()``, the getter and setter
for observed labels.

`fit_labels_` : array of shape (n_class,)

Member: This looks like a duplicate of classes_.

Contributor Author: Re: classes_, classes_ alone as an array cannot handle
all new_labels strategies we have included unless we abandon searchsorted
usage. I think it's also unclear what classes_ should return in the
new_labels context if users are relying on the element order returned to
map to the integer class label.

I won't have time to make this volume of changes for a few weeks.
Perhaps someone who has a better feel for the team's varied preferences
would be better able to get this functionality over the finish line. FWIW,
I have been using this branch in my own work without issue.


Stores the labels seen at ``fit``-time.

`new_label_mapping_` : dictionary

Member: Why not follow our convention and consider that the order in classes_ gives the id?

Stores the mapping for classes not seen during original ``fit``.

Examples
--------
@@ -65,7 +91,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
>>> from sklearn import preprocessing
>>> le = preprocessing.LabelEncoder()
>>> le.fit([1, 2, 2, 6])
LabelEncoder()
LabelEncoder(new_labels='raise')
>>> le.classes_
array([1, 2, 6])
>>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS
@@ -78,7 +104,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):

>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder()
LabelEncoder(new_labels='raise')
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS
Expand All @@ -88,10 +114,42 @@ class LabelEncoder(BaseEstimator, TransformerMixin):

"""

def __init__(self, new_labels="raise"):
"""Constructor"""
self.new_labels = new_labels
self.new_label_mapping_ = {}
self.fit_labels_ = []

def _check_fitted(self):
if not hasattr(self, "classes_"):
if len(self.fit_labels_) == 0:
raise ValueError("LabelEncoder was not fitted yet.")

def get_classes(self):

Member: Can you make this method private, as it doesn't really follow the API? Why not update the self.classes_ attribute?

"""Get classes that have been observed by the encoder. Note that this
method returns classes seen both at original ``fit`` time (i.e.,
``self.classes_``) and classes seen after ``fit`` (i.e.,
``self.new_label_mapping_.keys()``) for applicable values of
``new_labels``.

Returns
-------
classes : array-like of shape [n_classes]
"""
# If we've seen updates, include them in the order they were added.
if len(self.new_label_mapping_) > 0:
# Sort the post-fit time labels to return into the class array.
sorted_new, _ = zip(*sorted(self.new_label_mapping_.items(),
key=operator.itemgetter(1)))
return np.append(self.fit_labels_, sorted_new)
else:
return self.fit_labels_
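The ordering step in ``get_classes`` — sorting the post-``fit`` mapping by
its assigned integer id so the returned order matches the codes — can be
seen in isolation (a standalone sketch with made-up label values):

```python
import operator

# Hypothetical post-fit mapping: label -> assigned integer id.
new_label_mapping = {"rome": 4, "berlin": 3}

# Sort the (label, id) pairs by id, then keep only the labels, so the
# position of each label in the result matches its integer code.
sorted_new, _ = zip(*sorted(new_label_mapping.items(),
                            key=operator.itemgetter(1)))
print(sorted_new)  # ('berlin', 'rome')
```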

def set_classes(self, classes):

Member: This function adds a new method that is not part of the API. Furthermore, it's just one line, so I would inline it if needed.

"""Set the classes via property."""
self.fit_labels_ = classes

classes_ = property(get_classes, set_classes)

def fit(self, y):
"""Fit label encoder

@@ -104,9 +162,17 @@ def fit(self, y):
-------
self : returns an instance of self.
"""
# Check new_labels parameter
if self.new_labels not in ["update", "raise"] and \
not isinstance(self.new_labels, int):
# Raise on invalid argument.
raise ValueError("Value of argument `new_labels`={0} "
"is not 'update', 'raise', or an integer."
.format(self.new_labels))

y = column_or_1d(y, warn=True)
_check_numpy_unicode_bug(y)
self.classes_ = np.unique(y)
self.fit_labels_ = np.unique(y)
return self

def fit_transform(self, y):
@@ -121,9 +187,17 @@
-------
y : array-like of shape [n_samples]
"""
# Check new_labels parameter
if self.new_labels not in ["update", "raise"] and \
not isinstance(self.new_labels, int):
# Raise on invalid argument.
raise ValueError("Value of argument `new_labels`={0} "
"is not 'update', 'raise', or an integer."
.format(self.new_labels))

y = column_or_1d(y, warn=True)
_check_numpy_unicode_bug(y)
self.classes_, y = np.unique(y, return_inverse=True)
self.fit_labels_, y = np.unique(y, return_inverse=True)
return y

def transform(self, y):
@@ -142,10 +216,53 @@ def transform(self, y):

classes = np.unique(y)
_check_numpy_unicode_bug(classes)
if len(np.intersect1d(classes, self.classes_)) < len(classes):
diff = np.setdiff1d(classes, self.classes_)
raise ValueError("y contains new labels: %s" % str(diff))
return np.searchsorted(self.classes_, y)
if len(np.intersect1d(classes, self.get_classes())) < len(classes):
# Get the new classes
diff_fit = np.setdiff1d(classes, self.fit_labels_)
diff_new = np.setdiff1d(classes, self.get_classes())

# Create copy of array and return
y = np.array(y)

# If we are mapping new labels, get "new" ID and change in copy.
if self.new_labels == "update":

Member: This looks like a partial_fit method. Should we add a partial_fit to the label encoder?

# Update the new label mapping
next_label = len(self.get_classes())
self.new_label_mapping_.update(dict(zip(diff_new,
range(next_label,
next_label +
len(diff_new)))))

# Find entries with new labels
missing_mask = np.in1d(y, diff_fit)

# Populate return array properly by mask and return
out = np.searchsorted(self.fit_labels_, y)
out[missing_mask] = [self.new_label_mapping_[value]
for value in y[missing_mask]]
return out
elif type(self.new_labels) in [int]:
# Update the new label mapping
self.new_label_mapping_.update(dict(zip(diff_new,
[self.new_labels]
* len(diff_new))))

# Find entries with new labels
missing_mask = np.in1d(y, diff_fit)

# Populate return array properly by mask and return
out = np.searchsorted(self.fit_labels_, y)
out[missing_mask] = self.new_labels
return out
elif self.new_labels == "raise":
# Return ValueError, original behavior.
raise ValueError("y contains new labels: %s" % str(diff_fit))
else:
# Raise on invalid argument.
raise ValueError("Value of argument `new_labels`={0} "
"is unknown.".format(self.new_labels))

return np.searchsorted(self.fit_labels_, y)
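The masked-``searchsorted`` pattern used in ``transform`` can be illustrated
without NumPy; the sketch below uses the stdlib ``bisect`` module as a
simplified analogue of the integer strategy (``encode`` is an illustrative
name, not part of the PR):

```python
from bisect import bisect_left

def encode(fit_labels, y, sentinel=-1):
    """Map each value in y to its index in the sorted fit_labels;
    unseen values get the sentinel, as in the integer strategy."""
    out = []
    for value in y:
        i = bisect_left(fit_labels, value)       # searchsorted equivalent
        if i < len(fit_labels) and fit_labels[i] == value:
            out.append(i)                        # label seen at fit time
        else:
            out.append(sentinel)                 # unseen label -> sentinel
    return out

print(encode(["amsterdam", "paris", "tokyo"], ["tokyo", "rome"]))  # [2, -1]
```

The equality check after the bisection is what the PR achieves with its
``np.in1d`` mask: ``searchsorted``/``bisect_left`` alone only returns an
insertion point, so unseen values must be detected separately.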

def inverse_transform(self, y):
"""Transform labels back to original encoding.
@@ -161,8 +278,24 @@ def inverse_transform(self, y):
"""
self._check_fitted()

if type(self.new_labels) in [int]:
warnings.warn('When ``new_labels`` uses an integer '
're-labeling strategy, the ``inverse_transform`` '
'is not necessarily one-to-one mapping; any '
'labels not present during initial ``fit`` will '
'not be mapped.',
UserWarning)

y = np.asarray(y)
return self.classes_[y]
try:
return self.get_classes()[y]
except IndexError:
# Raise exception
num_classes = len(self.get_classes())
raise ValueError("``inverse_transform`` received labels never "
"seen at ``fit`` time: {0}"
.format(np.setdiff1d(y, range(num_classes))))
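The warning above guards against a real hazard: a sentinel such as ``-1`` is
a valid wrapping index into the class array, so a naive round trip silently
returns the wrong label instead of failing (plain-Python sketch):

```python
classes = ["amsterdam", "paris", "tokyo"]

# "tokyo" plus one unseen label that the integer strategy mapped to -1.
encoded = [2, -1]

# Indexing with -1 wraps around to the last class rather than raising,
# so the unseen label silently decodes as "tokyo".
decoded = [classes[i] for i in encoded]
print(decoded)  # ['tokyo', 'tokyo']
```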


class LabelBinarizer(BaseEstimator, TransformerMixin):
48 changes: 48 additions & 0 deletions sklearn/preprocessing/tests/test_label.py
@@ -210,6 +210,54 @@ def test_label_encoder():
assert_raises(ValueError, le.transform, [0, 6])


def test_label_encoder_get_classes():
"""Test LabelEncoder's get_classes method."""
le = LabelEncoder(new_labels="update")
le.fit([1, 1, 4, 5, -1, 0])
assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
assert_array_equal(le.get_classes(), le.classes_)
le.transform([10])
assert_array_equal(le.classes_, [-1, 0, 1, 4, 5, 10])


def test_label_encoder_new_label_update():
"""Test LabelEncoder's transform on new labels"""
le = LabelEncoder(new_labels="update")
le.fit(["a", "b", "b", "c"])
assert_array_equal(le.classes_, ["a", "b", "c"])
assert_array_equal(le.transform(["a", "a", "c"]),
[0, 0, 2])
assert_array_equal(le.inverse_transform([2, 1, 0]),
["c", "b", "a"])
assert_array_equal(le.transform(["b", "c", "_"]),
[1, 2, 3])
assert_array_equal(le.get_classes(), ["a", "b", "c", "_"])
assert_array_equal(le.transform(["_", "z", "a"]),
[3, 4, 0])
assert_array_equal(le.inverse_transform([3, 4, 0]),
["_", "z", "a"])


def test_label_encoder_new_label_replace():
"""Test LabelEncoder's transform on new labels"""
le = LabelEncoder(new_labels=-99)
le.fit(["a", "b", "b", "c"])
assert_array_equal(le.classes_, ["a", "b", "c"])
assert_array_equal(le.transform(["a", "a", "c"]),
[0, 0, 2])
assert_array_equal(le.inverse_transform([2, 1, 0]),
["c", "b", "a"])
assert_array_equal(le.transform(["b", "c", "d"]),
[1, 2, -99])
assert_warns(UserWarning, le.inverse_transform, [2, 1, 0])


def test_label_encoder_new_label_arg():
"""Test LabelEncoder's new_labels argument handling"""
le = LabelEncoder(new_labels="xyz")
assert_raises(ValueError, le.fit, ["a", "b", "b", "c"])


def test_label_encoder_fit_transform():
"""Test fit_transform"""
le = LabelEncoder()