From 7fabf542c48568ddb001174e23aef34d8aa4a9f6 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Thu, 24 Jul 2014 08:17:42 -0400 Subject: [PATCH 01/18] Clean commit for PR 3243 --- doc/modules/preprocessing.rst | 18 +++- sklearn/preprocessing/label.py | 112 ++++++++++++++++++++-- sklearn/preprocessing/tests/test_label.py | 46 +++++++++ 3 files changed, 167 insertions(+), 9 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 4d3b04ade3c7b..2915eb9c45dd1 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -397,7 +397,7 @@ follows:: >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder() + LabelEncoder(new_label_class=-1, new_labels='raise') >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) @@ -410,7 +410,7 @@ hashable and comparable) to numerical labels:: >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder() + LabelEncoder(new_label_class=-1, new_labels='raise') >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) @@ -418,6 +418,20 @@ hashable and comparable) to numerical labels:: >>> list(le.inverse_transform([2, 2, 1])) ['tokyo', 'tokyo', 'paris'] +By default, ``LabelEncoder`` will throw a ``ValueError`` in the event that +labels are passed in ``transform`` that were not seen in ``fit``. This +behavior can be handled with the ``new_labels`` parameter, which supports +``"raise"``, ``"nan"``, ``"update"``, and ``"label"`` strategies for +handling new labels. For example, the ``"label"`` strategy will assign +the unseen values a label of ``-1``. + + >>> le = preprocessing.LabelEncoder(new_labels="label") + >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) + LabelEncoder(new_label_class=-1, new_labels='label') + >>> list(le.classes_) + ['amsterdam', 'paris', 'tokyo'] + >>> le.transform(["tokyo", "tokyo", "paris", "rome"]) + array([ 2, 2, 1, -1]) Imputation of missing values ============================ diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index f304bf6104cae..a319ee0e55ffd 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -1,6 +1,6 @@ # Authors: Alexandre Gramfort -# Mathieu Blondel -# Olivier Grisel +# Mathieu Blondel +# Olivier Grisel # Andreas Mueller # Joel Nothman # Hamzeh Alsalhi @@ -10,7 +10,9 @@ import itertools import array import warnings +import operator +import operator import numpy as np import scipy.sparse as sp @@ -53,11 +55,29 @@ def _check_numpy_unicode_bug(labels): class LabelEncoder(BaseEstimator, TransformerMixin): """Encode labels with value between 0 and n_classes-1. + Parameters + ---------- + + new_labels : string, optional (default: "raise") + Determines how to handle new labels, i.e., data + not seen in the training domain. + + - If ``"raise"``, then raise ValueError. + - If ``"update"``, then re-map the new labels to + classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels. + - If an integer value is passed, then use re-label with this value. + N.B. that default values are in [0, 1, ...], so caution should be + taken if a non-negative value is passed to not accidentally + intersect. + Attributes ---------- `classes_` : array of shape (n_class,) Holds the label for each class. + `new_label_mapping_` : dictionary + Stores the mapping for classes not seen during original ``fit``. 
+ Examples -------- `LabelEncoder` can be used to normalize labels. @@ -65,7 +85,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder() + LabelEncoder(new_label_class=-1, new_labels='raise') >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS @@ -78,7 +98,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder() + LabelEncoder(new_label_class=-1, new_labels='raise') >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS @@ -88,10 +108,34 @@ class LabelEncoder(BaseEstimator, TransformerMixin): """ + def __init__(self, new_labels="raise"): + """Constructor""" + self.new_labels = new_labels + self.new_label_mapping_ = {} + def _check_fitted(self): if not hasattr(self, "classes_"): raise ValueError("LabelEncoder was not fitted yet.") + def get_classes(self): + """Get classes that have been observed by the encoder. Note that this + method returns classes seen both at original ``fit`` time (i.e., + ``self.classes_``) and classes seen after ``fit`` (i.e., + ``self.new_label_mapping_.keys()``) for applicable values of + ``new_labels``. + + Returns + ------- + classes : array-like of shape [n_classes] + """ + # If we've seen updates, include them in the order they were added. + if len(self.new_label_mapping_) > 0: + sorted_new, _ = zip(*sorted(self.new_label_mapping_.iteritems(), + key=operator.itemgetter(1))) + return np.append(self.classes_, sorted_new) + else: + return self.classes_ + def fit(self, y): """Fit label encoder @@ -104,6 +148,14 @@ def fit(self, y): ------- self : returns an instance of self. """ + # Check new_labels parameter + if self.new_labels not in ["update", "raise"] and \ + type(self.new_labels) not in [int]: + # Raise on invalid argument. + raise ValueError("Value of argument `new_labels`={0} " + "is unknown and not integer." + .format(self.new_labels)) + y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) self.classes_ = np.unique(y) @@ -121,6 +173,14 @@ def fit_transform(self, y): ------- y : array-like of shape [n_samples] """ + # Check new_labels parameter + if self.new_labels not in ["update", "raise"] and \ + type(self.new_labels) not in [int]: + # Raise on invalid argument. + raise ValueError("Value of argument `new_labels`={0} " + "is unknown and not integer." + .format(self.new_labels)) + y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) self.classes_, y = np.unique(y, return_inverse=True) @@ -142,9 +202,47 @@ def transform(self, y): classes = np.unique(y) _check_numpy_unicode_bug(classes) - if len(np.intersect1d(classes, self.classes_)) < len(classes): - diff = np.setdiff1d(classes, self.classes_) - raise ValueError("y contains new labels: %s" % str(diff)) + if len(np.intersect1d(classes, self.get_classes())) < len(classes): + # Get the new classes + diff_fit = np.setdiff1d(classes, self.classes_) + diff_new = np.setdiff1d(classes, self.get_classes()) + + # Create copy of array and return + y = np.array(y) + + # If we are mapping new labels, get "new" ID and change in copy. 
+ if self.new_labels == "update": + # Update the new label mapping + next_label = len(self.get_classes()) + self.new_label_mapping_.update(dict(zip(diff_new, + range(next_label, + next_label + + len(diff_new))))) + + # Find entries with new labels + missing_mask = np.in1d(y, diff_fit) + + # Populate return array properly by mask and return + out = np.searchsorted(self.classes_, y) + out[missing_mask] = [self.new_label_mapping_[value] + for value in y[missing_mask]] + return out + elif type(self.new_labels) in [int]: + # Find entries with new labels + missing_mask = np.in1d(y, diff_fit) + + # Populate return array properly by mask and return + out = np.searchsorted(self.classes_, y) + out[missing_mask] = self.new_labels + return out + elif self.new_labels == "raise": + # Return ValueError, original behavior. + raise ValueError("y contains new labels: %s" % str(diff_fit)) + else: + # Raise on invalid argument. + raise ValueError("Value of argument `new_labels`={0} " + "is unknown.".format(self.new_labels)) + return np.searchsorted(self.classes_, y) def inverse_transform(self, y): diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index dfdb2d23a2134..70a47fcffd498 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -210,6 +210,52 @@ def test_label_encoder(): assert_raises(ValueError, le.transform, [0, 6]) +def test_label_encoder_get_classes(): + """Test LabelEncoder's get_classes method.""" + le = LabelEncoder(new_labels="update") + le.fit([1, 1, 4, 5, -1, 0]) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) + assert_array_equal(le.classes_, le.get_classes()) + le.transform([10]) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) + assert_array_equal(le.get_classes(), [-1, 0, 1, 4, 5, 10]) + + +def test_label_encoder_new_label_update(): + """Test LabelEncoder's transform on new labels""" + le = LabelEncoder(new_labels="update") + le.fit(["a", "b", "b", "c"]) + assert_array_equal(le.classes_, ["a", "b", "c"]) + assert_array_equal(le.transform(["a", "a", "c"]), + [0, 0, 2]) + assert_array_equal(le.inverse_transform([2, 1, 0]), + ["c", "b", "a"]) + assert_array_equal(le.transform(["b", "c", "_"]), + [1, 2, 3]) + assert_array_equal(le.get_classes(), ["a", "b", "c", "_"]) + assert_array_equal(le.transform(["_", "z", "a"]), + [3, 4, 0]) + + +def test_label_encoder_new_label_replace(): + """Test LabelEncoder's transform on new labels""" + le = LabelEncoder(new_labels=-99) + le.fit(["a", "b", "b", "c"]) + assert_array_equal(le.classes_, ["a", "b", "c"]) + assert_array_equal(le.transform(["a", "a", "c"]), + [0, 0, 2]) + assert_array_equal(le.inverse_transform([2, 1, 0]), + ["c", "b", "a"]) + assert_array_equal(le.transform(["b", "c", "d"]), + [1, 2, -99]) + + +def test_label_encoder_new_label_arg(): + """Test LabelEncoder's new_labels argument handling""" + le = LabelEncoder(new_labels="xyz") + assert_raises(ValueError, le.fit, ["a", "b", "b", "c"]) + + def test_label_encoder_fit_transform(): """Test fit_transform""" le = LabelEncoder() From fac95e1877d61c3ebbfbe93ad15658107fbc9199 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Thu, 24 Jul 2014 09:16:56 -0400 Subject: [PATCH 02/18] Updating docstrings --- sklearn/preprocessing/label.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index a319ee0e55ffd..16b15682b56d6 100644 --- a/sklearn/preprocessing/label.py +++ 
b/sklearn/preprocessing/label.py @@ -4,13 +4,13 @@ # Andreas Mueller # Joel Nothman # Hamzeh Alsalhi +# Michael Bommarito # License: BSD 3 clause from collections import defaultdict import itertools import array import warnings -import operator import operator import numpy as np @@ -65,7 +65,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): - If ``"raise"``, then raise ValueError. - If ``"update"``, then re-map the new labels to classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels. - - If an integer value is passed, then use re-label with this value. + - If an integer value is passed, then re-label with this value. N.B. that default values are in [0, 1, ...], so caution should be taken if a non-negative value is passed to not accidentally intersect. @@ -85,8 +85,8 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder(new_label_class=-1, new_labels='raise') - >>> le.classes_ + LabelEncoder(new_labels='raise') + >>> le.get_classes() array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS array([0, 0, 1, 2]...) @@ -98,8 +98,8 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder(new_label_class=-1, new_labels='raise') - >>> list(le.classes_) + LabelEncoder(new_labels='raise') + >>> list(le.get_classes()) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS array([2, 2, 1]...) From 0d3851fc8828eb453a28ffef0ffdbde1e087b401 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Thu, 24 Jul 2014 09:17:56 -0400 Subject: [PATCH 03/18] Adding test coverage and support for inverse_transform after new labels seen. 
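For context, a stand-alone sketch of the lookup this commit relies on (the names ``fit_classes``, ``new_label_mapping`` and ``all_classes`` are illustrative, not from the patch): once labels first seen in ``transform`` are appended to the fit-time classes in the order of their assigned codes, ``inverse_transform`` reduces to indexing into that combined array, which is what the one-line switch to ``get_classes()[y]`` below does.

    import numpy as np

    fit_classes = np.array(["a", "b", "c"])     # contents of classes_ after fit
    new_label_mapping = {"_": 3, "z": 4}        # labels first seen in transform -> codes

    # mirror of get_classes(): append post-fit labels in the order of their codes
    sorted_new = [label for label, code in
                  sorted(new_label_mapping.items(), key=lambda item: item[1])]
    all_classes = np.append(fit_classes, sorted_new)

    print(all_classes[np.asarray([3, 4, 0])])   # -> ['_' 'z' 'a']
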
--- sklearn/preprocessing/label.py | 2 +- sklearn/preprocessing/tests/test_label.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 16b15682b56d6..436fa14f84888 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -260,7 +260,7 @@ def inverse_transform(self, y): self._check_fitted() y = np.asarray(y) - return self.classes_[y] + return self.get_classes()[y] class LabelBinarizer(BaseEstimator, TransformerMixin): diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 70a47fcffd498..61a75e137ca1b 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -235,6 +235,8 @@ def test_label_encoder_new_label_update(): assert_array_equal(le.get_classes(), ["a", "b", "c", "_"]) assert_array_equal(le.transform(["_", "z", "a"]), [3, 4, 0]) + assert_array_equal(le.inverse_transform([3, 4, 0]), + ["_", "z", "a"]) def test_label_encoder_new_label_replace(): From 4ac58afad0da07943a217a1d7fc019133e434bc2 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Thu, 24 Jul 2014 09:18:15 -0400 Subject: [PATCH 04/18] Updating documentation examples --- doc/modules/preprocessing.rst | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 2915eb9c45dd1..0a6706f2e86e7 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -397,7 +397,7 @@ follows:: >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder(new_label_class=-1, new_labels='raise') + LabelEncoder(new_labels='raise') >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) @@ -410,7 +410,7 @@ hashable and comparable) to numerical labels:: >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder(new_label_class=-1, new_labels='raise') + LabelEncoder(new_labels='raise') >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) @@ -421,17 +421,20 @@ hashable and comparable) to numerical labels:: By default, ``LabelEncoder`` will throw a ``ValueError`` in the event that labels are passed in ``transform`` that were not seen in ``fit``. This behavior can be handled with the ``new_labels`` parameter, which supports -``"raise"``, ``"nan"``, ``"update"``, and ``"label"`` strategies for -handling new labels. For example, the ``"label"`` strategy will assign -the unseen values a label of ``-1``. +``"raise"``, ``"update"``, and integer strategies for +handling new labels. For example, the integer strategy will assign +the unseen values an arbitrary, user-specified integer label (e.g., ``-1`` +below). 
- >>> le = preprocessing.LabelEncoder(new_labels="label") + >>> le = preprocessing.LabelEncoder(new_labels=-1) >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder(new_label_class=-1, new_labels='label') - >>> list(le.classes_) + LabelEncoder(new_labels=-1) + >>> le.get_classes() ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris", "rome"]) array([ 2, 2, 1, -1]) + >>> le.get_classes() + ['amsterdam', 'paris', 'tokyo', 'rome'] Imputation of missing values ============================ From e314ed6759dce60d694f3cccc9992e631c76f3bc Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Thu, 24 Jul 2014 14:19:51 -0400 Subject: [PATCH 05/18] Updating docs --- doc/modules/preprocessing.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 0a6706f2e86e7..1a51a427692d6 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -398,8 +398,8 @@ follows:: >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) LabelEncoder(new_labels='raise') - >>> le.classes_ - array([1, 2, 6]) + >>> list(le.get_classes()) + [1, 2, 6] >>> le.transform([1, 1, 2, 6]) array([0, 0, 1, 2]) >>> le.inverse_transform([0, 0, 1, 2]) @@ -429,11 +429,11 @@ below). >>> le = preprocessing.LabelEncoder(new_labels=-1) >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) LabelEncoder(new_labels=-1) - >>> le.get_classes() + >>> list(le.get_classes()) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris", "rome"]) array([ 2, 2, 1, -1]) - >>> le.get_classes() + >>> list(le.get_classes()) ['amsterdam', 'paris', 'tokyo', 'rome'] Imputation of missing values From 751b585cce54153589fef2bd106245e9a01d9388 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Thu, 24 Jul 2014 14:20:32 -0400 Subject: [PATCH 06/18] Improving error-handling for inverse_transform --- sklearn/preprocessing/label.py | 23 ++++++++++++++++++++++- sklearn/preprocessing/tests/test_label.py | 1 + 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 436fa14f84888..7cee5b490ef71 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -228,6 +228,11 @@ def transform(self, y): for value in y[missing_mask]] return out elif type(self.new_labels) in [int]: + # Update the new label mapping + self.new_label_mapping_.update(dict(zip(diff_new, + [self.new_labels] + * len(diff_new)))) + # Find entries with new labels missing_mask = np.in1d(y, diff_fit) @@ -259,8 +264,24 @@ def inverse_transform(self, y): """ self._check_fitted() + if type(self.new_labels) in [int]: + warnings.warn('When ``new_labels`` uses an integer ' + 're-labeling strategy, the ``inverse_transform`` ' + 'is not necessarily one-to-one mapping; any ' + 'labels not present during initial ``fit`` will ' + 'not be mapped.', + UserWarning) + y = np.asarray(y) - return self.get_classes()[y] + try: + return self.get_classes()[y] + except IndexError: + # Raise exception + num_classes = len(self.get_classes()) + raise ValueError("Classes were passed to ``inverse_transform`` " + "with integer new_labels strategy ``fit``-time: " + "{0}" + .format(np.setdiff1d(y, range(num_classes)))) class LabelBinarizer(BaseEstimator, TransformerMixin): diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 61a75e137ca1b..f3bcc7483760b 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ 
b/sklearn/preprocessing/tests/test_label.py @@ -250,6 +250,7 @@ def test_label_encoder_new_label_replace(): ["c", "b", "a"]) assert_array_equal(le.transform(["b", "c", "d"]), [1, 2, -99]) + assert_warns(UserWarning, le.inverse_transform, [2, 1, 0]) def test_label_encoder_new_label_arg(): From 866e939e751f5e92d0531c63eadf89deff8fccc1 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Thu, 24 Jul 2014 14:24:45 -0400 Subject: [PATCH 07/18] Improving docstrings --- sklearn/preprocessing/label.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 7cee5b490ef71..55734fd7c5785 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -68,12 +68,14 @@ class LabelEncoder(BaseEstimator, TransformerMixin): - If an integer value is passed, then re-label with this value. N.B. that default values are in [0, 1, ...], so caution should be taken if a non-negative value is passed to not accidentally - intersect. + intersect. Additionally, ``inverse_transform`` will fail for a + value that does not intersect with the ``fit``-time label set. Attributes ---------- `classes_` : array of shape (n_class,) - Holds the label for each class. + Holds the label for each class that were seen at fit. See + ``get_classes()`` to retrieve all observed labels. `new_label_mapping_` : dictionary Stores the mapping for classes not seen during original ``fit``. @@ -130,6 +132,7 @@ def get_classes(self): """ # If we've seen updates, include them in the order they were added. if len(self.new_label_mapping_) > 0: + # Sort the post-fit time labels to return into the class array. sorted_new, _ = zip(*sorted(self.new_label_mapping_.iteritems(), key=operator.itemgetter(1))) return np.append(self.classes_, sorted_new) From da4cafb3dda5f6d5b0f40f27e80fe979698e98db Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Thu, 24 Jul 2014 16:46:10 -0400 Subject: [PATCH 08/18] python3 dict.iteritems deprecation fix --- sklearn/preprocessing/label.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 55734fd7c5785..99a458c492c70 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -133,7 +133,7 @@ def get_classes(self): # If we've seen updates, include them in the order they were added. if len(self.new_label_mapping_) > 0: # Sort the post-fit time labels to return into the class array. - sorted_new, _ = zip(*sorted(self.new_label_mapping_.iteritems(), + sorted_new, _ = zip(*sorted(self.new_label_mapping_.items(), key=operator.itemgetter(1))) return np.append(self.classes_, sorted_new) else: From 0f3e3d3ff3630853c8587736e5bdc62acf431bf2 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Fri, 25 Jul 2014 00:04:03 -0400 Subject: [PATCH 09/18] Switching from classes_/get_classes() to classes_ property. 
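Roughly, the idea is to keep ``classes_`` as the public name while the stored data moves into ``fit_labels_`` and ``new_label_mapping_``. A minimal sketch of the pattern, assuming only that a getter/setter pair is wired up with ``property`` as in the diff below (the class name ``EncoderSketch`` is made up for illustration and is not part of the patch):

    import operator
    import numpy as np

    class EncoderSketch(object):
        def __init__(self):
            self.fit_labels_ = np.array(["a", "b", "c"])  # set by fit()
            self.new_label_mapping_ = {}                  # filled in by transform()

        def get_classes(self):
            # Merge fit-time labels with post-fit labels, ordered by their codes.
            if self.new_label_mapping_:
                sorted_new, _ = zip(*sorted(self.new_label_mapping_.items(),
                                            key=operator.itemgetter(1)))
                return np.append(self.fit_labels_, sorted_new)
            return self.fit_labels_

        def set_classes(self, classes):
            # Assigning to classes_ only rewrites the fit-time labels.
            self.fit_labels_ = classes

        classes_ = property(get_classes, set_classes)

    print(EncoderSketch().classes_)  # -> ['a' 'b' 'c']
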
--- doc/modules/preprocessing.rst | 6 ++-- sklearn/preprocessing/label.py | 37 +++++++++++++++-------- sklearn/preprocessing/tests/test_label.py | 5 ++- 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 1a51a427692d6..c9ac59a32d12e 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -398,7 +398,7 @@ follows:: >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) LabelEncoder(new_labels='raise') - >>> list(le.get_classes()) + >>> list(le.classes_) [1, 2, 6] >>> le.transform([1, 1, 2, 6]) array([0, 0, 1, 2]) @@ -429,11 +429,11 @@ below). >>> le = preprocessing.LabelEncoder(new_labels=-1) >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) LabelEncoder(new_labels=-1) - >>> list(le.get_classes()) + >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris", "rome"]) array([ 2, 2, 1, -1]) - >>> list(le.get_classes()) + >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo', 'rome'] Imputation of missing values diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 99a458c492c70..6854384649672 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -74,8 +74,12 @@ class LabelEncoder(BaseEstimator, TransformerMixin): Attributes ---------- `classes_` : array of shape (n_class,) - Holds the label for each class that were seen at fit. See - ``get_classes()`` to retrieve all observed labels. + Property that holds the label for each class that were seen at fit. + See ``get_classes()`` and ``set_classes()`` to retrieve all + view getter and setter for observed labels. + + `fit_labels` : array of shape (n_class,) + Stores the labels seen at ``fit``-time. `new_label_mapping_` : dictionary Stores the mapping for classes not seen during original ``fit``. @@ -88,7 +92,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) LabelEncoder(new_labels='raise') - >>> le.get_classes() + >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS array([0, 0, 1, 2]...) @@ -101,7 +105,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) LabelEncoder(new_labels='raise') - >>> list(le.get_classes()) + >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS array([2, 2, 1]...) @@ -114,9 +118,10 @@ def __init__(self, new_labels="raise"): """Constructor""" self.new_labels = new_labels self.new_label_mapping_ = {} + self.fit_labels_ = [] def _check_fitted(self): - if not hasattr(self, "classes_"): + if len(self.fit_labels_) == 0: raise ValueError("LabelEncoder was not fitted yet.") def get_classes(self): @@ -135,9 +140,15 @@ def get_classes(self): # Sort the post-fit time labels to return into the class array. 
sorted_new, _ = zip(*sorted(self.new_label_mapping_.items(), key=operator.itemgetter(1))) - return np.append(self.classes_, sorted_new) + return np.append(self.fit_labels_, sorted_new) else: - return self.classes_ + return self.fit_labels_ + + def set_classes(self, classes): + """Set the classes via property.""" + self.fit_labels_ = classes + + classes_ = property(get_classes, set_classes) def fit(self, y): """Fit label encoder @@ -161,7 +172,7 @@ def fit(self, y): y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) - self.classes_ = np.unique(y) + self.fit_labels_ = np.unique(y) return self def fit_transform(self, y): @@ -186,7 +197,7 @@ def fit_transform(self, y): y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) - self.classes_, y = np.unique(y, return_inverse=True) + self.fit_labels_, y = np.unique(y, return_inverse=True) return y def transform(self, y): @@ -207,7 +218,7 @@ def transform(self, y): _check_numpy_unicode_bug(classes) if len(np.intersect1d(classes, self.get_classes())) < len(classes): # Get the new classes - diff_fit = np.setdiff1d(classes, self.classes_) + diff_fit = np.setdiff1d(classes, self.fit_labels_) diff_new = np.setdiff1d(classes, self.get_classes()) # Create copy of array and return @@ -226,7 +237,7 @@ def transform(self, y): missing_mask = np.in1d(y, diff_fit) # Populate return array properly by mask and return - out = np.searchsorted(self.classes_, y) + out = np.searchsorted(self.fit_labels_, y) out[missing_mask] = [self.new_label_mapping_[value] for value in y[missing_mask]] return out @@ -240,7 +251,7 @@ def transform(self, y): missing_mask = np.in1d(y, diff_fit) # Populate return array properly by mask and return - out = np.searchsorted(self.classes_, y) + out = np.searchsorted(self.fit_labels_, y) out[missing_mask] = self.new_labels return out elif self.new_labels == "raise": @@ -251,7 +262,7 @@ def transform(self, y): raise ValueError("Value of argument `new_labels`={0} " "is unknown.".format(self.new_labels)) - return np.searchsorted(self.classes_, y) + return np.searchsorted(self.fit_labels_, y) def inverse_transform(self, y): """Transform labels back to original encoding. 
diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index f3bcc7483760b..8d67eccd82a01 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -215,10 +215,9 @@ def test_label_encoder_get_classes(): le = LabelEncoder(new_labels="update") le.fit([1, 1, 4, 5, -1, 0]) assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) - assert_array_equal(le.classes_, le.get_classes()) + assert_array_equal(le.classes_, le.classes_) le.transform([10]) - assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) - assert_array_equal(le.get_classes(), [-1, 0, 1, 4, 5, 10]) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5, 10]) def test_label_encoder_new_label_update(): From 468c92a2f43b47b0a109673455b68855d8c855d1 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Sat, 30 Aug 2014 12:15:20 -0400 Subject: [PATCH 10/18] Strip unnecessary attributes, comment out implementation of int label for now --- sklearn/preprocessing/label.py | 140 ++++++---------------- sklearn/preprocessing/tests/test_label.py | 51 ++++---- 2 files changed, 56 insertions(+), 135 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 6854384649672..a395f7791206a 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -1,6 +1,6 @@ # Authors: Alexandre Gramfort -# Mathieu Blondel -# Olivier Grisel +# Mathieu Blondel +# Olivier Grisel # Andreas Mueller # Joel Nothman # Hamzeh Alsalhi @@ -74,15 +74,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): Attributes ---------- `classes_` : array of shape (n_class,) - Property that holds the label for each class that were seen at fit. - See ``get_classes()`` and ``set_classes()`` to retrieve all - view getter and setter for observed labels. - - `fit_labels` : array of shape (n_class,) - Stores the labels seen at ``fit``-time. - - `new_label_mapping_` : dictionary - Stores the mapping for classes not seen during original ``fit``. + Holds the label for each class. Examples -------- @@ -115,41 +107,16 @@ class LabelEncoder(BaseEstimator, TransformerMixin): """ def __init__(self, new_labels="raise"): - """Constructor""" + # Check new_labels parameter + if new_labels not in ["update", "raise"] and type(new_labels) is not int: + raise ValueError("Value of argument `new_labels`={0} is unknown " + "and not an integer.".format(new_labels)) self.new_labels = new_labels - self.new_label_mapping_ = {} - self.fit_labels_ = [] def _check_fitted(self): - if len(self.fit_labels_) == 0: + if not hasattr(self, "classes_"): raise ValueError("LabelEncoder was not fitted yet.") - def get_classes(self): - """Get classes that have been observed by the encoder. Note that this - method returns classes seen both at original ``fit`` time (i.e., - ``self.classes_``) and classes seen after ``fit`` (i.e., - ``self.new_label_mapping_.keys()``) for applicable values of - ``new_labels``. - - Returns - ------- - classes : array-like of shape [n_classes] - """ - # If we've seen updates, include them in the order they were added. - if len(self.new_label_mapping_) > 0: - # Sort the post-fit time labels to return into the class array. 
- sorted_new, _ = zip(*sorted(self.new_label_mapping_.items(), - key=operator.itemgetter(1))) - return np.append(self.fit_labels_, sorted_new) - else: - return self.fit_labels_ - - def set_classes(self, classes): - """Set the classes via property.""" - self.fit_labels_ = classes - - classes_ = property(get_classes, set_classes) - def fit(self, y): """Fit label encoder @@ -162,17 +129,9 @@ def fit(self, y): ------- self : returns an instance of self. """ - # Check new_labels parameter - if self.new_labels not in ["update", "raise"] and \ - type(self.new_labels) not in [int]: - # Raise on invalid argument. - raise ValueError("Value of argument `new_labels`={0} " - "is unknown and not integer." - .format(self.new_labels)) - y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) - self.fit_labels_ = np.unique(y) + self.classes_ = np.unique(y) return self def fit_transform(self, y): @@ -187,17 +146,9 @@ def fit_transform(self, y): ------- y : array-like of shape [n_samples] """ - # Check new_labels parameter - if self.new_labels not in ["update", "raise"] and \ - type(self.new_labels) not in [int]: - # Raise on invalid argument. - raise ValueError("Value of argument `new_labels`={0} " - "is unknown and not integer." - .format(self.new_labels)) - y = column_or_1d(y, warn=True) _check_numpy_unicode_bug(y) - self.fit_labels_, y = np.unique(y, return_inverse=True) + self.classes_, y = np.unique(y, return_inverse=True) return y def transform(self, y): @@ -216,53 +167,29 @@ def transform(self, y): classes = np.unique(y) _check_numpy_unicode_bug(classes) - if len(np.intersect1d(classes, self.get_classes())) < len(classes): - # Get the new classes - diff_fit = np.setdiff1d(classes, self.fit_labels_) - diff_new = np.setdiff1d(classes, self.get_classes()) - - # Create copy of array and return - y = np.array(y) + if len(np.intersect1d(classes, self.classes_)) < len(classes): + # # Get the new classes + diff_fit = np.setdiff1d(classes, self.classes_) + # diff_new = np.setdiff1d(classes, self.get_classes()) # If we are mapping new labels, get "new" ID and change in copy. if self.new_labels == "update": # Update the new label mapping - next_label = len(self.get_classes()) - self.new_label_mapping_.update(dict(zip(diff_new, - range(next_label, - next_label + - len(diff_new))))) - - # Find entries with new labels - missing_mask = np.in1d(y, diff_fit) - - # Populate return array properly by mask and return - out = np.searchsorted(self.fit_labels_, y) - out[missing_mask] = [self.new_label_mapping_[value] - for value in y[missing_mask]] - return out + # XXX there is a more efficient way to do this by inserting + # new labels into the sorted .classes_ + self.classes_ = np.unique(np.concatenate((np.asarray(y), + self.classes_))) elif type(self.new_labels) in [int]: # Update the new label mapping - self.new_label_mapping_.update(dict(zip(diff_new, - [self.new_labels] - * len(diff_new)))) - - # Find entries with new labels - missing_mask = np.in1d(y, diff_fit) - - # Populate return array properly by mask and return - out = np.searchsorted(self.fit_labels_, y) - out[missing_mask] = self.new_labels - return out + pass elif self.new_labels == "raise": - # Return ValueError, original behavior. - raise ValueError("y contains new labels: %s" % str(diff_fit)) + raise ValueError("y contains new label(s): %s" % str(diff_fit)) else: # Raise on invalid argument. 
- raise ValueError("Value of argument `new_labels`={0} " - "is unknown.".format(self.new_labels)) + raise ValueError("Value of argument `new_labels`={0} is " + "unknown.".format(self.new_labels)) - return np.searchsorted(self.fit_labels_, y) + return np.searchsorted(self.classes_, y) def inverse_transform(self, y): """Transform labels back to original encoding. @@ -278,7 +205,7 @@ def inverse_transform(self, y): """ self._check_fitted() - if type(self.new_labels) in [int]: + if type(self.new_labels) is int: warnings.warn('When ``new_labels`` uses an integer ' 're-labeling strategy, the ``inverse_transform`` ' 'is not necessarily one-to-one mapping; any ' @@ -287,15 +214,16 @@ def inverse_transform(self, y): UserWarning) y = np.asarray(y) - try: - return self.get_classes()[y] - except IndexError: - # Raise exception - num_classes = len(self.get_classes()) - raise ValueError("Classes were passed to ``inverse_transform`` " - "with integer new_labels strategy ``fit``-time: " - "{0}" - .format(np.setdiff1d(y, range(num_classes)))) + return self.classes_[y] + # try: + # return self.get_classes()[y] + # except IndexError: + # # Raise exception + # num_classes = len(self.get_classes()) + # raise ValueError("Classes were passed to ``inverse_transform`` " + # "with integer new_labels strategy ``fit``-time: " + # "{0}" + # .format(np.setdiff1d(y, range(num_classes)))) class LabelBinarizer(BaseEstimator, TransformerMixin): diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 8d67eccd82a01..0cc185561ec0b 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -210,16 +210,6 @@ def test_label_encoder(): assert_raises(ValueError, le.transform, [0, 6]) -def test_label_encoder_get_classes(): - """Test LabelEncoder's get_classes method.""" - le = LabelEncoder(new_labels="update") - le.fit([1, 1, 4, 5, -1, 0]) - assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) - assert_array_equal(le.classes_, le.classes_) - le.transform([10]) - assert_array_equal(le.classes_, [-1, 0, 1, 4, 5, 10]) - - def test_label_encoder_new_label_update(): """Test LabelEncoder's transform on new labels""" le = LabelEncoder(new_labels="update") @@ -229,33 +219,33 @@ def test_label_encoder_new_label_update(): [0, 0, 2]) assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"]) - assert_array_equal(le.transform(["b", "c", "_"]), + # Unseen label "d" + assert_array_equal(le.transform(["b", "c", "d"]), [1, 2, 3]) - assert_array_equal(le.get_classes(), ["a", "b", "c", "_"]) - assert_array_equal(le.transform(["_", "z", "a"]), + assert_array_equal(le.classes_, ["a", "b", "c", "d"]) + assert_array_equal(le.transform(["d", "z", "a"]), [3, 4, 0]) assert_array_equal(le.inverse_transform([3, 4, 0]), - ["_", "z", "a"]) + ["d", "z", "a"]) -def test_label_encoder_new_label_replace(): - """Test LabelEncoder's transform on new labels""" - le = LabelEncoder(new_labels=-99) - le.fit(["a", "b", "b", "c"]) - assert_array_equal(le.classes_, ["a", "b", "c"]) - assert_array_equal(le.transform(["a", "a", "c"]), - [0, 0, 2]) - assert_array_equal(le.inverse_transform([2, 1, 0]), - ["c", "b", "a"]) - assert_array_equal(le.transform(["b", "c", "d"]), - [1, 2, -99]) - assert_warns(UserWarning, le.inverse_transform, [2, 1, 0]) +# def test_label_encoder_new_label_replace(): +# """Test LabelEncoder's transform on new labels""" +# le = LabelEncoder(new_labels=-99) +# le.fit(["a", "b", "b", "c"]) +# assert_array_equal(le.classes_, ["a", "b", "c"]) +# 
assert_array_equal(le.transform(["a", "a", "c"]), +# [0, 0, 2]) +# assert_array_equal(le.inverse_transform([2, 1, 0]), +# ["c", "b", "a"]) +# assert_array_equal(le.transform(["b", "c", "d"]), +# [1, 2, -99]) +# assert_warns(UserWarning, le.inverse_transform, [2, 1, 0]) def test_label_encoder_new_label_arg(): - """Test LabelEncoder's new_labels argument handling""" - le = LabelEncoder(new_labels="xyz") - assert_raises(ValueError, le.fit, ["a", "b", "b", "c"]) + """Test LabelEncoder's new_labels argument handling""" + assert_raises(ValueError, LabelEncoder, "xyz") def test_label_encoder_fit_transform(): @@ -275,6 +265,9 @@ def test_label_encoder_errors(): assert_raises(ValueError, le.transform, []) assert_raises(ValueError, le.inverse_transform, []) + # Fail on unrecognized vlaue for the 'new_label' parameter + assert_raises(ValueError, LabelEncoder, "xyz") + def test_sparse_output_multilabel_binarizer(): # test input as iterable of iterables From 39bd9e8a4f713f41d2c96db2686aa6a5dc34bf11 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Sat, 30 Aug 2014 19:01:10 -0400 Subject: [PATCH 11/18] Implement new_labels with integer type --- doc/modules/preprocessing.rst | 2 +- sklearn/preprocessing/label.py | 28 +++++++++++------------ sklearn/preprocessing/tests/test_label.py | 22 ++++++++---------- 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index c9ac59a32d12e..dfd9b81d042b5 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -434,7 +434,7 @@ below). >>> le.transform(["tokyo", "tokyo", "paris", "rome"]) array([ 2, 2, 1, -1]) >>> list(le.classes_) - ['amsterdam', 'paris', 'tokyo', 'rome'] + ['amsterdam', 'paris', 'rome', 'tokyo'] Imputation of missing values ============================ diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index a395f7791206a..3465a6b3fc69f 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -168,22 +168,29 @@ def transform(self, y): classes = np.unique(y) _check_numpy_unicode_bug(classes) if len(np.intersect1d(classes, self.classes_)) < len(classes): - # # Get the new classes - diff_fit = np.setdiff1d(classes, self.classes_) - # diff_new = np.setdiff1d(classes, self.get_classes()) + # Get the new labels + unseen = np.setdiff1d(classes, self.classes_) # If we are mapping new labels, get "new" ID and change in copy. if self.new_labels == "update": # Update the new label mapping + # XXX there is a more efficient way to do this by inserting # new labels into the sorted .classes_ self.classes_ = np.unique(np.concatenate((np.asarray(y), self.classes_))) - elif type(self.new_labels) in [int]: + elif type(self.new_labels) is int: # Update the new label mapping - pass + ret = np.searchsorted(self.classes_, y) + ret[np.in1d(y, unseen)] = self.new_labels + + # XXX there is a more efficient way to do this by inserting + # new labels into the sorted .classes_ + self.classes_ = np.unique(np.concatenate((np.asarray(y), + self.classes_))) + return ret elif self.new_labels == "raise": - raise ValueError("y contains new label(s): %s" % str(diff_fit)) + raise ValueError("y contains new label(s): %s" % str(unseen)) else: # Raise on invalid argument. 
raise ValueError("Value of argument `new_labels`={0} is " @@ -215,15 +222,6 @@ def inverse_transform(self, y): y = np.asarray(y) return self.classes_[y] - # try: - # return self.get_classes()[y] - # except IndexError: - # # Raise exception - # num_classes = len(self.get_classes()) - # raise ValueError("Classes were passed to ``inverse_transform`` " - # "with integer new_labels strategy ``fit``-time: " - # "{0}" - # .format(np.setdiff1d(y, range(num_classes)))) class LabelBinarizer(BaseEstimator, TransformerMixin): diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 0cc185561ec0b..a5e89e109a220 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -229,18 +229,15 @@ def test_label_encoder_new_label_update(): ["d", "z", "a"]) -# def test_label_encoder_new_label_replace(): -# """Test LabelEncoder's transform on new labels""" -# le = LabelEncoder(new_labels=-99) -# le.fit(["a", "b", "b", "c"]) -# assert_array_equal(le.classes_, ["a", "b", "c"]) -# assert_array_equal(le.transform(["a", "a", "c"]), -# [0, 0, 2]) -# assert_array_equal(le.inverse_transform([2, 1, 0]), -# ["c", "b", "a"]) -# assert_array_equal(le.transform(["b", "c", "d"]), -# [1, 2, -99]) -# assert_warns(UserWarning, le.inverse_transform, [2, 1, 0]) +def test_label_encoder_new_label_replace(): + """Test LabelEncoder's transform on new labels""" + le = LabelEncoder(new_labels=-99) + le.fit(["a", "b", "b", "c"]) + assert_array_equal(le.classes_, ["a", "b", "c"]) + assert_array_equal(le.transform(["a", "a", "c"]), [0, 0, 2]) + assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"]) + assert_array_equal(le.transform(["b", "c", "d"]), [1, 2, -99]) + assert_array_equal(le.inverse_transform([3, 1, 0]), ["d", "b", "a"]) def test_label_encoder_new_label_arg(): @@ -264,7 +261,6 @@ def test_label_encoder_errors(): le = LabelEncoder() assert_raises(ValueError, le.transform, []) assert_raises(ValueError, le.inverse_transform, []) - # Fail on unrecognized vlaue for the 'new_label' parameter assert_raises(ValueError, LabelEncoder, "xyz") From 767bea95e68eb6bc10501c6140ffa91745d9067e Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Sat, 30 Aug 2014 19:27:51 -0400 Subject: [PATCH 12/18] Fomate pep8 and remove redundant test for 'new_labels' error --- sklearn/preprocessing/label.py | 12 +++++++----- sklearn/preprocessing/tests/test_label.py | 7 +------ 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 3465a6b3fc69f..ab5a877066e0c 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -12,7 +12,6 @@ import array import warnings -import operator import numpy as np import scipy.sparse as sp @@ -108,9 +107,11 @@ class LabelEncoder(BaseEstimator, TransformerMixin): def __init__(self, new_labels="raise"): # Check new_labels parameter - if new_labels not in ["update", "raise"] and type(new_labels) is not int: - raise ValueError("Value of argument `new_labels`={0} is unknown " - "and not an integer.".format(new_labels)) + if (new_labels not in ["update", "raise"] and + type(new_labels) is not int): + raise ValueError("Value of argument `new_labels`={0} is " + "unknown and not an " + "integer.".format(new_labels)) self.new_labels = new_labels def _check_fitted(self): @@ -505,7 +506,8 @@ def label_binarize(y, classes, neg_label=0, pos_label=1, allow for fitting to classes independently of the transform operation """ if not 
isinstance(y, list): - # XXX Workaround that will be removed when list of list format is dropped + # XXX Workaround that will be removed when list of list format is + # dropped y = check_array(y, accept_sparse='csr', ensure_2d=False) if neg_label >= pos_label: raise ValueError("neg_label={0} must be strictly less than " diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index a5e89e109a220..1e1ae2bf6ae21 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -240,11 +240,6 @@ def test_label_encoder_new_label_replace(): assert_array_equal(le.inverse_transform([3, 1, 0]), ["d", "b", "a"]) -def test_label_encoder_new_label_arg(): - """Test LabelEncoder's new_labels argument handling""" - assert_raises(ValueError, LabelEncoder, "xyz") - - def test_label_encoder_fit_transform(): """Test fit_transform""" le = LabelEncoder() @@ -261,7 +256,7 @@ def test_label_encoder_errors(): le = LabelEncoder() assert_raises(ValueError, le.transform, []) assert_raises(ValueError, le.inverse_transform, []) - # Fail on unrecognized vlaue for the 'new_label' parameter + # Fail on unrecognized vlaue for the 'new_label' parameter assert_raises(ValueError, LabelEncoder, "xyz") From bcdd05f49f0a974a71d58bf363e95a7d21a823bb Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 2 Sep 2014 08:43:57 -0400 Subject: [PATCH 13/18] Remove update behavior, Make new_labels an int param --- doc/modules/preprocessing.rst | 10 ++---- sklearn/preprocessing/label.py | 37 +++++------------------ sklearn/preprocessing/tests/test_label.py | 20 ------------ 3 files changed, 11 insertions(+), 56 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index dfd9b81d042b5..675f560b5db7a 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -397,7 +397,7 @@ follows:: >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder(new_labels='raise') + LabelEncoder(new_labels=None) >>> list(le.classes_) [1, 2, 6] >>> le.transform([1, 1, 2, 6]) @@ -410,7 +410,7 @@ hashable and comparable) to numerical labels:: >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder(new_labels='raise') + LabelEncoder(new_labels=None) >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) @@ -420,9 +420,7 @@ hashable and comparable) to numerical labels:: By default, ``LabelEncoder`` will throw a ``ValueError`` in the event that labels are passed in ``transform`` that were not seen in ``fit``. This -behavior can be handled with the ``new_labels`` parameter, which supports -``"raise"``, ``"update"``, and integer strategies for -handling new labels. For example, the integer strategy will assign +behavior can be handled with the ``new_labels`` parameter, which will assign the unseen values an arbitrary, user-specified integer label (e.g., ``-1`` below). @@ -433,8 +431,6 @@ below). 
['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris", "rome"]) array([ 2, 2, 1, -1]) - >>> list(le.classes_) - ['amsterdam', 'paris', 'rome', 'tokyo'] Imputation of missing values ============================ diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index ab5a877066e0c..73797c8cc435a 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -57,14 +57,8 @@ class LabelEncoder(BaseEstimator, TransformerMixin): Parameters ---------- - new_labels : string, optional (default: "raise") - Determines how to handle new labels, i.e., data - not seen in the training domain. - - - If ``"raise"``, then raise ValueError. - - If ``"update"``, then re-map the new labels to - classes ``[N, ..., N+m-1]``, where ``m`` is the number of new labels. - - If an integer value is passed, then re-label with this value. + new_labels : Int, optional (default: None) + re-label with this value. N.B. that default values are in [0, 1, ...], so caution should be taken if a non-negative value is passed to not accidentally intersect. Additionally, ``inverse_transform`` will fail for a @@ -82,7 +76,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder(new_labels='raise') + LabelEncoder() >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS @@ -95,7 +89,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder(new_labels='raise') + LabelEncoder() >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS @@ -105,10 +99,9 @@ class LabelEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, new_labels="raise"): + def __init__(self, new_labels=None): # Check new_labels parameter - if (new_labels not in ["update", "raise"] and - type(new_labels) is not int): + if new_labels is not None and type(new_labels) is not int: raise ValueError("Value of argument `new_labels`={0} is " "unknown and not an " "integer.".format(new_labels)) @@ -172,25 +165,11 @@ def transform(self, y): # Get the new labels unseen = np.setdiff1d(classes, self.classes_) - # If we are mapping new labels, get "new" ID and change in copy. - if self.new_labels == "update": - # Update the new label mapping - - # XXX there is a more efficient way to do this by inserting - # new labels into the sorted .classes_ - self.classes_ = np.unique(np.concatenate((np.asarray(y), - self.classes_))) - elif type(self.new_labels) is int: - # Update the new label mapping + if type(self.new_labels) is int: ret = np.searchsorted(self.classes_, y) ret[np.in1d(y, unseen)] = self.new_labels - - # XXX there is a more efficient way to do this by inserting - # new labels into the sorted .classes_ - self.classes_ = np.unique(np.concatenate((np.asarray(y), - self.classes_))) return ret - elif self.new_labels == "raise": + elif self.new_labels is None: raise ValueError("y contains new label(s): %s" % str(unseen)) else: # Raise on invalid argument. 
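For illustration only (not part of the patch), the integer ``new_labels`` branch above boils down to a binary search plus a mask; the data below is made up, with -99 standing in for the user's sentinel as in the tests:

    import numpy as np

    classes = np.array(["a", "b", "c"])           # sorted fit-time classes
    y = np.array(["b", "c", "d"])                 # "d" was never seen at fit time

    unseen = np.setdiff1d(np.unique(y), classes)  # -> ['d']
    encoded = np.searchsorted(classes, y)         # -> [1, 2, 3]; "d" falls past the end
    encoded[np.in1d(y, unseen)] = -99             # -> [1, 2, -99]
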
diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 1e1ae2bf6ae21..875f07a13fba6 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -210,25 +210,6 @@ def test_label_encoder(): assert_raises(ValueError, le.transform, [0, 6]) -def test_label_encoder_new_label_update(): - """Test LabelEncoder's transform on new labels""" - le = LabelEncoder(new_labels="update") - le.fit(["a", "b", "b", "c"]) - assert_array_equal(le.classes_, ["a", "b", "c"]) - assert_array_equal(le.transform(["a", "a", "c"]), - [0, 0, 2]) - assert_array_equal(le.inverse_transform([2, 1, 0]), - ["c", "b", "a"]) - # Unseen label "d" - assert_array_equal(le.transform(["b", "c", "d"]), - [1, 2, 3]) - assert_array_equal(le.classes_, ["a", "b", "c", "d"]) - assert_array_equal(le.transform(["d", "z", "a"]), - [3, 4, 0]) - assert_array_equal(le.inverse_transform([3, 4, 0]), - ["d", "z", "a"]) - - def test_label_encoder_new_label_replace(): """Test LabelEncoder's transform on new labels""" le = LabelEncoder(new_labels=-99) @@ -237,7 +218,6 @@ def test_label_encoder_new_label_replace(): assert_array_equal(le.transform(["a", "a", "c"]), [0, 0, 2]) assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"]) assert_array_equal(le.transform(["b", "c", "d"]), [1, 2, -99]) - assert_array_equal(le.inverse_transform([3, 1, 0]), ["d", "b", "a"]) def test_label_encoder_fit_transform(): From 235d681bf9407a510154fcbaf8b64d7f1965f79a Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 2 Sep 2014 12:28:55 -0400 Subject: [PATCH 14/18] Update doctest, new_labels=None --- sklearn/preprocessing/label.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 73797c8cc435a..6770f90cef01d 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -76,7 +76,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder() + LabelEncoder(new_labels=None) >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS @@ -89,7 +89,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder() + LabelEncoder(new_labels=None) >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS From 9153fc0724364c92974fbffcb3e6880cf8f30803 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 2 Sep 2014 13:02:55 -0400 Subject: [PATCH 15/18] Add a classes parameter to LabelEncoder.transform --- sklearn/preprocessing/label.py | 23 +++++++++++++++-------- sklearn/preprocessing/tests/test_label.py | 12 ++++++++++++ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 6770f90cef01d..ee922aca1d7a4 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -145,7 +145,7 @@ def fit_transform(self, y): self.classes_, y = np.unique(y, return_inverse=True) return y - def transform(self, y): + def transform(self, y, classes=None): """Transform labels to normalized encoding. Parameters @@ -153,20 +153,27 @@ def transform(self, y): y : array-like of shape [n_samples] Target values. 
+ classes : array-like, optional (default: None) + List of unique sorted labels to encode the target data against. + If None the LabelEncoder must have already been fit and the unique + labels from the fit will be used. + Returns ------- y : array-like of shape [n_samples] """ - self._check_fitted() + if classes is None: + self._check_fitted() + classes = self.classes_ - classes = np.unique(y) - _check_numpy_unicode_bug(classes) - if len(np.intersect1d(classes, self.classes_)) < len(classes): + y_classes = np.unique(y) + _check_numpy_unicode_bug(y_classes) + if len(np.intersect1d(y_classes, classes)) < len(y_classes): # Get the new labels - unseen = np.setdiff1d(classes, self.classes_) + unseen = np.setdiff1d(y_classes, classes) if type(self.new_labels) is int: - ret = np.searchsorted(self.classes_, y) + ret = np.searchsorted(classes, y) ret[np.in1d(y, unseen)] = self.new_labels return ret elif self.new_labels is None: @@ -176,7 +183,7 @@ def transform(self, y): raise ValueError("Value of argument `new_labels`={0} is " "unknown.".format(self.new_labels)) - return np.searchsorted(self.classes_, y) + return np.searchsorted(classes, y) def inverse_transform(self, y): """Transform labels back to original encoding. diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 875f07a13fba6..4109638884dd7 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -220,6 +220,18 @@ def test_label_encoder_new_label_replace(): assert_array_equal(le.transform(["b", "c", "d"]), [1, 2, -99]) +def test_label_encoder_transform_classes_parameter(): + """Test LabelEncoder's transform using the classes parameter""" + le = LabelEncoder(new_labels=None) + le.fit(["a", "b", "b", "c"]) + assert_array_equal(le.classes_, ["a", "b", "c"]) + assert_array_equal(le.transform(["d", "f", "e", "e"], + classes=["d", "e", "f"]), + [0, 2, 1, 1]) + assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"]) + assert_raises(ValueError, le.transform, ["b", "c", "d"]) + + def test_label_encoder_fit_transform(): """Test fit_transform""" le = LabelEncoder() From e35b92bdee5a9b92b5370b22b880b8198af0ca14 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 2 Sep 2014 19:06:46 -0400 Subject: [PATCH 16/18] Add classes paramter to LabelEncoder init --- doc/modules/preprocessing.rst | 6 +++--- sklearn/preprocessing/label.py | 19 ++++++++++++++----- sklearn/preprocessing/tests/test_label.py | 13 +++++++++++-- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 675f560b5db7a..6f4b2d66b6dfc 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -397,7 +397,7 @@ follows:: >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder(new_labels=None) + LabelEncoder(classes=None, new_labels=None) >>> list(le.classes_) [1, 2, 6] >>> le.transform([1, 1, 2, 6]) @@ -410,7 +410,7 @@ hashable and comparable) to numerical labels:: >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder(new_labels=None) + LabelEncoder(classes=None, new_labels=None) >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) @@ -426,7 +426,7 @@ below). 
>>> le = preprocessing.LabelEncoder(new_labels=-1) >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder(new_labels=-1) + LabelEncoder(classes=None, new_labels=-1) >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris", "rome"]) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index ee922aca1d7a4..72c5c66abf6e7 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -57,6 +57,12 @@ class LabelEncoder(BaseEstimator, TransformerMixin): Parameters ---------- + classes : array-like of shape [n_class], optional (default: None) + Holds the label for each class. List of unique sorted labels to encode + the target data against. Using this parameter in initilization will + allow skipping a call fit before calling transform. + + new_labels : Int, optional (default: None) re-label with this value. N.B. that default values are in [0, 1, ...], so caution should be @@ -76,7 +82,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> from sklearn import preprocessing >>> le = preprocessing.LabelEncoder() >>> le.fit([1, 2, 2, 6]) - LabelEncoder(new_labels=None) + LabelEncoder(classes=None, new_labels=None) >>> le.classes_ array([1, 2, 6]) >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS @@ -89,7 +95,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> le = preprocessing.LabelEncoder() >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) - LabelEncoder(new_labels=None) + LabelEncoder(classes=None, new_labels=None) >>> list(le.classes_) ['amsterdam', 'paris', 'tokyo'] >>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS @@ -99,12 +105,15 @@ class LabelEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, new_labels=None): - # Check new_labels parameter + def __init__(self, classes=None, new_labels=None): + if classes is not None: + self.classes_ = classes + if new_labels is not None and type(new_labels) is not int: raise ValueError("Value of argument `new_labels`={0} is " "unknown and not an " "integer.".format(new_labels)) + self.new_labels = new_labels def _check_fitted(self): @@ -153,7 +162,7 @@ def transform(self, y, classes=None): y : array-like of shape [n_samples] Target values. - classes : array-like, optional (default: None) + classes : array-like of shape [n_class], optional (default: None) List of unique sorted labels to encode the target data against. If None the LabelEncoder must have already been fit and the unique labels from the fit will be used. 
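For illustration only (not part of the patch), the new ``classes`` argument lets the caller hand over the sorted vocabulary up front, so the encoding step is a plain ``searchsorted`` against it and no prior ``fit`` is needed; the values mirror the test added below:

    import numpy as np

    user_classes = np.array(["d", "e", "f"])    # caller-provided, already sorted
    y = ["d", "f", "e", "e"]
    encoded = np.searchsorted(user_classes, y)  # -> [0, 2, 1, 1]
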
diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index 4109638884dd7..60521a670b1cb 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -220,6 +220,15 @@ def test_label_encoder_new_label_replace():
     assert_array_equal(le.transform(["b", "c", "d"]), [1, 2, -99])
 
 
+def test_label_encoder_classes_parameter():
+    """Test LabelEncoder's classes parameter"""
+    le = LabelEncoder(classes=["a", "b", "c"], new_labels=None)
+    assert_array_equal(le.classes_, ["a", "b", "c"])
+    assert_array_equal(le.transform(["c", "a", "b", "c"],), [2, 0, 1, 2])
+    assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"])
+    assert_raises(ValueError, le.transform, ["b", "c", "d"])
+
+
 def test_label_encoder_transform_classes_parameter():
     """Test LabelEncoder's transform using the classes parameter"""
     le = LabelEncoder(new_labels=None)
     le.fit(["a", "b", "b", "c"])
     assert_array_equal(le.classes_, ["a", "b", "c"])
     assert_array_equal(le.transform(["d", "f", "e", "e"],
                                     classes=["d", "e", "f"]),
                        [0, 2, 1, 1])
     assert_array_equal(le.inverse_transform([2, 1, 0]), ["c", "b", "a"])
     assert_raises(ValueError, le.transform, ["b", "c", "d"])
@@ -248,8 +257,8 @@ def test_label_encoder_errors():
     le = LabelEncoder()
     assert_raises(ValueError, le.transform, [])
     assert_raises(ValueError, le.inverse_transform, [])
-    # Fail on unrecognized vlaue for the 'new_label' parameter
+    # Fail on unrecognized value for the 'new_label' parameter
     assert_raises(ValueError, LabelEncoder, new_labels="xyz")
 
 
 def test_sparse_output_multilabel_binarizer():

From aaf0425398e4ea9210bdca35888b7908ab5e5671 Mon Sep 17 00:00:00 2001
From: Hamzeh Alsalhi
Date: Wed, 3 Sep 2014 09:17:20 -0400
Subject: [PATCH 17/18] Initialize self.classes_ with asarray

---
 sklearn/preprocessing/label.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 72c5c66abf6e7..b506d901fb613 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -107,7 +107,7 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
 
     def __init__(self, classes=None, new_labels=None):
         if classes is not None:
-            self.classes_ = classes
+            self.classes_ = np.asarray(classes)
 
         if new_labels is not None and type(new_labels) is not int:
             raise ValueError("Value of argument `new_labels`={0} is "

From fd4d4269f19a22249caa511c46995e9a7575ff23 Mon Sep 17 00:00:00 2001
From: Hamzeh Alsalhi
Date: Thu, 4 Sep 2014 07:10:02 -0400
Subject: [PATCH 18/18] Drop unnecessary error from transform

---
 sklearn/preprocessing/label.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index b506d901fb613..5ef1cafcb2c3b 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -187,10 +187,6 @@ def transform(self, y, classes=None):
                 return ret
             elif self.new_labels is None:
                 raise ValueError("y contains new label(s): %s" % str(unseen))
-            else:
-                # Raise on invalid argument.
-                raise ValueError("Value of argument `new_labels`={0} is "
-                                 "unknown.".format(self.new_labels))
 
         return np.searchsorted(classes, y)
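
Taken together, the series leaves ``LabelEncoder`` with two optional constructor arguments: ``classes``, a pre-sorted vocabulary that makes a prior ``fit`` unnecessary, and ``new_labels``, an integer sentinel for labels unseen at fit time. A doctest-style sketch of how the two would combine on this branch; the combined call is extrapolated from the diffs above rather than copied from them, so treat it as a sketch:

    >>> from sklearn import preprocessing
    >>> le = preprocessing.LabelEncoder(classes=['amsterdam', 'paris', 'tokyo'],
    ...                                 new_labels=-1)
    >>> le.transform(['tokyo', 'rome'])  # 'rome' is not in the vocabulary
    array([ 2, -1])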