From 09a85ea5e340c6f64401b389ce38297365ac96cf Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 28 Feb 2020 11:37:45 +0100 Subject: [PATCH 01/11] Add check for non binary variables. --- sklearn/preprocessing/_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 3be4540498591..74568c14fcf84 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -581,7 +581,7 @@ def get_feature_names(self, input_features=None): for i in range(len(cats)): names = [ input_features[i] + '_' + str(t) for t in cats[i]] - if self.drop is not None: + if (self.drop is not None) and (self.drop_idx_[i] != -1): names.pop(self.drop_idx_[i]) feature_names.extend(names) From ec6fd454136e9bb5950d573bec46c99fb4bff969 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 2 Mar 2020 15:01:28 +0100 Subject: [PATCH 02/11] Make drop_idx_ an array of objects. Add test. --- sklearn/preprocessing/_encoders.py | 25 ++++++++++---------- sklearn/preprocessing/tests/test_encoders.py | 11 +++++---- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 74568c14fcf84..adc9075b6fb0c 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -227,7 +227,7 @@ class OneHotEncoder(_BaseEncoder): drop_idx_ : array of shape (n_features,) ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to be dropped for each feature. - ``drop_idx_[i] = -1`` if no category is to be dropped from the feature + ``drop_idx_[i] = None`` if no category is to be dropped from the feature with index ``i``, e.g. 
when `drop='if_binary'` and the feature isn't binary @@ -316,10 +316,10 @@ def _compute_drop_idx(self): return None elif isinstance(self.drop, str): if self.drop == 'first': - return np.zeros(len(self.categories_), dtype=np.int_) + return np.zeros(len(self.categories_), dtype=np.object) elif self.drop == 'if_binary': - return np.array([0 if len(cats) == 2 else -1 - for cats in self.categories_], dtype=np.int_) + return np.array([0 if len(cats) == 2 else None + for cats in self.categories_], dtype=np.object) else: msg = ( "Wrong input for parameter `drop`. Expected " @@ -354,7 +354,7 @@ def _compute_drop_idx(self): raise ValueError(msg) return np.array([np.where(cat_list == val)[0][0] for (val, cat_list) in - zip(self.drop, self.categories_)], dtype=np.int_) + zip(self.drop, self.categories_)], dtype=np.object) def fit(self, X, y=None): """ @@ -431,7 +431,7 @@ def transform(self, X): n_cats = len(cats) # drop='if_binary' but feature isn't binary - if to_drop[i] == -1: + if to_drop[i] == None: # set to cardinality to not drop from X_int to_drop[i] = n_cats n_values.append(n_cats) @@ -509,17 +509,17 @@ def inverse_transform(self, X): found_unknown = {} for i in range(n_features): - if self.drop is None: + if (self.drop is None) or (self.drop_idx_[i] is None): cats = self.categories_[i] else: - cats = np.delete(self.categories_[i], self.drop_idx_[i]) + cats = np.delete(self.categories_[i], int(self.drop_idx_[i])) n_categories = len(cats) # Only happens if there was a column with a unique # category. In this case we just fill the column with this # unique category value. 
if n_categories == 0: - X_tr[:, i] = self.categories_[i][self.drop_idx_[i]] + X_tr[:, i] = self.categories_[i][int(self.drop_idx_[i])] j += n_categories continue sub = X[:, j:j + n_categories] @@ -537,7 +537,8 @@ def inverse_transform(self, X): elif self.drop is not None: dropped = np.asarray(sub.sum(axis=1) == 0).flatten() if dropped.any(): - X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]] + X_tr[dropped, i] = \ + self.categories_[i][int(self.drop_idx_[i])] j += n_categories @@ -581,8 +582,8 @@ def get_feature_names(self, input_features=None): for i in range(len(cats)): names = [ input_features[i] + '_' + str(t) for t in cats[i]] - if (self.drop is not None) and (self.drop_idx_[i] != -1): - names.pop(self.drop_idx_[i]) + if (self.drop is not None) and (self.drop_idx_[i] is not None): + names.pop(int(self.drop_idx_[i])) feature_names.extend(names) return np.array(feature_names, dtype=object) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 2a872c2e06c49..175671ecbad33 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -388,8 +388,9 @@ def test_one_hot_encoder_pandas(): @pytest.mark.parametrize("drop, expected_names", [('first', ['x0_c', 'x2_b']), + ('if_binary', ['x0_c', 'x1_2', 'x2_b']), (['c', 2, 'b'], ['x0_b', 'x2_a'])], - ids=['first', 'manual']) + ids=['first', 'binary', 'manual']) def test_one_hot_encoder_feature_names_drop(drop, expected_names): X = [['c', 2, 'a'], ['b', 2, 'b']] @@ -409,7 +410,7 @@ def test_one_hot_encoder_drop_equals_if_binary(): expected = np.array([[1., 0., 0., 1.], [0., 1., 0., 0.], [0., 0., 1., 1.]]) - expected_drop_idx = np.array([-1, 0]) + expected_drop_idx = np.array([None, 0]) ohe = OneHotEncoder(drop='if_binary', sparse=False) result = ohe.fit_transform(X) @@ -423,7 +424,7 @@ def test_one_hot_encoder_drop_equals_if_binary(): expected = np.array([[1., 1.], [0., 1.], [0., 1.]]) - expected_drop_idx = 
np.array([0, -1]) + expected_drop_idx = np.array([0, None]) ohe = OneHotEncoder(drop='if_binary', sparse=False) result = ohe.fit_transform(X) @@ -662,9 +663,9 @@ def test_categories(density, drop): for drop_cat, drop_idx, cat_list in zip(drop, ohe_test.drop_idx_, ohe_test.categories_): - assert cat_list[drop_idx] == drop_cat + assert cat_list[int(drop_idx)] == drop_cat assert isinstance(ohe_test.drop_idx_, np.ndarray) - assert ohe_test.drop_idx_.dtype == np.int_ + assert ohe_test.drop_idx_.dtype == np.object @pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder]) From f83245bc2b3c87498251fa7d375bb3397baeb64e Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 2 Mar 2020 15:04:10 +0100 Subject: [PATCH 03/11] Fix lint issues. --- sklearn/preprocessing/_encoders.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index adc9075b6fb0c..f26ebe50bcb1a 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -227,9 +227,9 @@ class OneHotEncoder(_BaseEncoder): drop_idx_ : array of shape (n_features,) ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to be dropped for each feature. - ``drop_idx_[i] = None`` if no category is to be dropped from the feature - with index ``i``, e.g. when `drop='if_binary'` and the feature isn't - binary + ``drop_idx_[i] = None`` if no category is to be dropped from the + feature with index ``i``, e.g. when `drop='if_binary'` and the feature + isn't binary ``drop_idx_ = None`` if all the transformed features will be retained. 
@@ -354,7 +354,8 @@ def _compute_drop_idx(self): raise ValueError(msg) return np.array([np.where(cat_list == val)[0][0] for (val, cat_list) in - zip(self.drop, self.categories_)], dtype=np.object) + zip(self.drop, self.categories_)], + dtype=np.object) def fit(self, X, y=None): """ @@ -431,7 +432,7 @@ def transform(self, X): n_cats = len(cats) # drop='if_binary' but feature isn't binary - if to_drop[i] == None: + if to_drop[i] is None: # set to cardinality to not drop from X_int to_drop[i] = n_cats n_values.append(n_cats) From 75bbe4458a1c62620e4d6c790eee05f4811762a8 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 3 Mar 2020 17:35:11 +0100 Subject: [PATCH 04/11] Update sklearn/preprocessing/_encoders.py Co-Authored-By: Thomas J Fan --- sklearn/preprocessing/_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index f26ebe50bcb1a..03e6af04065fd 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -513,7 +513,7 @@ def inverse_transform(self, X): if (self.drop is None) or (self.drop_idx_[i] is None): cats = self.categories_[i] else: - cats = np.delete(self.categories_[i], int(self.drop_idx_[i])) + cats = np.delete(self.categories_[i], self.drop_idx_[i]) n_categories = len(cats) # Only happens if there was a column with a unique From 67cbc81dc229952c03429e80259fc10d6154a969 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 3 Mar 2020 17:39:30 +0100 Subject: [PATCH 05/11] Address @thomasjpfan comments. --- sklearn/preprocessing/_encoders.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 03e6af04065fd..5f0b32164db3f 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -520,7 +520,7 @@ def inverse_transform(self, X): # category. 
In this case we just fill the column with this # unique category value. if n_categories == 0: - X_tr[:, i] = self.categories_[i][int(self.drop_idx_[i])] + X_tr[:, i] = self.categories_[i][self.drop_idx_[i]] j += n_categories continue sub = X[:, j:j + n_categories] @@ -539,7 +539,7 @@ def inverse_transform(self, X): dropped = np.asarray(sub.sum(axis=1) == 0).flatten() if dropped.any(): X_tr[dropped, i] = \ - self.categories_[i][int(self.drop_idx_[i])] + self.categories_[i][self.drop_idx_[i]] j += n_categories @@ -584,7 +584,7 @@ def get_feature_names(self, input_features=None): names = [ input_features[i] + '_' + str(t) for t in cats[i]] if (self.drop is not None) and (self.drop_idx_[i] is not None): - names.pop(int(self.drop_idx_[i])) + names.pop(self.drop_idx_[i]) feature_names.extend(names) return np.array(feature_names, dtype=object) From 06cbf52bcdc43cccf77b7f2ed2cd8792a0103d38 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 3 Mar 2020 20:49:04 +0100 Subject: [PATCH 06/11] Remove newline. 
--- sklearn/preprocessing/_encoders.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 5f0b32164db3f..3de7a702d5c1e 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -538,8 +538,7 @@ def inverse_transform(self, X): elif self.drop is not None: dropped = np.asarray(sub.sum(axis=1) == 0).flatten() if dropped.any(): - X_tr[dropped, i] = \ - self.categories_[i][self.drop_idx_[i]] + X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]] j += n_categories From 93ae5c4a29545e291e280858accd242791c1aeba Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 5 Mar 2020 21:10:59 +0100 Subject: [PATCH 07/11] Update sklearn/preprocessing/_encoders.py Co-Authored-By: Guillaume Lemaitre --- sklearn/preprocessing/_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 3de7a702d5c1e..71574c5f1ea7d 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -229,7 +229,7 @@ class OneHotEncoder(_BaseEncoder): be dropped for each feature. ``drop_idx_[i] = None`` if no category is to be dropped from the feature with index ``i``, e.g. when `drop='if_binary'` and the feature - isn't binary + isn't binary. ``drop_idx_ = None`` if all the transformed features will be retained. From 166abc29fbc8ef572fdd6087d1838223db563c02 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 6 Mar 2020 09:56:37 +0100 Subject: [PATCH 08/11] Address @glemaitre comments. 
--- sklearn/preprocessing/_encoders.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 71574c5f1ea7d..dccb3bb4baf37 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -225,13 +225,13 @@ class OneHotEncoder(_BaseEncoder): (if any). drop_idx_ : array of shape (n_features,) - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to - be dropped for each feature. - ``drop_idx_[i] = None`` if no category is to be dropped from the - feature with index ``i``, e.g. when `drop='if_binary'` and the feature - isn't binary. - - ``drop_idx_ = None`` if all the transformed features will be retained. + - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category + to be dropped for each feature. + - ``drop_idx_[i] = None`` if no category is to be dropped from the + feature with index ``i``, e.g. when `drop='if_binary'` and the + feature isn't binary. + - ``drop_idx_ = None`` if all the transformed features will be + retained. See Also -------- @@ -510,7 +510,7 @@ def inverse_transform(self, X): found_unknown = {} for i in range(n_features): - if (self.drop is None) or (self.drop_idx_[i] is None): + if self.drop is None or self.drop_idx_[i] is None: cats = self.categories_[i] else: cats = np.delete(self.categories_[i], self.drop_idx_[i]) @@ -582,7 +582,7 @@ def get_feature_names(self, input_features=None): for i in range(len(cats)): names = [ input_features[i] + '_' + str(t) for t in cats[i]] - if (self.drop is not None) and (self.drop_idx_[i] is not None): + if self.drop is not None and self.drop_idx_[i] is not None: names.pop(self.drop_idx_[i]) feature_names.extend(names) From 9ff6e526463fd0d62994622b0d14245f27c151f2 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 9 Mar 2020 12:06:32 +0100 Subject: [PATCH 09/11] Only check fitted parameters. Add test for consistency when resetting drop.
--- sklearn/preprocessing/_encoders.py | 22 +++++++++----------- sklearn/preprocessing/tests/test_encoders.py | 16 ++++++++++++++ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index dccb3bb4baf37..86be9d335bd9e 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -422,7 +422,7 @@ def transform(self, X): n_samples, n_features = X_int.shape - if self.drop is not None: + if self.drop_idx_ is not None: to_drop = self.drop_idx_.copy() # We remove all the dropped categories from mask, and decrement all # categories that occur after them to avoid an empty column. @@ -485,16 +485,14 @@ def inverse_transform(self, X): n_samples, _ = X.shape n_features = len(self.categories_) - if self.drop is None: + if self.drop_idx_ is None: n_transformed_features = sum(len(cats) for cats in self.categories_) - elif isinstance(self.drop, str) and self.drop == 'if_binary': - n_transformed_features = sum(1 if len(cats) == 2 - else len(cats) - for cats in self.categories_) else: - n_transformed_features = sum(len(cats) - 1 - for cats in self.categories_) + n_transformed_features = sum( + len(cats) - 1 if to_drop is not None else len(cats) + for cats, to_drop in zip(self.categories_, self.drop_idx_) + ) # validate shape of passed X msg = ("Shape of the passed X data is not correct. Expected {0} " @@ -510,7 +508,7 @@ def inverse_transform(self, X): found_unknown = {} for i in range(n_features): - if self.drop is None or self.drop_idx_[i] is None: + if self.drop_idx_ is None or self.drop_idx_[i] is None: cats = self.categories_[i] else: cats = np.delete(self.categories_[i], self.drop_idx_[i]) @@ -533,9 +531,9 @@ def inverse_transform(self, X): if unknown.any(): found_unknown[i] = unknown # drop will either be None or handle_unknown will be error. 
If - # self.drop is not None, then we can safely assume that all of + # self.drop_idx_ is not None, then we can safely assume that all of # the nulls in each column are the dropped value - elif self.drop is not None: + elif self.drop_idx_ is not None: dropped = np.asarray(sub.sum(axis=1) == 0).flatten() if dropped.any(): X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]] @@ -582,7 +580,7 @@ def get_feature_names(self, input_features=None): for i in range(len(cats)): names = [ input_features[i] + '_' + str(t) for t in cats[i]] - if self.drop is not None and self.drop_idx_[i] is not None: + if self.drop_idx_ is not None and self.drop_idx_[i] is not None: names.pop(self.drop_idx_[i]) feature_names.extend(names) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 175671ecbad33..13f849d0d411d 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -268,6 +268,22 @@ def test_one_hot_encoder_inverse_if_binary(): assert_array_equal(ohe.inverse_transform(X_tr), X) +# check that resetting drop option without refitting does not throw an error +@pytest.mark.parametrize('drop', ['if_binary', 'first', None]) +@pytest.mark.parametrize('reset_drop', ['if_binary', 'first', None]) +def test_one_hot_encoder_drop_reset(drop, reset_drop): + X = np.array([['Male', 1], + ['Female', 3], + ['Female', 2]], dtype=object) + ohe = OneHotEncoder(drop=drop, sparse=False) + X_fit = ohe.fit(X) + X_tr = ohe.transform(X) + feature_names = ohe.get_feature_names() + ohe.set_params(drop=reset_drop) + assert_array_equal(ohe.inverse_transform(X_tr), X) + assert_array_equal(ohe.transform(X), X_tr) + assert_array_equal(ohe.get_feature_names(), feature_names) + @pytest.mark.parametrize("method", ['fit', 'fit_transform']) @pytest.mark.parametrize("X", [ [1, 2], From 3aaddb718594bb7a5b1f5c73c80dc31bd8897fc5 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 9 Mar 2020 12:10:09 +0100 
Subject: [PATCH 10/11] Fix lint error --- sklearn/preprocessing/tests/test_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 13f849d0d411d..4a74ac113f446 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -276,7 +276,7 @@ def test_one_hot_encoder_drop_reset(drop, reset_drop): ['Female', 3], ['Female', 2]], dtype=object) ohe = OneHotEncoder(drop=drop, sparse=False) - X_fit = ohe.fit(X) + ohe.fit(X) X_tr = ohe.transform(X) feature_names = ohe.get_feature_names() ohe.set_params(drop=reset_drop) From f15d841318cfdbc5bb9df198ac523a40276b5648 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 9 Mar 2020 15:28:18 +0100 Subject: [PATCH 11/11] Update sklearn/preprocessing/tests/test_encoders.py Co-Authored-By: Guillaume Lemaitre --- sklearn/preprocessing/tests/test_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 4a74ac113f446..7e23aa2d485c2 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -281,7 +281,7 @@ def test_one_hot_encoder_drop_reset(drop, reset_drop): feature_names = ohe.get_feature_names() ohe.set_params(drop=reset_drop) assert_array_equal(ohe.inverse_transform(X_tr), X) - assert_array_equal(ohe.transform(X), X_tr) + assert_allclose(ohe.transform(X), X_tr) assert_array_equal(ohe.get_feature_names(), feature_names) @pytest.mark.parametrize("method", ['fit', 'fit_transform'])