
Commit f763c61

cmarmo authored
ENH Add check for non binary variables in OneHotEncoder. (scikit-learn#16585)
Co-authored-by: Thomas J Fan <thomasjpfan@gmail.com>
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
1 parent b189bf6 commit f763c61

File tree: 2 files changed, +45 -29 lines changed

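In short, drop_idx_ becomes an object-dtype array that stores None (instead of -1) for features from which no category is dropped, and transform, inverse_transform, and get_feature_names now consult the fitted drop_idx_ rather than the drop parameter. A minimal sketch of the resulting behavior, assuming a scikit-learn build that includes this patch (sparse=False was the dense-output flag at the time):

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    # One binary feature ('Female'/'Male') and one non-binary feature (1/2/3).
    X = np.array([['Male', 1],
                  ['Female', 3],
                  ['Female', 2]], dtype=object)

    ohe = OneHotEncoder(drop='if_binary', sparse=False).fit(X)

    # The binary feature drops its first category (index 0); the non-binary
    # feature is marked with None and keeps all of its categories.
    print(ohe.drop_idx_)            # expected: [0 None], dtype=object
    print(ohe.get_feature_names())  # expected: ['x0_Male' 'x1_1' 'x1_2' 'x1_3']
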

sklearn/preprocessing/_encoders.py

Lines changed: 23 additions & 24 deletions
@@ -225,13 +225,13 @@ class OneHotEncoder(_BaseEncoder):
         (if any).
 
     drop_idx_ : array of shape (n_features,)
-        ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to
-        be dropped for each feature.
-        ``drop_idx_[i] = -1`` if no category is to be dropped from the feature
-        with index ``i``, e.g. when `drop='if_binary'` and the feature isn't
-        binary
-
-        ``drop_idx_ = None`` if all the transformed features will be retained.
+        - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category
+          to be dropped for each feature.
+        - ``drop_idx_[i] = None`` if no category is to be dropped from the
+          feature with index ``i``, e.g. when `drop='if_binary'` and the
+          feature isn't binary.
+        - ``drop_idx_ = None`` if all the transformed features will be
+          retained.
 
     See Also
     --------
@@ -316,10 +316,10 @@ def _compute_drop_idx(self):
             return None
         elif isinstance(self.drop, str):
             if self.drop == 'first':
-                return np.zeros(len(self.categories_), dtype=np.int_)
+                return np.zeros(len(self.categories_), dtype=np.object)
             elif self.drop == 'if_binary':
-                return np.array([0 if len(cats) == 2 else -1
-                                 for cats in self.categories_], dtype=np.int_)
+                return np.array([0 if len(cats) == 2 else None
+                                 for cats in self.categories_], dtype=np.object)
             else:
                 msg = (
                     "Wrong input for parameter `drop`. Expected "
@@ -354,7 +354,8 @@ def _compute_drop_idx(self):
                 raise ValueError(msg)
             return np.array([np.where(cat_list == val)[0][0]
                              for (val, cat_list) in
-                             zip(self.drop, self.categories_)], dtype=np.int_)
+                             zip(self.drop, self.categories_)],
+                            dtype=np.object)
 
     def fit(self, X, y=None):
         """
@@ -421,7 +422,7 @@ def transform(self, X):
 
         n_samples, n_features = X_int.shape
 
-        if self.drop is not None:
+        if self.drop_idx_ is not None:
             to_drop = self.drop_idx_.copy()
             # We remove all the dropped categories from mask, and decrement all
             # categories that occur after them to avoid an empty column.
@@ -431,7 +432,7 @@ def transform(self, X):
                 n_cats = len(cats)
 
                 # drop='if_binary' but feature isn't binary
-                if to_drop[i] == -1:
+                if to_drop[i] is None:
                     # set to cardinality to not drop from X_int
                     to_drop[i] = n_cats
                 n_values.append(n_cats)
@@ -484,16 +485,14 @@ def inverse_transform(self, X):
 
         n_samples, _ = X.shape
         n_features = len(self.categories_)
-        if self.drop is None:
+        if self.drop_idx_ is None:
             n_transformed_features = sum(len(cats)
                                          for cats in self.categories_)
-        elif isinstance(self.drop, str) and self.drop == 'if_binary':
-            n_transformed_features = sum(1 if len(cats) == 2
-                                         else len(cats)
-                                         for cats in self.categories_)
         else:
-            n_transformed_features = sum(len(cats) - 1
-                                         for cats in self.categories_)
+            n_transformed_features = sum(
+                len(cats) - 1 if to_drop is not None else len(cats)
+                for cats, to_drop in zip(self.categories_, self.drop_idx_)
+            )
 
         # validate shape of passed X
         msg = ("Shape of the passed X data is not correct. Expected {0} "
@@ -509,7 +508,7 @@ def inverse_transform(self, X):
         found_unknown = {}
 
         for i in range(n_features):
-            if self.drop is None:
+            if self.drop_idx_ is None or self.drop_idx_[i] is None:
                 cats = self.categories_[i]
             else:
                 cats = np.delete(self.categories_[i], self.drop_idx_[i])
@@ -532,9 +531,9 @@ def inverse_transform(self, X):
                 if unknown.any():
                     found_unknown[i] = unknown
             # drop will either be None or handle_unknown will be error. If
-            # self.drop is not None, then we can safely assume that all of
+            # self.drop_idx_ is not None, then we can safely assume that all of
             # the nulls in each column are the dropped value
-            elif self.drop is not None:
+            elif self.drop_idx_ is not None:
                 dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
                 if dropped.any():
                     X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]]
@@ -581,7 +580,7 @@ def get_feature_names(self, input_features=None):
         for i in range(len(cats)):
             names = [
                 input_features[i] + '_' + str(t) for t in cats[i]]
-            if self.drop is not None:
+            if self.drop_idx_ is not None and self.drop_idx_[i] is not None:
                 names.pop(self.drop_idx_[i])
             feature_names.extend(names)
 
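The combined effect of these changes on the public methods can be sketched with drop='if_binary' on the same data used by the feature-name test below; this example is illustrative, not part of the commit:

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = [['c', 2, 'a'],
         ['b', 2, 'b']]
    ohe = OneHotEncoder(drop='if_binary', sparse=False).fit(X)

    # Binary columns lose their first category; the single-category middle
    # column is not binary, so nothing is dropped from it.
    print(ohe.drop_idx_)            # expected: [0 None 0]
    print(ohe.get_feature_names())  # expected: ['x0_c' 'x1_2' 'x2_b']

    # inverse_transform now counts the retained columns per feature from
    # drop_idx_, so the round trip recovers the original values.
    X_tr = ohe.transform(X)
    print(ohe.inverse_transform(X_tr))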

sklearn/preprocessing/tests/test_encoders.py

Lines changed: 22 additions & 5 deletions
@@ -268,6 +268,22 @@ def test_one_hot_encoder_inverse_if_binary():
     assert_array_equal(ohe.inverse_transform(X_tr), X)
 
 
+# check that resetting drop option without refitting does not throw an error
+@pytest.mark.parametrize('drop', ['if_binary', 'first', None])
+@pytest.mark.parametrize('reset_drop', ['if_binary', 'first', None])
+def test_one_hot_encoder_drop_reset(drop, reset_drop):
+    X = np.array([['Male', 1],
+                  ['Female', 3],
+                  ['Female', 2]], dtype=object)
+    ohe = OneHotEncoder(drop=drop, sparse=False)
+    ohe.fit(X)
+    X_tr = ohe.transform(X)
+    feature_names = ohe.get_feature_names()
+    ohe.set_params(drop=reset_drop)
+    assert_array_equal(ohe.inverse_transform(X_tr), X)
+    assert_allclose(ohe.transform(X), X_tr)
+    assert_array_equal(ohe.get_feature_names(), feature_names)
+
 @pytest.mark.parametrize("method", ['fit', 'fit_transform'])
 @pytest.mark.parametrize("X", [
     [1, 2],
@@ -388,8 +404,9 @@ def test_one_hot_encoder_pandas():
 
 @pytest.mark.parametrize("drop, expected_names",
                          [('first', ['x0_c', 'x2_b']),
+                          ('if_binary', ['x0_c', 'x1_2', 'x2_b']),
                           (['c', 2, 'b'], ['x0_b', 'x2_a'])],
-                         ids=['first', 'manual'])
+                         ids=['first', 'binary', 'manual'])
 def test_one_hot_encoder_feature_names_drop(drop, expected_names):
     X = [['c', 2, 'a'],
          ['b', 2, 'b']]
@@ -409,7 +426,7 @@ def test_one_hot_encoder_drop_equals_if_binary():
     expected = np.array([[1., 0., 0., 1.],
                          [0., 1., 0., 0.],
                          [0., 0., 1., 1.]])
-    expected_drop_idx = np.array([-1, 0])
+    expected_drop_idx = np.array([None, 0])
 
     ohe = OneHotEncoder(drop='if_binary', sparse=False)
     result = ohe.fit_transform(X)
@@ -423,7 +440,7 @@ def test_one_hot_encoder_drop_equals_if_binary():
     expected = np.array([[1., 1.],
                          [0., 1.],
                          [0., 1.]])
-    expected_drop_idx = np.array([0, -1])
+    expected_drop_idx = np.array([0, None])
 
     ohe = OneHotEncoder(drop='if_binary', sparse=False)
     result = ohe.fit_transform(X)
@@ -662,9 +679,9 @@ def test_categories(density, drop):
         for drop_cat, drop_idx, cat_list in zip(drop,
                                                 ohe_test.drop_idx_,
                                                 ohe_test.categories_):
-            assert cat_list[drop_idx] == drop_cat
+            assert cat_list[int(drop_idx)] == drop_cat
         assert isinstance(ohe_test.drop_idx_, np.ndarray)
-        assert ohe_test.drop_idx_.dtype == np.int_
+        assert ohe_test.drop_idx_.dtype == np.object
 
 
 @pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
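The new test_one_hot_encoder_drop_reset covers the motivating scenario: because the methods consult the fitted drop_idx_, resetting drop via set_params after fitting no longer changes or breaks the encoder's output until it is refitted. A short sketch of that usage, again assuming the patched scikit-learn:

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array([['Male', 1],
                  ['Female', 3],
                  ['Female', 2]], dtype=object)
    ohe = OneHotEncoder(drop='if_binary', sparse=False).fit(X)
    X_tr = ohe.transform(X)

    # Changing `drop` after fitting leaves drop_idx_ (the fitted state) intact,
    # so transform/inverse_transform keep working consistently until refit.
    ohe.set_params(drop=None)
    np.testing.assert_allclose(ohe.transform(X), X_tr)
    print(ohe.inverse_transform(X_tr))   # recovers the original X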

0 commit comments
