ENH Add check for non binary variables in OneHotEncoder. #16585

Merged
merged 12 commits on Mar 10, 2020
47 changes: 23 additions & 24 deletions sklearn/preprocessing/_encoders.py
@@ -225,13 +225,13 @@ class OneHotEncoder(_BaseEncoder):
(if any).

drop_idx_ : array of shape (n_features,)
``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to
be dropped for each feature.
``drop_idx_[i] = -1`` if no category is to be dropped from the feature
with index ``i``, e.g. when `drop='if_binary'` and the feature isn't
binary

``drop_idx_ = None`` if all the transformed features will be retained.
- ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category
to be dropped for each feature.
- ``drop_idx_[i] = None`` if no category is to be dropped from the
feature with index ``i``, e.g. when `drop='if_binary'` and the
feature isn't binary.
- ``drop_idx_ = None`` if all the transformed features will be
retained.

See Also
--------
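
A note on the new semantics: a per-feature entry of ``None`` means "keep every
category of that feature", while ``drop_idx_`` being ``None`` altogether means
nothing is dropped anywhere. A minimal sketch of the resulting attribute,
assuming the post-merge API and borrowing the small dataset used in the new
tests below:

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    # One binary feature ('Female'/'Male') and one feature with three values.
    X = np.array([['Male', 1], ['Female', 3], ['Female', 2]], dtype=object)

    ohe = OneHotEncoder(drop='if_binary').fit(X)
    print(ohe.drop_idx_)        # object array: [0 None]
    print(ohe.drop_idx_.dtype)  # object
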
@@ -316,10 +316,10 @@ def _compute_drop_idx(self):
return None
elif isinstance(self.drop, str):
if self.drop == 'first':
return np.zeros(len(self.categories_), dtype=np.int_)
return np.zeros(len(self.categories_), dtype=np.object)
elif self.drop == 'if_binary':
return np.array([0 if len(cats) == 2 else -1
for cats in self.categories_], dtype=np.int_)
return np.array([0 if len(cats) == 2 else None
for cats in self.categories_], dtype=np.object)
else:
msg = (
"Wrong input for parameter `drop`. Expected "
@@ -354,7 +354,8 @@ def _compute_drop_idx(self):
raise ValueError(msg)
return np.array([np.where(cat_list == val)[0][0]
for (val, cat_list) in
zip(self.drop, self.categories_)], dtype=np.int_)
zip(self.drop, self.categories_)],
dtype=np.object)

def fit(self, X, y=None):
"""
@@ -421,7 +422,7 @@ def transform(self, X):

n_samples, n_features = X_int.shape

if self.drop is not None:
if self.drop_idx_ is not None:
to_drop = self.drop_idx_.copy()
# We remove all the dropped categories from mask, and decrement all
# categories that occur after them to avoid an empty column.
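
Checking the fitted ``drop_idx_`` instead of the ``drop`` constructor parameter
means that resetting ``drop`` via ``set_params`` no longer affects an
already-fitted encoder. A minimal sketch, assuming the API at the time of this
PR:

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array([['Male', 1], ['Female', 3], ['Female', 2]], dtype=object)
    ohe = OneHotEncoder(drop='first').fit(X)
    before = ohe.transform(X).toarray()

    ohe.set_params(drop=None)          # only takes effect on the next fit
    after = ohe.transform(X).toarray()
    np.testing.assert_allclose(before, after)
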
@@ -431,7 +432,7 @@
n_cats = len(cats)

# drop='if_binary' but feature isn't binary
if to_drop[i] == -1:
if to_drop[i] is None:
# set to cardinality to not drop from X_int
to_drop[i] = n_cats
n_values.append(n_cats)
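
Setting the entry to the feature's cardinality keeps every integer code below
it, so nothing gets masked out and nothing is shifted. A standalone numpy
sketch of the mask-and-decrement trick, with made-up integer codes:

    import numpy as np

    X_int = np.array([[0, 2],
                      [1, 0],
                      [1, 1]])            # integer-encoded categories
    to_drop = np.array([0, 3])            # 3 == cardinality, i.e. drop nothing

    keep_cells = X_int != to_drop         # False only where a dropped category sits
    X_int[X_int > to_drop] -= 1           # shift higher codes down to stay contiguous
    print(keep_cells)                     # column 0 masks its first category, column 1 keeps all
    print(X_int)
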
@@ -484,16 +485,14 @@ def inverse_transform(self, X):

n_samples, _ = X.shape
n_features = len(self.categories_)
if self.drop is None:
if self.drop_idx_ is None:
n_transformed_features = sum(len(cats)
for cats in self.categories_)
elif isinstance(self.drop, str) and self.drop == 'if_binary':
n_transformed_features = sum(1 if len(cats) == 2
else len(cats)
for cats in self.categories_)
else:
n_transformed_features = sum(len(cats) - 1
for cats in self.categories_)
n_transformed_features = sum(
len(cats) - 1 if to_drop is not None else len(cats)
for cats, to_drop in zip(self.categories_, self.drop_idx_)
)

# validate shape of passed X
msg = ("Shape of the passed X data is not correct. Expected {0} "
@@ -509,7 +508,7 @@
found_unknown = {}

for i in range(n_features):
if self.drop is None:
if self.drop_idx_ is None or self.drop_idx_[i] is None:
cats = self.categories_[i]
else:
cats = np.delete(self.categories_[i], self.drop_idx_[i])
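
``inverse_transform`` now derives both the expected column count and each
feature's effective categories from ``drop_idx_`` alone. A standalone sketch
with the same hypothetical fit result as above:

    import numpy as np

    categories = [np.array(['Female', 'Male']), np.array([1, 2, 3])]
    drop_idx = np.array([0, None], dtype=object)   # e.g. from drop='if_binary'

    # One column per category, minus one for every feature that drops a category.
    n_transformed = sum(len(cats) - 1 if d is not None else len(cats)
                        for cats, d in zip(categories, drop_idx))
    print(n_transformed)  # 1 + 3 == 4

    # Effective categories per feature: keep them all when the entry is None.
    effective = [cats if d is None else np.delete(cats, d)
                 for cats, d in zip(categories, drop_idx)]
    print(effective)      # [array(['Male'], dtype='<U6'), array([1, 2, 3])]
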
@@ -532,9 +531,9 @@ def inverse_transform(self, X):
if unknown.any():
found_unknown[i] = unknown
# drop will either be None or handle_unknown will be error. If
# self.drop is not None, then we can safely assume that all of
# self.drop_idx_ is not None, then we can safely assume that all of
# the nulls in each column are the dropped value
elif self.drop is not None:
elif self.drop_idx_ is not None:
dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
if dropped.any():
X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]]
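
The all-zeros case: once a category has been dropped, a row of zeros in that
feature's block can only mean the dropped category, which is why
``drop_idx_[i]`` is looked up directly. A small sketch for a single binary
feature:

    import numpy as np

    sub = np.array([[1.], [0.], [0.]])     # one-column block for a binary feature
    categories = np.array(['Female', 'Male'])
    drop_idx_i = 0                         # 'Female' was dropped at fit time

    dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
    print(dropped)                  # [False  True  True]
    print(categories[drop_idx_i])   # the flagged rows decode to 'Female'
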
@@ -581,7 +580,7 @@ def get_feature_names(self, input_features=None):
for i in range(len(cats)):
names = [
input_features[i] + '_' + str(t) for t in cats[i]]
if self.drop is not None:
if self.drop_idx_ is not None and self.drop_idx_[i] is not None:
names.pop(self.drop_idx_[i])
feature_names.extend(names)
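
``get_feature_names`` applies the same per-feature check before popping a name.
A standalone sketch for one feature:

    # Build the names for one feature, then drop one only if a category was dropped.
    cats_i = ['Female', 'Male']
    drop_idx_i = 0                 # would be None if nothing were dropped here
    names = ['x0_' + str(t) for t in cats_i]
    if drop_idx_i is not None:
        names.pop(drop_idx_i)
    print(names)                   # ['x0_Male']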

27 changes: 22 additions & 5 deletions sklearn/preprocessing/tests/test_encoders.py
@@ -268,6 +268,22 @@ def test_one_hot_encoder_inverse_if_binary():
assert_array_equal(ohe.inverse_transform(X_tr), X)


# check that resetting drop option without refitting does not throw an error
@pytest.mark.parametrize('drop', ['if_binary', 'first', None])
@pytest.mark.parametrize('reset_drop', ['if_binary', 'first', None])
def test_one_hot_encoder_drop_reset(drop, reset_drop):
X = np.array([['Male', 1],
['Female', 3],
['Female', 2]], dtype=object)
ohe = OneHotEncoder(drop=drop, sparse=False)
ohe.fit(X)
X_tr = ohe.transform(X)
feature_names = ohe.get_feature_names()
ohe.set_params(drop=reset_drop)
assert_array_equal(ohe.inverse_transform(X_tr), X)
assert_allclose(ohe.transform(X), X_tr)
assert_array_equal(ohe.get_feature_names(), feature_names)

@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
@pytest.mark.parametrize("X", [
[1, 2],
@@ -388,8 +404,9 @@ def test_one_hot_encoder_pandas():

@pytest.mark.parametrize("drop, expected_names",
[('first', ['x0_c', 'x2_b']),
('if_binary', ['x0_c', 'x1_2', 'x2_b']),
(['c', 2, 'b'], ['x0_b', 'x2_a'])],
ids=['first', 'manual'])
ids=['first', 'binary', 'manual'])
def test_one_hot_encoder_feature_names_drop(drop, expected_names):
X = [['c', 2, 'a'],
['b', 2, 'b']]
@@ -409,7 +426,7 @@ def test_one_hot_encoder_drop_equals_if_binary():
expected = np.array([[1., 0., 0., 1.],
[0., 1., 0., 0.],
[0., 0., 1., 1.]])
expected_drop_idx = np.array([-1, 0])
expected_drop_idx = np.array([None, 0])

ohe = OneHotEncoder(drop='if_binary', sparse=False)
result = ohe.fit_transform(X)
@@ -423,7 +440,7 @@
expected = np.array([[1., 1.],
[0., 1.],
[0., 1.]])
expected_drop_idx = np.array([0, -1])
expected_drop_idx = np.array([0, None])

ohe = OneHotEncoder(drop='if_binary', sparse=False)
result = ohe.fit_transform(X)
@@ -662,9 +679,9 @@ def test_categories(density, drop):
for drop_cat, drop_idx, cat_list in zip(drop,
ohe_test.drop_idx_,
ohe_test.categories_):
assert cat_list[drop_idx] == drop_cat
assert cat_list[int(drop_idx)] == drop_cat
assert isinstance(ohe_test.drop_idx_, np.ndarray)
assert ohe_test.drop_idx_.dtype == np.int_
assert ohe_test.drop_idx_.dtype == np.object


@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])