Skip to content

[MRG] Fix incorrect error when OneHotEncoder.transform called prior to fit #12443

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Nov 12, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,8 @@
from .base import _transform_selected
from .label import _encode, _encode_check_unknown


range = six.moves.range


__all__ = [
'OneHotEncoder',
'OrdinalEncoder'
Expand Down Expand Up @@ -383,6 +381,12 @@ def _handle_deprecations(self, X):
"The 'categorical_features' keyword is deprecated in "
"version 0.20 and will be removed in 0.22. You can "
"use the ColumnTransformer instead.", DeprecationWarning)
# Set categories_ to empty list if no categorical columns exist
n_features = X.shape[1]
sel = np.zeros(n_features, dtype=bool)
sel[np.asarray(self.categorical_features)] = True
if sum(sel) == 0:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have not looked at the code in detail, but isn't this equivalent to,

not self.categorical_features.any()

or something similar?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`categorical_features` can be an array of indices or a boolean mask. This works in either case. I followed the pattern in `sklearn.preprocessing.base._transform_selected`.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a comment above this code block to explain the specific case that this code is checking for?

self.categories_ = []
self._legacy_mode = True
self._categorical_features = self.categorical_features
else:
Expand Down Expand Up @@ -591,6 +595,7 @@ def transform(self, X):
X_out : sparse matrix if sparse=True else a 2-d array
Transformed input.
"""
check_is_fitted(self, 'categories_')
if self._legacy_mode:
return _transform_selected(X, self._legacy_transform, self.dtype,
self._categorical_features,
Expand Down Expand Up @@ -683,7 +688,7 @@ def get_feature_names(self, input_features=None):
cats = self.categories_
if input_features is None:
input_features = ['x%d' % i for i in range(len(cats))]
elif(len(input_features) != len(self.categories_)):
elif len(input_features) != len(self.categories_):
raise ValueError(
"input_features should have length equal to number of "
"features ({}), got {}".format(len(self.categories_),
Expand Down
23 changes: 23 additions & 0 deletions sklearn/preprocessing/tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from scipy import sparse
import pytest

from sklearn.exceptions import NotFittedError
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_raises
Expand Down Expand Up @@ -250,6 +251,28 @@ def test_one_hot_encoder_handle_unknown():
assert_raises(ValueError, oh.fit, X)


def test_one_hot_encoder_not_fitted():
    """Check that ``transform`` on an unfitted encoder raises NotFittedError.

    The encoder is built with explicit ``categories`` but ``fit`` is never
    called, so ``transform`` must fail with the standard "not fitted"
    message.
    """
    encoder = OneHotEncoder(categories=['a', 'b'])
    samples = np.array([['a'], ['b']])
    expected_message = (
        "This OneHotEncoder instance is not fitted yet. "
        "Call 'fit' with appropriate arguments before using this method."
    )
    # ``match`` performs a regex search against the exception text.
    with pytest.raises(NotFittedError, match=expected_message):
        encoder.transform(samples)


def test_one_hot_encoder_no_categorical_features():
    """Check the encoder when the mask selects no categorical column.

    With an all-False ``categorical_features`` mask the test expects the
    data to pass through unchanged, ``get_feature_names`` to return an
    empty object array, and ``categories_`` to be an empty list.
    """
    data = np.array([[3, 2, 1], [0, 1, 1]], dtype='float64')
    all_false_mask = [False] * 3

    encoder = OneHotEncoder(categorical_features=all_false_mask)
    # Silence the warnings categories that the legacy keyword path may emit.
    with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
        transformed = encoder.fit_transform(data)

    no_feature_names = np.array([], dtype='object')
    assert_array_equal(data, transformed)
    assert_array_equal(encoder.get_feature_names(), no_feature_names)
    assert encoder.categories_ == []


@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
@pytest.mark.parametrize("input_dtype", [np.int32, np.float32, np.float64])
def test_one_hot_encoder_dtype(input_dtype, output_dtype):
Expand Down