Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 33 additions & 14 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class _BaseEncoder(BaseEstimator, TransformerMixin):

"""

def _check_X(self, X):
def _check_X(self, X, is_fit=False):
"""
Perform custom check_array:
- convert list of strings to object dtype
Expand Down Expand Up @@ -61,10 +61,38 @@ def _check_X(self, X):
n_samples, n_features = X.shape
X_columns = []

if is_fit:
self.features_dtype = []
for i in range(n_features):
Xi = self._get_feature(X, feature_idx=i)
Xi = check_array(Xi, ensure_2d=False, dtype=None,
force_all_finite=needs_validation)

if Xi.dtype.name == 'category':
# categorical dtype; do not want to convert to an array,
# check if there are no nans (otherwise done in check_array())
if Xi.isna().any():
raise ValueError("Input contains NaN")
else:
Xi = check_array(Xi, ensure_2d=False, dtype=None,
force_all_finite=needs_validation)

if is_fit:
# save the dtype or exact categories if category dtype
if Xi.dtype.name == 'category':
f_dtype = Xi.cat.categories
else:
f_dtype = Xi.dtype
self.features_dtype.append(f_dtype)
else:
# transform, check if dtype is the same as it was passed by fit
if not (Xi.dtype == self.features_dtype[i]):
if Xi.dtype.name == 'category':
# check if categories are the same
if not (Xi.cat.categories ==
self.features_dtype[i]).all():
raise ValueError("""Categories of
the features were
different in fit() and
in the transform()""")
X_columns.append(Xi)

return X_columns, n_samples, n_features
Expand All @@ -77,7 +105,7 @@ def _get_feature(self, X, feature_idx):
return X[:, feature_idx]

def _fit(self, X, handle_unknown='error'):
X_list, n_samples, n_features = self._check_X(X)
X_list, n_samples, n_features = self._check_X(X, is_fit=True)

if self._categories != 'auto':
if len(self._categories) != n_features:
Expand Down Expand Up @@ -114,7 +142,6 @@ def _transform(self, X, handle_unknown='error'):
Xi = X_list[i]
diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
return_mask=True)

if not np.all(valid_mask):
if handle_unknown == 'error':
msg = ("Found unknown categories {0} in column {1}"
Expand Down Expand Up @@ -675,22 +702,14 @@ def _legacy_transform(self, X):

def _transform_new(self, X):
"""New implementation assuming categorical input"""
X_temp = check_array(X, dtype=None)
if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
X = check_array(X, dtype=np.object)
else:
X = X_temp

n_samples, n_features = X.shape

X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
n_samples, n_features = X_int.shape

if self.drop is not None:
to_drop = self.drop_idx_.reshape(1, -1)

# We remove all the dropped categories from mask, and decrement all
# categories that occur after them to avoid an empty column.

keep_cells = X_int != to_drop
X_mask &= keep_cells
X_int[X_int > to_drop] -= 1
Expand Down
26 changes: 25 additions & 1 deletion sklearn/preprocessing/label.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,17 @@ def _encode_python(values, uniques=None, encode=False):
return uniques


def _encode_categorical(values, uniques=None, encode=False):
# only used in _encode below, see docstring there for details
cat_values = values.array
if uniques is None:
uniques = cat_values.categories
if encode:
# check if all values.categories are included in uniques
return uniques, cat_values.codes
else:
return uniques

def _encode(values, uniques=None, encode=False):
"""Helper function to factorize (find uniques) and encode values.

Expand Down Expand Up @@ -104,8 +115,11 @@ def _encode(values, uniques=None, encode=False):
try:
res = _encode_python(values, uniques, encode)
except TypeError:
raise TypeError("argument must be a string or number")
raise TypeError("argument must be a string or a number")
return res
elif values.dtype.name == "category":
# pandas dtype category
return _encode_categorical(values, uniques, encode)
else:
return _encode_numpy(values, uniques, encode)

Expand Down Expand Up @@ -147,6 +161,16 @@ def _encode_check_unknown(values, uniques, return_mask=False):
return diff, valid_mask
else:
return diff
elif values.dtype.name == "category":
# if category Pandas datatype is used there should never be a diff
cat_values = values.array
uniques_set = set(uniques)
diff = list(set(cat_values) - uniques_set)
if return_mask:
valid_mask = np.ones(len(values), dtype=bool)
return diff, valid_mask
else:
return diff
else:
unique_values = np.unique(values)
diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
Expand Down
38 changes: 38 additions & 0 deletions sklearn/preprocessing/tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,44 @@ def test_one_hot_encoder_dtype_pandas(output_dtype):
assert_array_equal(oh.fit(X_df).transform(X_df), X_expected)


@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
def test_one_hot_encoder_categorical_dtype(method):
    """Fitting on a pandas categorical column keeps its declared categories.

    Column "A" is a categorical whose declared categories include an unused
    one and are not in sorted order; column "B" holds plain strings.
    """
    pd = pytest.importorskip('pandas')
    declared = ["b", "a", "c", "d"]
    column_a = pd.Categorical(["a", "b", "c"], categories=declared)
    frame = pd.DataFrame({"A": column_a, "B": ["a", "c", "c"]})

    encoder = OneHotEncoder()
    fit_method = getattr(encoder, method)
    fit_method(frame)

    learned = encoder.categories_
    # Categorical column: learned categories match the dtype's categories.
    assert_array_equal(learned[0], frame['A'].cat.categories)
    # Plain column: learned categories match the observed unique values.
    assert_array_equal(learned[1], frame['B'].unique())


@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
def test_categorical_nans(method):
    """A missing entry in a pandas categorical column raises ValueError."""
    pd = pytest.importorskip('pandas')
    with_missing = pd.Categorical(["a", None, "c"],
                                  categories=["b", "a", "c", "d"])
    frame = pd.DataFrame({"A": with_missing, "B": ["a", "c", "c"]})

    encoder = OneHotEncoder()
    fit_method = getattr(encoder, method)
    with pytest.raises(ValueError, match="Input contains NaN"):
        fit_method(frame)


def test_categorical_same_category_fit_transform():
    """transform() rejects a categorical whose categories differ from fit().

    Both frames declare the same set of categories for column "A", but in a
    different order; transforming the second after fitting on the first is
    expected to raise.
    """
    pd = pytest.importorskip('pandas')
    fitted_column = pd.Categorical(["a", "b"],
                                   categories=["b", "a", "c", "d"])
    reordered_column = pd.Categorical(["b", "a"],
                                      categories=["a", "c", "d", "b"])
    frame_fit = pd.DataFrame({"A": fitted_column, "B": ["a", "c"]})
    frame_transform = pd.DataFrame({"A": reordered_column, "B": ["a", "c"]})

    encoder = OneHotEncoder()
    encoder.fit(frame_fit)
    with pytest.raises(ValueError, match="Categories of"):
        encoder.transform(frame_transform)

def test_one_hot_encoder_set_params():
X = np.array([[1, 2]]).T
oh = OneHotEncoder()
Expand Down