Commits (34)
6ba1d47 ENH Uses pandas categories when encoding (thomasjpfan, Oct 29, 2019)
fe5908d TST Adds test for dropping (thomasjpfan, Oct 29, 2019)
e03ef4f DOC Adds comments (thomasjpfan, Oct 29, 2019)
20808c0 ENH Uses values instead (thomasjpfan, Oct 29, 2019)
6706e33 TST Ordered dtypes (thomasjpfan, Oct 29, 2019)
25c2bdf TST Uses pip to install pandas (thomasjpfan, Oct 29, 2019)
6cd2c62 TST Always install pandas (thomasjpfan, Oct 29, 2019)
131e266 DOC Adds to user guide (thomasjpfan, Oct 29, 2019)
24d9434 DOC Adds whats new (thomasjpfan, Oct 29, 2019)
3545088 DOC Update whats new (thomasjpfan, Oct 29, 2019)
b127441 Merge remote-tracking branch 'upstream/master' into dtype_ordering_ca… (thomasjpfan, Oct 29, 2019)
2b8fc5e REV Revert build (thomasjpfan, Oct 29, 2019)
bd061dc MNT Support python35 (thomasjpfan, Oct 29, 2019)
72a8ade ENH Enable support for non dataframes (thomasjpfan, Oct 29, 2019)
3d72106 DOC Includes fallback to auto in docs (thomasjpfan, Oct 29, 2019)
53cb537 DOC Update comment (thomasjpfan, Oct 29, 2019)
b82eec2 CLN Address some joris's comments (thomasjpfan, Oct 29, 2019)
3d6ff26 CLN Moves encoding to _encode (thomasjpfan, Oct 30, 2019)
3834b54 CLN Less diffs (thomasjpfan, Oct 30, 2019)
36ef623 DOC Adds comment regarding unknowns (thomasjpfan, Oct 30, 2019)
d76ddda TST Adds pandas to osx (thomasjpfan, Oct 30, 2019)
1a3e7ae DOC Remove dtypes in user guide (thomasjpfan, Oct 30, 2019)
e54b5ee ENH Only checks categories (thomasjpfan, Oct 30, 2019)
e0b69d8 DOC Adds tests for learnt categories (thomasjpfan, Oct 30, 2019)
cadf2e7 DOC Makes changes in _encode_check_unknown (thomasjpfan, Oct 30, 2019)
8433076 Merge remote-tracking branch 'upstream/master' into dtype_ordering_ca… (thomasjpfan, Oct 30, 2019)
4fb9784 CLN Smaller diff (thomasjpfan, Oct 30, 2019)
37faa32 Merge remote-tracking branch 'upstream/master' into dtype_ordering_ca… (thomasjpfan, Nov 6, 2019)
2407236 TST Adds test to check for nans (thomasjpfan, Nov 6, 2019)
b77b3e1 Merge remote-tracking branch 'upstream/master' into dtype_ordering_ca… (thomasjpfan, Jan 7, 2020)
9b34ac5 DOC Move to 0.23 (thomasjpfan, Jan 7, 2020)
21f6342 TST Adds test for categories with more categories than training (thomasjpfan, Jan 7, 2020)
bd4ec03 Merge remote-tracking branch 'upstream/master' into dtype_ordering_ca… (thomasjpfan, Apr 19, 2020)
6475789 Merge remote-tracking branch 'upstream/master' into dtype_ordering_ca… (thomasjpfan, Apr 29, 2020)
8 changes: 8 additions & 0 deletions doc/whats_new/v0.23.rst
@@ -481,6 +481,14 @@ Changelog
:mod:`sklearn.preprocessing`
............................

- |Feature| :class:`preprocessing.OneHotEncoder` and
:class:`preprocessing.OrdinalEncoder` now support `categories='dtypes'`,
which enables using pandas categorical dtypes for encoding. :pr:`15396` by
`Thomas Fan`_.

- |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at
transforming. :pr:`15762` by `Thomas Fan`_.

- |Feature| argument `drop` of :class:`preprocessing.OneHotEncoder`
will now accept value 'if_binary' and will drop the first category of
each feature with two categories. :pr:`16245`
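
As a usage illustration of the new option (editorial, not part of the diff): a minimal sketch assuming this branch of scikit-learn is installed. The column names and data are invented; the expected values in the comments mirror the new tests further down.

```python
# Minimal usage sketch of categories='dtypes'.
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

X = pd.DataFrame({
    # explicit category order 'b' < 'a' carried by the pandas dtype
    "col_str": pd.Categorical(["a", "b", "b", "a"],
                              categories=["b", "a"], ordered=True),
    # plain (non-categorical) column: falls back to the usual 'auto' behaviour
    "norm_float": [1.0, 2.0, 1.0, 1.0],
})

enc = OrdinalEncoder(categories="dtypes").fit(X)
print(enc.categories_)
# [array(['b', 'a'], dtype=object), array([1., 2.])]
print(enc.transform(X))
# col_str -> [1, 0, 0, 1], norm_float -> [0, 1, 0, 0] (column-wise)
```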
70 changes: 62 additions & 8 deletions sklearn/preprocessing/_encoders.py
@@ -8,6 +8,7 @@
from ..base import BaseEstimator, TransformerMixin
from ..utils import check_array
from ..utils.validation import check_is_fitted
from ..utils.validation import _assert_all_finite
from ..utils.validation import _deprecate_positional_args

from ._label import _encode, _encode_check_unknown
@@ -26,7 +27,34 @@ class _BaseEncoder(TransformerMixin, BaseEstimator):

"""

def _check_X(self, X):
def _check_categories_dtypes_equal(self, fit_cat_dict, trans_dtypes):
"""Return True if the categorical dtypes in fit_cat_dtypes are in
trans_dtypes."""
msg = "categorical dtypes in X must match the dtypes used when fitting"

# neither fit nor transform saw a DataFrame: nothing to check
if fit_cat_dict is None and trans_dtypes is None:
return

# one is None and the other is not
if fit_cat_dict is None or trans_dtypes is None:
raise ValueError(msg)

trans_dtypes_dict = {name: dtype for name, dtype in
trans_dtypes.items() if dtype.name == 'category'}

# names do not match
if set(trans_dtypes_dict) ^ set(fit_cat_dict):
raise ValueError(msg)

for name, fit_cat_dtype in fit_cat_dict.items():
try:
trans_cats = trans_dtypes[name].categories
except (AttributeError, KeyError):
raise ValueError(msg)

# both are categorical but their categories (or order) differ
if not fit_cat_dtype.categories.equals(trans_cats):
raise ValueError(msg)

def _check_X(self, X, is_fitting):
"""
Perform custom check_array:
- convert list of strings to object dtype
@@ -36,8 +64,24 @@ def _check_X(self, X):
constructed feature by feature to preserve the data types
of pandas DataFrame columns, as otherwise information is lost
and cannot be used, eg for the `categories_` attribute.

If categories == 'dtypes' and a pandas column has a categorical dtype,
the pandas Series itself is returned in this list.
"""
if self.categories == 'dtypes':
X_dtypes = getattr(X, "dtypes", None)
if not is_fitting: # transform
self._check_categories_dtypes_equal(
self._X_fit_cat_dict, X_dtypes)
else:
if X_dtypes is not None:
# only remember categorical dtypes
self._X_fit_cat_dict = {
name: dtype for name, dtype in X_dtypes.items()
if dtype.name == 'category'}
else:
# not a pandas dataframe
self._X_fit_cat_dict = None

if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):
# if not a dataframe, do normal check_array validation
X_temp = check_array(X, dtype=None)
@@ -57,8 +101,12 @@ def _check_X(self, X):

for i in range(n_features):
Xi = self._get_feature(X, feature_idx=i)
Xi = check_array(Xi, ensure_2d=False, dtype=None,
force_all_finite=needs_validation)
if self.categories == 'dtypes' and Xi.dtype.name == 'category':
# TODO: Change when missing value support is added
_assert_all_finite(Xi)
else:
Xi = check_array(Xi, ensure_2d=False, dtype=None,
force_all_finite=needs_validation)
X_columns.append(Xi)

return X_columns, n_samples, n_features
@@ -71,9 +119,9 @@ def _get_feature(self, X, feature_idx):
return X[:, feature_idx]

def _fit(self, X, handle_unknown='error'):
X_list, n_samples, n_features = self._check_X(X)
X_list, n_samples, n_features = self._check_X(X, is_fitting=True)

if self.categories != 'auto':
if self.categories not in ('auto', 'dtypes'):
if len(self.categories) != n_features:
raise ValueError("Shape mismatch: if categories is an array,"
" it has to be of shape (n_features,).")
@@ -82,7 +130,7 @@ def _fit(self, X, handle_unknown='error'):

for i in range(n_features):
Xi = X_list[i]
if self.categories == 'auto':
if self.categories in ('auto', 'dtypes'):
cats = _encode(Xi)
else:
cats = np.array(self.categories[i], dtype=Xi.dtype)
@@ -99,7 +147,7 @@ def _fit(self, X, handle_unknown='error'):
self.categories_.append(cats)

def _transform(self, X, handle_unknown='error'):
X_list, n_samples, n_features = self._check_X(X)
X_list, n_samples, n_features = self._check_X(X, is_fitting=False)

X_int = np.zeros((n_samples, n_features), dtype=np.int)
X_mask = np.ones((n_samples, n_features), dtype=np.bool)
@@ -179,6 +227,9 @@ class OneHotEncoder(_BaseEncoder):
Categories (unique values) per feature:

- 'auto' : Determine categories automatically from the training data.
- 'dtypes' : Use the pandas categorical dtypes to determine the categories.
For columns without a pandas categorical dtype, the categories are
determined automatically from the training data, as with 'auto'.
Review comment (Member):

maybe this could be just part of 'auto' as mentioned in the comments.

But if we keep the 'dtypes' option, we should probably raise an error if a non-df is passed in.

- list : ``categories[i]`` holds the categories expected in the ith
column. The passed categories should not mix strings and numeric
values within a single feature, and should be sorted in case of
@@ -613,6 +664,9 @@ class OrdinalEncoder(_BaseEncoder):
Categories (unique values) per feature:

- 'auto' : Determine categories automatically from the training data.
- 'dtypes' : Use the pandas categorical dtypes to determine the categories.
For columns without a pandas categorical dtype, the categories are
determined automatically from the training data, as with 'auto'.
- list : ``categories[i]`` holds the categories expected in the ith
column. The passed categories should not mix strings and numeric
values, and should be sorted in case of numeric values.
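
To make the transform-time validation concrete, a small sketch (editorial, not part of the diff) of the error raised by `_check_categories_dtypes_equal` when the categorical dtype seen at transform does not match the one seen at fit. It mirrors `test_encoder_pd_error_mismatch_dtype` and assumes this branch is installed; the data and column name are invented.

```python
# Sketch of the dtype check performed at transform time.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

fit_dtype = pd.api.types.CategoricalDtype(categories=["b", "a"], ordered=True)
X_fit = pd.DataFrame({"col_str": ["a", "b", "b", "a"]})
X_fit["col_str"] = X_fit["col_str"].astype(fit_dtype)

enc = OneHotEncoder(categories="dtypes", sparse=False).fit(X_fit)

# Same values, but a categorical dtype with a different category order.
other_dtype = pd.api.types.CategoricalDtype(categories=["a", "b"], ordered=True)
X_other = pd.DataFrame({"col_str": ["a", "b", "b", "a"]})
X_other["col_str"] = X_other["col_str"].astype(other_dtype)

try:
    enc.transform(X_other)
except ValueError as exc:
    print(exc)  # categorical dtypes in X must match the dtypes used when fitting
```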
27 changes: 26 additions & 1 deletion sklearn/preprocessing/_label.py
@@ -72,6 +72,16 @@ def _encode_python(values, uniques=None, encode=False):
return uniques


def _encode_categorical(values, uniques=None, encode=False):
"""Encode values of a pandas Series with a categorical dtype"""
if uniques is None:
uniques = values.cat.categories.values.copy()
if encode:
encoded = values.cat.codes
return uniques, encoded
return uniques


def _encode(values, uniques=None, encode=False, check_unknown=True):
"""Helper function to factorize (find uniques) and encode values.

@@ -82,9 +92,12 @@ def _encode(values, uniques=None, encode=False, check_unknown=True):
the case. The calling method needs to ensure this for all non-object
values.

If values is a pandas Series with a categorical dtype, then the encoding
is inferred from the dtype of the series.

Parameters
----------
values : array
values : array or pandas Series
Values to factorize or encode.
uniques : array, optional
If passed, uniques are not determined from passed values (this
@@ -117,6 +130,8 @@ def _encode(values, uniques=None, encode=False, check_unknown=True):
raise TypeError("Encoders require their input to be uniformly "
f"strings or numbers. Got {types}")
return res
elif values.dtype.name == "category":
return _encode_categorical(values, uniques=uniques, encode=encode)
else:
return _encode_numpy(values, uniques, encode,
check_unknown=check_unknown)
@@ -129,6 +144,10 @@ def _encode_check_unknown(values, uniques, return_mask=False):
Uses pure python method for object dtype, and numpy method for
all other dtypes.

If values is a pandas Series with a categorical dtype, we assume that the
dtype has already been checked to match the dtype seen at fit time and
that the values contain no missing values.

Parameters
----------
values : array
@@ -159,6 +178,12 @@ def _encode_check_unknown(values, uniques, return_mask=False):
return diff, valid_mask
else:
return diff
elif values.dtype.name == "category":
# Assume there are no missing values in the categorical column
diff = []
if return_mask:
return diff, np.ones(len(values), dtype=bool)
return diff
else:
unique_values = np.unique(values)
diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
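
For reference (plain pandas, independent of this diff), the two attributes `_encode_categorical` reads from a categorical Series: the dtype's full category list, in the dtype's own order, and the integer codes into that list.

```python
# Plain pandas illustration of what _encode_categorical relies on.
import pandas as pd

s = pd.Series(pd.Categorical(["a", "b", "b", "a"],
                             categories=["b", "c", "a"], ordered=True))

print(s.cat.categories.values)  # ['b' 'c' 'a']  (dtype order, unused 'c' kept)
print(s.cat.codes.values)       # [2 0 0 2]      (positions in the dtype's categories)
```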
177 changes: 177 additions & 0 deletions sklearn/preprocessing/tests/test_encoders.py
@@ -696,3 +696,180 @@ def test_encoders_does_not_support_none_values(Encoder):
with pytest.raises(TypeError, match="Encoders require their input to be "
"uniformly strings or numbers."):
Encoder().fit(values)


@pytest.mark.parametrize('encoder', [
OneHotEncoder(categories="dtypes"),
OrdinalEncoder(categories="dtypes")])
def test_encoder_pd_error_mismatch_dtype(encoder):
pd = pytest.importorskip('pandas')
msg = "categorical dtypes in X must match the dtypes used when fitting"

X_df_orig = pd.DataFrame({
'col_str': ['a', 'b', 'b', 'a'],
'col_int': [3, 2, 1, 2]}, columns=['col_str', 'col_int'])

enc_no_categories = encoder.fit(X_df_orig)

X_df0 = X_df_orig.copy()
X_df0['col_int'] = X_df0['col_int'].astype('category')

# X_df0 has a categorical column while the dataframe used for fitting does not
with pytest.raises(ValueError, match=msg):
enc_no_categories.transform(X_df0)

str_category = pd.api.types.CategoricalDtype(
categories=['b', 'a'], ordered=True)
X_df1 = X_df_orig.copy()
X_df1['col_str'] = X_df1['col_str'].astype(str_category)
X_df1['col_int'] = X_df1['col_int'].astype('category')

# X_df1 has categorical columns while the dataframe used for fitting does not
with pytest.raises(ValueError, match=msg):
enc_no_categories.transform(X_df1)

# Train encoder with categoricals
enc = encoder.fit(X_df1)

# col_str dtype not ordered correctly
X_df2 = X_df_orig.copy()
str_category_lex_ordered = pd.api.types.CategoricalDtype(
categories=['a', 'b'], ordered=True)
X_df2['col_str'] = X_df2['col_str'].astype(str_category_lex_ordered)
X_df2['col_int'] = X_df2['col_int'].astype('category')

with pytest.raises(ValueError, match=msg):
enc.transform(X_df2)

# col_int not a categorical dtype
X_df3 = X_df_orig.copy()
X_df3['col_int'] = X_df3['col_int'].astype(int)

with pytest.raises(ValueError, match=msg):
enc.transform(X_df3)

# number of features is not correct
X_df4 = pd.DataFrame({
'col_str': ['a', 'b', 'b', 'a']})

with pytest.raises(ValueError, match=msg):
enc.transform(X_df4)


@pytest.mark.parametrize("drop", ["first", None])
@pytest.mark.parametrize("dtype", [np.float64, np.int8])
def test_one_hot_encoder_pd_categories_mixed(drop, dtype):
pd = pytest.importorskip('pandas')

X_df = pd.DataFrame(
{'col_str': pd.Categorical(['a', 'b', 'b', 'a'],
categories=['b', 'a'], ordered=True),
'col_int': pd.Categorical([3, 2, 1, 2],
categories=[3, 1, 2], ordered=True),
'norm_float': [1.0, 2.0, 1.0, 1.0], # not a pandas category
'norm_str': ['z', 'd', 'z', 'd']}, # not a pandas category
columns=['col_str', 'col_int', 'norm_float', 'norm_str'])

ohe = OneHotEncoder(categories="dtypes",
dtype=dtype,
sparse=False,
drop=drop).fit(X_df)

assert_array_equal(ohe.categories_[0], ['b', 'a'])
assert_array_equal(ohe.categories_[1], [3, 1, 2])
assert_allclose(ohe.categories_[2], [1.0, 2.0])
assert_array_equal(ohe.categories_[3], ['d', 'z'])

expected_trans = np.array([
[0, 1, 1, 0, 0, 1, 0, 0, 1],
[1, 0, 0, 0, 1, 0, 1, 1, 0],
[1, 0, 0, 1, 0, 1, 0, 0, 1],
[0, 1, 0, 0, 1, 1, 0, 1, 0]], dtype=dtype)

if drop == 'first':
expected_trans = expected_trans[:, [1, 3, 4, 6, 8]]

X_trans = ohe.transform(X_df)
assert_allclose(X_trans, expected_trans)
X_inverse = ohe.inverse_transform(expected_trans)

assert_array_equal(X_inverse, X_df.values)


def test_one_hot_encoder_pd_categories_with_more_categories():
# pandas category contains more categories than in training
pd = pytest.importorskip('pandas')

X_df = pd.DataFrame(
{'col_str': pd.Categorical(['a', 'b', 'b', 'a'],
categories=['b', 'c', 'a'], ordered=True),
'col_int': pd.Categorical([3, 2, 1, 2],
categories=[3, 1, 2, 4], ordered=True)},
columns=['col_str', 'col_int'])

ohe = OneHotEncoder(categories="dtypes", sparse=False).fit(X_df)

assert_array_equal(ohe.categories_[0], ['b', 'c', 'a'])
assert_array_equal(ohe.categories_[1], [3, 1, 2, 4])

expected_trans = np.array([
[0, 0, 1, 1, 0, 0, 0],
[1, 0, 0, 0, 0, 1, 0],
[1, 0, 0, 0, 1, 0, 0],
[0, 0, 1, 0, 0, 1, 0]])

X_trans = ohe.transform(X_df)
assert_allclose(X_trans, expected_trans)
X_inverse = ohe.inverse_transform(expected_trans)

assert_array_equal(X_inverse, X_df.values)


@pytest.mark.parametrize("dtype", [np.float64, np.int8])
def test_ordinal_encoder_pd_categories_mixed(dtype):
pd = pytest.importorskip('pandas')

X_df = pd.DataFrame(
{'col_str': pd.Categorical(['a', 'b', 'b', 'a'],
categories=['b', 'a'], ordered=True),
'col_int': pd.Categorical([3, 2, 1, 2],
categories=[3, 1, 2], ordered=True),
'norm_float': [1.0, 2.0, 1.0, 1.0], # not a pandas category
'norm_str': ['z', 'd', 'z', 'd']}, # not a pandas category
columns=['col_str', 'col_int', 'norm_float', 'norm_str'])

oe = OrdinalEncoder(categories="dtypes", dtype=dtype).fit(X_df)

expected_trans = np.array([
[1, 0, 0, 1], # col_str
[0, 2, 1, 2], # col_int
[0, 1, 0, 0], # norm_float
[1, 0, 1, 0], # norm_str
], dtype=dtype).T

X_trans = oe.fit_transform(X_df)
assert_array_equal(oe.categories_[0], ['b', 'a'])
assert_array_equal(oe.categories_[1], [3, 1, 2])
assert_allclose(oe.categories_[2], [1.0, 2.0])
assert_array_equal(oe.categories_[3], ['d', 'z'])

assert_allclose(X_trans, expected_trans)
X_inverse = oe.inverse_transform(expected_trans)

assert_array_equal(X_inverse, X_df.values)


@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
def test_encoders_does_not_support_missing_values_in_pd_categories(Encoder):
pd = pytest.importorskip('pandas')

X_df = pd.DataFrame(
{'col_str': pd.Categorical(['a', 'b', 'b', 'a', np.nan],
categories=['b', 'a'], ordered=True),
'col_int': pd.Categorical([3, 2, 1, 2, np.nan],
categories=[3, 1, 2], ordered=True)},
columns=['col_str', 'col_int'])

enc = Encoder(categories="dtypes")
with pytest.raises(ValueError, match="Input contains NaN"):
enc.fit(X_df)