From 6ba1d470f7daf22890ee3ef73ae0d4f4eb1dc986 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 11:24:54 -0400 Subject: [PATCH 01/28] ENH Uses pandas categories when encoding --- sklearn/preprocessing/_encoders.py | 82 +++++--- sklearn/preprocessing/tests/test_encoders.py | 185 +++++++++++++++++++ 2 files changed, 240 insertions(+), 27 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index fe3d3aa7f7d25..98f7dfc88afd6 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -9,6 +9,7 @@ from ..utils import check_array from ..utils.fixes import _argmax from ..utils.validation import check_is_fitted +from ..utils.validation import _assert_all_finite from ._label import _encode, _encode_check_unknown @@ -38,6 +39,20 @@ def _check_X(self, X): and cannot be used, eg for the `categories_` attribute. """ + if self.categories == 'dtypes': + if not hasattr(X, 'dtypes'): + raise TypeError("X must be a dataframe when " + "categories='dtypes'") + X_dtypes = getattr(X, 'dtypes') + + if hasattr(self, "_X_fit_dtypes"): + if (len(self._X_fit_dtypes) != len(X_dtypes) or + not all(self._X_fit_dtypes == X_dtypes)): + raise ValueError("X.dtypes must match the dtypes used " + "when fitting") + else: + self._X_fit_dtypes = X_dtypes + if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2): # if not a dataframe, do normal check_array validation X_temp = check_array(X, dtype=None) @@ -57,8 +72,12 @@ def _check_X(self, X): for i in range(n_features): Xi = self._get_feature(X, feature_idx=i) - Xi = check_array(Xi, ensure_2d=False, dtype=None, - force_all_finite=needs_validation) + if self.categories == 'dtypes' and hasattr(Xi, 'cat'): + # TODO: Change missing value support is added + _assert_all_finite(Xi) + else: + Xi = check_array(Xi, ensure_2d=False, dtype=None, + force_all_finite=needs_validation) X_columns.append(Xi) return X_columns, n_samples, n_features @@ -73,7 +92,7 @@ def 
_get_feature(self, X, feature_idx): def _fit(self, X, handle_unknown='error'): X_list, n_samples, n_features = self._check_X(X) - if self.categories != 'auto': + if self.categories not in ('auto', 'dtypes'): if len(self.categories) != n_features: raise ValueError("Shape mismatch: if categories is an array," " it has to be of shape (n_features,).") @@ -84,6 +103,11 @@ def _fit(self, X, handle_unknown='error'): Xi = X_list[i] if self.categories == 'auto': cats = _encode(Xi) + elif self.categories == 'dtypes': + if hasattr(Xi, "cat"): + cats = Xi.cat.categories.to_numpy() + else: + cats = _encode(Xi) else: cats = np.array(self.categories[i], dtype=Xi.dtype) if Xi.dtype != object: @@ -114,32 +138,36 @@ def _transform(self, X, handle_unknown='error'): for i in range(n_features): Xi = X_list[i] - diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i], - return_mask=True) - if not np.all(valid_mask): - if handle_unknown == 'error': - msg = ("Found unknown categories {0} in column {1}" - " during transform".format(diff, i)) - raise ValueError(msg) - else: - # Set the problematic rows to an acceptable value and - # continue `The rows are marked `X_mask` and will be - # removed later. - X_mask[:, i] = valid_mask - # cast Xi into the largest string type necessary - # to handle different lengths of numpy strings - if (self.categories_[i].dtype.kind in ('U', 'S') - and self.categories_[i].itemsize > Xi.itemsize): - Xi = Xi.astype(self.categories_[i].dtype) + if self.categories == 'dtypes' and hasattr(Xi, "cat"): + encoded = Xi.cat.codes + else: + diff, valid_mask = _encode_check_unknown(Xi, + self.categories_[i], + return_mask=True) + if not np.all(valid_mask): + if handle_unknown == 'error': + msg = ("Found unknown categories {0} in column {1}" + " during transform".format(diff, i)) + raise ValueError(msg) else: - Xi = Xi.copy() - - Xi[~valid_mask] = self.categories_[i][0] - # We use check_unknown=False, since _encode_check_unknown was - # already called above. 
- _, encoded = _encode(Xi, self.categories_[i], encode=True, - check_unknown=False) + # Set the problematic rows to an acceptable value and + # continue `The rows are marked `X_mask` and will be + # removed later. + X_mask[:, i] = valid_mask + # cast Xi into the largest string type necessary + # to handle different lengths of numpy strings + if (self.categories_[i].dtype.kind in ('U', 'S') + and self.categories_[i].itemsize > Xi.itemsize): + Xi = Xi.astype(self.categories_[i].dtype) + else: + Xi = Xi.copy() + + Xi[~valid_mask] = self.categories_[i][0] + # We use check_unknown=False, since _encode_check_unknown was + # already called above. + _, encoded = _encode(Xi, self.categories_[i], encode=True, + check_unknown=False) X_int[:, i] = encoded return X_int, X_mask diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 78590f40ffba5..20d95b6d588c1 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -6,6 +6,7 @@ from scipy import sparse import pytest +from sklearn.base import clone from sklearn.exceptions import NotFittedError from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_allclose @@ -637,3 +638,187 @@ def test_categories(density, drop): @pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder]) def test_encoders_has_categorical_tags(Encoder): assert 'categorical' in Encoder()._get_tags()['X_types'] + + +@pytest.mark.parametrize("is_sparse", [True, False]) +def test_one_hot_encoder_pd_categories(is_sparse): + pd = pytest.importorskip('pandas') + + X_df = pd.DataFrame({ + 'col_str': ['a', 'b', 'b', 'a'], + 'col_int': [3, 2, 1, 2]}) + + str_category = pd.api.types.CategoricalDtype( + categories=['b', 'a']) + int_category = pd.api.types.CategoricalDtype( + categories=[3, 1, 2], ordered=True) + + X_df['col_str'] = X_df['col_str'].astype(str_category) + X_df['col_int'] = 
X_df['col_int'].astype(int_category) + + ohe = OneHotEncoder(categories='dtypes', sparse=is_sparse) + + expected_trans = np.array([ + [0, 1, 1, 0, 0], + [1, 0, 0, 0, 1], + [1, 0, 0, 1, 0], + [0, 1, 0, 0, 1]], dtype=np.float64) + + X_trans = ohe.fit_transform(X_df) + if is_sparse: + X_trans = X_trans.toarray() + + assert_allclose(X_trans, expected_trans) + + X_inverse = ohe.inverse_transform(expected_trans) + + assert_array_equal(X_inverse, X_df.to_numpy()) + + +@pytest.mark.parametrize('encoder', [ + OneHotEncoder(categories="dtypes"), + OrdinalEncoder(categories="dtypes")]) +def test_encoder_pd_error_mismatch_dtype(encoder): + pd = pytest.importorskip('pandas') + + X_df = pd.DataFrame({ + 'col_str': ['a', 'b', 'b', 'a'], + 'col_int': [3, 2, 1, 2]}) + + str_category = pd.api.types.CategoricalDtype( + categories=['b', 'a'], ordered=True) + + X_df['col_str'] = X_df['col_str'].astype(str_category) + X_df['col_int'] = X_df['col_int'].astype('category') + + enc = clone(encoder).fit(X_df) + + # col_str dtype not ordered correctly + X_df2 = X_df.copy() + str_category_lex_ordered = pd.api.types.CategoricalDtype( + categories=['a', 'b']) + X_df2['col_str'] = X_df2['col_str'].astype(str_category_lex_ordered) + + msg = "X.dtypes must match the dtypes used when fitting" + with pytest.raises(ValueError, match=msg): + enc.transform(X_df2) + + # col_int not a categorical dtype + X_df3 = X_df.copy() + X_df3['col_int'] = X_df3['col_int'].astype(int) + + with pytest.raises(ValueError, match=msg): + enc.transform(X_df2) + + # number of features is not correct + X_df4 = pd.DataFrame({ + 'col_str': ['a', 'b', 'b', 'a']}) + + with pytest.raises(ValueError, match=msg): + enc.transform(X_df4) + + X_np = X_df.to_numpy() + msg = "X must be a dataframe when categories='dtypes'" + with pytest.raises(TypeError, match=msg): + enc.transform(X_np) + + +@pytest.mark.parametrize("is_sparse", [True, False]) +def test_one_hot_encoder_pd_categories_mixed(is_sparse): + pd = 
pytest.importorskip('pandas') + + X_df = pd.DataFrame({ + 'col_str': ['a', 'b', 'b', 'a'], + 'col_int': [3, 2, 1, 2], + 'norm_float': [1.0, 2.0, 1.0, 1.0], # not a pandas category + 'norm_str': ['z', 'd', 'z', 'd']} # not a pandas category + ) + + str_category = pd.api.types.CategoricalDtype( + categories=['b', 'a']) + int_category = pd.api.types.CategoricalDtype( + categories=[3, 1, 2], ordered=True) + + X_df['col_str'] = X_df['col_str'].astype(str_category) + X_df['col_int'] = X_df['col_int'].astype(int_category) + + ohe = OneHotEncoder(categories="dtypes", sparse=is_sparse).fit(X_df) + + expected_trans = np.array([ + [0, 1, 1, 0, 0, 1, 0, 0, 1], + [1, 0, 0, 0, 1, 0, 1, 1, 0], + [1, 0, 0, 1, 0, 1, 0, 0, 1], + [0, 1, 0, 0, 1, 1, 0, 1, 0]], dtype=np.float64) + + X_trans = ohe.fit_transform(X_df) + if is_sparse: + X_trans = X_trans.toarray() + + assert_allclose(X_trans, expected_trans) + X_inverse = ohe.inverse_transform(expected_trans) + + assert_array_equal(X_inverse, X_df.to_numpy()) + + +def test_ordinal_encoder_pd_categories_mixed(): + pd = pytest.importorskip('pandas') + + X_df = pd.DataFrame({ + 'col_str': ['a', 'b', 'b', 'a'], + 'col_int': [3, 2, 1, 2], + 'norm_float': [1.0, 2.0, 1.0, 1.0], # not a pandas category + 'norm_str': ['z', 'd', 'z', 'd']} # not a pandas category + ) + + str_category = pd.api.types.CategoricalDtype( + categories=['b', 'a']) + int_category = pd.api.types.CategoricalDtype( + categories=[3, 1, 2], ordered=True) + + X_df['col_str'] = X_df['col_str'].astype(str_category) + X_df['col_int'] = X_df['col_int'].astype(int_category) + + ohe = OrdinalEncoder(categories="dtypes").fit(X_df) + + expected_trans = np.array([ + [1, 0, 0, 1], # col_str + [0, 2, 1, 2], # col_int + [0, 1, 0, 0], # norm_float + [1, 0, 1, 0], # norm_str + ], dtype=np.float64).T + + X_trans = ohe.fit_transform(X_df) + + assert_allclose(X_trans, expected_trans) + X_inverse = ohe.inverse_transform(expected_trans) + + assert_array_equal(X_inverse, X_df.to_numpy()) + + +def 
test_ordinal_encoder_pd_categories(): + pd = pytest.importorskip('pandas') + + X_df = pd.DataFrame({ + 'col_str': ['a', 'b', 'b', 'a'], + 'col_int': [3, 2, 1, 2]}) + + str_category = pd.api.types.CategoricalDtype( + categories=['b', 'a']) + int_category = pd.api.types.CategoricalDtype( + categories=[3, 1, 2], ordered=True) + + X_df['col_str'] = X_df['col_str'].astype(str_category) + X_df['col_int'] = X_df['col_int'].astype(int_category) + + ohe = OrdinalEncoder(categories='dtypes') + + expected_trans = np.array([ + [1, 0, 0, 1], # col_str + [0, 2, 1, 2], # col_int + ], dtype=np.float64).T + + X_trans = ohe.fit_transform(X_df) + assert_allclose(X_trans, expected_trans) + X_inverse = ohe.inverse_transform(expected_trans) + + assert_array_equal(X_inverse, X_df.to_numpy()) From fe5908debea30eedacdb6ee5996c0ebabbfed004 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 11:38:50 -0400 Subject: [PATCH 02/28] TST Adds test for dropping --- sklearn/preprocessing/tests/test_encoders.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 20d95b6d588c1..3f859ec3c917f 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -641,7 +641,8 @@ def test_encoders_has_categorical_tags(Encoder): @pytest.mark.parametrize("is_sparse", [True, False]) -def test_one_hot_encoder_pd_categories(is_sparse): +@pytest.mark.parametrize("drop", ["first", None]) +def test_one_hot_encoder_pd_categories(is_sparse, drop): pd = pytest.importorskip('pandas') X_df = pd.DataFrame({ @@ -656,7 +657,7 @@ def test_one_hot_encoder_pd_categories(is_sparse): X_df['col_str'] = X_df['col_str'].astype(str_category) X_df['col_int'] = X_df['col_int'].astype(int_category) - ohe = OneHotEncoder(categories='dtypes', sparse=is_sparse) + ohe = OneHotEncoder(categories='dtypes', sparse=is_sparse, drop=drop) expected_trans = 
np.array([ [0, 1, 1, 0, 0], @@ -664,6 +665,9 @@ def test_one_hot_encoder_pd_categories(is_sparse): [1, 0, 0, 1, 0], [0, 1, 0, 0, 1]], dtype=np.float64) + if drop == 'first': + expected_trans = expected_trans[:, [1, 3, 4]] + X_trans = ohe.fit_transform(X_df) if is_sparse: X_trans = X_trans.toarray() @@ -724,7 +728,8 @@ def test_encoder_pd_error_mismatch_dtype(encoder): @pytest.mark.parametrize("is_sparse", [True, False]) -def test_one_hot_encoder_pd_categories_mixed(is_sparse): +@pytest.mark.parametrize("drop", ["first", None]) +def test_one_hot_encoder_pd_categories_mixed(is_sparse, drop): pd = pytest.importorskip('pandas') X_df = pd.DataFrame({ @@ -742,7 +747,9 @@ def test_one_hot_encoder_pd_categories_mixed(is_sparse): X_df['col_str'] = X_df['col_str'].astype(str_category) X_df['col_int'] = X_df['col_int'].astype(int_category) - ohe = OneHotEncoder(categories="dtypes", sparse=is_sparse).fit(X_df) + ohe = OneHotEncoder(categories="dtypes", + sparse=is_sparse, + drop=drop).fit(X_df) expected_trans = np.array([ [0, 1, 1, 0, 0, 1, 0, 0, 1], @@ -750,6 +757,9 @@ def test_one_hot_encoder_pd_categories_mixed(is_sparse): [1, 0, 0, 1, 0, 1, 0, 0, 1], [0, 1, 0, 0, 1, 1, 0, 1, 0]], dtype=np.float64) + if drop == 'first': + expected_trans = expected_trans[:, [1, 3, 4, 6, 8]] + X_trans = ohe.fit_transform(X_df) if is_sparse: X_trans = X_trans.toarray() From e03ef4f90e1613225a80693d4cbddd92af1cd163 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 11:41:04 -0400 Subject: [PATCH 03/28] DOC Adds comments --- sklearn/preprocessing/_encoders.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 98f7dfc88afd6..0f5a3794730c3 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -37,7 +37,8 @@ def _check_X(self, X): constructed feature by feature to preserve the data types of pandas DataFrame columns, as otherwise information is lost 
and cannot be used, eg for the `categories_` attribute. - + If categories == 'dtype' and the pandas column is a category, + the pandas series will be return in this list. """ if self.categories == 'dtypes': if not hasattr(X, 'dtypes'): @@ -45,7 +46,7 @@ def _check_X(self, X): "categories='dtypes'") X_dtypes = getattr(X, 'dtypes') - if hasattr(self, "_X_fit_dtypes"): + if hasattr(self, "_X_fit_dtypes"): # fitted if (len(self._X_fit_dtypes) != len(X_dtypes) or not all(self._X_fit_dtypes == X_dtypes)): raise ValueError("X.dtypes must match the dtypes used " @@ -140,6 +141,7 @@ def _transform(self, X, handle_unknown='error'): Xi = X_list[i] if self.categories == 'dtypes' and hasattr(Xi, "cat"): + # categorical dtypes contain no unknown values encoded = Xi.cat.codes else: diff, valid_mask = _encode_check_unknown(Xi, From 20808c05fa36ae28a158255562fe30798924dde9 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 11:53:33 -0400 Subject: [PATCH 04/28] ENH Uses values instead --- sklearn/preprocessing/_encoders.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 0f5a3794730c3..5546de5e3d2b8 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -106,7 +106,7 @@ def _fit(self, X, handle_unknown='error'): cats = _encode(Xi) elif self.categories == 'dtypes': if hasattr(Xi, "cat"): - cats = Xi.cat.categories.to_numpy() + cats = Xi.cat.categories.values.copy() else: cats = _encode(Xi) else: @@ -159,8 +159,8 @@ def _transform(self, X, handle_unknown='error'): X_mask[:, i] = valid_mask # cast Xi into the largest string type necessary # to handle different lengths of numpy strings - if (self.categories_[i].dtype.kind in ('U', 'S') - and self.categories_[i].itemsize > Xi.itemsize): + if (self.categories_[i].dtype.kind in ('U', 'S') and + self.categories_[i].itemsize > Xi.itemsize): Xi = Xi.astype(self.categories_[i].dtype) else: Xi 
= Xi.copy() From 6706e339f0909ddad19ec7422b4e97b5a2d61748 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 12:41:48 -0400 Subject: [PATCH 05/28] TST Ordered dtypes --- sklearn/preprocessing/tests/test_encoders.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 3f859ec3c917f..e578adbb76318 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -650,7 +650,7 @@ def test_one_hot_encoder_pd_categories(is_sparse, drop): 'col_int': [3, 2, 1, 2]}) str_category = pd.api.types.CategoricalDtype( - categories=['b', 'a']) + categories=['b', 'a'], ordered=True) int_category = pd.api.types.CategoricalDtype( categories=[3, 1, 2], ordered=True) @@ -676,7 +676,7 @@ def test_one_hot_encoder_pd_categories(is_sparse, drop): X_inverse = ohe.inverse_transform(expected_trans) - assert_array_equal(X_inverse, X_df.to_numpy()) + assert_array_equal(X_inverse, X_df.values) @pytest.mark.parametrize('encoder', [ @@ -700,7 +700,7 @@ def test_encoder_pd_error_mismatch_dtype(encoder): # col_str dtype not ordered correctly X_df2 = X_df.copy() str_category_lex_ordered = pd.api.types.CategoricalDtype( - categories=['a', 'b']) + categories=['a', 'b'], ordered=True) X_df2['col_str'] = X_df2['col_str'].astype(str_category_lex_ordered) msg = "X.dtypes must match the dtypes used when fitting" @@ -721,7 +721,7 @@ def test_encoder_pd_error_mismatch_dtype(encoder): with pytest.raises(ValueError, match=msg): enc.transform(X_df4) - X_np = X_df.to_numpy() + X_np = X_df.values msg = "X must be a dataframe when categories='dtypes'" with pytest.raises(TypeError, match=msg): enc.transform(X_np) @@ -740,7 +740,7 @@ def test_one_hot_encoder_pd_categories_mixed(is_sparse, drop): ) str_category = pd.api.types.CategoricalDtype( - categories=['b', 'a']) + categories=['b', 'a'], ordered=True) int_category = 
pd.api.types.CategoricalDtype( categories=[3, 1, 2], ordered=True) @@ -767,7 +767,7 @@ def test_one_hot_encoder_pd_categories_mixed(is_sparse, drop): assert_allclose(X_trans, expected_trans) X_inverse = ohe.inverse_transform(expected_trans) - assert_array_equal(X_inverse, X_df.to_numpy()) + assert_array_equal(X_inverse, X_df.values) def test_ordinal_encoder_pd_categories_mixed(): @@ -781,7 +781,7 @@ def test_ordinal_encoder_pd_categories_mixed(): ) str_category = pd.api.types.CategoricalDtype( - categories=['b', 'a']) + categories=['b', 'a'], ordered=True) int_category = pd.api.types.CategoricalDtype( categories=[3, 1, 2], ordered=True) @@ -802,7 +802,7 @@ def test_ordinal_encoder_pd_categories_mixed(): assert_allclose(X_trans, expected_trans) X_inverse = ohe.inverse_transform(expected_trans) - assert_array_equal(X_inverse, X_df.to_numpy()) + assert_array_equal(X_inverse, X_df.values) def test_ordinal_encoder_pd_categories(): @@ -813,7 +813,7 @@ def test_ordinal_encoder_pd_categories(): 'col_int': [3, 2, 1, 2]}) str_category = pd.api.types.CategoricalDtype( - categories=['b', 'a']) + categories=['b', 'a'], ordered=True) int_category = pd.api.types.CategoricalDtype( categories=[3, 1, 2], ordered=True) @@ -831,4 +831,4 @@ def test_ordinal_encoder_pd_categories(): assert_allclose(X_trans, expected_trans) X_inverse = ohe.inverse_transform(expected_trans) - assert_array_equal(X_inverse, X_df.to_numpy()) + assert_array_equal(X_inverse, X_df.values) From 25c2bdf3393e34ac5360e3fd079032172f348e16 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 13:41:53 -0400 Subject: [PATCH 06/28] TST Uses pip to install pandas --- build_tools/azure/install.sh | 10 ++++++---- sklearn/preprocessing/_encoders.py | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 250003372aef9..8bf21a0bfb488 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -30,10 +30,6 @@ if [[ 
"$DISTRIB" == "conda" ]]; then TO_INSTALL="$TO_INSTALL nomkl" fi - if [[ -n "$PANDAS_VERSION" ]]; then - TO_INSTALL="$TO_INSTALL pandas=$PANDAS_VERSION" - fi - if [[ -n "$PYAMG_VERSION" ]]; then TO_INSTALL="$TO_INSTALL pyamg=$PYAMG_VERSION" fi @@ -71,6 +67,12 @@ if [[ "$DISTRIB" == "conda" ]]; then pip install pytest-xdist fi + if [[ "$PANDAS_VERSION" == "*" ]]; then + pip install pandas + else + pip install pandas=="$PANDAS_VERSION" + fi + elif [[ "$DISTRIB" == "ubuntu" ]]; then sudo add-apt-repository --remove ppa:ubuntu-toolchain-r/test sudo apt-get install python3-scipy python3-matplotlib libatlas3-base libatlas-base-dev libatlas-dev python3-virtualenv diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 5546de5e3d2b8..4e21d32a5c99a 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -206,6 +206,7 @@ class OneHotEncoder(_BaseEncoder): Categories (unique values) per feature: - 'auto' : Determine categories automatically from the training data. + - 'dtypes' : Uses pandas categorical dtype to encode categories. - list : ``categories[i]`` holds the categories expected in the ith column. The passed categories should not mix strings and numeric values within a single feature, and should be sorted in case of @@ -573,6 +574,7 @@ class OrdinalEncoder(_BaseEncoder): Categories (unique values) per feature: - 'auto' : Determine categories automatically from the training data. + - 'dtypes' : Uses pandas categorical dtype to encode categories. - list : ``categories[i]`` holds the categories expected in the ith column. The passed categories should not mix strings and numeric values, and should be sorted in case of numeric values. 
From 6cd2c62406b14822e4ff4b5b76dac36650d56305 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 14:27:56 -0400 Subject: [PATCH 07/28] TST Always install pandas --- azure-pipelines.yml | 1 - build_tools/azure/install.sh | 6 +----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index aaab848945ccf..00ed3c8196414 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -20,7 +20,6 @@ jobs: INSTALL_MKL: 'false' NUMPY_VERSION: '1.11.0' SCIPY_VERSION: '0.17.0' - PANDAS_VERSION: '*' CYTHON_VERSION: '*' PYTEST_VERSION: '*' PILLOW_VERSION: '4.0.0' diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 8bf21a0bfb488..bb04f7f8dbcea 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -67,11 +67,7 @@ if [[ "$DISTRIB" == "conda" ]]; then pip install pytest-xdist fi - if [[ "$PANDAS_VERSION" == "*" ]]; then - pip install pandas - else - pip install pandas=="$PANDAS_VERSION" - fi + python -m pip install pandas elif [[ "$DISTRIB" == "ubuntu" ]]; then sudo add-apt-repository --remove ppa:ubuntu-toolchain-r/test From 131e2663bdaeaadb7c211085df28ae8eda35a28b Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 14:48:47 -0400 Subject: [PATCH 08/28] DOC Adds to user guide --- doc/modules/preprocessing.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 3e41c592fbbdc..e1e095258ea04 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -574,6 +574,27 @@ When this paramenter is not None, ``handle_unknown`` must be set to See :ref:`dict_feature_extraction` for categorical features that are represented as a dict, not as scalars. 
+When using pandas dataframe with categorical dtypes, :class:`OneHotEncoder` and +:class:`OrdinalEncoder` contain a `categories='dtypes'` option to use the +encoding provided by the pandas category:: + + >>> import pandas as pd + >>> from pandas.api.types import CategoricalDtype + >>> X_df = pd.DataFrame({ + ... 'col_str': ['a', 'b', 'b', 'a'], + ... 'col_int': [3, 2, 1, 2]}) + >>> str_category = CategoricalDtype(categories=['b', 'a'], ordered=True) + >>> int_category = CategoricalDtype(categories=[3, 1, 2], ordered=True) + >>> X_df['col_str'] = X_df['col_str'].astype(str_category) + >>> X_df['col_int'] = X_df['col_int'].astype(int_category) + >>> enc = preprocessing.OneHotEncoder(categories='dtypes').fit(X_df) + >>> enc.transform(X_df).toarray() + array([[0., 1., 1., 0., 0.], + [1., 0., 0., 0., 1.], + [1., 0., 0., 1., 0.], + [0., 1., 0., 0., 1.]]) + + .. _preprocessing_discretization: Discretization From 24d9434f87d48108c4716b9b6cd3fa6b56944f69 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 14:52:56 -0400 Subject: [PATCH 09/28] DOC Adds whats new --- doc/whats_new/v0.22.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index cf20726949cfc..5095c530d5acb 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -590,6 +590,11 @@ Changelog :mod:`sklearn.preprocessing` ............................ +- |Feature| :class:`preprocessing.OneHotEncoder` and + :class:`preprocessing.OrdinalEncoder` now support `categories='dtypes'`, + which enables support for using pandas categorical dtypes. :pr:`15396` by + `Thomas Fan`_. 
+ - |Enhancement| Avoid unnecessary data copy when fitting preprocessors :class:`preprocessing.StandardScaler`, :class:`preprocessing.MinMaxScaler`, :class:`preprocessing.MaxAbsScaler`, :class:`preprocessing.RobustScaler` From 3545088c84c8ea53b7b5264c6ba9a8f1711e26f6 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 14:53:45 -0400 Subject: [PATCH 10/28] DOC Update whats new --- doc/whats_new/v0.22.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 5095c530d5acb..8848514f8aae4 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -592,7 +592,7 @@ Changelog - |Feature| :class:`preprocessing.OneHotEncoder` and :class:`preprocessing.OrdinalEncoder` now support `categories='dtypes'`, - which enables support for using pandas categorical dtypes. :pr:`15396` by + which enables using pandas categorical dtypes for encoding. :pr:`15396` by `Thomas Fan`_. - |Enhancement| Avoid unnecessary data copy when fitting preprocessors From 2b8fc5ea69c6b2ed8dfeceaeb1ff4563b4a0a7f8 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 15:18:48 -0400 Subject: [PATCH 11/28] REV Revert build --- azure-pipelines.yml | 1 + build_tools/azure/install.sh | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 00ed3c8196414..aaab848945ccf 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -20,6 +20,7 @@ jobs: INSTALL_MKL: 'false' NUMPY_VERSION: '1.11.0' SCIPY_VERSION: '0.17.0' + PANDAS_VERSION: '*' CYTHON_VERSION: '*' PYTEST_VERSION: '*' PILLOW_VERSION: '4.0.0' diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index bb04f7f8dbcea..250003372aef9 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -30,6 +30,10 @@ if [[ "$DISTRIB" == "conda" ]]; then TO_INSTALL="$TO_INSTALL nomkl" fi + if [[ -n "$PANDAS_VERSION" ]]; then + TO_INSTALL="$TO_INSTALL 
pandas=$PANDAS_VERSION" + fi + if [[ -n "$PYAMG_VERSION" ]]; then TO_INSTALL="$TO_INSTALL pyamg=$PYAMG_VERSION" fi @@ -67,8 +71,6 @@ if [[ "$DISTRIB" == "conda" ]]; then pip install pytest-xdist fi - python -m pip install pandas - elif [[ "$DISTRIB" == "ubuntu" ]]; then sudo add-apt-repository --remove ppa:ubuntu-toolchain-r/test sudo apt-get install python3-scipy python3-matplotlib libatlas3-base libatlas-base-dev libatlas-dev python3-virtualenv From bd061dc52915924f6d79bd54d08d60389048f4a3 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 16:18:07 -0400 Subject: [PATCH 12/28] MNT Support python35 --- doc/modules/preprocessing.rst | 2 +- sklearn/preprocessing/_encoders.py | 21 ++++++-- sklearn/preprocessing/tests/test_encoders.py | 57 ++++++++++++-------- 3 files changed, 54 insertions(+), 26 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index e1e095258ea04..976e10eea8f2f 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -582,7 +582,7 @@ encoding provided by the the pandas category:: >>> from pandas.api.types import CategoricalDtype >>> X_df = pd.DataFrame({ ... 'col_str': ['a', 'b', 'b', 'a'], - ... 'col_int': [3, 2, 1, 2]}) + ... 
'col_int': [3, 2, 1, 2]}, columns=['col_str', 'col_int']) >>> str_category = CategoricalDtype(categories=['b', 'a'], ordered=True) >>> int_category = CategoricalDtype(categories=[3, 1, 2], ordered=True) >>> X_df['col_str'] = X_df['col_str'].astype(str_category) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 4e21d32a5c99a..a301686b116fb 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -27,6 +27,20 @@ class _BaseEncoder(TransformerMixin, BaseEstimator): """ + def _check_dtypes_equal(self, dtypes_l, dtypes_r): + """Returns True if the dtypes.""" + if len(dtypes_l) != len(dtypes_r): + return False + + for dtype_l, dtype_r in zip(dtypes_l, dtypes_r): + dtype_l_is_cat = hasattr(dtype_l, 'categories') + dtype_r_is_cat = hasattr(dtype_r, 'categories') + if ((dtype_l_is_cat and not dtype_r_is_cat) + or (not dtype_l_is_cat and dtype_r_is_cat) + or (dtype_l != dtype_r)): + return False + return True + def _check_X(self, X): """ Perform custom check_array: @@ -44,11 +58,10 @@ def _check_X(self, X): if not hasattr(X, 'dtypes'): raise TypeError("X must be a dataframe when " "categories='dtypes'") - X_dtypes = getattr(X, 'dtypes') - + X_dtypes = X.dtypes if hasattr(self, "_X_fit_dtypes"): # fitted - if (len(self._X_fit_dtypes) != len(X_dtypes) or - not all(self._X_fit_dtypes == X_dtypes)): + if not self._check_dtypes_equal(self._X_fit_dtypes, + X_dtypes): raise ValueError("X.dtypes must match the dtypes used " "when fitting") else: diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index e578adbb76318..d9bb71b811453 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -641,13 +641,13 @@ def test_encoders_has_categorical_tags(Encoder): @pytest.mark.parametrize("is_sparse", [True, False]) -@pytest.mark.parametrize("drop", ["first", None]) +@pytest.mark.parametrize("drop", [None, 
"first"]) def test_one_hot_encoder_pd_categories(is_sparse, drop): pd = pytest.importorskip('pandas') X_df = pd.DataFrame({ 'col_str': ['a', 'b', 'b', 'a'], - 'col_int': [3, 2, 1, 2]}) + 'col_int': [3, 2, 1, 2]}, columns=['col_str', 'col_int']) str_category = pd.api.types.CategoricalDtype( categories=['b', 'a'], ordered=True) @@ -684,35 +684,50 @@ def test_one_hot_encoder_pd_categories(is_sparse, drop): OrdinalEncoder(categories="dtypes")]) def test_encoder_pd_error_mismatch_dtype(encoder): pd = pytest.importorskip('pandas') + msg = "X.dtypes must match the dtypes used when fitting" - X_df = pd.DataFrame({ + X_df_orig = pd.DataFrame({ 'col_str': ['a', 'b', 'b', 'a'], - 'col_int': [3, 2, 1, 2]}) + 'col_int': [3, 2, 1, 2]}, columns=['col_str', 'col_int']) + + enc_no_categories = clone(encoder).fit(X_df_orig) + + X_df0 = X_df_orig.copy() + X_df0['col_int'] = X_df0['col_int'].astype('category') + + # X_df0 has categories while the trained dataframe does not + with pytest.raises(ValueError, match=msg): + enc_no_categories.transform(X_df0) str_category = pd.api.types.CategoricalDtype( categories=['b', 'a'], ordered=True) + X_df1 = X_df_orig.copy() + X_df1['col_str'] = X_df1['col_str'].astype(str_category) + X_df1['col_int'] = X_df1['col_int'].astype('category') - X_df['col_str'] = X_df['col_str'].astype(str_category) - X_df['col_int'] = X_df['col_int'].astype('category') + # X_df1 has categories while the trained dataframe does not + with pytest.raises(ValueError, match=msg): + enc_no_categories.transform(X_df1) - enc = clone(encoder).fit(X_df) + # Train encoder with categoricals + enc = clone(encoder).fit(X_df1) # col_str dtype not ordered correctly - X_df2 = X_df.copy() + X_df2 = X_df_orig.copy() str_category_lex_ordered = pd.api.types.CategoricalDtype( categories=['a', 'b'], ordered=True) X_df2['col_str'] = X_df2['col_str'].astype(str_category_lex_ordered) + X_df2['col_int'] = X_df2['col_int'].astype('category') - msg = "X.dtypes must match the dtypes used when 
fitting" with pytest.raises(ValueError, match=msg): enc.transform(X_df2) # col_int not a categorical dtype - X_df3 = X_df.copy() + X_df3 = X_df_orig.copy() X_df3['col_int'] = X_df3['col_int'].astype(int) with pytest.raises(ValueError, match=msg): - enc.transform(X_df2) + enc.transform(X_df3) # number of features is not correct X_df4 = pd.DataFrame({ @@ -721,7 +736,7 @@ def test_encoder_pd_error_mismatch_dtype(encoder): with pytest.raises(ValueError, match=msg): enc.transform(X_df4) - X_np = X_df.values + X_np = X_df_orig.values msg = "X must be a dataframe when categories='dtypes'" with pytest.raises(TypeError, match=msg): enc.transform(X_np) @@ -732,12 +747,12 @@ def test_encoder_pd_error_mismatch_dtype(encoder): def test_one_hot_encoder_pd_categories_mixed(is_sparse, drop): pd = pytest.importorskip('pandas') - X_df = pd.DataFrame({ - 'col_str': ['a', 'b', 'b', 'a'], - 'col_int': [3, 2, 1, 2], - 'norm_float': [1.0, 2.0, 1.0, 1.0], # not a pandas category - 'norm_str': ['z', 'd', 'z', 'd']} # not a pandas category - ) + X_df = pd.DataFrame( + {'col_str': ['a', 'b', 'b', 'a'], + 'col_int': [3, 2, 1, 2], + 'norm_float': [1.0, 2.0, 1.0, 1.0], # not a pandas category + 'norm_str': ['z', 'd', 'z', 'd']}, # not a pandas category + columns=['col_str', 'col_int', 'norm_float', 'norm_str']) str_category = pd.api.types.CategoricalDtype( categories=['b', 'a'], ordered=True) @@ -777,8 +792,8 @@ def test_ordinal_encoder_pd_categories_mixed(): 'col_str': ['a', 'b', 'b', 'a'], 'col_int': [3, 2, 1, 2], 'norm_float': [1.0, 2.0, 1.0, 1.0], # not a pandas category - 'norm_str': ['z', 'd', 'z', 'd']} # not a pandas category - ) + 'norm_str': ['z', 'd', 'z', 'd']}, # not a pandas category + columns=['col_str', 'col_int', 'norm_float', 'norm_str']) str_category = pd.api.types.CategoricalDtype( categories=['b', 'a'], ordered=True) @@ -810,7 +825,7 @@ def test_ordinal_encoder_pd_categories(): X_df = pd.DataFrame({ 'col_str': ['a', 'b', 'b', 'a'], - 'col_int': [3, 2, 1, 2]}) + 'col_int': 
[3, 2, 1, 2]}, columns=['col_str', 'col_int']) str_category = pd.api.types.CategoricalDtype( categories=['b', 'a'], ordered=True) From 72a8ade59d68e2286202abd9782371f7a4f5f488 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 16:54:39 -0400 Subject: [PATCH 13/28] ENH Enable support for non dataframes --- sklearn/preprocessing/_encoders.py | 11 ++++++----- sklearn/preprocessing/tests/test_encoders.py | 5 ----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index a301686b116fb..c4685497ffe92 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -29,6 +29,10 @@ class _BaseEncoder(TransformerMixin, BaseEstimator): def _check_dtypes_equal(self, dtypes_l, dtypes_r): """Returns True if the dtypes.""" + if ((dtypes_l is None and dtypes_r is not None) or + (dtypes_l is not None and dtypes_r is None)): + return False + if len(dtypes_l) != len(dtypes_r): return False @@ -55,10 +59,7 @@ def _check_X(self, X): the pandas series will be return in this list. 
""" if self.categories == 'dtypes': - if not hasattr(X, 'dtypes'): - raise TypeError("X must be a dataframe when " - "categories='dtypes'") - X_dtypes = X.dtypes + X_dtypes = getattr(X, "dtypes", None) if hasattr(self, "_X_fit_dtypes"): # fitted if not self._check_dtypes_equal(self._X_fit_dtypes, X_dtypes): @@ -87,7 +88,7 @@ def _check_X(self, X): for i in range(n_features): Xi = self._get_feature(X, feature_idx=i) if self.categories == 'dtypes' and hasattr(Xi, 'cat'): - # TODO: Change missing value support is added + # TODO: Change when missing value support is added _assert_all_finite(Xi) else: Xi = check_array(Xi, ensure_2d=False, dtype=None, diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index d9bb71b811453..15e7cab1d36d5 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -736,11 +736,6 @@ def test_encoder_pd_error_mismatch_dtype(encoder): with pytest.raises(ValueError, match=msg): enc.transform(X_df4) - X_np = X_df_orig.values - msg = "X must be a dataframe when categories='dtypes'" - with pytest.raises(TypeError, match=msg): - enc.transform(X_np) - @pytest.mark.parametrize("is_sparse", [True, False]) @pytest.mark.parametrize("drop", ["first", None]) From 3d7210603d87712094ec1376c1ea78698e7a8676 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 17:00:55 -0400 Subject: [PATCH 14/28] DOC Includes fallback to auto in docs --- sklearn/preprocessing/_encoders.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index c4685497ffe92..ad817b48c3580 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -56,7 +56,7 @@ def _check_X(self, X): of pandas DataFrame columns, as otherwise information is lost and cannot be used, eg for the `categories_` attribute. 
If categories == 'dtype' and the pandas column is a category, - the pandas series will be return in this list. + the pandas series will be returned in this list. """ if self.categories == 'dtypes': X_dtypes = getattr(X, "dtypes", None) @@ -220,7 +220,9 @@ class OneHotEncoder(_BaseEncoder): Categories (unique values) per feature: - 'auto' : Determine categories automatically from the training data. - - 'dtypes' : Uses pandas categorical dtype to encode categories. + - 'dtypes' : Uses pandas categorical dtype to encode categories. For + non pandas categorical data, the categories are automatically + determined from the training data. - list : ``categories[i]`` holds the categories expected in the ith column. The passed categories should not mix strings and numeric values within a single feature, and should be sorted in case of @@ -588,7 +590,9 @@ class OrdinalEncoder(_BaseEncoder): Categories (unique values) per feature: - 'auto' : Determine categories automatically from the training data. - - 'dtypes' : Uses pandas categorical dtype to encode categories. + - 'dtypes' : Uses pandas categorical dtype to encode categories. For + non pandas categorical data, the categories are automatically + determined from the training data. - list : ``categories[i]`` holds the categories expected in the ith column. The passed categories should not mix strings and numeric values, and should be sorted in case of numeric values. 
From 53cb5371e2aa0ad5a9de775c8d3bc8a5da2e6653 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 17:07:26 -0400 Subject: [PATCH 15/28] DOC Update comment --- sklearn/preprocessing/_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index ad817b48c3580..8c89aedf425ae 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -55,7 +55,7 @@ def _check_X(self, X): constructed feature by feature to preserve the data types of pandas DataFrame columns, as otherwise information is lost and cannot be used, eg for the `categories_` attribute. - If categories == 'dtype' and the pandas column is a category, + If categories == 'dtypes' and the pandas column is a category, the pandas series will be returned in this list. """ if self.categories == 'dtypes': From b82eec2faf8b4cfba5b5bbb4dea91357d535dd54 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 19:49:27 -0400 Subject: [PATCH 16/28] CLN Address some joris's comments --- azure-pipelines.yml | 1 + sklearn/preprocessing/_encoders.py | 57 +++++---- sklearn/preprocessing/tests/test_encoders.py | 117 ++++--------------- 3 files changed, 54 insertions(+), 121 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index aaab848945ccf..d20119015d47c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -42,6 +42,7 @@ jobs: pylatest_conda_mkl: DISTRIB: 'conda' PYTHON_VERSION: '*' + PANDAS_VERSION: '*' INSTALL_MKL: 'true' NUMPY_VERSION: '*' SCIPY_VERSION: '*' diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 8c89aedf425ae..00daa89c6ae8e 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -27,23 +27,34 @@ class _BaseEncoder(TransformerMixin, BaseEstimator): """ - def _check_dtypes_equal(self, dtypes_l, dtypes_r): - """Returns True if the dtypes.""" - if ((dtypes_l is None 
and dtypes_r is not None) or - (dtypes_l is not None and dtypes_r is None)): - return False - - if len(dtypes_l) != len(dtypes_r): - return False - - for dtype_l, dtype_r in zip(dtypes_l, dtypes_r): - dtype_l_is_cat = hasattr(dtype_l, 'categories') - dtype_r_is_cat = hasattr(dtype_r, 'categories') - if ((dtype_l_is_cat and not dtype_r_is_cat) - or (not dtype_l_is_cat and dtype_r_is_cat) - or (dtype_l != dtype_r)): - return False - return True + def _check_categories_dtypes_equal(self, fit_dtypes, trans_dtypes): + """Return True if the categorical dtypes are equal.""" + msg = "categorical dtypes in X must match the dtypes used when fitting" + + # one is None and the other is not + if ((fit_dtypes is None and trans_dtypes is not None) or + (fit_dtypes is not None and trans_dtypes is None)): + raise ValueError(msg) + + if len(fit_dtypes) != len(trans_dtypes): + raise ValueError(msg) + + for fit_dtype, trans_dtype in zip(fit_dtypes, trans_dtypes): + fit_cats = getattr(fit_dtype, 'categories', None) + trans_cats = getattr(trans_dtype, 'categories', None) + + # not categories + if fit_cats is None and trans_cats is None: + continue + + # one is category and the other is not + if ((fit_cats is not None and trans_cats is None) or + (fit_cats is None and trans_cats is not None)): + raise ValueError(msg) + + # both are categories and are not equal + if all(fit_cats != trans_cats): + raise ValueError(msg) def _check_X(self, X): """ @@ -61,10 +72,8 @@ def _check_X(self, X): if self.categories == 'dtypes': X_dtypes = getattr(X, "dtypes", None) if hasattr(self, "_X_fit_dtypes"): # fitted - if not self._check_dtypes_equal(self._X_fit_dtypes, - X_dtypes): - raise ValueError("X.dtypes must match the dtypes used " - "when fitting") + self._check_categories_dtypes_equal(self._X_fit_dtypes, + X_dtypes) else: self._X_fit_dtypes = X_dtypes @@ -87,7 +96,7 @@ def _check_X(self, X): for i in range(n_features): Xi = self._get_feature(X, feature_idx=i) - if self.categories == 'dtypes' and 
hasattr(Xi, 'cat'): + if self.categories == 'dtypes' and Xi.dtype.name == 'category': # TODO: Change when missing value support is added _assert_all_finite(Xi) else: @@ -119,7 +128,7 @@ def _fit(self, X, handle_unknown='error'): if self.categories == 'auto': cats = _encode(Xi) elif self.categories == 'dtypes': - if hasattr(Xi, "cat"): + if Xi.dtype.name == 'category': cats = Xi.cat.categories.values.copy() else: cats = _encode(Xi) @@ -154,7 +163,7 @@ def _transform(self, X, handle_unknown='error'): for i in range(n_features): Xi = X_list[i] - if self.categories == 'dtypes' and hasattr(Xi, "cat"): + if self.categories == 'dtypes' and Xi.dtype.name == 'category': # categorical dtypes contain no unknown values encoded = Xi.cat.codes else: diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 15e7cab1d36d5..95b0a7eace920 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -640,51 +640,12 @@ def test_encoders_has_categorical_tags(Encoder): assert 'categorical' in Encoder()._get_tags()['X_types'] -@pytest.mark.parametrize("is_sparse", [True, False]) -@pytest.mark.parametrize("drop", [None, "first"]) -def test_one_hot_encoder_pd_categories(is_sparse, drop): - pd = pytest.importorskip('pandas') - - X_df = pd.DataFrame({ - 'col_str': ['a', 'b', 'b', 'a'], - 'col_int': [3, 2, 1, 2]}, columns=['col_str', 'col_int']) - - str_category = pd.api.types.CategoricalDtype( - categories=['b', 'a'], ordered=True) - int_category = pd.api.types.CategoricalDtype( - categories=[3, 1, 2], ordered=True) - - X_df['col_str'] = X_df['col_str'].astype(str_category) - X_df['col_int'] = X_df['col_int'].astype(int_category) - - ohe = OneHotEncoder(categories='dtypes', sparse=is_sparse, drop=drop) - - expected_trans = np.array([ - [0, 1, 1, 0, 0], - [1, 0, 0, 0, 1], - [1, 0, 0, 1, 0], - [0, 1, 0, 0, 1]], dtype=np.float64) - - if drop == 'first': - expected_trans = expected_trans[:, 
[1, 3, 4]] - - X_trans = ohe.fit_transform(X_df) - if is_sparse: - X_trans = X_trans.toarray() - - assert_allclose(X_trans, expected_trans) - - X_inverse = ohe.inverse_transform(expected_trans) - - assert_array_equal(X_inverse, X_df.values) - - @pytest.mark.parametrize('encoder', [ OneHotEncoder(categories="dtypes"), OrdinalEncoder(categories="dtypes")]) def test_encoder_pd_error_mismatch_dtype(encoder): pd = pytest.importorskip('pandas') - msg = "X.dtypes must match the dtypes used when fitting" + msg = "categorical dtypes in X must match the dtypes used when fitting" X_df_orig = pd.DataFrame({ 'col_str': ['a', 'b', 'b', 'a'], @@ -739,33 +700,29 @@ def test_encoder_pd_error_mismatch_dtype(encoder): @pytest.mark.parametrize("is_sparse", [True, False]) @pytest.mark.parametrize("drop", ["first", None]) -def test_one_hot_encoder_pd_categories_mixed(is_sparse, drop): +@pytest.mark.parametrize("dtype", [np.float64, np.int8]) +def test_one_hot_encoder_pd_categories_mixed(is_sparse, drop, dtype): pd = pytest.importorskip('pandas') X_df = pd.DataFrame( - {'col_str': ['a', 'b', 'b', 'a'], - 'col_int': [3, 2, 1, 2], + {'col_str': pd.Categorical(['a', 'b', 'b', 'a'], + categories=['b', 'a'], ordered=True), + 'col_int': pd.Categorical([3, 2, 1, 2], + categories=[3, 1, 2], ordered=True), 'norm_float': [1.0, 2.0, 1.0, 1.0], # not a pandas category 'norm_str': ['z', 'd', 'z', 'd']}, # not a pandas category columns=['col_str', 'col_int', 'norm_float', 'norm_str']) - str_category = pd.api.types.CategoricalDtype( - categories=['b', 'a'], ordered=True) - int_category = pd.api.types.CategoricalDtype( - categories=[3, 1, 2], ordered=True) - - X_df['col_str'] = X_df['col_str'].astype(str_category) - X_df['col_int'] = X_df['col_int'].astype(int_category) - ohe = OneHotEncoder(categories="dtypes", sparse=is_sparse, + dtype=dtype, drop=drop).fit(X_df) expected_trans = np.array([ [0, 1, 1, 0, 0, 1, 0, 0, 1], [1, 0, 0, 0, 1, 0, 1, 1, 0], [1, 0, 0, 1, 0, 1, 0, 0, 1], - [0, 1, 0, 0, 1, 1, 0, 
1, 0]], dtype=np.float64) + [0, 1, 0, 0, 1, 1, 0, 1, 0]], dtype=dtype) if drop == 'first': expected_trans = expected_trans[:, [1, 3, 4, 6, 8]] @@ -780,32 +737,27 @@ def test_one_hot_encoder_pd_categories_mixed(is_sparse, drop): assert_array_equal(X_inverse, X_df.values) -def test_ordinal_encoder_pd_categories_mixed(): +@pytest.mark.parametrize("dtype", [np.float64, np.int8]) +def test_ordinal_encoder_pd_categories_mixed(dtype): pd = pytest.importorskip('pandas') - X_df = pd.DataFrame({ - 'col_str': ['a', 'b', 'b', 'a'], - 'col_int': [3, 2, 1, 2], - 'norm_float': [1.0, 2.0, 1.0, 1.0], # not a pandas category - 'norm_str': ['z', 'd', 'z', 'd']}, # not a pandas category + X_df = pd.DataFrame( + {'col_str': pd.Categorical(['a', 'b', 'b', 'a'], + categories=['b', 'a'], ordered=True), + 'col_int': pd.Categorical([3, 2, 1, 2], + categories=[3, 1, 2], ordered=True), + 'norm_float': [1.0, 2.0, 1.0, 1.0], # not a pandas category + 'norm_str': ['z', 'd', 'z', 'd']}, # not a pandas category columns=['col_str', 'col_int', 'norm_float', 'norm_str']) - str_category = pd.api.types.CategoricalDtype( - categories=['b', 'a'], ordered=True) - int_category = pd.api.types.CategoricalDtype( - categories=[3, 1, 2], ordered=True) - - X_df['col_str'] = X_df['col_str'].astype(str_category) - X_df['col_int'] = X_df['col_int'].astype(int_category) - - ohe = OrdinalEncoder(categories="dtypes").fit(X_df) + ohe = OrdinalEncoder(categories="dtypes", dtype=dtype).fit(X_df) expected_trans = np.array([ [1, 0, 0, 1], # col_str [0, 2, 1, 2], # col_int [0, 1, 0, 0], # norm_float [1, 0, 1, 0], # norm_str - ], dtype=np.float64).T + ], dtype=dtype).T X_trans = ohe.fit_transform(X_df) @@ -813,32 +765,3 @@ def test_ordinal_encoder_pd_categories_mixed(): X_inverse = ohe.inverse_transform(expected_trans) assert_array_equal(X_inverse, X_df.values) - - -def test_ordinal_encoder_pd_categories(): - pd = pytest.importorskip('pandas') - - X_df = pd.DataFrame({ - 'col_str': ['a', 'b', 'b', 'a'], - 'col_int': [3, 2, 
1, 2]}, columns=['col_str', 'col_int']) - - str_category = pd.api.types.CategoricalDtype( - categories=['b', 'a'], ordered=True) - int_category = pd.api.types.CategoricalDtype( - categories=[3, 1, 2], ordered=True) - - X_df['col_str'] = X_df['col_str'].astype(str_category) - X_df['col_int'] = X_df['col_int'].astype(int_category) - - ohe = OrdinalEncoder(categories='dtypes') - - expected_trans = np.array([ - [1, 0, 0, 1], # col_str - [0, 2, 1, 2], # col_int - ], dtype=np.float64).T - - X_trans = ohe.fit_transform(X_df) - assert_allclose(X_trans, expected_trans) - X_inverse = ohe.inverse_transform(expected_trans) - - assert_array_equal(X_inverse, X_df.values) From 3d6ff26d10aafb36289efbdfc4abb97c0e7a238d Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 20:08:20 -0400 Subject: [PATCH 17/28] CLN Moves encoding to _encode --- sklearn/preprocessing/_encoders.py | 17 ++++++----------- sklearn/preprocessing/_label.py | 17 ++++++++++++++++- sklearn/preprocessing/tests/test_encoders.py | 5 ++--- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 00daa89c6ae8e..07c0cf2e89f85 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -56,7 +56,7 @@ def _check_categories_dtypes_equal(self, fit_dtypes, trans_dtypes): if all(fit_cats != trans_cats): raise ValueError(msg) - def _check_X(self, X): + def _check_X(self, X, is_fitting): """ Perform custom check_array: - convert list of strings to object dtype @@ -71,7 +71,7 @@ def _check_X(self, X): """ if self.categories == 'dtypes': X_dtypes = getattr(X, "dtypes", None) - if hasattr(self, "_X_fit_dtypes"): # fitted + if not is_fitting: # transform self._check_categories_dtypes_equal(self._X_fit_dtypes, X_dtypes) else: @@ -114,7 +114,7 @@ def _get_feature(self, X, feature_idx): return X[:, feature_idx] def _fit(self, X, handle_unknown='error'): - X_list, n_samples, n_features = self._check_X(X) 
+ X_list, n_samples, n_features = self._check_X(X, is_fitting=True) if self.categories not in ('auto', 'dtypes'): if len(self.categories) != n_features: @@ -125,13 +125,8 @@ def _fit(self, X, handle_unknown='error'): for i in range(n_features): Xi = X_list[i] - if self.categories == 'auto': + if self.categories in ('auto', 'dtypes'): cats = _encode(Xi) - elif self.categories == 'dtypes': - if Xi.dtype.name == 'category': - cats = Xi.cat.categories.values.copy() - else: - cats = _encode(Xi) else: cats = np.array(self.categories[i], dtype=Xi.dtype) if Xi.dtype != object: @@ -147,7 +142,7 @@ def _fit(self, X, handle_unknown='error'): self.categories_.append(cats) def _transform(self, X, handle_unknown='error'): - X_list, n_samples, n_features = self._check_X(X) + X_list, n_samples, n_features = self._check_X(X, is_fitting=False) X_int = np.zeros((n_samples, n_features), dtype=np.int) X_mask = np.ones((n_samples, n_features), dtype=np.bool) @@ -165,7 +160,7 @@ def _transform(self, X, handle_unknown='error'): if self.categories == 'dtypes' and Xi.dtype.name == 'category': # categorical dtypes contain no unknown values - encoded = Xi.cat.codes + _, encoded = _encode(Xi, self.categories_[i], encode=True) else: diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i], diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 9fbc959969e33..18e34f8fae025 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -71,6 +71,16 @@ def _encode_python(values, uniques=None, encode=False): return uniques +def _encode_categorical(values, uniques=None, encode=False): + """Encode values of a pandas Series with a categorical dtype""" + if uniques is None: + uniques = values.cat.categories.values.copy() + if encode: + encoded = values.cat.codes + return uniques, encoded + return uniques + + def _encode(values, uniques=None, encode=False, check_unknown=True): """Helper function to factorize (find uniques) and encode values. 
@@ -81,9 +91,12 @@ def _encode(values, uniques=None, encode=False, check_unknown=True): the case. The calling method needs to ensure this for all non-object values. + If values is a pandas Series with a categorical dtype then the encoding + will be infered from the series. + Parameters ---------- - values : array + values : array or pandas Series Values to factorize or encode. uniques : array, optional If passed, uniques are not determined from passed values (this @@ -113,6 +126,8 @@ def _encode(values, uniques=None, encode=False, check_unknown=True): except TypeError: raise TypeError("argument must be a string or number") return res + elif values.dtype.name == "category": + return _encode_categorical(values, uniques=uniques, encode=encode) else: return _encode_numpy(values, uniques, encode, check_unknown=check_unknown) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 95b0a7eace920..fa99bbd2e3161 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -6,7 +6,6 @@ from scipy import sparse import pytest -from sklearn.base import clone from sklearn.exceptions import NotFittedError from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_allclose @@ -651,7 +650,7 @@ def test_encoder_pd_error_mismatch_dtype(encoder): 'col_str': ['a', 'b', 'b', 'a'], 'col_int': [3, 2, 1, 2]}, columns=['col_str', 'col_int']) - enc_no_categories = clone(encoder).fit(X_df_orig) + enc_no_categories = encoder.fit(X_df_orig) X_df0 = X_df_orig.copy() X_df0['col_int'] = X_df0['col_int'].astype('category') @@ -671,7 +670,7 @@ def test_encoder_pd_error_mismatch_dtype(encoder): enc_no_categories.transform(X_df1) # Train encoder with categoricals - enc = clone(encoder).fit(X_df1) + enc = encoder.fit(X_df1) # col_str dtype not ordered correctly X_df2 = X_df_orig.copy() From 3834b54471574960ff83a41c482347869db1622e Mon Sep 17 00:00:00 2001 From: 
Thomas J Fan Date: Tue, 29 Oct 2019 20:12:22 -0400 Subject: [PATCH 18/28] CLN Less diffs --- sklearn/preprocessing/_encoders.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 07c0cf2e89f85..99a1df84e3485 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -158,10 +158,9 @@ def _transform(self, X, handle_unknown='error'): for i in range(n_features): Xi = X_list[i] - if self.categories == 'dtypes' and Xi.dtype.name == 'category': - # categorical dtypes contain no unknown values - _, encoded = _encode(Xi, self.categories_[i], encode=True) - else: + is_category = (self.categories == 'dtypes' and + Xi.dtype.name == 'category') + if not is_category: diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i], return_mask=True) @@ -184,10 +183,10 @@ def _transform(self, X, handle_unknown='error'): Xi = Xi.copy() Xi[~valid_mask] = self.categories_[i][0] - # We use check_unknown=False, since _encode_check_unknown was - # already called above. - _, encoded = _encode(Xi, self.categories_[i], encode=True, - check_unknown=False) + # We use check_unknown=False, since _encode_check_unknown was + # already called above. 
+ _, encoded = _encode(Xi, self.categories_[i], encode=True, + check_unknown=False) X_int[:, i] = encoded return X_int, X_mask From 36ef62358e8f8a53a5d6fda6bb67ca81c03e0405 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 20:13:32 -0400 Subject: [PATCH 19/28] DOC Adds comment regarding unknowns --- sklearn/preprocessing/_encoders.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 99a1df84e3485..3cc4205501aa8 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -160,6 +160,7 @@ def _transform(self, X, handle_unknown='error'): is_category = (self.categories == 'dtypes' and Xi.dtype.name == 'category') + # categories without missing values do not have unknown values if not is_category: diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i], From d76ddda683b2d09d1526a20c4e0e439c50058c5d Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 21:38:04 -0400 Subject: [PATCH 20/28] TST Adds pandas to osx --- azure-pipelines.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d20119015d47c..c593f9d167b01 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -74,6 +74,7 @@ jobs: INSTALL_MKL: 'true' NUMPY_VERSION: '*' SCIPY_VERSION: '*' + PANDAS_VERSION: '*' CYTHON_VERSION: '*' PILLOW_VERSION: '*' PYTEST_VERSION: '*' From 1a3e7ae56fc66cc85e16e538f0808b364f56e9fa Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 29 Oct 2019 21:39:45 -0400 Subject: [PATCH 21/28] DOC Remove dtypes in user guide --- azure-pipelines.yml | 2 -- doc/modules/preprocessing.rst | 21 --------------------- 2 files changed, 23 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index c593f9d167b01..aaab848945ccf 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -42,7 +42,6 @@ jobs: pylatest_conda_mkl: DISTRIB: 'conda' PYTHON_VERSION: '*' - PANDAS_VERSION: '*' 
INSTALL_MKL: 'true' NUMPY_VERSION: '*' SCIPY_VERSION: '*' @@ -74,7 +73,6 @@ jobs: INSTALL_MKL: 'true' NUMPY_VERSION: '*' SCIPY_VERSION: '*' - PANDAS_VERSION: '*' CYTHON_VERSION: '*' PILLOW_VERSION: '*' PYTEST_VERSION: '*' diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 976e10eea8f2f..3e41c592fbbdc 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -574,27 +574,6 @@ When this paramenter is not None, ``handle_unknown`` must be set to See :ref:`dict_feature_extraction` for categorical features that are represented as a dict, not as scalars. -When using pandas dataframe with categorical dtypes, :class:`OneHotEncoder` and -:class:`OrdinalEncoder` contain a `categories='dtypes'` option to use the -encoding provided by the the pandas category:: - - >>> import pandas as pd - >>> from pandas.api.types import CategoricalDtype - >>> X_df = pd.DataFrame({ - ... 'col_str': ['a', 'b', 'b', 'a'], - ... 'col_int': [3, 2, 1, 2]}, columns=['col_str', 'col_int']) - >>> str_category = CategoricalDtype(categories=['b', 'a'], ordered=True) - >>> int_category = CategoricalDtype(categories=[3, 1, 2], ordered=True) - >>> X_df['col_str'] = X_df['col_str'].astype(str_category) - >>> X_df['col_int'] = X_df['col_int'].astype(int_category) - >>> enc = preprocessing.OneHotEncoder(categories='dtypes').fit(X_df) - >>> enc.transform(X_df).toarray() - array([[0., 1., 1., 0., 0.], - [1., 0., 0., 0., 1.], - [1., 0., 0., 1., 0.], - [0., 1., 0., 0., 1.]]) - - .. 
_preprocessing_discretization: Discretization From e54b5ee9f5a3ef676e5d3284e75e12e48bcda514 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 30 Oct 2019 10:34:48 -0400 Subject: [PATCH 22/28] ENH Only checks categories --- sklearn/preprocessing/_encoders.py | 45 +++++++++++++++++------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 3cc4205501aa8..f9946d9a64b65 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -27,33 +27,31 @@ class _BaseEncoder(TransformerMixin, BaseEstimator): """ - def _check_categories_dtypes_equal(self, fit_dtypes, trans_dtypes): - """Return True if the categorical dtypes are equal.""" + def _check_categories_dtypes_equal(self, fit_cat_dict, trans_dtypes): + """Return True if the categorical dtypes in fit_cat_dtypes are in + trans_dtypes.""" msg = "categorical dtypes in X must match the dtypes used when fitting" # one is None and the other is not - if ((fit_dtypes is None and trans_dtypes is not None) or - (fit_dtypes is not None and trans_dtypes is None)): + if ((fit_cat_dict is None and trans_dtypes is not None) or + (fit_cat_dict is not None and trans_dtypes is None)): raise ValueError(msg) - if len(fit_dtypes) != len(trans_dtypes): - raise ValueError(msg) - - for fit_dtype, trans_dtype in zip(fit_dtypes, trans_dtypes): - fit_cats = getattr(fit_dtype, 'categories', None) - trans_cats = getattr(trans_dtype, 'categories', None) + trans_dtypes_dict = {name: dtype for name, dtype in + trans_dtypes.items() if dtype.name == 'category'} - # not categories - if fit_cats is None and trans_cats is None: - continue + # names do not match + if set(trans_dtypes_dict) ^ set(fit_cat_dict): + raise ValueError(msg) - # one is category and the other is not - if ((fit_cats is not None and trans_cats is None) or - (fit_cats is None and trans_cats is not None)): + for name, fit_cat_dtype in fit_cat_dict.items(): + 
try: + trans_cats = trans_dtypes[name].categories + except (AttributeError, KeyError): raise ValueError(msg) # both are categories and are not equal - if all(fit_cats != trans_cats): + if all(fit_cat_dtype.categories != trans_cats): raise ValueError(msg) def _check_X(self, X, is_fitting): @@ -72,10 +70,17 @@ def _check_X(self, X, is_fitting): if self.categories == 'dtypes': X_dtypes = getattr(X, "dtypes", None) if not is_fitting: # transform - self._check_categories_dtypes_equal(self._X_fit_dtypes, - X_dtypes) + self._check_categories_dtypes_equal( + self._X_fit_cat_dict, X_dtypes) else: - self._X_fit_dtypes = X_dtypes + if X_dtypes is not None: + # only remember categorical dtypes + self._X_fit_cat_dict = { + name: dtype for name, dtype in X_dtypes.items() + if dtype.name == 'category'} + else: + # not a pandas dataframe + self._X_fit_cat_dict = None if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2): # if not a dataframe, do normal check_array validation From e0b69d8aa998b47280656697edbaa901675c5c4b Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 30 Oct 2019 16:43:45 -0400 Subject: [PATCH 23/28] DOC Adds tests for learnt categories --- sklearn/preprocessing/tests/test_encoders.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index fa99bbd2e3161..29d6743a51af7 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -727,6 +727,11 @@ def test_one_hot_encoder_pd_categories_mixed(is_sparse, drop, dtype): expected_trans = expected_trans[:, [1, 3, 4, 6, 8]] X_trans = ohe.fit_transform(X_df) + assert_array_equal(ohe.categories_[0], ['b', 'a']) + assert_array_equal(ohe.categories_[1], [3, 1, 2]) + assert_allclose(ohe.categories_[2], [1.0, 2.0]) + assert_array_equal(ohe.categories_[3], ['d', 'z']) + if is_sparse: X_trans = X_trans.toarray() @@ -749,7 +754,7 @@ def 
test_ordinal_encoder_pd_categories_mixed(dtype): 'norm_str': ['z', 'd', 'z', 'd']}, # not a pandas category columns=['col_str', 'col_int', 'norm_float', 'norm_str']) - ohe = OrdinalEncoder(categories="dtypes", dtype=dtype).fit(X_df) + oe = OrdinalEncoder(categories="dtypes", dtype=dtype).fit(X_df) expected_trans = np.array([ [1, 0, 0, 1], # col_str @@ -758,9 +763,13 @@ def test_ordinal_encoder_pd_categories_mixed(dtype): [1, 0, 1, 0], # norm_str ], dtype=dtype).T - X_trans = ohe.fit_transform(X_df) + X_trans = oe.fit_transform(X_df) + assert_array_equal(oe.categories_[0], ['b', 'a']) + assert_array_equal(oe.categories_[1], [3, 1, 2]) + assert_allclose(oe.categories_[2], [1.0, 2.0]) + assert_array_equal(oe.categories_[3], ['d', 'z']) assert_allclose(X_trans, expected_trans) - X_inverse = ohe.inverse_transform(expected_trans) + X_inverse = oe.inverse_transform(expected_trans) assert_array_equal(X_inverse, X_df.values) From cadf2e7f8ab149064f8fe8c65029029605c1261a Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 30 Oct 2019 16:52:52 -0400 Subject: [PATCH 24/28] DOC Makes changes in _encode_check_unknown --- sklearn/preprocessing/_label.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 18e34f8fae025..90318b4bfd1aa 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -140,6 +140,10 @@ def _encode_check_unknown(values, uniques, return_mask=False): Uses pure python method for object dtype, and numpy method for all other dtypes. + If values is a pandas Series with a categorical dtype, then we assume that + the dtype is checked to be the same as fit time and no missing values. 
+ + Parameters ---------- values : array @@ -170,6 +174,12 @@ def _encode_check_unknown(values, uniques, return_mask=False): return diff, valid_mask else: return diff + elif values.dtype.name == "category": + # Assume there are no missing values in categorical + diff = [] + if return_mask: + return diff, np.ones_like(len(values), dtype=bool) + return diff else: unique_values = np.unique(values) diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True)) From 4fb97848df89a01f63b5ac99dd3ba52aadb712d4 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 30 Oct 2019 16:54:10 -0400 Subject: [PATCH 25/28] CLN Smaller diff --- sklearn/preprocessing/_encoders.py | 45 +++++++++++++----------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index f9946d9a64b65..cd14dab403d2b 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -162,33 +162,28 @@ def _transform(self, X, handle_unknown='error'): for i in range(n_features): Xi = X_list[i] + diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i], + return_mask=True) - is_category = (self.categories == 'dtypes' and - Xi.dtype.name == 'category') - # categories without missing values do not have unknown values - if not is_category: - diff, valid_mask = _encode_check_unknown(Xi, - self.categories_[i], - return_mask=True) - if not np.all(valid_mask): - if handle_unknown == 'error': - msg = ("Found unknown categories {0} in column {1}" - " during transform".format(diff, i)) - raise ValueError(msg) + if not np.all(valid_mask): + if handle_unknown == 'error': + msg = ("Found unknown categories {0} in column {1}" + " during transform".format(diff, i)) + raise ValueError(msg) + else: + # Set the problematic rows to an acceptable value and + # continue `The rows are marked `X_mask` and will be + # removed later.
+ X_mask[:, i] = valid_mask + # cast Xi into the largest string type necessary + # to handle different lengths of numpy strings + if (self.categories_[i].dtype.kind in ('U', 'S') + and self.categories_[i].itemsize > Xi.itemsize): + Xi = Xi.astype(self.categories_[i].dtype) else: - # Set the problematic rows to an acceptable value and - # continue `The rows are marked `X_mask` and will be - # removed later. - X_mask[:, i] = valid_mask - # cast Xi into the largest string type necessary - # to handle different lengths of numpy strings - if (self.categories_[i].dtype.kind in ('U', 'S') and - self.categories_[i].itemsize > Xi.itemsize): - Xi = Xi.astype(self.categories_[i].dtype) - else: - Xi = Xi.copy() - - Xi[~valid_mask] = self.categories_[i][0] + Xi = Xi.copy() + + Xi[~valid_mask] = self.categories_[i][0] # We use check_unknown=False, since _encode_check_unknown was # already called above. _, encoded = _encode(Xi, self.categories_[i], encode=True, From 2407236ae780c3d5b27551dc7b8ffcf5f4f931e4 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 6 Nov 2019 16:22:57 -0500 Subject: [PATCH 26/28] TST Adds test to check for nans --- sklearn/preprocessing/tests/test_encoders.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 29d6743a51af7..185e5687cea6f 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -773,3 +773,19 @@ def test_ordinal_encoder_pd_categories_mixed(dtype): X_inverse = oe.inverse_transform(expected_trans) assert_array_equal(X_inverse, X_df.values) + + +@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder]) +def test_encoders_does_not_support_missing_values_in_pd_categories(Encoder): + pd = pytest.importorskip('pandas') + + X_df = pd.DataFrame( + {'col_str': pd.Categorical(['a', 'b', 'b', 'a', np.nan], + categories=['b', 'a'], ordered=True), + 'col_int': 
pd.Categorical([3, 2, 1, 2, np.nan], + categories=[3, 1, 2], ordered=True)}, + columns=['col_str', 'col_int']) + + enc = Encoder(categories="dtypes") + with pytest.raises(ValueError, match="Input contains NaN"): + enc.fit(X_df) From 9b34ac5c51c1e97a35e66e471991e1a0abd3efc0 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 7 Jan 2020 14:16:08 -0500 Subject: [PATCH 27/28] DOC Move to 0.23 --- doc/whats_new/v0.23.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 53c416c506614..cb0ebb68be7a6 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -121,6 +121,11 @@ Changelog :mod:`sklearn.preprocessing` ............................ +- |Feature| :class:`preprocessing.OneHotEncoder` and + :class:`preprocessing.OrdinalEncoder` now support `categories='dtypes'`, + which enables using pandas categorical dtypes for encoding. :pr:`15396` by + `Thomas Fan`_. + - |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at transforming. :pr:`15762` by `Thomas Fan`_.
From 21f6342e7f1b0d751e9296913713d17e77c20957 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 7 Jan 2020 14:40:31 -0500 Subject: [PATCH 28/28] TST Adds test for categories with more categories than training --- sklearn/preprocessing/tests/test_encoders.py | 45 +++++++++++++++----- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 185e5687cea6f..2749c04b8e330 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -697,10 +697,9 @@ def test_encoder_pd_error_mismatch_dtype(encoder): enc.transform(X_df4) -@pytest.mark.parametrize("is_sparse", [True, False]) @pytest.mark.parametrize("drop", ["first", None]) @pytest.mark.parametrize("dtype", [np.float64, np.int8]) -def test_one_hot_encoder_pd_categories_mixed(is_sparse, drop, dtype): +def test_one_hot_encoder_pd_categories_mixed(drop, dtype): pd = pytest.importorskip('pandas') X_df = pd.DataFrame( @@ -713,10 +712,15 @@ def test_one_hot_encoder_pd_categories_mixed(is_sparse, drop, dtype): columns=['col_str', 'col_int', 'norm_float', 'norm_str']) ohe = OneHotEncoder(categories="dtypes", - sparse=is_sparse, dtype=dtype, + sparse=False, drop=drop).fit(X_df) + assert_array_equal(ohe.categories_[0], ['b', 'a']) + assert_array_equal(ohe.categories_[1], [3, 1, 2]) + assert_allclose(ohe.categories_[2], [1.0, 2.0]) + assert_array_equal(ohe.categories_[3], ['d', 'z']) + expected_trans = np.array([ [0, 1, 1, 0, 0, 1, 0, 0, 1], [1, 0, 0, 0, 1, 0, 1, 1, 0], @@ -726,15 +730,36 @@ def test_one_hot_encoder_pd_categories_mixed(is_sparse, drop, dtype): if drop == 'first': expected_trans = expected_trans[:, [1, 3, 4, 6, 8]] - X_trans = ohe.fit_transform(X_df) - assert_array_equal(ohe.categories_[0], ['b', 'a']) - assert_array_equal(ohe.categories_[1], [3, 1, 2]) - assert_allclose(ohe.categories_[2], [1.0, 2.0]) - assert_array_equal(ohe.categories_[3], ['d', 'z']) + 
X_trans = ohe.transform(X_df) + assert_allclose(X_trans, expected_trans) + X_inverse = ohe.inverse_transform(expected_trans) + + assert_array_equal(X_inverse, X_df.values) + + +def test_one_hot_encoder_pd_categories_with_more_categories(): + # pandas category contains more categories than in training + pd = pytest.importorskip('pandas') + + X_df = pd.DataFrame( + {'col_str': pd.Categorical(['a', 'b', 'b', 'a'], + categories=['b', 'c', 'a'], ordered=True), + 'col_int': pd.Categorical([3, 2, 1, 2], + categories=[3, 1, 2, 4], ordered=True)}, + columns=['col_str', 'col_int']) - if is_sparse: - X_trans = X_trans.toarray() + ohe = OneHotEncoder(categories="dtypes", sparse=False).fit(X_df) + + assert_array_equal(ohe.categories_[0], ['b', 'c', 'a']) + assert_array_equal(ohe.categories_[1], [3, 1, 2, 4]) + + expected_trans = np.array([ + [0, 0, 1, 1, 0, 0, 0], + [1, 0, 0, 0, 0, 1, 0], + [1, 0, 0, 0, 1, 0, 0], + [0, 0, 1, 0, 0, 1, 0]]) + X_trans = ohe.transform(X_df) assert_allclose(X_trans, expected_trans) X_inverse = ohe.inverse_transform(expected_trans)