diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 0e149ed03a9fa..84a28168f9d71 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -481,6 +481,14 @@ Changelog :mod:`sklearn.preprocessing` ............................ +- |Feature| :class:`preprocessing.OneHotEncoder` and + :class:`preprocessing.OrdinalEncoder` now support `categories='dtypes'`, + which enables using pandas categorical dtypes for encoding. :pr:`15396` by + `Thomas Fan`_. + +- |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at + transforming. :pr:`15762` by `Thomas Fan`_. + - |Feature| argument `drop` of :class:`preprocessing.OneHotEncoder` will now accept value 'if_binary' and will drop the first category of each feature with two categories. :pr:`16245` diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 3b0e43c151e0c..f6a25efa9b91c 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -8,6 +8,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils.validation import check_is_fitted +from ..utils.validation import _assert_all_finite from ..utils.validation import _deprecate_positional_args from ._label import _encode, _encode_check_unknown @@ -26,7 +27,34 @@ class _BaseEncoder(TransformerMixin, BaseEstimator): """ - def _check_X(self, X): + def _check_categories_dtypes_equal(self, fit_cat_dict, trans_dtypes): + """Raise ValueError unless the categorical dtypes in fit_cat_dict are in + trans_dtypes.""" + msg = "categorical dtypes in X must match the dtypes used when fitting" + + # one is None and the other is not + if ((fit_cat_dict is None and trans_dtypes is not None) or + (fit_cat_dict is not None and trans_dtypes is None)): + raise ValueError(msg) + + trans_dtypes_dict = {name: dtype for name, dtype in + trans_dtypes.items() if dtype.name == 'category'} + + # names do not match + if set(trans_dtypes_dict) ^ set(fit_cat_dict): + raise 
ValueError(msg) + + for name, fit_cat_dtype in fit_cat_dict.items(): + try: + trans_cats = trans_dtypes[name].categories + except (AttributeError, KeyError): + raise ValueError(msg) + + # both are categories and are not equal + if any(fit_cat_dtype.categories != trans_cats): + raise ValueError(msg) + + def _check_X(self, X, is_fitting): """ Perform custom check_array: - convert list of strings to object dtype @@ -36,8 +64,24 @@ def _check_X(self, X): constructed feature by feature to preserve the data types of pandas DataFrame columns, as otherwise information is lost and cannot be used, eg for the `categories_` attribute. - + If categories == 'dtypes' and the pandas column is a category, + the pandas series will be returned in this list. """ + if self.categories == 'dtypes': + X_dtypes = getattr(X, "dtypes", None) + if not is_fitting: # transform + self._check_categories_dtypes_equal( + self._X_fit_cat_dict, X_dtypes) + else: + if X_dtypes is not None: + # only remember categorical dtypes + self._X_fit_cat_dict = { + name: dtype for name, dtype in X_dtypes.items() + if dtype.name == 'category'} + else: + # not a pandas dataframe + self._X_fit_cat_dict = None + if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2): # if not a dataframe, do normal check_array validation X_temp = check_array(X, dtype=None) @@ -57,8 +101,12 @@ def _check_X(self, X): for i in range(n_features): Xi = self._get_feature(X, feature_idx=i) - Xi = check_array(Xi, ensure_2d=False, dtype=None, - force_all_finite=needs_validation) + if self.categories == 'dtypes' and Xi.dtype.name == 'category': + # TODO: Change when missing value support is added + _assert_all_finite(Xi) + else: + Xi = check_array(Xi, ensure_2d=False, dtype=None, + force_all_finite=needs_validation) X_columns.append(Xi) return X_columns, n_samples, n_features @@ -71,9 +119,9 @@ def _get_feature(self, X, feature_idx): return X[:, feature_idx] def _fit(self, X, handle_unknown='error'): - X_list, n_samples, n_features = 
self._check_X(X) + X_list, n_samples, n_features = self._check_X(X, is_fitting=True) - if self.categories != 'auto': + if self.categories not in ('auto', 'dtypes'): if len(self.categories) != n_features: raise ValueError("Shape mismatch: if categories is an array," " it has to be of shape (n_features,).") @@ -82,7 +130,7 @@ def _fit(self, X, handle_unknown='error'): for i in range(n_features): Xi = X_list[i] - if self.categories == 'auto': + if self.categories in ('auto', 'dtypes'): cats = _encode(Xi) else: cats = np.array(self.categories[i], dtype=Xi.dtype) @@ -99,7 +147,7 @@ def _fit(self, X, handle_unknown='error'): self.categories_.append(cats) def _transform(self, X, handle_unknown='error'): - X_list, n_samples, n_features = self._check_X(X) + X_list, n_samples, n_features = self._check_X(X, is_fitting=False) X_int = np.zeros((n_samples, n_features), dtype=np.int) X_mask = np.ones((n_samples, n_features), dtype=np.bool) @@ -179,6 +227,9 @@ class OneHotEncoder(_BaseEncoder): Categories (unique values) per feature: - 'auto' : Determine categories automatically from the training data. + - 'dtypes' : Uses pandas categorical dtype to encode categories. For + non pandas categorical data, the categories are automatically + determined from the training data. - list : ``categories[i]`` holds the categories expected in the ith column. The passed categories should not mix strings and numeric values within a single feature, and should be sorted in case of @@ -613,6 +664,9 @@ class OrdinalEncoder(_BaseEncoder): Categories (unique values) per feature: - 'auto' : Determine categories automatically from the training data. + - 'dtypes' : Uses pandas categorical dtype to encode categories. For + non pandas categorical data, the categories are automatically + determined from the training data. - list : ``categories[i]`` holds the categories expected in the ith column. The passed categories should not mix strings and numeric values, and should be sorted in case of numeric values. 
diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 88fad3670cb01..5fadd0c6872c4 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -72,6 +72,16 @@ def _encode_python(values, uniques=None, encode=False): return uniques +def _encode_categorical(values, uniques=None, encode=False): + """Encode values of a pandas Series with a categorical dtype""" + if uniques is None: + uniques = values.cat.categories.values.copy() + if encode: + encoded = values.cat.codes + return uniques, encoded + return uniques + + def _encode(values, uniques=None, encode=False, check_unknown=True): """Helper function to factorize (find uniques) and encode values. @@ -82,9 +92,12 @@ def _encode(values, uniques=None, encode=False, check_unknown=True): the case. The calling method needs to ensure this for all non-object values. + If values is a pandas Series with a categorical dtype then the encoding + will be inferred from the series. + Parameters ---------- - values : array + values : array or pandas Series Values to factorize or encode. uniques : array, optional If passed, uniques are not determined from passed values (this @@ -117,6 +130,8 @@ def _encode(values, uniques=None, encode=False, check_unknown=True): raise TypeError("Encoders require their input to be uniformly " f"strings or numbers. Got {types}") return res + elif values.dtype.name == "category": + return _encode_categorical(values, uniques=uniques, encode=encode) else: return _encode_numpy(values, uniques, encode, check_unknown=check_unknown) @@ -129,6 +144,10 @@ def _encode_check_unknown(values, uniques, return_mask=False): Uses pure python method for object dtype, and numpy method for all other dtypes. + If values is a pandas Series with a categorical dtype, then we assume that + the dtype is checked to be the same as at fit time and has no missing values. 
+ + Parameters ---------- values : array @@ -159,6 +178,12 @@ return diff, valid_mask else: return diff + elif values.dtype.name == "category": + # Assume there are no missing values in categorical + diff = [] + if return_mask: + return diff, np.ones(len(values), dtype=bool) + return diff else: unique_values = np.unique(values) diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True)) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 126a923a952dd..1933d6a6074fd 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -696,3 +696,180 @@ def test_encoders_does_not_support_none_values(Encoder): with pytest.raises(TypeError, match="Encoders require their input to be " "uniformly strings or numbers."): Encoder().fit(values) + + +@pytest.mark.parametrize('encoder', [ + OneHotEncoder(categories="dtypes"), + OrdinalEncoder(categories="dtypes")]) +def test_encoder_pd_error_mismatch_dtype(encoder): + pd = pytest.importorskip('pandas') + msg = "categorical dtypes in X must match the dtypes used when fitting" + + X_df_orig = pd.DataFrame({ + 'col_str': ['a', 'b', 'b', 'a'], + 'col_int': [3, 2, 1, 2]}, columns=['col_str', 'col_int']) + + enc_no_categories = encoder.fit(X_df_orig) + + X_df0 = X_df_orig.copy() + X_df0['col_int'] = X_df0['col_int'].astype('category') + + # X_df0 has categories while the trained dataframe does not + with pytest.raises(ValueError, match=msg): + enc_no_categories.transform(X_df0) + + str_category = pd.api.types.CategoricalDtype( + categories=['b', 'a'], ordered=True) + X_df1 = X_df_orig.copy() + X_df1['col_str'] = X_df1['col_str'].astype(str_category) + X_df1['col_int'] = X_df1['col_int'].astype('category') + + # X_df1 has categories while the trained dataframe does not + with pytest.raises(ValueError, match=msg): + enc_no_categories.transform(X_df1) + + # 
Train encoder with categoricals + enc = encoder.fit(X_df1) + + # col_str dtype not ordered correctly + X_df2 = X_df_orig.copy() + str_category_lex_ordered = pd.api.types.CategoricalDtype( + categories=['a', 'b'], ordered=True) + X_df2['col_str'] = X_df2['col_str'].astype(str_category_lex_ordered) + X_df2['col_int'] = X_df2['col_int'].astype('category') + + with pytest.raises(ValueError, match=msg): + enc.transform(X_df2) + + # col_int not a categorical dtype + X_df3 = X_df_orig.copy() + X_df3['col_int'] = X_df3['col_int'].astype(int) + + with pytest.raises(ValueError, match=msg): + enc.transform(X_df3) + + # number of features is not correct + X_df4 = pd.DataFrame({ + 'col_str': ['a', 'b', 'b', 'a']}) + + with pytest.raises(ValueError, match=msg): + enc.transform(X_df4) + + +@pytest.mark.parametrize("drop", ["first", None]) +@pytest.mark.parametrize("dtype", [np.float64, np.int8]) +def test_one_hot_encoder_pd_categories_mixed(drop, dtype): + pd = pytest.importorskip('pandas') + + X_df = pd.DataFrame( + {'col_str': pd.Categorical(['a', 'b', 'b', 'a'], + categories=['b', 'a'], ordered=True), + 'col_int': pd.Categorical([3, 2, 1, 2], + categories=[3, 1, 2], ordered=True), + 'norm_float': [1.0, 2.0, 1.0, 1.0], # not a pandas category + 'norm_str': ['z', 'd', 'z', 'd']}, # not a pandas category + columns=['col_str', 'col_int', 'norm_float', 'norm_str']) + + ohe = OneHotEncoder(categories="dtypes", + dtype=dtype, + sparse=False, + drop=drop).fit(X_df) + + assert_array_equal(ohe.categories_[0], ['b', 'a']) + assert_array_equal(ohe.categories_[1], [3, 1, 2]) + assert_allclose(ohe.categories_[2], [1.0, 2.0]) + assert_array_equal(ohe.categories_[3], ['d', 'z']) + + expected_trans = np.array([ + [0, 1, 1, 0, 0, 1, 0, 0, 1], + [1, 0, 0, 0, 1, 0, 1, 1, 0], + [1, 0, 0, 1, 0, 1, 0, 0, 1], + [0, 1, 0, 0, 1, 1, 0, 1, 0]], dtype=dtype) + + if drop == 'first': + expected_trans = expected_trans[:, [1, 3, 4, 6, 8]] + + X_trans = ohe.transform(X_df) + assert_allclose(X_trans, 
expected_trans) + X_inverse = ohe.inverse_transform(expected_trans) + + assert_array_equal(X_inverse, X_df.values) + + +def test_one_hot_encoder_pd_categories_with_more_categories(): + # pandas category contains more categories than in training + pd = pytest.importorskip('pandas') + + X_df = pd.DataFrame( + {'col_str': pd.Categorical(['a', 'b', 'b', 'a'], + categories=['b', 'c', 'a'], ordered=True), + 'col_int': pd.Categorical([3, 2, 1, 2], + categories=[3, 1, 2, 4], ordered=True)}, + columns=['col_str', 'col_int']) + + ohe = OneHotEncoder(categories="dtypes", sparse=False).fit(X_df) + + assert_array_equal(ohe.categories_[0], ['b', 'c', 'a']) + assert_array_equal(ohe.categories_[1], [3, 1, 2, 4]) + + expected_trans = np.array([ + [0, 0, 1, 1, 0, 0, 0], + [1, 0, 0, 0, 0, 1, 0], + [1, 0, 0, 0, 1, 0, 0], + [0, 0, 1, 0, 0, 1, 0]]) + + X_trans = ohe.transform(X_df) + assert_allclose(X_trans, expected_trans) + X_inverse = ohe.inverse_transform(expected_trans) + + assert_array_equal(X_inverse, X_df.values) + + +@pytest.mark.parametrize("dtype", [np.float64, np.int8]) +def test_ordinal_encoder_pd_categories_mixed(dtype): + pd = pytest.importorskip('pandas') + + X_df = pd.DataFrame( + {'col_str': pd.Categorical(['a', 'b', 'b', 'a'], + categories=['b', 'a'], ordered=True), + 'col_int': pd.Categorical([3, 2, 1, 2], + categories=[3, 1, 2], ordered=True), + 'norm_float': [1.0, 2.0, 1.0, 1.0], # not a pandas category + 'norm_str': ['z', 'd', 'z', 'd']}, # not a pandas category + columns=['col_str', 'col_int', 'norm_float', 'norm_str']) + + oe = OrdinalEncoder(categories="dtypes", dtype=dtype).fit(X_df) + + expected_trans = np.array([ + [1, 0, 0, 1], # col_str + [0, 2, 1, 2], # col_int + [0, 1, 0, 0], # norm_float + [1, 0, 1, 0], # norm_str + ], dtype=dtype).T + + X_trans = oe.fit_transform(X_df) + assert_array_equal(oe.categories_[0], ['b', 'a']) + assert_array_equal(oe.categories_[1], [3, 1, 2]) + assert_allclose(oe.categories_[2], [1.0, 2.0]) + 
assert_array_equal(oe.categories_[3], ['d', 'z']) + + assert_allclose(X_trans, expected_trans) + X_inverse = oe.inverse_transform(expected_trans) + + assert_array_equal(X_inverse, X_df.values) + + +@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder]) +def test_encoders_does_not_support_missing_values_in_pd_categories(Encoder): + pd = pytest.importorskip('pandas') + + X_df = pd.DataFrame( + {'col_str': pd.Categorical(['a', 'b', 'b', 'a', np.nan], + categories=['b', 'a'], ordered=True), + 'col_int': pd.Categorical([3, 2, 1, 2, np.nan], + categories=[3, 1, 2], ordered=True)}, + columns=['col_str', 'col_int']) + + enc = Encoder(categories="dtypes") + with pytest.raises(ValueError, match="Input contains NaN"): + enc.fit(X_df)