Commits (34)
6ba1d47 ENH Uses pandas categories when encoding (thomasjpfan, Oct 29, 2019)
fe5908d TST Adds test for dropping (thomasjpfan, Oct 29, 2019)
e03ef4f DOC Adds comments (thomasjpfan, Oct 29, 2019)
20808c0 ENH Uses values instead (thomasjpfan, Oct 29, 2019)
6706e33 TST Ordered dtypes (thomasjpfan, Oct 29, 2019)
25c2bdf TST Uses pip to install pandas (thomasjpfan, Oct 29, 2019)
6cd2c62 TST Always install pandas (thomasjpfan, Oct 29, 2019)
131e266 DOC Adds to user guide (thomasjpfan, Oct 29, 2019)
24d9434 DOC Adds whats new (thomasjpfan, Oct 29, 2019)
3545088 DOC Update whats new (thomasjpfan, Oct 29, 2019)
b127441 Merge remote-tracking branch 'upstream/master' into dtype_ordering_ca… (thomasjpfan, Oct 29, 2019)
2b8fc5e REV Revert build (thomasjpfan, Oct 29, 2019)
bd061dc MNT Support python35 (thomasjpfan, Oct 29, 2019)
72a8ade ENH Enable support for non dataframes (thomasjpfan, Oct 29, 2019)
3d72106 DOC Includes fallback to auto in docs (thomasjpfan, Oct 29, 2019)
53cb537 DOC Update comment (thomasjpfan, Oct 29, 2019)
b82eec2 CLN Address some joris's comments (thomasjpfan, Oct 29, 2019)
3d6ff26 CLN Moves encoding to _encode (thomasjpfan, Oct 30, 2019)
3834b54 CLN Less diffs (thomasjpfan, Oct 30, 2019)
36ef623 DOC Adds comment regarding unknowns (thomasjpfan, Oct 30, 2019)
d76ddda TST Adds pandas to osx (thomasjpfan, Oct 30, 2019)
1a3e7ae DOC Remove dtypes in user guide (thomasjpfan, Oct 30, 2019)
e54b5ee ENH Only checks categories (thomasjpfan, Oct 30, 2019)
e0b69d8 DOC Adds tests for learnt categories (thomasjpfan, Oct 30, 2019)
cadf2e7 DOC Makes changes in _encode_check_unknown (thomasjpfan, Oct 30, 2019)
8433076 Merge remote-tracking branch 'upstream/master' into dtype_ordering_ca… (thomasjpfan, Oct 30, 2019)
4fb9784 CLN Smaller diff (thomasjpfan, Oct 30, 2019)
37faa32 Merge remote-tracking branch 'upstream/master' into dtype_ordering_ca… (thomasjpfan, Nov 6, 2019)
2407236 TST Adds test to check for nans (thomasjpfan, Nov 6, 2019)
b77b3e1 Merge remote-tracking branch 'upstream/master' into dtype_ordering_ca… (thomasjpfan, Jan 7, 2020)
9b34ac5 DOC Move to 0.23 (thomasjpfan, Jan 7, 2020)
21f6342 TST Adds test for categories with more categories than training (thomasjpfan, Jan 7, 2020)
bd4ec03 Merge remote-tracking branch 'upstream/master' into dtype_ordering_ca… (thomasjpfan, Apr 19, 2020)
6475789 Merge remote-tracking branch 'upstream/master' into dtype_ordering_ca… (thomasjpfan, Apr 29, 2020)
8 changes: 8 additions & 0 deletions doc/whats_new/v0.23.rst
@@ -481,6 +481,14 @@ Changelog
:mod:`sklearn.preprocessing`
............................

- |Feature| :class:`preprocessing.OneHotEncoder` and
:class:`preprocessing.OrdinalEncoder` now support `categories='dtypes'`,
which enables using pandas categorical dtypes for encoding. :pr:`15396` by
`Thomas Fan`_.

- |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at
transforming. :pr:`15762` by `Thomas Fan`_.

- |Feature| argument `drop` of :class:`preprocessing.OneHotEncoder`
will now accept value 'if_binary' and will drop the first category of
each feature with two categories. :pr:`16245`
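
As a usage illustration of the new option (editorial, not part of the diff): a minimal sketch assuming this branch of scikit-learn is installed. The column names and data are invented; the expected values in the comments mirror the new tests further down.

```python
# Minimal usage sketch of categories='dtypes'.
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

X = pd.DataFrame({
    # explicit category order 'b' < 'a' carried by the pandas dtype
    "col_str": pd.Categorical(["a", "b", "b", "a"],
                              categories=["b", "a"], ordered=True),
    # plain (non-categorical) column: falls back to the usual 'auto' behaviour
    "norm_float": [1.0, 2.0, 1.0, 1.0],
})

enc = OrdinalEncoder(categories="dtypes").fit(X)
print(enc.categories_)
# [array(['b', 'a'], dtype=object), array([1., 2.])]
print(enc.transform(X))
# col_str -> [1, 0, 0, 1], norm_float -> [0, 1, 0, 0] (column-wise)
```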
70 changes: 62 additions & 8 deletions sklearn/preprocessing/_encoders.py
@@ -8,6 +8,7 @@
from ..base import BaseEstimator, TransformerMixin
from ..utils import check_array
from ..utils.validation import check_is_fitted
from ..utils.validation import _assert_all_finite
from ..utils.validation import _deprecate_positional_args

from ._label import _encode, _encode_check_unknown
@@ -26,7 +27,34 @@ class _BaseEncoder(TransformerMixin, BaseEstimator):

"""

def _check_X(self, X):
def _check_categories_dtypes_equal(self, fit_cat_dict, trans_dtypes):
"""Return True if the categorical dtypes in fit_cat_dtypes are in
trans_dtypes."""
msg = "categorical dtypes in X must match the dtypes used when fitting"

# neither fit nor transform saw a DataFrame: nothing to check
if fit_cat_dict is None and trans_dtypes is None:
return

# one is None and the other is not
if fit_cat_dict is None or trans_dtypes is None:
raise ValueError(msg)

trans_dtypes_dict = {name: dtype for name, dtype in
trans_dtypes.items() if dtype.name == 'category'}

# names do not match
if set(trans_dtypes_dict) ^ set(fit_cat_dict):
raise ValueError(msg)

for name, fit_cat_dtype in fit_cat_dict.items():
try:
trans_cats = trans_dtypes[name].categories
except (AttributeError, KeyError):
raise ValueError(msg)

# both are categorical but their categories (or order) differ
if not fit_cat_dtype.categories.equals(trans_cats):
raise ValueError(msg)

def _check_X(self, X, is_fitting):
"""
Perform custom check_array:
- convert list of strings to object dtype
@@ -36,8 +64,24 @@ def _check_X(self, X):
constructed feature by feature to preserve the data types
of pandas DataFrame columns, as otherwise information is lost
and cannot be used, eg for the `categories_` attribute.

If categories == 'dtypes' and a pandas column has a categorical dtype,
the pandas Series itself is returned in this list.
"""
if self.categories == 'dtypes':
X_dtypes = getattr(X, "dtypes", None)
if not is_fitting: # transform
self._check_categories_dtypes_equal(
self._X_fit_cat_dict, X_dtypes)
else:
if X_dtypes is not None:
# only remember categorical dtypes
self._X_fit_cat_dict = {
name: dtype for name, dtype in X_dtypes.items()
if dtype.name == 'category'}
else:
# not a pandas dataframe
self._X_fit_cat_dict = None

if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):
# if not a dataframe, do normal check_array validation
X_temp = check_array(X, dtype=None)
@@ -57,8 +101,12 @@ def _check_X(self, X):

for i in range(n_features):
Xi = self._get_feature(X, feature_idx=i)
Xi = check_array(Xi, ensure_2d=False, dtype=None,
force_all_finite=needs_validation)
if self.categories == 'dtypes' and Xi.dtype.name == 'category':
# TODO: Change when missing value support is added
_assert_all_finite(Xi)
else:
Xi = check_array(Xi, ensure_2d=False, dtype=None,
force_all_finite=needs_validation)
X_columns.append(Xi)

return X_columns, n_samples, n_features
@@ -71,9 +119,9 @@ def _get_feature(self, X, feature_idx):
return X[:, feature_idx]

def _fit(self, X, handle_unknown='error'):
X_list, n_samples, n_features = self._check_X(X)
X_list, n_samples, n_features = self._check_X(X, is_fitting=True)

if self.categories != 'auto':
if self.categories not in ('auto', 'dtypes'):
if len(self.categories) != n_features:
raise ValueError("Shape mismatch: if categories is an array,"
" it has to be of shape (n_features,).")
@@ -82,7 +130,7 @@ def _fit(self, X, handle_unknown='error'):

for i in range(n_features):
Xi = X_list[i]
if self.categories == 'auto':
if self.categories in ('auto', 'dtypes'):
cats = _encode(Xi)
else:
cats = np.array(self.categories[i], dtype=Xi.dtype)
@@ -99,7 +147,7 @@ def _fit(self, X, handle_unknown='error'):
self.categories_.append(cats)

def _transform(self, X, handle_unknown='error'):
X_list, n_samples, n_features = self._check_X(X)
X_list, n_samples, n_features = self._check_X(X, is_fitting=False)

X_int = np.zeros((n_samples, n_features), dtype=np.int)
X_mask = np.ones((n_samples, n_features), dtype=np.bool)
@@ -179,6 +227,9 @@ class OneHotEncoder(_BaseEncoder):
Categories (unique values) per feature:

- 'auto' : Determine categories automatically from the training data.
- 'dtypes' : Use the pandas categorical dtypes to determine the categories.
For columns without a pandas categorical dtype, the categories are
determined automatically from the training data, as with 'auto'.
Review comment (Member):

maybe this could be just part of 'auto' as mentioned in the comments.

But if we keep the 'dtypes' option, we should probably raise an error if a non-df is passed in.

- list : ``categories[i]`` holds the categories expected in the ith
column. The passed categories should not mix strings and numeric
values within a single feature, and should be sorted in case of
@@ -613,6 +664,9 @@ class OrdinalEncoder(_BaseEncoder):
Categories (unique values) per feature:

- 'auto' : Determine categories automatically from the training data.
- 'dtypes' : Use the pandas categorical dtypes to determine the categories.
For columns without a pandas categorical dtype, the categories are
determined automatically from the training data, as with 'auto'.
- list : ``categories[i]`` holds the categories expected in the ith
column. The passed categories should not mix strings and numeric
values, and should be sorted in case of numeric values.
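
To make the transform-time validation concrete, a small sketch (editorial, not part of the diff) of the error raised by `_check_categories_dtypes_equal` when the categorical dtype seen at transform does not match the one seen at fit. It mirrors `test_encoder_pd_error_mismatch_dtype` and assumes this branch is installed; the data and column name are invented.

```python
# Sketch of the dtype check performed at transform time.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

fit_dtype = pd.api.types.CategoricalDtype(categories=["b", "a"], ordered=True)
X_fit = pd.DataFrame({"col_str": ["a", "b", "b", "a"]})
X_fit["col_str"] = X_fit["col_str"].astype(fit_dtype)

enc = OneHotEncoder(categories="dtypes", sparse=False).fit(X_fit)

# Same values, but a categorical dtype with a different category order.
other_dtype = pd.api.types.CategoricalDtype(categories=["a", "b"], ordered=True)
X_other = pd.DataFrame({"col_str": ["a", "b", "b", "a"]})
X_other["col_str"] = X_other["col_str"].astype(other_dtype)

try:
    enc.transform(X_other)
except ValueError as exc:
    print(exc)  # categorical dtypes in X must match the dtypes used when fitting
```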
27 changes: 26 additions & 1 deletion sklearn/preprocessing/_label.py
@@ -72,6 +72,16 @@ def _encode_python(values, uniques=None, encode=False):
return uniques


def _encode_categorical(values, uniques=None, encode=False):
"""Encode values of a pandas Series with a categorical dtype"""
if uniques is None:
uniques = values.cat.categories.values.copy()
if encode:
encoded = values.cat.codes
return uniques, encoded
return uniques


def _encode(values, uniques=None, encode=False, check_unknown=True):
"""Helper function to factorize (find uniques) and encode values.

@@ -82,9 +92,12 @@ def _encode(values, uniques=None, encode=False, check_unknown=True):
the case. The calling method needs to ensure this for all non-object
values.

If values is a pandas Series with a categorical dtype, then the encoding
is inferred from the dtype of the series.

Parameters
----------
values : array
values : array or pandas Series
Values to factorize or encode.
uniques : array, optional
If passed, uniques are not determined from passed values (this
@@ -117,6 +130,8 @@ def _encode(values, uniques=None, encode=False, check_unknown=True):
raise TypeError("Encoders require their input to be uniformly "
f"strings or numbers. Got {types}")
return res
elif values.dtype.name == "category":
return _encode_categorical(values, uniques=uniques, encode=encode)
else:
return _encode_numpy(values, uniques, encode,
check_unknown=check_unknown)
@@ -129,6 +144,10 @@ def _encode_check_unknown(values, uniques, return_mask=False):
Uses pure python method for object dtype, and numpy method for
all other dtypes.

If values is a pandas Series with a categorical dtype, we assume that the
dtype has already been checked to match the dtype seen at fit time and
that the values contain no missing values.

Parameters
----------
values : array
@@ -159,6 +178,12 @@ def _encode_check_unknown(values, uniques, return_mask=False):
return diff, valid_mask
else:
return diff
elif values.dtype.name == "category":
# Assume there are no missing values in the categorical column
diff = []
if return_mask:
return diff, np.ones(len(values), dtype=bool)
return diff
else:
unique_values = np.unique(values)
diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
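
For reference (plain pandas, independent of this diff), the two attributes `_encode_categorical` reads from a categorical Series: the dtype's full category list, in the dtype's own order, and the integer codes into that list.

```python
# Plain pandas illustration of what _encode_categorical relies on.
import pandas as pd

s = pd.Series(pd.Categorical(["a", "b", "b", "a"],
                             categories=["b", "c", "a"], ordered=True))

print(s.cat.categories.values)  # ['b' 'c' 'a']  (dtype order, unused 'c' kept)
print(s.cat.codes.values)       # [2 0 0 2]      (positions in the dtype's categories)
```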
177 changes: 177 additions & 0 deletions sklearn/preprocessing/tests/test_encoders.py
@@ -696,3 +696,180 @@ def test_encoders_does_not_support_none_values(Encoder):
with pytest.raises(TypeError, match="Encoders require their input to be "
"uniformly strings or numbers."):
Encoder().fit(values)


@pytest.mark.parametrize('encoder', [
OneHotEncoder(categories="dtypes"),
OrdinalEncoder(categories="dtypes")])
def test_encoder_pd_error_mismatch_dtype(encoder):
pd = pytest.importorskip('pandas')
msg = "categorical dtypes in X must match the dtypes used when fitting"

X_df_orig = pd.DataFrame({
'col_str': ['a', 'b', 'b', 'a'],
'col_int': [3, 2, 1, 2]}, columns=['col_str', 'col_int'])

enc_no_categories = encoder.fit(X_df_orig)

X_df0 = X_df_orig.copy()
X_df0['col_int'] = X_df0['col_int'].astype('category')

# X_df0 has a categorical column while the dataframe used for fitting does not
with pytest.raises(ValueError, match=msg):
enc_no_categories.transform(X_df0)

str_category = pd.api.types.CategoricalDtype(
categories=['b', 'a'], ordered=True)
X_df1 = X_df_orig.copy()
X_df1['col_str'] = X_df1['col_str'].astype(str_category)
X_df1['col_int'] = X_df1['col_int'].astype('category')

# X_df1 has categorical columns while the dataframe used for fitting does not
with pytest.raises(ValueError, match=msg):
enc_no_categories.transform(X_df1)

# Train encoder with categoricals
enc = encoder.fit(X_df1)

# col_str dtype not ordered correctly
X_df2 = X_df_orig.copy()
str_category_lex_ordered = pd.api.types.CategoricalDtype(
categories=['a', 'b'], ordered=True)
X_df2['col_str'] = X_df2['col_str'].astype(str_category_lex_ordered)
X_df2['col_int'] = X_df2['col_int'].astype('category')

with pytest.raises(ValueError, match=msg):
enc.transform(X_df2)

# col_int not a categorical dtype
X_df3 = X_df_orig.copy()
X_df3['col_int'] = X_df3['col_int'].astype(int)

with pytest.raises(ValueError, match=msg):
enc.transform(X_df3)

# number of features is not correct
X_df4 = pd.DataFrame({
'col_str': ['a', 'b', 'b', 'a']})

with pytest.raises(ValueError, match=msg):
enc.transform(X_df4)


@pytest.mark.parametrize("drop", ["first", None])
@pytest.mark.parametrize("dtype", [np.float64, np.int8])
def test_one_hot_encoder_pd_categories_mixed(drop, dtype):
pd = pytest.importorskip('pandas')

X_df = pd.DataFrame(
{'col_str': pd.Categorical(['a', 'b', 'b', 'a'],
categories=['b', 'a'], ordered=True),
'col_int': pd.Categorical([3, 2, 1, 2],
categories=[3, 1, 2], ordered=True),
'norm_float': [1.0, 2.0, 1.0, 1.0], # not a pandas category
'norm_str': ['z', 'd', 'z', 'd']}, # not a pandas category
columns=['col_str', 'col_int', 'norm_float', 'norm_str'])

ohe = OneHotEncoder(categories="dtypes",
dtype=dtype,
sparse=False,
drop=drop).fit(X_df)

assert_array_equal(ohe.categories_[0], ['b', 'a'])
assert_array_equal(ohe.categories_[1], [3, 1, 2])
assert_allclose(ohe.categories_[2], [1.0, 2.0])
assert_array_equal(ohe.categories_[3], ['d', 'z'])

expected_trans = np.array([
[0, 1, 1, 0, 0, 1, 0, 0, 1],
[1, 0, 0, 0, 1, 0, 1, 1, 0],
[1, 0, 0, 1, 0, 1, 0, 0, 1],
[0, 1, 0, 0, 1, 1, 0, 1, 0]], dtype=dtype)

if drop == 'first':
expected_trans = expected_trans[:, [1, 3, 4, 6, 8]]

X_trans = ohe.transform(X_df)
assert_allclose(X_trans, expected_trans)
X_inverse = ohe.inverse_transform(expected_trans)

assert_array_equal(X_inverse, X_df.values)


def test_one_hot_encoder_pd_categories_with_more_categories():
# pandas category contains more categories than in training
pd = pytest.importorskip('pandas')

X_df = pd.DataFrame(
{'col_str': pd.Categorical(['a', 'b', 'b', 'a'],
categories=['b', 'c', 'a'], ordered=True),
'col_int': pd.Categorical([3, 2, 1, 2],
categories=[3, 1, 2, 4], ordered=True)},
columns=['col_str', 'col_int'])

ohe = OneHotEncoder(categories="dtypes", sparse=False).fit(X_df)

assert_array_equal(ohe.categories_[0], ['b', 'c', 'a'])
assert_array_equal(ohe.categories_[1], [3, 1, 2, 4])

expected_trans = np.array([
[0, 0, 1, 1, 0, 0, 0],
[1, 0, 0, 0, 0, 1, 0],
[1, 0, 0, 0, 1, 0, 0],
[0, 0, 1, 0, 0, 1, 0]])

X_trans = ohe.transform(X_df)
assert_allclose(X_trans, expected_trans)
X_inverse = ohe.inverse_transform(expected_trans)

assert_array_equal(X_inverse, X_df.values)


@pytest.mark.parametrize("dtype", [np.float64, np.int8])
def test_ordinal_encoder_pd_categories_mixed(dtype):
pd = pytest.importorskip('pandas')

X_df = pd.DataFrame(
{'col_str': pd.Categorical(['a', 'b', 'b', 'a'],
categories=['b', 'a'], ordered=True),
'col_int': pd.Categorical([3, 2, 1, 2],
categories=[3, 1, 2], ordered=True),
'norm_float': [1.0, 2.0, 1.0, 1.0], # not a pandas category
'norm_str': ['z', 'd', 'z', 'd']}, # not a pandas category
columns=['col_str', 'col_int', 'norm_float', 'norm_str'])

oe = OrdinalEncoder(categories="dtypes", dtype=dtype).fit(X_df)

expected_trans = np.array([
[1, 0, 0, 1], # col_str
[0, 2, 1, 2], # col_int
[0, 1, 0, 0], # norm_float
[1, 0, 1, 0], # norm_str
], dtype=dtype).T

X_trans = oe.fit_transform(X_df)
assert_array_equal(oe.categories_[0], ['b', 'a'])
assert_array_equal(oe.categories_[1], [3, 1, 2])
assert_allclose(oe.categories_[2], [1.0, 2.0])
assert_array_equal(oe.categories_[3], ['d', 'z'])

assert_allclose(X_trans, expected_trans)
X_inverse = oe.inverse_transform(expected_trans)

assert_array_equal(X_inverse, X_df.values)


@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
def test_encoders_does_not_support_missing_values_in_pd_categories(Encoder):
pd = pytest.importorskip('pandas')

X_df = pd.DataFrame(
{'col_str': pd.Categorical(['a', 'b', 'b', 'a', np.nan],
categories=['b', 'a'], ordered=True),
'col_int': pd.Categorical([3, 2, 1, 2, np.nan],
categories=[3, 1, 2], ordered=True)},
columns=['col_str', 'col_int'])

enc = Encoder(categories="dtypes")
with pytest.raises(ValueError, match="Input contains NaN"):
enc.fit(X_df)