scikit-learn · maikia · Mar 1, 2019 · Mar 1, 2019 · Mar 1, 2019
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
@@ -32,7 +32,7 @@ class _BaseEncoder(BaseEstimator, TransformerMixin):
 
     """
 
-    def _check_X(self, X):
+    def _check_X(self, X, is_fit=False):
         """
         Perform custom check_array:
         - convert list of strings to object dtype
@@ -61,10 +61,38 @@ def _check_X(self, X):
         n_samples, n_features = X.shape
         X_columns = []
 
+        if is_fit:
+            self.features_dtype = []
         for i in range(n_features):
             Xi = self._get_feature(X, feature_idx=i)
-            Xi = check_array(Xi, ensure_2d=False, dtype=None,
-                             force_all_finite=needs_validation)
+
+            if Xi.dtype.name == 'category':
+                # categorical dtype; do not want to convert to an array,
+                # check if there are no nans (otherwise done in check_array())
+                if Xi.isna().any():
+                    raise ValueError("Input contains NaN")
+            else:
+                Xi = check_array(Xi, ensure_2d=False, dtype=None,
+                                 force_all_finite=needs_validation)
+
+            if is_fit:
+                # save the dtype or exact categories if category dtype
+                if Xi.dtype.name == 'category':
+                    f_dtype = Xi.cat.categories
+                else:
+                    f_dtype = Xi.dtype
+                self.features_dtype.append(f_dtype)
+            else:
+                # transform, check if dtype is the same as it was passed by fit
+                if not (Xi.dtype == self.features_dtype[i]):
+                    if Xi.dtype.name == 'category':
+                        # check if categories are the same
+                        if not (Xi.cat.categories ==
+                                self.features_dtype[i]).all():
+                                    raise ValueError("""Categories of
+                                                        the features were
+                                                        different in fit() and
+                                                        in the transform()""")
             X_columns.append(Xi)
 
         return X_columns, n_samples, n_features
@@ -77,7 +105,7 @@ def _get_feature(self, X, feature_idx):
         return X[:, feature_idx]
 
     def _fit(self, X, handle_unknown='error'):
-        X_list, n_samples, n_features = self._check_X(X)
+        X_list, n_samples, n_features = self._check_X(X, is_fit=True)
 
         if self._categories != 'auto':
             if len(self._categories) != n_features:
@@ -114,7 +142,6 @@ def _transform(self, X, handle_unknown='error'):
             Xi = X_list[i]
             diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
                                                      return_mask=True)
-
             if not np.all(valid_mask):
                 if handle_unknown == 'error':
                     msg = ("Found unknown categories {0} in column {1}"
@@ -675,22 +702,14 @@ def _legacy_transform(self, X):
 
     def _transform_new(self, X):
         """New implementation assuming categorical input"""
-        X_temp = check_array(X, dtype=None)
-        if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
-            X = check_array(X, dtype=np.object)
-        else:
-            X = X_temp
-
-        n_samples, n_features = X.shape
-
         X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
+        n_samples, n_features = X_int.shape
 
         if self.drop is not None:
             to_drop = self.drop_idx_.reshape(1, -1)
 
             # We remove all the dropped categories from mask, and decrement all
             # categories that occur after them to avoid an empty column.
-
             keep_cells = X_int != to_drop
             X_mask &= keep_cells
             X_int[X_int > to_drop] -= 1

diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
@@ -70,6 +70,17 @@ def _encode_python(values, uniques=None, encode=False):
         return uniques
 
 
+def _encode_categorical(values, uniques=None, encode=False):
+    """Categorical-dtype branch of _encode; codes always index `uniques`."""
+    cat_values = values.array
+    if uniques is None:
+        uniques = cat_values.categories
+    if not encode:
+        return uniques
+    # re-map onto `uniques` (pandas gives -1 for values missing from it)
+    return uniques, cat_values.set_categories(uniques).codes
+
+
 def _encode(values, uniques=None, encode=False):
     """Helper function to factorize (find uniques) and encode values.
 
@@ -104,8 +115,11 @@ def _encode(values, uniques=None, encode=False):
         try:
             res = _encode_python(values, uniques, encode)
         except TypeError:
-            raise TypeError("argument must be a string or number")
+            raise TypeError("argument must be a string or a number")
         return res
+    elif values.dtype.name == "category":
+        # pandas dtype category
+        return _encode_categorical(values, uniques, encode)
     else:
         return _encode_numpy(values, uniques, encode)
 
@@ -147,6 +161,16 @@ def _encode_check_unknown(values, uniques, return_mask=False):
             return diff, valid_mask
         else:
             return diff
+    elif values.dtype.name == "category":
+        # if the pandas category dtype is used there should never be a diff
+        cat_values = values.array
+        uniques_set = set(uniques)
+        diff = list(set(cat_values) - uniques_set)
+        if return_mask:
+            valid_mask = np.ones(len(values), dtype=bool)
+            return diff, valid_mask
+        else:
+            return diff
     else:
         unique_values = np.unique(values)
         diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))

diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
@@ -346,6 +346,44 @@ def test_one_hot_encoder_dtype_pandas(output_dtype):
     assert_array_equal(oh.fit(X_df).transform(X_df), X_expected)
 
 
+@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
+def test_one_hot_encoder_categorical_dtype(method):
+    # categories_ of a categorical column must equal its declared categories
+    pd = pytest.importorskip('pandas')
+    col_a = pd.Categorical(["a", "b", "c"], categories=["b", "a", "c", "d"])
+    frame = pd.DataFrame({"A": col_a, "B": ["a", "c", "c"]})
+    encoder = OneHotEncoder()
+    getattr(encoder, method)(frame)
+    learned = encoder.categories_
+    assert_array_equal(learned[0], frame['A'].cat.categories)
+    assert_array_equal(learned[1], frame['B'].unique())
+
+
+@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
+def test_categorical_nans(method):
+    # A missing value inside a pandas categorical column must be rejected
+    # with a ValueError by both fit and fit_transform.
+    pd = pytest.importorskip('pandas')
+    col_a = pd.Categorical(["a", None, "c"], categories=["b", "a", "c", "d"])
+    frame = pd.DataFrame({"A": col_a, "B": ["a", "c", "c"]})
+    fit_method = getattr(OneHotEncoder(), method)
+    with pytest.raises(ValueError, match="Input contains NaN"):
+        fit_method(frame)
+
+
+def test_categorical_same_category_fit_transform():
+    # transform must reject a categorical column whose declared categories
+    # are not in the same order as the ones seen during fit
+    pd = pytest.importorskip('pandas')
+    fit_col = pd.Categorical(["a", "b"], categories=["b", "a", "c", "d"])
+    new_col = pd.Categorical(["b", "a"], categories=["a", "c", "d", "b"])
+    train = pd.DataFrame({"A": fit_col, "B": ["a", "c"]})
+    reordered = pd.DataFrame({"A": new_col, "B": ["a", "c"]})
+
+    encoder = OneHotEncoder().fit(train)
+    with pytest.raises(ValueError, match="Categories of"):
+        encoder.transform(reordered)
+
 def test_one_hot_encoder_set_params():
     X = np.array([[1, 2]]).T
     oh = OneHotEncoder()