diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index ac9225bdfc31f..76b90af4a01de 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -32,7 +32,7 @@ class _BaseEncoder(BaseEstimator, TransformerMixin):
 
     """
 
-    def _check_X(self, X):
+    def _check_X(self, X, is_fit=False):
         """
         Perform custom check_array:
         - convert list of strings to object dtype
@@ -61,10 +61,38 @@ def _check_X(self, X):
         n_samples, n_features = X.shape
         X_columns = []
 
+        if is_fit:
+            self.features_dtype = []
         for i in range(n_features):
             Xi = self._get_feature(X, feature_idx=i)
-            Xi = check_array(Xi, ensure_2d=False, dtype=None,
-                             force_all_finite=needs_validation)
+
+            if Xi.dtype.name == 'category':
+                # categorical dtype: do not convert to an array; check
+                # for NaN values here (otherwise done by check_array())
+                if Xi.isna().any():
+                    raise ValueError("Input contains NaN")
+            else:
+                Xi = check_array(Xi, ensure_2d=False, dtype=None,
+                                 force_all_finite=needs_validation)
+
+            if is_fit:
+                # save the dtype, or the exact categories for category dtype
+                if Xi.dtype.name == 'category':
+                    f_dtype = Xi.cat.categories
+                else:
+                    f_dtype = Xi.dtype
+                self.features_dtype.append(f_dtype)
+            else:
+                # transform: check that the dtype matches the one seen in fit
+                if not (Xi.dtype == self.features_dtype[i]):
+                    if Xi.dtype.name == 'category':
+                        # check that the categories are the same
+                        if not (Xi.cat.categories ==
+                                self.features_dtype[i]).all():
+                            raise ValueError("Categories of the "
+                                             "features were different "
+                                             "in fit() and in "
+                                             "transform()")
             X_columns.append(Xi)
 
         return X_columns, n_samples, n_features
@@ -77,7 +105,7 @@ def _get_feature(self, X, feature_idx):
         return X[:, feature_idx]
 
     def _fit(self, X, handle_unknown='error'):
-        X_list, n_samples, n_features = self._check_X(X)
+        X_list, n_samples, n_features = self._check_X(X, is_fit=True)
 
         if self._categories != 'auto':
             if len(self._categories) != n_features:
@@ -114,7 +142,6 @@ def _transform(self, X, handle_unknown='error'):
             Xi = X_list[i]
             diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
                                                      return_mask=True)
-
             if not np.all(valid_mask):
                 if handle_unknown == 'error':
                     msg = ("Found unknown categories {0} in column {1}"
@@ -675,22 +702,14 @@ def _legacy_transform(self, X):
 
     def _transform_new(self, X):
         """New implementation assuming categorical input"""
-        X_temp = check_array(X, dtype=None)
-        if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
-            X = check_array(X, dtype=np.object)
-        else:
-            X = X_temp
-
-        n_samples, n_features = X.shape
-
         X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
+        n_samples, n_features = X_int.shape
 
         if self.drop is not None:
             to_drop = self.drop_idx_.reshape(1, -1)
 
             # We remove all the dropped categories from mask, and decrement all
             # categories that occur after them to avoid an empty column.
-
             keep_cells = X_int != to_drop
             X_mask &= keep_cells
             X_int[X_int > to_drop] -= 1
diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index f7cffa1e663b5..46bdb6b36501e 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -70,6 +70,17 @@ def _encode_python(values, uniques=None, encode=False):
         return uniques
 
 
+def _encode_categorical(values, uniques=None, encode=False):
+    # only used in _encode below, see docstring there for details
+    cat_values = values.array
+    if uniques is None:
+        uniques = cat_values.categories
+    if encode:
+        # TODO: check that all values.categories are included in uniques
+        return uniques, cat_values.codes
+    else:
+        return uniques
+
 def _encode(values, uniques=None, encode=False):
     """Helper function to factorize (find uniques) and encode values.
 
@@ -104,8 +115,11 @@
         try:
             res = _encode_python(values, uniques, encode)
         except TypeError:
-            raise TypeError("argument must be a string or number")
+            raise TypeError("argument must be a string or a number")
         return res
+    elif values.dtype.name == "category":
+        # pandas Categorical dtype
+        return _encode_categorical(values, uniques, encode)
     else:
         return _encode_numpy(values, uniques, encode)
 
@@ -147,6 +161,16 @@ def _encode_check_unknown(values, uniques, return_mask=False):
             return diff, valid_mask
         else:
             return diff
+    elif values.dtype.name == "category":
+        # with the pandas Categorical dtype there should never be a diff
+        cat_values = values.array
+        uniques_set = set(uniques)
+        diff = list(set(cat_values) - uniques_set)
+        if return_mask:
+            valid_mask = np.ones(len(values), dtype=bool)
+            return diff, valid_mask
+        else:
+            return diff
     else:
         unique_values = np.unique(values)
         diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 850efe22b5d11..ba1f260ca2f2e 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -346,6 +346,44 @@ def test_one_hot_encoder_dtype_pandas(output_dtype):
     assert_array_equal(oh.fit(X_df).transform(X_df), X_expected)
 
 
+@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
+def test_one_hot_encoder_categorical_dtype(method):
+    pd = pytest.importorskip('pandas')
+    cat = pd.Categorical(["a", "b", "c"], categories=["b", "a", "c", "d"])
+    X_df = pd.DataFrame({"A": cat, "B": ["a", "c", "c"]})
+
+    oh = OneHotEncoder()
+    getattr(oh, method)(X_df)
+    cats = oh.categories_
+    assert_array_equal(cats[0], X_df['A'].cat.categories)
+    assert_array_equal(cats[1], X_df['B'].unique())
+
+
+@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
+def test_categorical_nans(method):
+    # ensure an error is raised if the categorical data contains NaN/None
+    pd = pytest.importorskip('pandas')
+    cat = pd.Categorical(["a", None, "c"], categories=["b", "a", "c", "d"])
+    X_df = pd.DataFrame({"A": cat, "B": ["a", "c", "c"]})
+
+    oh = OneHotEncoder()
+    with pytest.raises(ValueError, match="Input contains NaN"):
+        getattr(oh, method)(X_df)
+
+
+def test_categorical_same_category_fit_transform():
+    # transform() must raise if the categories differ from those seen in fit()
+    pd = pytest.importorskip('pandas')
+    cat_fit = pd.Categorical(["a", "b"], categories=["b", "a", "c", "d"])
+    cat_transform = pd.Categorical(["b", "a"], categories=["a", "c", "d", "b"])
+    X_fit = pd.DataFrame({"A": cat_fit, "B": ["a", "c"]})
+    X_transform = pd.DataFrame({"A": cat_transform, "B": ["a", "c"]})
+
+    oh = OneHotEncoder()
+    oh.fit(X_fit)
+    with pytest.raises(ValueError, match="Categories of"):
+        oh.transform(X_transform)
+
 def test_one_hot_encoder_set_params():
     X = np.array([[1, 2]]).T
     oh = OneHotEncoder()
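
Illustrative usage, not part of the patch: a minimal sketch of how OneHotEncoder is expected to behave on a pandas Categorical column once this change is applied, mirroring test_one_hot_encoder_categorical_dtype above; the DataFrame here is made up for illustration.

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# column "A" has an explicit Categorical dtype with an unused category "d";
# column "B" is a plain object (string) column
cat = pd.Categorical(["a", "b", "c"], categories=["b", "a", "c", "d"])
X_df = pd.DataFrame({"A": cat, "B": ["a", "c", "c"]})

enc = OneHotEncoder()
enc.fit(X_df)

# with the patch, categories_ for "A" keeps the full Categorical categories
# (including the unused "d") rather than only the values observed in fit,
# while "B" still uses the values seen during fit
print(enc.categories_)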