diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 835aead4f8836..181cc4244c3e9 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -752,10 +752,12 @@ enable the gathering of infrequent categories are `min_frequency` and this fraction of the total number of samples will be considered infrequent. The default value is 1, which means every category is encoded separately. -2. `max_categories` is either `None` or any integer greater than 1. This - parameter sets an upper limit to the number of output features for each - input feature. `max_categories` includes the feature that combines - infrequent categories. +2. `max_categories` is either `None` or any integer greater than or equal to 1. + :class:`OrdinalEncoder` also supports an array-like containing `None` and + integers or a dictionary mapping a feature name found in `feature_names_in_` + to an integer. This parameter sets an upper limit to the + number of output categories for each input feature. `max_categories` + includes the category that combines infrequent categories. In the following example with :class:`OrdinalEncoder`, the categories `'dog'` and `'snake'` are considered infrequent:: diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 29d4d87e68748..3db8409714584 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -925,6 +925,12 @@ Changelog - |Fix| :class:`preprocessing.OneHotEncoder` and :class:`preprocessing.OrdinalEncoder` raise an exception if the user provided categories contain duplicates. :pr:`27328` by :user:`Xuefeng Xu `. + +- |Enhancement| Added support for passing `max_categories` as `array-like` or + `dict` in :class:`preprocessing.OrdinalEncoder`. This allows specifying the + maximum number of output categories for each input feature instead of being + restricted to setting a global maximum number of output categories. + :pr:`26284` by :user:`Andrew Wang `. 
- |Fix| :class:`preprocessing.FunctionTransformer` raises an error at `transform` if the output of `get_feature_names_out` is not consistent with the column names of the diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 86e0c991ab2a3..46492b6e3181e 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -265,15 +265,84 @@ def infrequent_categories_(self): for category, indices in zip(self.categories_, infrequent_indices) ] + def _validate_max_categories(self): + """ + Check max_categories and returns the corresponding array. + """ + max_categories = getattr(self, "max_categories", None) + + if isinstance(max_categories, Integral) and max_categories >= 1: + return [max_categories] * self.n_features_in_ + + elif isinstance(max_categories, dict): + if not hasattr(self, "feature_names_in_"): + raise ValueError( + f"{self.__class__.__name__} was not fitted on data " + "with feature names. Pass max_categories as an integer " + "array instead." + ) + + unexpected_feature_names = list( + set(self.max_categories) - set(self.feature_names_in_) + ) + if unexpected_feature_names: + unexpected_feature_names.sort() # deterministic error message + n_unexpected = len(unexpected_feature_names) + if len(unexpected_feature_names) > 5: + unexpected_feature_names = unexpected_feature_names[:5] + unexpected_feature_names.append("...") + raise ValueError( + f"max_categories contains {n_unexpected} unexpected feature " + f"names: {unexpected_feature_names}." + ) + + max_categories_array = [None] * self.n_features_in_ + for feature_idx, feature_name in enumerate(self.feature_names_in_): + if feature_name in max_categories: + max_count = max_categories[feature_name] + if not (isinstance(max_count, Integral) and max_count >= 1): + raise ValueError( + f"max_categories['{feature_name}'] must be an " + f"integer at least 1. Got {max_count!r}." 
+ ) + max_categories_array[feature_idx] = max_count + return max_categories_array if any(max_categories_array) else None + + elif _is_arraylike_not_scalar(max_categories): + max_categories = np.asarray(max_categories) + if ( + max_categories.ndim != 1 + or max_categories.shape[0] != self.n_features_in_ + ): + raise ValueError( + f"max_categories has shape {max_categories.shape} but the " + f"input data X has {self.n_features_in_} features." + ) + + if any( + max_count is not None + and not (isinstance(max_count, Integral) and max_count >= 1) + for max_count in max_categories + ): + raise ValueError( + "max_categories must be an array-like of None or integers " + "at least 1." + ) + + return max_categories if any(max_categories) else None + + else: + return None + def _check_infrequent_enabled(self): """ This functions checks whether _infrequent_enabled is True or False. This has to be called after parameter validation in the fit function. """ - max_categories = getattr(self, "max_categories", None) + self._max_categories_per_feature = self._validate_max_categories() min_frequency = getattr(self, "min_frequency", None) self._infrequent_enabled = ( - max_categories is not None and max_categories >= 1 + self._max_categories_per_feature is not None ) or min_frequency is not None def _identify_infrequent(self, category_count, n_samples, col_idx): @@ -305,9 +374,14 @@ def _identify_infrequent(self, category_count, n_samples, col_idx): infrequent_mask = np.zeros(category_count.shape[0], dtype=bool) n_current_features = category_count.size - infrequent_mask.sum() + 1 - if self.max_categories is not None and self.max_categories < n_current_features: + if self._max_categories_per_feature is not None: + max_categories = self._max_categories_per_feature[col_idx] + else: + max_categories = None + + if max_categories is not None and max_categories < n_current_features: # max_categories includes the one infrequent category - frequent_category_count = self.max_categories - 1 + 
frequent_category_count = max_categories - 1 if frequent_category_count == 0: # All categories are infrequent infrequent_mask[:] = True @@ -1318,12 +1392,20 @@ class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder): .. versionadded:: 1.3 Read more in the :ref:`User Guide `. - max_categories : int, default=None + max_categories : int, array-like of int, dict of str or None, default=None Specifies an upper limit to the number of output categories for each input feature when considering infrequent categories. If there are infrequent categories, `max_categories` includes the category representing the - infrequent categories along with the frequent categories. If `None`, - there is no limit to the number of output features. + infrequent categories along with the frequent categories. + + - If int, then `max_categories` is the upper limit of output categories + for all input features. + - If array-like, then each item in `max_categories` is the upper limit + of output categories for the corresponding input feature. + - If dict, then its keys should be the feature names occurring in + `feature_names_in_` and the corresponding values should be the + upper limits of output categories. + - If `None`, then there is no limit to the number of output categories. `max_categories` do **not** take into account missing or unknown categories. 
Setting `unknown_value` or `encoded_missing_value` to an @@ -1443,7 +1525,12 @@ class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder): "encoded_missing_value": [Integral, type(np.nan)], "handle_unknown": [StrOptions({"error", "use_encoded_value"})], "unknown_value": [Integral, type(np.nan), None], - "max_categories": [Interval(Integral, 1, None, closed="left"), None], + "max_categories": [ + Interval(Integral, 1, None, closed="left"), + "array-like", + dict, + None, + ], "min_frequency": [ Interval(Integral, 1, None, closed="left"), Interval(RealNotInt, 0, 1, closed="neither"), diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index dc7bbd2ec03b6..cb553458dfd2d 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -2138,6 +2138,87 @@ def test_ordinal_encoder_infrequent_three_levels(kwargs): assert_array_equal(X_inverse, expected_inverse) +@pytest.mark.parametrize( + "kwargs, expected_infrequent_categories, expected_trans, expected_inverse", + [ + ( + {"max_categories": [3, 3, 3]}, + [["a", "d"], ["e", "h"], ["i", "l"]], + [[2] * 3, [0] * 3, [1] * 3, [2] * 3, [-1] * 3], + [ + ["infrequent_sklearn"] * 3, + ["b", "f", "j"], + ["c", "g", "k"], + ["infrequent_sklearn"] * 3, + [None] * 3, + ], + ), + ( + {"max_categories": [3, None, None]}, + [["a", "d"], None, None], + [[2, 0, 0], [0, 1, 1], [1, 2, 2], [2, 3, 3], [-1] * 3], + [ + ["infrequent_sklearn", "e", "i"], + ["b", "f", "j"], + ["c", "g", "k"], + ["infrequent_sklearn", "h", "l"], + [None] * 3, + ], + ), + ( + {"max_categories": [3, 2, 1]}, + [["a", "d"], ["e", "g", "h"], ["i", "j", "k", "l"]], + [[2, 1, 0], [0, 0, 0], [1, 1, 0], [2, 1, 0], [-1] * 3], + [ + ["infrequent_sklearn"] * 3, + ["b", "f", "infrequent_sklearn"], + ["c", "infrequent_sklearn", "infrequent_sklearn"], + ["infrequent_sklearn"] * 3, + [None] * 3, + ], + ), + ], +) +def test_ordinal_encoder_infrequent_three_levels_multiple_features( 
+ kwargs, expected_infrequent_categories, expected_trans, expected_inverse +): + """Test parameters for grouping multiple features into the infrequent category.""" + + X_train = ( + [["a", "e", "i"]] * 5 + + [["b", "f", "j"]] * 20 + + [["c", "g", "k"]] * 10 + + [["d", "h", "l"]] * 3 + ) + ordinal = OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1, **kwargs + ).fit(X_train) + assert_array_equal( + ordinal.categories_, + [["a", "b", "c", "d"], ["e", "f", "g", "h"], ["i", "j", "k", "l"]], + ) + + assert len(ordinal.infrequent_categories_) == len(expected_infrequent_categories) + for expected, actual in zip( + ordinal.infrequent_categories_, expected_infrequent_categories + ): + assert_array_equal(expected, actual) + + X_test = [ + ["a", "e", "i"], + ["b", "f", "j"], + ["c", "g", "k"], + ["d", "h", "l"], + ["x", "y", "z"], + ] + + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, expected_trans) + + X_inverse = ordinal.inverse_transform(X_trans) + assert_array_equal(X_inverse, expected_inverse) + + def test_ordinal_encoder_infrequent_three_levels_user_cats(): """Test that the order of the categories provided by a user is respected. 
@@ -2195,7 +2276,34 @@ def test_ordinal_encoder_infrequent_mixed(): assert_array_equal(X_inverse, expected_inverse) -def test_ordinal_encoder_infrequent_multiple_categories_dtypes(): +@pytest.mark.parametrize( + "kwargs, expected_infrequent_categories, expected_trans", + [ + ( + {"max_categories": 3}, + [["a", "b"], [0, 3, 12], ["bird", "snake"]], + [[2, 2, 0], [2, 2, 2], [1, 1, 2], [0, 0, 1]], + ), + ( + {"max_categories": [3, None, None]}, + [["a", "b"], None, None], + [[2, 4, 1], [2, 0, 3], [1, 3, 0], [0, 2, 2]], + ), + ( + {"max_categories": {"str": 3}}, + [["a", "b"], None, None], + [[2, 4, 1], [2, 0, 3], [1, 3, 0], [0, 2, 2]], + ), + ( + {"max_categories": {"str": 3, "int": 2, "categorical": 1}}, + [["a", "b"], [0, 3, 5, 12], ["bird", "cat", "dog", "snake"]], + [[2, 1, 0], [2, 1, 0], [1, 0, 0], [0, 1, 0]], + ), + ], +) +def test_ordinal_encoder_infrequent_multiple_categories_dtypes( + kwargs, expected_infrequent_categories, expected_trans +): """Test infrequent categories with a pandas DataFrame with multiple dtypes.""" pd = pytest.importorskip("pandas") @@ -2212,7 +2320,7 @@ def test_ordinal_encoder_infrequent_multiple_categories_dtypes(): columns=["str", "int", "categorical"], ) - ordinal = OrdinalEncoder(max_categories=3).fit(X) + ordinal = OrdinalEncoder(**kwargs).fit(X) # X[:, 0] 'a', 'b', 'c' have the same frequency. 
'a' and 'b' will be # considered infrequent because they appear first when sorted @@ -2222,9 +2330,11 @@ def test_ordinal_encoder_infrequent_multiple_categories_dtypes(): # X[:, 2] "snake" and "bird" or infrequent - assert_array_equal(ordinal.infrequent_categories_[0], ["a", "b"]) - assert_array_equal(ordinal.infrequent_categories_[1], [0, 3, 12]) - assert_array_equal(ordinal.infrequent_categories_[2], ["bird", "snake"]) + assert len(ordinal.infrequent_categories_) == len(expected_infrequent_categories) + for expected, actual in zip( + ordinal.infrequent_categories_, expected_infrequent_categories + ): + assert_array_equal(expected, actual) X_test = pd.DataFrame( { @@ -2237,7 +2347,6 @@ def test_ordinal_encoder_infrequent_multiple_categories_dtypes(): }, columns=["str", "int", "categorical"], ) - expected_trans = [[2, 2, 0], [2, 2, 2], [1, 1, 2], [0, 0, 1]] X_trans = ordinal.transform(X_test) assert_allclose(X_trans, expected_trans) @@ -2352,6 +2461,158 @@ def test_ordinal_encoder_missing_appears_infrequent(): assert_allclose(X_trans, [[2, 1], [2, 0], [np.nan, 0], [1, 0], [0, 1]]) +def test_ordinal_encoder_missing_feature_names(): + """Check behavior when max_categories specifies features on a dataset without + feature names.""" + X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T + ordinal = OrdinalEncoder(max_categories={"x0": 3}) + + with pytest.raises( + ValueError, + match=( + "OrdinalEncoder was not fitted on data with feature names. Pass" + " max_categories as an integer array instead." 
+ ), + ): + ordinal.fit(X) + + +@pytest.mark.parametrize( + "max_categories, unexpected_features, n_unexpected_features", + [ + ({"str": 3, "int": 2, "categorical": 1, "unexpected": 13}, ["unexpected"], 1), + ( + { + "str": 3, + "int": 2, + "categorical": 1, + "unexpected": 13, + "unexpected_2": 13, + "unexpected_3": 13, + "unexpected_4": 13, + "unexpected_5": 13, + "unexpected_6": 13, + }, + [ + "unexpected", + "unexpected_2", + "unexpected_3", + "unexpected_4", + "unexpected_5", + "...", + ], + 6, + ), + ], +) +def test_ordinal_encoder_unexpected_feature_names( + max_categories, unexpected_features, n_unexpected_features +): + """Check behavior when max_categories specifies features that are not present on a + dataset.""" + pd = pytest.importorskip("pandas") + categorical_dtype = pd.CategoricalDtype(["bird", "cat", "dog", "snake"]) + X = pd.DataFrame( + { + "str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"], + "int": [5, 3, 0, 10, 10, 12, 0, 3, 5], + "categorical": pd.Series( + ["dog"] * 4 + ["cat"] * 3 + ["snake"] + ["bird"], + dtype=categorical_dtype, + ), + }, + columns=["str", "int", "categorical"], + ) + + ordinal = OrdinalEncoder(max_categories=max_categories) + + msg = re.escape( + f"max_categories contains {n_unexpected_features} unexpected feature names:" + f" {unexpected_features}." 
+ ) + + with pytest.raises(ValueError, match=msg): + ordinal.fit(X) + + +@pytest.mark.parametrize( + "max_categories, incorrect_feature, incorrect_value", + [ + ({"str": 3, "int": None, "categorical": 1}, "int", None), + ({"str": 3, "int": 2, "categorical": 0}, "categorical", 0), + ], +) +def test_ordinal_encoder_max_categories_dict_invalid_types( + max_categories, incorrect_feature, incorrect_value +): + """Check behavior when max_categories as a dictionary contains values that are + invalid.""" + pd = pytest.importorskip("pandas") + categorical_dtype = pd.CategoricalDtype(["bird", "cat", "dog", "snake"]) + X = pd.DataFrame( + { + "str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"], + "int": [5, 3, 0, 10, 10, 12, 0, 3, 5], + "categorical": pd.Series( + ["dog"] * 4 + ["cat"] * 3 + ["snake"] + ["bird"], + dtype=categorical_dtype, + ), + }, + columns=["str", "int", "categorical"], + ) + + ordinal = OrdinalEncoder(max_categories=max_categories) + + msg = re.escape( + f"max_categories['{incorrect_feature}'] must be an integer at least 1. " + f"Got {incorrect_value}." + ) + + with pytest.raises(ValueError, match=msg): + ordinal.fit(X) + + +@pytest.mark.parametrize("max_categories", [[], [3, 2], [[3, 2, 1]]]) +def test_ordinal_encoder_max_categories_array_like_invalid_shape(max_categories): + """Check behavior when max_categories as an array_like has an invalid shape.""" + X = ( + [["a", "e", "i"]] * 5 + + [["b", "f", "j"]] * 20 + + [["c", "g", "k"]] * 10 + + [["d", "h", "l"]] * 3 + ) + + ordinal = OrdinalEncoder(max_categories=max_categories) + + msg = re.escape( + f"max_categories has shape {np.asarray(max_categories).shape} but the " + "input data X has 3 features." 
+ ) + + with pytest.raises(ValueError, match=msg): + ordinal.fit(X) + + +def test_ordinal_encoder_max_categories_array_like_invalid_types(): + """Check behavior when max_categories as an array_like contains values that + are invalid.""" + X = ( + [["a", "e", "i"]] * 5 + + [["b", "f", "j"]] * 20 + + [["c", "g", "k"]] * 10 + + [["d", "h", "l"]] * 3 + ) + + ordinal = OrdinalEncoder(max_categories=[3, None, 0]) + + msg = re.escape( + "max_categories must be an array-like of None or integers at least 1." + ) + + with pytest.raises(ValueError, match=msg): + ordinal.fit(X) + + @pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder]) def test_encoder_not_fitted(Encoder): """Check that we raise a `NotFittedError` by calling transform before fit with