From b4208f23e1f0240d05e7c4d033bd54a2e905fe4e Mon Sep 17 00:00:00 2001
From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com>
Date: Tue, 25 Apr 2023 14:50:25 -0700
Subject: [PATCH 01/16] Add per feature max_categories for OrdinalEncoder

---
 doc/modules/preprocessing.rst                |   9 +-
 sklearn/preprocessing/_encoders.py           | 124 +++++++++++++++++--
 sklearn/preprocessing/tests/test_encoders.py | 116 ++++++++++++++++-
 3 files changed, 232 insertions(+), 17 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 69045147d8af9..684ab410ba335 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -746,10 +746,11 @@ enable the gathering of infrequent categories are `min_frequency` and
    this fraction of the total number of samples will be considered infrequent.
    The default value is 1, which means every category is encoded separately.
 
-2. `max_categories` is either `None` or any integer greater than 1. This
-   parameter sets an upper limit to the number of output features for each
-   input feature. `max_categories` includes the feature that combines
-   infrequent categories.
+2. `max_categories` is either `None`, any integer greater than 1, or (for
+   :class:`OrdinalEncoder` only) a dictionary mapping a valid feature name to
+   any integer greater than 1. This parameter sets an upper limit to the number
+   of output features for each input feature. `max_categories` includes the
+   feature that combines infrequent categories.
 
 In the following example with :class:`OrdinalEncoder`, the categories `'dog' and
 'snake'` are considered infrequent::
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index fd9941f5336ed..cbf59aad6a810 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -250,17 +250,38 @@ def infrequent_categories_(self):
             for category, indices in zip(self.categories_, infrequent_indices)
         ]
 
+    def _check_max_categories(self):
+        """
+        This function checks whether the value of max_categories
+        enables infrequent categories.
+        """
+        max_categories = getattr(self, "max_categories", None)
+        return max_categories is not None and max_categories >= 1
+
     def _check_infrequent_enabled(self):
         """
         This functions checks whether _infrequent_enabled is True or False.
         This has to be called after parameter validation in the fit function.
         """
-        max_categories = getattr(self, "max_categories", None)
         min_frequency = getattr(self, "min_frequency", None)
         self._infrequent_enabled = (
-            max_categories is not None and max_categories >= 1
+            self._check_max_categories()
         ) or min_frequency is not None
 
+    def _has_infrequent_categories(self, n_current_features, col_idx):
+        """
+        This function checks if there are any infrequent categories.
+        """
+        return (
+            self.max_categories is not None and self.max_categories < n_current_features
+        )
+
+    def _get_frequent_category_count(self, col_idx):
+        """
+        This functions computes the number of frequent categories.
+        """
+        return self.max_categories - 1
+
     def _identify_infrequent(self, category_count, n_samples, col_idx):
         """Compute the infrequent indices.
 
@@ -290,9 +311,9 @@ def _identify_infrequent(self, category_count, n_samples, col_idx):
             infrequent_mask = np.zeros(category_count.shape[0], dtype=bool)
 
         n_current_features = category_count.size - infrequent_mask.sum() + 1
-        if self.max_categories is not None and self.max_categories < n_current_features:
+        if self._has_infrequent_categories(n_current_features, col_idx):
             # max_categories includes the one infrequent category
-            frequent_category_count = self.max_categories - 1
+            frequent_category_count = self._get_frequent_category_count(col_idx)
             if frequent_category_count == 0:
                 # All categories are infrequent
                 infrequent_mask[:] = True
@@ -1294,11 +1315,13 @@ class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder):
         .. versionadded:: 1.3
             Read more in the :ref:`User Guide <encoder_infrequent_categories>`.
 
-    max_categories : int, default=None
+    max_categories : int or dict, default=None
         Specifies an upper limit to the number of output categories for each input
         feature when considering infrequent categories. If there are infrequent
         categories, `max_categories` includes the category representing the
-        infrequent categories along with the frequent categories. If `None`,
+        infrequent categories along with the frequent categories.
+        If `max_categories` is a dictionary, each key-value pair represents the
+        upper limit to the number of output categories per feature. If `None`,
         there is no limit to the number of output features.
 
         `max_categories` do **not** take into account missing or unknown
@@ -1419,7 +1442,11 @@ class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder):
         "encoded_missing_value": [Integral, type(np.nan)],
         "handle_unknown": [StrOptions({"error", "use_encoded_value"})],
         "unknown_value": [Integral, type(np.nan), None],
-        "max_categories": [Interval(Integral, 1, None, closed="left"), None],
+        "max_categories": [
+            Interval(Integral, 1, None, closed="left"),
+            dict,
+            None,
+        ],
         "min_frequency": [
             Interval(Integral, 1, None, closed="left"),
             Interval(RealNotInt, 0, 1, closed="neither"),
@@ -1446,6 +1473,84 @@ def __init__(
         self.min_frequency = min_frequency
         self.max_categories = max_categories
 
+    def _validate_max_categories_dict(self):
+        """
+        This functions validates max_categories when it is a dictionary.
+        """
+        if len(self.max_categories) == 0:
+            raise ValueError("max_categories dictionary must be non-empty.")
+
+        for feature_name in self.max_categories.keys():
+            if not isinstance(feature_name, str):
+                raise TypeError(
+                    "feature in max_categories dictionary "
+                    "must be a string, "
+                    f"got {type(feature_name).__name__} for {feature_name}."
+                )
+
+        feature_names = set(_check_feature_names_in(self))
+        if self.max_categories.keys() > feature_names:
+            excess_feature_names = ", ".join(
+                sorted(self.max_categories.keys() - feature_names)
+            )
+            raise ValueError(
+                "features in max_categories dictionary "
+                "must be a valid feature name, "
+                f"got {excess_feature_names}."
+            )
+
+        for max_count in self.max_categories.values():
+            if not isinstance(max_count, Integral):
+                raise TypeError(
+                    "value in max_categories dictionary "
+                    "must be an integer, "
+                    f"got {type(max_count).__name__} for {max_count}."
+                )
+            if max_count < 1:
+                raise ValueError(
+                    "value in max_categories dictionary "
+                    "must be at least 1, "
+                    f"got {max_count}."
+                )
+
+    def _check_max_categories(self):
+        """
+        This function checks whether the value of max_categories
+        enables infrequent categories.
+        """
+        max_categories = getattr(self, "max_categories", None)
+
+        if max_categories is None:
+            return False
+        elif isinstance(max_categories, Integral):
+            return max_categories >= 1
+        else:
+            return all(max_count >= 1 for max_count in max_categories.values())
+
+    def _has_infrequent_categories(self, n_current_features, col_idx):
+        """
+        This function checks if there are any infrequent categories.
+        """
+        if self.max_categories is None:
+            return False
+        elif isinstance(self.max_categories, Integral):
+            return self.max_categories < n_current_features
+        else:
+            feature_name = _check_feature_names_in(self)[col_idx]
+            return (
+                feature_name in self.max_categories
+            ) and self.max_categories[feature_name] < n_current_features
+
+    def _get_frequent_category_count(self, col_idx):
+        """
+        This functions computes the number of frequent categories.
+        """
+        if isinstance(self.max_categories, Integral):
+            return self.max_categories - 1
+        else:
+            feature_name = _check_feature_names_in(self)[col_idx]
+            return self.max_categories[feature_name] - 1
+
     def fit(self, X, y=None):
         """
         Fit the OrdinalEncoder to X.
@@ -1488,6 +1593,11 @@ def fit(self, X, y=None):
                 f"got {self.unknown_value}."
             )
 
+        if isinstance(self.max_categories, dict):
+            self._check_n_features(X, reset=True)
+            self._check_feature_names(X, reset=True)
+            self._validate_max_categories_dict()
+
         # `_fit` will only raise an error when `self.handle_unknown="error"`
         fit_results = self._fit(
             X,
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 42c66980bfeba..3f4de65fb001e 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -2091,6 +2091,87 @@ def test_ordinal_encoder_infrequent_three_levels(kwargs):
     assert_array_equal(X_inverse, expected_inverse)
 
 
+@pytest.mark.parametrize(
+    "kwargs, expected_infrequent_categories, expected_trans, expected_inverse",
+    [
+        (
+            {"max_categories": {"x0": 3, "x1": 3, "x2": 3}},
+            [["a", "d"], ["e", "h"], ["i", "l"]],
+            [[2] * 3, [0] * 3, [1] * 3, [2] * 3, [-1] * 3],
+            [
+                ["infrequent_sklearn"] * 3,
+                ["b", "f", "j"],
+                ["c", "g", "k"],
+                ["infrequent_sklearn"] * 3,
+                [None] * 3,
+            ],
+        ),
+        (
+            {"max_categories": {"x0": 3}},
+            [["a", "d"], None, None],
+            [[2, 0, 0], [0, 1, 1], [1, 2, 2], [2, 3, 3], [-1] * 3],
+            [
+                ["infrequent_sklearn", "e", "i"],
+                ["b", "f", "j"],
+                ["c", "g", "k"],
+                ["infrequent_sklearn", "h", "l"],
+                [None] * 3,
+            ],
+        ),
+        (
+            {"max_categories": {"x0": 3, "x1": 2, "x2": 1}},
+            [["a", "d"], ["e", "g", "h"], ["i", "j", "k", "l"]],
+            [[2, 1, 0], [0, 0, 0], [1, 1, 0], [2, 1, 0], [-1] * 3],
+            [
+                ["infrequent_sklearn"] * 3,
+                ["b", "f", "infrequent_sklearn"],
+                ["c", "infrequent_sklearn", "infrequent_sklearn"],
+                ["infrequent_sklearn"] * 3,
+                [None] * 3,
+            ],
+        ),
+    ],
+)
+def test_ordinal_encoder_infrequent_three_levels_multiple_features(
+    kwargs, expected_infrequent_categories, expected_trans, expected_inverse
+):
+    """Test parameters for grouping multiple features into the infrequent category."""
+
+    X_train = (
+        [["a", "e", "i"]] * 5
+        + [["b", "f", "j"]] * 20
+        + [["c", "g", "k"]] * 10
+        + [["d", "h", "l"]] * 3
+    )
+    ordinal = OrdinalEncoder(
+        handle_unknown="use_encoded_value", unknown_value=-1, **kwargs
+    ).fit(X_train)
+    assert_array_equal(
+        ordinal.categories_,
+        [["a", "b", "c", "d"], ["e", "f", "g", "h"], ["i", "j", "k", "l"]],
+    )
+
+    assert len(ordinal.infrequent_categories_) == len(expected_infrequent_categories)
+    for expected, actual in zip(
+        ordinal.infrequent_categories_, expected_infrequent_categories
+    ):
+        assert_array_equal(expected, actual)
+
+    X_test = [
+        ["a", "e", "i"],
+        ["b", "f", "j"],
+        ["c", "g", "k"],
+        ["d", "h", "l"],
+        ["x", "y", "z"],
+    ]
+
+    X_trans = ordinal.transform(X_test)
+    assert_allclose(X_trans, expected_trans)
+
+    X_inverse = ordinal.inverse_transform(X_trans)
+    assert_array_equal(X_inverse, expected_inverse)
+
+
 def test_ordinal_encoder_infrequent_three_levels_user_cats():
     """Test that the order of the categories provided by a user is respected.
 
@@ -2148,7 +2229,29 @@ def test_ordinal_encoder_infrequent_mixed():
     assert_array_equal(X_inverse, expected_inverse)
 
 
-def test_ordinal_encoder_infrequent_multiple_categories_dtypes():
+@pytest.mark.parametrize(
+    "kwargs, expected_infrequent_categories, expected_trans",
+    [
+        (
+            {"max_categories": 3},
+            [["a", "b"], [0, 3, 12], ["bird", "snake"]],
+            [[2, 2, 0], [2, 2, 2], [1, 1, 2], [0, 0, 1]],
+        ),
+        (
+            {"max_categories": {"str": 3}},
+            [["a", "b"], None, None],
+            [[2, 4, 1], [2, 0, 3], [1, 3, 0], [0, 2, 2]],
+        ),
+        (
+            {"max_categories": {"str": 3, "int": 2, "categorical": 1}},
+            [["a", "b"], [0, 3, 5, 12], ["bird", "cat", "dog", "snake"]],
+            [[2, 1, 0], [2, 1, 0], [1, 0, 0], [0, 1, 0]],
+        ),
+    ],
+)
+def test_ordinal_encoder_infrequent_multiple_categories_dtypes(
+    kwargs, expected_infrequent_categories, expected_trans
+):
     """Test infrequent categories with a pandas DataFrame with multiple dtypes."""
 
     pd = pytest.importorskip("pandas")
@@ -2165,7 +2268,7 @@ def test_ordinal_encoder_infrequent_multiple_categories_dtypes():
         columns=["str", "int", "categorical"],
     )
 
-    ordinal = OrdinalEncoder(max_categories=3).fit(X)
+    ordinal = OrdinalEncoder(**kwargs).fit(X)
     # X[:, 0] 'a', 'b', 'c' have the same frequency. 'a' and 'b' will be
     # considered infrequent because they appear first when sorted
 
@@ -2175,9 +2278,11 @@ def test_ordinal_encoder_infrequent_multiple_categories_dtypes():
 
     # X[:, 2] "snake" and "bird" or infrequent
 
-    assert_array_equal(ordinal.infrequent_categories_[0], ["a", "b"])
-    assert_array_equal(ordinal.infrequent_categories_[1], [0, 3, 12])
-    assert_array_equal(ordinal.infrequent_categories_[2], ["bird", "snake"])
+    assert len(ordinal.infrequent_categories_) == len(expected_infrequent_categories)
+    for expected, actual in zip(
+        ordinal.infrequent_categories_, expected_infrequent_categories
+    ):
+        assert_array_equal(expected, actual)
 
     X_test = pd.DataFrame(
         {
@@ -2190,7 +2295,6 @@ def test_ordinal_encoder_infrequent_multiple_categories_dtypes():
         },
         columns=["str", "int", "categorical"],
     )
-    expected_trans = [[2, 2, 0], [2, 2, 2], [1, 1, 2], [0, 0, 1]]
 
     X_trans = ordinal.transform(X_test)
     assert_allclose(X_trans, expected_trans)

From a3db2b62d7ee2a3b8bdb99d8df7cdf9a8816effb Mon Sep 17 00:00:00 2001
From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com>
Date: Wed, 26 Apr 2023 16:18:48 -0700
Subject: [PATCH 02/16] Fix formatting

---
 sklearn/preprocessing/_encoders.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index cbf59aad6a810..445e91a5b36e6 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -1537,9 +1537,9 @@ def _has_infrequent_categories(self, n_current_features, col_idx):
             return self.max_categories < n_current_features
         else:
             feature_name = _check_feature_names_in(self)[col_idx]
-            return (
-                feature_name in self.max_categories
-            ) and self.max_categories[feature_name] < n_current_features
+            return (feature_name in self.max_categories) and self.max_categories[
+                feature_name
+            ] < n_current_features
 
     def _get_frequent_category_count(self, col_idx):
         """

From 39e81ab022297c55cdecb9480f58e3033b7ee791 Mon Sep 17 00:00:00 2001
From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com>
Date: Wed, 26 Apr 2023 23:15:50 -0700
Subject: [PATCH 03/16] Update behaviour of max_categories in OrdinalEncoder

---
 doc/modules/preprocessing.rst                |  12 +-
 sklearn/preprocessing/_encoders.py           | 147 ++++++++++---------
 sklearn/preprocessing/tests/test_encoders.py |  11 +-
 3 files changed, 96 insertions(+), 74 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 684ab410ba335..6bb4b2ee9a437 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -746,11 +746,13 @@ enable the gathering of infrequent categories are `min_frequency` and
    this fraction of the total number of samples will be considered infrequent.
    The default value is 1, which means every category is encoded separately.
 
-2. `max_categories` is either `None`, any integer greater than 1, or (for
-   :class:`OrdinalEncoder` only) a dictionary mapping a valid feature name to
-   any integer greater than 1. This parameter sets an upper limit to the number
-   of output features for each input feature. `max_categories` includes the
-   feature that combines infrequent categories.
+2. `max_categories` is either `None`, any integer greater or equal to 1, an
+   array-like comprised of `None` or integers greater than equal to 1 (for
+   :class:`OrdinalEncoder` only), or a dictionary mapping a feature name found
+   in `feature_names_in_` to any integer greater than or equal to 1 (for
+   :class:`OrdinalEncoder` only). This parameter sets an upper limit to the
+   number of output categories for each input feature. `max_categories`
+   includes the category that combines infrequent categories.
 
 In the following example with :class:`OrdinalEncoder`, the categories `'dog' and
 'snake'` are considered infrequent::
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 445e91a5b36e6..d0ce34739770f 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -250,13 +250,12 @@ def infrequent_categories_(self):
             for category, indices in zip(self.categories_, infrequent_indices)
         ]
 
-    def _check_max_categories(self):
+    def _max_categories_enable_infrequent(self):
         """
         This function checks whether the value of max_categories
         enables infrequent categories.
         """
-        max_categories = getattr(self, "max_categories", None)
-        return max_categories is not None and max_categories >= 1
+        return self.max_categories is not None and self.max_categories >= 1
 
     def _check_infrequent_enabled(self):
         """
@@ -265,7 +264,7 @@ def _check_infrequent_enabled(self):
         """
         min_frequency = getattr(self, "min_frequency", None)
         self._infrequent_enabled = (
-            self._check_max_categories()
+            self._max_categories_enable_infrequent()
         ) or min_frequency is not None
 
     def _has_infrequent_categories(self, n_current_features, col_idx):
@@ -1315,14 +1314,20 @@ class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder):
         .. versionadded:: 1.3
             Read more in the :ref:`User Guide <encoder_infrequent_categories>`.
 
-    max_categories : int or dict, default=None
+    max_categories : int, array-like of int, dict of str or None, default=None
         Specifies an upper limit to the number of output categories for each input
         feature when considering infrequent categories. If there are infrequent
         categories, `max_categories` includes the category representing the
         infrequent categories along with the frequent categories.
-        If `max_categories` is a dictionary, each key-value pair represents the
-        upper limit to the number of output categories per feature. If `None`,
-        there is no limit to the number of output features.
+
+        - If int, then `max_categories` is the upper limit of output categories
+            for all input features.
+        - If array-like, then each item in `max_categories` is the upper limit
+            of output categories for the corresponding input feature.
+        - If dict, then its keys should be the feature names occurring in
+            `feature_names_in_` and the corresponding values should be the
+            upper limits of output categories.
+        - If `None`, then there is no limit to the number of output categories.
 
         `max_categories` do **not** take into account missing or unknown
         categories. Setting `unknown_value` or `encoded_missing_value` to an
@@ -1444,6 +1449,7 @@ class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder):
         "unknown_value": [Integral, type(np.nan), None],
         "max_categories": [
             Interval(Integral, 1, None, closed="left"),
+            "array-like",
             dict,
             None,
         ],
@@ -1473,83 +1479,93 @@ def __init__(
         self.min_frequency = min_frequency
         self.max_categories = max_categories
 
-    def _validate_max_categories_dict(self):
+    def _check_max_categories(self):
         """
-        This functions validates max_categories when it is a dictionary.
+        Check the max_categories and convert to the corresponding array.
         """
-        if len(self.max_categories) == 0:
-            raise ValueError("max_categories dictionary must be non-empty.")
-
-        for feature_name in self.max_categories.keys():
-            if not isinstance(feature_name, str):
-                raise TypeError(
-                    "feature in max_categories dictionary "
-                    "must be a string, "
-                    f"got {type(feature_name).__name__} for {feature_name}."
+        max_categories = self.max_categories
+        if max_categories is None or isinstance(max_categories, Integral):
+            max_categories = np.full(
+                shape=self.n_features_in_,
+                fill_value=max_categories,
+            )
+        elif isinstance(max_categories, dict):
+            max_categories = np.full(
+                shape=self.n_features_in_,
+                fill_value=None,
+                dtype=object,
+            )
+            if not hasattr(self, "feature_names_in_"):
+                raise ValueError(
+                    f"{self.__class__.__name__} was not fitted on data "
+                    "with feature names. Pass max_categories as an integer "
+                    "array instead."
                 )
-
-        feature_names = set(_check_feature_names_in(self))
-        if self.max_categories.keys() > feature_names:
-            excess_feature_names = ", ".join(
-                sorted(self.max_categories.keys() - feature_names)
+            unexpected_feature_names = list(
+                set(self.max_categories) - set(self.feature_names_in_)
             )
-            raise ValueError(
-                "features in max_categories dictionary "
-                "must be a valid feature name, "
-                f"got {excess_feature_names}."
+            unexpected_feature_names.sort()  # deterministic error message
+            n_unexpeced = len(unexpected_feature_names)
+            if unexpected_feature_names:
+                if len(unexpected_feature_names) > 5:
+                    unexpected_feature_names = unexpected_feature_names[:5]
+                    unexpected_feature_names.append("...")
+                raise ValueError(
+                    f"max_categories contains {n_unexpeced} unexpected feature "
+                    f"names: {unexpected_feature_names}."
+                )
+            for feature_idx, feature_name in enumerate(self.feature_names_in_):
+                if feature_name in self.max_categories:
+                    max_count = self.max_categories[feature_name]
+                    if not (isinstance(max_count, Integral) and max_count >= 1):
+                        raise ValueError(
+                            f"max_categories['{feature_name}'] must be an "
+                            f"integer at least 1. Got {max_count!r}."
+                        )
+                    max_categories[feature_idx] = max_count
+        else:
+            unexpected_max_counts = set(
+                max_count
+                for max_count in max_categories
+                if not (
+                    max_count is None
+                    or (isinstance(max_count, Integral) and max_count >= 1)
+                )
             )
-
-        for max_count in self.max_categories.values():
-            if not isinstance(max_count, Integral):
-                raise TypeError(
-                    "value in max_categories dictionary "
-                    "must be an integer, "
-                    f"got {type(max_count).__name__} for {max_count}."
+            if len(unexpected_max_counts):
+                raise ValueError(
+                    "max_categories must be an array-like of None or integers "
+                    "at least 1. Observed "
+                    f"values: {list(unexpected_max_counts)}."
                 )
-            if max_count < 1:
+
+            max_categories = np.asarray(max_categories)
+            if max_categories.shape[0] != self.n_features_in_:
                 raise ValueError(
-                    "value in max_categories dictionary "
-                    "must be at least 1, "
-                    f"got {max_count}."
+                    f"max_categories has shape {self.max_categories.shape} but the"
+                    f" input data X has {self.n_features_in_} features."
                 )
+        self.max_categories = max_categories
 
-    def _check_max_categories(self):
+    def _max_categories_enable_infrequent(self):
         """
         This function checks whether the value of max_categories
         enables infrequent categories.
         """
-        max_categories = getattr(self, "max_categories", None)
-
-        if max_categories is None:
-            return False
-        elif isinstance(max_categories, Integral):
-            return max_categories >= 1
-        else:
-            return all(max_count >= 1 for max_count in max_categories.values())
+        return all(self.max_categories)
 
     def _has_infrequent_categories(self, n_current_features, col_idx):
         """
         This function checks if there are any infrequent categories.
         """
-        if self.max_categories is None:
-            return False
-        elif isinstance(self.max_categories, Integral):
-            return self.max_categories < n_current_features
-        else:
-            feature_name = _check_feature_names_in(self)[col_idx]
-            return (feature_name in self.max_categories) and self.max_categories[
-                feature_name
-            ] < n_current_features
+        max_count = self.max_categories[col_idx]
+        return max_count is not None and max_count < n_current_features
 
     def _get_frequent_category_count(self, col_idx):
         """
         This functions computes the number of frequent categories.
         """
-        if isinstance(self.max_categories, Integral):
-            return self.max_categories - 1
-        else:
-            feature_name = _check_feature_names_in(self)[col_idx]
-            return self.max_categories[feature_name] - 1
+        return self.max_categories[col_idx] - 1
 
     def fit(self, X, y=None):
         """
@@ -1593,10 +1609,9 @@ def fit(self, X, y=None):
                 f"got {self.unknown_value}."
             )
 
-        if isinstance(self.max_categories, dict):
-            self._check_n_features(X, reset=True)
-            self._check_feature_names(X, reset=True)
-            self._validate_max_categories_dict()
+        self._check_n_features(X, reset=True)
+        self._check_feature_names(X, reset=True)
+        self._check_max_categories()
 
         # `_fit` will only raise an error when `self.handle_unknown="error"`
         fit_results = self._fit(
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 3f4de65fb001e..107872aa8d769 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -2095,7 +2095,7 @@ def test_ordinal_encoder_infrequent_three_levels(kwargs):
     "kwargs, expected_infrequent_categories, expected_trans, expected_inverse",
     [
         (
-            {"max_categories": {"x0": 3, "x1": 3, "x2": 3}},
+            {"max_categories": [3, 3, 3]},
             [["a", "d"], ["e", "h"], ["i", "l"]],
             [[2] * 3, [0] * 3, [1] * 3, [2] * 3, [-1] * 3],
             [
@@ -2107,7 +2107,7 @@ def test_ordinal_encoder_infrequent_three_levels(kwargs):
             ],
         ),
         (
-            {"max_categories": {"x0": 3}},
+            {"max_categories": [3, None, None]},
             [["a", "d"], None, None],
             [[2, 0, 0], [0, 1, 1], [1, 2, 2], [2, 3, 3], [-1] * 3],
             [
@@ -2119,7 +2119,7 @@ def test_ordinal_encoder_infrequent_three_levels(kwargs):
             ],
         ),
         (
-            {"max_categories": {"x0": 3, "x1": 2, "x2": 1}},
+            {"max_categories": [3, 2, 1]},
             [["a", "d"], ["e", "g", "h"], ["i", "j", "k", "l"]],
             [[2, 1, 0], [0, 0, 0], [1, 1, 0], [2, 1, 0], [-1] * 3],
             [
@@ -2237,6 +2237,11 @@ def test_ordinal_encoder_infrequent_mixed():
             [["a", "b"], [0, 3, 12], ["bird", "snake"]],
             [[2, 2, 0], [2, 2, 2], [1, 1, 2], [0, 0, 1]],
         ),
+        (
+            {"max_categories": [3, None, None]},
+            [["a", "b"], None, None],
+            [[2, 4, 1], [2, 0, 3], [1, 3, 0], [0, 2, 2]],
+        ),
         (
             {"max_categories": {"str": 3}},
             [["a", "b"], None, None],

From b06a0d52a4e1958358be1e59c20803356be32331 Mon Sep 17 00:00:00 2001
From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com>
Date: Wed, 26 Apr 2023 23:53:54 -0700
Subject: [PATCH 04/16] Fix errors pertaining to checking for infrequent
 categories

---
 sklearn/preprocessing/_encoders.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index d0ce34739770f..94ee963c0bb4e 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -250,21 +250,22 @@ def infrequent_categories_(self):
             for category, indices in zip(self.categories_, infrequent_indices)
         ]
 
-    def _max_categories_enable_infrequent(self):
+    def _max_categories_enable_infrequent(self, max_categories):
         """
         This function checks whether the value of max_categories
         enables infrequent categories.
         """
-        return self.max_categories is not None and self.max_categories >= 1
+        return max_categories is not None and max_categories >= 1
 
     def _check_infrequent_enabled(self):
         """
         This functions checks whether _infrequent_enabled is True or False.
         This has to be called after parameter validation in the fit function.
         """
+        max_categories = getattr(self, "max_categories", None)
         min_frequency = getattr(self, "min_frequency", None)
         self._infrequent_enabled = (
-            self._max_categories_enable_infrequent()
+            self._max_categories_enable_infrequent(max_categories)
         ) or min_frequency is not None
 
     def _has_infrequent_categories(self, n_current_features, col_idx):
@@ -1547,12 +1548,12 @@ def _check_max_categories(self):
                 )
         self.max_categories = max_categories
 
-    def _max_categories_enable_infrequent(self):
+    def _max_categories_enable_infrequent(self, max_categories):
         """
         This function checks whether the value of max_categories
         enables infrequent categories.
         """
-        return all(self.max_categories)
+        return any(max_categories)
 
     def _has_infrequent_categories(self, n_current_features, col_idx):
         """

From be5242ae35b84bc56d0ee730389c804b08a1b384 Mon Sep 17 00:00:00 2001
From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com>
Date: Thu, 27 Apr 2023 22:06:07 -0700
Subject: [PATCH 05/16] Only check max_categories in OrdinalEncoder when it is
 an array-like or a dict

---
 sklearn/preprocessing/_encoders.py | 38 ++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 94ee963c0bb4e..b5bc1d31b553d 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -13,6 +13,7 @@
 from ..utils import check_array, is_scalar_nan, _safe_indexing
 from ..utils.validation import check_is_fitted
 from ..utils.validation import _check_feature_names_in
+from ..utils.validation import _is_arraylike_not_scalar
 from ..utils._param_validation import Interval, StrOptions, Hidden
 from ..utils._param_validation import RealNotInt
 from ..utils._mask import _get_mask
@@ -1485,12 +1486,7 @@ def _check_max_categories(self):
         Check the max_categories and convert to the corresponding array.
         """
         max_categories = self.max_categories
-        if max_categories is None or isinstance(max_categories, Integral):
-            max_categories = np.full(
-                shape=self.n_features_in_,
-                fill_value=max_categories,
-            )
-        elif isinstance(max_categories, dict):
+        if isinstance(max_categories, dict):
             max_categories = np.full(
                 shape=self.n_features_in_,
                 fill_value=None,
@@ -1553,20 +1549,33 @@ def _max_categories_enable_infrequent(self, max_categories):
         This function checks whether the value of max_categories
         enables infrequent categories.
         """
-        return any(max_categories)
+        if max_categories is None:
+            return False
+        elif isinstance(max_categories, Integral):
+            return max_categories >= 1
+        else:
+            return any(max_categories)
 
     def _has_infrequent_categories(self, n_current_features, col_idx):
         """
         This function checks if there are any infrequent categories.
         """
-        max_count = self.max_categories[col_idx]
-        return max_count is not None and max_count < n_current_features
+        if self.max_categories is None:
+            return False
+        if isinstance(self.max_categories, Integral):
+            return self.max_categories < n_current_features
+        else:
+            max_count = self.max_categories[col_idx]
+            return max_count is not None and max_count < n_current_features
 
     def _get_frequent_category_count(self, col_idx):
         """
         This functions computes the number of frequent categories.
         """
-        return self.max_categories[col_idx] - 1
+        if isinstance(self.max_categories, Integral):
+            return self.max_categories - 1
+        else:
+            return self.max_categories[col_idx] - 1
 
     def fit(self, X, y=None):
         """
@@ -1610,9 +1619,12 @@ def fit(self, X, y=None):
                 f"got {self.unknown_value}."
             )
 
-        self._check_n_features(X, reset=True)
-        self._check_feature_names(X, reset=True)
-        self._check_max_categories()
+        if isinstance(self.max_categories, dict) or _is_arraylike_not_scalar(
+            self.max_categories
+        ):
+            self._check_n_features(X, reset=True)
+            self._check_feature_names(X, reset=True)
+            self._check_max_categories()
 
         # `_fit` will only raise an error when `self.handle_unknown="error"`
         fit_results = self._fit(

From f43456e1551703e95ce4c60ca9e1169f2c186283 Mon Sep 17 00:00:00 2001
From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com>
Date: Thu, 27 Apr 2023 22:15:10 -0700
Subject: [PATCH 06/16] Update changelog

---
 doc/whats_new/v1.3.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index bb245aa466152..c8a0bfe5f3191 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -483,6 +483,12 @@ Changelog
   The `sample_interval_` attribute is deprecated and will be removed in 1.5.
   :pr:`25190` by :user:`Vincent Maladière <Vincent-Maladiere>`.
 
+- |Enhancement| Added support for passing `max_categories` as `array-like` or
+  `dict` in :class:`preprocessing.OrdinalEncoder`. This allows specifying the
+  maximum number of output categories for each input feature instead of being
+  restricted to setting a global maximum number of output categories.
+  :pr:`26284` by :user:`Andrew Wang <Andrew-Wang-IB45>`.
+
 :mod:`sklearn.tree`
 ...................
 

From d5586874d4f8c0c6d11cc224639b333ce79f0433 Mon Sep 17 00:00:00 2001
From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com>
Date: Sat, 29 Apr 2023 00:02:32 -0700
Subject: [PATCH 07/16] Improve ordering of checking max_categories and add
 tests for OrdinalEncoder

---
 sklearn/preprocessing/_encoders.py           |  13 +-
 sklearn/preprocessing/tests/test_encoders.py | 123 +++++++++++++++++++
 2 files changed, 130 insertions(+), 6 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index b5bc1d31b553d..52b88c7f3ba26 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -1521,6 +1521,13 @@ def _check_max_categories(self):
                         )
                     max_categories[feature_idx] = max_count
         else:
+            max_categories = np.asarray(max_categories)
+            if max_categories.shape[0] != self.n_features_in_:
+                raise ValueError(
+                    f"max_categories has shape {max_categories.shape} but the "
+                    f"input data X has {self.n_features_in_} features."
+                )
+
             unexpected_max_counts = set(
                 max_count
                 for max_count in max_categories
@@ -1536,12 +1543,6 @@ def _check_max_categories(self):
                     f"values: {list(unexpected_max_counts)}."
                 )
 
-            max_categories = np.asarray(max_categories)
-            if max_categories.shape[0] != self.n_features_in_:
-                raise ValueError(
-                    f"max_categories has shape {self.max_categories.shape} but the"
-                    f" input data X has {self.n_features_in_} features."
-                )
         self.max_categories = max_categories
 
     def _max_categories_enable_infrequent(self, max_categories):
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 107872aa8d769..7d0f1ed8bd3dc 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -2412,3 +2412,126 @@ def test_ordinal_encoder_missing_appears_infrequent():
     )
     X_trans = ordinal.transform(X_test)
     assert_allclose(X_trans, [[2, 1], [2, 0], [np.nan, 0], [1, 0], [0, 1]])
+
+
+def test_ordinal_encoder_missing_feature_names():
+    """Check behavior when max_categories specifies features on a dataset without
+    feature names."""
+    X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T
+    ordinal = OrdinalEncoder(max_categories={"x0": 3})
+
+    with pytest.raises(
+        ValueError,
+        match=(
+            "OrdinalEncoder was not fitted on data with feature names. Pass"
+            " max_categories as an integer array instead."
+        ),
+    ):
+        ordinal.fit(X)
+
+
+def test_ordinal_encoder_unexpected_feature_names():
+    """Check behavior when max_categories specifies features that are not present on a
+    dataset."""
+    pd = pytest.importorskip("pandas")
+    categorical_dtype = pd.CategoricalDtype(["bird", "cat", "dog", "snake"])
+    X = pd.DataFrame(
+        {
+            "str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"],
+            "int": [5, 3, 0, 10, 10, 12, 0, 3, 5],
+            "categorical": pd.Series(
+                ["dog"] * 4 + ["cat"] * 3 + ["snake"] + ["bird"],
+                dtype=categorical_dtype,
+            ),
+        },
+        columns=["str", "int", "categorical"],
+    )
+
+    max_categories = {"str": 3, "int": 2, "categorical": 1, "unexpected": 13}
+    ordinal = OrdinalEncoder(max_categories=max_categories)
+
+    msg = re.escape(
+        "max_categories contains 1 unexpected feature names: ['unexpected']."
+    )
+
+    with pytest.raises(ValueError, match=msg):
+        ordinal.fit(X)
+
+
+@pytest.mark.parametrize(
+    "max_categories, incorrect_feature, incorrect_value",
+    [
+        ({"str": 3, "int": None, "categorical": 1}, "int", None),
+        ({"str": 3, "int": 2, "categorical": 0}, "categorical", 0),
+    ],
+)
+def test_ordinal_encoder_max_categories_dict_invalid_types(
+    max_categories, incorrect_feature, incorrect_value
+):
+    """Check behavior when max_categories as a dictionary contains values that are
+    invalid."""
+    pd = pytest.importorskip("pandas")
+    categorical_dtype = pd.CategoricalDtype(["bird", "cat", "dog", "snake"])
+    X = pd.DataFrame(
+        {
+            "str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"],
+            "int": [5, 3, 0, 10, 10, 12, 0, 3, 5],
+            "categorical": pd.Series(
+                ["dog"] * 4 + ["cat"] * 3 + ["snake"] + ["bird"],
+                dtype=categorical_dtype,
+            ),
+        },
+        columns=["str", "int", "categorical"],
+    )
+
+    ordinal = OrdinalEncoder(max_categories=max_categories)
+
+    msg = re.escape(
+        f"max_categories['{incorrect_feature}'] must be an integer at least 1. "
+        f"Got {incorrect_value}."
+    )
+
+    with pytest.raises(ValueError, match=msg):
+        ordinal.fit(X)
+
+
+@pytest.mark.parametrize("max_categories", [[], [3, 2], [[3, 2, 1]]])
+def test_ordinal_encoder_max_categories_array_like_invalid_shape(max_categories):
+    """Check behavior when max_categories as an array_like has an invalid shape."""
+    X = (
+        [["a", "e", "i"]] * 5
+        + [["b", "f", "j"]] * 20
+        + [["c", "g", "k"]] * 10
+        + [["d", "h", "l"]] * 3
+    )
+
+    ordinal = OrdinalEncoder(max_categories=max_categories)
+
+    msg = re.escape(
+        f"max_categories has shape {np.asarray(max_categories).shape} but the "
+        "input data X has 3 features."
+    )
+
+    with pytest.raises(ValueError, match=msg):
+        ordinal.fit(X)
+
+
+def test_ordinal_encoder_max_categories_array_like_invalid_types():
+    """Check behavior when max_categories as an array_like contains values that
+    are invalid."""
+    X = (
+        [["a", "e", "i"]] * 5
+        + [["b", "f", "j"]] * 20
+        + [["c", "g", "k"]] * 10
+        + [["d", "h", "l"]] * 3
+    )
+
+    ordinal = OrdinalEncoder(max_categories=[3, None, 0])
+
+    msg = re.escape(
+        "max_categories must be an array-like of None or integers at least 1. "
+        "Observed values: [0]."
+    )
+
+    with pytest.raises(ValueError, match=msg):
+        ordinal.fit(X)

From 77e95675f470d3b35acaa4caeb518d9ef7b768a5 Mon Sep 17 00:00:00 2001
From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com>
Date: Tue, 16 May 2023 23:39:58 -0700
Subject: [PATCH 08/16] Add _max_categories_per_feature attribute to
 BaseEncoder and remove overrides in OrdinalEncoder

---
 sklearn/preprocessing/_encoders.py | 209 +++++++++++------------------
 1 file changed, 81 insertions(+), 128 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 52b88c7f3ba26..115af972aaaba 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -13,10 +13,10 @@
 from ..utils import check_array, is_scalar_nan, _safe_indexing
 from ..utils.validation import check_is_fitted
 from ..utils.validation import _check_feature_names_in
-from ..utils.validation import _is_arraylike_not_scalar
 from ..utils._param_validation import Interval, StrOptions, Hidden
 from ..utils._param_validation import RealNotInt
 from ..utils._mask import _get_mask
+from ..utils.validation import _is_arraylike_not_scalar
 
 from ..utils._encode import _encode, _check_unknown, _unique, _get_counts
 
@@ -76,9 +76,9 @@ def _fit(
         return_counts=False,
         return_and_ignore_missing_for_infrequent=False,
     ):
-        self._check_infrequent_enabled()
         self._check_n_features(X, reset=True)
         self._check_feature_names(X, reset=True)
+        self._check_infrequent_enabled()
         X_list, n_samples, n_features = self._check_X(
             X, force_all_finite=force_all_finite
         )
@@ -251,38 +251,90 @@ def infrequent_categories_(self):
             for category, indices in zip(self.categories_, infrequent_indices)
         ]
 
-    def _max_categories_enable_infrequent(self, max_categories):
+    def _validate_max_categories(self):
         """
-        This function checks whether the value of max_categories
-        enables infrequent categories.
+        Check max_categories and return the corresponding array.
         """
-        return max_categories is not None and max_categories >= 1
+        max_categories = getattr(self, "max_categories", None)
+
+        if isinstance(max_categories, Integral) and max_categories >= 1:
+            return [max_categories] * self.n_features_in_
+
+        elif isinstance(max_categories, dict):
+            if not hasattr(self, "feature_names_in_"):
+                raise ValueError(
+                    f"{self.__class__.__name__} was not fitted on data "
+                    "with feature names. Pass max_categories as an integer "
+                    "array instead."
+                )
+
+            unexpected_feature_names = list(
+                set(self.max_categories) - set(self.feature_names_in_)
+            )
+            unexpected_feature_names.sort()  # deterministic error message
+            n_unexpeced = len(unexpected_feature_names)
+            if unexpected_feature_names:
+                if len(unexpected_feature_names) > 5:
+                    unexpected_feature_names = unexpected_feature_names[:5]
+                    unexpected_feature_names.append("...")
+                raise ValueError(
+                    f"max_categories contains {n_unexpeced} unexpected feature "
+                    f"names: {unexpected_feature_names}."
+                )
+
+            max_categories_array = [None] * self.n_features_in_
+            for feature_idx, feature_name in enumerate(self.feature_names_in_):
+                if feature_name in max_categories:
+                    max_count = max_categories[feature_name]
+                    if not (isinstance(max_count, Integral) and max_count >= 1):
+                        raise ValueError(
+                            f"max_categories['{feature_name}'] must be an "
+                            f"integer at least 1. Got {max_count!r}."
+                        )
+                    max_categories_array[feature_idx] = max_count
+            return max_categories_array if any(max_categories_array) else None
+
+        elif _is_arraylike_not_scalar(max_categories):
+            max_categories = np.asarray(max_categories)
+            if (
+                max_categories.ndim != 1
+                or max_categories.shape[0] != self.n_features_in_
+            ):
+                raise ValueError(
+                    f"max_categories has shape {max_categories.shape} but the "
+                    f"input data X has {self.n_features_in_} features."
+                )
+
+            unexpected_max_counts = set(
+                max_count
+                for max_count in max_categories
+                if not (
+                    max_count is None
+                    or (isinstance(max_count, Integral) and max_count >= 1)
+                )
+            )
+            if len(unexpected_max_counts):
+                raise ValueError(
+                    "max_categories must be an array-like of None or integers "
+                    "at least 1. Observed "
+                    f"values: {list(unexpected_max_counts)}."
+                )
+            return max_categories if any(max_categories) else None
+
+        else:
+            return None
 
     def _check_infrequent_enabled(self):
         """
         This functions checks whether _infrequent_enabled is True or False.
         This has to be called after parameter validation in the fit function.
         """
-        max_categories = getattr(self, "max_categories", None)
+        self._max_categories_per_feature = self._validate_max_categories()
         min_frequency = getattr(self, "min_frequency", None)
         self._infrequent_enabled = (
-            self._max_categories_enable_infrequent(max_categories)
+            self._max_categories_per_feature is not None
         ) or min_frequency is not None
 
-    def _has_infrequent_categories(self, n_current_features, col_idx):
-        """
-        This function checks if there are any infrequent categories.
-        """
-        return (
-            self.max_categories is not None and self.max_categories < n_current_features
-        )
-
-    def _get_frequent_category_count(self, col_idx):
-        """
-        This functions computes the number of frequent categories.
-        """
-        return self.max_categories - 1
-
     def _identify_infrequent(self, category_count, n_samples, col_idx):
         """Compute the infrequent indices.
 
@@ -312,9 +364,14 @@ def _identify_infrequent(self, category_count, n_samples, col_idx):
             infrequent_mask = np.zeros(category_count.shape[0], dtype=bool)
 
         n_current_features = category_count.size - infrequent_mask.sum() + 1
-        if self._has_infrequent_categories(n_current_features, col_idx):
+        if self._max_categories_per_feature is not None:
+            max_categories = self._max_categories_per_feature[col_idx]
+        else:
+            max_categories = None
+
+        if max_categories is not None and max_categories < n_current_features:
             # max_categories includes the one infrequent category
-            frequent_category_count = self._get_frequent_category_count(col_idx)
+            frequent_category_count = max_categories - 1
             if frequent_category_count == 0:
                 # All categories are infrequent
                 infrequent_mask[:] = True
@@ -1481,103 +1538,6 @@ def __init__(
         self.min_frequency = min_frequency
         self.max_categories = max_categories
 
-    def _check_max_categories(self):
-        """
-        Check the max_categories and convert to the corresponding array.
-        """
-        max_categories = self.max_categories
-        if isinstance(max_categories, dict):
-            max_categories = np.full(
-                shape=self.n_features_in_,
-                fill_value=None,
-                dtype=object,
-            )
-            if not hasattr(self, "feature_names_in_"):
-                raise ValueError(
-                    f"{self.__class__.__name__} was not fitted on data "
-                    "with feature names. Pass max_categories as an integer "
-                    "array instead."
-                )
-            unexpected_feature_names = list(
-                set(self.max_categories) - set(self.feature_names_in_)
-            )
-            unexpected_feature_names.sort()  # deterministic error message
-            n_unexpeced = len(unexpected_feature_names)
-            if unexpected_feature_names:
-                if len(unexpected_feature_names) > 5:
-                    unexpected_feature_names = unexpected_feature_names[:5]
-                    unexpected_feature_names.append("...")
-                raise ValueError(
-                    f"max_categories contains {n_unexpeced} unexpected feature "
-                    f"names: {unexpected_feature_names}."
-                )
-            for feature_idx, feature_name in enumerate(self.feature_names_in_):
-                if feature_name in self.max_categories:
-                    max_count = self.max_categories[feature_name]
-                    if not (isinstance(max_count, Integral) and max_count >= 1):
-                        raise ValueError(
-                            f"max_categories['{feature_name}'] must be an "
-                            f"integer at least 1. Got {max_count!r}."
-                        )
-                    max_categories[feature_idx] = max_count
-        else:
-            max_categories = np.asarray(max_categories)
-            if max_categories.shape[0] != self.n_features_in_:
-                raise ValueError(
-                    f"max_categories has shape {max_categories.shape} but the "
-                    f"input data X has {self.n_features_in_} features."
-                )
-
-            unexpected_max_counts = set(
-                max_count
-                for max_count in max_categories
-                if not (
-                    max_count is None
-                    or (isinstance(max_count, Integral) and max_count >= 1)
-                )
-            )
-            if len(unexpected_max_counts):
-                raise ValueError(
-                    "max_categories must be an array-like of None or integers "
-                    "at least 1. Observed "
-                    f"values: {list(unexpected_max_counts)}."
-                )
-
-        self.max_categories = max_categories
-
-    def _max_categories_enable_infrequent(self, max_categories):
-        """
-        This function checks whether the value of max_categories
-        enables infrequent categories.
-        """
-        if max_categories is None:
-            return False
-        elif isinstance(max_categories, Integral):
-            return max_categories >= 1
-        else:
-            return any(max_categories)
-
-    def _has_infrequent_categories(self, n_current_features, col_idx):
-        """
-        This function checks if there are any infrequent categories.
-        """
-        if self.max_categories is None:
-            return False
-        if isinstance(self.max_categories, Integral):
-            return self.max_categories < n_current_features
-        else:
-            max_count = self.max_categories[col_idx]
-            return max_count is not None and max_count < n_current_features
-
-    def _get_frequent_category_count(self, col_idx):
-        """
-        This functions computes the number of frequent categories.
-        """
-        if isinstance(self.max_categories, Integral):
-            return self.max_categories - 1
-        else:
-            return self.max_categories[col_idx] - 1
-
     def fit(self, X, y=None):
         """
         Fit the OrdinalEncoder to X.
@@ -1620,13 +1580,6 @@ def fit(self, X, y=None):
                 f"got {self.unknown_value}."
             )
 
-        if isinstance(self.max_categories, dict) or _is_arraylike_not_scalar(
-            self.max_categories
-        ):
-            self._check_n_features(X, reset=True)
-            self._check_feature_names(X, reset=True)
-            self._check_max_categories()
-
         # `_fit` will only raise an error when `self.handle_unknown="error"`
         fit_results = self._fit(
             X,

From 20c4489c5a133def0a686ce215e83dfdd3c2d1c1 Mon Sep 17 00:00:00 2001
From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com>
Date: Tue, 16 May 2023 23:40:29 -0700
Subject: [PATCH 09/16] Update tests

---
 sklearn/preprocessing/tests/test_encoders.py | 37 ++++++++++++++++++--
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 7d0f1ed8bd3dc..b7ec54a8c9059 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -2430,7 +2430,38 @@ def test_ordinal_encoder_missing_feature_names():
         ordinal.fit(X)
 
 
-def test_ordinal_encoder_unexpected_feature_names():
+@pytest.mark.parametrize(
+    "max_categories, unexpected_features, n_unexpected_features",
+    [
+        ({"str": 3, "int": 2, "categorical": 1, "unexpected": 13}, ["unexpected"], 1),
+        (
+            {
+                "str": 3,
+                "int": 2,
+                "categorical": 1,
+                "unexpected": 13,
+                "unexpected_2": 13,
+                "unexpected_3": 13,
+                "unexpected_3": 13,
+                "unexpected_4": 13,
+                "unexpected_5": 13,
+                "unexpected_6": 13,
+            },
+            [
+                "unexpected",
+                "unexpected_2",
+                "unexpected_3",
+                "unexpected_4",
+                "unexpected_5",
+                "...",
+            ],
+            6,
+        ),
+    ],
+)
+def test_ordinal_encoder_unexpected_feature_names(
+    max_categories, unexpected_features, n_unexpected_features
+):
     """Check behavior when max_categories specifies features that are not present on a
     dataset."""
     pd = pytest.importorskip("pandas")
@@ -2447,11 +2478,11 @@ def test_ordinal_encoder_unexpected_feature_names():
         columns=["str", "int", "categorical"],
     )
 
-    max_categories = {"str": 3, "int": 2, "categorical": 1, "unexpected": 13}
     ordinal = OrdinalEncoder(max_categories=max_categories)
 
     msg = re.escape(
-        "max_categories contains 1 unexpected feature names: ['unexpected']."
+        f"max_categories contains {n_unexpected_features} unexpected feature names:"
+        f" {unexpected_features}."
     )
 
     with pytest.raises(ValueError, match=msg):

From ea0a7fb1b8555ec98ed92a219960b983aad16169 Mon Sep 17 00:00:00 2001
From: Andrew Wang <37257276+Andrew-Wang-IB45@users.noreply.github.com>
Date: Thu, 25 May 2023 21:26:30 -0700
Subject: [PATCH 10/16] Update doc/modules/preprocessing.rst

Simplify parameter description of `max_categories`

Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
---
 doc/modules/preprocessing.rst | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 6bb4b2ee9a437..6a1c331f045b8 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -746,11 +746,10 @@ enable the gathering of infrequent categories are `min_frequency` and
    this fraction of the total number of samples will be considered infrequent.
    The default value is 1, which means every category is encoded separately.
 
-2. `max_categories` is either `None`, any integer greater or equal to 1, an
-   array-like comprised of `None` or integers greater than equal to 1 (for
-   :class:`OrdinalEncoder` only), or a dictionary mapping a feature name found
-   in `feature_names_in_` to any integer greater than or equal to 1 (for
-   :class:`OrdinalEncoder` only). This parameter sets an upper limit to the
+2. `max_categories` is either `None` or any integer greater than or equal to 1.
+   :class:`OrdinalEncoder` also supports an array-like containing `None` and
+   integers or a dictionary mapping a feature name found in `feature_names_in_`
+   to an integer. This parameter sets an upper limit to the
    number of output categories for each input feature. `max_categories`
    includes the category that combines infrequent categories.
 

From 342792442aded81225f5b350d01633c03e0b5f8e Mon Sep 17 00:00:00 2001
From: Andrew Wang <37257276+Andrew-Wang-IB45@users.noreply.github.com>
Date: Thu, 25 May 2023 21:27:28 -0700
Subject: [PATCH 11/16] Update sklearn/preprocessing/_encoders.py

Sort unexpected feature names only when there is an error

Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
---
 sklearn/preprocessing/_encoders.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 115af972aaaba..a71b6987f038b 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -271,9 +271,9 @@ def _validate_max_categories(self):
             unexpected_feature_names = list(
                 set(self.max_categories) - set(self.feature_names_in_)
             )
-            unexpected_feature_names.sort()  # deterministic error message
-            n_unexpeced = len(unexpected_feature_names)
             if unexpected_feature_names:
+                unexpected_feature_names.sort()  # deterministic error message
+                n_unexpeced = len(unexpected_feature_names)
                 if len(unexpected_feature_names) > 5:
                     unexpected_feature_names = unexpected_feature_names[:5]
                     unexpected_feature_names.append("...")

From 1fa914ae8fbb4259f3d399ebea94d0cd6a5d9fcc Mon Sep 17 00:00:00 2001
From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com>
Date: Thu, 25 May 2023 21:40:34 -0700
Subject: [PATCH 12/16] Simplify error message for array-like max_categories

---
 sklearn/preprocessing/_encoders.py           | 20 ++++++++------------
 sklearn/preprocessing/tests/test_encoders.py |  3 +--
 2 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index a71b6987f038b..1d74d06723011 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -273,12 +273,12 @@ def _validate_max_categories(self):
             )
             if unexpected_feature_names:
                 unexpected_feature_names.sort()  # deterministic error message
-                n_unexpeced = len(unexpected_feature_names)
+                n_unexpected = len(unexpected_feature_names)
                 if len(unexpected_feature_names) > 5:
                     unexpected_feature_names = unexpected_feature_names[:5]
                     unexpected_feature_names.append("...")
                 raise ValueError(
-                    f"max_categories contains {n_unexpeced} unexpected feature "
+                    f"max_categories contains {n_unexpected} unexpected feature "
                     f"names: {unexpected_feature_names}."
                 )
 
@@ -305,20 +305,16 @@ def _validate_max_categories(self):
                     f"input data X has {self.n_features_in_} features."
                 )
 
-            unexpected_max_counts = set(
-                max_count
+            if any(
+                max_count is not None
+                and not (isinstance(max_count, Integral) and max_count >= 1)
                 for max_count in max_categories
-                if not (
-                    max_count is None
-                    or (isinstance(max_count, Integral) and max_count >= 1)
-                )
-            )
-            if len(unexpected_max_counts):
+            ):
                 raise ValueError(
                     "max_categories must be an array-like of None or integers "
-                    "at least 1. Observed "
-                    f"values: {list(unexpected_max_counts)}."
+                    "at least 1."
                 )
+
             return max_categories if any(max_categories) else None
 
         else:
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index b7ec54a8c9059..cc3ccd0bf2ded 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -2560,8 +2560,7 @@ def test_ordinal_encoder_max_categories_array_like_invalid_types():
     ordinal = OrdinalEncoder(max_categories=[3, None, 0])
 
     msg = re.escape(
-        "max_categories must be an array-like of None or integers at least 1. "
-        "Observed values: [0]."
+        "max_categories must be an array-like of None or integers at least 1."
     )
 
     with pytest.raises(ValueError, match=msg):

From 6488f9bac9982488d500d0b612457f1e4cb7028d Mon Sep 17 00:00:00 2001
From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com>
Date: Tue, 13 Jun 2023 21:54:12 -0700
Subject: [PATCH 13/16] Fix indentation on changelog

---
 doc/whats_new/v1.3.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index 4cf0d688b11ee..b05b4997d0019 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -627,12 +627,12 @@ Changelog
 - |Fix| :class:`AdditiveChi2Sampler` is now stateless.
   The `sample_interval_` attribute is deprecated and will be removed in 1.5.
   :pr:`25190` by :user:`Vincent Maladière <Vincent-Maladiere>`.
-  
+
 - |Fix| :class:`preprocessing.PowerTransformer` now correcly raises error when
   using `method="box-cox"` on data with a constant `np.nan` column.
   :pr:`26400` by :user:`Yao Xiao <Charlie-XIAO>`.
-  
- - |Enhancement| Added support for passing `max_categories` as `array-like` or
+
+- |Enhancement| Added support for passing `max_categories` as `array-like` or
   `dict` in :class:`preprocessing.OrdinalEncoder`. This allows specifying the
   maximum number of output categories for each input feature instead of being
   restricted to setting a global maximum number of output categories.

From 1a6e63a8843e7e638a877be1255f7ba3feb7bbf0 Mon Sep 17 00:00:00 2001
From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com>
Date: Tue, 20 Jun 2023 18:42:20 -0700
Subject: [PATCH 14/16] Fix linting issues

---
 sklearn/preprocessing/tests/test_encoders.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index cc3ccd0bf2ded..3d22abb38f269 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -2442,7 +2442,6 @@ def test_ordinal_encoder_missing_feature_names():
                 "unexpected": 13,
                 "unexpected_2": 13,
                 "unexpected_3": 13,
-                "unexpected_3": 13,
                 "unexpected_4": 13,
                 "unexpected_5": 13,
                 "unexpected_6": 13,

From bce9206dfaa0e90ec9b6d6a4b818a3be3a66dde0 Mon Sep 17 00:00:00 2001
From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com>
Date: Wed, 21 Jun 2023 19:51:17 -0700
Subject: [PATCH 15/16] Fix linting issues

---
 sklearn/preprocessing/_encoders.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 5c497f7089d92..30cc66d04c785 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -14,7 +14,11 @@
 from ..utils._encode import _check_unknown, _encode, _get_counts, _unique
 from ..utils._mask import _get_mask
 from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions
-from ..utils.validation import _check_feature_names_in, check_is_fitted, _is_arraylike_not_scalar
+from ..utils.validation import (
+    _check_feature_names_in,
+    _is_arraylike_not_scalar,
+    check_is_fitted,
+)
 
 __all__ = ["OneHotEncoder", "OrdinalEncoder"]
 

From 92e5b0d9fbb4f7876a4793a875277e847572f12d Mon Sep 17 00:00:00 2001
From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com>
Date: Wed, 9 Aug 2023 22:26:20 -0700
Subject: [PATCH 16/16] Migrate changelog from v1.3 to v1.4

---
 doc/whats_new/v1.3.rst | 8 +-------
 doc/whats_new/v1.4.rst | 6 ++++++
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index 995144d4a3753..0f46f28cd340b 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -745,7 +745,7 @@ Changelog
 - |Fix| :class:`preprocessing.PowerTransformer` now correcly raises error when
   using `method="box-cox"` on data with a constant `np.nan` column.
   :pr:`26400` by :user:`Yao Xiao <Charlie-XIAO>`.
-  
+
 - |Fix| :class:`preprocessing.PowerTransformer` with `method="yeo-johnson"` now leaves
   constant features unchanged instead of transforming with an arbitrary value for
   the `lambdas_` fitted parameter.
@@ -755,12 +755,6 @@ Changelog
   :class:`preprocessing.KBinsDiscretizer` will change from `None` to `200_000` in
   version 1.5 when `strategy="kmeans"` or `strategy="uniform"`.
   :pr:`26424` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
-  
-- |Enhancement| Added support for passing `max_categories` as `array-like` or
-  `dict` in :class:`preprocessing.OrdinalEncoder`. This allows specifying the
-  maximum number of output categories for each input feature instead of being
-  restricted to setting a global maximum number of output categories.
-  :pr:`26284` by :user:`Andrew Wang <Andrew-Wang-IB45>`.
 
 :mod:`sklearn.svm`
 ..................
diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
index a7f2df245f105..817e3d4813093 100644
--- a/doc/whats_new/v1.4.rst
+++ b/doc/whats_new/v1.4.rst
@@ -188,6 +188,12 @@ Changelog
 - |Enhancement| :func:`sklearn.model_selection.train_test_split` now supports
   Array API compatible inputs. :pr:`26855` by `Tim Head`_.
 
+- |Enhancement| Added support for passing `max_categories` as an `array-like`
+  or a `dict` in :class:`preprocessing.OrdinalEncoder`. This allows setting the
+  maximum number of output categories separately for each input feature
+  instead of applying a single global limit to all features.
+  :pr:`26284` by :user:`Andrew Wang <Andrew-Wang-IB45>`.
+
 :mod:`sklearn.tree`
 ...................