From b4208f23e1f0240d05e7c4d033bd54a2e905fe4e Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Tue, 25 Apr 2023 14:50:25 -0700 Subject: [PATCH 01/16] Add per feature max_categories for OrdinalEncoder --- doc/modules/preprocessing.rst | 9 +- sklearn/preprocessing/_encoders.py | 124 +++++++++++++++++-- sklearn/preprocessing/tests/test_encoders.py | 116 ++++++++++++++++- 3 files changed, 232 insertions(+), 17 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 69045147d8af9..684ab410ba335 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -746,10 +746,11 @@ enable the gathering of infrequent categories are `min_frequency` and this fraction of the total number of samples will be considered infrequent. The default value is 1, which means every category is encoded separately. -2. `max_categories` is either `None` or any integer greater than 1. This - parameter sets an upper limit to the number of output features for each - input feature. `max_categories` includes the feature that combines - infrequent categories. +2. `max_categories` is either `None`, any integer greater than 1, or (for + :class:`OrdinalEncoder` only) a dictionary mapping a valid feature name to + any integer greater than 1. This parameter sets an upper limit to the number + of output features for each input feature. `max_categories` includes the + feature that combines infrequent categories. 
In the following example with :class:`OrdinalEncoder`, the categories `'dog' and 'snake'` are considered infrequent:: diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index fd9941f5336ed..cbf59aad6a810 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -250,17 +250,38 @@ def infrequent_categories_(self): for category, indices in zip(self.categories_, infrequent_indices) ] + def _check_max_categories(self): + """ + This function checks whether the value of max_categories + enables infrequent categories. + """ + max_categories = getattr(self, "max_categories", None) + return max_categories is not None and max_categories >= 1 + def _check_infrequent_enabled(self): """ This functions checks whether _infrequent_enabled is True or False. This has to be called after parameter validation in the fit function. """ - max_categories = getattr(self, "max_categories", None) min_frequency = getattr(self, "min_frequency", None) self._infrequent_enabled = ( - max_categories is not None and max_categories >= 1 + self._check_max_categories() ) or min_frequency is not None + def _has_infrequent_categories(self, n_current_features, col_idx): + """ + This function checks if there are any infrequent categories. + """ + return ( + self.max_categories is not None and self.max_categories < n_current_features + ) + + def _get_frequent_category_count(self, col_idx): + """ + This functions computes the number of frequent categories. + """ + return self.max_categories - 1 + def _identify_infrequent(self, category_count, n_samples, col_idx): """Compute the infrequent indices. 
@@ -290,9 +311,9 @@ def _identify_infrequent(self, category_count, n_samples, col_idx): infrequent_mask = np.zeros(category_count.shape[0], dtype=bool) n_current_features = category_count.size - infrequent_mask.sum() + 1 - if self.max_categories is not None and self.max_categories < n_current_features: + if self._has_infrequent_categories(n_current_features, col_idx): # max_categories includes the one infrequent category - frequent_category_count = self.max_categories - 1 + frequent_category_count = self._get_frequent_category_count(col_idx) if frequent_category_count == 0: # All categories are infrequent infrequent_mask[:] = True @@ -1294,11 +1315,13 @@ class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder): .. versionadded:: 1.3 Read more in the :ref:`User Guide `. - max_categories : int, default=None + max_categories : int or dict, default=None Specifies an upper limit to the number of output categories for each input feature when considering infrequent categories. If there are infrequent categories, `max_categories` includes the category representing the - infrequent categories along with the frequent categories. If `None`, + infrequent categories along with the frequent categories. + If `max_categories` is a dictionary, each key-value pair represents the + upper limit to the number of output categories per feature. If `None`, there is no limit to the number of output features. 
`max_categories` do **not** take into account missing or unknown @@ -1419,7 +1442,11 @@ class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder): "encoded_missing_value": [Integral, type(np.nan)], "handle_unknown": [StrOptions({"error", "use_encoded_value"})], "unknown_value": [Integral, type(np.nan), None], - "max_categories": [Interval(Integral, 1, None, closed="left"), None], + "max_categories": [ + Interval(Integral, 1, None, closed="left"), + dict, + None, + ], "min_frequency": [ Interval(Integral, 1, None, closed="left"), Interval(RealNotInt, 0, 1, closed="neither"), @@ -1446,6 +1473,84 @@ def __init__( self.min_frequency = min_frequency self.max_categories = max_categories + def _validate_max_categories_dict(self): + """ + This functions validates max_categories when it is a dictionary. + """ + if len(self.max_categories) == 0: + raise ValueError("max_categories dictionary must be non-empty.") + + for feature_name in self.max_categories.keys(): + if not isinstance(feature_name, str): + raise TypeError( + "feature in max_categories dictionary " + "must be a string, " + f"got {type(feature_name).__name__} for {feature_name}." + ) + + feature_names = set(_check_feature_names_in(self)) + if self.max_categories.keys() > feature_names: + excess_feature_names = ", ".join( + sorted(self.max_categories.keys() - feature_names) + ) + raise ValueError( + "features in max_categories dictionary " + "must be a valid feature name, " + f"got {excess_feature_names}." + ) + + for max_count in self.max_categories.values(): + if not isinstance(max_count, Integral): + raise TypeError( + "value in max_categories dictionary " + "must be an integer, " + f"got {type(max_count).__name__} for {max_count}." + ) + if max_count < 1: + raise ValueError( + "value in max_categories dictionary " + "must be at least 1, " + f"got {max_count}." + ) + + def _check_max_categories(self): + """ + This function checks whether the value of max_categories + enables infrequent categories. 
+ """ + max_categories = getattr(self, "max_categories", None) + + if max_categories is None: + return False + elif isinstance(max_categories, Integral): + return max_categories >= 1 + else: + return all(max_count >= 1 for max_count in max_categories.values()) + + def _has_infrequent_categories(self, n_current_features, col_idx): + """ + This function checks if there are any infrequent categories. + """ + if self.max_categories is None: + return False + elif isinstance(self.max_categories, Integral): + return self.max_categories < n_current_features + else: + feature_name = _check_feature_names_in(self)[col_idx] + return ( + feature_name in self.max_categories + ) and self.max_categories[feature_name] < n_current_features + + def _get_frequent_category_count(self, col_idx): + """ + This functions computes the number of frequent categories. + """ + if isinstance(self.max_categories, Integral): + return self.max_categories - 1 + else: + feature_name = _check_feature_names_in(self)[col_idx] + return self.max_categories[feature_name] - 1 + def fit(self, X, y=None): """ Fit the OrdinalEncoder to X. @@ -1488,6 +1593,11 @@ def fit(self, X, y=None): f"got {self.unknown_value}." 
) + if isinstance(self.max_categories, dict): + self._check_n_features(X, reset=True) + self._check_feature_names(X, reset=True) + self._validate_max_categories_dict() + # `_fit` will only raise an error when `self.handle_unknown="error"` fit_results = self._fit( X, diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 42c66980bfeba..3f4de65fb001e 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -2091,6 +2091,87 @@ def test_ordinal_encoder_infrequent_three_levels(kwargs): assert_array_equal(X_inverse, expected_inverse) +@pytest.mark.parametrize( + "kwargs, expected_infrequent_categories, expected_trans, expected_inverse", + [ + ( + {"max_categories": {"x0": 3, "x1": 3, "x2": 3}}, + [["a", "d"], ["e", "h"], ["i", "l"]], + [[2] * 3, [0] * 3, [1] * 3, [2] * 3, [-1] * 3], + [ + ["infrequent_sklearn"] * 3, + ["b", "f", "j"], + ["c", "g", "k"], + ["infrequent_sklearn"] * 3, + [None] * 3, + ], + ), + ( + {"max_categories": {"x0": 3}}, + [["a", "d"], None, None], + [[2, 0, 0], [0, 1, 1], [1, 2, 2], [2, 3, 3], [-1] * 3], + [ + ["infrequent_sklearn", "e", "i"], + ["b", "f", "j"], + ["c", "g", "k"], + ["infrequent_sklearn", "h", "l"], + [None] * 3, + ], + ), + ( + {"max_categories": {"x0": 3, "x1": 2, "x2": 1}}, + [["a", "d"], ["e", "g", "h"], ["i", "j", "k", "l"]], + [[2, 1, 0], [0, 0, 0], [1, 1, 0], [2, 1, 0], [-1] * 3], + [ + ["infrequent_sklearn"] * 3, + ["b", "f", "infrequent_sklearn"], + ["c", "infrequent_sklearn", "infrequent_sklearn"], + ["infrequent_sklearn"] * 3, + [None] * 3, + ], + ), + ], +) +def test_ordinal_encoder_infrequent_three_levels_multiple_features( + kwargs, expected_infrequent_categories, expected_trans, expected_inverse +): + """Test parameters for grouping multiple features into the infrequent category.""" + + X_train = ( + [["a", "e", "i"]] * 5 + + [["b", "f", "j"]] * 20 + + [["c", "g", "k"]] * 10 + + [["d", "h", "l"]] * 3 + ) + 
ordinal = OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1, **kwargs + ).fit(X_train) + assert_array_equal( + ordinal.categories_, + [["a", "b", "c", "d"], ["e", "f", "g", "h"], ["i", "j", "k", "l"]], + ) + + assert len(ordinal.infrequent_categories_) == len(expected_infrequent_categories) + for expected, actual in zip( + ordinal.infrequent_categories_, expected_infrequent_categories + ): + assert_array_equal(expected, actual) + + X_test = [ + ["a", "e", "i"], + ["b", "f", "j"], + ["c", "g", "k"], + ["d", "h", "l"], + ["x", "y", "z"], + ] + + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, expected_trans) + + X_inverse = ordinal.inverse_transform(X_trans) + assert_array_equal(X_inverse, expected_inverse) + + def test_ordinal_encoder_infrequent_three_levels_user_cats(): """Test that the order of the categories provided by a user is respected. @@ -2148,7 +2229,29 @@ def test_ordinal_encoder_infrequent_mixed(): assert_array_equal(X_inverse, expected_inverse) -def test_ordinal_encoder_infrequent_multiple_categories_dtypes(): +@pytest.mark.parametrize( + "kwargs, expected_infrequent_categories, expected_trans", + [ + ( + {"max_categories": 3}, + [["a", "b"], [0, 3, 12], ["bird", "snake"]], + [[2, 2, 0], [2, 2, 2], [1, 1, 2], [0, 0, 1]], + ), + ( + {"max_categories": {"str": 3}}, + [["a", "b"], None, None], + [[2, 4, 1], [2, 0, 3], [1, 3, 0], [0, 2, 2]], + ), + ( + {"max_categories": {"str": 3, "int": 2, "categorical": 1}}, + [["a", "b"], [0, 3, 5, 12], ["bird", "cat", "dog", "snake"]], + [[2, 1, 0], [2, 1, 0], [1, 0, 0], [0, 1, 0]], + ), + ], +) +def test_ordinal_encoder_infrequent_multiple_categories_dtypes( + kwargs, expected_infrequent_categories, expected_trans +): """Test infrequent categories with a pandas DataFrame with multiple dtypes.""" pd = pytest.importorskip("pandas") @@ -2165,7 +2268,7 @@ def test_ordinal_encoder_infrequent_multiple_categories_dtypes(): columns=["str", "int", "categorical"], ) - ordinal = 
OrdinalEncoder(max_categories=3).fit(X) + ordinal = OrdinalEncoder(**kwargs).fit(X) # X[:, 0] 'a', 'b', 'c' have the same frequency. 'a' and 'b' will be # considered infrequent because they appear first when sorted @@ -2175,9 +2278,11 @@ def test_ordinal_encoder_infrequent_multiple_categories_dtypes(): # X[:, 2] "snake" and "bird" or infrequent - assert_array_equal(ordinal.infrequent_categories_[0], ["a", "b"]) - assert_array_equal(ordinal.infrequent_categories_[1], [0, 3, 12]) - assert_array_equal(ordinal.infrequent_categories_[2], ["bird", "snake"]) + assert len(ordinal.infrequent_categories_) == len(expected_infrequent_categories) + for expected, actual in zip( + ordinal.infrequent_categories_, expected_infrequent_categories + ): + assert_array_equal(expected, actual) X_test = pd.DataFrame( { @@ -2190,7 +2295,6 @@ def test_ordinal_encoder_infrequent_multiple_categories_dtypes(): }, columns=["str", "int", "categorical"], ) - expected_trans = [[2, 2, 0], [2, 2, 2], [1, 1, 2], [0, 0, 1]] X_trans = ordinal.transform(X_test) assert_allclose(X_trans, expected_trans) From a3db2b62d7ee2a3b8bdb99d8df7cdf9a8816effb Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Wed, 26 Apr 2023 16:18:48 -0700 Subject: [PATCH 02/16] Fix formatting --- sklearn/preprocessing/_encoders.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index cbf59aad6a810..445e91a5b36e6 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -1537,9 +1537,9 @@ def _has_infrequent_categories(self, n_current_features, col_idx): return self.max_categories < n_current_features else: feature_name = _check_feature_names_in(self)[col_idx] - return ( - feature_name in self.max_categories - ) and self.max_categories[feature_name] < n_current_features + return (feature_name in self.max_categories) and self.max_categories[ + feature_name + ] 
< n_current_features def _get_frequent_category_count(self, col_idx): """ From 39e81ab022297c55cdecb9480f58e3033b7ee791 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Wed, 26 Apr 2023 23:15:50 -0700 Subject: [PATCH 03/16] Update behaviour of max_categories in OrdinalEncoder --- doc/modules/preprocessing.rst | 12 +- sklearn/preprocessing/_encoders.py | 147 ++++++++++--------- sklearn/preprocessing/tests/test_encoders.py | 11 +- 3 files changed, 96 insertions(+), 74 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 684ab410ba335..6bb4b2ee9a437 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -746,11 +746,13 @@ enable the gathering of infrequent categories are `min_frequency` and this fraction of the total number of samples will be considered infrequent. The default value is 1, which means every category is encoded separately. -2. `max_categories` is either `None`, any integer greater than 1, or (for - :class:`OrdinalEncoder` only) a dictionary mapping a valid feature name to - any integer greater than 1. This parameter sets an upper limit to the number - of output features for each input feature. `max_categories` includes the - feature that combines infrequent categories. +2. `max_categories` is either `None`, any integer greater or equal to 1, an + array-like comprised of `None` or integers greater than equal to 1 (for + :class:`OrdinalEncoder` only), or a dictionary mapping a feature name found + in `feature_names_in_` to any integer greater than or equal to 1 (for + :class:`OrdinalEncoder` only). This parameter sets an upper limit to the + number of output categories for each input feature. `max_categories` + includes the category that combines infrequent categories. 
In the following example with :class:`OrdinalEncoder`, the categories `'dog' and 'snake'` are considered infrequent:: diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 445e91a5b36e6..d0ce34739770f 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -250,13 +250,12 @@ def infrequent_categories_(self): for category, indices in zip(self.categories_, infrequent_indices) ] - def _check_max_categories(self): + def _max_categories_enable_infrequent(self): """ This function checks whether the value of max_categories enables infrequent categories. """ - max_categories = getattr(self, "max_categories", None) - return max_categories is not None and max_categories >= 1 + return self.max_categories is not None and self.max_categories >= 1 def _check_infrequent_enabled(self): """ @@ -265,7 +264,7 @@ def _check_infrequent_enabled(self): """ min_frequency = getattr(self, "min_frequency", None) self._infrequent_enabled = ( - self._check_max_categories() + self._max_categories_enable_infrequent() ) or min_frequency is not None def _has_infrequent_categories(self, n_current_features, col_idx): @@ -1315,14 +1314,20 @@ class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder): .. versionadded:: 1.3 Read more in the :ref:`User Guide `. - max_categories : int or dict, default=None + max_categories : int, array-like of int, dict of str or None, default=None Specifies an upper limit to the number of output categories for each input feature when considering infrequent categories. If there are infrequent categories, `max_categories` includes the category representing the infrequent categories along with the frequent categories. - If `max_categories` is a dictionary, each key-value pair represents the - upper limit to the number of output categories per feature. If `None`, - there is no limit to the number of output features. 
+ + - If int, then `max_categories` is the upper limit of output categories + for all input features. + - If array-like, then each item in `max_categories` is the upper limit + of output categories for the corresponding input feature. + - If dict, then its keys should be the feature names occurring in + `feature_names_in_` and the corresponding values should be the + upper limits of output categories. + - If `None`, then there is no limit to the number of output categories. `max_categories` do **not** take into account missing or unknown categories. Setting `unknown_value` or `encoded_missing_value` to an @@ -1444,6 +1449,7 @@ class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder): "unknown_value": [Integral, type(np.nan), None], "max_categories": [ Interval(Integral, 1, None, closed="left"), + "array-like", dict, None, ], @@ -1473,83 +1479,93 @@ def __init__( self.min_frequency = min_frequency self.max_categories = max_categories - def _validate_max_categories_dict(self): + def _check_max_categories(self): """ - This functions validates max_categories when it is a dictionary. + Check the max_categories and convert to the corresponding array. """ - if len(self.max_categories) == 0: - raise ValueError("max_categories dictionary must be non-empty.") - - for feature_name in self.max_categories.keys(): - if not isinstance(feature_name, str): - raise TypeError( - "feature in max_categories dictionary " - "must be a string, " - f"got {type(feature_name).__name__} for {feature_name}." + max_categories = self.max_categories + if max_categories is None or isinstance(max_categories, Integral): + max_categories = np.full( + shape=self.n_features_in_, + fill_value=max_categories, + ) + elif isinstance(max_categories, dict): + max_categories = np.full( + shape=self.n_features_in_, + fill_value=None, + dtype=object, + ) + if not hasattr(self, "feature_names_in_"): + raise ValueError( + f"{self.__class__.__name__} was not fitted on data " + "with feature names. 
Pass max_categories as an integer " + "array instead." ) - - feature_names = set(_check_feature_names_in(self)) - if self.max_categories.keys() > feature_names: - excess_feature_names = ", ".join( - sorted(self.max_categories.keys() - feature_names) + unexpected_feature_names = list( + set(self.max_categories) - set(self.feature_names_in_) ) - raise ValueError( - "features in max_categories dictionary " - "must be a valid feature name, " - f"got {excess_feature_names}." + unexpected_feature_names.sort() # deterministic error message + n_unexpeced = len(unexpected_feature_names) + if unexpected_feature_names: + if len(unexpected_feature_names) > 5: + unexpected_feature_names = unexpected_feature_names[:5] + unexpected_feature_names.append("...") + raise ValueError( + f"max_categories contains {n_unexpeced} unexpected feature " + f"names: {unexpected_feature_names}." + ) + for feature_idx, feature_name in enumerate(self.feature_names_in_): + if feature_name in self.max_categories: + max_count = self.max_categories[feature_name] + if not (isinstance(max_count, Integral) and max_count >= 1): + raise ValueError( + f"max_categories['{feature_name}'] must be an " + f"integer at least 1. Got {max_count!r}." + ) + max_categories[feature_idx] = max_count + else: + unexpected_max_counts = set( + max_count + for max_count in max_categories + if not ( + max_count is None + or (isinstance(max_count, Integral) and max_count >= 1) + ) ) - - for max_count in self.max_categories.values(): - if not isinstance(max_count, Integral): - raise TypeError( - "value in max_categories dictionary " - "must be an integer, " - f"got {type(max_count).__name__} for {max_count}." + if len(unexpected_max_counts): + raise ValueError( + "max_categories must be an array-like of None or integers " + "at least 1. Observed " + f"values: {list(unexpected_max_counts)}." 
) - if max_count < 1: + + max_categories = np.asarray(max_categories) + if max_categories.shape[0] != self.n_features_in_: raise ValueError( - "value in max_categories dictionary " - "must be at least 1, " - f"got {max_count}." + f"max_categories has shape {self.max_categories.shape} but the" + f" input data X has {self.n_features_in_} features." ) + self.max_categories = max_categories - def _check_max_categories(self): + def _max_categories_enable_infrequent(self): """ This function checks whether the value of max_categories enables infrequent categories. """ - max_categories = getattr(self, "max_categories", None) - - if max_categories is None: - return False - elif isinstance(max_categories, Integral): - return max_categories >= 1 - else: - return all(max_count >= 1 for max_count in max_categories.values()) + return all(self.max_categories) def _has_infrequent_categories(self, n_current_features, col_idx): """ This function checks if there are any infrequent categories. """ - if self.max_categories is None: - return False - elif isinstance(self.max_categories, Integral): - return self.max_categories < n_current_features - else: - feature_name = _check_feature_names_in(self)[col_idx] - return (feature_name in self.max_categories) and self.max_categories[ - feature_name - ] < n_current_features + max_count = self.max_categories[col_idx] + return max_count is not None and max_count < n_current_features def _get_frequent_category_count(self, col_idx): """ This functions computes the number of frequent categories. """ - if isinstance(self.max_categories, Integral): - return self.max_categories - 1 - else: - feature_name = _check_feature_names_in(self)[col_idx] - return self.max_categories[feature_name] - 1 + return self.max_categories[col_idx] - 1 def fit(self, X, y=None): """ @@ -1593,10 +1609,9 @@ def fit(self, X, y=None): f"got {self.unknown_value}." 
) - if isinstance(self.max_categories, dict): - self._check_n_features(X, reset=True) - self._check_feature_names(X, reset=True) - self._validate_max_categories_dict() + self._check_n_features(X, reset=True) + self._check_feature_names(X, reset=True) + self._check_max_categories() # `_fit` will only raise an error when `self.handle_unknown="error"` fit_results = self._fit( diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 3f4de65fb001e..107872aa8d769 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -2095,7 +2095,7 @@ def test_ordinal_encoder_infrequent_three_levels(kwargs): "kwargs, expected_infrequent_categories, expected_trans, expected_inverse", [ ( - {"max_categories": {"x0": 3, "x1": 3, "x2": 3}}, + {"max_categories": [3, 3, 3]}, [["a", "d"], ["e", "h"], ["i", "l"]], [[2] * 3, [0] * 3, [1] * 3, [2] * 3, [-1] * 3], [ @@ -2107,7 +2107,7 @@ def test_ordinal_encoder_infrequent_three_levels(kwargs): ], ), ( - {"max_categories": {"x0": 3}}, + {"max_categories": [3, None, None]}, [["a", "d"], None, None], [[2, 0, 0], [0, 1, 1], [1, 2, 2], [2, 3, 3], [-1] * 3], [ @@ -2119,7 +2119,7 @@ def test_ordinal_encoder_infrequent_three_levels(kwargs): ], ), ( - {"max_categories": {"x0": 3, "x1": 2, "x2": 1}}, + {"max_categories": [3, 2, 1]}, [["a", "d"], ["e", "g", "h"], ["i", "j", "k", "l"]], [[2, 1, 0], [0, 0, 0], [1, 1, 0], [2, 1, 0], [-1] * 3], [ @@ -2237,6 +2237,11 @@ def test_ordinal_encoder_infrequent_mixed(): [["a", "b"], [0, 3, 12], ["bird", "snake"]], [[2, 2, 0], [2, 2, 2], [1, 1, 2], [0, 0, 1]], ), + ( + {"max_categories": [3, None, None]}, + [["a", "b"], None, None], + [[2, 4, 1], [2, 0, 3], [1, 3, 0], [0, 2, 2]], + ), ( {"max_categories": {"str": 3}}, [["a", "b"], None, None], From b06a0d52a4e1958358be1e59c20803356be32331 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Wed, 26 Apr 2023 23:53:54 
-0700 Subject: [PATCH 04/16] Fix errors pertaining to checking for infrequent categories --- sklearn/preprocessing/_encoders.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index d0ce34739770f..94ee963c0bb4e 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -250,21 +250,22 @@ def infrequent_categories_(self): for category, indices in zip(self.categories_, infrequent_indices) ] - def _max_categories_enable_infrequent(self): + def _max_categories_enable_infrequent(self, max_categories): """ This function checks whether the value of max_categories enables infrequent categories. """ - return self.max_categories is not None and self.max_categories >= 1 + return max_categories is not None and max_categories >= 1 def _check_infrequent_enabled(self): """ This functions checks whether _infrequent_enabled is True or False. This has to be called after parameter validation in the fit function. """ + max_categories = getattr(self, "max_categories", None) min_frequency = getattr(self, "min_frequency", None) self._infrequent_enabled = ( - self._max_categories_enable_infrequent() + self._max_categories_enable_infrequent(max_categories) ) or min_frequency is not None def _has_infrequent_categories(self, n_current_features, col_idx): @@ -1547,12 +1548,12 @@ def _check_max_categories(self): ) self.max_categories = max_categories - def _max_categories_enable_infrequent(self): + def _max_categories_enable_infrequent(self, max_categories): """ This function checks whether the value of max_categories enables infrequent categories. 
""" - return all(self.max_categories) + return any(max_categories) def _has_infrequent_categories(self, n_current_features, col_idx): """ From be5242ae35b84bc56d0ee730389c804b08a1b384 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Thu, 27 Apr 2023 22:06:07 -0700 Subject: [PATCH 05/16] Only check max_categories in OrdinalEncoder when it is an array-like or a dict --- sklearn/preprocessing/_encoders.py | 38 ++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 94ee963c0bb4e..b5bc1d31b553d 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -13,6 +13,7 @@ from ..utils import check_array, is_scalar_nan, _safe_indexing from ..utils.validation import check_is_fitted from ..utils.validation import _check_feature_names_in +from ..utils.validation import _is_arraylike_not_scalar from ..utils._param_validation import Interval, StrOptions, Hidden from ..utils._param_validation import RealNotInt from ..utils._mask import _get_mask @@ -1485,12 +1486,7 @@ def _check_max_categories(self): Check the max_categories and convert to the corresponding array. """ max_categories = self.max_categories - if max_categories is None or isinstance(max_categories, Integral): - max_categories = np.full( - shape=self.n_features_in_, - fill_value=max_categories, - ) - elif isinstance(max_categories, dict): + if isinstance(max_categories, dict): max_categories = np.full( shape=self.n_features_in_, fill_value=None, @@ -1553,20 +1549,33 @@ def _max_categories_enable_infrequent(self, max_categories): This function checks whether the value of max_categories enables infrequent categories. 
""" - return any(max_categories) + if max_categories is None: + return False + elif isinstance(max_categories, Integral): + return max_categories >= 1 + else: + return any(max_categories) def _has_infrequent_categories(self, n_current_features, col_idx): """ This function checks if there are any infrequent categories. """ - max_count = self.max_categories[col_idx] - return max_count is not None and max_count < n_current_features + if self.max_categories is None: + return False + if isinstance(self.max_categories, Integral): + return self.max_categories < n_current_features + else: + max_count = self.max_categories[col_idx] + return max_count is not None and max_count < n_current_features def _get_frequent_category_count(self, col_idx): """ This functions computes the number of frequent categories. """ - return self.max_categories[col_idx] - 1 + if isinstance(self.max_categories, Integral): + return self.max_categories - 1 + else: + return self.max_categories[col_idx] - 1 def fit(self, X, y=None): """ @@ -1610,9 +1619,12 @@ def fit(self, X, y=None): f"got {self.unknown_value}." 
) - self._check_n_features(X, reset=True) - self._check_feature_names(X, reset=True) - self._check_max_categories() + if isinstance(self.max_categories, dict) or _is_arraylike_not_scalar( + self.max_categories + ): + self._check_n_features(X, reset=True) + self._check_feature_names(X, reset=True) + self._check_max_categories() # `_fit` will only raise an error when `self.handle_unknown="error"` fit_results = self._fit( From f43456e1551703e95ce4c60ca9e1169f2c186283 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Thu, 27 Apr 2023 22:15:10 -0700 Subject: [PATCH 06/16] Update changelog --- doc/whats_new/v1.3.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index bb245aa466152..c8a0bfe5f3191 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -483,6 +483,12 @@ Changelog The `sample_interval_` attribute is deprecated and will be removed in 1.5. :pr:`25190` by :user:`Vincent Maladière `. +- |Enhancement| Added support for passing `max_categories` as `array-like` or + `dict` in :class:`preprocessing.OrdinalEncoder`. This allows specifying the + maximum number of output categories for each input feature instead of being + restricted to setting a global maximum number of output categories. + :pr:`26284` by :user:`Andrew Wang `. + :mod:`sklearn.tree` ................... 
From d5586874d4f8c0c6d11cc224639b333ce79f0433 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Sat, 29 Apr 2023 00:02:32 -0700 Subject: [PATCH 07/16] Improve ordering of checking max_categories and add tests for OrdinalEncoder --- sklearn/preprocessing/_encoders.py | 13 +- sklearn/preprocessing/tests/test_encoders.py | 123 +++++++++++++++++++ 2 files changed, 130 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index b5bc1d31b553d..52b88c7f3ba26 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -1521,6 +1521,13 @@ def _check_max_categories(self): ) max_categories[feature_idx] = max_count else: + max_categories = np.asarray(max_categories) + if max_categories.shape[0] != self.n_features_in_: + raise ValueError( + f"max_categories has shape {max_categories.shape} but the " + f"input data X has {self.n_features_in_} features." + ) + unexpected_max_counts = set( max_count for max_count in max_categories @@ -1536,12 +1543,6 @@ def _check_max_categories(self): f"values: {list(unexpected_max_counts)}." ) - max_categories = np.asarray(max_categories) - if max_categories.shape[0] != self.n_features_in_: - raise ValueError( - f"max_categories has shape {self.max_categories.shape} but the" - f" input data X has {self.n_features_in_} features." 
- ) self.max_categories = max_categories def _max_categories_enable_infrequent(self, max_categories): diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 107872aa8d769..7d0f1ed8bd3dc 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -2412,3 +2412,126 @@ def test_ordinal_encoder_missing_appears_infrequent(): ) X_trans = ordinal.transform(X_test) assert_allclose(X_trans, [[2, 1], [2, 0], [np.nan, 0], [1, 0], [0, 1]]) + + +def test_ordinal_encoder_missing_feature_names(): + """Check behavior when max_categories specifies features on a dataset without + feature names.""" + X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T + ordinal = OrdinalEncoder(max_categories={"x0": 3}) + + with pytest.raises( + ValueError, + match=( + "OrdinalEncoder was not fitted on data with feature names. Pass" + " max_categories as an integer array instead." + ), + ): + ordinal.fit(X) + + +def test_ordinal_encoder_unexpected_feature_names(): + """Check behavior when max_categories specifies features that are not present on a + dataset.""" + pd = pytest.importorskip("pandas") + categorical_dtype = pd.CategoricalDtype(["bird", "cat", "dog", "snake"]) + X = pd.DataFrame( + { + "str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"], + "int": [5, 3, 0, 10, 10, 12, 0, 3, 5], + "categorical": pd.Series( + ["dog"] * 4 + ["cat"] * 3 + ["snake"] + ["bird"], + dtype=categorical_dtype, + ), + }, + columns=["str", "int", "categorical"], + ) + + max_categories = {"str": 3, "int": 2, "categorical": 1, "unexpected": 13} + ordinal = OrdinalEncoder(max_categories=max_categories) + + msg = re.escape( + "max_categories contains 1 unexpected feature names: ['unexpected']." 
+ ) + + with pytest.raises(ValueError, match=msg): + ordinal.fit(X) + + +@pytest.mark.parametrize( + "max_categories, incorrect_feature, incorrect_value", + [ + ({"str": 3, "int": None, "categorical": 1}, "int", None), + ({"str": 3, "int": 2, "categorical": 0}, "categorical", 0), + ], +) +def test_ordinal_encoder_max_categories_dict_invalid_types( + max_categories, incorrect_feature, incorrect_value +): + """Check behavior when max_categories as a dictionary contains values that are + invalid.""" + pd = pytest.importorskip("pandas") + categorical_dtype = pd.CategoricalDtype(["bird", "cat", "dog", "snake"]) + X = pd.DataFrame( + { + "str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"], + "int": [5, 3, 0, 10, 10, 12, 0, 3, 5], + "categorical": pd.Series( + ["dog"] * 4 + ["cat"] * 3 + ["snake"] + ["bird"], + dtype=categorical_dtype, + ), + }, + columns=["str", "int", "categorical"], + ) + + ordinal = OrdinalEncoder(max_categories=max_categories) + + msg = re.escape( + f"max_categories['{incorrect_feature}'] must be an integer at least 1. " + f"Got {incorrect_value}." + ) + + with pytest.raises(ValueError, match=msg): + ordinal.fit(X) + + +@pytest.mark.parametrize("max_categories", [[], [3, 2], [[3, 2, 1]]]) +def test_ordinal_encoder_max_categories_array_like_invalid_shape(max_categories): + """Check behavior when max_categories as an array_like has an invalid shape.""" + X = ( + [["a", "e", "i"]] * 5 + + [["b", "f", "j"]] * 20 + + [["c", "g", "k"]] * 10 + + [["d", "h", "l"]] * 3 + ) + + ordinal = OrdinalEncoder(max_categories=max_categories) + + msg = re.escape( + f"max_categories has shape {np.asarray(max_categories).shape} but the " + "input data X has 3 features." 
+ ) + + with pytest.raises(ValueError, match=msg): + ordinal.fit(X) + + +def test_ordinal_encoder_max_categories_array_like_invalid_types(): + """Check behavior when max_categories as an array_like contains values that + are invalid.""" + X = ( + [["a", "e", "i"]] * 5 + + [["b", "f", "j"]] * 20 + + [["c", "g", "k"]] * 10 + + [["d", "h", "l"]] * 3 + ) + + ordinal = OrdinalEncoder(max_categories=[3, None, 0]) + + msg = re.escape( + "max_categories must be an array-like of None or integers at least 1. " + "Observed values: [0]." + ) + + with pytest.raises(ValueError, match=msg): + ordinal.fit(X) From 77e95675f470d3b35acaa4caeb518d9ef7b768a5 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Tue, 16 May 2023 23:39:58 -0700 Subject: [PATCH 08/16] Add _max_categories_per_feature attribute to BaseEncoder and remove overrides in OrdinalEncoder --- sklearn/preprocessing/_encoders.py | 209 +++++++++++------------------ 1 file changed, 81 insertions(+), 128 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 52b88c7f3ba26..115af972aaaba 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -13,10 +13,10 @@ from ..utils import check_array, is_scalar_nan, _safe_indexing from ..utils.validation import check_is_fitted from ..utils.validation import _check_feature_names_in -from ..utils.validation import _is_arraylike_not_scalar from ..utils._param_validation import Interval, StrOptions, Hidden from ..utils._param_validation import RealNotInt from ..utils._mask import _get_mask +from ..utils.validation import _is_arraylike_not_scalar from ..utils._encode import _encode, _check_unknown, _unique, _get_counts @@ -76,9 +76,9 @@ def _fit( return_counts=False, return_and_ignore_missing_for_infrequent=False, ): - self._check_infrequent_enabled() self._check_n_features(X, reset=True) self._check_feature_names(X, reset=True) + 
self._check_infrequent_enabled() X_list, n_samples, n_features = self._check_X( X, force_all_finite=force_all_finite ) @@ -251,38 +251,90 @@ def infrequent_categories_(self): for category, indices in zip(self.categories_, infrequent_indices) ] - def _max_categories_enable_infrequent(self, max_categories): + def _validate_max_categories(self): """ - This function checks whether the value of max_categories - enables infrequent categories. + Check max_categories and return the corresponding array. """ - return max_categories is not None and max_categories >= 1 + max_categories = getattr(self, "max_categories", None) + + if isinstance(max_categories, Integral) and max_categories >= 1: + return [max_categories] * self.n_features_in_ + + elif isinstance(max_categories, dict): + if not hasattr(self, "feature_names_in_"): + raise ValueError( + f"{self.__class__.__name__} was not fitted on data " + "with feature names. Pass max_categories as an integer " + "array instead." + ) + + unexpected_feature_names = list( + set(self.max_categories) - set(self.feature_names_in_) + ) + unexpected_feature_names.sort() # deterministic error message + n_unexpeced = len(unexpected_feature_names) + if unexpected_feature_names: + if len(unexpected_feature_names) > 5: + unexpected_feature_names = unexpected_feature_names[:5] + unexpected_feature_names.append("...") + raise ValueError( + f"max_categories contains {n_unexpeced} unexpected feature " + f"names: {unexpected_feature_names}." + ) + + max_categories_array = [None] * self.n_features_in_ + for feature_idx, feature_name in enumerate(self.feature_names_in_): + if feature_name in max_categories: + max_count = max_categories[feature_name] + if not (isinstance(max_count, Integral) and max_count >= 1): + raise ValueError( + f"max_categories['{feature_name}'] must be an " + f"integer at least 1. Got {max_count!r}."
+ ) + max_categories_array[feature_idx] = max_count + return max_categories_array if any(max_categories_array) else None + + elif _is_arraylike_not_scalar(max_categories): + max_categories = np.asarray(max_categories) + if ( + max_categories.ndim != 1 + or max_categories.shape[0] != self.n_features_in_ + ): + raise ValueError( + f"max_categories has shape {max_categories.shape} but the " + f"input data X has {self.n_features_in_} features." + ) + + unexpected_max_counts = set( + max_count + for max_count in max_categories + if not ( + max_count is None + or (isinstance(max_count, Integral) and max_count >= 1) + ) + ) + if len(unexpected_max_counts): + raise ValueError( + "max_categories must be an array-like of None or integers " + "at least 1. Observed " + f"values: {list(unexpected_max_counts)}." + ) + return max_categories if any(max_categories) else None + + else: + return None def _check_infrequent_enabled(self): """ This functions checks whether _infrequent_enabled is True or False. This has to be called after parameter validation in the fit function. """ - max_categories = getattr(self, "max_categories", None) + self._max_categories_per_feature = self._validate_max_categories() min_frequency = getattr(self, "min_frequency", None) self._infrequent_enabled = ( - self._max_categories_enable_infrequent(max_categories) + self._max_categories_per_feature is not None ) or min_frequency is not None - def _has_infrequent_categories(self, n_current_features, col_idx): - """ - This function checks if there are any infrequent categories. - """ - return ( - self.max_categories is not None and self.max_categories < n_current_features - ) - - def _get_frequent_category_count(self, col_idx): - """ - This functions computes the number of frequent categories. - """ - return self.max_categories - 1 - def _identify_infrequent(self, category_count, n_samples, col_idx): """Compute the infrequent indices. 
@@ -312,9 +364,14 @@ def _identify_infrequent(self, category_count, n_samples, col_idx): infrequent_mask = np.zeros(category_count.shape[0], dtype=bool) n_current_features = category_count.size - infrequent_mask.sum() + 1 - if self._has_infrequent_categories(n_current_features, col_idx): + if self._max_categories_per_feature is not None: + max_categories = self._max_categories_per_feature[col_idx] + else: + max_categories = None + + if max_categories is not None and max_categories < n_current_features: # max_categories includes the one infrequent category - frequent_category_count = self._get_frequent_category_count(col_idx) + frequent_category_count = max_categories - 1 if frequent_category_count == 0: # All categories are infrequent infrequent_mask[:] = True @@ -1481,103 +1538,6 @@ def __init__( self.min_frequency = min_frequency self.max_categories = max_categories - def _check_max_categories(self): - """ - Check the max_categories and convert to the corresponding array. - """ - max_categories = self.max_categories - if isinstance(max_categories, dict): - max_categories = np.full( - shape=self.n_features_in_, - fill_value=None, - dtype=object, - ) - if not hasattr(self, "feature_names_in_"): - raise ValueError( - f"{self.__class__.__name__} was not fitted on data " - "with feature names. Pass max_categories as an integer " - "array instead." - ) - unexpected_feature_names = list( - set(self.max_categories) - set(self.feature_names_in_) - ) - unexpected_feature_names.sort() # deterministic error message - n_unexpeced = len(unexpected_feature_names) - if unexpected_feature_names: - if len(unexpected_feature_names) > 5: - unexpected_feature_names = unexpected_feature_names[:5] - unexpected_feature_names.append("...") - raise ValueError( - f"max_categories contains {n_unexpeced} unexpected feature " - f"names: {unexpected_feature_names}." 
- ) - for feature_idx, feature_name in enumerate(self.feature_names_in_): - if feature_name in self.max_categories: - max_count = self.max_categories[feature_name] - if not (isinstance(max_count, Integral) and max_count >= 1): - raise ValueError( - f"max_categories['{feature_name}'] must be an " - f"integer at least 1. Got {max_count!r}." - ) - max_categories[feature_idx] = max_count - else: - max_categories = np.asarray(max_categories) - if max_categories.shape[0] != self.n_features_in_: - raise ValueError( - f"max_categories has shape {max_categories.shape} but the " - f"input data X has {self.n_features_in_} features." - ) - - unexpected_max_counts = set( - max_count - for max_count in max_categories - if not ( - max_count is None - or (isinstance(max_count, Integral) and max_count >= 1) - ) - ) - if len(unexpected_max_counts): - raise ValueError( - "max_categories must be an array-like of None or integers " - "at least 1. Observed " - f"values: {list(unexpected_max_counts)}." - ) - - self.max_categories = max_categories - - def _max_categories_enable_infrequent(self, max_categories): - """ - This function checks whether the value of max_categories - enables infrequent categories. - """ - if max_categories is None: - return False - elif isinstance(max_categories, Integral): - return max_categories >= 1 - else: - return any(max_categories) - - def _has_infrequent_categories(self, n_current_features, col_idx): - """ - This function checks if there are any infrequent categories. - """ - if self.max_categories is None: - return False - if isinstance(self.max_categories, Integral): - return self.max_categories < n_current_features - else: - max_count = self.max_categories[col_idx] - return max_count is not None and max_count < n_current_features - - def _get_frequent_category_count(self, col_idx): - """ - This functions computes the number of frequent categories. 
- """ - if isinstance(self.max_categories, Integral): - return self.max_categories - 1 - else: - return self.max_categories[col_idx] - 1 - def fit(self, X, y=None): """ Fit the OrdinalEncoder to X. @@ -1620,13 +1580,6 @@ def fit(self, X, y=None): f"got {self.unknown_value}." ) - if isinstance(self.max_categories, dict) or _is_arraylike_not_scalar( - self.max_categories - ): - self._check_n_features(X, reset=True) - self._check_feature_names(X, reset=True) - self._check_max_categories() - # `_fit` will only raise an error when `self.handle_unknown="error"` fit_results = self._fit( X, From 20c4489c5a133def0a686ce215e83dfdd3c2d1c1 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Tue, 16 May 2023 23:40:29 -0700 Subject: [PATCH 09/16] Update tests --- sklearn/preprocessing/tests/test_encoders.py | 37 ++++++++++++++++++-- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 7d0f1ed8bd3dc..b7ec54a8c9059 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -2430,7 +2430,38 @@ def test_ordinal_encoder_missing_feature_names(): ordinal.fit(X) -def test_ordinal_encoder_unexpected_feature_names(): +@pytest.mark.parametrize( + "max_categories, unexpected_features, n_unexpected_features", + [ + ({"str": 3, "int": 2, "categorical": 1, "unexpected": 13}, ["unexpected"], 1), + ( + { + "str": 3, + "int": 2, + "categorical": 1, + "unexpected": 13, + "unexpected_2": 13, + "unexpected_3": 13, + "unexpected_3": 13, + "unexpected_4": 13, + "unexpected_5": 13, + "unexpected_6": 13, + }, + [ + "unexpected", + "unexpected_2", + "unexpected_3", + "unexpected_4", + "unexpected_5", + "...", + ], + 6, + ), + ], +) +def test_ordinal_encoder_unexpected_feature_names( + max_categories, unexpected_features, n_unexpected_features +): """Check behavior when max_categories specifies features 
that are not present on a dataset.""" pd = pytest.importorskip("pandas") @@ -2447,11 +2478,11 @@ def test_ordinal_encoder_unexpected_feature_names(): columns=["str", "int", "categorical"], ) - max_categories = {"str": 3, "int": 2, "categorical": 1, "unexpected": 13} ordinal = OrdinalEncoder(max_categories=max_categories) msg = re.escape( - "max_categories contains 1 unexpected feature names: ['unexpected']." + f"max_categories contains {n_unexpected_features} unexpected feature names:" + f" {unexpected_features}." ) with pytest.raises(ValueError, match=msg): From ea0a7fb1b8555ec98ed92a219960b983aad16169 Mon Sep 17 00:00:00 2001 From: Andrew Wang <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Thu, 25 May 2023 21:26:30 -0700 Subject: [PATCH 10/16] Update doc/modules/preprocessing.rst Simplify parameter description of `max_categories` Co-authored-by: Thomas J. Fan --- doc/modules/preprocessing.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 6bb4b2ee9a437..6a1c331f045b8 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -746,11 +746,10 @@ enable the gathering of infrequent categories are `min_frequency` and this fraction of the total number of samples will be considered infrequent. The default value is 1, which means every category is encoded separately. -2. `max_categories` is either `None`, any integer greater or equal to 1, an - array-like comprised of `None` or integers greater than equal to 1 (for - :class:`OrdinalEncoder` only), or a dictionary mapping a feature name found - in `feature_names_in_` to any integer greater than or equal to 1 (for - :class:`OrdinalEncoder` only). This parameter sets an upper limit to the +2. `max_categories` is either `None` or any integer greater than or equal to 1.
+ :class:`OrdinalEncoder` also supports an array-like containing `None` and + integers or a dictionary mapping a feature name found in `feature_names_in_` + to an integer. This parameter sets an upper limit to the number of output categories for each input feature. `max_categories` includes the category that combines infrequent categories. From 342792442aded81225f5b350d01633c03e0b5f8e Mon Sep 17 00:00:00 2001 From: Andrew Wang <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Thu, 25 May 2023 21:27:28 -0700 Subject: [PATCH 11/16] Update sklearn/preprocessing/_encoders.py Sort unexpected feature names only when there is an error Co-authored-by: Thomas J. Fan --- sklearn/preprocessing/_encoders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 115af972aaaba..a71b6987f038b 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -271,9 +271,9 @@ def _validate_max_categories(self): unexpected_feature_names = list( set(self.max_categories) - set(self.feature_names_in_) ) - unexpected_feature_names.sort() # deterministic error message - n_unexpeced = len(unexpected_feature_names) if unexpected_feature_names: + unexpected_feature_names.sort() # deterministic error message + n_unexpeced = len(unexpected_feature_names) if len(unexpected_feature_names) > 5: unexpected_feature_names = unexpected_feature_names[:5] unexpected_feature_names.append("...") From 1fa914ae8fbb4259f3d399ebea94d0cd6a5d9fcc Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Thu, 25 May 2023 21:40:34 -0700 Subject: [PATCH 12/16] Simplify error message for array-like max_categories --- sklearn/preprocessing/_encoders.py | 20 ++++++++------------ sklearn/preprocessing/tests/test_encoders.py | 3 +-- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py 
b/sklearn/preprocessing/_encoders.py index a71b6987f038b..1d74d06723011 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -273,12 +273,12 @@ def _validate_max_categories(self): ) if unexpected_feature_names: unexpected_feature_names.sort() # deterministic error message - n_unexpeced = len(unexpected_feature_names) + n_unexpected = len(unexpected_feature_names) if len(unexpected_feature_names) > 5: unexpected_feature_names = unexpected_feature_names[:5] unexpected_feature_names.append("...") raise ValueError( - f"max_categories contains {n_unexpeced} unexpected feature " + f"max_categories contains {n_unexpected} unexpected feature " f"names: {unexpected_feature_names}." ) @@ -305,20 +305,16 @@ def _validate_max_categories(self): f"input data X has {self.n_features_in_} features." ) - unexpected_max_counts = set( - max_count + if any( + max_count is not None + and not (isinstance(max_count, Integral) and max_count >= 1) for max_count in max_categories - if not ( - max_count is None - or (isinstance(max_count, Integral) and max_count >= 1) - ) - ) - if len(unexpected_max_counts): + ): raise ValueError( "max_categories must be an array-like of None or integers " - "at least 1. Observed " - f"values: {list(unexpected_max_counts)}." + "at least 1." ) + return max_categories if any(max_categories) else None else: diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index b7ec54a8c9059..cc3ccd0bf2ded 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -2560,8 +2560,7 @@ def test_ordinal_encoder_max_categories_array_like_invalid_types(): ordinal = OrdinalEncoder(max_categories=[3, None, 0]) msg = re.escape( - "max_categories must be an array-like of None or integers at least 1. " - "Observed values: [0]." + "max_categories must be an array-like of None or integers at least 1." 
) with pytest.raises(ValueError, match=msg): From 6488f9bac9982488d500d0b612457f1e4cb7028d Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Tue, 13 Jun 2023 21:54:12 -0700 Subject: [PATCH 13/16] Fix indentation on changelog --- doc/whats_new/v1.3.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 4cf0d688b11ee..b05b4997d0019 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -627,12 +627,12 @@ Changelog - |Fix| :class:`AdditiveChi2Sampler` is now stateless. The `sample_interval_` attribute is deprecated and will be removed in 1.5. :pr:`25190` by :user:`Vincent Maladière `. - + - |Fix| :class:`preprocessing.PowerTransformer` now correcly raises error when using `method="box-cox"` on data with a constant `np.nan` column. :pr:`26400` by :user:`Yao Xiao `. - - - |Enhancement| Added support for passing `max_categories` as `array-like` or + +- |Enhancement| Added support for passing `max_categories` as `array-like` or `dict` in :class:`preprocessing.OrdinalEncoder`. This allows specifying the maximum number of output categories for each input feature instead of being restricted to setting a global maximum number of output categories. 
From 1a6e63a8843e7e638a877be1255f7ba3feb7bbf0 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Tue, 20 Jun 2023 18:42:20 -0700 Subject: [PATCH 14/16] Fix linting issues --- sklearn/preprocessing/tests/test_encoders.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index cc3ccd0bf2ded..3d22abb38f269 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -2442,7 +2442,6 @@ def test_ordinal_encoder_missing_feature_names(): "unexpected": 13, "unexpected_2": 13, "unexpected_3": 13, - "unexpected_3": 13, "unexpected_4": 13, "unexpected_5": 13, "unexpected_6": 13, From bce9206dfaa0e90ec9b6d6a4b818a3be3a66dde0 Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Wed, 21 Jun 2023 19:51:17 -0700 Subject: [PATCH 15/16] Fix linting issues --- sklearn/preprocessing/_encoders.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 5c497f7089d92..30cc66d04c785 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -14,7 +14,11 @@ from ..utils._encode import _check_unknown, _encode, _get_counts, _unique from ..utils._mask import _get_mask from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions -from ..utils.validation import _check_feature_names_in, check_is_fitted, _is_arraylike_not_scalar +from ..utils.validation import ( + _check_feature_names_in, + _is_arraylike_not_scalar, + check_is_fitted, +) __all__ = ["OneHotEncoder", "OrdinalEncoder"] From 92e5b0d9fbb4f7876a4793a875277e847572f12d Mon Sep 17 00:00:00 2001 From: Andrew <37257276+Andrew-Wang-IB45@users.noreply.github.com> Date: Wed, 9 Aug 2023 22:26:20 -0700 Subject: [PATCH 16/16] Migrate changelog from v1.3 to v1.4 --- 
doc/whats_new/v1.3.rst | 8 +------- doc/whats_new/v1.4.rst | 6 ++++++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 995144d4a3753..0f46f28cd340b 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -745,7 +745,7 @@ Changelog - |Fix| :class:`preprocessing.PowerTransformer` now correcly raises error when using `method="box-cox"` on data with a constant `np.nan` column. :pr:`26400` by :user:`Yao Xiao `. - + - |Fix| :class:`preprocessing.PowerTransformer` with `method="yeo-johnson"` now leaves constant features unchanged instead of transforming with an arbitrary value for the `lambdas_` fitted parameter. @@ -755,12 +755,6 @@ Changelog :class:`preprocessing.KBinsDiscretizer` will change from `None` to `200_000` in version 1.5 when `strategy="kmeans"` or `strategy="uniform"`. :pr:`26424` by :user:`Jérémie du Boisberranger `. - -- |Enhancement| Added support for passing `max_categories` as `array-like` or - `dict` in :class:`preprocessing.OrdinalEncoder`. This allows specifying the - maximum number of output categories for each input feature instead of being - restricted to setting a global maximum number of output categories. - :pr:`26284` by :user:`Andrew Wang `. :mod:`sklearn.svm` .................. diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index a7f2df245f105..817e3d4813093 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -188,6 +188,12 @@ Changelog - |Enhancement| :func:`sklearn.model_selection.train_test_split` now supports Array API compatible inputs. :pr:`26855` by `Tim Head`_. +- |Enhancement| Added support for passing `max_categories` as `array-like` or + `dict` in :class:`preprocessing.OrdinalEncoder`. This allows specifying the + maximum number of output categories for each input feature instead of being + restricted to setting a global maximum number of output categories. + :pr:`26284` by :user:`Andrew Wang `. 
+ :mod:`sklearn.tree` ...................