ENH Automatic handling of categorical columns in Hist Gradient Boosting models #24907

Closed
wants to merge 16 commits
27 changes: 19 additions & 8 deletions doc/modules/ensemble.rst
@@ -1077,10 +1077,13 @@ categorical features as continuous (ordinal), which happens for ordinal-encoded
categorical data, since categories are nominal quantities where order does not
matter.

To enable categorical support, a boolean mask can be passed to the
`categorical_features` parameter, indicating which feature is categorical. In
the following, the first feature will be treated as categorical and the
second feature as numerical::
There are several ways to use the native categorical feature support of these
estimators. The simplest one is to pass the training data as a `pandas.DataFrame`
in which the categorical features have the `category` dtype.

Alternatively, it is possible to pass a boolean mask to the `categorical_features`
parameter, indicating which features are categorical. In the following, the first
feature will be treated as categorical and the second feature as numerical::

    >>> gbdt = HistGradientBoostingClassifier(categorical_features=[True, False])

@@ -1089,10 +1092,18 @@ categorical features::

    >>> gbdt = HistGradientBoostingClassifier(categorical_features=[0])

The cardinality of each categorical feature should be less than the `max_bins`
parameter, and each categorical feature is expected to be encoded in
`[0, max_bins - 1]`. To that end, it might be useful to pre-process the data
with an :class:`~sklearn.preprocessing.OrdinalEncoder` as done in
Finally, one can pass a list of strings indicating the names of the categorical
features if the training data is passed as a dataframe with string column names::

    >>> gbdt = HistGradientBoostingClassifier(categorical_features=['f0'])

In any case, the cardinality of each categorical feature should be less than
the `max_bins` parameter, and each categorical feature is expected to be
encoded in `[0, max_bins - 1]`.
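
For instance (an illustrative sketch, not taken from the existing examples),
with the default `max_bins=255` the first feature below is integer-coded with
the category codes 0, 1 and 2::

    >>> import numpy as np
    >>> X = np.array([[0, 1.5], [2, 0.3], [1, 2.1]])
    >>> gbdt = HistGradientBoostingClassifier(categorical_features=[0])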

If the original data does not already use a numerical encoding for the
categorical features, it can be pre-processed with an
:class:`~sklearn.preprocessing.OrdinalEncoder` as done in
:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`.
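
A hedged sketch of such a pre-processing pipeline (it reuses the encoder
settings from that example and assumes the first input column is categorical)::

    >>> import numpy as np
    >>> from sklearn.pipeline import make_pipeline
    >>> from sklearn.preprocessing import OrdinalEncoder
    >>> encoder = OrdinalEncoder(
    ...     handle_unknown="use_encoded_value", unknown_value=np.nan
    ... )
    >>> gbdt = make_pipeline(
    ...     encoder, HistGradientBoostingClassifier(categorical_features=[0])
    ... )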

If there are missing values during training, the missing values will be
7 changes: 7 additions & 0 deletions doc/whats_new/v1.2.rst
@@ -308,6 +308,13 @@ Changelog
- |Efficiency| Improve runtime performance of :class:`ensemble.IsolationForest`
by avoiding data copies. :pr:`23252` by :user:`Zhehao Liu <MaxwellLZH>`.

- |Enhancement| Make it possible to pass a `pandas.DataFrame` instance with
columns explicitly typed with the `category` dtype directly to
:class:`ensemble.HistGradientBoostingClassifier` and
:class:`ensemble.HistGradientBoostingRegressor` without having to manually
ordinally encode such columns ahead of time.
:pr:`24907` by :user:`Olivier Grisel <ogrisel>`.

- |Enhancement| :class:`ensemble.StackingClassifier` now accepts any kind of
base estimator.
:pr:`24538` by :user:`Guillem G Subies <GuillemGSubies>`.
78 changes: 25 additions & 53 deletions examples/applications/plot_cyclical_feature_engineering.py
@@ -171,58 +171,24 @@
# efficiently handle heterogeneous tabular data with a mix of categorical and
# numerical features as long as the number of samples is large enough.
#
# Here, we do minimal ordinal encoding for the categorical variables and then
# let the model know that it should treat those as categorical variables by
# using a dedicated tree splitting rule. Since we use an ordinal encoder, we
# pass the list of categorical values explicitly to use a logical order when
# encoding the categories as integers instead of the lexicographical order.
# This also has the added benefit of preventing any issue with unknown
# categories when using cross-validation.
#
# The numerical variables need no preprocessing and, for the sake of simplicity,
# we only try the default hyper-parameters for this model:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate


categorical_columns = [
    "weather",
    "season",
    "holiday",
    "workingday",
]
categories = [
    ["clear", "misty", "rain"],
    ["spring", "summer", "fall", "winter"],
    ["False", "True"],
    ["False", "True"],
]
ordinal_encoder = OrdinalEncoder(categories=categories)
# The categorical variables are already typed as such in the pandas dataframe.
# As a result, the gradient boosted trees model will automatically treat them
# as categorical internally:
categorical_columns = X.select_dtypes("category").columns.tolist()
categorical_columns

# %%
# The numerical variables do not need any preprocessing either and, for the
# sake of simplicity, we only try the default hyper-parameters for this model:
from sklearn.ensemble import HistGradientBoostingRegressor

gbrt_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ("categorical", ordinal_encoder, categorical_columns),
        ],
        remainder="passthrough",
        # Use short feature names to make it easier to specify the categorical
        # variables in the HistGradientBoostingRegressor in the next
        # step of the pipeline.
        verbose_feature_names_out=False,
    ),
    HistGradientBoostingRegressor(
        categorical_features=categorical_columns,
    ),
).set_output(transform="pandas")
gbrt = HistGradientBoostingRegressor()

# %%
#
# Let's evaluate our gradient boosting model with the mean absolute error of the
# relative demand averaged across our 5 time-based cross-validation splits:
from sklearn.model_selection import cross_validate


def evaluate(model, X, y, cv):
@@ -241,15 +207,19 @@ def evaluate(model, X, y, cv):
)


evaluate(gbrt_pipeline, X, y, cv=ts_cv)
evaluate(gbrt, X, y, cv=ts_cv)

# %%
# This model has an average error around 4 to 5% of the maximum demand. This is
# quite good for a first trial without any hyper-parameter tuning! We just had
# to make the categorical variables explicit. Note that the time related
# features are passed as is, i.e. without processing them. But this is not much
# of a problem for tree-based models as they can learn a non-monotonic
# relationship between ordinal input features and the target.
# quite good for a first trial without any feature engineering or
# hyper-parameter tuning! We just had to ensure that we use the
# "category" dtype in the pandas dataframe for all the categorical
# variables, as shown in the quick check below.
#
# Note that the time-related features are passed as is, i.e. without processing
# them. But this is not much of a problem for tree-based models as they can
# learn a non-monotonic relationship between ordinal input features and the
# target.
#
# This is not the case for linear regression models as we will see in the
# following.
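
# %%
# As a quick, illustrative check (not part of the original example), list the
# dtypes of the input dataframe to confirm which columns the model treats as
# categorical:
X.dtypes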
@@ -261,6 +231,8 @@ def evaluate(model, X, y, cv):
# For consistency, we scale the numerical features to the same 0-1 range using
# :class:`sklearn.preprocessing.MinMaxScaler`, although in this case it does not
# impact the results much because they are already on comparable scales:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import RidgeCV
@@ -712,8 +684,8 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
# Let us now have a qualitative look at the predictions of the kernel models
# and of the gradient boosted trees that should be able to better model
# non-linear interactions between features:
gbrt_pipeline.fit(X.iloc[train_0], y.iloc[train_0])
gbrt_predictions = gbrt_pipeline.predict(X.iloc[test_0])
gbrt.fit(X.iloc[train_0], y.iloc[train_0])
gbrt_predictions = gbrt.predict(X.iloc[test_0])

one_hot_poly_pipeline.fit(X.iloc[train_0], y.iloc[train_0])
one_hot_poly_predictions = one_hot_poly_pipeline.predict(X.iloc[test_0])
64 changes: 37 additions & 27 deletions examples/ensemble/plot_gradient_boosting_categorical.py
@@ -33,7 +33,8 @@
X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True, parser="pandas")

# Select only a subset of features of X to make the example faster to run
categorical_columns_subset = [
column_subset = [
    # Categorical features:
    "BldgType",
    "GarageFinish",
    "LotConfig",
@@ -44,9 +45,7 @@
    "ExterCond",
    "ExterQual",
    "PoolQC",
]

numerical_columns_subset = [
    # Numerical features:
    "3SsnPorch",
    "Fireplaces",
    "BsmtHalfBath",
@@ -59,8 +58,12 @@
    "ScreenPorch",
]

X = X[categorical_columns_subset + numerical_columns_subset]
X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
# Comment out the line below to run the example on the full dataset:
X = X[column_subset]

# Explicitly type the categorical columns as such.
object_dtyped_columns = X.select_dtypes(include=["object"]).columns
X[object_dtyped_columns] = X[object_dtyped_columns].astype("category")
categorical_columns = X.select_dtypes(include="category").columns
n_categorical_features = len(categorical_columns)
@@ -123,10 +126,6 @@
        make_column_selector(dtype_include="category"),
    ),
    remainder="passthrough",
    # Use short feature names to make it easier to specify the categorical
    # variables in the HistGradientBoostingRegressor in the next step
    # of the pipeline.
    verbose_feature_names_out=False,
)

hist_ordinal = make_pipeline(
@@ -140,24 +139,35 @@
# that will natively handle categorical features. This estimator will not treat
# categorical features as ordered quantities.
#
# Since the :class:`~ensemble.HistGradientBoostingRegressor` requires category
# values to be encoded in `[0, n_unique_categories - 1]`, we still rely on an
# :class:`~preprocessing.OrdinalEncoder` to pre-process the data.
#
# The main difference between this pipeline and the previous one is that in
# this one, we let the :class:`~ensemble.HistGradientBoostingRegressor` know
# which features are categorical.
# To benefit from this, one option is to encode the categorical features with
# the pandas categorical dtype, as we already did at the beginning of this
# example with the call to `.astype("category")`.
hist_native = HistGradientBoostingRegressor(random_state=42)

# The ordinal encoder will first output the categorical features, and then the
# continuous (passed-through) features
# %%
# Note that this is equivalent to using an ordinal encoder that outputs a pandas
# dataframe with unchanged column names and then passing the names of the
# categorical features to the ``categorical_features`` constructor parameter of
# :class:`~ensemble.HistGradientBoostingRegressor`:

ordinal_encoder = make_column_transformer(
    (
        OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan),
        categorical_columns,
    ),
    remainder="passthrough",
    # Use short feature names to make it easier to specify the categorical
    # variables in the HistGradientBoostingRegressor in the next step
    # of the pipeline.
    verbose_feature_names_out=False,
).set_output(transform="pandas")

hist_native = make_pipeline(
hist_native2 = make_pipeline(
    ordinal_encoder,
    HistGradientBoostingRegressor(
        random_state=42,
        categorical_features=categorical_columns,
        categorical_features=categorical_columns, random_state=42
    ),
).set_output(transform="pandas")
)

# %%
# Model comparison
@@ -254,11 +264,12 @@ def plot_results(figure_title):
# we artificially limit the total number of splits by both limiting the number
# of trees and the depth of each tree.

for pipe in (hist_dropped, hist_one_hot, hist_ordinal, hist_native):
for pipe in (hist_dropped, hist_one_hot, hist_ordinal):
    pipe.set_params(
        histgradientboostingregressor__max_depth=3,
        histgradientboostingregressor__max_iter=15,
    )
hist_native.set_params(max_depth=3, max_iter=15)

dropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)
one_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)
@@ -272,6 +283,5 @@ def plot_results(figure_title):
# %%
# The results for these under-fitting models confirm our previous intuition:
# the native category handling strategy performs the best when the splitting
# budget is constrained. The two other strategies (one-hot encoding and
# treating categories as ordinal values) lead to error values comparable
# to the baseline model that just dropped the categorical features altogether.
Review comment from the PR author:
This remark was no longer true since the example was made to run on a subset of the columns to make it run faster. Compare the last plot of:

and:

So I fixed this as part of the PR (along with a simplification of the column indexing code).

# budget is constrained. Note that this effect is even more pronounced when
# we include all the features from the original dataset.