diff --git a/examples/ensemble/plot_gradient_boosting_categorical.py b/examples/ensemble/plot_gradient_boosting_categorical.py
index e80c0fb6fdc6e..2e1132584fcc2 100644
--- a/examples/ensemble/plot_gradient_boosting_categorical.py
+++ b/examples/ensemble/plot_gradient_boosting_categorical.py
@@ -10,12 +10,12 @@
 different encoding strategies for categorical features. In particular, we will
 evaluate:
 
-- dropping the categorical features
-- using a :class:`~preprocessing.OneHotEncoder`
-- using an :class:`~preprocessing.OrdinalEncoder` and treat categories as
-  ordered, equidistant quantities
-- using an :class:`~preprocessing.OrdinalEncoder` and rely on the :ref:`native
-  category support <categorical_support_gbdt>` of the
+- "Dropped": dropping the categorical features;
+- "One Hot": using a :class:`~preprocessing.OneHotEncoder`;
+- "Ordinal": using an :class:`~preprocessing.OrdinalEncoder` and treating
+  categories as ordered, equidistant quantities;
+- "Native": using an :class:`~preprocessing.OrdinalEncoder` and relying on
+  the :ref:`native category support <categorical_support_gbdt>` of the
   :class:`~ensemble.HistGradientBoostingRegressor` estimator.
 
 We will work with the Ames Iowa Housing dataset which consists of numerical
@@ -92,6 +92,7 @@
     ("drop", make_column_selector(dtype_include="category")), remainder="passthrough"
 )
 hist_dropped = make_pipeline(dropper, HistGradientBoostingRegressor(random_state=42))
+hist_dropped
 
 # %%
 # Gradient boosting estimator with one-hot encoding
@@ -112,6 +113,7 @@
 hist_one_hot = make_pipeline(
     one_hot_encoder, HistGradientBoostingRegressor(random_state=42)
 )
+hist_one_hot
 
 # %%
 # Gradient boosting estimator with ordinal encoding
@@ -139,6 +141,7 @@
 hist_ordinal = make_pipeline(
     ordinal_encoder, HistGradientBoostingRegressor(random_state=42)
 )
+hist_ordinal
 
 # %%
 # Gradient boosting estimator with native categorical support
@@ -156,67 +159,105 @@
 hist_native = HistGradientBoostingRegressor(
     random_state=42, categorical_features="from_dtype"
 )
+hist_native
 
 # %%
 # Model comparison
 # ----------------
-# Finally, we evaluate the models using cross validation. Here we compare the
-# models performance in terms of
-# :func:`~metrics.mean_absolute_percentage_error` and fit times.
+# Here we use :term:`cross validation` to compare the models' performance in
+# terms of :func:`~metrics.mean_absolute_percentage_error` and fit times. In
+# the upcoming plots, error bars represent one standard deviation as computed
+# across folds.
+from sklearn.model_selection import cross_validate
+
+common_params = {"cv": 5, "scoring": "neg_mean_absolute_percentage_error", "n_jobs": -1}
+
+dropped_result = cross_validate(hist_dropped, X, y, **common_params)
+one_hot_result = cross_validate(hist_one_hot, X, y, **common_params)
+ordinal_result = cross_validate(hist_ordinal, X, y, **common_params)
+native_result = cross_validate(hist_native, X, y, **common_params)
+results = [
+    ("Dropped", dropped_result),
+    ("One Hot", one_hot_result),
+    ("Ordinal", ordinal_result),
+    ("Native", native_result),
+]
+
+# %%
 import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
 
-from sklearn.model_selection import cross_validate
-
-scoring = "neg_mean_absolute_percentage_error"
-n_cv_folds = 3
-
-dropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)
-one_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)
-ordinal_result = cross_validate(hist_ordinal, X, y, cv=n_cv_folds, scoring=scoring)
-native_result = cross_validate(hist_native, X, y, cv=n_cv_folds, scoring=scoring)
-
-
-def plot_results(figure_title):
-    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
-
-    plot_info = [
-        ("fit_time", "Fit times (s)", ax1, None),
-        ("test_score", "Mean Absolute Percentage Error", ax2, None),
-    ]
-
-    x, width = np.arange(4), 0.9
-    for key, title, ax, y_limit in plot_info:
-        items = [
-            dropped_result[key],
-            one_hot_result[key],
-            ordinal_result[key],
-            native_result[key],
-        ]
-
-        mape_cv_mean = [np.mean(np.abs(item)) for item in items]
-        mape_cv_std = [np.std(item) for item in items]
-
-        ax.bar(
-            x=x,
-            height=mape_cv_mean,
-            width=width,
-            yerr=mape_cv_std,
-            color=["C0", "C1", "C2", "C3"],
+def plot_performance_tradeoff(results, title):
+    fig, ax = plt.subplots()
+    markers = ["s", "o", "^", "x"]
+
+    for idx, (name, result) in enumerate(results):
+        test_error = -result["test_score"]
+        mean_fit_time = np.mean(result["fit_time"])
+        mean_score = np.mean(test_error)
+        std_fit_time = np.std(result["fit_time"])
+        std_score = np.std(test_error)
+
+        ax.scatter(
+            result["fit_time"],
+            test_error,
+            label=name,
+            marker=markers[idx],
+        )
+        ax.scatter(
+            mean_fit_time,
+            mean_score,
+            color="k",
+            marker=markers[idx],
         )
-        ax.set(
-            xlabel="Model",
-            title=title,
-            xticks=x,
-            xticklabels=["Dropped", "One Hot", "Ordinal", "Native"],
-            ylim=y_limit,
+        ax.errorbar(
+            x=mean_fit_time,
+            y=mean_score,
+            yerr=std_score,
+            c="k",
+            capsize=2,
         )
-    fig.suptitle(figure_title)
+        ax.errorbar(
+            x=mean_fit_time,
+            y=mean_score,
+            xerr=std_fit_time,
+            c="k",
+            capsize=2,
+        )
+
+    ax.set_xscale("log")
+
+    nticks = 7
+    x0, x1 = np.log10(ax.get_xlim())
+    ticks = np.logspace(x0, x1, nticks)
+    ax.set_xticks(ticks)
+    ax.xaxis.set_major_formatter(ticker.FormatStrFormatter("%1.1e"))
+    ax.minorticks_off()
+    ax.annotate(
+        " best\nmodels",
+        xy=(0.05, 0.05),
+        xycoords="axes fraction",
+        xytext=(0.1, 0.15),
+        textcoords="axes fraction",
+        arrowprops=dict(arrowstyle="->", lw=1.5),
+    )
+    ax.set_xlabel("Time to fit (seconds)")
+    ax.set_ylabel("Mean Absolute Percentage Error")
+    ax.set_title(title)
+    ax.legend()
+    plt.show()
 
 
-plot_results("Gradient Boosting on Ames Housing")
+
+plot_performance_tradeoff(results, "Gradient Boosting on Ames Housing")
 
 # %%
+# In the plot above, the "best models" are those that are closer to the
+# lower-left corner, as indicated by the arrow: those models combine fast
+# fitting with low prediction error.
+#
 # We see that the model with one-hot-encoded data is by far the slowest. This
 # is to be expected, since one-hot-encoding creates one additional feature per
 # category value (for each categorical feature), and thus more split points
 # need to be considered during fit time.
 
@@ -264,14 +305,21 @@ def plot_results(figure_title):
     histgradientboostingregressor__max_iter=15,
 )
 
-dropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)
-one_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)
-ordinal_result = cross_validate(hist_ordinal, X, y, cv=n_cv_folds, scoring=scoring)
-native_result = cross_validate(hist_native, X, y, cv=n_cv_folds, scoring=scoring)
-
-plot_results("Gradient Boosting on Ames Housing (few and small trees)")
+dropped_result = cross_validate(hist_dropped, X, y, **common_params)
+one_hot_result = cross_validate(hist_one_hot, X, y, **common_params)
+ordinal_result = cross_validate(hist_ordinal, X, y, **common_params)
+native_result = cross_validate(hist_native, X, y, **common_params)
+results_underfit = [
+    ("Dropped", dropped_result),
+    ("One Hot", one_hot_result),
+    ("Ordinal", ordinal_result),
+    ("Native", native_result),
+]
 
-plt.show()
+# %%
+plot_performance_tradeoff(
+    results_underfit, "Gradient Boosting on Ames Housing (few and shallow trees)"
+)
 
 # %%
 # The results for these under-fitting models confirm our previous intuition:
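
For reference, below is a minimal, self-contained sketch of the cross-validation
comparison pattern this diff introduces. It is not part of the diff: the
make_regression data and the two HistGradientBoostingRegressor configurations
("Shallow", "Default") are illustrative stand-ins for the example's four Ames
Housing pipelines; only the cross_validate setup and the fit-time versus error
plot mirror the changed code.

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import make_regression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate

# Synthetic stand-in for the Ames Housing data; shift the target away from
# zero so that MAPE stays well-behaved.
X, y = make_regression(n_samples=500, n_features=10, noise=10, random_state=0)
y = y - y.min() + 100

# Same cross-validation setup as the diff: 5 folds, negated MAPE scoring.
common_params = {"cv": 5, "scoring": "neg_mean_absolute_percentage_error", "n_jobs": -1}

models = [
    ("Shallow", HistGradientBoostingRegressor(max_depth=2, random_state=42)),
    ("Default", HistGradientBoostingRegressor(random_state=42)),
]

fig, ax = plt.subplots()
for name, model in models:
    result = cross_validate(model, X, y, **common_params)
    test_error = -result["test_score"]  # the scorer is negated, so flip the sign
    # One point per fold, plus a black mean marker with +/- 1 std error bars,
    # mirroring plot_performance_tradeoff in the diff.
    ax.scatter(result["fit_time"], test_error, label=name)
    ax.errorbar(
        x=np.mean(result["fit_time"]),
        y=np.mean(test_error),
        xerr=np.std(result["fit_time"]),
        yerr=np.std(test_error),
        c="k",
        capsize=2,
    )
ax.set_xscale("log")
ax.set_xlabel("Time to fit (seconds)")
ax.set_ylabel("Mean Absolute Percentage Error")
ax.legend()
plt.show()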