168 changes: 108 additions & 60 deletions examples/ensemble/plot_gradient_boosting_categorical.py
@@ -10,12 +10,12 @@
different encoding strategies for categorical features. In
particular, we will evaluate:

- dropping the categorical features
- using a :class:`~preprocessing.OneHotEncoder`
- using an :class:`~preprocessing.OrdinalEncoder` and treat categories as
ordered, equidistant quantities
- using an :class:`~preprocessing.OrdinalEncoder` and rely on the :ref:`native
category support <categorical_support_gbdt>` of the
- "Dropped": dropping the categorical features;
- "One Hot": using a :class:`~preprocessing.OneHotEncoder`;
- "Ordinal": using an :class:`~preprocessing.OrdinalEncoder` and treating
  categories as ordered, equidistant quantities;
- "Native": using an :class:`~preprocessing.OrdinalEncoder` and relying on the
  :ref:`native category support <categorical_support_gbdt>` of the
  :class:`~ensemble.HistGradientBoostingRegressor` estimator.

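As a primer (a hedged toy sketch, not part of this example's pipelines), the
snippet below shows on a made-up column how one-hot encoding widens the data
while ordinal encoding keeps a single column of integer codes::

    import pandas as pd
    from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

    toy = pd.DataFrame({"roof": ["gable", "hip", "flat", "gable"]})
    print(OneHotEncoder(sparse_output=False).fit_transform(toy).shape)  # (4, 3)
    print(OrdinalEncoder().fit_transform(toy).shape)  # (4, 1)
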
We will work with the Ames Iowa Housing dataset which consists of numerical
@@ -92,6 +92,7 @@
    ("drop", make_column_selector(dtype_include="category")), remainder="passthrough"
)
hist_dropped = make_pipeline(dropper, HistGradientBoostingRegressor(random_state=42))
hist_dropped
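
# %%
# As a quick sanity check (a hedged sketch, not in the original example, and
# assuming the ``make_column_selector`` import from earlier in this file), we
# can call the same selector to list which columns the "Dropped" strategy
# discards:

categorical_columns = make_column_selector(dtype_include="category")(X)
print(f"{len(categorical_columns)} categorical columns dropped")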

# %%
# Gradient boosting estimator with one-hot encoding
@@ -112,6 +113,7 @@
hist_one_hot = make_pipeline(
    one_hot_encoder, HistGradientBoostingRegressor(random_state=42)
)
hist_one_hot

# %%
# Gradient boosting estimator with ordinal encoding
@@ -139,6 +141,7 @@
hist_ordinal = make_pipeline(
    ordinal_encoder, HistGradientBoostingRegressor(random_state=42)
)
hist_ordinal
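
# %%
# A hedged toy sketch (not part of the example): the integer codes follow the
# alphabetical order of the category strings, so "excellent" sorts before
# "good". This is why treating the codes as ordered, equidistant quantities
# can mislead the model when categories have no natural order:

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

toy_quality = pd.DataFrame({"quality": ["good", "bad", "excellent"]})
print(OrdinalEncoder().fit_transform(toy_quality))  # good=2, bad=0, excellent=1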

# %%
# Gradient boosting estimator with native categorical support
@@ -156,67 +159,105 @@
hist_native = HistGradientBoostingRegressor(
    random_state=42, categorical_features="from_dtype"
)
hist_native
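
# %%
# With ``categorical_features="from_dtype"``, the estimator infers which
# columns are categorical from the pandas dtypes. A quick (hedged, not part of
# the original example) sanity check is to look at the dtype counts of ``X``:

print(X.dtypes.value_counts())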

# %%
# Model comparison
# ----------------
# Finally, we evaluate the models using cross validation. Here we compare the
# models' performance in terms of
# :func:`~metrics.mean_absolute_percentage_error` and fit times.
# Here we use :term:`cross validation` to compare the models' performance in
# terms of :func:`~metrics.mean_absolute_percentage_error` and fit times. In the
# upcoming plots, error bars represent 1 standard deviation as computed across
# folds.

from sklearn.model_selection import cross_validate

common_params = {"cv": 5, "scoring": "neg_mean_absolute_percentage_error", "n_jobs": -1}

dropped_result = cross_validate(hist_dropped, X, y, **common_params)
one_hot_result = cross_validate(hist_one_hot, X, y, **common_params)
ordinal_result = cross_validate(hist_ordinal, X, y, **common_params)
native_result = cross_validate(hist_native, X, y, **common_params)
results = [
("Dropped", dropped_result),
("One Hot", one_hot_result),
("Ordinal", ordinal_result),
("Native", native_result),
]
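
# %%
# Before plotting, a hedged aside (not in the original example): scikit-learn
# returns *negated* scores for the ``neg_mean_absolute_percentage_error``
# scorer, so we flip the sign when reporting or plotting them.

for name, result in results:
    print(
        f"{name:>8}: MAPE = {-result['test_score'].mean():.4f}, "
        f"fit time = {result['fit_time'].mean():.2f} s"
    )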

# %%
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from sklearn.model_selection import cross_validate

scoring = "neg_mean_absolute_percentage_error"
n_cv_folds = 3

dropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)
one_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)
ordinal_result = cross_validate(hist_ordinal, X, y, cv=n_cv_folds, scoring=scoring)
native_result = cross_validate(hist_native, X, y, cv=n_cv_folds, scoring=scoring)


def plot_results(figure_title):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))

    plot_info = [
        ("fit_time", "Fit times (s)", ax1, None),
        ("test_score", "Mean Absolute Percentage Error", ax2, None),
    ]

    x, width = np.arange(4), 0.9
    for key, title, ax, y_limit in plot_info:
        items = [
            dropped_result[key],
            one_hot_result[key],
            ordinal_result[key],
            native_result[key],
        ]

        mape_cv_mean = [np.mean(np.abs(item)) for item in items]
        mape_cv_std = [np.std(item) for item in items]

        ax.bar(
            x=x,
            height=mape_cv_mean,
            width=width,
            yerr=mape_cv_std,
            color=["C0", "C1", "C2", "C3"],
        )
        ax.set(
            xlabel="Model",
            title=title,
            xticks=x,
            xticklabels=["Dropped", "One Hot", "Ordinal", "Native"],
            ylim=y_limit,
        )
    fig.suptitle(figure_title)
def plot_performance_tradeoff(results, title):
    fig, ax = plt.subplots()
    markers = ["s", "o", "^", "x"]
[Inline review thread on the line above]
Member: Nitpick, the last graph, it is a bit tricky to quickly tell the
native and ordinal markers/error bars apart. Would it be 'easy' to change the
colour of the marker, to be the same colour as the scatter points?
[screenshot of the plot]
Member (author): I think that risks leading the reader to think that the
marker is an actual scatter point.
Member: That's a good point. Stupid question, for 'Ordinal', why is the black
error bar (?) marker not centered between the horizontal and vertical error
bars?
Member: Nevermind, I see that it is in the new rendered doc; for some reason
the horizontal error bar was not rendering properly in the first one above.
[screenshot of the plot]
    for idx, (name, result) in enumerate(results):
        test_error = -result["test_score"]
        mean_fit_time = np.mean(result["fit_time"])
        mean_score = np.mean(test_error)
        std_fit_time = np.std(result["fit_time"])
        std_score = np.std(test_error)

        ax.scatter(
            result["fit_time"],
            test_error,
            label=name,
            marker=markers[idx],
        )
        ax.scatter(
            mean_fit_time,
            mean_score,
            color="k",
            marker=markers[idx],
        )
        ax.errorbar(
            x=mean_fit_time,
            y=mean_score,
            yerr=std_score,
            c="k",
            capsize=2,
        )
        ax.errorbar(
            x=mean_fit_time,
            y=mean_score,
            xerr=std_fit_time,
            c="k",
            capsize=2,
        )
    ax.set_xscale("log")

    nticks = 7
    x0, x1 = np.log10(ax.get_xlim())
    ticks = np.logspace(x0, x1, nticks)
    ax.set_xticks(ticks)
    ax.xaxis.set_major_formatter(ticker.FormatStrFormatter("%1.1e"))
    ax.minorticks_off()

    ax.annotate(
        " best\nmodels",
        xy=(0.05, 0.05),
        xycoords="axes fraction",
        xytext=(0.1, 0.15),
        textcoords="axes fraction",
        arrowprops=dict(arrowstyle="->", lw=1.5),
    )
    ax.set_xlabel("Time to fit (seconds)")
    ax.set_ylabel("Mean Absolute Percentage Error")
    ax.set_title(title)
    ax.legend()
    plt.show()

plot_results("Gradient Boosting on Ames Housing")

plot_performance_tradeoff(results, "Gradient Boosting on Ames Housing")

# %%
# In the plot above, the "best models" are those that are closer to the
# lower-left corner, as indicated by the arrow: such models combine fast
# fitting with low prediction error.
#
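# To make the timing discussion below concrete, a hedged sketch (not part of
# the original example) counts the columns each preprocessor produces:

n_features = {
    "Dropped": dropper.fit_transform(X).shape[1],
    "One Hot": one_hot_encoder.fit_transform(X).shape[1],
    "Ordinal": ordinal_encoder.fit_transform(X).shape[1],
}
print(n_features)

# %%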
# We see that the model with one-hot-encoded data is by far the slowest. This
# is to be expected, since one-hot-encoding creates one additional feature per
# category value (for each categorical feature), and thus more split points
@@ -264,14 +305,21 @@ def plot_results(figure_title):
histgradientboostingregressor__max_iter=15,
)

dropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)
one_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)
ordinal_result = cross_validate(hist_ordinal, X, y, cv=n_cv_folds, scoring=scoring)
native_result = cross_validate(hist_native, X, y, cv=n_cv_folds, scoring=scoring)

plot_results("Gradient Boosting on Ames Housing (few and small trees)")
dropped_result = cross_validate(hist_dropped, X, y, **common_params)
one_hot_result = cross_validate(hist_one_hot, X, y, **common_params)
ordinal_result = cross_validate(hist_ordinal, X, y, **common_params)
native_result = cross_validate(hist_native, X, y, **common_params)
results_underfit = [
("Dropped", dropped_result),
("One Hot", one_hot_result),
("Ordinal", ordinal_result),
("Native", native_result),
]

plt.show()
# %%
plot_performance_tradeoff(
results_underfit, "Gradient Boosting on Ames Housing (few and shallow trees)"
)

# %%
# The results for these under-fitting models confirm our previous intuition: