DOC Update plots in Categorical Feature Support in GBDT example #31062

Open · wants to merge 5 commits into main

Changes from all commits

185 changes: 130 additions & 55 deletions examples/ensemble/plot_gradient_boosting_categorical.py
@@ -92,6 +92,7 @@
("drop", make_column_selector(dtype_include="category")), remainder="passthrough"
)
hist_dropped = make_pipeline(dropper, HistGradientBoostingRegressor(random_state=42))
hist_dropped
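
# %%
# As a quick illustration (an aside, not part of the original example),
# dropping the categorical columns of the Ames housing frame ``X`` leaves
# only the numerical ones:
print(dropper.fit_transform(X).shape)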

# %%
# Gradient boosting estimator with one-hot encoding
@@ -112,6 +113,7 @@
hist_one_hot = make_pipeline(
    one_hot_encoder, HistGradientBoostingRegressor(random_state=42)
)
hist_one_hot
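
# %%
# As a rough indication of why one-hot encoding can be costly (an aside, not
# part of the original example): it creates one column per category, so the
# total number of categories bounds the width of the encoded data:
n_categories = sum(X[col].cat.categories.size for col in X.select_dtypes("category"))
print(f"{n_categories} categories across the categorical columns")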

# %%
# Gradient boosting estimator with ordinal encoding
@@ -139,6 +141,7 @@
hist_ordinal = make_pipeline(
    ordinal_encoder, HistGradientBoostingRegressor(random_state=42)
)
hist_ordinal
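
# %%
# Ordinal encoding, by contrast, maps each category to an integer code and
# keeps the number of columns unchanged (a quick check, not part of the
# original example):
n_encoded = ordinal_encoder.fit_transform(X).shape[1]
print(f"{X.shape[1]} columns before encoding, {n_encoded} after")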

# %%
# Gradient boosting estimator with native categorical support
@@ -156,65 +159,130 @@
hist_native = HistGradientBoostingRegressor(
    random_state=42, categorical_features="from_dtype"
)
hist_native
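
# %%
# With ``categorical_features="from_dtype"``, every column with a pandas
# ``category`` dtype is treated as categorical by the estimator. We can list
# the columns this covers (an aside, not part of the original example):
print(X.select_dtypes("category").columns.tolist())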

# %%
# Model comparison
# ----------------
# Here we use cross validation to compare the models' performance in terms of
# :func:`~metrics.mean_absolute_percentage_error` and fit times.

from sklearn.model_selection import cross_validate

common_params = {"cv": 5, "scoring": "neg_mean_absolute_percentage_error", "n_jobs": -1}

dropped_result = cross_validate(hist_dropped, X, y, **common_params)
one_hot_result = cross_validate(hist_one_hot, X, y, **common_params)
ordinal_result = cross_validate(hist_ordinal, X, y, **common_params)
native_result = cross_validate(hist_native, X, y, **common_params)
results = [
    ("Dropped", dropped_result),
    ("One Hot", one_hot_result),
    ("Ordinal", ordinal_result),
    ("Native", native_result),
]
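
# %%
# Before plotting, a quick textual summary of each candidate's mean fit time
# and mean test error (an aside, not part of the original example):
for name, result in results:
    print(
        f"{name:>8}: fit time = {np.mean(result['fit_time']):.2f}s, "
        f"MAPE = {-np.mean(result['test_score']):.4f}"
    )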

# %%
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker


class CustomLogFormatter(ticker.Formatter):
    """Format log-axis tick values as powers of ten, e.g. ``10²`` or ``3.2x10²``."""

    def __call__(self, x, pos=None):
        if x == 0:
            return "0"
        exponent = int(np.floor(np.log10(x)))
        coeff = x / (10**exponent)
        # Only show coefficient if it's not ~1
        if np.isclose(coeff, 1.0):
            coeff_str = ""
        else:
            coeff_str = f"{coeff:.1f}x"

        # Format exponent using Unicode superscripts
        superscripts = str.maketrans("-0123456789", "⁻⁰¹²³⁴⁵⁶⁷⁸⁹")
        exponent_str = str(exponent).translate(superscripts)

        return f"{coeff_str}10{exponent_str}"
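
# %%
# As a quick spot-check (an aside, not part of the original example), the
# formatter renders sample tick values like so:
fmt = CustomLogFormatter()
print(fmt(100))  # "10²"
print(fmt(0.5))  # "5.0x10⁻¹"
print(fmt(0))  # "0"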


def plot_performance_tradeoff(results, title):
    fig, ax = plt.subplots()
    markers = ["s", "o", "^", "x"]

    for idx, (name, result) in enumerate(results):
        test_error = -result["test_score"]
        mean_fit_time = np.mean(result["fit_time"])
        mean_score = np.mean(test_error)
        std_fit_time = np.std(result["fit_time"])
        std_score = np.std(test_error)

        # One marker per CV fold, plus a black marker with error bars at the mean
        ax.scatter(
            result["fit_time"],
            test_error,
            label=name,
            marker=markers[idx],
        )
        ax.scatter(
            mean_fit_time,
            mean_score,
            color="k",
            marker=markers[idx],
        )
        ax.errorbar(
            x=mean_fit_time,
            y=mean_score,
            yerr=std_score,
            c="k",
            capsize=2,
        )
        ax.errorbar(
            x=mean_fit_time,
            y=mean_score,
            xerr=std_fit_time,
            c="k",
            capsize=2,
        )

    ax.set_xscale("log")
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    log_start = np.log10(xlim[0])
    log_end = np.log10(xlim[1])

    nticks = 7
    ticks = np.logspace(log_start, log_end, nticks)
    ax.set_xticks(ticks)
    ax.get_xaxis().set_major_formatter(CustomLogFormatter())
    ax.tick_params(axis="x", which="minor", labelbottom=False)
    ax.minorticks_off()

    # Place an arrow pointing towards the lower-left corner, where both the
    # fit time and the test error are lowest
    log_xlim = np.log10(xlim)
    x_log_range = log_xlim[1] - log_xlim[0]
    y_range = ylim[1] - ylim[0]
    arrow_tip_x_log = log_xlim[0] + 0.05 * x_log_range
    arrow_tip_y = ylim[0] + 0.05 * y_range
    arrow_start_x_log = arrow_tip_x_log + 0.04 * x_log_range
    arrow_start_y = arrow_tip_y + 0.12 * y_range
    arrow_tip_x = np.power(10, arrow_tip_x_log)
    arrow_start_x = np.power(10, arrow_start_x_log)

    ax.annotate(
        " best\nmodels",
        xy=(arrow_tip_x, arrow_tip_y),
        xytext=(arrow_start_x, arrow_start_y),
        arrowprops=dict(arrowstyle="->", lw=1.5),
        fontsize=9,
    )

    ax.set_xlabel("Time to fit (seconds)")
    ax.set_ylabel("Mean Absolute Percentage Error")
    ax.set_title(title)
    ax.legend()
    plt.show()


plot_performance_tradeoff(results, "Gradient Boosting on Ames Housing")
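
# %%
# The candidate with the lowest mean test error can also be read off
# programmatically (an aside, not part of the original example):
best_name = min(results, key=lambda item: np.mean(-item[1]["test_score"]))[0]
print(f"Lowest mean MAPE: {best_name}")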

# %%
# We see that the model with one-hot-encoded data is by far the slowest. This
@@ -264,14 +332,21 @@
    histgradientboostingregressor__max_iter=15,
)

dropped_result = cross_validate(hist_dropped, X, y, **common_params)
one_hot_result = cross_validate(hist_one_hot, X, y, **common_params)
ordinal_result = cross_validate(hist_ordinal, X, y, **common_params)
native_result = cross_validate(hist_native, X, y, **common_params)
results_underfit = [
    ("Dropped", dropped_result),
    ("One Hot", one_hot_result),
    ("Ordinal", ordinal_result),
    ("Native", native_result),
]

# %%
plot_performance_tradeoff(
results_underfit, "Gradient Boosting on Ames Housing (few and shallow trees)"
)

# %%
# The results for these under-fitting models confirm our previous intuition: