Skip to content

Commit 2bcfd2e

Browse files
ArturoAmorQ, thomasjpfan, and virchan
authored
DOC Add TargetEncoder to Categorical Feature Support example (#32019)
Co-authored-by: ArturoAmorQ <arturo.amor-quiroz@polytechnique.edu> Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com> Co-authored-by: Virgil Chan <virchan.math@gmail.com>
1 parent ef4885f commit 2bcfd2e

File tree

1 file changed

+54
-13
lines changed

1 file changed

+54
-13
lines changed

examples/ensemble/plot_gradient_boosting_categorical.py

Lines changed: 54 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
- "One Hot": using a :class:`~preprocessing.OneHotEncoder`;
1414
- "Ordinal": using an :class:`~preprocessing.OrdinalEncoder`, treating
1515
categories as ordered, equidistant quantities;
16+
- "Target": using a :class:`~preprocessing.TargetEncoder`;
1617
- "Native": relying on the :ref:`native category support
1718
<categorical_support_gbdt>` of the
1819
:class:`~ensemble.HistGradientBoostingRegressor` estimator.
@@ -142,6 +143,38 @@
142143
)
143144
hist_ordinal
144145

146+
# %%
147+
# Gradient boosting estimator with target encoding
148+
# ------------------------------------------------
149+
# Another possibility is to use the :class:`~preprocessing.TargetEncoder`, which
150+
# encodes each category using a smoothed mean of the (training) target
151+
# variable, i.e. a regularized `np.mean(y, axis=0)` per category:
152+
#
153+
# - in regression it uses the mean of `y`;
154+
# - in binary classification, the positive-class rate;
155+
# - in multiclass, a vector of class rates (one per class).
156+
#
157+
# For each category, it computes these target averages using :term:`cross
158+
# fitting`, meaning that the training data are split into folds: in each fold
159+
# the averages are calculated only on a subset of data and then applied to the
160+
# held-out part. This way, each sample is encoded using statistics from data it
161+
# was not part of, preventing information leakage from the target.
162+
163+
from sklearn.preprocessing import TargetEncoder
164+
165+
target_encoder = make_column_transformer(
166+
(
167+
TargetEncoder(target_type="continuous", random_state=42),
168+
make_column_selector(dtype_include="category"),
169+
),
170+
remainder="passthrough",
171+
)
172+
173+
hist_target = make_pipeline(
174+
target_encoder, HistGradientBoostingRegressor(random_state=42)
175+
)
176+
hist_target
177+
145178
# %%
146179
# Gradient boosting estimator with native categorical support
147180
# -----------------------------------------------------------
@@ -184,11 +217,13 @@
184217
dropped_result = cross_validate(hist_dropped, X, y, **common_params)
185218
one_hot_result = cross_validate(hist_one_hot, X, y, **common_params)
186219
ordinal_result = cross_validate(hist_ordinal, X, y, **common_params)
220+
target_result = cross_validate(hist_target, X, y, **common_params)
187221
native_result = cross_validate(hist_native, X, y, **common_params)
188222
results = [
189223
("Dropped", dropped_result),
190224
("One Hot", one_hot_result),
191225
("Ordinal", ordinal_result),
226+
("Target", target_result),
192227
("Native", native_result),
193228
]
194229

@@ -199,7 +234,7 @@
199234

200235
def plot_performance_tradeoff(results, title):
201236
fig, ax = plt.subplots()
202-
markers = ["s", "o", "^", "x"]
237+
markers = ["s", "o", "^", "x", "D"]
203238

204239
for idx, (name, result) in enumerate(results):
205240
test_error = -result["test_score"]
@@ -246,9 +281,9 @@ def plot_performance_tradeoff(results, title):
246281

247282
ax.annotate(
248283
" best\nmodels",
249-
xy=(0.05, 0.05),
284+
xy=(0.04, 0.04),
250285
xycoords="axes fraction",
251-
xytext=(0.1, 0.15),
286+
xytext=(0.09, 0.14),
252287
textcoords="axes fraction",
253288
arrowprops=dict(arrowstyle="->", lw=1.5),
254289
)
@@ -276,9 +311,13 @@ def plot_performance_tradeoff(results, title):
276311
# number of categories is small, and this may not always be reflected in
277312
# practice.
278313
#
314+
# The time required to fit when using the `TargetEncoder` depends on the
315+
# cross fitting parameter `cv`, as adding splits comes at a computational cost.
316+
#
279317
# In terms of prediction performance, dropping the categorical features leads to
280-
# the worst performance. The three models that use categorical features have
281-
# comparable error rates, with a slight edge for the native handling.
318+
# the worst performance. The four models that make use of the categorical
319+
# features have comparable error rates, with a slight edge for the native
320+
# handling.
282321

283322
# %%
284323
# Limiting the number of splits
@@ -291,18 +330,18 @@ def plot_performance_tradeoff(results, title):
291330
#
292331
# This is also true when categories are treated as ordinal quantities: if
293332
# categories are `A..F` and the best split is `ACF - BDE` the one-hot-encoder
294-
# model will need 3 split points (one per category in the left node), and the
295-
# ordinal non-native model will need 4 splits: 1 split to isolate `A`, 1 split
333+
# model would need 3 split points (one per category in the left node), and the
334+
# ordinal non-native model would need 4 splits: 1 split to isolate `A`, 1 split
296335
# to isolate `F`, and 2 splits to isolate `C` from `BCDE`.
297336
#
298-
# How strongly the models' performances differ in practice will depend on the
337+
# How strongly the models' performances differ in practice depends on the
299338
# dataset and on the flexibility of the trees.
300339
#
301340
# To see this, let us re-run the same analysis with under-fitting models where
302341
# we artificially limit the total number of splits by both limiting the number
303342
# of trees and the depth of each tree.
304343

305-
for pipe in (hist_dropped, hist_one_hot, hist_ordinal, hist_native):
344+
for pipe in (hist_dropped, hist_one_hot, hist_ordinal, hist_target, hist_native):
306345
if pipe is hist_native:
307346
# The native model does not use a pipeline so, we can set the parameters
308347
# directly.
@@ -316,11 +355,13 @@ def plot_performance_tradeoff(results, title):
316355
dropped_result = cross_validate(hist_dropped, X, y, **common_params)
317356
one_hot_result = cross_validate(hist_one_hot, X, y, **common_params)
318357
ordinal_result = cross_validate(hist_ordinal, X, y, **common_params)
358+
target_result = cross_validate(hist_target, X, y, **common_params)
319359
native_result = cross_validate(hist_native, X, y, **common_params)
320360
results_underfit = [
321361
("Dropped", dropped_result),
322362
("One Hot", one_hot_result),
323363
("Ordinal", ordinal_result),
364+
("Target", target_result),
324365
("Native", native_result),
325366
]
326367

@@ -332,7 +373,7 @@ def plot_performance_tradeoff(results, title):
332373
# %%
333374
# The results for these underfitting models confirm our previous intuition: the
334375
# native category handling strategy performs the best when the splitting budget
335-
# is constrained. The two explicit encoding strategies (one-hot and ordinal
336-
# encoding) lead to slightly larger errors than the estimator's native handling,
337-
# but still perform better than the baseline model that just dropped the
338-
# categorical features altogether.
376+
# is constrained. The three explicit encoding strategies (one-hot, ordinal and
377+
# target encoding) lead to slightly larger errors than the estimator's native
378+
# handling, but still perform better than the baseline model that just dropped
379+
# the categorical features altogether.

0 commit comments

Comments
 (0)