@@ -13,6 +13,7 @@
- "One Hot": using a :class:`~preprocessing.OneHotEncoder`;
- "Ordinal": using an :class:`~preprocessing.OrdinalEncoder`, treating
  categories as ordered, equidistant quantities;
+ - "Target": using a :class:`~preprocessing.TargetEncoder`;
- "Native": relying on the :ref:`native category support
  <categorical_support_gbdt>` of the
  :class:`~ensemble.HistGradientBoostingRegressor` estimator.
@@ -142,6 +143,38 @@
)
hist_ordinal

+ # %%
+ # Gradient boosting estimator with target encoding
+ # ------------------------------------------------
+ # Another possibility is to use the :class:`~preprocessing.TargetEncoder`, which
+ # encodes each category with a smoothed mean of the (training) target variable
+ # over the samples in that category, i.e. a shrunk version of
+ # `np.mean(y, axis=0)` computed per category (a toy sketch of the smoothing
+ # follows the list below):
+ #
+ # - in regression, it uses the mean of `y`;
+ # - in binary classification, the positive-class rate;
+ # - in multiclass, a vector of class rates (one per class).
+ #
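+
+ # %%
+ # To make the smoothing concrete, here is a toy sketch of the shrinkage idea
+ # (deliberately not :class:`~preprocessing.TargetEncoder`'s exact formula; the
+ # weight `m` and the toy data are arbitrary choices for this illustration):
+ # each per-category mean is pulled towards the global target mean, so rare
+ # categories fall back to global statistics.
+ import pandas as pd
+
+ toy = pd.DataFrame({"cat": ["a", "a", "a", "b"], "y": [10.0, 12.0, 14.0, 100.0]})
+ global_mean, m = toy["y"].mean(), 2.0  # `m` plays the role of the smoothing
+ stats = toy.groupby("cat")["y"].agg(["mean", "count"])
+ # rare category "b" (one sample) is pulled strongly towards the global mean:
+ (stats["count"] * stats["mean"] + m * global_mean) / (stats["count"] + m)
+
+ # %%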
+ # For each category, it computes these target averages using :term:`cross
+ # fitting`: the training data are split into folds, and the averages computed
+ # on all folds but one are used to encode the held-out fold. This way, each
+ # sample is encoded using statistics computed on data it was not part of,
+ # preventing information leakage from the target (a quick check of this
+ # behaviour follows the pipeline definition below).
+
+ from sklearn.preprocessing import TargetEncoder
+
+ target_encoder = make_column_transformer(
+     (
+         TargetEncoder(target_type="continuous", random_state=42),
+         make_column_selector(dtype_include="category"),
+     ),
+     remainder="passthrough",
+ )
+
+ hist_target = make_pipeline(
+     target_encoder, HistGradientBoostingRegressor(random_state=42)
+ )
+ hist_target
+
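+ # %%
+ # A quick check of the cross fitting on made-up toy data: `fit_transform(X, y)`
+ # uses out-of-fold encodings, so it does not match `fit(X, y).transform(X)`,
+ # which applies encodings learned on the full training set.
+ import numpy as np
+
+ enc = TargetEncoder(target_type="continuous", cv=2, random_state=0)
+ X_toy = np.array([["a"], ["a"], ["b"], ["b"], ["a"], ["b"]], dtype=object)  # toy data
+ y_toy = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+ np.allclose(
+     enc.fit_transform(X_toy, y_toy), enc.fit(X_toy, y_toy).transform(X_toy)
+ )
+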
# %%
# Gradient boosting estimator with native categorical support
# -----------------------------------------------------------
@@ -184,11 +217,13 @@
dropped_result = cross_validate(hist_dropped, X, y, **common_params)
one_hot_result = cross_validate(hist_one_hot, X, y, **common_params)
ordinal_result = cross_validate(hist_ordinal, X, y, **common_params)
+ target_result = cross_validate(hist_target, X, y, **common_params)
native_result = cross_validate(hist_native, X, y, **common_params)
results = [
    ("Dropped", dropped_result),
    ("One Hot", one_hot_result),
    ("Ordinal", ordinal_result),
+     ("Target", target_result),
    ("Native", native_result),
]
@@ -199,7 +234,7 @@
def plot_performance_tradeoff(results, title):
    fig, ax = plt.subplots()
-     markers = ["s", "o", "^", "x"]
+     markers = ["s", "o", "^", "x", "D"]

    for idx, (name, result) in enumerate(results):
        test_error = -result["test_score"]
@@ -246,9 +281,9 @@ def plot_performance_tradeoff(results, title):

    ax.annotate(
        " best\n models",
-         xy=(0.05, 0.05),
+         xy=(0.04, 0.04),
        xycoords="axes fraction",
-         xytext=(0.1, 0.15),
+         xytext=(0.09, 0.14),
        textcoords="axes fraction",
        arrowprops=dict(arrowstyle="->", lw=1.5),
    )
@@ -276,9 +311,13 @@ def plot_performance_tradeoff(results, title):
# number of categories is small, and this may not always be reflected in
# practice.
#
+ # The time required to fit the `TargetEncoder` pipeline also depends on the
+ # cross fitting parameter `cv`, as additional splits come at a computational
+ # cost (a lower-`cv` variant is sketched after the next paragraph).
+ #
# In terms of prediction performance, dropping the categorical features leads to
- # the worst performance. The three models that use categorical features have
- # comparable error rates, with a slight edge for the native handling.
+ # the worst performance. The four models that make use of the categorical
+ # features have comparable error rates, with a slight edge for the native
+ # handling.
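+
+ # %%
+ # As an aside (a hypothetical variant, not evaluated in this example), the
+ # cross-fitting cost at fit time can be reduced by lowering `cv`; each fold's
+ # encodings are then computed on a smaller fraction of the training data:
+ TargetEncoder(target_type="continuous", cv=3, random_state=42)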

# %%
# Limiting the number of splits
@@ -291,18 +330,18 @@ def plot_performance_tradeoff(results, title):
#
# This is also true when categories are treated as ordinal quantities: if
# categories are `A..F` and the best split is `ACF - BDE`, the one-hot-encoder
- # model will need 3 split points (one per category in the left node), and the
- # ordinal non-native model will need 4 splits: 1 split to isolate `A`, 1 split
+ # model would need 3 split points (one per category in the left node), and the
+ # ordinal non-native model would need 4 splits: 1 split to isolate `A`, 1 split
# to isolate `F`, and 2 splits to isolate `C` from `BCDE` (this counting is
# made explicit in the check below).
#
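+
+ # %%
+ # A tiny pure-Python check of this counting (an illustration only, not part of
+ # the benchmark): along the ordinal axis, a tree needs one cut wherever
+ # membership in the left group changes.
+ order = "ABCDEF"
+ left = set("ACF")
+ # counts the boundaries A|B, B|C, C|D and E|F, i.e. 4 splits:
+ sum((order[i] in left) != (order[i + 1] in left) for i in range(len(order) - 1))
+
+ # %%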
- # How strongly the models' performances differ in practice will depend on the
+ # How strongly the models' performances differ in practice depends on the
# dataset and on the flexibility of the trees.
#
# To see this, let us re-run the same analysis with under-fitting models where
# we artificially limit the total number of splits by both limiting the number
# of trees and the depth of each tree.

- for pipe in (hist_dropped, hist_one_hot, hist_ordinal, hist_native):
+ for pipe in (hist_dropped, hist_one_hot, hist_ordinal, hist_target, hist_native):
    if pipe is hist_native:
        # The native model does not use a pipeline, so we can set the
        # parameters directly.
@@ -316,11 +355,13 @@ def plot_performance_tradeoff(results, title):
dropped_result = cross_validate(hist_dropped, X, y, **common_params)
one_hot_result = cross_validate(hist_one_hot, X, y, **common_params)
ordinal_result = cross_validate(hist_ordinal, X, y, **common_params)
+ target_result = cross_validate(hist_target, X, y, **common_params)
native_result = cross_validate(hist_native, X, y, **common_params)
results_underfit = [
    ("Dropped", dropped_result),
    ("One Hot", one_hot_result),
    ("Ordinal", ordinal_result),
+     ("Target", target_result),
    ("Native", native_result),
]
@@ -332,7 +373,7 @@ def plot_performance_tradeoff(results, title):
# %%
# The results for these underfitting models confirm our previous intuition: the
# native category handling strategy performs the best when the splitting budget
- # is constrained. The two explicit encoding strategies (one-hot and ordinal
- # encoding) lead to slightly larger errors than the estimator's native handling,
- # but still perform better than the baseline model that just dropped the
- # categorical features altogether.
+ # is constrained. The three explicit encoding strategies (one-hot, ordinal and
+ # target encoding) lead to slightly larger errors than the estimator's native
+ # handling, but still perform better than the baseline model that just dropped
+ # the categorical features altogether.