Skip to content

Commit d7a1144

Browse files
authored
DOC improve example on cyclic feature engineering (scikit-learn#27501)
1 parent fa84c90 commit d7a1144

File tree

1 file changed

+42
-15
lines changed

1 file changed

+42
-15
lines changed

examples/applications/plot_cyclical_feature_engineering.py

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -61,14 +61,14 @@
6161
# .. note::
6262
#
6363
# The fit method of the models used in this notebook all minimize the
64-
# mean squared error to estimate the conditional mean instead of the mean
65-
# absolute error that would fit an estimator of the conditional median.
66-
#
67-
# When reporting performance measure on the test set in the discussion, we
68-
# instead choose to focus on the mean absolute error that is more
69-
# intuitive than the (root) mean squared error. Note, however, that the
70-
# best models for one metric are also the best for the other in this
71-
# study.
64+
# mean squared error to estimate the conditional mean.
65+
# The absolute error, however, would estimate the conditional median.
66+
#
67+
# Nevertheless, when reporting performance measures on the test set in
68+
# the discussion, we choose to focus on the mean absolute error instead
69+
# of the (root) mean squared error because it is more intuitive to
70+
# interpret. Note, however, that in this study the best models for one
71+
# metric are also the best ones in terms of the other metric.
7272
y = df["count"] / df["count"].max()
7373

7474
# %%
@@ -170,7 +170,10 @@
170170
# efficiently handle heterogeneous tabular data with a mix of categorical and
171171
# numerical features as long as the number of samples is large enough.
172172
#
173-
# Here, we do minimal ordinal encoding for the categorical variables and then
173+
# Here, we use the modern
174+
# :class:`~sklearn.ensemble.HistGradientBoostingRegressor` with native support
175+
# for categorical features. Therefore, we only do minimal ordinal encoding for
176+
# the categorical variables and then
174177
# let the model know that it should treat those as categorical variables by
175178
# using a dedicated tree splitting rule. Since we use an ordinal encoder, we
176179
# pass the list of categorical values explicitly to use a logical order when
@@ -213,6 +216,9 @@
213216
verbose_feature_names_out=False,
214217
),
215218
HistGradientBoostingRegressor(
219+
max_iter=300,
220+
early_stopping=True,
221+
validation_fraction=0.1,
216222
categorical_features=categorical_columns,
217223
random_state=42,
218224
),
@@ -222,16 +228,26 @@
222228
#
223229
# Let's evaluate our gradient boosting model with the mean absolute error of the
224230
# relative demand averaged across our 5 time-based cross-validation splits:
231+
import numpy as np
225232

226233

227-
def evaluate(model, X, y, cv):
234+
def evaluate(model, X, y, cv, model_prop=None, model_step=None):
228235
cv_results = cross_validate(
229236
model,
230237
X,
231238
y,
232239
cv=cv,
233240
scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error"],
241+
return_estimator=model_prop is not None,
234242
)
243+
if model_prop is not None:
244+
if model_step is not None:
245+
values = [
246+
getattr(m[model_step], model_prop) for m in cv_results["estimator"]
247+
]
248+
else:
249+
values = [getattr(m, model_prop) for m in cv_results["estimator"]]
250+
print(f"Mean model.{model_prop} = {np.mean(values)}")
235251
mae = -cv_results["test_neg_mean_absolute_error"]
236252
rmse = -cv_results["test_neg_root_mean_squared_error"]
237253
print(
@@ -240,9 +256,18 @@ def evaluate(model, X, y, cv):
240256
)
241257

242258

243-
evaluate(gbrt_pipeline, X, y, cv=ts_cv)
259+
evaluate(
260+
gbrt_pipeline,
261+
X,
262+
y,
263+
cv=ts_cv,
264+
model_prop="n_iter_",
265+
model_step="histgradientboostingregressor",
266+
)
244267

245268
# %%
269+
# We see that we set `max_iter` large enough for early stopping to take place.
270+
#
246271
# This model has an average error around 4 to 5% of the maximum demand. This is
247272
# quite good for a first trial without any hyper-parameter tuning! We just had
248273
# to make the categorical variables explicit. Note that the time related
@@ -258,10 +283,8 @@ def evaluate(model, X, y, cv):
258283
#
259284
# As usual for linear models, categorical variables need to be one-hot encoded.
260285
# For consistency, we scale the numerical features to the same 0-1 range using
261-
# class:`sklearn.preprocessing.MinMaxScaler`, although in this case it does not
286+
# :class:`~sklearn.preprocessing.MinMaxScaler`, although in this case it does not
262287
# impact the results much because they are already on comparable scales:
263-
import numpy as np
264-
265288
from sklearn.linear_model import RidgeCV
266289
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
267290

@@ -278,10 +301,14 @@ def evaluate(model, X, y, cv):
278301
)
279302

280303

281-
evaluate(naive_linear_pipeline, X, y, cv=ts_cv)
304+
evaluate(
305+
naive_linear_pipeline, X, y, cv=ts_cv, model_prop="alpha_", model_step="ridgecv"
306+
)
282307

283308

284309
# %%
310+
# It is reassuring to see that the selected `alpha_` lies within our specified
311+
# range.
285312
#
286313
# The performance is not good: the average error is around 14% of the maximum
287314
# demand. This is more than three times higher than the average error of the

0 commit comments

Comments
 (0)