Lagged features for time series forecasting
===========================================

- This example demonstrates how pandas-engineered lagged features can be used
+ This example demonstrates how Polars-engineered lagged features can be used
for time series forecasting with
:class:`~sklearn.ensemble.HistGradientBoostingRegressor` on the Bike Sharing
Demand dataset.

# Analyzing the Bike Sharing Demand dataset
# -----------------------------------------
#
- # We start by loading the data from the OpenML repository.
+ # We start by loading the data from the OpenML repository
+ # as a pandas dataframe. This will be replaced with Polars
+ # once `fetch_openml` adds native support for it.
+ # We convert to Polars for feature engineering, as it automatically caches
+ # common subexpressions which are reused in multiple expressions
+ # (like `pl.col("count").shift(1)` below). See
+ # https://docs.pola.rs/user-guide/lazy/optimizations/ for more information.
+
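As a rough illustration of that caching (a minimal, self-contained sketch on a toy frame, not taken from the example itself), a lazy query that repeats `pl.col("y").shift(1)` in two output expressions lets the optimizer evaluate the shared shift only once:

import polars as pl

# Toy frame with hypothetical values; both outputs reuse the same shifted column.
toy = pl.DataFrame({"y": [1, 2, 3, 4, 5]})
toy_lagged = (
    toy.lazy()
    .select(
        lag_1=pl.col("y").shift(1),
        lag_1_sq=pl.col("y").shift(1) ** 2,
    )
    .collect()
)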
import numpy as np
- import pandas as pd
+ import polars as pl

from sklearn.datasets import fetch_openml

+ pl.Config.set_fmt_str_lengths(20)
+
bike_sharing = fetch_openml(
    "Bike_Sharing_Demand", version=2, as_frame=True, parser="pandas"
)
df = bike_sharing.frame
+ df = pl.DataFrame({col: df[col].to_numpy() for col in df.columns})

# %%
# Next, we take a look at the statistical summary of the dataset
# so that we can better understand the data that we are working with.
- summary = pd.DataFrame(df.describe())
- summary = (
-     summary.style.background_gradient()
-     .set_table_attributes("style = 'display: inline'")
-     .set_caption("Statistics of the Dataset")
-     .set_table_styles([{"selector": "caption", "props": [("font-size", "16px")]}])
- )
+ import polars.selectors as cs
+
+ summary = df.select(cs.numeric()).describe()
summary

# %%


# %%
- # Generating pandas-engineered lagged features
+ # Generating Polars-engineered lagged features
# --------------------------------------------
# Let's consider the problem of predicting the demand at the
# next hour given past demands. Since the demand is a continuous
# variable, one could intuitively use any regression model. However, we do
# not have the usual `(X_train, y_train)` dataset. Instead, we just have
# the `y_train` demand data sequentially organized by time.
- count = df["count"]
- lagged_df = pd.concat(
-     [
-         count,
-         count.shift(1).rename("lagged_count_1h"),
-         count.shift(2).rename("lagged_count_2h"),
-         count.shift(3).rename("lagged_count_3h"),
-         count.shift(24).rename("lagged_count_1d"),
-         count.shift(24 + 1).rename("lagged_count_1d_1h"),
-         count.shift(7 * 24).rename("lagged_count_7d"),
-         count.shift(7 * 24 + 1).rename("lagged_count_7d_1h"),
-         count.shift(1).rolling(24).mean().rename("lagged_mean_24h"),
-         count.shift(1).rolling(24).max().rename("lagged_max_24h"),
-         count.shift(1).rolling(24).min().rename("lagged_min_24h"),
-         count.shift(1).rolling(7 * 24).mean().rename("lagged_mean_7d"),
-         count.shift(1).rolling(7 * 24).max().rename("lagged_max_7d"),
-         count.shift(1).rolling(7 * 24).min().rename("lagged_min_7d"),
-     ],
-     axis="columns",
+ lagged_df = df.select(
+     "count",
+     *[pl.col("count").shift(i).alias(f"lagged_count_{i}h") for i in [1, 2, 3]],
+     lagged_count_1d=pl.col("count").shift(24),
+     lagged_count_1d_1h=pl.col("count").shift(24 + 1),
+     lagged_count_7d=pl.col("count").shift(7 * 24),
+     lagged_count_7d_1h=pl.col("count").shift(7 * 24 + 1),
+     lagged_mean_24h=pl.col("count").shift(1).rolling_mean(24),
+     lagged_max_24h=pl.col("count").shift(1).rolling_max(24),
+     lagged_min_24h=pl.col("count").shift(1).rolling_min(24),
+     lagged_mean_7d=pl.col("count").shift(1).rolling_mean(7 * 24),
+     lagged_max_7d=pl.col("count").shift(1).rolling_max(7 * 24),
+     lagged_min_7d=pl.col("count").shift(1).rolling_min(7 * 24),
)
lagged_df.tail(10)
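To make the role of `shift` concrete, a minimal sketch on a hypothetical three-hour series (not part of the dataset above): the lagged column at hour `t` holds the demand observed at hour `t - 1`, with a null where no earlier observation exists.

import polars as pl

toy = pl.DataFrame({"count": [3, 5, 8]})  # hypothetical hourly counts
toy.select("count", lagged_count_1h=pl.col("count").shift(1))
# count: [3, 5, 8] -> lagged_count_1h: [null, 3, 5]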

# %%
# We can now separate the lagged features in a matrix `X` and the target variable
# (the counts to predict) in an array of the same first dimension `y`.
- lagged_df = lagged_df.dropna()
- X = lagged_df.drop("count", axis="columns")
+ lagged_df = lagged_df.drop_nulls()
+ X = lagged_df.drop("count")
y = lagged_df["count"]
print("X shape: {}\ny shape: {}".format(X.shape, y.shape))

# %%
# Training the model and evaluating its performance based on MAPE.
train_idx, test_idx = all_splits[0]
- X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
- y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
+ X_train, X_test = X[train_idx, :], X[test_idx, :]
+ y_train, y_test = y[train_idx], y[test_idx]

model = HistGradientBoostingRegressor().fit(X_train, y_train)
y_pred = model.predict(X_test)
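The MAPE computation itself is not part of this hunk; as a minimal sketch, assuming scikit-learn's built-in metric is used as the comment above suggests:

from sklearn.metrics import mean_absolute_percentage_error

# Mean absolute percentage error of the one-step-ahead forecasts on the test split.
mean_absolute_percentage_error(y_test, y_pred)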
@@ -258,41 +258,32 @@ def consolidate_scores(cv_results, scores, metric):
metric = key.split("test_")[1]
scores = consolidate_scores(cv_results, scores, metric)

- df = pd.DataFrame(scores)
-
- styled_df_copy = df.copy()
-
-
- def extract_numeric(value):
-     parts = value.split("±")
-     mean_value = float(parts[0])
-     std_value = float(parts[1].split()[0])
-
-     return mean_value, std_value
+ scores_df = pl.DataFrame(scores)
+ scores_df


- # Convert columns containing "±" to tuples of numerical values
- cols_to_convert = df.columns[1:]  # Exclude the "loss" column
- for col in cols_to_convert:
-     df[col] = df[col].apply(extract_numeric)
-
- min_values = df.min()
-
- # Create a mask for highlighting minimum values
- mask = pd.DataFrame("", index=df.index, columns=df.columns)
- for col in cols_to_convert:
-     mask[col] = df[col].apply(
-         lambda x: "font-weight: bold" if x == min_values[col] else ""
-     )
-
- styled_df_copy = styled_df_copy.style.apply(lambda x: mask, axis=None)
- styled_df_copy
-
+ # %%
+ # Let us take a look at the losses that minimize each metric.
+ def min_arg(col):
+     col_split = pl.col(col).str.split(" ")
+     arg_min = pl.arg_sort_by(
+         col_split.list.get(0).cast(pl.Float64),
+         col_split.list.get(2).cast(pl.Float64),
+     ).first()
+     return arg_min
+
+
+ scores_df.select(
+     pl.col("loss").get(min_arg(col_name)).alias(col_name)
+     for col_name in scores_df.columns
+     if col_name != "loss"
+ )
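As a rough usage check of the selection above (hypothetical score strings): `min_arg` splits each `"mean ± std"` string, orders rows by mean and then by std, and returns the index of the smallest entry, which is then used to pick the corresponding loss name.

import polars as pl

toy_scores = pl.DataFrame(
    {
        "loss": ["squared_error", "absolute_error"],
        "MAPE": ["0.50 ± 0.01", "0.40 ± 0.02"],
    }
)
# Resolves to row 1, i.e. the loss with the smallest mean MAPE: "absolute_error".
toy_scores.select(pl.col("loss").get(min_arg("MAPE")))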

# %%
- # Even if the score distributions overlap due to the variance in the dataset, it
- # is true that the average RMSE is lower when `loss="squared_error"`, whereas
- # the average MAPE is lower when `loss="absolute_error"` as expected. That is
+ # Even if the score distributions overlap due to the variance in the dataset,
+ # it is true that the average RMSE is lower when `loss="squared_error"`, whereas
+ # the average MAPE is lower when `loss="absolute_error"` as expected
+ # (the same is true for the quantile 50). That is
# also the case for the Mean Pinball Loss with the quantiles 5 and 95. The score
# corresponding to the 50 quantile loss is overlapping with the score obtained
# by minimizing other loss functions, which is also the case for the MAE.
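For reference, the pinball loss behind these quantile scores is asymmetric: with a small `alpha` it charges far more for over-prediction than for under-prediction. A minimal sketch with scikit-learn's metric and hypothetical arrays:

import numpy as np
from sklearn.metrics import mean_pinball_loss

y_true = np.array([10.0, 20.0, 30.0])
y_low = np.array([5.0, 12.0, 21.0])  # hypothetical 5th-percentile predictions

# alpha=0.05: under-predictions cost 0.05 per unit, over-predictions 0.95 per unit.
mean_pinball_loss(y_true, y_low, alpha=0.05)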
@@ -304,8 +295,8 @@ def extract_numeric(value):
all_splits = list(ts_cv.split(X, y))
train_idx, test_idx = all_splits[0]

- X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
- y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
+ X_train, X_test = X[train_idx, :], X[test_idx, :]
+ y_train, y_test = y[train_idx], y[test_idx]

max_iter = 50
gbrt_mean_poisson = HistGradientBoostingRegressor(loss="poisson", max_iter=max_iter)
@@ -336,7 +327,7 @@ def extract_numeric(value):
fig, ax = plt.subplots(figsize=(15, 7))
plt.title("Predictions by regression models")
ax.plot(
-     y_test.values[last_hours],
+     y_test[last_hours],
    "x-",
    alpha=0.2,
    label="Actual demand",
@@ -399,7 +390,7 @@ def extract_numeric(value):
]
for ax, pred, label in zip(axes, predictions, labels):
    PredictionErrorDisplay.from_predictions(
-         y_true=y_test.values,
+         y_true=y_test,
        y_pred=pred,
        kind="residual_vs_predicted",
        scatter_kwargs={"alpha": 0.3},