Commit a7b5a9b

raisa committed: use Polars in "plot time series lagged features" example
1 parent f59c503 commit a7b5a9b

File tree

1 file changed: +60 -69 lines

examples/applications/plot_time_series_lagged_features.py

Lines changed: 60 additions & 69 deletions
@@ -3,7 +3,7 @@
 Lagged features for time series forecasting
 ===========================================

-This example demonstrates how pandas-engineered lagged features can be used
+This example demonstrates how Polars-engineered lagged features can be used
 for time series forecasting with
 :class:`~sklearn.ensemble.HistGradientBoostingRegressor` on the Bike Sharing
 Demand dataset.
@@ -19,27 +19,33 @@
 # Analyzing the Bike Sharing Demand dataset
 # -----------------------------------------
 #
-# We start by loading the data from the OpenML repository.
+# We start by loading the data from the OpenML repository
+# as a pandas dataframe. This will be replaced with Polars
+# once `fetch_openml` adds native support for it.
+# We convert to Polars for feature engineering, as it automatically caches
+# common subexpressions which are reused in multiple expressions
+# (like `pl.col("count").shift(1)` below). See
+# https://docs.pola.rs/user-guide/lazy/optimizations/ for more information.
+
 import numpy as np
-import pandas as pd
+import polars as pl

 from sklearn.datasets import fetch_openml

+pl.Config.set_fmt_str_lengths(20)
+
 bike_sharing = fetch_openml(
     "Bike_Sharing_Demand", version=2, as_frame=True, parser="pandas"
 )
 df = bike_sharing.frame
+df = pl.DataFrame({col: df[col].to_numpy() for col in df.columns})

 # %%
 # Next, we take a look at the statistical summary of the dataset
 # so that we can better understand the data that we are working with.
-summary = pd.DataFrame(df.describe())
-summary = (
-    summary.style.background_gradient()
-    .set_table_attributes("style = 'display: inline'")
-    .set_caption("Statistics of the Dataset")
-    .set_table_styles([{"selector": "caption", "props": [("font-size", "16px")]}])
-)
+import polars.selectors as cs
+
+summary = df.select(cs.numeric()).describe()
 summary

 # %%
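For readers following along, here is a minimal sketch (not part of the commit) of the pandas-to-Polars hand-off used above, on a small made-up frame instead of the OpenML download:

import pandas as pd
import polars as pl
import polars.selectors as cs

# Toy stand-in for bike_sharing.frame (column names and values are made up).
pdf = pd.DataFrame({"count": [16, 40, 32], "temp": [9.84, 9.02, 9.02]})

# Same column-by-column conversion as in the diff above.
pldf = pl.DataFrame({col: pdf[col].to_numpy() for col in pdf.columns})

# Numeric summary, analogous to `df.select(cs.numeric()).describe()` in the change.
print(pldf.select(cs.numeric()).describe())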
@@ -52,32 +58,26 @@


 # %%
-# Generating pandas-engineered lagged features
+# Generating Polars-engineered lagged features
 # --------------------------------------------
 # Let's consider the problem of predicting the demand at the
 # next hour given past demands. Since the demand is a continuous
 # variable, one could intuitively use any regression model. However, we do
 # not have the usual `(X_train, y_train)` dataset. Instead, we just have
 # the `y_train` demand data sequentially organized by time.
-count = df["count"]
-lagged_df = pd.concat(
-    [
-        count,
-        count.shift(1).rename("lagged_count_1h"),
-        count.shift(2).rename("lagged_count_2h"),
-        count.shift(3).rename("lagged_count_3h"),
-        count.shift(24).rename("lagged_count_1d"),
-        count.shift(24 + 1).rename("lagged_count_1d_1h"),
-        count.shift(7 * 24).rename("lagged_count_7d"),
-        count.shift(7 * 24 + 1).rename("lagged_count_7d_1h"),
-        count.shift(1).rolling(24).mean().rename("lagged_mean_24h"),
-        count.shift(1).rolling(24).max().rename("lagged_max_24h"),
-        count.shift(1).rolling(24).min().rename("lagged_min_24h"),
-        count.shift(1).rolling(7 * 24).mean().rename("lagged_mean_7d"),
-        count.shift(1).rolling(7 * 24).max().rename("lagged_max_7d"),
-        count.shift(1).rolling(7 * 24).min().rename("lagged_min_7d"),
-    ],
-    axis="columns",
+lagged_df = df.select(
+    "count",
+    *[pl.col("count").shift(i).alias(f"lagged_count_{i}h") for i in [1, 2, 3]],
+    lagged_count_1d=pl.col("count").shift(24),
+    lagged_count_1d_1h=pl.col("count").shift(24 + 1),
+    lagged_count_7d=pl.col("count").shift(7 * 24),
+    lagged_count_7d_1h=pl.col("count").shift(7 * 24 + 1),
+    lagged_mean_24h=pl.col("count").shift(1).rolling_mean(24),
+    lagged_max_24h=pl.col("count").shift(1).rolling_max(24),
+    lagged_min_24h=pl.col("count").shift(1).rolling_min(24),
+    lagged_mean_7d=pl.col("count").shift(1).rolling_mean(7 * 24),
+    lagged_max_7d=pl.col("count").shift(1).rolling_max(7 * 24),
+    lagged_min_7d=pl.col("count").shift(1).rolling_min(7 * 24),
 )
 lagged_df.tail(10)

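As a quick illustration (not part of the commit), the Polars expressions above behave as follows on a tiny made-up series: `shift(k)` lags the column by `k` rows, and `shift(1).rolling_mean(w)` averages the `w` previous values, never the current one, so no target information leaks into the features.

import polars as pl

toy = pl.DataFrame({"count": [1, 2, 3, 4, 5, 6]})
print(
    toy.select(
        "count",
        lagged_count_1h=pl.col("count").shift(1),
        lagged_mean_3h=pl.col("count").shift(1).rolling_mean(3),
    )
)
# The leading rows are null because there is not enough history yet, which is
# why the example later calls drop_nulls() before building X and y.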
@@ -89,8 +89,8 @@
 # %%
 # We can now separate the lagged features in a matrix `X` and the target variable
 # (the counts to predict) in an array of the same first dimension `y`.
-lagged_df = lagged_df.dropna()
-X = lagged_df.drop("count", axis="columns")
+lagged_df = lagged_df.drop_nulls()
+X = lagged_df.drop("count")
 y = lagged_df["count"]
 print("X shape: {}\ny shape: {}".format(X.shape, y.shape))

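A minimal sketch (not from the commit) of the Polars calls that replace the pandas ones here: `drop_nulls()` removes any row containing a null, `drop("count")` removes the target column, and `lagged_df["count"]` returns a Series, so `X` and `y` stay aligned row by row. The miniature frame below is made up.

import polars as pl

toy = pl.DataFrame({"count": [2.0, 3.0, 4.0], "lagged_count_1h": [None, 2.0, 3.0]})
toy = toy.drop_nulls()  # the first row has a null lag, so it is removed
X = toy.drop("count")
y = toy["count"]
print(X.shape, y.shape)  # (2, 1) (2,)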
@@ -141,8 +141,8 @@
 # %%
 # Training the model and evaluating its performance based on MAPE.
 train_idx, test_idx = all_splits[0]
-X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
-y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
+X_train, X_test = X[train_idx, :], X[test_idx, :]
+y_train, y_test = y[train_idx], y[test_idx]

 model = HistGradientBoostingRegressor().fit(X_train, y_train)
 y_pred = model.predict(X_test)
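For context (not part of the commit), Polars frames and series accept integer index arrays directly, which is what lets the `.iloc` calls be dropped when slicing the cross-validation folds. The toy data and index arrays below are made up stand-ins for the output of `TimeSeriesSplit`.

import numpy as np
import polars as pl

X = pl.DataFrame({"lagged_count_1h": [1, 2, 3, 4], "lagged_count_2h": [0, 1, 2, 3]})
y = pl.Series("count", [2, 3, 4, 5])

# Stand-ins for the index arrays produced by TimeSeriesSplit.
train_idx, test_idx = np.array([0, 1, 2]), np.array([3])

X_train, X_test = X[train_idx, :], X[test_idx, :]
y_train, y_test = y[train_idx], y[test_idx]
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)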
@@ -258,41 +258,32 @@ def consolidate_scores(cv_results, scores, metric):
             metric = key.split("test_")[1]
             scores = consolidate_scores(cv_results, scores, metric)

-df = pd.DataFrame(scores)
-
-styled_df_copy = df.copy()
-
-
-def extract_numeric(value):
-    parts = value.split("±")
-    mean_value = float(parts[0])
-    std_value = float(parts[1].split()[0])
-
-    return mean_value, std_value
+scores_df = pl.DataFrame(scores)
+scores_df


-# Convert columns containing "±" to tuples of numerical values
-cols_to_convert = df.columns[1:]  # Exclude the "loss" column
-for col in cols_to_convert:
-    df[col] = df[col].apply(extract_numeric)
-
-min_values = df.min()
-
-# Create a mask for highlighting minimum values
-mask = pd.DataFrame("", index=df.index, columns=df.columns)
-for col in cols_to_convert:
-    mask[col] = df[col].apply(
-        lambda x: "font-weight: bold" if x == min_values[col] else ""
-    )
-
-styled_df_copy = styled_df_copy.style.apply(lambda x: mask, axis=None)
-styled_df_copy
-
+# %%
+# Let us take a look at the losses that minimize each metric.
+def min_arg(col):
+    col_split = pl.col(col).str.split(" ")
+    arg_min = pl.arg_sort_by(
+        col_split.list.get(0).cast(pl.Float64),
+        col_split.list.get(2).cast(pl.Float64),
+    ).first()
+    return arg_min
+
+
+scores_df.select(
+    pl.col("loss").get(min_arg(col_name)).alias(col_name)
+    for col_name in scores_df.columns
+    if col_name != "loss"
+)

 # %%
-# Even if the score distributions overlap due to the variance in the dataset, it
-# is true that the average RMSE is lower when `loss="squared_error"`, whereas
-# the average MAPE is lower when `loss="absolute_error"` as expected. That is
+# Even if the score distributions overlap due to the variance in the dataset,
+# it is true that the average RMSE is lower when `loss="squared_error"`, whereas
+# the average MAPE is lower when `loss="absolute_error"` as expected
+# (the same is true for the quantile 50). That is
 # also the case for the Mean Pinball Loss with the quantiles 5 and 95. The score
 # corresponding to the 50 quantile loss is overlapping with the score obtained
 # by minimizing other loss functions, which is also the case for the MAE.
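To make the new `min_arg` helper easier to follow, here is a small walk-through (not part of the commit) on a made-up scores table. Each cell is the string "mean ± std", so splitting on spaces yields `[mean, "±", std]`; `pl.arg_sort_by` orders rows by the mean, breaks ties by the std, and `.first()` returns the row index of the best value, which is then used to look up the corresponding loss name.

import polars as pl

toy_scores = pl.DataFrame(
    {
        "loss": ["squared_error", "poisson", "absolute_error"],
        "MAPE": ["0.44 ± 0.06", "0.42 ± 0.05", "0.32 ± 0.04"],
    }
)

split = pl.col("MAPE").str.split(" ")
best_row = pl.arg_sort_by(
    split.list.get(0).cast(pl.Float64),  # sort primarily by the mean
    split.list.get(2).cast(pl.Float64),  # std breaks ties
).first()

# The loss that minimizes MAPE in this toy table is "absolute_error".
print(toy_scores.select(pl.col("loss").get(best_row)))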
@@ -304,8 +295,8 @@ def extract_numeric(value):
 all_splits = list(ts_cv.split(X, y))
 train_idx, test_idx = all_splits[0]

-X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
-y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
+X_train, X_test = X[train_idx, :], X[test_idx, :]
+y_train, y_test = y[train_idx], y[test_idx]

 max_iter = 50
 gbrt_mean_poisson = HistGradientBoostingRegressor(loss="poisson", max_iter=max_iter)
@@ -336,7 +327,7 @@ def extract_numeric(value):
 fig, ax = plt.subplots(figsize=(15, 7))
 plt.title("Predictions by regression models")
 ax.plot(
-    y_test.values[last_hours],
+    y_test[last_hours],
     "x-",
     alpha=0.2,
     label="Actual demand",
@@ -399,7 +390,7 @@ def extract_numeric(value):
 ]
 for ax, pred, label in zip(axes, predictions, labels):
     PredictionErrorDisplay.from_predictions(
-        y_true=y_test.values,
+        y_true=y_test,
         y_pred=pred,
         kind="residual_vs_predicted",
         scatter_kwargs={"alpha": 0.3},
