DOC use polars instead of pandas in plot_time_series_lagged_features #28601
@@ -3,7 +3,7 @@
 Lagged features for time series forecasting
 ===========================================

-This example demonstrates how pandas-engineered lagged features can be used
+This example demonstrates how Polars-engineered lagged features can be used
 for time series forecasting with
 :class:`~sklearn.ensemble.HistGradientBoostingRegressor` on the Bike Sharing
 Demand dataset.
@@ -19,27 +19,33 @@
 # Analyzing the Bike Sharing Demand dataset
 # -----------------------------------------
 #
-# We start by loading the data from the OpenML repository.
+# We start by loading the data from the OpenML repository
+# as a pandas dataframe. This will be replaced with Polars
+# once `fetch_openml` adds a native support for it.
+# We convert to Polars for feature engineering, as it automatically caches
+# common subexpressions which are reused in multiple expressions
+# (like `pl.col("count").shift(1)` below). See
+# https://docs.pola.rs/user-guide/lazy/optimizations/ for more information.

 import numpy as np
-import pandas as pd
+import polars as pl

 from sklearn.datasets import fetch_openml

+pl.Config.set_fmt_str_lengths(20)
+
 bike_sharing = fetch_openml(
     "Bike_Sharing_Demand", version=2, as_frame=True, parser="pandas"
 )
 df = bike_sharing.frame
+df = pl.DataFrame({col: df[col].to_numpy() for col in df.columns})

 # %%
 # Next, we take a look at the statistical summary of the dataset
 # so that we can better understand the data that we are working with.
-summary = pd.DataFrame(df.describe())
-summary = (
-    summary.style.background_gradient()
-    .set_table_attributes("style = 'display: inline'")
-    .set_caption("Statistics of the Dataset")
-    .set_table_styles([{"selector": "caption", "props": [("font-size", "16px")]}])
-)
+import polars.selectors as cs
+
+summary = df.select(cs.numeric()).describe()
+summary

 # %%
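The added comment about caching common subexpressions is easiest to see with the lazy API. The following is an illustrative sketch (toy data and variable names, not part of the diff): the same `pl.col("count").shift(1)` subexpression appears in two expressions, and the optimized query plan can be inspected with `explain()` before collecting; whether the shared subexpression is actually computed only once depends on the Polars version and its optimizer settings.

import polars as pl

toy = pl.DataFrame({"count": list(range(100))})

# Build a lazy query in which the shifted column is reused by two expressions.
query = toy.lazy().select(
    lagged_mean_24h=pl.col("count").shift(1).rolling_mean(24),
    lagged_max_24h=pl.col("count").shift(1).rolling_max(24),
)

print(query.explain())  # inspect the optimized plan
result = query.collect()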
@@ -52,32 +58,26 @@


 # %%
-# Generating pandas-engineered lagged features
+# Generating Polars-engineered lagged features
 # --------------------------------------------
 # Let's consider the problem of predicting the demand at the
 # next hour given past demands. Since the demand is a continuous
 # variable, one could intuitively use any regression model. However, we do
 # not have the usual `(X_train, y_train)` dataset. Instead, we just have
 # the `y_train` demand data sequentially organized by time.
-count = df["count"]
-lagged_df = pd.concat(
-    [
-        count,
-        count.shift(1).rename("lagged_count_1h"),
-        count.shift(2).rename("lagged_count_2h"),
-        count.shift(3).rename("lagged_count_3h"),
-        count.shift(24).rename("lagged_count_1d"),
-        count.shift(24 + 1).rename("lagged_count_1d_1h"),
-        count.shift(7 * 24).rename("lagged_count_7d"),
-        count.shift(7 * 24 + 1).rename("lagged_count_7d_1h"),
-        count.shift(1).rolling(24).mean().rename("lagged_mean_24h"),
-        count.shift(1).rolling(24).max().rename("lagged_max_24h"),
-        count.shift(1).rolling(24).min().rename("lagged_min_24h"),
-        count.shift(1).rolling(7 * 24).mean().rename("lagged_mean_7d"),
-        count.shift(1).rolling(7 * 24).max().rename("lagged_max_7d"),
-        count.shift(1).rolling(7 * 24).min().rename("lagged_min_7d"),
-    ],
-    axis="columns",
+lagged_df = df.select(
+    "count",
+    *[pl.col("count").shift(i).alias(f"lagged_count_{i}h") for i in [1, 2, 3]],
+    lagged_count_1d=pl.col("count").shift(24),
+    lagged_count_1d_1h=pl.col("count").shift(24 + 1),
+    lagged_count_7d=pl.col("count").shift(7 * 24),
+    lagged_count_7d_1h=pl.col("count").shift(7 * 24 + 1),
+    lagged_mean_24h=pl.col("count").shift(1).rolling_mean(24),
+    lagged_max_24h=pl.col("count").shift(1).rolling_max(24),
+    lagged_min_24h=pl.col("count").shift(1).rolling_min(24),
+    lagged_mean_7d=pl.col("count").shift(1).rolling_mean(7 * 24),
+    lagged_max_7d=pl.col("count").shift(1).rolling_max(7 * 24),
+    lagged_min_7d=pl.col("count").shift(1).rolling_min(7 * 24),
 )
 lagged_df.tail(10)
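As a reminder of why the example later drops incomplete rows: each `shift` pushes nulls into the first rows of the lagged columns. A minimal sketch with made-up numbers (not from the dataset):

import polars as pl

toy = pl.DataFrame({"count": [5, 3, 8, 6, 4]})
lagged = toy.select(
    "count",
    lagged_count_1h=pl.col("count").shift(1),
    lagged_count_2h=pl.col("count").shift(2),
)
print(lagged)               # the leading rows of the lagged columns are null
print(lagged.drop_nulls())  # only rows with a complete lag history remain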
@@ -89,8 +89,8 @@
 # %%
 # We can now separate the lagged features in a matrix `X` and the target variable
 # (the counts to predict) in an array of the same first dimension `y`.
-lagged_df = lagged_df.dropna()
-X = lagged_df.drop("count", axis="columns")
+lagged_df = lagged_df.drop_nulls()
+X = lagged_df.drop("count")
 y = lagged_df["count"]
 print("X shape: {}\ny shape: {}".format(X.shape, y.shape))
@@ -141,8 +141,8 @@
 # %%
 # Training the model and evaluating its performance based on MAPE.
 train_idx, test_idx = all_splits[0]
-X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
-y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
+X_train, X_test = X[train_idx, :], X[test_idx, :]
+y_train, y_test = y[train_idx], y[test_idx]

 model = HistGradientBoostingRegressor().fit(X_train, y_train)
 y_pred = model.predict(X_test)
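The replacement of `.iloc` works because Polars accepts NumPy integer arrays for positional row selection, both on DataFrames and on Series. A small sketch of the pattern with toy data (shapes and names are illustrative):

import numpy as np
import polars as pl

X = pl.DataFrame({"a": [1, 2, 3, 4], "b": [10, 20, 30, 40]})
y = X["a"]

train_idx, test_idx = np.array([0, 1]), np.array([2, 3])

# Positional selection with index arrays, as in the modified example.
X_train, X_test = X[train_idx, :], X[test_idx, :]
y_train, y_test = y[train_idx], y[test_idx]
print(X_train.shape, X_test.shape, len(y_train), len(y_test))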
@@ -258,40 +258,29 @@ def consolidate_scores(cv_results, scores, metric):
         metric = key.split("test_")[1]
         scores = consolidate_scores(cv_results, scores, metric)

-df = pd.DataFrame(scores)
-
-styled_df_copy = df.copy()
-
-
-def extract_numeric(value):
-    parts = value.split("±")
-    mean_value = float(parts[0])
-    std_value = float(parts[1].split()[0])
-
-    return mean_value, std_value
+scores_df = pl.DataFrame(scores)
+scores_df

-
-# Convert columns containing "±" to tuples of numerical values
-cols_to_convert = df.columns[1:]  # Exclude the "loss" column
-for col in cols_to_convert:
-    df[col] = df[col].apply(extract_numeric)
-
-min_values = df.min()
-
-# Create a mask for highlighting minimum values
-mask = pd.DataFrame("", index=df.index, columns=df.columns)
-for col in cols_to_convert:
-    mask[col] = df[col].apply(
-        lambda x: "font-weight: bold" if x == min_values[col] else ""
-    )
-
-styled_df_copy = styled_df_copy.style.apply(lambda x: mask, axis=None)
-styled_df_copy
+
+# %%
+# Let us take a look at the losses that minimise each metric.
+def min_arg(col):
+    col_split = pl.col(col).str.split(" ")
+    return pl.arg_sort_by(
+        col_split.list.get(0).cast(pl.Float64),
+        col_split.list.get(2).cast(pl.Float64),
+    ).first()

Comment on lines +270 to +271:

Honest question, why does the casting here change the results so drastically? What is the dtype otherwise?

Reply: Thank you for your reviews @ArturoAmorQ. It looks like the fit times are slightly lower for Polars (apart from the squared loss) but otherwise there are no differences in the final table. Before casting the dtype is string (pl.Utf8). In the pandas version the casting was done by:

def extract_numeric(value):
    parts = value.split("±")
    mean_value = float(parts[0])
    std_value = float(parts[1].split()[0])
    return mean_value, std_value
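To make the dtype point above concrete: the score columns hold strings such as "21.3 ± 1.2", and ordering `pl.Utf8` values is lexicographic, so without the cast the "minimum" can be wrong. A hypothetical toy illustration (column name and values invented):

import polars as pl

toy = pl.DataFrame({"mape": ["9.8 ± 0.4", "21.3 ± 1.2", "100.0 ± 3.0"]})

# Lexicographic order: "100.0 ..." sorts before "21.3 ..." and "9.8 ...".
print(toy.select(pl.col("mape").arg_sort().first()))  # index 2

# Casting the mean part to Float64 restores numerical order.
print(
    toy.select(
        pl.col("mape").str.split(" ").list.get(0).cast(pl.Float64).arg_sort().first()
    )
)  # index 0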
+
+
+scores_df.select(
+    pl.col("loss").get(min_arg(col_name)).alias(col_name)
+    for col_name in scores_df.columns
+    if col_name != "loss"
+)
Comment on lines +275 to +279:

This is a clever solution, though :)

 # %%
-# Even if the score distributions overlap due to the variance in the dataset, it
-# is true that the average RMSE is lower when `loss="squared_error"`, whereas
+# Even if the score distributions overlap due to the variance in the dataset,
+# it is true that the average RMSE is lower when `loss="squared_error"`, whereas
 # the average MAPE is lower when `loss="absolute_error"` as expected. That is
 # also the case for the Mean Pinball Loss with the quantiles 5 and 95. The score
 # corresponding to the 50 quantile loss is overlapping with the score obtained
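The pairing noted here (RMSE tracking `loss="squared_error"`, MAPE tracking `loss="absolute_error"`) reflects a general property: the squared error is minimized by the mean of the target distribution, while the absolute error is minimized by its median, and MAPE behaves more like the latter. A quick numerical check on a skewed toy sample, independent of the PR:

import numpy as np

y = np.array([1.0, 1.0, 2.0, 2.0, 50.0])  # skewed: mean and median differ

mean, median = y.mean(), np.median(y)
print("squared error @ mean  :", ((y - mean) ** 2).mean())    # smaller at the mean
print("squared error @ median:", ((y - median) ** 2).mean())
print("absolute error @ mean  :", np.abs(y - mean).mean())
print("absolute error @ median:", np.abs(y - median).mean())  # smaller at the median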
@@ -304,8 +293,8 @@ def extract_numeric(value):
 all_splits = list(ts_cv.split(X, y))
 train_idx, test_idx = all_splits[0]

-X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
-y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
+X_train, X_test = X[train_idx, :], X[test_idx, :]
+y_train, y_test = y[train_idx], y[test_idx]

 max_iter = 50
 gbrt_mean_poisson = HistGradientBoostingRegressor(loss="poisson", max_iter=max_iter)
@@ -336,7 +325,7 @@ def extract_numeric(value):
 fig, ax = plt.subplots(figsize=(15, 7))
 plt.title("Predictions by regression models")
 ax.plot(
-    y_test.values[last_hours],
+    y_test[last_hours],
     "x-",
     alpha=0.2,
     label="Actual demand",
@@ -399,7 +388,7 @@ def extract_numeric(value):
 ]
 for ax, pred, label in zip(axes, predictions, labels):
     PredictionErrorDisplay.from_predictions(
-        y_true=y_test.values,
+        y_true=y_test,
         y_pred=pred,
         kind="residual_vs_predicted",
        scatter_kwargs={"alpha": 0.3},
Comment on the `df = pl.DataFrame({col: df[col].to_numpy() for col in df.columns})` conversion:

I am doing this instead of `pl.from_pandas()` to avoid installing pyarrow.
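For context on the trade-off mentioned in this comment, a hypothetical sketch of the two conversion routes (toy dataframe; `pl.from_pandas` relies on pyarrow, while the NumPy round-trip used in the PR does not, though dtypes for non-numeric columns may come out differently):

import pandas as pd
import polars as pl

pdf = pd.DataFrame({"count": [3, 8, 5], "temp": [9.84, 9.02, 9.02]})

# Route 1: needs the optional pyarrow dependency.
# df = pl.from_pandas(pdf)

# Route 2 (what the PR does): go through NumPy, no pyarrow required.
df = pl.DataFrame({col: pdf[col].to_numpy() for col in pdf.columns})
print(df)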