Replace pandas with Polars in the CO2 forecasting example

raisa · raisa · commit 40e30a1edb4a · 2024-04-10T14:46:05.000+01:00
diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
@@ -33,32 +33,33 @@
 # We will derive a dataset from the Mauna Loa Observatory that collected air
 # samples. We are interested in estimating the concentration of CO2 and
 # extrapolate it for further year. First, we load the original dataset available
-# in OpenML.
+# in OpenML as a pandas dataframe. This will be replaced with Polars
+# once `fetch_openml` adds native support for it.
 from sklearn.datasets import fetch_openml
 
 co2 = fetch_openml(data_id=41187, as_frame=True)
 co2.frame.head()
 
 # %%
-# First, we process the original dataframe to create a date index and select
-# only the CO2 column.
-import pandas as pd
+# First, we process the original dataframe to create a date column and select
+# it along with the CO2 column.
+import polars as pl
 
-co2_data = co2.frame
-co2_data["date"] = pd.to_datetime(co2_data[["year", "month", "day"]])
-co2_data = co2_data[["date", "co2"]].set_index("date")
+co2_data = pl.DataFrame({col: co2.frame[col].to_numpy() for col in co2.frame.columns})
+co2_data = co2_data.with_columns(pl.date("year", "month", "day")).select("date", "co2")
 co2_data.head()
 
 # %%
-co2_data.index.min(), co2_data.index.max()
+co2_data["date"].min(), co2_data["date"].max()
 
 # %%
 # We see that we get CO2 concentration for some days from March, 1958 to
 # December, 2001. We can plot these raw information to have a better
 # understanding.
 import matplotlib.pyplot as plt
 
-co2_data.plot()
+plt.plot(co2_data["date"], co2_data["co2"])
+plt.xlabel("date")
 plt.ylabel("CO$_2$ concentration (ppm)")
 _ = plt.title("Raw air samples measurements from the Mauna Loa Observatory")
 
@@ -67,15 +68,15 @@
 # for which no measurements were collected. Such a processing will have an
 # smoothing effect on the data.
 
-try:
-    co2_data_resampled_monthly = co2_data.resample("ME")
-except ValueError:
-    # pandas < 2.2 uses M instead of ME
-    co2_data_resampled_monthly = co2_data.resample("M")
-
-
-co2_data = co2_data_resampled_monthly.mean().dropna(axis="index", how="any")
-co2_data.plot()
+co2_data = (
+    co2_data.sort(by="date")
+    .group_by_dynamic("date", every="1mo")
+    .agg(pl.col("co2").mean())
+    .with_columns(pl.col("date").dt.month_end())
+    .filter(pl.col("date").is_not_null())
+)
+plt.plot(co2_data["date"], co2_data["co2"])
+plt.xlabel("date")
 plt.ylabel("Monthly average of CO$_2$ concentration (ppm)")
 _ = plt.title(
     "Monthly average of air samples measurements\nfrom the Mauna Loa Observatory"
@@ -88,7 +89,12 @@
 #
 # As a first step, we will divide the data and the target to estimate. The data
 # being a date, we will convert it into a numeric.
-X = (co2_data.index.year + co2_data.index.month / 12).to_numpy().reshape(-1, 1)
+X = (
+    co2_data.select("date")
+    .with_columns(pl.col("date").dt.year() + pl.col("date").dt.month() / 12)
+    .to_numpy()
+    .reshape(-1, 1)
+)
 y = co2_data["co2"].to_numpy()
 
 # %%