Skip to content

Commit 40e30a1

Browse files
author
raisa
committed
replace pandas with Polars for forecasting co2 example
1 parent 77f8731 commit 40e30a1

File tree

1 file changed

+25
-19
lines changed

1 file changed

+25
-19
lines changed

examples/gaussian_process/plot_gpr_co2.py

Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -33,32 +33,33 @@
3333
# We will derive a dataset from the Mauna Loa Observatory that collected air
3434
# samples. We are interested in estimating the concentration of CO2 and
3535
# extrapolating it to future years. First, we load the original dataset available
36-
# in OpenML.
36+
# in OpenML as a pandas dataframe. This will be replaced with Polars
37+
# once `fetch_openml` adds native support for it.
3738
from sklearn.datasets import fetch_openml
3839

3940
co2 = fetch_openml(data_id=41187, as_frame=True)
4041
co2.frame.head()
4142

4243
# %%
43-
# First, we process the original dataframe to create a date index and select
44-
# only the CO2 column.
45-
import pandas as pd
44+
# First, we process the original dataframe to create a date column and select
45+
# it along with the CO2 column.
46+
import polars as pl
4647

47-
co2_data = co2.frame
48-
co2_data["date"] = pd.to_datetime(co2_data[["year", "month", "day"]])
49-
co2_data = co2_data[["date", "co2"]].set_index("date")
48+
co2_data = pl.DataFrame({col: co2.frame[col].to_numpy() for col in co2.frame.columns})
49+
co2_data = co2_data.with_columns(pl.date("year", "month", "day")).select("date", "co2")
5050
co2_data.head()
5151

5252
# %%
53-
co2_data.index.min(), co2_data.index.max()
53+
co2_data["date"].min(), co2_data["date"].max()
5454

5555
# %%
5656
# We see that we get CO2 concentration for some days from March, 1958 to
5757
# December, 2001. We can plot this raw information to have a better
5858
# understanding.
5959
import matplotlib.pyplot as plt
6060

61-
co2_data.plot()
61+
plt.plot(co2_data["date"], co2_data["co2"])
62+
plt.xlabel("date")
6263
plt.ylabel("CO$_2$ concentration (ppm)")
6364
_ = plt.title("Raw air samples measurements from the Mauna Loa Observatory")
6465

@@ -67,15 +68,15 @@
6768
# for which no measurements were collected. Such processing will have a
6869
# smoothing effect on the data.
6970

70-
try:
71-
co2_data_resampled_monthly = co2_data.resample("ME")
72-
except ValueError:
73-
# pandas < 2.2 uses M instead of ME
74-
co2_data_resampled_monthly = co2_data.resample("M")
75-
76-
77-
co2_data = co2_data_resampled_monthly.mean().dropna(axis="index", how="any")
78-
co2_data.plot()
71+
co2_data = (
72+
co2_data.sort(by="date")
73+
.group_by_dynamic("date", every="1mo")
74+
.agg(pl.col("co2").mean())
75+
.with_columns(pl.col("date").dt.month_end())
76+
.filter(pl.col("date").is_not_null())
77+
)
78+
plt.plot(co2_data["date"], co2_data["co2"])
79+
plt.xlabel("date")
7980
plt.ylabel("Monthly average of CO$_2$ concentration (ppm)")
8081
_ = plt.title(
8182
"Monthly average of air samples measurements\nfrom the Mauna Loa Observatory"
@@ -88,7 +89,12 @@
8889
#
8990
# As a first step, we will separate the data from the target to estimate. The data
9091
# being a date, we will convert it into a numeric value.
91-
X = (co2_data.index.year + co2_data.index.month / 12).to_numpy().reshape(-1, 1)
92+
X = (
93+
co2_data.select("date")
94+
.with_columns(pl.col("date").dt.year() + pl.col("date").dt.month() / 12)
95+
.to_numpy()
96+
.reshape(-1, 1)
97+
)
9298
y = co2_data["co2"].to_numpy()
9399

94100
# %%

0 commit comments

Comments
 (0)