|
33 | 33 | # We will derive a dataset from the Mauna Loa Observatory that collected air
|
34 | 34 | # samples. We are interested in estimating the concentration of CO2 and
|
35 | 35 | # extrapolating it for further years. First, we load the original dataset available
|
36 |
| -# in OpenML. |
| 36 | +# in OpenML as a pandas dataframe. This will be replaced with Polars |
| 37 | +# once `fetch_openml` adds native support for it. |
37 | 38 | from sklearn.datasets import fetch_openml
|
38 | 39 |
|
39 | 40 | co2 = fetch_openml(data_id=41187, as_frame=True)
|
40 | 41 | co2.frame.head()
|
41 | 42 |
|
42 | 43 | # %%
|
43 |
| -# First, we process the original dataframe to create a date index and select |
44 |
| -# only the CO2 column. |
45 |
| -import pandas as pd |
| 44 | +# First, we process the original dataframe to create a date column and select |
| 45 | +# it along with the CO2 column. |
| 46 | +import polars as pl |
46 | 47 |
|
47 |
| -co2_data = co2.frame |
48 |
| -co2_data["date"] = pd.to_datetime(co2_data[["year", "month", "day"]]) |
49 |
| -co2_data = co2_data[["date", "co2"]].set_index("date") |
| 48 | +co2_data = pl.DataFrame({col: co2.frame[col].to_numpy() for col in co2.frame.columns}) |
| 49 | +co2_data = co2_data.with_columns(pl.date("year", "month", "day")).select("date", "co2") |
50 | 50 | co2_data.head()
|
51 | 51 |
|
52 | 52 | # %%
|
53 |
| -co2_data.index.min(), co2_data.index.max() |
| 53 | +co2_data["date"].min(), co2_data["date"].max() |
54 | 54 |
|
55 | 55 | # %%
|
56 | 56 | # We see that we get CO2 concentration for some days from March, 1958 to
|
57 | 57 | # December, 2001. We can plot this raw information to have a better
|
58 | 58 | # understanding.
|
59 | 59 | import matplotlib.pyplot as plt
|
60 | 60 |
|
61 |
| -co2_data.plot() |
| 61 | +plt.plot(co2_data["date"], co2_data["co2"]) |
| 62 | +plt.xlabel("date") |
62 | 63 | plt.ylabel("CO$_2$ concentration (ppm)")
|
63 | 64 | _ = plt.title("Raw air samples measurements from the Mauna Loa Observatory")
|
64 | 65 |
|
|
67 | 68 | # for which no measurements were collected. Such processing will have a
|
68 | 69 | # smoothing effect on the data.
|
69 | 70 |
|
70 |
| -try: |
71 |
| - co2_data_resampled_monthly = co2_data.resample("ME") |
72 |
| -except ValueError: |
73 |
| - # pandas < 2.2 uses M instead of ME |
74 |
| - co2_data_resampled_monthly = co2_data.resample("M") |
75 |
| - |
76 |
| - |
77 |
| -co2_data = co2_data_resampled_monthly.mean().dropna(axis="index", how="any") |
78 |
| -co2_data.plot() |
| 71 | +co2_data = ( |
| 72 | + co2_data.sort(by="date") |
| 73 | + .group_by_dynamic("date", every="1mo") |
| 74 | + .agg(pl.col("co2").mean()) |
| 75 | + .with_columns(pl.col("date").dt.month_end()) |
| 76 | + .filter(pl.col("date").is_not_null()) |
| 77 | +) |
| 78 | +plt.plot(co2_data["date"], co2_data["co2"]) |
| 79 | +plt.xlabel("date") |
79 | 80 | plt.ylabel("Monthly average of CO$_2$ concentration (ppm)")
|
80 | 81 | _ = plt.title(
|
81 | 82 | "Monthly average of air samples measurements\nfrom the Mauna Loa Observatory"
|
|
88 | 89 | #
|
89 | 90 | # As a first step, we will divide the data and the target to estimate. The data
|
90 | 91 | # being a date, we will convert it into a numeric value.
|
91 |
| -X = (co2_data.index.year + co2_data.index.month / 12).to_numpy().reshape(-1, 1) |
| 92 | +X = ( |
| 93 | + co2_data.select("date") |
| 94 | + .with_columns(pl.col("date").dt.year() + pl.col("date").dt.month() / 12) |
| 95 | + .to_numpy() |
| 96 | + .reshape(-1, 1) |
| 97 | +) |
92 | 98 | y = co2_data["co2"].to_numpy()
|
93 | 99 |
|
94 | 100 | # %%
|
|
0 commit comments