From d81ae8cc85103c3d8453cc7436553e4ed40d9588 Mon Sep 17 00:00:00 2001
From: Haoyin Xu
Date: Thu, 25 Nov 2021 11:44:25 -0500
Subject: [PATCH 1/5] ENH store test preds to reduce time

---
 ...plot_poisson_regression_non_normal_loss.py | 29 ++++++++++++-------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
index cadc25412eb57..f0690ffad81fe 100644
--- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py
+++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -194,9 +194,11 @@ def score_estimator(estimator, df_test):
         )
     )
 
+    return y_pred
+
 
 print("Constant mean frequency evaluation:")
-score_estimator(dummy, df_test)
+test_preds = [score_estimator(dummy, df_test)]
 
 # %%
 # (Generalized) linear models
@@ -226,7 +228,7 @@ def score_estimator(estimator, df_test):
 # meta-estimator to map ``y_pred`` to a strictly positive domain.
 
 print("Ridge evaluation:")
-score_estimator(ridge_glm, df_test)
+test_preds.append(score_estimator(ridge_glm, df_test))
 
 # %%
 # Next we fit the Poisson regressor on the target variable. We set the
@@ -255,7 +257,7 @@ def score_estimator(estimator, df_test):
 )
 
 print("PoissonRegressor evaluation:")
-score_estimator(poisson_glm, df_test)
+test_preds.append(score_estimator(poisson_glm, df_test))
 
 # %%
 # Gradient Boosting Regression Trees for Poisson regression
@@ -307,7 +309,7 @@ def score_estimator(estimator, df_test):
 )
 
 print("Poisson Gradient Boosted Trees evaluation:")
-score_estimator(poisson_gbrt, df_test)
+test_preds.append(score_estimator(poisson_gbrt, df_test))
 
 # %%
 # Like the Poisson GLM above, the gradient boosted trees model minimizes
@@ -335,7 +337,10 @@ def score_estimator(estimator, df_test):
     axes[row_idx, 0].set_ylabel(label + " samples")
 
     for idx, model in enumerate([ridge_glm, poisson_glm, poisson_gbrt]):
-        y_pred = model.predict(df)
+        if label == "train":
+            y_pred = model.predict(df)
+        else:
+            y_pred = test_preds[idx + 1]
 
         pd.Series(y_pred).hist(
             bins=np.linspace(-1, 4, n_bins), ax=axes[row_idx, idx + 1]
@@ -428,10 +433,12 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100
 fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
 plt.subplots_adjust(wspace=0.3)
 
-for axi, model in zip(ax.ravel(), [ridge_glm, poisson_glm, poisson_gbrt, dummy]):
-    y_pred = model.predict(df_test)
-    y_true = df_test["Frequency"].values
-    exposure = df_test["Exposure"].values
+y_true = df_test["Frequency"].values
+exposure = df_test["Exposure"].values
+for idx, (axi, model) in enumerate(
+    zip(ax.ravel(), [dummy, ridge_glm, poisson_glm, poisson_gbrt])
+):
+    y_pred = test_preds[idx]
     q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group(
         y_true, y_pred, sample_weight=exposure, n_bins=10
     )
@@ -505,8 +512,8 @@ def lorenz_curve(y_true, y_pred, exposure):
 
 fig, ax = plt.subplots(figsize=(8, 8))
 
-for model in [dummy, ridge_glm, poisson_glm, poisson_gbrt]:
-    y_pred = model.predict(df_test)
+for idx, model in enumerate([dummy, ridge_glm, poisson_glm, poisson_gbrt]):
+    y_pred = test_preds[idx]
     cum_exposure, cum_claims = lorenz_curve(
         df_test["Frequency"], y_pred, df_test["Exposure"]
     )

From e190c3a207011634df0b831c18eea7b7e161eee9 Mon Sep 17 00:00:00 2001
From: Haoyin Xu
Date: Fri, 26 Nov 2021 09:43:30 -0500
Subject: [PATCH 2/5] ENH optimize loops & train/test splits

---
 .../plot_poisson_regression_non_normal_loss.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
index f0690ffad81fe..1b500f4460b64 100644
--- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py
+++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -435,10 +435,9 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100
 
 y_true = df_test["Frequency"].values
 exposure = df_test["Exposure"].values
-for idx, (axi, model) in enumerate(
-    zip(ax.ravel(), [dummy, ridge_glm, poisson_glm, poisson_gbrt])
+for axi, model, y_pred in zip(
+    ax.ravel(), [dummy, ridge_glm, poisson_glm, poisson_gbrt], test_preds
 ):
-    y_pred = test_preds[idx]
     q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group(
         y_true, y_pred, sample_weight=exposure, n_bins=10
     )
@@ -512,8 +511,8 @@ def lorenz_curve(y_true, y_pred, exposure):
 
 fig, ax = plt.subplots(figsize=(8, 8))
 
-for idx, model in enumerate([dummy, ridge_glm, poisson_glm, poisson_gbrt]):
-    y_pred = test_preds[idx]
+
+for model, y_pred in zip([dummy, ridge_glm, poisson_glm, poisson_gbrt], test_preds):
     cum_exposure, cum_claims = lorenz_curve(
         df_test["Frequency"], y_pred, df_test["Exposure"]
     )

From 8a0aaadd474b4632848def5f283a515cd2f5e068 Mon Sep 17 00:00:00 2001
From: Haoyin Xu
Date: Fri, 26 Nov 2021 11:32:03 -0500
Subject: [PATCH 3/5] ENH remove unnecessary code

---
 .../plot_poisson_regression_non_normal_loss.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
index 1b500f4460b64..3f1cad7ea5e75 100644
--- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py
+++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -244,17 +244,12 @@ def score_estimator(estimator, df_test):
 
 from sklearn.linear_model import PoissonRegressor
 
-n_samples = df_train.shape[0]
-
 poisson_glm = Pipeline(
     [
         ("preprocessor", linear_model_preprocessor),
         ("regressor", PoissonRegressor(alpha=1e-12, max_iter=300)),
     ]
-)
-poisson_glm.fit(
-    df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"]
-)
+).fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"])
 
 print("PoissonRegressor evaluation:")
 test_preds.append(score_estimator(poisson_glm, df_test))
@@ -303,10 +298,7 @@ def score_estimator(estimator, df_test):
             HistGradientBoostingRegressor(loss="poisson", max_leaf_nodes=128),
         ),
     ]
-)
-poisson_gbrt.fit(
-    df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"]
-)
+).fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"])
 
 print("Poisson Gradient Boosted Trees evaluation:")
 test_preds.append(score_estimator(poisson_gbrt, df_test))

From 39a6780c2aafd85f56b3c3dc5ec41252d1486a89 Mon Sep 17 00:00:00 2001
From: Haoyin Xu
Date: Mon, 29 Nov 2021 14:58:56 -0500
Subject: [PATCH 4/5] FIX remove y labels from X sets

---
 ...plot_poisson_regression_non_normal_loss.py | 28 +++++++++++++++----
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
index 3f1cad7ea5e75..0577c02506a18 100644
--- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py
+++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -145,7 +145,11 @@
         ("preprocessor", linear_model_preprocessor),
("regressor", DummyRegressor(strategy="mean")), ] -).fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"]) +).fit( + df_train.drop(columns=["Frequency"]), + df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"], +) ############################################################################## @@ -159,7 +163,7 @@ def score_estimator(estimator, df_test): """Score an estimator on the test set.""" - y_pred = estimator.predict(df_test) + y_pred = estimator.predict(df_test.drop(columns=["Frequency"])) print( "MSE: %.3f" @@ -217,7 +221,11 @@ def score_estimator(estimator, df_test): ("preprocessor", linear_model_preprocessor), ("regressor", Ridge(alpha=1e-6)), ] -).fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"]) +).fit( + df_train.drop(columns=["Frequency"]), + df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"], +) # %% # The Poisson deviance cannot be computed on non-positive values predicted by @@ -249,7 +257,11 @@ def score_estimator(estimator, df_test): ("preprocessor", linear_model_preprocessor), ("regressor", PoissonRegressor(alpha=1e-12, max_iter=300)), ] -).fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"]) +).fit( + df_train.drop(columns=["Frequency"]), + df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"], +) print("PoissonRegressor evaluation:") test_preds.append(score_estimator(poisson_glm, df_test)) @@ -298,7 +310,11 @@ def score_estimator(estimator, df_test): HistGradientBoostingRegressor(loss="poisson", max_leaf_nodes=128), ), ] -).fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"]) +).fit( + df_train.drop(columns=["Frequency"]), + df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"], +) print("Poisson Gradient Boosted Trees evaluation:") test_preds.append(score_estimator(poisson_gbrt, df_test)) @@ -330,7 +346,7 @@ def score_estimator(estimator, df_test): for idx, model in enumerate([ridge_glm, poisson_glm, poisson_gbrt]): if label == "train": - y_pred = model.predict(df) + y_pred = model.predict(df.drop(columns=["Frequency"])) else: y_pred = test_preds[idx + 1] From 688a8d2a584ba9b7ad2b38da7d009f01e3173e0f Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Tue, 7 Dec 2021 08:10:09 -0500 Subject: [PATCH 5/5] Revert "FIX remove y labels from X sets" This reverts commit 39a6780c2aafd85f56b3c3dc5ec41252d1486a89. 
---
 ...plot_poisson_regression_non_normal_loss.py | 28 ++++++----------------
 1 file changed, 6 insertions(+), 22 deletions(-)

diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
index 0577c02506a18..3f1cad7ea5e75 100644
--- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py
+++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -145,11 +145,7 @@
         ("preprocessor", linear_model_preprocessor),
         ("regressor", DummyRegressor(strategy="mean")),
     ]
-).fit(
-    df_train.drop(columns=["Frequency"]),
-    df_train["Frequency"],
-    regressor__sample_weight=df_train["Exposure"],
-)
+).fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"])
 
 
 ##############################################################################
@@ -163,7 +159,7 @@
 def score_estimator(estimator, df_test):
     """Score an estimator on the test set."""
 
-    y_pred = estimator.predict(df_test.drop(columns=["Frequency"]))
+    y_pred = estimator.predict(df_test)
 
     print(
         "MSE: %.3f"
@@ -221,11 +217,7 @@ def score_estimator(estimator, df_test):
         ("preprocessor", linear_model_preprocessor),
         ("regressor", Ridge(alpha=1e-6)),
     ]
-).fit(
-    df_train.drop(columns=["Frequency"]),
-    df_train["Frequency"],
-    regressor__sample_weight=df_train["Exposure"],
-)
+).fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"])
 
 # %%
 # The Poisson deviance cannot be computed on non-positive values predicted by
@@ -257,11 +249,7 @@ def score_estimator(estimator, df_test):
         ("preprocessor", linear_model_preprocessor),
         ("regressor", PoissonRegressor(alpha=1e-12, max_iter=300)),
     ]
-).fit(
-    df_train.drop(columns=["Frequency"]),
-    df_train["Frequency"],
-    regressor__sample_weight=df_train["Exposure"],
-)
+).fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"])
 
 print("PoissonRegressor evaluation:")
 test_preds.append(score_estimator(poisson_glm, df_test))
@@ -310,11 +298,7 @@ def score_estimator(estimator, df_test):
             HistGradientBoostingRegressor(loss="poisson", max_leaf_nodes=128),
         ),
     ]
-).fit(
-    df_train.drop(columns=["Frequency"]),
-    df_train["Frequency"],
-    regressor__sample_weight=df_train["Exposure"],
-)
+).fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"])
 
 print("Poisson Gradient Boosted Trees evaluation:")
 test_preds.append(score_estimator(poisson_gbrt, df_test))
@@ -346,7 +330,7 @@ def score_estimator(estimator, df_test):
 
     for idx, model in enumerate([ridge_glm, poisson_glm, poisson_gbrt]):
         if label == "train":
-            y_pred = model.predict(df.drop(columns=["Frequency"]))
+            y_pred = model.predict(df)
         else:
             y_pred = test_preds[idx + 1]
 
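
The pattern the series converges on, sketched in isolation below: `score_estimator` returns the test-set predictions it has already computed, the example collects them once into a `test_preds` list, and every later table or plot reuses the stored arrays instead of calling `predict` again (the loop shape introduced in PATCH 2/5). The sketch is illustrative only — it runs on a small synthetic regression target with made-up names (`X_train`, `models`, a simplified `score_estimator` signature), not on the example's FreMTPL2freq data or its preprocessing pipelines.

# Illustrative sketch only -- synthetic data and hypothetical names, not the
# example's real pipelines: score each fitted model once, keep its test-set
# predictions, and reuse them everywhere else.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import PoissonRegressor, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=500, n_features=5, random_state=0)
y = np.abs(y) / 100.0  # crude non-negative target so PoissonRegressor can fit
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


def score_estimator(estimator, X_test, y_test):
    """Score an estimator on the test set and return its predictions."""
    y_pred = estimator.predict(X_test)
    print(f"{estimator.__class__.__name__} MSE: {mean_squared_error(y_test, y_pred):.3f}")
    return y_pred  # returned so callers can reuse it instead of re-predicting


models = [
    DummyRegressor(strategy="mean").fit(X_train, y_train),
    Ridge(alpha=1e-6).fit(X_train, y_train),
    PoissonRegressor(alpha=1e-12, max_iter=300).fit(X_train, y_train),
]

# Predictions are computed exactly once per model and stored ...
test_preds = [score_estimator(model, X_test, y_test) for model in models]

# ... and later consumers zip the models with the stored predictions,
# mirroring the loop shape introduced in PATCH 2/5.
for model, y_pred in zip(models, test_preds):
    print(model.__class__.__name__, "mean prediction:", round(float(y_pred.mean()), 3))

PATCH 4/5 and its revert in 5/5 are orthogonal to this caching: they only toggled whether the `Frequency` target column is dropped from the frames passed to `fit` and `predict`, and the revert restores the original frames.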