diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
index cadc25412eb57..3f1cad7ea5e75 100644
--- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py
+++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -194,9 +194,11 @@ def score_estimator(estimator, df_test):
         )
     )
 
+    return y_pred
+
 
 print("Constant mean frequency evaluation:")
-score_estimator(dummy, df_test)
+test_preds = [score_estimator(dummy, df_test)]
 
 # %%
 # (Generalized) linear models
@@ -226,7 +228,7 @@ def score_estimator(estimator, df_test):
 # meta-estimator to map ``y_pred`` to a strictly positive domain.
 
 print("Ridge evaluation:")
-score_estimator(ridge_glm, df_test)
+test_preds.append(score_estimator(ridge_glm, df_test))
 
 # %%
 # Next we fit the Poisson regressor on the target variable. We set the
@@ -242,20 +244,15 @@ def score_estimator(estimator, df_test):
 
 from sklearn.linear_model import PoissonRegressor
 
-n_samples = df_train.shape[0]
-
 poisson_glm = Pipeline(
     [
         ("preprocessor", linear_model_preprocessor),
         ("regressor", PoissonRegressor(alpha=1e-12, max_iter=300)),
     ]
-)
-poisson_glm.fit(
-    df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"]
-)
+).fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"])
 
 print("PoissonRegressor evaluation:")
-score_estimator(poisson_glm, df_test)
+test_preds.append(score_estimator(poisson_glm, df_test))
 
 # %%
 # Gradient Boosting Regression Trees for Poisson regression
@@ -301,13 +298,10 @@ def score_estimator(estimator, df_test):
             HistGradientBoostingRegressor(loss="poisson", max_leaf_nodes=128),
         ),
     ]
-)
-poisson_gbrt.fit(
-    df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"]
-)
+).fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"])
 
 print("Poisson Gradient Boosted Trees evaluation:")
-score_estimator(poisson_gbrt, df_test)
+test_preds.append(score_estimator(poisson_gbrt, df_test))
 
 # %%
 # Like the Poisson GLM above, the gradient boosted trees model minimizes
@@ -335,7 +329,10 @@ def score_estimator(estimator, df_test):
     axes[row_idx, 0].set_ylabel(label + " samples")
 
     for idx, model in enumerate([ridge_glm, poisson_glm, poisson_gbrt]):
-        y_pred = model.predict(df)
+        if label == "train":
+            y_pred = model.predict(df)
+        else:
+            y_pred = test_preds[idx + 1]
 
         pd.Series(y_pred).hist(
             bins=np.linspace(-1, 4, n_bins), ax=axes[row_idx, idx + 1]
@@ -428,10 +425,11 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100
 fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
 plt.subplots_adjust(wspace=0.3)
 
-for axi, model in zip(ax.ravel(), [ridge_glm, poisson_glm, poisson_gbrt, dummy]):
-    y_pred = model.predict(df_test)
-    y_true = df_test["Frequency"].values
-    exposure = df_test["Exposure"].values
+y_true = df_test["Frequency"].values
+exposure = df_test["Exposure"].values
+for axi, model, y_pred in zip(
+    ax.ravel(), [dummy, ridge_glm, poisson_glm, poisson_gbrt], test_preds
+):
     q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group(
         y_true, y_pred, sample_weight=exposure, n_bins=10
     )
@@ -505,8 +503,8 @@ def lorenz_curve(y_true, y_pred, exposure):
 
 fig, ax = plt.subplots(figsize=(8, 8))
 
-for model in [dummy, ridge_glm, poisson_glm, poisson_gbrt]:
-    y_pred = model.predict(df_test)
+
+for model, y_pred in zip([dummy, ridge_glm, poisson_glm, poisson_gbrt], test_preds):
     cum_exposure, cum_claims = lorenz_curve(
         df_test["Frequency"], y_pred, df_test["Exposure"]
     )