From f3f8a415b9f2d923092a64d5c96d0891cdbeccc7 Mon Sep 17 00:00:00 2001 From: saldanhad Date: Fri, 27 Sep 2024 14:00:33 +0530 Subject: [PATCH 1/9] merge validation curve and traintest_error --- .../plot_train_error_vs_test_error.py | 42 ++++++++++++++++++ .../model_selection/plot_validation_curve.py | 43 ------------------- 2 files changed, 42 insertions(+), 43 deletions(-) delete mode 100644 examples/model_selection/plot_validation_curve.py diff --git a/examples/model_selection/plot_train_error_vs_test_error.py b/examples/model_selection/plot_train_error_vs_test_error.py index dc370383b2ef7..7b6edc2d08cad 100644 --- a/examples/model_selection/plot_train_error_vs_test_error.py +++ b/examples/model_selection/plot_train_error_vs_test_error.py @@ -86,3 +86,45 @@ plt.legend() plt.subplots_adjust(0.09, 0.04, 0.94, 0.94, 0.26, 0.26) plt.show() + +# %% +# Plotting Validation Curves +#------------------------------------------------------------- +#In this plot, you can see the training and validation scores +#of the ElasticNet model for different values of regularization +#parameter alpha. As can be inferred from the plot, for very low values +#of alpha (close to zero), the regularization is weak, meaning the model +#fits the training data very closely, leading to high training scores but lower +#validation scores. This is a case of overfitting, where the model captures +#noise in the training data rather than the underlying pattern. +# +#Using the ``ValidationCurveDisplay`` class helps by automating the plotting of training +#and validation scores across a range of alpha values, eliminating the need for +# manual iteration and plotting, and providing a clear, consistent visualization +# of model performance. + + +from sklearn.model_selection import ValidationCurveDisplay + +# Define the range of alphas (regularization strength) to explore +alphas = np.logspace(-5, 1, 60) + +# Use the ValidationCurveDisplay to automatically plot the train and test scores +disp = ValidationCurveDisplay.from_estimator( + enet, # ElasticNet model + X_train, # Training data + y_train, # Training target + param_name="alpha", # Hyperparameter to vary + param_range=alphas, # Range of alpha values + scoring="r2", # Scoring metric, R^2 in this case + n_jobs=-1, # Use all available CPUs + score_type="both", # Plot both training and test scores +) + +# Customize the display +disp.ax_.set_title("Validation Curve for ElasticNet (R^2 Score)") +disp.ax_.set_xlabel(r"alpha (regularization strength)") +disp.ax_.set_ylabel("R^2 Score") +disp.ax_.set_ylim(0.0, 1.1) + +plt.show() \ No newline at end of file diff --git a/examples/model_selection/plot_validation_curve.py b/examples/model_selection/plot_validation_curve.py deleted file mode 100644 index 44a382fed0c17..0000000000000 --- a/examples/model_selection/plot_validation_curve.py +++ /dev/null @@ -1,43 +0,0 @@ -""" -========================== -Plotting Validation Curves -========================== - -In this plot you can see the training scores and validation scores of an SVM -for different values of the kernel parameter gamma. For very low values of -gamma, you can see that both the training score and the validation score are -low. This is called underfitting. Medium values of gamma will result in high -values for both scores, i.e. the classifier is performing fairly well. If gamma -is too high, the classifier will overfit, which means that the training score -is good but the validation score is poor. 
- -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -import matplotlib.pyplot as plt -import numpy as np - -from sklearn.datasets import load_digits -from sklearn.model_selection import ValidationCurveDisplay -from sklearn.svm import SVC - -X, y = load_digits(return_X_y=True) -subset_mask = np.isin(y, [1, 2]) # binary classification: 1 vs 2 -X, y = X[subset_mask], y[subset_mask] - -disp = ValidationCurveDisplay.from_estimator( - SVC(), - X, - y, - param_name="gamma", - param_range=np.logspace(-6, -1, 5), - score_type="both", - n_jobs=2, - score_name="Accuracy", -) -disp.ax_.set_title("Validation Curve for SVM with an RBF kernel") -disp.ax_.set_xlabel(r"gamma (inverse radius of the RBF kernel)") -disp.ax_.set_ylim(0.0, 1.1) -plt.show() From 5b947b7f19d82decaa1b50fca627d78fa32f5582 Mon Sep 17 00:00:00 2001 From: saldanhad Date: Fri, 27 Sep 2024 14:15:03 +0530 Subject: [PATCH 2/9] fix linting --- .../plot_train_error_vs_test_error.py | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/examples/model_selection/plot_train_error_vs_test_error.py b/examples/model_selection/plot_train_error_vs_test_error.py index 7b6edc2d08cad..8a24126852d35 100644 --- a/examples/model_selection/plot_train_error_vs_test_error.py +++ b/examples/model_selection/plot_train_error_vs_test_error.py @@ -89,19 +89,19 @@ # %% # Plotting Validation Curves -#------------------------------------------------------------- -#In this plot, you can see the training and validation scores -#of the ElasticNet model for different values of regularization -#parameter alpha. As can be inferred from the plot, for very low values -#of alpha (close to zero), the regularization is weak, meaning the model -#fits the training data very closely, leading to high training scores but lower -#validation scores. This is a case of overfitting, where the model captures -#noise in the training data rather than the underlying pattern. +# ------------------------------------------------------------- +# In this plot, you can see the training and validation scores +# of the ElasticNet model for different values of regularization +# parameter alpha. As can be inferred from the plot, for very low values +# of alpha (close to zero), the regularization is weak, meaning the model +# fits the training data very closely, leading to high training scores but lower +# validation scores. This is a case of overfitting, where the model captures +# noise in the training data rather than the underlying pattern. # -#Using the ``ValidationCurveDisplay`` class helps by automating the plotting of training -#and validation scores across a range of alpha values, eliminating the need for -# manual iteration and plotting, and providing a clear, consistent visualization -# of model performance. +# Using the ``ValidationCurveDisplay`` class helps by automating the plotting of +# trainingand validation scores across a range of alpha values, eliminating the +# need for manual iteration and plotting, and providing a clear, consistent +# visualization of model performance. 
from sklearn.model_selection import ValidationCurveDisplay @@ -111,13 +111,13 @@ # Use the ValidationCurveDisplay to automatically plot the train and test scores disp = ValidationCurveDisplay.from_estimator( - enet, # ElasticNet model - X_train, # Training data - y_train, # Training target - param_name="alpha", # Hyperparameter to vary - param_range=alphas, # Range of alpha values - scoring="r2", # Scoring metric, R^2 in this case - n_jobs=-1, # Use all available CPUs + enet, # ElasticNet model + X_train, # Training data + y_train, # Training target + param_name="alpha", # Hyperparameter to vary + param_range=alphas, # Range of alpha values + scoring="r2", # Scoring metric, R^2 in this case + n_jobs=-1, # Use all available CPUs score_type="both", # Plot both training and test scores ) @@ -127,4 +127,4 @@ disp.ax_.set_ylabel("R^2 Score") disp.ax_.set_ylim(0.0, 1.1) -plt.show() \ No newline at end of file +plt.show() From 30cfdb832c058819860702a73baef64037de830b Mon Sep 17 00:00:00 2001 From: saldanhad Date: Fri, 27 Sep 2024 21:35:24 +0530 Subject: [PATCH 3/9] update redirects in conf.py --- doc/conf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/conf.py b/doc/conf.py index d07926b8b27f4..c15bf3e0852f8 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -447,6 +447,9 @@ def add_js_css_files(app, pagename, templatename, context, doctree): "auto_examples/model_selection/grid_search_text_feature_extraction.py": ( "auto_examples/model_selection/plot_grid_search_text_feature_extraction.py" ), + "auto_examples/model_selection/plot_validation_curve.py": ( + "auto_examples/model_selection/plot_train_error_vs_test_error.py" + ), "auto_examples/miscellaneous/plot_changed_only_pprint_parameter": ( "auto_examples/miscellaneous/plot_estimator_representation" ), From 9cff8440e38dfaf9056b696a75c7b1f4637832ab Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Sat, 28 Sep 2024 09:20:34 +0000 Subject: [PATCH 4/9] rename file, reflecting change --- ...rror_vs_test_error_and_validation_curve.py | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 examples/model_selection/plot_train_error_vs_test_error_and_validation_curve.py diff --git a/examples/model_selection/plot_train_error_vs_test_error_and_validation_curve.py b/examples/model_selection/plot_train_error_vs_test_error_and_validation_curve.py new file mode 100644 index 0000000000000..8a24126852d35 --- /dev/null +++ b/examples/model_selection/plot_train_error_vs_test_error_and_validation_curve.py @@ -0,0 +1,130 @@ +""" +========================= +Train error vs Test error +========================= + +Illustration of how the performance of an estimator on unseen data (test data) +is not the same as the performance on training data. As the regularization +increases the performance on train decreases while the performance on test +is optimal within a range of values of the regularization parameter. +The example with an Elastic-Net regression model and the performance is +measured using the explained variance a.k.a. R^2. 
+ +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# %% +# Generate sample data +# -------------------- +import numpy as np + +from sklearn import linear_model +from sklearn.datasets import make_regression +from sklearn.model_selection import train_test_split + +n_samples_train, n_samples_test, n_features = 75, 150, 500 +X, y, coef = make_regression( + n_samples=n_samples_train + n_samples_test, + n_features=n_features, + n_informative=50, + shuffle=False, + noise=1.0, + coef=True, +) +X_train, X_test, y_train, y_test = train_test_split( + X, y, train_size=n_samples_train, test_size=n_samples_test, shuffle=False +) +# %% +# Compute train and test errors +# ----------------------------- +alphas = np.logspace(-5, 1, 60) +enet = linear_model.ElasticNet(l1_ratio=0.7, max_iter=10000) +train_errors = list() +test_errors = list() +for alpha in alphas: + enet.set_params(alpha=alpha) + enet.fit(X_train, y_train) + train_errors.append(enet.score(X_train, y_train)) + test_errors.append(enet.score(X_test, y_test)) + +i_alpha_optim = np.argmax(test_errors) +alpha_optim = alphas[i_alpha_optim] +print("Optimal regularization parameter : %s" % alpha_optim) + +# Estimate the coef_ on full data with optimal regularization parameter +enet.set_params(alpha=alpha_optim) +coef_ = enet.fit(X, y).coef_ + +# %% +# Plot results functions +# ---------------------- + +import matplotlib.pyplot as plt + +plt.subplot(2, 1, 1) +plt.semilogx(alphas, train_errors, label="Train") +plt.semilogx(alphas, test_errors, label="Test") +plt.vlines( + alpha_optim, + plt.ylim()[0], + np.max(test_errors), + color="k", + linewidth=3, + label="Optimum on test", +) +plt.legend(loc="lower right") +plt.ylim([0, 1.2]) +plt.xlabel("Regularization parameter") +plt.ylabel("Performance") + +# Show estimated coef_ vs true coef +plt.subplot(2, 1, 2) +plt.plot(coef, label="True coef") +plt.plot(coef_, label="Estimated coef") +plt.legend() +plt.subplots_adjust(0.09, 0.04, 0.94, 0.94, 0.26, 0.26) +plt.show() + +# %% +# Plotting Validation Curves +# ------------------------------------------------------------- +# In this plot, you can see the training and validation scores +# of the ElasticNet model for different values of regularization +# parameter alpha. As can be inferred from the plot, for very low values +# of alpha (close to zero), the regularization is weak, meaning the model +# fits the training data very closely, leading to high training scores but lower +# validation scores. This is a case of overfitting, where the model captures +# noise in the training data rather than the underlying pattern. +# +# Using the ``ValidationCurveDisplay`` class helps by automating the plotting of +# trainingand validation scores across a range of alpha values, eliminating the +# need for manual iteration and plotting, and providing a clear, consistent +# visualization of model performance. 
+ + +from sklearn.model_selection import ValidationCurveDisplay + +# Define the range of alphas (regularization strength) to explore +alphas = np.logspace(-5, 1, 60) + +# Use the ValidationCurveDisplay to automatically plot the train and test scores +disp = ValidationCurveDisplay.from_estimator( + enet, # ElasticNet model + X_train, # Training data + y_train, # Training target + param_name="alpha", # Hyperparameter to vary + param_range=alphas, # Range of alpha values + scoring="r2", # Scoring metric, R^2 in this case + n_jobs=-1, # Use all available CPUs + score_type="both", # Plot both training and test scores +) + +# Customize the display +disp.ax_.set_title("Validation Curve for ElasticNet (R^2 Score)") +disp.ax_.set_xlabel(r"alpha (regularization strength)") +disp.ax_.set_ylabel("R^2 Score") +disp.ax_.set_ylim(0.0, 1.1) + +plt.show() From d794f2e84d410b8355b8b1fd4eb797660a568b22 Mon Sep 17 00:00:00 2001 From: saldanhad Date: Sun, 29 Sep 2024 02:04:58 +0530 Subject: [PATCH 5/9] update link in conf.py --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index c15bf3e0852f8..cc92fc6ccd08b 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -448,7 +448,7 @@ def add_js_css_files(app, pagename, templatename, context, doctree): "auto_examples/model_selection/plot_grid_search_text_feature_extraction.py" ), "auto_examples/model_selection/plot_validation_curve.py": ( - "auto_examples/model_selection/plot_train_error_vs_test_error.py" + "auto_examples/model_selection/plot_train_error_vs_test_error_and_validation_curve.py" ), "auto_examples/miscellaneous/plot_changed_only_pprint_parameter": ( "auto_examples/miscellaneous/plot_estimator_representation" From ac8d7776cf169d071d245b1c6b8eab92908fd5f6 Mon Sep 17 00:00:00 2001 From: saldanhad Date: Wed, 2 Oct 2024 01:21:34 +0530 Subject: [PATCH 6/9] implement changes post review --- doc/conf.py | 2 +- .../plot_train_error_vs_test_error.py | 77 +++++++++---------- 2 files changed, 36 insertions(+), 43 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 32b5dfc788f0a..f1abff0b1b4b9 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -448,7 +448,7 @@ def add_js_css_files(app, pagename, templatename, context, doctree): "auto_examples/model_selection/plot_grid_search_text_feature_extraction.py" ), "auto_examples/model_selection/plot_validation_curve.py": ( - "auto_examples/model_selection/plot_train_error_vs_test_error_and_validation_curve.py" + "auto_examples/model_selection/plot_train_error_vs_test_error.py" ), "auto_examples/datasets/plot_digits_last_image.py": ( "auto_examples/exercises/plot_digits_classification_exercises.py" diff --git a/examples/model_selection/plot_train_error_vs_test_error.py b/examples/model_selection/plot_train_error_vs_test_error.py index 8a24126852d35..5abcc104cf058 100644 --- a/examples/model_selection/plot_train_error_vs_test_error.py +++ b/examples/model_selection/plot_train_error_vs_test_error.py @@ -57,36 +57,6 @@ enet.set_params(alpha=alpha_optim) coef_ = enet.fit(X, y).coef_ -# %% -# Plot results functions -# ---------------------- - -import matplotlib.pyplot as plt - -plt.subplot(2, 1, 1) -plt.semilogx(alphas, train_errors, label="Train") -plt.semilogx(alphas, test_errors, label="Test") -plt.vlines( - alpha_optim, - plt.ylim()[0], - np.max(test_errors), - color="k", - linewidth=3, - label="Optimum on test", -) -plt.legend(loc="lower right") -plt.ylim([0, 1.2]) -plt.xlabel("Regularization parameter") -plt.ylabel("Performance") - -# Show estimated coef_ vs true coef 
-plt.subplot(2, 1, 2) -plt.plot(coef, label="True coef") -plt.plot(coef_, label="Estimated coef") -plt.legend() -plt.subplots_adjust(0.09, 0.04, 0.94, 0.94, 0.26, 0.26) -plt.show() - # %% # Plotting Validation Curves # ------------------------------------------------------------- @@ -103,28 +73,51 @@ # need for manual iteration and plotting, and providing a clear, consistent # visualization of model performance. +import matplotlib.pyplot as plt from sklearn.model_selection import ValidationCurveDisplay -# Define the range of alphas (regularization strength) to explore alphas = np.logspace(-5, 1, 60) -# Use the ValidationCurveDisplay to automatically plot the train and test scores disp = ValidationCurveDisplay.from_estimator( - enet, # ElasticNet model - X_train, # Training data - y_train, # Training target - param_name="alpha", # Hyperparameter to vary - param_range=alphas, # Range of alpha values - scoring="r2", # Scoring metric, R^2 in this case - n_jobs=-1, # Use all available CPUs - score_type="both", # Plot both training and test scores + enet, + X_train, + y_train, + param_name="alpha", + param_range=alphas, + scoring="r2", + n_jobs=2, + score_type="both", ) -# Customize the display disp.ax_.set_title("Validation Curve for ElasticNet (R^2 Score)") disp.ax_.set_xlabel(r"alpha (regularization strength)") disp.ax_.set_ylabel("R^2 Score") -disp.ax_.set_ylim(0.0, 1.1) +disp.ax_.set_ylim(-1.0, 1.2) +disp.ax_.vlines( + alpha_optim, + disp.ax_.get_ylim()[0], + np.max(test_errors), + color="k", + linewidth=3, + label="Optimum on test", +) +disp.ax_.legend(loc="lower right") + +plt.show() + +# %% +# Plotting Performance Comparison Curves +# ------------------------------------------------------------- +# This plot compares the true coefficients (coef) with the estimated coefficients (coef_) +# from the model. It visually helps assess how well the model has captured the +# underlying patterns in the data. +plt.plot(coef, label="True coef") +plt.plot(coef_, label="Estimated coef") +plt.legend() +plt.title("True vs Estimated Coefficients") +plt.xlabel("Feature Index") +plt.ylabel("Coefficient Value") +plt.subplots_adjust(0.09, 0.04, 0.94, 0.94, 0.26, 0.26) plt.show() From 3217ccc62c760181417aba129d9f1a34364eea03 Mon Sep 17 00:00:00 2001 From: saldanhad Date: Wed, 2 Oct 2024 01:22:32 +0530 Subject: [PATCH 7/9] delete duplicate file --- ...rror_vs_test_error_and_validation_curve.py | 130 ------------------ 1 file changed, 130 deletions(-) delete mode 100644 examples/model_selection/plot_train_error_vs_test_error_and_validation_curve.py diff --git a/examples/model_selection/plot_train_error_vs_test_error_and_validation_curve.py b/examples/model_selection/plot_train_error_vs_test_error_and_validation_curve.py deleted file mode 100644 index 8a24126852d35..0000000000000 --- a/examples/model_selection/plot_train_error_vs_test_error_and_validation_curve.py +++ /dev/null @@ -1,130 +0,0 @@ -""" -========================= -Train error vs Test error -========================= - -Illustration of how the performance of an estimator on unseen data (test data) -is not the same as the performance on training data. As the regularization -increases the performance on train decreases while the performance on test -is optimal within a range of values of the regularization parameter. -The example with an Elastic-Net regression model and the performance is -measured using the explained variance a.k.a. R^2. 
- -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -# %% -# Generate sample data -# -------------------- -import numpy as np - -from sklearn import linear_model -from sklearn.datasets import make_regression -from sklearn.model_selection import train_test_split - -n_samples_train, n_samples_test, n_features = 75, 150, 500 -X, y, coef = make_regression( - n_samples=n_samples_train + n_samples_test, - n_features=n_features, - n_informative=50, - shuffle=False, - noise=1.0, - coef=True, -) -X_train, X_test, y_train, y_test = train_test_split( - X, y, train_size=n_samples_train, test_size=n_samples_test, shuffle=False -) -# %% -# Compute train and test errors -# ----------------------------- -alphas = np.logspace(-5, 1, 60) -enet = linear_model.ElasticNet(l1_ratio=0.7, max_iter=10000) -train_errors = list() -test_errors = list() -for alpha in alphas: - enet.set_params(alpha=alpha) - enet.fit(X_train, y_train) - train_errors.append(enet.score(X_train, y_train)) - test_errors.append(enet.score(X_test, y_test)) - -i_alpha_optim = np.argmax(test_errors) -alpha_optim = alphas[i_alpha_optim] -print("Optimal regularization parameter : %s" % alpha_optim) - -# Estimate the coef_ on full data with optimal regularization parameter -enet.set_params(alpha=alpha_optim) -coef_ = enet.fit(X, y).coef_ - -# %% -# Plot results functions -# ---------------------- - -import matplotlib.pyplot as plt - -plt.subplot(2, 1, 1) -plt.semilogx(alphas, train_errors, label="Train") -plt.semilogx(alphas, test_errors, label="Test") -plt.vlines( - alpha_optim, - plt.ylim()[0], - np.max(test_errors), - color="k", - linewidth=3, - label="Optimum on test", -) -plt.legend(loc="lower right") -plt.ylim([0, 1.2]) -plt.xlabel("Regularization parameter") -plt.ylabel("Performance") - -# Show estimated coef_ vs true coef -plt.subplot(2, 1, 2) -plt.plot(coef, label="True coef") -plt.plot(coef_, label="Estimated coef") -plt.legend() -plt.subplots_adjust(0.09, 0.04, 0.94, 0.94, 0.26, 0.26) -plt.show() - -# %% -# Plotting Validation Curves -# ------------------------------------------------------------- -# In this plot, you can see the training and validation scores -# of the ElasticNet model for different values of regularization -# parameter alpha. As can be inferred from the plot, for very low values -# of alpha (close to zero), the regularization is weak, meaning the model -# fits the training data very closely, leading to high training scores but lower -# validation scores. This is a case of overfitting, where the model captures -# noise in the training data rather than the underlying pattern. -# -# Using the ``ValidationCurveDisplay`` class helps by automating the plotting of -# trainingand validation scores across a range of alpha values, eliminating the -# need for manual iteration and plotting, and providing a clear, consistent -# visualization of model performance. 
- - -from sklearn.model_selection import ValidationCurveDisplay - -# Define the range of alphas (regularization strength) to explore -alphas = np.logspace(-5, 1, 60) - -# Use the ValidationCurveDisplay to automatically plot the train and test scores -disp = ValidationCurveDisplay.from_estimator( - enet, # ElasticNet model - X_train, # Training data - y_train, # Training target - param_name="alpha", # Hyperparameter to vary - param_range=alphas, # Range of alpha values - scoring="r2", # Scoring metric, R^2 in this case - n_jobs=-1, # Use all available CPUs - score_type="both", # Plot both training and test scores -) - -# Customize the display -disp.ax_.set_title("Validation Curve for ElasticNet (R^2 Score)") -disp.ax_.set_xlabel(r"alpha (regularization strength)") -disp.ax_.set_ylabel("R^2 Score") -disp.ax_.set_ylim(0.0, 1.1) - -plt.show() From 24c904ab22624f668efcb9b85015756f5d5c2a97 Mon Sep 17 00:00:00 2001 From: saldanhad Date: Wed, 2 Oct 2024 01:26:46 +0530 Subject: [PATCH 8/9] fix linting --- examples/model_selection/plot_train_error_vs_test_error.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/model_selection/plot_train_error_vs_test_error.py b/examples/model_selection/plot_train_error_vs_test_error.py index 5abcc104cf058..be6ac0c4031f9 100644 --- a/examples/model_selection/plot_train_error_vs_test_error.py +++ b/examples/model_selection/plot_train_error_vs_test_error.py @@ -109,8 +109,8 @@ # %% # Plotting Performance Comparison Curves # ------------------------------------------------------------- -# This plot compares the true coefficients (coef) with the estimated coefficients (coef_) -# from the model. It visually helps assess how well the model has captured the +# This plot compares the true coefficients (coef) with the estimated coefficients +# (coef_) from the model. It visually helps assess how well the model has captured the # underlying patterns in the data. 
plt.plot(coef, label="True coef") From 5376ea22cb42f0b305ba741d7e56c62e40ccc882 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 11 Oct 2024 17:37:36 +0200 Subject: [PATCH 9/9] improve narrative of the example and improve visual by tweeking data and model --- doc/conf.py | 4 +- .../plot_train_error_vs_test_error.py | 191 +++++++++++------- 2 files changed, 119 insertions(+), 76 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 079c82422b0a7..278b588c103b5 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -447,8 +447,8 @@ def add_js_css_files(app, pagename, templatename, context, doctree): "auto_examples/model_selection/grid_search_text_feature_extraction": ( "auto_examples/model_selection/plot_grid_search_text_feature_extraction" ), - "auto_examples/model_selection/plot_validation_curve.py": ( - "auto_examples/model_selection/plot_train_error_vs_test_error.py" + "auto_examples/model_selection/plot_validation_curve": ( + "auto_examples/model_selection/plot_train_error_vs_test_error" ), "auto_examples/datasets/plot_digits_last_image": ( "auto_examples/exercises/plot_digits_classification_exercises" diff --git a/examples/model_selection/plot_train_error_vs_test_error.py b/examples/model_selection/plot_train_error_vs_test_error.py index be6ac0c4031f9..a64b4ca94846e 100644 --- a/examples/model_selection/plot_train_error_vs_test_error.py +++ b/examples/model_selection/plot_train_error_vs_test_error.py @@ -1,15 +1,18 @@ """ -========================= -Train error vs Test error -========================= - -Illustration of how the performance of an estimator on unseen data (test data) -is not the same as the performance on training data. As the regularization -increases the performance on train decreases while the performance on test -is optimal within a range of values of the regularization parameter. -The example with an Elastic-Net regression model and the performance is -measured using the explained variance a.k.a. R^2. - +========================================================= +Effect of model regularization on training and test error +========================================================= + +In this example, we evaluate the impact of the regularization parameter in a +linear model called :class:`~sklearn.linear_model.ElasticNet`. To carry out this +evaluation, we use a validation curve using +:class:`~sklearn.model_selection.ValidationCurveDisplay`. This curve shows the +training and test scores of the model for different values of the regularization +parameter. + +Once we identify the optimal regularization parameter, we compare the true and +estimated coefficients of the model to determine if the model is able to recover +the coefficients from the noisy input data. """ # Authors: The scikit-learn developers @@ -18,67 +21,75 @@ # %% # Generate sample data # -------------------- -import numpy as np - -from sklearn import linear_model +# +# We generate a regression dataset that contains many features relative to the +# number of samples. However, only 10% of the features are informative. In this context, +# linear models exposing L1 penalization are commonly used to recover a sparse +# set of coefficients. 
from sklearn.datasets import make_regression from sklearn.model_selection import train_test_split -n_samples_train, n_samples_test, n_features = 75, 150, 500 -X, y, coef = make_regression( +n_samples_train, n_samples_test, n_features = 150, 300, 500 +X, y, true_coef = make_regression( n_samples=n_samples_train + n_samples_test, n_features=n_features, n_informative=50, shuffle=False, noise=1.0, coef=True, + random_state=42, ) X_train, X_test, y_train, y_test = train_test_split( X, y, train_size=n_samples_train, test_size=n_samples_test, shuffle=False ) -# %% -# Compute train and test errors -# ----------------------------- -alphas = np.logspace(-5, 1, 60) -enet = linear_model.ElasticNet(l1_ratio=0.7, max_iter=10000) -train_errors = list() -test_errors = list() -for alpha in alphas: - enet.set_params(alpha=alpha) - enet.fit(X_train, y_train) - train_errors.append(enet.score(X_train, y_train)) - test_errors.append(enet.score(X_test, y_test)) - -i_alpha_optim = np.argmax(test_errors) -alpha_optim = alphas[i_alpha_optim] -print("Optimal regularization parameter : %s" % alpha_optim) - -# Estimate the coef_ on full data with optimal regularization parameter -enet.set_params(alpha=alpha_optim) -coef_ = enet.fit(X, y).coef_ # %% -# Plotting Validation Curves -# ------------------------------------------------------------- -# In this plot, you can see the training and validation scores -# of the ElasticNet model for different values of regularization -# parameter alpha. As can be inferred from the plot, for very low values -# of alpha (close to zero), the regularization is weak, meaning the model -# fits the training data very closely, leading to high training scores but lower -# validation scores. This is a case of overfitting, where the model captures -# noise in the training data rather than the underlying pattern. +# Model definition +# ---------------- +# +# Here, we do not use a model that only exposes an L1 penalty. Instead, we use +# an :class:`~sklearn.linear_model.ElasticNet` model that exposes both L1 and L2 +# penalties. +# +# We fix the `l1_ratio` parameter such that the solution found by the model is still +# sparse. Therefore, this type of model tries to find a sparse solution but at the same +# time also tries to shrink all coefficients towards zero. # -# Using the ``ValidationCurveDisplay`` class helps by automating the plotting of -# trainingand validation scores across a range of alpha values, eliminating the -# need for manual iteration and plotting, and providing a clear, consistent -# visualization of model performance. +# In addition, we force the coefficients of the model to be positive since we know that +# `make_regression` generates a response with a positive signal. So we use this +# pre-knowledge to get a better model. -import matplotlib.pyplot as plt +from sklearn.linear_model import ElasticNet + +enet = ElasticNet(l1_ratio=0.9, positive=True, max_iter=10_000) + + +# %% +# Evaluate the impact of the regularization parameter +# --------------------------------------------------- +# +# To evaluate the impact of the regularization parameter, we use a validation +# curve. This curve shows the training and test scores of the model for different +# values of the regularization parameter. +# +# The regularization `alpha` is a parameter applied to the coefficients of the model: +# when it tends to zero, no regularization is applied and the model tries to fit the +# training data with the least amount of error. However, it leads to overfitting when +# features are noisy. 
When `alpha` increases, the model coefficients are constrained, +# and thus the model cannot fit the training data as closely, avoiding overfitting. +# However, if too much regularization is applied, the model underfits the data and +# is not able to properly capture the signal. +# +# The validation curve helps in finding a good trade-off between both extremes: the +# model is not regularized and thus flexible enough to fit the signal, but not too +# flexible to overfit. The :class:`~sklearn.model_selection.ValidationCurveDisplay` +# allows us to display the training and validation scores across a range of alpha +# values. +import numpy as np from sklearn.model_selection import ValidationCurveDisplay alphas = np.logspace(-5, 1, 60) - disp = ValidationCurveDisplay.from_estimator( enet, X_train, @@ -89,35 +100,67 @@ n_jobs=2, score_type="both", ) +disp.ax_.set( + title=r"Validation Curve for ElasticNet (R$^2$ Score)", + xlabel=r"alpha (regularization strength)", + ylabel="R$^2$ Score", +) -disp.ax_.set_title("Validation Curve for ElasticNet (R^2 Score)") -disp.ax_.set_xlabel(r"alpha (regularization strength)") -disp.ax_.set_ylabel("R^2 Score") -disp.ax_.set_ylim(-1.0, 1.2) +test_scores_mean = disp.test_scores.mean(axis=1) +idx_avg_max_test_score = np.argmax(test_scores_mean) disp.ax_.vlines( - alpha_optim, + alphas[idx_avg_max_test_score], disp.ax_.get_ylim()[0], - np.max(test_errors), + test_scores_mean[idx_avg_max_test_score], color="k", - linewidth=3, - label="Optimum on test", + linewidth=2, + linestyle="--", + label=f"Optimum on test\n$\\alpha$ = {alphas[idx_avg_max_test_score]:.2e}", ) -disp.ax_.legend(loc="lower right") +_ = disp.ax_.legend(loc="lower right") -plt.show() +# %% +# To find the optimal regularization parameter, we can select the value of `alpha` +# that maximizes the validation score. +# +# Coefficients comparison +# ----------------------- +# +# Now that we have identified the optimal regularization parameter, we can compare the +# true coefficients and the estimated coefficients. +# +# First, let's set the regularization parameter to the optimal value and fit the +# model on the training data. In addition, we'll show the test score for this model. +enet.set_params(alpha=alphas[idx_avg_max_test_score]).fit(X_train, y_train) +print( + f"Test score: {enet.score(X_test, y_test):.3f}", +) # %% -# Plotting Performance Comparison Curves -# ------------------------------------------------------------- -# This plot compares the true coefficients (coef) with the estimated coefficients -# (coef_) from the model. It visually helps assess how well the model has captured the -# underlying patterns in the data. - -plt.plot(coef, label="True coef") -plt.plot(coef_, label="Estimated coef") -plt.legend() -plt.title("True vs Estimated Coefficients") -plt.xlabel("Feature Index") -plt.ylabel("Coefficient Value") -plt.subplots_adjust(0.09, 0.04, 0.94, 0.94, 0.26, 0.26) +# Now, we plot the true coefficients and the estimated coefficients. +import matplotlib.pyplot as plt + +fig, axs = plt.subplots(ncols=2, figsize=(12, 6), sharex=True, sharey=True) +for ax, coef, title in zip(axs, [true_coef, enet.coef_], ["True", "Model"]): + ax.stem(coef) + ax.set( + title=f"{title} Coefficients", + xlabel="Feature Index", + ylabel="Coefficient Value", + ) +fig.suptitle( + "Comparison of the coefficients of the true generative model and \n" + "the estimated elastic net coefficients" +) + plt.show() + +# %% +# While the original coefficients are sparse, the estimated coefficients are not +# as sparse. 
The reason is that we fixed the `l1_ratio` parameter to 0.9. We could
+# force the model towards a sparser solution by increasing the `l1_ratio` parameter.
+#
+# However, we observe that for coefficients that are close to zero in the true
+# generative model, the estimated coefficients are shrunk towards zero. So we do not
+# recover the true coefficients exactly, but we get a sensible outcome in line with
+# the performance obtained on the test set.
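
The closing paragraph of the example states that increasing `l1_ratio` would push the
elastic net towards a sparser solution, but the example does not show that check. Below
is a minimal sketch of it, reusing the same data-generating setup as the example; the
fixed `alpha = 0.01` is a hypothetical placeholder, whereas the example picks the
optimum from the validation curve.

import numpy as np

from sklearn.datasets import make_regression
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split

# Same data-generating setup as the example: 150 train + 300 test samples,
# 500 features of which 50 are informative.
X, y, true_coef = make_regression(
    n_samples=450,
    n_features=500,
    n_informative=50,
    shuffle=False,
    noise=1.0,
    coef=True,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=150, test_size=300, shuffle=False
)

# Hypothetical fixed regularization strength; the example instead selects the
# optimum alpha from the validation curve.
alpha = 0.01
for l1_ratio in (0.9, 0.95, 1.0):
    enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, positive=True, max_iter=10_000)
    enet.fit(X_train, y_train)
    print(
        f"l1_ratio={l1_ratio}: {np.count_nonzero(enet.coef_)} non-zero coefficients, "
        f"test R^2 = {enet.score(X_test, y_test):.3f}"
    )

With `alpha` held fixed, moving `l1_ratio` towards 1 trades the L2 shrinkage for pure
L1 selection, so the number of non-zero coefficients should drop closer to the 50
informative features used by the generative model.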