From e40b45d8f35c79b2f10b32bfa34c5ffa538d829e Mon Sep 17 00:00:00 2001
From: rachit mehta <23110261@iitgn.ac.in>
Date: Tue, 8 Oct 2024 20:36:33 +0530
Subject: [PATCH] DOC improvements in plot_lasso_lasso_lars_elasticnet_path.py

---
 .../plot_lasso_lasso_lars_elasticnet_path.py | 238 +++++++++++-------
 1 file changed, 143 insertions(+), 95 deletions(-)

diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py
index 44ae64c4c2811..cbae0ead12858 100644
--- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py
+++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py
@@ -1,136 +1,184 @@
-"""
-========================================
-Lasso, Lasso-LARS, and Elastic Net paths
-========================================
+'''
+# Regularization Paths for Lasso, Lasso-LARS, and ElasticNet
 
-This example shows how to compute the "paths" of coefficients along the Lasso,
-Lasso-LARS, and Elastic Net regularization paths. In other words, it shows the
-relationship between the regularization parameter (alpha) and the coefficients.
+In this example, we explore and compare the regularization paths of three
+regularized linear models:
+- :class:`~sklearn.linear_model.Lasso`
+- :class:`~sklearn.linear_model.LassoLars`
+- :class:`~sklearn.linear_model.ElasticNet`
 
-Lasso and Lasso-LARS impose a sparsity constraint on the coefficients,
-encouraging some of them to be zero. Elastic Net is a generalization of
-Lasso that adds an L2 penalty term to the L1 penalty term. This allows for
-some coefficients to be non-zero while still encouraging sparsity.
+## What is a Regularization Path?
 
-Lasso and Elastic Net use a coordinate descent method to compute the paths, while
-Lasso-LARS uses the LARS algorithm to compute the paths.
+A regularization path is a plot of the model coefficients as a function of the
+regularization parameter (alpha). For models like Lasso and ElasticNet, the path shows
+how the coefficients are shrunk towards zero as the regularization becomes stronger,
+which helps with feature selection and model interpretability.
 
-The paths are computed using :func:`~sklearn.linear_model.lasso_path`,
-:func:`~sklearn.linear_model.lars_path`, and :func:`~sklearn.linear_model.enet_path`.
+We compare Lasso vs ElasticNet and Lasso vs Lasso-LARS, focusing on their
+regularization paths.
+'''
 
-The results show different comparison plots:
+from itertools import cycle
+
+import matplotlib.pyplot as plt
+
+from sklearn.datasets import load_diabetes
+from sklearn.linear_model import enet_path, lars_path, lasso_path
 
-- Compare Lasso and Lasso-LARS
-- Compare Lasso and Elastic Net
-- Compare Lasso with positive Lasso
-- Compare LARS and Positive LARS
-- Compare Elastic Net and positive Elastic Net
+# Load the dataset
+X, y = load_diabetes(return_X_y=True)
+X /= X.std(axis=0)  # Standardize data (scale each feature to unit variance)
 
-Each plot shows how the model coefficients vary as the regularization strength changes,
-offering insight into the behavior of these models
-under different constraints.
-"""
+'''
+### 1. Lasso and ElasticNet: Comparing Regularization Paths
 
-# Authors: The scikit-learn developers
-# SPDX-License-Identifier: BSD-3-Clause
+Lasso (Least Absolute Shrinkage and Selection Operator) uses L1 regularization, meaning it
+penalizes the absolute value of the coefficients. As a result, Lasso tends to produce sparse
+models, where some coefficients are exactly zero.
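+
+For reference, the objective minimized by Lasso (in scikit-learn's parametrization) is
+
+$$ \text{Lasso Loss} = \frac{1}{2 n_{\text{samples}}} \|y - Xw\|^2_2 + \alpha \|w\|_1 $$
+
+where $\alpha$ controls the strength of the L1 penalty.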
-from itertools import cycle
+ElasticNet, on the other hand, combines L1 and L2 regularization. The L2 penalty
+helps overcome some limitations of Lasso, particularly when features are highly correlated.
 
-import matplotlib.pyplot as plt
+$$ \text{ElasticNet Loss} = \frac{1}{2 n_{\text{samples}}} \|y - Xw\|^2_2 + \alpha \rho \|w\|_1 + \frac{\alpha (1 - \rho)}{2} \|w\|_2^2 $$
 
-from sklearn.datasets import load_diabetes
-from sklearn.linear_model import enet_path, lars_path, lasso_path
+where $\rho$ (the `l1_ratio` parameter) is the mix ratio between the Lasso (L1) and Ridge (L2) penalties.
+'''
 
-X, y = load_diabetes(return_X_y=True)
-X /= X.std(axis=0)  # Standardize data (easier to set the l1_ratio parameter)
+eps = 5e-3  # A smaller value of eps leads to a longer regularization path
 
-# Compute paths
+# Compute the regularization path for Lasso
+alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps)
 
-eps = 5e-3  # the smaller it is the longer is the path
+# Compute the regularization path for ElasticNet with l1_ratio=0.8
+l1_ratio = 0.8  # This controls the mix between L1 and L2 (Lasso and Ridge)
+alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=l1_ratio)
 
-print("Computing regularization path using the lasso...")
-alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps)
+# Plot the paths for Lasso and ElasticNet
+plt.figure(figsize=(10, 6))
+colors = cycle(["b", "r", "g", "c", "k"])
 
-print("Computing regularization path using the positive lasso...")
-alphas_positive_lasso, coefs_positive_lasso, _ = lasso_path(
-    X, y, eps=eps, positive=True
-)
+for coef_l, coef_e, c in zip(coefs_lasso, coefs_enet, colors):
+    plt.semilogx(alphas_lasso, coef_l, c=c)
+    plt.semilogx(alphas_enet, coef_e, linestyle="--", c=c)
 
-print("Computing regularization path using the LARS...")
-alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso")
+plt.xlabel("alpha")
+plt.ylabel("coefficients")
+plt.title("Lasso vs ElasticNet Regularization Path")
+plt.legend(["Lasso", "ElasticNet (L1 ratio = 0.8)"], loc="upper left")
+plt.axis("tight")
+plt.show()
+
+'''
+As alpha increases (stronger regularization), both Lasso and ElasticNet drive the
+coefficients towards zero. However, ElasticNet's combination of L1 and L2 penalties
+shrinks the coefficients more smoothly than Lasso. This allows ElasticNet to handle
+correlated features better, whereas Lasso may arbitrarily select one feature from a
+correlated group and set the others to zero.
+'''
 
-print("Computing regularization path using the positive LARS...")
-alphas_positive_lars, _, coefs_positive_lars = lars_path(
-    X, y, method="lasso", positive=True
-)
+'''
+### 2. Lasso vs Lasso-LARS: Regularization Path
 
-print("Computing regularization path using the elastic net...")
-alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=0.8)
+The main difference between Lasso and Lasso-LARS is the algorithm used to minimize the loss.
+Lasso (as computed by `lasso_path`) uses coordinate descent and solves the problem on a
+grid of alpha values, whereas Lasso-LARS (Least Angle Regression) computes the exact,
+piecewise-linear path by moving, at each step, along a direction equiangular to the
+features most correlated with the residual. The two regularization paths are very similar,
+but Lasso-LARS can be much faster when the number of features is much larger than the
+number of samples.
 
-print("Computing regularization path using the positive elastic net...")
-alphas_positive_enet, coefs_positive_enet, _ = enet_path(
-    X, y, eps=eps, l1_ratio=0.8, positive=True
-)
+Let's compute and compare their regularization paths.
+'''
 
-# Display results
+# Compute the regularization path for Lasso-LARS
+alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso")
 
-plt.figure(1)
+# Plot the paths for Lasso and Lasso-LARS
+plt.figure(figsize=(10, 6))
 colors = cycle(["b", "r", "g", "c", "k"])
+
 for coef_lasso, coef_lars, c in zip(coefs_lasso, coefs_lars, colors):
-    l1 = plt.semilogx(alphas_lasso, coef_lasso, c=c)
-    l2 = plt.semilogx(alphas_lars, coef_lars, linestyle="--", c=c)
+    plt.semilogx(alphas_lasso, coef_lasso, c=c)
+    plt.semilogx(alphas_lars, coef_lars, linestyle="--", c=c)
 
 plt.xlabel("alpha")
 plt.ylabel("coefficients")
-plt.title("Lasso and LARS Paths")
-plt.legend((l1[-1], l2[-1]), ("Lasso", "LARS"), loc="lower right")
+plt.title("Lasso vs Lasso-LARS Regularization Path")
+plt.legend(["Lasso", "Lasso-LARS"], loc="upper right")
 plt.axis("tight")
+plt.show()
 
-plt.figure(2)
-colors = cycle(["b", "r", "g", "c", "k"])
-for coef_l, coef_e, c in zip(coefs_lasso, coefs_enet, colors):
-    l1 = plt.semilogx(alphas_lasso, coef_l, c=c)
-    l2 = plt.semilogx(alphas_enet, coef_e, linestyle="--", c=c)
+'''
+As we can see, the paths for Lasso and Lasso-LARS are close to each other. The Lasso-LARS
+path looks piecewise linear because `lars_path` returns the exact breakpoints of the path,
+whereas the coordinate-descent path is evaluated on a fine grid of alpha values and
+therefore looks smoother. Both methods set some coefficients to exactly zero, but LARS
+moves in the direction of the features most correlated with the residual, which makes it
+well suited to sparse, high-dimensional problems.
+'''
 
-plt.xlabel("alpha")
-plt.ylabel("coefficients")
-plt.title("Lasso and Elastic-Net Paths")
-plt.legend((l1[-1], l2[-1]), ("Lasso", "Elastic-Net"), loc="lower right")
-plt.axis("tight")
+'''
+### 3. Positive Constraints
+Lasso, ElasticNet, and Lasso-LARS can also restrict the coefficients to be non-negative
+by specifying `positive=True`.
 
-plt.figure(3)
-for coef_l, coef_pl, c in zip(coefs_lasso, coefs_positive_lasso, colors):
-    l1 = plt.semilogy(alphas_lasso, coef_l, c=c)
-    l2 = plt.semilogy(alphas_positive_lasso, coef_pl, linestyle="--", c=c)
+Let's see how this constraint changes their regularization paths.
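+
+Formally, with `positive=True` the same objective as before is minimized subject to a
+non-negativity constraint on the coefficients; for Lasso this reads
+
+$$ \min_{w \ge 0} \frac{1}{2 n_{\text{samples}}} \|y - Xw\|^2_2 + \alpha \|w\|_1 $$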
+'''
 
-plt.xlabel("alpha")
-plt.ylabel("coefficients")
-plt.title("Lasso and positive Lasso")
-plt.legend((l1[-1], l2[-1]), ("Lasso", "positive Lasso"), loc="lower right")
-plt.axis("tight")
+alphas_positive_lasso, coefs_positive_lasso, _ = lasso_path(
+    X, y, eps=eps, positive=True
+)
+alphas_positive_enet, coefs_positive_enet, _ = enet_path(
+    X, y, eps=eps, l1_ratio=l1_ratio, positive=True
+)
+alphas_positive_lars, _, coefs_positive_lars = lars_path(
+    X, y, method="lasso", positive=True
+)
+
+# Plot all three comparisons in one row of subplots
+fig, axes = plt.subplots(1, 3, figsize=(18, 6))
 
-plt.figure(4)
 colors = cycle(["b", "r", "g", "c", "k"])
-for coef_lars, coef_positive_lars, c in zip(coefs_lars, coefs_positive_lars, colors):
-    l1 = plt.semilogx(alphas_lars, coef_lars, c=c)
-    l2 = plt.semilogx(alphas_positive_lars, coef_positive_lars, linestyle="--", c=c)
-plt.xlabel("alpha")
-plt.ylabel("coefficients")
-plt.title("LARS and Positive LARS")
-plt.legend((l1[-1], l2[-1]), ("LARS", "Positive LARS"), loc="lower right")
-plt.axis("tight")
+# First plot: Lasso vs Positive Lasso
+for coef_lasso, coef_positive_lasso, c in zip(coefs_lasso, coefs_positive_lasso, colors):
+    axes[0].semilogx(alphas_lasso, coef_lasso, c=c)
+    axes[0].semilogx(alphas_positive_lasso, coef_positive_lasso, linestyle="--", c=c)
+
+axes[0].set_xlabel("alpha")
+axes[0].set_ylabel("coefficients")
+axes[0].set_title("Lasso vs Positive Lasso")
+axes[0].legend(["Lasso", "Positive Lasso"], loc="upper right")
+axes[0].axis("tight")
 
-plt.figure(5)
+# Second plot: ElasticNet vs Positive ElasticNet
 for coef_e, coef_pe, c in zip(coefs_enet, coefs_positive_enet, colors):
-    l1 = plt.semilogx(alphas_enet, coef_e, c=c)
-    l2 = plt.semilogx(alphas_positive_enet, coef_pe, linestyle="--", c=c)
+    axes[1].semilogx(alphas_enet, coef_e, c=c)
+    axes[1].semilogx(alphas_positive_enet, coef_pe, linestyle="--", c=c)
 
-plt.xlabel("alpha")
-plt.ylabel("coefficients")
-plt.title("Elastic-Net and positive Elastic-Net")
-plt.legend((l1[-1], l2[-1]), ("Elastic-Net", "positive Elastic-Net"), loc="lower right")
-plt.axis("tight")
+axes[1].set_xlabel("alpha")
+axes[1].set_ylabel("coefficients")
+axes[1].set_title("ElasticNet vs Positive ElasticNet")
+axes[1].legend(["ElasticNet", "Positive ElasticNet"], loc="upper left")
+axes[1].axis("tight")
+
+# Third plot: Lasso-LARS vs Positive Lasso-LARS
+for coef_lars, coef_positive_lars, c in zip(coefs_lars, coefs_positive_lars, colors):
+    axes[2].semilogx(alphas_lars, coef_lars, c=c)
+    axes[2].semilogx(alphas_positive_lars, coef_positive_lars, linestyle="--", c=c)
+
+axes[2].set_xlabel("alpha")
+axes[2].set_ylabel("coefficients")
+axes[2].set_title("Lasso-LARS vs Positive Lasso-LARS")
+axes[2].legend(["Lasso-LARS", "Positive Lasso-LARS"], loc="upper right")
+axes[2].axis("tight")
+
+# Display the plots
+plt.tight_layout()
 plt.show()
+
+'''
+When we enforce the positivity constraint, the regularization paths differ because the
+coefficients are restricted to non-negative values. The change is most visible for
+coefficients that would otherwise have become negative.
+'''
+
+'''
+## Conclusion
+
+This example illustrates how the choice of penalty and solver impacts the regularization
+path. Lasso and ElasticNet differ in their penalties (L1 vs a mix of L1 and L2), while
+Lasso and Lasso-LARS differ in their solvers, with LARS being particularly efficient for
+high-dimensional problems.
+Additionally, the positivity constraint forces non-negative coefficients, which can lead
+to noticeably different paths for Lasso, ElasticNet, and Lasso-LARS.
+'''
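+
+'''
+### Appendix: A Rough Timing Comparison
+
+As an illustrative sketch (not a rigorous benchmark; the exact timings are machine
+dependent), we can time `lasso_path` and `lars_path` on a synthetic dataset generated
+with `make_regression`, where the number of features is much larger than the number of
+samples.
+'''
+
+from time import perf_counter
+
+from sklearn.datasets import make_regression
+
+# Synthetic wide dataset: many more features than samples
+X_wide, y_wide = make_regression(n_samples=100, n_features=5000, random_state=0)
+
+start = perf_counter()
+lasso_path(X_wide, y_wide, eps=eps)
+print(f"lasso_path (coordinate descent): {perf_counter() - start:.2f} s")
+
+start = perf_counter()
+lars_path(X_wide, y_wide, method="lasso")
+print(f"lars_path (LARS): {perf_counter() - start:.2f} s")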