From e40b45d8f35c79b2f10b32bfa34c5ffa538d829e Mon Sep 17 00:00:00 2001
From: rachit mehta <23110261@iitgn.ac.in>
Date: Tue, 8 Oct 2024 20:36:33 +0530
Subject: [PATCH] DOC improvements in plot_lasso_lasso_lars_elasticnet_path.py

---
 .../plot_lasso_lasso_lars_elasticnet_path.py | 238 +++++++++++-------
 1 file changed, 143 insertions(+), 95 deletions(-)

diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py
index 44ae64c4c2811..cbae0ead12858 100644
--- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py
+++ b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py
@@ -1,136 +1,184 @@
-"""
-========================================
-Lasso, Lasso-LARS, and Elastic Net paths
-========================================
+'''
+# Regularization Paths for Lasso, Lasso-LARS, and ElasticNet
 
-This example shows how to compute the "paths" of coefficients along the Lasso,
-Lasso-LARS, and Elastic Net regularization paths. In other words, it shows the
-relationship between the regularization parameter (alpha) and the coefficients.
+In this example, we explore and compare the regularization paths of three
+regularized linear models:
+- :class:`~sklearn.linear_model.Lasso`
+- :class:`~sklearn.linear_model.LassoLars`
+- :class:`~sklearn.linear_model.ElasticNet`
 
-Lasso and Lasso-LARS impose a sparsity constraint on the coefficients,
-encouraging some of them to be zero. Elastic Net is a generalization of
-Lasso that adds an L2 penalty term to the L1 penalty term. This allows for
-some coefficients to be non-zero while still encouraging sparsity.
+## What is a Regularization Path?
 
-Lasso and Elastic Net use a coordinate descent method to compute the paths, while
-Lasso-LARS uses the LARS algorithm to compute the paths.
+A regularization path is a plot of the model coefficients as a function of the
+regularization parameter (alpha). For models like Lasso and ElasticNet, the path shows
+how the coefficients are shrunk towards zero as the regularization becomes stronger,
+which helps with feature selection and model interpretability.
 
-The paths are computed using :func:`~sklearn.linear_model.lasso_path`,
-:func:`~sklearn.linear_model.lars_path`, and :func:`~sklearn.linear_model.enet_path`.
+We compare Lasso vs ElasticNet and Lasso vs Lasso-LARS, focusing on their
+regularization paths.
+'''
 
-The results show different comparison plots:
+from itertools import cycle
+
+import matplotlib.pyplot as plt
+
+from sklearn.datasets import load_diabetes
+from sklearn.linear_model import enet_path, lars_path, lasso_path
 
-- Compare Lasso and Lasso-LARS
-- Compare Lasso and Elastic Net
-- Compare Lasso with positive Lasso
-- Compare LARS and Positive LARS
-- Compare Elastic Net and positive Elastic Net
+# Load the dataset
+X, y = load_diabetes(return_X_y=True)
+X /= X.std(axis=0)  # Standardize data (scale each feature to unit variance)
 
-Each plot shows how the model coefficients vary as the regularization strength changes,
-offering insight into the behavior of these models
-under different constraints.
-"""
+'''
+### 1. Lasso and ElasticNet: Comparing Regularization Paths
 
-# Authors: The scikit-learn developers
-# SPDX-License-Identifier: BSD-3-Clause
+Lasso (Least Absolute Shrinkage and Selection Operator) uses L1 regularization, meaning it
+penalizes the absolute value of the coefficients. As a result, Lasso tends to produce sparse
+models, where some coefficients are exactly zero.
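+
+For reference, the objective minimized by Lasso (in scikit-learn's parametrization) is
+
+$$ \text{Lasso Loss} = \frac{1}{2 n_{\text{samples}}} \|y - Xw\|^2_2 + \alpha \|w\|_1 $$
+
+where $\alpha$ controls the strength of the L1 penalty.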
-from itertools import cycle
+ElasticNet, on the other hand, combines L1 and L2 regularization. The L2 penalty
+helps overcome some limitations of Lasso, particularly when features are highly correlated.
 
-import matplotlib.pyplot as plt
+$$ \text{ElasticNet Loss} = \frac{1}{2 n_{\text{samples}}} \|y - Xw\|^2_2 + \alpha \rho \|w\|_1 + \frac{\alpha (1 - \rho)}{2} \|w\|_2^2 $$
 
-from sklearn.datasets import load_diabetes
-from sklearn.linear_model import enet_path, lars_path, lasso_path
+where $\rho$ (the `l1_ratio` parameter) is the mix ratio between the Lasso (L1) and Ridge (L2) penalties.
+'''
 
-X, y = load_diabetes(return_X_y=True)
-X /= X.std(axis=0)  # Standardize data (easier to set the l1_ratio parameter)
+eps = 5e-3  # A smaller value of eps leads to a longer regularization path
 
-# Compute paths
+# Compute the regularization path for Lasso
+alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps)
 
-eps = 5e-3  # the smaller it is the longer is the path
+# Compute the regularization path for ElasticNet with l1_ratio=0.8
+l1_ratio = 0.8  # This controls the mix between L1 and L2 (Lasso and Ridge)
+alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=l1_ratio)
 
-print("Computing regularization path using the lasso...")
-alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps)
+# Plot the paths for Lasso and ElasticNet
+plt.figure(figsize=(10, 6))
+colors = cycle(["b", "r", "g", "c", "k"])
 
-print("Computing regularization path using the positive lasso...")
-alphas_positive_lasso, coefs_positive_lasso, _ = lasso_path(
-    X, y, eps=eps, positive=True
-)
+for coef_l, coef_e, c in zip(coefs_lasso, coefs_enet, colors):
+    plt.semilogx(alphas_lasso, coef_l, c=c)
+    plt.semilogx(alphas_enet, coef_e, linestyle="--", c=c)
 
-print("Computing regularization path using the LARS...")
-alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso")
+plt.xlabel("alpha")
+plt.ylabel("coefficients")
+plt.title("Lasso vs ElasticNet Regularization Path")
+plt.legend(["Lasso", "ElasticNet (L1 ratio = 0.8)"], loc="upper left")
+plt.axis("tight")
+plt.show()
+
+'''
+As alpha increases (stronger regularization), both Lasso and ElasticNet drive the
+coefficients towards zero. However, ElasticNet's combination of L1 and L2 penalties
+shrinks the coefficients more smoothly than Lasso. This allows ElasticNet to handle
+correlated features better, whereas Lasso may arbitrarily select one feature from a
+correlated group and set the others to zero.
+'''
 
-print("Computing regularization path using the positive LARS...")
-alphas_positive_lars, _, coefs_positive_lars = lars_path(
-    X, y, method="lasso", positive=True
-)
+'''
+### 2. Lasso vs Lasso-LARS: Regularization Path
 
-print("Computing regularization path using the elastic net...")
-alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=0.8)
+The main difference between Lasso and Lasso-LARS is the algorithm used to minimize the loss.
+Lasso (as computed by `lasso_path`) uses coordinate descent and solves the problem on a
+grid of alpha values, whereas Lasso-LARS (Least Angle Regression) computes the exact,
+piecewise-linear path by moving, at each step, along a direction equiangular to the
+features most correlated with the residual. The two regularization paths are very similar,
+but Lasso-LARS can be much faster when the number of features is much larger than the
+number of samples.
 
-print("Computing regularization path using the positive elastic net...")
-alphas_positive_enet, coefs_positive_enet, _ = enet_path(
-    X, y, eps=eps, l1_ratio=0.8, positive=True
-)
+Let's compute and compare their regularization paths.
+'''
 
-# Display results
+# Compute the regularization path for Lasso-LARS
+alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso")
 
-plt.figure(1)
+# Plot the paths for Lasso and Lasso-LARS
+plt.figure(figsize=(10, 6))
 colors = cycle(["b", "r", "g", "c", "k"])
+
 for coef_lasso, coef_lars, c in zip(coefs_lasso, coefs_lars, colors):
-    l1 = plt.semilogx(alphas_lasso, coef_lasso, c=c)
-    l2 = plt.semilogx(alphas_lars, coef_lars, linestyle="--", c=c)
+    plt.semilogx(alphas_lasso, coef_lasso, c=c)
+    plt.semilogx(alphas_lars, coef_lars, linestyle="--", c=c)
 
 plt.xlabel("alpha")
 plt.ylabel("coefficients")
-plt.title("Lasso and LARS Paths")
-plt.legend((l1[-1], l2[-1]), ("Lasso", "LARS"), loc="lower right")
+plt.title("Lasso vs Lasso-LARS Regularization Path")
+plt.legend(["Lasso", "Lasso-LARS"], loc="upper right")
 plt.axis("tight")
+plt.show()
 
-plt.figure(2)
-colors = cycle(["b", "r", "g", "c", "k"])
-for coef_l, coef_e, c in zip(coefs_lasso, coefs_enet, colors):
-    l1 = plt.semilogx(alphas_lasso, coef_l, c=c)
-    l2 = plt.semilogx(alphas_enet, coef_e, linestyle="--", c=c)
+'''
+As we can see, the paths for Lasso and Lasso-LARS are close to each other. The Lasso-LARS
+path looks piecewise linear because `lars_path` returns the exact breakpoints of the path,
+whereas the coordinate-descent path is evaluated on a fine grid of alpha values and
+therefore looks smoother. Both methods set some coefficients to exactly zero, but LARS
+moves in the direction of the features most correlated with the residual, which makes it
+well suited to sparse, high-dimensional problems.
+'''
 
-plt.xlabel("alpha")
-plt.ylabel("coefficients")
-plt.title("Lasso and Elastic-Net Paths")
-plt.legend((l1[-1], l2[-1]), ("Lasso", "Elastic-Net"), loc="lower right")
-plt.axis("tight")
+'''
+### 3. Positive Constraints
+Lasso, ElasticNet, and Lasso-LARS can also restrict the coefficients to be non-negative
+by specifying `positive=True`.
 
-plt.figure(3)
-for coef_l, coef_pl, c in zip(coefs_lasso, coefs_positive_lasso, colors):
-    l1 = plt.semilogy(alphas_lasso, coef_l, c=c)
-    l2 = plt.semilogy(alphas_positive_lasso, coef_pl, linestyle="--", c=c)
+Let's see how this constraint changes their regularization paths.
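+
+Formally, with `positive=True` the same objective as before is minimized subject to a
+non-negativity constraint on the coefficients; for Lasso this reads
+
+$$ \min_{w \ge 0} \frac{1}{2 n_{\text{samples}}} \|y - Xw\|^2_2 + \alpha \|w\|_1 $$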
+'''
 
-plt.xlabel("alpha")
-plt.ylabel("coefficients")
-plt.title("Lasso and positive Lasso")
-plt.legend((l1[-1], l2[-1]), ("Lasso", "positive Lasso"), loc="lower right")
-plt.axis("tight")
+alphas_positive_lasso, coefs_positive_lasso, _ = lasso_path(
+    X, y, eps=eps, positive=True
+)
+alphas_positive_enet, coefs_positive_enet, _ = enet_path(
+    X, y, eps=eps, l1_ratio=l1_ratio, positive=True
+)
+alphas_positive_lars, _, coefs_positive_lars = lars_path(
+    X, y, method="lasso", positive=True
+)
+
+# Plot all three comparisons in one row of subplots
+fig, axes = plt.subplots(1, 3, figsize=(18, 6))
 
-plt.figure(4)
 colors = cycle(["b", "r", "g", "c", "k"])
-for coef_lars, coef_positive_lars, c in zip(coefs_lars, coefs_positive_lars, colors):
-    l1 = plt.semilogx(alphas_lars, coef_lars, c=c)
-    l2 = plt.semilogx(alphas_positive_lars, coef_positive_lars, linestyle="--", c=c)
-plt.xlabel("alpha")
-plt.ylabel("coefficients")
-plt.title("LARS and Positive LARS")
-plt.legend((l1[-1], l2[-1]), ("LARS", "Positive LARS"), loc="lower right")
-plt.axis("tight")
+# First plot: Lasso vs Positive Lasso
+for coef_lasso, coef_positive_lasso, c in zip(coefs_lasso, coefs_positive_lasso, colors):
+    axes[0].semilogx(alphas_lasso, coef_lasso, c=c)
+    axes[0].semilogx(alphas_positive_lasso, coef_positive_lasso, linestyle="--", c=c)
+
+axes[0].set_xlabel("alpha")
+axes[0].set_ylabel("coefficients")
+axes[0].set_title("Lasso vs Positive Lasso")
+axes[0].legend(["Lasso", "Positive Lasso"], loc="upper right")
+axes[0].axis("tight")
 
-plt.figure(5)
+# Second plot: ElasticNet vs Positive ElasticNet
 for coef_e, coef_pe, c in zip(coefs_enet, coefs_positive_enet, colors):
-    l1 = plt.semilogx(alphas_enet, coef_e, c=c)
-    l2 = plt.semilogx(alphas_positive_enet, coef_pe, linestyle="--", c=c)
+    axes[1].semilogx(alphas_enet, coef_e, c=c)
+    axes[1].semilogx(alphas_positive_enet, coef_pe, linestyle="--", c=c)
 
-plt.xlabel("alpha")
-plt.ylabel("coefficients")
-plt.title("Elastic-Net and positive Elastic-Net")
-plt.legend((l1[-1], l2[-1]), ("Elastic-Net", "positive Elastic-Net"), loc="lower right")
-plt.axis("tight")
+axes[1].set_xlabel("alpha")
+axes[1].set_ylabel("coefficients")
+axes[1].set_title("ElasticNet vs Positive ElasticNet")
+axes[1].legend(["ElasticNet", "Positive ElasticNet"], loc="upper left")
+axes[1].axis("tight")
+
+# Third plot: Lasso-LARS vs Positive Lasso-LARS
+for coef_lars, coef_positive_lars, c in zip(coefs_lars, coefs_positive_lars, colors):
+    axes[2].semilogx(alphas_lars, coef_lars, c=c)
+    axes[2].semilogx(alphas_positive_lars, coef_positive_lars, linestyle="--", c=c)
+
+axes[2].set_xlabel("alpha")
+axes[2].set_ylabel("coefficients")
+axes[2].set_title("Lasso-LARS vs Positive Lasso-LARS")
+axes[2].legend(["Lasso-LARS", "Positive Lasso-LARS"], loc="upper right")
+axes[2].axis("tight")
+
+# Display the plots
+plt.tight_layout()
 plt.show()
+
+'''
+When we enforce the positivity constraint, the regularization paths differ because the
+coefficients are restricted to non-negative values. The change is most visible for
+coefficients that would otherwise have become negative.
+'''
+
+'''
+## Conclusion
+
+This example illustrates how the choice of penalty and solver impacts the regularization
+path. Lasso and ElasticNet differ in their penalties (L1 vs a mix of L1 and L2), while
+Lasso and Lasso-LARS differ in their solvers, with LARS being particularly efficient for
+high-dimensional problems.
+Additionally, the positivity constraint forces non-negative coefficients, which can lead
+to noticeably different paths for Lasso, ElasticNet, and Lasso-LARS.
+'''
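+
+'''
+### Appendix: A Rough Timing Comparison
+
+As an illustrative sketch (not a rigorous benchmark; the exact timings are machine
+dependent), we can time `lasso_path` and `lars_path` on a synthetic dataset generated
+with `make_regression`, where the number of features is much larger than the number of
+samples.
+'''
+
+from time import perf_counter
+
+from sklearn.datasets import make_regression
+
+# Synthetic wide dataset: many more features than samples
+X_wide, y_wide = make_regression(n_samples=100, n_features=5000, random_state=0)
+
+start = perf_counter()
+lasso_path(X_wide, y_wide, eps=eps)
+print(f"lasso_path (coordinate descent): {perf_counter() - start:.2f} s")
+
+start = perf_counter()
+lars_path(X_wide, y_wide, method="lasso")
+print(f"lars_path (LARS): {perf_counter() - start:.2f} s")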