From 194685aacb7404b5320172f64cd83e3b0d4553b8 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Wed, 24 Nov 2021 15:09:43 +0100
Subject: [PATCH 1/4] DOC convert to notebook style SVM C scaling example

---
 examples/svm/plot_svm_scale_c.py | 244 ++++++++++++++++---------------
 1 file changed, 126 insertions(+), 118 deletions(-)

diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py
index b7e367e45d531..9dde22d5d73ad 100644
--- a/examples/svm/plot_svm_scale_c.py
+++ b/examples/svm/plot_svm_scale_c.py
@@ -33,136 +33,144 @@
 Since our loss function is dependent on the amount of samples, the latter
 will influence the selected value of `C`.

-The question that arises is `How do we optimally adjust C to
-account for the different amount of training samples?`
-
-The figures below are used to illustrate the effect of scaling our
-`C` to compensate for the change in the number of samples, in the
-case of using an `l1` penalty, as well as the `l2` penalty.
-
-l1-penalty case
------------------
-In the `l1` case, theory says that prediction consistency
-(i.e. that under given hypothesis, the estimator
-learned predicts as well as a model knowing the true distribution)
-is not possible because of the bias of the `l1`. It does say, however,
-that model consistency, in terms of finding the right set of non-zero
-parameters as well as their signs, can be achieved by scaling
-`C1`.
-
-l2-penalty case
------------------
-The theory says that in order to achieve prediction consistency, the
-penalty parameter should be kept constant
-as the number of samples grow.
-
-Simulations
-------------
-
-The two figures below plot the values of `C` on the `x-axis` and the
-corresponding cross-validation scores on the `y-axis`, for several different
-fractions of a generated data-set.
-
-In the `l1` penalty case, the cross-validation-error correlates best with
-the test-error, when scaling our `C` with the number of samples, `n`,
-which can be seen in the first figure.
-
-For the `l2` penalty case, the best result comes from the case where `C`
-is not scaled.
-
-.. topic:: Note:
-
-   Two separate datasets are used for the two different plots. The reason
-   behind this is the `l1` case works better on sparse data, while `l2`
-   is better suited to the non-sparse case.
+The question that arises is "How do we optimally adjust C to
+account for the different number of training samples?"
+In the remainder of this example, we will investigate the effect of scaling
+the value of the regularization parameter `C` with regard to the number of
+samples for both the L1 and L2 penalties. We will generate some synthetic
+datasets that are appropriate for each type of regularization.
 """

 # Author: Andreas Mueller
 #         Jaques Grobler
 # License: BSD 3 clause

+# %%
+# L1-penalty case
+# ---------------
+# In the L1 case, theory says that prediction consistency (i.e. that under
+# given hypothesis, the estimator learned predicts as well as a model knowing
+# the true distribution) is not possible because of the bias of the L1. It
+# does say, however, that model consistency, in terms of finding the right set
+# of non-zero parameters as well as their signs, can be achieved by scaling
+# `C`.
+#
+# We will now check if by using synthetic data, we can show this effect. Let's
+# first generate some synthetic data. This dataset will be sparse, meaning
+# that only a few features will be informative and useful for the model.
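+#
+# As a rough sketch of the usual formulation (an aside added here for
+# clarity, not code taken from this example): with `penalty="l1"` and
+# `loss="squared_hinge"`, `LinearSVC` solves a problem of approximately the
+# form
+#
+#   min_w ||w||_1 + C * sum_i max(0, 1 - y_i * (x_i @ w)) ** 2
+#
+# where the sum runs over the n training samples. The data-fit term grows
+# with n while the penalty term does not, which is why the best-performing
+# `C` can be expected to shrink roughly like 1 / n.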
+from sklearn.datasets import make_classification
+
+n_samples, n_features = 100, 300
+X, y = make_classification(
+    n_samples=n_samples, n_features=n_features, n_informative=5, random_state=1
+)
+
+# %%
+# Now, we can define a linear SVC with the `l1` penalty.
+from sklearn.svm import LinearSVC
+
+penalty, loss = "l1", "squared_hinge"
+model_l1 = LinearSVC(penalty=penalty, loss=loss, dual=False, tol=1e-3)
+
+# %%
+# We will compute the mean test score for different values of `C`.
 import numpy as np
+import pandas as pd
+from sklearn.model_selection import validation_curve, ShuffleSplit
+
+Cs = np.logspace(-2.3, -1.3, 10)
+train_sizes = np.linspace(0.3, 0.7, 3)[::-1]
+
+results = {"C": Cs}
+for train_size in train_sizes:
+    cv = ShuffleSplit(train_size=train_size, test_size=0.3, n_splits=50, random_state=1)
+    train_scores, test_scores = validation_curve(
+        model_l1, X, y, param_name="C", param_range=Cs, cv=cv
+    )
+    results[f"fraction: {train_size}"] = test_scores.mean(axis=1)
+results = pd.DataFrame(results).set_index("C")
+
+# %%
+# Finally, we can plot the results with and without scaling the parameter `C`
+# depending on the size of the training set.
 import matplotlib.pyplot as plt

-from sklearn.svm import LinearSVC
-from sklearn.model_selection import ShuffleSplit
-from sklearn.model_selection import GridSearchCV
-from sklearn.utils import check_random_state
-from sklearn import datasets
+fig, axes = plt.subplots(nrows=2, sharey=True, figsize=(9, 10))
+
+# plot results without scaling C
+results.plot(ax=axes[0], logx=True)
+axes[0].set_ylabel("CV score")
+axes[0].set_title(f"scaling=No scaling, penalty={penalty!r}, loss={loss!r}")
+
+# plot results by scaling C
+for train_size_idx, fraction in enumerate(results):
+    results_scaled = results[fraction]
+    results_scaled.index *= float(n_samples * train_sizes[train_size_idx])
+    results_scaled.plot(ax=axes[1], logx=True, label=fraction)
+axes[1].legend()
+axes[1].set_title(f"scaling=1/n_samples, penalty={penalty!r}, loss={loss!r}")
+axes[1].set_ylabel("CV score")
+
+_ = fig.suptitle("Effect of scaling C with L1 penalty")
+
+# %%
+# Here, we observe that the cross-validation error correlates best with the
+# test error when scaling our `C` with the number of samples, `n`.
+#
+# L2-penalty case
+# ---------------
+# We can repeat a similar experiment with the `l2` penalty. In this case, we
+# don't need to use a sparse dataset.
+#
+# Here, the theory says that in order to achieve prediction
+# consistency, the penalty parameter should be kept constant as the number of
+# samples grows.
+#
+# So we will repeat the same experiment by creating a linear SVC classifier
+# with the `l2` penalty and check the test score via cross-validation and
+# plot the results with and without scaling the parameter `C`.
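+#
+# Mirroring the hedged sketch given in the L1 section (again an illustrative
+# aside, not code from this example), the L2-penalized problem is
+# approximately
+#
+#   min_w 0.5 * ||w||_2 ** 2 + C * sum_i max(0, 1 - y_i * (x_i @ w)) ** 2
+#
+# and the theory referenced above suggests keeping `C` fixed as the number
+# of samples n grows.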
+rng = np.random.RandomState(1)
+y = np.sign(0.5 - rng.rand(n_samples))
+X = rng.randn(n_samples, n_features // 5) + y[:, np.newaxis]
+X += 5 * rng.randn(n_samples, n_features // 5)
+
+# %%
+penalty = "l2"
+model_l2 = LinearSVC(penalty=penalty, loss=loss, dual=True)
+Cs = np.logspace(-4.5, -2, 10)
+
+results = {"C": Cs}
+for train_size in train_sizes:
+    cv = ShuffleSplit(train_size=train_size, test_size=0.3, n_splits=50, random_state=1)
+    train_scores, test_scores = validation_curve(
+        model_l2, X, y, param_name="C", param_range=Cs, cv=cv
+    )
+    results[f"fraction: {train_size}"] = test_scores.mean(axis=1)
+results = pd.DataFrame(results).set_index("C")
+
+# %%
+import matplotlib.pyplot as plt

-rnd = check_random_state(1)
+fig, axes = plt.subplots(nrows=2, sharey=True, figsize=(9, 10))

-# set up dataset
-n_samples = 100
-n_features = 300
+# plot results without scaling C
+results.plot(ax=axes[0], logx=True)
+axes[0].set_ylabel("CV score")
+axes[0].set_title(f"scaling=No scaling, penalty={penalty!r}, loss={loss!r}")

-# l1 data (only 5 informative features)
-X_1, y_1 = datasets.make_classification(
-    n_samples=n_samples, n_features=n_features, n_informative=5, random_state=1
-)
+# plot results by scaling C
+for train_size_idx, fraction in enumerate(results):
+    results_scaled = results[fraction]
+    results_scaled.index *= float(n_samples * train_sizes[train_size_idx])
+    results_scaled.plot(ax=axes[1], logx=True, label=fraction)
+axes[1].legend()
+axes[1].set_title(f"scaling=1/n_samples, penalty={penalty!r}, loss={loss!r}")
+axes[1].set_ylabel("CV score")
+
+_ = fig.suptitle("Effect of scaling C with L2 penalty")

-# l2 data: non sparse, but less features
-y_2 = np.sign(0.5 - rnd.rand(n_samples))
-X_2 = rnd.randn(n_samples, n_features // 5) + y_2[:, np.newaxis]
-X_2 += 5 * rnd.randn(n_samples, n_features // 5)
-
-clf_sets = [
-    (
-        LinearSVC(penalty="l1", loss="squared_hinge", dual=False, tol=1e-3),
-        np.logspace(-2.3, -1.3, 10),
-        X_1,
-        y_1,
-    ),
-    (
-        LinearSVC(penalty="l2", loss="squared_hinge", dual=True),
-        np.logspace(-4.5, -2, 10),
-        X_2,
-        y_2,
-    ),
-]
-
-colors = ["navy", "cyan", "darkorange"]
-lw = 2
-
-for clf, cs, X, y in clf_sets:
-    # set up the plot for each regressor
-    fig, axes = plt.subplots(nrows=2, sharey=True, figsize=(9, 10))
-
-    for k, train_size in enumerate(np.linspace(0.3, 0.7, 3)[::-1]):
-        param_grid = dict(C=cs)
-        # To get nice curve, we need a large number of iterations to
-        # reduce the variance
-        grid = GridSearchCV(
-            clf,
-            refit=False,
-            param_grid=param_grid,
-            cv=ShuffleSplit(
-                train_size=train_size, test_size=0.3, n_splits=50, random_state=1
-            ),
-        )
-        grid.fit(X, y)
-        scores = grid.cv_results_["mean_test_score"]
-
-        scales = [
-            (1, "No scaling"),
-            ((n_samples * train_size), "1/n_samples"),
-        ]
-
-        for ax, (scaler, name) in zip(axes, scales):
-            ax.set_xlabel("C")
-            ax.set_ylabel("CV Score")
-            grid_cs = cs * float(scaler)  # scale the C's
-            ax.semilogx(
-                grid_cs,
-                scores,
-                label="fraction %.2f" % train_size,
-                color=colors[k],
-                lw=lw,
-            )
-            ax.set_title(
-                "scaling=%s, penalty=%s, loss=%s" % (name, clf.penalty, clf.loss)
-            )
-
-    plt.legend(loc="best")
+# %%
+# So for the L2 penalty case, the best result comes from the case where `C` is
+# not scaled.

 plt.show()

From dd298f3533035a79bbb26214edfa6356b6808d76 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Tue, 15 Nov 2022 20:21:49 +0100
Subject: [PATCH 2/4] Apply suggestions from code review

Co-authored-by: Thomas J. Fan
---
 examples/svm/plot_svm_scale_c.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py
index 9dde22d5d73ad..12eb418dcce74 100644
--- a/examples/svm/plot_svm_scale_c.py
+++ b/examples/svm/plot_svm_scale_c.py
@@ -56,9 +56,9 @@
 # of non-zero parameters as well as their signs, can be achieved by scaling
 # `C`.
 #
-# We will now check if by using synthetic data, we can show this effect. Let's
-# first generate some synthetic data. This dataset will be sparse, meaning
-# that only a few features will be informative and useful for the model.
+# We will demonstrate this effect by using a synthetic dataset. This
+# dataset will be sparse, meaning that only a few features will be informative
+# and useful for the model.
 from sklearn.datasets import make_classification

 n_samples, n_features = 100, 300
@@ -70,8 +70,7 @@
 # Now, we can define a linear SVC with the `l1` penalty.
 from sklearn.svm import LinearSVC

-penalty, loss = "l1", "squared_hinge"
-model_l1 = LinearSVC(penalty=penalty, loss=loss, dual=False, tol=1e-3)
+model_l1 = LinearSVC(penalty="l1", loss="squared_hinge", dual=False, tol=1e-3)

 # %%
 # We will compute the mean test score for different values of `C`.
@@ -80,7 +79,7 @@
 from sklearn.model_selection import validation_curve, ShuffleSplit

 Cs = np.logspace(-2.3, -1.3, 10)
-train_sizes = np.linspace(0.3, 0.7, 3)[::-1]
+train_sizes = np.linspace(0.3, 0.7, 3)

 results = {"C": Cs}
 for train_size in train_sizes:
@@ -96,7 +95,7 @@
 # depending on the size of the training set.
 import matplotlib.pyplot as plt

-fig, axes = plt.subplots(nrows=2, sharey=True, figsize=(9, 10))
+fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 6))

From 80990f304bb666020fd9b03553274a527e17e866 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Tue, 15 Nov 2022 20:23:15 +0100
Subject: [PATCH 3/4] Apply suggestions from code review

Co-authored-by: Thomas J. Fan
---
 examples/svm/plot_svm_scale_c.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py
index 12eb418dcce74..f0ce705bef4fe 100644
--- a/examples/svm/plot_svm_scale_c.py
+++ b/examples/svm/plot_svm_scale_c.py
@@ -135,8 +135,7 @@
 X += 5 * rng.randn(n_samples, n_features // 5)

 # %%
-penalty = "l2"
-model_l2 = LinearSVC(penalty=penalty, loss=loss, dual=True)
+model_l2 = LinearSVC(penalty="l2", loss="squared_hinge", dual=True)
 Cs = np.logspace(-4.5, -2, 10)

 results = {"C": Cs}
@@ -151,7 +150,7 @@
 # %%
 import matplotlib.pyplot as plt

-fig, axes = plt.subplots(nrows=2, sharey=True, figsize=(9, 10))
+fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 6))

 # plot results without scaling C
 results.plot(ax=axes[0], logx=True)

From 5118900fffb9559dc175dc2b938a430aa03fe67c Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Tue, 15 Nov 2022 20:38:55 +0100
Subject: [PATCH 4/4] address thomas comments

---
 examples/svm/plot_svm_scale_c.py | 50 +++++++++++++++-----------------
 1 file changed, 24 insertions(+), 26 deletions(-)

diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py
index f0ce705bef4fe..4ba025cffac8e 100644
--- a/examples/svm/plot_svm_scale_c.py
+++ b/examples/svm/plot_svm_scale_c.py
@@ -80,36 +80,34 @@
 Cs = np.logspace(-2.3, -1.3, 10)
 train_sizes = np.linspace(0.3, 0.7, 3)
+labels = [f"fraction: {train_size}" for train_size in train_sizes]

 results = {"C": Cs}
-for train_size in train_sizes:
+for label, train_size in zip(labels, train_sizes):
     cv = ShuffleSplit(train_size=train_size, test_size=0.3, n_splits=50, random_state=1)
     train_scores, test_scores = validation_curve(
         model_l1, X, y, param_name="C", param_range=Cs, cv=cv
     )
-    results[f"fraction: {train_size}"] = test_scores.mean(axis=1)
-results = pd.DataFrame(results).set_index("C")
+    results[label] = test_scores.mean(axis=1)
+results = pd.DataFrame(results)

 # %%
-# Finally, we can plot the results with and without scaling the parameter `C`
-# depending on the size of the training set.
 import matplotlib.pyplot as plt

 fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 6))

 # plot results without scaling C
-results.plot(ax=axes[0], logx=True)
+results.plot(x="C", ax=axes[0], logx=True)
 axes[0].set_ylabel("CV score")
-axes[0].set_title(f"scaling=No scaling, penalty={penalty!r}, loss={loss!r}")
+axes[0].set_title("No scaling")

 # plot results by scaling C
-for train_size_idx, fraction in enumerate(results):
-    results_scaled = results[fraction]
-    results_scaled.index *= float(n_samples * train_sizes[train_size_idx])
-    results_scaled.plot(ax=axes[1], logx=True, label=fraction)
-axes[1].legend()
-axes[1].set_title(f"scaling=1/n_samples, penalty={penalty!r}, loss={loss!r}")
-axes[1].set_ylabel("CV score")
+for train_size_idx, label in enumerate(labels):
+    results_scaled = results[[label]].assign(
+        C_scaled=Cs * float(n_samples * train_sizes[train_size_idx])
+    )
+    results_scaled.plot(x="C_scaled", ax=axes[1], logx=True, label=label)
+axes[1].set_title("Scaling C by 1 / n_samples")

 _ = fig.suptitle("Effect of scaling C with L1 penalty")

@@ -138,14 +136,15 @@
 model_l2 = LinearSVC(penalty="l2", loss="squared_hinge", dual=True)
 Cs = np.logspace(-4.5, -2, 10)

+labels = [f"fraction: {train_size}" for train_size in train_sizes]
 results = {"C": Cs}
-for train_size in train_sizes:
+for label, train_size in zip(labels, train_sizes):
     cv = ShuffleSplit(train_size=train_size, test_size=0.3, n_splits=50, random_state=1)
     train_scores, test_scores = validation_curve(
         model_l2, X, y, param_name="C", param_range=Cs, cv=cv
     )
-    results[f"fraction: {train_size}"] = test_scores.mean(axis=1)
-results = pd.DataFrame(results).set_index("C")
+    results[label] = test_scores.mean(axis=1)
+results = pd.DataFrame(results)

 # %%
 import matplotlib.pyplot as plt

@@ -153,18 +152,17 @@
 fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 6))

 # plot results without scaling C
-results.plot(ax=axes[0], logx=True)
+results.plot(x="C", ax=axes[0], logx=True)
 axes[0].set_ylabel("CV score")
-axes[0].set_title(f"scaling=No scaling, penalty={penalty!r}, loss={loss!r}")
+axes[0].set_title("No scaling")

 # plot results by scaling C
-for train_size_idx, fraction in enumerate(results):
-    results_scaled = results[fraction]
-    results_scaled.index *= float(n_samples * train_sizes[train_size_idx])
-    results_scaled.plot(ax=axes[1], logx=True, label=fraction)
-axes[1].legend()
-axes[1].set_title(f"scaling=1/n_samples, penalty={penalty!r}, loss={loss!r}")
-axes[1].set_ylabel("CV score")
+for train_size_idx, label in enumerate(labels):
+    results_scaled = results[[label]].assign(
+        C_scaled=Cs * float(n_samples * train_sizes[train_size_idx])
+    )
+    results_scaled.plot(x="C_scaled", ax=axes[1], logx=True, label=label)
+axes[1].set_title("Scaling C by 1 / n_samples")

 _ = fig.suptitle("Effect of scaling C with L2 penalty")
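+
+# %%
+# A minimal usage sketch (added for illustration; the numbers below are
+# hypothetical and are not computed from the curves above). For the L1 case,
+# once a good `C` has been found on `n_train` samples, the scaling argument
+# suggests keeping `C * n` roughly constant when refitting on `m` samples:
+
+C_best, n_train, m = 0.02, 70, 1000  # hypothetical values for illustration
+C_rescaled = C_best * n_train / m  # L1 case: keep C * n roughly constant
+model_refit = LinearSVC(penalty="l1", loss="squared_hinge", dual=False, C=C_rescaled)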