From 194685aacb7404b5320172f64cd83e3b0d4553b8 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Wed, 24 Nov 2021 15:09:43 +0100
Subject: [PATCH 1/4] DOC convert to notebook style SVM C scaling example

---
 examples/svm/plot_svm_scale_c.py | 244 ++++++++++++++++---------------
 1 file changed, 126 insertions(+), 118 deletions(-)

diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py
index b7e367e45d531..9dde22d5d73ad 100644
--- a/examples/svm/plot_svm_scale_c.py
+++ b/examples/svm/plot_svm_scale_c.py
@@ -33,136 +33,144 @@
 Since our loss function is dependent on the amount of samples, the latter
 will influence the selected value of `C`.

-The question that arises is `How do we optimally adjust C to
-account for the different amount of training samples?`
-
-The figures below are used to illustrate the effect of scaling our
-`C` to compensate for the change in the number of samples, in the
-case of using an `l1` penalty, as well as the `l2` penalty.
-
-l1-penalty case
------------------
-In the `l1` case, theory says that prediction consistency
-(i.e. that under given hypothesis, the estimator
-learned predicts as well as a model knowing the true distribution)
-is not possible because of the bias of the `l1`. It does say, however,
-that model consistency, in terms of finding the right set of non-zero
-parameters as well as their signs, can be achieved by scaling
-`C1`.
-
-l2-penalty case
------------------
-The theory says that in order to achieve prediction consistency, the
-penalty parameter should be kept constant
-as the number of samples grow.
-
-Simulations
-------------
-
-The two figures below plot the values of `C` on the `x-axis` and the
-corresponding cross-validation scores on the `y-axis`, for several different
-fractions of a generated data-set.
-
-In the `l1` penalty case, the cross-validation-error correlates best with
-the test-error, when scaling our `C` with the number of samples, `n`,
-which can be seen in the first figure.
-
-For the `l2` penalty case, the best result comes from the case where `C`
-is not scaled.
-
-.. topic:: Note:
-
-   Two separate datasets are used for the two different plots. The reason
-   behind this is the `l1` case works better on sparse data, while `l2`
-   is better suited to the non-sparse case.
+The question that arises is "How do we optimally adjust C to
+account for the different number of training samples?"
+In the remainder of this example, we will investigate the effect of scaling
+the value of the regularization parameter `C` with regard to the number of
+samples for both the L1 and L2 penalties. We will generate some synthetic
+datasets that are appropriate for each type of regularization.
 """

 # Author: Andreas Mueller
 #         Jaques Grobler
 # License: BSD 3 clause

+# %%
+# L1-penalty case
+# ---------------
+# In the L1 case, theory says that prediction consistency (i.e. that under
+# given hypothesis, the estimator learned predicts as well as a model knowing
+# the true distribution) is not possible because of the bias of the L1. It
+# does say, however, that model consistency, in terms of finding the right set
+# of non-zero parameters as well as their signs, can be achieved by scaling
+# `C`.
+#
+# We will now check if by using synthetic data, we can show this effect. Let's
+# first generate some synthetic data. This dataset will be sparse, meaning
+# that only a few features will be informative and useful for the model.
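+#
+# As a rough sketch of the usual formulation (an aside added here for
+# clarity, not code taken from this example): with `penalty="l1"` and
+# `loss="squared_hinge"`, `LinearSVC` solves a problem of approximately the
+# form
+#
+#   min_w ||w||_1 + C * sum_i max(0, 1 - y_i * (x_i @ w)) ** 2
+#
+# where the sum runs over the n training samples. The data-fit term grows
+# with n while the penalty term does not, which is why the best-performing
+# `C` can be expected to shrink roughly like 1 / n.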
+from sklearn.datasets import make_classification
+
+n_samples, n_features = 100, 300
+X, y = make_classification(
+    n_samples=n_samples, n_features=n_features, n_informative=5, random_state=1
+)
+
+# %%
+# Now, we can define a linear SVC with the `l1` penalty.
+from sklearn.svm import LinearSVC
+
+penalty, loss = "l1", "squared_hinge"
+model_l1 = LinearSVC(penalty=penalty, loss=loss, dual=False, tol=1e-3)
+
+# %%
+# We will compute the mean test score for different values of `C`.
 import numpy as np
+import pandas as pd
+from sklearn.model_selection import validation_curve, ShuffleSplit
+
+Cs = np.logspace(-2.3, -1.3, 10)
+train_sizes = np.linspace(0.3, 0.7, 3)[::-1]
+
+results = {"C": Cs}
+for train_size in train_sizes:
+    cv = ShuffleSplit(train_size=train_size, test_size=0.3, n_splits=50, random_state=1)
+    train_scores, test_scores = validation_curve(
+        model_l1, X, y, param_name="C", param_range=Cs, cv=cv
+    )
+    results[f"fraction: {train_size}"] = test_scores.mean(axis=1)
+results = pd.DataFrame(results).set_index("C")
+
+# %%
+# Finally, we can plot the results with and without scaling the parameter `C`
+# depending on the size of the training set.
 import matplotlib.pyplot as plt

-from sklearn.svm import LinearSVC
-from sklearn.model_selection import ShuffleSplit
-from sklearn.model_selection import GridSearchCV
-from sklearn.utils import check_random_state
-from sklearn import datasets
+fig, axes = plt.subplots(nrows=2, sharey=True, figsize=(9, 10))
+
+# plot results without scaling C
+results.plot(ax=axes[0], logx=True)
+axes[0].set_ylabel("CV score")
+axes[0].set_title(f"scaling=No scaling, penalty={penalty!r}, loss={loss!r}")
+
+# plot results by scaling C
+for train_size_idx, fraction in enumerate(results):
+    results_scaled = results[fraction]
+    results_scaled.index *= float(n_samples * train_sizes[train_size_idx])
+    results_scaled.plot(ax=axes[1], logx=True, label=fraction)
+axes[1].legend()
+axes[1].set_title(f"scaling=1/n_samples, penalty={penalty!r}, loss={loss!r}")
+axes[1].set_ylabel("CV score")
+
+_ = fig.suptitle("Effect of scaling C with L1 penalty")
+
+# %%
+# Here, we observe that the cross-validation error correlates best with the
+# test error when scaling our `C` with the number of samples, `n`.
+#
+# L2-penalty case
+# ---------------
+# We can repeat a similar experiment with the `l2` penalty. In this case, we
+# don't need to use a sparse dataset.
+#
+# Here, the theory says that in order to achieve prediction
+# consistency, the penalty parameter should be kept constant as the number of
+# samples grows.
+#
+# So we will repeat the same experiment by creating a linear SVC classifier
+# with the `l2` penalty and check the test score via cross-validation and
+# plot the results with and without scaling the parameter `C`.
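+#
+# Mirroring the hedged sketch given in the L1 section (again an illustrative
+# aside, not code from this example), the L2-penalized problem is
+# approximately
+#
+#   min_w 0.5 * ||w||_2 ** 2 + C * sum_i max(0, 1 - y_i * (x_i @ w)) ** 2
+#
+# and the theory referenced above suggests keeping `C` fixed as the number
+# of samples n grows.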
+rng = np.random.RandomState(1)
+y = np.sign(0.5 - rng.rand(n_samples))
+X = rng.randn(n_samples, n_features // 5) + y[:, np.newaxis]
+X += 5 * rng.randn(n_samples, n_features // 5)
+
+# %%
+penalty = "l2"
+model_l2 = LinearSVC(penalty=penalty, loss=loss, dual=True)
+Cs = np.logspace(-4.5, -2, 10)
+
+results = {"C": Cs}
+for train_size in train_sizes:
+    cv = ShuffleSplit(train_size=train_size, test_size=0.3, n_splits=50, random_state=1)
+    train_scores, test_scores = validation_curve(
+        model_l2, X, y, param_name="C", param_range=Cs, cv=cv
+    )
+    results[f"fraction: {train_size}"] = test_scores.mean(axis=1)
+results = pd.DataFrame(results).set_index("C")
+
+# %%
+import matplotlib.pyplot as plt

-rnd = check_random_state(1)
+fig, axes = plt.subplots(nrows=2, sharey=True, figsize=(9, 10))

-# set up dataset
-n_samples = 100
-n_features = 300
+# plot results without scaling C
+results.plot(ax=axes[0], logx=True)
+axes[0].set_ylabel("CV score")
+axes[0].set_title(f"scaling=No scaling, penalty={penalty!r}, loss={loss!r}")

-# l1 data (only 5 informative features)
-X_1, y_1 = datasets.make_classification(
-    n_samples=n_samples, n_features=n_features, n_informative=5, random_state=1
-)
+# plot results by scaling C
+for train_size_idx, fraction in enumerate(results):
+    results_scaled = results[fraction]
+    results_scaled.index *= float(n_samples * train_sizes[train_size_idx])
+    results_scaled.plot(ax=axes[1], logx=True, label=fraction)
+axes[1].legend()
+axes[1].set_title(f"scaling=1/n_samples, penalty={penalty!r}, loss={loss!r}")
+axes[1].set_ylabel("CV score")
+
+_ = fig.suptitle("Effect of scaling C with L2 penalty")

-# l2 data: non sparse, but less features
-y_2 = np.sign(0.5 - rnd.rand(n_samples))
-X_2 = rnd.randn(n_samples, n_features // 5) + y_2[:, np.newaxis]
-X_2 += 5 * rnd.randn(n_samples, n_features // 5)
-
-clf_sets = [
-    (
-        LinearSVC(penalty="l1", loss="squared_hinge", dual=False, tol=1e-3),
-        np.logspace(-2.3, -1.3, 10),
-        X_1,
-        y_1,
-    ),
-    (
-        LinearSVC(penalty="l2", loss="squared_hinge", dual=True),
-        np.logspace(-4.5, -2, 10),
-        X_2,
-        y_2,
-    ),
-]
-
-colors = ["navy", "cyan", "darkorange"]
-lw = 2
-
-for clf, cs, X, y in clf_sets:
-    # set up the plot for each regressor
-    fig, axes = plt.subplots(nrows=2, sharey=True, figsize=(9, 10))
-
-    for k, train_size in enumerate(np.linspace(0.3, 0.7, 3)[::-1]):
-        param_grid = dict(C=cs)
-        # To get nice curve, we need a large number of iterations to
-        # reduce the variance
-        grid = GridSearchCV(
-            clf,
-            refit=False,
-            param_grid=param_grid,
-            cv=ShuffleSplit(
-                train_size=train_size, test_size=0.3, n_splits=50, random_state=1
-            ),
-        )
-        grid.fit(X, y)
-        scores = grid.cv_results_["mean_test_score"]
-
-        scales = [
-            (1, "No scaling"),
-            ((n_samples * train_size), "1/n_samples"),
-        ]
-
-        for ax, (scaler, name) in zip(axes, scales):
-            ax.set_xlabel("C")
-            ax.set_ylabel("CV Score")
-            grid_cs = cs * float(scaler)  # scale the C's
-            ax.semilogx(
-                grid_cs,
-                scores,
-                label="fraction %.2f" % train_size,
-                color=colors[k],
-                lw=lw,
-            )
-            ax.set_title(
-                "scaling=%s, penalty=%s, loss=%s" % (name, clf.penalty, clf.loss)
-            )
-
-    plt.legend(loc="best")
+# %%
+# So for the L2 penalty case, the best result comes from the case where `C` is
+# not scaled.

 plt.show()

From dd298f3533035a79bbb26214edfa6356b6808d76 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Tue, 15 Nov 2022 20:21:49 +0100
Subject: [PATCH 2/4] Apply suggestions from code review

Co-authored-by: Thomas J. Fan
---
 examples/svm/plot_svm_scale_c.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py
index 9dde22d5d73ad..12eb418dcce74 100644
--- a/examples/svm/plot_svm_scale_c.py
+++ b/examples/svm/plot_svm_scale_c.py
@@ -56,9 +56,9 @@
 # of non-zero parameters as well as their signs, can be achieved by scaling
 # `C`.
 #
-# We will now check if by using synthetic data, we can show this effect. Let's
-# first generate some synthetic data. This dataset will be sparse, meaning
-# that only a few features will be informative and useful for the model.
+# We will demonstrate this effect by using a synthetic dataset. This
+# dataset will be sparse, meaning that only a few features will be informative
+# and useful for the model.
 from sklearn.datasets import make_classification

 n_samples, n_features = 100, 300
@@ -70,8 +70,7 @@
 # Now, we can define a linear SVC with the `l1` penalty.
 from sklearn.svm import LinearSVC

-penalty, loss = "l1", "squared_hinge"
-model_l1 = LinearSVC(penalty=penalty, loss=loss, dual=False, tol=1e-3)
+model_l1 = LinearSVC(penalty="l1", loss="squared_hinge", dual=False, tol=1e-3)

 # %%
 # We will compute the mean test score for different values of `C`.
@@ -80,7 +79,7 @@
 from sklearn.model_selection import validation_curve, ShuffleSplit

 Cs = np.logspace(-2.3, -1.3, 10)
-train_sizes = np.linspace(0.3, 0.7, 3)[::-1]
+train_sizes = np.linspace(0.3, 0.7, 3)

 results = {"C": Cs}
 for train_size in train_sizes:
@@ -96,7 +95,7 @@
 # depending on the size of the training set.
 import matplotlib.pyplot as plt

-fig, axes = plt.subplots(nrows=2, sharey=True, figsize=(9, 10))
+fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 6))

From 80990f304bb666020fd9b03553274a527e17e866 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Tue, 15 Nov 2022 20:23:15 +0100
Subject: [PATCH 3/4] Apply suggestions from code review

Co-authored-by: Thomas J. Fan
---
 examples/svm/plot_svm_scale_c.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py
index 12eb418dcce74..f0ce705bef4fe 100644
--- a/examples/svm/plot_svm_scale_c.py
+++ b/examples/svm/plot_svm_scale_c.py
@@ -135,8 +135,7 @@
 X += 5 * rng.randn(n_samples, n_features // 5)

 # %%
-penalty = "l2"
-model_l2 = LinearSVC(penalty=penalty, loss=loss, dual=True)
+model_l2 = LinearSVC(penalty="l2", loss="squared_hinge", dual=True)
 Cs = np.logspace(-4.5, -2, 10)

 results = {"C": Cs}
@@ -151,7 +150,7 @@
 # %%
 import matplotlib.pyplot as plt

-fig, axes = plt.subplots(nrows=2, sharey=True, figsize=(9, 10))
+fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 6))

 # plot results without scaling C
 results.plot(ax=axes[0], logx=True)

From 5118900fffb9559dc175dc2b938a430aa03fe67c Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Tue, 15 Nov 2022 20:38:55 +0100
Subject: [PATCH 4/4] address thomas comments

---
 examples/svm/plot_svm_scale_c.py | 50 +++++++++++++++-----------------
 1 file changed, 24 insertions(+), 26 deletions(-)

diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py
index f0ce705bef4fe..4ba025cffac8e 100644
--- a/examples/svm/plot_svm_scale_c.py
+++ b/examples/svm/plot_svm_scale_c.py
@@ -80,36 +80,34 @@
 Cs = np.logspace(-2.3, -1.3, 10)
 train_sizes = np.linspace(0.3, 0.7, 3)
+labels = [f"fraction: {train_size}" for train_size in train_sizes]

 results = {"C": Cs}
-for train_size in train_sizes:
+for label, train_size in zip(labels, train_sizes):
     cv = ShuffleSplit(train_size=train_size, test_size=0.3, n_splits=50, random_state=1)
     train_scores, test_scores = validation_curve(
         model_l1, X, y, param_name="C", param_range=Cs, cv=cv
     )
-    results[f"fraction: {train_size}"] = test_scores.mean(axis=1)
-results = pd.DataFrame(results).set_index("C")
+    results[label] = test_scores.mean(axis=1)
+results = pd.DataFrame(results)

 # %%
-# Finally, we can plot the results with and without scaling the parameter `C`
-# depending on the size of the training set.
 import matplotlib.pyplot as plt

 fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 6))

 # plot results without scaling C
-results.plot(ax=axes[0], logx=True)
+results.plot(x="C", ax=axes[0], logx=True)
 axes[0].set_ylabel("CV score")
-axes[0].set_title(f"scaling=No scaling, penalty={penalty!r}, loss={loss!r}")
+axes[0].set_title("No scaling")

 # plot results by scaling C
-for train_size_idx, fraction in enumerate(results):
-    results_scaled = results[fraction]
-    results_scaled.index *= float(n_samples * train_sizes[train_size_idx])
-    results_scaled.plot(ax=axes[1], logx=True, label=fraction)
-axes[1].legend()
-axes[1].set_title(f"scaling=1/n_samples, penalty={penalty!r}, loss={loss!r}")
-axes[1].set_ylabel("CV score")
+for train_size_idx, label in enumerate(labels):
+    results_scaled = results[[label]].assign(
+        C_scaled=Cs * float(n_samples * train_sizes[train_size_idx])
+    )
+    results_scaled.plot(x="C_scaled", ax=axes[1], logx=True, label=label)
+axes[1].set_title("Scaling C by 1 / n_samples")

 _ = fig.suptitle("Effect of scaling C with L1 penalty")

@@ -138,14 +136,15 @@
 model_l2 = LinearSVC(penalty="l2", loss="squared_hinge", dual=True)
 Cs = np.logspace(-4.5, -2, 10)

+labels = [f"fraction: {train_size}" for train_size in train_sizes]
 results = {"C": Cs}
-for train_size in train_sizes:
+for label, train_size in zip(labels, train_sizes):
     cv = ShuffleSplit(train_size=train_size, test_size=0.3, n_splits=50, random_state=1)
     train_scores, test_scores = validation_curve(
         model_l2, X, y, param_name="C", param_range=Cs, cv=cv
     )
-    results[f"fraction: {train_size}"] = test_scores.mean(axis=1)
-results = pd.DataFrame(results).set_index("C")
+    results[label] = test_scores.mean(axis=1)
+results = pd.DataFrame(results)

 # %%
 import matplotlib.pyplot as plt

@@ -153,18 +152,17 @@
 fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 6))

 # plot results without scaling C
-results.plot(ax=axes[0], logx=True)
+results.plot(x="C", ax=axes[0], logx=True)
 axes[0].set_ylabel("CV score")
-axes[0].set_title(f"scaling=No scaling, penalty={penalty!r}, loss={loss!r}")
+axes[0].set_title("No scaling")

 # plot results by scaling C
-for train_size_idx, fraction in enumerate(results):
-    results_scaled = results[fraction]
-    results_scaled.index *= float(n_samples * train_sizes[train_size_idx])
-    results_scaled.plot(ax=axes[1], logx=True, label=fraction)
-axes[1].legend()
-axes[1].set_title(f"scaling=1/n_samples, penalty={penalty!r}, loss={loss!r}")
-axes[1].set_ylabel("CV score")
+for train_size_idx, label in enumerate(labels):
+    results_scaled = results[[label]].assign(
+        C_scaled=Cs * float(n_samples * train_sizes[train_size_idx])
+    )
+    results_scaled.plot(x="C_scaled", ax=axes[1], logx=True, label=label)
+axes[1].set_title("Scaling C by 1 / n_samples")

 _ = fig.suptitle("Effect of scaling C with L2 penalty")
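+
+# %%
+# A minimal usage sketch (added for illustration; the numbers below are
+# hypothetical and are not computed from the curves above). For the L1 case,
+# once a good `C` has been found on `n_train` samples, the scaling argument
+# suggests keeping `C * n` roughly constant when refitting on `m` samples:
+
+C_best, n_train, m = 0.02, 70, 1000  # hypothetical values for illustration
+C_rescaled = C_best * n_train / m  # L1 case: keep C * n roughly constant
+model_refit = LinearSVC(penalty="l1", loss="squared_hinge", dual=False, C=C_rescaled)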