diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py
index b7e367e45d531..4ba025cffac8e 100644
--- a/examples/svm/plot_svm_scale_c.py
+++ b/examples/svm/plot_svm_scale_c.py
@@ -33,136 +33,140 @@
Since our loss function is dependent on the amount of samples, the latter
will influence the selected value of `C`.
-The question that arises is `How do we optimally adjust C to
-account for the different amount of training samples?`
-
-The figures below are used to illustrate the effect of scaling our
-`C` to compensate for the change in the number of samples, in the
-case of using an `l1` penalty, as well as the `l2` penalty.
-
-l1-penalty case
------------------
-In the `l1` case, theory says that prediction consistency
-(i.e. that under given hypothesis, the estimator
-learned predicts as well as a model knowing the true distribution)
-is not possible because of the bias of the `l1`. It does say, however,
-that model consistency, in terms of finding the right set of non-zero
-parameters as well as their signs, can be achieved by scaling
-`C1`.
-
-l2-penalty case
------------------
-The theory says that in order to achieve prediction consistency, the
-penalty parameter should be kept constant
-as the number of samples grow.
-
-Simulations
-------------
-
-The two figures below plot the values of `C` on the `x-axis` and the
-corresponding cross-validation scores on the `y-axis`, for several different
-fractions of a generated data-set.
-
-In the `l1` penalty case, the cross-validation-error correlates best with
-the test-error, when scaling our `C` with the number of samples, `n`,
-which can be seen in the first figure.
-
-For the `l2` penalty case, the best result comes from the case where `C`
-is not scaled.
-
-.. topic:: Note:
-
-    Two separate datasets are used for the two different plots. The reason
-    behind this is the `l1` case works better on sparse data, while `l2`
-    is better suited to the non-sparse case.
+The question that arises is "How do we optimally adjust C to
+account for the different number of training samples?"
+In the remainder of this example, we will investigate the effect of scaling
+the value of the regularization parameter `C` with respect to the number of
+samples, for both the L1 and the L2 penalty. We will generate synthetic
+datasets that are appropriate for each type of regularization.
"""

# Author: Andreas Mueller
#         Jaques Grobler
# License: BSD 3 clause

+# %%
+# L1-penalty case
+# ---------------
+# In the L1 case, theory says that prediction consistency (i.e. that under a
+# given hypothesis, the estimator learned predicts as well as a model knowing
+# the true distribution) is not possible because of the bias of the L1
+# penalty. It does say, however, that model consistency, in terms of finding
+# the right set of non-zero parameters as well as their signs, can be
+# achieved by scaling `C`.
+#
+# We will demonstrate this effect by using a synthetic dataset where only a
+# few out of many features are informative, i.e. the underlying true model is
+# sparse and only those few features are useful to the model.
+from sklearn.datasets import make_classification
+
+n_samples, n_features = 100, 300
+X, y = make_classification(
+    n_samples=n_samples, n_features=n_features, n_informative=5, random_state=1
+)
+
+# %%
+# Now, we can define a linear SVC with the `l1` penalty.
+from sklearn.svm import LinearSVC
+
+model_l1 = LinearSVC(penalty="l1", loss="squared_hinge", dual=False, tol=1e-3)
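+
+# %%
+# Before running the full experiment, we can sanity-check this sparsity by
+# fitting the model once and counting its non-zero coefficients. This is a
+# minimal illustrative sketch: it uses the default `C=1.0`, and the exact
+# count depends on the value of `C` and on the solver tolerance.
+model_l1.fit(X, y)
+print(f"Non-zero coefficients: {(model_l1.coef_ != 0).sum()} out of {n_features}")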
+
+# %%
+# We will compute the mean test score for different values of `C` via
+# cross-validation.
import numpy as np
+import pandas as pd
+from sklearn.model_selection import validation_curve, ShuffleSplit
+
+Cs = np.logspace(-2.3, -1.3, 10)
+train_sizes = np.linspace(0.3, 0.7, 3)
+labels = [f"fraction: {train_size}" for train_size in train_sizes]
+
+results = {"C": Cs}
+for label, train_size in zip(labels, train_sizes):
+    cv = ShuffleSplit(
+        train_size=train_size, test_size=0.3, n_splits=50, random_state=1
+    )
+    train_scores, test_scores = validation_curve(
+        model_l1, X, y, param_name="C", param_range=Cs, cv=cv
+    )
+    results[label] = test_scores.mean(axis=1)
+results = pd.DataFrame(results)
+
+# %%
import matplotlib.pyplot as plt
-from sklearn.svm import LinearSVC
-from sklearn.model_selection import ShuffleSplit
-from sklearn.model_selection import GridSearchCV
-from sklearn.utils import check_random_state
-from sklearn import datasets
+fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 6))
+
+# plot results without scaling C
+results.plot(x="C", ax=axes[0], logx=True)
+axes[0].set_ylabel("CV score")
+axes[0].set_title("No scaling")
+
+# plot results by scaling C
+for train_size_idx, label in enumerate(labels):
+    results_scaled = results[[label]].assign(
+        C_scaled=Cs * float(n_samples * train_sizes[train_size_idx])
+    )
+    results_scaled.plot(x="C_scaled", ax=axes[1], logx=True, label=label)
+axes[1].set_title("Scaling C by 1 / n_samples")
+
+_ = fig.suptitle("Effect of scaling C with L1 penalty")
+
+# %%
+# Here, we observe that the cross-validation error correlates best with the
+# test error when `C` is scaled with the number of samples, `n`.
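+
+# %%
+# To make this observation concrete, we can print, for each training
+# fraction, the value of `C` that maximizes the mean cross-validated score,
+# together with that value rescaled by the number of training samples. This
+# is an illustrative sketch: if the 1 / n_samples scaling holds, the
+# rescaled optimum should vary less across fractions than the raw one.
+for train_size, label in zip(train_sizes, labels):
+    best_C = results.loc[results[label].idxmax(), "C"]
+    scaled = best_C * n_samples * train_size
+    print(f"{label}: best C = {best_C:.4f}, best C * n_train = {scaled:.2f}")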
+
+# %%
+# L2-penalty case
+# ---------------
+# We can repeat a similar experiment with the `l2` penalty. In this case,
+# theory says that in order to achieve prediction consistency, the penalty
+# parameter should be kept constant as the number of samples grows.
+#
+# Here we don't need a sparse dataset: instead, we generate data in which
+# all features carry information about the target. We then repeat the same
+# experiment, creating a linear SVC classifier with the `l2` penalty,
+# checking the test score via cross-validation, and plotting the results
+# with and without scaling the parameter `C`.
+rng = np.random.RandomState(1)
+y = np.sign(0.5 - rng.rand(n_samples))
+X = rng.randn(n_samples, n_features // 5) + y[:, np.newaxis]
+X += 5 * rng.randn(n_samples, n_features // 5)
+
+# %%
+model_l2 = LinearSVC(penalty="l2", loss="squared_hinge", dual=True)
+Cs = np.logspace(-4.5, -2, 10)
+
+labels = [f"fraction: {train_size}" for train_size in train_sizes]
+results = {"C": Cs}
+for label, train_size in zip(labels, train_sizes):
+    cv = ShuffleSplit(
+        train_size=train_size, test_size=0.3, n_splits=50, random_state=1
+    )
+    train_scores, test_scores = validation_curve(
+        model_l2, X, y, param_name="C", param_range=Cs, cv=cv
+    )
+    results[label] = test_scores.mean(axis=1)
+results = pd.DataFrame(results)
+
+# %%
+import matplotlib.pyplot as plt

-rnd = check_random_state(1)

+fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 6))

-# set up dataset
-n_samples = 100
-n_features = 300

+# plot results without scaling C
+results.plot(x="C", ax=axes[0], logx=True)
+axes[0].set_ylabel("CV score")
+axes[0].set_title("No scaling")

-# l1 data (only 5 informative features)
-X_1, y_1 = datasets.make_classification(
-    n_samples=n_samples, n_features=n_features, n_informative=5, random_state=1
-)

+# plot results by scaling C
+for train_size_idx, label in enumerate(labels):
+    results_scaled = results[[label]].assign(
+        C_scaled=Cs * float(n_samples * train_sizes[train_size_idx])
+    )
+    results_scaled.plot(x="C_scaled", ax=axes[1], logx=True, label=label)
+axes[1].set_title("Scaling C by 1 / n_samples")
+
+_ = fig.suptitle("Effect of scaling C with L2 penalty")

-# l2 data: non sparse, but less features
-y_2 = np.sign(0.5 - rnd.rand(n_samples))
-X_2 = rnd.randn(n_samples, n_features // 5) + y_2[:, np.newaxis]
-X_2 += 5 * rnd.randn(n_samples, n_features // 5)
-
-clf_sets = [
-    (
-        LinearSVC(penalty="l1", loss="squared_hinge", dual=False, tol=1e-3),
-        np.logspace(-2.3, -1.3, 10),
-        X_1,
-        y_1,
-    ),
-    (
-        LinearSVC(penalty="l2", loss="squared_hinge", dual=True),
-        np.logspace(-4.5, -2, 10),
-        X_2,
-        y_2,
-    ),
-]
-
-colors = ["navy", "cyan", "darkorange"]
-lw = 2
-
-for clf, cs, X, y in clf_sets:
-    # set up the plot for each regressor
-    fig, axes = plt.subplots(nrows=2, sharey=True, figsize=(9, 10))
-
-    for k, train_size in enumerate(np.linspace(0.3, 0.7, 3)[::-1]):
-        param_grid = dict(C=cs)
-        # To get nice curve, we need a large number of iterations to
-        # reduce the variance
-        grid = GridSearchCV(
-            clf,
-            refit=False,
-            param_grid=param_grid,
-            cv=ShuffleSplit(
-                train_size=train_size, test_size=0.3, n_splits=50, random_state=1
-            ),
-        )
-        grid.fit(X, y)
-        scores = grid.cv_results_["mean_test_score"]
-
-        scales = [
-            (1, "No scaling"),
-            ((n_samples * train_size), "1/n_samples"),
-        ]
-
-        for ax, (scaler, name) in zip(axes, scales):
-            ax.set_xlabel("C")
-            ax.set_ylabel("CV Score")
-            grid_cs = cs * float(scaler)  # scale the C's
-            ax.semilogx(
-                grid_cs,
-                scores,
-                label="fraction %.2f" % train_size,
-                color=colors[k],
-                lw=lw,
-            )
-            ax.set_title(
-                "scaling=%s, penalty=%s, loss=%s" % (name, clf.penalty, clf.loss)
-            )
-
-    plt.legend(loc="best")
+# %%
+# So, for the L2 penalty case, the best result comes from the case where `C`
+# is not scaled.
plt.show()
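+
+# %%
+# As noted at the beginning of this example, the reason `C` interacts with
+# the number of samples at all is that `C` multiplies the sum of the
+# per-sample losses in the objective function. Below is a minimal
+# illustrative sketch of this (the values of `C` and `random_state` are
+# arbitrary choices): duplicating every sample doubles that sum, so fitting
+# with `C` on the duplicated data should give the same coefficients as
+# fitting with `2 * C` on the original data, up to solver tolerance.
+C = 1e-3
+svc = LinearSVC(penalty="l2", loss="squared_hinge", dual=True, C=C, random_state=0)
+coef_duplicated = svc.fit(np.tile(X, (2, 1)), np.tile(y, 2)).coef_
+coef_rescaled = svc.set_params(C=2 * C).fit(X, y).coef_
+max_diff = np.abs(coef_duplicated - coef_rescaled).max()
+print(f"Max coefficient difference: {max_diff:.2e}")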