
[MRG] EXA Improve example plot_svm_anova.py #11731

Merged: 6 commits, Jan 28, 2019
Changes from all commits
36 changes: 16 additions & 20 deletions examples/svm/plot_svm_anova.py
@@ -4,37 +4,35 @@
 =================================================
 
 This example shows how to perform univariate feature selection before running a
-SVC (support vector classifier) to improve the classification scores.
+SVC (support vector classifier) to improve the classification scores. We use
+the iris dataset (4 features) and add 36 non-informative features. We can find
+that our model achieves best performance when we select around 10% of features.
 """
 print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
-from sklearn.datasets import load_digits
+from sklearn.datasets import load_iris
 from sklearn.feature_selection import SelectPercentile, chi2
 from sklearn.model_selection import cross_val_score
 from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC
 
 
 # #############################################################################
 # Import some data to play with
-X, y = load_digits(return_X_y=True)
-# Throw away data, to be in the curse of dimension settings
-X = X[:200]
-y = y[:200]
-n_samples = len(y)
-X = X.reshape((n_samples, -1))
-# add 200 non-informative features
-X = np.hstack((X, 2 * np.random.random((n_samples, 200))))
+X, y = load_iris(return_X_y=True)
+# Add non-informative features
+np.random.seed(0)
+X = np.hstack((X, 2 * np.random.random((X.shape[0], 36))))
 
 # #############################################################################
-# Create a feature-selection transform and an instance of SVM that we
+# Create a feature-selection transform, a scaler and an instance of SVM that we
 # combine together to have an full-blown estimator
 
-transform = SelectPercentile(chi2)
-
-clf = Pipeline([('anova', transform), ('svc', SVC(gamma="auto"))])
+clf = Pipeline([('anova', SelectPercentile(chi2)),
+                ('scaler', StandardScaler()),
+                ('svc', SVC(gamma="auto"))])
 
 # #############################################################################
 # Plot the cross-validation score as a function of percentile of features
@@ -44,17 +42,15 @@

 for percentile in percentiles:
     clf.set_params(anova__percentile=percentile)
-    # Compute cross-validation score using 1 CPU
-    this_scores = cross_val_score(clf, X, y, cv=5, n_jobs=1)
+    this_scores = cross_val_score(clf, X, y, cv=5)
     score_means.append(this_scores.mean())
     score_stds.append(this_scores.std())
 
 plt.errorbar(percentiles, score_means, np.array(score_stds))
 
 plt.title(
     'Performance of the SVM-Anova varying the percentile of features selected')
 plt.xticks(np.linspace(0, 100, 11, endpoint=True))
 plt.xlabel('Percentile')
-plt.ylabel('Prediction rate')
-
+plt.ylabel('Accuracy Score')
 plt.axis('tight')
 plt.show()
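
Usage note (not part of the PR): because the merged example wraps feature selection, scaling and the SVC in a single Pipeline, the percentile sweep could also be driven by a grid search instead of the manual cross_val_score loop. The sketch below assumes the merged version of the example; the GridSearchCV search and the printed results are illustrative only and do not appear in the diff.

# Minimal sketch: tune the chi2 percentile of the merged pipeline with
# GridSearchCV instead of the manual loop (illustrative, not from the PR).
import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
np.random.seed(0)
# Add 36 non-informative features, as in the updated example
X = np.hstack((X, 2 * np.random.random((X.shape[0], 36))))

clf = Pipeline([('anova', SelectPercentile(chi2)),
                ('scaler', StandardScaler()),
                ('svc', SVC(gamma="auto"))])

# Search over the same percentiles the example plots
param_grid = {'anova__percentile': [1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100]}
search = GridSearchCV(clf, param_grid, cv=5)
search.fit(X, y)
print(search.best_params_, search.best_score_)

With this data the best percentile should land near the low end of the grid, consistent with the example's claim that around 10% of the features suffices.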