
[MRG] EXA Improve example plot_svm_anova.py #11731

Merged: 6 commits, Jan 28, 2019
Changes from all commits
36 changes: 16 additions & 20 deletions examples/svm/plot_svm_anova.py
@@ -4,37 +4,35 @@
 =================================================
 
 This example shows how to perform univariate feature selection before running a
-SVC (support vector classifier) to improve the classification scores.
+SVC (support vector classifier) to improve the classification scores. We use
+the iris dataset (4 features) and add 36 non-informative features. We can find
+that our model achieves best performance when we select around 10% of features.
 """
 print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
-from sklearn.datasets import load_digits
+from sklearn.datasets import load_iris
 from sklearn.feature_selection import SelectPercentile, chi2
 from sklearn.model_selection import cross_val_score
 from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC
 
 
 # #############################################################################
 # Import some data to play with
-X, y = load_digits(return_X_y=True)
-# Throw away data, to be in the curse of dimension settings
-X = X[:200]
-y = y[:200]
-n_samples = len(y)
-X = X.reshape((n_samples, -1))
-# add 200 non-informative features
-X = np.hstack((X, 2 * np.random.random((n_samples, 200))))
+X, y = load_iris(return_X_y=True)
+# Add non-informative features
+np.random.seed(0)
+X = np.hstack((X, 2 * np.random.random((X.shape[0], 36))))
 
 # #############################################################################
-# Create a feature-selection transform and an instance of SVM that we
+# Create a feature-selection transform, a scaler and an instance of SVM that we
 # combine together to have an full-blown estimator
 
-transform = SelectPercentile(chi2)
-
-clf = Pipeline([('anova', transform), ('svc', SVC(gamma="auto"))])
+clf = Pipeline([('anova', SelectPercentile(chi2)),
+                ('scaler', StandardScaler()),
+                ('svc', SVC(gamma="auto"))])
 
 # #############################################################################
 # Plot the cross-validation score as a function of percentile of features
@@ -44,17 +42,15 @@

 for percentile in percentiles:
     clf.set_params(anova__percentile=percentile)
-    # Compute cross-validation score using 1 CPU
-    this_scores = cross_val_score(clf, X, y, cv=5, n_jobs=1)
+    this_scores = cross_val_score(clf, X, y, cv=5)
     score_means.append(this_scores.mean())
     score_stds.append(this_scores.std())
 
 plt.errorbar(percentiles, score_means, np.array(score_stds))
 
 plt.title(
     'Performance of the SVM-Anova varying the percentile of features selected')
 plt.xticks(np.linspace(0, 100, 11, endpoint=True))
 plt.xlabel('Percentile')
-plt.ylabel('Prediction rate')
-
+plt.ylabel('Accuracy Score')
 plt.axis('tight')
 plt.show()
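
Usage note (not part of the PR): because the merged example wraps feature selection, scaling and the SVC in a single Pipeline, the percentile sweep could also be driven by a grid search instead of the manual cross_val_score loop. The sketch below assumes the merged version of the example; the GridSearchCV search and the printed results are illustrative only and do not appear in the diff.

# Minimal sketch: tune the chi2 percentile of the merged pipeline with
# GridSearchCV instead of the manual loop (illustrative, not from the PR).
import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
np.random.seed(0)
# Add 36 non-informative features, as in the updated example
X = np.hstack((X, 2 * np.random.random((X.shape[0], 36))))

clf = Pipeline([('anova', SelectPercentile(chi2)),
                ('scaler', StandardScaler()),
                ('svc', SVC(gamma="auto"))])

# Search over the same percentiles the example plots
param_grid = {'anova__percentile': [1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100]}
search = GridSearchCV(clf, param_grid, cv=5)
search.fit(X, y)
print(search.best_params_, search.best_score_)

With this data the best percentile should land near the low end of the grid, consistent with the example's claim that around 10% of the features suffices.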