|
1 | 1 | """
|
2 | 2 | ==================
|
3 |
| -Pipeline Anova SVM |
| 3 | +Pipeline ANOVA SVM |
4 | 4 | ==================
|
5 | 5 |
|
6 |
| -Simple usage of Pipeline that runs successively a univariate |
7 |
| -feature selection with anova and then a SVM of the selected features. |
| 6 | +This example shows how a feature selection can be easily integrated within |
| 7 | +a machine learning pipeline. |
8 | 8 |
|
9 |
| -Using a sub-pipeline, the fitted coefficients can be mapped back into |
10 |
| -the original feature space. |
| 9 | +We also show that you can easily introspect part of the pipeline. |
11 | 10 | """
|
12 |
| -from sklearn import svm |
13 |
| -from sklearn.datasets import make_classification |
14 |
| -from sklearn.feature_selection import SelectKBest, f_classif |
15 |
| -from sklearn.pipeline import make_pipeline |
16 |
| -from sklearn.model_selection import train_test_split |
17 |
| -from sklearn.metrics import classification_report |
18 | 11 |
|
19 | 12 | print(__doc__)
|
| 13 | +from sklearn import set_config |
| 14 | +set_config(display='diagram') |
20 | 15 |
|
21 |
| -# import some data to play with |
22 |
| -X, y = make_classification( |
23 |
| - n_features=20, n_informative=3, n_redundant=0, n_classes=4, |
24 |
| - n_clusters_per_class=2) |
| 16 | +# %% |
| 17 | +# We will start by generating a binary classification dataset. Subsequently, we |
| 18 | +# will divide the dataset into two subsets. |
25 | 19 |
|
| 20 | +from sklearn.datasets import make_classification |
| 21 | +from sklearn.model_selection import train_test_split |
| 22 | + |
| 23 | +X, y = make_classification( |
| 24 | + n_features=20, n_informative=3, n_redundant=0, n_classes=2, |
| 25 | + n_clusters_per_class=2, random_state=42) |
26 | 26 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
|
27 | 27 |
|
28 |
| -# ANOVA SVM-C |
29 |
| -# 1) anova filter, take 3 best ranked features |
30 |
| -anova_filter = SelectKBest(f_classif, k=3) |
31 |
| -# 2) svm |
32 |
| -clf = svm.LinearSVC() |
| 28 | +# %% |
| 29 | +# A common mistake done with feature selection is to search a subset of |
| 30 | +# discriminative features on the full dataset instead of only using the |
| 31 | +# training set. The usage of scikit-learn :func:`~sklearn.pipeline.Pipeline` |
| 32 | +# prevents making such a mistake.
| 33 | +# |
| 34 | +# Here, we will demonstrate how to build a pipeline where the first step will |
| 35 | +# be the feature selection. |
| 36 | +# |
| 37 | +# When calling `fit` on the training data, a subset of features will be selected
| 38 | +# and the index of these selected features will be stored. The feature selector |
| 39 | +# will subsequently reduce the number of features and pass this subset to the
| 40 | +# classifier which will be trained. |
33 | 41 |
|
| 42 | +from sklearn.feature_selection import SelectKBest, f_classif |
| 43 | +from sklearn.pipeline import make_pipeline |
| 44 | +from sklearn.svm import LinearSVC |
| 45 | + |
| 46 | +anova_filter = SelectKBest(f_classif, k=3) |
| 47 | +clf = LinearSVC() |
34 | 48 | anova_svm = make_pipeline(anova_filter, clf)
|
35 | 49 | anova_svm.fit(X_train, y_train)
|
| 50 | + |
| 51 | +# %% |
| 52 | +# Once the training is complete, we can predict on new unseen samples. In this
| 53 | +# case, the feature selector will only select the most discriminative features |
| 54 | +# based on the information stored during training. Then, the data will be |
| 55 | +# passed to the classifier which will make the prediction. |
| 56 | +# |
| 57 | +# Here, we report the final metrics via a classification report. |
| 58 | + |
| 59 | +from sklearn.metrics import classification_report |
| 60 | + |
36 | 61 | y_pred = anova_svm.predict(X_test)
|
37 | 62 | print(classification_report(y_test, y_pred))
|
38 | 63 |
|
39 |
| -coef = anova_svm[:-1].inverse_transform(anova_svm['linearsvc'].coef_) |
40 |
| -print(coef) |
| 64 | +# %% |
| 65 | +# Be aware that you can inspect a step in the pipeline. For instance, we might |
| 66 | +# be interested in the parameters of the classifier. Since we selected
| 67 | +# three features, we expect to have three coefficients. |
| 68 | + |
| 69 | +anova_svm[-1].coef_ |
| 70 | + |
| 71 | +# %% |
| 72 | +# However, we do not know which features were selected from the original
| 73 | +# dataset. We could proceed in several ways. Here, we will invert the
| 74 | +# transformation of these coefficients to get information about the original |
| 75 | +# space. |
| 76 | + |
| 77 | +anova_svm[:-1].inverse_transform(anova_svm[-1].coef_) |
| 78 | + |
| 79 | +# %% |
| 80 | +# We can see that the first three features were the features selected by
| 81 | +# the first step. |
0 commit comments