|
1 | 1 | """
|
2 |
| -========================================================= |
3 |
| -PCA example with Iris Data-set |
4 |
| -========================================================= |
| 2 | +================================================== |
| 3 | +Principal Component Analysis (PCA) on Iris Dataset |
| 4 | +================================================== |
5 | 5 |
|
6 |
| -Principal Component Analysis applied to the Iris dataset. |
7 |
| -
|
8 |
| -See `here <https://en.wikipedia.org/wiki/Iris_flower_data_set>`_ for more |
9 |
| -information on this dataset. |
| 6 | +This example applies a well-known decomposition technique, Principal Component |
| 7 | +Analysis (PCA), to the |
| 8 | +`Iris dataset <https://en.wikipedia.org/wiki/Iris_flower_data_set>`_. |
10 | 9 |
|
| 10 | +This dataset is made up of 4 features: sepal length, sepal width, petal length, and |
| 11 | +petal width. We use PCA to project this 4-dimensional feature space into a 3-dimensional space. |
11 | 12 | """
|
12 | 13 |
|
13 | 14 | # Authors: The scikit-learn developers
|
14 | 15 | # SPDX-License-Identifier: BSD-3-Clause
|
15 | 16 |
|
| 17 | +# %% |
| 18 | +# Loading the Iris dataset |
| 19 | +# ------------------------ |
| 20 | +# |
| 21 | +# The Iris dataset is directly available as part of scikit-learn. It can be loaded |
| 22 | +# using the :func:`~sklearn.datasets.load_iris` function. With the default parameters, |
| 23 | +# a :class:`~sklearn.utils.Bunch` object is returned, containing the data, the |
| 24 | +# target values, the feature names, and the target names. |
| 25 | +from sklearn.datasets import load_iris |
| 26 | + |
| 27 | +iris = load_iris(as_frame=True) |
| 28 | +print(iris.keys()) |
| 29 | + |
| 30 | +# %% |
| 31 | +# Plot of pairs of features of the Iris dataset |
| 32 | +# --------------------------------------------- |
| 33 | +# |
| 34 | +# Let's first plot the pairs of features of the Iris dataset. |
| 35 | +import seaborn as sns |
| 36 | + |
| 37 | +# Rename classes using the iris target names |
| 38 | +iris.frame["target"] = iris.target_names[iris.target] |
| 39 | +_ = sns.pairplot(iris.frame, hue="target") |
| 40 | + |
| 41 | +# %% |
| 42 | +# Each data point on each scatter plot refers to one of the 150 iris flowers |
| 43 | +# in the dataset, with the color indicating their respective type |
| 44 | +# (Setosa, Versicolor, and Virginica). |
| 45 | +# |
| 46 | +# You can already see a pattern regarding the Setosa type, which is |
| 47 | +# easily identifiable based on its short and wide sepal. Considering
| 48 | +# only these two dimensions, sepal width and length, there's still |
| 49 | +# overlap between the Versicolor and Virginica types. |
| 50 | +# |
| 51 | +# The diagonal of the plot shows the distribution of each feature. We observe |
| 52 | +# that the petal width and the petal length are the most discriminant features |
| 53 | +# for the three types. |
| 54 | +# |
| 55 | +# Plot a PCA representation |
| 56 | +# ------------------------- |
| 57 | +# Let's apply a Principal Component Analysis (PCA) to the iris dataset |
| 58 | +# and then plot the irises across the first three PCA dimensions. |
| 59 | +# This will allow us to better differentiate among the three types! |
| 60 | + |
16 | 61 | import matplotlib.pyplot as plt
|
17 | 62 |
|
18 | 63 | # unused but required import for doing 3d projections with matplotlib < 3.2
|
19 | 64 | import mpl_toolkits.mplot3d # noqa: F401
|
20 |
| -import numpy as np |
21 |
| - |
22 |
| -from sklearn import datasets, decomposition |
23 |
| - |
24 |
| -np.random.seed(5) |
25 |
| - |
26 |
| -iris = datasets.load_iris() |
27 |
| -X = iris.data |
28 |
| -y = iris.target |
29 |
| - |
30 |
| -fig = plt.figure(1, figsize=(4, 3)) |
31 |
| -plt.clf() |
32 |
| - |
33 |
| -ax = fig.add_subplot(111, projection="3d", elev=48, azim=134) |
34 |
| -ax.set_position([0, 0, 0.95, 1]) |
35 |
| - |
36 |
| - |
37 |
| -plt.cla() |
38 |
| -pca = decomposition.PCA(n_components=3) |
39 |
| -pca.fit(X) |
40 |
| -X = pca.transform(X) |
41 |
| - |
42 |
| -for name, label in [("Setosa", 0), ("Versicolour", 1), ("Virginica", 2)]: |
43 |
| - ax.text3D( |
44 |
| - X[y == label, 0].mean(), |
45 |
| - X[y == label, 1].mean() + 1.5, |
46 |
| - X[y == label, 2].mean(), |
47 |
| - name, |
48 |
| - horizontalalignment="center", |
49 |
| - bbox=dict(alpha=0.5, edgecolor="w", facecolor="w"), |
50 |
| - ) |
51 |
| -# Reorder the labels to have colors matching the cluster results |
52 |
| -y = np.choose(y, [1, 2, 0]).astype(float) |
53 |
| -ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=plt.cm.nipy_spectral, edgecolor="k") |
54 | 65 |
|
| 66 | +from sklearn.decomposition import PCA |
| 67 | + |
| 68 | +fig = plt.figure(1, figsize=(8, 6)) |
| 69 | +ax = fig.add_subplot(111, projection="3d", elev=-150, azim=110) |
| 70 | + |
| 71 | +X_reduced = PCA(n_components=3).fit_transform(iris.data) |
| 72 | +scatter = ax.scatter( |
| 73 | + X_reduced[:, 0], |
| 74 | + X_reduced[:, 1], |
| 75 | + X_reduced[:, 2], |
| 76 | + c=iris.target, |
| 77 | + s=40, |
| 78 | +) |
| 79 | + |
| 80 | +ax.set( |
| 81 | + title="First three PCA dimensions", |
| 82 | + xlabel="1st Eigenvector", |
| 83 | + ylabel="2nd Eigenvector", |
| 84 | + zlabel="3rd Eigenvector", |
| 85 | +) |
55 | 86 | ax.xaxis.set_ticklabels([])
|
56 | 87 | ax.yaxis.set_ticklabels([])
|
57 | 88 | ax.zaxis.set_ticklabels([])
|
58 | 89 |
|
| 90 | +# Add a legend |
| 91 | +legend1 = ax.legend( |
| 92 | + scatter.legend_elements()[0], |
| 93 | + iris.target_names.tolist(), |
| 94 | + loc="upper right", |
| 95 | + title="Classes", |
| 96 | +) |
| 97 | +ax.add_artist(legend1) |
| 98 | + |
59 | 99 | plt.show()
|
| 100 | + |
| 101 | +# %% |
| 102 | +# PCA will create 3 new features, each a linear combination of the 4 original |
| 103 | +# features. In addition, this transformation maximizes the variance captured by |
| 104 | +# the new features. With this transformation, we see that we can identify each |
| 105 | +# species using only the first feature (i.e., the first eigenvector). |
0 commit comments