Skip to content

Commit 019e953

Browse files
authored
DOC remove basic iris example and merge with iris with PCA example (#29964)
1 parent 5b2b751 commit 019e953

File tree

4 files changed

+91
-130
lines changed

4 files changed

+91
-130
lines changed

doc/conf.py

+3
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,9 @@ def add_js_css_files(app, pagename, templatename, context, doctree):
487487
"auto_examples/ensemble/plot_forest_importances_faces": (
488488
"auto_examples/ensemble/plot_forest_importances"
489489
),
490+
"auto_examples/datasets/plot_iris_dataset": (
491+
"auto_examples/decomposition/plot_pca_iris"
492+
),
490493
}
491494
html_context["redirects"] = redirects
492495
for old_link in redirects:

examples/datasets/plot_iris_dataset.py

-88
This file was deleted.
+87-41
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,105 @@
11
"""
2-
=========================================================
3-
PCA example with Iris Data-set
4-
=========================================================
2+
==================================================
3+
Principal Component Analysis (PCA) on Iris Dataset
4+
==================================================
55
6-
Principal Component Analysis applied to the Iris dataset.
7-
8-
See `here <https://en.wikipedia.org/wiki/Iris_flower_data_set>`_ for more
9-
information on this dataset.
6+
This example shows a well-known decomposition technique called Principal Component
7+
Analysis (PCA) on the
8+
`Iris dataset <https://en.wikipedia.org/wiki/Iris_flower_data_set>`_.
109
10+
This dataset is made of 4 features: sepal length, sepal width, petal length, petal
11+
width. We use PCA to project this 4-feature space into a 3-dimensional space.
1112
"""
1213

1314
# Authors: The scikit-learn developers
1415
# SPDX-License-Identifier: BSD-3-Clause
1516

17+
# %%
18+
# Loading the Iris dataset
19+
# ------------------------
20+
#
21+
# The Iris dataset is directly available as part of scikit-learn. It can be loaded
22+
# using the :func:`~sklearn.datasets.load_iris` function. With the default parameters,
23+
# a :class:`~sklearn.utils.Bunch` object is returned, containing the data, the
24+
# target values, the feature names, and the target names.
25+
from sklearn.datasets import load_iris
26+
27+
iris = load_iris(as_frame=True)
28+
print(iris.keys())
29+
30+
# %%
31+
# Plot of pairs of features of the Iris dataset
32+
# ---------------------------------------------
33+
#
34+
# Let's first plot the pairs of features of the Iris dataset.
35+
import seaborn as sns
36+
37+
# Rename classes using the iris target names
38+
iris.frame["target"] = iris.target_names[iris.target]
39+
_ = sns.pairplot(iris.frame, hue="target")
40+
41+
# %%
42+
# Each data point on each scatter plot refers to one of the 150 iris flowers
43+
# in the dataset, with the color indicating their respective type
44+
# (Setosa, Versicolor, and Virginica).
45+
#
46+
# You can already see a pattern regarding the Setosa type, which is
47+
# easily identifiable based on its short and wide sepal. Only
48+
# considering these two dimensions, sepal width and length, there's still
49+
# overlap between the Versicolor and Virginica types.
50+
#
51+
# The diagonal of the plot shows the distribution of each feature. We observe
52+
# that the petal width and the petal length are the most discriminant features
53+
# for the three types.
54+
#
55+
# Plot a PCA representation
56+
# -------------------------
57+
# Let's apply a Principal Component Analysis (PCA) to the iris dataset
58+
# and then plot the irises across the first three PCA dimensions.
59+
# This will allow us to better differentiate among the three types!
60+
1661
import matplotlib.pyplot as plt
1762

1863
# unused but required import for doing 3d projections with matplotlib < 3.2
1964
import mpl_toolkits.mplot3d # noqa: F401
20-
import numpy as np
21-
22-
from sklearn import datasets, decomposition
23-
24-
np.random.seed(5)
25-
26-
iris = datasets.load_iris()
27-
X = iris.data
28-
y = iris.target
29-
30-
fig = plt.figure(1, figsize=(4, 3))
31-
plt.clf()
32-
33-
ax = fig.add_subplot(111, projection="3d", elev=48, azim=134)
34-
ax.set_position([0, 0, 0.95, 1])
35-
36-
37-
plt.cla()
38-
pca = decomposition.PCA(n_components=3)
39-
pca.fit(X)
40-
X = pca.transform(X)
41-
42-
for name, label in [("Setosa", 0), ("Versicolour", 1), ("Virginica", 2)]:
43-
ax.text3D(
44-
X[y == label, 0].mean(),
45-
X[y == label, 1].mean() + 1.5,
46-
X[y == label, 2].mean(),
47-
name,
48-
horizontalalignment="center",
49-
bbox=dict(alpha=0.5, edgecolor="w", facecolor="w"),
50-
)
51-
# Reorder the labels to have colors matching the cluster results
52-
y = np.choose(y, [1, 2, 0]).astype(float)
53-
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=plt.cm.nipy_spectral, edgecolor="k")
5465

66+
from sklearn.decomposition import PCA
67+
68+
fig = plt.figure(1, figsize=(8, 6))
69+
ax = fig.add_subplot(111, projection="3d", elev=-150, azim=110)
70+
71+
X_reduced = PCA(n_components=3).fit_transform(iris.data)
72+
scatter = ax.scatter(
73+
X_reduced[:, 0],
74+
X_reduced[:, 1],
75+
X_reduced[:, 2],
76+
c=iris.target,
77+
s=40,
78+
)
79+
80+
ax.set(
81+
title="First three PCA dimensions",
82+
xlabel="1st Eigenvector",
83+
ylabel="2nd Eigenvector",
84+
zlabel="3rd Eigenvector",
85+
)
5586
ax.xaxis.set_ticklabels([])
5687
ax.yaxis.set_ticklabels([])
5788
ax.zaxis.set_ticklabels([])
5889

90+
# Add a legend
91+
legend1 = ax.legend(
92+
scatter.legend_elements()[0],
93+
iris.target_names.tolist(),
94+
loc="upper right",
95+
title="Classes",
96+
)
97+
ax.add_artist(legend1)
98+
5999
plt.show()
100+
101+
# %%
102+
# PCA will create 3 new features that are linear combinations of the 4 original
103+
# features. In addition, this transformation maximizes the variance of the projected data. With this
104+
# transformation, we see that we can identify each species using only the first feature
105+
# (i.e., the projection onto the first eigenvector).

sklearn/datasets/_base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -707,7 +707,7 @@ def load_iris(*, return_X_y=False, as_frame=False):
707707
>>> list(data.target_names)
708708
[np.str_('setosa'), np.str_('versicolor'), np.str_('virginica')]
709709
710-
See :ref:`sphx_glr_auto_examples_datasets_plot_iris_dataset.py` for a more
710+
See :ref:`sphx_glr_auto_examples_decomposition_plot_pca_iris.py` for a more
711711
detailed example of how to work with the iris dataset.
712712
"""
713713
data_file_name = "iris.csv"

0 commit comments

Comments
 (0)