scikit-learn · glemaitre · Sep 7, 2023 · Aug 1, 2023 · Aug 1, 2023 · Aug 1, 2023
diff --git a/examples/datasets/plot_iris_dataset.py b/examples/datasets/plot_iris_dataset.py
@@ -1,7 +1,7 @@
 """
-=========================================================
+================
 The Iris Dataset
-=========================================================
+================
 This data sets consists of 3 different types of irises'
 (Setosa, Versicolour, and Virginica) petal and sepal
 length, stored in a 150x4 numpy.ndarray
@@ -19,37 +19,47 @@
 # Modified for documentation by Jaques Grobler
 # License: BSD 3 clause
 
-import matplotlib.pyplot as plt
-
-# unused but required import for doing 3d projections with matplotlib < 3.2
-import mpl_toolkits.mplot3d  # noqa: F401
-
+# %%
+# Loading the iris dataset
+# ------------------------
 from sklearn import datasets
-from sklearn.decomposition import PCA
 
-# import some data to play with
 iris = datasets.load_iris()
-X = iris.data[:, :2]  # we only take the first two features.
-y = iris.target
 
-x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
-y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
 
-plt.figure(2, figsize=(8, 6))
-plt.clf()
+# %%
+# Scatter Plot of the Iris dataset
+# --------------------------------
+import matplotlib.pyplot as plt
+
+_, ax = plt.subplots()
+scatter = ax.scatter(iris.data[:, 0], iris.data[:, 1], c=iris.target)
+ax.set(xlabel=iris.feature_names[0], ylabel=iris.feature_names[1])
+_ = ax.legend(
+    scatter.legend_elements()[0], iris.target_names, loc="lower right", title="Classes"
+)
 
-# Plot the training points
-plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1, edgecolor="k")
-plt.xlabel("Sepal length")
-plt.ylabel("Sepal width")
+# %%
+# Each point in the scatter plot refers to one of the 150 iris flowers
+# in the dataset, with the color indicating their respective type
+# (Setosa, Versicolor, and Virginica).
+# You can already see a pattern regarding the Setosa type, which is
+# easily identifiable based on its short and wide sepal. Only
+# considering these 2 dimensions, sepal width and length, there's still
+# overlap between the Versicolor and Virginica types.
+
+# %%
+# Plot a PCA representation
+# -------------------------
+# Let's apply a Principal Component Analysis (PCA) to the iris dataset
+# and then plot the irises across the first three PCA dimensions.
+# This will allow us to better differentiate between the three types!
 
-plt.xlim(x_min, x_max)
-plt.ylim(y_min, y_max)
-plt.xticks(())
-plt.yticks(())
+# unused but required import for doing 3d projections with matplotlib < 3.2
+import mpl_toolkits.mplot3d  # noqa: F401
+
+from sklearn.decomposition import PCA
 
-# To getter a better understanding of interaction of the dimensions
-# plot the first three PCA dimensions
 fig = plt.figure(1, figsize=(8, 6))
 ax = fig.add_subplot(111, projection="3d", elev=-150, azim=110)
 
@@ -58,18 +68,22 @@
     X_reduced[:, 0],
     X_reduced[:, 1],
     X_reduced[:, 2],
-    c=y,
-    cmap=plt.cm.Set1,
-    edgecolor="k",
+    c=iris.target,
     s=40,
 )
 
-ax.set_title("First three PCA directions")
-ax.set_xlabel("1st eigenvector")
+ax.set_title("First three PCA dimensions")
+ax.set_xlabel("1st Eigenvector")
 ax.xaxis.set_ticklabels([])
-ax.set_ylabel("2nd eigenvector")
+ax.set_ylabel("2nd Eigenvector")
 ax.yaxis.set_ticklabels([])
-ax.set_zlabel("3rd eigenvector")
+ax.set_zlabel("3rd Eigenvector")
 ax.zaxis.set_ticklabels([])
 
 plt.show()
+
+# %%
+# PCA will create 3 new features that are linear combinations of the
+# 4 original features. In addition, this transformation maximizes the
+# variance captured by each successive feature. With it, we see that we can
+# identify each species using only the first feature (i.e. first eigenvector).
diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py
@@ -667,6 +667,9 @@ def load_iris(*, return_X_y=False, as_frame=False):
     array([0, 0, 1])
     >>> list(data.target_names)
     ['setosa', 'versicolor', 'virginica']
+
+    See :ref:`sphx_glr_auto_examples_datasets_plot_iris_dataset.py` for a more
+    detailed example of how to work with the iris dataset.
     """
     data_file_name = "iris.csv"
     data, target, target_names, fdescr = load_csv_data(