Commit 056864d

DOC Add links to KMeans examples in docstrings and the user guide (scikit-learn#27799)
1 parent 34061da commit 056864d

5 files changed: +48 -14 lines


doc/modules/clustering.rst

Lines changed: 22 additions & 6 deletions
@@ -182,6 +182,10 @@ It suffers from various drawbacks:
     :align: center
     :scale: 50

+For more detailed descriptions of the issues shown above and how to address them,
+refer to the examples :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_assumptions.py`
+and :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`.
+
 K-means is often referred to as Lloyd's algorithm. In basic terms, the
 algorithm has three steps. The first step chooses the initial centroids, with
 the most basic method being to choose :math:`k` samples from the dataset

@@ -218,7 +222,9 @@ initializations of the centroids. One method to help address this issue is the
 k-means++ initialization scheme, which has been implemented in scikit-learn
 (use the ``init='k-means++'`` parameter). This initializes the centroids to be
 (generally) distant from each other, leading to probably better results than
-random initialization, as shown in the reference.
+random initialization, as shown in the reference. For a detailed example of
+comparing different initialization schemes, refer to
+:ref:`sphx_glr_auto_examples_cluster_plot_kmeans_digits.py`.

 K-means++ can also be called independently to select seeds for other
 clustering algorithms, see :func:`sklearn.cluster.kmeans_plusplus` for details

@@ -231,7 +237,17 @@ weight of 2 to a sample is equivalent to adding a duplicate of that sample
 to the dataset :math:`X`.

 K-means can be used for vector quantization. This is achieved using the
-transform method of a trained model of :class:`KMeans`.
+``transform`` method of a trained model of :class:`KMeans`. For an example of
+performing vector quantization on an image, refer to
+:ref:`sphx_glr_auto_examples_cluster_plot_color_quantization.py`.
+
+.. topic:: Examples:
+
+ * :ref:`sphx_glr_auto_examples_cluster_plot_cluster_iris.py`: Example usage of
+   :class:`KMeans` using the iris dataset
+
+ * :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`: Document clustering
+   using :class:`KMeans` and :class:`MiniBatchKMeans` based on sparse data

 Low-level parallelism
 ---------------------

@@ -291,11 +307,11 @@ small, as shown in the example and cited reference.

 .. topic:: Examples:

- * :ref:`sphx_glr_auto_examples_cluster_plot_mini_batch_kmeans.py`: Comparison of KMeans and
-   MiniBatchKMeans
+ * :ref:`sphx_glr_auto_examples_cluster_plot_mini_batch_kmeans.py`: Comparison of
+   :class:`KMeans` and :class:`MiniBatchKMeans`

- * :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`: Document clustering using sparse
-   MiniBatchKMeans
+ * :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`: Document clustering
+   using :class:`KMeans` and :class:`MiniBatchKMeans` based on sparse data

 * :ref:`sphx_glr_auto_examples_cluster_plot_dict_face_patches.py`
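
The user-guide passages touched above refer to the ``init='k-means++'`` parameter,
:func:`sklearn.cluster.kmeans_plusplus`, sample weights, and the ``transform`` method.
A minimal sketch of those APIs on toy data (the data and parameter values below are
illustrative, not taken from the linked examples)::

    import numpy as np
    from sklearn.cluster import KMeans, kmeans_plusplus

    rng = np.random.RandomState(0)
    X = np.vstack([rng.normal(loc, 0.5, size=(50, 2)) for loc in (0, 5, 10)])

    # k-means++ (the default init) picks initial centroids that are spread out.
    km = KMeans(n_clusters=3, init="k-means++", n_init=10, random_state=0).fit(X)

    # kmeans_plusplus can be called on its own to pick seeds for other algorithms.
    centers, indices = kmeans_plusplus(X, n_clusters=3, random_state=0)

    # A sample weight of 2 is equivalent to duplicating that sample in X.
    km_w = KMeans(n_clusters=3, n_init=10, random_state=0).fit(
        X, sample_weight=np.full(len(X), 2.0)
    )

    # Vector quantization: transform() gives each sample's distance to every
    # centroid; the index of the closest centroid is the sample's code.
    distances = km.transform(X)
    codes = km.predict(X)
    print(distances.shape, codes[:10])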

examples/cluster/plot_cluster_iris.py

Lines changed: 4 additions & 5 deletions
@@ -7,13 +7,13 @@

 - top left: What a K-means algorithm would yield using 8 clusters.

-- top right: What the effect of a bad initialization is
+- top right: What using three clusters would deliver.
+
+- bottom left: What the effect of a bad initialization is
   on the classification process: By setting n_init to only 1
   (default is 10), the amount of times that the algorithm will
   be run with different centroid seeds is reduced.

-- bottom left: What using eight clusters would deliver.
-
 - bottom right: The ground truth.

 """

@@ -73,8 +73,7 @@
         horizontalalignment="center",
         bbox=dict(alpha=0.2, edgecolor="w", facecolor="w"),
     )
-# Reorder the labels to have colors matching the cluster results
-y = np.choose(y, [1, 2, 0]).astype(float)
+
 ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor="k")

 ax.xaxis.set_ticklabels([])
examples/cluster/plot_color_quantization.py

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@
 china = load_sample_image("china.jpg")

 # Convert to floats instead of the default 8 bits integer coding. Dividing by
-# 255 is important so that plt.imshow behaves works well on float data (need to
+# 255 is important so that plt.imshow works well on float data (need to
 # be in the range [0-1])
 china = np.array(china, dtype=np.float64) / 255
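
A condensed sketch of the color-quantization technique this example demonstrates
(``n_colors`` and the subsample size below are illustrative choices, not necessarily
the example's exact values)::

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.datasets import load_sample_image
    from sklearn.utils import shuffle

    china = load_sample_image("china.jpg")
    # Scale to floats in [0, 1] so that plt.imshow handles the array correctly.
    china = np.array(china, dtype=np.float64) / 255

    h, w, d = china.shape
    pixels = china.reshape(-1, d)

    # Fit the codebook on a random subsample of pixels, then quantize every pixel.
    n_colors = 64
    sample = shuffle(pixels, random_state=0, n_samples=1_000)
    kmeans = KMeans(n_clusters=n_colors, n_init=10, random_state=0).fit(sample)
    labels = kmeans.predict(pixels)
    quantized = kmeans.cluster_centers_[labels].reshape(h, w, d)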

examples/text/plot_document_clustering.py

Lines changed: 3 additions & 2 deletions
@@ -99,8 +99,9 @@
 # assignment have an ARI of 0.0 in expectation.
 #
 # If the ground truth labels are not known, evaluation can only be performed
-# using the model results itself. In that case, the Silhouette Coefficient comes
-# in handy.
+# using the model results itself. In that case, the Silhouette Coefficient comes in
+# handy. See :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`
+# for an example of how to do it.
 #
 # For more reference, see :ref:`clustering_evaluation`.
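
The two evaluation settings discussed in this comment block can be sketched on toy
blobs rather than the text data the example uses; ``adjusted_rand_score`` needs
ground-truth labels, while ``silhouette_score`` does not::

    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs
    from sklearn.metrics import adjusted_rand_score, silhouette_score

    X, y_true = make_blobs(n_samples=300, centers=4, random_state=0)
    labels = KMeans(n_clusters=4, n_init=10, random_state=0).fit_predict(X)

    # With ground-truth labels available, a supervised metric such as ARI applies.
    print("ARI:", adjusted_rand_score(y_true, labels))

    # Without ground truth, internal metrics such as the Silhouette Coefficient apply.
    print("Silhouette Coefficient:", silhouette_score(X, labels))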

sklearn/cluster/_kmeans.py

Lines changed: 18 additions & 0 deletions
@@ -1208,6 +1208,9 @@ class KMeans(_BaseKMeans):
         The number of clusters to form as well as the number of
         centroids to generate.

+        For an example of how to choose an optimal value for `n_clusters`, refer to
+        :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`.
+
     init : {'k-means++', 'random'}, callable or array-like of shape \
             (n_clusters, n_features), default='k-means++'
         Method for initialization:

@@ -1364,6 +1367,21 @@ class KMeans(_BaseKMeans):
     >>> kmeans.cluster_centers_
     array([[10., 2.],
            [ 1., 2.]])
+
+    For a more detailed example of K-Means using the iris dataset, see
+    :ref:`sphx_glr_auto_examples_cluster_plot_cluster_iris.py`.
+
+    For examples of common problems with K-Means and how to address them, see
+    :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_assumptions.py`.
+
+    For an example of how to use K-Means to perform color quantization, see
+    :ref:`sphx_glr_auto_examples_cluster_plot_color_quantization.py`.
+
+    For a demonstration of how K-Means can be used to cluster text documents, see
+    :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`.
+
+    For a comparison between K-Means and MiniBatchKMeans, refer to the example
+    :ref:`sphx_glr_auto_examples_cluster_plot_mini_batch_kmeans.py`.
     """

     _parameter_constraints: dict = {
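
The docstring now points to the :class:`KMeans` vs. :class:`MiniBatchKMeans`
comparison example; a rough sketch of that kind of comparison on synthetic blobs
(sizes and parameters below are illustrative)::

    from time import time

    from sklearn.cluster import KMeans, MiniBatchKMeans
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=30_000, centers=8, random_state=0)

    t0 = time()
    km = KMeans(n_clusters=8, n_init=10, random_state=0).fit(X)
    t_km = time() - t0

    t0 = time()
    mbk = MiniBatchKMeans(n_clusters=8, batch_size=1024, n_init=10, random_state=0).fit(X)
    t_mbk = time() - t0

    # MiniBatchKMeans usually trades slightly higher inertia for a much shorter fit.
    print(f"KMeans:          {t_km:.2f}s  inertia={km.inertia_:.1f}")
    print(f"MiniBatchKMeans: {t_mbk:.2f}s  inertia={mbk.inertia_:.1f}")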
