From 851e7b49be0c9fd514552031e93b4a0dac4723ec Mon Sep 17 00:00:00 2001 From: Maren Westermann Date: Fri, 17 Nov 2023 14:22:17 +0100 Subject: [PATCH 01/12] plot_cluster_iris example --- examples/cluster/plot_cluster_iris.py | 9 ++++----- sklearn/cluster/_kmeans.py | 3 +++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/cluster/plot_cluster_iris.py b/examples/cluster/plot_cluster_iris.py index b20bc8e38dd78..b152473aa1bec 100644 --- a/examples/cluster/plot_cluster_iris.py +++ b/examples/cluster/plot_cluster_iris.py @@ -7,13 +7,13 @@ - top left: What a K-means algorithm would yield using 8 clusters. -- top right: What the effect of a bad initialization is +- top right: What using three clusters would deliver. + +- bottom left: What the effect of a bad initialization is on the classification process: By setting n_init to only 1 (default is 10), the amount of times that the algorithm will be run with different centroid seeds is reduced. -- bottom left: What using eight clusters would deliver. - - bottom right: The ground truth. """ @@ -73,8 +73,7 @@ horizontalalignment="center", bbox=dict(alpha=0.2, edgecolor="w", facecolor="w"), ) -# Reorder the labels to have colors matching the cluster results -y = np.choose(y, [1, 2, 0]).astype(float) + ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor="k") ax.xaxis.set_ticklabels([]) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 09758c0508334..649ed99df35e4 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1376,6 +1376,9 @@ class KMeans(_BaseKMeans): >>> kmeans.cluster_centers_ array([[10., 2.], [ 1., 2.]]) + + For a more detailed example using the iris dataset, see + :ref:`sphx_glr_auto_examples_cluster_plot_cluster_iris.py`. 
""" _parameter_constraints: dict = { From 632a54356e018a4d2aef2862b1b396a62ee27c60 Mon Sep 17 00:00:00 2001 From: Maren Westermann Date: Fri, 17 Nov 2023 14:56:06 +0100 Subject: [PATCH 02/12] add plot_cluster_iris example to user guide --- doc/modules/clustering.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index eb668bbac3381..495b74802e25b 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -233,6 +233,11 @@ to the dataset :math:`X`. K-means can be used for vector quantization. This is achieved using the transform method of a trained model of :class:`KMeans`. +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_cluster_plot_cluster_iris.py`: Example usage of KMeans + using the iris dataset + Low-level parallelism --------------------- From 858571fdead9bf062cb362377aa4c7ed73e70325 Mon Sep 17 00:00:00 2001 From: Maren Westermann Date: Sat, 18 Nov 2023 11:51:06 +0100 Subject: [PATCH 03/12] add links to plot_color_quantization --- doc/modules/clustering.rst | 5 ++++- examples/cluster/plot_color_quantization.py | 2 +- sklearn/cluster/_kmeans.py | 5 ++++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 495b74802e25b..56fe41cfaf022 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -235,9 +235,12 @@ transform method of a trained model of :class:`KMeans`. .. 
topic:: Examples: - * :ref:`sphx_glr_auto_examples_cluster_plot_cluster_iris.py`: Example usage of KMeans + * :ref:`sphx_glr_auto_examples_cluster_plot_cluster_iris.py`: Example usage of K-Means using the iris dataset + * :ref:`ref:`sphx_glr_auto_examples_cluster_plot_color_quantization.py`: Example of + using K-Means for performing color quantization + Low-level parallelism --------------------- diff --git a/examples/cluster/plot_color_quantization.py b/examples/cluster/plot_color_quantization.py index cc8849b64ab6f..e322fe1a0fd99 100644 --- a/examples/cluster/plot_color_quantization.py +++ b/examples/cluster/plot_color_quantization.py @@ -41,7 +41,7 @@ china = load_sample_image("china.jpg") # Convert to floats instead of the default 8 bits integer coding. Dividing by -# 255 is important so that plt.imshow behaves works well on float data (need to +# 255 is important so that plt.imshow works well on float data (need to # be in the range [0-1]) china = np.array(china, dtype=np.float64) / 255 diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 649ed99df35e4..9725a071949e2 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1377,8 +1377,11 @@ class KMeans(_BaseKMeans): array([[10., 2.], [ 1., 2.]]) - For a more detailed example using the iris dataset, see + For a more detailed example of K-Means using the iris dataset see :ref:`sphx_glr_auto_examples_cluster_plot_cluster_iris.py`. + + For an example of how to use K-Means to perform color quantization see + :ref:`sphx_glr_auto_examples_cluster_plot_color_quantization.py`. 
""" _parameter_constraints: dict = { From 983865cc7306836c3c24374c1104e2f79291a61f Mon Sep 17 00:00:00 2001 From: Maren Westermann Date: Sun, 19 Nov 2023 11:28:21 +0100 Subject: [PATCH 04/12] fix rendering issue --- doc/modules/clustering.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 56fe41cfaf022..96df3aaf934b5 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -238,7 +238,7 @@ transform method of a trained model of :class:`KMeans`. * :ref:`sphx_glr_auto_examples_cluster_plot_cluster_iris.py`: Example usage of K-Means using the iris dataset - * :ref:`ref:`sphx_glr_auto_examples_cluster_plot_color_quantization.py`: Example of + * :ref:`sphx_glr_auto_examples_cluster_plot_color_quantization.py`: Example of using K-Means for performing color quantization Low-level parallelism From 54846eca73d75c9283479be789fb75d53bb12bba Mon Sep 17 00:00:00 2001 From: Maren Westermann Date: Sun, 19 Nov 2023 18:50:26 +0100 Subject: [PATCH 05/12] add links to plot_kmeans_assumptions --- doc/modules/clustering.rst | 3 +++ sklearn/cluster/_kmeans.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 96df3aaf934b5..aa1af56d2ed88 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -238,6 +238,9 @@ transform method of a trained model of :class:`KMeans`. 
* :ref:`sphx_glr_auto_examples_cluster_plot_cluster_iris.py`: Example usage of K-Means using the iris dataset + * :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_assumptions.py`: Examples of common + problems with the K-Means algorithm and how to address them + * :ref:`sphx_glr_auto_examples_cluster_plot_color_quantization.py`: Example of using K-Means for performing color quantization diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 9725a071949e2..fff6be55f7903 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1380,6 +1380,9 @@ class KMeans(_BaseKMeans): For a more detailed example of K-Means using the iris dataset see :ref:`sphx_glr_auto_examples_cluster_plot_cluster_iris.py`. + For examples of common problems with K-Means and how to address them see + :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_assumptions.py`. + For an example of how to use K-Means to perform color quantization see :ref:`sphx_glr_auto_examples_cluster_plot_color_quantization.py`. """ From a4f05c83529284ee21a9a46c73f58bc709e3939b Mon Sep 17 00:00:00 2001 From: Maren Westermann Date: Mon, 20 Nov 2023 15:33:44 +0100 Subject: [PATCH 06/12] embed links in text --- doc/modules/clustering.rst | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index aa1af56d2ed88..63abe0b529a63 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -182,6 +182,9 @@ It suffers from various drawbacks: :align: center :scale: 50 +For more detailed descriptions of the issues shown above and how to address them, +refer to this example: :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_assumptions.py`. + K-means is often referred to as Lloyd's algorithm. In basic terms, the algorithm has three steps. 
The first step chooses the initial centroids, with the most basic method being to choose :math:`k` samples from the dataset @@ -231,19 +234,15 @@ weight of 2 to a sample is equivalent to adding a duplicate of that sample to the dataset :math:`X`. K-means can be used for vector quantization. This is achieved using the -transform method of a trained model of :class:`KMeans`. +transform method of a trained model of :class:`KMeans`. For an example of +performing vector quantization on an image refer to +:ref:`sphx_glr_auto_examples_cluster_plot_color_quantization.py`. .. topic:: Examples: * :ref:`sphx_glr_auto_examples_cluster_plot_cluster_iris.py`: Example usage of K-Means using the iris dataset - * :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_assumptions.py`: Examples of common - problems with the K-Means algorithm and how to address them - - * :ref:`sphx_glr_auto_examples_cluster_plot_color_quantization.py`: Example of - using K-Means for performing color quantization - Low-level parallelism --------------------- From 48a0b30a66d35ec9a3bb0845719730790653585d Mon Sep 17 00:00:00 2001 From: Maren Westermann Date: Mon, 20 Nov 2023 16:57:42 +0100 Subject: [PATCH 07/12] add link to plot_kmeans_digits in user guide --- doc/modules/clustering.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 63abe0b529a63..fcf1f4910db73 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -221,7 +221,9 @@ initializations of the centroids. One method to help address this issue is the k-means++ initialization scheme, which has been implemented in scikit-learn (use the ``init='k-means++'`` parameter). This initializes the centroids to be (generally) distant from each other, leading to probably better results than -random initialization, as shown in the reference. +random initialization, as shown in the reference. 
For a detailed example of +comaparing different initialization schemes refer to +:ref:`sphx_glr_auto_examples_cluster_plot_kmeans_digits.py`. K-means++ can also be called independently to select seeds for other clustering algorithms, see :func:`sklearn.cluster.kmeans_plusplus` for details From 5f8546815080e0f117035f008bee288b104686ee Mon Sep 17 00:00:00 2001 From: Maren Westermann Date: Tue, 21 Nov 2023 16:31:07 +0100 Subject: [PATCH 08/12] add links to plot_kmeans_silhouette_analysis --- doc/modules/clustering.rst | 3 ++- sklearn/cluster/_kmeans.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index fcf1f4910db73..6466802785dea 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -183,7 +183,8 @@ It suffers from various drawbacks: :scale: 50 For more detailed descriptions of the issues shown above and how to address them, -refer to this example: :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_assumptions.py`. +refer to the examples :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_assumptions.py` +and :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`. K-means is often referred to as Lloyd's algorithm. In basic terms, the algorithm has three steps. The first step chooses the initial centroids, with diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index fff6be55f7903..f5e8e59e32577 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1220,6 +1220,9 @@ class KMeans(_BaseKMeans): The number of clusters to form as well as the number of centroids to generate. + For an example of how to choose an optimal value for `n_clusters` refer to + :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`. 
+ init : {'k-means++', 'random'}, callable or array-like of shape \ (n_clusters, n_features), default='k-means++' Method for initialization: From 57b1cf17f722e07f9435067a7e2c33128ae56aa5 Mon Sep 17 00:00:00 2001 From: Maren Westermann Date: Wed, 22 Nov 2023 11:53:12 +0100 Subject: [PATCH 09/12] add link to plot_mini_batch_kmeans --- sklearn/cluster/_kmeans.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index f5e8e59e32577..028e9d130a610 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1388,6 +1388,9 @@ class KMeans(_BaseKMeans): For an example of how to use K-Means to perform color quantization see :ref:`sphx_glr_auto_examples_cluster_plot_color_quantization.py`. + + For a comparison between K-Means and MiniBatchKMeans refer to example + :ref:`sphx_glr_auto_examples_cluster_plot_mini_batch_kmeans.py`. """ _parameter_constraints: dict = { From 147c4737cecffd8c53cd668cca0d80569b0125d0 Mon Sep 17 00:00:00 2001 From: Maren Westermann Date: Fri, 24 Nov 2023 13:44:37 +0100 Subject: [PATCH 10/12] add link in plot_document_clustering.py --- examples/text/plot_document_clustering.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/text/plot_document_clustering.py b/examples/text/plot_document_clustering.py index fa68b8bd312ea..bfc83fd96af2f 100644 --- a/examples/text/plot_document_clustering.py +++ b/examples/text/plot_document_clustering.py @@ -99,8 +99,8 @@ # assignment have an ARI of 0.0 in expectation. # # If the ground truth labels are not known, evaluation can only be performed -# using the model results itself. In that case, the Silhouette Coefficient comes -# in handy. +# using the model results itself. In that case, the :ref:`Silhouette Coefficient +# ` comes in handy. # # For more reference, see :ref:`clustering_evaluation`. 
From c92170663966df8e7ef9f7cc075d4fdb21415ce0 Mon Sep 17 00:00:00 2001 From: Maren Westermann Date: Fri, 24 Nov 2023 14:04:01 +0100 Subject: [PATCH 11/12] add links to plot_document_clustering --- doc/modules/clustering.rst | 7 +++++-- sklearn/cluster/_kmeans.py | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 6466802785dea..bf17d2ba66e9d 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -246,6 +246,9 @@ performing vector quantization on an image refer to * :ref:`sphx_glr_auto_examples_cluster_plot_cluster_iris.py`: Example usage of K-Means using the iris dataset + * :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`: Document clustering + using KMeans and MiniBatchKMeans based on sparse data + Low-level parallelism --------------------- @@ -307,8 +310,8 @@ small, as shown in the example and cited reference. * :ref:`sphx_glr_auto_examples_cluster_plot_mini_batch_kmeans.py`: Comparison of KMeans and MiniBatchKMeans - * :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`: Document clustering using sparse - MiniBatchKMeans + * :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`: Document clustering + using KMeans and MiniBatchKMeans based on sparse data * :ref:`sphx_glr_auto_examples_cluster_plot_dict_face_patches.py` diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 028e9d130a610..3e9206c0e4288 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1389,6 +1389,9 @@ class KMeans(_BaseKMeans): For an example of how to use K-Means to perform color quantization see :ref:`sphx_glr_auto_examples_cluster_plot_color_quantization.py`. + For a demonstration of how K-Means can be used to cluster text documents see + :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`. 
+ For a comparison between K-Means and MiniBatchKMeans refer to example :ref:`sphx_glr_auto_examples_cluster_plot_mini_batch_kmeans.py`. """ From 759da869f20e5b50687d12dc56eb3172600d7c1e Mon Sep 17 00:00:00 2001 From: Maren Westermann Date: Sun, 10 Dec 2023 11:29:47 +0100 Subject: [PATCH 12/12] address comments --- doc/modules/clustering.rst | 16 ++++++++-------- examples/text/plot_document_clustering.py | 5 +++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index bf17d2ba66e9d..6c8ddc89c2b93 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -223,7 +223,7 @@ k-means++ initialization scheme, which has been implemented in scikit-learn (use the ``init='k-means++'`` parameter). This initializes the centroids to be (generally) distant from each other, leading to probably better results than random initialization, as shown in the reference. For a detailed example of -comaparing different initialization schemes refer to +comparing different initialization schemes, refer to :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_digits.py`. K-means++ can also be called independently to select seeds for other @@ -237,17 +237,17 @@ weight of 2 to a sample is equivalent to adding a duplicate of that sample to the dataset :math:`X`. K-means can be used for vector quantization. This is achieved using the -transform method of a trained model of :class:`KMeans`. For an example of +``transform`` method of a trained model of :class:`KMeans`. For an example of performing vector quantization on an image refer to :ref:`sphx_glr_auto_examples_cluster_plot_color_quantization.py`. ..
topic:: Examples: - * :ref:`sphx_glr_auto_examples_cluster_plot_cluster_iris.py`: Example usage of K-Means - using the iris dataset + * :ref:`sphx_glr_auto_examples_cluster_plot_cluster_iris.py`: Example usage of + :class:`KMeans` using the iris dataset * :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`: Document clustering - using KMeans and MiniBatchKMeans based on sparse data + using :class:`KMeans` and :class:`MiniBatchKMeans` based on sparse data Low-level parallelism --------------------- @@ -307,11 +307,11 @@ small, as shown in the example and cited reference. .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_cluster_plot_mini_batch_kmeans.py`: Comparison of KMeans and - MiniBatchKMeans + * :ref:`sphx_glr_auto_examples_cluster_plot_mini_batch_kmeans.py`: Comparison of + :class:`KMeans` and :class:`MiniBatchKMeans` * :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`: Document clustering - using KMeans and MiniBatchKMeans based on sparse data + using :class:`KMeans` and :class:`MiniBatchKMeans` based on sparse data * :ref:`sphx_glr_auto_examples_cluster_plot_dict_face_patches.py` diff --git a/examples/text/plot_document_clustering.py b/examples/text/plot_document_clustering.py index bfc83fd96af2f..2c3506f4ec32e 100644 --- a/examples/text/plot_document_clustering.py +++ b/examples/text/plot_document_clustering.py @@ -99,8 +99,9 @@ # assignment have an ARI of 0.0 in expectation. # # If the ground truth labels are not known, evaluation can only be performed -# using the model results itself. In that case, the :ref:`Silhouette Coefficient -# ` comes in handy. +# using the model results itself. In that case, the Silhouette Coefficient comes in +# handy. See :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py` +# for an example of how to use it. # # For more reference, see :ref:`clustering_evaluation`.