From a24a13833fcad5dcc79ab4933c3377f424164d8e Mon Sep 17 00:00:00 2001
From: Vachan D A <vaas4627@colorado.edu>
Date: Thu, 5 Dec 2019 20:09:22 -0700
Subject: [PATCH 1/2] Fixes default values in docs for FeatureAgglomeration,
 KMeans, and MiniBatchKMeans classes.

---
 sklearn/cluster/_hierarchical.py | 21 +++++++-----
 sklearn/cluster/_k_means.py      | 56 +++++++++++++++++---------------
 2 files changed, 41 insertions(+), 36 deletions(-)

diff --git a/sklearn/cluster/_hierarchical.py b/sklearn/cluster/_hierarchical.py
index f553a9e505eb5..3106bb5b0af45 100644
--- a/sklearn/cluster/_hierarchical.py
+++ b/sklearn/cluster/_hierarchical.py
@@ -929,21 +929,21 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
 
     Parameters
     ----------
-    n_clusters : int or None, optional (default=2)
+    n_clusters : int or None, default=2
         The number of clusters to find. It must be ``None`` if
         ``distance_threshold`` is not ``None``.
 
-    affinity : string or callable, default "euclidean"
+    affinity : str or callable, default='euclidean'
         Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
         "manhattan", "cosine", or 'precomputed'.
         If linkage is "ward", only "euclidean" is accepted.
 
-    memory : None, str or object with the joblib.Memory interface, optional
+    memory : None, str or object with the joblib.Memory interface, default=None
         Used to cache the output of the computation of the tree.
         By default, no caching is done. If a string is given, it is the
         path to the caching directory.
 
-    connectivity : array-like or callable, optional
+    connectivity : array-like or callable, default=None
         Connectivity matrix. Defines for each feature the neighboring
         features following a given structure of the data.
         This can be a connectivity matrix itself or a callable that transforms
@@ -951,7 +951,7 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
         kneighbors_graph. Default is None, i.e, the
         hierarchical clustering algorithm is unstructured.
 
-    compute_full_tree : bool or 'auto', optional, default "auto"
+    compute_full_tree : bool or 'auto', default='auto'
         Stop early the construction of the tree at n_clusters. This is
         useful to decrease computation time if the number of clusters is
         not small compared to the number of features. This option is
@@ -960,8 +960,11 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
         be advantageous to compute the full tree. It must be ``True`` if
         ``distance_threshold`` is not ``None``.
 
-    linkage : {"ward", "complete", "average", "single"}, optional\
-            (default="ward")
+        - 'auto' (the default) is equivalent to True if distance_threshold
+          is not None; otherwise its value is given by the expression
+          ``n_clusters < max(100, 0.02 * n_samples)``.
+
+    linkage : {'ward', 'complete', 'average', 'single'}, default='ward'
         Which linkage criterion to use. The linkage criterion determines which
         distance to use between sets of features. The algorithm will merge
         the pairs of cluster that minimize this criterion.
@@ -974,12 +977,12 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
         - single uses the minimum of the distances between all observations
           of the two sets.
 
-    pooling_func : callable, default np.mean
+    pooling_func : callable, default=np.mean
         This combines the values of agglomerated features into a single
         value, and should accept an array of shape [M, N] and the keyword
         argument `axis=1`, and reduce it to an array of size [M].
 
-    distance_threshold : float, optional (default=None)
+    distance_threshold : float, default=None
         The linkage distance threshold above which, clusters will not be
         merged. If not ``None``, ``n_clusters`` must be ``None`` and
         ``compute_full_tree`` must be ``True``.
diff --git a/sklearn/cluster/_k_means.py b/sklearn/cluster/_k_means.py
index 52f2b5fee4dac..6de67228d82f0 100644
--- a/sklearn/cluster/_k_means.py
+++ b/sklearn/cluster/_k_means.py
@@ -654,12 +654,12 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator):
     Parameters
     ----------
 
-    n_clusters : int, optional, default: 8
+    n_clusters : int, default=8
         The number of clusters to form as well as the number of
         centroids to generate.
 
-    init : {'k-means++', 'random' or an ndarray}
-        Method for initialization, defaults to 'k-means++':
+    init : {'k-means++', 'random'} or ndarray, default='k-means++'
+        Method for initialization:
 
         'k-means++' : selects initial cluster centers for k-mean
         clustering in a smart way to speed up convergence. See section
@@ -671,19 +671,19 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator):
         If an ndarray is passed, it should be of shape (n_clusters, n_features)
         and gives the initial centers.
 
-    n_init : int, default: 10
+    n_init : int, default=10
         Number of time the k-means algorithm will be run with different
         centroid seeds. The final results will be the best output of
         n_init consecutive runs in terms of inertia.
 
-    max_iter : int, default: 300
+    max_iter : int, default=300
         Maximum number of iterations of the k-means algorithm for a
         single run.
 
-    tol : float, default: 1e-4
+    tol : float, default=1e-4
         Relative tolerance with regards to inertia to declare convergence.
 
-    precompute_distances : {'auto', True, False}
+    precompute_distances : {'auto', True, False}, default='auto'
         Precompute distances (faster but takes more memory).
 
         'auto' : do not precompute distances if n_samples * n_clusters > 12
@@ -694,15 +694,15 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator):
 
         False : never precompute distances.
 
-    verbose : int, default 0
+    verbose : int, default=0
         Verbosity mode.
 
-    random_state : int, RandomState instance or None (default)
+    random_state : int, RandomState instance or None, default=None
         Determines random number generation for centroid initialization. Use
         an int to make the randomness deterministic.
         See :term:`Glossary <random_state>`.
 
-    copy_x : bool, optional
+    copy_x : bool, default=True
         When pre-computing distances it is more numerically accurate to center
         the data first.  If copy_x is True (default), then the original data is
         not modified, ensuring X is C-contiguous.  If False, the original data
@@ -711,7 +711,7 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator):
         the data mean, in this case it will also not ensure that data is
         C-contiguous which may cause a significant slowdown.
 
-    n_jobs : int or None, optional (default=None)
+    n_jobs : int or None, default=None
         The number of jobs to use for the computation. This works by computing
         each of the n_init runs in parallel.
 
@@ -719,11 +719,11 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator):
         ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
         for more details.
 
-    algorithm : "auto", "full" or "elkan", default="auto"
-        K-means algorithm to use. The classical EM-style algorithm is "full".
-        The "elkan" variation is more efficient by using the triangle
-        inequality, but currently doesn't support sparse data. "auto" chooses
-        "elkan" for dense data and "full" for sparse data.
+    algorithm : {'auto', 'full', 'elkan'}, default='auto'
+        K-means algorithm to use. The classical EM-style algorithm is 'full'.
+        The 'elkan' variation is more efficient by using the triangle
+        inequality, but currently doesn't support sparse data. 'auto' chooses
+        'elkan' for dense data and 'full' for sparse data.
 
     Attributes
     ----------
@@ -1335,12 +1335,12 @@ class MiniBatchKMeans(KMeans):
     Parameters
     ----------
 
-    n_clusters : int, optional, default: 8
+    n_clusters : int, default=8
         The number of clusters to form as well as the number of
         centroids to generate.
 
-    init : {'k-means++', 'random' or an ndarray}, default: 'k-means++'
-        Method for initialization, defaults to 'k-means++':
+    init : {'k-means++', 'random'} or ndarray, default='k-means++'
+        Method for initialization:
 
         'k-means++' : selects initial cluster centers for k-mean
         clustering in a smart way to speed up convergence. See section
@@ -1352,26 +1352,26 @@ class MiniBatchKMeans(KMeans):
         If an ndarray is passed, it should be of shape (n_clusters, n_features)
         and gives the initial centers.
 
-    max_iter : int, optional
+    max_iter : int, default=100
         Maximum number of iterations over the complete dataset before
         stopping independently of any early stopping criterion heuristics.
 
-    batch_size : int, optional, default: 100
+    batch_size : int, default=100
         Size of the mini batches.
 
-    verbose : bool, optional
+    verbose : int, default=0
         Verbosity mode.
 
     compute_labels : bool, default=True
         Compute label assignment and inertia for the complete dataset
         once the minibatch optimization has converged in fit.
 
-    random_state : int, RandomState instance or None (default)
+    random_state : int, RandomState instance or None, default=None
         Determines random number generation for centroid initialization and
         random reassignment. Use an int to make the randomness deterministic.
         See :term:`Glossary <random_state>`.
 
-    tol : float, default: 0.0
+    tol : float, default=0.0
         Control early stopping based on the relative center changes as
         measured by a smoothed, variance-normalized of the mean center
         squared position changes. This early stopping heuristics is
@@ -1382,25 +1382,27 @@ class MiniBatchKMeans(KMeans):
         To disable convergence detection based on normalized center
         change, set tol to 0.0 (default).
 
-    max_no_improvement : int, default: 10
+    max_no_improvement : int, default=10
         Control early stopping based on the consecutive number of mini
         batches that does not yield an improvement on the smoothed inertia.
 
         To disable convergence detection based on inertia, set
         max_no_improvement to None.
 
-    init_size : int, optional, default: 3 * batch_size
+    init_size : int, default=None
         Number of samples to randomly sample for speeding up the
         initialization (sometimes at the expense of accuracy): the
         only algorithm is initialized by running a batch KMeans on a
         random subset of the data. This needs to be larger than n_clusters.
 
+        If ``None``, ``init_size`` is set to ``3 * batch_size``.
+
     n_init : int, default=3
         Number of random initializations that are tried.
         In contrast to KMeans, the algorithm is only run once, using the
         best of the ``n_init`` initializations as measured by inertia.
 
-    reassignment_ratio : float, default: 0.01
+    reassignment_ratio : float, default=0.01
         Control the fraction of the maximum number of counts for a
         center to be reassigned. A higher value means that low count
         centers are more easily reassigned, which means that the

From 7d5b14fa1c518014229f8e5b63677e45bf691efd Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Tue, 10 Dec 2019 17:35:38 +0100
Subject: [PATCH 2/2] fix

---
 sklearn/cluster/_hierarchical.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/cluster/_hierarchical.py b/sklearn/cluster/_hierarchical.py
index b44595fbdc031..9883f7e809d48 100644
--- a/sklearn/cluster/_hierarchical.py
+++ b/sklearn/cluster/_hierarchical.py
@@ -716,8 +716,8 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator):
         the full tree. It must be ``True`` if ``distance_threshold`` is not
         ``None``. By default `compute_full_tree` is "auto", which is equivalent
         to `True` when `distance_threshold` is not `None` or that `n_clusters`
-        is inferior to 100 or `0.02 * n_samples`. Otherwise, "auto" is
-        equivalent to `False`.
+        is inferior to the maximum between 100 and `0.02 * n_samples`.
+        Otherwise, "auto" is equivalent to `False`.
 
     linkage : {"ward", "complete", "average", "single"}, default="ward"
         Which linkage criterion to use. The linkage criterion determines which
@@ -955,8 +955,8 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
         the full tree. It must be ``True`` if ``distance_threshold`` is not
         ``None``. By default `compute_full_tree` is "auto", which is equivalent
         to `True` when `distance_threshold` is not `None` or that `n_clusters`
-        is inferior to 100 or `0.02 * n_samples`. Otherwise, "auto" is
-        equivalent to `False`.
+        is inferior to the maximum between 100 and `0.02 * n_samples`.
+        Otherwise, "auto" is equivalent to `False`.
 
     linkage : {'ward', 'complete', 'average', 'single'}, default='ward'
         Which linkage criterion to use. The linkage criterion determines which