scikit-learn · glemaitre · Dec 10, 2019 · Dec 6, 2019 · Dec 10, 2019 · Dec 10, 2019
diff --git a/sklearn/cluster/_hierarchical.py b/sklearn/cluster/_hierarchical.py
@@ -716,8 +716,8 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator):
         the full tree. It must be ``True`` if ``distance_threshold`` is not
         ``None``. By default `compute_full_tree` is "auto", which is equivalent
         to `True` when `distance_threshold` is not `None` or that `n_clusters`
-        is inferior to 100 or `0.02 * n_samples`. Otherwise, "auto" is
-        equivalent to `False`.
+        is inferior to the maximum between 100 or `0.02 * n_samples`.
+        Otherwise, "auto" is equivalent to `False`.
 
     linkage : {"ward", "complete", "average", "single"}, default="ward"
         Which linkage criterion to use. The linkage criterion determines which
@@ -924,39 +924,41 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
 
     Parameters
     ----------
-    n_clusters : int or None, optional (default=2)
+    n_clusters : int, default=2
         The number of clusters to find. It must be ``None`` if
         ``distance_threshold`` is not ``None``.
 
-    affinity : string or callable, default "euclidean"
+    affinity : str or callable, default='euclidean'
         Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
         "manhattan", "cosine", or 'precomputed'.
         If linkage is "ward", only "euclidean" is accepted.
 
-    memory : None, str or object with the joblib.Memory interface, optional
+    memory : str or object with the joblib.Memory interface, default=None
         Used to cache the output of the computation of the tree.
         By default, no caching is done. If a string is given, it is the
         path to the caching directory.
 
-    connectivity : array-like or callable, optional
+    connectivity : array-like or callable, default=None
         Connectivity matrix. Defines for each feature the neighboring
         features following a given structure of the data.
         This can be a connectivity matrix itself or a callable that transforms
         the data into a connectivity matrix, such as derived from
         kneighbors_graph. Default is None, i.e, the
         hierarchical clustering algorithm is unstructured.
 
-    compute_full_tree : bool or 'auto', optional, default "auto"
-        Stop early the construction of the tree at n_clusters. This is
-        useful to decrease computation time if the number of clusters is
-        not small compared to the number of features. This option is
-        useful only when specifying a connectivity matrix. Note also that
-        when varying the number of clusters and using caching, it may
-        be advantageous to compute the full tree. It must be ``True`` if
-        ``distance_threshold`` is not ``None``.
+    compute_full_tree : 'auto' or bool, optional, default='auto'
+        Stop early the construction of the tree at n_clusters. This is useful
+        to decrease computation time if the number of clusters is not small
+        compared to the number of features. This option is useful only when
+        specifying a connectivity matrix. Note also that when varying the
+        number of clusters and using caching, it may be advantageous to compute
+        the full tree. It must be ``True`` if ``distance_threshold`` is not
+        ``None``. By default `compute_full_tree` is "auto", which is equivalent
+        to `True` when `distance_threshold` is not `None` or that `n_clusters`
+        is inferior to the maximum between 100 or `0.02 * n_samples`.
+        Otherwise, "auto" is equivalent to `False`.
 
-    linkage : {"ward", "complete", "average", "single"}, optional\
-            (default="ward")
+    linkage : {'ward', 'complete', 'average', 'single'}, default='ward'
         Which linkage criterion to use. The linkage criterion determines which
         distance to use between sets of features. The algorithm will merge
         the pairs of cluster that minimize this criterion.
@@ -969,12 +971,12 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
         - single uses the minimum of the distances between all observations
           of the two sets.
 
-    pooling_func : callable, default np.mean
+    pooling_func : callable, default=np.mean
         This combines the values of agglomerated features into a single
         value, and should accept an array of shape [M, N] and the keyword
         argument `axis=1`, and reduce it to an array of size [M].
 
-    distance_threshold : float, optional (default=None)
+    distance_threshold : float, default=None
         The linkage distance threshold above which, clusters will not be
         merged. If not ``None``, ``n_clusters`` must be ``None`` and
         ``compute_full_tree`` must be ``True``.
@@ -988,7 +990,7 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
         ``distance_threshold=None``, it will be equal to the given
         ``n_clusters``.
 
-    labels_ : array-like, (n_features,)
+    labels_ : array-like of (n_features,)
         cluster labels for each feature.
 
     n_leaves_ : int
@@ -997,15 +999,15 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
     n_connected_components_ : int
         The estimated number of connected components in the graph.
 
-    children_ : array-like, shape (n_nodes-1, 2)
+    children_ : array-like of shape (n_nodes-1, 2)
         The children of each non-leaf node. Values less than `n_features`
         correspond to leaves of the tree which are the original samples.
         A node `i` greater than or equal to `n_features` is a non-leaf
         node and has children `children_[i - n_features]`. Alternatively
         at the i-th iteration, children[i][0] and children[i][1]
         are merged to form node `n_features + i`
 
-    distances_ : array-like, shape (n_nodes-1,)
+    distances_ : array-like of shape (n_nodes-1,)
         Distances between nodes in the corresponding place in `children_`.
         Only computed if distance_threshold is not None.
 

diff --git a/sklearn/cluster/_k_means.py b/sklearn/cluster/_k_means.py
@@ -1336,12 +1336,13 @@ class MiniBatchKMeans(KMeans):
     Parameters
     ----------
 
-    n_clusters : int, optional, default: 8
+    n_clusters : int, default=8
         The number of clusters to form as well as the number of
         centroids to generate.
 
-    init : {'k-means++', 'random' or an ndarray}, default: 'k-means++'
-        Method for initialization, defaults to 'k-means++':
+    init : {'k-means++', 'random'} or ndarray of shape \
+            (n_clusters, n_features), default='k-means++'
+        Method for initialization
 
         'k-means++' : selects initial cluster centers for k-mean
         clustering in a smart way to speed up convergence. See section
@@ -1353,26 +1354,26 @@ class MiniBatchKMeans(KMeans):
         If an ndarray is passed, it should be of shape (n_clusters, n_features)
         and gives the initial centers.
 
-    max_iter : int, optional
+    max_iter : int, default=100
         Maximum number of iterations over the complete dataset before
         stopping independently of any early stopping criterion heuristics.
 
-    batch_size : int, optional, default: 100
+    batch_size : int, default=100
         Size of the mini batches.
 
-    verbose : bool, optional
+    verbose : int, default=0
         Verbosity mode.
 
     compute_labels : bool, default=True
         Compute label assignment and inertia for the complete dataset
         once the minibatch optimization has converged in fit.
 
-    random_state : int, RandomState instance or None (default)
+    random_state : int, RandomState instance, default=None
         Determines random number generation for centroid initialization and
         random reassignment. Use an int to make the randomness deterministic.
         See :term:`Glossary <random_state>`.
 
-    tol : float, default: 0.0
+    tol : float, default=0.0
         Control early stopping based on the relative center changes as
         measured by a smoothed, variance-normalized of the mean center
         squared position changes. This early stopping heuristics is
@@ -1383,25 +1384,27 @@ class MiniBatchKMeans(KMeans):
         To disable convergence detection based on normalized center
         change, set tol to 0.0 (default).
 
-    max_no_improvement : int, default: 10
+    max_no_improvement : int, default=10
         Control early stopping based on the consecutive number of mini
         batches that does not yield an improvement on the smoothed inertia.
 
         To disable convergence detection based on inertia, set
         max_no_improvement to None.
 
-    init_size : int, optional, default: 3 * batch_size
+    init_size : int, default=None
         Number of samples to randomly sample for speeding up the
         initialization (sometimes at the expense of accuracy): the
         only algorithm is initialized by running a batch KMeans on a
         random subset of the data. This needs to be larger than n_clusters.
 
+        If `None`, `init_size= 3 * batch_size`.
+
     n_init : int, default=3
         Number of random initializations that are tried.
         In contrast to KMeans, the algorithm is only run once, using the
         best of the ``n_init`` initializations as measured by inertia.
 
-    reassignment_ratio : float, default: 0.01
+    reassignment_ratio : float, default=0.01
         Control the fraction of the maximum number of counts for a
         center to be reassigned. A higher value means that low count
         centers are more easily reassigned, which means that the
@@ -1411,10 +1414,10 @@ class MiniBatchKMeans(KMeans):
     Attributes
     ----------
 
-    cluster_centers_ : array, [n_clusters, n_features]
+    cluster_centers_ : ndarray of shape (n_clusters, n_features)
         Coordinates of cluster centers
 
-    labels_ :
+    labels_ : int
         Labels of each point (if compute_labels is set to True).
 
     inertia_ : float