From 38f87a5ac572fd2bc259e645983ff11ce3287657 Mon Sep 17 00:00:00 2001 From: Vachan D A Date: Mon, 2 Dec 2019 20:50:36 -0700 Subject: [PATCH 1/4] DOC included default values in AgglomerativeClustering class. --- sklearn/cluster/_hierarchical.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/sklearn/cluster/_hierarchical.py b/sklearn/cluster/_hierarchical.py index f553a9e505eb5..ba84483b343e1 100644 --- a/sklearn/cluster/_hierarchical.py +++ b/sklearn/cluster/_hierarchical.py @@ -683,23 +683,23 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): Parameters ---------- - n_clusters : int or None, optional (default=2) + n_clusters : int or None, default=2 The number of clusters to find. It must be ``None`` if ``distance_threshold`` is not ``None``. - affinity : string or callable, default: "euclidean" + affinity : string or callable, default='euclidean' Metric used to compute the linkage. Can be "euclidean", "l1", "l2", "manhattan", "cosine", or "precomputed". If linkage is "ward", only "euclidean" is accepted. If "precomputed", a distance matrix (instead of a similarity matrix) is needed as input for the fit method. - memory : None, str or object with the joblib.Memory interface, optional + memory : None, str or object with the joblib.Memory interface, default=None Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory. - connectivity : array-like or callable, optional + connectivity : array-like or callable, default=None Connectivity matrix. Defines for each sample the neighboring samples following a given structure of the data. This can be a connectivity matrix itself or a callable that transforms @@ -707,7 +707,7 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): kneighbors_graph. Default is None, i.e, the hierarchical clustering algorithm is unstructured. 
- compute_full_tree : bool or 'auto' (optional) + compute_full_tree : bool or 'auto', default='auto' Stop early the construction of the tree at n_clusters. This is useful to decrease computation time if the number of clusters is not small compared to the number of samples. This option is @@ -716,8 +716,7 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): be advantageous to compute the full tree. It must be ``True`` if ``distance_threshold`` is not ``None``. - linkage : {"ward", "complete", "average", "single"}, optional \ - (default="ward") + linkage : {"ward", "complete", "average", "single"}, default='ward' Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observation. The algorithm will merge the pairs of cluster that minimize this criterion. @@ -730,7 +729,7 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): - single uses the minimum of the distances between all observations of the two sets. - distance_threshold : float, optional (default=None) + distance_threshold : float, default=None The linkage distance threshold above which, clusters will not be merged. If not ``None``, ``n_clusters`` must be ``None`` and ``compute_full_tree`` must be ``True``. From 24919301dacbd3c910c236b2358a9b8d51fd5264 Mon Sep 17 00:00:00 2001 From: Vachan D A Date: Tue, 3 Dec 2019 19:40:38 -0700 Subject: [PATCH 2/4] Adds explain for 'auto' keyword in 'compute_full_tree' parameter. --- sklearn/cluster/_hierarchical.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/cluster/_hierarchical.py b/sklearn/cluster/_hierarchical.py index ba84483b343e1..5446c3f1ebdcc 100644 --- a/sklearn/cluster/_hierarchical.py +++ b/sklearn/cluster/_hierarchical.py @@ -959,6 +959,10 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): be advantageous to compute the full tree. It must be ``True`` if ``distance_threshold`` is not ``None``. 
+ - auto by default tales True if distance_threshold is not None, + otherwise it's value is based on the expression: + n_clusters < max(100, 0.02 * n_samples) + linkage : {"ward", "complete", "average", "single"}, optional\ (default="ward") Which linkage criterion to use. The linkage criterion determines which From f2f304bb9ce384526f35f0c8f29a374645efde9f Mon Sep 17 00:00:00 2001 From: Vachan D A Date: Wed, 4 Dec 2019 09:29:50 -0700 Subject: [PATCH 3/4] Moves the doc string to the correct class. --- sklearn/cluster/_hierarchical.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/_hierarchical.py b/sklearn/cluster/_hierarchical.py index 5446c3f1ebdcc..828244b100eac 100644 --- a/sklearn/cluster/_hierarchical.py +++ b/sklearn/cluster/_hierarchical.py @@ -716,6 +716,10 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): be advantageous to compute the full tree. It must be ``True`` if ``distance_threshold`` is not ``None``. + - auto by default tales True if distance_threshold is not None, + otherwise it's value is based on the expression: + n_clusters < max(100, 0.02 * n_samples) + linkage : {"ward", "complete", "average", "single"}, default='ward' Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observation. The algorithm will merge @@ -959,10 +963,6 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): be advantageous to compute the full tree. It must be ``True`` if ``distance_threshold`` is not ``None``. - - auto by default tales True if distance_threshold is not None, - otherwise it's value is based on the expression: - n_clusters < max(100, 0.02 * n_samples) - linkage : {"ward", "complete", "average", "single"}, optional\ (default="ward") Which linkage criterion to use. 
The linkage criterion determines which From 91788a5cd5bdc8bac9089909d3918d2da11e9e7d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 10 Dec 2019 16:40:42 +0100 Subject: [PATCH 4/4] Apply review --- sklearn/cluster/_hierarchical.py | 35 ++++++++++++++++---------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/sklearn/cluster/_hierarchical.py b/sklearn/cluster/_hierarchical.py index 828244b100eac..f5c03cfca536c 100644 --- a/sklearn/cluster/_hierarchical.py +++ b/sklearn/cluster/_hierarchical.py @@ -687,14 +687,14 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): The number of clusters to find. It must be ``None`` if ``distance_threshold`` is not ``None``. - affinity : string or callable, default='euclidean' + affinity : str or callable, default='euclidean' Metric used to compute the linkage. Can be "euclidean", "l1", "l2", "manhattan", "cosine", or "precomputed". If linkage is "ward", only "euclidean" is accepted. If "precomputed", a distance matrix (instead of a similarity matrix) is needed as input for the fit method. - memory : None, str or object with the joblib.Memory interface, default=None + memory : str or object with the joblib.Memory interface, default=None Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory. @@ -707,20 +707,19 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): kneighbors_graph. Default is None, i.e, the hierarchical clustering algorithm is unstructured. - compute_full_tree : bool or 'auto', default='auto' - Stop early the construction of the tree at n_clusters. This is - useful to decrease computation time if the number of clusters is - not small compared to the number of samples. This option is - useful only when specifying a connectivity matrix. Note also that - when varying the number of clusters and using caching, it may - be advantageous to compute the full tree. 
It must be ``True`` if - ``distance_threshold`` is not ``None``. - - - auto by default tales True if distance_threshold is not None, - otherwise it's value is based on the expression: - n_clusters < max(100, 0.02 * n_samples) - - linkage : {"ward", "complete", "average", "single"}, default='ward' + compute_full_tree : 'auto' or bool, default='auto' + Stop early the construction of the tree at n_clusters. This is useful + to decrease computation time if the number of clusters is not small + compared to the number of samples. This option is useful only when + specifying a connectivity matrix. Note also that when varying the + number of clusters and using caching, it may be advantageous to compute + the full tree. It must be ``True`` if ``distance_threshold`` is not + ``None``. By default `compute_full_tree` is "auto", which is equivalent + to `True` when `distance_threshold` is not `None` or when `n_clusters` + is less than the maximum of 100 and `0.02 * n_samples`. + Otherwise, "auto" is equivalent to `False`. + + linkage : {"ward", "complete", "average", "single"}, default="ward" Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observation. The algorithm will merge the pairs of cluster that minimize this criterion. @@ -747,7 +746,7 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): ``distance_threshold=None``, it will be equal to the given ``n_clusters``. - labels_ : array [n_samples] + labels_ : ndarray of shape (n_samples,) cluster labels for each point n_leaves_ : int @@ -756,7 +755,7 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): n_connected_components_ : int The estimated number of connected components in the graph. - children_ : array-like, shape (n_samples-1, 2) + children_ : array-like of shape (n_samples-1, 2) The children of each non-leaf node. Values less than `n_samples` correspond to leaves of the tree which are the original samples.
A node `i` greater than or equal to `n_samples` is a non-leaf