From 4fec4d5c9b610a7aedfa056e183779c9b6c51747 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 10 Nov 2022 17:04:50 +0100 Subject: [PATCH 1/4] MAINT refactor affinity_propagation to make a private function without validation --- sklearn/cluster/_affinity_propagation.py | 92 ++++++++++++++++++--- sklearn/linear_model/_coordinate_descent.py | 2 + 2 files changed, 82 insertions(+), 12 deletions(-) diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 07443d65f0ec4..6d806ff71e0f9 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -52,7 +52,6 @@ def affinity_propagation( Parameters ---------- - S : array-like of shape (n_samples, n_samples) Matrix of similarities between points. @@ -95,7 +94,6 @@ def affinity_propagation( Returns ------- - cluster_centers_indices : ndarray of shape (n_clusters,) Index of clusters centers. @@ -128,16 +126,81 @@ def affinity_propagation( Between Data Points", Science Feb. 2007 """ S = as_float_array(S, copy=copy) - n_samples = S.shape[0] if S.shape[0] != S.shape[1]: - raise ValueError("S must be a square array (shape=%s)" % repr(S.shape)) + raise ValueError(f"S must be a square array (shape={S.shape})") if preference is None: preference = np.median(S) + preference = np.array(preference, copy=False) + + random_state = check_random_state(random_state) + + return _affinity_propagation( + S, + preference=preference, + convergence_iter=convergence_iter, + max_iter=max_iter, + damping=damping, + verbose=verbose, + return_n_iter=return_n_iter, + random_state=random_state, + ) - preference = np.array(preference) +def _affinity_propagation( + S, + *, + preference, + convergence_iter, + max_iter, + damping, + verbose, + return_n_iter, + random_state, +): + """Same function than `affinity_propagation` but without input validation. + + Parameters + ---------- + S : array-like of shape (n_samples, n_samples), dtype={np.float32, np.float64} + Matrix of similarities between points. + + preference : ndarray of shape (n_samples,) or (1,) + Preferences for each point. + + convergence_iter : int + Number of iterations with no change in the number of estimated clusters + that stops the convergence. + + max_iter : int + Maximum number of iterations. + + damping : float + Damping factor between 0.5 and 1. + + verbose : bool, default=False + The verbosity level. + + return_n_iter : bool + Whether or not to return the number of iterations. + + random_state : RandomState instance + Pseudo-random number generator to control the starting state. + + Returns + ------- + cluster_centers_indices : ndarray of shape (n_clusters,) + Index of clusters centers. + + labels : ndarray of shape (n_samples,) + Cluster labels for each point. + + n_iter : int + Number of iterations run. Returned only if `return_n_iter` is + set to True. + """ + n_samples = S.shape[0] if n_samples == 1 or _equal_similarities_and_preferences(S, preference): # It makes no sense to run the algorithm in this case, so return 1 or # n_samples clusters, depending on preferences @@ -158,8 +221,6 @@ def affinity_propagation( else (np.array([0]), np.array([0] * n_samples)) ) - random_state = check_random_state(random_state) - # Place preference on the diagonal of S S.flat[:: (n_samples + 1)] = preference @@ -472,24 +533,31 @@ def fit(self, X, y=None): accept_sparse = "csr" X = self._validate_data(X, accept_sparse=accept_sparse) if self.affinity == "precomputed": - self.affinity_matrix_ = X + self.affinity_matrix_ = X.copy() if self.copy else X else: # self.affinity == "euclidean" self.affinity_matrix_ = -euclidean_distances(X, squared=True) + if self.preference is None: + preference = np.median(self.affinity_matrix_) + else: + preference = self.preference + preference = np.array(preference, copy=False) + + random_state = check_random_state(self.random_state) + ( self.cluster_centers_indices_, self.labels_, self.n_iter_, - ) = affinity_propagation( + ) = _affinity_propagation( self.affinity_matrix_, - preference=self.preference, + preference=preference, max_iter=self.max_iter, convergence_iter=self.convergence_iter, damping=self.damping, - copy=self.copy, verbose=self.verbose, return_n_iter=True, - random_state=self.random_state, + random_state=random_state, ) if self.affinity != "precomputed": diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index bb9a4e4c0c326..0b8e9eba8e585 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -631,6 +631,8 @@ def enet_path( positive, ) elif precompute is False: + if l1_reg < 0.1: + print(coef_[:5]) model = cd_fast.enet_coordinate_descent( coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive ) From fd3f9e6e63866f7e47541042c9eb8432169b3833 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 10 Nov 2022 17:23:04 +0100 Subject: [PATCH 2/4] Bunch overengineering --- sklearn/cluster/_affinity_propagation.py | 263 +++++++++--------- .../tests/test_affinity_propagation.py | 2 +- 2 files changed, 139 insertions(+), 126 deletions(-) diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 6d806ff71e0f9..188d9be1e684c 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -12,7 +12,7 @@ from ..exceptions import ConvergenceWarning from ..base import BaseEstimator, ClusterMixin -from ..utils import as_float_array, check_random_state +from ..utils import Bunch, as_float_array, check_random_state from ..utils._param_validation import Interval, StrOptions from ..utils.validation import check_is_fitted from ..metrics import euclidean_distances @@ -34,120 +34,6 @@ def all_equal_similarities(): return all_equal_preferences() and all_equal_similarities() -def affinity_propagation( - S, - *, - preference=None, - convergence_iter=15, - max_iter=200, - damping=0.5, - copy=True, - verbose=False, - return_n_iter=False, - random_state=None, -): - """Perform Affinity Propagation Clustering of data. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - S : array-like of shape (n_samples, n_samples) - Matrix of similarities between points. - - preference : array-like of shape (n_samples,) or float, default=None - Preferences for each point - points with larger values of - preferences are more likely to be chosen as exemplars. The number of - exemplars, i.e. of clusters, is influenced by the input preferences - value. If the preferences are not passed as arguments, they will be - set to the median of the input similarities (resulting in a moderate - number of clusters). For a smaller amount of clusters, this can be set - to the minimum value of the similarities. - - convergence_iter : int, default=15 - Number of iterations with no change in the number - of estimated clusters that stops the convergence. - - max_iter : int, default=200 - Maximum number of iterations. - - damping : float, default=0.5 - Damping factor between 0.5 and 1. - - copy : bool, default=True - If copy is False, the affinity matrix is modified inplace by the - algorithm, for memory efficiency. - - verbose : bool, default=False - The verbosity level. - - return_n_iter : bool, default=False - Whether or not to return the number of iterations. - - random_state : int, RandomState instance or None, default=None - Pseudo-random number generator to control the starting state. - Use an int for reproducible results across function calls. - See the :term:`Glossary `. - - .. versionadded:: 0.23 - this parameter was previously hardcoded as 0. - - Returns - ------- - cluster_centers_indices : ndarray of shape (n_clusters,) - Index of clusters centers. - - labels : ndarray of shape (n_samples,) - Cluster labels for each point. - - n_iter : int - Number of iterations run. Returned only if `return_n_iter` is - set to True. - - Notes - ----- - For an example, see :ref:`examples/cluster/plot_affinity_propagation.py - `. - - When the algorithm does not converge, it will still return a arrays of - ``cluster_center_indices`` and labels if there are any exemplars/clusters, - however they may be degenerate and should be used with caution. - - When all training samples have equal similarities and equal preferences, - the assignment of cluster centers and labels depends on the preference. - If the preference is smaller than the similarities, a single cluster center - and label ``0`` for every sample will be returned. Otherwise, every - training sample becomes its own cluster center and is assigned a unique - label. - - References - ---------- - Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages - Between Data Points", Science Feb. 2007 - """ - S = as_float_array(S, copy=copy) - - if S.shape[0] != S.shape[1]: - raise ValueError(f"S must be a square array (shape={S.shape})") - - if preference is None: - preference = np.median(S) - preference = np.array(preference, copy=False) - - random_state = check_random_state(random_state) - - return _affinity_propagation( - S, - preference=preference, - convergence_iter=convergence_iter, - max_iter=max_iter, - damping=damping, - verbose=verbose, - return_n_iter=return_n_iter, - random_state=random_state, - ) - - def _affinity_propagation( S, *, @@ -328,7 +214,133 @@ def _affinity_propagation( return cluster_centers_indices, labels -############################################################################### +def _validate_init_common_params(params): + """Validate common parameters for init methods and public function.""" + if params.S.shape[0] != params.S.shape[1]: + raise ValueError( + f"The matrix of similarities must be a square array. Got {params.S.shape} " + "instead." + ) + + if params.preference is None: + preference = np.median(params.S) + else: + preference = params.preference + preference = np.array(preference, copy=False) + + random_state = check_random_state(params.random_state) + return { + "preference": preference, + "random_state": random_state, + } + + +def affinity_propagation( + S, + *, + preference=None, + convergence_iter=15, + max_iter=200, + damping=0.5, + copy=True, + verbose=False, + return_n_iter=False, + random_state=None, +): + """Perform Affinity Propagation Clustering of data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + S : array-like of shape (n_samples, n_samples) + Matrix of similarities between points. + + preference : array-like of shape (n_samples,) or float, default=None + Preferences for each point - points with larger values of + preferences are more likely to be chosen as exemplars. The number of + exemplars, i.e. of clusters, is influenced by the input preferences + value. If the preferences are not passed as arguments, they will be + set to the median of the input similarities (resulting in a moderate + number of clusters). For a smaller amount of clusters, this can be set + to the minimum value of the similarities. + + convergence_iter : int, default=15 + Number of iterations with no change in the number + of estimated clusters that stops the convergence. + + max_iter : int, default=200 + Maximum number of iterations. + + damping : float, default=0.5 + Damping factor between 0.5 and 1. + + copy : bool, default=True + If copy is False, the affinity matrix is modified inplace by the + algorithm, for memory efficiency. + + verbose : bool, default=False + The verbosity level. + + return_n_iter : bool, default=False + Whether or not to return the number of iterations. + + random_state : int, RandomState instance or None, default=None + Pseudo-random number generator to control the starting state. + Use an int for reproducible results across function calls. + See the :term:`Glossary `. + + .. versionadded:: 0.23 + this parameter was previously hardcoded as 0. + + Returns + ------- + cluster_centers_indices : ndarray of shape (n_clusters,) + Index of clusters centers. + + labels : ndarray of shape (n_samples,) + Cluster labels for each point. + + n_iter : int + Number of iterations run. Returned only if `return_n_iter` is + set to True. + + Notes + ----- + For an example, see :ref:`examples/cluster/plot_affinity_propagation.py + `. + + When the algorithm does not converge, it will still return a arrays of + ``cluster_center_indices`` and labels if there are any exemplars/clusters, + however they may be degenerate and should be used with caution. + + When all training samples have equal similarities and equal preferences, + the assignment of cluster centers and labels depends on the preference. + If the preference is smaller than the similarities, a single cluster center + and label ``0`` for every sample will be returned. Otherwise, every + training sample becomes its own cluster center and is assigned a unique + label. + + References + ---------- + Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages + Between Data Points", Science Feb. 2007 + """ + S = as_float_array(S, copy=copy) + + params = _validate_init_common_params( + Bunch(**{"S": S, "preference": preference, "random_state": random_state}) + ) + + return _affinity_propagation( + S, + convergence_iter=convergence_iter, + max_iter=max_iter, + damping=damping, + verbose=verbose, + return_n_iter=return_n_iter, + **params, + ) class AffinityPropagation(ClusterMixin, BaseEstimator): @@ -537,13 +549,15 @@ def fit(self, X, y=None): else: # self.affinity == "euclidean" self.affinity_matrix_ = -euclidean_distances(X, squared=True) - if self.preference is None: - preference = np.median(self.affinity_matrix_) - else: - preference = self.preference - preference = np.array(preference, copy=False) - - random_state = check_random_state(self.random_state) + params = _validate_init_common_params( + Bunch( + **{ + "S": self.affinity_matrix_, + "preference": self.preference, + "random_state": self.random_state, + } + ) + ) ( self.cluster_centers_indices_, @@ -551,13 +565,12 @@ def fit(self, X, y=None): self.n_iter_, ) = _affinity_propagation( self.affinity_matrix_, - preference=preference, max_iter=self.max_iter, convergence_iter=self.convergence_iter, damping=self.damping, verbose=self.verbose, return_n_iter=True, - random_state=random_state, + **params, ) if self.affinity != "precomputed": diff --git a/sklearn/cluster/tests/test_affinity_propagation.py b/sklearn/cluster/tests/test_affinity_propagation.py index cc696620a0e4d..52007c375f667 100644 --- a/sklearn/cluster/tests/test_affinity_propagation.py +++ b/sklearn/cluster/tests/test_affinity_propagation.py @@ -101,7 +101,7 @@ def test_affinity_propagation_no_copy(): def test_affinity_propagation_affinity_shape(): """Check the shape of the affinity matrix when using `affinity_propagation.""" S = -euclidean_distances(X, squared=True) - err_msg = "S must be a square array" + err_msg = "The matrix of similarities must be a square array" with pytest.raises(ValueError, match=err_msg): affinity_propagation(S[:, :-1]) From 0cbf2b4a6222ee2974041d10e08ef56e845fa398 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 10 Nov 2022 17:26:46 +0100 Subject: [PATCH 3/4] remove debug --- sklearn/linear_model/_coordinate_descent.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 0b8e9eba8e585..bb9a4e4c0c326 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -631,8 +631,6 @@ def enet_path( positive, ) elif precompute is False: - if l1_reg < 0.1: - print(coef_[:5]) model = cd_fast.enet_coordinate_descent( coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive ) From f12ba4a95f73b82026929b60936aa89a14d39a09 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 10 Nov 2022 18:02:47 +0100 Subject: [PATCH 4/4] make the function call the class --- sklearn/cluster/_affinity_propagation.py | 111 ++++++----------------- 1 file changed, 30 insertions(+), 81 deletions(-) diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 188d9be1e684c..180e37996aa07 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -12,7 +12,7 @@ from ..exceptions import ConvergenceWarning from ..base import BaseEstimator, ClusterMixin -from ..utils import Bunch, as_float_array, check_random_state +from ..utils import as_float_array, check_random_state from ..utils._param_validation import Interval, StrOptions from ..utils.validation import check_is_fitted from ..metrics import euclidean_distances @@ -45,47 +45,7 @@ def _affinity_propagation( return_n_iter, random_state, ): - """Same function than `affinity_propagation` but without input validation. - - Parameters - ---------- - S : array-like of shape (n_samples, n_samples), dtype={np.float32, np.float64} - Matrix of similarities between points. - - preference : ndarray of shape (n_samples,) or (1,) - Preferences for each point. - - convergence_iter : int - Number of iterations with no change in the number of estimated clusters - that stops the convergence. - - max_iter : int - Maximum number of iterations. - - damping : float - Damping factor between 0.5 and 1. - - verbose : bool, default=False - The verbosity level. - - return_n_iter : bool - Whether or not to return the number of iterations. - - random_state : RandomState instance - Pseudo-random number generator to control the starting state. - - Returns - ------- - cluster_centers_indices : ndarray of shape (n_clusters,) - Index of clusters centers. - - labels : ndarray of shape (n_samples,) - Cluster labels for each point. - - n_iter : int - Number of iterations run. Returned only if `return_n_iter` is - set to True. - """ + """Main affinity propagation algorithm.""" n_samples = S.shape[0] if n_samples == 1 or _equal_similarities_and_preferences(S, preference): # It makes no sense to run the algorithm in this case, so return 1 or @@ -214,25 +174,8 @@ def _affinity_propagation( return cluster_centers_indices, labels -def _validate_init_common_params(params): - """Validate common parameters for init methods and public function.""" - if params.S.shape[0] != params.S.shape[1]: - raise ValueError( - f"The matrix of similarities must be a square array. Got {params.S.shape} " - "instead." - ) - - if params.preference is None: - preference = np.median(params.S) - else: - preference = params.preference - preference = np.array(preference, copy=False) - - random_state = check_random_state(params.random_state) - return { - "preference": preference, - "random_state": random_state, - } +############################################################################### +# Public API def affinity_propagation( @@ -328,19 +271,20 @@ def affinity_propagation( """ S = as_float_array(S, copy=copy) - params = _validate_init_common_params( - Bunch(**{"S": S, "preference": preference, "random_state": random_state}) - ) - - return _affinity_propagation( - S, - convergence_iter=convergence_iter, - max_iter=max_iter, + estimator = AffinityPropagation( damping=damping, + max_iter=max_iter, + convergence_iter=convergence_iter, + copy=False, + preference=preference, + affinity="precomputed", verbose=verbose, - return_n_iter=return_n_iter, - **params, - ) + random_state=random_state, + ).fit(S) + + if return_n_iter: + return estimator.cluster_centers_indices_, estimator.labels_, estimator.n_iter_ + return estimator.cluster_centers_indices_, estimator.labels_ class AffinityPropagation(ClusterMixin, BaseEstimator): @@ -549,15 +493,19 @@ def fit(self, X, y=None): else: # self.affinity == "euclidean" self.affinity_matrix_ = -euclidean_distances(X, squared=True) - params = _validate_init_common_params( - Bunch( - **{ - "S": self.affinity_matrix_, - "preference": self.preference, - "random_state": self.random_state, - } + if self.affinity_matrix_.shape[0] != self.affinity_matrix_.shape[1]: + raise ValueError( + "The matrix of similarities must be a square array. " + f"Got {self.affinity_matrix_.shape} instead." ) - ) + + if self.preference is None: + preference = np.median(self.affinity_matrix_) + else: + preference = self.preference + preference = np.array(preference, copy=False) + + random_state = check_random_state(self.random_state) ( self.cluster_centers_indices_, @@ -567,10 +515,11 @@ def fit(self, X, y=None): self.affinity_matrix_, max_iter=self.max_iter, convergence_iter=self.convergence_iter, + preference=preference, damping=self.damping, verbose=self.verbose, return_n_iter=True, - **params, + random_state=random_state, ) if self.affinity != "precomputed":