diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 07443d65f0ec4..180e37996aa07 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -34,110 +34,19 @@ def all_equal_similarities(): return all_equal_preferences() and all_equal_similarities() -def affinity_propagation( +def _affinity_propagation( S, *, - preference=None, - convergence_iter=15, - max_iter=200, - damping=0.5, - copy=True, - verbose=False, - return_n_iter=False, - random_state=None, + preference, + convergence_iter, + max_iter, + damping, + verbose, + return_n_iter, + random_state, ): - """Perform Affinity Propagation Clustering of data. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - - S : array-like of shape (n_samples, n_samples) - Matrix of similarities between points. - - preference : array-like of shape (n_samples,) or float, default=None - Preferences for each point - points with larger values of - preferences are more likely to be chosen as exemplars. The number of - exemplars, i.e. of clusters, is influenced by the input preferences - value. If the preferences are not passed as arguments, they will be - set to the median of the input similarities (resulting in a moderate - number of clusters). For a smaller amount of clusters, this can be set - to the minimum value of the similarities. - - convergence_iter : int, default=15 - Number of iterations with no change in the number - of estimated clusters that stops the convergence. - - max_iter : int, default=200 - Maximum number of iterations. - - damping : float, default=0.5 - Damping factor between 0.5 and 1. - - copy : bool, default=True - If copy is False, the affinity matrix is modified inplace by the - algorithm, for memory efficiency. - - verbose : bool, default=False - The verbosity level. - - return_n_iter : bool, default=False - Whether or not to return the number of iterations. - - random_state : int, RandomState instance or None, default=None - Pseudo-random number generator to control the starting state. - Use an int for reproducible results across function calls. - See the :term:`Glossary `. - - .. versionadded:: 0.23 - this parameter was previously hardcoded as 0. - - Returns - ------- - - cluster_centers_indices : ndarray of shape (n_clusters,) - Index of clusters centers. - - labels : ndarray of shape (n_samples,) - Cluster labels for each point. - - n_iter : int - Number of iterations run. Returned only if `return_n_iter` is - set to True. - - Notes - ----- - For an example, see :ref:`examples/cluster/plot_affinity_propagation.py - `. - - When the algorithm does not converge, it will still return a arrays of - ``cluster_center_indices`` and labels if there are any exemplars/clusters, - however they may be degenerate and should be used with caution. - - When all training samples have equal similarities and equal preferences, - the assignment of cluster centers and labels depends on the preference. - If the preference is smaller than the similarities, a single cluster center - and label ``0`` for every sample will be returned. Otherwise, every - training sample becomes its own cluster center and is assigned a unique - label. - - References - ---------- - Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages - Between Data Points", Science Feb. 2007 - """ - S = as_float_array(S, copy=copy) + """Main affinity propagation algorithm.""" n_samples = S.shape[0] - - if S.shape[0] != S.shape[1]: - raise ValueError("S must be a square array (shape=%s)" % repr(S.shape)) - - if preference is None: - preference = np.median(S) - - preference = np.array(preference) - if n_samples == 1 or _equal_similarities_and_preferences(S, preference): # It makes no sense to run the algorithm in this case, so return 1 or # n_samples clusters, depending on preferences @@ -158,8 +67,6 @@ def affinity_propagation( else (np.array([0]), np.array([0] * n_samples)) ) - random_state = check_random_state(random_state) - # Place preference on the diagonal of S S.flat[:: (n_samples + 1)] = preference @@ -268,6 +175,116 @@ def affinity_propagation( ############################################################################### +# Public API + + +def affinity_propagation( + S, + *, + preference=None, + convergence_iter=15, + max_iter=200, + damping=0.5, + copy=True, + verbose=False, + return_n_iter=False, + random_state=None, +): + """Perform Affinity Propagation Clustering of data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + S : array-like of shape (n_samples, n_samples) + Matrix of similarities between points. + + preference : array-like of shape (n_samples,) or float, default=None + Preferences for each point - points with larger values of + preferences are more likely to be chosen as exemplars. The number of + exemplars, i.e. of clusters, is influenced by the input preferences + value. If the preferences are not passed as arguments, they will be + set to the median of the input similarities (resulting in a moderate + number of clusters). For a smaller amount of clusters, this can be set + to the minimum value of the similarities. + + convergence_iter : int, default=15 + Number of iterations with no change in the number + of estimated clusters that stops the convergence. + + max_iter : int, default=200 + Maximum number of iterations. + + damping : float, default=0.5 + Damping factor between 0.5 and 1. + + copy : bool, default=True + If copy is False, the affinity matrix is modified inplace by the + algorithm, for memory efficiency. + + verbose : bool, default=False + The verbosity level. + + return_n_iter : bool, default=False + Whether or not to return the number of iterations. + + random_state : int, RandomState instance or None, default=None + Pseudo-random number generator to control the starting state. + Use an int for reproducible results across function calls. + See the :term:`Glossary `. + + .. versionadded:: 0.23 + this parameter was previously hardcoded as 0. + + Returns + ------- + cluster_centers_indices : ndarray of shape (n_clusters,) + Index of clusters centers. + + labels : ndarray of shape (n_samples,) + Cluster labels for each point. + + n_iter : int + Number of iterations run. Returned only if `return_n_iter` is + set to True. + + Notes + ----- + For an example, see :ref:`examples/cluster/plot_affinity_propagation.py + `. + + When the algorithm does not converge, it will still return a arrays of + ``cluster_center_indices`` and labels if there are any exemplars/clusters, + however they may be degenerate and should be used with caution. + + When all training samples have equal similarities and equal preferences, + the assignment of cluster centers and labels depends on the preference. + If the preference is smaller than the similarities, a single cluster center + and label ``0`` for every sample will be returned. Otherwise, every + training sample becomes its own cluster center and is assigned a unique + label. + + References + ---------- + Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages + Between Data Points", Science Feb. 2007 + """ + S = as_float_array(S, copy=copy) + + estimator = AffinityPropagation( + damping=damping, + max_iter=max_iter, + convergence_iter=convergence_iter, + copy=False, + preference=preference, + affinity="precomputed", + verbose=verbose, + random_state=random_state, + ).fit(S) + + if return_n_iter: + return estimator.cluster_centers_indices_, estimator.labels_, estimator.n_iter_ + return estimator.cluster_centers_indices_, estimator.labels_ class AffinityPropagation(ClusterMixin, BaseEstimator): @@ -472,24 +489,37 @@ def fit(self, X, y=None): accept_sparse = "csr" X = self._validate_data(X, accept_sparse=accept_sparse) if self.affinity == "precomputed": - self.affinity_matrix_ = X + self.affinity_matrix_ = X.copy() if self.copy else X else: # self.affinity == "euclidean" self.affinity_matrix_ = -euclidean_distances(X, squared=True) + if self.affinity_matrix_.shape[0] != self.affinity_matrix_.shape[1]: + raise ValueError( + "The matrix of similarities must be a square array. " + f"Got {self.affinity_matrix_.shape} instead." + ) + + if self.preference is None: + preference = np.median(self.affinity_matrix_) + else: + preference = self.preference + preference = np.array(preference, copy=False) + + random_state = check_random_state(self.random_state) + ( self.cluster_centers_indices_, self.labels_, self.n_iter_, - ) = affinity_propagation( + ) = _affinity_propagation( self.affinity_matrix_, - preference=self.preference, max_iter=self.max_iter, convergence_iter=self.convergence_iter, + preference=preference, damping=self.damping, - copy=self.copy, verbose=self.verbose, return_n_iter=True, - random_state=self.random_state, + random_state=random_state, ) if self.affinity != "precomputed": diff --git a/sklearn/cluster/tests/test_affinity_propagation.py b/sklearn/cluster/tests/test_affinity_propagation.py index cc696620a0e4d..52007c375f667 100644 --- a/sklearn/cluster/tests/test_affinity_propagation.py +++ b/sklearn/cluster/tests/test_affinity_propagation.py @@ -101,7 +101,7 @@ def test_affinity_propagation_no_copy(): def test_affinity_propagation_affinity_shape(): """Check the shape of the affinity matrix when using `affinity_propagation.""" S = -euclidean_distances(X, squared=True) - err_msg = "S must be a square array" + err_msg = "The matrix of similarities must be a square array" with pytest.raises(ValueError, match=err_msg): affinity_propagation(S[:, :-1])