Skip to content

MAINT make affinity_propagation calling AffinityPropagation #24884

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Nov 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
244 changes: 137 additions & 107 deletions sklearn/cluster/_affinity_propagation.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,110 +34,19 @@ def all_equal_similarities():
return all_equal_preferences() and all_equal_similarities()


def affinity_propagation(
def _affinity_propagation(
S,
*,
preference=None,
convergence_iter=15,
max_iter=200,
damping=0.5,
copy=True,
verbose=False,
return_n_iter=False,
random_state=None,
preference,
convergence_iter,
max_iter,
damping,
verbose,
return_n_iter,
random_state,
):
"""Perform Affinity Propagation Clustering of data.

Read more in the :ref:`User Guide <affinity_propagation>`.

Parameters
----------

S : array-like of shape (n_samples, n_samples)
Matrix of similarities between points.

preference : array-like of shape (n_samples,) or float, default=None
Preferences for each point - points with larger values of
preferences are more likely to be chosen as exemplars. The number of
exemplars, i.e. of clusters, is influenced by the input preferences
value. If the preferences are not passed as arguments, they will be
set to the median of the input similarities (resulting in a moderate
number of clusters). For a smaller amount of clusters, this can be set
to the minimum value of the similarities.

convergence_iter : int, default=15
Number of iterations with no change in the number
of estimated clusters that stops the convergence.

max_iter : int, default=200
Maximum number of iterations.

damping : float, default=0.5
Damping factor between 0.5 and 1.

copy : bool, default=True
If copy is False, the affinity matrix is modified inplace by the
algorithm, for memory efficiency.

verbose : bool, default=False
The verbosity level.

return_n_iter : bool, default=False
Whether or not to return the number of iterations.

random_state : int, RandomState instance or None, default=None
Pseudo-random number generator to control the starting state.
Use an int for reproducible results across function calls.
See the :term:`Glossary <random_state>`.

.. versionadded:: 0.23
this parameter was previously hardcoded as 0.

Returns
-------

cluster_centers_indices : ndarray of shape (n_clusters,)
Index of clusters centers.

labels : ndarray of shape (n_samples,)
Cluster labels for each point.

n_iter : int
Number of iterations run. Returned only if `return_n_iter` is
set to True.

Notes
-----
For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
<sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.

When the algorithm does not converge, it will still return a arrays of
``cluster_center_indices`` and labels if there are any exemplars/clusters,
however they may be degenerate and should be used with caution.

When all training samples have equal similarities and equal preferences,
the assignment of cluster centers and labels depends on the preference.
If the preference is smaller than the similarities, a single cluster center
and label ``0`` for every sample will be returned. Otherwise, every
training sample becomes its own cluster center and is assigned a unique
label.

References
----------
Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
Between Data Points", Science Feb. 2007
"""
S = as_float_array(S, copy=copy)
"""Main affinity propagation algorithm."""
n_samples = S.shape[0]

if S.shape[0] != S.shape[1]:
raise ValueError("S must be a square array (shape=%s)" % repr(S.shape))

if preference is None:
preference = np.median(S)

preference = np.array(preference)

if n_samples == 1 or _equal_similarities_and_preferences(S, preference):
# It makes no sense to run the algorithm in this case, so return 1 or
# n_samples clusters, depending on preferences
Expand All @@ -158,8 +67,6 @@ def affinity_propagation(
else (np.array([0]), np.array([0] * n_samples))
)

random_state = check_random_state(random_state)

# Place preference on the diagonal of S
S.flat[:: (n_samples + 1)] = preference

Expand Down Expand Up @@ -268,6 +175,116 @@ def affinity_propagation(


###############################################################################
# Public API


def affinity_propagation(
S,
*,
preference=None,
convergence_iter=15,
max_iter=200,
damping=0.5,
copy=True,
verbose=False,
return_n_iter=False,
random_state=None,
):
"""Perform Affinity Propagation Clustering of data.

Read more in the :ref:`User Guide <affinity_propagation>`.

Parameters
----------
S : array-like of shape (n_samples, n_samples)
Matrix of similarities between points.

preference : array-like of shape (n_samples,) or float, default=None
Preferences for each point - points with larger values of
preferences are more likely to be chosen as exemplars. The number of
exemplars, i.e. of clusters, is influenced by the input preferences
value. If the preferences are not passed as arguments, they will be
set to the median of the input similarities (resulting in a moderate
number of clusters). For a smaller amount of clusters, this can be set
to the minimum value of the similarities.

convergence_iter : int, default=15
Number of iterations with no change in the number
of estimated clusters that stops the convergence.

max_iter : int, default=200
Maximum number of iterations.

damping : float, default=0.5
Damping factor between 0.5 and 1.

copy : bool, default=True
If copy is False, the affinity matrix is modified inplace by the
algorithm, for memory efficiency.

verbose : bool, default=False
The verbosity level.

return_n_iter : bool, default=False
Whether or not to return the number of iterations.

random_state : int, RandomState instance or None, default=None
Pseudo-random number generator to control the starting state.
Use an int for reproducible results across function calls.
See the :term:`Glossary <random_state>`.

.. versionadded:: 0.23
this parameter was previously hardcoded as 0.

Returns
-------
cluster_centers_indices : ndarray of shape (n_clusters,)
Index of clusters centers.

labels : ndarray of shape (n_samples,)
Cluster labels for each point.

n_iter : int
Number of iterations run. Returned only if `return_n_iter` is
set to True.

Notes
-----
For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
<sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.

When the algorithm does not converge, it will still return a arrays of
``cluster_center_indices`` and labels if there are any exemplars/clusters,
however they may be degenerate and should be used with caution.

When all training samples have equal similarities and equal preferences,
the assignment of cluster centers and labels depends on the preference.
If the preference is smaller than the similarities, a single cluster center
and label ``0`` for every sample will be returned. Otherwise, every
training sample becomes its own cluster center and is assigned a unique
label.

References
----------
Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
Between Data Points", Science Feb. 2007
"""
S = as_float_array(S, copy=copy)

estimator = AffinityPropagation(
damping=damping,
max_iter=max_iter,
convergence_iter=convergence_iter,
copy=False,
preference=preference,
affinity="precomputed",
verbose=verbose,
random_state=random_state,
).fit(S)

if return_n_iter:
return estimator.cluster_centers_indices_, estimator.labels_, estimator.n_iter_
return estimator.cluster_centers_indices_, estimator.labels_


class AffinityPropagation(ClusterMixin, BaseEstimator):
Expand Down Expand Up @@ -472,24 +489,37 @@ def fit(self, X, y=None):
accept_sparse = "csr"
X = self._validate_data(X, accept_sparse=accept_sparse)
if self.affinity == "precomputed":
self.affinity_matrix_ = X
self.affinity_matrix_ = X.copy() if self.copy else X
else: # self.affinity == "euclidean"
self.affinity_matrix_ = -euclidean_distances(X, squared=True)

if self.affinity_matrix_.shape[0] != self.affinity_matrix_.shape[1]:
raise ValueError(
"The matrix of similarities must be a square array. "
f"Got {self.affinity_matrix_.shape} instead."
)

if self.preference is None:
preference = np.median(self.affinity_matrix_)
else:
preference = self.preference
preference = np.array(preference, copy=False)

random_state = check_random_state(self.random_state)

(
self.cluster_centers_indices_,
self.labels_,
self.n_iter_,
) = affinity_propagation(
) = _affinity_propagation(
self.affinity_matrix_,
preference=self.preference,
max_iter=self.max_iter,
convergence_iter=self.convergence_iter,
preference=preference,
damping=self.damping,
copy=self.copy,
verbose=self.verbose,
return_n_iter=True,
random_state=self.random_state,
random_state=random_state,
)

if self.affinity != "precomputed":
Expand Down
2 changes: 1 addition & 1 deletion sklearn/cluster/tests/test_affinity_propagation.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def test_affinity_propagation_no_copy():
def test_affinity_propagation_affinity_shape():
"""Check the shape of the affinity matrix when using `affinity_propagation."""
S = -euclidean_distances(X, squared=True)
err_msg = "S must be a square array"
err_msg = "The matrix of similarities must be a square array"
with pytest.raises(ValueError, match=err_msg):
affinity_propagation(S[:, :-1])

Expand Down