diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 439b348ce2610..b2e648cf948f7 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -264,6 +264,11 @@ Changelog - |Fix| :func:`metric.manhattan_distances` now supports readonly sparse datasets. :pr:`25432` by :user:`Julien Jerphanion `. +- |Enhancement| :func:`metrics.silhouette_samples` now accepts a sparse + matrix of pairwise distances between samples, or a feature array. + :pr:`18723` by :user:`Sahil Gupta ` and + :pr:`24677` by :user:`Ashwin Mathur `. + - |Fix| :func:`log_loss` raises a warning if the values of the parameter `y_pred` are not normalized, instead of actually normalizing them in the metric. Starting from 1.5 this will raise an error. :pr:`25299` by :user:`Omar Salman `_ """ - X, labels = check_X_y(X, labels, accept_sparse=["csc", "csr"]) + X, labels = check_X_y(X, labels, accept_sparse=["csr"]) # Check for non-zero diagonal entries in precomputed distance matrix if metric == "precomputed": @@ -219,10 +244,10 @@ def silhouette_samples(X, labels, *, metric="euclidean", **kwds): ) if X.dtype.kind == "f": atol = np.finfo(X.dtype).eps * 100 - if np.any(np.abs(np.diagonal(X)) > atol): - raise ValueError(error_msg) - elif np.any(np.diagonal(X) != 0): # integral dtype - raise ValueError(error_msg) + if np.any(np.abs(X.diagonal()) > atol): + raise error_msg + elif np.any(X.diagonal() != 0): # integral dtype + raise error_msg le = LabelEncoder() labels = le.fit_transform(labels) diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py index 22dd1a1bf1557..8be2fe5cdae99 100644 --- a/sklearn/metrics/cluster/tests/test_unsupervised.py +++ b/sklearn/metrics/cluster/tests/test_unsupervised.py @@ -1,14 +1,17 @@ import warnings import numpy as np -import scipy.sparse as sp import pytest -from scipy.sparse import csr_matrix + +from numpy.testing import assert_allclose +from scipy.sparse import csr_matrix, csc_matrix, 
dok_matrix, lil_matrix +from scipy.sparse import issparse from sklearn import datasets from sklearn.utils._testing import assert_array_equal from sklearn.metrics.cluster import silhouette_score from sklearn.metrics.cluster import silhouette_samples +from sklearn.metrics.cluster._unsupervised import _silhouette_reduce from sklearn.metrics import pairwise_distances from sklearn.metrics.cluster import calinski_harabasz_score from sklearn.metrics.cluster import davies_bouldin_score @@ -19,11 +22,12 @@ def test_silhouette(): dataset = datasets.load_iris() X_dense = dataset.data X_csr = csr_matrix(X_dense) - X_dok = sp.dok_matrix(X_dense) - X_lil = sp.lil_matrix(X_dense) + X_csc = csc_matrix(X_dense) + X_dok = dok_matrix(X_dense) + X_lil = lil_matrix(X_dense) y = dataset.target - for X in [X_dense, X_csr, X_dok, X_lil]: + for X in [X_dense, X_csr, X_csc, X_dok, X_lil]: D = pairwise_distances(X, metric="euclidean") # Given that the actual labels are used, we can assume that S would be # positive. 
@@ -282,6 +286,47 @@ def test_silhouette_nonzero_diag(dtype): silhouette_samples(dists, labels, metric="precomputed") +@pytest.mark.parametrize("to_sparse", (csr_matrix, csc_matrix, dok_matrix, lil_matrix)) +def test_silhouette_samples_precomputed_sparse(to_sparse): + """Check that silhouette_samples works for sparse matrices correctly.""" + X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T + y = [0, 0, 0, 0, 1, 1, 1, 1] + pdist_dense = pairwise_distances(X) + pdist_sparse = to_sparse(pdist_dense) + assert issparse(pdist_sparse) + output_with_sparse_input = silhouette_samples(pdist_sparse, y, metric="precomputed") + output_with_dense_input = silhouette_samples(pdist_dense, y, metric="precomputed") + assert_allclose(output_with_sparse_input, output_with_dense_input) + + +@pytest.mark.parametrize("to_sparse", (csr_matrix, csc_matrix, dok_matrix, lil_matrix)) +def test_silhouette_samples_euclidean_sparse(to_sparse): + """Check that silhouette_samples works for sparse matrices correctly.""" + X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T + y = [0, 0, 0, 0, 1, 1, 1, 1] + pdist_dense = pairwise_distances(X) + pdist_sparse = to_sparse(pdist_dense) + assert issparse(pdist_sparse) + output_with_sparse_input = silhouette_samples(pdist_sparse, y) + output_with_dense_input = silhouette_samples(pdist_dense, y) + assert_allclose(output_with_sparse_input, output_with_dense_input) + + +@pytest.mark.parametrize("to_non_csr_sparse", (csc_matrix, dok_matrix, lil_matrix)) +def test_silhouette_reduce(to_non_csr_sparse): + """Check for non-CSR input to private method `_silhouette_reduce`.""" + X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T + pdist_dense = pairwise_distances(X) + pdist_sparse = to_non_csr_sparse(pdist_dense) + y = [0, 0, 0, 0, 1, 1, 1, 1] + label_freqs = np.bincount(y) + with pytest.raises( + TypeError, + match="Expected CSR matrix. 
Please pass sparse matrix in CSR format.", + ): + _silhouette_reduce(pdist_sparse, start=0, labels=y, label_freqs=label_freqs) + + def assert_raises_on_only_one_label(func): """Assert message when there is only one label""" rng = np.random.RandomState(seed=0)