Skip to content
20 changes: 15 additions & 5 deletions sklearn/metrics/cluster/_unsupervised.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import functools

import numpy as np
from scipy.sparse import issparse

from ...utils import check_random_state
from ...utils import check_X_y
Expand Down Expand Up @@ -131,15 +132,24 @@ def _silhouette_reduce(D_chunk, start, labels, label_freqs):
label_freqs : array-like
Distribution of cluster labels in ``labels``.
"""
n_chunk_samples = D_chunk.shape[0]
# accumulate distances from each sample to each cluster
clust_dists = np.zeros((len(D_chunk), len(label_freqs)),
clust_dists = np.zeros((n_chunk_samples, len(label_freqs)),
dtype=D_chunk.dtype)
for i in range(len(D_chunk)):
clust_dists[i] += np.bincount(labels, weights=D_chunk[i],
for i in range(n_chunk_samples):
if issparse(D_chunk):
current_chunk = D_chunk.getrow(i)
sample_weights = current_chunk.data
sample_labels = np.take(labels, current_chunk.indices)
Comment on lines +142 to +143
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks better.

I think we would need a test to make sure that sample_weights work as expected. A simple test would be to run silhouette_samples on a sparse dataset and compare it to the result with the same dataset but densified.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@thomasjpfan I have added a test for this

else:
sample_weights = D_chunk[i]
sample_labels = labels
clust_dists[i] += np.bincount(sample_labels, weights=sample_weights,
minlength=len(label_freqs))

# intra_index selects intra-cluster distances within clust_dists
intra_index = (np.arange(len(D_chunk)), labels[start:start + len(D_chunk)])
end = start + n_chunk_samples
intra_index = (np.arange(n_chunk_samples), labels[start:end])
# intra_clust_dists are averaged over cluster size outside this function
intra_clust_dists = clust_dists[intra_index]
# of the remaining distances we normalise and extract the minimum
Expand Down Expand Up @@ -216,7 +226,7 @@ def silhouette_samples(X, labels, *, metric='euclidean', **kwds):
# Check for non-zero diagonal entries in precomputed distance matrix
if metric == 'precomputed':
atol = np.finfo(X.dtype).eps * 100
if np.any(np.abs(np.diagonal(X)) > atol):
if np.any(np.abs(X.diagonal()) > atol):
raise ValueError(
'The precomputed distance matrix contains non-zero '
'elements on the diagonal. Use np.fill_diagonal(X, 0).'
Expand Down
24 changes: 24 additions & 0 deletions sklearn/metrics/cluster/tests/test_unsupervised.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import scipy.sparse as sp
import pytest
from scipy.sparse import csr_matrix
from scipy.sparse import csc_matrix

from sklearn import datasets
from sklearn.utils._testing import assert_array_equal
Expand Down Expand Up @@ -184,6 +185,29 @@ def test_silhouette_nonzero_diag(dtype):
silhouette_samples(dists, labels, metric='precomputed')


@pytest.mark.parametrize('to_sparse', (csr_matrix, csc_matrix))
def test_silhouette_sparse_input(to_sparse):
    """Smoke test: silhouette_samples accepts sparse precomputed distances."""
    points = np.array([[0, 0], [1, 0], [10, 10], [10, 11]], dtype=np.float32)
    cluster_ids = np.array([1, 1, 1, 0])
    # Build the dense distance matrix first, then convert it to the sparse
    # format under test; the call below only has to complete without raising.
    sparse_dists = to_sparse(pairwise_distances(points))
    silhouette_samples(sparse_dists, cluster_ids, metric="precomputed")


def test_silhouette_sparse_implementation():
    """Check that sparse and dense precomputed inputs give the same scores.

    The same precomputed distance matrix is passed to ``silhouette_samples``
    once as a dense array and once as a CSR matrix; the per-sample scores
    must agree.
    """
    # Repeated feature values are used so that distinct samples coincide,
    # which is meant to put zeros off the diagonal of the distance matrix.
    # Those are the entries a sparse matrix built from a dense array does
    # not store, so this exercises the sparse accumulation path properly.
    X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]],
                 dtype=np.float32).T
    y = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    pdist_dense = pairwise_distances(X)
    pdist_sparse = csr_matrix(pdist_dense)
    sparse_out = silhouette_samples(pdist_sparse, y, metric="precomputed")
    dense_out = silhouette_samples(pdist_dense, y, metric="precomputed")
    # Compare with a floating-point tolerance and a vectorised assertion
    # instead of exact element-by-element equality in a Python loop.
    np.testing.assert_allclose(sparse_out, dense_out)
Comment on lines +198 to +208
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To also cover a sparse pdist that has some off-diagonal zeros, I suggest:

from numpy.testing import assert_allclose

def test_silhouette_sparse_implementation():
    """Check that sparse and dense precomputed distances give equal scores."""
    # The repeated feature values (0.1 and 0.2) are intended to produce
    # zero distances between distinct samples, i.e. off-diagonal zeros in
    # the distance matrix.
    X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]],
                 dtype=np.float32).T
    y = [0, 0, 0, 0, 1, 1, 1, 1]
    pdist_dense = pairwise_distances(X)
    pdist_sparse = csr_matrix(pdist_dense)
    sparse_out = silhouette_samples(pdist_sparse, y, metric="precomputed")
    dense_out = silhouette_samples(pdist_dense, y, metric="precomputed")
    # Tolerance-based comparison of the two per-sample score arrays.
    assert_allclose(sparse_out, dense_out)



def assert_raises_on_only_one_label(func):
"""Assert message when there is only one label"""
rng = np.random.RandomState(seed=0)
Expand Down