
Commit 680c36b

logc authored and glemaitre committed

[MRG+1] Add Davies-Bouldin index (#10827)

1 parent 873c801 commit 680c36b

File tree

8 files changed: +201 -21 lines changed

doc/modules/classes.rst (+1)

@@ -893,6 +893,7 @@ details.
    metrics.adjusted_mutual_info_score
    metrics.adjusted_rand_score
    metrics.calinski_harabaz_score
+   metrics.davies_bouldin_score
    metrics.completeness_score
    metrics.cluster.contingency_matrix
    metrics.fowlkes_mallows_score

doc/modules/clustering.rst (+80)

@@ -1591,6 +1591,86 @@ Drawbacks
       analysis". Communications in Statistics-theory and Methods 3: 1-27.
       `doi:10.1080/03610926.2011.560741 <https://doi.org/10.1080/03610926.2011.560741>`_.
 
+
+.. _davies-bouldin_index:
+
+Davies-Bouldin Index
+--------------------
+
+If the ground truth labels are not known, the Davies-Bouldin index
+(:func:`sklearn.metrics.davies_bouldin_score`) can be used to evaluate the
+model, where a lower Davies-Bouldin index relates to a model with better
+separation between the clusters.
+
+The index is defined as the average similarity between each cluster
+:math:`C_i` for :math:`i=1, ..., k` and its most similar one :math:`C_j`. In
+the context of this index, similarity is defined as a measure :math:`R_{ij}`
+that trades off:
+
+- :math:`s_i`, the average distance between each point of cluster :math:`i`
+  and the centroid of that cluster -- also known as the cluster diameter.
+- :math:`d_{ij}`, the distance between cluster centroids :math:`i` and
+  :math:`j`.
+
+A simple choice to construct :math:`R_{ij}` so that it is nonnegative and
+symmetric is:
+
+.. math::
+   R_{ij} = \frac{s_i + s_j}{d_{ij}}
+
+Then the Davies-Bouldin index is defined as:
+
+.. math::
+   DB = \frac{1}{k} \sum_{i=1}^{k} \max_{i \neq j} R_{ij}
+
+Zero is the lowest possible score. Values closer to zero indicate a better
+partition.
+
+In normal usage, the Davies-Bouldin index is applied to the results of a
+cluster analysis as follows:
+
+  >>> from sklearn import datasets
+  >>> iris = datasets.load_iris()
+  >>> X = iris.data
+  >>> from sklearn.cluster import KMeans
+  >>> from sklearn.metrics import davies_bouldin_score
+  >>> kmeans = KMeans(n_clusters=3, random_state=1).fit(X)
+  >>> labels = kmeans.labels_
+  >>> davies_bouldin_score(X, labels)  # doctest: +ELLIPSIS
+  0.6623...
+
+
+Advantages
+~~~~~~~~~~
+
+- The computation of Davies-Bouldin is simpler than that of Silhouette
+  scores.
+- The index is computed using only quantities and features inherent to the
+  dataset.
+
+Drawbacks
+~~~~~~~~~
+
+- The Davies-Bouldin index is generally higher for convex clusters than for
+  other concepts of clusters, such as density-based clusters like those
+  obtained from DBSCAN.
+- The usage of centroid distance limits the distance metric to Euclidean
+  space.
+- A good value reported by this method does not imply the best information
+  retrieval.
+
+.. topic:: References
+
+ * Davies, David L.; Bouldin, Donald W. (1979).
+   "A Cluster Separation Measure".
+   IEEE Transactions on Pattern Analysis and Machine Intelligence.
+   PAMI-1 (2): 224-227.
+   `doi:10.1109/TPAMI.1979.4766909 <https://doi.org/10.1109/TPAMI.1979.4766909>`_.
+
+ * Halkidi, Maria; Batistakis, Yannis; Vazirgiannis, Michalis (2001).
+   "On Clustering Validation Techniques".
+   Journal of Intelligent Information Systems, 17(2-3), 107-145.
+   `doi:10.1023/A:1012801612483 <https://doi.org/10.1023/A:1012801612483>`_.
+
+ * `Wikipedia entry for Davies-Bouldin index
+   <https://en.wikipedia.org/wiki/Davies–Bouldin_index>`_.
+
 .. _contingency_matrix:
 
 Contingency Matrix
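
As a reading aid for the two formulas added to clustering.rst above, here is a
minimal NumPy sketch that computes DB directly from the definition. It is not
the implementation added by this commit (that lives in
sklearn/metrics/cluster/unsupervised.py below); the helper name `db_index` is
illustrative, and Euclidean distances, dense array inputs, and at least two
clusters are assumed.

    import numpy as np

    def db_index(X, labels):
        # Computes DB = (1/k) * sum_i max_{j != i} R_ij, where
        # s_i is the mean distance of cluster i's points to its centroid
        # and d_ij is the distance between centroids i and j.
        X, labels = np.asarray(X, dtype=float), np.asarray(labels)
        ks = np.unique(labels)
        centroids = np.array([X[labels == k].mean(axis=0) for k in ks])
        s = np.array([np.linalg.norm(X[labels == k] - c, axis=1).mean()
                      for k, c in zip(ks, centroids)])
        d = np.linalg.norm(centroids[:, None] - centroids[None, :], axis=2)
        np.fill_diagonal(d, np.inf)        # exclude i == j from the max below
        R = (s[:, None] + s[None, :]) / d  # R_ij = (s_i + s_j) / d_ij
        return R.max(axis=1).mean()

On the iris example in the documentation above, this sketch should agree with
davies_bouldin_score up to floating-point rounding.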

doc/whats_new/v0.20.rst (+3)

@@ -99,6 +99,9 @@ Preprocessing
 
 Model evaluation
 
+- Added the :func:`metrics.davies_bouldin_score` metric for unsupervised
+  evaluation of clustering models. :issue:`10827` by :user:`Luis Osa <logc>`.
+
 - Added the :func:`metrics.balanced_accuracy_score` metric and a corresponding
   ``'balanced_accuracy'`` scorer for binary classification.
   :issue:`8066` by :user:`xyguo` and :user:`Aman Dalmia <dalmia>`.

sklearn/metrics/__init__.py (+2)

@@ -45,6 +45,7 @@
 from .cluster import silhouette_score
 from .cluster import calinski_harabaz_score
 from .cluster import v_measure_score
+from .cluster import davies_bouldin_score
 
 from .pairwise import euclidean_distances
 from .pairwise import pairwise_distances
@@ -80,6 +81,7 @@
     'confusion_matrix',
     'consensus_score',
     'coverage_error',
+    'davies_bouldin_score',
     'euclidean_distances',
     'explained_variance_score',
     'f1_score',

sklearn/metrics/cluster/__init__.py (+3 -1)

@@ -20,11 +20,13 @@
 from .unsupervised import silhouette_samples
 from .unsupervised import silhouette_score
 from .unsupervised import calinski_harabaz_score
+from .unsupervised import davies_bouldin_score
 from .bicluster import consensus_score
 
 __all__ = ["adjusted_mutual_info_score", "normalized_mutual_info_score",
            "adjusted_rand_score", "completeness_score", "contingency_matrix",
            "expected_mutual_information", "homogeneity_completeness_v_measure",
            "homogeneity_score", "mutual_info_score", "v_measure_score",
            "fowlkes_mallows_score", "entropy", "silhouette_samples",
-           "silhouette_score", "calinski_harabaz_score", "consensus_score"]
+           "silhouette_score", "calinski_harabaz_score",
+           "davies_bouldin_score", "consensus_score"]

sklearn/metrics/cluster/tests/test_common.py (+3 -1)

@@ -13,6 +13,7 @@
 from sklearn.metrics.cluster import v_measure_score
 from sklearn.metrics.cluster import silhouette_score
 from sklearn.metrics.cluster import calinski_harabaz_score
+from sklearn.metrics.cluster import davies_bouldin_score
 
 from sklearn.utils.testing import assert_allclose
 
@@ -43,7 +44,8 @@
 UNSUPERVISED_METRICS = {
     "silhouette_score": silhouette_score,
     "silhouette_manhattan": partial(silhouette_score, metric='manhattan'),
-    "calinski_harabaz_score": calinski_harabaz_score
+    "calinski_harabaz_score": calinski_harabaz_score,
+    "davies_bouldin_score": davies_bouldin_score
 }
 
 # Lists of metrics with common properties

sklearn/metrics/cluster/tests/test_unsupervised.py (+54 -19)

Note: `pytest.approx(a, b)` on its own asserts nothing (it merely builds an
approx object), so the replacements below use the idiomatic form
`assert a == pytest.approx(b)`.

@@ -1,10 +1,10 @@
 import numpy as np
 import scipy.sparse as sp
+import pytest
 from scipy.sparse import csr_matrix
 
 from sklearn import datasets
 from sklearn.utils.testing import assert_false
-from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_raises_regexp
@@ -14,6 +14,7 @@
 from sklearn.metrics.cluster import silhouette_samples
 from sklearn.metrics import pairwise_distances
 from sklearn.metrics.cluster import calinski_harabaz_score
+from sklearn.metrics.cluster import davies_bouldin_score
 
 
 def test_silhouette():
@@ -33,13 +34,13 @@ def test_silhouette():
     assert_greater(score_precomputed, 0)
     # Test without calculating D
     score_euclidean = silhouette_score(X, y, metric='euclidean')
-    assert_almost_equal(score_precomputed, score_euclidean)
+    assert score_precomputed == pytest.approx(score_euclidean)
 
     if X is X_dense:
         score_dense_without_sampling = score_precomputed
     else:
-        assert_almost_equal(score_euclidean,
-                            score_dense_without_sampling)
+        assert score_euclidean == pytest.approx(
+            score_dense_without_sampling)
 
     # Test with sampling
     score_precomputed = silhouette_score(D, y, metric='precomputed',
@@ -50,12 +51,12 @@ def test_silhouette():
                                        random_state=0)
     assert_greater(score_precomputed, 0)
     assert_greater(score_euclidean, 0)
-    assert_almost_equal(score_euclidean, score_precomputed)
+    assert score_euclidean == pytest.approx(score_precomputed)
 
     if X is X_dense:
         score_dense_with_sampling = score_precomputed
     else:
-        assert_almost_equal(score_euclidean, score_dense_with_sampling)
+        assert score_euclidean == pytest.approx(score_dense_with_sampling)
 
 
 def test_cluster_size_1():
@@ -120,12 +121,14 @@ def test_silhouette_paper_example():
                                          (labels2, expected2, score2)]:
         expected = [expected[name] for name in names]
         # we check to 2dp because that's what's in the paper
-        assert_almost_equal(expected, silhouette_samples(D, np.array(labels),
-                                                         metric='precomputed'),
-                            decimal=2)
-        assert_almost_equal(score, silhouette_score(D, np.array(labels),
-                                                    metric='precomputed'),
-                            decimal=2)
+        assert expected == pytest.approx(
+            silhouette_samples(D, np.array(labels), metric='precomputed'),
+            abs=1e-2)
+        assert score == pytest.approx(
+            silhouette_score(D, np.array(labels), metric='precomputed'),
+            abs=1e-2)
 
 
 def test_correct_labelsize():
@@ -166,19 +169,27 @@ def test_non_numpy_labels():
         silhouette_score(list(X), list(y)), silhouette_score(X, y))
 
 
-def test_calinski_harabaz_score():
+def assert_raises_on_only_one_label(func):
+    """Assert message when there is only one label"""
     rng = np.random.RandomState(seed=0)
-
-    # Assert message when there is only one label
     assert_raise_message(ValueError, "Number of labels is",
-                         calinski_harabaz_score,
+                         func,
                          rng.rand(10, 2), np.zeros(10))
 
-    # Assert message when all point are in different clusters
+
+def assert_raises_on_all_points_same_cluster(func):
+    """Assert message when all points are in different clusters"""
+    rng = np.random.RandomState(seed=0)
     assert_raise_message(ValueError, "Number of labels is",
-                         calinski_harabaz_score,
+                         func,
                          rng.rand(10, 2), np.arange(10))
 
+
+def test_calinski_harabaz_score():
+    assert_raises_on_only_one_label(calinski_harabaz_score)
+
+    assert_raises_on_all_points_same_cluster(calinski_harabaz_score)
+
     # Assert the value is 1. when all samples are equals
     assert_equal(1., calinski_harabaz_score(np.ones((10, 2)),
                                             [0] * 5 + [1] * 5))
@@ -191,5 +202,29 @@ def test_calinski_harabaz_score():
     X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
          [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
     labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
-    assert_almost_equal(calinski_harabaz_score(X, labels),
+    assert calinski_harabaz_score(X, labels) == pytest.approx(
                         45 * (40 - 4) / (5 * (4 - 1)))
+
+
+def test_davies_bouldin_score():
+    assert_raises_on_only_one_label(davies_bouldin_score)
+    assert_raises_on_all_points_same_cluster(davies_bouldin_score)
+
+    # Assert the value is 0. when all samples are equal
+    assert davies_bouldin_score(np.ones((10, 2)),
+                                [0] * 5 + [1] * 5) == pytest.approx(0.0)
+
+    # Assert the value is 0. when all cluster means are equal
+    assert davies_bouldin_score([[-1, -1], [1, 1]] * 10,
+                                [0] * 10 + [1] * 10) == pytest.approx(0.0)
+
+    # General case (with non numpy arrays)
+    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
+         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
+    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
+    assert davies_bouldin_score(X, labels) == pytest.approx(
+        2 * np.sqrt(0.5) / 3)
+
+    # General case - some clusters have a single sample
+    X = ([[0, 0], [2, 2], [3, 3], [5, 5]])
+    labels = [0, 0, 1, 2]
+    assert davies_bouldin_score(X, labels) == pytest.approx((5. / 4) / 3)
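
A side note on where the expected value `2 * np.sqrt(0.5) / 3` in the general
case above comes from; this is a hand derivation, not part of the commit. Each
of the four clusters consists of points offset by (±0.5, ±0.5) from its
centroid, so s_i = sqrt(0.5) for every cluster, and the nearest neighbouring
centroid is always 3 units away:

    import numpy as np

    s = np.sqrt(0.5)           # s_i: every point is sqrt(0.5) from its centroid
    R_max = 2 * s / 3          # max_j R_ij = (s_i + s_j) / d_ij with d_ij = 3
    db = np.mean([R_max] * 4)  # average the per-cluster maxima over 4 clusters
    print(db)                  # 0.4714..., i.e. 2 * np.sqrt(0.5) / 3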

sklearn/metrics/cluster/unsupervised.py (+55)

@@ -9,6 +9,7 @@
 
 from ...utils import check_random_state
 from ...utils import check_X_y
+from ...utils import safe_indexing
 from ..pairwise import pairwise_distances
 from ...preprocessing import LabelEncoder
 
@@ -258,3 +259,57 @@ def calinski_harabaz_score(X, labels):
     return (1. if intra_disp == 0. else
             extra_disp * (n_samples - n_labels) /
             (intra_disp * (n_labels - 1.)))
+
+
+def davies_bouldin_score(X, labels):
+    """Compute the Davies-Bouldin score.
+
+    The score is defined as the ratio of within-cluster distances to
+    between-cluster distances.
+
+    Read more in the :ref:`User Guide <davies-bouldin_index>`.
+
+    Parameters
+    ----------
+    X : array-like, shape (``n_samples``, ``n_features``)
+        List of ``n_features``-dimensional data points. Each row corresponds
+        to a single data point.
+
+    labels : array-like, shape (``n_samples``,)
+        Predicted labels for each sample.
+
+    Returns
+    -------
+    score : float
+        The resulting Davies-Bouldin score.
+
+    References
+    ----------
+    .. [1] Davies, David L.; Bouldin, Donald W. (1979).
+       "A Cluster Separation Measure". IEEE Transactions on
+       Pattern Analysis and Machine Intelligence. PAMI-1 (2): 224-227.
+    """
+    X, labels = check_X_y(X, labels)
+    le = LabelEncoder()
+    labels = le.fit_transform(labels)
+    n_samples, _ = X.shape
+    n_labels = len(le.classes_)
+    check_number_of_labels(n_labels, n_samples)
+
+    intra_dists = np.zeros(n_labels)
+    centroids = np.zeros((n_labels, len(X[0])), dtype=np.float)
+    for k in range(n_labels):
+        cluster_k = safe_indexing(X, labels == k)
+        centroid = cluster_k.mean(axis=0)
+        centroids[k] = centroid
+        # s_k: average distance of the cluster's points to its centroid
+        intra_dists[k] = np.average(pairwise_distances(
+            cluster_k, [centroid]))
+
+    centroid_distances = pairwise_distances(centroids)
+
+    # Degenerate cases: identical samples or coincident centroids
+    if np.allclose(intra_dists, 0) or np.allclose(centroid_distances, 0):
+        return 0.0
+
+    # R_ij = (s_i + s_j) / d_ij; the diagonal (d_ii == 0) yields inf or nan,
+    # which is masked out before taking the per-cluster maximum
+    score = (intra_dists[:, None] + intra_dists) / centroid_distances
+    score[score == np.inf] = np.nan
+    return np.mean(np.nanmax(score, axis=1))
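
To illustrate the zero-score guard in the function above
(`np.allclose(intra_dists, 0) or np.allclose(centroid_distances, 0)`), here is
a short sketch mirroring the two degenerate cases exercised by the new tests;
it assumes a scikit-learn version that includes this commit:

    import numpy as np
    from sklearn.metrics import davies_bouldin_score

    # All samples identical: every intra-cluster distance is zero.
    print(davies_bouldin_score(np.ones((10, 2)), [0] * 5 + [1] * 5))  # 0.0

    # Two clusters whose points alternate between (-1, -1) and (1, 1):
    # both centroids sit at the origin, so centroid distances are zero.
    X = [[-1, -1], [1, 1]] * 10
    print(davies_bouldin_score(X, [0] * 10 + [1] * 10))  # 0.0

In both cases the function short-circuits to 0.0 instead of dividing by zero.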
