
[STALLED] Davies bouldin index #7942


Closed
wants to merge 12 commits into from
74 changes: 74 additions & 0 deletions doc/modules/clustering.rst
@@ -1569,3 +1569,77 @@ Drawbacks
* Caliński, T., & Harabasz, J. (1974). "A dendrite method for cluster
analysis". Communications in Statistics-theory and Methods 3: 1-27.
`doi:10.1080/03610926.2011.560741 <http://dx.doi.org/10.1080/03610926.2011.560741>`_.

.. _davies–bouldin_index:

Davies–Bouldin Index
----------------------

If the ground truth labels are not known, the Davies–Bouldin index
(:func:`sklearn.metrics.davies_bouldin_index`) can be used to evaluate the
Member: You need an entry in classes.rst for this to work. You should have an entry there anyway!

model, where a lower Davies–Bouldin index relates to a model with better
separation between clusters.

For :math:`k` clusters, the Davies–Bouldin index :math:`DB` is given as the
mean, over all clusters, of the largest ratio of within-cluster distances to
between-centroid distances.
Member: Something here is not right.


.. math::
DB(k) = \frac{1}{k} \sum_{i=1}^k \max_{j \neq i} D_{ij}

where :math:`D_{ij}` is the ratio of the sum of within-cluster distances of
clusters :math:`i` and :math:`j` to the distance between the centroids of
clusters :math:`i` and :math:`j`:

.. math::
D_{ij} = \frac{\bar{d_i} + \bar{d_j}}{d_{ij}}
Member: still need d_{ij}


:math:`\bar{d_i}` is the average distance between each point in cluster
:math:`i` and the centroid of cluster :math:`i`.
Member: I think, "known as its diameter." Then you may add a comment that dbar_j is similarly the diameter for cluster j, or you can just leave it out, because that's clear from the notation.

This quantity is also known as the diameter of cluster :math:`i`;
:math:`\bar{d_j}` is likewise the diameter of cluster :math:`j`.

:math:`d_{ij}` is the Euclidean distance between the centroid of cluster
:math:`i` and the centroid of cluster :math:`j`.
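
Purely to make the formula concrete (this is not part of the proposed API, and the helper name below is hypothetical), a minimal NumPy sketch assuming Euclidean distances could look like:

import numpy as np

def davies_bouldin_sketch(X, labels):
    # hypothetical helper, shown only to illustrate the formula above
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)
    clusters = np.unique(labels)
    k = len(clusters)
    # centroid and mean within-cluster distance ("diameter") of each cluster
    centroids = np.array([X[labels == c].mean(axis=0) for c in clusters])
    diameters = np.array([
        np.mean(np.linalg.norm(X[labels == c] - centroids[i], axis=1))
        for i, c in enumerate(clusters)])
    total = 0.0
    for i in range(k):
        # D_ij = (d_i_bar + d_j_bar) / d_ij, maximised over all other clusters j
        ratios = [(diameters[i] + diameters[j])
                  / np.linalg.norm(centroids[i] - centroids[j])
                  for j in range(k) if j != i]
        total += max(ratios)
    return total / k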


>>> from sklearn import metrics
>>> from sklearn.metrics import pairwise_distances
>>> from sklearn import datasets
>>> dataset = datasets.load_iris()
Member: you could use load_Xy

Author: This is consistent with the rest of the documentation of this module

>>> X = dataset.data

In normal usage, the Davies-Bouldin index is applied to the results of a
cluster analysis.

>>> import numpy as np
>>> from sklearn.cluster import KMeans
>>> kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)
>>> labels = kmeans_model.labels_
>>> metrics.davies_bouldin_index(X, labels) # doctest: +ELLIPSIS
0.6623...


Advantages
~~~~~~~~~~

- The computation of the Davies-Bouldin index is simpler than that of the
Silhouette index.
Contributor: Can you add a word that, contrary to Calinski-Harabasz, DBI is bounded (0-1)?


Drawbacks
~~~~~~~~~

- The Davies-Bouldin index is generally higher for convex clusters than for
other concepts of clusters, such as density-based clusters like those
obtained from DBSCAN.
Member: Also: use of centroid distance limits it to Euclidean space.

Author: done

- The usage of centroid distance limits the distance metric to Euclidean space.

.. topic:: References

* Davies, David L.; Bouldin, Donald W. (1979).
"A Cluster Separation Measure"
IEEE Transactions on Pattern Analysis and Machine Intelligence.
PAMI-1 (2): 224–227.
`doi:10.1109/TPAMI.1979.4766909 <http://dx.doi.org/10.1109/TPAMI.1979.4766909>`_.
2 changes: 2 additions & 0 deletions sklearn/metrics/__init__.py
@@ -43,6 +43,7 @@
from .cluster import silhouette_samples
from .cluster import silhouette_score
from .cluster import calinski_harabaz_score
from .cluster import davies_bouldin_index
from .cluster import v_measure_score

from .pairwise import euclidean_distances
@@ -73,6 +74,7 @@
'confusion_matrix',
'consensus_score',
'coverage_error',
'davies_bouldin_index',
'euclidean_distances',
'explained_variance_score',
'f1_score',
4 changes: 3 additions & 1 deletion sklearn/metrics/cluster/__init__.py
@@ -20,11 +20,13 @@
from .unsupervised import silhouette_samples
from .unsupervised import silhouette_score
from .unsupervised import calinski_harabaz_score
from .unsupervised import davies_bouldin_index
from .bicluster import consensus_score

__all__ = ["adjusted_mutual_info_score", "normalized_mutual_info_score",
"adjusted_rand_score", "completeness_score", "contingency_matrix",
"expected_mutual_information", "homogeneity_completeness_v_measure",
"homogeneity_score", "mutual_info_score", "v_measure_score",
"fowlkes_mallows_score", "entropy", "silhouette_samples",
"silhouette_score", "calinski_harabaz_score", "consensus_score"]
"silhouette_score", "calinski_harabaz_score",
"davies_bouldin_index", "consensus_score"]
36 changes: 36 additions & 0 deletions sklearn/metrics/cluster/tests/test_unsupervised.py
@@ -14,6 +14,7 @@
from sklearn.metrics.cluster import silhouette_samples
from sklearn.metrics import pairwise_distances
from sklearn.metrics.cluster import calinski_harabaz_score
from sklearn.metrics.cluster import davies_bouldin_index


def test_silhouette():
@@ -146,3 +147,38 @@ def test_calinski_harabaz_score():
labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
assert_almost_equal(calinski_harabaz_score(X, labels),
45 * (40 - 4) / (5 * (4 - 1)))


def test_davies_bouldin_index():
rng = np.random.RandomState(seed=0)

# Assert message when there is only one label
assert_raise_message(ValueError, "Number of labels is",
davies_bouldin_index,
rng.rand(10, 2), np.zeros(10))

# Assert message when all points are in different clusters
assert_raise_message(ValueError, "Number of labels is",
davies_bouldin_index,
rng.rand(10, 2), np.arange(10))

# Assert the value is 0. when all samples are equal
assert_equal(0., davies_bouldin_index(np.ones((10, 2)),
[0] * 5 + [1] * 5))

# Assert the value is 0. when all cluster means are equal
assert_equal(0., davies_bouldin_index([[-1, -1], [1, 1]] * 10,
[0] * 10 + [1] * 10))

# General case (with non-numpy arrays)
Member: i've not checked the paper, but if the paper has examples you can copy in here, it's best to make the test suite as complete as is reasonable with respect to those examples.

Author: I didn't find a self-contained numeric example in the article

X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
[[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
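# Expected value: every cluster has diameter sqrt(0.5) and the closest
# centroids are 3 apart, so each per-cluster maximum of D_ij is
# 2 * sqrt(0.5) / 3, and so is their mean.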
assert_almost_equal(davies_bouldin_index(X, labels),
2 * np.sqrt(0.5) / 3)
Member: Please make sure you test the case where a cluster has a single sample.

Author: done


# General case - clusters with a single sample
X = ([[0, 0], [2, 2], [3, 3], [5, 5]])
labels = [0, 0, 1, 2]
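# Expected value: cluster 0 has centroid (1, 1) and diameter sqrt(2);
# clusters 1 and 2 are singletons with zero diameter, so the per-cluster
# maxima of D_ij are 1/2, 1/2 and 1/4, giving (5. / 4) / 3 on average.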
assert_almost_equal(davies_bouldin_index(X, labels),
(5. / 4) / 3)
52 changes: 52 additions & 0 deletions sklearn/metrics/cluster/unsupervised.py
@@ -255,3 +255,55 @@ def calinski_harabaz_score(X, labels):
return (1. if intra_disp == 0. else
extra_disp * (n_samples - n_labels) /
(intra_disp * (n_labels - 1.)))


def davies_bouldin_index(X, labels):
"""Compute the Davies Bouldin index.

The index is defined as the ratio of within-cluster
to between-cluster distances; a lower value indicates better separation.

Member: We usually have a link to the user guide here

Parameters
----------
X : array-like, shape (``n_samples``, ``n_features``)
List of ``n_features``-dimensional data points. Each row corresponds
to a single data point.

labels : array-like, shape (``n_samples``,)
Predicted labels for each sample.

Returns
-------
score : float
The resulting Davies-Bouldin index.

References
----------
.. [1] `Davies, David L.; Bouldin, Donald W. (1979).
"A Cluster Separation Measure". IEEE Transactions on
Pattern Analysis and Machine Intelligence. PAMI-1 (2): 224-227`_
"""

X, labels = check_X_y(X, labels)
le = LabelEncoder()
labels = le.fit_transform(labels)
n_samples, _ = X.shape
n_labels = len(le.classes_)

check_number_of_labels(n_labels, n_samples)
intra_dists = np.zeros(n_labels)
centroids = np.zeros((n_labels, len(X[0])), np.float32)
Contributor: I don't understand why you use the np.float32. Maybe you can adapt to the type of X as it is done in l174?

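# compute each cluster's centroid and the mean distance of its points
# to that centroid (the cluster "diameter" used in the formula)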
for k in range(n_labels):
cluster_k = X[labels == k]
mean_k = np.mean(cluster_k, axis=0)
Member: use cluster_k.mean(axis=0) and use safe_indexing on the line before, and then I think your code will automatically work for sparse matrices...? If there is no reason not to support a sparse matrix input, please add a test and update the docstring.

centroids[k] = mean_k
intra_dists[k] = np.average(pairwise_distances(cluster_k, [mean_k]))
centroid_distances = pairwise_distances(centroids)
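# degenerate cases: if every diameter is zero or every pair of centroids
# coincides, the index is defined as 0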
with np.errstate(divide='ignore', invalid='ignore'):
if np.all((intra_dists[:, None] + intra_dists) == 0.0) or \
Member: why don't you just test if not np.any(intra_dists) (or if np.allclose(intra_dists, 0) if numerical stability is a concern)?

np.all(centroid_distances == 0.0):
return 0.0
scores = (intra_dists[:, None] + intra_dists) / centroid_distances
Member: space around /, please

# mask inf values (division by a zero centroid distance, e.g. a cluster
# compared with itself) so that nanmax below ignores them
scores[scores == np.inf] = np.nan
return np.mean(np.nanmax(scores, axis=1))