From 80844ae48c6e059fc439ea6531cb565ff6a5754c Mon Sep 17 00:00:00 2001
From: Arya McCarthy <arya.mccarthy@gmail.com>
Date: Wed, 23 May 2018 18:12:03 -0400
Subject: [PATCH 01/14] Add averaging option to AMI and NMI

Leave current behavior unchanged
---
 sklearn/metrics/cluster/supervised.py | 52 +++++++++++++++++++++++----
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py
index 19bc461c9e9fd..b8759a1e5695e 100644
--- a/sklearn/metrics/cluster/supervised.py
+++ b/sklearn/metrics/cluster/supervised.py
@@ -16,6 +16,7 @@
 from __future__ import division
 
 from math import log
+import warnings
 
 import numpy as np
 from scipy import sparse as sp
@@ -527,6 +528,22 @@ def v_measure_score(labels_true, labels_pred):
     return homogeneity_completeness_v_measure(labels_true, labels_pred)[2]
 
 
+
+def _generalized_average(U, V, average_method):
+    if average_method == "min":
+        return min(U, V)
+    elif average_method == "sqrt":
+        return max(np.sqrt(U * V), 1e-10)  # Avoids zero-division error
+    elif average_method == "sum":
+        return max(np.mean(U, V), 1e-10)
+    elif average_method == "max":
+        return max(U, V)
+    else:
+        raise ValueError("'average_method' must be 'min', 'sqrt', 'sum', or "
+        "'max'")
+
+
+
 def mutual_info_score(labels_true, labels_pred, contingency=None):
     r"""Mutual Information between two clusterings.
 
@@ -608,7 +625,7 @@ def mutual_info_score(labels_true, labels_pred, contingency=None):
     return mi.sum()
 
 
-def adjusted_mutual_info_score(labels_true, labels_pred):
+def adjusted_mutual_info_score(labels_true, labels_pred, average_method=None):
     """Adjusted Mutual Information between two clusterings.
 
     Adjusted Mutual Information (AMI) is an adjustment of the Mutual
@@ -617,7 +634,7 @@ def adjusted_mutual_info_score(labels_true, labels_pred):
     clusters, regardless of whether there is actually more information shared.
     For two clusterings :math:`U` and :math:`V`, the AMI is given as::
 
-        AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [max(H(U), H(V)) - E(MI(U, V))]
+        AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))]
 
     This metric is independent of the absolute values of the labels:
     a permutation of the class or cluster label values won't change the
@@ -641,6 +658,12 @@ def adjusted_mutual_info_score(labels_true, labels_pred):
     labels_pred : array, shape = [n_samples]
         A clustering of the data into disjoint subsets.
 
+    average_method : string or None, optional (default: None)
+        How to compute the normalizer in the denominator. Possible options
+        are 'min', 'sqrt', 'sum', and 'max'.
+        If None, 'max' will be used. This is likely to change in a future
+        version. 
+
     Returns
     -------
     ami: float(upperlimited by 1.0)
@@ -682,6 +705,12 @@ def adjusted_mutual_info_score(labels_true, labels_pred):
        <https://en.wikipedia.org/wiki/Adjusted_Mutual_Information>`_
 
     """
+    if average_method is None:
+        warnings.warn("The behavior of AMI will change in a future version. "
+           "To match the behavior of 'v_measure_score', AMI will use "
+           "sqrt-averaging, i.e. geometric mean, by default."
+           )
+        average_method = 'max'
     labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
     n_samples = labels_true.shape[0]
     classes = np.unique(labels_true)
@@ -700,17 +729,19 @@ def adjusted_mutual_info_score(labels_true, labels_pred):
     emi = expected_mutual_information(contingency, n_samples)
     # Calculate entropy for each labeling
     h_true, h_pred = entropy(labels_true), entropy(labels_pred)
-    ami = (mi - emi) / (max(h_true, h_pred) - emi)
+    normalizer = _generalized_average(h_true, h_pred, average_method)
+    ami = (mi - emi) / (normalizer - emi)
     return ami
 
 
-def normalized_mutual_info_score(labels_true, labels_pred):
+def normalized_mutual_info_score(labels_true, labels_pred, average_method=None):
     """Normalized Mutual Information between two clusterings.
 
     Normalized Mutual Information (NMI) is an normalization of the Mutual
     Information (MI) score to scale the results between 0 (no mutual
     information) and 1 (perfect correlation). In this function, mutual
-    information is normalized by ``sqrt(H(labels_true) * H(labels_pred))``.
+    information is normalized by some generalized mean of ``H(labels_true)``
+    and ``H(labels_pred))``.
 
     This measure is not adjusted for chance. Therefore
     :func:`adjusted_mustual_info_score` might be preferred.
@@ -734,6 +765,12 @@ def normalized_mutual_info_score(labels_true, labels_pred):
     labels_pred : array, shape = [n_samples]
         A clustering of the data into disjoint subsets.
 
+    average_method : string or None, optional (default: None)
+        How to compute the normalizer in the denominator. Possible options
+        are 'min', 'sqrt', 'sum', and 'max'.
+        If None, 'sqrt' will be used, matching the behavior of
+        `v_measure_score`. 
+
     Returns
     -------
     nmi : float
@@ -764,6 +801,8 @@ def normalized_mutual_info_score(labels_true, labels_pred):
       0.0
 
     """
+    if average_method is None:
+        average_method = 'sqrt'
     labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
     classes = np.unique(labels_true)
     clusters = np.unique(labels_pred)
@@ -780,7 +819,8 @@ def normalized_mutual_info_score(labels_true, labels_pred):
     # Calculate the expected value for the mutual information
     # Calculate entropy for each labeling
     h_true, h_pred = entropy(labels_true), entropy(labels_pred)
-    nmi = mi / max(np.sqrt(h_true * h_pred), 1e-10)
+    normalizer = _generalized_average(h_true, h_pred, average_method)
+    nmi = mi / normalizer
     return nmi
 
 

From 479448683c6023d8a0351fd406eab3ad0ff7ab14 Mon Sep 17 00:00:00 2001
From: Arya McCarthy <arya.mccarthy@gmail.com>
Date: Thu, 24 May 2018 11:20:33 -0400
Subject: [PATCH 02/14] Flake8 fixes

---
 sklearn/metrics/cluster/supervised.py | 50 +++++++++++++++------------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py
index b8759a1e5695e..6c2e63d874a6f 100644
--- a/sklearn/metrics/cluster/supervised.py
+++ b/sklearn/metrics/cluster/supervised.py
@@ -11,6 +11,7 @@
 #          Thierry Guillemot <thierry.guillemot.work@gmail.com>
 #          Gregory Stupp <stuppie@gmail.com>
 #          Joel Nothman <joel.nothman@gmail.com>
+#          Arya McCarthy <arya@jhu.edu>
 # License: BSD 3 clause
 
 from __future__ import division
@@ -51,6 +52,21 @@ def check_clusterings(labels_true, labels_pred):
     return labels_true, labels_pred
 
 
+def _generalized_average(U, V, average_method):
+    """Return a particular mean of two numbers."""
+    if average_method == "min":
+        return max(min(U, V), 1e-10)
+    elif average_method == "sqrt":
+        return max(np.sqrt(U * V), 1e-10)  # Avoids zero-division error
+    elif average_method == "sum":
+        return max(np.mean([U, V]), 1e-10)
+    elif average_method == "max":
+        return max(U, V)
+    else:
+        raise ValueError("'average_method' must be 'min', 'sqrt', 'sum', or "
+                         "'max'")
+
+
 def contingency_matrix(labels_true, labels_pred, eps=None, sparse=False):
     """Build a contingency matrix describing the relationship between labels.
 
@@ -528,22 +544,6 @@ def v_measure_score(labels_true, labels_pred):
     return homogeneity_completeness_v_measure(labels_true, labels_pred)[2]
 
 
-
-def _generalized_average(U, V, average_method):
-    if average_method == "min":
-        return min(U, V)
-    elif average_method == "sqrt":
-        return max(np.sqrt(U * V), 1e-10)  # Avoids zero-division error
-    elif average_method == "sum":
-        return max(np.mean(U, V), 1e-10)
-    elif average_method == "max":
-        return max(U, V)
-    else:
-        raise ValueError("'average_method' must be 'min', 'sqrt', 'sum', or "
-        "'max'")
-
-
-
 def mutual_info_score(labels_true, labels_pred, contingency=None):
     r"""Mutual Information between two clusterings.
 
@@ -662,7 +662,7 @@ def adjusted_mutual_info_score(labels_true, labels_pred, average_method=None):
         How to compute the normalizer in the denominator. Possible options
         are 'min', 'sqrt', 'sum', and 'max'.
         If None, 'max' will be used. This is likely to change in a future
-        version. 
+        version.
 
     Returns
     -------
@@ -707,9 +707,9 @@ def adjusted_mutual_info_score(labels_true, labels_pred, average_method=None):
     """
     if average_method is None:
         warnings.warn("The behavior of AMI will change in a future version. "
-           "To match the behavior of 'v_measure_score', AMI will use "
-           "sqrt-averaging, i.e. geometric mean, by default."
-           )
+                      "To match the behavior of 'v_measure_score', AMI will "
+                      "use sum-averaging, i.e. arithmetic mean, by default."
+                      )
         average_method = 'max'
     labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
     n_samples = labels_true.shape[0]
@@ -730,11 +730,13 @@ def adjusted_mutual_info_score(labels_true, labels_pred, average_method=None):
     # Calculate entropy for each labeling
     h_true, h_pred = entropy(labels_true), entropy(labels_pred)
     normalizer = _generalized_average(h_true, h_pred, average_method)
+    print(normalizer)
     ami = (mi - emi) / (normalizer - emi)
     return ami
 
 
-def normalized_mutual_info_score(labels_true, labels_pred, average_method=None):
+def normalized_mutual_info_score(labels_true, labels_pred,
+                                 average_method=None):
     """Normalized Mutual Information between two clusterings.
 
     Normalized Mutual Information (NMI) is an normalization of the Mutual
@@ -769,7 +771,7 @@ def normalized_mutual_info_score(labels_true, labels_pred, average_method=None):
         How to compute the normalizer in the denominator. Possible options
         are 'min', 'sqrt', 'sum', and 'max'.
         If None, 'sqrt' will be used, matching the behavior of
-        `v_measure_score`. 
+        `v_measure_score`.
 
     Returns
     -------
@@ -802,6 +804,10 @@ def normalized_mutual_info_score(labels_true, labels_pred, average_method=None):
 
     """
     if average_method is None:
+        warnings.warn("The behavior of NMI will change in a future version. "
+                      "To match the behavior of 'v_measure_score', NMI will "
+                      "use sum-averaging, i.e. arithmetic mean, by default."
+                      )
         average_method = 'sqrt'
     labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
     classes = np.unique(labels_true)

From 6279c2535a91a031eb92d7ba00cea54be06f7c09 Mon Sep 17 00:00:00 2001
From: Arya McCarthy <arya.mccarthy@gmail.com>
Date: Thu, 24 May 2018 11:21:07 -0400
Subject: [PATCH 03/14] Incorporate tests of means for AMI and NMI

---
 .../metrics/cluster/tests/test_supervised.py  | 41 ++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py
index 8be39cd220d2a..a1a199cae8b56 100644
--- a/sklearn/metrics/cluster/tests/test_supervised.py
+++ b/sklearn/metrics/cluster/tests/test_supervised.py
@@ -12,6 +12,7 @@
 from sklearn.metrics.cluster import mutual_info_score
 from sklearn.metrics.cluster import normalized_mutual_info_score
 from sklearn.metrics.cluster import v_measure_score
+from sklearn.metrics.cluster.supervised import _generalized_average
 
 from sklearn.utils import assert_all_finite
 from sklearn.utils.testing import (
@@ -46,6 +47,18 @@ def test_error_messages_on_wrong_input():
                              [0, 1, 0], [[1, 1], [0, 0]])
 
 
+def test_generalized_average():
+    a, b = 1, 2
+    methods = ["min", "sqrt", "sum", "max"]
+    means = [_generalized_average(a, b, method) for method in methods]
+    assert means[0] <= means[1] <= means[2] <= means[3]
+    c, d = 12, 12
+    means = [_generalized_average(c, d, method) for method in methods]
+    assert_equal(means[0], means[1])
+    assert_equal(means[1], means[2])
+    assert_equal(means[2], means[3])
+
+
 def test_perfect_matches():
     for score_func in score_funcs:
         assert_equal(score_func([], []), 1.0)
@@ -55,6 +68,20 @@ def test_perfect_matches():
         assert_equal(score_func([0., 1., 0.], [42., 7., 42.]), 1.0)
         assert_equal(score_func([0., 1., 2.], [42., 7., 2.]), 1.0)
         assert_equal(score_func([0, 1, 2], [42, 7, 2]), 1.0)
+    score_funcs_with_changing_means = [
+        normalized_mutual_info_score,
+        adjusted_mutual_info_score,
+    ]
+    means = {"min", "sqrt", "sum", "max"}
+    for score_func in score_funcs_with_changing_means:
+        for mean in means:
+            assert_equal(score_func([], [], mean), 1.0)
+            assert_equal(score_func([0], [1], mean), 1.0)
+            assert_equal(score_func([0, 0, 0], [0, 0, 0], mean), 1.0)
+            assert_equal(score_func([0, 1, 0], [42, 7, 42], mean), 1.0)
+            assert_equal(score_func([0., 1., 0.], [42., 7., 42.], mean), 1.0)
+            assert_equal(score_func([0., 1., 2.], [42., 7., 2.], mean), 1.0)
+            assert_equal(score_func([0, 1, 2], [42, 7, 2], mean), 1.0)
 
 
 def test_homogeneous_but_not_complete_labeling():
@@ -87,7 +114,7 @@ def test_not_complete_and_not_homogeneous_labeling():
     assert_almost_equal(v, 0.52, 2)
 
 
-def test_non_consicutive_labels():
+def test_non_consecutive_labels():
     # regression tests for labels with gaps
     h, c, v = homogeneity_completeness_v_measure(
         [0, 0, 0, 2, 2, 2],
@@ -224,6 +251,14 @@ def test_exactly_zero_info_score():
         assert_equal(v_measure_score(labels_a, labels_b), 0.0)
         assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0)
         assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
+        for method in ["min", "sqrt", "sum", "max"]:
+            print(method)
+            assert_equal(adjusted_mutual_info_score(labels_a, labels_b,
+                                                    method),
+                         0.0)
+            assert_equal(normalized_mutual_info_score(labels_a, labels_b,
+                                                      method),
+                         0.0)
 
 
 def test_v_measure_and_mutual_information(seed=36):
@@ -235,6 +270,10 @@ def test_v_measure_and_mutual_information(seed=36):
         assert_almost_equal(v_measure_score(labels_a, labels_b),
                             2.0 * mutual_info_score(labels_a, labels_b) /
                             (entropy(labels_a) + entropy(labels_b)), 0)
+        assert_almost_equal(v_measure_score(labels_a, labels_b),
+                            normalized_mutual_info_score(labels_a, labels_b,
+                                                         average_method='sum')
+                            )
 
 
 def test_fowlkes_mallows_score():

From ed500d694b0f36f47efa03d8ac1dc53990182d89 Mon Sep 17 00:00:00 2001
From: Arya McCarthy <arya.mccarthy@gmail.com>
Date: Thu, 24 May 2018 11:33:41 -0400
Subject: [PATCH 04/14] Add note about `average_method` in NMI

---
 sklearn/metrics/cluster/supervised.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py
index 6c2e63d874a6f..7e04c31907555 100644
--- a/sklearn/metrics/cluster/supervised.py
+++ b/sklearn/metrics/cluster/supervised.py
@@ -730,7 +730,6 @@ def adjusted_mutual_info_score(labels_true, labels_pred, average_method=None):
     # Calculate entropy for each labeling
     h_true, h_pred = entropy(labels_true), entropy(labels_pred)
     normalizer = _generalized_average(h_true, h_pred, average_method)
-    print(normalizer)
     ami = (mi - emi) / (normalizer - emi)
     return ami
 
@@ -743,7 +742,7 @@ def normalized_mutual_info_score(labels_true, labels_pred,
     Information (MI) score to scale the results between 0 (no mutual
     information) and 1 (perfect correlation). In this function, mutual
     information is normalized by some generalized mean of ``H(labels_true)``
-    and ``H(labels_pred))``.
+    and ``H(labels_pred)``, defined by the `average_method`.
 
     This measure is not adjusted for chance. Therefore
     :func:`adjusted_mustual_info_score` might be preferred.

From 5ed8527a6a5fc72f685bdc55c7e61c03a1630a3d Mon Sep 17 00:00:00 2001
From: Arya McCarthy <arya.mccarthy@gmail.com>
Date: Thu, 24 May 2018 13:34:09 -0400
Subject: [PATCH 05/14] Update docs from AMI, NMI changes (#1)

* Correct the NMI and AMI descriptions in docs

* Update docstrings due to averaging changes

- V-measure
- Homogeneity
- Completeness
- NMI
- AMI
---
 doc/modules/clustering.rst            | 15 +++++----------
 sklearn/metrics/cluster/supervised.py | 14 ++++++++++++--
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
index ce335cef2dd5c..fe16eb4658a73 100644
--- a/doc/modules/clustering.rst
+++ b/doc/modules/clustering.rst
@@ -1102,17 +1102,11 @@ Advantages
   for any value of ``n_clusters`` and ``n_samples`` (which is not the
   case for raw Mutual Information or the V-measure for instance).
 
-- **Bounded range [0, 1]**:  Values close to zero indicate two label
+- **Upper bound of 1**:  Values close to zero indicate two label
   assignments that are largely independent, while values close to one
-  indicate significant agreement. Further, values of exactly 0 indicate
-  **purely** independent label assignments and a AMI of exactly 1 indicates
+  indicate significant agreement. Further, an AMI of exactly 1 indicates
   that the two label assignments are equal (with or without permutation).
 
-- **No assumption is made on the cluster structure**: can be used
-  to compare clustering algorithms such as k-means which assumes isotropic
-  blob shapes with results of spectral clustering algorithms which can
-  find cluster with "folded" shapes.
-
 
 Drawbacks
 ~~~~~~~~~
@@ -1185,7 +1179,7 @@ following equation, from Vinh, Epps, and Bailey, (2009). In this equation,
 Using the expected value, the adjusted mutual information can then be
 calculated using a similar form to that of the adjusted Rand index:
 
-.. math:: \text{AMI} = \frac{\text{MI} - E[\text{MI}]}{\max(H(U), H(V)) - E[\text{MI}]}
+.. math:: \text{AMI} = \frac{\text{MI} - E[\text{MI}]}{\text{mean}(H(U), H(V)) - E[\text{MI}]}
 
 .. topic:: References
 
@@ -1249,7 +1243,8 @@ Their harmonic mean called **V-measure** is computed by
   0.51...
 
 The V-measure is actually equivalent to the mutual information (NMI)
-discussed above normalized by the sum of the label entropies [B2011]_.
+discussed above normalized by the arithmetic mean of the label 
+entropies [B2011]_.
 
 Homogeneity, completeness and V-measure can be computed at once using
 :func:`homogeneity_completeness_v_measure` as follows::
diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py
index 7e04c31907555..9cb52431b6aaa 100644
--- a/sklearn/metrics/cluster/supervised.py
+++ b/sklearn/metrics/cluster/supervised.py
@@ -253,7 +253,9 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred):
 
     V-Measure is furthermore symmetric: swapping ``labels_true`` and
     ``label_pred`` will give the same score. This does not hold for
-    homogeneity and completeness.
+    homogeneity and completeness. V-Measure is identical to
+    :func:`normalized_mutual_info_score` with the averaging method
+    ``'sum'``.
 
     Read more in the :ref:`User Guide <homogeneity_completeness>`.
 
@@ -452,7 +454,8 @@ def completeness_score(labels_true, labels_pred):
 def v_measure_score(labels_true, labels_pred):
     """V-measure cluster labeling given a ground truth.
 
-    This score is identical to :func:`normalized_mutual_info_score`.
+    This score is identical to :func:`normalized_mutual_info_score` with
+    the ``'sum'`` option for averaging.
 
     The V-measure is the harmonic mean between homogeneity and completeness::
 
@@ -467,6 +470,7 @@ def v_measure_score(labels_true, labels_pred):
     measure the agreement of two independent label assignments strategies
     on the same dataset when the real ground truth is not known.
 
+
     Read more in the :ref:`User Guide <homogeneity_completeness>`.
 
     Parameters
@@ -493,6 +497,7 @@ def v_measure_score(labels_true, labels_pred):
     --------
     homogeneity_score
     completeness_score
+    normalized_mutual_info_score
 
     Examples
     --------
@@ -664,6 +669,8 @@ def adjusted_mutual_info_score(labels_true, labels_pred, average_method=None):
         If None, 'max' will be used. This is likely to change in a future
         version.
 
+        .. versionadded:: 0.20
+
     Returns
     -------
     ami: float(upperlimited by 1.0)
@@ -772,6 +779,8 @@ def normalized_mutual_info_score(labels_true, labels_pred,
         If None, 'sqrt' will be used, matching the behavior of
         `v_measure_score`.
 
+        .. versionadded:: 0.20
+
     Returns
     -------
     nmi : float
@@ -779,6 +788,7 @@ def normalized_mutual_info_score(labels_true, labels_pred,
 
     See also
     --------
+    v_measure_score: V-Measure (NMI with arithmetic mean option.)
     adjusted_rand_score: Adjusted Rand Index
     adjusted_mutual_info_score: Adjusted Mutual Information (adjusted
         against chance)

From df60d46282fed85a591b62f9aa3d51e5cb26780f Mon Sep 17 00:00:00 2001
From: Arya McCarthy <arya.mccarthy@gmail.com>
Date: Sat, 26 May 2018 21:59:41 -0400
Subject: [PATCH 06/14] Update documentation and remove nose tests (#2)

* Update v0.20.rst

* Update test_supervised.py

* Update clustering.rst
---
 doc/modules/clustering.rst                    | 21 +++++++++++---
 doc/whats_new/v0.20.rst                       |  6 ++++
 .../metrics/cluster/tests/test_supervised.py  | 29 ++++++++-----------
 3 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
index fe16eb4658a73..cf994a8dc0a37 100644
--- a/doc/modules/clustering.rst
+++ b/doc/modules/clustering.rst
@@ -1158,7 +1158,7 @@ It also can be expressed in set cardinality formulation:
 
 The normalized mutual information is defined as
 
-.. math:: \text{NMI}(U, V) = \frac{\text{MI}(U, V)}{\sqrt{H(U)H(V)}}
+.. math:: \text{NMI}(U, V) = \frac{\text{MI}(U, V)}{\text{mean}(H(U), H(V))}
 
 This value of the mutual information and also the normalized variant is not
 adjusted for chance and will tend to increase as the number of different labels
@@ -1181,6 +1181,13 @@ calculated using a similar form to that of the adjusted Rand index:
 
 .. math:: \text{AMI} = \frac{\text{MI} - E[\text{MI}]}{\text{mean}(H(U), H(V)) - E[\text{MI}]}
 
+For normalized mutual information and adjusted mutual information, the normalizing 
+value is typically some mean of the entropies of each clustering. Various means exist,
+and no firm rules exist for preferring one over the others. The decision is largely made on
+a field-by-field basis; for instance, in community detection, the arithmetic mean is
+most common. Yang et al. (2016) found that each normalizing method provided 
+"qualitatively similar behaviours".
+
 .. topic:: References
 
  * Strehl, Alexander, and Joydeep Ghosh (2002). "Cluster ensembles – a
@@ -1194,16 +1201,22 @@ calculated using a similar form to that of the adjusted Rand index:
    `doi:10.1145/1553374.1553511 <https://dl.acm.org/citation.cfm?doid=1553374.1553511>`_.
    ISBN 9781605585161.
 
- * Vinh, Epps, and Bailey, (2010). Information Theoretic Measures for
+ * Vinh, Epps, and Bailey, (2010). "Information Theoretic Measures for
    Clusterings Comparison: Variants, Properties, Normalization and
-   Correction for Chance, JMLR
-   http://jmlr.csail.mit.edu/papers/volume11/vinh10a/vinh10a.pdf
+   Correction for Chance". JMLR
+   <http://jmlr.csail.mit.edu/papers/volume11/vinh10a/vinh10a.pdf>
 
  * `Wikipedia entry for the (normalized) Mutual Information
    <https://en.wikipedia.org/wiki/Mutual_Information>`_
 
  * `Wikipedia entry for the Adjusted Mutual Information
    <https://en.wikipedia.org/wiki/Adjusted_Mutual_Information>`_
+   
+ * Yang, Algesheimer, and Tessone, (2016). "A comparative analysis of community
+   detection algorithms on artificial networks". Scientific Reports 6: 30750.
+   `doi:10.1038/srep30750 <https://www.nature.com/articles/srep30750>`_.
+   
+   
 
 .. _homogeneity_completeness:
 
diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
index ef6f065bdd3c2..9e1b158ab93b3 100644
--- a/doc/whats_new/v0.20.rst
+++ b/doc/whats_new/v0.20.rst
@@ -121,6 +121,12 @@ Metrics
 - Partial AUC is available via ``max_fpr`` parameter in
   :func:`metrics.roc_auc_score`. :issue:`3273` by
   :user:`Alexander Niederbühl <Alexander-N>`.
+- Added control over the normalizer in 
+  :func:`metrics.normalized_mutual_info_score` and
+  :func:`metrics.adjusted_mutual_info_score` via the ``average_method``
+  parameter. In a future version, the default normalizer for each will become
+  the *arithmetic* mean of the entropies of each clustering. :issue:`11124` by
+  :user:`Arya McCarthy <aryamccarthy>`.
 
 Enhancements
 ............
diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py
index a1a199cae8b56..6bcbd2fcc2e60 100644
--- a/sklearn/metrics/cluster/tests/test_supervised.py
+++ b/sklearn/metrics/cluster/tests/test_supervised.py
@@ -54,9 +54,7 @@ def test_generalized_average():
     assert means[0] <= means[1] <= means[2] <= means[3]
     c, d = 12, 12
     means = [_generalized_average(c, d, method) for method in methods]
-    assert_equal(means[0], means[1])
-    assert_equal(means[1], means[2])
-    assert_equal(means[2], means[3])
+    assert means[0] == means[1] == means[2] == means[3]
 
 
 def test_perfect_matches():
@@ -75,13 +73,13 @@ def test_perfect_matches():
     means = {"min", "sqrt", "sum", "max"}
     for score_func in score_funcs_with_changing_means:
         for mean in means:
-            assert_equal(score_func([], [], mean), 1.0)
-            assert_equal(score_func([0], [1], mean), 1.0)
-            assert_equal(score_func([0, 0, 0], [0, 0, 0], mean), 1.0)
-            assert_equal(score_func([0, 1, 0], [42, 7, 42], mean), 1.0)
-            assert_equal(score_func([0., 1., 0.], [42., 7., 42.], mean), 1.0)
-            assert_equal(score_func([0., 1., 2.], [42., 7., 2.], mean), 1.0)
-            assert_equal(score_func([0, 1, 2], [42, 7, 2], mean), 1.0)
+            assert score_func([], [], mean) ==  1.0
+            assert score_func([0], [1], mean) == 1.0
+            assert score_func([0, 0, 0], [0, 0, 0], mean) == 1.0
+            assert score_func([0, 1, 0], [42, 7, 42], mean) == 1.0
+            assert score_func([0., 1., 0.], [42., 7., 42.], mean) == 1.0
+            assert score_func([0., 1., 2.], [42., 7., 2.], mean) == 1.0
+            assert score_func([0, 1, 2], [42, 7, 2], mean) == 1.0
 
 
 def test_homogeneous_but_not_complete_labeling():
@@ -252,13 +250,10 @@ def test_exactly_zero_info_score():
         assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0)
         assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
         for method in ["min", "sqrt", "sum", "max"]:
-            print(method)
-            assert_equal(adjusted_mutual_info_score(labels_a, labels_b,
-                                                    method),
-                         0.0)
-            assert_equal(normalized_mutual_info_score(labels_a, labels_b,
-                                                      method),
-                         0.0)
+            assert adjusted_mutual_info_score(labels_a, labels_b,
+                                              method) == 0.0
+            assert normalized_mutual_info_score(labels_a, labels_b,
+                                                method) == 0.0
 
 
 def test_v_measure_and_mutual_information(seed=36):

From b449cb924e1927d65f2d0695cfff8847490a26c6 Mon Sep 17 00:00:00 2001
From: Arya McCarthy <arya.mccarthy@gmail.com>
Date: Sat, 26 May 2018 22:27:19 -0400
Subject: [PATCH 07/14] Fix multiple spaces after operator

---
 sklearn/metrics/cluster/tests/test_supervised.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py
index 6bcbd2fcc2e60..08d96130d2f3e 100644
--- a/sklearn/metrics/cluster/tests/test_supervised.py
+++ b/sklearn/metrics/cluster/tests/test_supervised.py
@@ -73,7 +73,7 @@ def test_perfect_matches():
     means = {"min", "sqrt", "sum", "max"}
     for score_func in score_funcs_with_changing_means:
         for mean in means:
-            assert score_func([], [], mean) ==  1.0
+            assert score_func([], [], mean) == 1.0
             assert score_func([0], [1], mean) == 1.0
             assert score_func([0, 0, 0], [0, 0, 0], mean) == 1.0
             assert score_func([0, 1, 0], [42, 7, 42], mean) == 1.0

From 1b36da577f314fbcb11bdaaf9cccf634d2f5143b Mon Sep 17 00:00:00 2001
From: Arya McCarthy <arya.mccarthy@gmail.com>
Date: Mon, 28 May 2018 11:19:59 -0400
Subject: [PATCH 08/14] Rename all arguments

---
 doc/modules/clustering.rst                    |  8 ++++--
 sklearn/metrics/cluster/supervised.py         | 28 +++++++++----------
 .../metrics/cluster/tests/test_supervised.py  |  9 +++---
 3 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
index cf994a8dc0a37..bb85b9c6b6e9c 100644
--- a/doc/modules/clustering.rst
+++ b/doc/modules/clustering.rst
@@ -1048,8 +1048,8 @@ Given the knowledge of the ground truth class assignments ``labels_true`` and
 our clustering algorithm assignments of the same samples ``labels_pred``, the
 **Mutual Information** is a function that measures the **agreement** of the two
 assignments, ignoring permutations.  Two different normalized versions of this
-measure are available, **Normalized Mutual Information(NMI)** and **Adjusted
-Mutual Information(AMI)**. NMI is often used in the literature while AMI was
+measure are available, **Normalized Mutual Information (NMI)** and **Adjusted
+Mutual Information (AMI)**. NMI is often used in the literature, while AMI was
 proposed more recently and is **normalized against chance**::
 
   >>> from sklearn import metrics
@@ -1188,6 +1188,10 @@ a field-by-field basis; for instance, in community detection, the arithmetic mea
 most common. Yang et al. (2016) found that each normalizing method provided 
 "qualitatively similar behaviours".
 
+Vinh et al. (2010) named variants of NMI and AMI by their averaging method. Their
+'sqrt' and 'sum' averages are the geometric and arithmetic means; we use these
+more broadly common names.
+
 .. topic:: References
 
  * Strehl, Alexander, and Joydeep Ghosh (2002). "Cluster ensembles – a
diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py
index 9cb52431b6aaa..c7de12e61d586 100644
--- a/sklearn/metrics/cluster/supervised.py
+++ b/sklearn/metrics/cluster/supervised.py
@@ -56,15 +56,15 @@ def _generalized_average(U, V, average_method):
     """Return a particular mean of two numbers."""
     if average_method == "min":
         return max(min(U, V), 1e-10)
-    elif average_method == "sqrt":
+    elif average_method == "geometric":
         return max(np.sqrt(U * V), 1e-10)  # Avoids zero-division error
-    elif average_method == "sum":
+    elif average_method == "arithmetic":
         return max(np.mean([U, V]), 1e-10)
     elif average_method == "max":
         return max(U, V)
     else:
-        raise ValueError("'average_method' must be 'min', 'sqrt', 'sum', or "
-                         "'max'")
+        raise ValueError("'average_method' must be 'min', 'geometric', "
+                         "'arithmetic', or 'max'")
 
 
 def contingency_matrix(labels_true, labels_pred, eps=None, sparse=False):
@@ -254,8 +254,8 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred):
     V-Measure is furthermore symmetric: swapping ``labels_true`` and
     ``label_pred`` will give the same score. This does not hold for
     homogeneity and completeness. V-Measure is identical to
-    :func:`normalized_mutual_info_score` with the averaging method
-    ``'sum'``.
+    :func:`normalized_mutual_info_score` with the arithmetic averaging
+    method.
 
     Read more in the :ref:`User Guide <homogeneity_completeness>`.
 
@@ -455,7 +455,7 @@ def v_measure_score(labels_true, labels_pred):
     """V-measure cluster labeling given a ground truth.
 
     This score is identical to :func:`normalized_mutual_info_score` with
-    the ``'sum'`` option for averaging.
+    the ``'arithmetic'`` option for averaging.
 
     The V-measure is the harmonic mean between homogeneity and completeness::
 
@@ -665,7 +665,7 @@ def adjusted_mutual_info_score(labels_true, labels_pred, average_method=None):
 
     average_method : string or None, optional (default: None)
         How to compute the normalizer in the denominator. Possible options
-        are 'min', 'sqrt', 'sum', and 'max'.
+        are 'min', 'geometric', 'arithmetic', and 'max'.
         If None, 'max' will be used. This is likely to change in a future
         version.
 
@@ -715,7 +715,7 @@ def adjusted_mutual_info_score(labels_true, labels_pred, average_method=None):
     if average_method is None:
         warnings.warn("The behavior of AMI will change in a future version. "
                       "To match the behavior of 'v_measure_score', AMI will "
-                      "use sum-averaging, i.e. arithmetic mean, by default."
+                      "use arithmetic mean by default."
                       )
         average_method = 'max'
     labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
@@ -775,9 +775,9 @@ def normalized_mutual_info_score(labels_true, labels_pred,
 
     average_method : string or None, optional (default: None)
         How to compute the normalizer in the denominator. Possible options
-        are 'min', 'sqrt', 'sum', and 'max'.
-        If None, 'sqrt' will be used, matching the behavior of
-        `v_measure_score`.
+        are 'min', 'geometric', 'arithmetic', and 'max'.
+        If None, 'geometric' will be used. This is likely to change in a
+        future version.
 
         .. versionadded:: 0.20
 
@@ -815,9 +815,9 @@ def normalized_mutual_info_score(labels_true, labels_pred,
     if average_method is None:
         warnings.warn("The behavior of NMI will change in a future version. "
                       "To match the behavior of 'v_measure_score', NMI will "
-                      "use sum-averaging, i.e. arithmetic mean, by default."
+                      "use arithmetic mean by default."
                       )
-        average_method = 'sqrt'
+        average_method = 'geometric'
     labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
     classes = np.unique(labels_true)
     clusters = np.unique(labels_pred)
diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py
index 08d96130d2f3e..bb2f253774b85 100644
--- a/sklearn/metrics/cluster/tests/test_supervised.py
+++ b/sklearn/metrics/cluster/tests/test_supervised.py
@@ -49,7 +49,7 @@ def test_error_messages_on_wrong_input():
 
 def test_generalized_average():
     a, b = 1, 2
-    methods = ["min", "sqrt", "sum", "max"]
+    methods = ["min", "geometric", "arithmetic", "max"]
     means = [_generalized_average(a, b, method) for method in methods]
     assert means[0] <= means[1] <= means[2] <= means[3]
     c, d = 12, 12
@@ -70,7 +70,7 @@ def test_perfect_matches():
         normalized_mutual_info_score,
         adjusted_mutual_info_score,
     ]
-    means = {"min", "sqrt", "sum", "max"}
+    means = {"min", "geometric", "arithmetic", "max"}
     for score_func in score_funcs_with_changing_means:
         for mean in means:
             assert score_func([], [], mean) == 1.0
@@ -249,7 +249,7 @@ def test_exactly_zero_info_score():
         assert_equal(v_measure_score(labels_a, labels_b), 0.0)
         assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0)
         assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
-        for method in ["min", "sqrt", "sum", "max"]:
+        for method in ["min", "geometric", "arithmetic", "max"]:
             assert adjusted_mutual_info_score(labels_a, labels_b,
                                               method) == 0.0
             assert normalized_mutual_info_score(labels_a, labels_b,
@@ -265,9 +265,10 @@ def test_v_measure_and_mutual_information(seed=36):
         assert_almost_equal(v_measure_score(labels_a, labels_b),
                             2.0 * mutual_info_score(labels_a, labels_b) /
                             (entropy(labels_a) + entropy(labels_b)), 0)
+        avg = 'arithmetic'
         assert_almost_equal(v_measure_score(labels_a, labels_b),
                             normalized_mutual_info_score(labels_a, labels_b,
-                                                         average_method='sum')
+                                                         average_method=avg)
                             )
 
 

From 3d8bf2c16133730ef3041c4d0816526ffdf277df Mon Sep 17 00:00:00 2001
From: Arya McCarthy <arya.mccarthy@gmail.com>
Date: Wed, 6 Jun 2018 13:20:07 -0400
Subject: [PATCH 09/14] No more arbitrary values!

---
 doc/modules/clustering.rst            |  3 ++-
 doc/whats_new/v0.20.rst               | 13 ++++++++++++-
 sklearn/metrics/cluster/supervised.py | 21 +++++++++++++--------
 3 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
index bb85b9c6b6e9c..c2e40081c49f1 100644
--- a/doc/modules/clustering.rst
+++ b/doc/modules/clustering.rst
@@ -1186,7 +1186,8 @@ value is typically some mean of the entropies of each clustering. Various means
 and no firm rules exist for preferring one over the others.  The decision is largely 
 a field-by-field basis; for instance, in community detection, the arithmetic mean is
 most common. Yang et al. (2016) found that each normalizing method provided 
-"qualitatively similar behaviours".
+"qualitatively similar behaviours". In our implementation, this is
+controlled by the ``average_method`` parameter.
 
 Vinh et al. (2010) named variants of NMI and AMI by their averaging method. Their
 'sqrt' and 'sum' averages are the geometric and arithmetic means; we use these
diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
index 9e1b158ab93b3..a61badd73f929 100644
--- a/doc/whats_new/v0.20.rst
+++ b/doc/whats_new/v0.20.rst
@@ -124,7 +124,7 @@ Metrics
 - Added control over the normalizer in 
   :func:`metrics.normalized_mutual_information_score` and
   :func:`metrics.adjusted_mutual_information_score` via the ``average_method``
-  parameter. In a future version, the default normalizer for each will become
+  parameter. In version 0.22, the default normalizer for each will become
   the *arithmetic* mean of the entropies of each clustering. :issue:`11124` by
   :user:`Arya McCarthy <aryamccarthy>`.
 
@@ -537,6 +537,17 @@ Metrics
   for :func:`metrics.roc_auc_score`. Moreover using ``reorder=True`` can hide bugs
   due to floating point error in the input.
   :issue:`9851` by :user:`Hanmin Qin <qinhanmin2014>`.
+- In :func:`metrics.normalized_mutual_information_score` and
+  :func:`metrics.adjusted_mutual_information_score`, 
+  warn that ``average_method``
+  will have a new default value. In version 0.22, the default normalizer for each 
+  will become the *arithmetic* mean of the entropies of each clustering. Currently,
+  :func:`metrics.normalized_mutual_information_score` uses the default of
+  ``average_method='geometric'``, and :func:`metrics.adjusted_mutual_information_score`
+  uses the default of ``average_method='max'`` to match their behaviors in
+  version 0.19.
+  :issue:`11124` by :user:`Arya McCarthy <aryamccarthy>`.
+
 
 Cluster
 
diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py
index c7de12e61d586..4d984b73b8dde 100644
--- a/sklearn/metrics/cluster/supervised.py
+++ b/sklearn/metrics/cluster/supervised.py
@@ -55,11 +55,11 @@ def check_clusterings(labels_true, labels_pred):
 def _generalized_average(U, V, average_method):
     """Return a particular mean of two numbers."""
     if average_method == "min":
-        return max(min(U, V), 1e-10)
+        return min(U, V)
     elif average_method == "geometric":
-        return max(np.sqrt(U * V), 1e-10)  # Avoids zero-division error
+        return np.sqrt(U * V)
     elif average_method == "arithmetic":
-        return max(np.mean([U, V]), 1e-10)
+        return np.mean([U, V])
     elif average_method == "max":
         return max(U, V)
     else:
@@ -713,9 +713,9 @@ def adjusted_mutual_info_score(labels_true, labels_pred, average_method=None):
 
     """
     if average_method is None:
-        warnings.warn("The behavior of AMI will change in a future version. "
+        warnings.warn("The behavior of AMI will change in version 0.22. "
                       "To match the behavior of 'v_measure_score', AMI will "
-                      "use arithmetic mean by default."
+                      "use average_method='arithmetic' by default."
                       )
         average_method = 'max'
     labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
@@ -737,7 +737,10 @@ def adjusted_mutual_info_score(labels_true, labels_pred, average_method=None):
     # Calculate entropy for each labeling
     h_true, h_pred = entropy(labels_true), entropy(labels_pred)
     normalizer = _generalized_average(h_true, h_pred, average_method)
-    ami = (mi - emi) / (normalizer - emi)
+    # Avoid 0.0 / 0.0 when either entropy is zero.
+    denominator = normalizer - emi
+    denominator = max(denominator, np.finfo('float64').eps)
+    ami = (mi - emi) / denominator
     return ami
 
 
@@ -813,9 +816,9 @@ def normalized_mutual_info_score(labels_true, labels_pred,
 
     """
     if average_method is None:
-        warnings.warn("The behavior of NMI will change in a future version. "
+        warnings.warn("The behavior of NMI will change in version 0.22. "
                       "To match the behavior of 'v_measure_score', NMI will "
-                      "use arithmetic mean by default."
+                      "use average_method='arithmetic' by default."
                       )
         average_method = 'geometric'
     labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
@@ -835,6 +838,8 @@ def normalized_mutual_info_score(labels_true, labels_pred,
     # Calculate entropy for each labeling
     h_true, h_pred = entropy(labels_true), entropy(labels_pred)
     normalizer = _generalized_average(h_true, h_pred, average_method)
+    # Avoid 0.0 / 0.0 when either entropy is zero.
+    normalizer = max(normalizer, np.finfo('float64').eps)
     nmi = mi / normalizer
     return nmi
 

From 2854014e5ee25fceeb74a34c46c696c65f104107 Mon Sep 17 00:00:00 2001
From: Arya McCarthy <arya.mccarthy@gmail.com>
Date: Wed, 6 Jun 2018 14:53:13 -0400
Subject: [PATCH 10/14] Improve handling of floating-point imprecision

---
 sklearn/metrics/cluster/supervised.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py
index 4d984b73b8dde..52c4ad5454388 100644
--- a/sklearn/metrics/cluster/supervised.py
+++ b/sklearn/metrics/cluster/supervised.py
@@ -737,9 +737,15 @@ def adjusted_mutual_info_score(labels_true, labels_pred, average_method=None):
     # Calculate entropy for each labeling
     h_true, h_pred = entropy(labels_true), entropy(labels_pred)
     normalizer = _generalized_average(h_true, h_pred, average_method)
-    # Avoid 0.0 / 0.0 when either entropy is zero.
     denominator = normalizer - emi
-    denominator = max(denominator, np.finfo('float64').eps)
+    # Avoid 0.0 / 0.0 when expectation equals maximum, i.e. a perfect match.
+    # normalizer should always be >= emi, but because of floating-point
+    # representation, sometimes emi is slightly larger. Correct this
+    # by preserving the sign.
+    if denominator < 0:
+        denominator = min(denominator, -np.finfo('float64').eps)
+    else:
+        denominator = max(denominator, np.finfo('float64').eps)
     ami = (mi - emi) / denominator
     return ami
 

From 059bae6c0b4088c8b3a2898e3c2e46e5df883435 Mon Sep 17 00:00:00 2001
From: Arya McCarthy <arya.mccarthy@gmail.com>
Date: Wed, 6 Jun 2018 20:04:49 -0400
Subject: [PATCH 11/14] Clearly state when the change occurs

---
 sklearn/metrics/cluster/supervised.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py
index 52c4ad5454388..0dbedf1a9dafd 100644
--- a/sklearn/metrics/cluster/supervised.py
+++ b/sklearn/metrics/cluster/supervised.py
@@ -666,8 +666,8 @@ def adjusted_mutual_info_score(labels_true, labels_pred, average_method=None):
     average_method : string or None, optional (default: None)
         How to compute the normalizer in the denominator. Possible options
         are 'min', 'geometric', 'arithmetic', and 'max'.
-        If None, 'max' will be used. This is likely to change in a future
-        version.
+        If None, 'max' will be used. The default will change to
+        'arithmetic' in version 0.22.
 
         .. versionadded:: 0.20
 
@@ -785,8 +785,8 @@ def normalized_mutual_info_score(labels_true, labels_pred,
     average_method : string or None, optional (default: None)
         How to compute the normalizer in the denominator. Possible options
         are 'min', 'geometric', 'arithmetic', and 'max'.
-        If None, 'geometric' will be used. This is likely to change in a
-        future version.
+        If None, 'geometric' will be used. The default will change to
+        'arithmetic' in version 0.22.
 
         .. versionadded:: 0.20
 

From e8b957903f808ebaba68680072e653691b373ad3 Mon Sep 17 00:00:00 2001
From: Arya McCarthy <arya.mccarthy@gmail.com>
Date: Sun, 15 Jul 2018 15:28:27 -0400
Subject: [PATCH 12/14] Update AMI/NMI docs

---
 doc/modules/clustering.rst | 30 +++++++++++++++---------------
 doc/whats_new/v0.20.rst    |  2 ++
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
index c2e40081c49f1..7f55e6ac7830e 100644
--- a/doc/modules/clustering.rst
+++ b/doc/modules/clustering.rst
@@ -1166,7 +1166,7 @@ adjusted for chance and will tend to increase as the number of different labels
 between the label assignments.
 
 The expected value for the mutual information can be calculated using the
-following equation, from Vinh, Epps, and Bailey, (2009). In this equation,
+following equation [VEB2009]_. In this equation,
 :math:`a_i = |U_i|` (the number of elements in :math:`U_i`) and
 :math:`b_j = |V_j|` (the number of elements in :math:`V_j`).
 
@@ -1181,15 +1181,15 @@ calculated using a similar form to that of the adjusted Rand index:
 
 .. math:: \text{AMI} = \frac{\text{MI} - E[\text{MI}]}{\text{mean}(H(U), H(V)) - E[\text{MI}]}
 
-For normalized mutual information and adjusted mutual information, the normalizing 
-value is typically some mean of the entropies of each clustering. Various means exist,
-and no firm rules exist for preferring one over the others.  The decision is largely 
-a field-by-field basis; for instance, in community detection, the arithmetic mean is
-most common. Yang et al. (2016) found that each normalizing method provided 
-"qualitatively similar behaviours". In our implementation, this is
-controlled by the ``average_method`` parameter.
+For normalized mutual information and adjusted mutual information, the normalizing
+value is typically some *generalized* mean of the entropies of each clustering.
+Various generalized means exist, and no firm rules exist for preferring one over the
+others.  The decision is largely made on a field-by-field basis; for instance, in community
+detection, the arithmetic mean is most common. Each
+normalizing method provides "qualitatively similar behaviours" [YAT2016]_. In our
+implementation, this is controlled by the ``average_method`` parameter.
 
-Vinh et al. (2010) named variants of NMI and AMI by their averaging method. Their
+Vinh et al. (2010) named variants of NMI and AMI by their averaging method [VEB2010]_. Their
 'sqrt' and 'sum' averages are the geometric and arithmetic means; we use these
 more broadly common names.
 
@@ -1200,13 +1200,13 @@ more broadly common names.
    Machine Learning Research 3: 583–617.
    `doi:10.1162/153244303321897735 <http://strehl.com/download/strehl-jmlr02.pdf>`_.
 
- * Vinh, Epps, and Bailey, (2009). "Information theoretic measures
+ * [VEB2009] Vinh, Epps, and Bailey, (2009). "Information theoretic measures
    for clusterings comparison". Proceedings of the 26th Annual International
    Conference on Machine Learning - ICML '09.
    `doi:10.1145/1553374.1553511 <https://dl.acm.org/citation.cfm?doid=1553374.1553511>`_.
    ISBN 9781605585161.
 
- * Vinh, Epps, and Bailey, (2010). "Information Theoretic Measures for
+ * [VEB2010] Vinh, Epps, and Bailey, (2010). "Information Theoretic Measures for
    Clusterings Comparison: Variants, Properties, Normalization and
    Correction for Chance". JMLR
    <http://jmlr.csail.mit.edu/papers/volume11/vinh10a/vinh10a.pdf>
@@ -1217,7 +1217,8 @@ more broadly common names.
  * `Wikipedia entry for the Adjusted Mutual Information
    <https://en.wikipedia.org/wiki/Adjusted_Mutual_Information>`_
    
- * Yang, Algesheimer, and Tessone, (2016). "A comparative analysis of community
+ * [YAT2016] Yang, Algesheimer, and Tessone, (2016). "A comparative analysis of
+   community
    detection algorithms on artificial networks". Scientific Reports 6: 30750.
    `doi:10.1038/srep30750 <https://www.nature.com/articles/srep30750>`_.
    
@@ -1261,8 +1262,7 @@ Their harmonic mean called **V-measure** is computed by
   0.51...
 
 The V-measure is actually equivalent to the mutual information (NMI)
-discussed above normalized by the arithmetic mean of the label 
-entropies [B2011]_.
+discussed above, with the aggregation function being the arithmetic mean [B2011]_.
 
 Homogeneity, completeness and V-measure can be computed at once using
 :func:`homogeneity_completeness_v_measure` as follows::
@@ -1437,7 +1437,7 @@ Advantages
   for any value of ``n_clusters`` and ``n_samples`` (which is not the
   case for raw Mutual Information or the V-measure for instance).
 
-- **Bounded range [0, 1]**:  Values close to zero indicate two label
+- **Upper-bounded at 1**:  Values close to zero indicate two label
   assignments that are largely independent, while values close to one
   indicate significant agreement. Further, values of exactly 0 indicate
   **purely** independent label assignments and a AMI of exactly 1 indicates
diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
index a61badd73f929..9001c3b839939 100644
--- a/doc/whats_new/v0.20.rst
+++ b/doc/whats_new/v0.20.rst
@@ -121,6 +121,7 @@ Metrics
 - Partial AUC is available via ``max_fpr`` parameter in
   :func:`metrics.roc_auc_score`. :issue:`3273` by
   :user:`Alexander Niederbühl <Alexander-N>`.
+
 - Added control over the normalizer in 
   :func:`metrics.normalized_mutual_information_score` and
   :func:`metrics.adjusted_mutual_information_score` via the ``average_method``
@@ -537,6 +538,7 @@ Metrics
   for :func:`metrics.roc_auc_score`. Moreover using ``reorder=True`` can hide bugs
   due to floating point error in the input.
   :issue:`9851` by :user:`Hanmin Qin <qinhanmin2014>`.
+
 - In :func:`metrics.normalized_mutual_information_score` and
   :func:`metrics.adjusted_mutual_information_score`, 
   warn that ``average_method``

From c65d2b3e3ea6b55aee32d835849fa285ed6f9aad Mon Sep 17 00:00:00 2001
From: Andreas Mueller <t3kcit@gmail.com>
Date: Mon, 16 Jul 2018 15:21:04 -0500
Subject: [PATCH 13/14] Update v0.20.rst

---
 doc/whats_new/v0.20.rst | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
index 5f1b29e15004c..dca1157ac0368 100644
--- a/doc/whats_new/v0.20.rst
+++ b/doc/whats_new/v0.20.rst
@@ -774,7 +774,6 @@ Metrics
   due to floating point error in the input.
   :issue:`9851` by :user:`Hanmin Qin <qinhanmin2014>`.
 
-<<<<<<< HEAD
 - In :func:`metrics.normalized_mutual_information_score` and
   :func:`metrics.adjusted_mutual_information_score`, 
   warn that ``average_method``
@@ -786,13 +785,11 @@ Metrics
   version 0.19.
   :issue:`11124` by :user:`Arya McCarthy <aryamccarthy>`.
 
-=======
 - The ``batch_size`` parameter to :func:`metrics.pairwise_distances_argmin_min`
   and :func:`metrics.pairwise_distances_argmin` is deprecated to be removed in
   v0.22.  It no longer has any effect, as batch size is determined by global
   ``working_memory`` config. See :ref:`working_memory`. :issue:`10280` by `Joel
   Nothman`_ and :user:`Aman Dalmia <dalmia>`.
->>>>>>> master
 
 Cluster
 

From a5b3c0f91cf39f234a181b2ab8ba0aeb69cfad93 Mon Sep 17 00:00:00 2001
From: Arya McCarthy <arya.mccarthy@gmail.com>
Date: Mon, 16 Jul 2018 18:16:50 -0400
Subject: [PATCH 14/14] Catch FutureWarnings in AMI and NMI

---
 sklearn/metrics/cluster/supervised.py         | 27 ++++++++++---------
 .../metrics/cluster/tests/test_supervised.py  | 18 +++++++++++++
 2 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py
index 225bb1614fa72..13addf29fdc00 100644
--- a/sklearn/metrics/cluster/supervised.py
+++ b/sklearn/metrics/cluster/supervised.py
@@ -639,7 +639,8 @@ def mutual_info_score(labels_true, labels_pred, contingency=None):
     return mi.sum()
 
 
-def adjusted_mutual_info_score(labels_true, labels_pred, average_method=None):
+def adjusted_mutual_info_score(labels_true, labels_pred,
+                               average_method='warn'):
     """Adjusted Mutual Information between two clusterings.
 
     Adjusted Mutual Information (AMI) is an adjustment of the Mutual
@@ -672,17 +673,17 @@ def adjusted_mutual_info_score(labels_true, labels_pred, average_method=None):
     labels_pred : array, shape = [n_samples]
         A clustering of the data into disjoint subsets.
 
-    average_method : string or None, optional (default: None)
+    average_method : string, optional (default: 'warn')
         How to compute the normalizer in the denominator. Possible options
         are 'min', 'geometric', 'arithmetic', and 'max'.
-        If None, 'max' will be used. The default will change to
+        If 'warn', 'max' will be used. The default will change to
         'arithmetic' in version 0.22.
 
         .. versionadded:: 0.20
 
     Returns
     -------
-    ami: float(upperlimited by 1.0)
+    ami: float (upperlimited by 1.0)
        The AMI returns a value of 1 when the two partitions are identical
        (ie perfectly matched). Random partitions (independent labellings) have
        an expected AMI around 0 on average hence can be negative.
@@ -721,11 +722,11 @@ def adjusted_mutual_info_score(labels_true, labels_pred, average_method=None):
        <https://en.wikipedia.org/wiki/Adjusted_Mutual_Information>`_
 
     """
-    if average_method is None:
+    if average_method == 'warn':
         warnings.warn("The behavior of AMI will change in version 0.22. "
                       "To match the behavior of 'v_measure_score', AMI will "
-                      "use average_method='arithmetic' by default."
-                      )
+                      "use average_method='arithmetic' by default.",
+                      FutureWarning)
         average_method = 'max'
     labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
     n_samples = labels_true.shape[0]
@@ -760,7 +761,7 @@ def adjusted_mutual_info_score(labels_true, labels_pred, average_method=None):
 
 
 def normalized_mutual_info_score(labels_true, labels_pred,
-                                 average_method=None):
+                                 average_method='warn'):
     """Normalized Mutual Information between two clusterings.
 
     Normalized Mutual Information (NMI) is an normalization of the Mutual
@@ -791,10 +792,10 @@ def normalized_mutual_info_score(labels_true, labels_pred,
     labels_pred : array, shape = [n_samples]
         A clustering of the data into disjoint subsets.
 
-    average_method : string or None, optional (default: None)
+    average_method : string, optional (default: 'warn')
         How to compute the normalizer in the denominator. Possible options
         are 'min', 'geometric', 'arithmetic', and 'max'.
-        If None, 'geometric' will be used. The default will change to
+        If 'warn', 'geometric' will be used. The default will change to
         'arithmetic' in version 0.22.
 
         .. versionadded:: 0.20
@@ -830,11 +831,11 @@ def normalized_mutual_info_score(labels_true, labels_pred,
       0.0
 
     """
-    if average_method is None:
+    if average_method == 'warn':
         warnings.warn("The behavior of NMI will change in version 0.22. "
                       "To match the behavior of 'v_measure_score', NMI will "
-                      "use average_method='arithmetic' by default."
-                      )
+                      "use average_method='arithmetic' by default.",
+                      FutureWarning)
         average_method = 'geometric'
     labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
     classes = np.unique(labels_true)
diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py
index bb2f253774b85..46b95cfd8fda4 100644
--- a/sklearn/metrics/cluster/tests/test_supervised.py
+++ b/sklearn/metrics/cluster/tests/test_supervised.py
@@ -17,6 +17,7 @@
 from sklearn.utils import assert_all_finite
 from sklearn.utils.testing import (
         assert_equal, assert_almost_equal, assert_raise_message,
+        assert_warns_message, ignore_warnings
 )
 from numpy.testing import assert_array_almost_equal
 
@@ -31,6 +32,18 @@
 ]
 
 
+def test_future_warning():
+    score_funcs_with_changing_means = [
+        normalized_mutual_info_score,
+        adjusted_mutual_info_score,
+    ]
+    warning_msg = "The behavior of "
+    args = [0, 0, 0], [0, 0, 0]
+    for score_func in score_funcs_with_changing_means:
+        assert_warns_message(FutureWarning, warning_msg, score_func, *args)
+
+
+@ignore_warnings(category=FutureWarning)
 def test_error_messages_on_wrong_input():
     for score_func in score_funcs:
         expected = ('labels_true and labels_pred must have same size,'
@@ -57,6 +70,7 @@ def test_generalized_average():
     assert means[0] == means[1] == means[2] == means[3]
 
 
+@ignore_warnings(category=FutureWarning)
 def test_perfect_matches():
     for score_func in score_funcs:
         assert_equal(score_func([], []), 1.0)
@@ -134,6 +148,7 @@ def test_non_consecutive_labels():
     assert_almost_equal(ari_2, 0.24, 2)
 
 
+@ignore_warnings(category=FutureWarning)
 def uniform_labelings_scores(score_func, n_samples, k_range, n_runs=10,
                              seed=42):
     # Compute score for random uniform cluster labelings
@@ -147,6 +162,7 @@ def uniform_labelings_scores(score_func, n_samples, k_range, n_runs=10,
     return scores
 
 
+@ignore_warnings(category=FutureWarning)
 def test_adjustment_for_chance():
     # Check that adjusted scores are almost zero on random labels
     n_clusters_range = [2, 10, 50, 90]
@@ -160,6 +176,7 @@ def test_adjustment_for_chance():
     assert_array_almost_equal(max_abs_scores, [0.02, 0.03, 0.03, 0.02], 2)
 
 
+@ignore_warnings(category=FutureWarning)
 def test_adjusted_mutual_info_score():
     # Compute the Adjusted Mutual Information and test against known values
     labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
@@ -240,6 +257,7 @@ def test_contingency_matrix_sparse():
                                     eps=1e-10, sparse=True)
 
 
+@ignore_warnings(category=FutureWarning)
 def test_exactly_zero_info_score():
     # Check numerical stability when information is exactly zero
     for i in np.logspace(1, 4, 4).astype(np.int):