From 221301865d9ad39e92b4372fb7b43e168e7274e2 Mon Sep 17 00:00:00 2001 From: Date: Wed, 7 Feb 2018 21:17:45 +0000 Subject: [PATCH 01/39] adding functionality to allow more clustering metrics --- sklearn/metrics/cluster/__init__.py | 3 +- sklearn/metrics/cluster/supervised.py | 102 ++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py index 4cda1108ece32..825bd080cff31 100644 --- a/sklearn/metrics/cluster/__init__.py +++ b/sklearn/metrics/cluster/__init__.py @@ -17,6 +17,7 @@ from .supervised import v_measure_score from .supervised import fowlkes_mallows_score from .supervised import entropy +from .supervised import class_cluster_match from .unsupervised import silhouette_samples from .unsupervised import silhouette_score from .unsupervised import calinski_harabaz_score @@ -26,5 +27,5 @@ "adjusted_rand_score", "completeness_score", "contingency_matrix", "expected_mutual_information", "homogeneity_completeness_v_measure", "homogeneity_score", "mutual_info_score", "v_measure_score", - "fowlkes_mallows_score", "entropy", "silhouette_samples", + "fowlkes_mallows_score", "entropy", "class_cluster_match", "silhouette_samples", "silhouette_score", "calinski_harabaz_score", "consensus_score"] diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index ebbbacd4caa01..b50f6328af2e1 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -11,6 +11,7 @@ # Thierry Guillemot # Gregory Stupp # Joel Nothman +# Lucas Pugens Fernandes # License: BSD 3 clause from __future__ import division @@ -19,9 +20,11 @@ import numpy as np from scipy import sparse as sp +from scipy.optimize import linprog from .expected_mutual_info_fast import expected_mutual_information from ...utils.validation import check_array +from ...utils.multiclass import unique_labels from ...utils.fixes import comb @@ -870,3 +873,102 @@ def entropy(labels): # log(a / b) should be calculated as log(a) - log(b) for # possible loss of precision return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum))) + + +def max_main_diagonal(A): + """Sort matrix A columns to achieve greater main diagonal sum + Sorting is done by maximization of the confusion matrix :math:`C` + main diagonal sum :math:`\sum{i=0}^{K}C_{i, i}`. Notice the + number of cluster has to be equal or smaller than the number + of true classes. + Parameters + ---------- + A : array, shape = [n,n] + Square numerical matrix + Returns + ------- + B : array, shape = [n,n] + Pivot matrix that sorts A for maximum main diagonal sum + References + ---------- + Examples + -------- + >>> from sklearn.metrics.cluster import max_main_diagonal + >>> import numpy as np + >>> A = np.matrix([[2, 1, 0], + [1, 0, 0], + [0, 2, 0]]) + >>> max_main_diagonal(A) + array([[1., 0., 0.], + [0., 0., 1.], + [0., 1., 0.]]) + """ + n, n = A.shape + res = linprog(-A.ravel(), + A_eq=np.r_[np.kron(np.identity(n), np.ones((1, n))), + np.kron(np.ones((1, n)), np.identity(n))], + b_eq=np.ones((2*n,)), bounds=n*n*[(0, None)]) + assert res.success + return res.x.reshape(n, n).T + + +def class_cluster_match(y_true, y_pred, translate=True): + """Sort prediction labels in order to maximize the confusion matrix main diagonal sum + Sort the prediction labels of a clustering output in order to enable calc + of external metrics (eg. accuracy, f1_score, ...). 
Sorting is done by + maximization of the confusion matrix :math:`C` main diagonal sum + :math:`\sum{i=0}^{K}C_{i, i}`. Notice the number of cluster has to be equal + or smaller than the number of true classes. + Parameters + ---------- + y_true : array, shape = [n_samples] + Ground truth (correct) target values. + y_pred : array, shape = [n_samples] + Estimated targets as returned by a clustering algorithm. + translate : boolean, optional, default True + If True, y_pred_sort will be translated from y_pred notation symbols to y_true notation symbols. + Returns + ------- + y_pred_sort : array, shape = [n_classes, n_classes] + Estimated targets sorted for maximum accuracy with y_true + References + ---------- + Examples + -------- + >>> from sklearn.metrics import confusion_matrix + >>> from sklearn.metrics.cluster import class_cluster_match + >>> y_true = ["class1", "class2", "class3", "class1", "class1", "class3"] + >>> y_pred = [0, 0, 2, 2, 0, 2] + >>> y_pred_translated = class_cluster_match(y_true, y_pred) + >>> y_pred_translated + ["class1", "class1", "class3", "class3", "class1", "class3"] + >>> confusion_matrix(y_true, y_pred_translated) + array([[2., 0., 1.], + [1., 0., 0.], + [0., 0., 2.]]) + """ + classes = list(unique_labels(y_true)) + n_classes = len(classes) + num_classes = [classes.index(y) for y in y_true] + clusters = list(unique_labels(y_pred)) + n_clusters = len(clusters) + num_clusters = [clusters.index(y) for y in y_pred] + + if n_clusters > n_classes: + raise ValueError("Number of different clusters ("+str(n_clusters) + + ") should be smaller or equal to the number of different classes ("+str(n_classes)+")") + + cm = np.zeros((n_classes, n_classes)) + + for y_t, y_p in zip(num_classes, num_clusters): + cm[y_t, y_p] += 1 + + shuffle = best_perm(cm) + + matching_clusters = [row.tolist().index(1) for row in shuffle] + + y_pred_sort = [matching_clusters[y] for y in num_clusters] + if translate: + y_pred_sort = [classes[y] for y in y_pred_sort] + + return y_pred_sort From ef2ec2bbb4ce813cbb7adcddeac32efcf2abe635 Mon Sep 17 00:00:00 2001 From: Date: Wed, 7 Feb 2018 21:29:05 +0000 Subject: [PATCH 02/39] formatting compliant --- sklearn/metrics/cluster/supervised.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index b50f6328af2e1..2008fbbf2ac1f 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -912,9 +912,9 @@ def max_main_diagonal(A): return res.x.reshape(n, n).T -def class_cluster_match(y_true, y_pred, translate=True): - """Sort prediction labels in order to maximize the confusion matrix main diagonal sum - Sort the prediction labels of a clustering output in order to enable calc +def class_cluster_match(y_true, y_pred): + """Sort prediction labels to maximize the confusion matrix main diagonal sum + Sort the prediction labels of a clustering output to enable calc of external metrics (eg. accuracy, f1_score, ...). Sorting is done by maximization of the confusion matrix :math:`C` main diagonal sum :math:`\sum{i=0}^{K}C_{i, i}`. Notice the number of cluster has to be equal @@ -925,8 +925,6 @@ def class_cluster_match(y_true, y_pred, translate=True): Ground truth (correct) target values. y_pred : array, shape = [n_samples] Estimated targets as returned by a clustering algorithm. 
- translate : boolean, optional, default True - If True, y_pred_sort will be translated from y_pred notation symbols to y_true notation symbols. Returns ------- y_pred_sort : array, shape = [n_classes, n_classes] @@ -956,19 +954,20 @@ def class_cluster_match(y_true, y_pred, translate=True): if n_clusters > n_classes: raise ValueError("Number of different clusters ("+str(n_clusters) + - ") should be smaller or equal to the number of different classes ("+str(n_classes)+")") + ") should be smaller or equal to" + + " the number of different" + + " classes ("+str(n_classes)+")") cm = np.zeros((n_classes, n_classes)) for y_t, y_p in zip(num_classes, num_clusters): cm[y_t, y_p] += 1 - shuffle = best_perm(cm) + shuffle = max_main_diagonal(cm) matching_clusters = [row.tolist().index(1) for row in shuffle] y_pred_sort = [matching_clusters[y] for y in num_clusters] - if translate: - y_pred_sort = [classes[y] for y in y_pred_sort] + y_pred_sort = [classes[y] for y in y_pred_sort] return y_pred_sort From 5fab323fdd728c862029cde0653dbca7db7d9de6 Mon Sep 17 00:00:00 2001 From: Date: Wed, 7 Feb 2018 21:42:18 +0000 Subject: [PATCH 03/39] formatting and adding an functionality example --- examples/cluster/plot_affinity_propagation.py | 5 +++++ sklearn/metrics/cluster/__init__.py | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/cluster/plot_affinity_propagation.py b/examples/cluster/plot_affinity_propagation.py index 2c8fc3acc3936..1084b612291ec 100644 --- a/examples/cluster/plot_affinity_propagation.py +++ b/examples/cluster/plot_affinity_propagation.py @@ -27,6 +27,7 @@ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) +translated_labels = metrics.cluster.class_cluster_match(labels_true,labels) print('Estimated number of clusters: %d' % n_clusters_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) @@ -38,6 +39,10 @@ % metrics.adjusted_mutual_info_score(labels_true, labels)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean')) +print("Accuracy: %0.3f" + % metrics.accuracy_score(labels_true, translated_labels)) +print("Confusion Matrix:\n%s" + % str(metrics.confusion_matrix(labels_true, translated_labels))) # ############################################################################# # Plot result diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py index 825bd080cff31..07ab7260bf2e3 100644 --- a/sklearn/metrics/cluster/__init__.py +++ b/sklearn/metrics/cluster/__init__.py @@ -27,5 +27,6 @@ "adjusted_rand_score", "completeness_score", "contingency_matrix", "expected_mutual_information", "homogeneity_completeness_v_measure", "homogeneity_score", "mutual_info_score", "v_measure_score", - "fowlkes_mallows_score", "entropy", "class_cluster_match", "silhouette_samples", - "silhouette_score", "calinski_harabaz_score", "consensus_score"] + "fowlkes_mallows_score", "entropy", "class_cluster_match", + "silhouette_samples", "silhouette_score", "calinski_harabaz_score", + "consensus_score"] From 8402cf22044de6c5ad6d878b826ac10dead79d15 Mon Sep 17 00:00:00 2001 From: Lucas Pugens Fernandes Date: Wed, 7 Feb 2018 21:02:58 -0200 Subject: [PATCH 04/39] adding comma to comply with formatting --- examples/cluster/plot_affinity_propagation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_affinity_propagation.py b/examples/cluster/plot_affinity_propagation.py index 1084b612291ec..4ab71d90e018b 100644 --- 
a/examples/cluster/plot_affinity_propagation.py +++ b/examples/cluster/plot_affinity_propagation.py @@ -27,7 +27,7 @@ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) -translated_labels = metrics.cluster.class_cluster_match(labels_true,labels) +translated_labels = metrics.cluster.class_cluster_match(labels_true, labels) print('Estimated number of clusters: %d' % n_clusters_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) From e97ef2b2057e076af4f798f24d35a61e31ae82d6 Mon Sep 17 00:00:00 2001 From: Lucas Pugens Fernandes Date: Wed, 7 Feb 2018 21:05:59 -0200 Subject: [PATCH 05/39] Fixed doc generator --- sklearn/metrics/cluster/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py index 07ab7260bf2e3..222203dbdfc16 100644 --- a/sklearn/metrics/cluster/__init__.py +++ b/sklearn/metrics/cluster/__init__.py @@ -17,6 +17,7 @@ from .supervised import v_measure_score from .supervised import fowlkes_mallows_score from .supervised import entropy +from .supervised import max_main_diagonal from .supervised import class_cluster_match from .unsupervised import silhouette_samples from .unsupervised import silhouette_score @@ -27,6 +28,6 @@ "adjusted_rand_score", "completeness_score", "contingency_matrix", "expected_mutual_information", "homogeneity_completeness_v_measure", "homogeneity_score", "mutual_info_score", "v_measure_score", - "fowlkes_mallows_score", "entropy", "class_cluster_match", - "silhouette_samples", "silhouette_score", "calinski_harabaz_score", - "consensus_score"] + "fowlkes_mallows_score", "entropy", "max_main_diagonal", + "class_cluster_match", "silhouette_samples", "silhouette_score", + "calinski_harabaz_score", "consensus_score"] From 625ced68a4b0d4247c5098c60747d1324b9733ec Mon Sep 17 00:00:00 2001 From: Date: Wed, 7 Feb 2018 23:54:28 +0000 Subject: [PATCH 06/39] more modifications to be compliant with coding guidelines --- sklearn/metrics/cluster/supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 2008fbbf2ac1f..fa70b97259d58 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -939,7 +939,7 @@ def class_cluster_match(y_true, y_pred): >>> y_pred = [0, 0, 2, 2, 0, 2] >>> y_pred_translated = class_cluster_match(y_true, y_pred) >>> y_pred_translated - ["class1", "class1", "class3", "class3", "class1", "class3"] + ['class1', 'class1', 'class3', 'class3', 'class1', 'class3'] >>> confusion_matrix(y_true, y_pred_translated) array([[2., 0., 1.], [1., 0., 0.], From 46fd79f59e12151197fe9eaf9f6d220eb93245c8 Mon Sep 17 00:00:00 2001 From: Date: Thu, 8 Feb 2018 00:17:49 +0000 Subject: [PATCH 07/39] fixing doc bug --- sklearn/metrics/cluster/supervised.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index fa70b97259d58..0448460a60c4a 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -896,8 +896,8 @@ def max_main_diagonal(A): >>> from sklearn.metrics.cluster import max_main_diagonal >>> import numpy as np >>> A = np.matrix([[2, 1, 0], - [1, 0, 0], - [0, 2, 0]]) + >>> [1, 0, 0], + >>> [0, 2, 0]]) >>> max_main_diagonal(A) array([[1., 0., 0.], [0., 0., 1.], From d1bdae1c08c564685743c1af2843ed67eef229af Mon Sep 17 00:00:00 2001 From: Date: Thu, 8 Feb 
2018 00:50:12 +0000 Subject: [PATCH 08/39] fixing doctest and adopting max_assignment_score name as proposed by jnothman --- sklearn/metrics/cluster/__init__.py | 4 ++-- sklearn/metrics/cluster/supervised.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py index 222203dbdfc16..3f3a0a767ea65 100644 --- a/sklearn/metrics/cluster/__init__.py +++ b/sklearn/metrics/cluster/__init__.py @@ -17,7 +17,7 @@ from .supervised import v_measure_score from .supervised import fowlkes_mallows_score from .supervised import entropy -from .supervised import max_main_diagonal +from .supervised import max_assignment_score from .supervised import class_cluster_match from .unsupervised import silhouette_samples from .unsupervised import silhouette_score @@ -28,6 +28,6 @@ "adjusted_rand_score", "completeness_score", "contingency_matrix", "expected_mutual_information", "homogeneity_completeness_v_measure", "homogeneity_score", "mutual_info_score", "v_measure_score", - "fowlkes_mallows_score", "entropy", "max_main_diagonal", + "fowlkes_mallows_score", "entropy", "max_assignment_score", "class_cluster_match", "silhouette_samples", "silhouette_score", "calinski_harabaz_score", "consensus_score"] diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 0448460a60c4a..fb0715ef30c04 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -875,7 +875,7 @@ def entropy(labels): return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum))) -def max_main_diagonal(A): +def max_assignment_score(A): """Sort matrix A columns to achieve greater main diagonal sum Sorting is done by maximization of the confusion matrix :math:`C` main diagonal sum :math:`\sum{i=0}^{K}C_{i, i}`. Notice the @@ -893,12 +893,12 @@ def max_main_diagonal(A): ---------- Examples -------- - >>> from sklearn.metrics.cluster import max_main_diagonal + >>> from sklearn.metrics.cluster import max_assignment_score >>> import numpy as np >>> A = np.matrix([[2, 1, 0], - >>> [1, 0, 0], - >>> [0, 2, 0]]) - >>> max_main_diagonal(A) + ... [1, 0, 0], + ... [0, 2, 0]]) + >>> max_assignment_score(A) array([[1., 0., 0.], [0., 0., 1.], [0., 1., 0.]]) @@ -963,7 +963,7 @@ def class_cluster_match(y_true, y_pred): for y_t, y_p in zip(num_classes, num_clusters): cm[y_t, y_p] += 1 - shuffle = max_main_diagonal(cm) + shuffle = max_assignment_score(cm) matching_clusters = [row.tolist().index(1) for row in shuffle] From 07be65c138dcbeb274eb02ae617f75c4a5c514f7 Mon Sep 17 00:00:00 2001 From: Date: Thu, 8 Feb 2018 01:09:58 +0000 Subject: [PATCH 09/39] fixing examle again --- sklearn/metrics/cluster/supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index fb0715ef30c04..a6db7a72865ca 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -895,7 +895,7 @@ def max_assignment_score(A): -------- >>> from sklearn.metrics.cluster import max_assignment_score >>> import numpy as np - >>> A = np.matrix([[2, 1, 0], + >>> A = np.asarray([[2, 1, 0], ... [1, 0, 0], ... 
[0, 2, 0]]) >>> max_assignment_score(A) From 2b01392a5dc16a7919297846cf28f311f38688ed Mon Sep 17 00:00:00 2001 From: Lucas Pugens Fernandes Date: Thu, 8 Feb 2018 10:34:05 -0200 Subject: [PATCH 10/39] Doc fixing to pass Travis verification --- sklearn/metrics/cluster/supervised.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index a6db7a72865ca..62cb51cb8042a 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -899,9 +899,9 @@ def max_assignment_score(A): ... [1, 0, 0], ... [0, 2, 0]]) >>> max_assignment_score(A) - array([[1., 0., 0.], - [0., 0., 1.], - [0., 1., 0.]]) + array([[ 1., 0., 0.], + [ 0., 0., 1.], + [ 0., 1., 0.]]) """ n, n = A.shape res = linprog(-A.ravel(), @@ -941,9 +941,9 @@ def class_cluster_match(y_true, y_pred): >>> y_pred_translated ['class1', 'class1', 'class3', 'class3', 'class1', 'class3'] >>> confusion_matrix(y_true, y_pred_translated) - array([[2., 0., 1.], - [1., 0., 0.], - [0., 0., 2.]]) + array([[ 2., 0., 1.], + [ 1., 0., 0.], + [ 0., 0., 2.]]) """ classes = list(unique_labels(y_true)) n_classes = len(classes) From b5116b7ce31fa36ff0a8d45116ab6217e16449c8 Mon Sep 17 00:00:00 2001 From: Lucas Pugens Fernandes Date: Thu, 8 Feb 2018 10:55:26 -0200 Subject: [PATCH 11/39] Yet another doc fix --- sklearn/metrics/cluster/supervised.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 62cb51cb8042a..3e492546ff6c9 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -941,9 +941,9 @@ def class_cluster_match(y_true, y_pred): >>> y_pred_translated ['class1', 'class1', 'class3', 'class3', 'class1', 'class3'] >>> confusion_matrix(y_true, y_pred_translated) - array([[ 2., 0., 1.], - [ 1., 0., 0.], - [ 0., 0., 2.]]) + array([[2, 0, 1], + [1, 0, 0], + [0, 0, 2]]) """ classes = list(unique_labels(y_true)) n_classes = len(classes) From 81804bfca9a3fc15e9c383cc9968e95c95883ea5 Mon Sep 17 00:00:00 2001 From: Lucas Pugens Date: Thu, 8 Feb 2018 13:17:53 +0000 Subject: [PATCH 12/39] fixing travis version of scipy --- build_tools/travis/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 9a5b65ce225bd..e0f989d12fdd8 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -71,7 +71,7 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then # and scipy virtualenv --system-site-packages testvenv source testvenv/bin/activate - pip install pytest pytest-cov cython==$CYTHON_VERSION + pip install pytest pytest-cov cython==$CYTHON_VERSION scipy elif [[ "$DISTRIB" == "scipy-dev-wheels" ]]; then # Set up our own virtualenv environment to avoid travis' numpy. 
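As an aside between commits (the sketch below is not part of the patch series): the step that the commits above converge on, max_assignment_score, relaxes the column-matching problem to a linear program over doubly stochastic matrices (unit row and column sums, nonnegative entries). An optimum of that LP is attained at a permutation matrix, which says which predicted column lines up with which true row. The following standalone sketch mirrors that formulation with scipy only; the function name relabelling_permutation and the toy matrix are illustrative, and the patch itself additionally transposes the result before using it.

import numpy as np
from scipy.optimize import linprog

def relabelling_permutation(A):
    # Maximize sum_ij A[i, j] * X[i, j] over doubly stochastic X;
    # an optimum of this LP is a 0/1 permutation matrix.
    n = A.shape[0]
    row_sums = np.kron(np.identity(n), np.ones((1, n)))  # sum_j X[i, j] = 1
    col_sums = np.kron(np.ones((1, n)), np.identity(n))  # sum_i X[i, j] = 1
    res = linprog(-A.ravel(),
                  A_eq=np.vstack([row_sums, col_sums]),
                  b_eq=np.ones(2 * n),
                  bounds=[(0, None)] * (n * n))
    assert res.success
    return res.x.reshape(n, n)

A = np.array([[2, 1, 0],
              [1, 0, 0],
              [0, 2, 0]])
print(relabelling_permutation(A))  # picks one entry per row and per column

Later commits in the series replace this LP with the Hungarian algorithm from sklearn.utils.linear_assignment_, which solves the same matching directly.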
From 4ba0cacb3d923ba32399811cceb148ff2a7ae4fc Mon Sep 17 00:00:00 2001 From: Lucas Pugens Date: Thu, 8 Feb 2018 16:43:45 +0000 Subject: [PATCH 13/39] allowing for any number of clusters and classes --- sklearn/metrics/cluster/supervised.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 3e492546ff6c9..0d4b999c396ef 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -952,13 +952,14 @@ def class_cluster_match(y_true, y_pred): n_clusters = len(clusters) num_clusters = [clusters.index(y) for y in y_pred] - if n_clusters > n_classes: - raise ValueError("Number of different clusters ("+str(n_clusters) + - ") should be smaller or equal to" + - " the number of different" + - " classes ("+str(n_classes)+")") - - cm = np.zeros((n_classes, n_classes)) + # if n_clusters > n_classes: + # raise ValueError("Number of different clusters ("+str(n_clusters) + + # ") should be smaller or equal to" + + # " the number of different" + + # " classes ("+str(n_classes)+")") + dims = max(n_classes, n_clusters) + classes += ['non_class'+str(i) for i in range(dims-n_classes)] + cm = np.zeros((dims, dims)) for y_t, y_p in zip(num_classes, num_clusters): cm[y_t, y_p] += 1 From 29b715485812dd54c6db9ea4ca667f8a413cef95 Mon Sep 17 00:00:00 2001 From: Lucas Pugens Date: Thu, 8 Feb 2018 16:46:50 +0000 Subject: [PATCH 14/39] allowing for any number of clusters and classes and undoing travis script modification --- build_tools/travis/install.sh | 2 +- sklearn/metrics/cluster/supervised.py | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index e0f989d12fdd8..9a5b65ce225bd 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -71,7 +71,7 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then # and scipy virtualenv --system-site-packages testvenv source testvenv/bin/activate - pip install pytest pytest-cov cython==$CYTHON_VERSION scipy + pip install pytest pytest-cov cython==$CYTHON_VERSION elif [[ "$DISTRIB" == "scipy-dev-wheels" ]]; then # Set up our own virtualenv environment to avoid travis' numpy. 
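As an aside before the rest of PATCH 14 (the sketch below is not taken from the patch): PATCH 13 removes the cluster-count restriction by padding the label sets with placeholder names so the contingency matrix becomes square, after which the usual matching assigns the best class to every cluster and leaves the surplus on placeholders. A minimal self-contained version of that idea follows, using scipy.optimize.linear_sum_assignment in place of the patch's own assignment step; the helper name and placeholder prefixes are made up for illustration.

import numpy as np
from scipy.optimize import linear_sum_assignment

def toy_map_clusters(y_true, y_pred):
    classes = sorted(set(y_true))
    clusters = sorted(set(y_pred))
    dims = max(len(classes), len(clusters))
    # Pad the shorter side so the contingency matrix is square,
    # mirroring the 'non_class' placeholders introduced in PATCH 13.
    classes += ['non_class%d' % i for i in range(dims - len(classes))]
    clusters += ['non_cluster%d' % i for i in range(dims - len(clusters))]
    cm = np.zeros((dims, dims))
    for t, p in zip(y_true, y_pred):
        cm[classes.index(t), clusters.index(p)] += 1
    rows, cols = linear_sum_assignment(-cm)  # maximize the matched counts
    cluster_to_class = {clusters[c]: classes[r] for r, c in zip(rows, cols)}
    return [cluster_to_class[p] for p in y_pred]

print(toy_map_clusters(['a', 'a', 'a', 'b', 'b', 'b'], [4, 0, 1, 1, 2, 2]))

Padding to a square matrix guarantees every cluster receives some label, so clusters without a plausible class end up on placeholders instead of raising an error.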
diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 0d4b999c396ef..17a480b2cef42 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -952,11 +952,6 @@ def class_cluster_match(y_true, y_pred): n_clusters = len(clusters) num_clusters = [clusters.index(y) for y in y_pred] - # if n_clusters > n_classes: - # raise ValueError("Number of different clusters ("+str(n_clusters) + - # ") should be smaller or equal to" + - # " the number of different" + - # " classes ("+str(n_classes)+")") dims = max(n_classes, n_clusters) classes += ['non_class'+str(i) for i in range(dims-n_classes)] cm = np.zeros((dims, dims)) From 25885382b391d444d88ab46e8df249a03510abfb Mon Sep 17 00:00:00 2001 From: Date: Sat, 10 Feb 2018 23:33:44 +0000 Subject: [PATCH 15/39] better implementation --- doc/modules/classes.rst | 1 + doc/modules/clustering.rst | 56 ++++++++++++ sklearn/metrics/cluster/__init__.py | 7 +- sklearn/metrics/cluster/supervised.py | 85 ++++++------------ .../metrics/cluster/tests/test_supervised.py | 90 ++++++++++++++++++- sklearn/utils/__init__.py | 4 +- 6 files changed, 178 insertions(+), 65 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 5977e2f01a9b3..9bff49f61dc96 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -857,6 +857,7 @@ details. metrics.calinski_harabaz_score metrics.completeness_score metrics.cluster.contingency_matrix + metrics.cluster.class_cluster_match metrics.fowlkes_mallows_score metrics.homogeneity_completeness_v_measure metrics.homogeneity_score diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 2a3d93e263004..661db822aea55 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1622,6 +1622,62 @@ contingency matrix where the order of rows and columns correspond to a list of classes. +Advantages +~~~~~~~~~~ + +- Allows to examine the spread of each true cluster across predicted + clusters and vice versa. + +- The contingency table calculated is typically utilized in the calculation + of a similarity statistic (like the others listed in this document) between + the two clusterings. + +Drawbacks +~~~~~~~~~ + +- Contingency matrix is easy to interpret for a small number of clusters, but + becomes very hard to interpret for a large number of clusters. + +- It doesn't give a single metric to use as an objective for clustering + optimisation. + + +.. topic:: References + + * `Wikipedia entry for contingency matrix + `_ + +.. _class_cluster_match: + +Class-cluster Match +------------------- + +Class-cluster match +(:func:`sklearn.metrics.cluster.class_cluster_matching`) provides a +friendly way for the user to calculate classical classification +metrics, such as :func:`sklearn.metrics.accuracy_score` and +:func:`sklearn.metrics.f1_score`. + +Here is an example:: + + >>> from sklearn.metrics.cluster import class_cluster_match + >>> x = ["a", "a", "a", "b", "b", "b"] + >>> y = [0, 0, 1, 1, 2, 2] + >>> contingency_matrix(x, y) + array([[2, 1, 0], + [0, 1, 2]]) + +The first row of output array indicates that there are three samples whose +true cluster is "a". Of them, two are in predicted cluster 0, one is in 1, +and none is in 2. And the second row indicates that there are three samples +whose true cluster is "b". Of them, none is in predicted cluster 0, one is in +1 and two are in 2. 
+ +A :ref:`confusion matrix ` for classification is a square +contingency matrix where the order of rows and columns correspond to a list +of classes. + + Advantages ~~~~~~~~~~ diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py index 3f3a0a767ea65..4325d3670a650 100644 --- a/sklearn/metrics/cluster/__init__.py +++ b/sklearn/metrics/cluster/__init__.py @@ -17,7 +17,6 @@ from .supervised import v_measure_score from .supervised import fowlkes_mallows_score from .supervised import entropy -from .supervised import max_assignment_score from .supervised import class_cluster_match from .unsupervised import silhouette_samples from .unsupervised import silhouette_score @@ -28,6 +27,6 @@ "adjusted_rand_score", "completeness_score", "contingency_matrix", "expected_mutual_information", "homogeneity_completeness_v_measure", "homogeneity_score", "mutual_info_score", "v_measure_score", - "fowlkes_mallows_score", "entropy", "max_assignment_score", - "class_cluster_match", "silhouette_samples", "silhouette_score", - "calinski_harabaz_score", "consensus_score"] + "fowlkes_mallows_score", "entropy", "class_cluster_match", + "silhouette_samples", "silhouette_score", + "calinski_harabaz_score", "consensus_score", "class_cluster_match"] diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 17a480b2cef42..7f3337142e30a 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -20,12 +20,12 @@ import numpy as np from scipy import sparse as sp -from scipy.optimize import linprog from .expected_mutual_info_fast import expected_mutual_information from ...utils.validation import check_array from ...utils.multiclass import unique_labels from ...utils.fixes import comb +from ...utils import linear_assignment def comb2(n): @@ -875,62 +875,30 @@ def entropy(labels): return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum))) -def max_assignment_score(A): - """Sort matrix A columns to achieve greater main diagonal sum - Sorting is done by maximization of the confusion matrix :math:`C` - main diagonal sum :math:`\sum{i=0}^{K}C_{i, i}`. Notice the - number of cluster has to be equal or smaller than the number - of true classes. - Parameters - ---------- - A : array, shape = [n,n] - Square numerical matrix - Returns - ------- - B : array, shape = [n,n] - Pivot matrix that sorts A for maximum main diagonal sum - References - ---------- - Examples - -------- - >>> from sklearn.metrics.cluster import max_assignment_score - >>> import numpy as np - >>> A = np.asarray([[2, 1, 0], - ... [1, 0, 0], - ... [0, 2, 0]]) - >>> max_assignment_score(A) - array([[ 1., 0., 0.], - [ 0., 0., 1.], - [ 0., 1., 0.]]) - """ - n, n = A.shape - res = linprog(-A.ravel(), - A_eq=np.r_[np.kron(np.identity(n), np.ones((1, n))), - np.kron(np.ones((1, n)), np.identity(n))], - b_eq=np.ones((2*n,)), bounds=n*n*[(0, None)]) - assert res.success - return res.x.reshape(n, n).T - - def class_cluster_match(y_true, y_pred): - """Sort prediction labels to maximize the confusion matrix main diagonal sum - Sort the prediction labels of a clustering output to enable calc - of external metrics (eg. accuracy, f1_score, ...). Sorting is done by + """Translate prediction labels to maximize the accuracy. + + Translate the prediction labels of a clustering output to enable calc + of external metrics (eg. accuracy, f1_score, ...). Translation is done by maximization of the confusion matrix :math:`C` main diagonal sum :math:`\sum{i=0}^{K}C_{i, i}`. 
Notice the number of cluster has to be equal or smaller than the number of true classes. + Parameters ---------- y_true : array, shape = [n_samples] Ground truth (correct) target values. y_pred : array, shape = [n_samples] Estimated targets as returned by a clustering algorithm. + Returns ------- - y_pred_sort : array, shape = [n_classes, n_classes] - Estimated targets sorted for maximum accuracy with y_true + trans : array, shape = [n_classes, n_classes] + Mapping of y_pred clusters, such that :math:`trans\subseteq y_true` + References ---------- + Examples -------- >>> from sklearn.metrics import confusion_matrix @@ -945,25 +913,28 @@ def class_cluster_match(y_true, y_pred): [1, 0, 0], [0, 0, 2]]) """ - classes = list(unique_labels(y_true)) + + classes = unique_labels(y_true).tolist() n_classes = len(classes) - num_classes = [classes.index(y) for y in y_true] - clusters = list(unique_labels(y_pred)) + clusters = unique_labels(y_pred).tolist() n_clusters = len(clusters) - num_clusters = [clusters.index(y) for y in y_pred] - dims = max(n_classes, n_clusters) - classes += ['non_class'+str(i) for i in range(dims-n_classes)] - cm = np.zeros((dims, dims)) + if n_clusters > n_classes: + classes += ['DEF_CLASS'+str(i) for i in range(n_clusters-n_classes)] + elif n_classes > n_clusters: + clusters += ['DEF_CLUSTER'+str(i) for i in range(n_classes-n_clusters)] - for y_t, y_p in zip(num_classes, num_clusters): - cm[y_t, y_p] += 1 + C = contingency_matrix(y_true, y_pred) + true_idx, pred_idx = linear_assignment(-C).T - shuffle = max_assignment_score(cm) + true_idx = true_idx.tolist() + pred_idx = pred_idx.tolist() - matching_clusters = [row.tolist().index(1) for row in shuffle] + true_idx = [classes[idx] for idx in true_idx] + true_idx = true_idx + list(set(classes) - set(true_idx)) + pred_idx = [clusters[idx] for idx in pred_idx] + pred_idx = pred_idx + list(set(clusters) - set(pred_idx)) - y_pred_sort = [matching_clusters[y] for y in num_clusters] - y_pred_sort = [classes[y] for y in y_pred_sort] + return_list = [true_idx[pred_idx.index(y)] for y in y_pred] - return y_pred_sort + return return_list diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index 8da03d1e17457..2946e1cde5f0e 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -12,10 +12,11 @@ from sklearn.metrics.cluster import mutual_info_score from sklearn.metrics.cluster import normalized_mutual_info_score from sklearn.metrics.cluster import v_measure_score +from sklearn.metrics.cluster import class_cluster_match from sklearn.utils import assert_all_finite from sklearn.utils.testing import ( - assert_equal, assert_almost_equal, assert_raise_message, + assert_equal, assert_almost_equal, assert_raise_message, ) from numpy.testing import assert_array_almost_equal @@ -175,8 +176,8 @@ def test_expected_mutual_info_overflow(): def test_int_overflow_mutual_info_score(): # Test overflow in mutual_info_classif - x = np.array([1] * (52632 + 2529) + [2] * (14660 + 793) + [3] * (3271 + - 204) + [4] * (814 + 39) + [5] * (316 + 20)) + x = np.array([1] * (52632 + 2529) + [2] * (14660 + 793) + [3] + * (3271 + 204) + [4] * (814 + 39) + [5] * (316 + 20)) y = np.array([0] * 52632 + [1] * 2529 + [0] * 14660 + [1] * 793 + [0] * 3271 + [1] * 204 + [0] * 814 + [1] * 39 + [0] * 316 + [1] * 20) @@ -274,3 +275,86 @@ def test_fowlkes_mallows_score_properties(): # symmetric and permutation(both together) score_both = 
fowlkes_mallows_score(labels_b, (labels_a + 2) % 3) assert_almost_equal(score_both, expected) + + +def test_class_cluster_match(): + # handcrafted example - same number of clusters and classes + y_true = ['a'] * 1 + ['b'] * 2 + ['c'] * 20 + ['d'] * 6 + ['e'] * \ + 13 + ['f'] * 2 + ['g'] * 3 + ['h'] * 3 + ['i'] * 2 + ['j'] * 1 + y_pred = [6] * 1 + [2] * 2 + [0] * 6 + [2] * 10 + [8] * 4 + [1] * 4 + [5] * 2 + [0] * 4 + \ + [3] * 5 + [6] * 2 + [9] * 2 + [7] * 2 + [0] * 2 + [8] * 1 + [4] * 3 + [3] * 2 + [8] * 1 + expected = [ + 'a', + 'c', + 'c', + 'g', + 'g', + 'g', + 'g', + 'g', + 'g', + 'c', + 'c', + 'c', + 'c', + 'c', + 'c', + 'c', + 'c', + 'c', + 'c', + 'j', + 'j', + 'j', + 'j', + 'd', + 'd', + 'd', + 'd', + 'i', + 'i', + 'g', + 'g', + 'g', + 'g', + 'e', + 'e', + 'e', + 'e', + 'e', + 'a', + 'a', + 'b', + 'b', + 'f', + 'f', + 'g', + 'g', + 'j', + 'h', + 'h', + 'h', + 'e', + 'e', + 'j'] + + y_pred_translated = class_cluster_match(y_true, y_pred) + assert_equal(y_pred_translated, expected) + + # handcrafted example - more clusters than classes + y_true = ['a', 'a', 'a', 'b', 'b', 'b'] + y_pred = [4, 0, 1, 1, 2, 2] + + expected = ['DEF_CLASS1', 'a', 'DEF_CLASS0', 'DEF_CLASS0', 'b', 'b'] + + y_pred_translated = class_cluster_match(y_true, y_pred) + assert_equal(y_pred_translated, expected) + + # handcrafted example - more clusters than classes + y_true = ['a', 'd', 'e', 'b', 'b', 'b'] + y_pred = [0, 0, 1, 1, 2, 2] + + expected = ['a', 'a', 'e', 'e', 'b', 'b'] + + y_pred_translated = class_cluster_match(y_true, y_pred) + assert_equal(y_pred_translated, expected) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index be0048b60b0e3..7b565f2f3bc70 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -17,6 +17,7 @@ from ..externals.joblib import cpu_count from ..exceptions import DataConversionWarning from .deprecation import deprecated +from .linear_assignment_ import linear_assignment __all__ = ["murmurhash3_32", "as_float_array", @@ -25,7 +26,8 @@ "compute_class_weight", "compute_sample_weight", "column_or_1d", "safe_indexing", "check_consistent_length", "check_X_y", 'indexable', - "check_symmetric", "indices_to_mask", "deprecated"] + "check_symmetric", "indices_to_mask", "deprecated", + "linear_assignment"] class Bunch(dict): From 6e436d885ff59b6598b07eb1b489f641cc7b955f Mon Sep 17 00:00:00 2001 From: Date: Sun, 11 Feb 2018 00:01:24 +0000 Subject: [PATCH 16/39] fixing test result --- sklearn/metrics/cluster/tests/test_supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index 2946e1cde5f0e..dc9e071131430 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -345,7 +345,7 @@ def test_class_cluster_match(): y_true = ['a', 'a', 'a', 'b', 'b', 'b'] y_pred = [4, 0, 1, 1, 2, 2] - expected = ['DEF_CLASS1', 'a', 'DEF_CLASS0', 'DEF_CLASS0', 'b', 'b'] + expected = ['DEF_CLASS0', 'a', 'DEF_CLASS1', 'DEF_CLASS1', 'b', 'b'] y_pred_translated = class_cluster_match(y_true, y_pred) assert_equal(y_pred_translated, expected) From c3d1ea5b6adb25261c91cf9adf1f1b25abc25db5 Mon Sep 17 00:00:00 2001 From: Date: Sun, 11 Feb 2018 00:44:13 +0000 Subject: [PATCH 17/39] fixing pep8 formatting error --- sklearn/metrics/cluster/tests/test_supervised.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/cluster/tests/test_supervised.py 
b/sklearn/metrics/cluster/tests/test_supervised.py index dc9e071131430..a4427ff38133f 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -281,8 +281,9 @@ def test_class_cluster_match(): # handcrafted example - same number of clusters and classes y_true = ['a'] * 1 + ['b'] * 2 + ['c'] * 20 + ['d'] * 6 + ['e'] * \ 13 + ['f'] * 2 + ['g'] * 3 + ['h'] * 3 + ['i'] * 2 + ['j'] * 1 - y_pred = [6] * 1 + [2] * 2 + [0] * 6 + [2] * 10 + [8] * 4 + [1] * 4 + [5] * 2 + [0] * 4 + \ - [3] * 5 + [6] * 2 + [9] * 2 + [7] * 2 + [0] * 2 + [8] * 1 + [4] * 3 + [3] * 2 + [8] * 1 + y_pred = [6] * 1 + [2] * 2 + [0] * 6 + [2] * 10 + [8] * 4 + [1] *\ + 4 + [5] * 2 + [0] * 4 + [3] * 5 + [6] * 2 + [9] * 2 + [7] *\ + 2 + [0] * 2 + [8] * 1 + [4] * 3 + [3] * 2 + [8] * 1 expected = [ 'a', 'c', From 10892dc7164d00eee5f3cd7e035ab37156c2e9ce Mon Sep 17 00:00:00 2001 From: Date: Sun, 11 Feb 2018 01:16:14 +0000 Subject: [PATCH 18/39] fixing test error --- sklearn/metrics/cluster/tests/test_supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index a4427ff38133f..fe2c4e846c361 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -346,7 +346,7 @@ def test_class_cluster_match(): y_true = ['a', 'a', 'a', 'b', 'b', 'b'] y_pred = [4, 0, 1, 1, 2, 2] - expected = ['DEF_CLASS0', 'a', 'DEF_CLASS1', 'DEF_CLASS1', 'b', 'b'] + expected = ['DEF_CLASS1', 'a', 'DEF_CLASS0', 'DEF_CLASS0', 'b', 'b'] y_pred_translated = class_cluster_match(y_true, y_pred) assert_equal(y_pred_translated, expected) From 8e12b2a6b0130e936f9e4e0dde079911a65b5eac Mon Sep 17 00:00:00 2001 From: Date: Sun, 11 Feb 2018 11:53:20 +0000 Subject: [PATCH 19/39] sorting result set for maintaining interoperability between python 2 and 3 --- sklearn/metrics/cluster/supervised.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 7f3337142e30a..48be12c11379c 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -931,9 +931,9 @@ def class_cluster_match(y_true, y_pred): pred_idx = pred_idx.tolist() true_idx = [classes[idx] for idx in true_idx] - true_idx = true_idx + list(set(classes) - set(true_idx)) + true_idx = true_idx + sorted(set(classes) - set(true_idx)) pred_idx = [clusters[idx] for idx in pred_idx] - pred_idx = pred_idx + list(set(clusters) - set(pred_idx)) + pred_idx = pred_idx + sorted(set(clusters) - set(pred_idx)) return_list = [true_idx[pred_idx.index(y)] for y in y_pred] From 63dd10845d5662eb4b541056c522b762c60a9465 Mon Sep 17 00:00:00 2001 From: Date: Sun, 11 Feb 2018 13:05:28 +0000 Subject: [PATCH 20/39] adding documentation --- doc/modules/clustering.rst | 81 ++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 33 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 661db822aea55..3b1e0fdfd97a1 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1660,45 +1660,60 @@ metrics, such as :func:`sklearn.metrics.accuracy_score` and Here is an example:: - >>> from sklearn.metrics.cluster import class_cluster_match - >>> x = ["a", "a", "a", "b", "b", "b"] - >>> y = [0, 0, 1, 1, 2, 2] - >>> contingency_matrix(x, y) - array([[2, 1, 0], - [0, 1, 2]]) - -The first row of output array indicates that 
there are three samples whose -true cluster is "a". Of them, two are in predicted cluster 0, one is in 1, -and none is in 2. And the second row indicates that there are three samples -whose true cluster is "b". Of them, none is in predicted cluster 0, one is in -1 and two are in 2. - -A :ref:`confusion matrix ` for classification is a square -contingency matrix where the order of rows and columns correspond to a list -of classes. - + >>> from sklearn.metrics.cluster import class_cluster_match, adjusted_rand_score + >>> from sklearn.metrics import confusion_matrix, accuracy_score + >>> y_true = ['a'] * 1 + ['b'] * 2 + ['c'] * 20 + ['d'] * 6 + ['e'] * \ + ... 13 + ['f'] * 2 + ['g'] * 3 + ['h'] * 3 + ['i'] * 2 + ['j'] * 1 + >>> y_pred = [6] * 1 + [2] * 2 + [0] * 6 + [2] * 10 + [8] * 4 + [1] *\ + ... 4 + [5] * 2 + [0] * 4 + [3] * 5 + [6] * 2 + [9] * 2 + [7] *\ + ... 2 + [0] * 2 + [8] * 1 + [4] * 3 + [3] * 2 + [8] * 1 + >>> y_pred = class_cluster_match(x, y) + >>> confusion_matrix(y_true, y_pred) + [[ 1 0 0 0 0 0 0 0 0 0] + [ 0 0 2 0 0 0 0 0 0 0] + [ 0 0 10 0 0 0 6 0 0 4] + [ 0 0 0 4 0 0 0 0 2 0] + [ 2 2 0 0 5 0 4 0 0 0] + [ 0 0 0 0 0 2 0 0 0 0] + [ 0 0 0 0 0 0 2 0 0 1] + [ 0 0 0 0 0 0 0 3 0 0] + [ 0 0 0 0 2 0 0 0 0 0] + [ 0 0 0 0 0 0 0 0 0 1]] + >>> accuracy_score(y_true, y_pred) # doctest: +ELLIPSIS + 0.52... + >>> adjusted_rand_score(y_true, y_pred) # doctest: +ELLIPSIS + 0.29... + + Notice the confusion matrix above has its main diagonal maximized, meaning + the maximum possible value of accuracy score is obtained by such match of + true classes and clusters. + + This conversion of clustering labels is also compatible with default + clustering metrics, since the change in clusters labels does not + affect results of such metrics, such as the ARI above. + + Another example:: + + >>> y_true = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> y_pred = [3, 0, 1, 1, 2, 2] + >>> class_cluster_match(y_true, y_pred) + ['DEF_CLASS1', 'a', 'DEF_CLASS0', 'DEF_CLASS0', 'b', 'b'] + + The above example shows what happens with your clustering method identifies + more clusters than true classes. *Such results must be treated carefully*, + since not all metrics derived from such mapping are meaningful. Advantages ~~~~~~~~~~ -- Allows to examine the spread of each true cluster across predicted - clusters and vice versa. +- Enables calculation of classical classification metrics, such as + accuracy and f1_score. -- The contingency table calculated is typically utilized in the calculation - of a similarity statistic (like the others listed in this document) between - the two clusterings. +- Allows for a meaningful and easy-to-read clustering output when classes + are known. Drawbacks ~~~~~~~~~ -- Contingency matrix is easy to interpret for a small number of clusters, but - becomes very hard to interpret for a large number of clusters. - -- It doesn't give a single metric to use as an objective for clustering - optimisation. - - -.. topic:: References - - * `Wikipedia entry for contingency matrix - `_ +- One should use this tool carefully, since its metrics are not always + meaningful for every clustering task. 
\ No newline at end of file From c50284fd9fe5ea4628cf7f5b4a7bef3835d97772 Mon Sep 17 00:00:00 2001 From: Date: Sun, 11 Feb 2018 13:29:24 +0000 Subject: [PATCH 21/39] fixing doc --- doc/modules/clustering.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 3b1e0fdfd97a1..6fb15fea52910 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1667,7 +1667,7 @@ Here is an example:: >>> y_pred = [6] * 1 + [2] * 2 + [0] * 6 + [2] * 10 + [8] * 4 + [1] *\ ... 4 + [5] * 2 + [0] * 4 + [3] * 5 + [6] * 2 + [9] * 2 + [7] *\ ... 2 + [0] * 2 + [8] * 1 + [4] * 3 + [3] * 2 + [8] * 1 - >>> y_pred = class_cluster_match(x, y) + >>> y_pred = class_cluster_match(y_true, y_pred) >>> confusion_matrix(y_true, y_pred) [[ 1 0 0 0 0 0 0 0 0 0] [ 0 0 2 0 0 0 0 0 0 0] From 5c8c3f3d4b5da0295ab0c226ef81f7bba4ccd70e Mon Sep 17 00:00:00 2001 From: Date: Sun, 11 Feb 2018 13:54:31 +0000 Subject: [PATCH 22/39] fixing doc --- doc/modules/clustering.rst | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 6fb15fea52910..17005f438ed1e 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1669,16 +1669,16 @@ Here is an example:: ... 2 + [0] * 2 + [8] * 1 + [4] * 3 + [3] * 2 + [8] * 1 >>> y_pred = class_cluster_match(y_true, y_pred) >>> confusion_matrix(y_true, y_pred) - [[ 1 0 0 0 0 0 0 0 0 0] - [ 0 0 2 0 0 0 0 0 0 0] - [ 0 0 10 0 0 0 6 0 0 4] - [ 0 0 0 4 0 0 0 0 2 0] - [ 2 2 0 0 5 0 4 0 0 0] - [ 0 0 0 0 0 2 0 0 0 0] - [ 0 0 0 0 0 0 2 0 0 1] - [ 0 0 0 0 0 0 0 3 0 0] - [ 0 0 0 0 2 0 0 0 0 0] - [ 0 0 0 0 0 0 0 0 0 1]] + array([[ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [ 0, 0, 2, 0, 0, 0, 0, 0, 0, 0], + [ 0, 0, 10, 0, 0, 0, 6, 0, 0, 4], + [ 0, 0, 0, 4, 0, 0, 0, 0, 2, 0], + [ 2, 2, 0, 0, 5, 0, 4, 0, 0, 0], + [ 0, 0, 0, 0, 0, 2, 0, 0, 0, 0], + [ 0, 0, 0, 0, 0, 0, 2, 0, 0, 1], + [ 0, 0, 0, 0, 0, 0, 0, 3, 0, 0], + [ 0, 0, 0, 0, 2, 0, 0, 0, 0, 0], + [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]) >>> accuracy_score(y_true, y_pred) # doctest: +ELLIPSIS 0.52... >>> adjusted_rand_score(y_true, y_pred) # doctest: +ELLIPSIS From fefe91c64d57966803956548baf522192a658ad5 Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 12:52:06 -0300 Subject: [PATCH 23/39] fixing nomenclature --- doc/modules/classes.rst | 2 +- doc/modules/clustering.rst | 6 +++--- sklearn/metrics/cluster/supervised.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 9bff49f61dc96..890bcd75db22d 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -857,7 +857,7 @@ details. 
metrics.calinski_harabaz_score metrics.completeness_score metrics.cluster.contingency_matrix - metrics.cluster.class_cluster_match + metrics.cluster.map_cluster_labels metrics.fowlkes_mallows_score metrics.homogeneity_completeness_v_measure metrics.homogeneity_score diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 17005f438ed1e..6f7799852a585 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1660,14 +1660,14 @@ metrics, such as :func:`sklearn.metrics.accuracy_score` and Here is an example:: - >>> from sklearn.metrics.cluster import class_cluster_match, adjusted_rand_score + >>> from sklearn.metrics.cluster import map_cluster_labels, adjusted_rand_score >>> from sklearn.metrics import confusion_matrix, accuracy_score >>> y_true = ['a'] * 1 + ['b'] * 2 + ['c'] * 20 + ['d'] * 6 + ['e'] * \ ... 13 + ['f'] * 2 + ['g'] * 3 + ['h'] * 3 + ['i'] * 2 + ['j'] * 1 >>> y_pred = [6] * 1 + [2] * 2 + [0] * 6 + [2] * 10 + [8] * 4 + [1] *\ ... 4 + [5] * 2 + [0] * 4 + [3] * 5 + [6] * 2 + [9] * 2 + [7] *\ ... 2 + [0] * 2 + [8] * 1 + [4] * 3 + [3] * 2 + [8] * 1 - >>> y_pred = class_cluster_match(y_true, y_pred) + >>> y_pred = map_cluster_labels(y_true, y_pred) >>> confusion_matrix(y_true, y_pred) array([[ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [ 0, 0, 2, 0, 0, 0, 0, 0, 0, 0], @@ -1696,7 +1696,7 @@ Here is an example:: >>> y_true = ['a', 'a', 'a', 'b', 'b', 'b'] >>> y_pred = [3, 0, 1, 1, 2, 2] - >>> class_cluster_match(y_true, y_pred) + >>> map_cluster_labels(y_true, y_pred) ['DEF_CLASS1', 'a', 'DEF_CLASS0', 'DEF_CLASS0', 'b', 'b'] The above example shows what happens with your clustering method identifies diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 48be12c11379c..fb0b495996376 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -902,10 +902,10 @@ def class_cluster_match(y_true, y_pred): Examples -------- >>> from sklearn.metrics import confusion_matrix - >>> from sklearn.metrics.cluster import class_cluster_match + >>> from sklearn.metrics.cluster import map_cluster_labels >>> y_true = ["class1", "class2", "class3", "class1", "class1", "class3"] >>> y_pred = [0, 0, 2, 2, 0, 2] - >>> y_pred_translated = class_cluster_match(y_true, y_pred) + >>> y_pred_translated = map_cluster_labels(y_true, y_pred) >>> y_pred_translated ['class1', 'class1', 'class3', 'class3', 'class1', 'class3'] >>> confusion_matrix(y_true, y_pred_translated) From bd31fb9cdaba97da61d983a658ea6be50f908931 Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 13:08:07 -0300 Subject: [PATCH 24/39] fixing nomenclature --- doc/modules/clustering.rst | 4 ++-- examples/cluster/plot_affinity_propagation.py | 2 +- sklearn/metrics/cluster/__init__.py | 6 +++--- sklearn/metrics/cluster/supervised.py | 2 +- sklearn/metrics/cluster/tests/test_supervised.py | 10 +++++----- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 6f7799852a585..8c3eb6d231c44 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1647,13 +1647,13 @@ Drawbacks * `Wikipedia entry for contingency matrix `_ -.. _class_cluster_match: +.. 
_map_cluster_labels: Class-cluster Match ------------------- Class-cluster match -(:func:`sklearn.metrics.cluster.class_cluster_matching`) provides a +(:func:`sklearn.metrics.cluster.map_cluster_labels`) provides a friendly way for the user to calculate classical classification metrics, such as :func:`sklearn.metrics.accuracy_score` and :func:`sklearn.metrics.f1_score`. diff --git a/examples/cluster/plot_affinity_propagation.py b/examples/cluster/plot_affinity_propagation.py index 4ab71d90e018b..1f318875a7198 100644 --- a/examples/cluster/plot_affinity_propagation.py +++ b/examples/cluster/plot_affinity_propagation.py @@ -27,7 +27,7 @@ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) -translated_labels = metrics.cluster.class_cluster_match(labels_true, labels) +translated_labels = metrics.cluster.map_cluster_labels(labels_true, labels) print('Estimated number of clusters: %d' % n_clusters_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py index 4325d3670a650..32c58c3312ac7 100644 --- a/sklearn/metrics/cluster/__init__.py +++ b/sklearn/metrics/cluster/__init__.py @@ -17,7 +17,7 @@ from .supervised import v_measure_score from .supervised import fowlkes_mallows_score from .supervised import entropy -from .supervised import class_cluster_match +from .supervised import map_cluster_labels from .unsupervised import silhouette_samples from .unsupervised import silhouette_score from .unsupervised import calinski_harabaz_score @@ -27,6 +27,6 @@ "adjusted_rand_score", "completeness_score", "contingency_matrix", "expected_mutual_information", "homogeneity_completeness_v_measure", "homogeneity_score", "mutual_info_score", "v_measure_score", - "fowlkes_mallows_score", "entropy", "class_cluster_match", + "fowlkes_mallows_score", "entropy", "map_cluster_labels", "silhouette_samples", "silhouette_score", - "calinski_harabaz_score", "consensus_score", "class_cluster_match"] + "calinski_harabaz_score", "consensus_score", "map_cluster_labels"] diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index ebdc669f52ff5..179e15039e3b0 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -875,7 +875,7 @@ def entropy(labels): return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum))) -def class_cluster_match(y_true, y_pred): +def map_cluster_labels(y_true, y_pred): """Translate prediction labels to maximize the accuracy. 
Translate the prediction labels of a clustering output to enable calc diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index fe2c4e846c361..c7ca5595eeff5 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -12,7 +12,7 @@ from sklearn.metrics.cluster import mutual_info_score from sklearn.metrics.cluster import normalized_mutual_info_score from sklearn.metrics.cluster import v_measure_score -from sklearn.metrics.cluster import class_cluster_match +from sklearn.metrics.cluster import map_cluster_labels from sklearn.utils import assert_all_finite from sklearn.utils.testing import ( @@ -277,7 +277,7 @@ def test_fowlkes_mallows_score_properties(): assert_almost_equal(score_both, expected) -def test_class_cluster_match(): +def test_map_cluster_labels(): # handcrafted example - same number of clusters and classes y_true = ['a'] * 1 + ['b'] * 2 + ['c'] * 20 + ['d'] * 6 + ['e'] * \ 13 + ['f'] * 2 + ['g'] * 3 + ['h'] * 3 + ['i'] * 2 + ['j'] * 1 @@ -339,7 +339,7 @@ def test_class_cluster_match(): 'e', 'j'] - y_pred_translated = class_cluster_match(y_true, y_pred) + y_pred_translated = map_cluster_labels(y_true, y_pred) assert_equal(y_pred_translated, expected) # handcrafted example - more clusters than classes @@ -348,7 +348,7 @@ def test_class_cluster_match(): expected = ['DEF_CLASS1', 'a', 'DEF_CLASS0', 'DEF_CLASS0', 'b', 'b'] - y_pred_translated = class_cluster_match(y_true, y_pred) + y_pred_translated = map_cluster_labels(y_true, y_pred) assert_equal(y_pred_translated, expected) # handcrafted example - more clusters than classes @@ -357,5 +357,5 @@ def test_class_cluster_match(): expected = ['a', 'a', 'e', 'e', 'b', 'b'] - y_pred_translated = class_cluster_match(y_true, y_pred) + y_pred_translated = map_cluster_labels(y_true, y_pred) assert_equal(y_pred_translated, expected) From 1539a0be0203f8b2011d9e4a64ed28603606694b Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 13:21:02 -0300 Subject: [PATCH 25/39] fixing commit bug --- sklearn/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index cfecfdd82da4e..a89dbdb808e21 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -17,7 +17,7 @@ from .class_weight import compute_class_weight, compute_sample_weight from ..externals.joblib import cpu_count from ..exceptions import DataConversionWarning -from .deprecation import deprecated<<<<<<< clustering_match +from .deprecation import deprecated from .linear_assignment_ import linear_assignment from .. import get_config From 9d3916397efa59b865ef3c7764f1f5250446dec2 Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 14:59:29 -0300 Subject: [PATCH 26/39] fixing doc title --- doc/modules/clustering.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 24abef41a84c4..5aa1de7a7c775 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1736,7 +1736,7 @@ Drawbacks .. 
_map_cluster_labels: -Class-cluster Match +Map cluster labels ------------------- Class-cluster match From 1e08a8b2a5872ced94e870125b57b327bbc2c580 Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 15:34:19 -0300 Subject: [PATCH 27/39] avoinding messing with imports --- sklearn/metrics/cluster/supervised.py | 2 +- sklearn/utils/__init__.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 507cab646afb1..b88dc5ec3d954 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -25,7 +25,7 @@ from ...utils.validation import check_array from ...utils.multiclass import unique_labels from ...utils.fixes import comb -from ...utils import linear_assignment +from ...utils.linear_assignment_ import linear_assignment def comb2(n): diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index a89dbdb808e21..bb1f383505fe9 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -18,7 +18,6 @@ from ..externals.joblib import cpu_count from ..exceptions import DataConversionWarning from .deprecation import deprecated -from .linear_assignment_ import linear_assignment from .. import get_config __all__ = ["murmurhash3_32", "as_float_array", @@ -27,8 +26,7 @@ "compute_class_weight", "compute_sample_weight", "column_or_1d", "safe_indexing", "check_consistent_length", "check_X_y", 'indexable', - "check_symmetric", "indices_to_mask", "deprecated", - "linear_assignment"] + "check_symmetric", "indices_to_mask", "deprecated"] class Bunch(dict): From 51a68cb89d73f6a9e9818504d11200181c876e91 Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 15:41:45 -0300 Subject: [PATCH 28/39] changing default label nomenclature --- doc/modules/clustering.rst | 2 +- sklearn/metrics/cluster/supervised.py | 2 +- sklearn/metrics/cluster/tests/test_supervised.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 5aa1de7a7c775..064ec997a60ef 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1784,7 +1784,7 @@ Here is an example:: >>> y_true = ['a', 'a', 'a', 'b', 'b', 'b'] >>> y_pred = [3, 0, 1, 1, 2, 2] >>> map_cluster_labels(y_true, y_pred) - ['DEF_CLASS1', 'a', 'DEF_CLASS0', 'DEF_CLASS0', 'b', 'b'] + ['DEFAULT_LABEL_1', 'a', 'DEFAULT_LABEL_0', 'DEFAULT_LABEL_0', 'b', 'b'] The above example shows what happens with your clustering method identifies more clusters than true classes. 
*Such results must be treated carefully*, diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index b88dc5ec3d954..6cc9643e75f13 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -921,7 +921,7 @@ def map_cluster_labels(y_true, y_pred): n_clusters = len(clusters) if n_clusters > n_classes: - classes += ['DEF_CLASS'+str(i) for i in range(n_clusters-n_classes)] + classes += ['DEFAULT_LABEL_'+str(i) for i in range(n_clusters-n_classes)] elif n_classes > n_clusters: clusters += ['DEF_CLUSTER'+str(i) for i in range(n_classes-n_clusters)] diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index a5f8d63f3edc6..a6afbec9edd6e 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -347,7 +347,7 @@ def test_map_cluster_labels(): y_true = ['a', 'a', 'a', 'b', 'b', 'b'] y_pred = [4, 0, 1, 1, 2, 2] - expected = ['DEF_CLASS1', 'a', 'DEF_CLASS0', 'DEF_CLASS0', 'b', 'b'] + expected = ['DEFAULT_LABEL_1', 'a', 'DEFAULT_LABEL_0', 'DEFAULT_LABEL_0', 'b', 'b'] y_pred_translated = map_cluster_labels(y_true, y_pred) assert_equal(y_pred_translated, expected) From 01e92533956c05d363f9712ad87710c564058c5f Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 18:25:39 -0300 Subject: [PATCH 29/39] adding negative indices to the test --- sklearn/metrics/cluster/tests/test_supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index a6afbec9edd6e..f9dbdc4a8c1d7 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -354,7 +354,7 @@ def test_map_cluster_labels(): # handcrafted example - more clusters than classes y_true = ['a', 'd', 'e', 'b', 'b', 'b'] - y_pred = [0, 0, 1, 1, 2, 2] + y_pred = [0, 0, -1, -1, 2, 2] expected = ['a', 'a', 'e', 'e', 'b', 'b'] From e11cd9164ed268bade9de8fb9eb433f1f3c5b62b Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 18:32:32 -0300 Subject: [PATCH 30/39] nomenclature fix --- sklearn/metrics/cluster/supervised.py | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 6cc9643e75f13..082332fd414f6 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -876,26 +876,26 @@ def entropy(labels): return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum))) -def map_cluster_labels(y_true, y_pred): +def map_cluster_labels(labels_true, labels_pred): """Translate prediction labels to maximize the accuracy. Translate the prediction labels of a clustering output to enable calc of external metrics (eg. accuracy, f1_score, ...). Translation is done by maximization of the confusion matrix :math:`C` main diagonal sum :math:`\sum{i=0}^{K}C_{i, i}`. Notice the number of cluster has to be equal - or smaller than the number of true classes. + or smaller than the number of true classes. Parameters ---------- - y_true : array, shape = [n_samples] + labels_true : array, shape = [n_samples] Ground truth (correct) target values. - y_pred : array, shape = [n_samples] - Estimated targets as returned by a clustering algorithm. 
+ labels_pred : array, shape = [n_samples] + Estimated clusters as returned by a clustering algorithm. Returns ------- trans : array, shape = [n_classes, n_classes] - Mapping of y_pred clusters, such that :math:`trans\subseteq y_true` + Mapping of labels_pred clusters, such that :math:`trans\subseteq labels_true` References ---------- @@ -904,20 +904,20 @@ def map_cluster_labels(y_true, y_pred): -------- >>> from sklearn.metrics import confusion_matrix >>> from sklearn.metrics.cluster import map_cluster_labels - >>> y_true = ["class1", "class2", "class3", "class1", "class1", "class3"] - >>> y_pred = [0, 0, 2, 2, 0, 2] - >>> y_pred_translated = map_cluster_labels(y_true, y_pred) + >>> labels_true = ["class1", "class2", "class3", "class1", "class1", "class3"] + >>> labels_pred = [0, 0, 2, 2, 0, 2] + >>> y_pred_translated = map_cluster_labels(labels_true, labels_pred) >>> y_pred_translated ['class1', 'class1', 'class3', 'class3', 'class1', 'class3'] - >>> confusion_matrix(y_true, y_pred_translated) + >>> confusion_matrix(labels_true, y_pred_translated) array([[2, 0, 1], [1, 0, 0], [0, 0, 2]]) """ - classes = unique_labels(y_true).tolist() + classes = unique_labels(labels_true).tolist() n_classes = len(classes) - clusters = unique_labels(y_pred).tolist() + clusters = unique_labels(labels_pred).tolist() n_clusters = len(clusters) if n_clusters > n_classes: @@ -925,7 +925,7 @@ def map_cluster_labels(y_true, y_pred): elif n_classes > n_clusters: clusters += ['DEF_CLUSTER'+str(i) for i in range(n_classes-n_clusters)] - C = contingency_matrix(y_true, y_pred) + C = contingency_matrix(labels_true, labels_pred) true_idx, pred_idx = linear_assignment(-C).T true_idx = true_idx.tolist() @@ -936,6 +936,6 @@ def map_cluster_labels(y_true, y_pred): pred_idx = [clusters[idx] for idx in pred_idx] pred_idx = pred_idx + sorted(set(clusters) - set(pred_idx)) - return_list = [true_idx[pred_idx.index(y)] for y in y_pred] + return_list = [true_idx[pred_idx.index(y)] for y in labels_pred] return return_list From 320858dbbabf1f22e3eb9c79952cfbcaf31ee2b9 Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 18:33:54 -0300 Subject: [PATCH 31/39] comment fix --- sklearn/metrics/cluster/tests/test_supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index f9dbdc4a8c1d7..ba9b1d9325302 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -352,7 +352,7 @@ def test_map_cluster_labels(): y_pred_translated = map_cluster_labels(y_true, y_pred) assert_equal(y_pred_translated, expected) - # handcrafted example - more clusters than classes + # handcrafted example - more classes than clusters y_true = ['a', 'd', 'e', 'b', 'b', 'b'] y_pred = [0, 0, -1, -1, 2, 2] From 0a65cc2fa9ccf03a2b3ba16f2d5ba12e0b4990c1 Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 18:48:28 -0300 Subject: [PATCH 32/39] doc fix --- doc/modules/clustering.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 064ec997a60ef..db36537857519 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1739,7 +1739,7 @@ Drawbacks Map cluster labels ------------------- -Class-cluster match +Map cluster labels (:func:`sklearn.metrics.cluster.map_cluster_labels`) provides a friendly way for the user to calculate classical classification 
metrics, such as :func:`sklearn.metrics.accuracy_score` and From 4847df3d6ca37d3480bb82027ca0352e7bf35a1f Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 18:52:24 -0300 Subject: [PATCH 33/39] comment fix --- sklearn/metrics/cluster/supervised.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 082332fd414f6..1d606f8c53271 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -879,11 +879,10 @@ def entropy(labels): def map_cluster_labels(labels_true, labels_pred): """Translate prediction labels to maximize the accuracy. - Translate the prediction labels of a clustering output to enable calc - of external metrics (eg. accuracy, f1_score, ...). Translation is done by - maximization of the confusion matrix :math:`C` main diagonal sum - :math:`\sum{i=0}^{K}C_{i, i}`. Notice the number of cluster has to be equal - or smaller than the number of true classes. + Translate the prediction labels of a clustering output to those in the + ground truth to enable calc of external metrics (eg. accuracy, f1_score, ...). + Translation is done by maximization of the confusion matrix :math:`C` main + diagonal sum :math:`\sum{i=0}^{K}C_{i, i}`. Parameters ---------- From 42a7a1d778493593f8ddd4488eea5a5606873a8e Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 18:58:56 -0300 Subject: [PATCH 34/39] simplifying test --- .../metrics/cluster/tests/test_supervised.py | 63 ++----------------- 1 file changed, 4 insertions(+), 59 deletions(-) diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index ba9b1d9325302..a8fec4f7f4743 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -280,65 +280,10 @@ def test_fowlkes_mallows_score_properties(): def test_map_cluster_labels(): # handcrafted example - same number of clusters and classes - y_true = ['a'] * 1 + ['b'] * 2 + ['c'] * 20 + ['d'] * 6 + ['e'] * \ - 13 + ['f'] * 2 + ['g'] * 3 + ['h'] * 3 + ['i'] * 2 + ['j'] * 1 - y_pred = [6] * 1 + [2] * 2 + [0] * 6 + [2] * 10 + [8] * 4 + [1] *\ - 4 + [5] * 2 + [0] * 4 + [3] * 5 + [6] * 2 + [9] * 2 + [7] *\ - 2 + [0] * 2 + [8] * 1 + [4] * 3 + [3] * 2 + [8] * 1 - expected = [ - 'a', - 'c', - 'c', - 'g', - 'g', - 'g', - 'g', - 'g', - 'g', - 'c', - 'c', - 'c', - 'c', - 'c', - 'c', - 'c', - 'c', - 'c', - 'c', - 'j', - 'j', - 'j', - 'j', - 'd', - 'd', - 'd', - 'd', - 'i', - 'i', - 'g', - 'g', - 'g', - 'g', - 'e', - 'e', - 'e', - 'e', - 'e', - 'a', - 'a', - 'b', - 'b', - 'f', - 'f', - 'g', - 'g', - 'j', - 'h', - 'h', - 'h', - 'e', - 'e', - 'j'] + y_true = ['a', 'b', 'b', 'c', 'c', 'a'] + y_pred = [1, 0, 0, 1, 2, 1] + + expected = ['a', 'b', 'b', 'a', 'c', 'a'] y_pred_translated = map_cluster_labels(y_true, y_pred) assert_equal(y_pred_translated, expected) From cdbe4a387016292f505309fa8e38b6ecf216186c Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 19:03:32 -0300 Subject: [PATCH 35/39] fixing name style on labels --- sklearn/metrics/cluster/supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 1d606f8c53271..72ec559d7c62a 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -922,7 +922,7 @@ def map_cluster_labels(labels_true, 
labels_pred): if n_clusters > n_classes: classes += ['DEFAULT_LABEL_'+str(i) for i in range(n_clusters-n_classes)] elif n_classes > n_clusters: - clusters += ['DEF_CLUSTER'+str(i) for i in range(n_classes-n_clusters)] + clusters += ['DEF_CLUSTER_'+str(i) for i in range(n_classes-n_clusters)] C = contingency_matrix(labels_true, labels_pred) true_idx, pred_idx = linear_assignment(-C).T From 1474d53accc0cc8cd7ac415fe4e964b5f6119b09 Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 19:04:55 -0300 Subject: [PATCH 36/39] fixing name style on labels --- sklearn/metrics/cluster/supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 72ec559d7c62a..673582128a78c 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -922,7 +922,7 @@ def map_cluster_labels(labels_true, labels_pred): if n_clusters > n_classes: classes += ['DEFAULT_LABEL_'+str(i) for i in range(n_clusters-n_classes)] elif n_classes > n_clusters: - clusters += ['DEF_CLUSTER_'+str(i) for i in range(n_classes-n_clusters)] + clusters += ['DEFAULT_CLUSTER_'+str(i) for i in range(n_classes-n_clusters)] C = contingency_matrix(labels_true, labels_pred) true_idx, pred_idx = linear_assignment(-C).T From f582884d3e364ee2464552e0493738de47380d2e Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 20:08:20 -0300 Subject: [PATCH 37/39] fixing line length --- sklearn/metrics/cluster/supervised.py | 9 ++++++--- sklearn/metrics/cluster/tests/test_supervised.py | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 673582128a78c..d8722479705fc 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -903,7 +903,8 @@ def map_cluster_labels(labels_true, labels_pred): -------- >>> from sklearn.metrics import confusion_matrix >>> from sklearn.metrics.cluster import map_cluster_labels - >>> labels_true = ["class1", "class2", "class3", "class1", "class1", "class3"] + >>> labels_true = ["class1", "class2", "class3", "class1", "class1", + >>> "class3"] >>> labels_pred = [0, 0, 2, 2, 0, 2] >>> y_pred_translated = map_cluster_labels(labels_true, labels_pred) >>> y_pred_translated @@ -920,9 +921,11 @@ def map_cluster_labels(labels_true, labels_pred): n_clusters = len(clusters) if n_clusters > n_classes: - classes += ['DEFAULT_LABEL_'+str(i) for i in range(n_clusters-n_classes)] + classes += ['DEFAULT_LABEL_'+str(i) for i in + range(n_clusters-n_classes)] elif n_classes > n_clusters: - clusters += ['DEFAULT_CLUSTER_'+str(i) for i in range(n_classes-n_clusters)] + clusters += ['DEFAULT_CLUSTER_'+str(i) for i in + range(n_classes-n_clusters)] C = contingency_matrix(labels_true, labels_pred) true_idx, pred_idx = linear_assignment(-C).T diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index a8fec4f7f4743..44b56fd051f3d 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -292,7 +292,8 @@ def test_map_cluster_labels(): y_true = ['a', 'a', 'a', 'b', 'b', 'b'] y_pred = [4, 0, 1, 1, 2, 2] - expected = ['DEFAULT_LABEL_1', 'a', 'DEFAULT_LABEL_0', 'DEFAULT_LABEL_0', 'b', 'b'] + expected = ['DEFAULT_LABEL_1', 'a', 'DEFAULT_LABEL_0', 'DEFAULT_LABEL_0', + 'b', 'b'] y_pred_translated = map_cluster_labels(y_true, y_pred) 
assert_equal(y_pred_translated, expected) From 7227ab56fb9a2384b967920436b985f5c6ecdf5f Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 21:01:11 -0300 Subject: [PATCH 38/39] fixing line length --- sklearn/metrics/cluster/supervised.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index d8722479705fc..c0b0892fb5527 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -880,9 +880,9 @@ def map_cluster_labels(labels_true, labels_pred): """Translate prediction labels to maximize the accuracy. Translate the prediction labels of a clustering output to those in the - ground truth to enable calc of external metrics (eg. accuracy, f1_score, ...). - Translation is done by maximization of the confusion matrix :math:`C` main - diagonal sum :math:`\sum{i=0}^{K}C_{i, i}`. + ground truth to enable calc of external metrics (eg. accuracy, f1_score, + ...). Translation is done by maximization of the confusion matrix :math:`C` + main diagonal sum :math:`\sum{i=0}^{K}C_{i, i}`. Parameters ---------- @@ -894,7 +894,8 @@ def map_cluster_labels(labels_true, labels_pred): Returns ------- trans : array, shape = [n_classes, n_classes] - Mapping of labels_pred clusters, such that :math:`trans\subseteq labels_true` + Mapping of labels_pred clusters, such that :math:`trans\subseteq + labels_true` References ---------- From e1686d650339f8b1c16e0961039f56076745b50b Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 21:48:01 -0300 Subject: [PATCH 39/39] fixing comment code --- sklearn/metrics/cluster/supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index c0b0892fb5527..987438ac93a6b 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -905,7 +905,7 @@ def map_cluster_labels(labels_true, labels_pred): >>> from sklearn.metrics import confusion_matrix >>> from sklearn.metrics.cluster import map_cluster_labels >>> labels_true = ["class1", "class2", "class3", "class1", "class1", - >>> "class3"] + ... "class3"] >>> labels_pred = [0, 0, 2, 2, 0, 2] >>> y_pred_translated = map_cluster_labels(labels_true, labels_pred) >>> y_pred_translated
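
A short usage sketch of the helper added by this series: ``map_cluster_labels`` returns a list of length n_samples in which every cluster id is replaced by the ground-truth label it was matched to, so the result can be fed straight into ordinary classification metrics. The snippet below is only a sketch and assumes this branch is installed (so the import works); the dataset, estimator and parameter values are illustrative, and the exact scores depend on the clustering solution found.

    from sklearn.cluster import KMeans
    from sklearn.datasets import load_iris
    from sklearn.metrics import accuracy_score, confusion_matrix
    from sklearn.metrics.cluster import map_cluster_labels

    X, y_true = load_iris(return_X_y=True)

    # Cluster ids carry no class meaning on their own.
    y_clusters = KMeans(n_clusters=3, random_state=0).fit_predict(X)

    # Rename each cluster with the ground-truth label that maximizes the
    # contingency-matrix diagonal (linear assignment under the hood).
    y_mapped = map_cluster_labels(y_true, y_clusters)

    print(confusion_matrix(y_true, y_mapped))
    print(accuracy_score(y_true, y_mapped))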
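The design choice behind the helper is a plain assignment problem: build the contingency matrix between true classes and predicted clusters, then pick the cluster-to-class pairing that maximizes its diagonal sum. For readers who only want that idea without this branch, a minimal sketch using stock SciPy follows; ``map_labels_with_scipy`` is a hypothetical name, not part of scikit-learn, and the sketch only covers the case where every cluster can be matched to a class (no DEFAULT_LABEL_/DEFAULT_CLUSTER_ padding).

    import numpy as np
    from scipy.optimize import linear_sum_assignment
    from sklearn.metrics.cluster import contingency_matrix

    def map_labels_with_scipy(labels_true, labels_pred):
        # Rows are true classes, columns are predicted clusters.
        C = contingency_matrix(labels_true, labels_pred)
        classes = np.unique(labels_true)
        clusters = np.unique(labels_pred)
        # Maximize the diagonal sum by minimizing the negated counts.
        row_ind, col_ind = linear_sum_assignment(-C)
        mapping = {clusters[c]: classes[r] for r, c in zip(row_ind, col_ind)}
        return [mapping[label] for label in labels_pred]

    print(map_labels_with_scipy(['a', 'b', 'b', 'c', 'c', 'a'],
                                [1, 0, 0, 1, 2, 1]))
    # -> ['a', 'b', 'b', 'a', 'c', 'a'], matching the handcrafted test above.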