From 221301865d9ad39e92b4372fb7b43e168e7274e2 Mon Sep 17 00:00:00 2001 From: Date: Wed, 7 Feb 2018 21:17:45 +0000 Subject: [PATCH 01/39] adding functionality to allow more clustering metrics --- sklearn/metrics/cluster/__init__.py | 3 +- sklearn/metrics/cluster/supervised.py | 102 ++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py index 4cda1108ece32..825bd080cff31 100644 --- a/sklearn/metrics/cluster/__init__.py +++ b/sklearn/metrics/cluster/__init__.py @@ -17,6 +17,7 @@ from .supervised import v_measure_score from .supervised import fowlkes_mallows_score from .supervised import entropy +from .supervised import class_cluster_match from .unsupervised import silhouette_samples from .unsupervised import silhouette_score from .unsupervised import calinski_harabaz_score @@ -26,5 +27,5 @@ "adjusted_rand_score", "completeness_score", "contingency_matrix", "expected_mutual_information", "homogeneity_completeness_v_measure", "homogeneity_score", "mutual_info_score", "v_measure_score", - "fowlkes_mallows_score", "entropy", "silhouette_samples", + "fowlkes_mallows_score", "entropy", "class_cluster_match", "silhouette_samples", "silhouette_score", "calinski_harabaz_score", "consensus_score"] diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index ebbbacd4caa01..b50f6328af2e1 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -11,6 +11,7 @@ # Thierry Guillemot # Gregory Stupp # Joel Nothman +# Lucas Pugens Fernandes # License: BSD 3 clause from __future__ import division @@ -19,9 +20,11 @@ import numpy as np from scipy import sparse as sp +from scipy.optimize import linprog from .expected_mutual_info_fast import expected_mutual_information from ...utils.validation import check_array +from ...utils.multiclass import unique_labels from ...utils.fixes import comb @@ -870,3 +873,102 @@ def entropy(labels): # log(a / b) should be calculated as log(a) - log(b) for # possible loss of precision return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum))) + + +def max_main_diagonal(A): + """Sort matrix A columns to achieve greater main diagonal sum + Sorting is done by maximization of the confusion matrix :math:`C` + main diagonal sum :math:`\sum{i=0}^{K}C_{i, i}`. Notice the + number of cluster has to be equal or smaller than the number + of true classes. + Parameters + ---------- + A : array, shape = [n,n] + Square numerical matrix + Returns + ------- + B : array, shape = [n,n] + Pivot matrix that sorts A for maximum main diagonal sum + References + ---------- + Examples + -------- + >>> from sklearn.metrics.cluster import max_main_diagonal + >>> import numpy as np + >>> A = np.matrix([[2, 1, 0], + [1, 0, 0], + [0, 2, 0]]) + >>> max_main_diagonal(A) + array([[1., 0., 0.], + [0., 0., 1.], + [0., 1., 0.]]) + """ + n, n = A.shape + res = linprog(-A.ravel(), + A_eq=np.r_[np.kron(np.identity(n), np.ones((1, n))), + np.kron(np.ones((1, n)), np.identity(n))], + b_eq=np.ones((2*n,)), bounds=n*n*[(0, None)]) + assert res.success + return res.x.reshape(n, n).T + + +def class_cluster_match(y_true, y_pred, translate=True): + """Sort prediction labels in order to maximize the confusion matrix main diagonal sum + Sort the prediction labels of a clustering output in order to enable calc + of external metrics (eg. accuracy, f1_score, ...). 
Sorting is done by + maximization of the confusion matrix :math:`C` main diagonal sum + :math:`\sum{i=0}^{K}C_{i, i}`. Notice the number of cluster has to be equal + or smaller than the number of true classes. + Parameters + ---------- + y_true : array, shape = [n_samples] + Ground truth (correct) target values. + y_pred : array, shape = [n_samples] + Estimated targets as returned by a clustering algorithm. + translate : boolean, optional, default True + If True, y_pred_sort will be translated from y_pred notation symbols to y_true notation symbols. + Returns + ------- + y_pred_sort : array, shape = [n_classes, n_classes] + Estimated targets sorted for maximum accuracy with y_true + References + ---------- + Examples + -------- + >>> from sklearn.metrics import confusion_matrix + >>> from sklearn.metrics.cluster import class_cluster_match + >>> y_true = ["class1", "class2", "class3", "class1", "class1", "class3"] + >>> y_pred = [0, 0, 2, 2, 0, 2] + >>> y_pred_translated = class_cluster_match(y_true, y_pred) + >>> y_pred_translated + ["class1", "class1", "class3", "class3", "class1", "class3"] + >>> confusion_matrix(y_true, y_pred_translated) + array([[2., 0., 1.], + [1., 0., 0.], + [0., 0., 2.]]) + """ + classes = list(unique_labels(y_true)) + n_classes = len(classes) + num_classes = [classes.index(y) for y in y_true] + clusters = list(unique_labels(y_pred)) + n_clusters = len(clusters) + num_clusters = [clusters.index(y) for y in y_pred] + + if n_clusters > n_classes: + raise ValueError("Number of different clusters ("+str(n_clusters) + + ") should be smaller or equal to the number of different classes ("+str(n_classes)+")") + + cm = np.zeros((n_classes, n_classes)) + + for y_t, y_p in zip(num_classes, num_clusters): + cm[y_t, y_p] += 1 + + shuffle = best_perm(cm) + + matching_clusters = [row.tolist().index(1) for row in shuffle] + + y_pred_sort = [matching_clusters[y] for y in num_clusters] + if translate: + y_pred_sort = [classes[y] for y in y_pred_sort] + + return y_pred_sort From ef2ec2bbb4ce813cbb7adcddeac32efcf2abe635 Mon Sep 17 00:00:00 2001 From: Date: Wed, 7 Feb 2018 21:29:05 +0000 Subject: [PATCH 02/39] formatting compliant --- sklearn/metrics/cluster/supervised.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index b50f6328af2e1..2008fbbf2ac1f 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -912,9 +912,9 @@ def max_main_diagonal(A): return res.x.reshape(n, n).T -def class_cluster_match(y_true, y_pred, translate=True): - """Sort prediction labels in order to maximize the confusion matrix main diagonal sum - Sort the prediction labels of a clustering output in order to enable calc +def class_cluster_match(y_true, y_pred): + """Sort prediction labels to maximize the confusion matrix main diagonal sum + Sort the prediction labels of a clustering output to enable calc of external metrics (eg. accuracy, f1_score, ...). Sorting is done by maximization of the confusion matrix :math:`C` main diagonal sum :math:`\sum{i=0}^{K}C_{i, i}`. Notice the number of cluster has to be equal @@ -925,8 +925,6 @@ def class_cluster_match(y_true, y_pred, translate=True): Ground truth (correct) target values. y_pred : array, shape = [n_samples] Estimated targets as returned by a clustering algorithm. 
- translate : boolean, optional, default True - If True, y_pred_sort will be translated from y_pred notation symbols to y_true notation symbols. Returns ------- y_pred_sort : array, shape = [n_classes, n_classes] @@ -956,19 +954,20 @@ def class_cluster_match(y_true, y_pred, translate=True): if n_clusters > n_classes: raise ValueError("Number of different clusters ("+str(n_clusters) + - ") should be smaller or equal to the number of different classes ("+str(n_classes)+")") + ") should be smaller or equal to" + + " the number of different" + + " classes ("+str(n_classes)+")") cm = np.zeros((n_classes, n_classes)) for y_t, y_p in zip(num_classes, num_clusters): cm[y_t, y_p] += 1 - shuffle = best_perm(cm) + shuffle = max_main_diagonal(cm) matching_clusters = [row.tolist().index(1) for row in shuffle] y_pred_sort = [matching_clusters[y] for y in num_clusters] - if translate: - y_pred_sort = [classes[y] for y in y_pred_sort] + y_pred_sort = [classes[y] for y in y_pred_sort] return y_pred_sort From 5fab323fdd728c862029cde0653dbca7db7d9de6 Mon Sep 17 00:00:00 2001 From: Date: Wed, 7 Feb 2018 21:42:18 +0000 Subject: [PATCH 03/39] formatting and adding an functionality example --- examples/cluster/plot_affinity_propagation.py | 5 +++++ sklearn/metrics/cluster/__init__.py | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/cluster/plot_affinity_propagation.py b/examples/cluster/plot_affinity_propagation.py index 2c8fc3acc3936..1084b612291ec 100644 --- a/examples/cluster/plot_affinity_propagation.py +++ b/examples/cluster/plot_affinity_propagation.py @@ -27,6 +27,7 @@ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) +translated_labels = metrics.cluster.class_cluster_match(labels_true,labels) print('Estimated number of clusters: %d' % n_clusters_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) @@ -38,6 +39,10 @@ % metrics.adjusted_mutual_info_score(labels_true, labels)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean')) +print("Accuracy: %0.3f" + % metrics.accuracy_score(labels_true, translated_labels)) +print("Confusion Matrix:\n%s" + % str(metrics.confusion_matrix(labels_true, translated_labels))) # ############################################################################# # Plot result diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py index 825bd080cff31..07ab7260bf2e3 100644 --- a/sklearn/metrics/cluster/__init__.py +++ b/sklearn/metrics/cluster/__init__.py @@ -27,5 +27,6 @@ "adjusted_rand_score", "completeness_score", "contingency_matrix", "expected_mutual_information", "homogeneity_completeness_v_measure", "homogeneity_score", "mutual_info_score", "v_measure_score", - "fowlkes_mallows_score", "entropy", "class_cluster_match", "silhouette_samples", - "silhouette_score", "calinski_harabaz_score", "consensus_score"] + "fowlkes_mallows_score", "entropy", "class_cluster_match", + "silhouette_samples", "silhouette_score", "calinski_harabaz_score", + "consensus_score"] From 8402cf22044de6c5ad6d878b826ac10dead79d15 Mon Sep 17 00:00:00 2001 From: Lucas Pugens Fernandes Date: Wed, 7 Feb 2018 21:02:58 -0200 Subject: [PATCH 04/39] adding comma to comply with formatting --- examples/cluster/plot_affinity_propagation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cluster/plot_affinity_propagation.py b/examples/cluster/plot_affinity_propagation.py index 1084b612291ec..4ab71d90e018b 100644 --- 
a/examples/cluster/plot_affinity_propagation.py +++ b/examples/cluster/plot_affinity_propagation.py @@ -27,7 +27,7 @@ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) -translated_labels = metrics.cluster.class_cluster_match(labels_true,labels) +translated_labels = metrics.cluster.class_cluster_match(labels_true, labels) print('Estimated number of clusters: %d' % n_clusters_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) From e97ef2b2057e076af4f798f24d35a61e31ae82d6 Mon Sep 17 00:00:00 2001 From: Lucas Pugens Fernandes Date: Wed, 7 Feb 2018 21:05:59 -0200 Subject: [PATCH 05/39] Fixed doc generator --- sklearn/metrics/cluster/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py index 07ab7260bf2e3..222203dbdfc16 100644 --- a/sklearn/metrics/cluster/__init__.py +++ b/sklearn/metrics/cluster/__init__.py @@ -17,6 +17,7 @@ from .supervised import v_measure_score from .supervised import fowlkes_mallows_score from .supervised import entropy +from .supervised import max_main_diagonal from .supervised import class_cluster_match from .unsupervised import silhouette_samples from .unsupervised import silhouette_score @@ -27,6 +28,6 @@ "adjusted_rand_score", "completeness_score", "contingency_matrix", "expected_mutual_information", "homogeneity_completeness_v_measure", "homogeneity_score", "mutual_info_score", "v_measure_score", - "fowlkes_mallows_score", "entropy", "class_cluster_match", - "silhouette_samples", "silhouette_score", "calinski_harabaz_score", - "consensus_score"] + "fowlkes_mallows_score", "entropy", "max_main_diagonal", + "class_cluster_match", "silhouette_samples", "silhouette_score", + "calinski_harabaz_score", "consensus_score"] From 625ced68a4b0d4247c5098c60747d1324b9733ec Mon Sep 17 00:00:00 2001 From: Date: Wed, 7 Feb 2018 23:54:28 +0000 Subject: [PATCH 06/39] more modifications to be compliant with coding guidelines --- sklearn/metrics/cluster/supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 2008fbbf2ac1f..fa70b97259d58 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -939,7 +939,7 @@ def class_cluster_match(y_true, y_pred): >>> y_pred = [0, 0, 2, 2, 0, 2] >>> y_pred_translated = class_cluster_match(y_true, y_pred) >>> y_pred_translated - ["class1", "class1", "class3", "class3", "class1", "class3"] + ['class1', 'class1', 'class3', 'class3', 'class1', 'class3'] >>> confusion_matrix(y_true, y_pred_translated) array([[2., 0., 1.], [1., 0., 0.], From 46fd79f59e12151197fe9eaf9f6d220eb93245c8 Mon Sep 17 00:00:00 2001 From: Date: Thu, 8 Feb 2018 00:17:49 +0000 Subject: [PATCH 07/39] fixing doc bug --- sklearn/metrics/cluster/supervised.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index fa70b97259d58..0448460a60c4a 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -896,8 +896,8 @@ def max_main_diagonal(A): >>> from sklearn.metrics.cluster import max_main_diagonal >>> import numpy as np >>> A = np.matrix([[2, 1, 0], - [1, 0, 0], - [0, 2, 0]]) + >>> [1, 0, 0], + >>> [0, 2, 0]]) >>> max_main_diagonal(A) array([[1., 0., 0.], [0., 0., 1.], From d1bdae1c08c564685743c1af2843ed67eef229af Mon Sep 17 00:00:00 2001 From: Date: Thu, 8 Feb 
2018 00:50:12 +0000 Subject: [PATCH 08/39] fixing doctest and adopting max_assignment_score name as proposed by jnothman --- sklearn/metrics/cluster/__init__.py | 4 ++-- sklearn/metrics/cluster/supervised.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py index 222203dbdfc16..3f3a0a767ea65 100644 --- a/sklearn/metrics/cluster/__init__.py +++ b/sklearn/metrics/cluster/__init__.py @@ -17,7 +17,7 @@ from .supervised import v_measure_score from .supervised import fowlkes_mallows_score from .supervised import entropy -from .supervised import max_main_diagonal +from .supervised import max_assignment_score from .supervised import class_cluster_match from .unsupervised import silhouette_samples from .unsupervised import silhouette_score @@ -28,6 +28,6 @@ "adjusted_rand_score", "completeness_score", "contingency_matrix", "expected_mutual_information", "homogeneity_completeness_v_measure", "homogeneity_score", "mutual_info_score", "v_measure_score", - "fowlkes_mallows_score", "entropy", "max_main_diagonal", + "fowlkes_mallows_score", "entropy", "max_assignment_score", "class_cluster_match", "silhouette_samples", "silhouette_score", "calinski_harabaz_score", "consensus_score"] diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 0448460a60c4a..fb0715ef30c04 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -875,7 +875,7 @@ def entropy(labels): return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum))) -def max_main_diagonal(A): +def max_assignment_score(A): """Sort matrix A columns to achieve greater main diagonal sum Sorting is done by maximization of the confusion matrix :math:`C` main diagonal sum :math:`\sum{i=0}^{K}C_{i, i}`. Notice the @@ -893,12 +893,12 @@ def max_main_diagonal(A): ---------- Examples -------- - >>> from sklearn.metrics.cluster import max_main_diagonal + >>> from sklearn.metrics.cluster import max_assignment_score >>> import numpy as np >>> A = np.matrix([[2, 1, 0], - >>> [1, 0, 0], - >>> [0, 2, 0]]) - >>> max_main_diagonal(A) + ... [1, 0, 0], + ... [0, 2, 0]]) + >>> max_assignment_score(A) array([[1., 0., 0.], [0., 0., 1.], [0., 1., 0.]]) @@ -963,7 +963,7 @@ def class_cluster_match(y_true, y_pred): for y_t, y_p in zip(num_classes, num_clusters): cm[y_t, y_p] += 1 - shuffle = max_main_diagonal(cm) + shuffle = max_assignment_score(cm) matching_clusters = [row.tolist().index(1) for row in shuffle] From 07be65c138dcbeb274eb02ae617f75c4a5c514f7 Mon Sep 17 00:00:00 2001 From: Date: Thu, 8 Feb 2018 01:09:58 +0000 Subject: [PATCH 09/39] fixing examle again --- sklearn/metrics/cluster/supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index fb0715ef30c04..a6db7a72865ca 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -895,7 +895,7 @@ def max_assignment_score(A): -------- >>> from sklearn.metrics.cluster import max_assignment_score >>> import numpy as np - >>> A = np.matrix([[2, 1, 0], + >>> A = np.asarray([[2, 1, 0], ... [1, 0, 0], ... 
[0, 2, 0]]) >>> max_assignment_score(A) From 2b01392a5dc16a7919297846cf28f311f38688ed Mon Sep 17 00:00:00 2001 From: Lucas Pugens Fernandes Date: Thu, 8 Feb 2018 10:34:05 -0200 Subject: [PATCH 10/39] Doc fixing to pass Travis verification --- sklearn/metrics/cluster/supervised.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index a6db7a72865ca..62cb51cb8042a 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -899,9 +899,9 @@ def max_assignment_score(A): ... [1, 0, 0], ... [0, 2, 0]]) >>> max_assignment_score(A) - array([[1., 0., 0.], - [0., 0., 1.], - [0., 1., 0.]]) + array([[ 1., 0., 0.], + [ 0., 0., 1.], + [ 0., 1., 0.]]) """ n, n = A.shape res = linprog(-A.ravel(), @@ -941,9 +941,9 @@ def class_cluster_match(y_true, y_pred): >>> y_pred_translated ['class1', 'class1', 'class3', 'class3', 'class1', 'class3'] >>> confusion_matrix(y_true, y_pred_translated) - array([[2., 0., 1.], - [1., 0., 0.], - [0., 0., 2.]]) + array([[ 2., 0., 1.], + [ 1., 0., 0.], + [ 0., 0., 2.]]) """ classes = list(unique_labels(y_true)) n_classes = len(classes) From b5116b7ce31fa36ff0a8d45116ab6217e16449c8 Mon Sep 17 00:00:00 2001 From: Lucas Pugens Fernandes Date: Thu, 8 Feb 2018 10:55:26 -0200 Subject: [PATCH 11/39] Yet another doc fix --- sklearn/metrics/cluster/supervised.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 62cb51cb8042a..3e492546ff6c9 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -941,9 +941,9 @@ def class_cluster_match(y_true, y_pred): >>> y_pred_translated ['class1', 'class1', 'class3', 'class3', 'class1', 'class3'] >>> confusion_matrix(y_true, y_pred_translated) - array([[ 2., 0., 1.], - [ 1., 0., 0.], - [ 0., 0., 2.]]) + array([[2, 0, 1], + [1, 0, 0], + [0, 0, 2]]) """ classes = list(unique_labels(y_true)) n_classes = len(classes) From 81804bfca9a3fc15e9c383cc9968e95c95883ea5 Mon Sep 17 00:00:00 2001 From: Lucas Pugens Date: Thu, 8 Feb 2018 13:17:53 +0000 Subject: [PATCH 12/39] fixing travis version of scipy --- build_tools/travis/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 9a5b65ce225bd..e0f989d12fdd8 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -71,7 +71,7 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then # and scipy virtualenv --system-site-packages testvenv source testvenv/bin/activate - pip install pytest pytest-cov cython==$CYTHON_VERSION + pip install pytest pytest-cov cython==$CYTHON_VERSION scipy elif [[ "$DISTRIB" == "scipy-dev-wheels" ]]; then # Set up our own virtualenv environment to avoid travis' numpy. 
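As an aside between commits (the sketch below is not part of the patch series): the step that the commits above converge on, max_assignment_score, relaxes the column-matching problem to a linear program over doubly stochastic matrices (unit row and column sums, nonnegative entries). An optimum of that LP is attained at a permutation matrix, which says which predicted column lines up with which true row. The following standalone sketch mirrors that formulation with scipy only; the function name relabelling_permutation and the toy matrix are illustrative, and the patch itself additionally transposes the result before using it.

import numpy as np
from scipy.optimize import linprog

def relabelling_permutation(A):
    # Maximize sum_ij A[i, j] * X[i, j] over doubly stochastic X;
    # an optimum of this LP is a 0/1 permutation matrix.
    n = A.shape[0]
    row_sums = np.kron(np.identity(n), np.ones((1, n)))  # sum_j X[i, j] = 1
    col_sums = np.kron(np.ones((1, n)), np.identity(n))  # sum_i X[i, j] = 1
    res = linprog(-A.ravel(),
                  A_eq=np.vstack([row_sums, col_sums]),
                  b_eq=np.ones(2 * n),
                  bounds=[(0, None)] * (n * n))
    assert res.success
    return res.x.reshape(n, n)

A = np.array([[2, 1, 0],
              [1, 0, 0],
              [0, 2, 0]])
print(relabelling_permutation(A))  # picks one entry per row and per column

Later commits in the series replace this LP with the Hungarian algorithm from sklearn.utils.linear_assignment_, which solves the same matching directly.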
From 4ba0cacb3d923ba32399811cceb148ff2a7ae4fc Mon Sep 17 00:00:00 2001 From: Lucas Pugens Date: Thu, 8 Feb 2018 16:43:45 +0000 Subject: [PATCH 13/39] allowing for any number of clusters and classes --- sklearn/metrics/cluster/supervised.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 3e492546ff6c9..0d4b999c396ef 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -952,13 +952,14 @@ def class_cluster_match(y_true, y_pred): n_clusters = len(clusters) num_clusters = [clusters.index(y) for y in y_pred] - if n_clusters > n_classes: - raise ValueError("Number of different clusters ("+str(n_clusters) + - ") should be smaller or equal to" + - " the number of different" + - " classes ("+str(n_classes)+")") - - cm = np.zeros((n_classes, n_classes)) + # if n_clusters > n_classes: + # raise ValueError("Number of different clusters ("+str(n_clusters) + + # ") should be smaller or equal to" + + # " the number of different" + + # " classes ("+str(n_classes)+")") + dims = max(n_classes, n_clusters) + classes += ['non_class'+str(i) for i in range(dims-n_classes)] + cm = np.zeros((dims, dims)) for y_t, y_p in zip(num_classes, num_clusters): cm[y_t, y_p] += 1 From 29b715485812dd54c6db9ea4ca667f8a413cef95 Mon Sep 17 00:00:00 2001 From: Lucas Pugens Date: Thu, 8 Feb 2018 16:46:50 +0000 Subject: [PATCH 14/39] allowing for any number of clusters and classes and undoing travis script modification --- build_tools/travis/install.sh | 2 +- sklearn/metrics/cluster/supervised.py | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index e0f989d12fdd8..9a5b65ce225bd 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -71,7 +71,7 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then # and scipy virtualenv --system-site-packages testvenv source testvenv/bin/activate - pip install pytest pytest-cov cython==$CYTHON_VERSION scipy + pip install pytest pytest-cov cython==$CYTHON_VERSION elif [[ "$DISTRIB" == "scipy-dev-wheels" ]]; then # Set up our own virtualenv environment to avoid travis' numpy. 
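As an aside before the rest of PATCH 14 (the sketch below is not taken from the patch): PATCH 13 removes the cluster-count restriction by padding the label sets with placeholder names so the contingency matrix becomes square, after which the usual matching assigns the best class to every cluster and leaves the surplus on placeholders. A minimal self-contained version of that idea follows, using scipy.optimize.linear_sum_assignment in place of the patch's own assignment step; the helper name and placeholder prefixes are made up for illustration.

import numpy as np
from scipy.optimize import linear_sum_assignment

def toy_map_clusters(y_true, y_pred):
    classes = sorted(set(y_true))
    clusters = sorted(set(y_pred))
    dims = max(len(classes), len(clusters))
    # Pad the shorter side so the contingency matrix is square,
    # mirroring the 'non_class' placeholders introduced in PATCH 13.
    classes += ['non_class%d' % i for i in range(dims - len(classes))]
    clusters += ['non_cluster%d' % i for i in range(dims - len(clusters))]
    cm = np.zeros((dims, dims))
    for t, p in zip(y_true, y_pred):
        cm[classes.index(t), clusters.index(p)] += 1
    rows, cols = linear_sum_assignment(-cm)  # maximize the matched counts
    cluster_to_class = {clusters[c]: classes[r] for r, c in zip(rows, cols)}
    return [cluster_to_class[p] for p in y_pred]

print(toy_map_clusters(['a', 'a', 'a', 'b', 'b', 'b'], [4, 0, 1, 1, 2, 2]))

Padding to a square matrix guarantees every cluster receives some label, so clusters without a plausible class end up on placeholders instead of raising an error.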
diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 0d4b999c396ef..17a480b2cef42 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -952,11 +952,6 @@ def class_cluster_match(y_true, y_pred): n_clusters = len(clusters) num_clusters = [clusters.index(y) for y in y_pred] - # if n_clusters > n_classes: - # raise ValueError("Number of different clusters ("+str(n_clusters) + - # ") should be smaller or equal to" + - # " the number of different" + - # " classes ("+str(n_classes)+")") dims = max(n_classes, n_clusters) classes += ['non_class'+str(i) for i in range(dims-n_classes)] cm = np.zeros((dims, dims)) From 25885382b391d444d88ab46e8df249a03510abfb Mon Sep 17 00:00:00 2001 From: Date: Sat, 10 Feb 2018 23:33:44 +0000 Subject: [PATCH 15/39] better implementation --- doc/modules/classes.rst | 1 + doc/modules/clustering.rst | 56 ++++++++++++ sklearn/metrics/cluster/__init__.py | 7 +- sklearn/metrics/cluster/supervised.py | 85 ++++++------------ .../metrics/cluster/tests/test_supervised.py | 90 ++++++++++++++++++- sklearn/utils/__init__.py | 4 +- 6 files changed, 178 insertions(+), 65 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 5977e2f01a9b3..9bff49f61dc96 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -857,6 +857,7 @@ details. metrics.calinski_harabaz_score metrics.completeness_score metrics.cluster.contingency_matrix + metrics.cluster.class_cluster_match metrics.fowlkes_mallows_score metrics.homogeneity_completeness_v_measure metrics.homogeneity_score diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 2a3d93e263004..661db822aea55 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1622,6 +1622,62 @@ contingency matrix where the order of rows and columns correspond to a list of classes. +Advantages +~~~~~~~~~~ + +- Allows to examine the spread of each true cluster across predicted + clusters and vice versa. + +- The contingency table calculated is typically utilized in the calculation + of a similarity statistic (like the others listed in this document) between + the two clusterings. + +Drawbacks +~~~~~~~~~ + +- Contingency matrix is easy to interpret for a small number of clusters, but + becomes very hard to interpret for a large number of clusters. + +- It doesn't give a single metric to use as an objective for clustering + optimisation. + + +.. topic:: References + + * `Wikipedia entry for contingency matrix + `_ + +.. _class_cluster_match: + +Class-cluster Match +------------------- + +Class-cluster match +(:func:`sklearn.metrics.cluster.class_cluster_matching`) provides a +friendly way for the user to calculate classical classification +metrics, such as :func:`sklearn.metrics.accuracy_score` and +:func:`sklearn.metrics.f1_score`. + +Here is an example:: + + >>> from sklearn.metrics.cluster import class_cluster_match + >>> x = ["a", "a", "a", "b", "b", "b"] + >>> y = [0, 0, 1, 1, 2, 2] + >>> contingency_matrix(x, y) + array([[2, 1, 0], + [0, 1, 2]]) + +The first row of output array indicates that there are three samples whose +true cluster is "a". Of them, two are in predicted cluster 0, one is in 1, +and none is in 2. And the second row indicates that there are three samples +whose true cluster is "b". Of them, none is in predicted cluster 0, one is in +1 and two are in 2. 
+ +A :ref:`confusion matrix ` for classification is a square +contingency matrix where the order of rows and columns correspond to a list +of classes. + + Advantages ~~~~~~~~~~ diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py index 3f3a0a767ea65..4325d3670a650 100644 --- a/sklearn/metrics/cluster/__init__.py +++ b/sklearn/metrics/cluster/__init__.py @@ -17,7 +17,6 @@ from .supervised import v_measure_score from .supervised import fowlkes_mallows_score from .supervised import entropy -from .supervised import max_assignment_score from .supervised import class_cluster_match from .unsupervised import silhouette_samples from .unsupervised import silhouette_score @@ -28,6 +27,6 @@ "adjusted_rand_score", "completeness_score", "contingency_matrix", "expected_mutual_information", "homogeneity_completeness_v_measure", "homogeneity_score", "mutual_info_score", "v_measure_score", - "fowlkes_mallows_score", "entropy", "max_assignment_score", - "class_cluster_match", "silhouette_samples", "silhouette_score", - "calinski_harabaz_score", "consensus_score"] + "fowlkes_mallows_score", "entropy", "class_cluster_match", + "silhouette_samples", "silhouette_score", + "calinski_harabaz_score", "consensus_score", "class_cluster_match"] diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 17a480b2cef42..7f3337142e30a 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -20,12 +20,12 @@ import numpy as np from scipy import sparse as sp -from scipy.optimize import linprog from .expected_mutual_info_fast import expected_mutual_information from ...utils.validation import check_array from ...utils.multiclass import unique_labels from ...utils.fixes import comb +from ...utils import linear_assignment def comb2(n): @@ -875,62 +875,30 @@ def entropy(labels): return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum))) -def max_assignment_score(A): - """Sort matrix A columns to achieve greater main diagonal sum - Sorting is done by maximization of the confusion matrix :math:`C` - main diagonal sum :math:`\sum{i=0}^{K}C_{i, i}`. Notice the - number of cluster has to be equal or smaller than the number - of true classes. - Parameters - ---------- - A : array, shape = [n,n] - Square numerical matrix - Returns - ------- - B : array, shape = [n,n] - Pivot matrix that sorts A for maximum main diagonal sum - References - ---------- - Examples - -------- - >>> from sklearn.metrics.cluster import max_assignment_score - >>> import numpy as np - >>> A = np.asarray([[2, 1, 0], - ... [1, 0, 0], - ... [0, 2, 0]]) - >>> max_assignment_score(A) - array([[ 1., 0., 0.], - [ 0., 0., 1.], - [ 0., 1., 0.]]) - """ - n, n = A.shape - res = linprog(-A.ravel(), - A_eq=np.r_[np.kron(np.identity(n), np.ones((1, n))), - np.kron(np.ones((1, n)), np.identity(n))], - b_eq=np.ones((2*n,)), bounds=n*n*[(0, None)]) - assert res.success - return res.x.reshape(n, n).T - - def class_cluster_match(y_true, y_pred): - """Sort prediction labels to maximize the confusion matrix main diagonal sum - Sort the prediction labels of a clustering output to enable calc - of external metrics (eg. accuracy, f1_score, ...). Sorting is done by + """Translate prediction labels to maximize the accuracy. + + Translate the prediction labels of a clustering output to enable calc + of external metrics (eg. accuracy, f1_score, ...). Translation is done by maximization of the confusion matrix :math:`C` main diagonal sum :math:`\sum{i=0}^{K}C_{i, i}`. 
Notice the number of cluster has to be equal or smaller than the number of true classes. + Parameters ---------- y_true : array, shape = [n_samples] Ground truth (correct) target values. y_pred : array, shape = [n_samples] Estimated targets as returned by a clustering algorithm. + Returns ------- - y_pred_sort : array, shape = [n_classes, n_classes] - Estimated targets sorted for maximum accuracy with y_true + trans : array, shape = [n_classes, n_classes] + Mapping of y_pred clusters, such that :math:`trans\subseteq y_true` + References ---------- + Examples -------- >>> from sklearn.metrics import confusion_matrix @@ -945,25 +913,28 @@ def class_cluster_match(y_true, y_pred): [1, 0, 0], [0, 0, 2]]) """ - classes = list(unique_labels(y_true)) + + classes = unique_labels(y_true).tolist() n_classes = len(classes) - num_classes = [classes.index(y) for y in y_true] - clusters = list(unique_labels(y_pred)) + clusters = unique_labels(y_pred).tolist() n_clusters = len(clusters) - num_clusters = [clusters.index(y) for y in y_pred] - dims = max(n_classes, n_clusters) - classes += ['non_class'+str(i) for i in range(dims-n_classes)] - cm = np.zeros((dims, dims)) + if n_clusters > n_classes: + classes += ['DEF_CLASS'+str(i) for i in range(n_clusters-n_classes)] + elif n_classes > n_clusters: + clusters += ['DEF_CLUSTER'+str(i) for i in range(n_classes-n_clusters)] - for y_t, y_p in zip(num_classes, num_clusters): - cm[y_t, y_p] += 1 + C = contingency_matrix(y_true, y_pred) + true_idx, pred_idx = linear_assignment(-C).T - shuffle = max_assignment_score(cm) + true_idx = true_idx.tolist() + pred_idx = pred_idx.tolist() - matching_clusters = [row.tolist().index(1) for row in shuffle] + true_idx = [classes[idx] for idx in true_idx] + true_idx = true_idx + list(set(classes) - set(true_idx)) + pred_idx = [clusters[idx] for idx in pred_idx] + pred_idx = pred_idx + list(set(clusters) - set(pred_idx)) - y_pred_sort = [matching_clusters[y] for y in num_clusters] - y_pred_sort = [classes[y] for y in y_pred_sort] + return_list = [true_idx[pred_idx.index(y)] for y in y_pred] - return y_pred_sort + return return_list diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index 8da03d1e17457..2946e1cde5f0e 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -12,10 +12,11 @@ from sklearn.metrics.cluster import mutual_info_score from sklearn.metrics.cluster import normalized_mutual_info_score from sklearn.metrics.cluster import v_measure_score +from sklearn.metrics.cluster import class_cluster_match from sklearn.utils import assert_all_finite from sklearn.utils.testing import ( - assert_equal, assert_almost_equal, assert_raise_message, + assert_equal, assert_almost_equal, assert_raise_message, ) from numpy.testing import assert_array_almost_equal @@ -175,8 +176,8 @@ def test_expected_mutual_info_overflow(): def test_int_overflow_mutual_info_score(): # Test overflow in mutual_info_classif - x = np.array([1] * (52632 + 2529) + [2] * (14660 + 793) + [3] * (3271 + - 204) + [4] * (814 + 39) + [5] * (316 + 20)) + x = np.array([1] * (52632 + 2529) + [2] * (14660 + 793) + [3] + * (3271 + 204) + [4] * (814 + 39) + [5] * (316 + 20)) y = np.array([0] * 52632 + [1] * 2529 + [0] * 14660 + [1] * 793 + [0] * 3271 + [1] * 204 + [0] * 814 + [1] * 39 + [0] * 316 + [1] * 20) @@ -274,3 +275,86 @@ def test_fowlkes_mallows_score_properties(): # symmetric and permutation(both together) score_both = 
fowlkes_mallows_score(labels_b, (labels_a + 2) % 3) assert_almost_equal(score_both, expected) + + +def test_class_cluster_match(): + # handcrafted example - same number of clusters and classes + y_true = ['a'] * 1 + ['b'] * 2 + ['c'] * 20 + ['d'] * 6 + ['e'] * \ + 13 + ['f'] * 2 + ['g'] * 3 + ['h'] * 3 + ['i'] * 2 + ['j'] * 1 + y_pred = [6] * 1 + [2] * 2 + [0] * 6 + [2] * 10 + [8] * 4 + [1] * 4 + [5] * 2 + [0] * 4 + \ + [3] * 5 + [6] * 2 + [9] * 2 + [7] * 2 + [0] * 2 + [8] * 1 + [4] * 3 + [3] * 2 + [8] * 1 + expected = [ + 'a', + 'c', + 'c', + 'g', + 'g', + 'g', + 'g', + 'g', + 'g', + 'c', + 'c', + 'c', + 'c', + 'c', + 'c', + 'c', + 'c', + 'c', + 'c', + 'j', + 'j', + 'j', + 'j', + 'd', + 'd', + 'd', + 'd', + 'i', + 'i', + 'g', + 'g', + 'g', + 'g', + 'e', + 'e', + 'e', + 'e', + 'e', + 'a', + 'a', + 'b', + 'b', + 'f', + 'f', + 'g', + 'g', + 'j', + 'h', + 'h', + 'h', + 'e', + 'e', + 'j'] + + y_pred_translated = class_cluster_match(y_true, y_pred) + assert_equal(y_pred_translated, expected) + + # handcrafted example - more clusters than classes + y_true = ['a', 'a', 'a', 'b', 'b', 'b'] + y_pred = [4, 0, 1, 1, 2, 2] + + expected = ['DEF_CLASS1', 'a', 'DEF_CLASS0', 'DEF_CLASS0', 'b', 'b'] + + y_pred_translated = class_cluster_match(y_true, y_pred) + assert_equal(y_pred_translated, expected) + + # handcrafted example - more clusters than classes + y_true = ['a', 'd', 'e', 'b', 'b', 'b'] + y_pred = [0, 0, 1, 1, 2, 2] + + expected = ['a', 'a', 'e', 'e', 'b', 'b'] + + y_pred_translated = class_cluster_match(y_true, y_pred) + assert_equal(y_pred_translated, expected) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index be0048b60b0e3..7b565f2f3bc70 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -17,6 +17,7 @@ from ..externals.joblib import cpu_count from ..exceptions import DataConversionWarning from .deprecation import deprecated +from .linear_assignment_ import linear_assignment __all__ = ["murmurhash3_32", "as_float_array", @@ -25,7 +26,8 @@ "compute_class_weight", "compute_sample_weight", "column_or_1d", "safe_indexing", "check_consistent_length", "check_X_y", 'indexable', - "check_symmetric", "indices_to_mask", "deprecated"] + "check_symmetric", "indices_to_mask", "deprecated", + "linear_assignment"] class Bunch(dict): From 6e436d885ff59b6598b07eb1b489f641cc7b955f Mon Sep 17 00:00:00 2001 From: Date: Sun, 11 Feb 2018 00:01:24 +0000 Subject: [PATCH 16/39] fixing test result --- sklearn/metrics/cluster/tests/test_supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index 2946e1cde5f0e..dc9e071131430 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -345,7 +345,7 @@ def test_class_cluster_match(): y_true = ['a', 'a', 'a', 'b', 'b', 'b'] y_pred = [4, 0, 1, 1, 2, 2] - expected = ['DEF_CLASS1', 'a', 'DEF_CLASS0', 'DEF_CLASS0', 'b', 'b'] + expected = ['DEF_CLASS0', 'a', 'DEF_CLASS1', 'DEF_CLASS1', 'b', 'b'] y_pred_translated = class_cluster_match(y_true, y_pred) assert_equal(y_pred_translated, expected) From c3d1ea5b6adb25261c91cf9adf1f1b25abc25db5 Mon Sep 17 00:00:00 2001 From: Date: Sun, 11 Feb 2018 00:44:13 +0000 Subject: [PATCH 17/39] fixing pep8 formatting error --- sklearn/metrics/cluster/tests/test_supervised.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/cluster/tests/test_supervised.py 
b/sklearn/metrics/cluster/tests/test_supervised.py index dc9e071131430..a4427ff38133f 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -281,8 +281,9 @@ def test_class_cluster_match(): # handcrafted example - same number of clusters and classes y_true = ['a'] * 1 + ['b'] * 2 + ['c'] * 20 + ['d'] * 6 + ['e'] * \ 13 + ['f'] * 2 + ['g'] * 3 + ['h'] * 3 + ['i'] * 2 + ['j'] * 1 - y_pred = [6] * 1 + [2] * 2 + [0] * 6 + [2] * 10 + [8] * 4 + [1] * 4 + [5] * 2 + [0] * 4 + \ - [3] * 5 + [6] * 2 + [9] * 2 + [7] * 2 + [0] * 2 + [8] * 1 + [4] * 3 + [3] * 2 + [8] * 1 + y_pred = [6] * 1 + [2] * 2 + [0] * 6 + [2] * 10 + [8] * 4 + [1] *\ + 4 + [5] * 2 + [0] * 4 + [3] * 5 + [6] * 2 + [9] * 2 + [7] *\ + 2 + [0] * 2 + [8] * 1 + [4] * 3 + [3] * 2 + [8] * 1 expected = [ 'a', 'c', From 10892dc7164d00eee5f3cd7e035ab37156c2e9ce Mon Sep 17 00:00:00 2001 From: Date: Sun, 11 Feb 2018 01:16:14 +0000 Subject: [PATCH 18/39] fixing test error --- sklearn/metrics/cluster/tests/test_supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index a4427ff38133f..fe2c4e846c361 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -346,7 +346,7 @@ def test_class_cluster_match(): y_true = ['a', 'a', 'a', 'b', 'b', 'b'] y_pred = [4, 0, 1, 1, 2, 2] - expected = ['DEF_CLASS0', 'a', 'DEF_CLASS1', 'DEF_CLASS1', 'b', 'b'] + expected = ['DEF_CLASS1', 'a', 'DEF_CLASS0', 'DEF_CLASS0', 'b', 'b'] y_pred_translated = class_cluster_match(y_true, y_pred) assert_equal(y_pred_translated, expected) From 8e12b2a6b0130e936f9e4e0dde079911a65b5eac Mon Sep 17 00:00:00 2001 From: Date: Sun, 11 Feb 2018 11:53:20 +0000 Subject: [PATCH 19/39] sorting result set for maintaining interoperability between python 2 and 3 --- sklearn/metrics/cluster/supervised.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 7f3337142e30a..48be12c11379c 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -931,9 +931,9 @@ def class_cluster_match(y_true, y_pred): pred_idx = pred_idx.tolist() true_idx = [classes[idx] for idx in true_idx] - true_idx = true_idx + list(set(classes) - set(true_idx)) + true_idx = true_idx + sorted(set(classes) - set(true_idx)) pred_idx = [clusters[idx] for idx in pred_idx] - pred_idx = pred_idx + list(set(clusters) - set(pred_idx)) + pred_idx = pred_idx + sorted(set(clusters) - set(pred_idx)) return_list = [true_idx[pred_idx.index(y)] for y in y_pred] From 63dd10845d5662eb4b541056c522b762c60a9465 Mon Sep 17 00:00:00 2001 From: Date: Sun, 11 Feb 2018 13:05:28 +0000 Subject: [PATCH 20/39] adding documentation --- doc/modules/clustering.rst | 81 ++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 33 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 661db822aea55..3b1e0fdfd97a1 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1660,45 +1660,60 @@ metrics, such as :func:`sklearn.metrics.accuracy_score` and Here is an example:: - >>> from sklearn.metrics.cluster import class_cluster_match - >>> x = ["a", "a", "a", "b", "b", "b"] - >>> y = [0, 0, 1, 1, 2, 2] - >>> contingency_matrix(x, y) - array([[2, 1, 0], - [0, 1, 2]]) - -The first row of output array indicates that 
there are three samples whose -true cluster is "a". Of them, two are in predicted cluster 0, one is in 1, -and none is in 2. And the second row indicates that there are three samples -whose true cluster is "b". Of them, none is in predicted cluster 0, one is in -1 and two are in 2. - -A :ref:`confusion matrix ` for classification is a square -contingency matrix where the order of rows and columns correspond to a list -of classes. - + >>> from sklearn.metrics.cluster import class_cluster_match, adjusted_rand_score + >>> from sklearn.metrics import confusion_matrix, accuracy_score + >>> y_true = ['a'] * 1 + ['b'] * 2 + ['c'] * 20 + ['d'] * 6 + ['e'] * \ + ... 13 + ['f'] * 2 + ['g'] * 3 + ['h'] * 3 + ['i'] * 2 + ['j'] * 1 + >>> y_pred = [6] * 1 + [2] * 2 + [0] * 6 + [2] * 10 + [8] * 4 + [1] *\ + ... 4 + [5] * 2 + [0] * 4 + [3] * 5 + [6] * 2 + [9] * 2 + [7] *\ + ... 2 + [0] * 2 + [8] * 1 + [4] * 3 + [3] * 2 + [8] * 1 + >>> y_pred = class_cluster_match(x, y) + >>> confusion_matrix(y_true, y_pred) + [[ 1 0 0 0 0 0 0 0 0 0] + [ 0 0 2 0 0 0 0 0 0 0] + [ 0 0 10 0 0 0 6 0 0 4] + [ 0 0 0 4 0 0 0 0 2 0] + [ 2 2 0 0 5 0 4 0 0 0] + [ 0 0 0 0 0 2 0 0 0 0] + [ 0 0 0 0 0 0 2 0 0 1] + [ 0 0 0 0 0 0 0 3 0 0] + [ 0 0 0 0 2 0 0 0 0 0] + [ 0 0 0 0 0 0 0 0 0 1]] + >>> accuracy_score(y_true, y_pred) # doctest: +ELLIPSIS + 0.52... + >>> adjusted_rand_score(y_true, y_pred) # doctest: +ELLIPSIS + 0.29... + + Notice the confusion matrix above has its main diagonal maximized, meaning + the maximum possible value of accuracy score is obtained by such match of + true classes and clusters. + + This conversion of clustering labels is also compatible with default + clustering metrics, since the change in clusters labels does not + affect results of such metrics, such as the ARI above. + + Another example:: + + >>> y_true = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> y_pred = [3, 0, 1, 1, 2, 2] + >>> class_cluster_match(y_true, y_pred) + ['DEF_CLASS1', 'a', 'DEF_CLASS0', 'DEF_CLASS0', 'b', 'b'] + + The above example shows what happens with your clustering method identifies + more clusters than true classes. *Such results must be treated carefully*, + since not all metrics derived from such mapping are meaningful. Advantages ~~~~~~~~~~ -- Allows to examine the spread of each true cluster across predicted - clusters and vice versa. +- Enables calculation of classical classification metrics, such as + accuracy and f1_score. -- The contingency table calculated is typically utilized in the calculation - of a similarity statistic (like the others listed in this document) between - the two clusterings. +- Allows for a meaningful and easy-to-read clustering output when classes + are known. Drawbacks ~~~~~~~~~ -- Contingency matrix is easy to interpret for a small number of clusters, but - becomes very hard to interpret for a large number of clusters. - -- It doesn't give a single metric to use as an objective for clustering - optimisation. - - -.. topic:: References - - * `Wikipedia entry for contingency matrix - `_ +- One should use this tool carefully, since its metrics are not always + meaningful for every clustering task. 
\ No newline at end of file From c50284fd9fe5ea4628cf7f5b4a7bef3835d97772 Mon Sep 17 00:00:00 2001 From: Date: Sun, 11 Feb 2018 13:29:24 +0000 Subject: [PATCH 21/39] fixing doc --- doc/modules/clustering.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 3b1e0fdfd97a1..6fb15fea52910 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1667,7 +1667,7 @@ Here is an example:: >>> y_pred = [6] * 1 + [2] * 2 + [0] * 6 + [2] * 10 + [8] * 4 + [1] *\ ... 4 + [5] * 2 + [0] * 4 + [3] * 5 + [6] * 2 + [9] * 2 + [7] *\ ... 2 + [0] * 2 + [8] * 1 + [4] * 3 + [3] * 2 + [8] * 1 - >>> y_pred = class_cluster_match(x, y) + >>> y_pred = class_cluster_match(y_true, y_pred) >>> confusion_matrix(y_true, y_pred) [[ 1 0 0 0 0 0 0 0 0 0] [ 0 0 2 0 0 0 0 0 0 0] From 5c8c3f3d4b5da0295ab0c226ef81f7bba4ccd70e Mon Sep 17 00:00:00 2001 From: Date: Sun, 11 Feb 2018 13:54:31 +0000 Subject: [PATCH 22/39] fixing doc --- doc/modules/clustering.rst | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 6fb15fea52910..17005f438ed1e 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1669,16 +1669,16 @@ Here is an example:: ... 2 + [0] * 2 + [8] * 1 + [4] * 3 + [3] * 2 + [8] * 1 >>> y_pred = class_cluster_match(y_true, y_pred) >>> confusion_matrix(y_true, y_pred) - [[ 1 0 0 0 0 0 0 0 0 0] - [ 0 0 2 0 0 0 0 0 0 0] - [ 0 0 10 0 0 0 6 0 0 4] - [ 0 0 0 4 0 0 0 0 2 0] - [ 2 2 0 0 5 0 4 0 0 0] - [ 0 0 0 0 0 2 0 0 0 0] - [ 0 0 0 0 0 0 2 0 0 1] - [ 0 0 0 0 0 0 0 3 0 0] - [ 0 0 0 0 2 0 0 0 0 0] - [ 0 0 0 0 0 0 0 0 0 1]] + array([[ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [ 0, 0, 2, 0, 0, 0, 0, 0, 0, 0], + [ 0, 0, 10, 0, 0, 0, 6, 0, 0, 4], + [ 0, 0, 0, 4, 0, 0, 0, 0, 2, 0], + [ 2, 2, 0, 0, 5, 0, 4, 0, 0, 0], + [ 0, 0, 0, 0, 0, 2, 0, 0, 0, 0], + [ 0, 0, 0, 0, 0, 0, 2, 0, 0, 1], + [ 0, 0, 0, 0, 0, 0, 0, 3, 0, 0], + [ 0, 0, 0, 0, 2, 0, 0, 0, 0, 0], + [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]) >>> accuracy_score(y_true, y_pred) # doctest: +ELLIPSIS 0.52... >>> adjusted_rand_score(y_true, y_pred) # doctest: +ELLIPSIS From fefe91c64d57966803956548baf522192a658ad5 Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 12:52:06 -0300 Subject: [PATCH 23/39] fixing nomenclature --- doc/modules/classes.rst | 2 +- doc/modules/clustering.rst | 6 +++--- sklearn/metrics/cluster/supervised.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 9bff49f61dc96..890bcd75db22d 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -857,7 +857,7 @@ details. 
metrics.calinski_harabaz_score metrics.completeness_score metrics.cluster.contingency_matrix - metrics.cluster.class_cluster_match + metrics.cluster.map_cluster_labels metrics.fowlkes_mallows_score metrics.homogeneity_completeness_v_measure metrics.homogeneity_score diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 17005f438ed1e..6f7799852a585 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1660,14 +1660,14 @@ metrics, such as :func:`sklearn.metrics.accuracy_score` and Here is an example:: - >>> from sklearn.metrics.cluster import class_cluster_match, adjusted_rand_score + >>> from sklearn.metrics.cluster import map_cluster_labels, adjusted_rand_score >>> from sklearn.metrics import confusion_matrix, accuracy_score >>> y_true = ['a'] * 1 + ['b'] * 2 + ['c'] * 20 + ['d'] * 6 + ['e'] * \ ... 13 + ['f'] * 2 + ['g'] * 3 + ['h'] * 3 + ['i'] * 2 + ['j'] * 1 >>> y_pred = [6] * 1 + [2] * 2 + [0] * 6 + [2] * 10 + [8] * 4 + [1] *\ ... 4 + [5] * 2 + [0] * 4 + [3] * 5 + [6] * 2 + [9] * 2 + [7] *\ ... 2 + [0] * 2 + [8] * 1 + [4] * 3 + [3] * 2 + [8] * 1 - >>> y_pred = class_cluster_match(y_true, y_pred) + >>> y_pred = map_cluster_labels(y_true, y_pred) >>> confusion_matrix(y_true, y_pred) array([[ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [ 0, 0, 2, 0, 0, 0, 0, 0, 0, 0], @@ -1696,7 +1696,7 @@ Here is an example:: >>> y_true = ['a', 'a', 'a', 'b', 'b', 'b'] >>> y_pred = [3, 0, 1, 1, 2, 2] - >>> class_cluster_match(y_true, y_pred) + >>> map_cluster_labels(y_true, y_pred) ['DEF_CLASS1', 'a', 'DEF_CLASS0', 'DEF_CLASS0', 'b', 'b'] The above example shows what happens with your clustering method identifies diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 48be12c11379c..fb0b495996376 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -902,10 +902,10 @@ def class_cluster_match(y_true, y_pred): Examples -------- >>> from sklearn.metrics import confusion_matrix - >>> from sklearn.metrics.cluster import class_cluster_match + >>> from sklearn.metrics.cluster import map_cluster_labels >>> y_true = ["class1", "class2", "class3", "class1", "class1", "class3"] >>> y_pred = [0, 0, 2, 2, 0, 2] - >>> y_pred_translated = class_cluster_match(y_true, y_pred) + >>> y_pred_translated = map_cluster_labels(y_true, y_pred) >>> y_pred_translated ['class1', 'class1', 'class3', 'class3', 'class1', 'class3'] >>> confusion_matrix(y_true, y_pred_translated) From bd31fb9cdaba97da61d983a658ea6be50f908931 Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 13:08:07 -0300 Subject: [PATCH 24/39] fixing nomenclature --- doc/modules/clustering.rst | 4 ++-- examples/cluster/plot_affinity_propagation.py | 2 +- sklearn/metrics/cluster/__init__.py | 6 +++--- sklearn/metrics/cluster/supervised.py | 2 +- sklearn/metrics/cluster/tests/test_supervised.py | 10 +++++----- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 6f7799852a585..8c3eb6d231c44 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1647,13 +1647,13 @@ Drawbacks * `Wikipedia entry for contingency matrix `_ -.. _class_cluster_match: +.. 
_map_cluster_labels: Class-cluster Match ------------------- Class-cluster match -(:func:`sklearn.metrics.cluster.class_cluster_matching`) provides a +(:func:`sklearn.metrics.cluster.map_cluster_labels`) provides a friendly way for the user to calculate classical classification metrics, such as :func:`sklearn.metrics.accuracy_score` and :func:`sklearn.metrics.f1_score`. diff --git a/examples/cluster/plot_affinity_propagation.py b/examples/cluster/plot_affinity_propagation.py index 4ab71d90e018b..1f318875a7198 100644 --- a/examples/cluster/plot_affinity_propagation.py +++ b/examples/cluster/plot_affinity_propagation.py @@ -27,7 +27,7 @@ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) -translated_labels = metrics.cluster.class_cluster_match(labels_true, labels) +translated_labels = metrics.cluster.map_cluster_labels(labels_true, labels) print('Estimated number of clusters: %d' % n_clusters_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py index 4325d3670a650..32c58c3312ac7 100644 --- a/sklearn/metrics/cluster/__init__.py +++ b/sklearn/metrics/cluster/__init__.py @@ -17,7 +17,7 @@ from .supervised import v_measure_score from .supervised import fowlkes_mallows_score from .supervised import entropy -from .supervised import class_cluster_match +from .supervised import map_cluster_labels from .unsupervised import silhouette_samples from .unsupervised import silhouette_score from .unsupervised import calinski_harabaz_score @@ -27,6 +27,6 @@ "adjusted_rand_score", "completeness_score", "contingency_matrix", "expected_mutual_information", "homogeneity_completeness_v_measure", "homogeneity_score", "mutual_info_score", "v_measure_score", - "fowlkes_mallows_score", "entropy", "class_cluster_match", + "fowlkes_mallows_score", "entropy", "map_cluster_labels", "silhouette_samples", "silhouette_score", - "calinski_harabaz_score", "consensus_score", "class_cluster_match"] + "calinski_harabaz_score", "consensus_score", "map_cluster_labels"] diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index ebdc669f52ff5..179e15039e3b0 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -875,7 +875,7 @@ def entropy(labels): return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum))) -def class_cluster_match(y_true, y_pred): +def map_cluster_labels(y_true, y_pred): """Translate prediction labels to maximize the accuracy. 
Translate the prediction labels of a clustering output to enable calc diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index fe2c4e846c361..c7ca5595eeff5 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -12,7 +12,7 @@ from sklearn.metrics.cluster import mutual_info_score from sklearn.metrics.cluster import normalized_mutual_info_score from sklearn.metrics.cluster import v_measure_score -from sklearn.metrics.cluster import class_cluster_match +from sklearn.metrics.cluster import map_cluster_labels from sklearn.utils import assert_all_finite from sklearn.utils.testing import ( @@ -277,7 +277,7 @@ def test_fowlkes_mallows_score_properties(): assert_almost_equal(score_both, expected) -def test_class_cluster_match(): +def test_map_cluster_labels(): # handcrafted example - same number of clusters and classes y_true = ['a'] * 1 + ['b'] * 2 + ['c'] * 20 + ['d'] * 6 + ['e'] * \ 13 + ['f'] * 2 + ['g'] * 3 + ['h'] * 3 + ['i'] * 2 + ['j'] * 1 @@ -339,7 +339,7 @@ def test_class_cluster_match(): 'e', 'j'] - y_pred_translated = class_cluster_match(y_true, y_pred) + y_pred_translated = map_cluster_labels(y_true, y_pred) assert_equal(y_pred_translated, expected) # handcrafted example - more clusters than classes @@ -348,7 +348,7 @@ def test_class_cluster_match(): expected = ['DEF_CLASS1', 'a', 'DEF_CLASS0', 'DEF_CLASS0', 'b', 'b'] - y_pred_translated = class_cluster_match(y_true, y_pred) + y_pred_translated = map_cluster_labels(y_true, y_pred) assert_equal(y_pred_translated, expected) # handcrafted example - more clusters than classes @@ -357,5 +357,5 @@ def test_class_cluster_match(): expected = ['a', 'a', 'e', 'e', 'b', 'b'] - y_pred_translated = class_cluster_match(y_true, y_pred) + y_pred_translated = map_cluster_labels(y_true, y_pred) assert_equal(y_pred_translated, expected) From 1539a0be0203f8b2011d9e4a64ed28603606694b Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 13:21:02 -0300 Subject: [PATCH 25/39] fixing commit bug --- sklearn/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index cfecfdd82da4e..a89dbdb808e21 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -17,7 +17,7 @@ from .class_weight import compute_class_weight, compute_sample_weight from ..externals.joblib import cpu_count from ..exceptions import DataConversionWarning -from .deprecation import deprecated<<<<<<< clustering_match +from .deprecation import deprecated from .linear_assignment_ import linear_assignment from .. import get_config From 9d3916397efa59b865ef3c7764f1f5250446dec2 Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 14:59:29 -0300 Subject: [PATCH 26/39] fixing doc title --- doc/modules/clustering.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 24abef41a84c4..5aa1de7a7c775 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1736,7 +1736,7 @@ Drawbacks .. 
_map_cluster_labels: -Class-cluster Match +Map cluster labels ------------------- Class-cluster match From 1e08a8b2a5872ced94e870125b57b327bbc2c580 Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 15:34:19 -0300 Subject: [PATCH 27/39] avoinding messing with imports --- sklearn/metrics/cluster/supervised.py | 2 +- sklearn/utils/__init__.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 507cab646afb1..b88dc5ec3d954 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -25,7 +25,7 @@ from ...utils.validation import check_array from ...utils.multiclass import unique_labels from ...utils.fixes import comb -from ...utils import linear_assignment +from ...utils.linear_assignment_ import linear_assignment def comb2(n): diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index a89dbdb808e21..bb1f383505fe9 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -18,7 +18,6 @@ from ..externals.joblib import cpu_count from ..exceptions import DataConversionWarning from .deprecation import deprecated -from .linear_assignment_ import linear_assignment from .. import get_config __all__ = ["murmurhash3_32", "as_float_array", @@ -27,8 +26,7 @@ "compute_class_weight", "compute_sample_weight", "column_or_1d", "safe_indexing", "check_consistent_length", "check_X_y", 'indexable', - "check_symmetric", "indices_to_mask", "deprecated", - "linear_assignment"] + "check_symmetric", "indices_to_mask", "deprecated"] class Bunch(dict): From 51a68cb89d73f6a9e9818504d11200181c876e91 Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 15:41:45 -0300 Subject: [PATCH 28/39] changing default label nomenclature --- doc/modules/clustering.rst | 2 +- sklearn/metrics/cluster/supervised.py | 2 +- sklearn/metrics/cluster/tests/test_supervised.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 5aa1de7a7c775..064ec997a60ef 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1784,7 +1784,7 @@ Here is an example:: >>> y_true = ['a', 'a', 'a', 'b', 'b', 'b'] >>> y_pred = [3, 0, 1, 1, 2, 2] >>> map_cluster_labels(y_true, y_pred) - ['DEF_CLASS1', 'a', 'DEF_CLASS0', 'DEF_CLASS0', 'b', 'b'] + ['DEFAULT_LABEL_1', 'a', 'DEFAULT_LABEL_0', 'DEFAULT_LABEL_0', 'b', 'b'] The above example shows what happens with your clustering method identifies more clusters than true classes. 
*Such results must be treated carefully*, diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index b88dc5ec3d954..6cc9643e75f13 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -921,7 +921,7 @@ def map_cluster_labels(y_true, y_pred): n_clusters = len(clusters) if n_clusters > n_classes: - classes += ['DEF_CLASS'+str(i) for i in range(n_clusters-n_classes)] + classes += ['DEFAULT_LABEL_'+str(i) for i in range(n_clusters-n_classes)] elif n_classes > n_clusters: clusters += ['DEF_CLUSTER'+str(i) for i in range(n_classes-n_clusters)] diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index a5f8d63f3edc6..a6afbec9edd6e 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -347,7 +347,7 @@ def test_map_cluster_labels(): y_true = ['a', 'a', 'a', 'b', 'b', 'b'] y_pred = [4, 0, 1, 1, 2, 2] - expected = ['DEF_CLASS1', 'a', 'DEF_CLASS0', 'DEF_CLASS0', 'b', 'b'] + expected = ['DEFAULT_LABEL_1', 'a', 'DEFAULT_LABEL_0', 'DEFAULT_LABEL_0', 'b', 'b'] y_pred_translated = map_cluster_labels(y_true, y_pred) assert_equal(y_pred_translated, expected) From 01e92533956c05d363f9712ad87710c564058c5f Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 18:25:39 -0300 Subject: [PATCH 29/39] adding negative indices to the test --- sklearn/metrics/cluster/tests/test_supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index a6afbec9edd6e..f9dbdc4a8c1d7 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -354,7 +354,7 @@ def test_map_cluster_labels(): # handcrafted example - more clusters than classes y_true = ['a', 'd', 'e', 'b', 'b', 'b'] - y_pred = [0, 0, 1, 1, 2, 2] + y_pred = [0, 0, -1, -1, 2, 2] expected = ['a', 'a', 'e', 'e', 'b', 'b'] From e11cd9164ed268bade9de8fb9eb433f1f3c5b62b Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 18:32:32 -0300 Subject: [PATCH 30/39] nomenclature fix --- sklearn/metrics/cluster/supervised.py | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 6cc9643e75f13..082332fd414f6 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -876,26 +876,26 @@ def entropy(labels): return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum))) -def map_cluster_labels(y_true, y_pred): +def map_cluster_labels(labels_true, labels_pred): """Translate prediction labels to maximize the accuracy. Translate the prediction labels of a clustering output to enable calc of external metrics (eg. accuracy, f1_score, ...). Translation is done by maximization of the confusion matrix :math:`C` main diagonal sum :math:`\sum{i=0}^{K}C_{i, i}`. Notice the number of cluster has to be equal - or smaller than the number of true classes. + or smaller than the number of true classes. Parameters ---------- - y_true : array, shape = [n_samples] + labels_true : array, shape = [n_samples] Ground truth (correct) target values. - y_pred : array, shape = [n_samples] - Estimated targets as returned by a clustering algorithm. 
+ labels_pred : array, shape = [n_samples] + Estimated clusters as returned by a clustering algorithm. Returns ------- trans : array, shape = [n_classes, n_classes] - Mapping of y_pred clusters, such that :math:`trans\subseteq y_true` + Mapping of labels_pred clusters, such that :math:`trans\subseteq labels_true` References ---------- @@ -904,20 +904,20 @@ def map_cluster_labels(y_true, y_pred): -------- >>> from sklearn.metrics import confusion_matrix >>> from sklearn.metrics.cluster import map_cluster_labels - >>> y_true = ["class1", "class2", "class3", "class1", "class1", "class3"] - >>> y_pred = [0, 0, 2, 2, 0, 2] - >>> y_pred_translated = map_cluster_labels(y_true, y_pred) + >>> labels_true = ["class1", "class2", "class3", "class1", "class1", "class3"] + >>> labels_pred = [0, 0, 2, 2, 0, 2] + >>> y_pred_translated = map_cluster_labels(labels_true, labels_pred) >>> y_pred_translated ['class1', 'class1', 'class3', 'class3', 'class1', 'class3'] - >>> confusion_matrix(y_true, y_pred_translated) + >>> confusion_matrix(labels_true, y_pred_translated) array([[2, 0, 1], [1, 0, 0], [0, 0, 2]]) """ - classes = unique_labels(y_true).tolist() + classes = unique_labels(labels_true).tolist() n_classes = len(classes) - clusters = unique_labels(y_pred).tolist() + clusters = unique_labels(labels_pred).tolist() n_clusters = len(clusters) if n_clusters > n_classes: @@ -925,7 +925,7 @@ def map_cluster_labels(y_true, y_pred): elif n_classes > n_clusters: clusters += ['DEF_CLUSTER'+str(i) for i in range(n_classes-n_clusters)] - C = contingency_matrix(y_true, y_pred) + C = contingency_matrix(labels_true, labels_pred) true_idx, pred_idx = linear_assignment(-C).T true_idx = true_idx.tolist() @@ -936,6 +936,6 @@ def map_cluster_labels(y_true, y_pred): pred_idx = [clusters[idx] for idx in pred_idx] pred_idx = pred_idx + sorted(set(clusters) - set(pred_idx)) - return_list = [true_idx[pred_idx.index(y)] for y in y_pred] + return_list = [true_idx[pred_idx.index(y)] for y in labels_pred] return return_list From 320858dbbabf1f22e3eb9c79952cfbcaf31ee2b9 Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 18:33:54 -0300 Subject: [PATCH 31/39] comment fix --- sklearn/metrics/cluster/tests/test_supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index f9dbdc4a8c1d7..ba9b1d9325302 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -352,7 +352,7 @@ def test_map_cluster_labels(): y_pred_translated = map_cluster_labels(y_true, y_pred) assert_equal(y_pred_translated, expected) - # handcrafted example - more clusters than classes + # handcrafted example - more classes than clusters y_true = ['a', 'd', 'e', 'b', 'b', 'b'] y_pred = [0, 0, -1, -1, 2, 2] From 0a65cc2fa9ccf03a2b3ba16f2d5ba12e0b4990c1 Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 18:48:28 -0300 Subject: [PATCH 32/39] doc fix --- doc/modules/clustering.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 064ec997a60ef..db36537857519 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1739,7 +1739,7 @@ Drawbacks Map cluster labels ------------------- -Class-cluster match +Map cluster labels (:func:`sklearn.metrics.cluster.map_cluster_labels`) provides a friendly way for the user to calculate classical classification 
metrics, such as :func:`sklearn.metrics.accuracy_score` and From 4847df3d6ca37d3480bb82027ca0352e7bf35a1f Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 18:52:24 -0300 Subject: [PATCH 33/39] comment fix --- sklearn/metrics/cluster/supervised.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 082332fd414f6..1d606f8c53271 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -879,11 +879,10 @@ def entropy(labels): def map_cluster_labels(labels_true, labels_pred): """Translate prediction labels to maximize the accuracy. - Translate the prediction labels of a clustering output to enable calc - of external metrics (eg. accuracy, f1_score, ...). Translation is done by - maximization of the confusion matrix :math:`C` main diagonal sum - :math:`\sum{i=0}^{K}C_{i, i}`. Notice the number of cluster has to be equal - or smaller than the number of true classes. + Translate the prediction labels of a clustering output to those in the + ground truth to enable calc of external metrics (eg. accuracy, f1_score, ...). + Translation is done by maximization of the confusion matrix :math:`C` main + diagonal sum :math:`\sum{i=0}^{K}C_{i, i}`. Parameters ---------- From 42a7a1d778493593f8ddd4488eea5a5606873a8e Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 18:58:56 -0300 Subject: [PATCH 34/39] simplifying test --- .../metrics/cluster/tests/test_supervised.py | 63 ++----------------- 1 file changed, 4 insertions(+), 59 deletions(-) diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index ba9b1d9325302..a8fec4f7f4743 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -280,65 +280,10 @@ def test_fowlkes_mallows_score_properties(): def test_map_cluster_labels(): # handcrafted example - same number of clusters and classes - y_true = ['a'] * 1 + ['b'] * 2 + ['c'] * 20 + ['d'] * 6 + ['e'] * \ - 13 + ['f'] * 2 + ['g'] * 3 + ['h'] * 3 + ['i'] * 2 + ['j'] * 1 - y_pred = [6] * 1 + [2] * 2 + [0] * 6 + [2] * 10 + [8] * 4 + [1] *\ - 4 + [5] * 2 + [0] * 4 + [3] * 5 + [6] * 2 + [9] * 2 + [7] *\ - 2 + [0] * 2 + [8] * 1 + [4] * 3 + [3] * 2 + [8] * 1 - expected = [ - 'a', - 'c', - 'c', - 'g', - 'g', - 'g', - 'g', - 'g', - 'g', - 'c', - 'c', - 'c', - 'c', - 'c', - 'c', - 'c', - 'c', - 'c', - 'c', - 'j', - 'j', - 'j', - 'j', - 'd', - 'd', - 'd', - 'd', - 'i', - 'i', - 'g', - 'g', - 'g', - 'g', - 'e', - 'e', - 'e', - 'e', - 'e', - 'a', - 'a', - 'b', - 'b', - 'f', - 'f', - 'g', - 'g', - 'j', - 'h', - 'h', - 'h', - 'e', - 'e', - 'j'] + y_true = ['a', 'b', 'b', 'c', 'c', 'a'] + y_pred = [1, 0, 0, 1, 2, 1] + + expected = ['a', 'b', 'b', 'a', 'c', 'a'] y_pred_translated = map_cluster_labels(y_true, y_pred) assert_equal(y_pred_translated, expected) From cdbe4a387016292f505309fa8e38b6ecf216186c Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 19:03:32 -0300 Subject: [PATCH 35/39] fixing name style on labels --- sklearn/metrics/cluster/supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 1d606f8c53271..72ec559d7c62a 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -922,7 +922,7 @@ def map_cluster_labels(labels_true, 
labels_pred): if n_clusters > n_classes: classes += ['DEFAULT_LABEL_'+str(i) for i in range(n_clusters-n_classes)] elif n_classes > n_clusters: - clusters += ['DEF_CLUSTER'+str(i) for i in range(n_classes-n_clusters)] + clusters += ['DEF_CLUSTER_'+str(i) for i in range(n_classes-n_clusters)] C = contingency_matrix(labels_true, labels_pred) true_idx, pred_idx = linear_assignment(-C).T From 1474d53accc0cc8cd7ac415fe4e964b5f6119b09 Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 19:04:55 -0300 Subject: [PATCH 36/39] fixing name style on labels --- sklearn/metrics/cluster/supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 72ec559d7c62a..673582128a78c 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -922,7 +922,7 @@ def map_cluster_labels(labels_true, labels_pred): if n_clusters > n_classes: classes += ['DEFAULT_LABEL_'+str(i) for i in range(n_clusters-n_classes)] elif n_classes > n_clusters: - clusters += ['DEF_CLUSTER_'+str(i) for i in range(n_classes-n_clusters)] + clusters += ['DEFAULT_CLUSTER_'+str(i) for i in range(n_classes-n_clusters)] C = contingency_matrix(labels_true, labels_pred) true_idx, pred_idx = linear_assignment(-C).T From f582884d3e364ee2464552e0493738de47380d2e Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 20:08:20 -0300 Subject: [PATCH 37/39] fixing line length --- sklearn/metrics/cluster/supervised.py | 9 ++++++--- sklearn/metrics/cluster/tests/test_supervised.py | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index 673582128a78c..d8722479705fc 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -903,7 +903,8 @@ def map_cluster_labels(labels_true, labels_pred): -------- >>> from sklearn.metrics import confusion_matrix >>> from sklearn.metrics.cluster import map_cluster_labels - >>> labels_true = ["class1", "class2", "class3", "class1", "class1", "class3"] + >>> labels_true = ["class1", "class2", "class3", "class1", "class1", + >>> "class3"] >>> labels_pred = [0, 0, 2, 2, 0, 2] >>> y_pred_translated = map_cluster_labels(labels_true, labels_pred) >>> y_pred_translated @@ -920,9 +921,11 @@ def map_cluster_labels(labels_true, labels_pred): n_clusters = len(clusters) if n_clusters > n_classes: - classes += ['DEFAULT_LABEL_'+str(i) for i in range(n_clusters-n_classes)] + classes += ['DEFAULT_LABEL_'+str(i) for i in + range(n_clusters-n_classes)] elif n_classes > n_clusters: - clusters += ['DEFAULT_CLUSTER_'+str(i) for i in range(n_classes-n_clusters)] + clusters += ['DEFAULT_CLUSTER_'+str(i) for i in + range(n_classes-n_clusters)] C = contingency_matrix(labels_true, labels_pred) true_idx, pred_idx = linear_assignment(-C).T diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index a8fec4f7f4743..44b56fd051f3d 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -292,7 +292,8 @@ def test_map_cluster_labels(): y_true = ['a', 'a', 'a', 'b', 'b', 'b'] y_pred = [4, 0, 1, 1, 2, 2] - expected = ['DEFAULT_LABEL_1', 'a', 'DEFAULT_LABEL_0', 'DEFAULT_LABEL_0', 'b', 'b'] + expected = ['DEFAULT_LABEL_1', 'a', 'DEFAULT_LABEL_0', 'DEFAULT_LABEL_0', + 'b', 'b'] y_pred_translated = map_cluster_labels(y_true, y_pred) 
assert_equal(y_pred_translated, expected) From 7227ab56fb9a2384b967920436b985f5c6ecdf5f Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 21:01:11 -0300 Subject: [PATCH 38/39] fixing line length --- sklearn/metrics/cluster/supervised.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index d8722479705fc..c0b0892fb5527 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -880,9 +880,9 @@ def map_cluster_labels(labels_true, labels_pred): """Translate prediction labels to maximize the accuracy. Translate the prediction labels of a clustering output to those in the - ground truth to enable calc of external metrics (eg. accuracy, f1_score, ...). - Translation is done by maximization of the confusion matrix :math:`C` main - diagonal sum :math:`\sum{i=0}^{K}C_{i, i}`. + ground truth to enable calc of external metrics (eg. accuracy, f1_score, + ...). Translation is done by maximization of the confusion matrix :math:`C` + main diagonal sum :math:`\sum{i=0}^{K}C_{i, i}`. Parameters ---------- @@ -894,7 +894,8 @@ def map_cluster_labels(labels_true, labels_pred): Returns ------- trans : array, shape = [n_classes, n_classes] - Mapping of labels_pred clusters, such that :math:`trans\subseteq labels_true` + Mapping of labels_pred clusters, such that :math:`trans\subseteq + labels_true` References ---------- From e1686d650339f8b1c16e0961039f56076745b50b Mon Sep 17 00:00:00 2001 From: LucasPugensFernandes Date: Sun, 1 Jul 2018 21:48:01 -0300 Subject: [PATCH 39/39] fixing comment code --- sklearn/metrics/cluster/supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/supervised.py b/sklearn/metrics/cluster/supervised.py index c0b0892fb5527..987438ac93a6b 100644 --- a/sklearn/metrics/cluster/supervised.py +++ b/sklearn/metrics/cluster/supervised.py @@ -905,7 +905,7 @@ def map_cluster_labels(labels_true, labels_pred): >>> from sklearn.metrics import confusion_matrix >>> from sklearn.metrics.cluster import map_cluster_labels >>> labels_true = ["class1", "class2", "class3", "class1", "class1", - >>> "class3"] + ... "class3"] >>> labels_pred = [0, 0, 2, 2, 0, 2] >>> y_pred_translated = map_cluster_labels(labels_true, labels_pred) >>> y_pred_translated
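
A short usage sketch of the helper added by this series: ``map_cluster_labels`` returns a list of length n_samples in which every cluster id is replaced by the ground-truth label it was matched to, so the result can be fed straight into ordinary classification metrics. The snippet below is only a sketch and assumes this branch is installed (so the import works); the dataset, estimator and parameter values are illustrative, and the exact scores depend on the clustering solution found.

    from sklearn.cluster import KMeans
    from sklearn.datasets import load_iris
    from sklearn.metrics import accuracy_score, confusion_matrix
    from sklearn.metrics.cluster import map_cluster_labels

    X, y_true = load_iris(return_X_y=True)

    # Cluster ids carry no class meaning on their own.
    y_clusters = KMeans(n_clusters=3, random_state=0).fit_predict(X)

    # Rename each cluster with the ground-truth label that maximizes the
    # contingency-matrix diagonal (linear assignment under the hood).
    y_mapped = map_cluster_labels(y_true, y_clusters)

    print(confusion_matrix(y_true, y_mapped))
    print(accuracy_score(y_true, y_mapped))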
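The design choice behind the helper is a plain assignment problem: build the contingency matrix between true classes and predicted clusters, then pick the cluster-to-class pairing that maximizes its diagonal sum. For readers who only want that idea without this branch, a minimal sketch using stock SciPy follows; ``map_labels_with_scipy`` is a hypothetical name, not part of scikit-learn, and the sketch only covers the case where every cluster can be matched to a class (no DEFAULT_LABEL_/DEFAULT_CLUSTER_ padding).

    import numpy as np
    from scipy.optimize import linear_sum_assignment
    from sklearn.metrics.cluster import contingency_matrix

    def map_labels_with_scipy(labels_true, labels_pred):
        # Rows are true classes, columns are predicted clusters.
        C = contingency_matrix(labels_true, labels_pred)
        classes = np.unique(labels_true)
        clusters = np.unique(labels_pred)
        # Maximize the diagonal sum by minimizing the negated counts.
        row_ind, col_ind = linear_sum_assignment(-C)
        mapping = {clusters[c]: classes[r] for r, c in zip(row_ind, col_ind)}
        return [mapping[label] for label in labels_pred]

    print(map_labels_with_scipy(['a', 'b', 'b', 'c', 'c', 'a'],
                                [1, 0, 0, 1, 2, 1]))
    # -> ['a', 'b', 'b', 'a', 'c', 'a'], matching the handcrafted test above.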