From 829fa6586e8a865a3765e79ad7d9b5c546038d4d Mon Sep 17 00:00:00 2001
From: jeremiedbb
Date: Tue, 16 Apr 2024 18:56:30 +0200
Subject: [PATCH 1/7] cln deprecations [doc build]

---
 sklearn/metrics/_classification.py           | 52 +++---
 sklearn/metrics/tests/test_classification.py | 84 ++++++--------
 sklearn/metrics/tests/test_common.py         | 11 ++-
 3 files changed, 45 insertions(+), 102 deletions(-)

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 62a6c87428e9a..623a12929b8d1 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -2816,7 +2816,6 @@ def hamming_loss(y_true, y_pred, *, sample_weight=None):
     {
         "y_true": ["array-like"],
         "y_pred": ["array-like"],
-        "eps": [StrOptions({"auto"}), Interval(Real, 0, 1, closed="both")],
         "normalize": ["boolean"],
         "sample_weight": ["array-like", None],
         "labels": ["array-like", None],
@@ -2824,7 +2823,7 @@ def hamming_loss(y_true, y_pred, *, sample_weight=None):
     prefer_skip_nested_validation=True,
 )
 def log_loss(
-    y_true, y_pred, *, eps="auto", normalize=True, sample_weight=None, labels=None
+    y_true, y_pred, *, normalize=True, sample_weight=None, labels=None
 ):
     r"""Log loss, aka logistic loss or cross-entropy loss.

@@ -2855,20 +2854,6 @@ def log_loss(
         ordered alphabetically, as done by
         :class:`~sklearn.preprocessing.LabelBinarizer`.

-    eps : float or "auto", default="auto"
-        Log loss is undefined for p=0 or p=1, so probabilities are
-        clipped to `max(eps, min(1 - eps, p))`. The default will depend on the
-        data type of `y_pred` and is set to `np.finfo(y_pred.dtype).eps`.
-
-        .. versionadded:: 1.2
-
-        .. versionchanged:: 1.2
-           The default value changed from `1e-15` to `"auto"` that is
-           equivalent to `np.finfo(y_pred.dtype).eps`.
-
-        .. deprecated:: 1.3
-           `eps` is deprecated in 1.3 and will be removed in 1.5.
-
     normalize : bool, default=True
         If true, return the mean loss per sample. Otherwise, return the sum
         of the per-sample losses.
@@ -2907,18 +2892,6 @@ def log_loss(
     y_pred = check_array(
         y_pred, ensure_2d=False, dtype=[np.float64, np.float32, np.float16]
     )
-    if eps == "auto":
-        eps = np.finfo(y_pred.dtype).eps
-    else:
-        # TODO: Remove user defined eps in 1.5
-        warnings.warn(
-            (
-                "Setting the eps parameter is deprecated and will "
-                "be removed in 1.5. Instead eps will always have"
-                "a default value of `np.finfo(y_pred.dtype).eps`."
-            ),
-            FutureWarning,
-        )

     check_consistent_length(y_pred, y_true, sample_weight)
     lb = LabelBinarizer()
@@ -2949,9 +2922,6 @@ def log_loss(
             1 - transformed_labels, transformed_labels, axis=1
         )

-    # Clipping
-    y_pred = np.clip(y_pred, eps, 1 - eps)
-
     # If y_pred is of single dimension, assume y_true to be binary
     # and then check.
     if y_pred.ndim == 1:
@@ -2959,6 +2929,15 @@ def log_loss(
     if y_pred.shape[1] == 1:
         y_pred = np.append(1 - y_pred, y_pred, axis=1)

+    # Make sure y_pred is normalized
+    y_pred_sum = y_pred.sum(axis=1)
+    if not np.allclose(y_pred_sum, 1, rtol=1e-15):
+        raise ValueError("The y_pred values do not sum to one.")
+
+    # Clipping
+    eps = np.finfo(y_pred.dtype).eps
+    y_pred = np.clip(y_pred, eps, 1 - eps)
+
     # Check if dimensions are consistent.
     transformed_labels = check_array(transformed_labels)
     if len(lb.classes_) != y_pred.shape[1]:
@@ -2979,17 +2958,6 @@ def log_loss(
             "labels: {0}".format(lb.classes_)
         )

-    # Renormalize
-    y_pred_sum = y_pred.sum(axis=1)
-    if not np.isclose(y_pred_sum, 1, rtol=1e-15, atol=5 * eps).all():
-        warnings.warn(
-            (
-                "The y_pred values do not sum to one. Starting from 1.5 this"
-                "will result in an error."
-            ),
-            UserWarning,
-        )
-    y_pred = y_pred / y_pred_sum[:, np.newaxis]
     loss = -xlogy(transformed_labels, y_pred).sum(axis=1)

     return float(_average(loss, weights=sample_weight, normalize=normalize))
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index bbebe2cba2197..93237e8fc4ecf 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -2624,62 +2624,37 @@ def test_log_loss():
     )
     loss = log_loss(y_true, y_pred)
     loss_true = -np.mean(bernoulli.logpmf(np.array(y_true) == "yes", y_pred[:, 1]))
-    assert_almost_equal(loss, loss_true)
+    assert_allclose(loss, loss_true)

     # multiclass case; adapted from http://bit.ly/RJJHWA
     y_true = [1, 0, 2]
     y_pred = [[0.2, 0.7, 0.1], [0.6, 0.2, 0.2], [0.6, 0.1, 0.3]]
     loss = log_loss(y_true, y_pred, normalize=True)
-    assert_almost_equal(loss, 0.6904911)
+    assert_allclose(loss, 0.6904911)

     # check that we got all the shapes and axes right
     # by doubling the length of y_true and y_pred
     y_true *= 2
     y_pred *= 2
     loss = log_loss(y_true, y_pred, normalize=False)
-    assert_almost_equal(loss, 0.6904911 * 6, decimal=6)
-
-    user_warning_msg = "y_pred values do not sum to one"
-    # check eps and handling of absolute zero and one probabilities
-    y_pred = np.asarray(y_pred) > 0.5
-    with pytest.warns(FutureWarning):
-        loss = log_loss(y_true, y_pred, normalize=True, eps=0.1)
-    with pytest.warns(UserWarning, match=user_warning_msg):
-        assert_almost_equal(loss, log_loss(y_true, np.clip(y_pred, 0.1, 0.9)))
-
-    # binary case: check correct boundary values for eps = 0
-    with pytest.warns(FutureWarning):
-        assert log_loss([0, 1], [0, 1], eps=0) == 0
-    with pytest.warns(FutureWarning):
-        assert log_loss([0, 1], [0, 0], eps=0) == np.inf
-    with pytest.warns(FutureWarning):
-        assert log_loss([0, 1], [1, 1], eps=0) == np.inf
-
-    # multiclass case: check correct boundary values for eps = 0
-    with pytest.warns(FutureWarning):
-        assert log_loss([0, 1, 2], [[1, 0, 0], [0, 1, 0], [0, 0, 1]], eps=0) == 0
-    with pytest.warns(FutureWarning):
-        assert (
-            log_loss([0, 1, 2], [[0, 0.5, 0.5], [0, 1, 0], [0, 0, 1]], eps=0) == np.inf
-        )
+    assert_allclose(loss, 0.6904911 * 6)

     # raise error if number of classes are not equal.
     y_true = [1, 0, 2]
-    y_pred = [[0.2, 0.7], [0.6, 0.5], [0.4, 0.1]]
+    y_pred = [[0.3, 0.7], [0.6, 0.4], [0.4, 0.6]]
     with pytest.raises(ValueError):
         log_loss(y_true, y_pred)

     # case when y_true is a string array object
     y_true = ["ham", "spam", "spam", "ham"]
-    y_pred = [[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]]
-    with pytest.warns(UserWarning, match=user_warning_msg):
-        loss = log_loss(y_true, y_pred)
-    assert_almost_equal(loss, 1.0383217, decimal=6)
+    y_pred = [[0.3, 0.7], [0.6, 0.4], [0.4, 0.6], [0.7, 0.3]]
+    loss = log_loss(y_true, y_pred)
+    assert_allclose(loss, 0.7469410)

     # test labels option
     y_true = [2, 2]
-    y_pred = [[0.2, 0.7], [0.6, 0.5]]
+    y_pred = [[0.2, 0.8], [0.6, 0.4]]
     y_score = np.array([[0.1, 0.9], [0.1, 0.9]])
     error_str = (
         r"y_true contains only one label \(2\). Please provide "
         r"the true labels explicitly through the labels argument."
     )
     with pytest.raises(ValueError, match=error_str):
         log_loss(y_true, y_pred)

-    y_pred = [[0.2, 0.7], [0.6, 0.5], [0.2, 0.3]]
-    error_str = "Found input variables with inconsistent numbers of samples: [3, 2]"
-    (ValueError, error_str, log_loss, y_true, y_pred)
+    y_pred = [[0.2, 0.8], [0.6, 0.4], [0.7, 0.3]]
+    error_str = r"Found input variables with inconsistent numbers of samples: \[3, 2\]"
+    with pytest.raises(ValueError, match=error_str):
+        log_loss(y_true, y_pred)

     # works when the labels argument is used
     true_log_loss = -np.mean(np.log(y_score[:, 1]))
     calculated_log_loss = log_loss(y_true, y_score, labels=[1, 2])
-    assert_almost_equal(calculated_log_loss, true_log_loss)
+    assert_allclose(calculated_log_loss, true_log_loss)

     # ensure labels work when len(np.unique(y_true)) != y_pred.shape[1]
     y_true = [1, 2, 2]
-    y_score2 = [[0.2, 0.7, 0.3], [0.6, 0.5, 0.3], [0.3, 0.9, 0.1]]
-    with pytest.warns(UserWarning, match=user_warning_msg):
-        loss = log_loss(y_true, y_score2, labels=[1, 2, 3])
-    assert_almost_equal(loss, 1.0630345, decimal=6)
+    y_score2 = [[0.1, 0.6, 0.3], [0.2, 0.5, 0.3], [0.4, 0.5, 0.1]]
+    loss = log_loss(y_true, y_score2, labels=[1, 2, 3])
+    assert_allclose(loss, 1.2296264)


-def test_log_loss_eps_auto(global_dtype):
-    """Check the behaviour of `eps="auto"` that changes depending on the input
-    array dtype.
+@pytest.mark.parametrize("dtype", [np.float64, np.float32, np.float16])
+def test_log_loss_eps(dtype):
+    """Check the behaviour of the internal eps, which changes with the input dtype.
+
     Non-regression test for:
     https://github.com/scikit-learn/scikit-learn/issues/24315
     """
-    y_true = np.array([0, 1], dtype=global_dtype)
+    y_true = np.array([0, 1], dtype=dtype)
     y_pred = y_true.copy()

-    loss = log_loss(y_true, y_pred, eps="auto")
-    assert np.isfinite(loss)
-
-
-def test_log_loss_eps_auto_float16():
-    """Check the behaviour of `eps="auto"` for np.float16"""
-    y_true = np.array([0, 1], dtype=np.float16)
-    y_pred = y_true.copy()
-
-    loss = log_loss(y_true, y_pred, eps="auto")
+    loss = log_loss(y_true, y_pred)
     assert np.isfinite(loss)


 def test_log_loss_pandas_input():
     # case when input is a pandas series and dataframe gh-5715
     y_tr = np.array(["ham", "spam", "spam", "ham"])
-    y_pr = np.array([[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]])
+    y_pr = np.array([[0.3, 0.7], [0.6, 0.4], [0.4, 0.6], [0.7, 0.3]])
     types = [(MockDataFrame, MockDataFrame)]
     try:
         from pandas import DataFrame, Series
@@ -2742,9 +2709,8 @@ def test_log_loss_pandas_input():
     for TrueInputType, PredInputType in types:
         # y_pred dataframe, y_true series
         y_true, y_pred = TrueInputType(y_tr), PredInputType(y_pr)
-        with pytest.warns(UserWarning, match="y_pred values do not sum to one"):
-            loss = log_loss(y_true, y_pred)
-        assert_almost_equal(loss, 1.0383217, decimal=6)
+        loss = log_loss(y_true, y_pred)
+        assert_almost_equal(loss, 0.7469410, decimal=6)


 def test_brier_score_loss():
diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py
index e84ef1e358473..228efddeac32d 100644
--- a/sklearn/metrics/tests/test_common.py
+++ b/sklearn/metrics/tests/test_common.py
@@ -639,6 +639,9 @@ def test_sample_order_invariance_multilabel_and_multioutput():
     y_pred = random_state.randint(0, 2, size=(20, 25))
     y_score = random_state.normal(size=y_true.shape)

+    # Some metrics (e.g. log_loss) require y_score to be probabilities (sum to 1)
+    y_score /= y_score.sum(axis=1)[:, np.newaxis]
+
     y_true_shuffle, y_pred_shuffle, y_score_shuffle = shuffle(
         y_true, y_pred, y_score, random_state=0
     )
@@ -1566,7 +1569,10 @@ def test_multilabel_sample_weight_invariance(name):
     )
     y_true = np.vstack([ya, yb])
     y_pred = np.vstack([ya, ya])
-    y_score = random_state.randint(1, 4, size=y_true.shape)
+    y_score = random_state.normal(size=y_true.shape)
+
+    # Some metrics (e.g. log_loss) require y_score to be probabilities (sum to 1)
+    y_score /= y_score.sum(axis=1)[:, np.newaxis]

     metric = ALL_METRICS[name]
     if name in THRESHOLDED_METRICS:
@@ -1631,6 +1637,9 @@ def test_thresholded_multilabel_multioutput_permutations_invariance(name):
     y_true = random_state.randint(0, 2, size=(n_samples, n_classes))
     y_score = random_state.normal(size=y_true.shape)

+    # Some metrics (e.g. log_loss) require y_score to be probabilities (sum to 1)
+    y_score /= y_score.sum(axis=1)[:, np.newaxis]
+
     # Makes sure all samples have at least one label. This works around errors
     # when running metrics where average="sample"
     y_true[y_true.sum(1) == 4, 0] = 0

From 12765b0b5a64fc72b7ed17fa79be0c8172d0338b Mon Sep 17 00:00:00 2001
From: jeremiedbb
Date: Tue, 16 Apr 2024 19:06:47 +0200
Subject: [PATCH 2/7] lint [doc build]

---
 sklearn/metrics/_classification.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 623a12929b8d1..865d762cf97e4 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -2822,9 +2822,7 @@ def hamming_loss(y_true, y_pred, *, sample_weight=None):
     },
     prefer_skip_nested_validation=True,
 )
-def log_loss(
-    y_true, y_pred, *, normalize=True, sample_weight=None, labels=None
-):
+def log_loss(y_true, y_pred, *, normalize=True, sample_weight=None, labels=None):
     r"""Log loss, aka logistic loss or cross-entropy loss.

     This is the loss function used in (multinomial) logistic regression
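
Taken together, patches 1 and 2 drop the deprecated `eps` parameter in favour of an always-on clipping threshold derived from `y_pred`'s dtype, and turn unnormalized predictions into a hard error. The following is a minimal, hypothetical sketch of the behaviour they target; it assumes a scikit-learn build with only patches 1-2 applied, and a later patch in this series relaxes the error to a warning:

    # Hypothetical usage: assumes only patches 1-2 of this series are applied.
    import numpy as np
    from sklearn.metrics import log_loss

    # Clipping now always uses np.finfo(y_pred.dtype).eps, so hard 0/1
    # predictions stay finite even for float16 input (see issue 24315).
    y_true = np.array([0, 1], dtype=np.float16)
    print(np.isfinite(log_loss(y_true, y_true.copy())))  # True

    # Rows of y_pred must sum to one; at this point in the series this is
    # rejected with an exception rather than silently renormalized.
    try:
        log_loss([0, 1, 1], [[0.2, 0.7], [0.6, 0.3], [0.4, 0.7]])
    except ValueError as exc:
        print(exc)  # The y_pred values do not sum to one.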

From bb13aa2cd4e485808dece7a1e32255e867d8dd1f Mon Sep 17 00:00:00 2001
From: jeremiedbb
Date: Wed, 17 Apr 2024 12:17:33 +0200
Subject: [PATCH 3/7] add test for sum(probas) != 1 error

---
 sklearn/metrics/_classification.py           | 2 +-
 sklearn/metrics/tests/test_classification.py | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 865d762cf97e4..b83d22fd45a31 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -2929,7 +2929,7 @@ def log_loss(y_true, y_pred, *, normalize=True, sample_weight=None, labels=None)

     # Make sure y_pred is normalized
     y_pred_sum = y_pred.sum(axis=1)
-    if not np.allclose(y_pred_sum, 1, rtol=1e-15):
+    if not np.allclose(y_pred_sum, 1):
         raise ValueError("The y_pred values do not sum to one.")

     # Clipping
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 93237e8fc4ecf..9d2d8c02f55db 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -2695,6 +2695,15 @@ def test_log_loss_eps(dtype):
     assert np.isfinite(loss)


+def test_log_loss_not_probabilities_error():
+    """Check that log_loss raises an error when y_pred values don't sum to 1."""
+    y_true = np.array([0, 1, 1, 0])
+    y_pred = np.array([[0.2, 0.7], [0.6, 0.3], [0.4, 0.7], [0.8, 0.3]])
+
+    with pytest.raises(ValueError, match="The y_pred values do not sum to one."):
+        log_loss(y_true, y_pred)
+
+
 def test_log_loss_pandas_input():
     # case when input is a pandas series and dataframe gh-5715
     y_tr = np.array(["ham", "spam", "spam", "ham"])

From bcb5f3a3d5227189c943168716572c9ab64c79fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
Date: Thu, 18 Apr 2024 16:06:01 +0200
Subject: [PATCH 4/7] fix test for inf edge case

---
 sklearn/metrics/tests/test_classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 9d2d8c02f55db..77ded25c02b39 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -2689,7 +2689,7 @@ def test_log_loss_eps(dtype):
     https://github.com/scikit-learn/scikit-learn/issues/24315
     """
     y_true = np.array([0, 1], dtype=dtype)
-    y_pred = y_true.copy()
+    y_pred = np.array([1, 0], dtype=dtype)

     loss = log_loss(y_true, y_pred)
     assert np.isfinite(loss)

From fc4edaa215b88f938b93f66d3a79cfc34a352c1c Mon Sep 17 00:00:00 2001
From: jeremiedbb
Date: Wed, 24 Apr 2024 15:51:19 +0200
Subject: [PATCH 5/7] warn for non-probabilities

---
 sklearn/metrics/_classification.py           | 13 ++++++++++---
 sklearn/metrics/tests/test_classification.py |  6 +++---
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index b83d22fd45a31..1cb49abf4cd61 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -2852,6 +2852,9 @@ def log_loss(y_true, y_pred, *, normalize=True, sample_weight=None, labels=None)
         ordered alphabetically, as done by
         :class:`~sklearn.preprocessing.LabelBinarizer`.

+    `y_pred` values are clipped to `[eps, 1-eps]` where `eps` is the machine
+    precsion for y_pred's dtype.
+
     normalize : bool, default=True
         If true, return the mean loss per sample. Otherwise, return the sum
         of the per-sample losses.
@@ -2927,13 +2930,17 @@ def log_loss(y_true, y_pred, *, normalize=True, sample_weight=None, labels=None)
     if y_pred.shape[1] == 1:
         y_pred = np.append(1 - y_pred, y_pred, axis=1)

+    eps = np.finfo(y_pred.dtype).eps
+
     # Make sure y_pred is normalized
     y_pred_sum = y_pred.sum(axis=1)
-    if not np.allclose(y_pred_sum, 1):
-        raise ValueError("The y_pred values do not sum to one.")
+    if not np.allclose(y_pred_sum, 1, rtol=np.sqrt(eps)):
+        warnings.warn(
+            "The y_pred values do not sum to one. Make sure to pass probabilities.",
+            UserWarning,
+        )

     # Clipping
-    eps = np.finfo(y_pred.dtype).eps
     y_pred = np.clip(y_pred, eps, 1 - eps)

     # Check if dimensions are consistent.
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 77ded25c02b39..97daa2e29e6a6 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -2695,12 +2695,12 @@ def test_log_loss_eps(dtype):
     assert np.isfinite(loss)


-def test_log_loss_not_probabilities_error():
-    """Check that log_loss raises an error when y_pred values don't sum to 1."""
+def test_log_loss_not_probabilities_warning():
+    """Check that log_loss raises a warning when y_pred values don't sum to 1."""
     y_true = np.array([0, 1, 1, 0])
     y_pred = np.array([[0.2, 0.7], [0.6, 0.3], [0.4, 0.7], [0.8, 0.3]])

-    with pytest.raises(ValueError, match="The y_pred values do not sum to one."):
+    with pytest.warns(UserWarning, match="The y_pred values do not sum to one."):
         log_loss(y_true, y_pred)
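
Patch 5 softens the check: instead of raising, `log_loss` now emits a `UserWarning` whenever the row sums of `y_pred` deviate from one by more than `np.sqrt(eps)`. A small sketch of the intended effect, assuming a build that includes the series up to this commit:

    # Hypothetical usage: assumes patches 1-5 of this series are applied.
    import warnings
    import numpy as np
    from sklearn.metrics import log_loss

    y_true = np.array([0, 1, 1, 0])
    # Row sums are 0.9, 0.9, 1.1 and 1.1, so these are not probabilities.
    y_pred = np.array([[0.2, 0.7], [0.6, 0.3], [0.4, 0.7], [0.8, 0.3]])

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        loss = log_loss(y_true, y_pred)  # the loss is still computed

    print(caught[0].category.__name__)  # UserWarning
    print(caught[0].message)
    # The y_pred values do not sum to one. Make sure to pass probabilities.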

From eb26d8ee63f56be5174be4b7cb41b3ef3d7eb7c9 Mon Sep 17 00:00:00 2001
From: jeremiedbb
Date: Wed, 24 Apr 2024 19:12:34 +0200
Subject: [PATCH 6/7] address review comments

---
 sklearn/metrics/_classification.py           |  2 +-
 sklearn/metrics/tests/test_classification.py | 25 ++++++++++++++++----
 sklearn/metrics/tests/test_common.py         | 12 +++++-----
 3 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 1cb49abf4cd61..f2d38b0818659 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -2853,7 +2853,7 @@ def log_loss(y_true, y_pred, *, normalize=True, sample_weight=None, labels=None)
         :class:`~sklearn.preprocessing.LabelBinarizer`.

     `y_pred` values are clipped to `[eps, 1-eps]` where `eps` is the machine
-    precsion for y_pred's dtype.
+    precision for y_pred's dtype.

     normalize : bool, default=True
         If true, return the mean loss per sample.
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 97daa2e29e6a6..144871c8d02ee 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -2676,9 +2676,9 @@ def test_log_loss():

     # ensure labels work when len(np.unique(y_true)) != y_pred.shape[1]
     y_true = [1, 2, 2]
-    y_score2 = [[0.1, 0.6, 0.3], [0.2, 0.5, 0.3], [0.4, 0.5, 0.1]]
+    y_score2 = [[0.7, 0.1, 0.2], [0.2, 0.7, 0.1], [0.1, 0.7, 0.2]]
     loss = log_loss(y_true, y_score2, labels=[1, 2, 3])
-    assert_allclose(loss, 1.2296264)
+    assert_allclose(loss, -np.log(0.7))


 @pytest.mark.parametrize("dtype", [np.float64, np.float32, np.float16])
@@ -2695,15 +2695,30 @@ def test_log_loss_eps(dtype):
     assert np.isfinite(loss)


-def test_log_loss_not_probabilities_warning():
+@pytest.mark.parametrize("dtype", [np.float64, np.float32, np.float16])
+def test_log_loss_not_probabilities_warning(dtype):
     """Check that log_loss raises a warning when y_pred values don't sum to 1."""
     y_true = np.array([0, 1, 1, 0])
-    y_pred = np.array([[0.2, 0.7], [0.6, 0.3], [0.4, 0.7], [0.8, 0.3]])
+    y_pred = np.array([[0.2, 0.7], [0.6, 0.3], [0.4, 0.7], [0.8, 0.3]], dtype=dtype)

     with pytest.warns(UserWarning, match="The y_pred values do not sum to one."):
         log_loss(y_true, y_pred)


+@pytest.mark.parametrize(
+    "y_true, y_pred",
+    [
+        ([0, 1, 0], [0, 1, 0]),
+        ([0, 1, 0], [[1, 0], [0, 1], [1, 0]]),
+        ([0, 1, 2], [[1, 0, 0], [0, 1, 0], [0, 0, 1]]),
+    ],
+)
+def test_log_loss_perfect_predictions(y_true, y_pred):
+    """Check that log_loss returns 0 for perfect predictions."""
+    # Because of the clipping, the result is not exactly 0
+    assert log_loss(y_true, y_pred) == pytest.approx(0)
+
+
 def test_log_loss_pandas_input():
     # case when input is a pandas series and dataframe gh-5715
     y_tr = np.array(["ham", "spam", "spam", "ham"])
@@ -2719,7 +2734,7 @@ def test_log_loss_pandas_input():
     for TrueInputType, PredInputType in types:
         # y_pred dataframe, y_true series
         y_true, y_pred = TrueInputType(y_tr), PredInputType(y_pr)
         loss = log_loss(y_true, y_pred)
-        assert_almost_equal(loss, 0.7469410, decimal=6)
+        assert_allclose(loss, 0.7469410)
diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py
index 228efddeac32d..886f870da6adf 100644
--- a/sklearn/metrics/tests/test_common.py
+++ b/sklearn/metrics/tests/test_common.py
@@ -637,10 +637,10 @@ def test_sample_order_invariance_multilabel_and_multioutput():
     # Generate some data
     y_true = random_state.randint(0, 2, size=(20, 25))
     y_pred = random_state.randint(0, 2, size=(20, 25))
-    y_score = random_state.normal(size=y_true.shape)
+    y_score = random_state.uniform(size=y_true.shape)

     # Some metrics (e.g. log_loss) require y_score to be probabilities (sum to 1)
-    y_score /= y_score.sum(axis=1)[:, np.newaxis]
+    y_score /= y_score.sum(axis=1, keepdims=True)

     y_true_shuffle, y_pred_shuffle, y_score_shuffle = shuffle(
         y_true, y_pred, y_score, random_state=0
     )
@@ -1569,10 +1569,10 @@ def test_multilabel_sample_weight_invariance(name):
     )
     y_true = np.vstack([ya, yb])
     y_pred = np.vstack([ya, ya])
-    y_score = random_state.normal(size=y_true.shape)
+    y_score = random_state.uniform(size=y_true.shape)

     # Some metrics (e.g. log_loss) require y_score to be probabilities (sum to 1)
-    y_score /= y_score.sum(axis=1)[:, np.newaxis]
+    y_score /= y_score.sum(axis=1, keepdims=True)

     metric = ALL_METRICS[name]
     if name in THRESHOLDED_METRICS:
@@ -1635,10 +1635,10 @@ def test_thresholded_multilabel_multioutput_permutations_invariance(name):
     random_state = check_random_state(0)
     n_samples, n_classes = 20, 4
     y_true = random_state.randint(0, 2, size=(n_samples, n_classes))
-    y_score = random_state.normal(size=y_true.shape)
+    y_score = random_state.uniform(size=y_true.shape)

     # Some metrics (e.g. log_loss) require y_score to be probabilities (sum to 1)
-    y_score /= y_score.sum(axis=1)[:, np.newaxis]
+    y_score /= y_score.sum(axis=1, keepdims=True)

     # Makes sure all samples have at least one label. This works around errors
     # when running metrics where average="sample"
     y_true[y_true.sum(1) == 4, 0] = 0

From dac73bc00f43cbdeda199458ac4e8afa61131254 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
Date: Fri, 26 Apr 2024 18:19:15 +0200
Subject: [PATCH 7/7] Update sklearn/metrics/_classification.py

Co-authored-by: Guillaume Lemaitre
---
 sklearn/metrics/_classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index f2d38b0818659..caa4db5479a29 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -2853,7 +2853,7 @@ def log_loss(y_true, y_pred, *, normalize=True, sample_weight=None, labels=None)
         :class:`~sklearn.preprocessing.LabelBinarizer`.

     `y_pred` values are clipped to `[eps, 1-eps]` where `eps` is the machine
-    precision for y_pred's dtype.
+    precision for `y_pred`'s dtype.

     normalize : bool, default=True
         If true, return the mean loss per sample.
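
The net effect of the whole series, sketched as hypothetical usage against a scikit-learn build that includes all seven patches:

    # Hypothetical usage: assumes the full 7-patch series is applied.
    import numpy as np
    from sklearn.metrics import log_loss

    # Perfect one-hot predictions are clipped to [eps, 1 - eps] internally,
    # so the loss is finite and approximately (not exactly) zero.
    print(log_loss([0, 1, 2], [[1, 0, 0], [0, 1, 0], [0, 0, 1]]))

    # The clipping threshold follows the dtype of y_pred instead of the old
    # fixed 1e-15, so lower-precision inputs stay well defined.
    for dtype in (np.float64, np.float32, np.float16):
        y_pred = np.array([[1.0, 0.0], [0.0, 1.0]], dtype=dtype)
        print(dtype.__name__, np.finfo(dtype).eps, log_loss([0, 1], y_pred))

With the parameter removed from the signature, passing `eps` now fails with a `TypeError` (unexpected keyword argument) rather than a deprecation warning.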