
ENH Add replace_undefined_by to accuracy_score #31187

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Status: Open · wants to merge 7 commits into base: main
@@ -0,0 +1,3 @@
- :func:`~metrics.accuracy_score` now has a `replace_undefined_by` param that can be
set to define the function's return value when the metric is undefined.
By :user:`Stefanie Senger <StefanieSenger>`
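
A minimal usage sketch of the proposed parameter (an assumption based on this PR's branch, not on any released scikit-learn version; outputs follow the diff below):

    import warnings

    import numpy as np
    from sklearn.metrics import accuracy_score

    empty = np.array([])
    with warnings.catch_warnings():
        # each call below also emits an UndefinedMetricWarning
        warnings.simplefilter("ignore")
        accuracy_score(empty, empty)                                # 0.0 (default)
        accuracy_score(empty, empty, replace_undefined_by=np.nan)   # nan
        accuracy_score(empty, empty, replace_undefined_by=0.5)      # 0.5
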
53 changes: 40 additions & 13 deletions sklearn/metrics/_classification.py
@@ -10,6 +10,7 @@
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import math
import warnings
from numbers import Integral, Real

@@ -291,10 +292,13 @@ def _validate_multiclass_probabilistic_prediction(
"y_pred": ["array-like", "sparse matrix"],
"normalize": ["boolean"],
"sample_weight": ["array-like", None],
"replace_undefined_by": [Interval(Real, 0.0, 1.0, closed="both"), np.nan],
},
prefer_skip_nested_validation=True,
)
def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None):
def accuracy_score(
y_true, y_pred, *, normalize=True, sample_weight=None, replace_undefined_by=0.0
):
"""Accuracy classification score.

In multilabel classification, this function computes subset accuracy:
@@ -318,14 +322,26 @@ def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None):
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.

replace_undefined_by : np.nan, int 0, float in `[0.0, 1.0]`, default=0.0
Sets the return value when `y_true` and `y_pred` are empty and the metric is
thus ill-defined. Can take the following values:

- `np.nan` to return `np.nan`
- a floating point value in the range of `[0.0, 1.0]`

Note that with `normalize=False` only `np.nan` or `0.0` can be returned
regardless of the value set, since `0.0 ≤ accuracy_score ≤ n_samples` and
the number of samples is zero when `y_true` and `y_pred` are empty.

.. versionadded:: 1.7

Returns
-------
score : float or int
If ``normalize == True``, return the fraction of correctly
classified samples (float), else returns the number of correctly
classified samples (int).
score : float
If ``normalize == True``, returns the fraction of correctly classified samples,
else returns the number of correctly classified samples.

The best performance is 1 with ``normalize == True`` and the number
The best performance is 1.0 with ``normalize == True`` and the number
of samples with ``normalize == False``.

See Also
@@ -360,6 +376,18 @@ def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None):
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
check_consistent_length(y_true, y_pred, sample_weight)

if _num_samples(y_true) == 0:
msg = (
"`y_true` and `y_pred` are empty. `accuracy_score` is undefined and "
"set to the value defined in the `replace_undefined_by` param, which "
"defaults to 0.0."
)
warnings.warn(msg, UndefinedMetricWarning, stacklevel=2)
if normalize:
return replace_undefined_by
else:
return replace_undefined_by if math.isnan(replace_undefined_by) else 0.0

if y_type.startswith("multilabel"):
differing_labels = _count_nonzero(y_true - y_pred, xp=xp, device=device, axis=1)
score = xp.asarray(differing_labels == 0, device=device)
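
A small sketch of the `normalize=False` branch added above (again assuming this PR's branch): whatever value is requested, only `np.nan` or `0.0` can come back, since the count of correct samples is bounded by the (zero) number of samples. Both calls also emit an UndefinedMetricWarning:

    import numpy as np
    from sklearn.metrics import accuracy_score

    empty = np.array([])
    accuracy_score(empty, empty, normalize=False, replace_undefined_by=0.5)     # 0.0, not 0.5
    accuracy_score(empty, empty, normalize=False, replace_undefined_by=np.nan)  # nan
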
@@ -1210,9 +1238,8 @@ def matthews_corrcoef(y_true, y_pred, *, sample_weight=None):
def zero_one_loss(y_true, y_pred, *, normalize=True, sample_weight=None):
"""Zero-one classification loss.

If normalize is ``True``, return the fraction of misclassifications
(float), else it returns the number of misclassifications (int). The best
performance is 0.
If normalize is ``True``, returns the fraction of misclassifications, else returns
the number of misclassifications. The best performance is 0.

Read more in the :ref:`User Guide <zero_one_loss>`.

Expand All @@ -1233,9 +1260,9 @@ def zero_one_loss(y_true, y_pred, *, normalize=True, sample_weight=None):

Returns
-------
loss : float or int,
If ``normalize == True``, return the fraction of misclassifications
(float), else it returns the number of misclassifications (int).
loss : float,
If ``normalize == True``, returns the fraction of misclassifications, else
returns the number of misclassifications.

See Also
--------
@@ -3081,7 +3108,7 @@ def hamming_loss(y_true, y_pred, *, sample_weight=None):

Returns
-------
loss : float or int
loss : float
Member:

The return types are changed here. I agree it should always be float for all of them, but it'd be nice to have a test for all the cases to make sure it's actually float.

@StefanieSenger (Contributor Author), Apr 29, 2025:

I will do this in a separate PR.

These tests had already been added in #30575.
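
Not part of this diff, but roughly the kind of float-return check the comment above asks for (a sketch under the assumption that #30575 already switched these metrics to plain float returns):

    import numpy as np
    from sklearn.metrics import hamming_loss, zero_one_loss

    y_true = np.array([0, 1, 1, 0])
    y_pred = np.array([0, 1, 0, 0])

    # each call should return a float, never an int, per the docstring changes above
    assert isinstance(zero_one_loss(y_true, y_pred), float)
    assert isinstance(zero_one_loss(y_true, y_pred, normalize=False), float)
    assert isinstance(hamming_loss(y_true, y_pred), float)
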

Return the average Hamming loss between elements of ``y_true`` and
``y_pred``.

37 changes: 37 additions & 0 deletions sklearn/metrics/tests/test_classification.py
@@ -1,3 +1,4 @@
import math
import re
import warnings
from functools import partial
@@ -251,6 +252,42 @@ def test_multilabel_accuracy_score_subset_accuracy():
assert accuracy_score(y2, np.zeros(y1.shape)) == 0


@pytest.mark.parametrize("replace_undefined_by", [0.0, 0.5, np.nan])
def test_accuracy_score_undefined(replace_undefined_by):
"""Test that accuracy_score returns the argument set in the `replace_undefined_by`
param when the metric is undefined."""

def check_equal(res, exp):
if np.isnan(res) and np.isnan(exp):
return True
return res == exp

y_true = y_pred = np.array([])

acc = accuracy_score(y_true, y_pred, replace_undefined_by=replace_undefined_by)
assert check_equal(acc, replace_undefined_by)

acc = accuracy_score(
y_true, y_pred, normalize=False, replace_undefined_by=replace_undefined_by
)
if math.isnan(replace_undefined_by):
assert check_equal(acc, np.nan)
else:
# can only return 0 here, to stay within the range of values the metric
# could take if it were defined:
assert acc == 0


def test_accuracy_score_undefined_raises_warning():
"""Test that accuracy_score raises UndefinedMetricWarning when y_true and y_pred are
empty."""
with pytest.warns(
UndefinedMetricWarning,
match="`y_true` and `y_pred` are empty. `accuracy_score` is undefined",
):
accuracy_score(np.array([]), np.array([]))


def test_precision_recall_f1_score_binary():
# Test Precision Recall and F1 Score for binary classification task
y_true, y_pred, _ = make_prediction(binary=True)