Make scorers return python floats #30575

Merged (8 commits) on Feb 3, 2025

34 changes: 17 additions & 17 deletions doc/modules/clustering.rst
@@ -1305,7 +1305,7 @@ ignoring permutations::
>>> labels_true = [0, 0, 0, 1, 1, 1]
>>> labels_pred = [0, 0, 1, 1, 2, 2]
>>> metrics.rand_score(labels_true, labels_pred)
np.float64(0.66...)
0.66...

The Rand index does not ensure to obtain a value close to 0.0 for a
random labelling. The adjusted Rand index **corrects for chance** and
@@ -1319,7 +1319,7 @@ labels, rename 2 to 3, and get the same score::

>>> labels_pred = [1, 1, 0, 0, 3, 3]
>>> metrics.rand_score(labels_true, labels_pred)
np.float64(0.66...)
0.66...
>>> metrics.adjusted_rand_score(labels_true, labels_pred)
0.24...

@@ -1328,7 +1328,7 @@ Furthermore, both :func:`rand_score` :func:`adjusted_rand_score` are
thus be used as **consensus measures**::

>>> metrics.rand_score(labels_pred, labels_true)
np.float64(0.66...)
0.66...
>>> metrics.adjusted_rand_score(labels_pred, labels_true)
0.24...

@@ -1348,7 +1348,7 @@ will not necessarily be close to zero.::
>>> labels_true = [0, 0, 0, 0, 0, 0, 1, 1]
>>> labels_pred = [0, 1, 2, 3, 4, 5, 5, 6]
>>> metrics.rand_score(labels_true, labels_pred)
np.float64(0.39...)
0.39...
>>> metrics.adjusted_rand_score(labels_true, labels_pred)
-0.07...

@@ -1644,16 +1644,16 @@ We can turn those concept as scores :func:`homogeneity_score` and
>>> labels_pred = [0, 0, 1, 1, 2, 2]

>>> metrics.homogeneity_score(labels_true, labels_pred)
np.float64(0.66...)
0.66...

>>> metrics.completeness_score(labels_true, labels_pred)
np.float64(0.42...)
0.42...

Their harmonic mean called **V-measure** is computed by
:func:`v_measure_score`::

>>> metrics.v_measure_score(labels_true, labels_pred)
np.float64(0.51...)
0.51...

This function's formula is as follows:

@@ -1662,12 +1662,12 @@ This function's formula is as follows:
`beta` defaults to a value of 1.0, but for using a value less than 1 for beta::

>>> metrics.v_measure_score(labels_true, labels_pred, beta=0.6)
np.float64(0.54...)
0.54...

more weight will be attributed to homogeneity, and using a value greater than 1::

>>> metrics.v_measure_score(labels_true, labels_pred, beta=1.8)
np.float64(0.48...)
0.48...

more weight will be attributed to completeness.

@@ -1678,14 +1678,14 @@ Homogeneity, completeness and V-measure can be computed at once using
:func:`homogeneity_completeness_v_measure` as follows::

>>> metrics.homogeneity_completeness_v_measure(labels_true, labels_pred)
(np.float64(0.66...), np.float64(0.42...), np.float64(0.51...))
(0.66..., 0.42..., 0.51...)

The following clustering assignment is slightly better, since it is
homogeneous but not complete::

>>> labels_pred = [0, 0, 0, 1, 2, 2]
>>> metrics.homogeneity_completeness_v_measure(labels_true, labels_pred)
(np.float64(1.0), np.float64(0.68...), np.float64(0.81...))
(1.0, 0.68..., 0.81...)

.. note::

@@ -1815,21 +1815,21 @@ between two clusters.
>>> labels_pred = [0, 0, 1, 1, 2, 2]

>>> metrics.fowlkes_mallows_score(labels_true, labels_pred)
np.float64(0.47140...)
0.47140...

One can permute 0 and 1 in the predicted labels, rename 2 to 3 and get
the same score::

>>> labels_pred = [1, 1, 0, 0, 3, 3]

>>> metrics.fowlkes_mallows_score(labels_true, labels_pred)
np.float64(0.47140...)
0.47140...

Perfect labeling is scored 1.0::

>>> labels_pred = labels_true[:]
>>> metrics.fowlkes_mallows_score(labels_true, labels_pred)
np.float64(1.0)
1.0

Bad (e.g. independent labelings) have zero scores::

@@ -1912,7 +1912,7 @@ cluster analysis.
>>> kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)
>>> labels = kmeans_model.labels_
>>> metrics.silhouette_score(X, labels, metric='euclidean')
np.float64(0.55...)
0.55...

.. topic:: Advantages:

@@ -1969,7 +1969,7 @@ cluster analysis:
>>> kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)
>>> labels = kmeans_model.labels_
>>> metrics.calinski_harabasz_score(X, labels)
np.float64(561.59...)
561.59...


.. topic:: Advantages:
@@ -2043,7 +2043,7 @@ cluster analysis as follows:
>>> kmeans = KMeans(n_clusters=3, random_state=1).fit(X)
>>> labels = kmeans.labels_
>>> davies_bouldin_score(X, labels)
np.float64(0.666...)
0.666...


.. topic:: Advantages:
66 changes: 33 additions & 33 deletions doc/modules/model_evaluation.rst
@@ -377,7 +377,7 @@ You can create your own custom scorer object using
>>> import numpy as np
>>> def my_custom_loss_func(y_true, y_pred):
... diff = np.abs(y_true - y_pred).max()
... return np.log1p(diff)
... return float(np.log1p(diff))
...
>>> # score will negate the return value of my_custom_loss_func,
>>> # which will be np.log(2), 0.693, given the values for X
@@ -389,9 +389,9 @@ You can create your own custom scorer object using
>>> clf = DummyClassifier(strategy='most_frequent', random_state=0)
>>> clf = clf.fit(X, y)
>>> my_custom_loss_func(y, clf.predict(X))
np.float64(0.69...)
0.69...
>>> score(clf, X, y)
np.float64(-0.69...)
-0.69...

.. dropdown:: Custom scorer objects from scratch

@@ -673,10 +673,10 @@ where :math:`k` is the number of guesses allowed and :math:`1(x)` is the
... [0.2, 0.4, 0.3],
... [0.7, 0.2, 0.1]])
>>> top_k_accuracy_score(y_true, y_score, k=2)
np.float64(0.75)
0.75
>>> # Not normalizing gives the number of "correctly" classified samples
>>> top_k_accuracy_score(y_true, y_score, k=2, normalize=False)
np.int64(3)
3.0

.. _balanced_accuracy_score:

@@ -786,7 +786,7 @@ and not for more than two annotators.
>>> labeling1 = [2, 0, 2, 2, 0, 1]
>>> labeling2 = [0, 0, 2, 2, 0, 2]
>>> cohen_kappa_score(labeling1, labeling2)
np.float64(0.4285714285714286)
0.4285714285714286

.. _confusion_matrix:

@@ -837,9 +837,9 @@ false negatives and true positives as follows::

>>> y_true = [0, 0, 0, 1, 1, 1, 1, 1]
>>> y_pred = [0, 1, 0, 1, 0, 1, 0, 1]
>>> tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
>>> tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel().tolist()
>>> tn, fp, fn, tp
(np.int64(2), np.int64(1), np.int64(2), np.int64(3))
(2, 1, 2, 3)

.. rubric:: Examples

@@ -1115,7 +1115,7 @@ Here are some small examples in binary classification::
>>> threshold
array([0.1 , 0.35, 0.4 , 0.8 ])
>>> average_precision_score(y_true, y_scores)
np.float64(0.83...)
0.83...



@@ -1234,19 +1234,19 @@ In the binary case::
>>> y_pred = np.array([[1, 1, 1],
... [1, 0, 0]])
>>> jaccard_score(y_true[0], y_pred[0])
np.float64(0.6666...)
0.6666...

In the 2D comparison case (e.g. image similarity):

>>> jaccard_score(y_true, y_pred, average="micro")
np.float64(0.6)
0.6

In the multilabel case with binary label indicators::

>>> jaccard_score(y_true, y_pred, average='samples')
np.float64(0.5833...)
0.5833...
>>> jaccard_score(y_true, y_pred, average='macro')
np.float64(0.6666...)
0.6666...
>>> jaccard_score(y_true, y_pred, average=None)
array([0.5, 0.5, 1. ])

@@ -1258,9 +1258,9 @@ multilabel problem::
>>> jaccard_score(y_true, y_pred, average=None)
array([1. , 0. , 0.33...])
>>> jaccard_score(y_true, y_pred, average='macro')
np.float64(0.44...)
0.44...
>>> jaccard_score(y_true, y_pred, average='micro')
np.float64(0.33...)
0.33...

.. _hinge_loss:

@@ -1315,7 +1315,7 @@ with a svm classifier in a binary class problem::
>>> pred_decision
array([-2.18..., 2.36..., 0.09...])
>>> hinge_loss([-1, 1, 1], pred_decision)
np.float64(0.3...)
0.3...

Here is an example demonstrating the use of the :func:`hinge_loss` function
with a svm classifier in a multiclass problem::
@@ -1329,7 +1329,7 @@ with a svm classifier in a multiclass problem::
>>> pred_decision = est.decision_function([[-1], [2], [3]])
>>> y_true = [0, 2, 3]
>>> hinge_loss(y_true, pred_decision, labels=labels)
np.float64(0.56...)
0.56...

.. _log_loss:

@@ -1445,7 +1445,7 @@ function:
>>> y_true = [+1, +1, +1, -1]
>>> y_pred = [+1, -1, +1, +1]
>>> matthews_corrcoef(y_true, y_pred)
np.float64(-0.33...)
-0.33...

.. rubric:: References

@@ -1640,12 +1640,12 @@ We can use the probability estimates corresponding to `clf.classes_[1]`.

>>> y_score = clf.predict_proba(X)[:, 1]
>>> roc_auc_score(y, y_score)
np.float64(0.99...)
0.99...

Otherwise, we can use the non-thresholded decision values

>>> roc_auc_score(y, clf.decision_function(X))
np.float64(0.99...)
0.99...

.. _roc_auc_multiclass:

@@ -1951,13 +1951,13 @@ Here is a small example of usage of this function::
>>> y_prob = np.array([0.1, 0.9, 0.8, 0.4])
>>> y_pred = np.array([0, 1, 1, 0])
>>> brier_score_loss(y_true, y_prob)
np.float64(0.055)
0.055
>>> brier_score_loss(y_true, 1 - y_prob, pos_label=0)
np.float64(0.055)
0.055
>>> brier_score_loss(y_true_categorical, y_prob, pos_label="ham")
np.float64(0.055)
0.055
>>> brier_score_loss(y_true, y_prob > 0.5)
np.float64(0.0)
0.0

The Brier score can be used to assess how well a classifier is calibrated.
However, a lower Brier score loss does not always mean a better calibration.
@@ -2236,7 +2236,7 @@ Here is a small example of usage of this function::
>>> y_true = np.array([[1, 0, 0], [0, 0, 1]])
>>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]])
>>> coverage_error(y_true, y_score)
np.float64(2.5)
2.5

.. _label_ranking_average_precision:

@@ -2283,7 +2283,7 @@ Here is a small example of usage of this function::
>>> y_true = np.array([[1, 0, 0], [0, 0, 1]])
>>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]])
>>> label_ranking_average_precision_score(y_true, y_score)
np.float64(0.416...)
0.416...

.. _label_ranking_loss:

@@ -2318,11 +2318,11 @@ Here is a small example of usage of this function::
>>> y_true = np.array([[1, 0, 0], [0, 0, 1]])
>>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]])
>>> label_ranking_loss(y_true, y_score)
np.float64(0.75...)
0.75...
>>> # With the following prediction, we have perfect and minimal loss
>>> y_score = np.array([[1.0, 0.1, 0.2], [0.1, 0.2, 0.9]])
>>> label_ranking_loss(y_true, y_score)
np.float64(0.0)
0.0


.. dropdown:: References
@@ -2700,7 +2700,7 @@ function::
>>> y_true = [3, -0.5, 2, 7]
>>> y_pred = [2.5, 0.0, 2, 8]
>>> median_absolute_error(y_true, y_pred)
np.float64(0.5)
0.5



@@ -2732,7 +2732,7 @@ Here is a small example of usage of the :func:`max_error` function::
>>> y_true = [3, 2, 7, 1]
>>> y_pred = [9, 2, 7, 1]
>>> max_error(y_true, y_pred)
np.int64(6)
6.0
Member

This is a change from a numpy scalar with an int dtype to a Python float. This may be a bit more surprising than going from a numpy scalar with a float dtype to a Python float.

I spent 5 minutes trying to find a way it could have unintended side-effects but I could not find anything. Maybe somebody else wants to think about it for 5 minutes as well?

For example, on main

from sklearn.metrics import max_error
1 / max_error([1, 2], [3, 5])

returns np.float64(0.3333333333333333), while with this PR it returns a plain Python float.

I seem to remember there were some differences in numpy scalar handling in numpy 2.0, so it may be worth taking a look at what happens with numpy < 2 as well.
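
One way to see why the int case is the more surprising one (an illustrative aside, not part of the PR): np.float64 subclasses the built-in float, so swapping it for a plain float is largely invisible to downstream code, whereas np.int64 does not subclass int.

>>> import numpy as np
>>> isinstance(np.float64(0.5), float)   # np.float64 is a float subclass
True
>>> isinstance(np.int64(3), int)         # np.int64 is not an int subclass
False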

Member

Maybe a reason not to change this (or to change it to return int/float as appropriate?) is that if you feed integers to max_error you could be expecting to get integers back. A bit like 3 - 1 == 2. But maybe we are being too detail oriented?

I think for very large integers you have to be careful when converting to floats, as some of them can't be represented exactly as a float.
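
To make the precision point concrete (an illustrative aside, not from the PR): integers above 2**53 cannot all be represented exactly as 64-bit floats, so the conversion can silently round.

>>> big = 2**53 + 1
>>> big
9007199254740993
>>> float(big)   # rounded to the nearest representable float64
9007199254740992.0
>>> float(big) == float(2**53)
True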

Member Author

This one is arguably a fix for an inconsistency with the rest of the code base: max_error doesn't always return an int; it only does so when both y_pred and y_true have an int dtype.

We don't enforce this behavior for any other scorer when the usual return type is float but could be an int in some specific setting. For instance, accuracy_score(normalize=False) counts the number of correctly classified samples, yet still returns a float.
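
For illustration, a minimal sketch (the exact scalar type depends on the installed scikit-learn version; with this PR it is a plain Python float):

>>> from sklearn.metrics import accuracy_score
>>> y_true = [0, 1, 2, 3]
>>> y_pred = [0, 2, 1, 3]
>>> # normalize=False counts correctly classified samples (2 of 4 here),
>>> # yet the count is reported as a float rather than an int.
>>> accuracy_score(y_true, y_pred, normalize=False)
2.0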

Member Author

Also, the docstring of max_error already states that the return type is float :)

Member

Well then, no need for an exception :)


The :func:`max_error` does not support multioutput.

@@ -3011,15 +3011,15 @@ of 0.0.
>>> y_true = [3, -0.5, 2, 7]
>>> y_pred = [2.5, 0.0, 2, 8]
>>> d2_absolute_error_score(y_true, y_pred)
np.float64(0.764...)
0.764...
>>> y_true = [1, 2, 3]
>>> y_pred = [1, 2, 3]
>>> d2_absolute_error_score(y_true, y_pred)
np.float64(1.0)
1.0
>>> y_true = [1, 2, 3]
>>> y_pred = [2, 2, 2]
>>> d2_absolute_error_score(y_true, y_pred)
np.float64(0.0)
0.0


.. _visualization_regression_evaluation:
2 changes: 1 addition & 1 deletion sklearn/metrics/_base.py
@@ -118,7 +118,7 @@ def _average_binary_score(binary_metric, y_true, y_score, average, sample_weight
# score from being affected by 0-weighted NaN elements.
average_weight = np.asarray(average_weight)
score[average_weight == 0] = 0
return np.average(score, weights=average_weight)
return float(np.average(score, weights=average_weight))
else:
return score

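For context, a minimal standalone sketch of what the float() wrapper in _average_binary_score changes (illustrative values, not taken from the PR): np.average returns a NumPy scalar, and float() turns it into a built-in Python float.

>>> import numpy as np
>>> score = np.array([0.5, 0.75, 1.0])
>>> average_weight = np.array([1.0, 2.0, 1.0])
>>> type(np.average(score, weights=average_weight))
<class 'numpy.float64'>
>>> type(float(np.average(score, weights=average_weight)))
<class 'float'>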