From a9f461711eff9ec07d0ef12c174c6d0e8183ef51 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Sun, 3 Sep 2023 16:58:50 +0200
Subject: [PATCH 01/33] test: Improve `assert_argkmin_results_quasi_equality`
 error message

Signed-off-by: Julien Jerphanion <git@jjerphan.xyz>

Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
---
 .../test_pairwise_distances_reduction.py      | 83 ++++++++++++++-----
 1 file changed, 61 insertions(+), 22 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 46405048a4fa1..1be218be07687 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -107,6 +107,35 @@ def test_relative_rounding():
     assert relative_rounding(123.456789, 10) == 123.456789
 
 
+def quasi_equality_error_message(
+    n_significant_digits,
+    rtol,
+    query_idx,
+    ref_dist_row,
+    dist_row,
+    rounded_dist,
+    group_rank,
+    reference_neighbors_group,
+    effective_neighbors_group,
+):
+    # Turning formatting off to avoid black reformatting the message.
+    # fmt: off
+    error_message = (
+        "Neighbors indices are not matching when rounding distances at\n"
+        f"{n_significant_digits} significant digits derived from rtol={rtol:.1e}.\n\n"
+        f"Query vector index         : {query_idx}\n"
+        f"Reference ordered distances: {ref_dist_row}\n"
+        f"Computed ordered distances : {dist_row}\n"
+        f"Norm. abs. ord. dist. diff.: {np.mean(np.abs(dist_row - ref_dist_row))}\n"
+        f"Rounded distance           : {rounded_dist}\n"
+        f"Neighbors group rank       : {group_rank}\n"
+        f"Reference neighbors indices: {list(sorted(reference_neighbors_group))}\n"
+        f"Computed neighbors indices : {list(sorted(effective_neighbors_group))}"
+    )
+    # fmt: on
+    return error_message
+
+
 def assert_argkmin_results_quasi_equality(
     ref_dist,
     dist,
@@ -162,16 +191,21 @@ def assert_argkmin_results_quasi_equality(
             effective_neighbors_groups[rounded_dist].add(indices_row[neighbor_rank])
 
         # Asserting equality of groups (sets) for each distance
-        msg = (
-            f"Neighbors indices for query {query_idx} are not matching "
-            f"when rounding distances at {n_significant_digits} significant digits "
-            f"derived from rtol={rtol:.1e}"
-        )
-        for rounded_distance in reference_neighbors_groups.keys():
+        for group_rank, rounded_dist in enumerate(reference_neighbors_groups.keys()):
             assert (
-                reference_neighbors_groups[rounded_distance]
-                == effective_neighbors_groups[rounded_distance]
-            ), msg
+                reference_neighbors_groups[rounded_dist]
+                == effective_neighbors_groups[rounded_dist]
+            ), quasi_equality_error_message(
+                n_significant_digits,
+                rtol,
+                query_idx,
+                ref_dist_row,
+                dist_row,
+                rounded_dist,
+                group_rank,
+                reference_neighbors_groups[rounded_dist],
+                effective_neighbors_groups[rounded_dist],
+            )
 
 
 def assert_radius_neighbors_results_equality(
@@ -272,16 +306,21 @@ def assert_radius_neighbors_results_quasi_equality(
             effective_neighbors_groups[rounded_dist].add(indices_row[neighbor_rank])
 
         # Asserting equality of groups (sets) for each distance
-        msg = (
-            f"Neighbors indices for query {query_idx} are not matching "
-            f"when rounding distances at {n_significant_digits} significant digits "
-            f"derived from rtol={rtol:.1e}"
-        )
-        for rounded_distance in reference_neighbors_groups.keys():
+        for group_rank, rounded_dist in enumerate(reference_neighbors_groups.keys()):
             assert (
-                reference_neighbors_groups[rounded_distance]
-                == effective_neighbors_groups[rounded_distance]
-            ), msg
+                reference_neighbors_groups[rounded_dist]
+                == effective_neighbors_groups[rounded_dist]
+            ), quasi_equality_error_message(
+                n_significant_digits,
+                rtol,
+                query_idx,
+                ref_dist_row,
+                dist_row,
+                rounded_dist,
+                group_rank,
+                reference_neighbors_groups[rounded_dist],
+                effective_neighbors_groups[rounded_dist],
+            )
 
 
 ASSERT_RESULT = {
@@ -358,7 +397,7 @@ def test_assert_argkmin_results_quasi_equality():
     # Apply invalid permutation on indices: permuting the ranks
     # of the 2 nearest neighbors is invalid because the distance
     # values are too different.
-    msg = "Neighbors indices for query 0 are not matching"
+    msg = "Neighbors indices are not matching"
     with pytest.raises(AssertionError, match=msg):
         assert_argkmin_results_quasi_equality(
             np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
@@ -369,7 +408,7 @@ def test_assert_argkmin_results_quasi_equality():
         )
 
     # Indices aren't properly sorted w.r.t their distances
-    msg = "Neighbors indices for query 0 are not matching"
+    msg = "Neighbors indices are not matching"
     with pytest.raises(AssertionError, match=msg):
         assert_argkmin_results_quasi_equality(
             np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
@@ -439,7 +478,7 @@ def test_assert_radius_neighbors_results_quasi_equality():
     )
 
     # Apply invalid permutation on indices
-    msg = "Neighbors indices for query 0 are not matching"
+    msg = "Neighbors indices are not matching"
     with pytest.raises(AssertionError, match=msg):
         assert_radius_neighbors_results_quasi_equality(
             np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
@@ -475,7 +514,7 @@ def test_assert_radius_neighbors_results_quasi_equality():
         )
 
     # Indices aren't properly sorted w.r.t their distances
-    msg = "Neighbors indices for query 0 are not matching"
+    msg = "Neighbors indices are not matching"
     with pytest.raises(AssertionError, match=msg):
         assert_radius_neighbors_results_quasi_equality(
             np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),

From a2b9a80b2c287ff53cd75871b65f7dc9a0b73b70 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Mon, 11 Sep 2023 17:31:16 +0200
Subject: [PATCH 02/33] Trigger [all random seeds] for
 test_pairwise_distances_argkmin test_pairwise_distances_argkmin


From 37df63414fb05bd0d76aad42c9b6a40057faec42 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Mon, 11 Sep 2023 18:04:30 +0200
Subject: [PATCH 03/33] [azure parallel] [all random seeds]
 test_pairwise_distances_argkmin


From 109db17cb5a45f18ad48ff0307fd11ce36abcbf7 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Wed, 13 Sep 2023 14:10:46 +0200
Subject: [PATCH 04/33] Simpler way to assert approximate equality of neighbor
 results

---
 .../test_pairwise_distances_reduction.py      | 208 +++++++++++-------
 1 file changed, 129 insertions(+), 79 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 1be218be07687..943e8f27bfc54 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -137,74 +137,99 @@ def quasi_equality_error_message(
 
 
 def assert_argkmin_results_quasi_equality(
-    ref_dist,
-    dist,
-    ref_indices,
-    indices,
-    rtol=1e-4,
+    neighbors_dists_b,
+    neighbors_dists_a,
+    neighbors_indices_a,
+    neighbors_indices_b,
+    rtol=1e-5,
+    atol=1e-6,
 ):
-    """Assert that argkmin results are valid up to:
-      - relative tolerance on computed distance values
-      - permutations of indices for distances values that differ up to
-        a precision level
+    """Assert that argkmin results are valid up to rounding errors
 
-    To be used for testing neighbors queries on float32 datasets: we
-    accept neighbors rank swaps only if they are caused by small
-    rounding errors on the distance computations.
+    This function asserts that the results of argkmin queries are valid up to:
+    - rounding error tolerance on distance values;
+    - permutations of indices for distances values that differ up to the
+      expected precision level.
+
+    Furthermore, the distances must be sorted.
+
+    To be used for testing neighbors queries on float32 datasets: we accept
+    neighbors rank swaps only if they are caused by small rounding errors on
+    the distance computations.
     """
     is_sorted = lambda a: np.all(a[:-1] <= a[1:])
 
-    n_significant_digits = -(int(floor(log10(abs(rtol)))) + 1)
-
     assert (
-        ref_dist.shape == dist.shape == ref_indices.shape == indices.shape
-    ), "Arrays of results have various shapes."
+        neighbors_dists_a.shape
+        == neighbors_dists_b.shape
+        == neighbors_indices_a.shape
+        == neighbors_indices_b.shape
+    ), "Arrays of results have incompatible shapes."
 
-    n_queries, n_neighbors = ref_dist.shape
+    n_queries, _ = neighbors_dists_a.shape
 
     # Asserting equality results one row at a time
     for query_idx in range(n_queries):
-        ref_dist_row = ref_dist[query_idx]
-        dist_row = dist[query_idx]
-
-        assert is_sorted(
-            ref_dist_row
-        ), f"Reference distances aren't sorted on row {query_idx}"
-        assert is_sorted(dist_row), f"Distances aren't sorted on row {query_idx}"
-
-        assert_allclose(ref_dist_row, dist_row, rtol=rtol)
-
-        ref_indices_row = ref_indices[query_idx]
-        indices_row = indices[query_idx]
-
-        # Grouping indices by distances using sets on a rounded distances up
-        # to a given number of decimals of significant digits derived from rtol.
-        reference_neighbors_groups = defaultdict(set)
-        effective_neighbors_groups = defaultdict(set)
-
-        for neighbor_rank in range(n_neighbors):
-            rounded_dist = relative_rounding(
-                ref_dist_row[neighbor_rank],
-                n_significant_digits=n_significant_digits,
-            )
-            reference_neighbors_groups[rounded_dist].add(ref_indices_row[neighbor_rank])
-            effective_neighbors_groups[rounded_dist].add(indices_row[neighbor_rank])
-
-        # Asserting equality of groups (sets) for each distance
-        for group_rank, rounded_dist in enumerate(reference_neighbors_groups.keys()):
-            assert (
-                reference_neighbors_groups[rounded_dist]
-                == effective_neighbors_groups[rounded_dist]
-            ), quasi_equality_error_message(
-                n_significant_digits,
-                rtol,
-                query_idx,
-                ref_dist_row,
-                dist_row,
-                rounded_dist,
-                group_rank,
-                reference_neighbors_groups[rounded_dist],
-                effective_neighbors_groups[rounded_dist],
+        dist_row_a = neighbors_dists_a[query_idx]
+        dist_row_b = neighbors_dists_b[query_idx]
+        indices_row_a = neighbors_indices_a[query_idx]
+        indices_row_b = neighbors_indices_b[query_idx]
+
+        assert is_sorted(dist_row_a), f"Distances aren't sorted on row {query_idx}"
+        assert is_sorted(dist_row_b), f"Distances aren't sorted on row {query_idx}"
+
+        # The distances should match, irrespective of neighbors index swappping
+        # because of rounding errors.
+        try:
+            assert_allclose(dist_row_a, dist_row_b, rtol=rtol, atol=atol)
+        except AssertionError as e:
+            # Wrap exception to provide more context while also including the original
+            # exception with the computed absolute and relative differences.
+            raise AssertionError(
+                f"Query vector with index {query_idx} lead to different neighbors'"
+                f" distances (with atol={atol} and"
+                f" rtol={rtol}):\ndist_row_a={dist_row_a}\ndist_row_b={dist_row_b}"
+            ) from e
+
+        # Compute a mapping from indices to distances for each result set and
+        # check that the computed neighbors with matching indices are withing
+        # the expected distance tolerance.
+        # It's ok if they are missing indices on either side, as long as the
+        # distances match in the previous assertions.
+        indices_to_dist_a = {idx: dist for idx, dist in zip(indices_row_a, dist_row_a)}
+        indices_to_dist_b = {idx: dist for idx, dist in zip(indices_row_b, dist_row_b)}
+
+        common_indices = set(indices_row_a).intersection(set(indices_row_b))
+        for idx in common_indices:
+            dist_a = indices_to_dist_a[idx]
+            dist_b = indices_to_dist_b[idx]
+            try:
+                assert_allclose(dist_a, dist_b, rtol=rtol, atol=atol)
+            except AssertionError as e:
+                raise AssertionError(
+                    f"Query vector with index {query_idx} lead to different distances"
+                    f" for common neighbor with index {idx}:"
+                    f" dist_a={dist_a} vs dist_b={dist_b} (with atol={atol} and"
+                    f" rtol={rtol})"
+                ) from e
+
+        # Check that any neighbor with distances below the rounding error threshold have
+        # matching indices.
+        threshold = (1 - rtol) * np.max(dist_row_a) - atol
+        mask_a = dist_row_a <= threshold
+        mask_b = dist_row_b <= threshold
+        missing_from_b = np.setdiff1d(indices_row_a[mask_a], indices_row_b[mask_b])
+        missing_from_a = np.setdiff1d(indices_row_b[mask_b], indices_row_a[mask_a])
+        if len(missing_from_a) > 0 or len(missing_from_b) > 0:
+            raise AssertionError(
+                f"Query vector with index {query_idx} lead to mismatched result indices"
+                f" (with atol={atol} and rtol={rtol}):\n"
+                f"neighors in b missing from a: {missing_from_a}\n"
+                f"neighors in a missing from b: {missing_from_b}\n"
+                f"dist_row_a={dist_row_a}\n"
+                f"dist_row_b={dist_row_b}\n"
+                f"indices_row_a={indices_row_a}\n"
+                f"indices_row_b={indices_row_b}\n"
             )
 
 
@@ -348,8 +373,9 @@ def assert_radius_neighbors_results_quasi_equality(
 
 
 def test_assert_argkmin_results_quasi_equality():
+    atol = 0.0
     rtol = 1e-7
-    eps = 1e-7
+    eps = 1e-7 / 2
     _1m = 1.0 - eps
     _1p = 1.0 + eps
 
@@ -374,47 +400,78 @@ def test_assert_argkmin_results_quasi_equality():
         ref_dist, ref_dist, ref_indices, ref_indices, rtol
     )
 
-    # Apply valid permutation on indices: the last 3 points are
-    # all very close to one another so we accept any permutation
-    # on their rankings.
+    # Apply valid permutation on indices: the last 3 points are all very close
+    # to one another so we accept any permutation on their rankings.
     assert_argkmin_results_quasi_equality(
         np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
         np.array([[1.2, 2.5, 6.1, 6.1, 6.1]]),
         np.array([[1, 2, 3, 4, 5]]),
         np.array([[1, 2, 4, 5, 3]]),
+        atol=atol,
         rtol=rtol,
     )
+
+    # The last few indices do not necessarily have to match because of the rounding
+    # errors on the distances: their could be tied results at the boundary.
+    assert_argkmin_results_quasi_equality(
+        np.array([[1.2, 2.5, 3.0, 6.1, _6_1p]]),
+        np.array([[1.2, 2.5, 3.0, _6_1m, 6.1]]),
+        np.array([[1, 2, 3, 4, 5]]),
+        np.array([[1, 2, 3, 6, 7]]),
+        atol=atol,
+        rtol=rtol,
+    )
+
     # All points are have close distances so any ranking permutation
     # is valid for this query result.
     assert_argkmin_results_quasi_equality(
-        np.array([[_1m, _1m, 1, _1p, _1p]]),
-        np.array([[_1m, _1m, 1, _1p, _1p]]),
-        np.array([[6, 7, 8, 9, 10]]),
+        np.array([[1, 1, _1p, _1p, _1p]]),
+        np.array([[1, 1, 1, 1, _1p]]),
+        np.array([[7, 6, 8, 10, 9]]),
         np.array([[6, 9, 7, 8, 10]]),
+        atol=atol,
         rtol=rtol,
     )
 
-    # Apply invalid permutation on indices: permuting the ranks
-    # of the 2 nearest neighbors is invalid because the distance
-    # values are too different.
-    msg = "Neighbors indices are not matching"
+    # They could also be nearly truncation of very large nearly tied result
+    # sets hence all indices can also be distinct in this case:
+    assert_argkmin_results_quasi_equality(
+        np.array([[1, 1, _1p, _1p, _1p]]),
+        np.array([[1, 1, 1, 1, _1p]]),
+        np.array([[34, 30, 8, 12, 24]]),
+        np.array([[42, 1, 21, 13, 3]]),
+        atol=atol,
+        rtol=rtol,
+    )
+
+    # Apply invalid permutation on indices: permuting the ranks of the 2
+    # nearest neighbors is invalid because the distance values are too
+    # different.
+    msg = re.escape(
+        "Query vector with index 0 lead to different distances for common neighbor with"
+        " index 1: dist_a=1.2 vs dist_b=2.5 (with atol=0.0 and rtol=1e-07)"
+    )
     with pytest.raises(AssertionError, match=msg):
         assert_argkmin_results_quasi_equality(
             np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
             np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
             np.array([[1, 2, 3, 4, 5]]),
             np.array([[2, 1, 3, 4, 5]]),
+            atol=atol,
             rtol=rtol,
         )
 
     # Indices aren't properly sorted w.r.t their distances
-    msg = "Neighbors indices are not matching"
+    msg = re.escape(
+        "neighors in b missing from a: [12]\nneighors in a missing from b: [1]"
+    )
     with pytest.raises(AssertionError, match=msg):
         assert_argkmin_results_quasi_equality(
             np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
             np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
             np.array([[1, 2, 3, 4, 5]]),
-            np.array([[2, 1, 4, 5, 3]]),
+            np.array([[12, 2, 4, 11, 3]]),
+            atol=atol,
             rtol=rtol,
         )
 
@@ -426,6 +483,7 @@ def test_assert_argkmin_results_quasi_equality():
             np.array([[2.5, 1.2, _6_1m, 6.1, _6_1p]]),
             np.array([[1, 2, 3, 4, 5]]),
             np.array([[2, 1, 4, 5, 3]]),
+            atol=atol,
             rtol=rtol,
         )
 
@@ -1159,14 +1217,6 @@ def test_pairwise_distances_argkmin(
     n_samples=100,
     k=10,
 ):
-    # TODO: can we easily fix this discrepancy?
-    edge_cases = [
-        (np.float32, "chebyshev", 1000000.0),
-        (np.float32, "cityblock", 1000000.0),
-    ]
-    if (dtype, metric, translation) in edge_cases:
-        pytest.xfail("Numerical differences lead to small differences in results.")
-
     rng = np.random.RandomState(global_random_seed)
     spread = 1000
     X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread

From baa792dbf699cb6dc6705f3c3179d714622ff7c3 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Wed, 13 Sep 2023 17:50:28 +0200
Subject: [PATCH 05/33] Focus this PR on argkmin tests for now

---
 .../test_pairwise_distances_reduction.py      | 56 ++++---------------
 1 file changed, 11 insertions(+), 45 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 943e8f27bfc54..925598b60c582 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -107,35 +107,6 @@ def test_relative_rounding():
     assert relative_rounding(123.456789, 10) == 123.456789
 
 
-def quasi_equality_error_message(
-    n_significant_digits,
-    rtol,
-    query_idx,
-    ref_dist_row,
-    dist_row,
-    rounded_dist,
-    group_rank,
-    reference_neighbors_group,
-    effective_neighbors_group,
-):
-    # Turning formatting off to avoid black reformatting the message.
-    # fmt: off
-    error_message = (
-        "Neighbors indices are not matching when rounding distances at\n"
-        f"{n_significant_digits} significant digits derived from rtol={rtol:.1e}.\n\n"
-        f"Query vector index         : {query_idx}\n"
-        f"Reference ordered distances: {ref_dist_row}\n"
-        f"Computed ordered distances : {dist_row}\n"
-        f"Norm. abs. ord. dist. diff.: {np.mean(np.abs(dist_row - ref_dist_row))}\n"
-        f"Rounded distance           : {rounded_dist}\n"
-        f"Neighbors group rank       : {group_rank}\n"
-        f"Reference neighbors indices: {list(sorted(reference_neighbors_group))}\n"
-        f"Computed neighbors indices : {list(sorted(effective_neighbors_group))}"
-    )
-    # fmt: on
-    return error_message
-
-
 def assert_argkmin_results_quasi_equality(
     neighbors_dists_b,
     neighbors_dists_a,
@@ -331,21 +302,16 @@ def assert_radius_neighbors_results_quasi_equality(
             effective_neighbors_groups[rounded_dist].add(indices_row[neighbor_rank])
 
         # Asserting equality of groups (sets) for each distance
-        for group_rank, rounded_dist in enumerate(reference_neighbors_groups.keys()):
+        msg = (
+            f"Neighbors indices for query {query_idx} are not matching "
+            f"when rounding distances at {n_significant_digits} significant digits "
+            f"derived from rtol={rtol:.1e}"
+        )
+        for rounded_distance in reference_neighbors_groups.keys():
             assert (
-                reference_neighbors_groups[rounded_dist]
-                == effective_neighbors_groups[rounded_dist]
-            ), quasi_equality_error_message(
-                n_significant_digits,
-                rtol,
-                query_idx,
-                ref_dist_row,
-                dist_row,
-                rounded_dist,
-                group_rank,
-                reference_neighbors_groups[rounded_dist],
-                effective_neighbors_groups[rounded_dist],
-            )
+                reference_neighbors_groups[rounded_distance]
+                == effective_neighbors_groups[rounded_distance]
+            ), msg
 
 
 ASSERT_RESULT = {
@@ -536,7 +502,7 @@ def test_assert_radius_neighbors_results_quasi_equality():
     )
 
     # Apply invalid permutation on indices
-    msg = "Neighbors indices are not matching"
+    msg = "Neighbors indices for query 0 are not matching"
     with pytest.raises(AssertionError, match=msg):
         assert_radius_neighbors_results_quasi_equality(
             np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
@@ -572,7 +538,7 @@ def test_assert_radius_neighbors_results_quasi_equality():
         )
 
     # Indices aren't properly sorted w.r.t their distances
-    msg = "Neighbors indices are not matching"
+    msg = "Neighbors indices for query 0 are not matching"
     with pytest.raises(AssertionError, match=msg):
         assert_radius_neighbors_results_quasi_equality(
             np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),

From 755f641bdc0bb9d78f0c98e4f996d4487a659ef4 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Wed, 13 Sep 2023 18:01:42 +0200
Subject: [PATCH 06/33] Fix edge cases by making check for missing indices
 symmetric

---
 .../tests/test_pairwise_distances_reduction.py      | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 925598b60c582..8aea622cf852e 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -186,11 +186,14 @@ def assert_argkmin_results_quasi_equality(
 
         # Check that any neighbor with distances below the rounding error threshold have
         # matching indices.
-        threshold = (1 - rtol) * np.max(dist_row_a) - atol
-        mask_a = dist_row_a <= threshold
-        mask_b = dist_row_b <= threshold
-        missing_from_b = np.setdiff1d(indices_row_a[mask_a], indices_row_b[mask_b])
-        missing_from_a = np.setdiff1d(indices_row_b[mask_b], indices_row_a[mask_a])
+        threshold = np.minimum(
+            (1 - rtol) * np.max(dist_row_a) - atol,
+            (1 - rtol) * np.max(dist_row_b) - atol,
+        )
+        mask_a = dist_row_a < threshold
+        mask_b = dist_row_b < threshold
+        missing_from_b = np.setdiff1d(indices_row_a[mask_a], indices_row_b)
+        missing_from_a = np.setdiff1d(indices_row_b[mask_b], indices_row_a)
         if len(missing_from_a) > 0 or len(missing_from_b) > 0:
             raise AssertionError(
                 f"Query vector with index {query_idx} lead to mismatched result indices"

From 6a5fa0f330283c2820747f9f05909eced6f71242 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Wed, 13 Sep 2023 20:16:41 +0200
Subject: [PATCH 07/33] [azure parallel] [all random seeds]
 test_pairwise_distances_argkmin


From 9a975b33f2d2db24b87d4574535aea934ac3338e Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 14 Sep 2023 08:18:51 +0200
Subject: [PATCH 08/33] Speed-up test_pairwise_distances_* by dividing the
 number of queries by 10

---
 sklearn/metrics/tests/test_pairwise_distances_reduction.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 8aea622cf852e..b59161c3e504f 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -1183,12 +1183,13 @@ def test_pairwise_distances_argkmin(
     metric,
     strategy,
     dtype,
+    n_queries=10,
     n_samples=100,
     k=10,
 ):
     rng = np.random.RandomState(global_random_seed)
     spread = 1000
-    X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
+    X = translation + rng.rand(n_queries, n_features).astype(dtype) * spread
     Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
 
     X_csr = csr_matrix(X)
@@ -1249,12 +1250,13 @@ def test_pairwise_distances_radius_neighbors(
     metric,
     strategy,
     dtype,
+    n_queries=10,
     n_samples=100,
 ):
     rng = np.random.RandomState(global_random_seed)
     spread = 1000
     radius = spread * np.log(n_features)
-    X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
+    X = translation + rng.rand(n_queries, n_features).astype(dtype) * spread
     Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
 
     metric_kwargs = _get_metric_params_list(

From 04f7bddc1262002ffcfb1e5630c7bfed9842171a Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 14 Sep 2023 08:19:49 +0200
Subject: [PATCH 09/33] [azure parallel] [all random seeds]
 test_pairwise_distances


From 9ec12633d45e9292b9c6784bedfd4ca0c3646371 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 14 Sep 2023 10:59:03 +0200
Subject: [PATCH 10/33] Further speed-up pairwise distance tests by leveraging
 global random seed

---
 .../tests/test_pairwise_distances_reduction.py       | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index ee6e23d689e43..08826f6183ced 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -1174,16 +1174,12 @@ def test_strategies_consistency(
 # "Concrete Dispatchers"-specific tests
 
 
-@pytest.mark.parametrize("n_features", [50, 500])
-@pytest.mark.parametrize("translation", [0, 1e6])
 @pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS)
 @pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y"))
 @pytest.mark.parametrize("dtype", [np.float64, np.float32])
 @pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
 def test_pairwise_distances_argkmin(
     global_random_seed,
-    n_features,
-    translation,
     metric,
     strategy,
     dtype,
@@ -1193,6 +1189,8 @@ def test_pairwise_distances_argkmin(
     k=10,
 ):
     rng = np.random.RandomState(global_random_seed)
+    n_features = rng.choice([50, 500])
+    translation = rng.choice([0, 1e6])
     spread = 1000
     X = translation + rng.rand(n_queries, n_features).astype(dtype) * spread
     Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
@@ -1243,15 +1241,11 @@ def test_pairwise_distances_argkmin(
         )
 
 
-@pytest.mark.parametrize("n_features", [50, 500])
-@pytest.mark.parametrize("translation", [0, 1e6])
 @pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS)
 @pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y"))
 @pytest.mark.parametrize("dtype", [np.float64, np.float32])
 def test_pairwise_distances_radius_neighbors(
     global_random_seed,
-    n_features,
-    translation,
     metric,
     strategy,
     dtype,
@@ -1259,6 +1253,8 @@ def test_pairwise_distances_radius_neighbors(
     n_samples=100,
 ):
     rng = np.random.RandomState(global_random_seed)
+    n_features = rng.choice([50, 500])
+    translation = rng.choice([0, 1e6])
     spread = 1000
     radius = spread * np.log(n_features)
     X = translation + rng.rand(n_queries, n_features).astype(dtype) * spread

From 21ed72618016105cbc3150b7f482f5e0f3abb02c Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 14 Sep 2023 11:49:59 +0200
Subject: [PATCH 11/33] Simplify assert_argkmin_results_quasi_equality to
 always use informative message with even stronger tests and a fix

---
 .../test_pairwise_distances_reduction.py      | 39 ++++++++++---------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 08826f6183ced..3ae395f944d4a 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -108,8 +108,8 @@ def test_relative_rounding():
 
 
 def assert_argkmin_results_quasi_equality(
-    neighbors_dists_b,
     neighbors_dists_a,
+    neighbors_dists_b,
     neighbors_indices_a,
     neighbors_indices_b,
     rtol=1e-5,
@@ -149,19 +149,6 @@ def assert_argkmin_results_quasi_equality(
         assert is_sorted(dist_row_a), f"Distances aren't sorted on row {query_idx}"
         assert is_sorted(dist_row_b), f"Distances aren't sorted on row {query_idx}"
 
-        # The distances should match, irrespective of neighbors index swappping
-        # because of rounding errors.
-        try:
-            assert_allclose(dist_row_a, dist_row_b, rtol=rtol, atol=atol)
-        except AssertionError as e:
-            # Wrap exception to provide more context while also including the original
-            # exception with the computed absolute and relative differences.
-            raise AssertionError(
-                f"Query vector with index {query_idx} lead to different neighbors'"
-                f" distances (with atol={atol} and"
-                f" rtol={rtol}):\ndist_row_a={dist_row_a}\ndist_row_b={dist_row_b}"
-            ) from e
-
         # Compute a mapping from indices to distances for each result set and
         # check that the computed neighbors with matching indices are withing
         # the expected distance tolerance.
@@ -177,6 +164,9 @@ def assert_argkmin_results_quasi_equality(
             try:
                 assert_allclose(dist_a, dist_b, rtol=rtol, atol=atol)
             except AssertionError as e:
+                # Wrap exception to provide more context while also including
+                # the original exception with the computed absolute and
+                # relative differences.
                 raise AssertionError(
                     f"Query vector with index {query_idx} lead to different distances"
                     f" for common neighbor with index {idx}:"
@@ -186,10 +176,9 @@ def assert_argkmin_results_quasi_equality(
 
         # Check that any neighbor with distances below the rounding error threshold have
         # matching indices.
-        threshold = np.minimum(
-            (1 - rtol) * np.max(dist_row_a) - atol,
-            (1 - rtol) * np.max(dist_row_b) - atol,
-        )
+        threshold = (1 - rtol) * np.maximum(
+            np.max(dist_row_a), np.max(dist_row_b)
+        ) - atol
         mask_a = dist_row_a < threshold
         mask_b = dist_row_b < threshold
         missing_from_b = np.setdiff1d(indices_row_a[mask_a], indices_row_b)
@@ -444,6 +433,20 @@ def test_assert_argkmin_results_quasi_equality():
             rtol=rtol,
         )
 
+    # Indices aren't properly sorted w.r.t their distances
+    msg = re.escape(
+        "neighors in b missing from a: []\nneighors in a missing from b: [3]"
+    )
+    with pytest.raises(AssertionError, match=msg):
+        assert_argkmin_results_quasi_equality(
+            np.array([[_1m, 1.0, _6_1m, 6.1, _6_1p]]),
+            np.array([[1.0, 1.0, _6_1m, 6.1, 7]]),
+            np.array([[1, 2, 3, 4, 5]]),
+            np.array([[2, 1, 4, 5, 12]]),
+            atol=atol,
+            rtol=rtol,
+        )
+
     # Distances aren't properly sorted
     msg = "Distances aren't sorted on row 0"
     with pytest.raises(AssertionError, match=msg):

From c66770e678b18159f0aa4efbdeb0eed78fe30bdf Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 14 Sep 2023 11:57:55 +0200
Subject: [PATCH 12/33] Fix inline comments in test + one more test case to
 check symmetry of missing neighbor detection

---
 .../test_pairwise_distances_reduction.py      | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 3ae395f944d4a..83a65a6822cbf 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -419,7 +419,8 @@ def test_assert_argkmin_results_quasi_equality():
             rtol=rtol,
         )
 
-    # Indices aren't properly sorted w.r.t their distances
+    # Detect missing indices within the expected precision level, even when the
+    # distances match exactly.
     msg = re.escape(
         "neighors in b missing from a: [12]\nneighors in a missing from b: [1]"
     )
@@ -433,7 +434,7 @@ def test_assert_argkmin_results_quasi_equality():
             rtol=rtol,
         )
 
-    # Indices aren't properly sorted w.r.t their distances
+    # Detect missing indices outside the expected precision level.
     msg = re.escape(
         "neighors in b missing from a: []\nneighors in a missing from b: [3]"
     )
@@ -447,6 +448,21 @@ def test_assert_argkmin_results_quasi_equality():
             rtol=rtol,
         )
 
+    # Detect missing indices outside the expected precision level, in the other
+    # direction:
+    msg = re.escape(
+        "neighors in b missing from a: [5]\nneighors in a missing from b: []"
+    )
+    with pytest.raises(AssertionError, match=msg):
+        assert_argkmin_results_quasi_equality(
+            np.array([[_1m, 1.0, _6_1m, 6.1, 7]]),
+            np.array([[1.0, 1.0, _6_1m, 6.1, _6_1p]]),
+            np.array([[1, 2, 3, 4, 12]]),
+            np.array([[2, 1, 5, 3, 4]]),
+            atol=atol,
+            rtol=rtol,
+        )
+
     # Distances aren't properly sorted
     msg = "Distances aren't sorted on row 0"
     with pytest.raises(AssertionError, match=msg):

From 2663108c47588f32a1d00621c7bc39a8475d2f32 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 14 Sep 2023 14:30:48 +0200
Subject: [PATCH 13/33] [azure parallel] [all random seeds]
 test_pairwise_distances


From 4a498e910a1f5fd2a4d2ba61678d44cef3c0300b Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 14 Sep 2023 15:43:14 +0200
Subject: [PATCH 14/33] Refactor radius neighbors checks, remove unused code
 and accelerate parametrization while covering non dimensions that are non
 multiple of one another

---
 .../test_pairwise_distances_reduction.py      | 357 +++++++++---------
 1 file changed, 188 insertions(+), 169 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 83a65a6822cbf..2fffb3e2cc8ef 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -1,8 +1,6 @@
 import itertools
 import re
 import warnings
-from collections import defaultdict
-from math import floor, log10
 
 import numpy as np
 import pytest
@@ -79,32 +77,66 @@ def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices, rtol=1
     )
 
 
-def relative_rounding(scalar, n_significant_digits):
-    """Round a scalar to a number of significant digits relatively to its value."""
-    if scalar == 0:
-        return 0.0
-    magnitude = int(floor(log10(abs(scalar)))) + 1
-    return round(scalar, n_significant_digits - magnitude)
-
-
-def test_relative_rounding():
-    assert relative_rounding(0, 1) == 0.0
-    assert relative_rounding(0, 10) == 0.0
-    assert relative_rounding(0, 123456) == 0.0
-
-    assert relative_rounding(123456789, 0) == 0
-    assert relative_rounding(123456789, 2) == 120000000
-    assert relative_rounding(123456789, 3) == 123000000
-    assert relative_rounding(123456789, 10) == 123456789
-    assert relative_rounding(123456789, 20) == 123456789
-
-    assert relative_rounding(1.23456789, 2) == 1.2
-    assert relative_rounding(1.23456789, 3) == 1.23
-    assert relative_rounding(1.23456789, 10) == 1.23456789
+def assert_same_distances_for_common_neighbors(
+    query_idx,
+    dist_row_a,
+    dist_row_b,
+    indices_row_a,
+    indices_row_b,
+    rtol,
+    atol,
+):
+    """Check that the distances of common neighbors are equal up to tolerance.
 
-    assert relative_rounding(123.456789, 3) == 123.0
-    assert relative_rounding(123.456789, 9) == 123.456789
-    assert relative_rounding(123.456789, 10) == 123.456789
+    This does not check if there are missing neighbors in either result set.
+    Missingness is handled by assert_no_missing_neighbors.
+    """
+    # Compute a mapping from indices to distances for each result set and
+    # check that the computed neighbors with matching indices are withing
+    # the expected distance tolerance.
+    indices_to_dist_a = {idx: dist for idx, dist in zip(indices_row_a, dist_row_a)}
+    indices_to_dist_b = {idx: dist for idx, dist in zip(indices_row_b, dist_row_b)}
+
+    common_indices = set(indices_row_a).intersection(set(indices_row_b))
+    for idx in common_indices:
+        dist_a = indices_to_dist_a[idx]
+        dist_b = indices_to_dist_b[idx]
+        try:
+            assert_allclose(dist_a, dist_b, rtol=rtol, atol=atol)
+        except AssertionError as e:
+            # Wrap exception to provide more context while also including
+            # the original exception with the computed absolute and
+            # relative differences.
+            raise AssertionError(
+                f"Query vector with index {query_idx} lead to different distances"
+                f" for common neighbor with index {idx}:"
+                f" dist_a={dist_a} vs dist_b={dist_b} (with atol={atol} and"
+                f" rtol={rtol})"
+            ) from e
+
+
+def assert_no_missing_neighbors(
+    query_idx,
+    dist_row_a,
+    dist_row_b,
+    indices_row_a,
+    indices_row_b,
+    threshold,
+):
+    mask_a = dist_row_a < threshold
+    mask_b = dist_row_b < threshold
+    missing_from_b = np.setdiff1d(indices_row_a[mask_a], indices_row_b)
+    missing_from_a = np.setdiff1d(indices_row_b[mask_b], indices_row_a)
+    if len(missing_from_a) > 0 or len(missing_from_b) > 0:
+        raise AssertionError(
+            f"Query vector with index {query_idx} lead to mismatched result indices:\n"
+            f"neighors in b missing from a: {missing_from_a}\n"
+            f"neighors in a missing from b: {missing_from_b}\n"
+            f"dist_row_a={dist_row_a}\n"
+            f"dist_row_b={dist_row_b}\n"
+            f"indices_row_a={indices_row_a}\n"
+            f"indices_row_b={indices_row_b}\n"
+        )
 
 
 def assert_argkmin_results_quasi_equality(
@@ -149,82 +181,66 @@ def assert_argkmin_results_quasi_equality(
         assert is_sorted(dist_row_a), f"Distances aren't sorted on row {query_idx}"
         assert is_sorted(dist_row_b), f"Distances aren't sorted on row {query_idx}"
 
-        # Compute a mapping from indices to distances for each result set and
-        # check that the computed neighbors with matching indices are withing
-        # the expected distance tolerance.
-        # It's ok if they are missing indices on either side, as long as the
-        # distances match in the previous assertions.
-        indices_to_dist_a = {idx: dist for idx, dist in zip(indices_row_a, dist_row_a)}
-        indices_to_dist_b = {idx: dist for idx, dist in zip(indices_row_b, dist_row_b)}
-
-        common_indices = set(indices_row_a).intersection(set(indices_row_b))
-        for idx in common_indices:
-            dist_a = indices_to_dist_a[idx]
-            dist_b = indices_to_dist_b[idx]
-            try:
-                assert_allclose(dist_a, dist_b, rtol=rtol, atol=atol)
-            except AssertionError as e:
-                # Wrap exception to provide more context while also including
-                # the original exception with the computed absolute and
-                # relative differences.
-                raise AssertionError(
-                    f"Query vector with index {query_idx} lead to different distances"
-                    f" for common neighbor with index {idx}:"
-                    f" dist_a={dist_a} vs dist_b={dist_b} (with atol={atol} and"
-                    f" rtol={rtol})"
-                ) from e
+        assert_same_distances_for_common_neighbors(
+            query_idx,
+            dist_row_a,
+            dist_row_b,
+            indices_row_a,
+            indices_row_b,
+            rtol,
+            atol,
+        )
 
         # Check that any neighbor with distances below the rounding error threshold have
         # matching indices.
         threshold = (1 - rtol) * np.maximum(
             np.max(dist_row_a), np.max(dist_row_b)
         ) - atol
-        mask_a = dist_row_a < threshold
-        mask_b = dist_row_b < threshold
-        missing_from_b = np.setdiff1d(indices_row_a[mask_a], indices_row_b)
-        missing_from_a = np.setdiff1d(indices_row_b[mask_b], indices_row_a)
-        if len(missing_from_a) > 0 or len(missing_from_b) > 0:
-            raise AssertionError(
-                f"Query vector with index {query_idx} lead to mismatched result indices"
-                f" (with atol={atol} and rtol={rtol}):\n"
-                f"neighors in b missing from a: {missing_from_a}\n"
-                f"neighors in a missing from b: {missing_from_b}\n"
-                f"dist_row_a={dist_row_a}\n"
-                f"dist_row_b={dist_row_b}\n"
-                f"indices_row_a={indices_row_a}\n"
-                f"indices_row_b={indices_row_b}\n"
-            )
+        assert_no_missing_neighbors(
+            query_idx,
+            dist_row_a,
+            dist_row_b,
+            indices_row_a,
+            indices_row_b,
+            threshold,
+        )
 
 
 def assert_radius_neighbors_results_equality(
-    ref_dist, dist, ref_indices, indices, radius
+    neighbors_dists_a,
+    neighbors_dists_b,
+    neighbors_indices_a,
+    neighbors_indices_b,
+    radius,
 ):
     # We get arrays of arrays and we need to check for individual pairs
-    for i in range(ref_dist.shape[0]):
-        assert (ref_dist[i] <= radius).all()
+    for i in range(neighbors_dists_a.shape[0]):
+        assert (neighbors_dists_a[i] <= radius).all()
         assert_array_equal(
-            ref_indices[i],
-            indices[i],
+            neighbors_indices_a[i],
+            neighbors_indices_b[i],
             err_msg=f"Query vector #{i} has different neighbors' indices",
         )
         assert_allclose(
-            ref_dist[i],
-            dist[i],
+            neighbors_dists_a[i],
+            neighbors_dists_b[i],
             err_msg=f"Query vector #{i} has different neighbors' distances",
             rtol=1e-7,
         )
 
 
 def assert_radius_neighbors_results_quasi_equality(
-    ref_dist,
-    dist,
-    ref_indices,
-    indices,
+    neighbors_dists_a,
+    neighbors_dists_b,
+    neighbors_indices_a,
+    neighbors_indices_b,
     radius,
-    rtol=1e-4,
+    rtol=1e-5,
+    atol=1e-6,
 ):
     """Assert that radius neighborhood results are valid up to:
-      - relative tolerance on computed distance values
+
+      - relative and absolute tolerance on computed distance values
       - permutations of indices for distances values that differ up to
         a precision level
       - missing or extra last elements if their distance is
@@ -238,72 +254,59 @@ def assert_radius_neighbors_results_quasi_equality(
     """
     is_sorted = lambda a: np.all(a[:-1] <= a[1:])
 
-    n_significant_digits = -(int(floor(log10(abs(rtol)))) + 1)
-
     assert (
-        len(ref_dist) == len(dist) == len(ref_indices) == len(indices)
+        len(neighbors_dists_a)
+        == len(neighbors_dists_b)
+        == len(neighbors_indices_a)
+        == len(neighbors_indices_b)
     ), "Arrays of results have various lengths."
 
-    n_queries = len(ref_dist)
+    n_queries = len(neighbors_dists_a)
 
     # Asserting equality of results one vector at a time
     for query_idx in range(n_queries):
-        ref_dist_row = ref_dist[query_idx]
-        dist_row = dist[query_idx]
-
-        assert is_sorted(
-            ref_dist_row
-        ), f"Reference distances aren't sorted on row {query_idx}"
-        assert is_sorted(dist_row), f"Distances aren't sorted on row {query_idx}"
-
-        # Vectors' lengths might be different due to small
-        # numerical differences of distance w.r.t the `radius` threshold.
-        largest_row = ref_dist_row if len(ref_dist_row) > len(dist_row) else dist_row
-
-        # For the longest distances vector, we check that last extra elements
-        # that aren't present in the other vector are all in: [radius ± rtol]
-        min_length = min(len(ref_dist_row), len(dist_row))
-        last_extra_elements = largest_row[min_length:]
-        if last_extra_elements.size > 0:
-            assert np.all(radius - rtol <= last_extra_elements <= radius + rtol), (
-                f"The last extra elements ({last_extra_elements}) aren't in [radius ±"
-                f" rtol]=[{radius} ± {rtol}]"
-            )
+        dist_row_a = neighbors_dists_a[query_idx]
+        dist_row_b = neighbors_dists_b[query_idx]
+        indices_row_a = neighbors_indices_a[query_idx]
+        indices_row_b = neighbors_indices_b[query_idx]
 
-        # We truncate the neighbors results list on the smallest length to
-        # be able to compare them, ignoring the elements checked above.
-        ref_dist_row = ref_dist_row[:min_length]
-        dist_row = dist_row[:min_length]
+        assert is_sorted(dist_row_a), f"Distances aren't sorted on row {query_idx}"
+        assert is_sorted(dist_row_b), f"Distances aren't sorted on row {query_idx}"
 
-        assert_allclose(ref_dist_row, dist_row, rtol=rtol)
+        assert len(dist_row_a) == len(indices_row_a)
+        assert len(dist_row_b) == len(indices_row_b)
 
-        ref_indices_row = ref_indices[query_idx]
-        indices_row = indices[query_idx]
+        # Check that all distances are within the requested radius
+        if len(dist_row_a) > 0:
+            assert dist_row_a[-1] <= radius, (
+                f"Largest returned distance {dist_row_a[-1]} not within requested"
+                f" radius {radius} on row {query_idx}"
+            )
+        if len(dist_row_b) > 0:
+            assert dist_row_b[-1] <= radius, (
+                f"Largest returned distance {dist_row_b[-1]} not within requested"
+                f" radius {radius} on row {query_idx}"
+            )
 
-        # Grouping indices by distances using sets on a rounded distances up
-        # to a given number of significant digits derived from rtol.
-        reference_neighbors_groups = defaultdict(set)
-        effective_neighbors_groups = defaultdict(set)
+        assert_same_distances_for_common_neighbors(
+            query_idx,
+            dist_row_a,
+            dist_row_b,
+            indices_row_a,
+            indices_row_b,
+            rtol,
+            atol,
+        )
 
-        for neighbor_rank in range(min_length):
-            rounded_dist = relative_rounding(
-                ref_dist_row[neighbor_rank],
-                n_significant_digits=n_significant_digits,
-            )
-            reference_neighbors_groups[rounded_dist].add(ref_indices_row[neighbor_rank])
-            effective_neighbors_groups[rounded_dist].add(indices_row[neighbor_rank])
-
-        # Asserting equality of groups (sets) for each distance
-        msg = (
-            f"Neighbors indices for query {query_idx} are not matching "
-            f"when rounding distances at {n_significant_digits} significant digits "
-            f"derived from rtol={rtol:.1e}"
+        threshold = (1 - rtol) * radius - atol
+        assert_no_missing_neighbors(
+            query_idx,
+            dist_row_a,
+            dist_row_b,
+            indices_row_a,
+            indices_row_b,
+            threshold,
         )
-        for rounded_distance in reference_neighbors_groups.keys():
-            assert (
-                reference_neighbors_groups[rounded_distance]
-                == effective_neighbors_groups[rounded_distance]
-            ), msg
 
 
 ASSERT_RESULT = {
@@ -478,7 +481,7 @@ def test_assert_argkmin_results_quasi_equality():
 
 def test_assert_radius_neighbors_results_quasi_equality():
     rtol = 1e-7
-    eps = 1e-7
+    eps = 1e-7 / 2
     _1m = 1.0 - eps
     _1p = 1.0 + eps
 
@@ -501,7 +504,7 @@ def test_assert_radius_neighbors_results_quasi_equality():
         ref_dist,
         ref_indices,
         ref_indices,
-        radius=6.1,
+        radius=7.0,
         rtol=rtol,
     )
 
@@ -511,7 +514,7 @@ def test_assert_radius_neighbors_results_quasi_equality():
         np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
         np.array([np.array([1, 2, 3, 4, 5])]),
         np.array([np.array([1, 2, 4, 5, 3])]),
-        radius=6.1,
+        radius=7.0,
         rtol=rtol,
     )
     assert_radius_neighbors_results_quasi_equality(
@@ -519,35 +522,41 @@ def test_assert_radius_neighbors_results_quasi_equality():
         np.array([np.array([_1m, _1m, 1, _1p, _1p])]),
         np.array([np.array([6, 7, 8, 9, 10])]),
         np.array([np.array([6, 9, 7, 8, 10])]),
-        radius=6.1,
+        radius=7.0,
         rtol=rtol,
     )
 
     # Apply invalid permutation on indices
-    msg = "Neighbors indices for query 0 are not matching"
+    msg = re.escape(
+        "Query vector with index 0 lead to different distances for common neighbor with"
+        " index 1: dist_a=1.2 vs dist_b=2.5 (with atol=1e-06 and rtol=1e-07)"
+    )
     with pytest.raises(AssertionError, match=msg):
         assert_radius_neighbors_results_quasi_equality(
             np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
             np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
             np.array([np.array([1, 2, 3, 4, 5])]),
             np.array([np.array([2, 1, 3, 4, 5])]),
-            radius=6.1,
+            radius=7.0,
             rtol=rtol,
         )
 
-    # Having extra last elements is valid if they are in: [radius ± rtol]
+    # Having extra last or missing elements is valid if they are in the
+    # tolerated rounding error range: [(1 - rtol) * radius - atol, radius]
     assert_radius_neighbors_results_quasi_equality(
-        np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
+        np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p, _6_1p])]),
         np.array([np.array([1.2, 2.5, _6_1m, 6.1])]),
-        np.array([np.array([1, 2, 3, 4, 5])]),
-        np.array([np.array([1, 2, 3, 4])]),
-        radius=6.1,
+        np.array([np.array([1, 2, 3, 4, 5, 7])]),
+        np.array([np.array([1, 2, 3, 6])]),
+        radius=_6_1p,
         rtol=rtol,
     )
 
-    # Having extra last elements is invalid if they are lesser than radius - rtol
+    # Any discrepancy outside the tolerated rounding error range is invalid and
+    # indicates a missing neighbor in one of the result sets.
     msg = re.escape(
-        "The last extra elements ([6.]) aren't in [radius ± rtol]=[6.1 ± 1e-07]"
+        "Query vector with index 0 lead to mismatched result indices:\nneighors in b"
+        " missing from a: []\nneighors in a missing from b: [3]"
     )
     with pytest.raises(AssertionError, match=msg):
         assert_radius_neighbors_results_quasi_equality(
@@ -558,12 +567,37 @@ def test_assert_radius_neighbors_results_quasi_equality():
             radius=6.1,
             rtol=rtol,
         )
+    msg = re.escape(
+        "Query vector with index 0 lead to mismatched result indices:\nneighors in b"
+        " missing from a: [4]\nneighors in a missing from b: [2]"
+    )
+    with pytest.raises(AssertionError, match=msg):
+        assert_radius_neighbors_results_quasi_equality(
+            np.array([np.array([1.2, 2.1, 2.5])]),
+            np.array([np.array([1.2, 2, 2.5])]),
+            np.array([np.array([1, 2, 3])]),
+            np.array([np.array([1, 4, 3])]),
+            radius=6.1,
+            rtol=rtol,
+        )
 
-    # Indices aren't properly sorted w.r.t their distances
-    msg = "Neighbors indices for query 0 are not matching"
+    # Radius upper bound is strictly checked
+    msg = re.escape(
+        "Largest returned distance 6.100000049999999 not within requested radius 6.1 on"
+        " row 0"
+    )
     with pytest.raises(AssertionError, match=msg):
         assert_radius_neighbors_results_quasi_equality(
             np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
+            np.array([np.array([1.2, 2.5, _6_1m, 6.1, 6.1])]),
+            np.array([np.array([1, 2, 3, 4, 5])]),
+            np.array([np.array([2, 1, 4, 5, 3])]),
+            radius=6.1,
+            rtol=rtol,
+        )
+    with pytest.raises(AssertionError, match=msg):
+        assert_radius_neighbors_results_quasi_equality(
+            np.array([np.array([1.2, 2.5, _6_1m, 6.1, 6.1])]),
             np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
             np.array([np.array([1, 2, 3, 4, 5])]),
             np.array([np.array([2, 1, 4, 5, 3])]),
@@ -937,22 +971,18 @@ def test_radius_neighbors_factory_method_wrong_usages():
         )
 
 
-@pytest.mark.parametrize(
-    "n_samples_X, n_samples_Y", [(100, 100), (500, 100), (100, 500)]
-)
 @pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
 @pytest.mark.parametrize("dtype", [np.float64, np.float32])
 def test_chunk_size_agnosticism(
     global_random_seed,
     Dispatcher,
-    n_samples_X,
-    n_samples_Y,
     dtype,
     n_features=100,
 ):
     """Check that results do not depend on the chunk size."""
     rng = np.random.RandomState(global_random_seed)
     spread = 100
+    n_samples_X, n_samples_Y = rng.choice([97, 100, 101, 500], size=2, replace=False)
     X = rng.rand(n_samples_X, n_features).astype(dtype) * spread
     Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread
 
@@ -992,21 +1022,17 @@ def test_chunk_size_agnosticism(
     )
 
 
-@pytest.mark.parametrize(
-    "n_samples_X, n_samples_Y", [(100, 100), (500, 100), (100, 500)]
-)
 @pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
 @pytest.mark.parametrize("dtype", [np.float64, np.float32])
 def test_n_threads_agnosticism(
     global_random_seed,
     Dispatcher,
-    n_samples_X,
-    n_samples_Y,
     dtype,
     n_features=100,
 ):
     """Check that results do not depend on the number of threads."""
     rng = np.random.RandomState(global_random_seed)
+    n_samples_X, n_samples_Y = rng.choice([97, 100, 101, 500], size=2, replace=False)
     spread = 100
     X = rng.rand(n_samples_X, n_features).astype(dtype) * spread
     Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread
@@ -1113,9 +1139,6 @@ def test_format_agnosticism(
         )
 
 
-@pytest.mark.parametrize(
-    "n_samples_X, n_samples_Y", [(100, 100), (100, 500), (500, 100)]
-)
 @pytest.mark.parametrize(
     "metric",
     ["euclidean", "minkowski", "manhattan", "infinity", "seuclidean", "haversine"],
@@ -1126,13 +1149,12 @@ def test_strategies_consistency(
     global_random_seed,
     Dispatcher,
     metric,
-    n_samples_X,
-    n_samples_Y,
     dtype,
     n_features=10,
 ):
     """Check that the results do not depend on the strategy used."""
     rng = np.random.RandomState(global_random_seed)
+    n_samples_X, n_samples_Y = rng.choice([97, 100, 101, 500], size=2, replace=False)
     spread = 100
     X = rng.rand(n_samples_X, n_features).astype(dtype) * spread
     Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread
@@ -1374,21 +1396,18 @@ def test_memmap_backed_data(
     )
 
 
-@pytest.mark.parametrize("n_samples", [100, 1000])
-@pytest.mark.parametrize("n_features", [5, 10, 100])
-@pytest.mark.parametrize("num_threads", [1, 2, 8])
 @pytest.mark.parametrize("dtype", [np.float64, np.float32])
 @pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
 def test_sqeuclidean_row_norms(
     global_random_seed,
-    n_samples,
-    n_features,
-    num_threads,
     dtype,
     csr_container,
 ):
     rng = np.random.RandomState(global_random_seed)
     spread = 100
+    n_samples = rng.choice([97, 100, 101, 1000])
+    n_features = rng.choice([5, 10, 100])
+    num_threads = rng.choice([1, 2, 8])
     X = rng.rand(n_samples, n_features).astype(dtype) * spread
 
     X_csr = csr_container(X)

From 36aead5c61bf8dff4244c88fd7d93824cdc79e34 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 14 Sep 2023 16:49:50 +0200
Subject: [PATCH 15/33] [azure parallel] [all random seeds]
 test_pairwise_distances


From 7887faa9f695ca358fc3de4580d6b23ec9b81fbd Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 14 Sep 2023 17:31:14 +0200
Subject: [PATCH 16/33] More factorization, easier to understand handling of
 tols in the tests

---
 .../test_pairwise_distances_reduction.py      | 175 +++++++-----------
 1 file changed, 67 insertions(+), 108 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 2fffb3e2cc8ef..6decc47fdd6f5 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -1,6 +1,7 @@
 import itertools
 import re
 import warnings
+from functools import partial
 
 import numpy as np
 import pytest
@@ -63,20 +64,6 @@ def _get_metric_params_list(metric: str, n_features: int, seed: int = 1):
     return [{}]
 
 
-def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices, rtol=1e-7):
-    assert_array_equal(
-        ref_indices,
-        indices,
-        err_msg="Query vectors have different neighbors' indices",
-    )
-    assert_allclose(
-        ref_dist,
-        dist,
-        err_msg="Query vectors have different neighbors' distances",
-        rtol=rtol,
-    )
-
-
 def assert_same_distances_for_common_neighbors(
     query_idx,
     dist_row_a,
@@ -139,7 +126,7 @@ def assert_no_missing_neighbors(
         )
 
 
-def assert_argkmin_results_quasi_equality(
+def assert_compatible_argkmin_results(
     neighbors_dists_a,
     neighbors_dists_b,
     neighbors_indices_a,
@@ -206,30 +193,7 @@ def assert_argkmin_results_quasi_equality(
         )
 
 
-def assert_radius_neighbors_results_equality(
-    neighbors_dists_a,
-    neighbors_dists_b,
-    neighbors_indices_a,
-    neighbors_indices_b,
-    radius,
-):
-    # We get arrays of arrays and we need to check for individual pairs
-    for i in range(neighbors_dists_a.shape[0]):
-        assert (neighbors_dists_a[i] <= radius).all()
-        assert_array_equal(
-            neighbors_indices_a[i],
-            neighbors_indices_b[i],
-            err_msg=f"Query vector #{i} has different neighbors' indices",
-        )
-        assert_allclose(
-            neighbors_dists_a[i],
-            neighbors_dists_b[i],
-            err_msg=f"Query vector #{i} has different neighbors' distances",
-            rtol=1e-7,
-        )
-
-
-def assert_radius_neighbors_results_quasi_equality(
+def assert_compatible_radius_results(
     neighbors_dists_a,
     neighbors_dists_b,
     neighbors_indices_a,
@@ -309,34 +273,34 @@ def assert_radius_neighbors_results_quasi_equality(
         )
 
 
+FLOAT32_TOLS = {
+    "atol": 1e-7,
+    "rtol": 1e-5,
+}
+FLOAT64_TOLS = {
+    "atol": 1e-9,
+    "rtol": 1e-7,
+}
 ASSERT_RESULT = {
-    # In the case of 64bit, we test for exact equality of the results rankings
-    # and standard tolerance levels for the computed distance values.
-    #
-    # XXX: Note that in the future we might be interested in using quasi equality
-    # checks also for float64 data (with a larger number of significant digits)
-    # as the tests could be unstable because of numerically tied distances on
-    # some datasets (e.g. uniform grids).
-    (ArgKmin, np.float64): assert_argkmin_results_equality,
+    (ArgKmin, np.float64): partial(assert_compatible_argkmin_results, **FLOAT64_TOLS),
+    (ArgKmin, np.float32): partial(assert_compatible_argkmin_results, **FLOAT32_TOLS),
     (
         RadiusNeighbors,
         np.float64,
-    ): assert_radius_neighbors_results_equality,
-    # In the case of 32bit, indices can be permuted due to small difference
-    # in the computations of their associated distances, hence we test equality of
-    # results up to valid permutations.
-    (ArgKmin, np.float32): assert_argkmin_results_quasi_equality,
+    ): partial(assert_compatible_radius_results, **FLOAT64_TOLS),
     (
         RadiusNeighbors,
         np.float32,
-    ): assert_radius_neighbors_results_quasi_equality,
+    ): partial(assert_compatible_radius_results, **FLOAT32_TOLS),
 }
 
 
-def test_assert_argkmin_results_quasi_equality():
-    atol = 0.0
-    rtol = 1e-7
-    eps = 1e-7 / 2
+def test_assert_compatible_argkmin_results():
+    atol = 1e-7
+    rtol = 0.0
+    tols = dict(atol=atol, rtol=rtol)
+
+    eps = atol / 3
     _1m = 1.0 - eps
     _1p = 1.0 + eps
 
@@ -357,13 +321,13 @@ def test_assert_argkmin_results_quasi_equality():
     )
 
     # Sanity check: compare the reference results to themselves.
-    assert_argkmin_results_quasi_equality(
+    assert_compatible_argkmin_results(
         ref_dist, ref_dist, ref_indices, ref_indices, rtol
     )
 
     # Apply valid permutation on indices: the last 3 points are all very close
     # to one another so we accept any permutation on their rankings.
-    assert_argkmin_results_quasi_equality(
+    assert_compatible_argkmin_results(
         np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
         np.array([[1.2, 2.5, 6.1, 6.1, 6.1]]),
         np.array([[1, 2, 3, 4, 5]]),
@@ -374,7 +338,7 @@ def test_assert_argkmin_results_quasi_equality():
 
     # The last few indices do not necessarily have to match because of the rounding
     # errors on the distances: their could be tied results at the boundary.
-    assert_argkmin_results_quasi_equality(
+    assert_compatible_argkmin_results(
         np.array([[1.2, 2.5, 3.0, 6.1, _6_1p]]),
         np.array([[1.2, 2.5, 3.0, _6_1m, 6.1]]),
         np.array([[1, 2, 3, 4, 5]]),
@@ -385,24 +349,22 @@ def test_assert_argkmin_results_quasi_equality():
 
     # All points are have close distances so any ranking permutation
     # is valid for this query result.
-    assert_argkmin_results_quasi_equality(
+    assert_compatible_argkmin_results(
         np.array([[1, 1, _1p, _1p, _1p]]),
         np.array([[1, 1, 1, 1, _1p]]),
         np.array([[7, 6, 8, 10, 9]]),
         np.array([[6, 9, 7, 8, 10]]),
-        atol=atol,
-        rtol=rtol,
+        **tols,
     )
 
     # They could also be nearly truncation of very large nearly tied result
     # sets hence all indices can also be distinct in this case:
-    assert_argkmin_results_quasi_equality(
+    assert_compatible_argkmin_results(
         np.array([[1, 1, _1p, _1p, _1p]]),
         np.array([[1, 1, 1, 1, _1p]]),
         np.array([[34, 30, 8, 12, 24]]),
         np.array([[42, 1, 21, 13, 3]]),
-        atol=atol,
-        rtol=rtol,
+        **tols,
     )
 
     # Apply invalid permutation on indices: permuting the ranks of the 2
@@ -410,16 +372,15 @@ def test_assert_argkmin_results_quasi_equality():
     # different.
     msg = re.escape(
         "Query vector with index 0 lead to different distances for common neighbor with"
-        " index 1: dist_a=1.2 vs dist_b=2.5 (with atol=0.0 and rtol=1e-07)"
+        " index 1: dist_a=1.2 vs dist_b=2.5"
     )
     with pytest.raises(AssertionError, match=msg):
-        assert_argkmin_results_quasi_equality(
+        assert_compatible_argkmin_results(
             np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
             np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
             np.array([[1, 2, 3, 4, 5]]),
             np.array([[2, 1, 3, 4, 5]]),
-            atol=atol,
-            rtol=rtol,
+            **tols,
         )
 
     # Detect missing indices within the expected precision level, even when the
@@ -428,13 +389,12 @@ def test_assert_argkmin_results_quasi_equality():
         "neighors in b missing from a: [12]\nneighors in a missing from b: [1]"
     )
     with pytest.raises(AssertionError, match=msg):
-        assert_argkmin_results_quasi_equality(
+        assert_compatible_argkmin_results(
             np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
             np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
             np.array([[1, 2, 3, 4, 5]]),
             np.array([[12, 2, 4, 11, 3]]),
-            atol=atol,
-            rtol=rtol,
+            **tols,
         )
 
     # Detect missing indices outside the expected precision level.
@@ -442,13 +402,12 @@ def test_assert_argkmin_results_quasi_equality():
         "neighors in b missing from a: []\nneighors in a missing from b: [3]"
     )
     with pytest.raises(AssertionError, match=msg):
-        assert_argkmin_results_quasi_equality(
+        assert_compatible_argkmin_results(
             np.array([[_1m, 1.0, _6_1m, 6.1, _6_1p]]),
             np.array([[1.0, 1.0, _6_1m, 6.1, 7]]),
             np.array([[1, 2, 3, 4, 5]]),
             np.array([[2, 1, 4, 5, 12]]),
-            atol=atol,
-            rtol=rtol,
+            **tols,
         )
 
     # Detect missing indices outside the expected precision level, in the other
@@ -457,34 +416,34 @@ def test_assert_argkmin_results_quasi_equality():
         "neighors in b missing from a: [5]\nneighors in a missing from b: []"
     )
     with pytest.raises(AssertionError, match=msg):
-        assert_argkmin_results_quasi_equality(
+        assert_compatible_argkmin_results(
             np.array([[_1m, 1.0, _6_1m, 6.1, 7]]),
             np.array([[1.0, 1.0, _6_1m, 6.1, _6_1p]]),
             np.array([[1, 2, 3, 4, 12]]),
             np.array([[2, 1, 5, 3, 4]]),
-            atol=atol,
-            rtol=rtol,
+            **tols,
         )
 
     # Distances aren't properly sorted
     msg = "Distances aren't sorted on row 0"
     with pytest.raises(AssertionError, match=msg):
-        assert_argkmin_results_quasi_equality(
+        assert_compatible_argkmin_results(
             np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
             np.array([[2.5, 1.2, _6_1m, 6.1, _6_1p]]),
             np.array([[1, 2, 3, 4, 5]]),
             np.array([[2, 1, 4, 5, 3]]),
-            atol=atol,
-            rtol=rtol,
+            **tols,
         )
 
 
-def test_assert_radius_neighbors_results_quasi_equality():
-    rtol = 1e-7
-    eps = 1e-7 / 2
+def test_assert_compatible_radius_results():
+    atol = 1e-7
+    rtol = 0.0
+    tols = dict(atol=atol, rtol=rtol)
+
+    eps = atol / 3
     _1m = 1.0 - eps
     _1p = 1.0 + eps
-
     _6_1m = 6.1 - eps
     _6_1p = 6.1 + eps
 
@@ -499,57 +458,57 @@ def test_assert_radius_neighbors_results_quasi_equality():
     ]
 
     # Sanity check: compare the reference results to themselves.
-    assert_radius_neighbors_results_quasi_equality(
+    assert_compatible_radius_results(
         ref_dist,
         ref_dist,
         ref_indices,
         ref_indices,
         radius=7.0,
-        rtol=rtol,
+        **tols,
     )
 
     # Apply valid permutation on indices
-    assert_radius_neighbors_results_quasi_equality(
+    assert_compatible_radius_results(
         np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
         np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
         np.array([np.array([1, 2, 3, 4, 5])]),
         np.array([np.array([1, 2, 4, 5, 3])]),
         radius=7.0,
-        rtol=rtol,
+        **tols,
     )
-    assert_radius_neighbors_results_quasi_equality(
+    assert_compatible_radius_results(
         np.array([np.array([_1m, _1m, 1, _1p, _1p])]),
         np.array([np.array([_1m, _1m, 1, _1p, _1p])]),
         np.array([np.array([6, 7, 8, 9, 10])]),
         np.array([np.array([6, 9, 7, 8, 10])]),
         radius=7.0,
-        rtol=rtol,
+        **tols,
     )
 
     # Apply invalid permutation on indices
     msg = re.escape(
         "Query vector with index 0 lead to different distances for common neighbor with"
-        " index 1: dist_a=1.2 vs dist_b=2.5 (with atol=1e-06 and rtol=1e-07)"
+        " index 1: dist_a=1.2 vs dist_b=2.5"
     )
     with pytest.raises(AssertionError, match=msg):
-        assert_radius_neighbors_results_quasi_equality(
+        assert_compatible_radius_results(
             np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
             np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
             np.array([np.array([1, 2, 3, 4, 5])]),
             np.array([np.array([2, 1, 3, 4, 5])]),
             radius=7.0,
-            rtol=rtol,
+            **tols,
         )
 
     # Having extra last or missing elements is valid if they are in the
     # tolerated rounding error range: [(1 - rtol) * radius - atol, radius]
-    assert_radius_neighbors_results_quasi_equality(
+    assert_compatible_radius_results(
         np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p, _6_1p])]),
         np.array([np.array([1.2, 2.5, _6_1m, 6.1])]),
         np.array([np.array([1, 2, 3, 4, 5, 7])]),
         np.array([np.array([1, 2, 3, 6])]),
         radius=_6_1p,
-        rtol=rtol,
+        **tols,
     )
 
     # Any discrepancy outside the tolerated rounding error range is invalid and
@@ -559,62 +518,62 @@ def test_assert_radius_neighbors_results_quasi_equality():
         " missing from a: []\nneighors in a missing from b: [3]"
     )
     with pytest.raises(AssertionError, match=msg):
-        assert_radius_neighbors_results_quasi_equality(
+        assert_compatible_radius_results(
             np.array([np.array([1.2, 2.5, 6])]),
             np.array([np.array([1.2, 2.5])]),
             np.array([np.array([1, 2, 3])]),
             np.array([np.array([1, 2])]),
             radius=6.1,
-            rtol=rtol,
+            **tols,
         )
     msg = re.escape(
         "Query vector with index 0 lead to mismatched result indices:\nneighors in b"
         " missing from a: [4]\nneighors in a missing from b: [2]"
     )
     with pytest.raises(AssertionError, match=msg):
-        assert_radius_neighbors_results_quasi_equality(
+        assert_compatible_radius_results(
             np.array([np.array([1.2, 2.1, 2.5])]),
             np.array([np.array([1.2, 2, 2.5])]),
             np.array([np.array([1, 2, 3])]),
             np.array([np.array([1, 4, 3])]),
             radius=6.1,
-            rtol=rtol,
+            **tols,
         )
 
     # Radius upper bound is strictly checked
     msg = re.escape(
-        "Largest returned distance 6.100000049999999 not within requested radius 6.1 on"
+        "Largest returned distance 6.100000033333333 not within requested radius 6.1 on"
         " row 0"
     )
     with pytest.raises(AssertionError, match=msg):
-        assert_radius_neighbors_results_quasi_equality(
+        assert_compatible_radius_results(
             np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
             np.array([np.array([1.2, 2.5, _6_1m, 6.1, 6.1])]),
             np.array([np.array([1, 2, 3, 4, 5])]),
             np.array([np.array([2, 1, 4, 5, 3])]),
             radius=6.1,
-            rtol=rtol,
+            **tols,
         )
     with pytest.raises(AssertionError, match=msg):
-        assert_radius_neighbors_results_quasi_equality(
+        assert_compatible_radius_results(
             np.array([np.array([1.2, 2.5, _6_1m, 6.1, 6.1])]),
             np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
             np.array([np.array([1, 2, 3, 4, 5])]),
             np.array([np.array([2, 1, 4, 5, 3])]),
             radius=6.1,
-            rtol=rtol,
+            **tols,
         )
 
     # Distances aren't properly sorted
     msg = "Distances aren't sorted on row 0"
     with pytest.raises(AssertionError, match=msg):
-        assert_radius_neighbors_results_quasi_equality(
+        assert_compatible_radius_results(
             np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
             np.array([np.array([2.5, 1.2, _6_1m, 6.1, _6_1p])]),
             np.array([np.array([1, 2, 3, 4, 5])]),
             np.array([np.array([2, 1, 4, 5, 3])]),
             radius=6.1,
-            rtol=rtol,
+            **tols,
         )
 
 
From 70dd93306d393e675f2fbd1b25dacbceffe78be6 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 14 Sep 2023 17:31:24 +0200
Subject: [PATCH 17/33] [azure parallel] [all random seeds]
 test_pairwise_distances


From e8c227a0f63f7c587867c663f5dbc88f76d4ae5a Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 14 Sep 2023 17:36:26 +0200
Subject: [PATCH 18/33] More interesting tests

---
 .../metrics/tests/test_pairwise_distances_reduction.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 6decc47fdd6f5..8588cf0c8fc22 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -329,9 +329,9 @@ def test_assert_compatible_argkmin_results():
     # to one another so we accept any permutation on their rankings.
     assert_compatible_argkmin_results(
         np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
-        np.array([[1.2, 2.5, 6.1, 6.1, 6.1]]),
+        np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
         np.array([[1, 2, 3, 4, 5]]),
-        np.array([[1, 2, 4, 5, 3]]),
+        np.array([[1, 2, 5, 4, 3]]),
         atol=atol,
         rtol=rtol,
     )
@@ -350,7 +350,7 @@ def test_assert_compatible_argkmin_results():
     # All points are have close distances so any ranking permutation
     # is valid for this query result.
     assert_compatible_argkmin_results(
-        np.array([[1, 1, _1p, _1p, _1p]]),
+        np.array([[_1m, 1, _1p, _1p, _1p]]),
         np.array([[1, 1, 1, 1, _1p]]),
         np.array([[7, 6, 8, 10, 9]]),
         np.array([[6, 9, 7, 8, 10]]),
@@ -360,8 +360,8 @@ def test_assert_compatible_argkmin_results():
     # They could also be nearly truncation of very large nearly tied result
     # sets hence all indices can also be distinct in this case:
     assert_compatible_argkmin_results(
-        np.array([[1, 1, _1p, _1p, _1p]]),
-        np.array([[1, 1, 1, 1, _1p]]),
+        np.array([[_1m, 1, _1p, _1p, _1p]]),
+        np.array([[_1m, 1, 1, 1, _1p]]),
         np.array([[34, 30, 8, 12, 24]]),
         np.array([[42, 1, 21, 13, 3]]),
         **tols,

From 9845866e2ca0ad0dc93a411adbfa29bc35c253fc Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 14 Sep 2023 17:36:32 +0200
Subject: [PATCH 19/33] [azure parallel] [all random seeds]
 test_pairwise_distances


From 106952a1ee0a6e416095152dacc1f638f4e8722b Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 14 Sep 2023 17:51:27 +0200
Subject: [PATCH 20/33] Fix side effect of the refactoring in test_neighbors

---
 .../test_pairwise_distances_reduction.py      | 38 +++++++++++++++----
 sklearn/neighbors/tests/test_neighbors.py     |  5 ++-
 2 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 8588cf0c8fc22..fb9281967de09 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -199,6 +199,7 @@ def assert_compatible_radius_results(
     neighbors_indices_a,
     neighbors_indices_b,
     radius,
+    check_sorted=True,
     rtol=1e-5,
     atol=1e-6,
 ):
@@ -234,8 +235,9 @@ def assert_compatible_radius_results(
         indices_row_a = neighbors_indices_a[query_idx]
         indices_row_b = neighbors_indices_b[query_idx]
 
-        assert is_sorted(dist_row_a), f"Distances aren't sorted on row {query_idx}"
-        assert is_sorted(dist_row_b), f"Distances aren't sorted on row {query_idx}"
+        if check_sorted:
+            assert is_sorted(dist_row_a), f"Distances aren't sorted on row {query_idx}"
+            assert is_sorted(dist_row_b), f"Distances aren't sorted on row {query_idx}"
 
         assert len(dist_row_a) == len(indices_row_a)
         assert len(dist_row_b) == len(indices_row_b)
@@ -436,7 +438,8 @@ def test_assert_compatible_argkmin_results():
         )
 
 
-def test_assert_compatible_radius_results():
+@pytest.mark.parametrize("check_sorted", [True, False])
+def test_assert_compatible_radius_results(check_sorted):
     atol = 1e-7
     rtol = 0.0
     tols = dict(atol=atol, rtol=rtol)
@@ -464,6 +467,7 @@ def test_assert_compatible_radius_results():
         ref_indices,
         ref_indices,
         radius=7.0,
+        check_sorted=check_sorted,
         **tols,
     )
 
@@ -474,6 +478,7 @@ def test_assert_compatible_radius_results():
         np.array([np.array([1, 2, 3, 4, 5])]),
         np.array([np.array([1, 2, 4, 5, 3])]),
         radius=7.0,
+        check_sorted=check_sorted,
         **tols,
     )
     assert_compatible_radius_results(
@@ -482,6 +487,7 @@ def test_assert_compatible_radius_results():
         np.array([np.array([6, 7, 8, 9, 10])]),
         np.array([np.array([6, 9, 7, 8, 10])]),
         radius=7.0,
+        check_sorted=check_sorted,
         **tols,
     )
 
@@ -497,6 +503,7 @@ def test_assert_compatible_radius_results():
             np.array([np.array([1, 2, 3, 4, 5])]),
             np.array([np.array([2, 1, 3, 4, 5])]),
             radius=7.0,
+            check_sorted=check_sorted,
             **tols,
         )
 
@@ -508,6 +515,7 @@ def test_assert_compatible_radius_results():
         np.array([np.array([1, 2, 3, 4, 5, 7])]),
         np.array([np.array([1, 2, 3, 6])]),
         radius=_6_1p,
+        check_sorted=check_sorted,
         **tols,
     )
 
@@ -524,6 +532,7 @@ def test_assert_compatible_radius_results():
             np.array([np.array([1, 2, 3])]),
             np.array([np.array([1, 2])]),
             radius=6.1,
+            check_sorted=check_sorted,
             **tols,
         )
     msg = re.escape(
@@ -537,6 +546,7 @@ def test_assert_compatible_radius_results():
             np.array([np.array([1, 2, 3])]),
             np.array([np.array([1, 4, 3])]),
             radius=6.1,
+            check_sorted=check_sorted,
             **tols,
         )
 
@@ -552,6 +562,7 @@ def test_assert_compatible_radius_results():
             np.array([np.array([1, 2, 3, 4, 5])]),
             np.array([np.array([2, 1, 4, 5, 3])]),
             radius=6.1,
+            check_sorted=check_sorted,
             **tols,
         )
     with pytest.raises(AssertionError, match=msg):
@@ -561,18 +572,31 @@ def test_assert_compatible_radius_results():
             np.array([np.array([1, 2, 3, 4, 5])]),
             np.array([np.array([2, 1, 4, 5, 3])]),
             radius=6.1,
+            check_sorted=check_sorted,
             **tols,
         )
 
-    # Distances aren't properly sorted
-    msg = "Distances aren't sorted on row 0"
-    with pytest.raises(AssertionError, match=msg):
+    if check_sorted:
+        # Distances aren't properly sorted
+        msg = "Distances aren't sorted on row 0"
+        with pytest.raises(AssertionError, match=msg):
+            assert_compatible_radius_results(
+                np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
+                np.array([np.array([2.5, 1.2, _6_1m, 6.1, _6_1p])]),
+                np.array([np.array([1, 2, 3, 4, 5])]),
+                np.array([np.array([2, 1, 4, 5, 3])]),
+                radius=_6_1p,
+                check_sorted=True,
+                **tols,
+            )
+    else:
         assert_compatible_radius_results(
             np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
             np.array([np.array([2.5, 1.2, _6_1m, 6.1, _6_1p])]),
             np.array([np.array([1, 2, 3, 4, 5])]),
             np.array([np.array([2, 1, 4, 5, 3])]),
-            radius=6.1,
+            radius=_6_1p,
+            check_sorted=False,
             **tols,
         )
 
diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index 35fc210bea7f3..08efb587808f7 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -30,7 +30,7 @@
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.metrics.tests.test_dist_metrics import BOOL_METRICS
 from sklearn.metrics.tests.test_pairwise_distances_reduction import (
-    assert_radius_neighbors_results_equality,
+    assert_compatible_radius_results,
 )
 from sklearn.model_selection import cross_val_score, train_test_split
 from sklearn.neighbors import (
@@ -2242,12 +2242,13 @@ def test_radius_neighbors_brute_backend(
                 X_test, return_distance=True
             )
 
-        assert_radius_neighbors_results_equality(
+        assert_compatible_radius_results(
             legacy_brute_dst,
             pdr_brute_dst,
             legacy_brute_idx,
             pdr_brute_idx,
             radius=radius,
+            check_sorted=False,
         )
 
 
From 9b57538d4a2c734f1a370131bb5237aa6ed669f9 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 14 Sep 2023 19:36:45 +0200
Subject: [PATCH 21/33] More test tweaks to reduce test time while increasing
 aggregate coverage

---
 .../test_pairwise_distances_reduction.py      |  4 +--
 sklearn/neighbors/tests/test_neighbors.py     | 26 +++++++++++++------
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index fb9281967de09..19c68cecc01b6 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -1208,7 +1208,7 @@ def test_pairwise_distances_argkmin(
     strategy,
     dtype,
     csr_container,
-    n_queries=10,
+    n_queries=5,
     n_samples=100,
     k=10,
 ):
@@ -1273,7 +1273,7 @@ def test_pairwise_distances_radius_neighbors(
     metric,
     strategy,
     dtype,
-    n_queries=10,
+    n_queries=5,
     n_samples=100,
 ):
     rng = np.random.RandomState(global_random_seed)
diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index 08efb587808f7..daf4c13c2a5b3 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -30,6 +30,7 @@
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.metrics.tests.test_dist_metrics import BOOL_METRICS
 from sklearn.metrics.tests.test_pairwise_distances_reduction import (
+    assert_compatible_argkmin_results,
     assert_compatible_radius_results,
 )
 from sklearn.model_selection import cross_val_score, train_test_split
@@ -1694,8 +1695,15 @@ def test_neighbors_metrics(
     "metric", sorted(set(neighbors.VALID_METRICS["brute"]) - set(["precomputed"]))
 )
 def test_kneighbors_brute_backend(
-    global_dtype, metric, n_samples=2000, n_features=30, n_query_pts=100, n_neighbors=5
+    metric,
+    global_dtype,
+    global_random_seed,
+    n_samples=2000,
+    n_features=30,
+    n_query_pts=5,
+    n_neighbors=5,
 ):
+    rng = np.random.RandomState(global_random_seed)
     # Both backend for the 'brute' algorithm of kneighbors must give identical results.
     X_train = rng.rand(n_samples, n_features).astype(global_dtype, copy=False)
     X_test = rng.rand(n_query_pts, n_features).astype(global_dtype, copy=False)
@@ -1732,8 +1740,9 @@ def test_kneighbors_brute_backend(
                 X_test, return_distance=True
             )
 
-        assert_allclose(legacy_brute_dst, pdr_brute_dst)
-        assert_array_equal(legacy_brute_idx, pdr_brute_idx)
+        assert_compatible_argkmin_results(
+            legacy_brute_dst, pdr_brute_dst, legacy_brute_idx, pdr_brute_idx
+        )
 
 
 def test_callable_metric():
@@ -2198,16 +2207,18 @@ def test_auto_algorithm(X, metric, metric_params, expected_algo):
 )
 def test_radius_neighbors_brute_backend(
     metric,
+    global_random_seed,
+    global_dtype,
     n_samples=2000,
     n_features=30,
-    n_query_pts=100,
-    n_neighbors=5,
+    n_query_pts=5,
     radius=1.0,
 ):
+    rng = np.random.RandomState(global_random_seed)
     # Both backends for the 'brute' algorithm of radius_neighbors
     # must give identical results.
-    X_train = rng.rand(n_samples, n_features)
-    X_test = rng.rand(n_query_pts, n_features)
+    X_train = rng.rand(n_samples, n_features).astype(global_dtype, copy=False)
+    X_test = rng.rand(n_query_pts, n_features).astype(global_dtype, copy=False)
 
     # Haversine distance only accepts 2D data
     if metric == "haversine":
@@ -2221,7 +2232,6 @@ def test_radius_neighbors_brute_backend(
         p = metric_params.pop("p", 2)
 
         neigh = neighbors.NearestNeighbors(
-            n_neighbors=n_neighbors,
             radius=radius,
             algorithm="brute",
             metric=metric,

From 30f190164a921bc7be19402fdd216363d88558f5 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 14 Sep 2023 20:26:53 +0200
Subject: [PATCH 22/33] Optimize the speed of running
 test_strategies_consistency while keeping it as useful as possible at finding
 regressions

---
 .../test_pairwise_distances_reduction.py      | 38 ++++++++++++-------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 19c68cecc01b6..c78aefbfa9455 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -8,7 +8,7 @@
 import threadpoolctl
 from scipy.spatial.distance import cdist
 
-from sklearn.metrics import euclidean_distances
+from sklearn.metrics import euclidean_distances, pairwise_distances
 from sklearn.metrics._pairwise_distances_reduction import (
     ArgKmin,
     ArgKminClassMode,
@@ -1122,25 +1122,30 @@ def test_format_agnosticism(
         )
 
 
-@pytest.mark.parametrize(
-    "metric",
-    ["euclidean", "minkowski", "manhattan", "infinity", "seuclidean", "haversine"],
-)
 @pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
-@pytest.mark.parametrize("dtype", [np.float64, np.float32])
 def test_strategies_consistency(
     global_random_seed,
+    global_dtype,
     Dispatcher,
-    metric,
-    dtype,
     n_features=10,
 ):
     """Check that the results do not depend on the strategy used."""
     rng = np.random.RandomState(global_random_seed)
+    metric = rng.choice(
+        np.array(
+            [
+                "euclidean",
+                "minkowski",
+                "manhattan",
+                "haversine",
+            ],
+            dtype=object,
+        )
+    )
     n_samples_X, n_samples_Y = rng.choice([97, 100, 101, 500], size=2, replace=False)
     spread = 100
-    X = rng.rand(n_samples_X, n_features).astype(dtype) * spread
-    Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread
+    X = rng.rand(n_samples_X, n_features).astype(global_dtype) * spread
+    Y = rng.rand(n_samples_Y, n_features).astype(global_dtype) * spread
 
     # Haversine distance only accepts 2D data
     if metric == "haversine":
@@ -1152,8 +1157,15 @@ def test_strategies_consistency(
         check_parameters = {}
         compute_parameters = {}
     else:
-        # Scaling the radius slightly with the numbers of dimensions
-        radius = 10 ** np.log(n_features)
+        # Find a non-trivial radius using a small subsample of X and Y: we want
+        # to return around expected_n_neighbors on average. Yielding too many
+        # results would make the test slow (because checking the results is
+        # expensive for large result sets), yielding 0 most of the time would
+        # make the test useless.
+        expected_n_neighbors = 10
+        sample_dists = pairwise_distances(X[:10], Y, metric=metric)
+        sample_dists.sort(axis=1)
+        radius = sample_dists[:, expected_n_neighbors].mean()
         parameter = radius
         check_parameters = {"radius": radius}
         compute_parameters = {"sort_results": True}
@@ -1190,7 +1202,7 @@ def test_strategies_consistency(
         **compute_parameters,
     )
 
-    ASSERT_RESULT[(Dispatcher, dtype)](
+    ASSERT_RESULT[(Dispatcher, global_dtype)](
         dist_par_X, dist_par_Y, indices_par_X, indices_par_Y, **check_parameters
     )
 

From 897fece7c8c03d65168fe1e6ed3daba6db589551 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 14 Sep 2023 20:48:11 +0200
Subject: [PATCH 23/33] Attempt to make
 test_pairwise_distances_radius_neighbors more useful by using a data-driven
 radius

---
 .../test_pairwise_distances_reduction.py      | 21 ++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index c78aefbfa9455..d31ada7753d92 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -244,13 +244,15 @@ def assert_compatible_radius_results(
 
         # Check that all distances are within the requested radius
         if len(dist_row_a) > 0:
-            assert dist_row_a[-1] <= radius, (
-                f"Largest returned distance {dist_row_a[-1]} not within requested"
+            max_dist_a = np.max(dist_row_a)
+            assert max_dist_a <= radius, (
+                f"Largest returned distance {max_dist_a} not within requested"
                 f" radius {radius} on row {query_idx}"
             )
         if len(dist_row_b) > 0:
-            assert dist_row_b[-1] <= radius, (
-                f"Largest returned distance {dist_row_b[-1]} not within requested"
+            max_dist_b = np.max(dist_row_b)
+            assert max_dist_b <= radius, (
+                f"Largest returned distance {max_dist_b} not within requested"
                 f" radius {radius} on row {query_idx}"
             )
 
@@ -1292,7 +1294,6 @@ def test_pairwise_distances_radius_neighbors(
     n_features = rng.choice([50, 500])
     translation = rng.choice([0, 1e6])
     spread = 1000
-    radius = spread * np.log(n_features)
     X = translation + rng.rand(n_queries, n_features).astype(dtype) * spread
     Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
 
@@ -1307,6 +1308,16 @@ def test_pairwise_distances_radius_neighbors(
     else:
         dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs)
 
+    # Find a non-trivial radius using a small subsample of the dist_matrix:
+    # we want to return around expected_n_neighbors on average. Yielding too
+    # many results would make the test slow (because checking the results is
+    # expensive for large result sets), yielding 0 most of the time would make
+    # the test useless.
+    expected_n_neighbors = 10
+    sample_dists = dist_matrix[:10]
+    sample_dists.sort(axis=1)
+    radius = sample_dists[:, expected_n_neighbors].mean()
+
     # Getting the neighbors for a given radius
     neigh_indices_ref = []
     neigh_distances_ref = []

From d23210ba77ca55297570651e303b3f0ac38413f5 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Thu, 14 Sep 2023 23:56:56 +0200
Subject: [PATCH 24/33] Fix side effect of the heuristic radius computation

---
 sklearn/metrics/tests/test_pairwise_distances_reduction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index d31ada7753d92..381802fa284b5 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -1314,7 +1314,7 @@ def test_pairwise_distances_radius_neighbors(
     # expensive for large result sets), yielding 0 most of the time would make
     # the test useless.
     expected_n_neighbors = 10
-    sample_dists = dist_matrix[:10]
+    sample_dists = dist_matrix[:10].copy()
     sample_dists.sort(axis=1)
     radius = sample_dists[:, expected_n_neighbors].mean()
 

From f2d9725c8d0a2c84ffdf5beb40a968e1e118d51c Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Fri, 15 Sep 2023 00:11:20 +0200
Subject: [PATCH 25/33] Accelerate test_n_threads_agnosticism by using a
 data-drive radius value

---
 sklearn/metrics/tests/test_pairwise_distances_reduction.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 381802fa284b5..57ab46edaedd5 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -1028,7 +1028,10 @@ def test_n_threads_agnosticism(
         compute_parameters = {}
     else:
         # Scaling the radius slightly with the numbers of dimensions
-        radius = 10 ** np.log(n_features)
+        expected_n_neighbors = 10
+        dist_sample = euclidean_distances(X[:10], Y)
+        dist_sample.sort(axis=1)
+        radius = dist_sample[:, expected_n_neighbors].mean()
         parameter = radius
         check_parameters = {"radius": radius}
         compute_parameters = {"sort_results": True}

From ec263ca9e70840d91688550c45d11e027d05bda4 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Fri, 15 Sep 2023 00:16:38 +0200
Subject: [PATCH 26/33] Accelerate test_format_agnosticism and
 test_chunk_size_agnosticism by using a data-drive radius value

---
 .../metrics/tests/test_pairwise_distances_reduction.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 57ab46edaedd5..5dcca4ad97b02 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -977,7 +977,10 @@ def test_chunk_size_agnosticism(
         compute_parameters = {}
     else:
         # Scaling the radius slightly with the numbers of dimensions
-        radius = 10 ** np.log(n_features)
+        expected_n_neighbors = 10
+        dist_sample = euclidean_distances(X[:10], Y)
+        dist_sample.sort(axis=1)
+        radius = dist_sample[:, expected_n_neighbors].mean()
         parameter = radius
         check_parameters = {"radius": radius}
         compute_parameters = {"sort_results": True}
@@ -1093,7 +1096,10 @@ def test_format_agnosticism(
         compute_parameters = {}
     else:
         # Scaling the radius slightly with the numbers of dimensions
-        radius = 10 ** np.log(n_features)
+        expected_n_neighbors = 10
+        dist_sample = euclidean_distances(X[:10], Y)
+        dist_sample.sort(axis=1)
+        radius = dist_sample[:, expected_n_neighbors].mean()
         parameter = radius
         check_parameters = {"radius": radius}
         compute_parameters = {"sort_results": True}

From f43d5cb7d755f0a2c01b41e9a804c034a8da20b9 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Fri, 15 Sep 2023 10:22:52 +0200
Subject: [PATCH 27/33] Do not rely on implicit data conversion in
 test_kneighbors_brute_backend

---
 sklearn/neighbors/tests/test_neighbors.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index daf4c13c2a5b3..9a0dd18d845fa 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -27,7 +27,7 @@
 from sklearn.metrics._dist_metrics import (
     DistanceMetric,
 )
-from sklearn.metrics.pairwise import pairwise_distances
+from sklearn.metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS, pairwise_distances
 from sklearn.metrics.tests.test_dist_metrics import BOOL_METRICS
 from sklearn.metrics.tests.test_pairwise_distances_reduction import (
     assert_compatible_argkmin_results,
@@ -1714,6 +1714,10 @@ def test_kneighbors_brute_backend(
         X_train = np.ascontiguousarray(X_train[:, feature_sl])
         X_test = np.ascontiguousarray(X_test[:, feature_sl])
 
+    if metric in metric in PAIRWISE_BOOLEAN_FUNCTIONS:
+        X_train = X_train > 0.5
+        X_test = X_test > 0.5
+
     metric_params_list = _generate_test_params_for(metric, n_features)
 
     for metric_params in metric_params_list:

From f2d8d8ec9cebe6465ce15e3cf7fd21d7ac2c65ad Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Fri, 15 Sep 2023 11:51:27 +0200
Subject: [PATCH 28/33] [azure parallel] [all random seeds]
 test_pairwise_distances


From e85d81bebcfba9927728f3ed1defcf7c7ee94e49 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Fri, 15 Sep 2023 14:44:17 +0200
Subject: [PATCH 29/33] Typo: if metric in metric in ...

---
 sklearn/neighbors/tests/test_neighbors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index d39e10eeddfa4..947879999a156 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -1714,7 +1714,7 @@ def test_kneighbors_brute_backend(
         X_train = np.ascontiguousarray(X_train[:, feature_sl])
         X_test = np.ascontiguousarray(X_test[:, feature_sl])
 
-    if metric in metric in PAIRWISE_BOOLEAN_FUNCTIONS:
+    if metric in PAIRWISE_BOOLEAN_FUNCTIONS:
         X_train = X_train > 0.5
         X_test = X_test > 0.5
 

From c01255e3f7f0ffa4226b8a4f0f4ae142d53925ec Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Tue, 26 Sep 2023 11:03:45 +0200
Subject: [PATCH 30/33] Typos and code style improvements

Co-authored-by: Julien Jerphanion <git@jjerphan.xyz>
---
 .../tests/test_pairwise_distances_reduction.py   | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 5dcca4ad97b02..d82aba911d6b8 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -81,8 +81,8 @@ def assert_same_distances_for_common_neighbors(
     # Compute a mapping from indices to distances for each result set and
     # check that the computed neighbors with matching indices are withing
     # the expected distance tolerance.
-    indices_to_dist_a = {idx: dist for idx, dist in zip(indices_row_a, dist_row_a)}
-    indices_to_dist_b = {idx: dist for idx, dist in zip(indices_row_b, dist_row_b)}
+    indices_to_dist_a = dict(zip(indices_row_a, dist_row_a))
+    indices_to_dist_b = dict(zip(indices_row_b, dist_row_b))
 
     common_indices = set(indices_row_a).intersection(set(indices_row_b))
     for idx in common_indices:
@@ -224,7 +224,7 @@ def assert_compatible_radius_results(
         == len(neighbors_dists_b)
         == len(neighbors_indices_a)
         == len(neighbors_indices_b)
-    ), "Arrays of results have various lengths."
+    )
 
     n_queries = len(neighbors_dists_a)
 
@@ -336,22 +336,20 @@ def test_assert_compatible_argkmin_results():
         np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
         np.array([[1, 2, 3, 4, 5]]),
         np.array([[1, 2, 5, 4, 3]]),
-        atol=atol,
-        rtol=rtol,
+        **tols,
     )
 
     # The last few indices do not necessarily have to match because of the rounding
-    # errors on the distances: their could be tied results at the boundary.
+    # errors on the distances: there could be tied results at the boundary.
     assert_compatible_argkmin_results(
         np.array([[1.2, 2.5, 3.0, 6.1, _6_1p]]),
         np.array([[1.2, 2.5, 3.0, _6_1m, 6.1]]),
         np.array([[1, 2, 3, 4, 5]]),
         np.array([[1, 2, 3, 6, 7]]),
-        atol=atol,
-        rtol=rtol,
+        **tols,
     )
 
-    # All points are have close distances so any ranking permutation
+    # All points have close distances so any ranking permutation
     # is valid for this query result.
     assert_compatible_argkmin_results(
         np.array([[_1m, 1, _1p, _1p, _1p]]),

From 05234d490102c3e3f7b8a7fa3c8e47b0b6105543 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Tue, 26 Sep 2023 11:35:15 +0200
Subject: [PATCH 31/33] Factorize redundant test logic to find a non-trivial
 test radius

---
 .../test_pairwise_distances_reduction.py      | 67 ++++++++++---------
 1 file changed, 34 insertions(+), 33 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index d82aba911d6b8..7dd1a708df9b7 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -193,6 +193,33 @@ def assert_compatible_argkmin_results(
         )
 
 
+def _non_trivial_radius(
+    *,
+    X=None,
+    Y=None,
+    metric=None,
+    precomputed_dists=None,
+    expected_n_neighbors=10,
+    n_subsampled_queries=10,
+    **metric_kwargs,
+):
+    # Find a non-trivial radius using a small subsample of the pairwise
+    # distances between X and Y: we want to return around expected_n_neighbors
+    # on average. Yielding too many results would make the test slow (because
+    # checking the results is expensive for large result sets), yielding 0 most
+    # of the time would make the test useless.
+    if precomputed_dists is None and metric is None:
+        raise ValueError("Either metric or dists must be provided")
+    if precomputed_dists is None:
+        assert X is not None
+        assert Y is not None
+        sampled_dists = pairwise_distances(X, Y, metric=metric, **metric_kwargs)
+    else:
+        sampled_dists = precomputed_dists[:n_subsampled_queries].copy()
+    sampled_dists.sort(axis=1)
+    return sampled_dists[:, expected_n_neighbors].mean()
+
+
 def assert_compatible_radius_results(
     neighbors_dists_a,
     neighbors_dists_b,
@@ -974,11 +1001,7 @@ def test_chunk_size_agnosticism(
         check_parameters = {}
         compute_parameters = {}
     else:
-        # Scaling the radius slightly with the numbers of dimensions
-        expected_n_neighbors = 10
-        dist_sample = euclidean_distances(X[:10], Y)
-        dist_sample.sort(axis=1)
-        radius = dist_sample[:, expected_n_neighbors].mean()
+        radius = _non_trivial_radius(X=X, Y=Y, metric="euclidean")
         parameter = radius
         check_parameters = {"radius": radius}
         compute_parameters = {"sort_results": True}
@@ -1028,11 +1051,7 @@ def test_n_threads_agnosticism(
         check_parameters = {}
         compute_parameters = {}
     else:
-        # Scaling the radius slightly with the numbers of dimensions
-        expected_n_neighbors = 10
-        dist_sample = euclidean_distances(X[:10], Y)
-        dist_sample.sort(axis=1)
-        radius = dist_sample[:, expected_n_neighbors].mean()
+        radius = _non_trivial_radius(X=X, Y=Y, metric="euclidean")
         parameter = radius
         check_parameters = {"radius": radius}
         compute_parameters = {"sort_results": True}
@@ -1093,11 +1112,9 @@ def test_format_agnosticism(
         check_parameters = {}
         compute_parameters = {}
     else:
-        # Scaling the radius slightly with the numbers of dimensions
-        expected_n_neighbors = 10
-        dist_sample = euclidean_distances(X[:10], Y)
-        dist_sample.sort(axis=1)
-        radius = dist_sample[:, expected_n_neighbors].mean()
+        # Adjusting the radius to ensure that the expected results is neither
+        # trivially empty nor too large.
+        radius = _non_trivial_radius(X=X, Y=Y, metric="euclidean")
         parameter = radius
         check_parameters = {"radius": radius}
         compute_parameters = {"sort_results": True}
@@ -1166,15 +1183,7 @@ def test_strategies_consistency(
         check_parameters = {}
         compute_parameters = {}
     else:
-        # Find a non-trivial radius using a small subsample of X and Y: we want
-        # to return around expected_n_neighbors on average. Yielding too many
-        # results would make the test slow (because checking the results is
-        # expensive for large result sets), yielding 0 most of the time would
-        # make the test useless.
-        expected_n_neighbors = 10
-        sample_dists = pairwise_distances(X[:10], Y, metric=metric)
-        sample_dists.sort(axis=1)
-        radius = sample_dists[:, expected_n_neighbors].mean()
+        radius = _non_trivial_radius(X=X, Y=Y, metric=metric)
         parameter = radius
         check_parameters = {"radius": radius}
         compute_parameters = {"sort_results": True}
@@ -1315,15 +1324,7 @@ def test_pairwise_distances_radius_neighbors(
     else:
         dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs)
 
-    # Find a non-trivial radius using a small subsample of the dist_matrix:
-    # we want to return around expected_n_neighbors on average. Yielding too
-    # many results would make the test slow (because checking the results is
-    # expensive for large result sets), yielding 0 most of the time would make
-    # the test useless.
-    expected_n_neighbors = 10
-    sample_dists = dist_matrix[:10].copy()
-    sample_dists.sort(axis=1)
-    radius = sample_dists[:, expected_n_neighbors].mean()
+    radius = _non_trivial_radius(precomputed_dists=dist_matrix)
 
     # Getting the neighbors for a given radius
     neigh_indices_ref = []

From 1224fb6aa50951d67db09c8dd8b89fe642d26c2a Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Wed, 27 Sep 2023 18:08:20 +0200
Subject: [PATCH 32/33] Apply suggestions from code review

Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
---
 sklearn/metrics/tests/test_pairwise_distances_reduction.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 686a9d9354fa7..c9d5911ae7712 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -80,7 +80,7 @@ def assert_same_distances_for_common_neighbors(
     Missingness is handled by assert_no_missing_neighbors.
     """
     # Compute a mapping from indices to distances for each result set and
-    # check that the computed neighbors with matching indices are withing
+    # check that the computed neighbors with matching indices are within
     # the expected distance tolerance.
     indices_to_dist_a = dict(zip(indices_row_a, dist_row_a))
     indices_to_dist_b = dict(zip(indices_row_b, dist_row_b))
@@ -135,7 +135,7 @@ def assert_compatible_argkmin_results(
     rtol=1e-5,
     atol=1e-6,
 ):
-    """Assert that argkmin results are valid up to rounding errors
+    """Assert that argkmin results are valid up to rounding errors.
 
     This function asserts that the results of argkmin queries are valid up to:
     - rounding error tolerance on distance values;

From 7fb42619edfaea497eb5f6f321c9af6f0e890817 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Wed, 27 Sep 2023 18:20:43 +0200
Subject: [PATCH 33/33] DOC add missing docstring + improve inline comment

---
 .../test_pairwise_distances_reduction.py      | 23 +++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index c9d5911ae7712..75f497315ff01 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -111,6 +111,18 @@ def assert_no_missing_neighbors(
     indices_row_b,
     threshold,
 ):
+    """Compare the indices of neighbors in two results sets.
+
+    Any neighbor index with a distance below the precision threshold should
+    match one in the other result set. We ignore the last few neighbors beyond
+    the threshold as those can typically be missing due to rounding errors.
+
+    For radius queries, the threshold is just the radius minus the expected
+    precision level.
+
+    For k-NN queries, it is the maxium distance to the k-th neighbor minus the
+    expected precision level.
+    """
     mask_a = dist_row_a < threshold
     mask_b = dist_row_b < threshold
     missing_from_b = np.setdiff1d(indices_row_a[mask_a], indices_row_b)
@@ -179,8 +191,15 @@ def assert_compatible_argkmin_results(
             atol,
         )
 
-        # Check that any neighbor with distances below the rounding error threshold have
-        # matching indices.
+        # Check that any neighbor with distances below the rounding error
+        # threshold have matching indices. The threshold is the distance to the
+        # k-th neighbors minus the expected precision level:
+        #
+        # (1 - rtol) * dist_k - atol
+        #
+        # Where dist_k is defined as the maxium distance to the kth-neighbor
+        # among the two result sets. This way of defining the threshold is
+        # stricter than taking the minimum of the two.
         threshold = (1 - rtol) * np.maximum(
             np.max(dist_row_a), np.max(dist_row_b)
         ) - atol