From a9f461711eff9ec07d0ef12c174c6d0e8183ef51 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Sun, 3 Sep 2023 16:58:50 +0200 Subject: [PATCH 01/33] test: Improve `assert_argkmin_results_quasi_equality` error message Signed-off-by: Julien Jerphanion Co-authored-by: Olivier Grisel --- .../test_pairwise_distances_reduction.py | 83 ++++++++++++++----- 1 file changed, 61 insertions(+), 22 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 46405048a4fa1..1be218be07687 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -107,6 +107,35 @@ def test_relative_rounding(): assert relative_rounding(123.456789, 10) == 123.456789 +def quasi_equality_error_message( + n_significant_digits, + rtol, + query_idx, + ref_dist_row, + dist_row, + rounded_dist, + group_rank, + reference_neighbors_group, + effective_neighbors_group, +): + # Turning formatting off to avoid black reformatting the message. + # fmt: off + error_message = ( + "Neighbors indices are not matching when rounding distances at\n" + f"{n_significant_digits} significant digits derived from rtol={rtol:.1e}.\n\n" + f"Query vector index : {query_idx}\n" + f"Reference ordered distances: {ref_dist_row}\n" + f"Computed ordered distances : {dist_row}\n" + f"Norm. abs. ord. dist. diff.: {np.mean(np.abs(dist_row - ref_dist_row))}\n" + f"Rounded distance : {rounded_dist}\n" + f"Neighbors group rank : {group_rank}\n" + f"Reference neighbors indices: {list(sorted(reference_neighbors_group))}\n" + f"Computed neighbors indices : {list(sorted(effective_neighbors_group))}" + ) + # fmt: on + return error_message + + def assert_argkmin_results_quasi_equality( ref_dist, dist, @@ -162,16 +191,21 @@ def assert_argkmin_results_quasi_equality( effective_neighbors_groups[rounded_dist].add(indices_row[neighbor_rank]) # Asserting equality of groups (sets) for each distance - msg = ( - f"Neighbors indices for query {query_idx} are not matching " - f"when rounding distances at {n_significant_digits} significant digits " - f"derived from rtol={rtol:.1e}" - ) - for rounded_distance in reference_neighbors_groups.keys(): + for group_rank, rounded_dist in enumerate(reference_neighbors_groups.keys()): assert ( - reference_neighbors_groups[rounded_distance] - == effective_neighbors_groups[rounded_distance] - ), msg + reference_neighbors_groups[rounded_dist] + == effective_neighbors_groups[rounded_dist] + ), quasi_equality_error_message( + n_significant_digits, + rtol, + query_idx, + ref_dist_row, + dist_row, + rounded_dist, + group_rank, + reference_neighbors_groups[rounded_dist], + effective_neighbors_groups[rounded_dist], + ) def assert_radius_neighbors_results_equality( @@ -272,16 +306,21 @@ def assert_radius_neighbors_results_quasi_equality( effective_neighbors_groups[rounded_dist].add(indices_row[neighbor_rank]) # Asserting equality of groups (sets) for each distance - msg = ( - f"Neighbors indices for query {query_idx} are not matching " - f"when rounding distances at {n_significant_digits} significant digits " - f"derived from rtol={rtol:.1e}" - ) - for rounded_distance in reference_neighbors_groups.keys(): + for group_rank, rounded_dist in enumerate(reference_neighbors_groups.keys()): assert ( - reference_neighbors_groups[rounded_distance] - == effective_neighbors_groups[rounded_distance] - ), msg + reference_neighbors_groups[rounded_dist] + == effective_neighbors_groups[rounded_dist] + ), quasi_equality_error_message( + n_significant_digits, + rtol, + query_idx, + ref_dist_row, + dist_row, + rounded_dist, + group_rank, + reference_neighbors_groups[rounded_dist], + effective_neighbors_groups[rounded_dist], + ) ASSERT_RESULT = { @@ -358,7 +397,7 @@ def test_assert_argkmin_results_quasi_equality(): # Apply invalid permutation on indices: permuting the ranks # of the 2 nearest neighbors is invalid because the distance # values are too different. - msg = "Neighbors indices for query 0 are not matching" + msg = "Neighbors indices are not matching" with pytest.raises(AssertionError, match=msg): assert_argkmin_results_quasi_equality( np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), @@ -369,7 +408,7 @@ def test_assert_argkmin_results_quasi_equality(): ) # Indices aren't properly sorted w.r.t their distances - msg = "Neighbors indices for query 0 are not matching" + msg = "Neighbors indices are not matching" with pytest.raises(AssertionError, match=msg): assert_argkmin_results_quasi_equality( np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), @@ -439,7 +478,7 @@ def test_assert_radius_neighbors_results_quasi_equality(): ) # Apply invalid permutation on indices - msg = "Neighbors indices for query 0 are not matching" + msg = "Neighbors indices are not matching" with pytest.raises(AssertionError, match=msg): assert_radius_neighbors_results_quasi_equality( np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), @@ -475,7 +514,7 @@ def test_assert_radius_neighbors_results_quasi_equality(): ) # Indices aren't properly sorted w.r.t their distances - msg = "Neighbors indices for query 0 are not matching" + msg = "Neighbors indices are not matching" with pytest.raises(AssertionError, match=msg): assert_radius_neighbors_results_quasi_equality( np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), From a2b9a80b2c287ff53cd75871b65f7dc9a0b73b70 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 11 Sep 2023 17:31:16 +0200 Subject: [PATCH 02/33] Trigger [all random seeds] for test_pairwise_distances_argkmin test_pairwise_distances_argkmin From 37df63414fb05bd0d76aad42c9b6a40057faec42 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 11 Sep 2023 18:04:30 +0200 Subject: [PATCH 03/33] [azure parallel] [all random seeds] test_pairwise_distances_argkmin From 109db17cb5a45f18ad48ff0307fd11ce36abcbf7 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 13 Sep 2023 14:10:46 +0200 Subject: [PATCH 04/33] Simpler way to assert approximate equality of neighbor results --- .../test_pairwise_distances_reduction.py | 208 +++++++++++------- 1 file changed, 129 insertions(+), 79 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 1be218be07687..943e8f27bfc54 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -137,74 +137,99 @@ def quasi_equality_error_message( def assert_argkmin_results_quasi_equality( - ref_dist, - dist, - ref_indices, - indices, - rtol=1e-4, + neighbors_dists_b, + neighbors_dists_a, + neighbors_indices_a, + neighbors_indices_b, + rtol=1e-5, + atol=1e-6, ): - """Assert that argkmin results are valid up to: - - relative tolerance on computed distance values - - permutations of indices for distances values that differ up to - a precision level + """Assert that argkmin results are valid up to rounding errors - To be used for testing neighbors queries on float32 datasets: we - accept neighbors rank swaps only if they are caused by small - rounding errors on the distance computations. + This function asserts that the results of argkmin queries are valid up to: + - rounding error tolerance on distance values; + - permutations of indices for distances values that differ up to the + expected precision level. + + Furthermore, the distances must be sorted. + + To be used for testing neighbors queries on float32 datasets: we accept + neighbors rank swaps only if they are caused by small rounding errors on + the distance computations. """ is_sorted = lambda a: np.all(a[:-1] <= a[1:]) - n_significant_digits = -(int(floor(log10(abs(rtol)))) + 1) - assert ( - ref_dist.shape == dist.shape == ref_indices.shape == indices.shape - ), "Arrays of results have various shapes." + neighbors_dists_a.shape + == neighbors_dists_b.shape + == neighbors_indices_a.shape + == neighbors_indices_b.shape + ), "Arrays of results have incompatible shapes." - n_queries, n_neighbors = ref_dist.shape + n_queries, _ = neighbors_dists_a.shape # Asserting equality results one row at a time for query_idx in range(n_queries): - ref_dist_row = ref_dist[query_idx] - dist_row = dist[query_idx] - - assert is_sorted( - ref_dist_row - ), f"Reference distances aren't sorted on row {query_idx}" - assert is_sorted(dist_row), f"Distances aren't sorted on row {query_idx}" - - assert_allclose(ref_dist_row, dist_row, rtol=rtol) - - ref_indices_row = ref_indices[query_idx] - indices_row = indices[query_idx] - - # Grouping indices by distances using sets on a rounded distances up - # to a given number of decimals of significant digits derived from rtol. - reference_neighbors_groups = defaultdict(set) - effective_neighbors_groups = defaultdict(set) - - for neighbor_rank in range(n_neighbors): - rounded_dist = relative_rounding( - ref_dist_row[neighbor_rank], - n_significant_digits=n_significant_digits, - ) - reference_neighbors_groups[rounded_dist].add(ref_indices_row[neighbor_rank]) - effective_neighbors_groups[rounded_dist].add(indices_row[neighbor_rank]) - - # Asserting equality of groups (sets) for each distance - for group_rank, rounded_dist in enumerate(reference_neighbors_groups.keys()): - assert ( - reference_neighbors_groups[rounded_dist] - == effective_neighbors_groups[rounded_dist] - ), quasi_equality_error_message( - n_significant_digits, - rtol, - query_idx, - ref_dist_row, - dist_row, - rounded_dist, - group_rank, - reference_neighbors_groups[rounded_dist], - effective_neighbors_groups[rounded_dist], + dist_row_a = neighbors_dists_a[query_idx] + dist_row_b = neighbors_dists_b[query_idx] + indices_row_a = neighbors_indices_a[query_idx] + indices_row_b = neighbors_indices_b[query_idx] + + assert is_sorted(dist_row_a), f"Distances aren't sorted on row {query_idx}" + assert is_sorted(dist_row_b), f"Distances aren't sorted on row {query_idx}" + + # The distances should match, irrespective of neighbors index swappping + # because of rounding errors. + try: + assert_allclose(dist_row_a, dist_row_b, rtol=rtol, atol=atol) + except AssertionError as e: + # Wrap exception to provide more context while also including the original + # exception with the computed absolute and relative differences. + raise AssertionError( + f"Query vector with index {query_idx} lead to different neighbors'" + f" distances (with atol={atol} and" + f" rtol={rtol}):\ndist_row_a={dist_row_a}\ndist_row_b={dist_row_b}" + ) from e + + # Compute a mapping from indices to distances for each result set and + # check that the computed neighbors with matching indices are withing + # the expected distance tolerance. + # It's ok if they are missing indices on either side, as long as the + # distances match in the previous assertions. + indices_to_dist_a = {idx: dist for idx, dist in zip(indices_row_a, dist_row_a)} + indices_to_dist_b = {idx: dist for idx, dist in zip(indices_row_b, dist_row_b)} + + common_indices = set(indices_row_a).intersection(set(indices_row_b)) + for idx in common_indices: + dist_a = indices_to_dist_a[idx] + dist_b = indices_to_dist_b[idx] + try: + assert_allclose(dist_a, dist_b, rtol=rtol, atol=atol) + except AssertionError as e: + raise AssertionError( + f"Query vector with index {query_idx} lead to different distances" + f" for common neighbor with index {idx}:" + f" dist_a={dist_a} vs dist_b={dist_b} (with atol={atol} and" + f" rtol={rtol})" + ) from e + + # Check that any neighbor with distances below the rounding error threshold have + # matching indices. + threshold = (1 - rtol) * np.max(dist_row_a) - atol + mask_a = dist_row_a <= threshold + mask_b = dist_row_b <= threshold + missing_from_b = np.setdiff1d(indices_row_a[mask_a], indices_row_b[mask_b]) + missing_from_a = np.setdiff1d(indices_row_b[mask_b], indices_row_a[mask_a]) + if len(missing_from_a) > 0 or len(missing_from_b) > 0: + raise AssertionError( + f"Query vector with index {query_idx} lead to mismatched result indices" + f" (with atol={atol} and rtol={rtol}):\n" + f"neighors in b missing from a: {missing_from_a}\n" + f"neighors in a missing from b: {missing_from_b}\n" + f"dist_row_a={dist_row_a}\n" + f"dist_row_b={dist_row_b}\n" + f"indices_row_a={indices_row_a}\n" + f"indices_row_b={indices_row_b}\n" ) @@ -348,8 +373,9 @@ def assert_radius_neighbors_results_quasi_equality( def test_assert_argkmin_results_quasi_equality(): + atol = 0.0 rtol = 1e-7 - eps = 1e-7 + eps = 1e-7 / 2 _1m = 1.0 - eps _1p = 1.0 + eps @@ -374,47 +400,78 @@ def test_assert_argkmin_results_quasi_equality(): ref_dist, ref_dist, ref_indices, ref_indices, rtol ) - # Apply valid permutation on indices: the last 3 points are - # all very close to one another so we accept any permutation - # on their rankings. + # Apply valid permutation on indices: the last 3 points are all very close + # to one another so we accept any permutation on their rankings. assert_argkmin_results_quasi_equality( np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), np.array([[1.2, 2.5, 6.1, 6.1, 6.1]]), np.array([[1, 2, 3, 4, 5]]), np.array([[1, 2, 4, 5, 3]]), + atol=atol, rtol=rtol, ) + + # The last few indices do not necessarily have to match because of the rounding + # errors on the distances: their could be tied results at the boundary. + assert_argkmin_results_quasi_equality( + np.array([[1.2, 2.5, 3.0, 6.1, _6_1p]]), + np.array([[1.2, 2.5, 3.0, _6_1m, 6.1]]), + np.array([[1, 2, 3, 4, 5]]), + np.array([[1, 2, 3, 6, 7]]), + atol=atol, + rtol=rtol, + ) + # All points are have close distances so any ranking permutation # is valid for this query result. assert_argkmin_results_quasi_equality( - np.array([[_1m, _1m, 1, _1p, _1p]]), - np.array([[_1m, _1m, 1, _1p, _1p]]), - np.array([[6, 7, 8, 9, 10]]), + np.array([[1, 1, _1p, _1p, _1p]]), + np.array([[1, 1, 1, 1, _1p]]), + np.array([[7, 6, 8, 10, 9]]), np.array([[6, 9, 7, 8, 10]]), + atol=atol, rtol=rtol, ) - # Apply invalid permutation on indices: permuting the ranks - # of the 2 nearest neighbors is invalid because the distance - # values are too different. - msg = "Neighbors indices are not matching" + # They could also be nearly truncation of very large nearly tied result + # sets hence all indices can also be distinct in this case: + assert_argkmin_results_quasi_equality( + np.array([[1, 1, _1p, _1p, _1p]]), + np.array([[1, 1, 1, 1, _1p]]), + np.array([[34, 30, 8, 12, 24]]), + np.array([[42, 1, 21, 13, 3]]), + atol=atol, + rtol=rtol, + ) + + # Apply invalid permutation on indices: permuting the ranks of the 2 + # nearest neighbors is invalid because the distance values are too + # different. + msg = re.escape( + "Query vector with index 0 lead to different distances for common neighbor with" + " index 1: dist_a=1.2 vs dist_b=2.5 (with atol=0.0 and rtol=1e-07)" + ) with pytest.raises(AssertionError, match=msg): assert_argkmin_results_quasi_equality( np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), np.array([[1, 2, 3, 4, 5]]), np.array([[2, 1, 3, 4, 5]]), + atol=atol, rtol=rtol, ) # Indices aren't properly sorted w.r.t their distances - msg = "Neighbors indices are not matching" + msg = re.escape( + "neighors in b missing from a: [12]\nneighors in a missing from b: [1]" + ) with pytest.raises(AssertionError, match=msg): assert_argkmin_results_quasi_equality( np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), np.array([[1, 2, 3, 4, 5]]), - np.array([[2, 1, 4, 5, 3]]), + np.array([[12, 2, 4, 11, 3]]), + atol=atol, rtol=rtol, ) @@ -426,6 +483,7 @@ def test_assert_argkmin_results_quasi_equality(): np.array([[2.5, 1.2, _6_1m, 6.1, _6_1p]]), np.array([[1, 2, 3, 4, 5]]), np.array([[2, 1, 4, 5, 3]]), + atol=atol, rtol=rtol, ) @@ -1159,14 +1217,6 @@ def test_pairwise_distances_argkmin( n_samples=100, k=10, ): - # TODO: can we easily fix this discrepancy? - edge_cases = [ - (np.float32, "chebyshev", 1000000.0), - (np.float32, "cityblock", 1000000.0), - ] - if (dtype, metric, translation) in edge_cases: - pytest.xfail("Numerical differences lead to small differences in results.") - rng = np.random.RandomState(global_random_seed) spread = 1000 X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread From baa792dbf699cb6dc6705f3c3179d714622ff7c3 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 13 Sep 2023 17:50:28 +0200 Subject: [PATCH 05/33] Focus this PR on argkmin tests for now --- .../test_pairwise_distances_reduction.py | 56 ++++--------------- 1 file changed, 11 insertions(+), 45 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 943e8f27bfc54..925598b60c582 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -107,35 +107,6 @@ def test_relative_rounding(): assert relative_rounding(123.456789, 10) == 123.456789 -def quasi_equality_error_message( - n_significant_digits, - rtol, - query_idx, - ref_dist_row, - dist_row, - rounded_dist, - group_rank, - reference_neighbors_group, - effective_neighbors_group, -): - # Turning formatting off to avoid black reformatting the message. - # fmt: off - error_message = ( - "Neighbors indices are not matching when rounding distances at\n" - f"{n_significant_digits} significant digits derived from rtol={rtol:.1e}.\n\n" - f"Query vector index : {query_idx}\n" - f"Reference ordered distances: {ref_dist_row}\n" - f"Computed ordered distances : {dist_row}\n" - f"Norm. abs. ord. dist. diff.: {np.mean(np.abs(dist_row - ref_dist_row))}\n" - f"Rounded distance : {rounded_dist}\n" - f"Neighbors group rank : {group_rank}\n" - f"Reference neighbors indices: {list(sorted(reference_neighbors_group))}\n" - f"Computed neighbors indices : {list(sorted(effective_neighbors_group))}" - ) - # fmt: on - return error_message - - def assert_argkmin_results_quasi_equality( neighbors_dists_b, neighbors_dists_a, @@ -331,21 +302,16 @@ def assert_radius_neighbors_results_quasi_equality( effective_neighbors_groups[rounded_dist].add(indices_row[neighbor_rank]) # Asserting equality of groups (sets) for each distance - for group_rank, rounded_dist in enumerate(reference_neighbors_groups.keys()): + msg = ( + f"Neighbors indices for query {query_idx} are not matching " + f"when rounding distances at {n_significant_digits} significant digits " + f"derived from rtol={rtol:.1e}" + ) + for rounded_distance in reference_neighbors_groups.keys(): assert ( - reference_neighbors_groups[rounded_dist] - == effective_neighbors_groups[rounded_dist] - ), quasi_equality_error_message( - n_significant_digits, - rtol, - query_idx, - ref_dist_row, - dist_row, - rounded_dist, - group_rank, - reference_neighbors_groups[rounded_dist], - effective_neighbors_groups[rounded_dist], - ) + reference_neighbors_groups[rounded_distance] + == effective_neighbors_groups[rounded_distance] + ), msg ASSERT_RESULT = { @@ -536,7 +502,7 @@ def test_assert_radius_neighbors_results_quasi_equality(): ) # Apply invalid permutation on indices - msg = "Neighbors indices are not matching" + msg = "Neighbors indices for query 0 are not matching" with pytest.raises(AssertionError, match=msg): assert_radius_neighbors_results_quasi_equality( np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), @@ -572,7 +538,7 @@ def test_assert_radius_neighbors_results_quasi_equality(): ) # Indices aren't properly sorted w.r.t their distances - msg = "Neighbors indices are not matching" + msg = "Neighbors indices for query 0 are not matching" with pytest.raises(AssertionError, match=msg): assert_radius_neighbors_results_quasi_equality( np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), From 755f641bdc0bb9d78f0c98e4f996d4487a659ef4 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 13 Sep 2023 18:01:42 +0200 Subject: [PATCH 06/33] Fix edge cases by making check for missing indices symmetric --- .../tests/test_pairwise_distances_reduction.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 925598b60c582..8aea622cf852e 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -186,11 +186,14 @@ def assert_argkmin_results_quasi_equality( # Check that any neighbor with distances below the rounding error threshold have # matching indices. - threshold = (1 - rtol) * np.max(dist_row_a) - atol - mask_a = dist_row_a <= threshold - mask_b = dist_row_b <= threshold - missing_from_b = np.setdiff1d(indices_row_a[mask_a], indices_row_b[mask_b]) - missing_from_a = np.setdiff1d(indices_row_b[mask_b], indices_row_a[mask_a]) + threshold = np.minimum( + (1 - rtol) * np.max(dist_row_a) - atol, + (1 - rtol) * np.max(dist_row_b) - atol, + ) + mask_a = dist_row_a < threshold + mask_b = dist_row_b < threshold + missing_from_b = np.setdiff1d(indices_row_a[mask_a], indices_row_b) + missing_from_a = np.setdiff1d(indices_row_b[mask_b], indices_row_a) if len(missing_from_a) > 0 or len(missing_from_b) > 0: raise AssertionError( f"Query vector with index {query_idx} lead to mismatched result indices" From 6a5fa0f330283c2820747f9f05909eced6f71242 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 13 Sep 2023 20:16:41 +0200 Subject: [PATCH 07/33] [azure parallel] [all random seeds] test_pairwise_distances_argkmin From 9a975b33f2d2db24b87d4574535aea934ac3338e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 14 Sep 2023 08:18:51 +0200 Subject: [PATCH 08/33] Speed-up test_pairwise_distances_* by dividing the number of queries by 10 --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 8aea622cf852e..b59161c3e504f 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -1183,12 +1183,13 @@ def test_pairwise_distances_argkmin( metric, strategy, dtype, + n_queries=10, n_samples=100, k=10, ): rng = np.random.RandomState(global_random_seed) spread = 1000 - X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread + X = translation + rng.rand(n_queries, n_features).astype(dtype) * spread Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread X_csr = csr_matrix(X) @@ -1249,12 +1250,13 @@ def test_pairwise_distances_radius_neighbors( metric, strategy, dtype, + n_queries=10, n_samples=100, ): rng = np.random.RandomState(global_random_seed) spread = 1000 radius = spread * np.log(n_features) - X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread + X = translation + rng.rand(n_queries, n_features).astype(dtype) * spread Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread metric_kwargs = _get_metric_params_list( From 04f7bddc1262002ffcfb1e5630c7bfed9842171a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 14 Sep 2023 08:19:49 +0200 Subject: [PATCH 09/33] [azure parallel] [all random seeds] test_pairwise_distances From 9ec12633d45e9292b9c6784bedfd4ca0c3646371 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 14 Sep 2023 10:59:03 +0200 Subject: [PATCH 10/33] Further speed-up pairwise distance tests by leveraging global random seed --- .../tests/test_pairwise_distances_reduction.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index ee6e23d689e43..08826f6183ced 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -1174,16 +1174,12 @@ def test_strategies_consistency( # "Concrete Dispatchers"-specific tests -@pytest.mark.parametrize("n_features", [50, 500]) -@pytest.mark.parametrize("translation", [0, 1e6]) @pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS) @pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y")) @pytest.mark.parametrize("dtype", [np.float64, np.float32]) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_pairwise_distances_argkmin( global_random_seed, - n_features, - translation, metric, strategy, dtype, @@ -1193,6 +1189,8 @@ def test_pairwise_distances_argkmin( k=10, ): rng = np.random.RandomState(global_random_seed) + n_features = rng.choice([50, 500]) + translation = rng.choice([0, 1e6]) spread = 1000 X = translation + rng.rand(n_queries, n_features).astype(dtype) * spread Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread @@ -1243,15 +1241,11 @@ def test_pairwise_distances_argkmin( ) -@pytest.mark.parametrize("n_features", [50, 500]) -@pytest.mark.parametrize("translation", [0, 1e6]) @pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS) @pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y")) @pytest.mark.parametrize("dtype", [np.float64, np.float32]) def test_pairwise_distances_radius_neighbors( global_random_seed, - n_features, - translation, metric, strategy, dtype, @@ -1259,6 +1253,8 @@ def test_pairwise_distances_radius_neighbors( n_samples=100, ): rng = np.random.RandomState(global_random_seed) + n_features = rng.choice([50, 500]) + translation = rng.choice([0, 1e6]) spread = 1000 radius = spread * np.log(n_features) X = translation + rng.rand(n_queries, n_features).astype(dtype) * spread From 21ed72618016105cbc3150b7f482f5e0f3abb02c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 14 Sep 2023 11:49:59 +0200 Subject: [PATCH 11/33] Simplify assert_argkmin_results_quasi_equality to always use informative message with even stronger tests and a fix --- .../test_pairwise_distances_reduction.py | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 08826f6183ced..3ae395f944d4a 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -108,8 +108,8 @@ def test_relative_rounding(): def assert_argkmin_results_quasi_equality( - neighbors_dists_b, neighbors_dists_a, + neighbors_dists_b, neighbors_indices_a, neighbors_indices_b, rtol=1e-5, @@ -149,19 +149,6 @@ def assert_argkmin_results_quasi_equality( assert is_sorted(dist_row_a), f"Distances aren't sorted on row {query_idx}" assert is_sorted(dist_row_b), f"Distances aren't sorted on row {query_idx}" - # The distances should match, irrespective of neighbors index swappping - # because of rounding errors. - try: - assert_allclose(dist_row_a, dist_row_b, rtol=rtol, atol=atol) - except AssertionError as e: - # Wrap exception to provide more context while also including the original - # exception with the computed absolute and relative differences. - raise AssertionError( - f"Query vector with index {query_idx} lead to different neighbors'" - f" distances (with atol={atol} and" - f" rtol={rtol}):\ndist_row_a={dist_row_a}\ndist_row_b={dist_row_b}" - ) from e - # Compute a mapping from indices to distances for each result set and # check that the computed neighbors with matching indices are withing # the expected distance tolerance. @@ -177,6 +164,9 @@ def assert_argkmin_results_quasi_equality( try: assert_allclose(dist_a, dist_b, rtol=rtol, atol=atol) except AssertionError as e: + # Wrap exception to provide more context while also including + # the original exception with the computed absolute and + # relative differences. raise AssertionError( f"Query vector with index {query_idx} lead to different distances" f" for common neighbor with index {idx}:" @@ -186,10 +176,9 @@ def assert_argkmin_results_quasi_equality( # Check that any neighbor with distances below the rounding error threshold have # matching indices. - threshold = np.minimum( - (1 - rtol) * np.max(dist_row_a) - atol, - (1 - rtol) * np.max(dist_row_b) - atol, - ) + threshold = (1 - rtol) * np.maximum( + np.max(dist_row_a), np.max(dist_row_b) + ) - atol mask_a = dist_row_a < threshold mask_b = dist_row_b < threshold missing_from_b = np.setdiff1d(indices_row_a[mask_a], indices_row_b) @@ -444,6 +433,20 @@ def test_assert_argkmin_results_quasi_equality(): rtol=rtol, ) + # Indices aren't properly sorted w.r.t their distances + msg = re.escape( + "neighors in b missing from a: []\nneighors in a missing from b: [3]" + ) + with pytest.raises(AssertionError, match=msg): + assert_argkmin_results_quasi_equality( + np.array([[_1m, 1.0, _6_1m, 6.1, _6_1p]]), + np.array([[1.0, 1.0, _6_1m, 6.1, 7]]), + np.array([[1, 2, 3, 4, 5]]), + np.array([[2, 1, 4, 5, 12]]), + atol=atol, + rtol=rtol, + ) + # Distances aren't properly sorted msg = "Distances aren't sorted on row 0" with pytest.raises(AssertionError, match=msg): From c66770e678b18159f0aa4efbdeb0eed78fe30bdf Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 14 Sep 2023 11:57:55 +0200 Subject: [PATCH 12/33] Fix inline comments in test + one more test case to check symmetry of missing neighbor detection --- .../test_pairwise_distances_reduction.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 3ae395f944d4a..83a65a6822cbf 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -419,7 +419,8 @@ def test_assert_argkmin_results_quasi_equality(): rtol=rtol, ) - # Indices aren't properly sorted w.r.t their distances + # Detect missing indices within the expected precision level, even when the + # distances match exactly. msg = re.escape( "neighors in b missing from a: [12]\nneighors in a missing from b: [1]" ) @@ -433,7 +434,7 @@ def test_assert_argkmin_results_quasi_equality(): rtol=rtol, ) - # Indices aren't properly sorted w.r.t their distances + # Detect missing indices outside the expected precision level. msg = re.escape( "neighors in b missing from a: []\nneighors in a missing from b: [3]" ) @@ -447,6 +448,21 @@ def test_assert_argkmin_results_quasi_equality(): rtol=rtol, ) + # Detect missing indices outside the expected precision level, in the other + # direction: + msg = re.escape( + "neighors in b missing from a: [5]\nneighors in a missing from b: []" + ) + with pytest.raises(AssertionError, match=msg): + assert_argkmin_results_quasi_equality( + np.array([[_1m, 1.0, _6_1m, 6.1, 7]]), + np.array([[1.0, 1.0, _6_1m, 6.1, _6_1p]]), + np.array([[1, 2, 3, 4, 12]]), + np.array([[2, 1, 5, 3, 4]]), + atol=atol, + rtol=rtol, + ) + # Distances aren't properly sorted msg = "Distances aren't sorted on row 0" with pytest.raises(AssertionError, match=msg): From 2663108c47588f32a1d00621c7bc39a8475d2f32 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 14 Sep 2023 14:30:48 +0200 Subject: [PATCH 13/33] [azure parallel] [all random seeds] test_pairwise_distances From 4a498e910a1f5fd2a4d2ba61678d44cef3c0300b Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 14 Sep 2023 15:43:14 +0200 Subject: [PATCH 14/33] Refactor radius neighbors checks, remove unused code and accelerate parametrization while covering non dimensions that are non multiple of one another --- .../test_pairwise_distances_reduction.py | 357 +++++++++--------- 1 file changed, 188 insertions(+), 169 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 83a65a6822cbf..2fffb3e2cc8ef 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -1,8 +1,6 @@ import itertools import re import warnings -from collections import defaultdict -from math import floor, log10 import numpy as np import pytest @@ -79,32 +77,66 @@ def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices, rtol=1 ) -def relative_rounding(scalar, n_significant_digits): - """Round a scalar to a number of significant digits relatively to its value.""" - if scalar == 0: - return 0.0 - magnitude = int(floor(log10(abs(scalar)))) + 1 - return round(scalar, n_significant_digits - magnitude) - - -def test_relative_rounding(): - assert relative_rounding(0, 1) == 0.0 - assert relative_rounding(0, 10) == 0.0 - assert relative_rounding(0, 123456) == 0.0 - - assert relative_rounding(123456789, 0) == 0 - assert relative_rounding(123456789, 2) == 120000000 - assert relative_rounding(123456789, 3) == 123000000 - assert relative_rounding(123456789, 10) == 123456789 - assert relative_rounding(123456789, 20) == 123456789 - - assert relative_rounding(1.23456789, 2) == 1.2 - assert relative_rounding(1.23456789, 3) == 1.23 - assert relative_rounding(1.23456789, 10) == 1.23456789 +def assert_same_distances_for_common_neighbors( + query_idx, + dist_row_a, + dist_row_b, + indices_row_a, + indices_row_b, + rtol, + atol, +): + """Check that the distances of common neighbors are equal up to tolerance. - assert relative_rounding(123.456789, 3) == 123.0 - assert relative_rounding(123.456789, 9) == 123.456789 - assert relative_rounding(123.456789, 10) == 123.456789 + This does not check if there are missing neighbors in either result set. + Missingness is handled by assert_no_missing_neighbors. + """ + # Compute a mapping from indices to distances for each result set and + # check that the computed neighbors with matching indices are withing + # the expected distance tolerance. + indices_to_dist_a = {idx: dist for idx, dist in zip(indices_row_a, dist_row_a)} + indices_to_dist_b = {idx: dist for idx, dist in zip(indices_row_b, dist_row_b)} + + common_indices = set(indices_row_a).intersection(set(indices_row_b)) + for idx in common_indices: + dist_a = indices_to_dist_a[idx] + dist_b = indices_to_dist_b[idx] + try: + assert_allclose(dist_a, dist_b, rtol=rtol, atol=atol) + except AssertionError as e: + # Wrap exception to provide more context while also including + # the original exception with the computed absolute and + # relative differences. + raise AssertionError( + f"Query vector with index {query_idx} lead to different distances" + f" for common neighbor with index {idx}:" + f" dist_a={dist_a} vs dist_b={dist_b} (with atol={atol} and" + f" rtol={rtol})" + ) from e + + +def assert_no_missing_neighbors( + query_idx, + dist_row_a, + dist_row_b, + indices_row_a, + indices_row_b, + threshold, +): + mask_a = dist_row_a < threshold + mask_b = dist_row_b < threshold + missing_from_b = np.setdiff1d(indices_row_a[mask_a], indices_row_b) + missing_from_a = np.setdiff1d(indices_row_b[mask_b], indices_row_a) + if len(missing_from_a) > 0 or len(missing_from_b) > 0: + raise AssertionError( + f"Query vector with index {query_idx} lead to mismatched result indices:\n" + f"neighors in b missing from a: {missing_from_a}\n" + f"neighors in a missing from b: {missing_from_b}\n" + f"dist_row_a={dist_row_a}\n" + f"dist_row_b={dist_row_b}\n" + f"indices_row_a={indices_row_a}\n" + f"indices_row_b={indices_row_b}\n" + ) def assert_argkmin_results_quasi_equality( @@ -149,82 +181,66 @@ def assert_argkmin_results_quasi_equality( assert is_sorted(dist_row_a), f"Distances aren't sorted on row {query_idx}" assert is_sorted(dist_row_b), f"Distances aren't sorted on row {query_idx}" - # Compute a mapping from indices to distances for each result set and - # check that the computed neighbors with matching indices are withing - # the expected distance tolerance. - # It's ok if they are missing indices on either side, as long as the - # distances match in the previous assertions. - indices_to_dist_a = {idx: dist for idx, dist in zip(indices_row_a, dist_row_a)} - indices_to_dist_b = {idx: dist for idx, dist in zip(indices_row_b, dist_row_b)} - - common_indices = set(indices_row_a).intersection(set(indices_row_b)) - for idx in common_indices: - dist_a = indices_to_dist_a[idx] - dist_b = indices_to_dist_b[idx] - try: - assert_allclose(dist_a, dist_b, rtol=rtol, atol=atol) - except AssertionError as e: - # Wrap exception to provide more context while also including - # the original exception with the computed absolute and - # relative differences. - raise AssertionError( - f"Query vector with index {query_idx} lead to different distances" - f" for common neighbor with index {idx}:" - f" dist_a={dist_a} vs dist_b={dist_b} (with atol={atol} and" - f" rtol={rtol})" - ) from e + assert_same_distances_for_common_neighbors( + query_idx, + dist_row_a, + dist_row_b, + indices_row_a, + indices_row_b, + rtol, + atol, + ) # Check that any neighbor with distances below the rounding error threshold have # matching indices. threshold = (1 - rtol) * np.maximum( np.max(dist_row_a), np.max(dist_row_b) ) - atol - mask_a = dist_row_a < threshold - mask_b = dist_row_b < threshold - missing_from_b = np.setdiff1d(indices_row_a[mask_a], indices_row_b) - missing_from_a = np.setdiff1d(indices_row_b[mask_b], indices_row_a) - if len(missing_from_a) > 0 or len(missing_from_b) > 0: - raise AssertionError( - f"Query vector with index {query_idx} lead to mismatched result indices" - f" (with atol={atol} and rtol={rtol}):\n" - f"neighors in b missing from a: {missing_from_a}\n" - f"neighors in a missing from b: {missing_from_b}\n" - f"dist_row_a={dist_row_a}\n" - f"dist_row_b={dist_row_b}\n" - f"indices_row_a={indices_row_a}\n" - f"indices_row_b={indices_row_b}\n" - ) + assert_no_missing_neighbors( + query_idx, + dist_row_a, + dist_row_b, + indices_row_a, + indices_row_b, + threshold, + ) def assert_radius_neighbors_results_equality( - ref_dist, dist, ref_indices, indices, radius + neighbors_dists_a, + neighbors_dists_b, + neighbors_indices_a, + neighbors_indices_b, + radius, ): # We get arrays of arrays and we need to check for individual pairs - for i in range(ref_dist.shape[0]): - assert (ref_dist[i] <= radius).all() + for i in range(neighbors_dists_a.shape[0]): + assert (neighbors_dists_a[i] <= radius).all() assert_array_equal( - ref_indices[i], - indices[i], + neighbors_indices_a[i], + neighbors_indices_b[i], err_msg=f"Query vector #{i} has different neighbors' indices", ) assert_allclose( - ref_dist[i], - dist[i], + neighbors_dists_a[i], + neighbors_dists_b[i], err_msg=f"Query vector #{i} has different neighbors' distances", rtol=1e-7, ) def assert_radius_neighbors_results_quasi_equality( - ref_dist, - dist, - ref_indices, - indices, + neighbors_dists_a, + neighbors_dists_b, + neighbors_indices_a, + neighbors_indices_b, radius, - rtol=1e-4, + rtol=1e-5, + atol=1e-6, ): """Assert that radius neighborhood results are valid up to: - - relative tolerance on computed distance values + + - relative and absolute tolerance on computed distance values - permutations of indices for distances values that differ up to a precision level - missing or extra last elements if their distance is @@ -238,72 +254,59 @@ def assert_radius_neighbors_results_quasi_equality( """ is_sorted = lambda a: np.all(a[:-1] <= a[1:]) - n_significant_digits = -(int(floor(log10(abs(rtol)))) + 1) - assert ( - len(ref_dist) == len(dist) == len(ref_indices) == len(indices) + len(neighbors_dists_a) + == len(neighbors_dists_b) + == len(neighbors_indices_a) + == len(neighbors_indices_b) ), "Arrays of results have various lengths." - n_queries = len(ref_dist) + n_queries = len(neighbors_dists_a) # Asserting equality of results one vector at a time for query_idx in range(n_queries): - ref_dist_row = ref_dist[query_idx] - dist_row = dist[query_idx] - - assert is_sorted( - ref_dist_row - ), f"Reference distances aren't sorted on row {query_idx}" - assert is_sorted(dist_row), f"Distances aren't sorted on row {query_idx}" - - # Vectors' lengths might be different due to small - # numerical differences of distance w.r.t the `radius` threshold. - largest_row = ref_dist_row if len(ref_dist_row) > len(dist_row) else dist_row - - # For the longest distances vector, we check that last extra elements - # that aren't present in the other vector are all in: [radius ± rtol] - min_length = min(len(ref_dist_row), len(dist_row)) - last_extra_elements = largest_row[min_length:] - if last_extra_elements.size > 0: - assert np.all(radius - rtol <= last_extra_elements <= radius + rtol), ( - f"The last extra elements ({last_extra_elements}) aren't in [radius ±" - f" rtol]=[{radius} ± {rtol}]" - ) + dist_row_a = neighbors_dists_a[query_idx] + dist_row_b = neighbors_dists_b[query_idx] + indices_row_a = neighbors_indices_a[query_idx] + indices_row_b = neighbors_indices_b[query_idx] - # We truncate the neighbors results list on the smallest length to - # be able to compare them, ignoring the elements checked above. - ref_dist_row = ref_dist_row[:min_length] - dist_row = dist_row[:min_length] + assert is_sorted(dist_row_a), f"Distances aren't sorted on row {query_idx}" + assert is_sorted(dist_row_b), f"Distances aren't sorted on row {query_idx}" - assert_allclose(ref_dist_row, dist_row, rtol=rtol) + assert len(dist_row_a) == len(indices_row_a) + assert len(dist_row_b) == len(indices_row_b) - ref_indices_row = ref_indices[query_idx] - indices_row = indices[query_idx] + # Check that all distances are within the requested radius + if len(dist_row_a) > 0: + assert dist_row_a[-1] <= radius, ( + f"Largest returned distance {dist_row_a[-1]} not within requested" + f" radius {radius} on row {query_idx}" + ) + if len(dist_row_b) > 0: + assert dist_row_b[-1] <= radius, ( + f"Largest returned distance {dist_row_b[-1]} not within requested" + f" radius {radius} on row {query_idx}" + ) - # Grouping indices by distances using sets on a rounded distances up - # to a given number of significant digits derived from rtol. - reference_neighbors_groups = defaultdict(set) - effective_neighbors_groups = defaultdict(set) + assert_same_distances_for_common_neighbors( + query_idx, + dist_row_a, + dist_row_b, + indices_row_a, + indices_row_b, + rtol, + atol, + ) - for neighbor_rank in range(min_length): - rounded_dist = relative_rounding( - ref_dist_row[neighbor_rank], - n_significant_digits=n_significant_digits, - ) - reference_neighbors_groups[rounded_dist].add(ref_indices_row[neighbor_rank]) - effective_neighbors_groups[rounded_dist].add(indices_row[neighbor_rank]) - - # Asserting equality of groups (sets) for each distance - msg = ( - f"Neighbors indices for query {query_idx} are not matching " - f"when rounding distances at {n_significant_digits} significant digits " - f"derived from rtol={rtol:.1e}" + threshold = (1 - rtol) * radius - atol + assert_no_missing_neighbors( + query_idx, + dist_row_a, + dist_row_b, + indices_row_a, + indices_row_b, + threshold, ) - for rounded_distance in reference_neighbors_groups.keys(): - assert ( - reference_neighbors_groups[rounded_distance] - == effective_neighbors_groups[rounded_distance] - ), msg ASSERT_RESULT = { @@ -478,7 +481,7 @@ def test_assert_argkmin_results_quasi_equality(): def test_assert_radius_neighbors_results_quasi_equality(): rtol = 1e-7 - eps = 1e-7 + eps = 1e-7 / 2 _1m = 1.0 - eps _1p = 1.0 + eps @@ -501,7 +504,7 @@ def test_assert_radius_neighbors_results_quasi_equality(): ref_dist, ref_indices, ref_indices, - radius=6.1, + radius=7.0, rtol=rtol, ) @@ -511,7 +514,7 @@ def test_assert_radius_neighbors_results_quasi_equality(): np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([1, 2, 3, 4, 5])]), np.array([np.array([1, 2, 4, 5, 3])]), - radius=6.1, + radius=7.0, rtol=rtol, ) assert_radius_neighbors_results_quasi_equality( @@ -519,35 +522,41 @@ def test_assert_radius_neighbors_results_quasi_equality(): np.array([np.array([_1m, _1m, 1, _1p, _1p])]), np.array([np.array([6, 7, 8, 9, 10])]), np.array([np.array([6, 9, 7, 8, 10])]), - radius=6.1, + radius=7.0, rtol=rtol, ) # Apply invalid permutation on indices - msg = "Neighbors indices for query 0 are not matching" + msg = re.escape( + "Query vector with index 0 lead to different distances for common neighbor with" + " index 1: dist_a=1.2 vs dist_b=2.5 (with atol=1e-06 and rtol=1e-07)" + ) with pytest.raises(AssertionError, match=msg): assert_radius_neighbors_results_quasi_equality( np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([1, 2, 3, 4, 5])]), np.array([np.array([2, 1, 3, 4, 5])]), - radius=6.1, + radius=7.0, rtol=rtol, ) - # Having extra last elements is valid if they are in: [radius ± rtol] + # Having extra last or missing elements is valid if they are in the + # tolerated rounding error range: [(1 - rtol) * radius - atol, radius] assert_radius_neighbors_results_quasi_equality( - np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p, _6_1p])]), np.array([np.array([1.2, 2.5, _6_1m, 6.1])]), - np.array([np.array([1, 2, 3, 4, 5])]), - np.array([np.array([1, 2, 3, 4])]), - radius=6.1, + np.array([np.array([1, 2, 3, 4, 5, 7])]), + np.array([np.array([1, 2, 3, 6])]), + radius=_6_1p, rtol=rtol, ) - # Having extra last elements is invalid if they are lesser than radius - rtol + # Any discrepancy outside the tolerated rounding error range is invalid and + # indicates a missing neighbor in one of the result sets. msg = re.escape( - "The last extra elements ([6.]) aren't in [radius ± rtol]=[6.1 ± 1e-07]" + "Query vector with index 0 lead to mismatched result indices:\nneighors in b" + " missing from a: []\nneighors in a missing from b: [3]" ) with pytest.raises(AssertionError, match=msg): assert_radius_neighbors_results_quasi_equality( @@ -558,12 +567,37 @@ def test_assert_radius_neighbors_results_quasi_equality(): radius=6.1, rtol=rtol, ) + msg = re.escape( + "Query vector with index 0 lead to mismatched result indices:\nneighors in b" + " missing from a: [4]\nneighors in a missing from b: [2]" + ) + with pytest.raises(AssertionError, match=msg): + assert_radius_neighbors_results_quasi_equality( + np.array([np.array([1.2, 2.1, 2.5])]), + np.array([np.array([1.2, 2, 2.5])]), + np.array([np.array([1, 2, 3])]), + np.array([np.array([1, 4, 3])]), + radius=6.1, + rtol=rtol, + ) - # Indices aren't properly sorted w.r.t their distances - msg = "Neighbors indices for query 0 are not matching" + # Radius upper bound is strictly checked + msg = re.escape( + "Largest returned distance 6.100000049999999 not within requested radius 6.1 on" + " row 0" + ) with pytest.raises(AssertionError, match=msg): assert_radius_neighbors_results_quasi_equality( np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1.2, 2.5, _6_1m, 6.1, 6.1])]), + np.array([np.array([1, 2, 3, 4, 5])]), + np.array([np.array([2, 1, 4, 5, 3])]), + radius=6.1, + rtol=rtol, + ) + with pytest.raises(AssertionError, match=msg): + assert_radius_neighbors_results_quasi_equality( + np.array([np.array([1.2, 2.5, _6_1m, 6.1, 6.1])]), np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([1, 2, 3, 4, 5])]), np.array([np.array([2, 1, 4, 5, 3])]), @@ -937,22 +971,18 @@ def test_radius_neighbors_factory_method_wrong_usages(): ) -@pytest.mark.parametrize( - "n_samples_X, n_samples_Y", [(100, 100), (500, 100), (100, 500)] -) @pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors]) @pytest.mark.parametrize("dtype", [np.float64, np.float32]) def test_chunk_size_agnosticism( global_random_seed, Dispatcher, - n_samples_X, - n_samples_Y, dtype, n_features=100, ): """Check that results do not depend on the chunk size.""" rng = np.random.RandomState(global_random_seed) spread = 100 + n_samples_X, n_samples_Y = rng.choice([97, 100, 101, 500], size=2, replace=False) X = rng.rand(n_samples_X, n_features).astype(dtype) * spread Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread @@ -992,21 +1022,17 @@ def test_chunk_size_agnosticism( ) -@pytest.mark.parametrize( - "n_samples_X, n_samples_Y", [(100, 100), (500, 100), (100, 500)] -) @pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors]) @pytest.mark.parametrize("dtype", [np.float64, np.float32]) def test_n_threads_agnosticism( global_random_seed, Dispatcher, - n_samples_X, - n_samples_Y, dtype, n_features=100, ): """Check that results do not depend on the number of threads.""" rng = np.random.RandomState(global_random_seed) + n_samples_X, n_samples_Y = rng.choice([97, 100, 101, 500], size=2, replace=False) spread = 100 X = rng.rand(n_samples_X, n_features).astype(dtype) * spread Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread @@ -1113,9 +1139,6 @@ def test_format_agnosticism( ) -@pytest.mark.parametrize( - "n_samples_X, n_samples_Y", [(100, 100), (100, 500), (500, 100)] -) @pytest.mark.parametrize( "metric", ["euclidean", "minkowski", "manhattan", "infinity", "seuclidean", "haversine"], @@ -1126,13 +1149,12 @@ def test_strategies_consistency( global_random_seed, Dispatcher, metric, - n_samples_X, - n_samples_Y, dtype, n_features=10, ): """Check that the results do not depend on the strategy used.""" rng = np.random.RandomState(global_random_seed) + n_samples_X, n_samples_Y = rng.choice([97, 100, 101, 500], size=2, replace=False) spread = 100 X = rng.rand(n_samples_X, n_features).astype(dtype) * spread Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread @@ -1374,21 +1396,18 @@ def test_memmap_backed_data( ) -@pytest.mark.parametrize("n_samples", [100, 1000]) -@pytest.mark.parametrize("n_features", [5, 10, 100]) -@pytest.mark.parametrize("num_threads", [1, 2, 8]) @pytest.mark.parametrize("dtype", [np.float64, np.float32]) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_sqeuclidean_row_norms( global_random_seed, - n_samples, - n_features, - num_threads, dtype, csr_container, ): rng = np.random.RandomState(global_random_seed) spread = 100 + n_samples = rng.choice([97, 100, 101, 1000]) + n_features = rng.choice([5, 10, 100]) + num_threads = rng.choice([1, 2, 8]) X = rng.rand(n_samples, n_features).astype(dtype) * spread X_csr = csr_container(X) From 36aead5c61bf8dff4244c88fd7d93824cdc79e34 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 14 Sep 2023 16:49:50 +0200 Subject: [PATCH 15/33] [azure parallel] [all random seeds] test_pairwise_distances From 7887faa9f695ca358fc3de4580d6b23ec9b81fbd Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 14 Sep 2023 17:31:14 +0200 Subject: [PATCH 16/33] More factorization, easier to understand handling of tols in the tests --- .../test_pairwise_distances_reduction.py | 175 +++++++----------- 1 file changed, 67 insertions(+), 108 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 2fffb3e2cc8ef..6decc47fdd6f5 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -1,6 +1,7 @@ import itertools import re import warnings +from functools import partial import numpy as np import pytest @@ -63,20 +64,6 @@ def _get_metric_params_list(metric: str, n_features: int, seed: int = 1): return [{}] -def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices, rtol=1e-7): - assert_array_equal( - ref_indices, - indices, - err_msg="Query vectors have different neighbors' indices", - ) - assert_allclose( - ref_dist, - dist, - err_msg="Query vectors have different neighbors' distances", - rtol=rtol, - ) - - def assert_same_distances_for_common_neighbors( query_idx, dist_row_a, @@ -139,7 +126,7 @@ def assert_no_missing_neighbors( ) -def assert_argkmin_results_quasi_equality( +def assert_compatible_argkmin_results( neighbors_dists_a, neighbors_dists_b, neighbors_indices_a, @@ -206,30 +193,7 @@ def assert_argkmin_results_quasi_equality( ) -def assert_radius_neighbors_results_equality( - neighbors_dists_a, - neighbors_dists_b, - neighbors_indices_a, - neighbors_indices_b, - radius, -): - # We get arrays of arrays and we need to check for individual pairs - for i in range(neighbors_dists_a.shape[0]): - assert (neighbors_dists_a[i] <= radius).all() - assert_array_equal( - neighbors_indices_a[i], - neighbors_indices_b[i], - err_msg=f"Query vector #{i} has different neighbors' indices", - ) - assert_allclose( - neighbors_dists_a[i], - neighbors_dists_b[i], - err_msg=f"Query vector #{i} has different neighbors' distances", - rtol=1e-7, - ) - - -def assert_radius_neighbors_results_quasi_equality( +def assert_compatible_radius_results( neighbors_dists_a, neighbors_dists_b, neighbors_indices_a, @@ -309,34 +273,34 @@ def assert_radius_neighbors_results_quasi_equality( ) +FLOAT32_TOLS = { + "atol": 1e-7, + "rtol": 1e-5, +} +FLOAT64_TOLS = { + "atol": 1e-9, + "rtol": 1e-7, +} ASSERT_RESULT = { - # In the case of 64bit, we test for exact equality of the results rankings - # and standard tolerance levels for the computed distance values. - # - # XXX: Note that in the future we might be interested in using quasi equality - # checks also for float64 data (with a larger number of significant digits) - # as the tests could be unstable because of numerically tied distances on - # some datasets (e.g. uniform grids). - (ArgKmin, np.float64): assert_argkmin_results_equality, + (ArgKmin, np.float64): partial(assert_compatible_argkmin_results, **FLOAT64_TOLS), + (ArgKmin, np.float32): partial(assert_compatible_argkmin_results, **FLOAT32_TOLS), ( RadiusNeighbors, np.float64, - ): assert_radius_neighbors_results_equality, - # In the case of 32bit, indices can be permuted due to small difference - # in the computations of their associated distances, hence we test equality of - # results up to valid permutations. - (ArgKmin, np.float32): assert_argkmin_results_quasi_equality, + ): partial(assert_compatible_radius_results, **FLOAT64_TOLS), ( RadiusNeighbors, np.float32, - ): assert_radius_neighbors_results_quasi_equality, + ): partial(assert_compatible_radius_results, **FLOAT32_TOLS), } -def test_assert_argkmin_results_quasi_equality(): - atol = 0.0 - rtol = 1e-7 - eps = 1e-7 / 2 +def test_assert_compatible_argkmin_results(): + atol = 1e-7 + rtol = 0.0 + tols = dict(atol=atol, rtol=rtol) + + eps = atol / 3 _1m = 1.0 - eps _1p = 1.0 + eps @@ -357,13 +321,13 @@ def test_assert_argkmin_results_quasi_equality(): ) # Sanity check: compare the reference results to themselves. - assert_argkmin_results_quasi_equality( + assert_compatible_argkmin_results( ref_dist, ref_dist, ref_indices, ref_indices, rtol ) # Apply valid permutation on indices: the last 3 points are all very close # to one another so we accept any permutation on their rankings. - assert_argkmin_results_quasi_equality( + assert_compatible_argkmin_results( np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), np.array([[1.2, 2.5, 6.1, 6.1, 6.1]]), np.array([[1, 2, 3, 4, 5]]), @@ -374,7 +338,7 @@ def test_assert_argkmin_results_quasi_equality(): # The last few indices do not necessarily have to match because of the rounding # errors on the distances: their could be tied results at the boundary. - assert_argkmin_results_quasi_equality( + assert_compatible_argkmin_results( np.array([[1.2, 2.5, 3.0, 6.1, _6_1p]]), np.array([[1.2, 2.5, 3.0, _6_1m, 6.1]]), np.array([[1, 2, 3, 4, 5]]), @@ -385,24 +349,22 @@ def test_assert_argkmin_results_quasi_equality(): # All points are have close distances so any ranking permutation # is valid for this query result. - assert_argkmin_results_quasi_equality( + assert_compatible_argkmin_results( np.array([[1, 1, _1p, _1p, _1p]]), np.array([[1, 1, 1, 1, _1p]]), np.array([[7, 6, 8, 10, 9]]), np.array([[6, 9, 7, 8, 10]]), - atol=atol, - rtol=rtol, + **tols, ) # They could also be nearly truncation of very large nearly tied result # sets hence all indices can also be distinct in this case: - assert_argkmin_results_quasi_equality( + assert_compatible_argkmin_results( np.array([[1, 1, _1p, _1p, _1p]]), np.array([[1, 1, 1, 1, _1p]]), np.array([[34, 30, 8, 12, 24]]), np.array([[42, 1, 21, 13, 3]]), - atol=atol, - rtol=rtol, + **tols, ) # Apply invalid permutation on indices: permuting the ranks of the 2 @@ -410,16 +372,15 @@ def test_assert_argkmin_results_quasi_equality(): # different. msg = re.escape( "Query vector with index 0 lead to different distances for common neighbor with" - " index 1: dist_a=1.2 vs dist_b=2.5 (with atol=0.0 and rtol=1e-07)" + " index 1: dist_a=1.2 vs dist_b=2.5" ) with pytest.raises(AssertionError, match=msg): - assert_argkmin_results_quasi_equality( + assert_compatible_argkmin_results( np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), np.array([[1, 2, 3, 4, 5]]), np.array([[2, 1, 3, 4, 5]]), - atol=atol, - rtol=rtol, + **tols, ) # Detect missing indices within the expected precision level, even when the @@ -428,13 +389,12 @@ def test_assert_argkmin_results_quasi_equality(): "neighors in b missing from a: [12]\nneighors in a missing from b: [1]" ) with pytest.raises(AssertionError, match=msg): - assert_argkmin_results_quasi_equality( + assert_compatible_argkmin_results( np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), np.array([[1, 2, 3, 4, 5]]), np.array([[12, 2, 4, 11, 3]]), - atol=atol, - rtol=rtol, + **tols, ) # Detect missing indices outside the expected precision level. @@ -442,13 +402,12 @@ def test_assert_argkmin_results_quasi_equality(): "neighors in b missing from a: []\nneighors in a missing from b: [3]" ) with pytest.raises(AssertionError, match=msg): - assert_argkmin_results_quasi_equality( + assert_compatible_argkmin_results( np.array([[_1m, 1.0, _6_1m, 6.1, _6_1p]]), np.array([[1.0, 1.0, _6_1m, 6.1, 7]]), np.array([[1, 2, 3, 4, 5]]), np.array([[2, 1, 4, 5, 12]]), - atol=atol, - rtol=rtol, + **tols, ) # Detect missing indices outside the expected precision level, in the other @@ -457,34 +416,34 @@ def test_assert_argkmin_results_quasi_equality(): "neighors in b missing from a: [5]\nneighors in a missing from b: []" ) with pytest.raises(AssertionError, match=msg): - assert_argkmin_results_quasi_equality( + assert_compatible_argkmin_results( np.array([[_1m, 1.0, _6_1m, 6.1, 7]]), np.array([[1.0, 1.0, _6_1m, 6.1, _6_1p]]), np.array([[1, 2, 3, 4, 12]]), np.array([[2, 1, 5, 3, 4]]), - atol=atol, - rtol=rtol, + **tols, ) # Distances aren't properly sorted msg = "Distances aren't sorted on row 0" with pytest.raises(AssertionError, match=msg): - assert_argkmin_results_quasi_equality( + assert_compatible_argkmin_results( np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), np.array([[2.5, 1.2, _6_1m, 6.1, _6_1p]]), np.array([[1, 2, 3, 4, 5]]), np.array([[2, 1, 4, 5, 3]]), - atol=atol, - rtol=rtol, + **tols, ) -def test_assert_radius_neighbors_results_quasi_equality(): - rtol = 1e-7 - eps = 1e-7 / 2 +def test_assert_compatible_radius_results(): + atol = 1e-7 + rtol = 0.0 + tols = dict(atol=atol, rtol=rtol) + + eps = atol / 3 _1m = 1.0 - eps _1p = 1.0 + eps - _6_1m = 6.1 - eps _6_1p = 6.1 + eps @@ -499,57 +458,57 @@ def test_assert_radius_neighbors_results_quasi_equality(): ] # Sanity check: compare the reference results to themselves. - assert_radius_neighbors_results_quasi_equality( + assert_compatible_radius_results( ref_dist, ref_dist, ref_indices, ref_indices, radius=7.0, - rtol=rtol, + **tols, ) # Apply valid permutation on indices - assert_radius_neighbors_results_quasi_equality( + assert_compatible_radius_results( np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([1, 2, 3, 4, 5])]), np.array([np.array([1, 2, 4, 5, 3])]), radius=7.0, - rtol=rtol, + **tols, ) - assert_radius_neighbors_results_quasi_equality( + assert_compatible_radius_results( np.array([np.array([_1m, _1m, 1, _1p, _1p])]), np.array([np.array([_1m, _1m, 1, _1p, _1p])]), np.array([np.array([6, 7, 8, 9, 10])]), np.array([np.array([6, 9, 7, 8, 10])]), radius=7.0, - rtol=rtol, + **tols, ) # Apply invalid permutation on indices msg = re.escape( "Query vector with index 0 lead to different distances for common neighbor with" - " index 1: dist_a=1.2 vs dist_b=2.5 (with atol=1e-06 and rtol=1e-07)" + " index 1: dist_a=1.2 vs dist_b=2.5" ) with pytest.raises(AssertionError, match=msg): - assert_radius_neighbors_results_quasi_equality( + assert_compatible_radius_results( np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([1, 2, 3, 4, 5])]), np.array([np.array([2, 1, 3, 4, 5])]), radius=7.0, - rtol=rtol, + **tols, ) # Having extra last or missing elements is valid if they are in the # tolerated rounding error range: [(1 - rtol) * radius - atol, radius] - assert_radius_neighbors_results_quasi_equality( + assert_compatible_radius_results( np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p, _6_1p])]), np.array([np.array([1.2, 2.5, _6_1m, 6.1])]), np.array([np.array([1, 2, 3, 4, 5, 7])]), np.array([np.array([1, 2, 3, 6])]), radius=_6_1p, - rtol=rtol, + **tols, ) # Any discrepancy outside the tolerated rounding error range is invalid and @@ -559,62 +518,62 @@ def test_assert_radius_neighbors_results_quasi_equality(): " missing from a: []\nneighors in a missing from b: [3]" ) with pytest.raises(AssertionError, match=msg): - assert_radius_neighbors_results_quasi_equality( + assert_compatible_radius_results( np.array([np.array([1.2, 2.5, 6])]), np.array([np.array([1.2, 2.5])]), np.array([np.array([1, 2, 3])]), np.array([np.array([1, 2])]), radius=6.1, - rtol=rtol, + **tols, ) msg = re.escape( "Query vector with index 0 lead to mismatched result indices:\nneighors in b" " missing from a: [4]\nneighors in a missing from b: [2]" ) with pytest.raises(AssertionError, match=msg): - assert_radius_neighbors_results_quasi_equality( + assert_compatible_radius_results( np.array([np.array([1.2, 2.1, 2.5])]), np.array([np.array([1.2, 2, 2.5])]), np.array([np.array([1, 2, 3])]), np.array([np.array([1, 4, 3])]), radius=6.1, - rtol=rtol, + **tols, ) # Radius upper bound is strictly checked msg = re.escape( - "Largest returned distance 6.100000049999999 not within requested radius 6.1 on" + "Largest returned distance 6.100000033333333 not within requested radius 6.1 on" " row 0" ) with pytest.raises(AssertionError, match=msg): - assert_radius_neighbors_results_quasi_equality( + assert_compatible_radius_results( np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([1.2, 2.5, _6_1m, 6.1, 6.1])]), np.array([np.array([1, 2, 3, 4, 5])]), np.array([np.array([2, 1, 4, 5, 3])]), radius=6.1, - rtol=rtol, + **tols, ) with pytest.raises(AssertionError, match=msg): - assert_radius_neighbors_results_quasi_equality( + assert_compatible_radius_results( np.array([np.array([1.2, 2.5, _6_1m, 6.1, 6.1])]), np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([1, 2, 3, 4, 5])]), np.array([np.array([2, 1, 4, 5, 3])]), radius=6.1, - rtol=rtol, + **tols, ) # Distances aren't properly sorted msg = "Distances aren't sorted on row 0" with pytest.raises(AssertionError, match=msg): - assert_radius_neighbors_results_quasi_equality( + assert_compatible_radius_results( np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([2.5, 1.2, _6_1m, 6.1, _6_1p])]), np.array([np.array([1, 2, 3, 4, 5])]), np.array([np.array([2, 1, 4, 5, 3])]), radius=6.1, - rtol=rtol, + **tols, ) From 70dd93306d393e675f2fbd1b25dacbceffe78be6 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 14 Sep 2023 17:31:24 +0200 Subject: [PATCH 17/33] [azure parallel] [all random seeds] test_pairwise_distances From e8c227a0f63f7c587867c663f5dbc88f76d4ae5a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 14 Sep 2023 17:36:26 +0200 Subject: [PATCH 18/33] More interesting tests --- .../metrics/tests/test_pairwise_distances_reduction.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 6decc47fdd6f5..8588cf0c8fc22 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -329,9 +329,9 @@ def test_assert_compatible_argkmin_results(): # to one another so we accept any permutation on their rankings. assert_compatible_argkmin_results( np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), - np.array([[1.2, 2.5, 6.1, 6.1, 6.1]]), + np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), np.array([[1, 2, 3, 4, 5]]), - np.array([[1, 2, 4, 5, 3]]), + np.array([[1, 2, 5, 4, 3]]), atol=atol, rtol=rtol, ) @@ -350,7 +350,7 @@ def test_assert_compatible_argkmin_results(): # All points are have close distances so any ranking permutation # is valid for this query result. assert_compatible_argkmin_results( - np.array([[1, 1, _1p, _1p, _1p]]), + np.array([[_1m, 1, _1p, _1p, _1p]]), np.array([[1, 1, 1, 1, _1p]]), np.array([[7, 6, 8, 10, 9]]), np.array([[6, 9, 7, 8, 10]]), @@ -360,8 +360,8 @@ def test_assert_compatible_argkmin_results(): # They could also be nearly truncation of very large nearly tied result # sets hence all indices can also be distinct in this case: assert_compatible_argkmin_results( - np.array([[1, 1, _1p, _1p, _1p]]), - np.array([[1, 1, 1, 1, _1p]]), + np.array([[_1m, 1, _1p, _1p, _1p]]), + np.array([[_1m, 1, 1, 1, _1p]]), np.array([[34, 30, 8, 12, 24]]), np.array([[42, 1, 21, 13, 3]]), **tols, From 9845866e2ca0ad0dc93a411adbfa29bc35c253fc Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 14 Sep 2023 17:36:32 +0200 Subject: [PATCH 19/33] [azure parallel] [all random seeds] test_pairwise_distances From 106952a1ee0a6e416095152dacc1f638f4e8722b Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 14 Sep 2023 17:51:27 +0200 Subject: [PATCH 20/33] Fix side effect of the refactoring in test_neighbors --- .../test_pairwise_distances_reduction.py | 38 +++++++++++++++---- sklearn/neighbors/tests/test_neighbors.py | 5 ++- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 8588cf0c8fc22..fb9281967de09 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -199,6 +199,7 @@ def assert_compatible_radius_results( neighbors_indices_a, neighbors_indices_b, radius, + check_sorted=True, rtol=1e-5, atol=1e-6, ): @@ -234,8 +235,9 @@ def assert_compatible_radius_results( indices_row_a = neighbors_indices_a[query_idx] indices_row_b = neighbors_indices_b[query_idx] - assert is_sorted(dist_row_a), f"Distances aren't sorted on row {query_idx}" - assert is_sorted(dist_row_b), f"Distances aren't sorted on row {query_idx}" + if check_sorted: + assert is_sorted(dist_row_a), f"Distances aren't sorted on row {query_idx}" + assert is_sorted(dist_row_b), f"Distances aren't sorted on row {query_idx}" assert len(dist_row_a) == len(indices_row_a) assert len(dist_row_b) == len(indices_row_b) @@ -436,7 +438,8 @@ def test_assert_compatible_argkmin_results(): ) -def test_assert_compatible_radius_results(): +@pytest.mark.parametrize("check_sorted", [True, False]) +def test_assert_compatible_radius_results(check_sorted): atol = 1e-7 rtol = 0.0 tols = dict(atol=atol, rtol=rtol) @@ -464,6 +467,7 @@ def test_assert_compatible_radius_results(): ref_indices, ref_indices, radius=7.0, + check_sorted=check_sorted, **tols, ) @@ -474,6 +478,7 @@ def test_assert_compatible_radius_results(): np.array([np.array([1, 2, 3, 4, 5])]), np.array([np.array([1, 2, 4, 5, 3])]), radius=7.0, + check_sorted=check_sorted, **tols, ) assert_compatible_radius_results( @@ -482,6 +487,7 @@ def test_assert_compatible_radius_results(): np.array([np.array([6, 7, 8, 9, 10])]), np.array([np.array([6, 9, 7, 8, 10])]), radius=7.0, + check_sorted=check_sorted, **tols, ) @@ -497,6 +503,7 @@ def test_assert_compatible_radius_results(): np.array([np.array([1, 2, 3, 4, 5])]), np.array([np.array([2, 1, 3, 4, 5])]), radius=7.0, + check_sorted=check_sorted, **tols, ) @@ -508,6 +515,7 @@ def test_assert_compatible_radius_results(): np.array([np.array([1, 2, 3, 4, 5, 7])]), np.array([np.array([1, 2, 3, 6])]), radius=_6_1p, + check_sorted=check_sorted, **tols, ) @@ -524,6 +532,7 @@ def test_assert_compatible_radius_results(): np.array([np.array([1, 2, 3])]), np.array([np.array([1, 2])]), radius=6.1, + check_sorted=check_sorted, **tols, ) msg = re.escape( @@ -537,6 +546,7 @@ def test_assert_compatible_radius_results(): np.array([np.array([1, 2, 3])]), np.array([np.array([1, 4, 3])]), radius=6.1, + check_sorted=check_sorted, **tols, ) @@ -552,6 +562,7 @@ def test_assert_compatible_radius_results(): np.array([np.array([1, 2, 3, 4, 5])]), np.array([np.array([2, 1, 4, 5, 3])]), radius=6.1, + check_sorted=check_sorted, **tols, ) with pytest.raises(AssertionError, match=msg): @@ -561,18 +572,31 @@ def test_assert_compatible_radius_results(): np.array([np.array([1, 2, 3, 4, 5])]), np.array([np.array([2, 1, 4, 5, 3])]), radius=6.1, + check_sorted=check_sorted, **tols, ) - # Distances aren't properly sorted - msg = "Distances aren't sorted on row 0" - with pytest.raises(AssertionError, match=msg): + if check_sorted: + # Distances aren't properly sorted + msg = "Distances aren't sorted on row 0" + with pytest.raises(AssertionError, match=msg): + assert_compatible_radius_results( + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([2.5, 1.2, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1, 2, 3, 4, 5])]), + np.array([np.array([2, 1, 4, 5, 3])]), + radius=_6_1p, + check_sorted=True, + **tols, + ) + else: assert_compatible_radius_results( np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), np.array([np.array([2.5, 1.2, _6_1m, 6.1, _6_1p])]), np.array([np.array([1, 2, 3, 4, 5])]), np.array([np.array([2, 1, 4, 5, 3])]), - radius=6.1, + radius=_6_1p, + check_sorted=False, **tols, ) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 35fc210bea7f3..08efb587808f7 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -30,7 +30,7 @@ from sklearn.metrics.pairwise import pairwise_distances from sklearn.metrics.tests.test_dist_metrics import BOOL_METRICS from sklearn.metrics.tests.test_pairwise_distances_reduction import ( - assert_radius_neighbors_results_equality, + assert_compatible_radius_results, ) from sklearn.model_selection import cross_val_score, train_test_split from sklearn.neighbors import ( @@ -2242,12 +2242,13 @@ def test_radius_neighbors_brute_backend( X_test, return_distance=True ) - assert_radius_neighbors_results_equality( + assert_compatible_radius_results( legacy_brute_dst, pdr_brute_dst, legacy_brute_idx, pdr_brute_idx, radius=radius, + check_sorted=False, ) From 9b57538d4a2c734f1a370131bb5237aa6ed669f9 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 14 Sep 2023 19:36:45 +0200 Subject: [PATCH 21/33] More test tweaks to reduce test time while increasing aggregate coverage --- .../test_pairwise_distances_reduction.py | 4 +-- sklearn/neighbors/tests/test_neighbors.py | 26 +++++++++++++------ 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index fb9281967de09..19c68cecc01b6 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -1208,7 +1208,7 @@ def test_pairwise_distances_argkmin( strategy, dtype, csr_container, - n_queries=10, + n_queries=5, n_samples=100, k=10, ): @@ -1273,7 +1273,7 @@ def test_pairwise_distances_radius_neighbors( metric, strategy, dtype, - n_queries=10, + n_queries=5, n_samples=100, ): rng = np.random.RandomState(global_random_seed) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 08efb587808f7..daf4c13c2a5b3 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -30,6 +30,7 @@ from sklearn.metrics.pairwise import pairwise_distances from sklearn.metrics.tests.test_dist_metrics import BOOL_METRICS from sklearn.metrics.tests.test_pairwise_distances_reduction import ( + assert_compatible_argkmin_results, assert_compatible_radius_results, ) from sklearn.model_selection import cross_val_score, train_test_split @@ -1694,8 +1695,15 @@ def test_neighbors_metrics( "metric", sorted(set(neighbors.VALID_METRICS["brute"]) - set(["precomputed"])) ) def test_kneighbors_brute_backend( - global_dtype, metric, n_samples=2000, n_features=30, n_query_pts=100, n_neighbors=5 + metric, + global_dtype, + global_random_seed, + n_samples=2000, + n_features=30, + n_query_pts=5, + n_neighbors=5, ): + rng = np.random.RandomState(global_random_seed) # Both backend for the 'brute' algorithm of kneighbors must give identical results. X_train = rng.rand(n_samples, n_features).astype(global_dtype, copy=False) X_test = rng.rand(n_query_pts, n_features).astype(global_dtype, copy=False) @@ -1732,8 +1740,9 @@ def test_kneighbors_brute_backend( X_test, return_distance=True ) - assert_allclose(legacy_brute_dst, pdr_brute_dst) - assert_array_equal(legacy_brute_idx, pdr_brute_idx) + assert_compatible_argkmin_results( + legacy_brute_dst, pdr_brute_dst, legacy_brute_idx, pdr_brute_idx + ) def test_callable_metric(): @@ -2198,16 +2207,18 @@ def test_auto_algorithm(X, metric, metric_params, expected_algo): ) def test_radius_neighbors_brute_backend( metric, + global_random_seed, + global_dtype, n_samples=2000, n_features=30, - n_query_pts=100, - n_neighbors=5, + n_query_pts=5, radius=1.0, ): + rng = np.random.RandomState(global_random_seed) # Both backends for the 'brute' algorithm of radius_neighbors # must give identical results. - X_train = rng.rand(n_samples, n_features) - X_test = rng.rand(n_query_pts, n_features) + X_train = rng.rand(n_samples, n_features).astype(global_dtype, copy=False) + X_test = rng.rand(n_query_pts, n_features).astype(global_dtype, copy=False) # Haversine distance only accepts 2D data if metric == "haversine": @@ -2221,7 +2232,6 @@ def test_radius_neighbors_brute_backend( p = metric_params.pop("p", 2) neigh = neighbors.NearestNeighbors( - n_neighbors=n_neighbors, radius=radius, algorithm="brute", metric=metric, From 30f190164a921bc7be19402fdd216363d88558f5 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 14 Sep 2023 20:26:53 +0200 Subject: [PATCH 22/33] Optimize the speed of running test_strategies_consistency while keeping it as useful as possible at finding regressions --- .../test_pairwise_distances_reduction.py | 38 ++++++++++++------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 19c68cecc01b6..c78aefbfa9455 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -8,7 +8,7 @@ import threadpoolctl from scipy.spatial.distance import cdist -from sklearn.metrics import euclidean_distances +from sklearn.metrics import euclidean_distances, pairwise_distances from sklearn.metrics._pairwise_distances_reduction import ( ArgKmin, ArgKminClassMode, @@ -1122,25 +1122,30 @@ def test_format_agnosticism( ) -@pytest.mark.parametrize( - "metric", - ["euclidean", "minkowski", "manhattan", "infinity", "seuclidean", "haversine"], -) @pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors]) -@pytest.mark.parametrize("dtype", [np.float64, np.float32]) def test_strategies_consistency( global_random_seed, + global_dtype, Dispatcher, - metric, - dtype, n_features=10, ): """Check that the results do not depend on the strategy used.""" rng = np.random.RandomState(global_random_seed) + metric = rng.choice( + np.array( + [ + "euclidean", + "minkowski", + "manhattan", + "haversine", + ], + dtype=object, + ) + ) n_samples_X, n_samples_Y = rng.choice([97, 100, 101, 500], size=2, replace=False) spread = 100 - X = rng.rand(n_samples_X, n_features).astype(dtype) * spread - Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread + X = rng.rand(n_samples_X, n_features).astype(global_dtype) * spread + Y = rng.rand(n_samples_Y, n_features).astype(global_dtype) * spread # Haversine distance only accepts 2D data if metric == "haversine": @@ -1152,8 +1157,15 @@ def test_strategies_consistency( check_parameters = {} compute_parameters = {} else: - # Scaling the radius slightly with the numbers of dimensions - radius = 10 ** np.log(n_features) + # Find a non-trivial radius using a small subsample of X and Y: we want + # to return around expected_n_neighbors on average. Yielding too many + # results would make the test slow (because checking the results is + # expensive for large result sets), yielding 0 most of the time would + # make the test useless. + expected_n_neighbors = 10 + sample_dists = pairwise_distances(X[:10], Y, metric=metric) + sample_dists.sort(axis=1) + radius = sample_dists[:, expected_n_neighbors].mean() parameter = radius check_parameters = {"radius": radius} compute_parameters = {"sort_results": True} @@ -1190,7 +1202,7 @@ def test_strategies_consistency( **compute_parameters, ) - ASSERT_RESULT[(Dispatcher, dtype)]( + ASSERT_RESULT[(Dispatcher, global_dtype)]( dist_par_X, dist_par_Y, indices_par_X, indices_par_Y, **check_parameters ) From 897fece7c8c03d65168fe1e6ed3daba6db589551 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 14 Sep 2023 20:48:11 +0200 Subject: [PATCH 23/33] Attempt to make test_pairwise_distances_radius_neighbors more useful by using a data-driven radius --- .../test_pairwise_distances_reduction.py | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index c78aefbfa9455..d31ada7753d92 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -244,13 +244,15 @@ def assert_compatible_radius_results( # Check that all distances are within the requested radius if len(dist_row_a) > 0: - assert dist_row_a[-1] <= radius, ( - f"Largest returned distance {dist_row_a[-1]} not within requested" + max_dist_a = np.max(dist_row_a) + assert max_dist_a <= radius, ( + f"Largest returned distance {max_dist_a} not within requested" f" radius {radius} on row {query_idx}" ) if len(dist_row_b) > 0: - assert dist_row_b[-1] <= radius, ( - f"Largest returned distance {dist_row_b[-1]} not within requested" + max_dist_b = np.max(dist_row_b) + assert max_dist_b <= radius, ( + f"Largest returned distance {max_dist_b} not within requested" f" radius {radius} on row {query_idx}" ) @@ -1292,7 +1294,6 @@ def test_pairwise_distances_radius_neighbors( n_features = rng.choice([50, 500]) translation = rng.choice([0, 1e6]) spread = 1000 - radius = spread * np.log(n_features) X = translation + rng.rand(n_queries, n_features).astype(dtype) * spread Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread @@ -1307,6 +1308,16 @@ def test_pairwise_distances_radius_neighbors( else: dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs) + # Find a non-trivial radius using a small subsample of the dist_matrix: + # we want to return around expected_n_neighbors on average. Yielding too + # many results would make the test slow (because checking the results is + # expensive for large result sets), yielding 0 most of the time would make + # the test useless. + expected_n_neighbors = 10 + sample_dists = dist_matrix[:10] + sample_dists.sort(axis=1) + radius = sample_dists[:, expected_n_neighbors].mean() + # Getting the neighbors for a given radius neigh_indices_ref = [] neigh_distances_ref = [] From d23210ba77ca55297570651e303b3f0ac38413f5 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 14 Sep 2023 23:56:56 +0200 Subject: [PATCH 24/33] Fix side effect of the heuristic radius computation --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index d31ada7753d92..381802fa284b5 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -1314,7 +1314,7 @@ def test_pairwise_distances_radius_neighbors( # expensive for large result sets), yielding 0 most of the time would make # the test useless. expected_n_neighbors = 10 - sample_dists = dist_matrix[:10] + sample_dists = dist_matrix[:10].copy() sample_dists.sort(axis=1) radius = sample_dists[:, expected_n_neighbors].mean() From f2d9725c8d0a2c84ffdf5beb40a968e1e118d51c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 15 Sep 2023 00:11:20 +0200 Subject: [PATCH 25/33] Accelerate test_n_threads_agnosticism by using a data-drive radius value --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 381802fa284b5..57ab46edaedd5 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -1028,7 +1028,10 @@ def test_n_threads_agnosticism( compute_parameters = {} else: # Scaling the radius slightly with the numbers of dimensions - radius = 10 ** np.log(n_features) + expected_n_neighbors = 10 + dist_sample = euclidean_distances(X[:10], Y) + dist_sample.sort(axis=1) + radius = dist_sample[:, expected_n_neighbors].mean() parameter = radius check_parameters = {"radius": radius} compute_parameters = {"sort_results": True} From ec263ca9e70840d91688550c45d11e027d05bda4 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 15 Sep 2023 00:16:38 +0200 Subject: [PATCH 26/33] Accelerate test_format_agnosticism and test_chunk_size_agnosticism by using a data-drive radius value --- .../metrics/tests/test_pairwise_distances_reduction.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 57ab46edaedd5..5dcca4ad97b02 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -977,7 +977,10 @@ def test_chunk_size_agnosticism( compute_parameters = {} else: # Scaling the radius slightly with the numbers of dimensions - radius = 10 ** np.log(n_features) + expected_n_neighbors = 10 + dist_sample = euclidean_distances(X[:10], Y) + dist_sample.sort(axis=1) + radius = dist_sample[:, expected_n_neighbors].mean() parameter = radius check_parameters = {"radius": radius} compute_parameters = {"sort_results": True} @@ -1093,7 +1096,10 @@ def test_format_agnosticism( compute_parameters = {} else: # Scaling the radius slightly with the numbers of dimensions - radius = 10 ** np.log(n_features) + expected_n_neighbors = 10 + dist_sample = euclidean_distances(X[:10], Y) + dist_sample.sort(axis=1) + radius = dist_sample[:, expected_n_neighbors].mean() parameter = radius check_parameters = {"radius": radius} compute_parameters = {"sort_results": True} From f43d5cb7d755f0a2c01b41e9a804c034a8da20b9 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 15 Sep 2023 10:22:52 +0200 Subject: [PATCH 27/33] Do not rely on implicit data conversion in test_kneighbors_brute_backend --- sklearn/neighbors/tests/test_neighbors.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index daf4c13c2a5b3..9a0dd18d845fa 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -27,7 +27,7 @@ from sklearn.metrics._dist_metrics import ( DistanceMetric, ) -from sklearn.metrics.pairwise import pairwise_distances +from sklearn.metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS, pairwise_distances from sklearn.metrics.tests.test_dist_metrics import BOOL_METRICS from sklearn.metrics.tests.test_pairwise_distances_reduction import ( assert_compatible_argkmin_results, @@ -1714,6 +1714,10 @@ def test_kneighbors_brute_backend( X_train = np.ascontiguousarray(X_train[:, feature_sl]) X_test = np.ascontiguousarray(X_test[:, feature_sl]) + if metric in metric in PAIRWISE_BOOLEAN_FUNCTIONS: + X_train = X_train > 0.5 + X_test = X_test > 0.5 + metric_params_list = _generate_test_params_for(metric, n_features) for metric_params in metric_params_list: From f2d8d8ec9cebe6465ce15e3cf7fd21d7ac2c65ad Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 15 Sep 2023 11:51:27 +0200 Subject: [PATCH 28/33] [azure parallel] [all random seeds] test_pairwise_distances From e85d81bebcfba9927728f3ed1defcf7c7ee94e49 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 15 Sep 2023 14:44:17 +0200 Subject: [PATCH 29/33] Typo: if metric in metric in ... --- sklearn/neighbors/tests/test_neighbors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index d39e10eeddfa4..947879999a156 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1714,7 +1714,7 @@ def test_kneighbors_brute_backend( X_train = np.ascontiguousarray(X_train[:, feature_sl]) X_test = np.ascontiguousarray(X_test[:, feature_sl]) - if metric in metric in PAIRWISE_BOOLEAN_FUNCTIONS: + if metric in PAIRWISE_BOOLEAN_FUNCTIONS: X_train = X_train > 0.5 X_test = X_test > 0.5 From c01255e3f7f0ffa4226b8a4f0f4ae142d53925ec Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 26 Sep 2023 11:03:45 +0200 Subject: [PATCH 30/33] Typos and code style improvements Co-authored-by: Julien Jerphanion --- .../tests/test_pairwise_distances_reduction.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 5dcca4ad97b02..d82aba911d6b8 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -81,8 +81,8 @@ def assert_same_distances_for_common_neighbors( # Compute a mapping from indices to distances for each result set and # check that the computed neighbors with matching indices are withing # the expected distance tolerance. - indices_to_dist_a = {idx: dist for idx, dist in zip(indices_row_a, dist_row_a)} - indices_to_dist_b = {idx: dist for idx, dist in zip(indices_row_b, dist_row_b)} + indices_to_dist_a = dict(zip(indices_row_a, dist_row_a)) + indices_to_dist_b = dict(zip(indices_row_b, dist_row_b)) common_indices = set(indices_row_a).intersection(set(indices_row_b)) for idx in common_indices: @@ -224,7 +224,7 @@ def assert_compatible_radius_results( == len(neighbors_dists_b) == len(neighbors_indices_a) == len(neighbors_indices_b) - ), "Arrays of results have various lengths." + ) n_queries = len(neighbors_dists_a) @@ -336,22 +336,20 @@ def test_assert_compatible_argkmin_results(): np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), np.array([[1, 2, 3, 4, 5]]), np.array([[1, 2, 5, 4, 3]]), - atol=atol, - rtol=rtol, + **tols, ) # The last few indices do not necessarily have to match because of the rounding - # errors on the distances: their could be tied results at the boundary. + # errors on the distances: there could be tied results at the boundary. assert_compatible_argkmin_results( np.array([[1.2, 2.5, 3.0, 6.1, _6_1p]]), np.array([[1.2, 2.5, 3.0, _6_1m, 6.1]]), np.array([[1, 2, 3, 4, 5]]), np.array([[1, 2, 3, 6, 7]]), - atol=atol, - rtol=rtol, + **tols, ) - # All points are have close distances so any ranking permutation + # All points have close distances so any ranking permutation # is valid for this query result. assert_compatible_argkmin_results( np.array([[_1m, 1, _1p, _1p, _1p]]), From 05234d490102c3e3f7b8a7fa3c8e47b0b6105543 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 26 Sep 2023 11:35:15 +0200 Subject: [PATCH 31/33] Factorize redundant test logic to find a non-trivial test radius --- .../test_pairwise_distances_reduction.py | 67 ++++++++++--------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index d82aba911d6b8..7dd1a708df9b7 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -193,6 +193,33 @@ def assert_compatible_argkmin_results( ) +def _non_trivial_radius( + *, + X=None, + Y=None, + metric=None, + precomputed_dists=None, + expected_n_neighbors=10, + n_subsampled_queries=10, + **metric_kwargs, +): + # Find a non-trivial radius using a small subsample of the pairwise + # distances between X and Y: we want to return around expected_n_neighbors + # on average. Yielding too many results would make the test slow (because + # checking the results is expensive for large result sets), yielding 0 most + # of the time would make the test useless. + if precomputed_dists is None and metric is None: + raise ValueError("Either metric or dists must be provided") + if precomputed_dists is None: + assert X is not None + assert Y is not None + sampled_dists = pairwise_distances(X, Y, metric=metric, **metric_kwargs) + else: + sampled_dists = precomputed_dists[:n_subsampled_queries].copy() + sampled_dists.sort(axis=1) + return sampled_dists[:, expected_n_neighbors].mean() + + def assert_compatible_radius_results( neighbors_dists_a, neighbors_dists_b, @@ -974,11 +1001,7 @@ def test_chunk_size_agnosticism( check_parameters = {} compute_parameters = {} else: - # Scaling the radius slightly with the numbers of dimensions - expected_n_neighbors = 10 - dist_sample = euclidean_distances(X[:10], Y) - dist_sample.sort(axis=1) - radius = dist_sample[:, expected_n_neighbors].mean() + radius = _non_trivial_radius(X=X, Y=Y, metric="euclidean") parameter = radius check_parameters = {"radius": radius} compute_parameters = {"sort_results": True} @@ -1028,11 +1051,7 @@ def test_n_threads_agnosticism( check_parameters = {} compute_parameters = {} else: - # Scaling the radius slightly with the numbers of dimensions - expected_n_neighbors = 10 - dist_sample = euclidean_distances(X[:10], Y) - dist_sample.sort(axis=1) - radius = dist_sample[:, expected_n_neighbors].mean() + radius = _non_trivial_radius(X=X, Y=Y, metric="euclidean") parameter = radius check_parameters = {"radius": radius} compute_parameters = {"sort_results": True} @@ -1093,11 +1112,9 @@ def test_format_agnosticism( check_parameters = {} compute_parameters = {} else: - # Scaling the radius slightly with the numbers of dimensions - expected_n_neighbors = 10 - dist_sample = euclidean_distances(X[:10], Y) - dist_sample.sort(axis=1) - radius = dist_sample[:, expected_n_neighbors].mean() + # Adjusting the radius to ensure that the expected results is neither + # trivially empty nor too large. + radius = _non_trivial_radius(X=X, Y=Y, metric="euclidean") parameter = radius check_parameters = {"radius": radius} compute_parameters = {"sort_results": True} @@ -1166,15 +1183,7 @@ def test_strategies_consistency( check_parameters = {} compute_parameters = {} else: - # Find a non-trivial radius using a small subsample of X and Y: we want - # to return around expected_n_neighbors on average. Yielding too many - # results would make the test slow (because checking the results is - # expensive for large result sets), yielding 0 most of the time would - # make the test useless. - expected_n_neighbors = 10 - sample_dists = pairwise_distances(X[:10], Y, metric=metric) - sample_dists.sort(axis=1) - radius = sample_dists[:, expected_n_neighbors].mean() + radius = _non_trivial_radius(X=X, Y=Y, metric=metric) parameter = radius check_parameters = {"radius": radius} compute_parameters = {"sort_results": True} @@ -1315,15 +1324,7 @@ def test_pairwise_distances_radius_neighbors( else: dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs) - # Find a non-trivial radius using a small subsample of the dist_matrix: - # we want to return around expected_n_neighbors on average. Yielding too - # many results would make the test slow (because checking the results is - # expensive for large result sets), yielding 0 most of the time would make - # the test useless. - expected_n_neighbors = 10 - sample_dists = dist_matrix[:10].copy() - sample_dists.sort(axis=1) - radius = sample_dists[:, expected_n_neighbors].mean() + radius = _non_trivial_radius(precomputed_dists=dist_matrix) # Getting the neighbors for a given radius neigh_indices_ref = [] From 1224fb6aa50951d67db09c8dd8b89fe642d26c2a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 27 Sep 2023 18:08:20 +0200 Subject: [PATCH 32/33] Apply suggestions from code review Co-authored-by: Guillaume Lemaitre --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 686a9d9354fa7..c9d5911ae7712 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -80,7 +80,7 @@ def assert_same_distances_for_common_neighbors( Missingness is handled by assert_no_missing_neighbors. """ # Compute a mapping from indices to distances for each result set and - # check that the computed neighbors with matching indices are withing + # check that the computed neighbors with matching indices are within # the expected distance tolerance. indices_to_dist_a = dict(zip(indices_row_a, dist_row_a)) indices_to_dist_b = dict(zip(indices_row_b, dist_row_b)) @@ -135,7 +135,7 @@ def assert_compatible_argkmin_results( rtol=1e-5, atol=1e-6, ): - """Assert that argkmin results are valid up to rounding errors + """Assert that argkmin results are valid up to rounding errors. This function asserts that the results of argkmin queries are valid up to: - rounding error tolerance on distance values; From 7fb42619edfaea497eb5f6f321c9af6f0e890817 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 27 Sep 2023 18:20:43 +0200 Subject: [PATCH 33/33] DOC add missing docstring + improve inline comment --- .../test_pairwise_distances_reduction.py | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index c9d5911ae7712..75f497315ff01 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -111,6 +111,18 @@ def assert_no_missing_neighbors( indices_row_b, threshold, ): + """Compare the indices of neighbors in two results sets. + + Any neighbor index with a distance below the precision threshold should + match one in the other result set. We ignore the last few neighbors beyond + the threshold as those can typically be missing due to rounding errors. + + For radius queries, the threshold is just the radius minus the expected + precision level. + + For k-NN queries, it is the maxium distance to the k-th neighbor minus the + expected precision level. + """ mask_a = dist_row_a < threshold mask_b = dist_row_b < threshold missing_from_b = np.setdiff1d(indices_row_a[mask_a], indices_row_b) @@ -179,8 +191,15 @@ def assert_compatible_argkmin_results( atol, ) - # Check that any neighbor with distances below the rounding error threshold have - # matching indices. + # Check that any neighbor with distances below the rounding error + # threshold have matching indices. The threshold is the distance to the + # k-th neighbors minus the expected precision level: + # + # (1 - rtol) * dist_k - atol + # + # Where dist_k is defined as the maxium distance to the kth-neighbor + # among the two result sets. This way of defining the threshold is + # stricter than taking the minimum of the two. threshold = (1 - rtol) * np.maximum( np.max(dist_row_a), np.max(dist_row_b) ) - atol