From 06e733e1c1c3327c3ffe12a92a12fdf69ffa0bbc Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Fri, 5 Aug 2022 11:48:59 +0200
Subject: [PATCH 1/3] ASV Add benchmarks for PairwiseDistancesReductions

The radius definition is from Olivier:

https://github.com/scikit-learn/scikit-learn/pull/22320#pullrequestreview-892758702

Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
---
 .../pairwise_distances_reductions.py          | 84 +++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 asv_benchmarks/benchmarks/pairwise_distances_reductions.py

diff --git a/asv_benchmarks/benchmarks/pairwise_distances_reductions.py b/asv_benchmarks/benchmarks/pairwise_distances_reductions.py
new file mode 100644
index 0000000000000..55ad93529442d
--- /dev/null
+++ b/asv_benchmarks/benchmarks/pairwise_distances_reductions.py
@@ -0,0 +1,84 @@
+import numpy as np
+from scipy.spatial.distance import cdist
+
+from .common import Benchmark
+
+from sklearn.metrics._pairwise_distances_reduction import (
+    PairwiseDistancesArgKmin,
+    PairwiseDistancesRadiusNeighborhood,
+)
+
+# To run benchmarks defined this file, between for instance your <current_branch>
+# and its fork point with upstream/main, run:
+#
+#             asv continuous -b PairwiseDistancesReductions \
+#             -e `git merge-base --fork-point upstream/main <your_branch>` <your_branch>
+
+
+class PairwiseDistancesReductionsBenchmark(Benchmark):
+
+    param_names = ["n_train", "n_test", "n_features", "metric", "strategy"]
+    params = [
+        [1000, 10_000, int(1e7)],
+        [1000, 10_000, 100_000],
+        [100],
+        ["euclidean", "manhattan"],
+        ["auto", "parallel_on_X", "parallel_on_Y"],
+    ]
+
+    def setup(self, n_train, n_test, n_features, metric, strategy):
+        rng = np.random.RandomState(0)
+        self.X_train = rng.rand(n_train, n_features)
+        self.X_test = rng.rand(n_test, n_features)
+        self.y_train = rng.randint(low=-1, high=1, size=(n_train,))
+        self.metric = metric
+        self.strategy = strategy
+
+        self.k = 10
+
+        # Motive: radius has to be scaled with the number of feature
+        # Defining it as the 0.001-quantile allows to have in expectation
+        # a constant amount of neighbors, regardless of the value of n_features.
+        self.radius = np.quantile(
+            a=cdist(self.X_train[:10000], self.X_test[:10]).ravel(), q=0.001
+        )
+
+    def time_PairwiseDistancesArgKmin(
+        self, n_train, n_test, n_features, metric, strategy
+    ):
+
+        PairwiseDistancesArgKmin.compute(
+            X=self.X_test,
+            Y=self.X_train,
+            k=10,
+            metric=self.metric,
+            return_distance=True,
+            strategy=self.strategy,
+        )
+
+    def peakmem_PairwiseDistancesArgKmin(
+        self, n_train, n_test, n_features, metric, strategy
+    ):
+        self.time_PairwiseDistancesArgKmin(
+            n_train, n_test, n_features, metric, strategy
+        )
+
+    def time_PairwiseDistancesRadiusNeighborhood(
+        self, n_train, n_test, n_features, metric, strategy
+    ):
+
+        PairwiseDistancesRadiusNeighborhood.compute(
+            X=self.X_test,
+            Y=self.X_train,
+            radius=10 * np.log(n_features),
+            metric=self.metric,
+            return_distance=True,
+            strategy=self.strategy,
+        )
+
+    def peakmem_PairwiseDistancesRadiusNeighborhood(
+        self, n_train, n_test, n_features, metric, strategy
+    ):
+        self.time_PairwiseDistancesRadiusNeighborhood(
+            n_train, n_test, n_features, metric, strategy
+        )

From 3ce295ce68df39af282b59e07315dd7191770dab Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Fri, 5 Aug 2022 21:58:33 +0200
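Subject: [PATCH 2/3] ASV Use the quantile-based radius in the radius benchmark

Pass the radius computed in `setup` (the 0.001-quantile of a sample of
pairwise distances) to the radius neighborhood benchmark instead of the
hard-coded `10 * np.log(n_features)`.
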
---
 asv_benchmarks/benchmarks/pairwise_distances_reductions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/asv_benchmarks/benchmarks/pairwise_distances_reductions.py b/asv_benchmarks/benchmarks/pairwise_distances_reductions.py
index 55ad93529442d..8a7793c1b4e30 100644
--- a/asv_benchmarks/benchmarks/pairwise_distances_reductions.py
+++ b/asv_benchmarks/benchmarks/pairwise_distances_reductions.py
@@ -70,7 +70,7 @@ def time_PairwiseDistancesRadiusNeighborhood(
         PairwiseDistancesRadiusNeighborhood.compute(
             X=self.X_test,
             Y=self.X_train,
-            radius=10 * np.log(n_features),
+            radius=self.radius,
             metric=self.metric,
             return_distance=True,
             strategy=self.strategy,

From 61f64be4b060b159a4691e0a8c00004cfcb6f3a4 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Fri, 21 Oct 2022 14:13:07 +0200
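Subject: [PATCH 3/3] ASV Update benchmarks to cover newly supported datasets

Namely, add float32 data and CSR matrices to the benchmark parameters.
Also follow the renaming of the reductions to `ArgKmin` and `RadiusNeighbors`.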
---
 .../pairwise_distances_reductions.py          | 114 +++++++++++++-----
 1 file changed, 87 insertions(+), 27 deletions(-)

diff --git a/asv_benchmarks/benchmarks/pairwise_distances_reductions.py b/asv_benchmarks/benchmarks/pairwise_distances_reductions.py
index 8a7793c1b4e30..655e8967cdbf8 100644
--- a/asv_benchmarks/benchmarks/pairwise_distances_reductions.py
+++ b/asv_benchmarks/benchmarks/pairwise_distances_reductions.py
@@ -1,35 +1,69 @@
 import numpy as np
 from scipy.spatial.distance import cdist
+from scipy.sparse import rand as sparse_rand
 
 from .common import Benchmark
 
-from sklearn.metrics._pairwise_distances_reduction import (
-    PairwiseDistancesArgKmin,
-    PairwiseDistancesRadiusNeighborhood,
-)
+from sklearn.metrics._pairwise_distances_reduction import ArgKmin, RadiusNeighbors
 
 # To run benchmarks defined this file, between for instance your <current_branch>
-# and its fork point with upstream/main, run:
+# and upstream/main, use:
 #
-#             asv continuous -b PairwiseDistancesReductions \
-#             -e `git merge-base --fork-point upstream/main <your_branch>` <your_branch>
+#     asv continuous -b PairwiseDistancesReductions upstream/main <current_branch>
 
 
 class PairwiseDistancesReductionsBenchmark(Benchmark):
 
-    param_names = ["n_train", "n_test", "n_features", "metric", "strategy"]
+    param_names = [
+        "n_train",
+        "n_test",
+        "n_features",
+        "metric",
+        "strategy",
+        "dtype",
+        "X_train",
+        "X_test",
+    ]
     params = [
         [1000, 10_000, int(1e7)],
         [1000, 10_000, 100_000],
         [100],
         ["euclidean", "manhattan"],
         ["auto", "parallel_on_X", "parallel_on_Y"],
+        [np.float32, np.float64],
+        ["dense", "csr"],
+        ["dense", "csr"],
     ]
 
-    def setup(self, n_train, n_test, n_features, metric, strategy):
+    def setup(
+        self, n_train, n_test, n_features, metric, strategy, dtype, X_train, X_test
+    ):
         rng = np.random.RandomState(0)
-        self.X_train = rng.rand(n_train, n_features)
-        self.X_test = rng.rand(n_test, n_features)
+        self.X_train = (
+            rng.rand(n_train, n_features).astype(dtype)
+            if X_train == "dense"
+            else sparse_rand(
+                n_train,
+                n_features,
+                density=0.05,
+                format="csr",
+                dtype=dtype,
+                random_state=rng,
+            )
+        )
+        self.X_test = (
+            rng.rand(n_test, n_features).astype(dtype)
+            if X_test == "dense"
+            else sparse_rand(
+                n_test,
+                n_features,
+                density=0.05,
+                format="csr",
+                dtype=dtype,
+                random_state=rng,
+            )
+        )
+
         self.y_train = rng.randint(low=-1, high=1, size=(n_train,))
         self.metric = metric
         self.strategy = strategy
@@ -39,15 +73,26 @@ def setup(self, n_train, n_test, n_features, metric, strategy):
         # Motive: radius has to be scaled with the number of feature
         # Defining it as the 0.001-quantile allows to have in expectation
         # a constant amount of neighbors, regardless of the value of n_features.
-        self.radius = np.quantile(
-            a=cdist(self.X_train[:10000], self.X_test[:10]).ravel(), q=0.001
+        dist_mat = cdist(
+            (self.X_train if X_train == "dense" else self.X_train.toarray())[:1000],
+            (self.X_test if X_test == "dense" else self.X_test.toarray())[:10],
         )
 
-    def time_PairwiseDistancesArgKmin(
-        self, n_train, n_test, n_features, metric, strategy
+        self.radius = np.quantile(a=dist_mat.ravel(), q=0.001)
+
+    def time_ArgKmin(
+        self,
+        n_train,
+        n_test,
+        n_features,
+        metric,
+        strategy,
+        dtype,
+        X_train,
+        X_test,
     ):
 
-        PairwiseDistancesArgKmin.compute(
+        ArgKmin.compute(
             X=self.X_test,
             Y=self.X_train,
             k=10,
@@ -56,18 +101,33 @@ def time_PairwiseDistancesArgKmin(
             strategy=self.strategy,
         )
 
-    def peakmem_PairwiseDistancesArgKmin(
-        self, n_train, n_test, n_features, metric, strategy
+    def peakmem_ArgKmin(
+        self,
+        n_train,
+        n_test,
+        n_features,
+        metric,
+        strategy,
+        dtype,
+        X_train,
+        X_test,
     ):
-        self.time_PairwiseDistancesArgKmin(
-            n_train, n_test, n_features, metric, strategy
+        self.time_ArgKmin(
+            n_train,
+            n_test,
+            n_features,
+            metric,
+            strategy,
+            dtype,
+            X_train,
+            X_test,
         )
 
-    def time_PairwiseDistancesRadiusNeighborhood(
-        self, n_train, n_test, n_features, metric, strategy
+    def time_RadiusNeighbors(
+        self, n_train, n_test, n_features, metric, strategy, dtype, X_train, X_test
     ):
 
-        PairwiseDistancesRadiusNeighborhood.compute(
+        RadiusNeighbors.compute(
             X=self.X_test,
             Y=self.X_train,
             radius=self.radius,
@@ -76,9 +136,9 @@ def time_PairwiseDistancesRadiusNeighborhood(
             strategy=self.strategy,
         )
 
-    def peakmem_PairwiseDistancesRadiusNeighborhood(
-        self, n_train, n_test, n_features, metric, strategy
+    def peakmem_RadiusNeighbors(
+        self, n_train, n_test, n_features, metric, strategy, dtype, X_train, X_test
     ):
-        self.time_PairwiseDistancesRadiusNeighborhood(
-            n_train, n_test, n_features, metric, strategy
+        self.time_RadiusNeighbors(
+            n_train, n_test, n_features, metric, strategy, dtype, X_train, X_test
         )