From 06e733e1c1c3327c3ffe12a92a12fdf69ffa0bbc Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 5 Aug 2022 11:48:59 +0200 Subject: [PATCH 1/3] ASV Add benchmarks for PairwiseDistancesReductions The radius definition is from Olivier: https://github.com/scikit-learn/scikit-learn/pull/22320#pullrequestreview-892758702 Co-authored-by: Olivier Grisel --- .../pairwise_distances_reductions.py | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 asv_benchmarks/benchmarks/pairwise_distances_reductions.py diff --git a/asv_benchmarks/benchmarks/pairwise_distances_reductions.py b/asv_benchmarks/benchmarks/pairwise_distances_reductions.py new file mode 100644 index 0000000000000..55ad93529442d --- /dev/null +++ b/asv_benchmarks/benchmarks/pairwise_distances_reductions.py @@ -0,0 +1,84 @@ +import numpy as np +from scipy.spatial.distance import cdist + +from .common import Benchmark + +from sklearn.metrics._pairwise_distances_reduction import ( + PairwiseDistancesArgKmin, + PairwiseDistancesRadiusNeighborhood, +) + +# To run benchmarks defined this file, between for instance your +# and its fork point with upstream/main, run: +# +# asv continuous -b PairwiseDistancesReductions \ +# -e `git merge-base --fork-point upstream/main ` + + +class PairwiseDistancesReductionsBenchmark(Benchmark): + + param_names = ["n_train", "n_test", "n_features", "metric", "strategy"] + params = [ + [1000, 10_000, int(1e7)], + [1000, 10_000, 100_000], + [100], + ["euclidean", "manhattan"], + ["auto", "parallel_on_X", "parallel_on_Y"], + ] + + def setup(self, n_train, n_test, n_features, metric, strategy): + rng = np.random.RandomState(0) + self.X_train = rng.rand(n_train, n_features) + self.X_test = rng.rand(n_test, n_features) + self.y_train = rng.randint(low=-1, high=1, size=(n_train,)) + self.metric = metric + self.strategy = strategy + + self.k = 10 + + # Motive: radius has to be scaled with the number of feature + # Defining it as the 0.001-quantile allows to have in expectation + # a constant amount of neighbors, regardless of the value of n_features. + self.radius = np.quantile( + a=cdist(self.X_train[:10000], self.X_test[:10]).ravel(), q=0.001 + ) + + def time_PairwiseDistancesArgKmin( + self, n_train, n_test, n_features, metric, strategy + ): + + PairwiseDistancesArgKmin.compute( + X=self.X_test, + Y=self.X_train, + k=10, + metric=self.metric, + return_distance=True, + strategy=self.strategy, + ) + + def peakmem_PairwiseDistancesArgKmin( + self, n_train, n_test, n_features, metric, strategy + ): + self.time_PairwiseDistancesArgKmin( + n_train, n_test, n_features, metric, strategy + ) + + def time_PairwiseDistancesRadiusNeighborhood( + self, n_train, n_test, n_features, metric, strategy + ): + + PairwiseDistancesRadiusNeighborhood.compute( + X=self.X_test, + Y=self.X_train, + radius=10 * np.log(n_features), + metric=self.metric, + return_distance=True, + strategy=self.strategy, + ) + + def peakmem_PairwiseDistancesRadiusNeighborhood( + self, n_train, n_test, n_features, metric, strategy + ): + self.time_PairwiseDistancesRadiusNeighborhood( + n_train, n_test, n_features, metric, strategy + ) From 3ce295ce68df39af282b59e07315dd7191770dab Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 5 Aug 2022 21:58:33 +0200 Subject: [PATCH 2/3] Well, Julien --- asv_benchmarks/benchmarks/pairwise_distances_reductions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_benchmarks/benchmarks/pairwise_distances_reductions.py b/asv_benchmarks/benchmarks/pairwise_distances_reductions.py index 55ad93529442d..8a7793c1b4e30 100644 --- a/asv_benchmarks/benchmarks/pairwise_distances_reductions.py +++ b/asv_benchmarks/benchmarks/pairwise_distances_reductions.py @@ -70,7 +70,7 @@ def time_PairwiseDistancesRadiusNeighborhood( PairwiseDistancesRadiusNeighborhood.compute( X=self.X_test, Y=self.X_train, - radius=10 * np.log(n_features), + radius=self.radius, metric=self.metric, return_distance=True, strategy=self.strategy, From 61f64be4b060b159a4691e0a8c00004cfcb6f3a4 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 21 Oct 2022 14:13:07 +0200 Subject: [PATCH 3/3] ASV Update benchmark with new dataset supports Namely support for float32 and CSR matrices. --- .../pairwise_distances_reductions.py | 114 +++++++++++++----- 1 file changed, 87 insertions(+), 27 deletions(-) diff --git a/asv_benchmarks/benchmarks/pairwise_distances_reductions.py b/asv_benchmarks/benchmarks/pairwise_distances_reductions.py index 8a7793c1b4e30..655e8967cdbf8 100644 --- a/asv_benchmarks/benchmarks/pairwise_distances_reductions.py +++ b/asv_benchmarks/benchmarks/pairwise_distances_reductions.py @@ -1,35 +1,69 @@ import numpy as np from scipy.spatial.distance import cdist +from scipy.sparse import rand as sparse_rand from .common import Benchmark -from sklearn.metrics._pairwise_distances_reduction import ( - PairwiseDistancesArgKmin, - PairwiseDistancesRadiusNeighborhood, -) +from sklearn.metrics._pairwise_distances_reduction import ArgKmin, RadiusNeighbors # To run benchmarks defined this file, between for instance your -# and its fork point with upstream/main, run: +# and upstream/main, use: # -# asv continuous -b PairwiseDistancesReductions \ -# -e `git merge-base --fork-point upstream/main ` +# asv continuous -b PairwiseDistancesReductions upstream/main class PairwiseDistancesReductionsBenchmark(Benchmark): - param_names = ["n_train", "n_test", "n_features", "metric", "strategy"] + param_names = [ + "n_train", + "n_test", + "n_features", + "metric", + "strategy", + "dtype", + "X_train", + "X_test", + ] params = [ [1000, 10_000, int(1e7)], [1000, 10_000, 100_000], [100], ["euclidean", "manhattan"], ["auto", "parallel_on_X", "parallel_on_Y"], + [np.float32, np.float64], + ["dense", "csr"], + ["dense", "csr"], ] - def setup(self, n_train, n_test, n_features, metric, strategy): + def setup( + self, n_train, n_test, n_features, metric, strategy, dtype, X_train, X_test + ): rng = np.random.RandomState(0) - self.X_train = rng.rand(n_train, n_features) - self.X_test = rng.rand(n_test, n_features) + self.X_train = ( + rng.rand(n_train, n_features).astype(dtype) + if X_train == "dense" + else sparse_rand( + n_train, + n_features, + density=0.05, + format="csr", + dtype=dtype, + random_state=rng, + ) + ) + self.X_test = ( + rng.rand(n_test, n_features).astype(dtype) + if X_test == "dense" + else sparse_rand( + n_test, + n_features, + density=0.05, + format="csr", + dtype=dtype, + random_state=rng, + ) + ) + self.y_train = rng.randint(low=-1, high=1, size=(n_train,)) self.metric = metric self.strategy = strategy @@ -39,15 +73,26 @@ def setup(self, n_train, n_test, n_features, metric, strategy): # Motive: radius has to be scaled with the number of feature # Defining it as the 0.001-quantile allows to have in expectation # a constant amount of neighbors, regardless of the value of n_features. - self.radius = np.quantile( - a=cdist(self.X_train[:10000], self.X_test[:10]).ravel(), q=0.001 + dist_mat = cdist( + (self.X_train if X_train == "dense" else self.X_train.toarray())[:1000], + (self.X_test if X_test == "dense" else self.X_test.toarray())[:10], ) - def time_PairwiseDistancesArgKmin( - self, n_train, n_test, n_features, metric, strategy + self.radius = np.quantile(a=dist_mat.ravel(), q=0.001) + + def time_ArgKmin( + self, + n_train, + n_test, + n_features, + metric, + strategy, + dtype, + X_train, + X_test, ): - PairwiseDistancesArgKmin.compute( + ArgKmin.compute( X=self.X_test, Y=self.X_train, k=10, @@ -56,18 +101,33 @@ def time_PairwiseDistancesArgKmin( strategy=self.strategy, ) - def peakmem_PairwiseDistancesArgKmin( - self, n_train, n_test, n_features, metric, strategy + def peakmem_ArgKmin( + self, + n_train, + n_test, + n_features, + metric, + strategy, + dtype, + X_train, + X_test, ): - self.time_PairwiseDistancesArgKmin( - n_train, n_test, n_features, metric, strategy + self.time_ArgKmin( + n_train, + n_test, + n_features, + metric, + strategy, + dtype, + X_train, + X_test, ) - def time_PairwiseDistancesRadiusNeighborhood( - self, n_train, n_test, n_features, metric, strategy + def time_RadiusNeighbors( + self, n_train, n_test, n_features, metric, strategy, dtype, X_train, X_test ): - PairwiseDistancesRadiusNeighborhood.compute( + RadiusNeighbors.compute( X=self.X_test, Y=self.X_train, radius=self.radius, @@ -76,9 +136,9 @@ def time_PairwiseDistancesRadiusNeighborhood( strategy=self.strategy, ) - def peakmem_PairwiseDistancesRadiusNeighborhood( - self, n_train, n_test, n_features, metric, strategy + def peakmem_RadiusNeighbors( + self, n_train, n_test, n_features, metric, strategy, dtype, X_train, X_test ): - self.time_PairwiseDistancesRadiusNeighborhood( - n_train, n_test, n_features, metric, strategy + self.time_RadiusNeighbors( + n_train, n_test, n_features, metric, strategy, dtype, X_train, X_test )