From 827044c60478988f39c08d7fdbfcd56301060147 Mon Sep 17 00:00:00 2001
From: Kshitij Mathur
Date: Mon, 7 Nov 2022 11:26:49 +0530
Subject: [PATCH 1/2] optimize calculating start values

---
 sklearn/metrics/cluster/_expected_mutual_info_fast.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx b/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx
index e9452659a9a94..135170e184c9e 100644
--- a/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx
+++ b/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx
@@ -50,7 +50,8 @@ def expected_mutual_information(contingency, int n_samples):
     gln_N = gammaln(N + 1)
     gln_nij = gammaln(nijs + 1)
     # start and end values for nij terms for each summation.
-    start = np.array([[v - N + w for w in b] for v in a], dtype='int')
+    start = np.array(np.meshgrid(a, b), dtype='int').T
+    start = np.array([start[i].sum(axis=1) - N for i in range(len(start))])
     start = np.maximum(start, 1)
     end = np.minimum(np.resize(a, (C, R)).T, np.resize(b, (R, C))) + 1
     # emi itself is a summation over the various values.

From 39dd0bf7ccffb00822c9b46b488be630e82f008b Mon Sep 17 00:00:00 2001
From: Kshitij Mathur
Date: Mon, 7 Nov 2022 11:26:59 +0530
Subject: [PATCH 2/2] Add benchmarks

---
 .../bench_adjusted_mutual_info_score.py       | 53 +++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 benchmarks/bench_adjusted_mutual_info_score.py

diff --git a/benchmarks/bench_adjusted_mutual_info_score.py b/benchmarks/bench_adjusted_mutual_info_score.py
new file mode 100644
index 0000000000000..2cfd3859d4fb1
--- /dev/null
+++ b/benchmarks/bench_adjusted_mutual_info_score.py
@@ -0,0 +1,53 @@
+from collections import defaultdict
+from itertools import product
+from time import time
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+from sklearn.metrics import adjusted_mutual_info_score
+
+repeat = 10
+n_samples = [1_000, 10_000, 100_000, 1_000_000]
+n_labels = [10, 100, 1_000]
+
+rng = np.random.default_rng(0)
+
+result = defaultdict(list)
+for ns, nl in product(n_samples, n_labels):
+    local_result = []
+    for i in range(repeat):
+        print(f"Repetition {i+1} for n_samples={ns} and n_labels={nl}")
+        x = rng.integers(low=0, high=nl, size=ns)
+        y = rng.integers(low=0, high=nl, size=ns)
+
+        start = time()
+        adjusted_mutual_info_score(x, y)
+        end = time()
+        local_result.append(end - start)
+
+    result["n_samples"].append(ns)
+    result["n_labels"].append(nl)
+    result["mean_time"].append(np.mean(local_result))
+
+result = pd.DataFrame(result)
+plt.figure("Adjusted Mutual Info Score Benchmarks against number of Labels")
+for n_sample in n_samples:
+    samples = result[result["n_samples"] == n_sample]
+    plt.plot(
+        samples["n_labels"], samples["mean_time"], label=f"{str(n_sample)} samples"
+    )
+plt.xlabel("n_labels")
+plt.ylabel("Time (s)")
+plt.legend()
+
+plt.figure("Adjusted Mutual Info Score Benchmarks against number of Samples")
+for n_label in n_labels:
+    labels = result[result["n_labels"] == n_label]
+    plt.plot(labels["n_samples"], labels["mean_time"], label=f"{str(n_label)} labels")
+plt.xlabel("n_samples")
+plt.ylabel("Time (s)")
+plt.legend()
+
+plt.show()
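
Note on PATCH 1/2: the two new lines compute the same (R, C) `start` matrix as the replaced nested list comprehension, i.e. start[i, j] = a[i] + b[j] - N, but build the grid with NumPy broadcasting machinery instead of a pure-Python double loop. The standalone sketch below (not part of the patch; `a`, `b`, and `N` are made-up stand-ins for the contingency-table marginals and sample count used inside expected_mutual_information) checks that the two formulations agree:

# Equivalence check for the PATCH 1/2 change (illustrative values only).
import numpy as np

rng = np.random.default_rng(0)
a = rng.integers(1, 50, size=7)   # stand-in for row marginals
b = rng.integers(1, 50, size=5)   # stand-in for column marginals
N = int(a.sum())                  # stand-in for the total sample count

# Original implementation: pure-Python nested loop over all (v, w) pairs.
start_loop = np.array([[v - N + w for w in b] for v in a], dtype='int')

# Patched implementation: meshgrid gives the a/b grids, the transpose puts
# the pair axis last, and the row-wise sum yields a[i] + b[j] before
# subtracting N.
start_vec = np.array(np.meshgrid(a, b), dtype='int').T
start_vec = np.array([start_vec[i].sum(axis=1) - N for i in range(len(start_vec))])

assert np.array_equal(start_loop, start_vec)
print(start_vec.shape)  # (len(a), len(b))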