jjerphan · March 14, 2022 11:27
diff --git a/README.md b/README.md
diff --git a/hardware-scalability.py b/hardware-scalability.py
 import numpy as np
 import pandas as pd
 import seaborn as sns
 import threadpoolctl
 import subprocess

 from io import StringIO
 from time import perf_counter
 from matplotlib import pyplot as plt
 from scipy.spatial.distance import cdist
 from sklearn.neighbors import NearestNeighbors

 from joblib import Memory
 # joblib `Memory` store located at ".tmp"; its `cache` decorator wraps
 # `execute_bench` so that benchmark results can be reused between runs.
 memory = Memory(".tmp", verbose=0)

 # Short hash of the checked-out commit; it is embedded in the plot title so
 # that a saved figure can be traced back to the code that produced it.
 try:
     commit = (
         subprocess
         .check_output(
             ['git', 'rev-parse', '--short', 'HEAD'],
             stderr=subprocess.DEVNULL,
         )
         .decode('ascii')
         .strip()
     )
 except (OSError, subprocess.CalledProcessError):
     # git is not installed, or the script is not run from inside a
     # repository: keep the benchmark usable and label the figure accordingly.
     commit = "unknown"

 @memory.cache
 def execute_bench(
     n_train=100,
     n_test=100,
     n_features_list=None,
     n_threads_list=None,
     n_runs=1,
 ):
     """Time brute-force ``radius_neighbors`` queries for several thread counts.

     For every number of features, random train/test matrices are drawn from
     a generator seeded with 0, a radius is derived from the data, and
     ``NearestNeighbors.radius_neighbors`` is timed for every number of
     threads. The call is wrapped by ``memory.cache``.

     Parameters
     ----------
     n_train : int, default=100
         Number of rows of the training matrix.
     n_test : int, default=100
         Number of rows of the query matrix.
     n_features_list : list of int, default=None
         Numbers of features to benchmark; ``[50, 100, 500]`` when falsy.
     n_threads_list : list of int, default=None
         Numbers of threads to benchmark; powers of two from 1 to 128 when
         falsy.
     n_runs : int, default=1
         Number of timed repetitions of the query per configuration.

     Returns
     -------
     pandas.DataFrame
         One row per ``(n_features, n_threads)`` pair with the columns
         ``n_threads``, ``n_train``, ``n_test``, ``n_features``,
         ``mean_runtime`` and ``stderr_runtime`` (runtimes in seconds;
         ``stderr_runtime`` is 0 when ``n_runs == 1``).

     Raises
     ------
     ValueError
         If ``n_runs`` is smaller than 1.
     """
     if n_runs < 1:
         raise ValueError(f"n_runs must be at least 1, got {n_runs}")

     rng = np.random.RandomState(0)

     n_features_list = n_features_list or [50, 100, 500]
     n_threads_list = n_threads_list or [1, 2, 4, 8, 16, 32, 64, 128]

     rows = []

     controller = threadpoolctl.ThreadpoolController()

     for n_features in n_features_list:
         X_train = rng.rand(n_train, n_features)
         X_test = rng.rand(n_test, n_features)

         # Adaptation of ogrisel's heuristic to pick the radius: the 5%
         # quantile of the distances between (at most) the first 10000
         # training rows and the first 10 query rows.
         radius = np.quantile(a=cdist(X_train[:10000], X_test[:10]).ravel(), q=0.05)
         print(f"Radius for {n_features} features: {radius}")

         for n_threads in n_threads_list:
             # Both the native thread pools and joblib's `n_jobs` are set to
             # the same number of threads.
             with controller.limit(limits=n_threads, user_api=None):
                 nn = NearestNeighbors(radius=radius, algorithm='brute', n_jobs=n_threads)
                 nn.fit(X_train)  # fitting is deliberately left out of the timing

                 runtimes = []
                 for _ in range(n_runs):
                     start = perf_counter()
                     nn.radius_neighbors(X_test, return_distance=True)
                     runtimes.append(perf_counter() - start)

             mean_runtime = float(np.mean(runtimes))
             if n_runs > 1:
                 # Standard error of the mean, from the sample standard deviation.
                 stderr_runtime = float(np.std(runtimes, ddof=1) / np.sqrt(n_runs))
             else:
                 # A single measurement carries no spread information.
                 stderr_runtime = 0

             rows.append(
                 [n_threads, n_train, n_test, n_features, mean_runtime, stderr_runtime]
             )

     columns = [
         "n_threads",
         "n_train",
         "n_test",
         "n_features",
         "mean_runtime",
         "stderr_runtime",
     ]
     return pd.DataFrame(rows, columns=columns)


 def plot_results(df, save=False, n_train=None, n_test=None):
     """Plot the speed-up over the single-thread run against the thread count.

     One log-log curve is drawn per distinct value of ``n_features`` found in
     ``df``, next to a dashed ``y = x`` reference labelled "linear". The
     title embeds the module-level ``commit`` string.

     Parameters
     ----------
     df : pandas.DataFrame
         Benchmark results; the columns ``n_features``, ``n_threads`` and
         ``mean_runtime`` are read.
     save : bool, default=False
         When true the figure is written to
         ``speed_up_{n_train}_{n_test}_log.png``; otherwise it is shown.
     n_train, n_test : int, default=None
         Only used in the title and in the name of the saved file.

     Raises
     ------
     ValueError
         If some value of ``n_features`` has no row with ``n_threads == 1``
         to serve as the baseline of the speed-up.
     """
     # Colours of the default feature counts; any other feature count falls
     # back to matplotlib's colour cycle.
     known_colors = {50: "yellow", 100: "blue", 500: "red"}

     fig, ax = plt.subplots(figsize=(30, 15))

     # Each thread count is repeated once per feature count in `df`:
     # deduplicate before using the values as ticks.
     thread_counts = sorted(df["n_threads"].unique().tolist())

     # Ideal scaling: n threads give a speed-up of n.
     ax.loglog(
         thread_counts,
         thread_counts,
         linestyle="--",
         color="black",
         label="linear",
         alpha=.5,
     )

     for n_features in df["n_features"].unique():
         df_p = df[df["n_features"] == n_features]

         baseline = df_p.loc[df_p["n_threads"] == 1, "mean_runtime"]
         if baseline.empty:
             raise ValueError(
                 f"No single-thread measurement for n_features={n_features}; "
                 "the speed-up cannot be computed."
             )
         # `.iloc[0]` rather than `float(Series)`, which pandas deprecates.
         speed_up = float(baseline.iloc[0]) / df_p["mean_runtime"]

         curve_style = {"label": f"{n_features} features", "alpha": .5}
         if n_features in known_colors:
             curve_style["color"] = known_colors[n_features]
         ax.loglog(df_p["n_threads"], speed_up, **curve_style)

     ax.set(
         xlabel="Number of threads",
         ylabel="Speed-up",
         xticks=thread_counts,
         xticklabels=[str(i) for i in thread_counts],
         yticks=thread_counts,
         yticklabels=[f"×{i}" for i in thread_counts],
     )
     ax.legend()
     title = (
         f"Scalability of sklearn.NearestNeighbors.radius_neighbors "
         f" - (method, n_train, n_test) = ('brute', {n_train}, {n_test})"
         f" - commit: {commit}"
     )
     ax.set_title(title)
     if save:
         fig.savefig(f'speed_up_{n_train}_{n_test}_log.png')
         # Release the figure once it is on disk.
         plt.close(fig)
     else:
         plt.show()


 if __name__ == "__main__":
     # Script entry point: run the benchmark with its default feature and
     # thread grids on 100000 x 100000 samples, print the timings and save
     # the speed-up figure.
     n_train = 100_000
     n_test = 100_000
     df = execute_bench(n_train, n_test)

     print(df)
     plot_results(df, save=True, n_train=n_train, n_test=n_test)
	import numpy as np
	import pandas as pd
	import seaborn as sns
	import threadpoolctl
	import subprocess

	from io import StringIO
	from time import perf_counter
	from matplotlib import pyplot as plt
	from scipy.spatial.distance import cdist
	from sklearn.neighbors import NearestNeighbors

	from joblib import Memory
	# joblib `Memory` store located at ".tmp"; its `cache` decorator wraps
	# `execute_bench` so that benchmark results can be reused between runs.
	memory = Memory(".tmp", verbose=0)

	# Short hash of the checked-out commit; it is embedded in the plot title so
	# that a saved figure can be traced back to the code that produced it.
	try:
	    commit = (
	        subprocess
	        .check_output(
	            ['git', 'rev-parse', '--short', 'HEAD'],
	            stderr=subprocess.DEVNULL,
	        )
	        .decode('ascii')
	        .strip()
	    )
	except (OSError, subprocess.CalledProcessError):
	    # git is not installed, or the script is not run from inside a
	    # repository: keep the benchmark usable and label the figure accordingly.
	    commit = "unknown"

	@memory.cache
	def execute_bench(
	    n_train=100,
	    n_test=100,
	    n_features_list=None,
	    n_threads_list=None,
	    n_runs=1,
	):
	    """Time brute-force ``radius_neighbors`` queries for several thread counts.

	    For every number of features, random train/test matrices are drawn from
	    a generator seeded with 0, a radius is derived from the data, and
	    ``NearestNeighbors.radius_neighbors`` is timed for every number of
	    threads. The call is wrapped by ``memory.cache``.

	    Parameters
	    ----------
	    n_train : int, default=100
	        Number of rows of the training matrix.
	    n_test : int, default=100
	        Number of rows of the query matrix.
	    n_features_list : list of int, default=None
	        Numbers of features to benchmark; ``[50, 100, 500]`` when falsy.
	    n_threads_list : list of int, default=None
	        Numbers of threads to benchmark; powers of two from 1 to 128 when
	        falsy.
	    n_runs : int, default=1
	        Number of timed repetitions of the query per configuration.

	    Returns
	    -------
	    pandas.DataFrame
	        One row per ``(n_features, n_threads)`` pair with the columns
	        ``n_threads``, ``n_train``, ``n_test``, ``n_features``,
	        ``mean_runtime`` and ``stderr_runtime`` (runtimes in seconds;
	        ``stderr_runtime`` is 0 when ``n_runs == 1``).

	    Raises
	    ------
	    ValueError
	        If ``n_runs`` is smaller than 1.
	    """
	    if n_runs < 1:
	        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

	    rng = np.random.RandomState(0)

	    n_features_list = n_features_list or [50, 100, 500]
	    n_threads_list = n_threads_list or [1, 2, 4, 8, 16, 32, 64, 128]

	    rows = []

	    controller = threadpoolctl.ThreadpoolController()

	    for n_features in n_features_list:
	        X_train = rng.rand(n_train, n_features)
	        X_test = rng.rand(n_test, n_features)

	        # Adaptation of ogrisel's heuristic to pick the radius: the 5%
	        # quantile of the distances between (at most) the first 10000
	        # training rows and the first 10 query rows.
	        radius = np.quantile(a=cdist(X_train[:10000], X_test[:10]).ravel(), q=0.05)
	        print(f"Radius for {n_features} features: {radius}")

	        for n_threads in n_threads_list:
	            # Both the native thread pools and joblib's `n_jobs` are set to
	            # the same number of threads.
	            with controller.limit(limits=n_threads, user_api=None):
	                nn = NearestNeighbors(radius=radius, algorithm='brute', n_jobs=n_threads)
	                nn.fit(X_train)  # fitting is deliberately left out of the timing

	                runtimes = []
	                for _ in range(n_runs):
	                    start = perf_counter()
	                    nn.radius_neighbors(X_test, return_distance=True)
	                    runtimes.append(perf_counter() - start)

	            mean_runtime = float(np.mean(runtimes))
	            if n_runs > 1:
	                # Standard error of the mean, from the sample standard deviation.
	                stderr_runtime = float(np.std(runtimes, ddof=1) / np.sqrt(n_runs))
	            else:
	                # A single measurement carries no spread information.
	                stderr_runtime = 0

	            rows.append(
	                [n_threads, n_train, n_test, n_features, mean_runtime, stderr_runtime]
	            )

	    columns = [
	        "n_threads",
	        "n_train",
	        "n_test",
	        "n_features",
	        "mean_runtime",
	        "stderr_runtime",
	    ]
	    return pd.DataFrame(rows, columns=columns)


	def plot_results(df, save=False, n_train=None, n_test=None):
	    """Plot the speed-up over the single-thread run against the thread count.

	    One log-log curve is drawn per distinct value of ``n_features`` found in
	    ``df``, next to a dashed ``y = x`` reference labelled "linear". The
	    title embeds the module-level ``commit`` string.

	    Parameters
	    ----------
	    df : pandas.DataFrame
	        Benchmark results; the columns ``n_features``, ``n_threads`` and
	        ``mean_runtime`` are read.
	    save : bool, default=False
	        When true the figure is written to
	        ``speed_up_{n_train}_{n_test}_log.png``; otherwise it is shown.
	    n_train, n_test : int, default=None
	        Only used in the title and in the name of the saved file.

	    Raises
	    ------
	    ValueError
	        If some value of ``n_features`` has no row with ``n_threads == 1``
	        to serve as the baseline of the speed-up.
	    """
	    # Colours of the default feature counts; any other feature count falls
	    # back to matplotlib's colour cycle.
	    known_colors = {50: "yellow", 100: "blue", 500: "red"}

	    fig, ax = plt.subplots(figsize=(30, 15))

	    # Each thread count is repeated once per feature count in `df`:
	    # deduplicate before using the values as ticks.
	    thread_counts = sorted(df["n_threads"].unique().tolist())

	    # Ideal scaling: n threads give a speed-up of n.
	    ax.loglog(
	        thread_counts,
	        thread_counts,
	        linestyle="--",
	        color="black",
	        label="linear",
	        alpha=.5,
	    )

	    for n_features in df["n_features"].unique():
	        df_p = df[df["n_features"] == n_features]

	        baseline = df_p.loc[df_p["n_threads"] == 1, "mean_runtime"]
	        if baseline.empty:
	            raise ValueError(
	                f"No single-thread measurement for n_features={n_features}; "
	                "the speed-up cannot be computed."
	            )
	        # `.iloc[0]` rather than `float(Series)`, which pandas deprecates.
	        speed_up = float(baseline.iloc[0]) / df_p["mean_runtime"]

	        curve_style = {"label": f"{n_features} features", "alpha": .5}
	        if n_features in known_colors:
	            curve_style["color"] = known_colors[n_features]
	        ax.loglog(df_p["n_threads"], speed_up, **curve_style)

	    ax.set(
	        xlabel="Number of threads",
	        ylabel="Speed-up",
	        xticks=thread_counts,
	        xticklabels=[str(i) for i in thread_counts],
	        yticks=thread_counts,
	        yticklabels=[f"×{i}" for i in thread_counts],
	    )
	    ax.legend()
	    title = (
	        f"Scalability of sklearn.NearestNeighbors.radius_neighbors "
	        f" - (method, n_train, n_test) = ('brute', {n_train}, {n_test})"
	        f" - commit: {commit}"
	    )
	    ax.set_title(title)
	    if save:
	        fig.savefig(f'speed_up_{n_train}_{n_test}_log.png')
	        # Release the figure once it is on disk.
	        plt.close(fig)
	    else:
	        plt.show()


	if __name__ == "__main__":
	    # Script entry point: run the benchmark with its default feature and
	    # thread grids on 100000 x 100000 samples, print the timings and save
	    # the speed-up figure.
	    n_train = 100_000
	    n_test = 100_000
	    df = execute_bench(n_train, n_test)

	    print(df)
	    plot_results(df, save=True, n_train=n_train, n_test=n_test)