From 7b63241b508f249498e28ca31ebec6ac4ab96c04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
 <jeremie@probabl.ai>
Date: Mon, 15 Apr 2024 13:19:57 +0200
Subject: [PATCH 01/12] bump threadpoolctl min version -> 3.1.0

---
 README.rst                                    |   2 +-
 build_tools/azure/debian_atlas_32bit_lock.txt |   2 +-
 .../azure/debian_atlas_32bit_requirements.txt |   2 +-
 ...latest_conda_mkl_no_openmp_environment.yml |   1 +
 ...test_conda_mkl_no_openmp_osx-64_conda.lock |   2 +-
 ...in_conda_defaults_openblas_environment.yml |   3 +-
 ...onda_defaults_openblas_linux-64_conda.lock |   2 +-
 build_tools/azure/ubuntu_atlas_lock.txt       |   2 +-
 .../azure/ubuntu_atlas_requirements.txt       |   2 +-
 build_tools/circle/doc_linux-64_conda.lock    |   2 +-
 .../update_environments_and_lock_files.py     |   7 +-
 pyproject.toml                                |   4 +-
 sklearn/__init__.py                           |   6 +
 sklearn/_min_dependencies.py                  |   2 +-
 sklearn/cluster/_bisect_k_means.py            |   4 +-
 sklearn/cluster/_k_means_lloyd.pyx            |   4 +
 sklearn/cluster/_kmeans.py                    | 201 ++++++++----------
 sklearn/cluster/tests/test_k_means.py         |   7 +-
 .../_argkmin.pyx.tp                           |   4 +-
 .../_argkmin_classmode.pyx.tp                 |   4 +-
 .../_radius_neighbors.pyx.tp                  |   4 +-
 .../_radius_neighbors_classmode.pyx.tp        |   4 +-
 .../test_pairwise_distances_reduction.py      |   4 +-
 sklearn/utils/_show_versions.py               |   3 +-
 sklearn/utils/fixes.py                        |  39 +---
 sklearn/utils/tests/test_show_versions.py     |   3 +-
 26 files changed, 140 insertions(+), 180 deletions(-)

diff --git a/README.rst b/README.rst
index 3f9a4ad726806..4ac297063c26e 100644
--- a/README.rst
+++ b/README.rst
@@ -36,7 +36,7 @@
 .. |NumPyMinVersion| replace:: 1.19.5
 .. |SciPyMinVersion| replace:: 1.6.0
 .. |JoblibMinVersion| replace:: 1.2.0
-.. |ThreadpoolctlMinVersion| replace:: 2.0.0
+.. |ThreadpoolctlMinVersion| replace:: 3.1.0
 .. |MatplotlibMinVersion| replace:: 3.3.4
 .. |Scikit-ImageMinVersion| replace:: 0.17.2
 .. |PandasMinVersion| replace:: 1.1.5
diff --git a/build_tools/azure/debian_atlas_32bit_lock.txt b/build_tools/azure/debian_atlas_32bit_lock.txt
index 40e0ff4e25cb8..eba989d478f8a 100644
--- a/build_tools/azure/debian_atlas_32bit_lock.txt
+++ b/build_tools/azure/debian_atlas_32bit_lock.txt
@@ -36,7 +36,7 @@ pytest==7.1.2
     #   pytest-cov
 pytest-cov==2.9.0
     # via -r build_tools/azure/debian_atlas_32bit_requirements.txt
-threadpoolctl==2.2.0
+threadpoolctl==3.1.0
     # via -r build_tools/azure/debian_atlas_32bit_requirements.txt
 tomli==2.0.1
     # via
diff --git a/build_tools/azure/debian_atlas_32bit_requirements.txt b/build_tools/azure/debian_atlas_32bit_requirements.txt
index d1bc22529d4f4..615193a71fc6b 100644
--- a/build_tools/azure/debian_atlas_32bit_requirements.txt
+++ b/build_tools/azure/debian_atlas_32bit_requirements.txt
@@ -3,7 +3,7 @@
 # build_tools/update_environments_and_lock_files.py
 cython==3.0.10  # min
 joblib==1.2.0  # min
-threadpoolctl==2.2.0
+threadpoolctl==3.1.0
 pytest==7.1.2  # min
 pytest-cov==2.9.0  # min
 ninja
diff --git a/build_tools/azure/pylatest_conda_mkl_no_openmp_environment.yml b/build_tools/azure/pylatest_conda_mkl_no_openmp_environment.yml
index 9c46400c2d3c6..c2ce48d6a19a1 100644
--- a/build_tools/azure/pylatest_conda_mkl_no_openmp_environment.yml
+++ b/build_tools/azure/pylatest_conda_mkl_no_openmp_environment.yml
@@ -25,3 +25,4 @@ dependencies:
   - pip
   - pip:
     - cython
+    - threadpoolctl
diff --git a/build_tools/azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock b/build_tools/azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock
index aa946a23c4650..fabd22be28e29 100644
--- a/build_tools/azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock
+++ b/build_tools/azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock
@@ -1,6 +1,6 @@
 # Generated by conda-lock.
 # platform: osx-64
-# input_hash: d3fadab6d5d5d715660beb53286e2687b018b5195ef7ce47928bb79a96ee851b
+# input_hash: 4462fac2150cfe9fe0aedbc88d55a5f0cc9bb5d786aa4f0553fc92fcbf35d503
 @EXPLICIT
 https://repo.anaconda.com/pkgs/main/osx-64/blas-1.0-mkl.conda#cb2c87e85ac8e0ceae776d26d4214c8a
 https://repo.anaconda.com/pkgs/main/osx-64/bzip2-1.0.8-h6c40b1e_5.conda#0f51dde96c82dcf58a788787fed4c5b9
diff --git a/build_tools/azure/pymin_conda_defaults_openblas_environment.yml b/build_tools/azure/pymin_conda_defaults_openblas_environment.yml
index a422a0a539c53..144b432c486b3 100644
--- a/build_tools/azure/pymin_conda_defaults_openblas_environment.yml
+++ b/build_tools/azure/pymin_conda_defaults_openblas_environment.yml
@@ -9,7 +9,7 @@ dependencies:
   - blas[build=openblas]
   - scipy=1.7
   - joblib
-  - threadpoolctl=2.2.0
+  - threadpoolctl
   - matplotlib=3.3.4  # min
   - pyamg
   - pytest<8
@@ -21,3 +21,4 @@ dependencies:
   - pip
   - pip:
     - cython==3.0.10  # min
+    - threadpoolctl
diff --git a/build_tools/azure/pymin_conda_defaults_openblas_linux-64_conda.lock b/build_tools/azure/pymin_conda_defaults_openblas_linux-64_conda.lock
index 4e64af1960718..e6e873c144b09 100644
--- a/build_tools/azure/pymin_conda_defaults_openblas_linux-64_conda.lock
+++ b/build_tools/azure/pymin_conda_defaults_openblas_linux-64_conda.lock
@@ -1,6 +1,6 @@
 # Generated by conda-lock.
 # platform: linux-64
-# input_hash: 3d6bcb087065974114c1567c8dccd947a7376a7503b3514d82904299b651692d
+# input_hash: 60862fc361d93a51f183ea9ef3bb29dd428c1359bae96e3f4d37ad92e4ae67c1
 @EXPLICIT
 https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda#c3473ff8bdb3d124ed5ff11ec380d6f9
 https://repo.anaconda.com/pkgs/main/linux-64/blas-1.0-openblas.conda#9ddfcaef10d79366c90128f5dc444be8
diff --git a/build_tools/azure/ubuntu_atlas_lock.txt b/build_tools/azure/ubuntu_atlas_lock.txt
index aa17f49e75936..3bf97905a883b 100644
--- a/build_tools/azure/ubuntu_atlas_lock.txt
+++ b/build_tools/azure/ubuntu_atlas_lock.txt
@@ -34,7 +34,7 @@ pytest==7.4.4
     #   pytest-xdist
 pytest-xdist==3.5.0
     # via -r build_tools/azure/ubuntu_atlas_requirements.txt
-threadpoolctl==2.0.0
+threadpoolctl==3.1.0
     # via -r build_tools/azure/ubuntu_atlas_requirements.txt
 tomli==2.0.1
     # via
diff --git a/build_tools/azure/ubuntu_atlas_requirements.txt b/build_tools/azure/ubuntu_atlas_requirements.txt
index aab362dda0bf2..805d84d4d0aac 100644
--- a/build_tools/azure/ubuntu_atlas_requirements.txt
+++ b/build_tools/azure/ubuntu_atlas_requirements.txt
@@ -3,7 +3,7 @@
 # build_tools/update_environments_and_lock_files.py
 cython==3.0.10  # min
 joblib==1.2.0  # min
-threadpoolctl==2.0.0  # min
+threadpoolctl==3.1.0  # min
 pytest<8
 pytest-xdist
 ninja
diff --git a/build_tools/circle/doc_linux-64_conda.lock b/build_tools/circle/doc_linux-64_conda.lock
index 984cca332fc7f..88782bee45e0d 100644
--- a/build_tools/circle/doc_linux-64_conda.lock
+++ b/build_tools/circle/doc_linux-64_conda.lock
@@ -308,7 +308,7 @@ https://conda.anaconda.org/conda-forge/noarch/sphinxext-opengraph-0.9.1-pyhd8ed1
 # pip jsonschema-specifications @ https://files.pythonhosted.org/packages/ee/07/44bd408781594c4d0a027666ef27fab1e441b109dc3b76b4f836f8fd04fe/jsonschema_specifications-2023.12.1-py3-none-any.whl#sha256=87e4fdf3a94858b8a2ba2778d9ba57d8a9cafca7c7489c46ba0d30a8bc6a9c3c
 # pip jupyter-server-terminals @ https://files.pythonhosted.org/packages/07/2d/2b32cdbe8d2a602f697a649798554e4f072115438e92249624e532e8aca6/jupyter_server_terminals-0.5.3-py3-none-any.whl#sha256=41ee0d7dc0ebf2809c668e0fc726dfaf258fcd3e769568996ca731b6194ae9aa
 # pip jupyterlite-core @ https://files.pythonhosted.org/packages/05/d2/1d59d9a70d684b1eb3eb3a0b80a36b4e1d691e94af5d53aee56b1ad5240b/jupyterlite_core-0.3.0-py3-none-any.whl#sha256=247cc34ae6fedda41b15ce4778997164508b2039bc92480665cadfe955193467
-# pip pyzmq @ https://files.pythonhosted.org/packages/76/8b/6fca99e22c6316917de32b17be299dea431544209d619da16b6d9ec85c83/pyzmq-25.1.2-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl#sha256=c0b5ca88a8928147b7b1e2dfa09f3b6c256bc1135a1338536cbc9ea13d3b7add
+# pip pyzmq @ https://files.pythonhosted.org/packages/c3/ff/99542c2d7596b344793d5431da25ae437eefb99e3c0c91e43f95abd3e9c8/pyzmq-26.0.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl#sha256=a0562054930471b386a44b0887504687c4e7adf4ba89bddc2e5959d16c371764
 # pip argon2-cffi @ https://files.pythonhosted.org/packages/a4/6a/e8a041599e78b6b3752da48000b14c8d1e8a04ded09c88c714ba047f34f5/argon2_cffi-23.1.0-py3-none-any.whl#sha256=c670642b78ba29641818ab2e68bd4e6a78ba53b7eff7b4c3815ae16abf91c7ea
 # pip jsonschema @ https://files.pythonhosted.org/packages/39/9d/b035d024c62c85f2e2d4806a59ca7b8520307f34e0932fbc8cc75fe7b2d9/jsonschema-4.21.1-py3-none-any.whl#sha256=7996507afae316306f9e2290407761157c6f78002dcf7419acb99822143d1c6f
 # pip jupyter-client @ https://files.pythonhosted.org/packages/75/6d/d7b55b9c1ac802ab066b3e5015e90faab1fffbbd67a2af498ffc6cc81c97/jupyter_client-8.6.1-py3-none-any.whl#sha256=3b7bd22f058434e3b9a7ea4b1500ed47de2713872288c0d511d19926f99b459f
diff --git a/build_tools/update_environments_and_lock_files.py b/build_tools/update_environments_and_lock_files.py
index fd77cfd3c0721..62a4d58ea91fd 100644
--- a/build_tools/update_environments_and_lock_files.py
+++ b/build_tools/update_environments_and_lock_files.py
@@ -151,7 +151,7 @@ def remove_from(alist, to_remove):
         },
         # TODO: put cython back to conda dependencies when required version is
         # available on the main channel
-        "pip_dependencies": ["cython"],
+        "pip_dependencies": ["cython", "threadpoolctl"],
     },
     {
         "name": "pymin_conda_defaults_openblas",
@@ -170,12 +170,11 @@ def remove_from(alist, to_remove):
             "numpy": "1.21",  # the min version is not available on the defaults channel
             "scipy": "1.7",  # the min version has some low level crashes
             "matplotlib": "min",
-            "threadpoolctl": "2.2.0",
             "cython": "min",
         },
         # TODO: put cython back to conda dependencies when required version is
         # available on the main channel
-        "pip_dependencies": ["cython"],
+        "pip_dependencies": ["cython", "threadpoolctl"],
     },
     {
         "name": "pymin_conda_forge_openblas_ubuntu_2204",
@@ -381,7 +380,7 @@ def remove_from(alist, to_remove):
         ],
         "package_constraints": {
             "joblib": "min",
-            "threadpoolctl": "2.2.0",
+            "threadpoolctl": "3.1.0",
             "pytest": "min",
             "pytest-cov": "min",
             # no pytest-xdist because it causes issue on 32bit
diff --git a/pyproject.toml b/pyproject.toml
index 828569ecc71ee..69d9702716cb5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ dependencies = [
   "numpy>=1.19.5",
   "scipy>=1.6.0",
   "joblib>=1.2.0",
-  "threadpoolctl>=2.0.0",
+  "threadpoolctl>=3.1.0",
 ]
 requires-python = ">=3.9"
 license = {text = "new BSD"}
@@ -45,7 +45,7 @@ tracker = "https://github.com/scikit-learn/scikit-learn/issues"
 
 [project.optional-dependencies]
 build = ["numpy>=1.19.5", "scipy>=1.6.0", "cython>=3.0.10", "meson-python>=0.15.0"]
-install = ["numpy>=1.19.5", "scipy>=1.6.0", "joblib>=1.2.0", "threadpoolctl>=2.0.0"]
+install = ["numpy>=1.19.5", "scipy>=1.6.0", "joblib>=1.2.0", "threadpoolctl>=3.1.0"]
 benchmark = ["matplotlib>=3.3.4", "pandas>=1.1.5", "memory_profiler>=0.57.0"]
 docs = [
     "matplotlib>=3.3.4",
diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index 30a3bbcdcbf66..b8dd1f49f34a7 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -80,6 +80,8 @@
     # It is necessary to do this prior to importing show_versions as the
     # later is linked to the OpenMP runtime to make it possible to introspect
     # it and importing it first would fail if the OpenMP dll cannot be found.
+    from threadpoolctl import ThreadpoolController
+
     from . import (
         __check_build,  # noqa: F401
         _distributor_init,  # noqa: F401
@@ -141,6 +143,10 @@
     except ModuleNotFoundError:
         pass
 
+    # Set a global controller that can be used to locally limit the number of
+    # threads without looping through all shared libraries every time.
+    _sklearn_threadpool_controller = ThreadpoolController()
+
 
 def setup_module(module):
     """Fixture for the tests to assure globally controllable seeding of RNGs"""
diff --git a/sklearn/_min_dependencies.py b/sklearn/_min_dependencies.py
index a487a048c53c1..00315f31d4c3f 100644
--- a/sklearn/_min_dependencies.py
+++ b/sklearn/_min_dependencies.py
@@ -7,7 +7,7 @@
 NUMPY_MIN_VERSION = "1.19.5"
 SCIPY_MIN_VERSION = "1.6.0"
 JOBLIB_MIN_VERSION = "1.2.0"
-THREADPOOLCTL_MIN_VERSION = "2.0.0"
+THREADPOOLCTL_MIN_VERSION = "3.1.0"
 PYTEST_MIN_VERSION = "7.1.2"
 CYTHON_MIN_VERSION = "3.0.10"
 
diff --git a/sklearn/cluster/_bisect_k_means.py b/sklearn/cluster/_bisect_k_means.py
index 1d4a9e1d84c26..986b1714dd7c4 100644
--- a/sklearn/cluster/_bisect_k_means.py
+++ b/sklearn/cluster/_bisect_k_means.py
@@ -17,7 +17,7 @@
     _BaseKMeans,
     _kmeans_single_elkan,
     _kmeans_single_lloyd,
-    _labels_inertia_threadpool_limit,
+    _labels_inertia,
 )
 
 
@@ -504,7 +504,7 @@ def _predict_recursive(self, X, sample_weight, cluster_node):
         if hasattr(self, "_X_mean"):
             centers += self._X_mean
 
-        cluster_labels = _labels_inertia_threadpool_limit(
+        cluster_labels = _labels_inertia(
             X,
             sample_weight,
             centers,
diff --git a/sklearn/cluster/_k_means_lloyd.pyx b/sklearn/cluster/_k_means_lloyd.pyx
index db7b4e259f434..cb2977a040f4b 100644
--- a/sklearn/cluster/_k_means_lloyd.pyx
+++ b/sklearn/cluster/_k_means_lloyd.pyx
@@ -6,6 +6,7 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.string cimport memset
 from libc.float cimport DBL_MAX, FLT_MAX
 
+from .. import _sklearn_threadpool_controller
 from ..utils._openmp_helpers cimport omp_lock_t
 from ..utils._openmp_helpers cimport omp_init_lock
 from ..utils._openmp_helpers cimport omp_destroy_lock
@@ -20,6 +21,9 @@ from ._k_means_common cimport _relocate_empty_clusters_sparse
 from ._k_means_common cimport _average_centers, _center_shift
 
 
+# Threadpoolctl decorator to limit the number of threads in the second level of
+# nested parallelism (i.e. BLAS) to avoid oversubscription.
+@_sklearn_threadpool_controller.wrap(limits=1, user_api="blas")
 def lloyd_iter_chunked_dense(
         const floating[:, ::1] X,            # IN
         const floating[::1] sample_weight,   # IN
diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py
index 178242e60be57..03478bebfe2b4 100644
--- a/sklearn/cluster/_kmeans.py
+++ b/sklearn/cluster/_kmeans.py
@@ -18,6 +18,7 @@
 import numpy as np
 import scipy.sparse as sp
 
+from .. import _sklearn_threadpool_controller
 from ..base import (
     BaseEstimator,
     ClassNamePrefixFeaturesOutMixin,
@@ -31,7 +32,6 @@
 from ..utils._openmp_helpers import _openmp_effective_n_threads
 from ..utils._param_validation import Interval, StrOptions, validate_params
 from ..utils.extmath import row_norms, stable_cumsum
-from ..utils.fixes import threadpool_info, threadpool_limits
 from ..utils.sparsefuncs import mean_variance_axis
 from ..utils.sparsefuncs_fast import assign_rows_csr
 from ..utils.validation import (
@@ -697,59 +697,56 @@ def _kmeans_single_lloyd(
 
     strict_convergence = False
 
-    # Threadpoolctl context to limit the number of threads in second level of
-    # nested parallelism (i.e. BLAS) to avoid oversubscription.
-    with threadpool_limits(limits=1, user_api="blas"):
-        for i in range(max_iter):
-            lloyd_iter(
-                X,
-                sample_weight,
-                centers,
-                centers_new,
-                weight_in_clusters,
-                labels,
-                center_shift,
-                n_threads,
-            )
+    for i in range(max_iter):
+        lloyd_iter(
+            X,
+            sample_weight,
+            centers,
+            centers_new,
+            weight_in_clusters,
+            labels,
+            center_shift,
+            n_threads,
+        )
 
-            if verbose:
-                inertia = _inertia(X, sample_weight, centers, labels, n_threads)
-                print(f"Iteration {i}, inertia {inertia}.")
+        if verbose:
+            inertia = _inertia(X, sample_weight, centers, labels, n_threads)
+            print(f"Iteration {i}, inertia {inertia}.")
 
-            centers, centers_new = centers_new, centers
+        centers, centers_new = centers_new, centers
 
-            if np.array_equal(labels, labels_old):
-                # First check the labels for strict convergence.
+        if np.array_equal(labels, labels_old):
+            # First check the labels for strict convergence.
+            if verbose:
+                print(f"Converged at iteration {i}: strict convergence.")
+            strict_convergence = True
+            break
+        else:
+            # No strict convergence, check for tol based convergence.
+            center_shift_tot = (center_shift**2).sum()
+            if center_shift_tot <= tol:
                 if verbose:
-                    print(f"Converged at iteration {i}: strict convergence.")
-                strict_convergence = True
+                    print(
+                        f"Converged at iteration {i}: center shift "
+                        f"{center_shift_tot} within tolerance {tol}."
+                    )
                 break
-            else:
-                # No strict convergence, check for tol based convergence.
-                center_shift_tot = (center_shift**2).sum()
-                if center_shift_tot <= tol:
-                    if verbose:
-                        print(
-                            f"Converged at iteration {i}: center shift "
-                            f"{center_shift_tot} within tolerance {tol}."
-                        )
-                    break
-
-            labels_old[:] = labels
-
-        if not strict_convergence:
-            # rerun E-step so that predicted labels match cluster centers
-            lloyd_iter(
-                X,
-                sample_weight,
-                centers,
-                centers,
-                weight_in_clusters,
-                labels,
-                center_shift,
-                n_threads,
-                update_centers=False,
-            )
+
+        labels_old[:] = labels
+
+    if not strict_convergence:
+        # rerun E-step so that predicted labels match cluster centers
+        lloyd_iter(
+            X,
+            sample_weight,
+            centers,
+            centers,
+            weight_in_clusters,
+            labels,
+            center_shift,
+            n_threads,
+            update_centers=False,
+        )
 
     inertia = _inertia(X, sample_weight, centers, labels, n_threads)
 
@@ -826,16 +823,6 @@ def _labels_inertia(X, sample_weight, centers, n_threads=1, return_inertia=True)
     return labels
 
 
-def _labels_inertia_threadpool_limit(
-    X, sample_weight, centers, n_threads=1, return_inertia=True
-):
-    """Same as _labels_inertia but in a threadpool_limits context."""
-    with threadpool_limits(limits=1, user_api="blas"):
-        result = _labels_inertia(X, sample_weight, centers, n_threads, return_inertia)
-
-    return result
-
-
 class _BaseKMeans(
     ClassNamePrefixFeaturesOutMixin, TransformerMixin, ClusterMixin, BaseEstimator, ABC
 ):
@@ -926,7 +913,7 @@ def _check_mkl_vcomp(self, X, n_samples):
 
         n_active_threads = int(np.ceil(n_samples / CHUNK_SIZE))
         if n_active_threads < self._n_threads:
-            modules = threadpool_info()
+            modules = _sklearn_threadpool_controller.info()
             has_vcomp = "vcomp" in [module["prefix"] for module in modules]
             has_mkl = ("mkl", "intel") in [
                 (module["internal_api"], module.get("threading_layer", None))
@@ -1110,7 +1097,7 @@ def predict(self, X, sample_weight="deprecated"):
         else:
             sample_weight = _check_sample_weight(None, X, dtype=X.dtype)
 
-        labels = _labels_inertia_threadpool_limit(
+        labels = _labels_inertia(
             X,
             sample_weight,
             self.cluster_centers_,
@@ -1195,7 +1182,7 @@ def score(self, X, y=None, sample_weight=None):
         X = self._check_test_data(X)
         sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
 
-        _, scores = _labels_inertia_threadpool_limit(
+        _, scores = _labels_inertia(
             X, sample_weight, self.cluster_centers_, self._n_threads
         )
         return -scores
@@ -1645,8 +1632,6 @@ def _mini_batch_step(
         the centers.
     """
     # Perform label assignment to nearest centers
-    # For better efficiency, it's better to run _mini_batch_step in a
-    # threadpool_limit context than using _labels_inertia_threadpool_limit here
     labels, inertia = _labels_inertia(X, sample_weight, centers, n_threads=n_threads)
 
     # Update centers according to the labels
@@ -2135,7 +2120,7 @@ def fit(self, X, y=None, sample_weight=None):
             )
 
             # Compute inertia on a validation set.
-            _, inertia = _labels_inertia_threadpool_limit(
+            _, inertia = _labels_inertia(
                 X_valid,
                 sample_weight_valid,
                 cluster_centers,
@@ -2164,38 +2149,37 @@ def fit(self, X, y=None, sample_weight=None):
 
         n_steps = (self.max_iter * n_samples) // self._batch_size
 
-        with threadpool_limits(limits=1, user_api="blas"):
-            # Perform the iterative optimization until convergence
-            for i in range(n_steps):
-                # Sample a minibatch from the full dataset
-                minibatch_indices = random_state.randint(0, n_samples, self._batch_size)
-
-                # Perform the actual update step on the minibatch data
-                batch_inertia = _mini_batch_step(
-                    X=X[minibatch_indices],
-                    sample_weight=sample_weight[minibatch_indices],
-                    centers=centers,
-                    centers_new=centers_new,
-                    weight_sums=self._counts,
-                    random_state=random_state,
-                    random_reassign=self._random_reassign(),
-                    reassignment_ratio=self.reassignment_ratio,
-                    verbose=self.verbose,
-                    n_threads=self._n_threads,
-                )
+        # Perform the iterative optimization until convergence
+        for i in range(n_steps):
+            # Sample a minibatch from the full dataset
+            minibatch_indices = random_state.randint(0, n_samples, self._batch_size)
+
+            # Perform the actual update step on the minibatch data
+            batch_inertia = _mini_batch_step(
+                X=X[minibatch_indices],
+                sample_weight=sample_weight[minibatch_indices],
+                centers=centers,
+                centers_new=centers_new,
+                weight_sums=self._counts,
+                random_state=random_state,
+                random_reassign=self._random_reassign(),
+                reassignment_ratio=self.reassignment_ratio,
+                verbose=self.verbose,
+                n_threads=self._n_threads,
+            )
 
-                if self._tol > 0.0:
-                    centers_squared_diff = np.sum((centers_new - centers) ** 2)
-                else:
-                    centers_squared_diff = 0
+            if self._tol > 0.0:
+                centers_squared_diff = np.sum((centers_new - centers) ** 2)
+            else:
+                centers_squared_diff = 0
 
-                centers, centers_new = centers_new, centers
+            centers, centers_new = centers_new, centers
 
-                # Monitor convergence and do early stopping if necessary
-                if self._mini_batch_convergence(
-                    i, n_steps, n_samples, centers_squared_diff, batch_inertia
-                ):
-                    break
+            # Monitor convergence and do early stopping if necessary
+            if self._mini_batch_convergence(
+                i, n_steps, n_samples, centers_squared_diff, batch_inertia
+            ):
+                break
 
         self.cluster_centers_ = centers
         self._n_features_out = self.cluster_centers_.shape[0]
@@ -2204,7 +2188,7 @@ def fit(self, X, y=None, sample_weight=None):
         self.n_iter_ = int(np.ceil(((i + 1) * self._batch_size) / n_samples))
 
         if self.compute_labels:
-            self.labels_, self.inertia_ = _labels_inertia_threadpool_limit(
+            self.labels_, self.inertia_ = _labels_inertia(
                 X,
                 sample_weight,
                 self.cluster_centers_,
@@ -2290,22 +2274,21 @@ def partial_fit(self, X, y=None, sample_weight=None):
             # Initialize number of samples seen since last reassignment
             self._n_since_last_reassign = 0
 
-        with threadpool_limits(limits=1, user_api="blas"):
-            _mini_batch_step(
-                X,
-                sample_weight=sample_weight,
-                centers=self.cluster_centers_,
-                centers_new=self.cluster_centers_,
-                weight_sums=self._counts,
-                random_state=self._random_state,
-                random_reassign=self._random_reassign(),
-                reassignment_ratio=self.reassignment_ratio,
-                verbose=self.verbose,
-                n_threads=self._n_threads,
-            )
+        _mini_batch_step(
+            X,
+            sample_weight=sample_weight,
+            centers=self.cluster_centers_,
+            centers_new=self.cluster_centers_,
+            weight_sums=self._counts,
+            random_state=self._random_state,
+            random_reassign=self._random_reassign(),
+            reassignment_ratio=self.reassignment_ratio,
+            verbose=self.verbose,
+            n_threads=self._n_threads,
+        )
 
         if self.compute_labels:
-            self.labels_, self.inertia_ = _labels_inertia_threadpool_limit(
+            self.labels_, self.inertia_ = _labels_inertia(
                 X,
                 sample_weight,
                 self.cluster_centers_,
diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py
index 1f2f8c390c909..b6d9575deded4 100644
--- a/sklearn/cluster/tests/test_k_means.py
+++ b/sklearn/cluster/tests/test_k_means.py
@@ -8,6 +8,7 @@
 import pytest
 from scipy import sparse as sp
 
+from sklearn import _sklearn_threadpool_controller
 from sklearn.base import clone
 from sklearn.cluster import KMeans, MiniBatchKMeans, k_means, kmeans_plusplus
 from sklearn.cluster._k_means_common import (
@@ -31,7 +32,7 @@
     create_memmap_backed_data,
 )
 from sklearn.utils.extmath import row_norms
-from sklearn.utils.fixes import CSR_CONTAINERS, threadpool_limits
+from sklearn.utils.fixes import CSR_CONTAINERS
 
 # non centered, sparse centers to check the
 centers = np.array(
@@ -980,13 +981,13 @@ def test_result_equal_in_diff_n_threads(Estimator, global_random_seed):
     rnd = np.random.RandomState(global_random_seed)
     X = rnd.normal(size=(50, 10))
 
-    with threadpool_limits(limits=1, user_api="openmp"):
+    with _sklearn_threadpool_controller.limit(limits=1, user_api="openmp"):
         result_1 = (
             Estimator(n_clusters=n_clusters, random_state=global_random_seed)
             .fit(X)
             .labels_
         )
-    with threadpool_limits(limits=2, user_api="openmp"):
+    with _sklearn_threadpool_controller.limit(limits=2, user_api="openmp"):
         result_2 = (
             Estimator(n_clusters=n_clusters, random_state=global_random_seed)
             .fit(X)
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index a686153c3ac9c..0bc90498da893 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -14,7 +14,7 @@ from numbers import Integral
 from scipy.sparse import issparse
 from ...utils import check_array, check_scalar
 from ...utils.fixes import _in_unstable_openblas_configuration
-from ...utils.fixes import threadpool_limits
+from ... import _sklearn_threadpool_controller
 
 {{for name_suffix in ['64', '32']}}
 
@@ -58,7 +58,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
         """
         # Limit the number of threads in second level of nested parallelism for BLAS
         # to avoid threads over-subscription (in DOT or GEMM for instance).
-        with threadpool_limits(limits=1, user_api='blas'):
+        with _sklearn_threadpool_controller.limit(limits=1, user_api='blas'):
           if metric in ("euclidean", "sqeuclidean"):
               # Specialized implementation of ArgKmin for the Euclidean distance
               # for the dense-dense and sparse-sparse cases.
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp
index f9719f6959dfc..4d0c2a8511ea2 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp
@@ -4,10 +4,10 @@ from libcpp.map cimport map as cpp_map, pair as cpp_pair
 from libc.stdlib cimport free
 
 from ...utils._typedefs cimport intp_t, float64_t
+from ... import _sklearn_threadpool_controller
 
 import numpy as np
 from scipy.sparse import issparse
-from sklearn.utils.fixes import threadpool_limits
 from ._classmode cimport WeightingStrategy
 
 {{for name_suffix in ["32", "64"]}}
@@ -66,7 +66,7 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}):
 
         # Limit the number of threads in second level of nested parallelism for BLAS
         # to avoid threads over-subscription (in GEMM for instance).
-        with threadpool_limits(limits=1, user_api="blas"):
+        with _sklearn_threadpool_controller.limit(limits=1, user_api="blas"):
             if pda.execute_in_parallel_on_Y:
                 pda._parallel_on_Y()
             else:
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
index dcc97b4d32fd4..b5d21aef6cfc2 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
@@ -17,7 +17,7 @@ from numbers import Real
 from scipy.sparse import issparse
 from ...utils import check_array, check_scalar
 from ...utils.fixes import _in_unstable_openblas_configuration
-from ...utils.fixes import threadpool_limits
+from ... import _sklearn_threadpool_controller
 
 cnp.import_array()
 
@@ -110,7 +110,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
 
         # Limit the number of threads in second level of nested parallelism for BLAS
         # to avoid threads over-subscription (in GEMM for instance).
-        with threadpool_limits(limits=1, user_api="blas"):
+        with _sklearn_threadpool_controller.limit(limits=1, user_api="blas"):
             if pda.execute_in_parallel_on_Y:
                 pda._parallel_on_Y()
             else:
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp
index 25067b43cd20c..6db1cd9853a13 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp
@@ -8,7 +8,7 @@ from ...utils._typedefs cimport intp_t, float64_t
 
 import numpy as np
 from scipy.sparse import issparse
-from ...utils.fixes import threadpool_limits
+from ... import _sklearn_threadpool_controller
 
 
 {{for name_suffix in ["32", "64"]}}
@@ -60,7 +60,7 @@ cdef class RadiusNeighborsClassMode{{name_suffix}}(RadiusNeighbors{{name_suffix}
 
         # Limit the number of threads in second level of nested parallelism for BLAS
         # to avoid threads over-subscription (in GEMM for instance).
-        with threadpool_limits(limits=1, user_api="blas"):
+        with _sklearn_threadpool_controller.limit(limits=1, user_api="blas"):
             if pda.execute_in_parallel_on_Y:
                 pda._parallel_on_Y()
             else:
diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index e5983f9273d94..fcf78846ce86d 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -5,9 +5,9 @@
 
 import numpy as np
 import pytest
-import threadpoolctl
 from scipy.spatial.distance import cdist
 
+from sklearn import _sklearn_threadpool_controller
 from sklearn.metrics import euclidean_distances, pairwise_distances
 from sklearn.metrics._pairwise_distances_reduction import (
     ArgKmin,
@@ -1200,7 +1200,7 @@ def test_n_threads_agnosticism(
         **compute_parameters,
     )
 
-    with threadpoolctl.threadpool_limits(limits=1, user_api="openmp"):
+    with _sklearn_threadpool_controller.limit(limits=1, user_api="openmp"):
         dist, indices = Dispatcher.compute(
             X,
             Y,
diff --git a/sklearn/utils/_show_versions.py b/sklearn/utils/_show_versions.py
index 1431108477263..cc17b71b23799 100644
--- a/sklearn/utils/_show_versions.py
+++ b/sklearn/utils/_show_versions.py
@@ -9,8 +9,9 @@
 import platform
 import sys
 
+from threadpoolctl import threadpool_info
+
 from .. import __version__
-from ..utils.fixes import threadpool_info
 from ._openmp_helpers import _openmp_parallelism_enabled
 
 
diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
index 1b34a3fe1ffbc..fc78ca0829885 100644
--- a/sklearn/utils/fixes.py
+++ b/sklearn/utils/fixes.py
@@ -18,7 +18,6 @@
 import scipy
 import scipy.sparse.linalg
 import scipy.stats
-import threadpoolctl
 
 import sklearn
 
@@ -98,42 +97,6 @@ def _percentile(a, q, *, method="linear", **kwargs):
     from numpy import percentile  # type: ignore  # noqa
 
 
-# compatibility fix for threadpoolctl >= 3.0.0
-# since version 3 it's possible to setup a global threadpool controller to avoid
-# looping through all loaded shared libraries each time.
-# the global controller is created during the first call to threadpoolctl.
-def _get_threadpool_controller():
-    if not hasattr(threadpoolctl, "ThreadpoolController"):
-        return None
-
-    if not hasattr(sklearn, "_sklearn_threadpool_controller"):
-        sklearn._sklearn_threadpool_controller = threadpoolctl.ThreadpoolController()
-
-    return sklearn._sklearn_threadpool_controller
-
-
-def threadpool_limits(limits=None, user_api=None):
-    controller = _get_threadpool_controller()
-    if controller is not None:
-        return controller.limit(limits=limits, user_api=user_api)
-    else:
-        return threadpoolctl.threadpool_limits(limits=limits, user_api=user_api)
-
-
-threadpool_limits.__doc__ = threadpoolctl.threadpool_limits.__doc__
-
-
-def threadpool_info():
-    controller = _get_threadpool_controller()
-    if controller is not None:
-        return controller.info()
-    else:
-        return threadpoolctl.threadpool_info()
-
-
-threadpool_info.__doc__ = threadpoolctl.threadpool_info.__doc__
-
-
 @deprecated(
     "The function `delayed` has been moved from `sklearn.utils.fixes` to "
     "`sklearn.utils.parallel`. This import path will be removed in 1.5."
@@ -439,7 +402,7 @@ def _in_unstable_openblas_configuration():
     import numpy  # noqa
     import scipy  # noqa
 
-    modules_info = threadpool_info()
+    modules_info = sklearn._sklearn_threadpool_controller.info()
 
     open_blas_used = any(info["internal_api"] == "openblas" for info in modules_info)
     if not open_blas_used:
diff --git a/sklearn/utils/tests/test_show_versions.py b/sklearn/utils/tests/test_show_versions.py
index bd166dfd8e522..aade231e46f56 100644
--- a/sklearn/utils/tests/test_show_versions.py
+++ b/sklearn/utils/tests/test_show_versions.py
@@ -1,6 +1,7 @@
+from threadpoolctl import threadpool_info
+
 from sklearn.utils._show_versions import _get_deps_info, _get_sys_info, show_versions
 from sklearn.utils._testing import ignore_warnings
-from sklearn.utils.fixes import threadpool_info
 
 
 def test_get_sys_info():

From b5845da6a63abfce64ddd37d42f1b86d356df2f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
 <jeremie@probabl.ai>
Date: Mon, 15 Apr 2024 16:31:15 +0200
Subject: [PATCH 02/12] less simple kmeans

---
 sklearn/cluster/_k_means_lloyd.pyx |   4 --
 sklearn/cluster/_kmeans.py         | 103 ++++++++++++++++-------------
 2 files changed, 58 insertions(+), 49 deletions(-)

diff --git a/sklearn/cluster/_k_means_lloyd.pyx b/sklearn/cluster/_k_means_lloyd.pyx
index cb2977a040f4b..db7b4e259f434 100644
--- a/sklearn/cluster/_k_means_lloyd.pyx
+++ b/sklearn/cluster/_k_means_lloyd.pyx
@@ -6,7 +6,6 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.string cimport memset
 from libc.float cimport DBL_MAX, FLT_MAX
 
-from .. import _sklearn_threadpool_controller
 from ..utils._openmp_helpers cimport omp_lock_t
 from ..utils._openmp_helpers cimport omp_init_lock
 from ..utils._openmp_helpers cimport omp_destroy_lock
@@ -21,9 +20,6 @@ from ._k_means_common cimport _relocate_empty_clusters_sparse
 from ._k_means_common cimport _average_centers, _center_shift
 
 
-# Threadpoolctl context to limit the number of threads in second level of
-# nested parallelism (i.e. BLAS) to avoid oversubscription.
-@_sklearn_threadpool_controller.wrap(limits=1, user_api="blas")
 def lloyd_iter_chunked_dense(
         const floating[:, ::1] X,            # IN
         const floating[::1] sample_weight,   # IN
diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py
index 03478bebfe2b4..fb7bdbf93fbb0 100644
--- a/sklearn/cluster/_kmeans.py
+++ b/sklearn/cluster/_kmeans.py
@@ -622,6 +622,9 @@ def _kmeans_single_elkan(
     return labels, inertia, centers, i + 1
 
 
+# Threadpoolctl context to limit the number of threads in second level of
+# nested parallelism (i.e. BLAS) to avoid oversubscription.
+@_sklearn_threadpool_controller.wrap(limits=1, user_api="blas")
 def _kmeans_single_lloyd(
     X,
     sample_weight,
@@ -823,6 +826,12 @@ def _labels_inertia(X, sample_weight, centers, n_threads=1, return_inertia=True)
     return labels
 
 
+# Same as _labels_inertia but in a threadpool_limits context.
+_labels_inertia_threadpool_limit = _sklearn_threadpool_controller.wrap(
+    limits=1, user_api="blas"
+)(_labels_inertia)
+
+
 class _BaseKMeans(
     ClassNamePrefixFeaturesOutMixin, TransformerMixin, ClusterMixin, BaseEstimator, ABC
 ):
@@ -1097,7 +1106,7 @@ def predict(self, X, sample_weight="deprecated"):
         else:
             sample_weight = _check_sample_weight(None, X, dtype=X.dtype)
 
-        labels = _labels_inertia(
+        labels = _labels_inertia_threadpool_limit(
             X,
             sample_weight,
             self.cluster_centers_,
@@ -1182,7 +1191,7 @@ def score(self, X, y=None, sample_weight=None):
         X = self._check_test_data(X)
         sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
 
-        _, scores = _labels_inertia(
+        _, scores = _labels_inertia_threadpool_limit(
             X, sample_weight, self.cluster_centers_, self._n_threads
         )
         return -scores
@@ -1632,6 +1641,8 @@ def _mini_batch_step(
         the centers.
     """
     # Perform label assignment to nearest centers
+    # It is more efficient to run _mini_batch_step as a whole inside a
+    # threadpool_limit context than to call _labels_inertia_threadpool_limit here.
     labels, inertia = _labels_inertia(X, sample_weight, centers, n_threads=n_threads)
 
     # Update centers according to the labels
@@ -2120,7 +2131,7 @@ def fit(self, X, y=None, sample_weight=None):
             )
 
             # Compute inertia on a validation set.
-            _, inertia = _labels_inertia(
+            _, inertia = _labels_inertia_threadpool_limit(
                 X_valid,
                 sample_weight_valid,
                 cluster_centers,
@@ -2149,37 +2160,38 @@ def fit(self, X, y=None, sample_weight=None):
 
         n_steps = (self.max_iter * n_samples) // self._batch_size
 
-        # Perform the iterative optimization until convergence
-        for i in range(n_steps):
-            # Sample a minibatch from the full dataset
-            minibatch_indices = random_state.randint(0, n_samples, self._batch_size)
-
-            # Perform the actual update step on the minibatch data
-            batch_inertia = _mini_batch_step(
-                X=X[minibatch_indices],
-                sample_weight=sample_weight[minibatch_indices],
-                centers=centers,
-                centers_new=centers_new,
-                weight_sums=self._counts,
-                random_state=random_state,
-                random_reassign=self._random_reassign(),
-                reassignment_ratio=self.reassignment_ratio,
-                verbose=self.verbose,
-                n_threads=self._n_threads,
-            )
+        with _sklearn_threadpool_controller.limit(limits=1, user_api="blas"):
+            # Perform the iterative optimization until convergence
+            for i in range(n_steps):
+                # Sample a minibatch from the full dataset
+                minibatch_indices = random_state.randint(0, n_samples, self._batch_size)
+
+                # Perform the actual update step on the minibatch data
+                batch_inertia = _mini_batch_step(
+                    X=X[minibatch_indices],
+                    sample_weight=sample_weight[minibatch_indices],
+                    centers=centers,
+                    centers_new=centers_new,
+                    weight_sums=self._counts,
+                    random_state=random_state,
+                    random_reassign=self._random_reassign(),
+                    reassignment_ratio=self.reassignment_ratio,
+                    verbose=self.verbose,
+                    n_threads=self._n_threads,
+                )
 
-            if self._tol > 0.0:
-                centers_squared_diff = np.sum((centers_new - centers) ** 2)
-            else:
-                centers_squared_diff = 0
+                if self._tol > 0.0:
+                    centers_squared_diff = np.sum((centers_new - centers) ** 2)
+                else:
+                    centers_squared_diff = 0
 
-            centers, centers_new = centers_new, centers
+                centers, centers_new = centers_new, centers
 
-            # Monitor convergence and do early stopping if necessary
-            if self._mini_batch_convergence(
-                i, n_steps, n_samples, centers_squared_diff, batch_inertia
-            ):
-                break
+                # Monitor convergence and do early stopping if necessary
+                if self._mini_batch_convergence(
+                    i, n_steps, n_samples, centers_squared_diff, batch_inertia
+                ):
+                    break
 
         self.cluster_centers_ = centers
         self._n_features_out = self.cluster_centers_.shape[0]
@@ -2188,7 +2200,7 @@ def fit(self, X, y=None, sample_weight=None):
         self.n_iter_ = int(np.ceil(((i + 1) * self._batch_size) / n_samples))
 
         if self.compute_labels:
-            self.labels_, self.inertia_ = _labels_inertia(
+            self.labels_, self.inertia_ = _labels_inertia_threadpool_limit(
                 X,
                 sample_weight,
                 self.cluster_centers_,
@@ -2274,21 +2286,22 @@ def partial_fit(self, X, y=None, sample_weight=None):
             # Initialize number of samples seen since last reassignment
             self._n_since_last_reassign = 0
 
-        _mini_batch_step(
-            X,
-            sample_weight=sample_weight,
-            centers=self.cluster_centers_,
-            centers_new=self.cluster_centers_,
-            weight_sums=self._counts,
-            random_state=self._random_state,
-            random_reassign=self._random_reassign(),
-            reassignment_ratio=self.reassignment_ratio,
-            verbose=self.verbose,
-            n_threads=self._n_threads,
-        )
+        with _sklearn_threadpool_controller.limit(limits=1, user_api="blas"):
+            _mini_batch_step(
+                X,
+                sample_weight=sample_weight,
+                centers=self.cluster_centers_,
+                centers_new=self.cluster_centers_,
+                weight_sums=self._counts,
+                random_state=self._random_state,
+                random_reassign=self._random_reassign(),
+                reassignment_ratio=self.reassignment_ratio,
+                verbose=self.verbose,
+                n_threads=self._n_threads,
+            )
 
         if self.compute_labels:
-            self.labels_, self.inertia_ = _labels_inertia(
+            self.labels_, self.inertia_ = _labels_inertia_threadpool_limit(
                 X,
                 sample_weight,
                 self.cluster_centers_,

From 9b23c71b91d7c6843fae81bea629448a971c721f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
 <jeremie@probabl.ai>
Date: Mon, 15 Apr 2024 16:39:17 +0200
Subject: [PATCH 03/12] cln

---
 sklearn/cluster/_bisect_k_means.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/cluster/_bisect_k_means.py b/sklearn/cluster/_bisect_k_means.py
index 986b1714dd7c4..1d4a9e1d84c26 100644
--- a/sklearn/cluster/_bisect_k_means.py
+++ b/sklearn/cluster/_bisect_k_means.py
@@ -17,7 +17,7 @@
     _BaseKMeans,
     _kmeans_single_elkan,
     _kmeans_single_lloyd,
-    _labels_inertia,
+    _labels_inertia_threadpool_limit,
 )
 
 
@@ -504,7 +504,7 @@ def _predict_recursive(self, X, sample_weight, cluster_node):
         if hasattr(self, "_X_mean"):
             centers += self._X_mean
 
-        cluster_labels = _labels_inertia(
+        cluster_labels = _labels_inertia_threadpool_limit(
             X,
             sample_weight,
             centers,

From 4df4a5651194c0d3ab287f845c22c0346e234990 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
 <jeremie@probabl.ai>
Date: Mon, 15 Apr 2024 16:57:33 +0200
Subject: [PATCH 04/12] iter

---
 sklearn/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index b8dd1f49f34a7..9bbdc99308f41 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -145,7 +145,7 @@
 
     # Set a global controller that can be used to locally limit the number of
     # threads without looping through all shared libraries every time.
-    sklearn._sklearn_threadpool_controller = ThreadpoolController()
+    _sklearn_threadpool_controller = ThreadpoolController()
 
 
 def setup_module(module):

From 738b32dfd5aee592912fe5c5596dcdd6eefe1763 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
 <jeremie@probabl.ai>
Date: Mon, 15 Apr 2024 17:46:29 +0200
Subject: [PATCH 05/12] regen lock files

---
 .../azure/pylatest_conda_mkl_no_openmp_environment.yml        | 1 -
 .../azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock      | 4 ++--
 .../azure/pymin_conda_defaults_openblas_environment.yml       | 1 -
 .../azure/pymin_conda_defaults_openblas_linux-64_conda.lock   | 4 ++--
 build_tools/update_environments_and_lock_files.py             | 4 ++--
 5 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/build_tools/azure/pylatest_conda_mkl_no_openmp_environment.yml b/build_tools/azure/pylatest_conda_mkl_no_openmp_environment.yml
index c2ce48d6a19a1..01bd378aa121a 100644
--- a/build_tools/azure/pylatest_conda_mkl_no_openmp_environment.yml
+++ b/build_tools/azure/pylatest_conda_mkl_no_openmp_environment.yml
@@ -9,7 +9,6 @@ dependencies:
   - blas[build=mkl]
   - scipy<1.12
   - joblib
-  - threadpoolctl
   - matplotlib
   - pandas
   - pyamg
diff --git a/build_tools/azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock b/build_tools/azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock
index fabd22be28e29..0d96f6e4a312d 100644
--- a/build_tools/azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock
+++ b/build_tools/azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock
@@ -1,6 +1,6 @@
 # Generated by conda-lock.
 # platform: osx-64
-# input_hash: 4462fac2150cfe9fe0aedbc88d55a5f0cc9bb5d786aa4f0553fc92fcbf35d503
+# input_hash: e0d2cf2593df1f2c6969d68cf849136bee785b51f6cfc50ea1bdca2143d4a051
 @EXPLICIT
 https://repo.anaconda.com/pkgs/main/osx-64/blas-1.0-mkl.conda#cb2c87e85ac8e0ceae776d26d4214c8a
 https://repo.anaconda.com/pkgs/main/osx-64/bzip2-1.0.8-h6c40b1e_5.conda#0f51dde96c82dcf58a788787fed4c5b9
@@ -57,7 +57,6 @@ https://repo.anaconda.com/pkgs/main/noarch/python-tzdata-2023.3-pyhd3eb1b0_0.con
 https://repo.anaconda.com/pkgs/main/osx-64/pytz-2023.3.post1-py312hecd8cb5_0.conda#2636382c9a424f69cbc36b1c5dc1f2fc
 https://repo.anaconda.com/pkgs/main/osx-64/setuptools-68.2.2-py312hecd8cb5_0.conda#64235f0c451427d86808c70c1c31cb8b
 https://repo.anaconda.com/pkgs/main/noarch/six-1.16.0-pyhd3eb1b0_1.conda#34586824d411d36af2fa40e799c172d0
-https://repo.anaconda.com/pkgs/main/noarch/threadpoolctl-2.2.0-pyh0d69192_0.conda#bbfdbae4934150b902f97daaf287efe2
 https://repo.anaconda.com/pkgs/main/noarch/toml-0.10.2-pyhd3eb1b0_0.conda#cda05f5f6d8509529d1a2743288d197a
 https://repo.anaconda.com/pkgs/main/osx-64/tornado-6.3.3-py312h6c40b1e_0.conda#49173b5a36c9134865221f29d4a73fb6
 https://repo.anaconda.com/pkgs/main/osx-64/wheel-0.41.2-py312hecd8cb5_0.conda#e7aea266d81142e2bb0bbc2280e64526
@@ -84,3 +83,4 @@ https://repo.anaconda.com/pkgs/main/osx-64/scipy-1.11.4-py312h81688c2_0.conda#7d
 https://repo.anaconda.com/pkgs/main/osx-64/pandas-2.2.1-py312he282a81_0.conda#021b70a1e40efb75b89eb8ebdb347132
 https://repo.anaconda.com/pkgs/main/osx-64/pyamg-4.2.3-py312h44cbcf4_0.conda#3bdc7be74087b3a5a83c520a74e1e8eb
 # pip cython @ https://files.pythonhosted.org/packages/d5/6d/06c08d75adb98cdf72af18801e193d22580cc86ca553610f430f18ea26b3/Cython-3.0.10-cp312-cp312-macosx_10_9_x86_64.whl#sha256=8f2864ab5fcd27a346f0b50f901ebeb8f60b25a60a575ccfd982e7f3e9674914
+# pip threadpoolctl @ https://files.pythonhosted.org/packages/1e/84/ccd9b08653022b7785b6e3ee070ffb2825841e0dc119be22f0840b2b35cb/threadpoolctl-3.4.0-py3-none-any.whl#sha256=8f4c689a65b23e5ed825c8436a92b818aac005e0f3715f6a1664d7c7ee29d262
diff --git a/build_tools/azure/pymin_conda_defaults_openblas_environment.yml b/build_tools/azure/pymin_conda_defaults_openblas_environment.yml
index 144b432c486b3..17824c9b97074 100644
--- a/build_tools/azure/pymin_conda_defaults_openblas_environment.yml
+++ b/build_tools/azure/pymin_conda_defaults_openblas_environment.yml
@@ -9,7 +9,6 @@ dependencies:
   - blas[build=openblas]
   - scipy=1.7
   - joblib
-  - threadpoolctl
   - matplotlib=3.3.4  # min
   - pyamg
   - pytest<8
diff --git a/build_tools/azure/pymin_conda_defaults_openblas_linux-64_conda.lock b/build_tools/azure/pymin_conda_defaults_openblas_linux-64_conda.lock
index e6e873c144b09..ffb8a5c4b1b9a 100644
--- a/build_tools/azure/pymin_conda_defaults_openblas_linux-64_conda.lock
+++ b/build_tools/azure/pymin_conda_defaults_openblas_linux-64_conda.lock
@@ -1,6 +1,6 @@
 # Generated by conda-lock.
 # platform: linux-64
-# input_hash: 60862fc361d93a51f183ea9ef3bb29dd428c1359bae96e3f4d37ad92e4ae67c1
+# input_hash: 4f71ddcce93c9279161f5b016be417469aa5df726d06e4a1447c9270f60179e4
 @EXPLICIT
 https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda#c3473ff8bdb3d124ed5ff11ec380d6f9
 https://repo.anaconda.com/pkgs/main/linux-64/blas-1.0-openblas.conda#9ddfcaef10d79366c90128f5dc444be8
@@ -74,7 +74,6 @@ https://repo.anaconda.com/pkgs/main/linux-64/pyparsing-3.0.9-py39h06a4308_0.cond
 https://repo.anaconda.com/pkgs/main/linux-64/pyqt5-sip-12.13.0-py39h5eee18b_0.conda#256840c3841b52346ea5743be8490ede
 https://repo.anaconda.com/pkgs/main/linux-64/setuptools-68.2.2-py39h06a4308_0.conda#5b42cae5548732ae5c167bb1066085de
 https://repo.anaconda.com/pkgs/main/noarch/six-1.16.0-pyhd3eb1b0_1.conda#34586824d411d36af2fa40e799c172d0
-https://repo.anaconda.com/pkgs/main/noarch/threadpoolctl-2.2.0-pyh0d69192_0.conda#bbfdbae4934150b902f97daaf287efe2
 https://repo.anaconda.com/pkgs/main/noarch/toml-0.10.2-pyhd3eb1b0_0.conda#cda05f5f6d8509529d1a2743288d197a
 https://repo.anaconda.com/pkgs/main/linux-64/tomli-2.0.1-py39h06a4308_0.conda#b06dffe7ddca2645ed72f5116f0a087d
 https://repo.anaconda.com/pkgs/main/linux-64/tornado-6.3.3-py39h5eee18b_0.conda#9c4bd985bb8adcd12f47e790e95a9333
@@ -97,3 +96,4 @@ https://repo.anaconda.com/pkgs/main/linux-64/qt-main-5.15.2-h53bd1ea_10.conda#bd
 https://repo.anaconda.com/pkgs/main/linux-64/pyqt-5.15.10-py39h6a678d5_0.conda#52da5ff9b1144b078d2f41bab0b213f2
 https://repo.anaconda.com/pkgs/main/linux-64/matplotlib-3.3.4-py39h06a4308_0.conda#384fc5e01ebfcf30e7161119d3029b5a
 # pip cython @ https://files.pythonhosted.org/packages/a7/f5/3dde4d96076888ceaa981827b098274c2b45ddd4b20d75a8cfaa92b91eec/Cython-3.0.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=651a15a8534ebfb9b58cb0b87c269c70984b6f9c88bfe65e4f635f0e3f07dfcd
+# pip threadpoolctl @ https://files.pythonhosted.org/packages/1e/84/ccd9b08653022b7785b6e3ee070ffb2825841e0dc119be22f0840b2b35cb/threadpoolctl-3.4.0-py3-none-any.whl#sha256=8f4c689a65b23e5ed825c8436a92b818aac005e0f3715f6a1664d7c7ee29d262
diff --git a/build_tools/update_environments_and_lock_files.py b/build_tools/update_environments_and_lock_files.py
index 62a4d58ea91fd..5f9d9b3380c98 100644
--- a/build_tools/update_environments_and_lock_files.py
+++ b/build_tools/update_environments_and_lock_files.py
@@ -141,7 +141,7 @@ def remove_from(alist, to_remove):
         "folder": "build_tools/azure",
         "platform": "osx-64",
         "channel": "defaults",
-        "conda_dependencies": remove_from(common_dependencies, ["cython"]) + ["ccache"],
+        "conda_dependencies": remove_from(common_dependencies, ["cython", "threadpoolctl"]) + ["ccache"],
         "package_constraints": {
             "blas": "[build=mkl]",
             # scipy 1.12.x crashes on this platform (https://github.com/scipy/scipy/pull/20086)
@@ -161,7 +161,7 @@ def remove_from(alist, to_remove):
         "platform": "linux-64",
         "channel": "defaults",
         "conda_dependencies": remove_from(
-            common_dependencies, ["pandas", "cython", "pip", "ninja", "meson-python"]
+            common_dependencies, ["pandas", "cython", "threadpoolctl", "pip", "ninja", "meson-python"]
         )
         + ["ccache"],
         "package_constraints": {

From 18d85452bc0bd272568345272b9c032a8bd1962a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
 <jeremie@probabl.ai>
Date: Mon, 15 Apr 2024 17:47:58 +0200
Subject: [PATCH 06/12] check all builds [doc build] [scipy-dev] [cd build]


From 2cc69f120ee67cb8ce260db0d2ffca0b23c97cd8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
 <jeremie@probabl.ai>
Date: Mon, 15 Apr 2024 17:50:58 +0200
Subject: [PATCH 07/12] lint [doc build] [scipy-dev] [cd build]

---
 build_tools/update_environments_and_lock_files.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/build_tools/update_environments_and_lock_files.py b/build_tools/update_environments_and_lock_files.py
index 5f9d9b3380c98..475d9d0425d5b 100644
--- a/build_tools/update_environments_and_lock_files.py
+++ b/build_tools/update_environments_and_lock_files.py
@@ -141,7 +141,10 @@ def remove_from(alist, to_remove):
         "folder": "build_tools/azure",
         "platform": "osx-64",
         "channel": "defaults",
-        "conda_dependencies": remove_from(common_dependencies, ["cython", "threadpoolctl"]) + ["ccache"],
+        "conda_dependencies": remove_from(
+            common_dependencies, ["cython", "threadpoolctl"]
+        )
+        + ["ccache"],
         "package_constraints": {
             "blas": "[build=mkl]",
             # scipy 1.12.x crashes on this platform (https://github.com/scipy/scipy/pull/20086)
@@ -161,7 +164,8 @@ def remove_from(alist, to_remove):
         "platform": "linux-64",
         "channel": "defaults",
         "conda_dependencies": remove_from(
-            common_dependencies, ["pandas", "cython", "threadpoolctl", "pip", "ninja", "meson-python"]
+            common_dependencies,
+            ["pandas", "cython", "threadpoolctl", "pip", "ninja", "meson-python"],
         )
         + ["ccache"],
         "package_constraints": {

From 7641848297691cb45226714a7c89e42716022b4b Mon Sep 17 00:00:00 2001
From: jeremiedbb <jeremiedbb@yahoo.fr>
Date: Wed, 17 Apr 2024 13:00:43 +0200
Subject: [PATCH 08/12] add comments

---
 build_tools/update_environments_and_lock_files.py |  8 ++++----
 sklearn/__init__.py                               | 11 +++++++++--
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/build_tools/update_environments_and_lock_files.py b/build_tools/update_environments_and_lock_files.py
index 475d9d0425d5b..8a4a30d235e0b 100644
--- a/build_tools/update_environments_and_lock_files.py
+++ b/build_tools/update_environments_and_lock_files.py
@@ -152,8 +152,8 @@ def remove_from(alist, to_remove):
             # channel.
             "scipy": "<1.12",
         },
-        # TODO: put cython back to conda dependencies when required version is
-        # available on the main channel
+        # TODO: put cython and threadpoolctl back to conda dependencies when required
+        # version is available on the main channel
         "pip_dependencies": ["cython", "threadpoolctl"],
     },
     {
@@ -176,8 +176,8 @@ def remove_from(alist, to_remove):
             "matplotlib": "min",
             "cython": "min",
         },
-        # TODO: put cython back to conda dependencies when required version is
-        # available on the main channel
+        # TODO: put cython and threadpoolctl back to conda dependencies when required
+        # version is available on the main channel
         "pip_dependencies": ["cython", "threadpoolctl"],
     },
     {
diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index 9bbdc99308f41..ebec16cd78877 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -73,6 +73,12 @@
     # We are not importing the rest of scikit-learn during the build
     # process, as it may not be compiled yet
 else:
+    # Import numpy, scipy to make sure that the BLAS libs are loaded before
+    # creating the ThreadpoolController. (OpenMP is loaded by importing show_versions)
+    import numpy  # noqa
+    import scipy.linalg  # noqa
+    from threadpoolctl import ThreadpoolController
+
     # `_distributor_init` allows distributors to run custom init code.
     # For instance, for the Windows wheel, this is used to pre-load the
     # vcomp shared library runtime for OpenMP embedded in the sklearn/.libs
@@ -80,12 +86,11 @@
     # It is necessary to do this prior to importing show_versions as the
     # later is linked to the OpenMP runtime to make it possible to introspect
     # it and importing it first would fail if the OpenMP dll cannot be found.
-    from threadpoolctl import ThreadpoolController
-
     from . import (
         __check_build,  # noqa: F401
         _distributor_init,  # noqa: F401
     )
+
     from .base import clone
     from .utils._show_versions import show_versions
 
@@ -145,6 +150,8 @@
 
     # Set a global controller that can be used to locally limit the number of
     # threads without looping through all shared libraries every time.
+    # This instantitation should not happen earlier because it needs all BLAS and
+    # OpenMP libs to be loaded first.
     _sklearn_threadpool_controller = ThreadpoolController()
 
 
From 96a3d028cee0c6dc4dbac4c8b3d80e7f0576dcf4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
 <jeremie@probabl.ai>
Date: Tue, 23 Apr 2024 12:15:51 +0200
Subject: [PATCH 09/12] rename + extend comment

---
 sklearn/__init__.py                                  |  7 +++++--
 sklearn/cluster/_kmeans.py                           | 12 ++++++------
 sklearn/cluster/tests/test_k_means.py                |  6 +++---
 .../_pairwise_distances_reduction/_argkmin.pyx.tp    |  4 ++--
 .../_argkmin_classmode.pyx.tp                        |  4 ++--
 .../_radius_neighbors.pyx.tp                         |  4 ++--
 .../_radius_neighbors_classmode.pyx.tp               |  4 ++--
 .../tests/test_pairwise_distances_reduction.py       |  4 ++--
 sklearn/utils/fixes.py                               |  2 +-
 9 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index ebec16cd78877..d6ba77c14516b 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -74,7 +74,10 @@
     # process, as it may not be compiled yet
 else:
     # Import numpy, scipy to make sure that the BLAS libs are loaded before
-    # creating the ThreadpoolController. (OpenMP is loaded by importing show_versions)
+    # creating the ThreadpoolController. Importing utils just after would load
+    # them anyway; importing them here makes the dependency explicit and robust
+    # to changes in utils.
+    # (OpenMP is loaded by importing show_versions right after this block.)
     import numpy  # noqa
     import scipy.linalg  # noqa
     from threadpoolctl import ThreadpoolController
@@ -152,7 +155,7 @@
     # threads without looping through all shared libraries every time.
     # This instantitation should not happen earlier because it needs all BLAS and
     # OpenMP libs to be loaded first.
-    _sklearn_threadpool_controller = ThreadpoolController()
+    _threadpool_controller = ThreadpoolController()
 
 
 def setup_module(module):
diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py
index 20aefb73f65de..2ab6f1e95563b 100644
--- a/sklearn/cluster/_kmeans.py
+++ b/sklearn/cluster/_kmeans.py
@@ -18,7 +18,7 @@
 import numpy as np
 import scipy.sparse as sp
 
-from .. import _sklearn_threadpool_controller
+from .. import _threadpool_controller
 from ..base import (
     BaseEstimator,
     ClassNamePrefixFeaturesOutMixin,
@@ -624,7 +624,7 @@ def _kmeans_single_elkan(
 
 # Threadpoolctl context to limit the number of threads in second level of
 # nested parallelism (i.e. BLAS) to avoid oversubscription.
-@_sklearn_threadpool_controller.wrap(limits=1, user_api="blas")
+@_threadpool_controller.wrap(limits=1, user_api="blas")
 def _kmeans_single_lloyd(
     X,
     sample_weight,
@@ -827,7 +827,7 @@ def _labels_inertia(X, sample_weight, centers, n_threads=1, return_inertia=True)
 
 
 # Same as _labels_inertia but in a threadpool_limits context.
-_labels_inertia_threadpool_limit = _sklearn_threadpool_controller.wrap(
+_labels_inertia_threadpool_limit = _threadpool_controller.wrap(
     limits=1, user_api="blas"
 )(_labels_inertia)
 
@@ -922,7 +922,7 @@ def _check_mkl_vcomp(self, X, n_samples):
 
         n_active_threads = int(np.ceil(n_samples / CHUNK_SIZE))
         if n_active_threads < self._n_threads:
-            modules = _sklearn_threadpool_controller.info()
+            modules = _threadpool_controller.info()
             has_vcomp = "vcomp" in [module["prefix"] for module in modules]
             has_mkl = ("mkl", "intel") in [
                 (module["internal_api"], module.get("threading_layer", None))
@@ -2144,7 +2144,7 @@ def fit(self, X, y=None, sample_weight=None):
 
         n_steps = (self.max_iter * n_samples) // self._batch_size
 
-        with _sklearn_threadpool_controller.limit(limits=1, user_api="blas"):
+        with _threadpool_controller.limit(limits=1, user_api="blas"):
             # Perform the iterative optimization until convergence
             for i in range(n_steps):
                 # Sample a minibatch from the full dataset
@@ -2270,7 +2270,7 @@ def partial_fit(self, X, y=None, sample_weight=None):
             # Initialize number of samples seen since last reassignment
             self._n_since_last_reassign = 0
 
-        with _sklearn_threadpool_controller.limit(limits=1, user_api="blas"):
+        with _threadpool_controller.limit(limits=1, user_api="blas"):
             _mini_batch_step(
                 X,
                 sample_weight=sample_weight,
diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py
index 6cc40e38fb2cb..c3a41a65de632 100644
--- a/sklearn/cluster/tests/test_k_means.py
+++ b/sklearn/cluster/tests/test_k_means.py
@@ -8,7 +8,7 @@
 import pytest
 from scipy import sparse as sp
 
-from sklearn import _sklearn_threadpool_controller
+from sklearn import _threadpool_controller
 from sklearn.base import clone
 from sklearn.cluster import KMeans, MiniBatchKMeans, k_means, kmeans_plusplus
 from sklearn.cluster._k_means_common import (
@@ -968,13 +968,13 @@ def test_result_equal_in_diff_n_threads(Estimator, global_random_seed):
     rnd = np.random.RandomState(global_random_seed)
     X = rnd.normal(size=(50, 10))
 
-    with _sklearn_threadpool_controller.limit(limits=1, user_api="openmp"):
+    with _threadpool_controller.limit(limits=1, user_api="openmp"):
         result_1 = (
             Estimator(n_clusters=n_clusters, random_state=global_random_seed)
             .fit(X)
             .labels_
         )
-    with _sklearn_threadpool_controller.limit(limits=2, user_api="openmp"):
+    with _threadpool_controller.limit(limits=2, user_api="openmp"):
         result_2 = (
             Estimator(n_clusters=n_clusters, random_state=global_random_seed)
             .fit(X)
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index 0bc90498da893..ef61158fedca8 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -14,7 +14,7 @@ from numbers import Integral
 from scipy.sparse import issparse
 from ...utils import check_array, check_scalar
 from ...utils.fixes import _in_unstable_openblas_configuration
-from ... import _sklearn_threadpool_controller
+from ... import _threadpool_controller
 
 {{for name_suffix in ['64', '32']}}
 
@@ -58,7 +58,7 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
         """
         # Limit the number of threads in second level of nested parallelism for BLAS
         # to avoid threads over-subscription (in DOT or GEMM for instance).
-        with _sklearn_threadpool_controller.limit(limits=1, user_api='blas'):
+        with _threadpool_controller.limit(limits=1, user_api='blas'):
           if metric in ("euclidean", "sqeuclidean"):
               # Specialized implementation of ArgKmin for the Euclidean distance
               # for the dense-dense and sparse-sparse cases.
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp
index 4d0c2a8511ea2..b875499f44ed4 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp
@@ -4,7 +4,7 @@ from libcpp.map cimport map as cpp_map, pair as cpp_pair
 from libc.stdlib cimport free
 
 from ...utils._typedefs cimport intp_t, float64_t
-from ... import _sklearn_threadpool_controller
+from ... import _threadpool_controller
 
 import numpy as np
 from scipy.sparse import issparse
@@ -66,7 +66,7 @@ cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}):
 
         # Limit the number of threads in second level of nested parallelism for BLAS
         # to avoid threads over-subscription (in GEMM for instance).
-        with _sklearn_threadpool_controller.limit(limits=1, user_api="blas"):
+        with _threadpool_controller.limit(limits=1, user_api="blas"):
             if pda.execute_in_parallel_on_Y:
                 pda._parallel_on_Y()
             else:
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
index b5d21aef6cfc2..f4af378062bdc 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
@@ -17,7 +17,7 @@ from numbers import Real
 from scipy.sparse import issparse
 from ...utils import check_array, check_scalar
 from ...utils.fixes import _in_unstable_openblas_configuration
-from ... import _sklearn_threadpool_controller
+from ... import _threadpool_controller
 
 cnp.import_array()
 
@@ -110,7 +110,7 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
 
         # Limit the number of threads in second level of nested parallelism for BLAS
         # to avoid threads over-subscription (in GEMM for instance).
-        with _sklearn_threadpool_controller.limit(limits=1, user_api="blas"):
+        with _threadpool_controller.limit(limits=1, user_api="blas"):
             if pda.execute_in_parallel_on_Y:
                 pda._parallel_on_Y()
             else:
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp
index 6db1cd9853a13..ab12d7904c7fd 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp
@@ -8,7 +8,7 @@ from ...utils._typedefs cimport intp_t, float64_t
 
 import numpy as np
 from scipy.sparse import issparse
-from ... import _sklearn_threadpool_controller
+from ... import _threadpool_controller
 
 
 {{for name_suffix in ["32", "64"]}}
@@ -60,7 +60,7 @@ cdef class RadiusNeighborsClassMode{{name_suffix}}(RadiusNeighbors{{name_suffix}
 
         # Limit the number of threads in second level of nested parallelism for BLAS
         # to avoid threads over-subscription (in GEMM for instance).
-        with _sklearn_threadpool_controller.limit(limits=1, user_api="blas"):
+        with _threadpool_controller.limit(limits=1, user_api="blas"):
             if pda.execute_in_parallel_on_Y:
                 pda._parallel_on_Y()
             else:
diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index fcf78846ce86d..95dfa98178ee7 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -7,7 +7,7 @@
 import pytest
 from scipy.spatial.distance import cdist
 
-from sklearn import _sklearn_threadpool_controller
+from sklearn import _threadpool_controller
 from sklearn.metrics import euclidean_distances, pairwise_distances
 from sklearn.metrics._pairwise_distances_reduction import (
     ArgKmin,
@@ -1200,7 +1200,7 @@ def test_n_threads_agnosticism(
         **compute_parameters,
     )
 
-    with _sklearn_threadpool_controller.limit(limits=1, user_api="openmp"):
+    with _threadpool_controller.limit(limits=1, user_api="openmp"):
         dist, indices = Dispatcher.compute(
             X,
             Y,
diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
index 8103012190615..21e62150b0356 100644
--- a/sklearn/utils/fixes.py
+++ b/sklearn/utils/fixes.py
@@ -391,7 +391,7 @@ def _in_unstable_openblas_configuration():
     import numpy  # noqa
     import scipy  # noqa
 
-    modules_info = sklearn._sklearn_threadpool_controller.info()
+    modules_info = sklearn._threadpool_controller.info()
 
     open_blas_used = any(info["internal_api"] == "openblas" for info in modules_info)
     if not open_blas_used:

From 581c4770b0735972870b52bfa124c1e7a1472534 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
 <jeremie@probabl.ai>
Date: Tue, 23 Apr 2024 12:19:46 +0200
Subject: [PATCH 10/12] fix conflict in doc_linux-64_conda.lock

---
 build_tools/circle/doc_linux-64_conda.lock | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build_tools/circle/doc_linux-64_conda.lock b/build_tools/circle/doc_linux-64_conda.lock
index d8c67609e77ed..60bbe8298c34c 100644
--- a/build_tools/circle/doc_linux-64_conda.lock
+++ b/build_tools/circle/doc_linux-64_conda.lock
@@ -318,5 +318,5 @@ https://conda.anaconda.org/conda-forge/noarch/sphinxext-opengraph-0.9.1-pyhd8ed1
 # pip nbclient @ https://files.pythonhosted.org/packages/66/e8/00517a23d3eeaed0513e718fbc94aab26eaa1758f5690fc8578839791c79/nbclient-0.10.0-py3-none-any.whl#sha256=f13e3529332a1f1f81d82a53210322476a168bb7090a0289c795fe9cc11c9d3f
 # pip nbconvert @ https://files.pythonhosted.org/packages/23/8a/8d67cbd984739247e4b205c1143e2f71b25b4f71e180fe70f7cb2cf02633/nbconvert-7.16.3-py3-none-any.whl#sha256=ddeff14beeeedf3dd0bc506623e41e4507e551736de59df69a91f86700292b3b
 # pip jupyter-server @ https://files.pythonhosted.org/packages/07/46/6bb926b3bf878bf687b952fb6a4c09d014b4575a25960f2cd1a61793763f/jupyter_server-2.14.0-py3-none-any.whl#sha256=fb6be52c713e80e004fac34b35a0990d6d36ba06fd0a2b2ed82b899143a64210
-# pip jupyterlab-server @ https://files.pythonhosted.org/packages/09/e3/32d9dff7870d7236de690106e8da62a861893558edd592c9cc55732541e3/jupyterlab_server-2.27.0-py3-none-any.whl#sha256=1dbf26210c2426bca6164c0df57da85ec711aeeed4480dee1cf66501f06292c3
+# pip jupyterlab-server @ https://files.pythonhosted.org/packages/2f/b9/ed4ecad7cf1863a64920dc4c19b0376628b5d6bd28d2ec1e00cbac4ba2fb/jupyterlab_server-2.27.1-py3-none-any.whl#sha256=f5e26156e5258b24d532c84e7c74cc212e203bff93eb856f81c24c16daeecc75
 # pip jupyterlite-sphinx @ https://files.pythonhosted.org/packages/38/c9/5f1142c005cf8d75830b10029e53f074324bc85cfca1f1d0f22a207b771c/jupyterlite_sphinx-0.9.3-py3-none-any.whl#sha256=be6332d16490ea2fa90b78187a2c5e1c357195966a25741d60b1790346571041

From 0c1787c9929233d0045770c6c178ccca320a3927 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
 <jeremie@probabl.ai>
Date: Tue, 23 Apr 2024 12:21:35 +0200
Subject: [PATCH 11/12] CLN remove unrelated blank line in sklearn/__init__.py

---
 sklearn/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index d6ba77c14516b..45a26334a25f5 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -93,7 +93,6 @@
         __check_build,  # noqa: F401
         _distributor_init,  # noqa: F401
     )
-
     from .base import clone
     from .utils._show_versions import show_versions
 

From 1b833c66b41eb3eff1d667c9f685b0d5f7853460 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
 <jeremie@probabl.ai>
Date: Tue, 23 Apr 2024 12:22:27 +0200
Subject: [PATCH 12/12] [doc build] [scipy-dev] [cd build]