FIX get config from dispatcher thread in delayed by default #25242
Changes from all commits
@@ -4,15 +4,18 @@
 # License: BSD 3 clause

 import math
+import threading

 import numpy as np
 import pytest
 import scipy.stats

+from joblib import Parallel
+
+import sklearn
 from sklearn.utils._testing import assert_array_equal

-from sklearn.utils.fixes import _object_dtype_isnan
-from sklearn.utils.fixes import loguniform
+from sklearn.utils.fixes import _delayed, _object_dtype_isnan, loguniform


 @pytest.mark.parametrize("dtype, val", ([object, 1], [object, "a"], [float, 1]))
@@ -46,3 +49,37 @@ def test_loguniform(low, high, base):
     assert loguniform(base**low, base**high).rvs(random_state=0) == loguniform(
         base**low, base**high
     ).rvs(random_state=0)
+
+
+def test_delayed_fetching_right_config():
+    """Check that `_delayed` function fetches the right config associated to
+    the main thread.
+
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/25239
+    """
+
+    def get_working_memory():
+        return sklearn.get_config()["working_memory"]
+
+    n_iter = 10
+
+    # by default, we register the main thread and we should retrieve the
+    # parameters defined within the context manager
+    with sklearn.config_context(working_memory=123):
+        results = Parallel(n_jobs=2, pre_dispatch=4)(
+            _delayed(get_working_memory)() for _ in range(n_iter)
+        )
+
+    assert results == [123] * n_iter
+
+    # simulate that we refer to another thread
+    local_thread = threading.Thread(target=sklearn.get_config)
+    local_thread.start()
+    local_thread.join()
+    with sklearn.config_context(working_memory=123):
+        results = Parallel(n_jobs=2, pre_dispatch=4)(
+            _delayed(get_working_memory, thread=local_thread)() for _ in range(n_iter)
+        )
+
+    assert results == [get_working_memory()] * n_iter
If the default working_memory is changed, e.g. with

    sklearn.set_config(working_memory=140)

    # the following fails
    assert results == [get_working_memory()] * n_iter

The less fragile assertion would be to check that the results match the default value:

    from sklearn._config import _global_config_default

    assert results == [_global_config_default["working_memory"]] * n_iter

Reviewing this test, it seems that I have found a bug in the way the default value of the thread argument of delayed is defined. I am working on a PR against this PR.
Capturing the current thread here is problematic because it makes the behavior of scikit-learn dependent on which thread first imported scikit-learn, and scikit-learn's behavior is no longer thread-symmetric.

I tried changing this, but it does not work either. Thread inspection does not work as intended (it happens too late) when calling Parallel on a generator expression, which is the canonical way to use joblib. Instead, we should capture the state of the config of the thread that calls Parallel just before the call happens and ship it to all the dispatched tasks.

We just had a live pair-debugging / programming session with @glemaitre on Discord and I think we came up with a better solution that is a bit more verbose but also much more explicit (and correct ;). He will open a PR soon and we will be able to have a more informed technical discussion there.

For the longer term, we could expose a hook in joblib to better handle this kind of configuration propagation, but having a stopgap fix in scikit-learn makes it possible to decouple scikit-learn from the version of joblib.
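The idea of capturing the caller's config just before the Parallel call can be illustrated with only the public joblib and scikit-learn APIs. This is a minimal, hedged sketch, not the fix that was actually merged; the helper name run_with_config is invented for this example:

    # Snapshot the config in the thread that calls Parallel and pass the
    # snapshot explicitly to every task, instead of reading it lazily from
    # whichever thread happens to consume the generator.
    import joblib

    from sklearn import config_context, get_config


    def run_with_config(config, function, *args, **kwargs):
        # Re-apply the captured configuration around the actual work.
        with config_context(**config):
            return function(*args, **kwargs)


    def get_working_memory():
        return get_config()["working_memory"]


    with config_context(working_memory=123):
        # Snapshot taken in the thread calling Parallel, before the joblib
        # dispatcher thread starts consuming the generator expression.
        config = get_config()
        results = joblib.Parallel(n_jobs=2, pre_dispatch=4)(
            joblib.delayed(run_with_config)(config, get_working_memory)
            for _ in range(10)
        )

    assert results == [123] * 10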
Refer to #25290 for the better solution
👍 I was afraid it would come to this kind of more verbose solution. Maybe at the same time this is merged to enable the fix, a separate issue could be opened to discuss the ins and outs of the per-thread config? Unless the behavior that is enforced and supported is already clear, but that didn't seem to be the case (the PR where the behavior was enabled does not discuss it much).
It is true that we did not discuss this here, but we did during the pair-programming session.

@ogrisel agreed that we should keep the current behavior, where one thread cannot modify the config that other threads may be using. It is a bit counter-intuitive if we rely on the fact that threads share memory, but the side effects within scikit-learn could be bad: for instance, you could get random errors that are not reproducible because they would depend on the config state at a particular moment.
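As a small standalone illustration of that per-thread behavior (not taken from the scikit-learn test suite; it assumes the default working_memory of 1024 has not been changed elsewhere):

    # Each thread works on its own copy of the scikit-learn configuration, so a
    # config_context entered in the main thread does not leak into other threads.
    import threading

    import sklearn

    seen = {}


    def worker():
        seen["worker"] = sklearn.get_config()["working_memory"]


    with sklearn.config_context(working_memory=123):
        seen["main"] = sklearn.get_config()["working_memory"]
        t = threading.Thread(target=worker)
        t.start()
        t.join()

    print(seen)  # {'main': 123, 'worker': 1024} with the default configuration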
I think we should discuss the two options at the next dev meeting. I think I am in favor of option 2, but it is worth discussing this with others.
Option 1 would resolve the issue for multi-threading, but I think the issue will remain for multiprocessing or loky. I am okay with Option 2. Most of my concern is how third-party developers using joblib would need to update their code to use utils.fixes.delayed to work with scikit-learn's config.
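As a rough illustration of that concern, a third-party project would have to swap joblib.delayed for the scikit-learn wrapper. The snippet below is hypothetical; it uses the sklearn.utils.fixes.delayed import path referred to in this discussion (later scikit-learn versions expose the helper as sklearn.utils.parallel.delayed instead):

    # Hypothetical third-party code: fitting several estimators in parallel.
    # Using scikit-learn's delayed wrapper (instead of joblib.delayed) is what
    # lets each task see the scikit-learn config of the calling thread.
    from joblib import Parallel

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.utils.fixes import delayed  # import path as of this PR's era

    X, y = make_classification(random_state=0)


    def fit_one(estimator, X, y):
        return estimator.fit(X, y)


    # With plain joblib.delayed(fit_one) the tasks would run fine, but they
    # would not see settings from sklearn.config_context in the calling thread.
    fitted = Parallel(n_jobs=2)(
        delayed(fit_one)(LogisticRegression(C=c), X, y) for c in (0.1, 1.0, 10.0)
    )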
From the developer call: joblib uses another thread to get jobs from a generator, which means Option 1 with a thread-local configuration would resolve the current issue.
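That dispatching behavior can be observed with a small standalone snippet (illustrative only, not from the scikit-learn code base): the first pre-dispatched items are consumed by the calling thread, while later items are pulled from the generator by a joblib-internal thread.

    # Print the name of the thread that consumes each item of the generator
    # handed to Parallel.  After the initial pre-dispatch, the printed name is
    # typically no longer MainThread.
    import threading

    from joblib import Parallel, delayed


    def task(i):
        return i


    def job_generator(n):
        for i in range(n):
            print("item", i, "dispatched from:", threading.current_thread().name)
            yield delayed(task)(i)


    print("Parallel called from:", threading.current_thread().name)
    Parallel(n_jobs=2, pre_dispatch=2)(job_generator(10))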