MAINT Replace deprecated cython conditional compilation #25654

Merged
Changes from all commits (24 commits)
23 changes: 7 additions & 16 deletions doc/developers/performance.rst
@@ -344,27 +344,18 @@ Using OpenMP
Since scikit-learn can be built without OpenMP, it's necessary to protect each
direct call to OpenMP.

-There are some helpers in
+The `_openmp_helpers` module, available in
`sklearn/utils/_openmp_helpers.pyx <https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/utils/_openmp_helpers.pyx>`_
-that you can reuse for the main useful functionalities and already have the
-necessary protection to be built without OpenMP.
+provides protected versions of the OpenMP routines. To use OpenMP routines, they
+must be cimported from this module and not from the OpenMP library directly::

-If the helpers are not enough, you need to protect your OpenMP code using the
-following syntax::
-
-    # importing OpenMP
-    IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-        cimport openmp
-
-    # calling OpenMP
-    IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-        max_threads = openmp.omp_get_max_threads()
-    ELSE:
-        max_threads = 1
+    from sklearn.utils._openmp_helpers cimport omp_get_max_threads
+    max_threads = omp_get_max_threads()
Review comment (Member) on lines +352 to +353: This looks much clearer than before.


.. note::

-    Protecting the parallel loop, ``prange``, is already done by cython.
+    The parallel loop, `prange`, is already protected by cython and can be used directly
+    from `cython.parallel`.


.. _profiling-compiled-extension:
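For readers new to the helpers, here is a minimal sketch of how the protected routines combine with `prange` (a sketch under assumptions: `parallel_sum` is an illustrative name, and `omp_get_max_threads` is assumed to return 1 when scikit-learn is built without OpenMP)::

    # cython: boundscheck=False, wraparound=False
    from cython.parallel import prange

    from sklearn.utils._openmp_helpers cimport omp_get_max_threads

    def parallel_sum(double[::1] x):
        """Sum x in parallel when OpenMP is available, serially otherwise."""
        cdef:
            Py_ssize_t i
            double total = 0.0
            int n_threads = omp_get_max_threads()  # assumed 1 without OpenMP
        # prange itself needs no protection: cython compiles it to a plain
        # serial loop when the OpenMP compile flags are absent.
        for i in prange(x.shape[0], nogil=True, num_threads=n_threads):
            total += x[i]
        return total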
11 changes: 3 additions & 8 deletions sklearn/_build_utils/__init__.py
@@ -52,11 +52,9 @@ def cythonize_extensions(extension):
# compilers are properly configured to build with OpenMP. This is expensive
# and we only want to call this function once.
# The result of this check is cached as a private attribute on the sklearn
-# module (only at build-time) to be used twice:
-# - First to set the value of SKLEARN_OPENMP_PARALLELISM_ENABLED, the
-#   cython build-time variable passed to the cythonize() call.
-# - Then in the build_ext subclass defined in the top-level setup.py file
-#   to actually build the compiled extensions with OpenMP flags if needed.
+# module (only at build-time) to be used in the build_ext subclass defined
+# in the top-level setup.py file to actually build the compiled extensions
+# with OpenMP flags if needed.
sklearn._OPENMP_SUPPORTED = check_openmp_support()

n_jobs = 1
@@ -82,9 +80,6 @@ def cythonize_extensions(extension):
return cythonize(
extension,
nthreads=n_jobs,
-compile_time_env={
-    "SKLEARN_OPENMP_PARALLELISM_ENABLED": sklearn._OPENMP_SUPPORTED
-},
compiler_directives=compiler_directives,
)

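For context, the consuming side mentioned in the comment reads the cached attribute when compiling extensions. A hypothetical sketch (the class name and flag handling are illustrative assumptions, not the actual setup.py)::

    from setuptools.command.build_ext import build_ext


    class build_ext_openmp(build_ext):  # hypothetical name
        def build_extensions(self):
            import sklearn

            # Attribute cached at build time by cythonize_extensions().
            if getattr(sklearn, "_OPENMP_SUPPORTED", False):
                for ext in self.extensions:
                    # GCC/clang-style flag; MSVC would need /openmp instead.
                    ext.extra_compile_args.append("-fopenmp")
                    ext.extra_link_args.append("-fopenmp")
            super().build_extensions()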
45 changes: 19 additions & 26 deletions sklearn/cluster/_k_means_elkan.pyx
@@ -6,13 +6,16 @@
# fused types and when the array may be read-only (for instance when it's
# provided by the user). This is fixed in cython >= 3.0.

-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    cimport openmp
from cython cimport floating
from cython.parallel import prange, parallel
from libc.stdlib cimport calloc, free
from libc.string cimport memset

+from ..utils._openmp_helpers cimport omp_lock_t
+from ..utils._openmp_helpers cimport omp_init_lock
+from ..utils._openmp_helpers cimport omp_destroy_lock
+from ..utils._openmp_helpers cimport omp_set_lock
+from ..utils._openmp_helpers cimport omp_unset_lock
from ..utils.extmath import row_norms
from ._k_means_common import CHUNK_SIZE
from ._k_means_common cimport _relocate_empty_clusters_dense
@@ -274,8 +277,7 @@ def elkan_iter_chunked_dense(
floating *centers_new_chunk
floating *weight_in_clusters_chunk

-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    openmp.omp_lock_t lock
+omp_lock_t lock

# count remainder chunk in total number of chunks
n_chunks += n_samples != n_chunks * n_samples_chunk
@@ -286,8 +288,7 @@
if update_centers:
memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    openmp.omp_init_lock(&lock)
+omp_init_lock(&lock)

with nogil, parallel(num_threads=n_threads):
# thread local buffers
@@ -316,23 +317,20 @@

# reduction from local buffers.
if update_centers:
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    # The lock is necessary to avoid race conditions when aggregating
-    # info from different thread-local buffers.
-    openmp.omp_set_lock(&lock)
+# The lock is necessary to avoid race conditions when aggregating
+# info from different thread-local buffers.
+omp_set_lock(&lock)
for j in range(n_clusters):
weight_in_clusters[j] += weight_in_clusters_chunk[j]
for k in range(n_features):
centers_new[j, k] += centers_new_chunk[j * n_features + k]
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    openmp.omp_unset_lock(&lock)
+omp_unset_lock(&lock)

free(centers_new_chunk)
free(weight_in_clusters_chunk)

if update_centers:
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    openmp.omp_destroy_lock(&lock)
+omp_destroy_lock(&lock)
_relocate_empty_clusters_dense(X, sample_weight, centers_old,
centers_new, weight_in_clusters, labels)

@@ -516,8 +514,7 @@ def elkan_iter_chunked_sparse(
floating *centers_new_chunk
floating *weight_in_clusters_chunk

-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    openmp.omp_lock_t lock
+omp_lock_t lock

# count remainder chunk in total number of chunks
n_chunks += n_samples != n_chunks * n_samples_chunk
Expand All @@ -528,8 +525,7 @@ def elkan_iter_chunked_sparse(
if update_centers:
memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    openmp.omp_init_lock(&lock)
+omp_init_lock(&lock)

with nogil, parallel(num_threads=n_threads):
# thread local buffers
Expand Down Expand Up @@ -561,23 +557,20 @@ def elkan_iter_chunked_sparse(

# reduction from local buffers.
if update_centers:
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    # The lock is necessary to avoid race conditions when aggregating
-    # info from different thread-local buffers.
-    openmp.omp_set_lock(&lock)
+# The lock is necessary to avoid race conditions when aggregating
+# info from different thread-local buffers.
+omp_set_lock(&lock)
for j in range(n_clusters):
weight_in_clusters[j] += weight_in_clusters_chunk[j]
for k in range(n_features):
centers_new[j, k] += centers_new_chunk[j * n_features + k]
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    openmp.omp_unset_lock(&lock)
+omp_unset_lock(&lock)

free(centers_new_chunk)
free(weight_in_clusters_chunk)

if update_centers:
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    openmp.omp_destroy_lock(&lock)
+omp_destroy_lock(&lock)
_relocate_empty_clusters_sparse(
X_data, X_indices, X_indptr, sample_weight,
centers_old, centers_new, weight_in_clusters, labels)
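The same lock-protected reduction appears in both the dense and sparse paths above. Distilled to its skeleton, and assuming the `_openmp_helpers` cimports degrade to sequential no-ops when OpenMP is unavailable, the pattern is (a sketch with an illustrative name, not code from this PR)::

    # cython: boundscheck=False, wraparound=False
    from cython.parallel import parallel
    from libc.stdlib cimport calloc, free

    from sklearn.utils._openmp_helpers cimport (
        omp_lock_t, omp_init_lock, omp_set_lock, omp_unset_lock,
        omp_destroy_lock)


    def locked_reduction(double[::1] out, int n_threads):
        cdef:
            int j
            int n = out.shape[0]
            double *local
            omp_lock_t lock

        omp_init_lock(&lock)
        with nogil, parallel(num_threads=n_threads):
            # thread-local buffer; each thread's real work (elided here)
            # accumulates into it without any synchronization
            local = <double *> calloc(n, sizeof(double))
            # serialize only the merge into the shared output buffer
            omp_set_lock(&lock)
            for j in range(n):
                out[j] += local[j]
            omp_unset_lock(&lock)
            free(local)
        omp_destroy_lock(&lock)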
47 changes: 21 additions & 26 deletions sklearn/cluster/_k_means_lloyd.pyx
@@ -4,14 +4,17 @@
# fused types and when the array may be read-only (for instance when it's
# provided by the user). This is fixed in cython >= 3.0.

-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    cimport openmp
from cython cimport floating
from cython.parallel import prange, parallel
from libc.stdlib cimport malloc, calloc, free
from libc.string cimport memset
from libc.float cimport DBL_MAX, FLT_MAX

+from ..utils._openmp_helpers cimport omp_lock_t
+from ..utils._openmp_helpers cimport omp_init_lock
+from ..utils._openmp_helpers cimport omp_destroy_lock
+from ..utils._openmp_helpers cimport omp_set_lock
+from ..utils._openmp_helpers cimport omp_unset_lock
from ..utils.extmath import row_norms
from ..utils._cython_blas cimport _gemm
from ..utils._cython_blas cimport RowMajor, Trans, NoTrans
@@ -94,8 +97,8 @@ def lloyd_iter_chunked_dense(
floating *centers_new_chunk
floating *weight_in_clusters_chunk
floating *pairwise_distances_chunk
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    openmp.omp_lock_t lock
+
+omp_lock_t lock

# count remainder chunk in total number of chunks
n_chunks += n_samples != n_chunks * n_samples_chunk
@@ -106,8 +109,7 @@
if update_centers:
memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    openmp.omp_init_lock(&lock)
+omp_init_lock(&lock)

with nogil, parallel(num_threads=n_threads):
# thread local buffers
@@ -135,24 +137,22 @@

# reduction from local buffers.
if update_centers:
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    # The lock is necessary to avoid race conditions when aggregating
-    # info from different thread-local buffers.
-    openmp.omp_set_lock(&lock)
+# The lock is necessary to avoid race conditions when aggregating
+# info from different thread-local buffers.
+omp_set_lock(&lock)
for j in range(n_clusters):
weight_in_clusters[j] += weight_in_clusters_chunk[j]
for k in range(n_features):
centers_new[j, k] += centers_new_chunk[j * n_features + k]
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    openmp.omp_unset_lock(&lock)
+
+omp_unset_lock(&lock)

free(centers_new_chunk)
free(weight_in_clusters_chunk)
free(pairwise_distances_chunk)

if update_centers:
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    openmp.omp_destroy_lock(&lock)
+omp_destroy_lock(&lock)
_relocate_empty_clusters_dense(X, sample_weight, centers_old,
centers_new, weight_in_clusters, labels)

@@ -292,8 +292,7 @@ def lloyd_iter_chunked_sparse(
floating *centers_new_chunk
floating *weight_in_clusters_chunk

-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    openmp.omp_lock_t lock
+omp_lock_t lock

# count remainder chunk in total number of chunks
n_chunks += n_samples != n_chunks * n_samples_chunk
@@ -304,8 +303,7 @@
if update_centers:
memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    openmp.omp_init_lock(&lock)
+omp_init_lock(&lock)

with nogil, parallel(num_threads=n_threads):
# thread local buffers
@@ -333,23 +331,20 @@

# reduction from local buffers.
if update_centers:
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    # The lock is necessary to avoid race conditions when aggregating
-    # info from different thread-local buffers.
-    openmp.omp_set_lock(&lock)
+# The lock is necessary to avoid race conditions when aggregating
+# info from different thread-local buffers.
+omp_set_lock(&lock)
for j in range(n_clusters):
weight_in_clusters[j] += weight_in_clusters_chunk[j]
for k in range(n_features):
centers_new[j, k] += centers_new_chunk[j * n_features + k]
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    openmp.omp_unset_lock(&lock)
+omp_unset_lock(&lock)

free(centers_new_chunk)
free(weight_in_clusters_chunk)

if update_centers:
-IF SKLEARN_OPENMP_PARALLELISM_ENABLED:
-    openmp.omp_destroy_lock(&lock)
+omp_destroy_lock(&lock)
_relocate_empty_clusters_sparse(
X_data, X_indices, X_indptr, sample_weight,
centers_old, centers_new, weight_in_clusters, labels)
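One recurring line worth unpacking: `n_chunks += n_samples != n_chunks * n_samples_chunk` counts the remainder chunk, adding 1 exactly when `n_samples` is not a multiple of the chunk size. A tiny worked sketch (illustrative function, not from the PR)::

    def count_chunks(int n_samples, int n_samples_chunk):
        cdef int n_chunks = n_samples // n_samples_chunk
        # e.g. 1000 samples in chunks of 256 -> 3 full chunks; the comparison
        # is True (1), which counts the trailing 232-sample chunk.
        n_chunks += n_samples != n_chunks * n_samples_chunk
        return n_chunks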
41 changes: 27 additions & 14 deletions sklearn/linear_model/_sgd_fast.pyx
@@ -20,20 +20,33 @@ from ..utils._seq_dataset cimport SequentialDataset64 as SequentialDataset

cnp.import_array()

-# Penalty constants
-DEF NO_PENALTY = 0
-DEF L1 = 1
-DEF L2 = 2
-DEF ELASTICNET = 3
-
-# Learning rate constants
-DEF CONSTANT = 1
-DEF OPTIMAL = 2
-DEF INVSCALING = 3
-DEF ADAPTIVE = 4
-DEF PA1 = 5
-DEF PA2 = 6

+cdef extern from *:
+    """
+    /* Penalty constants */
+    #define NO_PENALTY 0
+    #define L1 1
+    #define L2 2
+    #define ELASTICNET 3
+
+    /* Learning rate constants */
+    #define CONSTANT 1
+    #define OPTIMAL 2
+    #define INVSCALING 3
+    #define ADAPTIVE 4
+    #define PA1 5
+    #define PA2 6
+    """
+    int NO_PENALTY = 0
+    int L1 = 1
+    int L2 = 2
+    int ELASTICNET = 3
+
+    int CONSTANT = 1
+    int OPTIMAL = 2
+    int INVSCALING = 3
+    int ADAPTIVE = 4
+    int PA1 = 5
+    int PA2 = 6


# ----------------------------------------
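The replacement relies on cython's verbatim-C feature: the string literal after `cdef extern from *` is pasted as-is into the generated C file, and the declarations that follow tell cython those names exist as C ints. The deprecated compile-time `DEF` constants thus become ordinary C macros resolved by the C compiler. A hypothetical sketch of the same pattern for a new constant::

    cdef extern from *:
        """
        #define MY_MODE 42   /* pasted verbatim into the generated C file */
        """
        int MY_MODE

    def is_my_mode(int mode):
        # compiles down to a plain C comparison against the macro
        return mode == MY_MODE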
8 changes: 4 additions & 4 deletions sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
@@ -21,7 +21,7 @@ from cython.parallel cimport parallel, prange
from libcpp.vector cimport vector

from ...utils._cython_blas cimport _dot
-from ...utils._openmp_helpers cimport _openmp_thread_num
+from ...utils._openmp_helpers cimport omp_get_thread_num
from ...utils._typedefs cimport ITYPE_t, DTYPE_t

import numpy as np
@@ -88,7 +88,7 @@ cdef DTYPE_t[::1] _sqeuclidean_row_norms32_dense(
)

with nogil, parallel(num_threads=num_threads):
-thread_num = _openmp_thread_num()
+thread_num = omp_get_thread_num()

for i in prange(n, schedule='static'):
# Upcasting the i-th row of X from float32 to float64
@@ -245,7 +245,7 @@ cdef class BaseDistancesReduction{{name_suffix}}:
ITYPE_t thread_num

with nogil, parallel(num_threads=self.chunks_n_threads):
-thread_num = _openmp_thread_num()
+thread_num = omp_get_thread_num()

# Allocating thread datastructures
self._parallel_on_X_parallel_init(thread_num)
@@ -324,7 +324,7 @@ cdef class BaseDistancesReduction{{name_suffix}}:
X_end = X_start + self.X_n_samples_chunk

with nogil, parallel(num_threads=self.chunks_n_threads):
-thread_num = _openmp_thread_num()
+thread_num = omp_get_thread_num()

# Initializing datastructures used in this thread
self._parallel_on_Y_parallel_init(thread_num, X_start, X_end)
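Finally, the renamed `omp_get_thread_num` presumably comes from `_openmp_helpers` with a sequential fallback, so callers no longer need conditional compilation. A plausible sketch of such a declaration (an assumption — the PR's actual `.pxd` is not shown in this diff)::

    # sklearn/utils/_openmp_helpers.pxd (sketch)
    cdef extern from *:
        """
        #ifdef _OPENMP
            #include <omp.h>
        #else
            /* sequential fallback: one thread, numbered 0 */
            static inline int omp_get_thread_num(void) { return 0; }
        #endif
        """
        int omp_get_thread_num() nogil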