scikit-learn · ogrisel · Aug 30, 2023 · Nov 1, 2023 · Nov 6, 2023 · fcharras
diff --git a/doc/computing.rst b/doc/computing.rst
@@ -14,3 +14,4 @@ Computing with scikit-learn
     computing/scaling_strategies
     computing/computational_performance
     computing/parallelism
+    computing/engine
diff --git a/doc/computing/engine.rst b/doc/computing/engine.rst
@@ -0,0 +1,29 @@
+.. Places parent toc into the sidebar
+
+:parenttoc: True
+
+.. _engine:
+
+Computation Engines (experimental)
+==================================
+
+**This API is experimental** which means that it is subject to change without
+any backward compatibility guarantees.
+
+TODO: explain goals here
+
+Activating an engine
+--------------------
+
+TODO: installing third party engine provider packages
+
+TODO: how to list installed engines
+
+TODO: how to install a plugin
+
+Writing a new engine provider
+-----------------------------
+
+TODO: show engine API of a given estimator.
+
+TODO: give example setup.py with setuptools to define an entrypoint.
diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
@@ -48,6 +48,23 @@ Changes impacting all modules
   to work with our estimators and functions.
   :pr:`26464` by `Thomas Fan`_.
 
+- |Enhancement| Experimental engine API (no backward compatibility guarantees)
+  to allow for external packages to contribute alternative implementations for
+  the core computational routines of some selected scikit-learn estimators.
+
+  Currently, the following estimators allow alternative implementations:
+
+  - :class:`~sklearn.cluster.KMeans` (only for the Lloyd algorithm).
+  - TODO: add more when available.
+
+  External engine providers include:
+
+  - https://github.com/soda-inria/sklearn-numba-dpex that provides a KMeans
+    engine optimized for OpenCL enabled GPUs.
+  - TODO: add more here
+
+  :pr:`25535` by :user:`ogrisel`, :user:`fcharras` and :user:`betatim`.
+
 - |Enhancement| The HTML representation of estimators now includes a link to the
   documentation and is color-coded to denote whether the estimator is fitted or
   not (unfitted estimators are orange, fitted estimators are blue).

diff --git a/setup.py b/setup.py
@@ -608,6 +608,7 @@ def setup_package():
         package_data={
             "": ["*.csv", "*.gz", "*.txt", "*.pxd", "*.rst", "*.jpg", "*.css"]
         },
+        entry_points={"pytest11": ["sklearn_plugin_testing = sklearn._engine.testing"]},
         zip_safe=False,  # the package can run out of an .egg file
         extras_require={
             key: min_deps.tag_to_packages[key]

diff --git a/sklearn/_config.py b/sklearn/_config.py
@@ -1,5 +1,6 @@
 """Global configuration state and functions for management
 """
+import inspect
 import os
 import threading
 from contextlib import contextmanager as contextmanager
@@ -14,6 +15,8 @@
     ),
     "enable_cython_pairwise_dist": True,
     "array_api_dispatch": False,
+    "engine_provider": (),
+    "engine_attributes": "engine_types",
     "transform_output": "default",
     "enable_metadata_routing": False,
     "skip_parameter_validation": False,
@@ -55,6 +58,8 @@ def set_config(
     pairwise_dist_chunk_size=None,
     enable_cython_pairwise_dist=None,
     array_api_dispatch=None,
+    engine_provider=None,
+    engine_attributes=None,
     transform_output=None,
     enable_metadata_routing=None,
     skip_parameter_validation=None,
@@ -126,6 +131,26 @@ def set_config(
 
         .. versionadded:: 1.2
 
+    engine_provider : str or sequence of {str, engine class}, default=None
+        Specify list of enabled computational engine implementations provided
+        by third party packages. Engines are enabled by listing the name of
+        the provider or listing an engine class directly.
+
+        See the :ref:`User Guide <engine>` for more details.
+
+        .. versionadded:: 1.4
+
+    engine_attributes : str, default=None
+        Enable conversion of estimator attributes to scikit-learn native
+        types by setting to "sklearn_types". By default attributes are
+        stored using engine native types. This avoids additional conversions
+        and memory transfers between host and device when calling `predict`/
+        `transform` after `fit` of an engine-aware estimator.
+
+        See the :ref:`User Guide <engine>` for more details.
+
+        .. versionadded:: 1.4
+
     transform_output : str, default=None
         Configure output of `transform` and `fit_transform`.
 
@@ -185,6 +210,18 @@ def set_config(
 
         _check_array_api_dispatch(array_api_dispatch)
         local_config["array_api_dispatch"] = array_api_dispatch
+    if engine_provider is not None:
+        # Single provider name was passed in
+        if isinstance(engine_provider, str):
+            engine_provider = (engine_provider,)
+        # Allow direct registration of engine classes to ease testing, debugging
+        # and benchmarking without having to register a fake package with metadata
+        # just to use a custom engine not meant to be used by end-users.
+        elif inspect.isclass(engine_provider):
+            engine_provider = (engine_provider,)
+        local_config["engine_provider"] = engine_provider
+    if engine_attributes is not None:
+        local_config["engine_attributes"] = engine_attributes
     if transform_output is not None:
         local_config["transform_output"] = transform_output
     if enable_metadata_routing is not None:
@@ -203,6 +240,8 @@ def config_context(
     pairwise_dist_chunk_size=None,
     enable_cython_pairwise_dist=None,
     array_api_dispatch=None,
+    engine_provider=None,
+    engine_attributes=None,
     transform_output=None,
     enable_metadata_routing=None,
     skip_parameter_validation=None,
@@ -273,6 +312,24 @@ def config_context(
 
         .. versionadded:: 1.2
 
+    engine_provider : str or sequence of {str, engine class}, default=None
+        Specify list of enabled computational engine implementations provided
+        by third party packages. Engines are enabled by listing the name of
+        the provider or listing an engine class directly.
+
+        See the :ref:`User Guide <engine>` for more details.
+
+        .. versionadded:: 1.4
+
+    engine_attributes : str, default=None
+        Enable conversion of estimator attributes to scikit-learn native
+        types by setting to "sklearn_types". By default attributes are
+        stored using engine native types.
+
+        See the :ref:`User Guide <engine>` for more details.
+
+        .. versionadded:: 1.4
+
     transform_output : str, default=None
         Configure output of `transform` and `fit_transform`.
 
@@ -344,6 +401,8 @@ def config_context(
         pairwise_dist_chunk_size=pairwise_dist_chunk_size,
         enable_cython_pairwise_dist=enable_cython_pairwise_dist,
         array_api_dispatch=array_api_dispatch,
+        engine_provider=engine_provider,
+        engine_attributes=engine_attributes,
         transform_output=transform_output,
         enable_metadata_routing=enable_metadata_routing,
         skip_parameter_validation=skip_parameter_validation,

diff --git a/sklearn/_engine/__init__.py b/sklearn/_engine/__init__.py
@@ -0,0 +1,3 @@
+from .base import convert_attributes, get_engine_classes, list_engine_provider_names
+
+__all__ = ["convert_attributes", "get_engine_classes", "list_engine_provider_names"]
diff --git a/sklearn/_engine/base.py b/sklearn/_engine/base.py
@@ -0,0 +1,186 @@
+import inspect
+import warnings
+from functools import lru_cache, wraps
+from importlib import import_module
+from importlib.metadata import entry_points
+
+from sklearn._config import get_config
+
+# Name of the entry point group that is looked up in the metadata of the
+# installed packages to discover engine specifications.
+SKLEARN_ENGINES_ENTRY_POINT = "sklearn_engines"
+
+
+class EngineSpec:
+    """Description of one engine implementation declared by a provider.
+
+    The engine class is only referenced by its location (module name and
+    qualified name): nothing is imported until `get_engine_class` is called.
+
+    Parameters
+    ----------
+    name : str
+        Name of the engine.
+
+    provider_name : str
+        Name of the provider that declares the engine.
+
+    module_name : str
+        Dotted name of the module to import in order to reach the engine.
+
+    engine_qualname : str
+        Dotted attribute path of the engine class inside `module_name`.
+    """
+
+    __slots__ = ["name", "provider_name", "module_name", "engine_qualname"]
+
+    def __init__(self, name, provider_name, module_name, engine_qualname):
+        self.name, self.provider_name = name, provider_name
+        self.module_name, self.engine_qualname = module_name, engine_qualname
+
+    def get_engine_class(self):
+        """Import `module_name` and return the object at `engine_qualname`."""
+        resolved = import_module(self.module_name)
+        # Walk the dotted path one attribute at a time to support nested
+        # definitions such as "Outer.Inner".
+        for part in self.engine_qualname.split("."):
+            resolved = getattr(resolved, part)
+        return resolved
+
+
+def _parse_entry_point(entry_point):
+    """Build an `EngineSpec` from a single entry point.
+
+    The entry point value is expected to have the form
+    ``"<module name>:<engine qualname>"``. The provider name is the first
+    dotted component of the module name.
+
+    Raises
+    ------
+    ValueError
+        If splitting the value on ``":"`` does not give exactly two parts.
+    """
+    module_name, engine_qualname = entry_point.value.split(":")
+    provider_name = module_name.split(".", 1)[0]
+    return EngineSpec(
+        name=entry_point.name,
+        provider_name=provider_name,
+        module_name=module_name,
+        engine_qualname=engine_qualname,
+    )
+
+
+@lru_cache
+def _parse_entry_points(provider_names=None):
+    """Collect the engine specs declared by the installed packages.
+
+    The result is memoized with `lru_cache`, hence `provider_names` has to be
+    hashable (e.g. a tuple).
+
+    Parameters
+    ----------
+    provider_names : tuple of str, default=None
+        When given, only the specs of those providers are kept. `None` keeps
+        the specs of all the providers.
+
+    Returns
+    -------
+    specs : list of EngineSpec
+        The specs that could be parsed, in discovery order.
+
+    Raises
+    ------
+    RuntimeError
+        If no entry point could be found for one of the requested providers.
+    """
+    discovered = entry_points()
+    if hasattr(discovered, "select"):
+        candidates = discovered.select(group=SKLEARN_ENGINES_ENTRY_POINT)
+    else:
+        # No `select` method: fall back to a lookup of the group by key.
+        candidates = discovered.get(SKLEARN_ENGINES_ENTRY_POINT, ())
+
+    keep_all = provider_names is None
+    specs = []
+    for entry_point in candidates:
+        try:
+            spec = _parse_entry_point(entry_point)
+            # Entry points of providers that were not requested are ignored.
+            if keep_all or spec.provider_name in provider_names:
+                specs.append(spec)
+        except Exception as e:
+            # A broken package installed next to scikit-learn must not
+            # prevent the discovery of the other engines: warn and move on.
+            warnings.warn(
+                f"Invalid {SKLEARN_ENGINES_ENTRY_POINT} entry point"
+                f" {entry_point.name} with value {entry_point.value}: {e}"
+            )
+
+    if keep_all:
+        return specs
+
+    # Every explicitly requested provider must contribute at least one spec.
+    missing_providers = set(provider_names) - {spec.provider_name for spec in specs}
+    if missing_providers:
+        raise RuntimeError(
+            "Could not find any provider for the"
+            f" {SKLEARN_ENGINES_ENTRY_POINT} entry point with name(s):"
+            f" {', '.join(repr(p) for p in sorted(missing_providers))}"
+        )
+    return specs
+
+
+def list_engine_provider_names():
+    """Return the sorted names of all the discovered engine providers.
+
+    Only the entry point metadata is inspected: this function does not load
+    the engine classes and should not trigger any module import.
+
+    Returns
+    -------
+    provider_names : list of str
+        The unique provider names, in sorted order.
+    """
+    unique_names = set()
+    for spec in _parse_entry_points():
+        unique_names.add(spec.provider_name)
+    return sorted(unique_names)
+
+
+def _get_engine_classes(engine_name, provider_names, engine_specs, default):
+    """Yield the candidate engine classes for `engine_name`.
+
+    Candidates are yielded in the order of `provider_names` and the default
+    engine always comes last.
+
+    Parameters
+    ----------
+    engine_name : str
+        Name of the engine to look for.
+
+    provider_names : sequence of {str, class}
+        Enabled providers. A string is matched against the provider name of
+        the specs. A class is an ad-hoc engine: it is a candidate when its
+        `engine_name` attribute equals `engine_name`.
+
+    engine_specs : iterable of EngineSpec
+        Specs among which the string providers are looked up.
+
+    default : class
+        Engine class yielded last, under the provider name ``"default"``.
+
+    Yields
+    ------
+    provider : str or class
+        The provider name, or the class itself for an ad-hoc engine.
+
+    engine_class : class
+        The matching engine class. Classes described by a spec are only
+        loaded when their candidate is reached.
+    """
+    # Keep the first spec declared by each provider for this engine.
+    specs_by_provider = {}
+    for spec in engine_specs:
+        if spec.name == engine_name:
+            specs_by_provider.setdefault(spec.provider_name, spec)
+
+    for provider_name in provider_names:
+        if inspect.isclass(provider_name):
+            # The provider name is actually a ready-to-go engine class.
+            # Instead of a made up string to name this ad-hoc provider
+            # we use the class itself. This mirrors what the user used
+            # when they set the config (ad-hoc class or string naming
+            # a provider).
+            engine_class = provider_name
+            if getattr(engine_class, "engine_name", None) == engine_name:
+                yield engine_class, engine_class
+            # Specs are keyed by provider name strings: a class can never
+            # match one, and using it as a dict key would fail for classes
+            # that are not hashable.
+            continue
+
+        spec = specs_by_provider.get(provider_name)
+        if spec is not None:
+            yield spec.provider_name, spec.get_engine_class()
+
+    yield "default", default
+
+
+def get_engine_classes(engine_name, default, verbose=False):
+    """Iterate over the engine classes that can provide `engine_name`.
+
+    The candidates come from the `engine_provider` configuration: string
+    entries name providers whose engines are declared through entry points,
+    class entries are ad-hoc engine classes. The default engine is always
+    the last candidate.
+
+    Parameters
+    ----------
+    engine_name : str
+        The name of the algorithm for which to find engine classes.
+
+    default : class
+        The default engine class to use if no other provider is found.
+
+    verbose : bool, default=False
+        If True, print the name of each engine class that is tried. Nothing
+        is printed when no engine provider is enabled.
+
+    Yields
+    ------
+    provider : str or class
+        The "name" of each matching provider, as listed in the
+        `engine_provider` configuration: a string, or a class for an ad-hoc
+        provider. The default engine uses the name ``"default"``.
+
+    engine_class :
+        The engine class that implements the algorithm for the given provider.
+    """
+    enabled = get_config()["engine_provider"]
+
+    if not enabled:
+        # No provider enabled: the default engine is the only candidate.
+        yield "default", default
+        return
+
+    # Ad-hoc classes have no entry point, only strings are looked up. A tuple
+    # is passed because the entry point parsing is memoized on its argument.
+    named_providers = tuple(p for p in enabled if not inspect.isclass(p))
+    candidates = _get_engine_classes(
+        engine_name=engine_name,
+        provider_names=enabled,
+        engine_specs=_parse_entry_points(provider_names=named_providers),
+        default=default,
+    )
+    for provider, engine_class in candidates:
+        if verbose:
+            print(
+                f"trying engine {engine_class.__module__}.{engine_class.__qualname__}."
+            )
+        yield provider, engine_class
+
+
+def convert_attributes(method):
+    """Convert estimator attributes after calling the decorated method.
+
+    The attributes of an estimator can be stored in "engine native" types
+    (default) or "scikit-learn native" types. This decorator will call the
+    engine's conversion function when needed. Use this decorator on methods
+    that set estimator attributes.
+
+    The conversion only happens when the `engine_attributes` configuration is
+    set to ``"sklearn_types"``. The estimator is expected to expose
+    `_engine_class` (with a `convert_to_sklearn_types(name, value)` callable)
+    and `_default_engine`.
+
+    Parameters
+    ----------
+    method : callable
+        The estimator method to wrap. It is called as
+        ``method(self, *args, **kwargs)``.
+
+    Returns
+    -------
+    wrapper : callable
+        The wrapped method. It returns the value returned by `method`.
+    """
+
+    @wraps(method)
+    def wrapper(self, *args, **kwargs):
+        result = method(self, *args, **kwargs)
+
+        if get_config()["engine_attributes"] != "sklearn_types":
+            # Attributes are kept in the engine native types.
+            return result
+
+        engine = self._engine_class
+        # Iterate over a snapshot: attributes are re-assigned in the loop and
+        # a dict must not change size while its live view is being iterated.
+        for name, value in list(vars(self).items()):
+            # All attributes are passed to the engine, which can
+            # either convert the value (engine specific types) or
+            # return it as is (native Python types)
+            setattr(self, name, engine.convert_to_sklearn_types(name, value))
+
+        # No matter which engine was used to fit, after the attribute
+        # conversion to the sklearn native types the default engine
+        # is used.
+        self._engine_class = self._default_engine
+        self._engine_provider = "default"
+        return result
+
+    return wrapper
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from .base import convert_attributes, get_engine_classes, list_engine_provider_names

		__all__ = ["convert_attributes", "get_engine_classes", "list_engine_provider_names"]