Commit ffd431b

Extending mpi4jax with XPU support (#226)
1 parent 72bb133 commit ffd431b

40 files changed: +1661 / -360 lines

.github/workflows/build-xpu-ext.yml

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+name: Build XPU extensions
+
+on:
+  pull_request:
+
+  push:
+    branches:
+      - master
+
+jobs:
+  build:
+    runs-on: ubuntu-22.04
+
+    strategy:
+      fail-fast: false
+
+    steps:
+      - uses: actions/checkout@v2
+
+      # make sure tags are fetched so we can get a version
+      - run: |
+          git fetch --prune --unshallow --tags
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.x'
+
+      - name: Install OneAPI components
+        run: |
+          wget -nv https://registrationcenter-download.intel.com/akdlm/IRC_NAS/bb99984f-370f-413d-bbec-38928d2458f2/l_dpcpp-cpp-compiler_p_2024.0.2.29_offline.sh -P $HOME/basekit
+          chmod +x $HOME/basekit/l_dpcpp-cpp-compiler_p_2024.0.2.29_offline.sh
+          bash $HOME/basekit/l_dpcpp-cpp-compiler_p_2024.0.2.29_offline.sh -f "$HOME/basekit" -a --install-dir "$HOME/basekit" --eula=accept --silent
+        shell: bash
+
+      - name: Setup MPI (mpich)
+        uses: mpi4py/setup-mpi@v1
+        with:
+          mpi: mpich
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install setuptools wheel mpi4py cython
+
+      - name: Build XPU extensions
+        run: |
+          source $HOME/basekit/setvars.sh
+          python setup.py build_ext --inplace
+          test -f mpi4jax/_src/xla_bridge/mpi_xla_bridge_xpu*.so

docs/installation.rst

Lines changed: 18 additions & 3 deletions
@@ -62,14 +62,14 @@ usually sufficient to specify the ``MPICC`` environment variable *before* instal
 In doubt, please refer to `the mpi4py documentation <https://mpi4py.readthedocs.io/en/stable/install.html>`_.
 
 
-Installation with GPU support
------------------------------
+Installation with NVIDIA GPU support (CUDA)
+-------------------------------------------
 
 .. note::
 
    To use JAX on the GPU, make sure that your ``jaxlib`` is `built with CUDA support <https://github.com/google/jax#installation>`_.
 
-``mpi4jax`` also supports JAX arrays stored in GPU memory.
+``mpi4jax`` supports communication of JAX arrays stored in GPU memory.
 
 To build ``mpi4jax``'s GPU extensions, we need to be able to locate the CUDA headers on your system. If they are not detected automatically, you can set the environment variable :envvar:`CUDA_ROOT` when installing ``mpi4jax``::
 
@@ -86,3 +86,18 @@ If this is a bottleneck in your application, you can build MPI with CUDA support
 .. seealso::
 
    Read :ref:`here <gpu-usage>` on how to use zero-copy GPU communication after installation.
+
+
+Installation with Intel GPU/XPU support
+---------------------------------------
+
+``mpi4jax`` supports communication of JAX arrays stored in Intel GPU/XPU memory, via JAX's ``xpu`` backend.
+
+**Requirements:**
+
+- `Intel extension for OpenXLA <https://github.com/intel/intel-extension-for-openxla>`__, version 0.3.0 or newer.
+- SYCL headers and libraries, which come as part of the `Intel oneAPI Base Toolkit <https://www.intel.com/content/www/us/en/developer/tools/oneapi/ai-analytics-toolkit.html>`__.
+- Optionally, `Intel MPI <https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/mpi-library.html>`__ with Intel XPU/GPU support.
+  To leverage this, you also need to rebuild `mpi4py <https://mpi4py.readthedocs.io/en/stable/install.html>`__ to ensure it is linked against the XPU/GPU-aware MPI implementation.
+
+An example setup can be found in the `mpi4jax test suite <https://github.com/mpi4jax/mpi4jax/tree/master/.github/workflows/build-xpu-ext.yml>`__.
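
Once the extension builds, a quick way to confirm that the XPU path is available is to query the ``has_sycl_support()`` helper exported by this commit and ask JAX for its ``xpu`` devices. A minimal sketch; the ``jax.devices("xpu")`` call assumes the Intel extension for OpenXLA has registered the ``xpu`` platform on your machine:

# check_xpu.py -- sanity check after building mpi4jax with XPU support
import jax
import mpi4jax

# True only if the SYCL/XPU bridge extension was built and can be imported
print("mpi4jax SYCL support:", mpi4jax.has_sycl_support())

# Lists Intel GPU/XPU devices; raises RuntimeError if no "xpu" platform is
# registered with JAX (i.e. the Intel extension for OpenXLA is missing)
print("JAX xpu devices:", jax.devices("xpu"))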

docs/sharp-bits.rst

Lines changed: 23 additions & 0 deletions
@@ -78,6 +78,29 @@ Data will then be copied directly from GPU to GPU. If your MPI library
 does not have CUDA support, you will receive a segmentation fault when
 trying to access GPU memory.
 
+Using Intel XPU-aware MPI
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+``mpi4jax`` is able to communicate data directly from and to Intel XPU
+and Intel GPU memory. This requires an MPI installation that is
+Intel GPU/XPU aware (i.e. MPI calls can work directly with XPU/GPU memory)
+and that JAX and `mpi4jax are built with Intel XPU
+support <installation>`__.
+
+Currently, we cannot detect whether MPI is XPU/GPU aware. Therefore, by
+default, ``mpi4jax`` will not read directly from XPU/GPU memory, but
+instead copy to the CPU and back.
+
+If you are certain that the underlying MPI library is XPU/GPU aware,
+you can set the following environment variable:
+
+.. code:: bash
+
+   $ export MPI4JAX_USE_SYCL_MPI=1
+
+Data will then be copied directly from XPU to XPU. If your MPI library
+cannot work with Intel GPU/XPU buffers, you will receive a segmentation
+fault when trying to access the GPU/XPU memory.
 
 Using ``mpi4jax`` *and* ``mpi4py``
 ----------------------------------
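
To make the new XPU-aware MPI passage above concrete, here is a rough sketch of a zero-copy run. ``mpi4jax.allreduce`` and the ``MPI4JAX_USE_SYCL_MPI`` switch come from this commit and the existing API; placing the array on an ``xpu`` device via ``jax.device_put`` is an assumption about how the Intel extension for OpenXLA exposes devices, and whether ``allreduce`` also returns a token depends on your mpi4jax version:

# allreduce_xpu.py -- run with:
#   MPI4JAX_USE_SYCL_MPI=1 mpirun -n 2 python allreduce_xpu.py
from mpi4py import MPI
import jax
import jax.numpy as jnp
import mpi4jax

comm = MPI.COMM_WORLD

# place the input in Intel GPU/XPU memory (assumes an "xpu" platform is registered)
x = jax.device_put(jnp.ones(8), jax.devices("xpu")[0])

# with MPI4JAX_USE_SYCL_MPI=1 and an XPU-aware MPI, the transfer happens
# directly between device buffers; otherwise mpi4jax stages through the CPU
result = mpi4jax.allreduce(x, op=MPI.SUM, comm=comm)
print(comm.Get_rank(), result)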

mpi4jax/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -20,6 +20,7 @@
     send,
     sendrecv,
     has_cuda_support,
+    has_sycl_support,
 )
 
 __all__ = [
@@ -36,4 +37,5 @@
     "send",
     "sendrecv",
     "has_cuda_support",
+    "has_sycl_support",
 ]

mpi4jax/_src/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@
 from .collective_ops.send import send  # noqa: F401, E402
 from .collective_ops.sendrecv import sendrecv  # noqa: F401, E402
 
-from .utils import has_cuda_support  # noqa: F401, E402
+from .utils import has_cuda_support, has_sycl_support  # noqa: F401, E402
 
 # sanitize namespace
 del jax_compat, xla_bridge, MPI, atexit, flush

mpi4jax/_src/collective_ops/allgather.py

Lines changed: 13 additions & 5 deletions
@@ -21,10 +21,16 @@
     prefer_notoken,
 )
 from ..jax_compat import custom_call, token_type, ShapedArray
-from ..decorators import translation_rule_cpu, translation_rule_gpu
+from ..decorators import (
+    translation_rule_cpu,
+    translation_rule_gpu,
+    translation_rule_xpu,
+)
 from ..validation import enforce_types
 from ..comm import get_default_comm
 
+from ..xla_bridge.device_descriptors import build_allgather_descriptor
+
 # The Jax primitive
 mpi_allgather_p = Primitive("allgather_mpi")  # Create the primitive
 mpi_allgather_impl = default_primitive_impl(mpi_allgather_p)
@@ -128,10 +134,7 @@ def mpi_allgather_xla_encode_cpu(ctx, sendbuf, token, comm):
     ).results
 
 
-@translation_rule_gpu
-def mpi_allgather_xla_encode_gpu(ctx, sendbuf, token, comm):
-    from ..xla_bridge.mpi_xla_bridge_gpu import build_allgather_descriptor
-
+def mpi_allgather_xla_encode_device(ctx, sendbuf, token, comm):
     comm = unpack_hashable(comm)
 
     sendbuf_aval, *_ = ctx.avals_in
@@ -177,6 +180,10 @@ def mpi_allgather_xla_encode_gpu(ctx, sendbuf, token, comm):
     ).results
 
 
+mpi_allgather_xla_encode_xpu = translation_rule_xpu(mpi_allgather_xla_encode_device)
+mpi_allgather_xla_encode_gpu = translation_rule_gpu(mpi_allgather_xla_encode_device)
+
+
 # This function evaluates only the shapes during AST construction
 def mpi_allgather_abstract_eval(x, token, comm):
     comm = unpack_hashable(comm)
@@ -194,3 +201,4 @@ def mpi_allgather_abstract_eval(x, token, comm):
 
 mlir.register_lowering(mpi_allgather_p, mpi_allgather_xla_encode_cpu, platform="cpu")
 mlir.register_lowering(mpi_allgather_p, mpi_allgather_xla_encode_gpu, platform="cuda")
+mlir.register_lowering(mpi_allgather_p, mpi_allgather_xla_encode_xpu, platform="xpu")
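
The registration for ``platform="xpu"`` above is what lets this primitive be traced and compiled by XLA on the Intel backend, just as on CPU and CUDA; the same refactoring (a shared ``*_xla_encode_device`` lowering wrapped by ``translation_rule_gpu`` / ``translation_rule_xpu``) repeats in the collectives below. A rough usage sketch, assuming an ``xpu``-enabled JAX install; whether ``allgather`` also returns a token depends on the mpi4jax version:

# allgather_xpu.py -- run with: mpirun -n 2 python allgather_xpu.py
from mpi4py import MPI
import jax
import jax.numpy as jnp
import mpi4jax

comm = MPI.COMM_WORLD

@jax.jit
def gather_ranks(x):
    # when compiled for the Intel backend, this lowers to the custom call
    # registered above for platform="xpu"
    return mpi4jax.allgather(x, comm=comm)

x = jax.device_put(jnp.array([comm.Get_rank()]), jax.devices("xpu")[0])
print(gather_ranks(x))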

mpi4jax/_src/collective_ops/allreduce.py

Lines changed: 12 additions & 5 deletions
@@ -22,10 +22,15 @@
     prefer_notoken,
 )
 from ..jax_compat import custom_call, token_type, ShapedArray
-from ..decorators import translation_rule_cpu, translation_rule_gpu
+from ..decorators import (
+    translation_rule_cpu,
+    translation_rule_gpu,
+    translation_rule_xpu,
+)
 from ..validation import enforce_types
 from ..comm import get_default_comm
 
+from ..xla_bridge.device_descriptors import build_allreduce_descriptor
 
 # The Jax primitive
 mpi_allreduce_p = Primitive("allreduce_mpi")  # Create the primitive
@@ -122,10 +127,7 @@ def mpi_allreduce_xla_encode_cpu(ctx, x, token, op, comm, transpose):
     ).results
 
 
-@translation_rule_gpu
-def mpi_allreduce_xla_encode_gpu(ctx, x, token, op, comm, transpose):
-    from ..xla_bridge.mpi_xla_bridge_gpu import build_allreduce_descriptor
-
+def mpi_allreduce_xla_encode_device(ctx, x, token, op, comm, transpose):
    op = unpack_hashable(op)
    comm = unpack_hashable(comm)

@@ -171,6 +173,10 @@ def mpi_allreduce_xla_encode_gpu(ctx, x, token, op, comm, transpose):
     ).results
 
 
+mpi_allreduce_xla_encode_gpu = translation_rule_gpu(mpi_allreduce_xla_encode_device)
+mpi_allreduce_xla_encode_xpu = translation_rule_xpu(mpi_allreduce_xla_encode_device)
+
+
 # This function evaluates only the shapes during AST construction
 def mpi_allreduce_abstract_eval(xs, token, op, comm, transpose):
     return (
@@ -230,3 +236,4 @@ def mpi_allreduce_transpose_rule(tan_args, *x_args, op, comm, transpose):
 # assign to the primitive the correct encoder
 mlir.register_lowering(mpi_allreduce_p, mpi_allreduce_xla_encode_cpu, platform="cpu")
 mlir.register_lowering(mpi_allreduce_p, mpi_allreduce_xla_encode_gpu, platform="cuda")
+mlir.register_lowering(mpi_allreduce_p, mpi_allreduce_xla_encode_xpu, platform="xpu")

mpi4jax/_src/collective_ops/alltoall.py

Lines changed: 12 additions & 6 deletions
@@ -21,10 +21,14 @@
     prefer_notoken,
 )
 from ..jax_compat import custom_call, token_type, ShapedArray
-from ..decorators import translation_rule_cpu, translation_rule_gpu
+from ..decorators import (
+    translation_rule_cpu,
+    translation_rule_gpu,
+    translation_rule_xpu,
+)
 from ..validation import enforce_types
 from ..comm import get_default_comm
-
+from ..xla_bridge.device_descriptors import build_alltoall_descriptor
 
 # The Jax primitive
 mpi_alltoall_p = Primitive("alltoall_mpi")  # Create the primitive
@@ -129,10 +133,7 @@ def mpi_alltoall_xla_encode_cpu(ctx, x, token, comm):
     ).results
 
 
-@translation_rule_gpu
-def mpi_alltoall_xla_encode_gpu(ctx, x, token, comm):
-    from ..xla_bridge.mpi_xla_bridge_gpu import build_alltoall_descriptor
-
+def mpi_alltoall_xla_encode_device(ctx, x, token, comm):
     comm = unpack_hashable(comm)
 
     x_aval, *_ = ctx.avals_in
@@ -180,6 +181,10 @@
     ).results
 
 
+mpi_alltoall_xla_encode_xpu = translation_rule_xpu(mpi_alltoall_xla_encode_device)
+mpi_alltoall_xla_encode_gpu = translation_rule_gpu(mpi_alltoall_xla_encode_device)
+
+
 # This function evaluates only the shapes during AST construction
 def mpi_alltoall_abstract_eval(xs, token, comm):
     return (
@@ -195,3 +200,4 @@ def mpi_alltoall_abstract_eval(xs, token, comm):
 # assign to the primitive the correct encoder
 mlir.register_lowering(mpi_alltoall_p, mpi_alltoall_xla_encode_cpu, platform="cpu")
 mlir.register_lowering(mpi_alltoall_p, mpi_alltoall_xla_encode_gpu, platform="cuda")
+mlir.register_lowering(mpi_alltoall_p, mpi_alltoall_xla_encode_xpu, platform="xpu")

mpi4jax/_src/collective_ops/barrier.py

Lines changed: 12 additions & 5 deletions
@@ -20,9 +20,14 @@
     prefer_notoken,
 )
 from ..jax_compat import custom_call, token_type
-from ..decorators import translation_rule_cpu, translation_rule_gpu
+from ..decorators import (
+    translation_rule_cpu,
+    translation_rule_gpu,
+    translation_rule_xpu,
+)
 from ..validation import enforce_types
 from ..comm import get_default_comm
+from ..xla_bridge.device_descriptors import build_barrier_descriptor
 
 
 # The Jax primitive
@@ -89,10 +94,7 @@ def mpi_barrier_xla_encode_cpu(ctx, token, comm):
     ).results
 
 
-@translation_rule_gpu
-def mpi_barrier_xla_encode_gpu(ctx, token, comm):
-    from ..xla_bridge.mpi_xla_bridge_gpu import build_barrier_descriptor
-
+def mpi_barrier_xla_encode_device(ctx, token, comm):
     comm = unpack_hashable(comm)
 
     out_types = token_type()
@@ -112,6 +114,10 @@ def mpi_barrier_xla_encode_gpu(ctx, token, comm):
     ).results
 
 
+mpi_barrier_xla_encode_xpu = translation_rule_xpu(mpi_barrier_xla_encode_device)
+mpi_barrier_xla_encode_gpu = translation_rule_gpu(mpi_barrier_xla_encode_device)
+
+
 # This function evaluates only the shapes during AST construction
 def mpi_barrier_abstract_eval(token, comm):
     return core.abstract_token, {effect}
@@ -131,3 +137,4 @@ def mpi_barrier_batch_eval(in_args, batch_axes, comm):
 # assign to the primitive the correct encoder
 mlir.register_lowering(mpi_barrier_p, mpi_barrier_xla_encode_cpu, platform="cpu")
 mlir.register_lowering(mpi_barrier_p, mpi_barrier_xla_encode_gpu, platform="cuda")
+mlir.register_lowering(mpi_barrier_p, mpi_barrier_xla_encode_xpu, platform="xpu")

mpi4jax/_src/collective_ops/bcast.py

Lines changed: 12 additions & 5 deletions
@@ -21,9 +21,14 @@
     prefer_notoken,
 )
 from ..jax_compat import custom_call, token_type, ShapedArray
-from ..decorators import translation_rule_cpu, translation_rule_gpu
+from ..decorators import (
+    translation_rule_cpu,
+    translation_rule_gpu,
+    translation_rule_xpu,
+)
 from ..validation import enforce_types
 from ..comm import get_default_comm
+from ..xla_bridge.device_descriptors import build_bcast_descriptor
 
 
 # The Jax primitive
@@ -126,10 +131,7 @@ def mpi_bcast_xla_encode_cpu(ctx, x, token, root, comm):
     ).results
 
 
-@translation_rule_gpu
-def mpi_bcast_xla_encode_gpu(ctx, x, token, root, comm):
-    from ..xla_bridge.mpi_xla_bridge_gpu import build_bcast_descriptor
-
+def mpi_bcast_xla_encode_device(ctx, x, token, root, comm):
     comm = unpack_hashable(comm)
 
     x_aval, *_ = ctx.avals_in
@@ -176,6 +178,10 @@ def mpi_bcast_xla_encode_gpu(ctx, x, token, root, comm):
     ).results
 
 
+mpi_bcast_xla_encode_xpu = translation_rule_xpu(mpi_bcast_xla_encode_device)
+mpi_bcast_xla_encode_gpu = translation_rule_gpu(mpi_bcast_xla_encode_device)
+
+
 # This function evaluates only the shapes during AST construction
 def mpi_bcast_abstract_eval(xs, token, root, comm):
     comm = unpack_hashable(comm)
@@ -199,3 +205,4 @@
 # assign to the primitive the correct encoder
 mlir.register_lowering(mpi_bcast_p, mpi_bcast_xla_encode_cpu, platform="cpu")
 mlir.register_lowering(mpi_bcast_p, mpi_bcast_xla_encode_gpu, platform="cuda")
+mlir.register_lowering(mpi_bcast_p, mpi_bcast_xla_encode_xpu, platform="xpu")
