From aa0a90b25c682a89c1aa7acadf1615ef4764c978 Mon Sep 17 00:00:00 2001 From: Miroslaw Oksiucik Date: Wed, 30 Jul 2025 14:08:55 +0300 Subject: [PATCH 01/12] Support XPU in --nproc-per-node option to torchrun Support both --nproc-per-node=xpu and autodetection of XPU device in case of --nproc-per-node=auto --- torch/distributed/run.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index c37ecd8f72d8..e995fbfb390e 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -77,7 +77,9 @@ .. note:: ``--nproc-per-node`` may be ``"gpu"`` (spawn one process per GPU), ``"cpu"`` (spawn one process per CPU), + ``"xpu"`` (spawn one process per XPU), ``"auto"`` (equivalent to ``"gpu"`` if CUDA is available, + else equivalent to ``"xpu"`` if XPU is available, else equivalent to ``"cpu"``), or an integer specifying the number of processes. See `torch.distributed.run.determine_local_world_size @@ -413,7 +415,7 @@ def get_args_parser() -> ArgumentParser: action=env, type=str, default="1", - help="Number of workers per node; supported values: [auto, cpu, gpu, int].", + help="Number of workers per node; supported values: [auto, cpu, gpu, xpu, int].", ) # @@ -694,6 +696,11 @@ def determine_local_world_size(nproc_per_node: str): raise ValueError("Cuda is not available.") from e device_type = "gpu" num_proc = torch.cuda.device_count() + elif nproc_per_node == "xpu": + if not torch.xpu.is_available(): + raise ValueError("Xpu is not available.") from e + device_type = "xpu" + num_proc = torch.xpu.device_count() elif nproc_per_node == torch._C._get_privateuse1_backend_name(): if not _get_custom_mod_func("is_available")(): raise ValueError(f"{nproc_per_node} is not available.") from e @@ -703,6 +710,9 @@ def determine_local_world_size(nproc_per_node: str): if torch.cuda.is_available(): num_proc = torch.cuda.device_count() device_type = "gpu" + elif torch.xpu.is_available(): + num_proc = torch.xpu.device_count() + device_type = "xpu" elif ( hasattr(torch, torch._C._get_privateuse1_backend_name()) and _get_custom_mod_func("is_available")() From 249f7c23ce9f0460d332fa54973982100425fa8e Mon Sep 17 00:00:00 2001 From: Miroslaw Oksiucik Date: Thu, 31 Jul 2025 12:51:50 +0300 Subject: [PATCH 02/12] Apply review comment --- torch/distributed/run.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index e995fbfb390e..42d7ae7b4ef7 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -707,18 +707,9 @@ def determine_local_world_size(nproc_per_node: str): device_type = nproc_per_node num_proc = _get_custom_mod_func("device_count")() elif nproc_per_node == "auto": - if torch.cuda.is_available(): - num_proc = torch.cuda.device_count() - device_type = "gpu" - elif torch.xpu.is_available(): - num_proc = torch.xpu.device_count() - device_type = "xpu" - elif ( - hasattr(torch, torch._C._get_privateuse1_backend_name()) - and _get_custom_mod_func("is_available")() - ): - num_proc = _get_custom_mod_func("device_count")() - device_type = torch._C._get_privateuse1_backend_name() + if torch.accelerator.is_available(): + num_proc = torch.accelerator.device_count() + device_type = torch.accelerator.current_accelerator().type else: num_proc = os.cpu_count() device_type = "cpu" From e38a821c462ee8461164d5a9f83fe3c923a0bc0c Mon Sep 17 00:00:00 2001 From: Miroslaw Oksiucik Date: Thu, 31 Jul 2025 15:38:11 +0300 Subject: [PATCH 03/12] Fix lint error 
--- torch/distributed/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index 42d7ae7b4ef7..3a2d0d32a730 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -707,7 +707,7 @@ def determine_local_world_size(nproc_per_node: str): device_type = nproc_per_node num_proc = _get_custom_mod_func("device_count")() elif nproc_per_node == "auto": - if torch.accelerator.is_available(): + if torch.accelerator.is_available() and torch.accelerator.current_accelerator(): num_proc = torch.accelerator.device_count() device_type = torch.accelerator.current_accelerator().type else: From 3bea66838c486f89bfbae1deccceaee1ddc0a55d Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" <106960996+guangyey@users.noreply.github.com> Date: Fri, 1 Aug 2025 10:07:47 +0800 Subject: [PATCH 04/12] Update torch/distributed/run.py --- torch/distributed/run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index 3a2d0d32a730..bd1dfdb2a02f 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -707,9 +707,9 @@ def determine_local_world_size(nproc_per_node: str): device_type = nproc_per_node num_proc = _get_custom_mod_func("device_count")() elif nproc_per_node == "auto": - if torch.accelerator.is_available() and torch.accelerator.current_accelerator(): + if torch.accelerator.is_available(): num_proc = torch.accelerator.device_count() - device_type = torch.accelerator.current_accelerator().type + device_type = torch.accelerator.current_accelerator().type # type: ignore[union-attr] else: num_proc = os.cpu_count() device_type = "cpu" From a29dba11a45fadeda0414e456cf30dd598d0d96d Mon Sep 17 00:00:00 2001 From: Miroslaw Oksiucik Date: Mon, 4 Aug 2025 11:11:44 +0300 Subject: [PATCH 05/12] debug --- torch/distributed/run.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index bd1dfdb2a02f..09439022b8a0 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -713,6 +713,11 @@ def determine_local_world_size(nproc_per_node: str): else: num_proc = os.cpu_count() device_type = "cpu" + ### CI ERROR DEBUG CODE BEGIN ### + print(f"nproc_per_node=auto: {device_type = }, {num_proc = }, {os.cpu_count() = }") + if torch.cuda.is_available(): + print(f"{torch.cuda.device_count() = }") + ### CI ERROR DEBUG CODE END ### else: raise ValueError( f"Unsupported nproc_per_node value: {nproc_per_node}" From 9123bf0fd01ee23587f5383bed598eca56484af5 Mon Sep 17 00:00:00 2001 From: Miroslaw Oksiucik Date: Tue, 5 Aug 2025 11:50:41 +0300 Subject: [PATCH 06/12] Revert "debug" This reverts commit a29dba11a45fadeda0414e456cf30dd598d0d96d. 
--- torch/distributed/run.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index 09439022b8a0..bd1dfdb2a02f 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -713,11 +713,6 @@ def determine_local_world_size(nproc_per_node: str): else: num_proc = os.cpu_count() device_type = "cpu" - ### CI ERROR DEBUG CODE BEGIN ### - print(f"nproc_per_node=auto: {device_type = }, {num_proc = }, {os.cpu_count() = }") - if torch.cuda.is_available(): - print(f"{torch.cuda.device_count() = }") - ### CI ERROR DEBUG CODE END ### else: raise ValueError( f"Unsupported nproc_per_node value: {nproc_per_node}" From 98012b6567925b822b33b846cb1b913bfc7e7cf7 Mon Sep 17 00:00:00 2001 From: Miroslaw Oksiucik Date: Tue, 5 Aug 2025 11:46:10 +0300 Subject: [PATCH 07/12] Check cuda before torch.accelerator --- torch/distributed/run.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index bd1dfdb2a02f..0fee9a028703 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -707,7 +707,10 @@ def determine_local_world_size(nproc_per_node: str): device_type = nproc_per_node num_proc = _get_custom_mod_func("device_count")() elif nproc_per_node == "auto": - if torch.accelerator.is_available(): + if torch.cuda.is_available(): + num_proc = torch.cuda.device_count() + device_type = "gpu" + elif torch.accelerator.is_available(): num_proc = torch.accelerator.device_count() device_type = torch.accelerator.current_accelerator().type # type: ignore[union-attr] else: From 7b0e8cfbe64250e41598d2d66dfda5bb3538e948 Mon Sep 17 00:00:00 2001 From: Miroslaw Oksiucik Date: Tue, 5 Aug 2025 12:39:02 +0300 Subject: [PATCH 08/12] Revert "Check cuda before torch.accelerator" This reverts commit 98012b6567925b822b33b846cb1b913bfc7e7cf7. 
--- torch/distributed/run.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index 0fee9a028703..bd1dfdb2a02f 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -707,10 +707,7 @@ def determine_local_world_size(nproc_per_node: str): device_type = nproc_per_node num_proc = _get_custom_mod_func("device_count")() elif nproc_per_node == "auto": - if torch.cuda.is_available(): - num_proc = torch.cuda.device_count() - device_type = "gpu" - elif torch.accelerator.is_available(): + if torch.accelerator.is_available(): num_proc = torch.accelerator.device_count() device_type = torch.accelerator.current_accelerator().type # type: ignore[union-attr] else: From 3be536ad6ef08f59863c9e18493c7a706d94187c Mon Sep 17 00:00:00 2001 From: Miroslaw Oksiucik Date: Tue, 5 Aug 2025 12:38:10 +0300 Subject: [PATCH 09/12] Adjust unit tests --- test/distributed/launcher/test_run.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/distributed/launcher/test_run.py b/test/distributed/launcher/test_run.py index f71bffd527c1..d394828aa1ea 100644 --- a/test/distributed/launcher/test_run.py +++ b/test/distributed/launcher/test_run.py @@ -273,10 +273,25 @@ def test_nproc_launch_unknown_configurations(self): ) @patch("torch.cuda.is_available", return_value=True) @patch("torch.cuda.device_count", return_value=3) + @patch("torch.accelerator.is_available", return_value=True) + @patch("torch.accelerator.device_count", return_value=3) + @patch("torch.accelerator.current_accelerator", return_value=MagicMock(type="gpu")) def test_nproc_gpu_launch_configurations(self, _mock1, _mock2): self._test_nproc_launch_configuration("auto", 3) self._test_nproc_launch_configuration("gpu", 3) + @skip_but_pass_in_sandcastle_if( + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" + ) + @patch("torch.xpu.is_available", return_value=True) + @patch("torch.xpu.device_count", return_value=3) + @patch("torch.accelerator.is_available", return_value=True) + @patch("torch.accelerator.device_count", return_value=3) + @patch("torch.accelerator.current_accelerator", return_value=MagicMock(type="xpu")) + def test_nproc_gpu_launch_configurations(self, _mock1, _mock2): + self._test_nproc_launch_configuration("auto", 3) + self._test_nproc_launch_configuration("xpu", 3) + @skip_but_pass_in_sandcastle_if( TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) From c9947883c4316ff44a98f4139de26bdd305de876 Mon Sep 17 00:00:00 2001 From: Miroslaw Oksiucik Date: Tue, 5 Aug 2025 12:54:09 +0300 Subject: [PATCH 10/12] Typo in test name --- test/distributed/launcher/test_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/distributed/launcher/test_run.py b/test/distributed/launcher/test_run.py index d394828aa1ea..f5ae40cea109 100644 --- a/test/distributed/launcher/test_run.py +++ b/test/distributed/launcher/test_run.py @@ -288,7 +288,7 @@ def test_nproc_gpu_launch_configurations(self, _mock1, _mock2): @patch("torch.accelerator.is_available", return_value=True) @patch("torch.accelerator.device_count", return_value=3) @patch("torch.accelerator.current_accelerator", return_value=MagicMock(type="xpu")) - def test_nproc_gpu_launch_configurations(self, _mock1, _mock2): + def test_nproc_xpu_launch_configurations(self, _mock1, _mock2): self._test_nproc_launch_configuration("auto", 3) self._test_nproc_launch_configuration("xpu", 3) From 696d34b8a54ecbd5b80b2a570c7c2603c3de5d41 Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" 
<106960996+guangyey@users.noreply.github.com> Date: Tue, 5 Aug 2025 18:51:00 +0800 Subject: [PATCH 11/12] Update test/distributed/launcher/test_run.py --- test/distributed/launcher/test_run.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/distributed/launcher/test_run.py b/test/distributed/launcher/test_run.py index f5ae40cea109..46a8cb659af2 100644 --- a/test/distributed/launcher/test_run.py +++ b/test/distributed/launcher/test_run.py @@ -276,7 +276,9 @@ def test_nproc_launch_unknown_configurations(self): @patch("torch.accelerator.is_available", return_value=True) @patch("torch.accelerator.device_count", return_value=3) @patch("torch.accelerator.current_accelerator", return_value=MagicMock(type="gpu")) - def test_nproc_gpu_launch_configurations(self, _mock1, _mock2): + def test_nproc_gpu_launch_configurations( + self, _mock1, _mock2, _mock3, _mock4, _mock5 + ): self._test_nproc_launch_configuration("auto", 3) self._test_nproc_launch_configuration("gpu", 3) From b7b86beefc4e46da0a5a59b172c7bc36ca0e9c85 Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" <106960996+guangyey@users.noreply.github.com> Date: Tue, 5 Aug 2025 18:51:13 +0800 Subject: [PATCH 12/12] Update test/distributed/launcher/test_run.py --- test/distributed/launcher/test_run.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/distributed/launcher/test_run.py b/test/distributed/launcher/test_run.py index 46a8cb659af2..d271e60954ae 100644 --- a/test/distributed/launcher/test_run.py +++ b/test/distributed/launcher/test_run.py @@ -290,7 +290,9 @@ def test_nproc_gpu_launch_configurations( @patch("torch.accelerator.is_available", return_value=True) @patch("torch.accelerator.device_count", return_value=3) @patch("torch.accelerator.current_accelerator", return_value=MagicMock(type="xpu")) - def test_nproc_xpu_launch_configurations(self, _mock1, _mock2): + def test_nproc_xpu_launch_configurations( + self, _mock1, _mock2, _mock3, _mock4, _mock5 + ): self._test_nproc_launch_configuration("auto", 3) self._test_nproc_launch_configuration("xpu", 3)
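
Note: below is a minimal standalone sketch (not the torchrun source) of how determine_local_world_size resolves the device type and worker count once this series is applied. The helper name resolve_nproc_per_node is invented for the illustration, the exception chaining ("from e") of the real function is dropped, and a PyTorch build exposing the torch.accelerator and torch.xpu APIs used above is assumed.

import os

import torch


def resolve_nproc_per_node(nproc_per_node: str) -> tuple[str, int]:
    """Return (device_type, num_proc) the way the patched torchrun would."""
    if nproc_per_node == "xpu":
        # Explicit --nproc-per-node=xpu: one worker per XPU device.
        if not torch.xpu.is_available():
            raise ValueError("Xpu is not available.")
        return "xpu", torch.xpu.device_count()
    if nproc_per_node == "auto":
        # "auto" now goes through torch.accelerator, which covers CUDA, XPU
        # and custom (PrivateUse1) backends in a single call, falling back
        # to one worker per CPU when no accelerator is present.
        if torch.accelerator.is_available():
            # current_accelerator() is Optional, hence the
            # "type: ignore[union-attr]" added in PATCH 04/12.
            device_type = torch.accelerator.current_accelerator().type  # e.g. "cuda" or "xpu"
            return device_type, torch.accelerator.device_count()
        return "cpu", os.cpu_count() or 1
    # The "cpu", "gpu", custom-backend and integer branches are untouched by
    # the series and omitted here.
    raise ValueError(f"Unsupported nproc_per_node value: {nproc_per_node}")


if __name__ == "__main__":
    print(resolve_nproc_per_node("auto"))

On a machine with XPU devices, both --nproc-per-node=xpu and --nproc-per-node=auto would therefore spawn torch.xpu.device_count() workers.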
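
Note: the widened test signatures in PATCH 11/12 follow from how stacked unittest.mock.patch decorators feed the test method: each decorator injects one positional mock, applied bottom-up, so five decorators require five parameters. The following is a hedged, standalone illustration; fake_xpu_test is an invented name, while the patch targets and MagicMock(type="xpu") mirror the test above.

from unittest.mock import MagicMock, patch

import torch


# The decorator closest to the function supplies the first argument.
@patch("torch.xpu.is_available", return_value=True)            # -> _mock5
@patch("torch.xpu.device_count", return_value=3)               # -> _mock4
@patch("torch.accelerator.is_available", return_value=True)    # -> _mock3
@patch("torch.accelerator.device_count", return_value=3)       # -> _mock2
@patch("torch.accelerator.current_accelerator", return_value=MagicMock(type="xpu"))  # -> _mock1
def fake_xpu_test(_mock1, _mock2, _mock3, _mock4, _mock5):
    # Inside the patched scope, torchrun's "auto" path would see an XPU
    # accelerator with three devices.
    assert torch.accelerator.current_accelerator().type == "xpu"
    assert torch.accelerator.device_count() == 3


fake_xpu_test()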