From aa0a90b25c682a89c1aa7acadf1615ef4764c978 Mon Sep 17 00:00:00 2001 From: Miroslaw Oksiucik Date: Wed, 30 Jul 2025 14:08:55 +0300 Subject: [PATCH 01/12] Support XPU in --nproc-per-node option to torchrun Support both --nproc-per-node=xpu and autodetection of XPU device in case of --nproc-per-node=auto --- torch/distributed/run.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index c37ecd8f72d8..e995fbfb390e 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -77,7 +77,9 @@ .. note:: ``--nproc-per-node`` may be ``"gpu"`` (spawn one process per GPU), ``"cpu"`` (spawn one process per CPU), + ``"xpu"`` (spawn one process per XPU), ``"auto"`` (equivalent to ``"gpu"`` if CUDA is available, + else equivalent to ``"xpu"`` if XPU is available, else equivalent to ``"cpu"``), or an integer specifying the number of processes. See `torch.distributed.run.determine_local_world_size @@ -413,7 +415,7 @@ def get_args_parser() -> ArgumentParser: action=env, type=str, default="1", - help="Number of workers per node; supported values: [auto, cpu, gpu, int].", + help="Number of workers per node; supported values: [auto, cpu, gpu, xpu, int].", ) # @@ -694,6 +696,11 @@ def determine_local_world_size(nproc_per_node: str): raise ValueError("Cuda is not available.") from e device_type = "gpu" num_proc = torch.cuda.device_count() + elif nproc_per_node == "xpu": + if not torch.xpu.is_available(): + raise ValueError("Xpu is not available.") from e + device_type = "xpu" + num_proc = torch.xpu.device_count() elif nproc_per_node == torch._C._get_privateuse1_backend_name(): if not _get_custom_mod_func("is_available")(): raise ValueError(f"{nproc_per_node} is not available.") from e @@ -703,6 +710,9 @@ def determine_local_world_size(nproc_per_node: str): if torch.cuda.is_available(): num_proc = torch.cuda.device_count() device_type = "gpu" + elif torch.xpu.is_available(): + num_proc = torch.xpu.device_count() + device_type = "xpu" elif ( hasattr(torch, torch._C._get_privateuse1_backend_name()) and _get_custom_mod_func("is_available")() From 249f7c23ce9f0460d332fa54973982100425fa8e Mon Sep 17 00:00:00 2001 From: Miroslaw Oksiucik Date: Thu, 31 Jul 2025 12:51:50 +0300 Subject: [PATCH 02/12] Apply review comment --- torch/distributed/run.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index e995fbfb390e..42d7ae7b4ef7 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -707,18 +707,9 @@ def determine_local_world_size(nproc_per_node: str): device_type = nproc_per_node num_proc = _get_custom_mod_func("device_count")() elif nproc_per_node == "auto": - if torch.cuda.is_available(): - num_proc = torch.cuda.device_count() - device_type = "gpu" - elif torch.xpu.is_available(): - num_proc = torch.xpu.device_count() - device_type = "xpu" - elif ( - hasattr(torch, torch._C._get_privateuse1_backend_name()) - and _get_custom_mod_func("is_available")() - ): - num_proc = _get_custom_mod_func("device_count")() - device_type = torch._C._get_privateuse1_backend_name() + if torch.accelerator.is_available(): + num_proc = torch.accelerator.device_count() + device_type = torch.accelerator.current_accelerator().type else: num_proc = os.cpu_count() device_type = "cpu" From e38a821c462ee8461164d5a9f83fe3c923a0bc0c Mon Sep 17 00:00:00 2001 From: Miroslaw Oksiucik Date: Thu, 31 Jul 2025 15:38:11 +0300 Subject: [PATCH 03/12] Fix lint error 
--- torch/distributed/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index 42d7ae7b4ef7..3a2d0d32a730 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -707,7 +707,7 @@ def determine_local_world_size(nproc_per_node: str): device_type = nproc_per_node num_proc = _get_custom_mod_func("device_count")() elif nproc_per_node == "auto": - if torch.accelerator.is_available(): + if torch.accelerator.is_available() and torch.accelerator.current_accelerator(): num_proc = torch.accelerator.device_count() device_type = torch.accelerator.current_accelerator().type else: From 3bea66838c486f89bfbae1deccceaee1ddc0a55d Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" <106960996+guangyey@users.noreply.github.com> Date: Fri, 1 Aug 2025 10:07:47 +0800 Subject: [PATCH 04/12] Update torch/distributed/run.py --- torch/distributed/run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index 3a2d0d32a730..bd1dfdb2a02f 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -707,9 +707,9 @@ def determine_local_world_size(nproc_per_node: str): device_type = nproc_per_node num_proc = _get_custom_mod_func("device_count")() elif nproc_per_node == "auto": - if torch.accelerator.is_available() and torch.accelerator.current_accelerator(): + if torch.accelerator.is_available(): num_proc = torch.accelerator.device_count() - device_type = torch.accelerator.current_accelerator().type + device_type = torch.accelerator.current_accelerator().type # type: ignore[union-attr] else: num_proc = os.cpu_count() device_type = "cpu" From a29dba11a45fadeda0414e456cf30dd598d0d96d Mon Sep 17 00:00:00 2001 From: Miroslaw Oksiucik Date: Mon, 4 Aug 2025 11:11:44 +0300 Subject: [PATCH 05/12] debug --- torch/distributed/run.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index bd1dfdb2a02f..09439022b8a0 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -713,6 +713,11 @@ def determine_local_world_size(nproc_per_node: str): else: num_proc = os.cpu_count() device_type = "cpu" + ### CI ERROR DEBUG CODE BEGIN ### + print(f"nproc_per_node=auto: {device_type = }, {num_proc = }, {os.cpu_count() = }") + if torch.cuda.is_available(): + print(f"{torch.cuda.device_count() = }") + ### CI ERROR DEBUG CODE END ### else: raise ValueError( f"Unsupported nproc_per_node value: {nproc_per_node}" From 9123bf0fd01ee23587f5383bed598eca56484af5 Mon Sep 17 00:00:00 2001 From: Miroslaw Oksiucik Date: Tue, 5 Aug 2025 11:50:41 +0300 Subject: [PATCH 06/12] Revert "debug" This reverts commit a29dba11a45fadeda0414e456cf30dd598d0d96d. 
--- torch/distributed/run.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index 09439022b8a0..bd1dfdb2a02f 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -713,11 +713,6 @@ def determine_local_world_size(nproc_per_node: str): else: num_proc = os.cpu_count() device_type = "cpu" - ### CI ERROR DEBUG CODE BEGIN ### - print(f"nproc_per_node=auto: {device_type = }, {num_proc = }, {os.cpu_count() = }") - if torch.cuda.is_available(): - print(f"{torch.cuda.device_count() = }") - ### CI ERROR DEBUG CODE END ### else: raise ValueError( f"Unsupported nproc_per_node value: {nproc_per_node}" From 98012b6567925b822b33b846cb1b913bfc7e7cf7 Mon Sep 17 00:00:00 2001 From: Miroslaw Oksiucik Date: Tue, 5 Aug 2025 11:46:10 +0300 Subject: [PATCH 07/12] Check cuda before torch.accelerator --- torch/distributed/run.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index bd1dfdb2a02f..0fee9a028703 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -707,7 +707,10 @@ def determine_local_world_size(nproc_per_node: str): device_type = nproc_per_node num_proc = _get_custom_mod_func("device_count")() elif nproc_per_node == "auto": - if torch.accelerator.is_available(): + if torch.cuda.is_available(): + num_proc = torch.cuda.device_count() + device_type = "gpu" + elif torch.accelerator.is_available(): num_proc = torch.accelerator.device_count() device_type = torch.accelerator.current_accelerator().type # type: ignore[union-attr] else: From 7b0e8cfbe64250e41598d2d66dfda5bb3538e948 Mon Sep 17 00:00:00 2001 From: Miroslaw Oksiucik Date: Tue, 5 Aug 2025 12:39:02 +0300 Subject: [PATCH 08/12] Revert "Check cuda before torch.accelerator" This reverts commit 98012b6567925b822b33b846cb1b913bfc7e7cf7. 
--- torch/distributed/run.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/torch/distributed/run.py b/torch/distributed/run.py index 0fee9a028703..bd1dfdb2a02f 100644 --- a/torch/distributed/run.py +++ b/torch/distributed/run.py @@ -707,10 +707,7 @@ def determine_local_world_size(nproc_per_node: str): device_type = nproc_per_node num_proc = _get_custom_mod_func("device_count")() elif nproc_per_node == "auto": - if torch.cuda.is_available(): - num_proc = torch.cuda.device_count() - device_type = "gpu" - elif torch.accelerator.is_available(): + if torch.accelerator.is_available(): num_proc = torch.accelerator.device_count() device_type = torch.accelerator.current_accelerator().type # type: ignore[union-attr] else: From 3be536ad6ef08f59863c9e18493c7a706d94187c Mon Sep 17 00:00:00 2001 From: Miroslaw Oksiucik Date: Tue, 5 Aug 2025 12:38:10 +0300 Subject: [PATCH 09/12] Adjust unit tests --- test/distributed/launcher/test_run.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/distributed/launcher/test_run.py b/test/distributed/launcher/test_run.py index f71bffd527c1..d394828aa1ea 100644 --- a/test/distributed/launcher/test_run.py +++ b/test/distributed/launcher/test_run.py @@ -273,10 +273,25 @@ def test_nproc_launch_unknown_configurations(self): ) @patch("torch.cuda.is_available", return_value=True) @patch("torch.cuda.device_count", return_value=3) + @patch("torch.accelerator.is_available", return_value=True) + @patch("torch.accelerator.device_count", return_value=3) + @patch("torch.accelerator.current_accelerator", return_value=MagicMock(type="gpu")) def test_nproc_gpu_launch_configurations(self, _mock1, _mock2): self._test_nproc_launch_configuration("auto", 3) self._test_nproc_launch_configuration("gpu", 3) + @skip_but_pass_in_sandcastle_if( + TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" + ) + @patch("torch.xpu.is_available", return_value=True) + @patch("torch.xpu.device_count", return_value=3) + @patch("torch.accelerator.is_available", return_value=True) + @patch("torch.accelerator.device_count", return_value=3) + @patch("torch.accelerator.current_accelerator", return_value=MagicMock(type="xpu")) + def test_nproc_gpu_launch_configurations(self, _mock1, _mock2): + self._test_nproc_launch_configuration("auto", 3) + self._test_nproc_launch_configuration("xpu", 3) + @skip_but_pass_in_sandcastle_if( TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" ) From c9947883c4316ff44a98f4139de26bdd305de876 Mon Sep 17 00:00:00 2001 From: Miroslaw Oksiucik Date: Tue, 5 Aug 2025 12:54:09 +0300 Subject: [PATCH 10/12] Typo in test name --- test/distributed/launcher/test_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/distributed/launcher/test_run.py b/test/distributed/launcher/test_run.py index d394828aa1ea..f5ae40cea109 100644 --- a/test/distributed/launcher/test_run.py +++ b/test/distributed/launcher/test_run.py @@ -288,7 +288,7 @@ def test_nproc_gpu_launch_configurations(self, _mock1, _mock2): @patch("torch.accelerator.is_available", return_value=True) @patch("torch.accelerator.device_count", return_value=3) @patch("torch.accelerator.current_accelerator", return_value=MagicMock(type="xpu")) - def test_nproc_gpu_launch_configurations(self, _mock1, _mock2): + def test_nproc_xpu_launch_configurations(self, _mock1, _mock2): self._test_nproc_launch_configuration("auto", 3) self._test_nproc_launch_configuration("xpu", 3) From 696d34b8a54ecbd5b80b2a570c7c2603c3de5d41 Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" 
<106960996+guangyey@users.noreply.github.com> Date: Tue, 5 Aug 2025 18:51:00 +0800 Subject: [PATCH 11/12] Update test/distributed/launcher/test_run.py --- test/distributed/launcher/test_run.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/distributed/launcher/test_run.py b/test/distributed/launcher/test_run.py index f5ae40cea109..46a8cb659af2 100644 --- a/test/distributed/launcher/test_run.py +++ b/test/distributed/launcher/test_run.py @@ -276,7 +276,9 @@ def test_nproc_launch_unknown_configurations(self): @patch("torch.accelerator.is_available", return_value=True) @patch("torch.accelerator.device_count", return_value=3) @patch("torch.accelerator.current_accelerator", return_value=MagicMock(type="gpu")) - def test_nproc_gpu_launch_configurations(self, _mock1, _mock2): + def test_nproc_gpu_launch_configurations( + self, _mock1, _mock2, _mock3, _mock4, _mock5 + ): self._test_nproc_launch_configuration("auto", 3) self._test_nproc_launch_configuration("gpu", 3) From b7b86beefc4e46da0a5a59b172c7bc36ca0e9c85 Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" <106960996+guangyey@users.noreply.github.com> Date: Tue, 5 Aug 2025 18:51:13 +0800 Subject: [PATCH 12/12] Update test/distributed/launcher/test_run.py --- test/distributed/launcher/test_run.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/distributed/launcher/test_run.py b/test/distributed/launcher/test_run.py index 46a8cb659af2..d271e60954ae 100644 --- a/test/distributed/launcher/test_run.py +++ b/test/distributed/launcher/test_run.py @@ -290,7 +290,9 @@ def test_nproc_gpu_launch_configurations( @patch("torch.accelerator.is_available", return_value=True) @patch("torch.accelerator.device_count", return_value=3) @patch("torch.accelerator.current_accelerator", return_value=MagicMock(type="xpu")) - def test_nproc_xpu_launch_configurations(self, _mock1, _mock2): + def test_nproc_xpu_launch_configurations( + self, _mock1, _mock2, _mock3, _mock4, _mock5 + ): self._test_nproc_launch_configuration("auto", 3) self._test_nproc_launch_configuration("xpu", 3)
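
Note: below is a minimal standalone sketch (not the torchrun source) of how determine_local_world_size resolves the device type and worker count once this series is applied. The helper name resolve_nproc_per_node is invented for the illustration, the exception chaining ("from e") of the real function is dropped, and a PyTorch build exposing the torch.accelerator and torch.xpu APIs used above is assumed.

import os

import torch


def resolve_nproc_per_node(nproc_per_node: str) -> tuple[str, int]:
    """Return (device_type, num_proc) the way the patched torchrun would."""
    if nproc_per_node == "xpu":
        # Explicit --nproc-per-node=xpu: one worker per XPU device.
        if not torch.xpu.is_available():
            raise ValueError("Xpu is not available.")
        return "xpu", torch.xpu.device_count()
    if nproc_per_node == "auto":
        # "auto" now goes through torch.accelerator, which covers CUDA, XPU
        # and custom (PrivateUse1) backends in a single call, falling back
        # to one worker per CPU when no accelerator is present.
        if torch.accelerator.is_available():
            # current_accelerator() is Optional, hence the
            # "type: ignore[union-attr]" added in PATCH 04/12.
            device_type = torch.accelerator.current_accelerator().type  # e.g. "cuda" or "xpu"
            return device_type, torch.accelerator.device_count()
        return "cpu", os.cpu_count() or 1
    # The "cpu", "gpu", custom-backend and integer branches are untouched by
    # the series and omitted here.
    raise ValueError(f"Unsupported nproc_per_node value: {nproc_per_node}")


if __name__ == "__main__":
    print(resolve_nproc_per_node("auto"))

On a machine with XPU devices, both --nproc-per-node=xpu and --nproc-per-node=auto would therefore spawn torch.xpu.device_count() workers.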
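
Note: the widened test signatures in PATCH 11/12 follow from how stacked unittest.mock.patch decorators feed the test method: each decorator injects one positional mock, applied bottom-up, so five decorators require five parameters. The following is a hedged, standalone illustration; fake_xpu_test is an invented name, while the patch targets and MagicMock(type="xpu") mirror the test above.

from unittest.mock import MagicMock, patch

import torch


# The decorator closest to the function supplies the first argument.
@patch("torch.xpu.is_available", return_value=True)            # -> _mock5
@patch("torch.xpu.device_count", return_value=3)               # -> _mock4
@patch("torch.accelerator.is_available", return_value=True)    # -> _mock3
@patch("torch.accelerator.device_count", return_value=3)       # -> _mock2
@patch("torch.accelerator.current_accelerator", return_value=MagicMock(type="xpu"))  # -> _mock1
def fake_xpu_test(_mock1, _mock2, _mock3, _mock4, _mock5):
    # Inside the patched scope, torchrun's "auto" path would see an XPU
    # accelerator with three devices.
    assert torch.accelerator.current_accelerator().type == "xpu"
    assert torch.accelerator.device_count() == 3


fake_xpu_test()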