From ca5489532606e8405284120604720e74143d743d Mon Sep 17 00:00:00 2001 From: dsashidh Date: Mon, 21 Jul 2025 19:45:57 +0000 Subject: [PATCH 1/3] Add header check for rocm_smi.h to avoid build errors --- CMakeLists.txt | 9 +++++++++ torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 99c0b9e0ea0c..7156fcff3a29 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1396,3 +1396,12 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA) install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas" DESTINATION "${CMAKE_INSTALL_BINDIR}") endif() + +if(USE_ROCM) + include(CheckIncludeFile) + check_include_file("rocm_smi/rocm_smi.h" HAS_ROCM_SMI_H) + message(STATUS "HAS_ROCM_SMI_H = ${HAS_ROCM_SMI_H}") + if(HAS_ROCM_SMI_H) + add_definitions(-DHAS_ROCM_SMI) + endif() +endif() diff --git a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp index 0d53d100cee7..72415909ab00 100644 --- a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp +++ b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp @@ -2,7 +2,7 @@ #include #include -#if defined(USE_ROCM) +#if defined(USE_ROCM) && defined(HAS_ROCM_SMI) #include #endif From 3e76a0f43e02d6bdc29a3fcaa6e9af5c79ea3920 Mon Sep 17 00:00:00 2001 From: dsashidh Date: Tue, 22 Jul 2025 15:29:34 +0000 Subject: [PATCH 2/3] Guard rsmi usage with HAS_ROCM_SMI to avoid build errors --- torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp index 72415909ab00..9600a156486c 100644 --- a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp +++ b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp @@ -171,7 +171,7 @@ bool IntraNodeComm::rendezvous() { gethostname(devInfo.hostname, sizeof(devInfo.hostname)); 
devInfo.deviceIdx = deviceIdx_; -#if defined(USE_ROCM) +#if defined(USE_ROCM) && defined(HAS_ROCM_SMI) auto ret = rsmi_init(0); if (ret != RSMI_STATUS_SUCCESS) { LOG(ERROR) << "IntraNodeComm:: rendezvous failed in rsmi_init, ret=" << ret; From 07b5dbe2202933abc43f494b63bd146be3ae27b1 Mon Sep 17 00:00:00 2001 From: Dev Sashidhar Date: Tue, 5 Aug 2025 17:29:15 +0000 Subject: [PATCH 3/3] Move rocm_smi.h check to top-level CMakeLists.txt and fail early if missing --- CMakeLists.txt | 17 ++++++++--------- .../c10d/symm_mem/intra_node_comm.cpp | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7156fcff3a29..35bbfb563899 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,14 @@ endif() # ---[ Project and semantic versioning. project(Torch CXX C) +if(DEFINED ENV{USE_ROCM}) + include(CheckIncludeFile) + check_include_file("rocm_smi/rocm_smi.h" HAS_ROCM_SMI_H) + if(NOT HAS_ROCM_SMI_H) + message(FATAL_ERROR "rocm_smi.h not found. Please install the rocm-smi-lib package.") + endif() +endif() + if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") set(LINUX TRUE) else() @@ -1396,12 +1404,3 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA) install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas" DESTINATION "${CMAKE_INSTALL_BINDIR}") endif() - -if(USE_ROCM) - include(CheckIncludeFile) - check_include_file("rocm_smi/rocm_smi.h" HAS_ROCM_SMI_H) - message(STATUS "HAS_ROCM_SMI_H = ${HAS_ROCM_SMI_H}") - if(HAS_ROCM_SMI_H) - add_definitions(-DHAS_ROCM_SMI) - endif() -endif() diff --git a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp index 9600a156486c..f4c1c3fbbf57 100644 --- a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp +++ b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp @@ -2,7 +2,7 @@ #include #include -#if defined(USE_ROCM) && defined(HAS_ROCM_SMI) +#if defined(USE_ROCM) #include #endif