diff --git a/CMakeLists.txt b/CMakeLists.txt
index 99c0b9e0ea0c..35bbfb563899 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,6 +28,19 @@ endif()
 # ---[ Project and semantic versioning.
 project(Torch CXX C)
 
+# ROCm builds call into rocm_smi (see intra_node_comm.cpp), so fail early when
+# the header is missing. Test the *value* of USE_ROCM rather than its mere
+# presence, so that USE_ROCM=0 does not trigger the check.
+if("$ENV{USE_ROCM}")
+  include(CheckIncludeFile)
+  check_include_file("rocm_smi/rocm_smi.h" HAS_ROCM_SMI_H)
+  if(NOT HAS_ROCM_SMI_H)
+    message(FATAL_ERROR "rocm_smi.h not found. Please install the rocm-smi-lib package.")
+  endif()
+  # HAS_ROCM_SMI_H is only a CMake variable; the C++ sources test HAS_ROCM_SMI.
+  add_compile_definitions(HAS_ROCM_SMI)
+endif()
+
 if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
   set(LINUX TRUE)
 else()
diff --git a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp
index 0d53d100cee7..f4c1c3fbbf57 100644
--- a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp
+++ b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp
@@ -171,7 +171,7 @@ bool IntraNodeComm::rendezvous() {
   gethostname(devInfo.hostname, sizeof(devInfo.hostname));
   devInfo.deviceIdx = deviceIdx_;
 
-#if defined(USE_ROCM)
+#if defined(USE_ROCM) && defined(HAS_ROCM_SMI)
   auto ret = rsmi_init(0);
   if (ret != RSMI_STATUS_SUCCESS) {
     LOG(ERROR) << "IntraNodeComm:: rendezvous failed in rsmi_init, ret=" << ret;