From 651fdf40d0caa0e05725016d380abeba04cf57b4 Mon Sep 17 00:00:00 2001
From: Yoshi_likes_e4 <104140648+pt13762104@users.noreply.github.com>
Date: Thu, 21 Aug 2025 13:16:18 +0700
Subject: [PATCH 01/11] Add option (again)

---
 ggml/CMakeLists.txt               | 2 ++
 ggml/src/ggml-cuda/CMakeLists.txt | 5 +++++
 ggml/src/ggml-cuda/ggml-cuda.cu   | 4 ++++
 ggml/src/ggml-cuda/mmq.cuh        | 3 +++
 4 files changed, 14 insertions(+)

diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 2ead001e2c610..af78fe4244ac8 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -158,6 +158,8 @@ option(GGML_CUDA "ggml: use CUDA"
 option(GGML_MUSA "ggml: use MUSA" OFF)
 option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
 option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
+option(GGML_CUDA_NO_TURING_MMA "ggml: disable the use of mma in mmq kernels" OFF)
+
 set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
     "ggml: max. batch size for using peer access")
 option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index ea824965aae2d..b609ee6e77cad 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -77,6 +77,11 @@ if (CUDAToolkit_FOUND)
         add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
     endif()
 
+    if (GGML_CUDA_NO_TURING_MMA)
+        add_compile_definitions(GGML_CUDA_NO_TURING_MMA)
+        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+    endif()
+
     if (GGML_CUDA_NO_VMM)
         add_compile_definitions(GGML_CUDA_NO_VMM)
     endif()
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 4e17fd211e1bb..0b37b32411c53 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -71,6 +71,10 @@
 #include
 #include
 
+#ifdef GGML_CUDA_NO_TURING_MMA
+#define CUBLAS_COMPUTE_16F CUBLAS_COMPUTE_16F_PEDANTIC
+#endif
+
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
 [[noreturn]]
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 650f7080677ad..9834830a945d9 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -9,6 +9,9 @@
 
 using namespace ggml_cuda_mma;
 
+#ifdef GGML_CUDA_NO_TURING_MMA
+#undef TURING_MMA_AVAILABLE
+#endif
 #define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
 #define MMQ_ITER_K 256
 #define MMQ_NWARPS 8

From 078c6df36a2f6da2bdd04e3608a01b37cdbb0c90 Mon Sep 17 00:00:00 2001
From: Yoshi
Date: Sat, 23 Aug 2025 08:26:40 +0700
Subject: [PATCH 02/11] Fix FA perf

---
 ggml/src/ggml-cuda/common.cuh |  2 +-
 ggml/src/ggml-cuda/fattn.cu   | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 767ad83f60eb5..87d271fbc632e 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -218,7 +218,7 @@ static const char * cu_get_error_str(CUresult err) {
 #define FAST_FP16_AVAILABLE
 #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 
-#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
+#if ((!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)) && !defined(GGML_CUDA_NO_TURING_MMA)
 #define FP16_MMA_AVAILABLE
 #endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
index 48834272660e5..6e0332147f0de 100644
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -380,9 +380,12 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
     }
 
     const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0;
-
+#ifdef GGML_CUDA_NO_TURING_MMA
+    if (K->ne[0] != 64 && K->ne[0] != 128 && turing_mma_available(cc)) {
+#else
     // If Turing tensor cores available, use them except for some cases with batch size 1:
     if (turing_mma_available(cc)) {
+#endif
         const bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask; // The mma-based kernels have GQA-specific optimizations
         const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
         const bool mma_faster_for_rtx4000 = Q->ne[3] > 1 || (gqa_ratio > 4 && K->ne[1] >= 8192);
@@ -396,7 +399,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
         }
         return BEST_FATTN_KERNEL_MMA_F16;
     }
-
     // Use kernels specializes for small batch sizes if possible:
     if (Q->ne[1] <= 8 && can_use_vector_kernel) {
         if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
@@ -404,12 +406,12 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
         }
         return BEST_FATTN_KERNEL_VEC_F32;
     }
-
+#ifndef GGML_CUDA_NO_TURING_MMA
     // For large batch sizes, use the WMMA kernel if possible:
     if (fp16_mma_available(cc)) {
         return BEST_FATTN_KERNEL_WMMA_F16;
     }
-
+#endif
     // If there is no suitable kernel for tensor cores or small batch sizes, use the generic kernel for large batch sizes:
     if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
         return BEST_FATTN_KERNEL_TILE_F16;

From 29df6a29696aef2d3c8a649d845b4aecd1118152 Mon Sep 17 00:00:00 2001
From: Yoshi
Date: Sat, 23 Aug 2025 18:25:12 +0700
Subject: [PATCH 03/11] Test disable FP16

---
 ggml/src/ggml-cuda/common.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 87d271fbc632e..4b3633c60595e 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -256,7 +256,7 @@ static bool fast_fp16_available(const int cc) {
 
 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fast_fp16_hardware_available(const int cc) {
-    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) ||
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610 && !GGML_CUDA_NO_TURING_MMA) || GGML_CUDA_CC_IS_AMD(cc) ||
         (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
 }

From 09d3271985f341491353e6e78723b228101487c9 Mon Sep 17 00:00:00 2001
From: Yoshi
Date: Sat, 23 Aug 2025 18:40:49 +0700
Subject: [PATCH 04/11] Revert disable Fp16

---
 ggml/src/ggml-cuda/common.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 4b3633c60595e..87d271fbc632e 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -256,7 +256,7 @@ static bool fast_fp16_available(const int cc) {
 
 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fast_fp16_hardware_available(const int cc) {
-    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610 && !GGML_CUDA_NO_TURING_MMA) || GGML_CUDA_CC_IS_AMD(cc) ||
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) ||
         (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
 }

From 58b68cdc988e18040ef56832c7dbdb587aa62510 Mon Sep 17 00:00:00 2001
From: Yoshi
Date: Sun, 24 Aug 2025 10:20:22 +0700
Subject: [PATCH 05/11] Disable F32 in FA?

---
 src/llama-graph.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 6419d739bd8a2..644168ced68b8 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1262,8 +1262,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
                 hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
 
         ggml_flash_attn_ext_add_sinks(cur, sinks);
-        ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
-
+        // ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
         if (v_mla) {
 #if 0
             // v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.

From e215c8311f7463365beed23c081d049084f208d2 Mon Sep 17 00:00:00 2001
From: Yoshi_likes_e4 <104140648+pt13762104@users.noreply.github.com>
Date: Wed, 27 Aug 2025 09:58:36 +0700
Subject: [PATCH 06/11] Add notice

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index a01ef6d503e40..1cc30e2e1c16b 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
+### This is a special testing branch for random things that I test. It includes an option, `GGML_CUDA_NO_TURING_MMA`, which makes Turing devices appear as if tensor cores don't exist. DO NOT USE THIS OPTION UNLESS YOU ARE AFFECTED (GTX 16 series, etc.)
 # llama.cpp
 
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

From f73a43d3e8bfbaff6f8d666d6dc65754cc185db0 Mon Sep 17 00:00:00 2001
From: Yoshi
Date: Wed, 27 Aug 2025 10:04:50 +0700
Subject: [PATCH 07/11] Supports RTX 30+

---
 ggml/src/ggml-cuda/common.cuh   | 10 +++++-----
 ggml/src/ggml-cuda/fattn.cu     |  5 +----
 ggml/src/ggml-cuda/ggml-cuda.cu |  3 ++-
 ggml/src/ggml-cuda/mmq.cuh      |  3 ---
 4 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 649984030ebac..93c1e503b33ff 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -218,7 +218,7 @@ static const char * cu_get_error_str(CUresult err) {
 #define FAST_FP16_AVAILABLE
 #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 
-#if ((!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)) && !defined(GGML_CUDA_NO_TURING_MMA)
+#if ((!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)) && (!defined(GGML_CUDA_NO_TURING_MMA) || __CUDA_ARCH__ != GGML_CUDA_CC_TURING)
 #define FP16_MMA_AVAILABLE
 #endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
@@ -230,7 +230,7 @@ static const char * cu_get_error_str(CUresult err) {
 #define AMD_MFMA_AVAILABLE
 #endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
 
-#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
+#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING) && (!defined(GGML_CUDA_NO_TURING_MMA) || __CUDA_ARCH__ != GGML_CUDA_CC_TURING)
 #define TURING_MMA_AVAILABLE
 #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
@@ -265,7 +265,7 @@ static bool fp16_mma_available(const int cc) {
 #if defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
     return false;
 #else
-    if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
+    if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && (!GGML_CUDA_NO_TURING_MMA || cc >= GGML_CUDA_CC_AMPERE)) ||
         GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) ||
         GGML_CUDA_CC_IS_MTHREADS(cc)) {
         return true;
@@ -283,7 +283,7 @@ static bool fp16_mma_available(const int cc) {
 
 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fp16_mma_hardware_available(const int cc) {
-    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA && (!GGML_CUDA_NO_TURING_MMA || cc >= GGML_CUDA_CC_AMPERE)) ||
         GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) ||
         (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
 }
@@ -306,7 +306,7 @@ static bool amd_mfma_available(const int cc) {
 
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
 static bool turing_mma_available(const int cc) {
-    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
+    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING && (!GGML_CUDA_NO_TURING_MMA || cc >= GGML_CUDA_CC_AMPERE);
 }
 
 static bool ampere_mma_available(const int cc) {
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
index 6e0332147f0de..22adc0f9ec7dc 100644
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -380,12 +380,9 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
     }
 
     const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0;
-#ifdef GGML_CUDA_NO_TURING_MMA
-    if (K->ne[0] != 64 && K->ne[0] != 128 && turing_mma_available(cc)) {
-#else
     // If Turing tensor cores available, use them except for some cases with batch size 1:
     if (turing_mma_available(cc)) {
-#endif
+
         const bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask; // The mma-based kernels have GQA-specific optimizations
         const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
         const bool mma_faster_for_rtx4000 = Q->ne[3] > 1 || (gqa_ratio > 4 && K->ne[1] >= 8192);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 52ddfeeecd7ab..95ddcc8f9ecec 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -72,8 +72,9 @@
 #include
 #include
 
-#ifdef GGML_CUDA_NO_TURING_MMA
+#if (defined(GGML_CUDA_NO_TURING_MMA) && __CUDA_ARCH__ == GGML_CUDA_CC_TURING)
 #define CUBLAS_COMPUTE_16F CUBLAS_COMPUTE_16F_PEDANTIC
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP CUBLAS_GEMM_DEFAULT
 #endif
 
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index e2ccdc7da85b1..c9a07e82fedf2 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -9,9 +9,6 @@
 
 using namespace ggml_cuda_mma;
 
-#ifdef GGML_CUDA_NO_TURING_MMA
-#undef TURING_MMA_AVAILABLE
-#endif
 #define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
 #define MMQ_ITER_K 256
 #define MMQ_NWARPS 8

From 453622d4d28ee342c69ffd09689774552c405103 Mon Sep 17 00:00:00 2001
From: Yoshi
Date: Wed, 27 Aug 2025 15:11:07 +0700
Subject: [PATCH 08/11] Disable FP16 in Cublas (again)

---
 ggml/src/ggml-cuda/common.cuh   | 2 +-
 ggml/src/ggml-cuda/ggml-cuda.cu | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 93c1e503b33ff..c796279d294c1 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -256,7 +256,7 @@ static bool fast_fp16_available(const int cc) {
 
 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fast_fp16_hardware_available(const int cc) {
-    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) ||
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610 && (!GGML_CUDA_NO_TURING_MMA || cc >= GGML_CUDA_CC_AMPERE)) || GGML_CUDA_CC_IS_AMD(cc) ||
         (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
 }
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 95ddcc8f9ecec..0f363bfde6cc1 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -72,10 +72,8 @@
 #include
 #include
 
-#if (defined(GGML_CUDA_NO_TURING_MMA) && __CUDA_ARCH__ == GGML_CUDA_CC_TURING)
-#define CUBLAS_COMPUTE_16F CUBLAS_COMPUTE_16F_PEDANTIC
-#define CUBLAS_GEMM_DEFAULT_TENSOR_OP CUBLAS_GEMM_DEFAULT
-#endif
+
+
 
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

From a22af8ebe4a65abbcf642cc8d24dea4dd805eea0 Mon Sep 17 00:00:00 2001
From: Yoshi
Date: Wed, 27 Aug 2025 18:26:12 +0700
Subject: [PATCH 09/11] Small fix

---
 ggml/src/ggml-cuda/common.cuh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index c796279d294c1..e4468ddc87e9e 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -246,6 +246,10 @@ static const char * cu_get_error_str(CUresult err) {
 #define FLASH_ATTN_AVAILABLE
 #endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
 
+#ifndef GGML_CUDA_NO_TURING_MMA
+#define GGML_CUDA_NO_TURING_MMA 0
+#endif
+
 static bool fp16_available(const int cc) {
     return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
 }

From 5b8bfd79c743cfe149a07343dde1f5904a15a1bc Mon Sep 17 00:00:00 2001
From: Yoshi
Date: Fri, 29 Aug 2025 08:04:16 +0700
Subject: [PATCH 10/11] Enable Cuda graphs

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d0f11c682acd1..2e090e74abab2 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2974,7 +2974,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     bool cuda_graph_update_required = false;
 
     if (cuda_ctx->cuda_graph->graph == nullptr) {
-        if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
+        if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_TURING) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);

From bfdc5f614fc4cd0acdd612bc1f5c26dcba1d82f2 Mon Sep 17 00:00:00 2001
From: Yoshi
Date: Mon, 1 Sep 2025 21:36:52 +0700
Subject: [PATCH 11/11] Revert enabling CUDA graphs

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index f5817cc9dfeea..0d7e18f614ad6 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3032,7 +3032,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     bool cuda_graph_update_required = false;
 
     if (cuda_ctx->cuda_graph->graph == nullptr) {
-        if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_TURING) {
+        if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
            cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
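
For reference, a minimal usage sketch for the option this series adds, assuming the standard llama.cpp CMake workflow (only GGML_CUDA_NO_TURING_MMA comes from these patches; GGML_CUDA and the build commands are the usual upstream ones):

    # configure and build with CUDA, with the Turing MMA paths disabled (sketch, not part of the patches)
    cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_NO_TURING_MMA=ON
    cmake --build build --config Release

Because the CMake option also defines GGML_CUDA_FORCE_MMQ (PATCH 01), quantized matrix multiplications on the affected devices fall back to the MMQ kernels instead of cuBLAS.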