
Commit f4cde98

update default priority order setting
1 parent e5490c7 commit f4cde98

4 files changed: +19 -17 lines changed


aten/src/ATen/Context.h

Lines changed: 2 additions & 2 deletions
@@ -432,9 +432,9 @@ class TORCH_API Context {
   std::array<at::SDPBackend, at::num_sdp_backends> sdp_priority_order = {
       at::SDPBackend::flash_attention,
       at::SDPBackend::efficient_attention,
-      at::SDPBackend::overrideable,
       at::SDPBackend::math,
-      at::SDPBackend::cudnn_attention};
+      at::SDPBackend::cudnn_attention,
+      at::SDPBackend::overrideable};
   bool enabled_flashSDP = true;
   bool enabled_mem_efficientSDP = true;
   bool enabled_mathSDP = true;
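
With this change, the built-in default priority becomes flash > efficient > math > cudnn > overrideable. A minimal sketch for inspecting the effective order from Python, reusing the private _cur_sdpa_kernel_backends helper that the updated test below relies on (lower index means higher priority); note that on an XPU build the Attention.cpp change further down seeds its own order:

    import torch
    from torch.nn.attention import SDPBackend, _cur_sdpa_kernel_backends

    # Query the current priority order; lower index = tried first.
    order = _cur_sdpa_kernel_backends(with_priority=True)
    print(order)

    # With the new default, cuDNN attention ranks ahead of the overrideable
    # backend, which moves to the end of the list.
    assert order.index(SDPBackend.CUDNN_ATTENTION) < order.index(SDPBackend.OVERRIDEABLE)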

aten/src/ATen/native/mkldnn/xpu/Attention.cpp

Lines changed: 12 additions & 0 deletions
@@ -93,8 +93,20 @@ bool can_use_mem_efficien_attention(sdp::sdp_params const& params, bool debug) {
   return false;
 }
 
+bool priority_order_init = false;
+
 std::array<sdp::SDPBackend, sdp::num_backends> priority_order(
     sdp::sdp_params const& params) {
+  if (!priority_order_init) {
+    priority_order_init = true;
+    const std::vector<int64_t> priority_order = {
+        static_cast<int64_t>(at::SDPBackend::overrideable),
+        static_cast<int64_t>(at::SDPBackend::math),
+        static_cast<int64_t>(at::SDPBackend::flash_attention),
+        static_cast<int64_t>(at::SDPBackend::efficient_attention),
+        static_cast<int64_t>(at::SDPBackend::cudnn_attention)};
+    at::globalContext().setSDPPriorityOrder(priority_order);
+  }
   return at::globalContext().sDPPriorityOrder();
 }
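
This hunk seeds the global context with the XPU-specific default (overrideable > math > flash > efficient > cudnn) the first time the XPU dispatcher asks for the priority order, guarded by a file-scope init flag. Because the seeding lives inside priority_order(), it appears to take effect only after the first SDPA dispatch on XPU; a rough sketch of checking this from Python, assuming an XPU build (shapes and dtype are arbitrary placeholders):

    import torch
    import torch.nn.functional as F
    from torch.nn.attention import SDPBackend, _cur_sdpa_kernel_backends

    # Run one SDPA dispatch on XPU so priority_order() executes and seeds
    # the XPU default into the global context.
    q = k = v = torch.randn(2, 4, 8, 16, device="xpu", dtype=torch.float16)
    _ = F.scaled_dot_product_attention(q, k, v)

    # The queried order should now start with the overrideable backend.
    order = _cur_sdpa_kernel_backends(with_priority=True)
    assert order.index(SDPBackend.OVERRIDEABLE) < order.index(SDPBackend.MATH)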

aten/src/ATen/native/transformers/cuda/sdp_utils.cpp

Lines changed: 1 addition & 11 deletions
@@ -805,13 +805,6 @@ bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) {
   return check_tensor_dtype(params, less_than_sm80_mem_efficient_dtypes, debug);
 }
 
-inline bool can_use_overrideable_attention(sdp_params const& params, bool debug) {
-  if (debug) {
-    TORCH_WARN("CUDA don't support SDPA overrideable attention backend.");
-  }
-  return false;
-}
-
 SDPBackend select_sdp_backend(sdp_params const& kernel_params) {
   // This function defines the priority order of the different sdp backends
   // 1. Flash Attention

@@ -851,8 +844,7 @@ SDPBackend select_sdp_backend(sdp_params const& kernel_params) {
       }
       break;
     case SDPBackend::overrideable:
-      if (ctx.userEnabledOverrideableSDP() &&
-          sdp::can_use_overrideable_attention(kernel_params, print_debug)) {
+      if (ctx.userEnabledOverrideableSDP()) {
         TORCH_CHECK(false, "Invalid backend");
       }
       break;

@@ -874,8 +866,6 @@ SDPBackend select_sdp_backend(sdp_params const& kernel_params) {
     sdp::can_use_flash_attention(kernel_params, print_debug);
     TORCH_WARN("CuDNN attention kernel not used because:");
     sdp::can_use_cudnn_attention(kernel_params, print_debug);
-    TORCH_WARN("Overrideable attention kernel not used because:");
-    sdp::can_use_overrideable_attention(kernel_params, print_debug);
     TORCH_CHECK(!print_debug, "No available kernel. Aborting execution.")
     return SDPBackend::error;
 }
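
On the CUDA side, the dedicated can_use_overrideable_attention check (which always returned false) is removed; if the overrideable backend is reached while still enabled, select_sdp_backend now fails immediately with "Invalid backend", and the debug output no longer mentions the overrideable kernel. The surrounding dispatch is a first-match walk over the priority order; a simplified Python sketch of that logic (names are hypothetical, and the CUDA-specific hard error for overrideable is omitted):

    def select_sdp_backend(params, priority_order, can_use, enabled):
        # Try backends in priority order; return the first one that is
        # both user-enabled and passes its capability check.
        for backend in priority_order:
            if enabled(backend) and can_use[backend](params, debug=False):
                return backend
        # Nothing matched: re-run the checks with debug=True so each backend
        # reports why it was rejected, then abort.
        for backend in priority_order:
            can_use[backend](params, debug=True)
        raise RuntimeError("No available kernel. Aborting execution.")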

test/test_transformers.py

Lines changed: 4 additions & 4 deletions
@@ -4259,15 +4259,15 @@ def test_backends_set_to_math(self, device):
         _ = F.scaled_dot_product_attention(q, k, v)
 
     def test_default_priority_order(self, device):
-        # The default priority order is flash, efficient, overrideable, math, cudnn
-        # For non-cuda backend, we need to make sure that flash > overrideable > math
+        # The default priority order of xpu is overrideable, math, flash, efficient, cudnn
+        # For xpu backend, we need to make sure that overrideable > math > flash
         from torch.nn.attention import _cur_sdpa_kernel_backends
         default_priority = _cur_sdpa_kernel_backends(with_priority=True)
         flash_index = default_priority.index(SDPBackend.FLASH_ATTENTION)
         overrideable_index = default_priority.index(SDPBackend.OVERRIDEABLE)
         math_index = default_priority.index(SDPBackend.MATH)
-        self.assertTrue(flash_index < overrideable_index < math_index,
-                        f"Expected flash < overrideable < math, got {flash_index}, {overrideable_index}, {math_index}")
+        self.assertTrue(overrideable_index < math_index < flash_index,
+                        f"Expected overrideable < math < flash, got {overrideable_index}, {math_index}, {flash_index}")
 
     def test_scaled_dot_product_attention_fused_kernels_safe_softmax(self, device):
         dtype = torch.bfloat16
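
The updated assertion matches the XPU-seeded order (overrideable before math before flash). The priority order only matters when several backends are allowed; as the neighbouring test_backends_set_to_math suggests, a caller can still pin a single backend with the sdpa_kernel context manager. A minimal usage sketch (tensor shapes are placeholders):

    import torch
    import torch.nn.functional as F
    from torch.nn.attention import SDPBackend, sdpa_kernel

    q = k = v = torch.randn(2, 4, 8, 16)

    # Restrict SDPA to the math backend only; with a single backend allowed,
    # the priority order does not come into play.
    with sdpa_kernel([SDPBackend.MATH]):
        out = F.scaled_dot_product_attention(q, k, v)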
