Skip to content

Commit a5f59cc

Browse files
eqy authored and pytorchmergebot committed
[cuDNN][64-bit indexing] update conv depthwise 64bit indexing dispatch condition to match native kernel (#156140)
The native kernel doesn't support batch splitting, so the previous check wasn't aggressive enough in dispatching to cuDNN (#155225). Pull Request resolved: #156140. Approved by: https://github.com/ngimel
1 parent 94f8679 commit a5f59cc

File tree

2 files changed

+16
-4
lines changed

2 files changed

+16
-4
lines changed

aten/src/ATen/native/Convolution.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include <ATen/Config.h>
44
#include <ATen/Parallel.h>
55
#include <ATen/TensorOperators.h>
6+
#include <ATen/native/CanUse32BitIndexMath.h>
67
#include <ATen/native/ConvolutionMM3d.h>
78
#include <ATen/native/ConvUtils.h>
89
#include <ATen/native/Pool.h>
@@ -463,7 +464,7 @@ struct ConvParams {
463464
return true;
464465
}
465466
// native kernel doesn't support 64-bit non-splittable case
466-
if (cudnn_enabled && needs_64bit_indexing_no_split(input, weight)) {
467+
if (cudnn_enabled && !(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
467468
static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1;
468469
if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
469470
TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"

test/nn/test_convolution.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4057,11 +4057,22 @@ def test_conv3d_64bit_indexing(self, device):
40574057
@largeTensorTest("20GB")
40584058
@largeTensorTest("80GB", "cpu")
40594059
def test_depthwise_conv_64bit_indexing(self, device):
4060-
x = torch.randn(1, 2, 32800, 32800)
4061-
c = nn.Conv2d(2, 2, kernel_size=3, stride=1, padding=1, groups=2)
4060+
x = torch.randn(1, 2, 32800, 32800, dtype=torch.bfloat16).to(
4061+
memory_format=torch.channels_last
4062+
)
4063+
c = nn.Conv2d(
4064+
2, 2, kernel_size=3, stride=1, padding=1, groups=2, dtype=torch.bfloat16
4065+
).to(memory_format=torch.channels_last)
40624066
yref = c(x)
40634067
y = c.to(device=device)(x.to(device=device))
4064-
self.assertEqual(yref, y)
4068+
self.assertEqual(yref, y, atol=1e-3, rtol=1e-4)
4069+
del y, yref
4070+
4071+
# try a batch-splittable case
4072+
x = x.reshape(100, 2, 3280, 3280).contiguous(memory_format=torch.channels_last)
4073+
yref = c(x)
4074+
y = c.to(device=device)(x.to(device=device))
4075+
self.assertEqual(yref, y, atol=1e-3, rtol=1e-4)
40654076

40664077

40674078
instantiate_device_type_tests(TestConvolutionNNDeviceType, globals(), allow_mps=True)

0 commit comments

Comments
 (0)