
Commit ebd8f3a

Work around MPSGraph issue in backward pass of nn.ReplicationPad1d/2d.

Fixes #135447. When the 3rd-from-last dimension is 2^16 or greater, MPSGraph returns 0 for the pad gradient. To work around this, we break the problematic dimension into chunks, each no larger than 2^16 - 1.

Test case for nn.ReplicationPad1d:

```
shape = [65739, 2, 4]
x_cpu = torch.randn(shape, device='cpu', requires_grad=True)
x_mps = x_cpu.clone().detach().to('mps').requires_grad_(True)
model = torch.nn.ReplicationPad1d((1, 1))
out_cpu = model(x_cpu)
out_mps = model(x_mps)

# backward
g_cpu = torch.randn_like(out_cpu)
g_mps = g_cpu.clone().detach().to('mps').requires_grad_(False)
out_cpu.backward(g_cpu)
out_mps.backward(g_mps)
print(f"{((x_cpu.grad - x_mps.grad.cpu()).abs() > 1e-5).sum() = }")
# Expected Output:
# ((x_cpu.grad - x_mps.grad.cpu()).abs() > 1e-5).sum() = tensor(0)
```

Test case for nn.ReplicationPad2d:

```
shape = [2, 65739, 2, 4]
x_cpu = torch.randn(shape, device='cpu', requires_grad=True)
x_mps = x_cpu.clone().detach().to('mps').requires_grad_(True)
model = torch.nn.ReplicationPad2d((1, 1, 1, 1))
out_cpu = model(x_cpu)
out_mps = model(x_mps)

# backward
g_cpu = torch.randn_like(out_cpu)
g_mps = g_cpu.clone().detach().to('mps').requires_grad_(False)
out_cpu.backward(g_cpu)
out_mps.backward(g_mps)
print(f"{((x_cpu.grad - x_mps.grad.cpu()).abs() > 1e-5).sum() = }")
# Expected Output:
# ((x_cpu.grad - x_mps.grad.cpu()).abs() > 1e-5).sum() = tensor(0)
```

Both tests produce the expected output with this workaround.
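For intuition, here is a minimal CPU-only sketch (an illustration, not code from this commit) of why chunking is sound: replication padding touches only the last one or two dimensions, so slices taken along an earlier dimension have independent gradients that can be computed per chunk and concatenated. The toy `chunk = 3` stands in for the real 2^16 - 1 limit.

```
import torch
import torch.nn as nn

pad = nn.ReplicationPad1d((1, 1))
x = torch.randn(7, 2, 4)   # small stand-in for [65739, 2, 4]
g = torch.randn(7, 2, 6)   # upstream gradient for the padded output

# Reference: backward through the whole tensor at once.
x_ref = x.clone().requires_grad_(True)
pad(x_ref).backward(g)

# Workaround shape: backward chunk by chunk along dim 0, then concatenate.
chunk = 3                  # stands in for 2**16 - 1
grads = []
for start in range(0, x.size(0), chunk):
    xs = x[start:start + chunk].clone().requires_grad_(True)
    pad(xs).backward(g[start:start + chunk])
    grads.append(xs.grad)

assert torch.allclose(x_ref.grad, torch.cat(grads, dim=0))
```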
1 parent e7ed50f commit ebd8f3a

File tree

2 files changed (+108, -65 lines)


aten/src/ATen/native/mps/operations/Pad.mm

Lines changed: 104 additions & 65 deletions
```diff
@@ -19,6 +19,7 @@
 #include <ATen/ops/replication_pad2d_native.h>
 #include <ATen/ops/replication_pad3d_backward_native.h>
 #include <ATen/ops/replication_pad3d_native.h>
+#include <ATen/ops/slice.h>
 #endif
 
 namespace at::native {
@@ -243,75 +244,113 @@
     dataType = MPSDataTypeInt8;
   }
 
-  @autoreleasepool {
-    string key = op_name + getTensorsStringKey({input, grad_output, output}) + ":[" + getArrayRefString(padding) +
-        "]:" + std::to_string(constantValue);
-
-    auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
-      newCachedGraph->inputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, dataType, getMPSShape(input));
-      const bool needsSlice = startMask != dims_mask || endMask != dims_mask;
-
-      if (!is_backward_pass) {
-        MPSGraphTensor* padTensor = [mpsGraph padTensor:newCachedGraph->inputTensor_
-                                        withPaddingMode:mode
-                                            leftPadding:leftPadding
-                                           rightPadding:rightPadding
-                                          constantValue:constantValue
-                                                   name:nil];
-        // workaround for the right padding bug in Monterey
-        if (needsSlice) {
-          newCachedGraph->gradInputTensor_ =
-              [mpsGraph sliceTensor:padTensor
-                             starts:[NSArray arrayWithObjects:startsVec.data() count:ndims]
-                               ends:[NSArray arrayWithObjects:endsVec.data() count:ndims]
-                            strides:[NSArray arrayWithObjects:stridesVec.data() count:ndims]
-                          startMask:startMask
-                            endMask:endMask
-                        squeezeMask:0
-                               name:nil];
-        } else {
-          newCachedGraph->gradInputTensor_ = padTensor;
-        }
-      } else {
-        newCachedGraph->gradOutputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, dataType, getMPSShape(grad_output));
-        MPSGraphTensor* padGradTensor =
-            [mpsGraph padGradientWithIncomingGradientTensor:newCachedGraph->gradOutputTensor_
-                                               sourceTensor:newCachedGraph->inputTensor_
-                                                paddingMode:mode
-                                                leftPadding:leftPadding
-                                               rightPadding:rightPadding
-                                                       name:nil];
-        // workaround for negative padding issue with padGradientWithIncomingGradientTensor()
-        if (needsSlice) {
-          newCachedGraph->gradInputTensor_ =
-              [mpsGraph sliceGradientTensor:padGradTensor
-                           fwdInShapeTensor:[mpsGraph shapeOfTensor:newCachedGraph->inputTensor_ name:nil]
-                                     starts:[NSArray arrayWithObjects:startsVec.data() count:ndims]
-                                       ends:[NSArray arrayWithObjects:endsVec.data() count:ndims]
-                                    strides:[NSArray arrayWithObjects:stridesVec.data() count:ndims]
-                                  startMask:startMask
-                                    endMask:endMask
-                                squeezeMask:0
-                                       name:nil];
+  // For tensors of rank 3 or 4 with padding mode replicate1d/2d, when the 3rd-from-last
+  // dimension is 2**16 or greater, MPSGraph returns an incorrect gradient. To work around this,
+  // we break the tensor into chunks in which the problematic dimension is no greater than 2**16 - 1.
+  // This is reported in https://github.com/pytorch/pytorch/issues/135447.
+  // Internal radar for MPSGraph: rdar://149853787.
+  const int64_t max_sub_batch_size = 65535;
+  int64_t sliced_dim = -1;
+  int64_t sub_batch_start = 0;
+  int64_t remaining_batch_size = 0;
+  if ((ndims == 3 || ndims == 4) && mode == MPSGraphPaddingModeClampToEdge && pad_front == 0 && pad_back == 0) {
+    int64_t batch_size = input_.size(-3);
+    if (batch_size > max_sub_batch_size) {
+      sliced_dim = ndims - 3;
+      remaining_batch_size = batch_size;
+    }
+  }
+  do {
+    Tensor sub_batch_input = input;
+    Tensor sub_batch_grad_output = grad_output;
+    Tensor sub_batch_output = output;
+
+    if (sliced_dim >= 0) {
+      int64_t sub_batch_size =
+          is_backward_pass ? std::min<int64_t>(remaining_batch_size, max_sub_batch_size) : remaining_batch_size;
+      sub_batch_input = at::slice(input, sliced_dim, sub_batch_start, sub_batch_start + sub_batch_size);
+      sub_batch_output = at::slice(output, sliced_dim, sub_batch_start, sub_batch_start + sub_batch_size);
+      if (is_backward_pass) {
+        sub_batch_grad_output = at::slice(grad_output, sliced_dim, sub_batch_start, sub_batch_start + sub_batch_size);
+      }
+      remaining_batch_size -= sub_batch_size;
+      sub_batch_start += sub_batch_size;
+    }
+    @autoreleasepool {
+      string key = op_name + getTensorsStringKey({sub_batch_input, sub_batch_grad_output, sub_batch_output}) + ":[" +
+          getArrayRefString(padding) + "]:" + std::to_string(constantValue) + std::to_string(sub_batch_start);
+
+      auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
+        newCachedGraph->inputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, dataType, getMPSShape(sub_batch_input));
+        const bool needsSlice = startMask != dims_mask || endMask != dims_mask;
+
+        if (!is_backward_pass) {
+          MPSGraphTensor* padTensor = [mpsGraph padTensor:newCachedGraph->inputTensor_
+                                          withPaddingMode:mode
+                                              leftPadding:leftPadding
+                                             rightPadding:rightPadding
+                                            constantValue:constantValue
+                                                     name:nil];
+          // workaround for the right padding bug in Monterey
+          if (needsSlice) {
+            newCachedGraph->gradInputTensor_ =
+                [mpsGraph sliceTensor:padTensor
+                               starts:[NSArray arrayWithObjects:startsVec.data() count:ndims]
+                                 ends:[NSArray arrayWithObjects:endsVec.data() count:ndims]
+                              strides:[NSArray arrayWithObjects:stridesVec.data() count:ndims]
+                            startMask:startMask
+                              endMask:endMask
+                          squeezeMask:0
+                                 name:nil];
+          } else {
+            newCachedGraph->gradInputTensor_ = padTensor;
+          }
         } else {
-          newCachedGraph->gradInputTensor_ = padGradTensor;
+          newCachedGraph->gradOutputTensor_ =
+              mpsGraphRankedPlaceHolder(mpsGraph, dataType, getMPSShape(sub_batch_grad_output));
+          MPSGraphTensor* padGradTensor =
+              [mpsGraph padGradientWithIncomingGradientTensor:newCachedGraph->gradOutputTensor_
+                                                 sourceTensor:newCachedGraph->inputTensor_
+                                                  paddingMode:mode
+                                                  leftPadding:leftPadding
+                                                 rightPadding:rightPadding
+                                                         name:nil];
+          // workaround for negative padding issue with padGradientWithIncomingGradientTensor()
+          if (needsSlice) {
+            newCachedGraph->gradInputTensor_ =
+                [mpsGraph sliceGradientTensor:padGradTensor
+                             fwdInShapeTensor:[mpsGraph shapeOfTensor:newCachedGraph->inputTensor_ name:nil]
+                                       starts:[NSArray arrayWithObjects:startsVec.data() count:ndims]
+                                         ends:[NSArray arrayWithObjects:endsVec.data() count:ndims]
+                                      strides:[NSArray arrayWithObjects:stridesVec.data() count:ndims]
+                                    startMask:startMask
+                                      endMask:endMask
+                                  squeezeMask:0
+                                         name:nil];
+          } else {
+            newCachedGraph->gradInputTensor_ = padGradTensor;
+          }
         }
+      });
+      Placeholder inputPlaceholder =
+          Placeholder(cachedGraph->inputTensor_, sub_batch_input, getMPSShape(sub_batch_input), true, dataType);
+      Placeholder outputPlaceholder =
+          Placeholder(cachedGraph->gradInputTensor_, sub_batch_output, getMPSShape(sub_batch_output), true, dataType);
+      Placeholder gradOutputPlaceholder = !is_backward_pass ? Placeholder()
+                                                            : Placeholder(cachedGraph->gradOutputTensor_,
+                                                                          sub_batch_grad_output,
+                                                                          getMPSShape(sub_batch_grad_output),
+                                                                          true,
+                                                                          dataType);
+
+      NSMutableDictionary* feeds = [[NSMutableDictionary new] autorelease];
+      feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData();
+      if (is_backward_pass) {
+        feeds[gradOutputPlaceholder.getMPSGraphTensor()] = gradOutputPlaceholder.getMPSGraphTensorData();
       }
-    });
-
-    Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input, nullptr, true, dataType);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, output, nullptr, true, dataType);
-    Placeholder gradOutputPlaceholder = !is_backward_pass
-        ? Placeholder()
-        : Placeholder(cachedGraph->gradOutputTensor_, grad_output, nullptr, true, dataType);
-
-    NSMutableDictionary* feeds = [[NSMutableDictionary new] autorelease];
-    feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData();
-    if (is_backward_pass) {
-      feeds[gradOutputPlaceholder.getMPSGraphTensor()] = gradOutputPlaceholder.getMPSGraphTensorData();
+      runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder);
     }
-    runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder);
-  }
+  } while (remaining_batch_size > 0);
   return output;
 }
 } // namespace mps
```
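To make the new control flow easier to follow, here is a pure-Python mirror of the sub-batch bookkeeping above (a sketch only; names follow the C++ variables, and the graph execution step is elided). It models the `sliced_dim >= 0` case, where only the backward pass is chunked and the forward pass still runs whole.

```
MAX_SUB_BATCH_SIZE = 65535  # 2**16 - 1, mirrors max_sub_batch_size in Pad.mm

def sub_batches(batch_size: int, is_backward_pass: bool):
    """Yield (sub_batch_start, sub_batch_size) pairs covering batch_size."""
    remaining_batch_size, sub_batch_start = batch_size, 0
    while True:
        # Only the backward pass needs chunking; forward runs in one pass.
        sub_batch_size = (min(remaining_batch_size, MAX_SUB_BATCH_SIZE)
                          if is_backward_pass else remaining_batch_size)
        yield sub_batch_start, sub_batch_size
        remaining_batch_size -= sub_batch_size
        sub_batch_start += sub_batch_size
        if remaining_batch_size <= 0:
            break

print(list(sub_batches(65739, is_backward_pass=True)))   # [(0, 65535), (65535, 204)]
print(list(sub_batches(65739, is_backward_pass=False)))  # [(0, 65739)]
```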

test/test_mps.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -9556,6 +9556,8 @@ def helper(shape, padding, op, value=0):
         helper((2, 4, 4), (1, 3), nn.ReflectionPad1d)
         # Replication 1D
         helper((2, 1, 6), 3, nn.ReplicationPad1d)
+        # Replication 1D with large batch size (>= 2**16)
+        helper((65539, 1, 6), 3, nn.ReplicationPad1d)
         # Constant Pad 1D
         helper((2, 3, 4), 2, nn.ConstantPad1d)
         # Constant Pad 1D with single dimension input
@@ -9569,6 +9571,8 @@ def helper(shape, padding, op, value=0):
         helper((2, 1, 6, 8), 2, nn.ReplicationPad2d)
         # verify if a change in shape of padding would cause problems with graph caching
         helper((2, 1, 6, 8), (2, 4, 3, 5), nn.ReplicationPad2d)
+        # verify 2d padding with the 2nd dimension >= 2**16
+        helper((2, 65539, 6, 8), (2, 4, 3, 5), nn.ReplicationPad2d)
         # Constant Pad 2D
         helper((2, 1, 6, 8), (2, 4, 3, 5), nn.ConstantPad2d)
         # input size < pad size
```
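A note on the test size (my reading; the commit does not state the rationale): 65539 sits just past the 2**16 boundary, so the backward pass must take the chunked path with one full 65535-element sub-batch plus a short tail.

```
assert 65539 >= 2**16                  # crosses the problematic boundary
assert divmod(65539, 65535) == (1, 4)  # one full sub-batch + a 4-element tail
```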
