Skip to content

Conversation

arsenm
Copy link
Contributor

@arsenm arsenm commented Aug 27, 2025

Perform a register class constraint check when performing the fold

Perform a register class constraint check when performing the fold
@llvmbot
Copy link
Member

llvmbot commented Aug 27, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

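Perform a register class constraint check in `updateOperand` when performing the fold, instead of having `foldOperand` reject register folds up front when the subtarget needs aligned VGPRs and the source register class is not properly aligned.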


Full diff: https://github.com/llvm/llvm-project/pull/155559.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+35-24)
  • (modified) llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir (+6-9)
  • (modified) llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir (+6-9)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 3979e1e0c44aa..a116b57c85a88 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -173,6 +173,7 @@ struct FoldCandidate {
 
 class SIFoldOperandsImpl {
 public:
+  MachineFunction *MF;
   MachineRegisterInfo *MRI;
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
@@ -705,6 +706,36 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
   }
 
   MachineOperand *New = Fold.Def.OpToFold;
+
+  // Verify the register is compatible with the operand.
+  if (const TargetRegisterClass *OpRC =
+          TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI, *MF)) {
+    const TargetRegisterClass *OldRC = MRI->getRegClass(Old.getReg());
+    const TargetRegisterClass *NewRC = MRI->getRegClass(New->getReg());
+    unsigned NewSubReg = New->getSubReg();
+    unsigned OldSubReg = Old.getSubReg();
+
+    const TargetRegisterClass *ConstrainRC = OpRC;
+    if (NewSubReg && OldSubReg) {
+      unsigned PreA, PreB;
+      ConstrainRC = TRI->getCommonSuperRegClass(OpRC, OldSubReg, NewRC,
+                                                NewSubReg, PreA, PreB);
+    } else if (OldSubReg) {
+      ConstrainRC = TRI->getMatchingSuperRegClass(OldRC, OpRC, OldSubReg);
+    } else if (NewSubReg) {
+      ConstrainRC = TRI->getMatchingSuperRegClass(NewRC, OpRC, NewSubReg);
+    }
+
+    if (!ConstrainRC)
+      return false;
+
+    if (!MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
+      LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
+                        << TRI->getRegClassName(ConstrainRC) << '\n');
+      return false;
+    }
+  }
+
   // Rework once the VS_16 register class is updated to include proper
   // 16-bit SGPRs instead of 32-bit ones.
   if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
@@ -1429,30 +1460,9 @@ void SIFoldOperandsImpl::foldOperand(
       return;
   }
 
-  if (!FoldingImmLike) {
-    if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
-      // Don't fold if OpToFold doesn't hold an aligned register.
-      const TargetRegisterClass *RC =
-          TRI->getRegClassForReg(*MRI, OpToFold.getReg());
-      assert(RC);
-      if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
-        unsigned SubReg = OpToFold.getSubReg();
-        if (const TargetRegisterClass *SubRC =
-                TRI->getSubRegisterClass(RC, SubReg))
-          RC = SubRC;
-      }
-
-      if (!RC || !TRI->isProperlyAlignedRC(*RC))
-        return;
-    }
-
-    tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
-
-    // FIXME: We could try to change the instruction from 64-bit to 32-bit
-    // to enable more folding opportunities.  The shrink operands pass
-    // already does this.
-    return;
-  }
+  // FIXME: We could try to change the instruction from 64-bit to 32-bit
+  // to enable more folding opportunities.  The shrink operands pass
+  // already does this.
 
   tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
 }
@@ -2747,6 +2757,7 @@ bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
 }
 
 bool SIFoldOperandsImpl::run(MachineFunction &MF) {
+  this->MF = &MF;
   MRI = &MF.getRegInfo();
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir
index a0ea04b1b9c0f..8326862706a02 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir
@@ -31,9 +31,8 @@ body:             |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{  $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_64 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_64_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:areg_64 = IMPLICIT_DEF
     %2:areg_64_align2 = COPY killed %1
@@ -105,9 +104,8 @@ body:             |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{  $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_96 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_96_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:areg_96 = IMPLICIT_DEF
     %2:areg_96_align2 = COPY killed %1
@@ -234,9 +232,8 @@ body:             |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{  $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_128 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:areg_128 = IMPLICIT_DEF
     %2:areg_128_align2 = COPY killed %1
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
index a54c0accce783..9dd025a3da086 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
@@ -46,9 +46,8 @@ body:             |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{  $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:vreg_64 = IMPLICIT_DEF
     %2:vreg_64_align2 = COPY killed %1
@@ -148,9 +147,8 @@ body:             |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{  $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_96 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vreg_96_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:vreg_96 = IMPLICIT_DEF
     %2:vreg_96_align2 = COPY killed %1
@@ -326,9 +324,8 @@ body:             |
     ; GFX90A: liveins: $vgpr0_vgpr1
     ; GFX90A-NEXT: {{  $}}
     ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
-    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
-    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY killed [[DEF]]
-    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+    ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[DEF]], 0, 0, implicit $exec
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %1:vreg_128 = IMPLICIT_DEF
     %2:vreg_128_align2 = COPY killed %1

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants