Skip to content

Conversation

Pierre-vh
Copy link
Contributor

No description provided.

Copy link
Contributor Author

Pierre-vh commented Aug 21, 2025

@llvmbot
Copy link
Member

llvmbot commented Aug 21, 2025

@llvm/pr-subscribers-llvm-transforms

Author: Pierre van Houtryve (Pierre-vh)

Changes

Patch is 1.02 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154710.diff

9 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+46-4)
  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+2)
  • (modified) llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll (-12)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll (+3235-504)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll (+2892-540)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll (+3131-475)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll (+2892-540)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll (+2938-540)
  • (added) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-private-gas.ll (+172)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 561019bb65549..60faf211df0d9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17808,11 +17808,19 @@ static bool flatInstrMayAccessPrivate(const Instruction *I) {
          !AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS);
 }
 
+static TargetLowering::AtomicExpansionKind
+getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
+  // For GAS, lower to flat atomic.
+  return STI.hasGloballyAddressableScratch()
+             ? TargetLowering::AtomicExpansionKind::Expand
+             : TargetLowering::AtomicExpansionKind::NotAtomic;
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
-    return AtomicExpansionKind::NotAtomic;
+    return getPrivateAtomicExpansionKind(*getSubtarget());
 
   // 64-bit flat atomics that dynamically reside in private memory will silently
   // be dropped.
@@ -18038,14 +18046,14 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
+             ? getPrivateAtomicExpansionKind(*getSubtarget())
              : AtomicExpansionKind::None;
 }
 
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
+             ? getPrivateAtomicExpansionKind(*getSubtarget())
              : AtomicExpansionKind::None;
 }
 
@@ -18053,7 +18061,7 @@ TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
   unsigned AddrSpace = CmpX->getPointerAddressSpace();
   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
-    return AtomicExpansionKind::NotAtomic;
+    return getPrivateAtomicExpansionKind(*getSubtarget());
 
   if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
     return AtomicExpansionKind::None;
@@ -18423,9 +18431,24 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
   Builder.CreateBr(ExitBB);
 }
 
+static void convertScratchAtomicToFlatAtomic(Instruction *I,
+                                             unsigned PtrOpIdx) {
+  Value *PtrOp = I->getOperand(PtrOpIdx);
+  assert(PtrOp->getType()->getPointerAddressSpace() ==
+         AMDGPUAS::PRIVATE_ADDRESS);
+
+  Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
+  Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
+                                              I->getIterator());
+  I->setOperand(PtrOpIdx, ASCast);
+}
+
 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   AtomicRMWInst::BinOp Op = AI->getOperation();
 
+  if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+    return convertScratchAtomicToFlatAtomic(AI, AI->getPointerOperandIndex());
+
   if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
       Op == AtomicRMWInst::Xor) {
     if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
@@ -18448,9 +18471,28 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 }
 
 void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+  if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+    return convertScratchAtomicToFlatAtomic(CI, CI->getPointerOperandIndex());
+
   emitExpandAtomicAddrSpacePredicate(CI);
 }
 
+void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
+  if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+    return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex());
+
+  llvm_unreachable(
+      "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
+}
+
+void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
+  if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+    return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
+
+  llvm_unreachable(
+      "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
+}
+
 LoadInst *
 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index dedd9ae170774..e96b702367299 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -562,6 +562,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const;
   void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
   void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override;
+  void emitExpandAtomicLoad(LoadInst *LI) const override;
+  void emitExpandAtomicStore(StoreInst *SI) const override;
 
   LoadInst *
   lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
index d13d76fcfabf4..fcdba69c30213 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
@@ -86,15 +86,3 @@ entry:
   store atomic i32 %val, ptr addrspace(3) %dst syncscope("wavefront") unordered, align 4
   ret void
 }
-
-; GCN:     scratch_atomic_store:
-; CU:        scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
-; NOCU:      scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
-; GCN:     .amdhsa_kernel scratch_atomic_store
-; CU:        .amdhsa_uses_cu_stores 1
-; NOCU:      .amdhsa_uses_cu_stores 0
-define amdgpu_kernel void @scratch_atomic_store(ptr addrspace(5) %dst, i32 %val) {
-entry:
-  store atomic i32 %val, ptr addrspace(5) %dst syncscope("wavefront") unordered, align 4
-  ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
index af5b529fc387e..fe345f9244066 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
@@ -179,11 +179,35 @@ define amdgpu_kernel void @private_agent_unordered_load(
 ;
 ; GFX1250-LABEL: private_agent_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -358,11 +382,35 @@ define amdgpu_kernel void @private_agent_monotonic_load(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -537,11 +585,36 @@ define amdgpu_kernel void @private_agent_acquire_load(
 ;
 ; GFX1250-LABEL: private_agent_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -716,11 +789,42 @@ define amdgpu_kernel void @private_agent_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -875,11 +979,35 @@ define amdgpu_kernel void @private_agent_unordered_store(
 ;
 ; GFX1250-LABEL: private_agent_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1032,11 +1160,35 @@ define amdgpu_kernel void @private_agent_monotonic_store(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1189,11 +1341,39 @@ define amdgpu_kernel void @private_agent_release_store(
 ;
 ; GFX1250-LABEL: private_agent_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1346,11 +1526,39 @@ define amdgpu_kernel void @private_agent_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:...
[truncated]

@llvmbot
Copy link
Member

llvmbot commented Aug 21, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Pierre van Houtryve (Pierre-vh)

Changes

Patch is 1.02 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154710.diff

9 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+46-4)
  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+2)
  • (modified) llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll (-12)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll (+3235-504)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll (+2892-540)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll (+3131-475)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll (+2892-540)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll (+2938-540)
  • (added) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-private-gas.ll (+172)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 561019bb65549..60faf211df0d9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17808,11 +17808,19 @@ static bool flatInstrMayAccessPrivate(const Instruction *I) {
          !AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS);
 }
 
+static TargetLowering::AtomicExpansionKind
+getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
+  // For GAS, lower to flat atomic.
+  return STI.hasGloballyAddressableScratch()
+             ? TargetLowering::AtomicExpansionKind::Expand
+             : TargetLowering::AtomicExpansionKind::NotAtomic;
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
-    return AtomicExpansionKind::NotAtomic;
+    return getPrivateAtomicExpansionKind(*getSubtarget());
 
   // 64-bit flat atomics that dynamically reside in private memory will silently
   // be dropped.
@@ -18038,14 +18046,14 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
+             ? getPrivateAtomicExpansionKind(*getSubtarget())
              : AtomicExpansionKind::None;
 }
 
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
+             ? getPrivateAtomicExpansionKind(*getSubtarget())
              : AtomicExpansionKind::None;
 }
 
@@ -18053,7 +18061,7 @@ TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
   unsigned AddrSpace = CmpX->getPointerAddressSpace();
   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
-    return AtomicExpansionKind::NotAtomic;
+    return getPrivateAtomicExpansionKind(*getSubtarget());
 
   if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
     return AtomicExpansionKind::None;
@@ -18423,9 +18431,24 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
   Builder.CreateBr(ExitBB);
 }
 
+static void convertScratchAtomicToFlatAtomic(Instruction *I,
+                                             unsigned PtrOpIdx) {
+  Value *PtrOp = I->getOperand(PtrOpIdx);
+  assert(PtrOp->getType()->getPointerAddressSpace() ==
+         AMDGPUAS::PRIVATE_ADDRESS);
+
+  Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
+  Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
+                                              I->getIterator());
+  I->setOperand(PtrOpIdx, ASCast);
+}
+
 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   AtomicRMWInst::BinOp Op = AI->getOperation();
 
+  if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+    return convertScratchAtomicToFlatAtomic(AI, AI->getPointerOperandIndex());
+
   if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
       Op == AtomicRMWInst::Xor) {
     if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
@@ -18448,9 +18471,28 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 }
 
 void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+  if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+    return convertScratchAtomicToFlatAtomic(CI, CI->getPointerOperandIndex());
+
   emitExpandAtomicAddrSpacePredicate(CI);
 }
 
+void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
+  if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+    return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex());
+
+  llvm_unreachable(
+      "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
+}
+
+void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
+  if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+    return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
+
+  llvm_unreachable(
+      "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
+}
+
 LoadInst *
 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index dedd9ae170774..e96b702367299 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -562,6 +562,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const;
   void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
   void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override;
+  void emitExpandAtomicLoad(LoadInst *LI) const override;
+  void emitExpandAtomicStore(StoreInst *SI) const override;
 
   LoadInst *
   lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
index d13d76fcfabf4..fcdba69c30213 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
@@ -86,15 +86,3 @@ entry:
   store atomic i32 %val, ptr addrspace(3) %dst syncscope("wavefront") unordered, align 4
   ret void
 }
-
-; GCN:     scratch_atomic_store:
-; CU:        scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
-; NOCU:      scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
-; GCN:     .amdhsa_kernel scratch_atomic_store
-; CU:        .amdhsa_uses_cu_stores 1
-; NOCU:      .amdhsa_uses_cu_stores 0
-define amdgpu_kernel void @scratch_atomic_store(ptr addrspace(5) %dst, i32 %val) {
-entry:
-  store atomic i32 %val, ptr addrspace(5) %dst syncscope("wavefront") unordered, align 4
-  ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
index af5b529fc387e..fe345f9244066 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
@@ -179,11 +179,35 @@ define amdgpu_kernel void @private_agent_unordered_load(
 ;
 ; GFX1250-LABEL: private_agent_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -358,11 +382,35 @@ define amdgpu_kernel void @private_agent_monotonic_load(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -537,11 +585,36 @@ define amdgpu_kernel void @private_agent_acquire_load(
 ;
 ; GFX1250-LABEL: private_agent_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -716,11 +789,42 @@ define amdgpu_kernel void @private_agent_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -875,11 +979,35 @@ define amdgpu_kernel void @private_agent_unordered_store(
 ;
 ; GFX1250-LABEL: private_agent_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1032,11 +1160,35 @@ define amdgpu_kernel void @private_agent_monotonic_store(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1189,11 +1341,39 @@ define amdgpu_kernel void @private_agent_release_store(
 ;
 ; GFX1250-LABEL: private_agent_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1346,11 +1526,39 @@ define amdgpu_kernel void @private_agent_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:...
[truncated]

Copy link
Collaborator

@rampitec rampitec left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does it target private branch?

Copy link
Collaborator

@rampitec rampitec left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM in principle.

@arsenm
Copy link
Contributor

arsenm commented Aug 22, 2025

Why does it target private branch?

That's how stacked pull requests look, the bottom merges into main. The next one up merges into this one

@Pierre-vh Pierre-vh force-pushed the users/pierre-vh/expand-scratch-to-flat-for-gas branch from 83f5e4d to 6cd7c41 Compare August 22, 2025 10:45
@Pierre-vh Pierre-vh force-pushed the users/pierre-vh/memorylegalizer-private-test branch from 799ee4d to 3c6b5f7 Compare August 22, 2025 10:45
Copy link
Contributor

@shiltian shiltian left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it mean that the scratch memory can be accessed by not only its own thread?

@rampitec
Copy link
Collaborator

Does it mean that the scratch memory can be accessed by not only its own thread?

GAS is exactly that.

;
%val = atomicrmw volatile xchg ptr addrspace(5) %addr, i32 %in acq_rel
ret i32 %val
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you test few more atomicrmws, especially some FP atomics. Also test 16-bit and 64-bit cases

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

32 bit causes an error (by design), and the pipeline stops for opt after the first error. How can I test these cases?

; GFX1250-NEXT: store atomic float [[VAL]], ptr [[SCRATCH_ASCAST]] unordered, align 4
; GFX1250-NEXT: ret void
;
store atomic float %val, ptr addrspace(5) %addr unordered, align 4
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test metadata and scope preservation

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are tests with syncscopes already
I added some MD in a few testcases

Base automatically changed from users/pierre-vh/memorylegalizer-private-test to main August 28, 2025 07:58
@Pierre-vh Pierre-vh force-pushed the users/pierre-vh/expand-scratch-to-flat-for-gas branch from 6cd7c41 to 1f02a65 Compare August 28, 2025 08:20
@Pierre-vh Pierre-vh requested a review from arsenm August 28, 2025 09:32
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

5 participants