-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[AMDGPU] Expand scratch atomics to flat atomics if GAS is enabled #154710
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking.
@llvm/pr-subscribers-llvm-transforms Author: Pierre van Houtryve (Pierre-vh) ChangesPatch is 1.02 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154710.diff 9 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 561019bb65549..60faf211df0d9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17808,11 +17808,19 @@ static bool flatInstrMayAccessPrivate(const Instruction *I) {
!AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS);
}
+static TargetLowering::AtomicExpansionKind
+getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
+ // For GAS, lower to flat atomic.
+ return STI.hasGloballyAddressableScratch()
+ ? TargetLowering::AtomicExpansionKind::Expand
+ : TargetLowering::AtomicExpansionKind::NotAtomic;
+}
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
- return AtomicExpansionKind::NotAtomic;
+ return getPrivateAtomicExpansionKind(*getSubtarget());
// 64-bit flat atomics that dynamically reside in private memory will silently
// be dropped.
@@ -18038,14 +18046,14 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
- ? AtomicExpansionKind::NotAtomic
+ ? getPrivateAtomicExpansionKind(*getSubtarget())
: AtomicExpansionKind::None;
}
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
- ? AtomicExpansionKind::NotAtomic
+ ? getPrivateAtomicExpansionKind(*getSubtarget())
: AtomicExpansionKind::None;
}
@@ -18053,7 +18061,7 @@ TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
unsigned AddrSpace = CmpX->getPointerAddressSpace();
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
- return AtomicExpansionKind::NotAtomic;
+ return getPrivateAtomicExpansionKind(*getSubtarget());
if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
return AtomicExpansionKind::None;
@@ -18423,9 +18431,24 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
Builder.CreateBr(ExitBB);
}
+static void convertScratchAtomicToFlatAtomic(Instruction *I,
+ unsigned PtrOpIdx) {
+ Value *PtrOp = I->getOperand(PtrOpIdx);
+ assert(PtrOp->getType()->getPointerAddressSpace() ==
+ AMDGPUAS::PRIVATE_ADDRESS);
+
+ Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
+ Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
+ I->getIterator());
+ I->setOperand(PtrOpIdx, ASCast);
+}
+
void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
AtomicRMWInst::BinOp Op = AI->getOperation();
+ if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(AI, AI->getPointerOperandIndex());
+
if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
Op == AtomicRMWInst::Xor) {
if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
@@ -18448,9 +18471,28 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
}
void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+ if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(CI, CI->getPointerOperandIndex());
+
emitExpandAtomicAddrSpacePredicate(CI);
}
+void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
+ if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex());
+
+ llvm_unreachable(
+ "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
+}
+
+void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
+ if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
+
+ llvm_unreachable(
+ "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
+}
+
LoadInst *
SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
IRBuilder<> Builder(AI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index dedd9ae170774..e96b702367299 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -562,6 +562,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const;
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override;
+ void emitExpandAtomicLoad(LoadInst *LI) const override;
+ void emitExpandAtomicStore(StoreInst *SI) const override;
LoadInst *
lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
index d13d76fcfabf4..fcdba69c30213 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
@@ -86,15 +86,3 @@ entry:
store atomic i32 %val, ptr addrspace(3) %dst syncscope("wavefront") unordered, align 4
ret void
}
-
-; GCN: scratch_atomic_store:
-; CU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
-; NOCU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
-; GCN: .amdhsa_kernel scratch_atomic_store
-; CU: .amdhsa_uses_cu_stores 1
-; NOCU: .amdhsa_uses_cu_stores 0
-define amdgpu_kernel void @scratch_atomic_store(ptr addrspace(5) %dst, i32 %val) {
-entry:
- store atomic i32 %val, ptr addrspace(5) %dst syncscope("wavefront") unordered, align 4
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
index af5b529fc387e..fe345f9244066 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
@@ -179,11 +179,35 @@ define amdgpu_kernel void @private_agent_unordered_load(
;
; GFX1250-LABEL: private_agent_unordered_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -358,11 +382,35 @@ define amdgpu_kernel void @private_agent_monotonic_load(
;
; GFX1250-LABEL: private_agent_monotonic_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -537,11 +585,36 @@ define amdgpu_kernel void @private_agent_acquire_load(
;
; GFX1250-LABEL: private_agent_acquire_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -716,11 +789,42 @@ define amdgpu_kernel void @private_agent_seq_cst_load(
;
; GFX1250-LABEL: private_agent_seq_cst_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -875,11 +979,35 @@ define amdgpu_kernel void @private_agent_unordered_store(
;
; GFX1250-LABEL: private_agent_unordered_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1032,11 +1160,35 @@ define amdgpu_kernel void @private_agent_monotonic_store(
;
; GFX1250-LABEL: private_agent_monotonic_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1189,11 +1341,39 @@ define amdgpu_kernel void @private_agent_release_store(
;
; GFX1250-LABEL: private_agent_release_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1346,11 +1526,39 @@ define amdgpu_kernel void @private_agent_seq_cst_store(
;
; GFX1250-LABEL: private_agent_seq_cst_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:...
[truncated]
(end of truncated patch excerpt)
@llvm/pr-subscribers-backend-amdgpu Author: Pierre van Houtryve (Pierre-vh) ChangesPatch is 1.02 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154710.diff 9 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 561019bb65549..60faf211df0d9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17808,11 +17808,19 @@ static bool flatInstrMayAccessPrivate(const Instruction *I) {
!AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS);
}
+static TargetLowering::AtomicExpansionKind
+getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
+ // For GAS, lower to flat atomic.
+ return STI.hasGloballyAddressableScratch()
+ ? TargetLowering::AtomicExpansionKind::Expand
+ : TargetLowering::AtomicExpansionKind::NotAtomic;
+}
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
- return AtomicExpansionKind::NotAtomic;
+ return getPrivateAtomicExpansionKind(*getSubtarget());
// 64-bit flat atomics that dynamically reside in private memory will silently
// be dropped.
@@ -18038,14 +18046,14 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
- ? AtomicExpansionKind::NotAtomic
+ ? getPrivateAtomicExpansionKind(*getSubtarget())
: AtomicExpansionKind::None;
}
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
- ? AtomicExpansionKind::NotAtomic
+ ? getPrivateAtomicExpansionKind(*getSubtarget())
: AtomicExpansionKind::None;
}
@@ -18053,7 +18061,7 @@ TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
unsigned AddrSpace = CmpX->getPointerAddressSpace();
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
- return AtomicExpansionKind::NotAtomic;
+ return getPrivateAtomicExpansionKind(*getSubtarget());
if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
return AtomicExpansionKind::None;
@@ -18423,9 +18431,24 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
Builder.CreateBr(ExitBB);
}
+static void convertScratchAtomicToFlatAtomic(Instruction *I,
+ unsigned PtrOpIdx) {
+ Value *PtrOp = I->getOperand(PtrOpIdx);
+ assert(PtrOp->getType()->getPointerAddressSpace() ==
+ AMDGPUAS::PRIVATE_ADDRESS);
+
+ Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
+ Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
+ I->getIterator());
+ I->setOperand(PtrOpIdx, ASCast);
+}
+
void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
AtomicRMWInst::BinOp Op = AI->getOperation();
+ if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(AI, AI->getPointerOperandIndex());
+
if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
Op == AtomicRMWInst::Xor) {
if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
@@ -18448,9 +18471,28 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
}
void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+ if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(CI, CI->getPointerOperandIndex());
+
emitExpandAtomicAddrSpacePredicate(CI);
}
+void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
+ if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex());
+
+ llvm_unreachable(
+ "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
+}
+
+void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
+ if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
+
+ llvm_unreachable(
+ "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
+}
+
LoadInst *
SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
IRBuilder<> Builder(AI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index dedd9ae170774..e96b702367299 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -562,6 +562,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const;
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override;
+ void emitExpandAtomicLoad(LoadInst *LI) const override;
+ void emitExpandAtomicStore(StoreInst *SI) const override;
LoadInst *
lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
index d13d76fcfabf4..fcdba69c30213 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
@@ -86,15 +86,3 @@ entry:
store atomic i32 %val, ptr addrspace(3) %dst syncscope("wavefront") unordered, align 4
ret void
}
-
-; GCN: scratch_atomic_store:
-; CU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
-; NOCU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
-; GCN: .amdhsa_kernel scratch_atomic_store
-; CU: .amdhsa_uses_cu_stores 1
-; NOCU: .amdhsa_uses_cu_stores 0
-define amdgpu_kernel void @scratch_atomic_store(ptr addrspace(5) %dst, i32 %val) {
-entry:
- store atomic i32 %val, ptr addrspace(5) %dst syncscope("wavefront") unordered, align 4
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
index af5b529fc387e..fe345f9244066 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
@@ -179,11 +179,35 @@ define amdgpu_kernel void @private_agent_unordered_load(
;
; GFX1250-LABEL: private_agent_unordered_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -358,11 +382,35 @@ define amdgpu_kernel void @private_agent_monotonic_load(
;
; GFX1250-LABEL: private_agent_monotonic_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -537,11 +585,36 @@ define amdgpu_kernel void @private_agent_acquire_load(
;
; GFX1250-LABEL: private_agent_acquire_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -716,11 +789,42 @@ define amdgpu_kernel void @private_agent_seq_cst_load(
;
; GFX1250-LABEL: private_agent_seq_cst_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -875,11 +979,35 @@ define amdgpu_kernel void @private_agent_unordered_store(
;
; GFX1250-LABEL: private_agent_unordered_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1032,11 +1160,35 @@ define amdgpu_kernel void @private_agent_monotonic_store(
;
; GFX1250-LABEL: private_agent_monotonic_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1189,11 +1341,39 @@ define amdgpu_kernel void @private_agent_release_store(
;
; GFX1250-LABEL: private_agent_release_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: ; implicit-def: $sgpr1
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; implicit-def: $sgpr1
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1346,11 +1526,39 @@ define amdgpu_kernel void @private_agent_seq_cst_store(
;
; GFX1250-LABEL: private_agent_seq_cst_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why does it target a private branch?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM in principle.
That's how stacked pull requests look: the bottom one merges into main, and each subsequent one merges into the PR below it. |
83f5e4d
to
6cd7c41
Compare
799ee4d
to
3c6b5f7
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this mean that scratch memory can be accessed by threads other than the one that owns it?
GAS is exactly that. |
; | ||
%val = atomicrmw volatile xchg ptr addrspace(5) %addr, i32 %in acq_rel | ||
ret i32 %val | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you test a few more atomicrmw operations, especially some FP atomics? Also test the 16-bit and 64-bit cases.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
32-bit causes an error (by design), and the opt pipeline stops after the first error. How can I test these cases?
; GFX1250-NEXT: store atomic float [[VAL]], ptr [[SCRATCH_ASCAST]] unordered, align 4 | ||
; GFX1250-NEXT: ret void | ||
; | ||
store atomic float %val, ptr addrspace(5) %addr unordered, align 4 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Test metadata and scope preservation
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are tests with syncscopes already
I added some metadata in a few test cases
6cd7c41
to
1f02a65
Compare
No description provided.